From: Serhiy Storchaka Date: Thu, 18 May 2017 09:34:40 +0000 (+0300) Subject: [2.7] bpo-30363: Backport warnings in the re module. (#1577) X-Git-Tag: v2.7.14rc1~142 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=955b6760cfa73c54bae9b6f2b335eb0cd806c7b0;p=python [2.7] bpo-30363: Backport warnings in the re module. (#1577) Running Python with the -3 option now warns about regular expression syntax that is invalid or has different semantic in Python 3 or will change the behavior in future Python versions. --- diff --git a/Lib/_strptime.py b/Lib/_strptime.py index feac05a001..8eb2718d5c 100644 --- a/Lib/_strptime.py +++ b/Lib/_strptime.py @@ -254,8 +254,8 @@ class TimeRE(dict): # format directives (%m, etc.). regex_chars = re_compile(r"([\\.^$*+?\(\){}\[\]|])") format = regex_chars.sub(r"\\\1", format) - whitespace_replacement = re_compile('\s+') - format = whitespace_replacement.sub('\s+', format) + whitespace_replacement = re_compile(r'\s+') + format = whitespace_replacement.sub(r'\\s+', format) while '%' in format: directive_index = format.index('%')+1 processed_format = "%s%s%s" % (processed_format, diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index c5a7e89d07..b6689fa7a7 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -435,7 +435,7 @@ def _compile_info(code, pattern, flags): # this contains min/max pattern width, and an optional literal # prefix or a character map lo, hi = pattern.getwidth() - if lo == 0: + if not lo and hi: return # not worth it # look for a literal prefix prefix = [] diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index 75f488b547..e0d003ed85 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -23,6 +23,7 @@ DIGITS = set("0123456789") OCTDIGITS = set("01234567") HEXDIGITS = set("0123456789abcdefABCDEF") +ASCIILETTERS = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ") WHITESPACE = set(" \t\n\r\v\f") @@ -260,6 +261,15 @@ def _class_escape(source, escape): elif c in DIGITS: raise error, "bogus 
escape: %s" % repr(escape) if len(escape) == 2: + if sys.py3kwarning and c in ASCIILETTERS: + import warnings + if c in 'Uu': + warnings.warn('bad escape %s; Unicode escapes are ' + 'supported only since Python 3.3' % escape, + FutureWarning, stacklevel=8) + else: + warnings.warnpy3k('bad escape %s' % escape, + DeprecationWarning, stacklevel=8) return LITERAL, ord(escape[1]) except ValueError: pass @@ -309,6 +319,15 @@ def _escape(source, escape, state): return GROUPREF, group raise ValueError if len(escape) == 2: + if sys.py3kwarning and c in ASCIILETTERS: + import warnings + if c in 'Uu': + warnings.warn('bad escape %s; Unicode escapes are ' + 'supported only since Python 3.3' % escape, + FutureWarning, stacklevel=8) + else: + warnings.warnpy3k('bad escape %s' % escape, + DeprecationWarning, stacklevel=8) return LITERAL, ord(escape[1]) except ValueError: pass @@ -714,6 +733,12 @@ def parse(str, flags=0, pattern=None): pattern.str = str p = _parse_sub(source, pattern, 0) + if (sys.py3kwarning and + (p.pattern.flags & SRE_FLAG_LOCALE) and + (p.pattern.flags & SRE_FLAG_UNICODE)): + import warnings + warnings.warnpy3k("LOCALE and UNICODE flags are incompatible", + DeprecationWarning, stacklevel=5) tail = source.get() if tail == ")": @@ -801,7 +826,10 @@ def parse_template(source, pattern): try: this = makechar(ESCAPES[this][1]) except KeyError: - pass + if sys.py3kwarning and c in ASCIILETTERS: + import warnings + warnings.warnpy3k('bad escape %s' % this, + DeprecationWarning, stacklevel=4) literal(this) else: literal(this) diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 5725a99ad6..174c5ca462 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -3,7 +3,7 @@ from test.test_support import ( verbose, run_unittest, import_module, precisionbigmemtest, _2G, cpython_only, captured_stdout, have_unicode, requires_unicode, u, - check_warnings) + check_warnings, check_py3k_warnings) import locale import re from re import Scanner @@ -66,11 +66,13 @@ class 
ReTests(unittest.TestCase): self.assertEqual(re.sub('(?P<unk>x)', '\g<unk>\g<unk>', 'xx'), 'xxxx') self.assertEqual(re.sub('(?P<unk>x)', '\g<1>\g<1>', 'xx'), 'xxxx') - self.assertEqual(re.sub('a',r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D','a'), - '\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D') - self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), '\t\n\v\r\f\a') - self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), - (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7))) + self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b') + self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b') + self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), + (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)+chr(8))) + for c in 'cdehijklmopqsuwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ': + with check_py3k_warnings(): + self.assertEqual(re.sub('a', '\\' + c, 'a'), '\\' + c) self.assertEqual(re.sub('^\s*', 'X', 'test'), 'Xtest') @@ -223,11 +225,11 @@ class ReTests(unittest.TestCase): def test_re_split(self): self.assertEqual(re.split(":", ":a:b::c"), ['', 'a', 'b', '', 'c']) - self.assertEqual(re.split(":*", ":a:b::c"), ['', 'a', 'b', 'c']) - self.assertEqual(re.split("(:*)", ":a:b::c"), + self.assertEqual(re.split(":+", ":a:b::c"), ['', 'a', 'b', 'c']) + self.assertEqual(re.split("(:+)", ":a:b::c"), ['', ':', 'a', ':', 'b', '::', 'c']) - self.assertEqual(re.split("(?::*)", ":a:b::c"), ['', 'a', 'b', 'c']) - self.assertEqual(re.split("(:)*", ":a:b::c"), + self.assertEqual(re.split("(?::+)", ":a:b::c"), ['', 'a', 'b', 'c']) + self.assertEqual(re.split("(:)+", ":a:b::c"), ['', ':', 'a', ':', 'b', ':', 'c']) self.assertEqual(re.split("([b:]+)", ":a:b::c"), ['', ':', 'a', ':b::', 'c']) @@ -237,13 +239,34 @@ class ReTests(unittest.TestCase): self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"), ['', 'a', '', '', 'c']) + for sep, expected in [ + (':*', ['', 'a', 'b', 'c']), + ('(?::*)', ['', 'a', 'b', 'c']), + ('(:*)', ['', ':', 'a', ':', 'b', '::', 'c']), + ('(:)*', ['', ':', 'a', ':', 'b', ':', 'c']), + ]: + 
with check_py3k_warnings(('', FutureWarning)): + self.assertEqual(re.split(sep, ':a:b::c'), expected) + + for sep, expected in [ + ('', [':a:b::c']), + (r'\b', [':a:b::c']), + (r'(?=:)', [':a:b::c']), + (r'(?<=:)', [':a:b::c']), + ]: + with check_py3k_warnings(): + self.assertEqual(re.split(sep, ':a:b::c'), expected) + def test_qualified_re_split(self): self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c']) self.assertEqual(re.split(':', 'a:b:c:d', 2), ['a', 'b', 'c:d']) self.assertEqual(re.split("(:)", ":a:b::c", 2), ['', ':', 'a', ':', 'b::c']) - self.assertEqual(re.split("(:*)", ":a:b::c", 2), + self.assertEqual(re.split("(:+)", ":a:b::c", 2), ['', ':', 'a', ':', 'b::c']) + with check_py3k_warnings(('', FutureWarning)): + self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2), + ['', ':', 'a', ':', 'b::c']) def test_re_findall(self): self.assertEqual(re.findall(":+", "abc"), []) @@ -404,6 +427,29 @@ class ReTests(unittest.TestCase): self.assertEqual(re.search(r"\d\D\w\W\s\S", "1aa! a", re.UNICODE).group(0), "1aa! 
a") + def test_other_escapes(self): + self.assertRaises(re.error, re.compile, "\\") + self.assertEqual(re.match(r"\(", '(').group(), '(') + self.assertIsNone(re.match(r"\(", ')')) + self.assertEqual(re.match(r"\\", '\\').group(), '\\') + self.assertEqual(re.match(r"[\]]", ']').group(), ']') + self.assertIsNone(re.match(r"[\]]", '[')) + self.assertEqual(re.match(r"[a\-c]", '-').group(), '-') + self.assertIsNone(re.match(r"[a\-c]", 'b')) + self.assertEqual(re.match(r"[\^a]+", 'a^').group(), 'a^') + self.assertIsNone(re.match(r"[\^a]+", 'b')) + re.purge() # for warnings + for c in 'ceghijklmopquyzCEFGHIJKLMNOPQRTUVXY': + warn = FutureWarning if c in 'Uu' else DeprecationWarning + with check_py3k_warnings(('', warn)): + self.assertEqual(re.match('\\%c$' % c, c).group(), c) + self.assertIsNone(re.match('\\%c' % c, 'a')) + for c in 'ceghijklmopquyzABCEFGHIJKLMNOPQRTUVXYZ': + warn = FutureWarning if c in 'Uu' else DeprecationWarning + with check_py3k_warnings(('', warn)): + self.assertEqual(re.match('[\\%c]$' % c, c).group(), c) + self.assertIsNone(re.match('[\\%c]' % c, 'a')) + def test_string_boundaries(self): # See http://bugs.python.org/issue10713 self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1), @@ -931,6 +977,19 @@ class ReTests(unittest.TestCase): self.assertTrue(re.match('(?ixu) ' + upper_char, lower_char)) self.assertTrue(re.match('(?ixu) ' + lower_char, upper_char)) + # Incompatibilities + re.purge() + with check_py3k_warnings(): + re.compile('', re.LOCALE|re.UNICODE) + with check_py3k_warnings(): + re.compile('(?L)', re.UNICODE) + with check_py3k_warnings(): + re.compile('(?u)', re.LOCALE) + with check_py3k_warnings(): + re.compile('(?Lu)') + with check_py3k_warnings(): + re.compile('(?uL)') + def test_dollar_matches_twice(self): "$ matches the end of string, and just before the terminating \n" pattern = re.compile('$') @@ -967,8 +1026,9 @@ class ReTests(unittest.TestCase): def test_bug_13899(self): # Issue #13899: re pattern r"[\A]" should work like 
"A" but matches # nothing. Ditto B and Z. - self.assertEqual(re.findall(r'[\A\B\b\C\Z]', 'AB\bCZ'), - ['A', 'B', '\b', 'C', 'Z']) + with check_py3k_warnings(): + self.assertEqual(re.findall(r'[\A\B\b\C\Z]', 'AB\bCZ'), + ['A', 'B', '\b', 'C', 'Z']) @precisionbigmemtest(size=_2G, memuse=1) def test_large_search(self, size): @@ -1261,7 +1321,11 @@ def run_re_tests(): def test_main(): run_unittest(ReTests) - run_re_tests() + deprecations = [ + ('bad escape', DeprecationWarning), + ] + with check_py3k_warnings(*deprecations): + run_re_tests() if __name__ == "__main__": test_main() diff --git a/Misc/NEWS b/Misc/NEWS index dd6ec1b419..3f34c6c264 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -42,6 +42,10 @@ Extension Modules Library ------- +- bpo-30363: Running Python with the -3 option now warns about regular + expression syntax that is invalid or has different semantic in Python 3 + or will change the behavior in future Python versions. + - bpo-30365: Running Python with the -3 option now emits deprecation warnings for getchildren() and getiterator() methods of the Element class in the xml.etree.cElementTree module and when pass the html argument to diff --git a/Modules/_sre.c b/Modules/_sre.c index 8e16c1d140..6fd3affb09 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -2267,6 +2267,20 @@ pattern_split(PatternObject* self, PyObject* args, PyObject* kw) if (!string) return NULL; + if (Py_Py3kWarningFlag && + (self->code[0] != SRE_OP_INFO || self->code[3] == 0)) + { + if (self->code[0] == SRE_OP_INFO && self->code[4] == 0) { + if (PyErr_WarnPy3k("split() requires a non-empty pattern match.", + 1) < 0) + return NULL; + } + else if (PyErr_WarnEx(PyExc_FutureWarning, + "split() requires a non-empty pattern match.", + 1) < 0) + return NULL; + } + string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX); if (!string) return NULL;