bpo-30349: Raise FutureWarning for nested sets and set operations (#1553)

author Serhiy Storchaka <storchaka@gmail.com>

Thu, 16 Nov 2017 10:38:26 +0000 (12:38 +0200)

committer GitHub <noreply@github.com>

Thu, 16 Nov 2017 10:38:26 +0000 (12:38 +0200)
author Serhiy Storchaka <storchaka@gmail.com>
Thu, 16 Nov 2017 10:38:26 +0000 (12:38 +0200)
committer GitHub <noreply@github.com>
Thu, 16 Nov 2017 10:38:26 +0000 (12:38 +0200)
diff --git a/Doc/library/re.rst b/Doc/library/re.rst

index cbb2f439d15391020c9dd238aa48cd78a5a3cb85..8c15462871b8f152ad8523427ba15b7052c8048b 100644 (file)
--- a/Doc/library/re.rst
+++ b/Doc/library/re.rst
@@ -200,6 +200,20 @@ The special characters are:
       place it at the beginning of the set.  For example, both ``[()[\]{}]`` and
       ``[]()[{}]`` will both match a parenthesis.
  
+   * Support of nested sets and set operations as in `Unicode Technical
+     Standard #18`_ might be added in the future.  This would change the
+     syntax, so to facilitate this change a :exc:`FutureWarning` will be raised
+     in ambiguous cases for the time being.
+     These include sets starting with a literal ``'['`` or containing literal
+     character sequences ``'--'``, ``'&&'``, ``'~~'``, and ``'||'``.  To
+     avoid a warning, escape them with a backslash.
+
+   .. _Unicode Technical Standard #18: https://unicode.org/reports/tr18/
+
+   .. versionchanged:: 3.7
+      :exc:`FutureWarning` is raised if a character set contains constructs
+      that will change semantically in the future.
+
  ``|``
     ``A|B``, where *A* and *B* can be arbitrary REs, creates a regular expression that
     will match either *A* or *B*.  An arbitrary number of REs can be separated by the
@@ -829,7 +843,7 @@ form.
  
        >>> legal_chars = string.ascii_lowercase + string.digits + "!#$%&'*+-.^_`|~:"
        >>> print('[%s]+' % re.escape(legal_chars))
-      [abcdefghijklmnopqrstuvwxyz0123456789!\#\$%&'\*\+\-\.\^_`\|~:]+
+      [abcdefghijklmnopqrstuvwxyz0123456789!\#\$%\&'\*\+\-\.\^_`\|\~:]+
  
        >>> operators = ['+', '-', '*', '/', '**']
        >>> print('|'.join(map(re.escape, sorted(operators, reverse=True))))
diff --git a/Doc/tools/susp-ignored.csv b/Doc/tools/susp-ignored.csv

index 2b3ccf3ac607000f8e4df3deb478f6d9f157107c..d52f81b76b52f0a0f409e3c2aec2e3da940012b5 100644 (file)
--- a/Doc/tools/susp-ignored.csv
+++ b/Doc/tools/susp-ignored.csv
@@ -300,7 +300,7 @@ whatsnew/3.2,,:gz,">>> with tarfile.open(name='myarchive.tar.gz', mode='w:gz') a
  whatsnew/3.2,,:location,zope9-location = ${zope9:location}
  whatsnew/3.2,,:prefix,zope-conf = ${custom:prefix}/etc/zope.conf
  library/re,,`,!#$%&'*+-.^_`|~:
-library/re,,`,!\#\$%&'\*\+\-\.\^_`\|~:
+library/re,,`,!\#\$%\&'\*\+\-\.\^_`\|\~:
  library/tarfile,,:xz,'x:xz'
  library/xml.etree.elementtree,,:sometag,prefix:sometag
  library/xml.etree.elementtree,,:fictional,"<actors xmlns:fictional=""http://characters.example.com"""
diff --git a/Doc/whatsnew/3.7.rst b/Doc/whatsnew/3.7.rst

index a2fea50d091cc990f933d9fee1b117fdfae599e9..9d63540e630991466652ef3f0a6daa3ca93975d0 100644 (file)
--- a/Doc/whatsnew/3.7.rst
+++ b/Doc/whatsnew/3.7.rst
@@ -700,6 +700,17 @@ Changes in the Python API
    argument ``os.scandir`` instead of ``os.listdir`` when listing the direcory
    is failed.
  
+* Support of nested sets and set operations in regular expressions as in
+  `Unicode Technical Standard #18`_ might be added in the future.  This would
+  change the syntax, so to facilitate this change a :exc:`FutureWarning` will
+  be raised in ambiguous cases for the time being.
+  These include sets starting with a literal ``'['`` or containing literal
+  character sequences ``'--'``, ``'&&'``, ``'~~'``, and ``'||'``.  To
+  avoid a warning, escape them with a backslash.
+  (Contributed by Serhiy Storchaka in :issue:`30349`.)
+
+.. _Unicode Technical Standard #18: https://unicode.org/reports/tr18/
+
  
  Changes in the C API
  --------------------
diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py

index 9b9697f77346a60ab37321f46c4b58845135bd1a..b4737c806e1ddc8cbf64fc213b36bd69b2971764 100644 (file)
--- a/Lib/email/_header_value_parser.py
+++ b/Lib/email/_header_value_parser.py
@@ -1354,15 +1354,14 @@ RouteComponentMarker = ValueTerminal('@', 'route-component-marker')
  
  _wsp_splitter = re.compile(r'([{}]+)'.format(''.join(WSP))).split
  _non_atom_end_matcher = re.compile(r"[^{}]+".format(
-    ''.join(ATOM_ENDS).replace('\\','\\\\').replace(']',r'\]'))).match
+    re.escape(''.join(ATOM_ENDS)))).match
  _non_printable_finder = re.compile(r"[\x00-\x20\x7F]").findall
  _non_token_end_matcher = re.compile(r"[^{}]+".format(
-    ''.join(TOKEN_ENDS).replace('\\','\\\\').replace(']',r'\]'))).match
+    re.escape(''.join(TOKEN_ENDS)))).match
  _non_attribute_end_matcher = re.compile(r"[^{}]+".format(
-    ''.join(ATTRIBUTE_ENDS).replace('\\','\\\\').replace(']',r'\]'))).match
+    re.escape(''.join(ATTRIBUTE_ENDS)))).match
  _non_extended_attribute_end_matcher = re.compile(r"[^{}]+".format(
-    ''.join(EXTENDED_ATTRIBUTE_ENDS).replace(
-                                    '\\','\\\\').replace(']',r'\]'))).match
+    re.escape(''.join(EXTENDED_ATTRIBUTE_ENDS)))).match
  
  def _validate_xtext(xtext):
      """If input token contains ASCII non-printables, register a defect."""
diff --git a/Lib/re.py b/Lib/re.py

index abbf8d6e290e542fb5bf185fba408303652b249a..a8b6753d3909643e34ea1e50f4fe6791bd7c04fb 100644 (file)
--- a/Lib/re.py
+++ b/Lib/re.py
@@ -251,8 +251,9 @@ def template(pattern, flags=0):
  # SPECIAL_CHARS
  # closing ')', '}' and ']'
  # '-' (a range in character set)
+# '&', '~', (extended character set operations)
  # '#' (comment) and WHITESPACE (ignored) in verbose mode
-_special_chars_map = {i: '\\' + chr(i) for i in b'()[]{}?*+-|^$\\.# \t\n\r\v\f'}
+_special_chars_map = {i: '\\' + chr(i) for i in b'()[]{}?*+-|^$\\.&~# \t\n\r\v\f'}
  
  def escape(pattern):
      """
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py

index 85274122938bedf0188799f04f6cc6434120b3be..a53735b07ded420ca2bc9fa636f40b9eb2241500 100644 (file)
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py
@@ -517,6 +517,12 @@ def _parse(source, state, verbose, nested, first=False):
              setappend = set.append
  ##          if sourcematch(":"):
  ##              pass # handle character classes
+            if source.next == '[':
+                import warnings
+                warnings.warn(
+                    'Possible nested set at position %d' % source.tell(),
+                    FutureWarning, stacklevel=nested + 6
+                )
              negate = sourcematch("^")
              # check remaining characters
              while True:
@@ -529,6 +535,17 @@ def _parse(source, state, verbose, nested, first=False):
                  elif this[0] == "\\":
                      code1 = _class_escape(source, this)
                  else:
+                    if set and this in '-&~|' and source.next == this:
+                        import warnings
+                        warnings.warn(
+                            'Possible set %s at position %d' % (
+                                'difference' if this == '-' else
+                                'intersection' if this == '&' else
+                                'symmetric difference' if this == '~' else
+                                'union',
+                                source.tell() - 1),
+                            FutureWarning, stacklevel=nested + 6
+                        )
                      code1 = LITERAL, _ord(this)
                  if sourcematch("-"):
                      # potential range
@@ -545,6 +562,13 @@ def _parse(source, state, verbose, nested, first=False):
                      if that[0] == "\\":
                          code2 = _class_escape(source, that)
                      else:
+                        if that == '-':
+                            import warnings
+                            warnings.warn(
+                                'Possible set difference at position %d' % (
+                                    source.tell() - 2),
+                                FutureWarning, stacklevel=nested + 6
+                            )
                          code2 = LITERAL, _ord(that)
                      if code1[0] != LITERAL or code2[0] != LITERAL:
                          msg = "bad character range %s-%s" % (this, that)
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py

index fc015e4ed9be0fe2b7f9c6adc22973cfc1b8c975..ee87446b7924a92f8a306cd6cd66831ba0a6b3c9 100644 (file)
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -914,6 +914,51 @@ class ReTests(unittest.TestCase):
          self.assertEqual(re.search(r"\s([^a])", " b").group(1), "b")
          self.assertEqual(re.search(r"\s([^a]*)", " bb").group(1), "bb")
  
+    def test_possible_set_operations(self):
+        s = bytes(range(128)).decode()
+        with self.assertWarns(FutureWarning):
+            p = re.compile(r'[0-9--1]')
+        self.assertEqual(p.findall(s), list('-./0123456789'))
+        self.assertEqual(re.findall(r'[--1]', s), list('-./01'))
+        with self.assertWarns(FutureWarning):
+            p = re.compile(r'[%--1]')
+        self.assertEqual(p.findall(s), list("%&'()*+,-1"))
+        with self.assertWarns(FutureWarning):
+            p = re.compile(r'[%--]')
+        self.assertEqual(p.findall(s), list("%&'()*+,-"))
+
+        with self.assertWarns(FutureWarning):
+            p = re.compile(r'[0-9&&1]')
+        self.assertEqual(p.findall(s), list('&0123456789'))
+        with self.assertWarns(FutureWarning):
+            p = re.compile(r'[\d&&1]')
+        self.assertEqual(p.findall(s), list('&0123456789'))
+        self.assertEqual(re.findall(r'[&&1]', s), list('&1'))
+
+        with self.assertWarns(FutureWarning):
+            p = re.compile(r'[0-9||a]')
+        self.assertEqual(p.findall(s), list('0123456789a|'))
+        with self.assertWarns(FutureWarning):
+            p = re.compile(r'[\d||a]')
+        self.assertEqual(p.findall(s), list('0123456789a|'))
+        self.assertEqual(re.findall(r'[||1]', s), list('1|'))
+
+        with self.assertWarns(FutureWarning):
+            p = re.compile(r'[0-9~~1]')
+        self.assertEqual(p.findall(s), list('0123456789~'))
+        with self.assertWarns(FutureWarning):
+            p = re.compile(r'[\d~~1]')
+        self.assertEqual(p.findall(s), list('0123456789~'))
+        self.assertEqual(re.findall(r'[~~1]', s), list('1~'))
+
+        with self.assertWarns(FutureWarning):
+            p = re.compile(r'[[0-9]|]')
+        self.assertEqual(p.findall(s), list('0123456789[]'))
+
+        with self.assertWarns(FutureWarning):
+            p = re.compile(r'[[:digit:]|]')
+        self.assertEqual(p.findall(s), list(':[]dgit'))
+
      def test_search_coverage(self):
          self.assertEqual(re.search(r"\s(b)", " b").group(1), "b")
          self.assertEqual(re.search(r"a\s", "a ").group(0), "a ")
@@ -932,7 +977,7 @@ class ReTests(unittest.TestCase):
          self.assertEqual(m.group(), match)
          self.assertEqual(m.span(), span)
  
-    LITERAL_CHARS = string.ascii_letters + string.digits + '!"%&\',/:;<=>@_`~'
+    LITERAL_CHARS = string.ascii_letters + string.digits + '!"%\',/:;<=>@_`'
  
      def test_re_escape(self):
          p = ''.join(chr(i) for i in range(256))
diff --git a/Misc/NEWS.d/next/Library/2017-10-05-12-45-29.bpo-30349.6zKJsF.rst b/Misc/NEWS.d/next/Library/2017-10-05-12-45-29.bpo-30349.6zKJsF.rst

new file mode 100644 (file)

index 0000000..6862e02
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2017-10-05-12-45-29.bpo-30349.6zKJsF.rst
@@ -0,0 +1,3 @@
+FutureWarning is now emitted if a regular expression contains character set
+constructs that will change semantically in the future (nested sets and set
+operations).
author	Serhiy Storchaka <storchaka@gmail.com>
	Thu, 16 Nov 2017 10:38:26 +0000 (12:38 +0200)
committer	GitHub <noreply@github.com>
	Thu, 16 Nov 2017 10:38:26 +0000 (12:38 +0200)
Doc/library/re.rst		patch \| blob \| history
Doc/tools/susp-ignored.csv		patch \| blob \| history
Doc/whatsnew/3.7.rst		patch \| blob \| history
Lib/email/_header_value_parser.py		patch \| blob \| history
Lib/re.py		patch \| blob \| history
Lib/sre_parse.py		patch \| blob \| history
Lib/test/test_re.py		patch \| blob \| history
Misc/NEWS.d/next/Library/2017-10-05-12-45-29.bpo-30349.6zKJsF.rst	[new file with mode: 0644]	patch \| blob