bpo-29995: re.escape() now escapes only special characters. (#1007)

author Serhiy Storchaka <storchaka@gmail.com>

Thu, 13 Apr 2017 18:06:43 +0000 (21:06 +0300)

committer GitHub <noreply@github.com>

Thu, 13 Apr 2017 18:06:43 +0000 (21:06 +0300)
author Serhiy Storchaka <storchaka@gmail.com>
Thu, 13 Apr 2017 18:06:43 +0000 (21:06 +0300)
committer GitHub <noreply@github.com>
Thu, 13 Apr 2017 18:06:43 +0000 (21:06 +0300)
diff --git a/Doc/library/re.rst b/Doc/library/re.rst

index 3213daf6cfb6a0736f3aa88caeecb0c7ba46c5f9..ce90ec7e01a2e8996e6ae788e66fa1b672a8b0c4 100644 (file)
--- a/Doc/library/re.rst
+++ b/Doc/library/re.rst
@@ -786,7 +786,7 @@ form.
  
  .. function:: escape(pattern)
  
-   Escape all the characters in *pattern* except ASCII letters, numbers and ``'_'``.
+   Escape special characters in *pattern*.
     This is useful if you want to match an arbitrary literal string that may
     have regular expression metacharacters in it.  For example::
  
@@ -795,15 +795,19 @@ form.
  
        >>> legal_chars = string.ascii_lowercase + string.digits + "!#$%&'*+-.^_`|~:"
        >>> print('[%s]+' % re.escape(legal_chars))
-      [abcdefghijklmnopqrstuvwxyz0123456789\!\#\$\%\&\'\*\+\-\.\^_\`\|\~\:]+
+      [abcdefghijklmnopqrstuvwxyz0123456789!\#\$%&'\*\+\-\.\^_`\|~:]+
  
        >>> operators = ['+', '-', '*', '/', '**']
        >>> print('|'.join(map(re.escape, sorted(operators, reverse=True))))
-      \/|\-|\+|\*\*|\*
+      /|\-|\+|\*\*|\*
  
     .. versionchanged:: 3.3
        The ``'_'`` character is no longer escaped.
  
+   .. versionchanged:: 3.7
+      Only characters that can have special meaning in a regular expression
+      are escaped.
+
  
  .. function:: purge()
  
diff --git a/Doc/tools/susp-ignored.csv b/Doc/tools/susp-ignored.csv

index df67f7590dc2e90c82ab58e7cf2a390dcec63016..01b1d98c149efce90412faca9a96848e7a3e5a52 100644 (file)
--- a/Doc/tools/susp-ignored.csv
+++ b/Doc/tools/susp-ignored.csv
@@ -303,7 +303,7 @@ whatsnew/3.2,,:gz,">>> with tarfile.open(name='myarchive.tar.gz', mode='w:gz') a
  whatsnew/3.2,,:location,zope9-location = ${zope9:location}
  whatsnew/3.2,,:prefix,zope-conf = ${custom:prefix}/etc/zope.conf
  library/re,,`,!#$%&'*+-.^_`|~:
-library/re,,`,\!\#\$\%\&\'\*\+\-\.\^_\`\|\~\:
+library/re,,`,!\#\$%&'\*\+\-\.\^_`\|~:
  library/tarfile,,:xz,'x:xz'
  library/xml.etree.elementtree,,:sometag,prefix:sometag
  library/xml.etree.elementtree,,:fictional,"<actors xmlns:fictional=""http://characters.example.com"""
diff --git a/Lib/idlelib/idle_test/test_replace.py b/Lib/idlelib/idle_test/test_replace.py

index 9913ed2b7c81e140828303064dc619d02df7bc58..2ecbd34168c54e8196dc7cedaba792c778c4dba0 100644 (file)
--- a/Lib/idlelib/idle_test/test_replace.py
+++ b/Lib/idlelib/idle_test/test_replace.py
@@ -221,8 +221,8 @@ class ReplaceDialogTest(unittest.TestCase):
          self.assertIn('Invalid Replace Expression', showerror.message)
  
          # test access method
-        self.engine.setcookedpat("\'")
-        equal(pv.get(), "\\'")
+        self.engine.setcookedpat("?")
+        equal(pv.get(), "\\?")
  
      def test_replace_backwards(self):
          equal = self.assertEqual
diff --git a/Lib/re.py b/Lib/re.py

index d321cff92c9cb5d2688cab51896a21d19adbb524..7053eddbe027e67ac8e0f4232a15c683a6ad250b 100644 (file)
--- a/Lib/re.py
+++ b/Lib/re.py
@@ -241,39 +241,21 @@ def template(pattern, flags=0):
      "Compile a template pattern, returning a pattern object"
      return _compile(pattern, flags|T)
  
-_alphanum_str = frozenset(
-    "_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890")
-_alphanum_bytes = frozenset(
-    b"_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890")
+# SPECIAL_CHARS
+# closing ')', '}' and ']'
+# '-' (a range in character set)
+# '#' (comment) and WHITESPACE (ignored) in verbose mode
+_special_chars_map = {i: '\\' + chr(i) for i in b'()[]{}?*+-|^$\\.# \t\n\r\v\f'}
  
  def escape(pattern):
      """
-    Escape all the characters in pattern except ASCII letters, numbers and '_'.
+    Escape special characters in a string.
      """
      if isinstance(pattern, str):
-        alphanum = _alphanum_str
-        s = list(pattern)
-        for i, c in enumerate(pattern):
-            if c not in alphanum:
-                if c == "\000":
-                    s[i] = "\\000"
-                else:
-                    s[i] = "\\" + c
-        return "".join(s)
+        return pattern.translate(_special_chars_map)
      else:
-        alphanum = _alphanum_bytes
-        s = []
-        esc = ord(b"\\")
-        for c in pattern:
-            if c in alphanum:
-                s.append(c)
-            else:
-                if c == 0:
-                    s.extend(b"\\000")
-                else:
-                    s.append(esc)
-                    s.append(c)
-        return bytes(s)
+        pattern = str(pattern, 'latin1')
+        return pattern.translate(_special_chars_map).encode('latin1')
  
  # --------------------------------------------------------------------
  # internals
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py

index a1fddfb4b6b6a69164fbd1e8c56c1416731a5f49..b3b29f847e619ea2112942101d04cbe36716bd63 100644 (file)
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -904,7 +904,7 @@ class ReTests(unittest.TestCase):
          self.assertEqual(re.search(r"a\s", "a ").group(0), "a ")
  
      def assertMatch(self, pattern, text, match=None, span=None,
-                    matcher=re.match):
+                    matcher=re.fullmatch):
          if match is None and span is None:
              # the pattern matches the whole text
              match = text
@@ -917,37 +917,38 @@ class ReTests(unittest.TestCase):
          self.assertEqual(m.group(), match)
          self.assertEqual(m.span(), span)
  
+    LITERAL_CHARS = string.ascii_letters + string.digits + '!"%&\',/:;<=>@_`~'
+
      def test_re_escape(self):
-        alnum_chars = string.ascii_letters + string.digits + '_'
          p = ''.join(chr(i) for i in range(256))
          for c in p:
-            if c in alnum_chars:
-                self.assertEqual(re.escape(c), c)
-            elif c == '\x00':
-                self.assertEqual(re.escape(c), '\\000')
-            else:
-                self.assertEqual(re.escape(c), '\\' + c)
              self.assertMatch(re.escape(c), c)
+            self.assertMatch('[' + re.escape(c) + ']', c)
+            self.assertMatch('(?x)' + re.escape(c), c)
          self.assertMatch(re.escape(p), p)
+        for c in '-.]{}':
+            self.assertEqual(re.escape(c)[:1], '\\')
+        literal_chars = self.LITERAL_CHARS
+        self.assertEqual(re.escape(literal_chars), literal_chars)
  
-    def test_re_escape_byte(self):
-        alnum_chars = (string.ascii_letters + string.digits + '_').encode('ascii')
+    def test_re_escape_bytes(self):
          p = bytes(range(256))
          for i in p:
              b = bytes([i])
-            if b in alnum_chars:
-                self.assertEqual(re.escape(b), b)
-            elif i == 0:
-                self.assertEqual(re.escape(b), b'\\000')
-            else:
-                self.assertEqual(re.escape(b), b'\\' + b)
              self.assertMatch(re.escape(b), b)
+            self.assertMatch(b'[' + re.escape(b) + b']', b)
+            self.assertMatch(b'(?x)' + re.escape(b), b)
          self.assertMatch(re.escape(p), p)
+        for i in b'-.]{}':
+            b = bytes([i])
+            self.assertEqual(re.escape(b)[:1], b'\\')
+        literal_chars = self.LITERAL_CHARS.encode('ascii')
+        self.assertEqual(re.escape(literal_chars), literal_chars)
  
      def test_re_escape_non_ascii(self):
          s = 'xxx\u2620\u2620\u2620xxx'
          s_escaped = re.escape(s)
-        self.assertEqual(s_escaped, 'xxx\\\u2620\\\u2620\\\u2620xxx')
+        self.assertEqual(s_escaped, s)
          self.assertMatch(s_escaped, s)
          self.assertMatch('.%s+.' % re.escape('\u2620'), s,
                           'x\u2620\u2620\u2620x', (2, 7), re.search)
@@ -955,7 +956,7 @@ class ReTests(unittest.TestCase):
      def test_re_escape_non_ascii_bytes(self):
          b = 'y\u2620y\u2620y'.encode('utf-8')
          b_escaped = re.escape(b)
-        self.assertEqual(b_escaped, b'y\\\xe2\\\x98\\\xa0y\\\xe2\\\x98\\\xa0y')
+        self.assertEqual(b_escaped, b)
          self.assertMatch(b_escaped, b)
          res = re.findall(re.escape('\u2620'.encode('utf-8')), b)
          self.assertEqual(len(res), 2)
diff --git a/Misc/NEWS b/Misc/NEWS

index 440f0b25cb682b4545bebb3d5ade6321f9786fac..ec85455007730705f5292f8c2683983b2cb147c9 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -320,6 +320,8 @@ Library
  - bpo-29998: Pickling and copying ImportError now preserves name and path
    attributes.
  
+- bpo-29995: re.escape() now escapes only regex special characters.
+
  - bpo-29962: Add math.remainder operation, implementing remainder
    as specified in IEEE 754.
author	Serhiy Storchaka <storchaka@gmail.com>
	Thu, 13 Apr 2017 18:06:43 +0000 (21:06 +0300)
committer	GitHub <noreply@github.com>
	Thu, 13 Apr 2017 18:06:43 +0000 (21:06 +0300)
Doc/library/re.rst		patch | blob | history
Doc/tools/susp-ignored.csv		patch | blob | history
Lib/idlelib/idle_test/test_replace.py		patch | blob | history
Lib/re.py		patch | blob | history
Lib/test/test_re.py		patch | blob | history
Misc/NEWS		patch | blob | history