bpo-30688: Support \N{name} escapes in re patterns. (GH-5588)

author Serhiy Storchaka <storchaka@gmail.com>

Fri, 9 Feb 2018 22:08:17 +0000 (00:08 +0200)

committer GitHub <noreply@github.com>

Fri, 9 Feb 2018 22:08:17 +0000 (00:08 +0200)
author Serhiy Storchaka <storchaka@gmail.com>
Fri, 9 Feb 2018 22:08:17 +0000 (00:08 +0200)
committer GitHub <noreply@github.com>
Fri, 9 Feb 2018 22:08:17 +0000 (00:08 +0200)
diff --git a/Doc/library/re.rst b/Doc/library/re.rst

index 83ebe7db01ad96490830b82a4123bf9df79abf56..475a8d285550f27826e06311eb20379f71213aea 100644 (file)
--- a/Doc/library/re.rst
+++ b/Doc/library/re.rst
@@ -468,13 +468,13 @@ Most of the standard escapes supported by Python string literals are also
  accepted by the regular expression parser::
  
     \a      \b      \f      \n
-   \r      \t      \u      \U
-   \v      \x      \\
+   \N      \r      \t      \u
+   \U      \v      \x      \\
  
  (Note that ``\b`` is used to represent word boundaries, and means "backspace"
  only inside character classes.)
  
-``'\u'`` and ``'\U'`` escape sequences are only recognized in Unicode
+``'\u'``, ``'\U'``, and ``'\N'`` escape sequences are only recognized in Unicode
  patterns.  In bytes patterns they are errors.
  
  Octal escapes are included in a limited form.  If the first digit is a 0, or if
@@ -488,6 +488,9 @@ three digits in length.
  .. versionchanged:: 3.6
     Unknown escapes consisting of ``'\'`` and an ASCII letter now are errors.
  
+.. versionchanged:: 3.8
+   The ``'\N{name}'`` escape sequence has been added. As in string literals,
+   it expands to the named Unicode character (e.g. ``'\N{EM DASH}'``).
  
  .. seealso::
  
diff --git a/Doc/whatsnew/3.8.rst b/Doc/whatsnew/3.8.rst

index 60f54a0561e54efc5a569c78edab635155255b80..41819815e399437662f0bea8753b612c1a96e99f 100644 (file)
--- a/Doc/whatsnew/3.8.rst
+++ b/Doc/whatsnew/3.8.rst
@@ -75,6 +75,8 @@ New Features
  Other Language Changes
  ======================
  
+* Added support for ``\N{name}`` escapes in :mod:`regular expressions <re>`.
+  (Contributed by Jonathan Eunice and Serhiy Storchaka in :issue:`30688`.)
  
  
  New Modules
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py

index a53735b07ded420ca2bc9fa636f40b9eb2241500..db01e844b43249abf974216a4ecd04af0c2e8ed8 100644 (file)
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py
@@ -13,6 +13,7 @@
  # XXX: show string offset and offending character for all errors
  
  from sre_constants import *
+import unicodedata
  
  SPECIAL_CHARS = ".\\[{()*+?^$|"
  REPEAT_CHARS = "*+?{"
@@ -264,19 +265,19 @@ class Tokenizer:
              result += c
              self.__next()
          return result
-    def getuntil(self, terminator):
+    def getuntil(self, terminator, name):
          result = ''
          while True:
              c = self.next
              self.__next()
              if c is None:
                  if not result:
-                    raise self.error("missing group name")
+                    raise self.error("missing " + name)
                  raise self.error("missing %s, unterminated name" % terminator,
                                   len(result))
              if c == terminator:
                  if not result:
-                    raise self.error("missing group name", 1)
+                    raise self.error("missing " + name, 1)
                  break
              result += c
          return result
@@ -322,6 +323,17 @@ def _class_escape(source, escape):
              c = int(escape[2:], 16)
              chr(c) # raise ValueError for invalid code
              return LITERAL, c
+        elif c == "N" and source.istext:
+            # named unicode escape e.g. \N{EM DASH}
+            if not source.match('{'):
+                raise source.error("missing {")
+            charname = source.getuntil('}', 'character name')
+            try:
+                c = ord(unicodedata.lookup(charname))
+            except KeyError:
+                raise source.error("undefined character name %r" % charname,
+                                   len(charname) + len(r'\N{}'))
+            return LITERAL, c
          elif c in OCTDIGITS:
              # octal escape (up to three digits)
              escape += source.getwhile(2, OCTDIGITS)
@@ -370,6 +382,17 @@ def _escape(source, escape, state):
              c = int(escape[2:], 16)
              chr(c) # raise ValueError for invalid code
              return LITERAL, c
+        elif c == "N" and source.istext:
+            # named unicode escape e.g. \N{EM DASH}
+            if not source.match('{'):
+                raise source.error("missing {")
+            charname = source.getuntil('}', 'character name')
+            try:
+                c = ord(unicodedata.lookup(charname))
+            except KeyError:
+                raise source.error("undefined character name %r" % charname,
+                                   len(charname) + len(r'\N{}'))
+            return LITERAL, c
          elif c == "0":
              # octal escape
              escape += source.getwhile(2, OCTDIGITS)
@@ -679,13 +702,13 @@ def _parse(source, state, verbose, nested, first=False):
                      # python extensions
                      if sourcematch("<"):
                          # named group: skip forward to end of name
-                        name = source.getuntil(">")
+                        name = source.getuntil(">", "group name")
                          if not name.isidentifier():
                              msg = "bad character in group name %r" % name
                              raise source.error(msg, len(name) + 1)
                      elif sourcematch("="):
                          # named backreference
-                        name = source.getuntil(")")
+                        name = source.getuntil(")", "group name")
                          if not name.isidentifier():
                              msg = "bad character in group name %r" % name
                              raise source.error(msg, len(name) + 1)
@@ -748,7 +771,7 @@ def _parse(source, state, verbose, nested, first=False):
  
                  elif char == "(":
                      # conditional backreference group
-                    condname = source.getuntil(")")
+                    condname = source.getuntil(")", "group name")
                      if condname.isidentifier():
                          condgroup = state.groupdict.get(condname)
                          if condgroup is None:
@@ -977,7 +1000,7 @@ def parse_template(source, pattern):
                  name = ""
                  if not s.match("<"):
                      raise s.error("missing <")
-                name = s.getuntil(">")
+                name = s.getuntil(">", "group name")
                  if name.isidentifier():
                      try:
                          index = groupindex[name]
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py

index 9fed4bef8809fcc2029a76842ec10277a4f01266..ab1d985d59f87872a0df9d1f7c13aa54ead00bf0 100644 (file)
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -694,6 +694,42 @@ class ReTests(unittest.TestCase):
              with self.subTest(c):
                  self.assertRaises(re.error, re.compile, '[\\%c]' % c)
  
+    def test_named_unicode_escapes(self):
+        # test individual Unicode named escapes
+        self.assertTrue(re.match(r'\N{LESS-THAN SIGN}', '<'))
+        self.assertTrue(re.match(r'\N{less-than sign}', '<'))
+        self.assertIsNone(re.match(r'\N{LESS-THAN SIGN}', '>'))
+        self.assertTrue(re.match(r'\N{SNAKE}', '\U0001f40d'))
+        self.assertTrue(re.match(r'\N{ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH '
+                                 r'HAMZA ABOVE WITH ALEF MAKSURA ISOLATED FORM}',
+                                 '\ufbf9'))
+        self.assertTrue(re.match(r'[\N{LESS-THAN SIGN}-\N{GREATER-THAN SIGN}]',
+                                 '='))
+        self.assertIsNone(re.match(r'[\N{LESS-THAN SIGN}-\N{GREATER-THAN SIGN}]',
+                                   ';'))
+
+        # test errors in \N{name} handling - only valid names should pass
+        self.checkPatternError(r'\N', 'missing {', 2)
+        self.checkPatternError(r'[\N]', 'missing {', 3)
+        self.checkPatternError(r'\N{', 'missing character name', 3)
+        self.checkPatternError(r'[\N{', 'missing character name', 4)
+        self.checkPatternError(r'\N{}', 'missing character name', 3)
+        self.checkPatternError(r'[\N{}]', 'missing character name', 4)
+        self.checkPatternError(r'\NSNAKE}', 'missing {', 2)
+        self.checkPatternError(r'[\NSNAKE}]', 'missing {', 3)
+        self.checkPatternError(r'\N{SNAKE',
+                               'missing }, unterminated name', 3)
+        self.checkPatternError(r'[\N{SNAKE]',
+                               'missing }, unterminated name', 4)
+        self.checkPatternError(r'[\N{SNAKE]}',
+                               "undefined character name 'SNAKE]'", 1)
+        self.checkPatternError(r'\N{SPAM}',
+                               "undefined character name 'SPAM'", 0)
+        self.checkPatternError(r'[\N{SPAM}]',
+                               "undefined character name 'SPAM'", 1)
+        self.checkPatternError(br'\N{LESS-THAN SIGN}', r'bad escape \N', 0)
+        self.checkPatternError(br'[\N{LESS-THAN SIGN}]', r'bad escape \N', 1)
+
      def test_string_boundaries(self):
          # See http://bugs.python.org/issue10713
          self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
diff --git a/Misc/ACKS b/Misc/ACKS

index b31190ca52833e02aa67fb2d67df0f2240c50fa7..ea1d9418870aa9a555cbc5e789cae44d6a44f9bb 100644 (file)
--- a/Misc/ACKS
+++ b/Misc/ACKS
@@ -441,6 +441,7 @@ Andy Eskilsson
  André Espaze
  Stefan Esser
  Nicolas Estibals
+Jonathan Eunice
  Carey Evans
  Stephen D Evans
  Tim Everett
diff --git a/Misc/NEWS.d/next/Library/2018-02-08-18-59-11.bpo-30688.zBh4TH.rst b/Misc/NEWS.d/next/Library/2018-02-08-18-59-11.bpo-30688.zBh4TH.rst

new file mode 100644 (file)

index 0000000..7d31680
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2018-02-08-18-59-11.bpo-30688.zBh4TH.rst
@@ -0,0 +1,2 @@
+Added support for ``\N{name}`` escapes in regular expressions.  Based on
+a patch by Jonathan Eunice.
author	Serhiy Storchaka <storchaka@gmail.com>
	Fri, 9 Feb 2018 22:08:17 +0000 (00:08 +0200)
committer	GitHub <noreply@github.com>
	Fri, 9 Feb 2018 22:08:17 +0000 (00:08 +0200)
Doc/library/re.rst		patch \| blob \| history
Doc/whatsnew/3.8.rst		patch \| blob \| history
Lib/sre_parse.py		patch \| blob \| history
Lib/test/test_re.py		patch \| blob \| history
Misc/ACKS		patch \| blob \| history
Misc/NEWS.d/next/Library/2018-02-08-18-59-11.bpo-30688.zBh4TH.rst	[new file with mode: 0644]	patch \| blob