Applying modified version of patch #1018386, which fixes escape-handling bugs in SRE (Lib/sre_parse.py).

author Gustavo Niemeyer <gustavo@niemeyer.net>

Fri, 3 Sep 2004 17:06:10 +0000 (17:06 +0000)

committer Gustavo Niemeyer <gustavo@niemeyer.net>

Fri, 3 Sep 2004 17:06:10 +0000 (17:06 +0000)
author Gustavo Niemeyer <gustavo@niemeyer.net>
Fri, 3 Sep 2004 17:06:10 +0000 (17:06 +0000)
committer Gustavo Niemeyer <gustavo@niemeyer.net>
Fri, 3 Sep 2004 17:06:10 +0000 (17:06 +0000)
diff --git a/Doc/lib/libre.tex b/Doc/lib/libre.tex

index 7d59ee51428937b756f62251bcaf96fce5a9ca83..704db30a3fb876650bad43d59bdc90c185c94f78 100644 (file)
--- a/Doc/lib/libre.tex
+++ b/Doc/lib/libre.tex
@@ -387,7 +387,8 @@ also accepted by the regular expression parser:
  
  Octal escapes are included in a limited form: If the first digit is a
  0, or if there are three octal digits, it is considered an octal
-escape. Otherwise, it is a group reference.
+escape. Otherwise, it is a group reference.  As for string literals,
+octal escapes are always at most three digits in length.
  
  
  % Note the lack of a period in the section title; it causes problems
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py

index 5c4298af1d65db1b2b7d66c343b54b3289fa3bb9..3e27145074e00d9d857c669f2ee3dbb3a58da413 100644 (file)
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py
@@ -217,21 +217,11 @@ def isname(name):
      # check that group name is a valid string
      if not isident(name[0]):
          return False
-    for char in name:
+    for char in name[1:]:
          if not isident(char) and not isdigit(char):
              return False
      return True
  
-def _group(escape, groups):
-    # check if the escape string represents a valid group
-    try:
-        gid = int(escape[1:])
-        if gid and gid < groups:
-            return gid
-    except ValueError:
-        pass
-    return None # not a valid group
-
  def _class_escape(source, escape):
      # handle escape code inside character class
      code = ESCAPES.get(escape)
@@ -241,7 +231,8 @@ def _class_escape(source, escape):
      if code:
          return code
      try:
-        if escape[1:2] == "x":
+        c = escape[1:2]
+        if c == "x":
              # hexadecimal escape (exactly two digits)
              while source.next in HEXDIGITS and len(escape) < 4:
                  escape = escape + source.get()
@@ -249,12 +240,14 @@ def _class_escape(source, escape):
              if len(escape) != 2:
                  raise error, "bogus escape: %s" % repr("\\" + escape)
              return LITERAL, int(escape, 16) & 0xff
-        elif escape[1:2] in OCTDIGITS:
+        elif c in OCTDIGITS:
              # octal escape (up to three digits)
-            while source.next in OCTDIGITS and len(escape) < 5:
+            while source.next in OCTDIGITS and len(escape) < 4:
                  escape = escape + source.get()
              escape = escape[1:]
              return LITERAL, int(escape, 8) & 0xff
+        elif c in DIGITS:
+            raise error, "bogus escape: %s" % repr(escape)
          if len(escape) == 2:
              return LITERAL, ord(escape[1])
      except ValueError:
@@ -270,19 +263,20 @@ def _escape(source, escape, state):
      if code:
          return code
      try:
-        if escape[1:2] == "x":
+        c = escape[1:2]
+        if c == "x":
              # hexadecimal escape
              while source.next in HEXDIGITS and len(escape) < 4:
                  escape = escape + source.get()
              if len(escape) != 4:
                  raise ValueError
              return LITERAL, int(escape[2:], 16) & 0xff
-        elif escape[1:2] == "0":
+        elif c == "0":
              # octal escape
              while source.next in OCTDIGITS and len(escape) < 4:
                  escape = escape + source.get()
              return LITERAL, int(escape[1:], 8) & 0xff
-        elif escape[1:2] in DIGITS:
+        elif c in DIGITS:
              # octal escape *or* decimal group reference (sigh)
              if source.next in DIGITS:
                  escape = escape + source.get()
@@ -291,9 +285,9 @@ def _escape(source, escape, state):
                      # got three octal digits; this is an octal escape
                      escape = escape + source.get()
                      return LITERAL, int(escape[1:], 8) & 0xff
-            # got at least one decimal digit; this is a group reference
-            group = _group(escape, state.groups)
-            if group:
+            # not an octal escape, so this is a group reference
+            group = int(escape[1:])
+            if group < state.groups:
                  if not state.checkgroup(group):
                      raise error, "cannot refer to open group"
                  return GROUPREF, group
@@ -709,7 +703,8 @@ def parse_template(source, pattern):
              break # end of replacement string
          if this and this[0] == "\\":
              # group
-            if this == "\\g":
+            c = this[1:2]
+            if c == "g":
                  name = ""
                  if s.match("<"):
                      while 1:
@@ -723,6 +718,8 @@ def parse_template(source, pattern):
                      raise error, "bad group name"
                  try:
                      index = int(name)
+                    if index < 0:
+                        raise error, "negative group number"
                  except ValueError:
                      if not isname(name):
                          raise error, "bad character in group name"
@@ -731,26 +728,23 @@ def parse_template(source, pattern):
                      except KeyError:
                          raise IndexError, "unknown group name"
                  a((MARK, index))
-            elif len(this) > 1 and this[1] in DIGITS:
-                code = None
-                while 1:
-                    group = _group(this, pattern.groups+1)
-                    if group:
-                        if (s.next not in DIGITS or
-                            not _group(this + s.next, pattern.groups+1)):
-                            code = MARK, group
-                            break
-                    elif s.next in OCTDIGITS:
+            elif c == "0":
+                if s.next in OCTDIGITS:
+                    this = this + sget()
+                    if s.next in OCTDIGITS:
                          this = this + sget()
-                    else:
-                        break
-                if not code:
-                    this = this[1:]
-                    code = LITERAL, makechar(int(this[-6:], 8) & 0xff)
-                if code[0] is LITERAL:
-                    literal(code[1])
-                else:
-                    a(code)
+                literal(makechar(int(this[1:], 8) & 0xff))
+            elif c in DIGITS:
+                isoctal = False
+                if s.next in DIGITS:
+                    this = this + sget()
+                    if (c in OCTDIGITS and s.next in OCTDIGITS and
+                        this[2] in OCTDIGITS):
+                        this = this + sget()
+                        isoctal = True
+                        literal(makechar(int(this[1:], 8) & 0xff))
+                if not isoctal:
+                    a((MARK, int(this[1:])))
              else:
                  try:
                      this = makechar(ESCAPES[this][1])
@@ -782,7 +776,7 @@ def expand_template(template, match):
          for index, group in groups:
              literals[index] = s = g(group)
              if s is None:
-                raise IndexError
+                raise error, "unmatched group"
      except IndexError:
-        raise error, "empty group"
+        raise error, "invalid group reference"
      return sep.join(literals)
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py

index c7afdc59a2a036ab748d2c0e4762ccba708ca402..8f66ae918c74ecf04db02aa6845965ee11d51f01 100644 (file)
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -83,6 +83,48 @@ class ReTests(unittest.TestCase):
          self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
                           'abc\ndef\n')
  
+    def test_sub_template_numeric_escape(self):
+        # bug 776311 and friends
+        self.assertEqual(re.sub('x', r'\0', 'x'), '\0')
+        self.assertEqual(re.sub('x', r'\000', 'x'), '\000')
+        self.assertEqual(re.sub('x', r'\001', 'x'), '\001')
+        self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8')
+        self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9')
+        self.assertEqual(re.sub('x', r'\111', 'x'), '\111')
+        self.assertEqual(re.sub('x', r'\117', 'x'), '\117')
+
+        self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111')
+        self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1')
+
+        self.assertEqual(re.sub('x', r'\00', 'x'), '\x00')
+        self.assertEqual(re.sub('x', r'\07', 'x'), '\x07')
+        self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8')
+        self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9')
+        self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a')
+
+        self.assertEqual(re.sub('x', r'\400', 'x'), '\0')
+        self.assertEqual(re.sub('x', r'\777', 'x'), '\377')
+        
+        self.assertRaises(re.error, re.sub, 'x', r'\1', 'x')
+        self.assertRaises(re.error, re.sub, 'x', r'\8', 'x')
+        self.assertRaises(re.error, re.sub, 'x', r'\9', 'x')
+        self.assertRaises(re.error, re.sub, 'x', r'\11', 'x')
+        self.assertRaises(re.error, re.sub, 'x', r'\18', 'x')
+        self.assertRaises(re.error, re.sub, 'x', r'\1a', 'x')
+        self.assertRaises(re.error, re.sub, 'x', r'\90', 'x')
+        self.assertRaises(re.error, re.sub, 'x', r'\99', 'x')
+        self.assertRaises(re.error, re.sub, 'x', r'\118', 'x') # r'\11' + '8'
+        self.assertRaises(re.error, re.sub, 'x', r'\11a', 'x')
+        self.assertRaises(re.error, re.sub, 'x', r'\181', 'x') # r'\18' + '1'
+        self.assertRaises(re.error, re.sub, 'x', r'\800', 'x') # r'\80' + '0'
+
+        # in python2.3 (etc), these loop endlessly in sre_parser.py
+        self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x')
+        self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'),
+                         'xz8')
+        self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'),
+                         'xza')
+
      def test_qualified_re_sub(self):
          self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
          self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa')
@@ -105,6 +147,7 @@ class ReTests(unittest.TestCase):
          self.assertRaises(IndexError, re.sub, '(?P<a>x)', '\g<ab>', 'xx')
          self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\g<b>', 'xx')
          self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\\2', 'xx')
+        self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<-1>', 'xx')
  
      def test_re_subn(self):
          self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
@@ -386,6 +429,16 @@ class ReTests(unittest.TestCase):
              self.assertNotEqual(re.match(r"\x%02xz" % i, chr(i)+"z"), None)
          self.assertRaises(re.error, re.match, "\911", "")
  
+    def test_sre_character_class_literals(self):
+        for i in [0, 8, 16, 32, 64, 127, 128, 255]:
+            self.assertNotEqual(re.match(r"[\%03o]" % i, chr(i)), None)
+            self.assertNotEqual(re.match(r"[\%03o0]" % i, chr(i)), None)
+            self.assertNotEqual(re.match(r"[\%03o8]" % i, chr(i)), None)
+            self.assertNotEqual(re.match(r"[\x%02x]" % i, chr(i)), None)
+            self.assertNotEqual(re.match(r"[\x%02x0]" % i, chr(i)), None)
+            self.assertNotEqual(re.match(r"[\x%02xz]" % i, chr(i)), None)
+        self.assertRaises(re.error, re.match, "[\911]", "")
+
      def test_bug_113254(self):
          self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
          self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
author	Gustavo Niemeyer <gustavo@niemeyer.net>
	Fri, 3 Sep 2004 17:06:10 +0000 (17:06 +0000)
committer	Gustavo Niemeyer <gustavo@niemeyer.net>
	Fri, 3 Sep 2004 17:06:10 +0000 (17:06 +0000)
Doc/lib/libre.tex		patch \| blob \| history
Lib/sre_parse.py		patch \| blob \| history
Lib/test/test_re.py		patch \| blob \| history