granicus.if.org Git - python/commitdiff
- fixed split behaviour on empty matches
author: Fredrik Lundh <fredrik@pythonware.com>
Fri, 30 Jun 2000 00:27:46 +0000 (00:27 +0000)
committer: Fredrik Lundh <fredrik@pythonware.com>
Fri, 30 Jun 2000 00:27:46 +0000 (00:27 +0000)
- fixed compiler problems when using locale/unicode flags

- fixed group/octal code parsing in sub/subn templates

Lib/sre.py
Lib/sre_compile.py
Lib/sre_parse.py
Modules/_sre.c

index 49e3140bd4f06d5db723fe48ba8df68c6ab81fdb..d5bb462e7bcd740c54b00ae979b43a01a45f7bf0 100644 (file)
@@ -109,16 +109,13 @@ def _subn(pattern, template, string, count=0):
         m = c.search()
         if not m:
             break
-        j = m.start()
-        if j > i:
-            append(string[i:j])
+       b, e = m.span()
+        if i < b:
+            append(string[i:b])
         append(filter(m))
-        i = m.end()
-       if i <= j:
-           break
+       i = e
         n = n + 1
-    if i < len(string):
-        append(string[i:])
+    append(string[i:])
     return string[:0].join(s), n
 
 def _split(pattern, string, maxsplit=0):
@@ -128,7 +125,7 @@ def _split(pattern, string, maxsplit=0):
     append = s.append
     extend = s.extend
     c = pattern.scanner(string)
-    g = c.groups
+    g = pattern.groups
     while not maxsplit or n < maxsplit:
         m = c.search()
         if not m:
index 344dc29113fcb02ade9fcd9e91aaf0eba5c21731..ea5f5bca3ddea53325bf0e21ee3a818eba22ee01 100644 (file)
@@ -61,9 +61,9 @@ def _compile(code, pattern, flags):
        elif op is CATEGORY:
            emit(OPCODES[op])
            if flags & SRE_FLAG_LOCALE:
-               emit(CH_LOCALE[CHCODES[av]])
+               emit(CHCODES[CH_LOCALE[av]])
            elif flags & SRE_FLAG_UNICODE:
-               emit(CH_UNICODE[CHCODES[av]])
+               emit(CHCODES[CH_UNICODE[av]])
            else:
                emit(CHCODES[av])
        elif op is GROUP:
@@ -92,9 +92,9 @@ def _compile(code, pattern, flags):
                    emit(fixup(av[1]))
                elif op is CATEGORY:
                    if flags & SRE_FLAG_LOCALE:
-                       emit(CH_LOCALE[CHCODES[av]])
+                       emit(CHCODES[CH_LOCALE[av]])
                    elif flags & SRE_FLAG_UNICODE:
-                       emit(CH_UNICODE[CHCODES[av]])
+                       emit(CHCODES[CH_UNICODE[av]])
                    else:
                        emit(CHCODES[av])
                else:
index 93a7b5dc997b1fe3e3fe2369c8974a518ad18571..ec934fe6b49a563deff54585831c088f3acc324e 100644 (file)
@@ -30,26 +30,27 @@ HEXDIGITS = tuple("0123456789abcdefABCDEF")
 WHITESPACE = string.whitespace
 
 ESCAPES = {
-    "\\a": (LITERAL, chr(7)),
-    "\\b": (LITERAL, chr(8)),
-    "\\f": (LITERAL, chr(12)),
-    "\\n": (LITERAL, chr(10)),
-    "\\r": (LITERAL, chr(13)),
-    "\\t": (LITERAL, chr(9)),
-    "\\v": (LITERAL, chr(11))
+    r"\a": (LITERAL, chr(7)),
+    r"\b": (LITERAL, chr(8)),
+    r"\f": (LITERAL, chr(12)),
+    r"\n": (LITERAL, chr(10)),
+    r"\r": (LITERAL, chr(13)),
+    r"\t": (LITERAL, chr(9)),
+    r"\v": (LITERAL, chr(11)),
+    r"\\": (LITERAL, "\\")
 }
 
 CATEGORIES = {
-    "\\A": (AT, AT_BEGINNING), # start of string
-    "\\b": (AT, AT_BOUNDARY),
-    "\\B": (AT, AT_NON_BOUNDARY),
-    "\\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
-    "\\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
-    "\\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
-    "\\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
-    "\\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
-    "\\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
-    "\\Z": (AT, AT_END), # end of string
+    r"\A": (AT, AT_BEGINNING), # start of string
+    r"\b": (AT, AT_BOUNDARY),
+    r"\B": (AT, AT_NON_BOUNDARY),
+    r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
+    r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
+    r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
+    r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
+    r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
+    r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
+    r"\Z": (AT, AT_END), # end of string
 }
 
 FLAGS = {
@@ -185,11 +186,11 @@ def isname(name):
            return 0
     return 1
 
-def _group(escape, state):
+def _group(escape, groups):
     # check if the escape string represents a valid group
     try:
        group = int(escape[1:])
-       if group and group < state.groups:
+       if group and group < groups:
            return group
     except ValueError:
        pass
@@ -239,10 +240,10 @@ def _escape(source, escape, state):
            return LITERAL, chr(int(escape[-4:], 16) & 0xff)
        elif escape[1:2] in DIGITS:
            while 1:
-               group = _group(escape, state)
+               group = _group(escape, state.groups)
                if group:
                    if (not source.next or
-                       not _group(escape + source.next, state)):
+                       not _group(escape + source.next, state.groups)):
                        return GROUP, group
                    escape = escape + source.get()
                elif source.next in OCTDIGITS:
@@ -534,6 +535,7 @@ def parse_template(source, pattern):
        if this is None:
            break # end of replacement string
        if this and this[0] == "\\":
+           # group
            if this == "\\g":
                name = ""
                if s.match("<"):
@@ -557,15 +559,29 @@ def parse_template(source, pattern):
                        raise IndexError, "unknown group name"
                a((MARK, index))
            elif len(this) > 1 and this[1] in DIGITS:
-               while s.next in DIGITS:
-                   this = this + s.get()
-               a((MARK, int(this[1:])))
+               code = None
+               while 1:
+                   group = _group(this, pattern.groups+1)
+                   if group:
+                       if (not s.next or
+                           not _group(this + s.next, pattern.groups+1)):
+                           code = MARK, int(group)
+                           break
+                   elif s.next in OCTDIGITS:
+                       this = this + s.get()
+                   else:
+                       break
+               if not code:
+                   this = this[1:]
+                   # FIXME: support unicode characters!
+                   code = LITERAL, chr(int(this[-6:], 8) & 0xff)
+               a(code)
            else:
                try:
                    a(ESCAPES[this])
                except KeyError:
-                   for char in this:
-                       a((LITERAL, char))
+                   for c in this:
+                       a((LITERAL, c))
        else:
            a((LITERAL, this))
     return p
index 6b0fa61a7080cf7207cb778e28573ce31b620971..7b1adbd177ff6ed829698427be9db86eab41321c 100644 (file)
@@ -1534,6 +1534,9 @@ pattern_getattr(PatternObject* self, char* name)
     if (!strcmp(name, "flags"))
                return Py_BuildValue("i", self->flags);
 
+    if (!strcmp(name, "groups"))
+               return Py_BuildValue("i", self->groups);
+
        if (!strcmp(name, "groupindex") && self->groupindex) {
         Py_INCREF(self->groupindex);
                return self->groupindex;
@@ -1939,9 +1942,6 @@ scanner_getattr(ScannerObject* self, char* name)
                return self->pattern;
     }
 
-    if (!strcmp(name, "groups"))
-               return Py_BuildValue("i", ((PatternObject*) self->pattern)->groups);
-
        PyErr_SetString(PyExc_AttributeError, name);
        return NULL;
 }