]> granicus.if.org Git - python/commitdiff
- fixed split
authorFredrik Lundh <fredrik@pythonware.com>
Thu, 29 Jun 2000 16:57:40 +0000 (16:57 +0000)
committerFredrik Lundh <fredrik@pythonware.com>
Thu, 29 Jun 2000 16:57:40 +0000 (16:57 +0000)
  (test_sre still complains about split, but that's caused by
  the group reset bug, not split itself)

- added more mark slots
  (should be dynamically allocated, but 100 is better than 32.
  and checking for the upper limit is better than overwriting
  the memory ;-)

- internal: renamed the cursor helper class

- internal: removed some bloat from sre_compile

Lib/sre.py
Lib/sre_compile.py
Modules/_sre.c
Modules/sre.h

index 32b3e8fe292470b63758fdf7542aa3054f5847ef..e0a51e345bcec2af9ed35ac861d6d79805a3a704 100644 (file)
@@ -26,7 +26,7 @@ T = TEMPLATE = sre_compile.SRE_FLAG_TEMPLATE
 U = UNICODE = sre_compile.SRE_FLAG_UNICODE
 
 # sre exception
-error = sre_parse.error
+error = sre_compile.error
 
 # --------------------------------------------------------------------
 # public interface
@@ -105,7 +105,7 @@ def _subn(pattern, template, string, count=0):
     n = i = 0
     s = []
     append = s.append
-    c = pattern.cursor(string)
+    c = pattern.scanner(string)
     while not count or n < count:
         m = c.search()
         if not m:
@@ -127,16 +127,20 @@ def _split(pattern, string, maxsplit=0):
     n = i = 0
     s = []
     append = s.append
-    c = pattern.cursor(string)
+    extend = s.extend
+    c = pattern.scanner(string)
+    g = c.groups
     while not maxsplit or n < maxsplit:
         m = c.search()
         if not m:
             break
-        j = m.start()
-        append(string[i:j])
-        i = m.end()
-       if i <= j:
-           break
+       b, e = m.span()
+       if e == i:
+           continue
+        append(string[i:b])
+       if g and b != e:
+           extend(m.groups())
+       i = e
         n = n + 1
     if i < len(string):
         append(string[i:])
index 2d7c0212733e6e23c0f7cd54339158b24f79b0c1..a51531b4980d1e75f11fb03f5017770557659697 100644 (file)
@@ -11,8 +11,7 @@
 # other compatibility work.
 #
 
-import array, string, sys
-
+import array
 import _sre
 
 from sre_constants import *
@@ -24,123 +23,101 @@ for WORDSIZE in "BHil":
 else:
     raise RuntimeError, "cannot find a useable array type"
 
-# FIXME: <fl> should move some optimizations from the parser to here!
-
-class Code:
-    def __init__(self):
-       self.data = []
-    def __len__(self):
-       return len(self.data)
-    def __getitem__(self, index):
-       return self.data[index]
-    def __setitem__(self, index, code):
-       self.data[index] = code
-    def append(self, code):
-       self.data.append(code)
-    def todata(self):
-       # print self.data
-       try:
-           return array.array(WORDSIZE, self.data).tostring()
-       except OverflowError:
-           print self.data
-           raise
-
 def _compile(code, pattern, flags):
-    append = code.append
+    emit = code.append
     for op, av in pattern:
        if op is ANY:
            if flags & SRE_FLAG_DOTALL:
-               append(OPCODES[op]) # any character at all!
+               emit(OPCODES[op])
            else:
-               append(OPCODES[CATEGORY])
-               append(CHCODES[CATEGORY_NOT_LINEBREAK])
+               emit(OPCODES[CATEGORY])
+               emit(CHCODES[CATEGORY_NOT_LINEBREAK])
        elif op in (SUCCESS, FAILURE):
-           append(OPCODES[op])
+           emit(OPCODES[op])
        elif op is AT:
-           append(OPCODES[op])
+           emit(OPCODES[op])
            if flags & SRE_FLAG_MULTILINE:
-               append(ATCODES[AT_MULTILINE[av]])
+               emit(ATCODES[AT_MULTILINE[av]])
            else:
-               append(ATCODES[av])
+               emit(ATCODES[av])
        elif op is BRANCH:
-           append(OPCODES[op])
+           emit(OPCODES[op])
            tail = []
            for av in av[1]:
-               skip = len(code); append(0)
+               skip = len(code); emit(0)
                _compile(code, av, flags)
-##             append(OPCODES[SUCCESS])
-               append(OPCODES[JUMP])
-               tail.append(len(code)); append(0)
+               emit(OPCODES[JUMP])
+               tail.append(len(code)); emit(0)
                code[skip] = len(code) - skip
-           append(0) # end of branch
+           emit(0) # end of branch
            for tail in tail:
                code[tail] = len(code) - tail
        elif op is CALL:
-           append(OPCODES[op])
-           skip = len(code); append(0)
+           emit(OPCODES[op])
+           skip = len(code); emit(0)
            _compile(code, av, flags)
-           append(OPCODES[SUCCESS])
+           emit(OPCODES[SUCCESS])
            code[skip] = len(code) - skip
        elif op is CATEGORY:
-           append(OPCODES[op])
+           emit(OPCODES[op])
            if flags & SRE_FLAG_LOCALE:
-               append(CH_LOCALE[CHCODES[av]])
+               emit(CH_LOCALE[CHCODES[av]])
            elif flags & SRE_FLAG_UNICODE:
-               append(CH_UNICODE[CHCODES[av]])
+               emit(CH_UNICODE[CHCODES[av]])
            else:
-               append(CHCODES[av])
+               emit(CHCODES[av])
        elif op is GROUP:
            if flags & SRE_FLAG_IGNORECASE:
-               append(OPCODES[OP_IGNORE[op]])
+               emit(OPCODES[OP_IGNORE[op]])
            else:
-               append(OPCODES[op])
-           append(av-1)
+               emit(OPCODES[op])
+           emit(av-1)
        elif op is IN:
            if flags & SRE_FLAG_IGNORECASE:
-               append(OPCODES[OP_IGNORE[op]])
+               emit(OPCODES[OP_IGNORE[op]])
                def fixup(literal, flags=flags):
                    return _sre.getlower(ord(literal), flags)
            else:
-               append(OPCODES[op])
+               emit(OPCODES[op])
                fixup = ord
-           skip = len(code); append(0)
+           skip = len(code); emit(0)
            for op, av in av:
-               append(OPCODES[op])
+               emit(OPCODES[op])
                if op is NEGATE:
                    pass
                elif op is LITERAL:
-                   append(fixup(av))
+                   emit(fixup(av))
                elif op is RANGE:
-                   append(fixup(av[0]))
-                   append(fixup(av[1]))
+                   emit(fixup(av[0]))
+                   emit(fixup(av[1]))
                elif op is CATEGORY:
                    if flags & SRE_FLAG_LOCALE:
-                       append(CH_LOCALE[CHCODES[av]])
+                       emit(CH_LOCALE[CHCODES[av]])
                    elif flags & SRE_FLAG_UNICODE:
-                       append(CH_UNICODE[CHCODES[av]])
+                       emit(CH_UNICODE[CHCODES[av]])
                    else:
-                       append(CHCODES[av])
+                       emit(CHCODES[av])
                else:
-                   raise ValueError, "unsupported set operator"
-           append(OPCODES[FAILURE])
+                   raise error, "internal: unsupported set operator"
+           emit(OPCODES[FAILURE])
            code[skip] = len(code) - skip
        elif op in (LITERAL, NOT_LITERAL):
            if flags & SRE_FLAG_IGNORECASE:
-               append(OPCODES[OP_IGNORE[op]])
+               emit(OPCODES[OP_IGNORE[op]])
            else:
-               append(OPCODES[op])
-           append(ord(av))
+               emit(OPCODES[op])
+           emit(ord(av))
        elif op is MARK:
-           append(OPCODES[op])
-           append(av)
+           emit(OPCODES[op])
+           emit(av)
        elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
            if flags & SRE_FLAG_TEMPLATE:
-               append(OPCODES[REPEAT])
-               skip = len(code); append(0)
-               append(av[0])
-               append(av[1])
+               emit(OPCODES[REPEAT])
+               skip = len(code); emit(0)
+               emit(av[0])
+               emit(av[1])
                _compile(code, av[2], flags)
-               append(OPCODES[SUCCESS])
+               emit(OPCODES[SUCCESS])
                code[skip] = len(code) - skip
            else:
                lo, hi = av[2].getwidth()
@@ -149,35 +126,35 @@ def _compile(code, pattern, flags):
                if 0 and lo == hi == 1 and op is MAX_REPEAT:
                    # FIXME: <fl> need a better way to figure out when
                    # it's safe to use this one (in the parser, probably)
-                   append(OPCODES[MAX_REPEAT_ONE])
-                   skip = len(code); append(0)
-                   append(av[0])
-                   append(av[1])
+                   emit(OPCODES[MAX_REPEAT_ONE])
+                   skip = len(code); emit(0)
+                   emit(av[0])
+                   emit(av[1])
                    _compile(code, av[2], flags)
-                   append(OPCODES[SUCCESS])
+                   emit(OPCODES[SUCCESS])
                    code[skip] = len(code) - skip
                else:
-                   append(OPCODES[op])
-                   skip = len(code); append(0)
-                   append(av[0])
-                   append(av[1])
+                   emit(OPCODES[op])
+                   skip = len(code); emit(0)
+                   emit(av[0])
+                   emit(av[1])
                    _compile(code, av[2], flags)
-                   append(OPCODES[SUCCESS])
+                   emit(OPCODES[SUCCESS])
                    code[skip] = len(code) - skip
        elif op is SUBPATTERN:
            group = av[0]
            if group:
-               append(OPCODES[MARK])
-               append((group-1)*2)
+               emit(OPCODES[MARK])
+               emit((group-1)*2)
            _compile(code, av[1], flags)
            if group:
-               append(OPCODES[MARK])
-               append((group-1)*2+1)
+               emit(OPCODES[MARK])
+               emit((group-1)*2+1)
        else:
            raise ValueError, ("unsupported operand type", op)
 
 def compile(p, flags=0):
-    # convert pattern list to internal format
+    # internal: convert pattern list to internal format
     if type(p) in (type(""), type(u"")):
        import sre_parse
        pattern = p
@@ -185,18 +162,14 @@ def compile(p, flags=0):
     else:
        pattern = None
     flags = p.pattern.flags | flags
-    code = Code()
+    code = []
     _compile(code, p.data, flags)
     code.append(OPCODES[SUCCESS])
-    data = code.todata()
-    if 0: # debugging
-       print
-       print "-" * 68
-       import sre_disasm
-       sre_disasm.disasm(data)
-       print "-" * 68
+    # FIXME: <fl> get rid of this limitation
+    assert p.pattern.groups <= 100,\
+          "sorry, but this version only supports 100 named groups"
     return _sre.compile(
        pattern, flags,
-       data,
+       array.array(WORDSIZE, code).tostring(),
        p.pattern.groups-1, p.pattern.groupdict
        )
index 21214edcac058bd3d33707e20450823a0067971d..dba2afd8eb75d4e0a21697d762e99f7a8d4ed65e 100644 (file)
  * 00-03-06 fl first alpha, sort of (0.5)
  * 00-03-14 fl removed most compatibility stuff (0.6)
  * 00-05-10 fl towards third alpha (0.8.2)
- * 00-05-13 fl added experimental cursor stuff (0.8.3)
+ * 00-05-13 fl added experimental scanner stuff (0.8.3)
  * 00-05-27 fl final bug hunt (0.8.4)
  * 00-06-21 fl less bugs, more taste (0.8.5)
  * 00-06-25 fl major changes to better deal with nested repeats (0.9)
  * 00-06-28 fl fixed findall (0.9.1)
+ * 00-06-29 fl fixed split, added more scanner features (0.9.2)
  *
  * Copyright (c) 1997-2000 by Secret Labs AB.  All rights reserved.
  *
@@ -384,7 +385,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
        int i, count;
 
     /* FIXME: this is a hack! */
-    void* mark_copy[64];
+    void* mark_copy[SRE_MARK_SIZE];
     void* mark = NULL;
 
     TRACE(("%8d: enter\n", PTR(ptr)));
@@ -954,7 +955,7 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
 
 staticforward PyTypeObject Pattern_Type;
 staticforward PyTypeObject Match_Type;
-staticforward PyTypeObject Cursor_Type;
+staticforward PyTypeObject Scanner_Type;
 
 static PyObject *
 _compile(PyObject* self_, PyObject* args)
@@ -1074,7 +1075,7 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* args)
     state->lastmark = 0;
 
        /* FIXME: dynamic! */
-       for (i = 0; i < 64; i++)
+       for (i = 0; i < SRE_MARK_SIZE; i++)
                state->mark[i] = NULL;
 
        state->stack = NULL;
@@ -1176,15 +1177,15 @@ pattern_new_match(PatternObject* pattern, SRE_STATE* state,
 }
 
 static PyObject*
-pattern_cursor(PatternObject* pattern, PyObject* args)
+pattern_scanner(PatternObject* pattern, PyObject* args)
 {
        /* create search state object */
 
-       CursorObject* self;
+       ScannerObject* self;
     PyObject* string;
 
     /* create match object (with room for extra group marks) */
-    self = PyObject_NEW(CursorObject, &Cursor_Type);
+    self = PyObject_NEW(ScannerObject, &Scanner_Type);
     if (self == NULL)
         return NULL;
 
@@ -1431,7 +1432,7 @@ static PyMethodDef pattern_methods[] = {
        {"split", (PyCFunction) pattern_split, 1},
        {"findall", (PyCFunction) pattern_findall, 1},
     /* experimental */
-       {"cursor", (PyCFunction) pattern_cursor, 1},
+       {"scanner", (PyCFunction) pattern_scanner, 1},
        {NULL, NULL}
 };
 
@@ -1467,7 +1468,7 @@ pattern_getattr(PatternObject* self, char* name)
 
 statichere PyTypeObject Pattern_Type = {
        PyObject_HEAD_INIT(NULL)
-       0, "Pattern", sizeof(PatternObject), 0,
+       0, "SRE_Pattern", sizeof(PatternObject), 0,
        (destructor)pattern_dealloc, /*tp_dealloc*/
        0, /*tp_print*/
        (getattrfunc)pattern_getattr, /*tp_getattr*/
@@ -1761,7 +1762,7 @@ match_getattr(MatchObject* self, char* name)
 
 statichere PyTypeObject Match_Type = {
        PyObject_HEAD_INIT(NULL)
-       0, "Match",
+       0, "SRE_Match",
        sizeof(MatchObject), /* size of basic object */
        sizeof(int), /* space for group item */
        (destructor)match_dealloc, /*tp_dealloc*/
@@ -1770,10 +1771,10 @@ statichere PyTypeObject Match_Type = {
 };
 
 /* -------------------------------------------------------------------- */
-/* cursor methods (experimental) */
+/* scanner methods (experimental) */
 
 static void
-cursor_dealloc(CursorObject* self)
+scanner_dealloc(ScannerObject* self)
 {
        state_fini(&self->state);
     Py_DECREF(self->string);
@@ -1782,7 +1783,7 @@ cursor_dealloc(CursorObject* self)
 }
 
 static PyObject*
-cursor_match(CursorObject* self, PyObject* args)
+scanner_match(ScannerObject* self, PyObject* args)
 {
     SRE_STATE* state = &self->state;
     PyObject* match;
@@ -1811,7 +1812,7 @@ cursor_match(CursorObject* self, PyObject* args)
 
 
 static PyObject*
-cursor_search(CursorObject* self, PyObject* args)
+scanner_search(ScannerObject* self, PyObject* args)
 {
     SRE_STATE* state = &self->state;
     PyObject* match;
@@ -1830,24 +1831,26 @@ cursor_search(CursorObject* self, PyObject* args)
     match = pattern_new_match((PatternObject*) self->pattern,
                                state, self->string, status);
 
-    if (status >= 0)
+    if (status == 0 || state->ptr == state->start)
+        state->start = (void*) ((char*) state->ptr + state->charsize);
+    else
         state->start = state->ptr;
 
     return match;
 }
 
-static PyMethodDef cursor_methods[] = {
-       {"match", (PyCFunction) cursor_match, 0},
-       {"search", (PyCFunction) cursor_search, 0},
+static PyMethodDef scanner_methods[] = {
+       {"match", (PyCFunction) scanner_match, 0},
+       {"search", (PyCFunction) scanner_search, 0},
        {NULL, NULL}
 };
 
 static PyObject*  
-cursor_getattr(CursorObject* self, char* name)
+scanner_getattr(ScannerObject* self, char* name)
 {
        PyObject* res;
 
-       res = Py_FindMethod(cursor_methods, (PyObject*) self, name);
+       res = Py_FindMethod(scanner_methods, (PyObject*) self, name);
        if (res)
                return res;
 
@@ -1859,18 +1862,21 @@ cursor_getattr(CursorObject* self, char* name)
                return self->pattern;
     }
 
+    if (!strcmp(name, "groups"))
+               return Py_BuildValue("i", ((PatternObject*) self->pattern)->groups);
+
        PyErr_SetString(PyExc_AttributeError, name);
        return NULL;
 }
 
-statichere PyTypeObject Cursor_Type = {
+statichere PyTypeObject Scanner_Type = {
        PyObject_HEAD_INIT(NULL)
-       0, "Cursor",
-       sizeof(CursorObject), /* size of basic object */
+       0, "SRE_Scanner",
+       sizeof(ScannerObject), /* size of basic object */
        0,
-       (destructor)cursor_dealloc, /*tp_dealloc*/
+       (destructor)scanner_dealloc, /*tp_dealloc*/
        0, /*tp_print*/
-       (getattrfunc)cursor_getattr, /*tp_getattr*/
+       (getattrfunc)scanner_getattr, /*tp_getattr*/
 };
 
 static PyMethodDef _functions[] = {
@@ -1888,7 +1894,7 @@ init_sre()
 {
        /* Patch object types */
        Pattern_Type.ob_type = Match_Type.ob_type =
-        Cursor_Type.ob_type = &PyType_Type;
+        Scanner_Type.ob_type = &PyType_Type;
 
        Py_InitModule("_" MODULE, _functions);
 }
index a7786ede012a2984d80911f602ccb4caa41cd0f5..722f890499052ff984a3e7d5dbde6c34e8f63c6a 100644 (file)
@@ -46,6 +46,9 @@ typedef struct {
     void* ptr;
 } SRE_STACK;
 
+/* FIXME: <fl> shouldn't be a constant, really... */
+#define SRE_MARK_SIZE 200
+
 typedef struct {
     /* string pointers */
     void* ptr; /* current position (also end of current slice) */
@@ -56,7 +59,7 @@ typedef struct {
     int charsize;
     /* registers */
     int lastmark;
-    void* mark[64]; /* FIXME: <fl> should be dynamically allocated! */
+    void* mark[SRE_MARK_SIZE];
     /* backtracking stack */
     SRE_STACK* stack;
     int stacksize;
@@ -66,11 +69,11 @@ typedef struct {
 } SRE_STATE;
 
 typedef struct {
-    /* search helper */
+    /* scanner (internal helper object) */
     PyObject_HEAD
     PyObject* pattern;
     PyObject* string;
     SRE_STATE state;
-} CursorObject;
+} ScannerObject;
 
 #endif