granicus.if.org Git - python/commitdiff
Issue #24426: Fast searching optimization in regular expressions now works
authorSerhiy Storchaka <storchaka@gmail.com>
Sun, 21 Jun 2015 11:06:55 +0000 (14:06 +0300)
committerSerhiy Storchaka <storchaka@gmail.com>
Sun, 21 Jun 2015 11:06:55 +0000 (14:06 +0300)
for patterns that start with capturing groups.  Fast searching optimization
can no longer be disabled at compile time.

Lib/sre_compile.py
Misc/NEWS
Modules/_sre.c
Modules/sre_lib.h

index 502b0616c6e54ea683877ba8d37d0e6b31410ae8..4edb03fa300e77146520c91a8b79605728c16b27 100644 (file)
@@ -409,57 +409,39 @@ def _generate_overlap_table(prefix):
             table[i] = idx + 1
     return table
 
-def _compile_info(code, pattern, flags):
-    # internal: compile an info block.  in the current version,
-    # this contains min/max pattern width, and an optional literal
-    # prefix or a character map
-    lo, hi = pattern.getwidth()
-    if hi > MAXCODE:
-        hi = MAXCODE
-    if lo == 0:
-        code.extend([INFO, 4, 0, lo, hi])
-        return
-    # look for a literal prefix
+def _get_literal_prefix(pattern):
+    # look for literal prefix
     prefix = []
     prefixappend = prefix.append
-    prefix_skip = 0
+    prefix_skip = None
+    got_all = True
+    for op, av in pattern.data:
+        if op is LITERAL:
+            prefixappend(av)
+        elif op is SUBPATTERN:
+            prefix1, prefix_skip1, got_all = _get_literal_prefix(av[1])
+            if prefix_skip is None:
+                if av[0] is not None:
+                    prefix_skip = len(prefix)
+                elif prefix_skip1 is not None:
+                    prefix_skip = len(prefix) + prefix_skip1
+            prefix.extend(prefix1)
+            if not got_all:
+                break
+        else:
+            got_all = False
+            break
+    return prefix, prefix_skip, got_all
+
+def _get_charset_prefix(pattern):
     charset = [] # not used
     charsetappend = charset.append
-    if not (flags & SRE_FLAG_IGNORECASE):
-        # look for literal prefix
-        for op, av in pattern.data:
+    if pattern.data:
+        op, av = pattern.data[0]
+        if op is SUBPATTERN and av[1]:
+            op, av = av[1][0]
             if op is LITERAL:
-                if len(prefix) == prefix_skip:
-                    prefix_skip = prefix_skip + 1
-                prefixappend(av)
-            elif op is SUBPATTERN and len(av[1]) == 1:
-                op, av = av[1][0]
-                if op is LITERAL:
-                    prefixappend(av)
-                else:
-                    break
-            else:
-                break
-        # if no prefix, look for charset prefix
-        if not prefix and pattern.data:
-            op, av = pattern.data[0]
-            if op is SUBPATTERN and av[1]:
-                op, av = av[1][0]
-                if op is LITERAL:
-                    charsetappend((op, av))
-                elif op is BRANCH:
-                    c = []
-                    cappend = c.append
-                    for p in av[1]:
-                        if not p:
-                            break
-                        op, av = p[0]
-                        if op is LITERAL:
-                            cappend((op, av))
-                        else:
-                            break
-                    else:
-                        charset = c
+                charsetappend((op, av))
             elif op is BRANCH:
                 c = []
                 cappend = c.append
@@ -473,8 +455,43 @@ def _compile_info(code, pattern, flags):
                         break
                 else:
                     charset = c
-            elif op is IN:
-                charset = av
+        elif op is BRANCH:
+            c = []
+            cappend = c.append
+            for p in av[1]:
+                if not p:
+                    break
+                op, av = p[0]
+                if op is LITERAL:
+                    cappend((op, av))
+                else:
+                    break
+            else:
+                charset = c
+        elif op is IN:
+            charset = av
+    return charset
+
+def _compile_info(code, pattern, flags):
+    # internal: compile an info block.  in the current version,
+    # this contains min/max pattern width, and an optional literal
+    # prefix or a character map
+    lo, hi = pattern.getwidth()
+    if hi > MAXCODE:
+        hi = MAXCODE
+    if lo == 0:
+        code.extend([INFO, 4, 0, lo, hi])
+        return
+    # look for a literal prefix
+    prefix = []
+    prefix_skip = 0
+    charset = [] # not used
+    if not (flags & SRE_FLAG_IGNORECASE):
+        # look for literal prefix
+        prefix, prefix_skip, got_all = _get_literal_prefix(pattern)
+        # if no prefix, look for charset prefix
+        if not prefix:
+            charset = _get_charset_prefix(pattern)
 ##     if prefix:
 ##         print("*** PREFIX", prefix, prefix_skip)
 ##     if charset:
@@ -487,7 +504,7 @@ def _compile_info(code, pattern, flags):
     mask = 0
     if prefix:
         mask = SRE_INFO_PREFIX
-        if len(prefix) == prefix_skip == len(pattern.data):
+        if prefix_skip is None and got_all:
             mask = mask | SRE_INFO_LITERAL
     elif charset:
         mask = mask | SRE_INFO_CHARSET
@@ -502,6 +519,8 @@ def _compile_info(code, pattern, flags):
     # add literal prefix
     if prefix:
         emit(len(prefix)) # length
+        if prefix_skip is None:
+            prefix_skip =  len(prefix)
         emit(prefix_skip) # skip
         code.extend(prefix)
         # generate overlap table
index 3fdda95150ce97fccfffc91e315328ddaec17609..dda128616575e3d6422c73d4416003d94d188a57 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -13,6 +13,10 @@ Core and Builtins
 Library
 -------
 
+- Issue #24426: Fast searching optimization in regular expressions now works
+  for patterns that starts with capturing groups.  Fast searching optimization
+  now can't be disabled at compile time.
+
 Documentation
 -------------
 
index 4016a4533e0bccf6a7658bbccad20d8a5c3a958f..1d90281a58326e6cbde0e2d3062b2a4c937ee5c7 100644 (file)
@@ -62,9 +62,6 @@ static char copyright[] =
 /* -------------------------------------------------------------------- */
 /* optional features */
 
-/* enables fast searching */
-#define USE_FAST_SEARCH
-
 /* enables copy/deepcopy handling (work in progress) */
 #undef USE_BUILTIN_COPY
 
index 463a908b0064af3794021e9f107c5b4a0cccf264..422f1684096b5414d5ef1a28554e7408e0af74c1 100644 (file)
@@ -1248,7 +1248,32 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
            prefix, prefix_len, prefix_skip));
     TRACE(("charset = %p\n", charset));
 
-#if defined(USE_FAST_SEARCH)
+    if (prefix_len == 1) {
+        /* pattern starts with a literal character */
+        SRE_CHAR c = (SRE_CHAR) prefix[0];
+#if SIZEOF_SRE_CHAR < 4
+        if ((SRE_CODE) c != prefix[0])
+            return 0; /* literal can't match: doesn't fit in char width */
+#endif
+        end = (SRE_CHAR *)state->end;
+        while (ptr < end) {
+            while (*ptr != c) {
+                if (++ptr >= end)
+                    return 0;
+            }
+            TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
+            state->start = ptr;
+            state->ptr = ptr + prefix_skip;
+            if (flags & SRE_INFO_LITERAL)
+                return 1; /* we got all of it */
+            status = SRE(match)(state, pattern + 2*prefix_skip, 0);
+            if (status != 0)
+                return status;
+            ++ptr;
+        }
+        return 0;
+    }
+
     if (prefix_len > 1) {
         /* pattern starts with a known prefix.  use the overlap
            table to skip forward as fast as we possibly can */
@@ -1297,32 +1322,8 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
         }
         return 0;
     }
-#endif
 
-    if (pattern[0] == SRE_OP_LITERAL) {
-        /* pattern starts with a literal character.  this is used
-           for short prefixes, and if fast search is disabled */
-        SRE_CHAR c = (SRE_CHAR) pattern[1];
-#if SIZEOF_SRE_CHAR < 4
-        if ((SRE_CODE) c != pattern[1])
-            return 0; /* literal can't match: doesn't fit in char width */
-#endif
-        end = (SRE_CHAR *)state->end;
-        while (ptr < end) {
-            while (*ptr != c) {
-                if (++ptr >= end)
-                    return 0;
-            }
-            TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
-            state->start = ptr;
-            state->ptr = ++ptr;
-            if (flags & SRE_INFO_LITERAL)
-                return 1; /* we got all of it */
-            status = SRE(match)(state, pattern + 2, 0);
-            if (status != 0)
-                break;
-        }
-    } else if (charset) {
+    if (charset) {
         /* pattern starts with a character from a known set */
         end = (SRE_CHAR *)state->end;
         for (;;) {