]> granicus.if.org Git - python/commitdiff
Issue #19387: explain and test the sre overlap table
authorAntoine Pitrou <solipsis@pitrou.net>
Fri, 25 Oct 2013 19:36:10 +0000 (21:36 +0200)
committerAntoine Pitrou <solipsis@pitrou.net>
Fri, 25 Oct 2013 19:36:10 +0000 (21:36 +0200)
Lib/sre_compile.py
Lib/test/test_re.py

index e08ec661aaa25aa8675a3efde038df23744b9149..e194aaace76f735ef72ba83e0ebab363b9bde24e 100644 (file)
@@ -353,6 +353,27 @@ def _simple(av):
     lo, hi = av[2].getwidth()
     return lo == hi == 1 and av[2][0][0] != SUBPATTERN
 
+def _generate_overlap_table(prefix):
+    """
+    Generate an overlap table for the following prefix.
+    An overlap table is a table of the same size as the prefix which
+    informs about the potential self-overlap for each index in the prefix:
+    - if overlap[i] == 0, prefix[i:] can't overlap prefix[0:...]
+    - if overlap[i] == k with 0 < k <= i, prefix[i-k+1:i+1] overlaps with
+      prefix[0:k]
+    """
+    table = [0] * len(prefix)
+    for i in range(1, len(prefix)):
+        idx = table[i - 1]
+        while prefix[i] != prefix[idx]:
+            if idx == 0:
+                table[i] = 0
+                break
+            idx = table[idx - 1]
+        else:
+            table[i] = idx + 1
+    return table
+
 def _compile_info(code, pattern, flags):
     # internal: compile an info block.  in the current version,
     # this contains min/max pattern width, and an optional literal
@@ -449,12 +470,7 @@ def _compile_info(code, pattern, flags):
         emit(prefix_skip) # skip
         code.extend(prefix)
         # generate overlap table
-        table = [-1] + ([0]*len(prefix))
-        for i in range(len(prefix)):
-            table[i+1] = table[i]+1
-            while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
-                table[i+1] = table[table[i+1]-1]+1
-        code.extend(table[1:]) # don't store first entry
+        code.extend(_generate_overlap_table(prefix))
     elif charset:
         _compile_charset(charset, flags, code)
     code[skip] = len(code) - skip
index 5e6858506531f86ba9cf0cd3d2f2b7bb57a485f2..9ee077da7b43491d2b10ccc8dc581c2c68f1fbfa 100644 (file)
@@ -3,10 +3,12 @@ from test.support import verbose, run_unittest, gc_collect, bigmemtest, _2G, \
 import io
 import re
 from re import Scanner
+import sre_compile
 import sre_constants
 import sys
 import string
 import traceback
+import unittest
 from weakref import proxy
 
 # Misc tests from Tim Peters' re.doc
@@ -15,8 +17,6 @@ from weakref import proxy
 # what you're doing. Some of these tests were carefully modeled to
 # cover most of the code.
 
-import unittest
-
 class S(str):
     def __getitem__(self, index):
         return S(super().__getitem__(index))
@@ -1140,6 +1140,22 @@ class ReTests(unittest.TestCase):
                 self.assertEqual(m.group(1), "")
                 self.assertEqual(m.group(2), "y")
 
+
+class ImplementationTest(unittest.TestCase):
+    """
+    Test implementation details of the re module.
+    """
+
+    def test_overlap_table(self):
+        f = sre_compile._generate_overlap_table
+        self.assertEqual(f(""), [])
+        self.assertEqual(f("a"), [0])
+        self.assertEqual(f("abcd"), [0, 0, 0, 0])
+        self.assertEqual(f("aaaa"), [0, 1, 2, 3])
+        self.assertEqual(f("ababba"), [0, 0, 1, 2, 0, 1])
+        self.assertEqual(f("abcabdac"), [0, 0, 0, 1, 2, 0, 1, 0])
+
+
 def run_re_tests():
     from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR
     if verbose:
@@ -1269,7 +1285,7 @@ def run_re_tests():
 
 
 def test_main():
-    run_unittest(ReTests)
+    run_unittest(__name__)
     run_re_tests()
 
 if __name__ == "__main__":