raise RuntimeError, "cannot find a useable array type"
def _compile(code, pattern, flags):
+ # internal: compile a (sub)pattern
emit = code.append
for op, av in pattern:
if op is ANY:
else:
raise ValueError, ("unsupported operand type", op)
+def _compile_info(code, pattern, flags):
+ # internal: compile an info block. in the current version,
+ # this contains min/max pattern width and a literal prefix,
+ # if any
+ lo, hi = pattern.getwidth()
+ if lo == 0:
+ return # not worth it
+ # look for a literal prefix
+ prefix = []
+ if not (flags & SRE_FLAG_IGNORECASE):
+ for op, av in pattern.data:
+ if op is LITERAL:
+ prefix.append(ord(av))
+ else:
+ break
+ # add an info block
+ emit = code.append
+ emit(OPCODES[INFO])
+ skip = len(code); emit(0)
+ # literal flag
+ mask = 0
+ if len(prefix) == len(pattern.data):
+ mask = 1
+ emit(mask)
+ # pattern length
+ emit(lo)
+ if hi < 32768:
+ emit(hi)
+ else:
+ emit(0)
+ # add literal prefix
+ emit(len(prefix))
+ if prefix:
+ code.extend(prefix)
+ # generate overlap table
+ table = [-1] + ([0]*len(prefix))
+ for i in range(len(prefix)):
+ table[i+1] = table[i]+1
+ while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
+ table[i+1] = table[table[i+1]-1]+1
+ code.extend(table[1:]) # don't store first entry
+ code[skip] = len(code) - skip
+
def compile(p, flags=0):
# internal: convert pattern list to internal format
+
+ # compile, as necessary
if type(p) in (type(""), type(u"")):
import sre_parse
pattern = p
p = sre_parse.parse(p)
else:
pattern = None
+
flags = p.pattern.flags | flags
code = []
+
+ # compile info block
+ _compile_info(code, p, flags)
+
+ # compile the pattern
_compile(code, p.data, flags)
+
code.append(OPCODES[SUCCESS])
- # FIXME: <fl> get rid of this limitation
+
+ # FIXME: <fl> get rid of this limitation!
assert p.pattern.groups <= 100,\
"sorry, but this version only supports 100 named groups"
+
return _sre.compile(
pattern, flags,
array.array(WORDSIZE, code).tostring(),
* 00-06-25 fl major changes to better deal with nested repeats (0.9)
* 00-06-28 fl fixed findall (0.9.1)
* 00-06-29 fl fixed split, added more scanner features (0.9.2)
+ * 00-06-30 fl tuning, fast search (0.9.3)
*
* Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
*
#ifndef SRE_RECURSIVE
-static char
-copyright[] = " SRE 0.9.2 Copyright (c) 1997-2000 by Secret Labs AB ";
+char copyright[] = " SRE 0.9.3 Copyright (c) 1997-2000 by Secret Labs AB ";
#include "Python.h"
#define HAVE_UNICODE
#endif
+/* optional features */
+#define USE_FAST_SEARCH
+
#if defined(_MSC_VER)
#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
/* fastest possible local call under MSVC */
#define LOCAL(type) static __inline type __fastcall
#else
-#define LOCAL(type) static type
+#define LOCAL(type) static inline type
#endif
/* error codes */
TRACE(("%8d: enter\n", PTR(ptr)));
+ if (pattern[0] == SRE_OP_INFO) {
+ /* optimization info block */
+ /* args: <1=skip> <2=flags> <3=min> ... */
+ if (pattern[3] && (end - ptr) < pattern[3]) {
+ TRACE(("reject (got %d chars, need %d)\n",
+ (end - ptr), pattern[3]));
+ return 0;
+ }
+ pattern += pattern[1] + 1;
+ }
+
stackbase = stack = state->stackbase;
lastmark = state->lastmark;
SRE_CHAR* end = state->end;
int status = 0;
int prefix_len = 0;
- SRE_CODE* prefix = NULL;
+ SRE_CODE* prefix;
+ SRE_CODE* overlap;
+ int literal = 0;
if (pattern[0] == SRE_OP_INFO) {
- /* args: <skip> <min> <max> <prefix> <prefix data...> */
- end -= pattern[2];
- prefix_len = pattern[4];
- prefix = pattern + 5;
- pattern += pattern[1];
+ /* optimization info block */
+ /* args: <1=skip> <2=flags> <3=min> <4=max> <5=prefix> <6=data...> */
+
+ if (pattern[3] > 0) {
+ /* adjust end point (but make sure we leave at least one
+ character in there) */
+ end -= pattern[3]-1;
+ if (end <= ptr)
+ end = ptr+1;
+ }
+
+ literal = pattern[2];
+
+ prefix = pattern + 6;
+ prefix_len = pattern[5];
+
+ overlap = prefix + prefix_len - 1;
+
+ pattern += 1 + pattern[1];
}
- /* if (prefix_len > 0) ... */
+#if defined(USE_FAST_SEARCH)
+ if (prefix_len > 1) {
+ /* pattern starts with a known prefix. use the overlap
+ table to skip forward as fast as we possibly can */
+ int i = 0;
+ end = state->end;
+ while (ptr < end) {
+ for (;;) {
+ if (*ptr != (SRE_CHAR) prefix[i]) {
+ if (!i)
+ break;
+ else
+ i = overlap[i];
+ } else {
+ if (++i == prefix_len) {
+ /* found a potential match */
+ TRACE(("%8d: === SEARCH === hit\n", PTR(ptr)));
+ state->start = ptr - prefix_len + 1;
+ state->ptr = ptr + 1;
+ if (literal)
+ return 1; /* all of it */
+ status = SRE_MATCH(state, pattern + 2*prefix_len);
+ if (status != 0)
+ return status;
+ /* close but no cigar -- try again */
+ i = overlap[i];
+ }
+ break;
+ }
+
+ }
+ ptr++;
+ }
+ return 0;
+ }
+#endif
if (pattern[0] == SRE_OP_LITERAL) {
- /* pattern starts with a literal */
+ /* pattern starts with a literal character. this is used for
+ short prefixes, and if fast search is disabled*/
SRE_CHAR chr = (SRE_CHAR) pattern[1];
for (;;) {
while (ptr < end && *ptr != chr)
if (status != 0)
break;
}
-
} else
+ /* general case */
while (ptr <= end) {
TRACE(("%8d: === SEARCH ===\n", PTR(ptr)));
state->start = state->ptr = ptr++;