]> granicus.if.org Git - python/commitdiff
Adding Fredrik Lundh's _sre.c module and its header files.
authorGuido van Rossum <guido@python.org>
Fri, 31 Mar 2000 14:59:30 +0000 (14:59 +0000)
committerGuido van Rossum <guido@python.org>
Fri, 31 Mar 2000 14:59:30 +0000 (14:59 +0000)
NOTE: THIS IS VERY ROUGH ALPHA CODE!

Modules/_sre.c [new file with mode: 0644]
Modules/sre.h [new file with mode: 0644]
Modules/sre_constants.h [new file with mode: 0644]

diff --git a/Modules/_sre.c b/Modules/_sre.c
new file mode 100644 (file)
index 0000000..8af876f
--- /dev/null
@@ -0,0 +1,1453 @@
+/* -*- Mode: C; tab-width: 4 -*-
+ *
+ * Secret Labs' Regular Expression Engine
+ * $Id$
+ *
+ * simple regular expression matching engine
+ *
+ * partial history:
+ * 99-10-24 fl created (bits and pieces from the template matcher)
+ * 99-11-13 fl added categories, branching, and more (0.2)
+ * 99-11-16 fl some tweaks to compile on non-Windows platforms
+ * 99-12-18 fl non-literals, generic maximizing repeat (0.3)
+ * 99-02-28 fl tons of changes (not all to the better ;-) (0.4)
+ * 99-03-06 fl first alpha, sort of (0.5)
+ * 99-03-14 fl removed most compatibility stuff (0.6)
+ *
+ * Copyright (c) 1997-2000 by Secret Labs AB.  All rights reserved.
+ *
+ * This code can only be used for 1.6 alpha testing.  All other use
+ * require explicit permission from Secret Labs AB.
+ *
+ * Portions of this engine have been developed in cooperation with
+ * CNRI.  Hewlett-Packard provided funding for 1.6 integration and
+ * other compatibility work.
+ */
+
+#ifndef SRE_RECURSIVE
+
+char copyright[] = " SRE 0.6 Copyright (c) 1997-2000 by Secret Labs AB ";
+
+#include "Python.h"
+
+#include "sre.h"
+
+#include "unicodeobject.h"
+
+#if defined(HAVE_LIMITS_H)
+#include <limits.h>
+#else
+#define INT_MAX 2147483647
+#endif
+
+#include <ctype.h> /* temporary hack */
+
+/* defining this one enables tracing */
+#undef DEBUG
+
+#ifdef WIN32 /* FIXME: <fl> don't assume Windows == MSVC */
+#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
+/* fastest possible local call under MSVC */
+#define LOCAL(type) static __inline type __fastcall
+#else
+#define LOCAL(type) static type
+#endif
+
+/* error codes */
+#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
+#define SRE_ERROR_MEMORY -9 /* out of memory */
+
+#ifdef DEBUG
+#define TRACE(v) printf v
+#define PTR(ptr) ((SRE_CHAR*) (ptr) - (SRE_CHAR*) state->beginning)
+#else
+#define TRACE(v)
+#endif
+
+#define SRE_CODE unsigned short /* unsigned short or larger */
+
+typedef struct {
+       /* string pointers */
+       void* ptr; /* current position (also end of current slice) */
+       void* beginning; /* start of original string */
+       void* start; /* start of current slice */
+       void* end; /* end of original string */
+       /* character size */
+       int charsize;
+       /* registers */
+       int marks;
+       void* mark[64]; /* FIXME: <fl> should be dynamically allocated! */
+       /* FIXME */
+       /* backtracking stack */
+       void** stack;
+       int stacksize;
+       int stackbase;
+} SRE_STATE;
+
+#if 1 /* FIXME: <fl> fix this one! */
+#define SRE_TO_LOWER Py_UNICODE_TOLOWER
+#define SRE_IS_DIGIT Py_UNICODE_ISDIGIT
+#define SRE_IS_SPACE Py_UNICODE_ISSPACE
+#define SRE_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0)
+#else
+#define SRE_TO_LOWER(ch) ((ch) < 256 ? tolower((ch)) : ch)
+#define SRE_IS_DIGIT(ch) ((ch) < 256 ? isdigit((ch)) : 0)
+#define SRE_IS_SPACE(ch) ((ch) < 256 ? isspace((ch)) : 0)
+#define SRE_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0)
+#endif
+
+#define SRE_IS_WORD(ch) (SRE_IS_ALNUM((ch)) || (ch) == '_')
+
+LOCAL(int)
+sre_category(SRE_CODE category, unsigned int ch)
+{
+       switch (category) {
+       case 'd':
+               return SRE_IS_DIGIT(ch);
+       case 'D':
+               return !SRE_IS_DIGIT(ch);
+       case 's':
+               return SRE_IS_SPACE(ch);
+       case 'S':
+               return !SRE_IS_SPACE(ch);
+       case 'w':
+               return SRE_IS_WORD(ch);
+       case 'W':
+               return !SRE_IS_WORD(ch);
+       }
+       return 0;
+}
+
+/* helpers */
+
+LOCAL(int)
+_stack_free(SRE_STATE* state)
+{
+       if (state->stack) {
+               TRACE(("release stack\n"));
+               free(state->stack);
+               state->stack = NULL;
+       }
+       state->stacksize = 0;
+       return 0;
+}
+
+static int /* shouldn't be LOCAL */
+_stack_extend(SRE_STATE* state, int lo, int hi)
+{
+       void** stack;
+       int stacksize;
+
+       /* grow the stack to a suitable size; we need at least lo entries,
+          at most hi entries.  if for some reason hi is lower than lo, lo
+          wins */
+
+       stacksize = state->stacksize;
+
+       if (stacksize == 0) {
+               /* create new stack */
+               stacksize = 512;
+               if (stacksize < lo)
+                       stacksize = lo;
+               else if (stacksize > hi)
+                       stacksize = hi;
+               TRACE(("allocate stack %d\n", stacksize));
+               stack = malloc(sizeof(void*) * stacksize);
+       } else {
+               /* grow the stack (typically by a factor of two) */
+               while (stacksize < lo)
+                       stacksize = 2 * stacksize;
+               /* FIXME: <fl> could trim size if it's larger than lo, and
+                  much larger than hi */
+               TRACE(("grow stack to %d\n", stacksize));
+               stack = realloc(state->stack, sizeof(void*) * stacksize);
+       }
+
+       if (!stack) {
+               _stack_free(state);
+               return SRE_ERROR_MEMORY;
+       }
+
+       state->stack = stack;
+       state->stacksize = stacksize;
+
+       return 0;
+}
+
+/* set things up for the 8-bit version */
+
+#define SRE_CHAR unsigned char
+#define SRE_AT sre_at
+#define SRE_MEMBER sre_member
+#define SRE_MATCH sre_match
+#define SRE_SEARCH sre_search
+#define SRE_RECURSIVE
+
+#include "_sre.c"
+
+#undef SRE_RECURSIVE
+#undef SRE_SEARCH
+#undef SRE_MATCH
+#undef SRE_MEMBER
+#undef SRE_AT
+#undef SRE_CHAR
+
+/* set things up for the 16-bit unicode version */
+
+#define SRE_CHAR Py_UNICODE
+#define SRE_AT sre_uat
+#define SRE_MEMBER sre_umember
+#define SRE_MATCH sre_umatch
+#define SRE_SEARCH sre_usearch
+
+#endif /* SRE_RECURSIVE */
+
+/* -------------------------------------------------------------------- */
+/* String matching engine */
+
+/* the following section is compiled twice, with different character
+   settings */
+
+LOCAL(int)
+SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
+{
+       /* check if pointer is at given position.  return 1 if so, 0
+          otherwise */
+
+       int this, that;
+
+       switch (at) {
+       case 'a':
+               /* beginning */
+               return (ptr == state->beginning);
+       case 'z':
+               /* end */
+               return (ptr == state->end);
+       case 'b':
+               /* word boundary */
+               if (state->beginning == state->end)
+                       return 0;
+               that = ((void*) ptr > state->beginning) ?
+                       SRE_IS_WORD((int) ptr[-1]) : 0;
+               this = ((void*) ptr < state->end) ?
+                       SRE_IS_WORD((int) ptr[0]) : 0;
+               return this != that;
+       case 'B':
+               /* word non-boundary */
+               if (state->beginning == state->end)
+                       return 0;
+               that = ((void*) ptr > state->beginning) ?
+                       SRE_IS_WORD((int) ptr[-1]) : 0;
+               this = ((void*) ptr < state->end) ?
+                       SRE_IS_WORD((int) ptr[0]) : 0;
+               return this == that;
+       }
+
+       return 0;
+}
+
+LOCAL(int)
+SRE_MEMBER(SRE_CODE* set, SRE_CHAR ch)
+{
+       /* check if character is a member of the given set.      return 1 if
+          so, 0 otherwise */
+
+       int ok = 1;
+
+       for (;;) {
+               switch (*set++) {
+
+               case SRE_OP_NEGATE:
+                       ok = !ok;
+                       break;
+
+               case SRE_OP_FAILURE:
+                       return !ok;
+
+               case SRE_OP_LITERAL:
+                       if (ch == (SRE_CHAR) set[0])
+                               return ok;
+                       set++;
+                       break;
+
+               case SRE_OP_RANGE:
+                       if ((SRE_CHAR) set[0] <= ch && ch <= (SRE_CHAR) set[1])
+                               return ok;
+                       set += 2;
+                       break;
+
+               case SRE_OP_CATEGORY:
+                       if (sre_category(set[0], (int) ch))
+                               return ok;
+                       set += 1;
+                       break;
+
+               default:
+                       /* FIXME: internal error */
+                       return 0;
+               }
+       }
+}
+
+LOCAL(int)
+SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
+{
+       /* check if string matches the given pattern.  returns -1 for
+          error, 0 for failure, and 1 for success */
+
+       SRE_CHAR* end = state->end;
+       SRE_CHAR* ptr = state->ptr;
+       int stacksize;
+       int stackbase;
+       int i, count;
+
+       for (;;) {
+
+               TRACE(("[%p]\n", pattern));
+
+               switch (*pattern++) {
+
+               case SRE_OP_FAILURE:
+                       /* immediate failure */
+                       TRACE(("%8d: failure\n", PTR(ptr)));
+                       return 0;
+
+               case SRE_OP_SUCCESS:
+                       /* end of pattern */
+                       TRACE(("%8d: success\n", PTR(ptr)));
+                       state->ptr = ptr;
+                       return 1;
+
+               case SRE_OP_AT:
+                       /* match at given position */
+                       TRACE(("%8d: match at \\%c\n", PTR(ptr), *pattern));
+                       if (!SRE_AT(state, ptr, *pattern))
+                               return 0;
+                       pattern++;
+                       break;
+
+               case SRE_OP_LITERAL:
+                       /* match literal character */
+                       /* args: <code> */
+                       TRACE(("%8d: literal %c\n", PTR(ptr), (SRE_CHAR) *pattern));
+                       if (ptr >= end || *ptr != (SRE_CHAR) *pattern)
+                               return 0;
+                       pattern++;
+                       ptr++;
+                       break;
+
+               case SRE_OP_NOT_LITERAL:
+                       /* match anything that is not literal character */
+                       /* args: <code> */
+                       TRACE(("%8d: literal not %c\n", PTR(ptr), (SRE_CHAR) *pattern));
+                       if (ptr >= end || *ptr == (SRE_CHAR) *pattern)
+                               return 0;
+                       pattern++;
+                       ptr++;
+                       break;
+
+               case SRE_OP_ANY:
+                       /* match anything */
+                       TRACE(("%8d: any\n", PTR(ptr)));
+                       if (ptr >= end)
+                               return 0;
+                       ptr++;
+                       break;
+
+               case SRE_OP_IN:
+                       /* match set member (or non_member) */
+                       /* args: <skip> <set> */
+                       TRACE(("%8d: set %c\n", PTR(ptr), *ptr));
+                       if (ptr >= end || !SRE_MEMBER(pattern + 1, *ptr))
+                               return 0;
+                       pattern += pattern[0];
+                       ptr++;
+                       break;
+
+               case SRE_OP_GROUP:
+                       /* match backreference */
+                       i = pattern[0];
+                       {
+                               /* FIXME: optimize size! */
+                               SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
+                               SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
+                               if (!p || !e || e < p)
+                                       return 0;
+                               while (p < e) {
+                                       if (ptr >= end || *ptr != *p)
+                                               return 0;
+                                       p++; ptr++;
+                               }
+                       }
+                       pattern++;
+                       break;
+
+               case SRE_OP_LITERAL_IGNORE:
+                       TRACE(("%8d: literal lower(%c)\n", PTR(ptr), (SRE_CHAR) *pattern));
+                       if (ptr >= end || SRE_TO_LOWER(*ptr) != (SRE_CHAR) *pattern)
+                               return 0;
+                       pattern++;
+                       ptr++;
+                       break;
+
+               case SRE_OP_NOT_LITERAL_IGNORE:
+                       TRACE(("%8d: literal not lower(%c)\n", PTR(ptr),
+                                  (SRE_CHAR) *pattern));
+                       if (ptr >= end || SRE_TO_LOWER(*ptr) == (SRE_CHAR) *pattern)
+                               return 0;
+                       pattern++;
+                       ptr++;
+                       break;
+
+               case SRE_OP_IN_IGNORE:
+                       TRACE(("%8d: set lower(%c)\n", PTR(ptr), *ptr));
+                       if (ptr >= end
+                               || !SRE_MEMBER(pattern+1, (SRE_CHAR) SRE_TO_LOWER(*ptr)))
+                               return 0;
+                       pattern += pattern[0];
+                       ptr++;
+                       break;
+
+               case SRE_OP_MARK:
+                       /* set mark */
+                       /* args: <mark> */
+                       TRACE(("%8d: set mark(%d)\n", PTR(ptr), pattern[0]));
+                       state->mark[pattern[0]] = ptr;
+                       pattern++;
+                       break;
+
+               case SRE_OP_JUMP:
+                       /* jump forward */
+                       /* args: <skip> */
+                       TRACE(("%8d: jump +%d\n", PTR(ptr), pattern[0]));
+                       pattern += pattern[0];
+                       break;
+
+               case SRE_OP_CALL:
+                       /* match subpattern, without backtracking */
+                       /* args: <skip> <pattern> */
+                       TRACE(("%8d: match subpattern\n", PTR(ptr)));
+                       state->ptr = ptr;
+                       if (!SRE_MATCH(state, pattern + 1))
+                               return 0;
+                       pattern += pattern[0];
+                       ptr = state->ptr;
+                       break;
+
+               case SRE_OP_MAX_REPEAT_ONE:
+
+                       /* match repeated sequence (maximizing regexp).  this
+                          variant only works if the repeated item is exactly one
+                          character wide, and we're not already collecting
+                          backtracking points.  for other cases, use the
+                          MAX_REPEAT operator instead */
+
+                       /* args: <skip> <min> <max> <step> */
+
+                       TRACE(("%8d: max repeat one {%d,%d}\n", PTR(ptr),
+                                  pattern[1], pattern[2]));
+
+                       count = 0;
+
+                       if (pattern[3] == SRE_OP_ANY) {
+                               /* repeated wildcard.  skip to the end of the target
+                                  string, and backtrack from there */
+                               /* FIXME: must look for line endings */
+                               if (ptr + pattern[1] > end)
+                                       return 0; /* cannot match */
+                               count = pattern[2];
+                               if (count > end - ptr)
+                                       count = end - ptr;
+                               ptr += count;
+
+                       } else if (pattern[3] == SRE_OP_LITERAL) {
+                               /* repeated literal */
+                               SRE_CHAR chr = (SRE_CHAR) pattern[4];
+                               while (count < (int) pattern[2]) {
+                                       if (ptr >= end || *ptr != chr)
+                                               break;
+                                       ptr++;
+                                       count++;
+                               }
+
+                       } else if (pattern[3] == SRE_OP_LITERAL_IGNORE) {
+                               /* repeated literal */
+                               SRE_CHAR chr = (SRE_CHAR) pattern[4];
+                               while (count < (int) pattern[2]) {
+                                       if (ptr >= end || (SRE_CHAR) SRE_TO_LOWER(*ptr) != chr)
+                                               break;
+                                       ptr++;
+                                       count++;
+                               }
+
+                       } else if (pattern[3] == SRE_OP_NOT_LITERAL) {
+                               /* repeated non-literal */
+                               SRE_CHAR chr = (SRE_CHAR) pattern[4];
+                               while (count < (int) pattern[2]) {
+                                       if (ptr >= end || *ptr == chr)
+                                               break;
+                                       ptr++;
+                                       count++;
+                               }
+
+                       } else if (pattern[3] == SRE_OP_NOT_LITERAL_IGNORE) {
+                               /* repeated non-literal */
+                               SRE_CHAR chr = (SRE_CHAR) pattern[4];
+                               while (count < (int) pattern[2]) {
+                                       if (ptr >= end || (SRE_CHAR) SRE_TO_LOWER(*ptr) == chr)
+                                               break;
+                                       ptr++;
+                                       count++;
+                               }
+
+                       } else if (pattern[3] == SRE_OP_IN) {
+                               /* repeated set */
+                               while (count < (int) pattern[2]) {
+                                       if (ptr >= end || !SRE_MEMBER(pattern + 5, *ptr))
+                                               break;
+                                       ptr++;
+                                       count++;
+                               }
+
+                       } else {
+                               /* repeated single character pattern */
+                               state->ptr = ptr;
+                               while (count < (int) pattern[2]) {
+                                       i = SRE_MATCH(state, pattern + 3);
+                                       if (i < 0)
+                                               return i;
+                                       if (i == 0)
+                                               break;
+                                       count++;
+                               }
+                               state->ptr = ptr;
+                               ptr += count;
+                       }
+
+                       /* when we arrive here, count contains the number of
+                          matches, and ptr points to the tail of the target
+                          string.      check if the rest of the pattern matches, and
+                          backtrack if not. */
+
+                       /* FIXME: <fl> this is a mess.  fix it! */
+
+                       TRACE(("%8d: repeat %d found\n", PTR(ptr), count));
+
+                       if (count < (int) pattern[1])
+                               return 0;
+
+                       if (pattern[pattern[0]] == SRE_OP_SUCCESS) {
+                               /* tail is empty.  we're finished */
+                               TRACE(("%8d: tail is empty\n", PTR(ptr)));
+                               state->ptr = ptr;
+                               return 1;
+
+                       } else if (pattern[pattern[0]] == SRE_OP_LITERAL) {
+                               /* tail starts with a literal.  we can speed things up
+                                  by skipping positions where the rest of the pattern
+                                  cannot possibly match */
+                               SRE_CHAR chr = (SRE_CHAR) pattern[pattern[0]+1];
+                               TRACE(("%8d: tail is literal %d\n", PTR(ptr), chr));
+                               for (;;) {
+                                       TRACE(("%8d: scan for tail match\n", PTR(ptr)));
+                                       while (count >= (int) pattern[1] &&
+                                                  (ptr >= end || *ptr != chr)) {
+                                               ptr--;
+                                               count--;
+                                       }
+                                       TRACE(("%8d: check tail\n", PTR(ptr)));
+                                       if (count < (int) pattern[1])
+                                               break;
+                                       state->ptr = ptr;
+                                       i = SRE_MATCH(state, pattern + pattern[0]);
+                                       if (i > 0) {
+                                               TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
+                                               return 1;
+                                       }
+                                       TRACE(("%8d: BACKTRACK\n", PTR(ptr)));
+                                       ptr--;
+                                       count--;
+                               }
+
+                       } else {
+                               TRACE(("%8d: tail is pattern\n", PTR(ptr)));
+                               while (count >= (int) pattern[1]) {
+                                       state->ptr = ptr;
+                                       i = SRE_MATCH(state, pattern + pattern[0]);
+                                       if (i > 0) {
+                                               TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
+                                               return 1;
+                                       }
+                                       TRACE(("%8d: BACKTRACK\n", PTR(ptr)));
+                                       ptr--;
+                                       count--;
+                               }
+                       }
+                       return 0; /* failure! */
+
+/* ----------------------------------------------------------------------- */
+/* FIXME: the following section is just plain broken */
+
+               case SRE_OP_MAX_REPEAT:
+                       /* match repeated sequence (maximizing regexp).  repeated
+                          group should end with a MAX_UNTIL code */
+
+                       TRACE(("%8d: max repeat %d %d\n", PTR(ptr),
+                                  pattern[1], pattern[2]));
+
+                       count = 0;
+                       state->ptr = ptr;
+
+                       /* FIXME: <fl> umm.      what about matching the minimum
+                          number of items before starting to collect backtracking
+                          positions? */
+
+                       stackbase = state->stackbase;
+
+                       while (count < (int) pattern[2]) {
+                               /* store current position on the stack */
+                               TRACE(("%8d: push mark at index %d\n", PTR(ptr), count));
+                               if (stackbase + count >= state->stacksize) {
+                                       i = _stack_extend(state, stackbase + count + 1,
+                                                                         stackbase + pattern[2]);
+                                       if (i < 0)
+                                               return i;
+                               }
+                               state->stack[stackbase + count] = ptr;
+                               /* check if we can match another item */
+                               state->stackbase += count + 1;
+                               i = SRE_MATCH(state, pattern + 3);
+                               state->stackbase = stackbase; /* rewind */
+                               if (i != 2)
+                                       break;
+                               if (state->ptr == ptr) {
+                                       /* if the match was successful but empty, set the
+                                          count to max and terminate the scanning loop */
+                                       stacksize = count; /* actual size of stack */
+                                       count = (int) pattern[2];
+                                       goto check_tail; /* FIXME: <fl> eliminate goto */
+                               }
+                               count++;
+                               ptr = state->ptr;
+
+                       }
+
+                       stacksize = count; /* actual number of entries on the stack */
+
+                 check_tail:
+
+                       /* when we get here, count is the number of matches,
+                          stacksize is the number of match points on the stack
+                          (usually same as count, but it might be smaller) and
+                          ptr points to the tail. */
+
+                       if (count < (int) pattern[1])
+                               return 0;
+
+                       /* make sure that rest of the expression matches.  if it
+                          doesn't, backtrack */
+
+                       TRACE(("%8d: repeat %d found (stack size = %d)\n", PTR(ptr),
+                                  count, stacksize + 1));
+
+                       TRACE(("%8d: tail is pattern\n", PTR(ptr)));
+
+                       /* hope for the best */
+                       state->ptr = ptr;
+                       state->stackbase += stacksize + 1;
+                       i = SRE_MATCH(state, pattern + pattern[0]);
+                       state->stackbase = stackbase;
+                       if (i > 0) {
+                               TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
+                               return 1;
+                       }
+
+                       /* backtrack! */
+                       while (count >= (int) pattern[1]) {
+                               ptr = state->stack[stackbase + (count < stacksize ? count : stacksize)];
+                               state->ptr = ptr;
+                               count--;
+                               TRACE(("%8d: BACKTRACK\n", PTR(ptr)));
+                               state->stackbase += stacksize + 1;
+                               i = SRE_MATCH(state, pattern + pattern[0]);
+                               state->stackbase = stackbase;
+                               if (i > 0) {
+                                       TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
+                                       return 1;
+                               }
+                       }
+                       return 0; /* failure! */
+
+               case SRE_OP_MAX_UNTIL:
+                       /* match repeated sequence (maximizing regexp).  repeated
+                          group should end with a MAX_UNTIL code */
+
+                       TRACE(("%8d: max until\n", PTR(ptr)));
+                       state->ptr = ptr;
+                       return 2; /* always succeeds, for now... */
+
+/* end of totally broken section */
+/* ----------------------------------------------------------------------- */
+
+               case SRE_OP_MIN_REPEAT:
+                       /* match repeated sequence (minimizing regexp) */
+                       TRACE(("%8d: min repeat %d %d\n", PTR(ptr),
+                                  pattern[1], pattern[2]));
+                       count = 0;
+                       state->ptr = ptr;
+                       /* match minimum number of items */
+                       while (count < (int) pattern[1]) {
+                               i = SRE_MATCH(state, pattern + 3);
+                               if (i <= 0)
+                                       return i;
+                               count++;
+                       }
+                       /* move forward until the tail matches. */
+                       while (count <= (int) pattern[2]) {
+                               ptr = state->ptr;
+                               i = SRE_MATCH(state, pattern + pattern[0]);
+                               if (i > 0) {
+                                       TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
+                                       return 1;
+                               }
+                               TRACE(("%8d: BACKTRACK\n", PTR(ptr)));
+                               state->ptr = ptr; /* backtrack */
+                               i = SRE_MATCH(state, pattern + 3);
+                               if (i <= 0)
+                                       return i;
+                               count++;
+                       }
+                       return 0; /* failure! */
+
+               case SRE_OP_MIN_UNTIL:
+                       /* end of repeat group */
+                       TRACE(("%8d: min until\n", PTR(ptr)));
+                       state->ptr = ptr;
+                       return 2; /* always succeeds, for now... */
+
+               case SRE_OP_BRANCH:
+                       /* match one of several subpatterns */
+                       /* format: <branch> <size> <head> ... <null> <tail> */
+                       TRACE(("%8d: branch\n", PTR(ptr)));
+                       while (*pattern) {
+                               if (pattern[1] != SRE_OP_LITERAL ||
+                                       (ptr < end && *ptr == (SRE_CHAR) pattern[2])) {
+                                       TRACE(("%8d: branch check\n", PTR(ptr)));
+                                       state->ptr = ptr;
+                                       i = SRE_MATCH(state, pattern + 1);
+                                       if (i > 0) {
+                                               TRACE(("%8d: branch succeeded\n", PTR(ptr)));
+                                               return 1;
+                                       }
+                               }
+                               pattern += *pattern;
+                       }
+                       TRACE(("%8d: branch failed\n", PTR(ptr)));
+                       return 0; /* failure! */
+
+               case SRE_OP_REPEAT:
+                       /* TEMPLATE: match repeated sequence (no backtracking) */
+                       /* format: <repeat> <skip> <min> <max> */
+                       TRACE(("%8d: repeat %d %d\n", PTR(ptr), pattern[1], pattern[2]));
+                       count = 0;
+                       state->ptr = ptr;
+                       while (count < (int) pattern[2]) {
+                               i = SRE_MATCH(state, pattern + 3);
+                               if (i <= 0)
+                                       break;
+                               count++;
+                       }
+                       if (count <= (int) pattern[1])
+                               return 0;
+                       TRACE(("%8d: repeat %d matches\n", PTR(ptr), count));
+                       pattern += pattern[0];
+                       ptr = state->ptr;
+                       break;
+
+               default:
+                       return SRE_ERROR_ILLEGAL;
+               }
+       }
+}
+
+LOCAL(int)
+SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
+{
+       SRE_CHAR* ptr = state->start;
+       SRE_CHAR* end = state->end;
+       int status = 0;
+
+       /* FIXME: <fl> add IGNORE cases (or implement full ASSERT support?      */
+
+       if (pattern[0] == SRE_OP_LITERAL) {
+               /* pattern starts with a literal */
+               SRE_CHAR chr = (SRE_CHAR) pattern[1];
+               for (;;) {
+                       while (ptr < end && *ptr != chr)
+                               ptr++;
+                       if (ptr == end)
+                               return 0;
+                       TRACE(("%8d: search found literal\n", PTR(ptr)));
+                       state->start = ptr;
+                       state->ptr = ++ptr;
+                       status = SRE_MATCH(state, pattern + 2);
+                       if (status != 0)
+                               break;
+               }
+
+       } else if (pattern[0] == SRE_OP_IN) {
+               /* pattern starts with a set */
+               for (;;) {
+                       /* format: <in> <skip> <data> */
+                       while (ptr < end && !SRE_MEMBER(pattern + 2, *ptr))
+                               ptr++;
+                       if (ptr == end)
+                               return 0;
+                       TRACE(("%8d: search found set\n", PTR(ptr)));
+                       state->start = ptr;
+                       state->ptr = ++ptr;
+                       status = SRE_MATCH(state, pattern + pattern[1] + 1);
+                       if (status != 0)
+                               break;
+               }
+
+       } else
+               while (ptr <= end) {
+                       TRACE(("%8d: search\n", PTR(ptr))); 
+                       state->start = state->ptr = ptr++;
+                       status = SRE_MATCH(state, pattern);
+                       if (status != 0)
+                               break;
+               }
+
+       return status;
+}
+
+#ifndef SRE_RECURSIVE
+
+/* -------------------------------------------------------------------- */
+/* factories and destructors */
+
+/* see sre.h for object declarations */
+
+staticforward PyTypeObject Pattern_Type;
+staticforward PyTypeObject Match_Type;
+
+static PyObject *
+_compile(PyObject* self_, PyObject* args)
+{
+       /* "compile" pattern descriptor to pattern object */
+
+       PatternObject* self;
+
+       PyObject* pattern;
+       PyObject* code;
+       int groups = 0;
+       PyObject* groupindex = NULL;
+       if (!PyArg_ParseTuple(args, "OO!|iO", &pattern,
+                          &PyString_Type, &code, &groups, &groupindex))
+               return NULL;
+
+       self = PyObject_NEW(PatternObject, &Pattern_Type);
+       if (self == NULL)
+               return NULL;
+
+       Py_INCREF(pattern);
+       self->pattern = pattern;
+
+       Py_INCREF(code);
+       self->code = code;
+
+       self->groups = groups;
+
+       Py_XINCREF(groupindex);
+       self->groupindex = groupindex;
+
+       return (PyObject*) self;
+}
+
+static PyObject *
+_getcodesize(PyObject* self_, PyObject* args)
+{
+       return Py_BuildValue("i", sizeof(SRE_CODE));
+}
+
+static PyObject*
+_pattern_new_match(PatternObject* pattern, SRE_STATE* state,
+                   PyObject* string, int status)
+{
+       /* create match object (from state object) */
+
+       MatchObject* match;
+       int i, j;
+
+       TRACE(("status = %d\n", status));
+
+       if (status > 0) {
+
+               /* create match object (with room for extra group marks) */
+               match = PyObject_NEW_VAR(MatchObject, &Match_Type, 2*pattern->groups);
+               if (match == NULL)
+                       return NULL;
+
+               Py_INCREF(pattern);
+               match->pattern = pattern;
+
+               Py_INCREF(string);
+               match->string = string;
+
+               match->groups = pattern->groups+1;
+
+               /* group zero */
+               match->mark[0] = ((char*) state->start -
+                                                 (char*) state->beginning) / state->charsize;
+               match->mark[1] = ((char*) state->ptr -
+                                                 (char*) state->beginning) / state->charsize;
+
+               /* fill in the rest of the groups */
+               for (i = j = 0; i < pattern->groups; i++, j+=2)
+                       if (state->mark[j] != NULL && state->mark[j+1] != NULL) {
+                               match->mark[j+2] = ((char*) state->mark[j] -
+                                                                       (char*) state->beginning) / state->charsize;
+                               match->mark[j+3] = ((char*) state->mark[j+1] -
+                                                                       (char*) state->beginning) / state->charsize;
+                       } else
+                               match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
+
+               return (PyObject*) match;
+
+       } else if (status < 0) {
+
+               /* internal error */
+               PyErr_SetString(
+                       PyExc_RuntimeError, "internal error in regular expression engine"
+                       );
+               return NULL;
+
+       }
+
+       Py_INCREF(Py_None);
+       return Py_None;
+}
+
+/* -------------------------------------------------------------------- */
+/* pattern methods */
+
+LOCAL(PyObject*)
+_setup(SRE_STATE* state, PyObject* args)
+{
+       /* prepare state object */
+
+       PyBufferProcs *buffer;
+       int i, count;
+       void* ptr;
+
+       PyObject* string;
+       int start = 0;
+       int end = INT_MAX;
+       if (!PyArg_ParseTuple(args, "O|ii", &string, &start, &end))
+               return NULL;
+
+       /* get pointer to string buffer */
+       buffer = string->ob_type->tp_as_buffer;
+       if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
+               buffer->bf_getsegcount(string, NULL) != 1) {
+               PyErr_SetString(PyExc_TypeError, "expected read-only buffer");
+               return NULL;
+       }
+
+       /* determine buffer size */
+       count = buffer->bf_getreadbuffer(string, 0, &ptr);
+       if (count < 0) {
+               /* sanity check */
+               PyErr_SetString(PyExc_TypeError, "buffer has negative size");
+               return NULL;
+       }
+
+       /* determine character size */
+       state->charsize = (PyUnicode_Check(string) ? sizeof(Py_UNICODE) : 1);
+
+       count /= state->charsize;
+
+       /* adjust boundaries */
+       if (start < 0)
+               start = 0;
+       else if (start > count)
+               start = count;
+
+       if (end < 0)
+               end = 0;
+       else if (end > count)
+               end = count;
+
+       state->beginning = ptr;
+
+       state->start = (void*) ((char*) ptr + start * state->charsize);
+       state->end = (void*) ((char*) ptr + end * state->charsize);
+
+       /* FIXME: dynamic! */
+       for (i = 0; i < 64; i++)
+               state->mark[i] = NULL;
+
+       state->stack = NULL;
+       state->stackbase = 0;
+       state->stacksize = 0;
+
+       return string;
+}
+
+static void
+_pattern_dealloc(PatternObject* self)
+{
+       Py_XDECREF(self->code);
+       Py_XDECREF(self->pattern);
+       Py_XDECREF(self->groupindex);
+       PyMem_DEL(self);
+}
+
+static PyObject*
+_pattern_match(PatternObject* self, PyObject* args)
+{
+       SRE_STATE state;
+       PyObject* string;
+       int status;
+
+       string = _setup(&state, args);
+       if (!string)
+               return NULL;
+
+       state.ptr = state.start;
+
+       if (state.charsize == 1) {
+               status = sre_match(&state, PatternObject_GetCode(self));
+       } else {
+               status = sre_umatch(&state, PatternObject_GetCode(self));
+       }
+
+       _stack_free(&state);
+
+       return _pattern_new_match(self, &state, string, status);
+}
+
+static PyObject*
+_pattern_search(PatternObject* self, PyObject* args)
+{
+       SRE_STATE state;
+       PyObject* string;
+       int status;
+
+       string = _setup(&state, args);
+       if (!string)
+               return NULL;
+
+       if (state.charsize == 1) {
+               status = sre_search(&state, PatternObject_GetCode(self));
+       } else {
+               status = sre_usearch(&state, PatternObject_GetCode(self));
+       }
+
+       _stack_free(&state);
+
+       return _pattern_new_match(self, &state, string, status);
+}
+
+static PyObject*
+_pattern_findall(PatternObject* self, PyObject* args)
+{
+       /* FIXME: not sure about the semantics here.  this is good enough
+          for SXP, though... */
+
+       SRE_STATE state;
+       PyObject* string;
+       PyObject* list;
+       int status;
+
+       string = _setup(&state, args);
+       if (!string)
+               return NULL;
+
+       list = PyList_New(0);
+
+       while (state.start < state.end) {
+
+               PyObject* item;
+               
+               state.ptr = state.start;
+
+               if (state.charsize == 1) {
+                       status = sre_match(&state, PatternObject_GetCode(self));
+               } else {
+                       status = sre_umatch(&state,     PatternObject_GetCode(self));
+               }
+
+               if (status >= 0) {
+
+                       if (status == 0)
+                               state.ptr = (void*) ((char*) state.start + 1);
+
+                       item = PySequence_GetSlice(
+                               string,
+                               ((char*) state.start - (char*) state.beginning),
+                               ((char*) state.ptr - (char*) state.beginning));
+                       if (!item)
+                               goto error;
+                       if (PyList_Append(list, item) < 0)
+                               goto error;
+
+                       state.start = state.ptr;
+
+               } else {
+
+                       /* internal error */
+                       PyErr_SetString(
+                               PyExc_RuntimeError,
+                               "internal error in regular expression engine"
+                               );
+                       goto error;
+
+               }
+       }
+
+       _stack_free(&state);
+
+       return list;
+
+error:
+       _stack_free(&state);
+       return NULL;
+       
+}
+
+static PyMethodDef _pattern_methods[] = {
+       {"match", (PyCFunction) _pattern_match, 1},
+       {"search", (PyCFunction) _pattern_search, 1},
+       {"findall", (PyCFunction) _pattern_findall, 1},
+       {NULL, NULL}
+};
+
+static PyObject*  
+_pattern_getattr(PatternObject* self, char* name)
+{
+    PyObject* res;
+
+       res = Py_FindMethod(_pattern_methods, (PyObject*) self, name);
+
+       if (res)
+               return res;
+
+       PyErr_Clear();
+
+    /* attributes */
+       if (!strcmp(name, "pattern")) {
+        Py_INCREF(self->pattern);
+               return self->pattern;
+    }
+    
+       PyErr_SetString(PyExc_AttributeError, name);
+       return NULL;
+}
+
+statichere PyTypeObject Pattern_Type = {
+       PyObject_HEAD_INIT(NULL)
+       0, "Pattern", sizeof(PatternObject), 0,
+       (destructor)_pattern_dealloc, /*tp_dealloc*/
+       0, /*tp_print*/
+       (getattrfunc)_pattern_getattr, /*tp_getattr*/
+};
+
+/* -------------------------------------------------------------------- */
+/* match methods */
+
+static void
+_match_dealloc(MatchObject* self)
+{
+       Py_XDECREF(self->string);
+       Py_DECREF(self->pattern);
+       PyMem_DEL(self);
+}
+
+static PyObject*
+getslice_by_index(MatchObject* self, int index)
+{
+       if (index < 0 || index >= self->groups) {
+               /* raise IndexError if we were given a bad group number */
+               PyErr_SetString(
+                       PyExc_IndexError,
+                       "no such group"
+                       );
+               return NULL;
+       }
+
+       if (self->string == Py_None || self->mark[index+index] < 0) {
+               /* return None if the string or group is undefined */
+               Py_INCREF(Py_None);
+               return Py_None;
+       }
+
+       return PySequence_GetSlice(
+               self->string, self->mark[index+index], self->mark[index+index+1]
+               );
+}
+
+static PyObject*
+getslice(MatchObject* self, PyObject* index)
+{
+       if (!PyInt_Check(index) && self->pattern->groupindex != NULL) {
+               /* FIXME: resource leak? */
+               index = PyObject_GetItem(self->pattern->groupindex, index);
+               if (!index)
+                       return NULL;
+       }
+
+       if (PyInt_Check(index))
+               return getslice_by_index(self, (int) PyInt_AS_LONG(index));
+
+       return getslice_by_index(self, -1); /* signal error */
+}
+
+static PyObject*
+_match_group(MatchObject* self, PyObject* args)
+{
+       PyObject* result;
+       int i, size;
+
+       size = PyTuple_GET_SIZE(args);
+
+       switch (size) {
+       case 0:
+               result = getslice(self, Py_False); /* force error */
+               break;
+       case 1:
+               result = getslice(self, PyTuple_GET_ITEM(args, 0));
+               break;
+       default:
+               /* fetch multiple items */
+               result = PyTuple_New(size);
+               if (!result)
+                       return NULL;
+               for (i = 0; i < size; i++) {
+                       PyObject* item = getslice(self, PyTuple_GET_ITEM(args, i));
+                       if (!item) {
+                               Py_DECREF(result);
+                               return NULL;
+                       }
+                       PyTuple_SET_ITEM(result, i, item);
+               }
+               break;
+       }
+       return result;
+}
+
+static PyObject*
+_match_groups(MatchObject* self, PyObject* args)
+{
+       PyObject* result;
+       int index;
+
+       result = PyTuple_New(self->groups-1);
+       if (!result)
+               return NULL;
+
+       for (index = 1; index < self->groups; index++) {
+               PyObject* item;
+               /* FIXME: <fl> handle default! */
+               item = getslice_by_index(self, index);
+               if (!item) {
+                       Py_DECREF(result);
+                       return NULL;
+               }
+               PyTuple_SET_ITEM(result, index-1, item);
+       }
+
+       return result;
+}
+
+static PyObject*
+_match_groupdict(MatchObject* self, PyObject* args)
+{
+       PyObject* result;
+       PyObject* keys;
+       int index;
+
+       result = PyDict_New();
+       if (!result)
+               return NULL;
+       if (!self->pattern->groupindex)
+               return result;
+
+       keys = PyMapping_Keys(self->pattern->groupindex);
+       if (!keys)
+               return NULL;
+
+       for (index = 0; index < PySequence_Length(keys); index++) {
+               PyObject* key;
+               PyObject* item;
+               key = PySequence_GetItem(keys, index);
+               if (!key) {
+                       Py_DECREF(keys);
+                       Py_DECREF(result);
+                       return NULL;
+               }
+               item = getslice(self, key);
+               if (!item) {
+                       Py_DECREF(key);
+                       Py_DECREF(keys);
+                       Py_DECREF(result);
+                       return NULL;
+               }
+               /* FIXME: <fl> this can fail, right? */
+               PyDict_SetItem(result, key, item);
+       }
+
+       Py_DECREF(keys);
+
+       return result;
+}
+
+static PyObject*
+_match_start(MatchObject* self, PyObject* args)
+{
+       int index = 0;
+       if (!PyArg_ParseTuple(args, "|i", &index))
+               return NULL;
+
+       if (index < 0 || index >= self->groups) {
+               PyErr_SetString(
+                       PyExc_IndexError,
+                       "no such group"
+                       );
+               return NULL;
+       }
+
+       if (self->mark[index*2] < 0) {
+               Py_INCREF(Py_None);
+               return Py_None;
+       }
+
+       return Py_BuildValue("i", self->mark[index*2]);
+}
+
+static PyObject*
+_match_end(MatchObject* self, PyObject* args)
+{
+       int index = 0;
+       if (!PyArg_ParseTuple(args, "|i", &index))
+               return NULL;
+
+       if (index < 0 || index >= self->groups) {
+               PyErr_SetString(
+                       PyExc_IndexError,
+                       "no such group"
+                       );
+               return NULL;
+       }
+
+       if (self->mark[index*2] < 0) {
+               Py_INCREF(Py_None);
+               return Py_None;
+       }
+
+       return Py_BuildValue("i", self->mark[index*2+1]);
+}
+
+static PyObject*
+_match_span(MatchObject* self, PyObject* args)
+{
+       int index = 0;
+       if (!PyArg_ParseTuple(args, "|i", &index))
+               return NULL;
+
+       if (index < 0 || index >= self->groups) {
+               PyErr_SetString(
+                       PyExc_IndexError,
+                       "no such group"
+                       );
+               return NULL;
+       }
+
+       if (self->mark[index*2] < 0) {
+               Py_INCREF(Py_None);
+               return Py_None;
+       }
+
+       return Py_BuildValue("ii", self->mark[index*2], self->mark[index*2+1]);
+}
+
+static PyMethodDef _match_methods[] = {
+       {"group", (PyCFunction) _match_group, 1},
+       {"start", (PyCFunction) _match_start, 1},
+       {"end", (PyCFunction) _match_end, 1},
+       {"span", (PyCFunction) _match_span, 1},
+       {"groups", (PyCFunction) _match_groups, 1},
+       {"groupdict", (PyCFunction) _match_groupdict, 1},
+       {NULL, NULL}
+};
+
+static PyObject*  
+_match_getattr(MatchObject* self, char* name)
+{
+       PyObject* res;
+
+       res = Py_FindMethod(_match_methods, (PyObject*) self, name);
+       if (res)
+               return res;
+
+       PyErr_Clear();
+
+       /* attributes! */
+       if (!strcmp(name, "string")) {
+        Py_INCREF(self->string);
+               return self->string;
+    }
+       if (!strcmp(name, "regs"))
+               /* FIXME: should return the whole list! */
+               return Py_BuildValue("((i,i))", self->mark[0], self->mark[1]);
+       if (!strcmp(name, "re")) {
+        Py_INCREF(self->pattern);
+               return (PyObject*) self->pattern;
+    }
+       if (!strcmp(name, "groupindex") && self->pattern->groupindex) {
+        Py_INCREF(self->pattern->groupindex);
+               return self->pattern->groupindex;
+    }
+       if (!strcmp(name, "pos"))
+               return Py_BuildValue("i", 0); /* FIXME */
+       if (!strcmp(name, "endpos"))
+               return Py_BuildValue("i", 0); /* FIXME */
+
+       PyErr_SetString(PyExc_AttributeError, name);
+       return NULL;
+}
+
+/* FIXME: implement setattr("string", None) as a special case (to
+   detach the associated string, if any */
+
+statichere PyTypeObject Match_Type = {
+       PyObject_HEAD_INIT(NULL)
+       0, "Match",
+       sizeof(MatchObject), /* size of basic object */
+       sizeof(int), /* space for group item */
+       (destructor)_match_dealloc, /*tp_dealloc*/
+       0, /*tp_print*/
+       (getattrfunc)_match_getattr, /*tp_getattr*/
+};
+
+static PyMethodDef _functions[] = {
+       {"compile", _compile, 1},
+       {"getcodesize", _getcodesize, 1},
+       {NULL, NULL}
+};
+
+void
+#ifdef WIN32
+__declspec(dllexport)
+#endif
+init_sre()
+{
+       /* Patch object types */
+       Pattern_Type.ob_type = Match_Type.ob_type = &PyType_Type;
+
+       Py_InitModule("_sre", _functions);
+}
+
+#endif
diff --git a/Modules/sre.h b/Modules/sre.h
new file mode 100644 (file)
index 0000000..2936b05
--- /dev/null
@@ -0,0 +1,38 @@
+/*
+ * Secret Labs' Regular Expression Engine
+ * $Id$
+ *
+ * simple regular expression matching engine
+ *
+ * Copyright (c) 1997-2000 by Secret Labs AB.  All rights reserved.
+ *
+ * See the _sre.c file for information on usage and redistribution.
+ */
+
+#ifndef SRE_INCLUDED
+#define SRE_INCLUDED
+
+#include "sre_constants.h"
+
+/* Python objects */
+
+typedef struct {
+    PyObject_HEAD
+    PyObject* code; /* link to the code string object */
+    PyObject* pattern; /* link to the pattern source (or None) */
+    int groups;
+    PyObject* groupindex;
+} PatternObject;
+
+#define PatternObject_GetCode(o) ((void*) PyString_AS_STRING((o)->code))
+
+typedef struct {
+    PyObject_HEAD
+    PyObject* string; /* link to the target string */
+    PatternObject* pattern; /* link to the regex (pattern) object */
+    int groups; /* number of groups (start/end marks) */
+    int mark[2];
+} MatchObject;
+
+#endif
+
diff --git a/Modules/sre_constants.h b/Modules/sre_constants.h
new file mode 100644 (file)
index 0000000..6b940d3
--- /dev/null
@@ -0,0 +1,27 @@
+/* generated by sre_constants.py */
+#define SRE_OP_FAILURE 0
+#define SRE_OP_SUCCESS 1
+#define SRE_OP_ANY 2
+#define SRE_OP_ASSERT 3
+#define SRE_OP_AT 4
+#define SRE_OP_BRANCH 5
+#define SRE_OP_CALL 6
+#define SRE_OP_CATEGORY 7
+#define SRE_OP_GROUP 8
+#define SRE_OP_GROUP_IGNORE 9
+#define SRE_OP_IN 10
+#define SRE_OP_IN_IGNORE 11
+#define SRE_OP_JUMP 12
+#define SRE_OP_LITERAL 13
+#define SRE_OP_LITERAL_IGNORE 14
+#define SRE_OP_MARK 15
+#define SRE_OP_MAX_REPEAT 16
+#define SRE_OP_MAX_UNTIL 17
+#define SRE_OP_MAX_REPEAT_ONE 18
+#define SRE_OP_MIN_REPEAT 19
+#define SRE_OP_MIN_UNTIL 20
+#define SRE_OP_NOT_LITERAL 21
+#define SRE_OP_NOT_LITERAL_IGNORE 22
+#define SRE_OP_NEGATE 23
+#define SRE_OP_RANGE 24
+#define SRE_OP_REPEAT 25