]> granicus.if.org Git - python/commitdiff
Added Fredrik Lundh's sre module and its supporting cast.
authorGuido van Rossum <guido@python.org>
Fri, 31 Mar 2000 14:58:54 +0000 (14:58 +0000)
committerGuido van Rossum <guido@python.org>
Fri, 31 Mar 2000 14:58:54 +0000 (14:58 +0000)
NOTE: THIS IS VERY ROUGH ALPHA CODE!

Lib/sre.py [new file with mode: 0644]
Lib/sre_compile.py [new file with mode: 0644]
Lib/sre_constants.py [new file with mode: 0644]
Lib/sre_parse.py [new file with mode: 0644]

diff --git a/Lib/sre.py b/Lib/sre.py
new file mode 100644 (file)
index 0000000..0b41057
--- /dev/null
@@ -0,0 +1,46 @@
+# -*- Mode: Python; tab-width: 4 -*-
+#
+# Secret Labs' Regular Expression Engine
+# $Id$
+#
+# re-compatible interface for the sre matching engine
+#
+# Copyright (c) 1998-2000 by Secret Labs AB.  All rights reserved.
+#
+# This code can only be used for 1.6 alpha testing.  All other use
+# require explicit permission from Secret Labs AB.
+#
+# Portions of this engine have been developed in cooperation with
+# CNRI.  Hewlett-Packard provided funding for 1.6 integration and
+# other compatibility work.
+#
+
+"""
+this is a long string
+"""
+
+import sre_compile
+
+# --------------------------------------------------------------------
+# public interface
+
+def compile(pattern, flags=0):
+    return sre_compile.compile(pattern, _fixflags(flags))
+
+def match(pattern, string, flags=0):
+    return compile(pattern, _fixflags(flags)).match(string)
+
+def search(pattern, string, flags=0):
+    assert flags == 0
+    return compile(pattern, _fixflags(flags)).search(string)
+
+# FIXME: etc
+
+# --------------------------------------------------------------------
+# helpers
+
+def _fixflags(flags):
+    # convert flag bitmask to sequence
+    assert flags == 0
+    return ()
+
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
new file mode 100644 (file)
index 0000000..3e9700b
--- /dev/null
@@ -0,0 +1,187 @@
+#
+# Secret Labs' Regular Expression Engine
+# $Id$
+#
+# convert template to internal format
+#
+# Copyright (c) 1997-2000 by Secret Labs AB.  All rights reserved.
+#
+# This code can only be used for 1.6 alpha testing.  All other use
+# require explicit permission from Secret Labs AB.
+#
+# Portions of this engine have been developed in cooperation with
+# CNRI.  Hewlett-Packard provided funding for 1.6 integration and
+# other compatibility work.
+#
+
+# FIXME: <fl> formalize (objectify?) and document the compiler code
+# format, so that other frontends can use this compiler
+
+import array, string, sys
+
+import _sre
+
+from sre_constants import *
+
+# find an array type code that matches the engine's code size
+for WORDSIZE in "BHil":
+    if len(array.array(WORDSIZE, [0]).tostring()) == _sre.getcodesize():
+       break
+else:
+    raise RuntimeError, "cannot find a useable array type"
+
+# FIXME: <fl> should move some optimizations from the parser to here!
+
+class Code:
+    def __init__(self):
+       self.data = []
+    def __len__(self):
+       return len(self.data)
+    def __getitem__(self, index):
+       return self.data[index]
+    def __setitem__(self, index, code):
+       self.data[index] = code
+    def append(self, code):
+       self.data.append(code)
+    def todata(self):
+       # print self.data
+       return array.array(WORDSIZE, self.data).tostring()
+
+def _lower(literal):
+    # return _sre._lower(literal) # FIXME
+    return string.lower(literal)
+
+def _compile(code, pattern, flags):
+    append = code.append
+    for op, av in pattern:
+       if op is ANY:
+           if "s" in flags:
+               append(CODES[op]) # any character at all!
+           else:
+               append(CODES[NOT_LITERAL])
+               append(10)
+       elif op in (SUCCESS, FAILURE):
+           append(CODES[op])
+       elif op is AT:
+           append(CODES[op])
+           append(POSITIONS[av])
+       elif op is BRANCH:
+           append(CODES[op])
+           tail = []
+           for av in av[1]:
+               skip = len(code); append(0)
+               _compile(code, av, flags)
+               append(CODES[JUMP])
+               tail.append(len(code)); append(0)
+               code[skip] = len(code) - skip
+           append(0) # end of branch
+           for tail in tail:
+               code[tail] = len(code) - tail
+       elif op is CALL:
+           append(CODES[op])
+           skip = len(code); append(0)
+           _compile(code, av, flags)
+           append(CODES[SUCCESS])
+           code[skip] = len(code) - skip
+       elif op is CATEGORY: # not used by current parser
+           append(CODES[op])
+           append(CATEGORIES[av])
+       elif op is GROUP:
+           if "i" in flags:
+               append(CODES[MAP_IGNORE[op]])
+           else:
+               append(CODES[op])
+           append(av)
+       elif op is IN:
+           if "i" in flags:
+               append(CODES[MAP_IGNORE[op]])
+               def fixup(literal):
+                   return ord(_lower(literal))
+           else:
+               append(CODES[op])
+               fixup = ord
+           skip = len(code); append(0)
+           for op, av in av:
+               append(CODES[op])
+               if op is NEGATE:
+                   pass
+               elif op is LITERAL:
+                   append(fixup(av))
+               elif op is RANGE:
+                   append(fixup(av[0]))
+                   append(fixup(av[1]))
+               elif op is CATEGORY:
+                   append(CATEGORIES[av])
+               else:
+                   raise ValueError, "unsupported set operator"
+           append(CODES[FAILURE])
+           code[skip] = len(code) - skip
+       elif op in (LITERAL, NOT_LITERAL):
+           if "i" in flags:
+               append(CODES[MAP_IGNORE[op]])
+               append(ord(_lower(av)))
+           else:
+               append(CODES[op])
+               append(ord(av))
+       elif op is MARK:
+           append(CODES[op])
+           append(av)
+       elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
+           lo, hi = av[2].getwidth()
+           if lo == 0:
+               raise SyntaxError, "cannot repeat zero-width items"
+           if lo == hi == 1 and op is MAX_REPEAT:
+               append(CODES[MAX_REPEAT_ONE])
+               skip = len(code); append(0)
+               append(av[0])
+               append(av[1])
+               _compile(code, av[2], flags)
+               append(CODES[SUCCESS])
+               code[skip] = len(code) - skip
+           else:
+               append(CODES[op])
+               skip = len(code); append(0)
+               append(av[0])
+               append(av[1])
+               _compile(code, av[2], flags)
+               if op is MIN_REPEAT:
+                   append(CODES[MIN_UNTIL])
+               else:
+                   # FIXME: MAX_REPEAT PROBABLY DOESN'T WORK (?)
+                   append(CODES[MAX_UNTIL])
+               code[skip] = len(code) - skip
+       elif op is SUBPATTERN:
+##         group = av[0]
+##         if group:
+##             append(CODES[MARK])
+##             append((group-1)*2)
+           _compile(code, av[1], flags)
+##         if group:
+##             append(CODES[MARK])
+##             append((group-1)*2+1)
+       else:
+           raise ValueError, ("unsupported operand type", op)
+
+def compile(p, flags=()):
+    # convert pattern list to internal format
+    if type(p) is type(""):
+       import sre_parse
+       pattern = p
+       p = sre_parse.parse(p)
+    else:
+       pattern = None
+    # print p.getwidth()
+    # print p
+    code = Code()
+    _compile(code, p.data, p.pattern.flags)
+    code.append(CODES[SUCCESS])
+    # print list(code.data)
+    data = code.todata()
+    if 0: # debugging
+       print
+       print "-" * 68
+       import sre_disasm
+       sre_disasm.disasm(data)
+       print "-" * 68
+    # print len(data), p.pattern.groups, len(p.pattern.groupdict)
+    return _sre.compile(pattern, data, p.pattern.groups-1, p.pattern.groupdict)
diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py
new file mode 100644 (file)
index 0000000..af88309
--- /dev/null
@@ -0,0 +1,131 @@
+#
+# Secret Labs' Regular Expression Engine
+# $Id$
+#
+# various symbols used by the regular expression engine.
+# run this script to update the _sre include files!
+#
+# Copyright (c) 1998-2000 by Secret Labs AB.  All rights reserved.
+#
+# This code can only be used for 1.6 alpha testing.  All other use
+# require explicit permission from Secret Labs AB.
+#
+# Portions of this engine have been developed in cooperation with
+# CNRI.  Hewlett-Packard provided funding for 1.6 integration and
+# other compatibility work.
+#
+
+# operators
+
+FAILURE = "failure"
+SUCCESS = "success"
+
+ANY = "any"
+ASSERT = "assert"
+AT = "at"
+BRANCH = "branch"
+CALL = "call"
+CATEGORY = "category"
+GROUP = "group"
+GROUP_IGNORE = "group_ignore"
+IN = "in"
+IN_IGNORE = "in_ignore"
+JUMP = "jump"
+LITERAL = "literal"
+LITERAL_IGNORE = "literal_ignore"
+MARK = "mark"
+MAX_REPEAT = "max_repeat"
+MAX_REPEAT_ONE = "max_repeat_one"
+MAX_UNTIL = "max_until"
+MIN_REPEAT = "min_repeat"
+MIN_UNTIL = "min_until"
+NEGATE = "negate"
+NOT_LITERAL = "not_literal"
+NOT_LITERAL_IGNORE = "not_literal_ignore"
+RANGE = "range"
+REPEAT = "repeat"
+SUBPATTERN = "subpattern"
+
+# positions
+AT_BEGINNING = "at_beginning"
+AT_BOUNDARY = "at_boundary"
+AT_NON_BOUNDARY = "at_non_boundary"
+AT_END = "at_end"
+
+# categories
+
+CATEGORY_DIGIT = "category_digit"
+CATEGORY_NOT_DIGIT = "category_not_digit"
+CATEGORY_SPACE = "category_space"
+CATEGORY_NOT_SPACE = "category_not_space"
+CATEGORY_WORD = "category_word"
+CATEGORY_NOT_WORD = "category_not_word"
+
+CODES = [
+
+    # failure=0 success=1 (just because it looks better that way :-)
+    FAILURE, SUCCESS,
+
+    ANY,
+    ASSERT,
+    AT,
+    BRANCH,
+    CALL,
+    CATEGORY,
+    GROUP, GROUP_IGNORE,
+    IN, IN_IGNORE,
+    JUMP,
+    LITERAL, LITERAL_IGNORE,
+    MARK,
+    MAX_REPEAT, MAX_UNTIL,
+    MAX_REPEAT_ONE,
+    MIN_REPEAT, MIN_UNTIL,
+    NOT_LITERAL, NOT_LITERAL_IGNORE,
+    NEGATE,
+    RANGE,
+    REPEAT
+
+]
+
+# convert to dictionary
+c = {}
+i = 0
+for code in CODES:
+    c[code] = i
+    i = i + 1
+CODES = c
+
+# replacement operations for "ignore case" mode
+MAP_IGNORE = {
+    GROUP: GROUP_IGNORE,
+    IN: IN_IGNORE,
+    LITERAL: LITERAL_IGNORE,
+    NOT_LITERAL: NOT_LITERAL_IGNORE
+}
+
+POSITIONS = {
+    AT_BEGINNING: ord("a"),
+    AT_BOUNDARY: ord("b"),
+    AT_NON_BOUNDARY: ord("B"),
+    AT_END: ord("z"),
+}
+
+CATEGORIES = {
+    CATEGORY_DIGIT: ord("d"),
+    CATEGORY_NOT_DIGIT: ord("D"),
+    CATEGORY_SPACE: ord("s"),
+    CATEGORY_NOT_SPACE: ord("S"),
+    CATEGORY_WORD: ord("w"),
+    CATEGORY_NOT_WORD: ord("W"),
+}
+
+if __name__ == "__main__":
+    import string
+    items = CODES.items()
+    items.sort(lambda a, b: cmp(a[1], b[1]))
+    f = open("sre_constants.h", "w")
+    f.write("/* generated by sre_constants.py */\n")
+    for k, v in items:
+       f.write("#define SRE_OP_" + string.upper(k) + " " + str(v) + "\n")
+    f.close()
+    print "done"
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py
new file mode 100644 (file)
index 0000000..a563de3
--- /dev/null
@@ -0,0 +1,492 @@
+#
+# Secret Labs' Regular Expression Engine
+# $Id$
+#
+# convert re-style regular expression to SRE template.  the current
+# implementation is somewhat incomplete, and not very fast.  should
+# definitely be rewritten before Python 1.6 goes beta.
+#
+# Copyright (c) 1998-2000 by Secret Labs AB.  All rights reserved.
+#
+# This code can only be used for 1.6 alpha testing.  All other use
+# require explicit permission from Secret Labs AB.
+#
+# Portions of this engine have been developed in cooperation with
+# CNRI.  Hewlett-Packard provided funding for 1.6 integration and
+# other compatibility work.
+#
+
+# FIXME: comments marked with the FIXME tag are open issues.  all such
+# issues should be closed before the final beta.
+
+import string, sys
+
+from sre_constants import *
+
+SPECIAL_CHARS = ".\\[{()*+?^$|"
+REPEAT_CHARS  = "*+?{"
+
+OCTDIGITS = "01234567"
+HEXDIGITS = "0123456789abcdefABCDEF"
+
+ESCAPES = {
+    "\\a": (LITERAL, chr(7)),
+    "\\b": (LITERAL, chr(8)),
+    "\\f": (LITERAL, chr(12)),
+    "\\n": (LITERAL, chr(10)),
+    "\\r": (LITERAL, chr(13)),
+    "\\t": (LITERAL, chr(9)),
+    "\\v": (LITERAL, chr(11))
+}
+
+CATEGORIES = {
+    "\\A": (AT, AT_BEGINNING), # start of string
+    "\\b": (AT, AT_BOUNDARY),
+    "\\B": (AT, AT_NON_BOUNDARY),
+    "\\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
+    "\\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
+    "\\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
+    "\\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
+    "\\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
+    "\\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
+    "\\Z": (AT, AT_END), # end of string
+}
+
+class Pattern:
+    # FIXME: <fl> rename class, and store flags in here too!
+    def __init__(self):
+       self.flags = []
+       self.groups = 1
+       self.groupdict = {}
+    def getgroup(self, name=None):
+       gid = self.groups
+       self.groups = gid + 1
+       if name:
+           self.groupdict[name] = gid
+       return gid
+    def setflag(self, flag):
+       if flag not in self.flags:
+           self.flags.append(flag)
+
+class SubPattern:
+    # a subpattern, in intermediate form
+    def __init__(self, pattern, data=None):
+       self.pattern = pattern
+       if not data:
+           data = []
+       self.data = data
+       self.flags = []
+       self.width = None
+    def __repr__(self):
+       return repr(self.data)
+    def __len__(self):
+       return len(self.data)
+    def __delitem__(self, index):
+       del self.data[index]
+    def __getitem__(self, index):
+       return self.data[index]
+    def __setitem__(self, index, code):
+       self.data[index] = code
+    def __getslice__(self, start, stop):
+       return SubPattern(self.pattern, self.data[start:stop])
+    def insert(self, index, code):
+       self.data.insert(index, code)
+    def append(self, code):
+       self.data.append(code)
+    def getwidth(self):
+       # determine the width (min, max) for this subpattern
+       if self.width:
+           return self.width
+       lo = hi = 0L
+       for op, av in self.data:
+           if op is BRANCH:
+               l = sys.maxint
+               h = 0
+               for av in av[1]:
+                   i, j = av.getwidth()
+                   l = min(l, i)
+                   h = min(h, j)
+               lo = lo + i
+               hi = hi + j
+           elif op is CALL:
+               i, j = av.getwidth()
+               lo = lo + i
+               hi = hi + j
+           elif op is SUBPATTERN:
+               i, j = av[1].getwidth()
+               lo = lo + i
+               hi = hi + j
+           elif op in (MIN_REPEAT, MAX_REPEAT):
+               i, j = av[2].getwidth()
+               lo = lo + i * av[0]
+               hi = hi + j * av[1]
+           elif op in (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY):
+               lo = lo + 1
+               hi = hi + 1
+           elif op == SUCCESS:
+               break
+       self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint))
+       return self.width
+    def set(self, flag):
+       if not flag in self.flags:
+           self.flags.append(flag)
+    def reset(self, flag):
+       if flag in self.flags:
+           self.flags.remove(flag)
+
+class Tokenizer:
+    def __init__(self, string):
+       self.string = list(string)
+       self.next = self.__next()
+    def __next(self):
+       if not self.string:
+           return None
+       char = self.string[0]
+       if char[0] == "\\":
+           try:
+               c = self.string[1]
+           except IndexError:
+               raise SyntaxError, "bogus escape"
+           char = char + c
+           try:
+               if c == "x":
+                   # hexadecimal constant
+                   for i in xrange(2, sys.maxint):
+                       c = self.string[i]
+                       if c not in HEXDIGITS:
+                           break
+                       char = char + c
+               elif c in string.digits:
+                   # decimal (or octal) number
+                   for i in xrange(2, sys.maxint):
+                       c = self.string[i]
+                       # FIXME: if larger than current number of
+                       # groups, interpret as an octal number 
+                       if c not in string.digits:
+                           break
+                       char = char + c
+           except IndexError:
+               pass # use what we've got this far
+       del self.string[0:len(char)]
+       return char
+    def match(self, char):
+       if char == self.next:
+           self.next = self.__next()
+           return 1
+       return 0
+    def match_set(self, set):
+       if self.next in set:
+           self.next = self.__next()
+           return 1
+       return 0
+    def get(self):
+       this = self.next
+       self.next = self.__next()
+       return this
+
+def _fixescape(escape, character_class=0):
+    # convert escape to (type, value)
+    if character_class:
+       # inside a character class, we'll look in the character
+       # escapes dictionary first
+       code = ESCAPES.get(escape)
+       if code:
+           return code
+       code = CATEGORIES.get(escape)
+    else:
+       code = CATEGORIES.get(escape)
+       if code:
+           return code
+       code = ESCAPES.get(escape)
+    if code:
+       return code
+    if not character_class:
+       try:
+           group = int(escape[1:])
+           # FIXME: only valid if group <= current number of groups
+           return GROUP, group
+       except ValueError:
+           pass
+    try:
+       if escape[1:2] == "x":
+           escape = escape[2:]
+           return LITERAL, chr(string.atoi(escape[-2:], 16) & 0xff)
+       elif escape[1:2] in string.digits:
+           return LITERAL, chr(string.atoi(escape[1:], 8) & 0xff)
+       elif len(escape) == 2:
+           return LITERAL, escape[1]
+    except ValueError:
+       pass
+    raise SyntaxError, "bogus escape: %s" % repr(escape)
+
+def _branch(subpattern, items):
+
+    # form a branch operator from a set of items (FIXME: move this
+    # optimization to the compiler module!)
+
+    # check if all items share a common prefix
+    while 1:
+       prefix = None
+       for item in items:
+           if not item:
+               break
+           if prefix is None:
+               prefix = item[0]
+           elif item[0] != prefix:
+               break
+       else:
+           # all subitems start with a common "prefix".
+           # move it out of the branch
+           for item in items:
+               del item[0]
+           subpattern.append(prefix)
+           continue # check next one
+       break
+
+    # check if the branch can be replaced by a character set
+    for item in items:
+       if len(item) != 1 or item[0][0] != LITERAL:
+           break
+    else:
+       # we can store this as a character set instead of a
+       # branch (FIXME: use a range if possible)
+       set = []
+       for item in items:
+           set.append(item[0])
+       subpattern.append((IN, set))
+       return
+
+    subpattern.append((BRANCH, (None, items)))
+
+def _parse(source, pattern, flags=()):
+
+    # parse regular expression pattern into an operator list.
+
+    subpattern = SubPattern(pattern)
+
+    this = None
+
+    while 1:
+
+       if source.next in ("|", ")"):
+           break # end of subpattern
+       this = source.get()
+       if this is None:
+           break # end of pattern
+
+       if this and this[0] not in SPECIAL_CHARS:
+           subpattern.append((LITERAL, this))
+
+       elif this == "[":
+           # character set
+           set = []
+##         if source.match(":"):
+##             pass # handle character classes
+           if source.match("^"):
+               set.append((NEGATE, None))
+           # check remaining characters
+           start = set[:]
+           while 1:
+               this = source.get()
+               if this == "]" and set != start:
+                   break
+               elif this and this[0] == "\\":
+                   code1 = _fixescape(this, 1)
+               elif this:
+                   code1 = LITERAL, this
+               else:
+                   raise SyntaxError, "unexpected end of regular expression"
+               if source.match("-"):
+                   # potential range
+                   this = source.get()
+                   if this == "]":
+                       set.append(code1)
+                       set.append((LITERAL, "-"))
+                       break
+                   else:
+                       if this[0] == "\\":
+                           code2 = _fixescape(this, 1)
+                       else:
+                           code2 = LITERAL, this
+                       if code1[0] != LITERAL or code2[0] != LITERAL:
+                           raise SyntaxError, "illegal range"
+                       if len(code1[1]) != 1 or len(code2[1]) != 1:
+                           raise SyntaxError, "illegal range"
+                       set.append((RANGE, (code1[1], code2[1])))
+               else:
+                   if code1[0] is IN:
+                       code1 = code1[1][0]
+                   set.append(code1)
+
+           # FIXME: <fl> move set optimization to support function
+           if len(set)==1 and set[0][0] is LITERAL:
+               subpattern.append(set[0]) # optimization
+           elif len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:
+               subpattern.append((NOT_LITERAL, set[1][1])) # optimization
+           else:
+               # FIXME: <fl> add charmap optimization
+               subpattern.append((IN, set))
+
+       elif this and this[0] in REPEAT_CHARS:
+           # repeat previous item
+           if this == "?":
+               min, max = 0, 1
+           elif this == "*":
+               min, max = 0, sys.maxint
+           elif this == "+":
+               min, max = 1, sys.maxint
+           elif this == "{":
+               min, max = 0, sys.maxint
+               lo = hi = ""
+               while source.next in string.digits:
+                   lo = lo + source.get()
+               if source.match(","):
+                   while source.next in string.digits:
+                       hi = hi + source.get()
+               else:
+                   hi = lo
+               if not source.match("}"):
+                   raise SyntaxError, "bogus range"
+               if lo:
+                   min = int(lo)
+               if hi:
+                   max = int(hi)
+               # FIXME: <fl> check that hi >= lo!
+           else:
+               raise SyntaxError, "not supported"
+           # figure out which item to repeat
+           # FIXME: should back up to the right mark, right?
+           if subpattern:
+               index = len(subpattern)-1
+               while subpattern[index][0] is MARK:
+                   index = index - 1
+               item = subpattern[index:index+1]
+           else:
+               raise SyntaxError, "nothing to repeat"
+           if source.match("?"):
+               subpattern[index] = (MIN_REPEAT, (min, max, item))
+           else:
+               subpattern[index] = (MAX_REPEAT, (min, max, item))
+       elif this == ".":
+           subpattern.append((ANY, None))
+       elif this == "(":
+           group = 1
+           name = None
+           if source.match("?"):
+               group = 0
+               # options
+               if source.match("P"):
+                   # named group: skip forward to end of name
+                   if source.match("<"):
+                       name = ""
+                       while 1:
+                           char = source.get()
+                           if char in (">", None):
+                               break
+                           name = name + char
+                       group = 1
+               elif source.match(":"):
+                   # non-capturing group
+                   group = 2
+               elif source.match_set("iI"):
+                   pattern.setflag("i")
+               elif source.match_set("lL"):
+                   pattern.setflag("l")
+               elif source.match_set("mM"):
+                   pattern.setflag("m")
+               elif source.match_set("sS"):
+                   pattern.setflag("s")
+               elif source.match_set("xX"):
+                   pattern.setflag("x")
+           if group:
+               # parse group contents
+               b = []
+               if group == 2:
+                   # anonymous group
+                   group = None
+               else:
+                   group = pattern.getgroup(name)
+               if group:
+                   subpattern.append((MARK, (group-1)*2))
+               while 1:
+                   p = _parse(source, pattern, flags)
+                   if source.match(")"):
+                       if b:
+                           b.append(p)
+                           _branch(subpattern, b)
+                       else:
+                           subpattern.append((SUBPATTERN, (group, p)))
+                       break
+                   elif source.match("|"):
+                       b.append(p)
+                   else:
+                       raise SyntaxError, "group not properly closed"
+               if group:
+                   subpattern.append((MARK, (group-1)*2+1))
+           else:
+               # FIXME: should this really be a while loop?
+               while source.get() not in (")", None):
+                   pass
+
+       elif this == "^":
+           subpattern.append((AT, AT_BEGINNING))
+
+       elif this == "$":
+           subpattern.append((AT, AT_END))
+
+       elif this and this[0] == "\\":
+           code =_fixescape(this)
+           subpattern.append(code)
+
+       else:
+           raise SyntaxError, "parser error"
+
+    return subpattern
+
+def parse(source, flags=()):
+    s = Tokenizer(source)
+    g = Pattern()
+    b = []
+    while 1:
+       p = _parse(s, g, flags)
+       tail = s.get()
+       if tail == "|":
+           b.append(p)
+       elif tail == ")":
+           raise SyntaxError, "unbalanced parenthesis"
+       elif tail is None:
+           if b:
+               b.append(p)
+               p = SubPattern(g)
+               _branch(p, b)
+           break
+       else:
+           raise SyntaxError, "bogus characters at end of regular expression"
+    return p
+
+if __name__ == "__main__":
+    from pprint import pprint
+    from testpatterns import PATTERNS
+    a = b = c = 0
+    for pattern, flags in PATTERNS:
+       if flags:
+           continue
+       print "-"*68
+       try:
+           p = parse(pattern)
+           print repr(pattern), "->"
+           pprint(p.data)
+           import sre_compile
+           try:
+               code = sre_compile.compile(p)
+               c = c + 1
+           except:
+               pass
+           a = a + 1
+       except SyntaxError, v:
+           print "**", repr(pattern), v
+       b = b + 1
+    print "-"*68
+    print a, "of", b, "patterns successfully parsed"
+    print c, "of", b, "patterns successfully compiled"
+