granicus.if.org Git - python/commitdiff
the usual
author Guido van Rossum <guido@python.org>
Fri, 30 Jun 2000 16:13:37 +0000 (16:13 +0000)
committer Guido van Rossum <guido@python.org>
Fri, 30 Jun 2000 16:13:37 +0000 (16:13 +0000)
Lib/dos-8x3/sre_comp.py
Lib/dos-8x3/sre_cons.py
Lib/dos-8x3/sre_pars.py
Lib/dos-8x3/test_has.py [new file with mode: 0644]

index c0423750e3b9073e58c072f5ac262772484a2c4b..e48a7eb9901dea43845b56e7b5037d263df1b1b1 100644 (file)
@@ -18,157 +18,212 @@ from sre_constants import *
 # find an array type code that matches the engine's code size
 for WORDSIZE in "BHil":
     if len(array.array(WORDSIZE, [0]).tostring()) == _sre.getcodesize():
-       break
+        break
 else:
     raise RuntimeError, "cannot find a useable array type"
 
 def _compile(code, pattern, flags):
+    # internal: compile a (sub)pattern
     emit = code.append
     for op, av in pattern:
-       if op is ANY:
-           if flags & SRE_FLAG_DOTALL:
-               emit(OPCODES[op])
-           else:
-               emit(OPCODES[CATEGORY])
-               emit(CHCODES[CATEGORY_NOT_LINEBREAK])
-       elif op in (SUCCESS, FAILURE):
-           emit(OPCODES[op])
-       elif op is AT:
-           emit(OPCODES[op])
-           if flags & SRE_FLAG_MULTILINE:
-               emit(ATCODES[AT_MULTILINE[av]])
-           else:
-               emit(ATCODES[av])
-       elif op is BRANCH:
-           emit(OPCODES[op])
-           tail = []
-           for av in av[1]:
-               skip = len(code); emit(0)
-               _compile(code, av, flags)
-               emit(OPCODES[JUMP])
-               tail.append(len(code)); emit(0)
-               code[skip] = len(code) - skip
-           emit(0) # end of branch
-           for tail in tail:
-               code[tail] = len(code) - tail
-       elif op is CALL:
-           emit(OPCODES[op])
-           skip = len(code); emit(0)
-           _compile(code, av, flags)
-           emit(OPCODES[SUCCESS])
-           code[skip] = len(code) - skip
-       elif op is CATEGORY:
-           emit(OPCODES[op])
-           if flags & SRE_FLAG_LOCALE:
-               emit(CH_LOCALE[CHCODES[av]])
-           elif flags & SRE_FLAG_UNICODE:
-               emit(CH_UNICODE[CHCODES[av]])
-           else:
-               emit(CHCODES[av])
-       elif op is GROUP:
-           if flags & SRE_FLAG_IGNORECASE:
-               emit(OPCODES[OP_IGNORE[op]])
-           else:
-               emit(OPCODES[op])
-           emit(av-1)
-       elif op is IN:
-           if flags & SRE_FLAG_IGNORECASE:
-               emit(OPCODES[OP_IGNORE[op]])
-               def fixup(literal, flags=flags):
-                   return _sre.getlower(ord(literal), flags)
-           else:
-               emit(OPCODES[op])
-               fixup = ord
-           skip = len(code); emit(0)
-           for op, av in av:
-               emit(OPCODES[op])
-               if op is NEGATE:
-                   pass
-               elif op is LITERAL:
-                   emit(fixup(av))
-               elif op is RANGE:
-                   emit(fixup(av[0]))
-                   emit(fixup(av[1]))
-               elif op is CATEGORY:
-                   if flags & SRE_FLAG_LOCALE:
-                       emit(CH_LOCALE[CHCODES[av]])
-                   elif flags & SRE_FLAG_UNICODE:
-                       emit(CH_UNICODE[CHCODES[av]])
-                   else:
-                       emit(CHCODES[av])
-               else:
-                   raise error, "internal: unsupported set operator"
-           emit(OPCODES[FAILURE])
-           code[skip] = len(code) - skip
-       elif op in (LITERAL, NOT_LITERAL):
-           if flags & SRE_FLAG_IGNORECASE:
-               emit(OPCODES[OP_IGNORE[op]])
-           else:
-               emit(OPCODES[op])
-           emit(ord(av))
-       elif op is MARK:
-           emit(OPCODES[op])
-           emit(av)
-       elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
-           if flags & SRE_FLAG_TEMPLATE:
-               emit(OPCODES[REPEAT])
-               skip = len(code); emit(0)
-               emit(av[0])
-               emit(av[1])
-               _compile(code, av[2], flags)
-               emit(OPCODES[SUCCESS])
-               code[skip] = len(code) - skip
-           else:
-               lo, hi = av[2].getwidth()
-               if lo == 0:
-                   raise error, "nothing to repeat"
-               if 0 and lo == hi == 1 and op is MAX_REPEAT:
-                   # FIXME: <fl> need a better way to figure out when
-                   # it's safe to use this one (in the parser, probably)
-                   emit(OPCODES[MAX_REPEAT_ONE])
-                   skip = len(code); emit(0)
-                   emit(av[0])
-                   emit(av[1])
-                   _compile(code, av[2], flags)
-                   emit(OPCODES[SUCCESS])
-                   code[skip] = len(code) - skip
-               else:
-                   emit(OPCODES[op])
-                   skip = len(code); emit(0)
-                   emit(av[0])
-                   emit(av[1])
-                   _compile(code, av[2], flags)
-                   emit(OPCODES[SUCCESS])
-                   code[skip] = len(code) - skip
-       elif op is SUBPATTERN:
-           group = av[0]
-           if group:
-               emit(OPCODES[MARK])
-               emit((group-1)*2)
-           _compile(code, av[1], flags)
-           if group:
-               emit(OPCODES[MARK])
-               emit((group-1)*2+1)
-       else:
-           raise ValueError, ("unsupported operand type", op)
+        if op in (LITERAL, NOT_LITERAL):
+            if flags & SRE_FLAG_IGNORECASE:
+                emit(OPCODES[OP_IGNORE[op]])
+            else:
+                emit(OPCODES[op])
+            emit(av)
+        elif op is IN:
+            if flags & SRE_FLAG_IGNORECASE:
+                emit(OPCODES[OP_IGNORE[op]])
+                def fixup(literal, flags=flags):
+                    return _sre.getlower(literal, flags)
+            else:
+                emit(OPCODES[op])
+                fixup = lambda x: x
+            skip = len(code); emit(0)
+            for op, av in av:
+                emit(OPCODES[op])
+                if op is NEGATE:
+                    pass
+                elif op is LITERAL:
+                    emit(fixup(av))
+                elif op is RANGE:
+                    emit(fixup(av[0]))
+                    emit(fixup(av[1]))
+                elif op is CATEGORY:
+                    if flags & SRE_FLAG_LOCALE:
+                        emit(CHCODES[CH_LOCALE[av]])
+                    elif flags & SRE_FLAG_UNICODE:
+                        emit(CHCODES[CH_UNICODE[av]])
+                    else:
+                        emit(CHCODES[av])
+                else:
+                    raise error, "internal: unsupported set operator"
+            emit(OPCODES[FAILURE])
+            code[skip] = len(code) - skip
+        elif op is ANY:
+            if flags & SRE_FLAG_DOTALL:
+                emit(OPCODES[op])
+            else:
+                emit(OPCODES[CATEGORY])
+                emit(CHCODES[CATEGORY_NOT_LINEBREAK])
+        elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
+            if flags & SRE_FLAG_TEMPLATE:
+                emit(OPCODES[REPEAT])
+                skip = len(code); emit(0)
+                emit(av[0])
+                emit(av[1])
+                _compile(code, av[2], flags)
+                emit(OPCODES[SUCCESS])
+                code[skip] = len(code) - skip
+            else:
+                lo, hi = av[2].getwidth()
+                if lo == 0:
+                    raise error, "nothing to repeat"
+                if 0 and lo == hi == 1 and op is MAX_REPEAT:
+                    # FIXME: <fl> need a better way to figure out when
+                    # it's safe to use this one (in the parser, probably)
+                    emit(OPCODES[MAX_REPEAT_ONE])
+                    skip = len(code); emit(0)
+                    emit(av[0])
+                    emit(av[1])
+                    _compile(code, av[2], flags)
+                    emit(OPCODES[SUCCESS])
+                    code[skip] = len(code) - skip
+                else:
+                    emit(OPCODES[op])
+                    skip = len(code); emit(0)
+                    emit(av[0])
+                    emit(av[1])
+                    _compile(code, av[2], flags)
+                    emit(OPCODES[SUCCESS])
+                    code[skip] = len(code) - skip
+        elif op is SUBPATTERN:
+            group = av[0]
+            if group:
+                emit(OPCODES[MARK])
+                emit((group-1)*2)
+            _compile(code, av[1], flags)
+            if group:
+                emit(OPCODES[MARK])
+                emit((group-1)*2+1)
+        elif op in (SUCCESS, FAILURE):
+            emit(OPCODES[op])
+        elif op in (ASSERT, ASSERT_NOT, CALL):
+            emit(OPCODES[op])
+            skip = len(code); emit(0)
+            _compile(code, av, flags)
+            emit(OPCODES[SUCCESS])
+            code[skip] = len(code) - skip
+        elif op is AT:
+            emit(OPCODES[op])
+            if flags & SRE_FLAG_MULTILINE:
+                emit(ATCODES[AT_MULTILINE[av]])
+            else:
+                emit(ATCODES[av])
+        elif op is BRANCH:
+            emit(OPCODES[op])
+            tail = []
+            for av in av[1]:
+                skip = len(code); emit(0)
+                _compile(code, av, flags)
+                emit(OPCODES[JUMP])
+                tail.append(len(code)); emit(0)
+                code[skip] = len(code) - skip
+            emit(0) # end of branch
+            for tail in tail:
+                code[tail] = len(code) - tail
+        elif op is CATEGORY:
+            emit(OPCODES[op])
+            if flags & SRE_FLAG_LOCALE:
+                emit(CHCODES[CH_LOCALE[av]])
+            elif flags & SRE_FLAG_UNICODE:
+                emit(CHCODES[CH_UNICODE[av]])
+            else:
+                emit(CHCODES[av])
+        elif op is GROUP:
+            if flags & SRE_FLAG_IGNORECASE:
+                emit(OPCODES[OP_IGNORE[op]])
+            else:
+                emit(OPCODES[op])
+            emit(av-1)
+        elif op is MARK:
+            emit(OPCODES[op])
+            emit(av)
+        else:
+            raise ValueError, ("unsupported operand type", op)
+
+def _compile_info(code, pattern, flags):
+    # internal: compile an info block.  in the current version,
+    # this contains min/max pattern width and a literal prefix,
+    # if any
+    lo, hi = pattern.getwidth()
+    if lo == 0:
+        return # not worth it
+    # look for a literal prefix
+    prefix = []
+    if not (flags & SRE_FLAG_IGNORECASE):
+        for op, av in pattern.data:
+            if op is LITERAL:
+                prefix.append(av)
+            else:
+                break
+    # add an info block
+    emit = code.append
+    emit(OPCODES[INFO])
+    skip = len(code); emit(0)
+    # literal flag
+    mask = 0
+    if len(prefix) == len(pattern.data):
+        mask = 1
+    emit(mask)
+    # pattern length
+    emit(lo)
+    if hi < 32768:
+        emit(hi)
+    else:
+        emit(0)
+    # add literal prefix
+    emit(len(prefix))
+    if prefix:
+        code.extend(prefix)
+        # generate overlap table
+        table = [-1] + ([0]*len(prefix))
+        for i in range(len(prefix)):
+            table[i+1] = table[i]+1
+            while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
+                table[i+1] = table[table[i+1]-1]+1
+        code.extend(table[1:]) # don't store first entry
+    code[skip] = len(code) - skip
 
 def compile(p, flags=0):
     # internal: convert pattern list to internal format
+
+    # compile, as necessary
     if type(p) in (type(""), type(u"")):
-       import sre_parse
-       pattern = p
-       p = sre_parse.parse(p)
+        import sre_parse
+        pattern = p
+        p = sre_parse.parse(p)
     else:
-       pattern = None
+        pattern = None
+
     flags = p.pattern.flags | flags
     code = []
+
+    # compile info block
+    _compile_info(code, p, flags)
+
+    # compile the pattern
     _compile(code, p.data, flags)
+
     code.append(OPCODES[SUCCESS])
-    # FIXME: <fl> get rid of this limitation
+
+    # FIXME: <fl> get rid of this limitation!
     assert p.pattern.groups <= 100,\
-          "sorry, but this version only supports 100 named groups"
+           "sorry, but this version only supports 100 named groups"
+
     return _sre.compile(
-       pattern, flags,
-       array.array(WORDSIZE, code).tostring(),
-       p.pattern.groups-1, p.pattern.groupdict
-       )
+        pattern, flags,
+        array.array(WORDSIZE, code).tostring(),
+        p.pattern.groups-1, p.pattern.groupdict
+        )
index f5e7894e3bae44bef3b2ccb74669386cb742fdab..45f4f482d2887f10ae6175b23a5de129b7f59976 100644 (file)
@@ -23,6 +23,7 @@ SUCCESS = "success"
 
 ANY = "any"
 ASSERT = "assert"
+ASSERT_NOT = "assert_not"
 AT = "at"
 BRANCH = "branch"
 CALL = "call"
@@ -81,7 +82,7 @@ OPCODES = [
     FAILURE, SUCCESS,
 
     ANY,
-    ASSERT,
+    ASSERT, ASSERT_NOT,
     AT,
     BRANCH,
     CALL,
@@ -121,8 +122,8 @@ def makedict(list):
     d = {}
     i = 0
     for item in list:
-       d[item] = i
-       i = i + 1
+        d[item] = i
+        i = i + 1
     return d
 
 OPCODES = makedict(OPCODES)
@@ -176,12 +177,27 @@ SRE_FLAG_VERBOSE = 64
 if __name__ == "__main__":
     import string
     def dump(f, d, prefix):
-       items = d.items()
-       items.sort(lambda a, b: cmp(a[1], b[1]))
-       for k, v in items:
-           f.write("#define %s_%s %s\n" % (prefix, string.upper(k), v))
+        items = d.items()
+        items.sort(lambda a, b: cmp(a[1], b[1]))
+        for k, v in items:
+            f.write("#define %s_%s %s\n" % (prefix, string.upper(k), v))
     f = open("sre_constants.h", "w")
-    f.write("/* generated from sre_constants.py */\n")
+    f.write("""\
+/*
+ * Secret Labs' Regular Expression Engine
+ *
+ * regular expression matching engine
+ *
+ * NOTE: This file is generated by sre_constants.py.  If you need
+ * to change anything in here, edit sre_constants.py and run it.
+ *
+ * Copyright (c) 1997-2000 by Secret Labs AB.  All rights reserved.
+ *
+ * See the _sre.c file for information on usage and redistribution.
+ */
+
+""")
+
     dump(f, OPCODES, "SRE_OP")
     dump(f, ATCODES, "SRE")
     dump(f, CHCODES, "SRE")
index 93a7b5dc997b1fe3e3fe2369c8974a518ad18571..fb954e994c6e28576ea878c522abde3f2c72ce9a 100644 (file)
@@ -19,6 +19,9 @@ from sre_constants import *
 # FIXME: should be 65535, but the arraymodule is still broken
 MAXREPEAT = 32767
 
+# FIXME: same here
+CHARMASK = 0x7fff
+
 SPECIAL_CHARS = ".\\[{()*+?^$|"
 REPEAT_CHARS  = "*+?{"
 
@@ -30,26 +33,27 @@ HEXDIGITS = tuple("0123456789abcdefABCDEF")
 WHITESPACE = string.whitespace
 
 ESCAPES = {
-    "\\a": (LITERAL, chr(7)),
-    "\\b": (LITERAL, chr(8)),
-    "\\f": (LITERAL, chr(12)),
-    "\\n": (LITERAL, chr(10)),
-    "\\r": (LITERAL, chr(13)),
-    "\\t": (LITERAL, chr(9)),
-    "\\v": (LITERAL, chr(11))
+    r"\a": (LITERAL, 7),
+    r"\b": (LITERAL, 8),
+    r"\f": (LITERAL, 12),
+    r"\n": (LITERAL, 10),
+    r"\r": (LITERAL, 13),
+    r"\t": (LITERAL, 9),
+    r"\v": (LITERAL, 11),
+    r"\\": (LITERAL, ord("\\"))
 }
 
 CATEGORIES = {
-    "\\A": (AT, AT_BEGINNING), # start of string
-    "\\b": (AT, AT_BOUNDARY),
-    "\\B": (AT, AT_NON_BOUNDARY),
-    "\\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
-    "\\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
-    "\\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
-    "\\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
-    "\\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
-    "\\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
-    "\\Z": (AT, AT_END), # end of string
+    r"\A": (AT, AT_BEGINNING), # start of string
+    r"\b": (AT, AT_BOUNDARY),
+    r"\B": (AT, AT_NON_BOUNDARY),
+    r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
+    r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
+    r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
+    r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
+    r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
+    r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
+    r"\Z": (AT, AT_END), # end of string
 }
 
 FLAGS = {
@@ -66,106 +70,106 @@ FLAGS = {
 
 class State:
     def __init__(self):
-       self.flags = 0
-       self.groups = 1
-       self.groupdict = {}
+        self.flags = 0
+        self.groups = 1
+        self.groupdict = {}
     def getgroup(self, name=None):
-       gid = self.groups
-       self.groups = gid + 1
-       if name:
-           self.groupdict[name] = gid
-       return gid
+        gid = self.groups
+        self.groups = gid + 1
+        if name:
+            self.groupdict[name] = gid
+        return gid
 
 class SubPattern:
     # a subpattern, in intermediate form
     def __init__(self, pattern, data=None):
-       self.pattern = pattern
-       if not data:
-           data = []
-       self.data = data
-       self.width = None
+        self.pattern = pattern
+        if not data:
+            data = []
+        self.data = data
+        self.width = None
     def __repr__(self):
-       return repr(self.data)
+        return repr(self.data)
     def __len__(self):
-       return len(self.data)
+        return len(self.data)
     def __delitem__(self, index):
-       del self.data[index]
+        del self.data[index]
     def __getitem__(self, index):
-       return self.data[index]
+        return self.data[index]
     def __setitem__(self, index, code):
-       self.data[index] = code
+        self.data[index] = code
     def __getslice__(self, start, stop):
-       return SubPattern(self.pattern, self.data[start:stop])
+        return SubPattern(self.pattern, self.data[start:stop])
     def insert(self, index, code):
-       self.data.insert(index, code)
+        self.data.insert(index, code)
     def append(self, code):
-       self.data.append(code)
+        self.data.append(code)
     def getwidth(self):
-       # determine the width (min, max) for this subpattern
-       if self.width:
-           return self.width
-       lo = hi = 0L
-       for op, av in self.data:
-           if op is BRANCH:
-               l = sys.maxint
-               h = 0
-               for av in av[1]:
-                   i, j = av.getwidth()
-                   l = min(l, i)
-                   h = min(h, j)
-               lo = lo + i
-               hi = hi + j
-           elif op is CALL:
-               i, j = av.getwidth()
-               lo = lo + i
-               hi = hi + j
-           elif op is SUBPATTERN:
-               i, j = av[1].getwidth()
-               lo = lo + i
-               hi = hi + j
-           elif op in (MIN_REPEAT, MAX_REPEAT):
-               i, j = av[2].getwidth()
-               lo = lo + long(i) * av[0]
-               hi = hi + long(j) * av[1]
-           elif op in (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY):
-               lo = lo + 1
-               hi = hi + 1
-           elif op == SUCCESS:
-               break
-       self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint))
-       return self.width
+        # determine the width (min, max) for this subpattern
+        if self.width:
+            return self.width
+        lo = hi = 0L
+        for op, av in self.data:
+            if op is BRANCH:
+                l = sys.maxint
+                h = 0
+                for av in av[1]:
+                    i, j = av.getwidth()
+                    l = min(l, i)
+                    h = min(h, j)
+                lo = lo + i
+                hi = hi + j
+            elif op is CALL:
+                i, j = av.getwidth()
+                lo = lo + i
+                hi = hi + j
+            elif op is SUBPATTERN:
+                i, j = av[1].getwidth()
+                lo = lo + i
+                hi = hi + j
+            elif op in (MIN_REPEAT, MAX_REPEAT):
+                i, j = av[2].getwidth()
+                lo = lo + long(i) * av[0]
+                hi = hi + long(j) * av[1]
+            elif op in (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY):
+                lo = lo + 1
+                hi = hi + 1
+            elif op == SUCCESS:
+                break
+        self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint))
+        return self.width
 
 class Tokenizer:
     def __init__(self, string):
-       self.index = 0
-       self.string = string
-       self.next = self.__next()
+        self.index = 0
+        self.string = string
+        self.next = self.__next()
     def __next(self):
-       if self.index >= len(self.string):
-           return None
-       char = self.string[self.index]
-       if char[0] == "\\":
-           try:
-               c = self.string[self.index + 1]
-           except IndexError:
-               raise error, "bogus escape"
-           char = char + c
-       self.index = self.index + len(char)
-       return char
+        if self.index >= len(self.string):
+            return None
+        char = self.string[self.index]
+        if char[0] == "\\":
+            try:
+                c = self.string[self.index + 1]
+            except IndexError:
+                raise error, "bogus escape"
+            char = char + c
+        self.index = self.index + len(char)
+        return char
     def match(self, char):
-       if char == self.next:
-           self.next = self.__next()
-           return 1
-       return 0
+        if char == self.next:
+            self.next = self.__next()
+            return 1
+        return 0
     def match_set(self, set):
-       if self.next and self.next in set:
-           self.next = self.__next()
-           return 1
-       return 0
+        if self.next and self.next in set:
+            self.next = self.__next()
+            return 1
+        return 0
     def get(self):
-       this = self.next
-       self.next = self.__next()
-       return this
+        this = self.next
+        self.next = self.__next()
+        return this
 
 def isident(char):
     return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_"
@@ -175,127 +179,118 @@ def isdigit(char):
 
 def isname(name):
     # check that group name is a valid string
-    # FIXME: <fl> this code is really lame.  should use a regular
-    # expression instead, but I seem to have certain bootstrapping
-    # problems here ;-)
     if not isident(name[0]):
-       return 0
+        return 0
     for char in name:
-       if not isident(char) and not isdigit(char):
-           return 0
+        if not isident(char) and not isdigit(char):
+            return 0
     return 1
 
-def _group(escape, state):
+def _group(escape, groups):
     # check if the escape string represents a valid group
     try:
-       group = int(escape[1:])
-       if group and group < state.groups:
-           return group
+        gid = int(escape[1:])
+        if gid and gid < groups:
+            return gid
     except ValueError:
-       pass
+        pass
     return None # not a valid group
 
 def _class_escape(source, escape):
     # handle escape code inside character class
     code = ESCAPES.get(escape)
     if code:
-       return code
+        return code
     code = CATEGORIES.get(escape)
     if code:
-       return code
+        return code
     try:
-       if escape[1:2] == "x":
-           while source.next in HEXDIGITS:
-               escape = escape + source.get()
-           escape = escape[2:]
-           # FIXME: support unicode characters!
-           return LITERAL, chr(int(escape[-4:], 16) & 0xff)
-       elif str(escape[1:2]) in OCTDIGITS:
-           while source.next in OCTDIGITS:
-               escape = escape + source.get()
-           escape = escape[1:]
-           # FIXME: support unicode characters!
-           return LITERAL, chr(int(escape[-6:], 8) & 0xff)
-       if len(escape) == 2:
-           return LITERAL, escape[1]
+        if escape[1:2] == "x":
+            while source.next in HEXDIGITS:
+                escape = escape + source.get()
+            escape = escape[2:]
+            return LITERAL, int(escape[-4:], 16) & CHARMASK
+        elif str(escape[1:2]) in OCTDIGITS:
+            while source.next in OCTDIGITS:
+                escape = escape + source.get()
+            escape = escape[1:]
+            return LITERAL, int(escape[-6:], 8) & CHARMASK
+        if len(escape) == 2:
+            return LITERAL, ord(escape[1])
     except ValueError:
-       pass
+        pass
     raise error, "bogus escape: %s" % repr(escape)
 
 def _escape(source, escape, state):
     # handle escape code in expression
     code = CATEGORIES.get(escape)
     if code:
-       return code
+        return code
     code = ESCAPES.get(escape)
     if code:
-       return code
+        return code
     try:
-       if escape[1:2] == "x":
-           while source.next in HEXDIGITS:
-               escape = escape + source.get()
-           escape = escape[2:]
-           # FIXME: support unicode characters!
-           return LITERAL, chr(int(escape[-4:], 16) & 0xff)
-       elif escape[1:2] in DIGITS:
-           while 1:
-               group = _group(escape, state)
-               if group:
-                   if (not source.next or
-                       not _group(escape + source.next, state)):
-                       return GROUP, group
-                   escape = escape + source.get()
-               elif source.next in OCTDIGITS:
-                   escape = escape + source.get()
-               else:
-                   break
-           escape = escape[1:]
-           # FIXME: support unicode characters!
-           return LITERAL, chr(int(escape[-6:], 8) & 0xff)
-       if len(escape) == 2:
-           return LITERAL, escape[1]
+        if escape[1:2] == "x":
+            while source.next in HEXDIGITS:
+                escape = escape + source.get()
+            escape = escape[2:]
+            return LITERAL, int(escape[-4:], 16) & CHARMASK
+        elif escape[1:2] in DIGITS:
+            while 1:
+                group = _group(escape, state.groups)
+                if group:
+                    if (not source.next or
+                        not _group(escape + source.next, state.groups)):
+                        return GROUP, group
+                    escape = escape + source.get()
+                elif source.next in OCTDIGITS:
+                    escape = escape + source.get()
+                else:
+                    break
+            escape = escape[1:]
+            return LITERAL, int(escape[-6:], 8) & CHARMASK
+        if len(escape) == 2:
+            return LITERAL, ord(escape[1])
     except ValueError:
-       pass
+        pass
     raise error, "bogus escape: %s" % repr(escape)
 
-
 def _branch(pattern, items):
-
     # form a branch operator from a set of items
 
     subpattern = SubPattern(pattern)
 
     # check if all items share a common prefix
     while 1:
-       prefix = None
-       for item in items:
-           if not item:
-               break
-           if prefix is None:
-               prefix = item[0]
-           elif item[0] != prefix:
-               break
-       else:
-           # all subitems start with a common "prefix".
-           # move it out of the branch
-           for item in items:
-               del item[0]
-           subpattern.append(prefix)
-           continue # check next one
-       break
+        prefix = None
+        for item in items:
+            if not item:
+                break
+            if prefix is None:
+                prefix = item[0]
+            elif item[0] != prefix:
+                break
+        else:
+            # all subitems start with a common "prefix".
+            # move it out of the branch
+            for item in items:
+                del item[0]
+            subpattern.append(prefix)
+            continue # check next one
+        break
 
     # check if the branch can be replaced by a character set
     for item in items:
-       if len(item) != 1 or item[0][0] != LITERAL:
-           break
+        if len(item) != 1 or item[0][0] != LITERAL:
+            break
     else:
-       # we can store this as a character set instead of a
-       # branch (FIXME: use a range if possible)
-       set = []
-       for item in items:
-           set.append(item[0])
-       subpattern.append((IN, set))
-       return subpattern
+        # we can store this as a character set instead of a
+        # branch (FIXME: use a range if possible)
+        set = []
+        for item in items:
+            set.append(item[0])
+        subpattern.append((IN, set))
+        return subpattern
 
     subpattern.append((BRANCH, (None, items)))
     return subpattern
@@ -308,197 +303,227 @@ def _parse(source, state, flags=0):
 
     while 1:
 
-       if source.next in ("|", ")"):
-           break # end of subpattern
-       this = source.get()
-       if this is None:
-           break # end of pattern
-
-       if state.flags & SRE_FLAG_VERBOSE:
-           # skip whitespace and comments
-           if this in WHITESPACE:
-               continue
-           if this == "#":
-               while 1:
-                   this = source.get()
-                   if this in (None, "\n"):
-                       break
-               continue
-
-       if this and this[0] not in SPECIAL_CHARS:
-           subpattern.append((LITERAL, this))
-
-       elif this == "[":
-           # character set
-           set = []
-##         if source.match(":"):
-##             pass # handle character classes
-           if source.match("^"):
-               set.append((NEGATE, None))
-           # check remaining characters
-           start = set[:]
-           while 1:
-               this = source.get()
-               if this == "]" and set != start:
-                   break
-               elif this and this[0] == "\\":
-                   code1 = _class_escape(source, this)
-               elif this:
-                   code1 = LITERAL, this
-               else:
-                   raise error, "unexpected end of regular expression"
-               if source.match("-"):
-                   # potential range
-                   this = source.get()
-                   if this == "]":
-                       set.append(code1)
-                       set.append((LITERAL, "-"))
-                       break
-                   else:
-                       if this[0] == "\\":
-                           code2 = _class_escape(source, this)
-                       else:
-                           code2 = LITERAL, this
-                       if code1[0] != LITERAL or code2[0] != LITERAL:
-                           raise error, "illegal range"
-                       if len(code1[1]) != 1 or len(code2[1]) != 1:
-                           raise error, "illegal range"
-                       set.append((RANGE, (code1[1], code2[1])))
-               else:
-                   if code1[0] is IN:
-                       code1 = code1[1][0]
-                   set.append(code1)
-
-           # FIXME: <fl> move set optimization to compiler!
-           if len(set)==1 and set[0][0] is LITERAL:
-               subpattern.append(set[0]) # optimization
-           elif len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:
-               subpattern.append((NOT_LITERAL, set[1][1])) # optimization
-           else:
-               # FIXME: <fl> add charmap optimization
-               subpattern.append((IN, set))
-
-       elif this and this[0] in REPEAT_CHARS:
-           # repeat previous item
-           if this == "?":
-               min, max = 0, 1
-           elif this == "*":
-               min, max = 0, MAXREPEAT
-           elif this == "+":
-               min, max = 1, MAXREPEAT
-           elif this == "{":
-               min, max = 0, MAXREPEAT
-               lo = hi = ""
-               while source.next in DIGITS:
-                   lo = lo + source.get()
-               if source.match(","):
-                   while source.next in DIGITS:
-                       hi = hi + source.get()
-               else:
-                   hi = lo
-               if not source.match("}"):
-                   raise error, "bogus range"
-               if lo:
-                   min = int(lo)
-               if hi:
-                   max = int(hi)
-               # FIXME: <fl> check that hi >= lo!
-           else:
-               raise error, "not supported"
-           # figure out which item to repeat
-           if subpattern:
-               item = subpattern[-1:]
-           else:
-               raise error, "nothing to repeat"
-           if source.match("?"):
-               subpattern[-1] = (MIN_REPEAT, (min, max, item))
-           else:
-               subpattern[-1] = (MAX_REPEAT, (min, max, item))
-
-       elif this == ".":
-           subpattern.append((ANY, None))
-
-       elif this == "(":
-           group = 1
-           name = None
-           if source.match("?"):
-               group = 0
-               # options
-               if source.match("P"):
-                   # python extensions
-                   if source.match("<"):
-                       # named group: skip forward to end of name
-                       name = ""
-                       while 1:
-                           char = source.get()
-                           if char is None:
-                               raise error, "unterminated name"
-                           if char == ">":
-                               break
-                           name = name + char
-                       group = 1
-                       if not isname(name):
-                           raise error, "illegal character in group name"
-                   elif source.match("="):
-                       # named backreference
-                       raise error, "not yet implemented"
-                   else:
-                       char = source.get()
-                       if char is None:
-                           raise error, "unexpected end of pattern"
-                       raise error, "unknown specifier: ?P%s" % char
-               elif source.match(":"):
-                   # non-capturing group
-                   group = 2
-               elif source.match("#"):
-                   # comment
-                   while 1:
-                       if source.next is None or source.next == ")":
-                           break
-                       source.get()
-               else:
-                   # flags
-                   while FLAGS.has_key(source.next):
-                       state.flags = state.flags | FLAGS[source.get()]
-           if group:
-               # parse group contents
-               b = []
-               if group == 2:
-                   # anonymous group
-                   group = None
-               else:
-                   group = state.getgroup(name)
-               while 1:
-                   p = _parse(source, state, flags)
-                   if source.match(")"):
-                       if b:
-                           b.append(p)
-                           p = _branch(state, b)
-                       subpattern.append((SUBPATTERN, (group, p)))
-                       break
-                   elif source.match("|"):
-                       b.append(p)
-                   else:
-                       raise error, "group not properly closed"
-           else:
-               while 1:
-                   char = source.get()
-                   if char is None or char == ")":
-                       break
-                   raise error, "unknown extension"
-
-       elif this == "^":
-           subpattern.append((AT, AT_BEGINNING))
-
-       elif this == "$":
-           subpattern.append((AT, AT_END))
-
-       elif this and this[0] == "\\":
-           code = _escape(source, this, state)
-           subpattern.append(code)
-
-       else:
-           raise error, "parser error"
+        if source.next in ("|", ")"):
+            break # end of subpattern
+        this = source.get()
+        if this is None:
+            break # end of pattern
+
+        if state.flags & SRE_FLAG_VERBOSE:
+            # skip whitespace and comments
+            if this in WHITESPACE:
+                continue
+            if this == "#":
+                while 1:
+                    this = source.get()
+                    if this in (None, "\n"):
+                        break
+                continue
+
+        if this and this[0] not in SPECIAL_CHARS:
+            subpattern.append((LITERAL, ord(this)))
+
+        elif this == "[":
+            # character set
+            set = []
+##          if source.match(":"):
+##              pass # handle character classes
+            if source.match("^"):
+                set.append((NEGATE, None))
+            # check remaining characters
+            start = set[:]
+            while 1:
+                this = source.get()
+                if this == "]" and set != start:
+                    break
+                elif this and this[0] == "\\":
+                    code1 = _class_escape(source, this)
+                elif this:
+                    code1 = LITERAL, ord(this)
+                else:
+                    raise error, "unexpected end of regular expression"
+                if source.match("-"):
+                    # potential range
+                    this = source.get()
+                    if this == "]":
+                        set.append(code1)
+                        set.append((LITERAL, ord("-")))
+                        break
+                    else:
+                        if this[0] == "\\":
+                            code2 = _class_escape(source, this)
+                        else:
+                            code2 = LITERAL, ord(this)
+                        if code1[0] != LITERAL or code2[0] != LITERAL:
+                            raise error, "illegal range"
+                        set.append((RANGE, (code1[1], code2[1])))
+                else:
+                    if code1[0] is IN:
+                        code1 = code1[1][0]
+                    set.append(code1)
+
+            # FIXME: <fl> move set optimization to compiler!
+            if len(set)==1 and set[0][0] is LITERAL:
+                subpattern.append(set[0]) # optimization
+            elif len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:
+                subpattern.append((NOT_LITERAL, set[1][1])) # optimization
+            else:
+                # FIXME: <fl> add charmap optimization
+                subpattern.append((IN, set))
+
+        elif this and this[0] in REPEAT_CHARS:
+            # repeat previous item
+            if this == "?":
+                min, max = 0, 1
+            elif this == "*":
+                min, max = 0, MAXREPEAT
+            elif this == "+":
+                min, max = 1, MAXREPEAT
+            elif this == "{":
+                min, max = 0, MAXREPEAT
+                lo = hi = ""
+                while source.next in DIGITS:
+                    lo = lo + source.get()
+                if source.match(","):
+                    while source.next in DIGITS:
+                        hi = hi + source.get()
+                else:
+                    hi = lo
+                if not source.match("}"):
+                    raise error, "bogus range"
+                if lo:
+                    min = int(lo)
+                if hi:
+                    max = int(hi)
+                # FIXME: <fl> check that hi >= lo!
+            else:
+                raise error, "not supported"
+            # figure out which item to repeat
+            if subpattern:
+                item = subpattern[-1:]
+            else:
+                raise error, "nothing to repeat"
+            if source.match("?"):
+                subpattern[-1] = (MIN_REPEAT, (min, max, item))
+            else:
+                subpattern[-1] = (MAX_REPEAT, (min, max, item))
+
+        elif this == ".":
+            subpattern.append((ANY, None))
+
+        elif this == "(":
+            group = 1
+            name = None
+            if source.match("?"):
+                group = 0
+                # options
+                if source.match("P"):
+                    # python extensions
+                    if source.match("<"):
+                        # named group: skip forward to end of name
+                        name = ""
+                        while 1:
+                            char = source.get()
+                            if char is None:
+                                raise error, "unterminated name"
+                            if char == ">":
+                                break
+                            name = name + char
+                        group = 1
+                        if not isname(name):
+                            raise error, "illegal character in group name"
+                    elif source.match("="):
+                        # named backreference
+                        name = ""
+                        while 1:
+                            char = source.get()
+                            if char is None:
+                                raise error, "unterminated name"
+                            if char == ")":
+                                break
+                            name = name + char
+                        if not isname(name):
+                            raise error, "illegal character in group name"
+                        gid = state.groupdict.get(name)
+                        if gid is None:
+                            raise error, "unknown group name"
+                        subpattern.append((GROUP, gid))
+                    else:
+                        char = source.get()
+                        if char is None:
+                            raise error, "unexpected end of pattern"
+                        raise error, "unknown specifier: ?P%s" % char
+                elif source.match(":"):
+                    # non-capturing group
+                    group = 2
+                elif source.match("#"):
+                    # comment
+                    while 1:
+                        if source.next is None or source.next == ")":
+                            break
+                        source.get()
+                elif source.next in ("=", "!"):
+                    # lookahead assertions
+                    char = source.get()
+                    b = []
+                    while 1:
+                        p = _parse(source, state, flags)
+                        if source.next == ")":
+                            if b:
+                                b.append(p)
+                                p = _branch(state, b)
+                            if char == "=":
+                                subpattern.append((ASSERT, p))
+                            else:
+                                subpattern.append((ASSERT_NOT, p))
+                            break
+                        elif source.match("|"):
+                            b.append(p)
+                        else:
+                            raise error, "pattern not properly closed"
+                else:
+                    # flags
+                    while FLAGS.has_key(source.next):
+                        state.flags = state.flags | FLAGS[source.get()]
+            if group:
+                # parse group contents
+                b = []
+                if group == 2:
+                    # anonymous group
+                    group = None
+                else:
+                    group = state.getgroup(name)
+                while 1:
+                    p = _parse(source, state, flags)
+                    if source.match(")"):
+                        if b:
+                            b.append(p)
+                            p = _branch(state, b)
+                        subpattern.append((SUBPATTERN, (group, p)))
+                        break
+                    elif source.match("|"):
+                        b.append(p)
+                    else:
+                        raise error, "group not properly closed"
+            else:
+                while 1:
+                    char = source.get()
+                    if char is None or char == ")":
+                        break
+                    raise error, "unknown extension"
+
+        elif this == "^":
+            subpattern.append((AT, AT_BEGINNING))
+
+        elif this == "$":
+            subpattern.append((AT, AT_END))
+
+        elif this and this[0] == "\\":
+            code = _escape(source, this, state)
+            subpattern.append(code)
+
+        else:
+            raise error, "parser error"
 
     return subpattern
 
@@ -508,19 +533,19 @@ def parse(pattern, flags=0):
     state = State()
     b = []
     while 1:
-       p = _parse(source, state, flags)
-       tail = source.get()
-       if tail == "|":
-           b.append(p)
-       elif tail == ")":
-           raise error, "unbalanced parenthesis"
-       elif tail is None:
-           if b:
-               b.append(p)
-               p = _branch(state, b)
-           break
-       else:
-           raise error, "bogus characters at end of regular expression"
+        p = _parse(source, state, flags)
+        tail = source.get()
+        if tail == "|":
+            b.append(p)
+        elif tail == ")":
+            raise error, "unbalanced parenthesis"
+        elif tail is None:
+            if b:
+                b.append(p)
+                p = _branch(state, b)
+            break
+        else:
+            raise error, "bogus characters at end of regular expression"
     return p
 
 def parse_template(source, pattern):
@@ -530,44 +555,58 @@ def parse_template(source, pattern):
     p = []
     a = p.append
     while 1:
-       this = s.get()
-       if this is None:
-           break # end of replacement string
-       if this and this[0] == "\\":
-           if this == "\\g":
-               name = ""
-               if s.match("<"):
-                   while 1:
-                       char = s.get()
-                       if char is None:
-                           raise error, "unterminated group name"
-                       if char == ">":
-                           break
-                       name = name + char
-               if not name:
-                   raise error, "bad group name"
-               try:
-                   index = int(name)
-               except ValueError:
-                   if not isname(name):
-                       raise error, "illegal character in group name"
-                   try:
-                       index = pattern.groupindex[name]
-                   except KeyError:
-                       raise IndexError, "unknown group name"
-               a((MARK, index))
-           elif len(this) > 1 and this[1] in DIGITS:
-               while s.next in DIGITS:
-                   this = this + s.get()
-               a((MARK, int(this[1:])))
-           else:
-               try:
-                   a(ESCAPES[this])
-               except KeyError:
-                   for char in this:
-                       a((LITERAL, char))
-       else:
-           a((LITERAL, this))
+        this = s.get()
+        if this is None:
+            break # end of replacement string
+        if this and this[0] == "\\":
+            # group
+            if this == "\\g":
+                name = ""
+                if s.match("<"):
+                    while 1:
+                        char = s.get()
+                        if char is None:
+                            raise error, "unterminated group name"
+                        if char == ">":
+                            break
+                        name = name + char
+                if not name:
+                    raise error, "bad group name"
+                try:
+                    index = int(name)
+                except ValueError:
+                    if not isname(name):
+                        raise error, "illegal character in group name"
+                    try:
+                        index = pattern.groupindex[name]
+                    except KeyError:
+                        raise IndexError, "unknown group name"
+                a((MARK, index))
+            elif len(this) > 1 and this[1] in DIGITS:
+                code = None
+                while 1:
+                    group = _group(this, pattern.groups+1)
+                    if group:
+                        if (not s.next or
+                            not _group(this + s.next, pattern.groups+1)):
+                            code = MARK, int(group)
+                            break
+                    elif s.next in OCTDIGITS:
+                        this = this + s.get()
+                    else:
+                        break
+                if not code:
+                    this = this[1:]
+                    code = LITERAL, int(this[-6:], 8) & CHARMASK
+                a(code)
+            else:
+                try:
+                    a(ESCAPES[this])
+                except KeyError:
+                    for c in this:
+                        a((LITERAL, ord(c)))
+        else:
+            a((LITERAL, ord(this)))
     return p
 
 def expand_template(template, match):
@@ -575,12 +614,17 @@ def expand_template(template, match):
     # code instead
     p = []
     a = p.append
+    sep = match.string[:0]
+    if type(sep) is type(""):
+       char = chr
+    else:
+       char = unichr
     for c, s in template:
-       if c is LITERAL:
-           a(s)
-       elif c is MARK:
-           s = match.group(s)
-           if s is None:
-               raise error, "empty group"
-           a(s)
-    return match.string[:0].join(p)
+        if c is LITERAL:
+            a(char(s))
+        elif c is MARK:
+            s = match.group(s)
+            if s is None:
+                raise error, "empty group"
+            a(s)
+    return sep.join(p)
diff --git a/Lib/dos-8x3/test_has.py b/Lib/dos-8x3/test_has.py
new file mode 100644 (file)
index 0000000..51b4c33
--- /dev/null
@@ -0,0 +1,26 @@
+# test the invariant that
+#   if a==b then hash(a)==hash(b)
+#
+
+import test_support
+
+
+def same_hash(*objlist):
+       # Hash every argument and raise test_support.TestFailed unless
+       # all of the hash values are identical.
+       hashed = map(hash, objlist)
+       for h in hashed[1:]:
+               if h != hashed[0]:
+                       raise test_support.TestFailed, "hashed values differ: %s" % `objlist`
+
+
+
+same_hash(1, 1L, 1.0, 1.0+0.0j)
+same_hash(int(1), long(1), float(1), complex(1))
+
+same_hash(long(1.23e300), float(1.23e300))
+
+same_hash(float(0.5), complex(0.5, 0.0))
+
+
+