Fully support 32-bit codes. Enable BIGCHARSET in UCS-4 builds.

author Martin v. Löwis <martin@v.loewis.de>
Sat, 19 Apr 2003 12:56:08 +0000 (12:56 +0000)
committer Martin v. Löwis <martin@v.loewis.de>
Sat, 19 Apr 2003 12:56:08 +0000 (12:56 +0000)
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py

index 3e54819bc9d5f9cb671ef945281a840bdf9df524..1d59d7e0976eb3f22e10aacec398204f608d6156 100644 (file)
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -16,7 +16,10 @@ from sre_constants import *
  
  assert _sre.MAGIC == MAGIC, "SRE module mismatch"
  
-MAXCODE = 65535
+if _sre.CODESIZE == 2:
+    MAXCODE = 65535
+else:
+    MAXCODE = 0xFFFFFFFFL
  
  def _compile(code, pattern, flags):
      # internal: compile a (sub)pattern
@@ -191,9 +194,6 @@ def _optimize_charset(charset, fixup):
                  # XXX: could append to charmap tail
                  return charset # cannot compress
      except IndexError:
-        if sys.maxunicode != 65535:
-            # XXX: big charsets don't work in UCS-4 builds
-            return charset
          # character set contains unicode characters
          return _optimize_unicode(charset, fixup)
      # compress character map
@@ -228,14 +228,18 @@ def _optimize_charset(charset, fixup):
  
  def _mk_bitmap(bits):
      data = []
-    m = 1; v = 0
+    if _sre.CODESIZE == 2:
+        start = (1, 0)
+    else:
+        start = (1L, 0L)
+    m, v = start
      for c in bits:
          if c:
              v = v + m
          m = m << 1
          if m > MAXCODE:
              data.append(v)
-            m = 1; v = 0
+            m, v = start
      return data
  
  # To represent a big charset, first a bitmap of all characters in the
@@ -258,21 +262,38 @@ def _mk_bitmap(bits):
  # less significant byte is a bit index in the chunk (just like the
  # CHARSET matching).
  
+# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets
+# of the basic multilingual plane; an efficient representation
+# for all of UTF-16 has not yet been developed. This means,
+# in particular, that negated charsets cannot be represented as
+# bigcharsets.
+
  def _optimize_unicode(charset, fixup):
+    try:
+        import array
+    except ImportError:
+        return charset
      charmap = [0]*65536
      negate = 0
-    for op, av in charset:
-        if op is NEGATE:
-            negate = 1
-        elif op is LITERAL:
-            charmap[fixup(av)] = 1
-        elif op is RANGE:
-            for i in range(fixup(av[0]), fixup(av[1])+1):
-                charmap[i] = 1
-        elif op is CATEGORY:
-            # XXX: could expand category
-            return charset # cannot compress
+    try:
+        for op, av in charset:
+            if op is NEGATE:
+                negate = 1
+            elif op is LITERAL:
+                charmap[fixup(av)] = 1
+            elif op is RANGE:
+                for i in range(fixup(av[0]), fixup(av[1])+1):
+                    charmap[i] = 1
+            elif op is CATEGORY:
+                # XXX: could expand category
+                return charset # cannot compress
+    except IndexError:
+        # non-BMP characters
+        return charset
      if negate:
+        if sys.maxunicode != 65535:
+            # XXX: negation does not work with big charsets
+            return charset
          for i in range(65536):
              charmap[i] = not charmap[i]
      comps = {}
@@ -287,12 +308,14 @@ def _optimize_unicode(charset, fixup):
              block = block + 1
              data = data + _mk_bitmap(chunk)
      header = [block]
-    assert MAXCODE == 65535
-    for i in range(128):
-        if sys.byteorder == 'big':
-            header.append(256*mapping[2*i]+mapping[2*i+1])
-        else:
-            header.append(mapping[2*i]+256*mapping[2*i+1])
+    if MAXCODE == 65535:
+        code = 'H'
+    else:
+        code = 'L'
+    # Convert block indices to byte array of 256 bytes
+    mapping = array.array('b', mapping).tostring()
+    # Convert byte array to word array
+    header = header + array.array(code, mapping).tolist()
      data[0:0] = header
      return [(BIGCHARSET, data)]
  
diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py

index 2cd85a39f9fb3c4330ea4cfc7646ec47ec95ee02..07b24ddf5410d49907a947f6c1cd056a9377bd9e 100644 (file)
--- a/Lib/sre_constants.py
+++ b/Lib/sre_constants.py
@@ -13,7 +13,7 @@
  
  # update when constants are added or removed
  
-MAGIC = 20010701
+MAGIC = 20030419
  
  # max code word in this release
  
diff --git a/Modules/_sre.c b/Modules/_sre.c

index dde365b956b9c4e43f8ae4124b67068517515f7e..8cae095c8a77689bed8bc90ea6b9671bc9cd5aae 100644 (file)
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -20,6 +20,7 @@
   * 2001-10-24 fl  added finditer primitive (for 2.2 only)
   * 2001-12-07 fl  fixed memory leak in sub/subn (Guido van Rossum)
   * 2002-11-09 fl  fixed empty sub/subn return type
+ * 2003-04-18 mvl fully support 4-byte codes
   *
   * Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
   *
@@ -510,10 +511,18 @@ SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
              break;
  
          case SRE_OP_CHARSET:
-            /* <CHARSET> <bitmap> (16 bits per code word) */
-            if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
-                return ok;
-            set += 16;
+            if (sizeof(SRE_CODE) == 2) {
+                /* <CHARSET> <bitmap> (16 bits per code word) */
+                if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
+                    return ok;
+                set += 16;
+            } 
+            else {
+                /* <CHARSET> <bitmap> (32 bits per code word) */
+                if (ch < 256 && (set[ch >> 5] & (1 << (ch & 31))))
+                    return ok;
+                set += 8;
+            }
              break;
  
          case SRE_OP_BIGCHARSET:
@@ -521,11 +530,25 @@ SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
          {
              int count, block;
              count = *(set++);
-            block = ((unsigned char*)set)[ch >> 8];
-            set += 128;
-            if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
-                return ok;
-            set += count*16;
+
+            if (sizeof(SRE_CODE) == 2) {
+                block = ((unsigned char*)set)[ch >> 8];
+                set += 128;
+                if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
+                    return ok;
+                set += count*16;
+            }
+            else {
+                if (ch < 65536)
+                    block = ((unsigned char*)set)[ch >> 8];
+                else
+                    block = -1;
+                set += 64;
+                if (block >=0 && 
+                    (set[block*8 + ((ch & 255)>>5)] & (1 << (ch & 31))))
+                    return ok;
+                set += count*8;
+            }
              break;
          }
  
@@ -1371,7 +1394,10 @@ _compile(PyObject* self_, PyObject* args)
  
      for (i = 0; i < n; i++) {
          PyObject *o = PyList_GET_ITEM(code, i);
-        self->code[i] = (SRE_CODE) PyInt_AsLong(o);
+        if (PyInt_Check(o))
+            self->code[i] = (SRE_CODE) PyInt_AsLong(o);
+        else
+            self->code[i] = (SRE_CODE) PyLong_AsUnsignedLong(o);
      }
  
      if (PyErr_Occurred()) {
@@ -3045,6 +3071,12 @@ PyMODINIT_FUNC init_sre(void)
          Py_DECREF(x);
      }
  
+    x = PyInt_FromLong(sizeof(SRE_CODE));
+    if (x) {
+        PyDict_SetItemString(d, "CODESIZE", x);
+        Py_DECREF(x);
+    }
+
      x = PyString_FromString(copyright);
      if (x) {
          PyDict_SetItemString(d, "copyright", x);
diff --git a/Modules/sre_constants.h b/Modules/sre_constants.h

index 540008e3754eadf08e75e2a29e766caf337f6e0c..619ea00c1ff76eeede2acdd5df9c7b6888d64fd0 100644 (file)
--- a/Modules/sre_constants.h
+++ b/Modules/sre_constants.h
@@ -11,7 +11,7 @@
   * See the _sre.c file for information on usage and redistribution.
   */
  
-#define SRE_MAGIC 20010701
+#define SRE_MAGIC 20030419
  #define SRE_OP_FAILURE 0
  #define SRE_OP_SUCCESS 1
  #define SRE_OP_ANY 2
author	Martin v. Löwis <martin@v.loewis.de>
	Sat, 19 Apr 2003 12:56:08 +0000 (12:56 +0000)
committer	Martin v. Löwis <martin@v.loewis.de>
	Sat, 19 Apr 2003 12:56:08 +0000 (12:56 +0000)
Lib/sre_compile.py		patch | blob | history
Lib/sre_constants.py		patch | blob | history
Modules/_sre.c		patch | blob | history
Modules/sre_constants.h		patch | blob | history