]> granicus.if.org Git - python/commitdiff
Part of SF patch #1313939: Speed up charmap decoding by extending
authorWalter Dörwald <walter@livinglogic.de>
Thu, 6 Oct 2005 20:29:57 +0000 (20:29 +0000)
committerWalter Dörwald <walter@livinglogic.de>
Thu, 6 Oct 2005 20:29:57 +0000 (20:29 +0000)
PyUnicode_DecodeCharmap() to accept a unicode string as the mapping
argument which is used as a mapping table.

This code isn't used by any of the codecs yet.

Doc/api/concrete.tex
Lib/test/test_codecs.py
Misc/NEWS
Objects/unicodeobject.c

index b6dbc5d6c812402fb44f59947ab5b9f70ddfb90c..53c3b67760c560dfa04cf276a5f8c98b3dc7ff99 100644 (file)
@@ -1322,7 +1322,12 @@ points.
                                                const char *errors}
   Create a Unicode object by decoding \var{size} bytes of the encoded
   string \var{s} using the given \var{mapping} object.  Return
-  \NULL{} if an exception was raised by the codec.
+  \NULL{} if an exception was raised by the codec. If \var{mapping} is \NULL{}
+  latin-1 decoding will be done. Else it can be a dictionary mapping byte
+  values to integers, unicode strings or \code{None}, or a unicode string,
+  which is treated as a lookup table. Byte values greater than or equal to the
+  length of the string and U+FFFE "characters" are treated as "undefined mapping".
+  \versionchanged[Allowed unicode string as mapping argument]{2.4}
 \end{cfuncdesc}
 
 \begin{cfuncdesc}{PyObject*}{PyUnicode_EncodeCharmap}{const Py_UNICODE *s,
index a4d58c6857f74eb3d6524195b43467ef0d13a14c..74ad83bc04bc4642501f412e06c5b2cd6fcb316d 100644 (file)
@@ -924,6 +924,40 @@ class BasicStrTest(unittest.TestCase):
             (chars, size) = codecs.getdecoder(encoding)(bytes)
             self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))
 
+class CharmapTest(unittest.TestCase):
+    def test_decode_with_string_map(self):
+        self.assertEquals(
+            codecs.charmap_decode("\x00\x01\x02", "strict", u"abc"),
+            (u"abc", 3)
+        )
+
+        self.assertEquals(
+            codecs.charmap_decode("\x00\x01\x02", "replace", u"ab"),
+            (u"ab\ufffd", 3)
+        )
+
+        self.assertEquals(
+            codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe"),
+            (u"ab\ufffd", 3)
+        )
+
+        self.assertEquals(
+            codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab"),
+            (u"ab", 3)
+        )
+
+        self.assertEquals(
+            codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab\ufffe"),
+            (u"ab", 3)
+        )
+
+        allbytes = "".join(chr(i) for i in xrange(256))
+        self.assertEquals(
+            codecs.charmap_decode(allbytes, "ignore", u""),
+            (u"", len(allbytes))
+        )
+
+
 def test_main():
     test_support.run_unittest(
         UTF16Test,
@@ -940,7 +974,8 @@ def test_main():
         StreamReaderTest,
         Str2StrTest,
         BasicUnicodeTest,
-        BasicStrTest
+        BasicStrTest,
+        CharmapTest
     )
 
 
index 11dd40cde36dd622bcae329d15bf2dc763e14642..4d35774daaec741c8cb1cb2fbf0772dae7344210 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -563,6 +563,11 @@ C API
 
 - Removed PyRange_New().
 
+- Patch #1313939: PyUnicode_DecodeCharmap() accepts a unicode string as the
+  mapping argument now. This string is used as a mapping table. Byte values
+  greater than or equal to the length of the string and table entries equal
+  to 0xFFFE are treated as undefined mappings.
+
 
 Tests
 -----
index 5d096edee63d4073a7a9395ba63f373330b5f4db..7ab4d0c47b3f47e3fbad92faad9da4d2ebeae5ea 100644 (file)
@@ -2833,6 +2833,8 @@ PyObject *PyUnicode_DecodeCharmap(const char *s,
     int extrachars = 0;
     PyObject *errorHandler = NULL;
     PyObject *exc = NULL;
+    Py_UNICODE *mapstring = NULL;
+    int maplen = 0;
 
     /* Default to Latin-1 */
     if (mapping == NULL)
@@ -2845,91 +2847,121 @@ PyObject *PyUnicode_DecodeCharmap(const char *s,
        return (PyObject *)v;
     p = PyUnicode_AS_UNICODE(v);
     e = s + size;
-    while (s < e) {
-       unsigned char ch = *s;
-       PyObject *w, *x;
-
-       /* Get mapping (char ordinal -> integer, Unicode char or None) */
-       w = PyInt_FromLong((long)ch);
-       if (w == NULL)
-           goto onError;
-       x = PyObject_GetItem(mapping, w);
-       Py_DECREF(w);
-       if (x == NULL) {
-           if (PyErr_ExceptionMatches(PyExc_LookupError)) {
-               /* No mapping found means: mapping is undefined. */
-               PyErr_Clear();
-               x = Py_None;
-               Py_INCREF(x);
-           } else
-               goto onError;
+    if (PyUnicode_CheckExact(mapping)) {
+       mapstring = PyUnicode_AS_UNICODE(mapping);
+       maplen = PyUnicode_GET_SIZE(mapping);
+       while (s < e) {
+           unsigned char ch = *s;
+           Py_UNICODE x = 0xfffe; /* illegal value */
+
+           if (ch < maplen)
+               x = mapstring[ch];
+
+           if (x == 0xfffe) {
+               /* undefined mapping */
+               outpos = p-PyUnicode_AS_UNICODE(v);
+               startinpos = s-starts;
+               endinpos = startinpos+1;
+               if (unicode_decode_call_errorhandler(
+                    errors, &errorHandler,
+                    "charmap", "character maps to <undefined>",
+                    starts, size, &startinpos, &endinpos, &exc, &s,
+                    (PyObject **)&v, &outpos, &p)) {
+                   goto onError;
+               }
+               continue;
+           }
+           *p++ = x;
+           ++s;
        }
+    }
+    else {
+       while (s < e) {
+           unsigned char ch = *s;
+           PyObject *w, *x;
 
-       /* Apply mapping */
-       if (PyInt_Check(x)) {
-           long value = PyInt_AS_LONG(x);
-           if (value < 0 || value > 65535) {
-               PyErr_SetString(PyExc_TypeError,
-                               "character mapping must be in range(65536)");
-               Py_DECREF(x);
+           /* Get mapping (char ordinal -> integer, Unicode char or None) */
+           w = PyInt_FromLong((long)ch);
+           if (w == NULL)
                goto onError;
+           x = PyObject_GetItem(mapping, w);
+           Py_DECREF(w);
+           if (x == NULL) {
+               if (PyErr_ExceptionMatches(PyExc_LookupError)) {
+                   /* No mapping found means: mapping is undefined. */
+                   PyErr_Clear();
+                   x = Py_None;
+                   Py_INCREF(x);
+               } else
+                   goto onError;
            }
-           *p++ = (Py_UNICODE)value;
-       }
-       else if (x == Py_None) {
-           /* undefined mapping */
-           outpos = p-PyUnicode_AS_UNICODE(v);
-           startinpos = s-starts;
-           endinpos = startinpos+1;
-           if (unicode_decode_call_errorhandler(
-                errors, &errorHandler,
-                "charmap", "character maps to <undefined>",
-                starts, size, &startinpos, &endinpos, &exc, &s,
-                (PyObject **)&v, &outpos, &p)) {
-               Py_DECREF(x);
-               goto onError;
+    
+           /* Apply mapping */
+           if (PyInt_Check(x)) {
+               long value = PyInt_AS_LONG(x);
+               if (value < 0 || value > 65535) {
+                   PyErr_SetString(PyExc_TypeError,
+                                   "character mapping must be in range(65536)");
+                   Py_DECREF(x);
+                   goto onError;
+               }
+               *p++ = (Py_UNICODE)value;
            }
-           continue;
-       }
-       else if (PyUnicode_Check(x)) {
-           int targetsize = PyUnicode_GET_SIZE(x);
-
-           if (targetsize == 1)
-               /* 1-1 mapping */
-               *p++ = *PyUnicode_AS_UNICODE(x);
-
-           else if (targetsize > 1) {
-               /* 1-n mapping */
-               if (targetsize > extrachars) {
-                   /* resize first */
-                   int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
-                   int needed = (targetsize - extrachars) + \
-                                (targetsize << 2);
-                   extrachars += needed;
-                   if (_PyUnicode_Resize(&v,
-                                        PyUnicode_GET_SIZE(v) + needed) < 0) {
-                       Py_DECREF(x);
-                       goto onError;
+           else if (x == Py_None) {
+               /* undefined mapping */
+               outpos = p-PyUnicode_AS_UNICODE(v);
+               startinpos = s-starts;
+               endinpos = startinpos+1;
+               if (unicode_decode_call_errorhandler(
+                    errors, &errorHandler,
+                    "charmap", "character maps to <undefined>",
+                    starts, size, &startinpos, &endinpos, &exc, &s,
+                    (PyObject **)&v, &outpos, &p)) {
+                   Py_DECREF(x);
+                   goto onError;
+               }
+               continue;
+           }
+           else if (PyUnicode_Check(x)) {
+               int targetsize = PyUnicode_GET_SIZE(x);
+    
+               if (targetsize == 1)
+                   /* 1-1 mapping */
+                   *p++ = *PyUnicode_AS_UNICODE(x);
+    
+               else if (targetsize > 1) {
+                   /* 1-n mapping */
+                   if (targetsize > extrachars) {
+                       /* resize first */
+                       int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
+                       int needed = (targetsize - extrachars) + \
+                                    (targetsize << 2);
+                       extrachars += needed;
+                       if (_PyUnicode_Resize(&v,
+                                            PyUnicode_GET_SIZE(v) + needed) < 0) {
+                           Py_DECREF(x);
+                           goto onError;
+                       }
+                       p = PyUnicode_AS_UNICODE(v) + oldpos;
                    }
-                   p = PyUnicode_AS_UNICODE(v) + oldpos;
+                   Py_UNICODE_COPY(p,
+                                   PyUnicode_AS_UNICODE(x),
+                                   targetsize);
+                   p += targetsize;
+                   extrachars -= targetsize;
                }
-               Py_UNICODE_COPY(p,
-                               PyUnicode_AS_UNICODE(x),
-                               targetsize);
-               p += targetsize;
-               extrachars -= targetsize;
+               /* 1-0 mapping: skip the character */
+           }
+           else {
+               /* wrong return value */
+               PyErr_SetString(PyExc_TypeError,
+                     "character mapping must return integer, None or unicode");
+               Py_DECREF(x);
+               goto onError;
            }
-           /* 1-0 mapping: skip the character */
-       }
-       else {
-           /* wrong return value */
-           PyErr_SetString(PyExc_TypeError,
-                 "character mapping must return integer, None or unicode");
            Py_DECREF(x);
-           goto onError;
+           ++s;
        }
-       Py_DECREF(x);
-       ++s;
     }
     if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
        if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)