Add fast-path in PyUnicode_DecodeCharmap() for pure 8 bit encodings: cp037, cp500 and iso8859_1.

author Victor Stinner <victor.stinner@gmail.com>

Tue, 9 Apr 2013 19:53:09 +0000 (21:53 +0200)

committer Victor Stinner <victor.stinner@gmail.com>

Tue, 9 Apr 2013 19:53:09 +0000 (21:53 +0200)
author Victor Stinner <victor.stinner@gmail.com>
Tue, 9 Apr 2013 19:53:09 +0000 (21:53 +0200)
committer Victor Stinner <victor.stinner@gmail.com>
Tue, 9 Apr 2013 19:53:09 +0000 (21:53 +0200)
diff --git a/Lib/encodings/cp037.py b/Lib/encodings/cp037.py

index bfe2c1ed17aeb89f6ca1fd6c95234edaaf1faae8..4edd708f3d7501be4c93fb4d5c2e1c608e1093cf 100644 (file)
--- a/Lib/encodings/cp037.py
+++ b/Lib/encodings/cp037.py
@@ -301,7 +301,6 @@ decoding_table = (
      '\xd9'     #  0xFD -> LATIN CAPITAL LETTER U WITH GRAVE
      '\xda'     #  0xFE -> LATIN CAPITAL LETTER U WITH ACUTE
      '\x9f'     #  0xFF -> CONTROL
-    '\ufffe'   ## Widen to UCS2 for optimization
  )
  
  ### Encoding table
diff --git a/Lib/encodings/cp500.py b/Lib/encodings/cp500.py

index a975be7b8d5754c793aad1d59d3d677a3751be4b..5f61535f82aa991eb8dd8940df7e92e9a4db5763 100644 (file)
--- a/Lib/encodings/cp500.py
+++ b/Lib/encodings/cp500.py
@@ -301,7 +301,6 @@ decoding_table = (
      '\xd9'     #  0xFD -> LATIN CAPITAL LETTER U WITH GRAVE
      '\xda'     #  0xFE -> LATIN CAPITAL LETTER U WITH ACUTE
      '\x9f'     #  0xFF -> CONTROL
-    '\ufffe'   ## Widen to UCS2 for optimization
  )
  
  ### Encoding table
diff --git a/Lib/encodings/iso8859_1.py b/Lib/encodings/iso8859_1.py

index d9cc516718a9c513a185af924ad4d9fc5390a160..8cfc01fe14e4853c69c76e5cb9bb0659cd2b8d2a 100644 (file)
--- a/Lib/encodings/iso8859_1.py
+++ b/Lib/encodings/iso8859_1.py
@@ -301,7 +301,6 @@ decoding_table = (
      '\xfd'     #  0xFD -> LATIN SMALL LETTER Y WITH ACUTE
      '\xfe'     #  0xFE -> LATIN SMALL LETTER THORN (Icelandic)
      '\xff'     #  0xFF -> LATIN SMALL LETTER Y WITH DIAERESIS
-    '\ufffe'   ## Widen to UCS2 for optimization
  )
  
  ### Encoding table
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index e9153c0de8ceb87f4a3cf52d566de882aa4855a4..88729c8bc01f52fc0f892ffd9c2c65ea84a44410 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -7281,6 +7281,7 @@ PyUnicode_DecodeCharmap(const char *s,
          enum PyUnicode_Kind mapkind;
          void *mapdata;
          Py_UCS4 x;
+        unsigned char ch;
  
          if (PyUnicode_READY(mapping) == -1)
              return NULL;
@@ -7288,8 +7289,32 @@ PyUnicode_DecodeCharmap(const char *s,
          maplen = PyUnicode_GET_LENGTH(mapping);
          mapdata = PyUnicode_DATA(mapping);
          mapkind = PyUnicode_KIND(mapping);
+
+        if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
+            /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
+             * is disabled in encoding aliases, latin1 is preferred because
+             * its implementation is faster. */
+            Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
+            Py_UCS1 *outdata = (Py_UCS1 *)writer.data;
+            Py_UCS4 maxchar = writer.maxchar;
+
+            assert (writer.kind == PyUnicode_1BYTE_KIND);
+            while (s < e) {
+                ch = *s;
+                x = mapdata_ucs1[ch];
+                if (x > maxchar) {
+                    if (_PyUnicodeWriter_PrepareInternal(&writer, 1, 0xff) == -1)
+                        goto onError;
+                    maxchar = writer.maxchar;
+                    outdata = (Py_UCS1 *)writer.data;
+                }
+                outdata[writer.pos] = x;
+                writer.pos++;
+                ++s;
+            }
+        }
+
          while (s < e) {
-            unsigned char ch;
              if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
                  enum PyUnicode_Kind outkind = writer.kind;
                  void *outdata = writer.data;
author	Victor Stinner <victor.stinner@gmail.com>
	Tue, 9 Apr 2013 19:53:09 +0000 (21:53 +0200)
committer	Victor Stinner <victor.stinner@gmail.com>
	Tue, 9 Apr 2013 19:53:09 +0000 (21:53 +0200)
Lib/encodings/cp037.py		patch \| blob \| history
Lib/encodings/cp500.py		patch \| blob \| history
Lib/encodings/iso8859_1.py		patch \| blob \| history
Objects/unicodeobject.c		patch \| blob \| history