]> granicus.if.org Git - python/commitdiff
Marc-Andre Lemburg <mal@lemburg.com>:
authorMarc-André Lemburg <mal@egenix.com>
Wed, 28 Jun 2000 16:43:35 +0000 (16:43 +0000)
committerMarc-André Lemburg <mal@egenix.com>
Wed, 28 Jun 2000 16:43:35 +0000 (16:43 +0000)
Patch to the standard unicode-escape codec so that it dynamically
loads the Unicode name-to-ordinal mapping from the ucnhash
module.

By Bill Tutt.

Objects/unicodeobject.c

index 1aa03f74c0200c58ec7672c605cdc628c3d50e2b..57bedb8abc18241d89ac9848fc5e93c022df016b 100644 (file)
@@ -66,6 +66,7 @@ Unicode Integration Proposal (see file Misc/unicode.txt).
 
 #include "mymath.h"
 #include "unicodeobject.h"
+#include <ucnhash.h>
 
 #if defined(HAVE_LIMITS_H)
 #include <limits.h>
@@ -1020,6 +1021,28 @@ int unicodeescape_decoding_error(const char **source,
     }
 }
 
+static _Py_UCNHashAPI *pucnHash = NULL;
+
+/* Case-insensitive comparison of at most `count` bytes of two C strings.
+ *
+ * Each pair of bytes is folded with tolower() before being compared.
+ * The scan ends at the first pair that differs, at a NUL terminator
+ * found at the same position in both strings, or once `count` bytes
+ * have been examined, whichever comes first.
+ *
+ * Returns 0 if the examined bytes are equal (always the case when
+ * `count` is 0); otherwise returns the difference between the first
+ * pair of folded bytes that differ, with each byte taken as an
+ * unsigned char.
+ */
+static
+int mystrnicmp(const char *s1, const char *s2, size_t count)
+{
+    int c1 = 0, c2 = 0;
+
+    while (count-- > 0)
+    {
+        /* tolower() is only defined for EOF and for values that are
+         * representable as unsigned char; plain char may be signed,
+         * so convert before calling it.
+         */
+        c1 = tolower((unsigned char)*s1++);
+        c2 = tolower((unsigned char)*s2++);
+
+        /* Stop on a mismatch, or when both strings end here, so that
+         * we never read beyond a terminating NUL.
+         */
+        if (c1 != c2 || c1 == '\0')
+            break;
+    }
+
+    return c1 - c2;
+}
+
 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
                                        int size,
                                        const char *errors)
@@ -1123,6 +1146,104 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
             *p++ = x;
             break;
 
+        case 'N':
+            /* Ok, we need to deal with Unicode Character Names now,
+             * make sure we've imported the hash table data...
+             */
+            if (pucnHash == NULL)
+            {
+                PyObject *mod = 0, *v = 0;
+    
+                mod = PyImport_ImportModule("ucnhash");
+                if (mod == NULL)
+                    goto onError;
+                v = PyObject_GetAttrString(mod,"ucnhashAPI");
+                Py_DECREF(mod);
+                if (v == NULL)
+                {
+                    goto onError;
+                }
+                pucnHash = PyCObject_AsVoidPtr(v);
+                Py_DECREF(v);
+                if (pucnHash == NULL)
+                {
+                    goto onError;
+                }
+            }
+                
+            if (*s == '{')
+            {
+                const char *start = s + 1;
+                const char *endBrace = start;
+                unsigned int uiValue;
+                unsigned long j;
+
+                /* Scan for the closing brace, stopping early if we
+                 * exceed the maximum length of a Unicode character name
+                 * or reach the end of the input.
+                 */
+                while (*endBrace != '}' &&
+                       (unsigned int)(endBrace - start) <=
+                           pucnHash->cchMax &&
+                       endBrace < end)
+                {
+                    endBrace++;
+                }
+                if (endBrace != end && *endBrace == '}')
+                {
+                    j = pucnHash->hash(start, endBrace - start);
+                    if (j > pucnHash->cKeys ||
+                        mystrnicmp(
+                            start,
+                            ((_Py_UnicodeCharacterName *) 
+                             (pucnHash->getValue(j)))->pszUCN,
+                            (int)(endBrace - start)) != 0)
+                    {
+                        if (unicodeescape_decoding_error(
+                                &s, &x, errors,
+                                "Invalid Unicode Character Name"))
+                        {
+                            goto onError;
+                        }
+                        goto ucnFallthrough;
+                    }
+                    uiValue = ((_Py_UnicodeCharacterName *)
+                               (pucnHash->getValue(j)))->uiValue;
+                    if (uiValue < 1<<16)
+                    {
+                        /* In UCS-2 range, easy solution.. */
+                        *p++ = uiValue;
+                    }
+                    else
+                    {
+                        /* Oops, it's in UCS-4 space, */
+                        /*  compute and append the two surrogates: */
+                        /*  translate from 10000..10FFFF to 0..FFFFF */
+                        uiValue -= 0x10000;
+                    
+                        /* high surrogate = top 10 bits added to D800 */
+                        *p++ = 0xD800 + (uiValue >> 10);
+                        
+                        /* low surrogate  = bottom 10 bits added to DC00 */
+                        *p++ = 0xDC00 + (uiValue & ~0xFC00);
+                    }
+                    s = endBrace + 1;
+                }
+                else
+                {
+                    if (unicodeescape_decoding_error(
+                            &s, &x, errors,
+                            "Unicode name missing closing brace"))
+                        goto onError;
+                    goto ucnFallthrough;
+                }
+                break;                
+            }
+            if (unicodeescape_decoding_error(
+                    &s, &x, errors,
+                    "Missing opening brace for Unicode Character Name escape"))
+                goto onError;
+ucnFallthrough:
+            /* fall through on purpose */
         default:
             *p++ = '\\';
             *p++ = (unsigned char)s[-1];