refactored the unicodeobject/ucnhash interface, to hide the implementation details inside the ucnhash module

author Fredrik Lundh <fredrik@pythonware.com>

Fri, 19 Jan 2001 09:45:02 +0000 (09:45 +0000)

committer Fredrik Lundh <fredrik@pythonware.com>

Fri, 19 Jan 2001 09:45:02 +0000 (09:45 +0000)
author Fredrik Lundh <fredrik@pythonware.com>
Fri, 19 Jan 2001 09:45:02 +0000 (09:45 +0000)
committer Fredrik Lundh <fredrik@pythonware.com>
Fri, 19 Jan 2001 09:45:02 +0000 (09:45 +0000)
diff --git a/Include/ucnhash.h b/Include/ucnhash.h

index 7bf3f5d47c972f61222976dee23158f58ba929bb..a664336aa5e06b6ffc2fcbd3a679ef32817e2bd9 100644 (file)
--- a/Include/ucnhash.h
+++ b/Include/ucnhash.h
@@ -1,20 +1,29 @@
+/* Unicode name database interface */
  
-#include "Python.h"
-#include <stdlib.h>
-
-/* --- C API ----------------------------------------------------*/
-/* C API for usage by other Python modules */
-typedef struct _Py_UCNHashAPI
-{
-    unsigned long cKeys;
-    unsigned long cchMax;
-    unsigned long (*hash)(const char *key, unsigned int cch);
-    const void *(*getValue)(unsigned long iKey);
-} _Py_UCNHashAPI;
-
-typedef struct 
-{
-    const char *pszUCN;
-    Py_UCS4 value;
-} _Py_UnicodeCharacterName;
+#ifndef Py_UCNHASH_H
+#define Py_UCNHASH_H
+#ifdef __cplusplus
+extern "C" {
+#endif
  
+/* revised ucnhash CAPI interface (exported through a PyCObject) */
+
+typedef struct {
+
+    /* Size of this struct */
+    int size;
+
+    /* Get name for a given character code.  Returns non-zero if
+       success, zero if not.  Does not set Python exceptions. */
+    int (*getname)(Py_UCS4 code, char* buffer, int buflen);
+
+    /* Get character code for a given name.  Same error handling
+       as for getname. */
+    int (*getcode)(const char* name, int namelen, Py_UCS4* code);
+
+} _PyUnicode_Name_CAPI;
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_UCNHASH_H */
diff --git a/Lib/test/test_ucn.py b/Lib/test/test_ucn.py

index 92155be86951fd1959422e3ee131cc392934b375..a33d1114c6c98a26e5a4885d4c2ff26ee78c23a1 100644 (file)
--- a/Lib/test/test_ucn.py
+++ b/Lib/test/test_ucn.py
@@ -50,22 +50,20 @@ print "done."
  
  # strict error testing:
  print "Testing unicode character name expansion strict error handling....",
-k_cchMaxUnicodeName = 83
-
-s = "\N{" + "1" * (k_cchMaxUnicodeName + 2) + "}"
  try:
-    unicode(s, 'unicode-escape', 'strict')
+    unicode("\N{blah}", 'unicode-escape', 'strict')
  except UnicodeError:
      pass
  else:
-    raise AssertionError, "failed to raise an exception when presented " \
-                          "with a UCN > k_cchMaxUnicodeName"
+    raise AssertionError, "failed to raise an exception when given a bogus character name"
+
  try:
-    unicode("\N{blah}", 'unicode-escape', 'strict')
+    unicode("\N{" + "x" * 100000 + "}", 'unicode-escape', 'strict')
  except UnicodeError:
      pass
  else:
-    raise AssertionError, "failed to raise an exception when given a bogus character name"
+    raise AssertionError, "failed to raise an exception when given a very " \
+                          "long bogus character name"
  
  try:
      unicode("\N{SPACE", 'unicode-escape', 'strict')
diff --git a/Modules/ucnhash.c b/Modules/ucnhash.c

index e5a9bada11dc227341f66c8705ccd8ab750dcc0f..67a8895cb9dfe4b1684237af11fd91f213b0c608 100644 (file)
--- a/Modules/ucnhash.c
+++ b/Modules/ucnhash.c
@@ -1,5 +1,13 @@
+#include "Python.h"
  #include "ucnhash.h"
  
+/* Modified for Python 2.1 by Fredrik Lundh (fredrik@pythonware.com) */
+
+typedef struct {
+    const char* pszUCN;
+    Py_UCS4 value;
+}_Py_UnicodeCharacterName;   
+
  /*
   * The hash is produced using the algorithm described in
   * "Optimal algorithms for minimal perfect hashing",
@@ -14,11 +22,11 @@
   * Generated on: Fri Jul 14 08:00:58 2000
   */
  
+#define cKeys 10538
  #define k_cHashElements 18836
  #define k_cchMaxKey  83
  #define k_cKeys  10538
  
-
  staticforward const unsigned short G[k_cHashElements]; 
  staticforward const _Py_UnicodeCharacterName aucn[k_cKeys];   
  
@@ -34,8 +42,7 @@ static long f1(const char *key, unsigned int cch)
      while (--len >= 0)
      {   
          /* (1000003 * x) ^ toupper(*(p++)) 
-         * translated to handle > 32 bit longs 
-         */
+         * translated to handle > 32 bit longs */
          x = (0xf4243 * x);
          x = x & 0xFFFFFFFF;
          x = x ^ toupper(*(p++));
@@ -98,110 +105,96 @@ static long f2(const char *key, unsigned int cch)
  }
  
      
-static unsigned long hash(const char *key, unsigned int cch)
+static unsigned long
+hash(const char *key, unsigned int cch)
  {
      return ((unsigned long)(G[ f1(key, cch) ]) + (unsigned long)(G[ f2(key, cch) ]) ) % k_cHashElements;
  }
  
-const void *getValue(unsigned long iKey)
+const _Py_UnicodeCharacterName *
+getValue(unsigned long iKey)
  {
-    return &aucn[iKey];
+    return (_Py_UnicodeCharacterName *) &aucn[iKey];
  }
  
-/* Helper for adding objects to dictionaries. Check for errors with
-   PyErr_Occurred() */
-static 
-void insobj(PyObject *dict,
-     char *name,
-     PyObject *v)
+static int
+mystrnicmp(const char *s1, const char *s2, size_t count)
  {
-    PyDict_SetItemString(dict, name, v);
-    Py_XDECREF(v);
+    char c1, c2;
+    
+    if (count) {
+        do {
+           c1 = tolower(*(s1++));
+           c2 = tolower(*(s2++));
+        } while (--count && c1 == c2);
+        return c1 - c2;
+    }
+
+    return 0;
  }
  
-static const _Py_UCNHashAPI hashAPI = 
+/* bindings for the new API */
+
+static int
+ucnhash_getname(Py_UCS4 code, char* buffer, int buflen)
  {
-    k_cKeys,
-    k_cchMaxKey,
-    &hash,
-    &getValue,
+    return 0;
+}
+
+static int
+ucnhash_getcode(const char* name, int namelen, Py_UCS4* code)
+{
+    unsigned long j;
+
+    j = hash(name, namelen);
+
+    if (j > cKeys || mystrnicmp(name, getValue(j)->pszUCN, namelen) != 0)
+        return 0;
+
+    *code = getValue(j)->value;
+
+    return 1;
+}
+
+static const _PyUnicode_Name_CAPI hashAPI = 
+{
+    sizeof(_PyUnicode_Name_CAPI),
+    ucnhash_getname,
+    ucnhash_getcode
  };
  
  static  
-PyMethodDef Module_methods[] =
+PyMethodDef ucnhash_methods[] =
  {   
      {NULL, NULL},
  };
  
-static char *Module_docstring = "ucnhash hash function module";
-
-/* Error reporting for module init functions */
-
-#define Py_ReportModuleInitError(modname) {                    \
-    PyObject *exc_type, *exc_value, *exc_tb;                   \
-    PyObject *str_type, *str_value;                            \
-                                                               \
-    /* Fetch error objects and convert them to strings */      \
-    PyErr_Fetch(&exc_type, &exc_value, &exc_tb);               \
-    if (exc_type && exc_value) {                               \
-           str_type = PyObject_Str(exc_type);                  \
-           str_value = PyObject_Str(exc_value);                        \
-    }                                                          \
-    else {                                                     \
-          str_type = NULL;                                     \
-          str_value = NULL;                                    \
-    }                                                          \
-    /* Try to format a more informative error message using the        \
-       original error */                                       \
-    if (str_type && str_value &&                               \
-           PyString_Check(str_type) && PyString_Check(str_value))      \
-           PyErr_Format(                                               \
-                   PyExc_ImportError,                          \
-                   "initialization of module "modname" failed "        \
-                   "(%s:%s)",                                  \
-               PyString_AS_STRING(str_type),                   \
-               PyString_AS_STRING(str_value));                 \
-    else                                                       \
-           PyErr_SetString(                                    \
-                   PyExc_ImportError,                          \
-                   "initialization of module "modname" failed");       \
-    Py_XDECREF(str_type);                                      \
-    Py_XDECREF(str_value);                                     \
-    Py_XDECREF(exc_type);                                      \
-    Py_XDECREF(exc_value);                                     \
-    Py_XDECREF(exc_tb);                                                \
-}
+static char *ucnhash_docstring = "ucnhash hash function module";
  
  
  /* Create PyMethodObjects and register them in the module's dict */
  DL_EXPORT(void) 
  initucnhash(void)
  {
-    PyObject *module, *moddict;
-    /* Create module */
-    module = Py_InitModule4("ucnhash", /* Module name */
-             Module_methods, /* Method list */
-             Module_docstring, /* Module doc-string */
-             (PyObject *)NULL, /* always pass this as *self */
-             PYTHON_API_VERSION); /* API Version */
-    if (module == NULL)
-        goto onError;
-    /* Add some constants to the module's dict */
-    moddict = PyModule_GetDict(module);
-    if (moddict == NULL)
-        goto onError;
+    PyObject *m, *d, *v;
+
+    m = Py_InitModule4(
+        "ucnhash", /* Module name */
+        ucnhash_methods, /* Method list */
+        ucnhash_docstring, /* Module doc-string */
+        (PyObject *)NULL, /* always pass this as *self */
+        PYTHON_API_VERSION); /* API Version */
+    if (!m)
+        return;
+
+    d = PyModule_GetDict(m);
+    if (!d)
+        return;
  
      /* Export C API */
-    insobj(
-        moddict,
-        "ucnhashAPI",
-        PyCObject_FromVoidPtr((void *)&hashAPI, NULL));
-    
-onError:
-    /* Check for errors and report them */
-    if (PyErr_Occurred())
-        Py_ReportModuleInitError("ucnhash");
-    return;
+    v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
+    PyDict_SetItemString(d, "Unicode_Names_CAPI", v);
+    Py_XDECREF(v);
  }
  
  static const unsigned short G[] = 
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index c1f3d5414f0cfc4112d576613e9c975246bea338..a06c40b9d604f3d99c1d553d40c8b7ca95c8fd83 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -6,61 +6,35 @@ Unicode Integration Proposal (see file Misc/unicode.txt).
  
  Copyright (c) Corporation for National Research Initiatives.
  
+--------------------------------------------------------------------
+The original string type implementation is:
+
+    Copyright (c) 1999 by Secret Labs AB
+    Copyright (c) 1999 by Fredrik Lundh
+
+By obtaining, using, and/or copying this software and/or its
+associated documentation, you agree that you have read, understood,
+and will comply with the following terms and conditions:
+
+Permission to use, copy, modify, and distribute this software and its
+associated documentation for any purpose and without fee is hereby
+granted, provided that the above copyright notice appears in all
+copies, and that both that copyright notice and this permission notice
+appear in supporting documentation, and that the name of Secret Labs
+AB or the author not be used in advertising or publicity pertaining to
+distribution of the software without specific, written prior
+permission.
+
+SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
+THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
+ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
+OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+--------------------------------------------------------------------
  
- Original header:
- --------------------------------------------------------------------
-
- * Yet another Unicode string type for Python.  This type supports the
- * 16-bit Basic Multilingual Plane (BMP) only.
- *
- * Note that this string class supports embedded NULL characters.  End
- * of string is given by the length attribute.  However, the internal
- * representation always stores a trailing NULL to make it easier to
- * use unicode strings with standard APIs.
- *
- * History:
- * 1999-01-23 fl  Created
- * 1999-01-24 fl  Added split, join, capwords; basic UTF-8 support
- * 1999-01-24 fl  Basic UCS-2 support, buffer interface, etc.
- * 1999-03-06 fl  Moved declarations to separate file, etc.
- * 1999-06-13 fl  Changed join method semantics according to Tim's proposal
- * 1999-08-10 fl  Some minor tweaks
- *
- * Written by Fredrik Lundh, January 1999.
- *
- * Copyright (c) 1999 by Secret Labs AB.
- * Copyright (c) 1999 by Fredrik Lundh.
- *
- * fredrik@pythonware.com
- * http://www.pythonware.com
- *
- * --------------------------------------------------------------------
- * This Unicode String Type is
- * 
- * Copyright (c) 1999 by Secret Labs AB
- * Copyright (c) 1999 by Fredrik Lundh
- * 
- * By obtaining, using, and/or copying this software and/or its
- * associated documentation, you agree that you have read, understood,
- * and will comply with the following terms and conditions:
- * 
- * Permission to use, copy, modify, and distribute this software and its
- * associated documentation for any purpose and without fee is hereby
- * granted, provided that the above copyright notice appears in all
- * copies, and that both that copyright notice and this permission notice
- * appear in supporting documentation, and that the name of Secret Labs
- * AB or the author not be used in advertising or publicity pertaining to
- * distribution of the software without specific, written prior
- * permission.
- * 
- * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
- * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
- * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- * -------------------------------------------------------------------- */
+*/
  
  #include "Python.h"
  
@@ -1129,27 +1103,7 @@ int unicodeescape_decoding_error(const char **source,
      }
  }
  
-static _Py_UCNHashAPI *pucnHash = NULL;
-
-static
-int mystrnicmp(const char *s1, const char *s2, size_t count)
-{
-    char c1, c2;
-    
-    if (count)
-    {
-        do
-        {
-           c1 = tolower(*(s1++));
-           c2 = tolower(*(s2++));
-        }
-        while(--count && c1 == c2);
-        
-        return c1 - c2;
-    }
-    
-    return 0;
-}
+static _PyUnicode_Name_CAPI *unicode_names = NULL;
  
  PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
                                         int size,
@@ -1282,55 +1236,37 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
              /* Ok, we need to deal with Unicode Character Names now,
               * make sure we've imported the hash table data...
               */
-            if (pucnHash == NULL) {
+            if (unicode_names == NULL) {
                  PyObject *mod = 0, *v = 0;
                  mod = PyImport_ImportModule("ucnhash");
                  if (mod == NULL)
                      goto onError;
-                v = PyObject_GetAttrString(mod,"ucnhashAPI");
+                v = PyObject_GetAttrString(mod,"Unicode_Names_CAPI");
                  Py_DECREF(mod);
                  if (v == NULL)
                      goto onError;
-                pucnHash = PyCObject_AsVoidPtr(v);
+                unicode_names = PyCObject_AsVoidPtr(v);
                  Py_DECREF(v);
-                if (pucnHash == NULL)
+                if (unicode_names == NULL)
                      goto onError;
              }
                  
              if (*s == '{') {
                  const char *start = s + 1;
                  const char *endBrace = start;
-                unsigned long j;
-
-                /* look for either the closing brace, or we
-                 * exceed the maximum length of the unicode character names
-                 */
-                while (*endBrace != '}' &&
-                       (unsigned int)(endBrace - start) <=
-                           pucnHash->cchMax &&
-                       endBrace < end)
-                {
+
+                /* look for the closing brace */
+                while (*endBrace != '}' && endBrace < end)
                      endBrace++;
-                }
                  if (endBrace != end && *endBrace == '}') {
-                    j = pucnHash->hash(start, endBrace - start);
-                    if (j > pucnHash->cKeys ||
-                        mystrnicmp(
-                            start,
-                            ((_Py_UnicodeCharacterName *) 
-                             (pucnHash->getValue(j)))->pszUCN,
-                            (int)(endBrace - start)) != 0)
-                    {
+                    if (!unicode_names->getcode(start, endBrace-start, &chr)) {
                          if (unicodeescape_decoding_error(
                                  &s, &x, errors,
-                                "Invalid Unicode Character Name"))
-                        {
+                                "Invalid Unicode Character Name")
+                            )
                              goto onError;
-                        }
                          goto ucnFallthrough;
                      }
-                    chr = ((_Py_UnicodeCharacterName *)
-                           (pucnHash->getValue(j)))->value;
                      s = endBrace + 1;
                      goto store;
                  } else {
author	Fredrik Lundh <fredrik@pythonware.com>
	Fri, 19 Jan 2001 09:45:02 +0000 (09:45 +0000)
committer	Fredrik Lundh <fredrik@pythonware.com>
	Fri, 19 Jan 2001 09:45:02 +0000 (09:45 +0000)
Include/ucnhash.h		patch | blob | history
Lib/test/test_ucn.py		patch | blob | history
Modules/ucnhash.c		patch | blob | history
Objects/unicodeobject.c		patch | blob | history