Implement names for CJK unified ideographs. Add name to KeyError output.

author Martin v. Löwis <martin@v.loewis.de>

Sat, 23 Nov 2002 18:01:32 +0000 (18:01 +0000)

committer Martin v. Löwis <martin@v.loewis.de>

Sat, 23 Nov 2002 18:01:32 +0000 (18:01 +0000)
author Martin v. Löwis <martin@v.loewis.de>
Sat, 23 Nov 2002 18:01:32 +0000 (18:01 +0000)
committer Martin v. Löwis <martin@v.loewis.de>
Sat, 23 Nov 2002 18:01:32 +0000 (18:01 +0000)
diff --git a/Lib/test/output/test_ucn b/Lib/test/output/test_ucn

index 1006c07eddecc33b7a37d3acfd9f583862715b2d..c41017bbaee211b472eee8e5df1e10db53bb0396 100644 (file)
--- a/Lib/test/output/test_ucn
+++ b/Lib/test/output/test_ucn
@@ -2,7 +2,8 @@ test_ucn
  Testing General Unicode Character Name, and case insensitivity... done.
  Testing name to code mapping.... done.
  Testing hangul syllable names.... done.
-Testing code to name mapping for all characters.... done.
-Found 22728 characters in the unicode name database
+Testing names of CJK unified ideographs.... done.
+Testing code to name mapping for all BMP characters.... done.
+Found 50212 characters in the unicode name database
  Testing misc. symbols for unicode character name expansion.... done.
  Testing unicode character name expansion strict error handling.... done.
diff --git a/Lib/test/test_ucn.py b/Lib/test/test_ucn.py

index 6f2b022b9d02d3a4756ea7548d0f2102f15a02ef..e7b8bbdea3597285ad9b6e6f163e848bfacfe49f 100644 (file)
--- a/Lib/test/test_ucn.py
+++ b/Lib/test/test_ucn.py
@@ -80,16 +80,28 @@ else:
      raise AssertionError, "Found name for U+D7A4"
  print "done."
  
-print "Testing code to name mapping for all characters....",
+print "Testing names of CJK unified ideographs....",
+exec r"""
+verify(u"\N{CJK UNIFIED IDEOGRAPH-3400}" == u"\u3400")
+verify(u"\N{CJK UNIFIED IDEOGRAPH-4DB5}" == u"\u4db5")
+verify(u"\N{CJK UNIFIED IDEOGRAPH-4E00}" == u"\u4e00")
+verify(u"\N{CJK UNIFIED IDEOGRAPH-9FA5}" == u"\u9fa5")
+verify(u"\N{CJK UNIFIED IDEOGRAPH-20000}" == u"\U00020000")
+verify(u"\N{CJK UNIFIED IDEOGRAPH-2A6D6}" == u"\U0002a6d6")
+"""
+print "done."
+
+print "Testing code to name mapping for all BMP characters....",
  count = 0
-for code in range(65536):
+for code in range(0x10000):
      try:
          char = unichr(code)
          name = unicodedata.name(char)
-        verify(unicodedata.lookup(name) == char)
-        count += 1
      except (KeyError, ValueError):
          pass
+    else:
+        verify(unicodedata.lookup(name) == char)
+        count += 1
  print "done."
  
  print "Found", count, "characters in the unicode name database"
diff --git a/Misc/NEWS b/Misc/NEWS

index 71da82daee53cd7ea67a5c64cba2d0e5092375d6..8fdd1f6c07b967f5f94998e299ccad5d5da79381 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -318,7 +318,7 @@ Extension modules
    is now named bsddb185.
  
  - unicodedata was updated to Unicode 3.2. It now also supports names
-  for Hangul syllables.
+  for Hangul syllables and CJK unified ideographs.
  
  - resource.getrlimit() now returns longs instead of ints.
  
diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c

index 330b3763e44cfd553f2678ee28cc96a95d1edafc..3620936d9fbc3598b58ac075959f1b34c509a300 100644 (file)
--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@@ -348,6 +348,16 @@ _getucname(Py_UCS4 code, char* buffer, int buflen)
         return 1;
      }
  
+    if ((0x3400 <= code && code <= 0x4DB5) ||  /* CJK Ideograph Extension A */
+        (0x4E00 <= code && code <= 0x9FA5) ||  /* CJK Ideograph */
+        (0x20000 <= code && code <= 0x2A6D6)) {/* CJK Ideograph Extension B */
+        if (buflen < 28)
+            /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
+            return 0;
+        sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
+        return 1;
+    }
+
      if (code >= 0x110000)
          return 0;
  
@@ -449,6 +459,30 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
             *code = SBase + (L*VCount+V)*TCount + T;
             return 1;
         }
+        /* Otherwise, it's an illegal syllable name. */
+        return 0;
+    }
+
+    /* Check for unified ideographs. */
+    if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
+        /* Four or five hexdigits must follow. */
+        v = 0;
+        name += 22;
+        namelen -= 22;
+        if (namelen != 4 && namelen != 5)
+            return 0;
+        while (namelen--) {
+            v *= 16;
+            if (*name >= '0' && *name <= '9')
+                v += *name - '0';
+            else if (*name >= 'A' && *name <= 'F')
+                v += *name - 'A' + 10;
+            else
+                return 0;
+            name++;
+        }
+        *code = v;
+        return 1;
      }
  
      /* the following is the same as python's dictionary lookup, with
@@ -535,7 +569,11 @@ unicodedata_lookup(PyObject* self, PyObject* args)
          return NULL;
  
      if (!_getcode(name, namelen, &code)) {
-        PyErr_SetString(PyExc_KeyError, "undefined character name");
+        char fmt[] = "undefined character name '%s'";
+        char *buf = PyMem_MALLOC(sizeof(fmt) + namelen);
+        sprintf(buf, fmt, name);
+        PyErr_SetString(PyExc_KeyError, buf);
+        PyMem_FREE(buf);
          return NULL;
      }
author	Martin v. Löwis <martin@v.loewis.de>
	Sat, 23 Nov 2002 18:01:32 +0000 (18:01 +0000)
committer	Martin v. Löwis <martin@v.loewis.de>
	Sat, 23 Nov 2002 18:01:32 +0000 (18:01 +0000)
Lib/test/output/test_ucn		patch \| blob \| history
Lib/test/test_ucn.py		patch \| blob \| history
Misc/NEWS		patch \| blob \| history
Modules/unicodedata.c		patch \| blob \| history