granicus.if.org Git - python/commitdiff
Backport from trunk r56727:
author Hye-Shik Chang <hyeshik@gmail.com>
Sat, 4 Aug 2007 04:15:04 +0000 (04:15 +0000)
committer Hye-Shik Chang <hyeshik@gmail.com>
Sat, 4 Aug 2007 04:15:04 +0000 (04:15 +0000)
Fix gb18030 codec's bug that doesn't map two-byte characters on
GB18030 extension in encoding. (bug reported by Bjorn Stabell)

Lib/test/test_codecmaps_cn.py
Lib/test/test_multibytecodec_support.py
Misc/NEWS
Modules/cjkcodecs/_codecs_cn.c

index 8cbee766cdd0b5bfe6d3e0630d39d69b9ace187e..0c34fdcde62843a27d8c36c48bfed2d06326f602 100644 (file)
@@ -19,10 +19,18 @@ class TestGBKMap(test_multibytecodec_support.TestBase_Mapping,
     mapfileurl = 'http://www.unicode.org/Public/MAPPINGS/VENDORS/' \
                  'MICSFT/WINDOWS/CP936.TXT'
 
+class TestGB18030Map(test_multibytecodec_support.TestBase_Mapping,
+                     unittest.TestCase):
+    encoding = 'gb18030'
+    mapfileurl = 'http://source.icu-project.org/repos/icu/data/' \
+                 'trunk/charset/data/xml/gb-18030-2000.xml'
+
+
 def test_main():
     suite = unittest.TestSuite()
     suite.addTest(unittest.makeSuite(TestGB2312Map))
     suite.addTest(unittest.makeSuite(TestGBKMap))
+    suite.addTest(unittest.makeSuite(TestGB18030Map))
     test_support.run_suite(suite)
 
 if __name__ == "__main__":
index bec32de1d5673e854601ba1ad30e24272265a6d0..197f7778bd3ed3d938af7da90656fb82e89d909b 100644 (file)
@@ -5,7 +5,7 @@
 #
 
 import sys, codecs, os.path
-import unittest
+import unittest, re
 from test import test_support
 from StringIO import StringIO
 
@@ -272,6 +272,12 @@ class TestBase_Mapping(unittest.TestCase):
         return test_support.open_urlresource(self.mapfileurl)
 
     def test_mapping_file(self):
+        if self.mapfileurl.endswith('.xml'):
+            self._test_mapping_file_ucm()
+        else:
+            self._test_mapping_file_plain()
+
+    def _test_mapping_file_plain(self):
         unichrs = lambda s: u''.join(map(unichr, map(eval, s.split('+'))))
         urt_wa = {}
 
@@ -303,6 +309,14 @@ class TestBase_Mapping(unittest.TestCase):
 
             self._testpoint(csetch, unich)
 
+    def _test_mapping_file_ucm(self):
+        ucmdata = self.open_mapping_file().read()
+        uc = re.findall('<a u="([A-F0-9]{4})" b="([0-9A-F ]+)"/>', ucmdata)
+        for uni, coded in uc:
+            unich = unichr(int(uni, 16))
+            codech = ''.join(chr(int(c, 16)) for c in coded.split())
+            self._testpoint(codech, unich)
+
     def test_mapping_supplemental(self):
         for mapping in self.supmaps:
             self._testpoint(*mapping)
index 0b01eaf452df4443d899da99e60bf83511f5c8ad..3d5221cd43ab6193ab3b52ed3437ac6a6c1ffa2c 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -26,6 +26,9 @@ Core and builtins
 Library
 -------
 
+- GB18030 codec now can encode additional two-byte characters that
+  are missing in GBK.
+
 - Bug #1704793: Raise KeyError if unicodedata.lookup cannot
   represent the result in a single character.
 
index c811a67eda00f72d5fde4839332e0c84bcae702c..4542ce62f06a4db77eb838549f76141b750178cc 100644 (file)
@@ -197,6 +197,7 @@ ENCODER(gb18030)
                REQUIRE_OUTBUF(2)
 
                GBK_ENCODE(c, code)
+               else TRYMAP_ENC(gb18030ext, code, c);
                else {
                        const struct _gb18030_to_unibmp_ranges *utrrange;