granicus.if.org Git - postgresql/commitdiff
Fix long standing Asian multibyte charsets bug.
author: Tatsuo Ishii <ishii@postgresql.org>
Sat, 24 Dec 2005 10:40:55 +0000 (10:40 +0000)
committer: Tatsuo Ishii <ishii@postgresql.org>
Sat, 24 Dec 2005 10:40:55 +0000 (10:40 +0000)
See:

Subject: [HACKERS] bugs with certain Asian multibyte charsets
From: Tatsuo Ishii <ishii@sraoss.co.jp>
To: pgsql-hackers@postgresql.org
Date: Sat, 24 Dec 2005 18:25:33 +0900 (JST)

for more details.

src/backend/utils/mb/wchar.c

index 0e19a6075f6cdca53157d79ebb70edc72e197547..5dbc01032accc80b1898571c34952fd7fe51e2a3 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * conversion functions between pg_wchar and multibyte streams.
  * Tatsuo Ishii
- * $Id: wchar.c,v 1.34 2003/09/25 06:58:05 petere Exp $
+ * $Id: wchar.c,v 1.34.2.1 2005/12/24 10:40:55 ishii Exp $
  *
  * WIN1250 client encoding updated by Pavel Behal
  *
@@ -52,7 +52,6 @@ pg_ascii_mblen(const unsigned char *s)
 /*
  * EUC
  */
-
 static int     pg_euc2wchar_with_len
                        (const unsigned char *from, pg_wchar *to, int len)
 {
@@ -60,26 +59,26 @@ static int  pg_euc2wchar_with_len
 
        while (len > 0 && *from)
        {
-               if (*from == SS2 && len >= 2)
+               if (*from == SS2 && len >= 2)   /* JIS X 0201 (so called "1 byte KANA") */
                {
                        from++;
-                       *to = 0xff & *from++;
+                       *to = (SS2 << 8) | *from++;
                        len -= 2;
                }
-               else if (*from == SS3 && len >= 3)
+               else if (*from == SS3 && len >= 3)              /* JIS X 0212 KANJI */
                {
                        from++;
-                       *to = *from++ << 8;
-                       *to |= 0x3f & *from++;
+                       *to = (SS3 << 16) | (*from++ << 8);
+                       *to |= *from++;
                        len -= 3;
                }
-               else if ((*from & 0x80) && len >= 2)
+               else if ((*from & 0x80) && len >= 2)    /* JIS X 0208 KANJI */
                {
                        *to = *from++ << 8;
                        *to |= *from++;
                        len -= 2;
                }
-               else
+               else    /* must be ASCII */
                {
                        *to = *from++;
                        len--;
@@ -139,6 +138,7 @@ pg_euckr_mblen(const unsigned char *s)
 
 /*
  * EUC_CN
+ *
  */
 static int     pg_euccn2wchar_with_len
                        (const unsigned char *from, pg_wchar *to, int len)
@@ -147,21 +147,21 @@ static int        pg_euccn2wchar_with_len
 
        while (len > 0 && *from)
        {
-               if (*from == SS2 && len >= 3)
+               if (*from == SS2 && len >= 3)   /* code set 2 (unused?) */
                {
                        from++;
-                       *to = 0x3f00 & (*from++ << 8);
-                       *to = *from++;
+                       *to = (SS2 << 16) | (*from++ << 8);
+                       *to |= *from++;
                        len -= 3;
                }
-               else if (*from == SS3 && len >= 3)
+               else if (*from == SS3 && len >= 3)              /* code set 3 (unused?) */
                {
                        from++;
-                       *to = *from++ << 8;
-                       *to |= 0x3f & *from++;
+                       *to = (SS3 << 16) | (*from++ << 8);
+                       *to |= *from++;
                        len -= 3;
                }
-               else if ((*from & 0x80) && len >= 2)
+               else if ((*from & 0x80) && len >= 2)    /* code set 1 */
                {
                        *to = *from++ << 8;
                        *to |= *from++;
@@ -193,6 +193,7 @@ pg_euccn_mblen(const unsigned char *s)
 
 /*
  * EUC_TW
+ *
  */
 static int     pg_euctw2wchar_with_len
                        (const unsigned char *from, pg_wchar *to, int len)
@@ -201,22 +202,22 @@ static int        pg_euctw2wchar_with_len
 
        while (len > 0 && *from)
        {
-               if (*from == SS2 && len >= 4)
+               if (*from == SS2 && len >= 4)   /* code set 2 */
                {
                        from++;
-                       *to = *from++ << 16;
+                       *to = (SS2 << 24) | (*from++ << 16) ;
                        *to |= *from++ << 8;
                        *to |= *from++;
                        len -= 4;
                }
-               else if (*from == SS3 && len >= 3)
+               else if (*from == SS3 && len >= 3)              /* code set 3 (unused?) */
                {
                        from++;
-                       *to = *from++ << 8;
-                       *to |= 0x3f & *from++;
+                       *to = (SS3 << 16) | (*from++ << 8);
+                       *to |= *from++;
                        len -= 3;
                }
-               else if ((*from & 0x80) && len >= 2)
+               else if ((*from & 0x80) && len >= 2)    /* code set 1 */
                {
                        *to = *from++ << 8;
                        *to |= *from++;