]> granicus.if.org Git - postgresql/commitdiff
Allow direct conversion between EUC_JP and SJIS to improve
authorTatsuo Ishii <ishii@postgresql.org>
Fri, 24 Jun 2005 13:56:39 +0000 (13:56 +0000)
committerTatsuo Ishii <ishii@postgresql.org>
Fri, 24 Jun 2005 13:56:39 +0000 (13:56 +0000)
performance. patches submitted by Atsushi Ogawa.

src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c

index ddeffa57ac2c8274645653687e6e39be9a59f46f..37f6d2b1c56a80303807356de9da34e99a0307db 100644 (file)
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c,v 1.10 2005/06/10 16:43:56 ishii Exp $
+ *       $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c,v 1.11 2005/06/24 13:56:39 ishii Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -58,6 +58,8 @@ static void sjis2mic(unsigned char *sjis, unsigned char *p, int len);
 static void mic2sjis(unsigned char *mic, unsigned char *p, int len);
 static void euc_jp2mic(unsigned char *euc, unsigned char *p, int len);
 static void mic2euc_jp(unsigned char *mic, unsigned char *p, int len);
+static void euc_jp2sjis(unsigned char *mic, unsigned char *p, int len);
+static void sjis2euc_jp(unsigned char *mic, unsigned char *p, int len);
 
 Datum
 euc_jp_to_sjis(PG_FUNCTION_ARGS)
@@ -65,16 +67,12 @@ euc_jp_to_sjis(PG_FUNCTION_ARGS)
        unsigned char *src = PG_GETARG_CSTRING(2);
        unsigned char *dest = PG_GETARG_CSTRING(3);
        int                     len = PG_GETARG_INT32(4);
-       unsigned char *buf;
 
        Assert(PG_GETARG_INT32(0) == PG_EUC_JP);
        Assert(PG_GETARG_INT32(1) == PG_SJIS);
        Assert(len >= 0);
 
-       buf = palloc(len * ENCODING_GROWTH_RATE);
-       euc_jp2mic(src, buf, len);
-       mic2sjis(buf, dest, strlen(buf));
-       pfree(buf);
+       euc_jp2sjis(src, dest, len);
 
        PG_RETURN_VOID();
 }
@@ -85,16 +83,12 @@ sjis_to_euc_jp(PG_FUNCTION_ARGS)
        unsigned char *src = PG_GETARG_CSTRING(2);
        unsigned char *dest = PG_GETARG_CSTRING(3);
        int                     len = PG_GETARG_INT32(4);
-       unsigned char *buf;
 
        Assert(PG_GETARG_INT32(0) == PG_SJIS);
        Assert(PG_GETARG_INT32(1) == PG_EUC_JP);
        Assert(len >= 0);
 
-       buf = palloc(len * ENCODING_GROWTH_RATE);
-       sjis2mic(src, buf, len);
-       mic2euc_jp(buf, dest, strlen(buf));
-       pfree(buf);
+       sjis2euc_jp(src, dest, len);
 
        PG_RETURN_VOID();
 }
@@ -454,3 +448,199 @@ mic2euc_jp(unsigned char *mic, unsigned char *p, int len)
        }
        *p = '\0';
 }
+
+/*
+ * EUC_JP -> SJIS
+ */
+static void
+euc_jp2sjis(unsigned char *euc, unsigned char *p, int len)
+{
+       int                     c1,
+                               c2,
+                               k;
+       unsigned char *euc_end = euc + len;
+
+       while (euc_end >= euc && (c1 = *euc++))
+       {
+               if(c1 < 0x80)
+               {
+                       /* should be ASCII */
+                       *p++ = c1;
+               }
+               else if (c1 == SS2)
+               {
+                       /* hankaku kana? */
+                       *p++ = *euc++;
+               }
+               else if (c1 == SS3)
+               {
+                       /* JIS X0212 kanji? */
+                       c1 = *euc++;
+                       c2 = *euc++;
+                       k = c1 << 8 | c2;
+                       if (k >= 0xf5a1)
+                       {
+                               /* UDC2 */
+                               c1 -= 0x54;
+                               *p++ = ((c1 - 0xa1) >> 1) + ((c1 < 0xdf) ? 0x81 : 0xc1) + 0x74;
+                               *p++ = c2 - ((c1 & 1) ? ((c2 < 0xe0) ? 0x61 : 0x60) : 2);
+                       }
+                       else
+                       {
+                               int                     i, k2;
+
+                               /* IBM kanji */
+                               for (i = 0;; i++)
+                               {
+                                       k2 = ibmkanji[i].euc & 0xffff;
+                                       if (k2 == 0xffff)
+                                       {
+                                               *p++ = PGSJISALTCODE >> 8;
+                                               *p++ = PGSJISALTCODE & 0xff;
+                                               break;
+                                       }
+                                       if (k2 == k)
+                                       {
+                                               k = ibmkanji[i].sjis;
+                                               *p++ = k >> 8;
+                                               *p++ = k & 0xff;
+                                               break;
+                                       }
+                               }
+                       }
+               } 
+               else
+               {       
+                       /* JIS X0208 kanji? */
+                       c2 = *euc++;
+                       k = (c1 << 8) | (c2 & 0xff);
+                       if (k >= 0xf5a1)
+                       {
+                               /* UDC1 */
+                               c1 -= 0x54;
+                               *p++ = ((c1 - 0xa1) >> 1) + ((c1 < 0xdf) ? 0x81 : 0xc1) + 0x6f;
+                       }
+                       else
+                               *p++ = ((c1 - 0xa1) >> 1) + ((c1 < 0xdf) ? 0x81 : 0xc1);
+                       *p++ = c2 - ((c1 & 1) ? ((c2 < 0xe0) ? 0x61 : 0x60) : 2);
+               }
+       }
+       *p = '\0';
+}
+
+/*
+ * SJIS ---> EUC_JP
+ */
+static void
+sjis2euc_jp(unsigned char *sjis, unsigned char *p, int len)
+{
+       int                     c1,
+                               c2,
+                               i,
+                               k,
+                               k2;
+       unsigned char *sjis_end = sjis + len;
+
+       while (sjis_end >= sjis && (c1 = *sjis++))
+       {
+               if(c1 < 0x80)
+               {
+                       /* should be ASCII */
+                       *p++ = c1;
+               }
+               else if (c1 >= 0xa1 && c1 <= 0xdf)
+               {
+                       /* JIS X0201 (1 byte kana) */
+                       *p++ = SS2;
+                       *p++ = c1;
+               }
+               else
+               {
+                       /*
+                        * JIS X0208, X0212, user defined extended characters
+                        */
+                       c2 = *sjis++;
+                       k = (c1 << 8) + c2;
+                       if (k >= 0xed40 && k < 0xf040)
+                       {
+                               /* NEC selection IBM kanji */
+                               for (i = 0;; i++)
+                               {
+                                       k2 = ibmkanji[i].nec;
+                                       if (k2 == 0xffff)
+                                               break;
+                                       if (k2 == k)
+                                       {
+                                               k = ibmkanji[i].sjis;
+                                               c1 = (k >> 8) & 0xff;
+                                               c2 = k & 0xff;
+                                       }
+                               }
+                       }
+
+                       if (k < 0xeb3f)
+                       {
+                               /* JIS X0208 */
+                               *p++ = ((c1 & 0x3f) << 1) + 0x9f + (c2 > 0x9e);
+                               *p++ = c2 + ((c2 > 0x9e) ? 2 : 0x60) + (c2 < 0x80);
+                       }
+                       else if ((k >= 0xeb40 && k < 0xf040) || (k >= 0xfc4c && k <= 0xfcfc))
+                       {
+                               /* NEC selection IBM kanji - Other undecided justice */
+                               *p++ = PGEUCALTCODE >> 8;
+                               *p++ = PGEUCALTCODE & 0xff;
+                       }
+                       else if (k >= 0xf040 && k < 0xf540)
+                       {
+                               /*
+                                * UDC1 mapping to X0208 85 ku - 94 ku JIS code 0x7521 -
+                                * 0x7e7e EUC 0xf5a1 - 0xfefe
+                                */
+                               c1 -= 0x6f;
+                               *p++ = ((c1 & 0x3f) << 1) + 0xf3 + (c2 > 0x9e);
+                               *p++ = c2 + ((c2 > 0x9e) ? 2 : 0x60) + (c2 < 0x80);
+                       }
+                       else if (k >= 0xf540 && k < 0xfa40)
+                       {
+                               /*
+                                * UDC2 mapping to X0212 85 ku - 94 ku JIS code 0x7521 -
+                                * 0x7e7e EUC 0x8ff5a1 - 0x8ffefe
+                                */
+                               *p++ = SS3;
+                               c1 -= 0x74;
+                               *p++ = ((c1 & 0x3f) << 1) + 0xf3 + (c2 > 0x9e);
+                               *p++ = c2 + ((c2 > 0x9e) ? 2 : 0x60) + (c2 < 0x80);
+                       }
+                       else if (k >= 0xfa40)
+                       {
+                               /*
+                                * mapping IBM kanji to X0208 and X0212
+                                *
+                                */
+                               for (i = 0;; i++)
+                               {
+                                       k2 = ibmkanji[i].sjis;
+                                       if (k2 == 0xffff)
+                                               break;
+                                       if (k2 == k)
+                                       {
+                                               k = ibmkanji[i].euc;
+                                               if (k >= 0x8f0000)
+                                               {
+                                                       *p++ = SS3;
+                                                       *p++ = 0x80 | ((k & 0xff00) >> 8);
+                                                       *p++ = 0x80 | (k & 0xff);
+                                               }
+                                               else
+                                               {
+                                                       *p++ = 0x80 | (k >> 8);
+                                                       *p++ = 0x80 | (k & 0xff);
+                                               }
+                                       }
+                               }
+                       }
+               }
+       }
+       *p = '\0';
+}
+