]> granicus.if.org Git - postgresql/blob
90d6f301512cf31885acd2584afb975ac89db2fa
[postgresql] /
1 /*-------------------------------------------------------------------------
2  *
3  *        EUC_JIS_2004, SHIFT_JIS_2004
4  *
5  * Copyright (c) 2007-2008, PostgreSQL Global Development Group
6  *
7  * IDENTIFICATION
8  *        $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/euc_jis_2004_and_shift_jis_2004/euc_jis_2004_and_shift_jis_2004.c,v 1.3 2008/01/01 20:31:21 tgl Exp $
9  *
10  *-------------------------------------------------------------------------
11  */
12
13 #include "postgres.h"
14 #include "fmgr.h"
15 #include "mb/pg_wchar.h"
16
17 PG_MODULE_MAGIC;
18
19 PG_FUNCTION_INFO_V1(euc_jis_2004_to_shift_jis_2004);
20 PG_FUNCTION_INFO_V1(shift_jis_2004_to_euc_jis_2004);
21
22 extern Datum euc_jis_2004_to_shift_jis_2004(PG_FUNCTION_ARGS);
23 extern Datum shift_jis_2004_to_euc_jis_2004(PG_FUNCTION_ARGS);
24
25 static void euc_jis_20042shift_jis_2004(const unsigned char *euc, unsigned char *p, int len);
26 static void shift_jis_20042euc_jis_2004(const unsigned char *sjis, unsigned char *p, int len);
27
28 /* ----------
29  * conv_proc(
30  *              INTEGER,        -- source encoding id
31  *              INTEGER,        -- destination encoding id
32  *              CSTRING,        -- source string (null terminated C string)
33  *              CSTRING,        -- destination string (null terminated C string)
34  *              INTEGER         -- source string length
35  * ) returns VOID;
36  * ----------
37  */
38
39 Datum
40 euc_jis_2004_to_shift_jis_2004(PG_FUNCTION_ARGS)
41 {
42         unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
43         unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
44         int                     len = PG_GETARG_INT32(4);
45
46         Assert(PG_GETARG_INT32(0) == PG_EUC_JIS_2004);
47         Assert(PG_GETARG_INT32(1) == PG_SHIFT_JIS_2004);
48         Assert(len >= 0);
49
50         euc_jis_20042shift_jis_2004(src, dest, len);
51
52         PG_RETURN_VOID();
53 }
54
55 Datum
56 shift_jis_2004_to_euc_jis_2004(PG_FUNCTION_ARGS)
57 {
58         unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
59         unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
60         int                     len = PG_GETARG_INT32(4);
61
62         Assert(PG_GETARG_INT32(0) == PG_SHIFT_JIS_2004);
63         Assert(PG_GETARG_INT32(1) == PG_EUC_JIS_2004);
64         Assert(len >= 0);
65
66         shift_jis_20042euc_jis_2004(src, dest, len);
67
68         PG_RETURN_VOID();
69 }
70
71 /*
72  * EUC_JIS_2004 -> SHIFT_JIS_2004
73  */
74 static void
75 euc_jis_20042shift_jis_2004(const unsigned char *euc, unsigned char *p, int len)
76 {
77         int                     c1,
78                                 ku,
79                                 ten;
80         int                     l;
81
82         while (len > 0)
83         {
84                 c1 = *euc;
85                 if (!IS_HIGHBIT_SET(c1))
86                 {
87                         /* ASCII */
88                         if (c1 == 0)
89                                 report_invalid_encoding(PG_EUC_JIS_2004,
90                                                                                 (const char *) euc, len);
91                         *p++ = c1;
92                         euc++;
93                         len--;
94                         continue;
95                 }
96
97                 l = pg_encoding_verifymb(PG_EUC_JIS_2004, (const char *) euc, len);
98
99                 if (l < 0)
100                         report_invalid_encoding(PG_EUC_JIS_2004,
101                                                                         (const char *) euc, len);
102
103                 if (c1 == SS2 && l == 2)        /* JIS X 0201 kana? */
104                 {
105                         *p++ = euc[1];
106                 }
107                 else if (c1 == SS3 && l == 3)   /* JIS X 0213 plane 2? */
108                 {
109                         ku = euc[1] - 0xa0;
110                         ten = euc[2] - 0xa0;
111
112                         switch (ku)
113                         {
114                                 case 1:
115                                 case 3:
116                                 case 4:
117                                 case 5:
118                                 case 8:
119                                 case 12:
120                                 case 13:
121                                 case 14:
122                                 case 15:
123                                         *p++ = ((ku + 0x1df) >> 1) - (ku >> 3) * 3;
124                                         break;
125                                 default:
126                                         if (ku >= 78 && ku <= 94)
127                                         {
128                                                 *p++ = (ku + 0x19b) >> 1;
129                                         }
130                                         else
131                                                 report_invalid_encoding(PG_EUC_JIS_2004,
132                                                                                                 (const char *) euc, len);
133                         }
134
135                         if (ku % 2)
136                         {
137                                 if (ten >= 1 && ten <= 63)
138                                         *p++ = ten + 0x3f;
139                                 else if (ten >= 64 && ten <= 94)
140                                         *p++ = ten + 0x40;
141                                 else
142                                         report_invalid_encoding(PG_EUC_JIS_2004,
143                                                                                         (const char *) euc, len);
144                         }
145                         else
146                                 *p++ = ten + 0x9e;
147                 }
148
149                 else if (l == 2)                /* JIS X 0213 plane 1? */
150                 {
151                         ku = c1 - 0xa0;
152                         ten = euc[1] - 0xa0;
153
154                         if (ku >= 1 && ku <= 62)
155                                 *p++ = (ku + 0x101) >> 1;
156                         else if (ku >= 63 && ku <= 94)
157                                 *p++ = (ku + 0x181) >> 1;
158                         else
159                                 report_invalid_encoding(PG_EUC_JIS_2004,
160                                                                                 (const char *) euc, len);
161
162                         if (ku % 2)
163                         {
164                                 if (ten >= 1 && ten <= 63)
165                                         *p++ = ten + 0x3f;
166                                 else if (ten >= 64 && ten <= 94)
167                                         *p++ = ten + 0x40;
168                                 else
169                                         report_invalid_encoding(PG_EUC_JIS_2004,
170                                                                                         (const char *) euc, len);
171                         }
172                         else
173                                 *p++ = ten + 0x9e;
174                 }
175                 else
176                         report_invalid_encoding(PG_EUC_JIS_2004,
177                                                                         (const char *) euc, len);
178
179                 euc += l;
180                 len -= l;
181         }
182         *p = '\0';
183 }
184
/*
 * Decode the second byte of a two-byte SHIFT_JIS_2004 character.
 *
 * Returns the "ten" number (1..94) indicated by second byte b, or -1 if b
 * is not a valid second byte.  The parity of the "ku" implied by b is
 * stored in *ku:
 * *ku = 0: "ku" = even
 * *ku = 1: "ku" = odd
 * On error *ku is set to 0, so the caller never sees an indeterminate value.
 */
static int
get_ten(int b, int *ku)
{
	int			ten;

	if (b >= 0x40 && b <= 0x7e)
	{
		/* odd ku, ten 1..63 */
		ten = b - 0x3f;
		*ku = 1;
	}
	else if (b >= 0x80 && b <= 0x9e)
	{
		/* odd ku, ten 64..94 (0x7f is skipped) */
		ten = b - 0x40;
		*ku = 1;
	}
	else if (b >= 0x9f && b <= 0xfc)
	{
		/* even ku, ten 1..94 */
		ten = b - 0x9e;
		*ku = 0;
	}
	else
	{
		ten = -1;				/* error */
		*ku = 0;				/* keep the out-parameter defined */
	}
	return ten;
}
216
217 /*
218  * SHIFT_JIS_2004 ---> EUC_JIS_2004
219  */
220
221 static void
222 shift_jis_20042euc_jis_2004(const unsigned char *sjis, unsigned char *p, int len)
223 {
224         int                     c1,
225                                 c2;
226         int                     ku,
227                                 ten,
228                                 kubun;
229         int                     plane;
230         int                     l;
231
232         while (len > 0)
233         {
234                 c1 = *sjis;
235                 c2 = sjis[1];
236
237                 if (!IS_HIGHBIT_SET(c1))
238                 {
239                         /* ASCII */
240                         if (c1 == 0)
241                                 report_invalid_encoding(PG_SHIFT_JIS_2004,
242                                                                                 (const char *) sjis, len);
243                         *p++ = c1;
244                         sjis++;
245                         len--;
246                         continue;
247                 }
248
249                 l = pg_encoding_verifymb(PG_SHIFT_JIS_2004, (const char *) sjis, len);
250
251                 if (l < 0)
252                         report_invalid_encoding(PG_SHIFT_JIS_2004,
253                                                                         (const char *) sjis, len);
254
255                 if (c1 >= 0xa1 && c1 <= 0xdf && l == 1)
256                 {
257                         /* JIS X0201 (1 byte kana) */
258                         *p++ = SS2;
259                         *p++ = c1;
260                 }
261                 else if (l == 2)
262                 {
263                         plane = 1;
264                         ku = 1;
265                         ten = 1;
266
267                         /*
268                          * JIS X 0213
269                          */
270                         if (c1 >= 0x81 && c1 <= 0x9f)           /* plane 1 1ku-62ku */
271                         {
272                                 ku = (c1 << 1) - 0x100;
273                                 ten = get_ten(c2, &kubun);
274                                 if (ten < 0)
275                                         report_invalid_encoding(PG_SHIFT_JIS_2004,
276                                                                                         (const char *) sjis, len);
277                                 ku -= kubun;
278                         }
279                         else if (c1 >= 0xe0 && c1 <= 0xef)      /* plane 1 62ku-94ku */
280                         {
281                                 ku = (c1 << 1) - 0x180;
282                                 ten = get_ten(c2, &kubun);
283                                 if (ten < 0)
284                                         report_invalid_encoding(PG_SHIFT_JIS_2004,
285
286                                                                                         (const char *) sjis, len);
287                                 ku -= kubun;
288                         }
289                         else if (c1 >= 0xf0 && c1 <= 0xf3)      /* plane 2
290                                                                                                  * 1,3,4,5,8,12,13,14,15 ku */
291                         {
292                                 plane = 2;
293                                 ten = get_ten(c2, &kubun);
294                                 if (ten < 0)
295                                         report_invalid_encoding(PG_SHIFT_JIS_2004,
296                                                                                         (const char *) sjis, len);
297                                 switch (c1)
298                                 {
299                                         case 0xf0:
300                                                 ku = kubun == 0 ? 8 : 1;
301                                                 break;
302                                         case 0xf1:
303                                                 ku = kubun == 0 ? 4 : 3;
304                                                 break;
305                                         case 0xf2:
306                                                 ku = kubun == 0 ? 12 : 5;
307                                                 break;
308                                         default:
309                                                 ku = kubun == 0 ? 14 : 13;
310                                                 break;
311                                 }
312                         }
313                         else if (c1 >= 0xf4 && c1 <= 0xfc)      /* plane 2 78-94ku */
314                         {
315                                 plane = 2;
316                                 ten = get_ten(c2, &kubun);
317                                 if (ten < 0)
318                                         report_invalid_encoding(PG_SHIFT_JIS_2004,
319                                                                                         (const char *) sjis, len);
320                                 if (c1 == 0xf4 && kubun == 1)
321                                         ku = 15;
322                                 else
323                                         ku = (c1 << 1) - 0x19a - kubun;
324                         }
325                         else
326                                 report_invalid_encoding(PG_SHIFT_JIS_2004,
327                                                                                 (const char *) sjis, len);
328
329                         if (plane == 2)
330                                 *p++ = SS3;
331
332                         *p++ = ku + 0xa0;
333                         *p++ = ten + 0xa0;
334                 }
335                 sjis += l;
336                 len -= l;
337         }
338         *p = '\0';
339 }