]> granicus.if.org Git - postgresql/blob - src/backend/utils/mb/wchar.c
Restructure the key include files per recent pghackers discussion: there
[postgresql] / src / backend / utils / mb / wchar.c
1 /*
2  * conversion functions between pg_wchar and multi-byte streams.
3  * Tatsuo Ishii
4  * $Id: wchar.c,v 1.14 2001/02/10 02:31:27 tgl Exp $
5  *
6  * WIN1250 client encoding updated by Pavel Behal
7  *
8  */
9 /* can be used in either frontend or backend */
10 #include "postgres_fe.h"
11
12 #include "mb/pg_wchar.h"
13
/*
 * Conversion to pg_wchar is table-driven.
 * To add support for an encoding, define mb2wchar_with_len() and mblen()
 * functions for that encoding. Note that if the encoding is only
 * supported in the client, you don't need to define the
 * mb2wchar_with_len() function (SJIS is such a case).
 */
21
22 /*
23  * SQL/ASCII
24  */
25 static int pg_ascii2wchar_with_len
26                         (const unsigned char *from, pg_wchar * to, int len)
27 {
28         int cnt = 0;
29
30         while (*from && len > 0)
31         {
32                 *to++ = *from++;
33                 len--;
34                 cnt++;
35         }
36         *to = 0;
37         return(cnt);
38 }
39
/*
 * Byte length of the SQL/ASCII character at s: always one byte.
 * The argument is not examined.
 */
static int
pg_ascii_mblen(const unsigned char *s)
{
	(void) s;
	return 1;
}
45
46 /*
47  * EUC
48  */
49
50 static int pg_euc2wchar_with_len
51                         (const unsigned char *from, pg_wchar * to, int len)
52 {
53         int cnt = 0;
54
55         while (*from && len > 0)
56         {
57                 if (*from == SS2)
58                 {
59                         from++;
60                         len--;
61                         *to = 0xff & *from++;
62                         len--;
63                 }
64                 else if (*from == SS3)
65                 {
66                         from++;
67                         *to = *from++ << 8;
68                         *to |= 0x3f & *from++;
69                         len -= 3;
70                 }
71                 else if (*from & 0x80)
72                 {
73                         *to = *from++ << 8;
74                         *to |= *from++;
75                         len -= 2;
76                 }
77                 else
78                 {
79                         *to = *from++;
80                         len--;
81                 }
82                 to++;
83                 cnt++;
84         }
85         *to = 0;
86         return(cnt);
87 }
88
89 static int
90 pg_euc_mblen(const unsigned char *s)
91 {
92         int                     len;
93
94         if (*s == SS2)
95                 len = 2;
96         else if (*s == SS3)
97                 len = 3;
98         else if (*s & 0x80)
99                 len = 2;
100         else
101                 len = 1;
102         return (len);
103 }
104
105 /*
106  * EUC_JP
107  */
108 static int pg_eucjp2wchar_with_len
109                         (const unsigned char *from, pg_wchar * to, int len)
110 {
111         return(pg_euc2wchar_with_len(from, to, len));
112 }
113
/*
 * Byte length of the EUC_JP character at s; same rule as generic EUC.
 */
static int
pg_eucjp_mblen(const unsigned char *s)
{
	int			nbytes;

	nbytes = pg_euc_mblen(s);
	return nbytes;
}
119
120 /*
121  * EUC_KR
122  */
123 static int pg_euckr2wchar_with_len
124                         (const unsigned char *from, pg_wchar * to, int len)
125 {
126         return(pg_euc2wchar_with_len(from, to, len));
127 }
128
/*
 * Byte length of the EUC_KR character at s; same rule as generic EUC.
 */
static int
pg_euckr_mblen(const unsigned char *s)
{
	int			nbytes;

	nbytes = pg_euc_mblen(s);
	return nbytes;
}
134
135 /*
136  * EUC_CN
137  */
138 static int pg_euccn2wchar_with_len
139                         (const unsigned char *from, pg_wchar * to, int len)
140 {
141         int cnt = 0;
142
143         while (*from && len > 0)
144         {
145                 if (*from == SS2)
146                 {
147                         from++;
148                         len--;
149                         *to = 0x3f00 & (*from++ << 8);
150                         *to = *from++;
151                         len -= 2;
152                 }
153                 else if (*from == SS3)
154                 {
155                         from++;
156                         *to = *from++ << 8;
157                         *to |= 0x3f & *from++;
158                         len -= 3;
159                 }
160                 else if (*from & 0x80)
161                 {
162                         *to = *from++ << 8;
163                         *to |= *from++;
164                         len -= 2;
165                 }
166                 else
167                 {
168                         *to = *from++;
169                         len--;
170                 }
171                 to++;
172                 cnt++;
173         }
174         *to = 0;
175         return(cnt);
176 }
177
/*
 * Byte length of the EUC_CN character starting at s:
 * 2 when the lead byte has its high bit set, otherwise 1.
 */
static int
pg_euccn_mblen(const unsigned char *s)
{
	return (*s & 0x80) ? 2 : 1;
}
189
190 /*
191  * EUC_TW
192  */
193 static int pg_euctw2wchar_with_len
194                         (const unsigned char *from, pg_wchar * to, int len)
195 {
196         int cnt = 0;
197
198         while (*from && len > 0)
199         {
200                 if (*from == SS2)
201                 {
202                         from++;
203                         len--;
204                         *to = *from++ << 16;
205                         *to |= *from++ << 8;
206                         *to |= *from++;
207                         len -= 3;
208                 }
209                 else if (*from == SS3)
210                 {
211                         from++;
212                         *to = *from++ << 8;
213                         *to |= 0x3f & *from++;
214                         len -= 3;
215                 }
216                 else if (*from & 0x80)
217                 {
218                         *to = *from++ << 8;
219                         *to |= *from++;
220                         len -= 2;
221                 }
222                 else
223                 {
224                         *to = *from++;
225                         len--;
226                 }
227                 to++;
228                 cnt++;
229         }
230         *to = 0;
231         return(cnt);
232 }
233
234 static int
235 pg_euctw_mblen(const unsigned char *s)
236 {
237         int                     len;
238
239         if (*s == SS2)
240                 len = 4;
241         else if (*s == SS3)
242                 len = 3;
243         else if (*s & 0x80)
244                 len = 2;
245         else
246                 len = 1;
247         return (len);
248 }
249
250 /*
251  * convert UTF-8 string to pg_wchar (UCS-2)
252  * caller should allocate enough space for "to"
253  * len: length of from.
254  * "from" not necessarily null terminated.
255  */
256 static int
257 pg_utf2wchar_with_len(const unsigned char *from, pg_wchar * to, int len)
258 {
259         unsigned char c1,
260                                 c2,
261                                 c3;
262         int cnt = 0;
263
264         while (*from && len > 0)
265         {
266                 if ((*from & 0x80) == 0)
267                 {
268                         *to = *from++;
269                         len--;
270                 }
271                 else if ((*from & 0xe0) == 0xc0)
272                 {
273                         c1 = *from++ & 0x1f;
274                         c2 = *from++ & 0x3f;
275                         len -= 2;
276                         *to = c1 << 6;
277                         *to |= c2;
278                 }
279                 else if ((*from & 0xe0) == 0xe0)
280                 {
281                         c1 = *from++ & 0x0f;
282                         c2 = *from++ & 0x3f;
283                         c3 = *from++ & 0x3f;
284                         len -= 3;
285                         *to = c1 << 12;
286                         *to |= c2 << 6;
287                         *to |= c3;
288                 }
289                 else
290                 {
291                         *to = *from++;
292                         len--;
293                 }
294                 to++;
295                 cnt++;
296         }
297         *to = 0;
298         return(cnt);
299 }
300
/*
 * Returns the byte length of the UTF-8 character pointed to by s.
 *
 * Only the top three bits of the lead byte are inspected:
 * 110xxxxx gives 2, 111xxxxx gives 3, and everything else gives 1.
 */
int
pg_utf_mblen(const unsigned char *s)
{
	switch (*s & 0xe0)
	{
		case 0xc0:
			return 2;
		case 0xe0:
			return 3;
		default:
			return 1;
	}
}
317
318 /*
319  * convert mule internal code to pg_wchar
320  * caller should allocate enough space for "to"
321  * len: length of from.
322  * "from" not necessarily null terminated.
323  */
324 static int
325 pg_mule2wchar_with_len(const unsigned char *from, pg_wchar * to, int len)
326 {
327         int cnt = 0;
328
329         while (*from && len > 0)
330         {
331                 if (IS_LC1(*from))
332                 {
333                         *to = *from++ << 16;
334                         *to |= *from++;
335                         len -= 2;
336                 }
337                 else if (IS_LCPRV1(*from))
338                 {
339                         from++;
340                         *to = *from++ << 16;
341                         *to |= *from++;
342                         len -= 3;
343                 }
344                 else if (IS_LC2(*from))
345                 {
346                         *to = *from++ << 16;
347                         *to |= *from++ << 8;
348                         *to |= *from++;
349                         len -= 3;
350                 }
351                 else if (IS_LCPRV2(*from))
352                 {
353                         from++;
354                         *to = *from++ << 16;
355                         *to |= *from++ << 8;
356                         *to |= *from++;
357                         len -= 4;
358                 }
359                 else
360                 {                                               /* assume ASCII */
361                         *to = (unsigned char) *from++;
362                         len--;
363                 }
364                 to++;
365                 cnt++;
366         }
367         *to = 0;
368         return(cnt);
369 }
370
/*
 * Returns the byte length of the mule internal code character at s.
 *
 * The lead byte is tested with IS_LC1, IS_LCPRV1, IS_LC2 and IS_LCPRV2,
 * in that order, giving 2, 3, 3 and 4 respectively.  A lead byte that
 * matches none of them is assumed to be ASCII and gives 1.
 */
int
pg_mule_mblen(const unsigned char *s)
{
	if (IS_LC1(*s))
		return 2;
	if (IS_LCPRV1(*s))
		return 3;
	if (IS_LC2(*s))
		return 3;
	if (IS_LCPRV2(*s))
		return 4;

	/* assume ASCII */
	return 1;
}
390
391 /*
392  * ISO8859-1
393  */
394 static int
395 pg_latin12wchar_with_len(const unsigned char *from, pg_wchar * to, int len)
396 {
397         int cnt = 0;
398
399         while (*from && len-- > 0)
400         {
401                 *to++ = *from++;
402                 cnt++;
403         }
404         *to = 0;
405         return(cnt);
406 }
407
/*
 * Byte length of a single-byte-encoding character at s: always one byte.
 * The argument is not examined.
 */
static int
pg_latin1_mblen(const unsigned char *s)
{
	(void) s;
	return 1;
}
413
414 /*
415  * SJIS
416  */
/*
 * Byte length of the SJIS character starting at s.
 *
 * A lead byte of 0x7f or below (should be ASCII) or in 0xa1..0xdf
 * (1 byte kana?) is one byte long; any other lead byte (kanji?) starts
 * a two-byte character.
 */
static int
pg_sjis_mblen(const unsigned char *s)
{
	const unsigned char lead = *s;

	if (lead <= 0x7f || (lead >= 0xa1 && lead <= 0xdf))
		return 1;
	return 2;
}
436
437 /*
438  * Big5
439  */
/*
 * Byte length of the Big5 character starting at s:
 * 2 when the lead byte is above 0x7f, otherwise 1 (should be ASCII).
 */
static int
pg_big5_mblen(const unsigned char *s)
{
	return (*s > 0x7f) ? 2 : 1;
}
455
456 pg_wchar_tbl pg_wchar_table[] = {
457         {pg_ascii2wchar_with_len, pg_ascii_mblen},      /* 0 */
458         {pg_eucjp2wchar_with_len, pg_eucjp_mblen},      /* 1 */
459         {pg_euccn2wchar_with_len, pg_euccn_mblen},      /* 2 */
460         {pg_euckr2wchar_with_len, pg_euckr_mblen},      /* 3 */
461         {pg_euctw2wchar_with_len, pg_euctw_mblen},      /* 4 */
462         {pg_utf2wchar_with_len, pg_utf_mblen},          /* 5 */
463         {pg_mule2wchar_with_len, pg_mule_mblen},        /* 6 */
464         {pg_latin12wchar_with_len, pg_latin1_mblen},            /* 7 */
465         {pg_latin12wchar_with_len, pg_latin1_mblen},            /* 8 */
466         {pg_latin12wchar_with_len, pg_latin1_mblen},            /* 9 */
467         {pg_latin12wchar_with_len, pg_latin1_mblen},            /* 10 */
468         {pg_latin12wchar_with_len, pg_latin1_mblen},            /* 11 */
469         {pg_latin12wchar_with_len, pg_latin1_mblen},            /* 12 */
470         {pg_latin12wchar_with_len, pg_latin1_mblen},            /* 13 */
471         {pg_latin12wchar_with_len, pg_latin1_mblen},            /* 14 */
472         {pg_latin12wchar_with_len, pg_latin1_mblen},            /* 15 */
473         {pg_latin12wchar_with_len, pg_latin1_mblen},            /* 16 */
474         {pg_latin12wchar_with_len, pg_latin1_mblen},            /* 17 */
475         {pg_latin12wchar_with_len, pg_latin1_mblen},            /* 18 */
476         {pg_latin12wchar_with_len, pg_latin1_mblen},            /* 19 */
477         {pg_latin12wchar_with_len, pg_latin1_mblen},            /* 20 */
478         {pg_latin12wchar_with_len, pg_latin1_mblen},            /* 21 */
479         {pg_latin12wchar_with_len, pg_latin1_mblen},            /* 22 */
480         {pg_latin12wchar_with_len, pg_latin1_mblen},            /* 23 */
481         {pg_latin12wchar_with_len, pg_latin1_mblen},            /* 24 */
482         {pg_latin12wchar_with_len, pg_latin1_mblen},            /* 25 */
483         {pg_latin12wchar_with_len, pg_latin1_mblen},            /* 26 */
484         {pg_latin12wchar_with_len, pg_latin1_mblen},            /* 27 */
485         {pg_latin12wchar_with_len, pg_latin1_mblen},            /* 28 */
486         {pg_latin12wchar_with_len, pg_latin1_mblen},            /* 29 */
487         {pg_latin12wchar_with_len, pg_latin1_mblen},            /* 30 */
488         {pg_latin12wchar_with_len, pg_latin1_mblen},            /* 31 */
489         {0, pg_sjis_mblen},                     /* 32 */
490         {0, pg_big5_mblen},                     /* 33 */
491         {pg_latin12wchar_with_len, pg_latin1_mblen} /* 34 */
492 };
493
/*
 * Returns the byte length of a mule internal code character.
 * Simply forwards to pg_mule_mblen().
 */
int
pg_mic_mblen(const unsigned char *mbstr)
{
	int			nbytes;

	nbytes = pg_mule_mblen(mbstr);
	return nbytes;
}