1 /*-------------------------------------------------------------------------
3 * Utility functions for conversion procs.
5 * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
6 * Portions Copyright (c) 1994, Regents of the University of California
9 * src/backend/utils/mb/conv.c
11 *-------------------------------------------------------------------------
14 #include "mb/pg_wchar.h"
18 * local2local: a generic single-byte charset encoding
19 * conversion between two ASCII-superset encodings.
21 * l points to the source string of length len
22 * p is the output area (must be large enough!)
23 * src_encoding is the PG identifier for the source encoding
24 * dest_encoding is the PG identifier for the target encoding
25 * tab holds conversion entries for the source charset
26 * starting from 128 (0x80). Each entry in the table holds the corresponding
27 * code point for the target charset, or 0 if there is no equivalent code.
30 local2local(const unsigned char *l,
35 const unsigned char *tab)
44 report_invalid_encoding(src_encoding, (const char *) l, len);
45 if (!IS_HIGHBIT_SET(c1))
49 c2 = tab[c1 - HIGHBIT];
53 report_untranslatable_char(src_encoding, dest_encoding,
54 (const char *) l, len);
63 * LATINn ---> MIC when the charset's local codes map directly to MIC
65 * l points to the source string of length len
66 * p is the output area (must be large enough!)
67 * lc is the mule character set id for the local encoding
68 * encoding is the PG identifier for the local encoding
71 latin2mic(const unsigned char *l, unsigned char *p, int len,
80 report_invalid_encoding(encoding, (const char *) l, len);
81 if (IS_HIGHBIT_SET(c1))
91 * MIC ---> LATINn when the charset's local codes map directly to MIC
93 * mic points to the source string of length len
94 * p is the output area (must be large enough!)
95 * lc is the mule character set id for the local encoding
96 * encoding is the PG identifier for the local encoding
99 mic2latin(const unsigned char *mic, unsigned char *p, int len,
100 int lc, int encoding)
108 report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
109 if (!IS_HIGHBIT_SET(c1))
118 int l = pg_mic_mblen(mic);
121 report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
123 if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]))
124 report_untranslatable_char(PG_MULE_INTERNAL, encoding,
125 (const char *) mic, len);
138 * While ordinarily SQL_ASCII encoding is forgiving of high-bit-set
139 * characters, here we must take a hard line because we don't know
140 * the appropriate MIC equivalent.
143 pg_ascii2mic(const unsigned char *l, unsigned char *p, int len)
150 if (c1 == 0 || IS_HIGHBIT_SET(c1))
151 report_invalid_encoding(PG_SQL_ASCII, (const char *) l, len);
163 pg_mic2ascii(const unsigned char *mic, unsigned char *p, int len)
170 if (c1 == 0 || IS_HIGHBIT_SET(c1))
171 report_untranslatable_char(PG_MULE_INTERNAL, PG_SQL_ASCII,
172 (const char *) mic, len);
181 * latin2mic_with_table: a generic single-byte charset encoding
182 * conversion from a local charset to the mule internal code.
184 * l points to the source string of length len
185 * p is the output area (must be large enough!)
186 * lc is the mule character set id for the local encoding
187 * encoding is the PG identifier for the local encoding
188 * tab holds conversion entries for the local charset
189 * starting from 128 (0x80). Each entry in the table holds the corresponding
190 * code point for the mule encoding, or 0 if there is no equivalent code.
193 latin2mic_with_table(const unsigned char *l,
198 const unsigned char *tab)
207 report_invalid_encoding(encoding, (const char *) l, len);
208 if (!IS_HIGHBIT_SET(c1))
212 c2 = tab[c1 - HIGHBIT];
219 report_untranslatable_char(encoding, PG_MULE_INTERNAL,
220 (const char *) l, len);
229 * mic2latin_with_table: a generic single-byte charset encoding
230 * conversion from the mule internal code to a local charset.
232 * mic points to the source string of length len
233 * p is the output area (must be large enough!)
234 * lc is the mule character set id for the local encoding
235 * encoding is the PG identifier for the local encoding
236 * tab holds conversion entries for the mule internal code's second byte,
237 * starting from 128 (0x80). Each entry in the table holds the corresponding
238 * code point for the local charset, or 0 if there is no equivalent code.
241 mic2latin_with_table(const unsigned char *mic,
246 const unsigned char *tab)
255 report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
256 if (!IS_HIGHBIT_SET(c1))
265 int l = pg_mic_mblen(mic);
268 report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
270 if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]) ||
271 (c2 = tab[mic[1] - HIGHBIT]) == 0)
273 report_untranslatable_char(PG_MULE_INTERNAL, encoding,
274 (const char *) mic, len);
275 break; /* keep compiler quiet */
286 * comparison routine for bsearch()
287 * this routine is intended for UTF8 -> local code
290 compare1(const void *p1, const void *p2)
295 v1 = *(const uint32 *) p1;
296 v2 = ((const pg_utf_to_local *) p2)->utf;
297 return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
301 * comparison routine for bsearch()
302 * this routine is intended for local code -> UTF8
305 compare2(const void *p1, const void *p2)
310 v1 = *(const uint32 *) p1;
311 v2 = ((const pg_local_to_utf *) p2)->code;
312 return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
316 * comparison routine for bsearch()
317 * this routine is intended for combined UTF8 -> local code
320 compare3(const void *p1, const void *p2)
327 s1 = *(const uint32 *) p1;
328 s2 = *((const uint32 *) p1 + 1);
329 d1 = ((const pg_utf_to_local_combined *) p2)->utf1;
330 d2 = ((const pg_utf_to_local_combined *) p2)->utf2;
331 return (s1 > d1 || (s1 == d1 && s2 > d2)) ? 1 : ((s1 == d1 && s2 == d2) ? 0 : -1);
335 * comparison routine for bsearch()
336 * this routine is intended for local code -> combined UTF8
339 compare4(const void *p1, const void *p2)
344 v1 = *(const uint32 *) p1;
345 v2 = ((const pg_local_to_utf_combined *) p2)->code;
346 return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
350 * store a 32-bit character representation into a multibyte stream
352 static inline unsigned char *
353 store_coded_char(unsigned char *dest, uint32 code)
355 if (code & 0xff000000)
356 *dest++ = code >> 24;
357 if (code & 0x00ff0000)
358 *dest++ = code >> 16;
359 if (code & 0x0000ff00)
361 if (code & 0x000000ff)
367 * UTF8 ---> local code
369 * utf: input string in UTF8 encoding (need not be null-terminated)
370 * len: length of input string (in bytes)
371 * iso: pointer to the output area (must be large enough!)
372 (output string will be null-terminated)
373 * map: conversion map for single characters
374 * mapsize: number of entries in the conversion map
375 * cmap: conversion map for combined characters
376 * (optional, pass NULL if none)
377 * cmapsize: number of entries in the conversion map for combined characters
378 * (optional, pass 0 if none)
379 * conv_func: algorithmic encoding conversion function
380 * (optional, pass NULL if none)
381 * encoding: PG identifier for the local encoding
383 * For each character, the cmap (if provided) is consulted first; if no match,
384 * the map is consulted next; if still no match, the conv_func (if provided)
385 * is applied. An error is raised if no match is found.
387 * See pg_wchar.h for more details about the data structures used here.
390 UtfToLocal(const unsigned char *utf, int len,
392 const pg_utf_to_local *map, int mapsize,
393 const pg_utf_to_local_combined *cmap, int cmapsize,
394 utf_local_conversion_func conv_func,
399 const pg_utf_to_local *p;
400 const pg_utf_to_local_combined *cp;
402 if (!PG_VALID_ENCODING(encoding))
404 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
405 errmsg("invalid encoding number: %d", encoding)));
407 for (; len > 0; len -= l)
409 /* "break" cases all represent errors */
413 l = pg_utf_mblen(utf);
417 if (!pg_utf8_islegal(utf, l))
422 /* ASCII case is easy, assume it's one-to-one conversion */
427 /* collect coded char of length l */
442 iutf |= *utf++ << 16;
448 elog(ERROR, "unsupported character length %d", l);
449 iutf = 0; /* keep compiler quiet */
452 /* First, try with combined map if possible */
455 const unsigned char *utf_save = utf;
459 /* collect next character, same as above */
462 l = pg_utf_mblen(utf);
466 if (!pg_utf8_islegal(utf, l))
469 /* We assume an ASCII character cannot be in the combined map */
482 iutf2 = *utf++ << 16;
483 iutf2 |= *utf++ << 8;
488 iutf2 = *utf++ << 24;
489 iutf2 |= *utf++ << 16;
490 iutf2 |= *utf++ << 8;
495 elog(ERROR, "unsupported character length %d", l);
496 iutf2 = 0; /* keep compiler quiet */
502 cp = bsearch(cutf, cmap, cmapsize,
503 sizeof(pg_utf_to_local_combined), compare3);
507 iso = store_coded_char(iso, cp->code);
512 /* fail, so back up to reprocess second character next time */
518 /* Now check ordinary map */
519 p = bsearch(&iutf, map, mapsize,
520 sizeof(pg_utf_to_local), compare1);
524 iso = store_coded_char(iso, p->code);
528 /* if there's a conversion function, try that */
531 uint32 converted = (*conv_func) (iutf);
535 iso = store_coded_char(iso, converted);
540 /* failed to translate this character */
541 report_untranslatable_char(PG_UTF8, encoding,
542 (const char *) (utf - l), len);
545 /* if we broke out of loop early, must be invalid input */
547 report_invalid_encoding(PG_UTF8, (const char *) utf, len);
553 * local code ---> UTF8
555 * iso: input string in local encoding (need not be null-terminated)
556 * len: length of input string (in bytes)
557 * utf: pointer to the output area (must be large enough!)
558 (output string will be null-terminated)
559 * map: conversion map for single characters
560 * mapsize: number of entries in the conversion map
561 * cmap: conversion map for combined characters
562 * (optional, pass NULL if none)
563 * cmapsize: number of entries in the conversion map for combined characters
564 * (optional, pass 0 if none)
565 * conv_func: algorithmic encoding conversion function
566 * (optional, pass NULL if none)
567 * encoding: PG identifier for the local encoding
569 * For each character, the map is consulted first; if no match, the cmap
570 * (if provided) is consulted next; if still no match, the conv_func
571 * (if provided) is applied. An error is raised if no match is found.
573 * See pg_wchar.h for more details about the data structures used here.
576 LocalToUtf(const unsigned char *iso, int len,
578 const pg_local_to_utf *map, int mapsize,
579 const pg_local_to_utf_combined *cmap, int cmapsize,
580 utf_local_conversion_func conv_func,
585 const pg_local_to_utf *p;
586 const pg_local_to_utf_combined *cp;
588 if (!PG_VALID_ENCODING(encoding))
590 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
591 errmsg("invalid encoding number: %d", encoding)));
593 for (; len > 0; len -= l)
595 /* "break" cases all represent errors */
599 if (!IS_HIGHBIT_SET(*iso))
601 /* ASCII case is easy, assume it's one-to-one conversion */
607 l = pg_encoding_verifymb(encoding, (const char *) iso, len);
611 /* collect coded char of length l */
628 iiso |= *iso++ << 16;
634 elog(ERROR, "unsupported character length %d", l);
635 iiso = 0; /* keep compiler quiet */
638 /* First check ordinary map */
639 p = bsearch(&iiso, map, mapsize,
640 sizeof(pg_local_to_utf), compare2);
644 utf = store_coded_char(utf, p->utf);
648 /* If there's a combined character map, try that */
651 cp = bsearch(&iiso, cmap, cmapsize,
652 sizeof(pg_local_to_utf_combined), compare4);
656 utf = store_coded_char(utf, cp->utf1);
657 utf = store_coded_char(utf, cp->utf2);
662 /* if there's a conversion function, try that */
665 uint32 converted = (*conv_func) (iiso);
669 utf = store_coded_char(utf, converted);
674 /* failed to translate this character */
675 report_untranslatable_char(encoding, PG_UTF8,
676 (const char *) (iso - l), len);
679 /* if we broke out of loop early, must be invalid input */
681 report_invalid_encoding(encoding, (const char *) iso, len);