From 72dd6291f216440f6bb61a8733729a37c7e3b2d2 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Wed, 4 Jul 2012 17:10:10 -0400 Subject: [PATCH] Add wchar -> mb conversion routines. This is infrastructure for Alexander Korotkov's work on indexing regular expression searches. Alexander Korotkov, with a bit of further hackery on the MULE conversion by me --- src/backend/utils/mb/mbutils.c | 22 +++ src/backend/utils/mb/wchar.c | 247 +++++++++++++++++++++++++++------ src/include/mb/pg_wchar.h | 24 +++- 3 files changed, 248 insertions(+), 45 deletions(-) diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c index 848c26f41f..287ff808fc 100644 --- a/src/backend/utils/mb/mbutils.c +++ b/src/backend/utils/mb/mbutils.c @@ -710,6 +710,28 @@ pg_encoding_mb2wchar_with_len(int encoding, return (*pg_wchar_table[encoding].mb2wchar_with_len) ((const unsigned char *) from, to, len); } +/* convert a wchar string to a multibyte */ +int +pg_wchar2mb(const pg_wchar *from, char *to) +{ + return (*pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len) (from, (unsigned char *)to, pg_wchar_strlen(from)); +} + +/* convert a wchar string to a multibyte with a limited length */ +int +pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len) +{ + return (*pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len) (from, (unsigned char *)to, len); +} + +/* same, with any encoding */ +int +pg_encoding_wchar2mb_with_len(int encoding, + const pg_wchar *from, char *to, int len) +{ + return (*pg_wchar_table[encoding].wchar2mb_with_len) (from, (unsigned char *)to, len); +} + /* returns the byte length of a multibyte character */ int pg_mblen(const char *mbstr) diff --git a/src/backend/utils/mb/wchar.c b/src/backend/utils/mb/wchar.c index 2e1db5ebf4..1755be5ebf 100644 --- a/src/backend/utils/mb/wchar.c +++ b/src/backend/utils/mb/wchar.c @@ -339,6 +339,54 @@ pg_euctw_dsplen(const unsigned char *s) return len; } +/* + * Convert pg_wchar to EUC_* encoding. + * caller must allocate enough space for "to", including a trailing zero! + * len: length of from. + * "from" not necessarily null terminated. + */ +static int +pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len) +{ + int cnt = 0; + + while (len > 0 && *from) + { + unsigned char c; + + if ((c = *from >> 24)) + { + *to++ = c; + *to++ = (*from >> 16) & 0xff; + *to++ = (*from >> 8) & 0xff; + *to++ = *from & 0xff; + cnt += 4; + } + else if ((c = *from >> 16)) + { + *to++ = c; + *to++ = (*from >> 8) & 0xff; + *to++ = *from & 0xff; + cnt += 3; + } + else if ((c = *from >> 8)) + { + *to++ = c; + *to++ = *from & 0xff; + cnt += 2; + } + else + { + *to++ = *from; + cnt++; + } + len--; + } + *to = 0; + return cnt; +} + + /* * JOHAB */ @@ -453,6 +501,30 @@ unicode_to_utf8(pg_wchar c, unsigned char *utf8string) return utf8string; } +/* + * Trivial conversion from pg_wchar to UTF-8. + * caller should allocate enough space for "to" + * len: length of from. + * "from" not necessarily null terminated. + */ +static int +pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len) +{ + int cnt = 0; + + while (len > 0 && *from) + { + int char_len; + + unicode_to_utf8(*from, to); + char_len = pg_utf_mblen(to); + len--; + cnt += char_len; + to += char_len; + } + *to = 0; + return cnt; +} /* * Return the byte length of a UTF8 character pointed to by s @@ -719,6 +791,75 @@ pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len) return cnt; } +/* + * convert pg_wchar to mule internal code + * caller should allocate enough space for "to" + * len: length of from. + * "from" not necessarily null terminated. + */ +static int +pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len) +{ + int cnt = 0; + unsigned char lb; + + while (len > 0 && *from) + { + lb = (*from >> 16) & 0xff; + if (IS_LC1(lb)) + { + *to++ = lb; + *to++ = *from & 0xff; + cnt += 2; + } + else if (IS_LC2(lb)) + { + *to++ = lb; + *to++ = (*from >> 8) & 0xff; + *to++ = *from & 0xff; + cnt += 3; + } + else if (IS_LCPRV1_A_RANGE(lb)) + { + *to++ = LCPRV1_A; + *to++ = lb; + *to++ = *from & 0xff; + cnt += 3; + } + else if (IS_LCPRV1_B_RANGE(lb)) + { + *to++ = LCPRV1_B; + *to++ = lb; + *to++ = *from & 0xff; + cnt += 3; + } + else if (IS_LCPRV2_A_RANGE(lb)) + { + *to++ = LCPRV2_A; + *to++ = lb; + *to++ = (*from >> 8) & 0xff; + *to++ = *from & 0xff; + cnt += 4; + } + else if (IS_LCPRV2_B_RANGE(lb)) + { + *to++ = LCPRV2_B; + *to++ = lb; + *to++ = (*from >> 8) & 0xff; + *to++ = *from & 0xff; + cnt += 4; + } + else + { + *to++ = lb; + cnt += 1; + } + len--; + } + *to = 0; + return cnt; +} + int pg_mule_mblen(const unsigned char *s) { @@ -780,6 +921,28 @@ pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len) return cnt; } +/* + * Trivial conversion from pg_wchar to single byte encoding. Just ignores + * high bits. + * caller should allocate enough space for "to" + * len: length of from. + * "from" not necessarily null terminated. + */ +static int +pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len) +{ + int cnt = 0; + + while (len > 0 && *from) + { + *to++ = *from++; + len--; + cnt++; + } + *to = 0; + return cnt; +} + static int pg_latin1_mblen(const unsigned char *s) { @@ -1556,48 +1719,48 @@ pg_eucjp_increment(unsigned char *charptr, int length) *------------------------------------------------------------------- */ pg_wchar_tbl pg_wchar_table[] = { - {pg_ascii2wchar_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifier, 1}, /* PG_SQL_ASCII */ - {pg_eucjp2wchar_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3}, /* PG_EUC_JP */ - {pg_euccn2wchar_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifier, 2}, /* PG_EUC_CN */ - {pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifier, 3}, /* PG_EUC_KR */ - {pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifier, 4}, /* PG_EUC_TW */ - {pg_eucjp2wchar_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3}, /* PG_EUC_JIS_2004 */ - {pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifier, 4}, /* PG_UTF8 */ - {pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifier, 4}, /* PG_MULE_INTERNAL */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN1 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN2 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN3 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN4 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN5 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN6 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN7 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN8 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN9 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN10 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1256 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1258 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN866 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN874 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_KOI8R */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1251 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1252 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-5 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-6 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-7 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-8 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1250 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1253 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1254 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1255 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1257 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_KOI8U */ - {0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2}, /* PG_SJIS */ - {0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifier, 2}, /* PG_BIG5 */ - {0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifier, 2}, /* PG_GBK */ - {0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifier, 2}, /* PG_UHC */ - {0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifier, 4}, /* PG_GB18030 */ - {0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifier, 3}, /* PG_JOHAB */ - {0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2} /* PG_SHIFT_JIS_2004 */ + {pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifier, 1}, /* PG_SQL_ASCII */ + {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3}, /* PG_EUC_JP */ + {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifier, 2}, /* PG_EUC_CN */ + {pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifier, 3}, /* PG_EUC_KR */ + {pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifier, 4}, /* PG_EUC_TW */ + {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3}, /* PG_EUC_JIS_2004 */ + {pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifier, 4}, /* PG_UTF8 */ + {pg_mule2wchar_with_len, pg_wchar2mule_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifier, 4}, /* PG_MULE_INTERNAL */ + {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN1 */ + {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN2 */ + {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN3 */ + {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN4 */ + {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN5 */ + {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN6 */ + {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN7 */ + {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN8 */ + {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN9 */ + {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN10 */ + {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1256 */ + {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1258 */ + {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN866 */ + {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN874 */ + {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_KOI8R */ + {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1251 */ + {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1252 */ + {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-5 */ + {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-6 */ + {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-7 */ + {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-8 */ + {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1250 */ + {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1253 */ + {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1254 */ + {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1255 */ + {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1257 */ + {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_KOI8U */ + {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2}, /* PG_SJIS */ + {0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifier, 2}, /* PG_BIG5 */ + {0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifier, 2}, /* PG_GBK */ + {0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifier, 2}, /* PG_UHC */ + {0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifier, 4}, /* PG_GB18030 */ + {0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifier, 3}, /* PG_JOHAB */ + {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2} /* PG_SHIFT_JIS_2004 */ }; /* returns the byte length of a word for mule internal code */ diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h index 1d6d9554bd..1bcdfbceb9 100644 --- a/src/include/mb/pg_wchar.h +++ b/src/include/mb/pg_wchar.h @@ -140,6 +140,10 @@ typedef unsigned int pg_wchar; #define LCPRV1_A 0x9a #define LCPRV1_B 0x9b #define IS_LCPRV1(c) ((unsigned char)(c) == LCPRV1_A || (unsigned char)(c) == LCPRV1_B) +#define IS_LCPRV1_A_RANGE(c) \ + ((unsigned char)(c) >= 0xa0 && (unsigned char)(c) <= 0xdf) +#define IS_LCPRV1_B_RANGE(c) \ + ((unsigned char)(c) >= 0xe0 && (unsigned char)(c) <= 0xef) /* * Postgres-specific prefix bytes for "private" multibyte encodings @@ -148,6 +152,10 @@ typedef unsigned int pg_wchar; #define LCPRV2_A 0x9c #define LCPRV2_B 0x9d #define IS_LCPRV2(c) ((unsigned char)(c) == LCPRV2_A || (unsigned char)(c) == LCPRV2_B) +#define IS_LCPRV2_A_RANGE(c) \ + ((unsigned char)(c) >= 0xf0 && (unsigned char)(c) <= 0xf4) +#define IS_LCPRV2_B_RANGE(c) \ + ((unsigned char)(c) >= 0xf5 && (unsigned char)(c) <= 0xfe) /* * Charset IDs for private single byte encodings (0xa0-0xef) @@ -324,7 +332,11 @@ extern pg_enc2gettext pg_enc2gettext_tbl[]; * pg_wchar stuff */ typedef int (*mb2wchar_with_len_converter) (const unsigned char *from, - pg_wchar *to, + pg_wchar *to, + int len); + +typedef int (*wchar2mb_with_len_converter) (const pg_wchar *from, + unsigned char *to, int len); typedef int (*mblen_converter) (const unsigned char *mbstr); @@ -337,8 +349,10 @@ typedef int (*mbverifier) (const unsigned char *mbstr, int len); typedef struct { - mb2wchar_with_len_converter mb2wchar_with_len; /* convert a multibyte - * string to a wchar */ + mb2wchar_with_len_converter mb2wchar_with_len; /* convert a multibyte + * string to a wchar */ + wchar2mb_with_len_converter wchar2mb_with_len; /* convert a wchar + * string to a multibyte */ mblen_converter mblen; /* get byte length of a char */ mbdisplaylen_converter dsplen; /* get display width of a char */ mbverifier mbverify; /* verify multibyte sequence */ @@ -419,6 +433,10 @@ extern int pg_mb2wchar(const char *from, pg_wchar *to); extern int pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len); extern int pg_encoding_mb2wchar_with_len(int encoding, const char *from, pg_wchar *to, int len); +extern int pg_wchar2mb(const pg_wchar *from, char *to); +extern int pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len); +extern int pg_encoding_wchar2mb_with_len(int encoding, + const pg_wchar *from, char *to, int len); extern int pg_char_and_wchar_strcmp(const char *s1, const pg_wchar *s2); extern int pg_wchar_strncmp(const pg_wchar *s1, const pg_wchar *s2, size_t n); extern int pg_char_and_wchar_strncmp(const char *s1, const pg_wchar *s2, size_t n); -- 2.40.0