From: Tom Lane Date: Sat, 23 Apr 2011 16:35:41 +0000 (-0400) Subject: Fix char2wchar/wchar2char to support collations properly. X-Git-Tag: REL9_1_BETA1~41 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=2ab0796d7a3a7116a79b65531fd33f1548514b52;p=postgresql Fix char2wchar/wchar2char to support collations properly. These functions should take a pg_locale_t, not a collation OID, and should call mbstowcs_l/wcstombs_l where available. Where those functions are not available, temporarily select the correct locale with uselocale(). This change removes the bogus assumption that all locales selectable in a given database have the same wide-character conversion method; in particular, the collate.linux.utf8 regression test now passes with LC_CTYPE=C, so long as the database encoding is UTF8. I decided to move the char2wchar/wchar2char functions out of mbutils.c and into pg_locale.c, because they work on wchar_t not pg_wchar_t and thus don't really belong with the mbutils.c functions. Keeping them where they were would have required importing pg_locale_t into pg_wchar.h somehow, which did not seem like a good plan. 
--- diff --git a/configure b/configure index 69c7418f38..b0a10fbbc1 100755 --- a/configure +++ b/configure @@ -18985,7 +18985,8 @@ fi -for ac_func in cbrt dlopen fcvt fdatasync getifaddrs getpeereid getpeerucred getrlimit memmove poll pstat readlink scandir setproctitle setsid sigprocmask symlink sysconf towlower utime utimes waitpid wcstombs + +for ac_func in cbrt dlopen fcvt fdatasync getifaddrs getpeereid getpeerucred getrlimit memmove poll pstat readlink scandir setproctitle setsid sigprocmask symlink sysconf towlower utime utimes waitpid wcstombs wcstombs_l do as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh` { $as_echo "$as_me:$LINENO: checking for $ac_func" >&5 diff --git a/configure.in b/configure.in index 3c4089f38e..03ff57d4bc 100644 --- a/configure.in +++ b/configure.in @@ -1187,7 +1187,7 @@ PGAC_VAR_INT_TIMEZONE AC_FUNC_ACCEPT_ARGTYPES PGAC_FUNC_GETTIMEOFDAY_1ARG -AC_CHECK_FUNCS([cbrt dlopen fcvt fdatasync getifaddrs getpeereid getpeerucred getrlimit memmove poll pstat readlink scandir setproctitle setsid sigprocmask symlink sysconf towlower utime utimes waitpid wcstombs]) +AC_CHECK_FUNCS([cbrt dlopen fcvt fdatasync getifaddrs getpeereid getpeerucred getrlimit memmove poll pstat readlink scandir setproctitle setsid sigprocmask symlink sysconf towlower utime utimes waitpid wcstombs wcstombs_l]) AC_REPLACE_FUNCS(fseeko) case $host_os in diff --git a/src/backend/tsearch/ts_locale.c b/src/backend/tsearch/ts_locale.c index b8ae0fe65e..dcaf18b00c 100644 --- a/src/backend/tsearch/ts_locale.c +++ b/src/backend/tsearch/ts_locale.c @@ -29,11 +29,12 @@ t_isdigit(const char *ptr) int clen = pg_mblen(ptr); wchar_t character[2]; Oid collation = DEFAULT_COLLATION_OID; /* TODO */ + pg_locale_t mylocale = 0; /* TODO */ if (clen == 1 || lc_ctype_is_c(collation)) return isdigit(TOUCHAR(ptr)); - char2wchar(character, 2, ptr, clen, collation); + char2wchar(character, 2, ptr, clen, mylocale); return iswdigit((wint_t) character[0]); } @@ -44,11 +45,12 @@ t_isspace(const 
char *ptr) int clen = pg_mblen(ptr); wchar_t character[2]; Oid collation = DEFAULT_COLLATION_OID; /* TODO */ + pg_locale_t mylocale = 0; /* TODO */ if (clen == 1 || lc_ctype_is_c(collation)) return isspace(TOUCHAR(ptr)); - char2wchar(character, 2, ptr, clen, collation); + char2wchar(character, 2, ptr, clen, mylocale); return iswspace((wint_t) character[0]); } @@ -59,11 +61,12 @@ t_isalpha(const char *ptr) int clen = pg_mblen(ptr); wchar_t character[2]; Oid collation = DEFAULT_COLLATION_OID; /* TODO */ + pg_locale_t mylocale = 0; /* TODO */ if (clen == 1 || lc_ctype_is_c(collation)) return isalpha(TOUCHAR(ptr)); - char2wchar(character, 2, ptr, clen, collation); + char2wchar(character, 2, ptr, clen, mylocale); return iswalpha((wint_t) character[0]); } @@ -74,11 +77,12 @@ t_isprint(const char *ptr) int clen = pg_mblen(ptr); wchar_t character[2]; Oid collation = DEFAULT_COLLATION_OID; /* TODO */ + pg_locale_t mylocale = 0; /* TODO */ if (clen == 1 || lc_ctype_is_c(collation)) return isprint(TOUCHAR(ptr)); - char2wchar(character, 2, ptr, clen, collation); + char2wchar(character, 2, ptr, clen, mylocale); return iswprint((wint_t) character[0]); } @@ -246,6 +250,7 @@ lowerstr_with_len(const char *str, int len) #ifdef USE_WIDE_UPPER_LOWER Oid collation = DEFAULT_COLLATION_OID; /* TODO */ + pg_locale_t mylocale = 0; /* TODO */ #endif if (len == 0) @@ -272,7 +277,7 @@ lowerstr_with_len(const char *str, int len) */ wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1)); - wlen = char2wchar(wstr, len + 1, str, len, collation); + wlen = char2wchar(wstr, len + 1, str, len, mylocale); Assert(wlen <= len); while (*wptr) @@ -287,7 +292,7 @@ lowerstr_with_len(const char *str, int len) len = pg_database_encoding_max_length() * wlen + 1; out = (char *) palloc(len); - wlen = wchar2char(out, wstr, len, collation); + wlen = wchar2char(out, wstr, len, mylocale); pfree(wstr); diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c index 
47d777a3e6..3176ddc696 100644 --- a/src/backend/tsearch/wparser_def.c +++ b/src/backend/tsearch/wparser_def.c @@ -300,13 +300,14 @@ TParserInit(char *str, int len) if (prs->charmaxlen > 1) { Oid collation = DEFAULT_COLLATION_OID; /* TODO */ + pg_locale_t mylocale = 0; /* TODO */ prs->usewide = true; if (lc_ctype_is_c(collation)) { /* * char2wchar doesn't work for C-locale and sizeof(pg_wchar) could - * be not equal to sizeof(wchar_t) + * be different from sizeof(wchar_t) */ prs->pgwstr = (pg_wchar *) palloc(sizeof(pg_wchar) * (prs->lenstr + 1)); pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr); @@ -314,7 +315,8 @@ TParserInit(char *str, int len) else { prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1)); - char2wchar(prs->wstr, prs->lenstr + 1, prs->str, prs->lenstr, collation); + char2wchar(prs->wstr, prs->lenstr + 1, prs->str, prs->lenstr, + mylocale); } } else diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c index f895bbbb8b..726a1f4552 100644 --- a/src/backend/utils/adt/formatting.c +++ b/src/backend/utils/adt/formatting.c @@ -1454,6 +1454,10 @@ str_numth(char *dest, char *num, int type) return dest; } +/***************************************************************************** + * upper/lower/initcap functions + *****************************************************************************/ + /* * If the system provides the needed functions for wide-character manipulation * (which are all standardized by C99), then we implement upper/lower/initcap @@ -1527,7 +1531,7 @@ str_tolower(const char *buff, size_t nbytes, Oid collid) /* Output workspace cannot have more codes than input bytes */ workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t)); - char2wchar(workspace, nbytes + 1, buff, nbytes, collid); + char2wchar(workspace, nbytes + 1, buff, nbytes, mylocale); for (curr_char = 0; workspace[curr_char] != 0; curr_char++) { @@ -1543,7 +1547,7 @@ str_tolower(const char *buff, size_t nbytes, Oid 
collid) result_size = curr_char * pg_database_encoding_max_length() + 1; result = palloc(result_size); - wchar2char(result, workspace, result_size, collid); + wchar2char(result, workspace, result_size, mylocale); pfree(workspace); } #endif /* USE_WIDE_UPPER_LOWER */ @@ -1648,7 +1652,7 @@ str_toupper(const char *buff, size_t nbytes, Oid collid) /* Output workspace cannot have more codes than input bytes */ workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t)); - char2wchar(workspace, nbytes + 1, buff, nbytes, collid); + char2wchar(workspace, nbytes + 1, buff, nbytes, mylocale); for (curr_char = 0; workspace[curr_char] != 0; curr_char++) { @@ -1664,7 +1668,7 @@ str_toupper(const char *buff, size_t nbytes, Oid collid) result_size = curr_char * pg_database_encoding_max_length() + 1; result = palloc(result_size); - wchar2char(result, workspace, result_size, collid); + wchar2char(result, workspace, result_size, mylocale); pfree(workspace); } #endif /* USE_WIDE_UPPER_LOWER */ @@ -1781,7 +1785,7 @@ str_initcap(const char *buff, size_t nbytes, Oid collid) /* Output workspace cannot have more codes than input bytes */ workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t)); - char2wchar(workspace, nbytes + 1, buff, nbytes, collid); + char2wchar(workspace, nbytes + 1, buff, nbytes, mylocale); for (curr_char = 0; workspace[curr_char] != 0; curr_char++) { @@ -1809,7 +1813,7 @@ str_initcap(const char *buff, size_t nbytes, Oid collid) result_size = curr_char * pg_database_encoding_max_length() + 1; result = palloc(result_size); - wchar2char(result, workspace, result_size, collid); + wchar2char(result, workspace, result_size, mylocale); pfree(workspace); } #endif /* USE_WIDE_UPPER_LOWER */ diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 0e6723d469..8208d3cad9 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -1030,3 +1030,176 @@ pg_newlocale_from_collation(Oid collid) return 
cache_entry->locale; } + + +/* + * These functions convert from/to libc's wchar_t, *not* pg_wchar_t. + * Therefore we keep them here rather than with the mbutils code. + */ + +#ifdef USE_WIDE_UPPER_LOWER + +/* + * wchar2char --- convert wide characters to multibyte format + * + * This has the same API as the standard wcstombs_l() function; in particular, + * tolen is the maximum number of bytes to store at *to, and *from must be + * zero-terminated. The output will be zero-terminated iff there is room. + */ +size_t +wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale) +{ + size_t result; + + if (tolen == 0) + return 0; + +#ifdef WIN32 + + /* + * On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding, and + * for some reason mbstowcs and wcstombs won't do this for us, so we use + * MultiByteToWideChar(). + */ + if (GetDatabaseEncoding() == PG_UTF8) + { + result = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen, + NULL, NULL); + /* A zero return is failure */ + if (result <= 0) + result = -1; + else + { + Assert(result <= tolen); + /* Microsoft counts the zero terminator in the result */ + result--; + } + } + else +#endif /* WIN32 */ + if (locale == (pg_locale_t) 0) + { + /* Use wcstombs directly for the default locale */ + result = wcstombs(to, from, tolen); + } + else + { +#ifdef HAVE_LOCALE_T +#ifdef HAVE_WCSTOMBS_L + /* Use wcstombs_l for nondefault locales */ + result = wcstombs_l(to, from, tolen, locale); +#else /* !HAVE_WCSTOMBS_L */ + /* We have to temporarily set the locale as current ... 
ugh */ + locale_t save_locale = uselocale(locale); + + result = wcstombs(to, from, tolen); + + uselocale(save_locale); +#endif /* HAVE_WCSTOMBS_L */ +#else /* !HAVE_LOCALE_T */ + /* Can't have locale != 0 without HAVE_LOCALE_T */ + elog(ERROR, "wcstombs_l is not available"); + result = 0; /* keep compiler quiet */ +#endif /* HAVE_LOCALE_T */ + } + + return result; +} + +/* + * char2wchar --- convert multibyte characters to wide characters + * + * This has almost the API of mbstowcs_l(), except that *from need not be + * null-terminated; instead, the number of input bytes is specified as + * fromlen. Also, we ereport() rather than returning -1 for invalid + * input encoding. tolen is the maximum number of wchar_t's to store at *to. + * The output will be zero-terminated iff there is room. + */ +size_t +char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen, + pg_locale_t locale) +{ + size_t result; + + if (tolen == 0) + return 0; + +#ifdef WIN32 + /* See WIN32 "Unicode" comment above */ + if (GetDatabaseEncoding() == PG_UTF8) + { + /* Win32 API does not work for zero-length input */ + if (fromlen == 0) + result = 0; + else + { + result = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen - 1); + /* A zero return is failure */ + if (result == 0) + result = -1; + } + + if (result != -1) + { + Assert(result < tolen); + /* Append trailing null wchar (MultiByteToWideChar() does not) */ + to[result] = 0; + } + } + else +#endif /* WIN32 */ + { + /* mbstowcs requires ending '\0' */ + char *str = pnstrdup(from, fromlen); + + if (locale == (pg_locale_t) 0) + { + /* Use mbstowcs directly for the default locale */ + result = mbstowcs(to, str, tolen); + } + else + { +#ifdef HAVE_LOCALE_T +#ifdef HAVE_WCSTOMBS_L + /* Use mbstowcs_l for nondefault locales */ + result = mbstowcs_l(to, str, tolen, locale); +#else /* !HAVE_WCSTOMBS_L */ + /* We have to temporarily set the locale as current ... 
ugh */ + locale_t save_locale = uselocale(locale); + + result = mbstowcs(to, str, tolen); + + uselocale(save_locale); +#endif /* HAVE_WCSTOMBS_L */ +#else /* !HAVE_LOCALE_T */ + /* Can't have locale != 0 without HAVE_LOCALE_T */ + elog(ERROR, "mbstowcs_l is not available"); + result = 0; /* keep compiler quiet */ +#endif /* HAVE_LOCALE_T */ + } + + pfree(str); + } + + if (result == -1) + { + /* + * Invalid multibyte character encountered. We try to give a useful + * error message by letting pg_verifymbstr check the string. But it's + * possible that the string is OK to us, and not OK to mbstowcs --- + * this suggests that the LC_CTYPE locale is different from the + * database encoding. Give a generic error message if verifymbstr + * can't find anything wrong. + */ + pg_verifymbstr(from, fromlen, false); /* might not return */ + /* but if it does ... */ + ereport(ERROR, + (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), + errmsg("invalid multibyte character for locale"), + errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding."))); + } + + return result; +} + +#endif /* USE_WIDE_UPPER_LOWER */ diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c index 3cb7ce3269..848c26f41f 100644 --- a/src/backend/utils/mb/mbutils.c +++ b/src/backend/utils/mb/mbutils.c @@ -13,7 +13,6 @@ #include "mb/pg_wchar.h" #include "utils/builtins.h" #include "utils/memutils.h" -#include "utils/pg_locale.h" #include "utils/syscache.h" /* @@ -689,126 +688,6 @@ perform_default_encoding_conversion(const char *src, int len, bool is_client_to_ } - -#ifdef USE_WIDE_UPPER_LOWER - -/* - * wchar2char --- convert wide characters to multibyte format - * - * This has the same API as the standard wcstombs() function; in particular, - * tolen is the maximum number of bytes to store at *to, and *from must be - * zero-terminated. The output will be zero-terminated iff there is room. 
- */ -size_t -wchar2char(char *to, const wchar_t *from, size_t tolen, Oid collation) -{ - size_t result; - - if (tolen == 0) - return 0; - -#ifdef WIN32 - - /* - * On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding, and - * for some reason mbstowcs and wcstombs won't do this for us, so we use - * MultiByteToWideChar(). - */ - if (GetDatabaseEncoding() == PG_UTF8) - { - result = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen, - NULL, NULL); - /* A zero return is failure */ - if (result <= 0) - result = -1; - else - { - Assert(result <= tolen); - /* Microsoft counts the zero terminator in the result */ - result--; - } - } - else -#endif /* WIN32 */ - { - Assert(!lc_ctype_is_c(collation)); - result = wcstombs(to, from, tolen); - } - return result; -} - -/* - * char2wchar --- convert multibyte characters to wide characters - * - * This has almost the API of mbstowcs(), except that *from need not be - * null-terminated; instead, the number of input bytes is specified as - * fromlen. Also, we ereport() rather than returning -1 for invalid - * input encoding. tolen is the maximum number of wchar_t's to store at *to. - * The output will be zero-terminated iff there is room. 
- */ -size_t -char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen, Oid collation) -{ - size_t result; - - if (tolen == 0) - return 0; - -#ifdef WIN32 - /* See WIN32 "Unicode" comment above */ - if (GetDatabaseEncoding() == PG_UTF8) - { - /* Win32 API does not work for zero-length input */ - if (fromlen == 0) - result = 0; - else - { - result = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen - 1); - /* A zero return is failure */ - if (result == 0) - result = -1; - } - - if (result != -1) - { - Assert(result < tolen); - /* Append trailing null wchar (MultiByteToWideChar() does not) */ - to[result] = 0; - } - } - else -#endif /* WIN32 */ - { - /* mbstowcs requires ending '\0' */ - char *str = pnstrdup(from, fromlen); - - Assert(!lc_ctype_is_c(collation)); - result = mbstowcs(to, str, tolen); - pfree(str); - } - - if (result == -1) - { - /* - * Invalid multibyte character encountered. We try to give a useful - * error message by letting pg_verifymbstr check the string. But it's - * possible that the string is OK to us, and not OK to mbstowcs --- - * this suggests that the LC_CTYPE locale is different from the - * database encoding. Give a generic error message if verifymbstr - * can't find anything wrong. - */ - pg_verifymbstr(from, fromlen, false); /* might not return */ - /* but if it does ... 
*/ - ereport(ERROR, - (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), - errmsg("invalid multibyte character for locale"), - errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding."))); - } - - return result; -} -#endif - /* convert a multibyte string to a wchar */ int pg_mb2wchar(const char *from, pg_wchar *to) diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h index 8efc6d3046..826c7af53b 100644 --- a/src/include/mb/pg_wchar.h +++ b/src/include/mb/pg_wchar.h @@ -19,8 +19,6 @@ #ifndef PG_WCHAR_H #define PG_WCHAR_H -#include - /* * The pg_wchar type */ @@ -392,11 +390,6 @@ extern int pg_mbcharcliplen(const char *mbstr, int len, int imit); extern int pg_encoding_max_length(int encoding); extern int pg_database_encoding_max_length(void); -#ifdef USE_WIDE_UPPER_LOWER -extern size_t wchar2char(char *to, const wchar_t *from, size_t tolen, Oid collation); -extern size_t char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen, Oid collation); -#endif - extern int PrepareClientEncoding(int encoding); extern int SetClientEncoding(int encoding); extern void InitializeClientEncoding(void); diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 3ee1d077a5..04560c74bf 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -656,6 +656,9 @@ /* Define to 1 if you have the `wcstombs' function. */ #undef HAVE_WCSTOMBS +/* Define to 1 if you have the `wcstombs_l' function. */ +#undef HAVE_WCSTOMBS_L + /* Define to 1 if you have the <wctype.h> header file. */ #undef HAVE_WCTYPE_H diff --git a/src/include/pg_config.h.win32 b/src/include/pg_config.h.win32 index 177bca1bd5..b85bf411de 100644 --- a/src/include/pg_config.h.win32 +++ b/src/include/pg_config.h.win32 @@ -538,6 +538,9 @@ /* Define to 1 if you have the `wcstombs' function. */ #define HAVE_WCSTOMBS 1 +/* Define to 1 if you have the `wcstombs_l' function. */ +#define HAVE_WCSTOMBS_L 1 + /* Define to 1 if you have the <wctype.h> header file. 
*/ #define HAVE_WCTYPE_H 1 diff --git a/src/include/port/win32.h b/src/include/port/win32.h index 8a3c33f995..2914a59811 100644 --- a/src/include/port/win32.h +++ b/src/include/port/win32.h @@ -304,6 +304,8 @@ typedef int pid_t; #define iswspace_l _iswspace_l #define strcoll_l _strcoll_l #define wcscoll_l _wcscoll_l +#define wcstombs_l _wcstombs_l +#define mbstowcs_l _mbstowcs_l /* In backend/port/win32/signal.c */ diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index 25b9d50915..c59a004fc9 100644 --- a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -72,4 +72,12 @@ typedef int pg_locale_t; extern pg_locale_t pg_newlocale_from_collation(Oid collid); +/* These functions convert from/to libc's wchar_t, *not* pg_wchar_t */ +#ifdef USE_WIDE_UPPER_LOWER +extern size_t wchar2char(char *to, const wchar_t *from, size_t tolen, + pg_locale_t locale); +extern size_t char2wchar(wchar_t *to, size_t tolen, + const char *from, size_t fromlen, pg_locale_t locale); +#endif + #endif /* _PG_LOCALE_ */