From 5f538ad004aa00cf0881f179f0cde789aad4f47e Mon Sep 17 00:00:00 2001 From: Noah Misch Date: Wed, 26 Jun 2013 11:17:33 -0400 Subject: [PATCH] Renovate display of non-ASCII messages on Windows. GNU gettext selects a default encoding for the messages it emits in a platform-specific manner; it uses the Windows ANSI code page on Windows and follows LC_CTYPE on other platforms. This is inconvenient for PostgreSQL server processes, so realize consistent cross-platform behavior by calling bind_textdomain_codeset() on Windows each time we permanently change LC_CTYPE. This primarily affects SQL_ASCII databases and processes like the postmaster that do not attach to a database, making their behavior consistent with PostgreSQL on non-Windows platforms. Messages from SQL_ASCII databases use the encoding implied by the database LC_CTYPE, and messages from non-database processes use LC_CTYPE from the postmaster system environment. PlatformEncoding becomes unused, so remove it. Make write_console() prefer WriteConsoleW() to write() regardless of the encodings in use. In this situation, write() will invariably mishandle non-ASCII characters. elog.c has assumed that messages conform to the database encoding. While usually true, this does not hold for SQL_ASCII and MULE_INTERNAL. Introduce MessageEncoding to track the actual encoding of message text. The present consumers are Windows-specific code for converting messages to UTF16 for use in system interfaces. This fixes the appearance in Windows event logs and consoles of translated messages from SQL_ASCII processes like the postmaster. Note that SQL_ASCII inherently disclaims a strong notion of encoding, so non-ASCII byte sequences interpolated into messages by %s may yet yield a nonsensical message. MULE_INTERNAL has similar problems at present, albeit for a different reason: its lack of libiconv support or a conversion to UTF8. 
Consequently, one need no longer restart Windows with a different Windows ANSI code page to broadly test backend logging under a given language. Changing the user's locale ("Format") is enough. Several accounts can simultaneously run postmasters under different locales, all correctly logging localized messages to Windows event logs and consoles. Alexander Law and Noah Misch --- src/backend/main/main.c | 4 + src/backend/utils/adt/pg_locale.c | 34 ++++++-- src/backend/utils/error/elog.c | 47 ++++++++--- src/backend/utils/init/postinit.c | 5 -- src/backend/utils/mb/encnames.c | 10 +++ src/backend/utils/mb/mbutils.c | 132 +++++++++++++++++++++--------- src/include/mb/pg_wchar.h | 10 ++- src/include/port.h | 4 + src/port/chklocale.c | 29 +++++++ 9 files changed, 210 insertions(+), 65 deletions(-) diff --git a/src/backend/main/main.c b/src/backend/main/main.c index 8ea6c1f387..d71885dba9 100644 --- a/src/backend/main/main.c +++ b/src/backend/main/main.c @@ -265,6 +265,10 @@ startup_hacks(const char *progname) /* * Help display should match the options accepted by PostmasterMain() * and PostgresMain(). + * + * XXX On Windows, non-ASCII localizations of these messages only display + * correctly if the console output code page covers the necessary characters. + * Messages emitted in write_console() do not exhibit this problem. */ static void help(const char *progname) diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 7081b00500..3d85e297d2 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -131,14 +131,16 @@ static char *IsoLocaleName(const char *); /* MSVC specific */ /* * pg_perm_setlocale * - * This is identical to the libc function setlocale(), with the addition - * that if the operation is successful, the corresponding LC_XXX environment - * variable is set to match. 
By setting the environment variable, we ensure - * that any subsequent use of setlocale(..., "") will preserve the settings - * made through this routine. Of course, LC_ALL must also be unset to fully - * ensure that, but that has to be done elsewhere after all the individual - * LC_XXX variables have been set correctly. (Thank you Perl for making this - * kluge necessary.) + * This wraps the libc function setlocale(), with two additions. First, when + * changing LC_CTYPE, update gettext's encoding for the current message + * domain. GNU gettext automatically tracks LC_CTYPE on most platforms, but + * not on Windows. Second, if the operation is successful, the corresponding + * LC_XXX environment variable is set to match. By setting the environment + * variable, we ensure that any subsequent use of setlocale(..., "") will + * preserve the settings made through this routine. Of course, LC_ALL must + * also be unset to fully ensure that, but that has to be done elsewhere after + * all the individual LC_XXX variables have been set correctly. (Thank you + * Perl for making this kluge necessary.) */ char * pg_perm_setlocale(int category, const char *locale) @@ -172,6 +174,22 @@ pg_perm_setlocale(int category, const char *locale) if (result == NULL) return result; /* fall out immediately on failure */ + /* + * Use the right encoding in translated messages. Under ENABLE_NLS, let + * pg_bind_textdomain_codeset() figure it out. Under !ENABLE_NLS, message + * format strings are ASCII, but database-encoding strings may enter the + * message via %s. This makes the overall message encoding equal to the + * database encoding. 
+ */ + if (category == LC_CTYPE) + { +#ifdef ENABLE_NLS + SetMessageEncoding(pg_bind_textdomain_codeset(textdomain(NULL))); +#else + SetMessageEncoding(GetDatabaseEncoding()); +#endif + } + switch (category) { case LC_COLLATE: diff --git a/src/backend/utils/error/elog.c b/src/backend/utils/error/elog.c index 7f03f419de..706c01eca5 100644 --- a/src/backend/utils/error/elog.c +++ b/src/backend/utils/error/elog.c @@ -1813,6 +1813,22 @@ write_syslog(int level, const char *line) #endif /* HAVE_SYSLOG */ #ifdef WIN32 +/* + * Get the PostgreSQL equivalent of the Windows ANSI code page. "ANSI" system + * interfaces (e.g. CreateFileA()) expect string arguments in this encoding. + * Every process in a given system will find the same value at all times. + */ +static int +GetACPEncoding(void) +{ + static int encoding = -2; + + if (encoding == -2) + encoding = pg_codepage_to_encoding(GetACP()); + + return encoding; +} + /* * Write a message line to the windows event log */ @@ -1858,16 +1874,18 @@ write_eventlog(int level, const char *line, int len) } /* - * Convert message to UTF16 text and write it with ReportEventW, but - * fall-back into ReportEventA if conversion failed. + * If message character encoding matches the encoding expected by + * ReportEventA(), call it to avoid the hazards of conversion. Otherwise, + * try to convert the message to UTF16 and write it with ReportEventW(). + * Fall back on ReportEventA() if conversion failed. * * Also verify that we are not on our way into error recursion trouble due - * to error messages thrown deep inside pgwin32_toUTF16(). + * to error messages thrown deep inside pgwin32_message_to_UTF16(). 
*/ - if (GetDatabaseEncoding() != GetPlatformEncoding() && - !in_error_recursion_trouble()) + if (!in_error_recursion_trouble() && + GetMessageEncoding() != GetACPEncoding()) { - utf16 = pgwin32_toUTF16(line, len, NULL); + utf16 = pgwin32_message_to_UTF16(line, len, NULL); if (utf16) { ReportEventW(evtHandle, @@ -1879,6 +1897,7 @@ write_eventlog(int level, const char *line, int len) 0, (LPCWSTR *) &utf16, NULL); + /* XXX Try ReportEventA() when ReportEventW() fails? */ pfree(utf16); return; @@ -1904,22 +1923,30 @@ write_console(const char *line, int len) #ifdef WIN32 /* - * WriteConsoleW() will fail if stdout is redirected, so just fall through + * Try to convert the message to UTF16 and write it with WriteConsoleW(). + * Fall back on write() if anything fails. + * + * In contrast to write_eventlog(), don't skip straight to write() based + * on the applicable encodings. Unlike WriteConsoleW(), write() depends + * on the suitability of the console output code page. Since we put + * stderr into binary mode in SubPostmasterMain(), write() skips the + * necessary translation anyway. + * + * WriteConsoleW() will fail if stderr is redirected, so just fall through * to writing unconverted to the logfile in this case. * * Since we palloc the structure required for conversion, also fall * through to writing unconverted if we have not yet set up * CurrentMemoryContext. 
*/ - if (GetDatabaseEncoding() != GetPlatformEncoding() && - !in_error_recursion_trouble() && + if (!in_error_recursion_trouble() && !redirection_done && CurrentMemoryContext != NULL) { WCHAR *utf16; int utf16len; - utf16 = pgwin32_toUTF16(line, len, &utf16len); + utf16 = pgwin32_message_to_UTF16(line, len, &utf16len); if (utf16 != NULL) { HANDLE stdHandle; diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index e0abff1145..e0ea2e9ecf 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -357,11 +357,6 @@ CheckMyDatabase(const char *name, bool am_superuser) SetConfigOption("lc_collate", collate, PGC_INTERNAL, PGC_S_OVERRIDE); SetConfigOption("lc_ctype", ctype, PGC_INTERNAL, PGC_S_OVERRIDE); - /* Use the right encoding in translated messages */ -#ifdef ENABLE_NLS - pg_bind_textdomain_codeset(textdomain(NULL)); -#endif - ReleaseSysCache(tup); } diff --git a/src/backend/utils/mb/encnames.c b/src/backend/utils/mb/encnames.c index 9a05e573ff..772d4a5d05 100644 --- a/src/backend/utils/mb/encnames.c +++ b/src/backend/utils/mb/encnames.c @@ -352,10 +352,13 @@ pg_enc2name pg_enc2name_tbl[] = /* ---------- * These are encoding names for gettext. + * + * This covers all encodings except MULE_INTERNAL, which is alien to gettext. 
* ---------- */ pg_enc2gettext pg_enc2gettext_tbl[] = { + {PG_SQL_ASCII, "US-ASCII"}, {PG_UTF8, "UTF-8"}, {PG_LATIN1, "LATIN1"}, {PG_LATIN2, "LATIN2"}, @@ -389,6 +392,13 @@ pg_enc2gettext pg_enc2gettext_tbl[] = {PG_EUC_KR, "EUC-KR"}, {PG_EUC_TW, "EUC-TW"}, {PG_EUC_JIS_2004, "EUC-JP"}, + {PG_SJIS, "SHIFT-JIS"}, + {PG_BIG5, "BIG5"}, + {PG_GBK, "GBK"}, + {PG_UHC, "UHC"}, + {PG_GB18030, "GB18030"}, + {PG_JOHAB, "JOHAB"}, + {PG_SHIFT_JIS_2004, "SHIFT_JISX0213"}, {0, NULL} }; diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c index 4582219af7..6d1cd8e875 100644 --- a/src/backend/utils/mb/mbutils.c +++ b/src/backend/utils/mb/mbutils.c @@ -53,11 +53,11 @@ static FmgrInfo *ToServerConvProc = NULL; static FmgrInfo *ToClientConvProc = NULL; /* - * These variables track the currently selected FE and BE encodings. + * These variables track the currently-selected encodings. */ static pg_enc2name *ClientEncoding = &pg_enc2name_tbl[PG_SQL_ASCII]; static pg_enc2name *DatabaseEncoding = &pg_enc2name_tbl[PG_SQL_ASCII]; -static pg_enc2name *PlatformEncoding = NULL; +static pg_enc2name *MessageEncoding = &pg_enc2name_tbl[PG_SQL_ASCII]; /* * During backend startup we can't set client encoding because we (a) @@ -881,46 +881,102 @@ SetDatabaseEncoding(int encoding) Assert(DatabaseEncoding->encoding == encoding); } -/* - * Bind gettext to the codeset equivalent with the database encoding. - */ void -pg_bind_textdomain_codeset(const char *domainname) +SetMessageEncoding(int encoding) { -#if defined(ENABLE_NLS) - int encoding = GetDatabaseEncoding(); - int i; + /* Some calls happen before we can elog()! */ + Assert(PG_VALID_ENCODING(encoding)); - /* - * gettext() uses the codeset specified by LC_CTYPE by default, so if that - * matches the database encoding we don't need to do anything. In CREATE - * DATABASE, we enforce or trust that the locale's codeset matches - * database encoding, except for the C locale. 
In C locale, we bind - * gettext() explicitly to the right codeset. - * - * On Windows, though, gettext() tends to get confused so we always bind - * it. - */ -#ifndef WIN32 - const char *ctype = setlocale(LC_CTYPE, NULL); + MessageEncoding = &pg_enc2name_tbl[encoding]; + Assert(MessageEncoding->encoding == encoding); +} - if (pg_strcasecmp(ctype, "C") != 0 && pg_strcasecmp(ctype, "POSIX") != 0) - return; -#endif +#ifdef ENABLE_NLS +/* + * Make one bind_textdomain_codeset() call, translating a pg_enc to a gettext + * codeset. Fails for MULE_INTERNAL, an encoding unknown to gettext; can also + * fail for gettext-internal causes like out-of-memory. + */ +static bool +raw_pg_bind_textdomain_codeset(const char *domainname, int encoding) +{ + bool elog_ok = (CurrentMemoryContext != NULL); + int i; for (i = 0; pg_enc2gettext_tbl[i].name != NULL; i++) { if (pg_enc2gettext_tbl[i].encoding == encoding) { if (bind_textdomain_codeset(domainname, - pg_enc2gettext_tbl[i].name) == NULL) + pg_enc2gettext_tbl[i].name) != NULL) + return true; + + if (elog_ok) elog(LOG, "bind_textdomain_codeset failed"); + else + write_stderr("bind_textdomain_codeset failed"); + break; } } + + return false; +} + +/* + * Bind a gettext message domain to the codeset corresponding to the database + * encoding. For SQL_ASCII, instead bind to the codeset implied by LC_CTYPE. + * Return the MessageEncoding implied by the new settings. + * + * On most platforms, gettext defaults to the codeset implied by LC_CTYPE. + * When that matches the database encoding, we don't need to do anything. In + * CREATE DATABASE, we enforce or trust that the locale's codeset matches the + * database encoding, except for the C locale. (On Windows, we also permit a + * discrepancy under the UTF8 encoding.) For the C locale, explicitly bind + * gettext to the right codeset. + * + * On Windows, gettext defaults to the Windows ANSI code page. 
This is a + * convenient departure for software that passes the strings to Windows ANSI + * APIs, but we don't do that. Compel gettext to use database encoding or, + * failing that, the LC_CTYPE encoding as it would on other platforms. + * + * This function is called before elog() and palloc() are usable. + */ +int +pg_bind_textdomain_codeset(const char *domainname) +{ + bool elog_ok = (CurrentMemoryContext != NULL); + int encoding = GetDatabaseEncoding(); + int new_msgenc; + +#ifndef WIN32 + const char *ctype = setlocale(LC_CTYPE, NULL); + + if (pg_strcasecmp(ctype, "C") == 0 || pg_strcasecmp(ctype, "POSIX") == 0) #endif + if (encoding != PG_SQL_ASCII && + raw_pg_bind_textdomain_codeset(domainname, encoding)) + return encoding; + + new_msgenc = pg_get_encoding_from_locale(NULL, elog_ok); + if (new_msgenc < 0) + new_msgenc = PG_SQL_ASCII; + +#ifdef WIN32 + if (!raw_pg_bind_textdomain_codeset(domainname, new_msgenc)) + /* On failure, the old message encoding remains valid. */ + return GetMessageEncoding(); +#endif + + return new_msgenc; } +#endif +/* + * The database encoding, also called the server encoding, represents the + * encoding of data stored in text-like data types. Affected types include + * cstring, text, varchar, name, xml, and json. + */ int GetDatabaseEncoding(void) { @@ -949,19 +1005,17 @@ pg_client_encoding(PG_FUNCTION_ARGS) return DirectFunctionCall1(namein, CStringGetDatum(ClientEncoding->name)); } +/* + * gettext() returns messages in this encoding. This often matches the + * database encoding, but it differs for SQL_ASCII databases, for processes + * not attached to a database, and under a database encoding lacking iconv + * support (MULE_INTERNAL). 
+ */ int -GetPlatformEncoding(void) +GetMessageEncoding(void) { - if (PlatformEncoding == NULL) - { - /* try to determine encoding of server's environment locale */ - int encoding = pg_get_encoding_from_locale("", true); - - if (encoding < 0) - encoding = PG_SQL_ASCII; - PlatformEncoding = &pg_enc2name_tbl[encoding]; - } - return PlatformEncoding->encoding; + Assert(MessageEncoding); + return MessageEncoding->encoding; } #ifdef WIN32 @@ -971,13 +1025,13 @@ GetPlatformEncoding(void) * is also passed to utf16len if not null. Returns NULL iff failed. */ WCHAR * -pgwin32_toUTF16(const char *str, int len, int *utf16len) +pgwin32_message_to_UTF16(const char *str, int len, int *utf16len) { WCHAR *utf16; int dstlen; UINT codepage; - codepage = pg_enc2name_tbl[GetDatabaseEncoding()].codepage; + codepage = pg_enc2name_tbl[GetMessageEncoding()].codepage; /* * Use MultiByteToWideChar directly if there is a corresponding codepage, @@ -994,7 +1048,7 @@ pgwin32_toUTF16(const char *str, int len, int *utf16len) char *utf8; utf8 = (char *) pg_do_encoding_conversion((unsigned char *) str, - len, GetDatabaseEncoding(), PG_UTF8); + len, GetMessageEncoding(), PG_UTF8); if (utf8 != str) len = strlen(utf8); diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h index 725865595a..d255c64bc1 100644 --- a/src/include/mb/pg_wchar.h +++ b/src/include/mb/pg_wchar.h @@ -481,8 +481,12 @@ extern const char *pg_get_client_encoding_name(void); extern void SetDatabaseEncoding(int encoding); extern int GetDatabaseEncoding(void); extern const char *GetDatabaseEncodingName(void); -extern int GetPlatformEncoding(void); -extern void pg_bind_textdomain_codeset(const char *domainname); +extern void SetMessageEncoding(int encoding); +extern int GetMessageEncoding(void); + +#ifdef ENABLE_NLS +extern int pg_bind_textdomain_codeset(const char *domainname); +#endif extern int pg_valid_client_encoding(const char *name); extern int pg_valid_server_encoding(const char *name); @@ -542,7 +546,7 @@ extern 
void mic2latin_with_table(const unsigned char *mic, unsigned char *p, extern bool pg_utf8_islegal(const unsigned char *source, int length); #ifdef WIN32 -extern WCHAR *pgwin32_toUTF16(const char *str, int len, int *utf16len); +extern WCHAR *pgwin32_message_to_UTF16(const char *str, int len, int *utf16len); #endif #endif /* PG_WCHAR_H */ diff --git a/src/include/port.h b/src/include/port.h index 5eda5f0af5..5ef4b0a0b1 100644 --- a/src/include/port.h +++ b/src/include/port.h @@ -452,6 +452,10 @@ extern void qsort_arg(void *base, size_t nel, size_t elsize, /* port/chklocale.c */ extern int pg_get_encoding_from_locale(const char *ctype, bool write_message); +#if defined(WIN32) && !defined(FRONTEND) +extern int pg_codepage_to_encoding(UINT cp); +#endif + /* port/inet_net_ntop.c */ extern char *inet_net_ntop(int af, const void *src, int bits, char *dst, size_t size); diff --git a/src/port/chklocale.c b/src/port/chklocale.c index 9e889383f2..8b8862ffb2 100644 --- a/src/port/chklocale.c +++ b/src/port/chklocale.c @@ -235,6 +235,32 @@ win32_langinfo(const char *ctype) return r; } + +#ifndef FRONTEND +/* + * Given a Windows code page identifier, find the corresponding PostgreSQL + * encoding. Issue a warning and return -1 if none found. + */ +int +pg_codepage_to_encoding(UINT cp) +{ + char sys[16]; + int i; + + sprintf(sys, "CP%u", cp); + + /* Check the table */ + for (i = 0; encoding_match_list[i].system_enc_name; i++) + if (pg_strcasecmp(sys, encoding_match_list[i].system_enc_name) == 0) + return encoding_match_list[i].pg_enc_code; + + ereport(WARNING, + (errmsg("could not determine encoding for codeset \"%s\"", sys), + errdetail("Please report this to <pgsql-bugs@postgresql.org>."))); + + return -1; +} +#endif #endif /* WIN32 */ #if (defined(HAVE_LANGINFO_H) && defined(CODESET)) || defined(WIN32) @@ -248,6 +274,9 @@ win32_langinfo(const char *ctype) * * If the result is PG_SQL_ASCII, callers should treat it as being compatible * with any desired encoding. 
+ * + * If running in the backend and write_message is false, this function must + * cope with the possibility that elog() and palloc() are not yet usable. */ int pg_get_encoding_from_locale(const char *ctype, bool write_message) -- 2.40.0