From ed87e1980706975e7aa412bee200087774c5ff22 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 22 Sep 2017 11:35:12 -0400 Subject: [PATCH] Mop-up for commit 85feb77aa09cda9ff3e12cf95c757c499dc25343. Adjust commentary in regc_pg_locale.c to remove mention of the possibility of not having <wctype.h> functions, since we no longer consider that. Eliminate duplicate code in wparser_def.c by generalizing the p_iswhat macro to take a parameter saying what to return for non-ASCII chars in C locale. (That's not really a consequence of the USE_WIDE_UPPER_LOWER-ectomy, but I noticed it while doing that.) --- src/backend/regex/regc_pg_locale.c | 24 +++--- src/backend/tsearch/wparser_def.c | 113 +++++++---------------------- 2 files changed, 40 insertions(+), 97 deletions(-) diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c index b2122e9e8f..e39ee7ae09 100644 --- a/src/backend/regex/regc_pg_locale.c +++ b/src/backend/regex/regc_pg_locale.c @@ -29,20 +29,20 @@ * * 2. In the "default" collation (which is supposed to obey LC_CTYPE): * - * 2a. When working in UTF8 encoding, we use the <wctype.h> functions if - * available. This assumes that every platform uses Unicode codepoints - * directly as the wchar_t representation of Unicode. On some platforms + * 2a. When working in UTF8 encoding, we use the <wctype.h> functions. + * This assumes that every platform uses Unicode codepoints directly + * as the wchar_t representation of Unicode. On some platforms * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF. * - * 2b. In all other encodings, or on machines that lack <wctype.h>, we use - * the <ctype.h> functions for pg_wchar values up to 255, and punt for values - * above that. This is only 100% correct in single-byte encodings such as - * LATINn. However, non-Unicode multibyte encodings are mostly Far Eastern - * character sets for which the properties being tested here aren't very - * relevant for higher code values anyway.
The difficulty with using the - * <wctype.h> functions with non-Unicode multibyte encodings is that we can - * have no certainty that the platform's wchar_t representation matches - * what we do in pg_wchar conversions. + * 2b. In all other encodings, we use the <ctype.h> functions for pg_wchar + * values up to 255, and punt for values above that. This is 100% correct + * only in single-byte encodings such as LATINn. However, non-Unicode + * multibyte encodings are mostly Far Eastern character sets for which the + * properties being tested here aren't very relevant for higher code values + * anyway. The difficulty with using the <wctype.h> functions with + * non-Unicode multibyte encodings is that we can have no certainty that + * the platform's wchar_t representation matches what we do in pg_wchar + * conversions. * * 3. Other collations are only supported on platforms that HAVE_LOCALE_T. * Here, we use the locale_t-extended forms of the <wctype.h> and <ctype.h> diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c index c118357336..8450e1c08e 100644 --- a/src/backend/tsearch/wparser_def.c +++ b/src/backend/tsearch/wparser_def.c @@ -427,94 +427,45 @@ TParserCopyClose(TParser *prs) * - if locale is C then we use pgwstr instead of wstr.
*/ -#define p_iswhat(type) \ +#define p_iswhat(type, nonascii) \ + \ static int \ -p_is##type(TParser *prs) { \ - Assert( prs->state ); \ - if ( prs->usewide ) \ +p_is##type(TParser *prs) \ +{ \ + Assert(prs->state); \ + if (prs->usewide) \ { \ - if ( prs->pgwstr ) \ + if (prs->pgwstr) \ { \ unsigned int c = *(prs->pgwstr + prs->state->poschar); \ - if ( c > 0x7f ) \ - return 0; \ - return is##type( c ); \ + if (c > 0x7f) \ + return nonascii; \ + return is##type(c); \ } \ - return isw##type( *( prs->wstr + prs->state->poschar ) ); \ + return isw##type(*(prs->wstr + prs->state->poschar)); \ } \ - \ - return is##type( *(unsigned char*)( prs->str + prs->state->posbyte ) ); \ -} \ + return is##type(*(unsigned char *) (prs->str + prs->state->posbyte)); \ +} \ \ static int \ -p_isnot##type(TParser *prs) { \ +p_isnot##type(TParser *prs) \ +{ \ return !p_is##type(prs); \ } -static int -p_isalnum(TParser *prs) -{ - Assert(prs->state); - - if (prs->usewide) - { - if (prs->pgwstr) - { - unsigned int c = *(prs->pgwstr + prs->state->poschar); - - /* - * any non-ascii symbol with multibyte encoding with C-locale is - * an alpha character - */ - if (c > 0x7f) - return 1; - - return isalnum(c); - } - - return iswalnum(*(prs->wstr + prs->state->poschar)); - } - - return isalnum(*(unsigned char *) (prs->str + prs->state->posbyte)); -} -static int -p_isnotalnum(TParser *prs) -{ - return !p_isalnum(prs); -} - -static int -p_isalpha(TParser *prs) -{ - Assert(prs->state); - - if (prs->usewide) - { - if (prs->pgwstr) - { - unsigned int c = *(prs->pgwstr + prs->state->poschar); - - /* - * any non-ascii symbol with multibyte encoding with C-locale is - * an alpha character - */ - if (c > 0x7f) - return 1; - - return isalpha(c); - } - - return iswalpha(*(prs->wstr + prs->state->poschar)); - } - - return isalpha(*(unsigned char *) (prs->str + prs->state->posbyte)); -} - -static int -p_isnotalpha(TParser *prs) -{ - return !p_isalpha(prs); -} +/* + * In C locale with a multibyte encoding, any 
non-ASCII symbol is considered + * an alpha character, but not a member of other char classes. + */ +p_iswhat(alnum, 1) +p_iswhat(alpha, 1) +p_iswhat(digit, 0) +p_iswhat(lower, 0) +p_iswhat(print, 0) +p_iswhat(punct, 0) +p_iswhat(space, 0) +p_iswhat(upper, 0) +p_iswhat(xdigit, 0) /* p_iseq should be used only for ascii symbols */ @@ -525,14 +476,6 @@ p_iseq(TParser *prs, char c) return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0; } -p_iswhat(digit) -p_iswhat(lower) -p_iswhat(print) -p_iswhat(punct) -p_iswhat(space) -p_iswhat(upper) -p_iswhat(xdigit) - static int p_isEOF(TParser *prs) { -- 2.40.0