From 958ffb8c286d93d1bfced17e6300d13f9634b431 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Mon, 21 Aug 2017 11:22:00 -0400 Subject: [PATCH] Don't install ICU collation keyword variants Users can still create them themselves. Instead, document Unicode TR 35 collation options for ICU, so users can create all this themselves. Reviewed-by: Peter Geoghegan --- doc/src/sgml/charset.sgml | 98 ++++++++++++++++++++++++---- src/backend/commands/collationcmds.c | 71 -------------------- 2 files changed, 84 insertions(+), 85 deletions(-) diff --git a/doc/src/sgml/charset.sgml b/doc/src/sgml/charset.sgml index f2a4acc115..44e43503a6 100644 --- a/doc/src/sgml/charset.sgml +++ b/doc/src/sgml/charset.sgml @@ -664,13 +664,6 @@ SELECT a COLLATE "C" < b COLLATE "POSIX" FROM test1; - - de-u-co-phonebk-x-icu - - German collation, phone book variant - - - de-AT-x-icu @@ -683,13 +676,6 @@ SELECT a COLLATE "C" < b COLLATE "POSIX" FROM test1; - - de-AT-u-co-phonebk-x-icu - - German collation for Austria, phone book variant - - - und-x-icu (for undefined) @@ -709,6 +695,90 @@ SELECT a COLLATE "C" < b COLLATE "POSIX" FROM test1; will draw an error along the lines of collation "de-x-icu" for encoding "WIN874" does not exist. + + + ICU allows collations to be customized beyond the basic language+country + set that is preloaded by initdb. Users are encouraged + to define their own collation objects that make use of these facilities to + suit the sorting behavior to their requirements. Here are some examples: + + + + CREATE COLLATION "de-u-co-phonebk-x-icu" (provider = icu, locale = 'de-u-co-phonebk') + + German collation with phone book collation type + + + + + CREATE COLLATION "und-u-co-emoji-x-icu" (provider = icu, locale = 'und-u-co-emoji') + + + Root collation with Emoji collation type, per Unicode Technical Standard #51 + + + + + + CREATE COLLATION digitslast (provider = icu, locale = 'en-u-kr-latn-digit') + + + Sort digits after Latin letters. (The default is digits before letters.) + + + + + + CREATE COLLATION upperfirst (provider = icu, locale = 'en-u-kf-upper') + + + Sort upper-case letters before lower-case letters. (The default is + lower-case letters first.) + + + + + + CREATE COLLATION special (provider = icu, locale = 'en-u-kf-upper-kr-latn-digit') + + + Combines both of the above options. + + + + + + CREATE COLLATION numeric (provider = icu, locale = 'en-u-kn-true') + + + Numeric ordering, sorts sequences of digits by their numeric value, + for example: A-21 < A-123 + (also known as natural sort). + + + + + + See Unicode + Technical Standard #35 + and BCP 47 for + details. The list of possible collation types (co + subtag) can be found in + the CLDR + repository. + The ICU Locale + Explorer can be used to check the details of a particular locale + definition. + + + + Note that while this system allows creating collations that ignore + case or ignore accents or similar (using + the ks key), PostgreSQL does not at the moment allow + such collations to act in a truly case- or accent-insensitive manner. Any + strings that compare equal according to the collation but are not + byte-wise equal will be sorted according to their byte values. + diff --git a/src/backend/commands/collationcmds.c b/src/backend/commands/collationcmds.c index d36ce53560..9437731276 100644 --- a/src/backend/commands/collationcmds.c +++ b/src/backend/commands/collationcmds.c @@ -687,30 +687,11 @@ pg_import_system_collations(PG_FUNCTION_ARGS) */ for (i = -1; i < uloc_countAvailable(); i++) { - /* - * In ICU 4.2, ucol_getKeywordValuesForLocale() sometimes returns - * values that will not be accepted by uloc_toLanguageTag(). Skip - * loading keyword variants in that version. (Both - * ucol_getKeywordValuesForLocale() and uloc_toLanguageTag() are - * new in ICU 4.2, so older versions are not supported at all.) - * - * XXX We have no information about ICU 4.3 through 4.7, but we - * know the code below works with 4.8. - */ -#if U_ICU_VERSION_MAJOR_NUM > 4 || (U_ICU_VERSION_MAJOR_NUM == 4 && U_ICU_VERSION_MINOR_NUM > 2) -#define LOAD_ICU_KEYWORD_VARIANTS -#endif - const char *name; char *langtag; char *icucomment; const char *collcollate; Oid collid; -#ifdef LOAD_ICU_KEYWORD_VARIANTS - UEnumeration *en; - UErrorCode status; - const char *val; -#endif if (i == -1) name = ""; /* ICU root locale */ @@ -744,58 +725,6 @@ pg_import_system_collations(PG_FUNCTION_ARGS) CreateComments(collid, CollationRelationId, 0, icucomment); } - - /* - * Add keyword variants, if enabled. - */ -#ifdef LOAD_ICU_KEYWORD_VARIANTS - status = U_ZERO_ERROR; - en = ucol_getKeywordValuesForLocale("collation", name, TRUE, &status); - if (U_FAILURE(status)) - ereport(ERROR, - (errmsg("could not get keyword values for locale \"%s\": %s", - name, u_errorName(status)))); - - status = U_ZERO_ERROR; - uenum_reset(en, &status); - while ((val = uenum_next(en, NULL, &status))) - { - char *localeid = psprintf("%s@collation=%s", name, val); - - langtag = get_icu_language_tag(localeid); - collcollate = U_ICU_VERSION_MAJOR_NUM >= 54 ? langtag : localeid; - - /* - * Be paranoid about not allowing any non-ASCII strings into - * pg_collation - */ - if (!is_all_ascii(langtag) || !is_all_ascii(collcollate)) - continue; - - collid = CollationCreate(psprintf("%s-x-icu", langtag), - nspid, GetUserId(), - COLLPROVIDER_ICU, -1, - collcollate, collcollate, - get_collation_actual_version(COLLPROVIDER_ICU, collcollate), - true, true); - if (OidIsValid(collid)) - { - ncreated++; - - CommandCounterIncrement(); - - icucomment = get_icu_locale_comment(localeid); - if (icucomment) - CreateComments(collid, CollationRelationId, 0, - icucomment); - } - } - if (U_FAILURE(status)) - ereport(ERROR, - (errmsg("could not get keyword values for locale \"%s\": %s", - name, u_errorName(status)))); - uenum_close(en); -#endif /* LOAD_ICU_KEYWORD_VARIANTS */ } } #endif /* USE_ICU */ -- 2.40.0