From: Robert Haas Date: Sat, 29 Oct 2011 18:22:20 +0000 (-0400) Subject: Improve make_greater_string() with encoding-specific incrementers. X-Git-Tag: REL9_2_BETA1~914 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=78d523b633d0c251c904318f2ba9916e2c47b9e8;p=postgresql Improve make_greater_string() with encoding-specific incrementers. This infrastructure doesn't in any way guarantee that the character we produce will sort before the one we incremented; but it does at least make it much more likely that we'll end up with something that is a valid character, which improves our chances. Kyotaro Horiguchi, with various adjustments by me. --- diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index f05cc4f87e..107d85fa8b 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -5665,6 +5665,19 @@ pattern_selectivity(Const *patt, Pattern_Type ptype) } +/* + * For bytea, the increment function need only increment the current byte + * (there are no multibyte characters to worry about). + */ +static bool +byte_increment(unsigned char *ptr, int len) +{ + if (*ptr >= 255) + return false; + (*ptr)++; + return true; +} + /* * Try to generate a string greater than the given string or any * string it is a prefix of. 
If successful, return a palloc'd string @@ -5704,6 +5717,7 @@ make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation) int len; Datum cmpstr; text *cmptxt = NULL; + mbcharacter_incrementer charinc; /* * Get a modifiable copy of the prefix string in C-string format, and set @@ -5765,29 +5779,33 @@ make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation) } } + if (datatype == BYTEAOID) + charinc = &byte_increment; + else + charinc = pg_database_encoding_character_incrementer(); + while (len > 0) { - unsigned char *lastchar = (unsigned char *) (workstr + len - 1); - unsigned char savelastchar = *lastchar; + int charlen; + unsigned char *lastchar; + Const *workstr_const; + + if (datatype == BYTEAOID) + charlen = 1; + else + charlen = len - pg_mbcliplen(workstr, len, len - 1); + lastchar = (unsigned char *) (workstr + len - charlen); /* - * Try to generate a larger string by incrementing the last byte. + * Try to generate a larger string by incrementing the last character + * (for BYTEA, we treat each byte as a character). */ - while (*lastchar < (unsigned char) 255) + if (charinc(lastchar, charlen)) { - Const *workstr_const; - - (*lastchar)++; - - if (datatype != BYTEAOID) - { - /* do not generate invalid encoding sequences */ - if (!pg_verifymbstr(workstr, len, true)) - continue; - workstr_const = string_to_const(workstr, datatype); - } - else + if (datatype == BYTEAOID) workstr_const = string_to_bytea_const(workstr, len); + else + workstr_const = string_to_const(workstr, datatype); if (DatumGetBool(FunctionCall2Coll(ltproc, collation, @@ -5806,20 +5824,11 @@ make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation) pfree(workstr_const); } - /* restore last byte so we don't confuse pg_mbcliplen */ - *lastchar = savelastchar; - /* - * Truncate off the last character, which might be more than 1 byte, - * depending on the character encoding. + * Truncate off the last character or byte. 
*/ - if (datatype != BYTEAOID && pg_database_encoding_max_length() > 1) - len = pg_mbcliplen(workstr, len, len - 1); - else - len -= 1; - - if (datatype != BYTEAOID) - workstr[len] = '\0'; + len -= charlen; + workstr[len] = '\0'; } /* Failed... */ diff --git a/src/backend/utils/mb/wchar.c b/src/backend/utils/mb/wchar.c index f23732f01e..39f6efc241 100644 --- a/src/backend/utils/mb/wchar.c +++ b/src/backend/utils/mb/wchar.c @@ -1334,6 +1334,244 @@ pg_utf8_islegal(const unsigned char *source, int length) return true; } +#ifndef FRONTEND + +/* + * Generic character increment function. + * + * Not knowing anything about the properties of the encoding in use, we just + * keep incrementing the last byte until pg_verifymbstr() likes the result, + * or we run out of values to try. + * + * Like all character-increment functions, we must restore the original input + * string on failure. + */ +static bool +pg_generic_charinc(unsigned char *charptr, int len) +{ + unsigned char *lastchar = (unsigned char *) (charptr + len - 1); + unsigned char savelastchar = *lastchar; + const char *const_charptr = (const char *)charptr; + + while (*lastchar < (unsigned char) 255) + { + (*lastchar)++; + if (!pg_verifymbstr(const_charptr, len, true)) + continue; + return true; + } + + *lastchar = savelastchar; + return false; +} + +/* + * UTF-8 character increment function. + * + * For a one-byte character less than 0x7F, we just increment the byte. + * + * For a multibyte character, every byte but the first must fall between 0x80 + * and 0xBF; and the first byte must be between 0xC0 and 0xF4. We increment + * the last byte that's not already at its maximum value, and set any following + * bytes back to 0x80. If we can't find a byte that's less than the maximum + * allowable value, we simply fail. We also have some special-case logic to + * skip regions used for surrogate pair handling, as those should not occur in + * valid UTF-8. 
+ * + * Like all character-increment functions, we must restore the original input + * string on failure. + */ +static bool +pg_utf8_increment(unsigned char *charptr, int length) +{ + unsigned char a; + unsigned char bak[4]; + unsigned char limit; + + switch (length) + { + default: + /* reject lengths 5 and 6 for now */ + return false; + case 4: + bak[3] = charptr[3]; + a = charptr[3]; + if (a < 0xBF) + { + charptr[3]++; + break; + } + charptr[3] = 0x80; + /* FALL THRU */ + case 3: + bak[2] = charptr[2]; + a = charptr[2]; + if (a < 0xBF) + { + charptr[2]++; + break; + } + charptr[2] = 0x80; + /* FALL THRU */ + case 2: + bak[1] = charptr[1]; + a = charptr[1]; + switch (*charptr) + { + case 0xED: + limit = 0x9F; + break; + case 0xF4: + limit = 0x8F; + break; + default: + limit = 0xBF; + break; + } + if (a < limit) + { + charptr[1]++; + break; + } + charptr[1] = 0x80; + /* FALL THRU */ + case 1: + bak[0] = *charptr; + a = *charptr; + if (a == 0x7F || a == 0xDF || a == 0xEF || a == 0xF4) + { + /* Restore original string. */ + memcpy(charptr, bak, length); + return false; + } + charptr[0]++; + break; + } + + return true; +} + +/* + * EUC-JP character increment function. + * + * If the sequence starts with SS2 (0x8e), it must be a two-byte sequence + * representing a JIS X 0201 character, with the second byte ranging between + * 0xa1 and 0xde. We just increment the last byte if it's less than 0xde, + * and otherwise rewrite the whole sequence to 0xa1 0xa1. + * + * If the sequence starts with SS3 (0x8f), it must be a three-byte sequence + * in which the last two bytes range between 0xa1 and 0xfe. The last byte + * is incremented, carrying overflow to the second-to-last byte. + * + * If the sequence starts with any other value and its MSB is set, it must + * be a two-byte sequence representing a JIS X 0208 character, with both + * bytes ranging between 0xa1 and 0xfe. The last byte is incremented, + * carrying overflow to the second-to-last byte. 
+ * + * Otherwise the sequence consists of a single byte representing an ASCII + * character. It is incremented up to 0x7f. + * + * Only the three EUC-JP byte sequences shown below - which have no character + * allocated - make this function fail in spite of their validity: 0x7f, + * 0xfe 0xfe, 0x8f 0xfe 0xfe. + */ +static bool +pg_eucjp_increment(unsigned char *charptr, int length) +{ + unsigned char bak[3]; + unsigned char c1, c2; + signed int i; + + c1 = *charptr; + + switch (c1) + { + case SS2: /* JIS X 0201 */ + if (length != 2) + return false; + + c2 = charptr[1]; + + if (c2 > 0xde) + charptr[0] = charptr[1] = 0xa1; + else if (c2 < 0xa1) + charptr[1] = 0xa1; + else + charptr[1]++; + + break; + + case SS3: /* JIS X 0212 */ + if (length != 3) + return false; + + for (i = 2; i > 0; i--) + { + bak[i] = charptr[i]; + c2 = charptr[i]; + if (c2 < 0xa1) + { + charptr[i] = 0xa1; + return true; + } + else if (c2 < 0xfe) + { + charptr[i]++; + break; + } + charptr[i] = 0xa1; + } + + if (i == 0) /* Out of 3-byte code region */ + { + charptr[1] = bak[1]; + charptr[2] = bak[2]; + return false; + } + break; + + default: + if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? 
*/ + { + if (length != 2) + return false; + + for (i = 1 ; i >= 0 ; i--) /* i must be signed */ + { + bak[i] = charptr[i]; + c2 = charptr[i]; + if (c2 < 0xa1) + { + charptr[i] = 0xa1; + return true; + } + else if (c2 < 0xfe) + { + charptr[i]++; + break; + } + charptr[i] = 0xa1; + } + + if (i < 0) /* Out of 2 byte code region */ + { + charptr[0] = bak[0]; + charptr[1] = bak[1]; + return false; + } + } + else + { /* ASCII, single byte */ + if (c1 > 0x7e) + return false; + (*charptr)++; + } + } + + return true; +} +#endif + /* *------------------------------------------------------------------- * encoding info table @@ -1458,6 +1696,25 @@ pg_database_encoding_max_length(void) return pg_wchar_table[GetDatabaseEncoding()].maxmblen; } +/* + * give the character incrementer for the encoding for the current database + */ +mbcharacter_incrementer +pg_database_encoding_character_incrementer(void) +{ + switch (GetDatabaseEncoding()) + { + case PG_UTF8: + return pg_utf8_increment; + + case PG_EUC_JP: + return pg_eucjp_increment; + + default: + return pg_generic_charinc; + } +} + /* * Verify mbstr to make sure that it is validly encoded in the current * database encoding. Otherwise same as pg_verify_mbstr(). 
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h index 826c7af53b..db4409163b 100644 --- a/src/include/mb/pg_wchar.h +++ b/src/include/mb/pg_wchar.h @@ -284,6 +284,8 @@ typedef int (*mblen_converter) (const unsigned char *mbstr); typedef int (*mbdisplaylen_converter) (const unsigned char *mbstr); +typedef bool (*mbcharacter_incrementer) (unsigned char *mbstr, int len); + typedef int (*mbverifier) (const unsigned char *mbstr, int len); typedef struct @@ -389,6 +391,7 @@ extern int pg_encoding_mbcliplen(int encoding, const char *mbstr, extern int pg_mbcharcliplen(const char *mbstr, int len, int imit); extern int pg_encoding_max_length(int encoding); extern int pg_database_encoding_max_length(void); +extern mbcharacter_incrementer pg_database_encoding_character_incrementer(void); extern int PrepareClientEncoding(int encoding); extern int SetClientEncoding(int encoding);