From 59f9a0b9df0d224bb62ff8ec5b65e0b187655742 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sat, 21 Feb 2004 00:34:53 +0000 Subject: [PATCH] Implement a solution to the 'Turkish locale downcases I incorrectly' problem, per previous discussion. Make some additional changes to centralize the knowledge of just how identifier downcasing is done, in hopes of simplifying any future tweaking in this area. --- src/backend/commands/define.c | 19 +++---- src/backend/commands/functioncmds.c | 6 +-- src/backend/commands/proclang.c | 24 +++++---- src/backend/parser/keywords.c | 10 ++-- src/backend/parser/scan.l | 44 +++------------- src/backend/parser/scansup.c | 78 ++++++++++++++++++++++++++++- src/backend/utils/adt/varlena.c | 44 ++++++++-------- src/include/commands/defrem.h | 4 +- src/include/parser/scansup.h | 9 +++- src/pl/plpgsql/src/pl_funcs.c | 45 ++++++----------- 10 files changed, 158 insertions(+), 125 deletions(-) diff --git a/src/backend/commands/define.c b/src/backend/commands/define.c index 8e30d53d3d..fc24c2c30f 100644 --- a/src/backend/commands/define.c +++ b/src/backend/commands/define.c @@ -9,7 +9,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/define.c,v 1.85 2003/11/29 19:51:47 pgsql Exp $ + * $PostgreSQL: pgsql/src/backend/commands/define.c,v 1.86 2004/02/21 00:34:52 tgl Exp $ * * DESCRIPTION * The "DefineFoo" routines take the parse tree and pick out the @@ -38,24 +38,19 @@ #include "catalog/namespace.h" #include "commands/defrem.h" #include "parser/parse_type.h" +#include "parser/scansup.h" #include "utils/int8.h" /* - * Translate the input language name to lower case. + * Translate the input language name to lower case, and truncate if needed. * - * Output buffer must be NAMEDATALEN long. + * Returns a palloc'd string */ -void -case_translate_language_name(const char *input, char *output) +char * +case_translate_language_name(const char *input) { - int i; - - MemSet(output, 0, NAMEDATALEN); /* ensure result Name is - * zero-filled */ - - for (i = 0; i < NAMEDATALEN - 1 && input[i]; ++i) - output[i] = tolower((unsigned char) input[i]); + return downcase_truncate_identifier(input, strlen(input), false); } diff --git a/src/backend/commands/functioncmds.c b/src/backend/commands/functioncmds.c index 2eb4c100a2..c91b31ed6f 100644 --- a/src/backend/commands/functioncmds.c +++ b/src/backend/commands/functioncmds.c @@ -10,7 +10,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/functioncmds.c,v 1.43 2004/01/06 23:55:18 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/commands/functioncmds.c,v 1.44 2004/02/21 00:34:52 tgl Exp $ * * DESCRIPTION * These routines take the parse tree and pick out the @@ -401,7 +401,7 @@ CreateFunction(CreateFunctionStmt *stmt) Oid prorettype; bool returnsSet; char *language; - char languageName[NAMEDATALEN]; + char *languageName; Oid languageOid; Oid languageValidator; char *funcname; @@ -437,7 +437,7 @@ CreateFunction(CreateFunctionStmt *stmt) &as_clause, &language, &volatility, &isStrict, &security); /* Convert language name to canonical case */ - case_translate_language_name(language, languageName); + languageName = case_translate_language_name(language); /* Look up the language and validate permissions */ languageTuple = SearchSysCache(LANGNAME, diff --git a/src/backend/commands/proclang.c b/src/backend/commands/proclang.c index 3c8e3185cb..ba6929325b 100644 --- a/src/backend/commands/proclang.c +++ b/src/backend/commands/proclang.c @@ -7,7 +7,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/proclang.c,v 1.52 2003/11/29 19:51:47 pgsql Exp $ + * $PostgreSQL: pgsql/src/backend/commands/proclang.c,v 1.53 2004/02/21 00:34:52 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -40,11 +40,12 @@ void CreateProceduralLanguage(CreatePLangStmt *stmt) { - char languageName[NAMEDATALEN]; + char *languageName; Oid procOid, valProcOid; Oid funcrettype; Oid typev[FUNC_MAX_ARGS]; + NameData langname; char nulls[Natts_pg_language]; Datum values[Natts_pg_language]; Relation rel; @@ -66,7 +67,7 @@ CreateProceduralLanguage(CreatePLangStmt *stmt) * Translate the language name and check that this language doesn't * already exist */ - case_translate_language_name(stmt->plname, languageName); + languageName = case_translate_language_name(stmt->plname); if (SearchSysCacheExists(LANGNAME, PointerGetDatum(languageName), @@ -124,12 +125,13 @@ CreateProceduralLanguage(CreatePLangStmt *stmt) } i = 0; - values[i++] = PointerGetDatum(languageName); - values[i++] = BoolGetDatum(true); /* lanispl */ - values[i++] = BoolGetDatum(stmt->pltrusted); - values[i++] = ObjectIdGetDatum(procOid); - values[i++] = ObjectIdGetDatum(valProcOid); - nulls[i] = 'n'; /* lanacl */ + namestrcpy(&langname, languageName); + values[i++] = NameGetDatum(&langname); /* lanname */ + values[i++] = BoolGetDatum(true); /* lanispl */ + values[i++] = BoolGetDatum(stmt->pltrusted); /* lanpltrusted */ + values[i++] = ObjectIdGetDatum(procOid); /* lanplcallfoid */ + values[i++] = ObjectIdGetDatum(valProcOid); /* lanvalidator */ + nulls[i] = 'n'; /* lanacl */ rel = heap_openr(LanguageRelationName, RowExclusiveLock); @@ -173,7 +175,7 @@ CreateProceduralLanguage(CreatePLangStmt *stmt) void DropProceduralLanguage(DropPLangStmt *stmt) { - char languageName[NAMEDATALEN]; + char *languageName; HeapTuple langTup; ObjectAddress object; @@ -189,7 +191,7 @@ DropProceduralLanguage(DropPLangStmt *stmt) * Translate the language name, check that this language exist and is * a PL */ - case_translate_language_name(stmt->plname, languageName); + languageName = case_translate_language_name(stmt->plname); langTup = SearchSysCache(LANGNAME, CStringGetDatum(languageName), diff --git a/src/backend/parser/keywords.c b/src/backend/parser/keywords.c index 57e020c108..a94786690e 100644 --- a/src/backend/parser/keywords.c +++ b/src/backend/parser/keywords.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/parser/keywords.c,v 1.144 2003/11/29 19:51:51 pgsql Exp $ + * $PostgreSQL: pgsql/src/backend/parser/keywords.c,v 1.145 2004/02/21 00:34:52 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -369,17 +369,13 @@ ScanKeywordLookup(const char *text) /* * Apply an ASCII-only downcasing. We must not use tolower() since it - * may produce the wrong translation in some locales (eg, Turkish), - * and we don't trust isupper() very much either. In an ASCII-based - * encoding the tests against A and Z are sufficient, but we also - * check isupper() so that we will work correctly under EBCDIC. The - * actual case conversion step should work for either ASCII or EBCDIC. + * may produce the wrong translation in some locales (eg, Turkish). */ for (i = 0; i < len; i++) { char ch = text[i]; - if (ch >= 'A' && ch <= 'Z' && isupper((unsigned char) ch)) + if (ch >= 'A' && ch <= 'Z') ch += 'a' - 'A'; word[i] = ch; } diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l index 13cbfb9895..caab9a002c 100644 --- a/src/backend/parser/scan.l +++ b/src/backend/parser/scan.l @@ -10,7 +10,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.113 2004/02/19 19:11:30 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.114 2004/02/21 00:34:52 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -27,6 +27,7 @@ #include "parser/keywords.h" /* Not needed now that this file is compiled as part of gram.y */ /* #include "parser/parse.h" */ +#include "parser/scansup.h" #include "utils/builtins.h" #include "mb/pg_wchar.h" @@ -395,23 +396,15 @@ other . startlit(); } {xdstop} { + char *ident; + BEGIN(INITIAL); if (literallen == 0) yyerror("zero-length delimited identifier"); + ident = litbufdup(); if (literallen >= NAMEDATALEN) - { - int len; - - len = pg_mbcliplen(literalbuf, literallen, - NAMEDATALEN-1); - ereport(NOTICE, - (errcode(ERRCODE_NAME_TOO_LONG), - errmsg("identifier \"%s\" will be truncated to \"%.*s\"", - literalbuf, len, literalbuf))); - literalbuf[len] = '\0'; - literallen = len; - } - yylval.str = litbufdup(); + truncate_identifier(ident, literallen, true); + yylval.str = ident; return IDENT; } {xddouble} { @@ -537,7 +530,6 @@ other . {identifier} { const ScanKeyword *keyword; char *ident; - int i; /* Is it a keyword? */ keyword = ScanKeywordLookup(yytext); @@ -550,28 +542,8 @@ other . /* * No. Convert the identifier to lower case, and truncate * if necessary. - * - * Note: here we use a locale-dependent case conversion, - * which seems appropriate under standard SQL rules, whereas - * the keyword comparison was NOT locale-dependent. */ - ident = pstrdup(yytext); - for (i = 0; ident[i]; i++) - { - if (isupper((unsigned char) ident[i])) - ident[i] = tolower((unsigned char) ident[i]); - } - if (i >= NAMEDATALEN) - { - int len; - - len = pg_mbcliplen(ident, i, NAMEDATALEN-1); - ereport(NOTICE, - (errcode(ERRCODE_NAME_TOO_LONG), - errmsg("identifier \"%s\" will be truncated to \"%.*s\"", - ident, len, ident))); - ident[len] = '\0'; - } + ident = downcase_truncate_identifier(yytext, yyleng, true); yylval.str = ident; return IDENT; } diff --git a/src/backend/parser/scansup.c b/src/backend/parser/scansup.c index 9177b858a7..76c620b394 100644 --- a/src/backend/parser/scansup.c +++ b/src/backend/parser/scansup.c @@ -9,7 +9,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/parser/scansup.c,v 1.25 2003/11/29 19:51:52 pgsql Exp $ + * $PostgreSQL: pgsql/src/backend/parser/scansup.c,v 1.26 2004/02/21 00:34:53 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -19,6 +19,8 @@ #include "miscadmin.h" #include "parser/scansup.h" +#include "mb/pg_wchar.h" + /* ---------------- * scanstr @@ -32,7 +34,7 @@ */ char * -scanstr(char *s) +scanstr(const char *s) { char *newStr; int len, @@ -109,3 +111,75 @@ scanstr(char *s) newStr[j] = '\0'; return newStr; } + + +/* + * downcase_truncate_identifier() --- do appropriate downcasing and + * truncation of an unquoted identifier. Optionally warn of truncation. + * + * Returns a palloc'd string containing the adjusted identifier. + * + * Note: in some usages the passed string is not null-terminated. + * + * Note: the API of this function is designed to allow for downcasing + * transformations that increase the string length, but we don't yet + * support that. If you want to implement it, you'll need to fix + * SplitIdentifierString() in utils/adt/varlena.c. + */ +char * +downcase_truncate_identifier(const char *ident, int len, bool warn) +{ + char *result; + int i; + + result = palloc(len + 1); + /* + * SQL99 specifies Unicode-aware case normalization, which we don't yet + * have the infrastructure for. Instead we use tolower() to provide a + * locale-aware translation. However, there are some locales where this + * is not right either (eg, Turkish may do strange things with 'i' and + * 'I'). Our current compromise is to use tolower() for characters with + * the high bit set, and use an ASCII-only downcasing for 7-bit + * characters. + */ + for (i = 0; i < len; i++) + { + unsigned char ch = (unsigned char) ident[i]; + + if (ch >= 'A' && ch <= 'Z') + ch += 'a' - 'A'; + else if (ch >= 0x80 && isupper(ch)) + ch = tolower(ch); + result[i] = (char) ch; + } + result[i] = '\0'; + + if (i >= NAMEDATALEN) + truncate_identifier(result, i, warn); + + return result; +} + +/* + * truncate_identifier() --- truncate an identifier to NAMEDATALEN-1 bytes. + * + * The given string is modified in-place, if necessary. A warning is + * issued if requested. + * + * We require the caller to pass in the string length since this saves a + * strlen() call in some common usages. + */ +void +truncate_identifier(char *ident, int len, bool warn) +{ + if (len >= NAMEDATALEN) + { + len = pg_mbcliplen(ident, len, NAMEDATALEN-1); + if (warn) + ereport(NOTICE, + (errcode(ERRCODE_NAME_TOO_LONG), + errmsg("identifier \"%s\" will be truncated to \"%.*s\"", + ident, len, ident))); + ident[len] = '\0'; + } +} diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index 3d96ce23ac..f329486321 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.111 2004/01/31 05:09:40 neilc Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.112 2004/02/21 00:34:53 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -16,17 +16,18 @@ #include -#include "mb/pg_wchar.h" -#include "miscadmin.h" #include "access/tuptoaster.h" #include "catalog/pg_type.h" #include "lib/stringinfo.h" #include "libpq/crypt.h" #include "libpq/pqformat.h" +#include "mb/pg_wchar.h" +#include "miscadmin.h" +#include "parser/scansup.h" #include "utils/array.h" #include "utils/builtins.h" -#include "utils/pg_locale.h" #include "utils/lsyscache.h" +#include "utils/pg_locale.h" typedef struct varlena unknown; @@ -1695,7 +1696,6 @@ SplitIdentifierString(char *rawstring, char separator, { char *curname; char *endp; - int curlen; if (*nextp == '\"') { @@ -1718,21 +1718,30 @@ SplitIdentifierString(char *rawstring, char separator, else { /* Unquoted name --- extends to separator or whitespace */ + char *downname; + int len; + curname = nextp; while (*nextp && *nextp != separator && !isspace((unsigned char) *nextp)) - { - /* - * It's important that this match the identifier - * downcasing code used by backend/parser/scan.l. - */ - if (isupper((unsigned char) *nextp)) - *nextp = tolower((unsigned char) *nextp); nextp++; - } endp = nextp; if (curname == nextp) return false; /* empty unquoted name not allowed */ + /* + * Downcase the identifier, using same code as main lexer does. + * + * XXX because we want to overwrite the input in-place, we cannot + * support a downcasing transformation that increases the + * string length. This is not a problem given the current + * implementation of downcase_truncate_identifier, but we'll + * probably have to do something about this someday. + */ + len = endp - curname; + downname = downcase_truncate_identifier(curname, len, false); + Assert(strlen(downname) <= len); + strncpy(curname, downname, len); + pfree(downname); } while (isspace((unsigned char) *nextp)) @@ -1753,13 +1762,8 @@ SplitIdentifierString(char *rawstring, char separator, /* Now safe to overwrite separator with a null */ *endp = '\0'; - /* Truncate name if it's overlength; again, should match scan.l */ - curlen = strlen(curname); - if (curlen >= NAMEDATALEN) - { - curlen = pg_mbcliplen(curname, curlen, NAMEDATALEN - 1); - curname[curlen] = '\0'; - } + /* Truncate name if it's overlength */ + truncate_identifier(curname, strlen(curname), false); /* * Finished isolating current name --- add it to list diff --git a/src/include/commands/defrem.h b/src/include/commands/defrem.h index a462dd55ac..00f5fa1a48 100644 --- a/src/include/commands/defrem.h +++ b/src/include/commands/defrem.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/commands/defrem.h,v 1.53 2003/11/29 22:40:59 pgsql Exp $ + * $PostgreSQL: pgsql/src/include/commands/defrem.h,v 1.54 2004/02/21 00:34:53 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -61,7 +61,7 @@ extern void RenameOpClass(List *name, const char *access_method, const char *new /* support routines in commands/define.c */ -extern void case_translate_language_name(const char *input, char *output); +extern char *case_translate_language_name(const char *input); extern char *defGetString(DefElem *def); extern double defGetNumeric(DefElem *def); diff --git a/src/include/parser/scansup.h b/src/include/parser/scansup.h index caa2f5d172..d710c81060 100644 --- a/src/include/parser/scansup.h +++ b/src/include/parser/scansup.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/parser/scansup.h,v 1.14 2003/11/29 22:41:09 pgsql Exp $ + * $PostgreSQL: pgsql/src/include/parser/scansup.h,v 1.15 2004/02/21 00:34:53 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -15,6 +15,11 @@ #ifndef SCANSUP_H #define SCANSUP_H -extern char *scanstr(char *s); +extern char *scanstr(const char *s); + +extern char *downcase_truncate_identifier(const char *ident, int len, + bool warn); + +extern void truncate_identifier(char *ident, int len, bool warn); #endif /* SCANSUP_H */ diff --git a/src/pl/plpgsql/src/pl_funcs.c b/src/pl/plpgsql/src/pl_funcs.c index b9a8a8b0ba..f49a2ac500 100644 --- a/src/pl/plpgsql/src/pl_funcs.c +++ b/src/pl/plpgsql/src/pl_funcs.c @@ -3,7 +3,7 @@ * procedural language * * IDENTIFICATION - * $PostgreSQL: pgsql/src/pl/plpgsql/src/pl_funcs.c,v 1.31 2003/11/29 19:52:12 pgsql Exp $ + * $PostgreSQL: pgsql/src/pl/plpgsql/src/pl_funcs.c,v 1.32 2004/02/21 00:34:53 tgl Exp $ * * This software is copyrighted by Jan Wieck - Hamburg. * @@ -40,7 +40,7 @@ #include -#include "mb/pg_wchar.h" +#include "parser/scansup.h" /* ---------- @@ -348,15 +348,15 @@ plpgsql_convert_ident(const char *s, char **output, int numidents) { char *curident; char *cp; - int i; /* Process current identifier */ - curident = palloc(strlen(s) + 1); /* surely enough room */ - cp = curident; if (*s == '"') { /* Quoted identifier: copy, collapsing out doubled quotes */ + + curident = palloc(strlen(s) + 1); /* surely enough room */ + cp = curident; s++; while (*s) { @@ -373,35 +373,20 @@ plpgsql_convert_ident(const char *s, char **output, int numidents) (errcode(ERRCODE_SYNTAX_ERROR), errmsg("unterminated \" in name: %s", sstart))); s++; + *cp = '\0'; + /* Truncate to NAMEDATALEN */ + truncate_identifier(curident, cp-curident, false); } else { - /* - * Normal identifier: downcase, stop at dot or whitespace. - * - * Note that downcasing is locale-sensitive, following SQL99 - * rules for identifiers. We have already decided that the - * item is not a PLPGSQL keyword. - */ - while (*s && *s != '.' && !isspace((unsigned char) *s)) - { - if (isupper((unsigned char) *s)) - *cp++ = tolower((unsigned char) *s++); - else - *cp++ = *s++; - } - } - - /* Truncate to NAMEDATALEN */ - *cp = '\0'; - i = cp - curident; - - if (i >= NAMEDATALEN) - { - int len; + /* Normal identifier: extends till dot or whitespace */ + const char *thisstart = s; - len = pg_mbcliplen(curident, i, NAMEDATALEN - 1); - curident[len] = '\0'; + while (*s && *s != '.' && !isspace((unsigned char) *s)) + s++; + /* Downcase and truncate to NAMEDATALEN */ + curident = downcase_truncate_identifier(thisstart, s-thisstart, + false); } /* Pass ident to caller */ -- 2.40.0