granicus.if.org Git - postgresql/commitdiff
Implement a solution to the 'Turkish locale downcases I incorrectly'
authorTom Lane <tgl@sss.pgh.pa.us>
Sat, 21 Feb 2004 00:34:53 +0000 (00:34 +0000)
committerTom Lane <tgl@sss.pgh.pa.us>
Sat, 21 Feb 2004 00:34:53 +0000 (00:34 +0000)
problem, per previous discussion.  Make some additional changes to
centralize the knowledge of just how identifier downcasing is done,
in hopes of simplifying any future tweaking in this area.

src/backend/commands/define.c
src/backend/commands/functioncmds.c
src/backend/commands/proclang.c
src/backend/parser/keywords.c
src/backend/parser/scan.l
src/backend/parser/scansup.c
src/backend/utils/adt/varlena.c
src/include/commands/defrem.h
src/include/parser/scansup.h
src/pl/plpgsql/src/pl_funcs.c

index 8e30d53d3dd149aa570f73c8d4c5e6c418efbcbc..fc24c2c30fb2cbde081275a229868de3516ebc69 100644 (file)
@@ -9,7 +9,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/commands/define.c,v 1.85 2003/11/29 19:51:47 pgsql Exp $
+ *       $PostgreSQL: pgsql/src/backend/commands/define.c,v 1.86 2004/02/21 00:34:52 tgl Exp $
  *
  * DESCRIPTION
  *       The "DefineFoo" routines take the parse tree and pick out the
 #include "catalog/namespace.h"
 #include "commands/defrem.h"
 #include "parser/parse_type.h"
+#include "parser/scansup.h"
 #include "utils/int8.h"
 
 
 /*
- * Translate the input language name to lower case.
+ * Translate the input language name to lower case, and truncate if needed.
  *
- * Output buffer must be NAMEDATALEN long.
+ * Returns a palloc'd string
  */
-void
-case_translate_language_name(const char *input, char *output)
+char *
+case_translate_language_name(const char *input)
 {
-       int                     i;
-
-       MemSet(output, 0, NAMEDATALEN);         /* ensure result Name is
-                                                                                * zero-filled */
-
-       for (i = 0; i < NAMEDATALEN - 1 && input[i]; ++i)
-               output[i] = tolower((unsigned char) input[i]);
+       return downcase_truncate_identifier(input, strlen(input), false);
 }
 
 
index 2eb4c100a2b2ae9701970b686666ab4f86f06eff..c91b31ed6fd58f9238ee0b1f917a069d042820f7 100644 (file)
@@ -10,7 +10,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/commands/functioncmds.c,v 1.43 2004/01/06 23:55:18 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/commands/functioncmds.c,v 1.44 2004/02/21 00:34:52 tgl Exp $
  *
  * DESCRIPTION
  *       These routines take the parse tree and pick out the
@@ -401,7 +401,7 @@ CreateFunction(CreateFunctionStmt *stmt)
        Oid                     prorettype;
        bool            returnsSet;
        char       *language;
-       char            languageName[NAMEDATALEN];
+       char       *languageName;
        Oid                     languageOid;
        Oid                     languageValidator;
        char       *funcname;
@@ -437,7 +437,7 @@ CreateFunction(CreateFunctionStmt *stmt)
                           &as_clause, &language, &volatility, &isStrict, &security);
 
        /* Convert language name to canonical case */
-       case_translate_language_name(language, languageName);
+       languageName = case_translate_language_name(language);
 
        /* Look up the language and validate permissions */
        languageTuple = SearchSysCache(LANGNAME,
index 3c8e3185cb91985891bb87f18619ace0b87fc56c..ba6929325bd776250b949511b06935fb958c58fd 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/commands/proclang.c,v 1.52 2003/11/29 19:51:47 pgsql Exp $
+ *       $PostgreSQL: pgsql/src/backend/commands/proclang.c,v 1.53 2004/02/21 00:34:52 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 void
 CreateProceduralLanguage(CreatePLangStmt *stmt)
 {
-       char            languageName[NAMEDATALEN];
+       char       *languageName;
        Oid                     procOid,
                                valProcOid;
        Oid                     funcrettype;
        Oid                     typev[FUNC_MAX_ARGS];
+       NameData        langname;
        char            nulls[Natts_pg_language];
        Datum           values[Natts_pg_language];
        Relation        rel;
@@ -66,7 +67,7 @@ CreateProceduralLanguage(CreatePLangStmt *stmt)
         * Translate the language name and check that this language doesn't
         * already exist
         */
-       case_translate_language_name(stmt->plname, languageName);
+       languageName = case_translate_language_name(stmt->plname);
 
        if (SearchSysCacheExists(LANGNAME,
                                                         PointerGetDatum(languageName),
@@ -124,12 +125,13 @@ CreateProceduralLanguage(CreatePLangStmt *stmt)
        }
 
        i = 0;
-       values[i++] = PointerGetDatum(languageName);
-       values[i++] = BoolGetDatum(true);       /* lanispl */
-       values[i++] = BoolGetDatum(stmt->pltrusted);
-       values[i++] = ObjectIdGetDatum(procOid);
-       values[i++] = ObjectIdGetDatum(valProcOid);
-       nulls[i] = 'n';                         /* lanacl */
+       namestrcpy(&langname, languageName);
+       values[i++] = NameGetDatum(&langname);                  /* lanname */
+       values[i++] = BoolGetDatum(true);                               /* lanispl */
+       values[i++] = BoolGetDatum(stmt->pltrusted);    /* lanpltrusted */
+       values[i++] = ObjectIdGetDatum(procOid);                /* lanplcallfoid */
+       values[i++] = ObjectIdGetDatum(valProcOid);             /* lanvalidator */
+       nulls[i] = 'n';                                                                 /* lanacl */
 
        rel = heap_openr(LanguageRelationName, RowExclusiveLock);
 
@@ -173,7 +175,7 @@ CreateProceduralLanguage(CreatePLangStmt *stmt)
 void
 DropProceduralLanguage(DropPLangStmt *stmt)
 {
-       char            languageName[NAMEDATALEN];
+       char       *languageName;
        HeapTuple       langTup;
        ObjectAddress object;
 
@@ -189,7 +191,7 @@ DropProceduralLanguage(DropPLangStmt *stmt)
         * Translate the language name, check that this language exist and is
         * a PL
         */
-       case_translate_language_name(stmt->plname, languageName);
+       languageName = case_translate_language_name(stmt->plname);
 
        langTup = SearchSysCache(LANGNAME,
                                                         CStringGetDatum(languageName),
index 57e020c10808d3879821a8d455eba7bd6b6aaf23..a94786690ed52e04190cdc4e43bab3a54aea0433 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/parser/keywords.c,v 1.144 2003/11/29 19:51:51 pgsql Exp $
+ *       $PostgreSQL: pgsql/src/backend/parser/keywords.c,v 1.145 2004/02/21 00:34:52 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -369,17 +369,13 @@ ScanKeywordLookup(const char *text)
 
        /*
         * Apply an ASCII-only downcasing.      We must not use tolower() since it
-        * may produce the wrong translation in some locales (eg, Turkish),
-        * and we don't trust isupper() very much either.  In an ASCII-based
-        * encoding the tests against A and Z are sufficient, but we also
-        * check isupper() so that we will work correctly under EBCDIC.  The
-        * actual case conversion step should work for either ASCII or EBCDIC.
+        * may produce the wrong translation in some locales (eg, Turkish).
         */
        for (i = 0; i < len; i++)
        {
                char            ch = text[i];
 
-               if (ch >= 'A' && ch <= 'Z' && isupper((unsigned char) ch))
+               if (ch >= 'A' && ch <= 'Z')
                        ch += 'a' - 'A';
                word[i] = ch;
        }
index 13cbfb9895e52478c2e8f84bd893ce1f6fb18945..caab9a002cf075298accde359be5d39c02e1e89d 100644 (file)
@@ -10,7 +10,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.113 2004/02/19 19:11:30 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.114 2004/02/21 00:34:52 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -27,6 +27,7 @@
 #include "parser/keywords.h"
 /* Not needed now that this file is compiled as part of gram.y */
 /* #include "parser/parse.h" */
+#include "parser/scansup.h"
 #include "utils/builtins.h"
 #include "mb/pg_wchar.h"
 
@@ -395,23 +396,15 @@ other                     .
                                        startlit();
                                }
 <xd>{xdstop}   {
+                                       char               *ident;
+
                                        BEGIN(INITIAL);
                                        if (literallen == 0)
                                                yyerror("zero-length delimited identifier");
+                                       ident = litbufdup();
                                        if (literallen >= NAMEDATALEN)
-                                       {
-                                               int len;
-
-                                               len = pg_mbcliplen(literalbuf, literallen,
-                                                                                  NAMEDATALEN-1);
-                                               ereport(NOTICE,
-                                                               (errcode(ERRCODE_NAME_TOO_LONG),
-                                                                errmsg("identifier \"%s\" will be truncated to \"%.*s\"",
-                                                                               literalbuf, len, literalbuf)));
-                                               literalbuf[len] = '\0';
-                                               literallen = len;
-                                       }
-                                       yylval.str = litbufdup();
+                                               truncate_identifier(ident, literallen, true);
+                                       yylval.str = ident;
                                        return IDENT;
                                }
 <xd>{xddouble} {
@@ -537,7 +530,6 @@ other                       .
 {identifier}   {
                                        const ScanKeyword *keyword;
                                        char               *ident;
-                                       int                             i;
 
                                        /* Is it a keyword? */
                                        keyword = ScanKeywordLookup(yytext);
@@ -550,28 +542,8 @@ other                      .
                                        /*
                                         * No.  Convert the identifier to lower case, and truncate
                                         * if necessary.
-                                        *
-                                        * Note: here we use a locale-dependent case conversion,
-                                        * which seems appropriate under standard SQL rules, whereas
-                                        * the keyword comparison was NOT locale-dependent.
                                         */
-                                       ident = pstrdup(yytext);
-                                       for (i = 0; ident[i]; i++)
-                                       {
-                                               if (isupper((unsigned char) ident[i]))
-                                                       ident[i] = tolower((unsigned char) ident[i]);
-                                       }
-                                       if (i >= NAMEDATALEN)
-                    {
-                                               int len;
-
-                                               len = pg_mbcliplen(ident, i, NAMEDATALEN-1);
-                                               ereport(NOTICE,
-                                                               (errcode(ERRCODE_NAME_TOO_LONG),
-                                                                errmsg("identifier \"%s\" will be truncated to \"%.*s\"",
-                                                                               ident, len, ident)));
-                                               ident[len] = '\0';
-                    }
+                                       ident = downcase_truncate_identifier(yytext, yyleng, true);
                                        yylval.str = ident;
                                        return IDENT;
                                }
index 9177b858a794f23e5644087e684576d4d979f416..76c620b394e595b04b87ddf3cc820f92a1275f33 100644 (file)
@@ -9,7 +9,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/parser/scansup.c,v 1.25 2003/11/29 19:51:52 pgsql Exp $
+ *       $PostgreSQL: pgsql/src/backend/parser/scansup.c,v 1.26 2004/02/21 00:34:53 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -19,6 +19,8 @@
 
 #include "miscadmin.h"
 #include "parser/scansup.h"
+#include "mb/pg_wchar.h"
+
 
 /* ----------------
  *             scanstr
@@ -32,7 +34,7 @@
  */
 
 char *
-scanstr(char *s)
+scanstr(const char *s)
 {
        char       *newStr;
        int                     len,
@@ -109,3 +111,75 @@ scanstr(char *s)
        newStr[j] = '\0';
        return newStr;
 }
+
+
+/*
+ * downcase_truncate_identifier() --- do appropriate downcasing and
+ * truncation of an unquoted identifier.  Optionally warn of truncation.
+ *
+ * Parameters:
+ *     ident   the identifier bytes to convert; read only, never modified
+ *     len             number of bytes of ident to process (exactly this many are
+ *                     read, so no terminator is required)
+ *     warn    passed through to truncate_identifier() to say whether a
+ *                     NOTICE should be issued if the result has to be shortened
+ *
+ * Returns a palloc'd string containing the adjusted identifier.
+ * The result is always null-terminated, and is a fresh allocation of
+ * len + 1 bytes even when no character needed changing.
+ *
+ * Note: in some usages the passed string is not null-terminated.
+ *
+ * Note: the API of this function is designed to allow for downcasing
+ * transformations that increase the string length, but we don't yet
+ * support that.  If you want to implement it, you'll need to fix
+ * SplitIdentifierString() in utils/adt/varlena.c.
+ */
+char *
+downcase_truncate_identifier(const char *ident, int len, bool warn)
+{
+       char       *result;
+       int                     i;
+
+       result = palloc(len + 1);       /* len bytes plus the terminator added below */
+       /*
+        * SQL99 specifies Unicode-aware case normalization, which we don't yet
+        * have the infrastructure for.  Instead we use tolower() to provide a
+        * locale-aware translation.  However, there are some locales where this
+        * is not right either (eg, Turkish may do strange things with 'i' and
+        * 'I').  Our current compromise is to use tolower() for characters with
+        * the high bit set, and use an ASCII-only downcasing for 7-bit
+        * characters.
+        *
+        * The conversion is done one byte at a time, so the output has exactly
+        * as many bytes as the input.
+        */
+       for (i = 0; i < len; i++)
+       {
+               unsigned char   ch = (unsigned char) ident[i];
+
+               if (ch >= 'A' && ch <= 'Z')
+                       ch += 'a' - 'A';        /* ASCII-only shift, independent of locale */
+               else if (ch >= 0x80 && isupper(ch))
+                       ch = tolower(ch);       /* high-bit bytes follow the current locale */
+               result[i] = (char) ch;
+       }
+       result[i] = '\0';       /* i == len here */
+
+       if (i >= NAMEDATALEN)   /* same test truncate_identifier() applies itself */
+               truncate_identifier(result, i, warn);
+
+       return result;
+}
+
+/*
+ * truncate_identifier() --- truncate an identifier to NAMEDATALEN-1 bytes.
+ *
+ * The given string is modified in-place, if necessary.  A warning is
+ * issued if requested.
+ *
+ * We require the caller to pass in the string length since this saves a
+ * strlen() call in some common usages.
+ *
+ * Parameters:
+ *     ident   the identifier to shorten; must be writable, and must be
+ *                     null-terminated when warn is true because the NOTICE prints
+ *                     it with %s
+ *     len             current length of ident in bytes, as supplied by the caller
+ *     warn    if true, report the truncation with a NOTICE
+ *                     (ERRCODE_NAME_TOO_LONG) showing both the original and the
+ *                     shortened spelling
+ *
+ * Nothing happens when len is less than NAMEDATALEN.
+ *
+ * NOTE(review): the cut point comes from pg_mbcliplen(); presumably that
+ * returns a byte count no larger than NAMEDATALEN-1 which does not split a
+ * multibyte character --- confirm against mb/pg_wchar.h.
+ */
+void
+truncate_identifier(char *ident, int len, bool warn)
+{
+       if (len >= NAMEDATALEN)
+       {
+               len = pg_mbcliplen(ident, len, NAMEDATALEN-1);  /* len is now the cut point */
+               if (warn)
+                       ereport(NOTICE,
+                                       (errcode(ERRCODE_NAME_TOO_LONG),
+                                        errmsg("identifier \"%s\" will be truncated to \"%.*s\"",
+                                                       ident, len, ident)));
+               ident[len] = '\0';      /* cut only after the NOTICE, so %s above shows the full name */
+       }
+}
index 3d96ce23ac5e5a74d590b7466656361ef2c42268..f329486321d96c8af9fe9b9e28a5196e65cb7f36 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.111 2004/01/31 05:09:40 neilc Exp $
+ *       $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.112 2004/02/21 00:34:53 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 
 #include <ctype.h>
 
-#include "mb/pg_wchar.h"
-#include "miscadmin.h"
 #include "access/tuptoaster.h"
 #include "catalog/pg_type.h"
 #include "lib/stringinfo.h"
 #include "libpq/crypt.h"
 #include "libpq/pqformat.h"
+#include "mb/pg_wchar.h"
+#include "miscadmin.h"
+#include "parser/scansup.h"
 #include "utils/array.h"
 #include "utils/builtins.h"
-#include "utils/pg_locale.h"
 #include "utils/lsyscache.h"
+#include "utils/pg_locale.h"
 
 
 typedef struct varlena unknown;
@@ -1695,7 +1696,6 @@ SplitIdentifierString(char *rawstring, char separator,
        {
                char       *curname;
                char       *endp;
-               int                     curlen;
 
                if (*nextp == '\"')
                {
@@ -1718,21 +1718,30 @@ SplitIdentifierString(char *rawstring, char separator,
                else
                {
                        /* Unquoted name --- extends to separator or whitespace */
+                       char       *downname;
+                       int                     len;
+
                        curname = nextp;
                        while (*nextp && *nextp != separator &&
                                   !isspace((unsigned char) *nextp))
-                       {
-                               /*
-                                * It's important that this match the identifier
-                                * downcasing code used by backend/parser/scan.l.
-                                */
-                               if (isupper((unsigned char) *nextp))
-                                       *nextp = tolower((unsigned char) *nextp);
                                nextp++;
-                       }
                        endp = nextp;
                        if (curname == nextp)
                                return false;   /* empty unquoted name not allowed */
+                       /*
+                        * Downcase the identifier, using same code as main lexer does.
+                        *
+                        * XXX because we want to overwrite the input in-place, we cannot
+                        * support a downcasing transformation that increases the
+                        * string length.  This is not a problem given the current
+                        * implementation of downcase_truncate_identifier, but we'll
+                        * probably have to do something about this someday.
+                        */
+                       len = endp - curname;
+                       downname = downcase_truncate_identifier(curname, len, false);
+                       Assert(strlen(downname) <= len);
+                       strncpy(curname, downname, len);
+                       pfree(downname);
                }
 
                while (isspace((unsigned char) *nextp))
@@ -1753,13 +1762,8 @@ SplitIdentifierString(char *rawstring, char separator,
                /* Now safe to overwrite separator with a null */
                *endp = '\0';
 
-               /* Truncate name if it's overlength; again, should match scan.l */
-               curlen = strlen(curname);
-               if (curlen >= NAMEDATALEN)
-               {
-                       curlen = pg_mbcliplen(curname, curlen, NAMEDATALEN - 1);
-                       curname[curlen] = '\0';
-               }
+               /* Truncate name if it's overlength */
+               truncate_identifier(curname, strlen(curname), false);
 
                /*
                 * Finished isolating current name --- add it to list
index a462dd55acb1c42c9313b7bce26f8c65f1cab9f3..00f5fa1a4801c23e8a72d0791da80f3fc2d5e236 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/commands/defrem.h,v 1.53 2003/11/29 22:40:59 pgsql Exp $
+ * $PostgreSQL: pgsql/src/include/commands/defrem.h,v 1.54 2004/02/21 00:34:53 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -61,7 +61,7 @@ extern void RenameOpClass(List *name, const char *access_method, const char *new
 
 /* support routines in commands/define.c */
 
-extern void case_translate_language_name(const char *input, char *output);
+extern char *case_translate_language_name(const char *input);
 
 extern char *defGetString(DefElem *def);
 extern double defGetNumeric(DefElem *def);
index caa2f5d1727d9541e776d6ea3b03149324a5fa4e..d710c81060a6daee4e305a5fcffc7a13c4e80c47 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/parser/scansup.h,v 1.14 2003/11/29 22:41:09 pgsql Exp $
+ * $PostgreSQL: pgsql/src/include/parser/scansup.h,v 1.15 2004/02/21 00:34:53 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 #ifndef SCANSUP_H
 #define SCANSUP_H
 
-extern char *scanstr(char *s);
+extern char *scanstr(const char *s);
+
+extern char *downcase_truncate_identifier(const char *ident, int len,
+                                                                                 bool warn);
+
+extern void truncate_identifier(char *ident, int len, bool warn);
 
 #endif   /* SCANSUP_H */
index b9a8a8b0ba940513881f5967b721874a0c9b1804..f49a2ac500c72da0526c68badbf5cba5bec49839 100644 (file)
@@ -3,7 +3,7 @@
  *                       procedural language
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/pl/plpgsql/src/pl_funcs.c,v 1.31 2003/11/29 19:52:12 pgsql Exp $
+ *       $PostgreSQL: pgsql/src/pl/plpgsql/src/pl_funcs.c,v 1.32 2004/02/21 00:34:53 tgl Exp $
  *
  *       This software is copyrighted by Jan Wieck - Hamburg.
  *
@@ -40,7 +40,7 @@
 
 #include <ctype.h>
 
-#include "mb/pg_wchar.h"
+#include "parser/scansup.h"
 
 
 /* ----------
@@ -348,15 +348,15 @@ plpgsql_convert_ident(const char *s, char **output, int numidents)
        {
                char       *curident;
                char       *cp;
-               int                     i;
 
                /* Process current identifier */
-               curident = palloc(strlen(s) + 1);               /* surely enough room */
-               cp = curident;
 
                if (*s == '"')
                {
                        /* Quoted identifier: copy, collapsing out doubled quotes */
+
+                       curident = palloc(strlen(s) + 1); /* surely enough room */
+                       cp = curident;
                        s++;
                        while (*s)
                        {
@@ -373,35 +373,20 @@ plpgsql_convert_ident(const char *s, char **output, int numidents)
                                                (errcode(ERRCODE_SYNTAX_ERROR),
                                                 errmsg("unterminated \" in name: %s", sstart)));
                        s++;
+                       *cp = '\0';
+                       /* Truncate to NAMEDATALEN */
+                       truncate_identifier(curident, cp-curident, false);
                }
                else
                {
-                       /*
-                        * Normal identifier: downcase, stop at dot or whitespace.
-                        *
-                        * Note that downcasing is locale-sensitive, following SQL99
-                        * rules for identifiers.  We have already decided that the
-                        * item is not a PLPGSQL keyword.
-                        */
-                       while (*s && *s != '.' && !isspace((unsigned char) *s))
-                       {
-                               if (isupper((unsigned char) *s))
-                                       *cp++ = tolower((unsigned char) *s++);
-                               else
-                                       *cp++ = *s++;
-                       }
-               }
-
-               /* Truncate to NAMEDATALEN */
-               *cp = '\0';
-               i = cp - curident;
-
-               if (i >= NAMEDATALEN)
-               {
-                       int                     len;
+                       /* Normal identifier: extends till dot or whitespace */
+                       const char *thisstart = s;
 
-                       len = pg_mbcliplen(curident, i, NAMEDATALEN - 1);
-                       curident[len] = '\0';
+                       while (*s && *s != '.' && !isspace((unsigned char) *s))
+                               s++;
+                       /* Downcase and truncate to NAMEDATALEN */
+                       curident = downcase_truncate_identifier(thisstart, s-thisstart,
+                                                                                                       false);
                }
 
                /* Pass ident to caller */