]> granicus.if.org Git - postgresql/commitdiff
Improve to_date/to_number/to_timestamp behavior with multibyte characters.
authorTom Lane <tgl@sss.pgh.pa.us>
Sat, 18 Nov 2017 17:42:52 +0000 (12:42 -0500)
committerTom Lane <tgl@sss.pgh.pa.us>
Sat, 18 Nov 2017 17:42:52 +0000 (12:42 -0500)
The documentation says that these functions skip one input character
per literal (non-pattern) format character.  Actually, though, they
skipped one input *byte* per literal *byte*, which could be hugely
confusing if either data or format contained multibyte characters.

To fix, adjust the FormatNode representation and parse_format() so
that multibyte format characters are stored as one FormatNode not
several, and adjust the data-skipping bits to advance by pg_mblen()
not necessarily one byte.  There's no user-visible behavior change
on the to_char() side, although the internal representation changes.

Commit e87d4965b had already fixed most places where we skip characters
on the basis of non-literal format patterns to advance by characters
not bytes, but this gets one more place, the SKIP_THth macro.  I think
everything in formatting.c gets that right now.

It'd be nice to have some regression test cases covering this behavior;
but of course there's no way to do so in an encoding-agnostic way, and
many of the interesting aspects would also require unportable locale
selections.  So I've not bothered here.

Discussion: https://postgr.es/m/28186.1510957703@sss.pgh.pa.us

src/backend/utils/adt/formatting.c

index cb0dbf748e501f0ba002efe37d48e9e9c4df12a8..ec97de0ad27ee9c29731f9138c965fb07c1f103a 100644 (file)
@@ -151,8 +151,6 @@ typedef enum
        FROM_CHAR_DATE_ISOWEEK          /* ISO 8601 week date */
 } FromCharDateMode;
 
-typedef struct FormatNode FormatNode;
-
 typedef struct
 {
        const char *name;
@@ -162,13 +160,13 @@ typedef struct
        FromCharDateMode date_mode;
 } KeyWord;
 
-struct FormatNode
+typedef struct
 {
-       int                     type;                   /* node type                    */
-       const KeyWord *key;                     /* if node type is KEYWORD      */
-       char            character;              /* if node type is CHAR         */
-       int                     suffix;                 /* keyword suffix               */
-};
+       int                     type;                   /* NODE_TYPE_XXX, see below */
+       const KeyWord *key;                     /* if type is ACTION */
+       char            character[MAX_MULTIBYTE_CHAR_LEN + 1];  /* if type is CHAR */
+       int                     suffix;                 /* keyword prefix/suffix code, if any */
+} FormatNode;
 
 #define NODE_TYPE_END          1
 #define NODE_TYPE_ACTION       2
@@ -1282,12 +1280,15 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw,
                }
                else if (*str)
                {
+                       int                     chlen;
+
                        /*
                         * Process double-quoted literal string, if any
                         */
                        if (*str == '"')
                        {
-                               while (*(++str))
+                               str++;
+                               while (*str)
                                {
                                        if (*str == '"')
                                        {
@@ -1297,11 +1298,14 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw,
                                        /* backslash quotes the next character, if any */
                                        if (*str == '\\' && *(str + 1))
                                                str++;
+                                       chlen = pg_mblen(str);
                                        n->type = NODE_TYPE_CHAR;
-                                       n->character = *str;
+                                       memcpy(n->character, str, chlen);
+                                       n->character[chlen] = '\0';
                                        n->key = NULL;
                                        n->suffix = 0;
                                        n++;
+                                       str += chlen;
                                }
                        }
                        else
@@ -1312,12 +1316,14 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw,
                                 */
                                if (*str == '\\' && *(str + 1) == '"')
                                        str++;
+                               chlen = pg_mblen(str);
                                n->type = NODE_TYPE_CHAR;
-                               n->character = *str;
+                               memcpy(n->character, str, chlen);
+                               n->character[chlen] = '\0';
                                n->key = NULL;
                                n->suffix = 0;
                                n++;
-                               str++;
+                               str += chlen;
                        }
                }
        }
@@ -1349,7 +1355,8 @@ dump_node(FormatNode *node, int max)
                        elog(DEBUG_elog_output, "%d:\t NODE_TYPE_ACTION '%s'\t(%s,%s)",
                                 a, n->key->name, DUMP_THth(n->suffix), DUMP_FM(n->suffix));
                else if (n->type == NODE_TYPE_CHAR)
-                       elog(DEBUG_elog_output, "%d:\t NODE_TYPE_CHAR '%c'", a, n->character);
+                       elog(DEBUG_elog_output, "%d:\t NODE_TYPE_CHAR '%s'",
+                                a, n->character);
                else if (n->type == NODE_TYPE_END)
                {
                        elog(DEBUG_elog_output, "%d:\t NODE_TYPE_END", a);
@@ -2008,8 +2015,8 @@ asc_toupper_z(const char *buff)
        do { \
                if (S_THth(_suf)) \
                { \
-                       if (*(ptr)) (ptr)++; \
-                       if (*(ptr)) (ptr)++; \
+                       if (*(ptr)) (ptr) += pg_mblen(ptr); \
+                       if (*(ptr)) (ptr) += pg_mblen(ptr); \
                } \
        } while (0)
 
@@ -2076,7 +2083,8 @@ is_next_separator(FormatNode *n)
 
                return true;
        }
-       else if (isdigit((unsigned char) n->character))
+       else if (n->character[1] == '\0' &&
+                        isdigit((unsigned char) n->character[0]))
                return false;
 
        return true;                            /* some non-digit input (separator) */
@@ -2405,8 +2413,8 @@ DCH_to_char(FormatNode *node, bool is_interval, TmToChar *in, char *out, Oid col
        {
                if (n->type != NODE_TYPE_ACTION)
                {
-                       *s = n->character;
-                       s++;
+                       strcpy(s, n->character);
+                       s += strlen(s);
                        continue;
                }
 
@@ -2974,7 +2982,7 @@ DCH_from_char(FormatNode *node, char *in, TmFromChar *out)
                         * we don't insist that the consumed character match the format's
                         * character.
                         */
-                       s++;
+                       s += pg_mblen(s);
                        continue;
                }
 
@@ -4217,7 +4225,7 @@ get_last_relevant_decnum(char *num)
 /*
  * These macros are used in NUM_processor() and its subsidiary routines.
  * OVERLOAD_TEST: true if we've reached end of input string
- * AMOUNT_TEST(s): true if at least s characters remain in string
+ * AMOUNT_TEST(s): true if at least s bytes remain in string
  */
 #define OVERLOAD_TEST  (Np->inout_p >= Np->inout + input_len)
 #define AMOUNT_TEST(s) (Np->inout_p <= Np->inout + (input_len - (s)))
@@ -4821,9 +4829,9 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
                if (!Np->is_to_char)
                {
                        /*
-                        * Check at least one character remains to be scanned.  (In
-                        * actions below, must use AMOUNT_TEST if we want to read more
-                        * characters than that.)
+                        * Check at least one byte remains to be scanned.  (In actions
+                        * below, must use AMOUNT_TEST if we want to read more bytes than
+                        * that.)
                         */
                        if (OVERLOAD_TEST)
                                break;
@@ -5081,12 +5089,18 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
                         * In TO_CHAR, non-pattern characters in the format are copied to
                         * the output.  In TO_NUMBER, we skip one input character for each
                         * non-pattern format character, whether or not it matches the
-                        * format character.  (Currently, that's actually implemented as
-                        * skipping one input byte per non-pattern format byte, which is
-                        * wrong...)
+                        * format character.
                         */
                        if (Np->is_to_char)
-                               *Np->inout_p = n->character;
+                       {
+                               strcpy(Np->inout_p, n->character);
+                               Np->inout_p += strlen(Np->inout_p);
+                       }
+                       else
+                       {
+                               Np->inout_p += pg_mblen(Np->inout_p);
+                       }
+                       continue;
                }
                Np->inout_p++;
        }