]> granicus.if.org Git - postgresql/commitdiff
Prevent to_number() from losing data when template doesn't match exactly.
authorTom Lane <tgl@sss.pgh.pa.us>
Fri, 17 Nov 2017 17:04:06 +0000 (12:04 -0500)
committerTom Lane <tgl@sss.pgh.pa.us>
Fri, 17 Nov 2017 17:04:13 +0000 (12:04 -0500)
Non-data template patterns would consume characters whether or not those
characters were what the pattern expected, for example
SELECT TO_NUMBER('1234', '9,999');
produced 134 because the '2' got eaten by the comma pattern.  This seems
undesirable, not least because it doesn't happen in Oracle.  For the ','
and 'G' template patterns, we can fix this by consuming characters only
if they match what the pattern would output.  For non-data patterns such
as 'L' and 'TH', it seems impractical to tighten things up to the point of
consuming only exact matches to what the pattern would output; but we can
improve matters quite a lot by redefining the behavior as "consume only
characters that aren't digits, signs, decimal point, or comma".

Also, fix it so that the behavior is to consume the number of *characters*
the pattern would output, not the number of *bytes*.  The old coding would
do surprising things with non-ASCII currency symbols, for example.  (It
would be good to apply that rule for literal text as well, but this commit
only fixes it for non-data patterns.)

Oliver Ford, reviewed by Thomas Munro and Nathan Wagner, and whacked around
a bit more by me

Discussion: https://postgr.es/m/CAGMVOdvpbMqPf9XWNzOwBpzJfErkydr_fEGhmuDGa015z97mwg@mail.gmail.com

doc/src/sgml/func.sgml
src/backend/utils/adt/formatting.c
src/test/regress/expected/numeric.out
src/test/regress/sql/numeric.sql

index f901567f7e2fe6dc88e0a6ef3d5031d48e8423d4..35a845c4001cfd4de0da381b44e8408f6e4693c2 100644 (file)
@@ -5850,7 +5850,10 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}');
     data based on the given value.  Any text that is not a template pattern is
     simply copied verbatim.  Similarly, in an input template string (for the
     other functions), template patterns identify the values to be supplied by
-    the input data string.
+    the input data string.  If there are characters in the template string
+    that are not template patterns, the corresponding characters in the input
+    data string are simply skipped over (whether or not they are equal to the
+    template string characters).
    </para>
 
   <para>
@@ -6176,13 +6179,15 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}');
        Ordinary text is allowed in <function>to_char</function>
        templates and will be output literally.  You can put a substring
        in double quotes to force it to be interpreted as literal text
-       even if it contains pattern key words.  For example, in
+       even if it contains template patterns.  For example, in
        <literal>'"Hello Year "YYYY'</literal>, the <literal>YYYY</literal>
        will be replaced by the year data, but the single <literal>Y</literal> in <literal>Year</literal>
-       will not be.  In <function>to_date</function>, <function>to_number</function>,
-       and <function>to_timestamp</function>, double-quoted strings skip the number of
-       input characters contained in the string, e.g. <literal>"XX"</literal>
-       skips two input characters.
+       will not be.
+       In <function>to_date</function>, <function>to_number</function>,
+       and <function>to_timestamp</function>, literal text and double-quoted
+       strings result in skipping the number of characters contained in the
+       string; for example <literal>"XX"</literal> skips two input characters
+       (whether or not they are <literal>XX</literal>).
       </para>
      </listitem>
 
@@ -6483,6 +6488,17 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}');
       </para>
      </listitem>
 
+     <listitem>
+      <para>
+       In <function>to_number</function>, if non-data template patterns such
+       as <literal>L</literal> or <literal>TH</literal> are used, the
+       corresponding number of input characters are skipped, whether or not
+       they match the template pattern, unless they are data characters
+       (that is, digits, sign, decimal point, or comma).  For
+       example, <literal>TH</literal> would skip two non-data characters.
+      </para>
+     </listitem>
+
      <listitem>
       <para>
        <literal>V</literal> with <function>to_char</function>
index 50254f238885057b38655fc24df33739fcefb351..5afc293a5a0d93e91fb2d5a9600e51944b5ad025 100644 (file)
@@ -988,7 +988,7 @@ static char *get_last_relevant_decnum(char *num);
 static void NUM_numpart_from_char(NUMProc *Np, int id, int input_len);
 static void NUM_numpart_to_char(NUMProc *Np, int id);
 static char *NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
-                         char *number, int from_char_input_len, int to_char_out_pre_spaces,
+                         char *number, int input_len, int to_char_out_pre_spaces,
                          int sign, bool is_to_char, Oid collid);
 static DCHCacheEntry *DCH_cache_getnew(const char *str);
 static DCHCacheEntry *DCH_cache_search(const char *str);
@@ -4232,6 +4232,14 @@ get_last_relevant_decnum(char *num)
        return result;
 }
 
+/*
+ * These macros are used in NUM_processor() and its subsidiary routines.
+ * OVERLOAD_TEST: true if we've reached end of input string
+ * AMOUNT_TEST(s): true if at least s characters remain in string
+ */
+#define OVERLOAD_TEST  (Np->inout_p >= Np->inout + input_len)
+#define AMOUNT_TEST(s) (Np->inout_p <= Np->inout + (input_len - (s)))
+
 /* ----------
  * Number extraction for TO_NUMBER()
  * ----------
@@ -4246,9 +4254,6 @@ NUM_numpart_from_char(NUMProc *Np, int id, int input_len)
                 (id == NUM_0 || id == NUM_9) ? "NUM_0/9" : id == NUM_DEC ? "NUM_DEC" : "???");
 #endif
 
-#define OVERLOAD_TEST  (Np->inout_p >= Np->inout + input_len)
-#define AMOUNT_TEST(_s) (input_len-(Np->inout_p-Np->inout) >= _s)
-
        if (OVERLOAD_TEST)
                return;
 
@@ -4641,14 +4646,32 @@ NUM_numpart_to_char(NUMProc *Np, int id)
        ++Np->num_curr;
 }
 
+/*
+ * Skip over "n" input characters, but only if they aren't numeric data
+ */
+static void
+NUM_eat_non_data_chars(NUMProc *Np, int n, int input_len)
+{
+       while (n-- > 0)
+       {
+               if (OVERLOAD_TEST)
+                       break;                          /* end of input */
+               if (strchr("0123456789.,+-", *Np->inout_p) != NULL)
+                       break;                          /* it's a data character */
+               Np->inout_p += pg_mblen(Np->inout_p);
+       }
+}
+
 static char *
 NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
-                         char *number, int from_char_input_len, int to_char_out_pre_spaces,
+                         char *number, int input_len, int to_char_out_pre_spaces,
                          int sign, bool is_to_char, Oid collid)
 {
        FormatNode *n;
        NUMProc         _Np,
                           *Np = &_Np;
+       const char *pattern;
+       int                     pattern_len;
 
        MemSet(Np, 0, sizeof(NUMProc));
 
@@ -4816,9 +4839,11 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
                if (!Np->is_to_char)
                {
                        /*
-                        * Check non-string inout end
+                        * Check at least one character remains to be scanned.  (In
+                        * actions below, must use AMOUNT_TEST if we want to read more
+                        * characters than that.)
                         */
-                       if (Np->inout_p >= Np->inout + from_char_input_len)
+                       if (OVERLOAD_TEST)
                                break;
                }
 
@@ -4828,12 +4853,16 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
                if (n->type == NODE_TYPE_ACTION)
                {
                        /*
-                        * Create/reading digit/zero/blank/sing
+                        * Create/read digit/zero/blank/sign/special-case
                         *
                         * 'NUM_S' note: The locale sign is anchored to number and we
                         * read/write it when we work with first or last number
-                        * (NUM_0/NUM_9). This is reason why NUM_S missing in follow
-                        * switch().
+                        * (NUM_0/NUM_9).  This is why NUM_S is missing in switch().
+                        *
+                        * Notice the "Np->inout_p++" at the bottom of the loop.  This is
+                        * why most of the actions advance inout_p one less than you might
+                        * expect.  In cases where we don't want that increment to happen,
+                        * a switch case ends with "continue" not "break".
                         */
                        switch (n->key->id)
                        {
@@ -4848,7 +4877,7 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
                                        }
                                        else
                                        {
-                                               NUM_numpart_from_char(Np, n->key->id, from_char_input_len);
+                                               NUM_numpart_from_char(Np, n->key->id, input_len);
                                                break;  /* switch() case: */
                                        }
 
@@ -4872,10 +4901,14 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
                                                        if (IS_FILLMODE(Np->Num))
                                                                continue;
                                                }
+                                               if (*Np->inout_p != ',')
+                                                       continue;
                                        }
                                        break;
 
                                case NUM_G:
+                                       pattern = Np->L_thousands_sep;
+                                       pattern_len = strlen(pattern);
                                        if (Np->is_to_char)
                                        {
                                                if (!Np->num_in)
@@ -4884,16 +4917,16 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
                                                                continue;
                                                        else
                                                        {
-                                                               int                     x = strlen(Np->L_thousands_sep);
-
-                                                               memset(Np->inout_p, ' ', x);
-                                                               Np->inout_p += x - 1;
+                                                               /* just in case there are MB chars */
+                                                               pattern_len = pg_mbstrlen(pattern);
+                                                               memset(Np->inout_p, ' ', pattern_len);
+                                                               Np->inout_p += pattern_len - 1;
                                                        }
                                                }
                                                else
                                                {
-                                                       strcpy(Np->inout_p, Np->L_thousands_sep);
-                                                       Np->inout_p += strlen(Np->inout_p) - 1;
+                                                       strcpy(Np->inout_p, pattern);
+                                                       Np->inout_p += pattern_len - 1;
                                                }
                                        }
                                        else
@@ -4903,18 +4936,33 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
                                                        if (IS_FILLMODE(Np->Num))
                                                                continue;
                                                }
-                                               Np->inout_p += strlen(Np->L_thousands_sep) - 1;
+
+                                               /*
+                                                * Because L_thousands_sep typically contains data
+                                                * characters (either '.' or ','), we can't use
+                                                * NUM_eat_non_data_chars here.  Instead skip only if
+                                                * the input matches L_thousands_sep.
+                                                */
+                                               if (AMOUNT_TEST(pattern_len) &&
+                                                       strncmp(Np->inout_p, pattern, pattern_len) == 0)
+                                                       Np->inout_p += pattern_len - 1;
+                                               else
+                                                       continue;
                                        }
                                        break;
 
                                case NUM_L:
+                                       pattern = Np->L_currency_symbol;
                                        if (Np->is_to_char)
                                        {
-                                               strcpy(Np->inout_p, Np->L_currency_symbol);
-                                               Np->inout_p += strlen(Np->inout_p) - 1;
+                                               strcpy(Np->inout_p, pattern);
+                                               Np->inout_p += strlen(pattern) - 1;
                                        }
                                        else
-                                               Np->inout_p += strlen(Np->L_currency_symbol) - 1;
+                                       {
+                                               NUM_eat_non_data_chars(Np, pg_mbstrlen(pattern), input_len);
+                                               continue;
+                                       }
                                        break;
 
                                case NUM_RN:
@@ -4949,8 +4997,16 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
                                                continue;
 
                                        if (Np->is_to_char)
+                                       {
                                                strcpy(Np->inout_p, get_th(Np->number, TH_LOWER));
-                                       Np->inout_p += 1;
+                                               Np->inout_p += 1;
+                                       }
+                                       else
+                                       {
+                                               /* All variants of 'th' occupy 2 characters */
+                                               NUM_eat_non_data_chars(Np, 2, input_len);
+                                               continue;
+                                       }
                                        break;
 
                                case NUM_TH:
@@ -4959,8 +5015,16 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
                                                continue;
 
                                        if (Np->is_to_char)
+                                       {
                                                strcpy(Np->inout_p, get_th(Np->number, TH_UPPER));
-                                       Np->inout_p += 1;
+                                               Np->inout_p += 1;
+                                       }
+                                       else
+                                       {
+                                               /* All variants of 'TH' occupy 2 characters */
+                                               NUM_eat_non_data_chars(Np, 2, input_len);
+                                               continue;
+                                       }
                                        break;
 
                                case NUM_MI:
@@ -4977,6 +5041,11 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
                                        {
                                                if (*Np->inout_p == '-')
                                                        *Np->number = '-';
+                                               else
+                                               {
+                                                       NUM_eat_non_data_chars(Np, 1, input_len);
+                                                       continue;
+                                               }
                                        }
                                        break;
 
@@ -4994,23 +5063,31 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
                                        {
                                                if (*Np->inout_p == '+')
                                                        *Np->number = '+';
+                                               else
+                                               {
+                                                       NUM_eat_non_data_chars(Np, 1, input_len);
+                                                       continue;
+                                               }
                                        }
                                        break;
 
                                case NUM_SG:
                                        if (Np->is_to_char)
                                                *Np->inout_p = Np->sign;
-
                                        else
                                        {
                                                if (*Np->inout_p == '-')
                                                        *Np->number = '-';
                                                else if (*Np->inout_p == '+')
                                                        *Np->number = '+';
+                                               else
+                                               {
+                                                       NUM_eat_non_data_chars(Np, 1, input_len);
+                                                       continue;
+                                               }
                                        }
                                        break;
 
-
                                default:
                                        continue;
                                        break;
@@ -5019,7 +5096,12 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
                else
                {
                        /*
-                        * Remove to output char from input in TO_CHAR
+                        * In TO_CHAR, non-pattern characters in the format are copied to
+                        * the output.  In TO_NUMBER, we skip one input character for each
+                        * non-pattern format character, whether or not it matches the
+                        * format character.  (Currently, that's actually implemented as
+                        * skipping one input byte per non-pattern format byte, which is
+                        * wrong...)
                         */
                        if (Np->is_to_char)
                                *Np->inout_p = n->character;
index 7e55b0e2931bf3228064753947d4ad7660d42df5..a96bfc0eb04952eac7b19d9ee1a5bcec992999aa 100644 (file)
@@ -1219,6 +1219,7 @@ SELECT '' AS to_char_26, to_char('100'::numeric, 'FM999');
 
 -- TO_NUMBER()
 --
+SET lc_numeric = 'C';
 SELECT '' AS to_number_1,  to_number('-34,338,492', '99G999G999');
  to_number_1 | to_number 
 -------------+-----------
@@ -1297,6 +1298,61 @@ SELECT '' AS to_number_13, to_number(' . 0 1-', ' 9 9 . 9 9 S');
               |     -0.01
 (1 row)
 
+SELECT '' AS to_number_14, to_number('34,50','999,99');
+ to_number_14 | to_number 
+--------------+-----------
+              |      3450
+(1 row)
+
+SELECT '' AS to_number_15, to_number('123,000','999G');
+ to_number_15 | to_number 
+--------------+-----------
+              |       123
+(1 row)
+
+SELECT '' AS to_number_16, to_number('123456','999G999');
+ to_number_16 | to_number 
+--------------+-----------
+              |    123456
+(1 row)
+
+SELECT '' AS to_number_17, to_number('$1234.56','L9,999.99');
+ to_number_17 | to_number 
+--------------+-----------
+              |   1234.56
+(1 row)
+
+SELECT '' AS to_number_18, to_number('$1234.56','L99,999.99');
+ to_number_18 | to_number 
+--------------+-----------
+              |   1234.56
+(1 row)
+
+SELECT '' AS to_number_19, to_number('$1,234.56','L99,999.99');
+ to_number_19 | to_number 
+--------------+-----------
+              |   1234.56
+(1 row)
+
+SELECT '' AS to_number_20, to_number('1234.56','L99,999.99');
+ to_number_20 | to_number 
+--------------+-----------
+              |   1234.56
+(1 row)
+
+SELECT '' AS to_number_21, to_number('1,234.56','L99,999.99');
+ to_number_21 | to_number 
+--------------+-----------
+              |   1234.56
+(1 row)
+
+SELECT '' AS to_number_22, to_number('42nd', '99th');
+ to_number_22 | to_number 
+--------------+-----------
+              |        42
+(1 row)
+
+RESET lc_numeric;
 --
 -- Input syntax
 --
index 9675b6eabf3110da2683824c9a2bfd24914b88d0..321c7bdf7c55a6635854bdf7124c5e6b2df97155 100644 (file)
@@ -788,6 +788,7 @@ SELECT '' AS to_char_26, to_char('100'::numeric, 'FM999');
 
 -- TO_NUMBER()
 --
+SET lc_numeric = 'C';
 SELECT '' AS to_number_1,  to_number('-34,338,492', '99G999G999');
 SELECT '' AS to_number_2,  to_number('-34,338,492.654,878', '99G999G999D999G999');
 SELECT '' AS to_number_3,  to_number('<564646.654564>', '999999.999999PR');
@@ -801,6 +802,16 @@ SELECT '' AS to_number_10, to_number('0', '99.99');
 SELECT '' AS to_number_11, to_number('.-01', 'S99.99');
 SELECT '' AS to_number_12, to_number('.01-', '99.99S');
 SELECT '' AS to_number_13, to_number(' . 0 1-', ' 9 9 . 9 9 S');
+SELECT '' AS to_number_14, to_number('34,50','999,99');
+SELECT '' AS to_number_15, to_number('123,000','999G');
+SELECT '' AS to_number_16, to_number('123456','999G999');
+SELECT '' AS to_number_17, to_number('$1234.56','L9,999.99');
+SELECT '' AS to_number_18, to_number('$1234.56','L99,999.99');
+SELECT '' AS to_number_19, to_number('$1,234.56','L99,999.99');
+SELECT '' AS to_number_20, to_number('1234.56','L99,999.99');
+SELECT '' AS to_number_21, to_number('1,234.56','L99,999.99');
+SELECT '' AS to_number_22, to_number('42nd', '99th');
+RESET lc_numeric;
 
 --
 -- Input syntax