Prevent to_number() from losing data when template doesn't match exactly.

author Tom Lane <tgl@sss.pgh.pa.us>

Fri, 17 Nov 2017 17:04:06 +0000 (12:04 -0500)

committer Tom Lane <tgl@sss.pgh.pa.us>

Fri, 17 Nov 2017 17:04:13 +0000 (12:04 -0500)
author Tom Lane <tgl@sss.pgh.pa.us>
Fri, 17 Nov 2017 17:04:06 +0000 (12:04 -0500)
committer Tom Lane <tgl@sss.pgh.pa.us>
Fri, 17 Nov 2017 17:04:13 +0000 (12:04 -0500)
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml

index f901567f7e2fe6dc88e0a6ef3d5031d48e8423d4..35a845c4001cfd4de0da381b44e8408f6e4693c2 100644 (file)
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -5850,7 +5850,10 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}');
      data based on the given value.  Any text that is not a template pattern is
      simply copied verbatim.  Similarly, in an input template string (for the
      other functions), template patterns identify the values to be supplied by
-    the input data string.
+    the input data string.  If there are characters in the template string
+    that are not template patterns, the corresponding characters in the input
+    data string are simply skipped over (whether or not they are equal to the
+    template string characters).
     </para>
  
    <para>
@@ -6176,13 +6179,15 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}');
         Ordinary text is allowed in <function>to_char</function>
         templates and will be output literally.  You can put a substring
         in double quotes to force it to be interpreted as literal text
-       even if it contains pattern key words.  For example, in
+       even if it contains template patterns.  For example, in
         <literal>'"Hello Year "YYYY'</literal>, the <literal>YYYY</literal>
         will be replaced by the year data, but the single <literal>Y</literal> in <literal>Year</literal>
-       will not be.  In <function>to_date</function>, <function>to_number</function>,
-       and <function>to_timestamp</function>, double-quoted strings skip the number of
-       input characters contained in the string, e.g. <literal>"XX"</literal>
-       skips two input characters.
+       will not be.
+       In <function>to_date</function>, <function>to_number</function>,
+       and <function>to_timestamp</function>, literal text and double-quoted
+       strings result in skipping the number of characters contained in the
+       string; for example <literal>"XX"</literal> skips two input characters
+       (whether or not they are <literal>XX</literal>).
        </para>
       </listitem>
  
@@ -6483,6 +6488,17 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}');
        </para>
       </listitem>
  
+     <listitem>
+      <para>
+       In <function>to_number</function>, if non-data template patterns such
+       as <literal>L</literal> or <literal>TH</literal> are used, the
+       corresponding number of input characters are skipped, whether or not
+       they match the template pattern, unless they are data characters
+       (that is, digits, sign, decimal point, or comma).  For
+       example, <literal>TH</literal> would skip two non-data characters.
+      </para>
+     </listitem>
+
       <listitem>
        <para>
         <literal>V</literal> with <function>to_char</function>
diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c

index 50254f238885057b38655fc24df33739fcefb351..5afc293a5a0d93e91fb2d5a9600e51944b5ad025 100644 (file)
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@@ -988,7 +988,7 @@ static char *get_last_relevant_decnum(char *num);
  static void NUM_numpart_from_char(NUMProc *Np, int id, int input_len);
  static void NUM_numpart_to_char(NUMProc *Np, int id);
  static char *NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
-                         char *number, int from_char_input_len, int to_char_out_pre_spaces,
+                         char *number, int input_len, int to_char_out_pre_spaces,
                           int sign, bool is_to_char, Oid collid);
  static DCHCacheEntry *DCH_cache_getnew(const char *str);
  static DCHCacheEntry *DCH_cache_search(const char *str);
@@ -4232,6 +4232,14 @@ get_last_relevant_decnum(char *num)
         return result;
  }
  
+/*
+ * These macros are used in NUM_processor() and its subsidiary routines.
+ * OVERLOAD_TEST: true if we've reached end of input string
+ * AMOUNT_TEST(s): true if at least s characters remain in string
+ */
+#define OVERLOAD_TEST  (Np->inout_p >= Np->inout + input_len)
+#define AMOUNT_TEST(s) (Np->inout_p <= Np->inout + (input_len - (s)))
+
  /* ----------
   * Number extraction for TO_NUMBER()
   * ----------
@@ -4246,9 +4254,6 @@ NUM_numpart_from_char(NUMProc *Np, int id, int input_len)
                  (id == NUM_0 || id == NUM_9) ? "NUM_0/9" : id == NUM_DEC ? "NUM_DEC" : "???");
  #endif
  
-#define OVERLOAD_TEST  (Np->inout_p >= Np->inout + input_len)
-#define AMOUNT_TEST(_s) (input_len-(Np->inout_p-Np->inout) >= _s)
-
         if (OVERLOAD_TEST)
                 return;
  
@@ -4641,14 +4646,32 @@ NUM_numpart_to_char(NUMProc *Np, int id)
         ++Np->num_curr;
  }
  
+/*
+ * Skip over "n" input characters, but only if they aren't numeric data
+ */
+static void
+NUM_eat_non_data_chars(NUMProc *Np, int n, int input_len)
+{
+       while (n-- > 0)
+       {
+               if (OVERLOAD_TEST)
+                       break;                          /* end of input */
+               if (strchr("0123456789.,+-", *Np->inout_p) != NULL)
+                       break;                          /* it's a data character */
+               Np->inout_p += pg_mblen(Np->inout_p);
+       }
+}
+
  static char *
  NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
-                         char *number, int from_char_input_len, int to_char_out_pre_spaces,
+                         char *number, int input_len, int to_char_out_pre_spaces,
                           int sign, bool is_to_char, Oid collid)
  {
         FormatNode *n;
         NUMProc         _Np,
                            *Np = &_Np;
+       const char *pattern;
+       int                     pattern_len;
  
         MemSet(Np, 0, sizeof(NUMProc));
  
@@ -4816,9 +4839,11 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
                 if (!Np->is_to_char)
                 {
                         /*
-                        * Check non-string inout end
+                        * Check at least one character remains to be scanned.  (In
+                        * actions below, must use AMOUNT_TEST if we want to read more
+                        * characters than that.)
                          */
-                       if (Np->inout_p >= Np->inout + from_char_input_len)
+                       if (OVERLOAD_TEST)
                                 break;
                 }
  
@@ -4828,12 +4853,16 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
                 if (n->type == NODE_TYPE_ACTION)
                 {
                         /*
-                        * Create/reading digit/zero/blank/sing
+                        * Create/read digit/zero/blank/sign/special-case
                          *
                          * 'NUM_S' note: The locale sign is anchored to number and we
                          * read/write it when we work with first or last number
-                        * (NUM_0/NUM_9). This is reason why NUM_S missing in follow
-                        * switch().
+                        * (NUM_0/NUM_9).  This is why NUM_S is missing in switch().
+                        *
+                        * Notice the "Np->inout_p++" at the bottom of the loop.  This is
+                        * why most of the actions advance inout_p one less than you might
+                        * expect.  In cases where we don't want that increment to happen,
+                        * a switch case ends with "continue" not "break".
                          */
                         switch (n->key->id)
                         {
@@ -4848,7 +4877,7 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
                                         }
                                         else
                                         {
-                                               NUM_numpart_from_char(Np, n->key->id, from_char_input_len);
+                                               NUM_numpart_from_char(Np, n->key->id, input_len);
                                                 break;  /* switch() case: */
                                         }
  
@@ -4872,10 +4901,14 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
                                                         if (IS_FILLMODE(Np->Num))
                                                                 continue;
                                                 }
+                                               if (*Np->inout_p != ',')
+                                                       continue;
                                         }
                                         break;
  
                                 case NUM_G:
+                                       pattern = Np->L_thousands_sep;
+                                       pattern_len = strlen(pattern);
                                         if (Np->is_to_char)
                                         {
                                                 if (!Np->num_in)
@@ -4884,16 +4917,16 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
                                                                 continue;
                                                         else
                                                         {
-                                                               int                     x = strlen(Np->L_thousands_sep);
-
-                                                               memset(Np->inout_p, ' ', x);
-                                                               Np->inout_p += x - 1;
+                                                               /* just in case there are MB chars */
+                                                               pattern_len = pg_mbstrlen(pattern);
+                                                               memset(Np->inout_p, ' ', pattern_len);
+                                                               Np->inout_p += pattern_len - 1;
                                                         }
                                                 }
                                                 else
                                                 {
-                                                       strcpy(Np->inout_p, Np->L_thousands_sep);
-                                                       Np->inout_p += strlen(Np->inout_p) - 1;
+                                                       strcpy(Np->inout_p, pattern);
+                                                       Np->inout_p += pattern_len - 1;
                                                 }
                                         }
                                         else
@@ -4903,18 +4936,33 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
                                                         if (IS_FILLMODE(Np->Num))
                                                                 continue;
                                                 }
-                                               Np->inout_p += strlen(Np->L_thousands_sep) - 1;
+
+                                               /*
+                                                * Because L_thousands_sep typically contains data
+                                                * characters (either '.' or ','), we can't use
+                                                * NUM_eat_non_data_chars here.  Instead skip only if
+                                                * the input matches L_thousands_sep.
+                                                */
+                                               if (AMOUNT_TEST(pattern_len) &&
+                                                       strncmp(Np->inout_p, pattern, pattern_len) == 0)
+                                                       Np->inout_p += pattern_len - 1;
+                                               else
+                                                       continue;
                                         }
                                         break;
  
                                 case NUM_L:
+                                       pattern = Np->L_currency_symbol;
                                         if (Np->is_to_char)
                                         {
-                                               strcpy(Np->inout_p, Np->L_currency_symbol);
-                                               Np->inout_p += strlen(Np->inout_p) - 1;
+                                               strcpy(Np->inout_p, pattern);
+                                               Np->inout_p += strlen(pattern) - 1;
                                         }
                                         else
-                                               Np->inout_p += strlen(Np->L_currency_symbol) - 1;
+                                       {
+                                               NUM_eat_non_data_chars(Np, pg_mbstrlen(pattern), input_len);
+                                               continue;
+                                       }
                                         break;
  
                                 case NUM_RN:
@@ -4949,8 +4997,16 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
                                                 continue;
  
                                         if (Np->is_to_char)
+                                       {
                                                 strcpy(Np->inout_p, get_th(Np->number, TH_LOWER));
-                                       Np->inout_p += 1;
+                                               Np->inout_p += 1;
+                                       }
+                                       else
+                                       {
+                                               /* All variants of 'th' occupy 2 characters */
+                                               NUM_eat_non_data_chars(Np, 2, input_len);
+                                               continue;
+                                       }
                                         break;
  
                                 case NUM_TH:
@@ -4959,8 +5015,16 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
                                                 continue;
  
                                         if (Np->is_to_char)
+                                       {
                                                 strcpy(Np->inout_p, get_th(Np->number, TH_UPPER));
-                                       Np->inout_p += 1;
+                                               Np->inout_p += 1;
+                                       }
+                                       else
+                                       {
+                                               /* All variants of 'TH' occupy 2 characters */
+                                               NUM_eat_non_data_chars(Np, 2, input_len);
+                                               continue;
+                                       }
                                         break;
  
                                 case NUM_MI:
@@ -4977,6 +5041,11 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
                                         {
                                                 if (*Np->inout_p == '-')
                                                         *Np->number = '-';
+                                               else
+                                               {
+                                                       NUM_eat_non_data_chars(Np, 1, input_len);
+                                                       continue;
+                                               }
                                         }
                                         break;
  
@@ -4994,23 +5063,31 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
                                         {
                                                 if (*Np->inout_p == '+')
                                                         *Np->number = '+';
+                                               else
+                                               {
+                                                       NUM_eat_non_data_chars(Np, 1, input_len);
+                                                       continue;
+                                               }
                                         }
                                         break;
  
                                 case NUM_SG:
                                         if (Np->is_to_char)
                                                 *Np->inout_p = Np->sign;
-
                                         else
                                         {
                                                 if (*Np->inout_p == '-')
                                                         *Np->number = '-';
                                                 else if (*Np->inout_p == '+')
                                                         *Np->number = '+';
+                                               else
+                                               {
+                                                       NUM_eat_non_data_chars(Np, 1, input_len);
+                                                       continue;
+                                               }
                                         }
                                         break;
  
-
                                 default:
                                         continue;
                                         break;
@@ -5019,7 +5096,12 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
                 else
                 {
                         /*
-                        * Remove to output char from input in TO_CHAR
+                        * In TO_CHAR, non-pattern characters in the format are copied to
+                        * the output.  In TO_NUMBER, we skip one input character for each
+                        * non-pattern format character, whether or not it matches the
+                        * format character.  (Currently, that's actually implemented as
+                        * skipping one input byte per non-pattern format byte, which is
+                        * wrong...)
                          */
                         if (Np->is_to_char)
                                 *Np->inout_p = n->character;
diff --git a/src/test/regress/expected/numeric.out b/src/test/regress/expected/numeric.out

index 7e55b0e2931bf3228064753947d4ad7660d42df5..a96bfc0eb04952eac7b19d9ee1a5bcec992999aa 100644 (file)
--- a/src/test/regress/expected/numeric.out
+++ b/src/test/regress/expected/numeric.out
@@ -1219,6 +1219,7 @@ SELECT '' AS to_char_26, to_char('100'::numeric, 'FM999');
  
  -- TO_NUMBER()
  --
+SET lc_numeric = 'C';
  SELECT '' AS to_number_1,  to_number('-34,338,492', '99G999G999');
   to_number_1 | to_number 
  -------------+-----------
@@ -1297,6 +1298,61 @@ SELECT '' AS to_number_13, to_number(' . 0 1-', ' 9 9 . 9 9 S');
                |     -0.01
  (1 row)
  
+SELECT '' AS to_number_14, to_number('34,50','999,99');
+ to_number_14 | to_number 
+--------------+-----------
+              |      3450
+(1 row)
+
+SELECT '' AS to_number_15, to_number('123,000','999G');
+ to_number_15 | to_number 
+--------------+-----------
+              |       123
+(1 row)
+
+SELECT '' AS to_number_16, to_number('123456','999G999');
+ to_number_16 | to_number 
+--------------+-----------
+              |    123456
+(1 row)
+
+SELECT '' AS to_number_17, to_number('$1234.56','L9,999.99');
+ to_number_17 | to_number 
+--------------+-----------
+              |   1234.56
+(1 row)
+
+SELECT '' AS to_number_18, to_number('$1234.56','L99,999.99');
+ to_number_18 | to_number 
+--------------+-----------
+              |   1234.56
+(1 row)
+
+SELECT '' AS to_number_19, to_number('$1,234.56','L99,999.99');
+ to_number_19 | to_number 
+--------------+-----------
+              |   1234.56
+(1 row)
+
+SELECT '' AS to_number_20, to_number('1234.56','L99,999.99');
+ to_number_20 | to_number 
+--------------+-----------
+              |   1234.56
+(1 row)
+
+SELECT '' AS to_number_21, to_number('1,234.56','L99,999.99');
+ to_number_21 | to_number 
+--------------+-----------
+              |   1234.56
+(1 row)
+
+SELECT '' AS to_number_22, to_number('42nd', '99th');
+ to_number_22 | to_number 
+--------------+-----------
+              |        42
+(1 row)
+
+RESET lc_numeric;
  --
  -- Input syntax
  --
diff --git a/src/test/regress/sql/numeric.sql b/src/test/regress/sql/numeric.sql

index 9675b6eabf3110da2683824c9a2bfd24914b88d0..321c7bdf7c55a6635854bdf7124c5e6b2df97155 100644 (file)
--- a/src/test/regress/sql/numeric.sql
+++ b/src/test/regress/sql/numeric.sql
@@ -788,6 +788,7 @@ SELECT '' AS to_char_26, to_char('100'::numeric, 'FM999');
  
  -- TO_NUMBER()
  --
+SET lc_numeric = 'C';
  SELECT '' AS to_number_1,  to_number('-34,338,492', '99G999G999');
  SELECT '' AS to_number_2,  to_number('-34,338,492.654,878', '99G999G999D999G999');
  SELECT '' AS to_number_3,  to_number('<564646.654564>', '999999.999999PR');
@@ -801,6 +802,16 @@ SELECT '' AS to_number_10, to_number('0', '99.99');
  SELECT '' AS to_number_11, to_number('.-01', 'S99.99');
  SELECT '' AS to_number_12, to_number('.01-', '99.99S');
  SELECT '' AS to_number_13, to_number(' . 0 1-', ' 9 9 . 9 9 S');
+SELECT '' AS to_number_14, to_number('34,50','999,99');
+SELECT '' AS to_number_15, to_number('123,000','999G');
+SELECT '' AS to_number_16, to_number('123456','999G999');
+SELECT '' AS to_number_17, to_number('$1234.56','L9,999.99');
+SELECT '' AS to_number_18, to_number('$1234.56','L99,999.99');
+SELECT '' AS to_number_19, to_number('$1,234.56','L99,999.99');
+SELECT '' AS to_number_20, to_number('1234.56','L99,999.99');
+SELECT '' AS to_number_21, to_number('1,234.56','L99,999.99');
+SELECT '' AS to_number_22, to_number('42nd', '99th');
+RESET lc_numeric;
  
  --
  -- Input syntax
author	Tom Lane <tgl@sss.pgh.pa.us>
	Fri, 17 Nov 2017 17:04:06 +0000 (12:04 -0500)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Fri, 17 Nov 2017 17:04:13 +0000 (12:04 -0500)
doc/src/sgml/func.sgml		patch \| blob \| history
src/backend/utils/adt/formatting.c		patch \| blob \| history
src/test/regress/expected/numeric.out		patch \| blob \| history
src/test/regress/sql/numeric.sql		patch \| blob \| history