granicus.if.org Git - postgresql/commitdiff
Fix assorted bugs in contrib/unaccent's configuration file parsing.
author: Tom Lane <tgl@sss.pgh.pa.us>
Mon, 7 Nov 2011 16:48:53 +0000 (11:48 -0500)
committer: Tom Lane <tgl@sss.pgh.pa.us>
Mon, 7 Nov 2011 16:50:18 +0000 (11:50 -0500)
Make it use t_isspace() to identify whitespace, rather than relying on
sscanf which is known to get it wrong on some platform/locale combinations.
Get rid of fixed-size buffers.  Make it actually continue to parse the file
after ignoring a line with untranslatable characters, as was obviously
intended.

The first of these issues is per gripe from J Smith, though not exactly
either of his proposed patches.

contrib/unaccent/unaccent.c

index d9c2eac2e74833c666939d4b74db183dbb5863be..d22f5c7beaa875451539b14983abe2f142b619eb 100644 (file)
@@ -91,35 +91,83 @@ initSuffixTree(char *filename)
 
        do
        {
-               char            src[4096];
-               char            trg[4096];
-               int                     srclen;
-               int                     trglen;
-               char       *line = NULL;
-
+               /*
+                * pg_do_encoding_conversion() (called by tsearch_readline()) will
+                * emit exception if it finds untranslatable characters in current
+                * locale. We just skip such lines, continuing with the next.
+                */
                skip = true;
 
                PG_TRY();
                {
-                       /*
-                        * pg_do_encoding_conversion() (called by tsearch_readline()) will
-                        * emit exception if it finds untranslatable characters in current
-                        * locale. We just skip such characters.
-                        */
+                       char       *line;
+
                        while ((line = tsearch_readline(&trst)) != NULL)
                        {
-                               if (sscanf(line, "%s\t%s\n", src, trg) != 2)
-                                       continue;
+                               /*
+                                * The format of each line must be "src trg" where src and trg
+                                * are sequences of one or more non-whitespace characters,
+                                * separated by whitespace.  Whitespace at start or end of
+                                * line is ignored.
+                                */
+                               int                     state;
+                               char       *ptr;
+                               char       *src = NULL;
+                               char       *trg = NULL;
+                               int                     ptrlen;
+                               int                     srclen = 0;
+                               int                     trglen = 0;
+
+                               state = 0;
+                               for (ptr = line; *ptr; ptr += ptrlen)
+                               {
+                                       ptrlen = pg_mblen(ptr);
+                                       /* ignore whitespace, but end src or trg */
+                                       if (t_isspace(ptr))
+                                       {
+                                               if (state == 1)
+                                                       state = 2;
+                                               else if (state == 3)
+                                                       state = 4;
+                                               continue;
+                                       }
+                                       switch (state)
+                                       {
+                                               case 0:
+                                                       /* start of src */
+                                                       src = ptr;
+                                                       srclen = ptrlen;
+                                                       state = 1;
+                                                       break;
+                                               case 1:
+                                                       /* continue src */
+                                                       srclen += ptrlen;
+                                                       break;
+                                               case 2:
+                                                       /* start of trg */
+                                                       trg = ptr;
+                                                       trglen = ptrlen;
+                                                       state = 3;
+                                                       break;
+                                               case 3:
+                                                       /* continue trg */
+                                                       trglen += ptrlen;
+                                                       break;
+                                               default:
+                                                       /* bogus line format */
+                                                       state = -1;
+                                                       break;
+                                       }
+                               }
 
-                               srclen = strlen(src);
-                               trglen = strlen(trg);
+                               if (state >= 3)
+                                       rootSuffixTree = placeChar(rootSuffixTree,
+                                                                                          (unsigned char *) src, srclen,
+                                                                                          trg, trglen);
 
-                               rootSuffixTree = placeChar(rootSuffixTree,
-                                                                                  (unsigned char *) src, srclen,
-                                                                                  trg, trglen);
-                               skip = false;
                                pfree(line);
                        }
+                       skip = false;
                }
                PG_CATCH();
                {