]> granicus.if.org Git - postgresql/commitdiff
Avoid use of sscanf() to parse ispell dictionary files.
authorTom Lane <tgl@sss.pgh.pa.us>
Thu, 11 Feb 2016 00:30:12 +0000 (19:30 -0500)
committerTom Lane <tgl@sss.pgh.pa.us>
Thu, 11 Feb 2016 00:30:12 +0000 (19:30 -0500)
It turns out that on FreeBSD-derived platforms (including OS X), the
*scanf() family of functions is pretty much brain-dead about multibyte
characters.  In particular it will apply isspace() to individual bytes
of input even when those bytes are part of a multibyte character, thus
allowing false recognition of a field-terminating space.

We appear to have little alternative other than instituting a coding
rule that *scanf() is not to be used if the input string might contain
multibyte characters.  (There was some discussion of relying on "%ls",
but that probably just moves the portability problem somewhere else,
and besides it doesn't fully prevent BSD *scanf() from using isspace().)

This patch is a down payment on that: it gets rid of use of sscanf()
to parse ispell dictionary files, which are certainly at great risk
of having a problem.  The code is cleaner this way anyway, though
a bit longer.

In passing, improve a few comments.

Report and patch by Artur Zakirov, reviewed and somewhat tweaked by me.
Back-patch to all supported branches.

src/backend/tsearch/spell.c

index 49c32c53c3aa93c39b2e0e9826e5bf1ddb4d81b7..3b492a8298e457ed1ad8ccac8e77cbaa12b6e194 100644 (file)
@@ -457,13 +457,149 @@ NIAddAffix(IspellDict *Conf, int flag, char flagflags, const char *mask, const c
        Conf->naffixes++;
 }
 
+
+/* Parsing states for parse_affentry() and friends */
 #define PAE_WAIT_MASK  0
-#define PAE_INMASK     1
+#define PAE_INMASK             1
 #define PAE_WAIT_FIND  2
-#define PAE_INFIND     3
+#define PAE_INFIND             3
 #define PAE_WAIT_REPL  4
-#define PAE_INREPL     5
+#define PAE_INREPL             5
+#define PAE_WAIT_TYPE  6
+#define PAE_WAIT_FLAG  7
 
+/*
+ * Parse next space-separated field of an .affix file line.
+ *
+ * *str is the input pointer (will be advanced past field)
+ * next is where to copy the field value to, with null termination
+ *
+ * The buffer at "next" must be of size BUFSIZ; we truncate the input to fit.
+ *
+ * Returns TRUE if we found a field, FALSE if not.
+ */
+static bool
+get_nextfield(char **str, char *next)
+{
+       int                     state = PAE_WAIT_MASK;
+       int                     avail = BUFSIZ;
+
+       while (**str)
+       {
+               if (state == PAE_WAIT_MASK)
+               {
+                       if (t_iseq(*str, '#'))
+                               return false;
+                       else if (!t_isspace(*str))
+                       {
+                               int                     clen = pg_mblen(*str);
+
+                               if (clen < avail)
+                               {
+                                       COPYCHAR(next, *str);
+                                       next += clen;
+                                       avail -= clen;
+                               }
+                               state = PAE_INMASK;
+                       }
+               }
+               else    /* state == PAE_INMASK */
+               {
+                       if (t_isspace(*str))
+                       {
+                               *next = '\0';
+                               return true;
+                       }
+                       else
+                       {
+                               int                     clen = pg_mblen(*str);
+
+                               if (clen < avail)
+                               {
+                                       COPYCHAR(next, *str);
+                                       next += clen;
+                                       avail -= clen;
+                               }
+                       }
+               }
+               *str += pg_mblen(*str);
+       }
+
+       *next = '\0';
+
+       return (state == PAE_INMASK);           /* OK if we got a nonempty field */
+}
+
+/*
+ * Parses entry of an .affix file of MySpell or Hunspell format.
+ *
+ * An .affix file entry has the following format:
+ * - header
+ *      <type>  <flag>  <cross_flag>  <flag_count>
+ * - fields after header:
+ *      <type>  <flag>  <find>  <replace>      <mask>
+ *
+ * str is the input line
+ * field values are returned to type etc, which must be buffers of size BUFSIZ.
+ *
+ * Returns number of fields found; any omitted fields are set to empty strings.
+ */
+static int
+parse_ooaffentry(char *str, char *type, char *flag, char *find,
+                                char *repl, char *mask)
+{
+       int                     state = PAE_WAIT_TYPE;
+       int                     fields_read = 0;
+       bool            valid = false;
+
+       *type = *flag = *find = *repl = *mask = '\0';
+
+       while (*str)
+       {
+               switch (state)
+               {
+                       case PAE_WAIT_TYPE:
+                               valid = get_nextfield(&str, type);
+                               state = PAE_WAIT_FLAG;
+                               break;
+                       case PAE_WAIT_FLAG:
+                               valid = get_nextfield(&str, flag);
+                               state = PAE_WAIT_FIND;
+                               break;
+                       case PAE_WAIT_FIND:
+                               valid = get_nextfield(&str, find);
+                               state = PAE_WAIT_REPL;
+                               break;
+                       case PAE_WAIT_REPL:
+                               valid = get_nextfield(&str, repl);
+                               state = PAE_WAIT_MASK;
+                               break;
+                       case PAE_WAIT_MASK:
+                               valid = get_nextfield(&str, mask);
+                               state = -1;             /* force loop exit */
+                               break;
+                       default:
+                               elog(ERROR, "unrecognized state in parse_ooaffentry: %d",
+                                        state);
+                               break;
+               }
+               if (valid)
+                       fields_read++;
+               else
+                       break;                          /* early EOL */
+               if (state < 0)
+                       break;                          /* got all fields */
+       }
+
+       return fields_read;
+}
+
+/*
+ * Parses entry of an .affix file of Ispell format
+ *
+ * An .affix file entry has the following format:
+ * <mask>  >  [-<find>,]<replace>
+ */
 static bool
 parse_affentry(char *str, char *mask, char *find, char *repl)
 {
@@ -618,8 +754,6 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
        int                     flag = 0;
        char            flagflags = 0;
        tsearch_readline_state trst;
-       int                     scanread = 0;
-       char            scanbuf[BUFSIZ];
        char       *recoded;
 
        /* read file to find any flag */
@@ -682,8 +816,6 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
        }
        tsearch_readline_end(&trst);
 
-       sprintf(scanbuf, "%%6s %%%ds %%%ds %%%ds %%%ds", BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5);
-
        if (!tsearch_readline_begin(&trst, filename))
                ereport(ERROR,
                                (errcode(ERRCODE_CONFIG_FILE_ERROR),
@@ -692,18 +824,21 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
 
        while ((recoded = tsearch_readline(&trst)) != NULL)
        {
+               int                     fields_read;
+
                if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
                        goto nextline;
 
-               scanread = sscanf(recoded, scanbuf, type, sflag, find, repl, mask);
+               fields_read = parse_ooaffentry(recoded, type, sflag, find, repl, mask);
 
                if (ptype)
                        pfree(ptype);
                ptype = lowerstr_ctx(Conf, type);
-               if (scanread < 4 || (STRNCMP(ptype, "sfx") && STRNCMP(ptype, "pfx")))
+               if (fields_read < 4 ||
+                       (STRNCMP(ptype, "sfx") != 0 && STRNCMP(ptype, "pfx") != 0))
                        goto nextline;
 
-               if (scanread == 4)
+               if (fields_read == 4)
                {
                        if (strlen(sflag) != 1)
                                goto nextline;
@@ -722,9 +857,13 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
                        if (strlen(sflag) != 1 || flag != *sflag || flag == 0)
                                goto nextline;
                        prepl = lowerstr_ctx(Conf, repl);
-                       /* affix flag */
+                       /* Find position of '/' in lowercased string "prepl" */
                        if ((ptr = strchr(prepl, '/')) != NULL)
                        {
+                               /*
+                                * Here we use non-lowercased string "repl". We need position
+                                * of '/' in "repl".
+                                */
                                *ptr = '\0';
                                ptr = repl + (ptr - prepl) + 1;
                                while (*ptr)
@@ -800,11 +939,12 @@ NIImportAffixes(IspellDict *Conf, const char *filename)
 
                if (STRNCMP(pstr, "compoundwords") == 0)
                {
+                       /* Find position in lowercased string "pstr" */
                        s = findchar(pstr, 'l');
                        if (s)
                        {
-                               s = recoded + (s - pstr);               /* we need non-lowercased
-                                                                                                * string */
+                               /* Here we use non-lowercased string "recoded" */
+                               s = recoded + (s - pstr);
                                while (*s && !t_isspace(s))
                                        s += pg_mblen(s);
                                while (*s && t_isspace(s))