From 51e78ab4ff3282963f5e8ba2633040829413aefa Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 10 Feb 2016 19:30:11 -0500 Subject: [PATCH] Avoid use of sscanf() to parse ispell dictionary files. It turns out that on FreeBSD-derived platforms (including OS X), the *scanf() family of functions is pretty much brain-dead about multibyte characters. In particular it will apply isspace() to individual bytes of input even when those bytes are part of a multibyte character, thus allowing false recognition of a field-terminating space. We appear to have little alternative other than instituting a coding rule that *scanf() is not to be used if the input string might contain multibyte characters. (There was some discussion of relying on "%ls", but that probably just moves the portability problem somewhere else, and besides it doesn't fully prevent BSD *scanf() from using isspace().) This patch is a down payment on that: it gets rid of use of sscanf() to parse ispell dictionary files, which are certainly at great risk of having a problem. The code is cleaner this way anyway, though a bit longer. In passing, improve a few comments. Report and patch by Artur Zakirov, reviewed and somewhat tweaked by me. Back-patch to all supported branches. --- src/backend/tsearch/spell.c | 166 +++++++++++++++++++++++++++++++++--- 1 file changed, 153 insertions(+), 13 deletions(-) diff --git a/src/backend/tsearch/spell.c b/src/backend/tsearch/spell.c index 00422227f3..49280b831a 100644 --- a/src/backend/tsearch/spell.c +++ b/src/backend/tsearch/spell.c @@ -457,13 +457,149 @@ NIAddAffix(IspellDict *Conf, int flag, char flagflags, const char *mask, const c Conf->naffixes++; } + +/* Parsing states for parse_affentry() and friends */ #define PAE_WAIT_MASK 0 -#define PAE_INMASK 1 +#define PAE_INMASK 1 #define PAE_WAIT_FIND 2 -#define PAE_INFIND 3 +#define PAE_INFIND 3 #define PAE_WAIT_REPL 4 -#define PAE_INREPL 5 +#define PAE_INREPL 5 +#define PAE_WAIT_TYPE 6 +#define PAE_WAIT_FLAG 7 +/* + * Parse next space-separated field of an .affix file line. + * + * *str is the input pointer (will be advanced past field) + * next is where to copy the field value to, with null termination + * + * The buffer at "next" must be of size BUFSIZ; we truncate the input to fit. + * + * Returns TRUE if we found a field, FALSE if not. + */ +static bool +get_nextfield(char **str, char *next) +{ + int state = PAE_WAIT_MASK; + int avail = BUFSIZ; + + while (**str) + { + if (state == PAE_WAIT_MASK) + { + if (t_iseq(*str, '#')) + return false; + else if (!t_isspace(*str)) + { + int clen = pg_mblen(*str); + + if (clen < avail) + { + COPYCHAR(next, *str); + next += clen; + avail -= clen; + } + state = PAE_INMASK; + } + } + else /* state == PAE_INMASK */ + { + if (t_isspace(*str)) + { + *next = '\0'; + return true; + } + else + { + int clen = pg_mblen(*str); + + if (clen < avail) + { + COPYCHAR(next, *str); + next += clen; + avail -= clen; + } + } + } + *str += pg_mblen(*str); + } + + *next = '\0'; + + return (state == PAE_INMASK); /* OK if we got a nonempty field */ +} + +/* + * Parses entry of an .affix file of MySpell or Hunspell format. + * + * An .affix file entry has the following format: + * - header + * + * - fields after header: + * + * + * str is the input line + * field values are returned to type etc, which must be buffers of size BUFSIZ. + * + * Returns number of fields found; any omitted fields are set to empty strings. + */ +static int +parse_ooaffentry(char *str, char *type, char *flag, char *find, + char *repl, char *mask) +{ + int state = PAE_WAIT_TYPE; + int fields_read = 0; + bool valid = false; + + *type = *flag = *find = *repl = *mask = '\0'; + + while (*str) + { + switch (state) + { + case PAE_WAIT_TYPE: + valid = get_nextfield(&str, type); + state = PAE_WAIT_FLAG; + break; + case PAE_WAIT_FLAG: + valid = get_nextfield(&str, flag); + state = PAE_WAIT_FIND; + break; + case PAE_WAIT_FIND: + valid = get_nextfield(&str, find); + state = PAE_WAIT_REPL; + break; + case PAE_WAIT_REPL: + valid = get_nextfield(&str, repl); + state = PAE_WAIT_MASK; + break; + case PAE_WAIT_MASK: + valid = get_nextfield(&str, mask); + state = -1; /* force loop exit */ + break; + default: + elog(ERROR, "unrecognized state in parse_ooaffentry: %d", + state); + break; + } + if (valid) + fields_read++; + else + break; /* early EOL */ + if (state < 0) + break; /* got all fields */ + } + + return fields_read; +} + +/* + * Parses entry of an .affix file of Ispell format + * + * An .affix file entry has the following format: + * > [-,] + */ static bool parse_affentry(char *str, char *mask, char *find, char *repl) { @@ -618,8 +754,6 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename) int flag = 0; char flagflags = 0; tsearch_readline_state trst; - int scanread = 0; - char scanbuf[BUFSIZ]; char *recoded; /* read file to find any flag */ @@ -682,8 +816,6 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename) } tsearch_readline_end(&trst); - sprintf(scanbuf, "%%6s %%%ds %%%ds %%%ds %%%ds", BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5); - if (!tsearch_readline_begin(&trst, filename)) ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), @@ -692,18 +824,21 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename) while ((recoded = tsearch_readline(&trst)) != NULL) { + int fields_read; + if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#')) goto nextline; - scanread = sscanf(recoded, scanbuf, type, sflag, find, repl, mask); + fields_read = parse_ooaffentry(recoded, type, sflag, find, repl, mask); if (ptype) pfree(ptype); ptype = lowerstr_ctx(Conf, type); - if (scanread < 4 || (STRNCMP(ptype, "sfx") && STRNCMP(ptype, "pfx"))) + if (fields_read < 4 || + (STRNCMP(ptype, "sfx") != 0 && STRNCMP(ptype, "pfx") != 0)) goto nextline; - if (scanread == 4) + if (fields_read == 4) { if (strlen(sflag) != 1) goto nextline; @@ -722,9 +857,13 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename) if (strlen(sflag) != 1 || flag != *sflag || flag == 0) goto nextline; prepl = lowerstr_ctx(Conf, repl); - /* affix flag */ + /* Find position of '/' in lowercased string "prepl" */ if ((ptr = strchr(prepl, '/')) != NULL) { + /* + * Here we use non-lowercased string "repl". We need position + * of '/' in "repl". + */ *ptr = '\0'; ptr = repl + (ptr - prepl) + 1; while (*ptr) @@ -800,11 +939,12 @@ NIImportAffixes(IspellDict *Conf, const char *filename) if (STRNCMP(pstr, "compoundwords") == 0) { + /* Find position in lowercased string "pstr" */ s = findchar(pstr, 'l'); if (s) { - s = recoded + (s - pstr); /* we need non-lowercased - * string */ + /* Here we use non-lowercased string "recoded" */ + s = recoded + (s - pstr); while (*s && !t_isspace(s)) s += pg_mblen(s); while (*s && t_isspace(s)) -- 2.40.0