From ced3a93ccbbd0a3866f2324662f7a1fa4c31909a Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Mon, 7 Nov 2011 11:48:53 -0500 Subject: [PATCH] Fix assorted bugs in contrib/unaccent's configuration file parsing. Make it use t_isspace() to identify whitespace, rather than relying on sscanf which is known to get it wrong on some platform/locale combinations. Get rid of fixed-size buffers. Make it actually continue to parse the file after ignoring a line with untranslatable characters, as was obviously intended. The first of these issues is per gripe from J Smith, though not exactly either of his proposed patches. --- contrib/unaccent/unaccent.c | 86 +++++++++++++++++++++++++++++-------- 1 file changed, 67 insertions(+), 19 deletions(-) diff --git a/contrib/unaccent/unaccent.c b/contrib/unaccent/unaccent.c index d9c2eac2e7..d22f5c7bea 100644 --- a/contrib/unaccent/unaccent.c +++ b/contrib/unaccent/unaccent.c @@ -91,35 +91,83 @@ initSuffixTree(char *filename) do { - char src[4096]; - char trg[4096]; - int srclen; - int trglen; - char *line = NULL; - + /* + * pg_do_encoding_conversion() (called by tsearch_readline()) will + * emit exception if it finds untranslatable characters in current + * locale. We just skip such lines, continuing with the next. + */ skip = true; PG_TRY(); { - /* - * pg_do_encoding_conversion() (called by tsearch_readline()) will - * emit exception if it finds untranslatable characters in current - * locale. We just skip such characters. - */ + char *line; + while ((line = tsearch_readline(&trst)) != NULL) { - if (sscanf(line, "%s\t%s\n", src, trg) != 2) - continue; + /* + * The format of each line must be "src trg" where src and trg + * are sequences of one or more non-whitespace characters, + * separated by whitespace. Whitespace at start or end of + * line is ignored. + */ + int state; + char *ptr; + char *src = NULL; + char *trg = NULL; + int ptrlen; + int srclen = 0; + int trglen = 0; + + state = 0; + for (ptr = line; *ptr; ptr += ptrlen) + { + ptrlen = pg_mblen(ptr); + /* ignore whitespace, but end src or trg */ + if (t_isspace(ptr)) + { + if (state == 1) + state = 2; + else if (state == 3) + state = 4; + continue; + } + switch (state) + { + case 0: + /* start of src */ + src = ptr; + srclen = ptrlen; + state = 1; + break; + case 1: + /* continue src */ + srclen += ptrlen; + break; + case 2: + /* start of trg */ + trg = ptr; + trglen = ptrlen; + state = 3; + break; + case 3: + /* continue trg */ + trglen += ptrlen; + break; + default: + /* bogus line format */ + state = -1; + break; + } + } - srclen = strlen(src); - trglen = strlen(trg); + if (state >= 3) + rootSuffixTree = placeChar(rootSuffixTree, + (unsigned char *) src, srclen, + trg, trglen); - rootSuffixTree = placeChar(rootSuffixTree, - (unsigned char *) src, srclen, - trg, trglen); - skip = false; pfree(line); } + skip = false; } PG_CATCH(); { -- 2.40.0