Cleanup for some problems in tsearch patch:

author Tom Lane <tgl@sss.pgh.pa.us>

Sat, 25 Aug 2007 00:03:59 +0000 (00:03 +0000)

committer Tom Lane <tgl@sss.pgh.pa.us>

Sat, 25 Aug 2007 00:03:59 +0000 (00:03 +0000)
author Tom Lane <tgl@sss.pgh.pa.us>
Sat, 25 Aug 2007 00:03:59 +0000 (00:03 +0000)
committer Tom Lane <tgl@sss.pgh.pa.us>
Sat, 25 Aug 2007 00:03:59 +0000 (00:03 +0000)
diff --git a/src/backend/snowball/dict_snowball.c b/src/backend/snowball/dict_snowball.c

index 03f2dd928c2a3e77a41af2e81314ed8e57d21c0d..57aac234ed280a06f48163f908cf5616fd183d78 100644 (file)
--- a/src/backend/snowball/dict_snowball.c
+++ b/src/backend/snowball/dict_snowball.c
@@ -6,7 +6,7 @@
   * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/snowball/dict_snowball.c,v 1.2 2007/08/22 01:39:44 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/snowball/dict_snowball.c,v 1.3 2007/08/25 00:03:59 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -192,7 +192,6 @@ dsnowball_init(PG_FUNCTION_ARGS)
         ListCell   *l;
  
         d = (DictSnowball *) palloc0(sizeof(DictSnowball));
-       d->stoplist.wordop = recode_and_lowerstr;
  
         foreach(l, dictoptions)
         {
@@ -204,8 +203,7 @@ dsnowball_init(PG_FUNCTION_ARGS)
                                 ereport(ERROR,
                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                                  errmsg("multiple StopWords parameters")));
-                       readstoplist(defGetString(defel), &d->stoplist);
-                       sortstoplist(&d->stoplist);
+                       readstoplist(defGetString(defel), &d->stoplist, lowerstr);
                         stoploaded = true;
                 }
                 else if (pg_strcasecmp("Language", defel->defname) == 0)
diff --git a/src/backend/tsearch/dict_ispell.c b/src/backend/tsearch/dict_ispell.c

index 802a64508787460f496a724521a2f37c91222e10..d7fe3cc46561a380784d75ea9990d7e49d53f84f 100644 (file)
--- a/src/backend/tsearch/dict_ispell.c
+++ b/src/backend/tsearch/dict_ispell.c
@@ -7,7 +7,7 @@
   *
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/tsearch/dict_ispell.c,v 1.2 2007/08/22 01:39:44 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/tsearch/dict_ispell.c,v 1.3 2007/08/25 00:03:59 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -39,7 +39,6 @@ dispell_init(PG_FUNCTION_ARGS)
         ListCell   *l;
  
         d = (DictISpell *) palloc0(sizeof(DictISpell));
-       d->stoplist.wordop = recode_and_lowerstr;
  
         foreach(l, dictoptions)
         {
@@ -73,8 +72,7 @@ dispell_init(PG_FUNCTION_ARGS)
                                 ereport(ERROR,
                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                                  errmsg("multiple StopWords parameters")));
-                       readstoplist(defGetString(defel), &(d->stoplist));
-                       sortstoplist(&(d->stoplist));
+                       readstoplist(defGetString(defel), &(d->stoplist), lowerstr);
                         stoploaded = true;
                 }
                 else
diff --git a/src/backend/tsearch/dict_simple.c b/src/backend/tsearch/dict_simple.c

index fcc08ea180da5d561626e6b80a69004f8001de85..aea2c0963b150e46819ee88ff563c54b7f6f8d9c 100644 (file)
--- a/src/backend/tsearch/dict_simple.c
+++ b/src/backend/tsearch/dict_simple.c
@@ -7,7 +7,7 @@
   *
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/tsearch/dict_simple.c,v 1.2 2007/08/22 01:39:44 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/tsearch/dict_simple.c,v 1.3 2007/08/25 00:03:59 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -23,19 +23,17 @@
  typedef struct
  {
         StopList        stoplist;
-} DictExample;
+} DictSimple;
  
  
  Datum
  dsimple_init(PG_FUNCTION_ARGS)
  {
         List       *dictoptions = (List *) PG_GETARG_POINTER(0);
-       DictExample *d = (DictExample *) palloc0(sizeof(DictExample));
+       DictSimple *d = (DictSimple *) palloc0(sizeof(DictSimple));
         bool            stoploaded = false;
         ListCell   *l;
  
-       d->stoplist.wordop = recode_and_lowerstr;
-
         foreach(l, dictoptions)
         {
                 DefElem    *defel = (DefElem *) lfirst(l);
@@ -46,8 +44,7 @@ dsimple_init(PG_FUNCTION_ARGS)
                                 ereport(ERROR,
                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                                  errmsg("multiple StopWords parameters")));
-                       readstoplist(defGetString(defel), &d->stoplist);
-                       sortstoplist(&d->stoplist);
+                       readstoplist(defGetString(defel), &d->stoplist, lowerstr);
                         stoploaded = true;
                 }
                 else
@@ -65,16 +62,16 @@ dsimple_init(PG_FUNCTION_ARGS)
  Datum
  dsimple_lexize(PG_FUNCTION_ARGS)
  {
-       DictExample *d = (DictExample *) PG_GETARG_POINTER(0);
+       DictSimple *d = (DictSimple *) PG_GETARG_POINTER(0);
         char       *in = (char *) PG_GETARG_POINTER(1);
         int32      len = PG_GETARG_INT32(2);
-       char       *txt = lowerstr_with_len(in, len);
+       char       *txt;
         TSLexeme   *res = palloc0(sizeof(TSLexeme) * 2);
  
+       txt = lowerstr_with_len(in, len);
+
         if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
-       {
                 pfree(txt);
-       }
         else
                 res[0].lexeme = txt;
  
diff --git a/src/backend/tsearch/dict_synonym.c b/src/backend/tsearch/dict_synonym.c

index c5bd197e9263f2368262d9c4a60112de605b2b5b..1c0fd95413cd5f703b886a9c09e2efb69edf0073 100644 (file)
--- a/src/backend/tsearch/dict_synonym.c
+++ b/src/backend/tsearch/dict_synonym.c
@@ -7,7 +7,7 @@
   *
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/tsearch/dict_synonym.c,v 1.2 2007/08/22 04:13:15 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/tsearch/dict_synonym.c,v 1.3 2007/08/25 00:03:59 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -20,9 +20,6 @@
  #include "tsearch/ts_utils.h"
  #include "utils/builtins.h"
  
-
-#define SYNBUFLEN      4096
-
  typedef struct
  {
         char       *in;
@@ -31,23 +28,34 @@ typedef struct
  
  typedef struct
  {
-       int                     len;
+       int                     len;    /* length of syn array */
         Syn                *syn;
  } DictSyn;
  
+/*
+ * Finds the next whitespace-delimited word within the 'in' string.
+ * Returns a pointer to the first character of the word, and a pointer
+ * to the next byte after the last character in the word (in *end).
+ */
  static char *
  findwrd(char *in, char **end)
  {
         char       *start;
  
-       *end = NULL;
+       /* Skip leading spaces */
         while (*in && t_isspace(in))
                 in += pg_mblen(in);
  
+       /* Return NULL on empty lines */
         if (*in == '\0')
+       {
+               *end = NULL;
                 return NULL;
+       }
+
         start = in;
  
+       /* Find end of word */
         while (*in && !t_isspace(in))
                 in += pg_mblen(in);
  
@@ -70,12 +78,11 @@ dsynonym_init(PG_FUNCTION_ARGS)
         ListCell   *l;
         char       *filename = NULL;
         FILE       *fin;
-       char            buf[SYNBUFLEN];
         char       *starti,
                            *starto,
                            *end = NULL;
         int                     cur = 0;
-       int                     slen;
+       char       *line = NULL;
  
         foreach(l, dictoptions)
         {
@@ -105,10 +112,33 @@ dsynonym_init(PG_FUNCTION_ARGS)
  
         d = (DictSyn *) palloc0(sizeof(DictSyn));
  
-       while (fgets(buf, SYNBUFLEN, fin))
+       while ((line = t_readline(fin)) != NULL)
         {
-               slen = strlen(buf);
-               pg_verifymbstr(buf, slen, false);
+               starti = findwrd(line, &end);
+               if (!starti)
+               {
+                       /* Empty line */
+                       goto skipline;
+               }
+               if (*end == '\0')       /* must test before NUL-terminating the word */
+               {
+                       /* First word runs to end of string: no second word. Ignore silently. */
+                       goto skipline;
+               }
+               *end = '\0';
+
+               starto = findwrd(end + 1, &end);
+               if (!starto)
+               {
+                       /* A line with only one word. Ignore silently. */
+                       goto skipline;
+               }
+               *end = '\0';
+
+               /* starti now points to the first word, and starto to the second
+                * word on the line, with a \0 terminator at the end of both words.
+                */
+
                 if (cur == d->len)
                 {
                         if (d->len == 0)
@@ -123,36 +153,19 @@ dsynonym_init(PG_FUNCTION_ARGS)
                         }
                 }
  
-               starti = findwrd(buf, &end);
-               if (!starti)
-                       continue;
-               *end = '\0';
-               if (end >= buf + slen)
-                       continue;
-
-               starto = findwrd(end + 1, &end);
-               if (!starto)
-                       continue;
-               *end = '\0';
-
-               d->syn[cur].in = recode_and_lowerstr(starti);
-               d->syn[cur].out = recode_and_lowerstr(starto);
-               if (!(d->syn[cur].in && d->syn[cur].out))
-               {
-                       FreeFile(fin);
-                       ereport(ERROR,
-                                       (errcode(ERRCODE_OUT_OF_MEMORY),
-                                        errmsg("out of memory")));
-               }
+               d->syn[cur].in = lowerstr(starti);
+               d->syn[cur].out = lowerstr(starto);
  
                 cur++;
+
+       skipline:
+               pfree(line);
         }
  
         FreeFile(fin);
  
         d->len = cur;
-       if (cur > 1)
-               qsort(d->syn, d->len, sizeof(Syn), compareSyn);
+       qsort(d->syn, d->len, sizeof(Syn), compareSyn);
  
         PG_RETURN_POINTER(d);
  }
@@ -179,8 +192,7 @@ dsynonym_lexize(PG_FUNCTION_ARGS)
         if (!found)
                 PG_RETURN_POINTER(NULL);
  
-       res = palloc(sizeof(TSLexeme) * 2);
-       memset(res, 0, sizeof(TSLexeme) * 2);
+       res = palloc0(sizeof(TSLexeme) * 2);
         res[0].lexeme = pstrdup(found->out);
  
         PG_RETURN_POINTER(res);
diff --git a/src/backend/tsearch/dict_thesaurus.c b/src/backend/tsearch/dict_thesaurus.c

index 70700db41fb8653d38188abbcd07a2579f8a12d8..2891dc42c751e51275b1927fec82985b19fc66fd 100644 (file)
--- a/src/backend/tsearch/dict_thesaurus.c
+++ b/src/backend/tsearch/dict_thesaurus.c
@@ -7,7 +7,7 @@
   *
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/tsearch/dict_thesaurus.c,v 1.2 2007/08/22 01:39:44 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/tsearch/dict_thesaurus.c,v 1.3 2007/08/25 00:03:59 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -170,10 +170,10 @@ static void
  thesaurusRead(char *filename, DictThesaurus * d)
  {
         FILE       *fh;
-       char            str[BUFSIZ];
         int                     lineno = 0;
         uint16          idsubst = 0;
         bool            useasis = false;
+       char       *line;
  
         filename = get_tsearch_config_filename(filename, "ths");
         fh = AllocateFile(filename, "r");
@@ -183,27 +183,28 @@ thesaurusRead(char *filename, DictThesaurus * d)
                                  errmsg("could not open thesaurus file \"%s\": %m",
                                                 filename)));
  
-       while (fgets(str, sizeof(str), fh))
+       while ((line = t_readline(fh)) != NULL)
         {
-               char       *ptr,
-                                  *recoded;
+               char       *ptr;
                 int                     state = TR_WAITLEX;
                 char       *beginwrd = NULL;
                 uint16          posinsubst = 0;
                 uint16          nwrd = 0;
  
-               ptr = recoded = (char *) pg_do_encoding_conversion((unsigned char *) str, strlen(str),
-                                                                                        GetDatabaseEncoding(), PG_UTF8);
-               if (recoded == NULL)
-                       elog(ERROR, "encoding conversion failed");
-
                 lineno++;
  
-               /* is it comment ? */
-               while (t_isspace(ptr))
+               ptr = line;
+
+               /* is it a comment? */
+               while (*ptr && t_isspace(ptr))
                         ptr += pg_mblen(ptr);
-               if (t_iseq(recoded, '#') || *recoded == '\0' || t_iseq(recoded, '\n') || t_iseq(recoded, '\r'))
+
+               if (t_iseq(ptr, '#') || *ptr == '\0' ||
+                       t_iseq(ptr, '\n') || t_iseq(ptr, '\r'))
+               {
+                       pfree(line);
                         continue;
+               }
  
                 while (*ptr)
                 {
@@ -301,8 +302,7 @@ thesaurusRead(char *filename, DictThesaurus * d)
                                                         lineno, filename)));
                 }
  
-               if (recoded != str)
-                       pfree(recoded);
+               pfree(line);
         }
  
         d->nsubst = idsubst;
diff --git a/src/backend/tsearch/spell.c b/src/backend/tsearch/spell.c

index d09208649f7a4933d5e684419132af69d8a1057f..e9bb9995627f92fc4c7efa1d2c1faf419f5477ed 100644 (file)
--- a/src/backend/tsearch/spell.c
+++ b/src/backend/tsearch/spell.c
@@ -7,7 +7,7 @@
   *
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/tsearch/spell.c,v 1.1 2007/08/21 01:11:18 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/tsearch/spell.c,v 1.2 2007/08/25 00:03:59 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -21,8 +21,11 @@
  
  
  /*
- * during initialization dictionary requires a lot
- * of memory, so it will use temporary context
+ * Initialization requires a lot of memory that's not needed
+ * after the initialization is done.  In the init function,
+ * CurrentMemoryContext is a long-lived memory context associated
+ * with the dictionary cache entry, so we use a temporary context
+ * for the short-lived stuff.
   */
  static MemoryContext tmpCtx = NULL;
  
@@ -32,6 +35,9 @@ static MemoryContext tmpCtx = NULL;
  static void
  checkTmpCtx(void)
  {
+       /* XXX: This assumes that CurrentMemoryContext doesn't have
+        * any children other than the one we create here.
+        */
         if (CurrentMemoryContext->firstchild == NULL)
         {
                 tmpCtx = AllocSetContextCreate(CurrentMemoryContext,
@@ -74,17 +80,7 @@ cmpspell(const void *s1, const void *s2)
  static int
  cmpspellaffix(const void *s1, const void *s2)
  {
-       return (strcmp((*(const SPELL **) s1)->p.flag, (*(const SPELL **) s2)->p.flag));
-}
-
-static char *
-strnduplicate(char *s, int len)
-{
-       char       *d = (char *) palloc(len + 1);
-
-       memcpy(d, s, len);
-       d[len] = '\0';
-       return d;
+       return (strncmp((*(const SPELL **) s1)->p.flag, (*(const SPELL **) s2)->p.flag, MAXFLAGLEN));
  }
  
  static char *
@@ -185,7 +181,7 @@ NIAddSpell(IspellDict * Conf, const char *word, const char *flag)
         }
         Conf->Spell[Conf->nspell] = (SPELL *) tmpalloc(SPELLHDRSZ + strlen(word) + 1);
         strcpy(Conf->Spell[Conf->nspell]->word, word);
-       strncpy(Conf->Spell[Conf->nspell]->p.flag, flag, 16);
+       strncpy(Conf->Spell[Conf->nspell]->p.flag, flag, MAXFLAGLEN);
         Conf->nspell++;
  }
  
@@ -197,9 +193,8 @@ NIAddSpell(IspellDict * Conf, const char *word, const char *flag)
  void
  NIImportDictionary(IspellDict * Conf, const char *filename)
  {
-       char            str[BUFSIZ],
-                          *pstr;
         FILE       *dict;
+       char       *line;
  
         checkTmpCtx();
  
@@ -209,19 +204,14 @@ NIImportDictionary(IspellDict * Conf, const char *filename)
                                  errmsg("could not open dictionary file \"%s\": %m",
                                                 filename)));
  
-       while (fgets(str, sizeof(str), dict))
+       while ((line = t_readline(dict)) != NULL)
         {
-               char       *s,
-                                  *recoded;
+               char       *s, *pstr;
                 const char *flag;
  
-               recoded = (char *) pg_do_encoding_conversion((unsigned char *) str, strlen(str),
-                                                                                        PG_UTF8, GetDatabaseEncoding());
-               if (recoded == NULL)
-                       elog(ERROR, "encoding conversion failed");
-
+               /* Extract flag from the line */
                 flag = NULL;
-               if ((s = findchar(recoded, '/')))
+               if ((s = findchar(line, '/')))
                 {
                         *s++ = '\0';
                         flag = s;
@@ -240,8 +230,8 @@ NIImportDictionary(IspellDict * Conf, const char *filename)
                 else
                         flag = "";
  
-
-               s = recoded;
+               /* Remove trailing spaces */
+               s = line;
                 while (*s)
                 {
                         if (t_isspace(s))
@@ -251,13 +241,12 @@ NIImportDictionary(IspellDict * Conf, const char *filename)
                         }
                         s += pg_mblen(s);
                 }
-               pstr = lowerstr_ctx(recoded);
+               pstr = lowerstr_ctx(line);
  
                 NIAddSpell(Conf, pstr, flag);
                 pfree(pstr);
  
-               if (recoded != str)
-                       pfree(recoded);
+               pfree(line);
         }
         FreeFile(dict);
  }
@@ -402,7 +391,7 @@ NIAddAffix(IspellDict * Conf, int flag, char flagflags, const char *mask, const
  
  static bool
  parse_affentry(char *str, char *mask, char *find, char *repl,
-                          const char *filename, int line)
+                          const char *filename, int lineno)
  {
         int                     state = PAE_WAIT_MASK;
         char       *pmask = mask,
@@ -453,7 +442,7 @@ parse_affentry(char *str, char *mask, char *find, char *repl,
                                 ereport(ERROR,
                                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
                                                  errmsg("syntax error at line %d of affix file \"%s\"",
-                                                               line, filename)));
+                                                               lineno, filename)));
                 }
                 else if (state == PAE_INFIND)
                 {
@@ -471,7 +460,7 @@ parse_affentry(char *str, char *mask, char *find, char *repl,
                                 ereport(ERROR,
                                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
                                                  errmsg("syntax error at line %d of affix file \"%s\"",
-                                                               line, filename)));
+                                                               lineno, filename)));
                 }
                 else if (state == PAE_WAIT_REPL)
                 {
@@ -489,7 +478,7 @@ parse_affentry(char *str, char *mask, char *find, char *repl,
                                 ereport(ERROR,
                                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
                                                  errmsg("syntax error at line %d of affix file \"%s\"",
-                                                               line, filename)));
+                                                               lineno, filename)));
                 }
                 else if (state == PAE_INREPL)
                 {
@@ -507,7 +496,7 @@ parse_affentry(char *str, char *mask, char *find, char *repl,
                                 ereport(ERROR,
                                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
                                                  errmsg("syntax error at line %d of affix file \"%s\"",
-                                                               line, filename)));
+                                                               lineno, filename)));
                 }
                 else
                         elog(ERROR, "unknown state in parse_affentry: %d", state);
@@ -522,7 +511,7 @@ parse_affentry(char *str, char *mask, char *find, char *repl,
  
  static void
  addFlagValue(IspellDict * Conf, char *s, uint32 val,
-                        const char *filename, int line)
+                        const char *filename, int lineno)
  {
         while (*s && t_isspace(s))
                 s++;
@@ -531,13 +520,13 @@ addFlagValue(IspellDict * Conf, char *s, uint32 val,
                 ereport(ERROR,
                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
                                  errmsg("syntax error at line %d of affix file \"%s\"",
-                                               line, filename)));
+                                               lineno, filename)));
  
         if (pg_mblen(s) != 1)
                 ereport(ERROR,
                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
                                  errmsg("multibyte flag character is not allowed at line %d of affix file \"%s\"",
-                                               line, filename)));
+                                               lineno, filename)));
  
         Conf->flagval[(unsigned int) *s] = (unsigned char) val;
         Conf->usecompound = true;
@@ -546,7 +535,6 @@ addFlagValue(IspellDict * Conf, char *s, uint32 val,
  static void
  NIImportOOAffixes(IspellDict * Conf, const char *filename)
  {
-       char            str[BUFSIZ];
         char            type[BUFSIZ],
                            *ptype = NULL;
         char            sflag[BUFSIZ];
@@ -560,9 +548,10 @@ NIImportOOAffixes(IspellDict * Conf, const char *filename)
         int                     flag = 0;
         char            flagflags = 0;
         FILE       *affix;
-       int                     line = 0;
+       int                     lineno = 0;
         int                     scanread = 0;
         char            scanbuf[BUFSIZ];
+       char       *recoded;
  
         checkTmpCtx();
  
@@ -576,45 +565,41 @@ NIImportOOAffixes(IspellDict * Conf, const char *filename)
                                  errmsg("could not open affix file \"%s\": %m",
                                                 filename)));
  
-       while (fgets(str, sizeof(str), affix))
+       while ((recoded = t_readline(affix)) != NULL)
         {
-               char       *recoded;
-
-               recoded = (char *) pg_do_encoding_conversion((unsigned char *) str, strlen(str),
-                                                                                        PG_UTF8, GetDatabaseEncoding());
-               if (recoded == NULL)
-                       elog(ERROR, "encoding conversion failed");
-
-               line++;
+               lineno++;
  
                 if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
+               {
+                       pfree(recoded);
                         continue;
+               }
  
                 if (STRNCMP(recoded, "COMPOUNDFLAG") == 0)
                         addFlagValue(Conf, recoded + strlen("COMPOUNDFLAG"),
-                                                FF_COMPOUNDFLAG, filename, line);
+                                                FF_COMPOUNDFLAG, filename, lineno);
                 else if (STRNCMP(recoded, "COMPOUNDBEGIN") == 0)
                         addFlagValue(Conf, recoded + strlen("COMPOUNDBEGIN"),
-                                                FF_COMPOUNDBEGIN, filename, line);
+                                                FF_COMPOUNDBEGIN, filename, lineno);
                 else if (STRNCMP(recoded, "COMPOUNDLAST") == 0)
                         addFlagValue(Conf, recoded + strlen("COMPOUNDLAST"),
-                                                FF_COMPOUNDLAST, filename, line);
+                                                FF_COMPOUNDLAST, filename, lineno);
                 /* COMPOUNDLAST and COMPOUNDEND are synonyms */
                 else if (STRNCMP(recoded, "COMPOUNDEND") == 0)
                         addFlagValue(Conf, recoded + strlen("COMPOUNDEND"),
-                                                FF_COMPOUNDLAST, filename, line);
+                                                FF_COMPOUNDLAST, filename, lineno);
                 else if (STRNCMP(recoded, "COMPOUNDMIDDLE") == 0)
                         addFlagValue(Conf, recoded + strlen("COMPOUNDMIDDLE"),
-                                                FF_COMPOUNDMIDDLE, filename, line);
+                                                FF_COMPOUNDMIDDLE, filename, lineno);
                 else if (STRNCMP(recoded, "ONLYINCOMPOUND") == 0)
                         addFlagValue(Conf, recoded + strlen("ONLYINCOMPOUND"),
-                                                FF_COMPOUNDONLY, filename, line);
+                                                FF_COMPOUNDONLY, filename, lineno);
                 else if (STRNCMP(recoded, "COMPOUNDPERMITFLAG") == 0)
                         addFlagValue(Conf, recoded + strlen("COMPOUNDPERMITFLAG"),
-                                                FF_COMPOUNDPERMITFLAG, filename, line);
+                                                FF_COMPOUNDPERMITFLAG, filename, lineno);
                 else if (STRNCMP(recoded, "COMPOUNDFORBIDFLAG") == 0)
                         addFlagValue(Conf, recoded + strlen("COMPOUNDFORBIDFLAG"),
-                                                FF_COMPOUNDFORBIDFLAG, filename, line);
+                                                FF_COMPOUNDFORBIDFLAG, filename, lineno);
                 else if (STRNCMP(recoded, "FLAG") == 0)
                 {
                         char       *s = recoded + strlen("FLAG");
@@ -626,14 +611,13 @@ NIImportOOAffixes(IspellDict * Conf, const char *filename)
                                 ereport(ERROR,
                                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
                                                  errmsg("Ispell dictionary supports only default flag value at line %d of affix file \"%s\"",
-                                                               line, filename)));
+                                                               lineno, filename)));
                 }
  
-               if (recoded != str)
-                       pfree(recoded);
+               pfree(recoded);
         }
         FreeFile(affix);
-       line = 0;
+       lineno = 0;
  
         sprintf(scanbuf, "%%6s %%%ds %%%ds %%%ds %%%ds", BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5);
  
@@ -643,18 +627,11 @@ NIImportOOAffixes(IspellDict * Conf, const char *filename)
                                  errmsg("could not open affix file \"%s\": %m",
                                                 filename)));
  
-       while (fgets(str, sizeof(str), affix))
+       while ((recoded = t_readline(affix)) != NULL)
         {
-               char       *recoded;
-
-               recoded = (char *) pg_do_encoding_conversion((unsigned char *) str, strlen(str),
-                                                                                        PG_UTF8, GetDatabaseEncoding());
-               if (recoded == NULL)
-                       elog(ERROR, "encoding conversion failed");
-
-               line++;
+               lineno++;
                 if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
-                       continue;
+                       goto nextline;
  
                 scanread = sscanf(recoded, scanbuf, type, sflag, find, repl, mask);
  
@@ -662,12 +639,12 @@ NIImportOOAffixes(IspellDict * Conf, const char *filename)
                         pfree(ptype);
                 ptype = lowerstr_ctx(type);
                 if (scanread < 4 || (STRNCMP(ptype, "sfx") && STRNCMP(ptype, "pfx")))
-                       continue;
+                       goto nextline;
  
                 if (scanread == 4)
                 {
                         if (strlen(sflag) != 1)
-                               continue;
+                               goto nextline;
                         flag = *sflag;
                         isSuffix = (STRNCMP(ptype, "sfx") == 0) ? true : false;
                         pfind = lowerstr_ctx(find);
@@ -683,7 +660,7 @@ NIImportOOAffixes(IspellDict * Conf, const char *filename)
                         int                     aflg = 0;
  
                         if (strlen(sflag) != 1 || flag != *sflag || flag == 0)
-                               continue;
+                               goto nextline;
                         prepl = lowerstr_ctx(repl);
                         /* affix flag */
                         if ((ptr = strchr(prepl, '/')) != NULL)
@@ -710,8 +687,8 @@ NIImportOOAffixes(IspellDict * Conf, const char *filename)
                         pfree(pmask);
                 }
  
-               if (recoded != str)
-                       pfree(recoded);
+       nextline:
+               pfree(recoded);
         }
  
         if (ptype)
@@ -733,13 +710,14 @@ NIImportAffixes(IspellDict * Conf, const char *filename)
         char            find[BUFSIZ];
         char            repl[BUFSIZ];
         char       *s;
-       int                     suffixes = 0;
-       int                     prefixes = 0;
+       bool            suffixes = false;
+       bool            prefixes = false;
         int                     flag = 0;
         char            flagflags = 0;
         FILE       *affix;
-       int                     line = 0;
-       int                     oldformat = 0;
+       int                     lineno = 0;
+       bool            oldformat = false;
+       char       *recoded = NULL;
  
         checkTmpCtx();
  
@@ -752,16 +730,16 @@ NIImportAffixes(IspellDict * Conf, const char *filename)
         memset(Conf->flagval, 0, sizeof(Conf->flagval));
         Conf->usecompound = false;
  
-       while (fgets(str, sizeof(str), affix))
+       while ((recoded = t_readline(affix)) != NULL)
         {
-               if (pstr)
-                       pfree(pstr);
+               pstr = lowerstr(recoded);
+               pfree(recoded);
  
-               pstr = recode_and_lowerstr(str);
+               lineno++;
  
-               line++;
+               /* Skip comments and empty lines */
                 if (*pstr == '#' || *pstr == '\n')
-                       continue;
+                       goto nextline;
  
                 if (STRNCMP(pstr, "compoundwords") == 0)
                 {
@@ -777,23 +755,23 @@ NIImportAffixes(IspellDict * Conf, const char *filename)
                                         Conf->flagval[(unsigned int) *s] = FF_COMPOUNDFLAG;
                                         Conf->usecompound = true;
                                 }
-                               oldformat++;
-                               continue;
+                               oldformat = true;
+                               goto nextline;
                         }
                 }
                 if (STRNCMP(pstr, "suffixes") == 0)
                 {
-                       suffixes = 1;
-                       prefixes = 0;
-                       oldformat++;
-                       continue;
+                       suffixes = true;
+                       prefixes = false;
+                       oldformat = true;
+                       goto nextline;
                 }
                 if (STRNCMP(pstr, "prefixes") == 0)
                 {
-                       suffixes = 0;
-                       prefixes = 1;
-                       oldformat++;
-                       continue;
+                       suffixes = false;
+                       prefixes = true;
+                       oldformat = true;
+                       goto nextline;
                 }
                 if (STRNCMP(pstr, "flag") == 0)
                 {
@@ -802,14 +780,14 @@ NIImportAffixes(IspellDict * Conf, const char *filename)
  
                         while (*s && t_isspace(s))
                                 s++;
-                       oldformat++;
+                       oldformat = true;
  
                         /* allow only single-encoded flags */
                         if (pg_mblen(s) != 1)
                                 ereport(ERROR,
                                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
                                                  errmsg("multibyte flag character is not allowed at line %d of affix file \"%s\"",
-                                                               line, filename)));
+                                                               lineno, filename)));
  
                         if (*s == '*')
                         {
@@ -830,10 +808,10 @@ NIImportAffixes(IspellDict * Conf, const char *filename)
                                 ereport(ERROR,
                                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
                                                  errmsg("multibyte flag character is not allowed at line %d of affix file \"%s\"",
-                                                               line, filename)));
+                                                               lineno, filename)));
  
                         flag = (unsigned char) *s;
-                       continue;
+                       goto nextline;
                 }
                 if (STRNCMP(str, "COMPOUNDFLAG") == 0 || STRNCMP(str, "COMPOUNDMIN") == 0 ||
                         STRNCMP(str, "PFX") == 0 || STRNCMP(str, "SFX") == 0)
@@ -842,23 +820,23 @@ NIImportAffixes(IspellDict * Conf, const char *filename)
                                 ereport(ERROR,
                                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
                                                  errmsg("wrong affix file format for flag at line %d of affix file \"%s\"",
-                                                               line, filename)));
+                                                               lineno, filename)));
                         FreeFile(affix);
                         NIImportOOAffixes(Conf, filename);
                         return;
                 }
                 if ((!suffixes) && (!prefixes))
-                       continue;
+                       goto nextline;
  
-               if (!parse_affentry(pstr, mask, find, repl, filename, line))
-                       continue;
+               if (!parse_affentry(pstr, mask, find, repl, filename, lineno))
+                       goto nextline;
  
                 NIAddAffix(Conf, flag, flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX);
-       }
-       FreeFile(affix);
  
-       if (pstr)
+       nextline:
                 pfree(pstr);
+       }
+       FreeFile(affix);
  }
  
  static int
@@ -975,38 +953,55 @@ mkSPNode(IspellDict * Conf, int low, int high, int level)
         return rs;
  }
  
+/*
+ * Builds the Conf->Dictionary tree and AffixData from the imported dictionary 
+ * and affixes.
+ */
  void
  NISortDictionary(IspellDict * Conf)
  {
-       size_t          i;
-       int                     naffix = 3;
+       int     i;
+       int     naffix = 0;
+       int     curaffix;
  
         checkTmpCtx();
  
         /* compress affixes */
+
+       /* Count the number of different flags used in the dictionary */
+
         qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspellaffix);
-       for (i = 1; i < Conf->nspell; i++)
-               if (strcmp(Conf->Spell[i]->p.flag, Conf->Spell[i - 1]->p.flag))
+
+       naffix = 0;
+       for (i = 0; i < Conf->nspell; i++)
+       {
+               if (i == 0 || strncmp(Conf->Spell[i]->p.flag, Conf->Spell[i - 1]->p.flag, MAXFLAGLEN))
                         naffix++;
+       }
  
+       /*
+        * Fill in Conf->AffixData with the affixes that were used
+        * in the dictionary. Replace textual flag-field of Conf->Spell 
+        * entries with indexes into Conf->AffixData array.
+        */
         Conf->AffixData = (char **) palloc0(naffix * sizeof(char *));
-       naffix = 1;
-       Conf->AffixData[0] = pstrdup("");
-       Conf->AffixData[1] = pstrdup(Conf->Spell[0]->p.flag);
-       Conf->Spell[0]->p.d.affix = 1;
-       Conf->Spell[0]->p.d.len = strlen(Conf->Spell[0]->word);
-       for (i = 1; i < Conf->nspell; i++)
+
+       curaffix = -1;
+       for (i = 0; i < Conf->nspell; i++)
         {
-               if (strcmp(Conf->Spell[i]->p.flag, Conf->AffixData[naffix]))
+               if (i == 0 || strncmp(Conf->Spell[i]->p.flag, Conf->AffixData[curaffix], MAXFLAGLEN))
                 {
-                       naffix++;
-                       Conf->AffixData[naffix] = pstrdup(Conf->Spell[i]->p.flag);
+                       curaffix++;
+                       Assert(curaffix < naffix);
+                       Conf->AffixData[curaffix] = pstrdup(Conf->Spell[i]->p.flag);
                 }
-               Conf->Spell[i]->p.d.affix = naffix;
+
+               Conf->Spell[i]->p.d.affix = curaffix;
                 Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word);
         }
  
         Conf->lenAffixData = Conf->nAffixData = naffix;
+
         qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspell);
         Conf->Dictionary = mkSPNode(Conf, 0, Conf->nspell, 0);
  
@@ -1085,7 +1080,7 @@ mkANode(IspellDict * Conf, int low, int high, int level, int type)
  }
  
  static void
-mkVoidAffix(IspellDict * Conf, int issuffix, int startsuffix)
+mkVoidAffix(IspellDict * Conf, bool issuffix, int startsuffix)
  {
         int                     i,
                                 cnt = 0;
@@ -1145,7 +1140,7 @@ NISortAffixes(IspellDict * Conf)
         AFFIX      *Affix;
         size_t          i;
         CMPDAffix  *ptr;
-       int                     firstsuffix = -1;
+       int                     firstsuffix = Conf->naffixes;
  
         checkTmpCtx();
  
@@ -1160,7 +1155,7 @@ NISortAffixes(IspellDict * Conf)
         for (i = 0; i < Conf->naffixes; i++)
         {
                 Affix = &(((AFFIX *) Conf->Affix)[i]);
-               if (Affix->type == FF_SUFFIX && firstsuffix < 0)
+               if (Affix->type == FF_SUFFIX && i < firstsuffix)
                         firstsuffix = i;
  
                 if ((Affix->flagflags & FF_COMPOUNDFLAG) && Affix->replen > 0 &&
@@ -1185,12 +1180,12 @@ NISortAffixes(IspellDict * Conf)
  
         Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, FF_PREFIX);
         Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, FF_SUFFIX);
-       mkVoidAffix(Conf, 1, firstsuffix);
-       mkVoidAffix(Conf, 0, firstsuffix);
+       mkVoidAffix(Conf, true, firstsuffix);
+       mkVoidAffix(Conf, false, firstsuffix);
  }
  
  static AffixNodeData *
-FinfAffixes(AffixNode * node, const char *word, int wrdlen, int *level, int type)
+FindAffixes(AffixNode * node, const char *word, int wrdlen, int *level, int type)
  {
         AffixNodeData *StopLow,
                            *StopHigh,
@@ -1374,7 +1369,7 @@ NormalizeSubWord(IspellDict * Conf, char *word, char flag)
         plevel = 0;
         while (pnode)
         {
-               prefix = FinfAffixes(pnode, word, wrdlen, &plevel, FF_PREFIX);
+               prefix = FindAffixes(pnode, word, wrdlen, &plevel, FF_PREFIX);
                 if (!prefix)
                         break;
                 for (j = 0; j < prefix->naff; j++)
@@ -1398,7 +1393,7 @@ NormalizeSubWord(IspellDict * Conf, char *word, char flag)
                 int                     baselen = 0;
  
                 /* find possible suffix */
-               suffix = FinfAffixes(snode, word, wrdlen, &slevel, FF_SUFFIX);
+               suffix = FindAffixes(snode, word, wrdlen, &slevel, FF_SUFFIX);
                 if (!suffix)
                         break;
                 /* foreach suffix check affix */
@@ -1416,7 +1411,7 @@ NormalizeSubWord(IspellDict * Conf, char *word, char flag)
                                 swrdlen = strlen(newword);
                                 while (pnode)
                                 {
-                                       prefix = FinfAffixes(pnode, newword, swrdlen, &plevel, FF_PREFIX);
+                                       prefix = FindAffixes(pnode, newword, swrdlen, &plevel, FF_PREFIX);
                                         if (!prefix)
                                                 break;
                                         for (j = 0; j < prefix->naff; j++)
@@ -1626,7 +1621,7 @@ SplitToVariants(IspellDict * Conf, SPNode * snode, SplitVar * orig, char *word,
                                         if (wordlen == level + 1)
                                         {
                                                 /* well, it was last word */
-                                               var->stem[var->nstem] = strnduplicate(word + startpos, wordlen - startpos);
+                                               var->stem[var->nstem] = pnstrdup(word + startpos, wordlen - startpos);
                                                 var->nstem++;
                                                 pfree(notprobed);
                                                 return var;
@@ -1641,7 +1636,7 @@ SplitToVariants(IspellDict * Conf, SPNode * snode, SplitVar * orig, char *word,
                                                 ptr->next = SplitToVariants(Conf, node, var, word, wordlen, startpos, level);
                                                 /* we can find next word */
                                                 level++;
-                                               var->stem[var->nstem] = strnduplicate(word + startpos, level - startpos);
+                                               var->stem[var->nstem] = pnstrdup(word + startpos, level - startpos);
                                                 var->nstem++;
                                                 node = Conf->Dictionary;
                                                 startpos = level;
@@ -1656,7 +1651,7 @@ SplitToVariants(IspellDict * Conf, SPNode * snode, SplitVar * orig, char *word,
                 level++;
         }
  
-       var->stem[var->nstem] = strnduplicate(word + startpos, wordlen - startpos);
+       var->stem[var->nstem] = pnstrdup(word + startpos, wordlen - startpos);
         var->nstem++;
         pfree(notprobed);
         return var;
diff --git a/src/backend/tsearch/ts_locale.c b/src/backend/tsearch/ts_locale.c

index c822f086e0aca7305fa0c42931cb3029f30e857d..361152e6bec9be1d7ccfe8aefee09213d92dba86 100644 (file)
--- a/src/backend/tsearch/ts_locale.c
+++ b/src/backend/tsearch/ts_locale.c
@@ -7,7 +7,7 @@
   *
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/tsearch/ts_locale.c,v 1.1 2007/08/21 01:11:18 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/tsearch/ts_locale.c,v 1.2 2007/08/25 00:03:59 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -125,28 +125,47 @@ _t_isprint(const char *ptr)
  }
  #endif   /* TS_USE_WIDE */
  
+
  /*
- * Convert C-string from UTF8 to server encoding and
- * lower it
+ * Read the next line from a tsearch data file (expected to be in UTF-8), and
+ * convert it to database encoding if needed. The returned string is palloc'd.
+ * NULL return means EOF.
   */
  char *
-recode_and_lowerstr(char *str)
+t_readline(FILE *fp)
  {
-       char       *recoded;
-       char       *ret;
+       int len;
+       char *recoded;
+       char buf[4096];         /* lines must not be longer than this */
+       
+       if (fgets(buf, sizeof(buf), fp) == NULL)
+               return NULL;
  
-       recoded = (char *) pg_do_encoding_conversion((unsigned char *) str, strlen(str),
-                                                                                        PG_UTF8, GetDatabaseEncoding());
+       len = strlen(buf);
  
-       if (recoded == NULL)
-               elog(ERROR, "encoding conversion failed");
+       /* Make sure the input is valid UTF-8 */
+       (void) pg_verify_mbstr(PG_UTF8, buf, len, false);
  
-       ret = lowerstr(recoded);
+       /* And convert */
+       recoded = (char *) pg_do_encoding_conversion((unsigned char *) buf,
+                                                                                                len,
+                                                                                                PG_UTF8,
+                                                                                                GetDatabaseEncoding());
  
-       if (recoded != str)
-               pfree(recoded);
+       if (recoded == NULL)            /* should not happen */
+               elog(ERROR, "encoding conversion failed");
+
+       if (recoded == buf)
+       {
+               /*
+                * conversion didn't pstrdup, so we must.
+                * We can use the length of the original string, because
+                * no conversion was done.
+                */
+               recoded = pnstrdup(recoded, len);
+       }
  
-       return ret;
+       return recoded;
  }
  
  char *
@@ -155,6 +174,9 @@ lowerstr(char *str)
         return lowerstr_with_len(str, strlen(str));
  }
  
+/*
+ * Returned string is palloc'd
+ */
  char *
  lowerstr_with_len(char *str, int len)
  {
diff --git a/src/backend/tsearch/ts_parse.c b/src/backend/tsearch/ts_parse.c

index f286a61fb0ddbeaa3b44b6a5543c3da80da3c91f..47e18fc1ac5b3c75137208621c40676ae401d321 100644 (file)
--- a/src/backend/tsearch/ts_parse.c
+++ b/src/backend/tsearch/ts_parse.c
@@ -7,7 +7,7 @@
   *
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/tsearch/ts_parse.c,v 1.1 2007/08/21 01:11:18 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/tsearch/ts_parse.c,v 1.2 2007/08/25 00:03:59 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -308,7 +308,7 @@ LexizeExec(LexizeData * ld, ParsedLex ** correspondLexem)
                         {
                                 /*
                                  * Dictionary normalizes lexemes, so we remove from stack all
-                                * used lexemes , return to basic mode and redo end of stack
+                                * used lexemes, return to basic mode and redo end of stack
                                  * (if it exists)
                                  */
                                 if (res)
@@ -427,14 +427,14 @@ parsetext(Oid cfgId, ParsedText * prs, char *buf, int4 buflen)
   * Headline framework
   */
  static void
-hladdword(HeadlineText * prs, char *buf, int4 buflen, int type)
+hladdword(HeadlineParsedText * prs, char *buf, int4 buflen, int type)
  {
         while (prs->curwords >= prs->lenwords)
         {
                 prs->lenwords *= 2;
-               prs->words = (HeadlineWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWord));
+               prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
         }
-       memset(&(prs->words[prs->curwords]), 0, sizeof(HeadlineWord));
+       memset(&(prs->words[prs->curwords]), 0, sizeof(HeadlineWordEntry));
         prs->words[prs->curwords].type = (uint8) type;
         prs->words[prs->curwords].len = buflen;
         prs->words[prs->curwords].word = palloc(buflen);
@@ -443,16 +443,16 @@ hladdword(HeadlineText * prs, char *buf, int4 buflen, int type)
  }
  
  static void
-hlfinditem(HeadlineText * prs, TSQuery query, char *buf, int buflen)
+hlfinditem(HeadlineParsedText * prs, TSQuery query, char *buf, int buflen)
  {
         int                     i;
         QueryItem  *item = GETQUERY(query);
-       HeadlineWord *word;
+       HeadlineWordEntry *word;
  
         while (prs->curwords + query->size >= prs->lenwords)
         {
                 prs->lenwords *= 2;
-               prs->words = (HeadlineWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWord));
+               prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
         }
  
         word = &(prs->words[prs->curwords - 1]);
@@ -462,7 +462,7 @@ hlfinditem(HeadlineText * prs, TSQuery query, char *buf, int buflen)
                 {
                         if (word->item)
                         {
-                               memcpy(&(prs->words[prs->curwords]), word, sizeof(HeadlineWord));
+                               memcpy(&(prs->words[prs->curwords]), word, sizeof(HeadlineWordEntry));
                                 prs->words[prs->curwords].item = item;
                                 prs->words[prs->curwords].repeated = 1;
                                 prs->curwords++;
@@ -475,7 +475,7 @@ hlfinditem(HeadlineText * prs, TSQuery query, char *buf, int buflen)
  }
  
  static void
-addHLParsedLex(HeadlineText * prs, TSQuery query, ParsedLex * lexs, TSLexeme * norms)
+addHLParsedLex(HeadlineParsedText * prs, TSQuery query, ParsedLex * lexs, TSLexeme * norms)
  {
         ParsedLex  *tmplexs;
         TSLexeme   *ptr;
@@ -511,7 +511,7 @@ addHLParsedLex(HeadlineText * prs, TSQuery query, ParsedLex * lexs, TSLexeme * n
  }
  
  void
-hlparsetext(Oid cfgId, HeadlineText * prs, TSQuery query, char *buf, int4 buflen)
+hlparsetext(Oid cfgId, HeadlineParsedText * prs, TSQuery query, char *buf, int4 buflen)
  {
         int                     type,
                                 lenlemm;
@@ -571,12 +571,12 @@ hlparsetext(Oid cfgId, HeadlineText * prs, TSQuery query, char *buf, int4 buflen
  }
  
  text *
-generatHeadline(HeadlineText * prs)
+generateHeadline(HeadlineParsedText * prs)
  {
         text       *out;
         int                     len = 128;
         char       *ptr;
-       HeadlineWord *wrd = prs->words;
+       HeadlineWordEntry *wrd = prs->words;
  
         out = (text *) palloc(len);
         ptr = ((char *) out) + VARHDRSZ;
diff --git a/src/backend/tsearch/ts_utils.c b/src/backend/tsearch/ts_utils.c

index 9270c403696156cf1deaf19af793ebb9cab577f8..e9ad59282a084737bac5815f51c899e869165ec2 100644 (file)
--- a/src/backend/tsearch/ts_utils.c
+++ b/src/backend/tsearch/ts_utils.c
@@ -7,7 +7,7 @@
   *
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/tsearch/ts_utils.c,v 1.2 2007/08/22 01:39:44 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/tsearch/ts_utils.c,v 1.3 2007/08/25 00:03:59 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -63,21 +63,29 @@ get_tsearch_config_filename(const char *basename,
         return result;
  }
  
-#define STOPBUFLEN     4096
+static int
+comparestr(const void *a, const void *b)
+{
+       return strcmp(*(char **) a, *(char **) b);
+}
  
+/*
+ * Reads a stopword file. Each word is run through 'wordop'
+ * function, if given.  wordop may either modify the input in-place,
+ * or palloc a new version.
+ */
  void
-readstoplist(char *in, StopList * s)
+readstoplist(const char *fname, StopList *s, char *(*wordop) (char *))
  {
         char      **stop = NULL;
  
         s->len = 0;
-       if (in && *in)
+       if (fname && *fname)
         {
-               char       *filename = get_tsearch_config_filename(in, "stop");
+               char       *filename = get_tsearch_config_filename(fname, "stop");
                 FILE       *hin;
-               char            buf[STOPBUFLEN];
+               char       *line;
                 int                     reallen = 0;
-               int                     line = 0;
  
                 if ((hin = AllocateFile(filename, "r")) == NULL)
                         ereport(ERROR,
@@ -85,65 +93,56 @@ readstoplist(char *in, StopList * s)
                                          errmsg("could not open stopword file \"%s\": %m",
                                                         filename)));
  
-               while (fgets(buf, STOPBUFLEN, hin))
+               while ((line = t_readline(hin)) != NULL)
                 {
-                       char       *pbuf = buf;
+                       char *pbuf = line;
  
-                       line++;
-                       while (*pbuf && !isspace(*pbuf))
+                       /* Trim trailing space */
+                       while (*pbuf && !t_isspace(pbuf))
                                 pbuf++;
                         *pbuf = '\0';
  
-                       if (*buf == '\0')
-                               continue;
-
-                       if (!pg_verifymbstr(buf, strlen(buf), true))
+                       /* Skip empty lines */
+                       if (*line == '\0')
                         {
-                               FreeFile(hin);
-                               ereport(ERROR,
-                                               (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
-                                                errmsg("invalid multibyte encoding at line %d in file \"%s\"",
-                                                               line, filename)));
+                               pfree(line);
+                               continue;
                         }
  
                         if (s->len >= reallen)
                         {
                                 if (reallen == 0)
                                 {
-                                       reallen = 16;
+                                       reallen = 64;
                                         stop = (char **) palloc(sizeof(char *) * reallen);
                                 }
                                 else
                                 {
                                         reallen *= 2;
-                                       stop = (char **) repalloc((void *) stop, sizeof(char *) * reallen);
+                                       stop = (char **) repalloc((void *) stop,
+                                                                                         sizeof(char *) * reallen);
                                 }
                         }
  
-
-                       if (s->wordop)
-                               stop[s->len] = s->wordop(buf);
+                       if (wordop)
+                       {
+                               stop[s->len] = wordop(line);
+                               if (stop[s->len] != line)
+                                       pfree(line);
+                       }
                         else
-                               stop[s->len] = pstrdup(buf);
+                               stop[s->len] = line;
  
                         (s->len)++;
                 }
+
                 FreeFile(hin);
                 pfree(filename);
         }
  
         s->stop = stop;
-}
  
-static int
-comparestr(const void *a, const void *b)
-{
-       return strcmp(*(char **) a, *(char **) b);
-}
-
-void
-sortstoplist(StopList * s)
-{
+       /* Sort to allow binary searching */
         if (s->stop && s->len > 0)
                 qsort(s->stop, s->len, sizeof(char *), comparestr);
  }
diff --git a/src/backend/tsearch/wparser.c b/src/backend/tsearch/wparser.c

index e927e98aab2f147ba9188d7e0f6e0dde5605d67f..0582fec2b5f42cbda3c7426b100c2519dd528f1e 100644 (file)
--- a/src/backend/tsearch/wparser.c
+++ b/src/backend/tsearch/wparser.c
@@ -7,7 +7,7 @@
   *
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/tsearch/wparser.c,v 1.2 2007/08/22 01:39:45 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/tsearch/wparser.c,v 1.3 2007/08/25 00:03:59 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -300,7 +300,7 @@ ts_headline_byid_opt(PG_FUNCTION_ARGS)
         text       *in = PG_GETARG_TEXT_P(1);
         TSQuery         query = PG_GETARG_TSQUERY(2);
         text       *opt = (PG_NARGS() > 3 && PG_GETARG_POINTER(3)) ? PG_GETARG_TEXT_P(3) : NULL;
-       HeadlineText prs;
+       HeadlineParsedText prs;
         List       *prsoptions;
         text       *out;
         TSConfigCacheEntry *cfg;
@@ -309,9 +309,9 @@ ts_headline_byid_opt(PG_FUNCTION_ARGS)
         cfg = lookup_ts_config_cache(PG_GETARG_OID(0));
         prsobj = lookup_ts_parser_cache(cfg->prsId);
  
-       memset(&prs, 0, sizeof(HeadlineText));
+       memset(&prs, 0, sizeof(HeadlineParsedText));
         prs.lenwords = 32;
-       prs.words = (HeadlineWord *) palloc(sizeof(HeadlineWord) * prs.lenwords);
+       prs.words = (HeadlineWordEntry *) palloc(sizeof(HeadlineWordEntry) * prs.lenwords);
  
         hlparsetext(cfg->cfgId, &prs, query, VARDATA(in), VARSIZE(in) - VARHDRSZ);
  
@@ -325,7 +325,7 @@ ts_headline_byid_opt(PG_FUNCTION_ARGS)
                                   PointerGetDatum(prsoptions),
                                   PointerGetDatum(query));
  
-       out = generatHeadline(&prs);
+       out = generateHeadline(&prs);
  
         PG_FREE_IF_COPY(in, 1);
         PG_FREE_IF_COPY(query, 2);
diff --git a/src/include/tsearch/dicts/spell.h b/src/include/tsearch/dicts/spell.h

index 6c15a672f33600808239be9eb1e2251c3b0c3e35..3dc013fea1e03525ff72ebb38e605488d3495450 100644 (file)
--- a/src/include/tsearch/dicts/spell.h
+++ b/src/include/tsearch/dicts/spell.h
@@ -6,7 +6,7 @@
   *
   * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
   *
- * $PostgreSQL: pgsql/src/include/tsearch/dicts/spell.h,v 1.1 2007/08/21 01:11:29 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/tsearch/dicts/spell.h,v 1.2 2007/08/25 00:03:59 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -18,12 +18,17 @@
  #include "tsearch/dicts/regis.h"
  #include "tsearch/ts_public.h"
  
+/*
+ * Max length of a flag name. Names longer than this will be truncated
+ * to the maximum. 
+ */
+#define MAXFLAGLEN 16
+
  struct SPNode;
  
  typedef struct
  {
-       uint32
-                               val:8,
+       uint32          val:8,
                                 isword:1,
                                 compoundflag:4,
                                 affix:19;
@@ -54,22 +59,25 @@ typedef struct spell_struct
  {
         union
         {
-               char            flag[16];
+               /*
+                * flag is filled in by NIImportDictionary. After NISortDictionary,
+                * d is valid and flag is invalid. 
+                */
+               char            flag[MAXFLAGLEN];
                 struct
                 {
                         int                     affix;
                         int                     len;
                 }                       d;
         }                       p;
-       char            word[1];
+       char            word[1]; /* variable length, null-terminated */
  } SPELL;
  
  #define SPELLHDRSZ     (offsetof(SPELL, word))
  
  typedef struct aff_struct
  {
-       uint32
-                               flag:8,
+       uint32          flag:8,
                                 type:1,
                                 flagflags:7,
                                 issimple:1,
@@ -85,11 +93,16 @@ typedef struct aff_struct
  } AFFIX;
  
  /*
- * affixes use deictinary flags too
+ * affixes use dictionary flags too
   */
  #define FF_COMPOUNDPERMITFLAG  0x10
  #define FF_COMPOUNDFORBIDFLAG  0x20
  #define FF_CROSSPRODUCT                        0x40
+
+/*
+ * Don't change the order of these. Initialization sorts by these,
+ * and expects prefixes to come first after sorting.
+ */
  #define FF_SUFFIX                              1
  #define FF_PREFIX                              0
  
@@ -97,8 +110,7 @@ struct AffixNode;
  
  typedef struct
  {
-       uint32
-                               val:8,
+       uint32          val:8,
                                 naff:24;
         AFFIX     **aff;
         struct AffixNode *node;
@@ -126,9 +138,13 @@ typedef struct
         int                     naffixes;
         AFFIX      *Affix;
  
-       int                     nspell;
-       int                     mspell;
+       /*
+        * Temporary array of all words in the dict file. Only used during 
+        * initialization
+        */
         SPELL     **Spell;
+       int                     nspell; /* number of valid entries in Spell array */
+       int                     mspell; /* allocated length of Spell array */
  
         AffixNode  *Suffix;
         AffixNode  *Prefix;
diff --git a/src/include/tsearch/ts_locale.h b/src/include/tsearch/ts_locale.h

index 8a197666473e58dee1256135eaf0fa5dab96db65..dcae2af93a4ded4cc8787f3d018d7b227ec66cf8 100644 (file)
--- a/src/include/tsearch/ts_locale.h
+++ b/src/include/tsearch/ts_locale.h
@@ -5,7 +5,7 @@
   *
   * Copyright (c) 1998-2007, PostgreSQL Global Development Group
   *
- * $PostgreSQL: pgsql/src/include/tsearch/ts_locale.h,v 1.1 2007/08/21 01:11:29 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/tsearch/ts_locale.h,v 1.2 2007/08/25 00:03:59 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -38,11 +38,11 @@
  
  #ifdef TS_USE_WIDE
  
-size_t         char2wchar(wchar_t *to, const char *from, size_t len);
+extern size_t char2wchar(wchar_t *to, const char *from, size_t len);
  
  #ifdef WIN32
  
-size_t         wchar2char(char *to, const wchar_t *from, size_t len);
+extern size_t wchar2char(char *to, const wchar_t *from, size_t len);
  #else                                                  /* WIN32 */
  
  /* correct wcstombs */
@@ -81,8 +81,8 @@ extern int    _t_isprint(const char *ptr);
  #define COPYCHAR(d,s)  TOUCHAR(d) = TOUCHAR(s)
  #endif
  
-char      *lowerstr(char *str);
-char      *lowerstr_with_len(char *str, int len);
-char      *recode_and_lowerstr(char *str);
+extern char *lowerstr(char *str);
+extern char *lowerstr_with_len(char *str, int len);
+extern char *t_readline(FILE *fp);
  
  #endif   /* __TSLOCALE_H__ */
diff --git a/src/include/tsearch/ts_public.h b/src/include/tsearch/ts_public.h

index 718abdb61d4aee65c90bda693bae48b290f3f469..148129aa8bc5b165959d8223c4b4f0d7d640d179 100644 (file)
--- a/src/include/tsearch/ts_public.h
+++ b/src/include/tsearch/ts_public.h
@@ -6,7 +6,7 @@
   *
   * Copyright (c) 1998-2007, PostgreSQL Global Development Group
   *
- * $PostgreSQL: pgsql/src/include/tsearch/ts_public.h,v 1.2 2007/08/22 01:39:46 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/tsearch/ts_public.h,v 1.3 2007/08/25 00:03:59 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -71,12 +71,11 @@ typedef struct
  {
         int                     len;
         char      **stop;
-       char       *(*wordop) (char *);
  } StopList;
  
-extern void sortstoplist(StopList * s);
-extern void readstoplist(char *in, StopList * s);
-extern bool searchstoplist(StopList * s, char *key);
+extern void readstoplist(const char *fname, StopList *s,
+                                                char *(*wordop) (char *));
+extern bool searchstoplist(StopList *s, char *key);
  
  /*
   * Interface with dictionaries
@@ -102,9 +101,8 @@ typedef struct
  #define TSL_ADDPOS             0x01
  
  /*
- * Struct for supporting complex dictionaries like
- * thesaurus, pointer to is an 4-th argument for
- * dictlexize method
+ * Struct for supporting complex dictionaries like thesaurus.
+ * 4th argument for dictlexize method is a pointer to this
   */
  typedef struct
  {
diff --git a/src/include/tsearch/ts_utils.h b/src/include/tsearch/ts_utils.h

index f84db4c6e414488da49c363faafba8422d3a7144..d2e5c8d8e4957d231897fe4e9cad33072ce43e80 100644 (file)
--- a/src/include/tsearch/ts_utils.h
+++ b/src/include/tsearch/ts_utils.h
@@ -5,7 +5,7 @@
   *
   * Copyright (c) 1998-2007, PostgreSQL Global Development Group
   *
- * $PostgreSQL: pgsql/src/include/tsearch/ts_utils.h,v 1.1 2007/08/21 01:11:29 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/tsearch/ts_utils.h,v 1.2 2007/08/25 00:03:59 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -13,6 +13,7 @@
  #define _PG_TS_UTILS_H_
  
  #include "tsearch/ts_type.h"
+#include "tsearch/ts_public.h"
  
  /*
   * Common parse definitions for tsvector and tsquery
@@ -38,7 +39,8 @@ typedef struct
  
  extern bool gettoken_tsvector(TSVectorParseState *state);
  
-struct ParseQueryNode;
+struct ParseQueryNode;                 /* private in backend/utils/adt/tsquery.c */
+
  typedef struct
  {
         char       *buffer;                     /* entire string we are scanning */
@@ -46,7 +48,7 @@ typedef struct
         int4            state;
         int4            count;
  
-       /* reverse polish notation in list (for temprorary usage) */
+       /* reverse polish notation in list (for temporary usage) */
         struct ParseQueryNode *str;
  
         /* number in str */
@@ -102,36 +104,12 @@ extern void parsetext(Oid cfgId, ParsedText * prs, char *buf, int4 buflen);
   * headline framework, flow in common to generate:
   *     1 parse text with hlparsetext
   *     2 parser-specific function to find part
- *     3 generatHeadline to generate result text
+ *     3 generateHeadline to generate result text
   */
  
-typedef struct
-{
-       uint32          selected:1,
-                               in:1,
-                               replace:1,
-                               repeated:1,
-                               unused:4,
-                               type:8,
-                               len:16;
-       char       *word;
-       QueryItem  *item;
-} HeadlineWord;
-
-typedef struct
-{
-       HeadlineWord *words;
-       int4            lenwords;
-       int4            curwords;
-       char       *startsel;
-       char       *stopsel;
-       int2            startsellen;
-       int2            stopsellen;
-} HeadlineText;
-
-extern void hlparsetext(Oid cfgId, HeadlineText * prs, TSQuery query,
+extern void hlparsetext(Oid cfgId, HeadlineParsedText * prs, TSQuery query,
                         char *buf, int4 buflen);
-extern text *generatHeadline(HeadlineText * prs);
+extern text *generateHeadline(HeadlineParsedText * prs);
  
  /*
   * token/node types for parsing
author	Tom Lane <tgl@sss.pgh.pa.us>
	Sat, 25 Aug 2007 00:03:59 +0000 (00:03 +0000)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Sat, 25 Aug 2007 00:03:59 +0000 (00:03 +0000)
src/backend/snowball/dict_snowball.c		patch \| blob \| history
src/backend/tsearch/dict_ispell.c		patch \| blob \| history
src/backend/tsearch/dict_simple.c		patch \| blob \| history
src/backend/tsearch/dict_synonym.c		patch \| blob \| history
src/backend/tsearch/dict_thesaurus.c		patch \| blob \| history
src/backend/tsearch/spell.c		patch \| blob \| history
src/backend/tsearch/ts_locale.c		patch \| blob \| history
src/backend/tsearch/ts_parse.c		patch \| blob \| history
src/backend/tsearch/ts_utils.c		patch \| blob \| history
src/backend/tsearch/wparser.c		patch \| blob \| history
src/include/tsearch/dicts/spell.h		patch \| blob \| history
src/include/tsearch/ts_locale.h		patch \| blob \| history
src/include/tsearch/ts_public.h		patch \| blob \| history
src/include/tsearch/ts_utils.h		patch \| blob \| history