Improve support for agglutinative languages (queries with compound words).

author Teodor Sigaev <teodor@sigaev.ru>

Tue, 25 Jan 2005 15:24:38 +0000 (15:24 +0000)

committer Teodor Sigaev <teodor@sigaev.ru>

Tue, 25 Jan 2005 15:24:38 +0000 (15:24 +0000)
author Teodor Sigaev <teodor@sigaev.ru>
Tue, 25 Jan 2005 15:24:38 +0000 (15:24 +0000)
committer Teodor Sigaev <teodor@sigaev.ru>
Tue, 25 Jan 2005 15:24:38 +0000 (15:24 +0000)
diff --git a/contrib/tsearch2/dict.c b/contrib/tsearch2/dict.c

index 357097681e55d0a839c95aeb1e50eab2fb22b6f8..7a3626b3bc9934c5590ea2188b9931a1d04086b8 100644 (file)
--- a/contrib/tsearch2/dict.c
+++ b/contrib/tsearch2/dict.c
@@ -183,15 +183,15 @@ lexize(PG_FUNCTION_ARGS)
  {
         text       *in = PG_GETARG_TEXT_P(1);
         DictInfo   *dict;
-       char      **res,
-                         **ptr;
+       TSLexeme          *res,
+                         *ptr;
         Datum      *da;
         ArrayType  *a;
  
         SET_FUNCOID();
         dict = finddict(PG_GETARG_OID(0));
  
-       ptr = res = (char **) DatumGetPointer(
+       ptr = res = (TSLexeme *) DatumGetPointer(
                                                                           FunctionCall3(&(dict->lexize_info),
                                                                            PointerGetDatum(dict->dictionary),
                                                                                         PointerGetDatum(VARDATA(in)),
@@ -207,13 +207,13 @@ lexize(PG_FUNCTION_ARGS)
                         PG_RETURN_NULL();
         }
  
-       while (*ptr)
+       while (ptr->lexeme)
                 ptr++;
         da = (Datum *) palloc(sizeof(Datum) * (ptr - res + 1));
         ptr = res;
-       while (*ptr)
+       while (ptr->lexeme)
         {
-               da[ptr - res] = PointerGetDatum(char2text(*ptr));
+               da[ptr - res] = PointerGetDatum(char2text(ptr->lexeme));
                 ptr++;
         }
  
@@ -227,10 +227,10 @@ lexize(PG_FUNCTION_ARGS)
                 );
  
         ptr = res;
-       while (*ptr)
+       while (ptr->lexeme)
         {
                 pfree(DatumGetPointer(da[ptr - res]));
-               pfree(*ptr);
+               pfree(ptr->lexeme);
                 ptr++;
         }
         pfree(res);
diff --git a/contrib/tsearch2/dict.h b/contrib/tsearch2/dict.h

index 86ea42263e5069a201de18fa041f4b13952ecd66..a21086a49dd4b0732b64d317051221eca221f22f 100644 (file)
--- a/contrib/tsearch2/dict.h
+++ b/contrib/tsearch2/dict.h
@@ -38,4 +38,27 @@ typedef struct
  
  void           parse_cfgdict(text *in, Map ** m);
  
+/* Element of the array returned by any lexize function; the array ends at the first entry whose lexeme is NULL */
+typedef struct {
+       /* Number of the split variant this lexeme belongs to. For example,
+               the Norwegian word 'fotballklubber' can be split in two ways:
+               ( fotball, klubb ) and ( fot, ball, klubb ). So the dictionary
+               should return:
+               nvariant        lexeme
+               1               fotball
+               1               klubb
+               2               fot
+               2               ball
+               2               klubb
+
+       */
+       uint16  nvariant;
+
+       /* currently unused */
+       uint16  flags;
+
+       /* NUL-terminated C string; a NULL pointer here terminates the result array */
+       char    *lexeme;
+} TSLexeme;
+
  #endif
diff --git a/contrib/tsearch2/dict_ex.c b/contrib/tsearch2/dict_ex.c

index a8fb20453ba2125bed97bbadc0cfff28cef5112f..241161a5c21567a4338751107f8280b89f56e8ae 100644 (file)
--- a/contrib/tsearch2/dict_ex.c
+++ b/contrib/tsearch2/dict_ex.c
@@ -54,16 +54,16 @@ dex_lexize(PG_FUNCTION_ARGS)
         DictExample *d = (DictExample *) PG_GETARG_POINTER(0);
         char       *in = (char *) PG_GETARG_POINTER(1);
         char       *txt = pnstrdup(in, PG_GETARG_INT32(2));
-       char      **res = palloc(sizeof(char *) * 2);
+       TSLexeme   *res = palloc(sizeof(TSLexeme) * 2);
+
+       memset(res,0,sizeof(TSLexeme) * 2);
  
         if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
         {
                 pfree(txt);
-               res[0] = NULL;
         }
         else
-               res[0] = txt;
-       res[1] = NULL;
+               res[0].lexeme = txt;
  
         PG_RETURN_POINTER(res);
  }
diff --git a/contrib/tsearch2/dict_ispell.c b/contrib/tsearch2/dict_ispell.c

index 5725c8fb3629020d8efaa3d3c475879e80c301e2..9af11edf8e21434afb59a16805bd2431476e58f2 100644 (file)
--- a/contrib/tsearch2/dict_ispell.c
+++ b/contrib/tsearch2/dict_ispell.c
@@ -159,14 +159,13 @@ spell_lexize(PG_FUNCTION_ARGS)
         DictISpell *d = (DictISpell *) PG_GETARG_POINTER(0);
         char       *in = (char *) PG_GETARG_POINTER(1);
         char       *txt;
-       char      **res;
-       char      **ptr,
-                         **cptr;
+       TSLexeme          *res;
+       TSLexeme          *ptr,
+                         *cptr;
  
         if (!PG_GETARG_INT32(2))
                 PG_RETURN_POINTER(NULL);
  
-       res = palloc(sizeof(char *) * 2);
         txt = pnstrdup(in, PG_GETARG_INT32(2));
         res = NINormalizeWord(&(d->obj), txt);
         pfree(txt);
@@ -175,22 +174,22 @@ spell_lexize(PG_FUNCTION_ARGS)
                 PG_RETURN_POINTER(NULL);
  
         ptr = cptr = res;
-       while (*ptr)
+       while (ptr->lexeme)
         {
-               if (searchstoplist(&(d->stoplist), *ptr))
+               if (searchstoplist(&(d->stoplist), ptr->lexeme))
                 {
-                       pfree(*ptr);
-                       *ptr = NULL;
+                       pfree(ptr->lexeme);
+                       ptr->lexeme = NULL;
                         ptr++;
                 }
                 else
                 {
-                       *cptr = *ptr;
+                       memcpy(cptr, ptr, sizeof(TSLexeme));
                         cptr++;
                         ptr++;
                 }
         }
-       *cptr = NULL;
+       cptr->lexeme = NULL;
  
         PG_RETURN_POINTER(res);
  }
diff --git a/contrib/tsearch2/dict_snowball.c b/contrib/tsearch2/dict_snowball.c

index 51dba0444994e36eb8017aaa8c3b86287054151b..03850b33ea44843d52e59e7ed83b30018062acbb 100644 (file)
--- a/contrib/tsearch2/dict_snowball.c
+++ b/contrib/tsearch2/dict_snowball.c
@@ -105,12 +105,12 @@ snb_lexize(PG_FUNCTION_ARGS)
         DictSnowball *d = (DictSnowball *) PG_GETARG_POINTER(0);
         char       *in = (char *) PG_GETARG_POINTER(1);
         char       *txt = pnstrdup(in, PG_GETARG_INT32(2));
-       char      **res = palloc(sizeof(char *) * 2);
+       TSLexeme          *res = palloc(sizeof(TSLexeme) * 2);
  
+       memset(res, 0, sizeof(TSLexeme) * 2);
         if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
         {
                 pfree(txt);
-               res[0] = NULL;
         }
         else
         {
@@ -122,10 +122,8 @@ snb_lexize(PG_FUNCTION_ARGS)
                         memcpy(txt, d->z->p, d->z->l);
                         txt[d->z->l] = '\0';
                 }
-               res[0] = txt;
+               res->lexeme = txt;
         }
-       res[1] = NULL;
-
  
         PG_RETURN_POINTER(res);
  }
diff --git a/contrib/tsearch2/dict_syn.c b/contrib/tsearch2/dict_syn.c

index 046a59490394195501c35adf1178f6edaba7448a..6e3ed8643450892d7352f4d03aed90a2773ae503 100644 (file)
--- a/contrib/tsearch2/dict_syn.c
+++ b/contrib/tsearch2/dict_syn.c
@@ -162,7 +162,7 @@ syn_lexize(PG_FUNCTION_ARGS)
         char       *in = (char *) PG_GETARG_POINTER(1);
         Syn                     key,
                            *found;
-       char      **res = NULL;
+       TSLexeme          *res = NULL;
  
         if (!PG_GETARG_INT32(2))
                 PG_RETURN_POINTER(NULL);
@@ -176,10 +176,9 @@ syn_lexize(PG_FUNCTION_ARGS)
         if (!found)
                 PG_RETURN_POINTER(NULL);
  
-       res = palloc(sizeof(char *) * 2);
-
-       res[0] = pstrdup(found->out);
-       res[1] = NULL;
+       res = palloc(sizeof(TSLexeme) * 2);
+       memset(res,0,sizeof(TSLexeme) * 2);
+       res[0].lexeme = pstrdup(found->out);
  
         PG_RETURN_POINTER(res);
  }
diff --git a/contrib/tsearch2/gendict/dict_tmpl.c.IN b/contrib/tsearch2/gendict/dict_tmpl.c.IN

index deafdcead8a374c757387d7d88d06dd879c17eef..e534ed30a78ab5560d25cea199d687446ee16f8f 100644 (file)
--- a/contrib/tsearch2/gendict/dict_tmpl.c.IN
+++ b/contrib/tsearch2/gendict/dict_tmpl.c.IN
@@ -52,15 +52,15 @@ dlexize_CFG_MODNAME(PG_FUNCTION_ARGS) {
  HASINIT        DictExample *d = (DictExample*)PG_GETARG_POINTER(0);
         char       *in = (char*)PG_GETARG_POINTER(1);
         char *txt = pnstrdup(in, PG_GETARG_INT32(2));
-       char    **res=palloc(sizeof(char*)*2);
+       TSLexeme        *res=palloc(sizeof(TSLexeme*)*2);
  
-       /* Your INIT dictionary code */
+       /* Your LEXIZE dictionary code */
  HASINIT        if ( *txt=='\0' || searchstoplist(&(d->stoplist),txt) ) {
  HASINIT                pfree(txt);
-HASINIT                res[0]=NULL;
+HASINIT                res[0].lexeme=NULL;
  HASINIT        } else 
-               res[0]=txt;
-       res[1]=NULL;
+               res[0].lexeme=txt;
+       res[1].lexeme=NULL;
  
         PG_RETURN_POINTER(res);
  }
diff --git a/contrib/tsearch2/ispell/spell.c b/contrib/tsearch2/ispell/spell.c

index 54b01e8ed73b6bf9e727ada5b7d27607da0fdfa3..f9053c63011a087e95e22915afc1ba60bac9de0a 100644 (file)
--- a/contrib/tsearch2/ispell/spell.c
+++ b/contrib/tsearch2/ispell/spell.c
@@ -1119,17 +1119,32 @@ SplitToVariants(IspellDict * Conf, SPNode * snode, SplitVar * orig, char *word,
         return var;
  }
  
-char     **
+TSLexeme *
  NINormalizeWord(IspellDict * Conf, char *word)
  {
         char      **res = NormalizeSubWord(Conf, word, 0);
+       TSLexeme *lcur=NULL, *lres=NULL;
+       u_int16_t NVariant=1;
+
+       if (res) {
+               char **ptr = res;
+               lcur = lres = (TSLexeme*)palloc( MAX_NORM * sizeof(TSLexeme) );
+               while(*ptr) {
+                       lcur->lexeme=*ptr;
+                       lcur->flags=0;
+                       lcur->nvariant = NVariant++;
+                       lcur++;
+                       ptr++;
+               }
+               lcur->lexeme=NULL;
+               pfree(res);
+       }
  
         if (Conf->compoundcontrol != '\t')
         {
                 int                     wordlen = strlen(word);
                 SplitVar   *ptr,
                                    *var = SplitToVariants(Conf, NULL, NULL, word, wordlen, 0, -1);
-               char      **cur = res;
                 int                     i;
  
                 while (var)
@@ -1140,30 +1155,31 @@ NINormalizeWord(IspellDict * Conf, char *word)
  
                                 if (subres)
                                 {
-                                       char      **ptr = subres;
+                                       char      **subptr = subres;
+
+                                       if ( !lcur )
+                                               lcur = lres = (TSLexeme*)palloc( MAX_NORM * sizeof(TSLexeme) );
+               
+                                       while(*subptr) {
+                                               for(i=0;i<var->nstem-1;i++) {
+                                                       lcur->lexeme=(subptr==subres) ? var->stem[ i ] : pstrdup(var->stem[ i ]);
+                                                       lcur->flags=0;
+                                                       lcur->nvariant = NVariant;
+                                                       lcur++;
+                                               }
  
-                                       if (cur)
-                                       {
-                                               while (*cur)
-                                                       cur++;
-                                       }
-                                       else
-                                               res = cur = (char **) palloc(MAX_NORM * sizeof(char *));
+                                               lcur->lexeme=*subptr;
+                                               lcur->flags=0;
+                                               lcur->nvariant = NVariant;
+                                               lcur++;
+                                               subptr++;
+                                               NVariant++;
+                                       }       
  
-                                       for (i = 0; i < var->nstem - 1; i++)
-                                       {
-                                               *cur = var->stem[i];
-                                               cur++;
-                                       }
-                                       while (*ptr)
-                                       {
-                                               *cur = *ptr;
-                                               cur++;
-                                               ptr++;
-                                       }
-                                       *cur = NULL;
+                                       lcur->lexeme=NULL;
                                         pfree(subres);
                                         var->stem[0] = NULL;
+                                       pfree( var->stem[ var->nstem-1 ] );     
                                 }
                         }
  
@@ -1175,7 +1191,7 @@ NINormalizeWord(IspellDict * Conf, char *word)
                         var = ptr;
                 }
         }
-       return res;
+       return lres;
  }
  
  
diff --git a/contrib/tsearch2/ispell/spell.h b/contrib/tsearch2/ispell/spell.h

index cc7935fd743f6f2d122ad829a0e97da0dec0d494..a3695113a18391c883aae5aec33cf5b06e528d4f 100644 (file)
--- a/contrib/tsearch2/ispell/spell.h
+++ b/contrib/tsearch2/ispell/spell.h
@@ -3,10 +3,11 @@
  
  #include <sys/types.h>
  #include "regex/regex.h"
-#include "regis.h"
  #include "c.h"
  
-
+#include "regis.h"
+#include "dict.h"
+ 
  struct SPNode;
  
  
@@ -116,7 +117,7 @@ typedef struct
  
  }      IspellDict;
  
-char     **NINormalizeWord(IspellDict * Conf, char *word);
+TSLexeme         *NINormalizeWord(IspellDict * Conf, char *word);
  int                    NIImportAffixes(IspellDict * Conf, const char *filename);
  int                    NIImportDictionary(IspellDict * Conf, const char *filename);
  
diff --git a/contrib/tsearch2/query.c b/contrib/tsearch2/query.c

index 6787b63ae8638263e41eb4ee33c31066dfceeef2..ee4f779d58dd41dd2b76ed9b88f15fcb59c604bd 100644 (file)
--- a/contrib/tsearch2/query.c
+++ b/contrib/tsearch2/query.c
@@ -265,6 +265,7 @@ pushval_morph(QPRS_STATE * state, int typeval, char *strval, int lenval, int2 we
  {
         int4            count = 0;
         PRSTEXT         prs;
+       uint32          variant, pos, cntvar=0, cntpos=0, cnt=0;
  
         prs.lenwords = 32;
         prs.curwords = 0;
@@ -273,17 +274,39 @@ pushval_morph(QPRS_STATE * state, int typeval, char *strval, int lenval, int2 we
  
         parsetext_v2(findcfg(state->cfg_id), &prs, strval, lenval);
  
-       for (count = 0; count < prs.curwords; count++)
-       {
-               pushval_asis(state, VAL, prs.words[count].word, prs.words[count].len, weight);
-               pfree(prs.words[count].word);
-               if (count)
-                       pushquery(state, OPR, (int4) '&', 0, 0, 0);
-       }
-       pfree(prs.words);
+       if ( prs.curwords>0 ) {
+
+               while (count < prs.curwords) {
+                       pos = prs.words[count].pos.pos;
+                       cntvar=0;
+                       while(count < prs.curwords && pos==prs.words[count].pos.pos) {
+                               variant = prs.words[count].nvariant;
+
+                               cnt=0;
+                               while(count < prs.curwords && pos==prs.words[count].pos.pos && variant==prs.words[count].nvariant)      {
+                                       
+                                       pushval_asis(state, VAL, prs.words[count].word, prs.words[count].len, weight);
+                                       pfree(prs.words[count].word);
+                                       if ( cnt ) 
+                                               pushquery(state, OPR, (int4) '&', 0, 0, 0);
+                                       cnt++;
+                                       count++;
+                               }
+
+                               if ( cntvar ) 
+                                       pushquery(state, OPR, (int4) '|', 0, 0, 0);
+                               cntvar++;
+                       }
+
+                       if (cntpos) 
+                               pushquery(state, OPR, (int4) '&', 0, 0, 0);
+               
+                       cntpos++;
+               }
+
+               pfree(prs.words);
  
-       /* XXX */
-       if (prs.curwords == 0)
+       } else
                 pushval_asis(state, VALSTOP, NULL, 0, 0);
  }
  
diff --git a/contrib/tsearch2/ts_cfg.c b/contrib/tsearch2/ts_cfg.c

index afebb11319963885595e3b7a364a9720169ee787..79f25c43d95c54d2e7d7ec84768047619f1ed787 100644 (file)
--- a/contrib/tsearch2/ts_cfg.c
+++ b/contrib/tsearch2/ts_cfg.c
@@ -321,10 +321,10 @@ parsetext_v2(TSCfgInfo * cfg, PRSTEXT * prs, char *buf, int4 buflen)
                 for (i = 0; i < cfg->map[type].len; i++)
                 {
                         DictInfo   *dict = finddict(DatumGetObjectId(cfg->map[type].dict_id[i]));
-                       char      **norms,
-                                         **ptr;
+                       TSLexeme          *norms,
+                                         *ptr;
  
-                       norms = ptr = (char **) DatumGetPointer(
+                       norms = ptr = (TSLexeme *) DatumGetPointer(
                                                                                                         FunctionCall3(
                                                                                                         &(dict->lexize_info),
                                                                            PointerGetDatum(dict->dictionary),
@@ -337,7 +337,7 @@ parsetext_v2(TSCfgInfo * cfg, PRSTEXT * prs, char *buf, int4 buflen)
  
                         prs->pos++;                     /* set pos */
  
-                       while (*ptr)
+                       while (ptr->lexeme)
                         {
                                 if (prs->curwords == prs->lenwords)
                                 {
@@ -345,8 +345,9 @@ parsetext_v2(TSCfgInfo * cfg, PRSTEXT * prs, char *buf, int4 buflen)
                                         prs->words = (TSWORD *) repalloc((void *) prs->words, prs->lenwords * sizeof(TSWORD));
                                 }
  
-                               prs->words[prs->curwords].len = strlen(*ptr);
-                               prs->words[prs->curwords].word = *ptr;
+                               prs->words[prs->curwords].len = strlen(ptr->lexeme);
+                               prs->words[prs->curwords].word = ptr->lexeme;
+                               prs->words[prs->curwords].nvariant = ptr->nvariant;
                                 prs->words[prs->curwords].alen = 0;
                                 prs->words[prs->curwords].pos.pos = LIMITPOS(prs->pos);
                                 ptr++;
@@ -458,10 +459,10 @@ hlparsetext(TSCfgInfo * cfg, HLPRSTEXT * prs, QUERYTYPE * query, char *buf, int4
                 for (i = 0; i < cfg->map[type].len; i++)
                 {
                         DictInfo   *dict = finddict(DatumGetObjectId(cfg->map[type].dict_id[i]));
-                       char      **norms,
-                                         **ptr;
+                       TSLexeme          *norms,
+                                         *ptr;
  
-                       norms = ptr = (char **) DatumGetPointer(
+                       norms = ptr = (TSLexeme *) DatumGetPointer(
                                                                                                         FunctionCall3(
                                                                                                         &(dict->lexize_info),
                                                                            PointerGetDatum(dict->dictionary),
@@ -472,10 +473,10 @@ hlparsetext(TSCfgInfo * cfg, HLPRSTEXT * prs, QUERYTYPE * query, char *buf, int4
                         if (!norms)                     /* dictionary doesn't know this lexem */
                                 continue;
  
-                       while (*ptr)
+                       while (ptr->lexeme)
                         {
-                               hlfinditem(prs, query, *ptr, strlen(*ptr));
-                               pfree(*ptr);
+                               hlfinditem(prs, query, ptr->lexeme, strlen(ptr->lexeme));
+                               pfree(ptr->lexeme);
                                 ptr++;
                         }
                         pfree(norms);
diff --git a/contrib/tsearch2/ts_cfg.h b/contrib/tsearch2/ts_cfg.h

index e000233178d1e556029ba7f8a46e336644765124..7bffdbcdd611c4168ffc3a62d5b93f403f2fc618 100644 (file)
--- a/contrib/tsearch2/ts_cfg.h
+++ b/contrib/tsearch2/ts_cfg.h
@@ -27,6 +27,7 @@ void          reset_cfg(void);
  typedef struct
  {
         uint16          len;
+       uint16          nvariant;
         union
         {
                 uint16          pos;
author	Teodor Sigaev <teodor@sigaev.ru>
	Tue, 25 Jan 2005 15:24:38 +0000 (15:24 +0000)
committer	Teodor Sigaev <teodor@sigaev.ru>
	Tue, 25 Jan 2005 15:24:38 +0000 (15:24 +0000)
contrib/tsearch2/dict.c		patch \| blob \| history
contrib/tsearch2/dict.h		patch \| blob \| history
contrib/tsearch2/dict_ex.c		patch \| blob \| history
contrib/tsearch2/dict_ispell.c		patch \| blob \| history
contrib/tsearch2/dict_snowball.c		patch \| blob \| history
contrib/tsearch2/dict_syn.c		patch \| blob \| history
contrib/tsearch2/gendict/dict_tmpl.c.IN		patch \| blob \| history
contrib/tsearch2/ispell/spell.c		patch \| blob \| history
contrib/tsearch2/ispell/spell.h		patch \| blob \| history
contrib/tsearch2/query.c		patch \| blob \| history
contrib/tsearch2/ts_cfg.c		patch \| blob \| history
contrib/tsearch2/ts_cfg.h		patch \| blob \| history