]> granicus.if.org Git - postgresql/commitdiff
1 Fix affixes with void replacement (AFAIK, it's only russian)
authorTeodor Sigaev <teodor@sigaev.ru>
Wed, 23 Jun 2004 11:06:11 +0000 (11:06 +0000)
committerTeodor Sigaev <teodor@sigaev.ru>
Wed, 23 Jun 2004 11:06:11 +0000 (11:06 +0000)
2 Optimize regex execution

contrib/tsearch2/ispell/Makefile
contrib/tsearch2/ispell/regis.c [new file with mode: 0644]
contrib/tsearch2/ispell/regis.h [new file with mode: 0644]
contrib/tsearch2/ispell/spell.c
contrib/tsearch2/ispell/spell.h
contrib/tsearch2/tsearch.sql.in

index c620aaa15390f1b516b26393ccfaf00208119162..fbbe3f7912f7d792206029c4a1124f3be9b231e7 100644 (file)
@@ -1,4 +1,4 @@
-# $PostgreSQL: pgsql/contrib/tsearch2/ispell/Makefile,v 1.5 2003/11/29 19:51:36 pgsql Exp $
+# $PostgreSQL: pgsql/contrib/tsearch2/ispell/Makefile,v 1.6 2004/06/23 11:06:11 teodor Exp $
 
 subdir = contrib/tsearch2/ispell
 top_builddir = ../../..
@@ -8,7 +8,7 @@ include $(top_builddir)/src/Makefile.global
 PG_CPPFLAGS = -I$(srcdir)/.. $(CPPFLAGS)
 override CFLAGS += $(CFLAGS_SL)
 
-SUBOBJS = spell.o 
+SUBOBJS = spell.o regis.o 
 
 all: SUBSYS.o
 
diff --git a/contrib/tsearch2/ispell/regis.c b/contrib/tsearch2/ispell/regis.c
new file mode 100644 (file)
index 0000000..0524137
--- /dev/null
@@ -0,0 +1,151 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+         
+#include "regis.h"
+#include "common.h"
+
+int
+RS_isRegis(const char *str) {
+       unsigned char *ptr=(unsigned char *)str;
+
+       while(ptr && *ptr) 
+               if ( isalpha(*ptr) || *ptr=='[' || *ptr==']' || *ptr=='^')
+                       ptr++;
+               else
+                       return 0;
+       return 1; 
+}
+
+#define RS_IN_ONEOF    1
+#define RS_IN_ONEOF_IN 2
+#define RS_IN_NONEOF   3
+#define RS_IN_WAIT     4
+
+static RegisNode*
+newRegisNode(RegisNode *prev, int len) {
+       RegisNode       *ptr;
+       ptr = (RegisNode*)malloc(RNHDRSZ+len+1);
+       if (!ptr)
+               ts_error(ERROR, "No memory"); 
+       memset(ptr,0,RNHDRSZ+len+1);
+       if (prev)
+               prev->next=ptr;
+       return ptr;
+}
+
+int
+RS_compile(Regis *r, int issuffix, const char *str) {
+       int i,len = strlen(str);
+       int state = RS_IN_WAIT;
+       RegisNode       *ptr=NULL;
+
+       memset(r,0,sizeof(Regis));
+       r->issuffix = (issuffix) ? 1 : 0;
+
+       for(i=0;i<len;i++) {
+               unsigned char c = *( ( (unsigned char*)str ) + i );
+               if ( state == RS_IN_WAIT ) {
+                       if ( isalpha(c) ) {
+                               if ( ptr ) 
+                                       ptr = newRegisNode(ptr,len);
+                               else
+                                       ptr = r->node = newRegisNode(NULL,len);
+                               ptr->data[ 0 ] = c;
+                               ptr->type = RSF_ONEOF;
+                               ptr->len=1;
+                       } else if ( c=='[' )  {
+                               if ( ptr ) 
+                                       ptr = newRegisNode(ptr,len);
+                               else
+                                       ptr = r->node = newRegisNode(NULL,len);
+                               ptr->type = RSF_ONEOF;
+                               state=RS_IN_ONEOF;
+                       } else 
+                               ts_error(ERROR,"Error in regis: %s at pos %d\n", str, i+1);
+               } else if ( state == RS_IN_ONEOF ) {
+                       if ( c=='^' ) {
+                               ptr->type = RSF_NONEOF;
+                               state=RS_IN_NONEOF;
+                       } else if ( isalpha(c) ) {
+                               ptr->data[ 0 ] = c;
+                               ptr->len=1;
+                               state=RS_IN_ONEOF_IN;
+                       } else
+                               ts_error(ERROR,"Error in regis: %s at pos %d\n", str,  i+1);
+               } else if ( state == RS_IN_ONEOF_IN || state == RS_IN_NONEOF ) {
+                       if ( isalpha(c) ) {
+                               ptr->data[ ptr->len ] = c;
+                               ptr->len++;
+                       } else if ( c==']' ) {
+                               state=RS_IN_WAIT;
+                       } else
+                               ts_error(ERROR,"Error in regis: %s at pos %d\n", str,  i+1);
+               } else
+                       ts_error(ERROR,"Internal error in RS_compile: %d\n", state);
+       }
+
+       ptr = r->node;
+       while(ptr) {
+               r->nchar++;
+               ptr=ptr->next;
+       }
+
+       return 0;
+}
+
+void 
+RS_free(Regis *r) {
+       RegisNode *ptr=r->node,*tmp;
+
+       while(ptr) {
+               tmp=ptr->next;
+               free(ptr);
+               ptr = tmp;
+       }
+
+       r->node = NULL;
+}
+
+int 
+RS_execute(Regis *r, const char *str, int len) {
+       RegisNode *ptr=r->node;
+       unsigned char *c;
+
+       if (len<0)
+               len=strlen(str);
+
+       if (len<r->nchar)
+               return 0;
+
+       if ( r->issuffix ) 
+               c = ((unsigned char*)str) + len - r->nchar;
+       else
+               c = (unsigned char*)str; 
+
+       while(ptr) {
+               switch(ptr->type) {
+                       case RSF_ONEOF:
+                               if ( ptr->len==0 ) {
+                                       if ( *c != *(ptr->data) )
+                                               return 0;
+                               } else if ( strchr((char*)ptr->data, *c) == NULL )
+                                       return 0;
+                               break;
+                       case RSF_NONEOF:
+                               if ( ptr->len==0 ) {
+                                       if ( *c == *(ptr->data) )
+                                               return 0;
+                               } else if ( strchr((char*)ptr->data, *c) != NULL )
+                                       return 0;
+                               break;
+                       default:
+                               ts_error(ERROR,"RS_execute: Unknown type node: %d\n", ptr->type);
+               }
+               ptr=ptr->next;
+               c++;
+       }
+
+       return 1;
+}
diff --git a/contrib/tsearch2/ispell/regis.h b/contrib/tsearch2/ispell/regis.h
new file mode 100644 (file)
index 0000000..64a7e9d
--- /dev/null
@@ -0,0 +1,34 @@
+#ifndef __REGIS_H__
+#define __REGIS_H__
+
+#include "postgres.h" 
+
+typedef struct RegisNode {
+       uint32  
+               type:2,
+               len:16,
+               unused:14;
+       struct RegisNode *next;
+       unsigned char   data[1];
+} RegisNode;
+
+#define  RNHDRSZ       (sizeof(uint32)+sizeof(void*))
+
+#define        RSF_ONEOF       1
+#define        RSF_NONEOF      2
+
+typedef struct Regis {
+       RegisNode *node;
+       uint32  
+               issuffix:1,
+               nchar:16,
+               unused:15;
+} Regis;
+
+int RS_isRegis(const char *str);
+
+int RS_compile(Regis *r, int issuffix, const char *str);
+void RS_free(Regis *r);
+/*×ÏÚ×ÒÁÝÁÅÔ 1 ÅÓÌÉ ÍÁÔÞÉÔÓÑ */
+int RS_execute(Regis *r, const char *str, int len); 
+#endif
index fa62473895971e724e3c325d3a7a2af8de4a603b..06d712470f1352433b500815879403cc2472cac8 100644 (file)
@@ -190,24 +190,24 @@ FindWord(IspellDict * Conf, const char *word, int affixflag, char compoundonly)
 {
        SPNode *node = Conf->Dictionary;
        SPNodeData *StopLow, *StopHigh, *StopMiddle;
-       int level=0, wrdlen=strlen(word);
+       uint8 *ptr =(uint8*)word;
 
-       while( node && level<wrdlen) {
+       while( node && *ptr) {
                StopLow = node->data;
                StopHigh = node->data+node->length;
                while (StopLow < StopHigh) {
-                       StopMiddle = StopLow + (StopHigh - StopLow) / 2;
-                       if ( StopMiddle->val == ((uint8*)(word))[level] ) {
-                               if ( wrdlen==level+1 && StopMiddle->isword ) {
+                       StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
+                       if ( StopMiddle->val == *ptr ) {
+                               if ( *(ptr+1)=='\0' && StopMiddle->isword ) {
                                        if ( compoundonly && !StopMiddle->compoundallow )
                                                return 0;
                                        if ( (affixflag == 0) || (strchr(Conf->AffixData[StopMiddle->affix], affixflag) != NULL))
                                                return 1;
                                }
                                node=StopMiddle->node;
-                               level++;
+                               ptr++;
                                break;
-                       } else if ( StopMiddle->val < ((uint8*)(word))[level] ) {
+                       } else if ( StopMiddle->val < *ptr ) {
                                StopLow = StopMiddle + 1;
                        } else {
                                StopHigh = StopMiddle;
@@ -236,19 +236,32 @@ NIAddAffix(IspellDict * Conf, int flag, char flagflags, const char *mask, const
                }
                MEMOUT(Conf->Affix);
        }
-       if (type == 's')
-               sprintf(Conf->Affix[Conf->naffixes].mask, "%s$", mask);
-       else
-               sprintf(Conf->Affix[Conf->naffixes].mask, "^%s", mask);
-       Conf->Affix[Conf->naffixes].compile = 1;
-       Conf->Affix[Conf->naffixes].flagflags = flagflags;
-       Conf->Affix[Conf->naffixes].flag = flag;
-       Conf->Affix[Conf->naffixes].type = type;
-
-       strcpy(Conf->Affix[Conf->naffixes].find, find);
-       strcpy(Conf->Affix[Conf->naffixes].repl, repl);
-       Conf->Affix[Conf->naffixes].replen = strlen(repl);
-       Conf->naffixes++;
+
+        if ( strcmp(mask,".")==0 ) {
+                Conf->Affix[Conf->naffixes].issimple=1;
+                Conf->Affix[Conf->naffixes].isregis=0;
+                *( Conf->Affix[Conf->naffixes].mask )='\0';
+        } else if ( RS_isRegis(mask) ) {
+                Conf->Affix[Conf->naffixes].issimple=0;
+                Conf->Affix[Conf->naffixes].isregis=1;
+                strcpy(Conf->Affix[Conf->naffixes].mask, mask);
+        } else {
+                Conf->Affix[Conf->naffixes].issimple=0;
+                Conf->Affix[Conf->naffixes].isregis=0;
+                if (type == FF_SUFFIX)
+                        sprintf(Conf->Affix[Conf->naffixes].mask, "%s$", mask);
+                else
+                        sprintf(Conf->Affix[Conf->naffixes].mask, "^%s", mask);
+        }
+        Conf->Affix[Conf->naffixes].compile = 1;
+        Conf->Affix[Conf->naffixes].flagflags = flagflags;
+        Conf->Affix[Conf->naffixes].flag = flag;
+        Conf->Affix[Conf->naffixes].type = type;
+
+        strcpy(Conf->Affix[Conf->naffixes].find, find);
+        strcpy(Conf->Affix[Conf->naffixes].repl, repl);
+        Conf->Affix[Conf->naffixes].replen = strlen(repl);
+        Conf->naffixes++;
        return (0);
 }
 
@@ -366,7 +379,7 @@ NIImportAffixes(IspellDict * Conf, const char *filename)
                                continue;
                }
 
-               NIAddAffix(Conf, (int) flag, (char) flagflags, mask, find, repl, suffixes ? 's' : 'p');
+               NIAddAffix(Conf, (int) flag, (char) flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX);
 
        }
        fclose(affix);
@@ -550,6 +563,46 @@ mkANode(IspellDict *Conf, int low, int high, int level, int type) {
        return rs;
 }
 
+static void
+mkVoidAffix(IspellDict * Conf, int issuffix, int startsuffix) {
+        int i,cnt=0;
+        int start = (issuffix) ? startsuffix : 0;
+        int end = (issuffix) ? Conf->naffixes : startsuffix;
+        AffixNode       *Affix = (AffixNode*)malloc( ANHRDSZ + sizeof(AffixNodeData));
+
+       MEMOUT(Affix);
+        memset(Affix, 0, ANHRDSZ + sizeof(AffixNodeData) );
+        Affix->length=1;
+        Affix->isvoid=1;
+
+        if (issuffix) {
+                Affix->data->node=Conf->Suffix;
+                Conf->Suffix = Affix;
+        } else {
+                Affix->data->node=Conf->Prefix;
+                Conf->Prefix = Affix;
+        }
+
+
+        for(i=start;i<end;i++)
+                if (Conf->Affix[i].replen==0)
+                        cnt++;
+
+        if ( cnt==0 )
+                return;
+
+        Affix->data->aff = (AFFIX**)malloc( sizeof(AFFIX*) * cnt );
+       MEMOUT(Affix->data->aff);
+        Affix->data->naff = (uint32)cnt;
+
+        cnt=0; 
+        for(i=start;i<end;i++)
+                if (Conf->Affix[i].replen==0) {
+                        Affix->data->aff[cnt] = Conf->Affix + i;
+                        cnt++;
+                }
+}
+
 void
 NISortAffixes(IspellDict * Conf)
 {
@@ -584,6 +637,8 @@ NISortAffixes(IspellDict * Conf)
 
        Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, 'p'); 
        Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, 's');
+        mkVoidAffix(Conf, 1, firstsuffix);
+        mkVoidAffix(Conf, 0, firstsuffix);
 }
 
 static AffixNodeData*
@@ -591,17 +646,23 @@ FinfAffixes(AffixNode *node, const char *word, int wrdlen, int *level, int type)
        AffixNodeData *StopLow, *StopHigh, *StopMiddle;
        uint8 symbol;
 
+        if ( node->isvoid ) { /* search void affixes */
+                if (node->data->naff)
+                        return node->data;
+                node = node->data->node;
+        }
+
        while( node && *level<wrdlen) {
                StopLow = node->data;
                StopHigh = node->data+node->length;
                while (StopLow < StopHigh) {
-                       StopMiddle = StopLow + (StopHigh - StopLow) / 2;
+                       StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
                        symbol = GETWCHAR(word,wrdlen,*level,type);
                        if ( StopMiddle->val == symbol ) {
+                               (*level)++;
                                if ( StopMiddle->naff ) 
                                        return StopMiddle;
                                node=StopMiddle->node;
-                               (*level)++;
                                break;
                        } else if ( StopMiddle->val < symbol ) {
                                StopLow = StopMiddle + 1;
@@ -617,11 +678,6 @@ FinfAffixes(AffixNode *node, const char *word, int wrdlen, int *level, int type)
 
 static char *
 CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *newword) {
-       regmatch_t      subs[2];                /* workaround for apache&linux */
-       int                     err;
-       pg_wchar   *data;
-       size_t          data_len;
-       int                     dat_len;
 
        if ( flagflags & FF_COMPOUNDONLYAFX ) {
                if ( (Affix->flagflags & FF_COMPOUNDONLYAFX) == 0 )
@@ -631,7 +687,7 @@ CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *ne
                        return NULL;
        } 
 
-       if ( Affix->type=='s' ) {
+       if ( Affix->type==FF_SUFFIX ) {
                strcpy(newword, word);
                strcpy(newword + len - Affix->replen, Affix->find);
        } else {
@@ -639,34 +695,50 @@ CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *ne
                strcat(newword, word + Affix->replen);
        }
 
-       if (Affix->compile)
-       {
-               int wmasklen,masklen = strlen(Affix->mask);
-               pg_wchar *mask;
-               mask = (pg_wchar *) palloc((masklen + 1) * sizeof(pg_wchar));
-               wmasklen = pg_mb2wchar_with_len( Affix->mask, mask, masklen);
-               
-               err = pg_regcomp(&(Affix->reg), mask, wmasklen, REG_EXTENDED | REG_ICASE | REG_NOSUB);
-               pfree(mask);
-               if (err)
+        if ( Affix->issimple ) {
+                return newword;
+        } else if ( Affix->isregis ) {
+                if (Affix->compile) {
+                        RS_compile(&(Affix->reg.regis), (Affix->type==FF_SUFFIX) ? 1 : 0, Affix->mask);
+                        Affix->compile = 0;
+                }
+                if ( RS_execute(&(Affix->reg.regis), newword, -1) )
+                        return newword;
+       } else {
+               regmatch_t      subs[2];                /* workaround for apache&linux */
+               int                     err;
+               pg_wchar   *data;
+               size_t          data_len;
+               int     dat_len;
+               if (Affix->compile)
                {
-                       /* regerror(err, &(Affix->reg), regerrstr, ERRSTRSIZE); */
-                       pg_regfree(&(Affix->reg));
-                       return (NULL);
+                       int wmasklen,masklen = strlen(Affix->mask);
+                       pg_wchar *mask;
+                       mask = (pg_wchar *) palloc((masklen + 1) * sizeof(pg_wchar));
+                       wmasklen = pg_mb2wchar_with_len( Affix->mask, mask, masklen);
+               
+                       err = pg_regcomp(&(Affix->reg.regex), mask, wmasklen, REG_EXTENDED | REG_ICASE | REG_NOSUB);
+                       pfree(mask);
+                       if (err)
+                       {
+                               /* regerror(err, &(Affix->reg.regex), regerrstr, ERRSTRSIZE); */
+                               pg_regfree(&(Affix->reg.regex));
+                               return (NULL);
+                       }
+                       Affix->compile = 0;
                }
-               Affix->compile = 0;
-       }
 
-       /* Convert data string to wide characters */
-       dat_len = strlen(newword);
-       data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
-       data_len = pg_mb2wchar_with_len(newword, data, dat_len);
+               /* Convert data string to wide characters */
+               dat_len = strlen(newword);
+               data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
+               data_len = pg_mb2wchar_with_len(newword, data, dat_len);
 
-       if (!(err = pg_regexec(&(Affix->reg), data,dat_len,NULL, 1, subs, 0))) {
-                       pfree(data); 
-                       return newword;
+               if (!(err = pg_regexec(&(Affix->reg.regex), data,dat_len,NULL, 1, subs, 0))) {
+                               pfree(data); 
+                               return newword;
+               }
+               pfree(data);
        }
-       pfree(data);
 
        return NULL;
 }
@@ -715,7 +787,6 @@ NormalizeSubWord(IspellDict * Conf, char *word, char flag) {
                        }
                }
                pnode = prefix->node;
-               plevel++;
        }
  
        /* Find all other NORMAL forms of the 'word' (check suffix and then prefix)*/
@@ -754,13 +825,11 @@ NormalizeSubWord(IspellDict * Conf, char *word, char flag) {
                                                }
                                        }
                                        pnode = prefix->node;
-                                       plevel++;
                                } 
                        }
                }
 
                snode=suffix->node;
-               slevel++;
        }
 
        if (cur == forms) {
@@ -1013,8 +1082,12 @@ NIFree(IspellDict * Conf)
        
        for (i = 0; i < Conf->naffixes; i++)
        {
-               if (Affix[i].compile == 0)
-                       pg_regfree(&(Affix[i].reg));
+               if (Affix[i].compile == 0) {
+                        if ( Affix[i].isregis )
+                                RS_free(&(Affix[i].reg.regis));
+                        else
+                               pg_regfree(&(Affix[i].reg.regex));
+               }
        }
        if (Conf->Spell) {
                for (i = 0; i < Conf->nspell; i++)
index 150e4166e13d3b38f171e3357e16fbb092824b18..1e22a0d1bcd9d06eea7fbe995dd7b632d12262eb 100644 (file)
@@ -3,6 +3,7 @@
 
 #include <sys/types.h>
 #include "regex/regex.h"
+#include "regis.h"
 #include "c.h"
 
 
@@ -40,20 +41,29 @@ typedef struct spell_struct
 
 typedef struct aff_struct
 {
-       char            flag;
-       char            flagflags;
-       char            type;
-       char            mask[33];
-       char            find[16];
-       char            repl[16];
-       regex_t         reg;
-       size_t          replen;
-       char            compile;
+        uint32
+                flag:8,
+                type:2,
+                compile:1,
+                flagflags:3,
+                issimple:1,
+                isregis:1,
+                unused:1,
+                replen:16;
+        char            mask[32];
+        char            find[16];
+        char            repl[16];
+        union {
+                regex_t         regex;
+                Regis           regis;
+        } reg;
 }      AFFIX;
 
 #define FF_CROSSPRODUCT        0x01
 #define FF_COMPOUNDWORD        0x02
 #define FF_COMPOUNDONLYAFX      0x04
+#define FF_SUFFIX               2
+#define FF_PREFIX               1
 
 struct AffixNode;
 
@@ -66,18 +76,13 @@ typedef struct {
 } AffixNodeData;
 
 typedef struct AffixNode {
-       uint32 length;
+        uint32  isvoid:1,
+                length:31;
        AffixNodeData   data[1];
 } AffixNode;
 
 #define ANHRDSZ        (sizeof(uint32))
 
-typedef struct Tree_struct
-{
-       int                     Left[256],
-                               Right[256];
-}      Tree_struct;
-
 typedef struct {
        char *affix;
        int len;
index 25cb05f59c0ee82be4eae47b497153c4b9c8ee13..f0b2397d0de8adf70f3ba4d1bdd84454ba3d8515 100644 (file)
@@ -816,7 +816,7 @@ CREATE OPERATOR CLASS tsvector_ops
         FUNCTION        1       tsvector_cmp(tsvector, tsvector);
 
 --example of ISpell dictionary
---update pg_ts_dict set dict_initoption='DictFile="/usr/local/share/ispell/russian.dict" ,AffFile ="/usr/local/share/ispell/russian.aff", StopFile="/usr/local/share/ispell/russian.stop"' where dict_id=4;
+--update pg_ts_dict set dict_initoption='DictFile="/usr/local/share/ispell/russian.dict" ,AffFile ="/usr/local/share/ispell/russian.aff", StopFile="/usr/local/share/ispell/russian.stop"' where dict_name='ispell_template';
 --example of synonym dict
 --update pg_ts_dict set dict_initoption='/usr/local/share/ispell/english.syn' where dict_id=5;
 END;