From de55c0cef63f7a74a8a14de50f1a4f9a064a0990 Mon Sep 17 00:00:00 2001 From: Teodor Sigaev Date: Wed, 23 Jun 2004 11:06:11 +0000 Subject: [PATCH] 1 Fix affixes with void replacement (AFAIK, it's only russian) 2 Optimize regex execution --- contrib/tsearch2/ispell/Makefile | 4 +- contrib/tsearch2/ispell/regis.c | 151 +++++++++++++++++++++++++ contrib/tsearch2/ispell/regis.h | 34 ++++++ contrib/tsearch2/ispell/spell.c | 187 +++++++++++++++++++++---------- contrib/tsearch2/ispell/spell.h | 37 +++--- contrib/tsearch2/tsearch.sql.in | 2 +- 6 files changed, 339 insertions(+), 76 deletions(-) create mode 100644 contrib/tsearch2/ispell/regis.c create mode 100644 contrib/tsearch2/ispell/regis.h diff --git a/contrib/tsearch2/ispell/Makefile b/contrib/tsearch2/ispell/Makefile index c620aaa153..fbbe3f7912 100644 --- a/contrib/tsearch2/ispell/Makefile +++ b/contrib/tsearch2/ispell/Makefile @@ -1,4 +1,4 @@ -# $PostgreSQL: pgsql/contrib/tsearch2/ispell/Makefile,v 1.5 2003/11/29 19:51:36 pgsql Exp $ +# $PostgreSQL: pgsql/contrib/tsearch2/ispell/Makefile,v 1.6 2004/06/23 11:06:11 teodor Exp $ subdir = contrib/tsearch2/ispell top_builddir = ../../.. @@ -8,7 +8,7 @@ include $(top_builddir)/src/Makefile.global PG_CPPFLAGS = -I$(srcdir)/.. $(CPPFLAGS) override CFLAGS += $(CFLAGS_SL) -SUBOBJS = spell.o +SUBOBJS = spell.o regis.o all: SUBSYS.o diff --git a/contrib/tsearch2/ispell/regis.c b/contrib/tsearch2/ispell/regis.c new file mode 100644 index 0000000000..052413788b --- /dev/null +++ b/contrib/tsearch2/ispell/regis.c @@ -0,0 +1,151 @@ +#include +#include +#include +#include + +#include "regis.h" +#include "common.h" + +int +RS_isRegis(const char *str) { + unsigned char *ptr=(unsigned char *)str; + + while(ptr && *ptr) + if ( isalpha(*ptr) || *ptr=='[' || *ptr==']' || *ptr=='^') + ptr++; + else + return 0; + return 1; +} + +#define RS_IN_ONEOF 1 +#define RS_IN_ONEOF_IN 2 +#define RS_IN_NONEOF 3 +#define RS_IN_WAIT 4 + +static RegisNode* +newRegisNode(RegisNode *prev, int len) { + RegisNode *ptr; + ptr = (RegisNode*)malloc(RNHDRSZ+len+1); + if (!ptr) + ts_error(ERROR, "No memory"); + memset(ptr,0,RNHDRSZ+len+1); + if (prev) + prev->next=ptr; + return ptr; +} + +int +RS_compile(Regis *r, int issuffix, const char *str) { + int i,len = strlen(str); + int state = RS_IN_WAIT; + RegisNode *ptr=NULL; + + memset(r,0,sizeof(Regis)); + r->issuffix = (issuffix) ? 1 : 0; + + for(i=0;inode = newRegisNode(NULL,len); + ptr->data[ 0 ] = c; + ptr->type = RSF_ONEOF; + ptr->len=1; + } else if ( c=='[' ) { + if ( ptr ) + ptr = newRegisNode(ptr,len); + else + ptr = r->node = newRegisNode(NULL,len); + ptr->type = RSF_ONEOF; + state=RS_IN_ONEOF; + } else + ts_error(ERROR,"Error in regis: %s at pos %d\n", str, i+1); + } else if ( state == RS_IN_ONEOF ) { + if ( c=='^' ) { + ptr->type = RSF_NONEOF; + state=RS_IN_NONEOF; + } else if ( isalpha(c) ) { + ptr->data[ 0 ] = c; + ptr->len=1; + state=RS_IN_ONEOF_IN; + } else + ts_error(ERROR,"Error in regis: %s at pos %d\n", str, i+1); + } else if ( state == RS_IN_ONEOF_IN || state == RS_IN_NONEOF ) { + if ( isalpha(c) ) { + ptr->data[ ptr->len ] = c; + ptr->len++; + } else if ( c==']' ) { + state=RS_IN_WAIT; + } else + ts_error(ERROR,"Error in regis: %s at pos %d\n", str, i+1); + } else + ts_error(ERROR,"Internal error in RS_compile: %d\n", state); + } + + ptr = r->node; + while(ptr) { + r->nchar++; + ptr=ptr->next; + } + + return 0; +} + +void +RS_free(Regis *r) { + RegisNode *ptr=r->node,*tmp; + + while(ptr) { + tmp=ptr->next; + free(ptr); + ptr = tmp; + } + + r->node = NULL; +} + +int +RS_execute(Regis *r, const char *str, int len) { + RegisNode *ptr=r->node; + unsigned char *c; + + if (len<0) + len=strlen(str); + + if (lennchar) + return 0; + + if ( r->issuffix ) + c = ((unsigned char*)str) + len - r->nchar; + else + c = (unsigned char*)str; + + while(ptr) { + switch(ptr->type) { + case RSF_ONEOF: + if ( ptr->len==0 ) { + if ( *c != *(ptr->data) ) + return 0; + } else if ( strchr((char*)ptr->data, *c) == NULL ) + return 0; + break; + case RSF_NONEOF: + if ( ptr->len==0 ) { + if ( *c == *(ptr->data) ) + return 0; + } else if ( strchr((char*)ptr->data, *c) != NULL ) + return 0; + break; + default: + ts_error(ERROR,"RS_execute: Unknown type node: %d\n", ptr->type); + } + ptr=ptr->next; + c++; + } + + return 1; +} diff --git a/contrib/tsearch2/ispell/regis.h b/contrib/tsearch2/ispell/regis.h new file mode 100644 index 0000000000..64a7e9d996 --- /dev/null +++ b/contrib/tsearch2/ispell/regis.h @@ -0,0 +1,34 @@ +#ifndef __REGIS_H__ +#define __REGIS_H__ + +#include "postgres.h" + +typedef struct RegisNode { + uint32 + type:2, + len:16, + unused:14; + struct RegisNode *next; + unsigned char data[1]; +} RegisNode; + +#define RNHDRSZ (sizeof(uint32)+sizeof(void*)) + +#define RSF_ONEOF 1 +#define RSF_NONEOF 2 + +typedef struct Regis { + RegisNode *node; + uint32 + issuffix:1, + nchar:16, + unused:15; +} Regis; + +int RS_isRegis(const char *str); + +int RS_compile(Regis *r, int issuffix, const char *str); +void RS_free(Regis *r); +/*×ÏÚ×ÒÁÝÁÅÔ 1 ÅÓÌÉ ÍÁÔÞÉÔÓÑ */ +int RS_execute(Regis *r, const char *str, int len); +#endif diff --git a/contrib/tsearch2/ispell/spell.c b/contrib/tsearch2/ispell/spell.c index fa62473895..06d712470f 100644 --- a/contrib/tsearch2/ispell/spell.c +++ b/contrib/tsearch2/ispell/spell.c @@ -190,24 +190,24 @@ FindWord(IspellDict * Conf, const char *word, int affixflag, char compoundonly) { SPNode *node = Conf->Dictionary; SPNodeData *StopLow, *StopHigh, *StopMiddle; - int level=0, wrdlen=strlen(word); + uint8 *ptr =(uint8*)word; - while( node && leveldata; StopHigh = node->data+node->length; while (StopLow < StopHigh) { - StopMiddle = StopLow + (StopHigh - StopLow) / 2; - if ( StopMiddle->val == ((uint8*)(word))[level] ) { - if ( wrdlen==level+1 && StopMiddle->isword ) { + StopMiddle = StopLow + ((StopHigh - StopLow) >> 1); + if ( StopMiddle->val == *ptr ) { + if ( *(ptr+1)=='\0' && StopMiddle->isword ) { if ( compoundonly && !StopMiddle->compoundallow ) return 0; if ( (affixflag == 0) || (strchr(Conf->AffixData[StopMiddle->affix], affixflag) != NULL)) return 1; } node=StopMiddle->node; - level++; + ptr++; break; - } else if ( StopMiddle->val < ((uint8*)(word))[level] ) { + } else if ( StopMiddle->val < *ptr ) { StopLow = StopMiddle + 1; } else { StopHigh = StopMiddle; @@ -236,19 +236,32 @@ NIAddAffix(IspellDict * Conf, int flag, char flagflags, const char *mask, const } MEMOUT(Conf->Affix); } - if (type == 's') - sprintf(Conf->Affix[Conf->naffixes].mask, "%s$", mask); - else - sprintf(Conf->Affix[Conf->naffixes].mask, "^%s", mask); - Conf->Affix[Conf->naffixes].compile = 1; - Conf->Affix[Conf->naffixes].flagflags = flagflags; - Conf->Affix[Conf->naffixes].flag = flag; - Conf->Affix[Conf->naffixes].type = type; - - strcpy(Conf->Affix[Conf->naffixes].find, find); - strcpy(Conf->Affix[Conf->naffixes].repl, repl); - Conf->Affix[Conf->naffixes].replen = strlen(repl); - Conf->naffixes++; + + if ( strcmp(mask,".")==0 ) { + Conf->Affix[Conf->naffixes].issimple=1; + Conf->Affix[Conf->naffixes].isregis=0; + *( Conf->Affix[Conf->naffixes].mask )='\0'; + } else if ( RS_isRegis(mask) ) { + Conf->Affix[Conf->naffixes].issimple=0; + Conf->Affix[Conf->naffixes].isregis=1; + strcpy(Conf->Affix[Conf->naffixes].mask, mask); + } else { + Conf->Affix[Conf->naffixes].issimple=0; + Conf->Affix[Conf->naffixes].isregis=0; + if (type == FF_SUFFIX) + sprintf(Conf->Affix[Conf->naffixes].mask, "%s$", mask); + else + sprintf(Conf->Affix[Conf->naffixes].mask, "^%s", mask); + } + Conf->Affix[Conf->naffixes].compile = 1; + Conf->Affix[Conf->naffixes].flagflags = flagflags; + Conf->Affix[Conf->naffixes].flag = flag; + Conf->Affix[Conf->naffixes].type = type; + + strcpy(Conf->Affix[Conf->naffixes].find, find); + strcpy(Conf->Affix[Conf->naffixes].repl, repl); + Conf->Affix[Conf->naffixes].replen = strlen(repl); + Conf->naffixes++; return (0); } @@ -366,7 +379,7 @@ NIImportAffixes(IspellDict * Conf, const char *filename) continue; } - NIAddAffix(Conf, (int) flag, (char) flagflags, mask, find, repl, suffixes ? 's' : 'p'); + NIAddAffix(Conf, (int) flag, (char) flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX); } fclose(affix); @@ -550,6 +563,46 @@ mkANode(IspellDict *Conf, int low, int high, int level, int type) { return rs; } +static void +mkVoidAffix(IspellDict * Conf, int issuffix, int startsuffix) { + int i,cnt=0; + int start = (issuffix) ? startsuffix : 0; + int end = (issuffix) ? Conf->naffixes : startsuffix; + AffixNode *Affix = (AffixNode*)malloc( ANHRDSZ + sizeof(AffixNodeData)); + + MEMOUT(Affix); + memset(Affix, 0, ANHRDSZ + sizeof(AffixNodeData) ); + Affix->length=1; + Affix->isvoid=1; + + if (issuffix) { + Affix->data->node=Conf->Suffix; + Conf->Suffix = Affix; + } else { + Affix->data->node=Conf->Prefix; + Conf->Prefix = Affix; + } + + + for(i=start;iAffix[i].replen==0) + cnt++; + + if ( cnt==0 ) + return; + + Affix->data->aff = (AFFIX**)malloc( sizeof(AFFIX*) * cnt ); + MEMOUT(Affix->data->aff); + Affix->data->naff = (uint32)cnt; + + cnt=0; + for(i=start;iAffix[i].replen==0) { + Affix->data->aff[cnt] = Conf->Affix + i; + cnt++; + } +} + void NISortAffixes(IspellDict * Conf) { @@ -584,6 +637,8 @@ NISortAffixes(IspellDict * Conf) Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, 'p'); Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, 's'); + mkVoidAffix(Conf, 1, firstsuffix); + mkVoidAffix(Conf, 0, firstsuffix); } static AffixNodeData* @@ -591,17 +646,23 @@ FinfAffixes(AffixNode *node, const char *word, int wrdlen, int *level, int type) AffixNodeData *StopLow, *StopHigh, *StopMiddle; uint8 symbol; + if ( node->isvoid ) { /* search void affixes */ + if (node->data->naff) + return node->data; + node = node->data->node; + } + while( node && *leveldata; StopHigh = node->data+node->length; while (StopLow < StopHigh) { - StopMiddle = StopLow + (StopHigh - StopLow) / 2; + StopMiddle = StopLow + ((StopHigh - StopLow) >> 1); symbol = GETWCHAR(word,wrdlen,*level,type); if ( StopMiddle->val == symbol ) { + (*level)++; if ( StopMiddle->naff ) return StopMiddle; node=StopMiddle->node; - (*level)++; break; } else if ( StopMiddle->val < symbol ) { StopLow = StopMiddle + 1; @@ -617,11 +678,6 @@ FinfAffixes(AffixNode *node, const char *word, int wrdlen, int *level, int type) static char * CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *newword) { - regmatch_t subs[2]; /* workaround for apache&linux */ - int err; - pg_wchar *data; - size_t data_len; - int dat_len; if ( flagflags & FF_COMPOUNDONLYAFX ) { if ( (Affix->flagflags & FF_COMPOUNDONLYAFX) == 0 ) @@ -631,7 +687,7 @@ CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *ne return NULL; } - if ( Affix->type=='s' ) { + if ( Affix->type==FF_SUFFIX ) { strcpy(newword, word); strcpy(newword + len - Affix->replen, Affix->find); } else { @@ -639,34 +695,50 @@ CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *ne strcat(newword, word + Affix->replen); } - if (Affix->compile) - { - int wmasklen,masklen = strlen(Affix->mask); - pg_wchar *mask; - mask = (pg_wchar *) palloc((masklen + 1) * sizeof(pg_wchar)); - wmasklen = pg_mb2wchar_with_len( Affix->mask, mask, masklen); - - err = pg_regcomp(&(Affix->reg), mask, wmasklen, REG_EXTENDED | REG_ICASE | REG_NOSUB); - pfree(mask); - if (err) + if ( Affix->issimple ) { + return newword; + } else if ( Affix->isregis ) { + if (Affix->compile) { + RS_compile(&(Affix->reg.regis), (Affix->type==FF_SUFFIX) ? 1 : 0, Affix->mask); + Affix->compile = 0; + } + if ( RS_execute(&(Affix->reg.regis), newword, -1) ) + return newword; + } else { + regmatch_t subs[2]; /* workaround for apache&linux */ + int err; + pg_wchar *data; + size_t data_len; + int dat_len; + if (Affix->compile) { - /* regerror(err, &(Affix->reg), regerrstr, ERRSTRSIZE); */ - pg_regfree(&(Affix->reg)); - return (NULL); + int wmasklen,masklen = strlen(Affix->mask); + pg_wchar *mask; + mask = (pg_wchar *) palloc((masklen + 1) * sizeof(pg_wchar)); + wmasklen = pg_mb2wchar_with_len( Affix->mask, mask, masklen); + + err = pg_regcomp(&(Affix->reg.regex), mask, wmasklen, REG_EXTENDED | REG_ICASE | REG_NOSUB); + pfree(mask); + if (err) + { + /* regerror(err, &(Affix->reg.regex), regerrstr, ERRSTRSIZE); */ + pg_regfree(&(Affix->reg.regex)); + return (NULL); + } + Affix->compile = 0; } - Affix->compile = 0; - } - /* Convert data string to wide characters */ - dat_len = strlen(newword); - data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar)); - data_len = pg_mb2wchar_with_len(newword, data, dat_len); + /* Convert data string to wide characters */ + dat_len = strlen(newword); + data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar)); + data_len = pg_mb2wchar_with_len(newword, data, dat_len); - if (!(err = pg_regexec(&(Affix->reg), data,dat_len,NULL, 1, subs, 0))) { - pfree(data); - return newword; + if (!(err = pg_regexec(&(Affix->reg.regex), data,dat_len,NULL, 1, subs, 0))) { + pfree(data); + return newword; + } + pfree(data); } - pfree(data); return NULL; } @@ -715,7 +787,6 @@ NormalizeSubWord(IspellDict * Conf, char *word, char flag) { } } pnode = prefix->node; - plevel++; } /* Find all other NORMAL forms of the 'word' (check suffix and then prefix)*/ @@ -754,13 +825,11 @@ NormalizeSubWord(IspellDict * Conf, char *word, char flag) { } } pnode = prefix->node; - plevel++; } } } snode=suffix->node; - slevel++; } if (cur == forms) { @@ -1013,8 +1082,12 @@ NIFree(IspellDict * Conf) for (i = 0; i < Conf->naffixes; i++) { - if (Affix[i].compile == 0) - pg_regfree(&(Affix[i].reg)); + if (Affix[i].compile == 0) { + if ( Affix[i].isregis ) + RS_free(&(Affix[i].reg.regis)); + else + pg_regfree(&(Affix[i].reg.regex)); + } } if (Conf->Spell) { for (i = 0; i < Conf->nspell; i++) diff --git a/contrib/tsearch2/ispell/spell.h b/contrib/tsearch2/ispell/spell.h index 150e4166e1..1e22a0d1bc 100644 --- a/contrib/tsearch2/ispell/spell.h +++ b/contrib/tsearch2/ispell/spell.h @@ -3,6 +3,7 @@ #include #include "regex/regex.h" +#include "regis.h" #include "c.h" @@ -40,20 +41,29 @@ typedef struct spell_struct typedef struct aff_struct { - char flag; - char flagflags; - char type; - char mask[33]; - char find[16]; - char repl[16]; - regex_t reg; - size_t replen; - char compile; + uint32 + flag:8, + type:2, + compile:1, + flagflags:3, + issimple:1, + isregis:1, + unused:1, + replen:16; + char mask[32]; + char find[16]; + char repl[16]; + union { + regex_t regex; + Regis regis; + } reg; } AFFIX; #define FF_CROSSPRODUCT 0x01 #define FF_COMPOUNDWORD 0x02 #define FF_COMPOUNDONLYAFX 0x04 +#define FF_SUFFIX 2 +#define FF_PREFIX 1 struct AffixNode; @@ -66,18 +76,13 @@ typedef struct { } AffixNodeData; typedef struct AffixNode { - uint32 length; + uint32 isvoid:1, + length:31; AffixNodeData data[1]; } AffixNode; #define ANHRDSZ (sizeof(uint32)) -typedef struct Tree_struct -{ - int Left[256], - Right[256]; -} Tree_struct; - typedef struct { char *affix; int len; diff --git a/contrib/tsearch2/tsearch.sql.in b/contrib/tsearch2/tsearch.sql.in index 25cb05f59c..f0b2397d0d 100644 --- a/contrib/tsearch2/tsearch.sql.in +++ b/contrib/tsearch2/tsearch.sql.in @@ -816,7 +816,7 @@ CREATE OPERATOR CLASS tsvector_ops FUNCTION 1 tsvector_cmp(tsvector, tsvector); --example of ISpell dictionary ---update pg_ts_dict set dict_initoption='DictFile="/usr/local/share/ispell/russian.dict" ,AffFile ="/usr/local/share/ispell/russian.aff", StopFile="/usr/local/share/ispell/russian.stop"' where dict_id=4; +--update pg_ts_dict set dict_initoption='DictFile="/usr/local/share/ispell/russian.dict" ,AffFile ="/usr/local/share/ispell/russian.aff", StopFile="/usr/local/share/ispell/russian.stop"' where dict_name='ispell_template'; --example of synonym dict --update pg_ts_dict set dict_initoption='/usr/local/share/ispell/english.syn' where dict_id=5; END; -- 2.40.0