--- /dev/null
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+
+#include "regis.h"
+#include "common.h"
+
+/*
+ * RS_isRegis - report whether str can be handled by the regis engine.
+ *
+ * The accepted syntax is exactly what RS_compile() parses cleanly: a
+ * sequence of items, each being
+ *   - a single alphabetic character,
+ *   - a class "[abc]" with at least one alphabetic member, or
+ *   - a negated class "[^abc]".
+ *
+ * Checking only the character set is not sufficient: strings such as "a]",
+ * "[[", "a^", "[]" or an unterminated "[ab" consist solely of permitted
+ * characters, yet RS_compile() either raises an error on them or is left
+ * inside an open bracket.  Such strings are therefore reported as "not
+ * regis" so the caller can fall back to another matcher.
+ *
+ * Returns 1 if str is a well-formed regis pattern (a NULL or empty string
+ * counts as well-formed), 0 otherwise.
+ */
+int
+RS_isRegis(const char *str) {
+	/* Local parser states; they mirror the transitions made by RS_compile(). */
+	enum { ST_WAIT, ST_CLASS_OPEN, ST_CLASS_BODY } state = ST_WAIT;
+	const unsigned char *ptr = (const unsigned char *) str;
+
+	if (ptr == NULL)
+		return 1;
+
+	for (; *ptr; ptr++) {
+		switch (state) {
+			case ST_WAIT:
+				/* outside brackets: a literal letter or the start of a class */
+				if (*ptr == '[')
+					state = ST_CLASS_OPEN;
+				else if (!isalpha(*ptr))
+					return 0;
+				break;
+			case ST_CLASS_OPEN:
+				/* right after '[': negation mark or the first member */
+				if (*ptr == '^' || isalpha(*ptr))
+					state = ST_CLASS_BODY;
+				else
+					return 0;
+				break;
+			case ST_CLASS_BODY:
+				/* inside a class: further members or the closing ']' */
+				if (*ptr == ']')
+					state = ST_WAIT;
+				else if (!isalpha(*ptr))
+					return 0;
+				break;
+		}
+	}
+
+	/* ending anywhere but ST_WAIT means a bracket was left open */
+	return state == ST_WAIT;
+}
+
+/*
+ * Parser states used by RS_compile():
+ *   RS_IN_ONEOF    - just consumed '[' and expecting '^' or the first member
+ *   RS_IN_ONEOF_IN - inside "[...]" after at least one member was stored
+ *   RS_IN_NONEOF   - inside "[^...]" after the '^'
+ *   RS_IN_WAIT     - outside brackets (initial state, and after ']')
+ */
+#define RS_IN_ONEOF 1
+#define RS_IN_ONEOF_IN 2
+#define RS_IN_NONEOF 3
+#define RS_IN_WAIT 4
+
+/*
+ * newRegisNode - allocate a zero-filled block of RNHDRSZ + len + 1 bytes
+ * for a RegisNode and, when prev is not NULL, make it prev's successor.
+ *
+ * An allocation failure is reported through ts_error(ERROR, ...).
+ * Returns the new node.
+ */
+static RegisNode*
+newRegisNode(RegisNode *prev, int len) {
+	size_t		nbytes = RNHDRSZ + len + 1;
+	RegisNode  *node = (RegisNode *) malloc(nbytes);
+
+	if (node == NULL)
+		ts_error(ERROR, "No memory");
+	memset(node, 0, nbytes);
+
+	if (prev != NULL)
+		prev->next = node;
+
+	return node;
+}
+
+/*
+ * RS_compile - parse the pattern str into a linked list of nodes in *r.
+ *
+ * r        - overwritten completely (zeroed first); on return r->node is
+ *            the head of the node list and r->nchar the number of nodes.
+ * issuffix - any nonzero value is stored as r->issuffix = 1.
+ * str      - pattern made of single letters, "[abc]" and "[^abc]" items.
+ *
+ * One node is created per item: a single letter or a "[...]" class becomes
+ * an RSF_ONEOF node, a "[^...]" class an RSF_NONEOF node.  Every node is
+ * allocated via newRegisNode() with the full pattern length as its size
+ * argument, so the members of a class can never outnumber that.
+ *
+ * Malformed input is reported through ts_error(ERROR, ...) together with
+ * the 1-based position of the offending character.  A pattern that ends
+ * inside a bracket expression is rejected the same way, with the position
+ * just past the end of the string (where the ']' was expected); otherwise
+ * a half-built class would be left in the node list.
+ *
+ * Always returns 0.
+ */
+int
+RS_compile(Regis *r, int issuffix, const char *str) {
+	int			i,
+				len = strlen(str);
+	int			state = RS_IN_WAIT;
+	RegisNode  *ptr = NULL;
+
+	memset(r, 0, sizeof(Regis));
+	r->issuffix = (issuffix) ? 1 : 0;
+
+	for (i = 0; i < len; i++) {
+		unsigned char c = ((const unsigned char *) str)[i];
+
+		if (state == RS_IN_WAIT) {
+			if (isalpha(c) || c == '[') {
+				/* both forms open a new node, chained after the previous one */
+				if (ptr)
+					ptr = newRegisNode(ptr, len);
+				else
+					ptr = r->node = newRegisNode(NULL, len);
+				ptr->type = RSF_ONEOF;
+				r->nchar++;
+
+				if (c == '[')
+					state = RS_IN_ONEOF;
+				else {
+					/* a bare letter is a one-member class */
+					ptr->data[0] = c;
+					ptr->len = 1;
+				}
+			} else
+				ts_error(ERROR,"Error in regis: %s at pos %d\n", str, i+1);
+		} else if (state == RS_IN_ONEOF) {
+			if (c == '^') {
+				ptr->type = RSF_NONEOF;
+				state = RS_IN_NONEOF;
+			} else if (isalpha(c)) {
+				ptr->data[0] = c;
+				ptr->len = 1;
+				state = RS_IN_ONEOF_IN;
+			} else
+				ts_error(ERROR,"Error in regis: %s at pos %d\n", str, i+1);
+		} else if (state == RS_IN_ONEOF_IN || state == RS_IN_NONEOF) {
+			if (isalpha(c)) {
+				ptr->data[ptr->len] = c;
+				ptr->len++;
+			} else if (c == ']') {
+				state = RS_IN_WAIT;
+			} else
+				ts_error(ERROR,"Error in regis: %s at pos %d\n", str, i+1);
+		} else
+			ts_error(ERROR,"Internal error in RS_compile: %d\n", state);
+	}
+
+	/* the input ran out while a '[' was still open */
+	if (state != RS_IN_WAIT)
+		ts_error(ERROR,"Error in regis: %s at pos %d\n", str, len+1);
+
+	return 0;
+}
+
+/*
+ * RS_free - release every node of r's list and reset r->node to NULL.
+ * The remaining fields of r are left as they are.
+ */
+void
+RS_free(Regis *r) {
+	RegisNode  *cur,
+			   *following;
+
+	for (cur = r->node; cur != NULL; cur = following) {
+		following = cur->next;	/* fetch the link before the node is freed */
+		free(cur);
+	}
+
+	r->node = NULL;
+}
+
+/*
+ * Tell whether ch belongs to node's character set.  A node with len == 0
+ * is compared against its first data byte only; otherwise data is searched
+ * as a zero-terminated string.
+ */
+static int
+regisNodeHasChar(RegisNode *node, unsigned char ch) {
+	if (node->len == 0)
+		return ch == *(node->data);
+	return strchr((char*)node->data, ch) != NULL;
+}
+
+/*
+ * RS_execute - match str against the compiled pattern r.
+ *
+ * r   - pattern; each node of r->node consumes exactly one character.
+ * str - text to test.
+ * len - length of str, or a negative value to have it taken with strlen().
+ *
+ * When r->issuffix is set the last r->nchar characters of str are tested,
+ * otherwise the first ones.  A string shorter than r->nchar never matches.
+ * An RSF_ONEOF node requires the character to be in its set, an RSF_NONEOF
+ * node requires it not to be; any other node type is reported through
+ * ts_error(ERROR, ...).
+ *
+ * Returns 1 on a match, 0 otherwise.
+ */
+int
+RS_execute(Regis *r, const char *str, int len) {
+	RegisNode  *node;
+	unsigned char *cur = (unsigned char*)str;
+
+	if (len<0)
+		len=strlen(str);
+
+	if (len<r->nchar)
+		return 0;
+
+	/* a suffix pattern is anchored at the end of the string */
+	if ( r->issuffix )
+		cur += len - r->nchar;
+
+	for (node = r->node; node != NULL; node = node->next, cur++) {
+		switch (node->type) {
+			case RSF_ONEOF:
+				if (!regisNodeHasChar(node, *cur))
+					return 0;
+				break;
+			case RSF_NONEOF:
+				if (regisNodeHasChar(node, *cur))
+					return 0;
+				break;
+			default:
+				ts_error(ERROR,"RS_execute: Unknown type node: %d\n", node->type);
+		}
+	}
+
+	return 1;
+}
{
SPNode *node = Conf->Dictionary;
SPNodeData *StopLow, *StopHigh, *StopMiddle;
- int level=0, wrdlen=strlen(word);
+ uint8 *ptr =(uint8*)word;
- while( node && level<wrdlen) {
+ while( node && *ptr) {
StopLow = node->data;
StopHigh = node->data+node->length;
while (StopLow < StopHigh) {
- StopMiddle = StopLow + (StopHigh - StopLow) / 2;
- if ( StopMiddle->val == ((uint8*)(word))[level] ) {
- if ( wrdlen==level+1 && StopMiddle->isword ) {
+ StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
+ if ( StopMiddle->val == *ptr ) {
+ if ( *(ptr+1)=='\0' && StopMiddle->isword ) {
if ( compoundonly && !StopMiddle->compoundallow )
return 0;
if ( (affixflag == 0) || (strchr(Conf->AffixData[StopMiddle->affix], affixflag) != NULL))
return 1;
}
node=StopMiddle->node;
- level++;
+ ptr++;
break;
- } else if ( StopMiddle->val < ((uint8*)(word))[level] ) {
+ } else if ( StopMiddle->val < *ptr ) {
StopLow = StopMiddle + 1;
} else {
StopHigh = StopMiddle;
}
MEMOUT(Conf->Affix);
}
- if (type == 's')
- sprintf(Conf->Affix[Conf->naffixes].mask, "%s$", mask);
- else
- sprintf(Conf->Affix[Conf->naffixes].mask, "^%s", mask);
- Conf->Affix[Conf->naffixes].compile = 1;
- Conf->Affix[Conf->naffixes].flagflags = flagflags;
- Conf->Affix[Conf->naffixes].flag = flag;
- Conf->Affix[Conf->naffixes].type = type;
-
- strcpy(Conf->Affix[Conf->naffixes].find, find);
- strcpy(Conf->Affix[Conf->naffixes].repl, repl);
- Conf->Affix[Conf->naffixes].replen = strlen(repl);
- Conf->naffixes++;
+
+ if ( strcmp(mask,".")==0 ) {
+ Conf->Affix[Conf->naffixes].issimple=1;
+ Conf->Affix[Conf->naffixes].isregis=0;
+ *( Conf->Affix[Conf->naffixes].mask )='\0';
+ } else if ( RS_isRegis(mask) ) {
+ Conf->Affix[Conf->naffixes].issimple=0;
+ Conf->Affix[Conf->naffixes].isregis=1;
+ strcpy(Conf->Affix[Conf->naffixes].mask, mask);
+ } else {
+ Conf->Affix[Conf->naffixes].issimple=0;
+ Conf->Affix[Conf->naffixes].isregis=0;
+ if (type == FF_SUFFIX)
+ sprintf(Conf->Affix[Conf->naffixes].mask, "%s$", mask);
+ else
+ sprintf(Conf->Affix[Conf->naffixes].mask, "^%s", mask);
+ }
+ Conf->Affix[Conf->naffixes].compile = 1;
+ Conf->Affix[Conf->naffixes].flagflags = flagflags;
+ Conf->Affix[Conf->naffixes].flag = flag;
+ Conf->Affix[Conf->naffixes].type = type;
+
+ strcpy(Conf->Affix[Conf->naffixes].find, find);
+ strcpy(Conf->Affix[Conf->naffixes].repl, repl);
+ Conf->Affix[Conf->naffixes].replen = strlen(repl);
+ Conf->naffixes++;
return (0);
}
continue;
}
- NIAddAffix(Conf, (int) flag, (char) flagflags, mask, find, repl, suffixes ? 's' : 'p');
+ NIAddAffix(Conf, (int) flag, (char) flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX);
}
fclose(affix);
return rs;
}
+/*
+ * mkVoidAffix - put a "void" node in front of the suffix or prefix tree.
+ *
+ * Conf        - dictionary whose Suffix (issuffix != 0) or Prefix
+ *               (issuffix == 0) root is replaced by the new node; the old
+ *               root becomes the new node's data->node.
+ * issuffix    - selects the tree and the slice of Conf->Affix to scan:
+ *               [startsuffix, Conf->naffixes) for suffixes,
+ *               [0, startsuffix) for prefixes.
+ * startsuffix - index in Conf->Affix that separates the two slices.
+ *
+ * The new node has length 1 and isvoid set.  Its single data entry gets an
+ * array of pointers to every affix in the slice whose replen is 0; when
+ * there is none, aff and naff stay zeroed.
+ */
+static void
+mkVoidAffix(IspellDict * Conf, int issuffix, int startsuffix) {
+	int			first,
+				last,
+				idx,
+				nvoid,
+				filled;
+	AffixNode  *head;
+
+	if (issuffix) {
+		first = startsuffix;
+		last = Conf->naffixes;
+	} else {
+		first = 0;
+		last = startsuffix;
+	}
+
+	head = (AffixNode*)malloc( ANHRDSZ + sizeof(AffixNodeData));
+	MEMOUT(head);
+	memset(head, 0, ANHRDSZ + sizeof(AffixNodeData) );
+	head->length=1;
+	head->isvoid=1;
+
+	/* splice the new node in as the root of the selected tree */
+	if (issuffix) {
+		head->data->node=Conf->Suffix;
+		Conf->Suffix = head;
+	} else {
+		head->data->node=Conf->Prefix;
+		Conf->Prefix = head;
+	}
+
+	/* first pass: how many affixes in the slice have replen == 0 */
+	nvoid = 0;
+	for (idx = first; idx < last; idx++) {
+		if (Conf->Affix[idx].replen==0)
+			nvoid++;
+	}
+
+	if ( nvoid==0 )
+		return;
+
+	head->data->aff = (AFFIX**)malloc( sizeof(AFFIX*) * nvoid );
+	MEMOUT(head->data->aff);
+	head->data->naff = (uint32)nvoid;
+
+	/* second pass: record a pointer to each of them */
+	filled = 0;
+	for (idx = first; idx < last; idx++) {
+		if (Conf->Affix[idx].replen==0) {
+			head->data->aff[filled] = &Conf->Affix[idx];
+			filled++;
+		}
+	}
+}
+
void
NISortAffixes(IspellDict * Conf)
{
Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, 'p');
Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, 's');
+ mkVoidAffix(Conf, 1, firstsuffix);
+ mkVoidAffix(Conf, 0, firstsuffix);
}
static AffixNodeData*
AffixNodeData *StopLow, *StopHigh, *StopMiddle;
uint8 symbol;
+ if ( node->isvoid ) { /* search void affixes */
+ if (node->data->naff)
+ return node->data;
+ node = node->data->node;
+ }
+
while( node && *level<wrdlen) {
StopLow = node->data;
StopHigh = node->data+node->length;
while (StopLow < StopHigh) {
- StopMiddle = StopLow + (StopHigh - StopLow) / 2;
+ StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
symbol = GETWCHAR(word,wrdlen,*level,type);
if ( StopMiddle->val == symbol ) {
+ (*level)++;
if ( StopMiddle->naff )
return StopMiddle;
node=StopMiddle->node;
- (*level)++;
break;
} else if ( StopMiddle->val < symbol ) {
StopLow = StopMiddle + 1;
static char *
CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *newword) {
- regmatch_t subs[2]; /* workaround for apache&linux */
- int err;
- pg_wchar *data;
- size_t data_len;
- int dat_len;
if ( flagflags & FF_COMPOUNDONLYAFX ) {
if ( (Affix->flagflags & FF_COMPOUNDONLYAFX) == 0 )
return NULL;
}
- if ( Affix->type=='s' ) {
+ if ( Affix->type==FF_SUFFIX ) {
strcpy(newword, word);
strcpy(newword + len - Affix->replen, Affix->find);
} else {
strcat(newword, word + Affix->replen);
}
- if (Affix->compile)
- {
- int wmasklen,masklen = strlen(Affix->mask);
- pg_wchar *mask;
- mask = (pg_wchar *) palloc((masklen + 1) * sizeof(pg_wchar));
- wmasklen = pg_mb2wchar_with_len( Affix->mask, mask, masklen);
-
- err = pg_regcomp(&(Affix->reg), mask, wmasklen, REG_EXTENDED | REG_ICASE | REG_NOSUB);
- pfree(mask);
- if (err)
+ if ( Affix->issimple ) {
+ return newword;
+ } else if ( Affix->isregis ) {
+ if (Affix->compile) {
+ RS_compile(&(Affix->reg.regis), (Affix->type==FF_SUFFIX) ? 1 : 0, Affix->mask);
+ Affix->compile = 0;
+ }
+ if ( RS_execute(&(Affix->reg.regis), newword, -1) )
+ return newword;
+ } else {
+ regmatch_t subs[2]; /* workaround for apache&linux */
+ int err;
+ pg_wchar *data;
+ size_t data_len;
+ int dat_len;
+ if (Affix->compile)
{
- /* regerror(err, &(Affix->reg), regerrstr, ERRSTRSIZE); */
- pg_regfree(&(Affix->reg));
- return (NULL);
+ int wmasklen,masklen = strlen(Affix->mask);
+ pg_wchar *mask;
+ mask = (pg_wchar *) palloc((masklen + 1) * sizeof(pg_wchar));
+ wmasklen = pg_mb2wchar_with_len( Affix->mask, mask, masklen);
+
+ err = pg_regcomp(&(Affix->reg.regex), mask, wmasklen, REG_EXTENDED | REG_ICASE | REG_NOSUB);
+ pfree(mask);
+ if (err)
+ {
+ /* regerror(err, &(Affix->reg.regex), regerrstr, ERRSTRSIZE); */
+ pg_regfree(&(Affix->reg.regex));
+ return (NULL);
+ }
+ Affix->compile = 0;
}
- Affix->compile = 0;
- }
- /* Convert data string to wide characters */
- dat_len = strlen(newword);
- data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
- data_len = pg_mb2wchar_with_len(newword, data, dat_len);
+ /* Convert data string to wide characters */
+ dat_len = strlen(newword);
+ data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
+ data_len = pg_mb2wchar_with_len(newword, data, dat_len);
- if (!(err = pg_regexec(&(Affix->reg), data,dat_len,NULL, 1, subs, 0))) {
- pfree(data);
- return newword;
+ if (!(err = pg_regexec(&(Affix->reg.regex), data,dat_len,NULL, 1, subs, 0))) {
+ pfree(data);
+ return newword;
+ }
+ pfree(data);
}
- pfree(data);
return NULL;
}
}
}
pnode = prefix->node;
- plevel++;
}
/* Find all other NORMAL forms of the 'word' (check suffix and then prefix)*/
}
}
pnode = prefix->node;
- plevel++;
}
}
}
snode=suffix->node;
- slevel++;
}
if (cur == forms) {
for (i = 0; i < Conf->naffixes; i++)
{
- if (Affix[i].compile == 0)
- pg_regfree(&(Affix[i].reg));
+ if (Affix[i].compile == 0) {
+ if ( Affix[i].isregis )
+ RS_free(&(Affix[i].reg.regis));
+ else
+ pg_regfree(&(Affix[i].reg.regex));
+ }
}
if (Conf->Spell) {
for (i = 0; i < Conf->nspell; i++)