#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-#include <ctype.h>
#include "regis.h"
+#include "ts_locale.h"
#include "common.h"
-int
+/*
+ * RS_isRegis: check whether str consists solely of characters the regis
+ * matcher understands: alphabetic characters plus '[', ']' and '^'.
+ *
+ * The string is walked in steps of pg_mblen() bytes.  Returns false at the
+ * first character outside that set, true otherwise; a NULL or empty str
+ * therefore yields true.
+ */
+bool
RS_isRegis(const char *str)
{
unsigned char *ptr = (unsigned char *) str;
while (ptr && *ptr)
- if (isalpha(*ptr) || *ptr == '[' || *ptr == ']' || *ptr == '^')
- ptr++;
+	if (!t_isalpha(ptr) && !t_iseq(ptr, '[') && !t_iseq(ptr, ']') && !t_iseq(ptr, '^'))
+		return false;		/* character outside the accepted set */
else
- return 0;
- return 1;
+		ptr += pg_mblen(ptr);	/* step over the whole character */
+
+	return true;
}
#define RS_IN_ONEOF 1
return ptr;
}
-int
-RS_compile(Regis * r, int issuffix, const char *str)
+void
+RS_compile(Regis * r, bool issuffix, char *str)
{
- int i,
- len = strlen(str);
+ int len = strlen(str);
int state = RS_IN_WAIT;
+ char *c = (char*)str;
RegisNode *ptr = NULL;
memset(r, 0, sizeof(Regis));
r->issuffix = (issuffix) ? 1 : 0;
- for (i = 0; i < len; i++)
+ while(*c)
{
- unsigned char c = *(((unsigned char *) str) + i);
-
if (state == RS_IN_WAIT)
{
- if (isalpha(c))
+ if (t_isalpha(c))
{
if (ptr)
ptr = newRegisNode(ptr, len);
else
ptr = r->node = newRegisNode(NULL, len);
- ptr->data[0] = c;
+ COPYCHAR(ptr->data, c);
ptr->type = RSF_ONEOF;
- ptr->len = 1;
+ ptr->len = pg_mblen(c);
}
- else if (c == '[')
+ else if (t_iseq(c,'['))
{
if (ptr)
ptr = newRegisNode(ptr, len);
state = RS_IN_ONEOF;
}
else
- ts_error(ERROR, "Error in regis: %s at pos %d\n", str, i + 1);
+ ts_error(ERROR, "Error in regis: %s", str );
}
else if (state == RS_IN_ONEOF)
{
- if (c == '^')
+ if (t_iseq(c,'^'))
{
ptr->type = RSF_NONEOF;
state = RS_IN_NONEOF;
}
- else if (isalpha(c))
+ else if (t_isalpha(c))
{
- ptr->data[0] = c;
- ptr->len = 1;
+ COPYCHAR(ptr->data, c);
+ ptr->len = pg_mblen(c);
state = RS_IN_ONEOF_IN;
}
else
- ts_error(ERROR, "Error in regis: %s at pos %d\n", str, i + 1);
+ ts_error(ERROR, "Error in regis: %s", str);
}
else if (state == RS_IN_ONEOF_IN || state == RS_IN_NONEOF)
{
- if (isalpha(c))
+ if (t_isalpha(c))
{
- ptr->data[ptr->len] = c;
- ptr->len++;
+ COPYCHAR(ptr->data+ptr->len, c);
+ ptr->len+=pg_mblen(c);
}
- else if (c == ']')
+ else if (t_iseq(c,']'))
state = RS_IN_WAIT;
else
- ts_error(ERROR, "Error in regis: %s at pos %d\n", str, i + 1);
+ ts_error(ERROR, "Error in regis: %s", str);
}
else
- ts_error(ERROR, "Internal error in RS_compile: %d\n", state);
+ ts_error(ERROR, "Internal error in RS_compile: %d", state);
+ c += pg_mblen(c);
}
ptr = r->node;
r->nchar++;
ptr = ptr->next;
}
-
- return 0;
}
void
r->node = NULL;
}
-int
-RS_execute(Regis * r, const char *str, int len)
+#ifdef TS_USE_WIDE
+/*
+ * mb_strchr: strchr()-like membership test that works on whole characters.
+ *
+ * Returns true if the single character starting at c occurs somewhere in
+ * the NUL-terminated string str.  str is walked in steps of pg_mblen()
+ * bytes; two characters are considered equal only when they have the same
+ * byte length and identical bytes.
+ */
+static bool
+mb_strchr(char *str, char *c)
+{
+	int			clen = pg_mblen(c);
+	int			plen;
+	char	   *ptr;
+
+	for (ptr = str; *ptr; ptr += plen)
+	{
+		plen = pg_mblen(ptr);
+		if (plen == clen && memcmp(ptr, c, clen) == 0)
+			return true;
+	}
+
+	return false;
+}
+#else
+/* Without TS_USE_WIDE only the first byte of c is looked up, via strchr(). */
+#define mb_strchr(s,c)	( (strchr((s),*(c)) == NULL) ? false : true )
+#endif
+
+
+/*
+ * RS_execute: match str against the compiled regis r.
+ *
+ * str must contain at least r->nchar characters (counted in pg_mblen()
+ * steps, not bytes); otherwise the result is false.  When r->issuffix is
+ * set, matching starts r->nchar characters before the end of str,
+ * otherwise at its beginning.  Each node of r is then tested against one
+ * successive character: an RSF_ONEOF node requires the character to be
+ * present in the node's data, an RSF_NONEOF node requires it to be absent.
+ * Returns true only if every node accepts its character.
+ */
+bool
+RS_execute(Regis * r, char *str)
{
RegisNode *ptr = r->node;
- unsigned char *c;
+	char	   *c;
+	int			len = 0;
- if (len < 0)
- len = strlen(str);
+
+	/* length of str in characters */
+	for (c = str; *c; c += pg_mblen(c))
+		len++;
if (len < r->nchar)
- return 0;
+		return false;
- if (r->issuffix)
- c = ((unsigned char *) str) + len - r->nchar;
- else
- c = (unsigned char *) str;
+
+	c = str;
+	if (r->issuffix)
+	{
+		/* skip everything except the trailing r->nchar characters */
+		for (len -= r->nchar; len > 0; len--)
+			c += pg_mblen(c);
+	}
+
while (ptr)
{
switch (ptr->type)
{
case RSF_ONEOF:
- if (ptr->len == 0)
- {
- if (*c != *(ptr->data))
- return 0;
- }
- else if (strchr((char *) ptr->data, *c) == NULL)
- return 0;
+				if (!mb_strchr((char *) ptr->data, c))
+					return false;
break;
case RSF_NONEOF:
- if (ptr->len == 0)
- {
- if (*c == *(ptr->data))
- return 0;
- }
- else if (strchr((char *) ptr->data, *c) != NULL)
- return 0;
+				if (mb_strchr((char *) ptr->data, c))
+					return false;
break;
default:
- ts_error(ERROR, "RS_execute: Unknown type node: %d\n", ptr->type);
+				/* no trailing newline, matching the other ts_error messages */
+				ts_error(ERROR, "RS_execute: Unknown type node: %d", ptr->type);
}
ptr = ptr->next;
- c++;
+		c += pg_mblen(c);
}
- return 1;
+	return true;
}
#include "postgres.h"
#include "spell.h"
+#include "common.h"
#include "ts_locale.h"
#define MAX_NORM 1024
#define ERRSTRSIZE 1024
-#define STRNCASECMP(x,y) pg_strncasecmp(x, y, strlen(y))
+#define STRNCMP(s,p) strncmp( (s), (p), strlen(p) )
#define GETWCHAR(W,L,N,T) ( ((uint8*)(W))[ ((T)==FF_PREFIX) ? (N) : ( (L) - 1 - (N) ) ] )
#define GETCHAR(A,N,T) GETWCHAR( (A)->repl, (A)->replen, N, T )
return d;
}
+/*
+ * findchar: return a pointer to the first character of str for which
+ * t_iseq(ptr, c) holds, or NULL if the end of str is reached without a
+ * match.  str is scanned in steps of pg_mblen() bytes.
+ */
+static char *
+findchar(char *str, int c)
+{
+	char	   *p;
+
+	for (p = str; *p; p += pg_mblen(p))
+		if (t_iseq(p, c))
+			return p;
+
+	return NULL;
+}
+
+
/* backward string compare for suffix tree operations */
static int
strbcmp(const unsigned char *s1, const unsigned char *s2)
char *s;
const char *flag;
+ pg_verifymbstr( str, strlen(str), false);
+
flag = NULL;
- if ((s = strchr(str, '/')))
+ if ((s = findchar(str, '/')))
{
*s++ = '\0';
flag = s;
while (*s)
{
- if (isprint((unsigned char) *s) &&
- !isspace((unsigned char) *s))
+ /* only single-byte flag characters are allowed; this keeps flag handling fast */
+ if (pg_mblen(s) == 1 && t_isprint(s) && !t_isspace(s))
s++;
else
{
}
else
flag = "";
- lowerstr(str);
- /* Dont load words if first letter is not required */
- /* It allows to optimize loading at search time */
+
+
s = str;
while (*s)
{
- if (*s == '\r' || *s == '\n')
+ if (t_isspace(s)) {
*s = '\0';
- s++;
+ break;
+ }
+ s+=pg_mblen(s);
}
+ lowerstr(str);
+
NIAddSpell(Conf, str, flag);
}
fclose(dict);
}
else
{
+ int masklen = strlen(mask);
Conf->Affix[Conf->naffixes].issimple = 0;
Conf->Affix[Conf->naffixes].isregis = 0;
- Conf->Affix[Conf->naffixes].mask = (char *) malloc(strlen(mask) + 2);
- if (type == FF_SUFFIX)
- sprintf(Conf->Affix[Conf->naffixes].mask, "%s$", mask);
+ Conf->Affix[Conf->naffixes].mask = (char *) malloc(masklen + 2);
+ if (type == FF_SUFFIX)
+ sprintf(Conf->Affix[Conf->naffixes].mask, "%s$", mask);
else
sprintf(Conf->Affix[Conf->naffixes].mask, "^%s", mask);
}
return (0);
}
-static char *
-remove_spaces(char *dist, char *src)
-{
- char *d,
- *s;
-
- d = dist;
- s = src;
- while (*s)
- {
- if (*s != ' ' && *s != '-' && *s != '\t')
- {
- *d = *s;
- d++;
- }
- s++;
+/* States of the parse_affentry() scanner */
+#define PAE_WAIT_MASK	0	/* nothing of the mask seen yet */
+#define PAE_INMASK	1	/* collecting the mask, up to '>' */
+#define PAE_WAIT_FIND	2	/* after '>': expect '-' or the first repl char */
+#define PAE_INFIND	3	/* after '-': collecting find, up to ',' */
+#define PAE_WAIT_REPL	4	/* after ',': expect '-' or the first repl char */
+#define PAE_INREPL	5	/* collecting repl, up to '#' or end of input */
+
+/*
+ * parse_affentry: split one affix-file line into its mask, find and repl
+ * parts.
+ *
+ * str is scanned one character (pg_mblen() bytes) at a time.  Whitespace
+ * is dropped everywhere.  The mask is everything before '>'.  After '>',
+ * a '-' introduces the find string, which ends at ','; alphabetic
+ * characters seen directly after '>' or after ',' go into repl instead.
+ * A '-' right after ',' means an empty repl, and '#' ends repl.  A line
+ * whose first non-space character is '#' is rejected.  Any other
+ * unexpected character is reported through ts_error().
+ *
+ * The three output buffers are written without bounds checks, so each
+ * must be able to hold at least strlen(str) + 1 bytes.
+ *
+ * Returns true when the mask and at least one of find/repl are non-empty.
+ */
+static bool
+parse_affentry(char *str, char *mask, char *find, char *repl)
+{
+	int			state = PAE_WAIT_MASK;
+	char	   *pmask = mask;
+	char	   *pfind = find;
+	char	   *prepl = repl;
+
+	*mask = *find = *repl = '\0';
+
+	for (; *str; str += pg_mblen(str))
+	{
+		switch (state)
+		{
+			case PAE_WAIT_MASK:
+				if (t_iseq(str, '#'))
+					return false;
+				if (!t_isspace(str))
+				{
+					COPYCHAR(pmask, str);
+					pmask += pg_mblen(str);
+					state = PAE_INMASK;
+				}
+				break;
+
+			case PAE_INMASK:
+				if (t_iseq(str, '>'))
+				{
+					*pmask = '\0';
+					state = PAE_WAIT_FIND;
+				}
+				else if (!t_isspace(str))
+				{
+					COPYCHAR(pmask, str);
+					pmask += pg_mblen(str);
+				}
+				break;
+
+			case PAE_WAIT_FIND:
+				if (t_iseq(str, '-'))
+					state = PAE_INFIND;
+				else if (t_isalpha(str))
+				{
+					COPYCHAR(prepl, str);
+					prepl += pg_mblen(str);
+					state = PAE_INREPL;
+				}
+				else if (!t_isspace(str))
+					ts_error(ERROR, "Affix parse error");
+				break;
+
+			case PAE_INFIND:
+				if (t_iseq(str, ','))
+				{
+					*pfind = '\0';
+					state = PAE_WAIT_REPL;
+				}
+				else if (t_isalpha(str))
+				{
+					COPYCHAR(pfind, str);
+					pfind += pg_mblen(str);
+				}
+				else if (!t_isspace(str))
+					ts_error(ERROR, "Affix parse error");
+				break;
+
+			case PAE_WAIT_REPL:
+				if (t_iseq(str, '-'))
+					goto finish;	/* void repl */
+				if (t_isalpha(str))
+				{
+					COPYCHAR(prepl, str);
+					prepl += pg_mblen(str);
+					state = PAE_INREPL;
+				}
+				else if (!t_isspace(str))
+					ts_error(ERROR, "Affix parse error");
+				break;
+
+			case PAE_INREPL:
+				if (t_iseq(str, '#'))
+				{
+					*prepl = '\0';
+					goto finish;
+				}
+				if (t_isalpha(str))
+				{
+					COPYCHAR(prepl, str);
+					prepl += pg_mblen(str);
+				}
+				else if (!t_isspace(str))
+					ts_error(ERROR, "Affix parse error");
+				break;
+
+			default:
+				ts_error(ERROR, "Unknown state in parse_affentry: %d", state);
+				break;
+		}
}
- *d = 0;
- return (dist);
-}
+
+finish:
+	/* terminate whatever was collected, including unfinished parts */
+	*pmask = *pfind = *prepl = '\0';
+
+	return *mask && (*find || *repl);
+}
int
NIImportAffixes(IspellDict * Conf, const char *filename)
{
char str[BUFSIZ];
+ char tmpstr[BUFSIZ];
char mask[BUFSIZ];
char find[BUFSIZ];
char repl[BUFSIZ];
char *s;
- int i;
int suffixes = 0;
int prefixes = 0;
int flag = 0;
while (fgets(str, sizeof(str), affix))
{
- if (STRNCASECMP(str, "compoundwords") == 0)
+ pg_verifymbstr( str, strlen(str), false);
+ memcpy(tmpstr, str, 32); /* compoundwords... */
+ tmpstr[32]='\0';
+ lowerstr(tmpstr);
+ if (STRNCMP(tmpstr, "compoundwords") == 0)
{
- s = strchr(str, 'l');
+ s = findchar(str, 'l');
if (s)
{
- while (*s != ' ')
- s++;
- while (*s == ' ')
- s++;
- Conf->compoundcontrol = *s;
+ while (*s && !t_isspace(s)) s++;
+ while (*s && t_isspace(s)) s++;
+ if ( *s && pg_mblen(s) == 1 )
+ Conf->compoundcontrol = *s;
continue;
}
}
- if (STRNCASECMP(str, "suffixes") == 0)
+ if (STRNCMP(tmpstr, "suffixes") == 0)
{
suffixes = 1;
prefixes = 0;
continue;
}
- if (STRNCASECMP(str, "prefixes") == 0)
+ if (STRNCMP(tmpstr, "prefixes") == 0)
{
suffixes = 0;
prefixes = 1;
continue;
}
- if (STRNCASECMP(str, "flag ") == 0)
+ if (STRNCMP(tmpstr, "flag") == 0)
{
- s = str + 5;
+ s = str + 4;
flagflags = 0;
- while (*s == ' ')
- s++;
+
+ while (*s && t_isspace(s)) s++;
+
+ /* allow only single-byte flag characters */
+ if ( pg_mblen(s) != 1 )
+ continue;
+
if (*s == '*')
{
flagflags |= FF_CROSSPRODUCT;
if (*s == '\\')
s++;
+ /* allow only single-byte flag characters */
+ if ( pg_mblen(s) != 1 ) {
+ flagflags = 0;
+ continue;
+ }
+
flag = (unsigned char) *s;
continue;
}
if ((!suffixes) && (!prefixes))
continue;
- if ((s = strchr(str, '#')))
- *s = 0;
- if (!*str)
- continue;
+
lowerstr(str);
- strcpy(mask, "");
- strcpy(find, "");
- strcpy(repl, "");
- i = sscanf(str, "%[^>\n]>%[^,\n],%[^\n]", mask, find, repl);
- remove_spaces(str, repl);
- strcpy(repl, str);
- remove_spaces(str, find);
- strcpy(find, str);
- remove_spaces(str, mask);
- strcpy(mask, str);
- switch (i)
- {
- case 3:
- break;
- case 2:
- if (*find != '\0')
- {
- strcpy(repl, find);
- strcpy(find, "");
- }
- break;
- default:
- continue;
- }
+ if ( !parse_affentry(str, mask, find, repl) )
+ continue;
NIAddAffix(Conf, flag, flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX);
-
}
fclose(affix);
{
if (Affix->compile)
{
- RS_compile(&(Affix->reg.regis), (Affix->type == FF_SUFFIX) ? 1 : 0, Affix->mask);
+ RS_compile(&(Affix->reg.regis), (Affix->type == FF_SUFFIX) ? true : false, Affix->mask);
Affix->compile = 0;
}
- if (RS_execute(&(Affix->reg.regis), newword, -1))
+ if (RS_execute(&(Affix->reg.regis), newword))
return newword;
}
else
{
- regmatch_t subs[2]; /* workaround for apache&linux */
int err;
pg_wchar *data;
size_t data_len;
- int dat_len;
+ int newword_len;
if (Affix->compile)
{
int wmasklen,
masklen = strlen(Affix->mask);
pg_wchar *mask;
-
mask = (pg_wchar *) palloc((masklen + 1) * sizeof(pg_wchar));
wmasklen = pg_mb2wchar_with_len(Affix->mask, mask, masklen);
- err = pg_regcomp(&(Affix->reg.regex), mask, wmasklen, REG_EXTENDED | REG_ICASE | REG_NOSUB);
+ err = pg_regcomp(&(Affix->reg.regex), mask, wmasklen, REG_ADVANCED | REG_NOSUB);
pfree(mask);
if (err)
{
}
/* Convert data string to wide characters */
- dat_len = strlen(newword);
- data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
- data_len = pg_mb2wchar_with_len(newword, data, dat_len);
+ newword_len = strlen(newword);
+ data = (pg_wchar *) palloc((newword_len + 1) * sizeof(pg_wchar));
+ data_len = pg_mb2wchar_with_len(newword, data, newword_len);
- if (!(err = pg_regexec(&(Affix->reg.regex), data, dat_len, 0, NULL, 1, subs, 0)))
+ if (!(err = pg_regexec(&(Affix->reg.regex), data, data_len, 0, NULL, 0, NULL, 0)))
{
pfree(data);
return newword;