From 7ac8a4be8946c11d5a6bf91bb971b9750c1c60e5 Mon Sep 17 00:00:00 2001 From: Teodor Sigaev Date: Wed, 21 Dec 2005 13:05:49 +0000 Subject: [PATCH] Multibyte encodings support for ISpell dictionary --- contrib/tsearch2/ispell/regis.c | 136 +++++++++++-------- contrib/tsearch2/ispell/regis.h | 8 +- contrib/tsearch2/ispell/spell.c | 229 ++++++++++++++++++++------------ contrib/tsearch2/stopword.c | 4 +- contrib/tsearch2/ts_locale.h | 2 +- 5 files changed, 232 insertions(+), 147 deletions(-) diff --git a/contrib/tsearch2/ispell/regis.c b/contrib/tsearch2/ispell/regis.c index 996417b18a..db6f1873f3 100644 --- a/contrib/tsearch2/ispell/regis.c +++ b/contrib/tsearch2/ispell/regis.c @@ -1,22 +1,23 @@ #include #include #include -#include #include "regis.h" +#include "ts_locale.h" #include "common.h" -int +bool RS_isRegis(const char *str) { unsigned char *ptr = (unsigned char *) str; while (ptr && *ptr) - if (isalpha(*ptr) || *ptr == '[' || *ptr == ']' || *ptr == '^') - ptr++; + if (t_isalpha(ptr) || t_iseq(ptr,'[') || t_iseq(ptr,']') || t_iseq(ptr, '^')) + ptr+=pg_mblen(ptr); else - return 0; - return 1; + return false; + + return true; } #define RS_IN_ONEOF 1 @@ -38,34 +39,32 @@ newRegisNode(RegisNode * prev, int len) return ptr; } -int -RS_compile(Regis * r, int issuffix, const char *str) +void +RS_compile(Regis * r, bool issuffix, char *str) { - int i, - len = strlen(str); + int len = strlen(str); int state = RS_IN_WAIT; + char *c = (char*)str; RegisNode *ptr = NULL; memset(r, 0, sizeof(Regis)); r->issuffix = (issuffix) ? 1 : 0; - for (i = 0; i < len; i++) + while(*c) { - unsigned char c = *(((unsigned char *) str) + i); - if (state == RS_IN_WAIT) { - if (isalpha(c)) + if (t_isalpha(c)) { if (ptr) ptr = newRegisNode(ptr, len); else ptr = r->node = newRegisNode(NULL, len); - ptr->data[0] = c; + COPYCHAR(ptr->data, c); ptr->type = RSF_ONEOF; - ptr->len = 1; + ptr->len = pg_mblen(c); } - else if (c == '[') + else if (t_iseq(c,'[')) { if (ptr) ptr = newRegisNode(ptr, len); @@ -75,38 +74,39 @@ RS_compile(Regis * r, int issuffix, const char *str) state = RS_IN_ONEOF; } else - ts_error(ERROR, "Error in regis: %s at pos %d\n", str, i + 1); + ts_error(ERROR, "Error in regis: %s", str ); } else if (state == RS_IN_ONEOF) { - if (c == '^') + if (t_iseq(c,'^')) { ptr->type = RSF_NONEOF; state = RS_IN_NONEOF; } - else if (isalpha(c)) + else if (t_isalpha(c)) { - ptr->data[0] = c; - ptr->len = 1; + COPYCHAR(ptr->data, c); + ptr->len = pg_mblen(c); state = RS_IN_ONEOF_IN; } else - ts_error(ERROR, "Error in regis: %s at pos %d\n", str, i + 1); + ts_error(ERROR, "Error in regis: %s", str); } else if (state == RS_IN_ONEOF_IN || state == RS_IN_NONEOF) { - if (isalpha(c)) + if (t_isalpha(c)) { - ptr->data[ptr->len] = c; - ptr->len++; + COPYCHAR(ptr->data+ptr->len, c); + ptr->len+=pg_mblen(c); } - else if (c == ']') + else if (t_iseq(c,']')) state = RS_IN_WAIT; else - ts_error(ERROR, "Error in regis: %s at pos %d\n", str, i + 1); + ts_error(ERROR, "Error in regis: %s", str); } else - ts_error(ERROR, "Internal error in RS_compile: %d\n", state); + ts_error(ERROR, "Internal error in RS_compile: %d", state); + c += pg_mblen(c); } ptr = r->node; @@ -115,8 +115,6 @@ RS_compile(Regis * r, int issuffix, const char *str) r->nchar++; ptr = ptr->next; } - - return 0; } void @@ -135,51 +133,77 @@ RS_free(Regis * r) r->node = NULL; } -int -RS_execute(Regis * r, const char *str, int len) +#ifdef TS_USE_WIDE +static bool +mb_strchr(char *str, char *c) { + int clen = pg_mblen(c), plen,i; + char *ptr =str; + bool res=false; + + clen = pg_mblen(c); + while( *ptr && !res) { + plen = pg_mblen(ptr); + if ( plen == clen ) { + i=plen; + res = true; + while(i--) + if ( *(ptr+i) != *(c+i) ) { + res = false; + break; + } + } + + ptr += plen; + } + + return res; +} +#else +#define mb_strchr(s,c) ( (strchr((s),*(c)) == NULL) ? false : true ) +#endif + + +bool +RS_execute(Regis * r, char *str) { RegisNode *ptr = r->node; - unsigned char *c; + char *c = str; + int len=0; - if (len < 0) - len = strlen(str); + while(*c) { + len++; + c += pg_mblen(c); + } if (len < r->nchar) return 0; - if (r->issuffix) - c = ((unsigned char *) str) + len - r->nchar; - else - c = (unsigned char *) str; + c = str; + if (r->issuffix) { + len -= r->nchar; + while(len-- > 0) + c += pg_mblen(c); + } + while (ptr) { switch (ptr->type) { case RSF_ONEOF: - if (ptr->len == 0) - { - if (*c != *(ptr->data)) - return 0; - } - else if (strchr((char *) ptr->data, *c) == NULL) - return 0; + if ( mb_strchr((char *) ptr->data, c) != true ) + return false; break; case RSF_NONEOF: - if (ptr->len == 0) - { - if (*c == *(ptr->data)) - return 0; - } - else if (strchr((char *) ptr->data, *c) != NULL) - return 0; + if ( mb_strchr((char *) ptr->data, c) == true ) + return false; break; default: ts_error(ERROR, "RS_execute: Unknown type node: %d\n", ptr->type); } ptr = ptr->next; - c++; + c+=pg_mblen(c); } - return 1; + return true; } diff --git a/contrib/tsearch2/ispell/regis.h b/contrib/tsearch2/ispell/regis.h index 6f26f66d46..5bc337c925 100644 --- a/contrib/tsearch2/ispell/regis.h +++ b/contrib/tsearch2/ispell/regis.h @@ -27,12 +27,12 @@ typedef struct Regis unused:15; } Regis; -int RS_isRegis(const char *str); +bool RS_isRegis(const char *str); -int RS_compile(Regis * r, int issuffix, const char *str); +void RS_compile(Regis * r, bool issuffix, char *str); void RS_free(Regis * r); -/*×ÏÚ×ÒÁÝÁÅÔ 1 ÅÓÌÉ ÍÁÔÞÉÔÓÑ */ -int RS_execute(Regis * r, const char *str, int len); +/*returns true if matches */ +bool RS_execute(Regis * r, char *str); #endif diff --git a/contrib/tsearch2/ispell/spell.c b/contrib/tsearch2/ispell/spell.c index baa36f31f1..d702dbd9cc 100644 --- a/contrib/tsearch2/ispell/spell.c +++ b/contrib/tsearch2/ispell/spell.c @@ -6,6 +6,7 @@ #include "postgres.h" #include "spell.h" +#include "common.h" #include "ts_locale.h" #define MAX_NORM 1024 @@ -13,7 +14,7 @@ #define ERRSTRSIZE 1024 -#define STRNCASECMP(x,y) pg_strncasecmp(x, y, strlen(y)) +#define STRNCMP(s,p) strncmp( (s), (p), strlen(p) ) #define GETWCHAR(W,L,N,T) ( ((uint8*)(W))[ ((T)==FF_PREFIX) ? (N) : ( (L) - 1 - (N) ) ] ) #define GETCHAR(A,N,T) GETWCHAR( (A)->repl, (A)->replen, N, T ) @@ -41,6 +42,18 @@ strnduplicate(char *s, int len) return d; } +static char * +findchar(char *str, int c) { + while( *str ) { + if ( t_iseq(str, c) ) + return str; + str+=pg_mblen(str); + } + + return NULL; +} + + /* backward string compare for suffix tree operations */ static int strbcmp(const unsigned char *s1, const unsigned char *s2) @@ -145,15 +158,17 @@ NIImportDictionary(IspellDict * Conf, const char *filename) char *s; const char *flag; + pg_verifymbstr( str, strlen(str), false); + flag = NULL; - if ((s = strchr(str, '/'))) + if ((s = findchar(str, '/'))) { *s++ = '\0'; flag = s; while (*s) { - if (isprint((unsigned char) *s) && - !isspace((unsigned char) *s)) + /* we allow only single encoded flags for faster works */ + if (pg_mblen(s) == 1 && t_isprint(s) && !t_isspace(s)) s++; else { @@ -164,16 +179,19 @@ NIImportDictionary(IspellDict * Conf, const char *filename) } else flag = ""; - lowerstr(str); - /* Dont load words if first letter is not required */ - /* It allows to optimize loading at search time */ + + s = str; while (*s) { - if (*s == '\r' || *s == '\n') + if (t_isspace(s)) { *s = '\0'; - s++; + break; + } + s+=pg_mblen(s); } + lowerstr(str); + NIAddSpell(Conf, str, flag); } fclose(dict); @@ -253,11 +271,12 @@ NIAddAffix(IspellDict * Conf, int flag, char flagflags, const char *mask, const } else { + int masklen = strlen(mask); Conf->Affix[Conf->naffixes].issimple = 0; Conf->Affix[Conf->naffixes].isregis = 0; - Conf->Affix[Conf->naffixes].mask = (char *) malloc(strlen(mask) + 2); - if (type == FF_SUFFIX) - sprintf(Conf->Affix[Conf->naffixes].mask, "%s$", mask); + Conf->Affix[Conf->naffixes].mask = (char *) malloc(masklen + 2); + if (type == FF_SUFFIX) + sprintf(Conf->Affix[Conf->naffixes].mask, "%s$", mask); else sprintf(Conf->Affix[Conf->naffixes].mask, "^%s", mask); } @@ -277,37 +296,93 @@ NIAddAffix(IspellDict * Conf, int flag, char flagflags, const char *mask, const return (0); } -static char * -remove_spaces(char *dist, char *src) -{ - char *d, - *s; - - d = dist; - s = src; - while (*s) - { - if (*s != ' ' && *s != '-' && *s != '\t') - { - *d = *s; - d++; - } - s++; +#define PAE_WAIT_MASK 0 +#define PAE_INMASK 1 +#define PAE_WAIT_FIND 2 +#define PAE_INFIND 3 +#define PAE_WAIT_REPL 4 +#define PAE_INREPL 5 + +static bool +parse_affentry( char *str, char *mask, char *find, char *repl ) { + int state = PAE_WAIT_MASK; + char *pmask=mask, *pfind=find, *prepl=repl; + + *mask = *find = *repl = '\0'; + + while(*str) { + if ( state == PAE_WAIT_MASK ) { + if ( t_iseq(str,'#') ) + return false; + else if (!t_isspace(str)) { + COPYCHAR(pmask, str); + pmask += pg_mblen(str); + state = PAE_INMASK; + } + } else if ( state == PAE_INMASK ) { + if ( t_iseq(str,'>') ) { + *pmask='\0'; + state = PAE_WAIT_FIND; + } else if (!t_isspace(str)) { + COPYCHAR(pmask, str); + pmask += pg_mblen(str); + } + } else if ( state == PAE_WAIT_FIND ) { + if ( t_iseq(str,'-') ) { + state = PAE_INFIND; + } else if (t_isalpha(str)) { + COPYCHAR(prepl,str); + prepl += pg_mblen(str); + state = PAE_INREPL; + } else if (!t_isspace(str)) + ts_error(ERROR, "Affix parse error"); + } else if ( state == PAE_INFIND ) { + if ( t_iseq(str,',') ) { + *pfind='\0'; + state = PAE_WAIT_REPL; + } else if (t_isalpha(str)) { + COPYCHAR(pfind,str); + pfind += pg_mblen(str); + } else if (!t_isspace(str)) + ts_error(ERROR, "Affix parse error"); + } else if ( state == PAE_WAIT_REPL ) { + if ( t_iseq(str,'-') ) { + break; /* void repl */ + } else if ( t_isalpha(str) ) { + COPYCHAR(prepl,str); + prepl += pg_mblen(str); + state = PAE_INREPL; + } else if (!t_isspace(str)) + ts_error(ERROR, "Affix parse error"); + } else if ( state == PAE_INREPL ) { + if ( t_iseq(str,'#') ) { + *prepl = '\0'; + break; + } else if ( t_isalpha(str) ) { + COPYCHAR(prepl,str); + prepl += pg_mblen(str); + } else if (!t_isspace(str)) + ts_error(ERROR, "Affix parse error"); + } else + ts_error(ERROR, "Unknown state in parse_affentry: %d", state); + + str += pg_mblen(str); } - *d = 0; - return (dist); -} + *pmask = *pfind = *prepl = '\0'; + + return ( *mask && ( *find || *repl) ) ? true : false; +} int NIImportAffixes(IspellDict * Conf, const char *filename) { char str[BUFSIZ]; + char tmpstr[BUFSIZ]; char mask[BUFSIZ]; char find[BUFSIZ]; char repl[BUFSIZ]; char *s; - int i; int suffixes = 0; int prefixes = 0; int flag = 0; @@ -320,37 +395,45 @@ NIImportAffixes(IspellDict * Conf, const char *filename) while (fgets(str, sizeof(str), affix)) { - if (STRNCASECMP(str, "compoundwords") == 0) + pg_verifymbstr( str, strlen(str), false); + memcpy(tmpstr, str, 32); /* compoundwords... */ + tmpstr[32]='\0'; + lowerstr(tmpstr); + if (STRNCMP(tmpstr, "compoundwords") == 0) { - s = strchr(str, 'l'); + s = findchar(str, 'l'); if (s) { - while (*s != ' ') - s++; - while (*s == ' ') - s++; - Conf->compoundcontrol = *s; + while (*s && !t_isspace(s)) s++; + while (*s && t_isspace(s)) s++; + if ( *s && pg_mblen(s) == 1 ) + Conf->compoundcontrol = *s; continue; } } - if (STRNCASECMP(str, "suffixes") == 0) + if (STRNCMP(tmpstr, "suffixes") == 0) { suffixes = 1; prefixes = 0; continue; } - if (STRNCASECMP(str, "prefixes") == 0) + if (STRNCMP(tmpstr, "prefixes") == 0) { suffixes = 0; prefixes = 1; continue; } - if (STRNCASECMP(str, "flag ") == 0) + if (STRNCMP(tmpstr, "flag") == 0) { - s = str + 5; + s = str + 4; flagflags = 0; - while (*s == ' ') - s++; + + while (*s && t_isspace(s)) s++; + + /* allow only single-encoded flags */ + if ( pg_mblen(s) != 1 ) + continue; + if (*s == '*') { flagflags |= FF_CROSSPRODUCT; @@ -365,43 +448,23 @@ NIImportAffixes(IspellDict * Conf, const char *filename) if (*s == '\\') s++; + /* allow only single-encoded flags */ + if ( pg_mblen(s) != 1 ) { + flagflags = 0; + continue; + } + flag = (unsigned char) *s; continue; } if ((!suffixes) && (!prefixes)) continue; - if ((s = strchr(str, '#'))) - *s = 0; - if (!*str) - continue; + lowerstr(str); - strcpy(mask, ""); - strcpy(find, ""); - strcpy(repl, ""); - i = sscanf(str, "%[^>\n]>%[^,\n],%[^\n]", mask, find, repl); - remove_spaces(str, repl); - strcpy(repl, str); - remove_spaces(str, find); - strcpy(find, str); - remove_spaces(str, mask); - strcpy(mask, str); - switch (i) - { - case 3: - break; - case 2: - if (*find != '\0') - { - strcpy(repl, find); - strcpy(find, ""); - } - break; - default: - continue; - } + if ( !parse_affentry(str, mask, find, repl) ) + continue; NIAddAffix(Conf, flag, flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX); - } fclose(affix); @@ -768,30 +831,28 @@ CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *ne { if (Affix->compile) { - RS_compile(&(Affix->reg.regis), (Affix->type == FF_SUFFIX) ? 1 : 0, Affix->mask); + RS_compile(&(Affix->reg.regis), (Affix->type == FF_SUFFIX) ? true : false, Affix->mask); Affix->compile = 0; } - if (RS_execute(&(Affix->reg.regis), newword, -1)) + if (RS_execute(&(Affix->reg.regis), newword)) return newword; } else { - regmatch_t subs[2]; /* workaround for apache&linux */ int err; pg_wchar *data; size_t data_len; - int dat_len; + int newword_len; if (Affix->compile) { int wmasklen, masklen = strlen(Affix->mask); pg_wchar *mask; - mask = (pg_wchar *) palloc((masklen + 1) * sizeof(pg_wchar)); wmasklen = pg_mb2wchar_with_len(Affix->mask, mask, masklen); - err = pg_regcomp(&(Affix->reg.regex), mask, wmasklen, REG_EXTENDED | REG_ICASE | REG_NOSUB); + err = pg_regcomp(&(Affix->reg.regex), mask, wmasklen, REG_ADVANCED | REG_NOSUB); pfree(mask); if (err) { @@ -804,11 +865,11 @@ CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *ne } /* Convert data string to wide characters */ - dat_len = strlen(newword); - data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar)); - data_len = pg_mb2wchar_with_len(newword, data, dat_len); + newword_len = strlen(newword); + data = (pg_wchar *) palloc((newword_len + 1) * sizeof(pg_wchar)); + data_len = pg_mb2wchar_with_len(newword, data, newword_len); - if (!(err = pg_regexec(&(Affix->reg.regex), data, dat_len, 0, NULL, 1, subs, 0))) + if (!(err = pg_regexec(&(Affix->reg.regex), data, data_len, 0, NULL, 0, NULL, 0))) { pfree(data); return newword; diff --git a/contrib/tsearch2/stopword.c b/contrib/tsearch2/stopword.c index 2a9a464596..f3894714d2 100644 --- a/contrib/tsearch2/stopword.c +++ b/contrib/tsearch2/stopword.c @@ -4,8 +4,6 @@ */ #include "postgres.h" -#include - #include "miscadmin.h" #include "common.h" @@ -71,6 +69,8 @@ readstoplist(text *in, StopList * s) while (fgets(buf, STOPBUFLEN, hin)) { buf[strlen(buf) - 1] = '\0'; + pg_verifymbstr( buf, strlen(buf), false ); + lowerstr(buf); if (*buf == '\0') continue; diff --git a/contrib/tsearch2/ts_locale.h b/contrib/tsearch2/ts_locale.h index 2d5bc17a96..8695b27a0e 100644 --- a/contrib/tsearch2/ts_locale.h +++ b/contrib/tsearch2/ts_locale.h @@ -57,7 +57,7 @@ int _t_isprint( char *ptr ); int lll = pg_mblen( s ); \ \ while( lll-- ) \ - TOUCHAR(d+lll) = TOUCHAR(s+lll); \ + TOUCHAR((d)+lll) = TOUCHAR((s)+lll); \ } while(0) -- 2.40.0