Multibyte encodings support for ISpell dictionary

author Teodor Sigaev <teodor@sigaev.ru>

Wed, 21 Dec 2005 13:05:49 +0000 (13:05 +0000)

committer Teodor Sigaev <teodor@sigaev.ru>

Wed, 21 Dec 2005 13:05:49 +0000 (13:05 +0000)
author Teodor Sigaev <teodor@sigaev.ru>
Wed, 21 Dec 2005 13:05:49 +0000 (13:05 +0000)
committer Teodor Sigaev <teodor@sigaev.ru>
Wed, 21 Dec 2005 13:05:49 +0000 (13:05 +0000)
diff --git a/contrib/tsearch2/ispell/regis.c b/contrib/tsearch2/ispell/regis.c

index 996417b18a9c5fa5e9790da3eda0a9a5ffd16925..db6f1873f339dc15908160dab45ed9664335e51d 100644 (file)
--- a/contrib/tsearch2/ispell/regis.c
+++ b/contrib/tsearch2/ispell/regis.c
@@ -1,22 +1,23 @@
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
-#include <ctype.h>
  
  #include "regis.h"
+#include "ts_locale.h"
  #include "common.h"
  
-int
+bool
  RS_isRegis(const char *str)
  {
         unsigned char *ptr = (unsigned char *) str;
  
         while (ptr && *ptr)
-               if (isalpha(*ptr) || *ptr == '[' || *ptr == ']' || *ptr == '^')
-                       ptr++;
+               if (t_isalpha(ptr) || t_iseq(ptr,'[') || t_iseq(ptr,']') || t_iseq(ptr, '^'))
+                       ptr+=pg_mblen(ptr);
                 else
-                       return 0;
-       return 1;
+                       return false;
+
+       return true;
  }
  
  #define RS_IN_ONEOF 1
@@ -38,34 +39,32 @@ newRegisNode(RegisNode * prev, int len)
         return ptr;
  }
  
-int
-RS_compile(Regis * r, int issuffix, const char *str)
+void
+RS_compile(Regis * r, bool issuffix, char *str)
  {
-       int                     i,
-                               len = strlen(str);
+       int                     len = strlen(str);
         int                     state = RS_IN_WAIT;
+       char                    *c = (char*)str;
         RegisNode  *ptr = NULL;
  
         memset(r, 0, sizeof(Regis));
         r->issuffix = (issuffix) ? 1 : 0;
  
-       for (i = 0; i < len; i++)
+       while(*c)
         {
-               unsigned char c = *(((unsigned char *) str) + i);
-
                 if (state == RS_IN_WAIT)
                 {
-                       if (isalpha(c))
+                       if (t_isalpha(c))
                         {
                                 if (ptr)
                                         ptr = newRegisNode(ptr, len);
                                 else
                                         ptr = r->node = newRegisNode(NULL, len);
-                               ptr->data[0] = c;
+                               COPYCHAR(ptr->data, c);
                                 ptr->type = RSF_ONEOF;
-                               ptr->len = 1;
+                               ptr->len = pg_mblen(c);
                         }
-                       else if (c == '[')
+                       else if (t_iseq(c,'['))
                         {
                                 if (ptr)
                                         ptr = newRegisNode(ptr, len);
@@ -75,38 +74,39 @@ RS_compile(Regis * r, int issuffix, const char *str)
                                 state = RS_IN_ONEOF;
                         }
                         else
-                               ts_error(ERROR, "Error in regis: %s at pos %d\n", str, i + 1);
+                               ts_error(ERROR, "Error in regis: %s", str );
                 }
                 else if (state == RS_IN_ONEOF)
                 {
-                       if (c == '^')
+                       if (t_iseq(c,'^'))
                         {
                                 ptr->type = RSF_NONEOF;
                                 state = RS_IN_NONEOF;
                         }
-                       else if (isalpha(c))
+                       else if (t_isalpha(c))
                         {
-                               ptr->data[0] = c;
-                               ptr->len = 1;
+                               COPYCHAR(ptr->data, c);
+                               ptr->len = pg_mblen(c);
                                 state = RS_IN_ONEOF_IN;
                         }
                         else
-                               ts_error(ERROR, "Error in regis: %s at pos %d\n", str, i + 1);
+                               ts_error(ERROR, "Error in regis: %s", str);
                 }
                 else if (state == RS_IN_ONEOF_IN || state == RS_IN_NONEOF)
                 {
-                       if (isalpha(c))
+                       if (t_isalpha(c))
                         {
-                               ptr->data[ptr->len] = c;
-                               ptr->len++;
+                               COPYCHAR(ptr->data+ptr->len,  c);
+                               ptr->len+=pg_mblen(c);
                         }
-                       else if (c == ']')
+                       else if (t_iseq(c,']'))
                                 state = RS_IN_WAIT;
                         else
-                               ts_error(ERROR, "Error in regis: %s at pos %d\n", str, i + 1);
+                               ts_error(ERROR, "Error in regis: %s", str);
                 }
                 else
-                       ts_error(ERROR, "Internal error in RS_compile: %d\n", state);
+                       ts_error(ERROR, "Internal error in RS_compile: %d", state);
+               c += pg_mblen(c);
         }
  
         ptr = r->node;
@@ -115,8 +115,6 @@ RS_compile(Regis * r, int issuffix, const char *str)
                 r->nchar++;
                 ptr = ptr->next;
         }
-
-       return 0;
  }
  
  void
@@ -135,51 +133,77 @@ RS_free(Regis * r)
         r->node = NULL;
  }
  
-int
-RS_execute(Regis * r, const char *str, int len)
+#ifdef TS_USE_WIDE
+static bool
+mb_strchr(char *str, char *c) {
+       int clen = pg_mblen(c), plen,i;
+       char    *ptr =str;
+       bool    res=false;
+
+       clen = pg_mblen(c);
+       while( *ptr && !res) {
+               plen = pg_mblen(ptr);
+               if ( plen == clen ) {
+                       i=plen;
+                       res = true;
+                       while(i--)
+                               if ( *(ptr+i) != *(c+i) ) {
+                                       res = false;
+                                       break; 
+                               }
+               }
+               
+               ptr += plen;
+       }        
+
+       return res;     
+}
+#else
+#define mb_strchr(s,c) ( (strchr((s),*(c)) == NULL) ? false : true )
+#endif
+
+
+bool
+RS_execute(Regis * r, char *str)
  {
         RegisNode  *ptr = r->node;
-       unsigned char *c;
+       char *c = str;
+       int len=0;
  
-       if (len < 0)
-               len = strlen(str);
+       while(*c) {
+               len++;
+               c += pg_mblen(c);
+       }       
  
         if (len < r->nchar)
                 return 0;
  
-       if (r->issuffix)
-               c = ((unsigned char *) str) + len - r->nchar;
-       else
-               c = (unsigned char *) str;
+       c = str;
+       if (r->issuffix) {
+               len -= r->nchar;
+               while(len-- > 0)
+                       c += pg_mblen(c);
+       }
+
  
         while (ptr)
         {
                 switch (ptr->type)
                 {
                         case RSF_ONEOF:
-                               if (ptr->len == 0)
-                               {
-                                       if (*c != *(ptr->data))
-                                               return 0;
-                               }
-                               else if (strchr((char *) ptr->data, *c) == NULL)
-                                       return 0;
+                               if ( mb_strchr((char *) ptr->data, c) != true )
+                                       return false;
                                 break;
                         case RSF_NONEOF:
-                               if (ptr->len == 0)
-                               {
-                                       if (*c == *(ptr->data))
-                                               return 0;
-                               }
-                               else if (strchr((char *) ptr->data, *c) != NULL)
-                                       return 0;
+                               if ( mb_strchr((char *) ptr->data, c) == true )
+                                       return false;
                                 break;
                         default:
                                 ts_error(ERROR, "RS_execute: Unknown type node: %d\n", ptr->type);
                 }
                 ptr = ptr->next;
-               c++;
+               c+=pg_mblen(c);
         }
  
-       return 1;
+       return true;
  }
diff --git a/contrib/tsearch2/ispell/regis.h b/contrib/tsearch2/ispell/regis.h

index 6f26f66d46149f6ab0c255a7e359bd043e9190c2..5bc337c9251033a2df1a7a236bc684ac9f082893 100644 (file)
--- a/contrib/tsearch2/ispell/regis.h
+++ b/contrib/tsearch2/ispell/regis.h
@@ -27,12 +27,12 @@ typedef struct Regis
                                 unused:15;
  }      Regis;
  
-int                    RS_isRegis(const char *str);
+bool                   RS_isRegis(const char *str);
  
-int                    RS_compile(Regis * r, int issuffix, const char *str);
+void                   RS_compile(Regis * r, bool issuffix, char *str);
  void           RS_free(Regis * r);
  
-/*возвращает 1 если матчится */
-int                    RS_execute(Regis * r, const char *str, int len);
+/*returns true if matches */
+bool                   RS_execute(Regis * r, char *str);
  
  #endif
diff --git a/contrib/tsearch2/ispell/spell.c b/contrib/tsearch2/ispell/spell.c

index baa36f31f10f988344b7814b141b83cba08410c9..d702dbd9ccafeefca4378c63064cfb2010bc8f5f 100644 (file)
--- a/contrib/tsearch2/ispell/spell.c
+++ b/contrib/tsearch2/ispell/spell.c
@@ -6,6 +6,7 @@
  #include "postgres.h"
  
  #include "spell.h"
+#include "common.h"
  #include "ts_locale.h"
  
  #define MAX_NORM 1024
@@ -13,7 +14,7 @@
  
  #define ERRSTRSIZE     1024
  
-#define STRNCASECMP(x,y)               pg_strncasecmp(x, y, strlen(y))
+#define STRNCMP(s,p)   strncmp( (s), (p), strlen(p) )
  #define GETWCHAR(W,L,N,T) ( ((uint8*)(W))[ ((T)==FF_PREFIX) ? (N) : ( (L) - 1 - (N) ) ] )
  #define GETCHAR(A,N,T)   GETWCHAR( (A)->repl, (A)->replen, N, T )
  
@@ -41,6 +42,18 @@ strnduplicate(char *s, int len)
         return d;
  }
  
+static char *
+findchar(char *str, int c) {
+       while( *str ) {
+               if ( t_iseq(str, c) ) 
+                       return str;
+               str+=pg_mblen(str);
+       }
+
+       return NULL;
+}
+               
+
  /* backward string compare for suffix tree operations */
  static int
  strbcmp(const unsigned char *s1, const unsigned char *s2)
@@ -145,15 +158,17 @@ NIImportDictionary(IspellDict * Conf, const char *filename)
                 char       *s;
                 const char *flag;
  
+               pg_verifymbstr( str, strlen(str), false);
+
                 flag = NULL;
-               if ((s = strchr(str, '/')))
+               if ((s = findchar(str, '/')))
                 {
                         *s++ = '\0';
                         flag = s;
                         while (*s)
                         {
-                               if (isprint((unsigned char) *s) &&
-                                       !isspace((unsigned char) *s))
+                               /* we allow only single encoded flags for faster works */
+                               if (pg_mblen(s) == 1 && t_isprint(s) && !t_isspace(s))
                                         s++;
                                 else
                                 {
@@ -164,16 +179,19 @@ NIImportDictionary(IspellDict * Conf, const char *filename)
                 }
                 else
                         flag = "";
-               lowerstr(str);
-               /* Dont load words if first letter is not required */
-               /* It allows to optimize loading at  search time   */
+
+
                 s = str;
                 while (*s)
                 {
-                       if (*s == '\r' || *s == '\n')
+                       if (t_isspace(s)) {
                                 *s = '\0';
-                       s++;
+                               break;
+                       }
+                       s+=pg_mblen(s);
                 }
+               lowerstr(str);
+
                 NIAddSpell(Conf, str, flag);
         }
         fclose(dict);
@@ -253,11 +271,12 @@ NIAddAffix(IspellDict * Conf, int flag, char flagflags, const char *mask, const
         }
         else
         {
+               int masklen = strlen(mask);
                 Conf->Affix[Conf->naffixes].issimple = 0;
                 Conf->Affix[Conf->naffixes].isregis = 0;
-               Conf->Affix[Conf->naffixes].mask = (char *) malloc(strlen(mask) + 2);
-               if (type == FF_SUFFIX)
-                       sprintf(Conf->Affix[Conf->naffixes].mask, "%s$", mask);
+               Conf->Affix[Conf->naffixes].mask = (char *) malloc(masklen + 2);
+               if (type == FF_SUFFIX) 
+                       sprintf(Conf->Affix[Conf->naffixes].mask, "%s$", mask); 
                 else
                         sprintf(Conf->Affix[Conf->naffixes].mask, "^%s", mask);
         }
@@ -277,37 +296,93 @@ NIAddAffix(IspellDict * Conf, int flag, char flagflags, const char *mask, const
         return (0);
  }
  
-static char *
-remove_spaces(char *dist, char *src)
-{
-       char       *d,
-                          *s;
-
-       d = dist;
-       s = src;
-       while (*s)
-       {
-               if (*s != ' ' && *s != '-' && *s != '\t')
-               {
-                       *d = *s;
-                       d++;
-               }
-               s++;
+#define PAE_WAIT_MASK  0
+#define PAE_INMASK     1
+#define PAE_WAIT_FIND  2
+#define PAE_INFIND     3
+#define PAE_WAIT_REPL  4
+#define PAE_INREPL     5
+
+static bool
+parse_affentry( char *str, char *mask, char *find, char *repl ) {
+       int state = PAE_WAIT_MASK;
+       char    *pmask=mask, *pfind=find, *prepl=repl;
+
+       *mask = *find = *repl = '\0';
+
+       while(*str) {
+               if ( state == PAE_WAIT_MASK ) {
+                       if ( t_iseq(str,'#') ) 
+                               return false;
+                       else if (!t_isspace(str)) {
+                               COPYCHAR(pmask, str);
+                               pmask += pg_mblen(str);
+                               state = PAE_INMASK;
+                       }
+               } else if ( state == PAE_INMASK ) {
+                       if ( t_iseq(str,'>') ) {
+                               *pmask='\0';
+                               state = PAE_WAIT_FIND;
+                       } else if (!t_isspace(str)) {
+                               COPYCHAR(pmask, str);
+                               pmask += pg_mblen(str);
+                       }
+               } else if ( state == PAE_WAIT_FIND ) {
+                       if ( t_iseq(str,'-') ) {
+                               state = PAE_INFIND;
+                       } else if (t_isalpha(str)) {
+                               COPYCHAR(prepl,str);
+                               prepl += pg_mblen(str);
+                               state = PAE_INREPL;
+                       } else if (!t_isspace(str))
+                               ts_error(ERROR, "Affix parse error");
+               } else if ( state == PAE_INFIND ) {
+                       if ( t_iseq(str,',') ) {
+                               *pfind='\0';
+                               state = PAE_WAIT_REPL;
+                       } else if (t_isalpha(str)) {
+                               COPYCHAR(pfind,str);
+                               pfind += pg_mblen(str);
+                       } else if (!t_isspace(str))
+                               ts_error(ERROR, "Affix parse error");
+               } else if ( state == PAE_WAIT_REPL ) {
+                       if ( t_iseq(str,'-') ) {
+                               break; /* void repl */
+                       } else if ( t_isalpha(str) ) {
+                               COPYCHAR(prepl,str);
+                               prepl += pg_mblen(str);
+                               state = PAE_INREPL;
+                       } else if (!t_isspace(str))
+                               ts_error(ERROR, "Affix parse error");
+               } else if ( state == PAE_INREPL ) {
+                       if ( t_iseq(str,'#') ) {
+                               *prepl = '\0';
+                               break;
+                       } else if ( t_isalpha(str) ) { 
+                               COPYCHAR(prepl,str);
+                               prepl += pg_mblen(str);
+                       } else if (!t_isspace(str))
+                               ts_error(ERROR, "Affix parse error");
+               } else
+                       ts_error(ERROR, "Unknown state in parse_affentry: %d", state);
+
+               str += pg_mblen(str);
         }
-       *d = 0;
-       return (dist);
-}
  
+       *pmask = *pfind = *prepl = '\0';
+
+       return ( *mask && ( *find || *repl) ) ? true : false;
+} 
  
  int
  NIImportAffixes(IspellDict * Conf, const char *filename)
  {
         char            str[BUFSIZ];
+       char            tmpstr[BUFSIZ];
         char            mask[BUFSIZ];
         char            find[BUFSIZ];
         char            repl[BUFSIZ];
         char       *s;
-       int                     i;
         int                     suffixes = 0;
         int                     prefixes = 0;
         int                     flag = 0;
@@ -320,37 +395,45 @@ NIImportAffixes(IspellDict * Conf, const char *filename)
  
         while (fgets(str, sizeof(str), affix))
         {
-               if (STRNCASECMP(str, "compoundwords") == 0)
+               pg_verifymbstr( str, strlen(str), false);
+               memcpy(tmpstr, str, 32); /* compoundwords... */
+               tmpstr[32]='\0';
+               lowerstr(tmpstr);
+               if (STRNCMP(tmpstr, "compoundwords") == 0)
                 {
-                       s = strchr(str, 'l');
+                       s = findchar(str, 'l');
                         if (s)
                         {
-                               while (*s != ' ')
-                                       s++;
-                               while (*s == ' ')
-                                       s++;
-                               Conf->compoundcontrol = *s;
+                               while (*s && !t_isspace(s)) s++;
+                               while (*s && t_isspace(s)) s++;
+                               if ( *s && pg_mblen(s) == 1 ) 
+                                       Conf->compoundcontrol = *s;
                                 continue;
                         }
                 }
-               if (STRNCASECMP(str, "suffixes") == 0)
+               if (STRNCMP(tmpstr, "suffixes") == 0)
                 {
                         suffixes = 1;
                         prefixes = 0;
                         continue;
                 }
-               if (STRNCASECMP(str, "prefixes") == 0)
+               if (STRNCMP(tmpstr, "prefixes") == 0)
                 {
                         suffixes = 0;
                         prefixes = 1;
                         continue;
                 }
-               if (STRNCASECMP(str, "flag ") == 0)
+               if (STRNCMP(tmpstr, "flag") == 0)
                 {
-                       s = str + 5;
+                       s = str + 4;
                         flagflags = 0;
-                       while (*s == ' ')
-                               s++;
+
+                       while (*s && t_isspace(s)) s++;
+
+                       /* allow only single-encoded flags */
+                       if ( pg_mblen(s) != 1 )
+                               continue;                       
+
                         if (*s == '*')
                         {
                                 flagflags |= FF_CROSSPRODUCT;
@@ -365,43 +448,23 @@ NIImportAffixes(IspellDict * Conf, const char *filename)
                         if (*s == '\\')
                                 s++;
  
+                       /* allow only single-encoded flags */
+                       if ( pg_mblen(s) != 1 ) {
+                               flagflags = 0;
+                               continue;
+                       }
+
                         flag = (unsigned char) *s;
                         continue;
                 }
                 if ((!suffixes) && (!prefixes))
                         continue;
-               if ((s = strchr(str, '#')))
-                       *s = 0;
-               if (!*str)
-                       continue;
+
                 lowerstr(str);
-               strcpy(mask, "");
-               strcpy(find, "");
-               strcpy(repl, "");
-               i = sscanf(str, "%[^>\n]>%[^,\n],%[^\n]", mask, find, repl);
-               remove_spaces(str, repl);
-               strcpy(repl, str);
-               remove_spaces(str, find);
-               strcpy(find, str);
-               remove_spaces(str, mask);
-               strcpy(mask, str);
-               switch (i)
-               {
-                       case 3:
-                               break;
-                       case 2:
-                               if (*find != '\0')
-                               {
-                                       strcpy(repl, find);
-                                       strcpy(find, "");
-                               }
-                               break;
-                       default:
-                               continue;
-               }
+               if ( !parse_affentry(str, mask, find, repl) )
+                       continue;
  
                 NIAddAffix(Conf, flag, flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX);
-
         }
         fclose(affix);
  
@@ -768,30 +831,28 @@ CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *ne
         {
                 if (Affix->compile)
                 {
-                       RS_compile(&(Affix->reg.regis), (Affix->type == FF_SUFFIX) ? 1 : 0, Affix->mask);
+                       RS_compile(&(Affix->reg.regis), (Affix->type == FF_SUFFIX) ? true : false, Affix->mask);
                         Affix->compile = 0;
                 }
-               if (RS_execute(&(Affix->reg.regis), newword, -1))
+               if (RS_execute(&(Affix->reg.regis), newword))
                         return newword;
         }
         else
         {
-               regmatch_t      subs[2];        /* workaround for apache&linux */
                 int                     err;
                 pg_wchar   *data;
                 size_t          data_len;
-               int                     dat_len;
+               int                     newword_len;
  
                 if (Affix->compile)
                 {
                         int                     wmasklen,
                                                 masklen = strlen(Affix->mask);
                         pg_wchar   *mask;
-
                         mask = (pg_wchar *) palloc((masklen + 1) * sizeof(pg_wchar));
                         wmasklen = pg_mb2wchar_with_len(Affix->mask, mask, masklen);
  
-                       err = pg_regcomp(&(Affix->reg.regex), mask, wmasklen, REG_EXTENDED | REG_ICASE | REG_NOSUB);
+                       err = pg_regcomp(&(Affix->reg.regex), mask, wmasklen, REG_ADVANCED | REG_NOSUB);
                         pfree(mask);
                         if (err)
                         {
@@ -804,11 +865,11 @@ CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *ne
                 }
  
                 /* Convert data string to wide characters */
-               dat_len = strlen(newword);
-               data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
-               data_len = pg_mb2wchar_with_len(newword, data, dat_len);
+               newword_len = strlen(newword);
+               data = (pg_wchar *) palloc((newword_len + 1) * sizeof(pg_wchar));
+               data_len = pg_mb2wchar_with_len(newword, data, newword_len);
  
-               if (!(err = pg_regexec(&(Affix->reg.regex), data, dat_len, 0, NULL, 1, subs, 0)))
+               if (!(err = pg_regexec(&(Affix->reg.regex), data, data_len, 0, NULL, 0, NULL, 0)))
                 {
                         pfree(data);
                         return newword;
diff --git a/contrib/tsearch2/stopword.c b/contrib/tsearch2/stopword.c

index 2a9a464596eee94cea81ffabd65d73c22442cb65..f3894714d22c69fe6caf4e4d661cd409f2b5fd68 100644 (file)
--- a/contrib/tsearch2/stopword.c
+++ b/contrib/tsearch2/stopword.c
@@ -4,8 +4,6 @@
   */
  #include "postgres.h"
  
-#include <ctype.h>
-
  #include "miscadmin.h"
  
  #include "common.h"
@@ -71,6 +69,8 @@ readstoplist(text *in, StopList * s)
                 while (fgets(buf, STOPBUFLEN, hin))
                 {
                         buf[strlen(buf) - 1] = '\0';
+                       pg_verifymbstr( buf, strlen(buf), false );      
+                       lowerstr(buf);
                         if (*buf == '\0')
                                 continue;
  
diff --git a/contrib/tsearch2/ts_locale.h b/contrib/tsearch2/ts_locale.h

index 2d5bc17a961c1bbb76bdc38d3b9cab3689e3acba..8695b27a0eb5bd1d080fca7a9e6a2a146c02900b 100644 (file)
--- a/contrib/tsearch2/ts_locale.h
+++ b/contrib/tsearch2/ts_locale.h
@@ -57,7 +57,7 @@ int _t_isprint( char *ptr );
         int lll = pg_mblen( s );                        \
                                                         \
         while( lll-- )                                  \
-               TOUCHAR(d+lll) = TOUCHAR(s+lll);        \
+               TOUCHAR((d)+lll) = TOUCHAR((s)+lll);    \
  } while(0)
author	Teodor Sigaev <teodor@sigaev.ru>
	Wed, 21 Dec 2005 13:05:49 +0000 (13:05 +0000)
committer	Teodor Sigaev <teodor@sigaev.ru>
	Wed, 21 Dec 2005 13:05:49 +0000 (13:05 +0000)
contrib/tsearch2/ispell/regis.c		patch \| blob \| history
contrib/tsearch2/ispell/regis.h		patch \| blob \| history
contrib/tsearch2/ispell/spell.c		patch \| blob \| history
contrib/tsearch2/stopword.c		patch \| blob \| history
contrib/tsearch2/ts_locale.h		patch \| blob \| history