granicus.if.org Git - postgresql/commitdiff
Improve support of multibyte encoding:
author Teodor Sigaev <teodor@sigaev.ru>
Mon, 12 Dec 2005 11:10:12 +0000 (11:10 +0000)
committer Teodor Sigaev <teodor@sigaev.ru>
Mon, 12 Dec 2005 11:10:12 +0000 (11:10 +0000)
- tsvector_(in|out)
- tsquery_(in|out)
- to_tsvector
- to_tsquery, plainto_tsquery
- 'simple' dictionary

19 files changed:
contrib/tsearch2/dict.h
contrib/tsearch2/dict_ex.c
contrib/tsearch2/dict_ispell.c
contrib/tsearch2/dict_snowball.c
contrib/tsearch2/dict_syn.c
contrib/tsearch2/gendict/dict_snowball.c.IN
contrib/tsearch2/gendict/dict_tmpl.c.IN
contrib/tsearch2/ispell/spell.c
contrib/tsearch2/prs_dcfg.c
contrib/tsearch2/query.c
contrib/tsearch2/query.h
contrib/tsearch2/stopword.c
contrib/tsearch2/ts_locale.c
contrib/tsearch2/ts_locale.h
contrib/tsearch2/ts_stat.c
contrib/tsearch2/tsvector.c
contrib/tsearch2/tsvector_op.c
contrib/tsearch2/wordparser/parser.c
contrib/tsearch2/wordparser/parser.h

index 0227bb484508f7d8087926b6e097a276566ffabb..8aef0b0cb70dd0d919ef1ffd360a8ee4887ed84c 100644 (file)
@@ -14,7 +14,6 @@ void          sortstoplist(StopList * s);
 void           freestoplist(StopList * s);
 void           readstoplist(text *in, StopList * s);
 bool           searchstoplist(StopList * s, char *key);
-char      *lowerstr(char *str);
 
 typedef struct
 {
index 8ec3950f9f88353fd2fbc1086d18c9992723120e..334bb5248d8a7ac56cf759670c682090585ca981 100644 (file)
@@ -6,6 +6,7 @@
 
 #include "dict.h"
 #include "common.h"
+#include "ts_locale.h"
 
 typedef struct
 {
index 28ce70a285e7f49a92ca7051739c01bdb2de6867..0e887da584a81cfc199538213f259b4ab1c8ee82 100644 (file)
@@ -9,6 +9,7 @@
 #include "dict.h"
 #include "common.h"
 #include "ispell/spell.h"
+#include "ts_locale.h"
 
 typedef struct
 {
index 0c08c293d360cd4dd11c0d641dc1d9af61a2ab1e..bbd44246b8ecb4100eec8d7face2c068d2a5021f 100644 (file)
@@ -10,6 +10,7 @@
 #include "snowball/header.h"
 #include "snowball/english_stem.h"
 #include "snowball/russian_stem.h"
+#include "ts_locale.h"
 
 typedef struct
 {
index f3281520809d9fa78dd5979f7b36f5c93e18ef6c..b0c50334eac9a8b72b6e74df8c2dd542d1a2cd59 100644 (file)
@@ -8,6 +8,7 @@
 
 #include "dict.h"
 #include "common.h"
+#include "ts_locale.h"
 
 #define SYNBUFLEN      4096
 typedef struct
index ec25edc0fffeb709075df7ae5f5ef1b0f22bd599..818fd6b15770c07622115b9622c61a452a0db159 100644 (file)
@@ -12,6 +12,7 @@
 #include "common.h"
 #include "snowball/header.h"
 #include "subinclude.h"
+#include "ts_locale.h"
 
 typedef struct {
        struct SN_env *z;
index e534ed30a78ab5560d25cea199d687446ee16f8f..9d90df712bf3796cc2adaf8aded27c3423d92380 100644 (file)
@@ -12,6 +12,7 @@
 #include "common.h"
 
 #include "subinclude.h"
+#include "ts_locale.h"
 
 HASINIT typedef struct {
 HASINIT        StopList        stoplist;
index 9999983cc83632dcf9de0e9ce743d54974f921c5..baa36f31f10f988344b7814b141b83cba08410c9 100644 (file)
@@ -6,6 +6,7 @@
 #include "postgres.h"
 
 #include "spell.h"
+#include "ts_locale.h"
 
 #define MAX_NORM 1024
 #define MAXNORMLEN 256
@@ -30,18 +31,6 @@ cmpspellaffix(const void *s1, const void *s2)
        return (strcmp(((const SPELL *) s1)->p.flag, ((const SPELL *) s2)->p.flag));
 }
 
-static void
-strlower(char *str)
-{
-       unsigned char *ptr = (unsigned char *) str;
-
-       while (*ptr)
-       {
-               *ptr = tolower(*ptr);
-               ptr++;
-       }
-}
-
 static char *
 strnduplicate(char *s, int len)
 {
@@ -175,7 +164,7 @@ NIImportDictionary(IspellDict * Conf, const char *filename)
                }
                else
                        flag = "";
-               strlower(str);
+               lowerstr(str);
                /* Dont load words if first letter is not required */
                /* It allows to optimize loading at  search time   */
                s = str;
@@ -385,7 +374,7 @@ NIImportAffixes(IspellDict * Conf, const char *filename)
                        *s = 0;
                if (!*str)
                        continue;
-               strlower(str);
+               lowerstr(str);
                strcpy(mask, "");
                strcpy(find, "");
                strcpy(repl, "");
@@ -851,7 +840,7 @@ NormalizeSubWord(IspellDict * Conf, char *word, char flag)
 
        if (wrdlen > MAXNORMLEN)
                return NULL;
-       strlower(word);
+       lowerstr(word);
        cur = forms = (char **) palloc(MAX_NORM * sizeof(char *));
        *cur = NULL;
 
index 240aaa44973a508f9ceb5e8ac14d1f053575f1f1..c54ca11803c0bfd1ba602be15eba3176d517bf87 100644 (file)
@@ -8,6 +8,7 @@
 
 #include "dict.h"
 #include "common.h"
+#include "ts_locale.h"
 
 #define CS_WAITKEY     0
 #define CS_INKEY       1
@@ -30,11 +31,11 @@ nstrdup(char *ptr, int len)
        cptr = ptr = res;
        while (*ptr)
        {
-               if (*ptr == '\\')
+               if (t_iseq(ptr, '\\'))
                        ptr++;
-               *cptr = *ptr;
-               ptr++;
-               cptr++;
+               COPYCHAR( cptr, ptr );
+               cptr+=pg_mblen(ptr);
+               ptr+=pg_mblen(ptr);
        }
        *cptr = '\0';
 
@@ -52,9 +53,9 @@ parse_cfgdict(text *in, Map ** m)
 
        while (ptr - VARDATA(in) < VARSIZE(in) - VARHDRSZ)
        {
-               if (*ptr == ',')
+               if ( t_iseq(ptr, ',') )
                        num++;
-               ptr++;
+               ptr+=pg_mblen(ptr);
        }
 
        *m = mptr = (Map *) palloc(sizeof(Map) * (num + 2));
@@ -64,56 +65,56 @@ parse_cfgdict(text *in, Map ** m)
        {
                if (state == CS_WAITKEY)
                {
-                       if (isalpha((unsigned char) *ptr))
+                       if (t_isalpha(ptr))
                        {
                                begin = ptr;
                                state = CS_INKEY;
                        }
-                       else if (!isspace((unsigned char) *ptr))
+                       else if (!t_isspace(ptr))
                                ereport(ERROR,
                                                (errcode(ERRCODE_SYNTAX_ERROR),
                                                 errmsg("syntax error"),
-                                                errdetail("Syntax error in position %d near \"%c\"",
-                                                                  (int) (ptr - VARDATA(in)), *ptr)));
+                                                errdetail("Syntax error in position %d",
+                                                                  (int) (ptr - VARDATA(in)))));
                }
                else if (state == CS_INKEY)
                {
-                       if (isspace((unsigned char) *ptr))
+                       if (t_isspace(ptr))
                        {
                                mptr->key = nstrdup(begin, ptr - begin);
                                state = CS_WAITEQ;
                        }
-                       else if (*ptr == '=')
+                       else if (t_iseq(ptr,'='))
                        {
                                mptr->key = nstrdup(begin, ptr - begin);
                                state = CS_WAITVALUE;
                        }
-                       else if (!isalpha((unsigned char) *ptr))
+                       else if (!t_isalpha(ptr))
                                ereport(ERROR,
                                                (errcode(ERRCODE_SYNTAX_ERROR),
                                                 errmsg("syntax error"),
-                                                errdetail("Syntax error in position %d near \"%c\"",
-                                                                  (int) (ptr - VARDATA(in)), *ptr)));
+                                                errdetail("Syntax error in position %d",
+                                                                  (int) (ptr - VARDATA(in)))));
                }
                else if (state == CS_WAITEQ)
                {
-                       if (*ptr == '=')
+                       if (t_iseq(ptr, '='))
                                state = CS_WAITVALUE;
-                       else if (!isspace((unsigned char) *ptr))
+                       else if (!t_isspace(ptr))
                                ereport(ERROR,
                                                (errcode(ERRCODE_SYNTAX_ERROR),
                                                 errmsg("syntax error"),
-                                                errdetail("Syntax error in position %d near \"%c\"",
-                                                                  (int) (ptr - VARDATA(in)), *ptr)));
+                                                errdetail("Syntax error in position %d",
+                                                                  (int) (ptr - VARDATA(in)))));
                }
                else if (state == CS_WAITVALUE)
                {
-                       if (*ptr == '"')
+                       if (t_iseq(ptr, '"'))
                        {
                                begin = ptr + 1;
                                state = CS_INVALUE;
                        }
-                       else if (!isspace((unsigned char) *ptr))
+                       else if (!t_isspace(ptr))
                        {
                                begin = ptr;
                                state = CS_IN2VALUE;
@@ -121,36 +122,36 @@ parse_cfgdict(text *in, Map ** m)
                }
                else if (state == CS_INVALUE)
                {
-                       if (*ptr == '"')
+                       if (t_iseq(ptr, '"'))
                        {
                                mptr->value = nstrdup(begin, ptr - begin);
                                mptr++;
                                state = CS_WAITDELIM;
                        }
-                       else if (*ptr == '\\')
+                       else if (t_iseq(ptr, '\\'))
                                state = CS_INESC;
                }
                else if (state == CS_IN2VALUE)
                {
-                       if (isspace((unsigned char) *ptr) || *ptr == ',')
+                       if (t_isspace(ptr) || t_iseq(ptr, ','))
                        {
                                mptr->value = nstrdup(begin, ptr - begin);
                                mptr++;
-                               state = (*ptr == ',') ? CS_WAITKEY : CS_WAITDELIM;
+                               state = (t_iseq(ptr, ',')) ? CS_WAITKEY : CS_WAITDELIM;
                        }
-                       else if (*ptr == '\\')
+                       else if (t_iseq(ptr, '\\'))
                                state = CS_INESC;
                }
                else if (state == CS_WAITDELIM)
                {
-                       if (*ptr == ',')
+                       if (t_iseq(ptr, ','))
                                state = CS_WAITKEY;
-                       else if (!isspace((unsigned char) *ptr))
+                       else if (!t_isspace(ptr))
                                ereport(ERROR,
                                                (errcode(ERRCODE_SYNTAX_ERROR),
                                                 errmsg("syntax error"),
-                                                errdetail("Syntax error in position %d near \"%c\"",
-                                                                  (int) (ptr - VARDATA(in)), *ptr)));
+                                                errdetail("Syntax error in position %d",
+                                                                  (int) (ptr - VARDATA(in)))));
                }
                else if (state == CS_INESC)
                        state = CS_INVALUE;
@@ -160,9 +161,9 @@ parse_cfgdict(text *in, Map ** m)
                        ereport(ERROR,
                                        (errcode(ERRCODE_SYNTAX_ERROR),
                                         errmsg("bad parser state"),
-                                        errdetail("%d at position %d near \"%c\"",
-                                                          state, (int) (ptr - VARDATA(in)), *ptr)));
-               ptr++;
+                                        errdetail("%d at position %d",
+                                                          state, (int) (ptr - VARDATA(in)))));
+               ptr+=pg_mblen(ptr);
        }
 
        if (state == CS_IN2VALUE)
index de6d96ed52eb29c94f12ece44843d98d3d21712c..e6285fd9d2ec4b3e8d41b6352c5242ddacc9fc91 100644 (file)
@@ -25,7 +25,7 @@
 #include "query.h"
 #include "query_cleanup.h"
 #include "common.h"
-
+#include "ts_locale.h"
 
 PG_FUNCTION_INFO_V1(tsquery_in);
 Datum          tsquery_in(PG_FUNCTION_ARGS);
@@ -108,24 +108,28 @@ get_weight(char *buf, int2 *weight)
 {
        *weight = 0;
 
-       if (*buf != ':')
+       if ( !t_iseq(buf, ':') )
                return buf;
 
        buf++;
-       while (*buf)
+       while ( *buf && pg_mblen(buf) == 1 )
        {
-               switch (tolower(*buf))
+               switch (*buf)
                {
                        case 'a':
+                       case 'A':
                                *weight |= 1 << 3;
                                break;
                        case 'b':
+                       case 'B':
                                *weight |= 1 << 2;
                                break;
                        case 'c':
+                       case 'C':
                                *weight |= 1 << 1;
                                break;
                        case 'd':
+                       case 'D':
                                *weight |= 1;
                                break;
                        default:
@@ -149,25 +153,25 @@ gettoken_query(QPRS_STATE * state, int4 *val, int4 *lenval, char **strval, int2
                {
                        case WAITFIRSTOPERAND:
                        case WAITOPERAND:
-                               if (*(state->buf) == '!')
+                               if ( t_iseq(state->buf, '!') )
                                {
-                                       (state->buf)++;
+                                       (state->buf)++; /* can safely ++, t_iseq guarantee that pg_mblen()==1 */
                                        *val = (int4) '!';
                                        return OPR;
                                }
-                               else if (*(state->buf) == '(')
+                               else if ( t_iseq(state->buf, '(') )
                                {
                                        state->count++;
                                        (state->buf)++;
                                        return OPEN;
                                }
-                               else if (*(state->buf) == ':')
+                               else if ( t_iseq(state->buf, ':') )
                                {
                                        ereport(ERROR,
                                                        (errcode(ERRCODE_SYNTAX_ERROR),
                                                         errmsg("error at start of operand")));
                                }
-                               else if (*(state->buf) != ' ')
+                               else if ( !t_isspace(state->buf) )
                                {
                                        state->valstate.prsbuf = state->buf;
                                        if (gettoken_tsvector(&(state->valstate)))
@@ -187,14 +191,14 @@ gettoken_query(QPRS_STATE * state, int4 *val, int4 *lenval, char **strval, int2
                                }
                                break;
                        case WAITOPERATOR:
-                               if (*(state->buf) == '&' || *(state->buf) == '|')
+                               if ( t_iseq(state->buf, '&') || t_iseq(state->buf, '|') )
                                {
                                        state->state = WAITOPERAND;
                                        *val = (int4) *(state->buf);
                                        (state->buf)++;
                                        return OPR;
                                }
-                               else if (*(state->buf) == ')')
+                               else if ( t_iseq(state->buf, ')') )
                                {
                                        (state->buf)++;
                                        state->count--;
@@ -202,7 +206,7 @@ gettoken_query(QPRS_STATE * state, int4 *val, int4 *lenval, char **strval, int2
                                }
                                else if (*(state->buf) == '\0')
                                        return (state->count) ? ERR : END;
-                               else if (*(state->buf) != ' ')
+                               else if ( !t_isspace(state->buf) )
                                        return ERR;
                                break;
                        case WAITSINGLEOPERAND:
@@ -217,7 +221,7 @@ gettoken_query(QPRS_STATE * state, int4 *val, int4 *lenval, char **strval, int2
                                return ERR;
                                break;
                }
-               (state->buf)++;
+               state->buf+=pg_mblen(state->buf);
        }
        return END;
 }
@@ -697,8 +701,11 @@ static QUERYTYPE *
 Datum
 tsquery_in(PG_FUNCTION_ARGS)
 {
+       char * in = (char*)PG_GETARG_POINTER(0);
+       pg_verifymbstr( in, strlen(in), false);
+
        SET_FUNCOID();
-       PG_RETURN_POINTER(queryin((char *) PG_GETARG_POINTER(0), pushval_asis, 0, false));
+       PG_RETURN_POINTER(queryin((char *) in, pushval_asis, 0, false));
 }
 
 /*
@@ -732,20 +739,23 @@ infix(INFIX * in, bool first)
        if (in->curpol->type == VAL)
        {
                char       *op = in->op + in->curpol->distance;
+               int             clen;
 
-               RESIZEBUF(in, in->curpol->length * 2 + 2 + 5);
+               RESIZEBUF(in, in->curpol->length * (pg_database_encoding_max_length()+1) + 2 + 5);
                *(in->cur) = '\'';
                in->cur++;
                while (*op)
                {
-                       if (*op == '\'')
+                       if ( t_iseq(op, '\'') )
                        {
                                *(in->cur) = '\\';
                                in->cur++;
                        }
-                       *(in->cur) = *op;
-                       op++;
-                       in->cur++;
+                       COPYCHAR(in->cur,op);
+
+                       clen = pg_mblen(op);
+                       op+=clen;
+                       in->cur+=clen;
                }
                *(in->cur) = '\'';
                in->cur++;
index 9eff69cc719b505f29988f829995cce19e8e691f..b4d586a684b085c043c0eb611a269a73a0edf97d 100644 (file)
@@ -4,7 +4,7 @@
 #define BS_DEBUG
 */
 
-
+#include "ts_locale.h"
 /*
  * item in polish notation with back link
  * to left operand
@@ -38,7 +38,7 @@ typedef struct
 #define GETQUERY(x)  (ITEM*)( (char*)(x)+HDRSIZEQT )
 #define GETOPERAND(x)  ( (char*)GETQUERY(x) + ((QUERYTYPE*)(x))->size * sizeof(ITEM) )
 
-#define ISOPERATOR(x) ( (x)=='!' || (x)=='&' || (x)=='|' || (x)=='(' || (x)==')' )
+#define ISOPERATOR(x) (  pg_mblen(x)==1 && ( *(x)=='!' || *(x)=='&' || *(x)=='|' || *(x)=='(' || *(x)==')' ) )
 
 #define END                            0
 #define ERR                            1
index b8789f9e648ce3999e60ccfe87c0a85539b4d802..2a9a464596eee94cea81ffabd65d73c22442cb65 100644 (file)
 
 #include "common.h"
 #include "dict.h"
+#include "ts_locale.h"
 
 #define STOPBUFLEN     4096
 
-char *
-lowerstr(char *str)
-{
-       char       *ptr = str;
-
-       while (*ptr)
-       {
-               *ptr = tolower(*(unsigned char *) ptr);
-               ptr++;
-       }
-       return str;
-}
-
 void
 freestoplist(StopList * s)
 {
@@ -60,10 +48,16 @@ readstoplist(text *in, StopList * s)
                {
                        char            sharepath[MAXPGPATH];
                        char       *absfn;
+#ifdef WIN32
+                       char    delim = '\\';
+#else
+                       char    delim = '/';
+#endif
 
                        get_share_path(my_exec_path, sharepath);
                        absfn = palloc(strlen(sharepath) + strlen(filename) + 2);
-                       sprintf(absfn, "%s/%s", sharepath, filename);
+                       sprintf(absfn, "%s%c%s", sharepath, delim, filename);
+
                        pfree(filename);
                        filename = absfn;
                }
index 5dc67abc8dc292ad9fc505308083bf75f9ad522d..29c07c0eab6955cb3286467a261dbc95204d4821 100644 (file)
@@ -5,7 +5,9 @@
 #include "mb/pg_wchar.h"
 
 
-#if defined(TS_USE_WIDE) && defined(WIN32)
+#ifdef TS_USE_WIDE
+
+#ifdef WIN32
 
 size_t
 wchar2char(char *to, const wchar_t *from, size_t len)
@@ -69,4 +71,59 @@ char2wchar(wchar_t *to, const char *from, size_t len)
        return mbstowcs(to, from, len);
 }
 
+#endif /* WIN32 */
+
+int
+_t_isalpha( char *ptr ) {
+       wchar_t character;
+
+       char2wchar(&character, ptr, 1);
+
+       return iswalpha( (wint_t)character );   
+}
+
+int
+_t_isprint( char *ptr ) {
+       wchar_t character;
+
+       char2wchar(&character, ptr, 1);
+
+       return iswprint( (wint_t)character );   
+}
+
+#endif /* TS_USE_WIDE */
+
+char *
+lowerstr(char *str)
+{
+       char       *ptr = str;
+
+#ifdef TS_USE_WIDE
+       /*
+        * Use wide char code only when max encoding length > 1 and ctype != C.
+        * Some operating systems fail with multi-byte encodings and a C locale.
+        * Also, for a C locale there is no need to process as multibyte. From
+        * backend/utils/adt/oracle_compat.c Teodor
+        */
+       if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c()) {
+                       wchar_t *wstr, *wptr;
+                       int len = strlen(str);
+
+                       wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len+1));
+                       char2wchar(wstr, str, len+1);
+                       while (*wptr) {
+                               *wptr = towlower((wint_t) *wptr);
+                               wptr++;
+                       }
+                       wchar2char(str, wstr, len);
+                       pfree( wstr );
+       } else
 #endif
+               while (*ptr)
+               {
+                       *ptr = tolower(*(unsigned char *) ptr);
+                       ptr++;
+               }
+       return str;
+}
+
index 905eb94af089eca6a2e7a85417078e2a6ef0e36c..2d5bc17a961c1bbb76bdc38d3b9cab3689e3acba 100644 (file)
@@ -2,6 +2,8 @@
 #define __TSLOCALE_H__
 
 #include "postgres.h"
+#include "utils/pg_locale.h"
+#include "mb/pg_wchar.h"
 
 #include <ctype.h>
 #include <limits.h>
 
 #if defined(HAVE_WCSTOMBS) && defined(HAVE_TOWLOWER)
 #define TS_USE_WIDE
+#endif
+
+#ifdef TS_USE_WIDE
+#endif   /* TS_USE_WIDE */
+
+
+#define TOUCHAR(x)     (*((unsigned char*)(x)))
+
+#ifdef TS_USE_WIDE
 
 #ifdef WIN32
 
 size_t         wchar2char(char *to, const wchar_t *from, size_t len);
 size_t         char2wchar(wchar_t *to, const char *from, size_t len);
-#else                                                  /* WIN32 */
+#else    /* WIN32 */
 
 /* correct mbstowcs */
 #define char2wchar mbstowcs
 #define wchar2char wcstombs
 #endif   /* WIN32 */
-#endif   /* defined(HAVE_WCSTOMBS) &&
-                                                                * defined(HAVE_TOWLOWER) */
+
+#define        t_isdigit(x)    ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
+#define        t_isspace(x)    ( pg_mblen(x)==1 && isspace( TOUCHAR(x) ) )
+int _t_isalpha( char *ptr );
+#define        t_isalpha(x)    ( (pg_mblen(x)==1) ? isalpha( TOUCHAR(x) ) : _t_isalpha(x) )
+int _t_isprint( char *ptr );
+#define        t_isprint(x)    ( (pg_mblen(x)==1) ? isprint( TOUCHAR(x) ) : _t_isprint(x) )
+/*
+ * t_iseq() should be called only for ASCII symbols 
+ */
+#define t_iseq(x,c)    ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false ) 
+
+#define COPYCHAR(d,s)  do {                            \
+       int lll = pg_mblen( s );                        \
+                                                       \
+       while( lll-- )                                  \
+               TOUCHAR(d+lll) = TOUCHAR(s+lll);        \
+} while(0)
+
+               
+#else /* not def TS_USE_WIDE */
+
+#define t_isdigit(x)   isdigit( TOUCHAR(x) )
+#define t_isspace(x)   isspace( TOUCHAR(x) )
+#define t_isalpha(x)   isalpha( TOUCHAR(x) )
+#define t_isprint(x)   isprint( TOUCHAR(x) )
+#define t_iseq(x,c)    ( TOUCHAR(x) == ((unsigned char)(c)) )
+
+#define COPYCHAR(d,s)  TOUCHAR(d) = TOUCHAR(s) 
+
+#endif
+
+char* lowerstr(char *str);
 
 #endif   /* __TSLOCALE_H__ */
index b8ecf96e6db4e23d37aac5dc948f92e3c42c9e1e..ae9575b35322a1c4ebfbbfa67101f7c8b56a4efe 100644 (file)
@@ -8,6 +8,7 @@
 #include "catalog/pg_type.h"
 #include "executor/spi.h"
 #include "common.h"
+#include "ts_locale.h"
 
 PG_FUNCTION_INFO_V1(tsstat_in);
 Datum          tsstat_in(PG_FUNCTION_ARGS);
@@ -476,24 +477,30 @@ ts_stat_sql(text *txt, text *ws)
                buf = VARDATA(ws);
                while (buf - VARDATA(ws) < VARSIZE(ws) - VARHDRSZ)
                {
-                       switch (tolower(*buf))
-                       {
-                               case 'a':
-                                       stat->weight |= 1 << 3;
-                                       break;
-                               case 'b':
-                                       stat->weight |= 1 << 2;
-                                       break;
-                               case 'c':
-                                       stat->weight |= 1 << 1;
-                                       break;
-                               case 'd':
-                                       stat->weight |= 1;
-                                       break;
-                               default:
-                                       stat->weight |= 0;
+                       if ( pg_mblen(buf) == 1 ) {
+                               switch (*buf)
+                               {
+                                       case 'A':
+                                       case 'a':
+                                               stat->weight |= 1 << 3;
+                                               break;
+                                       case 'B':
+                                       case 'b':
+                                               stat->weight |= 1 << 2;
+                                               break;
+                                       case 'C':
+                                       case 'c':
+                                               stat->weight |= 1 << 1;
+                                               break;
+                                       case 'D':
+                                       case 'd':
+                                               stat->weight |= 1;
+                                               break;
+                                       default:
+                                               stat->weight |= 0;
+                               }
                        }
-                       buf++;
+                       buf+=pg_mblen(buf);
                }
        }
 
index cfed6e428a3b1de23c002406326f7318eb41478a..dd895ff38ab0eb6a420edfa40d05302c530b8cba 100644 (file)
@@ -16,8 +16,9 @@
 #include "catalog/namespace.h"
 
 #include "utils/pg_locale.h"
+#include "mb/pg_wchar.h"
 
-#include <ctype.h>                             /* tolower */
+#include <ctype.h>
 #include "tsvector.h"
 #include "query.h"
 #include "ts_cfg.h"
@@ -173,7 +174,7 @@ uniqueentry(WordEntryIN * a, int4 l, char *buf, int4 *outbuflen)
 
 #define RESIZEPRSBUF \
 do { \
-       if ( state->curpos - state->word + 1 >= state->len ) \
+       if ( state->curpos - state->word + pg_database_encoding_max_length() >= state->len ) \
        { \
                int4 clen = state->curpos - state->word; \
                state->len *= 2; \
@@ -182,6 +183,7 @@ do { \
        } \
 } while (0)
 
+
 int4
 gettoken_tsvector(TI_IN_STATE * state)
 {
@@ -197,21 +199,21 @@ gettoken_tsvector(TI_IN_STATE * state)
                {
                        if (*(state->prsbuf) == '\0')
                                return 0;
-                       else if (*(state->prsbuf) == '\'')
+                       else if ( t_iseq(state->prsbuf, '\'') )
                                state->state = WAITENDCMPLX;
-                       else if (*(state->prsbuf) == '\\')
+                       else if ( t_iseq(state->prsbuf, '\\') )
                        {
                                state->state = WAITNEXTCHAR;
                                oldstate = WAITENDWORD;
                        }
-                       else if (state->oprisdelim && ISOPERATOR(*(state->prsbuf)))
+                       else if (state->oprisdelim && ISOPERATOR(state->prsbuf))
                                ereport(ERROR,
                                                (errcode(ERRCODE_SYNTAX_ERROR),
                                                 errmsg("syntax error")));
-                       else if (*(state->prsbuf) != ' ')
+                       else if (!t_isspace(state->prsbuf))
                        {
-                               *(state->curpos) = *(state->prsbuf);
-                               state->curpos++;
+                               COPYCHAR(state->curpos, state->prsbuf);
+                               state->curpos+=pg_mblen(state->prsbuf);
                                state->state = WAITENDWORD;
                        }
                }
@@ -224,20 +226,20 @@ gettoken_tsvector(TI_IN_STATE * state)
                        else
                        {
                                RESIZEPRSBUF;
-                               *(state->curpos) = *(state->prsbuf);
-                               state->curpos++;
+                               COPYCHAR(state->curpos, state->prsbuf);
+                               state->curpos+=pg_mblen(state->prsbuf);
                                state->state = oldstate;
                        }
                }
                else if (state->state == WAITENDWORD)
                {
-                       if (*(state->prsbuf) == '\\')
+                       if ( t_iseq(state->prsbuf, '\\') )
                        {
                                state->state = WAITNEXTCHAR;
                                oldstate = WAITENDWORD;
                        }
-                       else if (*(state->prsbuf) == ' ' || *(state->prsbuf) == '\0' ||
-                                        (state->oprisdelim && ISOPERATOR(*(state->prsbuf))))
+                       else if ( t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' ||
+                                        (state->oprisdelim && ISOPERATOR(state->prsbuf)))
                        {
                                RESIZEPRSBUF;
                                if (state->curpos == state->word)
@@ -247,7 +249,7 @@ gettoken_tsvector(TI_IN_STATE * state)
                                *(state->curpos) = '\0';
                                return 1;
                        }
-                       else if (*(state->prsbuf) == ':')
+                       else if ( t_iseq(state->prsbuf,':') )
                        {
                                if (state->curpos == state->word)
                                        ereport(ERROR,
@@ -262,13 +264,13 @@ gettoken_tsvector(TI_IN_STATE * state)
                        else
                        {
                                RESIZEPRSBUF;
-                               *(state->curpos) = *(state->prsbuf);
-                               state->curpos++;
+                               COPYCHAR(state->curpos, state->prsbuf);
+                               state->curpos+=pg_mblen(state->prsbuf);
                        }
                }
                else if (state->state == WAITENDCMPLX)
                {
-                       if (*(state->prsbuf) == '\'')
+                       if ( t_iseq(state->prsbuf, '\'') )
                        {
                                RESIZEPRSBUF;
                                *(state->curpos) = '\0';
@@ -278,13 +280,13 @@ gettoken_tsvector(TI_IN_STATE * state)
                                                         errmsg("syntax error")));
                                if (state->oprisdelim)
                                {
-                                       state->prsbuf++;
+                                       state->prsbuf+=pg_mblen(state->prsbuf);
                                        return 1;
                                }
                                else
                                        state->state = WAITPOSINFO;
                        }
-                       else if (*(state->prsbuf) == '\\')
+                       else if ( t_iseq(state->prsbuf, '\\') )
                        {
                                state->state = WAITNEXTCHAR;
                                oldstate = WAITENDCMPLX;
@@ -296,20 +298,20 @@ gettoken_tsvector(TI_IN_STATE * state)
                        else
                        {
                                RESIZEPRSBUF;
-                               *(state->curpos) = *(state->prsbuf);
-                               state->curpos++;
+                               COPYCHAR(state->curpos, state->prsbuf);
+                               state->curpos+=pg_mblen(state->prsbuf);
                        }
                }
                else if (state->state == WAITPOSINFO)
                {
-                       if (*(state->prsbuf) == ':')
+                       if ( t_iseq(state->prsbuf, ':') )
                                state->state = INPOSINFO;
                        else
                                return 1;
                }
                else if (state->state == INPOSINFO)
                {
-                       if (isdigit((unsigned char) *(state->prsbuf)))
+                       if (t_isdigit(state->prsbuf))
                        {
                                if (state->alen == 0)
                                {
@@ -338,9 +340,9 @@ gettoken_tsvector(TI_IN_STATE * state)
                }
                else if (state->state == WAITPOSDELIM)
                {
-                       if (*(state->prsbuf) == ',')
+                       if ( t_iseq(state->prsbuf, ',') )
                                state->state = INPOSINFO;
-                       else if (tolower(*(state->prsbuf)) == 'a' || *(state->prsbuf) == '*')
+                       else if ( t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*') )
                        {
                                if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)]))
                                        ereport(ERROR,
@@ -348,7 +350,7 @@ gettoken_tsvector(TI_IN_STATE * state)
                                                         errmsg("syntax error")));
                                WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 3);
                        }
-                       else if (tolower(*(state->prsbuf)) == 'b')
+                       else if ( t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B') )
                        {
                                if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)]))
                                        ereport(ERROR,
@@ -356,7 +358,7 @@ gettoken_tsvector(TI_IN_STATE * state)
                                                         errmsg("syntax error")));
                                WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 2);
                        }
-                       else if (tolower(*(state->prsbuf)) == 'c')
+                       else if ( t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C') )
                        {
                                if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)]))
                                        ereport(ERROR,
@@ -364,7 +366,7 @@ gettoken_tsvector(TI_IN_STATE * state)
                                                         errmsg("syntax error")));
                                WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 1);
                        }
-                       else if (tolower(*(state->prsbuf)) == 'd')
+                       else if ( t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D') )
                        {
                                if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)]))
                                        ereport(ERROR,
@@ -372,10 +374,10 @@ gettoken_tsvector(TI_IN_STATE * state)
                                                         errmsg("syntax error")));
                                WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 0);
                        }
-                       else if (isspace((unsigned char) *(state->prsbuf)) ||
+                       else if (t_isspace(state->prsbuf) ||
                                         *(state->prsbuf) == '\0')
                                return 1;
-                       else if (!isdigit((unsigned char) *(state->prsbuf)))
+                       else if (!t_isdigit(state->prsbuf))
                                ereport(ERROR,
                                                (errcode(ERRCODE_SYNTAX_ERROR),
                                                 errmsg("syntax error")));
@@ -383,7 +385,7 @@ gettoken_tsvector(TI_IN_STATE * state)
                else
                        /* internal error */
                        elog(ERROR, "internal error");
-               state->prsbuf++;
+               state->prsbuf+=pg_mblen(state->prsbuf);
        }
 
        return 0;
@@ -405,6 +407,8 @@ tsvector_in(PG_FUNCTION_ARGS)
                                buflen = 256;
 
        SET_FUNCOID();
+
+       pg_verifymbstr( buf, strlen(buf), false );
        state.prsbuf = buf;
        state.len = 32;
        state.word = (char *) palloc(state.len);
@@ -495,17 +499,16 @@ tsvector_out(PG_FUNCTION_ARGS)
        tsvector   *out = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
        char       *outbuf;
        int4            i,
-                               j,
                                lenbuf = 0,
                                pp;
        WordEntry  *ptr = ARRPTR(out);
-       char       *curin,
+       char       *curbegin, *curin,
                           *curout;
 
        lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ;
        for (i = 0; i < out->size; i++)
        {
-               lenbuf += ptr[i].len * 2 /* for escape */ ;
+               lenbuf += ptr[i].len * 2 * pg_database_encoding_max_length()/* for escape */ ;
                if (ptr[i].haspos)
                        lenbuf += 7 * POSDATALEN(out, &(ptr[i]));
        }
@@ -513,14 +516,14 @@ tsvector_out(PG_FUNCTION_ARGS)
        curout = outbuf = (char *) palloc(lenbuf);
        for (i = 0; i < out->size; i++)
        {
-               curin = STRPTR(out) + ptr->pos;
+               curbegin = curin = STRPTR(out) + ptr->pos;
                if (i != 0)
                        *curout++ = ' ';
                *curout++ = '\'';
-               j = ptr->len;
-               while (j--)
+               while ( curin-curbegin < ptr->len )
                {
-                       if (*curin == '\'')
+                       int len = pg_mblen(curin);
+                       if ( t_iseq(curin, '\'') )
                        {
                                int4            pos = curout - outbuf;
 
@@ -528,7 +531,8 @@ tsvector_out(PG_FUNCTION_ARGS)
                                curout = outbuf + pos;
                                *curout++ = '\\';
                        }
-                       *curout++ = *curin++;
+                       while(len--)
+                               *curout++ = *curin++;
                }
                *curout++ = '\'';
                if ((pp = POSDATALEN(out, ptr)) != 0)
index b2562e8984bcf82545a9568a6b44c673d36d3fc5..c9119753941cb32ea33d10560f1c1f1bb271e22a 100644 (file)
@@ -15,7 +15,6 @@
 
 #include "utils/pg_locale.h"
 
-#include <ctype.h>                             /* tolower */
 #include "tsvector.h"
 #include "query.h"
 #include "ts_cfg.h"
@@ -76,17 +75,21 @@ setweight(PG_FUNCTION_ARGS)
        WordEntryPos *p;
        int                     w = 0;
 
-       switch (tolower(cw))
+       switch (cw)
        {
+               case 'A':
                case 'a':
                        w = 3;
                        break;
+               case 'B':
                case 'b':
                        w = 2;
                        break;
+               case 'C':
                case 'c':
                        w = 1;
                        break;
+               case 'D':
                case 'd':
                        w = 0;
                        break;
index 23b031be79671f5245ed659f7d40c943950ec2fe..8a5fcdabe66aae9eee5ab9ef6ca51767ec474fa7 100644 (file)
@@ -71,8 +71,11 @@ TParserClose(TParser * prs)
                prs->state = ptr;
        }
 
+#ifdef TS_USE_WIDE
        if (prs->wstr)
                pfree(prs->wstr);
+#endif
+
        pfree(prs);
 }
 
index 923edea5896bf4931ebfe2ba5c66ff5fbb2be81e..baeabf72cd7d2e16b51bd14bf50ea0d423deb063 100644 (file)
@@ -134,8 +134,10 @@ typedef struct TParser
        /* string and position information */
        char       *str;                        /* multibyte string */
        int                     lenstr;                 /* length of mbstring */
+#ifdef TS_USE_WIDE
        wchar_t    *wstr;                       /* wide character string */
        int                     lenwstr;                /* length of wstring */
+#endif
 
        /* State of parse */
        int                     charmaxlen;