granicus.if.org Git - postgresql/commitdiff
Fix localization support for multibyte encoding and C locale.
author Teodor Sigaev <teodor@sigaev.ru>
Mon, 15 Jan 2007 15:16:28 +0000 (15:16 +0000)
committer Teodor Sigaev <teodor@sigaev.ru>
Mon, 15 Jan 2007 15:16:28 +0000 (15:16 +0000)
Slightly reworked patch from Tatsuo Ishii

contrib/tsearch2/ts_locale.c
contrib/tsearch2/ts_locale.h
contrib/tsearch2/wordparser/parser.c

index cac5317a1057140b6a598e6882fbd304786c9021..cb022d7e2a46458b2d0def4819cfb6a5f8e23fcc 100644 (file)
 size_t
 wchar2char(char *to, const wchar_t *from, size_t len)
 {
+       if (len == 0)
+               return 0;
+
        if (GetDatabaseEncoding() == PG_UTF8)
        {
                int                     r;
 
-               if (len == 0)
-                       return 0;
-
                r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len,
                                                                NULL, NULL);
 
@@ -34,17 +34,19 @@ wchar2char(char *to, const wchar_t *from, size_t len)
 
        return wcstombs(to, from, len);
 }
+#endif   /* WIN32 */
 
 size_t
 char2wchar(wchar_t *to, const char *from, size_t len)
 {
+       if (len == 0)
+               return 0;
+
+#ifdef WIN32
        if (GetDatabaseEncoding() == PG_UTF8)
        {
                int                     r;
 
-               if (len == 0)
-                       return 0;
-
                r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len);
 
                if (!r)
@@ -60,29 +62,44 @@ char2wchar(wchar_t *to, const char *from, size_t len)
 
                return r;
        }
+       else 
+#endif /* WIN32 */
+       if ( lc_ctype_is_c() )
+       {
+               /*
+                * pg_mb2wchar_with_len always adds trailing '\0', so 
+                * 'to' should be allocated with sufficient space 
+                */
+               return pg_mb2wchar_with_len(from, (pg_wchar *)to, len);
+       }
 
        return mbstowcs(to, from, len);
 }
-#endif   /* WIN32 */
 
 int
 _t_isalpha(const char *ptr)
 {
-       wchar_t         character;
+       wchar_t         character[2];
+
+       if (lc_ctype_is_c())
+               return isalpha(TOUCHAR(ptr));
 
-       char2wchar(&character, ptr, 1);
+       char2wchar(character, ptr, 1);
 
-       return iswalpha((wint_t) character);
+       return iswalpha((wint_t) *character);
 }
 
 int
 _t_isprint(const char *ptr)
 {
-       wchar_t         character;
+       wchar_t         character[2];
+
+       if (lc_ctype_is_c())
+               return isprint(TOUCHAR(ptr));
 
-       char2wchar(&character, ptr, 1);
+       char2wchar(character, ptr, 1);
 
-       return iswprint((wint_t) character);
+       return iswprint((wint_t) *character);
 }
 #endif   /* TS_USE_WIDE */
 
@@ -126,7 +143,7 @@ lowerstr(char *str)
                if ( wlen < 0 )
                        ereport(ERROR,
                                        (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
-                                        errmsg("transalation failed from server encoding to wchar_t")));
+                                        errmsg("translation failed from server encoding to wchar_t")));
 
                Assert(wlen<=len);
                wstr[wlen] = 0;
@@ -152,7 +169,7 @@ lowerstr(char *str)
                if ( wlen < 0 )
                        ereport(ERROR,
                                        (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
-                                        errmsg("transalation failed from wchar_t to server encoding %d", errno)));
+                                        errmsg("translation failed from wchar_t to server encoding %d", errno)));
                Assert(wlen<=len);
                out[wlen]='\0';
        }
index e2e22481370162bed9789b5acfec480302eb44c9..81d1a1660059bfb08c108011ca950881c0da560a 100644 (file)
 #define TOUCHAR(x)     (*((unsigned char*)(x)))
 
 #ifdef TS_USE_WIDE
+size_t         char2wchar(wchar_t *to, const char *from, size_t len);
 
 #ifdef WIN32
 
 size_t         wchar2char(char *to, const wchar_t *from, size_t len);
-size_t         char2wchar(wchar_t *to, const char *from, size_t len);
+
 #else                                                  /* WIN32 */
 
-/* correct mbstowcs */
-#define char2wchar mbstowcs
+/* correct wcstombs */
 #define wchar2char wcstombs
+
 #endif   /* WIN32 */
 
 #define t_isdigit(x)   ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
@@ -55,10 +56,10 @@ extern int  _t_isprint(const char *ptr);
  */
 #define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false )
 
-#define COPYCHAR(d,s)  do {                            \
-       int lll = pg_mblen( s );                        \
-                                                       \
-       while( lll-- )                                  \
+#define COPYCHAR(d,s)  do {                                    \
+       int lll = pg_mblen( s );                                        \
+                                                                                               \
+       while( lll-- )                                                          \
                TOUCHAR((d)+lll) = TOUCHAR((s)+lll);    \
 } while(0)
 
index fced41ec5e8bac0d71f8e1b8a6ce90ce3084dc2d..3706a0efb7259bf183e34946c7e0480b704dbfe5 100644 (file)
@@ -1,4 +1,4 @@
-/* $PostgreSQL: pgsql/contrib/tsearch2/wordparser/parser.c,v 1.11 2006/10/04 00:29:47 momjian Exp $ */
+/* $PostgreSQL: pgsql/contrib/tsearch2/wordparser/parser.c,v 1.12 2007/01/15 15:16:28 teodor Exp $ */
 
 #include "postgres.h"
 
@@ -40,16 +40,13 @@ TParserInit(char *str, int len)
 #ifdef TS_USE_WIDE
 
        /*
-        * Use wide char code only when max encoding length > 1 and ctype != C.
-        * Some operating systems fail with multi-byte encodings and a C locale.
-        * Also, for a C locale there is no need to process as multibyte. From
-        * backend/utils/adt/oracle_compat.c Teodor
+        * Use wide char code only when max encoding length > 1.
         */
 
-       if (prs->charmaxlen > 1 && !lc_ctype_is_c())
+       if (prs->charmaxlen > 1)
        {
                prs->usewide = true;
-               prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr);
+               prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr+1));
                prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr);
        }
        else
@@ -83,25 +80,99 @@ TParserClose(TParser * prs)
 
 /*
  * defining support function, equvalent is* macroses, but
- * working with any possible encodings and locales
+ * working with any possible encodings and locales. Note,
+ * that with multibyte encoding and C-locale isw* function may fail
+ * or give wrong result. Note 2: multibyte encoding and C-locale 
+ * often are used for Asian languages.
  */
 
 #ifdef TS_USE_WIDE
 
-#define p_iswhat(type)                                                                         \
-static int                                                                                     \
-p_is##type(TParser *prs) {                                                                     \
-       Assert( prs->state );                                                                   \
-       return ( ( prs->usewide ) ? isw##type( (wint_t)*( prs->wstr + prs->state->poschar ) ) : \
-               is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) );               \
-}      \
-                                                                                               \
-static int                                                                                     \
-p_isnot##type(TParser *prs) {                                                                  \
-       return !p_is##type(prs);                                                                \
+#define p_iswhat(type)                                                                                                         \
+static int                                                                                                                                     \
+p_is##type(TParser *prs) {                                                                                                     \
+       Assert( prs->state );                                                                                                   \
+       if ( prs->usewide )                                                                                                             \
+       {                                                                                                                                               \
+               if ( lc_ctype_is_c() )                                                                                          \
+                       return is##type( 0xff & *( prs->wstr + prs->state->poschar) );  \
+                                                                                                                                                       \
+               return isw##type( *(wint_t*)( prs->wstr + prs->state->poschar ) );      \
+       }                                                                                                                                               \
+                                                                                                                                                       \
+       return is##type( *(unsigned char*)( prs->str + prs->state->posbyte ) ); \
+}                                                                                                                                                      \
+                                                                                                                                                       \
+static int                                                                                                                                     \
+p_isnot##type(TParser *prs) {                                                                                          \
+       return !p_is##type(prs);                                                                                                \
 }
 
+static int 
+p_isalnum(TParser *prs)
+{
+       Assert( prs->state );
+
+       if (prs->usewide)
+       {
+               if (lc_ctype_is_c())
+               {
+                       unsigned int c = *(unsigned int*)(prs->wstr + prs->state->poschar);
+
+                       /*
+                        * any non-ascii symbol with multibyte encoding
+                        * with C-locale is an alpha character
+                        */
+                       if ( c > 0x7f )
+                               return 1;
+
+                       return isalnum(0xff & c);
+               }
+
+               return iswalnum( (wint_t)*( prs->wstr + prs->state->poschar));
+       }
 
+       return isalnum( *(unsigned char*)( prs->str + prs->state->posbyte ));
+}
+
+static int
+p_isnotalnum(TParser *prs)
+{
+       return !p_isalnum(prs);
+}
+
+static int 
+p_isalpha(TParser *prs)
+{
+       Assert( prs->state );
+
+       if (prs->usewide)
+       {
+               if (lc_ctype_is_c())
+               {
+                       unsigned int c = *(prs->wstr + prs->state->poschar);
+
+                       /*
+                        * any non-ascii symbol with multibyte encoding
+                        * with C-locale is an alpha character
+                        */
+                       if ( c > 0x7f )
+                               return 1;
+
+                       return isalpha(0xff & c);
+               }
+
+               return iswalpha( (wint_t)*( prs->wstr + prs->state->poschar));
+       }
+
+       return isalpha( *(unsigned char*)( prs->str + prs->state->posbyte ));
+}
+
+static int
+p_isnotalpha(TParser *prs)
+{
+       return !p_isalpha(prs);
+}
 
 /* p_iseq should be used only for ascii symbols */
 
@@ -111,18 +182,19 @@ p_iseq(TParser * prs, char c)
        Assert(prs->state);
        return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
 }
+
 #else                                                  /* TS_USE_WIDE */
 
-#define p_iswhat(type)                                                                         \
-static int                                                                                     \
-p_is##type(TParser *prs) {                                                                     \
-       Assert( prs->state );                                                                   \
-       return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) );                  \
-}      \
-                                                                                               \
-static int                                                                                     \
-p_isnot##type(TParser *prs) {                                                                  \
-       return !p_is##type(prs);                                                                \
+#define p_iswhat(type)                                                                                                         \
+static int                                                                                                                                     \
+p_is##type(TParser *prs) {                                                                                                     \
+       Assert( prs->state );                                                                                                   \
+       return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) );  \
+}                                                                                                                                                      \
+                                                                                                                                                       \
+static int                                                                                                                                     \
+p_isnot##type(TParser *prs) {                                                                                          \
+       return !p_is##type(prs);                                                                                                \
 }
 
 
@@ -132,10 +204,12 @@ p_iseq(TParser * prs, char c)
        Assert(prs->state);
        return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0;
 }
-#endif   /* TS_USE_WIDE */
 
 p_iswhat(alnum)
 p_iswhat(alpha)
+
+#endif   /* TS_USE_WIDE */
+
 p_iswhat(digit)
 p_iswhat(lower)
 p_iswhat(print)