Clean up ts_locale.h/.c. Fix broken and not-consistent-across-platforms behavior of wchar2char/char2wchar.

author Tom Lane <tgl@sss.pgh.pa.us>

Fri, 9 Nov 2007 22:37:35 +0000 (22:37 +0000)

committer Tom Lane <tgl@sss.pgh.pa.us>

Fri, 9 Nov 2007 22:37:35 +0000 (22:37 +0000)
author Tom Lane <tgl@sss.pgh.pa.us>
Fri, 9 Nov 2007 22:37:35 +0000 (22:37 +0000)
committer Tom Lane <tgl@sss.pgh.pa.us>
Fri, 9 Nov 2007 22:37:35 +0000 (22:37 +0000)
diff --git a/src/backend/tsearch/ts_locale.c b/src/backend/tsearch/ts_locale.c

index 361152e6bec9be1d7ccfe8aefee09213d92dba86..784cc17edd2424f73ebc8d814b706b88aca1df7a 100644 (file)
--- a/src/backend/tsearch/ts_locale.c
+++ b/src/backend/tsearch/ts_locale.c
@@ -1,13 +1,13 @@
  /*-------------------------------------------------------------------------
   *
   * ts_locale.c
- *             locale compatiblility layer for tsearch
+ *             locale compatibility layer for tsearch
   *
   * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
   *
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/tsearch/ts_locale.c,v 1.2 2007/08/25 00:03:59 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/tsearch/ts_locale.c,v 1.3 2007/11/09 22:37:35 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -16,41 +16,56 @@
  #include "tsearch/ts_locale.h"
  #include "tsearch/ts_public.h"
  
-#ifdef TS_USE_WIDE
  
-#ifdef WIN32
+#ifdef TS_USE_WIDE
  
+/*
+ * wchar2char --- convert wide characters to multibyte format
+ *
+ * This has the same API as the standard wcstombs() function; in particular,
+ * tolen is the maximum number of bytes to store at *to, and *from should be
+ * zero-terminated.  The output will be zero-terminated iff there is room.
+ */
  size_t
-wchar2char(char *to, const wchar_t *from, size_t len)
+wchar2char(char *to, const wchar_t *from, size_t tolen)
  {
-       if (len == 0)
+       if (tolen == 0)
                 return 0;
  
+#ifdef WIN32
         if (GetDatabaseEncoding() == PG_UTF8)
         {
                 int                     r;
  
-               r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len,
+               r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen,
                                                                 NULL, NULL);
  
-               if (r == 0)
-                       ereport(ERROR,
-                                       (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
-                                        errmsg("UTF-16 to UTF-8 translation failed: %lu",
-                                                       GetLastError())));
-               Assert(r <= len);
+               if (r <= 0)
+                       return (size_t) -1;
+
+               Assert(r <= tolen);
  
-               return r;
+               /* Microsoft counts the zero terminator in the result */
+               return r-1;
         }
+#endif   /* WIN32 */
  
-       return wcstombs(to, from, len);
+       return wcstombs(to, from, tolen);
  }
-#endif   /* WIN32 */
  
+/*
+ * char2wchar --- convert multibyte characters to wide characters
+ *
+ * This has almost the API of mbstowcs(), except that *from need not be
+ * null-terminated; instead, the number of input bytes is specified as
+ * fromlen.  Also, we ereport() rather than returning -1 for invalid
+ * input encoding.  tolen is the maximum number of wchar_t's to store at *to.
+ * The output will be zero-terminated iff there is room.
+ */
  size_t
-char2wchar(wchar_t *to, const char *from, size_t len)
+char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen)
  {
-       if (len == 0)
+       if (tolen == 0)
                 return 0;
  
  #ifdef WIN32
@@ -58,71 +73,117 @@ char2wchar(wchar_t *to, const char *from, size_t len)
         {
                 int                     r;
  
-               r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len);
+               r = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen);
  
-               if (!r)
+               if (r <= 0)
                 {
-                       pg_verifymbstr(from, len, false);
+                       pg_verifymbstr(from, fromlen, false);
                         ereport(ERROR,
                                         (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
                                          errmsg("invalid multibyte character for locale"),
                                          errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
                 }
  
-               Assert(r <= len);
+               Assert(r <= tolen);
  
-               return r;
+               /* Microsoft counts the zero terminator in the result */
+               return r-1;
         }
-       else
  #endif   /* WIN32 */
+
         if (lc_ctype_is_c())
         {
                 /*
                  * pg_mb2wchar_with_len always adds trailing '\0', so 'to' should be
                  * allocated with sufficient space
                  */
-               return pg_mb2wchar_with_len(from, (pg_wchar *) to, len);
+               return pg_mb2wchar_with_len(from, (pg_wchar *) to, fromlen);
         }
         else
         {
                 /*
-                * mbstowcs require ending '\0'
+                * mbstowcs requires ending '\0'
                  */
-               char       *str = pnstrdup(from, len);
-               size_t          tolen;
+               char       *str = pnstrdup(from, fromlen);
+               size_t          result;
+
+               result = mbstowcs(to, str, tolen);
  
-               tolen = mbstowcs(to, str, len);
                 pfree(str);
  
-               return tolen;
+               if (result == (size_t) -1)
+               {
+                       pg_verifymbstr(from, fromlen, false);
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+                                        errmsg("invalid multibyte character for locale"),
+                                        errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
+               }
+
+               if (result < tolen)
+                       to[result] = 0;
+
+               return result;
         }
  }
  
+
  int
-_t_isalpha(const char *ptr)
+t_isdigit(const char *ptr)
  {
+       int                     clen = pg_mblen(ptr);
         wchar_t         character[2];
  
-       if (lc_ctype_is_c())
+       if (clen == 1 || lc_ctype_is_c())
+               return isdigit(TOUCHAR(ptr));
+
+       char2wchar(character, 2, ptr, clen);
+
+       return iswdigit((wint_t) character[0]);
+}
+
+int
+t_isspace(const char *ptr)
+{
+       int                     clen = pg_mblen(ptr);
+       wchar_t         character[2];
+
+       if (clen == 1 || lc_ctype_is_c())
+               return isspace(TOUCHAR(ptr));
+
+       char2wchar(character, 2, ptr, clen);
+
+       return iswspace((wint_t) character[0]);
+}
+
+int
+t_isalpha(const char *ptr)
+{
+       int                     clen = pg_mblen(ptr);
+       wchar_t         character[2];
+
+       if (clen == 1 || lc_ctype_is_c())
                 return isalpha(TOUCHAR(ptr));
  
-       char2wchar(character, ptr, 1);
+       char2wchar(character, 2, ptr, clen);
  
-       return iswalpha((wint_t) *character);
+       return iswalpha((wint_t) character[0]);
  }
  
  int
-_t_isprint(const char *ptr)
+t_isprint(const char *ptr)
  {
+       int                     clen = pg_mblen(ptr);
         wchar_t         character[2];
  
-       if (lc_ctype_is_c())
+       if (clen == 1 || lc_ctype_is_c())
                 return isprint(TOUCHAR(ptr));
  
-       char2wchar(character, ptr, 1);
+       char2wchar(character, 2, ptr, clen);
  
-       return iswprint((wint_t) *character);
+       return iswprint((wint_t) character[0]);
  }
+
  #endif   /* TS_USE_WIDE */
  
  
@@ -168,19 +229,27 @@ t_readline(FILE *fp)
         return recoded;
  }
  
+/*
+ * lowerstr --- fold null-terminated string to lower case
+ *
+ * Returned string is palloc'd
+ */
  char *
-lowerstr(char *str)
+lowerstr(const char *str)
  {
         return lowerstr_with_len(str, strlen(str));
  }
  
  /*
+ * lowerstr_with_len --- fold string to lower case
+ *
+ * Input string need not be null-terminated.
+ *
   * Returned string is palloc'd
   */
  char *
-lowerstr_with_len(char *str, int len)
+lowerstr_with_len(const char *str, int len)
  {
-       char       *ptr = str;
         char       *out;
  
         if (len == 0)
@@ -202,23 +271,13 @@ lowerstr_with_len(char *str, int len)
  
                 /*
                  * alloc number of wchar_t for worst case, len contains number of
-                * bytes <= number of characters and alloc 1 wchar_t for 0, because
-                * wchar2char(wcstombs in really) wants zero-terminated string
+                * bytes >= number of characters and alloc 1 wchar_t for 0, because
+                * wchar2char wants zero-terminated string
                  */
                 wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1));
  
-               /*
-                * str SHOULD be cstring, so wlen contains number of converted
-                * character
-                */
-               wlen = char2wchar(wstr, str, len);
-               if (wlen < 0)
-                       ereport(ERROR,
-                                       (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
-                         errmsg("translation failed from server encoding to wchar_t")));
-
+               wlen = char2wchar(wstr, len+1, str, len);
                 Assert(wlen <= len);
-               wstr[wlen] = 0;
  
                 while (*wptr)
                 {
@@ -229,31 +288,29 @@ lowerstr_with_len(char *str, int len)
                 /*
                  * Alloc result string for worst case + '\0'
                  */
-               len = sizeof(char) * pg_database_encoding_max_length() *(wlen + 1);
+               len = pg_database_encoding_max_length() * wlen + 1;
                 out = (char *) palloc(len);
  
-               /*
-                * wlen now is number of bytes which is always >= number of characters
-                */
                 wlen = wchar2char(out, wstr, len);
+
                 pfree(wstr);
  
                 if (wlen < 0)
                         ereport(ERROR,
                                         (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
-                                        errmsg("translation failed from wchar_t to server encoding %d", errno)));
-               Assert(wlen <= len);
-               out[wlen] = '\0';
+                                        errmsg("translation from wchar_t to server encoding failed: %m")));
+               Assert(wlen < len);
         }
         else
-#endif
+#endif   /* TS_USE_WIDE */
         {
+               const char *ptr = str;
                 char       *outptr;
  
                 outptr = out = (char *) palloc(sizeof(char) * (len + 1));
-               while (*ptr && ptr - str < len)
+               while ((ptr - str) < len && *ptr)
                 {
-                       *outptr++ = tolower(*(unsigned char *) ptr);
+                       *outptr++ = tolower(TOUCHAR(ptr));
                         ptr++;
                 }
                 *outptr = '\0';
diff --git a/src/backend/tsearch/ts_utils.c b/src/backend/tsearch/ts_utils.c

index 781146886a30e0ec5c90f6293ffee22e78226d4d..6c989474202cae395980fdfb6e4ac3fa2620cfd5 100644 (file)
--- a/src/backend/tsearch/ts_utils.c
+++ b/src/backend/tsearch/ts_utils.c
@@ -7,7 +7,7 @@
   *
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/tsearch/ts_utils.c,v 1.4 2007/09/04 02:16:56 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/tsearch/ts_utils.c,v 1.5 2007/11/09 22:37:35 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -75,7 +75,7 @@ comparestr(const void *a, const void *b)
   * or palloc a new version.
   */
  void
-readstoplist(const char *fname, StopList *s, char *(*wordop) (char *))
+readstoplist(const char *fname, StopList *s, char *(*wordop) (const char *))
  {
         char      **stop = NULL;
  
diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c

index 086ac95155801a9c7cb54e5075787b777e80d7f8..b79056ca688c668d41064166dc6df5fcb5be2964 100644 (file)
--- a/src/backend/tsearch/wparser_def.c
+++ b/src/backend/tsearch/wparser_def.c
@@ -7,7 +7,7 @@
   *
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.7 2007/10/27 19:03:45 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.8 2007/11/09 22:37:35 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -294,12 +294,12 @@ TParserInit(char *str, int len)
         /*
          * Use wide char code only when max encoding length > 1.
          */
-
         if (prs->charmaxlen > 1)
         {
                 prs->usewide = true;
                 prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
-               prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr);
+               prs->lenwstr = char2wchar(prs->wstr, prs->lenstr + 1,
+                                                                 prs->str, prs->lenstr);
         }
         else
  #endif
diff --git a/src/include/tsearch/ts_locale.h b/src/include/tsearch/ts_locale.h

index dcae2af93a4ded4cc8787f3d018d7b227ec66cf8..cea3830a0f103724371f858573c4612b249814de 100644 (file)
--- a/src/include/tsearch/ts_locale.h
+++ b/src/include/tsearch/ts_locale.h
@@ -1,15 +1,14 @@
  /*-------------------------------------------------------------------------
   *
   * ts_locale.h
- *    helper utilities for tsearch
+ *             locale compatibility layer for tsearch
   *
   * Copyright (c) 1998-2007, PostgreSQL Global Development Group
   *
- * $PostgreSQL: pgsql/src/include/tsearch/ts_locale.h,v 1.2 2007/08/25 00:03:59 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/tsearch/ts_locale.h,v 1.3 2007/11/09 22:37:35 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
-
  #ifndef __TSLOCALE_H__
  #define __TSLOCALE_H__
  
@@ -34,55 +33,37 @@
  #define TS_USE_WIDE
  #endif
  
-#define TOUCHAR(x)     (*((unsigned char*)(x)))
+#define TOUCHAR(x)     (*((const unsigned char *) (x)))
  
  #ifdef TS_USE_WIDE
  
-extern size_t char2wchar(wchar_t *to, const char *from, size_t len);
-
-#ifdef WIN32
-
-extern size_t wchar2char(char *to, const wchar_t *from, size_t len);
-#else                                                  /* WIN32 */
-
-/* correct wcstombs */
-#define wchar2char wcstombs
+extern size_t wchar2char(char *to, const wchar_t *from, size_t tolen);
+extern size_t char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen);
  
-#endif   /* WIN32 */
+extern int     t_isdigit(const char *ptr);
+extern int     t_isspace(const char *ptr);
+extern int     t_isalpha(const char *ptr);
+extern int     t_isprint(const char *ptr);
  
-#define t_isdigit(x)   ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
-#define t_isspace(x)   ( pg_mblen(x)==1 && isspace( TOUCHAR(x) ) )
-extern int     _t_isalpha(const char *ptr);
+/* The second argument of t_iseq() must be a plain ASCII character */
+#define t_iseq(x,c)            (TOUCHAR(x) == (unsigned char) (c))
  
-#define t_isalpha(x)   ( (pg_mblen(x)==1) ? isalpha( TOUCHAR(x) ) : _t_isalpha(x) )
-extern int     _t_isprint(const char *ptr);
+#define COPYCHAR(d,s)  memcpy(d, s, pg_mblen(s))
  
-#define t_isprint(x)   ( (pg_mblen(x)==1) ? isprint( TOUCHAR(x) ) : _t_isprint(x) )
-/*
- * t_iseq() should be called only for ASCII symbols
- */
-#define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false )
+#else  /* not TS_USE_WIDE */
  
-#define COPYCHAR(d,s)  do {                                    \
-       int lll = pg_mblen( s );                                        \
-                                                                                               \
-       while( lll-- )                                                          \
-               TOUCHAR((d)+lll) = TOUCHAR((s)+lll);    \
-} while(0)
+#define t_isdigit(x)   isdigit(TOUCHAR(x))
+#define t_isspace(x)   isspace(TOUCHAR(x))
+#define t_isalpha(x)   isalpha(TOUCHAR(x))
+#define t_isprint(x)   isprint(TOUCHAR(x))
+#define t_iseq(x,c)            (TOUCHAR(x) == (unsigned char) (c))
  
-#else                                                  /* not def TS_USE_WIDE */
+#define COPYCHAR(d,s)  (*((unsigned char *) (d)) = TOUCHAR(s))
  
-#define t_isdigit(x)   isdigit( TOUCHAR(x) )
-#define t_isspace(x)   isspace( TOUCHAR(x) )
-#define t_isalpha(x)   isalpha( TOUCHAR(x) )
-#define t_isprint(x)   isprint( TOUCHAR(x) )
-#define t_iseq(x,c) ( TOUCHAR(x) == ((unsigned char)(c)) )
-
-#define COPYCHAR(d,s)  TOUCHAR(d) = TOUCHAR(s)
-#endif
+#endif /* TS_USE_WIDE */
  
-extern char *lowerstr(char *str);
-extern char *lowerstr_with_len(char *str, int len);
+extern char *lowerstr(const char *str);
+extern char *lowerstr_with_len(const char *str, int len);
  extern char *t_readline(FILE *fp);
  
  #endif   /* __TSLOCALE_H__ */
diff --git a/src/include/tsearch/ts_public.h b/src/include/tsearch/ts_public.h

index ab19de7924f05037e9e7a572d067b1070f2dfe9d..92736c4e1bcb7d4358c0d261a9c9318a881be814 100644 (file)
--- a/src/include/tsearch/ts_public.h
+++ b/src/include/tsearch/ts_public.h
@@ -6,7 +6,7 @@
   *
   * Copyright (c) 1998-2007, PostgreSQL Global Development Group
   *
- * $PostgreSQL: pgsql/src/include/tsearch/ts_public.h,v 1.4 2007/09/07 15:09:56 teodor Exp $
+ * $PostgreSQL: pgsql/src/include/tsearch/ts_public.h,v 1.5 2007/11/09 22:37:35 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -74,7 +74,7 @@ typedef struct
  } StopList;
  
  extern void readstoplist(const char *fname, StopList *s,
-                                                char *(*wordop) (char *));
+                                                char *(*wordop) (const char *));
  extern bool searchstoplist(StopList *s, char *key);
  
  /*
author	Tom Lane <tgl@sss.pgh.pa.us>
	Fri, 9 Nov 2007 22:37:35 +0000 (22:37 +0000)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Fri, 9 Nov 2007 22:37:35 +0000 (22:37 +0000)
src/backend/tsearch/ts_locale.c		patch | blob | history
src/backend/tsearch/ts_utils.c		patch | blob | history
src/backend/tsearch/wparser_def.c		patch | blob | history
src/include/tsearch/ts_locale.h		patch | blob | history
src/include/tsearch/ts_public.h		patch | blob | history