Get pg_utf_mblen(), pg_utf2wchar_with_len(), and utf2ucs() all on the same page about the maximum UTF8 sequence length we support (4 bytes).

author Tom Lane <tgl@sss.pgh.pa.us>

Wed, 24 Jan 2007 17:12:17 +0000 (17:12 +0000)

committer Tom Lane <tgl@sss.pgh.pa.us>

Wed, 24 Jan 2007 17:12:17 +0000 (17:12 +0000)
author Tom Lane <tgl@sss.pgh.pa.us>
Wed, 24 Jan 2007 17:12:17 +0000 (17:12 +0000)
committer Tom Lane <tgl@sss.pgh.pa.us>
Wed, 24 Jan 2007 17:12:17 +0000 (17:12 +0000)
diff --git a/src/backend/utils/mb/wchar.c b/src/backend/utils/mb/wchar.c

index 6828fce7d3a1add4b02e48554e4d47901ba6acab..6f72a7dbb699bf1513d48b256a51abd6ceadf260 100644 (file)
--- a/src/backend/utils/mb/wchar.c
+++ b/src/backend/utils/mb/wchar.c
@@ -1,7 +1,7 @@
  /*
   * conversion functions between pg_wchar and multibyte streams.
   * Tatsuo Ishii
- * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.58 2006/10/04 00:30:02 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.59 2007/01/24 17:12:17 tgl Exp $
   *
   * WIN1250 client encoding updated by Pavel Behal
   *
@@ -364,46 +364,60 @@ pg_johab_dsplen(const unsigned char *s)
  }
  
  /*
- * convert UTF8 string to pg_wchar (UCS-2)
- * caller should allocate enough space for "to"
+ * convert UTF8 string to pg_wchar (UCS-4)
+ * caller must allocate enough space for "to", including a trailing zero!
   * len: length of from.
   * "from" not necessarily null terminated.
   */
  static int
  pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
  {
-       unsigned char c1,
-                               c2,
-                               c3;
         int                     cnt = 0;
+       uint32          c1,
+                               c2,
+                               c3,
+                               c4;
  
         while (len > 0 && *from)
         {
-               if (!IS_HIGHBIT_SET(*from))
+               if ((*from & 0x80) == 0)
                 {
                         *to = *from++;
                         len--;
                 }
-               else if ((*from & 0xe0) == 0xc0 && len >= 2)
+               else if ((*from & 0xe0) == 0xc0)
                 {
+                       if (len < 2)
+                               break;                  /* drop trailing incomplete char */
                         c1 = *from++ & 0x1f;
                         c2 = *from++ & 0x3f;
-                       *to = c1 << 6;
-                       *to |= c2;
+                       *to = (c1 << 6) | c2;
                         len -= 2;
                 }
-               else if ((*from & 0xe0) == 0xe0 && len >= 3)
+               else if ((*from & 0xf0) == 0xe0)
                 {
+                       if (len < 3)
+                               break;                  /* drop trailing incomplete char */
                         c1 = *from++ & 0x0f;
                         c2 = *from++ & 0x3f;
                         c3 = *from++ & 0x3f;
-                       *to = c1 << 12;
-                       *to |= c2 << 6;
-                       *to |= c3;
+                       *to = (c1 << 12) | (c2 << 6) | c3;
                         len -= 3;
                 }
+               else if ((*from & 0xf8) == 0xf0)
+               {
+                       if (len < 4)
+                               break;                  /* drop trailing incomplete char */
+                       c1 = *from++ & 0x07;
+                       c2 = *from++ & 0x3f;
+                       c3 = *from++ & 0x3f;
+                       c4 = *from++ & 0x3f;
+                       *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
+                       len -= 4;
+               }
                 else
                 {
+                       /* treat a bogus char as length 1; not ours to raise error */
                         *to = *from++;
                         len--;
                 }
@@ -415,12 +429,20 @@ pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
  }
  
  /*
- * returns the byte length of a UTF8 character pointed to by s
+ * Return the byte length of a UTF8 character pointed to by s
+ *
+ * Note: in the current implementation we do not support UTF8 sequences
+ * of more than 4 bytes; hence do NOT return a value larger than 4.
+ * We return "1" for any leading byte that is either flat-out illegal or
+ * indicates a length larger than we support.
+ *
+ * pg_utf2wchar_with_len(), utf2ucs(), pg_utf8_islegal(), and perhaps
+ * other places would need to be fixed to change this.
   */
  int
  pg_utf_mblen(const unsigned char *s)
  {
-       int                     len = 1;
+       int                     len;
  
         if ((*s & 0x80) == 0)
                 len = 1;
@@ -430,10 +452,14 @@ pg_utf_mblen(const unsigned char *s)
                 len = 3;
         else if ((*s & 0xf8) == 0xf0)
                 len = 4;
+#ifdef NOT_USED
         else if ((*s & 0xfc) == 0xf8)
                 len = 5;
         else if ((*s & 0xfe) == 0xfc)
                 len = 6;
+#endif
+       else
+               len = 1;
         return len;
  }
  
@@ -596,7 +622,7 @@ utf2ucs(const unsigned char *c)
                 return (pg_wchar) (((c[0] & 0x0f) << 12) |
                                                    ((c[1] & 0x3f) << 6) |
                                                    (c[2] & 0x3f));
-       else if ((*c & 0xf0) == 0xf0)
+       else if ((*c & 0xf8) == 0xf0)
                 return (pg_wchar) (((c[0] & 0x07) << 18) |
                                                    ((c[1] & 0x3f) << 12) |
                                                    ((c[2] & 0x3f) << 6) |
author	Tom Lane <tgl@sss.pgh.pa.us>
	Wed, 24 Jan 2007 17:12:17 +0000 (17:12 +0000)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Wed, 24 Jan 2007 17:12:17 +0000 (17:12 +0000)