/*
* conversion functions between pg_wchar and multibyte streams.
* Tatsuo Ishii
- * $Id: wchar.c,v 1.34.2.2 2006/05/21 20:06:44 tgl Exp $
+ * $Id: wchar.c,v 1.34.2.3 2007/01/24 17:12:41 tgl Exp $
*
* WIN1250 client encoding updated by Pavel Behal
*
}
/*
- * convert UTF-8 string to pg_wchar (UCS-2)
- * caller should allocate enough space for "to"
+ * convert UTF8 string to pg_wchar (UCS-4)
+ * caller must allocate enough space for "to", including a trailing zero!
* len: length of from.
* "from" not necessarily null terminated.
*/
static int
pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
{
- unsigned char c1,
+ int cnt = 0;
+ uint32 c1,
c2,
c3;
- int cnt = 0;
while (len > 0 && *from)
{
*to = *from++;
len--;
}
- else if ((*from & 0xe0) == 0xc0 && len >= 2)
+ else if ((*from & 0xe0) == 0xc0)
{
+ if (len < 2)
+ break; /* drop trailing incomplete char */
c1 = *from++ & 0x1f;
c2 = *from++ & 0x3f;
- *to = c1 << 6;
- *to |= c2;
+ *to = (c1 << 6) | c2;
len -= 2;
}
- else if ((*from & 0xe0) == 0xe0 && len >= 3)
+ else if ((*from & 0xf0) == 0xe0)
{
+ if (len < 3)
+ break; /* drop trailing incomplete char */
c1 = *from++ & 0x0f;
c2 = *from++ & 0x3f;
c3 = *from++ & 0x3f;
- *to = c1 << 12;
- *to |= c2 << 6;
- *to |= c3;
+ *to = (c1 << 12) | (c2 << 6) | c3;
len -= 3;
}
else
{
+ /* treat a bogus char as length 1; not ours to raise error */
*to = *from++;
len--;
}
}
/*
- * returns the byte length of a UTF-8 word pointed to by s
+ * Return the byte length of a UTF8 character pointed to by s
+ *
+ * Note: in the current implementation we do not support UTF8 sequences
+ * of more than 3 bytes; hence do NOT return a value larger than 3.
+ * We return "1" for any leading byte that is either flat-out illegal or
+ * indicates a length larger than we support.
+ *
+ * pg_utf2wchar_with_len(), utf2ucs(), pg_utf8_islegal(), and perhaps
+ * other places would need to be fixed to change this.
*/
int
pg_utf_mblen(const unsigned char *s)	/* only the first byte, *s, is examined */
{
- int len = 1;
+ int len;
if ((*s & 0x80) == 0)	/* 0xxxxxxx: high bit clear, single-byte character */
len = 1;
else if ((*s & 0xe0) == 0xc0)	/* 110xxxxx: lead byte of a 2-byte sequence */
len = 2;
- else if ((*s & 0xe0) == 0xe0)
+ else if ((*s & 0xf0) == 0xe0)	/* 1110xxxx only; the 0xe0 mask also matched 1111xxxx lead bytes */
len = 3;
- return (len);
+#ifdef NOT_USED
+ else if ((*s & 0xf8) == 0xf0)
+ len = 4;
+ else if ((*s & 0xfc) == 0xf8)
+ len = 5;
+ else if ((*s & 0xfe) == 0xfc)
+ len = 6;
+#endif
+ else
+ len = 1;	/* any other lead byte (illegal or longer than supported) counts as one byte */
+ return len;	/* 1, 2 or 3 unless the NOT_USED branches are compiled in */
}
/*