Applied patch # 562005 "Detect invalid UTF-8 sequences"

author Karl Waclawek <kwaclaw@users.sourceforge.net>

Thu, 30 May 2002 18:02:59 +0000 (18:02 +0000)

committer Karl Waclawek <kwaclaw@users.sourceforge.net>

Thu, 30 May 2002 18:02:59 +0000 (18:02 +0000)
author Karl Waclawek <kwaclaw@users.sourceforge.net>
Thu, 30 May 2002 18:02:59 +0000 (18:02 +0000)
committer Karl Waclawek <kwaclaw@users.sourceforge.net>
Thu, 30 May 2002 18:02:59 +0000 (18:02 +0000)
diff --git a/expat/lib/xmltok.c b/expat/lib/xmltok.c

index 6a68db5ca9ea54cf896a1d1ee9c477c45a7fa080..4b0af6331c5151caf1c6e79ac14703d5cfc4cb46 100644 (file)
--- a/expat/lib/xmltok.c
+++ b/expat/lib/xmltok.c
@@ -66,12 +66,40 @@
       ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
       : 0))
  
-#define UTF8_INVALID3(p) \
-  ((((*p) && 0xF0) == 0xE0) \
-   && (((p)[1] && 0xC0) == 0x80) \
-   && (((p)[2] && 0xC0) == 0x80))
+/* Detection of invalid UTF-8 sequences is based on Table 3.1B
+   of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/.
+   Implementation details (A must be an unsigned byte, 0x00..0xFF):
+     (A & 0x80) == 0     means A < 0x80
+   and
+     (A & 0xC0) == 0xC0  means A > 0xBF
+*/
  
-#define UTF8_INVALID4(p) ((*p) == 0xF4 && ((p)[1] & 0x30) != 0)
+#define UTF8_INVALID2(p) \
+  ((*p) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)
+
+#define UTF8_INVALID3(p) \
+  (((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \
+  || \
+  ((*p) == 0xE0 \
+    ? \
+    (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \
+    : \
+    ((p)[1] & 0x80) == 0 \
+    || \
+    ((*p) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))
+
+#define UTF8_INVALID4(p) \
+  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \
+  || \
+  ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \
+  || \
+  ((*p) == 0xF0 \
+    ? \
+    (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \
+    : \
+    ((p)[1] & 0x80) == 0 \
+    || \
+    ((*p) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))
  
  static int
  isNever(const ENCODING *enc, const char *p)
@@ -107,7 +135,11 @@ utf8_isNmstrt3(const ENCODING *enc, const char *p)
  
  #define utf8_isNmstrt4 isNever
  
-#define utf8_isInvalid2 isNever
+static int
+utf8_isInvalid2(const ENCODING *enc, const char *p)
+{
+  return UTF8_INVALID2((const unsigned char *)p);
+}
  
  static int
  utf8_isInvalid3(const ENCODING *enc, const char *p)
author	Karl Waclawek <kwaclaw@users.sourceforge.net>
	Thu, 30 May 2002 18:02:59 +0000 (18:02 +0000)
committer	Karl Waclawek <kwaclaw@users.sourceforge.net>
	Thu, 30 May 2002 18:02:59 +0000 (18:02 +0000)