Cover UTF-8 limit correction; some tests fail

author Sebastian Pipping <sebastian@pipping.org>

Fri, 20 May 2016 20:20:19 +0000 (22:20 +0200)

committer Sebastian Pipping <sebastian@pipping.org>

Fri, 20 May 2016 20:29:47 +0000 (22:29 +0200)
author Sebastian Pipping <sebastian@pipping.org>
Fri, 20 May 2016 20:20:19 +0000 (22:20 +0200)
committer Sebastian Pipping <sebastian@pipping.org>
Fri, 20 May 2016 20:29:47 +0000 (22:29 +0200)
diff --git a/expat/lib/internal.h b/expat/lib/internal.h

index 8eb719007a9fc70f03d837063df09393b4cdeca6..94cb98e15cae40a81419feae3480bce4fcf3b06d 100644 (file)
--- a/expat/lib/internal.h
+++ b/expat/lib/internal.h
@@ -79,3 +79,17 @@
  #  define UNUSED_P(p) UNUSED_ ## p
  # endif
  #endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* Defined in xmltok.c; non-static so tests/runtests.c can call it directly. */
+/* Tests expect *fromLimRef to move back to the last full UTF-8 character. */
+void
+align_limit_to_full_utf8_characters(const char * from, const char ** fromLimRef);
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/expat/lib/xmltok.c b/expat/lib/xmltok.c

index 62b75ae651694c3550268ba7b20d8379b73cb6ed..72058d3c772e11a43319280ba3e7533f0d25091e 100644 (file)
--- a/expat/lib/xmltok.c
+++ b/expat/lib/xmltok.c
@@ -329,7 +329,7 @@ enum {  /* UTF8_cvalN is value of masked first byte of N byte sequence */
    UTF8_cval4 = 0xf0
  };
  
-static void
+void
  align_limit_to_full_utf8_characters(const char * from, const char ** fromLimRef)
  {
    const char * fromLim = *fromLimRef;
diff --git a/expat/tests/runtests.c b/expat/tests/runtests.c

index 45adfa5a5be794f0432248cd83dc2abe01f55ae5..c0cdea9975480d59730b954d194407c962eb09d7 100644 (file)
--- a/expat/tests/runtests.c
+++ b/expat/tests/runtests.c
@@ -13,6 +13,10 @@
  #include <stdio.h>
  #include <string.h>
  #include <stdint.h>
+#include <stddef.h>  /* ptrdiff_t */
+#ifndef __cplusplus
+# include <stdbool.h>
+#endif
  
  #include "expat.h"
  #include "chardata.h"
@@ -367,6 +371,68 @@ START_TEST(test_illegal_utf8)
  }
  END_TEST
  
+
+/* Example bytes (the largest value of each byte class), not bit masks: */
+#define UTF8_LEAD_1  "\x7f"  /* 0b01111111 */
+#define UTF8_LEAD_2  "\xdf"  /* 0b11011111 */
+#define UTF8_LEAD_3  "\xef"  /* 0b11101111 */
+#define UTF8_LEAD_4  "\xf7"  /* 0b11110111 */
+#define UTF8_FOLLOW  "\xbf"  /* 0b10111111 */
+
+START_TEST(test_utf8_auto_align)
+{
+    struct TestCase {
+        ptrdiff_t expectedMovementInChars;
+        const char * input;
+    };
+    /* Input bytes paired with the expected signed shift of the limit */
+    struct TestCase cases[] = {
+        {00, ""},
+
+        {00, UTF8_LEAD_1},
+
+        {-1, UTF8_LEAD_2},
+        {00, UTF8_LEAD_2 UTF8_FOLLOW},
+
+        {-1, UTF8_LEAD_3},
+        {-2, UTF8_LEAD_3 UTF8_FOLLOW},
+        {00, UTF8_LEAD_3 UTF8_FOLLOW UTF8_FOLLOW},
+
+        {-1, UTF8_LEAD_4},
+        {-2, UTF8_LEAD_4 UTF8_FOLLOW},
+        {-3, UTF8_LEAD_4 UTF8_FOLLOW UTF8_FOLLOW},
+        {00, UTF8_LEAD_4 UTF8_FOLLOW UTF8_FOLLOW UTF8_FOLLOW},
+    };
+
+    size_t i = 0;
+    bool success = true;
+    for (; i < sizeof(cases) / sizeof(*cases); i++) {
+        const char * fromLim = cases[i].input + strlen(cases[i].input);
+        const char * const fromLimInitially = fromLim;
+        ptrdiff_t actualMovementInChars;
+
+        align_limit_to_full_utf8_characters(cases[i].input, &fromLim);
+        actualMovementInChars = (fromLim - fromLimInitially);
+        if (actualMovementInChars != cases[i].expectedMovementInChars) {
+            size_t j = 0;
+            success = false;
+            /* size_t/ptrdiff_t need not be long-sized: cast to match %lu/%ld */
+            printf("[-] UTF-8 case %2lu: Expected movement by %2ld chars"
+                    ", actually moved by %2ld chars: \"",
+                    (unsigned long)(i + 1), (long)cases[i].expectedMovementInChars,
+                    (long)actualMovementInChars);
+            for (; j < strlen(cases[i].input); j++) {
+                printf("\\x%02x", (unsigned char)cases[i].input[j]);
+            }
+            printf("\"\n");
+        }
+    }
+    if (! success) {
+        fail("UTF-8 auto-alignment is not bullet-proof\n");
+    }
+}
+END_TEST
+
  START_TEST(test_utf16)
  {
      /* <?xml version="1.0" encoding="UTF-16"?>
@@ -1543,6 +1609,7 @@ make_suite(void)
      tcase_add_test(tc_basic, test_bom_utf16_be);
      tcase_add_test(tc_basic, test_bom_utf16_le);
      tcase_add_test(tc_basic, test_illegal_utf8);
+    tcase_add_test(tc_basic, test_utf8_auto_align);
      tcase_add_test(tc_basic, test_utf16);
      tcase_add_test(tc_basic, test_utf16_le_epilog_newline);
      tcase_add_test(tc_basic, test_latin1_umlauts);
author	Sebastian Pipping <sebastian@pipping.org>
	Fri, 20 May 2016 20:20:19 +0000 (22:20 +0200)
committer	Sebastian Pipping <sebastian@pipping.org>
	Fri, 20 May 2016 20:29:47 +0000 (22:29 +0200)
expat/lib/internal.h		patch \| blob \| history
expat/lib/xmltok.c		patch \| blob \| history
expat/tests/runtests.c		patch \| blob \| history