xml_core: support a mode for escaping UTF-8 characters

author Matthew Fernandez <matthew.fernandez@gmail.com>

Sat, 16 Oct 2021 05:24:34 +0000 (22:24 -0700)

committer Matthew Fernandez <matthew.fernandez@gmail.com>

Sat, 30 Oct 2021 02:36:03 +0000 (19:36 -0700)
author Matthew Fernandez <matthew.fernandez@gmail.com>
Sat, 16 Oct 2021 05:24:34 +0000 (22:24 -0700)
committer Matthew Fernandez <matthew.fernandez@gmail.com>
Sat, 30 Oct 2021 02:36:03 +0000 (19:36 -0700)
diff --git a/lib/common/utils.h b/lib/common/utils.h

index 68815bff4ae3147ea586b65c79dc8b282d3ff0b0..3e4ace495d70c134ac4192caea7b7c7212350423 100644 (file)
--- a/lib/common/utils.h
+++ b/lib/common/utils.h
@@ -39,6 +39,8 @@ extern "C" {
        unsigned dash : 1;
        // escape consecutive ' '
        unsigned nbsp : 1;
+      // anticipate non-ASCII characters that need to be encoded
+      unsigned utf8 : 1;
      } xml_flags_t;
  
      UTILS_API nodequeue *new_queue(int);
diff --git a/lib/common/xml.c b/lib/common/xml.c

index 7a6faf9fcf04e7a0bca53444748d1f818e75807c..50bf82a026921ad02f3a8130084ae20a98e733ae 100644 (file)
--- a/lib/common/xml.c
+++ b/lib/common/xml.c
@@ -1,7 +1,12 @@
+#include <cgraph/unreachable.h>
  #include <common/types.h>
  #include <common/utils.h>
  #include <ctype.h>
+#include <inttypes.h>
  #include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
  
  // variant of `isalpha` that assumes a C locale
  static bool isalpha_no_locale(char c) {
@@ -62,7 +67,7 @@ static int xml_core(char previous, const char **current, xml_flags_t flags,
    const char *s = *current;
    char c = *s;
  
-  // we always consume one character for now
+  // we will consume at least one character, so note that now
    ++*current;
  
    // escape '&' only if not part of a legal entity sequence
@@ -97,6 +102,76 @@ static int xml_core(char previous, const char **current, xml_flags_t flags,
    if (c == '\r' && flags.raw)
      return cb(state, "&#13;");
  
+  unsigned char uc = (unsigned char)c;
+  if (uc > 0x7f && flags.utf8) {
+
+    // replicating a table from https://en.wikipedia.org/wiki/UTF-8:
+    //
+    //   ┌────────────────┬───────────────┬────────┬────────┬────────┬────────┐
+    //   │First code point│Last code point│Byte 1  │Byte 2  │Byte 3  │Byte 4  │
+    //   ├────────────────┼───────────────┼────────┼────────┼────────┼────────┤
+    //   │          U+0000│         U+007F│0xxxxxxx│        │        │        │
+    //   │          U+0080│         U+07FF│110xxxxx│10xxxxxx│        │        │
+    //   │          U+0800│         U+FFFF│1110xxxx│10xxxxxx│10xxxxxx│        │
+    //   │         U+10000│       U+10FFFF│11110xxx│10xxxxxx│10xxxxxx│10xxxxxx│
+    //   └────────────────┴───────────────┴────────┴────────┴────────┴────────┘
+    //
+    // from which we can calculate the byte length of the current character
+    size_t length =
+        (uc >> 5) == 6 ? 2 : (uc >> 4) == 14 ? 3 : (uc >> 3) == 30 ? 4 : 0;
+
+    // was the length malformed or is the follow-on sequence truncated?
+    bool is_invalid = length == 0;
+    for (size_t l = 1; !is_invalid && length > l; ++l)
+      is_invalid |= s[l] == '\0';
+
+    // TODO: a better strategy than aborting on malformed data
+    if (is_invalid) {
+      fprintf(stderr, "Error during conversion to \"UTF-8\". Quiting.\n");
+      exit(EXIT_FAILURE);
+    }
+
+    // Decode the character. Refer again to the above table to understand this
+    // algorithm.
+    uint32_t utf8_char = 0;
+    switch (length) {
+    case 2: {
+      uint32_t low = ((uint32_t)s[1]) & ((1 << 6) - 1);
+      uint32_t high = ((uint32_t)s[0]) & ((1 << 5) - 1);
+      utf8_char = low | (high << 6);
+      break;
+    }
+    case 3: {
+      uint32_t low = ((uint32_t)s[2]) & ((1 << 6) - 1);
+      uint32_t mid = ((uint32_t)s[1]) & ((1 << 6) - 1);
+      uint32_t high = ((uint32_t)s[0]) & ((1 << 4) - 1);
+      utf8_char = low | (mid << 6) | (high << 12);
+      break;
+    }
+    case 4: {
+      uint32_t low = ((uint32_t)s[3]) & ((1 << 6) - 1);
+      uint32_t mid1 = ((uint32_t)s[2]) & ((1 << 6) - 1);
+      uint32_t mid2 = ((uint32_t)s[1]) & ((1 << 6) - 1);
+      uint32_t high = ((uint32_t)s[0]) & ((1 << 3) - 1);
+      utf8_char = low | (mid1 << 6) | (mid2 << 12) | (high << 18);
+      break;
+    }
+    default:
+      UNREACHABLE();
+    }
+
+    // set up a buffer that will fit the largest escape we need to print
+    char buffer[sizeof("&#xFFFFFFFF;")];
+
+    // emit the escape sequence itself
+    snprintf(buffer, sizeof(buffer), "&#x%" PRIx32 ";", utf8_char);
+
+    // note how many extra characters we consumed
+    *current += length - 1;
+
+    return cb(state, buffer);
+  }
+
    // otherwise, output the character as-is
    char buffer[2] = {c, '\0'};
    return cb(state, buffer);
author	Matthew Fernandez <matthew.fernandez@gmail.com>
	Sat, 16 Oct 2021 05:24:34 +0000 (22:24 -0700)
committer	Matthew Fernandez <matthew.fernandez@gmail.com>
	Sat, 30 Oct 2021 02:36:03 +0000 (19:36 -0700)
lib/common/utils.h		patch \| blob \| history
lib/common/xml.c		patch \| blob \| history