* Add handling of surrogate pairs

author Brent Miller <bdmiller@yahoo-inc.com>

Thu, 20 Aug 2009 06:50:22 +0000 (06:50 +0000)

committer Brent Miller <bdmiller@yahoo-inc.com>

Thu, 20 Aug 2009 06:50:22 +0000 (06:50 +0000)
author Brent Miller <bdmiller@yahoo-inc.com>
Thu, 20 Aug 2009 06:50:22 +0000 (06:50 +0000)
committer Brent Miller <bdmiller@yahoo-inc.com>
Thu, 20 Aug 2009 06:50:22 +0000 (06:50 +0000)
diff --git a/ChangeLog b/ChangeLog

index 1a90e1053e63d9df7f5891a317d3d76c9b5be2b8..a99c7edd934a207941d3ce26b06ec23db4bb0bf8 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,5 @@
+  * Add handling of surrogate pairs (json_tokener.c, test4.c, Makefile.am)
+    Brent Miller, bdmiller at yahoo dash inc dot com
    * Correction to comment describing printbuf_memappend in printbuf.h
      Brent Miller, bdmiller at yahoo dash inc dot com
  
diff --git a/Makefile.am b/Makefile.am

index fbedab8c7fc8e7f00976dcc4ac2a96ff515e9bcd..1c1a9badf7a6ea379efd3d533dd1eaf5947c482c 100644 (file)
--- a/Makefile.am
+++ b/Makefile.am
@@ -31,7 +31,7 @@ libjson_la_SOURCES = \
         linkhash.c \
         printbuf.c
  
-check_PROGRAMS = test1 test2 test3
+check_PROGRAMS = test1 test2 test3 test4
  
  test1_SOURCES = test1.c
  test1_LDADD = $(lib_LTLIBRARIES)
@@ -41,3 +41,6 @@ test2_LDADD = $(lib_LTLIBRARIES)
  
  test3_SOURCES = test3.c
  test3_LDADD = $(lib_LTLIBRARIES)
+
+test4_SOURCES = test4.c
+test4_LDADD = $(lib_LTLIBRARIES)
diff --git a/json_tokener.c b/json_tokener.c

index 04f11bacbd7170ec6f2d2c976214e4bd41ecb214..8d0b5dce0e76d3aba33d32296d39820dcc3aaa3a 100644 (file)
--- a/json_tokener.c
+++ b/json_tokener.c
@@ -58,6 +58,12 @@ const char* json_tokener_errors[] = {
    "expected comment",
  };
  
+/* Stuff for decoding unicode sequences */
+#define IS_HIGH_SURROGATE(uc) (((uc) & 0xFC00) == 0xD800)
+#define IS_LOW_SURROGATE(uc)  (((uc) & 0xFC00) == 0xDC00)
+#define DECODE_SURROGATE_PAIR(hi,lo) ((((hi) & 0x3FF) << 10) + ((lo) & 0x3FF) + 0x10000)
+static unsigned char utf8_replacement_char[3] = { 0xEF, 0xBF, 0xBD };
+
  
  struct json_tokener* json_tokener_new(void)
  {
@@ -176,6 +182,7 @@ char* strndup(const char* str, size_t n)
  #define ADVANCE_CHAR(str, tok) \
    ( ++(str), ((tok)->char_offset)++, c)
  
+
  /* End optimization macro defs */
  
  
@@ -398,40 +405,97 @@ struct json_object* json_tokener_parse_ex(struct json_tokener *tok,
        break;
  
      case json_tokener_state_escape_unicode:
-            /* Note that the following code is inefficient for handling large
-       * chunks of extended chars, calling printbuf_memappend() once
-       * for each multi-byte character of input.
-       * This is a good area for future optimization.
-       */
         {
-         /* Advance until we change state */
+          unsigned int got_hi_surrogate = 0;
+
+         /* Handle a 4-byte sequence, or two sequences if a surrogate pair */
           while(1) {
             if(strchr(json_hex_chars, c)) {
               tok->ucs_char += ((unsigned int)hexdigit(c) << ((3-tok->st_pos++)*4));
               if(tok->st_pos == 4) {
-               unsigned char utf_out[3];
+               unsigned char unescaped_utf[4];
+
+                if (got_hi_surrogate) {
+                 if (IS_LOW_SURROGATE(tok->ucs_char)) {
+                    /* Recalculate the ucs_char, then fall thru to process normally */
+                    tok->ucs_char = DECODE_SURROGATE_PAIR(got_hi_surrogate, tok->ucs_char);
+                  } else {
+                    /* Hi surrogate was not followed by a low surrogate */
+                    /* Replace the hi and process the rest normally */
+                   printbuf_memappend_fast(tok->pb, (char*)utf8_replacement_char, 3);
+                  }
+                  got_hi_surrogate = 0;
+                }
+
                 if (tok->ucs_char < 0x80) {
-                 utf_out[0] = tok->ucs_char;
-                 printbuf_memappend_fast(tok->pb, (char*)utf_out, 1);
+                 unescaped_utf[0] = tok->ucs_char;
+                 printbuf_memappend_fast(tok->pb, (char*)unescaped_utf, 1);
                 } else if (tok->ucs_char < 0x800) {
-                 utf_out[0] = 0xc0 | (tok->ucs_char >> 6);
-                 utf_out[1] = 0x80 | (tok->ucs_char & 0x3f);
-                 printbuf_memappend_fast(tok->pb, (char*)utf_out, 2);
+                 unescaped_utf[0] = 0xc0 | (tok->ucs_char >> 6);
+                 unescaped_utf[1] = 0x80 | (tok->ucs_char & 0x3f);
+                 printbuf_memappend_fast(tok->pb, (char*)unescaped_utf, 2);
+               } else if (IS_HIGH_SURROGATE(tok->ucs_char)) {
+                  /* Got a high surrogate.  Remember it and look for the
+                   * beginning of another sequence, which should be the
+                   * low surrogate.
+                   */
+                  got_hi_surrogate = tok->ucs_char;
+                  /* Not at end, and the next two chars should be "\u" */
+                  if ((tok->char_offset+1 != len) &&
+                      (tok->char_offset+2 != len) &&
+                      (str[1] == '\\') &&
+                      (str[2] == 'u'))
+                  {
+                   ADVANCE_CHAR(str, tok);
+                   ADVANCE_CHAR(str, tok);
+
+                    /* Advance to the first char of the next sequence and
+                     * continue processing with the next sequence.
+                     */
+                   if (!ADVANCE_CHAR(str, tok) || !POP_CHAR(c, tok)) {
+                     printbuf_memappend_fast(tok->pb, (char*)utf8_replacement_char, 3);
+                     goto out;
+                    }
+                   tok->ucs_char = 0;
+                    tok->st_pos = 0;
+                    continue; /* other json_tokener_state_escape_unicode */
+                  } else {
+                    /* Got a high surrogate without another sequence following
+                     * it.  Put a replacement char in for the hi surrogate
+                     * and pretend we finished.
+                     */
+                   printbuf_memappend_fast(tok->pb, (char*)utf8_replacement_char, 3);
+                  }
+               } else if (IS_LOW_SURROGATE(tok->ucs_char)) {
+                  /* Got a low surrogate not preceded by a high */
+                 printbuf_memappend_fast(tok->pb, (char*)utf8_replacement_char, 3);
+                } else if (tok->ucs_char < 0x10000) {
+                 unescaped_utf[0] = 0xe0 | (tok->ucs_char >> 12);
+                 unescaped_utf[1] = 0x80 | ((tok->ucs_char >> 6) & 0x3f);
+                 unescaped_utf[2] = 0x80 | (tok->ucs_char & 0x3f);
+                 printbuf_memappend_fast(tok->pb, (char*)unescaped_utf, 3);
+               } else if (tok->ucs_char < 0x110000) {
+                 unescaped_utf[0] = 0xf0 | ((tok->ucs_char >> 18) & 0x07);
+                 unescaped_utf[1] = 0x80 | ((tok->ucs_char >> 12) & 0x3f);
+                 unescaped_utf[2] = 0x80 | ((tok->ucs_char >> 6) & 0x3f);
+                 unescaped_utf[3] = 0x80 | (tok->ucs_char & 0x3f);
+                 printbuf_memappend_fast(tok->pb, (char*)unescaped_utf, 4);
                 } else {
-                 utf_out[0] = 0xe0 | (tok->ucs_char >> 12);
-                 utf_out[1] = 0x80 | ((tok->ucs_char >> 6) & 0x3f);
-                 utf_out[2] = 0x80 | (tok->ucs_char & 0x3f);
-                 printbuf_memappend_fast(tok->pb, (char*)utf_out, 3);
-               }
+                  /* Don't know what we got--insert the replacement char */
+                 printbuf_memappend_fast(tok->pb, (char*)utf8_replacement_char, 3);
+                }
                 state = saved_state;
                 break;
               }
             } else {
               tok->err = json_tokener_error_parse_string;
               goto out;
-                 }
-         if (!ADVANCE_CHAR(str, tok) || !POP_CHAR(c, tok))
+           }
+         if (!ADVANCE_CHAR(str, tok) || !POP_CHAR(c, tok)) {
+            if (got_hi_surrogate) /* Clean up any pending chars */
+             printbuf_memappend_fast(tok->pb, (char*)utf8_replacement_char, 3);
             goto out;
+         }
         }
        }
        break;
diff --git a/test4.c b/test4.c

new file mode 100644 (file)

index 0000000..921383d
--- /dev/null
+++ b/test4.c
@@ -0,0 +1,44 @@
+/*
+ * gcc -o test4 test4.c -I/home/y/include -L./.libs -ljson
+*/
+
+#include <stdio.h>
+#include <string.h>
+#include <json/json_object.h>
+#include <json/json_tokener.h>
+
+void print_hex( const unsigned char* s) {
+        const unsigned char *iter = s;
+        unsigned char ch;
+        while ((ch = *iter++) != 0) {
+           if( ',' != ch)
+            printf("%x ", ch);
+           else
+            printf( ",");
+        }
+        printf("\n");
+}
+
+int main() {
+    const char *input = "\"\\ud840\\udd26,\\ud840\\udd27,\\ud800\\udd26,\\ud800\\udd27\"";
+    const char *expected = "\xF0\xA0\x84\xA6,\xF0\xA0\x84\xA7,\xF0\x90\x84\xA6,\xF0\x90\x84\xA7";
+    struct json_object *parse_result = json_tokener_parse((char*)input);
+    const char *unjson = json_object_get_string(parse_result);
+
+    printf("input: %s\n", input);
+
+    int strings_match = !strcmp( expected, unjson);
+    if (strings_match) {
+        printf("JSON parse result is correct: %s\n", unjson);
+        printf("PASS\n");
+        return(0);
+    } else {
+        printf("JSON parse result doesn't match expected string\n");
+        printf("expected string bytes: ");
+        print_hex( expected);
+        printf("parsed string bytes:   ");
+        print_hex( unjson);
+        printf("FAIL\n");
+        return(1);
+    }
+}
author	Brent Miller <bdmiller@yahoo-inc.com>
	Thu, 20 Aug 2009 06:50:22 +0000 (06:50 +0000)
committer	Brent Miller <bdmiller@yahoo-inc.com>
	Thu, 20 Aug 2009 06:50:22 +0000 (06:50 +0000)
ChangeLog		patch \| blob \| history
Makefile.am		patch \| blob \| history
json_tokener.c		patch \| blob \| history
test4.c	[new file with mode: 0644]	patch \| blob