Fix issue 616: support UTF-16 surrogate pairs that are split across incremental json_tokener_parse_ex() calls.

author dota17 <chenguopingdota@163.com>

Tue, 2 Jun 2020 11:17:42 +0000 (19:17 +0800)

committer dota17 <chenguopingdota@163.com>

Mon, 8 Jun 2020 09:19:32 +0000 (17:19 +0800)
author dota17 <chenguopingdota@163.com>
Tue, 2 Jun 2020 11:17:42 +0000 (19:17 +0800)
committer dota17 <chenguopingdota@163.com>
Mon, 8 Jun 2020 09:19:32 +0000 (17:19 +0800)
diff --git a/json_tokener.c b/json_tokener.c

index 0373d6f7a68690af62970761a846941894553e1b..69d7af2a8082e3d3a7bddb0bf484180256ef40d1 100644 (file)
--- a/json_tokener.c
+++ b/json_tokener.c
@@ -630,8 +630,6 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
  
                 case json_tokener_state_escape_unicode:
                 {
-                       unsigned int got_hi_surrogate = 0;
-
                         /* Handle a 4-byte sequence, or two sequences if a surrogate pair */
                         while (1)
                         {
@@ -643,14 +641,24 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
                                         {
                                                 unsigned char unescaped_utf[4];
  
-                                               if (got_hi_surrogate)
+                                               if (tok->got_hi_surrogate)
                                                 {
                                                         if (IS_LOW_SURROGATE(tok->ucs_char))
                                                         {
+                                                               /* remove the utf8_replacement_char */
+                                                               /* which may generate during */
+                                                               /* parsing the high surrogate pair. */
+                                                               if (!strcmp(
+                                                                       tok->pb->buf,
+                                                                       (char *)
+                                                                           utf8_replacement_char))
+                                                               {
+                                                                       printbuf_reset(tok->pb);
+                                                               }
                                                                 /* Recalculate the ucs_char, then fall thru to process normally */
                                                                 tok->ucs_char =
                                                                     DECODE_SURROGATE_PAIR(
-                                                                       got_hi_surrogate,
+                                                                       tok->got_hi_surrogate,
                                                                         tok->ucs_char);
                                                         }
                                                         else
@@ -662,7 +670,7 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
                                                                     (char *)utf8_replacement_char,
                                                                     3);
                                                         }
-                                                       got_hi_surrogate = 0;
+                                                       tok->got_hi_surrogate = 0;
                                                 }
  
                                                 if (tok->ucs_char < 0x80)
@@ -686,7 +694,7 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
                                                          * the beginning of another sequence, which
                                                          * should be the low surrogate.
                                                          */
-                                                       got_hi_surrogate = tok->ucs_char;
+                                                       tok->got_hi_surrogate = tok->ucs_char;
                                                         /* Not at end, and the next two chars should be "\u" */
                                                         if ((len == -1 ||
                                                              len > (tok->char_offset + 2)) &&
@@ -717,6 +725,8 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
                                                                             (char *)
                                                                                 utf8_replacement_char,
                                                                             3);
+                                                                       tok->ucs_char = 0;
+                                                                       tok->st_pos = 0;
                                                                         goto out;
                                                                 }
                                                                 tok->ucs_char = 0;
@@ -786,7 +796,8 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
                                 if (!ADVANCE_CHAR(str, tok) || !PEEK_CHAR(c, tok))
                                 {
                                         /* Clean up any pending chars */
-                                       if (got_hi_surrogate)
+                                       if (tok->got_hi_surrogate &&
+                                           strcmp(tok->pb->buf, (char *)utf8_replacement_char))
                                                 printbuf_memappend_fast(
                                                     tok->pb, (char *)utf8_replacement_char, 3);
                                         goto out;
diff --git a/json_tokener.h b/json_tokener.h

index ea49ad82a51c1c7dbdd06395be0812da852de846..8afa6ec4649e816fd1e38ba8d35488d26615dad4 100644 (file)
--- a/json_tokener.h
+++ b/json_tokener.h
@@ -111,7 +111,7 @@ struct json_tokener
          * @deprecated See json_tokener_get_error() instead.
          */
         enum json_tokener_error err;
-       unsigned int ucs_char;
+       unsigned int ucs_char, got_hi_surrogate;
         char quote_char;
         struct json_tokener_srec *stack;
         int flags;
diff --git a/tests/test_parse.c b/tests/test_parse.c

index 6014ac1b977aa88feccb26dd72f09c6fc7bd8dd3..42b2fdfd1e47f88ca83a920ae0cef1a5827d8096 100644 (file)
--- a/tests/test_parse.c
+++ b/tests/test_parse.c
@@ -224,6 +224,35 @@ struct incremental_step
      {"\": {\"bar", -1, -1, json_tokener_continue, 0},
      {"\":13}}", -1, -1, json_tokener_success, 1},
  
+    /* Check the UTF-16 surrogate pair */
+    /* parse one char at every time */
+    {"\"\\", -1, -1, json_tokener_continue, 0},
+    {"u", -1, -1, json_tokener_continue, 0},
+    {"d", -1, -1, json_tokener_continue, 0},
+    {"8", -1, -1, json_tokener_continue, 0},
+    {"3", -1, -1, json_tokener_continue, 0},
+    {"4", -1, -1, json_tokener_continue, 0},
+    {"\\", -1, -1, json_tokener_continue, 0},
+    {"u", -1, -1, json_tokener_continue, 0},
+    {"d", -1, -1, json_tokener_continue, 0},
+    {"d", -1, -1, json_tokener_continue, 0},
+    {"1", -1, -1, json_tokener_continue, 0},
+    {"e\"", -1, -1, json_tokener_success, 1},
+    /* parse two char at every time */
+    {"\"\\u", -1, -1, json_tokener_continue, 0},
+    {"d8", -1, -1, json_tokener_continue, 0},
+    {"34", -1, -1, json_tokener_continue, 0},
+    {"\\u", -1, -1, json_tokener_continue, 0},
+    {"dd", -1, -1, json_tokener_continue, 0},
+    {"1e\"", -1, -1, json_tokener_success, 1},
+    /* check the low surrogate pair */
+    {"\"\\ud834", -1, -1, json_tokener_continue, 0},
+    {"\\udd1e\"", -1, -1, json_tokener_success, 1},
+    {"\"\\ud834\\", -1, -1, json_tokener_continue, 0},
+    {"udd1e\"", -1, -1, json_tokener_success, 1},
+    {"\"\\ud834\\u", -1, -1, json_tokener_continue, 0},
+    {"dd1e\"", -1, -1, json_tokener_success, 1},
+
      /* Check that json_tokener_reset actually resets */
      {"{ \"foo", -1, -1, json_tokener_continue, 1},
      {": \"bar\"}", -1, 0, json_tokener_error_parse_unexpected, 1},
@@ -239,11 +268,13 @@ struct incremental_step
      {"\"Y\"", -1, -1, json_tokener_success, 1},
  
      /* Trailing characters should cause a failure in strict mode */
-    {"{\"foo\":9}{\"bar\":8}", -1, 9, json_tokener_error_parse_unexpected, 1, JSON_TOKENER_STRICT },
+    {"{\"foo\":9}{\"bar\":8}", -1, 9, json_tokener_error_parse_unexpected, 1, JSON_TOKENER_STRICT},
  
      /* ... unless explicitly allowed. */
-    {"{\"foo\":9}{\"bar\":8}", -1, 9, json_tokener_success, 0, JSON_TOKENER_STRICT|JSON_TOKENER_ALLOW_TRAILING_CHARS },
-    {"{\"b\":8}ignored garbage", -1, 7, json_tokener_success, 1, JSON_TOKENER_STRICT|JSON_TOKENER_ALLOW_TRAILING_CHARS },
+    {"{\"foo\":9}{\"bar\":8}", -1, 9, json_tokener_success, 0,
+     JSON_TOKENER_STRICT | JSON_TOKENER_ALLOW_TRAILING_CHARS},
+    {"{\"b\":8}ignored garbage", -1, 7, json_tokener_success, 1,
+     JSON_TOKENER_STRICT | JSON_TOKENER_ALLOW_TRAILING_CHARS},
  
      /* To stop parsing a number we need to reach a non-digit, e.g. a \0 */
      {"1", 1, 1, json_tokener_continue, 0},
@@ -251,7 +282,7 @@ struct incremental_step
      {"2", 2, 1, json_tokener_success, 0},
      {"12{", 3, 2, json_tokener_success, 1},
      /* Parse number in strict model */
-    {"[02]", -1, 3, json_tokener_error_parse_number, 1, JSON_TOKENER_STRICT },
+    {"[02]", -1, 3, json_tokener_error_parse_number, 1, JSON_TOKENER_STRICT},
  
      /* Similar tests for other kinds of objects: */
      /* These could all return success immediately, since regardless of
@@ -267,8 +298,8 @@ struct incremental_step
      {"Infinity", 9, 8, json_tokener_success, 1},
      {"infinity", 9, 8, json_tokener_success, 1},
      {"-infinity", 10, 9, json_tokener_success, 1},
-    {"infinity", 9, 0, json_tokener_error_parse_unexpected, 1, JSON_TOKENER_STRICT },
-    {"-infinity", 10, 1, json_tokener_error_parse_unexpected, 1, JSON_TOKENER_STRICT },
+    {"infinity", 9, 0, json_tokener_error_parse_unexpected, 1, JSON_TOKENER_STRICT},
+    {"-infinity", 10, 1, json_tokener_error_parse_unexpected, 1, JSON_TOKENER_STRICT},
  
      {"inf", 3, 3, json_tokener_continue, 0},
      {"inity", 6, 5, json_tokener_success, 1},
@@ -350,7 +381,7 @@ struct incremental_step
      {"\"\\a\"", -1, 2, json_tokener_error_parse_string, 1},
  
      /* Check '\'' in strict model */
-    {"\'foo\'", -1, 0, json_tokener_error_parse_unexpected, 1, JSON_TOKENER_STRICT },
+    {"\'foo\'", -1, 0, json_tokener_error_parse_unexpected, 1, JSON_TOKENER_STRICT},
  
      /* Parse array/object */
      {"[1,2,3]", -1, -1, json_tokener_success, 0},
@@ -372,42 +403,54 @@ struct incremental_step
      {"[1,2,3,]", -1, -1, json_tokener_success, 0},
      {"[1,2,,3,]", -1, 5, json_tokener_error_parse_unexpected, 0},
  
-    {"[1,2,3,]", -1, 7, json_tokener_error_parse_unexpected, 1, JSON_TOKENER_STRICT },
-    {"{\"a\":1,}", -1, 7, json_tokener_error_parse_unexpected, 1, JSON_TOKENER_STRICT },
+    {"[1,2,3,]", -1, 7, json_tokener_error_parse_unexpected, 1, JSON_TOKENER_STRICT},
+    {"{\"a\":1,}", -1, 7, json_tokener_error_parse_unexpected, 1, JSON_TOKENER_STRICT},
  
      // utf-8 test
      // acsll encoding
-    {"\x22\x31\x32\x33\x61\x73\x63\x24\x25\x26\x22", -1, -1, json_tokener_success, 1, JSON_TOKENER_VALIDATE_UTF8 },
+    {"\x22\x31\x32\x33\x61\x73\x63\x24\x25\x26\x22", -1, -1, json_tokener_success, 1,
+     JSON_TOKENER_VALIDATE_UTF8},
      {"\x22\x31\x32\x33\x61\x73\x63\x24\x25\x26\x22", -1, -1, json_tokener_success, 1},
      // utf-8 encoding
-    {"\x22\xe4\xb8\x96\xe7\x95\x8c\x22", -1, -1, json_tokener_success, 1, JSON_TOKENER_VALIDATE_UTF8 },
-    {"\x22\xe4\xb8", -1, 3, json_tokener_error_parse_utf8_string, 0, JSON_TOKENER_VALIDATE_UTF8 },
-    {"\x96\xe7\x95\x8c\x22", -1, 0, json_tokener_error_parse_utf8_string, 1, JSON_TOKENER_VALIDATE_UTF8 },
+    {"\x22\xe4\xb8\x96\xe7\x95\x8c\x22", -1, -1, json_tokener_success, 1,
+     JSON_TOKENER_VALIDATE_UTF8},
+    {"\x22\xe4\xb8", -1, 3, json_tokener_error_parse_utf8_string, 0, JSON_TOKENER_VALIDATE_UTF8},
+    {"\x96\xe7\x95\x8c\x22", -1, 0, json_tokener_error_parse_utf8_string, 1,
+     JSON_TOKENER_VALIDATE_UTF8},
      {"\x22\xe4\xb8\x96\xe7\x95\x8c\x22", -1, -1, json_tokener_success, 1},
-    {"\x22\xcf\x80\xcf\x86\x22", -1, -1, json_tokener_success, 1, JSON_TOKENER_VALIDATE_UTF8 },
-    {"\x22\xf0\xa5\x91\x95\x22", -1, -1, json_tokener_success, 1, JSON_TOKENER_VALIDATE_UTF8 },
+    {"\x22\xcf\x80\xcf\x86\x22", -1, -1, json_tokener_success, 1, JSON_TOKENER_VALIDATE_UTF8},
+    {"\x22\xf0\xa5\x91\x95\x22", -1, -1, json_tokener_success, 1, JSON_TOKENER_VALIDATE_UTF8},
      // wrong utf-8 encoding
-    {"\x22\xe6\x9d\x4e\x22", -1, 3, json_tokener_error_parse_utf8_string, 1, JSON_TOKENER_VALIDATE_UTF8 },
+    {"\x22\xe6\x9d\x4e\x22", -1, 3, json_tokener_error_parse_utf8_string, 1,
+     JSON_TOKENER_VALIDATE_UTF8},
      {"\x22\xe6\x9d\x4e\x22", -1, 5, json_tokener_success, 1},
      // GBK encoding
-    {"\x22\xc0\xee\xc5\xf4\x22", -1, 2, json_tokener_error_parse_utf8_string, 1, JSON_TOKENER_VALIDATE_UTF8 },
+    {"\x22\xc0\xee\xc5\xf4\x22", -1, 2, json_tokener_error_parse_utf8_string, 1,
+     JSON_TOKENER_VALIDATE_UTF8},
      {"\x22\xc0\xee\xc5\xf4\x22", -1, 6, json_tokener_success, 1},
      // char after space
-    {"\x20\x20\x22\xe4\xb8\x96\x22", -1, -1, json_tokener_success, 1, JSON_TOKENER_VALIDATE_UTF8 },
-    {"\x20\x20\x81\x22\xe4\xb8\x96\x22", -1, 2, json_tokener_error_parse_utf8_string, 1, JSON_TOKENER_VALIDATE_UTF8 },
-    {"\x5b\x20\x81\x31\x5d", -1, 2, json_tokener_error_parse_utf8_string, 1, JSON_TOKENER_VALIDATE_UTF8 },
+    {"\x20\x20\x22\xe4\xb8\x96\x22", -1, -1, json_tokener_success, 1, JSON_TOKENER_VALIDATE_UTF8},
+    {"\x20\x20\x81\x22\xe4\xb8\x96\x22", -1, 2, json_tokener_error_parse_utf8_string, 1,
+     JSON_TOKENER_VALIDATE_UTF8},
+    {"\x5b\x20\x81\x31\x5d", -1, 2, json_tokener_error_parse_utf8_string, 1,
+     JSON_TOKENER_VALIDATE_UTF8},
      // char in state inf
      {"\x49\x6e\x66\x69\x6e\x69\x74\x79", 9, 8, json_tokener_success, 1},
-    {"\x49\x6e\x66\x81\x6e\x69\x74\x79", -1, 3, json_tokener_error_parse_utf8_string, 1, JSON_TOKENER_VALIDATE_UTF8 },
+    {"\x49\x6e\x66\x81\x6e\x69\x74\x79", -1, 3, json_tokener_error_parse_utf8_string, 1,
+     JSON_TOKENER_VALIDATE_UTF8},
      // char in escape unicode
-    {"\x22\x5c\x75\x64\x38\x35\x35\x5c\x75\x64\x63\x35\x35\x22", 15, 14, json_tokener_success, 1, JSON_TOKENER_VALIDATE_UTF8 },
+    {"\x22\x5c\x75\x64\x38\x35\x35\x5c\x75\x64\x63\x35\x35\x22", 15, 14, json_tokener_success, 1,
+     JSON_TOKENER_VALIDATE_UTF8},
      {"\x22\x5c\x75\x64\x38\x35\x35\xc0\x75\x64\x63\x35\x35\x22", -1, 8,
-     json_tokener_error_parse_utf8_string, 1, JSON_TOKENER_VALIDATE_UTF8 },
-    {"\x22\x5c\x75\x64\x30\x30\x33\x31\xc0\x22", -1, 9, json_tokener_error_parse_utf8_string, 1, JSON_TOKENER_VALIDATE_UTF8 },
+     json_tokener_error_parse_utf8_string, 1, JSON_TOKENER_VALIDATE_UTF8},
+    {"\x22\x5c\x75\x64\x30\x30\x33\x31\xc0\x22", -1, 9, json_tokener_error_parse_utf8_string, 1,
+     JSON_TOKENER_VALIDATE_UTF8},
      // char in number
-    {"\x31\x31\x81\x31\x31", -1, 2, json_tokener_error_parse_utf8_string, 1, JSON_TOKENER_VALIDATE_UTF8 },
+    {"\x31\x31\x81\x31\x31", -1, 2, json_tokener_error_parse_utf8_string, 1,
+     JSON_TOKENER_VALIDATE_UTF8},
      // char in object
-    {"\x7b\x22\x31\x81\x22\x3a\x31\x7d", -1, 3, json_tokener_error_parse_utf8_string, 1, JSON_TOKENER_VALIDATE_UTF8 },
+    {"\x7b\x22\x31\x81\x22\x3a\x31\x7d", -1, 3, json_tokener_error_parse_utf8_string, 1,
+     JSON_TOKENER_VALIDATE_UTF8},
  
      {NULL, -1, -1, json_tokener_success, 0},
  };
diff --git a/tests/test_parse.expected b/tests/test_parse.expected

index 77e8be1212e2d6bad7a255beb9e3faa39683804c..df74fbd8b4b66e0be59881eaa38f9e745959c6ab 100644 (file)
--- a/tests/test_parse.expected
+++ b/tests/test_parse.expected
@@ -100,6 +100,30 @@ json_tokener_parse_ex(tok, // hello"foo",  13) ... OK: got correct error: contin
  json_tokener_parse_ex(tok, { "foo      ,   6) ... OK: got correct error: continue
  json_tokener_parse_ex(tok, ": {"bar    ,   8) ... OK: got correct error: continue
  json_tokener_parse_ex(tok, ":13}}      ,   6) ... OK: got object of type [object]: { "foo": { "bar": 13 } }
+json_tokener_parse_ex(tok, "\          ,   2) ... OK: got correct error: continue
+json_tokener_parse_ex(tok, u           ,   1) ... OK: got correct error: continue
+json_tokener_parse_ex(tok, d           ,   1) ... OK: got correct error: continue
+json_tokener_parse_ex(tok, 8           ,   1) ... OK: got correct error: continue
+json_tokener_parse_ex(tok, 3           ,   1) ... OK: got correct error: continue
+json_tokener_parse_ex(tok, 4           ,   1) ... OK: got correct error: continue
+json_tokener_parse_ex(tok, \           ,   1) ... OK: got correct error: continue
+json_tokener_parse_ex(tok, u           ,   1) ... OK: got correct error: continue
+json_tokener_parse_ex(tok, d           ,   1) ... OK: got correct error: continue
+json_tokener_parse_ex(tok, d           ,   1) ... OK: got correct error: continue
+json_tokener_parse_ex(tok, 1           ,   1) ... OK: got correct error: continue
+json_tokener_parse_ex(tok, e"          ,   2) ... OK: got object of type [string]: "𝄞"
+json_tokener_parse_ex(tok, "\u         ,   3) ... OK: got correct error: continue
+json_tokener_parse_ex(tok, d8          ,   2) ... OK: got correct error: continue
+json_tokener_parse_ex(tok, 34          ,   2) ... OK: got correct error: continue
+json_tokener_parse_ex(tok, \u          ,   2) ... OK: got correct error: continue
+json_tokener_parse_ex(tok, dd          ,   2) ... OK: got correct error: continue
+json_tokener_parse_ex(tok, 1e"         ,   3) ... OK: got object of type [string]: "𝄞"
+json_tokener_parse_ex(tok, "\ud834     ,   7) ... OK: got correct error: continue
+json_tokener_parse_ex(tok, \udd1e"     ,   7) ... OK: got object of type [string]: "𝄞"
+json_tokener_parse_ex(tok, "\ud834\    ,   8) ... OK: got correct error: continue
+json_tokener_parse_ex(tok, udd1e"      ,   6) ... OK: got object of type [string]: "𝄞"
+json_tokener_parse_ex(tok, "\ud834\u   ,   9) ... OK: got correct error: continue
+json_tokener_parse_ex(tok, dd1e"       ,   5) ... OK: got object of type [string]: "𝄞"
  json_tokener_parse_ex(tok, { "foo      ,   6) ... OK: got correct error: continue
  json_tokener_parse_ex(tok, : "bar"}    ,   8) ... OK: got correct error: unexpected character
  json_tokener_parse_ex(tok, { "foo      ,   6) ... OK: got correct error: continue
@@ -216,5 +240,5 @@ json_tokener_parse_ex(tok, "\ud855
  json_tokener_parse_ex(tok, "\ud0031À"  ,  10) ... OK: got correct error: invalid utf-8 string
  json_tokener_parse_ex(tok, 11\8111       ,   5) ... OK: got correct error: invalid utf-8 string
  json_tokener_parse_ex(tok, {"1\81":1}    ,   8) ... OK: got correct error: invalid utf-8 string
-End Incremental Tests OK=130 ERROR=0
+End Incremental Tests OK=154 ERROR=0
  ==================================
author	dota17 <chenguopingdota@163.com>
	Tue, 2 Jun 2020 11:17:42 +0000 (19:17 +0800)
committer	dota17 <chenguopingdota@163.com>
	Mon, 8 Jun 2020 09:19:32 +0000 (17:19 +0800)
json_tokener.c		patch \| blob \| history
json_tokener.h		patch \| blob \| history
tests/test_parse.c		patch \| blob \| history
tests/test_parse.expected		patch \| blob \| history