Rearrange the json_tokener_state_escape_unicode case in json_tokener to simplify...

author Eric Haszlakiewicz <erh+git@nimenees.com>

Sun, 21 Jun 2020 03:10:55 +0000 (03:10 +0000)

committer Eric Haszlakiewicz <erh+git@nimenees.com>

Sun, 21 Jun 2020 03:10:55 +0000 (03:10 +0000)
author Eric Haszlakiewicz <erh+git@nimenees.com>
Sun, 21 Jun 2020 03:10:55 +0000 (03:10 +0000)
committer Eric Haszlakiewicz <erh+git@nimenees.com>
Sun, 21 Jun 2020 03:10:55 +0000 (03:10 +0000)
diff --git a/json_tokener.c b/json_tokener.c

index 2a73ce2d74057178ef581380b29404edd33f1f68..15ddd1785689346d7bb5f5ecf8d0ded3bd82922e 100644 (file)
--- a/json_tokener.c
+++ b/json_tokener.c
@@ -223,7 +223,7 @@ struct json_object *json_tokener_parse_verbose(const char *str, enum json_tokene
  /* PEEK_CHAR(dest, tok) macro:
   *   Peeks at the current char and stores it in dest.
   *   Returns 1 on success, sets tok->err and returns 0 if no more chars.
- *   Implicit inputs:  str, len vars
+ *   Implicit inputs:  str, len, nBytesp vars
   */
  #define PEEK_CHAR(dest, tok)                                                 \
         (((tok)->char_offset == len)                                         \
@@ -633,175 +633,182 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
                         /* Handle a 4-byte sequence, or two sequences if a surrogate pair */
                         while (1)
                         {
-                               if (c && strchr(json_hex_chars, c))
+                               if (!c || !strchr(json_hex_chars, c))
                                 {
-                                       tok->ucs_char += ((unsigned int)jt_hexdigit(c)
-                                                         << ((3 - tok->st_pos++) * 4));
-                                       if (tok->st_pos == 4)
+                                       tok->err = json_tokener_error_parse_string;
+                                       goto out;
+                               }
+                               tok->ucs_char |= ((unsigned int)jt_hexdigit(c)
+                                                                 << ((3 - tok->st_pos) * 4));
+                               tok->st_pos++;
+                               if (tok->st_pos < 4)
+                               {
+                                       ADVANCE_CHAR(str, tok);
+                                       if (!PEEK_CHAR(c, tok))
                                         {
-                                               unsigned char unescaped_utf[4];
+                                               /*
+                                                * We're out of characters in the current call to
+                                                * json_tokener_parse(), but a subsequent call might
+                                                * provide us with more, so leave our current state
+                                                * as-is (including tok->high_surrogate) and return.
+                                                */
+                                               goto out;
+                                       }
+                                       continue;
+                               }
  
-                                               if (tok->got_hi_surrogate)
-                                               {
-                                                       if (IS_LOW_SURROGATE(tok->ucs_char))
-                                                       {
-                                                               /* remove the utf8_replacement_char */
-                                                               /* which may generate during */
-                                                               /* parsing the high surrogate pair. */
-                                                               if (!strcmp(
-                                                                       tok->pb->buf,
-                                                                       (char *)
-                                                                           utf8_replacement_char))
-                                                               {
-                                                                       printbuf_reset(tok->pb);
-                                                               }
-                                                               /* Recalculate the ucs_char, then fall thru to process normally */
-                                                               tok->ucs_char =
-                                                                   DECODE_SURROGATE_PAIR(
-                                                                       tok->got_hi_surrogate,
-                                                                       tok->ucs_char);
-                                                       }
-                                                       else
-                                                       {
-                                                               /* Hi surrogate was not followed by a low surrogate */
-                                                               /* Replace the hi and process the rest normally */
-                                                               printbuf_memappend_fast(
-                                                                   tok->pb,
-                                                                   (char *)utf8_replacement_char,
-                                                                   3);
-                                                       }
-                                                       tok->got_hi_surrogate = 0;
-                                               }
+                               /* Now, we have a full \uNNNN sequence in tok->ucs_char */
  
-                                               if (tok->ucs_char < 0x80)
-                                               {
-                                                       unescaped_utf[0] = tok->ucs_char;
-                                                       printbuf_memappend_fast(
-                                                           tok->pb, (char *)unescaped_utf, 1);
-                                               }
-                                               else if (tok->ucs_char < 0x800)
-                                               {
-                                                       unescaped_utf[0] =
-                                                           0xc0 | (tok->ucs_char >> 6);
-                                                       unescaped_utf[1] =
-                                                           0x80 | (tok->ucs_char & 0x3f);
-                                                       printbuf_memappend_fast(
-                                                           tok->pb, (char *)unescaped_utf, 2);
-                                               }
-                                               else if (IS_HIGH_SURROGATE(tok->ucs_char))
-                                               {
-                                                       /* Got a high surrogate.  Remember it and look for
-                                                        * the beginning of another sequence, which
-                                                        * should be the low surrogate.
-                                                        */
-                                                       tok->got_hi_surrogate = tok->ucs_char;
-                                                       /* Not at end, and the next two chars should be "\u" */
-                                                       if ((len == -1 ||
-                                                            len > (tok->char_offset + 2)) &&
-                                                           // str[0] != '0' &&  // implied by json_hex_chars, above.
-                                                           (str[1] == '\\') && (str[2] == 'u'))
-                                                       {
-                                                               /* Advance through the 16 bit surrogate, and move
-                                                                * on to the next sequence. The next step is to
-                                                                * process the following characters.
-                                                                */
-                                                               if (!ADVANCE_CHAR(str, tok) ||
-                                                                   !ADVANCE_CHAR(str, tok))
-                                                               {
-                                                                       printbuf_memappend_fast(
-                                                                           tok->pb,
-                                                                           (char *)
-                                                                               utf8_replacement_char,
-                                                                           3);
-                                                               }
-                                                               /* Advance to the first char of the next sequence and
-                                                                * continue processing with the next sequence.
-                                                                */
-                                                               if (!ADVANCE_CHAR(str, tok) ||
-                                                                   !PEEK_CHAR(c, tok))
-                                                               {
-                                                                       printbuf_memappend_fast(
-                                                                           tok->pb,
-                                                                           (char *)
-                                                                               utf8_replacement_char,
-                                                                           3);
-                                                                       tok->ucs_char = 0;
-                                                                       tok->st_pos = 0;
-                                                                       goto out;
-                                                               }
-                                                               tok->ucs_char = 0;
-                                                               tok->st_pos = 0;
-                                                               /* other json_tokener_state_escape_unicode */
-                                                               continue;
-                                                       }
-                                                       else
-                                                       {
-                                                               /* Got a high surrogate without another sequence following
-                                                                * it.  Put a replacement char in for the hi surrogate
-                                                                * and pretend we finished.
-                                                                */
-                                                               printbuf_memappend_fast(
-                                                                   tok->pb,
-                                                                   (char *)utf8_replacement_char,
-                                                                   3);
-                                                       }
-                                               }
-                                               else if (IS_LOW_SURROGATE(tok->ucs_char))
-                                               {
-                                                       /* Got a low surrogate not preceded by a high */
-                                                       printbuf_memappend_fast(
-                                                           tok->pb, (char *)utf8_replacement_char,
-                                                           3);
-                                               }
-                                               else if (tok->ucs_char < 0x10000)
+                               if (tok->high_surrogate)
+                               {
+                                       if (IS_LOW_SURROGATE(tok->ucs_char))
+                                       {
+                                               /* remove the utf8_replacement_char */
+                                               /* which may generate during */
+                                               /* parsing the high surrogate pair. */
+                                               if (!strcmp(
+                                                               tok->pb->buf,
+                                                               (char *)
+                                                                       utf8_replacement_char))
                                                 {
-                                                       unescaped_utf[0] =
-                                                           0xe0 | (tok->ucs_char >> 12);
-                                                       unescaped_utf[1] =
-                                                           0x80 | ((tok->ucs_char >> 6) & 0x3f);
-                                                       unescaped_utf[2] =
-                                                           0x80 | (tok->ucs_char & 0x3f);
-                                                       printbuf_memappend_fast(
-                                                           tok->pb, (char *)unescaped_utf, 3);
+                                                       printbuf_reset(tok->pb);
                                                 }
-                                               else if (tok->ucs_char < 0x110000)
+                                               /* Recalculate the ucs_char, then fall thru to process normally */
+                                               tok->ucs_char =
+                                                       DECODE_SURROGATE_PAIR(
+                                                               tok->high_surrogate,
+                                                               tok->ucs_char);
+                                       }
+                                       else
+                                       {
+                                               /* High surrogate was not followed by a low surrogate
+                                                * Replace the high and process the rest normally
+                                                */
+                                               printbuf_memappend_fast(
+                                                       tok->pb,
+                                                       (char *)utf8_replacement_char,
+                                                       3);
+                                       }
+                                       tok->high_surrogate = 0;
+                               }
+
+                               if (tok->ucs_char < 0x80)
+                               {
+                                       unsigned char unescaped_utf[1];
+                                       unescaped_utf[0] = tok->ucs_char;
+                                       printbuf_memappend_fast(
+                                               tok->pb, (char *)unescaped_utf, 1);
+                               }
+                               else if (tok->ucs_char < 0x800)
+                               {
+                                       unsigned char unescaped_utf[2];
+                                       unescaped_utf[0] =
+                                               0xc0 | (tok->ucs_char >> 6);
+                                       unescaped_utf[1] =
+                                               0x80 | (tok->ucs_char & 0x3f);
+                                       printbuf_memappend_fast(
+                                               tok->pb, (char *)unescaped_utf, 2);
+                               }
+                               else if (IS_HIGH_SURROGATE(tok->ucs_char))
+                               {
+                                       /* Got a high surrogate.  Remember it and look for
+                                        * the beginning of another \uNNNN sequence, which
+                                        * should be the low surrogate.
+                                        */
+                                       tok->high_surrogate = tok->ucs_char;
+                                       /* Not at end, and the next two chars should be "\u" */
+                                       if ((len == -1 ||
+                                                len > (tok->char_offset + 2)) &&
+                                               // str[0] != '0' &&  // implied by json_hex_chars, above.
+                                               (str[1] == '\\') && (str[2] == 'u'))
+                                       {
+                                               /* Advance through the 16 bit surrogate, and move
+                                                * on to the next sequence. The next step is to
+                                                * process the following characters.
+                                                */
+                                               if (!ADVANCE_CHAR(str, tok) ||
+                                                       !ADVANCE_CHAR(str, tok))
                                                 {
-                                                       unescaped_utf[0] =
-                                                           0xf0 | ((tok->ucs_char >> 18) & 0x07);
-                                                       unescaped_utf[1] =
-                                                           0x80 | ((tok->ucs_char >> 12) & 0x3f);
-                                                       unescaped_utf[2] =
-                                                           0x80 | ((tok->ucs_char >> 6) & 0x3f);
-                                                       unescaped_utf[3] =
-                                                           0x80 | (tok->ucs_char & 0x3f);
                                                         printbuf_memappend_fast(
-                                                           tok->pb, (char *)unescaped_utf, 4);
+                                                               tok->pb,
+                                                               (char *)
+                                                                       utf8_replacement_char,
+                                                               3);
                                                 }
-                                               else
+                                               /* Advance to the first char of the next sequence and
+                                                * continue processing with the next sequence.
+                                                */
+                                               if (!ADVANCE_CHAR(str, tok) ||
+                                                       !PEEK_CHAR(c, tok))
                                                 {
-                                                       /* Don't know what we got--insert the replacement char */
                                                         printbuf_memappend_fast(
-                                                           tok->pb, (char *)utf8_replacement_char,
-                                                           3);
+                                                               tok->pb,
+                                                               (char *)
+                                                                       utf8_replacement_char,
+                                                               3);
+                                                       tok->ucs_char = 0;
+                                                       tok->st_pos = 0;
+                                                       goto out;
                                                 }
-                                               state = saved_state;
-                                               break;
+                                               tok->ucs_char = 0;
+                                               tok->st_pos = 0;
+                                               /* other json_tokener_state_escape_unicode */
+                                               continue;
+                                       }
+                                       else
+                                       {
+                                               /* Got a high surrogate without another sequence following
+                                                * it.  Put a replacement char in for the high surrogate
+                                                * and pretend we finished.
+                                                */
+                                               printbuf_memappend_fast(
+                                                       tok->pb,
+                                                       (char *)utf8_replacement_char,
+                                                       3);
                                         }
                                 }
-                               else
+                               else if (IS_LOW_SURROGATE(tok->ucs_char))
                                 {
-                                       tok->err = json_tokener_error_parse_string;
-                                       goto out;
+                                       /* Got a low surrogate not preceded by a high */
+                                       printbuf_memappend_fast(
+                                               tok->pb, (char *)utf8_replacement_char,
+                                               3);
                                 }
-                               if (!ADVANCE_CHAR(str, tok) || !PEEK_CHAR(c, tok))
+                               else if (tok->ucs_char < 0x10000)
                                 {
-                                       /* Clean up any pending chars */
-                                       if (tok->got_hi_surrogate &&
-                                           strcmp(tok->pb->buf, (char *)utf8_replacement_char))
-                                               printbuf_memappend_fast(
-                                                   tok->pb, (char *)utf8_replacement_char, 3);
-                                       goto out;
+                                       unsigned char unescaped_utf[3];
+                                       unescaped_utf[0] =
+                                               0xe0 | (tok->ucs_char >> 12);
+                                       unescaped_utf[1] =
+                                               0x80 | ((tok->ucs_char >> 6) & 0x3f);
+                                       unescaped_utf[2] =
+                                               0x80 | (tok->ucs_char & 0x3f);
+                                       printbuf_memappend_fast(
+                                               tok->pb, (char *)unescaped_utf, 3);
+                               }
+                               else if (tok->ucs_char < 0x110000)
+                               {
+                                       unsigned char unescaped_utf[4];
+                                       unescaped_utf[0] =
+                                               0xf0 | ((tok->ucs_char >> 18) & 0x07);
+                                       unescaped_utf[1] =
+                                               0x80 | ((tok->ucs_char >> 12) & 0x3f);
+                                       unescaped_utf[2] =
+                                               0x80 | ((tok->ucs_char >> 6) & 0x3f);
+                                       unescaped_utf[3] =
+                                               0x80 | (tok->ucs_char & 0x3f);
+                                       printbuf_memappend_fast(
+                                               tok->pb, (char *)unescaped_utf, 4);
                                 }
+                               else
+                               {
+                                       /* Don't know what we got--insert the replacement char */
+                                       printbuf_memappend_fast(
+                                               tok->pb, (char *)utf8_replacement_char,
+                                               3);
+                               }
+                               state = saved_state; // i.e. _state_string or _object_field
+                               break;
                         }
                 }
                 break;
diff --git a/json_tokener.h b/json_tokener.h

index 4e17dffcc30bf280969d7cf38d46c42751c6aae5..421ef14f2239b708d533e7c99554293870712445 100644 (file)
--- a/json_tokener.h
+++ b/json_tokener.h
@@ -111,7 +111,7 @@ struct json_tokener
          * @deprecated See json_tokener_get_error() instead.
          */
         enum json_tokener_error err;
-       unsigned int ucs_char, got_hi_surrogate;
+       unsigned int ucs_char, high_surrogate;
         char quote_char;
         struct json_tokener_srec *stack;
         int flags;
author	Eric Haszlakiewicz <erh+git@nimenees.com>
	Sun, 21 Jun 2020 03:10:55 +0000 (03:10 +0000)
committer	Eric Haszlakiewicz <erh+git@nimenees.com>
	Sun, 21 Jun 2020 03:10:55 +0000 (03:10 +0000)
json_tokener.c		patch \| blob \| history
json_tokener.h		patch \| blob \| history