]> granicus.if.org Git - json-c/blob - json_tokener.c
Small update to README file
[json-c] / json_tokener.c
1 /*
2  * $Id: json_tokener.c,v 1.20 2006/07/25 03:24:50 mclark Exp $
3  *
4  * Copyright (c) 2004, 2005 Metaparadigm Pte. Ltd.
5  * Michael Clark <michael@metaparadigm.com>
6  *
7  * This library is free software; you can redistribute it and/or modify
8  * it under the terms of the MIT license. See COPYING for details.
9  *
10  *
11  * Copyright (c) 2008-2009 Yahoo! Inc.  All rights reserved.
12  * The copyrights to the contents of this file are licensed under the MIT License
13  * (https://www.opensource.org/licenses/mit-license.php)
14  */
15
16 #include "config.h"
17
18 #include "math_compat.h"
19 #include <assert.h>
20 #include <limits.h>
21 #include <math.h>
22 #include <stddef.h>
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <string.h>
26
27 #include "debug.h"
28 #include "json_inttypes.h"
29 #include "json_object.h"
30 #include "json_object_private.h"
31 #include "json_tokener.h"
32 #include "json_util.h"
33 #include "printbuf.h"
34 #include "strdup_compat.h"
35
36 #ifdef HAVE_LOCALE_H
37 #include <locale.h>
38 #endif /* HAVE_LOCALE_H */
39 #ifdef HAVE_XLOCALE_H
40 #include <xlocale.h>
41 #endif
42 #ifdef HAVE_STRINGS_H
43 #include <strings.h>
44 #endif /* HAVE_STRINGS_H */
45
/* Convert one ASCII hex digit to its numeric value (0-15).  Only meaningful
 * for characters that are already known to be hex digits.  Letters work in
 * either case because the low three bits of 'a'..'f' and 'A'..'F' are 1..6.
 * Note that the argument is evaluated more than once.
 */
#define jt_hexdigit(x) (((x) > '9') ? (((x) & 7) + 9) : ((x) - '0'))
47
48 #if !HAVE_STRNCASECMP && defined(_MSC_VER)
49 /* MSC has the version as _strnicmp */
50 #define strncasecmp _strnicmp
51 #elif !HAVE_STRNCASECMP
52 #error You do not have strncasecmp on your system.
53 #endif /* HAVE_STRNCASECMP */
54
55 #if defined(_MSC_VER) && (_MSC_VER <= 1800)
56 /* VS2013 doesn't know about "inline" */
57 #define inline __inline
58 #elif defined(AIX_CC)
59 #define inline
60 #endif
61
62 /* The following helper functions are used to speed up parsing. They
63  * are faster than their ctype counterparts because they assume that
64  * the input is in ASCII and that the locale is set to "C". The
65  * compiler will also inline these functions, providing an additional
66  * speedup by saving on function calls.
67  */
/* Return 1 when c is a space, tab, line feed or carriage return, else 0. */
static inline int is_ws_char(char c)
{
        switch (c)
        {
        case ' ':
        case '\t':
        case '\n':
        case '\r': return 1;
        default: return 0;
        }
}
75
/* Return 1 when c is an ASCII hexadecimal digit (0-9, a-f, A-F), else 0. */
static inline int is_hex_char(char c)
{
        if (c >= '0' && c <= '9')
                return 1;
        if (c >= 'A' && c <= 'F')
                return 1;
        return c >= 'a' && c <= 'f';
}
82
83 /* Use C99 NAN by default; if not available, nan("") should work too. */
84 #ifndef NAN
85 #define NAN nan("")
86 #endif /* !NAN */
87
/* Keyword literals matched by the tokener.  Each *_len constant is the
 * length of the corresponding literal without its terminating NUL.
 */
static const char json_null_str[] = "null";
static const int json_null_str_len = sizeof json_null_str - 1;

static const char json_inf_str[] = "Infinity";
/* "Infinity" with the case of every letter swapped, so input can be compared
 * against both spellings without calling tolower() on each input char:
 */
static const char json_inf_str_invert[] = "iNFINITY";
static const unsigned int json_inf_str_len = sizeof json_inf_str - 1;

static const char json_nan_str[] = "NaN";
static const int json_nan_str_len = sizeof json_nan_str - 1;

static const char json_true_str[] = "true";
static const int json_true_str_len = sizeof json_true_str - 1;

static const char json_false_str[] = "false";
static const int json_false_str_len = sizeof json_false_str - 1;
100
101 /* clang-format off */
/* Human-readable error messages.  The table is indexed directly by the
 * numeric value of an enum json_tokener_error, so the order of the entries
 * is significant.  Both the strings and the pointer table are const so the
 * whole table is read-only.
 */
static const char *const json_tokener_errors[] = {
        "success",
        "continue",
        "nesting too deep",
        "unexpected end of data",
        "unexpected character",
        "null expected",
        "boolean expected",
        "number expected",
        "array value separator ',' expected",
        "quoted object property name expected",
        "object property name separator ':' expected",
        "object value separator ',' expected",
        "invalid string sequence",
        "expected comment",
        "invalid utf-8 string",
        "buffer size overflow"
};
120 /* clang-format on */
121
/**
 * Validate the UTF-8 string in strict mode.
 * If the input is not valid UTF-8, return an error.
 */
126 static json_bool json_tokener_validate_utf8(const char c, unsigned int *nBytes);
127
128 static int json_tokener_parse_double(const char *buf, int len, double *retval);
129
130 const char *json_tokener_error_desc(enum json_tokener_error jerr)
131 {
132         int jerr_int = (int)jerr;
133         if (jerr_int < 0 ||
134             jerr_int >= (int)(sizeof(json_tokener_errors) / sizeof(json_tokener_errors[0])))
135                 return "Unknown error, "
136                        "invalid json_tokener_error value passed to json_tokener_error_desc()";
137         return json_tokener_errors[jerr];
138 }
139
140 enum json_tokener_error json_tokener_get_error(struct json_tokener *tok)
141 {
142         return tok->err;
143 }
144
145 /* Stuff for decoding unicode sequences */
/* High (lead) surrogate: bits 15..10 are 110110, i.e. 0xD800-0xDBFF in the
 * low 16 bits.  Only those six bits are examined.
 */
#define IS_HIGH_SURROGATE(uc) (0xD800 == ((uc) & 0xFC00))
/* Low (trail) surrogate: bits 15..10 are 110111, i.e. 0xDC00-0xDFFF in the
 * low 16 bits.  Only those six bits are examined.
 */
#define IS_LOW_SURROGATE(uc) (0xDC00 == ((uc) & 0xFC00))
/* Combine the low ten bits of each surrogate and add the 0x10000 offset. */
#define DECODE_SURROGATE_PAIR(hi, lo) (0x10000 + (((hi) & 0x3FF) << 10) + ((lo) & 0x3FF))
/* The three bytes EF BF BD, i.e. U+FFFD encoded as UTF-8. */
static unsigned char utf8_replacement_char[3] = {0xEF, 0xBF, 0xBD};
150
151 struct json_tokener *json_tokener_new_ex(int depth)
152 {
153         struct json_tokener *tok;
154
155         tok = (struct json_tokener *)calloc(1, sizeof(struct json_tokener));
156         if (!tok)
157                 return NULL;
158         tok->stack = (struct json_tokener_srec *)calloc(depth, sizeof(struct json_tokener_srec));
159         if (!tok->stack)
160         {
161                 free(tok);
162                 return NULL;
163         }
164         tok->pb = printbuf_new();
165         if (!tok->pb)
166         {
167                 free(tok->stack);
168                 free(tok);
169                 return NULL;
170         }
171         tok->max_depth = depth;
172         json_tokener_reset(tok);
173         return tok;
174 }
175
176 struct json_tokener *json_tokener_new(void)
177 {
178         return json_tokener_new_ex(JSON_TOKENER_DEFAULT_DEPTH);
179 }
180
181 void json_tokener_free(struct json_tokener *tok)
182 {
183         json_tokener_reset(tok);
184         if (tok->pb)
185                 printbuf_free(tok->pb);
186         free(tok->stack);
187         free(tok);
188 }
189
190 static void json_tokener_reset_level(struct json_tokener *tok, int depth)
191 {
192         tok->stack[depth].state = json_tokener_state_eatws;
193         tok->stack[depth].saved_state = json_tokener_state_start;
194         json_object_put(tok->stack[depth].current);
195         tok->stack[depth].current = NULL;
196         free(tok->stack[depth].obj_field_name);
197         tok->stack[depth].obj_field_name = NULL;
198 }
199
200 void json_tokener_reset(struct json_tokener *tok)
201 {
202         int i;
203         if (!tok)
204                 return;
205
206         for (i = tok->depth; i >= 0; i--)
207                 json_tokener_reset_level(tok, i);
208         tok->depth = 0;
209         tok->err = json_tokener_success;
210 }
211
212 struct json_object *json_tokener_parse(const char *str)
213 {
214         enum json_tokener_error jerr_ignored;
215         struct json_object *obj;
216         obj = json_tokener_parse_verbose(str, &jerr_ignored);
217         return obj;
218 }
219
220 struct json_object *json_tokener_parse_verbose(const char *str, enum json_tokener_error *error)
221 {
222         struct json_tokener *tok;
223         struct json_object *obj;
224
225         tok = json_tokener_new();
226         if (!tok)
227                 return NULL;
228         obj = json_tokener_parse_ex(tok, str, -1);
229         *error = tok->err;
230         if (tok->err != json_tokener_success
231 #if 0
232                 /* This would be a more sensible default, and cause parsing
233                  * things like "null123" to fail when the caller can't know
234                  * where the parsing left off, but starting to fail would
235                  * be a notable behaviour change.  Save for a 1.0 release.
236                  */
237             || json_tokener_get_parse_end(tok) != strlen(str)
238 #endif
239         )
240
241         {
242                 if (obj != NULL)
243                         json_object_put(obj);
244                 obj = NULL;
245         }
246
247         json_tokener_free(tok);
248         return obj;
249 }
250
/* Shorthands for the fields of the stack record at the current nesting
 * depth.  Each one expands to an expression on a variable literally named
 * `tok`, so they are only usable where such a variable is in scope, and any
 * later use of these four identifiers in this file is macro-expanded.
 */
#define state tok->stack[tok->depth].state
#define saved_state tok->stack[tok->depth].saved_state
#define current tok->stack[tok->depth].current
#define obj_field_name tok->stack[tok->depth].obj_field_name
255
256 /* Optimization:
 * json_tokener_parse_ex() consumed a lot of CPU in its main loop,
 * iterating character-by-character.  A large performance boost is
259  * achieved by using tighter loops to locally handle units such as
260  * comments and strings.  Loops that handle an entire token within
261  * their scope also gather entire strings and pass them to
262  * printbuf_memappend() in a single call, rather than calling
263  * printbuf_memappend() one char at a time.
264  *
265  * PEEK_CHAR() and ADVANCE_CHAR() macros are used for code that is
266  * common to both the main loop and the tighter loops.
267  */
268
/* PEEK_CHAR(dest, tok) macro:
 *   Peeks at the current char and stores it in dest.
 *   Returns 1 on success, sets tok->err and returns 0 otherwise:
 *     - when tok->char_offset == len, tok->err becomes json_tokener_success
 *       if the tokener is at depth 0 in the eatws state with a saved state
 *       of finish, and json_tokener_continue in every other case;
 *     - when JSON_TOKENER_VALIDATE_UTF8 is set in tok->flags and
 *       json_tokener_validate_utf8() rejects the current char, tok->err
 *       becomes json_tokener_error_parse_utf8_string.
 *   Implicit inputs:  str, len, nBytesp vars, plus the state and saved_state
 *   macros, which always refer to a variable literally named `tok`.
 *   The tok argument is parenthesized at every use.
 */
#define PEEK_CHAR(dest, tok)                                                   \
        (((tok)->char_offset == len)                                           \
             ? (((tok)->depth == 0 && state == json_tokener_state_eatws &&     \
                 saved_state == json_tokener_state_finish)                     \
                    ? (((tok)->err = json_tokener_success), 0)                 \
                    : (((tok)->err = json_tokener_continue), 0))               \
             : ((((tok)->flags & JSON_TOKENER_VALIDATE_UTF8) &&                \
                 (!json_tokener_validate_utf8(*str, nBytesp)))                 \
                    ? (((tok)->err = json_tokener_error_parse_utf8_string), 0) \
                    : (((dest) = *str), 1)))
284
/* ADVANCE_CHAR() macro:
 *   Increments str & tok->char_offset.
 *   For convenience of existing conditionals, the expression evaluates to
 *   the caller's c variable as it stood before advancing (0 if that char
 *   was NUL).  The macro itself performs no bounds check.
 *   Implicit inputs:  c var
 */
#define ADVANCE_CHAR(str, tok) (++(str), ((tok)->char_offset)++, c)
291
292 /* End optimization macro defs */
293
294 struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *str, int len)
295 {
296         struct json_object *obj = NULL;
297         char c = '\1';
298         unsigned int nBytes = 0;
299         unsigned int *nBytesp = &nBytes;
300
301 #ifdef HAVE_USELOCALE
302         locale_t oldlocale = uselocale(NULL);
303         locale_t newloc;
304 #elif defined(HAVE_SETLOCALE)
305         char *oldlocale = NULL;
306 #endif
307
308         tok->char_offset = 0;
309         tok->err = json_tokener_success;
310
311         /* this interface is presently not 64-bit clean due to the int len argument
312          * and the internal printbuf interface that takes 32-bit int len arguments
313          * so the function limits the maximum string size to INT32_MAX (2GB).
314          * If the function is called with len == -1 then strlen is called to check
315          * the string length is less than INT32_MAX (2GB)
316          */
317         if ((len < -1) || (len == -1 && strlen(str) > INT32_MAX))
318         {
319                 tok->err = json_tokener_error_size;
320                 return NULL;
321         }
322
323 #ifdef HAVE_USELOCALE
324         {
325                 locale_t duploc = duplocale(oldlocale);
326                 newloc = newlocale(LC_NUMERIC_MASK, "C", duploc);
327                 if (newloc == NULL)
328                 {
329                         freelocale(duploc);
330                         return NULL;
331                 }
332                 uselocale(newloc);
333         }
334 #elif defined(HAVE_SETLOCALE)
335         {
336                 char *tmplocale;
337                 tmplocale = setlocale(LC_NUMERIC, NULL);
338                 if (tmplocale)
339                         oldlocale = strdup(tmplocale);
340                 setlocale(LC_NUMERIC, "C");
341         }
342 #endif
343
344         while (PEEK_CHAR(c, tok)) // Note: c might be '\0' !
345         {
346
347         redo_char:
348                 switch (state)
349                 {
350
351                 case json_tokener_state_eatws:
352                         /* Advance until we change state */
353                         while (is_ws_char(c))
354                         {
355                                 if ((!ADVANCE_CHAR(str, tok)) || (!PEEK_CHAR(c, tok)))
356                                         goto out;
357                         }
358                         if (c == '/' && !(tok->flags & JSON_TOKENER_STRICT))
359                         {
360                                 printbuf_reset(tok->pb);
361                                 printbuf_memappend_fast(tok->pb, &c, 1);
362                                 state = json_tokener_state_comment_start;
363                         }
364                         else
365                         {
366                                 state = saved_state;
367                                 goto redo_char;
368                         }
369                         break;
370
371                 case json_tokener_state_start:
372                         switch (c)
373                         {
374                         case '{':
375                                 state = json_tokener_state_eatws;
376                                 saved_state = json_tokener_state_object_field_start;
377                                 current = json_object_new_object();
378                                 if (current == NULL)
379                                         goto out;
380                                 break;
381                         case '[':
382                                 state = json_tokener_state_eatws;
383                                 saved_state = json_tokener_state_array;
384                                 current = json_object_new_array();
385                                 if (current == NULL)
386                                         goto out;
387                                 break;
388                         case 'I':
389                         case 'i':
390                                 state = json_tokener_state_inf;
391                                 printbuf_reset(tok->pb);
392                                 tok->st_pos = 0;
393                                 goto redo_char;
394                         case 'N':
395                         case 'n':
396                                 state = json_tokener_state_null; // or NaN
397                                 printbuf_reset(tok->pb);
398                                 tok->st_pos = 0;
399                                 goto redo_char;
400                         case '\'':
401                                 if (tok->flags & JSON_TOKENER_STRICT)
402                                 {
403                                         /* in STRICT mode only double-quote are allowed */
404                                         tok->err = json_tokener_error_parse_unexpected;
405                                         goto out;
406                                 }
407                                 /* FALLTHRU */
408                         case '"':
409                                 state = json_tokener_state_string;
410                                 printbuf_reset(tok->pb);
411                                 tok->quote_char = c;
412                                 break;
413                         case 'T':
414                         case 't':
415                         case 'F':
416                         case 'f':
417                                 state = json_tokener_state_boolean;
418                                 printbuf_reset(tok->pb);
419                                 tok->st_pos = 0;
420                                 goto redo_char;
421                         case '0':
422                         case '1':
423                         case '2':
424                         case '3':
425                         case '4':
426                         case '5':
427                         case '6':
428                         case '7':
429                         case '8':
430                         case '9':
431                         case '-':
432                                 state = json_tokener_state_number;
433                                 printbuf_reset(tok->pb);
434                                 tok->is_double = 0;
435                                 goto redo_char;
436                         default: tok->err = json_tokener_error_parse_unexpected; goto out;
437                         }
438                         break;
439
440                 case json_tokener_state_finish:
441                         if (tok->depth == 0)
442                                 goto out;
443                         obj = json_object_get(current);
444                         json_tokener_reset_level(tok, tok->depth);
445                         tok->depth--;
446                         goto redo_char;
447
448                 case json_tokener_state_inf: /* aka starts with 'i' (or 'I', or "-i", or "-I") */
449                 {
450                         /* If we were guaranteed to have len set, then we could (usually) handle
451                          * the entire "Infinity" check in a single strncmp (strncasecmp), but
452                          * since len might be -1 (i.e. "read until \0"), we need to check it
453                          * a character at a time.
454                          * Trying to handle it both ways would make this code considerably more
455                          * complicated with likely little performance benefit.
456                          */
457                         int is_negative = 0;
458
459                         /* Note: tok->st_pos must be 0 when state is set to json_tokener_state_inf */
460                         while (tok->st_pos < (int)json_inf_str_len)
461                         {
462                                 char inf_char = *str;
463                                 if (inf_char != json_inf_str[tok->st_pos] &&
464                                     ((tok->flags & JSON_TOKENER_STRICT) ||
465                                       inf_char != json_inf_str_invert[tok->st_pos])
466                                    )
467                                 {
468                                         tok->err = json_tokener_error_parse_unexpected;
469                                         goto out;
470                                 }
471                                 tok->st_pos++;
472                                 (void)ADVANCE_CHAR(str, tok);
473                                 if (!PEEK_CHAR(c, tok))
474                                 {
475                                         /* out of input chars, for now at least */
476                                         goto out;
477                                 }
478                         }
479                         /* We checked the full length of "Infinity", so create the object.
480                          * When handling -Infinity, the number parsing code will have dropped
481                          * the "-" into tok->pb for us, so check it now.
482                          */
483                         if (printbuf_length(tok->pb) > 0 && *(tok->pb->buf) == '-')
484                         {
485                                 is_negative = 1;
486                         }
487                         current = json_object_new_double(is_negative ? -INFINITY : INFINITY);
488                         if (current == NULL)
489                                 goto out;
490                         saved_state = json_tokener_state_finish;
491                         state = json_tokener_state_eatws;
492                         goto redo_char;
493                 }
494                 break;
495                 case json_tokener_state_null: /* aka starts with 'n' */
496                 {
497                         int size;
498                         int size_nan;
499                         printbuf_memappend_fast(tok->pb, &c, 1);
500                         size = json_min(tok->st_pos + 1, json_null_str_len);
501                         size_nan = json_min(tok->st_pos + 1, json_nan_str_len);
502                         if ((!(tok->flags & JSON_TOKENER_STRICT) &&
503                              strncasecmp(json_null_str, tok->pb->buf, size) == 0) ||
504                             (strncmp(json_null_str, tok->pb->buf, size) == 0))
505                         {
506                                 if (tok->st_pos == json_null_str_len)
507                                 {
508                                         current = NULL;
509                                         saved_state = json_tokener_state_finish;
510                                         state = json_tokener_state_eatws;
511                                         goto redo_char;
512                                 }
513                         }
514                         else if ((!(tok->flags & JSON_TOKENER_STRICT) &&
515                                   strncasecmp(json_nan_str, tok->pb->buf, size_nan) == 0) ||
516                                  (strncmp(json_nan_str, tok->pb->buf, size_nan) == 0))
517                         {
518                                 if (tok->st_pos == json_nan_str_len)
519                                 {
520                                         current = json_object_new_double(NAN);
521                                         if (current == NULL)
522                                                 goto out;
523                                         saved_state = json_tokener_state_finish;
524                                         state = json_tokener_state_eatws;
525                                         goto redo_char;
526                                 }
527                         }
528                         else
529                         {
530                                 tok->err = json_tokener_error_parse_null;
531                                 goto out;
532                         }
533                         tok->st_pos++;
534                 }
535                 break;
536
537                 case json_tokener_state_comment_start:
538                         if (c == '*')
539                         {
540                                 state = json_tokener_state_comment;
541                         }
542                         else if (c == '/')
543                         {
544                                 state = json_tokener_state_comment_eol;
545                         }
546                         else
547                         {
548                                 tok->err = json_tokener_error_parse_comment;
549                                 goto out;
550                         }
551                         printbuf_memappend_fast(tok->pb, &c, 1);
552                         break;
553
554                 case json_tokener_state_comment:
555                 {
556                         /* Advance until we change state */
557                         const char *case_start = str;
558                         while (c != '*')
559                         {
560                                 if (!ADVANCE_CHAR(str, tok) || !PEEK_CHAR(c, tok))
561                                 {
562                                         printbuf_memappend_fast(tok->pb, case_start,
563                                                                 str - case_start);
564                                         goto out;
565                                 }
566                         }
567                         printbuf_memappend_fast(tok->pb, case_start, 1 + str - case_start);
568                         state = json_tokener_state_comment_end;
569                 }
570                 break;
571
572                 case json_tokener_state_comment_eol:
573                 {
574                         /* Advance until we change state */
575                         const char *case_start = str;
576                         while (c != '\n')
577                         {
578                                 if (!ADVANCE_CHAR(str, tok) || !PEEK_CHAR(c, tok))
579                                 {
580                                         printbuf_memappend_fast(tok->pb, case_start,
581                                                                 str - case_start);
582                                         goto out;
583                                 }
584                         }
585                         printbuf_memappend_fast(tok->pb, case_start, str - case_start);
586                         MC_DEBUG("json_tokener_comment: %s\n", tok->pb->buf);
587                         state = json_tokener_state_eatws;
588                 }
589                 break;
590
591                 case json_tokener_state_comment_end:
592                         printbuf_memappend_fast(tok->pb, &c, 1);
593                         if (c == '/')
594                         {
595                                 MC_DEBUG("json_tokener_comment: %s\n", tok->pb->buf);
596                                 state = json_tokener_state_eatws;
597                         }
598                         else
599                         {
600                                 state = json_tokener_state_comment;
601                         }
602                         break;
603
604                 case json_tokener_state_string:
605                 {
606                         /* Advance until we change state */
607                         const char *case_start = str;
608                         while (1)
609                         {
610                                 if (c == tok->quote_char)
611                                 {
612                                         printbuf_memappend_fast(tok->pb, case_start,
613                                                                 str - case_start);
614                                         current =
615                                             json_object_new_string_len(tok->pb->buf, tok->pb->bpos);
616                                         if (current == NULL)
617                                                 goto out;
618                                         saved_state = json_tokener_state_finish;
619                                         state = json_tokener_state_eatws;
620                                         break;
621                                 }
622                                 else if (c == '\\')
623                                 {
624                                         printbuf_memappend_fast(tok->pb, case_start,
625                                                                 str - case_start);
626                                         saved_state = json_tokener_state_string;
627                                         state = json_tokener_state_string_escape;
628                                         break;
629                                 }
630                                 if (!ADVANCE_CHAR(str, tok) || !PEEK_CHAR(c, tok))
631                                 {
632                                         printbuf_memappend_fast(tok->pb, case_start,
633                                                                 str - case_start);
634                                         goto out;
635                                 }
636                         }
637                 }
638                 break;
639
640                 case json_tokener_state_string_escape:
641                         switch (c)
642                         {
643                         case '"':
644                         case '\\':
645                         case '/':
646                                 printbuf_memappend_fast(tok->pb, &c, 1);
647                                 state = saved_state;
648                                 break;
649                         case 'b':
650                         case 'n':
651                         case 'r':
652                         case 't':
653                         case 'f':
654                                 if (c == 'b')
655                                         printbuf_memappend_fast(tok->pb, "\b", 1);
656                                 else if (c == 'n')
657                                         printbuf_memappend_fast(tok->pb, "\n", 1);
658                                 else if (c == 'r')
659                                         printbuf_memappend_fast(tok->pb, "\r", 1);
660                                 else if (c == 't')
661                                         printbuf_memappend_fast(tok->pb, "\t", 1);
662                                 else if (c == 'f')
663                                         printbuf_memappend_fast(tok->pb, "\f", 1);
664                                 state = saved_state;
665                                 break;
666                         case 'u':
667                                 tok->ucs_char = 0;
668                                 tok->st_pos = 0;
669                                 state = json_tokener_state_escape_unicode;
670                                 break;
671                         default: tok->err = json_tokener_error_parse_string; goto out;
672                         }
673                         break;
674
675                         // ===================================================
676
677                 case json_tokener_state_escape_unicode:
678                 {
679                         /* Handle a 4-byte \uNNNN sequence, or two sequences if a surrogate pair */
680                         while (1)
681                         {
682                                 if (!c || !is_hex_char(c))
683                                 {
684                                         tok->err = json_tokener_error_parse_string;
685                                         goto out;
686                                 }
687                                 tok->ucs_char |=
688                                     ((unsigned int)jt_hexdigit(c) << ((3 - tok->st_pos) * 4));
689                                 tok->st_pos++;
690                                 if (tok->st_pos >= 4)
691                                         break;
692
693                                 (void)ADVANCE_CHAR(str, tok);
694                                 if (!PEEK_CHAR(c, tok))
695                                 {
696                                         /*
697                                          * We're out of characters in the current call to
698                                          * json_tokener_parse(), but a subsequent call might
699                                          * provide us with more, so leave our current state
700                                          * as-is (including tok->high_surrogate) and return.
701                                          */
702                                         goto out;
703                                 }
704                         }
705                         tok->st_pos = 0;
706
707                         /* Now, we have a full \uNNNN sequence in tok->ucs_char */
708
709                         /* If the *previous* sequence was a high surrogate ... */
710                         if (tok->high_surrogate)
711                         {
712                                 if (IS_LOW_SURROGATE(tok->ucs_char))
713                                 {
714                                         /* Recalculate the ucs_char, then fall thru to process normally */
715                                         tok->ucs_char = DECODE_SURROGATE_PAIR(tok->high_surrogate,
716                                                                               tok->ucs_char);
717                                 }
718                                 else
719                                 {
720                                         /* High surrogate was not followed by a low surrogate
721                                          * Replace the high and process the rest normally
722                                          */
723                                         printbuf_memappend_fast(tok->pb,
724                                                                 (char *)utf8_replacement_char, 3);
725                                 }
726                                 tok->high_surrogate = 0;
727                         }
728
729                         if (tok->ucs_char < 0x80)
730                         {
731                                 unsigned char unescaped_utf[1];
732                                 unescaped_utf[0] = tok->ucs_char;
733                                 printbuf_memappend_fast(tok->pb, (char *)unescaped_utf, 1);
734                         }
735                         else if (tok->ucs_char < 0x800)
736                         {
737                                 unsigned char unescaped_utf[2];
738                                 unescaped_utf[0] = 0xc0 | (tok->ucs_char >> 6);
739                                 unescaped_utf[1] = 0x80 | (tok->ucs_char & 0x3f);
740                                 printbuf_memappend_fast(tok->pb, (char *)unescaped_utf, 2);
741                         }
742                         else if (IS_HIGH_SURROGATE(tok->ucs_char))
743                         {
744                                 /*
745                                  * The next two characters should be \u, HOWEVER,
746                                  * we can't simply peek ahead here, because the
747                                  * characters we need might not be passed to us
748                                  * until a subsequent call to json_tokener_parse.
749                                  * Instead, transition through a couple of states.
750                                  * (now):
751                                  *   _escape_unicode => _unicode_need_escape
752                                  * (see a '\\' char):
753                                  *   _unicode_need_escape => _unicode_need_u
754                                  * (see a 'u' char):
755                                  *   _unicode_need_u => _escape_unicode
756                                  *      ...and we'll end up back around here.
757                                  */
758                                 tok->high_surrogate = tok->ucs_char;
759                                 tok->ucs_char = 0;
760                                 state = json_tokener_state_escape_unicode_need_escape;
761                                 break;
762                         }
763                         else if (IS_LOW_SURROGATE(tok->ucs_char))
764                         {
765                                 /* Got a low surrogate not preceded by a high */
766                                 printbuf_memappend_fast(tok->pb, (char *)utf8_replacement_char, 3);
767                         }
768                         else if (tok->ucs_char < 0x10000)
769                         {
770                                 unsigned char unescaped_utf[3];
771                                 unescaped_utf[0] = 0xe0 | (tok->ucs_char >> 12);
772                                 unescaped_utf[1] = 0x80 | ((tok->ucs_char >> 6) & 0x3f);
773                                 unescaped_utf[2] = 0x80 | (tok->ucs_char & 0x3f);
774                                 printbuf_memappend_fast(tok->pb, (char *)unescaped_utf, 3);
775                         }
776                         else if (tok->ucs_char < 0x110000)
777                         {
778                                 unsigned char unescaped_utf[4];
779                                 unescaped_utf[0] = 0xf0 | ((tok->ucs_char >> 18) & 0x07);
780                                 unescaped_utf[1] = 0x80 | ((tok->ucs_char >> 12) & 0x3f);
781                                 unescaped_utf[2] = 0x80 | ((tok->ucs_char >> 6) & 0x3f);
782                                 unescaped_utf[3] = 0x80 | (tok->ucs_char & 0x3f);
783                                 printbuf_memappend_fast(tok->pb, (char *)unescaped_utf, 4);
784                         }
785                         else
786                         {
787                                 /* Don't know what we got--insert the replacement char */
788                                 printbuf_memappend_fast(tok->pb, (char *)utf8_replacement_char, 3);
789                         }
790                         state = saved_state; // i.e. _state_string or _state_object_field
791                 }
792                 break;
793
794                 case json_tokener_state_escape_unicode_need_escape:
795                         // We get here after processing a high_surrogate
796                         // require a '\\' char
797                         if (!c || c != '\\')
798                         {
799                                 /* Got a high surrogate without another sequence following
800                                  * it.  Put a replacement char in for the high surrogate
801                                  * and pop back up to _state_string or _state_object_field.
802                                  */
803                                 printbuf_memappend_fast(tok->pb, (char *)utf8_replacement_char, 3);
804                                 tok->high_surrogate = 0;
805                                 tok->ucs_char = 0;
806                                 tok->st_pos = 0;
807                                 state = saved_state;
808                                 goto redo_char;
809                         }
810                         state = json_tokener_state_escape_unicode_need_u;
811                         break;
812
813                 case json_tokener_state_escape_unicode_need_u:
814                         /* We already had a \ char, check that it's \u */
815                         if (!c || c != 'u')
816                         {
817                                 /* Got a high surrogate with some non-unicode escape
818                                  * sequence following it.
819                                  * Put a replacement char in for the high surrogate
820                                  * and handle the escape sequence normally.
821                                  */
822                                 printbuf_memappend_fast(tok->pb, (char *)utf8_replacement_char, 3);
823                                 tok->high_surrogate = 0;
824                                 tok->ucs_char = 0;
825                                 tok->st_pos = 0;
826                                 state = json_tokener_state_string_escape;
827                                 goto redo_char;
828                         }
829                         state = json_tokener_state_escape_unicode;
830                         break;
831
832                         // ===================================================
833
834                 case json_tokener_state_boolean:
835                 {
836                         int size1, size2;
837                         printbuf_memappend_fast(tok->pb, &c, 1);
838                         size1 = json_min(tok->st_pos + 1, json_true_str_len);
839                         size2 = json_min(tok->st_pos + 1, json_false_str_len);
840                         if ((!(tok->flags & JSON_TOKENER_STRICT) &&
841                              strncasecmp(json_true_str, tok->pb->buf, size1) == 0) ||
842                             (strncmp(json_true_str, tok->pb->buf, size1) == 0))
843                         {
844                                 if (tok->st_pos == json_true_str_len)
845                                 {
846                                         current = json_object_new_boolean(1);
847                                         if (current == NULL)
848                                                 goto out;
849                                         saved_state = json_tokener_state_finish;
850                                         state = json_tokener_state_eatws;
851                                         goto redo_char;
852                                 }
853                         }
854                         else if ((!(tok->flags & JSON_TOKENER_STRICT) &&
855                                   strncasecmp(json_false_str, tok->pb->buf, size2) == 0) ||
856                                  (strncmp(json_false_str, tok->pb->buf, size2) == 0))
857                         {
858                                 if (tok->st_pos == json_false_str_len)
859                                 {
860                                         current = json_object_new_boolean(0);
861                                         if (current == NULL)
862                                                 goto out;
863                                         saved_state = json_tokener_state_finish;
864                                         state = json_tokener_state_eatws;
865                                         goto redo_char;
866                                 }
867                         }
868                         else
869                         {
870                                 tok->err = json_tokener_error_parse_boolean;
871                                 goto out;
872                         }
873                         tok->st_pos++;
874                 }
875                 break;
876
877                 case json_tokener_state_number:
878                 {
879                         /* Advance until we change state */
880                         const char *case_start = str;
881                         int case_len = 0;
882                         int is_exponent = 0;
883                         int neg_sign_ok = 1;
884                         int pos_sign_ok = 0;
885                         if (printbuf_length(tok->pb) > 0)
886                         {
887                                 /* We don't save all state from the previous incremental parse
888                                    so we need to re-generate it based on the saved string so far.
889                                  */
890                                 char *e_loc = strchr(tok->pb->buf, 'e');
891                                 if (!e_loc)
892                                         e_loc = strchr(tok->pb->buf, 'E');
893                                 if (e_loc)
894                                 {
895                                         char *last_saved_char =
896                                             &tok->pb->buf[printbuf_length(tok->pb) - 1];
897                                         is_exponent = 1;
898                                         pos_sign_ok = neg_sign_ok = 1;
899                                         /* If the "e" isn't at the end, we can't start with a '-' */
900                                         if (e_loc != last_saved_char)
901                                         {
902                                                 neg_sign_ok = 0;
903                                                 pos_sign_ok = 0;
904                                         }
905                                         // else leave it set to 1, i.e. start of the new input
906                                 }
907                         }
908
909                         while (c && ((c >= '0' && c <= '9') ||
910                                      (!is_exponent && (c == 'e' || c == 'E')) ||
911                                      (neg_sign_ok && c == '-') || (pos_sign_ok && c == '+') ||
912                                      (!tok->is_double && c == '.')))
913                         {
914                                 pos_sign_ok = neg_sign_ok = 0;
915                                 ++case_len;
916
917                                 /* non-digit characters checks */
918                                 /* note: since the main loop condition to get here was
919                                  * an input starting with 0-9 or '-', we are
920                                  * protected from input starting with '.' or
921                                  * e/E.
922                                  */
923                                 switch (c)
924                                 {
925                                 case '.':
926                                         tok->is_double = 1;
927                                         pos_sign_ok = 1;
928                                         neg_sign_ok = 1;
929                                         break;
930                                 case 'e': /* FALLTHRU */
931                                 case 'E':
932                                         is_exponent = 1;
933                                         tok->is_double = 1;
934                                         /* the exponent part can begin with a negative sign */
935                                         pos_sign_ok = neg_sign_ok = 1;
936                                         break;
937                                 default: break;
938                                 }
939
940                                 if (!ADVANCE_CHAR(str, tok) || !PEEK_CHAR(c, tok))
941                                 {
942                                         printbuf_memappend_fast(tok->pb, case_start, case_len);
943                                         goto out;
944                                 }
945                         }
946                         /*
947                                 Now we know c isn't a valid number char, but check whether
948                                 it might have been intended to be, and return a potentially
949                                 more understandable error right away.
950                                 However, if we're at the top-level, use the number as-is
951                             because c can be part of a new object to parse on the
952                                 next call to json_tokener_parse().
953                          */
954                         if (tok->depth > 0 && c != ',' && c != ']' && c != '}' && c != '/' &&
955                             c != 'I' && c != 'i' && !is_ws_char(c))
956                         {
957                                 tok->err = json_tokener_error_parse_number;
958                                 goto out;
959                         }
960                         if (case_len > 0)
961                                 printbuf_memappend_fast(tok->pb, case_start, case_len);
962
963                         // Check for -Infinity
964                         if (tok->pb->buf[0] == '-' && case_len <= 1 && (c == 'i' || c == 'I'))
965                         {
966                                 state = json_tokener_state_inf;
967                                 tok->st_pos = 0;
968                                 goto redo_char;
969                         }
970                         if (tok->is_double && !(tok->flags & JSON_TOKENER_STRICT))
971                         {
972                                 /* Trim some chars off the end, to allow things
973                                    like "123e+" to parse ok. */
974                                 while (printbuf_length(tok->pb) > 1)
975                                 {
976                                         char last_char = tok->pb->buf[printbuf_length(tok->pb) - 1];
977                                         if (last_char != 'e' && last_char != 'E' &&
978                                             last_char != '-' && last_char != '+')
979                                         {
980                                                 break;
981                                         }
982                                         tok->pb->buf[printbuf_length(tok->pb) - 1] = '\0';
983                                         printbuf_length(tok->pb)--;
984                                 }
985                         }
986                 }
987                         {
988                                 int64_t num64;
989                                 uint64_t numuint64;
990                                 double numd;
991                                 if (!tok->is_double && tok->pb->buf[0] == '-' &&
992                                     json_parse_int64(tok->pb->buf, &num64) == 0)
993                                 {
994                                         current = json_object_new_int64(num64);
995                                         if (current == NULL)
996                                                 goto out;
997                                 }
998                                 else if (!tok->is_double && tok->pb->buf[0] != '-' &&
999                                          json_parse_uint64(tok->pb->buf, &numuint64) == 0)
1000                                 {
1001                                         if (numuint64 && tok->pb->buf[0] == '0' &&
1002                                             (tok->flags & JSON_TOKENER_STRICT))
1003                                         {
1004                                                 tok->err = json_tokener_error_parse_number;
1005                                                 goto out;
1006                                         }
1007                                         if (numuint64 <= INT64_MAX)
1008                                         {
1009                                                 num64 = (uint64_t)numuint64;
1010                                                 current = json_object_new_int64(num64);
1011                                                 if (current == NULL)
1012                                                         goto out;
1013                                         }
1014                                         else
1015                                         {
1016                                                 current = json_object_new_uint64(numuint64);
1017                                                 if (current == NULL)
1018                                                         goto out;
1019                                         }
1020                                 }
1021                                 else if (tok->is_double &&
1022                                          json_tokener_parse_double(
1023                                              tok->pb->buf, printbuf_length(tok->pb), &numd) == 0)
1024                                 {
1025                                         current = json_object_new_double_s(numd, tok->pb->buf);
1026                                         if (current == NULL)
1027                                                 goto out;
1028                                 }
1029                                 else
1030                                 {
1031                                         tok->err = json_tokener_error_parse_number;
1032                                         goto out;
1033                                 }
1034                                 saved_state = json_tokener_state_finish;
1035                                 state = json_tokener_state_eatws;
1036                                 goto redo_char;
1037                         }
1038                         break;
1039
1040                 case json_tokener_state_array_after_sep:
1041                 case json_tokener_state_array:
1042                         if (c == ']')
1043                         {
1044                                 // Minimize memory usage; assume parsed objs are unlikely to be changed
1045                                 json_object_array_shrink(current, 0);
1046
1047                                 if (state == json_tokener_state_array_after_sep &&
1048                                     (tok->flags & JSON_TOKENER_STRICT))
1049                                 {
1050                                         tok->err = json_tokener_error_parse_unexpected;
1051                                         goto out;
1052                                 }
1053                                 saved_state = json_tokener_state_finish;
1054                                 state = json_tokener_state_eatws;
1055                         }
1056                         else
1057                         {
1058                                 if (tok->depth >= tok->max_depth - 1)
1059                                 {
1060                                         tok->err = json_tokener_error_depth;
1061                                         goto out;
1062                                 }
1063                                 state = json_tokener_state_array_add;
1064                                 tok->depth++;
1065                                 json_tokener_reset_level(tok, tok->depth);
1066                                 goto redo_char;
1067                         }
1068                         break;
1069
1070                 case json_tokener_state_array_add:
1071                         if (json_object_array_add(current, obj) != 0)
1072                                 goto out;
1073                         saved_state = json_tokener_state_array_sep;
1074                         state = json_tokener_state_eatws;
1075                         goto redo_char;
1076
1077                 case json_tokener_state_array_sep:
1078                         if (c == ']')
1079                         {
1080                                 // Minimize memory usage; assume parsed objs are unlikely to be changed
1081                                 json_object_array_shrink(current, 0);
1082
1083                                 saved_state = json_tokener_state_finish;
1084                                 state = json_tokener_state_eatws;
1085                         }
1086                         else if (c == ',')
1087                         {
1088                                 saved_state = json_tokener_state_array_after_sep;
1089                                 state = json_tokener_state_eatws;
1090                         }
1091                         else
1092                         {
1093                                 tok->err = json_tokener_error_parse_array;
1094                                 goto out;
1095                         }
1096                         break;
1097
1098                 case json_tokener_state_object_field_start:
1099                 case json_tokener_state_object_field_start_after_sep:
1100                         if (c == '}')
1101                         {
1102                                 if (state == json_tokener_state_object_field_start_after_sep &&
1103                                     (tok->flags & JSON_TOKENER_STRICT))
1104                                 {
1105                                         tok->err = json_tokener_error_parse_unexpected;
1106                                         goto out;
1107                                 }
1108                                 saved_state = json_tokener_state_finish;
1109                                 state = json_tokener_state_eatws;
1110                         }
1111                         else if (c == '"' || c == '\'')
1112                         {
1113                                 tok->quote_char = c;
1114                                 printbuf_reset(tok->pb);
1115                                 state = json_tokener_state_object_field;
1116                         }
1117                         else
1118                         {
1119                                 tok->err = json_tokener_error_parse_object_key_name;
1120                                 goto out;
1121                         }
1122                         break;
1123
1124                 case json_tokener_state_object_field:
1125                 {
1126                         /* Advance until we change state */
1127                         const char *case_start = str;
1128                         while (1)
1129                         {
1130                                 if (c == tok->quote_char)
1131                                 {
1132                                         printbuf_memappend_fast(tok->pb, case_start,
1133                                                                 str - case_start);
1134                                         obj_field_name = strdup(tok->pb->buf);
1135                                         saved_state = json_tokener_state_object_field_end;
1136                                         state = json_tokener_state_eatws;
1137                                         break;
1138                                 }
1139                                 else if (c == '\\')
1140                                 {
1141                                         printbuf_memappend_fast(tok->pb, case_start,
1142                                                                 str - case_start);
1143                                         saved_state = json_tokener_state_object_field;
1144                                         state = json_tokener_state_string_escape;
1145                                         break;
1146                                 }
1147                                 if (!ADVANCE_CHAR(str, tok) || !PEEK_CHAR(c, tok))
1148                                 {
1149                                         printbuf_memappend_fast(tok->pb, case_start,
1150                                                                 str - case_start);
1151                                         goto out;
1152                                 }
1153                         }
1154                 }
1155                 break;
1156
1157                 case json_tokener_state_object_field_end:
1158                         if (c == ':')
1159                         {
1160                                 saved_state = json_tokener_state_object_value;
1161                                 state = json_tokener_state_eatws;
1162                         }
1163                         else
1164                         {
1165                                 tok->err = json_tokener_error_parse_object_key_sep;
1166                                 goto out;
1167                         }
1168                         break;
1169
1170                 case json_tokener_state_object_value:
1171                         if (tok->depth >= tok->max_depth - 1)
1172                         {
1173                                 tok->err = json_tokener_error_depth;
1174                                 goto out;
1175                         }
1176                         state = json_tokener_state_object_value_add;
1177                         tok->depth++;
1178                         json_tokener_reset_level(tok, tok->depth);
1179                         goto redo_char;
1180
1181                 case json_tokener_state_object_value_add:
1182                         json_object_object_add(current, obj_field_name, obj);
1183                         free(obj_field_name);
1184                         obj_field_name = NULL;
1185                         saved_state = json_tokener_state_object_sep;
1186                         state = json_tokener_state_eatws;
1187                         goto redo_char;
1188
1189                 case json_tokener_state_object_sep:
1190                         /* { */
1191                         if (c == '}')
1192                         {
1193                                 saved_state = json_tokener_state_finish;
1194                                 state = json_tokener_state_eatws;
1195                         }
1196                         else if (c == ',')
1197                         {
1198                                 saved_state = json_tokener_state_object_field_start_after_sep;
1199                                 state = json_tokener_state_eatws;
1200                         }
1201                         else
1202                         {
1203                                 tok->err = json_tokener_error_parse_object_value_sep;
1204                                 goto out;
1205                         }
1206                         break;
1207                 }
1208                 (void)ADVANCE_CHAR(str, tok);
1209                 if (!c) // This is the char *before* advancing
1210                         break;
1211         } /* while(PEEK_CHAR) */
1212
1213 out:
1214         if ((tok->flags & JSON_TOKENER_VALIDATE_UTF8) && (nBytes != 0))
1215         {
1216                 tok->err = json_tokener_error_parse_utf8_string;
1217         }
1218         if (c && (state == json_tokener_state_finish) && (tok->depth == 0) &&
1219             (tok->flags & (JSON_TOKENER_STRICT | JSON_TOKENER_ALLOW_TRAILING_CHARS)) ==
1220                 JSON_TOKENER_STRICT)
1221         {
1222                 /* unexpected char after JSON data */
1223                 tok->err = json_tokener_error_parse_unexpected;
1224         }
1225         if (!c)
1226         {
1227                 /* We hit an eof char (0) */
1228                 if (state != json_tokener_state_finish && saved_state != json_tokener_state_finish)
1229                         tok->err = json_tokener_error_parse_eof;
1230         }
1231
1232 #ifdef HAVE_USELOCALE
1233         uselocale(oldlocale);
1234         freelocale(newloc);
1235 #elif defined(HAVE_SETLOCALE)
1236         setlocale(LC_NUMERIC, oldlocale);
1237         free(oldlocale);
1238 #endif
1239
1240         if (tok->err == json_tokener_success)
1241         {
1242                 json_object *ret = json_object_get(current);
1243                 int ii;
1244
1245                 /* Partially reset, so we parse additional objects on subsequent calls. */
1246                 for (ii = tok->depth; ii >= 0; ii--)
1247                         json_tokener_reset_level(tok, ii);
1248                 return ret;
1249         }
1250
1251         MC_DEBUG("json_tokener_parse_ex: error %s at offset %d\n", json_tokener_errors[tok->err],
1252                  tok->char_offset);
1253         return NULL;
1254 }
1255
1256 static json_bool json_tokener_validate_utf8(const char c, unsigned int *nBytes)
1257 {
1258         unsigned char chr = c;
1259         if (*nBytes == 0)
1260         {
1261                 if (chr >= 0x80)
1262                 {
1263                         if ((chr & 0xe0) == 0xc0)
1264                                 *nBytes = 1;
1265                         else if ((chr & 0xf0) == 0xe0)
1266                                 *nBytes = 2;
1267                         else if ((chr & 0xf8) == 0xf0)
1268                                 *nBytes = 3;
1269                         else
1270                                 return 0;
1271                 }
1272         }
1273         else
1274         {
1275                 if ((chr & 0xC0) != 0x80)
1276                         return 0;
1277                 (*nBytes)--;
1278         }
1279         return 1;
1280 }
1281
1282 void json_tokener_set_flags(struct json_tokener *tok, int flags)
1283 {
1284         tok->flags = flags;
1285 }
1286
1287 size_t json_tokener_get_parse_end(struct json_tokener *tok)
1288 {
1289         assert(tok->char_offset >= 0); /* Drop this line when char_offset becomes a size_t */
1290         return (size_t)tok->char_offset;
1291 }
1292
/*
 * Convert the text in buf to a double using strtod().
 *
 * buf    - NUL-terminated text to convert; strtod() reads until it meets a
 *          character that cannot be part of a number.
 * len    - number of characters at the start of buf that must make up the
 *          entire number.
 * retval - receives whatever strtod() returned, on success and failure alike.
 *
 * Returns 0 when strtod() consumed exactly len characters (and at least
 * one), 1 otherwise.
 *
 * NOTE(review): strtod() follows the current LC_NUMERIC locale for the
 * decimal point; callers are presumably expected to have selected a suitable
 * locale beforehand -- confirm.
 */
static int json_tokener_parse_double(const char *buf, int len, double *retval)
{
	char *end;

	*retval = strtod(buf, &end);

	/*
	 * end == buf means strtod() performed no conversion at all.  Without
	 * this check an empty buffer (len == 0) would satisfy the length test
	 * below and be reported as a successfully parsed 0.0.
	 */
	if (end == buf)
		return 1;

	return (buf + len == end) ? 0 : 1;
}