regardless of the database encoding, and are checked only for syntactic
correctness (that is, that four hex digits follow <literal>\u</>).
However, the input function for <type>jsonb</> is stricter: it disallows
- Unicode escapes for non-ASCII characters (those
- above <literal>U+007F</>) unless the database encoding is UTF8. It also
- insists that any use of Unicode surrogate pairs to designate characters
- outside the Unicode Basic Multilingual Plane be correct. Valid Unicode
- escapes, except for <literal>\u0000</>, are then converted to the
- equivalent ASCII or UTF8 character for storage.
+ Unicode escapes for non-ASCII characters (those above <literal>U+007F</>)
+ unless the database encoding is UTF8. The <type>jsonb</> type also
+ rejects <literal>\u0000</> (because that cannot be represented in
+ <productname>PostgreSQL</productname>'s <type>text</> type), and it insists
+ that any use of Unicode surrogate pairs to designate characters outside
+ the Unicode Basic Multilingual Plane be correct. Valid Unicode escapes
+ are converted to the equivalent ASCII or UTF8 character for storage;
+ this includes folding surrogate pairs into a single character.
</para>
<note>
constitutes valid <type>jsonb</type> data that do not apply to
the <type>json</type> type, nor to JSON in the abstract, corresponding
to limits on what can be represented by the underlying data type.
- Specifically, <type>jsonb</> will reject numbers that are outside the
+ Notably, <type>jsonb</> will reject numbers that are outside the
range of the <productname>PostgreSQL</productname> <type>numeric</> data
type, while <type>json</> will not. Such implementation-defined
restrictions are permitted by <acronym>RFC</> 7159. However, in
<row>
<entry><type>string</></entry>
<entry><type>text</></entry>
- <entry>See notes above concerning encoding restrictions</entry>
+ <entry><literal>\u0000</> is disallowed, as are non-ASCII Unicode
+ escapes if the database encoding is not UTF8</entry>
</row>
<row>
<entry><type>number</></entry>
</para>
</listitem>
- <listitem>
- <para>
- Unicode escapes in <link linkend="datatype-json"><type>JSON</type></link>
- text values are no longer rendered with the backslash escaped
- (Andrew Dunstan)
- </para>
-
- <para>
- Previously, all backslashes in text values being formed into JSON
- were escaped. Now a backslash followed by <literal>u</> and four
- hexadecimal digits is not escaped, as this is a legal sequence in a
- JSON string value, and escaping the backslash led to some perverse
- results.
- </para>
- </listitem>
-
<listitem>
<para>
When converting values of type <type>date</>, <type>timestamp</>
* For UTF8, replace the escape sequence by the actual
* utf8 character in lex->strval. Do this also for other
* encodings if the escape designates an ASCII character,
- * otherwise raise an error. We don't ever unescape a
- * \u0000, since that would result in an impermissible nul
- * byte.
+ * otherwise raise an error.
*/
if (ch == 0)
{
- appendStringInfoString(lex->strval, "\\u0000");
+ /* We can't allow this, since our TEXT type doesn't */
+ ereport(ERROR,
+ (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
+ errmsg("unsupported Unicode escape sequence"),
+ errdetail("\\u0000 cannot be converted to text."),
+ report_json_context(lex)));
}
else if (GetDatabaseEncoding() == PG_UTF8)
{
else
{
ereport(ERROR,
- (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
- errmsg("invalid input syntax for type json"),
+ (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
+ errmsg("unsupported Unicode escape sequence"),
errdetail("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8."),
report_json_context(lex)));
}
/*
* We need to get the output function for everything except date and
- * timestamp types, array and composite types, booleans,
- * and non-builtin types where there's a cast to json.
+ * timestamp types, array and composite types, booleans, and non-builtin
+ * types where there's a cast to json.
*/
switch (typoid)
/* but let's look for a cast to json, if it's not built-in */
if (typoid >= FirstNormalObjectId)
{
- Oid castfunc;
+ Oid castfunc;
CoercionPathType ctype;
ctype = find_coercion_pathway(JSONOID, typoid,
- COERCION_EXPLICIT, &castfunc);
+ COERCION_EXPLICIT,
+ &castfunc);
if (ctype == COERCION_PATH_FUNC && OidIsValid(castfunc))
{
*tcategory = JSONTYPE_CAST;
appendStringInfoString(buf, "\\\"");
break;
case '\\':
-
- /*
- * Unicode escapes are passed through as is. There is no
- * requirement that they denote a valid character in the
- * server encoding - indeed that is a big part of their
- * usefulness.
- *
- * All we require is that they consist of \uXXXX where the Xs
- * are hexadecimal digits. It is the responsibility of the
- * caller of, say, to_json() to make sure that the unicode
- * escape is valid.
- *
- * In the case of a jsonb string value being escaped, the only
- * unicode escape that should be present is \u0000, all the
- * other unicode escapes will have been resolved.
- */
- if (p[1] == 'u' &&
- isxdigit((unsigned char) p[2]) &&
- isxdigit((unsigned char) p[3]) &&
- isxdigit((unsigned char) p[4]) &&
- isxdigit((unsigned char) p[5]))
- appendStringInfoCharMacro(buf, *p);
- else
- appendStringInfoString(buf, "\\\\");
+ appendStringInfoString(buf, "\\\\");
break;
default:
if ((unsigned char) *p < ' ')
(1 row)
COMMIT;
--- unicode escape - backslash is not escaped
-select to_json(text '\uabcd');
- to_json
-----------
- "\uabcd"
-(1 row)
-
--- any other backslash is escaped
-select to_json(text '\abcd');
- to_json
-----------
- "\\abcd"
-(1 row)
-
--json_agg
SELECT json_agg(q)
FROM ( SELECT $$a$$ || x AS b, y AS c,
DETAIL: Unicode low surrogate must follow a high surrogate.
CONTEXT: JSON data, line 1: { "a":...
--handling of simple unicode escapes
+select json '{ "a": "the Copyright \u00a9 sign" }' as correct_in_utf8;
+ correct_in_utf8
+---------------------------------------
+ { "a": "the Copyright \u00a9 sign" }
+(1 row)
+
+select json '{ "a": "dollar \u0024 character" }' as correct_everywhere;
+ correct_everywhere
+-------------------------------------
+ { "a": "dollar \u0024 character" }
+(1 row)
+
+select json '{ "a": "dollar \\u0024 character" }' as not_an_escape;
+ not_an_escape
+--------------------------------------
+ { "a": "dollar \\u0024 character" }
+(1 row)
+
+select json '{ "a": "null \u0000 escape" }' as not_unescaped;
+ not_unescaped
+--------------------------------
+ { "a": "null \u0000 escape" }
+(1 row)
+
+select json '{ "a": "null \\u0000 escape" }' as not_an_escape;
+ not_an_escape
+---------------------------------
+ { "a": "null \\u0000 escape" }
+(1 row)
+
select json '{ "a": "the Copyright \u00a9 sign" }' ->> 'a' as correct_in_utf8;
correct_in_utf8
----------------------
dollar $ character
(1 row)
-select json '{ "a": "null \u0000 escape" }' ->> 'a' as not_unescaped;
- not_unescaped
+select json '{ "a": "dollar \\u0024 character" }' ->> 'a' as not_an_escape;
+ not_an_escape
+-------------------------
+ dollar \u0024 character
+(1 row)
+
+select json '{ "a": "null \u0000 escape" }' ->> 'a' as fails;
+ERROR: unsupported Unicode escape sequence
+DETAIL: \u0000 cannot be converted to text.
+CONTEXT: JSON data, line 1: { "a":...
+select json '{ "a": "null \\u0000 escape" }' ->> 'a' as not_an_escape;
+ not_an_escape
--------------------
null \u0000 escape
(1 row)
(1 row)
COMMIT;
--- unicode escape - backslash is not escaped
-select to_json(text '\uabcd');
- to_json
-----------
- "\uabcd"
-(1 row)
-
--- any other backslash is escaped
-select to_json(text '\abcd');
- to_json
-----------
- "\\abcd"
-(1 row)
-
--json_agg
SELECT json_agg(q)
FROM ( SELECT $$a$$ || x AS b, y AS c,
-- handling of unicode surrogate pairs
select json '{ "a": "\ud83d\ude04\ud83d\udc36" }' -> 'a' as correct_in_utf8;
-ERROR: invalid input syntax for type json
+ERROR: unsupported Unicode escape sequence
DETAIL: Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8.
CONTEXT: JSON data, line 1: { "a":...
select json '{ "a": "\ud83d\ud83d" }' -> 'a'; -- 2 high surrogates in a row
DETAIL: Unicode low surrogate must follow a high surrogate.
CONTEXT: JSON data, line 1: { "a":...
--handling of simple unicode escapes
+select json '{ "a": "the Copyright \u00a9 sign" }' as correct_in_utf8;
+ correct_in_utf8
+---------------------------------------
+ { "a": "the Copyright \u00a9 sign" }
+(1 row)
+
+select json '{ "a": "dollar \u0024 character" }' as correct_everywhere;
+ correct_everywhere
+-------------------------------------
+ { "a": "dollar \u0024 character" }
+(1 row)
+
+select json '{ "a": "dollar \\u0024 character" }' as not_an_escape;
+ not_an_escape
+--------------------------------------
+ { "a": "dollar \\u0024 character" }
+(1 row)
+
+select json '{ "a": "null \u0000 escape" }' as not_unescaped;
+ not_unescaped
+--------------------------------
+ { "a": "null \u0000 escape" }
+(1 row)
+
+select json '{ "a": "null \\u0000 escape" }' as not_an_escape;
+ not_an_escape
+---------------------------------
+ { "a": "null \\u0000 escape" }
+(1 row)
+
select json '{ "a": "the Copyright \u00a9 sign" }' ->> 'a' as correct_in_utf8;
-ERROR: invalid input syntax for type json
+ERROR: unsupported Unicode escape sequence
DETAIL: Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8.
CONTEXT: JSON data, line 1: { "a":...
select json '{ "a": "dollar \u0024 character" }' ->> 'a' as correct_everywhere;
dollar $ character
(1 row)
-select json '{ "a": "null \u0000 escape" }' ->> 'a' as not_unescaped;
- not_unescaped
+select json '{ "a": "dollar \\u0024 character" }' ->> 'a' as not_an_escape;
+ not_an_escape
+-------------------------
+ dollar \u0024 character
+(1 row)
+
+select json '{ "a": "null \u0000 escape" }' ->> 'a' as fails;
+ERROR: unsupported Unicode escape sequence
+DETAIL: \u0000 cannot be converted to text.
+CONTEXT: JSON data, line 1: { "a":...
+select json '{ "a": "null \\u0000 escape" }' ->> 'a' as not_an_escape;
+ not_an_escape
--------------------
null \u0000 escape
(1 row)
^
DETAIL: "\u" must be followed by four hexadecimal digits.
CONTEXT: JSON data, line 1: "\u000g...
-SELECT '"\u0000"'::jsonb; -- OK, legal escape
- jsonb
-----------
- "\u0000"
+SELECT '"\u0045"'::jsonb; -- OK, legal escape
+ jsonb
+-------
+ "E"
(1 row)
+SELECT '"\u0000"'::jsonb; -- ERROR, we don't support U+0000
+ERROR: unsupported Unicode escape sequence
+LINE 1: SELECT '"\u0000"'::jsonb;
+ ^
+DETAIL: \u0000 cannot be converted to text.
+CONTEXT: JSON data, line 1: ...
-- use octet_length here so we don't get an odd unicode char in the
-- output
SELECT octet_length('"\uaBcD"'::jsonb::text); -- OK, uppercase and lower case both OK
(1 row)
COMMIT;
--- unicode escape - backslash is not escaped
-select to_jsonb(text '\uabcd');
- to_jsonb
-----------
- "\uabcd"
-(1 row)
-
--- any other backslash is escaped
-select to_jsonb(text '\abcd');
- to_jsonb
-----------
- "\\abcd"
-(1 row)
-
--jsonb_agg
CREATE TEMP TABLE rows AS
SELECT x, 'txt' || x as y
DETAIL: Unicode low surrogate must follow a high surrogate.
CONTEXT: JSON data, line 1: { "a":...
-- handling of simple unicode escapes
-SELECT jsonb '{ "a": "the Copyright \u00a9 sign" }' ->> 'a' AS correct_in_utf8;
+SELECT jsonb '{ "a": "the Copyright \u00a9 sign" }' as correct_in_utf8;
+ correct_in_utf8
+-------------------------------
+ {"a": "the Copyright © sign"}
+(1 row)
+
+SELECT jsonb '{ "a": "dollar \u0024 character" }' as correct_everywhere;
+ correct_everywhere
+-----------------------------
+ {"a": "dollar $ character"}
+(1 row)
+
+SELECT jsonb '{ "a": "dollar \\u0024 character" }' as not_an_escape;
+ not_an_escape
+-----------------------------------
+ {"a": "dollar \\u0024 character"}
+(1 row)
+
+SELECT jsonb '{ "a": "null \u0000 escape" }' as fails;
+ERROR: unsupported Unicode escape sequence
+LINE 1: SELECT jsonb '{ "a": "null \u0000 escape" }' as fails;
+ ^
+DETAIL: \u0000 cannot be converted to text.
+CONTEXT: JSON data, line 1: { "a":...
+SELECT jsonb '{ "a": "null \\u0000 escape" }' as not_an_escape;
+ not_an_escape
+------------------------------
+ {"a": "null \\u0000 escape"}
+(1 row)
+
+SELECT jsonb '{ "a": "the Copyright \u00a9 sign" }' ->> 'a' as correct_in_utf8;
correct_in_utf8
----------------------
the Copyright © sign
(1 row)
-SELECT jsonb '{ "a": "dollar \u0024 character" }' ->> 'a' AS correct_everyWHERE;
+SELECT jsonb '{ "a": "dollar \u0024 character" }' ->> 'a' as correct_everywhere;
correct_everywhere
--------------------
dollar $ character
(1 row)
-SELECT jsonb '{ "a": "null \u0000 escape" }' ->> 'a' AS not_unescaped;
- not_unescaped
+SELECT jsonb '{ "a": "dollar \\u0024 character" }' ->> 'a' as not_an_escape;
+ not_an_escape
+-------------------------
+ dollar \u0024 character
+(1 row)
+
+SELECT jsonb '{ "a": "null \u0000 escape" }' ->> 'a' as fails;
+ERROR: unsupported Unicode escape sequence
+LINE 1: SELECT jsonb '{ "a": "null \u0000 escape" }' ->> 'a' as fai...
+ ^
+DETAIL: \u0000 cannot be converted to text.
+CONTEXT: JSON data, line 1: { "a":...
+SELECT jsonb '{ "a": "null \\u0000 escape" }' ->> 'a' as not_an_escape;
+ not_an_escape
--------------------
null \u0000 escape
(1 row)
^
DETAIL: "\u" must be followed by four hexadecimal digits.
CONTEXT: JSON data, line 1: "\u000g...
-SELECT '"\u0000"'::jsonb; -- OK, legal escape
- jsonb
-----------
- "\u0000"
+SELECT '"\u0045"'::jsonb; -- OK, legal escape
+ jsonb
+-------
+ "E"
(1 row)
+SELECT '"\u0000"'::jsonb; -- ERROR, we don't support U+0000
+ERROR: unsupported Unicode escape sequence
+LINE 1: SELECT '"\u0000"'::jsonb;
+ ^
+DETAIL: \u0000 cannot be converted to text.
+CONTEXT: JSON data, line 1: ...
-- use octet_length here so we don't get an odd unicode char in the
-- output
SELECT octet_length('"\uaBcD"'::jsonb::text); -- OK, uppercase and lower case both OK
-ERROR: invalid input syntax for type json
+ERROR: unsupported Unicode escape sequence
LINE 1: SELECT octet_length('"\uaBcD"'::jsonb::text);
^
DETAIL: Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8.
(1 row)
COMMIT;
--- unicode escape - backslash is not escaped
-select to_jsonb(text '\uabcd');
- to_jsonb
-----------
- "\uabcd"
-(1 row)
-
--- any other backslash is escaped
-select to_jsonb(text '\abcd');
- to_jsonb
-----------
- "\\abcd"
-(1 row)
-
--jsonb_agg
CREATE TEMP TABLE rows AS
SELECT x, 'txt' || x as y
-- handling of unicode surrogate pairs
SELECT octet_length((jsonb '{ "a": "\ud83d\ude04\ud83d\udc36" }' -> 'a')::text) AS correct_in_utf8;
-ERROR: invalid input syntax for type json
+ERROR: unsupported Unicode escape sequence
LINE 1: SELECT octet_length((jsonb '{ "a": "\ud83d\ude04\ud83d\udc3...
^
DETAIL: Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8.
DETAIL: Unicode low surrogate must follow a high surrogate.
CONTEXT: JSON data, line 1: { "a":...
-- handling of simple unicode escapes
-SELECT jsonb '{ "a": "the Copyright \u00a9 sign" }' ->> 'a' AS correct_in_utf8;
-ERROR: invalid input syntax for type json
+SELECT jsonb '{ "a": "the Copyright \u00a9 sign" }' as correct_in_utf8;
+ERROR: unsupported Unicode escape sequence
+LINE 1: SELECT jsonb '{ "a": "the Copyright \u00a9 sign" }' as corr...
+ ^
+DETAIL: Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8.
+CONTEXT: JSON data, line 1: { "a":...
+SELECT jsonb '{ "a": "dollar \u0024 character" }' as correct_everywhere;
+ correct_everywhere
+-----------------------------
+ {"a": "dollar $ character"}
+(1 row)
+
+SELECT jsonb '{ "a": "dollar \\u0024 character" }' as not_an_escape;
+ not_an_escape
+-----------------------------------
+ {"a": "dollar \\u0024 character"}
+(1 row)
+
+SELECT jsonb '{ "a": "null \u0000 escape" }' as fails;
+ERROR: unsupported Unicode escape sequence
+LINE 1: SELECT jsonb '{ "a": "null \u0000 escape" }' as fails;
+ ^
+DETAIL: \u0000 cannot be converted to text.
+CONTEXT: JSON data, line 1: { "a":...
+SELECT jsonb '{ "a": "null \\u0000 escape" }' as not_an_escape;
+ not_an_escape
+------------------------------
+ {"a": "null \\u0000 escape"}
+(1 row)
+
+SELECT jsonb '{ "a": "the Copyright \u00a9 sign" }' ->> 'a' as correct_in_utf8;
+ERROR: unsupported Unicode escape sequence
LINE 1: SELECT jsonb '{ "a": "the Copyright \u00a9 sign" }' ->> 'a'...
^
DETAIL: Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8.
CONTEXT: JSON data, line 1: { "a":...
-SELECT jsonb '{ "a": "dollar \u0024 character" }' ->> 'a' AS correct_everyWHERE;
+SELECT jsonb '{ "a": "dollar \u0024 character" }' ->> 'a' as correct_everywhere;
correct_everywhere
--------------------
dollar $ character
(1 row)
-SELECT jsonb '{ "a": "null \u0000 escape" }' ->> 'a' AS not_unescaped;
- not_unescaped
+SELECT jsonb '{ "a": "dollar \\u0024 character" }' ->> 'a' as not_an_escape;
+ not_an_escape
+-------------------------
+ dollar \u0024 character
+(1 row)
+
+SELECT jsonb '{ "a": "null \u0000 escape" }' ->> 'a' as fails;
+ERROR: unsupported Unicode escape sequence
+LINE 1: SELECT jsonb '{ "a": "null \u0000 escape" }' ->> 'a' as fai...
+ ^
+DETAIL: \u0000 cannot be converted to text.
+CONTEXT: JSON data, line 1: { "a":...
+SELECT jsonb '{ "a": "null \\u0000 escape" }' ->> 'a' as not_an_escape;
+ not_an_escape
--------------------
null \u0000 escape
(1 row)
select to_json(timestamptz '2014-05-28 12:22:35.614298-04');
COMMIT;
--- unicode escape - backslash is not escaped
-
-select to_json(text '\uabcd');
-
--- any other backslash is escaped
-
-select to_json(text '\abcd');
-
--json_agg
SELECT json_agg(q)
--handling of simple unicode escapes
+select json '{ "a": "the Copyright \u00a9 sign" }' as correct_in_utf8;
+select json '{ "a": "dollar \u0024 character" }' as correct_everywhere;
+select json '{ "a": "dollar \\u0024 character" }' as not_an_escape;
+select json '{ "a": "null \u0000 escape" }' as not_unescaped;
+select json '{ "a": "null \\u0000 escape" }' as not_an_escape;
+
select json '{ "a": "the Copyright \u00a9 sign" }' ->> 'a' as correct_in_utf8;
select json '{ "a": "dollar \u0024 character" }' ->> 'a' as correct_everywhere;
-select json '{ "a": "null \u0000 escape" }' ->> 'a' as not_unescaped;
+select json '{ "a": "dollar \\u0024 character" }' ->> 'a' as not_an_escape;
+select json '{ "a": "null \u0000 escape" }' ->> 'a' as fails;
+select json '{ "a": "null \\u0000 escape" }' ->> 'a' as not_an_escape;
--json_typeof() function
select value, json_typeof(value)
SELECT '"\u"'::jsonb; -- ERROR, incomplete escape
SELECT '"\u00"'::jsonb; -- ERROR, incomplete escape
SELECT '"\u000g"'::jsonb; -- ERROR, g is not a hex digit
-SELECT '"\u0000"'::jsonb; -- OK, legal escape
+SELECT '"\u0045"'::jsonb; -- OK, legal escape
+SELECT '"\u0000"'::jsonb; -- ERROR, we don't support U+0000
-- use octet_length here so we don't get an odd unicode char in the
-- output
SELECT octet_length('"\uaBcD"'::jsonb::text); -- OK, uppercase and lower case both OK
select to_jsonb(timestamptz '2014-05-28 12:22:35.614298-04');
COMMIT;
--- unicode escape - backslash is not escaped
-
-select to_jsonb(text '\uabcd');
-
--- any other backslash is escaped
-
-select to_jsonb(text '\abcd');
-
--jsonb_agg
CREATE TEMP TABLE rows AS
SELECT jsonb '{ "a": "\ude04X" }' -> 'a'; -- orphan low surrogate
-- handling of simple unicode escapes
-SELECT jsonb '{ "a": "the Copyright \u00a9 sign" }' ->> 'a' AS correct_in_utf8;
-SELECT jsonb '{ "a": "dollar \u0024 character" }' ->> 'a' AS correct_everyWHERE;
-SELECT jsonb '{ "a": "null \u0000 escape" }' ->> 'a' AS not_unescaped;
+
+SELECT jsonb '{ "a": "the Copyright \u00a9 sign" }' as correct_in_utf8;
+SELECT jsonb '{ "a": "dollar \u0024 character" }' as correct_everywhere;
+SELECT jsonb '{ "a": "dollar \\u0024 character" }' as not_an_escape;
+SELECT jsonb '{ "a": "null \u0000 escape" }' as fails;
+SELECT jsonb '{ "a": "null \\u0000 escape" }' as not_an_escape;
+
+SELECT jsonb '{ "a": "the Copyright \u00a9 sign" }' ->> 'a' as correct_in_utf8;
+SELECT jsonb '{ "a": "dollar \u0024 character" }' ->> 'a' as correct_everywhere;
+SELECT jsonb '{ "a": "dollar \\u0024 character" }' ->> 'a' as not_an_escape;
+SELECT jsonb '{ "a": "null \u0000 escape" }' ->> 'a' as fails;
+SELECT jsonb '{ "a": "null \\u0000 escape" }' ->> 'a' as not_an_escape;
-- jsonb_to_record and jsonb_to_recordset