From 89e26969ae5bc7a259c1bd150f4d58c67800424b Mon Sep 17 00:00:00 2001
From: Stephen Dolan <mu@netsoc.tcd.ie>
Date: Thu, 27 Dec 2012 20:49:34 +0000
Subject: [PATCH] @foo syntax for encoding of strings into various formats.

Fixes part of #47 and #48.
---
 builtin.c                        | 192 +++++++++++++++++++++++++++++++
 docs/content/3.manual/manual.yml | 141 ++++++++++++++++++-------
 lexer.l                          |   3 +
 parser.y                         |  20 +++-
 testdata                         |  14 +++
 5 files changed, 328 insertions(+), 42 deletions(-)

diff --git a/builtin.c b/builtin.c
index 4f9fab5..c431be0 100644
--- a/builtin.c
+++ b/builtin.c
@@ -5,6 +5,8 @@
 #include "parser.h"
 #include "locfile.h"
 #include "jv_aux.h"
+#include "jv_unicode.h"
+
 
 typedef jv (*func_1)(jv);
 
@@ -207,6 +209,195 @@ static jv f_tostring(jv input) {
   }
 }
 
+#define CHARS_ALPHANUM "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
+
+/*
+ * Returns a copy of the string `input` in which each ASCII character listed
+ * in `escapings` is replaced by its escape sequence; everything else is
+ * copied through unchanged. Frees `input`.
+ *
+ * `escapings` is a packed table: each entry is the character to escape
+ * immediately followed by its NUL-terminated replacement, and the table
+ * stops at the first entry that starts with NUL (e.g. "''\\''\0" maps
+ * ' to '\'').
+ */
+static jv escape_string(jv input, const char* escapings) {
+
+  assert(jv_get_kind(input) == JV_KIND_STRING);
+  const char* lookup[128] = {0};
+  const char* p = escapings;
+  while (*p) {
+    lookup[(int)*p] = p+1;
+    p++;
+    p += strlen(p);
+    p++;
+  }
+
+  jv ret = jv_string("");
+  const char* i = jv_string_value(input);
+  const char* end = i + jv_string_length(jv_copy(input));
+  const char* cstart;
+  int c = 0;
+  while ((i = jvp_utf8_next((cstart = i), end, &c))) {
+    assert(c != -1);
+    if (c < 128 && lookup[c]) {
+      ret = jv_string_append_str(ret, lookup[c]);
+    } else {
+      ret = jv_string_append_buf(ret, cstart, i - cstart);
+    }
+  }
+  jv_free(input);
+  return ret;
+
+}
+
+/*
+ * Backs the @foo syntax: renders `input` in the format named by the string
+ * `fmt` ("json", "text", "csv", "html", "uri", "sh" or "base64"). Every path
+ * frees or hands off both arguments. Returns the formatted string, or an
+ * error value when `fmt` is not a string, names no known format, or the
+ * input has a kind the chosen format cannot render.
+ */
+static jv f_format(jv input, jv fmt) {
+  if (jv_get_kind(fmt) != JV_KIND_STRING) {
+    jv_free(input);
+    return type_error(fmt, "is not a valid format");
+  }
+  const char* fmt_s = jv_string_value(fmt);
+  if (!strcmp(fmt_s, "json")) {
+    jv_free(fmt);
+    return jv_dump_string(input, 0);
+  } else if (!strcmp(fmt_s, "text")) {
+    jv_free(fmt);
+    return f_tostring(input);
+  } else if (!strcmp(fmt_s, "csv")) {
+    jv_free(fmt);
+    if (jv_get_kind(input) != JV_KIND_ARRAY)
+      return type_error(input, "cannot be csv-formatted, only array");
+    jv line = jv_string("");
+    for (int i=0; i<jv_array_length(jv_copy(input)); i++) {
+      if (i) line = jv_string_append_str(line, ",");
+      jv x = jv_array_get(jv_copy(input), i);
+      switch (jv_get_kind(x)) {
+      case JV_KIND_NULL:
+        /* null rendered as empty string */
+        jv_free(x);
+        break;
+      case JV_KIND_TRUE:
+      case JV_KIND_FALSE:
+        line = jv_string_concat(line, jv_dump_string(x, 0));
+        break;
+      case JV_KIND_NUMBER:
+        if (jv_number_value(x) != jv_number_value(x)) {
+          /* NaN, render as empty string */
+          jv_free(x);
+        } else {
+          line = jv_string_concat(line, jv_dump_string(x, 0));
+        }
+        break;
+      case JV_KIND_STRING: {
+        line = jv_string_append_str(line, "\"");
+        line = jv_string_concat(line, escape_string(x, "\"\"\"\0"));
+        line = jv_string_append_str(line, "\"");
+        break;
+      }
+      default:
+        jv_free(input);
+        jv_free(line);
+        return type_error(x, "is not valid in a csv row");
+      }
+    }
+    jv_free(input);
+    return line;
+  } else if (!strcmp(fmt_s, "html")) {
+    jv_free(fmt);
+    return escape_string(f_tostring(input), "&&amp;\0<&lt;\0>&gt;\0'&#39;\0\"&quot;\0");
+  } else if (!strcmp(fmt_s, "uri")) {
+    jv_free(fmt);
+    input = f_tostring(input);
+
+    int unreserved[128] = {0};
+    const char* p = CHARS_ALPHANUM "-_.!~*'()";
+    while (*p) unreserved[(int)*p++] = 1;
+
+    jv line = jv_string("");
+    const char* s = jv_string_value(input);
+    for (int i=0; i<jv_string_length(jv_copy(input)); i++) {
+      /* Go through unsigned char: plain char may be signed, and a
+         sign-extended byte >= 0x80 would be emitted as %ffffffxx. */
+      unsigned ch = (unsigned)(unsigned char)*s;
+      if (ch < 128 && unreserved[ch]) {
+        line = jv_string_append_buf(line, s, 1);
+      } else {
+        line = jv_string_concat(line, jv_string_fmt("%%%02x", ch));
+      }
+      s++;
+    }
+    jv_free(input);
+    return line;
+  } else if (!strcmp(fmt_s, "sh")) {
+    jv_free(fmt);
+    if (jv_get_kind(input) != JV_KIND_ARRAY)
+      input = jv_array_set(jv_array(), 0, input);
+    jv line = jv_string("");
+    for (int i=0; i<jv_array_length(jv_copy(input)); i++) {
+      if (i) line = jv_string_append_str(line, " ");
+      jv x = jv_array_get(jv_copy(input), i);
+      switch (jv_get_kind(x)) {
+      case JV_KIND_NULL:
+      case JV_KIND_TRUE:
+      case JV_KIND_FALSE:
+      case JV_KIND_NUMBER:
+        line = jv_string_concat(line, jv_dump_string(x, 0));
+        break;
+
+      case JV_KIND_STRING: {
+        line = jv_string_append_str(line, "'");
+        line = jv_string_concat(line, escape_string(x, "''\\''\0"));
+        line = jv_string_append_str(line, "'");
+        break;
+      }
+
+      default:
+        jv_free(input);
+        jv_free(line);
+        return type_error(x, "can not be escaped for shell");
+      }
+    }
+    jv_free(input);
+    return line;
+  } else if (!strcmp(fmt_s, "base64")) {
+    jv_free(fmt);
+    input = f_tostring(input);
+    jv line = jv_string("");
+    const char b64[64 + 1] = CHARS_ALPHANUM "+/";
+    const char* data = jv_string_value(input);
+    int len = jv_string_length(jv_copy(input));
+    for (int i=0; i<len; i+=3) {
+      uint32_t code = 0;
+      int n = len - i >= 3 ? 3 : len-i;
+      for (int j=0; j<3; j++) {
+        code <<= 8;
+        /* unsigned char cast keeps bytes >= 0x80 from sign-extending
+           and overwriting the octets already shifted into code */
+        code |= j < n ? (unsigned)(unsigned char)data[i+j] : 0;
+      }
+      char buf[4];
+      for (int j=0; j<4; j++) {
+        buf[j] = b64[(code >> (18 - j*6)) & 0x3f];
+      }
+      if (n < 3) buf[3] = '=';
+      if (n < 2) buf[2] = '=';
+      line = jv_string_append_buf(line, buf, sizeof(buf));
+    }
+    jv_free(input);
+    return line;
+  } else {
+    jv_free(input);
+    return jv_invalid_with_msg(jv_string_concat(fmt, jv_string(" is not a valid format")));
+  }
+}
+
 static jv f_keys(jv input) {
   if (jv_get_kind(input) == JV_KIND_OBJECT || jv_get_kind(input) == JV_KIND_ARRAY) {
     return jv_keys(input);
@@ -332,6 +523,7 @@ static struct cfunction function_list[] = {
   {(cfunction_ptr)f_min_by_impl, "_min_by_impl", 2},
   {(cfunction_ptr)f_max_by_impl, "_max_by_impl", 2},
   {(cfunction_ptr)f_error, "error", 2},
+  {(cfunction_ptr)f_format, "format", 2},
 };
 
 static struct symbol_table cbuiltins =
diff --git a/docs/content/3.manual/manual.yml b/docs/content/3.manual/manual.yml
index cf5de52..55f43a0 100644
--- a/docs/content/3.manual/manual.yml
+++ b/docs/content/3.manual/manual.yml
@@ -75,51 +75,51 @@ sections:
       You can affect how jq reads and writes its input and output
using some command-line options: - * `--slurp`/`-s`: + * `--slurp`/`-s`: - Instead of running the filter for each JSON object in the - input, read the entire input stream into a large array and run - the filter just once. - - * `--raw-input`/`-R`: - - Don't parse the input as JSON. Instead, each line of text is - passed to the filter as a string. If combined with `--slurp`, - then the entire input is passed to the filter as a single long - string. + Instead of running the filter for each JSON object in the + input, read the entire input stream into a large array and run + the filter just once. + + * `--raw-input`/`-R`: + + Don't parse the input as JSON. Instead, each line of text is + passed to the filter as a string. If combined with `--slurp`, + then the entire input is passed to the filter as a single long + string. - * `--null-input`/`-n`: + * `--null-input`/`-n`: - Don't read any input at all! Instead, the filter is run once - using `null` as the input. This is useful when using jq as a - simple calculator or to construct JSON data from scratch. + Don't read any input at all! Instead, the filter is run once + using `null` as the input. This is useful when using jq as a + simple calculator or to construct JSON data from scratch. - * `--compact-output` / `-c`: + * `--compact-output` / `-c`: - By default, jq pretty-prints JSON output. Using this option - will result in more compact output by instead putting each - JSON object on a single line. + By default, jq pretty-prints JSON output. Using this option + will result in more compact output by instead putting each + JSON object on a single line. - * `--colour-output` / `-C` and `--monochrome-output` / `-M`: - - By default, jq outputs colored JSON if writing to a - terminal. You can force it to produce color even if writing to - a pipe or a file using `-C`, and disable color with `-M`. + * `--colour-output` / `-C` and `--monochrome-output` / `-M`: + + By default, jq outputs colored JSON if writing to a + terminal. 
You can force it to produce color even if writing to + a pipe or a file using `-C`, and disable color with `-M`. - * `--ascii-output` / `-a`: + * `--ascii-output` / `-a`: - jq usually outputs non-ASCII Unicode codepoints as UTF-8, even - if the input specified them as escape sequences (like - "\u03bc"). Using this option, you can force jq to produce pure - ASCII output with every non-ASCII character replaced with the - equivalent escape sequence. + jq usually outputs non-ASCII Unicode codepoints as UTF-8, even + if the input specified them as escape sequences (like + "\u03bc"). Using this option, you can force jq to produce pure + ASCII output with every non-ASCII character replaced with the + equivalent escape sequence. - * `--raw-output` / `-r`: + * `--raw-output` / `-r`: - With this option, if the filter's result is a string then it - will be written directly to standard output rather than being - formatted as a JSON string with quotes. This can be useful for - making jq filters talk to non-JSON-based systems. + With this option, if the filter's result is a string then it + will be written directly to standard output rather than being + formatted as a JSON string with quotes. This can be useful for + making jq filters talk to non-JSON-based systems. - title: Basic filters entries: @@ -646,10 +646,77 @@ sections: input: '42' output: ['"The input was 42, which is one less than 43"'] - + - title: "Format strings and escaping" + body: | - + The `@foo` syntax is used to format and escape strings, + which is useful for building URLs, documents in a language + like HTML or XML, and so forth. `@foo` can be used as a + filter on its own, the possible escapings are: + + * `@text`: + + Calls `tostring`, see that function for details. + + * `@json`: + + Serialises the input as JSON. + + * `@html`: + + Applies HTML/XML escaping, by mapping the characters + `<>&'"` to their entity equivalents `&lt;`, `&gt;`, + `&amp;`, `&#39;`, `&quot;`.
+ + * `@uri`: + + Applies percent-encoding, by mapping all reserved URI + characters to a `%xx` sequence. + + * `@csv`: + + The input must be an array, and it is rendered as CSV + with double quotes for strings, and quotes escaped by + repetition. + + * `@sh`: + + The input is escaped suitable for use in a command-line + for a POSIX shell. If the input is an array, the output + will be a series of space-separated strings. + + * `@base64`: + + The input is converted to base64 as specified by RFC 4648. + + This syntax can be combined with string interpolation in a + useful way. You can follow a `@foo` token with a string + literal. The contents of the string literal will *not* be + escaped. However, all interpolations made inside that string + literal will be escaped. For instance, + + @uri "http://www.google.com/search?q=\(.search)" + + will produce the following output for the input + `{"search":"jq?"}`: + + http://www.google.com/search?q=jq%3f + + Note that the slashes, question mark, etc. in the URL are + not escaped, as they were part of the string literal. + + examples: + - program: '@html' + input: '"This works if x < y"' + output: ['"This works if x &lt; y"'] + +# - program: '@html "<span>Anonymous said: \(.)</span>"' +# input: '"<script>alert(\"lol hax\");</script>"' +# output: ["<span>Anonymous said: &lt;script&gt;alert(&quot;lol hax&quot;);&lt;/script&gt;</span>"] + - program: '@sh "echo \(.)"' + input: "\"O'Hara's Ale\"" + output: ["\"echo 'O'\\''Hara'\\''s Ale\""] - title: Conditionals and Comparisons entries: diff --git a/lexer.l b/lexer.l index 7090de3..12851de 100644 --- a/lexer.l +++ b/lexer.l @@ -68,6 +68,9 @@ struct lexer_param; return try_exit(yytext[0], YY_START, yyscanner); } +"@"[a-zA-Z0-9_]+ { + yylval->literal = jv_string_sized(yytext + 1, yyleng - 1); return FORMAT; +} -?[0-9.]+([eE][+-]?[0-9]+)? 
{ yylval->literal = jv_parse_sized(yytext, yyleng); return LITERAL; diff --git a/parser.y b/parser.y index 807db70..397982d 100644 --- a/parser.y +++ b/parser.y @@ -48,6 +48,7 @@ struct lexer_param; %token INVALID_CHARACTER %token <literal> IDENT %token <literal> LITERAL +%token <literal> FORMAT %token EQ "==" %token NEQ "!=" %token DEFINEDOR "//" @@ -158,8 +159,8 @@ static block gen_binop(block a, block b, int op) { return gen_call(funcname, BLOCK(gen_lambda(a), gen_lambda(b))); } -static block gen_format(block a) { - return BLOCK(a, gen_call("tostring", gen_noop())); +static block gen_format(block a, jv fmt) { + return BLOCK(a, gen_call("format", BLOCK(gen_lambda(gen_const(fmt))))); } static block gen_update(block a, block op, int optype) { @@ -316,10 +317,16 @@ FuncDef: String: -QQSTRING_START QQString QQSTRING_END { - $$ = $2; +QQSTRING_START { $<literal>$ = jv_string("text"); } QQString QQSTRING_END { + $$ = $3; + jv_free($<literal>2); +} | +FORMAT QQSTRING_START { $<literal>$ = $1; } QQString QQSTRING_END { + $$ = $4; + jv_free($<literal>3); } + QQString: /* empty */ { $$ = gen_const(jv_string("")); @@ -328,7 +335,7 @@ QQString QQSTRING_TEXT { $$ = gen_binop($1, gen_const($2), '+'); } | QQString QQSTRING_INTERP_START Exp QQSTRING_INTERP_END { - $$ = gen_binop($1, gen_format($3), '+'); + $$ = gen_binop($1, gen_format($3, jv_copy($<literal>0)), '+'); } @@ -373,6 +380,9 @@ LITERAL { String { $$ = $1; } | +FORMAT { + $$ = gen_format(gen_noop(), $1); +} | '(' Exp ')' { $$ = $2; } | diff --git a/testdata b/testdata index a1c72f0..c3352af 100644 --- a/testdata +++ b/testdata @@ -52,6 +52,20 @@ null null "interpolation" +@text,@json,([1,.] | @csv),@html,@uri,@sh,@base64 +"<>&'\"" +"<>&'\"" +"\"<>&'\\\"\"" +"1,\"<>&'\"\"\"" +"&lt;&gt;&amp;&#39;&quot;" +"%3c%3e%26'%22" +"'<>&'\\''\"'" +"PD4mJyI=" + +@html "<b>\(.)</b>" +"<script>hax</script>" +"<b>&lt;script&gt;hax&lt;/script&gt;</b>" + # # Dictionary construction syntax # -- 2.40.0