From 89e26969ae5bc7a259c1bd150f4d58c67800424b Mon Sep 17 00:00:00 2001
From: Stephen Dolan <mu@netsoc.tcd.ie>
Date: Thu, 27 Dec 2012 20:49:34 +0000
Subject: [PATCH] @foo syntax for encoding of strings into various formats.

Fixes part of #47 and #48.
---
 builtin.c                        | 171 +++++++++++++++++++++++++++++++
 docs/content/3.manual/manual.yml | 141 ++++++++++++++++++-------
 lexer.l                          |   3 +
 parser.y                         |  20 +++-
 testdata                         |  14 +++
 5 files changed, 307 insertions(+), 42 deletions(-)

diff --git a/builtin.c b/builtin.c
index 4f9fab5..c431be0 100644
--- a/builtin.c
+++ b/builtin.c
@@ -5,6 +5,8 @@
 #include "parser.h"
 #include "locfile.h"
 #include "jv_aux.h"
+#include "jv_unicode.h"
+
 
 
 typedef jv (*func_1)(jv);
@@ -207,6 +209,174 @@ static jv f_tostring(jv input) {
   }
 }
 
+#define CHARS_ALPHANUM "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
+
+static jv escape_string(jv input, const char* escapings) {
+  /* Returns a new string: `input` with every ASCII character that has an entry in `escapings` replaced. */
+  assert(jv_get_kind(input) == JV_KIND_STRING);
+  const char* lookup[128] = {0};  /* replacement text per ASCII code; NULL means "copy unchanged" */
+  const char* p = escapings;
+  while (*p) {
+    lookup[(int)*p] = p+1;  /* entry layout: key character, then its NUL-terminated replacement */
+    p++;
+    p += strlen(p);  /* skip over the replacement text */
+    p++;  /* skip its terminating NUL; an empty entry here ends the table */
+  }
+  /* Walk the input with jvp_utf8_next; characters with a table entry are replaced, the rest copied as-is. */
+  jv ret = jv_string("");
+  const char* i = jv_string_value(input);
+  const char* end = i + jv_string_length(jv_copy(input));
+  const char* cstart;  /* start of the character most recently consumed */
+  int c = 0;
+  while ((i = jvp_utf8_next((cstart = i), end, &c))) {
+    assert(c != -1);
+    if (c < 128 && lookup[c]) {
+      ret = jv_string_append_str(ret, lookup[c]);
+    } else {
+      ret = jv_string_append_buf(ret, cstart, i - cstart);  /* copy the original bytes of this character */
+    }
+  }
+  jv_free(input);  /* `input` is released here; callers must not use it afterwards */
+  return ret;
+
+}
+
+/* Backs the `format` builtin: renders `input` as a string in the encoding named by `fmt`.
+ * Every path frees or hands off both arguments; failures are reported through the returned jv. */
+static jv f_format(jv input, jv fmt) {
+  if (jv_get_kind(fmt) != JV_KIND_STRING) {
+    jv_free(input);
+    return type_error(fmt, "is not a valid format");
+  }
+  const char* fmt_s = jv_string_value(fmt);
+  if (!strcmp(fmt_s, "json")) {
+    jv_free(fmt);
+    return jv_dump_string(input, 0);
+  } else if (!strcmp(fmt_s, "text")) {
+    jv_free(fmt);
+    return f_tostring(input);
+  } else if (!strcmp(fmt_s, "csv")) {
+    jv_free(fmt);
+    if (jv_get_kind(input) != JV_KIND_ARRAY)
+      return type_error(input, "cannot be csv-formatted, only array");
+    jv line = jv_string("");
+    for (int i=0; i<jv_array_length(jv_copy(input)); i++) {
+      if (i) line = jv_string_append_str(line, ",");
+      jv x = jv_array_get(jv_copy(input), i);
+      switch (jv_get_kind(x)) {
+      case JV_KIND_NULL:
+        /* null rendered as empty string */
+        jv_free(x);
+        break;
+      case JV_KIND_TRUE:
+      case JV_KIND_FALSE:
+        line = jv_string_concat(line, jv_dump_string(x, 0));
+        break;
+      case JV_KIND_NUMBER:
+        if (jv_number_value(x) != jv_number_value(x)) {
+          /* NaN, render as empty string */
+          jv_free(x);
+        } else {
+          line = jv_string_concat(line, jv_dump_string(x, 0));
+        }
+        break;
+      case JV_KIND_STRING: {
+        line = jv_string_append_str(line, "\"");
+        line = jv_string_concat(line, escape_string(x, "\"\"\"\0"));
+        line = jv_string_append_str(line, "\"");
+        break;
+      }
+      default:
+        jv_free(input);
+        jv_free(line);
+        return type_error(x, "is not valid in a csv row");
+      }
+    }
+    jv_free(input);
+    return line;
+  } else if (!strcmp(fmt_s, "html")) {
+    jv_free(fmt);
+    return escape_string(f_tostring(input), "&&amp;\0<&lt;\0>&gt;\0'&apos;\0\"&quot;\0");
+  } else if (!strcmp(fmt_s, "uri")) {
+    jv_free(fmt);
+    input = f_tostring(input);
+    /* Percent-encode byte by byte: anything outside the unreserved set becomes %xx. */
+    int unreserved[128] = {0};
+    const char* p = CHARS_ALPHANUM "-_.!~*'()";
+    while (*p) unreserved[(int)*p++] = 1;
+    /* Bytes are read as unsigned char so values >= 0x80 are not sign-extended. */
+    jv line = jv_string("");
+    const char* s = jv_string_value(input);
+    for (int i=0; i<jv_string_length(jv_copy(input)); i++) {
+      unsigned ch = (unsigned char)*s;
+      if (ch < 128 && unreserved[ch]) {
+        line = jv_string_append_buf(line, s, 1);
+      } else {
+        line = jv_string_concat(line, jv_string_fmt("%%%02x", ch));
+      }
+      s++;
+    }
+    jv_free(input);
+    return line;
+  } else if (!strcmp(fmt_s, "sh")) {
+    jv_free(fmt);
+    if (jv_get_kind(input) != JV_KIND_ARRAY)
+      input = jv_array_set(jv_array(), 0, input);
+    jv line = jv_string("");
+    for (int i=0; i<jv_array_length(jv_copy(input)); i++) {
+      if (i) line = jv_string_append_str(line, " ");
+      jv x = jv_array_get(jv_copy(input), i);
+      switch (jv_get_kind(x)) {
+      case JV_KIND_NULL:
+      case JV_KIND_TRUE:
+      case JV_KIND_FALSE:
+      case JV_KIND_NUMBER:
+        line = jv_string_concat(line, jv_dump_string(x, 0));
+        break;
+      case JV_KIND_STRING: {
+        line = jv_string_append_str(line, "'");
+        line = jv_string_concat(line, escape_string(x, "''\\''\0"));
+        line = jv_string_append_str(line, "'");
+        break;
+      }
+      default:
+        jv_free(input);
+        jv_free(line);
+        return type_error(x, "can not be escaped for shell");
+      }
+    }
+    jv_free(input);
+    return line;
+  } else if (!strcmp(fmt_s, "base64")) {
+    jv_free(fmt);
+    input = f_tostring(input);
+    jv line = jv_string("");
+    const char b64[64 + 1] = CHARS_ALPHANUM "+/";
+    const char* data = jv_string_value(input);
+    int len = jv_string_length(jv_copy(input));
+    for (int i=0; i<len; i+=3) {
+      uint32_t code = 0;
+      int n = len - i >= 3 ? 3 : len-i;
+      for (int j=0; j<3; j++) {
+        code <<= 8;
+        code |= j < n ? (unsigned char)data[i+j] : 0;  /* unsigned char: no sign extension into upper bits */
+      }
+      char buf[4];
+      for (int j=0; j<4; j++) {
+        buf[j] = b64[(code >> (18 - j*6)) & 0x3f];
+      }
+      if (n < 3) buf[3] = '=';
+      if (n < 2) buf[2] = '=';
+      line = jv_string_append_buf(line, buf, sizeof(buf));
+    }
+    jv_free(input);
+    return line;
+  } else {
+    jv_free(input);
+    return jv_invalid_with_msg(jv_string_concat(fmt, jv_string(" is not a valid format")));
+  }
+}
+
 static jv f_keys(jv input) {
   if (jv_get_kind(input) == JV_KIND_OBJECT || jv_get_kind(input) == JV_KIND_ARRAY) {
     return jv_keys(input);
@@ -332,6 +502,7 @@ static struct cfunction function_list[] = {
   {(cfunction_ptr)f_min_by_impl, "_min_by_impl", 2},
   {(cfunction_ptr)f_max_by_impl, "_max_by_impl", 2},
   {(cfunction_ptr)f_error, "error", 2},
+  {(cfunction_ptr)f_format, "format", 2},
 };
 
 static struct symbol_table cbuiltins = 
diff --git a/docs/content/3.manual/manual.yml b/docs/content/3.manual/manual.yml
index cf5de52..55f43a0 100644
--- a/docs/content/3.manual/manual.yml
+++ b/docs/content/3.manual/manual.yml
@@ -75,51 +75,51 @@ sections:
       You can affect how jq reads and writes its input and output
       using some command-line options:
 
-        * `--slurp`/`-s`:
+      * `--slurp`/`-s`:
   
-          Instead of running the filter for each JSON object in the
-          input, read the entire input stream into a large array and run
-          the filter just once.
-        
-        * `--raw-input`/`-R`:
-        
-          Don't parse the input as JSON. Instead, each line of text is
-          passed to the filter as a string. If combined with `--slurp`,
-          then the entire input is passed to the filter as a single long
-          string.
+        Instead of running the filter for each JSON object in the
+        input, read the entire input stream into a large array and run
+        the filter just once.
+      
+      * `--raw-input`/`-R`:
+      
+        Don't parse the input as JSON. Instead, each line of text is
+        passed to the filter as a string. If combined with `--slurp`,
+        then the entire input is passed to the filter as a single long
+        string.
   
-        * `--null-input`/`-n`:
+      * `--null-input`/`-n`:
   
-          Don't read any input at all! Instead, the filter is run once
-          using `null` as the input. This is useful when using jq as a
-          simple calculator or to construct JSON data from scratch.
+        Don't read any input at all! Instead, the filter is run once
+        using `null` as the input. This is useful when using jq as a
+        simple calculator or to construct JSON data from scratch.
   
-        * `--compact-output` / `-c`:
+      * `--compact-output` / `-c`:
   
-          By default, jq pretty-prints JSON output. Using this option
-          will result in more compact output by instead putting each
-          JSON object on a single line.
+        By default, jq pretty-prints JSON output. Using this option
+        will result in more compact output by instead putting each
+        JSON object on a single line.
   
-        * `--colour-output` / `-C` and `--monochrome-output` / `-M`:
-        
-          By default, jq outputs colored JSON if writing to a
-          terminal. You can force it to produce color even if writing to
-          a pipe or a file using `-C`, and disable color with `-M`.
+      * `--colour-output` / `-C` and `--monochrome-output` / `-M`:
+      
+        By default, jq outputs colored JSON if writing to a
+        terminal. You can force it to produce color even if writing to
+        a pipe or a file using `-C`, and disable color with `-M`.
   
-        * `--ascii-output` / `-a`:
+      * `--ascii-output` / `-a`:
   
-          jq usually outputs non-ASCII Unicode codepoints as UTF-8, even
-          if the input specified them as escape sequences (like
-          "\u03bc"). Using this option, you can force jq to produce pure
-          ASCII output with every non-ASCII character replaced with the
-          equivalent escape sequence.
+        jq usually outputs non-ASCII Unicode codepoints as UTF-8, even
+        if the input specified them as escape sequences (like
+        "\u03bc"). Using this option, you can force jq to produce pure
+        ASCII output with every non-ASCII character replaced with the
+        equivalent escape sequence.
   
-        * `--raw-output` / `-r`:
+      * `--raw-output` / `-r`:
   
-          With this option, if the filter's result is a string then it
-          will be written directly to standard output rather than being
-          formatted as a JSON string with quotes. This can be useful for
-          making jq filters talk to non-JSON-based systems.
+        With this option, if the filter's result is a string then it
+        will be written directly to standard output rather than being
+        formatted as a JSON string with quotes. This can be useful for
+        making jq filters talk to non-JSON-based systems.
 
   - title: Basic filters
     entries:
@@ -646,10 +646,77 @@ sections:
             input: '42'
             output: ['"The input was 42, which is one less than 43"']
           
-              
+      - title: "Format strings and escaping"
+        body: |
 
-              
+          The `@foo` syntax is used to format and escape strings,
+          which is useful for building URLs, documents in a language
+          like HTML or XML, and so forth. `@foo` can be used as a
+          filter on its own; the possible escapings are:
+
+          * `@text`:
+
+            Calls `tostring`, see that function for details.
+
+          * `@json`:
+
+            Serialises the input as JSON.
+
+          * `@html`:
+
+            Applies HTML/XML escaping, by mapping the characters
+            `<>&'"` to their entity equivalents `&lt;`, `&gt;`,
+            `&amp;`, `&apos;`, `&quot;`.
+
+          * `@uri`:
+
+            Applies percent-encoding, by mapping every byte that is not an
+            unreserved URI character to a `%xx` sequence.
+
+          * `@csv`:
+            
+            The input must be an array, and it is rendered as CSV
+            with double quotes for strings, and quotes escaped by
+            repetition.
+
+          * `@sh`:
+            
+            The input is escaped so that it is suitable for use in a
+            command line for a POSIX shell. If the input is an array, the
+            output will be a series of space-separated strings.
+            
+          * `@base64`:
+
+            The input is converted to base64 as specified by RFC 4648.
+
+          This syntax can be combined with string interpolation in a
+          useful way. You can follow a `@foo` token with a string
+          literal. The contents of the string literal will *not* be
+          escaped. However, all interpolations made inside that string
+          literal will be escaped. For instance,
+
+              @uri "http://www.google.com/search?q=\(.search)"
+
+          will produce the following output for the input
+          `{"search":"jq!"}`:
+
+              http://www.google.com/search?q=jq%21
+
+          Note that the slashes, question mark, etc. in the URL are
+          not escaped, as they were part of the string literal.
+
+        examples:
+          - program: '@html'
+            input: '"This works if x < y"'
+            output: ['"This works if x &lt; y"']
+
+#          - program: '@html "<span>Anonymous said: \(.)</span>"'
+#            input: '"<script>alert(\"lol hax\");</script>"'
+#            output: ["<span>Anonymous said: &lt;script&gt;alert(&quot;lol hax&quot;);&lt;/script&gt;</span>"]
 
+          - program: '@sh "echo \(.)"'
+            input: "\"O'Hara's Ale\""
+            output: ["\"echo 'O'\\''Hara'\\''s Ale\""]
         
   - title: Conditionals and Comparisons
     entries:
diff --git a/lexer.l b/lexer.l
index 7090de3..12851de 100644
--- a/lexer.l
+++ b/lexer.l
@@ -68,6 +68,9 @@ struct lexer_param;
   return try_exit(yytext[0], YY_START, yyscanner);
 }
 
+"@"[a-zA-Z0-9_]+ {
+  yylval->literal = jv_string_sized(yytext + 1, yyleng - 1); return FORMAT; /* token value omits the leading '@' */
+}
 
 -?[0-9.]+([eE][+-]?[0-9]+)? { 
    yylval->literal = jv_parse_sized(yytext, yyleng); return LITERAL; 
diff --git a/parser.y b/parser.y
index 807db70..397982d 100644
--- a/parser.y
+++ b/parser.y
@@ -48,6 +48,7 @@ struct lexer_param;
 %token INVALID_CHARACTER
 %token <literal> IDENT
 %token <literal> LITERAL
+%token <literal> FORMAT
 %token EQ "=="
 %token NEQ "!="
 %token DEFINEDOR "//"
@@ -158,8 +159,8 @@ static block gen_binop(block a, block b, int op) {
   return gen_call(funcname, BLOCK(gen_lambda(a), gen_lambda(b)));
 }
 
-static block gen_format(block a) {
-  return BLOCK(a, gen_call("tostring", gen_noop()));
+static block gen_format(block a, jv fmt) { /* emits `a`, then a call to "format" whose argument is the constant `fmt` */
+  return BLOCK(a, gen_call("format", BLOCK(gen_lambda(gen_const(fmt)))));
 }
  
 static block gen_update(block a, block op, int optype) {
@@ -316,10 +317,16 @@ FuncDef:
 
 
 String:
-QQSTRING_START QQString QQSTRING_END {
-  $$ = $2;
+QQSTRING_START { $<literal>$ = jv_string("text"); } QQString QQSTRING_END {
+  $$ = $3;  /* the mid-rule action counts as $2, so QQString is $3 */
+  jv_free($<literal>2);  /* release the format name stored by the mid-rule action */
+} |
+FORMAT QQSTRING_START { $<literal>$ = $1; } QQString QQSTRING_END {
+  $$ = $4;  /* FORMAT is $1 and the mid-rule action is $3, so QQString is $4 */
+  jv_free($<literal>3);  /* release the format name stored by the mid-rule action */
 }
 
+
 QQString:
 /* empty */ {
   $$ = gen_const(jv_string(""));
@@ -328,7 +335,7 @@ QQString QQSTRING_TEXT {
   $$ = gen_binop($1, gen_const($2), '+');
 } |
 QQString QQSTRING_INTERP_START Exp QQSTRING_INTERP_END {
-  $$ = gen_binop($1, gen_format($3), '+');
+  $$ = gen_binop($1, gen_format($3, jv_copy($<literal>0)), '+');
 }
 
 
@@ -373,6 +380,9 @@ LITERAL {
 String {
   $$ = $1;
 } |
+FORMAT {
+  $$ = gen_format(gen_noop(), $1);
+} |
 '(' Exp ')' { 
   $$ = $2; 
 } | 
diff --git a/testdata b/testdata
index a1c72f0..c3352af 100644
--- a/testdata
+++ b/testdata
@@ -52,6 +52,20 @@ null
 null
 "interpolation"
 
+@text,@json,([1,.] | @csv),@html,@uri,@sh,@base64
+"<>&'\""
+"<>&'\""
+"\"<>&'\\\"\""
+"1,\"<>&'\"\"\""
+"&lt;&gt;&amp;&apos;&quot;"
+"%3c%3e%26'%22"
+"'<>&'\\''\"'"
+"PD4mJyI="
+
+@html "<b>\(.)</b>"
+"<script>hax</script>"
+"<b>&lt;script&gt;hax&lt;/script&gt;</b>"
+
 #
 # Dictionary construction syntax
 #
-- 
2.40.0