From: Nicolas Williams Date: Sat, 27 Jun 2015 01:16:23 +0000 (-0500) Subject: Add streaming utilities (fix #827) X-Git-Tag: jq-1.5rc2~53 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=25d47ca08e682dd28793cb76e0deeae93cd2f8bd;p=jq Add streaming utilities (fix #827) --- diff --git a/builtin.c b/builtin.c index 7cb402a..f99ce09 100644 --- a/builtin.c +++ b/builtin.c @@ -1521,6 +1521,45 @@ static const char* const jq_builtins[] = { "def ascii_upcase:" " explode | map( if 97 <= . and . <= 122 then . - 32 else . end) | implode;", + // Streaming utilities (truncate_stream takes the depth $n as its input, drops events whose path has $n or fewer elements, and strips the first $n path elements from the rest) + "def truncate_stream(stream):" + " . as $n | null | stream | . as $input | if (.[0]|length) > $n then setpath([0];$input[0][$n:]) else empty end;", + "def fromstream(i):" + " foreach i as $item (" + " [null,false,null,false];" + " if ($item[0]|length) == 0 then [null,false,.[2],.[3]]" + " elif ($item|length) == 1 and ($item[0]|length) < 2 then [null,false,.[0],.[1]]" + " else . end |" + " . as $state |" + " if ($item|length) > 1 and ($item[0]|length) > 0 then" + " [.[0]|setpath(($item|.[0]); ($item|.[1])), " + " true, " + " $state[2], " + " $state[3]] " + " else ." + " end;" + " if ($item[0]|length) == 1 and ($item|length == 1) and .[3] then .[2] else empty end," + " if ($item[0]|length) == 0 then $item[1] else empty end" + " );", + "def tostream:\n" + " {string:true,number:true,boolean:true,null:true} as $leaf_types |\n" + " . as $dot |\n" + " if $leaf_types[$dot|type] or length==0 then [[],$dot]\n" + " else\n" + " # We really need a _streaming_ form of `keys`.\n" + " # We can use `range` for arrays, but not for objects.\n" + " keys as $keys |\n" + " $keys[-1] as $last|\n" + " ((# for each key\n" + " $keys[] | . as $key |\n" + " $dot[$key] | . 
as $dot |\n" + " # recurse on each key/value\n" + " tostream|.[0]|=[$key]+.),\n" + " # then add the closing marker\n" + " [[$last]])\n" + " end;", + + // # Assuming the input array is sorted, bsearch/1 returns // # the index of the target if the target is in the input array; and otherwise // # (-1 - ix), where ix is the insertion point that would leave the array sorted. diff --git a/docs/content/3.manual/manual.yml b/docs/content/3.manual/manual.yml index 8d4800d..1092b3b 100644 --- a/docs/content/3.manual/manual.yml +++ b/docs/content/3.manual/manual.yml @@ -2500,6 +2500,63 @@ sections: Returns the line number of the input currently being filtered. + - title: 'Streaming' + body: | + + With the `--stream` option jq can parse input texts in a streaming + fashion, allowing jq programs to start processing large JSON texts + immediately rather than after the parse completes. If you have a + single JSON text that is 1GB in size, streaming it will allow you + to process it much more quickly. + + However, streaming isn't easy to deal with as the jq program will + have `[<path>, <leaf-value>]` (and a few other forms) as inputs. + + Several builtins are provided to make handling streams easier. + + The examples below use the streamed form of `[1,[2]]`, which + is `[[0],1],[[1,0],2],[[1,0]],[[1]]`. + + Streaming forms include `[<path>, <leaf-value>]` (to indicate any + scalar value, empty array, or empty object), and `[<path>]` (to + indicate the end of an array or object). Future versions of jq + run with `--stream` and `--seq` may output additional forms such as + `["error message"]` when an input text fails to parse. + + entries: + - title: "`truncate_stream(stream_expression)`" + body: | + + Consumes a number as input and truncates the corresponding + number of path elements from the left of the outputs of the + given streaming expression. 
+ + examples: + - program: '[1|truncate_stream([[0],1],[[1,0],2],[[1,0]],[[1]])]' + input: '1' + output: ['[[[0],2],[[0]]]'] + + - title: "`fromstream(stream_expression)`" + body: | + + Outputs values corresponding to the stream expression's + outputs. + + examples: + - program: 'fromstream(1|truncate_stream([[0],1],[[1,0],2],[[1,0]],[[1]]))' + input: 'null' + output: ['[2]'] + + - title: "`tostream`" + body: | + + The `tostream` builtin outputs the streamed form of its input. + + examples: + - program: '. as $dot|fromstream($dot|tostream)|.==$dot' + input: '[0,[1,{"a":1},{"b":2}]]' + output: ['true'] + - title: Assignment body: | diff --git a/tests/modules/streaming.jq b/tests/modules/streaming.jq deleted file mode 100644 index 4e2909e..0000000 --- a/tests/modules/streaming.jq +++ /dev/null @@ -1,49 +0,0 @@ - -# Filter and adjust streamed values so that only values from the .th -# level are output. -def trunc(stream): - . as $n | stream | . as $input | if (.[0]|length) > $n then setpath([0];$input[0][$n:]) else empty end; - -# Reduce streamed values back to normal -def tovalues(i): - def debug(msg): . as $dot | [msg, .] | debug | $dot; - foreach i as $item ( - [null,false,null]; - - # Updator - # - # If the new $item is a top-level value, - # then clear out the current value - . as [$cur, $cur_isvalid, $prev] | - $item as [$path, $leaf] | - ($item|length > 1) as $has_leaf | - ($item|length == 1) as $closing | - ($path|length) as $plen | - # if the new $item terminates the current value, then cur is ready - # for extraction and we'll start building a new value with the next - # inputs - if ($plen == 0) or # top-level scalar - ($closing and $plen < 2) then [null,false,$cur] - # else continue building up cur - else . end | - . 
as [$cur, $cur_isvalid, $prev] | - # If the new $item has a leaf, upate the current value - if $has_leaf and $plen > 0 then - [$cur|setpath(($path); $leaf), # update current value - true, # current value is now valid (if, perhaps, incomplete) - $prev] # previous value is unchanged - else . - end; - - # Extractor - # - . as [$cur, $cur_isvalid, $prev] | - $item as [$path, $leaf] | - ($item|length > 1) as $has_leaf | - ($item|length == 1) as $closing | - ($path|length) as $plen | - # If previous value is valid, output it - if $plen == 1 and $closing then $prev else empty end, - # and/or if the new $item is a top-level scalar, output it - if $plen == 0 then $leaf else empty end - ); diff --git a/tests/shtest b/tests/shtest index 3840565..70c384d 100755 --- a/tests/shtest +++ b/tests/shtest @@ -77,9 +77,6 @@ fi ## Test JSON sequence support -## XXX If we add a `stream_fromjson` builtin then we can move these tests -## into tests/all.test - cat > $d/expected < /dev/null 2>&1; then fi dd "if=tests/torture/input0.json" bs=$i count=1 2>/dev/null | - $VALGRIND $JQ -cn --stream -L "$mods" 'import "streaming" as streaming; streaming::tovalues(inputs)' > $d/out1 2>$d/err || true + $VALGRIND $JQ -cn --stream 'fromstream(inputs)' > $d/out1 2>$d/err || true if [ -n "$VALGRIND" ]; then grep '^==[0-9][0-9]*== ERROR SUMMARY: 0 errors' $d/err > /dev/null else