From 8ff935c01a6e9a16e0d84b29c548dc621bc8ff98 Mon Sep 17 00:00:00 2001
From: William Langford
Date: Wed, 18 Jun 2014 19:49:38 -0400
Subject: [PATCH] Added regex support as per issue #164.

jq now depends on oniguruma for regex support.
Modified configure.ac accordingly.

Added valgrind suppression file for oniguruma to prevent one-time and bounded
leaks from causing tests to fail.

Signed-off-by: Nicolas Williams
---
 builtin.c                        | 217 +++++++++++++++++++++++++++++++
 configure.ac                     |  57 ++++++++
 docs/content/3.manual/manual.yml |  72 ++++++++++
 jv_unicode.c                     |   7 +
 tests/all.test                   |  46 ++++++-
 tests/onig.supp                  |  21 +++
 tests/run                        |   2 +-
 7 files changed, 420 insertions(+), 2 deletions(-)
 create mode 100644 tests/onig.supp

diff --git a/builtin.c b/builtin.c
index ebb4ffd..9ea06a1 100644
--- a/builtin.c
+++ b/builtin.c
@@ -1,6 +1,7 @@
 #include
 #include
 #include
+#include <oniguruma.h>
 #include
 #include "builtin.h"
@@ -514,6 +515,215 @@ static jv f_group_by_impl(jv input, jv keys) {
   }
 }
 
+// onig_foreach_name() callback: stores the group name on each capture object listed in groups[].
+static int f_match_name_iter(const UChar* name, const UChar *name_end, int ngroups,
+    int *groups, regex_t *reg, void *arg) {
+  jv captures = *(jv*)arg;
+  for (int i = 0; i < ngroups; ++i) {
+    jv cap = jv_array_get(jv_copy(captures),groups[i]-1);
+    if (jv_get_kind(cap) == JV_KIND_OBJECT) {
+      cap = jv_object_set(cap, jv_string("name"), jv_string_sized((const char*)name, name_end-name));
+      captures = jv_array_set(captures,groups[i]-1,cap);
+    } else {
+      jv_free(cap);
+    }
+  }
+  *(jv *)arg = captures;
+  return 0;
+}
+
+// Backs _match_impl: matches string `input` against `regex`; `modifiers` is a flag string or null.
+// Returns an array of match objects, or true/false when `testmode` is true; an invalid jv on errors.
+static jv f_match(jv input, jv regex, jv modifiers, jv testmode) {
+  int test = jv_equal(testmode, jv_true());
+  jv result = jv_null(); // stays defined when the search fails in test mode (it is freed there)
+  int onigret;
+  int global = 0;
+  regex_t *reg;
+  OnigErrorInfo einfo;
+  OnigRegion* region;
+
+  jv_free(testmode);
+  if (jv_get_kind(input) != JV_KIND_STRING) {
+    jv_free(regex);
+    jv_free(modifiers);
+    return type_error(input, "cannot be matched, as it is not a string");
+  }
+
+  if (jv_get_kind(regex) != JV_KIND_STRING) {
+    jv_free(input);
+    jv_free(modifiers);
+    return type_error(regex, "is not a string");
+  }
+
+  OnigOptionType options = ONIG_OPTION_CAPTURE_GROUP;
+
+  if (jv_get_kind(modifiers) == JV_KIND_STRING) {
+    jv modarray = jv_string_explode(jv_copy(modifiers));
+    jv_array_foreach(modarray, i, mod) {
+      switch ((int)jv_number_value(mod)) {
+        case 'g':
+          global = 1;
+          break;
+        case 'i':
+          options |= ONIG_OPTION_IGNORECASE;
+          break;
+        case 'x':
+          options |= ONIG_OPTION_EXTEND;
+          break;
+        case 'm':
+          options |= ONIG_OPTION_MULTILINE;
+          break;
+        case 's':
+          options |= ONIG_OPTION_SINGLELINE;
+          break;
+        case 'p':
+          options |= ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE;
+          break;
+        case 'l':
+          options |= ONIG_OPTION_FIND_LONGEST;
+          break;
+        case 'n':
+          options |= ONIG_OPTION_FIND_NOT_EMPTY;
+          break;
+        default:
+          jv_free(input);
+          jv_free(regex);
+          jv_free(modarray);
+          return jv_invalid_with_msg(jv_string_concat(modifiers,
+                jv_string(" is not a valid modifier string")));
+      }
+    }
+    jv_free(modarray);
+  } else if (jv_get_kind(modifiers) != JV_KIND_NULL) {
+    // If it isn't a string or null, then it is the wrong type...
+    jv_free(input);
+    jv_free(regex);
+    return type_error(modifiers, "is not a string");
+  }
+
+  jv_free(modifiers);
+
+  onigret = onig_new(&reg, (const UChar*)jv_string_value(regex),
+      (const UChar*)(jv_string_value(regex) + jv_string_length_bytes(jv_copy(regex))),
+      options, ONIG_ENCODING_UTF8, ONIG_SYNTAX_PERL_NG, &einfo);
+  if (onigret != ONIG_NORMAL) {
+    UChar ebuf[ONIG_MAX_ERROR_MESSAGE_LEN];
+    onig_error_code_to_str(ebuf, onigret, &einfo); /* takes an OnigErrorInfo*, not the struct */
+    jv_free(input);
+    jv_free(regex);
+    return jv_invalid_with_msg(jv_string_concat(jv_string("Regex failure: "),
+          jv_string((char*)ebuf)));
+  }
+  if (!test)
+    result = jv_array();
+  const char *input_string = jv_string_value(input);
+  const UChar* start = (const UChar*)jv_string_value(input);
+  const unsigned long length = jv_string_length_bytes(jv_copy(input));
+  const UChar* end = start + length;
+  region = onig_region_new();
+  do {
+    onigret = onig_search(reg,
+        (const UChar*)jv_string_value(input), end, /* string boundaries */
+        start, end, /* search boundaries */
+        region, ONIG_OPTION_NONE);
+    if (onigret >= 0) {
+      if (test) {
+        result = jv_true();
+        break;
+      }
+
+      // Zero-width match
+      if (region->end[0] == region->beg[0]) {
+        unsigned long idx;
+        const char *fr = (const char*)input_string;
+        for (idx = 0; fr < input_string+region->beg[0]; idx++) { /* '<' so a misaligned walk still stops */
+          fr += jvp_utf8_decode_length(*fr);
+        }
+        jv match = jv_object_set(jv_object(), jv_string("offset"), jv_number(idx));
+        match = jv_object_set(match, jv_string("length"), jv_number(0));
+        match = jv_object_set(match, jv_string("string"), jv_string(""));
+        match = jv_object_set(match, jv_string("captures"), jv_array());
+        result = jv_array_append(result, match);
+        start = (const UChar*)(input_string+region->end[0]); // resume after the match itself, not the old start
+        start += jvp_utf8_decode_length(*start); // step a whole character so the same spot is not matched again
+        continue;
+      }
+
+      unsigned long idx;
+      unsigned long len;
+      const char *fr = (const char*)input_string;
+
+      for (idx = len = 0; fr < input_string+region->end[0]; len++) {
+        if (fr == input_string+region->beg[0]) idx = len, len=0;
+        fr += jvp_utf8_decode_length(*fr);
+      }
+
+      jv match = jv_object_set(jv_object(), jv_string("offset"), jv_number(idx));
+
+      unsigned long blen = region->end[0]-region->beg[0];
+      match = jv_object_set(match, jv_string("length"), jv_number(len));
+      match = jv_object_set(match, jv_string("string"), jv_string_sized(input_string+region->beg[0],blen));
+      jv captures = jv_array();
+      for (int i = 1; i < region->num_regs; ++i) {
+        // Empty capture.
+        if (region->beg[i] == region->end[i]) {
+          // Didn't match.
+          jv cap;
+          if (region->beg[i] == -1) {
+            cap = jv_object_set(jv_object(), jv_string("offset"), jv_number(-1));
+            cap = jv_object_set(cap, jv_string("string"), jv_null());
+          } else {
+            fr = input_string;
+            for (idx = 0; fr < input_string+region->beg[i]; idx++) {
+              fr += jvp_utf8_decode_length(*fr);
+            }
+            cap = jv_object_set(jv_object(), jv_string("offset"), jv_number(idx));
+            cap = jv_object_set(cap, jv_string("string"), jv_string(""));
+          }
+          cap = jv_object_set(cap, jv_string("length"), jv_number(0));
+          cap = jv_object_set(cap, jv_string("name"), jv_null());
+          captures = jv_array_append(captures, cap);
+          continue;
+        }
+        fr = input_string;
+        for (idx = len = 0; fr < input_string+region->end[i]; len++) {
+          if (fr == input_string+region->beg[i]) idx = len, len=0;
+          fr += jvp_utf8_decode_length(*fr);
+        }
+
+        blen = region->end[i]-region->beg[i];
+        jv cap = jv_object_set(jv_object(), jv_string("offset"), jv_number(idx));
+        cap = jv_object_set(cap, jv_string("length"), jv_number(len));
+        cap = jv_object_set(cap, jv_string("string"), jv_string_sized(input_string+region->beg[i],blen));
+        cap = jv_object_set(cap, jv_string("name"), jv_null());
+        captures = jv_array_append(captures,cap);
+      }
+      onig_foreach_name(reg,f_match_name_iter,&captures);
+      match = jv_object_set(match, jv_string("captures"), captures);
+      result = jv_array_append(result, match);
+      start = (const UChar*)(input_string+region->end[0]);
+      onig_region_free(region,0);
+    } else if (onigret == ONIG_MISMATCH) {
+      if (test)
+        result = jv_false();
+      break;
+    } else { /* Error */
+      UChar ebuf[ONIG_MAX_ERROR_MESSAGE_LEN];
+      onig_error_code_to_str(ebuf, onigret, &einfo);
+      jv_free(result);
+      result = jv_invalid_with_msg(jv_string_concat(jv_string("Regex failure: "),
+            jv_string((char*)ebuf)));
+      break;
+    }
+  } while (global && start < end); /* '<': a zero-width match at the end steps start past end */
+  onig_region_free(region,1);
+  onig_free(reg);
+  jv_free(input);
+  jv_free(regex);
+  return result;
+}
+
 static jv minmax_by(jv values, jv keys, int is_min) {
   if (jv_get_kind(values) != JV_KIND_ARRAY)
     return type_error2(values, keys, "cannot be iterated over");
@@ -642,6 +852,7 @@ static const struct cfunction function_list[] = {
   {(cfunction_ptr)f_error, "error", 2},
   {(cfunction_ptr)f_format, "format", 2},
   {(cfunction_ptr)f_env, "env", 1},
+  {(cfunction_ptr)f_match, "_match_impl", 4},
 };
 
 #undef LIBM_DD
@@ -737,6 +948,12 @@ static const char* const jq_builtins[] = {
   "def flatten: reduce .[] as $i ([]; if $i | type == \"array\" then . + ($i | flatten) else . + [$i] end);",
   "def flatten(x): reduce .[] as $i ([]; if $i | type == \"array\" and x > 0 then . + ($i | flatten(x-1)) else .
+ [$i] end);",
   "def range(x): range(0;x);",
+  "def match(re; mode): _match_impl(re; mode; false)|.[];",
+  "def match(val): if val | type == \"string\" then match(val; null) elif val | type == \"array\" and (val | length) > 1 then match(val[0]; val[1]) elif val | type == \"array\" and (val | length > 0) then match(val[0]; null) else error((val | type) + \" not a string or array\") end;",
+  "def test(re; mode): _match_impl(re; mode; true);",
+  "def test(val): if val |type == \"string\" then test(val; null) elif val | type == \"array\" and (val | length) > 1 then test(val[0]; val[1]) elif val | type == \"array\" and (val | length > 0) then test(val[0]; null) else error((val | type) + \" not a string or array\") end;",
+//  "def test(re): _match(re; null; 1);",
+
 };
 
 #undef LIBM_DD
diff --git a/configure.ac b/configure.ac
index e8dfed4..a0040e5 100644
--- a/configure.ac
+++ b/configure.ac
@@ -42,6 +42,63 @@ if test "x$LEX" != xflex; then
 fi
 
 
+##########################################################################
+# check for ONIGURUMA library
+##########################################################################
+
+AC_ARG_WITH([oniguruma],
+    [AS_HELP_STRING([--with-oniguruma=prefix],
+        [try this for a non-standard install prefix of the oniguruma library])],
+    [ONIGURUMAPATHSET=1],
+    [ONIGURUMAPATHSET=0])
+
+if test $ONIGURUMAPATHSET = 1; then
+  CFLAGS="$CFLAGS -I${with_oniguruma}/include"
+  LDFLAGS="$LDFLAGS -L${with_oniguruma}/lib"
+fi
+
+# store current *FLAGS and merge with AM_*FLAGS for compilation and linker check
+OLD_CFLAGS=$CFLAGS;
+OLD_LDFLAGS=$LDFLAGS;
+CFLAGS="$AM_CFLAGS $CFLAGS"
+LDFLAGS="$AM_LDFLAGS $LDFLAGS"
+
+# ensure the library to check for is covered by the LIBS variable
+OLD_LIBS=$LIBS
+LIBS="$LIBS -lonig"
+
+# check for ONIGURUMA library headers
+AC_MSG_CHECKING([for oniguruma.h])
+# try to compile a file that includes a header of the library oniguruma
+AC_COMPILE_IFELSE([AC_LANG_PROGRAM([ #include <oniguruma.h>])],
+    [AC_MSG_RESULT([yes])
+        # try to link the function 'onig_free' out of library oniguruma
+        AC_MSG_CHECKING([for oniguruma usability])
+        AC_LINK_IFELSE(
+            [AC_LANG_PROGRAM([[#include <oniguruma.h>]],
+                [[onig_free(0);]])],
+            [AC_MSG_RESULT([yes])
+                FOUND_ONIGURUMA=1;],
+            [AC_MSG_RESULT([no])
+                LIBS=$OLD_LIBS; dnl reset to old value since oniguruma was not found
+                FOUND_ONIGURUMA=0;])],
+    [AC_MSG_RESULT([not found])
+        FOUND_ONIGURUMA=0;])
+
+# reset original *FLAGS
+CFLAGS=$OLD_CFLAGS
+LDFLAGS=$OLD_LDFLAGS
+
+# handle check results
+if test $FOUND_ONIGURUMA != 1; then
+    AC_MSG_NOTICE([Oniguruma was not found.])
+    AC_MSG_NOTICE([ Try setting the location using '--with-oniguruma=PREFIX' ])
+    AC_MSG_ERROR([ oniguruma is required to build jq.])
+fi
+
+
+
+
 dnl Check for valgrind
 AC_CHECK_PROGS(valgrind_cmd, valgrind)
 if test "x$valgrind_cmd" = "x" ; then
diff --git a/docs/content/3.manual/manual.yml b/docs/content/3.manual/manual.yml
index aa0b15d..2e83129 100644
--- a/docs/content/3.manual/manual.yml
+++ b/docs/content/3.manual/manual.yml
@@ -1087,6 +1087,78 @@ sections:
             input: '["foobar", "barfoo"]'
             output: ['[false, true, true, false, false]']
 
+      - title: "`match(val)`, `match(regex; modifiers)`"
+        body: |
+
+          The filter `match(val)` performs PCRE regex matching on its input.
+          `val` can be either a string or an array. If it is an array,
+          the first element is the regex specifier and the optional
+          second element is the modifier flags.
+          The accepted modifier flags are:
+
+          * `g` - Global search (find all matches, not just the first)
+          * `i` - Case insensitive search
+          * `x` - Extended regex format (ignore whitespaces)
+          * `m` - Multi line mode ('.' will match newlines)
+          * `s` - Single line mode ('^' -> '\A', '$' -> '\Z')
+          * `p` - Both s and m modes are enabled
+          * `l` - Find longest possible matches
+          * `n` - Ignore empty matches
+
+          The filter outputs an object for each match it finds.
Matches have
+          the following fields:
+
+          * `offset` - offset in UTF-8 codepoints from the beginning of the input
+          * `length` - length in UTF-8 codepoints of the match
+          * `string` - the string that it matched
+          * `captures` - an array of objects representing capturing groups.
+
+          Capturing group objects have the following fields:
+
+          * `offset` - offset in UTF-8 codepoints from the beginning of the input
+          * `length` - length in UTF-8 codepoints of this capturing group
+          * `string` - the string that was captured
+          * `name` - the name of the capturing group (or `null` if it was unnamed)
+
+          Capturing groups that did not match anything return an offset of -1
+
+        examples:
+          - program: 'match("(abc)+"; "g")'
+            input: '"abc abc"'
+            output:
+             - '{"offset": 0, "length": 3, "string": "abc", "captures": [{"offset": 0, "length": 3, "string": "abc", "name": null}]}'
+             - '{"offset": 4, "length": 3, "string": "abc", "captures": [{"offset": 4, "length": 3, "string": "abc", "name": null}]}'
+          - program: 'match("foo")'
+            input: '"foo bar foo"'
+            output: ['{"offset": 0, "length": 3, "string": "foo", "captures": []}']
+          - program: 'match(["foo", "ig"])'
+            input: '"foo bar FOO"'
+            output:
+             - '{"offset": 0, "length": 3, "string": "foo", "captures": []}'
+             - '{"offset": 8, "length": 3, "string": "FOO", "captures": []}'
+          - program: 'match("foo (?<bar123>bar)? foo"; "ig")'
+            input: '"foo bar foo foo  foo"'
+            output:
+             - '{"offset": 0, "length": 11, "string": "foo bar foo", "captures": [{"offset": 4, "length": 3, "string": "bar", "name": "bar123"}]}'
+             - '{"offset": 12, "length": 8, "string": "foo  foo", "captures": [{"offset": -1, "length": 0, "string": null, "name": "bar123"}]}'
+
+
+      - title: "`test(val)`, `test(regex)`, `test(regex; modifiers)`"
+        body: |
+
+          Like `match`, but does not return match objects, only `true` or `false`
+          for whether or not the regex matches the input.
+
+        examples:
+          - program: 'test("foo")'
+            input: '"foo"'
+            output: ['true']
+          - program: 'test("foo"; "i")'
+            input: '"Foo"'
+            output: ['true']
+          - program: 'test("foo")'
+            input: '"bar"'
+            output: ['false']
       - title: "`ltrimstr(str)`"
         body: |
 
diff --git a/jv_unicode.c b/jv_unicode.c
index 907e985..c3f9f11 100644
--- a/jv_unicode.c
+++ b/jv_unicode.c
@@ -59,6 +59,13 @@ int jvp_utf8_is_valid(const char* in, const char* end) {
   return 1;
 }
 
+int jvp_utf8_decode_length(char startchar) { // byte length of the UTF-8 sequence led by startchar
+  if ((startchar & 0x80) == 0) return 1;         // 0xxx xxxx
+  else if ((startchar & 0xE0) == 0xC0) return 2; // 110x xxxx
+  else if ((startchar & 0xF0) == 0xE0) return 3; // 1110 xxxx
+  else return 4;                                 // 1111 0xxx (and any other byte)
+}
+
 int jvp_utf8_encode_length(int codepoint) {
   if (codepoint <= 0x7F) return 1;
   else if (codepoint <= 0x7FF) return 2;
diff --git a/tests/all.test b/tests/all.test
index 1e220cd..0b5b947 100644
--- a/tests/all.test
+++ b/tests/all.test
@@ -649,6 +649,50 @@ def inc(x): x |= .+1; inc(.[].a)
 ["fo", "foo", "barfoo", "foobar", "barfoob"]
 [false, true, true, false, false]
 
+# match builtin
+[match("( )*"; "g")]
+"abc"
+[{"offset":0, "length":0, "string":"", "captures":[]},{"offset":1, "length":0, "string":"", "captures":[]},{"offset":2, "length":0, "string":"", "captures":[]}]
+
+[match("( )*"; "gn")]
+"abc"
+[]
+
+[match("a"; "gi")]
+"āáàä"
+[]
+
+[match(["(bar)"])]
+"foo bar"
+[{"offset": 4, "length": 3, "string": "bar", "captures":[{"offset": 4, "length": 3, "string": "bar", "name": null}]}]
+
+# offsets account for combining codepoints and multi-byte UTF-8
+[match("bar")]
+"ā bar with a combining codepoint U+0304"
+[{"offset": 3, "length": 3, "string": "bar", "captures":[]}]
+
+# matches with combining codepoints still count them in their length
+[match("bār")]
+"a bār"
+[{"offset": 2, "length": 4, "string": "bār", "captures":[]}]
+
+[match(".+?\\b")]
+"ā two-codepoint grapheme"
+[{"offset": 0, "length": 2, "string": "ā", "captures":[]}]
+
+[match(["foo (?<bar123>bar)? 
foo", "ig"])]
+"foo bar foo foo  foo"
+[{"offset": 0, "length": 11, "string": "foo bar foo", "captures":[{"offset": 4, "length": 3, "string": "bar", "name": "bar123"}]},{"offset":12, "length": 8, "string": "foo  foo", "captures":[{"offset": -1, "length": 0, "string": null, "name": "bar123"}]}]
+
+#test builtin
+[test("( )*"; "gn")]
+"abc"
+[false]
+
+[test("ā")]
+"ā"
+[true]
+
 [.[]|ltrimstr("foo")]
 ["fo", "foo", "barfoo", "foobar", "afoo"]
 ["fo","","barfoo","bar","afoo"]
@@ -857,4 +901,4 @@ flatten(2)
 
 flatten(2)
 [0, [1, [2]], [1, [[3], 2]]]
-[0, 1, 2, 1, [3], 2]
\ No newline at end of file
+[0, 1, 2, 1, [3], 2]
diff --git a/tests/onig.supp b/tests/onig.supp
new file mode 100644
index 0000000..37c847e
--- /dev/null
+++ b/tests/onig.supp
@@ -0,0 +1,21 @@
+{
+   onig node recycling
+   Memcheck:Leak
+   ...
+   fun:onig_parse_make_tree
+   ...
+}
+{
+   onig unicode case insensitivity 1
+   Memcheck:Leak
+   ...
+   fun:setup_tree
+   ...
+}
+{
+   onig unicode case insensitivity 2
+   Memcheck:Leak
+   ...
+   fun:onig*unicode*
+   ...
+}
diff --git a/tests/run b/tests/run
index ff09246..5268e2e 100755
--- a/tests/run
+++ b/tests/run
@@ -1,7 +1,7 @@
 #!/bin/sh
 
 if which valgrind > /dev/null; then
-    VALGRIND='valgrind --error-exitcode=1 -q --leak-check=full'
+    VALGRIND='valgrind --error-exitcode=1 -q --leak-check=full --suppressions=tests/onig.supp'
 else
     VALGRIND=
 fi
-- 
2.40.0