From 8ff935c01a6e9a16e0d84b29c548dc621bc8ff98 Mon Sep 17 00:00:00 2001
From: William Langford <wlangfor@gmail.com>
Date: Wed, 18 Jun 2014 19:49:38 -0400
Subject: [PATCH] Added regex support as per issue #164.

jq now depends on oniguruma for regex support.
Modified configure.ac accordingly.

Added valgrind suppression file for oniguruma to prevent one-time and bounded
leaks from causing tests to fail.

Signed-off-by: Nicolas Williams <nico@cryptonector.com>
---
 builtin.c                        | 217 +++++++++++++++++++++++++++++++
 configure.ac                     |  57 ++++++++
 docs/content/3.manual/manual.yml |  72 ++++++++++
 jv_unicode.c                     |   7 +
 tests/all.test                   |  46 ++++++-
 tests/onig.supp                  |  21 +++
 tests/run                        |   2 +-
 7 files changed, 420 insertions(+), 2 deletions(-)
 create mode 100644 tests/onig.supp

diff --git a/builtin.c b/builtin.c
index ebb4ffd..9ea06a1 100644
--- a/builtin.c
+++ b/builtin.c
@@ -1,6 +1,7 @@
 #include <assert.h>
 #include <limits.h>
 #include <math.h>
+#include <oniguruma.h>
 #include <stdlib.h>
 #include <string.h>
 #include "builtin.h"
@@ -514,6 +515,215 @@ static jv f_group_by_impl(jv input, jv keys) {
   }
 }
 
+static int f_match_name_iter(const UChar* name, const UChar *name_end, int ngroups,
+    int *groups, regex_t *reg, void *arg) {
+  // onig_foreach_name callback: stamps this group name onto each capture object it covers.
+  jv *dest = (jv*)arg;
+  jv caps = *dest;
+  for (int g = 0; g < ngroups; ++g) {
+    const int slot = groups[g]-1;  // capture for group N is stored at array index N-1
+    jv entry = jv_array_get(jv_copy(caps), slot);
+    if (jv_get_kind(entry) != JV_KIND_OBJECT) { jv_free(entry); continue; }
+    entry = jv_object_set(entry, jv_string("name"), jv_string_sized((const char*)name, name_end-name));
+    caps = jv_array_set(caps, slot, entry);
+  }
+  *dest = caps;
+  return 0;
+}
+
+
+// Implements _match_impl: matches the string `input` against `regex` using
+// Oniguruma.  `modifiers` is null or a string of flag letters (gixmspln).
+// With a true `testmode` the result is true/false; otherwise it is an array of
+// match objects whose offsets and lengths are counted in codepoints.
+static jv f_match(jv input, jv regex, jv modifiers, jv testmode) {
+  // jv_get_kind() leaves its argument alive, so testmode is freed exactly once below.
+  int test = jv_get_kind(testmode) == JV_KIND_TRUE;
+  jv result;
+  int onigret, global = 0;
+  regex_t *reg;
+  OnigErrorInfo einfo;
+  OnigRegion* region;
+
+  jv_free(testmode);
+  if (jv_get_kind(input) != JV_KIND_STRING) {
+    jv_free(regex);
+    jv_free(modifiers);
+    return type_error(input, "cannot be matched, as it is not a string");
+  }
+
+  if (jv_get_kind(regex) != JV_KIND_STRING) {
+    jv_free(input);
+    jv_free(modifiers);
+    return type_error(regex, "is not a string");
+  }
+
+  OnigOptionType options = ONIG_OPTION_CAPTURE_GROUP;
+
+  if (jv_get_kind(modifiers) == JV_KIND_STRING) {
+    jv modarray = jv_string_explode(jv_copy(modifiers));
+    jv_array_foreach(modarray, i, mod) {
+      switch ((int)jv_number_value(mod)) {
+        case 'g':
+          global = 1;
+          break;
+        case 'i':
+          options |= ONIG_OPTION_IGNORECASE;
+          break;
+        case 'x':
+          options |= ONIG_OPTION_EXTEND;
+          break;
+        case 'm':
+          options |= ONIG_OPTION_MULTILINE;
+          break;
+        case 's':
+          options |= ONIG_OPTION_SINGLELINE;
+          break;
+        case 'p':
+          options |= ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE;
+          break;
+        case 'l':
+          options |= ONIG_OPTION_FIND_LONGEST;
+          break;
+        case 'n':
+          options |= ONIG_OPTION_FIND_NOT_EMPTY;
+          break;
+        default:
+          jv_free(input);
+          jv_free(regex);
+          jv_free(modarray);
+          return jv_invalid_with_msg(jv_string_concat(modifiers,
+                jv_string(" is not a valid modifier string")));
+      }
+    }
+    jv_free(modarray);
+  } else if (jv_get_kind(modifiers) != JV_KIND_NULL) {
+    // If it isn't a string or null, then it is the wrong type...
+    jv_free(input);
+    jv_free(regex);
+    return type_error(modifiers, "is not a string");
+  }
+
+  jv_free(modifiers);
+
+  onigret = onig_new(&reg, (const UChar*)jv_string_value(regex),
+      (const UChar*)(jv_string_value(regex) + jv_string_length_bytes(jv_copy(regex))),
+      options, ONIG_ENCODING_UTF8, ONIG_SYNTAX_PERL_NG, &einfo);
+  if (onigret != ONIG_NORMAL) {
+    UChar ebuf[ONIG_MAX_ERROR_MESSAGE_LEN];
+    onig_error_code_to_str(ebuf, onigret, &einfo);  // variadic: takes OnigErrorInfo*
+    jv_free(input);
+    jv_free(regex);
+    return jv_invalid_with_msg(jv_string_concat(jv_string("Regex failure: "),
+          jv_string((char*)ebuf)));
+  }
+  // Give result a defined value up front so the error path can always free it.
+  result = test ? jv_false() : jv_array();
+  const char *input_string = jv_string_value(input);
+  const UChar* start = (const UChar*)jv_string_value(input);
+  const unsigned long length = jv_string_length_bytes(jv_copy(input));
+  const UChar* end = start + length;
+  region = onig_region_new();
+  do {
+    onigret = onig_search(reg,
+        (const UChar*)jv_string_value(input), end, /* string boundaries */
+        start, end, /* search boundaries */
+        region, ONIG_OPTION_NONE);
+    if (onigret >= 0) {
+      if (test) {
+        result = jv_true();
+        break;
+      }
+
+      unsigned long idx, len;
+      const char *fr = input_string;
+      // Zero-width match
+      if (region->end[0] == region->beg[0]) {
+        for (idx = 0; fr < input_string+region->beg[0]; idx++) {
+          fr += jvp_utf8_decode_length(*fr);
+        }
+        jv match = jv_object_set(jv_object(), jv_string("offset"), jv_number(idx));
+        match = jv_object_set(match, jv_string("length"), jv_number(0));
+        match = jv_object_set(match, jv_string("string"), jv_string(""));
+        match = jv_object_set(match, jv_string("captures"), jv_array());
+        result = jv_array_append(result, match);
+        // Resume one whole codepoint after the match position (not one byte after
+        // the old start), so the same empty match is never reported twice.
+        start = (const UChar*)(input_string+region->beg[0]);
+        if (start < end) start += jvp_utf8_decode_length((char)*start);
+        continue;
+      }
+
+      // One pass over the input: idx becomes the match offset, len its length.
+      for (idx = len = 0; fr < input_string+region->end[0]; len++) {
+        if (fr == input_string+region->beg[0]) idx = len, len=0;
+        fr += jvp_utf8_decode_length(*fr);
+      }
+
+      jv match = jv_object_set(jv_object(), jv_string("offset"), jv_number(idx));
+      unsigned long blen = region->end[0]-region->beg[0];
+      match = jv_object_set(match, jv_string("length"), jv_number(len));
+      match = jv_object_set(match, jv_string("string"), jv_string_sized(input_string+region->beg[0],blen));
+      jv captures = jv_array();
+      for (int i = 1; i < region->num_regs; ++i) {
+        // Empty capture.
+        if (region->beg[i] == region->end[i]) {
+          jv cap;
+          // Didn't match.
+          if (region->beg[i] == -1) {
+            cap = jv_object_set(jv_object(), jv_string("offset"), jv_number(-1));
+            cap = jv_object_set(cap, jv_string("string"), jv_null());
+          } else {
+            fr = input_string;
+            for (idx = 0; fr < input_string+region->beg[i]; idx++) {
+              fr += jvp_utf8_decode_length(*fr);
+            }
+            cap = jv_object_set(jv_object(), jv_string("offset"), jv_number(idx));
+            cap = jv_object_set(cap, jv_string("string"), jv_string(""));
+          }
+          cap = jv_object_set(cap, jv_string("length"), jv_number(0));
+          cap = jv_object_set(cap, jv_string("name"), jv_null());
+          captures = jv_array_append(captures, cap);
+          continue;
+        }
+        fr = input_string;
+        for (idx = len = 0; fr < input_string+region->end[i]; len++) {
+          if (fr == input_string+region->beg[i]) idx = len, len=0;
+          fr += jvp_utf8_decode_length(*fr);
+        }
+
+        blen = region->end[i]-region->beg[i];
+        jv cap = jv_object_set(jv_object(), jv_string("offset"), jv_number(idx));
+        cap = jv_object_set(cap, jv_string("length"), jv_number(len));
+        cap = jv_object_set(cap, jv_string("string"), jv_string_sized(input_string+region->beg[i],blen));
+        cap = jv_object_set(cap, jv_string("name"), jv_null());
+        captures = jv_array_append(captures,cap);
+      }
+      onig_foreach_name(reg,f_match_name_iter,&captures);
+      match = jv_object_set(match, jv_string("captures"), captures);
+      result = jv_array_append(result, match);
+      start = (const UChar*)(input_string+region->end[0]);
+      onig_region_free(region,0);
+    } else if (onigret == ONIG_MISMATCH) {
+      if (test)
+        result = jv_false();
+      break;
+    } else { /* Error */
+      UChar ebuf[ONIG_MAX_ERROR_MESSAGE_LEN];
+      onig_error_code_to_str(ebuf, onigret, &einfo);
+      jv_free(result);
+      result = jv_invalid_with_msg(jv_string_concat(jv_string("Regex failure: "),
+            jv_string((char*)ebuf)));
+      break;
+    }
+  } while (global && start < end);
+  onig_region_free(region,1);
+  onig_free(reg);
+  jv_free(input);
+  jv_free(regex);
+  return result;
+}
+
 static jv minmax_by(jv values, jv keys, int is_min) {
   if (jv_get_kind(values) != JV_KIND_ARRAY)
     return type_error2(values, keys, "cannot be iterated over");
@@ -642,6 +852,7 @@ static const struct cfunction function_list[] = {
   {(cfunction_ptr)f_error, "error", 2},
   {(cfunction_ptr)f_format, "format", 2},
   {(cfunction_ptr)f_env, "env", 1},
+  {(cfunction_ptr)f_match, "_match_impl", 4},
 };
 #undef LIBM_DD
 
@@ -737,6 +948,12 @@ static const char* const jq_builtins[] = {
   "def flatten: reduce .[] as $i ([]; if $i | type == \"array\" then . + ($i | flatten) else . + [$i] end);",
   "def flatten(x): reduce .[] as $i ([]; if $i | type == \"array\" and x > 0 then . + ($i | flatten(x-1)) else . + [$i] end);",
   "def range(x): range(0;x);",
+  "def match(re; mode): _match_impl(re; mode; false)|.[];",
+  "def match(val): if val | type == \"string\" then match(val; null) elif val | type == \"array\" and (val | length) > 1 then match(val[0]; val[1]) elif val | type == \"array\" and (val | length > 0) then match(val[0]; null) else error((val | type) + \" not a string or array\") end;",
+  "def test(re; mode): _match_impl(re; mode; true);",
+  "def test(val): if val |type == \"string\" then test(val; null) elif val | type == \"array\" and (val | length) > 1 then test(val[0]; val[1]) elif val | type == \"array\" and (val | length > 0) then test(val[0]; null) else error((val | type) + \" not a string or array\") end;",
+//  "def test(re): _match(re; null; 1);",
+  
 };
 #undef LIBM_DD
 
diff --git a/configure.ac b/configure.ac
index e8dfed4..a0040e5 100644
--- a/configure.ac
+++ b/configure.ac
@@ -42,6 +42,63 @@ if test "x$LEX" != xflex; then
 fi
 
 
+##########################################################################
+# check for ONIGURUMA library
+##########################################################################
+
+AC_ARG_WITH([oniguruma],
+    [AS_HELP_STRING([--with-oniguruma=prefix],
+        [try this for a non-standard install prefix of the oniguruma library])],
+    [ONIGURUMAPATHSET=1],
+    [ONIGURUMAPATHSET=0])
+# POSIX test only defines "=" for string equality; "==" fails under dash and BSD sh.
+if test "x$ONIGURUMAPATHSET" = x1; then
+  CFLAGS="$CFLAGS -I${with_oniguruma}/include"
+  LDFLAGS="$LDFLAGS -L${with_oniguruma}/lib"
+fi
+
+# save the current *FLAGS, then merge in AM_*FLAGS for the compile and link probes
+OLD_CFLAGS=$CFLAGS;
+OLD_LDFLAGS=$LDFLAGS;
+CFLAGS="$AM_CFLAGS $CFLAGS"
+LDFLAGS="$AM_LDFLAGS $LDFLAGS"
+
+# add -lonig to LIBS for the link probe; it is taken out again only if that probe fails
+OLD_LIBS=$LIBS
+LIBS="$LIBS -lonig"
+
+# check for ONIGURUMA library headers
+AC_MSG_CHECKING([for oniguruma.h])
+# try to compile a file that includes a header of the library oniguruma
+AC_COMPILE_IFELSE([AC_LANG_PROGRAM([ #include <oniguruma.h> ])],
+    [AC_MSG_RESULT([yes])
+        # try to link the function 'onig_free' out of library oniguruma
+        AC_MSG_CHECKING([for oniguruma usability])
+        AC_LINK_IFELSE(
+            [AC_LANG_PROGRAM([[#include <oniguruma.h>]],
+                [[onig_free(0);]])],
+            [AC_MSG_RESULT([yes])
+                FOUND_ONIGURUMA=1;],
+            [AC_MSG_RESULT([no])
+                LIBS=$OLD_LIBS; dnl reset to old value since oniguruma was not found
+                FOUND_ONIGURUMA=0;])],
+    [AC_MSG_RESULT([not found])
+        FOUND_ONIGURUMA=0;])
+
+# restore the *FLAGS saved above; LIBS is left as set by the probes
+CFLAGS=$OLD_CFLAGS
+LDFLAGS=$OLD_LDFLAGS
+
+# oniguruma is mandatory: stop configure unless both probes succeeded
+if test $FOUND_ONIGURUMA != 1; then
+    AC_MSG_NOTICE([Oniguruma was not found.])
+    AC_MSG_NOTICE([ Try setting the location using '--with-oniguruma=PREFIX' ])
+    AC_MSG_ERROR([ oniguruma is required to build jq.])
+fi
+
+
+
+
 dnl Check for valgrind
 AC_CHECK_PROGS(valgrind_cmd, valgrind)
 if test "x$valgrind_cmd" = "x" ; then
diff --git a/docs/content/3.manual/manual.yml b/docs/content/3.manual/manual.yml
index aa0b15d..2e83129 100644
--- a/docs/content/3.manual/manual.yml
+++ b/docs/content/3.manual/manual.yml
@@ -1087,6 +1087,78 @@ sections:
             input: '["foobar", "barfoo"]'
             output: ['[false, true, true, false, false]']
 
+      - title: "`match(val)`, `match(regex; modifiers)`"
+        body: |
+
+          The filter `match(val)` performs Perl-compatible regex matching (via the Oniguruma library) on its input.
+          `val` can be either a string or an array.  If it is an array, 
+          the first element is the regex specifier and the optional
+          second element is the modifier flags.
+          The accepted modifier flags are:
+          
+          * `g` - Global search (find all matches, not just the first)
+          * `i` - Case insensitive search
+          * `x` - Extended regex format (ignore whitespaces)
+          * `m` - Multi line mode ('.' will match newlines)
+          * `s` - Single line mode ('^' -> '\A', '$' -> '\Z')
+          * `p` - Both s and m modes are enabled
+          * `l` - Find longest possible matches
+          * `n` - Ignore empty matches
+          
+          The filter outputs an object for each match it finds.  Matches have
+          the following fields:
+          
+          * `offset` - offset in UTF-8 codepoints from the beginning of the input
+          * `length` - length in UTF-8 codepoints of the match
+          * `string` - the string that it matched
+          * `captures` - an array of objects representing capturing groups.
+          
+          Capturing group objects have the following fields:
+          
+          * `offset` - offset in UTF-8 codepoints from the beginning of the input
+          * `length` - length in UTF-8 codepoints of this capturing group
+          * `string` - the string that was captured
+          * `name` - the name of the capturing group (or `null` if it was unnamed)
+
+          Capturing groups that did not match anything have an offset of -1 and a `string` of `null`.
+
+        examples:
+          - program: 'match("(abc)+"; "g")'
+            input: '"abc abc"'
+            output: 
+             - '{"offset": 0, "length": 3, "string": "abc", "captures": [{"offset": 0, "length": 3, "string": "abc", "name": null}]}'
+             - '{"offset": 4, "length": 3, "string": "abc", "captures": [{"offset": 4, "length": 3, "string": "abc", "name": null}]}'
+          - program: 'match("foo")'
+            input: '"foo bar foo"'
+            output: ['{"offset": 0, "length": 3, "string": "foo", "captures": []}']
+          - program: 'match(["foo", "ig"])'
+            input: '"foo bar FOO"'
+            output: 
+             - '{"offset": 0, "length": 3, "string": "foo", "captures": []}'
+             - '{"offset": 8, "length": 3, "string": "FOO", "captures": []}'
+          - program: 'match("foo (?<bar123>bar)? foo"; "ig")'
+            input: '"foo bar foo foo  foo"'
+            output:
+             - '{"offset": 0, "length": 11, "string": "foo bar foo", "captures": [{"offset": 4, "length": 3, "string": "bar", "name": "bar123"}]}'
+             - '{"offset": 12, "length": 8, "string": "foo  foo", "captures": [{"offset": -1, "length": 0, "string": null, "name": "bar123"}]}'
+
+
+      - title: "`test(val)`, `test(regex)`, `test(regex; modifiers)`"
+        body: |
+
+           Like `match`, but does not return match objects, only `true` or `false`
+           for whether or not the regex matches the input.
+
+        examples:
+         - program: 'test("foo")'
+           input: '"foo"'
+           output: ['true']
+         - program: 'test("foo"; "i")'
+           input: '"Foo"'
+           output: ['true']
+         - program: 'test("foo")'
+           input: '"bar"'
+           output: ['false']
       - title: "`ltrimstr(str)`"
         body: |
 
diff --git a/jv_unicode.c b/jv_unicode.c
index 907e985..c3f9f11 100644
--- a/jv_unicode.c
+++ b/jv_unicode.c
@@ -59,6 +59,13 @@ int jvp_utf8_is_valid(const char* in, const char* end) {
   return 1;
 }
 
+// Byte length (1-4) of the UTF-8 sequence whose first byte is startchar.
+int jvp_utf8_decode_length(char startchar) {
+	if ((startchar & 0x80) == 0) return 1;         // 0xxxxxxx
+	if ((startchar & 0xE0) == 0xC0) return 2;      // 110xxxxx
+	return (startchar & 0xF0) == 0xE0 ? 3 : 4;     // 1110xxxx, anything else counts as 4
+}
+
 int jvp_utf8_encode_length(int codepoint) {
   if (codepoint <= 0x7F) return 1;
   else if (codepoint <= 0x7FF) return 2;
diff --git a/tests/all.test b/tests/all.test
index 1e220cd..0b5b947 100644
--- a/tests/all.test
+++ b/tests/all.test
@@ -649,6 +649,50 @@ def inc(x): x |= .+1; inc(.[].a)
 ["fo", "foo", "barfoo", "foobar", "barfoob"]
 [false, true, true, false, false]
 
+# match builtin
+[match("( )*"; "g")]
+"abc"
+[{"offset":0, "length":0, "string":"", "captures":[]},{"offset":1, "length":0, "string":"", "captures":[]},{"offset":2, "length":0, "string":"", "captures":[]}]
+
+[match("( )*"; "gn")]
+"abc"
+[]
+
+[match("a"; "gi")]
+"ÄÃ¡Ã Ã¤"
+[]
+
+[match(["(bar)"])]
+"foo bar"
+[{"offset": 4, "length": 3, "string": "bar", "captures":[{"offset": 4, "length": 3, "string": "bar", "name": null}]}]
+
+# offsets account for combining codepoints and multi-byte UTF-8
+[match("bar")]
+"aÌ bar with a combining codepoint U+0304"
+[{"offset": 3, "length": 3, "string": "bar", "captures":[]}]
+
+# matches with combining codepoints still count them in their length
+[match("baÌr")]
+"a baÌr"
+[{"offset": 2, "length": 4, "string": "baÌr", "captures":[]}]
+
+[match(".+?\\b")]
+"aÌ two-codepoint grapheme"
+[{"offset": 0, "length": 2, "string": "aÌ", "captures":[]}]
+
+[match(["foo (?<bar123>bar)? foo", "ig"])]
+"foo bar foo foo  foo"
+[{"offset": 0, "length": 11, "string": "foo bar foo", "captures":[{"offset": 4, "length": 3, "string": "bar", "name": "bar123"}]},{"offset":12, "length": 8, "string": "foo  foo", "captures":[{"offset": -1, "length": 0, "string": null, "name": "bar123"}]}]
+
+#test builtin
+[test("( )*"; "gn")]
+"abc"
+[false]
+
+[test("Ä")]
+"Ä"
+[true]
+
 [.[]|ltrimstr("foo")]
 ["fo", "foo", "barfoo", "foobar", "afoo"]
 ["fo","","barfoo","bar","afoo"]
@@ -857,4 +901,4 @@ flatten(2)
 
 flatten(2)
 [0, [1, [2]], [1, [[3], 2]]]
-[0, 1, 2, 1, [3], 2]
\ No newline at end of file
+[0, 1, 2, 1, [3], 2]
diff --git a/tests/onig.supp b/tests/onig.supp
new file mode 100644
index 0000000..37c847e
--- /dev/null
+++ b/tests/onig.supp
@@ -0,0 +1,21 @@
+{
+	onig node recycling
+	Memcheck:Leak
+	...
+	fun:onig_parse_make_tree
+	...
+}
+{
+	onig unicode case insensitivity 1
+	Memcheck:Leak
+	...
+	fun:setup_tree
+	...
+}
+{
+	onig unicode case insensitivity 2
+	Memcheck:Leak
+	...
+	fun:onig*unicode*
+	...
+}
diff --git a/tests/run b/tests/run
index ff09246..5268e2e 100755
--- a/tests/run
+++ b/tests/run
@@ -1,7 +1,7 @@
 #!/bin/sh
 
 if which valgrind > /dev/null; then
-    VALGRIND='valgrind --error-exitcode=1 -q --leak-check=full'
+    VALGRIND='valgrind --error-exitcode=1 -q --leak-check=full --suppressions=tests/onig.supp'
 else
     VALGRIND=
 fi
-- 
2.40.0