cgraph: implement a string tokenization API

author Matthew Fernandez <matthew.fernandez@gmail.com>

Sat, 9 Jul 2022 05:10:45 +0000 (22:10 -0700)

committer Matthew Fernandez <matthew.fernandez@gmail.com>

Fri, 22 Jul 2022 00:41:57 +0000 (17:41 -0700)
author Matthew Fernandez <matthew.fernandez@gmail.com>
Sat, 9 Jul 2022 05:10:45 +0000 (22:10 -0700)
committer Matthew Fernandez <matthew.fernandez@gmail.com>
Fri, 22 Jul 2022 00:41:57 +0000 (17:41 -0700)
diff --git a/lib/cgraph/CMakeLists.txt b/lib/cgraph/CMakeLists.txt

index d3c5f45684abe8757dab8b3ea43fc7a202f7386f..ac14bc49074a3adfcadf0008e2a2f144b4e670cc 100644 (file)
--- a/lib/cgraph/CMakeLists.txt
+++ b/lib/cgraph/CMakeLists.txt
@@ -23,6 +23,7 @@ add_library(cgraph SHARED
    stack.h
    strcasecmp.h
    strview.h
+  tokenize.h
    unreachable.h
    unused.h
  
diff --git a/lib/cgraph/Makefile.am b/lib/cgraph/Makefile.am

index 9542406dbaca601fe74d33e605247d49b608ab80..1a58bda7fa4fd959332dd9bc6eecddf99044526d 100644 (file)
--- a/lib/cgraph/Makefile.am
+++ b/lib/cgraph/Makefile.am
@@ -10,7 +10,7 @@ endif
  
  pkginclude_HEADERS = cgraph.h
  noinst_HEADERS = agxbuf.h alloc.h bitarray.h cghdr.h exit.h itos.h likely.h \
-       prisize_t.h stack.h strcasecmp.h strview.h unreachable.h unused.h
+       prisize_t.h stack.h strcasecmp.h strview.h tokenize.h unreachable.h unused.h
  noinst_LTLIBRARIES = libcgraph_C.la
  lib_LTLIBRARIES = libcgraph.la
  pkgconfig_DATA = libcgraph.pc
diff --git a/lib/cgraph/cgraph.vcxproj b/lib/cgraph/cgraph.vcxproj

index c1ac98aa15c8f64faf868e5cb082381a84d10f2a..4fd19c689f0a6fc16604e76c5afb2bc10ff2f118 100644 (file)
--- a/lib/cgraph/cgraph.vcxproj
+++ b/lib/cgraph/cgraph.vcxproj
@@ -109,6 +109,7 @@ win_flex -oscan.c scan.l</Command>
      <ClInclude Include="stack.h" />
      <ClInclude Include="strcasecmp.h" />
      <ClInclude Include="strview.h" />
+    <ClInclude Include="tokenize.h" />
      <ClInclude Include="unreachable.h" />
      <ClInclude Include="unused.h" />
    </ItemGroup>
diff --git a/lib/cgraph/cgraph.vcxproj.filters b/lib/cgraph/cgraph.vcxproj.filters

index 39676c16043614b1d00924eeedb9d458fa018737..83633d7db1b7d835027d80a35f5b66d37f2f463a 100644 (file)
--- a/lib/cgraph/cgraph.vcxproj.filters
+++ b/lib/cgraph/cgraph.vcxproj.filters
@@ -51,6 +51,9 @@
      <ClInclude Include="strview.h">
        <Filter>Header Files</Filter>
      </ClInclude>
+    <ClInclude Include="tokenize.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
      <ClInclude Include="unreachable.h">
        <Filter>Header Files</Filter>
      </ClInclude>
diff --git a/lib/cgraph/test_tokenize.c b/lib/cgraph/test_tokenize.c

new file mode 100644 (file)

index 0000000..aea3044
--- /dev/null
+++ b/lib/cgraph/test_tokenize.c
@@ -0,0 +1,210 @@
+/// \file
+/// \brief basic unit tester for tokenize.h
+
+#ifdef NDEBUG
+#error this is not intended to be compiled with assertions off
+#endif
+
+#include <assert.h>
+#include <cgraph/strview.h>
+#include <cgraph/tokenize.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/// smoke test: create a tokenizer, visit each token, then reach the end
+static void test_basic(void) {
+  const char text[] = "hello world";
+  tok_t cursor = tok(text, " ");
+
+  // first word, up to the space
+  assert(!tok_end(&cursor));
+  assert(strview_str_eq(tok_get(&cursor), "hello"));
+  tok_next(&cursor);
+
+  // second word, up to the end of the string
+  assert(!tok_end(&cursor));
+  assert(strview_str_eq(tok_get(&cursor), "world"));
+  tok_next(&cursor);
+
+  assert(tok_end(&cursor));
+}
+
+/// abandon a freshly created tokenizer so ASan can flag any memory it leaks
+static void test_no_usage(void) {
+  const char text[] = "hello world";
+
+  // the tokenizer is deliberately never read; discarding the result as void
+  // keeps the compiler from warning about an unused value
+  (void)tok(text, " ");
+}
+
+/// three tokens split on a single separator character
+static void test_multiple(void) {
+  const char text[] = "foo/bar/baz";
+  tok_t cursor = tok(text, "/");
+
+  // first component, ended by the first separator
+  assert(!tok_end(&cursor));
+  assert(strview_str_eq(tok_get(&cursor), "foo"));
+  tok_next(&cursor);
+
+  // middle component, bounded by separators on both sides
+  assert(!tok_end(&cursor));
+  assert(strview_str_eq(tok_get(&cursor), "bar"));
+  tok_next(&cursor);
+
+  // final component, ended by the end of the string
+  assert(!tok_end(&cursor));
+  assert(strview_str_eq(tok_get(&cursor), "baz"));
+  tok_next(&cursor);
+
+  assert(tok_end(&cursor));
+}
+
+/// a separator at the very start of the input yields an empty first token
+static void test_leading(void) {
+  const char text[] = "/foo/bar";
+  tok_t cursor = tok(text, "/");
+
+  // nothing precedes the leading separator
+  assert(!tok_end(&cursor));
+  assert(strview_str_eq(tok_get(&cursor), ""));
+  tok_next(&cursor);
+
+  // text between the two separators
+  assert(!tok_end(&cursor));
+  assert(strview_str_eq(tok_get(&cursor), "foo"));
+  tok_next(&cursor);
+
+  // text after the last separator
+  assert(!tok_end(&cursor));
+  assert(strview_str_eq(tok_get(&cursor), "bar"));
+  tok_next(&cursor);
+
+  assert(tok_end(&cursor));
+}
+
+/// a separator at the very end of the input yields an empty last token
+static void test_trailing(void) {
+  const char text[] = "foo/bar/";
+  tok_t cursor = tok(text, "/");
+
+  // text before the first separator
+  assert(!tok_end(&cursor));
+  assert(strview_str_eq(tok_get(&cursor), "foo"));
+  tok_next(&cursor);
+
+  // text between the two separators
+  assert(!tok_end(&cursor));
+  assert(strview_str_eq(tok_get(&cursor), "bar"));
+  tok_next(&cursor);
+
+  // nothing follows the trailing separator
+  assert(!tok_end(&cursor));
+  assert(strview_str_eq(tok_get(&cursor), ""));
+  tok_next(&cursor);
+
+  assert(tok_end(&cursor));
+}
+
+/// a run of adjacent separators behaves like a single separator
+static void test_coalesce(void) {
+  const char text[] = "foo//bar";
+  tok_t cursor = tok(text, "/");
+
+  // text before the doubled separator
+  assert(!tok_end(&cursor));
+  assert(strview_str_eq(tok_get(&cursor), "foo"));
+  tok_next(&cursor);
+
+  // no empty token appears between the two adjacent separators
+  assert(!tok_end(&cursor));
+  assert(strview_str_eq(tok_get(&cursor), "bar"));
+  tok_next(&cursor);
+
+  assert(tok_end(&cursor));
+}
+
+/// an empty input still produces exactly one (empty) token
+static void test_empty(void) {
+  const char text[] = "";
+  tok_t cursor = tok(text, "/");
+
+  // the sole token is the empty string itself
+  assert(!tok_end(&cursor));
+  assert(strview_str_eq(tok_get(&cursor), ""));
+  tok_next(&cursor);
+
+  assert(tok_end(&cursor));
+}
+
+/// any character of a multi-character separator set ends a token
+static void test_multiple_separators(void) {
+  const char text[] = "foo/bar:baz:/:qux";
+  tok_t cursor = tok(text, "/:");
+
+  // ended by '/'
+  assert(!tok_end(&cursor));
+  assert(strview_str_eq(tok_get(&cursor), "foo"));
+  tok_next(&cursor);
+
+  // ended by ':'
+  assert(!tok_end(&cursor));
+  assert(strview_str_eq(tok_get(&cursor), "bar"));
+  tok_next(&cursor);
+
+  // ended by a run mixing both separator characters
+  assert(!tok_end(&cursor));
+  assert(strview_str_eq(tok_get(&cursor), "baz"));
+  tok_next(&cursor);
+
+  // ended by the end of the string
+  assert(!tok_end(&cursor));
+  assert(strview_str_eq(tok_get(&cursor), "qux"));
+  tok_next(&cursor);
+
+  assert(tok_end(&cursor));
+}
+
+/// an input made up entirely of separators yields two empty tokens
+static void test_only_separators(void) {
+  const char text[] = "//:g/";
+  tok_t cursor = tok(text, "/:g");
+
+  // empty token in front of the separator run
+  assert(!tok_end(&cursor));
+  assert(strview_str_eq(tok_get(&cursor), ""));
+  tok_next(&cursor);
+
+  // empty token behind the separator run, at the end of the string
+  assert(!tok_end(&cursor));
+  assert(strview_str_eq(tok_get(&cursor), ""));
+  tok_next(&cursor);
+
+  assert(tok_end(&cursor));
+}
+
+int main(void) {
+  // each test function, paired with the suffix shown in the progress output
+  static const struct {
+    const char *suffix;
+    void (*run)(void);
+  } cases[] = {
+      {"basic", test_basic},
+      {"no_usage", test_no_usage},
+      {"multiple", test_multiple},
+      {"leading", test_leading},
+      {"trailing", test_trailing},
+      {"coalesce", test_coalesce},
+      {"empty", test_empty},
+      {"multiple_separators", test_multiple_separators},
+      {"only_separators", test_only_separators},
+  };
+
+  for (size_t i = 0; i < sizeof(cases) / sizeof(cases[0]); ++i) {
+    printf("running test_%s... ", cases[i].suffix);
+    fflush(stdout); // flush the name before the test body runs
+    cases[i].run();
+    printf("OK\n");
+  }
+  return EXIT_SUCCESS;
+}
diff --git a/lib/cgraph/tokenize.h b/lib/cgraph/tokenize.h

new file mode 100644 (file)

index 0000000..b110103
--- /dev/null
+++ b/lib/cgraph/tokenize.h
@@ -0,0 +1,99 @@
+/// \file
+/// \brief String tokenization
+///
+/// This is like `strtok` (but may yield empty tokens), with two improvements:
+///
+///   1. The input string is not modified. This means, if you have a `const`
+///      string, you do not need to `strdup` it in order to tokenize it. This
+///      (combined with other properties like no opaque struct pointers) enables
+///      you to tokenize a string with no heap allocation.
+///
+///   2. No global state. All the state for tokenization is contained in the
+///      `tok_t` struct.
+///
+/// The above two properties are intended to make string tokenization scalable
+/// (no locks, no thread-shared state) and transparent to the compiler (a good
+/// optimizing compiler implements all the string.h functions we use as
+/// built-ins and, if `separators` is a compile-time literal, can typically
+/// flatten everything into a tight loop with no function calls).
+///
+/// Sample usage:
+///
+///   const char my_input[] = "foo; bar:/baz";
+///   for (tok_t t = tok(my_input, ";:/"); !tok_end(&t); tok_next(&t)) {
+///     strview_t s = tok_get(&t);
+///     printf("%.*s\n", (int)s.size, s.data);
+///   }
+///   // prints “foo”, “ bar”, “baz”
+
+#include <assert.h>
+#include <cgraph/strview.h>
+#include <stddef.h>
+#include <string.h>
+
+/// tokenizer state; `start` and `separators` are the caller's own pointers
+typedef struct {
+  const char *start;      ///< first character of the string being scanned
+  const char *separators; ///< NUL-terminated set of separator characters
+  strview_t next;         ///< token to yield next; NULL `data` once exhausted
+} tok_t;
+
+/// create a tokenizer over `input` that splits on any byte in `separators`
+static inline tok_t tok(const char *input, const char *separators) {
+
+  assert(input != NULL);
+  assert(separators != NULL);
+  assert(strcmp(separators, "") != 0 &&
+         "at least one separator must be provided");
+
+  // the first token runs up to the first separator or the end of `input`, so
+  // it is empty when `input` itself is empty or begins with a separator
+  const size_t first_len = strcspn(input, separators);
+
+  return (tok_t){.start = input,
+                 .separators = separators,
+                 .next = {.data = input, .size = first_len}};
+}
+
+/// report whether every token has already been consumed
+static inline bool tok_end(const tok_t *t) {
+  assert(t != NULL);
+  // a pending token with no data pointer is the exhaustion marker
+  const bool exhausted = t->next.data == NULL;
+  return exhausted;
+}
+
+/// read the pending token without advancing the tokenizer
+static inline strview_t tok_get(const tok_t *t) {
+  assert(t != NULL);
+  assert(t->next.data != NULL && "extracting from an exhausted tokenizer");
+
+  const strview_t pending = t->next; // views the input; no characters copied
+  return pending;
+}
+
+/// advance to the next token, or mark the tokenizer exhausted at end of input
+static inline void tok_next(tok_t *t) {
+
+  assert(t != NULL);
+  assert(t->start != NULL);
+  assert(t->separators != NULL);
+  assert(t->next.data != NULL && "advancing an exhausted tokenizer");
+
+  // resume from where the previous token ended; tokens always stop at either
+  // a separator character or the string's terminating NUL
+  const char *start = t->next.data + t->next.size;
+
+  // a NUL here means the input is used up; testing the byte directly avoids
+  // re-measuring the whole input with strlen on every advance
+  if (*start == '\0') {
+    t->next = (strview_t){0};
+    return;
+  }
+
+  // skip the run of separators that terminated the previous token
+  start += strspn(start, t->separators);
+
+  // the next token runs up to the following separator or the end of input
+  t->next = (strview_t){.data = start, .size = strcspn(start, t->separators)};
+}
diff --git a/tests/test_c_utils.py b/tests/test_c_utils.py

index 89403706e0de63d12b7834d3555f3d8a8d5edfa9..7ac616db5ee46f377666f2442edd20054aab5baf 100644 (file)
--- a/tests/test_c_utils.py
+++ b/tests/test_c_utils.py
@@ -9,7 +9,7 @@ import pytest
  sys.path.append(os.path.dirname(__file__))
  from gvtest import run_c #pylint: disable=wrong-import-position
  
-@pytest.mark.parametrize("utility", ("bitarray", "stack"))
+@pytest.mark.parametrize("utility", ("bitarray", "stack", "tokenize"))
  def test_utility(utility: str):
    """run the given utility’s unit tests"""
author	Matthew Fernandez <matthew.fernandez@gmail.com>
	Sat, 9 Jul 2022 05:10:45 +0000 (22:10 -0700)
committer	Matthew Fernandez <matthew.fernandez@gmail.com>
	Fri, 22 Jul 2022 00:41:57 +0000 (17:41 -0700)
lib/cgraph/CMakeLists.txt		patch \| blob \| history
lib/cgraph/Makefile.am		patch \| blob \| history
lib/cgraph/cgraph.vcxproj		patch \| blob \| history
lib/cgraph/cgraph.vcxproj.filters		patch \| blob \| history
lib/cgraph/test_tokenize.c	[new file with mode: 0644]	patch \| blob
lib/cgraph/tokenize.h	[new file with mode: 0644]	patch \| blob
tests/test_c_utils.py		patch \| blob \| history