From: Matthew Fernandez
Date: Sat, 9 Jul 2022 05:10:45 +0000 (-0700)
Subject: cgraph: implement a string tokenization API
X-Git-Tag: 5.0.1~35^2~9
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=68b2e24a5ac789a40d01dc821a5a02a1747b9911;p=graphviz

cgraph: implement a string tokenization API

This will be used in an upcoming commit.
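
As an illustration of the intended API (this mirrors the sample usage in
tokenize.h; the input and separators below are hypothetical):

    const char input[] = "a/b:c";
    for (tok_t t = tok(input, "/:"); !tok_end(&t); tok_next(&t)) {
      strview_t s = tok_get(&t);
      printf("%.*s\n", (int)s.size, s.data);
    }
    // prints "a", then "b", then "c"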
---

diff --git a/lib/cgraph/CMakeLists.txt b/lib/cgraph/CMakeLists.txt
index d3c5f4568..ac14bc490 100644
--- a/lib/cgraph/CMakeLists.txt
+++ b/lib/cgraph/CMakeLists.txt
@@ -23,6 +23,7 @@ add_library(cgraph SHARED
   stack.h
   strcasecmp.h
   strview.h
+  tokenize.h
   unreachable.h
   unused.h
diff --git a/lib/cgraph/Makefile.am b/lib/cgraph/Makefile.am
index 9542406db..1a58bda7f 100644
--- a/lib/cgraph/Makefile.am
+++ b/lib/cgraph/Makefile.am
@@ -10,7 +10,7 @@ endif
 
 pkginclude_HEADERS = cgraph.h
 noinst_HEADERS = agxbuf.h alloc.h bitarray.h cghdr.h exit.h itos.h likely.h \
-	prisize_t.h stack.h strcasecmp.h strview.h unreachable.h unused.h
+	prisize_t.h stack.h strcasecmp.h strview.h tokenize.h unreachable.h unused.h
 noinst_LTLIBRARIES = libcgraph_C.la
 lib_LTLIBRARIES = libcgraph.la
 pkgconfig_DATA = libcgraph.pc
diff --git a/lib/cgraph/cgraph.vcxproj b/lib/cgraph/cgraph.vcxproj
index c1ac98aa1..4fd19c689 100644
--- a/lib/cgraph/cgraph.vcxproj
+++ b/lib/cgraph/cgraph.vcxproj
@@ -109,6 +109,7 @@ win_flex -oscan.c scan.l
     <ClInclude Include="stack.h" />
     <ClInclude Include="strcasecmp.h" />
     <ClInclude Include="strview.h" />
+    <ClInclude Include="tokenize.h" />
     <ClInclude Include="unreachable.h" />
     <ClInclude Include="unused.h" />
diff --git a/lib/cgraph/cgraph.vcxproj.filters b/lib/cgraph/cgraph.vcxproj.filters
index 39676c160..83633d7db 100644
--- a/lib/cgraph/cgraph.vcxproj.filters
+++ b/lib/cgraph/cgraph.vcxproj.filters
@@ -51,6 +51,9 @@
     <ClInclude Include="strview.h">
       <Filter>Header Files</Filter>
     </ClInclude>
+    <ClInclude Include="tokenize.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
     <ClInclude Include="unreachable.h">
       <Filter>Header Files</Filter>
     </ClInclude>
diff --git a/lib/cgraph/test_tokenize.c b/lib/cgraph/test_tokenize.c
new file mode 100644
index 000000000..aea304467
--- /dev/null
+++ b/lib/cgraph/test_tokenize.c
@@ -0,0 +1,210 @@
+/// \file
+/// \brief basic unit tester for tokenize.h
+
+#ifdef NDEBUG
+#error this is not intended to be compiled with assertions off
+#endif
+
+#include <assert.h>
+#include <cgraph/strview.h>
+#include <cgraph/tokenize.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/// basic lifecycle
+static void test_basic(void) {
+  const char input[] = "hello world";
+  tok_t t = tok(input, " ");
+
+  assert(!tok_end(&t));
+  strview_t hello = tok_get(&t);
+  assert(strview_str_eq(hello, "hello"));
+  tok_next(&t);
+
+  assert(!tok_end(&t));
+  strview_t world = tok_get(&t);
+  assert(strview_str_eq(world, "world"));
+  tok_next(&t);
+
+  assert(tok_end(&t));
+}
+
+/// create, but never use, a tokenizer, so ASan can detect any memory it leaks
+static void test_no_usage(void) {
+  const char input[] = "hello world";
+  tok_t t = tok(input, " ");
+
+  // squash compiler warnings for `t` being unused
+  (void)t;
+}
+
+/// an example with multiple tokens
+static void test_multiple(void) {
+  const char input[] = "foo/bar/baz";
+  tok_t t = tok(input, "/");
+
+  assert(!tok_end(&t));
+  strview_t foo = tok_get(&t);
+  assert(strview_str_eq(foo, "foo"));
+  tok_next(&t);
+
+  assert(!tok_end(&t));
+  strview_t bar = tok_get(&t);
+  assert(strview_str_eq(bar, "bar"));
+  tok_next(&t);
+
+  assert(!tok_end(&t));
+  strview_t baz = tok_get(&t);
+  assert(strview_str_eq(baz, "baz"));
+  tok_next(&t);
+
+  assert(tok_end(&t));
+}
+
+/// input that starts with a separator
+static void test_leading(void) {
+  const char input[] = "/foo/bar";
+  tok_t t = tok(input, "/");
+
+  assert(!tok_end(&t));
+  strview_t empty = tok_get(&t);
+  assert(strview_str_eq(empty, ""));
+  tok_next(&t);
+
+  assert(!tok_end(&t));
+  strview_t foo = tok_get(&t);
+  assert(strview_str_eq(foo, "foo"));
+  tok_next(&t);
+
+  assert(!tok_end(&t));
+  strview_t bar = tok_get(&t);
+  assert(strview_str_eq(bar, "bar"));
+  tok_next(&t);
+
+  assert(tok_end(&t));
+}
+
+/// input that ends with a separator
+static void test_trailing(void) {
+  const char input[] = "foo/bar/";
+  tok_t t = tok(input, "/");
+
+  assert(!tok_end(&t));
+  strview_t foo = tok_get(&t);
+  assert(strview_str_eq(foo, "foo"));
+  tok_next(&t);
+
+  assert(!tok_end(&t));
+  strview_t bar = tok_get(&t);
+  assert(strview_str_eq(bar, "bar"));
+  tok_next(&t);
+
+  assert(!tok_end(&t));
+  strview_t empty = tok_get(&t);
+  assert(strview_str_eq(empty, ""));
+  tok_next(&t);
+
+  assert(tok_end(&t));
+}
+
+/// check multiple separators are coalesced into one
+static void test_coalesce(void) {
+  const char input[] = "foo//bar";
+  tok_t t = tok(input, "/");
+
+  assert(!tok_end(&t));
+  strview_t foo = tok_get(&t);
+  assert(strview_str_eq(foo, "foo"));
+  tok_next(&t);
+
+  assert(!tok_end(&t));
+  strview_t bar = tok_get(&t);
+  assert(strview_str_eq(bar, "bar"));
+  tok_next(&t);
+
+  assert(tok_end(&t));
+}
+
+/// tokenizing the empty string should produce one token
+static void test_empty(void) {
+  const char input[] = "";
+  tok_t t = tok(input, "/");
+
+  assert(!tok_end(&t));
+  strview_t empty = tok_get(&t);
+  assert(strview_str_eq(empty, ""));
+  tok_next(&t);
+
+  assert(tok_end(&t));
+}
+
+/// multiple different separators
+static void test_multiple_separators(void) {
+  const char input[] = "foo/bar:baz:/:qux";
+  tok_t t = tok(input, "/:");
+
+  assert(!tok_end(&t));
+  strview_t foo = tok_get(&t);
+  assert(strview_str_eq(foo, "foo"));
+  tok_next(&t);
+
+  assert(!tok_end(&t));
+  strview_t bar = tok_get(&t);
+  assert(strview_str_eq(bar, "bar"));
+  tok_next(&t);
+
+  assert(!tok_end(&t));
+  strview_t baz = tok_get(&t);
+  assert(strview_str_eq(baz, "baz"));
+  tok_next(&t);
+
+  assert(!tok_end(&t));
+  strview_t qux = tok_get(&t);
+  assert(strview_str_eq(qux, "qux"));
+  tok_next(&t);
+
+  assert(tok_end(&t));
+}
+
+/// input that consists solely of separators
+static void test_only_separators(void) {
+  const char input[] = "//:g/";
+  tok_t t = tok(input, "/:g");
+
+  assert(!tok_end(&t));
+  strview_t empty = tok_get(&t);
+  assert(strview_str_eq(empty, ""));
+  tok_next(&t);
+
+  assert(!tok_end(&t));
+  strview_t empty2 = tok_get(&t);
+  assert(strview_str_eq(empty2, ""));
+  tok_next(&t);
+
+  assert(tok_end(&t));
+}
+
+int main(void) {
+
+#define RUN(t) \
+  do { \
+    printf("running test_%s... ", #t); \
+    fflush(stdout); \
+    test_##t(); \
+    printf("OK\n"); \
+  } while (0)
+
+  RUN(basic);
+  RUN(no_usage);
+  RUN(multiple);
+  RUN(leading);
+  RUN(trailing);
+  RUN(coalesce);
+  RUN(empty);
+  RUN(multiple_separators);
+  RUN(only_separators);
+
+#undef RUN
+
+  return EXIT_SUCCESS;
+}
diff --git a/lib/cgraph/tokenize.h b/lib/cgraph/tokenize.h
new file mode 100644
index 000000000..b11010315
--- /dev/null
+++ b/lib/cgraph/tokenize.h
@@ -0,0 +1,99 @@
+/// \file
+/// \brief String tokenization
+///
+/// This is essentially equivalent to `strtok` but with two main improvements:
+///
+/// 1. The input string is not modified. This means, if you have a `const`
+///    string, you do not need to `strdup` it in order to tokenize it. This
+///    (combined with other properties like no opaque struct pointers) enables
+///    you to tokenize a string with no heap allocation.
+///
+/// 2. No global state. All the state for tokenization is contained in the
+///    `tok_t` struct.
+///
+/// The above two properties are intended to make string tokenization scalable
+/// (no locks, no thread-shared state) and transparent to the compiler (a good
+/// optimizing compiler implements all the string.h functions we use as
+/// built-ins and, if `separators` is a compile-time literal, can typically
+/// flatten everything into a tight loop with no function calls).
+///
+/// Sample usage:
+///
+///   const char my_input[] = "foo; bar:/baz";
+///   for (tok_t t = tok(my_input, ";:/"); !tok_end(&t); tok_next(&t)) {
+///     strview_t s = tok_get(&t);
+///     printf("%.*s\n", (int)s.size, s.data);
+///   }
+///   // prints “foo”, “ bar”, “baz”
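+///
+/// Note that each token is a `strview_t`, a non-owning view into the
+/// caller's original buffer rather than a copy. The input string must
+/// therefore outlive any tokens derived from it, and should not be modified
+/// while tokenization is in progress.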
+
+#pragma once
+
+#include <assert.h>
+#include <cgraph/strview.h>
+#include <stdbool.h>
+#include <string.h>
+
+/// state for an in-progress string tokenization
+typedef struct {
+  const char *start;      ///< start of the string being scanned
+  const char *separators; ///< characters to treat as token separators
+  strview_t next;         ///< next token to yield
+} tok_t;
+
+/// begin tokenization of a new string
+static inline tok_t tok(const char *input, const char *separators) {
+
+  assert(input != NULL);
+  assert(separators != NULL);
+  assert(strcmp(separators, "") != 0 &&
+         "at least one separator must be provided");
+
+  tok_t t = {.start = input, .separators = separators};
+
+  // find the end of the first token
+  size_t size = strcspn(input, separators);
+  t.next = (strview_t){.data = input, .size = size};
+
+  return t;
+}
+
+/// is this tokenizer exhausted?
+static inline bool tok_end(const tok_t *t) {
+
+  assert(t != NULL);
+
+  return t->next.data == NULL;
+}
+
+/// get the current token
+static inline strview_t tok_get(const tok_t *t) {
+
+  assert(t != NULL);
+  assert(t->next.data != NULL && "extracting from an exhausted tokenizer");
+
+  return t->next;
+}
+
+/// advance to the next token in the string being scanned
+static inline void tok_next(tok_t *t) {
+
+  assert(t != NULL);
+  assert(t->start != NULL);
+  assert(t->separators != NULL);
+  assert(t->next.data != NULL && "advancing an exhausted tokenizer");
+
+  // resume from where the previous token ended
+  const char *start = t->next.data + t->next.size;
+
+  // if we are at the end of the string, we are done
+  if (start == t->start + strlen(t->start)) {
+    t->next = (strview_t){0};
+    return;
+  }
+
+  // skip past the separator characters preceding the next token
+  start += strspn(start, t->separators);
+
+  // find the end of the next token
+  size_t size = strcspn(start, t->separators);
+
+  t->next = (strview_t){.data = start, .size = size};
+}
diff --git a/tests/test_c_utils.py b/tests/test_c_utils.py
index 89403706e..7ac616db5 100644
--- a/tests/test_c_utils.py
+++ b/tests/test_c_utils.py
@@ -9,7 +9,7 @@ import pytest
 sys.path.append(os.path.dirname(__file__))
 from gvtest import run_c #pylint: disable=wrong-import-position
 
-@pytest.mark.parametrize("utility", ("bitarray", "stack"))
+@pytest.mark.parametrize("utility", ("bitarray", "stack", "tokenize"))
 def test_utility(utility: str):
     """run the given utility’s unit tests"""