From: Matthew Fernandez
Date: Sat, 9 Jul 2022 05:10:45 +0000 (-0700)
Subject: cgraph: implement a string tokenization API
X-Git-Tag: 5.0.1~35^2~9
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=68b2e24a5ac789a40d01dc821a5a02a1747b9911;p=graphviz

cgraph: implement a string tokenization API

This will be used in an upcoming commit.
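
As an illustration of the intended API (this mirrors the sample usage in
tokenize.h; the input and separators below are hypothetical):

    const char input[] = "a/b:c";
    for (tok_t t = tok(input, "/:"); !tok_end(&t); tok_next(&t)) {
      strview_t s = tok_get(&t);
      printf("%.*s\n", (int)s.size, s.data);
    }
    // prints "a", then "b", then "c"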
---

diff --git a/lib/cgraph/CMakeLists.txt b/lib/cgraph/CMakeLists.txt
index d3c5f4568..ac14bc490 100644
--- a/lib/cgraph/CMakeLists.txt
+++ b/lib/cgraph/CMakeLists.txt
@@ -23,6 +23,7 @@ add_library(cgraph SHARED
   stack.h
   strcasecmp.h
   strview.h
+  tokenize.h
   unreachable.h
   unused.h
diff --git a/lib/cgraph/Makefile.am b/lib/cgraph/Makefile.am
index 9542406db..1a58bda7f 100644
--- a/lib/cgraph/Makefile.am
+++ b/lib/cgraph/Makefile.am
@@ -10,7 +10,7 @@ endif
 
 pkginclude_HEADERS = cgraph.h
 noinst_HEADERS = agxbuf.h alloc.h bitarray.h cghdr.h exit.h itos.h likely.h \
-	prisize_t.h stack.h strcasecmp.h strview.h unreachable.h unused.h
+	prisize_t.h stack.h strcasecmp.h strview.h tokenize.h unreachable.h unused.h
 noinst_LTLIBRARIES = libcgraph_C.la
 lib_LTLIBRARIES = libcgraph.la
 pkgconfig_DATA = libcgraph.pc
diff --git a/lib/cgraph/cgraph.vcxproj b/lib/cgraph/cgraph.vcxproj
index c1ac98aa1..4fd19c689 100644
--- a/lib/cgraph/cgraph.vcxproj
+++ b/lib/cgraph/cgraph.vcxproj
@@ -109,6 +109,7 @@ win_flex -oscan.c scan.l
     <ClInclude Include="stack.h" />
     <ClInclude Include="strcasecmp.h" />
     <ClInclude Include="strview.h" />
+    <ClInclude Include="tokenize.h" />
     <ClInclude Include="unreachable.h" />
     <ClInclude Include="unused.h" />
diff --git a/lib/cgraph/cgraph.vcxproj.filters b/lib/cgraph/cgraph.vcxproj.filters
index 39676c160..83633d7db 100644
--- a/lib/cgraph/cgraph.vcxproj.filters
+++ b/lib/cgraph/cgraph.vcxproj.filters
@@ -51,6 +51,9 @@
     <ClInclude Include="strview.h">
       <Filter>Header Files</Filter>
     </ClInclude>
+    <ClInclude Include="tokenize.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
     <ClInclude Include="unreachable.h">
       <Filter>Header Files</Filter>
     </ClInclude>
diff --git a/lib/cgraph/test_tokenize.c b/lib/cgraph/test_tokenize.c
new file mode 100644
index 000000000..aea304467
--- /dev/null
+++ b/lib/cgraph/test_tokenize.c
@@ -0,0 +1,210 @@
+/// \file
+/// \brief basic unit tester for tokenize.h
+
+#ifdef NDEBUG
+#error this is not intended to be compiled with assertions off
+#endif
+
+#include <assert.h>
+#include <cgraph/strview.h>
+#include <cgraph/tokenize.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/// basic lifecycle
+static void test_basic(void) {
+  const char input[] = "hello world";
+  tok_t t = tok(input, " ");
+
+  assert(!tok_end(&t));
+  strview_t hello = tok_get(&t);
+  assert(strview_str_eq(hello, "hello"));
+  tok_next(&t);
+
+  assert(!tok_end(&t));
+  strview_t world = tok_get(&t);
+  assert(strview_str_eq(world, "world"));
+  tok_next(&t);
+
+  assert(tok_end(&t));
+}
+
+/// create, but never use, a tokenizer, so ASan can detect any memory it leaks
+static void test_no_usage(void) {
+  const char input[] = "hello world";
+  tok_t t = tok(input, " ");
+
+  // squash compiler warnings for `t` being unused
+  (void)t;
+}
+
+/// an example with multiple tokens
+static void test_multiple(void) {
+  const char input[] = "foo/bar/baz";
+  tok_t t = tok(input, "/");
+
+  assert(!tok_end(&t));
+  strview_t foo = tok_get(&t);
+  assert(strview_str_eq(foo, "foo"));
+  tok_next(&t);
+
+  assert(!tok_end(&t));
+  strview_t bar = tok_get(&t);
+  assert(strview_str_eq(bar, "bar"));
+  tok_next(&t);
+
+  assert(!tok_end(&t));
+  strview_t baz = tok_get(&t);
+  assert(strview_str_eq(baz, "baz"));
+  tok_next(&t);
+
+  assert(tok_end(&t));
+}
+
+/// input that starts with a separator
+static void test_leading(void) {
+  const char input[] = "/foo/bar";
+  tok_t t = tok(input, "/");
+
+  assert(!tok_end(&t));
+  strview_t empty = tok_get(&t);
+  assert(strview_str_eq(empty, ""));
+  tok_next(&t);
+
+  assert(!tok_end(&t));
+  strview_t foo = tok_get(&t);
+  assert(strview_str_eq(foo, "foo"));
+  tok_next(&t);
+
+  assert(!tok_end(&t));
+  strview_t bar = tok_get(&t);
+  assert(strview_str_eq(bar, "bar"));
+  tok_next(&t);
+
+  assert(tok_end(&t));
+}
+
+/// input that ends with a separator
+static void test_trailing(void) {
+  const char input[] = "foo/bar/";
+  tok_t t = tok(input, "/");
+
+  assert(!tok_end(&t));
+  strview_t foo = tok_get(&t);
+  assert(strview_str_eq(foo, "foo"));
+  tok_next(&t);
+
+  assert(!tok_end(&t));
+  strview_t bar = tok_get(&t);
+  assert(strview_str_eq(bar, "bar"));
+  tok_next(&t);
+
+  assert(!tok_end(&t));
+  strview_t empty = tok_get(&t);
+  assert(strview_str_eq(empty, ""));
+  tok_next(&t);
+
+  assert(tok_end(&t));
+}
+
+/// check multiple separators are coalesced into one
+static void test_coalesce(void) {
+  const char input[] = "foo//bar";
+  tok_t t = tok(input, "/");
+
+  assert(!tok_end(&t));
+  strview_t foo = tok_get(&t);
+  assert(strview_str_eq(foo, "foo"));
+  tok_next(&t);
+
+  assert(!tok_end(&t));
+  strview_t bar = tok_get(&t);
+  assert(strview_str_eq(bar, "bar"));
+  tok_next(&t);
+
+  assert(tok_end(&t));
+}
+
+/// tokenizing the empty string should produce one token
+static void test_empty(void) {
+  const char input[] = "";
+  tok_t t = tok(input, "/");
+
+  assert(!tok_end(&t));
+  strview_t empty = tok_get(&t);
+  assert(strview_str_eq(empty, ""));
+  tok_next(&t);
+
+  assert(tok_end(&t));
+}
+
+/// multiple different separators
+static void test_multiple_separators(void) {
+  const char input[] = "foo/bar:baz:/:qux";
+  tok_t t = tok(input, "/:");
+
+  assert(!tok_end(&t));
+  strview_t foo = tok_get(&t);
+  assert(strview_str_eq(foo, "foo"));
+  tok_next(&t);
+
+  assert(!tok_end(&t));
+  strview_t bar = tok_get(&t);
+  assert(strview_str_eq(bar, "bar"));
+  tok_next(&t);
+
+  assert(!tok_end(&t));
+  strview_t baz = tok_get(&t);
+  assert(strview_str_eq(baz, "baz"));
+  tok_next(&t);
+
+  assert(!tok_end(&t));
+  strview_t qux = tok_get(&t);
+  assert(strview_str_eq(qux, "qux"));
+  tok_next(&t);
+
+  assert(tok_end(&t));
+}
+
+/// input that consists solely of separators
+static void test_only_separators(void) {
+  const char input[] = "//:g/";
+  tok_t t = tok(input, "/:g");
+
+  assert(!tok_end(&t));
+  strview_t empty = tok_get(&t);
+  assert(strview_str_eq(empty, ""));
+  tok_next(&t);
+
+  assert(!tok_end(&t));
+  strview_t empty2 = tok_get(&t);
+  assert(strview_str_eq(empty2, ""));
+  tok_next(&t);
+
+  assert(tok_end(&t));
+}
+
+int main(void) {
+
+#define RUN(t) \
+  do { \
+    printf("running test_%s... ", #t); \
+    fflush(stdout); \
+    test_##t(); \
+    printf("OK\n"); \
+  } while (0)
+
+  RUN(basic);
+  RUN(no_usage);
+  RUN(multiple);
+  RUN(leading);
+  RUN(trailing);
+  RUN(coalesce);
+  RUN(empty);
+  RUN(multiple_separators);
+  RUN(only_separators);
+
+#undef RUN
+
+  return EXIT_SUCCESS;
+}
diff --git a/lib/cgraph/tokenize.h b/lib/cgraph/tokenize.h
new file mode 100644
index 000000000..b11010315
--- /dev/null
+++ b/lib/cgraph/tokenize.h
@@ -0,0 +1,99 @@
+/// \file
+/// \brief String tokenization
+///
+/// This is essentially equivalent to `strtok` but with two main improvements:
+///
+/// 1. The input string is not modified. This means, if you have a `const`
+///    string, you do not need to `strdup` it in order to tokenize it. This
+///    (combined with other properties like no opaque struct pointers) enables
+///    you to tokenize a string with no heap allocation.
+///
+/// 2. No global state. All the state for tokenization is contained in the
+///    `tok_t` struct.
+///
+/// The above two properties are intended to make string tokenization scalable
+/// (no locks, no thread-shared state) and transparent to the compiler (a good
+/// optimizing compiler implements all the string.h functions we use as
+/// built-ins and, if `separators` is a compile-time literal, can typically
+/// flatten everything into a tight loop with no function calls).
+///
+/// Sample usage:
+///
+///   const char my_input[] = "foo; bar:/baz";
+///   for (tok_t t = tok(my_input, ";:/"); !tok_end(&t); tok_next(&t)) {
+///     strview_t s = tok_get(&t);
+///     printf("%.*s\n", (int)s.size, s.data);
+///   }
+///   // prints “foo”, “ bar”, “baz”
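+///
+/// Note that each token is a `strview_t`, a non-owning view into the
+/// caller's original buffer rather than a copy. The input string must
+/// therefore outlive any tokens derived from it, and should not be modified
+/// while tokenization is in progress.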
+
+#pragma once
+
+#include <assert.h>
+#include <cgraph/strview.h>
+#include <stdbool.h>
+#include <string.h>
+
+/// state for an in-progress string tokenization
+typedef struct {
+  const char *start;      ///< start of the string being scanned
+  const char *separators; ///< characters to treat as token separators
+  strview_t next;         ///< next token to yield
+} tok_t;
+
+/// begin tokenization of a new string
+static inline tok_t tok(const char *input, const char *separators) {
+
+  assert(input != NULL);
+  assert(separators != NULL);
+  assert(strcmp(separators, "") != 0 &&
+         "at least one separator must be provided");
+
+  tok_t t = {.start = input, .separators = separators};
+
+  // find the end of the first token
+  size_t size = strcspn(input, separators);
+  t.next = (strview_t){.data = input, .size = size};
+
+  return t;
+}
+
+/// is this tokenizer exhausted?
+static inline bool tok_end(const tok_t *t) {
+
+  assert(t != NULL);
+
+  return t->next.data == NULL;
+}
+
+/// get the current token
+static inline strview_t tok_get(const tok_t *t) {
+
+  assert(t != NULL);
+  assert(t->next.data != NULL && "extracting from an exhausted tokenizer");
+
+  return t->next;
+}
+
+/// advance to the next token in the string being scanned
+static inline void tok_next(tok_t *t) {
+
+  assert(t != NULL);
+  assert(t->start != NULL);
+  assert(t->separators != NULL);
+  assert(t->next.data != NULL && "advancing an exhausted tokenizer");
+
+  // resume from where the previous token ended
+  const char *start = t->next.data + t->next.size;
+
+  // if we are at the end of the string, we are done
+  if (start == t->start + strlen(t->start)) {
+    t->next = (strview_t){0};
+    return;
+  }
+
+  // skip past the separator characters preceding the next token
+  start += strspn(start, t->separators);
+
+  // find the end of the next token
+  size_t size = strcspn(start, t->separators);
+
+  t->next = (strview_t){.data = start, .size = size};
+}
diff --git a/tests/test_c_utils.py b/tests/test_c_utils.py
index 89403706e..7ac616db5 100644
--- a/tests/test_c_utils.py
+++ b/tests/test_c_utils.py
@@ -9,7 +9,7 @@ import pytest
 sys.path.append(os.path.dirname(__file__))
 from gvtest import run_c #pylint: disable=wrong-import-position
 
-@pytest.mark.parametrize("utility", ("bitarray", "stack"))
+@pytest.mark.parametrize("utility", ("bitarray", "stack", "tokenize"))
 def test_utility(utility: str):
     """run the given utility’s unit tests"""