--- /dev/null
+/// \file
+/// \brief basic unit tester for tokenize.h
+
+#ifdef NDEBUG
+#error this is not intended to be compiled with assertions off
+#endif
+
+#include <assert.h>
+#include <cgraph/strview.h>
+#include <cgraph/tokenize.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/// basic lifecycle
+static void test_basic(void) {
+  const char input[] = "hello world";
+  const char *const expected[] = {"hello", "world"};
+
+  tok_t t = tok(input, " ");
+
+  // consume the tokenizer, checking each token against our expectation
+  for (size_t i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
+    assert(!tok_end(&t));
+    assert(strview_str_eq(tok_get(&t), expected[i]));
+    tok_next(&t);
+  }
+
+  assert(tok_end(&t));
+}
+
+/// create, but do not use a tokenizer in case ASan can detect leaking memory
+static void test_no_usage(void) {
+  const char input[] = "hello world";
+  tok_t tokenizer = tok(input, " ");
+
+  // silence -Wunused-variable; the point is construction without consumption
+  (void)tokenizer;
+}
+
+/// an example with multiple tokens
+static void test_multiple(void) {
+  const char input[] = "foo/bar/baz";
+  const char *const expected[] = {"foo", "bar", "baz"};
+
+  tok_t t = tok(input, "/");
+
+  // all three tokens should come out in order
+  for (size_t i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
+    assert(!tok_end(&t));
+    assert(strview_str_eq(tok_get(&t), expected[i]));
+    tok_next(&t);
+  }
+
+  assert(tok_end(&t));
+}
+
+/// input that starts with a separator
+static void test_leading(void) {
+  const char input[] = "/foo/bar";
+  // a leading separator yields an initial empty token
+  const char *const expected[] = {"", "foo", "bar"};
+
+  tok_t t = tok(input, "/");
+
+  for (size_t i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
+    assert(!tok_end(&t));
+    assert(strview_str_eq(tok_get(&t), expected[i]));
+    tok_next(&t);
+  }
+
+  assert(tok_end(&t));
+}
+
+/// input that ends with a separator
+static void test_trailing(void) {
+  const char input[] = "foo/bar/";
+  // a trailing separator yields a final empty token
+  const char *const expected[] = {"foo", "bar", ""};
+
+  tok_t t = tok(input, "/");
+
+  for (size_t i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
+    assert(!tok_end(&t));
+    assert(strview_str_eq(tok_get(&t), expected[i]));
+    tok_next(&t);
+  }
+
+  assert(tok_end(&t));
+}
+
+/// check multiple separators are coalesced into one
+static void test_coalesce(void) {
+  const char input[] = "foo//bar";
+  // the doubled separator should produce two tokens, not three
+  const char *const expected[] = {"foo", "bar"};
+
+  tok_t t = tok(input, "/");
+
+  for (size_t i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
+    assert(!tok_end(&t));
+    assert(strview_str_eq(tok_get(&t), expected[i]));
+    tok_next(&t);
+  }
+
+  assert(tok_end(&t));
+}
+
+/// tokenizing the empty string should produce one token
+static void test_empty(void) {
+  const char input[] = "";
+  tok_t t = tok(input, "/");
+
+  // exactly one (empty) token should be produced before exhaustion
+  assert(!tok_end(&t));
+  assert(strview_str_eq(tok_get(&t), ""));
+  tok_next(&t);
+
+  assert(tok_end(&t));
+}
+
+/// multiple different separators
+static void test_multiple_separators(void) {
+  const char input[] = "foo/bar:baz:/:qux";
+  // the ":/:" run should be coalesced into a single separator
+  const char *const expected[] = {"foo", "bar", "baz", "qux"};
+
+  tok_t t = tok(input, "/:");
+
+  for (size_t i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
+    assert(!tok_end(&t));
+    assert(strview_str_eq(tok_get(&t), expected[i]));
+    tok_next(&t);
+  }
+
+  assert(tok_end(&t));
+}
+
+/// input that consists solely of separators
+static void test_only_separators(void) {
+  const char input[] = "//:g/";
+  // expect a leading empty token and a trailing empty token, nothing else
+  const char *const expected[] = {"", ""};
+
+  tok_t t = tok(input, "/:g");
+
+  for (size_t i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
+    assert(!tok_end(&t));
+    assert(strview_str_eq(tok_get(&t), expected[i]));
+    tok_next(&t);
+  }
+
+  assert(tok_end(&t));
+}
+
+int main(void) {
+
+  // table of test cases; names are printed without their `test_` prefix added
+  // back at runtime
+  static const struct {
+    const char *name;
+    void (*fn)(void);
+  } tests[] = {
+      {"basic", test_basic},
+      {"no_usage", test_no_usage},
+      {"multiple", test_multiple},
+      {"leading", test_leading},
+      {"trailing", test_trailing},
+      {"coalesce", test_coalesce},
+      {"empty", test_empty},
+      {"multiple_separators", test_multiple_separators},
+      {"only_separators", test_only_separators},
+  };
+
+  for (size_t i = 0; i < sizeof(tests) / sizeof(tests[0]); ++i) {
+    printf("running test_%s... ", tests[i].name);
+    // flush in case the test crashes, so we see which one was running
+    fflush(stdout);
+    tests[i].fn();
+    printf("OK\n");
+  }
+
+  return EXIT_SUCCESS;
+}
--- /dev/null
+/// \file
+/// \brief String tokenization
+///
+/// This is essentially equivalent to `strtok` but with two main improvements:
+///
+/// 1. The input string is not modified. This means, if you have a `const`
+/// string, you do not need to `strdup` it in order to tokenize it. This
+/// (combined with other properties like no opaque struct pointers) enables
+/// you to tokenize a string with no heap allocation.
+///
+/// 2. No global state. All the state for tokenization is contained in the
+/// `tok_t` struct.
+///
+/// The above two properties are intended to make string tokenization scalable
+/// (no locks, no thread-shared state) and transparent to the compiler (a good
+/// optimizing compiler implements all the string.h functions we use as
+/// built-ins and, if `separators` is a compile-time literal, can typically
+/// flatten everything into a tight loop with no function calls).
+///
+/// Sample usage:
+///
+/// const char my_input[] = "foo; bar:/baz";
+/// for (tok_t t = tok(my_input, ";:/"); !tok_end(&t); tok_next(&t)) {
+/// strview_t s = tok_get(&t);
+/// printf("%.*s\n", (int)s.size, s.data);
+/// }
+/// // prints “foo”, “ bar”, “baz”
+
+#include <assert.h>
+#include <cgraph/strview.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <string.h>
+
+/// state for an in-progress string tokenization
+///
+/// Construct with `tok`, then inspect via `tok_get`/`tok_end` and advance via
+/// `tok_next`. This stores pointers into the caller's strings, so both the
+/// input and separators must outlive uses of this struct.
+typedef struct {
+  const char *start; ///< start of the string being scanned
+  const char *separators; ///< characters to treat as token separators
+  strview_t next; ///< next token to yield; `.data` is NULL once exhausted
+} tok_t;
+
+/// begin tokenization of a new string
+///
+/// \param input string to tokenize; must outlive the returned tokenizer
+/// \param separators non-empty set of characters that delimit tokens
+/// \return a tokenizer positioned at the first token
+static inline tok_t tok(const char *input, const char *separators) {
+
+  assert(input != NULL);
+  assert(separators != NULL);
+  assert(strcmp(separators, "") != 0 &&
+         "at least one separator must be provided");
+
+  // the first token runs from the start of the input to the first separator
+  // (or the whole input, if it contains no separator)
+  const size_t first_size = strcspn(input, separators);
+
+  return (tok_t){
+      .start = input,
+      .separators = separators,
+      .next = {.data = input, .size = first_size},
+  };
+}
+
+/// is this tokenizer exhausted?
+///
+/// \param t tokenizer to query
+/// \return true if there are no further tokens to yield
+static inline bool tok_end(const tok_t *t) {
+  assert(t != NULL);
+
+  // exhaustion is encoded as a NULL `next` payload (set by `tok_next`)
+  return NULL == t->next.data;
+}
+
+/// get the current token
+///
+/// The tokenizer must not be exhausted (`tok_end` must be false).
+///
+/// \param t tokenizer to read from
+/// \return the token currently under the cursor
+static inline strview_t tok_get(const tok_t *t) {
+  assert(t != NULL);
+  assert(t->next.data != NULL && "extracting from an exhausted tokenizer");
+  return t->next;
+}
+
+/// advance to the next token in the string being scanned
+///
+/// It is an error to call this on a tokenizer that is already exhausted
+/// (`tok_end` returning true).
+///
+/// \param t tokenizer to advance
+static inline void tok_next(tok_t *t) {
+
+  assert(t != NULL);
+  assert(t->start != NULL);
+  assert(t->separators != NULL);
+  assert(t->next.data != NULL && "advancing an exhausted tokenizer");
+
+  // resume from where the previous token ended
+  const char *start = t->next.data + t->next.size;
+
+  // if we are at the end of the string, we are done; `start` always points at
+  // either a separator or the string's NUL terminator, so this is equivalent
+  // to (and cheaper than) comparing against `t->start + strlen(t->start)`,
+  // which would make a full tokenization quadratic
+  if (*start == '\0') {
+    t->next = (strview_t){0};
+    return;
+  }
+
+  // skip over separator characters, coalescing adjacent separators
+  start += strspn(start, t->separators);
+
+  // find the end of the next token
+  size_t size = strcspn(start, t->separators);
+
+  t->next = (strview_t){.data = start, .size = size};
+}