From 8be94c8e968fa4d152c3b1339bf7e5379f1cd23a Mon Sep 17 00:00:00 2001
From: Matthew Fernandez
Date: Fri, 1 Oct 2021 18:35:26 -0700
Subject: [PATCH] xml_string0: outline the inner loop body functionality

The motivation for this change is to move towards consolidating XML-escaping
functionality. The introduced function, `xml_core`, is intended to be usable by
`xml_url_string` and other Graphviz XML escaping functions in future.

This aim of consolidation hopefully also explains why this change looks more
complicated than might be expected. It is intended to be functionally a no-op.
But `xml_core` takes a generalized callback and state because it is imagined a
future caller may want to pass in, e.g. `gvputs` as their target sink for
XML-escaped data. That is, future changes may side step the static buffer usage
and allocation that happens in `xml_string0` and instead pump bytes directly
into a final target file.

Related to #1868
---
Reviewer note: the line breaks, the XML entity literals and the #include
targets in this patch were reconstructed from a flattened, entity-decoded copy.
The entity literals follow from the `len` values in the removed code; the
indentation and the context-line #include names are best-effort and should be
confirmed against the upstream commit before applying.

 lib/common/xml.c | 199 ++++++++++++++++++++++++++++++-----------------
 1 file changed, 129 insertions(+), 70 deletions(-)

diff --git a/lib/common/xml.c b/lib/common/xml.c
index 064e2fab3..ebe348aea 100644
--- a/lib/common/xml.c
+++ b/lib/common/xml.c
@@ -1,8 +1,12 @@
+#include <assert.h>
 #include <common/memory.h>
 #include <common/types.h>
 #include <common/utils.h>
 #include <ctype.h>
+#include <limits.h>
 #include <stdbool.h>
+#include <stddef.h>
+#include <string.h>
 
 // variant of `isalpha` that assumes a C locale
 static bool isalpha_no_locale(char c) {
@@ -42,11 +46,111 @@ static int xml_isentity(const char *s)
     return 0;
 }
 
+// options to tweak the behavior of XML escaping
+typedef struct {
+  // assume no embedded escapes, and escape "\n" and "\r"
+  unsigned raw : 1;
+} xml_flags_t;
+
+/** XML-escape a character
+ *
+ * \param previous The source character preceding the current one or '\0' if
+ *   there was no prior character.
+ * \param current Pointer to the current position in a source string being
+ *   escaped.
+ * \param flags Options for configuring behavior.
+ * \param cb User function for emitting escaped data. This is expected to take a
+ *   caller-defined state type as the first parameter and the string to emit as
+ *   the second, and then return an opaque value that is passed back to the
+ *   caller.
+ * \param state Data to pass as the first parameter when calling `cb`.
+ * \return The return value of a call to `cb`.
+ */
+static int xml_core(char previous, const char *current, xml_flags_t flags,
+                    int (*cb)(void *state, const char *s), void *state) {
+
+  char c = *current;
+
+  // escape '&' only if not part of a legal entity sequence
+  if (c == '&' && (flags.raw || !xml_isentity(current)))
+    return cb(state, "&amp;");
+
+  // '<' '>' are safe to substitute even if string is already UTF-8 coded since
+  // UTF-8 strings won't contain '<' or '>'
+  if (c == '<')
+    return cb(state, "&lt;");
+
+  if (c == '>')
+    return cb(state, "&gt;");
+
+  // '-' cannot be used in XML comment strings
+  if (c == '-')
+    return cb(state, "&#45;");
+
+  if (c == ' ' && previous == ' ')
+    // substitute 2nd and subsequent spaces with required_spaces
+    return cb(state, "&#160;"); // Inkscape does not recognize &nbsp;
+
+  if (c == '"')
+    return cb(state, "&quot;");
+
+  if (c == '\'')
+    return cb(state, "&#39;");
+
+  if (c == '\n' && flags.raw)
+    return cb(state, "&#10;");
+
+  if (c == '\r' && flags.raw)
+    return cb(state, "&#13;");
+
+  // otherwise, output the character as-is
+  char buffer[2] = {c, '\0'};
+  return cb(state, buffer);
+}
+
 char *xml_string(char *s)
 {
     return xml_string0 (s, FALSE);
 }
 
+// a dynamically resizable string
+typedef struct {
+  char *base;
+  size_t length;
+  size_t capacity;
+} buffer_t;
+
+/** Write string data to a buffer
+ *
+ * \param dst A `buffer_t` to write to, but `void*` typed to align with the
+ *   callback type `xml_core` expects.
+ * \param src String to append.
+ * \return Number of characters written.
+ */
+static int buffer_put(void *dst, const char *src) {
+
+  buffer_t *buffer = dst;
+  size_t length = strlen(src);
+
+  // do we need to expand this buffer?
+  assert(buffer->base != NULL && "buffer not initialized in xml_string0?");
+  while (length > buffer->capacity ||
+         buffer->capacity - length <= buffer->length) {
+    size_t capacity = buffer->capacity == 0 ? 64 : (buffer->capacity * 2);
+    char *base = grealloc(buffer->base, capacity);
+    buffer->base = base;
+    buffer->capacity = capacity;
+  }
+
+  // write source data into the buffer
+  strcpy(buffer->base + buffer->length, src);
+  buffer->length += length;
+
+  // `xml_core` should only have given us short data
+  assert(length <= INT_MAX && "too large XML escape sequence");
+  return (int)length;
+}
+
 /* xml_string0:
  * Encode input string as an xml string.
  * If raw is true, the input is interpreted as having no
@@ -54,78 +158,33 @@ char *xml_string(char *s)
  * into &#10; and &#13;, respectively.
  * Uses a static buffer, so non-re-entrant.
  */
-char *xml_string0(char *s, boolean raw)
-{
-    static char *buf = NULL;
-    static int bufsize = 0;
-    char *p, *sub, *prev = NULL;
-    int len, pos = 0;
+char *xml_string0(char *s, boolean raw) {
+  static char *buf = NULL;
+  static size_t bufsize = 0;
+  char prev = '\0';
 
-    if (!buf) {
-	bufsize = 64;
-	buf = gmalloc(bufsize);
-    }
+  const xml_flags_t flags = {.raw = raw != FALSE};
 
-    p = buf;
-    while (s && *s) {
-	if (pos > (bufsize - 8)) {
-	    bufsize *= 2;
-	    buf = grealloc(buf, bufsize);
-	    p = buf + pos;
-	}
-	/* escape '&' only if not part of a legal entity sequence */
-	if (*s == '&' && (raw || !(xml_isentity(s)))) {
-	    sub = "&amp;";
-	    len = 5;
-	}
-	/* '<' '>' are safe to substitute even if string is already UTF-8 coded
-	 * since UTF-8 strings won't contain '<' or '>' */
-	else if (*s == '<') {
-	    sub = "&lt;";
-	    len = 4;
-	}
-	else if (*s == '>') {
-	    sub = "&gt;";
-	    len = 4;
-	}
-	else if (*s == '-') {	/* can't be used in xml comment strings */
-	    sub = "&#45;";
-	    len = 5;
-	}
-	else if (*s == ' ' && prev && *prev == ' ') {
-	    /* substitute 2nd and subsequent spaces with required_spaces */
-	    sub = "&#160;";	/* inkscape doesn't recognise &nbsp; */
-	    len = 6;
-	}
-	else if (*s == '"') {
-	    sub = "&quot;";
-	    len = 6;
-	}
-	else if (*s == '\'') {
-	    sub = "&#39;";
-	    len = 5;
-	}
-	else if ((*s == '\n') && raw) {
-	    sub = "&#10;";
-	    len = 5;
-	}
-	else if ((*s == '\r') && raw) {
-	    sub = "&#13;";
-	    len = 5;
-	}
-	else {
-	    sub = s;
-	    len = 1;
-	}
-	while (len--) {
-	    *p++ = *sub++;
-	    pos++;
-	}
-	prev = s;
-	s++;
-    }
-    *p = '\0';
-    return buf;
+  if (!buf) {
+    bufsize = 64;
+    buf = gmalloc(bufsize);
+  }
+
+  // generate an escaped version of this string into `buf`
+  buffer_t buffer = {.base = buf, .capacity = bufsize};
+  while (s && *s) {
+    (void)xml_core(prev, s, flags, buffer_put, &buffer);
+    prev = *s;
+    s++;
+  }
+  assert(buffer.length < buffer.capacity && "no room for NUL");
+  buffer.base[buffer.length] = '\0';
+
+  // save the static buffer (it may have been realloced) for reuse next time
+  buf = buffer.base;
+  bufsize = buffer.capacity;
+
+  return buf;
 }
 
 /* a variant of xml_string for urls in hrefs */
-- 
2.40.0