# src_files are the primary files, and will be included in doxygen documentation
set(src_files
+ Sources/libMultiMarkdown/aho-corasick.c
Sources/libMultiMarkdown/beamer.c
Sources/libMultiMarkdown/char.c
Sources/libMultiMarkdown/d_string.c
# Primary header files, also for doxygen documentation
set(header_files
+ Sources/libMultiMarkdown/aho-corasick.h
Sources/libMultiMarkdown/beamer.h
Sources/libMultiMarkdown/char.h
Sources/libMultiMarkdown/include/d_string.h
--- /dev/null
+/**
+
+ C-Template -- Boilerplate c project with cmake support, CuTest unit testing, and more.
+
+ @file aho-corasick.c
+
+ @brief C implementation of the Aho-Corasick algorithm for searching text
+ for multiple strings simultaneously in a single pass without backtracking.
+
+ <https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm>
+
+
+ @author Fletcher T. Penney
+ @bug
+
+**/
+
+/*
+
+ Copyright © 2015-2017 Fletcher T. Penney.
+
+
+ The `c-template` project is released under the MIT License.
+
+ GLibFacade.c and GLibFacade.h are from the MultiMarkdown v4 project:
+
+ https://github.com/fletcher/MultiMarkdown-4/
+
+ MMD 4 is released under both the MIT License and GPL.
+
+
+ CuTest is released under the zlib/libpng license. See CuTest.c for the text
+ of the license.
+
+
+ ## The MIT License ##
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+
+#include "aho-corasick.h"
+
+#define kTrieStartingSize 256 // Default node capacity used by trie_new() when the requested size is 0 or 1
+
+void trie_to_graphviz(trie * a); // Forward declaration of the debug dump defined at the end of this file
+
+
+/**
+ * Allocate a trie that contains only the root node (state 0).
+ *
+ * @param startingSize  Initial node capacity; values of 0 or 1 select kTrieStartingSize.
+ * @return              The new trie, or NULL if either allocation fails.
+ *                      Release it with trie_free().
+ */
+trie * trie_new(size_t startingSize) {
+	if (startingSize <= 1) {
+		startingSize = kTrieStartingSize;
+	}
+
+	trie * a = malloc(sizeof(trie));
+
+	if (!a) {
+		return NULL;
+	}
+
+	// calloc hands back zeroed nodes and fails cleanly when
+	// startingSize * sizeof(trie_node) would overflow size_t
+	a->node = calloc(startingSize, sizeof(trie_node));
+
+	if (!a->node) {
+		free(a);
+		return NULL;
+	}
+
+	// All tries have a root node
+	a->size = 1;
+	a->capacity = startingSize;
+
+	return a;
+}
+
+
+/**
+ * Release a trie and its node array.
+ *
+ * @param a  Trie to free; NULL is accepted and ignored.
+ */
+void trie_free(trie * a) {
+	if (!a) {
+		return;
+	}
+
+	free(a->node);
+	free(a);
+}
+
+
+/**
+ * Insert the remainder of a key below state `s`, creating nodes as needed.
+ *
+ * @param a           Trie to modify (must not be NULL).
+ * @param s           State to start from.
+ * @param key         NUL-terminated remainder of the key still to be inserted.
+ * @param match_type  Value stored in the node where the key ends.
+ * @param depth       Number of key characters already consumed before `s`.
+ * @return            true on success; false if the node array could not be grown.
+ *                    On failure the trie stays valid; nodes created so far remain
+ *                    but carry no match.
+ */
+bool trie_node_insert(trie * a, size_t s, const unsigned char * key, unsigned short match_type, unsigned short depth) {
+	while (key[0] != '\0') {
+		size_t next = a->node[s].child[key[0]];
+
+		if (next == 0) {
+			// No transition yet for this character -- create a new node
+
+			// Ensure capacity
+			if (a->size == a->capacity) {
+				size_t new_capacity = a->capacity * 2;
+
+				// Refuse to grow if the new size would wrap around
+				if ((new_capacity <= a->capacity) ||
+				        (new_capacity > ((size_t) -1) / sizeof(trie_node))) {
+					return false;
+				}
+
+				// Keep the old pointer until realloc is known to have succeeded
+				trie_node * grown = realloc(a->node, new_capacity * sizeof(trie_node));
+
+				if (!grown) {
+					return false;
+				}
+
+				a->node = grown;
+				a->capacity = new_capacity;
+			}
+
+			// Claim the next free slot and initialize it to 0
+			next = a->size;
+			memset(&a->node[next], 0, sizeof(trie_node));
+
+			// Set char for new node
+			a->node[next].c = key[0];
+
+			// Current node points to next node
+			a->node[s].child[key[0]] = next;
+
+			// Increment size
+			a->size++;
+		}
+
+		// Advance forward
+		s = next;
+		key++;
+		depth++;
+	}
+
+	// We've hit end of key
+	a->node[s].match_type = match_type;
+	a->node[s].len = depth;
+
+	return true;
+}
+
+
+/**
+ * Add `key` to the trie and tag its final node with `match_type`.
+ *
+ * @return  Result of trie_node_insert(); false when `a` or `key` is NULL
+ *          or `key` is the empty string.
+ */
+bool trie_insert(trie * a, const char * key, unsigned short match_type) {
+	// Reject a missing trie, a missing key, and the empty key
+	if (!a || !key || (key[0] == '\0')) {
+		return false;
+	}
+
+	// Start at the root with no characters consumed yet
+	return trie_node_insert(a, 0, (const unsigned char *) key, match_type, 0);
+}
+
+
+#ifdef TEST
+/// Unit test: inserting "foo" into an empty trie builds the chain 0 -f-> 1 -o-> 2 -o-> 3.
+void Test_trie_insert(CuTest* tc) {
+	trie * t = trie_new(0);
+
+	// A fresh trie uses the default capacity and holds only the root
+	CuAssertIntEquals(tc, kTrieStartingSize, t->capacity);
+	CuAssertIntEquals(tc, 1, t->size);
+
+	trie_insert(t, "foo", 42);
+
+	// Root: no match, 'f' leads to node 1
+	CuAssertIntEquals(tc, 0, t->node[0].match_type);
+	CuAssertIntEquals(tc, 1, t->node[0].child['f']);
+	CuAssertIntEquals(tc, '\0', t->node[0].c);
+
+	// Node 1 ('f'): no match, 'o' leads to node 2
+	CuAssertIntEquals(tc, 0, t->node[1].match_type);
+	CuAssertIntEquals(tc, 2, t->node[1].child['o']);
+	CuAssertIntEquals(tc, 'f', t->node[1].c);
+
+	// Node 2 ('o'): no match, 'o' leads to node 3
+	CuAssertIntEquals(tc, 0, t->node[2].match_type);
+	CuAssertIntEquals(tc, 3, t->node[2].child['o']);
+	CuAssertIntEquals(tc, 'o', t->node[2].c);
+
+	// Node 3 ('o'): end of key, carries the match type and key length
+	CuAssertIntEquals(tc, 42, t->node[3].match_type);
+	CuAssertIntEquals(tc, 3, t->node[3].len);
+	CuAssertIntEquals(tc, 'o', t->node[3].c);
+
+	trie_free(t);
+}
+#endif
+
+
+/**
+ * Follow `query` character by character starting from state `s`.
+ *
+ * Characters are read as unsigned bytes so that values >= 0x80 (e.g. UTF-8
+ * sequences) index child[] correctly even where plain `char` is signed.
+ *
+ * @param a      Trie to search (must not be NULL).
+ * @param s      State to start from.
+ * @param query  NUL-terminated string to follow (must not be NULL).
+ * @return       The state reached after consuming all of `query`, or
+ *               (size_t) -1 if some character has no transition.
+ */
+size_t trie_node_search(trie * a, size_t s, const char * query) {
+	const unsigned char * p = (const unsigned char *) query;
+
+	for (; *p != '\0'; ++p) {
+		size_t next = a->node[s].child[*p];
+
+		if (next == 0) {
+			// Failed to match
+			return (size_t) -1;
+		}
+
+		// Partial match, keep going
+		s = next;
+	}
+
+	// Found matching state
+	return s;
+}
+
+
+/**
+ * Look up `query` starting at the root of the trie.
+ *
+ * @return  The state reached after consuming `query` (0, the root, for an
+ *          empty query), or (size_t) -1 when there is no such path or when
+ *          `a` or `query` is NULL. The reached state is not necessarily a
+ *          complete key -- check its match_type.
+ */
+size_t trie_search(trie * a, const char * query) {
+	if (!a || !query) {
+		// Report "not found" rather than 0, which would name the root state
+		return (size_t) -1;
+	}
+
+	return trie_node_search(a, 0, query);
+}
+
+
+/**
+ * Look up `query` and return the match type stored at the state it reaches.
+ *
+ * @return  The node's match_type (0 when the path exists but is not a
+ *          complete key), or (unsigned short) -1 when the path does not
+ *          exist or an argument is NULL.
+ */
+unsigned short trie_search_match_type(trie * a, const char * query) {
+	if (!a || !query) {
+		// Never index into a missing trie
+		return (unsigned short) -1;
+	}
+
+	size_t s = trie_search(a, query);
+
+	if (s == (size_t) -1) {
+		return (unsigned short) -1;
+	}
+
+	return a->node[s].match_type;
+}
+
+
+#ifdef TEST
+/// Unit test: exact lookups return the final state and match type of each key.
+void Test_trie_search(CuTest* tc) {
+	trie * t = trie_new(0);
+
+	// "foo" occupies states 1-3, "bar" 4-6, and "food" adds state 7
+	trie_insert(t, "foo", 42);
+	trie_insert(t, "bar", 41);
+	trie_insert(t, "food", 40);
+
+	size_t state = trie_search(t, "foo");
+	CuAssertIntEquals(tc, 3, state);
+	CuAssertIntEquals(tc, 42, trie_search_match_type(t, "foo"));
+
+	state = trie_search(t, "bar");
+	CuAssertIntEquals(tc, 6, state);
+	CuAssertIntEquals(tc, 41, trie_search_match_type(t, "bar"));
+
+	state = trie_search(t, "food");
+	CuAssertIntEquals(tc, 7, state);
+	CuAssertIntEquals(tc, 40, trie_search_match_type(t, "food"));
+
+	// A key that is not in the trie reports -1 from both lookups
+	state = trie_search(t, "foot");
+	CuAssertIntEquals(tc, -1, state);
+	CuAssertIntEquals(tc, (unsigned short) -1, trie_search_match_type(t, "foot"));
+
+	trie_free(t);
+}
+#endif
+
+
+/**
+ * Compute the failure link (`ac_fail`) of state `s`, then recurse into its children.
+ *
+ * `buffer` holds the `depth` characters on the path from the root to `s`.
+ * Each proper suffix of that path, longest first, is looked up with
+ * trie_search(); the first one that resolves to a non-root state becomes the
+ * failure link. If none does, `ac_fail` stays 0 (the root).
+ *
+ * @param a                 Trie being prepared; `ac_fail` of `s` is expected to be 0 on entry.
+ * @param s                 State to process.
+ * @param buffer            Scratch space large enough for the deepest path below `s` plus a terminator.
+ * @param depth             Number of path characters currently stored in `buffer`.
+ * @param last_match_state  Passed through unchanged to the recursive calls; not otherwise used.
+ */
+void ac_trie_node_prepare(trie * a, size_t s, char * buffer, unsigned short depth, size_t last_match_state) {
+	// Current node
+	trie_node * n = &(a->node[s]);
+
+	// Terminate the path string for this state. Only index `depth` is
+	// written, so the buffer never needs more than path length + 1 bytes.
+	buffer[depth] = '\0';
+
+	// The root has no proper suffix; for every other state start with the
+	// path minus its first character.
+	if (depth > 0) {
+		const char * suffix = buffer + 1;
+
+		// Find valid suffixes for failure path
+		while ((suffix[0] != '\0') && (n->ac_fail == 0)) {
+			n->ac_fail = trie_search(a, suffix);
+
+			if (n->ac_fail == (size_t) -1) {
+				n->ac_fail = 0;
+			}
+
+			if (n->ac_fail == s) {
+				// Something went wrong
+				fprintf(stderr, "Recursive trie fallback detected at state %zu('%c') - suffix:'%s'!\n", s, n->c, suffix);
+				n->ac_fail = 0;
+			}
+
+			suffix++;
+		}
+	}
+
+	// Prepare children
+	for (int i = 0; i < 256; ++i) {
+		if ((n->child[i] != 0) &&
+		        (n->child[i] != s)) {
+			// Extend the path by this child's character before descending
+			buffer[depth] = (char) i;
+
+			ac_trie_node_prepare(a, n->child[i], buffer, depth + 1, last_match_state);
+		}
+	}
+}
+
+/// Prepare trie for Aho-Corasick search algorithm by mapping failure connections.
+/// All existing `ac_fail` links are reset first. NULL is ignored. If the scratch
+/// buffer cannot be allocated, the links are left cleared (every failure goes to the root).
+void ac_trie_prepare(trie * a) {
+	if (!a) {
+		return;
+	}
+
+	// Clear old pointers
+	for (size_t i = 0; i < a->size; ++i) {
+		a->node[i].ac_fail = 0;
+	}
+
+	// Scratch buffer for the path from the root to the current node.
+	// Heap-allocated (the capacity can be large) with one spare byte so a
+	// terminator one past the deepest path still fits.
+	char * buffer = malloc(a->capacity + 1);
+
+	if (!buffer) {
+		return;
+	}
+
+	ac_trie_node_prepare(a, 0, buffer, 0, 0);
+
+	free(buffer);
+}
+
+
+
+#ifdef TEST
+/// Unit test: for the keys "a", "aa", "aaa", "aaaa" each state must fail back
+/// to the state of the next shorter key.
+void Test_trie_prepare(CuTest* tc) {
+	trie * a = trie_new(0);
+
+	trie_insert(a, "a", 1);
+	trie_insert(a, "aa", 2);
+	trie_insert(a, "aaa", 3);
+	trie_insert(a, "aaaa", 4);
+
+	ac_trie_prepare(a);
+
+	// Root plus one node per 'a'
+	CuAssertIntEquals(tc, 5, a->size);
+
+	// The root and the first-level node fall back to the root
+	CuAssertIntEquals(tc, 0, a->node[0].ac_fail);
+	CuAssertIntEquals(tc, 0, a->node[1].ac_fail);
+
+	// Deeper nodes fall back to their longest proper suffix
+	CuAssertIntEquals(tc, 1, a->node[2].ac_fail);
+	CuAssertIntEquals(tc, 2, a->node[3].ac_fail);
+	CuAssertIntEquals(tc, 3, a->node[4].ac_fail);
+
+	trie_free(a);
+}
+#endif
+
+
+/**
+ * Allocate a single, unlinked match record.
+ *
+ * @return  The new record with `next` and `prev` both NULL, or NULL if
+ *          allocation fails.
+ */
+match * match_new(size_t start, size_t len, unsigned short match_type) {
+	match * m = malloc(sizeof(match));
+
+	if (m) {
+		m->start = start;
+		m->len = len;
+		m->match_type = match_type;
+		m->next = NULL;
+		// Without this the list header's prev is indeterminate when it is read later
+		m->prev = NULL;
+	}
+
+	return m;
+}
+
+
+/**
+ * Free `m` and every record reachable from it through `next`.
+ *
+ * Walks the list iteratively so long result lists cannot exhaust the call
+ * stack. Records before `m` (via `prev`) are not touched. NULL is accepted.
+ */
+void match_free(match * m) {
+	while (m) {
+		match * following = m->next;
+
+		free(m);
+		m = following;
+	}
+}
+
+
+/**
+ * Create a match record and link it directly after `last`.
+ *
+ * @param last  Record to append to, or NULL to create a standalone record.
+ * @return      The new record, or NULL if allocation fails (in which case
+ *              `last` is left unchanged).
+ */
+match * match_add(match * last, size_t start, size_t len, unsigned short match_type) {
+	match * added = match_new(start, len, match_type);
+
+	if (!added) {
+		return NULL;
+	}
+
+	// Always give prev a defined value, even for a standalone record
+	added->prev = last;
+
+	if (last) {
+		last->next = added;
+	}
+
+	return added;
+}
+
+
+/**
+ * Scan `source` once and report every occurrence of every key in the trie.
+ *
+ * The trie must have been prepared with ac_trie_prepare(). Scanning stops
+ * after `len` bytes or at the first NUL byte, whichever comes first.
+ *
+ * @param a       Prepared trie.
+ * @param source  Text to scan.
+ * @param len     Maximum number of bytes of `source` to examine.
+ * @return        NULL when nothing matched (or on NULL arguments / failed
+ *                header allocation). Otherwise a list whose first record is a
+ *                placeholder header (start 0, len 0, type 0) followed by the
+ *                matches in the order they were found. Free with match_free().
+ */
+match * ac_trie_search(trie * a, const char * source, size_t len) {
+	if (!a || !source) {
+		return NULL;
+	}
+
+	// Store results in a linked list
+	match * result = NULL;
+	match * tail = NULL;
+
+	// Keep track of our state
+	size_t state = 0;
+	size_t counter = 0;
+
+	while ((counter < len) && (source[counter] != '\0')) {
+		// Read next character as an unsigned byte: a signed char >= 0x80
+		// would otherwise produce a negative index into child[]
+		unsigned char test_value = (unsigned char) source[counter++];
+
+		// Check for path that allows us to match next character
+		while ((state != 0) && (a->node[state].child[test_value] == 0)) {
+			state = a->node[state].ac_fail;
+		}
+
+		// Advance state for the next character
+		state = a->node[state].child[test_value];
+
+		// Report this state and every shorter match reachable through failure links
+		for (size_t temp_state = state; temp_state != 0; temp_state = a->node[temp_state].ac_fail) {
+			const trie_node * hit = &a->node[temp_state];
+
+			if (!hit->match_type) {
+				continue;
+			}
+
+			// This is a match -- create the list header on first use
+			if (!result) {
+				result = match_new(0, 0, 0);
+
+				if (!result) {
+					// Without a header any further records would be orphaned
+					return NULL;
+				}
+
+				result->prev = NULL;
+				tail = result;
+			}
+
+			match * added = match_add(tail, counter - hit->len, hit->len, hit->match_type);
+
+			if (added) {
+				tail = added;
+			}
+		}
+	}
+
+	return result;
+}
+
+
+/// Unlink `m` from its doubly linked list, joining its neighbours, and free it.
+void match_excise(match * m) {
+	match * before = m->prev;
+	match * after = m->next;
+
+	// Bridge the gap from the left ...
+	if (before) {
+		before->next = after;
+	}
+
+	// ... and from the right
+	if (after) {
+		after->prev = before;
+	}
+
+	free(m);
+}
+
+/**
+ * Count the matches in a result list, not counting the header record.
+ *
+ * @param m  List header as returned by ac_trie_search(); NULL (no matches) yields 0.
+ */
+int match_count(match * m) {
+	int result = 0;
+
+	if (!m) {
+		return 0;
+	}
+
+	// Skip header
+	for (m = m->next; m; m = m->next) {
+		result++;
+	}
+
+	return result;
+}
+
+
+/**
+ * Print one match to stderr as 'text'(type) at start:end.
+ *
+ * @param m       Match to print; NULL is ignored.
+ * @param source  The text the match offsets refer to; NULL is ignored.
+ */
+void match_describe(match * m, const char * source) {
+	if (!m || !source) {
+		return;
+	}
+
+	// %zu is the conversion for size_t; %lu is wrong where long is narrower
+	fprintf(stderr, "'%.*s'(%d) at %zu:%zu\n", (int)m->len, &source[m->start],
+	        m->match_type, m->start, m->start + m->len);
+}
+
+
+/**
+ * Print every match of a result list to stderr via match_describe().
+ *
+ * @param m       List header as returned by ac_trie_search(); NULL (no matches) prints nothing.
+ * @param source  The text the match offsets refer to.
+ */
+void match_set_describe(match * m, const char * source) {
+	if (!m) {
+		return;
+	}
+
+	// Skip header
+	for (m = m->next; m; m = m->next) {
+		match_describe(m, source);
+	}
+}
+
+
+/**
+ * Reduce a result list in place to leftmost/longest matches.
+ *
+ * Removed records are freed. The header record itself is never removed; it
+ * is recognized by its `len` of 0.
+ *
+ * @param header  List header as returned by ac_trie_search(); NULL is ignored.
+ */
+void match_set_filter_leftmost_longest(match * header) {
+	if (!header) {
+		return;
+	}
+
+	// The walk below can step back onto the header and then inspect its
+	// prev pointer, so make sure that pointer is well defined.
+	header->prev = NULL;
+
+	match * m = header->next;	// Skip header
+	match * n;
+
+	while (m) {
+		if (m->next) {
+			if (m->start == m->next->start) {
+				// The next match is longer than this one
+				n = m;
+				m = m->next;
+				match_excise(n);
+				continue;
+			}
+
+			while (m->next &&
+			        m->next->start > m->start &&
+			        m->next->start < m->start + m->len) {
+				// This match is "lefter" than next
+				match_excise(m->next);
+			}
+
+			while (m->next &&
+			        m->next->start < m->start) {
+				// Next match is "lefter" than us. Stepping back stops at the
+				// header at the latest, because no start is below the header's 0.
+				n = m;
+				m = m->prev;
+				match_excise(n);
+			}
+		}
+
+		// m may be the header here, which has no predecessor
+		while (m->prev &&
+		        m->prev->len &&
+		        m->prev->start >= m->start) {
+			// We are "lefter" than previous
+			n = m->prev;
+			match_excise(n);
+		}
+
+		m = m->next;
+	}
+}
+
+
+/// Run ac_trie_search() and, if anything matched, filter the list down to
+/// leftmost/longest matches. Returns the same list header (or NULL).
+match * ac_trie_leftmost_longest_search(trie * a, const char * source, size_t len) {
+	match * found = ac_trie_search(a, source, len);
+
+	if (!found) {
+		return found;
+	}
+
+	match_set_filter_leftmost_longest(found);
+
+	return found;
+}
+
+
+#ifdef TEST
+/// Smoke test for the Aho-Corasick search; results are printed to stderr, not asserted.
+void Test_aho_trie_search(CuTest* tc) {
+	// First pass: three plain words in a sentence
+	trie * a = trie_new(0);
+
+	trie_insert(a, "foo", 42);
+	trie_insert(a, "bar", 41);
+	trie_insert(a, "food", 40);
+
+	ac_trie_prepare(a);
+
+	match * m = ac_trie_search(a, "this is a bar that serves food.", 31);
+
+	match_free(m);
+	trie_free(a);
+
+	// Second pass: overlapping keys that share prefixes and suffixes
+	const char * text = "ABCDEFGGGAZABCABCDZABCABCZ";
+
+	a = trie_new(0);
+
+	trie_insert(a, "A", 1);
+	trie_insert(a, "AB", 2);
+	trie_insert(a, "ABC", 3);
+	trie_insert(a, "BC", 4);
+	trie_insert(a, "BCD", 5);
+	trie_insert(a, "E", 6);
+	trie_insert(a, "EFGHIJ", 7);
+	trie_insert(a, "F", 8);
+	trie_insert(a, "ZABCABCZ", 9);
+	trie_insert(a, "ZAB", 10);
+
+	ac_trie_prepare(a);
+
+	// Every match
+	m = ac_trie_search(a, text, 26);
+	fprintf(stderr, "Finish with %d matches\n", match_count(m));
+	match_set_describe(m, text);
+	match_free(m);
+
+	// Leftmost/longest matches only
+	m = ac_trie_leftmost_longest_search(a, text, 26);
+	fprintf(stderr, "Finish with %d matches\n", match_count(m));
+	match_set_describe(m, text);
+	match_free(m);
+
+	// trie_to_graphviz(a);
+
+	trie_free(a);
+}
+#endif
+
+
+/**
+ * Print the Graphviz description of one state to stderr: a double circle if
+ * the state is a match, one labelled edge per child, and a "fail" edge when
+ * the failure link is not the root.
+ */
+void trie_node_to_graphviz(trie * a, size_t s) {
+	trie_node * n = &a->node[s];
+
+	if (n->match_type) {
+		fprintf(stderr, "\"%zu\" [shape=doublecircle]\n", s);
+	}
+
+	for (int i = 0; i < 256; ++i) {
+		if (n->child[i]) {
+			fprintf(stderr, "\"%zu\" -> \"%zu\" [label=\"%c\"]\n", s, n->child[i], (char)i);
+		}
+	}
+
+	if (n->ac_fail) {
+		fprintf(stderr, "\"%zu\" -> \"%zu\" [label=\"fail\"]\n", s, n->ac_fail);
+	}
+}
+
+
+/**
+ * Print the whole trie to stderr as a Graphviz digraph (debugging aid).
+ *
+ * @param a  Trie to dump; NULL prints nothing.
+ */
+void trie_to_graphviz(trie * a) {
+	if (!a) {
+		return;
+	}
+
+	fprintf(stderr, "digraph dfa {\n");
+
+	// size_t index: a->size is a size_t and may exceed the range of int
+	for (size_t i = 0; i < a->size; ++i) {
+		trie_node_to_graphviz(a, i);
+	}
+
+	fprintf(stderr, "}\n");
+}
+
--- /dev/null
+/**
+
+ C-Template -- Boilerplate c project with cmake support, CuTest unit testing, and more.
+
+ @file aho-corasick.h
+
+ @brief Trie data structures and public API for Aho-Corasick multi-string searching.
+
+
+ @author Fletcher T. Penney
+ @bug
+
+**/
+
+/*
+
+ Copyright © 2015-2017 Fletcher T. Penney.
+
+
+ The `c-template` project is released under the MIT License.
+
+ GLibFacade.c and GLibFacade.h are from the MultiMarkdown v4 project:
+
+ https://github.com/fletcher/MultiMarkdown-4/
+
+ MMD 4 is released under both the MIT License and GPL.
+
+
+ CuTest is released under the zlib/libpng license. See CuTest.c for the text
+ of the license.
+
+
+ ## The MIT License ##
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+
+*/
+
+
+#ifndef AC_TEMPLATE_H
+#define AC_TEMPLATE_H
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <string.h>
+
+struct trie_node { // One state of the trie; states live in trie.node and refer to each other by array index
+ char c; // Character on the edge leading into this node ('\0' for the root)
+ unsigned short match_type; // 0 = no match, otherwise the caller-supplied type of the key that ends here
+ unsigned short len; // Length of the key that ends here (set together with match_type)
+ size_t child[256]; // Next state for each possible byte value; 0 = no transition
+ size_t ac_fail; // State to fall back to when no child matches; 0 = root
+};
+
+typedef struct trie_node trie_node;
+
+
+struct trie { // Growable array of trie nodes; index 0 is always the root
+ size_t size; // How many nodes are in use (at least 1 once created by trie_new)
+ size_t capacity; // How many nodes the array can hold before it must grow
+
+ trie_node * node; // Heap-allocated array of `capacity` nodes
+};
+
+typedef struct trie trie;
+
+
+struct match { // One search hit, kept in a doubly linked list
+ size_t start; // Starting offset of this match within the searched text
+ size_t len; // Length of this match (0 in the list's placeholder header record)
+ unsigned short match_type; // match_type of the key that was found
+ struct match * next; // Pointer to next match in list, NULL at the end
+ struct match * prev; // Pointer to previous match in list
+};
+
+typedef struct match match;
+
+
+trie * trie_new(size_t startingSize); // Create a trie holding only the root; sizes 0 and 1 select the default capacity; NULL on allocation failure
+
+bool trie_insert(trie * a, const char * key, unsigned short match_type); // Add key with the given non-zero match_type; false if a or key is NULL or key is empty
+
+void ac_trie_prepare(trie * a); // Recompute the ac_fail link of every node; run after inserting keys and before searching
+
+match * ac_trie_search(trie * a, const char * source, size_t len); // All matches in the first len bytes of source: header record followed by matches, or NULL if none
+
+match * ac_trie_leftmost_longest_search(trie * a, const char * source, size_t len); // As ac_trie_search, then filtered with match_set_filter_leftmost_longest
+
+void trie_free(trie * a); // Free the node array and the trie itself
+
+void match_set_describe(match * m, const char * source); // Print every match after the header record to stderr
+
+void match_set_filter_leftmost_longest(match * header); // Remove (and free) matches in place, keeping leftmost/longest ones
+
+void match_free(match * m); // Free m and all records after it in the list
+
+
+#ifdef TEST
+#include "CuTest.h"
+#endif
+
+
+#endif
mmd_export_token_tree_html(out, source, t->child, scratch);
break;
case PAIR_BRACKET_ABBREVIATION:
+ // Which might also be an "auto-tagged" abbreviation
if (scratch->extensions & EXT_NOTES) {
// Note-based syntax enabled
// Get instance of the note used
temp_note = stack_peek_index(scratch->used_abbreviations, temp_short - 1);
- t->child->type = TEXT_EMPTY;
- t->child->mate->type = TEXT_EMPTY;
+ if (t->child) {
+ t->child->type = TEXT_EMPTY;
+ t->child->mate->type = TEXT_EMPTY;
+ }
if (temp_short3 == scratch->inline_abbreviations_to_free->size) {
// This is a reference definition
print_const("<abbr title=\"");
mmd_print_string_html(out, temp_note->clean_text, false);
print_const("\">");
- mmd_export_token_tree_html(out, source, t->child, scratch);
+ if (t->child)
+ mmd_export_token_tree_html(out, source, t->child, scratch);
+ else
+ print_token(t);
print_const("</abbr>");
} else {
// This is the first time this note was used
print_const(" (<abbr title=\"");
mmd_print_string_html(out, temp_note->clean_text, false);
print_const("\">");
- mmd_export_token_tree_html(out, source, t->child, scratch);
+ if (t->child)
+ mmd_export_token_tree_html(out, source, t->child, scratch);
+ else
+ print_token(t);
print_const("</abbr>)");
}
} else {
}
break;
case PAIR_BRACKET_GLOSSARY:
+ // Which might also be an "auto-tagged" glossary
if (scratch->extensions & EXT_NOTES) {
// Note-based syntax enabled
void token_split_on_char(token * t, const char * source, const char c);
+void token_split(token * t, size_t start, size_t len, unsigned short new_type);
+
#endif
mmd_export_token_tree_latex(out, source, t->child, scratch);
break;
case PAIR_BRACKET_ABBREVIATION:
+ // Which might also be an "auto-tagged" abbreviation
if (scratch->extensions & EXT_NOTES) {
// Note-based syntax enabled
}
break;
case PAIR_BRACKET_GLOSSARY:
+ // Which might also be an "auto-tagged" glossary
if (scratch->extensions & EXT_NOTES) {
// Note-based syntax enabled
-/* Generated by re2c 0.14.3 on Sat Mar 4 20:43:39 2017 */
+/* Generated by re2c 0.14.3 on Thu Mar 9 19:02:17 2017 */
/**
MultiMarkdown 6 -- Lightweight markup processor to produce HTML, LaTeX, and more.
++YYCURSOR;
{ return TEXT_PERCENT; }
yy52:
- YYCTXMARKER = YYCURSOR + 1;
yyaccept = 5;
yych = *(YYMARKER = ++YYCURSOR);
switch (yych) {
default: goto yy61;
}
yy69:
+ YYCTXMARKER = YYCURSOR + 1;
yych = *++YYCURSOR;
switch (yych) {
case '\t':
default: goto yy264;
}
yy71:
- YYCTXMARKER = YYCURSOR + 1;
++YYCURSOR;
yych = *YYCURSOR;
switch (yych) {
'}' { return TEXT_BRACE_RIGHT; }
'\\' { return TEXT_BACKSLASH; }
- [0-9]+ / ('.' (SP|NL)) { return TEXT_NUMBER_POSS_LIST; }
+ [0-9]+ '.' / (SP|NL) { return TEXT_NUMBER_POSS_LIST; }
'.' / (SP|NL) { return TEXT_PERIOD; }
TEXT_LINEBREAK { return TEXT_LINEBREAK; }
break;
case TEXT_NUMBER_POSS_LIST:
switch(source[line->child->next->start]) {
- case '.':
- switch(source[line->child->next->start + 1]) {
- case ' ':
- case '\t':
- line->type = LINE_LIST_ENUMERATED;
- line->child->type = MARKER_LIST_ENUMERATOR;
-
- // Strip period
- line->child->next->type = TEXT_EMPTY;
-
- switch (line->child->next->next->type) {
- case TEXT_PLAIN:
- // Strip whitespace between bullet and text
- while (char_is_whitespace(source[line->child->next->next->start])) {
- line->child->next->next->start++;
- line->child->next->next->len--;
- }
- break;
- case INDENT_SPACE:
- case INDENT_TAB:
- case NON_INDENT_SPACE:
- t = line->child->next;
- while(t->next && ((t->next->type == INDENT_SPACE) ||
- (t->next->type == INDENT_TAB) ||
- (t->next->type == NON_INDENT_SPACE))) {
- tokens_prune(t->next, t->next);
- }
- break;
+ case ' ':
+ case '\t':
+ line->type = LINE_LIST_ENUMERATED;
+ line->child->type = MARKER_LIST_ENUMERATOR;
+
+ switch (line->child->next->type) {
+ case TEXT_PLAIN:
+ // Strip whitespace between bullet and text
+ while (char_is_whitespace(source[line->child->next->start])) {
+ line->child->next->start++;
+ line->child->next->len--;
}
break;
- default:
- line->type = LINE_PLAIN;
- line->child->type = TEXT_PLAIN;
+ case INDENT_SPACE:
+ case INDENT_TAB:
+ case NON_INDENT_SPACE:
+ t = line->child;
+ while(t->next && ((t->next->type == INDENT_SPACE) ||
+ (t->next->type == INDENT_TAB) ||
+ (t->next->type == NON_INDENT_SPACE))) {
+ tokens_prune(t->next, t->next);
+ }
break;
}
break;
}
break;
case PAIR_BRACKET_ABBREVIATION:
+ // Which might also be an "auto-tagged" abbreviation
if (scratch->extensions & EXT_NOTES) {
// Note-based syntax enabled
// Get instance of the note used
temp_note = stack_peek_index(scratch->used_abbreviations, temp_short - 1);
- t->child->type = TEXT_EMPTY;
- t->child->mate->type = TEXT_EMPTY;
+ if (t->child) {
+ t->child->type = TEXT_EMPTY;
+ t->child->mate->type = TEXT_EMPTY;
+ }
if (temp_short2 == scratch->used_abbreviations->size) {
// This is a re-use of a previously used note
if (temp_short3 == scratch->inline_abbreviations_to_free->size) {
// This is a reference definition
- mmd_export_token_tree_odf(out, source, t->child, scratch);
+ mmd_print_string_odf(out, temp_note->clean_text);
+// mmd_export_token_tree_odf(out, source, t->child, scratch);
} else {
// This is an inline definition
- mmd_export_token_tree_odf(out, source, t->child, scratch);
+ mmd_print_string_odf(out, temp_note->clean_text);
+// mmd_export_token_tree_odf(out, source, t->child, scratch);
}
} else {
// This is the first time this note was used
}
break;
case PAIR_BRACKET_GLOSSARY:
+ // Which might also be an "auto-tagged" glossary
if (scratch->extensions & EXT_NOTES) {
// Note-based syntax enabled
}
}
+
+/// Split token `t` so that the span [start, start + len) becomes a new sibling
+/// token of type `new_type` placed directly after `t`. Any text of `t` beyond
+/// that span becomes a further sibling with `t`'s own type, and `t` is shortened
+/// to end at `start`. If a needed token cannot be allocated, `t` is left as is.
+/// NOTE(review): only next/prev links are maintained here; confirm whether
+/// tokens carry parent/tail links that also need updating.
+void token_split(token * t, size_t start, size_t len, unsigned short new_type) {
+	if (!t) {
+		return;
+	}
+
+	token * u = token_new(new_type, start, len);
+
+	if (!u) {
+		return;
+	}
+
+	size_t stop = start + len;
+
+	// Last token of the newly inserted run (u, or v when a remainder exists)
+	token * last = u;
+
+	if (t->start + t->len > stop) {
+		// Text remains after the split-out span; it keeps the original type
+		token * v = token_new(t->type, stop, t->start + t->len - stop);
+
+		if (!v) {
+			// Leave the chain untouched; u is abandoned because no token
+			// release function is visible from here
+			return;
+		}
+
+		u->next = v;
+		v->prev = u;
+		last = v;
+	}
+
+	// Splice the new run between t and its former successor, fixing the
+	// successor's back link as well
+	last->next = t->next;
+
+	if (t->next) {
+		t->next->prev = last;
+	}
+
+	t->next = u;
+	u->prev = t;
+
+	t->len = start - t->start;
+}
+
#include "libMultiMarkdown.h"
+#include "aho-corasick.h"
#include "beamer.h"
#include "char.h"
#include "d_string.h"
scratch->base_header_level = header_level;
}
-/// kmp from http://stackoverflow.com/questions/8584644/strstr-for-a-string-that-is-not-null-terminated
-/// Search for a string within certain bounds (so we don't go past the end of the token)
-int *kmp_borders(char * needle, size_t nlen){
- if (!needle) return NULL;
- int i, j, *borders = malloc((nlen+1)*sizeof(*borders));
- if (!borders) return NULL;
- i = 0;
- j = -1;
- borders[i] = j;
- while((size_t)i < nlen){
- while(j >= 0 && needle[i] != needle[j]){
- j = borders[j];
- }
- ++i;
- ++j;
- borders[i] = j;
- }
- return borders;
-}
-
-const char * kmp_search(const char * haystack, size_t haylen, char * needle, size_t nlen, int * borders){
- size_t max_index = haylen-nlen, i = 0, j = 0;
- while(i <= max_index){
- while(j < nlen && *haystack && needle[j] == *haystack){
- ++j;
- ++haystack;
- }
- if (j == nlen){
- return haystack-nlen;
- }
- if (!(*haystack)){
- return NULL;
- }
- if (j == 0){
- ++haystack;
- ++i;
- } else {
- do{
- i += j - (size_t)borders[j];
- j = borders[j];
- }while(j > 0 && needle[j] != *haystack);
- }
- }
- return NULL;
-}
-
-const char * sstrnstr(const char * haystack, char * needle, size_t haylen){
- if (!haystack || !needle){
- return NULL;
- }
- size_t nlen = strlen(needle);
- if (haylen < nlen){
- return NULL;
- }
- int *borders = kmp_borders(needle, nlen);
- if (!borders){
- return NULL;
- }
- const char *match = kmp_search(haystack, haylen, needle, nlen, borders);
- free(borders);
- return match;
-}
-
-
-/// Search a text node for abbreviation matches
-/// TODO: This is an inefficient algorithm, searching
-/// each node once for *each* abbreviation. A more
-/// advanced algorithm would search for all abbreviations
-/// simultaneously but require more setup (e.g. Aho-Corasick)
-void abbr_search_text(mmd_engine * e, token * t) {
- const char * str = &e->dstr->str[t->start];
-
- const char * match;
- abbr * a;
- for (int i = 0; i < e->abbreviation_stack->size; ++i)
- {
- a = stack_peek_index(e->abbreviation_stack, i);
+void automatic_search_text(mmd_engine * e, token * t, trie * ac) {
+ match * m = ac_trie_leftmost_longest_search(ac, &e->dstr->str[t->start], t->len);
+
+ match * walker;
+
+ token * tok = t;
+
+ if (m) {
+ walker = m->next;
- match = sstrnstr(str, a->abbr, t->len);
+ while (walker) {
+ token_split(tok, walker->start + t->start, walker->len, walker->match_type);
+
+ // Advance token to section after the split (if present)
+ tok = tok->next->next;
- if (match) {
- fprintf(stderr, "Found match '%s' -> '%s' at %lu\n", a->abbr, a->expansion, (size_t) (match - e->dstr->str));
+ // Advance to next match (if present)
+ walker = walker->next;
}
}
+
+ match_free(m);
}
/// Determine which nodes to descend into to search for abbreviations
-void abbr_search(mmd_engine * e, token * t) {
+void automatic_search(mmd_engine * e, token * t, trie * ac) {
while (t) {
switch (t->type) {
case TEXT_PLAIN:
- abbr_search_text(e, t);
+ automatic_search_text(e, t, ac);
break;
case DOC_START_TOKEN:
- case BLOCK_LIST_BULLETED:
- case BLOCK_LIST_BULLETED_LOOSE:
- case BLOCK_LIST_ENUMERATED:
- case BLOCK_LIST_ENUMERATED_LOOSE:
- abbr_search(e, t->child);
- break;
case BLOCK_PARA:
case BLOCK_H1:
case BLOCK_H2:
case BLOCK_H4:
case BLOCK_H5:
case BLOCK_H6:
+ case BLOCK_LIST_BULLETED:
+ case BLOCK_LIST_BULLETED_LOOSE:
+ case BLOCK_LIST_ENUMERATED:
+ case BLOCK_LIST_ENUMERATED_LOOSE:
case BLOCK_LIST_ITEM_TIGHT:
case BLOCK_LIST_ITEM:
case BLOCK_SETEXT_1:
case BLOCK_TABLE:
case BLOCK_TABLE_HEADER:
case BLOCK_TABLE_SECTION:
- abbr_search_text(e, t);
+ case PAIR_QUOTE_DOUBLE:
+ case PAIR_QUOTE_SINGLE:
+ case PAIR_STAR:
+ case PAIR_UL:
+ automatic_search(e, t->child, ac);
break;
+// case PAIR_PAREN:
default:
break;
}
}
-void process_abbreviation_stack(mmd_engine * e, scratch_pad * scratch) {
- abbr * a;
+void identify_global_search_terms(mmd_engine * e, scratch_pad * scratch) {
+ // Only search if we have a target
+ int count = e->abbreviation_stack->size + e->glossary_stack->size;
- // Describe which abbreviations we are searching for
+ if (count == 0) {
+ return;
+ }
+
+ trie * ac = trie_new(0);
+ footnote * f;
+
+ // Add abbreviations to search trie
for (int i = 0; i < e->abbreviation_stack->size; ++i)
{
- a = stack_peek_index(e->abbreviation_stack, i);
+ f = stack_peek_index(e->abbreviation_stack, i);
+ trie_insert(ac, f->label_text, PAIR_BRACKET_ABBREVIATION);
+ }
+
+ // Add glossary to search trie
+ for (int i = 0; i < e->glossary_stack->size; ++i)
+ {
+ f = stack_peek_index(e->glossary_stack, i);
+ trie_insert(ac, f->clean_text, PAIR_BRACKET_GLOSSARY);
}
- abbr_search(e, e->root);
+ ac_trie_prepare(ac);
+ automatic_search(e, e->root, ac);
+ trie_free(ac);
}
// Process metadata
process_metadata_stack(e, scratch);
- // Process abbreviations
- // process_abbreviation_stack(e, scratch);
+ // Process abbreviations, glossary, etc.
+ if (!(e->extensions & EXT_COMPATIBILITY))
+ identify_global_search_terms(e, scratch);
switch (scratch->output_format) {
void glossary_from_bracket(const char * source, scratch_pad * scratch, token * t, short * num) {
// Get text inside bracket
- char * text = text_inside_pair(source, t);
+ char * text;
+
+ if (t->child) {
+ text = text_inside_pair(source, t);
+ } else {
+ text = malloc(t->len + 2);
+ text[0] = '?';
+ memcpy(&text[1], &source[t->start], t->len);
+ text[t->len + 1] = '\0';
+ }
+
short glossary_id = extract_glossary_from_stack(scratch, text);
free(text);
if (glossary_id == -1) {
// No match, this is an inline glossary -- create a new glossary entry
- t->child->type = TEXT_EMPTY;
- t->child->mate->type = TEXT_EMPTY;
+ if (t->child) {
+ t->child->type = TEXT_EMPTY;
+ t->child->mate->type = TEXT_EMPTY;
+ }
// Create glossary
token * label = t->child;
void abbreviation_from_bracket(const char * source, scratch_pad * scratch, token * t, short * num) {
// Get text inside bracket
- char * text = text_inside_pair(source, t);
+ char * text;
+
+ if (t->child) {
+ text = text_inside_pair(source, t);
+ } else {
+ text = malloc(t->len + 2);
+ text[0] = '>';
+ memcpy(&text[1], &source[t->start], t->len);
+ text[t->len + 1] = '\0';
+ }
+
short abbr_id = extract_abbreviation_from_stack(scratch, &text[1]);
free(text);
if (abbr_id == -1) {
// No match, this is an inline glossary -- create a new glossary entry
- t->child->type = TEXT_EMPTY;
- t->child->mate->type = TEXT_EMPTY;
-
+ if (t->child) {
+ t->child->type = TEXT_EMPTY;
+ t->child->mate->type = TEXT_EMPTY;
+ }
+
// Create glossary
token * label = t->child;
while (label && label->type != PAIR_PAREN)
<text:p text:style-name="Standard">BAR (bar)</text:p>
-<text:p text:style-name="Standard">foo bar</text:p>
+<text:p text:style-name="Standard">FOO BAR</text:p>
<text:p text:style-name="Standard">FOOBAR (foobar)</text:p>
<text:p text:style-name="Standard">5</text:p>
<text:p text:style-name="Standard"><text:p text:style-name="Footnote">BAZ (BAT)</text:p> (baz)</text:p>
+
+<text:p text:style-name="Standard">FOO</text:p>
+
+<text:p text:style-name="Standard">BAR</text:p>
+
+<text:p text:style-name="Standard">FOOBAR</text:p>
+
+<text:p text:style-name="Standard">FOO BAR baz</text:p>
</office:text>
</office:body>
</office:document>
<p>BAZ (BAT) (<abbr title="BAZ (BAT)">baz</abbr>)</p>
+<p><abbr title="FOO">foo</abbr></p>
+
+<p><abbr title="BAR">bar</abbr></p>
+
+<p><abbr title="FOOBAR">foobar</abbr></p>
+
+<p><abbr title="FOO BAR">foo bar</abbr> baz</p>
+
</body>
</html>
<p>5</p>
<p>[>(baz) BAZ (BAT)]</p>
+
+<p>foo</p>
+
+<p>bar</p>
+
+<p>foobar</p>
+
+<p>foo bar baz</p>
\newacronym{baz}{baz}{BAZ (BAT)}\gls{baz}
+\gls{foo}
+
+\gls{bar}
+
+\gls{foobar}
+
+\gls{foo bar} baz
+
\input{mmd6-article-footer}
\end{document}
[>(baz) BAZ (BAT)]
+foo
+
+bar
+
+foobar
+
+foo bar baz
+
[>foo]: FOO
[>bar]: BAR
<text:p text:style-name="Standard">[?bar]</text:p>
<text:p text:style-name="Standard">5</text:p>
+
+<text:p text:style-name="Standard">foo</text:p>
+
+<text:p text:style-name="Standard">Foo Bar</text:p>
</office:text>
</office:body>
</office:document>
<p>5</p>
+<p><a href="#gn:1" title="see glossary" class="glossary">foo</a></p>
+
+<p><a href="#gn:3" title="see glossary" class="glossary">Foo Bar</a></p>
+
<div class="glossary">
<hr />
<ol>
<p>5</p>
+<p>foo</p>
+
+<p>Foo Bar</p>
+
<pre><code>With second para.
</code></pre>
5
+\gls{foo}
+
+\gls{foobar}
+
\input{mmd6-article-footer}
\end{document}
5
+foo
+
+Foo Bar
+
+
[?foo]: Reference
[?Foo Bar]: Reference