From 04c2748fbb3cd8472e353815677ca4444e0d6f61 Mon Sep 17 00:00:00 2001 From: "Fletcher T. Penney" Date: Thu, 9 Mar 2017 19:37:09 -0500 Subject: [PATCH] ADDED: Add functionality to automatically identify abbreviations and glossary terms in source --- CMakeLists.txt | 2 + Sources/libMultiMarkdown/aho-corasick.c | 591 +++++++++++++++++++++++ Sources/libMultiMarkdown/aho-corasick.h | 117 +++++ Sources/libMultiMarkdown/html.c | 18 +- Sources/libMultiMarkdown/include/token.h | 2 + Sources/libMultiMarkdown/latex.c | 2 + Sources/libMultiMarkdown/lexer.c | 5 +- Sources/libMultiMarkdown/lexer.re | 2 +- Sources/libMultiMarkdown/mmd.c | 51 +- Sources/libMultiMarkdown/odf.c | 14 +- Sources/libMultiMarkdown/token.c | 25 + Sources/libMultiMarkdown/writer.c | 191 ++++---- tests/MMD6Tests/Abbreviations.fodt | 10 +- tests/MMD6Tests/Abbreviations.html | 8 + tests/MMD6Tests/Abbreviations.htmlc | 8 + tests/MMD6Tests/Abbreviations.tex | 8 + tests/MMD6Tests/Abbreviations.text | 8 + tests/MMD6Tests/Glossaries.fodt | 4 + tests/MMD6Tests/Glossaries.html | 4 + tests/MMD6Tests/Glossaries.htmlc | 4 + tests/MMD6Tests/Glossaries.tex | 4 + tests/MMD6Tests/Glossaries.text | 5 + 22 files changed, 936 insertions(+), 147 deletions(-) create mode 100644 Sources/libMultiMarkdown/aho-corasick.c create mode 100644 Sources/libMultiMarkdown/aho-corasick.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 214ebde..4029546 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -172,6 +172,7 @@ configure_file ( # src_files are the primary files, and will be included in doxygen documentation set(src_files + Sources/libMultiMarkdown/aho-corasick.c Sources/libMultiMarkdown/beamer.c Sources/libMultiMarkdown/char.c Sources/libMultiMarkdown/d_string.c @@ -194,6 +195,7 @@ set(src_files # Primary header files, also for doxygen documentation set(header_files + Sources/libMultiMarkdown/aho-corasick.h Sources/libMultiMarkdown/beamer.h Sources/libMultiMarkdown/char.h Sources/libMultiMarkdown/include/d_string.h diff --git a/Sources/libMultiMarkdown/aho-corasick.c b/Sources/libMultiMarkdown/aho-corasick.c new file mode 100644 index 0000000..7445aa4 --- /dev/null +++ b/Sources/libMultiMarkdown/aho-corasick.c @@ -0,0 +1,591 @@ +/** + + C-Template -- Boilerplate c project with cmake support, CuTest unit testing, and more. + + @file aho-corasick.c + + @brief C implementation of the Aho-Corasick algorithm for searching text + for multiple strings simultaneously in a single pass without backtracking. + + + + + @author Fletcher T. Penney + @bug + +**/ + +/* + + Copyright © 2015-2017 Fletcher T. Penney. + + + The `c-template` project is released under the MIT License. + + GLibFacade.c and GLibFacade.h are from the MultiMarkdown v4 project: + + https://github.com/fletcher/MultiMarkdown-4/ + + MMD 4 is released under both the MIT License and GPL. + + + CuTest is released under the zlib/libpng license. See CuTest.c for the text + of the license. + + + ## The MIT License ## + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + +*/ + +#include +#include +#include + +#include "aho-corasick.h" + +#define kTrieStartingSize 256 + +void trie_to_graphviz(trie * a); + + +trie * trie_new(size_t startingSize) { + trie * a = malloc(sizeof(trie)); + + if (a) { + if (startingSize <= 1) + startingSize = kTrieStartingSize; + + a->node = malloc(sizeof(trie_node) * startingSize); + + if (!a->node) { + free(a); + return NULL; + } + + // Clear memory + memset(a->node, 0, sizeof(trie_node) * startingSize); + + // All tries have a root node + a->size = 1; + a->capacity = startingSize; + } + + return a; +} + + +void trie_free(trie * a) { + free(a->node); + free(a); +} + + +bool trie_node_insert(trie * a, size_t s, const unsigned char * key, unsigned short match_type, unsigned short depth) { + // Get node for state s + trie_node * n = &a->node[s]; + + size_t i; + + if (key[0] == '\0') { + // We've hit end of key + n->match_type = match_type; + n->len = depth; + return true; // Success + } + + if (n->child[key[0]] != 0) { + // First character already in trie, advance forward + return trie_node_insert(a, n->child[key[0]], key + 1, match_type, ++depth); + } else { + // Create new node + + // Ensure capacity + if (a->size == a->capacity) { + a->capacity *= 2; + a->node = realloc(a->node, a->capacity * sizeof(trie_node)); + + // Set n to new location + n = &(a->node[s]); + } + + // Current node points to next node + i = a->size; + n->child[key[0]] = i; + + // Initialize new node to 0 + n = &a->node[i]; + memset(n, 0, sizeof(trie_node)); + + // Set char for new node + n->c = key[0]; + + // Incremement size + a->size++; + + // Advance forward + return trie_node_insert(a, i, key + 1, match_type, ++depth); + } +} + + +bool trie_insert(trie * a, const char * key, unsigned short match_type) { + if (a && key && (key[0] != '\0')) { + return trie_node_insert(a, 0, (const unsigned char *)key, match_type, 0); + } + + return false; +} + + +#ifdef TEST +void Test_trie_insert(CuTest* tc) { + trie * a = trie_new(0); + + CuAssertIntEquals(tc, kTrieStartingSize, a->capacity); + CuAssertIntEquals(tc, 1, a->size); + + trie_insert(a, "foo", 42); + + trie_node * n = &a->node[0]; + CuAssertIntEquals(tc, 0, n->match_type); + CuAssertIntEquals(tc, 1, n->child['f']); + CuAssertIntEquals(tc, '\0', n->c); + + n = &a->node[1]; + CuAssertIntEquals(tc, 0, n->match_type); + CuAssertIntEquals(tc, 2, n->child['o']); + CuAssertIntEquals(tc, 'f', n->c); + + n = &a->node[2]; + CuAssertIntEquals(tc, 0, n->match_type); + CuAssertIntEquals(tc, 3, n->child['o']); + CuAssertIntEquals(tc, 'o', n->c); + + n = &a->node[3]; + CuAssertIntEquals(tc, 42, n->match_type); + CuAssertIntEquals(tc, 3, n->len); + CuAssertIntEquals(tc, 'o', n->c); + + trie_free(a); +} +#endif + + +size_t trie_node_search(trie * a, size_t s, const char * query) { + if (query[0] == '\0') { + // Found matching state + return s; + } + + if (a->node[s].child[query[0]] == 0) { + // Failed to match + return -1; + } + + // Partial match, keep going + return trie_node_search(a, a->node[s].child[query[0]], query + 1); +} + + +size_t trie_search(trie * a, const char * query) { + if (a && query) { + return trie_node_search(a, 0, query); + } + + return 0; +} + + +unsigned short trie_search_match_type(trie * a, const char * query) { + size_t s = trie_search(a, query); + + if (s == -1) + return -1; + + return a->node[s].match_type; +} + + +#ifdef TEST +void Test_trie_search(CuTest* tc) { + trie * a = trie_new(0); + + trie_insert(a, "foo", 42); + trie_insert(a, "bar", 41); + trie_insert(a, "food", 40); + + CuAssertIntEquals(tc, 3, trie_search(a, "foo")); + CuAssertIntEquals(tc, 42, trie_search_match_type(a, "foo")); + + CuAssertIntEquals(tc, 6, trie_search(a, "bar")); + CuAssertIntEquals(tc, 41, trie_search_match_type(a, "bar")); + + CuAssertIntEquals(tc, 7, trie_search(a, "food")); + CuAssertIntEquals(tc, 40, trie_search_match_type(a, "food")); + + CuAssertIntEquals(tc, -1, trie_search(a, "foot")); + CuAssertIntEquals(tc, (unsigned short) -1, trie_search_match_type(a, "foot")); + + trie_free(a); +} +#endif + + +void ac_trie_node_prepare(trie * a, size_t s, char * buffer, unsigned short depth, size_t last_match_state) { + + buffer[depth] = '\0'; + buffer[depth + 1] = '\0'; + + // Current node + trie_node * n = &(a->node[s]); + + char * suffix = buffer; + + // No suffix for first level matches + unsigned short last_match_depth = a->node[last_match_state].len; + + if (depth == 1) { + last_match_depth = 1; + } + + // Longest match seen so far?? + suffix += 1; + + // Find valid suffixes for failure path + while ((suffix[0] != '\0') && (n->ac_fail == 0)) { + n->ac_fail = trie_search(a, suffix); + + if (n->ac_fail == -1) + n->ac_fail = 0; + + if (n->ac_fail == s) { + // Something went wrong + fprintf(stderr, "Recursive trie fallback detected at state %lu('%c') - suffix:'%s'!\n", s, n->c, suffix); + n->ac_fail = 0; + } + + suffix++; + } + + + // Prepare children + for (int i = 0; i < 256; ++i) + { + if ((n->child[i] != 0) && + (n->child[i] != s)) { + buffer[depth] = i; + + ac_trie_node_prepare(a, n->child[i], buffer, depth + 1, last_match_state); + } + } +} + +/// Prepare trie for Aho-Corasick search algorithm by mapping failure connections +void ac_trie_prepare(trie * a) { + // Clear old pointers + for (size_t i = 0; i < a->size; ++i) + { + a->node[i].ac_fail = 0; + } + + // Create a buffer to use + char buffer[a->capacity]; + + ac_trie_node_prepare(a, 0, buffer, 0, 0); +} + + + +#ifdef TEST +void Test_trie_prepare(CuTest* tc) { + trie * a = trie_new(0); + + trie_insert(a, "a", 1); + trie_insert(a, "aa", 2); + trie_insert(a, "aaa", 3); + trie_insert(a, "aaaa", 4); + + ac_trie_prepare(a); + + trie_free(a); +} +#endif + + +match * match_new(size_t start, size_t len, unsigned short match_type) { + match * m = malloc(sizeof(match)); + + if (m) { + m->start = start; + m->len = len; + m->match_type = match_type; + m->next = NULL; + } + + return m; +} + + +void match_free(match * m) { + if (m) { + if (m->next) { + match_free(m->next); + } + + free(m); + } +} + + +match * match_add(match * last, size_t start, size_t len, unsigned short match_type) { + if (last) { + last->next = match_new(start, len, match_type); + last->next->prev = last; + return last->next; + } else { + return match_new(start, len, match_type); + } + + return NULL; +} + + +match * ac_trie_search(trie * a, const char * source, size_t len) { + + // Store results in a linked list +// match * result = match_new(0, 0, 0); + match * result = NULL; + match * m = result; + + // Keep track of our state + size_t state = 0; + size_t temp_state; + + // Character being compared + char test_value; + size_t counter = 0; + + while ((counter < len) && (source[counter] != '\0')) { + // Read next character + test_value = source[counter++]; + + // Check for path that allows us to match next character + while (state != 0 && a->node[state].child[test_value] == 0) { + state = a->node[state].ac_fail; + } + + // Advance state for the next character + state = a->node[state].child[test_value]; + + // Check for partial matches + temp_state = state; + + while (temp_state != 0) { + if (a->node[temp_state].match_type) { + // This is a match + if (!m) { + result = match_new(0, 0, 0); + m = result; + } + m = match_add(m, counter - a->node[temp_state].len, + a->node[temp_state].len, a->node[temp_state].match_type); + } + + // Iterate to find shorter matches + temp_state = a->node[temp_state].ac_fail; + } + } + + return result; +} + + +void match_excise(match * m) { + if (m->prev) { + m->prev->next = m->next; + } + + if (m->next) { + m->next->prev = m->prev; + } + + free(m); +} + +int match_count(match * m) { + int result = 0; + m = m->next; // Skip header + + while (m) { + result++; + m = m->next; + } + + return result; +} + + +void match_describe(match * m, const char * source) { + fprintf(stderr, "'%.*s'(%d) at %lu:%lu\n", (int)m->len, &source[m->start], + m->match_type, m->start, m->start + m->len); +} + + +void match_set_describe(match * m, const char * source) { + m = m->next; // Skip header + while (m) { + match_describe(m, source); + m = m->next; + } +} + + +void match_set_filter_leftmost_longest(match * header) { + // Filter results to include only leftmost/longest results + match * m = header->next; // Skip header + match * n; + + while (m) { + if (m->next) { + if (m->start == m->next->start) { + // The next match is longer than this one + n = m; + m = m->next; + match_excise(n); + continue; + } + + while (m->next && + m->next->start > m->start && + m->next->start < m->start + m->len) { + // This match is "lefter" than next + match_excise(m->next); + } + + while (m->next && + m->next->start < m->start) { + // Next match is "lefter" than us + n = m; + m = m->prev; + match_excise(n); + } + } + + while (m->prev->len && + m->prev->start >= m->start) { + // We are "lefter" than previous + n = m->prev; + match_excise(n); + } + + m = m->next; + } +} + + +match * ac_trie_leftmost_longest_search(trie * a, const char * source, size_t len) { + match * result = ac_trie_search(a, source, len); + + if (result) + match_set_filter_leftmost_longest(result); + + return result; +} + + +#ifdef TEST +void Test_aho_trie_search(CuTest* tc) { + trie * a = trie_new(0); + + trie_insert(a, "foo", 42); + trie_insert(a, "bar", 41); + trie_insert(a, "food", 40); + + ac_trie_prepare(a); + + match * m = ac_trie_search(a, "this is a bar that serves food.", 31); + + match_free(m); + trie_free(a); + + + a = trie_new(0); + + trie_insert(a, "A", 1); + trie_insert(a, "AB", 2); + trie_insert(a, "ABC", 3); + trie_insert(a, "BC", 4); + trie_insert(a, "BCD", 5); + trie_insert(a, "E", 6); + trie_insert(a, "EFGHIJ", 7); + trie_insert(a, "F", 8); + trie_insert(a, "ZABCABCZ", 9); + trie_insert(a, "ZAB", 10); + + ac_trie_prepare(a); + + m = ac_trie_search(a, "ABCDEFGGGAZABCABCDZABCABCZ", 26); + fprintf(stderr, "Finish with %d matches\n", match_count(m)); + match_set_describe(m, "ABCDEFGGGAZABCABCDZABCABCZ"); + match_free(m); + + m = ac_trie_leftmost_longest_search(a, "ABCDEFGGGAZABCABCDZABCABCZ", 26); + fprintf(stderr, "Finish with %d matches\n", match_count(m)); + match_set_describe(m, "ABCDEFGGGAZABCABCDZABCABCZ"); + match_free(m); + + // trie_to_graphviz(a); + + trie_free(a); +} +#endif + + +void trie_node_to_graphviz(trie * a, size_t s) { + trie_node * n = &a->node[s]; + + if (n->match_type) + fprintf(stderr, "\"%lu\" [shape=doublecircle]\n", s); + + for (int i = 0; i < 256; ++i) + { + if (n->child[i]) { + switch (i) { + default: + fprintf(stderr, "\"%lu\" -> \"%lu\" [label=\"%c\"]\n", s, n->child[i], (char)i); + } + } + } + + if (n->ac_fail) + fprintf(stderr, "\"%lu\" -> \"%lu\" [label=\"fail\"]\n", s, n->ac_fail); +} + + +void trie_to_graphviz(trie * a) { + fprintf(stderr, "digraph dfa {\n"); + for (int i = 0; i < a->size; ++i) + { + trie_node_to_graphviz(a, i); + } + fprintf(stderr, "}\n"); +} + diff --git a/Sources/libMultiMarkdown/aho-corasick.h b/Sources/libMultiMarkdown/aho-corasick.h new file mode 100644 index 0000000..73414f2 --- /dev/null +++ b/Sources/libMultiMarkdown/aho-corasick.h @@ -0,0 +1,117 @@ +/** + + C-Template -- Boilerplate c project with cmake support, CuTest unit testing, and more. + + @file aho-corasick.h + + @brief + + + @author Fletcher T. Penney + @bug + +**/ + +/* + + Copyright © 2015-2017 Fletcher T. Penney. + + + The `c-template` project is released under the MIT License. + + GLibFacade.c and GLibFacade.h are from the MultiMarkdown v4 project: + + https://github.com/fletcher/MultiMarkdown-4/ + + MMD 4 is released under both the MIT License and GPL. + + + CuTest is released under the zlib/libpng license. See CuTest.c for the text + of the license. + + + ## The MIT License ## + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + +*/ + + +#ifndef AC_TEMPLATE_H +#define AC_TEMPLATE_H + +#include + +struct trie_node { + char c; // Character for this node + unsigned short match_type; // 0 = no match, otherwise what have we matched? + unsigned short len; // Length of string matched + size_t child[256]; // Where should we go next? + size_t ac_fail; // Where should we go if we fail? +}; + +typedef struct trie_node trie_node; + + +struct trie { + size_t size; // How many nodes are in use? + size_t capacity; // How many nodes can we hold + + trie_node * node; // Pointer to stack of nodes +}; + +typedef struct trie trie; + + +struct match { + size_t start; // Starting offset for this match + size_t len; // Length for this match + unsigned short match_type; // Match type + struct match * next; // Pointer to next match in list + struct match * prev; // Pointer to previous match in list +}; + +typedef struct match match; + + +trie * trie_new(size_t startingSize); + +bool trie_insert(trie * a, const char * key, unsigned short match_type); + +void ac_trie_prepare(trie * a); + +match * ac_trie_search(trie * a, const char * source, size_t len); + +match * ac_trie_leftmost_longest_search(trie * a, const char * source, size_t len); + +void trie_free(trie * a); + +void match_set_describe(match * m, const char * source); + +void match_set_filter_leftmost_longest(match * header); + +void match_free(match * m); + + +#ifdef TEST +#include "CuTest.h" +#endif + + +#endif diff --git a/Sources/libMultiMarkdown/html.c b/Sources/libMultiMarkdown/html.c index 327ab11..a1f836e 100644 --- a/Sources/libMultiMarkdown/html.c +++ b/Sources/libMultiMarkdown/html.c @@ -1094,6 +1094,7 @@ void mmd_export_token_html(DString * out, const char * source, token * t, scratc mmd_export_token_tree_html(out, source, t->child, scratch); break; case PAIR_BRACKET_ABBREVIATION: + // Which might also be an "auto-tagged" abbreviation if (scratch->extensions & EXT_NOTES) { // Note-based syntax enabled @@ -1113,8 +1114,10 @@ void mmd_export_token_html(DString * out, const char * source, token * t, scratc // Get instance of the note used temp_note = stack_peek_index(scratch->used_abbreviations, temp_short - 1); - t->child->type = TEXT_EMPTY; - t->child->mate->type = TEXT_EMPTY; + if (t->child) { + t->child->type = TEXT_EMPTY; + t->child->mate->type = TEXT_EMPTY; + } if (temp_short3 == scratch->inline_abbreviations_to_free->size) { // This is a reference definition @@ -1124,7 +1127,10 @@ void mmd_export_token_html(DString * out, const char * source, token * t, scratc print_const("clean_text, false); print_const("\">"); - mmd_export_token_tree_html(out, source, t->child, scratch); + if (t->child) + mmd_export_token_tree_html(out, source, t->child, scratch); + else + print_token(t); print_const(""); } else { // This is the first time this note was used @@ -1132,7 +1138,10 @@ void mmd_export_token_html(DString * out, const char * source, token * t, scratc print_const(" (clean_text, false); print_const("\">"); - mmd_export_token_tree_html(out, source, t->child, scratch); + if (t->child) + mmd_export_token_tree_html(out, source, t->child, scratch); + else + print_token(t); print_const(")"); } } else { @@ -1274,6 +1283,7 @@ void mmd_export_token_html(DString * out, const char * source, token * t, scratc } break; case PAIR_BRACKET_GLOSSARY: + // Which might also be an "auto-tagged" glossary if (scratch->extensions & EXT_NOTES) { // Note-based syntax enabled diff --git a/Sources/libMultiMarkdown/include/token.h b/Sources/libMultiMarkdown/include/token.h index 12de1bf..5f720b8 100644 --- a/Sources/libMultiMarkdown/include/token.h +++ b/Sources/libMultiMarkdown/include/token.h @@ -222,5 +222,7 @@ void token_skip_until_type_multiple(token ** t, int n, ...); void token_split_on_char(token * t, const char * source, const char c); +void token_split(token * t, size_t start, size_t len, unsigned short new_type); + #endif diff --git a/Sources/libMultiMarkdown/latex.c b/Sources/libMultiMarkdown/latex.c index 0a6d7e5..f59fdab 100644 --- a/Sources/libMultiMarkdown/latex.c +++ b/Sources/libMultiMarkdown/latex.c @@ -1013,6 +1013,7 @@ void mmd_export_token_latex(DString * out, const char * source, token * t, scrat mmd_export_token_tree_latex(out, source, t->child, scratch); break; case PAIR_BRACKET_ABBREVIATION: + // Which might also be an "auto-tagged" abbreviation if (scratch->extensions & EXT_NOTES) { // Note-based syntax enabled @@ -1206,6 +1207,7 @@ void mmd_export_token_latex(DString * out, const char * source, token * t, scrat } break; case PAIR_BRACKET_GLOSSARY: + // Which might also be an "auto-tagged" glossary if (scratch->extensions & EXT_NOTES) { // Note-based syntax enabled diff --git a/Sources/libMultiMarkdown/lexer.c b/Sources/libMultiMarkdown/lexer.c index b52fed1..ed85c4c 100644 --- a/Sources/libMultiMarkdown/lexer.c +++ b/Sources/libMultiMarkdown/lexer.c @@ -1,4 +1,4 @@ -/* Generated by re2c 0.14.3 on Sat Mar 4 20:43:39 2017 */ +/* Generated by re2c 0.14.3 on Thu Mar 9 19:02:17 2017 */ /** MultiMarkdown 6 -- Lightweight markup processor to produce HTML, LaTeX, and more. @@ -348,7 +348,6 @@ yy50: ++YYCURSOR; { return TEXT_PERCENT; } yy52: - YYCTXMARKER = YYCURSOR + 1; yyaccept = 5; yych = *(YYMARKER = ++YYCURSOR); switch (yych) { @@ -413,6 +412,7 @@ yy68: default: goto yy61; } yy69: + YYCTXMARKER = YYCURSOR + 1; yych = *++YYCURSOR; switch (yych) { case '\t': @@ -437,7 +437,6 @@ yy70: default: goto yy264; } yy71: - YYCTXMARKER = YYCURSOR + 1; ++YYCURSOR; yych = *YYCURSOR; switch (yych) { diff --git a/Sources/libMultiMarkdown/lexer.re b/Sources/libMultiMarkdown/lexer.re index 9729c19..7417f6e 100644 --- a/Sources/libMultiMarkdown/lexer.re +++ b/Sources/libMultiMarkdown/lexer.re @@ -218,7 +218,7 @@ int scan(Scanner * s, const char * stop) { '}' { return TEXT_BRACE_RIGHT; } '\\' { return TEXT_BACKSLASH; } - [0-9]+ / ('.' (SP|NL)) { return TEXT_NUMBER_POSS_LIST; } + [0-9]+ '.' / (SP|NL) { return TEXT_NUMBER_POSS_LIST; } '.' / (SP|NL) { return TEXT_PERIOD; } TEXT_LINEBREAK { return TEXT_LINEBREAK; } diff --git a/Sources/libMultiMarkdown/mmd.c b/Sources/libMultiMarkdown/mmd.c index d53e759..b8e498c 100644 --- a/Sources/libMultiMarkdown/mmd.c +++ b/Sources/libMultiMarkdown/mmd.c @@ -418,39 +418,28 @@ void mmd_assign_line_type(mmd_engine * e, token * line) { break; case TEXT_NUMBER_POSS_LIST: switch(source[line->child->next->start]) { - case '.': - switch(source[line->child->next->start + 1]) { - case ' ': - case '\t': - line->type = LINE_LIST_ENUMERATED; - line->child->type = MARKER_LIST_ENUMERATOR; - - // Strip period - line->child->next->type = TEXT_EMPTY; - - switch (line->child->next->next->type) { - case TEXT_PLAIN: - // Strip whitespace between bullet and text - while (char_is_whitespace(source[line->child->next->next->start])) { - line->child->next->next->start++; - line->child->next->next->len--; - } - break; - case INDENT_SPACE: - case INDENT_TAB: - case NON_INDENT_SPACE: - t = line->child->next; - while(t->next && ((t->next->type == INDENT_SPACE) || - (t->next->type == INDENT_TAB) || - (t->next->type == NON_INDENT_SPACE))) { - tokens_prune(t->next, t->next); - } - break; + case ' ': + case '\t': + line->type = LINE_LIST_ENUMERATED; + line->child->type = MARKER_LIST_ENUMERATOR; + + switch (line->child->next->type) { + case TEXT_PLAIN: + // Strip whitespace between bullet and text + while (char_is_whitespace(source[line->child->next->start])) { + line->child->next->start++; + line->child->next->len--; } break; - default: - line->type = LINE_PLAIN; - line->child->type = TEXT_PLAIN; + case INDENT_SPACE: + case INDENT_TAB: + case NON_INDENT_SPACE: + t = line->child; + while(t->next && ((t->next->type == INDENT_SPACE) || + (t->next->type == INDENT_TAB) || + (t->next->type == NON_INDENT_SPACE))) { + tokens_prune(t->next, t->next); + } break; } break; diff --git a/Sources/libMultiMarkdown/odf.c b/Sources/libMultiMarkdown/odf.c index 9d2505f..33146b8 100644 --- a/Sources/libMultiMarkdown/odf.c +++ b/Sources/libMultiMarkdown/odf.c @@ -1151,6 +1151,7 @@ void mmd_export_token_odf(DString * out, const char * source, token * t, scratch } break; case PAIR_BRACKET_ABBREVIATION: + // Which might also be an "auto-tagged" abbreviation if (scratch->extensions & EXT_NOTES) { // Note-based syntax enabled @@ -1170,18 +1171,22 @@ void mmd_export_token_odf(DString * out, const char * source, token * t, scratch // Get instance of the note used temp_note = stack_peek_index(scratch->used_abbreviations, temp_short - 1); - t->child->type = TEXT_EMPTY; - t->child->mate->type = TEXT_EMPTY; + if (t->child) { + t->child->type = TEXT_EMPTY; + t->child->mate->type = TEXT_EMPTY; + } if (temp_short2 == scratch->used_abbreviations->size) { // This is a re-use of a previously used note if (temp_short3 == scratch->inline_abbreviations_to_free->size) { // This is a reference definition - mmd_export_token_tree_odf(out, source, t->child, scratch); + mmd_print_string_odf(out, temp_note->clean_text); +// mmd_export_token_tree_odf(out, source, t->child, scratch); } else { // This is an inline definition - mmd_export_token_tree_odf(out, source, t->child, scratch); + mmd_print_string_odf(out, temp_note->clean_text); +// mmd_export_token_tree_odf(out, source, t->child, scratch); } } else { // This is the first time this note was used @@ -1210,6 +1215,7 @@ void mmd_export_token_odf(DString * out, const char * source, token * t, scratch } break; case PAIR_BRACKET_GLOSSARY: + // Which might also be an "auto-tagged" glossary if (scratch->extensions & EXT_NOTES) { // Note-based syntax enabled diff --git a/Sources/libMultiMarkdown/token.c b/Sources/libMultiMarkdown/token.c index c853141..8a30ffe 100644 --- a/Sources/libMultiMarkdown/token.c +++ b/Sources/libMultiMarkdown/token.c @@ -631,3 +631,28 @@ void token_split_on_char(token * t, const char * source, const char c) { } } + +// Split a token and create +void token_split(token * t, size_t start, size_t len, unsigned short new_type) { + if (!t) + return; + + token * u = token_new(new_type, start, len); + size_t stop = start + len; + + if (t->start + t->len > stop) { + token * v = token_new(t->type, stop, t->start + t->len - stop); + + u->next = v; + v->prev = u; + v->next = t->next; + } else { + u->next = t->next; + } + + t->next = u; + u->prev = t; + + t->len = start - t->start; +} + diff --git a/Sources/libMultiMarkdown/writer.c b/Sources/libMultiMarkdown/writer.c index ae76e4d..9d1cbee 100644 --- a/Sources/libMultiMarkdown/writer.c +++ b/Sources/libMultiMarkdown/writer.c @@ -59,6 +59,7 @@ #include "libMultiMarkdown.h" +#include "aho-corasick.h" #include "beamer.h" #include "char.h" #include "d_string.h" @@ -1473,108 +1474,40 @@ void process_metadata_stack(mmd_engine * e, scratch_pad * scratch) { scratch->base_header_level = header_level; } -/// kmp from http://stackoverflow.com/questions/8584644/strstr-for-a-string-that-is-not-null-terminated -/// Search for a string within certain bounds (so we don't go past the end of the token) -int *kmp_borders(char * needle, size_t nlen){ - if (!needle) return NULL; - int i, j, *borders = malloc((nlen+1)*sizeof(*borders)); - if (!borders) return NULL; - i = 0; - j = -1; - borders[i] = j; - while((size_t)i < nlen){ - while(j >= 0 && needle[i] != needle[j]){ - j = borders[j]; - } - ++i; - ++j; - borders[i] = j; - } - return borders; -} - -const char * kmp_search(const char * haystack, size_t haylen, char * needle, size_t nlen, int * borders){ - size_t max_index = haylen-nlen, i = 0, j = 0; - while(i <= max_index){ - while(j < nlen && *haystack && needle[j] == *haystack){ - ++j; - ++haystack; - } - if (j == nlen){ - return haystack-nlen; - } - if (!(*haystack)){ - return NULL; - } - if (j == 0){ - ++haystack; - ++i; - } else { - do{ - i += j - (size_t)borders[j]; - j = borders[j]; - }while(j > 0 && needle[j] != *haystack); - } - } - return NULL; -} - -const char * sstrnstr(const char * haystack, char * needle, size_t haylen){ - if (!haystack || !needle){ - return NULL; - } - size_t nlen = strlen(needle); - if (haylen < nlen){ - return NULL; - } - int *borders = kmp_borders(needle, nlen); - if (!borders){ - return NULL; - } - const char *match = kmp_search(haystack, haylen, needle, nlen, borders); - free(borders); - return match; -} - - -/// Search a text node for abbreviation matches -/// TODO: This is an inefficient algorithm, searching -/// each node once for *each* abbreviation. A more -/// advanced algorithm would search for all abbreviations -/// simultaneously but require more setup (e.g. Aho-Corasick) -void abbr_search_text(mmd_engine * e, token * t) { - const char * str = &e->dstr->str[t->start]; - - const char * match; - abbr * a; - for (int i = 0; i < e->abbreviation_stack->size; ++i) - { - a = stack_peek_index(e->abbreviation_stack, i); +void automatic_search_text(mmd_engine * e, token * t, trie * ac) { + match * m = ac_trie_leftmost_longest_search(ac, &e->dstr->str[t->start], t->len); + + match * walker; + + token * tok = t; + + if (m) { + walker = m->next; - match = sstrnstr(str, a->abbr, t->len); + while (walker) { + token_split(tok, walker->start + t->start, walker->len, walker->match_type); + + // Advance token to section after the split (if present) + tok = tok->next->next; - if (match) { - fprintf(stderr, "Found match '%s' -> '%s' at %lu\n", a->abbr, a->expansion, (size_t) (match - e->dstr->str)); + // Advance to next match (if present) + walker = walker->next; } } + + match_free(m); } /// Determine which nodes to descend into to search for abbreviations -void abbr_search(mmd_engine * e, token * t) { +void automatic_search(mmd_engine * e, token * t, trie * ac) { while (t) { switch (t->type) { case TEXT_PLAIN: - abbr_search_text(e, t); + automatic_search_text(e, t, ac); break; case DOC_START_TOKEN: - case BLOCK_LIST_BULLETED: - case BLOCK_LIST_BULLETED_LOOSE: - case BLOCK_LIST_ENUMERATED: - case BLOCK_LIST_ENUMERATED_LOOSE: - abbr_search(e, t->child); - break; case BLOCK_PARA: case BLOCK_H1: case BLOCK_H2: @@ -1582,6 +1515,10 @@ void abbr_search(mmd_engine * e, token * t) { case BLOCK_H4: case BLOCK_H5: case BLOCK_H6: + case BLOCK_LIST_BULLETED: + case BLOCK_LIST_BULLETED_LOOSE: + case BLOCK_LIST_ENUMERATED: + case BLOCK_LIST_ENUMERATED_LOOSE: case BLOCK_LIST_ITEM_TIGHT: case BLOCK_LIST_ITEM: case BLOCK_SETEXT_1: @@ -1589,8 +1526,13 @@ void abbr_search(mmd_engine * e, token * t) { case BLOCK_TABLE: case BLOCK_TABLE_HEADER: case BLOCK_TABLE_SECTION: - abbr_search_text(e, t); + case PAIR_QUOTE_DOUBLE: + case PAIR_QUOTE_SINGLE: + case PAIR_STAR: + case PAIR_UL: + automatic_search(e, t->child, ac); break; +// case PAIR_PAREN: default: break; } @@ -1600,16 +1542,34 @@ void abbr_search(mmd_engine * e, token * t) { } -void process_abbreviation_stack(mmd_engine * e, scratch_pad * scratch) { - abbr * a; +void identify_global_search_terms(mmd_engine * e, scratch_pad * scratch) { + // Only search if we have a target + int count = e->abbreviation_stack->size + e->glossary_stack->size; - // Describe which abbreviations we are searching for + if (count == 0) { + return; + } + + trie * ac = trie_new(0); + footnote * f; + + // Add abbreviations to search trie for (int i = 0; i < e->abbreviation_stack->size; ++i) { - a = stack_peek_index(e->abbreviation_stack, i); + f = stack_peek_index(e->abbreviation_stack, i); + trie_insert(ac, f->label_text, PAIR_BRACKET_ABBREVIATION); + } + + // Add glossary to search trie + for (int i = 0; i < e->glossary_stack->size; ++i) + { + f = stack_peek_index(e->glossary_stack, i); + trie_insert(ac, f->clean_text, PAIR_BRACKET_GLOSSARY); } - abbr_search(e, e->root); + ac_trie_prepare(ac); + automatic_search(e, e->root, ac); + trie_free(ac); } @@ -1627,8 +1587,9 @@ void mmd_export_token_tree(DString * out, mmd_engine * e, short format) { // Process metadata process_metadata_stack(e, scratch); - // Process abbreviations - // process_abbreviation_stack(e, scratch); + // Process abbreviations, glossary, etc. + if (!(e->extensions & EXT_COMPATIBILITY)) + identify_global_search_terms(e, scratch); switch (scratch->output_format) { @@ -2017,15 +1978,27 @@ void citation_from_bracket(const char * source, scratch_pad * scratch, token * t void glossary_from_bracket(const char * source, scratch_pad * scratch, token * t, short * num) { // Get text inside bracket - char * text = text_inside_pair(source, t); + char * text; + + if (t->child) { + text = text_inside_pair(source, t); + } else { + text = malloc(t->len + 2); + text[0] = '?'; + memcpy(&text[1], &source[t->start], t->len); + text[t->len + 1] = '\0'; + } + short glossary_id = extract_glossary_from_stack(scratch, text); free(text); if (glossary_id == -1) { // No match, this is an inline glossary -- create a new glossary entry - t->child->type = TEXT_EMPTY; - t->child->mate->type = TEXT_EMPTY; + if (t->child) { + t->child->type = TEXT_EMPTY; + t->child->mate->type = TEXT_EMPTY; + } // Create glossary token * label = t->child; @@ -2056,16 +2029,28 @@ void glossary_from_bracket(const char * source, scratch_pad * scratch, token * t void abbreviation_from_bracket(const char * source, scratch_pad * scratch, token * t, short * num) { // Get text inside bracket - char * text = text_inside_pair(source, t); + char * text; + + if (t->child) { + text = text_inside_pair(source, t); + } else { + text = malloc(t->len + 2); + text[0] = '>'; + memcpy(&text[1], &source[t->start], t->len); + text[t->len + 1] = '\0'; + } + short abbr_id = extract_abbreviation_from_stack(scratch, &text[1]); free(text); if (abbr_id == -1) { // No match, this is an inline glossary -- create a new glossary entry - t->child->type = TEXT_EMPTY; - t->child->mate->type = TEXT_EMPTY; - + if (t->child) { + t->child->type = TEXT_EMPTY; + t->child->mate->type = TEXT_EMPTY; + } + // Create glossary token * label = t->child; while (label && label->type != PAIR_PAREN) diff --git a/tests/MMD6Tests/Abbreviations.fodt b/tests/MMD6Tests/Abbreviations.fodt index f39d962..eed1c05 100644 --- a/tests/MMD6Tests/Abbreviations.fodt +++ b/tests/MMD6Tests/Abbreviations.fodt @@ -280,7 +280,7 @@ BAR (bar) -foo bar +FOO BAR FOOBAR (foobar) @@ -289,6 +289,14 @@ 5 BAZ (BAT) (baz) + +FOO + +BAR + +FOOBAR + +FOO BAR baz diff --git a/tests/MMD6Tests/Abbreviations.html b/tests/MMD6Tests/Abbreviations.html index f5e2b4e..539a78b 100644 --- a/tests/MMD6Tests/Abbreviations.html +++ b/tests/MMD6Tests/Abbreviations.html @@ -20,6 +20,14 @@

BAZ (BAT) (baz)

+

foo

+ +

bar

+ +

foobar

+ +

foo bar baz

+ diff --git a/tests/MMD6Tests/Abbreviations.htmlc b/tests/MMD6Tests/Abbreviations.htmlc index ff3286b..dabcbaf 100644 --- a/tests/MMD6Tests/Abbreviations.htmlc +++ b/tests/MMD6Tests/Abbreviations.htmlc @@ -14,3 +14,11 @@ latex config: article

5

[>(baz) BAZ (BAT)]

+ +

foo

+ +

bar

+ +

foobar

+ +

foo bar baz

diff --git a/tests/MMD6Tests/Abbreviations.tex b/tests/MMD6Tests/Abbreviations.tex index 3e1c4b4..3042175 100644 --- a/tests/MMD6Tests/Abbreviations.tex +++ b/tests/MMD6Tests/Abbreviations.tex @@ -24,5 +24,13 @@ \newacronym{baz}{baz}{BAZ (BAT)}\gls{baz} +\gls{foo} + +\gls{bar} + +\gls{foobar} + +\gls{foo bar} baz + \input{mmd6-article-footer} \end{document} diff --git a/tests/MMD6Tests/Abbreviations.text b/tests/MMD6Tests/Abbreviations.text index 9bd19fa..cac55dc 100644 --- a/tests/MMD6Tests/Abbreviations.text +++ b/tests/MMD6Tests/Abbreviations.text @@ -15,6 +15,14 @@ latex config: article [>(baz) BAZ (BAT)] +foo + +bar + +foobar + +foo bar baz + [>foo]: FOO [>bar]: BAR diff --git a/tests/MMD6Tests/Glossaries.fodt b/tests/MMD6Tests/Glossaries.fodt index 92a66ff..4e50cd6 100644 --- a/tests/MMD6Tests/Glossaries.fodt +++ b/tests/MMD6Tests/Glossaries.fodt @@ -289,6 +289,10 @@ [?bar] 5 + +foo + +Foo Bar diff --git a/tests/MMD6Tests/Glossaries.html b/tests/MMD6Tests/Glossaries.html index 6acdac5..49eb2e7 100644 --- a/tests/MMD6Tests/Glossaries.html +++ b/tests/MMD6Tests/Glossaries.html @@ -18,6 +18,10 @@

5

+

foo

+ +

Foo Bar

+

    diff --git a/tests/MMD6Tests/Glossaries.htmlc b/tests/MMD6Tests/Glossaries.htmlc index 1d44024..1355517 100644 --- a/tests/MMD6Tests/Glossaries.htmlc +++ b/tests/MMD6Tests/Glossaries.htmlc @@ -13,5 +13,9 @@ latex config: article

    5

    +

    foo

    + +

    Foo Bar

    +
    With second para.
     
    diff --git a/tests/MMD6Tests/Glossaries.tex b/tests/MMD6Tests/Glossaries.tex index a5a6a09..2001950 100644 --- a/tests/MMD6Tests/Glossaries.tex +++ b/tests/MMD6Tests/Glossaries.tex @@ -22,5 +22,9 @@ With second para.} 5 +\gls{foo} + +\gls{foobar} + \input{mmd6-article-footer} \end{document} diff --git a/tests/MMD6Tests/Glossaries.text b/tests/MMD6Tests/Glossaries.text index 3a3f860..5d529ea 100644 --- a/tests/MMD6Tests/Glossaries.text +++ b/tests/MMD6Tests/Glossaries.text @@ -13,6 +13,11 @@ latex config: article 5 +foo + +Foo Bar + + [?foo]: Reference [?Foo Bar]: Reference -- 2.40.0