]> granicus.if.org Git - multimarkdown/commitdiff
ADDED: Add functionality to automatically identify abbreviations and glossary terms...
authorFletcher T. Penney <fletcher@fletcherpenney.net>
Fri, 10 Mar 2017 00:37:09 +0000 (19:37 -0500)
committerFletcher T. Penney <fletcher@fletcherpenney.net>
Fri, 10 Mar 2017 00:37:09 +0000 (19:37 -0500)
22 files changed:
CMakeLists.txt
Sources/libMultiMarkdown/aho-corasick.c [new file with mode: 0644]
Sources/libMultiMarkdown/aho-corasick.h [new file with mode: 0644]
Sources/libMultiMarkdown/html.c
Sources/libMultiMarkdown/include/token.h
Sources/libMultiMarkdown/latex.c
Sources/libMultiMarkdown/lexer.c
Sources/libMultiMarkdown/lexer.re
Sources/libMultiMarkdown/mmd.c
Sources/libMultiMarkdown/odf.c
Sources/libMultiMarkdown/token.c
Sources/libMultiMarkdown/writer.c
tests/MMD6Tests/Abbreviations.fodt
tests/MMD6Tests/Abbreviations.html
tests/MMD6Tests/Abbreviations.htmlc
tests/MMD6Tests/Abbreviations.tex
tests/MMD6Tests/Abbreviations.text
tests/MMD6Tests/Glossaries.fodt
tests/MMD6Tests/Glossaries.html
tests/MMD6Tests/Glossaries.htmlc
tests/MMD6Tests/Glossaries.tex
tests/MMD6Tests/Glossaries.text

index 214ebded280c9dea2f764659a79c07fe001010a1..4029546e450e75b8ab8caa5ff53592202171b29c 100644 (file)
@@ -172,6 +172,7 @@ configure_file (
 
 # src_files are the primary files, and will be included in doxygen documentation
 set(src_files
+       Sources/libMultiMarkdown/aho-corasick.c
        Sources/libMultiMarkdown/beamer.c
        Sources/libMultiMarkdown/char.c
        Sources/libMultiMarkdown/d_string.c
@@ -194,6 +195,7 @@ set(src_files
 
 # Primary header files, also for doxygen documentation
 set(header_files
+       Sources/libMultiMarkdown/aho-corasick.h
        Sources/libMultiMarkdown/beamer.h
        Sources/libMultiMarkdown/char.h
        Sources/libMultiMarkdown/include/d_string.h
diff --git a/Sources/libMultiMarkdown/aho-corasick.c b/Sources/libMultiMarkdown/aho-corasick.c
new file mode 100644 (file)
index 0000000..7445aa4
--- /dev/null
@@ -0,0 +1,591 @@
+/**
+
+       C-Template -- Boilerplate c project with cmake support, CuTest unit testing, and more.
+
+       @file aho-corasick.c
+
+       @brief C implementation of the Aho-Corasick algorithm for searching text
+       for multiple strings simultaneously in a single pass without backtracking.
+
+       <https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm>
+
+
+       @author Fletcher T. Penney
+       @bug    
+
+**/
+
+/*
+
+       Copyright © 2015-2017 Fletcher T. Penney.
+
+
+       The `c-template` project is released under the MIT License.
+       
+       GLibFacade.c and GLibFacade.h are from the MultiMarkdown v4 project:
+       
+               https://github.com/fletcher/MultiMarkdown-4/
+       
+       MMD 4 is released under both the MIT License and GPL.
+       
+       
+       CuTest is released under the zlib/libpng license. See CuTest.c for the text
+       of the license.
+       
+       
+       ## The MIT License ##
+       
+       Permission is hereby granted, free of charge, to any person obtaining a copy
+       of this software and associated documentation files (the "Software"), to deal
+       in the Software without restriction, including without limitation the rights
+       to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+       copies of the Software, and to permit persons to whom the Software is
+       furnished to do so, subject to the following conditions:
+       
+       The above copyright notice and this permission notice shall be included in
+       all copies or substantial portions of the Software.
+       
+       THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+       IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+       FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+       AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+       LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+       OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+       THE SOFTWARE.
+
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+
+#include "aho-corasick.h"
+
+#define kTrieStartingSize 256
+
+void trie_to_graphviz(trie * a);
+
+
+/// Allocate a trie that contains only the root node (state 0).
+///
+/// @param startingSize  Initial node capacity; values <= 1 select kTrieStartingSize.
+/// @return New trie to be released with trie_free(), or NULL if allocation fails.
+trie * trie_new(size_t startingSize) {
+       trie * a = malloc(sizeof(*a));
+
+       if (!a) {
+               return NULL;
+       }
+
+       if (startingSize <= 1) {
+               startingSize = kTrieStartingSize;
+       }
+
+       // calloc zero-fills every node and fails cleanly if count * size would overflow
+       a->node = calloc(startingSize, sizeof(*a->node));
+
+       if (!a->node) {
+               free(a);
+               return NULL;
+       }
+
+       // All tries have a root node
+       a->size = 1;
+       a->capacity = startingSize;
+
+       return a;
+}
+
+
+/// Release a trie created by trie_new(), including its node array.
+/// Passing NULL is a no-op, mirroring free().
+void trie_free(trie * a) {
+       if (!a) {
+               return;
+       }
+
+       free(a->node);
+       free(a);
+}
+
+
+/// Insert the remainder of a key beneath state `s`, creating nodes as needed.
+///
+/// @param a           Trie to modify (must not be NULL).
+/// @param s           State from which `key` continues.
+/// @param key         NUL-terminated bytes still to be inserted.
+/// @param match_type  Value stored on the node that ends the key.
+/// @param depth       Number of key bytes already consumed to reach `s`.
+/// @return true on success; false if the node array could not be grown. Nodes
+///         created before such a failure stay in the trie but carry no match_type.
+bool trie_node_insert(trie * a, size_t s, const unsigned char * key, unsigned short match_type, unsigned short depth) {
+       for (; key[0] != '\0'; ++key, ++depth) {
+               size_t next = a->node[s].child[key[0]];
+
+               if (next == 0) {
+                       // Child index 0 means "no edge" (the root is never a child): create a node
+
+                       if (a->size == a->capacity) {
+                               // Refuse to double if the byte count would overflow size_t
+                               if (a->capacity > ((size_t) -1) / 2 / sizeof(trie_node)) {
+                                       return false;
+                               }
+
+                               // Use a temporary so the existing array survives a failed realloc
+                               trie_node * grown = realloc(a->node, a->capacity * 2 * sizeof(trie_node));
+
+                               if (!grown) {
+                                       return false;
+                               }
+
+                               a->node = grown;
+                               a->capacity *= 2;
+                       }
+
+                       // Current node points to the new node
+                       next = a->size++;
+                       a->node[s].child[key[0]] = next;
+
+                       // Memory obtained from realloc is uninitialised, so clear the node first
+                       memset(&a->node[next], 0, sizeof(trie_node));
+                       a->node[next].c = key[0];
+               }
+
+               s = next;
+       }
+
+       // End of key: mark the final state as a match
+       a->node[s].match_type = match_type;
+       a->node[s].len = depth;
+
+       return true;
+}
+
+
+/// Add `key` to the trie and tag its final node with `match_type`.
+/// Returns false when the trie or key is NULL, or when the key is empty.
+bool trie_insert(trie * a, const char * key, unsigned short match_type) {
+       if (!a || !key || key[0] == '\0') {
+               return false;
+       }
+
+       // Start at the root (state 0) having consumed no characters yet
+       return trie_node_insert(a, 0, (const unsigned char *)key, match_type, 0);
+}
+
+
+#ifdef TEST
+/// Inserting "foo" into an empty trie must create the chain root -> 'f' -> 'o' -> 'o'.
+void Test_trie_insert(CuTest* tc) {
+       trie * t = trie_new(0);
+
+       // A fresh trie uses the default capacity and holds only the root
+       CuAssertIntEquals(tc, kTrieStartingSize, t->capacity);
+       CuAssertIntEquals(tc, 1, t->size);
+
+       trie_insert(t, "foo", 42);
+
+       // State 0: root, no character, edge 'f' -> 1
+       CuAssertIntEquals(tc, 0, t->node[0].match_type);
+       CuAssertIntEquals(tc, 1, t->node[0].child['f']);
+       CuAssertIntEquals(tc, '\0', t->node[0].c);
+
+       // State 1: 'f', edge 'o' -> 2
+       CuAssertIntEquals(tc, 0, t->node[1].match_type);
+       CuAssertIntEquals(tc, 2, t->node[1].child['o']);
+       CuAssertIntEquals(tc, 'f', t->node[1].c);
+
+       // State 2: first 'o', edge 'o' -> 3
+       CuAssertIntEquals(tc, 0, t->node[2].match_type);
+       CuAssertIntEquals(tc, 3, t->node[2].child['o']);
+       CuAssertIntEquals(tc, 'o', t->node[2].c);
+
+       // State 3: second 'o', end of key carrying the match data
+       CuAssertIntEquals(tc, 42, t->node[3].match_type);
+       CuAssertIntEquals(tc, 3, t->node[3].len);
+       CuAssertIntEquals(tc, 'o', t->node[3].c);
+
+       trie_free(t);
+}
+#endif
+
+
+/// Follow `query` byte by byte starting from state `s`.
+///
+/// @return The state reached once the whole query is consumed (which is `s`
+///         itself for an empty query), or (size_t) -1 if some byte has no edge.
+size_t trie_node_search(trie * a, size_t s, const char * query) {
+       for (; query[0] != '\0'; ++query) {
+               // Index with the unsigned byte value: where plain char is signed, bytes
+               // >= 0x80 would otherwise produce a negative index into child[].
+               s = a->node[s].child[(unsigned char) query[0]];
+
+               if (s == 0) {
+                       // Failed to match
+                       return (size_t) -1;
+               }
+       }
+
+       // Found matching state
+       return s;
+}
+
+
+/// Look up `query` starting at the root of the trie.
+///
+/// @return The state reached after consuming the whole query, or (size_t) -1
+///         when the query is not a path in the trie or either argument is NULL.
+///         (Returning 0 for NULL arguments would be indistinguishable from
+///         "matched at the root state".)
+size_t trie_search(trie * a, const char * query) {
+       if (!a || !query) {
+               return (size_t) -1;
+       }
+
+       return trie_node_search(a, 0, query);
+}
+
+
+/// Return the match_type stored on the node reached by `query`.
+///
+/// @return The node's match_type (0 when the path exists but ends no key), or
+///         (unsigned short) -1 when the query is not in the trie or an argument
+///         is NULL.
+unsigned short trie_search_match_type(trie * a, const char * query) {
+       // Guard here as well: the node array must never be indexed through a NULL trie
+       if (!a || !query) {
+               return (unsigned short) -1;
+       }
+
+       size_t s = trie_search(a, query);
+
+       if (s == (size_t) -1) {
+               return (unsigned short) -1;
+       }
+
+       return a->node[s].match_type;
+}
+
+
+#ifdef TEST
+/// Exact lookups must report the terminal state and match type of each key,
+/// and a key that was never inserted must report -1 for both.
+void Test_trie_search(CuTest* tc) {
+       static const struct {
+               const char *    key;
+               int                             state;
+               int                             type;
+       } expected[] = {
+               { "foo",  3,  42 },
+               { "bar",  6,  41 },
+               { "food", 7,  40 },
+               { "foot", -1, (unsigned short) -1 }
+       };
+
+       trie * t = trie_new(0);
+
+       trie_insert(t, "foo", 42);
+       trie_insert(t, "bar", 41);
+       trie_insert(t, "food", 40);
+
+       for (size_t i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
+               CuAssertIntEquals(tc, expected[i].state, trie_search(t, expected[i].key));
+               CuAssertIntEquals(tc, expected[i].type, trie_search_match_type(t, expected[i].key));
+       }
+
+       trie_free(t);
+}
+#endif
+
+
+/// Compute the failure link for state `s`, then recurse into its children.
+///
+/// @param a                 Trie being prepared.
+/// @param s                 State to process.
+/// @param buffer            Scratch string holding the bytes on the path from the
+///                          root to `s`; needs room for the deepest path plus a NUL
+///                          (and at least 2 bytes).
+/// @param depth             Number of bytes on the path to `s`.
+/// @param last_match_state  Passed through unchanged to the recursive calls.
+void ac_trie_node_prepare(trie * a, size_t s, char * buffer, unsigned short depth, size_t last_match_state) {
+       // Terminate the path string at this node
+       buffer[depth] = '\0';
+
+       // Only the root inspects buffer[1] before any child has written it; deeper
+       // levels never read past buffer[depth], so writing buffer[depth + 1] there
+       // would be needless and could land one byte past the end of the buffer.
+       if (depth == 0) {
+               buffer[1] = '\0';
+       }
+
+       // Current node
+       trie_node * n = &(a->node[s]);
+
+       // Proper suffixes of the path start after its first byte
+       char * suffix = buffer + 1;
+
+       // Try suffixes from longest to shortest until one is itself a path in the trie
+       while ((suffix[0] != '\0') && (n->ac_fail == 0)) {
+               n->ac_fail = trie_search(a, suffix);
+
+               if (n->ac_fail == (size_t) -1) {
+                       n->ac_fail = 0;
+               }
+
+               if (n->ac_fail == s) {
+                       // Something went wrong
+                       fprintf(stderr, "Recursive trie fallback detected at state %zu('%c') - suffix:'%s'!\n", s, n->c, suffix);
+                       n->ac_fail = 0;
+               }
+
+               suffix++;
+       }
+
+       // Prepare children
+       for (int i = 0; i < 256; ++i) {
+               if ((n->child[i] != 0) &&
+                       (n->child[i] != s)) {
+                       buffer[depth] = (char) i;
+
+                       ac_trie_node_prepare(a, n->child[i], buffer, depth + 1, last_match_state);
+               }
+       }
+}
+
+/// Prepare trie for Aho-Corasick search algorithm by mapping failure connections.
+///
+/// Must be called after the last trie_insert() and before ac_trie_search().
+/// A NULL trie is ignored. If the scratch buffer cannot be allocated, all
+/// failure links are left at 0 (the root).
+void ac_trie_prepare(trie * a) {
+       if (!a) {
+               return;
+       }
+
+       // Clear old pointers
+       for (size_t i = 0; i < a->size; ++i) {
+               a->node[i].ac_fail = 0;
+       }
+
+       // The deepest node sits at depth <= capacity - 1 and ac_trie_node_prepare()
+       // may write buffer[depth + 1], so capacity + 1 bytes are required. The heap
+       // is used because the size grows with the trie.
+       char * buffer = calloc(a->capacity + 1, sizeof(char));
+
+       if (!buffer) {
+               return;
+       }
+
+       ac_trie_node_prepare(a, 0, buffer, 0, 0);
+
+       free(buffer);
+}
+
+
+
+#ifdef TEST
+/// Smoke test: preparing a trie of nested keys ("a" .. "aaaa") must complete.
+void Test_trie_prepare(CuTest* tc) {
+       static const char * keys[] = { "a", "aa", "aaa", "aaaa" };
+
+       trie * t = trie_new(0);
+
+       // Match types are 1..4, in insertion order
+       for (unsigned short i = 0; i < 4; ++i) {
+               trie_insert(t, keys[i], i + 1);
+       }
+
+       ac_trie_prepare(t);
+
+       trie_free(t);
+}
+#endif
+
+
+/// Allocate a stand-alone match node (not linked to any list).
+///
+/// @return The new node, or NULL if allocation fails.
+match * match_new(size_t start, size_t len, unsigned short match_type) {
+       match * m = malloc(sizeof(*m));
+
+       if (m) {
+               m->start = start;
+               m->len = len;
+               m->match_type = match_type;
+               m->next = NULL;
+               // match_excise() and the leftmost/longest filter read prev, so it
+               // must not be left uninitialised
+               m->prev = NULL;
+       }
+
+       return m;
+}
+
+
+/// Free `m` and every node after it in the list. NULL is accepted.
+void match_free(match * m) {
+       // Iterate instead of recursing so a long result list cannot exhaust the stack
+       while (m) {
+               match * next = m->next;
+
+               free(m);
+               m = next;
+       }
+}
+
+
+/// Append a new match after `last` and return the new tail of the list.
+///
+/// @param last  Current tail, or NULL to start a new list.
+/// @return The node that was added. If allocation fails the list is left
+///         untouched and `last` is returned (NULL when `last` was NULL).
+match * match_add(match * last, size_t start, size_t len, unsigned short match_type) {
+       match * added = match_new(start, len, match_type);
+
+       if (!added) {
+               return last;
+       }
+
+       added->prev = last;
+
+       if (last) {
+               last->next = added;
+       }
+
+       return added;
+}
+
+
+/// Scan `source` once and report every occurrence of every key in the trie.
+///
+/// The trie must have been prepared with ac_trie_prepare().
+///
+/// @param a       Prepared trie.
+/// @param source  Text to scan; scanning stops at `len` bytes or the first NUL.
+/// @param len     Maximum number of bytes to read from `source`.
+/// @return NULL when nothing matched (or an argument is NULL or allocation
+///         failed); otherwise a list whose first node is a dummy header
+///         (start, len and match_type all 0) followed by the matches in the
+///         order found. Release with match_free().
+match * ac_trie_search(trie * a, const char * source, size_t len) {
+       if (!a || !source) {
+               return NULL;
+       }
+
+       // Store results in a linked list
+       match * result = NULL;
+       match * tail = NULL;
+
+       // Keep track of our state
+       size_t state = 0;
+       size_t counter = 0;
+
+       while ((counter < len) && (source[counter] != '\0')) {
+               // Read next character as an unsigned byte: a signed char >= 0x80 would
+               // otherwise become a negative index into child[]
+               unsigned char test_value = (unsigned char) source[counter++];
+
+               // Check for path that allows us to match next character
+               while (state != 0 && a->node[state].child[test_value] == 0) {
+                       state = a->node[state].ac_fail;
+               }
+
+               // Advance state for the next character
+               state = a->node[state].child[test_value];
+
+               // Walk the failure chain to report this match and any shorter ones
+               // that end at the same position
+               for (size_t temp_state = state; temp_state != 0; temp_state = a->node[temp_state].ac_fail) {
+                       if (!a->node[temp_state].match_type) {
+                               continue;
+                       }
+
+                       if (!result) {
+                               // First match: create the dummy header node
+                               result = match_new(0, 0, 0);
+
+                               if (!result) {
+                                       return NULL;
+                               }
+
+                               result->prev = NULL;
+                               tail = result;
+                       }
+
+                       match * added = match_add(tail, counter - a->node[temp_state].len,
+                               a->node[temp_state].len, a->node[temp_state].match_type);
+
+                       // Keep the old tail if the node could not be added
+                       if (added) {
+                               tail = added;
+                       }
+               }
+       }
+
+       return result;
+}
+
+
+/// Unlink `m` from its list, joining its neighbours together, and free it.
+void match_excise(match * m) {
+       match * before = m->prev;
+       match * after = m->next;
+
+       if (before) {
+               before->next = after;
+       }
+
+       if (after) {
+               after->prev = before;
+       }
+
+       free(m);
+}
+
+/// Count the matches in a list returned by ac_trie_search().
+///
+/// @param m  Dummy header node of the list, or NULL (what ac_trie_search()
+///           returns when nothing matched).
+/// @return Number of nodes after the header; 0 for NULL.
+int match_count(match * m) {
+       int result = 0;
+
+       if (!m) {
+               return 0;
+       }
+
+       // Skip header
+       for (m = m->next; m; m = m->next) {
+               result++;
+       }
+
+       return result;
+}
+
+
+/// Print one match to stderr as 'text'(match_type) at start:end.
+///
+/// @param m       Match to describe.
+/// @param source  The text that was searched; m->start indexes into it.
+void match_describe(match * m, const char * source) {
+       // %zu is the conversion for size_t; %lu is only correct where size_t is unsigned long
+       fprintf(stderr, "'%.*s'(%d) at %zu:%zu\n", (int)m->len, &source[m->start],
+               m->match_type, m->start, m->start + m->len);
+}
+
+
+/// Print every match in a list returned by ac_trie_search() to stderr.
+///
+/// @param m       Dummy header node of the list; NULL (no matches) prints nothing.
+/// @param source  The text that was searched.
+void match_set_describe(match * m, const char * source) {
+       if (!m) {
+               return;
+       }
+
+       // Skip header
+       for (m = m->next; m; m = m->next) {
+               match_describe(m, source);
+       }
+}
+
+
+/// Filter a match list in place so it includes only leftmost/longest results.
+///
+/// @param header  Dummy header node of a list built by ac_trie_search(). The
+///                header itself is never removed; discarded matches are
+///                unlinked and freed. NULL is ignored.
+void match_set_filter_leftmost_longest(match * header) {
+       if (!header) {
+               return;
+       }
+
+       match * m = header->next;       // Skip header
+       match * n;
+
+       while (m) {
+               if (m->next) {
+                       if (m->start == m->next->start) {
+                               // Matches are appended in order of end position, so a later
+                               // match with the same start is longer than this one
+                               n = m;
+                               m = m->next;
+                               match_excise(n);
+                               continue;
+                       }
+
+                       while (m->next &&
+                               m->next->start > m->start &&
+                               m->next->start < m->start + m->len) {
+                               // This match is "lefter" than next
+                               match_excise(m->next);
+                       }
+
+                       while (m->next &&
+                               m->next->start < m->start) {
+                               // Next match is "lefter" than us
+                               n = m;
+                               m = m->prev;
+                               match_excise(n);
+                       }
+               }
+
+               // The loop above may have backed up onto the header, which has no
+               // predecessor; real matches always have one (at least the header,
+               // whose len of 0 ends this loop).
+               while (m != header &&
+                       m->prev->len &&
+                       m->prev->start >= m->start) {
+                       // We are "lefter" than previous
+                       n = m->prev;
+                       match_excise(n);
+               }
+
+               m = m->next;
+       }
+}
+
+
+/// Run ac_trie_search() and reduce the result to leftmost/longest matches.
+/// Returns whatever ac_trie_search() returned (NULL when nothing matched).
+match * ac_trie_leftmost_longest_search(trie * a, const char * source, size_t len) {
+       match * found = ac_trie_search(a, source, len);
+
+       if (!found) {
+               return found;
+       }
+
+       match_set_filter_leftmost_longest(found);
+
+       return found;
+}
+
+
+#ifdef TEST
+/// Exercise the full search and the leftmost/longest filter; results are
+/// printed to stderr for inspection rather than asserted.
+void Test_aho_trie_search(CuTest* tc) {
+       static const char * sentence = "this is a bar that serves food.";
+       static const char * letters = "ABCDEFGGGAZABCABCDZABCABCZ";
+       static const char * keys[] = {
+               "A", "AB", "ABC", "BC", "BCD", "E", "EFGHIJ", "F", "ZABCABCZ", "ZAB"
+       };
+
+       // First trie: a few ordinary words
+       trie * t = trie_new(0);
+
+       trie_insert(t, "foo", 42);
+       trie_insert(t, "bar", 41);
+       trie_insert(t, "food", 40);
+
+       ac_trie_prepare(t);
+
+       match * found = ac_trie_search(t, sentence, 31);
+
+       match_free(found);
+       trie_free(t);
+
+       // Second trie: overlapping and nested keys, match types 1..10 in order
+       t = trie_new(0);
+
+       for (unsigned short i = 0; i < 10; ++i) {
+               trie_insert(t, keys[i], i + 1);
+       }
+
+       ac_trie_prepare(t);
+
+       // Every match
+       found = ac_trie_search(t, letters, 26);
+       fprintf(stderr, "Finish with %d matches\n", match_count(found));
+       match_set_describe(found, letters);
+       match_free(found);
+
+       // Leftmost/longest matches only
+       found = ac_trie_leftmost_longest_search(t, letters, 26);
+       fprintf(stderr, "Finish with %d matches\n", match_count(found));
+       match_set_describe(found, letters);
+       match_free(found);
+
+       // trie_to_graphviz(t);
+
+       trie_free(t);
+}
+#endif
+
+
+/// Print state `s` in Graphviz dot syntax to stderr: a double circle if it
+/// ends a key, one labelled edge per child, and its failure edge if non-zero.
+void trie_node_to_graphviz(trie * a, size_t s) {
+       trie_node * n = &a->node[s];
+
+       // %zu is the conversion for size_t; %lu is only correct where size_t is unsigned long
+       if (n->match_type) {
+               fprintf(stderr, "\"%zu\" [shape=doublecircle]\n", s);
+       }
+
+       for (int i = 0; i < 256; ++i) {
+               if (n->child[i]) {
+                       fprintf(stderr, "\"%zu\" -> \"%zu\" [label=\"%c\"]\n", s, n->child[i], (char)i);
+               }
+       }
+
+       if (n->ac_fail) {
+               fprintf(stderr, "\"%zu\" -> \"%zu\" [label=\"fail\"]\n", s, n->ac_fail);
+       }
+}
+
+
+/// Dump the whole trie to stderr as a Graphviz digraph named "dfa".
+/// A NULL trie produces an empty graph.
+void trie_to_graphviz(trie * a) {
+       // size_t matches the type of a->size and avoids a signed/unsigned comparison
+       size_t count = a ? a->size : 0;
+
+       fprintf(stderr, "digraph dfa {\n");
+
+       for (size_t i = 0; i < count; ++i) {
+               trie_node_to_graphviz(a, i);
+       }
+
+       fprintf(stderr, "}\n");
+}
+
diff --git a/Sources/libMultiMarkdown/aho-corasick.h b/Sources/libMultiMarkdown/aho-corasick.h
new file mode 100644 (file)
index 0000000..73414f2
--- /dev/null
@@ -0,0 +1,117 @@
+/**
+
+       C-Template -- Boilerplate c project with cmake support, CuTest unit testing, and more.
+
+       @file aho-corasick.h
+
+       @brief Trie and Aho-Corasick multi-string search API; see aho-corasick.c.
+
+
+       @author Fletcher T. Penney
+       @bug    
+
+**/
+
+/*
+
+       Copyright © 2015-2017 Fletcher T. Penney.
+
+
+       The `c-template` project is released under the MIT License.
+       
+       GLibFacade.c and GLibFacade.h are from the MultiMarkdown v4 project:
+       
+               https://github.com/fletcher/MultiMarkdown-4/
+       
+       MMD 4 is released under both the MIT License and GPL.
+       
+       
+       CuTest is released under the zlib/libpng license. See CuTest.c for the text
+       of the license.
+       
+       
+       ## The MIT License ##
+       
+       Permission is hereby granted, free of charge, to any person obtaining a copy
+       of this software and associated documentation files (the "Software"), to deal
+       in the Software without restriction, including without limitation the rights
+       to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+       copies of the Software, and to permit persons to whom the Software is
+       furnished to do so, subject to the following conditions:
+       
+       The above copyright notice and this permission notice shall be included in
+       all copies or substantial portions of the Software.
+       
+       THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+       IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+       FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+       AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+       LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+       OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+       THE SOFTWARE.
+
+*/
+
+
+#ifndef AC_TEMPLATE_H
+#define AC_TEMPLATE_H
+
+#include <string.h>
+
+struct trie_node {                                                     // One node (state) of the trie
+       char                            c;                                      // Byte on the edge leading into this node (0 for the root)
+       unsigned short          match_type;                     // 0 = no key ends here, otherwise the type given to trie_insert()
+       unsigned short          len;                            // Length of the key that ends at this node
+       size_t                          child[256];                     // Index of the next node for each byte value; 0 = no edge
+       size_t                          ac_fail;                        // Node to fall back to when no child matches; 0 = root
+};
+
+typedef struct trie_node trie_node;
+
+
+struct trie {                                                          // Growable array of nodes addressed by index
+       size_t                          size;                           // How many nodes are in use (at least 1: the root)
+       size_t                          capacity;                       // How many nodes the array can hold before it must grow
+
+       trie_node *                     node;                           // Heap array of nodes; node[0] is the root
+};
+
+typedef struct trie trie;
+
+
+struct match {                                                         // Node of a doubly linked result list; ac_trie_search() lists begin with a dummy header (len 0)
+       size_t                          start;                          // Starting offset of this match in the searched text
+       size_t                          len;                            // Length of this match in bytes
+       unsigned short          match_type;                     // Match type copied from the trie node that matched
+       struct match *          next;                           // Pointer to next match in list (NULL at the tail)
+       struct match *          prev;                           // Pointer to previous match in list
+};
+
+typedef struct match match;
+
+
+trie * trie_new(size_t startingSize);
+
+bool trie_insert(trie * a, const char * key, unsigned short match_type);
+
+void ac_trie_prepare(trie * a);
+
+match * ac_trie_search(trie * a, const char * source, size_t len);
+
+match * ac_trie_leftmost_longest_search(trie * a, const char * source, size_t len);
+
+void trie_free(trie * a);
+
+void match_set_describe(match * m, const char * source);
+
+void match_set_filter_leftmost_longest(match * header);
+
+void match_free(match * m);
+
+
+#ifdef TEST
+#include "CuTest.h"
+#endif
+
+
+#endif
index 327ab1154d9c100e5494ba4116b37700cb543223..a1f836ee4649f6f74b4b9b1f3f5f314dccfea27a 100644 (file)
@@ -1094,6 +1094,7 @@ void mmd_export_token_html(DString * out, const char * source, token * t, scratc
                        mmd_export_token_tree_html(out, source, t->child, scratch);
                        break;
                case PAIR_BRACKET_ABBREVIATION:
+                       // Which might also be an "auto-tagged" abbreviation
                        if (scratch->extensions & EXT_NOTES) {
                                // Note-based syntax enabled
 
@@ -1113,8 +1114,10 @@ void mmd_export_token_html(DString * out, const char * source, token * t, scratc
                                // Get instance of the note used
                                temp_note = stack_peek_index(scratch->used_abbreviations, temp_short - 1);
 
-                               t->child->type = TEXT_EMPTY;
-                               t->child->mate->type = TEXT_EMPTY;
+                               if (t->child) {
+                                       t->child->type = TEXT_EMPTY;
+                                       t->child->mate->type = TEXT_EMPTY;
+                               }
 
                                if (temp_short3 == scratch->inline_abbreviations_to_free->size) {
                                        // This is a reference definition
@@ -1124,7 +1127,10 @@ void mmd_export_token_html(DString * out, const char * source, token * t, scratc
                                                print_const("<abbr title=\"");
                                                mmd_print_string_html(out, temp_note->clean_text, false);
                                                print_const("\">");
-                                               mmd_export_token_tree_html(out, source, t->child, scratch);
+                                               if (t->child)
+                                                       mmd_export_token_tree_html(out, source, t->child, scratch);
+                                               else
+                                                       print_token(t);
                                                print_const("</abbr>");
                                        } else {
                                                // This is the first time this note was used
@@ -1132,7 +1138,10 @@ void mmd_export_token_html(DString * out, const char * source, token * t, scratc
                                                print_const(" (<abbr title=\"");
                                                mmd_print_string_html(out, temp_note->clean_text, false);
                                                print_const("\">");
-                                               mmd_export_token_tree_html(out, source, t->child, scratch);
+                                               if (t->child)
+                                                       mmd_export_token_tree_html(out, source, t->child, scratch);
+                                               else
+                                                       print_token(t);
                                                print_const("</abbr>)");
                                        }
                                } else {
@@ -1274,6 +1283,7 @@ void mmd_export_token_html(DString * out, const char * source, token * t, scratc
                        }
                        break;
                case PAIR_BRACKET_GLOSSARY:
+                       // Which might also be an "auto-tagged" glossary
                        if (scratch->extensions & EXT_NOTES) {
                                // Note-based syntax enabled
 
index 12de1bf857f16700b5e786a633369c2fc0343dec..5f720b86e9bc30a1919f2e53dd448a8bd0f8976f 100644 (file)
@@ -222,5 +222,7 @@ void token_skip_until_type_multiple(token ** t, int n, ...);
 
 void token_split_on_char(token * t, const char * source, const char c);
 
+void token_split(token * t, size_t start, size_t len, unsigned short new_type);
+
 #endif
 
index 0a6d7e57f598e633955bb1a6b4ce395930ecaf05..f59fdabfbb86d9e71f19d96cefaf35ac2affef9a 100644 (file)
@@ -1013,6 +1013,7 @@ void mmd_export_token_latex(DString * out, const char * source, token * t, scrat
                        mmd_export_token_tree_latex(out, source, t->child, scratch);
                        break;
                case PAIR_BRACKET_ABBREVIATION:
+                       // Which might also be an "auto-tagged" abbreviation
                        if (scratch->extensions & EXT_NOTES) {
                                // Note-based syntax enabled
 
@@ -1206,6 +1207,7 @@ void mmd_export_token_latex(DString * out, const char * source, token * t, scrat
                        }
                        break;
                case PAIR_BRACKET_GLOSSARY:
+                       // Which might also be an "auto-tagged" glossary
                        if (scratch->extensions & EXT_NOTES) {
                                // Note-based syntax enabled
 
index b52fed1a2b65d7b103ae79863f47301b15f6e772..ed85c4c0c14321f581181d34be791763ce1228f6 100644 (file)
@@ -1,4 +1,4 @@
-/* Generated by re2c 0.14.3 on Sat Mar  4 20:43:39 2017 */
+/* Generated by re2c 0.14.3 on Thu Mar  9 19:02:17 2017 */
 /**
 
        MultiMarkdown 6 -- Lightweight markup processor to produce HTML, LaTeX, and more.
@@ -348,7 +348,6 @@ yy50:
        ++YYCURSOR;
        { return TEXT_PERCENT; }
 yy52:
-       YYCTXMARKER = YYCURSOR + 1;
        yyaccept = 5;
        yych = *(YYMARKER = ++YYCURSOR);
        switch (yych) {
@@ -413,6 +412,7 @@ yy68:
        default:        goto yy61;
        }
 yy69:
+       YYCTXMARKER = YYCURSOR + 1;
        yych = *++YYCURSOR;
        switch (yych) {
        case '\t':
@@ -437,7 +437,6 @@ yy70:
        default:        goto yy264;
        }
 yy71:
-       YYCTXMARKER = YYCURSOR + 1;
        ++YYCURSOR;
        yych = *YYCURSOR;
        switch (yych) {
index 9729c1957a41dbf32d699d3df0a71e7eaa6878a6..7417f6e4f1949c0b3cf7f4b9bc0461d45cc667b8 100644 (file)
@@ -218,7 +218,7 @@ int scan(Scanner * s, const char * stop) {
                '}'                                                             { return TEXT_BRACE_RIGHT; }
                '\\'                                                    { return TEXT_BACKSLASH; }
 
-               [0-9]+ / ('.' (SP|NL))                  { return TEXT_NUMBER_POSS_LIST; }
+               [0-9]+ '.' / (SP|NL)                    { return TEXT_NUMBER_POSS_LIST; }
                '.' / (SP|NL)                                   { return TEXT_PERIOD; }
 
                TEXT_LINEBREAK                                  { return TEXT_LINEBREAK; }
index d53e7599bb92c86c30f14685befa93620188c1be..b8e498c2dfe6aa601c410d1fa0def879f558dc18 100644 (file)
@@ -418,39 +418,28 @@ void mmd_assign_line_type(mmd_engine * e, token * line) {
                        break;
                case TEXT_NUMBER_POSS_LIST:
                        switch(source[line->child->next->start]) {
-                               case '.':
-                                       switch(source[line->child->next->start + 1]) {
-                                               case ' ':
-                                               case '\t':
-                                                       line->type = LINE_LIST_ENUMERATED;
-                                                       line->child->type = MARKER_LIST_ENUMERATOR;
-
-                                                       // Strip period
-                                                       line->child->next->type = TEXT_EMPTY;
-
-                                                       switch (line->child->next->next->type) {
-                                                               case TEXT_PLAIN:
-                                                                       // Strip whitespace between bullet and text
-                                                                       while (char_is_whitespace(source[line->child->next->next->start])) {
-                                                                               line->child->next->next->start++;
-                                                                               line->child->next->next->len--;
-                                                                       }
-                                                                       break;
-                                                               case INDENT_SPACE:
-                                                               case INDENT_TAB:
-                                                               case NON_INDENT_SPACE:
-                                                                       t = line->child->next;
-                                                                       while(t->next && ((t->next->type == INDENT_SPACE) ||
-                                                                               (t->next->type == INDENT_TAB) ||
-                                                                               (t->next->type == NON_INDENT_SPACE))) {
-                                                                               tokens_prune(t->next, t->next);
-                                                                       }
-                                                                       break;
+                               case ' ':
+                               case '\t':
+                                       line->type = LINE_LIST_ENUMERATED;
+                                       line->child->type = MARKER_LIST_ENUMERATOR;
+
+                                       switch (line->child->next->type) {
+                                               case TEXT_PLAIN:
+                                                       // Strip whitespace between bullet and text
+                                                       while (char_is_whitespace(source[line->child->next->start])) {
+                                                               line->child->next->start++;
+                                                               line->child->next->len--;
                                                        }
                                                        break;
-                                               default:
-                                                       line->type = LINE_PLAIN;
-                                                       line->child->type = TEXT_PLAIN;
+                                               case INDENT_SPACE:
+                                               case INDENT_TAB:
+                                               case NON_INDENT_SPACE:
+                                                       t = line->child;
+                                                       while(t->next && ((t->next->type == INDENT_SPACE) ||
+                                                               (t->next->type == INDENT_TAB) ||
+                                                               (t->next->type == NON_INDENT_SPACE))) {
+                                                               tokens_prune(t->next, t->next);
+                                                       }
                                                        break;
                                        }
                                        break;
index 9d2505fc73c16c30d36644903940e8340f0ce128..33146b84ccd4dcb301cab1c8ba46dce8e98f144f 100644 (file)
@@ -1151,6 +1151,7 @@ void mmd_export_token_odf(DString * out, const char * source, token * t, scratch
                        }
                        break;
                case PAIR_BRACKET_ABBREVIATION:
+                       // Which might also be an "auto-tagged" abbreviation
                        if (scratch->extensions & EXT_NOTES) {
                                // Note-based syntax enabled
 
@@ -1170,18 +1171,22 @@ void mmd_export_token_odf(DString * out, const char * source, token * t, scratch
                                // Get instance of the note used
                                temp_note = stack_peek_index(scratch->used_abbreviations, temp_short - 1);
 
-                               t->child->type = TEXT_EMPTY;
-                               t->child->mate->type = TEXT_EMPTY;
+                               if (t->child) {
+                                       t->child->type = TEXT_EMPTY;
+                                       t->child->mate->type = TEXT_EMPTY;
+                               }
 
                                if (temp_short2 == scratch->used_abbreviations->size) {
                                        // This is a re-use of a previously used note
 
                                        if (temp_short3 == scratch->inline_abbreviations_to_free->size) {
                                                // This is a reference definition
-                                               mmd_export_token_tree_odf(out, source, t->child, scratch);
+                                               mmd_print_string_odf(out, temp_note->clean_text);
+//                                             mmd_export_token_tree_odf(out, source, t->child, scratch);
                                        } else {
                                                // This is an inline definition
-                                               mmd_export_token_tree_odf(out, source, t->child, scratch);
+                                               mmd_print_string_odf(out, temp_note->clean_text);
+//                                             mmd_export_token_tree_odf(out, source, t->child, scratch);
                                        }
                                } else {
                                        // This is the first time this note was used
@@ -1210,6 +1215,7 @@ void mmd_export_token_odf(DString * out, const char * source, token * t, scratch
                        }
                        break;
                case PAIR_BRACKET_GLOSSARY:
+                       // Which might also be an "auto-tagged" glossary
                        if (scratch->extensions & EXT_NOTES) {
                                // Note-based syntax enabled
 
index c85314132a2979a944691043dc6b63b8dc9482b8..8a30ffe42d863d82b129ad58a81b48251589be60 100644 (file)
@@ -631,3 +631,28 @@ void token_split_on_char(token * t, const char * source, const char c) {
        }
 }
 
+
+/// Split token `t` so that the byte range [start, start + len) becomes a new
+/// token of type `new_type`, linked immediately after `t`.
+///
+/// `start` is an absolute offset in the same coordinate space as `t->start`,
+/// not an offset relative to the beginning of `t`.  After a successful call:
+///   - `t` is shortened to cover [t->start, start) (possibly zero length);
+///   - a new token of `new_type` covers [start, start + len);
+///   - if `t` originally extended past start + len, a third token of `t`'s
+///     own type covers the remainder.
+/// The new tokens are spliced between `t` and its former successor, and both
+/// the forward and the backward links of the chain are kept consistent.
+///
+/// Nothing is changed if `t` is NULL, if the requested range does not lie
+/// entirely inside `t`, or if a new token cannot be allocated.
+void token_split(token * t, size_t start, size_t len, unsigned short new_type) {
+       if (!t) {
+               return;
+       }
+
+       // Reject ranges outside `t`: otherwise `start - t->start` would wrap
+       // around (size_t) and the new token could overlap its neighbours.
+       if (start < t->start) {
+               return;
+       }
+
+       size_t offset = start - t->start;
+
+       if (offset > t->len || len > t->len - offset) {
+               return;
+       }
+
+       size_t stop = start + len;
+       size_t remainder = t->len - offset - len;
+
+       token * u = token_new(new_type, start, len);
+
+       if (!u) {
+               return;
+       }
+
+       // Last token of the newly inserted run (u, or the remainder token)
+       token * last = u;
+
+       if (remainder > 0) {
+               token * v = token_new(t->type, stop, remainder);
+
+               if (!v) {
+                       // NOTE(review): `u` is abandoned here because no token release
+                       // routine is visible from this function -- confirm how tokens
+                       // obtained from token_new() should be returned.
+                       return;
+               }
+
+               u->next = v;
+               v->prev = u;
+               last = v;
+       }
+
+       // Splice the new run between `t` and its former successor
+       last->next = t->next;
+
+       if (t->next) {
+               // Keep the backward link of the following token in sync; without
+               // this it would still point at `t`, skipping the new tokens.
+               t->next->prev = last;
+       }
+
+       t->next = u;
+       u->prev = t;
+
+       t->len = offset;
+}
+
index ae76e4d4a143a612d5d14de3dc3b7a98cd653c4c..9d1cbee2e5f9c7c3d244045ee26d5628aee04080 100644 (file)
@@ -59,6 +59,7 @@
 
 #include "libMultiMarkdown.h"
 
+#include "aho-corasick.h"
 #include "beamer.h"
 #include "char.h"
 #include "d_string.h"
@@ -1473,108 +1474,40 @@ void process_metadata_stack(mmd_engine * e, scratch_pad * scratch) {
                scratch->base_header_level = header_level;
 }
 
-/// kmp from http://stackoverflow.com/questions/8584644/strstr-for-a-string-that-is-not-null-terminated
-/// Search for a string within certain bounds (so we don't go past the end of the token)
-int *kmp_borders(char * needle, size_t nlen){
-    if (!needle) return NULL;
-    int i, j, *borders = malloc((nlen+1)*sizeof(*borders));
-    if (!borders) return NULL;
-    i = 0;
-    j = -1;
-    borders[i] = j;
-    while((size_t)i < nlen){
-        while(j >= 0 && needle[i] != needle[j]){
-            j = borders[j];
-        }
-        ++i;
-        ++j;
-        borders[i] = j;
-    }
-    return borders;
-}
-
-const char * kmp_search(const char * haystack, size_t haylen, char * needle, size_t nlen, int * borders){
-    size_t max_index = haylen-nlen, i = 0, j = 0;
-    while(i <= max_index){
-        while(j < nlen && *haystack && needle[j] == *haystack){
-            ++j;
-            ++haystack;
-        }
-        if (j == nlen){
-            return haystack-nlen;
-        }
-        if (!(*haystack)){
-            return NULL;
-        }
-        if (j == 0){
-            ++haystack;
-            ++i;
-        } else {
-            do{
-                i += j - (size_t)borders[j];
-                j = borders[j];
-            }while(j > 0 && needle[j] != *haystack);
-        }
-    }
-    return NULL;
-}
-
-const char * sstrnstr(const char * haystack, char * needle, size_t haylen){
-    if (!haystack || !needle){
-        return NULL;
-    }
-    size_t nlen = strlen(needle);
-    if (haylen < nlen){
-        return NULL;
-    }
-    int *borders = kmp_borders(needle, nlen);
-    if (!borders){
-        return NULL;
-    }
-    const char *match = kmp_search(haystack, haylen, needle, nlen, borders);
-    free(borders);
-    return match;
-}
-
-
-/// Search a text node for abbreviation matches
-/// TODO: This is an inefficient algorithm, searching
-/// each node once for *each* abbreviation.  A more
-/// advanced algorithm would search for all abbreviations
-/// simultaneously but require more setup (e.g. Aho-Corasick)
-void abbr_search_text(mmd_engine * e, token * t) {
-       const char * str = &e->dstr->str[t->start];
-
-       const char * match;
-       abbr * a;
 
-       for (int i = 0; i < e->abbreviation_stack->size; ++i)
-       {
-               a = stack_peek_index(e->abbreviation_stack, i);
+void automatic_search_text(mmd_engine * e, token * t, trie * ac) {
+       match * m = ac_trie_leftmost_longest_search(ac, &e->dstr->str[t->start], t->len);
+
+       match * walker;
+
+       token * tok = t;
+
+       if (m) {
+               walker = m->next;
 
-               match = sstrnstr(str, a->abbr, t->len);
+               while (walker) {
+                       token_split(tok, walker->start + t->start, walker->len, walker->match_type);
+
+                       // Advance token to section after the split (if present)
+                       tok = tok->next->next;
 
-               if (match) {
-                       fprintf(stderr, "Found match '%s' -> '%s' at %lu\n", a->abbr, a->expansion, (size_t) (match - e->dstr->str));
+                       // Advance to next match (if present)
+                       walker = walker->next;
                }
        }
+
+       match_free(m);
 }
 
 
 /// Determine which nodes to descend into to search for abbreviations
-void abbr_search(mmd_engine * e, token * t) {
+void automatic_search(mmd_engine * e, token * t, trie * ac) {
        while (t) {
                switch (t->type) {
                        case TEXT_PLAIN:
-                               abbr_search_text(e, t);
+                               automatic_search_text(e, t, ac);
                                break;
                        case DOC_START_TOKEN:
-                       case BLOCK_LIST_BULLETED:
-                       case BLOCK_LIST_BULLETED_LOOSE:
-                       case BLOCK_LIST_ENUMERATED:
-                       case BLOCK_LIST_ENUMERATED_LOOSE:
-                               abbr_search(e, t->child);
-                               break;
                        case BLOCK_PARA:
                        case BLOCK_H1:
                        case BLOCK_H2:
@@ -1582,6 +1515,10 @@ void abbr_search(mmd_engine * e, token * t) {
                        case BLOCK_H4:
                        case BLOCK_H5:
                        case BLOCK_H6:
+                       case BLOCK_LIST_BULLETED:
+                       case BLOCK_LIST_BULLETED_LOOSE:
+                       case BLOCK_LIST_ENUMERATED:
+                       case BLOCK_LIST_ENUMERATED_LOOSE:
                        case BLOCK_LIST_ITEM_TIGHT:
                        case BLOCK_LIST_ITEM:
                        case BLOCK_SETEXT_1:
@@ -1589,8 +1526,13 @@ void abbr_search(mmd_engine * e, token * t) {
                        case BLOCK_TABLE:
                        case BLOCK_TABLE_HEADER:
                        case BLOCK_TABLE_SECTION:
-                               abbr_search_text(e, t);
+                       case PAIR_QUOTE_DOUBLE:
+                       case PAIR_QUOTE_SINGLE:
+                       case PAIR_STAR:
+                       case PAIR_UL:
+                               automatic_search(e, t->child, ac);
                                break;                  
+//                     case PAIR_PAREN:
                        default:
                                break;
                }
@@ -1600,16 +1542,34 @@ void abbr_search(mmd_engine * e, token * t) {
 }
 
 
-void process_abbreviation_stack(mmd_engine * e, scratch_pad * scratch) {
-       abbr * a;
+void identify_global_search_terms(mmd_engine * e, scratch_pad * scratch) {
+       // Only search if we have a target
+       int count = e->abbreviation_stack->size + e->glossary_stack->size;
 
-       // Describe which abbreviations we are searching for
+       if (count == 0) {
+               return;
+       }
+
+       trie * ac = trie_new(0);
+       footnote * f;
+
+       // Add abbreviations to search trie
        for (int i = 0; i < e->abbreviation_stack->size; ++i)
        {
-               a = stack_peek_index(e->abbreviation_stack, i);
+               f = stack_peek_index(e->abbreviation_stack, i);
+               trie_insert(ac, f->label_text, PAIR_BRACKET_ABBREVIATION);
+       }
+
+       // Add glossary to search trie
+       for (int i = 0; i < e->glossary_stack->size; ++i)
+       {
+               f = stack_peek_index(e->glossary_stack, i);
+               trie_insert(ac, f->clean_text, PAIR_BRACKET_GLOSSARY);
        }
 
-       abbr_search(e, e->root);
+       ac_trie_prepare(ac);
+       automatic_search(e, e->root, ac);
+       trie_free(ac);
 }
 
 
@@ -1627,8 +1587,9 @@ void mmd_export_token_tree(DString * out, mmd_engine * e, short format) {
        // Process metadata
        process_metadata_stack(e, scratch);
 
-       // Process abbreviations
-       // process_abbreviation_stack(e, scratch);
+       // Process abbreviations, glossary, etc.
+       if (!(e->extensions & EXT_COMPATIBILITY)) 
+               identify_global_search_terms(e, scratch);
 
 
        switch (scratch->output_format) {
@@ -2017,15 +1978,27 @@ void citation_from_bracket(const char * source, scratch_pad * scratch, token * t
 
 void glossary_from_bracket(const char * source, scratch_pad * scratch, token * t, short * num) {
        // Get text inside bracket
-       char * text = text_inside_pair(source, t);
+       char * text;
+
+       if (t->child) {
+               text = text_inside_pair(source, t);
+       } else {
+               text = malloc(t->len + 2);
+               text[0] = '?';
+               memcpy(&text[1], &source[t->start], t->len);
+               text[t->len + 1] = '\0';
+       }
+
        short glossary_id = extract_glossary_from_stack(scratch, text);
        
        free(text);
 
        if (glossary_id == -1) {
                // No match, this is an inline glossary -- create a new glossary entry
-               t->child->type = TEXT_EMPTY;
-               t->child->mate->type = TEXT_EMPTY;
+               if (t->child) {
+                       t->child->type = TEXT_EMPTY;
+                       t->child->mate->type = TEXT_EMPTY;
+               }
 
                // Create glossary
                token * label = t->child;
@@ -2056,16 +2029,28 @@ void glossary_from_bracket(const char * source, scratch_pad * scratch, token * t
 
 void abbreviation_from_bracket(const char * source, scratch_pad * scratch, token * t, short * num) {
        // Get text inside bracket
-       char * text = text_inside_pair(source, t);
+       char * text;
+
+       if (t->child) {
+               text = text_inside_pair(source, t);
+       } else {
+               text = malloc(t->len + 2);
+               text[0] = '>';
+               memcpy(&text[1], &source[t->start], t->len);
+               text[t->len + 1] = '\0';
+       }
+
        short abbr_id = extract_abbreviation_from_stack(scratch, &text[1]);
        
        free(text);
 
        if (abbr_id == -1) {
                // No match, this is an inline glossary -- create a new glossary entry
-               t->child->type = TEXT_EMPTY;
-               t->child->mate->type = TEXT_EMPTY;
-
+               if (t->child) {
+                       t->child->type = TEXT_EMPTY;
+                       t->child->mate->type = TEXT_EMPTY;
+               }
+               
                // Create glossary
                token * label = t->child;
                while (label && label->type != PAIR_PAREN)
index f39d962f3d4469ec6ec96c81347f36efe7480a9d..eed1c05b15517b410a8a2d30102382a681c43735 100644 (file)
 
 <text:p text:style-name="Standard">BAR (bar)</text:p>
 
-<text:p text:style-name="Standard">foo bar</text:p>
+<text:p text:style-name="Standard">FOO BAR</text:p>
 
 <text:p text:style-name="Standard">FOOBAR (foobar)</text:p>
 
 <text:p text:style-name="Standard">5</text:p>
 
 <text:p text:style-name="Standard"><text:p text:style-name="Footnote">BAZ (BAT)</text:p> (baz)</text:p>
+
+<text:p text:style-name="Standard">FOO</text:p>
+
+<text:p text:style-name="Standard">BAR</text:p>
+
+<text:p text:style-name="Standard">FOOBAR</text:p>
+
+<text:p text:style-name="Standard">FOO BAR baz</text:p>
 </office:text>
 </office:body>
 </office:document>
index f5e2b4e69704a6f172ec233ab47610b6fd4c4108..539a78b03ea8f9c8dd5b2d7b439c843cef7164a8 100644 (file)
 
 <p>BAZ (BAT) (<abbr title="BAZ (BAT)">baz</abbr>)</p>
 
+<p><abbr title="FOO">foo</abbr></p>
+
+<p><abbr title="BAR">bar</abbr></p>
+
+<p><abbr title="FOOBAR">foobar</abbr></p>
+
+<p><abbr title="FOO BAR">foo bar</abbr> baz</p>
+
 </body>
 </html>
 
index ff3286b76f16e26c413844287518c8b89541aa4f..dabcbaf9928b379937ffef1488e5ca1f2b8012d0 100644 (file)
@@ -14,3 +14,11 @@ latex config:        article</p>
 <p>5</p>
 
 <p>[>(baz) BAZ (BAT)]</p>
+
+<p>foo</p>
+
+<p>bar</p>
+
+<p>foobar</p>
+
+<p>foo bar baz</p>
index 3e1c4b407dfe2620b66d0ae68b45087e686ff06f..3042175f6e14364d2669b7fdfc973528e3dfe639 100644 (file)
 
 \newacronym{baz}{baz}{BAZ (BAT)}\gls{baz}
 
+\gls{foo}
+
+\gls{bar}
+
+\gls{foobar}
+
+\gls{foo bar} baz
+
 \input{mmd6-article-footer}
 \end{document}
index 9bd19fafa581e99805144feb67353dc15754677b..cac55dc1a13b1d49184b94573092767b6b24cb98 100644 (file)
@@ -15,6 +15,14 @@ latex config:        article
 
 [>(baz) BAZ (BAT)]
 
+foo
+
+bar
+
+foobar
+
+foo bar baz
+
 
 [>foo]: FOO
 [>bar]: BAR
index 92a66ff32946138f200f1984394ec580133e1a6e..4e50cd638235a94778688151940f044da81ae024 100644 (file)
 <text:p text:style-name="Standard">[?bar]</text:p>
 
 <text:p text:style-name="Standard">5</text:p>
+
+<text:p text:style-name="Standard">foo</text:p>
+
+<text:p text:style-name="Standard">Foo Bar</text:p>
 </office:text>
 </office:body>
 </office:document>
index 6acdac5c64eb1f256091109d8af06c92d60d9c06..49eb2e76c281d11a009dbca171e921a904faf17a 100644 (file)
 
 <p>5</p>
 
+<p><a href="#gn:1" title="see glossary" class="glossary">foo</a></p>
+
+<p><a href="#gn:3" title="see glossary" class="glossary">Foo Bar</a></p>
+
 <div class="glossary">
 <hr />
 <ol>
index 1d4402459da3eb0671cc2c9b7d63bfac4e462ef1..13555178067bf07db0f0a85a4ef1b9e47b138384 100644 (file)
@@ -13,5 +13,9 @@ latex config: article</p>
 
 <p>5</p>
 
+<p>foo</p>
+
+<p>Foo Bar</p>
+
 <pre><code>With second para.
 </code></pre>
index a5a6a09ecb4b523853d05fdc7f9d83a76d2ca7b8..2001950f7f48d54f024777a15b278ad1d6727273 100644 (file)
@@ -22,5 +22,9 @@ With second para.}
 
 5
 
+\gls{foo}
+
+\gls{foobar}
+
 \input{mmd6-article-footer}
 \end{document}
index 3a3f860099cd9e53711bddb70766224e45a43049..5d529ea6e6ed11a4b6339f04ad6a5cb5d6b3c394 100644 (file)
@@ -13,6 +13,11 @@ latex config:        article
 
 5
 
+foo
+
+Foo Bar
+
+
 [?foo]: Reference
 
 [?Foo Bar]: Reference