From ae656007968d9f73e74bf083bcb3a4369f45f076 Mon Sep 17 00:00:00 2001 From: Ulya Trofimovich Date: Mon, 14 Sep 2015 13:19:15 +0100 Subject: [PATCH] Compacted keys representation (with '--skeleton'). Determine maximal path length and maximal rule number while constructing skeleton; take maximim of these two values; choose unsigned integer type of minimal width capable of holding maximim. Note: re2c operates on exact-width integers, but the generated program doesn't (it might not have ). When generating the program, re2c choses one of unsigned 'char', 'short', 'int' and 'long' types (that one 'sizeof' which is equal to the disired key size). re2c makes some implicit assumptions (generated program is run on the same platform as re2c, byte consists of 8 bits, etc.). Perhaps re2c should hardcode these assumptions in the generated program and check them on start. --- re2c/src/codegen/emit_dfa.cc | 2 +- re2c/src/codegen/skeleton/generate_code.cc | 53 +++++++--- re2c/src/codegen/skeleton/generate_data.cc | 110 +++++++++++---------- re2c/src/codegen/skeleton/maxlen.cc | 14 --- re2c/src/codegen/skeleton/skeleton.cc | 43 +++++++- re2c/src/codegen/skeleton/skeleton.h | 16 ++- 6 files changed, 147 insertions(+), 91 deletions(-) diff --git a/re2c/src/codegen/emit_dfa.cc b/re2c/src/codegen/emit_dfa.cc index 68046a95..3f91e1f8 100644 --- a/re2c/src/codegen/emit_dfa.cc +++ b/re2c/src/codegen/emit_dfa.cc @@ -119,7 +119,7 @@ void DFA::emit(Output & output, uint32_t& ind, bool isLastCond, bool& bPrologBra if (flag_skeleton) { skeleton->emit_data (o.file_name); - Skeleton::emit_prolog (o, output.max_fill); + skeleton->emit_prolog (o, output.max_fill); } if (bProlog) { diff --git a/re2c/src/codegen/skeleton/generate_code.cc b/re2c/src/codegen/skeleton/generate_code.cc index 4a1ec723..785bfdf4 100644 --- a/re2c/src/codegen/skeleton/generate_code.cc +++ b/re2c/src/codegen/skeleton/generate_code.cc @@ -4,28 +4,43 @@ namespace re2c { -void Skeleton::emit_prolog (OutputFile & o, uint32_t maxfill) +static void exact_uint (OutputFile & o, size_t width) { - std::string yyctype; - switch (encoding.szCodeUnit ()) + switch (width) { - case 1: - yyctype = "unsigned char"; + case sizeof (char): + o << "unsigned char"; break; - case 2: - yyctype = "unsigned short"; + case sizeof (short): + o << "unsigned short"; break; - case 4: - yyctype = "unsigned int"; + case sizeof (int): + o << "unsigned int"; + break; + case sizeof (long): + o << "unsigned long"; + break; + default: + o << "uint" << width * 8 << "_t"; break; } +} +void Skeleton::emit_prolog (OutputFile & o, uint32_t maxfill) const +{ o << "\n" << "#include "; o << "\n" << "#include // malloc, free"; o << "\n" << "#include // memset"; o << "\n"; - o << "\n" << "typedef " << yyctype << " YYCTYPE;"; - o << "\n" << "typedef unsigned int YYKEYTYPE;"; + + o << "\n" << "typedef "; + exact_uint (o, encoding.szCodeUnit ()); + o << " YYCTYPE;"; + + o << "\n" << "typedef "; + exact_uint (o, sizeof_key); + o << " YYKEYTYPE;"; + o << "\n"; o << "\n" << "#define YYPEEK() *cursor"; o << "\n" << "#define YYSKIP() ++cursor"; @@ -60,10 +75,18 @@ void Skeleton::emit_prolog (OutputFile & o, uint32_t maxfill) o << "\n" << indString << "}"; o << "\n" << indString << "else"; o << "\n" << indString << "{"; - o << "\n" << indString << indString << "const char * fmt = \"error at position %ld (iteration %u):\\n\""; - o << "\n" << indString << indString << indString << "\"\\texpected: match length %ld, rule %u\\n\""; - o << "\n" << indString << indString << indString << "\"\\tactual: match length %ld, rule %u\\n\";"; - o << "\n" << indString << indString << "fprintf (stderr, fmt, pos, i, len_exp, rule_exp, len_act, rule_act);"; + o << "\n" << indString << indString << "fprintf"; + o << "\n" << indString << indString << indString << "( stderr"; + o << "\n" << indString << indString << indString << ", \"error at position %ld (iteration %u):\\n\""; + o << "\n" << indString << indString << indString << indString << "\"\\texpected: match length %ld, rule %u\\n\""; + o << "\n" << indString << indString << indString << indString << "\"\\tactual: match length %ld, rule %u\\n\""; + o << "\n" << indString << indString << indString << ", pos"; + o << "\n" << indString << indString << indString << ", i"; + o << "\n" << indString << indString << indString << ", len_exp"; + o << "\n" << indString << indString << indString << ", rule_exp"; + o << "\n" << indString << indString << indString << ", len_act"; + o << "\n" << indString << indString << indString << ", rule_act"; + o << "\n" << indString << indString << indString << ");"; o << "\n" << indString << indString << "return 1;"; o << "\n" << indString << "}"; o << "\n" << "}"; diff --git a/re2c/src/codegen/skeleton/generate_data.cc b/re2c/src/codegen/skeleton/generate_data.cc index 994902a3..625b7047 100644 --- a/re2c/src/codegen/skeleton/generate_data.cc +++ b/re2c/src/codegen/skeleton/generate_data.cc @@ -8,8 +8,10 @@ namespace re2c { -static void permutate_one (FILE * input, FILE * keys, const multipath_t & path); -static arccount_t cover_one (FILE * input, FILE * keys, const multipath_t & prefix, const path_t & suffix); +template + static void permutate_one (FILE * input, FILE * keys, const multipath_t & path); +template + static arccount_t cover_one (FILE * input, FILE * keys, const multipath_t & prefix, const path_t & suffix); /* * note [estimating total size of paths in skeleton] @@ -96,11 +98,12 @@ arccount_t Node::sizeof_permutate (arccount_t wid, arccount_t len) * abandoned and recursion returns immediately. * */ -void Node::permutate (const multipath_t & prefix, FILE * input, FILE * keys) +template + void Node::permutate (const multipath_t & prefix, FILE * input, FILE * keys) { if (end ()) { - permutate_one (input, keys, prefix); + permutate_one (input, keys, prefix); } else if (loop < 2) { @@ -109,7 +112,7 @@ void Node::permutate (const multipath_t & prefix, FILE * input, FILE * keys) { multipath_t new_prefix = prefix; new_prefix.extend (i->first->rule, &i->second); - i->first->permutate (new_prefix, input, keys); + i->first->permutate (new_prefix, input, keys); } } } @@ -139,12 +142,13 @@ void Node::permutate (const multipath_t & prefix, FILE * input, FILE * keys) * abandoned and recursion returns immediately. * */ -arccount_t Node::cover (const multipath_t & prefix, FILE * input, FILE * keys) +template + arccount_t Node::cover (const multipath_t & prefix, FILE * input, FILE * keys) { arccount_t size (0u); if (suffix != NULL) { - size = cover_one (input, keys, prefix, *suffix); + size = cover_one (input, keys, prefix, *suffix); } else if (end ()) { @@ -157,7 +161,7 @@ arccount_t Node::cover (const multipath_t & prefix, FILE * input, FILE * keys) { multipath_t new_prefix = prefix; new_prefix.extend (i->first->rule, &i->second); - size = size + i->first->cover (new_prefix, input, keys); + size = size + i->first->cover (new_prefix, input, keys); if (size.overflow ()) { return arccount_t::limit (); @@ -172,12 +176,13 @@ arccount_t Node::cover (const multipath_t & prefix, FILE * input, FILE * keys) return size; } -void Skeleton::generate_paths (FILE * input, FILE * keys) +template + void Skeleton::generate_paths_cunit_key (FILE * input, FILE * keys) { multipath_t prefix (nodes->rule); if (nodes->sizeof_permutate (arccount_t (1u), arccount_t (0u)).overflow ()) { - if (nodes->cover (prefix, input, keys).overflow ()) + if (nodes->cover (prefix, input, keys).overflow ()) { warning ( NULL @@ -190,7 +195,28 @@ void Skeleton::generate_paths (FILE * input, FILE * keys) } else { - nodes->permutate (prefix, input, keys); + nodes->permutate (prefix, input, keys); + } +} + +template + void Skeleton::generate_paths_cunit (FILE * input, FILE * keys) +{ + switch (sizeof_key) + { + case 4: generate_paths_cunit_key (input, keys); break; + case 2: generate_paths_cunit_key (input, keys); break; + case 1: generate_paths_cunit_key (input, keys); break; + } +} + +void Skeleton::generate_paths (FILE * input, FILE * keys) +{ + switch (encoding.szCodeUnit ()) + { + case 4: generate_paths_cunit (input, keys); break; + case 2: generate_paths_cunit (input, keys); break; + case 1: generate_paths_cunit (input, keys); break; } } @@ -217,24 +243,23 @@ void Skeleton::emit_data (const char * fname) fclose (keys); } -static void keygen (FILE * f, size_t count, size_t len, size_t len_match, rule_rank_t match) +template + static void keygen (FILE * f, size_t count, size_t len, size_t len_match, rule_rank_t match) { - assert (sizeof (uint32_t) == sizeof (unsigned int)); - const size_t keys_size = 3 * count; - uint32_t * keys = new uint32_t [keys_size]; + key_t * keys = new key_t [keys_size]; for (uint32_t i = 0; i < keys_size;) { - keys[i++] = static_cast (len); - keys[i++] = static_cast (len_match); - keys[i++] = match.uint32 (); + keys[i++] = static_cast (len); + keys[i++] = static_cast (len_match); + keys[i++] = static_cast (match.uint32 ()); } - fwrite (keys, sizeof (uint32_t), keys_size, f); + fwrite (keys, sizeof (key_t), keys_size, f); delete [] keys; } -template -static void generic_permutate_one (FILE * input, FILE * keys, const multipath_t & path) +template + static void permutate_one (FILE * input, FILE * keys, const multipath_t & path) { const size_t len = path.len (); @@ -246,7 +271,7 @@ static void generic_permutate_one (FILE * input, FILE * keys, const multipath_t // input const size_t buffer_size = len * count; - type_t * buffer = new type_t [buffer_size]; + cunit_t * buffer = new cunit_t [buffer_size]; for (size_t i = 0, period = count; i < len; ++i) { const multiarc_t & arc = *path[i]; @@ -255,28 +280,18 @@ static void generic_permutate_one (FILE * input, FILE * keys, const multipath_t for (size_t j = 0; j < count; ++j) { const size_t k = (j / period) % width; - buffer[j * len + i] = static_cast (arc[k]); + buffer[j * len + i] = static_cast (arc[k]); } } - fwrite (buffer, sizeof (type_t), buffer_size, input); + fwrite (buffer, sizeof (cunit_t), buffer_size, input); delete [] buffer; // keys - keygen (keys, count, len, path.len_matching (), path.match ()); -} - -static void permutate_one (FILE * input, FILE * keys, const multipath_t & path) -{ - switch (encoding.szCodeUnit ()) - { - case 4: generic_permutate_one (input, keys, path); break; - case 2: generic_permutate_one (input, keys, path); break; - case 1: generic_permutate_one (input, keys, path); break; - } + keygen (keys, count, len, path.len_matching (), path.match ()); } -template -static arccount_t generic_cover_one (FILE * input, FILE * keys, const multipath_t & prefix, const path_t & suffix) +template + static arccount_t cover_one (FILE * input, FILE * keys, const multipath_t & prefix, const path_t & suffix) { const size_t prefix_len = prefix.len (); const size_t suffix_len = suffix.len (); @@ -293,7 +308,7 @@ static arccount_t generic_cover_one (FILE * input, FILE * keys, const multipath_ { // input const size_t buffer_size = size.uint32 (); - type_t * buffer = new type_t [buffer_size]; + cunit_t * buffer = new cunit_t [buffer_size]; for (size_t i = 0; i < prefix_len; ++i) { const std::vector & arc = *prefix[i]; @@ -301,19 +316,19 @@ static arccount_t generic_cover_one (FILE * input, FILE * keys, const multipath_ for (size_t j = 0; j < count; ++j) { const size_t k = j % width; - buffer[j * len + i] = static_cast (arc[k]); + buffer[j * len + i] = static_cast (arc[k]); } } for (size_t i = 0; i < suffix_len; ++i) { - const type_t c = static_cast (suffix[i]); + const cunit_t c = static_cast (suffix[i]); const size_t k = prefix_len + i; for (size_t j = 0; j < count; ++j) { buffer[j * len + k] = c; } } - fwrite (buffer, sizeof (type_t), buffer_size, input); + fwrite (buffer, sizeof (cunit_t), buffer_size, input); delete [] buffer; // keys @@ -324,21 +339,10 @@ static arccount_t generic_cover_one (FILE * input, FILE * keys, const multipath_ const rule_rank_t match = none ? prefix.match () : suffix.match (); - keygen (keys, count, len, len_match, match); + keygen (keys, count, len, len_match, match); } return size; } -static arccount_t cover_one (FILE * input, FILE * keys, const multipath_t & prefix, const path_t & suffix) -{ - switch (encoding.szCodeUnit ()) - { - case 4: return generic_cover_one (input, keys, prefix, suffix); - case 2: return generic_cover_one (input, keys, prefix, suffix); - case 1: return generic_cover_one (input, keys, prefix, suffix); - default: return arccount_t (0u); - } -} - } // namespace re2c diff --git a/re2c/src/codegen/skeleton/maxlen.cc b/re2c/src/codegen/skeleton/maxlen.cc index 12516e20..cac7a8b4 100644 --- a/re2c/src/codegen/skeleton/maxlen.cc +++ b/re2c/src/codegen/skeleton/maxlen.cc @@ -1,7 +1,4 @@ -#include // exit - #include "src/codegen/skeleton/skeleton.h" -#include "src/conf/msg.h" namespace re2c { @@ -37,15 +34,4 @@ void Node::calc_dist () } } -void Skeleton::calc_maxlen () -{ - nodes->calc_dist (); - maxlen = nodes->dist; - if (maxlen == Node::DIST_MAX) - { - error ("DFA path %sis too long", incond (cond).c_str ()); - exit (1); - } -} - } // namespace re2c diff --git a/re2c/src/codegen/skeleton/skeleton.cc b/re2c/src/codegen/skeleton/skeleton.cc index 71870461..68cbc8b6 100644 --- a/re2c/src/codegen/skeleton/skeleton.cc +++ b/re2c/src/codegen/skeleton/skeleton.cc @@ -1,4 +1,7 @@ +#include // exit + #include "src/codegen/skeleton/skeleton.h" +#include "src/conf/msg.h" #include "src/ir/regexp/regexp_rule.h" namespace re2c @@ -55,8 +58,9 @@ Skeleton::Skeleton (const DFA & dfa) // +1 for default DFA state (NULL) : cond (dfa.cond) , line (dfa.line) - , nodes (new Node [dfa.nStates + 1]) - , maxlen (Node::DIST_MAX) + , nodes_count (dfa.nStates + 1) // +1 for default state + , nodes (new Node [nodes_count]) + , sizeof_key (0) { Node * n; @@ -77,7 +81,40 @@ Skeleton::Skeleton (const DFA & dfa) } n->init (NULL, s2n); - calc_maxlen (); + // calculate maximal path length, check overflow + nodes->calc_dist (); + const uint32_t maxlen = nodes->dist; + if (maxlen == Node::DIST_MAX) + { + error ("DFA path %sis too long", incond (cond).c_str ()); + exit (1); + } + + // calculate maximal rule rank + uint32_t maxrule = 0; + for (uint32_t i = 0; i < nodes_count; ++i) + { + const rule_rank_t r = nodes[i].rule; + if (!r.is_none ()) + { + maxrule = std::max (maxrule, r.uint32 ()); + } + } + + // initialize size of key + const uint32_t max = std::max (maxlen, maxrule); + if (max <= UINT8_MAX) + { + sizeof_key = 1; + } + else if (max <= UINT16_MAX) + { + sizeof_key = 2; + } + else + { + sizeof_key = 4; + } } Skeleton::~Skeleton () diff --git a/re2c/src/codegen/skeleton/skeleton.h b/re2c/src/codegen/skeleton/skeleton.h index 08ef7c14..cab3b21e 100644 --- a/re2c/src/codegen/skeleton/skeleton.h +++ b/re2c/src/codegen/skeleton/skeleton.h @@ -49,8 +49,10 @@ struct Node bool end () const; void calc_dist (); arccount_t sizeof_permutate (arccount_t inarcs, arccount_t len); - void permutate (const multipath_t & prefix, FILE * input, FILE * keys); - arccount_t cover (const multipath_t & prefix, FILE * input, FILE * keys); + template + void permutate (const multipath_t & prefix, FILE * input, FILE * keys); + template + arccount_t cover (const multipath_t & prefix, FILE * input, FILE * keys); arccount_t naked_ways (const way_t & prefix, std::vector & ways); FORBID_COPY (Node); @@ -61,19 +63,23 @@ struct Skeleton const std::string cond; const uint32_t line; + const uint32_t nodes_count; Node * nodes; - uint32_t maxlen; + size_t sizeof_key; Skeleton (const DFA & dfa); ~Skeleton (); void warn_undefined_control_flow (); void emit_data (const char * fname); - static void emit_prolog (OutputFile & o, uint32_t maxfill); + void emit_prolog (OutputFile & o, uint32_t maxfill) const; static void emit_epilog (OutputFile & o); static void emit_action (OutputFile & o, uint32_t ind, rule_rank_t rank); private: - void calc_maxlen (); + template + void generate_paths_cunit_key (FILE * input, FILE * keys); + template + void generate_paths_cunit (FILE * input, FILE * keys); void generate_paths (FILE * input, FILE * keys); FORBID_COPY (Skeleton); -- 2.40.0