From c17f717d2114063c9cdf4e00b83884ff62cb6e81 Mon Sep 17 00:00:00 2001 From: Ulya Trofimovich Date: Sat, 5 Sep 2015 09:39:28 +0100 Subject: [PATCH] With '--skeleton', store input data in binary form (rather than C/C++ code). There's a limitation on the size of input files for C/C++ compiler and the compiled binary will have to contain all that data (and thus may grow very large). Storing data in binary form and reading it from file dynamically is the way it should be. --- re2c/src/codegen/emit_action.cc | 4 +- re2c/src/codegen/skeleton/generate_code.cc | 27 +++++++- re2c/src/codegen/skeleton/generate_data.cc | 72 ++++++++-------------- re2c/src/codegen/skeleton/skeleton.h | 6 +- 4 files changed, 55 insertions(+), 54 deletions(-) diff --git a/re2c/src/codegen/emit_action.cc b/re2c/src/codegen/emit_action.cc index 91bc76a8..c1c0565c 100644 --- a/re2c/src/codegen/emit_action.cc +++ b/re2c/src/codegen/emit_action.cc @@ -254,9 +254,9 @@ void emit_rule (OutputFile & o, uint32_t ind, const State * const s, const RuleO << "{ if (cursor == token + result[i].len_matching && result[i].match == " << rule->rank << ") " << "{ cursor = token + result[i].len; continue; }" << " else " - << "{ printf (\"error: %ld/%lu, %u/%u, '%s'\\n\", cursor - token, result[i].len_matching, result[i].match, " + << "{ printf (\"error at %u: %ld/%lu, %u/%u\\n\", i, cursor - token, result[i].len_matching, result[i].match, " << rule->rank - << ", token); return 1; } }\n"; + << "); return 1; } }\n"; } else { diff --git a/re2c/src/codegen/skeleton/generate_code.cc b/re2c/src/codegen/skeleton/generate_code.cc index e25f256a..4f74340d 100644 --- a/re2c/src/codegen/skeleton/generate_code.cc +++ b/re2c/src/codegen/skeleton/generate_code.cc @@ -21,7 +21,8 @@ void emit_prolog (OutputFile & o, uint32_t ind, uint32_t maxfill) } o << indent (ind) << "#include <stdio.h>\n"; - o << indent (ind) << "#include \"" << o.file_name << ".input" << "\"\n"; + o << indent (ind) << "#include <stdlib.h> // malloc, free\n"; + o << indent (ind) << 
"#include <string.h> // memset\n"; o << indent (ind) << "#include \"" << o.file_name << ".keys" << "\"\n"; o << indent (ind) << "int main ()\n"; o << indent (ind) << "{\n"; @@ -34,11 +35,27 @@ void emit_prolog (OutputFile & o, uint32_t ind, uint32_t maxfill) o << "#define " << mapCodeName["YYRESTORECTX"] << "() cursor = ctxmarker\n"; o << "#define " << mapCodeName["YYLESSTHAN"] << "(n) (limit - cursor) < n\n"; o << "#define " << mapCodeName["YYFILL"] << "(n) { break; }\n"; + o << "\n"; + o << indent (ind + 1) << "FILE * f = fopen (\"" << o.file_name << ".input" << "\", \"rb\");\n"; + o << "\n"; + o << indent (ind + 1) << "// get file size (measured in code units)\n"; + o << indent (ind + 1) << "fseek (f, 0, SEEK_END);\n"; + o << indent (ind + 1) << "const size_t size = ((size_t) ftell (f)) / sizeof (YYCTYPE);\n"; + o << indent (ind + 1) << "fseek (f, 0, SEEK_SET);\n"; + o << "\n"; + o << indent (ind + 1) << "// read file contents in buffer and pad it with YYMAXFILL zeroes\n"; + o << indent (ind + 1) << "const size_t YYMAXFILL = " << maxfill << ";\n"; + o << indent (ind + 1) << "YYCTYPE * data = (YYCTYPE *) malloc ((size + YYMAXFILL) * sizeof (YYCTYPE));\n"; + o << indent (ind + 1) << "fread (data, sizeof (YYCTYPE), size, f);\n"; + o << indent (ind + 1) << "memset (data + size, 0, YYMAXFILL * sizeof (YYCTYPE));\n"; + o << "\n"; o << indent (ind + 1) << "const YYCTYPE * cursor = data;\n"; o << indent (ind + 1) << "const YYCTYPE * marker = data;\n"; o << indent (ind + 1) << "const YYCTYPE * ctxmarker = data;\n"; - o << indent (ind + 1) << "const YYCTYPE * const limit = data + data_size + " << maxfill << ";\n"; - o << indent (ind + 1) << "for (unsigned int i = 0; cursor < data + data_size; ++i)\n"; + o << indent (ind + 1) << "const YYCTYPE * const limit = data + size + YYMAXFILL;\n"; + o << indent (ind + 1) << "const YYCTYPE * const cursor_end = data + size;\n"; + o << "\n"; + o << indent (ind + 1) << "for (unsigned int i = 0; cursor < cursor_end; ++i)\n"; o << indent (ind + 1) 
<< "{\n"; o << indent (ind + 2) << "const YYCTYPE * token = cursor;\n"; } @@ -55,6 +72,10 @@ void emit_epilog (OutputFile & o, uint32_t ind) o << "#undef " << mapCodeName["YYRESTORECTX"] << "\n"; o << "#undef " << mapCodeName["YYLESSTHAN"] << "\n"; o << "#undef " << mapCodeName["YYFILL"] << "\n"; + o << "\n"; + o << indent (ind + 1) << "free (data);\n"; + o << indent (ind + 1) << "fclose (f);\n"; + o << "\n"; o << indent (ind + 1) << "return 0;\n"; o << indent (ind) << "}\n"; } diff --git a/re2c/src/codegen/skeleton/generate_data.cc b/re2c/src/codegen/skeleton/generate_data.cc index e27cb61a..4d4cacd7 100644 --- a/re2c/src/codegen/skeleton/generate_data.cc +++ b/re2c/src/codegen/skeleton/generate_data.cc @@ -8,7 +8,7 @@ namespace re2c { -static void write_string (std::ofstream & f, const path_t & path); +static void write_string (FILE * f, const path_t & path); static void write_key (std::ofstream & f, const path_t & path); /* @@ -80,7 +80,7 @@ arccount_t Node::estimate_size_all (arccount_t wid, arccount_t len) } } -void Node::generate_paths_all (const std::vector<path_t> & prefixes, std::ofstream & input, std::ofstream & keys) +void Node::generate_paths_all (const std::vector<path_t> & prefixes, FILE * input, std::ofstream & keys) { const size_t wid = prefixes.size (); if (end ()) @@ -114,7 +114,7 @@ void Node::generate_paths_all (const std::vector<path_t> & prefixes, std::ofstre } // see note [estimating total size of paths in skeleton] -arccount_t Node::generate_paths_cover (const std::vector<path_t> & prefixes, std::ofstream & input, std::ofstream & keys) +arccount_t Node::generate_paths_cover (const std::vector<path_t> & prefixes, FILE * input, std::ofstream & keys) { arccount_t size (0u); const size_t wid = prefixes.size (); @@ -163,15 +163,13 @@ arccount_t Node::generate_paths_cover (const std::vector<path_t> & prefixes, std return size; } -uint32_t Skeleton::generate_paths (uint32_t line, const std::string & cond, std::ofstream & input, std::ofstream & keys) +void Skeleton::generate_paths (uint32_t line, 
const std::string & cond, FILE * input, std::ofstream & keys) { std::vector<path_t> prefixes; prefixes.push_back (path_t ()); - arccount_t size = nodes->estimate_size_all (arccount_t (1u), arccount_t (0u)); - if (size.overflow ()) + if (nodes->estimate_size_all (arccount_t (1u), arccount_t (0u)).overflow ()) { - size = nodes->generate_paths_cover (prefixes, input, keys); - if (size.overflow ()) + if (nodes->generate_paths_cover (prefixes, input, keys).overflow ()) { warning ( NULL @@ -186,35 +184,17 @@ uint32_t Skeleton::generate_paths (uint32_t line, const std::string & cond, std: { nodes->generate_paths_all (prefixes, input, keys); } - return size.uint32 (); } void Skeleton::emit_data (uint32_t line, const std::string & cond, const char * fname) { const std::string input_name = std::string (fname) + ".input"; - std::ofstream input; - input.open (input_name.c_str (), std::ofstream::out | std::ofstream::binary); - if (!input.is_open ()) + FILE * input = fopen (input_name.c_str (), "wb"); + if (!input) { error ("cannot open file: %s", input_name.c_str ()); exit (1); } - std::string yyctype; - switch (encoding.szCodeUnit ()) - { - case 1: - yyctype = "unsigned char"; - break; - case 2: - yyctype = "unsigned short"; - break; - case 4: - yyctype = "unsigned int"; - break; - } - input << "// These strings correspond to paths in DFA.\n"; - input << yyctype << " data [] =\n"; - input << "{\n"; const std::string keys_name = std::string (fname) + ".keys"; std::ofstream keys; @@ -233,35 +213,35 @@ void Skeleton::emit_data (uint32_t line, const std::string & cond, const char * keys << "Result result [] =\n"; keys << "{\n"; - const uint32_t size = generate_paths (line, cond, input, keys); + generate_paths (line, cond, input, keys); - input << indent (1); - // pad with 0x100 zeroes - // should have been YYMAXLEN zeroes, but we don't know YYMAXFILL yet - // temporary hack - for (uint32_t i = 0; i < 0x100; ++i) - { - input << "0,"; - } - input << "\n"; - input << "};\n"; - input << 
"const unsigned int data_size = " << size << ";\n"; - input.close (); + fclose (input); keys << "};\n"; keys.close (); } -void write_string (std::ofstream & f, const path_t & path) +template <typename type_t> +static void write_cunits (FILE * f, const path_t & path) { - f << indent (1); const size_t len = path.len (); + type_t * cunits = new type_t [len]; for (size_t i = 0 ; i < len; ++i) { - prtChOrHex (f, path[i]); - f << ","; + cunits[i] = static_cast<type_t> (path[i]); + } + fwrite (cunits, sizeof (type_t), len, f); + delete [] cunits; +} + +void write_string (FILE * f, const path_t & path) +{ + switch (encoding.szCodeUnit ()) + { + case 4: write_cunits<uint32_t> (f, path); break; + case 2: write_cunits<uint16_t> (f, path); break; + case 1: write_cunits<uint8_t> (f, path); break; } - f << "\n"; } void write_key (std::ofstream & f, const path_t & path) diff --git a/re2c/src/codegen/skeleton/skeleton.h b/re2c/src/codegen/skeleton/skeleton.h index 8e252d4d..b3f9d391 100644 --- a/re2c/src/codegen/skeleton/skeleton.h +++ b/re2c/src/codegen/skeleton/skeleton.h @@ -44,8 +44,8 @@ struct Node ~Node (); bool end () const; arccount_t estimate_size_all (arccount_t inarcs, arccount_t len); - void generate_paths_all (const std::vector<path_t> & prefixes, std::ofstream & input, std::ofstream & keys); - arccount_t generate_paths_cover (const std::vector<path_t> & prefixes, std::ofstream & input, std::ofstream & keys); + void generate_paths_all (const std::vector<path_t> & prefixes, FILE * input, std::ofstream & keys); + arccount_t generate_paths_cover (const std::vector<path_t> & prefixes, FILE * input, std::ofstream & keys); arccount_t generate_paths_default (const multipath_t & prefix, std::vector<multipath_t> & paths); FORBID_COPY (Node); @@ -61,7 +61,7 @@ struct Skeleton void emit_data (uint32_t line, const std::string & cond, const char * fname); private: - uint32_t generate_paths (uint32_t line, const std::string & cond, std::ofstream & input, std::ofstream & keys); + void generate_paths (uint32_t line, const std::string & cond, FILE * input, std::ofstream & keys); 
FORBID_COPY (Skeleton); }; -- 2.40.0