From 4b074270111bc7ce7f40a58fe82f1b26bde59b5a Mon Sep 17 00:00:00 2001 From: Ulya Trofimovich Date: Sun, 6 Nov 2016 15:11:31 +0000 Subject: [PATCH] Skeleton: don't keep expanded range representation in nodes. For the most part character range is represented as a pair of bounds. When generating data for skeleton programs we need to pick some 256 distinct characters from each range; only these characters will participate in data generation. Before this commit range expansion happened at the time of skeleton construction. Now it is delayed until the time of dumping skeleton paths to file. This means that expansion of particular range is done multiple times (every time the corresponding skeleton multiarc is dumped to file). However, data generation only happens with '--skeleton' option, while skeleton construction is unavoidable. --- re2c/src/ir/skeleton/control_flow.cc | 10 ++-- re2c/src/ir/skeleton/generate_data.cc | 72 +++++++++++++++++++-------- re2c/src/ir/skeleton/path.h | 4 -- re2c/src/ir/skeleton/skeleton.cc | 23 ++------- re2c/src/ir/skeleton/skeleton.h | 7 +-- 5 files changed, 63 insertions(+), 53 deletions(-) diff --git a/re2c/src/ir/skeleton/control_flow.cc b/re2c/src/ir/skeleton/control_flow.cc index 7124c1a7..91d959cb 100644 --- a/re2c/src/ir/skeleton/control_flow.cc +++ b/re2c/src/ir/skeleton/control_flow.cc @@ -48,9 +48,9 @@ static void naked_paths( size = size + ucf_size_t::from64(prefix.len()); } else if (!loop) { loop = true; - Node::arcsets_t::const_iterator - arc = node.arcsets.begin(), - end = node.arcsets.end(); + Node::arcs_t::const_iterator + arc = node.arcs.begin(), + end = node.arcs.end(); for (; arc != end && !size.overflow(); ++arc) { const size_t j = arc->first; prefix.push(j); @@ -72,7 +72,7 @@ void warn_undefined_control_flow(const Skeleton &skel) } } -static void fprint_default_arc(FILE *f, const Node::arcset_t &arc) +static void fprint_default_arc(FILE *f, const Node::arc_t &arc) { const size_t ranges = arc.size(); if (ranges == 1 && arc[0].first == arc[0].second) { @@ -102,7 +102,7 @@ void fprint_default_path( if (i > 0) { fprintf(f, " "); } - const Node::arcset_t &arc = p.arcset(skel, i); + const Node::arc_t &arc = p.arc(skel, i); fprint_default_arc(stderr, arc); } fprintf(f, "'"); diff --git a/re2c/src/ir/skeleton/generate_data.cc b/re2c/src/ir/skeleton/generate_data.cc index b417b0e6..cdf58436 100644 --- a/re2c/src/ir/skeleton/generate_data.cc +++ b/re2c/src/ir/skeleton/generate_data.cc @@ -88,40 +88,72 @@ template static void keygen(FILE *f, size_t count, delete[] keys; } +// pick at most 0x100 unique edges from this range +// (for 1-byte code units this covers the whole range: [0 - 0xFF]) +// - range bounds must be included +// - values should be evenly distributed +// - values should be deterministic +static uint32_t step(uint32_t lower, uint32_t upper) +{ + return 1 + (upper - lower) / 0x100; +} + template static cover_size_t cover_one( const Skeleton &skel, cover_t &cover) { const path_t &path = cover.prefix; const size_t len = path.len(); - size_t count = 0; + size_t width = 0; for (size_t i = 0; i < len; ++i) { - count = std::max (count, path.arc(skel, i).size()); + + // width of multiarc: total number of characters picked from all ranges + size_t w = 0; + const Node::arc_t &arc = path.arc(skel, i); + for (Node::citer_t a = arc.begin(); a != arc.end(); ++a) { + const uint32_t + l = a->first, + u = a->second; + w += 2 + (u - l - 1) / step(l, u); + } + + // width of multipath: maximal width of multiarc + width = std::max(width, w); } const cover_size_t size = cover_size_t::from64(len) - * cover_size_t::from64(count); - if (!size.overflow()) { - // input - const size_t buffer_size = size.uint32(); - cunit_t *buffer = new cunit_t [buffer_size]; - for (size_t i = 0; i < len; ++i) { - const std::vector &arc = path.arc(skel, i); - const size_t width = arc.size(); - for (size_t j = 0; j < count; ++j) { - const size_t k = j % width; - buffer[j * len + i] = to_le( - static_cast(arc[k])); + * cover_size_t::from64(width); + if (size.overflow()) return size; + + // input + const size_t buffer_size = size.uint32(); + cunit_t *buffer = new cunit_t [buffer_size]; + for (size_t i = 0; i < len; ++i) { + + // pick some characters from ranges + size_t j = 0; + const Node::arc_t &arc = path.arc(skel, i); + for (Node::citer_t a = arc.begin(); a != arc.end(); ++a) { + const uint32_t + l = a->first, + u = a->second, + d = step(l, u); + for (uint32_t m = l; m < u + d; m += d, ++j) { + buffer[j * len + i] = to_le(static_cast(std::min(m, u))); } } - fwrite (buffer, sizeof(cunit_t), buffer_size, cover.input); - delete[] buffer; - // keys - const key_t match = skel.rule2key(path.match(skel), skel.defrule); - keygen(cover.keys, count, len, - path.len_matching(skel), match); + // fill the rest by rotating picked characters + for (const size_t w = j; j < width; ++j) { + buffer[j * len + i] = buffer[(j % w) * len + i]; + } } + fwrite (buffer, sizeof(cunit_t), buffer_size, cover.input); + delete[] buffer; + + // keys + const key_t match = skel.rule2key(path.match(skel), skel.defrule); + keygen(cover.keys, width, len, path.len_matching(skel), match); return size; } diff --git a/re2c/src/ir/skeleton/path.h b/re2c/src/ir/skeleton/path.h index d09f5539..3625941a 100644 --- a/re2c/src/ir/skeleton/path.h +++ b/re2c/src/ir/skeleton/path.h @@ -81,10 +81,6 @@ public: { return skel.nodes[arcs[i]].arcs.find(arcs[i + 1])->second; } - const Node::arcset_t& arcset(const Skeleton &skel, size_t i) const - { - return skel.nodes[arcs[i]].arcsets.find(arcs[i + 1])->second; - } void push(size_t n) { arcs.push_back(n); diff --git a/re2c/src/ir/skeleton/skeleton.cc b/re2c/src/ir/skeleton/skeleton.cc index 5a827d8c..6168b62d 100644 --- a/re2c/src/ir/skeleton/skeleton.cc +++ b/re2c/src/ir/skeleton/skeleton.cc @@ -10,7 +10,6 @@ namespace re2c Node::Node() : arcs() - , arcsets() , rule(Rule::NONE) , trail(Tag::NONE) , trver(TAGVER_ZERO) @@ -30,27 +29,13 @@ void Node::init(const bool *ts, size_t r, size_t tr, tagver_t tv, trail = tr; trver = tv; - uint32_t lb = 0; std::vector >::const_iterator i = a.begin(), e = a.end(); - for (; i != e; ++i) { - const size_t n = i->first; - const uint32_t ub = i->second - 1; - - // pick at most 0x100 unique edges from this range - // (for 1-byte code units this covers the whole range: [0 - 0xFF]) - // - range bounds must be included - // - values should be evenly distributed - // - values should be deterministic - const uint32_t step = 1 + (ub - lb) / 0x100; - for (uint32_t c = lb; c < ub; c += step) { - arcs[n].push_back(c); - } - arcs[n].push_back(ub); - - arcsets[n].push_back(std::make_pair(lb, ub)); - lb = ub + 1; + for (uint32_t l = 0; i != e; ++i) { + const uint32_t u = i->second; + arcs[i->first].push_back(std::make_pair(l, u - 1)); + l = u; } } diff --git a/re2c/src/ir/skeleton/skeleton.h b/re2c/src/ir/skeleton/skeleton.h index 72ca0b72..4d982ffd 100644 --- a/re2c/src/ir/skeleton/skeleton.h +++ b/re2c/src/ir/skeleton/skeleton.h @@ -27,14 +27,11 @@ typedef local_increment_t local_inc; struct Node { - typedef std::vector > arcset_t; - typedef std::map arcsets_t; - - typedef std::vector arc_t; + typedef std::vector > arc_t; typedef std::map arcs_t; + typedef arc_t::const_iterator citer_t; arcs_t arcs; - arcsets_t arcsets; size_t rule; size_t trail; tagver_t trver; -- 2.40.0