This commit effectively reverts commit
5cf441dc936c16e2264e49038ebe9a108dc750b9
"Base '+' (one or more repetitions) on '*' (zero or more repetitions)."
It turns out there is a good reason for using '+' as the base operation:
'*' can be expressed in terms of '+' as 'r* ::= r+ | <empty>', while
'+' expands as 'r+ ::= r r*' and 'r' is duplicated.
Duplication becomes crucial in the presence of tags: if the duplicated
subexpression has tags, then duplication causes an error.
-/* Generated by re2c 0.16 on Wed May 11 15:38:08 2016 */
+/* Generated by re2c 0.16 on Mon Sep 26 12:27:15 2016 */
#line 1 "../src/parse/lex.re"
#include "src/util/c99_stdint.h"
#include <stddef.h>
185, 185, 188, 197, 208, 212, 218, 224, 231, 240,
248, 258, 269, 275, 281, 284, 291, 297, 307, 310,
317, 321, 327, 331, 338, 342, 349, 353, 360, 364,
- 379, 398, 402, 406, 410, 417, 427, 431
+ 381, 400, 404, 408, 412, 419, 429, 433
};
#endif
case 39:
{
+ // see note [Kleene star is expressed in terms of plus]
switch((yyvsp[(2) - (2)].op))
{
case '*':
- (yyval.regexp) = RegExp::make_iter((yyvsp[(1) - (2)].regexp));
+ (yyval.regexp) = RegExp::make_alt(RegExp::make_nil(),
+ RegExp::make_iter((yyvsp[(1) - (2)].regexp)));
break;
case '+':
- (yyval.regexp) = RegExp::make_cat(RegExp::make_iter((yyvsp[(1) - (2)].regexp)), (yyvsp[(1) - (2)].regexp));
+ (yyval.regexp) = RegExp::make_iter((yyvsp[(1) - (2)].regexp));
break;
case '?':
(yyval.regexp) = mkAlt((yyvsp[(1) - (2)].regexp), RegExp::make_nil());
s = regexp2nfa(nfa, nrule, tagidx, re->cat.re2, t);
s = regexp2nfa(nfa, nrule, tagidx, re->cat.re1, s);
break;
- case RegExp::ITER:
- s = &nfa.states[nfa.size++];
- s->make_alt(nrule, t, regexp2nfa(nfa, nrule, tagidx, re->iter, s));
+ case RegExp::ITER: {
+ // see note [Kleene star is expressed in terms of plus]
+ nfa_state_t *q = &nfa.states[nfa.size++];
+ s = regexp2nfa(nfa, nrule, tagidx, re->iter, q);
+ q->make_alt(nrule, t, s);
break;
+ }
case RegExp::TAG:
if ((*nfa.tags)[tagidx].type == Tag::VAR) {
s = &nfa.states[nfa.size++];
return true;
}
switch (re->type) {
- case RegExp::NIL:
- case RegExp::ITER:
- return true;
+ default: assert(false);
+ case RegExp::NIL: return true;
+ case RegExp::SYM:
+ case RegExp::ITER: return false;
case RegExp::TAG:
if (re->tag == NULL) {
trail = true;
}
return true;
- case RegExp::SYM:
- return false;
case RegExp::ALT:
return nullable(re->alt.re1, trail)
|| nullable(re->alt.re2, trail);
case RegExp::CAT:
return nullable(re->cat.re1, trail)
&& nullable(re->cat.re2, trail);
- default:
- assert(false);
}
}
// see note [counted repetition expansion]
const RegExp *repeat_from(const RegExp *re, uint32_t n)
{
+ // see note [Kleene star is expressed in terms of plus]
return doCat(repeat(re, n),
- RegExp::make_iter(re));
+ RegExp::make_alt(RegExp::make_nil(), RegExp::make_iter(re)));
}
} // namespace re2c
typedef std::vector<uint32_t> charset_t;
+/* note [Kleene star is expressed in terms of plus]
+ *
+ * In literature Kleene star 'r*' (zero or more repetitions of 'r')
+ * is the basic operation. In practice it is more convenient to use
+ * 'r+' (one or more repetitions of 'r'), because expansion 'r+ ::= r r*'
+ * duplicates 'r', while expansion 'r* ::= r+ | <empty>' avoids
+ * duplication. This is more efficient in general and crucial
+ * in cases when duplication of 'r' is forbidden (e.g. if 'r' has tags).
+ */
+
struct RegExp
{
static free_list<RegExp*> flist;
}
| primary close
{
+ // see note [Kleene star is expressed in terms of plus]
switch($2)
{
case '*':
- $$ = RegExp::make_iter($1);
+ $$ = RegExp::make_alt(RegExp::make_nil(),
+ RegExp::make_iter($1));
break;
case '+':
- $$ = RegExp::make_cat(RegExp::make_iter($1), $1);
+ $$ = RegExp::make_iter($1);
break;
case '?':
$$ = mkAlt($1, RegExp::make_nil());
--- /dev/null
+/* Generated by re2c */
+// ensure 'r+' (one or more repetitions) expansion does not duplicate 'r'
+// this is crucial if 'r' contains tags (tag duplication is forbidden)
+
+
+{
+ YYCTYPE yych;
+ long yytag0p;
+ YYCTXMARKER = YYCURSOR;
+ if (YYLIMIT <= YYCURSOR) YYFILL(1);
+ yych = *YYCURSOR;
+ switch (yych) {
+ case 'a':
+ yytag0p = (YYCURSOR - YYCTXMARKER);
+ goto yy4;
+ default: goto yy2;
+ }
+yy2:
+ ++YYCURSOR;
+ { d }
+yy4:
+ ++YYCURSOR;
+ if (YYLIMIT <= YYCURSOR) YYFILL(1);
+ yych = *YYCURSOR;
+ switch (yych) {
+ case 'a':
+ yytag0p = (YYCURSOR - YYCTXMARKER);
+ goto yy4;
+ default: goto yy6;
+ }
+yy6:
+ { (YYCTXMARKER + yytag0p) }
+}
+
--- /dev/null
+// ensure 'r+' (one or more repetitions) expansion does not duplicate 'r'
+// this is crucial if 'r' contains tags (tag duplication is forbidden)
+
+/*!re2c
+ (@p "a")+ { @p }
+ * { d }
+*/