Added scanner stack flags for case-insensitivity.
Moved case-folding code from DFA-generation to parse time read-macros.
Added localized case-sensitivity syntax from Perl.
Added test for new syntax in test suite.
Documented new syntax.
}
}
- if (caseins && !useecs) {
- register int j;
-
- for (i = 'A', j = 'a'; i <= 'Z'; ++i, ++j) {
- if (state[i] == 0 && state[j] != 0)
- /* We're adding a transition. */
- ++totaltrans;
-
- else if (state[i] != 0 && state[j] == 0)
- /* We're taking away a transition. */
- --totaltrans;
-
- state[i] = state[j];
- }
- }
numsnpairs += totaltrans;
}
}
- else if (sym >= 'A' && sym <= 'Z' && caseins)
- flexfatal (_
- ("consistency check failed in symfollowset"));
-
else if (sym == SYM_EPSILON) { /* do nothing */
}
@item (r)
match an @samp{r}; parentheses are used to override precedence (see below)
+@item (?r-s:pattern)
+apply option @samp{r} and omit option @samp{s} while interpreting pattern.
+Options may be zero or more of the characters @samp{i}, @samp{s}, or @samp{x}.
+
+@samp{i} means case-insensitive. @samp{-i} means case-sensitive.
+
+@samp{s} alters the meaning of the @samp{.} syntax to match any single byte whatsoever.
+@samp{-s} alters the meaning of @samp{.} to match any byte except @samp{\n}.
+
+@samp{x} ignores comments and whitespace in patterns. Whitespace is ignored unless
+it is backslash-escaped, contained within @samp{""}s, or appears inside a
+character class. TODO -- Do we ignore Perl comments, C comments, or both?
+
+The following are all valid:
+
+@verbatim
+(?:foo) same as (foo)
+(?i:ab7) same as ([aA][bB]7)
+(?-i:ab) same as (ab)
+(?s:.) same as [\x00-\xFF]
+(?-s:.) same as [^\n]
+(?ix-s: a . b) same as ([Aa][^\n][bB])
+(?x:a b) same as ("ab")
+(?x:a\ b) same as ("a b")
+(?x:a" "b) same as ("a b")
+(?x:a[ ]b) same as ("a b")
+@end verbatim
+
@item (?# comment )
omit everything within @samp{()}. The first @samp{)}
character encountered ends the pattern. It is not possible for the comment
* nowarn - if true (-w), do not generate warnings
* spprdflt - if true (-s), suppress the default rule
* interactive - if true (-I), generate an interactive scanner
- * caseins - if true (-i), generate a case-insensitive scanner
* lex_compat - if true (-l), maximize compatibility with AT&T lex
* posix_compat - if true (-X), maximize compatibility with POSIX lex
* do_yylineno - if true, generate code to maintain yylineno
extern int printstats, syntaxerror, eofseen, ddebug, trace, nowarn,
spprdflt;
-extern int interactive, caseins, lex_compat, posix_compat, do_yylineno;
+extern int interactive, lex_compat, posix_compat, do_yylineno;
extern int useecs, fulltbl, usemecs, fullspd;
extern int gen_line_dirs, performance_report, backing_up_report;
extern int reentrant, bison_bridge_lval, bison_bridge_lloc;
/* From "scanflags.h" */
typedef unsigned int scanflags_t;
extern scanflags_t* _sf_stk;
-extern size_t _sf_n, _sf_max; /**< stack of scanner flags. */
+extern size_t _sf_top_ix, _sf_max; /**< stack of scanner flags. */
#define _SF_CASE_INS 0x0001
#define _SF_DOT_ALL 0x0002
#define _SF_SKIP_WS 0x0004
-
-#define sf_top() (_sf_stk[sf_n])
+#define sf_top() (_sf_stk[_sf_top_ix])
#define sf_case_ins() (sf_top() & _SF_CASE_INS)
#define sf_dot_all() (sf_top() & _SF_DOT_ALL)
#define sf_skip_ws() (sf_top() & _SF_SKIP_WS)
#define sf_set_case_ins(X) ((X) ? (sf_top() |= _SF_CASE_INS) : (sf_top() &= ~_SF_CASE_INS))
#define sf_set_dot_all(X) ((X) ? (sf_top() |= _SF_DOT_ALL) : (sf_top() &= ~_SF_DOT_ALL))
#define sf_set_skip_ws(X) ((X) ? (sf_top() |= _SF_SKIP_WS) : (sf_top() &= ~_SF_SKIP_WS))
-
+extern void sf_init(void);
extern void sf_push(void);
extern void sf_pop(void);
(flex_int32_t *) calloc (tbl->td_lolen, sizeof (flex_int32_t));
for (i = 1; i < csize; ++i) {
- if (caseins && isupper (i))
- ecgroup[i] = ecgroup[tolower (i)];
-
ecgroup[i] = ABS (ecgroup[i]);
tdata[i] = ecgroup[i];
}
out_str_dec (get_int32_decl (), "yy_ec", csize);
for (i = 1; i < csize; ++i) {
- if (caseins && (i >= 'A') && (i <= 'Z'))
- ecgroup[i] = ecgroup[clower (i)];
-
ecgroup[i] = ABS (ecgroup[i]);
mkdata (ecgroup[i]);
}
/* these globals are all defined and commented in flexdef.h */
int printstats, syntaxerror, eofseen, ddebug, trace, nowarn, spprdflt;
-int interactive, caseins, lex_compat, posix_compat, do_yylineno,
+int interactive, lex_compat, posix_compat, do_yylineno,
useecs, fulltbl, usemecs;
int fullspd, gen_line_dirs, performance_report, backing_up_report;
int C_plus_plus, long_align, use_read, yytext_is_array, do_yywrap,
putc ('b', stderr);
if (ddebug)
putc ('d', stderr);
- if (caseins)
+ if (sf_case_ins())
putc ('i', stderr);
if (lex_compat)
putc ('l', stderr);
char *arg;
scanopt_t sopt;
- printstats = syntaxerror = trace = spprdflt = caseins = false;
+ printstats = syntaxerror = trace = spprdflt = false;
lex_compat = posix_compat = C_plus_plus = backing_up_report =
ddebug = fulltbl = false;
fullspd = long_align = nowarn = yymore_used = continued_action =
buf_append (&m4defs_buf, &m4defs_init_str, 2);
}
+ sf_init ();
+
/* initialize regex lib */
flex_init_regex();
break;
case OPT_CASE_INSENSITIVE:
- caseins = true;
+ sf_set_case_ins(true);
break;
case OPT_LEX_COMPAT:
{
++rulelen;
- if ( caseins && isupper($1))
- $1 = clower( $1 );
-
if ($1 == nlch)
rule_has_nl[num_rules] = true;
- $$ = mkstate( $1 );
+ if (sf_case_ins() && has_case($1))
+ /* create an alternation, as in (a|A) */
+ $$ = mkor (mkstate($1), mkstate(reverse_case($1)));
+ else
+ $$ = mkstate( $1 );
}
;
fullccl:
ccl : ccl CHAR '-' CHAR
{
- if (caseins)
+ if (sf_case_ins())
{
- /* Squish the character range to lowercase only if BOTH
- * ends of the range are uppercase.
- */
- if (isupper ($2) && isupper ($4))
- {
- $2 = tolower ($2);
- $4 = tolower ($4);
- }
/* If one end of the range has case and the other
* does not, or the cases are different, then we're not
* sure what range the user is trying to express.
* Examples: [@-z] or [S-t]
*/
- else if (has_case ($2) != has_case ($4)
- || (has_case ($2) && (b_islower ($2) != b_islower ($4))))
+ if (has_case ($2) != has_case ($4)
+ || (has_case ($2) && (b_islower ($2) != b_islower ($4)))
+ || (has_case ($2) && (b_isupper ($2) != b_isupper ($4))))
format_warn3 (
_("the character range [%c-%c] is ambiguous in a case-insensitive scanner"),
$2, $4);
*/
cclsorted = cclsorted && ($2 > lastchar);
lastchar = $4;
+
+ /* Do it again for upper/lowercase */
+ if (sf_case_ins() && has_case($2) && has_case($4)){
+ $2 = reverse_case ($2);
+ $4 = reverse_case ($4);
+
+ for ( i = $2; i <= $4; ++i )
+ ccladd( $1, i );
+
+ cclsorted = cclsorted && ($2 > lastchar);
+ lastchar = $4;
+ }
+
}
$$ = $1;
| ccl CHAR
{
- if ( caseins && isupper($2))
- $2 = clower( $2 );
-
ccladd( $1, $2 );
cclsorted = cclsorted && ($2 > lastchar);
lastchar = $2;
+
+ /* Do it again for upper/lowercase.
+  * $1 identifies the character class being built (it is what
+  * ccladd() receives and what $$ hands back below), so it must
+  * not be overwritten; only the character $2 is flipped to its
+  * opposite case, mirroring how the range rule flips $2/$4
+  * before adding them a second time.
+  */
+ if (sf_case_ins() && has_case($2)){
+ $2 = reverse_case ($2);
+ ccladd ($1, $2);
+
+ /* Ordering bookkeeping is done on the flipped character. */
+ cclsorted = cclsorted && ($2 > lastchar);
+ lastchar = $2;
+ }
+
$$ = $1;
}
| CCE_CNTRL { CCL_EXPR(iscntrl); }
| CCE_DIGIT { CCL_EXPR(isdigit); }
| CCE_GRAPH { CCL_EXPR(isgraph); }
- | CCE_LOWER { CCL_EXPR(islower); }
+ | CCE_LOWER {
+ CCL_EXPR(islower);
+ if (sf_case_ins())
+ CCL_EXPR(isupper);
+ }
| CCE_PRINT { CCL_EXPR(isprint); }
| CCE_PUNCT { CCL_EXPR(ispunct); }
| CCE_SPACE { CCL_EXPR(isspace); }
| CCE_XDIGIT { CCL_EXPR(isxdigit); }
| CCE_UPPER {
- if ( caseins )
- CCL_EXPR(islower);
- else
- CCL_EXPR(isupper);
+ CCL_EXPR(isupper);
+ if (sf_case_ins())
+ CCL_EXPR(islower);
}
| CCE_NEG_ALNUM { CCL_NEG_EXPR(isalnum); }
| CCE_NEG_SPACE { CCL_NEG_EXPR(isspace); }
| CCE_NEG_XDIGIT { CCL_NEG_EXPR(isxdigit); }
| CCE_NEG_LOWER {
- if ( caseins )
+ if ( sf_case_ins() )
warn(_("[:^lower:] is ambiguous in case insensitive scanner"));
else
CCL_NEG_EXPR(islower);
}
| CCE_NEG_UPPER {
- if ( caseins )
+ if ( sf_case_ins() )
warn(_("[:^upper:] ambiguous in case insensitive scanner"));
else
CCL_NEG_EXPR(isupper);
string : string CHAR
{
- if ( caseins && isupper($2))
- $2 = clower( $2 );
-
if ( $2 == nlch )
rule_has_nl[num_rules] = true;
++rulelen;
- $$ = link_machines( $1, mkstate( $2 ) );
+ if (sf_case_ins() && has_case($2))
+ $$ = mkor (mkstate($2), mkstate(reverse_case($2)));
+ else
+ $$ = mkstate ($2);
+
+ $$ = link_machines( $1, $$);
}
|
bison_bridge_lval = true;
}
"c++" C_plus_plus = option_sense;
- caseful|case-sensitive caseins = ! option_sense;
- caseless|case-insensitive caseins = option_sense;
+ caseful|case-sensitive sf_set_case_ins(!option_sense);
+ caseless|case-insensitive sf_set_case_ins(option_sense);
debug ddebug = option_sense;
default spprdflt = ! option_sense;
ecs useecs = option_sense;
}
"(?#" BEGIN(EXTENDED_COMMENT);
- "(?" BEGIN(GROUP_WITH_PARAMS); return '('; /* TODO: push parameterized rule state. */
- "(" return '('; /* TODO: push parameterized rule state. */
- ")" return ')'; /* TODO: pop parameterized rule state. */
+ "(?" sf_push(); BEGIN(GROUP_WITH_PARAMS); return '(';
+ "(" sf_push(); return '(';
+ ")" sf_pop(); return ')';
[/|*+?.(){}] return (unsigned char) yytext[0];
. RETURNCHAR;
<GROUP_WITH_PARAMS>{
":" BEGIN(SECT2);
"-" BEGIN(GROUP_MINUS_PARAMS);
- i ; /* TODO: temporarily case-insensitive. */
- s ; /* TODO: temporary dot-all. */
+ i sf_set_case_ins(1);
+ s sf_set_dot_all(1);
+ x sf_set_skip_ws(1);
}
<GROUP_MINUS_PARAMS>{
":" BEGIN(SECT2);
- i ; /* TODO: temporarily NOT case-insensitive. */
- s ; /* TODO: temporarily NOT dot-all. */
+ i sf_set_case_ins(0);
+ s sf_set_dot_all(0);
+ x sf_set_skip_ws(0);
}
<FIRSTCCL>{
#include "flexdef.h"
scanflags_t* _sf_stk = NULL;
-size_t _sf_n=0, _sf_max=0;
+size_t _sf_top_ix=0, _sf_max=0;
void
sf_push (void)
{
- if (_sf_n + 1 >= _sf_max)
+ if (_sf_top_ix + 1 >= _sf_max)
_sf_stk = (scanflags_t*) flex_realloc ( (void*) _sf_stk, sizeof(scanflags_t) * (_sf_max += 32));
// copy the top element
- _sf_stk[_sf_n + 1] = _sf_stk[_sf_n];
- ++_sf_n;
+ _sf_stk[_sf_top_ix + 1] = _sf_stk[_sf_top_ix];
+ ++_sf_top_ix;
}
void
sf_pop (void)
{
- assert(_sf_n > 0);
- --_sf_n;
+ assert(_sf_top_ix > 0);
+ --_sf_top_ix;
}
/* one-time initialization. Should be called before any sf_ functions. */
{
assert(_sf_stk == NULL);
_sf_stk = (scanflags_t*) flex_alloc ( sizeof(scanflags_t) * (_sf_max = 32));
- _sf_stk[_sf_n] = 0;
+ _sf_stk[_sf_top_ix] = 0;
}
/* vim:set expandtab cindent tabstop=4 softtabstop=4 shiftwidth=4 textwidth=0: */
#include "config.h"
/*#include "parser.h" */
+#define err_abort() do{printf("ERROR: flex line %d. input line %d.\n", __LINE__, yylineno); abort();} while(0)
%}
%option 8bit outfile="scanner.c" prefix="test"
^"abcd-bc:"([abcd]{-}[bc])+@abcd-bc@\n printf("OK: %s", yytext); ++yylineno; return 1;
^"abcde-b-c:"([abcde]{-}[b]{-}[c])+@abcde-b-c@\n printf("OK: %s", yytext); ++yylineno; return 1;
^"^XY-^XYZ:"([^XY]{-}[^XYZ])+@^XY-^XYZ@\n printf("OK: %s", yytext); ++yylineno; return 1;
+^"ia:"(?i:a)+@ia@\n printf("OK: %s", yytext); ++yylineno; return 1;
+^"iabc:"(?i:abc)+@iabc@\n printf("OK: %s", yytext); ++yylineno; return 1;
+^"ia-c:"(?i:[a-c]+)@ia-c@\n printf("OK: %s", yytext); ++yylineno; return 1;
-.|\n {
- printf("ERROR: at line %d\n", yylineno);
- abort();
- }
+ /* We don't want this one to match. */
+^"check-a:"(?i:(?-i:A))@\n err_abort();
+^"check-a:"(?i:(?-i:(?i:A)))@\n printf("OK: %s", yytext); ++yylineno; return 1;
+
+.|\n { err_abort(); }
%%
int main(void);
abcd-bc:aaaaddddaaaa@abcd-bc@
abcde-b-c:aaaaddddeeee@abcde-b-c@
^XY-^XYZ:ZZZZZZZZZZZ@^XY-^XYZ@
+ia:AaAa@ia@
+iabc:ABCabcAbCaBc@iabc@
+ia-c:ABCabcAbCaBc@ia-c@
+check-a:a@