From 114cc57e7f149f1695572b5ee37fadd104a0e480 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Mariusz=20Pluci=C5=84ski?= Date: Sat, 21 Jun 2014 00:38:00 +0200 Subject: [PATCH] Add implementation for charset support --- src/flex.skl | 206 +++++++++++++++++++++++++++++++++++++++++++++++++- src/flexdef.h | 6 ++ src/main.c | 35 ++++++++- src/parse.y | 6 +- src/scan.l | 2 + 5 files changed, 248 insertions(+), 7 deletions(-) diff --git a/src/flex.skl b/src/flex.skl index 65f88b4..5738453 100644 --- a/src/flex.skl +++ b/src/flex.skl @@ -77,6 +77,10 @@ m4_ifelse(M4_YY_PREFIX,yy,, #define yyalloc M4_YY_PREFIX[[alloc]] #define yyrealloc M4_YY_PREFIX[[realloc]] #define yyfree M4_YY_PREFIX[[free]] +m4_ifdef( [[M4_YY_CHARSET]], [[ +#define yycharset M4_YY_PREFIX[[charset]] +#define yycharset_handler M4_YY_PREFIX[[charset_handler]] +]]) ) %endif %endif @@ -152,6 +156,12 @@ m4preproc_define(`M4_GEN_PREFIX', [[ M4_GEN_PREFIX(`get_column') M4_GEN_PREFIX(`set_column') + m4_ifdef( [[M4_YY_CHARSET]], [[ + M4_GEN_PREFIX(`get_charset') + M4_GEN_PREFIX(`set_charset') + M4_GEN_PREFIX(`get_charset_handler') + M4_GEN_PREFIX(`set_charset_handler') + ]]) ]]) M4_GEN_PREFIX(`wrap') %endif @@ -326,6 +336,10 @@ m4_define( [[M4_YY_DOC_PARAM]], [[@param yyscanner The scanner object.]]) #define yytext YY_G(yytext_r) #define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno) #define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column) +m4_ifdef( [[M4_YY_CHARSET]], [[ +#define yycharset YY_G(yycharset_r) /* charset name and handler are per-scanner fields of struct yyguts_t */ +#define yycharset_handler YY_G(yycharset_handler_r) +]]) #define yy_flex_debug YY_G(yy_flex_debug_r) m4_define( [[M4_YY_INCR_LINENO]], @@ -400,6 +414,19 @@ m4_ifdef( [[M4_YY_NO_ANSI_FUNC_DEFS]], $3 $4; [[\]] $5 $6; [[\]] M4_YY_DECL_LAST_ARG]]) + m4_define( [[YYFARGS4]], [[($2,$4,$6,$8 M4_YY_DEF_LAST_ARG) [[\]] + $1 $2; [[\]] + $3 $4; [[\]] + $5 $6; [[\]] + $7 $8; [[\]] + M4_YY_DECL_LAST_ARG]]) + m4_define( [[YYFARGS5]], [[($2,$4,$6,$8,$10 M4_YY_DEF_LAST_ARG) [[\]] + $1 $2; [[\]] + $3 $4; [[\]] + $5 $6; [[\]] + 
$7 $8; [[\]] + $9 $10; [[\]] + M4_YY_DECL_LAST_ARG]]) ]], [[ %# Generate C99 function defs. @@ -407,6 +434,10 @@ m4_ifdef( [[M4_YY_NO_ANSI_FUNC_DEFS]], m4_define( [[YYFARGS1]], [[($1 $2 M4_YY_DEF_LAST_ARG)]]) m4_define( [[YYFARGS2]], [[($1 $2, $3 $4 M4_YY_DEF_LAST_ARG)]]) m4_define( [[YYFARGS3]], [[($1 $2, $3 $4, $5 $6 M4_YY_DEF_LAST_ARG)]]) + m4_define( [[YYFARGS4]], [[($1 $2, $3 $4, $5 $6, $7 $8 + M4_YY_DEF_LAST_ARG)]]) + m4_define( [[YYFARGS5]], [[($1 $2, $3 $4, $5 $6, $7 $8, $9 $10 + M4_YY_DEF_LAST_ARG)]]) ]]) m4_ifdef( [[M4_YY_NOT_IN_HEADER]], @@ -483,6 +514,9 @@ extern yy_size_t yyleng; %if-c-only %if-not-reentrant extern FILE *yyin, *yyout; +m4_ifdef( [[M4_YY_CHARSET]], [[ +extern char *yycharset; +]]) %endif %endif @@ -604,6 +638,12 @@ struct yy_buffer_state int yy_fill_buffer; int yy_buffer_status; + +m4_ifdef( [[M4_YY_CHARSET]],[[ + char *yy_input_buffer; /* input bytes read but not yet converted; NULL after buffer init */ + size_t yy_input_buffer_length; /* number of bytes held in yy_input_buffer */ +]]) + m4_ifdef( [[M4_YY_NOT_IN_HEADER]], [[ #define YY_BUFFER_NEW 0 @@ -883,6 +923,11 @@ m4_ifdef( [[]], YYLTYPE * yylloc_r; ]]) +m4_ifdef( [[M4_YY_CHARSET]], [[ + char *yycharset_r; /** current charset name */ + yycharset_handler_t yycharset_handler_r; /** charset handle function */ +]]) + }; /* end struct yyguts_t */ ]]) @@ -1003,6 +1048,27 @@ void yyset_column M4_YY_PARAMS( int _column_no M4_YY_PROTO_LAST_ARG ); ]]) ]]) +m4_ifdef( [[M4_YY_REENTRANT]],[[ +m4_ifdef( [[M4_YY_CHARSET]],[[ +m4_ifdef( [[M4_YY_NO_GET_CHARSET]],,[[ +char *yyget_charset M4_YY_PARAMS( M4_YY_PROTO_ONLY_ARG ); +yycharset_handler_t yyget_charset_handler M4_YY_PARAMS( M4_YY_PROTO_ONLY_ARG ); +]]) +]]) +]]) + +m4_ifdef( [[M4_YY_REENTRANT]],[[ +/* YY_REENTRANT */ +m4_ifdef( [[M4_YY_CHARSET]], [[ +/* YY_CHARSET */ +m4_ifdef( [[M4_YY_NO_SET_CHARSET]],,[[ +/* !YY_NO_SET_CHARSET */ +void yyset_charset M4_YY_PARAMS( char *charset M4_YY_PROTO_LAST_ARG ); +void yyset_charset_handler M4_YY_PARAMS( yycharset_handler_t charset_handler M4_YY_PROTO_LAST_ARG ); +]]) +]]) +]]) + %if-bison-bridge m4_ifdef( 
[[M4_YY_NO_GET_LVAL]],, [[ @@ -1140,13 +1206,14 @@ m4_ifdef( [[M4_YY_NOT_IN_HEADER]], */ #ifndef YY_INPUT #define YY_INPUT(buf,result,max_size) \ +do {\ %% [5.0] fread()/read() definition of YY_INPUT goes here unless we're doing C++ \ \ %if-c++-only C++ definition \ - if ( (int)(result = LexerInput( (char *) buf, max_size )) < 0 ) \ - YY_FATAL_ERROR( "input in flex scanner failed" ); -%endif - + if ( (int)(result = LexerInput( (char *) buf, max_size )) < 0 ) /* result may be unsigned: test the int value */ \ + YY_FATAL_ERROR( "input in flex scanner failed" ); \ +%endif \ +} while(0) #endif ]]) @@ -1622,6 +1689,36 @@ void yyFlexLexer::LexerOutput( const char* buf, int size ) %ok-for-header %endif +m4_ifdef( [[M4_YY_NOT_IN_HEADER]],[[ +m4_ifdef( [[M4_YY_CHARSET]],[[ +/* yycharset_convert - convert incoming data from arbitrary + * charset into internal representation; returns the number of characters written to target and stores the number of source bytes consumed in *converted_bytes + */ +static size_t yycharset_convert YYFARGS5( + char*, source, size_t, source_bytes, + YY_CHAR*, target, size_t, target_length, + size_t*, converted_bytes) { + M4_YY_DECL_GUTS_VAR(); + if(strcmp(yycharset, "M4_YY_CHARSET_SOURCE")==0) { + if(target_length < source_bytes) + YY_FATAL_ERROR("Too small buffer"); + memcpy((char*)target, source, source_bytes); /* raw input bytes: the copy must not stop at a NUL */ + *converted_bytes = source_bytes; + return source_bytes; + } else if(yycharset_handler) + return yycharset_handler(yycharset, source, source_bytes, + target, target_length, converted_bytes M4_YY_CALL_LAST_ARG); + else { + char msg[256]; + snprintf(msg, sizeof(msg), + "Unsupported character encoding: %s", yycharset); + YY_FATAL_ERROR(msg); + } + return 0; +} +]]) +]]) + m4_ifdef( [[M4_YY_NOT_IN_HEADER]], [[ /* yy_get_next_buffer - try to read in a new buffer @@ -1733,6 +1830,43 @@ m4_ifdef( [[M4_YY_USES_REJECT]], num_to_read = YY_READ_BUF_SIZE; /* Read in more data. 
*/ +m4_ifdef([[M4_YY_CHARSET]],[[ + if(yycharset) { + const size_t max_size = YY_READ_BUF_SIZE * sizeof(YY_CHAR); + char buffer[max_size]; + if ( YY_CURRENT_BUFFER_LVALUE->yy_input_buffer ) /* still NULL before the first read: nothing to carry over */ + memcpy(buffer, YY_CURRENT_BUFFER_LVALUE->yy_input_buffer, YY_CURRENT_BUFFER_LVALUE->yy_input_buffer_length); + + size_t read_bytes, converted_characters; + YY_INPUT( + (&buffer[YY_CURRENT_BUFFER_LVALUE->yy_input_buffer_length]), + (read_bytes), + (max_size-YY_CURRENT_BUFFER_LVALUE->yy_input_buffer_length) + ); + size_t converted_bytes = 0; + converted_characters = yycharset_convert( + buffer, + YY_CURRENT_BUFFER_LVALUE->yy_input_buffer_length+read_bytes, + &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[number_to_move], + num_to_read, + &converted_bytes M4_YY_CALL_LAST_ARG); + if(converted_characters == 0 && read_bytes != 0) + YY_FATAL_ERROR("Could not convert input characters"); + YY_G(yy_n_chars) = converted_characters; + + /* store left bytes in yy_input_buffer */ + YY_CURRENT_BUFFER_LVALUE->yy_input_buffer_length += + read_bytes-converted_bytes; + + YY_CURRENT_BUFFER_LVALUE->yy_input_buffer = (char*)yyrealloc( + (void*) YY_CURRENT_BUFFER_LVALUE->yy_input_buffer, + YY_CURRENT_BUFFER_LVALUE->yy_input_buffer_length+1 /* never request zero bytes */ + M4_YY_CALL_LAST_ARG); + if ( ! YY_CURRENT_BUFFER_LVALUE->yy_input_buffer ) + YY_FATAL_ERROR( "out of dynamic memory in yy_get_next_buffer()" ); + memcpy(YY_CURRENT_BUFFER_LVALUE->yy_input_buffer, &buffer[converted_bytes], YY_CURRENT_BUFFER_LVALUE->yy_input_buffer_length); + } else +]]) YY_INPUT( (&YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[number_to_move]), YY_G(yy_n_chars), num_to_read ); @@ -2155,6 +2289,11 @@ m4_ifdef( [[M4_YY_ALWAYS_INTERACTIVE]], b->yy_is_interactive = 0; %endif errno = oerrno; + +m4_ifdef([[M4_YY_CHARSET]],[[ + b->yy_input_buffer = NULL; + b->yy_input_buffer_length = 0; +]]) } /** Discard all buffered characters. On the next scan, YY_INPUT will be called. 
@@ -2589,6 +2728,32 @@ int yyget_column YYFARGS0(void) ]]) ]]) +m4_ifdef( [[M4_YY_REENTRANT]],[[ +m4_ifdef( [[M4_YY_CHARSET]], [[ +m4_ifdef( [[M4_YY_NO_GET_CHARSET]],,[[ +/** Get the currently set charset name + * M4_YY_DOC_PARAM + */ +char *yyget_charset YYFARGS0(void) +{ + M4_YY_DECL_GUTS_VAR(); + return yycharset; +} +]]) + +m4_ifdef( [[M4_YY_NO_GET_CHARSET_HANDLER]],,[[ +/** Get the currently set charset handler + * M4_YY_DOC_PARAM + */ +yycharset_handler_t yyget_charset_handler YYFARGS0(void) +{ + M4_YY_DECL_GUTS_VAR(); + return yycharset_handler; +} +]]) +]]) +]]) + m4_ifdef( [[M4_YY_NO_GET_IN]],, [[ /** Get the input stream. @@ -2695,6 +2860,34 @@ void yyset_column YYFARGS1( int , _column_no) ]]) ]]) +m4_ifdef( [[M4_YY_REENTRANT]],[[ +m4_ifdef( [[M4_YY_CHARSET]], [[ +m4_ifdef( [[M4_YY_NO_SET_CHARSET]],,[[ +/** Set the current charset name + * @param charset charset name; the scanner keeps its own copy and NULL clears the current charset + * M4_YY_DOC_PARAM + */ +void yyset_charset YYFARGS1( char*, charset) +{ + M4_YY_DECL_GUTS_VAR(); + { char *yy_cs_copy = charset ? strdup(charset) : NULL; free(yycharset); yycharset = yy_cs_copy; } /* copy first so passing the current value back in stays safe; then release the old copy */ +} +]]) + +m4_ifdef( [[M4_YY_NO_SET_CHARSET_HANDLER]],,[[ +/** Set the current charset handler + * @param charset_handler handler function + * M4_YY_DOC_PARAM + */ +void yyset_charset_handler YYFARGS1( yycharset_handler_t, charset_handler) +{ + M4_YY_DECL_GUTS_VAR(); + yycharset_handler = charset_handler; +} +]]) +]]) +]]) + m4_ifdef( [[M4_YY_NO_SET_IN]],, [[ @@ -2910,6 +3103,11 @@ m4_ifdef( [[M4_YY_TEXT_IS_ARRAY]], YY_G(yy_prev_more_offset) = 0; ]]) +m4_ifdef( [[M4_YY_CHARSET]],[[ + yycharset = NULL; + yycharset_handler = NULL; +]]) + /* Defined in main.c */ #ifdef YY_STDINIT yyin = stdin; diff --git a/src/flexdef.h b/src/flexdef.h index 506959a..7efef4a 100644 --- a/src/flexdef.h +++ b/src/flexdef.h @@ -396,6 +396,7 @@ char *alloca (); * of what we think based on references to it in the user's actions. 
* reject_really_used - same for REJECT * trace_hex - use hexadecimal numbers in trace/debug outputs instead of octals + * charset_enabled - true if charset interface has been enabled */ extern int printstats, syntaxerror, eofseen, ddebug, trace, nowarn, @@ -411,6 +412,7 @@ extern int yymore_used, reject, real_reject, continued_action, in_rule; extern int yymore_really_used, reject_really_used; extern int trace_hex; +extern bool charset_enabled; /* Variables used in the flex input routines: * datapos - characters on current output line @@ -435,6 +437,8 @@ extern int trace_hex; * num_input_files - size of input_files array * program_name - name with which program was invoked * + * charset_source - character set that has been declared as used in source file + * * action_array - array to hold the rule actions * action_size - size of action_array * defs1_offset - index where the user's section 1 definitions start @@ -457,6 +461,8 @@ extern char **input_files; extern int num_input_files; extern char *program_name; +extern char *charset_source; + extern char *action_array; extern int action_size; extern int defs1_offset, prolog_offset, action_offset, action_index; diff --git a/src/main.c b/src/main.c index befbb3c..4671a08 100644 --- a/src/main.c +++ b/src/main.c @@ -107,7 +107,8 @@ int num_input_files; jmp_buf flex_main_jmp_buf; bool *rule_has_nl, *ccl_has_nl; int nlch = '\n'; -bool ansi_func_defs, ansi_func_protos; +bool ansi_func_defs, ansi_func_protos, charset_enabled = false; +char *charset_source = NULL; /* stays NULL when no source charset is declared */ bool tablesext, tablesverify, gentables; char *tablesfilename=0,*tablesname=0; @@ -477,6 +478,12 @@ void check_options () if (do_yylineno) buf_m4_define (&m4defs_buf, "M4_YY_USE_LINENO", NULL); + if(charset_enabled) /* lets the skeleton emit the charset interface */ + buf_m4_define(&m4defs_buf, "M4_YY_CHARSET", NULL); + + if(charset_source) /* the macro is left undefined when no source charset was declared */ + buf_m4_define(&m4defs_buf, "M4_YY_CHARSET_SOURCE", charset_source); + /* Create the alignment type. */ buf_strdefine (&userdef_buf, "YY_INT_ALIGNED", long_align ? 
"long int" : "short int"); @@ -1506,6 +1513,10 @@ void readin () static char character_type_char[] = "typedef char YY_CHAR;"; static char character_defined[] = "#define YY_CHAR_DEFINED"; + static char charset_handler_t[] = "typedef size_t(*yycharset_handler_t)(char*,char*,size_t,YY_CHAR*,size_t,size_t*);\n"; /* handler signature emitted for non-reentrant scanners */ + static char charset_handler_t_reentrant[] = "typedef size_t(*yycharset_handler_t)(char*,char*,size_t,YY_CHAR*,size_t,size_t*,yyscan_t);\n"; /* same signature with a trailing yyscan_t argument */ + + line_directive_out ((FILE *) 0, 1); if (yyparse ()) { @@ -1737,6 +1748,12 @@ void readin () outn ("extern YY_CHAR yytext[];\n"); } else { + /* This prevents warning of "already defined macro" in multiple + * non-reentrant scanners */ + outn("#ifdef yytext_ptr"); + outn("#undef yytext_ptr"); + outn("#endif"); + if (reentrant) { outn ("#define yytext_ptr yytext_r"); } @@ -1751,6 +1768,22 @@ void readin () ("%option yyclass only meaningful for C++ scanners")); } + outn(""); + + if(charset_enabled) { + if(!reentrant) + outn(charset_handler_t); + else + outn(charset_handler_t_reentrant); + + OUT_BEGIN_CODE (); + if(!C_plus_plus && !reentrant) { /* file-scope globals are emitted only for plain C non-reentrant scanners */ + outn("char *yycharset = NULL;"); + outn("yycharset_handler_t yycharset_handler = NULL;"); + } + OUT_END_CODE (); + } + if (useecs) numecs = cre8ecs (nextecm, ecgroup, csize); else diff --git a/src/parse.y b/src/parse.y index 30efb00..6fa87a0 100644 --- a/src/parse.y +++ b/src/parse.y @@ -1,8 +1,8 @@ /* parse.y - parser for flex input */ %token CHAR NUMBER SECTEND SCDECL XSCDECL NAME PREVCCL EOF_OP -%token OPTION_OP OPT_OUTFILE OPT_PREFIX OPT_YYCLASS OPT_HEADER OPT_HEADER_CHAR OPT_EXTRA_TYPE -%token OPT_TABLES +%token OPTION_OP OPT_OUTFILE OPT_PREFIX OPT_YYCLASS OPT_HEADER OPT_HEADER_CHAR +%token OPT_EXTRA_TYPE OPT_TABLES OPT_CHARSET_SOURCE %token CCE_ALNUM CCE_ALPHA CCE_BLANK CCE_CNTRL CCE_DIGIT CCE_GRAPH %token CCE_LOWER CCE_PRINT CCE_PUNCT CCE_SPACE CCE_UPPER CCE_XDIGIT @@ -208,6 +208,8 @@ option : OPT_OUTFILE '=' NAME { headercharfilename = copy_string( nmstr ); } | OPT_TABLES 
'=' NAME { tablesext = true; tablesfilename = copy_string( nmstr ); } + | OPT_CHARSET_SOURCE '=' NAME + { charset_source = copy_string(nmstr); } ; sect2 : sect2 scon initforrule flexrule '\n' diff --git a/src/scan.l b/src/scan.l index 6cbefe6..8fa7db3 100644 --- a/src/scan.l +++ b/src/scan.l @@ -349,6 +349,7 @@ M4QEND "]]" "c++" C_plus_plus = option_sense; caseful|case-sensitive sf_set_case_ins(!option_sense); caseless|case-insensitive sf_set_case_ins(option_sense); + charset charset_enabled = option_sense; /* follow option_sense like the neighbouring options instead of always enabling */ debug ddebug = option_sense; default spprdflt = ! option_sense; ecs useecs = option_sense; @@ -427,6 +428,7 @@ M4QEND "]]" header(-file)? return OPT_HEADER; header-char(-file)? return OPT_HEADER_CHAR; tables-file return OPT_TABLES; + charset-source return OPT_CHARSET_SOURCE; tables-verify { tablesverify = option_sense; if(!tablesext && option_sense) -- 2.50.1