]> granicus.if.org Git - flex/commitdiff
Add implementation for charset support
authorMariusz Pluciński <mplucinski@mplucinski.com>
Fri, 20 Jun 2014 22:38:00 +0000 (00:38 +0200)
committerWill Estes <westes575@gmail.com>
Mon, 1 Dec 2014 00:22:43 +0000 (19:22 -0500)
src/flex.skl
src/flexdef.h
src/main.c
src/parse.y
src/scan.l

index 65f88b438eb048c7cb75b1876ce2e8c5646bdb8f..57384535c912fa4720d0af478e48d5359458ca7b 100644 (file)
@@ -77,6 +77,10 @@ m4_ifelse(M4_YY_PREFIX,yy,,
 #define yyalloc M4_YY_PREFIX[[alloc]]
 #define yyrealloc M4_YY_PREFIX[[realloc]]
 #define yyfree M4_YY_PREFIX[[free]]
+m4_ifdef( [[M4_YY_CHARSET]], [[
+#define yycharset M4_YY_PREFIX[[charset]]
+#define yycharset_handler M4_YY_PREFIX[[charset_handler]]
+]])
 )
 %endif
 %endif
@@ -152,6 +156,12 @@ m4preproc_define(`M4_GEN_PREFIX',
     [[
         M4_GEN_PREFIX(`get_column')
         M4_GEN_PREFIX(`set_column')
+        m4_ifdef( [[M4_YY_CHARSET]], [[
+            M4_GEN_PREFIX(`get_charset')
+            M4_GEN_PREFIX(`set_charset')
+            M4_GEN_PREFIX(`get_charset_handler')
+            M4_GEN_PREFIX(`set_charset_handler')
+        ]])
     ]])
     M4_GEN_PREFIX(`wrap')
 %endif
@@ -326,6 +336,10 @@ m4_define( [[M4_YY_DOC_PARAM]], [[@param yyscanner The scanner object.]])
 #define yytext YY_G(yytext_r)
 #define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno)
 #define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column)
+m4_ifdef( [[M4_YY_CHARSET]], [[
+#define yycharset YY_G(yycharset_r)
+#define yycharset_handler YY_G(yycharset_handler_r)
+]])
 #define yy_flex_debug YY_G(yy_flex_debug_r)
 
 m4_define( [[M4_YY_INCR_LINENO]],
@@ -400,6 +414,19 @@ m4_ifdef( [[M4_YY_NO_ANSI_FUNC_DEFS]],
         $3 $4; [[\]]
         $5 $6; [[\]]
         M4_YY_DECL_LAST_ARG]])
+    m4_define( [[YYFARGS4]], [[($2,$4,$6,$8 M4_YY_DEF_LAST_ARG) [[\]]
+        $1 $2; [[\]]
+        $3 $4; [[\]]
+        $5 $6; [[\]]
+        $7 $8; [[\]]
+        M4_YY_DECL_LAST_ARG]])
+    m4_define( [[YYFARGS5]], [[($2,$4,$6,$8,$10 M4_YY_DEF_LAST_ARG) [[\]]
+        $1 $2; [[\]]
+        $3 $4; [[\]]
+        $5 $6; [[\]]
+        $7 $8; [[\]]
+        $9 $10; [[\]]
+        M4_YY_DECL_LAST_ARG]])
 ]],
 [[
 %# Generate C99 function defs.
@@ -407,6 +434,10 @@ m4_ifdef( [[M4_YY_NO_ANSI_FUNC_DEFS]],
     m4_define( [[YYFARGS1]], [[($1 $2 M4_YY_DEF_LAST_ARG)]])
     m4_define( [[YYFARGS2]], [[($1 $2, $3 $4 M4_YY_DEF_LAST_ARG)]])
     m4_define( [[YYFARGS3]], [[($1 $2, $3 $4, $5 $6 M4_YY_DEF_LAST_ARG)]])
+    m4_define( [[YYFARGS4]], [[($1 $2, $3 $4, $5 $6, $7 $8
+            M4_YY_DEF_LAST_ARG)]])
+    m4_define( [[YYFARGS5]], [[($1 $2, $3 $4, $5 $6, $7 $8, $9 $10
+            M4_YY_DEF_LAST_ARG)]])
 ]])
 
 m4_ifdef( [[M4_YY_NOT_IN_HEADER]],
@@ -483,6 +514,9 @@ extern yy_size_t yyleng;
 %if-c-only
 %if-not-reentrant
 extern FILE *yyin, *yyout;
+m4_ifdef( [[M4_YY_CHARSET]], [[
+extern char *yycharset;
+]])
 %endif
 %endif
 
@@ -604,6 +638,12 @@ struct yy_buffer_state
        int yy_fill_buffer;
 
        int yy_buffer_status;
+
+m4_ifdef( [[M4_YY_CHARSET]],[[
+       char *yy_input_buffer;
+       size_t yy_input_buffer_length;
+]])
+
 m4_ifdef( [[M4_YY_NOT_IN_HEADER]],
 [[
 #define YY_BUFFER_NEW 0
@@ -883,6 +923,11 @@ m4_ifdef( [[<M4_YY_BISON_LLOC>]],
     YYLTYPE * yylloc_r;
 ]])
 
+m4_ifdef( [[M4_YY_CHARSET]], [[
+    char *yycharset_r; /** current charset name */
+    yycharset_handler_t yycharset_handler_r; /** charset handle function */
+]])
+
     }; /* end struct yyguts_t */
 ]])
 
@@ -1003,6 +1048,27 @@ void yyset_column M4_YY_PARAMS( int _column_no M4_YY_PROTO_LAST_ARG );
 ]])
 ]])
 
+m4_ifdef( [[M4_YY_REENTRANT]],[[
+m4_ifdef( [[M4_YY_CHARSET]],[[
+/* Accessors for the charset name and the charset conversion handler.
+ * Each prototype is guarded by the same option as its definition, so the
+ * handler accessors can be suppressed independently of the name accessors.
+ */
+m4_ifdef( [[M4_YY_NO_GET_CHARSET]],,[[
+char *yyget_charset M4_YY_PARAMS( M4_YY_PROTO_ONLY_ARG );
+]])
+m4_ifdef( [[M4_YY_NO_GET_CHARSET_HANDLER]],,[[
+yycharset_handler_t yyget_charset_handler M4_YY_PARAMS( M4_YY_PROTO_ONLY_ARG );
+]])
+m4_ifdef( [[M4_YY_NO_SET_CHARSET]],,[[
+void yyset_charset M4_YY_PARAMS( char *charset M4_YY_PROTO_LAST_ARG );
+]])
+m4_ifdef( [[M4_YY_NO_SET_CHARSET_HANDLER]],,[[
+void yyset_charset_handler M4_YY_PARAMS( yycharset_handler_t charset_handler M4_YY_PROTO_LAST_ARG );
+]])
+]])
+]])
+
 %if-bison-bridge
 m4_ifdef( [[M4_YY_NO_GET_LVAL]],,
 [[
@@ -1140,13 +1206,14 @@ m4_ifdef( [[M4_YY_NOT_IN_HEADER]],
  */
 #ifndef YY_INPUT
 #define YY_INPUT(buf,result,max_size) \
+do {\
 %% [5.0] fread()/read() definition of YY_INPUT goes here unless we're doing C++ \
 \
 %if-c++-only C++ definition \
-       if ( (int)(result = LexerInput( (char *) buf, max_size )) < 0 ) \
-               YY_FATAL_ERROR( "input in flex scanner failed" );
-%endif
-
+       if ( (result = LexerInput( (char *) buf, max_size )) < 0 ) \
+               YY_FATAL_ERROR( "input in flex scanner failed" ); \
+%endif \
+} while(0)
 #endif
 ]])
 
@@ -1622,6 +1689,36 @@ void yyFlexLexer::LexerOutput( const char* buf, int size )
 %ok-for-header
 %endif
 
+m4_ifdef( [[M4_YY_NOT_IN_HEADER]],[[
+m4_ifdef( [[M4_YY_CHARSET]],[[
+/* yycharset_convert - convert source_bytes bytes of input in charset yycharset
+ *    into at most target_length characters at target; returns the character
+ *    count and stores the number of source bytes consumed in *converted_bytes */
+static size_t yycharset_convert YYFARGS5(
+        char*, source, size_t, source_bytes,
+        YY_CHAR*, target, size_t, target_length,
+        size_t*, converted_bytes) {
+    M4_YY_DECL_GUTS_VAR();
+    if(strcmp(yycharset, "M4_YY_CHARSET_SOURCE")==0) {
+        if(target_length < source_bytes)
+            YY_FATAL_ERROR("Too small buffer");
+        memcpy(target, source, source_bytes); /* not strncpy: input may hold NUL bytes */
+        *converted_bytes = source_bytes;
+        return source_bytes;
+    } else if(yycharset_handler)
+        return yycharset_handler(yycharset, source, source_bytes,
+            target, target_length, converted_bytes M4_YY_CALL_LAST_ARG);
+    else {
+        char msg[256];
+        snprintf(msg, sizeof(msg),
+            "Unsupported character encoding: %s", yycharset);
+        YY_FATAL_ERROR(msg);
+    }
+    return 0;
+}
+]])
+]])
+
 m4_ifdef( [[M4_YY_NOT_IN_HEADER]],
 [[
 /* yy_get_next_buffer - try to read in a new buffer
@@ -1733,6 +1830,43 @@ m4_ifdef( [[M4_YY_USES_REJECT]],
                        num_to_read = YY_READ_BUF_SIZE;
 
                /* Read in more data. */
+m4_ifdef([[M4_YY_CHARSET]],[[
+               if(yycharset) {
+                       const size_t max_size = YY_READ_BUF_SIZE * sizeof(YY_CHAR);
+                       char buffer[max_size];
+                       memcpy(buffer, YY_CURRENT_BUFFER_LVALUE->yy_input_buffer,
+                                       YY_CURRENT_BUFFER_LVALUE->yy_input_buffer_length);
+
+                       size_t read_bytes, converted_characters;
+                       YY_INPUT(
+                                       (&buffer[YY_CURRENT_BUFFER_LVALUE->yy_input_buffer_length]),
+                                       (read_bytes),
+                                       (max_size-YY_CURRENT_BUFFER_LVALUE->yy_input_buffer_length)
+                       );
+                       size_t converted_bytes = 0;
+                       converted_characters = yycharset_convert(
+                                       buffer,
+                                       YY_CURRENT_BUFFER_LVALUE->yy_input_buffer_length+read_bytes,
+                                       &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[number_to_move],
+                                       num_to_read,
+                                       &converted_bytes M4_YY_CALL_LAST_ARG);
+                       if(converted_characters == 0 && read_bytes != 0)
+                           YY_FATAL_ERROR("Could not convert input characters");
+                       YY_G(yy_n_chars) = converted_characters;
+
+                       /* store left bytes in yy_input_buffer */
+                       YY_CURRENT_BUFFER_LVALUE->yy_input_buffer_length +=
+                                       read_bytes-converted_bytes;
+
+                       YY_CURRENT_BUFFER_LVALUE->yy_input_buffer = (char*)yyrealloc(
+                                       (void*) YY_CURRENT_BUFFER_LVALUE->yy_input_buffer,
+                                       YY_CURRENT_BUFFER_LVALUE->yy_input_buffer_length
+                                       M4_YY_CALL_LAST_ARG);
+                       memcpy(YY_CURRENT_BUFFER_LVALUE->yy_input_buffer,
+                                       &buffer[converted_bytes],
+                                       YY_CURRENT_BUFFER_LVALUE->yy_input_buffer_length);
+               } else
+]])
                YY_INPUT( (&YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[number_to_move]),
                        YY_G(yy_n_chars), num_to_read );
 
@@ -2155,6 +2289,11 @@ m4_ifdef( [[M4_YY_ALWAYS_INTERACTIVE]],
        b->yy_is_interactive = 0;
 %endif
        errno = oerrno;
+
+m4_ifdef([[M4_YY_CHARSET]],[[
+       b->yy_input_buffer = NULL;
+       b->yy_input_buffer_length = 0;
+]])
 }
 
 /** Discard all buffered characters. On the next scan, YY_INPUT will be called.
@@ -2589,6 +2728,32 @@ int yyget_column  YYFARGS0(void)
 ]])
 ]])
 
+m4_ifdef( [[M4_YY_REENTRANT]],[[
+m4_ifdef( [[M4_YY_CHARSET]], [[
+m4_ifdef( [[M4_YY_NO_GET_CHARSET]],,[[
+/** Get the currently set charset name (the stored pointer, not a copy).
+ * M4_YY_DOC_PARAM
+ */
+char *yyget_charset  YYFARGS0(void)
+{
+    M4_YY_DECL_GUTS_VAR();
+    return yycharset;
+}
+]])
+
+m4_ifdef( [[M4_YY_NO_GET_CHARSET_HANDLER]],,[[
+/** Get the currently set charset handler (the stored function pointer).
+ * M4_YY_DOC_PARAM
+ */
+yycharset_handler_t yyget_charset_handler YYFARGS0(void)
+{
+    M4_YY_DECL_GUTS_VAR();
+    return yycharset_handler;
+}
+]])
+]])
+]])
+
 m4_ifdef( [[M4_YY_NO_GET_IN]],,
 [[
 /** Get the input stream.
@@ -2695,6 +2860,41 @@ void yyset_column YYFARGS1( int , _column_no)
 ]])
 ]])
 
+m4_ifdef( [[M4_YY_REENTRANT]],[[
+m4_ifdef( [[M4_YY_CHARSET]], [[
+m4_ifdef( [[M4_YY_NO_SET_CHARSET]],,[[
+/** Set the current charset name.
+ * The scanner stores its own copy of the name and frees the previously
+ * stored name, which must be NULL or a copy made by this function.
+ * @param charset charset name; NULL clears the current name
+ * M4_YY_DOC_PARAM
+ */
+void yyset_charset YYFARGS1( char*, charset)
+{
+    M4_YY_DECL_GUTS_VAR();
+    /* Copy first, free second: charset may alias the stored name. */
+    char *yy_new_charset = charset ? strdup(charset) : NULL;
+    if(charset && !yy_new_charset)
+        YY_FATAL_ERROR("out of dynamic memory in yyset_charset()");
+    free(yycharset);
+    yycharset = yy_new_charset;
+}
+]])
+
+m4_ifdef( [[M4_YY_NO_SET_CHARSET_HANDLER]],,[[
+/** Set the current charset handler
+ * @param charset_handler handler function
+ * M4_YY_DOC_PARAM
+ */
+void yyset_charset_handler YYFARGS1( yycharset_handler_t, charset_handler)
+{
+    M4_YY_DECL_GUTS_VAR();
+    yycharset_handler = charset_handler;
+}
+]])
+]])
+]])
+
 
 m4_ifdef( [[M4_YY_NO_SET_IN]],,
 [[
@@ -2910,6 +3103,11 @@ m4_ifdef( [[M4_YY_TEXT_IS_ARRAY]],
     YY_G(yy_prev_more_offset) = 0;
 ]])
 
+m4_ifdef( [[M4_YY_CHARSET]],[[
+    yycharset = NULL;
+    yycharset_handler = NULL;
+]])
+
 /* Defined in main.c */
 #ifdef YY_STDINIT
     yyin = stdin;
index 506959a047fa19c46a22e7ae95523a1a2312d09a..7efef4a8291996ad51dcd4a46ee8f8457c958757 100644 (file)
@@ -396,6 +396,7 @@ char *alloca ();
  *   of what we think based on references to it in the user's actions.
  * reject_really_used - same for REJECT
  * trace_hex - use hexadecimal numbers in trace/debug outputs instead of octals
+ * charset_enabled - true if charset interface has been enabled
  */
 
 extern int printstats, syntaxerror, eofseen, ddebug, trace, nowarn,
@@ -411,6 +412,7 @@ extern int yymore_used, reject, real_reject, continued_action, in_rule;
 
 extern int yymore_really_used, reject_really_used;
 extern int trace_hex;
+extern bool charset_enabled;
 
 /* Variables used in the flex input routines:
  * datapos - characters on current output line
@@ -435,6 +437,8 @@ extern int trace_hex;
  * num_input_files - size of input_files array
  * program_name - name with which program was invoked
  *
+ * charset_source - character set that has been declared as used in source file
+ *
  * action_array - array to hold the rule actions
  * action_size - size of action_array
  * defs1_offset - index where the user's section 1 definitions start
@@ -457,6 +461,8 @@ extern char **input_files;
 extern int num_input_files;
 extern char *program_name;
 
+extern char *charset_source;
+
 extern char *action_array;
 extern int action_size;
 extern int defs1_offset, prolog_offset, action_offset, action_index;
index befbb3ce45badfadb6f9a97b0f695508c281e5a6..4671a086ad5a71c4e16118118fbf5f08463f2e6c 100644 (file)
@@ -107,7 +107,8 @@ int     num_input_files;
 jmp_buf flex_main_jmp_buf;
 bool   *rule_has_nl, *ccl_has_nl;
 int     nlch = '\n';
-bool    ansi_func_defs, ansi_func_protos;
+bool    ansi_func_defs, ansi_func_protos, charset_enabled = false;
+char   *charset_source = NULL;
 
 bool    tablesext, tablesverify, gentables;
 char   *tablesfilename=0,*tablesname=0;
@@ -477,6 +478,12 @@ void check_options ()
        if (do_yylineno)
                buf_m4_define (&m4defs_buf, "M4_YY_USE_LINENO", NULL);
 
+       if(charset_enabled)
+               buf_m4_define(&m4defs_buf, "M4_YY_CHARSET", NULL);
+
+       if(charset_source)
+               buf_m4_define(&m4defs_buf, "M4_YY_CHARSET_SOURCE", charset_source);
+
        /* Create the alignment type. */
        buf_strdefine (&userdef_buf, "YY_INT_ALIGNED",
                       long_align ? "long int" : "short int");
@@ -1506,6 +1513,10 @@ void readin ()
        static char character_type_char[] = "typedef char YY_CHAR;";
        static char character_defined[] = "#define YY_CHAR_DEFINED";
 
+       static char charset_handler_t[] = "typedef size_t(*yycharset_handler_t)(char*,char*,size_t,YY_CHAR*,size_t,size_t*);\n";
+       static char charset_handler_t_reentrant[] = "typedef size_t(*yycharset_handler_t)(char*,char*,size_t,YY_CHAR*,size_t,size_t*,yyscan_t);\n";
+
+
        line_directive_out ((FILE *) 0, 1);
 
        if (yyparse ()) {
@@ -1737,6 +1748,12 @@ void readin ()
                                outn ("extern YY_CHAR yytext[];\n");
                }
                else {
+                       /* This prevents warning of "already defined macro" in multiple
+                        * non-reentrant scanners */
+                       outn("#ifdef yytext_ptr");
+                       outn("#undef yytext_ptr");
+                       outn("#endif");
+
                        if (reentrant) {
                                outn ("#define yytext_ptr yytext_r");
                        }
@@ -1751,6 +1768,22 @@ void readin ()
                                   ("%option yyclass only meaningful for C++ scanners"));
        }
 
+       outn("");
+
+       if(charset_enabled) {
+               if(!reentrant)
+                       outn(charset_handler_t);
+               else
+                       outn(charset_handler_t_reentrant);
+
+               OUT_BEGIN_CODE ();
+               if(!C_plus_plus && !reentrant) {
+                       outn("char *yycharset = NULL;");
+                       outn("yycharset_handler_t yycharset_handler = NULL;");
+               }
+               OUT_END_CODE ();
+       }
+
        if (useecs)
                numecs = cre8ecs (nextecm, ecgroup, csize);
        else
index 30efb008e828fdadb6704ceef44fe0e08bc199be..6fa87a05dd6b73893381fbe4870523c6f02b2d9e 100644 (file)
@@ -1,8 +1,8 @@
 /* parse.y - parser for flex input */
 
 %token CHAR NUMBER SECTEND SCDECL XSCDECL NAME PREVCCL EOF_OP
-%token OPTION_OP OPT_OUTFILE OPT_PREFIX OPT_YYCLASS OPT_HEADER OPT_HEADER_CHAR OPT_EXTRA_TYPE
-%token OPT_TABLES
+%token OPTION_OP OPT_OUTFILE OPT_PREFIX OPT_YYCLASS OPT_HEADER OPT_HEADER_CHAR
+%token OPT_EXTRA_TYPE OPT_TABLES OPT_CHARSET_SOURCE
 
 %token CCE_ALNUM CCE_ALPHA CCE_BLANK CCE_CNTRL CCE_DIGIT CCE_GRAPH
 %token CCE_LOWER CCE_PRINT CCE_PUNCT CCE_SPACE CCE_UPPER CCE_XDIGIT
@@ -208,6 +208,8 @@ option              :  OPT_OUTFILE '=' NAME
                        { headercharfilename = copy_string( nmstr ); }
            |  OPT_TABLES '=' NAME
             { tablesext = true; tablesfilename = copy_string( nmstr ); }
+        |  OPT_CHARSET_SOURCE '=' NAME
+            { charset_source = copy_string(nmstr); }
                ;
 
 sect2          :  sect2 scon initforrule flexrule '\n'
index 6cbefe6d04f76e7a026a6b366c366addae7bf702..8fa7db3213150f90d1f0cab11993e35c9520ea1c 100644 (file)
@@ -349,6 +349,7 @@ M4QEND      "]]"
        "c++"           C_plus_plus = option_sense;
        caseful|case-sensitive          sf_set_case_ins(!option_sense);
        caseless|case-insensitive       sf_set_case_ins(option_sense);
+       charset         charset_enabled = true;
        debug           ddebug = option_sense;
        default         spprdflt = ! option_sense;
        ecs             useecs = option_sense;
@@ -427,6 +428,7 @@ M4QEND      "]]"
        header(-file)?      return OPT_HEADER;
        header-char(-file)? return OPT_HEADER_CHAR;
        tables-file         return OPT_TABLES;
+       charset-source      return OPT_CHARSET_SOURCE;
        tables-verify   {
                     tablesverify = option_sense;
                     if(!tablesext && option_sense)