From 1748b159c312b4043d439bfb1e5f43e759efa3a8 Mon Sep 17 00:00:00 2001 From: helly Date: Sun, 1 Jan 2006 17:13:57 +0000 Subject: [PATCH] - Added support for c/c++ compatible \u and \U unicode notation. --- CHANGELOG | 1 + actions.cc | 55 +++++++++++++++++++++++++++++++++++++++++++++---- re2c.1.in | 16 ++++++++++---- test/error10.c | 3 +++ test/error10.re | 3 +++ test/error11.c | 3 +++ test/error11.re | 3 +++ test/error5.re | 2 +- test/error7.re | 2 +- test/error8.c | 2 +- test/error8.re | 2 +- test/error9.c | 2 +- test/error9.re | 2 +- 13 files changed, 82 insertions(+), 14 deletions(-) create mode 100755 test/error10.c create mode 100755 test/error10.re create mode 100755 test/error11.c create mode 100755 test/error11.re diff --git a/CHANGELOG b/CHANGELOG index 3f77c61e..ada71428 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,5 +1,6 @@ Version 0.10.0 (????-??-??) --------------------------- +- Added support for c/c++ compatible \u and \U unicode notation. - Added ability to control indendation. - Made scanner error out in case an ambiguous /* is found. - Fixed indendation of generated code. diff --git a/actions.cc b/actions.cc index 7ab2ad1f..16c624d7 100644 --- a/actions.cc +++ b/actions.cc @@ -566,11 +566,52 @@ uint Scanner::unescape(SubStr &s) const } } + case 'U': + { + if (s.len < 8) + { + fatal(s.ofs()+s.len, "Illegal unicode character, eight hexadecimal digits are required"); + return ~0; + } + + uint l = 0; + + if (s.str[0] == '0') + { + l++; + if (s.str[1] == '0') + { + l++; + if (s.str[2] == '0') + { + l++; + if (s.str[3] == '0') + { + l++; + } + } + } + } + + if (l != 4) + { + fatal(s.ofs()+l, "Illegal unicode character, eight hexadecimal digits are required"); + } + + s.len -= 4; + s.str += 4; + + // no break; + } case 'X': + case 'u': { if (s.len < 4) { - fatal(s.ofs()+s.len, "Illegal hexadecimal character code, four hexadecimal digits are required"); + fatal(s.ofs()+s.len, + c == 'X' + ? 
"Illegal hexadecimal character code, four hexadecimal digits are required" + : "Illegal unicode character, four hexadecimal digits are required"); return ~0; } @@ -581,7 +622,10 @@ uint Scanner::unescape(SubStr &s) const if (!p1 || !p2 || !p3 || !p4) { - fatal(s.ofs()+(p1?1:0)+(p2?1:0)+(p3?1:0), "Illegal hexadecimal character code"); + fatal(s.ofs()+(p1?1:0)+(p2?1:0)+(p3?1:0), + c == 'X' + ? "Illegal hexadecimal character code, non hexxdecimal digit found" + : "Illegal unicode character, non hexadecimal digit found"); return ~0; } else @@ -596,7 +640,10 @@ uint Scanner::unescape(SubStr &s) const if (v >= nRealChars) { - fatal(s.ofs(), "Illegal hexadecimal character code, out of range"); + fatal(s.ofs(), + c == 'X' + ? "Illegal hexadecimal character code, out of range" + : "Illegal unicode character, out of range"); } return v; @@ -629,7 +676,7 @@ uint Scanner::unescape(SubStr &s) const if (!p0 || !p1 || !p2) { - fatal(s.ofs()+(p1?1:0), "Illegal octal character code"); + fatal(s.ofs()+(p1?1:0), "Illegal octal character code, non octal digit found"); return ~0; } else diff --git a/re2c.1.in b/re2c.1.in index 45e86c56..1a1b3d61 100644 --- a/re2c.1.in +++ b/re2c.1.in @@ -7,6 +7,9 @@ .ds rx regular expression .ds lx \fIl\fP-expression \"$Log$ +\"Revision 1.33 2006/01/01 17:13:56 helly +\"- Added support for c/c++ compatible \u and \U unicode notation. +\" \"Revision 1.32 2005/12/31 00:54:55 helly \"- Update docu \" @@ -454,10 +457,15 @@ Character classes and string literals may contain octoal or hexadecimal character definitions and the following set of escape sequences (\fB\\n\fP, \fB\\t\fP, \fB\\v\fP, \fB\\b\fP, \fB\\r\fP, \fB\\f\fP, \fB\\a\fP, \fB\\\\\fP). An octal character is defined by a backslash followed by its three octal digits -and a hexadecimal character is defined by backslash, a lower cased 'x' and its -two hexadecimal digits or a backslash, an upper cased 'X' and its four -hexadecimal digits. 
Since characters greater \fB0X00FF\fP are not allowed in -non unicode mode the only portable "\fBany\fP" rule is \fB(.|"\\n")\fP. +and a hexadecimal character is defined by a backslash, a lowercase '\fBx\fP' +and its two hexadecimal digits or a backslash, an uppercase '\fBX\fP' and its +four hexadecimal digits. Since characters greater than \fB\\X00FF\fP are not +allowed in non-unicode mode, the only portable "\fBany\fP" rule is +\fB(.|"\\n")\fP. re2c furthermore supports the c/c++ unicode notation, that is +a backslash followed by either a lowercase \fBu\fP and its four hexadecimal +digits or an uppercase \fBU\fP and its eight hexadecimal digits. However, using +the \fBU\fP notation it is not possible to support characters greater than +\fB\\U0000FFFF\fP due to an internal limitation of re2c. .LP The regular expressions listed above are grouped according to precedence, from highest precedence at the top to lowest at the bottom. diff --git a/test/error10.c b/test/error10.c new file mode 100755 index 00000000..3f841c40 --- /dev/null +++ b/test/error10.c @@ -0,0 +1,3 @@ +line 2, column 6: Illegal unicode character, eight hexadecimal digits are required +/* Generated by re2c */ +#line 1 "error10.re" diff --git a/test/error10.re b/test/error10.re new file mode 100755 index 00000000..2d1a5446 --- /dev/null +++ b/test/error10.re @@ -0,0 +1,3 @@ +/*!re2c +[\U00900000YY] {} +*/ diff --git a/test/error11.c b/test/error11.c new file mode 100755 index 00000000..1b6154a1 --- /dev/null +++ b/test/error11.c @@ -0,0 +1,3 @@ +line 2, column 9: Illegal unicode character, non hexadecimal digit found +/* Generated by re2c */ +#line 1 "error11.re" diff --git a/test/error11.re b/test/error11.re new file mode 100755 index 00000000..705b32c8 --- /dev/null +++ b/test/error11.re @@ -0,0 +1,3 @@ +/*!re2c +[\U00000ZZZYY] {} +*/ diff --git a/test/error5.re b/test/error5.re index b63f5f90..5e2dfe16 100755 --- a/test/error5.re +++ b/test/error5.re @@ -1,3 +1,3 @@ /*!re2c -[\x0Z] {} +[\x0ZYY] {} */ diff --git 
a/test/error7.re b/test/error7.re index 6ebb711f..82555e3a 100755 --- a/test/error7.re +++ b/test/error7.re @@ -1,3 +1,3 @@ /*!re2c -[\400] {} +[\400YY] {} */ diff --git a/test/error8.c b/test/error8.c index c862ec0a..f676fcda 100755 --- a/test/error8.c +++ b/test/error8.c @@ -1,3 +1,3 @@ -line 2, column 4: Illegal octal character code +line 2, column 4: Illegal octal character code, non octal digit found /* Generated by re2c */ #line 1 "error8.re" diff --git a/test/error8.re b/test/error8.re index b007922c..f82b0a4c 100755 --- a/test/error8.re +++ b/test/error8.re @@ -1,3 +1,3 @@ /*!re2c -[\090] {} +[\090YY] {} */ diff --git a/test/error9.c b/test/error9.c index 24b5c6c9..1bbcc908 100755 --- a/test/error9.c +++ b/test/error9.c @@ -1,3 +1,3 @@ -line 2, column 5: Illegal octal character code +line 2, column 5: Illegal octal character code, non octal digit found /* Generated by re2c */ #line 1 "error9.re" diff --git a/test/error9.re b/test/error9.re index d0d1f0cd..b82051a0 100755 --- a/test/error9.re +++ b/test/error9.re @@ -1,3 +1,3 @@ /*!re2c -[\009] {} +[\009YY] {} */ -- 2.50.1