]> granicus.if.org Git - re2c/commitdiff
- Added support for c/c++ compatible \u and \U unicode notation.
authorhelly <helly@642ea486-5414-0410-9d7f-a0204ed87703>
Sun, 1 Jan 2006 17:13:57 +0000 (17:13 +0000)
committerhelly <helly@642ea486-5414-0410-9d7f-a0204ed87703>
Sun, 1 Jan 2006 17:13:57 +0000 (17:13 +0000)
13 files changed:
CHANGELOG
actions.cc
re2c.1.in
test/error10.c [new file with mode: 0755]
test/error10.re [new file with mode: 0755]
test/error11.c [new file with mode: 0755]
test/error11.re [new file with mode: 0755]
test/error5.re
test/error7.re
test/error8.c
test/error8.re
test/error9.c
test/error9.re

index 3f77c61e41249a20ba01acd24057efa54e1b74d2..ada71428fbccb903c72bb8500c86e20530c0ef5f 100644 (file)
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,5 +1,6 @@
 Version 0.10.0 (????-??-??)
 ---------------------------
+- Added support for c/c++ compatible \u and \U unicode notation.
 - Added ability to control indendation.
 - Made scanner error out in case an ambiguous /* is found.
 - Fixed indendation of generated code.
index 7ab2ad1f6e0a63496c81cf1fbf6b14c927177bb9..16c624d725fa4c43db88088d0bd78501c2d65d24 100644 (file)
@@ -566,11 +566,52 @@ uint Scanner::unescape(SubStr &s) const
                        }
                }
 
+               case 'U':
+               {
+                       if (s.len < 8)
+                       {
+                               fatal(s.ofs()+s.len, "Illegal unicode character, eight hexadecimal digits are required");
+                               return ~0;
+                       }
+
+                       uint l = 0;
+                                               
+                       if (s.str[0] == '0')
+                       {
+                               l++;
+                               if (s.str[1] == '0')
+                               {
+                                       l++;
+                                       if (s.str[2] == '0')
+                                       {
+                                               l++;
+                                               if (s.str[3] == '0')
+                                               {
+                                                       l++;
+                                               }
+                                       }
+                               }
+                       }
+
+                       if (l != 4)
+                       {
+                               fatal(s.ofs()+l, "Illegal unicode character, eight hexadecimal digits are required");
+                       }
+
+                       s.len -= 4;
+                       s.str += 4;
+                       
+                       // no break;
+               }
                case 'X':
+               case 'u':
                {
                        if (s.len < 4)
                        {
-                               fatal(s.ofs()+s.len, "Illegal hexadecimal character code, four hexadecimal digits are required");
+                               fatal(s.ofs()+s.len, 
+                                       c == 'X'
+                                       ? "Illegal hexadecimal character code, four hexadecimal digits are required"
+                                       : "Illegal unicode character, four hexadecimal digits are required");
                                return ~0;
                        }
                        
@@ -581,7 +622,10 @@ uint Scanner::unescape(SubStr &s) const
 
                        if (!p1 || !p2 || !p3 || !p4)
                        {
-                               fatal(s.ofs()+(p1?1:0)+(p2?1:0)+(p3?1:0), "Illegal hexadecimal character code");
+                               fatal(s.ofs()+(p1?1:0)+(p2?1:0)+(p3?1:0), 
+                                       c == 'X'
+                                       ? "Illegal hexadecimal character code, non hexxdecimal digit found"
+                                       : "Illegal unicode character, non hexadecimal digit found");
                                return ~0;
                        }
                        else
@@ -596,7 +640,10 @@ uint Scanner::unescape(SubStr &s) const
        
                                if (v >= nRealChars)
                                {
-                                       fatal(s.ofs(), "Illegal hexadecimal character code, out of range");
+                                       fatal(s.ofs(),
+                                               c == 'X'
+                                               ? "Illegal hexadecimal character code, out of range"
+                                               : "Illegal unicode character, out of range");
                                }
        
                                return v;
@@ -629,7 +676,7 @@ uint Scanner::unescape(SubStr &s) const
 
                        if (!p0 || !p1 || !p2)
                        {
-                               fatal(s.ofs()+(p1?1:0), "Illegal octal character code");
+                               fatal(s.ofs()+(p1?1:0), "Illegal octal character code, non octal digit found");
                                return ~0;
                        }
                        else
index 45e86c565b6f94fae3f9207df21e829f4e0ff349..1a1b3d612c0bfc053ccc430d915a7cdf7eb461f1 100644 (file)
--- a/re2c.1.in
+++ b/re2c.1.in
@@ -7,6 +7,9 @@
 .ds rx regular expression
 .ds lx \fIl\fP-expression
 \"$Log$
+\"Revision 1.33  2006/01/01 17:13:56  helly
+\"- Added support for c/c++ compatible \u and \U unicode notation.
+\"
 \"Revision 1.32  2005/12/31 00:54:55  helly
 \"- Update docu
 \"
@@ -454,10 +457,15 @@ Character classes and string literals may contain octoal or hexadecimal
 character definitions and the following set of escape sequences (\fB\\n\fP,
  \fB\\t\fP, \fB\\v\fP, \fB\\b\fP, \fB\\r\fP, \fB\\f\fP, \fB\\a\fP, \fB\\\\\fP).
 An octal character is defined by a backslash followed by its three octal digits
-and a hexadecimal character is defined by backslash, a lower cased 'x' and its
-two hexadecimal digits or a backslash, an upper cased 'X' and its four 
-hexadecimal digits. Since characters greater \fB0X00FF\fP are not allowed in 
-non unicode mode the only portable "\fBany\fP" rule is \fB(.|"\\n")\fP.
+and a hexadecimal character is defined by a backslash, a lowercase \fBx\fP
+and its two hexadecimal digits, or a backslash, an uppercase \fBX\fP and its
+four hexadecimal digits. Since characters greater than \fB\\X00FF\fP are not
+allowed in non-unicode mode, the only portable "\fBany\fP" rule is
+\fB(.|"\\n")\fP. In addition, re2c supports the C/C++ unicode notation: a
+backslash followed by either a lowercase \fBu\fP and its four hexadecimal
+digits or an uppercase \fBU\fP and its eight hexadecimal digits. However, with
+the \fBU\fP notation it is not possible to support characters greater than
+\fB\\U0000FFFF\fP due to an internal limitation of re2c.
 .LP
 The regular expressions listed above are grouped according to
 precedence, from highest precedence at the top to lowest at the bottom.
diff --git a/test/error10.c b/test/error10.c
new file mode 100755 (executable)
index 0000000..3f841c4
--- /dev/null
@@ -0,0 +1,3 @@
+line 2, column 6: Illegal unicode character, eight hexadecimal digits are required
+/* Generated by re2c */
+#line 1 "error10.re"
diff --git a/test/error10.re b/test/error10.re
new file mode 100755 (executable)
index 0000000..2d1a544
--- /dev/null
@@ -0,0 +1,3 @@
+/*!re2c
+[\U00900000YY] {}
+*/
diff --git a/test/error11.c b/test/error11.c
new file mode 100755 (executable)
index 0000000..1b6154a
--- /dev/null
@@ -0,0 +1,3 @@
+line 2, column 9: Illegal unicode character, non hexadecimal digit found
+/* Generated by re2c */
+#line 1 "error11.re"
diff --git a/test/error11.re b/test/error11.re
new file mode 100755 (executable)
index 0000000..705b32c
--- /dev/null
@@ -0,0 +1,3 @@
+/*!re2c
+[\U00000ZZZYY] {}
+*/
index b63f5f90a47703cd029aceb1b25b74b8bd3b6509..5e2dfe1664d1615f0677ffadbb223cb1e9bbaa87 100755 (executable)
@@ -1,3 +1,3 @@
 /*!re2c
-[\x0Z] {}
+[\x0ZYY] {}
 */
index 6ebb711f4bdd4b3f0eb1367bfedc3bab79cfe9fc..82555e3a8f62559af5789fbd9ab030d4f1e22452 100755 (executable)
@@ -1,3 +1,3 @@
 /*!re2c
-[\400] {}
+[\400YY] {}
 */
index c862ec0a663c34f2e1eaa8e4bdd5c2f67bb15f08..f676fcda9f4a49564f2c7d719b27cfb59a0a8ed8 100755 (executable)
@@ -1,3 +1,3 @@
-line 2, column 4: Illegal octal character code
+line 2, column 4: Illegal octal character code, non octal digit found
 /* Generated by re2c */
 #line 1 "error8.re"
index b007922c90401629aab565578e48c6a4eaad069f..f82b0a4c9dc42e112e6062b8105f2f034910afb8 100755 (executable)
@@ -1,3 +1,3 @@
 /*!re2c
-[\090] {}
+[\090YY] {}
 */
index 24b5c6c9f32cd5aedb91665411065733ffe5d7a9..1bbcc9088428a1eaab97507023a257331d4b7070 100755 (executable)
@@ -1,3 +1,3 @@
-line 2, column 5: Illegal octal character code
+line 2, column 5: Illegal octal character code, non octal digit found
 /* Generated by re2c */
 #line 1 "error9.re"
index d0d1f0cd841ce4844c2498be9eb8cc74acbb7d07..b82051a0d5ca7ab756aa23e8bfb2396a4fecfeeb 100755 (executable)
@@ -1,3 +1,3 @@
 /*!re2c
-[\009] {}
+[\009YY] {}
 */