This example is based on `Recognizing integers: the sentinel method`_ example,
only now integer literals are parsed rather than simply recognized.
-The aim of this example is to show how to use multiple re2c blocks,
-not how to parse integers (overflows are not handled). ``:)``
+Parsing integers is simple: one can easily do it by hand.
+However, re2c-generated code *does* look like a simple handwritten parser:
+a couple of dereferences and conditional jumps. No overhead. ``:)``
`[04_parsing_integers_blocks.re] <examples/04_parsing_integers_blocks.re>`_
Notes:
-* Configurations and definitions (lines 9 - 15) are not scoped to a single re2c block --- they are global.
+* Configurations and definitions (lines 20 - 26) are not scoped to a single re2c block --- they are global.
Each block may override configurations, but this affects global scope.
* Blocks don't have to be in the same function: they can be in separate functions or elsewhere
as long as the exposed interface fits into lexical scope.
$ re2c -o example.cc 04_parsing_integers_blocks.re
$ g++ -o example example.cc
- $ ./example "" 0 0b11100001 012345 67890 0xffE 0x 0b
- error :[
+ $ ./example 0 12345678901234567890 0xFFFFffffFFFFffff 0x1FFFFffffFFFFffff 0xAbcDEf 0x00 007 0B0 0b110101010 ""
0
- 225
- 5349
- 67890
- 4094
- error :[
- error :[
+ 12345678901234567890
+ 18446744073709551615
+ error
+ 11259375
+ 0
+ 7
+ 0
+ 426
+ error
.. _Parsing integers (conditions):
Notes:
* Conditions are enabled with ``-c`` option.
+
* Conditions are only syntactic sugar, they can be translated into multiple blocks.
+
* Each condition is a standalone lexer (DFA).
-* Conditions are interconnected: transitions are allowed between final states of one DFA
- and start state of another DFA (but no transitions between inner states of different DFAs).
- The generated code starts with dispatch on conditions.
-* Each condition has a unique identifier: ``/*!types:re2c*/`` directive (line 3)
- tells re2c to generate enumeration of them (names are prefixed with ``yyc`` by default).
- These identifiers are used in the initial dispatch on conditions:
- lexer uses ``YYGETCONDITION`` to get current condition (line 16)
- and ``YYSETCONDITION`` to set it (line 18).
+
+* Each condition has a unique identifier: ``/*!types:re2c*/`` tells re2c to generate
+ an enumeration of all identifiers (names are prefixed with ``yyc`` by default).
+ The lexer uses ``YYGETCONDITION`` to get the identifier of the current condition
+ and ``YYSETCONDITION`` to set it.
+
* Each condition has a unique label (prefixed with ``yyc_`` by default).
- Actions can use these labels to jump between conditions.
- Alternatively the whole block may be enclosed in a loop:
- then lexer will go through the initial dispatch on each iteration (but this might be slow).
-* Star rule ``<*>`` (line 21) is merged to all conditions (low priority).
-* Rule with multiple conditions (line 28) is merged to each listed condition (normal priority).
-* ``:=>`` (lines 23, 24, 25, 26) implies immediate transition
- (bypassing initial dispatch).
+
+* Conditions are connected: transitions are allowed between final states of one condition
+ and the start state of another condition (but not between inner states of different conditions).
+ The generated code starts with a dispatch on the current condition.
+ Actions can either jump to the initial dispatch or jump directly to any condition.
+
+* Rule ``<*>`` is merged into all conditions (low priority).
+
+* Rules with multiple conditions are merged into each listed condition (normal priority).
+
+* ``:=>`` jumps directly to the next condition (bypassing the initial dispatch).
Generate, compile and run:
$ re2c -c -o example.cc 05_parsing_integers_conditions.re
$ g++ -o example example.cc
- $ ./example "" 0 0b11100001 012345 67890 0xffE 0x 0b
- error :[
+ $ ./example 0 12345678901234567890 0xFFFFffffFFFFffff 0x1FFFFffffFFFFffff 0xAbcDEf 0x00 007 0B0 0b110101010 ""
+ 0
+ 12345678901234567890
+ 18446744073709551615
+ error
+ 11259375
+ 0
+ 7
0
- 225
- 5349
- 67890
- 4094
- error :[
- error :[
+ 426
+ error
.. Braille patterns (encodings):
.. include:: examples/06_braille.utf8.txt
It appears to be UTF-8 encoded `[06_braille.utf8.txt] <examples/06_braille.utf8.txt.html>`_.
-Now translate it into UTF-16, UTF-32 or UCS-2:
+Convert it into UTF-16, UTF-32 or UCS-2:
.. code-block:: bash
+#include <limits.h>
#include <stdio.h>
-static int lex(const char *s)
+template<int base> // Appends digit d to u in radix `base`, i.e. u = u * base + d.
+static bool adddgt(unsigned long &u, unsigned int d) // Returns false, leaving u unchanged, if the result would overflow.
+{
+ if (u > (ULONG_MAX - d) / base) { // true exactly when u * base + d would exceed ULONG_MAX
+ return false;
+ }
+ u = u * base + d;
+ return true;
+}
+
+static bool lex(const char *s, unsigned long &u)
{
const char *YYMARKER;
const char *YYCTXMARKER;
- int n = 0;
+ u = 0;
/*!re2c
re2c:yyfill:enable = 0;
*/
/*!re2c
- * { return -1; }
+ * { return false; }
'0b' / [01] { goto bin; }
"0" { goto oct; }
"" / [1-9] { goto dec; }
bin:
/*!re2c
- * { return -1; }
- end { return n; }
- [01] { n = (n << 1) + (s[-1] - '0'); goto bin; }
+ * { return false; }
+ end { return true; }
+ [01] { if (!adddgt<2>(u, s[-1] - '0')) return false; goto bin; }
*/
oct:
/*!re2c
- * { return -1; }
- end { return n; }
- [0-7] { n = (n << 3) + (s[-1] - '0'); goto oct; }
+ * { return false; }
+ end { return true; }
+ [0-7] { if (!adddgt<8>(u, s[-1] - '0')) return false; goto oct; }
*/
dec:
/*!re2c
- * { return -1; }
- end { return n; }
- [0-9] { n = (n * 10) + (s[-1] - '0'); goto dec; }
+ * { return false; }
+ end { return true; }
+ [0-9] { if (!adddgt<10>(u, s[-1] - '0')) return false; goto dec; }
*/
hex:
/*!re2c
- * { return -1; }
- end { return n; }
- [0-9] { n = (n << 4) + (s[-1] - '0'); goto hex; }
- [a-f] { n = (n << 4) + (s[-1] - 'a' + 10); goto hex; }
- [A-F] { n = (n << 4) + (s[-1] - 'A' + 10); goto hex; }
+ * { return false; }
+ end { return true; }
+ [0-9] { if (!adddgt<16>(u, s[-1] - '0')) return false; goto hex; }
+ [a-f] { if (!adddgt<16>(u, s[-1] - 'a' + 10)) return false; goto hex; }
+ [A-F] { if (!adddgt<16>(u, s[-1] - 'A' + 10)) return false; goto hex; }
*/
}
int main(int argc, char **argv)
{
for (int i = 1; i < argc; ++i) {
- const int n = lex(argv[i]);
- if (n < 0) {
- printf("error :[\n");
+ unsigned long u;
+ if (lex(argv[i], u)) {
+ printf("%lu\n", u);
} else {
- printf("%d\n", n);
+ printf("error\n");
}
}
return 0;
+#include <limits.h>
#include <stdio.h>
+template<int base> // Appends digit d to u in radix `base`, i.e. u = u * base + d.
+static bool adddgt(unsigned long &u, unsigned int d) // Returns false, leaving u unchanged, if the result would overflow.
+{
+ if (u > (ULONG_MAX - d) / base) { // true exactly when u * base + d would exceed ULONG_MAX
+ return false;
+ }
+ u = u * base + d;
+ return true;
+}
+
/*!types:re2c*/
-static int lex(const char *s)
+static bool lex(const char *s, unsigned long &u)
{
const char *YYMARKER;
const char *YYCTXMARKER;
- int n = 0;
int c = yycinit;
-
+ u = 0;
/*!re2c
re2c:yyfill:enable = 0;
re2c:define:YYCTYPE = char;
re2c:define:YYSETCONDITION = "c = @@;";
re2c:define:YYSETCONDITION:naked = 1;
- <*> * { return -1; }
+ <*> * { return false; }
<init> '0b' / [01] :=> bin
<init> "0" :=> oct
<init> "" / [1-9] :=> dec
<init> '0x' / [0-9a-fA-F] :=> hex
- <bin, oct, dec, hex> "\x00" { return n; }
- <bin> [01] { n = (n << 1) + (s[-1] - '0'); goto yyc_bin; }
- <oct> [0-7] { n = (n << 3) + (s[-1] - '0'); goto yyc_oct; }
- <dec> [0-9] { n = (n * 10) + (s[-1] - '0'); goto yyc_dec; }
- <hex> [0-9] { n = (n << 4) + (s[-1] - '0'); goto yyc_hex; }
- <hex> [a-f] { n = (n << 4) + (s[-1] - 'a' + 10); goto yyc_hex; }
- <hex> [A-F] { n = (n << 4) + (s[-1] - 'A' + 10); goto yyc_hex; }
+ <bin, oct, dec, hex> "\x00" { return true; }
+ <bin> [01] { if (!adddgt<2>(u, s[-1] - '0')) return false; goto yyc_bin; }
+ <oct> [0-7] { if (!adddgt<8>(u, s[-1] - '0')) return false; goto yyc_oct; }
+ <dec> [0-9] { if (!adddgt<10>(u, s[-1] - '0')) return false; goto yyc_dec; }
+ <hex> [0-9] { if (!adddgt<16>(u, s[-1] - '0')) return false; goto yyc_hex; }
+ <hex> [a-f] { if (!adddgt<16>(u, s[-1] - 'a' + 10)) return false; goto yyc_hex; }
+ <hex> [A-F] { if (!adddgt<16>(u, s[-1] - 'A' + 10)) return false; goto yyc_hex; }
*/
}
int main(int argc, char **argv)
{
for (int i = 1; i < argc; ++i) {
- const int n = lex(argv[i]);
- if (n < 0) {
- printf("error :[\n");
+ unsigned long u;
+ if (lex(argv[i], u)) {
+ printf("%lu\n", u);
} else {
- printf("%d\n", n);
+ printf("error\n");
}
}
return 0;