From: Ulya Trofimovich Date: Sun, 1 Sep 2019 09:03:48 +0000 (+0100) Subject: Added ninja lexers to tests. X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=2ee9c50c058969f452be3ad9df9ab82b7e61b682;p=re2c Added ninja lexers to tests. Note: ninja_lexer.ib.c differs from the original lexer in the ninja project, because the latter was generated with re2c-0.16, which used slightly different constructs for complex expressions: Re2c-0.16 generated something like this: ++YYCURSOR; if (yych = *YYCURSOR) ... And the latest re2c generates something like this: yych = *++YYCURSOR; if (yych) ... The difference is insignificant. --- diff --git a/test/real_world/ninja_depfile_parser.ib.c b/test/real_world/ninja_depfile_parser.ib.c new file mode 100644 index 00000000..89723536 --- /dev/null +++ b/test/real_world/ninja_depfile_parser.ib.c @@ -0,0 +1,339 @@ +/* Generated by re2c */ +// Copyright 2011 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "depfile_parser.h" +#include "util.h" + +DepfileParser::DepfileParser(DepfileParserOptions options) + : options_(options) +{ +} + +// A note on backslashes in Makefiles, from reading the docs: +// Backslash-newline is the line continuation character. +// Backslash-# escapes a # (otherwise meaningful as a comment start). +// Backslash-% escapes a % (otherwise meaningful as a special). 
+// Finally, quoting the GNU manual, "Backslashes that are not in danger +// of quoting ‘%’ characters go unmolested." +// How do you end a line with a backslash? The netbsd Make docs suggest +// reading the result of a shell command echoing a backslash! +// +// Rather than implement all of above, we follow what GCC/Clang produces: +// Backslashes escape a space or hash sign. +// When a space is preceded by 2N+1 backslashes, it is represents N backslashes +// followed by space. +// When a space is preceded by 2N backslashes, it represents 2N backslashes at +// the end of a filename. +// A hash sign is escaped by a single backslash. All other backslashes remain +// unchanged. +// +// If anyone actually has depfiles that rely on the more complicated +// behavior we can adjust this. +bool DepfileParser::Parse(string* content, string* err) { + // in: current parser input point. + // end: end of input. + // parsing_targets: whether we are parsing targets or dependencies. + char* in = &(*content)[0]; + char* end = in + content->size(); + bool have_target = false; + bool have_secondary_target_on_this_rule = false; + bool have_newline_since_primary_target = false; + bool warned_distinct_target_lines = false; + bool parsing_targets = true; + while (in < end) { + bool have_newline = false; + // out: current output point (typically same as in, but can fall behind + // as we de-escape backslashes). + char* out = in; + // filename: start of the current parsed filename. + char* filename = out; + for (;;) { + // start: beginning of the current parsed span. 
+ const char* start = in; + char* yymarker = NULL; + + { + unsigned char yych; + static const unsigned char yybm[] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 128, 0, 0, 0, 128, 0, 0, + 128, 128, 0, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 0, 0, 128, 0, 0, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 0, 128, 0, 128, + 0, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 0, 128, 128, 0, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + }; + yych = *in; + if (yybm[0+yych] & 128) { + goto yy9; + } + if (yych <= '\r') { + if (yych <= '\t') { + if (yych >= 0x01) goto yy4; + } else { + if (yych <= '\n') goto yy6; + if (yych <= '\f') goto yy4; + goto yy8; + } + } else { + if (yych <= '$') { + if (yych <= '#') goto yy4; + goto yy12; + } else { + if (yych <= '?') goto yy4; + if (yych <= '\\') goto yy13; + goto yy4; + } + } + ++in; + { + break; + } +yy4: + ++in; +yy5: + { + // For any other character (e.g. whitespace), swallow it here, + // allowing the outer logic to loop around again. 
+ break; + } +yy6: + ++in; + { + // A newline ends the current file name and the current rule. + have_newline = true; + break; + } +yy8: + yych = *++in; + if (yych == '\n') goto yy6; + goto yy5; +yy9: + yych = *++in; + if (yybm[0+yych] & 128) { + goto yy9; + } +yy11: + { + // Got a span of plain text. + int len = (int)(in - start); + // Need to shift it over if we're overwriting backslashes. + if (out < start) + memmove(out, start, len); + out += len; + continue; + } +yy12: + yych = *++in; + if (yych == '$') goto yy14; + goto yy5; +yy13: + yych = *(yymarker = ++in); + if (yych <= 0x1F) { + if (yych <= '\n') { + if (yych <= 0x00) goto yy5; + if (yych <= '\t') goto yy16; + goto yy17; + } else { + if (yych == '\r') goto yy19; + goto yy16; + } + } else { + if (yych <= '#') { + if (yych <= ' ') goto yy21; + if (yych <= '"') goto yy16; + goto yy23; + } else { + if (yych == '\\') goto yy25; + goto yy16; + } + } +yy14: + ++in; + { + // De-escape dollar character. + *out++ = '$'; + continue; + } +yy16: + ++in; + goto yy11; +yy17: + ++in; + { + // A line continuation ends the current file name. + break; + } +yy19: + yych = *++in; + if (yych == '\n') goto yy17; + in = yymarker; + goto yy5; +yy21: + ++in; + { + // 2N+1 backslashes plus space -> N backslashes plus space. + int len = (int)(in - start); + int n = len / 2 - 1; + if (out < start) + memset(out, '\\', n); + out += n; + *out++ = ' '; + continue; + } +yy23: + ++in; + { + // De-escape hash sign, but preserve other leading backslashes. 
+ int len = (int)(in - start); + if (len > 2 && out < start) + memset(out, '\\', len - 2); + out += len - 2; + *out++ = '#'; + continue; + } +yy25: + yych = *++in; + if (yych <= 0x1F) { + if (yych <= '\n') { + if (yych <= 0x00) goto yy11; + if (yych <= '\t') goto yy16; + goto yy11; + } else { + if (yych == '\r') goto yy11; + goto yy16; + } + } else { + if (yych <= '#') { + if (yych <= ' ') goto yy26; + if (yych <= '"') goto yy16; + goto yy23; + } else { + if (yych == '\\') goto yy28; + goto yy16; + } + } +yy26: + ++in; + { + // 2N backslashes plus space -> 2N backslashes, end of filename. + int len = (int)(in - start); + if (out < start) + memset(out, '\\', len - 1); + out += len - 1; + break; + } +yy28: + yych = *++in; + if (yych <= 0x1F) { + if (yych <= '\n') { + if (yych <= 0x00) goto yy11; + if (yych <= '\t') goto yy16; + goto yy11; + } else { + if (yych == '\r') goto yy11; + goto yy16; + } + } else { + if (yych <= '#') { + if (yych <= ' ') goto yy21; + if (yych <= '"') goto yy16; + goto yy23; + } else { + if (yych == '\\') goto yy25; + goto yy16; + } + } + } + + } + + int len = (int)(out - filename); + const bool is_dependency = !parsing_targets; + if (len > 0 && filename[len - 1] == ':') { + len--; // Strip off trailing colon, if any. 
+ parsing_targets = false; + have_target = true; + } + + if (len > 0) { + if (is_dependency) { + if (have_secondary_target_on_this_rule) { + if (!have_newline_since_primary_target) { + *err = "depfile has multiple output paths"; + return false; + } else if (options_.depfile_distinct_target_lines_action_ == + kDepfileDistinctTargetLinesActionError) { + *err = + "depfile has multiple output paths (on separate lines)" + " [-w depfilemulti=err]"; + return false; + } else { + if (!warned_distinct_target_lines) { + warned_distinct_target_lines = true; + Warning("depfile has multiple output paths (on separate lines); " + "continuing anyway [-w depfilemulti=warn]"); + } + continue; + } + } + ins_.push_back(StringPiece(filename, len)); + } else if (!out_.str_) { + out_ = StringPiece(filename, len); + } else if (out_ != StringPiece(filename, len)) { + have_secondary_target_on_this_rule = true; + } + } + + if (have_newline) { + // A newline ends a rule so the next filename will be a new target. + parsing_targets = true; + have_secondary_target_on_this_rule = false; + if (have_target) { + have_newline_since_primary_target = true; + } + } + } + if (!have_target) { + *err = "expected ':' in depfile"; + return false; + } + return true; +} diff --git a/test/real_world/ninja_depfile_parser.ib.re b/test/real_world/ninja_depfile_parser.ib.re new file mode 100644 index 00000000..735a0c3a --- /dev/null +++ b/test/real_world/ninja_depfile_parser.ib.re @@ -0,0 +1,191 @@ +// Copyright 2011 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "depfile_parser.h" +#include "util.h" + +DepfileParser::DepfileParser(DepfileParserOptions options) + : options_(options) +{ +} + +// A note on backslashes in Makefiles, from reading the docs: +// Backslash-newline is the line continuation character. +// Backslash-# escapes a # (otherwise meaningful as a comment start). +// Backslash-% escapes a % (otherwise meaningful as a special). +// Finally, quoting the GNU manual, "Backslashes that are not in danger +// of quoting ‘%’ characters go unmolested." +// How do you end a line with a backslash? The netbsd Make docs suggest +// reading the result of a shell command echoing a backslash! +// +// Rather than implement all of above, we follow what GCC/Clang produces: +// Backslashes escape a space or hash sign. +// When a space is preceded by 2N+1 backslashes, it is represents N backslashes +// followed by space. +// When a space is preceded by 2N backslashes, it represents 2N backslashes at +// the end of a filename. +// A hash sign is escaped by a single backslash. All other backslashes remain +// unchanged. +// +// If anyone actually has depfiles that rely on the more complicated +// behavior we can adjust this. +bool DepfileParser::Parse(string* content, string* err) { + // in: current parser input point. + // end: end of input. + // parsing_targets: whether we are parsing targets or dependencies. + char* in = &(*content)[0]; + char* end = in + content->size(); + bool have_target = false; + bool have_secondary_target_on_this_rule = false; + bool have_newline_since_primary_target = false; + bool warned_distinct_target_lines = false; + bool parsing_targets = true; + while (in < end) { + bool have_newline = false; + // out: current output point (typically same as in, but can fall behind + // as we de-escape backslashes). + char* out = in; + // filename: start of the current parsed filename. 
+ char* filename = out; + for (;;) { + // start: beginning of the current parsed span. + const char* start = in; + char* yymarker = NULL; + /*!re2c + re2c:define:YYCTYPE = "unsigned char"; + re2c:define:YYCURSOR = in; + re2c:define:YYLIMIT = end; + re2c:define:YYMARKER = yymarker; + + re2c:yyfill:enable = 0; + + re2c:indent:top = 2; + re2c:indent:string = " "; + + nul = "\000"; + newline = '\r'?'\n'; + + '\\\\'* '\\ ' { + // 2N+1 backslashes plus space -> N backslashes plus space. + int len = (int)(in - start); + int n = len / 2 - 1; + if (out < start) + memset(out, '\\', n); + out += n; + *out++ = ' '; + continue; + } + '\\\\'+ ' ' { + // 2N backslashes plus space -> 2N backslashes, end of filename. + int len = (int)(in - start); + if (out < start) + memset(out, '\\', len - 1); + out += len - 1; + break; + } + '\\'+ '#' { + // De-escape hash sign, but preserve other leading backslashes. + int len = (int)(in - start); + if (len > 2 && out < start) + memset(out, '\\', len - 2); + out += len - 2; + *out++ = '#'; + continue; + } + '$$' { + // De-escape dollar character. + *out++ = '$'; + continue; + } + '\\'+ [^\000\r\n] | [a-zA-Z0-9+,/_:.~()}{%=@\x5B\x5D!\x80-\xFF-]+ { + // Got a span of plain text. + int len = (int)(in - start); + // Need to shift it over if we're overwriting backslashes. + if (out < start) + memmove(out, start, len); + out += len; + continue; + } + nul { + break; + } + '\\' newline { + // A line continuation ends the current file name. + break; + } + newline { + // A newline ends the current file name and the current rule. + have_newline = true; + break; + } + [^] { + // For any other character (e.g. whitespace), swallow it here, + // allowing the outer logic to loop around again. + break; + } + */ + } + + int len = (int)(out - filename); + const bool is_dependency = !parsing_targets; + if (len > 0 && filename[len - 1] == ':') { + len--; // Strip off trailing colon, if any. 
+ parsing_targets = false; + have_target = true; + } + + if (len > 0) { + if (is_dependency) { + if (have_secondary_target_on_this_rule) { + if (!have_newline_since_primary_target) { + *err = "depfile has multiple output paths"; + return false; + } else if (options_.depfile_distinct_target_lines_action_ == + kDepfileDistinctTargetLinesActionError) { + *err = + "depfile has multiple output paths (on separate lines)" + " [-w depfilemulti=err]"; + return false; + } else { + if (!warned_distinct_target_lines) { + warned_distinct_target_lines = true; + Warning("depfile has multiple output paths (on separate lines); " + "continuing anyway [-w depfilemulti=warn]"); + } + continue; + } + } + ins_.push_back(StringPiece(filename, len)); + } else if (!out_.str_) { + out_ = StringPiece(filename, len); + } else if (out_ != StringPiece(filename, len)) { + have_secondary_target_on_this_rule = true; + } + } + + if (have_newline) { + // A newline ends a rule so the next filename will be a new target. + parsing_targets = true; + have_secondary_target_on_this_rule = false; + if (have_target) { + have_newline_since_primary_target = true; + } + } + } + if (!have_target) { + *err = "expected ':' in depfile"; + return false; + } + return true; +} diff --git a/test/real_world/ninja_lexer.ib.c b/test/real_world/ninja_lexer.ib.c new file mode 100644 index 00000000..940c3dab --- /dev/null +++ b/test/real_world/ninja_lexer.ib.c @@ -0,0 +1,820 @@ +/* Generated by re2c */ +// Copyright 2011 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lexer.h" + +#include + +#include "eval_env.h" +#include "util.h" + +bool Lexer::Error(const string& message, string* err) { + // Compute line/column. + int line = 1; + const char* line_start = input_.str_; + for (const char* p = input_.str_; p < last_token_; ++p) { + if (*p == '\n') { + ++line; + line_start = p + 1; + } + } + int col = last_token_ ? (int)(last_token_ - line_start) : 0; + + char buf[1024]; + snprintf(buf, sizeof(buf), "%s:%d: ", filename_.AsString().c_str(), line); + *err = buf; + *err += message + "\n"; + + // Add some context to the message. + const int kTruncateColumn = 72; + if (col > 0 && col < kTruncateColumn) { + int len; + bool truncated = true; + for (len = 0; len < kTruncateColumn; ++len) { + if (line_start[len] == 0 || line_start[len] == '\n') { + truncated = false; + break; + } + } + *err += string(line_start, len); + if (truncated) + *err += "..."; + *err += "\n"; + *err += string(col, ' '); + *err += "^ near here"; + } + + return false; +} + +Lexer::Lexer(const char* input) { + Start("input", input); +} + +void Lexer::Start(StringPiece filename, StringPiece input) { + filename_ = filename; + input_ = input; + ofs_ = input_.str_; + last_token_ = NULL; +} + +const char* Lexer::TokenName(Token t) { + switch (t) { + case ERROR: return "lexing error"; + case BUILD: return "'build'"; + case COLON: return "':'"; + case DEFAULT: return "'default'"; + case EQUALS: return "'='"; + case IDENT: return "identifier"; + case INCLUDE: return "'include'"; + case INDENT: return "indent"; + case NEWLINE: return "newline"; + case PIPE2: return "'||'"; + case PIPE: return "'|'"; + case POOL: return "'pool'"; + case RULE: return "'rule'"; + case SUBNINJA: return "'subninja'"; + case TEOF: return "eof"; + } + return NULL; // not reached +} + +const char* Lexer::TokenErrorHint(Token expected) { + switch (expected) { + case COLON: + return " 
($ also escapes ':')"; + default: + return ""; + } +} + +string Lexer::DescribeLastError() { + if (last_token_) { + switch (last_token_[0]) { + case '\t': + return "tabs are not allowed, use spaces"; + } + } + return "lexing error"; +} + +void Lexer::UnreadToken() { + ofs_ = last_token_; +} + +Lexer::Token Lexer::ReadToken() { + const char* p = ofs_; + const char* q; + const char* start; + Lexer::Token token; + for (;;) { + start = p; + +{ + unsigned char yych; + unsigned int yyaccept = 0; + static const unsigned char yybm[] = { + 0, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 0, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 160, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 192, 192, 128, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 128, 128, 128, 128, 128, 128, + 128, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 128, 128, 128, 128, 192, + 128, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + }; + yych = *p; + if (yybm[0+yych] & 32) { + goto yy9; + } + if (yych <= '^') { + if (yych <= ',') { + if (yych <= '\f') { 
+ if (yych <= 0x00) goto yy2; + if (yych == '\n') goto yy6; + goto yy4; + } else { + if (yych <= '\r') goto yy8; + if (yych == '#') goto yy12; + goto yy4; + } + } else { + if (yych <= ':') { + if (yych == '/') goto yy4; + if (yych <= '9') goto yy13; + goto yy16; + } else { + if (yych <= '=') { + if (yych <= '<') goto yy4; + goto yy18; + } else { + if (yych <= '@') goto yy4; + if (yych <= 'Z') goto yy13; + goto yy4; + } + } + } + } else { + if (yych <= 'i') { + if (yych <= 'b') { + if (yych == '`') goto yy4; + if (yych <= 'a') goto yy13; + goto yy20; + } else { + if (yych == 'd') goto yy21; + if (yych <= 'h') goto yy13; + goto yy22; + } + } else { + if (yych <= 'r') { + if (yych == 'p') goto yy23; + if (yych <= 'q') goto yy13; + goto yy24; + } else { + if (yych <= 'z') { + if (yych <= 's') goto yy25; + goto yy13; + } else { + if (yych == '|') goto yy26; + goto yy4; + } + } + } + } +yy2: + ++p; + { token = TEOF; break; } +yy4: + ++p; +yy5: + { token = ERROR; break; } +yy6: + ++p; + { token = NEWLINE; break; } +yy8: + yych = *++p; + if (yych == '\n') goto yy28; + goto yy5; +yy9: + yyaccept = 0; + yych = *(q = ++p); + if (yybm[0+yych] & 32) { + goto yy9; + } + if (yych <= '\f') { + if (yych == '\n') goto yy6; + } else { + if (yych <= '\r') goto yy30; + if (yych == '#') goto yy32; + } +yy11: + { token = INDENT; break; } +yy12: + yyaccept = 1; + yych = *(q = ++p); + if (yych <= 0x00) goto yy5; + goto yy33; +yy13: + yych = *++p; +yy14: + if (yybm[0+yych] & 64) { + goto yy13; + } + { token = IDENT; break; } +yy16: + ++p; + { token = COLON; break; } +yy18: + ++p; + { token = EQUALS; break; } +yy20: + yych = *++p; + if (yych == 'u') goto yy36; + goto yy14; +yy21: + yych = *++p; + if (yych == 'e') goto yy37; + goto yy14; +yy22: + yych = *++p; + if (yych == 'n') goto yy38; + goto yy14; +yy23: + yych = *++p; + if (yych == 'o') goto yy39; + goto yy14; +yy24: + yych = *++p; + if (yych == 'u') goto yy40; + goto yy14; +yy25: + yych = *++p; + if (yych == 'u') goto yy41; + goto yy14; 
+yy26: + yych = *++p; + if (yych == '|') goto yy42; + { token = PIPE; break; } +yy28: + ++p; + { token = NEWLINE; break; } +yy30: + yych = *++p; + if (yych == '\n') goto yy28; +yy31: + p = q; + if (yyaccept == 0) { + goto yy11; + } else { + goto yy5; + } +yy32: + yych = *++p; +yy33: + if (yybm[0+yych] & 128) { + goto yy32; + } + if (yych <= 0x00) goto yy31; + ++p; + { continue; } +yy36: + yych = *++p; + if (yych == 'i') goto yy44; + goto yy14; +yy37: + yych = *++p; + if (yych == 'f') goto yy45; + goto yy14; +yy38: + yych = *++p; + if (yych == 'c') goto yy46; + goto yy14; +yy39: + yych = *++p; + if (yych == 'o') goto yy47; + goto yy14; +yy40: + yych = *++p; + if (yych == 'l') goto yy48; + goto yy14; +yy41: + yych = *++p; + if (yych == 'b') goto yy49; + goto yy14; +yy42: + ++p; + { token = PIPE2; break; } +yy44: + yych = *++p; + if (yych == 'l') goto yy50; + goto yy14; +yy45: + yych = *++p; + if (yych == 'a') goto yy51; + goto yy14; +yy46: + yych = *++p; + if (yych == 'l') goto yy52; + goto yy14; +yy47: + yych = *++p; + if (yych == 'l') goto yy53; + goto yy14; +yy48: + yych = *++p; + if (yych == 'e') goto yy55; + goto yy14; +yy49: + yych = *++p; + if (yych == 'n') goto yy57; + goto yy14; +yy50: + yych = *++p; + if (yych == 'd') goto yy58; + goto yy14; +yy51: + yych = *++p; + if (yych == 'u') goto yy60; + goto yy14; +yy52: + yych = *++p; + if (yych == 'u') goto yy61; + goto yy14; +yy53: + yych = *++p; + if (yybm[0+yych] & 64) { + goto yy13; + } + { token = POOL; break; } +yy55: + yych = *++p; + if (yybm[0+yych] & 64) { + goto yy13; + } + { token = RULE; break; } +yy57: + yych = *++p; + if (yych == 'i') goto yy62; + goto yy14; +yy58: + yych = *++p; + if (yybm[0+yych] & 64) { + goto yy13; + } + { token = BUILD; break; } +yy60: + yych = *++p; + if (yych == 'l') goto yy63; + goto yy14; +yy61: + yych = *++p; + if (yych == 'd') goto yy64; + goto yy14; +yy62: + yych = *++p; + if (yych == 'n') goto yy65; + goto yy14; +yy63: + yych = *++p; + if (yych == 't') goto yy66; + goto 
yy14; +yy64: + yych = *++p; + if (yych == 'e') goto yy68; + goto yy14; +yy65: + yych = *++p; + if (yych == 'j') goto yy70; + goto yy14; +yy66: + yych = *++p; + if (yybm[0+yych] & 64) { + goto yy13; + } + { token = DEFAULT; break; } +yy68: + yych = *++p; + if (yybm[0+yych] & 64) { + goto yy13; + } + { token = INCLUDE; break; } +yy70: + yych = *++p; + if (yych != 'a') goto yy14; + yych = *++p; + if (yybm[0+yych] & 64) { + goto yy13; + } + { token = SUBNINJA; break; } +} + + } + + last_token_ = start; + ofs_ = p; + if (token != NEWLINE && token != TEOF) + EatWhitespace(); + return token; +} + +bool Lexer::PeekToken(Token token) { + Token t = ReadToken(); + if (t == token) + return true; + UnreadToken(); + return false; +} + +void Lexer::EatWhitespace() { + const char* p = ofs_; + const char* q; + for (;;) { + ofs_ = p; + +{ + unsigned char yych; + static const unsigned char yybm[] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 128, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }; + yych = *p; + if (yybm[0+yych] & 128) { + goto yy79; + } + if (yych <= 0x00) goto yy75; + if (yych == '$') goto yy82; + goto yy77; +yy75: + ++p; + { break; } +yy77: + ++p; +yy78: + { break; } +yy79: + yych = *++p; + if (yybm[0+yych] & 128) { + goto yy79; + } + { 
continue; } +yy82: + yych = *(q = ++p); + if (yych == '\n') goto yy83; + if (yych == '\r') goto yy85; + goto yy78; +yy83: + ++p; + { continue; } +yy85: + yych = *++p; + if (yych == '\n') goto yy87; + p = q; + goto yy78; +yy87: + ++p; + { continue; } +} + + } +} + +bool Lexer::ReadIdent(string* out) { + const char* p = ofs_; + const char* start; + for (;;) { + start = p; + +{ + unsigned char yych; + static const unsigned char yybm[] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 128, 128, 0, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 0, 0, 0, 0, 0, 0, + 0, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 0, 0, 0, 0, 128, + 0, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }; + yych = *p; + if (yybm[0+yych] & 128) { + goto yy93; + } + ++p; + { + last_token_ = start; + return false; + } +yy93: + yych = *++p; + if (yybm[0+yych] & 128) { + goto yy93; + } + { + out->assign(start, p - start); + break; + } +} + + } + last_token_ = start; + ofs_ = p; + EatWhitespace(); + return true; +} + +bool Lexer::ReadEvalString(EvalString* eval, bool path, string* err) { + const char* p = ofs_; + const char* q; + const char* start; + for (;;) { + start = p; + +{ + unsigned char yych; + static const unsigned char yybm[] = { + 0, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 0, 16, 16, 
0, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 32, 16, 16, 16, 0, 16, 16, 16, + 16, 16, 16, 16, 16, 208, 144, 16, + 208, 208, 208, 208, 208, 208, 208, 208, + 208, 208, 0, 16, 16, 16, 16, 16, + 16, 208, 208, 208, 208, 208, 208, 208, + 208, 208, 208, 208, 208, 208, 208, 208, + 208, 208, 208, 208, 208, 208, 208, 208, + 208, 208, 208, 16, 16, 16, 16, 208, + 16, 208, 208, 208, 208, 208, 208, 208, + 208, 208, 208, 208, 208, 208, 208, 208, + 208, 208, 208, 208, 208, 208, 208, 208, + 208, 208, 208, 16, 0, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + }; + yych = *p; + if (yybm[0+yych] & 16) { + goto yy100; + } + if (yych <= '\r') { + if (yych <= 0x00) goto yy98; + if (yych <= '\n') goto yy103; + goto yy105; + } else { + if (yych <= ' ') goto yy103; + if (yych <= '$') goto yy107; + goto yy103; + } +yy98: + ++p; + { + last_token_ = start; + return Error("unexpected EOF", err); + } +yy100: + yych = *++p; + if (yybm[0+yych] & 16) { + goto yy100; + } + { + eval->AddText(StringPiece(start, p - start)); + continue; + } +yy103: + ++p; + { + if (path) { + p = start; + break; + } else { + if (*start == '\n') + break; + eval->AddText(StringPiece(start, 1)); + continue; + } + } +yy105: + yych = *++p; + if (yych == '\n') goto yy108; + { + last_token_ = start; + return Error(DescribeLastError(), err); + } +yy107: + yych = *++p; + if (yybm[0+yych] & 64) { + goto yy120; + } + if (yych <= ' ') { + if (yych <= '\f') { + if (yych == '\n') goto yy112; + 
goto yy110; + } else { + if (yych <= '\r') goto yy115; + if (yych <= 0x1F) goto yy110; + goto yy116; + } + } else { + if (yych <= '/') { + if (yych == '$') goto yy118; + goto yy110; + } else { + if (yych <= ':') goto yy123; + if (yych <= '`') goto yy110; + if (yych <= '{') goto yy125; + goto yy110; + } + } +yy108: + ++p; + { + if (path) + p = start; + break; + } +yy110: + ++p; +yy111: + { + last_token_ = start; + return Error("bad $-escape (literal $ must be written as $$)", err); + } +yy112: + yych = *++p; + if (yybm[0+yych] & 32) { + goto yy112; + } + { + continue; + } +yy115: + yych = *++p; + if (yych == '\n') goto yy126; + goto yy111; +yy116: + ++p; + { + eval->AddText(StringPiece(" ", 1)); + continue; + } +yy118: + ++p; + { + eval->AddText(StringPiece("$", 1)); + continue; + } +yy120: + yych = *++p; + if (yybm[0+yych] & 64) { + goto yy120; + } + { + eval->AddSpecial(StringPiece(start + 1, p - start - 1)); + continue; + } +yy123: + ++p; + { + eval->AddText(StringPiece(":", 1)); + continue; + } +yy125: + yych = *(q = ++p); + if (yybm[0+yych] & 128) { + goto yy129; + } + goto yy111; +yy126: + yych = *++p; + if (yych == ' ') goto yy126; + { + continue; + } +yy129: + yych = *++p; + if (yybm[0+yych] & 128) { + goto yy129; + } + if (yych == '}') goto yy132; + p = q; + goto yy111; +yy132: + ++p; + { + eval->AddSpecial(StringPiece(start + 2, p - start - 3)); + continue; + } +} + + } + last_token_ = start; + ofs_ = p; + if (path) + EatWhitespace(); + // Non-path strings end in newlines, so there's no whitespace to eat. + return true; +} diff --git a/test/real_world/ninja_lexer.ib.re b/test/real_world/ninja_lexer.ib.re new file mode 100644 index 00000000..c1fb8227 --- /dev/null +++ b/test/real_world/ninja_lexer.ib.re @@ -0,0 +1,278 @@ +// Copyright 2011 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lexer.h" + +#include + +#include "eval_env.h" +#include "util.h" + +bool Lexer::Error(const string& message, string* err) { + // Compute line/column. + int line = 1; + const char* line_start = input_.str_; + for (const char* p = input_.str_; p < last_token_; ++p) { + if (*p == '\n') { + ++line; + line_start = p + 1; + } + } + int col = last_token_ ? (int)(last_token_ - line_start) : 0; + + char buf[1024]; + snprintf(buf, sizeof(buf), "%s:%d: ", filename_.AsString().c_str(), line); + *err = buf; + *err += message + "\n"; + + // Add some context to the message. 
+ const int kTruncateColumn = 72; + if (col > 0 && col < kTruncateColumn) { + int len; + bool truncated = true; + for (len = 0; len < kTruncateColumn; ++len) { + if (line_start[len] == 0 || line_start[len] == '\n') { + truncated = false; + break; + } + } + *err += string(line_start, len); + if (truncated) + *err += "..."; + *err += "\n"; + *err += string(col, ' '); + *err += "^ near here"; + } + + return false; +} + +Lexer::Lexer(const char* input) { + Start("input", input); +} + +void Lexer::Start(StringPiece filename, StringPiece input) { + filename_ = filename; + input_ = input; + ofs_ = input_.str_; + last_token_ = NULL; +} + +const char* Lexer::TokenName(Token t) { + switch (t) { + case ERROR: return "lexing error"; + case BUILD: return "'build'"; + case COLON: return "':'"; + case DEFAULT: return "'default'"; + case EQUALS: return "'='"; + case IDENT: return "identifier"; + case INCLUDE: return "'include'"; + case INDENT: return "indent"; + case NEWLINE: return "newline"; + case PIPE2: return "'||'"; + case PIPE: return "'|'"; + case POOL: return "'pool'"; + case RULE: return "'rule'"; + case SUBNINJA: return "'subninja'"; + case TEOF: return "eof"; + } + return NULL; // not reached +} + +const char* Lexer::TokenErrorHint(Token expected) { + switch (expected) { + case COLON: + return " ($ also escapes ':')"; + default: + return ""; + } +} + +string Lexer::DescribeLastError() { + if (last_token_) { + switch (last_token_[0]) { + case '\t': + return "tabs are not allowed, use spaces"; + } + } + return "lexing error"; +} + +void Lexer::UnreadToken() { + ofs_ = last_token_; +} + +Lexer::Token Lexer::ReadToken() { + const char* p = ofs_; + const char* q; + const char* start; + Lexer::Token token; + for (;;) { + start = p; + /*!re2c + re2c:define:YYCTYPE = "unsigned char"; + re2c:define:YYCURSOR = p; + re2c:define:YYMARKER = q; + re2c:yyfill:enable = 0; + + nul = "\000"; + simple_varname = [a-zA-Z0-9_-]+; + varname = [a-zA-Z0-9_.-]+; + + [ ]*"#"[^\000\n]*"\n" { 
continue; } + [ ]*"\r\n" { token = NEWLINE; break; } + [ ]*"\n" { token = NEWLINE; break; } + [ ]+ { token = INDENT; break; } + "build" { token = BUILD; break; } + "pool" { token = POOL; break; } + "rule" { token = RULE; break; } + "default" { token = DEFAULT; break; } + "=" { token = EQUALS; break; } + ":" { token = COLON; break; } + "||" { token = PIPE2; break; } + "|" { token = PIPE; break; } + "include" { token = INCLUDE; break; } + "subninja" { token = SUBNINJA; break; } + varname { token = IDENT; break; } + nul { token = TEOF; break; } + [^] { token = ERROR; break; } + */ + } + + last_token_ = start; + ofs_ = p; + if (token != NEWLINE && token != TEOF) + EatWhitespace(); + return token; +} + +bool Lexer::PeekToken(Token token) { + Token t = ReadToken(); + if (t == token) + return true; + UnreadToken(); + return false; +} + +void Lexer::EatWhitespace() { + const char* p = ofs_; + const char* q; + for (;;) { + ofs_ = p; + /*!re2c + [ ]+ { continue; } + "$\r\n" { continue; } + "$\n" { continue; } + nul { break; } + [^] { break; } + */ + } +} + +bool Lexer::ReadIdent(string* out) { + const char* p = ofs_; + const char* start; + for (;;) { + start = p; + /*!re2c + varname { + out->assign(start, p - start); + break; + } + [^] { + last_token_ = start; + return false; + } + */ + } + last_token_ = start; + ofs_ = p; + EatWhitespace(); + return true; +} + +bool Lexer::ReadEvalString(EvalString* eval, bool path, string* err) { + const char* p = ofs_; + const char* q; + const char* start; + for (;;) { + start = p; + /*!re2c + [^$ :\r\n|\000]+ { + eval->AddText(StringPiece(start, p - start)); + continue; + } + "\r\n" { + if (path) + p = start; + break; + } + [ :|\n] { + if (path) { + p = start; + break; + } else { + if (*start == '\n') + break; + eval->AddText(StringPiece(start, 1)); + continue; + } + } + "$$" { + eval->AddText(StringPiece("$", 1)); + continue; + } + "$ " { + eval->AddText(StringPiece(" ", 1)); + continue; + } + "$\r\n"[ ]* { + continue; + } + "$\n"[ ]* { 
+ continue; + } + "${"varname"}" { + eval->AddSpecial(StringPiece(start + 2, p - start - 3)); + continue; + } + "$"simple_varname { + eval->AddSpecial(StringPiece(start + 1, p - start - 1)); + continue; + } + "$:" { + eval->AddText(StringPiece(":", 1)); + continue; + } + "$". { + last_token_ = start; + return Error("bad $-escape (literal $ must be written as $$)", err); + } + nul { + last_token_ = start; + return Error("unexpected EOF", err); + } + [^] { + last_token_ = start; + return Error(DescribeLastError(), err); + } + */ + } + last_token_ = start; + ofs_ = p; + if (path) + EatWhitespace(); + // Non-path strings end in newlines, so there's no whitespace to eat. + return true; +}