[MC] - Don't assert when non-English characters are used.

author George Rimar <grimar@accesssoftek.com>

Wed, 4 Oct 2017 08:50:08 +0000 (08:50 +0000)

committer George Rimar <grimar@accesssoftek.com>

Wed, 4 Oct 2017 08:50:08 +0000 (08:50 +0000)
author George Rimar <grimar@accesssoftek.com>
Wed, 4 Oct 2017 08:50:08 +0000 (08:50 +0000)
committer George Rimar <grimar@accesssoftek.com>
Wed, 4 Oct 2017 08:50:08 +0000 (08:50 +0000)
diff --git a/include/llvm/ADT/StringExtras.h b/include/llvm/ADT/StringExtras.h

index cc32bf43f29c8d707bcca1e9233953004a4872f2..a9a8c87d0d7a38687d7a5250289ab14c789fe9e1 100644 (file)
--- a/include/llvm/ADT/StringExtras.h
+++ b/include/llvm/ADT/StringExtras.h
@@ -59,6 +59,21 @@ static inline unsigned hexDigitValue(char C) {
    return -1U;
  }
  
+/// Checks if character \p C is one of the 10 decimal digits.
+static inline bool isDigit(char C) { return C >= '0' && C <= '9'; }
+
+/// Checks if character \p C is a hexadecimal numeric character.
+static inline bool isHexDigit(char C) { return hexDigitValue(C) != -1U; }
+
+/// Checks if character \p C is a valid letter as classified by "C" locale.
+static inline bool isAlpha(char C) {
+  return ('a' <= C && C <= 'z') || ('A' <= C && C <= 'Z');
+}
+
+/// Checks whether character \p C is either a decimal digit or an uppercase or
+/// lowercase letter as classified by "C" locale.
+static inline bool isAlnum(char C) { return isAlpha(C) || isDigit(C); }
+
  static inline std::string utohexstr(uint64_t X, bool LowerCase = false) {
    char Buffer[17];
    char *BufPtr = std::end(Buffer);
diff --git a/lib/MC/MCParser/AsmLexer.cpp b/lib/MC/MCParser/AsmLexer.cpp

index 2b963607b8374ba92f31955fa8d4db93a1dd65cc..e9123b9d71420ba491d8da1bf821b5bc587bdcbe 100644 (file)
--- a/lib/MC/MCParser/AsmLexer.cpp
+++ b/lib/MC/MCParser/AsmLexer.cpp
@@ -14,6 +14,7 @@
  #include "llvm/MC/MCParser/AsmLexer.h"
  #include "llvm/ADT/APInt.h"
  #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringExtras.h"
  #include "llvm/ADT/StringRef.h"
  #include "llvm/ADT/StringSwitch.h"
  #include "llvm/MC/MCAsmInfo.h"
@@ -68,7 +69,7 @@ int AsmLexer::getNextChar() {
  /// consumed.
  AsmToken AsmLexer::LexFloatLiteral() {
    // Skip the fractional digit sequence.
-  while (isdigit(*CurPtr))
+  while (isDigit(*CurPtr))
      ++CurPtr;
  
    // Check for exponent; we intentionally accept a slighlty wider set of
@@ -78,7 +79,7 @@ AsmToken AsmLexer::LexFloatLiteral() {
      ++CurPtr;
      if (*CurPtr == '-' || *CurPtr == '+')
        ++CurPtr;
-    while (isdigit(*CurPtr))
+    while (isDigit(*CurPtr))
        ++CurPtr;
    }
  
@@ -102,7 +103,7 @@ AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) {
      ++CurPtr;
  
      const char *FracStart = CurPtr;
-    while (isxdigit(*CurPtr))
+    while (isHexDigit(*CurPtr))
        ++CurPtr;
  
      NoFracDigits = CurPtr == FracStart;
@@ -123,7 +124,7 @@ AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) {
  
    // N.b. exponent digits are *not* hex
    const char *ExpStart = CurPtr;
-  while (isdigit(*CurPtr))
+  while (isDigit(*CurPtr))
      ++CurPtr;
  
    if (CurPtr == ExpStart)
@@ -135,15 +136,15 @@ AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) {
  
  /// LexIdentifier: [a-zA-Z_.][a-zA-Z0-9_$.@?]*
  static bool IsIdentifierChar(char c, bool AllowAt) {
-  return isalnum(c) || c == '_' || c == '$' || c == '.' ||
+  return isAlnum(c) || c == '_' || c == '$' || c == '.' ||
           (c == '@' && AllowAt) || c == '?';
  }
  
  AsmToken AsmLexer::LexIdentifier() {
    // Check for floating point literals.
-  if (CurPtr[-1] == '.' && isdigit(*CurPtr)) {
+  if (CurPtr[-1] == '.' && isDigit(*CurPtr)) {
      // Disambiguate a .1243foo identifier from a floating literal.
-    while (isdigit(*CurPtr))
+    while (isDigit(*CurPtr))
        ++CurPtr;
      if (*CurPtr == 'e' || *CurPtr == 'E' ||
          !IsIdentifierChar(*CurPtr, AllowAtInIdentifier))
@@ -244,9 +245,9 @@ static unsigned doLookAhead(const char *&CurPtr, unsigned DefaultRadix) {
    const char *FirstHex = nullptr;
    const char *LookAhead = CurPtr;
    while (true) {
-    if (isdigit(*LookAhead)) {
+    if (isDigit(*LookAhead)) {
        ++LookAhead;
-    } else if (isxdigit(*LookAhead)) {
+    } else if (isHexDigit(*LookAhead)) {
        if (!FirstHex)
          FirstHex = LookAhead;
        ++LookAhead;
@@ -282,7 +283,7 @@ AsmToken AsmLexer::LexDigit() {
      const char *FirstNonBinary = (CurPtr[-1] != '0' && CurPtr[-1] != '1') ?
                                     CurPtr - 1 : nullptr;
      const char *OldCurPtr = CurPtr;
-    while (isxdigit(*CurPtr)) {
+    while (isHexDigit(*CurPtr)) {
        if (*CurPtr != '0' && *CurPtr != '1' && !FirstNonBinary)
          FirstNonBinary = CurPtr;
        ++CurPtr;
@@ -346,7 +347,7 @@ AsmToken AsmLexer::LexDigit() {
    if (!IsParsingMSInlineAsm && ((*CurPtr == 'b') || (*CurPtr == 'B'))) {
      ++CurPtr;
      // See if we actually have "0b" as part of something like "jmp 0b\n"
-    if (!isdigit(CurPtr[0])) {
+    if (!isDigit(CurPtr[0])) {
        --CurPtr;
        StringRef Result(TokStart, CurPtr - TokStart);
        return AsmToken(AsmToken::Integer, Result, 0);
@@ -375,7 +376,7 @@ AsmToken AsmLexer::LexDigit() {
    if ((*CurPtr == 'x') || (*CurPtr == 'X')) {
      ++CurPtr;
      const char *NumStart = CurPtr;
-    while (isxdigit(CurPtr[0]))
+    while (isHexDigit(CurPtr[0]))
        ++CurPtr;
  
      // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be
diff --git a/test/MC/AsmParser/non-english-characters.s b/test/MC/AsmParser/non-english-characters.s

new file mode 100644 (file)

index 0000000..12d78ee
--- /dev/null
+++ b/test/MC/AsmParser/non-english-characters.s
@@ -0,0 +1,14 @@
+# RUN: llvm-mc -triple i386-linux-gnu -filetype=obj -o %t %s
+# RUN: llvm-readobj %t | FileCheck %s
+# CHECK: Format: ELF32-i386
+
+# 0bÑ
+# 0xÑ
+# .Ñ4
+# .XÑ
+# .1Ñ
+# .1eÑ
+# 0x.Ñ
+# 0x0pÑ
+.intel_syntax
+# 1Ñ
author	George Rimar <grimar@accesssoftek.com>
	Wed, 4 Oct 2017 08:50:08 +0000 (08:50 +0000)
committer	George Rimar <grimar@accesssoftek.com>
	Wed, 4 Oct 2017 08:50:08 +0000 (08:50 +0000)
include/llvm/ADT/StringExtras.h		patch \| blob \| history
lib/MC/MCParser/AsmLexer.cpp		patch \| blob \| history
test/MC/AsmParser/non-english-characters.s	[new file with mode: 0644]	patch \| blob