From: George Rimar <grimar@accesssoftek.com>
Date: Wed, 4 Oct 2017 08:50:08 +0000 (+0000)
Subject: [MC] - Don't assert when non-english characters are used.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=e584a7b630af211b67ce556f15d4812baa803eff;p=llvm

[MC] - Don't assert when non-english characters are used.

I found that llvm-mc does not like non-english characters even in comments,
which it tries to tokenize.

Problem happens because of functions like isdigit(), isalnum() which takes
int argument and expects it is not negative.
But at the same time MCParser uses char* to store input buffer poiner, char has signed value,
so it is possible to pass negative value to one of functions from above and
that triggers an assert.
Testcase for demonstration is provided.

To fix the issue helper functions were introduced in StringExtras.h

Differential revision: https://reviews.llvm.org/D38461

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@314883 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/include/llvm/ADT/StringExtras.h b/include/llvm/ADT/StringExtras.h
index cc32bf43f29..a9a8c87d0d7 100644
--- a/include/llvm/ADT/StringExtras.h
+++ b/include/llvm/ADT/StringExtras.h
@@ -59,6 +59,21 @@ static inline unsigned hexDigitValue(char C) {
   return -1U;
 }
 
+/// Checks if character \p C is one of the 10 decimal digits.
+static inline bool isDigit(char C) { return C >= '0' && C <= '9'; }
+
+/// Checks if character \p C is a hexadecimal numeric character.
+static inline bool isHexDigit(char C) { return hexDigitValue(C) != -1U; }
+
+/// Checks if character \p C is a valid letter as classified by "C" locale.
+static inline bool isAlpha(char C) {
+  return ('a' <= C && C <= 'z') || ('A' <= C && C <= 'Z');
+}
+
+/// Checks whether character \p C is either a decimal digit or an uppercase or
+/// lowercase letter as classified by "C" locale.
+static inline bool isAlnum(char C) { return isAlpha(C) || isDigit(C); }
+
 static inline std::string utohexstr(uint64_t X, bool LowerCase = false) {
   char Buffer[17];
   char *BufPtr = std::end(Buffer);
diff --git a/lib/MC/MCParser/AsmLexer.cpp b/lib/MC/MCParser/AsmLexer.cpp
index 2b963607b83..e9123b9d714 100644
--- a/lib/MC/MCParser/AsmLexer.cpp
+++ b/lib/MC/MCParser/AsmLexer.cpp
@@ -14,6 +14,7 @@
 #include "llvm/MC/MCParser/AsmLexer.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/MC/MCAsmInfo.h"
@@ -68,7 +69,7 @@ int AsmLexer::getNextChar() {
 /// consumed.
 AsmToken AsmLexer::LexFloatLiteral() {
   // Skip the fractional digit sequence.
-  while (isdigit(*CurPtr))
+  while (isDigit(*CurPtr))
     ++CurPtr;
 
   // Check for exponent; we intentionally accept a slighlty wider set of
@@ -78,7 +79,7 @@ AsmToken AsmLexer::LexFloatLiteral() {
     ++CurPtr;
     if (*CurPtr == '-' || *CurPtr == '+')
       ++CurPtr;
-    while (isdigit(*CurPtr))
+    while (isDigit(*CurPtr))
       ++CurPtr;
   }
 
@@ -102,7 +103,7 @@ AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) {
     ++CurPtr;
 
     const char *FracStart = CurPtr;
-    while (isxdigit(*CurPtr))
+    while (isHexDigit(*CurPtr))
       ++CurPtr;
 
     NoFracDigits = CurPtr == FracStart;
@@ -123,7 +124,7 @@ AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) {
 
   // N.b. exponent digits are *not* hex
   const char *ExpStart = CurPtr;
-  while (isdigit(*CurPtr))
+  while (isDigit(*CurPtr))
     ++CurPtr;
 
   if (CurPtr == ExpStart)
@@ -135,15 +136,15 @@ AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) {
 
 /// LexIdentifier: [a-zA-Z_.][a-zA-Z0-9_$.@?]*
 static bool IsIdentifierChar(char c, bool AllowAt) {
-  return isalnum(c) || c == '_' || c == '$' || c == '.' ||
+  return isAlnum(c) || c == '_' || c == '$' || c == '.' ||
          (c == '@' && AllowAt) || c == '?';
 }
 
 AsmToken AsmLexer::LexIdentifier() {
   // Check for floating point literals.
-  if (CurPtr[-1] == '.' && isdigit(*CurPtr)) {
+  if (CurPtr[-1] == '.' && isDigit(*CurPtr)) {
     // Disambiguate a .1243foo identifier from a floating literal.
-    while (isdigit(*CurPtr))
+    while (isDigit(*CurPtr))
       ++CurPtr;
     if (*CurPtr == 'e' || *CurPtr == 'E' ||
         !IsIdentifierChar(*CurPtr, AllowAtInIdentifier))
@@ -244,9 +245,9 @@ static unsigned doLookAhead(const char *&CurPtr, unsigned DefaultRadix) {
   const char *FirstHex = nullptr;
   const char *LookAhead = CurPtr;
   while (true) {
-    if (isdigit(*LookAhead)) {
+    if (isDigit(*LookAhead)) {
       ++LookAhead;
-    } else if (isxdigit(*LookAhead)) {
+    } else if (isHexDigit(*LookAhead)) {
       if (!FirstHex)
         FirstHex = LookAhead;
       ++LookAhead;
@@ -282,7 +283,7 @@ AsmToken AsmLexer::LexDigit() {
     const char *FirstNonBinary = (CurPtr[-1] != '0' && CurPtr[-1] != '1') ?
                                    CurPtr - 1 : nullptr;
     const char *OldCurPtr = CurPtr;
-    while (isxdigit(*CurPtr)) {
+    while (isHexDigit(*CurPtr)) {
       if (*CurPtr != '0' && *CurPtr != '1' && !FirstNonBinary)
         FirstNonBinary = CurPtr;
       ++CurPtr;
@@ -346,7 +347,7 @@ AsmToken AsmLexer::LexDigit() {
   if (!IsParsingMSInlineAsm && ((*CurPtr == 'b') || (*CurPtr == 'B'))) {
     ++CurPtr;
     // See if we actually have "0b" as part of something like "jmp 0b\n"
-    if (!isdigit(CurPtr[0])) {
+    if (!isDigit(CurPtr[0])) {
       --CurPtr;
       StringRef Result(TokStart, CurPtr - TokStart);
       return AsmToken(AsmToken::Integer, Result, 0);
@@ -375,7 +376,7 @@ AsmToken AsmLexer::LexDigit() {
   if ((*CurPtr == 'x') || (*CurPtr == 'X')) {
     ++CurPtr;
     const char *NumStart = CurPtr;
-    while (isxdigit(CurPtr[0]))
+    while (isHexDigit(CurPtr[0]))
       ++CurPtr;
 
     // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be
diff --git a/test/MC/AsmParser/non-english-characters.s b/test/MC/AsmParser/non-english-characters.s
new file mode 100644
index 00000000000..12d78ee83be
--- /dev/null
+++ b/test/MC/AsmParser/non-english-characters.s
@@ -0,0 +1,14 @@
+# RUN: llvm-mc -triple i386-linux-gnu -filetype=obj -o %t %s
+# RUN: llvm-readobj %t | FileCheck %s
+# CHECK: Format: ELF32-i386
+
+# 0bÑ
+# 0xÑ
+# .Ñ4
+# .XÑ
+# .1Ñ
+# .1eÑ
+# 0x.Ñ
+# 0x0pÑ
+.intel_syntax
+# 1Ñ