Eat the UTF-8 BOM at the beginning of a file since it's ignored anyhow.

author Eric Christopher <echristo@apple.com>

Sat, 9 Apr 2011 00:01:04 +0000 (00:01 +0000)

committer Eric Christopher <echristo@apple.com>

Sat, 9 Apr 2011 00:01:04 +0000 (00:01 +0000)
author Eric Christopher <echristo@apple.com>
Sat, 9 Apr 2011 00:01:04 +0000 (00:01 +0000)
committer Eric Christopher <echristo@apple.com>
Sat, 9 Apr 2011 00:01:04 +0000 (00:01 +0000)
diff --git a/lib/Basic/SourceManager.cpp b/lib/Basic/SourceManager.cpp

index b6939ec7d55512cb320c1dfc1fb9d0b861bf0b58..8262feba7f7d44053029285330585783402fbe4b 100644 (file)
--- a/lib/Basic/SourceManager.cpp
+++ b/lib/Basic/SourceManager.cpp
@@ -126,13 +126,12 @@ const llvm::MemoryBuffer *ContentCache::getBuffer(Diagnostic &Diag,
      if (Invalid) *Invalid = true;
      return Buffer.getPointer();
    }
-  
+
    // If the buffer is valid, check to see if it has a UTF Byte Order Mark
-  // (BOM).  We only support UTF-8 without a BOM right now.  See
+  // (BOM).  We only support UTF-8, with or without a BOM, right now.  See
    // http://en.wikipedia.org/wiki/Byte_order_mark for more information.
    llvm::StringRef BufStr = Buffer.getPointer()->getBuffer();
-  const char *BOM = llvm::StringSwitch<const char *>(BufStr)
-    .StartsWith("\xEF\xBB\xBF", "UTF-8")
+  const char *InvalidBOM = llvm::StringSwitch<const char *>(BufStr)
      .StartsWith("\xFE\xFF", "UTF-16 (BE)")
      .StartsWith("\xFF\xFE", "UTF-16 (LE)")
      .StartsWith("\x00\x00\xFE\xFF", "UTF-32 (BE)")
@@ -145,9 +144,9 @@ const llvm::MemoryBuffer *ContentCache::getBuffer(Diagnostic &Diag,
      .StartsWith("\x84\x31\x95\x33", "GB-18030")
      .Default(0);
  
-  if (BOM) {
+  if (InvalidBOM) {
      Diag.Report(Loc, diag::err_unsupported_bom)
-      << BOM << ContentsEntry->getName();
+      << InvalidBOM << ContentsEntry->getName();
      Buffer.setInt(Buffer.getInt() | InvalidFlag);
    }
    
diff --git a/lib/Lex/Lexer.cpp b/lib/Lex/Lexer.cpp

index 34b16c747773aad1f20935c7b4edaf67e62768e7..ea2a2deb0f5a287f3053eb08cd2d393f5f125591 100644 (file)
--- a/lib/Lex/Lexer.cpp
+++ b/lib/Lex/Lexer.cpp
@@ -71,9 +71,22 @@ void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
           "We assume that the input buffer has a null character at the end"
           " to simplify lexing!");
  
+  // Check whether the buffer begins with a BOM. Only UTF-8 (with or without a
+  // BOM) is supported right now, so simply skip the UTF-8 BOM when it is
+  // present.
+  if (BufferStart == BufferPtr) {
+    // Determine the size of the BOM.
+    size_t BOMLength = llvm::StringSwitch<size_t>(BufferStart)
+      .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM
+      .Default(0);
+
+    // Skip the BOM.
+    BufferPtr += BOMLength;
+  }
+
    Is_PragmaLexer = false;
    IsInConflictMarker = false;
-  
+
    // Start of the file is a start of line.
    IsAtStartOfLine = true;
author	Eric Christopher <echristo@apple.com>
	Sat, 9 Apr 2011 00:01:04 +0000 (00:01 +0000)
committer	Eric Christopher <echristo@apple.com>
	Sat, 9 Apr 2011 00:01:04 +0000 (00:01 +0000)
lib/Basic/SourceManager.cpp		patch \| blob \| history
lib/Lex/Lexer.cpp		patch \| blob \| history