From: Eric Christopher Date: Sat, 9 Apr 2011 00:01:04 +0000 (+0000) Subject: Eat the UTF-8 BOM at the beginning of a file since it's ignored anyhow. X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=156119df1d076b63609618976281961283f871db;p=clang Eat the UTF-8 BOM at the beginning of a file since it's ignored anyhow. Nom Nom Nom. Patch by Anton Korobeynikov! git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@129174 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Basic/SourceManager.cpp b/lib/Basic/SourceManager.cpp index b6939ec7d5..8262feba7f 100644 --- a/lib/Basic/SourceManager.cpp +++ b/lib/Basic/SourceManager.cpp @@ -126,13 +126,12 @@ const llvm::MemoryBuffer *ContentCache::getBuffer(Diagnostic &Diag, if (Invalid) *Invalid = true; return Buffer.getPointer(); } - + // If the buffer is valid, check to see if it has a UTF Byte Order Mark - // (BOM). We only support UTF-8 without a BOM right now. See + // (BOM). We only support UTF-8 with and without a BOM right now. See // http://en.wikipedia.org/wiki/Byte_order_mark for more information. 
llvm::StringRef BufStr = Buffer.getPointer()->getBuffer(); - const char *BOM = llvm::StringSwitch<const char *>(BufStr) - .StartsWith("\xEF\xBB\xBF", "UTF-8") + const char *InvalidBOM = llvm::StringSwitch<const char *>(BufStr) .StartsWith("\xFE\xFF", "UTF-16 (BE)") .StartsWith("\xFF\xFE", "UTF-16 (LE)") .StartsWith("\x00\x00\xFE\xFF", "UTF-32 (BE)") @@ -145,9 +144,9 @@ const llvm::MemoryBuffer *ContentCache::getBuffer(Diagnostic &Diag, .StartsWith("\x84\x31\x95\x33", "GB-18030") .Default(0); - if (BOM) { + if (InvalidBOM) { Diag.Report(Loc, diag::err_unsupported_bom) - << BOM << ContentsEntry->getName(); + << InvalidBOM << ContentsEntry->getName(); Buffer.setInt(Buffer.getInt() | InvalidFlag); } diff --git a/lib/Lex/Lexer.cpp b/lib/Lex/Lexer.cpp index 34b16c7477..ea2a2deb0f 100644 --- a/lib/Lex/Lexer.cpp +++ b/lib/Lex/Lexer.cpp @@ -71,9 +71,22 @@ void Lexer::InitLexer(const char *BufStart, const char *BufPtr, "We assume that the input buffer has a null character at the end" " to simplify lexing!"); + // Check whether we have a BOM in the beginning of the buffer. If yes - act + // accordingly. Right now we support only UTF-8 with and without BOM, so, just + // skip the UTF-8 BOM if it's present. + if (BufferStart == BufferPtr) { + // Determine the size of the BOM. + size_t BOMLength = llvm::StringSwitch<size_t>(BufferStart) + .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM + .Default(0); + + // Skip the BOM. + BufferPtr += BOMLength; + } + Is_PragmaLexer = false; IsInConflictMarker = false; - + // Start of the file is a start of line. IsAtStartOfLine = true;