cmListFileLexer: Allow a leading UTF-8 Byte-Order-Mark (#11137)

Teach the lexer to read a UTF-8, UTF-16 BE/LE, or UTF-32 BE/LE
Byte-Order-Mark from the start of a file if any is present.  Report an
error on files using UTF-16 or UTF-32 and accept a UTF-8 or missing BOM.
This commit is contained in:
Brad King 2013-10-14 15:13:11 -04:00
parent 56457837e2
commit dbd933365e
19 changed files with 168 additions and 10 deletions

View File

@ -57,13 +57,26 @@ cmListFileParser::~cmListFileParser()
bool cmListFileParser::ParseFile() bool cmListFileParser::ParseFile()
{ {
// Open the file. // Open the file.
if(!cmListFileLexer_SetFileName(this->Lexer, this->FileName)) cmListFileLexer_BOM bom;
if(!cmListFileLexer_SetFileName(this->Lexer, this->FileName, &bom))
{ {
cmSystemTools::Error("cmListFileCache: error can not open file ", cmSystemTools::Error("cmListFileCache: error can not open file ",
this->FileName); this->FileName);
return false; return false;
} }
// Verify the Byte-Order-Mark, if any.
if(bom != cmListFileLexer_BOM_None &&
bom != cmListFileLexer_BOM_UTF8)
{
cmListFileLexer_SetFileName(this->Lexer, 0, 0);
cmOStringStream m;
m << "File\n " << this->FileName << "\n"
<< "starts with a Byte-Order-Mark that is not UTF-8.";
this->Makefile->IssueMessage(cmake::FATAL_ERROR, m.str());
return false;
}
// Use a simple recursive-descent parser to process the token // Use a simple recursive-descent parser to process the token
// stream. // stream.
bool haveNewline = true; bool haveNewline = true;

View File

@ -2307,19 +2307,68 @@ cmListFileLexer* cmListFileLexer_New()
/*--------------------------------------------------------------------------*/ /*--------------------------------------------------------------------------*/
/* Destroy a lexer.  The lexer's current input is reset first by calling
   cmListFileLexer_SetFileName with a null file name (nothing is opened,
   so no BOM output is requested either), and then the lexer object itself
   is released with free().  The pointer must not be used afterwards. */
void cmListFileLexer_Delete(cmListFileLexer* lexer) void cmListFileLexer_Delete(cmListFileLexer* lexer)
{ {
cmListFileLexer_SetFileName(lexer, 0); cmListFileLexer_SetFileName(lexer, 0, 0);
free(lexer); free(lexer);
} }
/*--------------------------------------------------------------------------*/ /*--------------------------------------------------------------------------*/
int cmListFileLexer_SetFileName(cmListFileLexer* lexer, const char* name) static cmListFileLexer_BOM cmListFileLexer_ReadBOM(FILE* f)
{
unsigned char b[2];
if(fread(b, 1, 2, f) == 2)
{
if(b[0] == 0xEF && b[1] == 0xBB)
{
if(fread(b, 1, 1, f) == 1 && b[0] == 0xBF)
{
return cmListFileLexer_BOM_UTF8;
}
}
else if(b[0] == 0xFE && b[1] == 0xFF)
{
/* UTF-16 BE */
return cmListFileLexer_BOM_UTF16BE;
}
else if(b[0] == 0 && b[1] == 0)
{
if(fread(b, 1, 2, f) == 2 && b[0] == 0xFE && b[1] == 0xFF)
{
return cmListFileLexer_BOM_UTF32BE;
}
}
else if(b[0] == 0xFF && b[1] == 0xFE)
{
fpos_t p;
fgetpos(f, &p);
if(fread(b, 1, 2, f) == 2 && b[0] == 0 && b[1] == 0)
{
return cmListFileLexer_BOM_UTF32LE;
}
fsetpos(f, &p);
return cmListFileLexer_BOM_UTF16LE;
}
}
rewind(f);
return cmListFileLexer_BOM_None;
}
/*--------------------------------------------------------------------------*/
int cmListFileLexer_SetFileName(cmListFileLexer* lexer, const char* name,
cmListFileLexer_BOM* bom)
{ {
int result = 1; int result = 1;
cmListFileLexerDestroy(lexer); cmListFileLexerDestroy(lexer);
if(name) if(name)
{ {
lexer->file = fopen(name, "r"); lexer->file = fopen(name, "r");
if(!lexer->file) if(lexer->file)
{
if(bom)
{
*bom = cmListFileLexer_ReadBOM(lexer->file);
}
}
else
{ {
result = 0; result = 0;
} }
@ -2365,7 +2414,7 @@ cmListFileLexer_Token* cmListFileLexer_Scan(cmListFileLexer* lexer)
} }
else else
{ {
cmListFileLexer_SetFileName(lexer, 0); cmListFileLexer_SetFileName(lexer, 0, 0);
return 0; return 0;
} }
} }

View File

@ -36,6 +36,17 @@ struct cmListFileLexer_Token_s
int column; int column;
}; };
/* Kinds of Byte-Order-Mark that may be reported for the start of an
   input file.  */
typedef enum cmListFileLexer_BOM_e
{
  cmListFileLexer_BOM_None,    /* no Byte-Order-Mark */
  cmListFileLexer_BOM_UTF8,    /* UTF-8 */
  cmListFileLexer_BOM_UTF16BE, /* UTF-16, big-endian */
  cmListFileLexer_BOM_UTF16LE, /* UTF-16, little-endian */
  cmListFileLexer_BOM_UTF32BE, /* UTF-32, big-endian */
  cmListFileLexer_BOM_UTF32LE  /* UTF-32, little-endian */
} cmListFileLexer_BOM;
typedef struct cmListFileLexer_s cmListFileLexer; typedef struct cmListFileLexer_s cmListFileLexer;
#ifdef __cplusplus #ifdef __cplusplus
@ -44,7 +55,8 @@ extern "C"
#endif #endif
cmListFileLexer* cmListFileLexer_New(); cmListFileLexer* cmListFileLexer_New();
int cmListFileLexer_SetFileName(cmListFileLexer*, const char*); int cmListFileLexer_SetFileName(cmListFileLexer*, const char*,
cmListFileLexer_BOM* bom);
int cmListFileLexer_SetString(cmListFileLexer*, const char*); int cmListFileLexer_SetString(cmListFileLexer*, const char*);
cmListFileLexer_Token* cmListFileLexer_Scan(cmListFileLexer*); cmListFileLexer_Token* cmListFileLexer_Scan(cmListFileLexer*);
long cmListFileLexer_GetCurrentLine(cmListFileLexer*); long cmListFileLexer_GetCurrentLine(cmListFileLexer*);

View File

@ -328,19 +328,68 @@ cmListFileLexer* cmListFileLexer_New()
/*--------------------------------------------------------------------------*/ /*--------------------------------------------------------------------------*/
/* Destroy a lexer.  The lexer's current input is reset first by calling
   cmListFileLexer_SetFileName with a null file name (nothing is opened,
   so no BOM output is requested either), and then the lexer object itself
   is released with free().  The pointer must not be used afterwards. */
void cmListFileLexer_Delete(cmListFileLexer* lexer) void cmListFileLexer_Delete(cmListFileLexer* lexer)
{ {
cmListFileLexer_SetFileName(lexer, 0); cmListFileLexer_SetFileName(lexer, 0, 0);
free(lexer); free(lexer);
} }
/*--------------------------------------------------------------------------*/ /*--------------------------------------------------------------------------*/
int cmListFileLexer_SetFileName(cmListFileLexer* lexer, const char* name) static cmListFileLexer_BOM cmListFileLexer_ReadBOM(FILE* f)
{
unsigned char b[2];
if(fread(b, 1, 2, f) == 2)
{
if(b[0] == 0xEF && b[1] == 0xBB)
{
if(fread(b, 1, 1, f) == 1 && b[0] == 0xBF)
{
return cmListFileLexer_BOM_UTF8;
}
}
else if(b[0] == 0xFE && b[1] == 0xFF)
{
/* UTF-16 BE */
return cmListFileLexer_BOM_UTF16BE;
}
else if(b[0] == 0 && b[1] == 0)
{
if(fread(b, 1, 2, f) == 2 && b[0] == 0xFE && b[1] == 0xFF)
{
return cmListFileLexer_BOM_UTF32BE;
}
}
else if(b[0] == 0xFF && b[1] == 0xFE)
{
fpos_t p;
fgetpos(f, &p);
if(fread(b, 1, 2, f) == 2 && b[0] == 0 && b[1] == 0)
{
return cmListFileLexer_BOM_UTF32LE;
}
fsetpos(f, &p);
return cmListFileLexer_BOM_UTF16LE;
}
}
rewind(f);
return cmListFileLexer_BOM_None;
}
/*--------------------------------------------------------------------------*/
int cmListFileLexer_SetFileName(cmListFileLexer* lexer, const char* name,
cmListFileLexer_BOM* bom)
{ {
int result = 1; int result = 1;
cmListFileLexerDestroy(lexer); cmListFileLexerDestroy(lexer);
if(name) if(name)
{ {
lexer->file = fopen(name, "r"); lexer->file = fopen(name, "r");
if(!lexer->file) if(lexer->file)
{
if(bom)
{
*bom = cmListFileLexer_ReadBOM(lexer->file);
}
}
else
{ {
result = 0; result = 0;
} }
@ -386,7 +435,7 @@ cmListFileLexer_Token* cmListFileLexer_Scan(cmListFileLexer* lexer)
} }
else else
{ {
cmListFileLexer_SetFileName(lexer, 0); cmListFileLexer_SetFileName(lexer, 0, 0);
return 0; return 0;
} }
} }

View File

@ -0,0 +1 @@
1

View File

@ -0,0 +1,6 @@
CMake Error at CMakeLists.txt:3 \(include\):
File
.*/Tests/RunCMake/Syntax/BOM-UTF-16-BE.cmake
starts with a Byte-Order-Mark that is not UTF-8.

Binary file not shown.

View File

@ -0,0 +1 @@
1

View File

@ -0,0 +1,6 @@
CMake Error at CMakeLists.txt:3 \(include\):
File
.*/Tests/RunCMake/Syntax/BOM-UTF-16-LE.cmake
starts with a Byte-Order-Mark that is not UTF-8.

Binary file not shown.

View File

@ -0,0 +1 @@
1

View File

@ -0,0 +1,6 @@
CMake Error at CMakeLists.txt:3 \(include\):
File
.*/Tests/RunCMake/Syntax/BOM-UTF-32-BE.cmake
starts with a Byte-Order-Mark that is not UTF-8.

Binary file not shown.

View File

@ -0,0 +1 @@
1

View File

@ -0,0 +1,6 @@
CMake Error at CMakeLists.txt:3 \(include\):
File
.*/Tests/RunCMake/Syntax/BOM-UTF-32-LE.cmake
starts with a Byte-Order-Mark that is not UTF-8.

Binary file not shown.

View File

@ -0,0 +1 @@
-- message

View File

@ -0,0 +1 @@
message(STATUS "message")

View File

@ -1,5 +1,10 @@
include(RunCMake) include(RunCMake)
run_cmake(BOM-UTF-8)
run_cmake(BOM-UTF-16-LE)
run_cmake(BOM-UTF-16-BE)
run_cmake(BOM-UTF-32-LE)
run_cmake(BOM-UTF-32-BE)
run_cmake(CommandSpaces) run_cmake(CommandSpaces)
run_cmake(CommandTabs) run_cmake(CommandTabs)
run_cmake(CommandNewlines) run_cmake(CommandNewlines)