Merge topic 'file-strings-encoding'

5b30ec28 file: Add ENCODING option to file(STRINGS) command (#10519)
ffa373e7 file: Refactor internal implementation of file(STRINGS)
This commit is contained in:
Brad King 2014-08-06 09:26:28 -04:00 committed by CMake Topic Stage
commit 78efe8d4fd
5 changed files with 118 additions and 19 deletions

View File

@ -64,6 +64,9 @@ Parse a list of ASCII strings from ``<filename>`` and store it in
``REGEX <regex>`` ``REGEX <regex>``
Consider only strings that match the given regular expression. Consider only strings that match the given regular expression.
``ENCODING <encoding-type>``
Consider strings of a given encoding. Currently only ``UTF-8`` is supported; any other value is rejected with an error.
For example, the code For example, the code
.. code-block:: cmake .. code-block:: cmake

View File

@ -0,0 +1,5 @@
file-strings-encoding
---------------------
* The :command:`file(STRINGS)` command gained a new ``ENCODING``
option to enable extraction of ``UTF-8`` strings.

View File

@ -428,7 +428,8 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
arg_length_minimum, arg_length_minimum,
arg_length_maximum, arg_length_maximum,
arg__maximum, arg__maximum,
arg_regex }; arg_regex,
arg_encoding };
unsigned int minlen = 0; unsigned int minlen = 0;
unsigned int maxlen = 0; unsigned int maxlen = 0;
int limit_input = -1; int limit_input = -1;
@ -438,6 +439,7 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
bool have_regex = false; bool have_regex = false;
bool newline_consume = false; bool newline_consume = false;
bool hex_conversion_enabled = true; bool hex_conversion_enabled = true;
bool utf8_encoding = false;
int arg_mode = arg_none; int arg_mode = arg_none;
for(unsigned int i=3; i < args.size(); ++i) for(unsigned int i=3; i < args.size(); ++i)
{ {
@ -475,6 +477,10 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
hex_conversion_enabled = false; hex_conversion_enabled = false;
arg_mode = arg_none; arg_mode = arg_none;
} }
else if(args[i] == "ENCODING")
{
arg_mode = arg_encoding;
}
else if(arg_mode == arg_limit_input) else if(arg_mode == arg_limit_input)
{ {
if(sscanf(args[i].c_str(), "%d", &limit_input) != 1 || if(sscanf(args[i].c_str(), "%d", &limit_input) != 1 ||
@ -556,6 +562,22 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
have_regex = true; have_regex = true;
arg_mode = arg_none; arg_mode = arg_none;
} }
else if(arg_mode == arg_encoding)
{
if(args[i] == "UTF-8")
{
utf8_encoding = true;
}
else
{
cmOStringStream e;
e << "STRINGS option ENCODING \""
<< args[i] << "\" not recognized.";
this->SetError(e.str());
return false;
}
arg_mode = arg_none;
}
else else
{ {
cmOStringStream e; cmOStringStream e;
@ -596,11 +618,75 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
int output_size = 0; int output_size = 0;
std::vector<std::string> strings; std::vector<std::string> strings;
std::string s; std::string s;
int c;
while((!limit_count || strings.size() < limit_count) && while((!limit_count || strings.size() < limit_count) &&
(limit_input < 0 || static_cast<int>(fin.tellg()) < limit_input) && (limit_input < 0 || static_cast<int>(fin.tellg()) < limit_input) &&
(c = fin.get(), fin)) fin)
{ {
std::string current_str;
int c = fin.get();
if(c == '\r')
{
// Ignore CR character to make output always have UNIX newlines.
continue;
}
else if((c >= 0x20 && c < 0x7F) || c == '\t' ||
(c == '\n' && newline_consume))
{
// This is an ASCII character that may be part of a string.
// Cast added to avoid compiler warning. Cast is ok because
// c is guaranteed to fit in char by the above if...
current_str += static_cast<char>(c);
}
else if(utf8_encoding)
{
// Check for UTF-8 encoded string (up to 4 octets)
static const unsigned char utf8_check_table[3][2] =
{
{0xE0, 0xC0},
{0xF0, 0xE0},
{0xF8, 0xF0},
};
// how many octets are there?
unsigned int num_utf8_bytes = 0;
for(unsigned int j=0; num_utf8_bytes == 0 && j<3; j++)
{
if((c & utf8_check_table[j][0]) == utf8_check_table[j][1])
num_utf8_bytes = j+2;
}
// get subsequent octets and check that they are valid
for(unsigned int j=0; j<num_utf8_bytes; j++)
{
if(j != 0)
{
c = fin.get();
if(!fin || (c & 0xC0) != 0x80)
{
fin.putback(static_cast<char>(c));
break;
}
}
current_str += static_cast<char>(c);
}
// if this was an invalid utf8 sequence, discard the data, and put
// back subsequent characters
if((current_str.length() != num_utf8_bytes))
{
for(unsigned int j=0; j<current_str.size()-1; j++)
{
c = current_str[current_str.size() - 1 - j];
fin.putback(static_cast<char>(c));
}
current_str = "";
}
}
if(c == '\n' && !newline_consume) if(c == '\n' && !newline_consume)
{ {
// The current line has been terminated. Check if the current // The current line has been terminated. Check if the current
@ -621,21 +707,8 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
// Reset the string to empty. // Reset the string to empty.
s = ""; s = "";
} }
else if(c == '\r') else if(current_str.empty())
{ {
// Ignore CR character to make output always have UNIX newlines.
}
else if((c >= 0x20 && c < 0x7F) || c == '\t' ||
(c == '\n' && newline_consume))
{
// This is an ASCII character that may be part of a string.
// Cast added to avoid compiler warning. Cast is ok because
// c is guaranteed to fit in char by the above if...
s += static_cast<char>(c);
}
else
{
// TODO: Support ENCODING option. See issue #10519.
// A non-string character has been found. Check if the current // A non-string character has been found. Check if the current
// string matches the requirements. We require that the length // string matches the requirements. We require that the length
// be at least one no matter what the user specified. // be at least one no matter what the user specified.
@ -654,10 +727,15 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
// Reset the string to empty. // Reset the string to empty.
s = ""; s = "";
} }
else
{
s += current_str;
}
// Terminate a string if the maximum length is reached.
if(maxlen > 0 && s.size() == maxlen) if(maxlen > 0 && s.size() == maxlen)
{ {
// Terminate a string if the maximum length is reached.
if(s.length() >= minlen && if(s.length() >= minlen &&
(!have_regex || regex.find(s.c_str()))) (!have_regex || regex.find(s.c_str())))
{ {

View File

@ -55,6 +55,16 @@ else()
"file(STRINGS) incorrectly read from srec file [${infile_strings}]") "file(STRINGS) incorrectly read from srec file [${infile_strings}]")
endif() endif()
# test.utf8 has UTF-8 content mixed with bytes that are not valid UTF-8.
# The test expects file(STRINGS) with ENCODING UTF-8 to extract exactly
# three strings from it.
file(STRINGS test.utf8 infile_strings ENCODING UTF-8)
list(LENGTH infile_strings content_len)
# Compare numerically: MATCHES performs a regex search, so "3" would also
# accept list lengths such as 13 or 30.
if(content_len EQUAL 3)
  message("file(STRINGS) correctly read from utf8 file [${infile_strings}]")
else()
  message(SEND_ERROR
    "file(STRINGS) incorrectly read from utf8 file [${infile_strings}]")
endif()
# String test # String test
string(REGEX MATCH "[cC][mM][aA][kK][eE]" rmvar "CMake is great") string(REGEX MATCH "[cC][mM][aA][kK][eE]" rmvar "CMake is great")
string(REGEX MATCHALL "[cC][mM][aA][kK][eE]" rmallvar "CMake is better than cmake or CMake") string(REGEX MATCHALL "[cC][mM][aA][kK][eE]" rmallvar "CMake is better than cmake or CMake")

View File

@ -0,0 +1,3 @@
The value of π (pi) is 3.141593
Line mixed with binary partially matches valid utf8: Ï€ is à93.1593
à