file: Add ENCODING option to file(STRINGS) command (#10519)
Support extraction of UTF-8 strings.
This commit is contained in:
parent
ffa373e711
commit
5b30ec28f9
|
@ -64,6 +64,9 @@ Parse a list of ASCII strings from ``<filename>`` and store it in
|
||||||
``REGEX <regex>``
|
``REGEX <regex>``
|
||||||
Consider only strings that match the given regular expression.
|
Consider only strings that match the given regular expression.
|
||||||
|
|
||||||
|
``ENCODING <encoding-type>``
|
||||||
|
Consider strings of a given encoding. Currently only ``UTF-8`` is supported; any other value is rejected with an error.
|
||||||
|
|
||||||
For example, the code
|
For example, the code
|
||||||
|
|
||||||
.. code-block:: cmake
|
.. code-block:: cmake
|
||||||
|
|
|
@ -0,0 +1,5 @@
|
||||||
|
file-strings-encoding
|
||||||
|
---------------------
|
||||||
|
|
||||||
|
* The :command:`file(STRINGS)` command gained a new ``ENCODING``
|
||||||
|
option to enable extraction of ``UTF-8`` strings.
|
|
@ -428,7 +428,8 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
|
||||||
arg_length_minimum,
|
arg_length_minimum,
|
||||||
arg_length_maximum,
|
arg_length_maximum,
|
||||||
arg__maximum,
|
arg__maximum,
|
||||||
arg_regex };
|
arg_regex,
|
||||||
|
arg_encoding };
|
||||||
unsigned int minlen = 0;
|
unsigned int minlen = 0;
|
||||||
unsigned int maxlen = 0;
|
unsigned int maxlen = 0;
|
||||||
int limit_input = -1;
|
int limit_input = -1;
|
||||||
|
@ -438,6 +439,7 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
|
||||||
bool have_regex = false;
|
bool have_regex = false;
|
||||||
bool newline_consume = false;
|
bool newline_consume = false;
|
||||||
bool hex_conversion_enabled = true;
|
bool hex_conversion_enabled = true;
|
||||||
|
bool utf8_encoding = false;
|
||||||
int arg_mode = arg_none;
|
int arg_mode = arg_none;
|
||||||
for(unsigned int i=3; i < args.size(); ++i)
|
for(unsigned int i=3; i < args.size(); ++i)
|
||||||
{
|
{
|
||||||
|
@ -475,6 +477,10 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
|
||||||
hex_conversion_enabled = false;
|
hex_conversion_enabled = false;
|
||||||
arg_mode = arg_none;
|
arg_mode = arg_none;
|
||||||
}
|
}
|
||||||
|
else if(args[i] == "ENCODING")
|
||||||
|
{
|
||||||
|
arg_mode = arg_encoding;
|
||||||
|
}
|
||||||
else if(arg_mode == arg_limit_input)
|
else if(arg_mode == arg_limit_input)
|
||||||
{
|
{
|
||||||
if(sscanf(args[i].c_str(), "%d", &limit_input) != 1 ||
|
if(sscanf(args[i].c_str(), "%d", &limit_input) != 1 ||
|
||||||
|
@ -556,6 +562,22 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
|
||||||
have_regex = true;
|
have_regex = true;
|
||||||
arg_mode = arg_none;
|
arg_mode = arg_none;
|
||||||
}
|
}
|
||||||
|
else if(arg_mode == arg_encoding)
|
||||||
|
{
|
||||||
|
if(args[i] == "UTF-8")
|
||||||
|
{
|
||||||
|
utf8_encoding = true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
cmOStringStream e;
|
||||||
|
e << "STRINGS option ENCODING \""
|
||||||
|
<< args[i] << "\" not recognized.";
|
||||||
|
this->SetError(e.str());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
arg_mode = arg_none;
|
||||||
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
cmOStringStream e;
|
cmOStringStream e;
|
||||||
|
@ -618,6 +640,52 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
|
||||||
// c is guaranteed to fit in char by the above if...
|
// c is guaranteed to fit in char by the above if...
|
||||||
current_str += static_cast<char>(c);
|
current_str += static_cast<char>(c);
|
||||||
}
|
}
|
||||||
|
else if(utf8_encoding)
|
||||||
|
{
|
||||||
|
// Check for UTF-8 encoded string (up to 4 octets)
|
||||||
|
static const unsigned char utf8_check_table[3][2] =
|
||||||
|
{
|
||||||
|
{0xE0, 0xC0},
|
||||||
|
{0xF0, 0xE0},
|
||||||
|
{0xF8, 0xF0},
|
||||||
|
};
|
||||||
|
|
||||||
|
// how many octets are there?
|
||||||
|
unsigned int num_utf8_bytes = 0;
|
||||||
|
for(unsigned int j=0; num_utf8_bytes == 0 && j<3; j++)
|
||||||
|
{
|
||||||
|
if((c & utf8_check_table[j][0]) == utf8_check_table[j][1])
|
||||||
|
num_utf8_bytes = j+2;
|
||||||
|
}
|
||||||
|
|
||||||
|
// get subsequent octets and check that they are valid
|
||||||
|
for(unsigned int j=0; j<num_utf8_bytes; j++)
|
||||||
|
{
|
||||||
|
if(j != 0)
|
||||||
|
{
|
||||||
|
c = fin.get();
|
||||||
|
if(!fin || (c & 0xC0) != 0x80)
|
||||||
|
{
|
||||||
|
fin.putback(static_cast<char>(c));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
current_str += static_cast<char>(c);
|
||||||
|
}
|
||||||
|
|
||||||
|
// if this was an invalid utf8 sequence, discard the data, and put
|
||||||
|
// back subsequent characters
|
||||||
|
if((current_str.length() != num_utf8_bytes))
|
||||||
|
{
|
||||||
|
for(unsigned int j=0; j<current_str.size()-1; j++)
|
||||||
|
{
|
||||||
|
c = current_str[current_str.size() - 1 - j];
|
||||||
|
fin.putback(static_cast<char>(c));
|
||||||
|
}
|
||||||
|
current_str = "";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
if(c == '\n' && !newline_consume)
|
if(c == '\n' && !newline_consume)
|
||||||
{
|
{
|
||||||
|
|
|
@ -55,6 +55,16 @@ else()
|
||||||
"file(STRINGS) incorrectly read from srec file [${infile_strings}]")
|
"file(STRINGS) incorrectly read from srec file [${infile_strings}]")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
#this file has utf-8 content
|
||||||
|
file(STRINGS test.utf8 infile_strings ENCODING UTF-8)
|
||||||
|
list(LENGTH infile_strings content_len)
|
||||||
|
if(content_len MATCHES "3")
|
||||||
|
message("file(STRINGS) correctly read from utf8 file [${infile_strings}]")
|
||||||
|
else()
|
||||||
|
message(SEND_ERROR
|
||||||
|
"file(STRINGS) incorrectly read from utf8 file [${infile_strings}]")
|
||||||
|
endif()
|
||||||
|
|
||||||
# String test
|
# String test
|
||||||
string(REGEX MATCH "[cC][mM][aA][kK][eE]" rmvar "CMake is great")
|
string(REGEX MATCH "[cC][mM][aA][kK][eE]" rmvar "CMake is great")
|
||||||
string(REGEX MATCHALL "[cC][mM][aA][kK][eE]" rmallvar "CMake is better than cmake or CMake")
|
string(REGEX MATCHALL "[cC][mM][aA][kK][eE]" rmallvar "CMake is better than cmake or CMake")
|
||||||
|
|
|
@ -0,0 +1,3 @@
|
||||||
|
The value of π (pi) is 3.141593
|
||||||
|
Line mixed with binary partially matches valid utf8: Ï€ is à93.1593
|
||||||
|
à
|
Loading…
Reference in New Issue