file: Teach STRINGS to support UTF-16 and UTF-32 encodings
This commit is contained in:
parent
19e57a48cd
commit
1f77a7001b
|
@ -65,7 +65,10 @@ Parse a list of ASCII strings from ``<filename>`` and store it in
|
||||||
Consider only strings that match the given regular expression.
|
Consider only strings that match the given regular expression.
|
||||||
|
|
||||||
``ENCODING <encoding-type>``
|
``ENCODING <encoding-type>``
|
||||||
Consider strings of a given encoding. "UTF-8" is currently supported.
|
Consider strings of a given encoding. Currently supported encodings are:
|
||||||
|
UTF-8, UTF-16LE, UTF-16BE, UTF-32LE, UTF-32BE. If the ENCODING option
|
||||||
|
is not provided and the file has a Byte Order Mark, the ENCODING option
|
||||||
|
will default to the encoding indicated by the Byte Order Mark.
|
||||||
|
|
||||||
For example, the code
|
For example, the code
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,5 @@
|
||||||
|
file-strings-utf-16
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
* The :command:`file(STRINGS)` command now supports UTF-16LE, UTF-16BE,
|
||||||
|
UTF-32LE, UTF-32BE as ``ENCODING`` options.
|
|
@ -472,7 +472,13 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
|
||||||
bool have_regex = false;
|
bool have_regex = false;
|
||||||
bool newline_consume = false;
|
bool newline_consume = false;
|
||||||
bool hex_conversion_enabled = true;
|
bool hex_conversion_enabled = true;
|
||||||
bool utf8_encoding = false;
|
enum { encoding_none = cmsys::FStream::BOM_None,
|
||||||
|
encoding_utf8 = cmsys::FStream::BOM_UTF8,
|
||||||
|
encoding_utf16le = cmsys::FStream::BOM_UTF16LE,
|
||||||
|
encoding_utf16be = cmsys::FStream::BOM_UTF16BE,
|
||||||
|
encoding_utf32le = cmsys::FStream::BOM_UTF32LE,
|
||||||
|
encoding_utf32be = cmsys::FStream::BOM_UTF32BE};
|
||||||
|
int encoding = encoding_none;
|
||||||
int arg_mode = arg_none;
|
int arg_mode = arg_none;
|
||||||
for(unsigned int i=3; i < args.size(); ++i)
|
for(unsigned int i=3; i < args.size(); ++i)
|
||||||
{
|
{
|
||||||
|
@ -599,7 +605,23 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
|
||||||
{
|
{
|
||||||
if(args[i] == "UTF-8")
|
if(args[i] == "UTF-8")
|
||||||
{
|
{
|
||||||
utf8_encoding = true;
|
encoding = encoding_utf8;
|
||||||
|
}
|
||||||
|
else if(args[i] == "UTF-16LE")
|
||||||
|
{
|
||||||
|
encoding = encoding_utf16le;
|
||||||
|
}
|
||||||
|
else if(args[i] == "UTF-16BE")
|
||||||
|
{
|
||||||
|
encoding = encoding_utf16be;
|
||||||
|
}
|
||||||
|
else if(args[i] == "UTF-32LE")
|
||||||
|
{
|
||||||
|
encoding = encoding_utf32le;
|
||||||
|
}
|
||||||
|
else if(args[i] == "UTF-32BE")
|
||||||
|
{
|
||||||
|
encoding = encoding_utf32be;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -647,6 +669,23 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// If a BOM is found and no encoding was specified, use the BOM's encoding
|
||||||
|
int bom_found = cmsys::FStream::ReadBOM(fin);
|
||||||
|
if(encoding == encoding_none && bom_found != cmsys::FStream::BOM_None)
|
||||||
|
{
|
||||||
|
encoding = bom_found;
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned int bytes_rem = 0;
|
||||||
|
if(encoding == encoding_utf16le || encoding == encoding_utf16be)
|
||||||
|
{
|
||||||
|
bytes_rem = 1;
|
||||||
|
}
|
||||||
|
if(encoding == encoding_utf32le || encoding == encoding_utf32be)
|
||||||
|
{
|
||||||
|
bytes_rem = 3;
|
||||||
|
}
|
||||||
|
|
||||||
// Parse strings out of the file.
|
// Parse strings out of the file.
|
||||||
int output_size = 0;
|
int output_size = 0;
|
||||||
std::vector<std::string> strings;
|
std::vector<std::string> strings;
|
||||||
|
@ -658,6 +697,25 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
|
||||||
std::string current_str;
|
std::string current_str;
|
||||||
|
|
||||||
int c = fin.get();
|
int c = fin.get();
|
||||||
|
for(unsigned int i=0; i<bytes_rem; ++i)
|
||||||
|
{
|
||||||
|
int c1 = fin.get();
|
||||||
|
if(!fin)
|
||||||
|
{
|
||||||
|
fin.putback(static_cast<char>(c1));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
c = (c << 8) | c1;
|
||||||
|
}
|
||||||
|
if(encoding == encoding_utf16le)
|
||||||
|
{
|
||||||
|
c = ((c & 0xFF) << 8) | ((c & 0xFF00) >> 8);
|
||||||
|
}
|
||||||
|
else if(encoding == encoding_utf32le)
|
||||||
|
{
|
||||||
|
c = (((c & 0xFF) << 24) | ((c & 0xFF00) << 8) |
|
||||||
|
((c & 0xFF0000) >> 8) | ((c & 0xFF000000) >> 24));
|
||||||
|
}
|
||||||
|
|
||||||
if(c == '\r')
|
if(c == '\r')
|
||||||
{
|
{
|
||||||
|
@ -673,7 +731,7 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
|
||||||
// c is guaranteed to fit in char by the above if...
|
// c is guaranteed to fit in char by the above if...
|
||||||
current_str += static_cast<char>(c);
|
current_str += static_cast<char>(c);
|
||||||
}
|
}
|
||||||
else if(utf8_encoding)
|
else if(encoding == encoding_utf8)
|
||||||
{
|
{
|
||||||
// Check for UTF-8 encoded string (up to 4 octets)
|
// Check for UTF-8 encoded string (up to 4 octets)
|
||||||
static const unsigned char utf8_check_table[3][2] =
|
static const unsigned char utf8_check_table[3][2] =
|
||||||
|
|
|
@ -12,3 +12,8 @@ run_cmake(UuidMissingTypeValue)
|
||||||
run_cmake(UuidBadType)
|
run_cmake(UuidBadType)
|
||||||
|
|
||||||
run_cmake(RegexClear)
|
run_cmake(RegexClear)
|
||||||
|
|
||||||
|
run_cmake(UTF-16BE)
|
||||||
|
run_cmake(UTF-16LE)
|
||||||
|
run_cmake(UTF-32BE)
|
||||||
|
run_cmake(UTF-32LE)
|
||||||
|
|
|
@ -0,0 +1,2 @@
|
||||||
|
Hello World
|
||||||
|
Hello World
|
|
@ -0,0 +1,4 @@
|
||||||
|
file(STRINGS UTF-16BE.txt str ENCODING UTF-16BE LENGTH_MINIMUM 4)
|
||||||
|
message("${str}")
|
||||||
|
file(STRINGS UTF-16BE.txt str LENGTH_MINIMUM 4)
|
||||||
|
message("${str}")
|
Binary file not shown.
|
@ -0,0 +1,2 @@
|
||||||
|
Hello World
|
||||||
|
Hello World
|
|
@ -0,0 +1,4 @@
|
||||||
|
file(STRINGS UTF-16LE.txt str ENCODING UTF-16LE LENGTH_MINIMUM 4)
|
||||||
|
message("${str}")
|
||||||
|
file(STRINGS UTF-16LE.txt str LENGTH_MINIMUM 4)
|
||||||
|
message("${str}")
|
Binary file not shown.
|
@ -0,0 +1,2 @@
|
||||||
|
Hello World
|
||||||
|
Hello World
|
|
@ -0,0 +1,4 @@
|
||||||
|
file(STRINGS UTF-32BE.txt str ENCODING UTF-32BE LENGTH_MINIMUM 4)
|
||||||
|
message("${str}")
|
||||||
|
file(STRINGS UTF-32BE.txt str LENGTH_MINIMUM 4)
|
||||||
|
message("${str}")
|
Binary file not shown.
|
@ -0,0 +1,2 @@
|
||||||
|
Hello World
|
||||||
|
Hello World
|
|
@ -0,0 +1,4 @@
|
||||||
|
file(STRINGS UTF-32LE.txt str ENCODING UTF-32LE LENGTH_MINIMUM 4)
|
||||||
|
message("${str}")
|
||||||
|
file(STRINGS UTF-32LE.txt str LENGTH_MINIMUM 4)
|
||||||
|
message("${str}")
|
Binary file not shown.
Loading…
Reference in New Issue