diff --git a/Help/command/file.rst b/Help/command/file.rst index b0d4792c3..73d4cfa79 100644 --- a/Help/command/file.rst +++ b/Help/command/file.rst @@ -65,7 +65,10 @@ Parse a list of ASCII strings from ``<filename>`` and store it in Consider only strings that match the given regular expression. ``ENCODING <encoding-type>`` - Consider strings of a given encoding. "UTF-8" is currently supported. + Consider strings of a given encoding. Currently supported encodings are: + UTF-8, UTF-16LE, UTF-16BE, UTF-32LE, UTF-32BE. If the ENCODING option + is not provided and the file has a Byte Order Mark, the ENCODING option + will default to the encoding indicated by the Byte Order Mark. For example, the code diff --git a/Help/release/dev/file-strings-utf-16.rst b/Help/release/dev/file-strings-utf-16.rst new file mode 100644 index 000000000..f40b63eb9 --- /dev/null +++ b/Help/release/dev/file-strings-utf-16.rst @@ -0,0 +1,5 @@ +file-strings-utf-16 +------------------- + +* The :command:`file(STRINGS)` command now supports UTF-16LE, UTF-16BE, + UTF-32LE, UTF-32BE as ``ENCODING`` options. 
diff --git a/Source/cmFileCommand.cxx b/Source/cmFileCommand.cxx index f12529215..579e7156e 100644 --- a/Source/cmFileCommand.cxx +++ b/Source/cmFileCommand.cxx @@ -472,7 +472,13 @@ bool cmFileCommand::HandleStringsCommand(std::vector const& args) bool have_regex = false; bool newline_consume = false; bool hex_conversion_enabled = true; - bool utf8_encoding = false; + enum { encoding_none = cmsys::FStream::BOM_None, + encoding_utf8 = cmsys::FStream::BOM_UTF8, + encoding_utf16le = cmsys::FStream::BOM_UTF16LE, + encoding_utf16be = cmsys::FStream::BOM_UTF16BE, + encoding_utf32le = cmsys::FStream::BOM_UTF32LE, + encoding_utf32be = cmsys::FStream::BOM_UTF32BE}; + int encoding = encoding_none; int arg_mode = arg_none; for(unsigned int i=3; i < args.size(); ++i) { @@ -599,7 +605,23 @@ bool cmFileCommand::HandleStringsCommand(std::vector const& args) { if(args[i] == "UTF-8") { - utf8_encoding = true; + encoding = encoding_utf8; + } + else if(args[i] == "UTF-16LE") + { + encoding = encoding_utf16le; + } + else if(args[i] == "UTF-16BE") + { + encoding = encoding_utf16be; + } + else if(args[i] == "UTF-32LE") + { + encoding = encoding_utf32le; + } + else if(args[i] == "UTF-32BE") + { + encoding = encoding_utf32be; } else { @@ -647,6 +669,23 @@ bool cmFileCommand::HandleStringsCommand(std::vector const& args) return false; } + //If BOM is found and encoding was not specified, use the BOM + int bom_found = cmsys::FStream::ReadBOM(fin); + if(encoding == encoding_none && bom_found != cmsys::FStream::BOM_None) + { + encoding = bom_found; + } + + unsigned int bytes_rem = 0; + if(encoding == encoding_utf16le || encoding == encoding_utf16be) + { + bytes_rem = 1; + } + if(encoding == encoding_utf32le || encoding == encoding_utf32be) + { + bytes_rem = 3; + } + // Parse strings out of the file. 
int output_size = 0; std::vector strings; @@ -658,6 +697,25 @@ bool cmFileCommand::HandleStringsCommand(std::vector const& args) std::string current_str; int c = fin.get(); + for(unsigned int i=0; i(c1)); + break; + } + c = (c << 8) | c1; + } + if(encoding == encoding_utf16le) + { + c = ((c & 0xFF) << 8) | ((c & 0xFF00) >> 8); + } + else if(encoding == encoding_utf32le) + { + c = (((c & 0xFF) << 24) | ((c & 0xFF00) << 8) | + ((c & 0xFF0000) >> 8) | ((c & 0xFF000000) >> 24)); + } if(c == '\r') { @@ -673,7 +731,7 @@ bool cmFileCommand::HandleStringsCommand(std::vector const& args) // c is guaranteed to fit in char by the above if... current_str += static_cast(c); } - else if(utf8_encoding) + else if(encoding == encoding_utf8) { // Check for UTF-8 encoded string (up to 4 octets) static const unsigned char utf8_check_table[3][2] = diff --git a/Tests/RunCMake/string/RunCMakeTest.cmake b/Tests/RunCMake/string/RunCMakeTest.cmake index fc913c647..89f7ea517 100644 --- a/Tests/RunCMake/string/RunCMakeTest.cmake +++ b/Tests/RunCMake/string/RunCMakeTest.cmake @@ -12,3 +12,8 @@ run_cmake(UuidMissingTypeValue) run_cmake(UuidBadType) run_cmake(RegexClear) + +run_cmake(UTF-16BE) +run_cmake(UTF-16LE) +run_cmake(UTF-32BE) +run_cmake(UTF-32LE) diff --git a/Tests/RunCMake/string/UTF-16BE-stderr.txt b/Tests/RunCMake/string/UTF-16BE-stderr.txt new file mode 100644 index 000000000..8254f875c --- /dev/null +++ b/Tests/RunCMake/string/UTF-16BE-stderr.txt @@ -0,0 +1,2 @@ +Hello World +Hello World diff --git a/Tests/RunCMake/string/UTF-16BE.cmake b/Tests/RunCMake/string/UTF-16BE.cmake new file mode 100644 index 000000000..da986c0cc --- /dev/null +++ b/Tests/RunCMake/string/UTF-16BE.cmake @@ -0,0 +1,4 @@ +file(STRINGS UTF-16BE.txt str ENCODING UTF-16BE LENGTH_MINIMUM 4) +message("${str}") +file(STRINGS UTF-16BE.txt str LENGTH_MINIMUM 4) +message("${str}") diff --git a/Tests/RunCMake/string/UTF-16BE.txt b/Tests/RunCMake/string/UTF-16BE.txt new file mode 100644 index 000000000..9d976bc64 Binary 
files /dev/null and b/Tests/RunCMake/string/UTF-16BE.txt differ diff --git a/Tests/RunCMake/string/UTF-16LE-stderr.txt b/Tests/RunCMake/string/UTF-16LE-stderr.txt new file mode 100644 index 000000000..8254f875c --- /dev/null +++ b/Tests/RunCMake/string/UTF-16LE-stderr.txt @@ -0,0 +1,2 @@ +Hello World +Hello World diff --git a/Tests/RunCMake/string/UTF-16LE.cmake b/Tests/RunCMake/string/UTF-16LE.cmake new file mode 100644 index 000000000..326d848d2 --- /dev/null +++ b/Tests/RunCMake/string/UTF-16LE.cmake @@ -0,0 +1,4 @@ +file(STRINGS UTF-16LE.txt str ENCODING UTF-16LE LENGTH_MINIMUM 4) +message("${str}") +file(STRINGS UTF-16LE.txt str LENGTH_MINIMUM 4) +message("${str}") diff --git a/Tests/RunCMake/string/UTF-16LE.txt b/Tests/RunCMake/string/UTF-16LE.txt new file mode 100644 index 000000000..ebba8748b Binary files /dev/null and b/Tests/RunCMake/string/UTF-16LE.txt differ diff --git a/Tests/RunCMake/string/UTF-32BE-stderr.txt b/Tests/RunCMake/string/UTF-32BE-stderr.txt new file mode 100644 index 000000000..8254f875c --- /dev/null +++ b/Tests/RunCMake/string/UTF-32BE-stderr.txt @@ -0,0 +1,2 @@ +Hello World +Hello World diff --git a/Tests/RunCMake/string/UTF-32BE.cmake b/Tests/RunCMake/string/UTF-32BE.cmake new file mode 100644 index 000000000..debdeaa73 --- /dev/null +++ b/Tests/RunCMake/string/UTF-32BE.cmake @@ -0,0 +1,4 @@ +file(STRINGS UTF-32BE.txt str ENCODING UTF-32BE LENGTH_MINIMUM 4) +message("${str}") +file(STRINGS UTF-32BE.txt str LENGTH_MINIMUM 4) +message("${str}") diff --git a/Tests/RunCMake/string/UTF-32BE.txt b/Tests/RunCMake/string/UTF-32BE.txt new file mode 100644 index 000000000..6725fbb45 Binary files /dev/null and b/Tests/RunCMake/string/UTF-32BE.txt differ diff --git a/Tests/RunCMake/string/UTF-32LE-stderr.txt b/Tests/RunCMake/string/UTF-32LE-stderr.txt new file mode 100644 index 000000000..8254f875c --- /dev/null +++ b/Tests/RunCMake/string/UTF-32LE-stderr.txt @@ -0,0 +1,2 @@ +Hello World +Hello World diff --git 
a/Tests/RunCMake/string/UTF-32LE.cmake b/Tests/RunCMake/string/UTF-32LE.cmake new file mode 100644 index 000000000..22aab5f24 --- /dev/null +++ b/Tests/RunCMake/string/UTF-32LE.cmake @@ -0,0 +1,4 @@ +file(STRINGS UTF-32LE.txt str ENCODING UTF-32LE LENGTH_MINIMUM 4) +message("${str}") +file(STRINGS UTF-32LE.txt str LENGTH_MINIMUM 4) +message("${str}") diff --git a/Tests/RunCMake/string/UTF-32LE.txt b/Tests/RunCMake/string/UTF-32LE.txt new file mode 100644 index 000000000..cf5102f58 Binary files /dev/null and b/Tests/RunCMake/string/UTF-32LE.txt differ