From 1f77a7001b2e3f8f9224cb603e5acfee45573064 Mon Sep 17 00:00:00 2001 From: Justin Borodinsky Date: Sun, 11 Jan 2015 14:33:36 -0500 Subject: [PATCH] file: Teach STRINGS to support UTF-16 and UTF-32 encodings --- Help/command/file.rst | 5 +- Help/release/dev/file-strings-utf-16.rst | 5 ++ Source/cmFileCommand.cxx | 64 +++++++++++++++++++++- Tests/RunCMake/string/RunCMakeTest.cmake | 5 ++ Tests/RunCMake/string/UTF-16BE-stderr.txt | 2 + Tests/RunCMake/string/UTF-16BE.cmake | 4 ++ Tests/RunCMake/string/UTF-16BE.txt | Bin 0 -> 83 bytes Tests/RunCMake/string/UTF-16LE-stderr.txt | 2 + Tests/RunCMake/string/UTF-16LE.cmake | 4 ++ Tests/RunCMake/string/UTF-16LE.txt | Bin 0 -> 83 bytes Tests/RunCMake/string/UTF-32BE-stderr.txt | 2 + Tests/RunCMake/string/UTF-32BE.cmake | 4 ++ Tests/RunCMake/string/UTF-32BE.txt | Bin 0 -> 165 bytes Tests/RunCMake/string/UTF-32LE-stderr.txt | 2 + Tests/RunCMake/string/UTF-32LE.cmake | 4 ++ Tests/RunCMake/string/UTF-32LE.txt | Bin 0 -> 165 bytes 16 files changed, 99 insertions(+), 4 deletions(-) create mode 100644 Help/release/dev/file-strings-utf-16.rst create mode 100644 Tests/RunCMake/string/UTF-16BE-stderr.txt create mode 100644 Tests/RunCMake/string/UTF-16BE.cmake create mode 100644 Tests/RunCMake/string/UTF-16BE.txt create mode 100644 Tests/RunCMake/string/UTF-16LE-stderr.txt create mode 100644 Tests/RunCMake/string/UTF-16LE.cmake create mode 100644 Tests/RunCMake/string/UTF-16LE.txt create mode 100644 Tests/RunCMake/string/UTF-32BE-stderr.txt create mode 100644 Tests/RunCMake/string/UTF-32BE.cmake create mode 100644 Tests/RunCMake/string/UTF-32BE.txt create mode 100644 Tests/RunCMake/string/UTF-32LE-stderr.txt create mode 100644 Tests/RunCMake/string/UTF-32LE.cmake create mode 100644 Tests/RunCMake/string/UTF-32LE.txt diff --git a/Help/command/file.rst b/Help/command/file.rst index b0d4792c3..73d4cfa79 100644 --- a/Help/command/file.rst +++ b/Help/command/file.rst @@ -65,7 +65,10 @@ Parse a list of ASCII strings from ```` and store it in Consider only strings that match the given regular expression. ``ENCODING `` - Consider strings of a given encoding. "UTF-8" is currently supported. + Consider strings of a given encoding. Currently supported encodings are: + UTF-8, UTF-16LE, UTF-16BE, UTF-32LE, UTF-32BE. If the ENCODING option + is not provided and the file has a Byte Order Mark, the ENCODING option + will be defaulted to respect the Byte Order Mark. For example, the code diff --git a/Help/release/dev/file-strings-utf-16.rst b/Help/release/dev/file-strings-utf-16.rst new file mode 100644 index 000000000..f40b63eb9 --- /dev/null +++ b/Help/release/dev/file-strings-utf-16.rst @@ -0,0 +1,5 @@ +file-strings-utf-16 +------------------- + +* The :command:`file(STRINGS)` now supports UTF-16LE, UTF-16BE, + UTF-32LE, UTF-32BE as ``ENCODING`` options. diff --git a/Source/cmFileCommand.cxx b/Source/cmFileCommand.cxx index f12529215..579e7156e 100644 --- a/Source/cmFileCommand.cxx +++ b/Source/cmFileCommand.cxx @@ -472,7 +472,13 @@ bool cmFileCommand::HandleStringsCommand(std::vector const& args) bool have_regex = false; bool newline_consume = false; bool hex_conversion_enabled = true; - bool utf8_encoding = false; + enum { encoding_none = cmsys::FStream::BOM_None, + encoding_utf8 = cmsys::FStream::BOM_UTF8, + encoding_utf16le = cmsys::FStream::BOM_UTF16LE, + encoding_utf16be = cmsys::FStream::BOM_UTF16BE, + encoding_utf32le = cmsys::FStream::BOM_UTF32LE, + encoding_utf32be = cmsys::FStream::BOM_UTF32BE}; + int encoding = encoding_none; int arg_mode = arg_none; for(unsigned int i=3; i < args.size(); ++i) { @@ -599,7 +605,23 @@ bool cmFileCommand::HandleStringsCommand(std::vector const& args) { if(args[i] == "UTF-8") { - utf8_encoding = true; + encoding = encoding_utf8; + } + else if(args[i] == "UTF-16LE") + { + encoding = encoding_utf16le; + } + else if(args[i] == "UTF-16BE") + { + encoding = encoding_utf16be; + } + else if(args[i] == "UTF-32LE") + { + encoding = encoding_utf32le; + } + else if(args[i] == "UTF-32BE") + { + encoding = encoding_utf32be; } else { @@ -647,6 +669,23 @@ bool cmFileCommand::HandleStringsCommand(std::vector const& args) return false; } + //If BOM is found and encoding was not specified, use the BOM + int bom_found = cmsys::FStream::ReadBOM(fin); + if(encoding == encoding_none && bom_found != cmsys::FStream::BOM_None) + { + encoding = bom_found; + } + + unsigned int bytes_rem = 0; + if(encoding == encoding_utf16le || encoding == encoding_utf16be) + { + bytes_rem = 1; + } + if(encoding == encoding_utf32le || encoding == encoding_utf32be) + { + bytes_rem = 3; + } + // Parse strings out of the file. int output_size = 0; std::vector strings; @@ -658,6 +697,25 @@ bool cmFileCommand::HandleStringsCommand(std::vector const& args) std::string current_str; int c = fin.get(); + for(unsigned int i=0; i(c1)); + break; + } + c = (c << 8) | c1; + } + if(encoding == encoding_utf16le) + { + c = ((c & 0xFF) << 8) | ((c & 0xFF00) >> 8); + } + else if(encoding == encoding_utf32le) + { + c = (((c & 0xFF) << 24) | ((c & 0xFF00) << 8) | + ((c & 0xFF0000) >> 8) | ((c & 0xFF000000) >> 24)); + } if(c == '\r') { @@ -673,7 +731,7 @@ bool cmFileCommand::HandleStringsCommand(std::vector const& args) // c is guaranteed to fit in char by the above if... current_str += static_cast(c); } - else if(utf8_encoding) + else if(encoding == encoding_utf8) { // Check for UTF-8 encoded string (up to 4 octets) static const unsigned char utf8_check_table[3][2] = diff --git a/Tests/RunCMake/string/RunCMakeTest.cmake b/Tests/RunCMake/string/RunCMakeTest.cmake index fc913c647..89f7ea517 100644 --- a/Tests/RunCMake/string/RunCMakeTest.cmake +++ b/Tests/RunCMake/string/RunCMakeTest.cmake @@ -12,3 +12,8 @@ run_cmake(UuidMissingTypeValue) run_cmake(UuidBadType) run_cmake(RegexClear) + +run_cmake(UTF-16BE) +run_cmake(UTF-16LE) +run_cmake(UTF-32BE) +run_cmake(UTF-32LE) diff --git a/Tests/RunCMake/string/UTF-16BE-stderr.txt b/Tests/RunCMake/string/UTF-16BE-stderr.txt new file mode 100644 index 000000000..8254f875c --- /dev/null +++ b/Tests/RunCMake/string/UTF-16BE-stderr.txt @@ -0,0 +1,2 @@ +Hello World +Hello World diff --git a/Tests/RunCMake/string/UTF-16BE.cmake b/Tests/RunCMake/string/UTF-16BE.cmake new file mode 100644 index 000000000..da986c0cc --- /dev/null +++ b/Tests/RunCMake/string/UTF-16BE.cmake @@ -0,0 +1,4 @@ +file(STRINGS UTF-16BE.txt str ENCODING UTF-16BE LENGTH_MINIMUM 4) +message("${str}") +file(STRINGS UTF-16BE.txt str LENGTH_MINIMUM 4) +message("${str}") diff --git a/Tests/RunCMake/string/UTF-16BE.txt b/Tests/RunCMake/string/UTF-16BE.txt new file mode 100644 index 0000000000000000000000000000000000000000..9d976bc640912c08d845b6400a57dec6532d0690 GIT binary patch literal 83 zcmezOpTUD6l_3WR^BEKv!hvKFke$N7<)08)>nB#~!@$L2&*H#h!D7T>%HqVJz+wYr LJ1}tN;L->H?642a literal 0 HcmV?d00001 diff --git a/Tests/RunCMake/string/UTF-16LE-stderr.txt b/Tests/RunCMake/string/UTF-16LE-stderr.txt new file mode 100644 index 000000000..8254f875c --- /dev/null +++ b/Tests/RunCMake/string/UTF-16LE-stderr.txt @@ -0,0 +1,2 @@ +Hello World +Hello World diff --git a/Tests/RunCMake/string/UTF-16LE.cmake b/Tests/RunCMake/string/UTF-16LE.cmake new file mode 100644 index 000000000..326d848d2 --- /dev/null +++ b/Tests/RunCMake/string/UTF-16LE.cmake @@ -0,0 +1,4 @@ +file(STRINGS UTF-16LE.txt str ENCODING UTF-16LE LENGTH_MINIMUM 4) +message("${str}") +file(STRINGS UTF-16LE.txt str LENGTH_MINIMUM 4) +message("${str}") diff --git a/Tests/RunCMake/string/UTF-16LE.txt b/Tests/RunCMake/string/UTF-16LE.txt new file mode 100644 index 0000000000000000000000000000000000000000..ebba8748bf4f8ca44583eafa0329105cc8798bf9 GIT binary patch literal 83 zcmezW&x0YAAqNQa859`8fn*Voox;Gykl%Ee&M;=p3TV#H#~;>4oBU;|`3 LuyExt;L->H_hApr literal 0 HcmV?d00001 diff --git a/Tests/RunCMake/string/UTF-32BE-stderr.txt b/Tests/RunCMake/string/UTF-32BE-stderr.txt new file mode 100644 index 000000000..8254f875c --- /dev/null +++ b/Tests/RunCMake/string/UTF-32BE-stderr.txt @@ -0,0 +1,2 @@ +Hello World +Hello World diff --git a/Tests/RunCMake/string/UTF-32BE.cmake b/Tests/RunCMake/string/UTF-32BE.cmake new file mode 100644 index 000000000..debdeaa73 --- /dev/null +++ b/Tests/RunCMake/string/UTF-32BE.cmake @@ -0,0 +1,4 @@ +file(STRINGS UTF-32BE.txt str ENCODING UTF-32BE LENGTH_MINIMUM 4) +message("${str}") +file(STRINGS UTF-32BE.txt str LENGTH_MINIMUM 4) +message("${str}") diff --git a/Tests/RunCMake/string/UTF-32BE.txt b/Tests/RunCMake/string/UTF-32BE.txt new file mode 100644 index 0000000000000000000000000000000000000000..6725fbb45a40b4fbff6690c34d564d2c8e7353fa GIT binary patch literal 165 zcmZQz`1hZIfx!caQ-L@Kit~ZA0uYA-F^F9R#2~p8Am(CV@K0c1h^%E`@DpQTDD{De mvDh;(us8s*1rQqnu_+Kc0p%2cv<(o0#DVe*Tp)u;vJe2f-4DzF literal 0 HcmV?d00001 diff --git a/Tests/RunCMake/string/UTF-32LE-stderr.txt b/Tests/RunCMake/string/UTF-32LE-stderr.txt new file mode 100644 index 000000000..8254f875c --- /dev/null +++ b/Tests/RunCMake/string/UTF-32LE-stderr.txt @@ -0,0 +1,2 @@ +Hello World +Hello World diff --git a/Tests/RunCMake/string/UTF-32LE.cmake b/Tests/RunCMake/string/UTF-32LE.cmake new file mode 100644 index 000000000..22aab5f24 --- /dev/null +++ b/Tests/RunCMake/string/UTF-32LE.cmake @@ -0,0 +1,4 @@ +file(STRINGS UTF-32LE.txt str ENCODING UTF-32LE LENGTH_MINIMUM 4) +message("${str}") +file(STRINGS UTF-32LE.txt str LENGTH_MINIMUM 4) +message("${str}") diff --git a/Tests/RunCMake/string/UTF-32LE.txt b/Tests/RunCMake/string/UTF-32LE.txt new file mode 100644 index 0000000000000000000000000000000000000000..cf5102f58ac6a1d260fc95bc2a56d3c748b7b9ec GIT binary patch literal 165 zcmezWkAcC1fq@|vh;yJgA4n?zaX1iz*hN4Ll1l+%E+9_uXJDv}WMB~UV_@(pg^1a+ iFfcd(u>}wt0kJ6%I{~o*P`wS128n~@xj+VyY9RpTBoE90 literal 0 HcmV?d00001