Merge topic 'file-strings-encoding'

5b30ec28 file: Add ENCODING option to file(STRINGS) command (#10519)
ffa373e7 file: Refactor internal implementation of file(STRINGS)
2014-08-06 09:26:28 -04:00 · 2014-08-06 09:26:28 -04:00 · 78efe8d4fd
commit 78efe8d4fd
parent 4ec6ff8f9f 5b30ec28f9
5 changed files with 118 additions and 19 deletions
--- a/Help/command/file.rst
+++ b/Help/command/file.rst
@ -64,6 +64,9 @@ Parse a list of ASCII strings from ``<filename>`` and store it in
 ``REGEX <regex>``
 Consider only strings that match the given regular expression.
 ``ENCODING <encoding-type>``
 Consider strings of a given encoding.  Currently only "UTF-8" is supported.
 For example, the code
 .. code-block:: cmake
--- a/Help/release/dev/file-strings-encoding.rst
+++ b/Help/release/dev/file-strings-encoding.rst
@ -0,0 +1,5 @@
 file-strings-encoding
 ---------------------
 * The :command:`file(STRINGS)` command gained a new ``ENCODING``
  option to enable extraction of ``UTF-8`` strings.
--- a/Source/cmFileCommand.cxx
+++ b/Source/cmFileCommand.cxx
@ -428,7 +428,8 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
         arg_length_minimum,
         arg_length_maximum,
         arg__maximum,
-         arg_regex };
+         arg_regex,
         arg_encoding };
  unsigned int minlen = 0;
  unsigned int maxlen = 0;
  int limit_input = -1;
@ -438,6 +439,7 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
  bool have_regex = false;
  bool newline_consume = false;
  bool hex_conversion_enabled = true;
  bool utf8_encoding = false;
  int arg_mode = arg_none;
  for(unsigned int i=3; i < args.size(); ++i)
    {
@ -475,6 +477,10 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
      hex_conversion_enabled = false;
      arg_mode = arg_none;
      }
    else if(args[i] == "ENCODING")
      {
      arg_mode = arg_encoding;
      }
    else if(arg_mode == arg_limit_input)
      {
      if(sscanf(args[i].c_str(), "%d", &limit_input) != 1 ||
@ -556,6 +562,22 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
      have_regex = true;
      arg_mode = arg_none;
      }
    else if(arg_mode == arg_encoding)
      {
      if(args[i] == "UTF-8")
        {
        utf8_encoding = true;
        }
      else
        {
        cmOStringStream e;
        e << "STRINGS option ENCODING \""
          << args[i] << "\" not recognized.";
        this->SetError(e.str());
        return false;
        }
      arg_mode = arg_none;
      }
    else
      {
      cmOStringStream e;
@ -596,11 +618,75 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
  int output_size = 0;
  std::vector<std::string> strings;
  std::string s;
  int c;
  while((!limit_count || strings.size() < limit_count) &&
        (limit_input < 0 || static_cast<int>(fin.tellg()) < limit_input) &&
-        (c = fin.get(), fin))
+        fin)
    {
    std::string current_str;
    int c = fin.get();
    if(c == '\r')
      {
      // Ignore CR character to make output always have UNIX newlines.
      continue;
      }
    else if((c >= 0x20 && c < 0x7F) || c == '\t' ||
            (c == '\n' && newline_consume))
      {
      // This is an ASCII character that may be part of a string.
      // Cast added to avoid compiler warning. Cast is ok because
      // c is guaranteed to fit in char by the above if...
      current_str += static_cast<char>(c);
      }
    else if(utf8_encoding)
      {
      // Check for UTF-8 encoded string (up to 4 octets)
      static const unsigned char utf8_check_table[3][2] =
        {
          {0xE0, 0xC0},
          {0xF0, 0xE0},
          {0xF8, 0xF0},
        };
      // how many octets are there?
      unsigned int num_utf8_bytes = 0;
      for(unsigned int j=0; num_utf8_bytes == 0 && j<3; j++)
        {
        if((c & utf8_check_table[j][0]) == utf8_check_table[j][1])
          num_utf8_bytes = j+2;
        }
      // get subsequent octets and check that they are valid
      for(unsigned int j=0; j<num_utf8_bytes; j++)
        {
        if(j != 0)
          {
          c = fin.get();
          if(!fin || (c & 0xC0) != 0x80)
            {
            fin.putback(static_cast<char>(c));
            break;
            }
          }
        current_str += static_cast<char>(c);
        }
      // if this was an invalid utf8 sequence, discard the data, and put
      // back subsequent characters
      if((current_str.length() != num_utf8_bytes))
        {
        for(unsigned int j=0; j<current_str.size()-1; j++)
          {
          c = current_str[current_str.size() - 1 - j];
          fin.putback(static_cast<char>(c));
          }
        current_str = "";
        }
      }
    if(c == '\n' && !newline_consume)
      {
      // The current line has been terminated.  Check if the current
@ -621,21 +707,8 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
      // Reset the string to empty.
      s = "";
      }
-    else if(c == '\r')
+    else if(current_str.empty())
      {
      // Ignore CR character to make output always have UNIX newlines.
      }
    else if((c >= 0x20 && c < 0x7F) || c == '\t' ||
            (c == '\n' && newline_consume))
      {
      // This is an ASCII character that may be part of a string.
      // Cast added to avoid compiler warning. Cast is ok because
      // c is guaranteed to fit in char by the above if...
      s += static_cast<char>(c);
      }
    else
      {
      // TODO: Support ENCODING option.  See issue #10519.
      // A non-string character has been found.  Check if the current
      // string matches the requirements.  We require that the length
      // be at least one no matter what the user specified.
@ -654,10 +727,15 @@ bool cmFileCommand::HandleStringsCommand(std::vector<std::string> const& args)
      // Reset the string to empty.
      s = "";
      }
    else
      {
      s += current_str;
      }
    // Terminate a string if the maximum length is reached.
    if(maxlen > 0 && s.size() == maxlen)
      {
      // Terminate a string if the maximum length is reached.
      if(s.length() >= minlen &&
         (!have_regex || regex.find(s.c_str())))
        {
--- a/Tests/StringFileTest/CMakeLists.txt
+++ b/Tests/StringFileTest/CMakeLists.txt
@ -55,6 +55,16 @@ else()
    "file(STRINGS) incorrectly read from srec file [${infile_strings}]")
 endif()
 # test.utf8 contains valid UTF-8 text mixed with bytes that are not valid
 # UTF-8.  Reading it with ENCODING UTF-8 is expected to yield exactly three
 # strings.
 file(STRINGS test.utf8 infile_strings ENCODING UTF-8)
 list(LENGTH infile_strings content_len)
 # Compare numerically: a regex MATCHES "3" would also accept 13, 30, ...
 if(content_len EQUAL 3)
  message("file(STRINGS) correctly read from utf8 file [${infile_strings}]")
 else()
  message(SEND_ERROR
    "file(STRINGS) incorrectly read from utf8 file [${infile_strings}]")
 endif()
 # String test
 string(REGEX MATCH "[cC][mM][aA][kK][eE]" rmvar "CMake is great")
 string(REGEX MATCHALL "[cC][mM][aA][kK][eE]" rmallvar "CMake is better than cmake or CMake")
--- a/Tests/StringFileTest/test.utf8
+++ b/Tests/StringFileTest/test.utf8
@ -0,0 +1,3 @@
 The value of π (pi) is 3.141593
 Line mixed with binary partially matches valid utf8: π is à93.1593
 à