ENH: add method to attempt to check if a file is text or binary

This commit is contained in:
Sebastien Barre 2005-07-28 13:21:03 -04:00
parent fac61306a2
commit 6e4b6ca0d3
2 changed files with 80 additions and 1 deletions

View File

@ -2748,6 +2748,65 @@ bool SystemTools::FileHasSignature(const char *filename,
return res;
}
// Guess whether a file holds text or binary data by sampling its first bytes.
//
// filename:    path of the file to inspect; a null pointer yields
//              FileTypeUnknown.
// length:      maximum number of bytes read from the start of the file.
// percent_bin: fraction of non-textual bytes, relative to the bytes actually
//              read, at or above which the file is reported as binary;
//              a negative value yields FileTypeUnknown.
//
// Returns FileTypeUnknown when the arguments are invalid, the file cannot be
// opened, or no byte could be read; otherwise FileTypeBinary or FileTypeText.
SystemTools::FileTypeEnum
SystemTools::DetectFileType(const char *filename,
                            unsigned long length,
                            double percent_bin)
{
  if (!filename || percent_bin < 0)
    {
    return SystemTools::FileTypeUnknown;
    }

  // A zero-byte sample can never be classified, so there is no point in
  // opening the file or allocating an empty buffer for it.
  if (length == 0)
    {
    return SystemTools::FileTypeUnknown;
    }

  FILE *fp = fopen(filename, "rb");
  if (!fp)
    {
    return SystemTools::FileTypeUnknown;
    }

  // Allocate buffer and read bytes; the file is not needed past this point.
  unsigned char *buffer = new unsigned char [length];
  const size_t read_length = fread(buffer, 1, length, fp);
  fclose(fp);

  if (read_length == 0)
    {
    // Empty file or read error: release the buffer before bailing out so
    // this early exit does not leak it.
    delete [] buffer;
    return SystemTools::FileTypeUnknown;
    }

  // Count the bytes considered textual: 0x20..0x7F inclusive (this range
  // includes 0x7F), plus line feed, carriage return and horizontal tab.
  size_t text_count = 0;
  for (size_t i = 0; i < read_length; ++i)
    {
    const unsigned char c = buffer[i];
    if ((c >= 0x20 && c <= 0x7F) ||
        c == '\n' ||
        c == '\r' ||
        c == '\t')
      {
      ++text_count;
      }
    }

  delete [] buffer;

  // Fraction of the sampled bytes that were not textual.
  const double current_percent_bin =
    static_cast<double>(read_length - text_count) /
    static_cast<double>(read_length);

  if (current_percent_bin >= percent_bin)
    {
    return SystemTools::FileTypeBinary;
    }

  return SystemTools::FileTypeText;
}
bool SystemTools::LocateFileInDir(const char *filename,
const char *dir,
kwsys_stl::string& filename_found,

View File

@ -507,11 +507,31 @@ public:
static bool FileIsSymlink(const char* name);
/**
* return true if the file has a given signature (first set of bytes)
* Return true if the file has a given signature (first set of bytes)
*/
static bool FileHasSignature(
const char* filename, const char *signature, long offset = 0);
/**
 * Attempt to detect and return the type of a file.
 * Up to 'length' bytes are read from the start of the file. 'percent_bin'
 * is a fraction of the bytes actually read (the default 0.05 means 5%): if
 * the fraction of non-textual bytes is greater than or equal to
 * 'percent_bin', the file is considered binary, otherwise textual.
 * Textual elements are bytes in the [0x20, 0x7F] range, but also
 * \n, \r, \t.
 * FileTypeUnknown is returned if 'filename' is null, 'percent_bin' is
 * negative, the file can not be opened, or no byte could be read from it.
 * The algorithm is simplistic, and should probably check for usual file
 * extensions, 'magic' signature, unicode, etc.
 */
enum FileTypeEnum
{
// The type could not be determined (invalid argument, unreadable or
// empty file).
FileTypeUnknown,
// The fraction of non-textual bytes reached 'percent_bin'.
FileTypeBinary,
// The fraction of non-textual bytes stayed below 'percent_bin'.
FileTypeText
};
static SystemTools::FileTypeEnum DetectFileType(
const char* filename,
unsigned long length = 256,
double percent_bin = 0.05);
/**
* Try to locate the file 'filename' in the directory 'dir'.
* If 'filename' is a fully qualified filename, the basename of the file is