ENH: add method to attempt to check if a file is text or binary
This commit is contained in:
parent
fac61306a2
commit
6e4b6ca0d3
|
@ -2748,6 +2748,65 @@ bool SystemTools::FileHasSignature(const char *filename,
|
|||
return res;
|
||||
}
|
||||
|
||||
/**
 * Attempt to classify a file as text or binary by sampling its leading bytes.
 *
 * At most 'length' bytes are read from the start of 'filename'. A byte is
 * counted as textual when it lies in [0x20, 0x7F] or is '\n', '\r' or '\t'
 * (note that 0x7F, DEL, is therefore counted as textual). The file is
 * reported as binary when the fraction of non-textual bytes among the bytes
 * actually read is greater than or equal to 'percent_bin'.
 *
 * 'percent_bin' is a fraction in the 0..1 range (e.g. 0.05 for 5%), not a
 * value in the 0..100 range.
 *
 * Returns FileTypeUnknown when 'filename' is null, 'percent_bin' is negative,
 * the file cannot be opened, or no byte could be read (empty file, read
 * error, or 'length' equal to 0).
 */
SystemTools::FileTypeEnum
SystemTools::DetectFileType(const char *filename,
                            unsigned long length,
                            double percent_bin)
{
  if (!filename || percent_bin < 0)
    {
    return SystemTools::FileTypeUnknown;
    }

  FILE *fp = fopen(filename, "rb");
  if (!fp)
    {
    return SystemTools::FileTypeUnknown;
    }

  // Scan the file through a fixed-size stack buffer instead of allocating
  // 'length' bytes on the heap: nothing has to be released on the early
  // return paths, and a large 'length' cannot trigger a huge allocation.
  unsigned char chunk[4096];
  size_t read_length = 0;
  size_t text_count = 0;
  unsigned long remaining = length;

  while (remaining > 0)
    {
    const size_t wanted = (remaining < sizeof(chunk)) ?
      static_cast<size_t>(remaining) : sizeof(chunk);
    const size_t got = fread(chunk, 1, wanted, fp);

    for (size_t i = 0; i < got; ++i)
      {
      const unsigned char c = chunk[i];
      if ((c >= 0x20 && c <= 0x7F) ||
          c == '\n' ||
          c == '\r' ||
          c == '\t')
        {
        ++text_count;
        }
      }

    read_length += got;
    remaining -= static_cast<unsigned long>(got);

    // fread() only returns fewer items than requested on end-of-file or on
    // a read error; in both cases there is nothing more to sample.
    if (got < wanted)
      {
      break;
      }
    }

  fclose(fp);

  // Nothing was read: there is no data to base a decision on (and the ratio
  // below would divide by zero).
  if (read_length == 0)
    {
    return SystemTools::FileTypeUnknown;
    }

  const double current_percent_bin =
    static_cast<double>(read_length - text_count) /
    static_cast<double>(read_length);

  if (current_percent_bin >= percent_bin)
    {
    return SystemTools::FileTypeBinary;
    }

  return SystemTools::FileTypeText;
}
|
||||
|
||||
bool SystemTools::LocateFileInDir(const char *filename,
|
||||
const char *dir,
|
||||
kwsys_stl::string& filename_found,
|
||||
|
|
|
@ -507,11 +507,31 @@ public:
|
|||
static bool FileIsSymlink(const char* name);
|
||||
|
||||
/**
|
||||
* return true if the file has a given signature (first set of bytes)
|
||||
* Return true if the file has a given signature (first set of bytes)
|
||||
*/
|
||||
// 'offset' defaults to 0.
// NOTE(review): 'offset' is presumably the byte position in the file at
// which 'signature' is compared -- confirm against the definition, which is
// not fully visible here.
static bool FileHasSignature(
|
||||
const char* filename, const char *signature, long offset = 0);
|
||||
|
||||
/**
|
||||
* Attempt to detect and return the type of a file.
|
||||
* Up to 'length' bytes are read from the file. If at least the fraction 'percent_bin' (e.g. 0.05 for 5%)
|
||||
* of the bytes are non-textual elements, the file is considered binary,
|
||||
* otherwise textual. Textual elements are bytes in the ASCII [0x20, 0x7F]
|
||||
* range, but also \n, \r, \t.
|
||||
* The algorithm is simplistic, and should probably check for usual file
|
||||
* extensions, 'magic' signature, unicode, etc.
|
||||
*/
|
||||
// Classification returned by DetectFileType().
enum FileTypeEnum
|
||||
{
|
||||
FileTypeUnknown, // invalid arguments, file could not be opened, or no byte could be read
|
||||
FileTypeBinary, // fraction of non-textual bytes is >= the 'percent_bin' threshold
|
||||
FileTypeText // fraction of non-textual bytes is below the 'percent_bin' threshold
|
||||
};
|
||||
// Classify 'filename' as text or binary from its leading bytes.
// Returns FileTypeUnknown when 'filename' is null, 'percent_bin' is
// negative, the file cannot be opened, or nothing could be read from it.
static SystemTools::FileTypeEnum DetectFileType(
|
||||
const char* filename, // path of the file to inspect
|
||||
unsigned long length = 256, // maximum number of bytes read from the file
|
||||
double percent_bin = 0.05); // fraction (0..1) of non-textual bytes at or above which the file is binary
|
||||
|
||||
/**
|
||||
* Try to locate the file 'filename' in the directory 'dir'.
|
||||
* If 'filename' is a fully qualified filename, the basename of the file is
|
||||
|
|
Loading…
Reference in New Issue