Critical bug fixes:
- save_item now returns real Item from database, not a hardcoded fake
- AsyncDataService::save() reuses self.sync_service instead of creating redundant instance
- GenerateStatus trait signature mismatch fixed (CLI/API decoupling)
Performance improvements (pipe path untouched):
- CompressionEngine::open() returns Box<dyn Read + Send> enabling true streaming
- mode_get eliminates triple full-file read (was sampling then re-reading entire file)
- FilteringReader adds fast-path bypass when no filters, pre-allocates temp buffer
- text.rs meta plugin processes &[u8] slice directly, eliminates data.to_vec() clone
API correctness:
- Tag parse errors now return 400 instead of being silently discarded
- compute_diff uses similar crate (LCS-based) instead of naive positional comparison
Cleanup:
- Modernize string formatting (inline format arguments, e.g. `format!("{x}")`) across the codebase
- Remove redundant DB query in get mode
- Derive Debug/ToSchema on public types
- Delete placeholder test files with no real assertions
- Extract parse_comma_tags utility function
232 lines
8.9 KiB
Rust
232 lines
8.9 KiB
Rust
/// Detect if data is binary or text
|
|
/// Returns true if data is likely binary, false if likely text
|
|
pub fn is_binary(data: &[u8]) -> bool {
|
|
if data.is_empty() {
|
|
return false;
|
|
}
|
|
|
|
// First check for known binary file signatures
|
|
if has_binary_signature(data) {
|
|
return true;
|
|
}
|
|
|
|
// Check for UTF-16 BOM (text)
|
|
if data.len() >= 2
|
|
&& ((data[0] == 0xFF && data[1] == 0xFE) || (data[0] == 0xFE && data[1] == 0xFF))
|
|
{
|
|
return false; // UTF-16 with BOM is text
|
|
}
|
|
|
|
// Check for UTF-8 BOM (text)
|
|
if data.len() >= 3 && data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF {
|
|
return false; // UTF-8 with BOM is text
|
|
}
|
|
|
|
// Check if it's valid UTF-8
|
|
if std::str::from_utf8(data).is_ok() {
|
|
// Valid UTF-8, check printable character ratio
|
|
return calculate_printable_ratio(data) < 0.7;
|
|
}
|
|
|
|
// Not valid UTF-8, check if it might be UTF-16 without BOM
|
|
if looks_like_utf16(data) {
|
|
return false; // Likely UTF-16 text
|
|
}
|
|
|
|
// Check for TAR format (special case with no magic number)
|
|
if looks_like_tar(data) {
|
|
return true;
|
|
}
|
|
|
|
// Final fallback: check printable character ratio
|
|
// For 1KB of random data, we expect very few printable characters
|
|
calculate_printable_ratio(data) < 0.7
|
|
}
|
|
|
|
/// Check for known binary file signatures ("magic numbers").
///
/// Returns `true` when `data` begins with one of the signatures in the table
/// below, when bytes 4..8 are `ftyp` (MP4/M4A/MOV), or when bytes 510..512 are
/// the `55 AA` boot-sector marker. Input shorter than a signature never
/// matches it, so empty or very short data simply returns `false`.
fn has_binary_signature(data: &[u8]) -> bool {
    // Signatures that must appear at offset 0. `starts_with` already rejects
    // data shorter than the signature, so no separate length is stored.
    const SIGNATURES: &[&[u8]] = &[
        // Image formats
        &[0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A], // PNG
        &[0xFF, 0xD8, 0xFF],                               // JPEG (various subtypes)
        &[0x47, 0x49, 0x46, 0x38, 0x37, 0x61],             // GIF87a
        &[0x47, 0x49, 0x46, 0x38, 0x39, 0x61],             // GIF89a
        &[0x42, 0x4D],                                     // BMP
        &[0x00, 0x00, 0x01, 0x00],                         // ICO
        &[0x49, 0x49, 0x2A, 0x00],                         // TIFF (little endian)
        &[0x4D, 0x4D, 0x00, 0x2A],                         // TIFF (big endian)
        &[0x52, 0x49, 0x46, 0x46],                         // RIFF container: WebP, WAV, AVI
        &[0x00, 0x00, 0x00, 0x0C, 0x6A, 0x50, 0x20, 0x20], // JPEG 2000
        // Audio/Video formats
        &[0x49, 0x44, 0x33],       // MP3 with ID3v2
        &[0xFF, 0xFB],             // MP3
        &[0xFF, 0xF3],             // MP3
        &[0xFF, 0xF2],             // MP3
        &[0x4F, 0x67, 0x67, 0x53], // OGG
        &[0x46, 0x4C, 0x56],       // FLV
        &[0x1A, 0x45, 0xDF, 0xA3], // MKV/WebM
        // Archive formats
        &[0x50, 0x4B, 0x03, 0x04], // ZIP (also Office Open XML)
        &[0x50, 0x4B, 0x05, 0x06], // ZIP (empty)
        &[0x50, 0x4B, 0x07, 0x08], // ZIP (spanned)
        &[0x52, 0x61, 0x72, 0x21, 0x1A, 0x07, 0x00], // RAR v1.5+
        &[0x52, 0x61, 0x72, 0x21, 0x1A, 0x07, 0x01, 0x00], // RAR v5.0+
        &[0x1F, 0x8B],             // GZIP
        &[0x42, 0x5A, 0x68],       // BZIP2
        &[0xFD, 0x37, 0x7A, 0x58, 0x5A, 0x00], // XZ
        &[0x28, 0xB5, 0x2F, 0xFD], // Zstandard
        &[0x04, 0x22, 0x4D, 0x18], // LZ4
        &[0x1F, 0x9D],             // LZW compressed
        &[0x1F, 0xA0],             // LZH compressed
        &[0x37, 0x7A, 0xBC, 0xAF, 0x27, 0x1C], // 7-Zip
        // Document formats
        &[0x25, 0x50, 0x44, 0x46], // PDF
        &[0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1], // MS Office (OLE)
        &[0x7B, 0x5C, 0x72, 0x74, 0x66], // RTF
        // Executables and object files
        &[0x7F, 0x45, 0x4C, 0x46], // ELF
        &[0x4D, 0x5A],             // Windows PE/DOS
        &[0xCA, 0xFE, 0xBA, 0xBE], // Mach-O universal binary / Java class file
        &[0xFE, 0xED, 0xFA, 0xCE], // Mach-O 32-bit (big-endian byte order)
        &[0xFE, 0xED, 0xFA, 0xCF], // Mach-O 64-bit (big-endian byte order)
        &[0xCE, 0xFA, 0xED, 0xFE], // Mach-O 32-bit (little-endian byte order)
        &[0xCF, 0xFA, 0xED, 0xFE], // Mach-O 64-bit (little-endian byte order)
        // Originally labelled "Dalvik executable".
        // NOTE(review): DE C0 17 0B looks like the LLVM bitcode wrapper magic - confirm.
        &[0xDE, 0xC0, 0x17, 0x0B],
        // Database formats
        &[
            0x53, 0x51, 0x4C, 0x69, 0x74, 0x65, 0x20, 0x66, 0x6F, 0x72, 0x6D, 0x61, 0x74, 0x20,
            0x33, 0x00,
        ], // SQLite ("SQLite format 3\0")
        // Palm Database; this prefix also covers TrueType fonts (00 01 00 00 00).
        &[0x00, 0x01, 0x00, 0x00],
        // Font formats
        &[0x4F, 0x54, 0x54, 0x4F], // OpenType
        &[0x77, 0x4F, 0x46, 0x46], // WOFF
        &[0x77, 0x4F, 0x46, 0x32], // WOFF2
        // Virtual machine formats
        &[0x76, 0x6D, 0x64, 0x6B], // VMDK
        &[
            0x3C, 0x3C, 0x3C, 0x20, 0x4F, 0x72, 0x61, 0x63, 0x6C, 0x65, 0x20, 0x56, 0x4D, 0x20,
            0x56, 0x69, 0x72, 0x74, 0x75, 0x61, 0x6C, 0x42, 0x6F, 0x78, 0x20, 0x44, 0x69, 0x73,
            0x6B, 0x20, 0x49, 0x6D, 0x61, 0x67, 0x65, 0x20, 0x3E, 0x3E, 0x3E,
        ], // VirtualBox VDI ("<<< Oracle VM VirtualBox Disk Image >>>")
        // Disk image formats
        &[0xEB, 0x3C, 0x90], // FAT12/16/32
        &[0xEB, 0x58, 0x90], // FAT32
        // Other binary formats
        &[0x21, 0x3C, 0x61, 0x72, 0x63, 0x68, 0x3E, 0x0A], // AR archive
        &[0x78, 0x01],             // zlib (default compression)
        &[0x78, 0x9C],             // zlib (best compression)
        &[0x78, 0xDA],             // zlib (fast compression)
        &[0x62, 0x76, 0x78, 0x32], // LZFSE
    ];

    if SIGNATURES.iter().any(|magic| data.starts_with(magic)) {
        return true;
    }

    // MP4/M4A/MOV: the `ftyp` box type sits at offset 4, after the box size.
    // It is deliberately NOT matched at offset 0, where it would flag plain
    // text that merely begins with "ftyp".
    if data.get(4..8) == Some(&b"ftyp"[..]) {
        return true;
    }

    // Boot sector: the 55 AA marker occupies the last two bytes of the first
    // 512-byte sector (offset 510), not the start of the data.
    data.get(510..512) == Some(&[0x55, 0xAA][..])
}
|
|
|
|
/// Heuristically decide whether `data` is UTF-16 text that carries no BOM.
///
/// Inputs shorter than four bytes or with an odd length are rejected. Otherwise
/// up to the first 50 two-byte units are sampled, and the data is accepted when
/// more than 70% of the sampled units have a zero second byte, or failing that,
/// more than 70% have a zero first byte. Mostly-ASCII UTF-16 text shows one of
/// these patterns, depending on its byte order.
fn looks_like_utf16(data: &[u8]) -> bool {
    const MAX_SAMPLED_UNITS: usize = 50;
    const ZERO_SHARE_THRESHOLD: f64 = 0.7;

    let len = data.len();
    if len < 4 || len % 2 == 1 {
        return false;
    }

    let sampled = (len / 2).min(MAX_SAMPLED_UNITS);

    // Share of sampled units whose byte at `position` (0 or 1) is zero.
    let zero_share = |position: usize| {
        let zeros = data
            .chunks_exact(2)
            .take(sampled)
            .filter(|unit| unit[position] == 0)
            .count();
        zeros as f64 / sampled as f64
    };

    // Second-byte pattern is tried first, then the first-byte pattern.
    zero_share(1) > ZERO_SHARE_THRESHOLD || zero_share(0) > ZERO_SHARE_THRESHOLD
}
|
|
|
|
/// Check whether `data` begins with something that looks like a TAR header.
///
/// TAR has no magic number at offset 0, so the first 512-byte header block is
/// validated structurally:
///
/// * the data must hold at least one full 512-byte block,
/// * the file name (offset 0) must not start with NUL,
/// * the mode field (100..108) and the checksum field (148..156) may contain
///   only octal digits, spaces and NULs,
/// * then either the `ustar` magic is present at offset 257, or the stored
///   header checksum matches the checksum computed over the block.
///
/// The checksum test replaces a fallback that only re-checked the conditions
/// above and therefore accepted every block that had reached it.
fn looks_like_tar(data: &[u8]) -> bool {
    const HEADER_LEN: usize = 512;

    if data.len() < HEADER_LEN {
        return false;
    }
    let header = &data[..HEADER_LEN];

    // An all-NUL name marks an end-of-archive block, not a file entry.
    if header[0] == 0 {
        return false;
    }

    // Numeric header fields are octal text padded with spaces and/or NULs.
    let mode_ok = header[100..108].iter().all(|&b| is_tar_numeric_byte(b));
    let checksum_field_ok = header[148..156].iter().all(|&b| is_tar_numeric_byte(b));
    if !mode_ok || !checksum_field_ok {
        return false;
    }

    // POSIX (ustar) archives identify themselves explicitly.
    if &header[257..262] == b"ustar" {
        return true;
    }

    // Archives without the magic must at least carry a valid header checksum.
    tar_checksum_matches(header)
}

/// True for the bytes permitted in a numeric TAR header field.
fn is_tar_numeric_byte(b: u8) -> bool {
    b == 0 || b == b' ' || (b'0'..=b'7').contains(&b)
}

/// Verify the checksum of a 512-byte TAR `header` block.
///
/// The stored value is the octal number in bytes 148..156 (leading spaces/NULs
/// skipped, parsing stops at the first non-digit). The expected value is the
/// sum of all header bytes with the checksum field itself counted as spaces.
/// Both the unsigned sum and the signed-byte sum are accepted, since some
/// historic tar implementations summed signed chars.
fn tar_checksum_matches(header: &[u8]) -> bool {
    const CHECKSUM_FIELD: std::ops::Range<usize> = 148..156;

    let stored = header[CHECKSUM_FIELD]
        .iter()
        .copied()
        .skip_while(|&b| b == b' ' || b == 0)
        .take_while(|b| (b'0'..=b'7').contains(b))
        .fold(None, |acc: Option<i64>, digit| {
            Some(acc.unwrap_or(0) * 8 + i64::from(digit - b'0'))
        });
    let stored = match stored {
        Some(value) => value,
        // No digits at all: there is nothing to verify against.
        None => return false,
    };

    let (unsigned_sum, signed_sum) =
        header
            .iter()
            .enumerate()
            .fold((0i64, 0i64), |(unsigned, signed), (index, &byte)| {
                let byte = if CHECKSUM_FIELD.contains(&index) {
                    b' '
                } else {
                    byte
                };
                // `as i8` deliberately reinterprets the byte as a signed char.
                (unsigned + i64::from(byte), signed + i64::from(byte as i8))
            });

    stored == unsigned_sum || stored == signed_sum
}
|
|
|
|
/// Compute the fraction of bytes in `data` that are printable ASCII.
///
/// A byte counts as printable when it is an ASCII graphic character or ASCII
/// whitespace. The result is `printable / data.len()`; for empty input that
/// division is `0.0 / 0.0`, so the result is NaN.
fn calculate_printable_ratio(data: &[u8]) -> f64 {
    let mut printable = 0usize;

    for byte in data {
        if byte.is_ascii_graphic() || byte.is_ascii_whitespace() {
            printable += 1;
        }
    }

    printable as f64 / data.len() as f64
}
|