refactor: improve binary detection with expanded file type support and cleaner code structure

Co-authored-by: aider (openai/andrew/openrouter/anthropic/claude-sonnet-4) <aider@aider.chat>
2025-08-11 11:38:12 -03:00
parent 68d182ee0b
commit b97e79ed2f
1 changed files with 197 additions and 98 deletions
--- a/src/meta_plugin/system.rs
+++ b/src/meta_plugin/system.rs
@@ -38,113 +38,212 @@ impl BinaryMetaPlugin {
            return false;
        }
        
+        // First check for known binary file signatures
+        if Self::has_binary_signature(data) {
+            return true;
+        }
+        
+        // Check for UTF-16 BOM (text)
+        if data.len() >= 2 {
+            if (data[0] == 0xFF && data[1] == 0xFE) || (data[0] == 0xFE && data[1] == 0xFF) {
+                return false; // UTF-16 with BOM is text
+            }
+        }
+        
+        // Check for UTF-8 BOM (text)
+        if data.len() >= 3 && data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF {
+            return false; // UTF-8 with BOM is text
+        }
+        
        // Check if it's valid UTF-8
        if std::str::from_utf8(data).is_ok() {
-            // Valid UTF-8, but might still be binary
-            // Check if it's UTF-16
-            if data.len() >= 2 {
-                // Check for BOM
-                if (data[0] == 0xFF && data[1] == 0xFE) || (data[0] == 0xFE && data[1] == 0xFF) {
-                    // UTF-16 with BOM is text
-                    return false;
-                }
-            }
+            // Valid UTF-8, check printable character ratio
+            return Self::calculate_printable_ratio(data) < 0.7;
+        }
+        
+        // Not valid UTF-8, check if it might be UTF-16 without BOM
+        if Self::looks_like_utf16(data) {
+            return false; // Likely UTF-16 text
+        }
+        
+        // Check for TAR format (special case with no magic number)
+        if Self::looks_like_tar(data) {
+            return true;
+        }
+        
+        // Final fallback: check printable character ratio
+        Self::calculate_printable_ratio(data) < 0.3
+    }
+    
+    /// Check for known binary file signatures
+    fn has_binary_signature(data: &[u8]) -> bool {
+        // Define binary file signatures with their minimum required lengths
+        let signatures: &[(&[u8], usize)] = &[
+            // Image formats
+            (&[0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A], 8), // PNG
+            (&[0xFF, 0xD8, 0xFF], 3), // JPEG (various subtypes)
+            (&[0x47, 0x49, 0x46, 0x38, 0x37, 0x61], 6), // GIF87a
+            (&[0x47, 0x49, 0x46, 0x38, 0x39, 0x61], 6), // GIF89a
+            (&[0x42, 0x4D], 2), // BMP
+            (&[0x00, 0x00, 0x01, 0x00], 4), // ICO
+            (&[0x49, 0x49, 0x2A, 0x00], 4), // TIFF (little endian)
+            (&[0x4D, 0x4D, 0x00, 0x2A], 4), // TIFF (big endian)
+            (&[0x52, 0x49, 0x46, 0x46], 4), // WebP (RIFF container)
+            (&[0x00, 0x00, 0x00, 0x0C, 0x6A, 0x50, 0x20, 0x20], 8), // JPEG 2000
            
-            // Count printable characters
-            let printable_count = data.iter().filter(|&&b| {
-                b.is_ascii_alphanumeric() || 
-                b.is_ascii_punctuation() || 
-                b.is_ascii_whitespace() ||
-                b == b' ' || b == b'\t' || b == b'\n' || b == b'\r'
-            }).count();
+            // Audio/Video formats
+            (&[0x49, 0x44, 0x33], 3), // MP3 with ID3v2
+            (&[0xFF, 0xFB], 2), // MP3
+            (&[0xFF, 0xF3], 2), // MP3
+            (&[0xFF, 0xF2], 2), // MP3
+            (&[0x4F, 0x67, 0x67, 0x53], 4), // OGG
+            (&[0x66, 0x74, 0x79, 0x70], 4), // "ftyp" (MP4/M4A/MOV) - matched at offset 0 here; the real offset-4 check follows the loop
+            (&[0x52, 0x49, 0x46, 0x46], 4), // WAV/AVI (RIFF)
+            (&[0x46, 0x4C, 0x56], 3), // FLV
+            (&[0x1A, 0x45, 0xDF, 0xA3], 4), // MKV/WebM
            
-            // If less than 70% of bytes are printable, consider it binary
-            let printable_ratio = printable_count as f64 / data.len() as f64;
-            return printable_ratio < 0.7;
-        } else {
-            // Not valid UTF-8, likely binary
-            // But check if it might be UTF-16 without BOM
-            if data.len() >= 2 && data.len() % 2 == 0 {
-                // Check if it looks like UTF-16 (every other byte is 0)
-                let mut zero_count = 0;
-                for (i, &byte) in data.iter().enumerate() {
-                    if i % 2 == 1 && byte == 0 {
-                        zero_count += 1;
-                    }
-                }
-                // If more than 50% of odd positions are zero, might be UTF-16
-                if zero_count as f64 / (data.len() / 2) as f64 > 0.5 {
-                    return false; // Likely UTF-16 text
-                }
-            }
+            // Archive formats
+            (&[0x50, 0x4B, 0x03, 0x04], 4), // ZIP
+            (&[0x50, 0x4B, 0x05, 0x06], 4), // ZIP (empty)
+            (&[0x50, 0x4B, 0x07, 0x08], 4), // ZIP (spanned)
+            (&[0x52, 0x61, 0x72, 0x21, 0x1A, 0x07, 0x00], 7), // RAR v1.5+
+            (&[0x52, 0x61, 0x72, 0x21, 0x1A, 0x07, 0x01, 0x00], 8), // RAR v5.0+
+            (&[0x1F, 0x8B], 2), // GZIP
+            (&[0x42, 0x5A, 0x68], 3), // BZIP2
+            (&[0xFD, 0x37, 0x7A, 0x58, 0x5A, 0x00], 6), // XZ
+            (&[0x28, 0xB5, 0x2F, 0xFD], 4), // Zstandard
+            (&[0x04, 0x22, 0x4D, 0x18], 4), // LZ4
+            (&[0x1F, 0x9D], 2), // LZW compressed
+            (&[0x1F, 0xA0], 2), // LZH compressed
+            (&[0x37, 0x7A, 0xBC, 0xAF, 0x27, 0x1C], 6), // 7-Zip
            
-            // Check for common binary file signatures
-            if data.len() >= 4 {
-                // Check for common binary file headers
-                let headers = [
-                    // Image formats
-                    &[0x89, 0x50, 0x4E, 0x47], // PNG
-                    &[0xFF, 0xD8, 0xFF, 0xE0], // JPEG
-                    &[0x47, 0x49, 0x46, 0x38], // GIF
-                    &[0x42, 0x4D],             // BMP
-                    // Document formats
-                    &[0x25, 0x50, 0x44, 0x46], // PDF
-                    // Archive formats
-                    &[0x50, 0x4B, 0x03, 0x04], // ZIP
-                    &[0x52, 0x61, 0x72, 0x21], // RAR
-                    &[0x1F, 0x8B],             // GZIP
-                    &[0x42, 0x5A, 0x68],       // BZIP2
-                    &[0xFD, 0x37, 0x7A, 0x58, 0x5A, 0x00], // XZ
-                    // TAR has no magic number, but we can check for common patterns
-                    // Executables and object files
-                    &[0x7F, 0x45, 0x4C, 0x46], // ELF
-                    &[0x4D, 0x5A],             // Windows PE
-                    // Compressed formats
-                    &[0x1F, 0x9D],             // LZW compressed
-                    &[0x1F, 0xA0],             // LZH compressed
-                ];
-                
-                for header in &headers {
-                    if data.starts_with(header) {
-                        return true; // Definitely binary
-                    }
-                }
-                
-                // Special case for TAR files (no consistent magic number)
-                // Check if it looks like a TAR header
-                if data.len() >= 512 {
-                    // TAR headers have specific structure
-                    // First 100 bytes are filename (null-terminated)
-                    // Next 8 bytes are file mode (octal, null-terminated)
-                    // If we see this pattern, it's likely a TAR file
-                    let has_tar_structure = 
-                        data[0] != 0 && // First byte of filename should not be null
-                        data[100] == 0 && // File mode should start with null
-                        data[101] >= b'0' && data[101] <= b'7'; // File mode should be octal digit
-                    
-                    if has_tar_structure {
-                        return true;
-                    }
-                }
-            }
+            // Document formats
+            (&[0x25, 0x50, 0x44, 0x46], 4), // PDF
+            (&[0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1], 8), // MS Office (OLE)
+            (&[0x50, 0x4B, 0x03, 0x04], 4), // Office Open XML (also ZIP)
+            (&[0x7B, 0x5C, 0x72, 0x74, 0x66], 5), // RTF
            
-            // Check for AR format (used for static libraries)
-            if data.len() >= 8 && &data[0..8] == b"!<arch>\n" {
+            // Executables and object files
+            (&[0x7F, 0x45, 0x4C, 0x46], 4), // ELF
+            (&[0x4D, 0x5A], 2), // Windows PE/DOS
+            (&[0xCA, 0xFE, 0xBA, 0xBE], 4), // Mach-O universal (fat) binary
+            (&[0xFE, 0xED, 0xFA, 0xCE], 4), // Mach-O 32-bit (big endian)
+            (&[0xFE, 0xED, 0xFA, 0xCF], 4), // Mach-O 64-bit (big endian)
+            (&[0xCE, 0xFA, 0xED, 0xFE], 4), // Mach-O 32-bit (little endian)
+            (&[0xCF, 0xFA, 0xED, 0xFE], 4), // Mach-O 64-bit (little endian)
+            (&[0xCA, 0xFE, 0xBA, 0xBE], 4), // Java class file
+            (&[0xDE, 0xC0, 0x17, 0x0B], 4), // LLVM bitcode wrapper (0x0B17C0DE, little endian); Dalvik .dex files start with "dex\n" instead
+            
+            // Database formats
+            (&[0x53, 0x51, 0x4C, 0x69, 0x74, 0x65, 0x20, 0x66, 0x6F, 0x72, 0x6D, 0x61, 0x74, 0x20, 0x33, 0x00], 16), // SQLite
+            (&[0x00, 0x01, 0x00, 0x00], 4), // Palm Database
+            
+            // Font formats
+            (&[0x00, 0x01, 0x00, 0x00, 0x00], 5), // TrueType
+            (&[0x4F, 0x54, 0x54, 0x4F], 4), // OpenType
+            (&[0x77, 0x4F, 0x46, 0x46], 4), // WOFF
+            (&[0x77, 0x4F, 0x46, 0x32], 4), // WOFF2
+            
+            // Virtual machine formats
+            (&[0x76, 0x6D, 0x64, 0x6B], 4), // VMDK
+            (&[0x3C, 0x3C, 0x3C, 0x20, 0x4F, 0x72, 0x61, 0x63, 0x6C, 0x65, 0x20, 0x56, 0x4D, 0x20, 0x56, 0x69, 0x72, 0x74, 0x75, 0x61, 0x6C, 0x42, 0x6F, 0x78, 0x20, 0x44, 0x69, 0x73, 0x6B, 0x20, 0x49, 0x6D, 0x61, 0x67, 0x65, 0x20, 0x3E, 0x3E, 0x3E], 39), // VirtualBox VDI
+            
+            // Disk image formats
+            (&[0xEB, 0x3C, 0x90], 3), // FAT12/16/32
+            (&[0xEB, 0x58, 0x90], 3), // FAT32
+            (&[0x55, 0xAA], 2), // Boot sector signature (lives at offset 510, but this table only matches at offset 0)
+            
+            // Other binary formats
+            (&[0x21, 0x3C, 0x61, 0x72, 0x63, 0x68, 0x3E, 0x0A], 8), // AR archive
+            (&[0x78, 0x01], 2), // zlib (no/lowest compression)
+            (&[0x78, 0x9C], 2), // zlib (default compression)
+            (&[0x78, 0xDA], 2), // zlib (best compression)
+            (&[0x62, 0x76, 0x78, 0x32], 4), // LZFSE
+        ];
+        
+        for (signature, min_len) in signatures {
+            if data.len() >= *min_len && data.starts_with(signature) {
                return true;
            }
-            
-            // Count printable characters as a fallback
-            let printable_count = data.iter().filter(|&&b| {
-                b.is_ascii_alphanumeric() || 
-                b.is_ascii_punctuation() || 
-                b.is_ascii_whitespace() ||
-                b == b' ' || b == b'\t' || b == b'\n' || b == b'\r'
-            }).count();
-            
-            // If less than 30% of bytes are printable, consider it binary
-            let printable_ratio = printable_count as f64 / data.len() as f64;
-            printable_ratio < 0.3
        }
+        
+        // Special case: check for ftyp box in MP4/MOV files (at offset 4)
+        if data.len() >= 8 && &data[4..8] == b"ftyp" {
+            return true;
+        }
+        
+        false
+    }
+    
+    /// Heuristic for BOM-less UTF-16: true when more than half of the odd-indexed bytes are zero (as in mostly-ASCII UTF-16LE)
+    fn looks_like_utf16(data: &[u8]) -> bool {
+        if data.len() < 4 || data.len() % 2 != 0 { // too short, or not a whole number of 2-byte units
+            return false;
+        }
+        
+        let mut zero_count = 0;
+        let pairs = data.len() / 2;
+        
+        // Count the 2-byte units whose second byte is zero; even-indexed bytes (where UTF-16BE would put its zeros) are never inspected
+        for i in 0..pairs {
+            if data[i * 2 + 1] == 0 {
+                zero_count += 1;
+            }
+        }
+        
+        // Strictly more than 50% of odd positions zero => treat as UTF-16
+        zero_count as f64 / pairs as f64 > 0.5
+    }
+    
+    /// Heuristic TAR check on the first 512 bytes: non-NUL first byte, and only octal digits, spaces or NULs in bytes 100..108 and 148..156; the "ustar" magic is checked but not required
+    fn looks_like_tar(data: &[u8]) -> bool {
+        if data.len() < 512 { // every index read below is < 512, so this guard makes the indexing safe
+            return false;
+        }
+        
+        // TAR header structure validation
+        // Filename should not start with null
+        if data[0] == 0 {
+            return false;
+        }
+        
+        // Check file mode field (octal digits; NUL and space are also accepted)
+        for i in 100..108 {
+            if data[i] != 0 && (data[i] < b'0' || data[i] > b'7') && data[i] != b' ' {
+                return false;
+            }
+        }
+        
+        // Check checksum field (octal digits; NUL and space are also accepted)
+        for i in 148..156 {
+            if data[i] != 0 && (data[i] < b'0' || data[i] > b'7') && data[i] != b' ' {
+                return false;
+            }
+        }
+        
+        // Check magic field for POSIX TAR ("ustar" at bytes 257..262)
+        if data.len() >= 265 { // always true here: len >= 512 was enforced at the top
+            let magic = &data[257..262];
+            if magic == b"ustar" {
+                return true;
+            }
+        }
+        
+        // NOTE(review): both conditions below were already enforced by the early returns above, so this is always true at this point
+        let has_reasonable_structure = 
+            data[0] != 0 && // Filename starts
+            data[100..108].iter().all(|&b| b == 0 || (b >= b'0' && b <= b'7') || b == b' '); // Mode field
+        
+        has_reasonable_structure
+    }
+    
+    /// Fraction of bytes that are ASCII graphic or ASCII whitespace, in 0.0..=1.0; empty `data` yields NaN (0/0), so callers must handle that case
+    fn calculate_printable_ratio(data: &[u8]) -> f64 {
+        let printable_count = data.iter().filter(|&&b| {
+            b.is_ascii_graphic() || b.is_ascii_whitespace() // any non-ASCII byte (>= 0x80) counts as non-printable
+        }).count();
+        
+        printable_count as f64 / data.len() as f64
    }
 }