From 86dabbdbc02a03837b1c1136c77bfcdec69bd9b9 Mon Sep 17 00:00:00 2001
From: Andrew Phillips
Date: Mon, 11 Aug 2025 14:18:53 -0300
Subject: [PATCH] refactor: move binary detection to common module and enhance get logic

Co-authored-by: aider (openai/andrew/openrouter/qwen/qwen3-coder)
---
Review notes (git ignores everything between the "---" line and the first diff):

* This patch was rebuilt from a copy whose line breaks and indentation had
  been collapsed. Indentation and blank-line placement of the context lines
  are a best guess constrained by the hunk line counts; if a hunk is rejected,
  try "git apply --ignore-whitespace" or re-create the hunk by hand.
* The author's e-mail address was missing from the copy and is still missing;
  "git am" needs it to be restored.
* "io::Result<String>" in the system.rs context was restored from
  "io::Result"; the function returns strings, TODO confirm against the file.
* The post-image blob hashes on the "index" lines of src/common.rs and
  src/modes/get.rs predate the changes below and are stale.
* Changes relative to the original submission:
  - is_binary: valid UTF-8 is scored on decoded characters, so non-ASCII text
    is no longer classified as binary; a multi-byte character cut off by the
    end of the sniff buffer is ignored instead of failing UTF-8 validation.
  - is_binary: TAR detection runs before the text heuristics and validates the
    header checksum. The previous check accepted every header that passed its
    field checks, and was only reached for data that was not UTF-8.
  - has_binary_signature: "ftyp" and the 0x55 0xAA boot marker are matched at
    offsets 4 and 510 instead of 0; duplicate entries and the redundant
    length column are gone; Mach-O, zlib and LLVM labels corrected; the VMDK
    magic is the on-disk "KDMV".
  - looks_like_utf16: recognises big-endian data and ignores NUL-NUL pairs.
  - src/common.rs no longer imports std::io::Read, which it never used.
  - get.rs: the sniff reads until 4 KiB or EOF instead of relying on a single
    read(); the Write trait is brought into scope for write_all(); stdout is
    locked once and flushed.
* Still open: system.rs keeps its own BinaryMetaPlugin::is_binary, which this
  patch stops calling; confirm whether it can be deleted.

 src/common.rs             | 252 ++++++++++++++++++++++++++++++++++++++
 src/meta_plugin/system.rs |   3 +-
 src/modes/get.rs          |  46 +++++--
 3 files changed, 293 insertions(+), 8 deletions(-)
 create mode 100644 src/common.rs

diff --git a/src/common.rs b/src/common.rs
new file mode 100644
index 0000000..177f671
--- /dev/null
+++ b/src/common.rs
@@ -0,0 +1,252 @@
+//! Helpers shared by the modes and the meta plugins.
+
+/// Data is classified as binary when fewer than this fraction of it is printable.
+const MIN_PRINTABLE_RATIO: f64 = 0.7;
+
+/// Size in bytes of one TAR header block.
+const TAR_BLOCK_SIZE: usize = 512;
+
+/// Magic numbers that identify a binary format when found at offset 0.
+///
+/// Several of them are short runs of printable ASCII (`BM`, `MZ`, `ID3`, ...), so text
+/// that happens to start with the same bytes is classified as binary as well.
+const BINARY_SIGNATURES: &[&[u8]] = &[
+    // Image formats
+    &[0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A], // PNG
+    &[0xFF, 0xD8, 0xFF], // JPEG (all subtypes)
+    b"GIF87a",
+    b"GIF89a",
+    b"BM", // BMP
+    &[0x00, 0x00, 0x01, 0x00], // ICO
+    b"II*\0", // TIFF (little endian)
+    b"MM\0*", // TIFF (big endian)
+    b"RIFF", // RIFF container: WebP, WAV, AVI
+    &[0x00, 0x00, 0x00, 0x0C, 0x6A, 0x50, 0x20, 0x20], // JPEG 2000
+    // Audio/video formats (the MP4/MOV magic sits at offset 4 and is checked separately)
+    b"ID3", // MP3 with ID3v2 tag
+    &[0xFF, 0xFB], // MP3 frame sync
+    &[0xFF, 0xF3], // MP3 frame sync
+    &[0xFF, 0xF2], // MP3 frame sync
+    b"OggS", // Ogg
+    b"FLV", // Flash video
+    &[0x1A, 0x45, 0xDF, 0xA3], // Matroska/WebM
+    // Archive and compression formats
+    b"PK\x03\x04", // ZIP, also Office Open XML
+    b"PK\x05\x06", // ZIP (empty)
+    b"PK\x07\x08", // ZIP (spanned)
+    b"Rar!\x1A\x07\x00", // RAR 1.5+
+    b"Rar!\x1A\x07\x01\x00", // RAR 5.0+
+    &[0x1F, 0x8B], // gzip
+    b"BZh", // bzip2
+    &[0xFD, 0x37, 0x7A, 0x58, 0x5A, 0x00], // xz
+    &[0x28, 0xB5, 0x2F, 0xFD], // Zstandard
+    &[0x04, 0x22, 0x4D, 0x18], // LZ4 frame
+    &[0x1F, 0x9D], // compress (LZW)
+    &[0x1F, 0xA0], // compress (LZH)
+    &[0x37, 0x7A, 0xBC, 0xAF, 0x27, 0x1C], // 7-Zip
+    &[0x78, 0x01], // zlib (no/low compression)
+    &[0x78, 0x9C], // zlib (default compression)
+    &[0x78, 0xDA], // zlib (best compression)
+    b"bvx2", // LZFSE
+    b"!<arch>\n", // ar archive
+    // Document formats
+    b"%PDF",
+    &[0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1], // MS Office (OLE2)
+    b"{\\rtf", // RTF
+    // Executables and object files
+    b"\x7FELF",
+    b"MZ", // DOS/Windows PE
+    &[0xCA, 0xFE, 0xBA, 0xBE], // Mach-O universal binary, Java class file
+    &[0xFE, 0xED, 0xFA, 0xCE], // Mach-O 32-bit (big endian)
+    &[0xFE, 0xED, 0xFA, 0xCF], // Mach-O 64-bit (big endian)
+    &[0xCE, 0xFA, 0xED, 0xFE], // Mach-O 32-bit (little endian)
+    &[0xCF, 0xFA, 0xED, 0xFE], // Mach-O 64-bit (little endian)
+    &[0xDE, 0xC0, 0x17, 0x0B], // LLVM bitcode wrapper
+    // Databases and fonts
+    b"SQLite format 3\0",
+    &[0x00, 0x01, 0x00, 0x00], // TrueType font, Palm database
+    b"OTTO", // OpenType
+    b"wOFF", // WOFF
+    b"wOF2", // WOFF2
+    // Disk images
+    b"KDMV", // VMDK sparse extent
+    b"<<< Oracle VM VirtualBox Disk Image >>>", // VirtualBox VDI
+    &[0xEB, 0x3C, 0x90], // FAT12/16 boot sector
+    &[0xEB, 0x58, 0x90], // FAT32 boot sector
+];
+
+/// Detect if data is binary or text.
+///
+/// `data` may be only the leading part of a larger stream: a multi-byte UTF-8 sequence
+/// cut off by the end of the buffer is ignored rather than treated as invalid.
+///
+/// Returns `true` if the data is likely binary and `false` if it is likely text. Empty
+/// input is reported as text.
+pub fn is_binary(data: &[u8]) -> bool {
+    if data.is_empty() {
+        return false;
+    }
+
+    // Container formats are recognised before any text heuristic runs: a TAR archive of
+    // text files is mostly printable, yet it is not something to write to a terminal.
+    if has_binary_signature(data) || looks_like_tar(data) {
+        return true;
+    }
+
+    // A byte-order mark (UTF-16 LE, UTF-16 BE or UTF-8) announces text.
+    const BOMS: [&[u8]; 3] = [&[0xFF, 0xFE], &[0xFE, 0xFF], &[0xEF, 0xBB, 0xBF]];
+    if BOMS.iter().any(|bom| data.starts_with(bom)) {
+        return false;
+    }
+
+    // Valid UTF-8 is judged on its decoded characters so that non-ASCII text counts as
+    // printable.
+    if let Some(text) = utf8_prefix(data) {
+        return printable_char_ratio(text) < MIN_PRINTABLE_RATIO;
+    }
+
+    // Not UTF-8; it may still be UTF-16 text without a BOM.
+    if looks_like_utf16(data) {
+        return false;
+    }
+
+    // Final fallback: ratio of printable ASCII bytes.
+    calculate_printable_ratio(data) < MIN_PRINTABLE_RATIO
+}
+
+/// Check for known binary file signatures.
+fn has_binary_signature(data: &[u8]) -> bool {
+    if BINARY_SIGNATURES.iter().any(|magic| data.starts_with(magic)) {
+        return true;
+    }
+
+    // ISO base media files (MP4/M4A/MOV) carry the `ftyp` box type at offset 4.
+    if data.get(4..8) == Some(&b"ftyp"[..]) {
+        return true;
+    }
+
+    // Boot sectors end with the 0x55 0xAA marker at offset 510.
+    data.get(510..512) == Some(&[0x55u8, 0xAA][..])
+}
+
+/// Return the leading part of `data` that is valid UTF-8, tolerating one defect: a
+/// multi-byte sequence cut short by the end of the buffer.
+///
+/// Returns `None` if `data` holds an invalid sequence or no complete character.
+fn utf8_prefix(data: &[u8]) -> Option<&str> {
+    match std::str::from_utf8(data) {
+        Ok(text) => Some(text),
+        // `error_len()` is `None` when the input ends in the middle of a character.
+        Err(err) if err.error_len().is_none() && err.valid_up_to() > 0 => {
+            std::str::from_utf8(&data[..err.valid_up_to()]).ok()
+        }
+        Err(_) => None,
+    }
+}
+
+/// Check if data looks like UTF-16 without BOM.
+///
+/// Counts the 16-bit units in which exactly one byte is zero (the encoding of U+0001
+/// to U+00FF) and accepts the data when, for either byte order, such units make up more
+/// than half of all units.
+fn looks_like_utf16(data: &[u8]) -> bool {
+    if data.len() < 4 || data.len() % 2 != 0 {
+        return false;
+    }
+
+    let units = data.len() / 2;
+    let (little_endian, big_endian) = data
+        .chunks_exact(2)
+        .fold((0usize, 0usize), |(le, be), unit| match (unit[0], unit[1]) {
+            (0, 0) => (le, be), // U+0000 is not text in either byte order
+            (_, 0) => (le + 1, be),
+            (0, _) => (le, be + 1),
+            _ => (le, be),
+        });
+
+    little_endian.max(big_endian) * 2 > units
+}
+
+/// Check if data looks like a TAR archive.
+///
+/// TAR has no magic number at offset 0, so the first header block is validated through
+/// its checksum field: the octal sum of all 512 header bytes, computed with the checksum
+/// field itself read as eight spaces.
+fn looks_like_tar(data: &[u8]) -> bool {
+    let header = match data.get(..TAR_BLOCK_SIZE) {
+        // The entry name at offset 0 is never empty.
+        Some(header) if header[0] != 0 => header,
+        _ => return false,
+    };
+
+    let checksum_field = &header[148..156];
+    let stored = match parse_tar_octal(checksum_field) {
+        Some(stored) => stored,
+        None => return false,
+    };
+
+    let sum_of = |bytes: &[u8]| bytes.iter().map(|&byte| u32::from(byte)).sum::<u32>();
+    let computed = sum_of(header) - sum_of(checksum_field) + 8 * u32::from(b' ');
+
+    stored == computed
+}
+
+/// Parse a TAR numeric field: optional leading spaces, octal digits, then only spaces
+/// or NULs.
+///
+/// Returns `None` if the field holds anything else, has no digit, or overflows `u32`.
+fn parse_tar_octal(field: &[u8]) -> Option<u32> {
+    let mut value = 0u32;
+    let mut digits = 0usize;
+    let mut terminated = false;
+
+    for &byte in field {
+        match byte {
+            b'0'..=b'7' if !terminated => {
+                value = value.checked_mul(8)?.checked_add(u32::from(byte - b'0'))?;
+                digits += 1;
+            }
+            b' ' if digits == 0 => {}
+            b' ' | 0 if digits > 0 => terminated = true,
+            _ => return None,
+        }
+    }
+
+    if digits > 0 {
+        Some(value)
+    } else {
+        None
+    }
+}
+
+/// Calculate the ratio of printable characters in decoded text.
+///
+/// Control characters count as unprintable, except for ASCII whitespace (space, tab,
+/// line feed, form feed, carriage return). Empty text yields 1.0.
+fn printable_char_ratio(text: &str) -> f64 {
+    let (printable, total) = text.chars().fold((0usize, 0usize), |(shown, seen), c| {
+        let is_printable = !c.is_control() || c.is_ascii_whitespace();
+        (shown + usize::from(is_printable), seen + 1)
+    });
+
+    if total == 0 {
+        return 1.0;
+    }
+    printable as f64 / total as f64
+}
+
+/// Calculate the ratio of printable ASCII bytes in the data.
+///
+/// Meant for data that is not UTF-8. Empty data yields 1.0.
+fn calculate_printable_ratio(data: &[u8]) -> f64 {
+    if data.is_empty() {
+        return 1.0;
+    }
+
+    let printable_count = data
+        .iter()
+        .filter(|byte| byte.is_ascii_graphic() || byte.is_ascii_whitespace())
+        .count();
+
+    printable_count as f64 / data.len() as f64
+}
diff --git a/src/meta_plugin/system.rs b/src/meta_plugin/system.rs
index 4fa6ef4..8a2c027 100644
--- a/src/meta_plugin/system.rs
+++ b/src/meta_plugin/system.rs
@@ -8,6 +8,7 @@ use std::env;
 use std::process;
 use uzers::{get_current_uid, get_current_gid, get_current_username, get_current_groupname};
 
+use crate::common::is_binary;
 use crate::meta_plugin::MetaPlugin;
 
 #[derive(Debug, Clone, Default)]
@@ -258,7 +259,7 @@ impl MetaPlugin for BinaryMetaPlugin {
     }
 
     fn finalize(&mut self) -> io::Result<String> {
-        let is_binary = Self::is_binary(&self.buffer);
+        let is_binary = is_binary(&self.buffer);
         Ok(if is_binary { "true".to_string() } else { "false".to_string() })
     }
 
diff --git a/src/modes/get.rs b/src/modes/get.rs
index f07d38a..f095f35 100644
--- a/src/modes/get.rs
+++ b/src/modes/get.rs
@@ -1,6 +1,8 @@
 use anyhow::anyhow;
+use std::io::Read;
 
 use crate::compression_engine::{CompressionType, get_compression_engine};
+use crate::common::is_binary;
 use clap::Command;
 use std::path::PathBuf;
 use std::str::FromStr;
@@ -43,22 +45,52 @@ pub fn mode_get(
         let mut item_path = data_path.clone();
         item_path.push(item_id.to_string());
 
-        // Check if this is a binary item and we're outputting to a TTY
-        if !args.options.force {
+        // Determine if we should detect binary data
+        let mut detect_binary = !args.options.force && is_stdout_tty();
+
+        // If we're detecting binary and there's binary metadata, check it
+        if detect_binary {
             let item_meta = crate::db::get_item_meta(conn, &item)?;
             let binary_meta = item_meta.into_iter().find(|meta| meta.name == "binary");
             if let Some(binary_meta) = binary_meta {
-                if binary_meta.value == "true" {
-                    if is_stdout_tty() {
-                        return Err(anyhow!("Refusing to output binary data to TTY, use --force to override"));
-                    }
+                if binary_meta.value == "false" {
+                    // If metadata says it's not binary, don't detect
+                    detect_binary = false;
+                } else if binary_meta.value == "true" {
+                    // If metadata says it's binary, error immediately
+                    return Err(anyhow!("Refusing to output binary data to TTY, use --force to override"));
                 }
             }
         }
 
         let compression_type = CompressionType::from_str(&item.compression)?;
         let compression_engine = get_compression_engine(compression_type)?;
-        compression_engine.cat(item_path.clone())?;
+
+        if detect_binary {
+            // `write_all` and `flush` are `Write` methods; the anonymous import cannot
+            // clash with any import elsewhere in this file.
+            use std::io::Write as _;
+
+            // Sniff up to 4 KiB of decompressed data. A single `read` may legally return
+            // fewer bytes than requested, so keep reading until the limit or EOF.
+            let mut reader = compression_engine.open(item_path.clone())?;
+            let mut head = Vec::with_capacity(4096);
+            Read::take(&mut reader, 4096).read_to_end(&mut head)?;
+
+            if is_binary(&head) {
+                return Err(anyhow!("Refusing to output binary data to TTY, use --force to override"));
+            }
+
+            // Text: emit the sniffed bytes, then stream the rest through one locked handle.
+            let stdout = std::io::stdout();
+            let mut out = stdout.lock();
+            out.write_all(&head)?;
+            std::io::copy(&mut reader, &mut out)?;
+            out.flush()?;
+        } else {
+            // No binary detection needed, just output the data
+            compression_engine.cat(item_path.clone())?;
+        }
 
         Ok(())
     } else {