feat: add binary meta plugin to detect text vs binary content

Co-authored-by: aider (openai/andrew/openrouter/qwen/qwen3-coder) <aider@aider.chat>
This commit is contained in:
Andrew Phillips
2025-08-11 11:32:16 -03:00
parent ac531354d5
commit dc550c3f35

View File

@@ -15,6 +15,126 @@ pub struct CwdMetaPlugin {
meta_name: String,
}
/// Meta plugin that classifies buffered content as binary or text.
///
/// The plugin keeps a bounded prefix of the data it is fed (at most
/// `max_buffer_size` bytes) and runs [`BinaryMetaPlugin::is_binary`] on it.
#[derive(Debug, Clone)]
pub struct BinaryMetaPlugin {
    /// Name reported for this piece of metadata.
    meta_name: String,
    /// Prefix of the observed data used for classification.
    buffer: Vec<u8>,
    /// Upper bound, in bytes, on how much data is kept in `buffer`.
    max_buffer_size: usize,
}

impl Default for BinaryMetaPlugin {
    /// Equivalent to [`BinaryMetaPlugin::new`].
    ///
    /// A derived `Default` would produce `max_buffer_size == 0` and an empty
    /// `meta_name`, i.e. a plugin that can never buffer any data.
    fn default() -> Self {
        Self::new()
    }
}

impl BinaryMetaPlugin {
    /// Number of leading bytes inspected by default (4 KiB).
    const DEFAULT_MAX_BUFFER_SIZE: usize = 4096;

    /// Magic numbers of well-known binary formats. Typed as byte slices so
    /// entries may have different lengths.
    const BINARY_SIGNATURES: [&'static [u8]; 5] = [
        &[0x89, 0x50, 0x4E, 0x47], // PNG
        &[0xFF, 0xD8, 0xFF],       // JPEG (JFIF, EXIF, ...)
        &[0x25, 0x50, 0x44, 0x46], // PDF ("%PDF")
        &[0x50, 0x4B, 0x03, 0x04], // ZIP
        &[0x52, 0x61, 0x72, 0x21], // RAR ("Rar!")
    ];

    /// Creates a plugin named `"binary"` with an empty buffer and the default
    /// 4 KiB inspection window.
    pub fn new() -> BinaryMetaPlugin {
        BinaryMetaPlugin {
            meta_name: "binary".to_string(),
            buffer: Vec::new(),
            max_buffer_size: Self::DEFAULT_MAX_BUFFER_SIZE,
        }
    }

    /// Detect if data is binary or text.
    ///
    /// Returns `true` if `data` is likely binary, `false` if it is likely
    /// text. Empty input is treated as text.
    ///
    /// The checks run in this order:
    /// 1. a known binary file signature means binary;
    /// 2. a UTF-16 byte-order mark means text;
    /// 3. valid UTF-8 (ignoring a multi-byte character cut off at the very
    ///    end) is binary when fewer than 70% of its *characters* are textual;
    /// 4. otherwise, data that looks like BOM-less UTF-16LE is text;
    /// 5. otherwise, data is binary when fewer than 30% of its bytes are
    ///    printable ASCII.
    fn is_binary(data: &[u8]) -> bool {
        if data.is_empty() {
            return false;
        }

        // Signatures go first: some of them are plain ASCII ("%PDF", "Rar!"),
        // and zero-heavy payloads could otherwise pass the UTF-16 guess below.
        if Self::BINARY_SIGNATURES
            .iter()
            .any(|&signature| data.starts_with(signature))
        {
            return true;
        }

        // 0xFF / 0xFE never occur in valid UTF-8, so the BOM has to be
        // checked before (not after) UTF-8 validation to be reachable.
        if data.starts_with(&[0xFF, 0xFE]) || data.starts_with(&[0xFE, 0xFF]) {
            return false;
        }

        if let Some(text) = Self::utf8_prefix(data) {
            // Score characters, not bytes: multi-byte characters are ordinary
            // text and must not count against the ratio.
            let total = text.chars().count();
            let textual = text.chars().filter(|&c| Self::is_textual_char(c)).count();
            return (textual as f64 / total as f64) < 0.7;
        }

        // BOM-less UTF-16LE guess: more than half of the high bytes are zero.
        // NUL-interleaved pure ASCII is itself valid UTF-8 and is handled by
        // the branch above; this only sees data that failed UTF-8 validation.
        if data.len() % 2 == 0 {
            let code_units = data.len() / 2;
            let zero_high_bytes = data
                .iter()
                .skip(1)
                .step_by(2)
                .filter(|&&byte| byte == 0)
                .count();
            if zero_high_bytes * 2 > code_units {
                return false;
            }
        }

        // Fallback for data in an unknown encoding.
        let printable = data
            .iter()
            .filter(|&&byte| Self::is_printable_byte(byte))
            .count();
        (printable as f64 / data.len() as f64) < 0.3
    }

    /// Returns `data` as UTF-8 text, tolerating a multi-byte character that is
    /// cut off at the very end (which happens when a stream is truncated to a
    /// fixed-size buffer). Returns `None` for invalid UTF-8 or when no
    /// complete character is available.
    fn utf8_prefix(data: &[u8]) -> Option<&str> {
        let text = match std::str::from_utf8(data) {
            Ok(text) => text,
            // `error_len() == None` means "unexpected end of input", i.e. the
            // bytes up to `valid_up_to()` are valid and only the tail is cut.
            Err(err) if err.error_len().is_none() => {
                std::str::from_utf8(&data[..err.valid_up_to()]).ok()?
            }
            Err(_) => return None,
        };
        if text.is_empty() {
            None
        } else {
            Some(text)
        }
    }

    /// ASCII characters are textual when graphic or whitespace; non-ASCII
    /// characters are textual unless they are control characters.
    fn is_textual_char(c: char) -> bool {
        if c.is_ascii() {
            c.is_ascii_graphic() || c.is_ascii_whitespace()
        } else {
            !c.is_control()
        }
    }

    /// Printable ASCII byte: graphic (letters, digits, punctuation) or
    /// ASCII whitespace.
    fn is_printable_byte(byte: u8) -> bool {
        byte.is_ascii_graphic() || byte.is_ascii_whitespace()
    }
}
impl MetaPlugin for BinaryMetaPlugin {
    /// Returns a writer that discards everything written to it
    /// (`io::sink()`).
    fn create(&self) -> Result<Box<dyn Write>> {
        let discard: Box<dyn Write> = Box::new(io::sink());
        Ok(discard)
    }

    /// Classifies the buffered bytes and returns `"true"` when they look
    /// binary, `"false"` otherwise.
    fn finalize(&mut self) -> io::Result<String> {
        let verdict = Self::is_binary(&self.buffer);
        // `bool::to_string` yields exactly "true" / "false".
        Ok(verdict.to_string())
    }

    /// Appends `data` to the internal buffer until `max_buffer_size` bytes
    /// have been collected; anything beyond that is ignored.
    fn update(&mut self, data: &[u8]) {
        let room = self.max_buffer_size.saturating_sub(self.buffer.len());
        if room == 0 {
            return;
        }
        let take = data.len().min(room);
        self.buffer.extend_from_slice(&data[..take]);
    }

    /// Returns a copy of this plugin's metadata name.
    fn meta_name(&mut self) -> String {
        self.meta_name.as_str().to_owned()
    }
}
impl CwdMetaPlugin {
pub fn new() -> CwdMetaPlugin {
CwdMetaPlugin {