feat: add binary meta plugin to detect text vs binary content
Co-authored-by: aider (openai/andrew/openrouter/qwen/qwen3-coder) <aider@aider.chat>
This commit is contained in:
@@ -15,6 +15,126 @@ pub struct CwdMetaPlugin {
|
||||
meta_name: String,
|
||||
}
|
||||
|
||||
/// Meta plugin that classifies a stream as text or binary.
///
/// The plugin keeps a bounded sample of the leading bytes it is fed and
/// classifies that sample when finalized.
#[derive(Debug, Clone)]
pub struct BinaryMetaPlugin {
    /// Name under which the plugin reports its result.
    meta_name: String,
    /// Sample of the leading bytes seen so far.
    buffer: Vec<u8>,
    /// Upper bound, in bytes, on how much of the stream is sampled.
    max_buffer_size: usize,
}

impl Default for BinaryMetaPlugin {
    /// Builds the same configuration as `BinaryMetaPlugin::new`.
    ///
    /// A derived `Default` would leave `max_buffer_size` at 0 and `meta_name`
    /// empty, producing a plugin that never samples any data and reports under
    /// an empty name.
    fn default() -> Self {
        BinaryMetaPlugin {
            meta_name: "binary".to_string(),
            buffer: Vec::new(),
            max_buffer_size: 4096, // 4KB
        }
    }
}
|
||||
|
||||
impl BinaryMetaPlugin {
|
||||
pub fn new() -> BinaryMetaPlugin {
|
||||
BinaryMetaPlugin {
|
||||
meta_name: "binary".to_string(),
|
||||
buffer: Vec::new(),
|
||||
max_buffer_size: 4096, // 4KB
|
||||
}
|
||||
}
|
||||
|
||||
/// Detect if data is binary or text
|
||||
/// Returns true if data is likely binary, false if likely text
|
||||
fn is_binary(data: &[u8]) -> bool {
|
||||
if data.is_empty() {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check if it's valid UTF-8
|
||||
if std::str::from_utf8(data).is_ok() {
|
||||
// Valid UTF-8, but might still be binary
|
||||
// Check if it's UTF-16
|
||||
if data.len() >= 2 {
|
||||
// Check for BOM
|
||||
if (data[0] == 0xFF && data[1] == 0xFE) || (data[0] == 0xFE && data[1] == 0xFF) {
|
||||
// UTF-16 with BOM is text
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Count printable characters
|
||||
let printable_count = data.iter().filter(|&&b| {
|
||||
b.is_ascii_alphanumeric() ||
|
||||
b.is_ascii_punctuation() ||
|
||||
b.is_ascii_whitespace() ||
|
||||
b == b' ' || b == b'\t' || b == b'\n' || b == b'\r'
|
||||
}).count();
|
||||
|
||||
// If less than 70% of bytes are printable, consider it binary
|
||||
let printable_ratio = printable_count as f64 / data.len() as f64;
|
||||
return printable_ratio < 0.7;
|
||||
} else {
|
||||
// Not valid UTF-8, likely binary
|
||||
// But check if it might be UTF-16 without BOM
|
||||
if data.len() >= 2 && data.len() % 2 == 0 {
|
||||
// Check if it looks like UTF-16 (every other byte is 0)
|
||||
let mut zero_count = 0;
|
||||
for (i, &byte) in data.iter().enumerate() {
|
||||
if i % 2 == 1 && byte == 0 {
|
||||
zero_count += 1;
|
||||
}
|
||||
}
|
||||
// If more than 50% of odd positions are zero, might be UTF-16
|
||||
if zero_count as f64 / (data.len() / 2) as f64 > 0.5 {
|
||||
return false; // Likely UTF-16 text
|
||||
}
|
||||
}
|
||||
|
||||
// Check for common binary file signatures
|
||||
if data.len() >= 4 {
|
||||
// Check for common binary file headers
|
||||
let headers = [
|
||||
&[0x89, 0x50, 0x4E, 0x47], // PNG
|
||||
&[0xFF, 0xD8, 0xFF, 0xE0], // JPEG
|
||||
&[0x25, 0x50, 0x44, 0x46], // PDF
|
||||
&[0x50, 0x4B, 0x03, 0x04], // ZIP
|
||||
&[0x52, 0x61, 0x72, 0x21], // RAR
|
||||
];
|
||||
|
||||
for header in &headers {
|
||||
if data.starts_with(header) {
|
||||
return true; // Definitely binary
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Count printable characters as a fallback
|
||||
let printable_count = data.iter().filter(|&&b| {
|
||||
b.is_ascii_alphanumeric() ||
|
||||
b.is_ascii_punctuation() ||
|
||||
b.is_ascii_whitespace() ||
|
||||
b == b' ' || b == b'\t' || b == b'\n' || b == b'\r'
|
||||
}).count();
|
||||
|
||||
// If less than 30% of bytes are printable, consider it binary
|
||||
let printable_ratio = printable_count as f64 / data.len() as f64;
|
||||
printable_ratio < 0.3
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl MetaPlugin for BinaryMetaPlugin {
|
||||
fn create(&self) -> Result<Box<dyn Write>> {
|
||||
Ok(Box::new(io::sink()))
|
||||
}
|
||||
|
||||
fn finalize(&mut self) -> io::Result<String> {
|
||||
let is_binary = Self::is_binary(&self.buffer);
|
||||
Ok(if is_binary { "true".to_string() } else { "false".to_string() })
|
||||
}
|
||||
|
||||
fn update(&mut self, data: &[u8]) {
|
||||
// Only collect up to max_buffer_size
|
||||
let remaining_capacity = self.max_buffer_size.saturating_sub(self.buffer.len());
|
||||
if remaining_capacity > 0 {
|
||||
let bytes_to_copy = std::cmp::min(data.len(), remaining_capacity);
|
||||
self.buffer.extend_from_slice(&data[..bytes_to_copy]);
|
||||
}
|
||||
}
|
||||
|
||||
fn meta_name(&mut self) -> String {
|
||||
self.meta_name.clone()
|
||||
}
|
||||
}
|
||||
|
||||
impl CwdMetaPlugin {
|
||||
pub fn new() -> CwdMetaPlugin {
|
||||
CwdMetaPlugin {
|
||||
|
||||
Reference in New Issue
Block a user