fix: improve UTF-16 detection logic in is_binary function

Co-authored-by: aider (openai/andrew/openrouter/qwen/qwen3-coder) <aider@aider.chat>
This commit is contained in:
Andrew Phillips
2025-08-12 15:58:14 -03:00
parent 16644bb9a6
commit dceadd585a

View File

@@ -150,18 +150,30 @@ fn looks_like_utf16(data: &[u8]) -> bool {
return false; return false;
} }
let mut zero_count = 0; // Check if it could be UTF-16 by looking at null patterns
let pairs = data.len() / 2; let mut null_pairs = 0;
let max_checks = std::cmp::min(data.len() / 2, 50); // Check up to 50 character pairs
// Check if every other byte is zero (indicating UTF-16) for i in 0..max_checks {
for i in 0..pairs {
if data[i * 2 + 1] == 0 { if data[i * 2 + 1] == 0 {
zero_count += 1; null_pairs += 1;
} }
} }
// If more than 50% of odd positions are zero, might be UTF-16 // If most high bytes are zero, it's likely UTF-16
zero_count as f64 / pairs as f64 > 0.5 if max_checks > 0 && null_pairs as f64 / max_checks as f64 > 0.7 {
return true;
}
// Also check the reverse pattern (big-endian UTF-16: zero high byte comes first)
let mut null_pairs_reverse = 0;
for i in 0..max_checks {
if i * 2 + 1 < data.len() && data[i * 2] == 0 {
null_pairs_reverse += 1;
}
}
null_pairs_reverse as f64 / max_checks as f64 > 0.7
} }
/// Check if data looks like a TAR archive /// Check if data looks like a TAR archive