From dceadd585ac462073b9f557726294f98df8c1f6b Mon Sep 17 00:00:00 2001 From: Andrew Phillips Date: Tue, 12 Aug 2025 15:58:14 -0300 Subject: [PATCH] fix: improve UTF-16 detection logic in is_binary function Co-authored-by: aider (openai/andrew/openrouter/qwen/qwen3-coder) --- src/common/is_binary.rs | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/src/common/is_binary.rs b/src/common/is_binary.rs index 5b3db5b..9a57ac7 100644 --- a/src/common/is_binary.rs +++ b/src/common/is_binary.rs @@ -150,18 +150,30 @@ fn looks_like_utf16(data: &[u8]) -> bool { return false; } - let mut zero_count = 0; - let pairs = data.len() / 2; + // Check if it could be UTF-16 by looking at null patterns + let mut null_pairs = 0; + let max_checks = std::cmp::min(data.len() / 2, 50); // Check up to 50 character pairs - // Check if every other byte is zero (indicating UTF-16) - for i in 0..pairs { + for i in 0..max_checks { if data[i * 2 + 1] == 0 { - zero_count += 1; + null_pairs += 1; } } - // If more than 50% of odd positions are zero, might be UTF-16 - zero_count as f64 / pairs as f64 > 0.5 + // If most high bytes are zero, it's likely UTF-16 + if max_checks > 0 && null_pairs as f64 / max_checks as f64 > 0.7 { + return true; + } + + // Also check the reverse pattern (big-endian UTF-16) + let mut null_pairs_reverse = 0; + for i in 0..max_checks { + if i * 2 + 1 < data.len() && data[i * 2] == 0 { + null_pairs_reverse += 1; + } + } + + null_pairs_reverse as f64 / max_checks as f64 > 0.7 } /// Check if data looks like a TAR archive