fix: improve UTF-16 detection logic in is_binary function
Co-authored-by: aider (openai/andrew/openrouter/qwen/qwen3-coder) <aider@aider.chat>
This commit is contained in:
@@ -150,18 +150,30 @@ fn looks_like_utf16(data: &[u8]) -> bool {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut zero_count = 0;
|
// Check if it could be UTF-16 by looking at null patterns
|
||||||
let pairs = data.len() / 2;
|
let mut null_pairs = 0;
|
||||||
|
let max_checks = std::cmp::min(data.len() / 2, 50); // Check up to 50 character pairs
|
||||||
|
|
||||||
// Check if every other byte is zero (indicating UTF-16)
|
for i in 0..max_checks {
|
||||||
for i in 0..pairs {
|
|
||||||
if data[i * 2 + 1] == 0 {
|
if data[i * 2 + 1] == 0 {
|
||||||
zero_count += 1;
|
null_pairs += 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// If more than 50% of odd positions are zero, might be UTF-16
|
// If most high bytes are zero, it's likely UTF-16
|
||||||
zero_count as f64 / pairs as f64 > 0.5
|
if max_checks > 0 && null_pairs as f64 / max_checks as f64 > 0.7 {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Also check the reverse pattern (zero byte first in each pair, i.e. big-endian UTF-16)
|
||||||
|
let mut null_pairs_reverse = 0;
|
||||||
|
for i in 0..max_checks {
|
||||||
|
if i * 2 + 1 < data.len() && data[i * 2] == 0 {
|
||||||
|
null_pairs_reverse += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
null_pairs_reverse as f64 / max_checks as f64 > 0.7
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Check if data looks like a TAR archive
|
/// Check if data looks like a TAR archive
|
||||||
|
|||||||
Reference in New Issue
Block a user