Files
keep/src/meta_plugin/system.rs
Andrew Phillips ab48d19dcd fix: adjust binary detection threshold to properly classify random data as binary
Co-authored-by: aider (openai/andrew/openrouter/qwen/qwen3-coder) <aider@aider.chat>
2025-08-11 13:21:58 -03:00

662 lines
18 KiB
Rust

use anyhow::Result;
use gethostname::gethostname;
use local_ip_address::local_ip;
use dns_lookup::lookup_addr;
use std::io;
use std::io::Write;
use std::env;
use std::process;
use uzers::{get_current_uid, get_current_gid, get_current_username, get_current_groupname};
use crate::meta_plugin::MetaPlugin;
/// Meta plugin whose `finalize()` reports the process's current working directory.
///
/// NOTE(review): the derived `Default` leaves `meta_name` empty, whereas
/// `CwdMetaPlugin::new()` sets it to `"cwd"`; prefer `new()` until the two agree.
#[derive(Debug, Clone, Default)]
pub struct CwdMetaPlugin {
/// Name returned by `meta_name()`.
meta_name: String,
}
/// Meta plugin that reports whether the captured data looks binary or textual.
///
/// The verdict is computed by `BinaryMetaPlugin::is_binary` over `buffer`.
#[derive(Debug, Clone)]
pub struct BinaryMetaPlugin {
    /// Name returned by `meta_name()` (`"binary"`).
    meta_name: String,
    /// Sample of the data that the detection heuristics inspect.
    buffer: Vec<u8>,
    /// Maximum number of bytes `buffer` is meant to hold.
    max_buffer_size: usize,
}

impl Default for BinaryMetaPlugin {
    /// Equivalent to [`BinaryMetaPlugin::new`].
    ///
    /// A derived `Default` would set `max_buffer_size` to 0 (so nothing could ever be
    /// sampled) and leave `meta_name` empty, which is why this is written by hand.
    fn default() -> Self {
        Self::new()
    }
}

impl BinaryMetaPlugin {
    /// Number of leading bytes sampled for detection (4 KiB).
    const SAMPLE_SIZE: usize = 4096;

    /// Data whose printable fraction is below this value is reported as binary.
    const MIN_PRINTABLE_RATIO: f64 = 0.7;

    /// Size of a TAR header block.
    const TAR_HEADER_LEN: usize = 512;

    /// Creates the plugin with an empty sample buffer limited to 4 KiB.
    pub fn new() -> BinaryMetaPlugin {
        BinaryMetaPlugin {
            meta_name: "binary".to_string(),
            buffer: Vec::new(),
            max_buffer_size: Self::SAMPLE_SIZE,
        }
    }

    /// Detect if data is binary or text.
    ///
    /// Returns `true` if `data` is likely binary and `false` if it is likely text.
    /// Empty input is reported as text.
    ///
    /// Checks run from most to least specific:
    /// 1. known binary magic numbers,
    /// 2. Unicode byte-order marks (text),
    /// 3. a TAR header (binary) — checked before any text heuristic because TAR
    ///    headers consist of ASCII and NUL bytes and are therefore valid UTF-8,
    /// 4. BOM-less UTF-16 carrying ASCII/Latin-1 text (text),
    /// 5. the fraction of printable characters (UTF-8 aware), and
    /// 6. for anything that is not UTF-8, the fraction of printable ASCII bytes.
    fn is_binary(data: &[u8]) -> bool {
        if data.is_empty() {
            return false;
        }
        if Self::has_binary_signature(data) {
            return true;
        }

        let has_utf16_bom = data.starts_with(&[0xFF, 0xFE]) || data.starts_with(&[0xFE, 0xFF]);
        let has_utf8_bom = data.starts_with(&[0xEF, 0xBB, 0xBF]);
        if has_utf16_bom || has_utf8_bom {
            return false;
        }

        if Self::looks_like_tar(data) {
            return true;
        }
        if Self::looks_like_utf16(data) {
            return false;
        }

        let printable_ratio = match Self::decode_utf8_prefix(data) {
            Some((text, dangling_tail)) => {
                // Count characters rather than bytes so that non-ASCII text (Cyrillic,
                // CJK, accented Latin, ...) is not mistaken for binary. A character cut
                // off at the end of the sample is counted as one non-printable unit;
                // the total is never zero because `data` is non-empty.
                let total = text.chars().count() + usize::from(dangling_tail);
                let printable = text.chars().filter(|&c| Self::is_printable_char(c)).count();
                printable as f64 / total as f64
            }
            None => Self::calculate_printable_ratio(data),
        };
        printable_ratio < Self::MIN_PRINTABLE_RATIO
    }

    /// Decodes `data` as UTF-8, tolerating one multi-byte character that is cut off
    /// at the very end (which happens when a stream is sampled at a fixed byte count).
    ///
    /// Returns the decoded text and whether an incomplete trailing character was
    /// dropped, or `None` when `data` contains a genuinely invalid sequence.
    fn decode_utf8_prefix(data: &[u8]) -> Option<(&str, bool)> {
        match std::str::from_utf8(data) {
            Ok(text) => Some((text, false)),
            // `error_len() == None` means the input ended unexpectedly in the middle
            // of a character; everything before `valid_up_to()` is valid UTF-8.
            Err(err) if err.error_len().is_none() => std::str::from_utf8(&data[..err.valid_up_to()])
                .ok()
                .map(|text| (text, true)),
            Err(_) => None,
        }
    }

    /// Check for known binary file signatures.
    ///
    /// NOTE(review): some signatures are short, printable ASCII (`BM`, `MZ`, `ID3`,
    /// `RIFF`, ...), so text that happens to start with those letters is reported as
    /// binary. That trade-off is inherited unchanged.
    fn has_binary_signature(data: &[u8]) -> bool {
        // Magic numbers expected at offset 0. `starts_with` already fails for inputs
        // shorter than the signature, so no separate minimum length is needed.
        const SIGNATURES: &[&[u8]] = &[
            // Image formats
            &[0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A], // PNG
            &[0xFF, 0xD8, 0xFF],                               // JPEG (various subtypes)
            &[0x47, 0x49, 0x46, 0x38, 0x37, 0x61],             // GIF87a
            &[0x47, 0x49, 0x46, 0x38, 0x39, 0x61],             // GIF89a
            &[0x42, 0x4D],                                     // BMP
            &[0x00, 0x00, 0x01, 0x00],                         // ICO
            &[0x49, 0x49, 0x2A, 0x00],                         // TIFF (little endian)
            &[0x4D, 0x4D, 0x00, 0x2A],                         // TIFF (big endian)
            &[0x52, 0x49, 0x46, 0x46],                         // RIFF container: WebP, WAV, AVI
            &[0x00, 0x00, 0x00, 0x0C, 0x6A, 0x50, 0x20, 0x20], // JPEG 2000
            // Audio/Video formats
            &[0x49, 0x44, 0x33],       // MP3 with ID3v2
            &[0xFF, 0xFB],             // MP3
            &[0xFF, 0xF3],             // MP3
            &[0xFF, 0xF2],             // MP3
            &[0x4F, 0x67, 0x67, 0x53], // OGG
            &[0x46, 0x4C, 0x56],       // FLV
            &[0x1A, 0x45, 0xDF, 0xA3], // MKV/WebM
            // Archive formats
            &[0x50, 0x4B, 0x03, 0x04],                         // ZIP (also Office Open XML)
            &[0x50, 0x4B, 0x05, 0x06],                         // ZIP (empty)
            &[0x50, 0x4B, 0x07, 0x08],                         // ZIP (spanned)
            &[0x52, 0x61, 0x72, 0x21, 0x1A, 0x07, 0x00],       // RAR v1.5+
            &[0x52, 0x61, 0x72, 0x21, 0x1A, 0x07, 0x01, 0x00], // RAR v5.0+
            &[0x1F, 0x8B],                                     // GZIP
            &[0x42, 0x5A, 0x68],                               // BZIP2
            &[0xFD, 0x37, 0x7A, 0x58, 0x5A, 0x00],             // XZ
            &[0x28, 0xB5, 0x2F, 0xFD],                         // Zstandard
            &[0x04, 0x22, 0x4D, 0x18],                         // LZ4
            &[0x1F, 0x9D],                                     // LZW compressed
            &[0x1F, 0xA0],                                     // LZH compressed
            &[0x37, 0x7A, 0xBC, 0xAF, 0x27, 0x1C],             // 7-Zip
            // Document formats
            &[0x25, 0x50, 0x44, 0x46],                         // PDF
            &[0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1], // MS Office (OLE)
            &[0x7B, 0x5C, 0x72, 0x74, 0x66],                   // RTF
            // Executables and object files
            &[0x7F, 0x45, 0x4C, 0x46], // ELF
            &[0x4D, 0x5A],             // Windows PE/DOS
            &[0xCA, 0xFE, 0xBA, 0xBE], // Mach-O universal binary / Java class file
            &[0xFE, 0xED, 0xFA, 0xCE], // Mach-O 32-bit (big-endian byte order)
            &[0xFE, 0xED, 0xFA, 0xCF], // Mach-O 64-bit (big-endian byte order)
            &[0xCE, 0xFA, 0xED, 0xFE], // Mach-O 32-bit (little-endian byte order)
            &[0xCF, 0xFA, 0xED, 0xFE], // Mach-O 64-bit (little-endian byte order)
            &[0xDE, 0xC0, 0x17, 0x0B], // LLVM bitcode wrapper (0x0B17C0DE)
            // Database formats
            b"SQLite format 3\0",
            // Also a prefix of the TrueType signature 00 01 00 00 00.
            &[0x00, 0x01, 0x00, 0x00], // Palm Database / TrueType
            // Font formats
            &[0x4F, 0x54, 0x54, 0x4F], // OpenType
            &[0x77, 0x4F, 0x46, 0x46], // WOFF
            &[0x77, 0x4F, 0x46, 0x32], // WOFF2
            // Virtual machine formats
            &[0x76, 0x6D, 0x64, 0x6B], // VMDK
            b"<<< Oracle VM VirtualBox Disk Image >>>", // VirtualBox VDI
            // Disk image formats
            &[0xEB, 0x3C, 0x90], // FAT12/16/32
            &[0xEB, 0x58, 0x90], // FAT32
            // Other binary formats
            &[0x21, 0x3C, 0x61, 0x72, 0x63, 0x68, 0x3E, 0x0A], // AR archive ("!<arch>\n")
            &[0x78, 0x01],                                     // zlib (no/low compression)
            &[0x78, 0x9C],                                     // zlib (default compression)
            &[0x78, 0xDA],                                     // zlib (best compression)
            &[0x62, 0x76, 0x78, 0x32],                         // LZFSE
        ];

        if SIGNATURES.iter().any(|signature| data.starts_with(signature)) {
            return true;
        }
        // MP4/M4A/MOV: the "ftyp" box type sits at offset 4, after the box size.
        if data.get(4..8) == Some(&b"ftyp"[..]) {
            return true;
        }
        // Boot sector: the 55 AA marker occupies the last two bytes of the first
        // 512-byte sector (offset 510), not the start of the data.
        data.get(510..512) == Some(&[0x55, 0xAA][..])
    }

    /// Check if data looks like UTF-16 without BOM.
    ///
    /// A 16-bit unit counts as a hit when one byte is zero and the other is a
    /// printable ASCII or Latin-1 byte; both byte orders are tried. The data is
    /// accepted when more than half of its units are hits for one byte order.
    /// Requiring a printable partner byte keeps zero-filled binary data from
    /// passing as text.
    fn looks_like_utf16(data: &[u8]) -> bool {
        if data.len() < 4 || data.len() % 2 != 0 {
            return false;
        }
        let units = data.len() / 2;
        let mut little_endian_hits = 0usize;
        let mut big_endian_hits = 0usize;
        for unit in data.chunks_exact(2) {
            let (first, second) = (unit[0], unit[1]);
            if second == 0 && Self::is_utf16_text_byte(first) {
                little_endian_hits += 1;
            }
            if first == 0 && Self::is_utf16_text_byte(second) {
                big_endian_hits += 1;
            }
        }
        // Integer form of `hits / units > 0.5`.
        little_endian_hits.max(big_endian_hits) * 2 > units
    }

    /// Check if data looks like a TAR archive.
    ///
    /// Requires a full 512-byte header whose name is non-empty and whose mode and
    /// checksum fields contain only octal digits, spaces or NULs. On top of that the
    /// header must either carry the POSIX `ustar` magic or have a checksum field
    /// that matches the header contents (which also recognises pre-POSIX archives).
    fn looks_like_tar(data: &[u8]) -> bool {
        if data.len() < Self::TAR_HEADER_LEN {
            return false;
        }
        let header = &data[..Self::TAR_HEADER_LEN];

        // The member name must not start with NUL.
        if header[0] == 0 {
            return false;
        }

        let octal_or_padding = |b: &u8| matches!(*b, 0 | b' ' | b'0'..=b'7');
        let fields_plausible = header[100..108]
            .iter()
            .chain(header[148..156].iter())
            .all(octal_or_padding);
        if !fields_plausible {
            return false;
        }

        let magic = &header[257..262];
        if magic == b"ustar" {
            return true;
        }
        Self::tar_checksum_matches(header)
    }

    /// Validates the checksum stored in a 512-byte TAR header.
    ///
    /// The checksum is the sum of all header bytes, with the checksum field itself
    /// (bytes 148..156) counted as ASCII spaces, stored as an octal number.
    /// `header` must be at least 156 bytes long; shorter input panics on indexing.
    fn tar_checksum_matches(header: &[u8]) -> bool {
        let stored = header[148..156]
            .iter()
            .skip_while(|&&b| b == b' ')
            .take_while(|&&b| (b'0'..=b'7').contains(&b))
            .fold(None, |acc: Option<u32>, &digit| {
                Some(acc.unwrap_or(0) * 8 + u32::from(digit - b'0'))
            });
        let computed: u32 = header
            .iter()
            .enumerate()
            .map(|(index, &byte)| {
                if (148..156).contains(&index) {
                    u32::from(b' ')
                } else {
                    u32::from(byte)
                }
            })
            .sum();
        stored == Some(computed)
    }

    /// Calculate the ratio of printable ASCII bytes in the data.
    ///
    /// Returns `1.0` for empty input so callers never see a NaN.
    fn calculate_printable_ratio(data: &[u8]) -> f64 {
        if data.is_empty() {
            return 1.0;
        }
        let printable_count = data.iter().filter(|&&b| Self::is_text_byte(b)).count();
        printable_count as f64 / data.len() as f64
    }

    /// Printable ASCII: graphic characters plus ASCII whitespace.
    fn is_text_byte(byte: u8) -> bool {
        byte.is_ascii_graphic() || byte.is_ascii_whitespace()
    }

    /// Low byte of a UTF-16 unit in the ASCII/Latin-1 range that represents
    /// printable text (printable ASCII, or U+00A0 and above).
    fn is_utf16_text_byte(byte: u8) -> bool {
        Self::is_text_byte(byte) || byte >= 0xA0
    }

    /// Printable character: anything that is not a control character, plus the
    /// whitespace controls tab, line feed, form feed and carriage return. For ASCII
    /// this is exactly the set accepted by `is_text_byte`.
    fn is_printable_char(ch: char) -> bool {
        !ch.is_control() || matches!(ch, '\t' | '\n' | '\r' | '\x0C')
    }
}
impl MetaPlugin for BinaryMetaPlugin {
    /// Name under which the result is reported.
    fn meta_name(&mut self) -> String {
        self.meta_name.clone()
    }

    /// This plugin is internal.
    fn is_internal(&self) -> bool {
        true
    }

    /// Returns a writer that discards everything written to it.
    fn create(&self) -> Result<Box<dyn Write>> {
        let discard: Box<dyn Write> = Box::new(io::sink());
        Ok(discard)
    }

    /// Appends `data` to the sample buffer until `max_buffer_size` bytes are held;
    /// anything beyond that limit is ignored.
    fn update(&mut self, data: &[u8]) {
        let room = self.max_buffer_size.saturating_sub(self.buffer.len());
        if room == 0 {
            return;
        }
        let take = data.len().min(room);
        self.buffer.extend_from_slice(&data[..take]);
    }

    /// Runs the binary detection over the sampled bytes and returns `"true"` or
    /// `"false"`.
    fn finalize(&mut self) -> io::Result<String> {
        let verdict = if Self::is_binary(&self.buffer) {
            "true"
        } else {
            "false"
        };
        Ok(verdict.to_string())
    }
}
impl CwdMetaPlugin {
    /// Creates the plugin with its metadata name set to `"cwd"`.
    pub fn new() -> CwdMetaPlugin {
        let meta_name = String::from("cwd");
        CwdMetaPlugin { meta_name }
    }
}
impl MetaPlugin for CwdMetaPlugin {
    /// Name under which the result is reported.
    fn meta_name(&mut self) -> String {
        self.meta_name.clone()
    }

    /// This plugin is internal.
    fn is_internal(&self) -> bool {
        true
    }

    /// Returns a writer that discards everything written to it.
    fn create(&self) -> Result<Box<dyn Write>> {
        let discard: Box<dyn Write> = Box::new(io::sink());
        Ok(discard)
    }

    /// The streamed data is irrelevant to this plugin.
    fn update(&mut self, _data: &[u8]) {}

    /// Returns the current working directory (lossily converted to UTF-8), or
    /// `"unknown"` when it cannot be determined.
    fn finalize(&mut self) -> io::Result<String> {
        let cwd = env::current_dir()
            .map(|dir| dir.to_string_lossy().to_string())
            .unwrap_or_else(|_| "unknown".to_string());
        Ok(cwd)
    }
}
/// Meta plugin registered under the `"uid"` metadata name.
#[derive(Debug, Clone)]
pub struct UidMetaPlugin {
    /// Name returned by `meta_name()`.
    meta_name: String,
}

impl UidMetaPlugin {
    /// Creates the plugin with its metadata name set to `"uid"`.
    pub fn new() -> UidMetaPlugin {
        UidMetaPlugin {
            meta_name: "uid".to_string(),
        }
    }
}

impl Default for UidMetaPlugin {
    /// Equivalent to [`UidMetaPlugin::new`]; a derived `Default` would leave the
    /// metadata name empty.
    fn default() -> Self {
        Self::new()
    }
}
impl MetaPlugin for UidMetaPlugin {
    /// Name under which the result is reported.
    fn meta_name(&mut self) -> String {
        self.meta_name.clone()
    }

    /// This plugin is internal.
    fn is_internal(&self) -> bool {
        true
    }

    /// Returns a writer that discards everything written to it.
    fn create(&self) -> Result<Box<dyn Write>> {
        let discard: Box<dyn Write> = Box::new(io::sink());
        Ok(discard)
    }

    /// The streamed data is irrelevant to this plugin.
    fn update(&mut self, _data: &[u8]) {}

    /// Returns the current user id in decimal.
    fn finalize(&mut self) -> io::Result<String> {
        let uid = get_current_uid();
        Ok(uid.to_string())
    }
}
/// Meta plugin registered under the `"user"` metadata name.
#[derive(Debug, Clone)]
pub struct UserMetaPlugin {
    /// Name returned by `meta_name()`.
    meta_name: String,
}

impl UserMetaPlugin {
    /// Creates the plugin with its metadata name set to `"user"`.
    pub fn new() -> UserMetaPlugin {
        UserMetaPlugin {
            meta_name: "user".to_string(),
        }
    }
}

impl Default for UserMetaPlugin {
    /// Equivalent to [`UserMetaPlugin::new`]; a derived `Default` would leave the
    /// metadata name empty.
    fn default() -> Self {
        Self::new()
    }
}
impl MetaPlugin for UserMetaPlugin {
    /// Name under which the result is reported.
    fn meta_name(&mut self) -> String {
        self.meta_name.clone()
    }

    /// This plugin is internal.
    fn is_internal(&self) -> bool {
        true
    }

    /// Returns a writer that discards everything written to it.
    fn create(&self) -> Result<Box<dyn Write>> {
        let discard: Box<dyn Write> = Box::new(io::sink());
        Ok(discard)
    }

    /// The streamed data is irrelevant to this plugin.
    fn update(&mut self, _data: &[u8]) {}

    /// Returns the current user name (lossily converted to UTF-8), or `"unknown"`
    /// when no name is available.
    fn finalize(&mut self) -> io::Result<String> {
        let user = get_current_username().map_or_else(
            || "unknown".to_string(),
            |name| name.to_string_lossy().to_string(),
        );
        Ok(user)
    }
}
/// Meta plugin registered under the `"gid"` metadata name.
#[derive(Debug, Clone)]
pub struct GidMetaPlugin {
    /// Name returned by `meta_name()`.
    meta_name: String,
}

impl GidMetaPlugin {
    /// Creates the plugin with its metadata name set to `"gid"`.
    pub fn new() -> GidMetaPlugin {
        GidMetaPlugin {
            meta_name: "gid".to_string(),
        }
    }
}

impl Default for GidMetaPlugin {
    /// Equivalent to [`GidMetaPlugin::new`]; a derived `Default` would leave the
    /// metadata name empty.
    fn default() -> Self {
        Self::new()
    }
}
impl MetaPlugin for GidMetaPlugin {
    /// Name under which the result is reported.
    fn meta_name(&mut self) -> String {
        self.meta_name.clone()
    }

    /// This plugin is internal.
    fn is_internal(&self) -> bool {
        true
    }

    /// Returns a writer that discards everything written to it.
    fn create(&self) -> Result<Box<dyn Write>> {
        let discard: Box<dyn Write> = Box::new(io::sink());
        Ok(discard)
    }

    /// The streamed data is irrelevant to this plugin.
    fn update(&mut self, _data: &[u8]) {}

    /// Returns the current group id in decimal.
    fn finalize(&mut self) -> io::Result<String> {
        let gid = get_current_gid();
        Ok(gid.to_string())
    }
}
/// Meta plugin registered under the `"group"` metadata name.
#[derive(Debug, Clone)]
pub struct GroupMetaPlugin {
    /// Name returned by `meta_name()`.
    meta_name: String,
}

impl GroupMetaPlugin {
    /// Creates the plugin with its metadata name set to `"group"`.
    pub fn new() -> GroupMetaPlugin {
        GroupMetaPlugin {
            meta_name: "group".to_string(),
        }
    }
}

impl Default for GroupMetaPlugin {
    /// Equivalent to [`GroupMetaPlugin::new`]; a derived `Default` would leave the
    /// metadata name empty.
    fn default() -> Self {
        Self::new()
    }
}
impl MetaPlugin for GroupMetaPlugin {
    /// Name under which the result is reported.
    fn meta_name(&mut self) -> String {
        self.meta_name.clone()
    }

    /// This plugin is internal.
    fn is_internal(&self) -> bool {
        true
    }

    /// Returns a writer that discards everything written to it.
    fn create(&self) -> Result<Box<dyn Write>> {
        let discard: Box<dyn Write> = Box::new(io::sink());
        Ok(discard)
    }

    /// The streamed data is irrelevant to this plugin.
    fn update(&mut self, _data: &[u8]) {}

    /// Returns the current group name (lossily converted to UTF-8), or `"unknown"`
    /// when no name is available.
    fn finalize(&mut self) -> io::Result<String> {
        let group = get_current_groupname().map_or_else(
            || "unknown".to_string(),
            |name| name.to_string_lossy().to_string(),
        );
        Ok(group)
    }
}
/// Meta plugin registered under the `"shell"` metadata name.
#[derive(Debug, Clone)]
pub struct ShellMetaPlugin {
    /// Name returned by `meta_name()`.
    meta_name: String,
}

impl ShellMetaPlugin {
    /// Creates the plugin with its metadata name set to `"shell"`.
    pub fn new() -> ShellMetaPlugin {
        ShellMetaPlugin {
            meta_name: "shell".to_string(),
        }
    }
}

impl Default for ShellMetaPlugin {
    /// Equivalent to [`ShellMetaPlugin::new`]; a derived `Default` would leave the
    /// metadata name empty.
    fn default() -> Self {
        Self::new()
    }
}
impl MetaPlugin for ShellMetaPlugin {
    /// Name under which the result is reported.
    fn meta_name(&mut self) -> String {
        self.meta_name.clone()
    }

    /// This plugin is internal.
    fn is_internal(&self) -> bool {
        true
    }

    /// Returns a writer that discards everything written to it.
    fn create(&self) -> Result<Box<dyn Write>> {
        let discard: Box<dyn Write> = Box::new(io::sink());
        Ok(discard)
    }

    /// The streamed data is irrelevant to this plugin.
    fn update(&mut self, _data: &[u8]) {}

    /// Returns the value of the `SHELL` environment variable, or `"unknown"` when
    /// it is unset or not valid Unicode.
    fn finalize(&mut self) -> io::Result<String> {
        let shell = env::var("SHELL").unwrap_or_else(|_| "unknown".to_string());
        Ok(shell)
    }
}
/// Meta plugin registered under the `"shell_pid"` metadata name.
#[derive(Debug, Clone)]
pub struct ShellPidMetaPlugin {
    /// Name returned by `meta_name()`.
    meta_name: String,
}

impl ShellPidMetaPlugin {
    /// Creates the plugin with its metadata name set to `"shell_pid"`.
    pub fn new() -> ShellPidMetaPlugin {
        ShellPidMetaPlugin {
            meta_name: "shell_pid".to_string(),
        }
    }
}

impl Default for ShellPidMetaPlugin {
    /// Equivalent to [`ShellPidMetaPlugin::new`]; a derived `Default` would leave
    /// the metadata name empty.
    fn default() -> Self {
        Self::new()
    }
}
impl MetaPlugin for ShellPidMetaPlugin {
fn is_internal(&self) -> bool {
true
}
fn create(&self) -> Result<Box<dyn Write>> {
Ok(Box::new(io::sink()))
}
fn finalize(&mut self) -> io::Result<String> {
match env::var("PPID") {
Ok(ppid) => Ok(ppid),
Err(_) => Ok(process::id().to_string()),
}
}
fn update(&mut self, _data: &[u8]) {
// No update needed
}
fn meta_name(&mut self) -> String {
self.meta_name.clone()
}
}
/// Meta plugin registered under the `"keep_pid"` metadata name.
#[derive(Debug, Clone)]
pub struct KeepPidMetaPlugin {
    /// Name returned by `meta_name()`.
    meta_name: String,
}

impl KeepPidMetaPlugin {
    /// Creates the plugin with its metadata name set to `"keep_pid"`.
    pub fn new() -> KeepPidMetaPlugin {
        KeepPidMetaPlugin {
            meta_name: "keep_pid".to_string(),
        }
    }
}

impl Default for KeepPidMetaPlugin {
    /// Equivalent to [`KeepPidMetaPlugin::new`]; a derived `Default` would leave
    /// the metadata name empty.
    fn default() -> Self {
        Self::new()
    }
}
impl MetaPlugin for KeepPidMetaPlugin {
    /// Name under which the result is reported.
    fn meta_name(&mut self) -> String {
        self.meta_name.clone()
    }

    /// This plugin is internal.
    fn is_internal(&self) -> bool {
        true
    }

    /// Returns a writer that discards everything written to it.
    fn create(&self) -> Result<Box<dyn Write>> {
        let discard: Box<dyn Write> = Box::new(io::sink());
        Ok(discard)
    }

    /// The streamed data is irrelevant to this plugin.
    fn update(&mut self, _data: &[u8]) {}

    /// Returns the id of the current process in decimal.
    fn finalize(&mut self) -> io::Result<String> {
        let pid = process::id();
        Ok(pid.to_string())
    }
}
/// Meta plugin registered under the `"hostname"` metadata name.
#[derive(Debug, Clone)]
pub struct HostnameMetaPlugin {
    /// Name returned by `meta_name()`.
    meta_name: String,
}

impl HostnameMetaPlugin {
    /// Creates the plugin with its metadata name set to `"hostname"`.
    pub fn new() -> HostnameMetaPlugin {
        HostnameMetaPlugin {
            meta_name: "hostname".to_string(),
        }
    }
}

impl Default for HostnameMetaPlugin {
    /// Equivalent to [`HostnameMetaPlugin::new`]; a derived `Default` would leave
    /// the metadata name empty.
    fn default() -> Self {
        Self::new()
    }
}
impl MetaPlugin for HostnameMetaPlugin {
    /// Name under which the result is reported.
    fn meta_name(&mut self) -> String {
        self.meta_name.clone()
    }

    /// This plugin is internal.
    fn is_internal(&self) -> bool {
        true
    }

    /// Returns a writer that discards everything written to it.
    fn create(&self) -> Result<Box<dyn Write>> {
        let discard: Box<dyn Write> = Box::new(io::sink());
        Ok(discard)
    }

    /// The streamed data is irrelevant to this plugin.
    fn update(&mut self, _data: &[u8]) {}

    /// Returns the host name, or `"unknown"` when it is not valid Unicode.
    fn finalize(&mut self) -> io::Result<String> {
        let host = gethostname()
            .into_string()
            .unwrap_or_else(|_| "unknown".to_string());
        Ok(host)
    }
}
/// Meta plugin registered under the `"full_hostname"` metadata name.
#[derive(Debug, Clone)]
pub struct FullHostnameMetaPlugin {
    /// Name returned by `meta_name()`.
    meta_name: String,
}

impl FullHostnameMetaPlugin {
    /// Creates the plugin with its metadata name set to `"full_hostname"`.
    pub fn new() -> FullHostnameMetaPlugin {
        FullHostnameMetaPlugin {
            meta_name: "full_hostname".to_string(),
        }
    }
}

impl Default for FullHostnameMetaPlugin {
    /// Equivalent to [`FullHostnameMetaPlugin::new`]; a derived `Default` would
    /// leave the metadata name empty.
    fn default() -> Self {
        Self::new()
    }
}
impl MetaPlugin for FullHostnameMetaPlugin {
    /// Name under which the result is reported.
    fn meta_name(&mut self) -> String {
        self.meta_name.clone()
    }

    /// This plugin is internal.
    fn is_internal(&self) -> bool {
        true
    }

    /// Returns a writer that discards everything written to it.
    fn create(&self) -> Result<Box<dyn Write>> {
        let discard: Box<dyn Write> = Box::new(io::sink());
        Ok(discard)
    }

    /// The streamed data is irrelevant to this plugin.
    fn update(&mut self, _data: &[u8]) {}

    /// Returns the name obtained by a reverse DNS lookup of the local IP address.
    ///
    /// When the local address cannot be determined or the lookup fails, the plain
    /// host name is returned instead, and `"unknown"` if that is not valid Unicode.
    fn finalize(&mut self) -> io::Result<String> {
        let reverse_dns = local_ip().ok().and_then(|ip| lookup_addr(&ip).ok());
        let name = match reverse_dns {
            Some(fqdn) => fqdn,
            None => gethostname()
                .into_string()
                .unwrap_or_else(|_| "unknown".to_string()),
        };
        Ok(name)
    }
}