From 80c6573e71cc9274da1e3de4c97e12ff0698947b Mon Sep 17 00:00:00 2001 From: Andrew Phillips Date: Tue, 26 Aug 2025 19:05:40 -0300 Subject: [PATCH] feat: add text meta plugin Co-authored-by: aider (openai/andrew/openrouter/qwen/qwen3-coder) --- src/meta_plugin.rs | 4 + src/meta_plugin/text.rs | 280 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 284 insertions(+) diff --git a/src/meta_plugin.rs b/src/meta_plugin.rs index cef8b68..0205201 100644 --- a/src/meta_plugin.rs +++ b/src/meta_plugin.rs @@ -6,12 +6,14 @@ pub mod digest; pub mod system; pub mod magic; pub mod binary; +pub mod text; use crate::meta_plugin::program::MetaPluginProgram; use crate::meta_plugin::digest::{DigestSha256MetaPlugin, ReadTimeMetaPlugin, ReadRateMetaPlugin}; use crate::meta_plugin::system::{CwdMetaPlugin, UserMetaPlugin, ShellMetaPlugin, ShellPidMetaPlugin, KeepPidMetaPlugin, HostnameMetaPlugin}; use crate::meta_plugin::magic::MagicFileMetaPlugin; use crate::meta_plugin::binary::BinaryMetaPlugin; +use crate::meta_plugin::text::TextMetaPlugin; /// Represents metadata to be stored #[derive(Debug, Clone, Serialize, Deserialize)] @@ -112,6 +114,7 @@ pub enum MetaPluginType { WordCount, Cwd, Binary, + Text, User, Shell, ShellPid, @@ -274,6 +277,7 @@ pub fn get_meta_plugin(meta_plugin_type: MetaPluginType) -> Box MetaPluginType::WordCount => Box::new(MetaPluginProgram::new_simple("wc", vec!["-w"], "word_count".to_string(), true)), MetaPluginType::Cwd => Box::new(CwdMetaPlugin::new_simple()), MetaPluginType::Binary => Box::new(BinaryMetaPlugin::new_simple()), + MetaPluginType::Text => Box::new(TextMetaPlugin::new_simple()), MetaPluginType::User => Box::new(UserMetaPlugin::new_simple()), MetaPluginType::Shell => Box::new(ShellMetaPlugin::new_simple()), MetaPluginType::ShellPid => Box::new(ShellPidMetaPlugin::new_simple()), diff --git a/src/meta_plugin/text.rs b/src/meta_plugin/text.rs index e69de29..fdc9b64 100644 --- a/src/meta_plugin/text.rs +++ b/src/meta_plugin/text.rs @@ -0,0 
+1,241 @@
+use crate::common::is_binary::is_binary;
+use crate::common::PIPESIZE;
+use crate::meta_plugin::{MetaPlugin, MetaPluginResponse};
+
+// NOTE(review): this copy of the patch lost the generic arguments of the `Option` and
+// `HashMap` types in the signatures of `new`, `outputs`, `outputs_mut`, `options`,
+// `options_mut` and `configure_options`. Those tokens are kept exactly as found; restore
+// them from the `MetaPlugin` trait and `BaseMetaPlugin` definitions before applying.
+
+/// Meta plugin that classifies a stream as text or binary and, for text, reports word
+/// and line counts.
+///
+/// At most `max_buffer_size` bytes from the start of the stream are buffered, and every
+/// reported value describes that buffered prefix.
+#[derive(Debug, Clone, Default)]
+pub struct TextMetaPlugin {
+    /// Bytes collected so far, never more than `max_buffer_size`.
+    buffer: Vec<u8>,
+    /// Upper bound on the number of bytes kept in `buffer`.
+    max_buffer_size: usize,
+    /// Set once the final metadata has been produced.
+    is_finalized: bool,
+    /// Whitespace-separated words in `buffer`, filled in when the counts are emitted.
+    word_count: usize,
+    /// Newline bytes in `buffer`, filled in when the counts are emitted.
+    line_count: usize,
+    /// `None` until the text/binary decision has been made.
+    is_binary_content: Option<bool>,
+    base: crate::meta_plugin::BaseMetaPlugin,
+}
+
+impl TextMetaPlugin {
+    /// Buffered bytes required before `update` makes the text/binary decision
+    /// (capped by `max_buffer_size`).
+    const DETECTION_THRESHOLD: usize = 1024;
+
+    /// Creates the plugin with the outputs `text`, `binary`, `text_word_count` and
+    /// `text_line_count`; the `max_buffer_size` option defaults to `PIPESIZE`.
+    pub fn new(
+        options: Option>,
+        outputs: Option>,
+    ) -> TextMetaPlugin {
+        let mut base = crate::meta_plugin::BaseMetaPlugin::new();
+        base.meta_name = "text".to_string();
+
+        base.initialize_plugin(
+            &["text", "binary", "text_word_count", "text_line_count"],
+            options,
+            outputs,
+        );
+
+        let max_buffer_size = base.options.get("max_buffer_size")
+            .and_then(|v| v.as_u64())
+            .unwrap_or(PIPESIZE as u64) as usize;
+
+        TextMetaPlugin {
+            buffer: Vec::new(),
+            max_buffer_size,
+            is_finalized: false,
+            word_count: 0,
+            line_count: 0,
+            is_binary_content: None,
+            base,
+        }
+    }
+
+    /// Creates the plugin with default options and outputs.
+    pub fn new_simple() -> TextMetaPlugin {
+        Self::new(None, None)
+    }
+
+    /// Returns `(words, lines)` for `data`.
+    ///
+    /// Lines are `\n` bytes. Words are whitespace-separated runs of the lossily decoded
+    /// text, so an invalid byte sequence (such as a multi-byte character cut off at the
+    /// end of the buffer) is replaced instead of voiding the whole word count.
+    fn text_stats(data: &[u8]) -> (usize, usize) {
+        let lines = data.iter().filter(|&&b| b == b'\n').count();
+        let words = String::from_utf8_lossy(data).split_whitespace().count();
+        (words, lines)
+    }
+
+    /// Recomputes `word_count` and `line_count` from the whole buffer.
+    ///
+    /// Counting the buffer in one pass covers bytes that arrived before the text/binary
+    /// decision and avoids counting a word twice when it straddles two `update` chunks.
+    fn count_text_stats(&mut self) {
+        let (words, lines) = Self::text_stats(&self.buffer);
+        self.word_count = words;
+        self.line_count = lines;
+    }
+
+    /// Runs the binary check on the buffer, records the result and appends the `text`
+    /// and `binary` outputs to `response`. Returns `true` when the content is binary.
+    fn classify(&mut self, response: &mut MetaPluginResponse) -> bool {
+        let binary = is_binary(&self.buffer);
+        self.is_binary_content = Some(binary);
+
+        response.metadata.extend(crate::meta_plugin::process_metadata_outputs(
+            "text",
+            (!binary).to_string(),
+            self.base.outputs(),
+        ));
+        response.metadata.extend(crate::meta_plugin::process_metadata_outputs(
+            "binary",
+            binary.to_string(),
+            self.base.outputs(),
+        ));
+        binary
+    }
+
+    /// Recounts the buffer and appends the `text_word_count` and `text_line_count`
+    /// outputs to `response`.
+    fn emit_counts(&mut self, response: &mut MetaPluginResponse) {
+        self.count_text_stats();
+
+        response.metadata.extend(crate::meta_plugin::process_metadata_outputs(
+            "text_word_count",
+            self.word_count.to_string(),
+            self.base.outputs(),
+        ));
+        response.metadata.extend(crate::meta_plugin::process_metadata_outputs(
+            "text_line_count",
+            self.line_count.to_string(),
+            self.base.outputs(),
+        ));
+    }
+}
+
+impl MetaPlugin for TextMetaPlugin {
+    fn is_finalized(&self) -> bool {
+        self.is_finalized
+    }
+
+    fn set_finalized(&mut self, finalized: bool) {
+        self.is_finalized = finalized;
+    }
+
+    /// Buffers `data` (up to the size limit), classifies the content once enough bytes
+    /// are available and emits the counts when the buffer is full.
+    fn update(&mut self, data: &[u8]) -> MetaPluginResponse {
+        let mut response = MetaPluginResponse {
+            metadata: Vec::new(),
+            is_finalized: self.is_finalized,
+        };
+
+        // Nothing more to do once finalized or once the content is known to be binary.
+        if self.is_finalized || self.is_binary_content == Some(true) {
+            return response;
+        }
+
+        let remaining_capacity = self.max_buffer_size.saturating_sub(self.buffer.len());
+        if remaining_capacity > 0 {
+            let bytes_to_take = data.len().min(remaining_capacity);
+            self.buffer.extend_from_slice(&data[..bytes_to_take]);
+
+            let threshold = Self::DETECTION_THRESHOLD.min(self.max_buffer_size);
+            if self.is_binary_content.is_none()
+                && self.buffer.len() >= threshold
+                && self.classify(&mut response)
+            {
+                // Binary content: no text statistics will follow.
+                self.is_finalized = true;
+                response.is_finalized = true;
+                return response;
+            }
+        }
+
+        // The buffer is full, so this prefix is all that will ever be analysed.
+        if self.buffer.len() >= self.max_buffer_size {
+            if self.is_binary_content == Some(false) {
+                self.emit_counts(&mut response);
+            }
+            self.is_finalized = true;
+        }
+
+        response.is_finalized = self.is_finalized;
+        response
+    }
+
+    /// Classifies whatever was buffered (if not done yet) and emits the counts for text.
+    fn finalize(&mut self) -> MetaPluginResponse {
+        let mut response = MetaPluginResponse {
+            metadata: Vec::new(),
+            is_finalized: true,
+        };
+
+        if self.is_finalized {
+            return response;
+        }
+
+        // An empty stream produces no outputs at all.
+        if !self.buffer.is_empty() {
+            if self.is_binary_content.is_none() {
+                self.classify(&mut response);
+            }
+            if self.is_binary_content == Some(false) {
+                self.emit_counts(&mut response);
+            }
+        }
+
+        self.is_finalized = true;
+        response
+    }
+
+    fn meta_name(&self) -> String {
+        self.base.meta_name.clone()
+    }
+
+    fn outputs(&self) -> &std::collections::HashMap {
+        self.base.outputs()
+    }
+
+    fn outputs_mut(&mut self) -> &mut std::collections::HashMap {
+        self.base.outputs_mut()
+    }
+
+    fn default_outputs(&self) -> Vec<String> {
+        vec![
+            "text".to_string(),
+            "binary".to_string(),
+            "text_word_count".to_string(),
+            "text_line_count".to_string(),
+        ]
+    }
+
+    fn options(&self) -> &std::collections::HashMap {
+        self.base.options()
+    }
+
+    fn options_mut(&mut self) -> &mut std::collections::HashMap {
+        self.base.options_mut()
+    }
+
+    fn configure_options(&mut self, options: &std::collections::HashMap) -> anyhow::Result<()> {
+        if let Some(size) = options.get("max_buffer_size").and_then(|v| v.as_u64()) {
+            self.max_buffer_size = size as usize;
+        }
+        Ok(())
+    }
+}