feat: add text meta plugin

Co-authored-by: aider (openai/andrew/openrouter/qwen/qwen3-coder) <aider@aider.chat>
This commit is contained in:
Andrew Phillips
2025-08-26 19:05:40 -03:00
parent 06e7e1a616
commit 80c6573e71
2 changed files with 284 additions and 0 deletions

View File

@@ -6,12 +6,14 @@ pub mod digest;
pub mod system;
pub mod magic;
pub mod binary;
pub mod text;
use crate::meta_plugin::program::MetaPluginProgram;
use crate::meta_plugin::digest::{DigestSha256MetaPlugin, ReadTimeMetaPlugin, ReadRateMetaPlugin};
use crate::meta_plugin::system::{CwdMetaPlugin, UserMetaPlugin, ShellMetaPlugin, ShellPidMetaPlugin, KeepPidMetaPlugin, HostnameMetaPlugin};
use crate::meta_plugin::magic::MagicFileMetaPlugin;
use crate::meta_plugin::binary::BinaryMetaPlugin;
use crate::meta_plugin::text::TextMetaPlugin;
/// Represents metadata to be stored
#[derive(Debug, Clone, Serialize, Deserialize)]
@@ -112,6 +114,7 @@ pub enum MetaPluginType {
WordCount,
Cwd,
Binary,
Text,
User,
Shell,
ShellPid,
@@ -274,6 +277,7 @@ pub fn get_meta_plugin(meta_plugin_type: MetaPluginType) -> Box<dyn MetaPlugin>
MetaPluginType::WordCount => Box::new(MetaPluginProgram::new_simple("wc", vec!["-w"], "word_count".to_string(), true)),
MetaPluginType::Cwd => Box::new(CwdMetaPlugin::new_simple()),
MetaPluginType::Binary => Box::new(BinaryMetaPlugin::new_simple()),
MetaPluginType::Text => Box::new(TextMetaPlugin::new_simple()),
MetaPluginType::User => Box::new(UserMetaPlugin::new_simple()),
MetaPluginType::Shell => Box::new(ShellMetaPlugin::new_simple()),
MetaPluginType::ShellPid => Box::new(ShellPidMetaPlugin::new_simple()),

View File

@@ -0,0 +1,280 @@
use crate::common::is_binary::is_binary;
use crate::common::PIPESIZE;
use crate::meta_plugin::{MetaPlugin, MetaPluginResponse};
/// Meta plugin that classifies streamed content as text or binary and, for
/// text, reports word and line counts over the bytes it has buffered.
#[derive(Debug, Clone, Default)]
pub struct TextMetaPlugin {
/// Bytes collected from `update` calls; never grows past `max_buffer_size`.
buffer: Vec<u8>,
/// Upper bound on `buffer.len()`; taken from the `max_buffer_size` option.
max_buffer_size: usize,
/// Set once the plugin has produced its final metadata.
is_finalized: bool,
/// Running count of whitespace-separated words.
word_count: usize,
/// Running count of `\n` bytes.
line_count: usize,
/// `None` until `is_binary` has been run on the buffer, then its result.
is_binary_content: Option<bool>,
/// Shared plugin state (meta name, options and outputs).
base: crate::meta_plugin::BaseMetaPlugin,
}
impl TextMetaPlugin {
    /// Creates a text plugin from optional `options` and `outputs` maps.
    ///
    /// The output names `text`, `binary`, `text_word_count` and
    /// `text_line_count` are handed to `BaseMetaPlugin::initialize_plugin`
    /// together with the caller-supplied maps. The `max_buffer_size` option
    /// (an unsigned integer) caps how many bytes are buffered; when it is
    /// absent or not an unsigned integer, `PIPESIZE` is used.
    pub fn new(
        options: Option<std::collections::HashMap<String, serde_yaml::Value>>,
        outputs: Option<std::collections::HashMap<String, serde_yaml::Value>>,
    ) -> TextMetaPlugin {
        let mut base = crate::meta_plugin::BaseMetaPlugin::new();
        base.meta_name = "text".to_string();

        // Initialize with helper function
        base.initialize_plugin(
            &["text", "binary", "text_word_count", "text_line_count"],
            options,
            outputs,
        );

        let requested_size = base
            .options
            .get("max_buffer_size")
            .and_then(|v| v.as_u64())
            .unwrap_or(PIPESIZE as u64);
        // Saturate instead of silently truncating when `usize` is narrower than `u64`.
        let max_buffer_size = usize::try_from(requested_size).unwrap_or(usize::MAX);

        TextMetaPlugin {
            buffer: Vec::new(),
            max_buffer_size,
            is_finalized: false,
            word_count: 0,
            line_count: 0,
            is_binary_content: None,
            base,
        }
    }

    /// Creates a text plugin with no explicit options or outputs.
    pub fn new_simple() -> TextMetaPlugin {
        Self::new(None, None)
    }

    /// Adds the line and word counts of `data` to the running totals.
    ///
    /// Lines are counted as `\n` bytes. Words are whitespace-separated
    /// sequences after lossy UTF-8 decoding, so an invalid byte or a
    /// multi-byte character cut off at the end of `data` no longer discards
    /// the word count for the whole slice.
    fn count_text_stats(&mut self, data: &[u8]) {
        self.line_count += data.iter().filter(|&&b| b == b'\n').count();

        // `from_utf8_lossy` borrows when `data` is valid UTF-8 (no allocation)
        // and substitutes U+FFFD, which is not whitespace, for invalid sequences.
        let text = String::from_utf8_lossy(data);
        self.word_count += text.split_whitespace().count();
    }
}
impl MetaPlugin for TextMetaPlugin {
    /// Returns whether the plugin has already produced its final metadata.
    fn is_finalized(&self) -> bool {
        self.is_finalized
    }

    /// Overrides the finalized flag.
    fn set_finalized(&mut self, finalized: bool) {
        self.is_finalized = finalized;
    }

    /// Buffers up to `max_buffer_size` bytes of `data` and emits metadata as
    /// soon as it can be decided.
    ///
    /// The `text` / `binary` outputs are emitted once the buffer holds at
    /// least `min(1024, max_buffer_size)` bytes. Binary content finalizes the
    /// plugin immediately. For text, `text_word_count` and `text_line_count`
    /// are emitted when the buffer is full, which also finalizes the plugin.
    fn update(&mut self, data: &[u8]) -> MetaPluginResponse {
        // If already finalized, don't process more data
        if self.is_finalized {
            return MetaPluginResponse {
                metadata: Vec::new(),
                is_finalized: true,
            };
        }

        // Binary content that was un-finalized via `set_finalized(false)`:
        // nothing more to compute, but leave finalization to the caller.
        if self.is_binary_content == Some(true) {
            return MetaPluginResponse {
                metadata: Vec::new(),
                is_finalized: false,
            };
        }

        let mut metadata = Vec::new();

        // Accept only as many bytes as still fit in the buffer.
        let remaining_capacity = self.max_buffer_size.saturating_sub(self.buffer.len());
        if remaining_capacity > 0 {
            let bytes_to_take = data.len().min(remaining_capacity);
            self.buffer.extend_from_slice(&data[..bytes_to_take]);

            // Decide text vs. binary as soon as enough bytes are available.
            let detection_threshold = self.max_buffer_size.min(1024);
            if self.is_binary_content.is_none() && self.buffer.len() >= detection_threshold {
                let is_binary_result = is_binary(&self.buffer);
                self.is_binary_content = Some(is_binary_result);

                for (name, value) in [
                    ("text", (!is_binary_result).to_string()),
                    ("binary", is_binary_result.to_string()),
                ] {
                    if let Some(meta_data) = crate::meta_plugin::process_metadata_outputs(
                        name,
                        value,
                        self.base.outputs(),
                    ) {
                        metadata.push(meta_data);
                    }
                }

                // If it's binary, we're done with this plugin
                if is_binary_result {
                    self.is_finalized = true;
                    return MetaPluginResponse {
                        metadata,
                        is_finalized: true,
                    };
                }
            }
        }

        // Buffer full: emit the text statistics (if text) and finalize.
        if self.buffer.len() >= self.max_buffer_size && !self.is_finalized {
            if self.is_binary_content == Some(false) {
                // Count over the whole buffer rather than per chunk: this
                // includes bytes buffered before the text/binary decision and
                // does not split a word that straddles two chunks. The buffer
                // is moved out temporarily so it can be read while `self` is
                // mutably borrowed.
                let buffered = std::mem::take(&mut self.buffer);
                self.word_count = 0;
                self.line_count = 0;
                self.count_text_stats(&buffered);
                self.buffer = buffered;

                for (name, value) in [
                    ("text_word_count", self.word_count.to_string()),
                    ("text_line_count", self.line_count.to_string()),
                ] {
                    if let Some(meta_data) = crate::meta_plugin::process_metadata_outputs(
                        name,
                        value,
                        self.base.outputs(),
                    ) {
                        metadata.push(meta_data);
                    }
                }
            }
            self.is_finalized = true;
        }

        MetaPluginResponse {
            metadata,
            is_finalized: self.is_finalized,
        }
    }

    /// Emits whatever metadata has not been produced yet and finalizes.
    ///
    /// If no text/binary decision was made during `update` and some data was
    /// buffered, the decision is made now. For text content with buffered
    /// data, the word and line counts are emitted. An empty buffer yields no
    /// metadata.
    fn finalize(&mut self) -> MetaPluginResponse {
        // If already finalized, don't process again
        if self.is_finalized {
            return MetaPluginResponse {
                metadata: Vec::new(),
                is_finalized: true,
            };
        }

        let mut metadata = Vec::new();

        // If we haven't determined binary status yet, do it now
        if self.is_binary_content.is_none() && !self.buffer.is_empty() {
            let is_binary_result = is_binary(&self.buffer);
            self.is_binary_content = Some(is_binary_result);

            for (name, value) in [
                ("text", (!is_binary_result).to_string()),
                ("binary", is_binary_result.to_string()),
            ] {
                if let Some(meta_data) = crate::meta_plugin::process_metadata_outputs(
                    name,
                    value,
                    self.base.outputs(),
                ) {
                    metadata.push(meta_data);
                }
            }
        }

        // If content is text and we have some data, output word and line counts
        if self.is_binary_content == Some(false) && !self.buffer.is_empty() {
            // Recompute over the whole buffer so the totals cover every
            // buffered byte exactly once. Moving the buffer out avoids
            // borrowing `self.buffer` while `self` is mutably borrowed.
            let buffered = std::mem::take(&mut self.buffer);
            self.word_count = 0;
            self.line_count = 0;
            self.count_text_stats(&buffered);
            self.buffer = buffered;

            for (name, value) in [
                ("text_word_count", self.word_count.to_string()),
                ("text_line_count", self.line_count.to_string()),
            ] {
                if let Some(meta_data) = crate::meta_plugin::process_metadata_outputs(
                    name,
                    value,
                    self.base.outputs(),
                ) {
                    metadata.push(meta_data);
                }
            }
        }

        // Mark as finalized
        self.is_finalized = true;
        MetaPluginResponse {
            metadata,
            is_finalized: true,
        }
    }

    /// Returns the plugin's meta name.
    fn meta_name(&self) -> String {
        self.base.meta_name.clone()
    }

    /// Returns the output mapping held by the base plugin.
    fn outputs(&self) -> &std::collections::HashMap<String, serde_yaml::Value> {
        self.base.outputs()
    }

    /// Returns the output mapping held by the base plugin, mutably.
    fn outputs_mut(&mut self) -> &mut std::collections::HashMap<String, serde_yaml::Value> {
        self.base.outputs_mut()
    }

    /// Names of the outputs this plugin can produce.
    fn default_outputs(&self) -> Vec<String> {
        ["text", "binary", "text_word_count", "text_line_count"]
            .iter()
            .map(|name| name.to_string())
            .collect()
    }

    /// Returns the options held by the base plugin.
    fn options(&self) -> &std::collections::HashMap<String, serde_yaml::Value> {
        self.base.options()
    }

    /// Returns the options held by the base plugin, mutably.
    fn options_mut(&mut self) -> &mut std::collections::HashMap<String, serde_yaml::Value> {
        self.base.options_mut()
    }

    /// Applies `max_buffer_size` from `options` when it is an unsigned integer.
    ///
    /// Other keys, and a `max_buffer_size` of any other type, are ignored.
    /// Always returns `Ok(())`.
    fn configure_options(&mut self, options: &std::collections::HashMap<String, serde_yaml::Value>) -> anyhow::Result<()> {
        if let Some(size) = options.get("max_buffer_size").and_then(|v| v.as_u64()) {
            // Saturate instead of silently truncating when `usize` is narrower than `u64`.
            self.max_buffer_size = usize::try_from(size).unwrap_or(usize::MAX);
        }
        Ok(())
    }
}