feat: add text meta plugin
Co-authored-by: aider (openai/andrew/openrouter/qwen/qwen3-coder) <aider@aider.chat>
This commit is contained in:
@@ -6,12 +6,14 @@ pub mod digest;
|
||||
pub mod system;
|
||||
pub mod magic;
|
||||
pub mod binary;
|
||||
pub mod text;
|
||||
|
||||
use crate::meta_plugin::program::MetaPluginProgram;
|
||||
use crate::meta_plugin::digest::{DigestSha256MetaPlugin, ReadTimeMetaPlugin, ReadRateMetaPlugin};
|
||||
use crate::meta_plugin::system::{CwdMetaPlugin, UserMetaPlugin, ShellMetaPlugin, ShellPidMetaPlugin, KeepPidMetaPlugin, HostnameMetaPlugin};
|
||||
use crate::meta_plugin::magic::MagicFileMetaPlugin;
|
||||
use crate::meta_plugin::binary::BinaryMetaPlugin;
|
||||
use crate::meta_plugin::text::TextMetaPlugin;
|
||||
|
||||
/// Represents metadata to be stored
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
@@ -112,6 +114,7 @@ pub enum MetaPluginType {
|
||||
WordCount,
|
||||
Cwd,
|
||||
Binary,
|
||||
Text,
|
||||
User,
|
||||
Shell,
|
||||
ShellPid,
|
||||
@@ -274,6 +277,7 @@ pub fn get_meta_plugin(meta_plugin_type: MetaPluginType) -> Box<dyn MetaPlugin>
|
||||
MetaPluginType::WordCount => Box::new(MetaPluginProgram::new_simple("wc", vec!["-w"], "word_count".to_string(), true)),
|
||||
MetaPluginType::Cwd => Box::new(CwdMetaPlugin::new_simple()),
|
||||
MetaPluginType::Binary => Box::new(BinaryMetaPlugin::new_simple()),
|
||||
MetaPluginType::Text => Box::new(TextMetaPlugin::new_simple()),
|
||||
MetaPluginType::User => Box::new(UserMetaPlugin::new_simple()),
|
||||
MetaPluginType::Shell => Box::new(ShellMetaPlugin::new_simple()),
|
||||
MetaPluginType::ShellPid => Box::new(ShellPidMetaPlugin::new_simple()),
|
||||
|
||||
@@ -0,0 +1,280 @@
|
||||
use crate::common::is_binary::is_binary;
|
||||
use crate::common::PIPESIZE;
|
||||
use crate::meta_plugin::{MetaPlugin, MetaPluginResponse};
|
||||
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub struct TextMetaPlugin {
|
||||
buffer: Vec<u8>,
|
||||
max_buffer_size: usize,
|
||||
is_finalized: bool,
|
||||
word_count: usize,
|
||||
line_count: usize,
|
||||
is_binary_content: Option<bool>,
|
||||
base: crate::meta_plugin::BaseMetaPlugin,
|
||||
}
|
||||
|
||||
impl TextMetaPlugin {
|
||||
pub fn new(
|
||||
options: Option<std::collections::HashMap<String, serde_yaml::Value>>,
|
||||
outputs: Option<std::collections::HashMap<String, serde_yaml::Value>>,
|
||||
) -> TextMetaPlugin {
|
||||
let mut base = crate::meta_plugin::BaseMetaPlugin::new();
|
||||
base.meta_name = "text".to_string();
|
||||
|
||||
// Initialize with helper function
|
||||
base.initialize_plugin(
|
||||
&["text", "binary", "text_word_count", "text_line_count"],
|
||||
options,
|
||||
outputs,
|
||||
);
|
||||
|
||||
let max_buffer_size = base.options.get("max_buffer_size")
|
||||
.and_then(|v| v.as_u64())
|
||||
.unwrap_or(PIPESIZE as u64) as usize;
|
||||
|
||||
TextMetaPlugin {
|
||||
buffer: Vec::new(),
|
||||
max_buffer_size,
|
||||
is_finalized: false,
|
||||
word_count: 0,
|
||||
line_count: 0,
|
||||
is_binary_content: None,
|
||||
base,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new_simple() -> TextMetaPlugin {
|
||||
Self::new(None, None)
|
||||
}
|
||||
|
||||
/// Count words and lines in a text chunk
|
||||
fn count_text_stats(&mut self, data: &[u8]) {
|
||||
// Count lines (newlines)
|
||||
self.line_count += data.iter().filter(|&&b| b == b'\n').count();
|
||||
|
||||
// Count words - we'll use a simple approach that counts whitespace-separated sequences
|
||||
let text = match std::str::from_utf8(data) {
|
||||
Ok(text) => text,
|
||||
Err(_) => return, // Not valid UTF-8, can't count words reliably
|
||||
};
|
||||
|
||||
// Simple word counting - this counts sequences of non-whitespace characters
|
||||
self.word_count += text.split_whitespace().count();
|
||||
}
|
||||
}
|
||||
|
||||
impl MetaPlugin for TextMetaPlugin {
|
||||
fn is_finalized(&self) -> bool {
|
||||
self.is_finalized
|
||||
}
|
||||
|
||||
fn set_finalized(&mut self, finalized: bool) {
|
||||
self.is_finalized = finalized;
|
||||
}
|
||||
|
||||
fn update(&mut self, data: &[u8]) -> MetaPluginResponse {
|
||||
// If already finalized, don't process more data
|
||||
if self.is_finalized {
|
||||
return MetaPluginResponse {
|
||||
metadata: Vec::new(),
|
||||
is_finalized: true,
|
||||
};
|
||||
}
|
||||
|
||||
// If we've already determined it's binary, stop processing
|
||||
if self.is_binary_content == Some(true) {
|
||||
return MetaPluginResponse {
|
||||
metadata: Vec::new(),
|
||||
is_finalized: false, // We might still want to finalize later
|
||||
};
|
||||
}
|
||||
|
||||
let mut metadata = Vec::new();
|
||||
|
||||
// Calculate how much data we can still accept
|
||||
let remaining_capacity = self.max_buffer_size.saturating_sub(self.buffer.len());
|
||||
if remaining_capacity > 0 {
|
||||
// Determine how much data to copy
|
||||
let bytes_to_take = std::cmp::min(data.len(), remaining_capacity);
|
||||
|
||||
// Add data to our buffer
|
||||
self.buffer.extend_from_slice(&data[..bytes_to_take]);
|
||||
|
||||
// If we have enough data to make a binary determination, do it now
|
||||
if self.buffer.len() >= std::cmp::min(1024, self.max_buffer_size) && self.is_binary_content.is_none() {
|
||||
let is_binary_result = is_binary(&self.buffer);
|
||||
self.is_binary_content = Some(is_binary_result);
|
||||
|
||||
// Output text and binary status immediately
|
||||
let text_value = if is_binary_result { "false".to_string() } else { "true".to_string() };
|
||||
let binary_value = if is_binary_result { "true".to_string() } else { "false".to_string() };
|
||||
|
||||
// Use process_metadata_outputs to handle output mapping
|
||||
if let Some(meta_data) = crate::meta_plugin::process_metadata_outputs(
|
||||
"text",
|
||||
text_value,
|
||||
self.base.outputs()
|
||||
) {
|
||||
metadata.push(meta_data);
|
||||
}
|
||||
|
||||
if let Some(meta_data) = crate::meta_plugin::process_metadata_outputs(
|
||||
"binary",
|
||||
binary_value,
|
||||
self.base.outputs()
|
||||
) {
|
||||
metadata.push(meta_data);
|
||||
}
|
||||
|
||||
// If it's binary, we're done with this plugin
|
||||
if is_binary_result {
|
||||
self.is_finalized = true;
|
||||
return MetaPluginResponse {
|
||||
metadata,
|
||||
is_finalized: true,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
// If content is text, count words and lines
|
||||
if self.is_binary_content == Some(false) {
|
||||
self.count_text_stats(&data[..bytes_to_take]);
|
||||
}
|
||||
}
|
||||
|
||||
// If we've reached our buffer limit and haven't finalized yet
|
||||
if self.buffer.len() >= self.max_buffer_size && !self.is_finalized {
|
||||
// We already determined it's text at this point, so we can finalize
|
||||
if self.is_binary_content == Some(false) {
|
||||
// Output word and line counts
|
||||
if let Some(meta_data) = crate::meta_plugin::process_metadata_outputs(
|
||||
"text_word_count",
|
||||
self.word_count.to_string(),
|
||||
self.base.outputs()
|
||||
) {
|
||||
metadata.push(meta_data);
|
||||
}
|
||||
|
||||
if let Some(meta_data) = crate::meta_plugin::process_metadata_outputs(
|
||||
"text_line_count",
|
||||
self.line_count.to_string(),
|
||||
self.base.outputs()
|
||||
) {
|
||||
metadata.push(meta_data);
|
||||
}
|
||||
}
|
||||
|
||||
// Mark as finalized
|
||||
self.is_finalized = true;
|
||||
}
|
||||
|
||||
let is_finalized = self.is_finalized;
|
||||
MetaPluginResponse {
|
||||
metadata,
|
||||
is_finalized,
|
||||
}
|
||||
}
|
||||
|
||||
fn finalize(&mut self) -> MetaPluginResponse {
|
||||
// If already finalized, don't process again
|
||||
if self.is_finalized {
|
||||
return MetaPluginResponse {
|
||||
metadata: Vec::new(),
|
||||
is_finalized: true,
|
||||
};
|
||||
}
|
||||
|
||||
let mut metadata = Vec::new();
|
||||
|
||||
// If we haven't determined binary status yet, do it now
|
||||
if self.is_binary_content.is_none() && !self.buffer.is_empty() {
|
||||
let is_binary_result = is_binary(&self.buffer);
|
||||
self.is_binary_content = Some(is_binary_result);
|
||||
|
||||
// Output text and binary status
|
||||
let text_value = if is_binary_result { "false".to_string() } else { "true".to_string() };
|
||||
let binary_value = if is_binary_result { "true".to_string() } else { "false".to_string() };
|
||||
|
||||
if let Some(meta_data) = crate::meta_plugin::process_metadata_outputs(
|
||||
"text",
|
||||
text_value,
|
||||
self.base.outputs()
|
||||
) {
|
||||
metadata.push(meta_data);
|
||||
}
|
||||
|
||||
if let Some(meta_data) = crate::meta_plugin::process_metadata_outputs(
|
||||
"binary",
|
||||
binary_value,
|
||||
self.base.outputs()
|
||||
) {
|
||||
metadata.push(meta_data);
|
||||
}
|
||||
}
|
||||
|
||||
// If content is text and we have some data, output word and line counts
|
||||
if self.is_binary_content == Some(false) && !self.buffer.is_empty() {
|
||||
// Count any remaining words/lines in the buffer if we haven't already
|
||||
if self.word_count == 0 && self.line_count == 0 {
|
||||
self.count_text_stats(&self.buffer);
|
||||
}
|
||||
|
||||
if let Some(meta_data) = crate::meta_plugin::process_metadata_outputs(
|
||||
"text_word_count",
|
||||
self.word_count.to_string(),
|
||||
self.base.outputs()
|
||||
) {
|
||||
metadata.push(meta_data);
|
||||
}
|
||||
|
||||
if let Some(meta_data) = crate::meta_plugin::process_metadata_outputs(
|
||||
"text_line_count",
|
||||
self.line_count.to_string(),
|
||||
self.base.outputs()
|
||||
) {
|
||||
metadata.push(meta_data);
|
||||
}
|
||||
}
|
||||
|
||||
// Mark as finalized
|
||||
self.is_finalized = true;
|
||||
|
||||
MetaPluginResponse {
|
||||
metadata,
|
||||
is_finalized: true,
|
||||
}
|
||||
}
|
||||
|
||||
fn meta_name(&self) -> String {
|
||||
self.base.meta_name.clone()
|
||||
}
|
||||
|
||||
fn outputs(&self) -> &std::collections::HashMap<String, serde_yaml::Value> {
|
||||
self.base.outputs()
|
||||
}
|
||||
|
||||
fn outputs_mut(&mut self) -> &mut std::collections::HashMap<String, serde_yaml::Value> {
|
||||
self.base.outputs_mut()
|
||||
}
|
||||
|
||||
fn default_outputs(&self) -> Vec<String> {
|
||||
vec!["text".to_string(), "binary".to_string(), "text_word_count".to_string(), "text_line_count".to_string()]
|
||||
}
|
||||
|
||||
fn options(&self) -> &std::collections::HashMap<String, serde_yaml::Value> {
|
||||
self.base.options()
|
||||
}
|
||||
|
||||
fn options_mut(&mut self) -> &mut std::collections::HashMap<String, serde_yaml::Value> {
|
||||
self.base.options_mut()
|
||||
}
|
||||
|
||||
fn configure_options(&mut self, options: &std::collections::HashMap<String, serde_yaml::Value>) -> anyhow::Result<()> {
|
||||
if let Some(max_buffer_size) = options.get("max_buffer_size") {
|
||||
if let Some(size) = max_buffer_size.as_u64() {
|
||||
self.max_buffer_size = size as usize;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user