diff --git a/src/meta_plugin/text.rs b/src/meta_plugin/text.rs index aab3cda..3b8277b 100644 --- a/src/meta_plugin/text.rs +++ b/src/meta_plugin/text.rs @@ -2,7 +2,7 @@ use crate::common::is_binary::is_binary; use crate::common::PIPESIZE; use crate::meta_plugin::{MetaPlugin, MetaPluginResponse}; -#[derive(Debug, Clone, Default)] +#[derive(Debug, Clone)] pub struct TextMetaPlugin { buffer: Option>, max_buffer_size: usize, @@ -15,6 +15,13 @@ pub struct TextMetaPlugin { // Buffer for handling UTF-8 character boundaries utf8_buffer: Vec, base: crate::meta_plugin::BaseMetaPlugin, + // Options to track specific statistics + track_word_count: bool, + track_line_count: bool, + track_line_lengths: bool, + // For tracking line lengths + line_lengths: Option>, + current_line_length: usize, } impl TextMetaPlugin { @@ -27,17 +34,40 @@ impl TextMetaPlugin { // Initialize with helper function base.initialize_plugin( - &["text", "binary", "text_word_count", "text_line_count"], + &["text", "binary", "text_word_count", "text_line_count", + "text_line_max_len", "text_line_mean_len", "text_line_median_len"], options, outputs, ); log::debug!("TEXT: Plugin initialized with outputs: {:?}", base.outputs); - let max_buffer_size = base.options.get("max_buffer_size") + // Get text_detect_size (previously max_buffer_size) + let max_buffer_size = base.options.get("text_detect_size") + .or_else(|| base.options.get("max_buffer_size")) // Handle backward compatibility .and_then(|v| v.as_u64()) .unwrap_or(PIPESIZE as u64) as usize; + // Get which statistics to track + let track_word_count = base.options.get("text_word_count") + .and_then(|v| v.as_bool()) + .unwrap_or(true); + let track_line_count = base.options.get("text_line_count") + .and_then(|v| v.as_bool()) + .unwrap_or(true); + let track_line_max_len = base.options.get("text_line_max_len") + .and_then(|v| v.as_bool()) + .unwrap_or(true); + let track_line_mean_len = base.options.get("text_line_mean_len") + .and_then(|v| v.as_bool()) + .unwrap_or(true); + let track_line_median_len = base.options.get("text_line_median_len") + .and_then(|v| v.as_bool()) + .unwrap_or(true); + + // Track line lengths if any of the line length options are enabled + let track_line_lengths = track_line_max_len || track_line_mean_len || track_line_median_len; + TextMetaPlugin { buffer: Some(Vec::new()), max_buffer_size, @@ -48,6 +78,12 @@ impl TextMetaPlugin { in_word: false, utf8_buffer: Vec::new(), base, + // Add fields for line length tracking + track_word_count, + track_line_count, + track_line_lengths, + line_lengths: if track_line_lengths { Some(Vec::new()) } else { None }, + current_line_length: 0, } } @@ -57,8 +93,10 @@ impl TextMetaPlugin { /// Count words and lines in a text chunk, handling block boundaries correctly fn count_text_stats(&mut self, data: &[u8]) { - // Count lines (newlines) - self.line_count += data.iter().filter(|&&b| b == b'\n').count(); + // Count lines (newlines) if needed + if self.track_line_count { + self.line_count += data.iter().filter(|&&b| b == b'\n').count(); + } // Handle UTF-8 character boundaries by combining with any buffered bytes let combined_data = if !self.utf8_buffer.is_empty() { @@ -88,17 +126,33 @@ impl TextMetaPlugin { } }; - // Count words using wc-like algorithm that tracks state across chunks - for ch in text.chars() { - let is_whitespace = ch.is_whitespace(); - - if !self.in_word && !is_whitespace { - // Transition from whitespace to word - start of new word - self.word_count += 1; - self.in_word = true; - } else if self.in_word && is_whitespace { - // Transition from word to whitespace - end of current word - self.in_word = false; + // Count words if needed + if self.track_word_count { + for ch in text.chars() { + let is_whitespace = ch.is_whitespace(); + + if !self.in_word && !is_whitespace { + // Transition from whitespace to word - start of new word + self.word_count += 1; + self.in_word = true; + } else if self.in_word && is_whitespace { + // Transition from word to whitespace - end of current word + self.in_word = false; + } + } + } + + // Track line lengths if needed + if self.track_line_lengths { + for ch in text.chars() { + if ch == '\n' { + if let Some(ref mut lengths) = self.line_lengths { + lengths.push(self.current_line_length); + } + self.current_line_length = 0; + } else { + self.current_line_length += 1; + } } } } @@ -143,31 +197,86 @@ impl TextMetaPlugin { self.count_text_stats(&[]); } + // Handle the last line if tracking line lengths + if self.track_line_lengths && self.current_line_length > 0 { + if let Some(ref mut lengths) = self.line_lengths { + lengths.push(self.current_line_length); + } + } + // Debug: check if outputs are configured log::debug!("TEXT: Outputs: {:?}", self.base.outputs()); log::debug!("TEXT: Word count: {}, Line count: {}", self.word_count, self.line_count); - // Output word and line counts - if let Some(meta_data) = crate::meta_plugin::process_metadata_outputs( - "text_word_count", - self.word_count.to_string(), - self.base.outputs() - ) { - log::debug!("TEXT: Adding word count metadata: {:?}", meta_data); - metadata.push(meta_data); - } else { - log::debug!("TEXT: Word count output is disabled or not mapped"); + // Output word count if tracked + if self.track_word_count { + if let Some(meta_data) = crate::meta_plugin::process_metadata_outputs( + "text_word_count", + self.word_count.to_string(), + self.base.outputs() + ) { + log::debug!("TEXT: Adding word count metadata: {:?}", meta_data); + metadata.push(meta_data); + } else { + log::debug!("TEXT: Word count output is disabled or not mapped"); + } } - if let Some(meta_data) = crate::meta_plugin::process_metadata_outputs( - "text_line_count", - self.line_count.to_string(), - self.base.outputs() - ) { - log::debug!("TEXT: Adding line count metadata: {:?}", meta_data); - metadata.push(meta_data); - } else { - log::debug!("TEXT: Line count output is disabled or not mapped"); + // Output line count if tracked + if self.track_line_count { + if let Some(meta_data) = crate::meta_plugin::process_metadata_outputs( + "text_line_count", + self.line_count.to_string(), + self.base.outputs() + ) { + log::debug!("TEXT: Adding line count metadata: {:?}", meta_data); + metadata.push(meta_data); + } else { + log::debug!("TEXT: Line count output is disabled or not mapped"); + } + } + + // Output line length statistics if tracked + if self.track_line_lengths { + if let Some(lengths) = &self.line_lengths { + if !lengths.is_empty() { + // Calculate max, mean, median + let max_len = lengths.iter().max().unwrap(); + let sum: usize = lengths.iter().sum(); + let mean_len = sum as f64 / lengths.len() as f64; + + let mut sorted_lengths = lengths.clone(); + sorted_lengths.sort(); + let median_len = if lengths.len() % 2 == 0 { + (sorted_lengths[lengths.len() / 2 - 1] + sorted_lengths[lengths.len() / 2]) as f64 / 2.0 + } else { + sorted_lengths[lengths.len() / 2] as f64 + }; + + // Add each statistic if its corresponding option is enabled + if let Some(meta_data) = crate::meta_plugin::process_metadata_outputs( + "text_line_max_len", + max_len.to_string(), + self.base.outputs() + ) { + metadata.push(meta_data); + } + if let Some(meta_data) = crate::meta_plugin::process_metadata_outputs( + "text_line_mean_len", + mean_len.to_string(), + self.base.outputs() + ) { + metadata.push(meta_data); + } + if let Some(meta_data) = crate::meta_plugin::process_metadata_outputs( + "text_line_median_len", + median_len.to_string(), + self.base.outputs() + ) { + metadata.push(meta_data); + } + } + } } metadata @@ -336,7 +445,15 @@ impl MetaPlugin for TextMetaPlugin { } fn default_outputs(&self) -> Vec { - vec!["text".to_string(), "binary".to_string(), "text_word_count".to_string(), "text_line_count".to_string()] + vec![ + "text".to_string(), + "binary".to_string(), + "text_word_count".to_string(), + "text_line_count".to_string(), + "text_line_max_len".to_string(), + "text_line_mean_len".to_string(), + "text_line_median_len".to_string() + ] } fn options(&self) -> &std::collections::HashMap { @@ -348,11 +465,50 @@ impl MetaPlugin for TextMetaPlugin { } fn configure_options(&mut self, options: &std::collections::HashMap) -> anyhow::Result<()> { - if let Some(max_buffer_size) = options.get("max_buffer_size") { + if let Some(text_detect_size) = options.get("text_detect_size") { + if let Some(size) = text_detect_size.as_u64() { + self.max_buffer_size = size as usize; + } + } + // Handle the old option name for backward compatibility + else if let Some(max_buffer_size) = options.get("max_buffer_size") { if let Some(size) = max_buffer_size.as_u64() { self.max_buffer_size = size as usize; } } + + // Update tracking options + if let Some(track) = options.get("text_word_count") { + if let Some(track_bool) = track.as_bool() { + self.track_word_count = track_bool; + } + } + if let Some(track) = options.get("text_line_count") { + if let Some(track_bool) = track.as_bool() { + self.track_line_count = track_bool; + } + } + if let Some(track) = options.get("text_line_max_len") { + if let Some(track_bool) = track.as_bool() { + if track_bool { + self.track_line_lengths = true; + if self.line_lengths.is_none() { + self.line_lengths = Some(Vec::new()); + } + } + } + } + // Similar for mean and median, but we'll just check if any are true to enable tracking + // For simplicity, we'll enable tracking if any of the line length options are true + let track_line_max = options.get("text_line_max_len").and_then(|v| v.as_bool()).unwrap_or(false); + let track_line_mean = options.get("text_line_mean_len").and_then(|v| v.as_bool()).unwrap_or(false); + let track_line_median = options.get("text_line_median_len").and_then(|v| v.as_bool()).unwrap_or(false); + + self.track_line_lengths = track_line_max || track_line_mean || track_line_median; + if self.track_line_lengths && self.line_lengths.is_none() { + self.line_lengths = Some(Vec::new()); + } + Ok(()) } }