feat: add text line length statistics and options
Co-authored-by: aider (openai/andrew/openrouter/deepseek/deepseek-chat-v3.1) <aider@aider.chat>
This commit is contained in:
@@ -2,7 +2,7 @@ use crate::common::is_binary::is_binary;
|
|||||||
use crate::common::PIPESIZE;
|
use crate::common::PIPESIZE;
|
||||||
use crate::meta_plugin::{MetaPlugin, MetaPluginResponse};
|
use crate::meta_plugin::{MetaPlugin, MetaPluginResponse};
|
||||||
|
|
||||||
#[derive(Debug, Clone, Default)]
|
#[derive(Debug, Clone)]
|
||||||
pub struct TextMetaPlugin {
|
pub struct TextMetaPlugin {
|
||||||
buffer: Option<Vec<u8>>,
|
buffer: Option<Vec<u8>>,
|
||||||
max_buffer_size: usize,
|
max_buffer_size: usize,
|
||||||
@@ -15,6 +15,13 @@ pub struct TextMetaPlugin {
|
|||||||
// Buffer for handling UTF-8 character boundaries
|
// Buffer for handling UTF-8 character boundaries
|
||||||
utf8_buffer: Vec<u8>,
|
utf8_buffer: Vec<u8>,
|
||||||
base: crate::meta_plugin::BaseMetaPlugin,
|
base: crate::meta_plugin::BaseMetaPlugin,
|
||||||
|
// Options to track specific statistics
|
||||||
|
track_word_count: bool,
|
||||||
|
track_line_count: bool,
|
||||||
|
track_line_lengths: bool,
|
||||||
|
// For tracking line lengths
|
||||||
|
line_lengths: Option<Vec<usize>>,
|
||||||
|
current_line_length: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl TextMetaPlugin {
|
impl TextMetaPlugin {
|
||||||
@@ -27,17 +34,40 @@ impl TextMetaPlugin {
|
|||||||
|
|
||||||
// Initialize with helper function
|
// Initialize with helper function
|
||||||
base.initialize_plugin(
|
base.initialize_plugin(
|
||||||
&["text", "binary", "text_word_count", "text_line_count"],
|
&["text", "binary", "text_word_count", "text_line_count",
|
||||||
|
"text_line_max_len", "text_line_mean_len", "text_line_median_len"],
|
||||||
options,
|
options,
|
||||||
outputs,
|
outputs,
|
||||||
);
|
);
|
||||||
|
|
||||||
log::debug!("TEXT: Plugin initialized with outputs: {:?}", base.outputs);
|
log::debug!("TEXT: Plugin initialized with outputs: {:?}", base.outputs);
|
||||||
|
|
||||||
let max_buffer_size = base.options.get("max_buffer_size")
|
// Get text_detect_size (previously max_buffer_size)
|
||||||
|
let max_buffer_size = base.options.get("text_detect_size")
|
||||||
|
.or_else(|| base.options.get("max_buffer_size")) // Handle backward compatibility
|
||||||
.and_then(|v| v.as_u64())
|
.and_then(|v| v.as_u64())
|
||||||
.unwrap_or(PIPESIZE as u64) as usize;
|
.unwrap_or(PIPESIZE as u64) as usize;
|
||||||
|
|
||||||
|
// Get which statistics to track
|
||||||
|
let track_word_count = base.options.get("text_word_count")
|
||||||
|
.and_then(|v| v.as_bool())
|
||||||
|
.unwrap_or(true);
|
||||||
|
let track_line_count = base.options.get("text_line_count")
|
||||||
|
.and_then(|v| v.as_bool())
|
||||||
|
.unwrap_or(true);
|
||||||
|
let track_line_max_len = base.options.get("text_line_max_len")
|
||||||
|
.and_then(|v| v.as_bool())
|
||||||
|
.unwrap_or(true);
|
||||||
|
let track_line_mean_len = base.options.get("text_line_mean_len")
|
||||||
|
.and_then(|v| v.as_bool())
|
||||||
|
.unwrap_or(true);
|
||||||
|
let track_line_median_len = base.options.get("text_line_median_len")
|
||||||
|
.and_then(|v| v.as_bool())
|
||||||
|
.unwrap_or(true);
|
||||||
|
|
||||||
|
// Track line lengths if any of the line length options are enabled
|
||||||
|
let track_line_lengths = track_line_max_len || track_line_mean_len || track_line_median_len;
|
||||||
|
|
||||||
TextMetaPlugin {
|
TextMetaPlugin {
|
||||||
buffer: Some(Vec::new()),
|
buffer: Some(Vec::new()),
|
||||||
max_buffer_size,
|
max_buffer_size,
|
||||||
@@ -48,6 +78,12 @@ impl TextMetaPlugin {
|
|||||||
in_word: false,
|
in_word: false,
|
||||||
utf8_buffer: Vec::new(),
|
utf8_buffer: Vec::new(),
|
||||||
base,
|
base,
|
||||||
|
// Add fields for line length tracking
|
||||||
|
track_word_count,
|
||||||
|
track_line_count,
|
||||||
|
track_line_lengths,
|
||||||
|
line_lengths: if track_line_lengths { Some(Vec::new()) } else { None },
|
||||||
|
current_line_length: 0,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -57,8 +93,10 @@ impl TextMetaPlugin {
|
|||||||
|
|
||||||
/// Count words and lines in a text chunk, handling block boundaries correctly
|
/// Count words and lines in a text chunk, handling block boundaries correctly
|
||||||
fn count_text_stats(&mut self, data: &[u8]) {
|
fn count_text_stats(&mut self, data: &[u8]) {
|
||||||
// Count lines (newlines)
|
// Count lines (newlines) if needed
|
||||||
|
if self.track_line_count {
|
||||||
self.line_count += data.iter().filter(|&&b| b == b'\n').count();
|
self.line_count += data.iter().filter(|&&b| b == b'\n').count();
|
||||||
|
}
|
||||||
|
|
||||||
// Handle UTF-8 character boundaries by combining with any buffered bytes
|
// Handle UTF-8 character boundaries by combining with any buffered bytes
|
||||||
let combined_data = if !self.utf8_buffer.is_empty() {
|
let combined_data = if !self.utf8_buffer.is_empty() {
|
||||||
@@ -88,7 +126,8 @@ impl TextMetaPlugin {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// Count words using wc-like algorithm that tracks state across chunks
|
// Count words if needed
|
||||||
|
if self.track_word_count {
|
||||||
for ch in text.chars() {
|
for ch in text.chars() {
|
||||||
let is_whitespace = ch.is_whitespace();
|
let is_whitespace = ch.is_whitespace();
|
||||||
|
|
||||||
@@ -103,6 +142,21 @@ impl TextMetaPlugin {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Track line lengths if needed
|
||||||
|
if self.track_line_lengths {
|
||||||
|
for ch in text.chars() {
|
||||||
|
if ch == '\n' {
|
||||||
|
if let Some(ref mut lengths) = self.line_lengths {
|
||||||
|
lengths.push(self.current_line_length);
|
||||||
|
}
|
||||||
|
self.current_line_length = 0;
|
||||||
|
} else {
|
||||||
|
self.current_line_length += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Helper method to perform binary detection and return appropriate metadata
|
/// Helper method to perform binary detection and return appropriate metadata
|
||||||
/// Returns (metadata, should_finalize) tuple
|
/// Returns (metadata, should_finalize) tuple
|
||||||
fn perform_binary_detection(&mut self, buffer: &[u8]) -> (Vec<crate::meta_plugin::MetaData>, bool) {
|
fn perform_binary_detection(&mut self, buffer: &[u8]) -> (Vec<crate::meta_plugin::MetaData>, bool) {
|
||||||
@@ -143,11 +197,19 @@ impl TextMetaPlugin {
|
|||||||
self.count_text_stats(&[]);
|
self.count_text_stats(&[]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Handle the last line if tracking line lengths
|
||||||
|
if self.track_line_lengths && self.current_line_length > 0 {
|
||||||
|
if let Some(ref mut lengths) = self.line_lengths {
|
||||||
|
lengths.push(self.current_line_length);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Debug: check if outputs are configured
|
// Debug: check if outputs are configured
|
||||||
log::debug!("TEXT: Outputs: {:?}", self.base.outputs());
|
log::debug!("TEXT: Outputs: {:?}", self.base.outputs());
|
||||||
log::debug!("TEXT: Word count: {}, Line count: {}", self.word_count, self.line_count);
|
log::debug!("TEXT: Word count: {}, Line count: {}", self.word_count, self.line_count);
|
||||||
|
|
||||||
// Output word and line counts
|
// Output word count if tracked
|
||||||
|
if self.track_word_count {
|
||||||
if let Some(meta_data) = crate::meta_plugin::process_metadata_outputs(
|
if let Some(meta_data) = crate::meta_plugin::process_metadata_outputs(
|
||||||
"text_word_count",
|
"text_word_count",
|
||||||
self.word_count.to_string(),
|
self.word_count.to_string(),
|
||||||
@@ -158,7 +220,10 @@ impl TextMetaPlugin {
|
|||||||
} else {
|
} else {
|
||||||
log::debug!("TEXT: Word count output is disabled or not mapped");
|
log::debug!("TEXT: Word count output is disabled or not mapped");
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Output line count if tracked
|
||||||
|
if self.track_line_count {
|
||||||
if let Some(meta_data) = crate::meta_plugin::process_metadata_outputs(
|
if let Some(meta_data) = crate::meta_plugin::process_metadata_outputs(
|
||||||
"text_line_count",
|
"text_line_count",
|
||||||
self.line_count.to_string(),
|
self.line_count.to_string(),
|
||||||
@@ -169,6 +234,50 @@ impl TextMetaPlugin {
|
|||||||
} else {
|
} else {
|
||||||
log::debug!("TEXT: Line count output is disabled or not mapped");
|
log::debug!("TEXT: Line count output is disabled or not mapped");
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Output line length statistics if tracked
|
||||||
|
if self.track_line_lengths {
|
||||||
|
if let Some(lengths) = &self.line_lengths {
|
||||||
|
if !lengths.is_empty() {
|
||||||
|
// Calculate max, mean, median
|
||||||
|
let max_len = lengths.iter().max().unwrap();
|
||||||
|
let sum: usize = lengths.iter().sum();
|
||||||
|
let mean_len = sum as f64 / lengths.len() as f64;
|
||||||
|
|
||||||
|
let mut sorted_lengths = lengths.clone();
|
||||||
|
sorted_lengths.sort();
|
||||||
|
let median_len = if lengths.len() % 2 == 0 {
|
||||||
|
(sorted_lengths[lengths.len() / 2 - 1] + sorted_lengths[lengths.len() / 2]) as f64 / 2.0
|
||||||
|
} else {
|
||||||
|
sorted_lengths[lengths.len() / 2] as f64
|
||||||
|
};
|
||||||
|
|
||||||
|
// Add each statistic if its corresponding option is enabled
|
||||||
|
if let Some(meta_data) = crate::meta_plugin::process_metadata_outputs(
|
||||||
|
"text_line_max_len",
|
||||||
|
max_len.to_string(),
|
||||||
|
self.base.outputs()
|
||||||
|
) {
|
||||||
|
metadata.push(meta_data);
|
||||||
|
}
|
||||||
|
if let Some(meta_data) = crate::meta_plugin::process_metadata_outputs(
|
||||||
|
"text_line_mean_len",
|
||||||
|
mean_len.to_string(),
|
||||||
|
self.base.outputs()
|
||||||
|
) {
|
||||||
|
metadata.push(meta_data);
|
||||||
|
}
|
||||||
|
if let Some(meta_data) = crate::meta_plugin::process_metadata_outputs(
|
||||||
|
"text_line_median_len",
|
||||||
|
median_len.to_string(),
|
||||||
|
self.base.outputs()
|
||||||
|
) {
|
||||||
|
metadata.push(meta_data);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
metadata
|
metadata
|
||||||
}
|
}
|
||||||
@@ -336,7 +445,15 @@ impl MetaPlugin for TextMetaPlugin {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn default_outputs(&self) -> Vec<String> {
|
fn default_outputs(&self) -> Vec<String> {
|
||||||
vec!["text".to_string(), "binary".to_string(), "text_word_count".to_string(), "text_line_count".to_string()]
|
vec![
|
||||||
|
"text".to_string(),
|
||||||
|
"binary".to_string(),
|
||||||
|
"text_word_count".to_string(),
|
||||||
|
"text_line_count".to_string(),
|
||||||
|
"text_line_max_len".to_string(),
|
||||||
|
"text_line_mean_len".to_string(),
|
||||||
|
"text_line_median_len".to_string()
|
||||||
|
]
|
||||||
}
|
}
|
||||||
|
|
||||||
fn options(&self) -> &std::collections::HashMap<String, serde_yaml::Value> {
|
fn options(&self) -> &std::collections::HashMap<String, serde_yaml::Value> {
|
||||||
@@ -348,11 +465,50 @@ impl MetaPlugin for TextMetaPlugin {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Applies option overrides to this plugin.
///
/// Recognized keys in `options`:
/// - `text_detect_size` (unsigned integer): new value for `max_buffer_size`.
///   The legacy key `max_buffer_size` is consulted only when
///   `text_detect_size` is absent.
/// - `text_word_count` / `text_line_count` (bool): toggle the word and
///   line counters.
/// - `text_line_max_len` / `text_line_mean_len` / `text_line_median_len`
///   (bool): together decide whether per-line lengths are recorded.
///
/// A key that is absent, or whose value has the wrong type, leaves the
/// corresponding setting untouched. In particular, a call that mentions none
/// of the line-length keys no longer switches line-length tracking off.
///
/// Always returns `Ok(())`.
fn configure_options(&mut self, options: &std::collections::HashMap<String, serde_yaml::Value>) -> anyhow::Result<()> {
    const LINE_LENGTH_KEYS: [&str; 3] = [
        "text_line_max_len",
        "text_line_mean_len",
        "text_line_median_len",
    ];

    // Same lookup order as the constructor: new name first, legacy name
    // only when the new one is missing.
    if let Some(size) = options
        .get("text_detect_size")
        .or_else(|| options.get("max_buffer_size"))
        .and_then(|v| v.as_u64())
    {
        self.max_buffer_size = size as usize;
    }

    if let Some(enabled) = options.get("text_word_count").and_then(|v| v.as_bool()) {
        self.track_word_count = enabled;
    }
    if let Some(enabled) = options.get("text_line_count").and_then(|v| v.as_bool()) {
        self.track_line_count = enabled;
    }

    // Only touch the line-length flag when the caller explicitly supplied at
    // least one of the line-length options. Recomputing it unconditionally
    // with a `false` fallback would disable tracking on every unrelated
    // reconfiguration, contradicting the constructor's default of `true`.
    let explicit = |key: &str| options.get(key).and_then(|v| v.as_bool());
    if LINE_LENGTH_KEYS.iter().copied().any(|key| explicit(key).is_some()) {
        // For keys not supplied in this call, fall back to the options the
        // plugin was initialized with, then to the constructor default.
        // NOTE(review): assumes `self.base.options` still holds the
        // initialization options — confirm against `BaseMetaPlugin`.
        let any_enabled = LINE_LENGTH_KEYS.iter().copied().any(|key| {
            explicit(key)
                .or_else(|| self.base.options.get(key).and_then(|v| v.as_bool()))
                .unwrap_or(true)
        });
        self.track_line_lengths = any_enabled;
    }

    // Make sure the storage exists whenever tracking is on; lengths that
    // were already collected are kept.
    if self.track_line_lengths && self.line_lengths.is_none() {
        self.line_lengths = Some(Vec::new());
    }

    Ok(())
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user