feat: implement accurate word counting across block boundaries
Co-authored-by: aider (openai/andrew/openrouter/qwen/qwen3-coder) <aider@aider.chat>
This commit is contained in:
@@ -10,6 +10,10 @@ pub struct TextMetaPlugin {
|
|||||||
word_count: usize,
|
word_count: usize,
|
||||||
line_count: usize,
|
line_count: usize,
|
||||||
is_binary_content: Option<bool>,
|
is_binary_content: Option<bool>,
|
||||||
|
// State for tracking word boundaries across chunks
|
||||||
|
in_word: bool,
|
||||||
|
// Buffer for handling UTF-8 character boundaries
|
||||||
|
utf8_buffer: Vec<u8>,
|
||||||
base: crate::meta_plugin::BaseMetaPlugin,
|
base: crate::meta_plugin::BaseMetaPlugin,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -39,6 +43,8 @@ impl TextMetaPlugin {
|
|||||||
word_count: 0,
|
word_count: 0,
|
||||||
line_count: 0,
|
line_count: 0,
|
||||||
is_binary_content: None,
|
is_binary_content: None,
|
||||||
|
in_word: false,
|
||||||
|
utf8_buffer: Vec::new(),
|
||||||
base,
|
base,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -47,19 +53,52 @@ impl TextMetaPlugin {
|
|||||||
Self::new(None, None)
|
Self::new(None, None)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Count words and lines in a text chunk
|
/// Count words and lines in a text chunk, handling block boundaries correctly
|
||||||
fn count_text_stats(&mut self, data: &[u8]) {
|
fn count_text_stats(&mut self, data: &[u8]) {
|
||||||
// Count lines (newlines)
|
// Count lines (newlines)
|
||||||
self.line_count += data.iter().filter(|&&b| b == b'\n').count();
|
self.line_count += data.iter().filter(|&&b| b == b'\n').count();
|
||||||
|
|
||||||
// Count words - we'll use a simple approach that counts whitespace-separated sequences
|
// Handle UTF-8 character boundaries by combining with any buffered bytes
|
||||||
let text = match std::str::from_utf8(data) {
|
let combined_data = if !self.utf8_buffer.is_empty() {
|
||||||
Ok(text) => text,
|
let mut combined = self.utf8_buffer.clone();
|
||||||
Err(_) => return, // Not valid UTF-8, can't count words reliably
|
combined.extend_from_slice(data);
|
||||||
|
combined
|
||||||
|
} else {
|
||||||
|
data.to_vec()
|
||||||
};
|
};
|
||||||
|
|
||||||
// Simple word counting - this counts sequences of non-whitespace characters
|
// Clear the UTF-8 buffer
|
||||||
self.word_count += text.split_whitespace().count();
|
self.utf8_buffer.clear();
|
||||||
|
|
||||||
|
// Convert to string, handling potential UTF-8 boundaries
|
||||||
|
let text = match std::str::from_utf8(&combined_data) {
|
||||||
|
Ok(text) => text,
|
||||||
|
Err(e) => {
|
||||||
|
// If we have incomplete UTF-8 at the end, buffer those bytes for next chunk
|
||||||
|
let valid_up_to = e.valid_up_to();
|
||||||
|
if valid_up_to < combined_data.len() {
|
||||||
|
self.utf8_buffer.extend_from_slice(&combined_data[valid_up_to..]);
|
||||||
|
}
|
||||||
|
match std::str::from_utf8(&combined_data[..valid_up_to]) {
|
||||||
|
Ok(text) => text,
|
||||||
|
Err(_) => return, // Can't process this data
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Count words using wc-like algorithm that tracks state across chunks
|
||||||
|
for ch in text.chars() {
|
||||||
|
let is_whitespace = ch.is_whitespace();
|
||||||
|
|
||||||
|
if !self.in_word && !is_whitespace {
|
||||||
|
// Transition from whitespace to word - start of new word
|
||||||
|
self.word_count += 1;
|
||||||
|
self.in_word = true;
|
||||||
|
} else if self.in_word && is_whitespace {
|
||||||
|
// Transition from word to whitespace - end of current word
|
||||||
|
self.in_word = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -214,12 +253,14 @@ impl MetaPlugin for TextMetaPlugin {
|
|||||||
|
|
||||||
// If content is text and we have some data, output word and line counts
|
// If content is text and we have some data, output word and line counts
|
||||||
if self.is_binary_content == Some(false) && !self.buffer.is_empty() {
|
if self.is_binary_content == Some(false) && !self.buffer.is_empty() {
|
||||||
// Count any remaining words/lines in the buffer if we haven't already
|
// Process any remaining data in utf8_buffer
|
||||||
if self.word_count == 0 && self.line_count == 0 {
|
if !self.utf8_buffer.is_empty() {
|
||||||
let buffer_copy = self.buffer.clone();
|
self.count_text_stats(&[]);
|
||||||
self.count_text_stats(&buffer_copy);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// If we're still in a word at the end of the stream, we've counted it correctly
|
||||||
|
// No special handling needed for this case
|
||||||
|
|
||||||
if let Some(meta_data) = crate::meta_plugin::process_metadata_outputs(
|
if let Some(meta_data) = crate::meta_plugin::process_metadata_outputs(
|
||||||
"text_word_count",
|
"text_word_count",
|
||||||
self.word_count.to_string(),
|
self.word_count.to_string(),
|
||||||
|
|||||||
Reference in New Issue
Block a user