From 0b57da071a251edfe12f7b5a3c5e49dc0513ac5c Mon Sep 17 00:00:00 2001
From: Andrew Phillips
Date: Tue, 26 Aug 2025 19:12:27 -0300
Subject: [PATCH] feat: implement accurate word counting across block boundaries

Co-authored-by: aider (openai/andrew/openrouter/qwen/qwen3-coder)
---
 src/meta_plugin/text.rs | 91 +++++++++++++++++++++++++++++++++++------
 1 file changed, 78 insertions(+), 13 deletions(-)

diff --git a/src/meta_plugin/text.rs b/src/meta_plugin/text.rs
index a7f4c50..2394d7e 100644
--- a/src/meta_plugin/text.rs
+++ b/src/meta_plugin/text.rs
@@ -10,6 +10,10 @@ pub struct TextMetaPlugin {
     word_count: usize,
     line_count: usize,
     is_binary_content: Option<bool>,
+    // State for tracking word boundaries across chunks
+    in_word: bool,
+    // Buffer for handling UTF-8 character boundaries
+    utf8_buffer: Vec<u8>,
     base: crate::meta_plugin::BaseMetaPlugin,
 }
 
@@ -39,6 +43,8 @@ impl TextMetaPlugin {
             word_count: 0,
             line_count: 0,
             is_binary_content: None,
+            in_word: false,
+            utf8_buffer: Vec::new(),
             base,
         }
     }
@@ -47,19 +53,76 @@ impl TextMetaPlugin {
         Self::new(None, None)
     }
 
-    /// Count words and lines in a text chunk
+    /// Count words and lines in a text chunk, handling block boundaries correctly.
+    ///
+    /// Every `\n` byte in `data` adds one to `line_count`. Words are runs of
+    /// non-whitespace characters (`char::is_whitespace`); `in_word` carries the word
+    /// state from one call to the next, so a word split across two chunks is counted
+    /// once.
+    ///
+    /// A multi-byte UTF-8 character truncated at the end of `data` is kept in
+    /// `utf8_buffer` and decoded together with the next chunk; if no more bytes
+    /// arrive it stays uncounted. Bytes that can never form valid UTF-8 are skipped
+    /// without changing the word state. Buffering them instead would make every
+    /// later chunk fail to decode at the same offset, freezing the word count while
+    /// the buffer keeps growing.
     fn count_text_stats(&mut self, data: &[u8]) {
         // Count lines (newlines)
         self.line_count += data.iter().filter(|&&b| b == b'\n').count();
 
-        // Count words - we'll use a simple approach that counts whitespace-separated sequences
-        let text = match std::str::from_utf8(data) {
-            Ok(text) => text,
-            Err(_) => return, // Not valid UTF-8, can't count words reliably
-        };
-
-        // Simple word counting - this counts sequences of non-whitespace characters
-        self.word_count += text.split_whitespace().count();
+        // Reuse bytes carried over from the previous chunk; `take` leaves the buffer
+        // empty so that only a fresh truncated tail is stored below.
+        let carried = std::mem::take(&mut self.utf8_buffer);
+        let joined: Vec<u8>;
+        let mut rest: &[u8] = if carried.is_empty() {
+            // Common case: decode the chunk in place, no copy needed
+            data
+        } else {
+            joined = [carried.as_slice(), data].concat();
+            &joined
+        };
+
+        while !rest.is_empty() {
+            let err = match std::str::from_utf8(rest) {
+                Ok(text) => {
+                    self.count_words(text);
+                    break;
+                }
+                Err(err) => err,
+            };
+
+            // Everything before the error offset is valid UTF-8 by definition
+            let (valid, tail) = rest.split_at(err.valid_up_to());
+            if let Ok(text) = std::str::from_utf8(valid) {
+                self.count_words(text);
+            }
+
+            match err.error_len() {
+                // Truncated character at the end of the input: wait for the next chunk
+                None => {
+                    self.utf8_buffer.extend_from_slice(tail);
+                    break;
+                }
+                // Invalid sequence: skip it and keep decoding what follows
+                Some(invalid_len) => rest = &tail[invalid_len..],
+            }
+        }
     }
+
+    /// Advance the word counter over `text`, resuming from the `in_word` state.
+    fn count_words(&mut self, text: &str) {
+        for ch in text.chars() {
+            let is_whitespace = ch.is_whitespace();
+
+            if !self.in_word && !is_whitespace {
+                // Transition from whitespace to word - start of new word
+                self.word_count += 1;
+                self.in_word = true;
+            } else if self.in_word && is_whitespace {
+                // Transition from word to whitespace - end of current word
+                self.in_word = false;
+            }
+        }
+    }
 }
 
@@ -214,12 +277,14 @@ impl MetaPlugin for TextMetaPlugin {
 
         // If content is text and we have some data, output word and line counts
         if self.is_binary_content == Some(false) && !self.buffer.is_empty() {
-            // Count any remaining words/lines in the buffer if we haven't already
-            if self.word_count == 0 && self.line_count == 0 {
-                let buffer_copy = self.buffer.clone();
-                self.count_text_stats(&buffer_copy);
+            // Process any remaining data in utf8_buffer
+            if !self.utf8_buffer.is_empty() {
+                self.count_text_stats(&[]);
             }
 
+            // If we're still in a word at the end of the stream, we've counted it correctly
+            // No special handling needed for this case
+
             if let Some(meta_data) = crate::meta_plugin::process_metadata_outputs(
                 "text_word_count",
                 self.word_count.to_string(),