From 0b57da071a251edfe12f7b5a3c5e49dc0513ac5c Mon Sep 17 00:00:00 2001
From: Andrew Phillips <andrew.phillips2@canada.ca>
Date: Tue, 26 Aug 2025 19:12:27 -0300
Subject: [PATCH] feat: implement accurate word counting across block
 boundaries

Co-authored-by: aider (openai/andrew/openrouter/qwen/qwen3-coder) <aider@aider.chat>
---
 src/meta_plugin/text.rs | 63 ++++++++++++++++++++++++++++++++++-------
 1 file changed, 52 insertions(+), 11 deletions(-)
diff --git a/src/meta_plugin/text.rs b/src/meta_plugin/text.rs
index a7f4c50..2394d7e 100644
--- a/src/meta_plugin/text.rs
+++ b/src/meta_plugin/text.rs
@@ -10,6 +10,10 @@ pub struct TextMetaPlugin {
     word_count: usize,
     line_count: usize,
     is_binary_content: Option<bool>,
+    // State for tracking word boundaries across chunks
+    in_word: bool,
+    // Buffer for handling UTF-8 character boundaries
+    utf8_buffer: Vec<u8>,
     base: crate::meta_plugin::BaseMetaPlugin,
 }
 
@@ -39,6 +43,8 @@ impl TextMetaPlugin {
             word_count: 0,
             line_count: 0,
             is_binary_content: None,
+            in_word: false,
+            utf8_buffer: Vec::new(),
             base,
         }
     }
@@ -47,19 +53,52 @@ impl TextMetaPlugin {
         Self::new(None, None)
     }
     
-    /// Count words and lines in a text chunk
+    /// Count words and lines in a text chunk, handling block boundaries correctly
     fn count_text_stats(&mut self, data: &[u8]) {
         // Count lines (newlines)
         self.line_count += data.iter().filter(|&&b| b == b'\n').count();
         
-        // Count words - we'll use a simple approach that counts whitespace-separated sequences
-        let text = match std::str::from_utf8(data) {
-            Ok(text) => text,
-            Err(_) => return, // Not valid UTF-8, can't count words reliably
+        // Handle UTF-8 character boundaries by combining with any buffered bytes
+        let combined_data = if !self.utf8_buffer.is_empty() {
+            let mut combined = self.utf8_buffer.clone();
+            combined.extend_from_slice(data);
+            combined
+        } else {
+            data.to_vec()
         };
         
-        // Simple word counting - this counts sequences of non-whitespace characters
-        self.word_count += text.split_whitespace().count();
+        // Clear the UTF-8 buffer
+        self.utf8_buffer.clear();
+        
+        // Convert to string, handling potential UTF-8 boundaries
+        let text = match std::str::from_utf8(&combined_data) {
+            Ok(text) => text,
+            Err(e) => {
+                // Buffer the undecodable tail for the next chunk. NOTE(review): truly invalid bytes (not just a split character) are kept too and stall later decoding
+                let valid_up_to = e.valid_up_to();
+                if valid_up_to < combined_data.len() {
+                    self.utf8_buffer.extend_from_slice(&combined_data[valid_up_to..]);
+                }
+                match std::str::from_utf8(&combined_data[..valid_up_to]) {
+                    Ok(text) => text,
+                    Err(_) => return, // Not expected: bytes before valid_up_to are guaranteed valid UTF-8
+                }
+            }
+        };
+        
+        // Count words using wc-like algorithm that tracks state across chunks
+        for ch in text.chars() {
+            let is_whitespace = ch.is_whitespace();
+            
+            if !self.in_word && !is_whitespace {
+                // Transition from whitespace to word - start of new word
+                self.word_count += 1;
+                self.in_word = true;
+            } else if self.in_word && is_whitespace {
+                // Transition from word to whitespace - end of current word
+                self.in_word = false;
+            }
+        }
     }
 }
 
@@ -214,12 +253,14 @@ impl MetaPlugin for TextMetaPlugin {
         
         // If content is text and we have some data, output word and line counts
         if self.is_binary_content == Some(false) && !self.buffer.is_empty() {
-            // Count any remaining words/lines in the buffer if we haven't already
-            if self.word_count == 0 && self.line_count == 0 {
-                let buffer_copy = self.buffer.clone();
-                self.count_text_stats(&buffer_copy);
+            // NOTE(review): no-op as written - the leftover bytes fail to decode again and are simply re-buffered
+            if !self.utf8_buffer.is_empty() {
+                self.count_text_stats(&[]);
             }
             
+            // If we're still in a word at the end of the stream, we've counted it correctly
+            // No special handling needed for this case
+            
             if let Some(meta_data) = crate::meta_plugin::process_metadata_outputs(
                 "text_word_count", 
                 self.word_count.to_string(),