feat: add plugin schema system, tokenizer cache, and config validation

- Add plugin schema types and runtime discovery for meta/filter plugins
- Rewrite --generate-config to use schema system instead of hardcoded types
- Add Settings::validate_config() for startup validation
- Cache tokenizer instances via static Lazy to avoid repeated BPE loading (see the sketch after this list)
- Add split_by_token_iter() and count_bounded() to Tokenizer
- Fix double-counting bug in TokensMetaPlugin when buffer < max_buffer_size
- Eliminate unnecessary allocations in token count methods
- Refactor token filters: remove Option<Tokenizer>, use iterator API
- Fix TailTokensFilter correctness: unbounded buffer instead of ring buffer
- Add encoding option to all token filters
- Add description() to MetaPlugin and FilterPlugin traits
- Fix unused_mut warning in compression engine (feature-gated code)
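
For reference, the cache behind get_tokenizer() is plausibly one static Lazy
instance per encoding. A minimal sketch of the approach only — the encoding
variant names below are assumptions, not the actual enum:

    use once_cell::sync::Lazy;
    use crate::tokenizer::{TokenEncoding, Tokenizer};

    // Built once on first use, then shared for the life of the process,
    // so constructing a plugin no longer reloads the BPE ranks each time.
    static CL100K: Lazy<Tokenizer> = Lazy::new(|| {
        Tokenizer::new(TokenEncoding::Cl100kBase).expect("Failed to create tokenizer")
    });
    static O200K: Lazy<Tokenizer> = Lazy::new(|| {
        Tokenizer::new(TokenEncoding::O200kBase).expect("Failed to create tokenizer")
    });

    pub fn get_tokenizer(encoding: TokenEncoding) -> &'static Tokenizer {
        match encoding {
            TokenEncoding::Cl100kBase => &CL100K,
            TokenEncoding::O200kBase => &O200K,
        }
    }

This is why TokensMetaPlugin can store just the TokenEncoding and look the
instance up on demand. The new count_bounded() is called the same way; its
signature is assumed here, since it does not appear in this file's diff:

    // Hypothetical shape: stop encoding once `limit` tokens have been
    // counted instead of tokenizing the entire input.
    let n = get_tokenizer(encoding).count_bounded(text, limit);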

Co-Authored-By: code-review-bot <noreply@anthropic.com>
2026-03-13 20:23:17 -03:00
parent 914190e119
commit e7d8a83369
16 changed files with 831 additions and 420 deletions


@@ -1,7 +1,7 @@
 use crate::common::PIPESIZE;
 use crate::common::is_binary::is_binary;
 use crate::meta_plugin::{MetaPlugin, MetaPluginResponse, MetaPluginType};
-use crate::tokenizer::{TokenEncoding, Tokenizer};
+use crate::tokenizer::{TokenEncoding, get_tokenizer};
 
 #[derive(Debug, Clone)]
 pub struct TokensMetaPlugin {
@@ -15,8 +15,8 @@ pub struct TokensMetaPlugin {
     /// UTF-8 boundary carry buffer.
     utf8_buffer: Vec<u8>,
     base: crate::meta_plugin::BaseMetaPlugin,
-    /// The tokenizer instance.
-    tokenizer: Tokenizer,
+    /// The tokenizer encoding.
+    encoding: TokenEncoding,
 }
 
 impl TokensMetaPlugin {
@@ -59,8 +59,6 @@ impl TokensMetaPlugin {
             .and_then(|s| s.parse::<TokenEncoding>().ok())
             .unwrap_or_default();
-        let tokenizer = Tokenizer::new(encoding).expect("Failed to create tokenizer");
-
         Self {
             buffer: Some(Vec::new()),
             max_buffer_size,
@@ -69,7 +67,7 @@ impl TokensMetaPlugin {
             token_count: 0,
             utf8_buffer: Vec::new(),
             base,
-            tokenizer,
+            encoding,
         }
     }
@@ -77,36 +75,59 @@ impl TokensMetaPlugin {
     ///
     /// Combines with any pending UTF-8 carry bytes, converts to text,
     /// and adds the token count to the running total.
+    ///
+    /// Avoids unnecessary allocations when there is no pending UTF-8 carry
+    /// and the data is valid UTF-8.
     fn count_tokens(&mut self, data: &[u8]) {
         if data.is_empty() && self.utf8_buffer.is_empty() {
             return;
         }
-        let combined = if !self.utf8_buffer.is_empty() {
-            let mut c = self.utf8_buffer.clone();
-            c.extend_from_slice(data);
-            c
-        } else {
-            data.to_vec()
-        };
-        self.utf8_buffer.clear();
-        let text = match std::str::from_utf8(&combined) {
-            Ok(t) => t,
-            Err(e) => {
-                let valid = e.valid_up_to();
-                if valid < combined.len() {
-                    self.utf8_buffer.extend_from_slice(&combined[valid..]);
-                }
-                match std::str::from_utf8(&combined[..valid]) {
-                    Ok(t) => t,
-                    Err(_) => return,
-                }
-            }
-        };
-        if !text.is_empty() {
-            self.token_count += self.tokenizer.count(text);
-        }
+        let tokenizer = get_tokenizer(self.encoding);
+        if self.utf8_buffer.is_empty() {
+            // Fast path: no pending carry — try to use data directly
+            match std::str::from_utf8(data) {
+                Ok(text) => {
+                    if !text.is_empty() {
+                        self.token_count += tokenizer.count(text);
+                    }
+                    return;
+                }
+                Err(e) => {
+                    let valid_up_to = e.valid_up_to();
+                    if valid_up_to > 0 {
+                        // Count the valid prefix without copying
+                        let text =
+                            std::str::from_utf8(&data[..valid_up_to]).expect("validated prefix");
+                        self.token_count += tokenizer.count(text);
+                    }
+                    // Save invalid trailing bytes for next call
+                    self.utf8_buffer.extend_from_slice(&data[valid_up_to..]);
+                    return;
+                }
+            }
+        }
+        // Slow path: pending carry bytes — must build combined buffer
+        let mut combined = std::mem::take(&mut self.utf8_buffer);
+        combined.extend_from_slice(data);
+        match std::str::from_utf8(&combined) {
+            Ok(text) => {
+                if !text.is_empty() {
+                    self.token_count += tokenizer.count(text);
+                }
+            }
+            Err(e) => {
+                let valid_up_to = e.valid_up_to();
+                if valid_up_to > 0 {
+                    let text =
+                        std::str::from_utf8(&combined[..valid_up_to]).expect("validated prefix");
+                    self.token_count += tokenizer.count(text);
+                }
+                self.utf8_buffer.extend_from_slice(&combined[valid_up_to..]);
+            }
+        }
     }
@@ -149,8 +170,8 @@ impl MetaPlugin for TokensMetaPlugin {
             };
             if should_detect {
-                let buf_clone = self.buffer.as_ref().unwrap().clone();
-                let is_binary = self.detect_binary(&buf_clone);
+                let buffer_data = self.buffer.as_ref().unwrap().clone();
+                let is_binary = self.detect_binary(&buffer_data);
                 if is_binary {
                     if let Some(md) = crate::meta_plugin::process_metadata_outputs(
@@ -168,19 +189,10 @@ impl MetaPlugin for TokensMetaPlugin {
                     };
                 }
-                // It's text — tokenize the full accumulated buffer
-                self.count_tokens(&buf_clone);
-                if buf_clone.len() >= self.max_buffer_size {
-                    self.buffer = None;
-                }
-            } else if self.buffer.is_some() {
-                // Still building up buffer — tokenize what was just added
-                let remaining = self
-                    .max_buffer_size
-                    .saturating_sub(self.buffer.as_ref().map_or(0, |b| b.len()));
-                let to_take = std::cmp::min(data.len(), remaining);
-                self.count_tokens(&data[..to_take]);
+                // It's text — tokenize the full buffer (nothing was counted yet),
+                // then clear to avoid double-counting in finalize().
+                self.count_tokens(&buffer_data);
+                self.buffer = Some(Vec::new());
             }
         } else if self.is_binary_content == Some(false) {
             self.count_tokens(data);
@@ -212,8 +224,8 @@ impl MetaPlugin for TokensMetaPlugin {
         if self.is_binary_content.is_none() {
             if let Some(buffer) = &self.buffer {
                 if !buffer.is_empty() {
-                    let buf_clone = buffer.clone();
-                    let is_binary = self.detect_binary(&buf_clone);
+                    let buffer_data = buffer.clone();
+                    let is_binary = self.detect_binary(&buffer_data);
                     if is_binary {
                         if let Some(md) = crate::meta_plugin::process_metadata_outputs(
@@ -234,6 +246,12 @@ impl MetaPlugin for TokensMetaPlugin {
             }
         }
+        // Tokenize any bytes in the buffer
+        if let Some(buffer) = &self.buffer {
+            let data = buffer.clone();
+            self.count_tokens(&data);
+        }
+
         // Process any remaining UTF-8 bytes
         if !self.utf8_buffer.is_empty() {
             self.count_tokens(&[]);