feat: add plugin schema system, tokenizer cache, and config validation

- Add plugin schema types and runtime discovery for meta/filter plugins
- Rewrite --generate-config to use schema system instead of hardcoded types
- Add Settings::validate_config() for startup validation
- Cache tokenizer instances via static Lazy to avoid repeated BPE loading (see the sketch after this list)
- Add split_by_token_iter() and count_bounded() to Tokenizer
- Fix double-counting bug in TokensMetaPlugin when buffer < max_buffer_size
- Eliminate unnecessary allocations in token count methods
- Refactor token filters: remove Option<Tokenizer>, use iterator API
- Fix TailTokensFilter correctness: unbounded buffer instead of ring buffer
- Add encoding option to all token filters
- Add description() to MetaPlugin and FilterPlugin traits
- Fix unused_mut warning in compression engine (feature-gated code)
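
For reference, the cache behind get_tokenizer() is plausibly one static Lazy
instance per encoding. A minimal sketch of the approach only — the encoding
variant names below are assumptions, not the actual enum:

    use once_cell::sync::Lazy;
    use crate::tokenizer::{TokenEncoding, Tokenizer};

    // Built once on first use, then shared for the life of the process,
    // so constructing a plugin no longer reloads the BPE ranks each time.
    static CL100K: Lazy<Tokenizer> = Lazy::new(|| {
        Tokenizer::new(TokenEncoding::Cl100kBase).expect("Failed to create tokenizer")
    });
    static O200K: Lazy<Tokenizer> = Lazy::new(|| {
        Tokenizer::new(TokenEncoding::O200kBase).expect("Failed to create tokenizer")
    });

    pub fn get_tokenizer(encoding: TokenEncoding) -> &'static Tokenizer {
        match encoding {
            TokenEncoding::Cl100kBase => &CL100K,
            TokenEncoding::O200kBase => &O200K,
        }
    }

This is why TokensMetaPlugin can store just the TokenEncoding and look the
instance up on demand. The new count_bounded() is called the same way; its
signature is assumed here, since it does not appear in this file's diff:

    // Hypothetical shape: stop encoding once `limit` tokens have been
    // counted instead of tokenizing the entire input.
    let n = get_tokenizer(encoding).count_bounded(text, limit);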

Co-Authored-By: code-review-bot <noreply@anthropic.com>
2026-03-13 20:23:17 -03:00
parent 914190e119
commit e7d8a83369
16 changed files with 831 additions and 420 deletions


@@ -1,7 +1,7 @@
 use crate::common::PIPESIZE;
 use crate::common::is_binary::is_binary;
 use crate::meta_plugin::{MetaPlugin, MetaPluginResponse, MetaPluginType};
-use crate::tokenizer::{TokenEncoding, Tokenizer};
+use crate::tokenizer::{TokenEncoding, get_tokenizer};
 
 #[derive(Debug, Clone)]
 pub struct TokensMetaPlugin {
@@ -15,8 +15,8 @@ pub struct TokensMetaPlugin {
     /// UTF-8 boundary carry buffer.
     utf8_buffer: Vec<u8>,
     base: crate::meta_plugin::BaseMetaPlugin,
-    /// The tokenizer instance.
-    tokenizer: Tokenizer,
+    /// The tokenizer encoding.
+    encoding: TokenEncoding,
 }
 
 impl TokensMetaPlugin {
@@ -59,8 +59,6 @@ impl TokensMetaPlugin {
             .and_then(|s| s.parse::<TokenEncoding>().ok())
             .unwrap_or_default();
-        let tokenizer = Tokenizer::new(encoding).expect("Failed to create tokenizer");
-
         Self {
             buffer: Some(Vec::new()),
             max_buffer_size,
@@ -69,7 +67,7 @@ impl TokensMetaPlugin {
             token_count: 0,
             utf8_buffer: Vec::new(),
             base,
-            tokenizer,
+            encoding,
         }
     }
@@ -77,36 +75,59 @@ impl TokensMetaPlugin {
     ///
     /// Combines with any pending UTF-8 carry bytes, converts to text,
     /// and adds the token count to the running total.
+    ///
+    /// Avoids unnecessary allocations when there is no pending UTF-8 carry
+    /// and the data is valid UTF-8.
     fn count_tokens(&mut self, data: &[u8]) {
         if data.is_empty() && self.utf8_buffer.is_empty() {
             return;
         }
-        let combined = if !self.utf8_buffer.is_empty() {
-            let mut c = self.utf8_buffer.clone();
-            c.extend_from_slice(data);
-            c
-        } else {
-            data.to_vec()
-        };
-        self.utf8_buffer.clear();
-        let text = match std::str::from_utf8(&combined) {
-            Ok(t) => t,
-            Err(e) => {
-                let valid = e.valid_up_to();
-                if valid < combined.len() {
-                    self.utf8_buffer.extend_from_slice(&combined[valid..]);
-                }
-                match std::str::from_utf8(&combined[..valid]) {
-                    Ok(t) => t,
-                    Err(_) => return,
-                }
-            }
-        };
-        if !text.is_empty() {
-            self.token_count += self.tokenizer.count(text);
-        }
+        let tokenizer = get_tokenizer(self.encoding);
+        if self.utf8_buffer.is_empty() {
+            // Fast path: no pending carry — try to use data directly
+            match std::str::from_utf8(data) {
+                Ok(text) => {
+                    if !text.is_empty() {
+                        self.token_count += tokenizer.count(text);
+                    }
+                    return;
+                }
+                Err(e) => {
+                    let valid_up_to = e.valid_up_to();
+                    if valid_up_to > 0 {
+                        // Count the valid prefix without copying
+                        let text =
+                            std::str::from_utf8(&data[..valid_up_to]).expect("validated prefix");
+                        self.token_count += tokenizer.count(text);
+                    }
+                    // Save invalid trailing bytes for next call
+                    self.utf8_buffer.extend_from_slice(&data[valid_up_to..]);
+                    return;
+                }
+            }
+        }
+        // Slow path: pending carry bytes — must build combined buffer
+        let mut combined = std::mem::take(&mut self.utf8_buffer);
+        combined.extend_from_slice(data);
+        match std::str::from_utf8(&combined) {
+            Ok(text) => {
+                if !text.is_empty() {
+                    self.token_count += tokenizer.count(text);
+                }
+            }
+            Err(e) => {
+                let valid_up_to = e.valid_up_to();
+                if valid_up_to > 0 {
+                    let text =
+                        std::str::from_utf8(&combined[..valid_up_to]).expect("validated prefix");
+                    self.token_count += tokenizer.count(text);
+                }
+                self.utf8_buffer.extend_from_slice(&combined[valid_up_to..]);
+            }
+        }
     }
@@ -149,8 +170,8 @@ impl MetaPlugin for TokensMetaPlugin {
             };
             if should_detect {
-                let buf_clone = self.buffer.as_ref().unwrap().clone();
-                let is_binary = self.detect_binary(&buf_clone);
+                let buffer_data = self.buffer.as_ref().unwrap().clone();
+                let is_binary = self.detect_binary(&buffer_data);
                 if is_binary {
                     if let Some(md) = crate::meta_plugin::process_metadata_outputs(
@@ -168,19 +189,10 @@ impl MetaPlugin for TokensMetaPlugin {
                     };
                 }
-                // It's text — tokenize the full accumulated buffer
-                self.count_tokens(&buf_clone);
-                if buf_clone.len() >= self.max_buffer_size {
-                    self.buffer = None;
-                }
-            } else if self.buffer.is_some() {
-                // Still building up buffer — tokenize what was just added
-                let remaining = self
-                    .max_buffer_size
-                    .saturating_sub(self.buffer.as_ref().map_or(0, |b| b.len()));
-                let to_take = std::cmp::min(data.len(), remaining);
-                self.count_tokens(&data[..to_take]);
+                // It's text — tokenize the full buffer (nothing was counted yet),
+                // then clear to avoid double-counting in finalize().
+                self.count_tokens(&buffer_data);
+                self.buffer = Some(Vec::new());
             }
         } else if self.is_binary_content == Some(false) {
             self.count_tokens(data);
@@ -212,8 +224,8 @@ impl MetaPlugin for TokensMetaPlugin {
         if self.is_binary_content.is_none() {
             if let Some(buffer) = &self.buffer {
                 if !buffer.is_empty() {
-                    let buf_clone = buffer.clone();
-                    let is_binary = self.detect_binary(&buf_clone);
+                    let buffer_data = buffer.clone();
+                    let is_binary = self.detect_binary(&buffer_data);
                     if is_binary {
                         if let Some(md) = crate::meta_plugin::process_metadata_outputs(
@@ -234,6 +246,12 @@ impl MetaPlugin for TokensMetaPlugin {
             }
         }
+        // Tokenize any bytes in the buffer
+        if let Some(buffer) = &self.buffer {
+            let data = buffer.clone();
+            self.count_tokens(&data);
+        }
+
         // Process any remaining UTF-8 bytes
         if !self.utf8_buffer.is_empty() {
             self.count_tokens(&[]);