feat: add plugin schema system, tokenizer cache, and config validation
- Add plugin schema types and runtime discovery for meta/filter plugins
- Rewrite --generate-config to use schema system instead of hardcoded types
- Add Settings::validate_config() for startup validation
- Cache tokenizer instances via static Lazy to avoid repeated BPE loading (see the sketch below)
- Add split_by_token_iter() and count_bounded() to Tokenizer
- Fix double-counting bug in TokensMetaPlugin when buffer < max_buffer_size
- Eliminate unnecessary allocations in token count methods
- Refactor token filters: remove Option<Tokenizer>, use iterator API
- Fix TailTokensFilter correctness: unbounded buffer instead of ring buffer
- Add encoding option to all token filters
- Add description() to MetaPlugin and FilterPlugin traits
- Fix unused_mut warning in compression engine (feature-gated code)

Co-Authored-By: code-review-bot <noreply@anthropic.com>
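The "static Lazy" tokenizer cache could look roughly like the sketch below. This is an illustrative reconstruction, not the crate's actual implementation: the TokenEncoding variants, the Tokenizer internals, and the use of once_cell are assumptions; only the get_tokenizer(encoding) entry point and the one-cached-instance-per-encoding idea come from the commit message and the diff that follows.

// Hypothetical sketch of a per-encoding tokenizer cache behind a static Lazy.
// The types below are stand-ins; the real crate's Tokenizer and TokenEncoding differ.
use std::collections::HashMap;
use std::sync::{Arc, Mutex};

use once_cell::sync::Lazy;

#[derive(Clone, Copy, PartialEq, Eq, Hash, Default)]
enum TokenEncoding {
    #[default]
    Cl100kBase,
    O200kBase,
}

struct Tokenizer;

impl Tokenizer {
    fn new(_encoding: TokenEncoding) -> Self {
        // Stand-in for the expensive BPE table load the cache is meant to avoid repeating.
        Tokenizer
    }

    fn count(&self, text: &str) -> usize {
        // Stand-in for real BPE tokenization.
        text.split_whitespace().count()
    }
}

// One shared tokenizer per encoding, built lazily on first use.
static TOKENIZERS: Lazy<Mutex<HashMap<TokenEncoding, Arc<Tokenizer>>>> =
    Lazy::new(|| Mutex::new(HashMap::new()));

fn get_tokenizer(encoding: TokenEncoding) -> Arc<Tokenizer> {
    let mut cache = TOKENIZERS.lock().unwrap();
    cache
        .entry(encoding)
        .or_insert_with(|| Arc::new(Tokenizer::new(encoding)))
        .clone()
}

fn main() {
    let a = get_tokenizer(TokenEncoding::Cl100kBase);
    let b = get_tokenizer(TokenEncoding::Cl100kBase);
    assert!(Arc::ptr_eq(&a, &b)); // second call reuses the cached instance
    let _other = get_tokenizer(TokenEncoding::O200kBase); // a second encoding gets its own entry
    println!("{}", a.count("tokens are counted once the cache is warm"));
}

With a cache like this, TokensMetaPlugin only needs to hold its encoding and call get_tokenizer(self.encoding) at counting time, which is the shape the struct and count_tokens changes in the diff below take.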
@@ -1,7 +1,7 @@
 use crate::common::PIPESIZE;
 use crate::common::is_binary::is_binary;
 use crate::meta_plugin::{MetaPlugin, MetaPluginResponse, MetaPluginType};
-use crate::tokenizer::{TokenEncoding, Tokenizer};
+use crate::tokenizer::{TokenEncoding, get_tokenizer};
 
 #[derive(Debug, Clone)]
 pub struct TokensMetaPlugin {
@@ -15,8 +15,8 @@ pub struct TokensMetaPlugin {
     /// UTF-8 boundary carry buffer.
     utf8_buffer: Vec<u8>,
     base: crate::meta_plugin::BaseMetaPlugin,
-    /// The tokenizer instance.
-    tokenizer: Tokenizer,
+    /// The tokenizer encoding.
+    encoding: TokenEncoding,
 }
 
 impl TokensMetaPlugin {
@@ -59,8 +59,6 @@ impl TokensMetaPlugin {
             .and_then(|s| s.parse::<TokenEncoding>().ok())
             .unwrap_or_default();
 
-        let tokenizer = Tokenizer::new(encoding).expect("Failed to create tokenizer");
-
         Self {
             buffer: Some(Vec::new()),
             max_buffer_size,
@@ -69,7 +67,7 @@
             token_count: 0,
             utf8_buffer: Vec::new(),
             base,
-            tokenizer,
+            encoding,
         }
     }
 
@@ -77,36 +75,59 @@
     ///
     /// Combines with any pending UTF-8 carry bytes, converts to text,
     /// and adds the token count to the running total.
+    ///
+    /// Avoids unnecessary allocations when there is no pending UTF-8 carry
+    /// and the data is valid UTF-8.
     fn count_tokens(&mut self, data: &[u8]) {
         if data.is_empty() && self.utf8_buffer.is_empty() {
             return;
         }
 
-        let combined = if !self.utf8_buffer.is_empty() {
-            let mut c = self.utf8_buffer.clone();
-            c.extend_from_slice(data);
-            c
-        } else {
-            data.to_vec()
-        };
-        self.utf8_buffer.clear();
+        let tokenizer = get_tokenizer(self.encoding);
 
-        let text = match std::str::from_utf8(&combined) {
-            Ok(t) => t,
-            Err(e) => {
-                let valid = e.valid_up_to();
-                if valid < combined.len() {
-                    self.utf8_buffer.extend_from_slice(&combined[valid..]);
+        if self.utf8_buffer.is_empty() {
+            // Fast path: no pending carry — try to use data directly
+            match std::str::from_utf8(data) {
+                Ok(text) => {
+                    if !text.is_empty() {
+                        self.token_count += tokenizer.count(text);
+                    }
+                    return;
                 }
-                match std::str::from_utf8(&combined[..valid]) {
-                    Ok(t) => t,
-                    Err(_) => return,
+                Err(e) => {
+                    let valid_up_to = e.valid_up_to();
+                    if valid_up_to > 0 {
+                        // Count the valid prefix without copying
+                        let text =
+                            std::str::from_utf8(&data[..valid_up_to]).expect("validated prefix");
+                        self.token_count += tokenizer.count(text);
+                    }
+                    // Save invalid trailing bytes for next call
+                    self.utf8_buffer.extend_from_slice(&data[valid_up_to..]);
+                    return;
                 }
             }
-        };
+        }
 
-        if !text.is_empty() {
-            self.token_count += self.tokenizer.count(text);
+        // Slow path: pending carry bytes — must build combined buffer
+        let mut combined = std::mem::take(&mut self.utf8_buffer);
+        combined.extend_from_slice(data);
+
+        match std::str::from_utf8(&combined) {
+            Ok(text) => {
+                if !text.is_empty() {
+                    self.token_count += tokenizer.count(text);
+                }
+            }
+            Err(e) => {
+                let valid_up_to = e.valid_up_to();
+                if valid_up_to > 0 {
+                    let text =
+                        std::str::from_utf8(&combined[..valid_up_to]).expect("validated prefix");
+                    self.token_count += tokenizer.count(text);
+                }
+                self.utf8_buffer.extend_from_slice(&combined[valid_up_to..]);
+            }
         }
     }
 
@@ -149,8 +170,8 @@ impl MetaPlugin for TokensMetaPlugin {
             };
 
             if should_detect {
-                let buf_clone = self.buffer.as_ref().unwrap().clone();
-                let is_binary = self.detect_binary(&buf_clone);
+                let buffer_data = self.buffer.as_ref().unwrap().clone();
+                let is_binary = self.detect_binary(&buffer_data);
 
                 if is_binary {
                     if let Some(md) = crate::meta_plugin::process_metadata_outputs(
@@ -168,19 +189,10 @@ impl MetaPlugin for TokensMetaPlugin {
                     };
                 }
 
-                // It's text — tokenize the full accumulated buffer
-                self.count_tokens(&buf_clone);
-
-                if buf_clone.len() >= self.max_buffer_size {
-                    self.buffer = None;
-                }
-            } else if self.buffer.is_some() {
-                // Still building up buffer — tokenize what was just added
-                let remaining = self
-                    .max_buffer_size
-                    .saturating_sub(self.buffer.as_ref().map_or(0, |b| b.len()));
-                let to_take = std::cmp::min(data.len(), remaining);
-                self.count_tokens(&data[..to_take]);
+                // It's text — tokenize the full buffer (nothing was counted yet),
+                // then clear to avoid double-counting in finalize().
+                self.count_tokens(&buffer_data);
+                self.buffer = Some(Vec::new());
             }
         } else if self.is_binary_content == Some(false) {
             self.count_tokens(data);
@@ -212,8 +224,8 @@ impl MetaPlugin for TokensMetaPlugin {
         if self.is_binary_content.is_none() {
             if let Some(buffer) = &self.buffer {
                 if !buffer.is_empty() {
-                    let buf_clone = buffer.clone();
-                    let is_binary = self.detect_binary(&buf_clone);
+                    let buffer_data = buffer.clone();
+                    let is_binary = self.detect_binary(&buffer_data);
 
                     if is_binary {
                         if let Some(md) = crate::meta_plugin::process_metadata_outputs(
@@ -234,6 +246,12 @@ impl MetaPlugin for TokensMetaPlugin {
             }
         }
 
+        // Tokenize any bytes in the buffer
+        if let Some(buffer) = &self.buffer {
+            let data = buffer.clone();
+            self.count_tokens(&data);
+        }
+
         // Process any remaining UTF-8 bytes
         if !self.utf8_buffer.is_empty() {
             self.count_tokens(&[]);