feat: add LLM token counting meta plugin and token filters

Add tiktoken-based token counting via new 'tokens' feature flag.

New components:
- Shared tokenizer module wrapping tiktoken CoreBPE (cl100k_base, o200k_base)
- TokensMetaPlugin: streaming token counter, tokenizes each chunk independently
- head_tokens(N): stream first N tokens, split at exact boundary when mid-chunk
- skip_tokens(N): skip first N tokens, stream the rest
- tail_tokens(N): bounded ring buffer (~16KB), outputs last N tokens at finalize

All filters are fully streaming — no full-stream buffering.
Meta plugin accuracy: exact for normal text, ±1-2 tokens if a long whitespace
sequence spans a chunk boundary.

Also: add 'client' and 'tokens' to default features, add curl to Dockerfile builder stage.
This commit is contained in:
2026-03-13 16:48:31 -03:00
parent e672ec751e
commit 914190e119
9 changed files with 1128 additions and 3 deletions

295
src/meta_plugin/tokens.rs Normal file
View File

@@ -0,0 +1,295 @@
use crate::common::PIPESIZE;
use crate::common::is_binary::is_binary;
use crate::meta_plugin::{MetaPlugin, MetaPluginResponse, MetaPluginType};
use crate::tokenizer::{TokenEncoding, Tokenizer};
#[derive(Debug, Clone)]
pub struct TokensMetaPlugin {
    /// Buffer for binary detection (up to PIPESIZE bytes).
    buffer: Option<Vec<u8>>,
    /// Cap on `buffer` growth; taken from the `token_detect_size` option.
    max_buffer_size: usize,
    /// Set once the plugin has produced its final response.
    is_finalized: bool,
    /// `None` until binary detection has run; then `Some(is_binary)`.
    is_binary_content: Option<bool>,
    /// Running token count accumulated across chunks.
    token_count: usize,
    /// UTF-8 boundary carry buffer.
    utf8_buffer: Vec<u8>,
    /// Shared option/output bookkeeping common to all meta plugins.
    base: crate::meta_plugin::BaseMetaPlugin,
    /// The tokenizer instance.
    tokenizer: Tokenizer,
}
impl TokensMetaPlugin {
    /// Build a new token-counting plugin.
    ///
    /// Recognized options (defaults applied when absent):
    /// - `token_detect_size`: bytes buffered for binary detection (default: PIPESIZE)
    /// - `encoding`: tokenizer encoding name (default: `"cl100k_base"`)
    pub fn new(
        options: Option<std::collections::HashMap<String, serde_yaml::Value>>,
        outputs: Option<std::collections::HashMap<String, serde_yaml::Value>>,
    ) -> Self {
        let mut base = crate::meta_plugin::BaseMetaPlugin::new();
        base.initialize_plugin(&["token_count"], &options, &outputs);

        // Fill in defaults without overriding user-supplied options.
        let default_options = vec![
            (
                "token_detect_size",
                serde_yaml::Value::Number(PIPESIZE.into()),
            ),
            (
                "encoding",
                serde_yaml::Value::String("cl100k_base".to_string()),
            ),
        ];
        for (key, value) in default_options {
            base.options.entry(key.to_string()).or_insert(value);
        }

        let max_buffer_size = base
            .options
            .get("token_detect_size")
            .and_then(|v| v.as_u64())
            .unwrap_or(PIPESIZE as u64) as usize;
        let encoding = base
            .options
            .get("encoding")
            .and_then(|v| v.as_str())
            .and_then(|s| s.parse::<TokenEncoding>().ok())
            .unwrap_or_default();
        // `encoding` is validated/defaulted above, so construction failure
        // would be a programming error, not bad user input.
        let tokenizer = Tokenizer::new(encoding).expect("Failed to create tokenizer");

        Self {
            buffer: Some(Vec::new()),
            max_buffer_size,
            is_finalized: false,
            is_binary_content: None,
            token_count: 0,
            utf8_buffer: Vec::new(),
            base,
            tokenizer,
        }
    }

    /// Tokenize a byte chunk, handling UTF-8 boundaries.
    ///
    /// Bytes carried over from a previous chunk are prepended first. A
    /// sequence that is merely *incomplete* at the end of the chunk
    /// (`error_len() == None`) is carried into `utf8_buffer` for the next
    /// call; an *invalid* sequence (`error_len() == Some(_)`) is skipped so
    /// counting continues past it. The previous implementation re-buffered
    /// invalid bytes (and everything after them) on every call, so a single
    /// bad byte made `utf8_buffer` grow without bound and stopped all
    /// further counting.
    fn count_tokens(&mut self, data: &[u8]) {
        if data.is_empty() && self.utf8_buffer.is_empty() {
            return;
        }
        // Prepend pending carry bytes; `take` avoids cloning the carry.
        let mut combined = std::mem::take(&mut self.utf8_buffer);
        combined.extend_from_slice(data);

        let mut rest: &[u8] = &combined;
        while !rest.is_empty() {
            match std::str::from_utf8(rest) {
                Ok(text) => {
                    self.token_count += self.tokenizer.count(text);
                    break;
                }
                Err(e) => {
                    let valid = e.valid_up_to();
                    if valid > 0 {
                        // The prefix up to `valid` is guaranteed valid UTF-8.
                        if let Ok(text) = std::str::from_utf8(&rest[..valid]) {
                            self.token_count += self.tokenizer.count(text);
                        }
                    }
                    match e.error_len() {
                        // Invalid sequence: skip it and keep counting.
                        Some(bad) => rest = &rest[valid + bad..],
                        // Truncated sequence at the end: carry to next chunk.
                        None => {
                            self.utf8_buffer.extend_from_slice(&rest[valid..]);
                            break;
                        }
                    }
                }
            }
        }
    }

    /// Run binary detection on `buffer` and cache the verdict.
    fn detect_binary(&mut self, buffer: &[u8]) -> bool {
        let result = is_binary(buffer);
        self.is_binary_content = Some(result);
        result
    }
}
impl MetaPlugin for TokensMetaPlugin {
    fn is_finalized(&self) -> bool {
        self.is_finalized
    }

    fn set_finalized(&mut self, finalized: bool) {
        self.is_finalized = finalized;
    }

    /// Feed a chunk of stream data into the counter.
    ///
    /// Until binary detection has run, bytes are accumulated in `buffer` and
    /// the buffer is tokenized exactly once — when detection fires here, or
    /// in `finalize` for short streams. The previous implementation both
    /// tokenized each pre-detection chunk incrementally AND re-tokenized the
    /// whole buffer at detection time (double-counting every pre-detection
    /// byte), and silently dropped bytes of the triggering chunk that did
    /// not fit into the detection buffer.
    fn update(&mut self, data: &[u8]) -> MetaPluginResponse {
        if self.is_finalized {
            return MetaPluginResponse {
                metadata: Vec::new(),
                is_finalized: true,
            };
        }
        let mut metadata = Vec::new();
        match self.is_binary_content {
            Some(true) => {
                // Binary stream: nothing to count, shut down immediately.
                self.is_finalized = true;
                return MetaPluginResponse {
                    metadata: Vec::new(),
                    is_finalized: true,
                };
            }
            Some(false) => {
                // Detection already ran and the stream is text.
                self.count_tokens(data);
            }
            None => {
                // Accumulate bytes for binary detection.
                let mut taken = data.len();
                let should_detect = if let Some(ref mut buffer) = self.buffer {
                    let remaining = self.max_buffer_size.saturating_sub(buffer.len());
                    taken = std::cmp::min(data.len(), remaining);
                    buffer.extend_from_slice(&data[..taken]);
                    buffer.len() >= std::cmp::min(1024, self.max_buffer_size)
                } else {
                    false
                };
                if should_detect {
                    // Take ownership: detection is a one-shot, the buffer is
                    // no longer needed afterwards either way.
                    let buf = self.buffer.take().unwrap_or_default();
                    if self.detect_binary(&buf) {
                        if let Some(md) = crate::meta_plugin::process_metadata_outputs(
                            "token_count",
                            serde_yaml::Value::Null,
                            self.base.outputs(),
                        ) {
                            metadata.push(md);
                        }
                        self.is_finalized = true;
                        return MetaPluginResponse {
                            metadata,
                            is_finalized: true,
                        };
                    }
                    // Text: tokenize the accumulated buffer exactly once,
                    // plus any bytes of this chunk that overflowed it.
                    self.count_tokens(&buf);
                    if taken < data.len() {
                        self.count_tokens(&data[taken..]);
                    }
                }
                // Otherwise keep buffering; the buffer is tokenized at
                // detection time or in `finalize`, never incrementally.
            }
        }
        MetaPluginResponse {
            metadata,
            is_finalized: self.is_finalized,
        }
    }

    /// Emit the final `token_count` metadata.
    ///
    /// If detection never fired during `update` (stream shorter than the
    /// detection threshold), it runs here on whatever was buffered; for text
    /// the buffered bytes are tokenized now, since `update` defers counting
    /// until detection.
    fn finalize(&mut self) -> MetaPluginResponse {
        if self.is_finalized {
            return MetaPluginResponse {
                metadata: Vec::new(),
                is_finalized: true,
            };
        }
        let mut metadata = Vec::new();
        if self.is_binary_content.is_none() {
            if let Some(buffer) = self.buffer.take() {
                if !buffer.is_empty() {
                    if self.detect_binary(&buffer) {
                        if let Some(md) = crate::meta_plugin::process_metadata_outputs(
                            "token_count",
                            serde_yaml::Value::Null,
                            self.base.outputs(),
                        ) {
                            metadata.push(md);
                        }
                        self.is_finalized = true;
                        return MetaPluginResponse {
                            metadata,
                            is_finalized: true,
                        };
                    }
                    // Short text stream: the buffer was never tokenized in
                    // `update`, so count it now.
                    self.count_tokens(&buffer);
                }
            }
        }
        // Flush any pending UTF-8 carry bytes.
        if !self.utf8_buffer.is_empty() {
            self.count_tokens(&[]);
        }
        // Emit the final count (stringified, matching the output contract).
        if let Some(md) = crate::meta_plugin::process_metadata_outputs(
            "token_count",
            serde_yaml::Value::String(self.token_count.to_string()),
            self.base.outputs(),
        ) {
            metadata.push(md);
        }
        self.buffer = None;
        self.is_finalized = true;
        MetaPluginResponse {
            metadata,
            is_finalized: true,
        }
    }

    fn meta_type(&self) -> MetaPluginType {
        MetaPluginType::Tokens
    }

    fn outputs(&self) -> &std::collections::HashMap<String, serde_yaml::Value> {
        self.base.outputs()
    }

    fn outputs_mut(
        &mut self,
    ) -> anyhow::Result<&mut std::collections::HashMap<String, serde_yaml::Value>> {
        Ok(self.base.outputs_mut())
    }

    fn default_outputs(&self) -> Vec<String> {
        vec!["token_count".to_string()]
    }

    fn options(&self) -> &std::collections::HashMap<String, serde_yaml::Value> {
        self.base.options()
    }

    fn options_mut(
        &mut self,
    ) -> anyhow::Result<&mut std::collections::HashMap<String, serde_yaml::Value>> {
        Ok(self.base.options_mut())
    }
}
use crate::meta_plugin::register_meta_plugin;

/// Register the tokens plugin factory with the global meta-plugin registry
/// at program startup (via the `ctor` constructor hook).
#[ctor::ctor]
fn register_tokens_plugin() {
    register_meta_plugin(MetaPluginType::Tokens, |opts, outs| {
        Box::new(TokensMetaPlugin::new(opts, outs))
    });
}