feat: add LLM token counting meta plugin and token filters
Add tiktoken-based token counting via new 'tokens' feature flag. New components: - Shared tokenizer module wrapping tiktoken CoreBPE (cl100k_base, o200k_base) - TokensMetaPlugin: streaming token counter, tokenizes each chunk independently - head_tokens(N): stream first N tokens, split at exact boundary when mid-chunk - skip_tokens(N): skip first N tokens, stream the rest - tail_tokens(N): bounded ring buffer (~16KB), outputs last N tokens at finalize All filters are fully streaming — no full-stream buffering. Meta plugin accuracy: exact for normal text, ±1-2 tokens if long whitespace sequence spans a chunk boundary. Also: add 'client' and 'tokens' to default features, add curl to Dockerfile builder stage.
This commit is contained in:
295
src/meta_plugin/tokens.rs
Normal file
295
src/meta_plugin/tokens.rs
Normal file
@@ -0,0 +1,295 @@
|
||||
use crate::common::PIPESIZE;
|
||||
use crate::common::is_binary::is_binary;
|
||||
use crate::meta_plugin::{MetaPlugin, MetaPluginResponse, MetaPluginType};
|
||||
use crate::tokenizer::{TokenEncoding, Tokenizer};
|
||||
|
||||
/// Streaming token counter backed by a tiktoken encoding.
///
/// The start of the stream is buffered for binary detection; binary streams
/// are reported with a null `token_count`, text streams are tokenized chunk
/// by chunk with a UTF-8 carry between chunks.
#[derive(Debug, Clone)]
pub struct TokensMetaPlugin {
    /// Buffer for binary detection (up to `max_buffer_size` bytes, default
    /// PIPESIZE). Set to `None` once detection is done and it is no longer
    /// needed.
    buffer: Option<Vec<u8>>,
    /// Capacity cap for `buffer` (the `token_detect_size` option).
    max_buffer_size: usize,
    /// True once the plugin has emitted its final metadata.
    is_finalized: bool,
    /// Binary-detection verdict; `None` until detection has run.
    is_binary_content: Option<bool>,
    /// Running token count accumulated across chunks.
    token_count: usize,
    /// UTF-8 boundary carry buffer (bytes of a multi-byte sequence split
    /// across chunk boundaries).
    utf8_buffer: Vec<u8>,
    /// Shared option/output plumbing common to all meta plugins.
    base: crate::meta_plugin::BaseMetaPlugin,
    /// The tokenizer instance.
    tokenizer: Tokenizer,
}
|
||||
|
||||
impl TokensMetaPlugin {
|
||||
pub fn new(
|
||||
options: Option<std::collections::HashMap<String, serde_yaml::Value>>,
|
||||
outputs: Option<std::collections::HashMap<String, serde_yaml::Value>>,
|
||||
) -> Self {
|
||||
let mut base = crate::meta_plugin::BaseMetaPlugin::new();
|
||||
|
||||
base.initialize_plugin(&["token_count"], &options, &outputs);
|
||||
|
||||
// Set default options
|
||||
let default_options = vec![
|
||||
(
|
||||
"token_detect_size",
|
||||
serde_yaml::Value::Number(PIPESIZE.into()),
|
||||
),
|
||||
(
|
||||
"encoding",
|
||||
serde_yaml::Value::String("cl100k_base".to_string()),
|
||||
),
|
||||
];
|
||||
|
||||
for (key, value) in default_options {
|
||||
if !base.options.contains_key(key) {
|
||||
base.options.insert(key.to_string(), value);
|
||||
}
|
||||
}
|
||||
|
||||
let max_buffer_size = base
|
||||
.options
|
||||
.get("token_detect_size")
|
||||
.and_then(|v| v.as_u64())
|
||||
.unwrap_or(PIPESIZE as u64) as usize;
|
||||
|
||||
let encoding = base
|
||||
.options
|
||||
.get("encoding")
|
||||
.and_then(|v| v.as_str())
|
||||
.and_then(|s| s.parse::<TokenEncoding>().ok())
|
||||
.unwrap_or_default();
|
||||
|
||||
let tokenizer = Tokenizer::new(encoding).expect("Failed to create tokenizer");
|
||||
|
||||
Self {
|
||||
buffer: Some(Vec::new()),
|
||||
max_buffer_size,
|
||||
is_finalized: false,
|
||||
is_binary_content: None,
|
||||
token_count: 0,
|
||||
utf8_buffer: Vec::new(),
|
||||
base,
|
||||
tokenizer,
|
||||
}
|
||||
}
|
||||
|
||||
/// Tokenize a byte chunk, handling UTF-8 boundaries.
|
||||
///
|
||||
/// Combines with any pending UTF-8 carry bytes, converts to text,
|
||||
/// and adds the token count to the running total.
|
||||
fn count_tokens(&mut self, data: &[u8]) {
|
||||
if data.is_empty() && self.utf8_buffer.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
let combined = if !self.utf8_buffer.is_empty() {
|
||||
let mut c = self.utf8_buffer.clone();
|
||||
c.extend_from_slice(data);
|
||||
c
|
||||
} else {
|
||||
data.to_vec()
|
||||
};
|
||||
self.utf8_buffer.clear();
|
||||
|
||||
let text = match std::str::from_utf8(&combined) {
|
||||
Ok(t) => t,
|
||||
Err(e) => {
|
||||
let valid = e.valid_up_to();
|
||||
if valid < combined.len() {
|
||||
self.utf8_buffer.extend_from_slice(&combined[valid..]);
|
||||
}
|
||||
match std::str::from_utf8(&combined[..valid]) {
|
||||
Ok(t) => t,
|
||||
Err(_) => return,
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
if !text.is_empty() {
|
||||
self.token_count += self.tokenizer.count(text);
|
||||
}
|
||||
}
|
||||
|
||||
/// Perform binary detection on the buffer.
|
||||
fn detect_binary(&mut self, buffer: &[u8]) -> bool {
|
||||
let result = is_binary(buffer);
|
||||
self.is_binary_content = Some(result);
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
impl MetaPlugin for TokensMetaPlugin {
|
||||
fn is_finalized(&self) -> bool {
|
||||
self.is_finalized
|
||||
}
|
||||
|
||||
fn set_finalized(&mut self, finalized: bool) {
|
||||
self.is_finalized = finalized;
|
||||
}
|
||||
|
||||
fn update(&mut self, data: &[u8]) -> MetaPluginResponse {
|
||||
if self.is_finalized {
|
||||
return MetaPluginResponse {
|
||||
metadata: Vec::new(),
|
||||
is_finalized: true,
|
||||
};
|
||||
}
|
||||
|
||||
let mut metadata = Vec::new();
|
||||
|
||||
if self.is_binary_content.is_none() {
|
||||
// Add data to the buffer
|
||||
let should_detect = if let Some(ref mut buffer) = self.buffer {
|
||||
let remaining = self.max_buffer_size.saturating_sub(buffer.len());
|
||||
let to_take = std::cmp::min(data.len(), remaining);
|
||||
buffer.extend_from_slice(&data[..to_take]);
|
||||
buffer.len() >= std::cmp::min(1024, self.max_buffer_size)
|
||||
} else {
|
||||
false
|
||||
};
|
||||
|
||||
if should_detect {
|
||||
let buf_clone = self.buffer.as_ref().unwrap().clone();
|
||||
let is_binary = self.detect_binary(&buf_clone);
|
||||
|
||||
if is_binary {
|
||||
if let Some(md) = crate::meta_plugin::process_metadata_outputs(
|
||||
"token_count",
|
||||
serde_yaml::Value::Null,
|
||||
self.base.outputs(),
|
||||
) {
|
||||
metadata.push(md);
|
||||
}
|
||||
self.buffer = None;
|
||||
self.is_finalized = true;
|
||||
return MetaPluginResponse {
|
||||
metadata,
|
||||
is_finalized: true,
|
||||
};
|
||||
}
|
||||
|
||||
// It's text — tokenize the full accumulated buffer
|
||||
self.count_tokens(&buf_clone);
|
||||
|
||||
if buf_clone.len() >= self.max_buffer_size {
|
||||
self.buffer = None;
|
||||
}
|
||||
} else if self.buffer.is_some() {
|
||||
// Still building up buffer — tokenize what was just added
|
||||
let remaining = self
|
||||
.max_buffer_size
|
||||
.saturating_sub(self.buffer.as_ref().map_or(0, |b| b.len()));
|
||||
let to_take = std::cmp::min(data.len(), remaining);
|
||||
self.count_tokens(&data[..to_take]);
|
||||
}
|
||||
} else if self.is_binary_content == Some(false) {
|
||||
self.count_tokens(data);
|
||||
} else if self.is_binary_content == Some(true) {
|
||||
self.is_finalized = true;
|
||||
return MetaPluginResponse {
|
||||
metadata: Vec::new(),
|
||||
is_finalized: true,
|
||||
};
|
||||
}
|
||||
|
||||
MetaPluginResponse {
|
||||
metadata,
|
||||
is_finalized: self.is_finalized,
|
||||
}
|
||||
}
|
||||
|
||||
fn finalize(&mut self) -> MetaPluginResponse {
|
||||
if self.is_finalized {
|
||||
return MetaPluginResponse {
|
||||
metadata: Vec::new(),
|
||||
is_finalized: true,
|
||||
};
|
||||
}
|
||||
|
||||
let mut metadata = Vec::new();
|
||||
|
||||
// If binary detection hasn't completed, do it now
|
||||
if self.is_binary_content.is_none() {
|
||||
if let Some(buffer) = &self.buffer {
|
||||
if !buffer.is_empty() {
|
||||
let buf_clone = buffer.clone();
|
||||
let is_binary = self.detect_binary(&buf_clone);
|
||||
|
||||
if is_binary {
|
||||
if let Some(md) = crate::meta_plugin::process_metadata_outputs(
|
||||
"token_count",
|
||||
serde_yaml::Value::Null,
|
||||
self.base.outputs(),
|
||||
) {
|
||||
metadata.push(md);
|
||||
}
|
||||
self.buffer = None;
|
||||
self.is_finalized = true;
|
||||
return MetaPluginResponse {
|
||||
metadata,
|
||||
is_finalized: true,
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Process any remaining UTF-8 bytes
|
||||
if !self.utf8_buffer.is_empty() {
|
||||
self.count_tokens(&[]);
|
||||
}
|
||||
|
||||
// Emit token count
|
||||
if let Some(md) = crate::meta_plugin::process_metadata_outputs(
|
||||
"token_count",
|
||||
serde_yaml::Value::String(self.token_count.to_string()),
|
||||
self.base.outputs(),
|
||||
) {
|
||||
metadata.push(md);
|
||||
}
|
||||
|
||||
self.buffer = None;
|
||||
self.is_finalized = true;
|
||||
MetaPluginResponse {
|
||||
metadata,
|
||||
is_finalized: true,
|
||||
}
|
||||
}
|
||||
|
||||
fn meta_type(&self) -> MetaPluginType {
|
||||
MetaPluginType::Tokens
|
||||
}
|
||||
|
||||
fn outputs(&self) -> &std::collections::HashMap<String, serde_yaml::Value> {
|
||||
self.base.outputs()
|
||||
}
|
||||
|
||||
fn outputs_mut(
|
||||
&mut self,
|
||||
) -> anyhow::Result<&mut std::collections::HashMap<String, serde_yaml::Value>> {
|
||||
Ok(self.base.outputs_mut())
|
||||
}
|
||||
|
||||
fn default_outputs(&self) -> Vec<String> {
|
||||
vec!["token_count".to_string()]
|
||||
}
|
||||
|
||||
fn options(&self) -> &std::collections::HashMap<String, serde_yaml::Value> {
|
||||
self.base.options()
|
||||
}
|
||||
|
||||
fn options_mut(
|
||||
&mut self,
|
||||
) -> anyhow::Result<&mut std::collections::HashMap<String, serde_yaml::Value>> {
|
||||
Ok(self.base.options_mut())
|
||||
}
|
||||
}
|
||||
|
||||
use crate::meta_plugin::register_meta_plugin;

// Runs at program load time (via the `ctor` crate) so the Tokens plugin is
// available in the central meta-plugin registry before any pipeline is built.
#[ctor::ctor]
fn register_tokens_plugin() {
    register_meta_plugin(MetaPluginType::Tokens, |options, outputs| {
        Box::new(TokensMetaPlugin::new(options, outputs))
    });
}
|
||||
Reference in New Issue
Block a user