use crate::common::PIPESIZE;
use crate::common::is_binary::is_binary;
use crate::meta_plugin::{MetaPlugin, MetaPluginResponse, MetaPluginType};
use crate::tokenizer::{TokenEncoding, get_tokenizer};

/// Meta plugin that counts tokens in streamed text content.
///
/// Buffers the head of the stream for binary detection; binary streams
/// produce a null `token_count`, text streams are tokenized incrementally.
#[derive(Debug, Clone)]
pub struct TokensMetaPlugin {
    /// Buffer for binary detection (up to PIPESIZE bytes).
    /// `None` once detection is done and the buffer is no longer needed.
    buffer: Option<Vec<u8>>,
    /// Byte budget for the detection buffer (`token_detect_size` option).
    max_buffer_size: usize,
    /// Set once the plugin has emitted its final metadata.
    is_finalized: bool,
    /// `None` until binary detection has run, then `Some(is_binary)`.
    is_binary_content: Option<bool>,
    /// Running token count accumulated across chunks.
    token_count: usize,
    /// UTF-8 boundary carry buffer.
    utf8_buffer: Vec<u8>,
    base: crate::meta_plugin::BaseMetaPlugin,
    /// The tokenizer encoding.
    encoding: TokenEncoding,
}

impl TokensMetaPlugin {
    /// Create a plugin instance, filling in default options
    /// (`token_detect_size` = PIPESIZE, `encoding` = "cl100k_base")
    /// for any the caller did not supply.
    ///
    /// NOTE(review): map type parameters reconstructed from usage —
    /// confirm against `BaseMetaPlugin` / the `MetaPlugin` trait.
    pub fn new(
        options: Option<std::collections::HashMap<String, serde_yaml::Value>>,
        outputs: Option<std::collections::HashMap<String, String>>,
    ) -> Self {
        let mut base = crate::meta_plugin::BaseMetaPlugin::new();
        base.initialize_plugin(&["token_count"], &options, &outputs);

        // Apply defaults without overwriting user-supplied options.
        let defaults = [
            (
                "token_detect_size",
                serde_yaml::Value::Number(PIPESIZE.into()),
            ),
            (
                "encoding",
                serde_yaml::Value::String("cl100k_base".to_string()),
            ),
        ];
        for (key, value) in defaults {
            base.options.entry(key.to_string()).or_insert(value);
        }

        let max_buffer_size = base
            .options
            .get("token_detect_size")
            .and_then(|v| v.as_u64())
            .unwrap_or(PIPESIZE as u64) as usize;

        let encoding = base
            .options
            .get("encoding")
            .and_then(|v| v.as_str())
            .and_then(|s| s.parse::<TokenEncoding>().ok())
            .unwrap_or_default();

        Self {
            buffer: Some(Vec::new()),
            max_buffer_size,
            is_finalized: false,
            is_binary_content: None,
            token_count: 0,
            utf8_buffer: Vec::new(),
            base,
            encoding,
        }
    }

    /// Tokenize a byte chunk, handling UTF-8 boundaries.
    ///
    /// Combines with any pending UTF-8 carry bytes, converts to text,
    /// and adds the token count to the running total.
    ///
    /// Avoids unnecessary allocations when there is no pending UTF-8 carry
    /// and the data is valid UTF-8.
fn count_tokens(&mut self, data: &[u8]) { if data.is_empty() && self.utf8_buffer.is_empty() { return; } let tokenizer = get_tokenizer(self.encoding); if self.utf8_buffer.is_empty() { // Fast path: no pending carry — try to use data directly match std::str::from_utf8(data) { Ok(text) => { if !text.is_empty() { self.token_count += tokenizer.count(text); } return; } Err(e) => { let valid_up_to = e.valid_up_to(); if valid_up_to > 0 { // Count the valid prefix without copying let text = std::str::from_utf8(&data[..valid_up_to]).expect("validated prefix"); self.token_count += tokenizer.count(text); } // Save invalid trailing bytes for next call self.utf8_buffer.extend_from_slice(&data[valid_up_to..]); return; } } } // Slow path: pending carry bytes — must build combined buffer let mut combined = std::mem::take(&mut self.utf8_buffer); combined.extend_from_slice(data); match std::str::from_utf8(&combined) { Ok(text) => { if !text.is_empty() { self.token_count += tokenizer.count(text); } } Err(e) => { let valid_up_to = e.valid_up_to(); if valid_up_to > 0 { let text = std::str::from_utf8(&combined[..valid_up_to]).expect("validated prefix"); self.token_count += tokenizer.count(text); } self.utf8_buffer.extend_from_slice(&combined[valid_up_to..]); } } } /// Perform binary detection on the buffer. 
fn detect_binary(&mut self, buffer: &[u8]) -> bool { let result = is_binary(buffer); self.is_binary_content = Some(result); result } } impl MetaPlugin for TokensMetaPlugin { fn is_finalized(&self) -> bool { self.is_finalized } fn set_finalized(&mut self, finalized: bool) { self.is_finalized = finalized; } fn set_save_meta(&mut self, save_meta: crate::meta_plugin::SaveMetaFn) { self.base.set_save_meta(save_meta); } fn save_meta(&self, name: &str, value: &str) { self.base.save_meta(name, value); } fn update(&mut self, data: &[u8]) -> MetaPluginResponse { if self.is_finalized { return MetaPluginResponse { metadata: Vec::new(), is_finalized: true, }; } let mut metadata = Vec::new(); if self.is_binary_content.is_none() { // Add data to the buffer let should_detect = if let Some(ref mut buffer) = self.buffer { let remaining = self.max_buffer_size.saturating_sub(buffer.len()); let to_take = std::cmp::min(data.len(), remaining); buffer.extend_from_slice(&data[..to_take]); buffer.len() >= std::cmp::min(1024, self.max_buffer_size) } else { false }; if should_detect { let buffer_data = self.buffer.as_ref().unwrap().clone(); let is_binary = self.detect_binary(&buffer_data); if is_binary { if let Some(md) = crate::meta_plugin::process_metadata_outputs( "token_count", serde_yaml::Value::Null, self.base.outputs(), ) { metadata.push(md); } self.buffer = None; self.is_finalized = true; return MetaPluginResponse { metadata, is_finalized: true, }; } // It's text — tokenize the full buffer (nothing was counted yet), // then clear to avoid double-counting in finalize(). 
self.count_tokens(&buffer_data); self.buffer = Some(Vec::new()); } } else if self.is_binary_content == Some(false) { self.count_tokens(data); } else if self.is_binary_content == Some(true) { self.is_finalized = true; return MetaPluginResponse { metadata: Vec::new(), is_finalized: true, }; } MetaPluginResponse { metadata, is_finalized: self.is_finalized, } } fn finalize(&mut self) -> MetaPluginResponse { if self.is_finalized { return MetaPluginResponse { metadata: Vec::new(), is_finalized: true, }; } let mut metadata = Vec::new(); // If binary detection hasn't completed, do it now if self.is_binary_content.is_none() && let Some(buffer) = &self.buffer && !buffer.is_empty() { let buffer_data = buffer.clone(); let is_binary = self.detect_binary(&buffer_data); if is_binary { if let Some(md) = crate::meta_plugin::process_metadata_outputs( "token_count", serde_yaml::Value::Null, self.base.outputs(), ) { metadata.push(md); } self.buffer = None; self.is_finalized = true; return MetaPluginResponse { metadata, is_finalized: true, }; } } // Tokenize any bytes in the buffer if let Some(buffer) = &self.buffer { let data = buffer.clone(); self.count_tokens(&data); } // Process any remaining UTF-8 bytes if !self.utf8_buffer.is_empty() { self.count_tokens(&[]); } // Emit token count if let Some(md) = crate::meta_plugin::process_metadata_outputs( "token_count", serde_yaml::Value::String(self.token_count.to_string()), self.base.outputs(), ) { metadata.push(md); } self.buffer = None; self.is_finalized = true; MetaPluginResponse { metadata, is_finalized: true, } } fn meta_type(&self) -> MetaPluginType { MetaPluginType::Tokens } fn outputs(&self) -> &std::collections::HashMap { self.base.outputs() } fn outputs_mut( &mut self, ) -> anyhow::Result<&mut std::collections::HashMap> { Ok(self.base.outputs_mut()) } fn default_outputs(&self) -> Vec { vec!["token_count".to_string()] } fn options(&self) -> &std::collections::HashMap { self.base.options() } fn options_mut( &mut self, ) -> 
anyhow::Result<&mut std::collections::HashMap> { Ok(self.base.options_mut()) } fn parallel_safe(&self) -> bool { true } } use crate::meta_plugin::register_meta_plugin; #[ctor::ctor] fn register_tokens_plugin() { register_meta_plugin(MetaPluginType::Tokens, |options, outputs| { Box::new(TokensMetaPlugin::new(options, outputs)) }) .expect("Failed to register TokensMetaPlugin"); }