- Add SaveMetaFn callback pattern: meta plugins receive a closure instead of
&Connection, enabling the same plugin code to work in local, client, and
server contexts (collect-to-Vec, collect-to-HashMap, or direct DB write)
- Client save now runs meta plugins locally during streaming (smart client
sets meta=false, server skips its own plugins)
- Add POST /api/item/{id}/update endpoint for re-running plugins on stored
content without downloading compressed data
- Add client update mode (--update with --meta-plugin flags)
- Extract shared utilities: stream_copy, print_serialized, build_path_table,
ensure_default_tag to reduce duplication across modes
- Add upsert_tag for idempotent tag addition (INSERT OR IGNORE)
- Add warn logging on save_meta lock failure in BaseMetaPlugin and MetaService
326 lines · 10 KiB · Rust
use crate::common::PIPESIZE;
|
|
use crate::common::is_binary::is_binary;
|
|
use crate::meta_plugin::{MetaPlugin, MetaPluginResponse, MetaPluginType};
|
|
use crate::tokenizer::{TokenEncoding, get_tokenizer};
|
|
|
|
/// Meta plugin that counts tokenizer tokens in streamed content.
///
/// Buffers the first bytes of the stream to decide whether the content is
/// binary; binary content yields a null token count, text content is
/// tokenized incrementally with UTF-8 chunk boundaries carried between calls.
#[derive(Debug, Clone)]
pub struct TokensMetaPlugin {
    /// Buffer for binary detection (up to PIPESIZE bytes).
    /// `None` once detection is done and the buffer has been released.
    buffer: Option<Vec<u8>>,
    /// Maximum number of bytes to buffer before binary detection
    /// (from the `token_detect_size` option).
    max_buffer_size: usize,
    /// True once the plugin has emitted its final metadata.
    is_finalized: bool,
    /// Result of binary detection; `None` until detection has run.
    is_binary_content: Option<bool>,
    /// Running token count accumulated across chunks.
    token_count: usize,
    /// UTF-8 boundary carry buffer.
    utf8_buffer: Vec<u8>,
    /// Shared plugin state: options, outputs, and the save_meta callback.
    base: crate::meta_plugin::BaseMetaPlugin,
    /// The tokenizer encoding.
    encoding: TokenEncoding,
}
|
|
|
|
impl TokensMetaPlugin {
|
|
pub fn new(
|
|
options: Option<std::collections::HashMap<String, serde_yaml::Value>>,
|
|
outputs: Option<std::collections::HashMap<String, serde_yaml::Value>>,
|
|
) -> Self {
|
|
let mut base = crate::meta_plugin::BaseMetaPlugin::new();
|
|
|
|
base.initialize_plugin(&["token_count"], &options, &outputs);
|
|
|
|
// Set default options
|
|
let default_options = vec![
|
|
(
|
|
"token_detect_size",
|
|
serde_yaml::Value::Number(PIPESIZE.into()),
|
|
),
|
|
(
|
|
"encoding",
|
|
serde_yaml::Value::String("cl100k_base".to_string()),
|
|
),
|
|
];
|
|
|
|
for (key, value) in default_options {
|
|
if !base.options.contains_key(key) {
|
|
base.options.insert(key.to_string(), value);
|
|
}
|
|
}
|
|
|
|
let max_buffer_size = base
|
|
.options
|
|
.get("token_detect_size")
|
|
.and_then(|v| v.as_u64())
|
|
.unwrap_or(PIPESIZE as u64) as usize;
|
|
|
|
let encoding = base
|
|
.options
|
|
.get("encoding")
|
|
.and_then(|v| v.as_str())
|
|
.and_then(|s| s.parse::<TokenEncoding>().ok())
|
|
.unwrap_or_default();
|
|
|
|
Self {
|
|
buffer: Some(Vec::new()),
|
|
max_buffer_size,
|
|
is_finalized: false,
|
|
is_binary_content: None,
|
|
token_count: 0,
|
|
utf8_buffer: Vec::new(),
|
|
base,
|
|
encoding,
|
|
}
|
|
}
|
|
|
|
/// Tokenize a byte chunk, handling UTF-8 boundaries.
|
|
///
|
|
/// Combines with any pending UTF-8 carry bytes, converts to text,
|
|
/// and adds the token count to the running total.
|
|
///
|
|
/// Avoids unnecessary allocations when there is no pending UTF-8 carry
|
|
/// and the data is valid UTF-8.
|
|
fn count_tokens(&mut self, data: &[u8]) {
|
|
if data.is_empty() && self.utf8_buffer.is_empty() {
|
|
return;
|
|
}
|
|
|
|
let tokenizer = get_tokenizer(self.encoding);
|
|
|
|
if self.utf8_buffer.is_empty() {
|
|
// Fast path: no pending carry — try to use data directly
|
|
match std::str::from_utf8(data) {
|
|
Ok(text) => {
|
|
if !text.is_empty() {
|
|
self.token_count += tokenizer.count(text);
|
|
}
|
|
return;
|
|
}
|
|
Err(e) => {
|
|
let valid_up_to = e.valid_up_to();
|
|
if valid_up_to > 0 {
|
|
// Count the valid prefix without copying
|
|
let text =
|
|
std::str::from_utf8(&data[..valid_up_to]).expect("validated prefix");
|
|
self.token_count += tokenizer.count(text);
|
|
}
|
|
// Save invalid trailing bytes for next call
|
|
self.utf8_buffer.extend_from_slice(&data[valid_up_to..]);
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Slow path: pending carry bytes — must build combined buffer
|
|
let mut combined = std::mem::take(&mut self.utf8_buffer);
|
|
combined.extend_from_slice(data);
|
|
|
|
match std::str::from_utf8(&combined) {
|
|
Ok(text) => {
|
|
if !text.is_empty() {
|
|
self.token_count += tokenizer.count(text);
|
|
}
|
|
}
|
|
Err(e) => {
|
|
let valid_up_to = e.valid_up_to();
|
|
if valid_up_to > 0 {
|
|
let text =
|
|
std::str::from_utf8(&combined[..valid_up_to]).expect("validated prefix");
|
|
self.token_count += tokenizer.count(text);
|
|
}
|
|
self.utf8_buffer.extend_from_slice(&combined[valid_up_to..]);
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Perform binary detection on the buffer.
|
|
fn detect_binary(&mut self, buffer: &[u8]) -> bool {
|
|
let result = is_binary(buffer);
|
|
self.is_binary_content = Some(result);
|
|
result
|
|
}
|
|
}
|
|
|
|
impl MetaPlugin for TokensMetaPlugin {
|
|
fn is_finalized(&self) -> bool {
|
|
self.is_finalized
|
|
}
|
|
|
|
fn set_finalized(&mut self, finalized: bool) {
|
|
self.is_finalized = finalized;
|
|
}
|
|
|
|
fn set_save_meta(&mut self, save_meta: crate::meta_plugin::SaveMetaFn) {
|
|
self.base.set_save_meta(save_meta);
|
|
}
|
|
|
|
fn save_meta(&self, name: &str, value: &str) {
|
|
self.base.save_meta(name, value);
|
|
}
|
|
|
|
fn update(&mut self, data: &[u8]) -> MetaPluginResponse {
|
|
if self.is_finalized {
|
|
return MetaPluginResponse {
|
|
metadata: Vec::new(),
|
|
is_finalized: true,
|
|
};
|
|
}
|
|
|
|
let mut metadata = Vec::new();
|
|
|
|
if self.is_binary_content.is_none() {
|
|
// Add data to the buffer
|
|
let should_detect = if let Some(ref mut buffer) = self.buffer {
|
|
let remaining = self.max_buffer_size.saturating_sub(buffer.len());
|
|
let to_take = std::cmp::min(data.len(), remaining);
|
|
buffer.extend_from_slice(&data[..to_take]);
|
|
buffer.len() >= std::cmp::min(1024, self.max_buffer_size)
|
|
} else {
|
|
false
|
|
};
|
|
|
|
if should_detect {
|
|
let buffer_data = self.buffer.as_ref().unwrap().clone();
|
|
let is_binary = self.detect_binary(&buffer_data);
|
|
|
|
if is_binary {
|
|
if let Some(md) = crate::meta_plugin::process_metadata_outputs(
|
|
"token_count",
|
|
serde_yaml::Value::Null,
|
|
self.base.outputs(),
|
|
) {
|
|
metadata.push(md);
|
|
}
|
|
self.buffer = None;
|
|
self.is_finalized = true;
|
|
return MetaPluginResponse {
|
|
metadata,
|
|
is_finalized: true,
|
|
};
|
|
}
|
|
|
|
// It's text — tokenize the full buffer (nothing was counted yet),
|
|
// then clear to avoid double-counting in finalize().
|
|
self.count_tokens(&buffer_data);
|
|
self.buffer = Some(Vec::new());
|
|
}
|
|
} else if self.is_binary_content == Some(false) {
|
|
self.count_tokens(data);
|
|
} else if self.is_binary_content == Some(true) {
|
|
self.is_finalized = true;
|
|
return MetaPluginResponse {
|
|
metadata: Vec::new(),
|
|
is_finalized: true,
|
|
};
|
|
}
|
|
|
|
MetaPluginResponse {
|
|
metadata,
|
|
is_finalized: self.is_finalized,
|
|
}
|
|
}
|
|
|
|
fn finalize(&mut self) -> MetaPluginResponse {
|
|
if self.is_finalized {
|
|
return MetaPluginResponse {
|
|
metadata: Vec::new(),
|
|
is_finalized: true,
|
|
};
|
|
}
|
|
|
|
let mut metadata = Vec::new();
|
|
|
|
// If binary detection hasn't completed, do it now
|
|
if self.is_binary_content.is_none()
|
|
&& let Some(buffer) = &self.buffer
|
|
&& !buffer.is_empty()
|
|
{
|
|
let buffer_data = buffer.clone();
|
|
let is_binary = self.detect_binary(&buffer_data);
|
|
|
|
if is_binary {
|
|
if let Some(md) = crate::meta_plugin::process_metadata_outputs(
|
|
"token_count",
|
|
serde_yaml::Value::Null,
|
|
self.base.outputs(),
|
|
) {
|
|
metadata.push(md);
|
|
}
|
|
self.buffer = None;
|
|
self.is_finalized = true;
|
|
return MetaPluginResponse {
|
|
metadata,
|
|
is_finalized: true,
|
|
};
|
|
}
|
|
}
|
|
|
|
// Tokenize any bytes in the buffer
|
|
if let Some(buffer) = &self.buffer {
|
|
let data = buffer.clone();
|
|
self.count_tokens(&data);
|
|
}
|
|
|
|
// Process any remaining UTF-8 bytes
|
|
if !self.utf8_buffer.is_empty() {
|
|
self.count_tokens(&[]);
|
|
}
|
|
|
|
// Emit token count
|
|
if let Some(md) = crate::meta_plugin::process_metadata_outputs(
|
|
"token_count",
|
|
serde_yaml::Value::String(self.token_count.to_string()),
|
|
self.base.outputs(),
|
|
) {
|
|
metadata.push(md);
|
|
}
|
|
|
|
self.buffer = None;
|
|
self.is_finalized = true;
|
|
MetaPluginResponse {
|
|
metadata,
|
|
is_finalized: true,
|
|
}
|
|
}
|
|
|
|
fn meta_type(&self) -> MetaPluginType {
|
|
MetaPluginType::Tokens
|
|
}
|
|
|
|
fn outputs(&self) -> &std::collections::HashMap<String, serde_yaml::Value> {
|
|
self.base.outputs()
|
|
}
|
|
|
|
fn outputs_mut(
|
|
&mut self,
|
|
) -> anyhow::Result<&mut std::collections::HashMap<String, serde_yaml::Value>> {
|
|
Ok(self.base.outputs_mut())
|
|
}
|
|
|
|
fn default_outputs(&self) -> Vec<String> {
|
|
vec!["token_count".to_string()]
|
|
}
|
|
|
|
fn options(&self) -> &std::collections::HashMap<String, serde_yaml::Value> {
|
|
self.base.options()
|
|
}
|
|
|
|
fn options_mut(
|
|
&mut self,
|
|
) -> anyhow::Result<&mut std::collections::HashMap<String, serde_yaml::Value>> {
|
|
Ok(self.base.options_mut())
|
|
}
|
|
|
|
fn parallel_safe(&self) -> bool {
|
|
true
|
|
}
|
|
}
|
|
|
|
use crate::meta_plugin::register_meta_plugin;

/// Registers `TokensMetaPlugin` in the global meta-plugin registry under
/// `MetaPluginType::Tokens`. Runs automatically at program startup via the
/// `ctor` attribute, before `main`.
#[ctor::ctor]
fn register_tokens_plugin() {
    register_meta_plugin(MetaPluginType::Tokens, |options, outputs| {
        Box::new(TokensMetaPlugin::new(options, outputs))
    })
    .expect("Failed to register TokensMetaPlugin");
}
|