feat: add LLM token counting meta plugin and token filters
Add tiktoken-based token counting via a new 'tokens' feature flag.

New components:
- Shared tokenizer module wrapping tiktoken CoreBPE (cl100k_base, o200k_base)
- TokensMetaPlugin: streaming token counter; tokenizes each chunk independently
- head_tokens(N): stream the first N tokens, splitting at the exact boundary when mid-chunk
- skip_tokens(N): skip the first N tokens, stream the rest
- tail_tokens(N): bounded ring buffer (~16KB); outputs the last N tokens at finalize

All filters are fully streaming — no full-stream buffering. Meta plugin accuracy: exact for normal text, ±1-2 tokens if a long whitespace sequence spans a chunk boundary.

Also: add 'client' and 'tokens' to the default features, and add curl to the Dockerfile builder stage.
This commit is contained in:
11
src/lib.rs
11
src/lib.rs
@@ -43,6 +43,9 @@ pub mod services;
|
||||
#[cfg(feature = "client")]
|
||||
pub mod client;
|
||||
|
||||
#[cfg(feature = "tokens")]
|
||||
pub mod tokenizer;
|
||||
|
||||
// Re-export Args struct for library usage
|
||||
pub use args::Args;
|
||||
// Re-export PIPESIZE constant
|
||||
@@ -52,6 +55,10 @@ pub use common::PIPESIZE;
|
||||
#[allow(unused_imports)]
|
||||
use filter_plugin::{grep, head, skip, strip_ansi, tail};
|
||||
|
||||
#[cfg(feature = "tokens")]
|
||||
#[allow(unused_imports)]
|
||||
use filter_plugin::tokens as token_filters;
|
||||
|
||||
use crate::meta_plugin::{
|
||||
cwd, digest, env, exec, hostname, keep_pid, read_rate, read_time, shell, shell_pid, user,
|
||||
};
|
||||
@@ -60,6 +67,10 @@ use crate::meta_plugin::{
|
||||
#[allow(unused_imports)]
|
||||
use crate::meta_plugin::magic_file;
|
||||
|
||||
#[cfg(feature = "tokens")]
|
||||
#[allow(unused_imports)]
|
||||
use crate::meta_plugin::tokens;
|
||||
|
||||
/// Initializes plugins at library load time.
|
||||
///
|
||||
/// Plugin registration happens automatically via `#[ctor]` constructors
|
||||
|
||||
Reference in New Issue
Block a user