feat: add plugin schema system, tokenizer cache, and config validation
- Add plugin schema types and runtime discovery for meta/filter plugins
- Rewrite --generate-config to use schema system instead of hardcoded types
- Add Settings::validate_config() for startup validation
- Cache tokenizer instances via static Lazy to avoid repeated BPE loading
- Add split_by_token_iter() and count_bounded() to Tokenizer
- Fix double-counting bug in TokensMetaPlugin when buffer < max_buffer_size
- Eliminate unnecessary allocations in token count methods
- Refactor token filters: remove Option<Tokenizer>, use iterator API
- Fix TailTokensFilter correctness: unbounded buffer instead of ring buffer
- Add encoding option to all token filters
- Add description() to MetaPlugin and FilterPlugin traits
- Fix unused_mut warning in compression engine (feature-gated code)

Co-Authored-By: code-review-bot <noreply@anthropic.com>
@@ -3,5 +3,8 @@ pub mod is_binary;
 /// Detects if data is binary or text based on signatures and printable ratios.
 pub mod status;
 
+/// Plugin schema types and discovery functions.
+pub mod schema;
+
 /// Standard buffer size for I/O operations (8KB)
 pub const PIPESIZE: usize = 8192;
src/common/schema.rs (new file, 166 lines)
@@ -0,0 +1,166 @@
+use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
+use strum::IntoEnumIterator;
+
+/// Value type for a plugin option.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+#[serde(rename_all = "lowercase")]
+pub enum OptionType {
+    String,
+    Integer,
+    Boolean,
+    Any,
+}
+
+impl OptionType {
+    /// Infer the option type from a YAML value.
+    pub fn from_yaml_value(value: &serde_yaml::Value) -> Self {
+        match value {
+            serde_yaml::Value::Bool(_) => OptionType::Boolean,
+            serde_yaml::Value::Number(_) => OptionType::Integer,
+            serde_yaml::Value::String(_) => OptionType::String,
+            _ => OptionType::Any,
+        }
+    }
+}
+
+/// Schema for a single plugin option.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct OptionSchema {
+    pub name: String,
+    pub option_type: OptionType,
+    pub default: Option<serde_yaml::Value>,
+    pub required: bool,
+}
+
+/// Schema for a single plugin output.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct OutputSchema {
+    pub name: String,
+    pub description: String,
+}
+
+/// Schema describing a plugin's configuration requirements.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct PluginSchema {
+    pub name: String,
+    pub description: String,
+    pub options: Vec<OptionSchema>,
+    pub outputs: Vec<OutputSchema>,
+}
+
+/// Gathers schemas from all registered meta plugins.
+///
+/// Iterates all `MetaPluginType` variants, attempts to create a default instance,
+/// and collects their schemas. Plugins that fail to register (e.g., feature-gated)
+/// are silently skipped.
+pub fn gather_meta_plugin_schemas() -> Vec<PluginSchema> {
+    use crate::meta_plugin::{MetaPluginType, get_meta_plugin};
+
+    let mut schemas = Vec::new();
+    let mut sorted_types: Vec<MetaPluginType> = MetaPluginType::iter().collect();
+    sorted_types.sort_by_key(|t| t.to_string());
+
+    for plugin_type in sorted_types {
+        let plugin = match get_meta_plugin(plugin_type.clone(), None, None) {
+            Ok(p) => p,
+            Err(_) => continue,
+        };
+
+        let name = plugin.meta_type().to_string();
+
+        let options: Vec<OptionSchema> = plugin
+            .options()
+            .iter()
+            .map(|(key, value)| {
+                let option_type = OptionType::from_yaml_value(value);
+                let (default, required) = if value.is_null() {
+                    (None, true)
+                } else {
+                    (Some(value.clone()), false)
+                };
+                OptionSchema {
+                    name: key.clone(),
+                    option_type,
+                    default,
+                    required,
+                }
+            })
+            .collect();
+
+        let mut outputs: Vec<OutputSchema> = Vec::new();
+        for (key, value) in plugin.outputs() {
+            if !value.is_null() {
+                outputs.push(OutputSchema {
+                    name: key.clone(),
+                    description: key.clone(),
+                });
+            }
+        }
+        // Also include default outputs if outputs map is empty
+        if outputs.is_empty() {
+            for output_name in plugin.default_outputs() {
+                outputs.push(OutputSchema {
+                    name: output_name.clone(),
+                    description: output_name,
+                });
+            }
+        }
+
+        schemas.push(PluginSchema {
+            name,
+            description: plugin.description().to_string(),
+            options,
+            outputs,
+        });
+    }
+
+    schemas
+}
+
+/// Gathers schemas from all registered filter plugins.
+///
+/// Uses the global filter plugin registry to discover all registered filters,
+/// creates a default instance of each, and collects their option schemas.
+pub fn gather_filter_plugin_schemas() -> Vec<PluginSchema> {
+    use crate::services::filter_service::get_available_filter_plugins;
+
+    let plugins = get_available_filter_plugins();
+    let mut schemas: Vec<PluginSchema> = plugins
+        .into_iter()
+        .map(|(name, creator)| {
+            let plugin = creator();
+            let options: Vec<OptionSchema> = plugin
+                .options()
+                .iter()
+                .map(|opt| {
+                    let option_type = match &opt.default {
+                        Some(serde_json::Value::Bool(_)) => OptionType::Boolean,
+                        Some(serde_json::Value::Number(_)) => OptionType::Integer,
+                        Some(serde_json::Value::String(_)) => OptionType::String,
+                        _ => OptionType::Any,
+                    };
+                    OptionSchema {
+                        name: opt.name.clone(),
+                        option_type,
+                        default: opt.default.as_ref().map(|v| {
+                            // Convert serde_json::Value to serde_yaml::Value
+                            serde_yaml::to_value(v).unwrap_or(serde_yaml::Value::Null)
+                        }),
+                        required: opt.required,
+                    }
+                })
+                .collect();
+
+            PluginSchema {
+                name: name.clone(),
+                description: plugin.description().to_string(),
+                options,
+                outputs: Vec::new(),
+            }
+        })
+        .collect();
+
+    schemas.sort_by(|a, b| a.name.cmp(&b.name));
+    schemas
+}
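Note: a quick check of how the type inference above behaves, written as a standalone test-style sketch (the YAML values are hypothetical):

#[test]
fn infers_types_from_yaml() {
    use crate::common::schema::OptionType;

    let b: serde_yaml::Value = serde_yaml::from_str("true").unwrap();
    let n: serde_yaml::Value = serde_yaml::from_str("8192").unwrap();
    let s: serde_yaml::Value = serde_yaml::from_str("cl100k_base").unwrap();
    let null: serde_yaml::Value = serde_yaml::from_str("~").unwrap();

    assert_eq!(OptionType::from_yaml_value(&b), OptionType::Boolean);
    assert_eq!(OptionType::from_yaml_value(&n), OptionType::Integer);
    assert_eq!(OptionType::from_yaml_value(&s), OptionType::String);
    // Null (and sequences/maps) map to Any; a null default also marks the
    // option as required in the gather functions above.
    assert_eq!(OptionType::from_yaml_value(&null), OptionType::Any);
}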
@@ -175,6 +175,7 @@ impl Clone for Box<dyn CompressionEngine> {
 
 lazy_static! {
     static ref COMPRESSION_ENGINES: EnumMap<CompressionType, Box<dyn CompressionEngine>> = {
+        #[allow(unused_mut)] // mut needed when gzip/lz4 features are enabled
         let mut em = enum_map! {
             CompressionType::LZ4 => Box::new(crate::compression_engine::program::CompressionEngineProgram::new(
                 "lz4",
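Note: the silenced warning is the usual feature-gating pattern; a standalone sketch (not this crate's code):

fn engine_names() -> Vec<&'static str> {
    #[allow(unused_mut)] // `mut` is only needed when a feature adds entries
    let mut names = vec!["lz4"];
    #[cfg(feature = "gzip")]
    names.push("gzip");
    // Without the allow, builds with the feature off warn that `names`
    // never needs to be mutable.
    names
}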
@@ -573,4 +573,65 @@ impl Settings {
             .map(|plugins| plugins.iter().map(|p| p.name.clone()).collect())
             .unwrap_or_default()
     }
+
+    /// Validates the configuration against plugin schemas.
+    ///
+    /// Checks that:
+    /// - All configured meta plugin names are valid and registered
+    /// - Required options are present for each meta plugin
+    /// - Compression plugin name (if set) is a valid compression type
+    ///
+    /// Returns a list of warning strings. An empty list means the config is valid.
+    pub fn validate_config(&self) -> Vec<String> {
+        use crate::common::schema::gather_meta_plugin_schemas;
+        use crate::compression_engine::CompressionType;
+        use strum::IntoEnumIterator;
+
+        let mut warnings = Vec::new();
+
+        // Validate compression plugin
+        if let Some(ref comp) = self.compression_plugin {
+            let valid_types: Vec<String> =
+                CompressionType::iter().map(|ct| ct.to_string()).collect();
+            if !valid_types.contains(&comp.name) {
+                warnings.push(format!(
+                    "Unknown compression_plugin.name: '{}'. Valid types: {}",
+                    comp.name,
+                    valid_types.join(", ")
+                ));
+            }
+        }
+
+        // Validate meta plugins
+        if let Some(ref plugins) = self.meta_plugins {
+            let schemas = gather_meta_plugin_schemas();
+            let schema_map: std::collections::HashMap<&str, &crate::common::schema::PluginSchema> =
+                schemas.iter().map(|s| (s.name.as_str(), s)).collect();
+
+            for plugin in plugins {
+                match schema_map.get(plugin.name.as_str()) {
+                    Some(schema) => {
+                        // Check required options
+                        for opt in &schema.options {
+                            if opt.required && !plugin.options.contains_key(&opt.name) {
+                                warnings.push(format!(
+                                    "Meta plugin '{}': missing required option '{}'",
+                                    plugin.name, opt.name
+                                ));
+                            }
+                        }
+                    }
+                    None => {
+                        warnings.push(format!(
+                            "Unknown meta plugin: '{}'. Available: {}",
+                            plugin.name,
+                            schema_map.keys().copied().collect::<Vec<_>>().join(", ")
+                        ));
+                    }
+                }
+            }
+        }
+
+        warnings
+    }
 }
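Note: callers are expected to treat these as non-fatal startup warnings; a sketch of the intended call site (the helper name and logging style are assumptions, validate_config itself is from this commit):

fn log_config_warnings(settings: &crate::config::Settings) {
    // Non-fatal: each problem is logged and startup continues.
    for warning in settings.validate_config() {
        log::warn!("config: {}", warning);
    }
}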
@@ -1,8 +1,8 @@
-use super::{FilterPlugin, FilterOption};
-use std::io::{Result, Read, Write};
-use std::process::{Command, Stdio, Child};
-use which::which;
+use super::{FilterOption, FilterPlugin};
 use log::*;
+use std::io::{Read, Result, Write};
+use std::process::{Child, Command, Stdio};
+use which::which;
 
 /// A filter that executes an external program and pipes input through it.
 ///
@@ -43,16 +43,13 @@ impl ExecFilter {
     /// let filter = ExecFilter::new("grep", vec!["-i", "error"], false);
     /// assert!(filter.supported);
     /// ```
-    pub fn new(
-        program: &str,
-        args: Vec<&str>,
-        split_whitespace: bool,
-    ) -> ExecFilter {
+    pub fn new(program: &str, args: Vec<&str>, split_whitespace: bool) -> ExecFilter {
         let program_path = which(program);
         let supported = program_path.is_ok();
 
         ExecFilter {
-            program: program_path.map_or_else(|| program.to_string(), |p| p.to_string_lossy().to_string()),
+            program: program_path
+                .map_or_else(|| program.to_string(), |p| p.to_string_lossy().to_string()),
             args: args.iter().map(|s| s.to_string()).collect(),
             supported,
             split_whitespace,
@@ -101,7 +98,10 @@ impl FilterPlugin for ExecFilter {
             ));
         }
 
-        debug!("FILTER_EXEC: Executing command: {} {:?}", self.program, self.args);
+        debug!(
+            "FILTER_EXEC: Executing command: {} {:?}",
+            self.program, self.args
+        );
 
         // Read all input first
         let mut input_data = Vec::new();
@@ -129,7 +129,7 @@ impl FilterPlugin for ExecFilter {
 
         // Write input to child stdin
         stdin.write_all(&input_data)?;
        drop(stdin); // Close stdin to signal EOF
 
         let mut stdout = child.stdout.take().ok_or_else(|| {
             std::io::Error::new(
@@ -142,13 +142,12 @@ impl FilterPlugin for ExecFilter {
         std::io::copy(&mut stdout, writer)?;
 
         // Wait for the child process to finish
-        let output = child.wait_with_output()
-            .map_err(|e| {
-                std::io::Error::new(
-                    std::io::ErrorKind::Other,
-                    format!("Failed to wait on child process: {}", e),
-                )
-            })?;
+        let output = child.wait_with_output().map_err(|e| {
+            std::io::Error::new(
+                std::io::ErrorKind::Other,
+                format!("Failed to wait on child process: {}", e),
+            )
+        })?;
 
         if !output.status.success() {
             let stderr = String::from_utf8_lossy(&output.stderr);
@@ -205,6 +204,10 @@ impl FilterPlugin for ExecFilter {
             },
         ]
     }
+
+    fn description(&self) -> &str {
+        "Pipe input through an external command"
+    }
 }
 
 // Register the plugin at module initialization time
@@ -132,4 +132,8 @@ impl FilterPlugin for GrepFilter {
             required: true,
         }]
     }
+
+    fn description(&self) -> &str {
+        "Filter lines matching a regex pattern"
+    }
 }
@@ -140,6 +140,10 @@ impl FilterPlugin for HeadBytesFilter {
             required: true,
         }]
     }
+
+    fn description(&self) -> &str {
+        "Read the first N bytes"
+    }
 }
 
 /// A filter that reads the first N lines from the input stream.
@@ -270,6 +274,10 @@ impl FilterPlugin for HeadLinesFilter {
             required: true,
         }]
     }
+
+    fn description(&self) -> &str {
+        "Read the first N lines"
+    }
 }
 
 // Register the plugin at module initialization time
@@ -172,6 +172,15 @@ pub trait FilterPlugin: Send {
     /// }
     /// ```
     fn options(&self) -> Vec<FilterOption>;
+
+    /// Returns a human-readable description of this filter.
+    ///
+    /// # Returns
+    ///
+    /// A description string (empty by default).
+    fn description(&self) -> &str {
+        ""
+    }
 }
 
 /// Enum representing the different types of filters.
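Note: defaulting description() to an empty string keeps the trait change non-breaking: every existing implementor compiles unchanged until it opts in. A standalone sketch of the pattern (toy trait, not this crate's):

trait Describe {
    fn description(&self) -> &str {
        "" // default keeps every existing implementor source-compatible
    }
}

struct StripAnsi;
impl Describe for StripAnsi {
    fn description(&self) -> &str {
        "Strip ANSI escape sequences"
    }
}

struct Legacy;
impl Describe for Legacy {} // compiles with no changes

fn main() {
    assert_eq!(StripAnsi.description(), "Strip ANSI escape sequences");
    assert_eq!(Legacy.description(), "");
}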
@@ -684,12 +693,13 @@ fn create_specific_filter(
                     "head_tokens filter requires 'count' parameter",
                 )
             })?;
-            let encoding = crate::tokenizer::TokenEncoding::Cl100kBase;
+            let encoding = options
+                .get("encoding")
+                .and_then(|v| v.as_str())
+                .and_then(|s| s.parse::<crate::tokenizer::TokenEncoding>().ok())
+                .unwrap_or_default();
             let mut f = tokens::HeadTokensFilter::new(count);
-            f.tokenizer = Some(
-                crate::tokenizer::Tokenizer::new(encoding)
-                    .map_err(|e| std::io::Error::other(e.to_string()))?,
-            );
+            f.tokenizer = crate::tokenizer::get_tokenizer(encoding).clone();
             f.encoding = encoding;
             Ok(Box::new(f))
         }
@@ -705,12 +715,13 @@ fn create_specific_filter(
                     "skip_tokens filter requires 'count' parameter",
                 )
             })?;
-            let encoding = crate::tokenizer::TokenEncoding::Cl100kBase;
+            let encoding = options
+                .get("encoding")
+                .and_then(|v| v.as_str())
+                .and_then(|s| s.parse::<crate::tokenizer::TokenEncoding>().ok())
+                .unwrap_or_default();
             let mut f = tokens::SkipTokensFilter::new(count);
-            f.tokenizer = Some(
-                crate::tokenizer::Tokenizer::new(encoding)
-                    .map_err(|e| std::io::Error::other(e.to_string()))?,
-            );
+            f.tokenizer = crate::tokenizer::get_tokenizer(encoding).clone();
             f.encoding = encoding;
             Ok(Box::new(f))
         }
@@ -726,12 +737,13 @@ fn create_specific_filter(
                     "tail_tokens filter requires 'count' parameter",
                 )
             })?;
-            let encoding = crate::tokenizer::TokenEncoding::Cl100kBase;
+            let encoding = options
+                .get("encoding")
+                .and_then(|v| v.as_str())
+                .and_then(|s| s.parse::<crate::tokenizer::TokenEncoding>().ok())
+                .unwrap_or_default();
             let mut f = tokens::TailTokensFilter::new(count);
-            f.tokenizer = Some(
-                crate::tokenizer::Tokenizer::new(encoding)
-                    .map_err(|e| std::io::Error::other(e.to_string()))?,
-            );
+            f.tokenizer = crate::tokenizer::get_tokenizer(encoding).clone();
             f.encoding = encoding;
             Ok(Box::new(f))
         }
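Note: all three token filters now share the same optional "encoding" lookup with a silent fallback to the default; a standalone model of that lookup against a plain serde_json map (TokenEncoding parsing elided, string stand-ins used):

fn demo() {
    let options: serde_json::Map<String, serde_json::Value> =
        serde_json::from_str(r#"{"count": 100, "encoding": "cl100k_base"}"#).unwrap();

    let encoding = options
        .get("encoding")
        .and_then(|v| v.as_str())
        .unwrap_or("cl100k_base"); // missing or non-string values fall back to the default

    assert_eq!(encoding, "cl100k_base");
}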
@@ -72,6 +72,10 @@ impl FilterPlugin for SkipBytesFilter {
             required: true,
         }]
     }
+
+    fn description(&self) -> &str {
+        "Skip the first N bytes"
+    }
 }
 
 /// A filter that skips the first N lines from the input stream.
@@ -137,6 +141,10 @@ impl FilterPlugin for SkipLinesFilter {
             required: true,
         }]
     }
+
+    fn description(&self) -> &str {
+        "Skip the first N lines"
+    }
 }
 
 // Register the plugin at module initialization time
@@ -56,4 +56,8 @@ impl FilterPlugin for StripAnsiFilter {
     fn options(&self) -> Vec<FilterOption> {
         Vec::new() // strip_ansi doesn't take any options
     }
+
+    fn description(&self) -> &str {
+        "Strip ANSI escape sequences"
+    }
 }
@@ -82,6 +82,10 @@ impl FilterPlugin for TailBytesFilter {
             required: true,
         }]
     }
+
+    fn description(&self) -> &str {
+        "Read the last N bytes"
+    }
 }
 
 /// A filter that reads the last N lines from the input stream.
@@ -156,6 +160,10 @@ impl FilterPlugin for TailLinesFilter {
             required: true,
         }]
     }
+
+    fn description(&self) -> &str {
+        "Read the last N lines"
+    }
 }
 
 // Register the plugin at module initialization time
@@ -1,20 +1,9 @@
 use super::{FilterOption, FilterPlugin};
 use crate::common::PIPESIZE;
 use crate::services::filter_service::register_filter_plugin;
-use crate::tokenizer::{TokenEncoding, Tokenizer};
-use std::collections::VecDeque;
+use crate::tokenizer::{TokenEncoding, Tokenizer, get_tokenizer};
 use std::io::{Read, Result, Write};
 
-/// Resolve the tokenizer from a JSON options map.
-fn resolve_tokenizer(options: &Option<serde_json::Value>) -> Tokenizer {
-    let encoding = options
-        .as_ref()
-        .and_then(|v| v.as_str())
-        .and_then(|s| s.parse::<TokenEncoding>().ok())
-        .unwrap_or_default();
-    Tokenizer::new(encoding).expect("Failed to create tokenizer")
-}
-
 // ---------------------------------------------------------------------------
 // head_tokens
 // ---------------------------------------------------------------------------
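Note: get_tokenizer is the cache named in the commit message ("cache tokenizer instances via static Lazy to avoid repeated BPE loading"); its body is not part of this diff, but a plausible shape, assuming once_cell and one static per encoding, is:

use once_cell::sync::Lazy;

// One lazily-built tokenizer per encoding; callers clone the cached
// instance instead of re-loading the BPE tables on every construction.
static CL100K: Lazy<Tokenizer> =
    Lazy::new(|| Tokenizer::new(TokenEncoding::Cl100kBase).expect("load BPE"));

pub fn get_tokenizer(encoding: TokenEncoding) -> &'static Tokenizer {
    match encoding {
        TokenEncoding::Cl100kBase => &CL100K,
        _ => &CL100K, // sketch only: the real table would have one static per variant
    }
}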
@@ -22,19 +11,21 @@ fn resolve_tokenizer(options: &Option<serde_json::Value>) -> Tokenizer {
 /// A filter that outputs only the first N tokens of the input stream.
 ///
 /// Streams bytes directly until the token limit is reached. When the limit
-/// falls mid-chunk, uses `split_by_token` to find the exact byte boundary.
+/// falls mid-chunk, uses `split_by_token_iter` to find the exact byte boundary
+/// without allocating token strings beyond what is needed.
 pub struct HeadTokensFilter {
     pub remaining: usize,
-    pub tokenizer: Option<Tokenizer>,
+    pub tokenizer: Tokenizer,
     pub encoding: TokenEncoding,
 }
 
 impl HeadTokensFilter {
     pub fn new(count: usize) -> Self {
+        let encoding = TokenEncoding::default();
         Self {
             remaining: count,
-            tokenizer: None,
-            encoding: TokenEncoding::default(),
+            tokenizer: get_tokenizer(encoding).clone(),
+            encoding,
         }
     }
 }
@@ -45,11 +36,7 @@ impl FilterPlugin for HeadTokensFilter {
             return Ok(());
         }
 
-        let tokenizer = self
-            .tokenizer
-            .as_ref()
-            .unwrap_or_else(|| panic!("HeadTokensFilter: tokenizer not initialized"));
-
+        let tokenizer = &self.tokenizer;
         let mut buffer = vec![0u8; PIPESIZE];
         let mut total_tokens = 0usize;
 
@@ -71,22 +58,15 @@ impl FilterPlugin for HeadTokensFilter {
                     break;
                 }
             } else {
-                // Cutoff is within this chunk — split at exact token boundary
+                // Cutoff is within this chunk — use iterator to find exact
+                // boundary without allocating all token strings
                 let tokens_to_write = self.remaining - total_tokens;
-                let token_strs = tokenizer
-                    .split_by_token(&text)
-                    .map_err(|e| std::io::Error::other(e.to_string()))?;
                 let mut byte_pos = 0usize;
-                for token_str in token_strs.iter().take(tokens_to_write) {
-                    byte_pos += token_str.len();
+                for token_str in tokenizer.split_by_token_iter(&text).take(tokens_to_write) {
+                    byte_pos += token_str
+                        .map_err(|e| std::io::Error::other(e.to_string()))?
+                        .len();
                 }
-                // Write only the bytes for the tokens we want.
-                // Map byte positions in the lossy string back to positions in the
-                // original byte slice. Since from_utf8_lossy replaces invalid
-                // bytes with the replacement character (3 bytes), we need to be
-                // careful. For simplicity, write the valid prefix of the chunk.
-                // We use the original bytes up to the calculated position, adjusting
-                // for any UTF-8 replacement character differences.
                 let write_len = map_lossy_pos_to_bytes(chunk, &text, byte_pos);
                 writer.write_all(&chunk[..write_len])?;
                 break;
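Note: split_by_token_iter yields tokens lazily as Results, so .take(tokens_to_write) bounds how many token strings are ever materialized and per-token errors surface inside the loop. A standalone model of the consumption pattern (toy iterator, not the real tokenizer API):

fn prefix_len(
    tokens: impl Iterator<Item = Result<String, String>>,
    n: usize,
) -> Result<usize, String> {
    let mut byte_pos = 0usize;
    for tok in tokens.take(n) {
        byte_pos += tok?.len(); // errors surface per token instead of up front
    }
    Ok(byte_pos)
}

fn main() {
    let toks = ["The", " quick", " brown"].into_iter().map(|s| Ok(s.to_string()));
    assert_eq!(prefix_len(toks, 2), Ok(9)); // "The" + " quick" = 9 bytes
}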
@@ -98,20 +78,28 @@ impl FilterPlugin for HeadTokensFilter {
     fn clone_box(&self) -> Box<dyn FilterPlugin> {
         Box::new(Self {
             remaining: self.remaining,
-            tokenizer: self
-                .tokenizer
-                .as_ref()
-                .map(|_| Tokenizer::new(self.encoding).unwrap()),
+            tokenizer: get_tokenizer(self.encoding).clone(),
             encoding: self.encoding,
         })
     }
 
     fn options(&self) -> Vec<FilterOption> {
-        vec![FilterOption {
-            name: "count".to_string(),
-            default: None,
-            required: true,
-        }]
+        vec![
+            FilterOption {
+                name: "count".to_string(),
+                default: None,
+                required: true,
+            },
+            FilterOption {
+                name: "encoding".to_string(),
+                default: Some(serde_json::Value::String("cl100k_base".to_string())),
+                required: false,
+            },
+        ]
+    }
+
+    fn description(&self) -> &str {
+        "Read the first N LLM tokens"
     }
 }
 
@@ -122,16 +110,17 @@ impl FilterPlugin for HeadTokensFilter {
 /// A filter that skips the first N tokens of the input stream and outputs the rest.
 pub struct SkipTokensFilter {
     pub remaining: usize,
-    pub tokenizer: Option<Tokenizer>,
+    pub tokenizer: Tokenizer,
     pub encoding: TokenEncoding,
 }
 
 impl SkipTokensFilter {
     pub fn new(count: usize) -> Self {
+        let encoding = TokenEncoding::default();
         Self {
             remaining: count,
-            tokenizer: None,
-            encoding: TokenEncoding::default(),
+            tokenizer: get_tokenizer(encoding).clone(),
+            encoding,
         }
     }
 }
@@ -142,11 +131,7 @@ impl FilterPlugin for SkipTokensFilter {
             return std::io::copy(reader, writer).map(|_| ());
         }
 
-        let tokenizer = self
-            .tokenizer
-            .as_ref()
-            .unwrap_or_else(|| panic!("SkipTokensFilter: tokenizer not initialized"));
-
+        let tokenizer = &self.tokenizer;
         let mut buffer = vec![0u8; PIPESIZE];
         let mut total_tokens = 0usize;
         let mut done_skipping = false;
@@ -173,14 +158,14 @@ impl FilterPlugin for SkipTokensFilter {
                     done_skipping = true;
                 }
             } else {
-                // Cutoff is within this chunk — skip past the boundary, write rest
+                // Cutoff is within this chunk — use iterator to skip past
+                // the boundary without allocating all token strings
                 let tokens_to_skip = self.remaining - total_tokens;
-                let token_strs = tokenizer
-                    .split_by_token(&text)
-                    .map_err(|e| std::io::Error::other(e.to_string()))?;
                 let mut byte_pos = 0usize;
-                for token_str in token_strs.iter().take(tokens_to_skip) {
-                    byte_pos += token_str.len();
+                for token_str in tokenizer.split_by_token_iter(&text).take(tokens_to_skip) {
+                    byte_pos += token_str
+                        .map_err(|e| std::io::Error::other(e.to_string()))?
+                        .len();
                 }
                 let skip_len = map_lossy_pos_to_bytes(chunk, &text, byte_pos);
                 if skip_len < n {
@@ -195,20 +180,28 @@ impl FilterPlugin for SkipTokensFilter {
     fn clone_box(&self) -> Box<dyn FilterPlugin> {
         Box::new(Self {
             remaining: self.remaining,
-            tokenizer: self
-                .tokenizer
-                .as_ref()
-                .map(|_| Tokenizer::new(self.encoding).unwrap()),
+            tokenizer: get_tokenizer(self.encoding).clone(),
            encoding: self.encoding,
         })
     }
 
     fn options(&self) -> Vec<FilterOption> {
-        vec![FilterOption {
-            name: "count".to_string(),
-            default: None,
-            required: true,
-        }]
+        vec![
+            FilterOption {
+                name: "count".to_string(),
+                default: None,
+                required: true,
+            },
+            FilterOption {
+                name: "encoding".to_string(),
+                default: Some(serde_json::Value::String("cl100k_base".to_string())),
+                required: false,
+            },
+        ]
+    }
+
+    fn description(&self) -> &str {
+        "Skip the first N LLM tokens"
     }
 }
 
@@ -218,27 +211,24 @@ impl FilterPlugin for SkipTokensFilter {
 
 /// A filter that outputs only the last N tokens of the input stream.
 ///
-/// Uses a bounded ring buffer (last ~2× PIPESIZE) to keep recent bytes.
-/// At finalize, tokenizes the buffered content and writes only the last N tokens.
+/// Buffers all bytes from the stream, then at finalize tokenizes the
+/// content and writes only the last N tokens.
 pub struct TailTokensFilter {
     pub count: usize,
-    /// Ring buffer holding the most recent bytes from the stream.
-    pub ring: VecDeque<u8>,
-    pub ring_capacity: usize,
-    pub tokenizer: Option<Tokenizer>,
+    /// Buffer holding all bytes from the stream.
+    buffer: Vec<u8>,
+    pub tokenizer: Tokenizer,
     pub encoding: TokenEncoding,
 }
 
 impl TailTokensFilter {
     pub fn new(count: usize) -> Self {
-        // Keep enough bytes for ~2 chunks worth of data
-        let ring_capacity = PIPESIZE * 2;
+        let encoding = TokenEncoding::default();
         Self {
             count,
-            ring: VecDeque::with_capacity(ring_capacity),
-            ring_capacity,
-            tokenizer: None,
-            encoding: TokenEncoding::default(),
+            buffer: Vec::with_capacity(PIPESIZE),
+            tokenizer: get_tokenizer(encoding).clone(),
+            encoding,
         }
     }
 }
@@ -249,36 +239,23 @@ impl FilterPlugin for TailTokensFilter {
             return Ok(());
         }
 
-        let tokenizer = self
-            .tokenizer
-            .as_ref()
-            .unwrap_or_else(|| panic!("TailTokensFilter: tokenizer not initialized"));
+        let tokenizer = &self.tokenizer;
 
-        // Stream all bytes through the ring buffer
-        let mut buffer = vec![0u8; PIPESIZE];
-        loop {
-            let n = reader.read(&mut buffer)?;
-            if n == 0 {
-                break;
-            }
-            for &byte in &buffer[..n] {
-                if self.ring.len() >= self.ring_capacity {
-                    self.ring.pop_front();
-                }
-                self.ring.push_back(byte);
-            }
+        // Buffer all bytes from the stream
+        std::io::copy(reader, &mut self.buffer)?;
+
+        if self.buffer.is_empty() {
+            return Ok(());
         }
 
         // Tokenize the buffered content and extract last N tokens
-        let buffered: Vec<u8> = self.ring.iter().copied().collect();
-        let text = String::from_utf8_lossy(&buffered);
+        let text = String::from_utf8_lossy(&self.buffer);
         let token_strs = tokenizer
             .split_by_token(&text)
             .map_err(|e| std::io::Error::other(e.to_string()))?;
 
         if token_strs.len() <= self.count {
             // All tokens fit — write everything
-            writer.write_all(&buffered)?;
+            writer.write_all(&self.buffer)?;
         } else {
             // Write only the last N tokens
             let skip = token_strs.len() - self.count;
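Note: the correctness issue fixed above is that a byte-bounded ring silently drops prefix tokens: with PIPESIZE = 8192 the old buffer held at most 16 KiB, so tail_tokens on a larger input could never see the earlier bytes regardless of the requested count. The unbounded Vec trades memory for correctness. A standalone illustration of the failure mode:

fn main() {
    let input = b"aaaaaaaaaaaaaaaabbbbbbbbbbbbbbbb"; // 32 bytes
    let ring_capacity = 16;
    // The ring only ever retains the final `ring_capacity` bytes.
    let kept = &input[input.len().saturating_sub(ring_capacity)..];
    assert_eq!(kept, b"bbbbbbbbbbbbbbbb"); // every 'a' was dropped before tokenizing
}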
@@ -286,9 +263,9 @@ impl FilterPlugin for TailTokensFilter {
             for token_str in token_strs.iter().take(skip) {
                 byte_offset += token_str.len();
             }
-            let write_len = map_lossy_pos_to_bytes(&buffered, &text, byte_offset);
-            if write_len < buffered.len() {
-                writer.write_all(&buffered[write_len..])?;
+            let write_len = map_lossy_pos_to_bytes(&self.buffer, &text, byte_offset);
+            if write_len < self.buffer.len() {
+                writer.write_all(&self.buffer[write_len..])?;
             }
         }
 
@@ -298,22 +275,29 @@ impl FilterPlugin for TailTokensFilter {
     fn clone_box(&self) -> Box<dyn FilterPlugin> {
         Box::new(Self {
             count: self.count,
-            ring: self.ring.clone(),
-            ring_capacity: self.ring_capacity,
-            tokenizer: self
-                .tokenizer
-                .as_ref()
-                .map(|_| Tokenizer::new(self.encoding).unwrap()),
+            buffer: self.buffer.clone(),
+            tokenizer: get_tokenizer(self.encoding).clone(),
             encoding: self.encoding,
         })
     }
 
     fn options(&self) -> Vec<FilterOption> {
-        vec![FilterOption {
-            name: "count".to_string(),
-            default: None,
-            required: true,
-        }]
+        vec![
+            FilterOption {
+                name: "count".to_string(),
+                default: None,
+                required: true,
+            },
+            FilterOption {
+                name: "encoding".to_string(),
+                default: Some(serde_json::Value::String("cl100k_base".to_string())),
+                required: false,
+            },
+        ]
+    }
+
+    fn description(&self) -> &str {
+        "Read the last N LLM tokens"
     }
 }
 
@@ -393,21 +377,9 @@ fn map_lossy_pos_to_bytes(original: &[u8], lossy: &str, lossy_pos: usize) -> usi
 
 #[ctor::ctor]
 fn register_token_filters() {
-    register_filter_plugin("head_tokens", || {
-        let mut f = HeadTokensFilter::new(0);
-        f.tokenizer = Some(resolve_tokenizer(&None));
-        Box::new(f)
-    });
-    register_filter_plugin("skip_tokens", || {
-        let mut f = SkipTokensFilter::new(0);
-        f.tokenizer = Some(resolve_tokenizer(&None));
-        Box::new(f)
-    });
-    register_filter_plugin("tail_tokens", || {
-        let mut f = TailTokensFilter::new(0);
-        f.tokenizer = Some(resolve_tokenizer(&None));
-        Box::new(f)
-    });
+    register_filter_plugin("head_tokens", || Box::new(HeadTokensFilter::new(0)));
+    register_filter_plugin("skip_tokens", || Box::new(SkipTokensFilter::new(0)));
+    register_filter_plugin("tail_tokens", || Box::new(TailTokensFilter::new(0)));
 }
 
 #[cfg(test)]
@@ -416,13 +388,13 @@ mod tests {
     use std::io::Cursor;
 
     fn make_tokenizer() -> Tokenizer {
-        Tokenizer::new(TokenEncoding::Cl100kBase).unwrap()
+        get_tokenizer(TokenEncoding::Cl100kBase).clone()
     }
 
     #[test]
     fn test_head_tokens_basic() {
         let mut filter = HeadTokensFilter::new(3);
-        filter.tokenizer = Some(make_tokenizer());
+        filter.tokenizer = make_tokenizer();
 
         let input = b"The quick brown fox";
         let mut output = Vec::new();
@@ -437,7 +409,7 @@ mod tests {
     #[test]
     fn test_head_tokens_zero() {
         let mut filter = HeadTokensFilter::new(0);
-        filter.tokenizer = Some(make_tokenizer());
+        filter.tokenizer = make_tokenizer();
 
         let input = b"The quick brown fox";
         let mut output = Vec::new();
@@ -448,7 +420,7 @@ mod tests {
     #[test]
     fn test_head_tokens_more_than_available() {
         let mut filter = HeadTokensFilter::new(1000);
-        filter.tokenizer = Some(make_tokenizer());
+        filter.tokenizer = make_tokenizer();
 
         let input = b"Hello world";
         let mut output = Vec::new();
@@ -459,7 +431,7 @@ mod tests {
     #[test]
     fn test_skip_tokens_basic() {
         let mut filter = SkipTokensFilter::new(2);
-        filter.tokenizer = Some(make_tokenizer());
+        filter.tokenizer = make_tokenizer();
 
         let input = b"The quick brown fox";
         let mut output = Vec::new();
@@ -473,7 +445,7 @@ mod tests {
     #[test]
     fn test_skip_tokens_zero() {
         let mut filter = SkipTokensFilter::new(0);
-        filter.tokenizer = Some(make_tokenizer());
+        filter.tokenizer = make_tokenizer();
 
         let input = b"Hello world";
         let mut output = Vec::new();
@@ -484,7 +456,7 @@ mod tests {
     #[test]
     fn test_tail_tokens_basic() {
         let mut filter = TailTokensFilter::new(2);
-        filter.tokenizer = Some(make_tokenizer());
+        filter.tokenizer = make_tokenizer();
 
         let input = b"The quick brown fox jumps over the lazy dog";
         let mut output = Vec::new();
@@ -499,7 +471,7 @@ mod tests {
     #[test]
     fn test_tail_tokens_zero() {
         let mut filter = TailTokensFilter::new(0);
-        filter.tokenizer = Some(make_tokenizer());
+        filter.tokenizer = make_tokenizer();
 
         let input = b"Hello world";
         let mut output = Vec::new();
@@ -479,6 +479,71 @@ where
         vec![self.meta_type().to_string()]
     }
 
+    /// Returns a description of this plugin for display in config templates.
+    ///
+    /// # Returns
+    ///
+    /// A description string (empty by default).
+    fn description(&self) -> &str {
+        ""
+    }
+
+    /// Builds the schema for this plugin from its options and outputs.
+    ///
+    /// Default implementation infers option types from YAML values and
+    /// collects enabled outputs.
+    ///
+    /// # Returns
+    ///
+    /// A `PluginSchema` describing this plugin's configuration.
+    fn schema(&self) -> crate::common::schema::PluginSchema {
+        use crate::common::schema::{OptionSchema, OptionType, OutputSchema, PluginSchema};
+
+        let options: Vec<OptionSchema> = self
+            .options()
+            .iter()
+            .map(|(key, value)| {
+                let option_type = OptionType::from_yaml_value(value);
+                let (default, required) = if value.is_null() {
+                    (None, true)
+                } else {
+                    (Some(value.clone()), false)
+                };
+                OptionSchema {
+                    name: key.clone(),
+                    option_type,
+                    default,
+                    required,
+                }
+            })
+            .collect();
+
+        let mut outputs: Vec<OutputSchema> = Vec::new();
+        for (key, value) in self.outputs() {
+            if !value.is_null() {
+                outputs.push(OutputSchema {
+                    name: key.clone(),
+                    description: key.clone(),
+                });
+            }
+        }
+        if outputs.is_empty() {
+            for output_name in self.default_outputs() {
+                outputs.push(OutputSchema {
+                    name: output_name.clone(),
+                    description: output_name,
+                });
+            }
+        }
+
+        PluginSchema {
+            name: self.meta_type().to_string(),
+            description: self.description().to_string(),
+            options,
+            outputs,
+        }
+    }
+
     /// Method to downcast to concrete type (for checking finalization state).
     ///
     /// # Returns
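Note: with schema() and description() as defaulted trait methods, template generation reduces to a loop over the gathered schemas; a sketch of the consuming side (the function name is illustrative):

fn print_config_template() {
    for schema in crate::common::schema::gather_meta_plugin_schemas() {
        println!("# {}: {}", schema.name, schema.description);
        for opt in &schema.options {
            println!("#   option {} ({:?}, required: {})", opt.name, opt.option_type, opt.required);
        }
    }
}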
@@ -1,7 +1,7 @@
 use crate::common::PIPESIZE;
 use crate::common::is_binary::is_binary;
 use crate::meta_plugin::{MetaPlugin, MetaPluginResponse, MetaPluginType};
-use crate::tokenizer::{TokenEncoding, Tokenizer};
+use crate::tokenizer::{TokenEncoding, get_tokenizer};
 
 #[derive(Debug, Clone)]
 pub struct TokensMetaPlugin {
@@ -15,8 +15,8 @@ pub struct TokensMetaPlugin {
     /// UTF-8 boundary carry buffer.
     utf8_buffer: Vec<u8>,
     base: crate::meta_plugin::BaseMetaPlugin,
-    /// The tokenizer instance.
-    tokenizer: Tokenizer,
+    /// The tokenizer encoding.
+    encoding: TokenEncoding,
 }
 
 impl TokensMetaPlugin {
@@ -59,8 +59,6 @@ impl TokensMetaPlugin {
             .and_then(|s| s.parse::<TokenEncoding>().ok())
             .unwrap_or_default();
 
-        let tokenizer = Tokenizer::new(encoding).expect("Failed to create tokenizer");
-
         Self {
             buffer: Some(Vec::new()),
             max_buffer_size,
@@ -69,7 +67,7 @@ impl TokensMetaPlugin {
             token_count: 0,
             utf8_buffer: Vec::new(),
             base,
-            tokenizer,
+            encoding,
         }
     }
 
@@ -77,36 +75,59 @@ impl TokensMetaPlugin {
     ///
     /// Combines with any pending UTF-8 carry bytes, converts to text,
     /// and adds the token count to the running total.
+    ///
+    /// Avoids unnecessary allocations when there is no pending UTF-8 carry
+    /// and the data is valid UTF-8.
     fn count_tokens(&mut self, data: &[u8]) {
         if data.is_empty() && self.utf8_buffer.is_empty() {
             return;
         }
 
-        let combined = if !self.utf8_buffer.is_empty() {
-            let mut c = self.utf8_buffer.clone();
-            c.extend_from_slice(data);
-            c
-        } else {
-            data.to_vec()
-        };
-        self.utf8_buffer.clear();
+        let tokenizer = get_tokenizer(self.encoding);
 
-        let text = match std::str::from_utf8(&combined) {
-            Ok(t) => t,
-            Err(e) => {
-                let valid = e.valid_up_to();
-                if valid < combined.len() {
-                    self.utf8_buffer.extend_from_slice(&combined[valid..]);
-                }
-                match std::str::from_utf8(&combined[..valid]) {
-                    Ok(t) => t,
-                    Err(_) => return,
-                }
-            }
-        };
+        if self.utf8_buffer.is_empty() {
+            // Fast path: no pending carry — try to use data directly
+            match std::str::from_utf8(data) {
+                Ok(text) => {
+                    if !text.is_empty() {
+                        self.token_count += tokenizer.count(text);
+                    }
+                    return;
+                }
+                Err(e) => {
+                    let valid_up_to = e.valid_up_to();
+                    if valid_up_to > 0 {
+                        // Count the valid prefix without copying
+                        let text =
+                            std::str::from_utf8(&data[..valid_up_to]).expect("validated prefix");
+                        self.token_count += tokenizer.count(text);
+                    }
+                    // Save invalid trailing bytes for next call
+                    self.utf8_buffer.extend_from_slice(&data[valid_up_to..]);
+                    return;
+                }
+            }
+        }
 
-        if !text.is_empty() {
-            self.token_count += self.tokenizer.count(text);
+        // Slow path: pending carry bytes — must build combined buffer
+        let mut combined = std::mem::take(&mut self.utf8_buffer);
+        combined.extend_from_slice(data);
+
+        match std::str::from_utf8(&combined) {
+            Ok(text) => {
+                if !text.is_empty() {
+                    self.token_count += tokenizer.count(text);
+                }
+            }
+            Err(e) => {
+                let valid_up_to = e.valid_up_to();
+                if valid_up_to > 0 {
+                    let text =
+                        std::str::from_utf8(&combined[..valid_up_to]).expect("validated prefix");
+                    self.token_count += tokenizer.count(text);
+                }
+                self.utf8_buffer.extend_from_slice(&combined[valid_up_to..]);
+            }
         }
     }
 
@@ -149,8 +170,8 @@ impl MetaPlugin for TokensMetaPlugin {
         };
 
         if should_detect {
-            let buf_clone = self.buffer.as_ref().unwrap().clone();
-            let is_binary = self.detect_binary(&buf_clone);
+            let buffer_data = self.buffer.as_ref().unwrap().clone();
+            let is_binary = self.detect_binary(&buffer_data);
 
             if is_binary {
                 if let Some(md) = crate::meta_plugin::process_metadata_outputs(
@@ -168,19 +189,10 @@ impl MetaPlugin for TokensMetaPlugin {
                 };
             }
 
-            // It's text — tokenize the full accumulated buffer
-            self.count_tokens(&buf_clone);
-
-            if buf_clone.len() >= self.max_buffer_size {
-                self.buffer = None;
-            }
-        } else if self.buffer.is_some() {
-            // Still building up buffer — tokenize what was just added
-            let remaining = self
-                .max_buffer_size
-                .saturating_sub(self.buffer.as_ref().map_or(0, |b| b.len()));
-            let to_take = std::cmp::min(data.len(), remaining);
-            self.count_tokens(&data[..to_take]);
+            // It's text — tokenize the full buffer (nothing was counted yet),
+            // then clear to avoid double-counting in finalize().
+            self.count_tokens(&buffer_data);
+            self.buffer = Some(Vec::new());
         }
     } else if self.is_binary_content == Some(false) {
         self.count_tokens(data);
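Note: the bug fixed above: the old path counted the buffered prefix when binary detection completed but kept the buffer whenever it stayed below max_buffer_size, so finalize() counted the same bytes a second time. Resetting the buffer after counting makes finalize() a no-op for already-counted bytes. A standalone model:

fn main() {
    let mut token_count = 0;
    let mut buffer = vec![1u8, 2, 3]; // pretend one token per byte

    // Old behavior: count at detection time, keep the buffer...
    token_count += buffer.len();
    // ...then finalize() counts the same buffered bytes again.
    token_count += buffer.len();
    assert_eq!(token_count, 6); // double-counted

    // Fixed behavior: reset the buffer after the first count.
    token_count = buffer.len();
    buffer.clear();
    token_count += buffer.len(); // finalize() adds nothing
    assert_eq!(token_count, 3);
}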
@@ -212,8 +224,8 @@ impl MetaPlugin for TokensMetaPlugin {
         if self.is_binary_content.is_none() {
             if let Some(buffer) = &self.buffer {
                 if !buffer.is_empty() {
-                    let buf_clone = buffer.clone();
-                    let is_binary = self.detect_binary(&buf_clone);
+                    let buffer_data = buffer.clone();
+                    let is_binary = self.detect_binary(&buffer_data);
 
                     if is_binary {
                         if let Some(md) = crate::meta_plugin::process_metadata_outputs(
@@ -234,6 +246,12 @@ impl MetaPlugin for TokensMetaPlugin {
             }
         }
 
+        // Tokenize any bytes in the buffer
+        if let Some(buffer) = &self.buffer {
+            let data = buffer.clone();
+            self.count_tokens(&data);
+        }
+
         // Process any remaining UTF-8 bytes
         if !self.utf8_buffer.is_empty() {
             self.count_tokens(&[]);
@@ -1,81 +1,17 @@
|
|||||||
use crate::meta_plugin::MetaPlugin;
|
|
||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
use clap::Command;
|
use clap::Command;
|
||||||
use serde::{Deserialize, Serialize};
|
use std::collections::HashMap;
|
||||||
use serde_yaml;
|
use strum::IntoEnumIterator;
|
||||||
|
|
||||||
/// Mode for generating a default configuration file.
|
use crate::common::schema::{gather_filter_plugin_schemas, gather_meta_plugin_schemas};
|
||||||
///
|
use crate::compression_engine::CompressionType;
|
||||||
/// This module creates a commented YAML template with default values for settings,
|
use crate::config;
|
||||||
/// including list format, server config, compression, and meta plugins.
|
|
||||||
|
|
||||||
#[derive(Debug, Serialize, Deserialize)]
|
|
||||||
/// Default configuration structure for the generated template.
|
|
||||||
///
|
|
||||||
/// Includes core settings, list formatting, server options, compression, and meta plugins.
|
|
||||||
struct DefaultConfig {
|
|
||||||
dir: Option<String>,
|
|
||||||
list_format: Vec<ColumnConfig>,
|
|
||||||
human_readable: bool,
|
|
||||||
output_format: Option<String>,
|
|
||||||
quiet: bool,
|
|
||||||
force: bool,
|
|
||||||
server: Option<ServerConfig>,
|
|
||||||
compression_plugin: Option<CompressionPluginConfig>,
|
|
||||||
meta_plugins: Option<Vec<MetaPluginConfig>>,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Serialize, Deserialize)]
|
|
||||||
/// Configuration for a column in the list format.
|
|
||||||
struct ColumnConfig {
|
|
||||||
name: String,
|
|
||||||
label: Option<String>,
|
|
||||||
#[serde(default)]
|
|
||||||
align: ColumnAlignment,
|
|
||||||
#[serde(default)]
|
|
||||||
max_len: Option<String>,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Serialize, Deserialize, Default)]
|
|
||||||
#[serde(rename_all = "lowercase")]
|
|
||||||
/// Alignment options for table columns.
|
|
||||||
enum ColumnAlignment {
|
|
||||||
#[default]
|
|
||||||
Left,
|
|
||||||
Right,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Serialize, Deserialize)]
|
|
||||||
/// Server configuration options.
|
|
||||||
struct ServerConfig {
|
|
||||||
address: Option<String>,
|
|
||||||
port: Option<u16>,
|
|
||||||
password_file: Option<String>,
|
|
||||||
password: Option<String>,
|
|
||||||
password_hash: Option<String>,
|
|
||||||
cors_origin: Option<String>,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Serialize, Deserialize)]
|
|
||||||
/// Configuration for the compression plugin.
|
|
||||||
struct CompressionPluginConfig {
|
|
||||||
name: String,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Serialize, Deserialize)]
|
|
||||||
/// Configuration for a meta plugin.
|
|
||||||
struct MetaPluginConfig {
|
|
||||||
name: String,
|
|
||||||
#[serde(default)]
|
|
||||||
options: std::collections::HashMap<String, serde_yaml::Value>,
|
|
||||||
#[serde(default)]
|
|
||||||
outputs: std::collections::HashMap<String, String>,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Generates and prints a default commented YAML configuration template.
|
/// Generates and prints a default commented YAML configuration template.
|
||||||
///
|
///
|
||||||
/// Creates instances of available meta plugins to populate default options and outputs,
|
/// Discovers all registered meta plugins, filter plugins, and compression engines
|
||||||
/// then serializes the config to YAML with all lines commented for easy editing.
|
/// at runtime via the plugin schema system. Outputs a commented YAML template
|
||||||
|
/// with all available plugins and their default options/outputs.
|
||||||
///
|
///
|
||||||
/// # Arguments
|
/// # Arguments
|
||||||
///
|
///
|
||||||
@@ -85,153 +21,244 @@ struct MetaPluginConfig {
|
|||||||
/// # Returns
|
/// # Returns
|
||||||
///
|
///
|
||||||
/// `Ok(())` on success.
|
/// `Ok(())` on success.
|
||||||
///
|
|
||||||
/// # Examples
|
|
||||||
///
|
|
||||||
/// ```ignore
|
|
||||||
/// // Example usage requires Command and Settings instances
|
|
||||||
/// mode_generate_config(&mut cmd, &settings)?;
|
|
||||||
/// ```
|
|
||||||
pub fn mode_generate_config(_cmd: &mut Command, _settings: &crate::config::Settings) -> Result<()> {
|
pub fn mode_generate_config(_cmd: &mut Command, _settings: &crate::config::Settings) -> Result<()> {
|
||||||
-    // Create instances of each meta plugin to get their default options and outputs
-    let cwd_plugin = crate::meta_plugin::cwd::CwdMetaPlugin::new(None, None);
-    let digest_plugin = crate::meta_plugin::digest::DigestMetaPlugin::new(None, None);
-    let hostname_plugin = crate::meta_plugin::hostname::HostnameMetaPlugin::new(None, None);
-    #[cfg(feature = "magic")]
-    let magic_file_plugin = crate::meta_plugin::magic_file::MagicFileMetaPlugin::new(None, None);
-    let env_plugin = crate::meta_plugin::env::EnvMetaPlugin::new(None, None);
-
-    // Create a default configuration
-    let default_config = DefaultConfig {
-        dir: Some("~/.local/share/keep".to_string()),
-        list_format: vec![
-            ColumnConfig {
-                name: "id".to_string(),
-                label: Some("Item".to_string()),
-                align: ColumnAlignment::Right,
-                max_len: None,
-            },
-            ColumnConfig {
-                name: "time".to_string(),
-                label: Some("Time".to_string()),
-                align: ColumnAlignment::Right,
-                max_len: None,
-            },
-            ColumnConfig {
-                name: "size".to_string(),
-                label: Some("Size".to_string()),
-                align: ColumnAlignment::Right,
-                max_len: None,
-            },
-            ColumnConfig {
-                name: "tags".to_string(),
-                label: Some("Tags".to_string()),
-                align: ColumnAlignment::Left,
-                max_len: Some("40".to_string()),
-            },
-            ColumnConfig {
-                name: "meta:hostname_full".to_string(),
-                label: Some("Hostname".to_string()),
-                align: ColumnAlignment::Left,
-                max_len: Some("28".to_string()),
-            },
-        ],
-        human_readable: false,
-        output_format: Some("table".to_string()),
-        quiet: false,
-        force: false,
-        server: Some(ServerConfig {
-            address: Some("127.0.0.1".to_string()),
-            port: Some(8080),
-            password_file: None,
-            password: None,
-            password_hash: None,
-            cors_origin: None,
-        }),
-        compression_plugin: None,
-        meta_plugins: Some(vec![
-            MetaPluginConfig {
-                name: "cwd".to_string(),
-                options: cwd_plugin.options().clone(),
-                outputs: convert_outputs_to_string_map(cwd_plugin.outputs()),
-            },
-            MetaPluginConfig {
-                name: "digest".to_string(),
-                options: digest_plugin.options().clone(),
-                outputs: convert_outputs_to_string_map(digest_plugin.outputs()),
-            },
-            MetaPluginConfig {
-                name: "hostname".to_string(),
-                options: hostname_plugin.options().clone(),
-                outputs: convert_outputs_to_string_map(hostname_plugin.outputs()),
-            },
-            #[cfg(feature = "magic")]
-            MetaPluginConfig {
-                name: "magic_file".to_string(),
-                options: magic_file_plugin.options().clone(),
-                outputs: convert_outputs_to_string_map(magic_file_plugin.outputs()),
-            },
-            MetaPluginConfig {
-                name: "env".to_string(),
-                options: env_plugin.options().clone(),
-                outputs: convert_outputs_to_string_map(env_plugin.outputs()),
-            },
-        ]),
-    };
-
-    // Serialize to YAML and comment out all lines
-    let yaml = serde_yaml::to_string(&default_config)?;
-
-    // Comment out every line
-    let commented_yaml = yaml
-        .lines()
-        .map(|line| {
-            if line.trim().is_empty() {
-                line.to_string()
-            } else {
-                format!("# {line}")
-            }
-        })
-        .collect::<Vec<String>>()
-        .join("\n");
-
-    println!("{commented_yaml}");
+    let meta_schemas = gather_meta_plugin_schemas();
+    let filter_schemas = gather_filter_plugin_schemas();
+
+    // Build list_format defaults matching config.rs
+    let list_format = default_list_format();
+
+    // Build meta_plugins with env as the default (active), rest commented
+    let meta_plugins = build_meta_plugins_section(&meta_schemas);
+
+    // Build the full YAML
+    let mut lines = Vec::with_capacity(128);
+
+    lines.push("# Keep configuration file".to_string());
+    lines.push("# Uncomment and modify the settings you need.".to_string());
+    lines.push(String::new());
+
+    // Core settings
+    lines.push("# Data directory for storing items".to_string());
+    lines.push("dir: ~/.local/share/keep".to_string());
+    lines.push(String::new());
+
+    // List format
+    lines.push("# Column configuration for --list output".to_string());
+    lines.push("list_format:".to_string());
+    for col in &list_format {
+        lines.push(format!("  - name: {}", col.name));
+        lines.push(format!("    label: {}", col.label));
+        lines.push(format!("    align: {}", col.align));
+    }
+    lines.push(String::new());
+
+    // Table config
+    lines.push("# Table display configuration".to_string());
+    lines.push("#table_config:".to_string());
+    lines.push("#  style: nothing".to_string());
+    lines.push("#  modifiers: []".to_string());
+    lines.push("#  content_arrangement: dynamic".to_string());
+    lines.push("#  truncination_indicator: \"\"".to_string());
+    lines.push(String::new());
+
+    // Other settings
+    lines.push("human_readable: false".to_string());
+    lines.push("output_format: table".to_string());
+    lines.push("quiet: false".to_string());
+    lines.push("force: false".to_string());
+    lines.push(String::new());
+
+    // Server config
+    lines.push("# Server configuration (only used with --server)".to_string());
+    lines.push("server:".to_string());
+    lines.push("  address: 127.0.0.1".to_string());
+    lines.push("  port: 8080".to_string());
+    lines.push("#  username: keep".to_string());
+    lines.push("#  password: null".to_string());
+    lines.push("#  password_file: null".to_string());
+    lines.push("#  password_hash: null".to_string());
+    lines.push("#  jwt_secret: null".to_string());
+    lines.push("#  jwt_secret_file: null".to_string());
+    lines.push("#  cert_file: null".to_string());
+    lines.push("#  key_file: null".to_string());
+    lines.push("#  cors_origin: null".to_string());
+    lines.push(String::new());
+
+    // Compression plugin
+    lines.push("# Compression plugin to use".to_string());
+    lines.push("#compression_plugin:".to_string());
+    let mut comp_types: Vec<String> = CompressionType::iter().map(|ct| ct.to_string()).collect();
+    comp_types.sort();
+    for ct in &comp_types {
+        lines.push(format!("#  name: {ct} # {}", compression_description(ct)));
+    }
+    lines.push(String::new());
+
+    // Meta plugins
+    lines.push("# Meta plugins to run when saving items".to_string());
+    lines.push("meta_plugins:".to_string());
+    for line in &meta_plugins {
+        lines.push(line.clone());
+    }
+    lines.push(String::new());
+
+    // Filter plugins reference
+    if !filter_schemas.is_empty() {
+        lines.push("# Available filter plugins (use with --filter)".to_string());
+        for schema in &filter_schemas {
+            lines.push(format!("# {}", schema.name));
+            if !schema.description.is_empty() {
+                lines.push(format!("#   {}", schema.description));
+            }
+            for opt in &schema.options {
+                let req = if opt.required { "required" } else { "optional" };
+                lines.push(format!(
+                    "#   {} ({:?}, {})",
+                    opt.name, opt.option_type, req
+                ));
+            }
+        }
+        lines.push(String::new());
+    }
+
+    // Client config
+    lines.push("# Client configuration (requires client feature)".to_string());
+    lines.push("#client:".to_string());
+    lines.push("#  url: null".to_string());
+    lines.push("#  username: null".to_string());
+    lines.push("#  password: null".to_string());
+    lines.push("#  jwt: null".to_string());
+
+    // Print
+    for line in &lines {
+        println!("{line}");
+    }

    Ok(())
}
-/// Helper function to convert outputs from serde_yaml::Value to String.
-///
-/// Handles null (uses key), strings, and other values by serializing to YAML string.
-///
-/// # Arguments
-///
-/// * `outputs` - Reference to the outputs HashMap.
-///
-/// # Returns
-///
-/// A HashMap with string keys and values.
-fn convert_outputs_to_string_map(
-    outputs: &std::collections::HashMap<String, serde_yaml::Value>,
-) -> std::collections::HashMap<String, String> {
-    let mut result = std::collections::HashMap::new();
-    for (key, value) in outputs {
-        match value {
-            serde_yaml::Value::Null => {
-                // For null, use the key as the value
-                result.insert(key.clone(), key.clone());
-            }
-            serde_yaml::Value::String(s) => {
-                result.insert(key.clone(), s.clone());
-            }
-            _ => {
-                // Convert other values to their YAML string representation
-                result.insert(
-                    key.clone(),
-                    serde_yaml::to_string(value).unwrap_or_default(),
-                );
-            }
-        }
-    }
-    result
-}
+struct ListColumn {
+    name: String,
+    label: String,
+    align: String,
+}
+
+fn default_list_format() -> Vec<ListColumn> {
+    vec![
+        ListColumn {
+            name: "id".into(),
+            label: "Item".into(),
+            align: "right".into(),
+        },
+        ListColumn {
+            name: "time".into(),
+            label: "Time".into(),
+            align: "right".into(),
+        },
+        ListColumn {
+            name: "size".into(),
+            label: "Size".into(),
+            align: "right".into(),
+        },
+        ListColumn {
+            name: "meta:text_line_count".into(),
+            label: "Lines".into(),
+            align: "right".into(),
+        },
+        ListColumn {
+            name: "tags".into(),
+            label: "Tags".into(),
+            align: "left".into(),
+        },
+        ListColumn {
+            name: "meta:hostname_short".into(),
+            label: "Host".into(),
+            align: "left".into(),
+        },
+        ListColumn {
+            name: "meta:command".into(),
+            label: "Command".into(),
+            align: "left".into(),
+        },
+    ]
+}
+
+fn build_meta_plugins_section(schemas: &[crate::common::schema::PluginSchema]) -> Vec<String> {
+    let mut lines = Vec::new();
+
+    for (i, schema) in schemas.iter().enumerate() {
+        let is_default = schema.name == "env";
+        let prefix = if is_default { "" } else { "# " };
+
+        if i > 0 {
+            lines.push(format!("{prefix}# --- {name} ---", name = schema.name));
+        }
+
+        lines.push(format!("{prefix}- name: {}", schema.name));
+
+        // Options
+        if !schema.options.is_empty() {
+            lines.push(format!("{prefix}  options:"));
+            for opt in &schema.options {
+                if let Some(ref default) = opt.default {
+                    let default_str = format_yaml_value(default);
+                    lines.push(format!("{prefix}    {}: {}", opt.name, default_str));
+                } else if opt.required {
+                    lines.push(format!("{prefix}    {}: null # required", opt.name));
+                }
+            }
+        } else {
+            lines.push(format!("{prefix}  options: {{}}"));
+        }
+
+        // Outputs
+        if !schema.outputs.is_empty() {
+            lines.push(format!("{prefix}  outputs:"));
+            for output in &schema.outputs {
+                lines.push(format!("{prefix}    {}: {}", output.name, output.name));
+            }
+        } else {
+            lines.push(format!("{prefix}  outputs: {{}}"));
+        }
+    }
+
+    lines
+}
+
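To illustrate the shape of the section this helper emits, here is a minimal sketch that feeds a hand-built schema through `build_meta_plugins_section`. The plugin name, option, and default value below are hypothetical examples, not real plugin defaults, and the exact indentation follows the format strings above:

```rust
// Hypothetical schema: "env" is the active default, so its lines are
// emitted without the "# " comment prefix.
let schema = crate::common::schema::PluginSchema {
    name: "env".to_string(),
    description: "Illustrative only".to_string(),
    options: vec![crate::common::schema::OptionSchema {
        name: "vars".to_string(),
        option_type: crate::common::schema::OptionType::String,
        default: Some(serde_yaml::Value::String("PATH".to_string())),
        required: false,
    }],
    outputs: vec![crate::common::schema::OutputSchema {
        name: "env".to_string(),
        description: "Illustrative only".to_string(),
    }],
};
for line in build_meta_plugins_section(&[schema]) {
    println!("{line}");
}
// Prints roughly:
// - name: env
//   options:
//     vars: PATH
//   outputs:
//     env: env
```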
+fn format_yaml_value(value: &serde_yaml::Value) -> String {
+    match value {
+        serde_yaml::Value::Null => "null".into(),
+        serde_yaml::Value::Bool(b) => b.to_string(),
+        serde_yaml::Value::Number(n) => n.to_string(),
+        serde_yaml::Value::String(s) => {
+            if s.contains(' ') || s.contains(':') || s.contains('#') {
+                format!("\"{s}\"")
+            } else {
+                s.clone()
+            }
+        }
+        serde_yaml::Value::Sequence(_) | serde_yaml::Value::Mapping(_) => {
+            serde_yaml::to_string(value)
+                .unwrap_or_default()
+                .trim()
+                .to_string()
+        }
+        serde_yaml::Value::Tagged(_) => serde_yaml::to_string(value)
+            .unwrap_or_default()
+            .trim()
+            .to_string(),
+    }
+}
+
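A quick sketch of the quoting rule above: plain scalars pass through bare, while strings containing a space, colon, or hash are wrapped in double quotes so the generated YAML stays parseable:

```rust
use serde_yaml::Value;

// Bare scalars pass through unchanged.
assert_eq!(format_yaml_value(&Value::String("plain".into())), "plain");
assert_eq!(format_yaml_value(&Value::Bool(true)), "true");
assert_eq!(format_yaml_value(&Value::Null), "null");
// A colon inside the string would break the key: value line, so it is quoted.
assert_eq!(format_yaml_value(&Value::String("a: b".into())), "\"a: b\"");
```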
+fn compression_description(name: &str) -> &str {
+    match name {
+        "lz4" => "Fast compression (native)",
+        "gzip" => "Good compression ratio (native)",
+        "bzip2" => "High compression (requires bzip2 binary)",
+        "xz" => "Very high compression (requires xz binary)",
+        "zstd" => "Modern fast compression (requires zstd binary)",
+        "none" => "No compression",
+        _ => "",
+    }
}
@@ -1,4 +1,5 @@
use anyhow::{Result, bail};
+use once_cell::sync::Lazy;

/// Supported LLM token encodings.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
@@ -46,6 +47,25 @@ impl std::fmt::Debug for Tokenizer {
    }
}
+
+/// Static tokenizer instances — loaded once per process, shared across all plugins.
+static CL100K: Lazy<Tokenizer> = Lazy::new(|| {
+    Tokenizer::new(TokenEncoding::Cl100kBase).expect("Failed to create cl100k_base tokenizer")
+});
+static O200K: Lazy<Tokenizer> = Lazy::new(|| {
+    Tokenizer::new(TokenEncoding::O200kBase).expect("Failed to create o200k_base tokenizer")
+});
+
+/// Returns a reference to a cached tokenizer for the given encoding.
+///
+/// The BPE vocabulary is loaded once per encoding and reused for the
+/// lifetime of the process.
+pub fn get_tokenizer(encoding: TokenEncoding) -> &'static Tokenizer {
+    match encoding {
+        TokenEncoding::Cl100kBase => &CL100K,
+        TokenEncoding::O200kBase => &O200K,
+    }
+}

impl Tokenizer {
    /// Creates a new tokenizer for the specified encoding.
    pub fn new(encoding: TokenEncoding) -> Result<Self> {
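As a usage sketch of the caching behavior: repeated lookups for the same encoding return a reference to the same `'static` instance, so the BPE vocabulary load cost is paid once per process rather than once per plugin.

```rust
let a = get_tokenizer(TokenEncoding::Cl100kBase);
let b = get_tokenizer(TokenEncoding::Cl100kBase);
// Both references point at the same Lazy-initialized static; the second
// call performs no BPE loading.
assert!(std::ptr::eq(a, b));
```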
@@ -74,6 +94,37 @@ impl Tokenizer {
        self.bpe.split_by_token(text, false)
    }
+
+    /// Returns an iterator over decoded token strings.
+    ///
+    /// Lazily produces token strings without allocating a Vec for all tokens.
+    /// Use this when you only need the first N tokens (e.g., head/skip filters).
+    pub fn split_by_token_iter<'a>(
+        &'a self,
+        text: &'a str,
+    ) -> impl Iterator<Item = Result<String>> + 'a {
+        self.bpe.split_by_token_iter(text, false)
+    }
+
+    /// Counts tokens up to `max_tokens` and returns `(token_count, byte_position)`.
+    ///
+    /// Uses an iterator to stop early, avoiding allocation of token strings
+    /// beyond `max_tokens`. The byte_position is in the lossy UTF-8 encoding
+    /// of `text` — use `map_lossy_pos_to_bytes` to map back to original bytes.
+    pub fn count_bounded(&self, text: &str, max_tokens: usize) -> (usize, usize) {
+        let mut count = 0usize;
+        let mut byte_pos = 0usize;
+        for token_str in self.bpe.split_by_token_iter(text, false) {
+            if let Ok(s) = token_str {
+                byte_pos += s.len();
+            }
+            count += 1;
+            if count >= max_tokens {
+                break;
+            }
+        }
+        (count, byte_pos)
+    }

    /// Decodes a slice of token IDs back into a string.
    pub fn decode(&self, tokens: &[u32]) -> Result<String> {
        self.bpe.decode(tokens.to_vec())
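A short usage sketch of the two new methods together; the input string and bounds are arbitrary examples:

```rust
let tok = get_tokenizer(TokenEncoding::O200kBase);

// Early-exit counting: stops after at most 4 tokens instead of
// tokenizing the entire input.
let (count, byte_pos) = tok.count_bounded("some reasonably long input", 4);
assert!(count <= 4);
// byte_pos marks where counting stopped, in the lossy UTF-8 view of the text.

// Lazy iteration: take the first 3 token strings without collecting
// every token into a Vec first.
let head = tok
    .split_by_token_iter("some reasonably long input")
    .take(3)
    .collect::<anyhow::Result<Vec<_>>>()
    .expect("tokenization failed");
assert!(head.len() <= 3);
```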