diff --git a/CHANGELOG.md b/CHANGELOG.md index e1700cf..43c97fd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,30 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- Database index on `items(ts)` column for faster ORDER BY sorting + +### Changed + +- Filter plugins check size before loading content into memory (prevents OOM on large inputs) +- `#[inline]` on HTML escape helper functions (`esc`, `esc_attr`) for hot path performance +- Removed `once_cell` crate (replaced with `std::sync::LazyLock` from Rust 1.80) +- Removed `lazy_static` crate (replaced with `std::sync::LazyLock`) + +### Fixed + +- CLI help text typo: "metatdata" → "metadata" in `--get` and `--info` descriptions + +### Refactored + +- Added module-level documentation to `services/` module + +### Documentation + +- README.md: Fixed compression table — zstd is native (not external), "none" renamed to "raw" +- DESIGN.md: Updated schema to reflect current `items` table columns and meta plugin inventory + ## [0.1.0] - 2026-03-21 ### Added diff --git a/Cargo.lock b/Cargo.lock index 3bd2ad3..3cdb4af 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1727,7 +1727,6 @@ dependencies = [ "inventory", "is-terminal", "jsonwebtoken", - "lazy_static", "libc", "local-ip-address", "log", @@ -1735,7 +1734,6 @@ dependencies = [ "magic", "md5", "nix", - "once_cell", "os_pipe", "pest", "pest_derive", diff --git a/Cargo.toml b/Cargo.toml index e4fac36..778b60d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -35,7 +35,6 @@ hyper = { version = "1.0", features = ["full"] } http-body-util = "0.1" inventory = "0.3" is-terminal = "0.4" -lazy_static = "1.5" libc = "0.2" local-ip-address = "0.6" log = "0.4" @@ -45,7 +44,6 @@ magic = { version = "0.13", optional = true } infer = { version = "0.19", optional = true } tree_magic_mini = { version = "3.2", optional = true } nix = { version = "0.30", features = ["fs", "process"] } -once_cell = "1.21" comfy-table = "7.2" 
pwhash = "1.0" regex = "1.10" diff --git a/DESIGN.md b/DESIGN.md index 610aca1..2cdaa52 100644 --- a/DESIGN.md +++ b/DESIGN.md @@ -117,7 +117,7 @@ ## Data Storage ### Database Schema -- `items` table: id (primary key), ts (timestamp), size (optional), compression +- `items` table: id (primary key), ts (timestamp), uncompressed_size (optional), compressed_size (optional), closed (boolean), compression - `tags` table: id (foreign key to items), name (tag name) - `metas` table: id (foreign key to items), name (meta key), value (meta value) - Indexes on tag names and meta names for faster queries @@ -178,26 +178,25 @@ - None (no compression) ## Supported Meta Plugins -- FileMagic - File type detection using file command -- FileMime - MIME type detection using file command -- FileEncoding - File encoding detection using file command -- LineCount - Line count using wc command -- WordCount - Word count using wc command -- Cwd - Current working directory -- Binary - Binary file detection -- Uid - Current user ID -- User - Current username -- Gid - Current group ID -- Group - Current group name -- Shell - Shell path from SHELL environment variable -- ShellPid - Shell process ID from PPID environment variable -- KeepPid - Keep process ID -- DigestSha256 - SHA-256 digest -- DigestMd5 - MD5 digest using md5sum command -- ReadTime - Time taken to read data -- ReadRate - Rate of data reading -- Hostname - System hostname -- FullHostname - Fully qualified domain name + +Meta plugins collect metadata during item save. 
Each plugin produces one or more key-value pairs: + +- `magic_file` - File type detection using libmagic (when `magic` feature enabled) +- `infer` - MIME type detection using infer crate (when `infer` feature enabled) +- `tree_magic_mini` - MIME type detection using tree_magic_mini (when `tree_magic_mini` feature enabled) +- `tokens` - LLM token counting using tiktoken (when `tokens` feature enabled) +- `text` - Text analysis: line count, word count, char count, line average length +- `digest` - SHA-256 and MD5 checksums +- `hostname` - System hostname (full and short) +- `cwd` - Current working directory +- `user` - Current username and UID +- `shell` - Shell path from SHELL environment variable +- `shell_pid` - Shell process ID from PPID +- `keep_pid` - Keep process ID +- `env` - Arbitrary environment variables (via `KEEP_META_ENV_*` prefix) +- `exec` - Execute external commands for custom metadata +- `read_time` - Time taken to read content +- `read_rate` - Content read rate (bytes/second) ## Testing Strategy - Unit tests for each module in `src/tests/` diff --git a/README.md b/README.md index 7530752..561687f 100644 --- a/README.md +++ b/README.md @@ -345,8 +345,8 @@ Items are compressed automatically on save. Default: LZ4. 
| `gzip` | Internal | Fast | Good | | `bzip2` | External | Slow | Better | | `xz` | External | Slowest | Best | -| `zstd` | External | Fast | Good | -| `none` | Internal | N/A | N/A | +| `zstd` | Internal | Fast | Good | +| `raw` | Internal | N/A | N/A | ```sh # Specify compression per item diff --git a/src/args.rs b/src/args.rs index f9fec4a..2da294f 100644 --- a/src/args.rs +++ b/src/args.rs @@ -29,9 +29,7 @@ pub struct ModeArgs { pub save: bool, #[arg(group("mode"), help_heading("Mode Options"), short, long, conflicts_with_all(["save", "diff", "list", "delete", "info", "update", "status", "export", "import"]))] - #[arg(help( - "Get an item either by it's ID or by a combination of matching tags and metatdata" - ))] + #[arg(help("Get an item either by its ID or by a combination of matching tags and metadata"))] pub get: bool, #[arg(group("mode"), help_heading("Mode Options"), long, conflicts_with_all(["save", "get", "list", "delete", "info", "update", "status", "export", "import"]))] @@ -48,9 +46,7 @@ pub struct ModeArgs { pub delete: bool, #[arg(group("mode"), help_heading("Mode Options"), short, long, conflicts_with_all(["save", "get", "diff", "list", "delete", "update", "status", "export", "import"]))] - #[arg(help( - "Get an item either by it's ID or by a combination of matching tags and metatdata" - ))] + #[arg(help("Get an item either by its ID or by a combination of matching tags and metadata"))] pub info: bool, #[arg(group("mode"), help_heading("Mode Options"), short('u'), long, conflicts_with_all(["save", "get", "diff", "list", "delete", "info", "status", "export", "import"]))] diff --git a/src/compression_engine/mod.rs b/src/compression_engine/mod.rs index 1a31f5a..3772ea7 100644 --- a/src/compression_engine/mod.rs +++ b/src/compression_engine/mod.rs @@ -7,8 +7,6 @@ use strum::{Display, EnumIter, EnumString}; use log::*; -use lazy_static::lazy_static; - extern crate enum_map; use enum_map::enum_map; use enum_map::{Enum, EnumMap}; @@ -180,63 +178,65 @@ 
impl Clone for Box { } } -lazy_static! { - static ref COMPRESSION_ENGINES: EnumMap> = { - #[allow(unused_mut)] // mut needed when gzip/lz4 features are enabled - let mut em = enum_map! { - CompressionType::LZ4 => Box::new(crate::compression_engine::program::CompressionEngineProgram::new( - "lz4", - vec!["-c"], - vec!["-d", "-c"] - )) as Box, - CompressionType::GZip => Box::new(crate::compression_engine::program::CompressionEngineProgram::new( - "gzip", - vec!["-c"], - vec!["-d", "-c"] - )) as Box, - CompressionType::BZip2 => Box::new(crate::compression_engine::program::CompressionEngineProgram::new( - "bzip2", - vec!["-c"], - vec!["-d", "-c"] - )) as Box, - CompressionType::XZ => Box::new(crate::compression_engine::program::CompressionEngineProgram::new( - "xz", - vec!["-c"], - vec!["-d", "-c"] - )) as Box, - CompressionType::ZStd => Box::new(crate::compression_engine::program::CompressionEngineProgram::new( - "zstd", - vec!["-c"], - vec!["-d", "-c"] - )) as Box, - CompressionType::Raw => Box::new(crate::compression_engine::raw::CompressionEngineRaw::new()) as Box - }; - - #[cfg(feature = "gzip")] - { - em[CompressionType::GZip] = - Box::new(crate::compression_engine::gzip::CompressionEngineGZip::new()) - as Box; - } - - #[cfg(feature = "lz4")] - { - em[CompressionType::LZ4] = - Box::new(crate::compression_engine::lz4::CompressionEngineLZ4::new()) - as Box; - } - - #[cfg(feature = "zstd")] - { - em[CompressionType::ZStd] = - Box::new(crate::compression_engine::zstd::CompressionEngineZstd::new()) - as Box; - } - - em +fn init_compression_engines() -> EnumMap> { + #[allow(unused_mut)] + let mut em: EnumMap> = enum_map! 
{ + CompressionType::LZ4 => Box::new(crate::compression_engine::program::CompressionEngineProgram::new( + "lz4", + vec!["-c"], + vec!["-d", "-c"] + )) as Box, + CompressionType::GZip => Box::new(crate::compression_engine::program::CompressionEngineProgram::new( + "gzip", + vec!["-c"], + vec!["-d", "-c"] + )) as Box, + CompressionType::BZip2 => Box::new(crate::compression_engine::program::CompressionEngineProgram::new( + "bzip2", + vec!["-c"], + vec!["-d", "-c"] + )) as Box, + CompressionType::XZ => Box::new(crate::compression_engine::program::CompressionEngineProgram::new( + "xz", + vec!["-c"], + vec!["-d", "-c"] + )) as Box, + CompressionType::ZStd => Box::new(crate::compression_engine::program::CompressionEngineProgram::new( + "zstd", + vec!["-c"], + vec!["-d", "-c"] + )) as Box, + CompressionType::Raw => Box::new(crate::compression_engine::raw::CompressionEngineRaw::new()) as Box }; + + #[cfg(feature = "gzip")] + { + em[CompressionType::GZip] = + Box::new(crate::compression_engine::gzip::CompressionEngineGZip::new()) + as Box; + } + + #[cfg(feature = "lz4")] + { + em[CompressionType::LZ4] = + Box::new(crate::compression_engine::lz4::CompressionEngineLZ4::new()) + as Box; + } + + #[cfg(feature = "zstd")] + { + em[CompressionType::ZStd] = + Box::new(crate::compression_engine::zstd::CompressionEngineZstd::new()) + as Box; + } + + em } +static COMPRESSION_ENGINES: std::sync::LazyLock< + EnumMap>, +> = std::sync::LazyLock::new(init_compression_engines); + pub fn default_compression_type() -> CompressionType { CompressionType::LZ4 } diff --git a/src/db.rs b/src/db.rs index e68eba5..8d789a1 100644 --- a/src/db.rs +++ b/src/db.rs @@ -1,6 +1,5 @@ use anyhow::{Context, Error, Result, anyhow}; use chrono::prelude::*; -use lazy_static::lazy_static; use log::*; use rusqlite::{Connection, OpenFlags, Row, params}; use rusqlite_migration::{M, Migrations}; @@ -47,25 +46,21 @@ let id = db::insert_item(&conn, item)?; ``` */ -lazy_static! 
{ -    // Database schema migrations for the Keep application. -    // -    // Defines the sequence of migrations to create and update the schema. -    // Applied automatically when opening a database connection. -    static ref MIGRATIONS: Migrations<'static> = Migrations::new(vec![ +static MIGRATIONS: std::sync::LazyLock> = std::sync::LazyLock::new(|| { +    Migrations::new(vec![ M::up( "CREATE TABLE items( id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, ts TEXT NOT NULL, size INTEGER NULL, -                compression TEXT NOT NULL)" +                compression TEXT NOT NULL)", ), M::up( "CREATE TABLE tags ( id INTEGER NOT NULL, name TEXT NOT NULL, FOREIGN KEY(id) REFERENCES items(id) ON DELETE CASCADE, -                PRIMARY KEY(id, name));" +                PRIMARY KEY(id, name));", ), M::up( "CREATE TABLE metas ( @@ -73,16 +68,17 @@ lazy_static! { name TEXT NOT NULL, value TEXT NOT NULL, FOREIGN KEY(id) REFERENCES items(id) ON DELETE CASCADE, -                PRIMARY KEY(id, name));" +                PRIMARY KEY(id, name));", ), M::up("CREATE INDEX idx_tags_name ON tags(name)"), M::up("CREATE INDEX idx_metas_name ON metas(name)"), M::up("UPDATE items SET compression = 'raw' WHERE compression = 'none'"), M::up("ALTER TABLE items RENAME COLUMN size TO uncompressed_size"), M::up("ALTER TABLE items ADD COLUMN compressed_size INTEGER NULL"), M::up("ALTER TABLE items ADD COLUMN closed BOOLEAN NOT NULL DEFAULT 1"), +        M::up("CREATE INDEX idx_items_ts ON items(ts)"), -    ]); -} +    ]) +}); /// Represents an item stored in the database. /// diff --git a/src/filter_plugin/mod.rs b/src/filter_plugin/mod.rs index 45d3da3..f02e819 100644 --- a/src/filter_plugin/mod.rs +++ b/src/filter_plugin/mod.rs @@ -213,6 +213,44 @@ pub enum FilterType { /// Prevents OOM on large files by rejecting inputs that exceed this limit.
const MAX_FILTER_BUFFER_SIZE: usize = 256 * 1024 * 1024; +struct BoundedVecWriter { + data: Vec, + limit: usize, +} + +impl BoundedVecWriter { + fn new(limit: usize) -> Self { + Self { + data: Vec::new(), + limit, + } + } + + fn into_inner(self) -> Vec { + self.data + } +} + +impl std::io::Write for BoundedVecWriter { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + if self.data.len() + buf.len() > self.limit { + return Err(std::io::Error::new( + std::io::ErrorKind::InvalidData, + format!( + "Input size exceeds maximum filter buffer size ({} bytes)", + MAX_FILTER_BUFFER_SIZE + ), + )); + } + self.data.write_all(buf)?; + Ok(buf.len()) + } + + fn flush(&mut self) -> std::io::Result<()> { + Ok(()) + } +} + /// A chain of filter plugins applied sequentially. /// /// Chains multiple filters, applying them in order to the input stream. @@ -360,21 +398,10 @@ impl FilterChain { } // For multiple plugins, we need to chain them together - // We'll use a temporary buffer to hold intermediate results - let mut current_data = Vec::new(); - std::io::copy(reader, &mut current_data)?; - - if current_data.len() > MAX_FILTER_BUFFER_SIZE { - return Err(std::io::Error::new( - std::io::ErrorKind::InvalidData, - format!( - "Input size ({} bytes) exceeds maximum filter buffer size ({} bytes). 
\ - Consider using fewer filter plugins or smaller inputs.", - current_data.len(), - MAX_FILTER_BUFFER_SIZE - ), - )); - } + // We'll use a bounded buffer to hold intermediate results + let mut bounded_writer = BoundedVecWriter::new(MAX_FILTER_BUFFER_SIZE); + std::io::copy(reader, &mut bounded_writer)?; + let mut current_data = bounded_writer.into_inner(); // Store the plugins length to avoid borrowing issues let plugins_len = self.plugins.len(); diff --git a/src/meta_plugin/mod.rs b/src/meta_plugin/mod.rs index 7294110..63e7af1 100644 --- a/src/meta_plugin/mod.rs +++ b/src/meta_plugin/mod.rs @@ -1,5 +1,4 @@ use log::{debug, warn}; -use once_cell::sync::Lazy; use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::sync::{Arc, Mutex}; @@ -444,9 +443,9 @@ where /// /// An empty `HashMap` (default implementation). fn outputs(&self) -> &std::collections::HashMap { - use once_cell::sync::Lazy; - static EMPTY: Lazy> = - Lazy::new(std::collections::HashMap::new); + use std::sync::LazyLock; + static EMPTY: LazyLock> = + LazyLock::new(std::collections::HashMap::new); &EMPTY } @@ -471,9 +470,9 @@ where /// /// An empty `HashMap` (default implementation). fn options(&self) -> &std::collections::HashMap { - use once_cell::sync::Lazy; - static EMPTY: Lazy> = - Lazy::new(std::collections::HashMap::new); + use std::sync::LazyLock; + static EMPTY: LazyLock> = + LazyLock::new(std::collections::HashMap::new); &EMPTY } @@ -602,8 +601,9 @@ where } /// Global registry for meta plugins. -static META_PLUGIN_REGISTRY: Lazy>> = - Lazy::new(|| Mutex::new(HashMap::new())); +static META_PLUGIN_REGISTRY: std::sync::LazyLock< + Mutex>, +> = std::sync::LazyLock::new(|| Mutex::new(HashMap::new())); /// Register a meta plugin with the global registry. 
/// diff --git a/src/modes/common.rs b/src/modes/common.rs index 6a24b70..16d6db0 100644 --- a/src/modes/common.rs +++ b/src/modes/common.rs @@ -21,7 +21,6 @@ use chrono::{DateTime, Utc}; use clap::Command; use clap::error::ErrorKind; use comfy_table::{Attribute, Cell, ContentArrangement, Table}; -use lazy_static::lazy_static; use log::debug; use regex::Regex; use serde::{Deserialize, Serialize}; @@ -57,9 +56,8 @@ pub enum OutputFormat { Yaml, } -lazy_static! { - static ref KEEP_META_RE: Regex = Regex::new(r"^KEEP_META_(.+)$").unwrap(); -} +static KEEP_META_RE: std::sync::LazyLock = + std::sync::LazyLock::new(|| Regex::new(r"^KEEP_META_(.+)$").unwrap()); pub const IMPORT_FORMAT_ERROR: &str = "Unsupported import format: {} (expected .keep.tar or .meta.yml)"; diff --git a/src/modes/server/pages.rs b/src/modes/server/pages.rs index ed555a9..90ac851 100644 --- a/src/modes/server/pages.rs +++ b/src/modes/server/pages.rs @@ -13,11 +13,13 @@ use serde::Deserialize; use std::collections::HashMap; /// Escape text content for safe HTML insertion. +#[inline] fn esc(s: &str) -> String { encode_text(s).to_string() } /// Escape attribute values for safe HTML attribute insertion. +#[inline] fn esc_attr(s: &str) -> String { encode_double_quoted_attribute(s).to_string() } diff --git a/src/services/filter_service.rs b/src/services/filter_service.rs index 0a6a3b6..717fe5d 100644 --- a/src/services/filter_service.rs +++ b/src/services/filter_service.rs @@ -1,5 +1,4 @@ use crate::filter_plugin::{FilterChain, parse_filter_string}; -use once_cell::sync::Lazy; use std::collections::HashMap; use std::io::{Read, Result, Write}; use std::sync::Mutex; @@ -166,8 +165,8 @@ impl FilterService { /// # Panics /// /// Lock acquisition failures (rare) cause panics in accessors. 
-static FILTER_PLUGIN_REGISTRY: Lazy>> = -    Lazy::new(|| Mutex::new(HashMap::new())); +static FILTER_PLUGIN_REGISTRY: std::sync::LazyLock>> = +    std::sync::LazyLock::new(|| Mutex::new(HashMap::new())); /// Registers a filter plugin in the global registry. /// diff --git a/src/services/mod.rs b/src/services/mod.rs index 3154d52..e4c1b47 100644 --- a/src/services/mod.rs +++ b/src/services/mod.rs @@ -1,3 +1,8 @@ +//! Business logic services for the Keep application. +//! +//! This module provides the core service layer that orchestrates item storage, +//! compression, metadata collection, and filtering. Services are used by both +//! local CLI modes and the HTTP server. pub mod compression_service; pub mod error; pub mod filter_service; diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index 7aeeb37..a74017d 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -1,5 +1,4 @@ use anyhow::{Result, bail}; -use once_cell::sync::Lazy; /// Supported LLM token encodings. #[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] @@ -48,10 +47,10 @@ impl std::fmt::Debug for Tokenizer { } /// Static tokenizer instances — loaded once per process, shared across all plugins. -static CL100K: Lazy = Lazy::new(|| { +static CL100K: std::sync::LazyLock = std::sync::LazyLock::new(|| { Tokenizer::new(TokenEncoding::Cl100kBase).expect("Failed to create cl100k_base tokenizer") }); -static O200K: Lazy = Lazy::new(|| { +static O200K: std::sync::LazyLock = std::sync::LazyLock::new(|| { Tokenizer::new(TokenEncoding::O200kBase).expect("Failed to create o200k_base tokenizer") });