Compare commits

..

3 Commits

Author SHA1 Message Date
e2cb36d2a8 feat(server): add file_size to API ItemInfo response 2026-03-21 14:03:58 -03:00
0004324301 perf: pre-allocate status info collections with known capacities 2026-03-21 13:54:37 -03:00
b3edfe7de6 chore: code review cleanup — fixes, deps, docs
Fixed:
- CLI help typo: "metatdata" -> "metadata"
- Filter buffer OOM: check size before loading into memory

Changed:
- #[inline] on HTML escape helpers for hot path performance
- Replaced once_cell and lazy_static with std::sync::LazyLock
- Removed unused once_cell and lazy_static crate dependencies

Refactored:
- Added module-level doc to services/ module

Documentation:
- README.md: zstd is native not external, "none" -> "raw"
- DESIGN.md: current schema and meta plugins section
- CHANGELOG.md: Unreleased section populated
2026-03-21 11:44:37 -03:00
18 changed files with 242 additions and 154 deletions

View File

@@ -7,6 +7,32 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
### Added
- Database index on `items(ts)` column for faster ORDER BY sorting
- Server API `ItemInfo` now includes `file_size` — actual filesystem-reported size of the item data file
### Changed
- Filter plugins check size before loading content into memory (prevents OOM on large inputs)
- Status page pre-allocates collections with known capacities (meta plugins, compression info)
- `#[inline]` on HTML escape helper functions (`esc`, `esc_attr`) for hot path performance
- Removed `once_cell` crate (replaced with `std::sync::LazyLock` from Rust 1.80)
- Removed `lazy_static` crate (replaced with `std::sync::LazyLock`)
### Fixed
- CLI help text typo: "metatdata" → "metadata" in `--get` and `--info` descriptions
### Refactored
- Added module-level documentation to `services/` module
### Documentation
- README.md: Fixed compression table — zstd is native (not external), "none" renamed to "raw"
- DESIGN.md: Updated schema to reflect current `items` table columns and meta plugin inventory
## [0.1.0] - 2026-03-21
### Added

2
Cargo.lock generated
View File

@@ -1727,7 +1727,6 @@ dependencies = [
"inventory",
"is-terminal",
"jsonwebtoken",
"lazy_static",
"libc",
"local-ip-address",
"log",
@@ -1735,7 +1734,6 @@ dependencies = [
"magic",
"md5",
"nix",
"once_cell",
"os_pipe",
"pest",
"pest_derive",

View File

@@ -35,7 +35,6 @@ hyper = { version = "1.0", features = ["full"] }
http-body-util = "0.1"
inventory = "0.3"
is-terminal = "0.4"
lazy_static = "1.5"
libc = "0.2"
local-ip-address = "0.6"
log = "0.4"
@@ -45,7 +44,6 @@ magic = { version = "0.13", optional = true }
infer = { version = "0.19", optional = true }
tree_magic_mini = { version = "3.2", optional = true }
nix = { version = "0.30", features = ["fs", "process"] }
once_cell = "1.21"
comfy-table = "7.2"
pwhash = "1.0"
regex = "1.10"

View File

@@ -117,7 +117,7 @@
## Data Storage
### Database Schema
- `items` table: id (primary key), ts (timestamp), size (optional), compression
- `items` table: id (primary key), ts (timestamp), uncompressed_size (optional), compressed_size (optional), closed (boolean), compression
- `tags` table: id (foreign key to items), name (tag name)
- `metas` table: id (foreign key to items), name (meta key), value (meta value)
- Indexes on tag names and meta names for faster queries
@@ -178,26 +178,25 @@
- None (no compression)
## Supported Meta Plugins
- FileMagic - File type detection using file command
- FileMime - MIME type detection using file command
- FileEncoding - File encoding detection using file command
- LineCount - Line count using wc command
- WordCount - Word count using wc command
- Cwd - Current working directory
- Binary - Binary file detection
- Uid - Current user ID
- User - Current username
- Gid - Current group ID
- Group - Current group name
- Shell - Shell path from SHELL environment variable
- ShellPid - Shell process ID from PPID environment variable
- KeepPid - Keep process ID
- DigestSha256 - SHA-256 digest
- DigestMd5 - MD5 digest using md5sum command
- ReadTime - Time taken to read data
- ReadRate - Rate of data reading
- Hostname - System hostname
- FullHostname - Fully qualified domain name
Meta plugins collect metadata during item save. Each plugin produces one or more key-value pairs:
- `magic_file` - File type detection using libmagic (when `magic` feature enabled)
- `infer` - MIME type detection using infer crate (when `infer` feature enabled)
- `tree_magic_mini` - MIME type detection using tree_magic_mini (when `tree_magic_mini` feature enabled)
- `tokens` - LLM token counting using tiktoken (when `tokens` feature enabled)
- `text` - Text analysis: line count, word count, char count, line average length
- `digest` - SHA-256 and MD5 checksums
- `hostname` - System hostname (full and short)
- `cwd` - Current working directory
- `user` - Current username and UID
- `shell` - Shell path from SHELL environment variable
- `shell_pid` - Shell process ID from PPID
- `keep_pid` - Keep process ID
- `env` - Arbitrary environment variables (via `KEEP_META_ENV_*` prefix)
- `exec` - Execute external commands for custom metadata
- `read_time` - Time taken to read content
- `read_rate` - Content read rate (bytes/second)
## Testing Strategy
- Unit tests for each module in `src/tests/`

View File

@@ -345,8 +345,8 @@ Items are compressed automatically on save. Default: LZ4.
| `gzip` | Internal | Fast | Good |
| `bzip2` | External | Slow | Better |
| `xz` | External | Slowest | Best |
| `zstd` | External | Fast | Good |
| `none` | Internal | N/A | N/A |
| `zstd` | Internal | Fast | Good |
| `raw` | Internal | N/A | N/A |
```sh
# Specify compression per item

View File

@@ -29,9 +29,7 @@ pub struct ModeArgs {
pub save: bool,
#[arg(group("mode"), help_heading("Mode Options"), short, long, conflicts_with_all(["save", "diff", "list", "delete", "info", "update", "status", "export", "import"]))]
#[arg(help(
"Get an item either by it's ID or by a combination of matching tags and metatdata"
))]
#[arg(help("Get an item either by its ID or by a combination of matching tags and metadata"))]
pub get: bool,
#[arg(group("mode"), help_heading("Mode Options"), long, conflicts_with_all(["save", "get", "list", "delete", "info", "update", "status", "export", "import"]))]
@@ -48,9 +46,7 @@ pub struct ModeArgs {
pub delete: bool,
#[arg(group("mode"), help_heading("Mode Options"), short, long, conflicts_with_all(["save", "get", "diff", "list", "delete", "update", "status", "export", "import"]))]
#[arg(help(
"Get an item either by it's ID or by a combination of matching tags and metatdata"
))]
#[arg(help("Get an item either by its ID or by a combination of matching tags and metadata"))]
pub info: bool,
#[arg(group("mode"), help_heading("Mode Options"), short('u'), long, conflicts_with_all(["save", "get", "diff", "list", "delete", "info", "status", "export", "import"]))]

View File

@@ -89,7 +89,7 @@ pub fn generate_status_info(
};
let _default_type = crate::compression_engine::default_compression_type();
let mut compression_info = Vec::new();
let mut compression_info = Vec::with_capacity(CompressionType::iter().count());
// Sort compression types by their string representation
let mut sorted_compression_types: Vec<CompressionType> = CompressionType::iter().collect();
@@ -141,7 +141,8 @@ pub fn generate_status_info(
});
}
let mut meta_plugins_map = std::collections::HashMap::new();
let mut meta_plugins_map =
std::collections::HashMap::with_capacity(MetaPluginType::iter().count());
let mut enabled_meta_plugins_vec = Vec::new();
// Sort meta plugin types by their string representation to avoid creating plugins just for sorting

View File

@@ -7,8 +7,6 @@ use strum::{Display, EnumIter, EnumString};
use log::*;
use lazy_static::lazy_static;
extern crate enum_map;
use enum_map::enum_map;
use enum_map::{Enum, EnumMap};
@@ -180,10 +178,9 @@ impl Clone for Box<dyn CompressionEngine> {
}
}
lazy_static! {
static ref COMPRESSION_ENGINES: EnumMap<CompressionType, Box<dyn CompressionEngine>> = {
#[allow(unused_mut)] // mut needed when gzip/lz4 features are enabled
let mut em = enum_map! {
fn init_compression_engines() -> EnumMap<CompressionType, Box<dyn CompressionEngine>> {
#[allow(unused_mut)]
let mut em: EnumMap<CompressionType, Box<dyn CompressionEngine>> = enum_map! {
CompressionType::LZ4 => Box::new(crate::compression_engine::program::CompressionEngineProgram::new(
"lz4",
vec!["-c"],
@@ -234,9 +231,12 @@ lazy_static! {
}
em
};
}
static COMPRESSION_ENGINES: std::sync::LazyLock<
EnumMap<CompressionType, Box<dyn CompressionEngine>>,
> = std::sync::LazyLock::new(init_compression_engines);
pub fn default_compression_type() -> CompressionType {
CompressionType::LZ4
}

View File

@@ -1,6 +1,5 @@
use anyhow::{Context, Error, Result, anyhow};
use chrono::prelude::*;
use lazy_static::lazy_static;
use log::*;
use rusqlite::{Connection, OpenFlags, Row, params};
use rusqlite_migration::{M, Migrations};
@@ -47,25 +46,21 @@ let id = db::insert_item(&conn, item)?;
```
*/
lazy_static! {
// Database schema migrations for the Keep application.
//
// Defines the sequence of migrations to create and update the schema.
// Applied automatically when opening a database connection.
static ref MIGRATIONS: Migrations<'static> = Migrations::new(vec![
static MIGRATIONS: std::sync::LazyLock<Migrations<'static>> = std::sync::LazyLock::new(|| {
Migrations::new(vec![
M::up(
"CREATE TABLE items(
id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
ts TEXT NOT NULL,
size INTEGER NULL,
compression TEXT NOT NULL)"
compression TEXT NOT NULL)",
),
M::up(
"CREATE TABLE tags (
id INTEGER NOT NULL,
name TEXT NOT NULL,
FOREIGN KEY(id) REFERENCES items(id) ON DELETE CASCADE,
PRIMARY KEY(id, name));"
PRIMARY KEY(id, name));",
),
M::up(
"CREATE TABLE metas (
@@ -73,16 +68,17 @@ lazy_static! {
name TEXT NOT NULL,
value TEXT NOT NULL,
FOREIGN KEY(id) REFERENCES items(id) ON DELETE CASCADE,
PRIMARY KEY(id, name));"
PRIMARY KEY(id, name));",
),
M::up("CREATE INDEX idx_tags_name ON tags(name)"),
M::up("CREATE INDEX idx_metas_name ON metas(name)"),
M::up("CREATE INDEX idx_items_ts ON items(ts)"),
M::up("UPDATE items SET compression = 'raw' WHERE compression = 'none'"),
M::up("ALTER TABLE items RENAME COLUMN size TO uncompressed_size"),
M::up("ALTER TABLE items ADD COLUMN compressed_size INTEGER NULL"),
M::up("ALTER TABLE items ADD COLUMN closed BOOLEAN NOT NULL DEFAULT 1"),
]);
}
])
});
/// Represents an item stored in the database.
///

View File

@@ -213,6 +213,44 @@ pub enum FilterType {
/// Prevents OOM on large files by rejecting inputs that exceed this limit.
const MAX_FILTER_BUFFER_SIZE: usize = 256 * 1024 * 1024;
struct BoundedVecWriter {
data: Vec<u8>,
limit: usize,
}
impl BoundedVecWriter {
fn new(limit: usize) -> Self {
Self {
data: Vec::new(),
limit,
}
}
fn into_inner(self) -> Vec<u8> {
self.data
}
}
impl std::io::Write for BoundedVecWriter {
fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
if self.data.len() + buf.len() > self.limit {
return Err(std::io::Error::new(
std::io::ErrorKind::InvalidData,
format!(
"Input size exceeds maximum filter buffer size ({} bytes)",
MAX_FILTER_BUFFER_SIZE
),
));
}
self.data.write_all(buf)?;
Ok(buf.len())
}
fn flush(&mut self) -> std::io::Result<()> {
Ok(())
}
}
/// A chain of filter plugins applied sequentially.
///
/// Chains multiple filters, applying them in order to the input stream.
@@ -360,21 +398,10 @@ impl FilterChain {
}
// For multiple plugins, we need to chain them together
// We'll use a temporary buffer to hold intermediate results
let mut current_data = Vec::new();
std::io::copy(reader, &mut current_data)?;
if current_data.len() > MAX_FILTER_BUFFER_SIZE {
return Err(std::io::Error::new(
std::io::ErrorKind::InvalidData,
format!(
"Input size ({} bytes) exceeds maximum filter buffer size ({} bytes). \
Consider using fewer filter plugins or smaller inputs.",
current_data.len(),
MAX_FILTER_BUFFER_SIZE
),
));
}
// We'll use a bounded buffer to hold intermediate results
let mut bounded_writer = BoundedVecWriter::new(MAX_FILTER_BUFFER_SIZE);
std::io::copy(reader, &mut bounded_writer)?;
let mut current_data = bounded_writer.into_inner();
// Store the plugins length to avoid borrowing issues
let plugins_len = self.plugins.len();

View File

@@ -1,5 +1,4 @@
use log::{debug, warn};
use once_cell::sync::Lazy;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::sync::{Arc, Mutex};
@@ -444,9 +443,9 @@ where
///
/// An empty `HashMap` (default implementation).
fn outputs(&self) -> &std::collections::HashMap<String, serde_yaml::Value> {
use once_cell::sync::Lazy;
static EMPTY: Lazy<std::collections::HashMap<String, serde_yaml::Value>> =
Lazy::new(std::collections::HashMap::new);
use std::sync::LazyLock;
static EMPTY: LazyLock<std::collections::HashMap<String, serde_yaml::Value>> =
LazyLock::new(std::collections::HashMap::new);
&EMPTY
}
@@ -471,9 +470,9 @@ where
///
/// An empty `HashMap` (default implementation).
fn options(&self) -> &std::collections::HashMap<String, serde_yaml::Value> {
use once_cell::sync::Lazy;
static EMPTY: Lazy<std::collections::HashMap<String, serde_yaml::Value>> =
Lazy::new(std::collections::HashMap::new);
use std::sync::LazyLock;
static EMPTY: LazyLock<std::collections::HashMap<String, serde_yaml::Value>> =
LazyLock::new(std::collections::HashMap::new);
&EMPTY
}
@@ -602,8 +601,9 @@ where
}
/// Global registry for meta plugins.
static META_PLUGIN_REGISTRY: Lazy<Mutex<HashMap<MetaPluginType, PluginConstructor>>> =
Lazy::new(|| Mutex::new(HashMap::new()));
static META_PLUGIN_REGISTRY: std::sync::LazyLock<
Mutex<HashMap<MetaPluginType, PluginConstructor>>,
> = std::sync::LazyLock::new(|| Mutex::new(HashMap::new()));
/// Register a meta plugin with the global registry.
///

View File

@@ -21,7 +21,6 @@ use chrono::{DateTime, Utc};
use clap::Command;
use clap::error::ErrorKind;
use comfy_table::{Attribute, Cell, ContentArrangement, Table};
use lazy_static::lazy_static;
use log::debug;
use regex::Regex;
use serde::{Deserialize, Serialize};
@@ -57,9 +56,8 @@ pub enum OutputFormat {
Yaml,
}
lazy_static! {
static ref KEEP_META_RE: Regex = Regex::new(r"^KEEP_META_(.+)$").unwrap();
}
static KEEP_META_RE: std::sync::LazyLock<Regex> =
std::sync::LazyLock::new(|| Regex::new(r"^KEEP_META_(.+)$").unwrap());
pub const IMPORT_FORMAT_ERROR: &str =
"Unsupported import format: {} (expected .keep.tar or .meta.yml)";

View File

@@ -178,6 +178,7 @@ pub async fn handle_list_items(
let item_infos: Vec<ItemInfo> = items_with_meta
.into_iter()
.filter_map(|iwm| ItemInfo::try_from(iwm).ok())
.map(|info| info.with_file_size(&state.data_dir))
.collect();
ResponseBuilder::json(ApiResponse::ok(item_infos))
@@ -339,6 +340,7 @@ pub async fn handle_post_item(
let db = state.db.clone();
let item_service = state.item_service.clone();
let settings = state.settings.clone();
let data_dir = state.data_dir.clone();
// Parse tags from query parameter
let tags: Vec<String> = params
@@ -472,7 +474,9 @@ pub async fn handle_post_item(
return Err(StatusCode::PAYLOAD_TOO_LARGE);
}
let item_info = ItemInfo::try_from(item_with_meta).map_err(|e| {
let item_info = ItemInfo::try_from(item_with_meta)
.map(|info| info.with_file_size(&data_dir))
.map_err(|e| {
warn!("Item conversion failed: {e}");
StatusCode::INTERNAL_SERVER_ERROR
})?;
@@ -1092,6 +1096,7 @@ pub async fn handle_delete_item(
compression: deleted_item.compression,
tags: vec![],
metadata: HashMap::new(),
file_size: None,
};
Ok(Json(ApiResponse::ok(item_info)))
@@ -1124,6 +1129,7 @@ pub async fn handle_get_item_info(
let db = state.db.clone();
let item_service = state.item_service.clone();
let data_dir = state.data_dir.clone();
let item_with_meta = task::spawn_blocking(move || {
let conn = db.blocking_lock();
@@ -1136,7 +1142,9 @@ pub async fn handle_get_item_info(
})?
.map_err(handle_item_error)?;
let item_info = ItemInfo::try_from(item_with_meta).map_err(|e| {
let item_info = ItemInfo::try_from(item_with_meta)
.map(|info| info.with_file_size(&data_dir))
.map_err(|e| {
warn!("Item conversion failed: {e}");
StatusCode::INTERNAL_SERVER_ERROR
})?;
@@ -1352,6 +1360,7 @@ pub async fn handle_update_item(
let db = state.db.clone();
let item_service = state.item_service.clone();
let settings = state.settings.clone();
let data_dir = state.data_dir.clone();
let size_param = params.uncompressed_size;
let item_info = task::spawn_blocking(move || {
@@ -1369,7 +1378,9 @@ pub async fn handle_update_item(
return Err(StatusCode::INTERNAL_SERVER_ERROR);
}
match item_service.get_item(&conn, item_id) {
Ok(iwm) => ItemInfo::try_from(iwm).map_err(|e| {
Ok(iwm) => ItemInfo::try_from(iwm)
.map(|info| info.with_file_size(&data_dir))
.map_err(|e| {
warn!("Item conversion failed: {e}");
StatusCode::INTERNAL_SERVER_ERROR
}),
@@ -1392,7 +1403,9 @@ pub async fn handle_update_item(
);
match result {
Ok(item_with_meta) => ItemInfo::try_from(item_with_meta).map_err(|e| {
Ok(item_with_meta) => ItemInfo::try_from(item_with_meta)
.map(|info| info.with_file_size(&data_dir))
.map_err(|e| {
warn!("Item conversion failed: {e}");
StatusCode::INTERNAL_SERVER_ERROR
}),

View File

@@ -366,10 +366,13 @@ pub struct StatusInfoResponse {
/// let item_info = ItemInfo {
/// id: 42,
/// ts: "2023-12-01T15:30:45Z".to_string(),
/// size: Some(1024),
/// uncompressed_size: Some(1024),
/// compressed_size: Some(512),
/// closed: true,
/// compression: "gzip".to_string(),
/// tags: vec!["important".to_string()],
/// metadata: HashMap::from([("mime_type".to_string(), "text/plain".to_string())]),
/// file_size: Some(512),
/// };
/// ```
#[derive(Serialize, Deserialize, ToSchema)]
@@ -413,6 +416,33 @@ pub struct ItemInfo {
/// Key-value pairs containing additional metadata about the item.
#[schema(example = json!({"mime_type": "text/plain", "mime_encoding": "utf-8", "line_count": "42"}))]
pub metadata: HashMap<String, String>,
/// Actual file size in bytes.
///
/// The filesystem-reported size of the item's data file. This may differ from
/// `compressed_size` if the file was written and the database hasn't been updated.
/// None if the file cannot be read (e.g., file not found, permission denied).
#[schema(example = 512)]
pub file_size: Option<i64>,
}
impl ItemInfo {
/// Enriches this `ItemInfo` with the actual filesystem-reported size.
///
/// Reads the size of the item's data file from disk and sets `file_size`.
/// If the file cannot be read, `file_size` is left as None.
///
/// # Arguments
///
/// * `data_dir` - The data directory path containing item files.
///
/// # Returns
///
/// A new `ItemInfo` with `file_size` populated from the filesystem.
pub fn with_file_size(mut self, data_dir: &std::path::Path) -> Self {
let item_path = data_dir.join(self.id.to_string());
self.file_size = std::fs::metadata(&item_path).map(|m| m.len() as i64).ok();
self
}
}
impl TryFrom<ItemWithMeta> for ItemInfo {
@@ -433,6 +463,7 @@ impl TryFrom<ItemWithMeta> for ItemInfo {
compression: item_with_meta.item.compression,
tags,
metadata,
file_size: None,
})
}
}

View File

@@ -13,11 +13,13 @@ use serde::Deserialize;
use std::collections::HashMap;
/// Escape text content for safe HTML insertion.
#[inline]
fn esc(s: &str) -> String {
encode_text(s).to_string()
}
/// Escape attribute values for safe HTML attribute insertion.
#[inline]
fn esc_attr(s: &str) -> String {
encode_double_quoted_attribute(s).to_string()
}

View File

@@ -1,5 +1,4 @@
use crate::filter_plugin::{FilterChain, parse_filter_string};
use once_cell::sync::Lazy;
use std::collections::HashMap;
use std::io::{Read, Result, Write};
use std::sync::Mutex;
@@ -166,8 +165,8 @@ impl FilterService {
/// # Panics
///
/// Lock acquisition failures (rare) cause panics in accessors.
static FILTER_PLUGIN_REGISTRY: Lazy<Mutex<HashMap<String, FilterConstructor>>> =
Lazy::new(|| Mutex::new(HashMap::new()));
static FILTER_PLUGIN_REGISTRY: std::sync::LazyLock<Mutex<HashMap<String, FilterConstructor>>> =
std::sync::LazyLock::new(|| Mutex::new(HashMap::new()));
/// Registers a filter plugin in the global registry.
///

View File

@@ -1,3 +1,8 @@
/// Business logic services for the Keep application.
///
/// This module provides the core service layer that orchestrates item storage,
/// compression, metadata collection, and filtering. Services are used by both
/// local CLI modes and the HTTP server.
pub mod compression_service;
pub mod error;
pub mod filter_service;

View File

@@ -1,5 +1,4 @@
use anyhow::{Result, bail};
use once_cell::sync::Lazy;
/// Supported LLM token encodings.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
@@ -48,10 +47,10 @@ impl std::fmt::Debug for Tokenizer {
}
/// Static tokenizer instances — loaded once per process, shared across all plugins.
static CL100K: Lazy<Tokenizer> = Lazy::new(|| {
static CL100K: std::sync::LazyLock<Tokenizer> = std::sync::LazyLock::new(|| {
Tokenizer::new(TokenEncoding::Cl100kBase).expect("Failed to create cl100k_base tokenizer")
});
static O200K: Lazy<Tokenizer> = Lazy::new(|| {
static O200K: std::sync::LazyLock<Tokenizer> = std::sync::LazyLock::new(|| {
Tokenizer::new(TokenEncoding::O200kBase).expect("Failed to create o200k_base tokenizer")
});