diff --git a/src/meta_plugin/text.rs b/src/meta_plugin/text.rs
index d9857af..42b11f9 100644
--- a/src/meta_plugin/text.rs
+++ b/src/meta_plugin/text.rs
@@ -363,6 +363,167 @@ impl TextMetaPlugin {
         }
         None
     }
+
+    /// Copies bytes from the front of `data` until any supplied limit is used up.
+    ///
+    /// * `head_bytes` - maximum number of bytes to keep.
+    /// * `head_words` - maximum number of whitespace-delimited words to keep. A word is
+    ///   counted when the whitespace byte that ends it is copied, so that byte is kept.
+    /// * `head_lines` - maximum number of lines to keep, each including its `\n`.
+    ///
+    /// Limits that are `None` are ignored; a limit of `Some(0)` yields an empty vector.
+    fn process_head(
+        &self,
+        data: &[u8],
+        head_bytes: Option<usize>,
+        head_words: Option<usize>,
+        head_lines: Option<usize>,
+    ) -> Vec<u8> {
+        let mut result = Vec::new();
+        let mut bytes_remaining = head_bytes;
+        let mut words_remaining = head_words;
+        let mut lines_remaining = head_lines;
+        let mut in_word = false;
+
+        for &byte in data {
+            // Check if any limits are reached
+            if bytes_remaining == Some(0) || words_remaining == Some(0) || lines_remaining == Some(0) {
+                break;
+            }
+
+            result.push(byte);
+
+            // Update bytes remaining
+            if let Some(remaining) = &mut bytes_remaining {
+                *remaining -= 1;
+            }
+
+            // Check for newlines
+            if let Some(remaining) = &mut lines_remaining {
+                if byte == b'\n' && *remaining > 0 {
+                    *remaining -= 1;
+                }
+            }
+
+            // Check for words
+            if let Some(remaining) = &mut words_remaining {
+                let is_whitespace = byte.is_ascii_whitespace();
+                if in_word && is_whitespace {
+                    in_word = false;
+                    if *remaining > 0 {
+                        *remaining -= 1;
+                    }
+                } else if !is_whitespace {
+                    in_word = true;
+                }
+            }
+        }
+        result
+    }
+
+    /// Returns the trailing portion of `data` selected by the first limit that is set.
+    ///
+    /// Limits are checked in the order `tail_bytes`, `tail_lines`, `tail_words`, and only the
+    /// first one that is `Some` is applied. With no limit at all, `data` is returned whole.
+    ///
+    /// * `tail_bytes` - keep at most this many bytes from the end.
+    /// * `tail_lines` - keep the last N lines. A final `\n` terminates the last line instead
+    ///   of starting an empty one, as `tail -n` does.
+    /// * `tail_words` - keep the last N whitespace-delimited words, starting at the first byte
+    ///   of the earliest kept word; whitespace after the last word is kept.
+    ///
+    /// A limit of `Some(0)` yields an empty vector.
+    fn process_tail(
+        &self,
+        data: &[u8],
+        tail_bytes: Option<usize>,
+        tail_words: Option<usize>,
+        tail_lines: Option<usize>,
+    ) -> Vec<u8> {
+        if let Some(bytes) = tail_bytes {
+            return data[data.len().saturating_sub(bytes)..].to_vec();
+        }
+
+        if let Some(lines) = tail_lines {
+            if lines == 0 {
+                return Vec::new();
+            }
+            // A trailing newline ends the last line; it must not count as an extra empty one.
+            let body = match data.split_last() {
+                Some((&b'\n', rest)) => rest,
+                _ => data,
+            };
+            // The Nth newline from the end sits just before the first kept line.
+            let start = body
+                .iter()
+                .enumerate()
+                .rev()
+                .filter(|&(_, &byte)| byte == b'\n')
+                .nth(lines - 1)
+                .map_or(0, |(index, _)| index + 1);
+            return data[start..].to_vec();
+        }
+
+        if let Some(words) = tail_words {
+            if words == 0 {
+                return Vec::new();
+            }
+            let mut start = 0;
+            let mut words_seen = 0;
+            let mut in_word = false;
+            // Walk backwards: a word is counted at its last byte and is complete once the
+            // whitespace in front of it is reached.
+            for (index, &byte) in data.iter().enumerate().rev() {
+                if byte.is_ascii_whitespace() {
+                    if in_word && words_seen == words {
+                        start = index + 1;
+                        break;
+                    }
+                    in_word = false;
+                } else if !in_word {
+                    in_word = true;
+                    words_seen += 1;
+                }
+            }
+            return data[start..].to_vec();
+        }
+
+        data.to_vec()
+    }
+
+    /// Returns the bytes of lines `line_start` through `line_end`, both inclusive.
+    ///
+    /// Line numbers are 1-based and a kept line includes its trailing `\n` when it has one.
+    /// A missing `line_start` selects from the first line; a missing `line_end` selects
+    /// through the end of `data`.
+    fn process_line_range(
+        &self,
+        data: &[u8],
+        line_start: Option<usize>,
+        line_end: Option<usize>,
+    ) -> Vec<u8> {
+        let start_line = line_start.unwrap_or(1);
+        let end_line = line_end.unwrap_or(usize::MAX);
+
+        let mut result = Vec::new();
+        let mut current_line: usize = 1;
+
+        for &byte in data {
+            if current_line > end_line {
+                break;
+            }
+
+            if current_line >= start_line {
+                result.push(byte);
+            }
+
+            if byte == b'\n' {
+                current_line += 1;
+            }
+        }
+
+        result
+    }
 
     /// Helper method to output word and line counts
     fn output_word_line_counts(&mut self) -> Vec {
@@ -427,13 +588,53 @@ impl MetaPlugin for TextMetaPlugin {
 
         let mut metadata = Vec::new();
 
+        // Check if we have head/tail/line range options that would affect processing
+        // These options come from the base plugin's options
+        let head_bytes = self.base.options.get("head_bytes")
+            .and_then(|v| v.as_u64())
+            .map(|v| v as usize);
+        let head_words = self.base.options.get("head_words")
+            .and_then(|v| v.as_u64())
+            .map(|v| v as usize);
+        let head_lines = self.base.options.get("head_lines")
+            .and_then(|v| v.as_u64())
+            .map(|v| v as usize);
+        let tail_bytes = self.base.options.get("tail_bytes")
+            .and_then(|v| v.as_u64())
+            .map(|v| v as usize);
+        let tail_words = self.base.options.get("tail_words")
+            .and_then(|v| v.as_u64())
+            .map(|v| v as usize);
+        let tail_lines = self.base.options.get("tail_lines")
+            .and_then(|v| v.as_u64())
+            .map(|v| v as usize);
+        let line_start = self.base.options.get("line_start")
+            .and_then(|v| v.as_u64())
+            .map(|v| v as usize);
+        let line_end = self.base.options.get("line_end")
+            .and_then(|v| v.as_u64())
+            .map(|v| v as usize);
+
+        // Apply content filtering if any of the options are present
+        // NOTE(review): the filters run on this call's `data` only and keep no state between
+        // calls, so every chunk gets the full limits again - TODO confirm this is intended.
+        let processed_data = if head_bytes.is_some() || head_words.is_some() || head_lines.is_some() {
+            self.process_head(data, head_bytes, head_words, head_lines)
+        } else if tail_bytes.is_some() || tail_words.is_some() || tail_lines.is_some() {
+            self.process_tail(data, tail_bytes, tail_words, tail_lines)
+        } else if line_start.is_some() || line_end.is_some() {
+            self.process_line_range(data, line_start, line_end)
+        } else {
+            data.to_vec()
+        };
+
         // If we haven't determined if content is binary yet, build buffer and check
         if self.is_binary_content.is_none() {
             let should_finalize = if let Some(ref mut buffer) = self.buffer {
-                // Add data to our buffer up to max_buffer_size
+                // Add processed data to our buffer up to max_buffer_size
                 let remaining_capacity = self.max_buffer_size.saturating_sub(buffer.len());
-                let bytes_to_take = std::cmp::min(data.len(), remaining_capacity);
-                buffer.extend_from_slice(&data[..bytes_to_take]);
+                let bytes_to_take = std::cmp::min(processed_data.len(), remaining_capacity);
+                buffer.extend_from_slice(&processed_data[..bytes_to_take]);
 
                 // If we have enough data to make a binary determination, do it now
                 let buffer_len = buffer.len();
@@ -455,7 +656,7 @@ impl MetaPlugin for TextMetaPlugin {
                     }
 
                     // If it's text, count words and lines for this chunk
-                    self.count_text_stats(&data[..bytes_to_take]);
+                    self.count_text_stats(&processed_data[..bytes_to_take]);
 
                     // If we've reached our buffer limit, drop the buffer to save memory
                     // But don't finalize yet - we need to keep counting words and lines
@@ -465,7 +666,7 @@ impl MetaPlugin for TextMetaPlugin {
                     false // Never finalize here for text content
                 } else {
                     // Still building up buffer, count words and lines for this chunk
-                    self.count_text_stats(&data[..bytes_to_take]);
+                    self.count_text_stats(&processed_data[..bytes_to_take]);
                     false
                 }
             } else {
@@ -480,7 +681,7 @@ impl MetaPlugin for TextMetaPlugin {
             }
         } else if self.is_binary_content == Some(false) {
             // We've already determined it's text, just count words and lines
-            self.count_text_stats(data);
+            self.count_text_stats(&processed_data);
         }
         // If is_binary_content == Some(true), we should have already finalized, but just in case:
         else if self.is_binary_content == Some(true) {
@@ -508,13 +709,51 @@ impl MetaPlugin for TextMetaPlugin {
 
         let mut metadata = Vec::new();
 
+        // Check if we have head/tail/line range options
+        let head_bytes = self.base.options.get("head_bytes")
+            .and_then(|v| v.as_u64())
+            .map(|v| v as usize);
+        let head_words = self.base.options.get("head_words")
+            .and_then(|v| v.as_u64())
+            .map(|v| v as usize);
+        let head_lines = self.base.options.get("head_lines")
+            .and_then(|v| v.as_u64())
+            .map(|v| v as usize);
+        let tail_bytes = self.base.options.get("tail_bytes")
+            .and_then(|v| v.as_u64())
+            .map(|v| v as usize);
+        let tail_words = self.base.options.get("tail_words")
+            .and_then(|v| v.as_u64())
+            .map(|v| v as usize);
+        let tail_lines = self.base.options.get("tail_lines")
+            .and_then(|v| v.as_u64())
+            .map(|v| v as usize);
+        let line_start = self.base.options.get("line_start")
+            .and_then(|v| v.as_u64())
+            .map(|v| v as usize);
+        let line_end = self.base.options.get("line_end")
+            .and_then(|v| v.as_u64())
+            .map(|v| v as usize);
+
         // If we haven't determined binary status yet, do it now with whatever we have
         if self.is_binary_content.is_none() {
             if let Some(buffer) = &self.buffer {
                 if !buffer.is_empty() {
-                    // Clone the buffer data for binary detection to avoid borrowing conflicts
-                    let buffer_clone = buffer.clone();
-                    let (binary_metadata, is_binary) = self.perform_binary_detection(&buffer_clone);
+                    // Apply content filtering to the buffer if needed
+                    // NOTE(review): if `buffer` was filled from already-filtered bytes, this
+                    // filters a second time - TODO confirm that is intended.
+                    let processed_buffer = if head_bytes.is_some() || head_words.is_some() || head_lines.is_some() {
+                        self.process_head(buffer, head_bytes, head_words, head_lines)
+                    } else if tail_bytes.is_some() || tail_words.is_some() || tail_lines.is_some() {
+                        self.process_tail(buffer, tail_bytes, tail_words, tail_lines)
+                    } else if line_start.is_some() || line_end.is_some() {
+                        self.process_line_range(buffer, line_start, line_end)
+                    } else {
+                        buffer.clone()
+                    };
+
+                    // Clone the processed buffer data for binary detection
+                    let (binary_metadata, is_binary) = self.perform_binary_detection(&processed_buffer);
                     metadata.extend(binary_metadata);
                     self.is_binary_content = Some(is_binary);
 