use std::fs;
use std::path::Path;
use crate::{Result, error::MemvidError, text::truncate_at_grapheme_boundary};
#[cfg(feature = "symspell_cleanup")]
use crate::symspell_cleanup::fix_pdf_text as fix_pdf_spacing;
#[cfg(not(feature = "symspell_cleanup"))]
use crate::text::fix_pdf_spacing;
#[cfg(feature = "extractous")]
use log::LevelFilter;
use lopdf::Document as LopdfDocument;
use serde_json::{Value, json};
#[cfg(feature = "extractous")]
use extractous::Extractor;
#[cfg(feature = "extractous")]
use std::collections::{HashMap, VecDeque};
#[cfg(feature = "extractous")]
use std::sync::{Mutex, OnceLock};
#[derive(Debug, Clone)]
pub struct ExtractedDocument {
pub text: Option<String>,
pub metadata: Value,
pub mime_type: Option<String>,
}
impl ExtractedDocument {
#[must_use]
pub fn empty() -> Self {
Self {
text: None,
metadata: Value::Null,
mime_type: None,
}
}
}
#[derive(Debug, Clone, Copy)]
pub struct ProcessorConfig {
pub max_text_chars: usize,
}
impl Default for ProcessorConfig {
fn default() -> Self {
Self {
max_text_chars: 2_000_000,
}
}
}
#[cfg(feature = "extractous")]
const DEFAULT_EXTRACTION_CACHE_CAPACITY: usize = 100;
#[cfg(feature = "extractous")]
struct ExtractionCache {
cache: HashMap<blake3::Hash, ExtractedDocument>,
lru_queue: VecDeque<blake3::Hash>,
capacity: usize,
hits: usize,
misses: usize,
}
#[cfg(feature = "extractous")]
impl ExtractionCache {
fn new(capacity: usize) -> Self {
Self {
cache: HashMap::with_capacity(capacity),
lru_queue: VecDeque::with_capacity(capacity),
capacity,
hits: 0,
misses: 0,
}
}
fn get(&mut self, key: &blake3::Hash) -> Option<ExtractedDocument> {
if let Some(document) = self.cache.get(key) {
self.lru_queue.retain(|k| k != key);
self.lru_queue.push_front(*key);
self.hits += 1;
Some(document.clone())
} else {
self.misses += 1;
None
}
}
fn insert(&mut self, key: blake3::Hash, value: ExtractedDocument) {
if self.cache.contains_key(&key) {
self.cache.insert(key, value);
self.lru_queue.retain(|k| *k != key);
self.lru_queue.push_front(key);
return;
}
if self.cache.len() >= self.capacity {
if let Some(oldest_key) = self.lru_queue.pop_back() {
self.cache.remove(&oldest_key);
tracing::debug!(
evicted_hash = ?oldest_key,
"Evicted oldest entry from extraction cache"
);
}
}
self.cache.insert(key, value);
self.lru_queue.push_front(key);
}
#[allow(dead_code)]
fn stats(&self) -> (usize, usize, usize) {
(self.hits, self.misses, self.cache.len())
}
}
#[cfg(feature = "extractous")]
#[derive(Debug)]
pub struct DocumentProcessor {
extractor: Mutex<Extractor>,
max_length: usize,
}
#[cfg(feature = "extractous")]
impl Default for DocumentProcessor {
fn default() -> Self {
Self::new(Default::default())
}
}
#[cfg(feature = "extractous")]
static EXTRACTION_CACHE: OnceLock<Mutex<ExtractionCache>> = OnceLock::new();
#[cfg(feature = "extractous")]
impl DocumentProcessor {
pub fn new(config: ProcessorConfig) -> Self {
let capped = config
.max_text_chars
.min(i32::MAX as usize)
.try_into()
.unwrap_or(i32::MAX);
let mut extractor = Extractor::new().set_extract_string_max_length(capped);
extractor = extractor.set_xml_output(false);
Self {
extractor: Mutex::new(extractor),
max_length: config.max_text_chars,
}
}
pub fn extract_from_path(&self, path: &Path) -> Result<ExtractedDocument> {
let path_str = path.to_str().ok_or_else(|| MemvidError::ExtractionFailed {
reason: "input path contains invalid UTF-8".into(),
})?;
let extraction = {
let extractor = self.locked()?;
let _log_guard = ScopedLogLevel::lowered(LevelFilter::Off);
extractor.extract_file_to_string(path_str)
};
match extraction {
Ok((mut content, metadata)) => {
if needs_pdf_fallback(&content) {
if let Ok(bytes) = fs::read(path) {
if let Ok(Some(fallback_text)) = pdf_text_fallback(&bytes) {
content = fallback_text;
}
}
}
Ok(self.into_document(content, metadata))
}
Err(err) => {
let primary_reason = err.to_string();
if let Ok(bytes) = fs::read(path) {
match pdf_text_fallback(&bytes) {
Ok(Some(fallback_text)) => {
return Ok(self.into_document(fallback_text, pdf_fallback_metadata()));
}
Ok(None) => {}
Err(fallback_err) => {
let reason = format!(
"primary extractor error: {}; PDF fallback error: {}",
primary_reason, fallback_err
);
return Err(MemvidError::ExtractionFailed {
reason: reason.into(),
});
}
}
}
Err(MemvidError::ExtractionFailed {
reason: primary_reason.into(),
})
}
}
}
pub fn extract_from_bytes(&self, bytes: &[u8]) -> Result<ExtractedDocument> {
let hash = blake3::hash(bytes);
if let Some(cached) = cache_lookup(&hash) {
tracing::debug!(target = "memvid::extract", reader = "cache", "cache hit");
return Ok(cached);
}
let extraction = {
let extractor = self.locked()?;
let _log_guard = ScopedLogLevel::lowered(LevelFilter::Off);
extractor.extract_bytes_to_string(bytes)
};
let document = match extraction {
Ok((mut content, metadata)) => {
let pdf_needed = needs_pdf_fallback(&content);
tracing::debug!(
target: "memvid::extract",
content_len = content.len(),
pdf_fallback_needed = pdf_needed,
"extractous returned content"
);
if pdf_needed {
match pdf_text_fallback(bytes) {
Ok(Some(fallback_text)) => {
tracing::debug!(
target: "memvid::extract",
fallback_len = fallback_text.len(),
"lopdf fallback succeeded"
);
content = fallback_text;
}
Ok(None) => {
tracing::debug!(
target: "memvid::extract",
"lopdf fallback returned None"
);
content = String::new();
}
Err(e) => {
tracing::debug!(
target: "memvid::extract",
error = %e,
"lopdf fallback failed"
);
content = String::new();
}
}
}
self.into_document(content, metadata)
}
Err(err) => {
let primary_reason = err.to_string();
match pdf_text_fallback(bytes) {
Ok(Some(fallback_text)) => {
self.into_document(fallback_text, pdf_fallback_metadata())
}
Ok(None) => {
return Err(MemvidError::ExtractionFailed {
reason: primary_reason.into(),
});
}
Err(fallback_err) => {
let reason = format!(
"primary extractor error: {}; PDF fallback error: {}",
primary_reason, fallback_err
);
return Err(MemvidError::ExtractionFailed {
reason: reason.into(),
});
}
}
}
};
cache_store(hash, &document);
Ok(document)
}
fn locked(&self) -> Result<std::sync::MutexGuard<'_, Extractor>> {
self.extractor
.lock()
.map_err(|_| MemvidError::ExtractionFailed {
reason: "extractor mutex poisoned".into(),
})
}
fn into_document<M>(&self, content: String, metadata: M) -> ExtractedDocument
where
M: serde::Serialize,
{
let metadata_value = serde_json::to_value(metadata).unwrap_or(Value::Null);
let mime_type = metadata_value.get("Content-Type").and_then(value_to_mime);
let text = if content.trim().is_empty() {
tracing::debug!(
target: "memvid::extract",
"into_document: content is empty, returning text=None"
);
None
} else {
let final_text = if content.len() > self.max_length {
let end = truncate_at_grapheme_boundary(&content, self.max_length);
content[..end].to_string()
} else {
content
};
tracing::debug!(
target: "memvid::extract",
text_len = final_text.len(),
starts_with_pdf = final_text.starts_with("%PDF"),
"into_document: returning text"
);
Some(final_text)
};
ExtractedDocument {
text,
metadata: metadata_value,
mime_type,
}
}
}
#[cfg(not(feature = "extractous"))]
#[derive(Debug)]
pub struct DocumentProcessor {
max_length: usize,
}
#[cfg(not(feature = "extractous"))]
impl Default for DocumentProcessor {
fn default() -> Self {
Self::new(Default::default())
}
}
#[cfg(not(feature = "extractous"))]
impl DocumentProcessor {
#[must_use]
pub fn new(config: ProcessorConfig) -> Self {
Self {
max_length: config.max_text_chars,
}
}
pub fn extract_from_path(&self, path: &Path) -> Result<ExtractedDocument> {
let bytes = fs::read(path).map_err(|e| MemvidError::ExtractionFailed {
reason: format!("failed to read file: {e}").into(),
})?;
self.extract_from_bytes(&bytes)
}
pub fn extract_from_bytes(&self, bytes: &[u8]) -> Result<ExtractedDocument> {
if is_probably_pdf_simple(bytes) {
match pdf_text_extract_best(bytes) {
Ok(Some((text, extractor))) => {
let truncate_len = truncate_at_grapheme_boundary(&text, self.max_length);
let truncated = &text[..truncate_len];
return Ok(ExtractedDocument {
text: Some(truncated.to_string()),
metadata: json!({
"Content-Type": "application/pdf",
"extraction": extractor,
}),
mime_type: Some("application/pdf".to_string()),
});
}
Ok(None) => {
return Ok(ExtractedDocument {
text: None,
metadata: json!({
"Content-Type": "application/pdf",
"extraction": "no_text",
}),
mime_type: Some("application/pdf".to_string()),
});
}
Err(e) => {
tracing::warn!(target: "memvid::extract", error = %e, "PDF extraction failed");
}
}
}
if let Ok(text) = std::str::from_utf8(bytes) {
let sample = &bytes[..bytes.len().min(8192)];
if !sample.contains(&0) {
let truncate_len = truncate_at_grapheme_boundary(text, self.max_length);
let truncated = &text[..truncate_len];
return Ok(ExtractedDocument {
text: Some(truncated.to_string()),
metadata: json!({}),
mime_type: Some("text/plain".to_string()),
});
}
}
Ok(ExtractedDocument {
text: None,
metadata: json!({}),
mime_type: Some("application/octet-stream".to_string()),
})
}
}
#[cfg(feature = "extractous")]
fn needs_pdf_fallback(content: &str) -> bool {
if content.trim().is_empty() {
return true;
}
looks_like_pdf_structure_dump(content)
}
#[cfg(feature = "extractous")]
fn pdf_fallback_metadata() -> Value {
json!({
"Content-Type": "application/pdf",
"extraction": "lopdf_fallback",
})
}
#[cfg(feature = "extractous")]
const PDF_FALLBACK_MAX_BYTES: usize = 64 * 1024 * 1024; #[cfg(feature = "extractous")]
const PDF_FALLBACK_MAX_PAGES: usize = 4_096;
#[cfg(feature = "extractous")]
fn pdf_text_fallback(bytes: &[u8]) -> Result<Option<String>> {
if !is_probably_pdf(bytes) {
return Ok(None);
}
if bytes.len() > PDF_FALLBACK_MAX_BYTES {
return Err(MemvidError::ExtractionFailed {
reason: format!(
"pdf fallback aborted: {} bytes exceeds limit of {} bytes",
bytes.len(),
PDF_FALLBACK_MAX_BYTES
)
.into(),
});
}
let _log_guard = ScopedLogLevel::lowered(LevelFilter::Off);
let mut document =
LopdfDocument::load_mem(bytes).map_err(|err| MemvidError::ExtractionFailed {
reason: format!("pdf fallback failed to load document: {err}").into(),
})?;
if document.is_encrypted() {
if document.decrypt("").is_err() {
return Err(MemvidError::ExtractionFailed {
reason: "pdf fallback cannot decrypt password-protected file".into(),
});
}
}
let _ = document.decompress();
let mut page_numbers: Vec<u32> = document.get_pages().keys().copied().collect();
if page_numbers.is_empty() {
return Ok(None);
}
page_numbers.sort_unstable();
if page_numbers.len() > PDF_FALLBACK_MAX_PAGES {
return Err(MemvidError::ExtractionFailed {
reason: format!(
"pdf fallback aborted: page count {} exceeds limit of {}",
page_numbers.len(),
PDF_FALLBACK_MAX_PAGES
)
.into(),
});
}
match document.extract_text(&page_numbers) {
Ok(text) => {
let trimmed = text.trim();
if trimmed.is_empty() {
Ok(None)
} else {
Ok(Some(fix_pdf_spacing(trimmed)))
}
}
Err(err) => Err(MemvidError::ExtractionFailed {
reason: format!("pdf fallback failed to extract text: {err}").into(),
}),
}
}
#[cfg(feature = "extractous")]
struct ScopedLogLevel {
previous: LevelFilter,
changed: bool,
}
#[cfg(feature = "extractous")]
impl ScopedLogLevel {
fn lowered(level: LevelFilter) -> Self {
let previous = log::max_level();
if level < previous {
log::set_max_level(level);
Self {
previous,
changed: true,
}
} else {
Self {
previous,
changed: false,
}
}
}
}
#[cfg(feature = "extractous")]
impl Drop for ScopedLogLevel {
fn drop(&mut self) {
if self.changed {
log::set_max_level(self.previous);
}
}
}
#[cfg(feature = "extractous")]
fn is_probably_pdf(bytes: &[u8]) -> bool {
if bytes.is_empty() {
return false;
}
let mut slice = bytes;
if slice.starts_with(&[0xEF, 0xBB, 0xBF]) {
slice = &slice[3..];
}
while let Some((first, rest)) = slice.split_first() {
if *first == 0 || first.is_ascii_whitespace() {
slice = rest;
} else {
break;
}
}
slice.starts_with(b"%PDF")
}
#[cfg(feature = "extractous")]
fn looks_like_pdf_structure_dump(content: &str) -> bool {
if content.len() < 256 {
return false;
}
let sample_len = content.len().min(8_192);
let safe_len = truncate_at_grapheme_boundary(content, sample_len);
let sample = &content[..safe_len];
let endobj_hits = sample.matches("endobj").take(3).count();
if endobj_hits < 2 {
return false;
}
let has_obj =
sample.contains(" 0 obj") || sample.contains("\n0 obj") || sample.contains("\r0 obj");
let has_stream = sample.contains("endstream");
let has_page_type = sample.contains("/Type /Page");
endobj_hits >= 2 && (has_obj || has_stream || has_page_type)
}
#[cfg(feature = "extractous")]
fn value_to_mime(value: &Value) -> Option<String> {
if let Some(mime) = value.as_str() {
return Some(mime.to_string());
}
if let Some(array) = value.as_array() {
for entry in array {
if let Some(mime) = entry.as_str() {
return Some(mime.to_string());
}
}
}
None
}
#[cfg(feature = "extractous")]
fn cache_lookup(hash: &blake3::Hash) -> Option<ExtractedDocument> {
let cache = EXTRACTION_CACHE
.get_or_init(|| Mutex::new(ExtractionCache::new(DEFAULT_EXTRACTION_CACHE_CAPACITY)));
cache.lock().ok().and_then(|mut map| map.get(hash))
}
#[cfg(feature = "extractous")]
fn cache_store(hash: blake3::Hash, document: &ExtractedDocument) {
let cache = EXTRACTION_CACHE
.get_or_init(|| Mutex::new(ExtractionCache::new(DEFAULT_EXTRACTION_CACHE_CAPACITY)));
if let Ok(mut map) = cache.lock() {
map.insert(hash, document.clone());
}
}
#[allow(dead_code)]
const PDF_LOPDF_MAX_BYTES: usize = 64 * 1024 * 1024; #[allow(dead_code)]
const PDF_LOPDF_MAX_PAGES: usize = 4_096;
#[allow(dead_code)]
fn pdf_text_extract_best(bytes: &[u8]) -> Result<Option<(String, &'static str)>> {
let mut best_text: Option<String> = None;
let mut best_source: &'static str = "";
#[cfg(any(feature = "pdf_oxide", feature = "pdf_extract"))]
let min_good_chars = (bytes.len() / 100).clamp(500, 5000);
#[cfg(feature = "pdf_oxide")]
{
let bytes_clone = bytes.to_vec();
let oxide_result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
pdf_text_extract_oxide(&bytes_clone)
}));
match oxide_result {
Ok(Ok(Some(text))) => {
let trimmed = text.trim();
if !trimmed.is_empty() {
if trimmed.len() >= min_good_chars {
tracing::debug!(
target: "memvid::extract",
len = trimmed.len(),
"pdf_oxide succeeded with good result"
);
return Ok(Some((trimmed.to_string(), "pdf_oxide")));
}
tracing::debug!(
target: "memvid::extract",
len = trimmed.len(),
min_good = min_good_chars,
"pdf_oxide returned partial result, trying fallbacks"
);
best_text = Some(trimmed.to_string());
best_source = "pdf_oxide";
}
}
Ok(Ok(None)) => {
tracing::debug!(target: "memvid::extract", "pdf_oxide returned no text");
}
Ok(Err(e)) => {
tracing::debug!(target: "memvid::extract", error = %e, "pdf_oxide failed");
}
Err(_) => {
tracing::warn!(target: "memvid::extract", "pdf_oxide panicked (likely font parsing issue), falling back to other extractors");
}
}
}
#[cfg(feature = "pdf_extract")]
{
match pdf_extract::extract_text_from_mem(bytes) {
Ok(text) => {
let trimmed = text.trim();
if !trimmed.is_empty() {
if best_text.is_none() && trimmed.len() >= min_good_chars {
tracing::debug!(
target: "memvid::extract",
len = trimmed.len(),
"pdf_extract succeeded with good result"
);
return Ok(Some((trimmed.to_string(), "pdf_extract")));
}
if best_text
.as_ref()
.is_none_or(|prev| trimmed.len() > prev.len())
{
best_text = Some(trimmed.to_string());
best_source = "pdf_extract";
}
}
}
Err(e) => {
tracing::debug!(target: "memvid::extract", error = %e, "pdf_extract failed");
}
}
}
match pdf_text_extract_lopdf(bytes) {
Ok(Some(text)) => {
let trimmed = text.trim();
if !trimmed.is_empty() {
if best_text
.as_ref()
.is_none_or(|prev| trimmed.len() > prev.len())
{
tracing::debug!(
target: "memvid::extract",
len = trimmed.len(),
"lopdf extracted more text"
);
best_text = Some(trimmed.to_string());
best_source = "lopdf";
}
}
}
Ok(None) => {
tracing::debug!(target: "memvid::extract", "lopdf returned no text");
}
Err(e) => {
tracing::debug!(target: "memvid::extract", error = %e, "lopdf failed");
}
}
Ok(best_text.map(|t| (fix_pdf_spacing(&t), best_source)))
}
#[cfg(feature = "pdf_oxide")]
#[allow(dead_code)]
fn pdf_text_extract_oxide(bytes: &[u8]) -> Result<Option<String>> {
use pdf_oxide::PdfDocument;
use std::io::Write;
use tempfile::NamedTempFile;
let mut temp_file = NamedTempFile::new().map_err(|err| MemvidError::ExtractionFailed {
reason: format!("pdf_oxide failed to create temp file: {err}").into(),
})?;
temp_file
.write_all(bytes)
.map_err(|err| MemvidError::ExtractionFailed {
reason: format!("pdf_oxide failed to write temp file: {err}").into(),
})?;
temp_file
.flush()
.map_err(|err| MemvidError::ExtractionFailed {
reason: format!("pdf_oxide failed to flush temp file: {err}").into(),
})?;
let temp_path = temp_file.path();
let mut doc = PdfDocument::open(temp_path).map_err(|err| MemvidError::ExtractionFailed {
reason: format!("pdf_oxide failed to load PDF: {err}").into(),
})?;
let page_count = doc
.page_count()
.map_err(|err| MemvidError::ExtractionFailed {
reason: format!("pdf_oxide failed to get page count: {err}").into(),
})?;
if page_count == 0 {
return Ok(None);
}
let mut all_text = String::new();
for page_idx in 0..page_count {
match doc.extract_text(page_idx) {
Ok(text) => {
if !text.is_empty() {
if !all_text.is_empty() {
all_text.push('\n');
}
all_text.push_str(&text);
}
}
Err(e) => {
tracing::debug!(
target: "memvid::extract",
page = page_idx,
error = %e,
"pdf_oxide failed to extract page"
);
}
}
}
let trimmed = all_text.trim();
if trimmed.is_empty() {
Ok(None)
} else {
Ok(Some(trimmed.to_string()))
}
}
#[allow(dead_code)]
fn is_probably_pdf_simple(bytes: &[u8]) -> bool {
if bytes.is_empty() {
return false;
}
let mut slice = bytes;
if slice.starts_with(&[0xEF, 0xBB, 0xBF]) {
slice = &slice[3..];
}
while let Some((first, rest)) = slice.split_first() {
if *first == 0 || first.is_ascii_whitespace() {
slice = rest;
} else {
break;
}
}
slice.starts_with(b"%PDF")
}
#[allow(dead_code)]
fn pdf_text_extract_lopdf(bytes: &[u8]) -> Result<Option<String>> {
if bytes.len() > PDF_LOPDF_MAX_BYTES {
return Err(MemvidError::ExtractionFailed {
reason: format!(
"PDF too large: {} bytes exceeds limit of {} bytes",
bytes.len(),
PDF_LOPDF_MAX_BYTES
)
.into(),
});
}
let mut document =
LopdfDocument::load_mem(bytes).map_err(|err| MemvidError::ExtractionFailed {
reason: format!("failed to load PDF: {err}").into(),
})?;
if document.is_encrypted() && document.decrypt("").is_err() {
return Err(MemvidError::ExtractionFailed {
reason: "cannot decrypt password-protected PDF".into(),
});
}
let () = document.decompress();
let mut page_numbers: Vec<u32> = document.get_pages().keys().copied().collect();
if page_numbers.is_empty() {
return Ok(None);
}
page_numbers.sort_unstable();
if page_numbers.len() > PDF_LOPDF_MAX_PAGES {
return Err(MemvidError::ExtractionFailed {
reason: format!(
"PDF has too many pages: {} exceeds limit of {}",
page_numbers.len(),
PDF_LOPDF_MAX_PAGES
)
.into(),
});
}
match document.extract_text(&page_numbers) {
Ok(text) => {
let trimmed = text.trim();
if trimmed.is_empty() {
Ok(None)
} else {
Ok(Some(trimmed.to_string()))
}
}
Err(err) => Err(MemvidError::ExtractionFailed {
reason: format!("failed to extract text from PDF: {err}").into(),
}),
}
}
#[cfg(all(test, feature = "extractous"))]
mod tests {
use super::*;
#[test]
fn detects_pdf_like_dump() {
let snippet = "binary %PDF snippet endobj endobj endstream 0 obj /Type /Page endobj ";
let dump = snippet.repeat(12);
assert!(looks_like_pdf_structure_dump(&dump));
}
#[test]
fn skips_normal_text() {
let text = "This is a perfectly normal paragraph that should not trigger the PDF fallback.";
assert!(!looks_like_pdf_structure_dump(text));
}
#[test]
fn identifies_pdf_magic() {
let bytes = b"%PDF-1.7 some data";
assert!(is_probably_pdf(bytes));
let padded = b"\n\n%PDF-1.5";
assert!(is_probably_pdf(padded));
let not_pdf = b"<!doctype html>";
assert!(!is_probably_pdf(not_pdf));
}
}
#[cfg(all(test, feature = "extractous"))]
mod pdf_fix_tests {
use super::*;
#[test]
fn test_pdf_structure_dump_detection_prevents_raw_indexing() {
let pdf_structure = b"%PDF-1.4\n%\xff\xff\xff\xff\n1 0 obj\n<</Type/Catalog>>\nendobj\n";
assert!(is_probably_pdf(pdf_structure));
let structure_dump =
"binary %PDF snippet endobj endobj endstream 0 obj /Type /Page endobj ".repeat(12);
assert!(looks_like_pdf_structure_dump(&structure_dump));
let normal_text = "This is perfectly normal extracted text from a document.";
assert!(!looks_like_pdf_structure_dump(normal_text));
}
}
#[cfg(all(test, feature = "extractous"))]
mod extraction_cache_tests {
use super::*;
fn make_doc(content: &str) -> ExtractedDocument {
ExtractedDocument {
text: Some(content.to_string()),
metadata: serde_json::json!({}),
mime_type: Some("text/plain".to_string()),
}
}
#[test]
fn test_extraction_cache_basic() {
let mut cache = ExtractionCache::new(10);
let hash = blake3::hash(b"test document");
let doc = make_doc("test content");
cache.insert(hash, doc.clone());
let retrieved = cache.get(&hash);
assert!(retrieved.is_some());
assert_eq!(retrieved.unwrap().text, Some("test content".to_string()));
}
#[test]
fn test_extraction_cache_stats() {
let mut cache = ExtractionCache::new(10);
let hash = blake3::hash(b"test");
cache.insert(hash, make_doc("test"));
let _ = cache.get(&hash);
let missing = blake3::hash(b"missing");
let _ = cache.get(&missing);
let (hits, misses, size) = cache.stats();
assert_eq!(hits, 1);
assert_eq!(misses, 1);
assert_eq!(size, 1);
}
#[test]
fn test_extraction_cache_eviction() {
let mut cache = ExtractionCache::new(3);
for i in 0..4u8 {
let hash = blake3::hash(&[i]);
cache.insert(hash, make_doc(&format!("doc{}", i)));
}
let evicted = blake3::hash(&[0u8]);
assert!(cache.get(&evicted).is_none());
for i in 1..4u8 {
let hash = blake3::hash(&[i]);
assert!(cache.get(&hash).is_some());
}
}
#[test]
fn test_extraction_cache_lru_promotion() {
let mut cache = ExtractionCache::new(3);
for i in 0..3u8 {
let hash = blake3::hash(&[i]);
cache.insert(hash, make_doc(&format!("doc{}", i)));
}
let first = blake3::hash(&[0u8]);
let _ = cache.get(&first);
let new_hash = blake3::hash(&[3u8]);
cache.insert(new_hash, make_doc("doc3"));
assert!(cache.get(&first).is_some());
let second = blake3::hash(&[1u8]);
assert!(cache.get(&second).is_none());
let third = blake3::hash(&[2u8]);
assert!(cache.get(&third).is_some());
assert!(cache.get(&new_hash).is_some());
}
#[test]
fn test_extraction_cache_update_existing() {
let mut cache = ExtractionCache::new(3);
let hash = blake3::hash(b"test");
cache.insert(hash, make_doc("original"));
cache.insert(hash, make_doc("updated"));
let retrieved = cache.get(&hash);
assert_eq!(retrieved.unwrap().text, Some("updated".to_string()));
let (_, _, size) = cache.stats();
assert_eq!(size, 1);
}
}