1use std::fs;
2use std::path::Path;
3
4use crate::{Result, error::MemvidError, text::truncate_at_grapheme_boundary};
5#[cfg(feature = "symspell_cleanup")]
7use crate::symspell_cleanup::fix_pdf_text as fix_pdf_spacing;
8#[cfg(not(feature = "symspell_cleanup"))]
9use crate::text::fix_pdf_spacing;
10
11#[cfg(feature = "extractous")]
12use log::LevelFilter;
13use lopdf::Document as LopdfDocument;
14use serde_json::{Value, json};
15
16#[cfg(feature = "extractous")]
17use extractous::Extractor;
18#[cfg(feature = "extractous")]
19use std::collections::{HashMap, VecDeque};
20#[cfg(feature = "extractous")]
21use std::sync::{Mutex, OnceLock};
22
/// Result of one extraction pass: the recovered plain text (if any), the
/// extractor-reported metadata as JSON, and the MIME type when one could be
/// determined from that metadata.
#[derive(Debug, Clone)]
pub struct ExtractedDocument {
    /// Extracted plain text; `None` when the source yielded no usable text.
    pub text: Option<String>,
    /// Raw extractor metadata; `Value::Null` when none was produced.
    pub metadata: Value,
    /// MIME type such as "application/pdf" or "text/plain", when known.
    pub mime_type: Option<String>,
}
31
32impl ExtractedDocument {
33 #[must_use]
34 pub fn empty() -> Self {
35 Self {
36 text: None,
37 metadata: Value::Null,
38 mime_type: None,
39 }
40 }
41}
42
/// Tunables for [`DocumentProcessor`].
#[derive(Debug, Clone, Copy)]
pub struct ProcessorConfig {
    // Cap on extracted text length. NOTE(review): despite the "chars" name,
    // downstream code compares this against `String::len` (bytes) and cuts at
    // a grapheme boundary — confirm whether bytes or chars is intended.
    pub max_text_chars: usize,
}
47
48impl Default for ProcessorConfig {
49 fn default() -> Self {
50 Self {
51 max_text_chars: 2_000_000,
52 }
53 }
54}
55
/// Maximum number of entries held by the process-wide extraction cache.
#[cfg(feature = "extractous")]
const DEFAULT_EXTRACTION_CACHE_CAPACITY: usize = 100;
63
/// Small LRU cache of extraction results keyed by the BLAKE3 hash of the
/// input bytes.
#[cfg(feature = "extractous")]
struct ExtractionCache {
    // Cached documents keyed by content hash.
    cache: HashMap<blake3::Hash, ExtractedDocument>,
    // Recency order: front = most recently used, back = next eviction victim.
    lru_queue: VecDeque<blake3::Hash>,
    // Maximum number of entries before the oldest is evicted.
    capacity: usize,
    // Lookup counters, exposed via `stats` for diagnostics only.
    hits: usize,
    misses: usize,
}
81
#[cfg(feature = "extractous")]
impl ExtractionCache {
    /// Builds an empty cache holding at most `capacity` entries.
    fn new(capacity: usize) -> Self {
        Self {
            cache: HashMap::with_capacity(capacity),
            lru_queue: VecDeque::with_capacity(capacity),
            capacity,
            hits: 0,
            misses: 0,
        }
    }

    /// Looks up a document by content hash, promoting it to most-recently-used
    /// on a hit and updating the hit/miss counters either way.
    fn get(&mut self, key: &blake3::Hash) -> Option<ExtractedDocument> {
        match self.cache.get(key) {
            Some(found) => {
                let document = found.clone();
                // Promote: drop any stale queue position, then mark as newest.
                self.lru_queue.retain(|queued| queued != key);
                self.lru_queue.push_front(*key);
                self.hits += 1;
                Some(document)
            }
            None => {
                self.misses += 1;
                None
            }
        }
    }

    /// Inserts (or refreshes) a document, evicting the least-recently-used
    /// entry when the cache would exceed its capacity.
    fn insert(&mut self, key: blake3::Hash, value: ExtractedDocument) {
        let was_present = self.cache.insert(key, value).is_some();
        if was_present {
            // Refresh path: value already replaced above; just re-queue the key.
            self.lru_queue.retain(|queued| *queued != key);
        } else if self.cache.len() > self.capacity {
            // `len()` already counts the new entry, so strictly-greater means
            // the cache is over budget; evict the stalest key.
            if let Some(oldest_key) = self.lru_queue.pop_back() {
                self.cache.remove(&oldest_key);
                tracing::debug!(
                    evicted_hash = ?oldest_key,
                    "Evicted oldest entry from extraction cache"
                );
            }
        }
        self.lru_queue.push_front(key);
    }

    /// Returns `(hits, misses, current_len)` for diagnostics.
    #[allow(dead_code)]
    fn stats(&self) -> (usize, usize, usize) {
        (self.hits, self.misses, self.cache.len())
    }
}
138
/// Text extractor backed by the `extractous` crate, with a lopdf-based PDF
/// fallback and a process-wide result cache for byte-slice inputs.
#[cfg(feature = "extractous")]
#[derive(Debug)]
pub struct DocumentProcessor {
    // NOTE(review): guarded by a Mutex so `&self` methods can use the
    // extractor; presumably `Extractor` is stateful or not `Sync` — confirm.
    extractor: Mutex<Extractor>,
    // Cap applied to extracted text in `into_document` (compared in bytes).
    max_length: usize,
}
149
#[cfg(feature = "extractous")]
impl Default for DocumentProcessor {
    /// Equivalent to `DocumentProcessor::new(ProcessorConfig::default())`.
    fn default() -> Self {
        Self::new(ProcessorConfig::default())
    }
}
156
/// Process-wide extraction cache, lazily initialized on first use by
/// `cache_lookup` / `cache_store`.
#[cfg(feature = "extractous")]
static EXTRACTION_CACHE: OnceLock<Mutex<ExtractionCache>> = OnceLock::new();
159
#[cfg(feature = "extractous")]
impl DocumentProcessor {
    /// Builds a processor around an `extractous::Extractor` configured for
    /// plain-text output truncated at roughly `config.max_text_chars`.
    pub fn new(config: ProcessorConfig) -> Self {
        // The extractous API takes an i32 limit: clamp to i32::MAX first so
        // the try_into below cannot fail in practice (unwrap_or is a net).
        let capped = config
            .max_text_chars
            .min(i32::MAX as usize)
            .try_into()
            .unwrap_or(i32::MAX);
        let mut extractor = Extractor::new().set_extract_string_max_length(capped);
        extractor = extractor.set_xml_output(false);
        Self {
            extractor: Mutex::new(extractor),
            max_length: config.max_text_chars,
        }
    }

    /// Extracts text from a file on disk, falling back to the lopdf extractor
    /// when extractous fails or returns a PDF structure dump instead of text.
    /// Unlike [`Self::extract_from_bytes`], results are not cached.
    ///
    /// # Errors
    /// Fails when the path is not valid UTF-8, or when both the primary
    /// extractor and the PDF fallback fail.
    pub fn extract_from_path(&self, path: &Path) -> Result<ExtractedDocument> {
        let path_str = path.to_str().ok_or_else(|| MemvidError::ExtractionFailed {
            reason: "input path contains invalid UTF-8".into(),
        })?;

        // Scope the extractor lock and the log-silencing guard so both are
        // released before any fallback work below.
        let extraction = {
            let extractor = self.locked()?;
            let _log_guard = ScopedLogLevel::lowered(LevelFilter::Off);
            extractor.extract_file_to_string(path_str)
        };

        match extraction {
            Ok((mut content, metadata)) => {
                // "Success" may still be unusable for PDFs (empty text or a
                // raw object dump); re-read the file and try lopdf. Fallback
                // errors here are ignored — the primary content stands.
                if needs_pdf_fallback(&content) {
                    if let Ok(bytes) = fs::read(path) {
                        if let Ok(Some(fallback_text)) = pdf_text_fallback(&bytes) {
                            content = fallback_text;
                        }
                    }
                }
                Ok(self.into_document(content, metadata))
            }
            Err(err) => {
                let primary_reason = err.to_string();
                // Primary extraction failed outright; the lopdf fallback is
                // the only remaining option (it no-ops on non-PDF input).
                if let Ok(bytes) = fs::read(path) {
                    match pdf_text_fallback(&bytes) {
                        Ok(Some(fallback_text)) => {
                            return Ok(self.into_document(fallback_text, pdf_fallback_metadata()));
                        }
                        // Not a PDF / no text: report the primary error below.
                        Ok(None) => {}
                        Err(fallback_err) => {
                            // Surface both failures so callers see the whole story.
                            let reason = format!(
                                "primary extractor error: {}; PDF fallback error: {}",
                                primary_reason, fallback_err
                            );
                            return Err(MemvidError::ExtractionFailed {
                                reason: reason.into(),
                            });
                        }
                    }
                }
                Err(MemvidError::ExtractionFailed {
                    reason: primary_reason.into(),
                })
            }
        }
    }

    /// Extracts text from an in-memory buffer, consulting a process-wide LRU
    /// cache keyed by the BLAKE3 hash of `bytes`, with the same lopdf
    /// fallback behaviour as [`Self::extract_from_path`].
    ///
    /// # Errors
    /// Fails when both the primary extractor and the PDF fallback fail.
    pub fn extract_from_bytes(&self, bytes: &[u8]) -> Result<ExtractedDocument> {
        let hash = blake3::hash(bytes);
        if let Some(cached) = cache_lookup(&hash) {
            // NOTE(review): `target =` records a *field* named "target",
            // unlike the `target:` directive used everywhere else in this
            // file — confirm whether this call meant to set the log target.
            tracing::debug!(target = "memvid::extract", reader = "cache", "cache hit");
            return Ok(cached);
        }

        // Scope the extractor lock and log guard as in extract_from_path.
        let extraction = {
            let extractor = self.locked()?;
            let _log_guard = ScopedLogLevel::lowered(LevelFilter::Off);
            extractor.extract_bytes_to_string(bytes)
        };

        let document = match extraction {
            Ok((mut content, metadata)) => {
                let pdf_needed = needs_pdf_fallback(&content);
                tracing::debug!(
                    target: "memvid::extract",
                    content_len = content.len(),
                    pdf_fallback_needed = pdf_needed,
                    "extractous returned content"
                );
                if pdf_needed {
                    match pdf_text_fallback(bytes) {
                        Ok(Some(fallback_text)) => {
                            tracing::debug!(
                                target: "memvid::extract",
                                fallback_len = fallback_text.len(),
                                "lopdf fallback succeeded"
                            );
                            content = fallback_text;
                        }
                        Ok(None) => {
                            tracing::debug!(
                                target: "memvid::extract",
                                "lopdf fallback returned None"
                            );
                            // Discard the unusable primary content rather than
                            // indexing a PDF structure dump.
                            content = String::new();
                        }
                        Err(e) => {
                            tracing::debug!(
                                target: "memvid::extract",
                                error = %e,
                                "lopdf fallback failed"
                            );
                            content = String::new();
                        }
                    }
                }
                self.into_document(content, metadata)
            }
            Err(err) => {
                let primary_reason = err.to_string();
                match pdf_text_fallback(bytes) {
                    Ok(Some(fallback_text)) => {
                        self.into_document(fallback_text, pdf_fallback_metadata())
                    }
                    Ok(None) => {
                        return Err(MemvidError::ExtractionFailed {
                            reason: primary_reason.into(),
                        });
                    }
                    Err(fallback_err) => {
                        let reason = format!(
                            "primary extractor error: {}; PDF fallback error: {}",
                            primary_reason, fallback_err
                        );
                        return Err(MemvidError::ExtractionFailed {
                            reason: reason.into(),
                        });
                    }
                }
            }
        };

        // Only successful extractions reach this point and get cached.
        cache_store(hash, &document);
        Ok(document)
    }

    /// Locks the inner extractor, mapping mutex poisoning to an extraction error.
    fn locked(&self) -> Result<std::sync::MutexGuard<'_, Extractor>> {
        self.extractor
            .lock()
            .map_err(|_| MemvidError::ExtractionFailed {
                reason: "extractor mutex poisoned".into(),
            })
    }

    /// Packages extracted content and serializable metadata into an
    /// [`ExtractedDocument`]: blank content becomes `text: None`, oversized
    /// content is truncated at a grapheme boundary near `max_length` bytes,
    /// and the MIME type is pulled from the metadata's "Content-Type" key.
    fn into_document<M>(&self, content: String, metadata: M) -> ExtractedDocument
    where
        M: serde::Serialize,
    {
        // Unserializable metadata degrades to Null rather than failing extraction.
        let metadata_value = serde_json::to_value(metadata).unwrap_or(Value::Null);
        let mime_type = metadata_value.get("Content-Type").and_then(value_to_mime);

        let text = if content.trim().is_empty() {
            tracing::debug!(
                target: "memvid::extract",
                "into_document: content is empty, returning text=None"
            );
            None
        } else {
            let final_text = if content.len() > self.max_length {
                // Cut at a grapheme boundary so the byte slice below is valid UTF-8.
                let end = truncate_at_grapheme_boundary(&content, self.max_length);
                content[..end].to_string()
            } else {
                content
            };
            tracing::debug!(
                target: "memvid::extract",
                text_len = final_text.len(),
                starts_with_pdf = final_text.starts_with("%PDF"),
                "into_document: returning text"
            );
            Some(final_text)
        };

        ExtractedDocument {
            text,
            metadata: metadata_value,
            mime_type,
        }
    }
}
350
/// Fallback processor used when the `extractous` feature is disabled: PDFs go
/// through the pure-Rust PDF extractors, everything else is treated as plain
/// text or opaque binary.
#[cfg(not(feature = "extractous"))]
#[derive(Debug)]
pub struct DocumentProcessor {
    // Cap applied to extracted text (compared in bytes, cut at a grapheme boundary).
    max_length: usize,
}
360
361#[cfg(not(feature = "extractous"))]
362impl Default for DocumentProcessor {
363 fn default() -> Self {
364 Self::new(Default::default())
365 }
366}
367
368#[cfg(not(feature = "extractous"))]
369impl DocumentProcessor {
370 #[must_use]
371 pub fn new(config: ProcessorConfig) -> Self {
372 Self {
373 max_length: config.max_text_chars,
374 }
375 }
376
377 pub fn extract_from_path(&self, path: &Path) -> Result<ExtractedDocument> {
378 let bytes = fs::read(path).map_err(|e| MemvidError::ExtractionFailed {
380 reason: format!("failed to read file: {e}").into(),
381 })?;
382 self.extract_from_bytes(&bytes)
383 }
384
385 pub fn extract_from_bytes(&self, bytes: &[u8]) -> Result<ExtractedDocument> {
386 if is_probably_pdf_simple(bytes) {
388 match pdf_text_extract_best(bytes) {
389 Ok(Some((text, extractor))) => {
390 let truncate_len = truncate_at_grapheme_boundary(&text, self.max_length);
391 let truncated = &text[..truncate_len];
392 return Ok(ExtractedDocument {
393 text: Some(truncated.to_string()),
394 metadata: json!({
395 "Content-Type": "application/pdf",
396 "extraction": extractor,
397 }),
398 mime_type: Some("application/pdf".to_string()),
399 });
400 }
401 Ok(None) => {
402 return Ok(ExtractedDocument {
404 text: None,
405 metadata: json!({
406 "Content-Type": "application/pdf",
407 "extraction": "no_text",
408 }),
409 mime_type: Some("application/pdf".to_string()),
410 });
411 }
412 Err(e) => {
413 tracing::warn!(target: "memvid::extract", error = %e, "PDF extraction failed");
414 }
416 }
417 }
418
419 if let Ok(text) = std::str::from_utf8(bytes) {
422 let sample = &bytes[..bytes.len().min(8192)];
424 if !sample.contains(&0) {
425 let truncate_len = truncate_at_grapheme_boundary(text, self.max_length);
426 let truncated = &text[..truncate_len];
427 return Ok(ExtractedDocument {
428 text: Some(truncated.to_string()),
429 metadata: json!({}),
430 mime_type: Some("text/plain".to_string()),
431 });
432 }
433 }
434
435 Ok(ExtractedDocument {
439 text: None,
440 metadata: json!({}),
441 mime_type: Some("application/octet-stream".to_string()),
442 })
443 }
444}
445
/// True when extractous output is unusable for indexing: either blank, or it
/// looks like a raw PDF object dump rather than real text.
#[cfg(feature = "extractous")]
fn needs_pdf_fallback(content: &str) -> bool {
    content.trim().is_empty() || looks_like_pdf_structure_dump(content)
}
453
/// Metadata attached to documents whose text came from the lopdf fallback path.
#[cfg(feature = "extractous")]
fn pdf_fallback_metadata() -> Value {
    json!({
        "Content-Type": "application/pdf",
        "extraction": "lopdf_fallback",
    })
}
461
/// Upper bound on input size accepted by the lopdf fallback (64 MiB).
#[cfg(feature = "extractous")]
const PDF_FALLBACK_MAX_BYTES: usize = 64 * 1024 * 1024;

/// Upper bound on page count accepted by the lopdf fallback.
#[cfg(feature = "extractous")]
const PDF_FALLBACK_MAX_PAGES: usize = 4_096;
466
/// Fallback PDF text extraction via lopdf, used when extractous fails or
/// returns nothing usable. Returns `Ok(None)` for non-PDF input or PDFs
/// containing no text.
///
/// # Errors
/// Fails on oversized input, password-protected files (only the empty
/// password is attempted), unloadable documents, excessive page counts, or
/// lopdf extraction errors.
#[cfg(feature = "extractous")]
fn pdf_text_fallback(bytes: &[u8]) -> Result<Option<String>> {
    if !is_probably_pdf(bytes) {
        return Ok(None);
    }

    if bytes.len() > PDF_FALLBACK_MAX_BYTES {
        return Err(MemvidError::ExtractionFailed {
            reason: format!(
                "pdf fallback aborted: {} bytes exceeds limit of {} bytes",
                bytes.len(),
                PDF_FALLBACK_MAX_BYTES
            )
            .into(),
        });
    }

    // lopdf can be chatty on malformed files; silence `log` output while parsing.
    let _log_guard = ScopedLogLevel::lowered(LevelFilter::Off);
    let mut document =
        LopdfDocument::load_mem(bytes).map_err(|err| MemvidError::ExtractionFailed {
            reason: format!("pdf fallback failed to load document: {err}").into(),
        })?;

    // Collapsed from a nested `if` (clippy::collapsible_if), matching the
    // equivalent check in `pdf_text_extract_lopdf`.
    if document.is_encrypted() && document.decrypt("").is_err() {
        return Err(MemvidError::ExtractionFailed {
            reason: "pdf fallback cannot decrypt password-protected file".into(),
        });
    }

    // Best effort: text extraction can still work on partially compressed docs.
    let _ = document.decompress();

    let mut page_numbers: Vec<u32> = document.get_pages().keys().copied().collect();
    if page_numbers.is_empty() {
        return Ok(None);
    }
    page_numbers.sort_unstable();

    if page_numbers.len() > PDF_FALLBACK_MAX_PAGES {
        return Err(MemvidError::ExtractionFailed {
            reason: format!(
                "pdf fallback aborted: page count {} exceeds limit of {}",
                page_numbers.len(),
                PDF_FALLBACK_MAX_PAGES
            )
            .into(),
        });
    }

    match document.extract_text(&page_numbers) {
        Ok(text) => {
            let trimmed = text.trim();
            if trimmed.is_empty() {
                Ok(None)
            } else {
                // Re-join words that lopdf splits at line/stream boundaries.
                Ok(Some(fix_pdf_spacing(trimmed)))
            }
        }
        Err(err) => Err(MemvidError::ExtractionFailed {
            reason: format!("pdf fallback failed to extract text: {err}").into(),
        }),
    }
}
532
/// RAII guard that temporarily lowers the global `log` max level and restores
/// the previous level on drop.
#[cfg(feature = "extractous")]
struct ScopedLogLevel {
    // Level in effect before this guard was created.
    previous: LevelFilter,
    // Whether we actually lowered the level (and so must restore it on drop).
    changed: bool,
}
538
#[cfg(feature = "extractous")]
impl ScopedLogLevel {
    /// Lowers the global log level to `level` if it is stricter than the
    /// current one; otherwise leaves logging untouched.
    fn lowered(level: LevelFilter) -> Self {
        let previous = log::max_level();
        let changed = level < previous;
        if changed {
            log::set_max_level(level);
        }
        Self { previous, changed }
    }
}
557
#[cfg(feature = "extractous")]
impl Drop for ScopedLogLevel {
    // Restore the original log level, but only if `lowered` actually changed it.
    fn drop(&mut self) {
        if self.changed {
            log::set_max_level(self.previous);
        }
    }
}
566
567#[cfg(feature = "extractous")]
/// Heuristic PDF sniff: strips an optional UTF-8 BOM, skips leading NUL bytes
/// and ASCII whitespace, then checks for the `%PDF` magic. Empty or
/// all-padding input is not a PDF.
fn is_probably_pdf(bytes: &[u8]) -> bool {
    let body = bytes.strip_prefix(&[0xEF, 0xBB, 0xBF]).unwrap_or(bytes);
    let start = body
        .iter()
        .position(|b| *b != 0 && !b.is_ascii_whitespace())
        .unwrap_or(body.len());
    body[start..].starts_with(b"%PDF")
}
585
/// Detects extractous output that is really a raw PDF object dump (repeated
/// `endobj` markers plus object/stream/page tokens) rather than prose.
#[cfg(feature = "extractous")]
fn looks_like_pdf_structure_dump(content: &str) -> bool {
    // Too short to be a meaningful dump.
    if content.len() < 256 {
        return false;
    }
    // Inspect at most the first 8 KiB, cut at a grapheme boundary so the
    // slice below cannot split a UTF-8 sequence.
    let limit = truncate_at_grapheme_boundary(content, content.len().min(8_192));
    let sample = &content[..limit];
    // At least two `endobj` markers are required before the cheaper token checks.
    if sample.matches("endobj").take(3).count() < 2 {
        return false;
    }
    sample.contains(" 0 obj")
        || sample.contains("\n0 obj")
        || sample.contains("\r0 obj")
        || sample.contains("endstream")
        || sample.contains("/Type /Page")
}
605
/// Pulls a MIME string out of a metadata value that may be either a single
/// string or an array of values (first string entry wins).
#[cfg(feature = "extractous")]
fn value_to_mime(value: &Value) -> Option<String> {
    match value {
        Value::String(mime) => Some(mime.clone()),
        Value::Array(entries) => entries
            .iter()
            .find_map(|entry| entry.as_str().map(str::to_string)),
        _ => None,
    }
}
620
/// Looks up a previously extracted document by content hash in the global
/// cache. Returns `None` on a miss or if the cache mutex is poisoned.
#[cfg(feature = "extractous")]
fn cache_lookup(hash: &blake3::Hash) -> Option<ExtractedDocument> {
    let cache = EXTRACTION_CACHE
        .get_or_init(|| Mutex::new(ExtractionCache::new(DEFAULT_EXTRACTION_CACHE_CAPACITY)));
    cache.lock().ok().and_then(|mut map| map.get(hash))
}
627
/// Stores an extraction result in the global cache keyed by content hash.
/// Silently does nothing if the cache mutex is poisoned.
#[cfg(feature = "extractous")]
fn cache_store(hash: blake3::Hash, document: &ExtractedDocument) {
    let cache = EXTRACTION_CACHE
        .get_or_init(|| Mutex::new(ExtractionCache::new(DEFAULT_EXTRACTION_CACHE_CAPACITY)));
    if let Ok(mut map) = cache.lock() {
        map.insert(hash, document.clone());
    }
}
636
/// Upper bound on input size accepted by `pdf_text_extract_lopdf` (64 MiB).
#[allow(dead_code)]
const PDF_LOPDF_MAX_BYTES: usize = 64 * 1024 * 1024;

/// Upper bound on page count accepted by `pdf_text_extract_lopdf`.
#[allow(dead_code)]
const PDF_LOPDF_MAX_PAGES: usize = 4_096;
646
/// Tries the available PDF extractors in order (pdf_oxide, pdf_extract,
/// lopdf) and returns the best text found plus the name of the extractor
/// that produced it.
///
/// A result is "good enough" to win outright when its length reaches
/// ~1% of the input byte length, clamped to 500..=5000; otherwise the
/// longest non-empty result across all extractors is kept.
#[allow(dead_code)]
fn pdf_text_extract_best(bytes: &[u8]) -> Result<Option<(String, &'static str)>> {
    let mut best_text: Option<String> = None;
    let mut best_source: &'static str = "";

    // Only needed when at least one of the early-exit extractors is compiled in.
    #[cfg(any(feature = "pdf_oxide", feature = "pdf_extract"))]
    let min_good_chars = (bytes.len() / 100).clamp(500, 5000);

    #[cfg(feature = "pdf_oxide")]
    {
        // pdf_oxide can panic on some inputs (see the warn below), so run it
        // behind catch_unwind on a clone of the bytes.
        let bytes_clone = bytes.to_vec();
        let oxide_result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
            pdf_text_extract_oxide(&bytes_clone)
        }));

        match oxide_result {
            Ok(Ok(Some(text))) => {
                let trimmed = text.trim();
                if !trimmed.is_empty() {
                    if trimmed.len() >= min_good_chars {
                        tracing::debug!(
                            target: "memvid::extract",
                            len = trimmed.len(),
                            "pdf_oxide succeeded with good result"
                        );
                        return Ok(Some((trimmed.to_string(), "pdf_oxide")));
                    }
                    tracing::debug!(
                        target: "memvid::extract",
                        len = trimmed.len(),
                        min_good = min_good_chars,
                        "pdf_oxide returned partial result, trying fallbacks"
                    );
                    best_text = Some(trimmed.to_string());
                    best_source = "pdf_oxide";
                }
            }
            Ok(Ok(None)) => {
                tracing::debug!(target: "memvid::extract", "pdf_oxide returned no text");
            }
            Ok(Err(e)) => {
                tracing::debug!(target: "memvid::extract", error = %e, "pdf_oxide failed");
            }
            Err(_) => {
                tracing::warn!(target: "memvid::extract", "pdf_oxide panicked (likely font parsing issue), falling back to other extractors");
            }
        }
    }

    #[cfg(feature = "pdf_extract")]
    {
        match pdf_extract::extract_text_from_mem(bytes) {
            Ok(text) => {
                let trimmed = text.trim();
                if !trimmed.is_empty() {
                    // Early exit only when no earlier extractor produced anything.
                    if best_text.is_none() && trimmed.len() >= min_good_chars {
                        tracing::debug!(
                            target: "memvid::extract",
                            len = trimmed.len(),
                            "pdf_extract succeeded with good result"
                        );
                        return Ok(Some((trimmed.to_string(), "pdf_extract")));
                    }
                    // Otherwise keep whichever candidate is longer.
                    if best_text
                        .as_ref()
                        .is_none_or(|prev| trimmed.len() > prev.len())
                    {
                        best_text = Some(trimmed.to_string());
                        best_source = "pdf_extract";
                    }
                }
            }
            Err(e) => {
                tracing::debug!(target: "memvid::extract", error = %e, "pdf_extract failed");
            }
        }
    }

    // lopdf always runs last and never exits early; it only replaces the
    // candidate when it found strictly more text.
    match pdf_text_extract_lopdf(bytes) {
        Ok(Some(text)) => {
            let trimmed = text.trim();
            if !trimmed.is_empty() {
                if best_text
                    .as_ref()
                    .is_none_or(|prev| trimmed.len() > prev.len())
                {
                    tracing::debug!(
                        target: "memvid::extract",
                        len = trimmed.len(),
                        "lopdf extracted more text"
                    );
                    best_text = Some(trimmed.to_string());
                    best_source = "lopdf";
                }
            }
        }
        Ok(None) => {
            tracing::debug!(target: "memvid::extract", "lopdf returned no text");
        }
        Err(e) => {
            tracing::debug!(target: "memvid::extract", error = %e, "lopdf failed");
        }
    }

    // Whatever won gets the word-spacing repair pass before being returned.
    Ok(best_text.map(|t| (fix_pdf_spacing(&t), best_source)))
}
764
/// Extracts text with pdf_oxide. The crate opens files rather than byte
/// slices, so the input is first written to a named temp file. Per-page
/// extraction errors are logged and skipped; page texts are joined with
/// newlines. Returns `Ok(None)` when the document has no pages or no text.
#[cfg(feature = "pdf_oxide")]
#[allow(dead_code)]
fn pdf_text_extract_oxide(bytes: &[u8]) -> Result<Option<String>> {
    use pdf_oxide::PdfDocument;
    use std::io::Write;
    use tempfile::NamedTempFile;

    let mut temp_file = NamedTempFile::new().map_err(|err| MemvidError::ExtractionFailed {
        reason: format!("pdf_oxide failed to create temp file: {err}").into(),
    })?;

    temp_file
        .write_all(bytes)
        .map_err(|err| MemvidError::ExtractionFailed {
            reason: format!("pdf_oxide failed to write temp file: {err}").into(),
        })?;

    // Flush before handing the path to pdf_oxide so it sees the full content.
    temp_file
        .flush()
        .map_err(|err| MemvidError::ExtractionFailed {
            reason: format!("pdf_oxide failed to flush temp file: {err}").into(),
        })?;

    let temp_path = temp_file.path();
    let mut doc = PdfDocument::open(temp_path).map_err(|err| MemvidError::ExtractionFailed {
        reason: format!("pdf_oxide failed to load PDF: {err}").into(),
    })?;

    let page_count = doc
        .page_count()
        .map_err(|err| MemvidError::ExtractionFailed {
            reason: format!("pdf_oxide failed to get page count: {err}").into(),
        })?;
    if page_count == 0 {
        return Ok(None);
    }

    let mut all_text = String::new();
    for page_idx in 0..page_count {
        match doc.extract_text(page_idx) {
            Ok(text) => {
                if !text.is_empty() {
                    if !all_text.is_empty() {
                        all_text.push('\n');
                    }
                    all_text.push_str(&text);
                }
            }
            Err(e) => {
                // Best effort: a bad page should not sink the whole document.
                tracing::debug!(
                    target: "memvid::extract",
                    page = page_idx,
                    error = %e,
                    "pdf_oxide failed to extract page"
                );
            }
        }
    }

    let trimmed = all_text.trim();
    if trimmed.is_empty() {
        Ok(None)
    } else {
        Ok(Some(trimmed.to_string()))
    }
}
834
/// Stdlib-only PDF sniff used by the non-extractous build: strips an optional
/// UTF-8 BOM, skips leading NUL bytes and ASCII whitespace, then checks for
/// the `%PDF` magic.
/// NOTE(review): intentionally mirrors `is_probably_pdf` from the extractous build.
#[allow(dead_code)]
fn is_probably_pdf_simple(bytes: &[u8]) -> bool {
    let without_bom = if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) {
        &bytes[3..]
    } else {
        bytes
    };
    without_bom
        .iter()
        .position(|b| *b != 0 && !b.is_ascii_whitespace())
        .map_or(false, |start| without_bom[start..].starts_with(b"%PDF"))
}
856
857#[allow(dead_code)]
859fn pdf_text_extract_lopdf(bytes: &[u8]) -> Result<Option<String>> {
860 if bytes.len() > PDF_LOPDF_MAX_BYTES {
861 return Err(MemvidError::ExtractionFailed {
862 reason: format!(
863 "PDF too large: {} bytes exceeds limit of {} bytes",
864 bytes.len(),
865 PDF_LOPDF_MAX_BYTES
866 )
867 .into(),
868 });
869 }
870
871 let mut document =
872 LopdfDocument::load_mem(bytes).map_err(|err| MemvidError::ExtractionFailed {
873 reason: format!("failed to load PDF: {err}").into(),
874 })?;
875
876 if document.is_encrypted() && document.decrypt("").is_err() {
878 return Err(MemvidError::ExtractionFailed {
879 reason: "cannot decrypt password-protected PDF".into(),
880 });
881 }
882
883 let () = document.decompress();
885
886 let mut page_numbers: Vec<u32> = document.get_pages().keys().copied().collect();
887 if page_numbers.is_empty() {
888 return Ok(None);
889 }
890 page_numbers.sort_unstable();
891
892 if page_numbers.len() > PDF_LOPDF_MAX_PAGES {
893 return Err(MemvidError::ExtractionFailed {
894 reason: format!(
895 "PDF has too many pages: {} exceeds limit of {}",
896 page_numbers.len(),
897 PDF_LOPDF_MAX_PAGES
898 )
899 .into(),
900 });
901 }
902
903 match document.extract_text(&page_numbers) {
904 Ok(text) => {
905 let trimmed = text.trim();
906 if trimmed.is_empty() {
907 Ok(None)
908 } else {
909 Ok(Some(trimmed.to_string()))
910 }
911 }
912 Err(err) => Err(MemvidError::ExtractionFailed {
913 reason: format!("failed to extract text from PDF: {err}").into(),
914 }),
915 }
916}
917
// Unit tests for the PDF sniffing heuristics (extractous build only).
#[cfg(all(test, feature = "extractous"))]
mod tests {
    use super::*;

    // A repeated snippet full of PDF object markers must be flagged as a
    // structure dump.
    #[test]
    fn detects_pdf_like_dump() {
        let snippet = "binary %PDF snippet endobj endobj endstream 0 obj /Type /Page endobj ";
        let dump = snippet.repeat(12);
        assert!(looks_like_pdf_structure_dump(&dump));
    }

    // Ordinary prose must never trip the dump heuristic.
    #[test]
    fn skips_normal_text() {
        let text = "This is a perfectly normal paragraph that should not trigger the PDF fallback.";
        assert!(!looks_like_pdf_structure_dump(text));
    }

    // The %PDF magic must be found with or without leading whitespace, and
    // must be absent from non-PDF data.
    #[test]
    fn identifies_pdf_magic() {
        let bytes = b"%PDF-1.7 some data";
        assert!(is_probably_pdf(bytes));
        let padded = b"\n\n%PDF-1.5";
        assert!(is_probably_pdf(padded));
        let not_pdf = b"<!doctype html>";
        assert!(!is_probably_pdf(not_pdf));
    }
}
945
// Regression tests guarding against raw PDF structure being indexed as text.
#[cfg(all(test, feature = "extractous"))]
mod pdf_fix_tests {
    use super::*;

    #[test]
    fn test_pdf_structure_dump_detection_prevents_raw_indexing() {
        // Raw PDF bytes must be recognized as a PDF...
        let pdf_structure = b"%PDF-1.4\n%\xff\xff\xff\xff\n1 0 obj\n<</Type/Catalog>>\nendobj\n";

        assert!(is_probably_pdf(pdf_structure));

        // ...and a long run of object markers must be classified as a dump.
        let structure_dump =
            "binary %PDF snippet endobj endobj endstream 0 obj /Type /Page endobj ".repeat(12);
        assert!(looks_like_pdf_structure_dump(&structure_dump));

        // Genuine extracted prose must pass through untouched.
        let normal_text = "This is perfectly normal extracted text from a document.";
        assert!(!looks_like_pdf_structure_dump(normal_text));
    }
}
972
// Unit tests for the LRU extraction cache (extractous build only).
#[cfg(all(test, feature = "extractous"))]
mod extraction_cache_tests {
    use super::*;

    // Helper: builds a minimal text/plain document carrying `content`.
    fn make_doc(content: &str) -> ExtractedDocument {
        ExtractedDocument {
            text: Some(content.to_string()),
            metadata: serde_json::json!({}),
            mime_type: Some("text/plain".to_string()),
        }
    }

    // Round trip: an inserted document is retrievable by its hash.
    #[test]
    fn test_extraction_cache_basic() {
        let mut cache = ExtractionCache::new(10);
        let hash = blake3::hash(b"test document");
        let doc = make_doc("test content");

        cache.insert(hash, doc.clone());
        let retrieved = cache.get(&hash);
        assert!(retrieved.is_some());
        assert_eq!(retrieved.unwrap().text, Some("test content".to_string()));
    }

    // One hit plus one miss must be reflected in the counters.
    #[test]
    fn test_extraction_cache_stats() {
        let mut cache = ExtractionCache::new(10);
        let hash = blake3::hash(b"test");
        cache.insert(hash, make_doc("test"));

        let _ = cache.get(&hash);
        let missing = blake3::hash(b"missing");
        let _ = cache.get(&missing);

        let (hits, misses, size) = cache.stats();
        assert_eq!(hits, 1);
        assert_eq!(misses, 1);
        assert_eq!(size, 1);
    }

    // Inserting one past capacity must evict the oldest entry only.
    #[test]
    fn test_extraction_cache_eviction() {
        let mut cache = ExtractionCache::new(3);

        for i in 0..4u8 {
            let hash = blake3::hash(&[i]);
            cache.insert(hash, make_doc(&format!("doc{}", i)));
        }

        // Entry 0 was least recently used and must be gone.
        let evicted = blake3::hash(&[0u8]);
        assert!(cache.get(&evicted).is_none());

        // Entries 1..=3 must survive.
        for i in 1..4u8 {
            let hash = blake3::hash(&[i]);
            assert!(cache.get(&hash).is_some());
        }
    }

    // A `get` must promote the entry so a later insert evicts something else.
    #[test]
    fn test_extraction_cache_lru_promotion() {
        let mut cache = ExtractionCache::new(3);

        for i in 0..3u8 {
            let hash = blake3::hash(&[i]);
            cache.insert(hash, make_doc(&format!("doc{}", i)));
        }

        // Touch entry 0 so it becomes most-recently-used.
        let first = blake3::hash(&[0u8]);
        let _ = cache.get(&first);

        // This insert must evict entry 1 (now the oldest), not entry 0.
        let new_hash = blake3::hash(&[3u8]);
        cache.insert(new_hash, make_doc("doc3"));

        assert!(cache.get(&first).is_some());

        let second = blake3::hash(&[1u8]);
        assert!(cache.get(&second).is_none());

        let third = blake3::hash(&[2u8]);
        assert!(cache.get(&third).is_some());
        assert!(cache.get(&new_hash).is_some());
    }

    // Re-inserting an existing key must replace the value without growing the cache.
    #[test]
    fn test_extraction_cache_update_existing() {
        let mut cache = ExtractionCache::new(3);
        let hash = blake3::hash(b"test");

        cache.insert(hash, make_doc("original"));
        cache.insert(hash, make_doc("updated"));

        let retrieved = cache.get(&hash);
        assert_eq!(retrieved.unwrap().text, Some("updated".to_string()));

        let (_, _, size) = cache.stats();
        assert_eq!(size, 1);
    }
}
1086}