memvid_core/
extract.rs

1use std::fs;
2use std::path::Path;
3
4use crate::{Result, error::MemvidError, text::truncate_at_grapheme_boundary};
5// Use SymSpell-based cleanup when feature is enabled, otherwise fall back to heuristic
6#[cfg(feature = "symspell_cleanup")]
7use crate::symspell_cleanup::fix_pdf_text as fix_pdf_spacing;
8#[cfg(not(feature = "symspell_cleanup"))]
9use crate::text::fix_pdf_spacing;
10
11#[cfg(feature = "extractous")]
12use log::LevelFilter;
13use lopdf::Document as LopdfDocument;
14use serde_json::{Value, json};
15
16#[cfg(feature = "extractous")]
17use extractous::Extractor;
18#[cfg(feature = "extractous")]
19use std::collections::{HashMap, VecDeque};
20#[cfg(feature = "extractous")]
21use std::sync::{Mutex, OnceLock};
22
23/// Structured result produced by [`DocumentProcessor`] after running
24/// Extractous over an input document.
25#[derive(Debug, Clone)]
26pub struct ExtractedDocument {
27    pub text: Option<String>,
28    pub metadata: Value,
29    pub mime_type: Option<String>,
30}
31
32impl ExtractedDocument {
33    #[must_use]
34    pub fn empty() -> Self {
35        Self {
36            text: None,
37            metadata: Value::Null,
38            mime_type: None,
39        }
40    }
41}
42
/// Tunable limits for a [`DocumentProcessor`].
#[derive(Debug, Clone, Copy)]
pub struct ProcessorConfig {
    /// Upper bound applied to extracted text before it is returned.
    pub max_text_chars: usize,
}

impl Default for ProcessorConfig {
    /// Uses a text limit of 2,000,000.
    fn default() -> Self {
        ProcessorConfig {
            max_text_chars: 2_000_000,
        }
    }
}
55
56// ============================================================================
57// Extraction Cache with LRU Eviction
58// ============================================================================
59
/// Default capacity for extraction cache (number of documents).
///
/// Passed to `ExtractionCache::new` when the process-wide cache is first initialised
/// by `cache_lookup` or `cache_store`.
#[cfg(feature = "extractous")]
const DEFAULT_EXTRACTION_CACHE_CAPACITY: usize = 100;
63
/// Bounded cache of extracted documents, keyed by content hash.
///
/// Once `capacity` entries are stored, inserting a new key drops the entry that
/// was touched least recently. Follows the same pattern as `EmbeddingCache` in
/// `text_embed.rs`.
#[cfg(feature = "extractous")]
struct ExtractionCache {
    /// Document hash -> extracted document.
    cache: HashMap<blake3::Hash, ExtractedDocument>,
    /// Access order: front is the most recently used key, back is next to evict.
    lru_queue: VecDeque<blake3::Hash>,
    /// Entry count at which inserting a new key triggers an eviction.
    capacity: usize,
    /// Number of lookups that found an entry.
    hits: usize,
    /// Number of lookups that found nothing.
    misses: usize,
}

#[cfg(feature = "extractous")]
impl ExtractionCache {
    /// Creates an empty cache with storage pre-allocated for `capacity` entries.
    fn new(capacity: usize) -> Self {
        let cache = HashMap::with_capacity(capacity);
        let lru_queue = VecDeque::with_capacity(capacity);
        Self {
            cache,
            lru_queue,
            capacity,
            hits: 0,
            misses: 0,
        }
    }

    /// Returns a clone of the entry for `key`, marking it most recently used.
    ///
    /// Updates the hit/miss counters either way.
    fn get(&mut self, key: &blake3::Hash) -> Option<ExtractedDocument> {
        let Some(found) = self.cache.get(key).cloned() else {
            self.misses += 1;
            return None;
        };
        // Re-queue the key at the front so it is the last candidate for eviction.
        self.lru_queue.retain(|queued| queued != key);
        self.lru_queue.push_front(*key);
        self.hits += 1;
        Some(found)
    }

    /// Stores `value` under `key` as the most recently used entry.
    ///
    /// Replacing an existing key never evicts; adding a new key to a full cache
    /// first removes the entry at the back of the queue.
    fn insert(&mut self, key: blake3::Hash, value: ExtractedDocument) {
        if self.cache.contains_key(&key) {
            // Existing entry: forget its old queue slot; it is re-queued below.
            self.lru_queue.retain(|queued| *queued != key);
        } else if self.cache.len() >= self.capacity {
            // New entry into a full cache: make room first.
            if let Some(stale) = self.lru_queue.pop_back() {
                self.cache.remove(&stale);
                tracing::debug!(
                    evicted_hash = ?stale,
                    "Evicted oldest entry from extraction cache"
                );
            }
        }

        self.cache.insert(key, value);
        self.lru_queue.push_front(key);
    }

    /// Returns `(hits, misses, entries currently stored)`.
    #[allow(dead_code)]
    fn stats(&self) -> (usize, usize, usize) {
        let entries = self.cache.len();
        (self.hits, self.misses, entries)
    }
}
138
139// ============================================================================
140// DocumentProcessor - only available with extractous feature
141// ============================================================================
142
/// Document text extractor backed by an Extractous [`Extractor`].
#[cfg(feature = "extractous")]
#[derive(Debug)]
pub struct DocumentProcessor {
    /// Extractor instance; locked for the duration of each extraction call.
    extractor: Mutex<Extractor>,
    /// Text limit copied from [`ProcessorConfig::max_text_chars`].
    max_length: usize,
}

#[cfg(feature = "extractous")]
impl Default for DocumentProcessor {
    /// Builds a processor from [`ProcessorConfig::default`].
    fn default() -> Self {
        Self::new(ProcessorConfig::default())
    }
}
156
/// Process-wide extraction cache, created lazily on the first lookup or store.
///
/// Accessed only through `cache_lookup` and `cache_store`.
#[cfg(feature = "extractous")]
static EXTRACTION_CACHE: OnceLock<Mutex<ExtractionCache>> = OnceLock::new();
159
#[cfg(feature = "extractous")]
impl DocumentProcessor {
    /// Creates a processor whose extractor and post-processing both honour
    /// `config.max_text_chars`.
    ///
    /// The value handed to the extractor is an `i32`, so larger limits are clamped
    /// to `i32::MAX` there; the unclamped limit is still used for final truncation.
    pub fn new(config: ProcessorConfig) -> Self {
        let capped = config
            .max_text_chars
            .min(i32::MAX as usize)
            .try_into()
            .unwrap_or(i32::MAX);
        let mut extractor = Extractor::new().set_extract_string_max_length(capped);
        extractor = extractor.set_xml_output(false);
        Self {
            extractor: Mutex::new(extractor),
            max_length: config.max_text_chars,
        }
    }

    /// Extracts text and metadata from the file at `path`.
    ///
    /// When the primary extractor yields blank output or what looks like a raw PDF
    /// structure dump, the file is re-read and the lopdf fallback is tried; its text
    /// replaces the primary output only if the fallback succeeds. When the primary
    /// extractor fails outright, the lopdf fallback is the last resort. Results of
    /// this method are not cached.
    ///
    /// # Errors
    ///
    /// Returns [`MemvidError::ExtractionFailed`] when `path` is not valid UTF-8, the
    /// extractor mutex is poisoned, or the primary extractor fails and the fallback
    /// cannot produce text.
    pub fn extract_from_path(&self, path: &Path) -> Result<ExtractedDocument> {
        let path_str = path.to_str().ok_or_else(|| MemvidError::ExtractionFailed {
            reason: "input path contains invalid UTF-8".into(),
        })?;

        // Scope the lock and the log guard so both are released before post-processing.
        let extraction = {
            let extractor = self.locked()?;
            let _log_guard = ScopedLogLevel::lowered(LevelFilter::Off);
            extractor.extract_file_to_string(path_str)
        };

        match extraction {
            Ok((mut content, metadata)) => {
                if needs_pdf_fallback(&content) {
                    if let Ok(bytes) = fs::read(path) {
                        if let Ok(Some(fallback_text)) = pdf_text_fallback(&bytes) {
                            content = fallback_text;
                        }
                    }
                }
                Ok(self.into_document(content, metadata))
            }
            Err(err) => {
                let primary_reason = err.to_string();
                if let Ok(bytes) = fs::read(path) {
                    match pdf_text_fallback(&bytes) {
                        Ok(Some(fallback_text)) => {
                            return Ok(self.into_document(fallback_text, pdf_fallback_metadata()));
                        }
                        Ok(None) => {}
                        Err(fallback_err) => {
                            let reason = format!(
                                "primary extractor error: {}; PDF fallback error: {}",
                                primary_reason, fallback_err
                            );
                            return Err(MemvidError::ExtractionFailed {
                                reason: reason.into(),
                            });
                        }
                    }
                }
                Err(MemvidError::ExtractionFailed {
                    reason: primary_reason.into(),
                })
            }
        }
    }

    /// Extracts text and metadata from an in-memory document.
    ///
    /// Successful results are stored in the process-wide extraction cache. The cache
    /// key covers both the input bytes and this processor's text limit, so processors
    /// configured with different limits never see each other's truncated output.
    ///
    /// When the primary extractor's output needs the PDF fallback and the fallback
    /// yields nothing or fails, the text is left empty rather than returning raw PDF
    /// bytes.
    ///
    /// # Errors
    ///
    /// Returns [`MemvidError::ExtractionFailed`] when the extractor mutex is poisoned,
    /// or when the primary extractor fails and the lopdf fallback cannot produce text.
    pub fn extract_from_bytes(&self, bytes: &[u8]) -> Result<ExtractedDocument> {
        let hash = self.cache_key(bytes);
        if let Some(cached) = cache_lookup(&hash) {
            tracing::debug!(target: "memvid::extract", reader = "cache", "cache hit");
            return Ok(cached);
        }

        // Scope the lock and the log guard so both are released before post-processing.
        let extraction = {
            let extractor = self.locked()?;
            let _log_guard = ScopedLogLevel::lowered(LevelFilter::Off);
            extractor.extract_bytes_to_string(bytes)
        };

        let document = match extraction {
            Ok((mut content, metadata)) => {
                let pdf_needed = needs_pdf_fallback(&content);
                tracing::debug!(
                    target: "memvid::extract",
                    content_len = content.len(),
                    pdf_fallback_needed = pdf_needed,
                    "extractous returned content"
                );
                if pdf_needed {
                    match pdf_text_fallback(bytes) {
                        Ok(Some(fallback_text)) => {
                            tracing::debug!(
                                target: "memvid::extract",
                                fallback_len = fallback_text.len(),
                                "lopdf fallback succeeded"
                            );
                            content = fallback_text;
                        }
                        Ok(None) => {
                            tracing::debug!(
                                target: "memvid::extract",
                                "lopdf fallback returned None"
                            );
                            // PDF detected but lopdf couldn't extract any text
                            // Return empty rather than raw PDF bytes
                            content = String::new();
                        }
                        Err(e) => {
                            tracing::debug!(
                                target: "memvid::extract",
                                error = %e,
                                "lopdf fallback failed"
                            );
                            // lopdf extraction failed - return empty rather than raw PDF bytes
                            content = String::new();
                        }
                    }
                }
                self.into_document(content, metadata)
            }
            Err(err) => {
                let primary_reason = err.to_string();
                match pdf_text_fallback(bytes) {
                    Ok(Some(fallback_text)) => {
                        self.into_document(fallback_text, pdf_fallback_metadata())
                    }
                    Ok(None) => {
                        return Err(MemvidError::ExtractionFailed {
                            reason: primary_reason.into(),
                        });
                    }
                    Err(fallback_err) => {
                        let reason = format!(
                            "primary extractor error: {}; PDF fallback error: {}",
                            primary_reason, fallback_err
                        );
                        return Err(MemvidError::ExtractionFailed {
                            reason: reason.into(),
                        });
                    }
                }
            }
        };

        cache_store(hash, &document);
        Ok(document)
    }

    /// Derives the extraction-cache key for `bytes` under this processor's settings.
    ///
    /// The cached document depends on `max_length` as well as on the input, so the
    /// limit is hashed ahead of the content. It is a fixed-width prefix, which keeps
    /// distinct (limit, content) pairs from producing the same hasher input.
    fn cache_key(&self, bytes: &[u8]) -> blake3::Hash {
        let mut hasher = blake3::Hasher::new();
        hasher.update(&self.max_length.to_le_bytes());
        hasher.update(bytes);
        hasher.finalize()
    }

    /// Locks the extractor, mapping a poisoned mutex to `ExtractionFailed`.
    fn locked(&self) -> Result<std::sync::MutexGuard<'_, Extractor>> {
        self.extractor
            .lock()
            .map_err(|_| MemvidError::ExtractionFailed {
                reason: "extractor mutex poisoned".into(),
            })
    }

    /// Builds an [`ExtractedDocument`] from raw extractor output.
    ///
    /// `metadata` is serialised to JSON (`Value::Null` if serialisation fails) and the
    /// MIME type is read from its `"Content-Type"` entry. Content that is blank after
    /// trimming yields `text: None`; content whose byte length exceeds `max_length` is
    /// cut at the index returned by `truncate_at_grapheme_boundary`.
    fn into_document<M>(&self, content: String, metadata: M) -> ExtractedDocument
    where
        M: serde::Serialize,
    {
        let metadata_value = serde_json::to_value(metadata).unwrap_or(Value::Null);
        let mime_type = metadata_value.get("Content-Type").and_then(value_to_mime);

        let text = if content.trim().is_empty() {
            tracing::debug!(
                target: "memvid::extract",
                "into_document: content is empty, returning text=None"
            );
            None
        } else {
            let final_text = if content.len() > self.max_length {
                let end = truncate_at_grapheme_boundary(&content, self.max_length);
                content[..end].to_string()
            } else {
                content
            };
            tracing::debug!(
                target: "memvid::extract",
                text_len = final_text.len(),
                starts_with_pdf = final_text.starts_with("%PDF"),
                "into_document: returning text"
            );
            Some(final_text)
        };

        ExtractedDocument {
            text,
            metadata: metadata_value,
            mime_type,
        }
    }
}
350
351// ============================================================================
352// Stub DocumentProcessor when extractous is disabled - returns clear error
353// ============================================================================
354
355#[cfg(not(feature = "extractous"))]
356#[derive(Debug)]
357pub struct DocumentProcessor {
358    max_length: usize,
359}
360
361#[cfg(not(feature = "extractous"))]
362impl Default for DocumentProcessor {
363    fn default() -> Self {
364        Self::new(Default::default())
365    }
366}
367
368#[cfg(not(feature = "extractous"))]
369impl DocumentProcessor {
370    #[must_use]
371    pub fn new(config: ProcessorConfig) -> Self {
372        Self {
373            max_length: config.max_text_chars,
374        }
375    }
376
377    pub fn extract_from_path(&self, path: &Path) -> Result<ExtractedDocument> {
378        // Without extractous, we can still handle plain text files
379        let bytes = fs::read(path).map_err(|e| MemvidError::ExtractionFailed {
380            reason: format!("failed to read file: {e}").into(),
381        })?;
382        self.extract_from_bytes(&bytes)
383    }
384
385    pub fn extract_from_bytes(&self, bytes: &[u8]) -> Result<ExtractedDocument> {
386        // Check if this is a PDF - extract text using pdf_extract (if available) or lopdf
387        if is_probably_pdf_simple(bytes) {
388            match pdf_text_extract_best(bytes) {
389                Ok(Some((text, extractor))) => {
390                    let truncate_len = truncate_at_grapheme_boundary(&text, self.max_length);
391                    let truncated = &text[..truncate_len];
392                    return Ok(ExtractedDocument {
393                        text: Some(truncated.to_string()),
394                        metadata: json!({
395                            "Content-Type": "application/pdf",
396                            "extraction": extractor,
397                        }),
398                        mime_type: Some("application/pdf".to_string()),
399                    });
400                }
401                Ok(None) => {
402                    // PDF detected but no text could be extracted (image-only PDF)
403                    return Ok(ExtractedDocument {
404                        text: None,
405                        metadata: json!({
406                            "Content-Type": "application/pdf",
407                            "extraction": "no_text",
408                        }),
409                        mime_type: Some("application/pdf".to_string()),
410                    });
411                }
412                Err(e) => {
413                    tracing::warn!(target: "memvid::extract", error = %e, "PDF extraction failed");
414                    // Fall through to binary handling
415                }
416            }
417        }
418
419        // Without extractous, we can still handle plain text files and common text-based formats
420        // Try to interpret as UTF-8 text first
421        if let Ok(text) = std::str::from_utf8(bytes) {
422            // Check if it's likely text (no null bytes in first 8KB)
423            let sample = &bytes[..bytes.len().min(8192)];
424            if !sample.contains(&0) {
425                let truncate_len = truncate_at_grapheme_boundary(text, self.max_length);
426                let truncated = &text[..truncate_len];
427                return Ok(ExtractedDocument {
428                    text: Some(truncated.to_string()),
429                    metadata: json!({}),
430                    mime_type: Some("text/plain".to_string()),
431                });
432            }
433        }
434
435        // For binary content (video, audio, images, etc.), return success with no text.
436        // This allows binary blobs to be stored without requiring the extractous feature.
437        // The caller can still store the blob; there just won't be extracted text for search.
438        Ok(ExtractedDocument {
439            text: None,
440            metadata: json!({}),
441            mime_type: Some("application/octet-stream".to_string()),
442        })
443    }
444}
445
/// Returns `true` when `content` is blank after trimming or looks like a raw PDF
/// structure dump, i.e. when the lopdf fallback should be attempted.
#[cfg(feature = "extractous")]
fn needs_pdf_fallback(content: &str) -> bool {
    content.trim().is_empty() || looks_like_pdf_structure_dump(content)
}
453
/// Metadata attached to documents whose text came solely from the lopdf fallback,
/// i.e. when the primary extractor failed and supplied no metadata of its own.
#[cfg(feature = "extractous")]
fn pdf_fallback_metadata() -> Value {
    json!({
        "Content-Type": "application/pdf",
        "extraction": "lopdf_fallback",
    })
}
461
/// Largest input, in bytes, that the lopdf fallback will attempt to parse (64 MiB hard cap).
#[cfg(feature = "extractous")]
const PDF_FALLBACK_MAX_BYTES: usize = 64 * 1024 * 1024;
/// Largest page count the lopdf fallback will attempt to extract.
#[cfg(feature = "extractous")]
const PDF_FALLBACK_MAX_PAGES: usize = 4_096;

/// Extracts text from `bytes` with lopdf when they look like a PDF.
///
/// Returns `Ok(None)` when the input is not a PDF, has no pages, or yields only
/// blank text. Extracted text is trimmed and passed through `fix_pdf_spacing`.
///
/// # Errors
///
/// Returns [`MemvidError::ExtractionFailed`] when the input exceeds the byte or page
/// limits, cannot be loaded, is encrypted with a non-empty password, or text
/// extraction fails.
#[cfg(feature = "extractous")]
fn pdf_text_fallback(bytes: &[u8]) -> Result<Option<String>> {
    if !is_probably_pdf(bytes) {
        return Ok(None);
    }

    let byte_len = bytes.len();
    if byte_len > PDF_FALLBACK_MAX_BYTES {
        let reason = format!(
            "pdf fallback aborted: {} bytes exceeds limit of {} bytes",
            byte_len, PDF_FALLBACK_MAX_BYTES
        );
        return Err(MemvidError::ExtractionFailed {
            reason: reason.into(),
        });
    }

    // Held until return so lopdf's logging stays silenced for the whole parse.
    let _log_guard = ScopedLogLevel::lowered(LevelFilter::Off);
    let mut document = match LopdfDocument::load_mem(bytes) {
        Ok(loaded) => loaded,
        Err(err) => {
            return Err(MemvidError::ExtractionFailed {
                reason: format!("pdf fallback failed to load document: {err}").into(),
            });
        }
    };

    // Encrypted files are only readable here if the empty password unlocks them.
    if document.is_encrypted() && document.decrypt("").is_err() {
        return Err(MemvidError::ExtractionFailed {
            reason: "pdf fallback cannot decrypt password-protected file".into(),
        });
    }

    let _ = document.decompress();

    let mut page_numbers: Vec<u32> = document.get_pages().keys().copied().collect();
    if page_numbers.is_empty() {
        return Ok(None);
    }
    page_numbers.sort_unstable();

    let page_count = page_numbers.len();
    if page_count > PDF_FALLBACK_MAX_PAGES {
        let reason = format!(
            "pdf fallback aborted: page count {} exceeds limit of {}",
            page_count, PDF_FALLBACK_MAX_PAGES
        );
        return Err(MemvidError::ExtractionFailed {
            reason: reason.into(),
        });
    }

    let text = document
        .extract_text(&page_numbers)
        .map_err(|err| MemvidError::ExtractionFailed {
            reason: format!("pdf fallback failed to extract text: {err}").into(),
        })?;

    let trimmed = text.trim();
    // Apply fix_pdf_spacing to repair character-level spacing artifacts
    Ok((!trimmed.is_empty()).then(|| fix_pdf_spacing(trimmed)))
}
532
/// Guard that lowers the `log` crate's max level and restores the previous level on drop.
///
/// NOTE(review): the level is read and written through `log::max_level` /
/// `log::set_max_level`, so overlapping guards on different threads may restore
/// levels out of order; confirm whether callers rely on this.
#[cfg(feature = "extractous")]
struct ScopedLogLevel {
    /// Level that was active when the guard was created.
    previous: LevelFilter,
    /// Whether this guard actually changed the level and must restore it.
    changed: bool,
}

#[cfg(feature = "extractous")]
impl ScopedLogLevel {
    /// Lowers the max level to `level` if that is stricter than the current one.
    ///
    /// A guard created when the current level is already at or below `level` is inert.
    fn lowered(level: LevelFilter) -> Self {
        let previous = log::max_level();
        let changed = level < previous;
        if changed {
            log::set_max_level(level);
        }
        Self { previous, changed }
    }
}

#[cfg(feature = "extractous")]
impl Drop for ScopedLogLevel {
    /// Restores the recorded level, but only if this guard lowered it.
    fn drop(&mut self) {
        if !self.changed {
            return;
        }
        log::set_max_level(self.previous);
    }
}
566
/// Returns `true` when `bytes` begin with the `%PDF` magic, ignoring an optional
/// UTF-8 BOM followed by any run of NUL or ASCII-whitespace bytes.
///
/// Performs the same check as `is_probably_pdf_simple`.
#[cfg(feature = "extractous")]
fn is_probably_pdf(bytes: &[u8]) -> bool {
    // Drop a leading UTF-8 byte-order mark, if any.
    let body = bytes.strip_prefix(&[0xEF, 0xBB, 0xBF]).unwrap_or(bytes);
    // Index of the first byte that is neither NUL nor ASCII whitespace.
    let start = body
        .iter()
        .position(|byte| !(*byte == 0 || byte.is_ascii_whitespace()))
        .unwrap_or(body.len());
    body[start..].starts_with(b"%PDF")
}
585
/// Heuristically detects extractor output that is raw PDF object syntax rather than text.
///
/// Only the first 8,192 bytes (cut back by `truncate_at_grapheme_boundary`) are
/// inspected. The sample must contain at least two `endobj` markers plus one of: an
/// object header (`0 obj` preceded by a space, `\n` or `\r`), `endstream`, or
/// `/Type /Page`. Content shorter than 256 bytes is never treated as a dump.
#[cfg(feature = "extractous")]
fn looks_like_pdf_structure_dump(content: &str) -> bool {
    const MIN_CONTENT_BYTES: usize = 256;
    const SAMPLE_BYTES: usize = 8_192;

    if content.len() < MIN_CONTENT_BYTES {
        return false;
    }

    // Slice at the index the helper returns rather than at the raw byte count.
    let cut = truncate_at_grapheme_boundary(content, content.len().min(SAMPLE_BYTES));
    let sample = &content[..cut];

    // A second `endobj` must exist; there is no need to count any further.
    if sample.matches("endobj").nth(1).is_none() {
        return false;
    }

    [" 0 obj", "\n0 obj", "\r0 obj", "endstream", "/Type /Page"]
        .iter()
        .any(|marker| sample.contains(*marker))
}
605
/// Reads a MIME type out of a metadata value.
///
/// A JSON string is returned as-is; for a JSON array the first string element is
/// returned; any other shape yields `None`.
#[cfg(feature = "extractous")]
fn value_to_mime(value: &Value) -> Option<String> {
    let mime = match value {
        Value::String(mime) => Some(mime.as_str()),
        Value::Array(entries) => entries.iter().find_map(Value::as_str),
        _ => None,
    };
    mime.map(str::to_owned)
}
620
/// Looks up `hash` in the process-wide extraction cache, initialising it on first use.
///
/// A poisoned cache mutex is treated as a miss.
#[cfg(feature = "extractous")]
fn cache_lookup(hash: &blake3::Hash) -> Option<ExtractedDocument> {
    let shared = EXTRACTION_CACHE
        .get_or_init(|| Mutex::new(ExtractionCache::new(DEFAULT_EXTRACTION_CACHE_CAPACITY)));
    match shared.lock() {
        Ok(mut guard) => guard.get(hash),
        Err(_) => None,
    }
}
627
/// Stores a clone of `document` under `hash` in the process-wide extraction cache,
/// initialising the cache on first use.
///
/// If the cache mutex is poisoned, nothing is stored.
#[cfg(feature = "extractous")]
fn cache_store(hash: blake3::Hash, document: &ExtractedDocument) {
    let shared = EXTRACTION_CACHE
        .get_or_init(|| Mutex::new(ExtractionCache::new(DEFAULT_EXTRACTION_CACHE_CAPACITY)));
    let Ok(mut guard) = shared.lock() else {
        return;
    };
    guard.insert(hash, document.clone());
}
636
637// ============================================================================
638// PDF extraction helpers (available without extractous feature)
639// Priority: pdf_oxide (best accuracy) > pdf_extract > lopdf
640// ============================================================================
641
/// Largest PDF, in bytes, that `pdf_text_extract_lopdf` will attempt to parse.
#[allow(dead_code)]
const PDF_LOPDF_MAX_BYTES: usize = 64 * 1024 * 1024; // 64 MiB hard cap
/// Largest page count that `pdf_text_extract_lopdf` will attempt to extract.
#[allow(dead_code)]
const PDF_LOPDF_MAX_PAGES: usize = 4_096;
646
647/// Try multiple PDF extractors and return the best result
648/// Returns (text, `extractor_name`) or None if no text found
649/// Priority: `pdf_oxide` (2025, best accuracy) > `pdf_extract` > lopdf
650#[allow(dead_code)]
651fn pdf_text_extract_best(bytes: &[u8]) -> Result<Option<(String, &'static str)>> {
652    let mut best_text: Option<String> = None;
653    let mut best_source: &'static str = "";
654
655    // Calculate minimum "good" threshold based on file size
656    #[cfg(any(feature = "pdf_oxide", feature = "pdf_extract"))]
657    let min_good_chars = (bytes.len() / 100).clamp(500, 5000);
658
659    // Try pdf_oxide first (if feature enabled) - highest accuracy, perfect word spacing
660    // Wrap in catch_unwind because cff-parser may panic on certain fonts (ligatures)
661    #[cfg(feature = "pdf_oxide")]
662    {
663        let bytes_clone = bytes.to_vec();
664        let oxide_result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
665            pdf_text_extract_oxide(&bytes_clone)
666        }));
667
668        match oxide_result {
669            Ok(Ok(Some(text))) => {
670                let trimmed = text.trim();
671                if !trimmed.is_empty() {
672                    if trimmed.len() >= min_good_chars {
673                        tracing::debug!(
674                            target: "memvid::extract",
675                            len = trimmed.len(),
676                            "pdf_oxide succeeded with good result"
677                        );
678                        return Ok(Some((trimmed.to_string(), "pdf_oxide")));
679                    }
680                    tracing::debug!(
681                        target: "memvid::extract",
682                        len = trimmed.len(),
683                        min_good = min_good_chars,
684                        "pdf_oxide returned partial result, trying fallbacks"
685                    );
686                    best_text = Some(trimmed.to_string());
687                    best_source = "pdf_oxide";
688                }
689            }
690            Ok(Ok(None)) => {
691                tracing::debug!(target: "memvid::extract", "pdf_oxide returned no text");
692            }
693            Ok(Err(e)) => {
694                tracing::debug!(target: "memvid::extract", error = %e, "pdf_oxide failed");
695            }
696            Err(_) => {
697                tracing::warn!(target: "memvid::extract", "pdf_oxide panicked (likely font parsing issue), falling back to other extractors");
698            }
699        }
700    }
701
702    // Try pdf_extract next (if feature enabled)
703    #[cfg(feature = "pdf_extract")]
704    {
705        match pdf_extract::extract_text_from_mem(bytes) {
706            Ok(text) => {
707                let trimmed = text.trim();
708                if !trimmed.is_empty() {
709                    if best_text.is_none() && trimmed.len() >= min_good_chars {
710                        tracing::debug!(
711                            target: "memvid::extract",
712                            len = trimmed.len(),
713                            "pdf_extract succeeded with good result"
714                        );
715                        return Ok(Some((trimmed.to_string(), "pdf_extract")));
716                    }
717                    // Use if better than current best
718                    if best_text
719                        .as_ref()
720                        .is_none_or(|prev| trimmed.len() > prev.len())
721                    {
722                        best_text = Some(trimmed.to_string());
723                        best_source = "pdf_extract";
724                    }
725                }
726            }
727            Err(e) => {
728                tracing::debug!(target: "memvid::extract", error = %e, "pdf_extract failed");
729            }
730        }
731    }
732
733    // Try lopdf (pure Rust, always available)
734    match pdf_text_extract_lopdf(bytes) {
735        Ok(Some(text)) => {
736            let trimmed = text.trim();
737            if !trimmed.is_empty() {
738                // Use lopdf result if better than previous
739                if best_text
740                    .as_ref()
741                    .is_none_or(|prev| trimmed.len() > prev.len())
742                {
743                    tracing::debug!(
744                        target: "memvid::extract",
745                        len = trimmed.len(),
746                        "lopdf extracted more text"
747                    );
748                    best_text = Some(trimmed.to_string());
749                    best_source = "lopdf";
750                }
751            }
752        }
753        Ok(None) => {
754            tracing::debug!(target: "memvid::extract", "lopdf returned no text");
755        }
756        Err(e) => {
757            tracing::debug!(target: "memvid::extract", error = %e, "lopdf failed");
758        }
759    }
760
761    // Apply fix_pdf_spacing to repair character-level spacing artifacts from PDF encoding
762    Ok(best_text.map(|t| (fix_pdf_spacing(&t), best_source)))
763}
764
/// Extracts text from a PDF with `pdf_oxide`.
///
/// The document is opened from a file path, so `bytes` are first written to a named
/// temporary file. Pages whose extraction fails are logged and skipped; non-empty
/// page texts are joined with `\n`. Returns `Ok(None)` when the document has no
/// pages or the joined text is blank.
///
/// # Errors
///
/// Returns [`MemvidError::ExtractionFailed`] when the temporary file cannot be
/// created, written or flushed, or when the PDF cannot be opened or its page count
/// cannot be read.
#[cfg(feature = "pdf_oxide")]
#[allow(dead_code)]
fn pdf_text_extract_oxide(bytes: &[u8]) -> Result<Option<String>> {
    use pdf_oxide::PdfDocument;
    use std::io::Write;
    use tempfile::NamedTempFile;

    // Stage the bytes on disk; `staged` must stay alive while the document is open.
    let mut staged = NamedTempFile::new().map_err(|err| MemvidError::ExtractionFailed {
        reason: format!("pdf_oxide failed to create temp file: {err}").into(),
    })?;
    staged
        .write_all(bytes)
        .map_err(|err| MemvidError::ExtractionFailed {
            reason: format!("pdf_oxide failed to write temp file: {err}").into(),
        })?;
    staged
        .flush()
        .map_err(|err| MemvidError::ExtractionFailed {
            reason: format!("pdf_oxide failed to flush temp file: {err}").into(),
        })?;

    let mut doc =
        PdfDocument::open(staged.path()).map_err(|err| MemvidError::ExtractionFailed {
            reason: format!("pdf_oxide failed to load PDF: {err}").into(),
        })?;

    let page_count = doc
        .page_count()
        .map_err(|err| MemvidError::ExtractionFailed {
            reason: format!("pdf_oxide failed to get page count: {err}").into(),
        })?;
    if page_count == 0 {
        return Ok(None);
    }

    let mut joined = String::new();
    for page_idx in 0..page_count {
        let page_text = match doc.extract_text(page_idx) {
            Ok(page_text) => page_text,
            Err(e) => {
                // Best effort: one unreadable page does not fail the whole document.
                tracing::debug!(
                    target: "memvid::extract",
                    page = page_idx,
                    error = %e,
                    "pdf_oxide failed to extract page"
                );
                continue;
            }
        };
        if page_text.is_empty() {
            continue;
        }
        if !joined.is_empty() {
            joined.push('\n');
        }
        joined.push_str(&page_text);
    }

    let trimmed = joined.trim();
    Ok((!trimmed.is_empty()).then(|| trimmed.to_string()))
}
834
/// Check if bytes look like a PDF file (magic bytes check).
///
/// An optional UTF-8 BOM and any following run of NUL or ASCII-whitespace bytes are
/// skipped before looking for the `%PDF` marker. Empty input is never a PDF.
#[allow(dead_code)]
fn is_probably_pdf_simple(bytes: &[u8]) -> bool {
    const UTF8_BOM: &[u8] = &[0xEF, 0xBB, 0xBF];

    // Skip BOM if present
    let after_bom = bytes.strip_prefix(UTF8_BOM).unwrap_or(bytes);

    // Skip leading whitespace/nulls, then compare the next four bytes to the magic.
    after_bom
        .iter()
        .skip_while(|byte| **byte == 0 || byte.is_ascii_whitespace())
        .take(4)
        .eq(b"%PDF")
}
856
857/// Extract text from a PDF using lopdf (pure Rust, no external dependencies)
858#[allow(dead_code)]
859fn pdf_text_extract_lopdf(bytes: &[u8]) -> Result<Option<String>> {
860    if bytes.len() > PDF_LOPDF_MAX_BYTES {
861        return Err(MemvidError::ExtractionFailed {
862            reason: format!(
863                "PDF too large: {} bytes exceeds limit of {} bytes",
864                bytes.len(),
865                PDF_LOPDF_MAX_BYTES
866            )
867            .into(),
868        });
869    }
870
871    let mut document =
872        LopdfDocument::load_mem(bytes).map_err(|err| MemvidError::ExtractionFailed {
873            reason: format!("failed to load PDF: {err}").into(),
874        })?;
875
876    // Try to decrypt if encrypted (empty password for unprotected PDFs)
877    if document.is_encrypted() && document.decrypt("").is_err() {
878        return Err(MemvidError::ExtractionFailed {
879            reason: "cannot decrypt password-protected PDF".into(),
880        });
881    }
882
883    // Decompress streams for better text extraction
884    let () = document.decompress();
885
886    let mut page_numbers: Vec<u32> = document.get_pages().keys().copied().collect();
887    if page_numbers.is_empty() {
888        return Ok(None);
889    }
890    page_numbers.sort_unstable();
891
892    if page_numbers.len() > PDF_LOPDF_MAX_PAGES {
893        return Err(MemvidError::ExtractionFailed {
894            reason: format!(
895                "PDF has too many pages: {} exceeds limit of {}",
896                page_numbers.len(),
897                PDF_LOPDF_MAX_PAGES
898            )
899            .into(),
900        });
901    }
902
903    match document.extract_text(&page_numbers) {
904        Ok(text) => {
905            let trimmed = text.trim();
906            if trimmed.is_empty() {
907                Ok(None)
908            } else {
909                Ok(Some(trimmed.to_string()))
910            }
911        }
912        Err(err) => Err(MemvidError::ExtractionFailed {
913            reason: format!("failed to extract text from PDF: {err}").into(),
914        }),
915    }
916}
917
#[cfg(all(test, feature = "extractous"))]
mod tests {
    use super::*;

    /// Text saturated with PDF object markers must be flagged as a structure dump.
    #[test]
    fn detects_pdf_like_dump() {
        let dump =
            "binary %PDF snippet endobj endobj endstream 0 obj /Type /Page endobj ".repeat(12);
        assert!(
            looks_like_pdf_structure_dump(&dump),
            "repeated PDF object markers should be flagged"
        );
    }

    /// Ordinary prose must not be mistaken for a structure dump.
    #[test]
    fn skips_normal_text() {
        assert!(
            !looks_like_pdf_structure_dump(
                "This is a perfectly normal paragraph that should not trigger the PDF fallback."
            ),
            "plain prose should not be flagged"
        );
    }

    /// The magic-byte check accepts a PDF header, with or without leading
    /// newlines, and rejects an HTML doctype.
    #[test]
    fn identifies_pdf_magic() {
        assert!(is_probably_pdf(b"%PDF-1.7 some data"));
        assert!(is_probably_pdf(b"\n\n%PDF-1.5"));
        assert!(!is_probably_pdf(b"<!doctype html>"));
    }
}
945
#[cfg(all(test, feature = "extractous"))]
mod pdf_fix_tests {
    use super::*;

    /// Regression guard for the bug where raw PDF bytes were indexed instead
    /// of extracted text.
    ///
    /// This test exercises only the detection heuristics: the magic-byte check
    /// on a synthetic PDF header and the structure-dump check on marker-heavy
    /// versus ordinary text. It does not run any extraction backend.
    #[test]
    fn test_pdf_structure_dump_detection_prevents_raw_indexing() {
        // A minimal synthetic PDF header followed by a catalog object.
        let synthetic_pdf = b"%PDF-1.4\n%\xff\xff\xff\xff\n1 0 obj\n<</Type/Catalog>>\nendobj\n";
        assert!(
            is_probably_pdf(synthetic_pdf),
            "synthetic PDF bytes should pass the magic check"
        );

        // Text dominated by PDF object markers counts as a structure dump.
        let marker_heavy =
            "binary %PDF snippet endobj endobj endstream 0 obj /Type /Page endobj ".repeat(12);
        assert!(
            looks_like_pdf_structure_dump(&marker_heavy),
            "marker-heavy text should be flagged as a structure dump"
        );

        // Ordinary extracted prose must pass through untouched.
        assert!(
            !looks_like_pdf_structure_dump(
                "This is perfectly normal extracted text from a document."
            ),
            "normal text should not be flagged as a structure dump"
        );
    }
}
972
973// ============================================================================
974// Tests for ExtractionCache LRU eviction
975// ============================================================================
976
#[cfg(all(test, feature = "extractous"))]
mod extraction_cache_tests {
    use super::*;

    /// Build a plain-text document fixture whose text is `content`.
    fn make_doc(content: &str) -> ExtractedDocument {
        let text = Some(content.to_owned());
        let mime_type = Some(String::from("text/plain"));
        ExtractedDocument {
            text,
            metadata: serde_json::json!({}),
            mime_type,
        }
    }

    /// An inserted document can be read back under the same hash.
    #[test]
    fn test_extraction_cache_basic() {
        let mut cache = ExtractionCache::new(10);
        let key = blake3::hash(b"test document");

        cache.insert(key, make_doc("test content"));

        let found = cache
            .get(&key)
            .expect("freshly inserted document should be cached");
        assert_eq!(found.text, Some("test content".to_string()));
    }

    /// One hit and one miss are counted, and the size reflects one entry.
    #[test]
    fn test_extraction_cache_stats() {
        let mut cache = ExtractionCache::new(10);
        let present = blake3::hash(b"test");
        let absent = blake3::hash(b"missing");
        cache.insert(present, make_doc("test"));

        // One lookup that hits, then one that misses.
        let _ = cache.get(&present);
        let _ = cache.get(&absent);

        // Reported as (hits, misses, size).
        assert_eq!(cache.stats(), (1, 1, 1));
    }

    /// Inserting beyond capacity drops the entry that was inserted first.
    #[test]
    fn test_extraction_cache_eviction() {
        let key_for = |i: u8| blake3::hash(&[i]);
        let mut cache = ExtractionCache::new(3);

        // Four inserts into a cache of three: entry 0 has to go.
        for i in 0..4u8 {
            cache.insert(key_for(i), make_doc(&format!("doc{}", i)));
        }

        assert!(
            cache.get(&key_for(0)).is_none(),
            "oldest entry should have been evicted"
        );

        // Entries 1, 2 and 3 survive.
        for i in 1..4u8 {
            assert!(
                cache.get(&key_for(i)).is_some(),
                "entry {i} should still be cached"
            );
        }
    }

    /// Reading an entry protects it from the next eviction.
    #[test]
    fn test_extraction_cache_lru_promotion() {
        let key_for = |i: u8| blake3::hash(&[i]);
        let mut cache = ExtractionCache::new(3);

        // Fill the cache with entries 0, 1 and 2.
        for i in 0..3u8 {
            cache.insert(key_for(i), make_doc(&format!("doc{}", i)));
        }

        // Touch entry 0 so that entry 1 becomes the least recently used.
        let touched = key_for(0);
        let _ = cache.get(&touched);

        // A fourth insert must now push out entry 1 rather than entry 0.
        let newest = key_for(3);
        cache.insert(newest, make_doc("doc3"));

        assert!(
            cache.get(&touched).is_some(),
            "recently read entry should survive"
        );
        assert!(
            cache.get(&key_for(1)).is_none(),
            "least recently used entry should be evicted"
        );
        assert!(
            cache.get(&key_for(2)).is_some(),
            "third entry should survive"
        );
        assert!(
            cache.get(&newest).is_some(),
            "newly inserted entry should be cached"
        );
    }

    /// Re-inserting under an existing hash replaces the value without growing the cache.
    #[test]
    fn test_extraction_cache_update_existing() {
        let mut cache = ExtractionCache::new(3);
        let key = blake3::hash(b"test");

        for content in ["original", "updated"] {
            cache.insert(key, make_doc(content));
        }

        let current = cache.get(&key);
        assert_eq!(current.unwrap().text, Some("updated".to_string()));

        // Still a single entry after the overwrite.
        let size = cache.stats().2;
        assert_eq!(size, 1);
    }
}
memvid_core/extract.rs

memvid_core/
extract.rs