memvid_core/
extract.rs

1use std::fs;
2use std::path::Path;
3
4use crate::{Result, error::MemvidError, text::truncate_at_grapheme_boundary};
5#[cfg(feature = "extractous")]
6use log::LevelFilter;
7#[cfg(feature = "extractous")]
8use lopdf::Document as LopdfDocument;
9use serde_json::{Value, json};
10
11#[cfg(feature = "extractous")]
12use extractous::Extractor;
13#[cfg(feature = "extractous")]
14use std::collections::HashMap;
15#[cfg(feature = "extractous")]
16use std::sync::{Mutex, OnceLock};
17
18/// Structured result produced by [`DocumentProcessor`] after running
19/// Extractous over an input document.
20#[derive(Debug, Clone)]
21pub struct ExtractedDocument {
22    pub text: Option<String>,
23    pub metadata: Value,
24    pub mime_type: Option<String>,
25}
26
27impl ExtractedDocument {
28    pub fn empty() -> Self {
29        Self {
30            text: None,
31            metadata: Value::Null,
32            mime_type: None,
33        }
34    }
35}
36
/// Value used for [`ProcessorConfig::max_text_chars`] by `Default`.
const DEFAULT_MAX_TEXT_CHARS: usize = 2_000_000;

/// Tunables accepted by `DocumentProcessor::new`.
#[derive(Debug, Clone, Copy)]
pub struct ProcessorConfig {
    /// Upper bound on the amount of extracted text kept per document.
    // NOTE(review): the processors in this file compare this limit against
    // `str::len()` (a byte count) despite the `_chars` name — confirm the
    // intended unit.
    pub max_text_chars: usize,
}

impl Default for ProcessorConfig {
    /// Uses a limit of 2,000,000.
    fn default() -> Self {
        ProcessorConfig {
            max_text_chars: DEFAULT_MAX_TEXT_CHARS,
        }
    }
}
49
50// ============================================================================
51// DocumentProcessor - only available with extractous feature
52// ============================================================================
53
/// Extractous-backed document text extractor (compiled only with the
/// `extractous` feature).
#[cfg(feature = "extractous")]
#[derive(Debug)]
pub struct DocumentProcessor {
    /// Shared Extractous handle; every extraction locks it first.
    extractor: Mutex<Extractor>,
    /// Limit copied from `ProcessorConfig::max_text_chars`.
    max_length: usize,
}
60
#[cfg(feature = "extractous")]
impl Default for DocumentProcessor {
    /// Builds a processor from `ProcessorConfig::default()`.
    fn default() -> Self {
        DocumentProcessor::new(ProcessorConfig::default())
    }
}
67
/// Map stored inside [`EXTRACTION_CACHE`]: BLAKE3 key → extracted document.
#[cfg(feature = "extractous")]
type ExtractionCacheMap = HashMap<blake3::Hash, ExtractedDocument>;

/// Process-wide memo of extraction results, created on first use.
// NOTE(review): nothing in this file ever removes entries, so the map grows
// for the lifetime of the process — confirm that is acceptable.
#[cfg(feature = "extractous")]
static EXTRACTION_CACHE: OnceLock<Mutex<ExtractionCacheMap>> = OnceLock::new();
71
#[cfg(feature = "extractous")]
impl DocumentProcessor {
    /// Creates a processor that limits extracted text to
    /// `config.max_text_chars`.
    ///
    /// Extractous takes its limit as an `i32`, so larger values are clamped
    /// to `i32::MAX` for the extractor; the unclamped value is still used
    /// when the final text is truncated.
    pub fn new(config: ProcessorConfig) -> Self {
        let capped = i32::try_from(config.max_text_chars).unwrap_or(i32::MAX);
        let extractor = Extractor::new()
            .set_extract_string_max_length(capped)
            .set_xml_output(false);
        Self {
            extractor: Mutex::new(extractor),
            max_length: config.max_text_chars,
        }
    }

    /// Extracts text and metadata from the file at `path`.
    ///
    /// Extractous runs first. If its output is blank or looks like a raw PDF
    /// object dump, or if it fails outright, the file is re-read and the
    /// lopdf-based fallback is tried.
    ///
    /// # Errors
    ///
    /// Returns [`MemvidError::ExtractionFailed`] when the path is not valid
    /// UTF-8, the extractor mutex is poisoned, or Extractous fails and the
    /// fallback cannot produce text either.
    pub fn extract_from_path(&self, path: &Path) -> Result<ExtractedDocument> {
        let path_str = path.to_str().ok_or_else(|| MemvidError::ExtractionFailed {
            reason: "input path contains invalid UTF-8".into(),
        })?;

        // Keep the lock and the silenced log level scoped to the Extractous call.
        let extraction = {
            let extractor = self.locked()?;
            let _log_guard = ScopedLogLevel::lowered(LevelFilter::Off);
            extractor.extract_file_to_string(path_str)
        };

        match extraction {
            Ok((mut content, metadata)) => {
                // NOTE(review): unlike `extract_from_bytes`, a failed fallback
                // leaves `content` untouched here, so a raw PDF structure dump
                // can still be returned as text — confirm whether the two
                // entry points should agree.
                if needs_pdf_fallback(&content) {
                    if let Ok(bytes) = fs::read(path) {
                        if let Ok(Some(fallback_text)) = pdf_text_fallback(&bytes) {
                            content = fallback_text;
                        }
                    }
                }
                Ok(self.into_document(content, metadata))
            }
            Err(err) => {
                let primary_reason = err.to_string();
                // Best effort: if the file cannot be re-read, report the
                // primary extractor error alone.
                if let Ok(bytes) = fs::read(path) {
                    match pdf_text_fallback(&bytes) {
                        Ok(Some(fallback_text)) => {
                            return Ok(self.into_document(fallback_text, pdf_fallback_metadata()));
                        }
                        Ok(None) => {}
                        Err(fallback_err) => {
                            let reason = format!(
                                "primary extractor error: {}; PDF fallback error: {}",
                                primary_reason, fallback_err
                            );
                            return Err(MemvidError::ExtractionFailed {
                                reason: reason.into(),
                            });
                        }
                    }
                }
                Err(MemvidError::ExtractionFailed {
                    reason: primary_reason.into(),
                })
            }
        }
    }

    /// Extracts text and metadata from an in-memory document.
    ///
    /// Successful results are memoised in the process-wide extraction cache.
    /// When Extractous output is blank or looks like a raw PDF object dump,
    /// the lopdf fallback replaces it; if the fallback yields nothing the
    /// text is cleared rather than returning PDF internals.
    ///
    /// # Errors
    ///
    /// Returns [`MemvidError::ExtractionFailed`] when the extractor mutex is
    /// poisoned, or when Extractous fails and the PDF fallback cannot produce
    /// text.
    pub fn extract_from_bytes(&self, bytes: &[u8]) -> Result<ExtractedDocument> {
        let hash = self.cache_key(bytes);
        if let Some(cached) = cache_lookup(&hash) {
            // `target:` (not `target =`) sets the event target, matching the
            // other log calls in this file.
            tracing::debug!(target: "memvid::extract", reader = "cache", "cache hit");
            return Ok(cached);
        }

        let extraction = {
            let extractor = self.locked()?;
            let _log_guard = ScopedLogLevel::lowered(LevelFilter::Off);
            extractor.extract_bytes_to_string(bytes)
        };

        let document = match extraction {
            Ok((mut content, metadata)) => {
                let pdf_needed = needs_pdf_fallback(&content);
                tracing::debug!(
                    target: "memvid::extract",
                    content_len = content.len(),
                    pdf_fallback_needed = pdf_needed,
                    "extractous returned content"
                );
                if pdf_needed {
                    match pdf_text_fallback(bytes) {
                        Ok(Some(fallback_text)) => {
                            tracing::debug!(
                                target: "memvid::extract",
                                fallback_len = fallback_text.len(),
                                "lopdf fallback succeeded"
                            );
                            content = fallback_text;
                        }
                        Ok(None) => {
                            tracing::debug!(
                                target: "memvid::extract",
                                "lopdf fallback returned None"
                            );
                            // The fallback produced no text: return empty
                            // rather than raw PDF bytes.
                            content = String::new();
                        }
                        Err(e) => {
                            tracing::debug!(
                                target: "memvid::extract",
                                error = %e,
                                "lopdf fallback failed"
                            );
                            // lopdf extraction failed: return empty rather
                            // than raw PDF bytes.
                            content = String::new();
                        }
                    }
                }
                self.into_document(content, metadata)
            }
            Err(err) => {
                let primary_reason = err.to_string();
                match pdf_text_fallback(bytes) {
                    Ok(Some(fallback_text)) => {
                        self.into_document(fallback_text, pdf_fallback_metadata())
                    }
                    Ok(None) => {
                        return Err(MemvidError::ExtractionFailed {
                            reason: primary_reason.into(),
                        });
                    }
                    Err(fallback_err) => {
                        let reason = format!(
                            "primary extractor error: {}; PDF fallback error: {}",
                            primary_reason, fallback_err
                        );
                        return Err(MemvidError::ExtractionFailed {
                            reason: reason.into(),
                        });
                    }
                }
            }
        };

        cache_store(hash, &document);
        Ok(document)
    }

    /// Derives the cache key for `bytes` under this processor's settings.
    ///
    /// Cached documents hold text already truncated to `max_length`, so the
    /// limit is hashed together with the content. Processors configured with
    /// different limits therefore never share an entry.
    fn cache_key(&self, bytes: &[u8]) -> blake3::Hash {
        let mut hasher = blake3::Hasher::new();
        // Fixed-width prefix keeps (limit, content) pairs unambiguous.
        hasher.update(&self.max_length.to_le_bytes());
        hasher.update(bytes);
        hasher.finalize()
    }

    /// Locks the shared extractor, mapping a poisoned mutex to
    /// [`MemvidError::ExtractionFailed`].
    fn locked(&self) -> Result<std::sync::MutexGuard<'_, Extractor>> {
        self.extractor
            .lock()
            .map_err(|_| MemvidError::ExtractionFailed {
                reason: "extractor mutex poisoned".into(),
            })
    }

    /// Packages raw extractor output as an [`ExtractedDocument`].
    ///
    /// `metadata` is serialised to JSON (falling back to `Value::Null` when
    /// serialisation fails) and the MIME type is read from its
    /// `"Content-Type"` entry. Whitespace-only content becomes `text: None`;
    /// content longer than `max_length` bytes is cut at the position returned
    /// by `truncate_at_grapheme_boundary`.
    fn into_document<M>(&self, mut content: String, metadata: M) -> ExtractedDocument
    where
        M: serde::Serialize,
    {
        let metadata_value = serde_json::to_value(metadata).unwrap_or(Value::Null);
        let mime_type = metadata_value.get("Content-Type").and_then(value_to_mime);

        let text = if content.trim().is_empty() {
            tracing::debug!(
                target: "memvid::extract",
                "into_document: content is empty, returning text=None"
            );
            None
        } else {
            if content.len() > self.max_length {
                let end = truncate_at_grapheme_boundary(&content, self.max_length);
                // Shorten in place instead of copying the retained prefix.
                content.truncate(end);
            }
            tracing::debug!(
                target: "memvid::extract",
                text_len = content.len(),
                starts_with_pdf = content.starts_with("%PDF"),
                "into_document: returning text"
            );
            Some(content)
        };

        ExtractedDocument {
            text,
            metadata: metadata_value,
            mime_type,
        }
    }
}
262
263// ============================================================================
// Stub DocumentProcessor when extractous is disabled - passes UTF-8 text through, returns no text for binary input
265// ============================================================================
266
/// Dependency-free stand-in for the Extractous-backed processor, compiled
/// when the `extractous` feature is disabled.
#[cfg(not(feature = "extractous"))]
#[derive(Debug)]
pub struct DocumentProcessor {
    /// Limit copied from `ProcessorConfig::max_text_chars`.
    max_length: usize,
}
272
273#[cfg(not(feature = "extractous"))]
274impl Default for DocumentProcessor {
275    fn default() -> Self {
276        Self::new(Default::default())
277    }
278}
279
280#[cfg(not(feature = "extractous"))]
281impl DocumentProcessor {
282    pub fn new(config: ProcessorConfig) -> Self {
283        Self {
284            max_length: config.max_text_chars,
285        }
286    }
287
288    pub fn extract_from_path(&self, path: &Path) -> Result<ExtractedDocument> {
289        // Without extractous, we can still handle plain text files
290        let bytes = fs::read(path).map_err(|e| MemvidError::ExtractionFailed {
291            reason: format!("failed to read file: {e}").into(),
292        })?;
293        self.extract_from_bytes(&bytes)
294    }
295
296    pub fn extract_from_bytes(&self, bytes: &[u8]) -> Result<ExtractedDocument> {
297        // Without extractous, we can still handle plain text files and common text-based formats
298        // Try to interpret as UTF-8 text first
299        if let Ok(text) = std::str::from_utf8(bytes) {
300            // Check if it's likely text (no null bytes in first 8KB)
301            let sample = &bytes[..bytes.len().min(8192)];
302            if !sample.contains(&0) {
303                let truncate_len = truncate_at_grapheme_boundary(text, self.max_length);
304                let truncated = &text[..truncate_len];
305                return Ok(ExtractedDocument {
306                    text: Some(truncated.to_string()),
307                    metadata: json!({}),
308                    mime_type: Some("text/plain".to_string()),
309                });
310            }
311        }
312
313        // For binary content (video, audio, images, etc.), return success with no text.
314        // This allows binary blobs to be stored without requiring the extractous feature.
315        // The caller can still store the blob; there just won't be extracted text for search.
316        Ok(ExtractedDocument {
317            text: None,
318            metadata: json!({}),
319            mime_type: Some("application/octet-stream".to_string()),
320        })
321    }
322}
323
/// Reports whether extractor output is unusable as text: either blank, or
/// resembling a dump of raw PDF objects.
#[cfg(feature = "extractous")]
fn needs_pdf_fallback(content: &str) -> bool {
    content.trim().is_empty() || looks_like_pdf_structure_dump(content)
}
331
/// Metadata attached to documents whose text came from the lopdf fallback
/// instead of the primary extractor.
#[cfg(feature = "extractous")]
fn pdf_fallback_metadata() -> Value {
    let content_type = "application/pdf";
    let extraction = "lopdf_fallback";
    json!({ "Content-Type": content_type, "extraction": extraction })
}
339
/// Largest input, in bytes, accepted by the PDF fallback (64 MiB hard cap).
#[cfg(feature = "extractous")]
const PDF_FALLBACK_MAX_BYTES: usize = 64 << 20;
/// Largest page count accepted by the PDF fallback.
#[cfg(feature = "extractous")]
const PDF_FALLBACK_MAX_PAGES: usize = 4_096;
344
/// Attempts to pull text out of `bytes` with lopdf.
///
/// Returns `Ok(None)` when the input does not start like a PDF, has no
/// pages, or yields only whitespace; `Ok(Some(text))` carries the trimmed
/// text otherwise.
///
/// # Errors
///
/// Returns [`MemvidError::ExtractionFailed`] when the input exceeds
/// `PDF_FALLBACK_MAX_BYTES` or `PDF_FALLBACK_MAX_PAGES`, cannot be loaded,
/// is encrypted and does not decrypt with an empty password, or when text
/// extraction itself fails.
#[cfg(feature = "extractous")]
fn pdf_text_fallback(bytes: &[u8]) -> Result<Option<String>> {
    // Shared constructor for the formatted failure messages below.
    let failure = |reason: String| MemvidError::ExtractionFailed {
        reason: reason.into(),
    };

    if !is_probably_pdf(bytes) {
        return Ok(None);
    }

    let size = bytes.len();
    if size > PDF_FALLBACK_MAX_BYTES {
        return Err(failure(format!(
            "pdf fallback aborted: {} bytes exceeds limit of {} bytes",
            size, PDF_FALLBACK_MAX_BYTES
        )));
    }

    // Silence `log` output for as long as lopdf is working on the document.
    let _quiet = ScopedLogLevel::lowered(LevelFilter::Off);
    let mut pdf = LopdfDocument::load_mem(bytes)
        .map_err(|err| failure(format!("pdf fallback failed to load document: {err}")))?;

    // Encrypted files are only supported when the empty password opens them.
    if pdf.is_encrypted() && pdf.decrypt("").is_err() {
        return Err(MemvidError::ExtractionFailed {
            reason: "pdf fallback cannot decrypt password-protected file".into(),
        });
    }

    let _ = pdf.decompress();

    let mut pages: Vec<u32> = pdf.get_pages().keys().copied().collect();
    if pages.is_empty() {
        return Ok(None);
    }
    pages.sort_unstable();

    let page_count = pages.len();
    if page_count > PDF_FALLBACK_MAX_PAGES {
        return Err(failure(format!(
            "pdf fallback aborted: page count {} exceeds limit of {}",
            page_count, PDF_FALLBACK_MAX_PAGES
        )));
    }

    let raw = pdf
        .extract_text(&pages)
        .map_err(|err| failure(format!("pdf fallback failed to extract text: {err}")))?;
    let text = raw.trim();
    Ok((!text.is_empty()).then(|| text.to_string()))
}
409
/// Guard that remembers the `log` max level in force when it was created so
/// the level can be put back when the guard is dropped.
#[cfg(feature = "extractous")]
struct ScopedLogLevel {
    /// Max level observed when the guard was created.
    previous: LevelFilter,
    /// Whether the guard actually changed the level (and must restore it).
    changed: bool,
}
415
#[cfg(feature = "extractous")]
impl ScopedLogLevel {
    /// Lowers the `log` max level to `level` if that is stricter than the
    /// current setting; otherwise leaves the level alone.
    ///
    /// The returned guard records whether a change was made.
    fn lowered(level: LevelFilter) -> Self {
        let previous = log::max_level();
        let changed = level < previous;
        if changed {
            log::set_max_level(level);
        }
        Self { previous, changed }
    }
}
434
#[cfg(feature = "extractous")]
impl Drop for ScopedLogLevel {
    /// Restores the recorded max level, but only if this guard changed it.
    fn drop(&mut self) {
        if !self.changed {
            return;
        }
        log::set_max_level(self.previous);
    }
}
443
/// Sniffs for the `%PDF` magic at the start of `bytes`.
///
/// An optional UTF-8 byte-order mark is skipped first, followed by any run
/// of NUL bytes and ASCII whitespace. Empty input is never a PDF.
#[cfg(feature = "extractous")]
fn is_probably_pdf(bytes: &[u8]) -> bool {
    let body = bytes.strip_prefix(b"\xEF\xBB\xBF").unwrap_or(bytes);
    let start = body
        .iter()
        .position(|byte| *byte != 0 && !byte.is_ascii_whitespace())
        .unwrap_or(body.len());
    body[start..].starts_with(b"%PDF")
}
462
/// Heuristically detects extractor output that is really a dump of PDF
/// object syntax rather than readable text.
///
/// Only the leading part of `content` is inspected (at most 8,192 bytes,
/// cut at the position returned by `truncate_at_grapheme_boundary`). The
/// sample must contain `endobj` at least twice plus one more structural
/// marker. Content shorter than 256 bytes is never flagged.
#[cfg(feature = "extractous")]
fn looks_like_pdf_structure_dump(content: &str) -> bool {
    const MIN_CONTENT_BYTES: usize = 256;
    const SAMPLE_BYTES: usize = 8_192;
    /// Any one of these, alongside two `endobj` hits, marks a dump.
    const MARKERS: [&str; 5] = [" 0 obj", "\n0 obj", "\r0 obj", "endstream", "/Type /Page"];

    if content.len() < MIN_CONTENT_BYTES {
        return false;
    }

    let cut = truncate_at_grapheme_boundary(content, content.len().min(SAMPLE_BYTES));
    let sample = &content[..cut];

    // Require a second `endobj`; a single occurrence is not enough.
    if sample.matches("endobj").nth(1).is_none() {
        return false;
    }

    MARKERS.iter().any(|marker| sample.contains(*marker))
}
482
/// Reads a MIME type out of a metadata value.
///
/// A JSON string is returned as-is; for a JSON array the first string
/// element wins. Any other shape yields `None`.
#[cfg(feature = "extractous")]
fn value_to_mime(value: &Value) -> Option<String> {
    match value {
        Value::String(mime) => Some(mime.clone()),
        Value::Array(entries) => entries
            .iter()
            .find_map(|entry| entry.as_str())
            .map(str::to_owned),
        _ => None,
    }
}
497
/// Returns a clone of the cached document stored under `hash`, if any.
///
/// A poisoned cache mutex is treated as a miss.
#[cfg(feature = "extractous")]
fn cache_lookup(hash: &blake3::Hash) -> Option<ExtractedDocument> {
    let entries = EXTRACTION_CACHE
        .get_or_init(|| Mutex::new(HashMap::new()))
        .lock()
        .ok()?;
    entries.get(hash).cloned()
}
503
/// Stores a clone of `document` in the extraction cache under `hash`,
/// replacing any previous entry.
///
/// Nothing is stored when the cache mutex is poisoned.
#[cfg(feature = "extractous")]
fn cache_store(hash: blake3::Hash, document: &ExtractedDocument) {
    let Ok(mut entries) = EXTRACTION_CACHE
        .get_or_init(|| Mutex::new(HashMap::new()))
        .lock()
    else {
        return;
    };
    entries.insert(hash, document.clone());
}
511
#[cfg(all(test, feature = "extractous"))]
mod tests {
    use super::*;

    /// Repeated PDF object markers must be classified as a structure dump.
    #[test]
    fn detects_pdf_like_dump() {
        let dump =
            "binary %PDF snippet endobj endobj endstream 0 obj /Type /Page endobj ".repeat(12);
        assert!(looks_like_pdf_structure_dump(&dump));
    }

    /// Ordinary prose must not be mistaken for a structure dump.
    #[test]
    fn skips_normal_text() {
        let prose =
            "This is a perfectly normal paragraph that should not trigger the PDF fallback.";
        assert!(!looks_like_pdf_structure_dump(prose));
    }

    /// The magic check accepts leading whitespace and rejects non-PDF input.
    #[test]
    fn identifies_pdf_magic() {
        let cases: [(&[u8], bool); 3] = [
            (b"%PDF-1.7 some data", true),
            (b"\n\n%PDF-1.5", true),
            (b"<!doctype html>", false),
        ];
        for (input, expected) in cases {
            assert_eq!(is_probably_pdf(input), expected, "input: {:?}", input);
        }
    }
}
539
#[cfg(all(test, feature = "extractous"))]
mod pdf_fix_tests {
    use super::*;

    /// Covers the two heuristics that decide when extractor output is
    /// replaced by the lopdf fallback: PDF magic detection on the input bytes
    /// and structure-dump detection on the extracted text. These guard
    /// against raw PDF internals being indexed as document text. No actual
    /// extraction is run here.
    #[test]
    fn test_pdf_structure_dump_detection_prevents_raw_indexing() {
        // Synthetic PDF header followed by a single object.
        let pdf_bytes = b"%PDF-1.4\n%\xff\xff\xff\xff\n1 0 obj\n<</Type/Catalog>>\nendobj\n";
        assert!(is_probably_pdf(pdf_bytes));

        // Text made of PDF object markers is flagged as a structure dump.
        let dump =
            "binary %PDF snippet endobj endobj endstream 0 obj /Type /Page endobj ".repeat(12);
        assert!(looks_like_pdf_structure_dump(&dump));

        // Ordinary extracted text is left alone.
        let prose = "This is perfectly normal extracted text from a document.";
        assert!(!looks_like_pdf_structure_dump(prose));
    }
}
565}