memvid_core/
extract.rs

1use std::fs;
2use std::path::Path;
3
4use crate::{Result, error::MemvidError, text::truncate_at_grapheme_boundary};
5// Use SymSpell-based cleanup when feature is enabled, otherwise fall back to heuristic
6#[cfg(feature = "symspell_cleanup")]
7use crate::symspell_cleanup::fix_pdf_text as fix_pdf_spacing;
8#[cfg(not(feature = "symspell_cleanup"))]
9use crate::text::fix_pdf_spacing;
10
11#[cfg(feature = "extractous")]
12use log::LevelFilter;
13use lopdf::Document as LopdfDocument;
14use serde_json::{Value, json};
15
16#[cfg(feature = "extractous")]
17use extractous::Extractor;
18#[cfg(feature = "extractous")]
19use std::collections::HashMap;
20#[cfg(feature = "extractous")]
21use std::sync::{Mutex, OnceLock};
22
23/// Structured result produced by [`DocumentProcessor`] after running
24/// Extractous over an input document.
25#[derive(Debug, Clone)]
26pub struct ExtractedDocument {
27    pub text: Option<String>,
28    pub metadata: Value,
29    pub mime_type: Option<String>,
30}
31
32impl ExtractedDocument {
33    pub fn empty() -> Self {
34        Self {
35            text: None,
36            metadata: Value::Null,
37            mime_type: None,
38        }
39    }
40}
41
/// Tunables for [`DocumentProcessor`].
#[derive(Debug, Clone, Copy)]
pub struct ProcessorConfig {
    /// Upper bound applied to the length of the extracted text.
    pub max_text_chars: usize,
}
46
47impl Default for ProcessorConfig {
48    fn default() -> Self {
49        Self {
50            max_text_chars: 2_000_000,
51        }
52    }
53}
54
55// ============================================================================
56// DocumentProcessor - only available with extractous feature
57// ============================================================================
58
/// Extractous-backed document processor (built only with the `extractous` feature).
#[cfg(feature = "extractous")]
#[derive(Debug)]
pub struct DocumentProcessor {
    /// The Extractous extractor, locked for the duration of each extraction call.
    extractor: Mutex<Extractor>,
    /// Copy of `ProcessorConfig::max_text_chars`, used to truncate the final text.
    max_length: usize,
}
65
#[cfg(feature = "extractous")]
impl Default for DocumentProcessor {
    /// Builds a processor from the default [`ProcessorConfig`].
    fn default() -> Self {
        Self::new(ProcessorConfig::default())
    }
}
72
/// Process-wide memo of extraction results, keyed by the BLAKE3 hash of the input bytes.
///
/// Created lazily on the first lookup or store. Nothing in this file evicts entries.
#[cfg(feature = "extractous")]
static EXTRACTION_CACHE: OnceLock<Mutex<HashMap<blake3::Hash, ExtractedDocument>>> =
    OnceLock::new();
76
#[cfg(feature = "extractous")]
impl DocumentProcessor {
    /// Creates a processor whose extracted text is capped at `config.max_text_chars`.
    pub fn new(config: ProcessorConfig) -> Self {
        // The extractor takes an `i32` limit; saturate rather than wrap for oversized configs.
        let capped = i32::try_from(config.max_text_chars).unwrap_or(i32::MAX);
        let extractor = Extractor::new()
            .set_extract_string_max_length(capped)
            .set_xml_output(false);
        Self {
            extractor: Mutex::new(extractor),
            max_length: config.max_text_chars,
        }
    }

    /// Extracts text and metadata from the file at `path`.
    ///
    /// When the primary extractor yields nothing useful (empty text or what looks like a raw
    /// PDF structure dump) the file is re-read and the lopdf fallback is tried. Results from
    /// this method are not cached.
    ///
    /// # Errors
    ///
    /// Returns [`MemvidError::ExtractionFailed`] when the path is not valid UTF-8, the
    /// extractor mutex is poisoned, or the primary extractor fails and the PDF fallback
    /// cannot recover any text.
    pub fn extract_from_path(&self, path: &Path) -> Result<ExtractedDocument> {
        let path_str = path.to_str().ok_or_else(|| MemvidError::ExtractionFailed {
            reason: "input path contains invalid UTF-8".into(),
        })?;

        let extraction = {
            let extractor = self.locked()?;
            // Silence `log` output for the duration of the extractor call.
            let _log_guard = ScopedLogLevel::lowered(LevelFilter::Off);
            extractor.extract_file_to_string(path_str)
        };

        match extraction {
            Ok((mut content, metadata)) => {
                if needs_pdf_fallback(&content) {
                    // If the file cannot be re-read we keep whatever the extractor produced.
                    if let Ok(bytes) = fs::read(path) {
                        content = Self::recover_pdf_text(content, &bytes);
                    }
                }
                Ok(self.into_document(content, metadata))
            }
            Err(err) => {
                let primary_reason = err.to_string();
                match fs::read(path) {
                    Ok(bytes) => self.fallback_after_failure(primary_reason, &bytes),
                    Err(_) => Err(MemvidError::ExtractionFailed {
                        reason: primary_reason.into(),
                    }),
                }
            }
        }
    }

    /// Extracts text and metadata from an in-memory document.
    ///
    /// Successful results are memoised in the process-wide extraction cache, keyed by the
    /// BLAKE3 hash of `bytes`.
    ///
    /// NOTE(review): the cache key does not include `max_length`, so a processor configured
    /// with a different `max_text_chars` can receive a document truncated under another
    /// processor's limit.
    ///
    /// # Errors
    ///
    /// Returns [`MemvidError::ExtractionFailed`] when the extractor mutex is poisoned, or the
    /// primary extractor fails and the PDF fallback cannot recover any text.
    pub fn extract_from_bytes(&self, bytes: &[u8]) -> Result<ExtractedDocument> {
        let hash = blake3::hash(bytes);
        if let Some(cached) = cache_lookup(&hash) {
            // `target:` sets the event target; `target = …` would only record a field.
            tracing::debug!(target: "memvid::extract", reader = "cache", "cache hit");
            return Ok(cached);
        }

        let extraction = {
            let extractor = self.locked()?;
            let _log_guard = ScopedLogLevel::lowered(LevelFilter::Off);
            extractor.extract_bytes_to_string(bytes)
        };

        let document = match extraction {
            Ok((mut content, metadata)) => {
                let pdf_needed = needs_pdf_fallback(&content);
                tracing::debug!(
                    target: "memvid::extract",
                    content_len = content.len(),
                    pdf_fallback_needed = pdf_needed,
                    "extractous returned content"
                );
                if pdf_needed {
                    content = Self::recover_pdf_text(content, bytes);
                }
                self.into_document(content, metadata)
            }
            Err(err) => self.fallback_after_failure(err.to_string(), bytes)?,
        };

        cache_store(hash, &document);
        Ok(document)
    }

    /// Chooses the text to keep after the primary extractor returned unusable content.
    ///
    /// * `bytes` is not a PDF: `content` is returned unchanged, since it only resembled PDF
    ///   syntax (or was empty).
    /// * `bytes` is a PDF and lopdf finds text: that text is returned.
    /// * `bytes` is a PDF but lopdf finds nothing or fails: an empty string is returned so raw
    ///   PDF structure is never indexed as document text.
    fn recover_pdf_text(content: String, bytes: &[u8]) -> String {
        if !is_probably_pdf(bytes) {
            tracing::debug!(
                target: "memvid::extract",
                "input is not a PDF, keeping primary extractor output"
            );
            return content;
        }

        match pdf_text_fallback(bytes) {
            Ok(Some(fallback_text)) => {
                tracing::debug!(
                    target: "memvid::extract",
                    fallback_len = fallback_text.len(),
                    "lopdf fallback succeeded"
                );
                fallback_text
            }
            Ok(None) => {
                tracing::debug!(
                    target: "memvid::extract",
                    "lopdf fallback returned None"
                );
                String::new()
            }
            Err(e) => {
                tracing::debug!(
                    target: "memvid::extract",
                    error = %e,
                    "lopdf fallback failed"
                );
                String::new()
            }
        }
    }

    /// Tries the lopdf fallback after the primary extractor failed with `primary_reason`.
    ///
    /// Returns the fallback document on success. Otherwise the error carries the primary
    /// reason, combined with the fallback error when the fallback itself failed.
    fn fallback_after_failure(
        &self,
        primary_reason: String,
        bytes: &[u8],
    ) -> Result<ExtractedDocument> {
        match pdf_text_fallback(bytes) {
            Ok(Some(fallback_text)) => {
                Ok(self.into_document(fallback_text, pdf_fallback_metadata()))
            }
            Ok(None) => Err(MemvidError::ExtractionFailed {
                reason: primary_reason.into(),
            }),
            Err(fallback_err) => {
                let reason = format!(
                    "primary extractor error: {}; PDF fallback error: {}",
                    primary_reason, fallback_err
                );
                Err(MemvidError::ExtractionFailed {
                    reason: reason.into(),
                })
            }
        }
    }

    /// Locks the extractor, mapping a poisoned mutex to an extraction error.
    fn locked(&self) -> Result<std::sync::MutexGuard<'_, Extractor>> {
        self.extractor
            .lock()
            .map_err(|_| MemvidError::ExtractionFailed {
                reason: "extractor mutex poisoned".into(),
            })
    }

    /// Assembles an [`ExtractedDocument`] from raw text and serialisable metadata.
    ///
    /// Whitespace-only content becomes `text: None`; longer content is cut at the boundary
    /// reported by `truncate_at_grapheme_boundary` for `max_length`. Metadata that fails to
    /// serialise is replaced by `Value::Null`, and the MIME type is read from its
    /// `"Content-Type"` entry.
    fn into_document<M>(&self, content: String, metadata: M) -> ExtractedDocument
    where
        M: serde::Serialize,
    {
        let metadata_value = serde_json::to_value(metadata).unwrap_or(Value::Null);
        let mime_type = metadata_value.get("Content-Type").and_then(value_to_mime);

        let text = if content.trim().is_empty() {
            tracing::debug!(
                target: "memvid::extract",
                "into_document: content is empty, returning text=None"
            );
            None
        } else {
            let final_text = if content.len() > self.max_length {
                let end = truncate_at_grapheme_boundary(&content, self.max_length);
                content[..end].to_string()
            } else {
                content
            };
            tracing::debug!(
                target: "memvid::extract",
                text_len = final_text.len(),
                starts_with_pdf = final_text.starts_with("%PDF"),
                "into_document: returning text"
            );
            Some(final_text)
        };

        ExtractedDocument {
            text,
            metadata: metadata_value,
            mime_type,
        }
    }
}
267
268// ============================================================================
269// Stub DocumentProcessor when extractous is disabled - returns clear error
270// ============================================================================
271
/// Fallback document processor compiled when the `extractous` feature is disabled.
#[cfg(not(feature = "extractous"))]
#[derive(Debug)]
pub struct DocumentProcessor {
    /// Copy of `ProcessorConfig::max_text_chars`, used to truncate extracted text.
    max_length: usize,
}
277
278#[cfg(not(feature = "extractous"))]
279impl Default for DocumentProcessor {
280    fn default() -> Self {
281        Self::new(Default::default())
282    }
283}
284
285#[cfg(not(feature = "extractous"))]
286impl DocumentProcessor {
287    pub fn new(config: ProcessorConfig) -> Self {
288        Self {
289            max_length: config.max_text_chars,
290        }
291    }
292
293    pub fn extract_from_path(&self, path: &Path) -> Result<ExtractedDocument> {
294        // Without extractous, we can still handle plain text files
295        let bytes = fs::read(path).map_err(|e| MemvidError::ExtractionFailed {
296            reason: format!("failed to read file: {e}").into(),
297        })?;
298        self.extract_from_bytes(&bytes)
299    }
300
301    pub fn extract_from_bytes(&self, bytes: &[u8]) -> Result<ExtractedDocument> {
302        // Check if this is a PDF - extract text using pdf_extract (if available) or lopdf
303        if is_probably_pdf_simple(bytes) {
304            match pdf_text_extract_best(bytes) {
305                Ok(Some((text, extractor))) => {
306                    let truncate_len = truncate_at_grapheme_boundary(&text, self.max_length);
307                    let truncated = &text[..truncate_len];
308                    return Ok(ExtractedDocument {
309                        text: Some(truncated.to_string()),
310                        metadata: json!({
311                            "Content-Type": "application/pdf",
312                            "extraction": extractor,
313                        }),
314                        mime_type: Some("application/pdf".to_string()),
315                    });
316                }
317                Ok(None) => {
318                    // PDF detected but no text could be extracted (image-only PDF)
319                    return Ok(ExtractedDocument {
320                        text: None,
321                        metadata: json!({
322                            "Content-Type": "application/pdf",
323                            "extraction": "no_text",
324                        }),
325                        mime_type: Some("application/pdf".to_string()),
326                    });
327                }
328                Err(e) => {
329                    tracing::warn!(target: "memvid::extract", error = %e, "PDF extraction failed");
330                    // Fall through to binary handling
331                }
332            }
333        }
334
335        // Without extractous, we can still handle plain text files and common text-based formats
336        // Try to interpret as UTF-8 text first
337        if let Ok(text) = std::str::from_utf8(bytes) {
338            // Check if it's likely text (no null bytes in first 8KB)
339            let sample = &bytes[..bytes.len().min(8192)];
340            if !sample.contains(&0) {
341                let truncate_len = truncate_at_grapheme_boundary(text, self.max_length);
342                let truncated = &text[..truncate_len];
343                return Ok(ExtractedDocument {
344                    text: Some(truncated.to_string()),
345                    metadata: json!({}),
346                    mime_type: Some("text/plain".to_string()),
347                });
348            }
349        }
350
351        // For binary content (video, audio, images, etc.), return success with no text.
352        // This allows binary blobs to be stored without requiring the extractous feature.
353        // The caller can still store the blob; there just won't be extracted text for search.
354        Ok(ExtractedDocument {
355            text: None,
356            metadata: json!({}),
357            mime_type: Some("application/octet-stream".to_string()),
358        })
359    }
360}
361
/// Returns `true` when the primary extractor's output is unusable: blank, or shaped like a
/// raw PDF structure dump.
#[cfg(feature = "extractous")]
fn needs_pdf_fallback(content: &str) -> bool {
    content.trim().is_empty() || looks_like_pdf_structure_dump(content)
}
369
/// Metadata attached to documents whose text came from the lopdf fallback after the primary
/// extractor failed.
#[cfg(feature = "extractous")]
fn pdf_fallback_metadata() -> Value {
    json!({ "Content-Type": "application/pdf", "extraction": "lopdf_fallback" })
}
377
/// Hard cap (64 MiB) on the size of a PDF that `pdf_text_fallback` will load.
#[cfg(feature = "extractous")]
const PDF_FALLBACK_MAX_BYTES: usize = 64 * 1024 * 1024;
/// Maximum number of pages `pdf_text_fallback` will extract text from.
#[cfg(feature = "extractous")]
const PDF_FALLBACK_MAX_PAGES: usize = 4_096;
382
/// Extracts text from `bytes` with lopdf when the primary extractor could not.
///
/// Returns `Ok(None)` when `bytes` is not a PDF, has no pages, or yields only whitespace.
/// Extracted text is passed through `fix_pdf_spacing`.
///
/// # Errors
///
/// Returns [`MemvidError::ExtractionFailed`] when the input exceeds the size or page limits,
/// cannot be loaded, cannot be decrypted with an empty password, or text extraction fails.
#[cfg(feature = "extractous")]
fn pdf_text_fallback(bytes: &[u8]) -> Result<Option<String>> {
    if !is_probably_pdf(bytes) {
        return Ok(None);
    }

    let byte_len = bytes.len();
    if byte_len > PDF_FALLBACK_MAX_BYTES {
        let reason = format!(
            "pdf fallback aborted: {} bytes exceeds limit of {} bytes",
            byte_len, PDF_FALLBACK_MAX_BYTES
        );
        return Err(MemvidError::ExtractionFailed {
            reason: reason.into(),
        });
    }

    // Silence `log` output while lopdf works on the document.
    let _log_guard = ScopedLogLevel::lowered(LevelFilter::Off);
    let mut document =
        LopdfDocument::load_mem(bytes).map_err(|err| MemvidError::ExtractionFailed {
            reason: format!("pdf fallback failed to load document: {err}").into(),
        })?;

    // Try an empty password before giving up on an encrypted file.
    if document.is_encrypted() && document.decrypt("").is_err() {
        return Err(MemvidError::ExtractionFailed {
            reason: "pdf fallback cannot decrypt password-protected file".into(),
        });
    }

    // Best effort: a decompression result is deliberately ignored.
    let _ = document.decompress();

    let mut page_numbers: Vec<u32> = document.get_pages().keys().copied().collect();
    if page_numbers.is_empty() {
        return Ok(None);
    }
    page_numbers.sort_unstable();

    let page_count = page_numbers.len();
    if page_count > PDF_FALLBACK_MAX_PAGES {
        let reason = format!(
            "pdf fallback aborted: page count {} exceeds limit of {}",
            page_count, PDF_FALLBACK_MAX_PAGES
        );
        return Err(MemvidError::ExtractionFailed {
            reason: reason.into(),
        });
    }

    let text = document
        .extract_text(&page_numbers)
        .map_err(|err| MemvidError::ExtractionFailed {
            reason: format!("pdf fallback failed to extract text: {err}").into(),
        })?;

    let trimmed = text.trim();
    // Repair character-level spacing artifacts before handing the text back.
    Ok((!trimmed.is_empty()).then(|| fix_pdf_spacing(trimmed)))
}
448
/// Guard that lowers the `log` crate's maximum level and restores it when dropped.
#[cfg(feature = "extractous")]
struct ScopedLogLevel {
    /// Level that was in effect when the guard was created.
    previous: LevelFilter,
    /// Whether the guard changed the level and so must restore it.
    changed: bool,
}
454
#[cfg(feature = "extractous")]
impl ScopedLogLevel {
    /// Lowers the maximum log level to `level` if that is stricter than the current one.
    ///
    /// The returned guard remembers the previous level; when `level` is not lower, nothing is
    /// changed and dropping the guard is a no-op.
    fn lowered(level: LevelFilter) -> Self {
        let previous = log::max_level();
        let changed = level < previous;
        if changed {
            log::set_max_level(level);
        }
        Self { previous, changed }
    }
}
473
#[cfg(feature = "extractous")]
impl Drop for ScopedLogLevel {
    /// Restores the previous maximum log level, but only if this guard changed it.
    fn drop(&mut self) {
        if !self.changed {
            return;
        }
        log::set_max_level(self.previous);
    }
}
482
/// Returns `true` when `bytes` starts with the `%PDF` magic, after an optional UTF-8 BOM and
/// any leading NUL or ASCII-whitespace bytes.
#[cfg(feature = "extractous")]
fn is_probably_pdf(bytes: &[u8]) -> bool {
    // Same sniffing rules as the feature-independent helper; delegate so the two cannot drift.
    is_probably_pdf_simple(bytes)
}
501
/// Heuristically decides whether `content` is raw PDF object syntax rather than prose.
///
/// Only the first 8,192 bytes (cut back via `truncate_at_grapheme_boundary`) are inspected.
/// The sample must contain at least two `endobj` markers plus one of: an object header
/// (`0 obj` preceded by a space, `\n` or `\r`), `endstream`, or `/Type /Page`. Content
/// shorter than 256 bytes is never classified as a dump.
#[cfg(feature = "extractous")]
fn looks_like_pdf_structure_dump(content: &str) -> bool {
    const MIN_DUMP_LEN: usize = 256;
    const SAMPLE_LIMIT: usize = 8_192;
    const STRUCTURE_MARKERS: [&str; 5] =
        [" 0 obj", "\n0 obj", "\r0 obj", "endstream", "/Type /Page"];

    if content.len() < MIN_DUMP_LEN {
        return false;
    }

    // Cut the sample at a safe boundary so slicing cannot split a character.
    let cut = truncate_at_grapheme_boundary(content, content.len().min(SAMPLE_LIMIT));
    let sample = &content[..cut];

    // A second `endobj` must exist before any other marker is considered.
    if sample.matches("endobj").nth(1).is_none() {
        return false;
    }

    STRUCTURE_MARKERS
        .iter()
        .any(|marker| sample.contains(*marker))
}
521
/// Reads a MIME type out of a metadata value: either the string itself, or the first string
/// entry of an array. Any other shape yields `None`.
#[cfg(feature = "extractous")]
fn value_to_mime(value: &Value) -> Option<String> {
    match value {
        Value::String(mime) => Some(mime.clone()),
        Value::Array(entries) => entries
            .iter()
            .find_map(|entry| entry.as_str().map(str::to_string)),
        _ => None,
    }
}
536
/// Returns a clone of the cached document for `hash`, if any.
///
/// A poisoned cache mutex is treated as a cache miss.
#[cfg(feature = "extractous")]
fn cache_lookup(hash: &blake3::Hash) -> Option<ExtractedDocument> {
    let cache = EXTRACTION_CACHE.get_or_init(|| Mutex::new(HashMap::new()));
    let entries = cache.lock().ok()?;
    entries.get(hash).cloned()
}
542
/// Stores a clone of `document` under `hash`, replacing any previous entry.
///
/// A poisoned cache mutex makes this a silent no-op.
#[cfg(feature = "extractous")]
fn cache_store(hash: blake3::Hash, document: &ExtractedDocument) {
    let cache = EXTRACTION_CACHE.get_or_init(|| Mutex::new(HashMap::new()));
    let Ok(mut entries) = cache.lock() else {
        return;
    };
    entries.insert(hash, document.clone());
}
550
551// ============================================================================
552// PDF extraction helpers (available without extractous feature)
553// Priority: pdf_oxide (best accuracy) > pdf_extract > lopdf
554// ============================================================================
555
/// Hard cap (64 MiB) on the size of a PDF that `pdf_text_extract_lopdf` will load.
// In this file the only consumer chain starts in the non-`extractous` processor, hence the
// `dead_code` allowance for builds where that processor is compiled out.
#[allow(dead_code)]
const PDF_LOPDF_MAX_BYTES: usize = 64 * 1024 * 1024;
/// Maximum number of pages `pdf_text_extract_lopdf` will extract text from.
#[allow(dead_code)]
const PDF_LOPDF_MAX_PAGES: usize = 4_096;
560
561/// Try multiple PDF extractors and return the best result
562/// Returns (text, extractor_name) or None if no text found
563/// Priority: pdf_oxide (2025, best accuracy) > pdf_extract > lopdf
564#[allow(dead_code)]
565fn pdf_text_extract_best(bytes: &[u8]) -> Result<Option<(String, &'static str)>> {
566    let mut best_text: Option<String> = None;
567    let mut best_source: &'static str = "";
568
569    // Calculate minimum "good" threshold based on file size
570    #[cfg(any(feature = "pdf_oxide", feature = "pdf_extract"))]
571    let min_good_chars = (bytes.len() / 100).clamp(500, 5000);
572
573    // Try pdf_oxide first (if feature enabled) - highest accuracy, perfect word spacing
574    // Wrap in catch_unwind because cff-parser may panic on certain fonts (ligatures)
575    #[cfg(feature = "pdf_oxide")]
576    {
577        let bytes_clone = bytes.to_vec();
578        let oxide_result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
579            pdf_text_extract_oxide(&bytes_clone)
580        }));
581
582        match oxide_result {
583            Ok(Ok(Some(text))) => {
584                let trimmed = text.trim();
585                if !trimmed.is_empty() {
586                    if trimmed.len() >= min_good_chars {
587                        tracing::debug!(
588                            target: "memvid::extract",
589                            len = trimmed.len(),
590                            "pdf_oxide succeeded with good result"
591                        );
592                        return Ok(Some((trimmed.to_string(), "pdf_oxide")));
593                    }
594                    tracing::debug!(
595                        target: "memvid::extract",
596                        len = trimmed.len(),
597                        min_good = min_good_chars,
598                        "pdf_oxide returned partial result, trying fallbacks"
599                    );
600                    best_text = Some(trimmed.to_string());
601                    best_source = "pdf_oxide";
602                }
603            }
604            Ok(Ok(None)) => {
605                tracing::debug!(target: "memvid::extract", "pdf_oxide returned no text");
606            }
607            Ok(Err(e)) => {
608                tracing::debug!(target: "memvid::extract", error = %e, "pdf_oxide failed");
609            }
610            Err(_) => {
611                tracing::warn!(target: "memvid::extract", "pdf_oxide panicked (likely font parsing issue), falling back to other extractors");
612            }
613        }
614    }
615
616    // Try pdf_extract next (if feature enabled)
617    #[cfg(feature = "pdf_extract")]
618    {
619        match pdf_extract::extract_text_from_mem(bytes) {
620            Ok(text) => {
621                let trimmed = text.trim();
622                if !trimmed.is_empty() {
623                    if best_text.is_none() && trimmed.len() >= min_good_chars {
624                        tracing::debug!(
625                            target: "memvid::extract",
626                            len = trimmed.len(),
627                            "pdf_extract succeeded with good result"
628                        );
629                        return Ok(Some((trimmed.to_string(), "pdf_extract")));
630                    }
631                    // Use if better than current best
632                    if best_text
633                        .as_ref()
634                        .map_or(true, |prev| trimmed.len() > prev.len())
635                    {
636                        best_text = Some(trimmed.to_string());
637                        best_source = "pdf_extract";
638                    }
639                }
640            }
641            Err(e) => {
642                tracing::debug!(target: "memvid::extract", error = %e, "pdf_extract failed");
643            }
644        }
645    }
646
647    // Try lopdf (pure Rust, always available)
648    match pdf_text_extract_lopdf(bytes) {
649        Ok(Some(text)) => {
650            let trimmed = text.trim();
651            if !trimmed.is_empty() {
652                // Use lopdf result if better than previous
653                if best_text
654                    .as_ref()
655                    .map_or(true, |prev| trimmed.len() > prev.len())
656                {
657                    tracing::debug!(
658                        target: "memvid::extract",
659                        len = trimmed.len(),
660                        "lopdf extracted more text"
661                    );
662                    best_text = Some(trimmed.to_string());
663                    best_source = "lopdf";
664                }
665            }
666        }
667        Ok(None) => {
668            tracing::debug!(target: "memvid::extract", "lopdf returned no text");
669        }
670        Err(e) => {
671            tracing::debug!(target: "memvid::extract", error = %e, "lopdf failed");
672        }
673    }
674
675    // Apply fix_pdf_spacing to repair character-level spacing artifacts from PDF encoding
676    Ok(best_text.map(|t| (fix_pdf_spacing(&t), best_source)))
677}
678
/// Extracts text from a PDF with pdf_oxide.
///
/// The document is opened from a file path, so `bytes` is first written to a temporary file.
/// Pages that fail to extract are logged and skipped; the remaining non-empty page texts are
/// joined with `\n`. Returns `Ok(None)` for a zero-page document or when the combined text is
/// blank.
///
/// # Errors
///
/// Returns [`MemvidError::ExtractionFailed`] when the temporary file cannot be created,
/// written or flushed, or when pdf_oxide cannot open the document or report its page count.
#[cfg(feature = "pdf_oxide")]
#[allow(dead_code)]
fn pdf_text_extract_oxide(bytes: &[u8]) -> Result<Option<String>> {
    use pdf_oxide::PdfDocument;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Builds the `pdf_oxide failed to <action>: <err>` extraction error.
    fn oxide_error(action: &str, err: impl std::fmt::Display) -> MemvidError {
        MemvidError::ExtractionFailed {
            reason: format!("pdf_oxide failed to {action}: {err}").into(),
        }
    }

    // Stage the bytes on disk for pdf_oxide to open.
    let mut temp_file =
        NamedTempFile::new().map_err(|err| oxide_error("create temp file", err))?;
    temp_file
        .write_all(bytes)
        .map_err(|err| oxide_error("write temp file", err))?;
    temp_file
        .flush()
        .map_err(|err| oxide_error("flush temp file", err))?;

    let mut doc =
        PdfDocument::open(temp_file.path()).map_err(|err| oxide_error("load PDF", err))?;

    let page_count = doc
        .page_count()
        .map_err(|err| oxide_error("get page count", err))?;
    if page_count == 0 {
        return Ok(None);
    }

    let mut pages = Vec::new();
    for page_idx in 0..page_count {
        match doc.extract_text(page_idx) {
            Ok(text) if !text.is_empty() => pages.push(text),
            Ok(_) => {}
            Err(e) => {
                tracing::debug!(
                    target: "memvid::extract",
                    page = page_idx,
                    error = %e,
                    "pdf_oxide failed to extract page"
                );
            }
        }
    }

    let all_text = pages.join("\n");
    let trimmed = all_text.trim();
    Ok((!trimmed.is_empty()).then(|| trimmed.to_string()))
}
748
/// Sniffs `bytes` for the `%PDF` magic.
///
/// An optional UTF-8 BOM at the very start and any run of NUL or ASCII-whitespace bytes after
/// it are skipped before the magic is checked. Empty input is never a PDF.
#[allow(dead_code)]
fn is_probably_pdf_simple(bytes: &[u8]) -> bool {
    const UTF8_BOM: [u8; 3] = [0xEF, 0xBB, 0xBF];

    let body = bytes.strip_prefix(&UTF8_BOM).unwrap_or(bytes);
    // Index of the first byte that is neither NUL nor ASCII whitespace.
    let start = body
        .iter()
        .position(|byte| *byte != 0 && !byte.is_ascii_whitespace())
        .unwrap_or(body.len());
    body[start..].starts_with(b"%PDF")
}
770
771/// Extract text from a PDF using lopdf (pure Rust, no external dependencies)
772#[allow(dead_code)]
773fn pdf_text_extract_lopdf(bytes: &[u8]) -> Result<Option<String>> {
774    if bytes.len() > PDF_LOPDF_MAX_BYTES {
775        return Err(MemvidError::ExtractionFailed {
776            reason: format!(
777                "PDF too large: {} bytes exceeds limit of {} bytes",
778                bytes.len(),
779                PDF_LOPDF_MAX_BYTES
780            )
781            .into(),
782        });
783    }
784
785    let mut document =
786        LopdfDocument::load_mem(bytes).map_err(|err| MemvidError::ExtractionFailed {
787            reason: format!("failed to load PDF: {err}").into(),
788        })?;
789
790    // Try to decrypt if encrypted (empty password for unprotected PDFs)
791    if document.is_encrypted() {
792        if document.decrypt("").is_err() {
793            return Err(MemvidError::ExtractionFailed {
794                reason: "cannot decrypt password-protected PDF".into(),
795            });
796        }
797    }
798
799    // Decompress streams for better text extraction
800    let _ = document.decompress();
801
802    let mut page_numbers: Vec<u32> = document.get_pages().keys().copied().collect();
803    if page_numbers.is_empty() {
804        return Ok(None);
805    }
806    page_numbers.sort_unstable();
807
808    if page_numbers.len() > PDF_LOPDF_MAX_PAGES {
809        return Err(MemvidError::ExtractionFailed {
810            reason: format!(
811                "PDF has too many pages: {} exceeds limit of {}",
812                page_numbers.len(),
813                PDF_LOPDF_MAX_PAGES
814            )
815            .into(),
816        });
817    }
818
819    match document.extract_text(&page_numbers) {
820        Ok(text) => {
821            let trimmed = text.trim();
822            if trimmed.is_empty() {
823                Ok(None)
824            } else {
825                Ok(Some(trimmed.to_string()))
826            }
827        }
828        Err(err) => Err(MemvidError::ExtractionFailed {
829            reason: format!("failed to extract text from PDF: {err}").into(),
830        }),
831    }
832}
833
#[cfg(all(test, feature = "extractous"))]
mod tests {
    use super::*;

    /// Repeated PDF object syntax is classified as a structure dump.
    #[test]
    fn detects_pdf_like_dump() {
        let dump =
            "binary %PDF snippet endobj endobj endstream 0 obj /Type /Page endobj ".repeat(12);
        assert!(looks_like_pdf_structure_dump(&dump));
    }

    /// Ordinary prose is left alone.
    #[test]
    fn skips_normal_text() {
        let prose =
            "This is a perfectly normal paragraph that should not trigger the PDF fallback.";
        assert!(!looks_like_pdf_structure_dump(prose));
    }

    /// The `%PDF` magic is recognised with or without leading whitespace.
    #[test]
    fn identifies_pdf_magic() {
        assert!(is_probably_pdf(b"%PDF-1.7 some data"));
        assert!(is_probably_pdf(b"\n\n%PDF-1.5"));
        assert!(!is_probably_pdf(b"<!doctype html>"));
    }
}
861
#[cfg(all(test, feature = "extractous"))]
mod pdf_fix_tests {
    use super::*;

    /// Regression guard for the detection helpers behind the lopdf fallback.
    ///
    /// Checks that PDF bytes are recognised by their magic, that a PDF structure dump is
    /// flagged, and that ordinary text is not. These are the helpers that keep raw PDF
    /// content from being indexed as document text.
    #[test]
    fn test_pdf_structure_dump_detection_prevents_raw_indexing() {
        // Synthetic PDF header plus one object: must be recognised as a PDF.
        let raw_pdf = b"%PDF-1.4\n%\xff\xff\xff\xff\n1 0 obj\n<</Type/Catalog>>\nendobj\n";
        assert!(is_probably_pdf(raw_pdf));

        // Text shaped like PDF object syntax must be flagged as a dump.
        let dumped =
            "binary %PDF snippet endobj endobj endstream 0 obj /Type /Page endobj ".repeat(12);
        assert!(looks_like_pdf_structure_dump(&dumped));

        // Ordinary extracted text must not be flagged.
        let prose = "This is perfectly normal extracted text from a document.";
        assert!(!looks_like_pdf_structure_dump(prose));
    }
}
memvid_core/extract.rs

memvid_core/
extract.rs