Skip to main content

memvid_core/reader/
pdf.rs

1#[cfg(not(feature = "pdfium"))]
2use std::sync::OnceLock;
3
4use crate::{DocumentFormat, DocumentReader, ReaderHint, ReaderOutput, Result};
5
6#[cfg(not(feature = "pdfium"))]
7use crate::{DocumentProcessor, ReaderDiagnostics};
8
9#[cfg(feature = "pdfium")]
10use crate::PassthroughReader;
11#[cfg(feature = "pdfium")]
12use pdfium_render::prelude::*;
13#[cfg(feature = "pdfium")]
14use serde_json::json;
15#[cfg(feature = "pdfium")]
16use std::time::{Duration, Instant};
17
/// Document reader for PDF input.
///
/// With the `pdfium` feature enabled, text is extracted through Pdfium; if
/// Pdfium fails, the passthrough reader's output is returned with a warning
/// recorded in its diagnostics. Without the feature, extraction is delegated
/// to the shared document processor.
pub struct PdfReader;
21
/// Maximum number of pages Pdfium extraction will read; a document with more
/// pages makes the extraction fail.
#[cfg(feature = "pdfium")]
const PDFIUM_MAX_PAGES: u32 = 4_096;
/// Time budget for a Pdfium extraction. It is checked after the extraction has
/// finished and only results in a diagnostics warning.
#[cfg(feature = "pdfium")]
const PDFIUM_MAX_DURATION: Duration = Duration::from_secs(10);
/// Largest payload (128 MiB) that is handed to Pdfium; bigger inputs are
/// rejected before the library is bound.
#[cfg(feature = "pdfium")]
const PDFIUM_MAX_BYTES: usize = 128 * 1024 * 1024;
28
29impl PdfReader {
30    #[cfg(not(feature = "pdfium"))]
31    fn processor() -> &'static DocumentProcessor {
32        static PROCESSOR: OnceLock<DocumentProcessor> = OnceLock::new();
33        PROCESSOR.get_or_init(DocumentProcessor::default)
34    }
35
36    fn supports_mime(mime: Option<&str>) -> bool {
37        mime.is_some_and(|m| m.eq_ignore_ascii_case("application/pdf"))
38    }
39
40    fn supports_magic(magic: Option<&[u8]>) -> bool {
41        let mut slice = match magic {
42            Some(slice) if !slice.is_empty() => slice,
43            _ => return false,
44        };
45        if slice.starts_with(&[0xEF, 0xBB, 0xBF]) {
46            slice = &slice[3..];
47        }
48        while let Some((first, rest)) = slice.split_first() {
49            if first.is_ascii_whitespace() {
50                slice = rest;
51            } else {
52                break;
53            }
54        }
55        slice.starts_with(b"%PDF")
56    }
57
58    #[cfg(feature = "pdfium")]
59    fn extract_with_pdfium(bytes: &[u8]) -> Result<(String, u32, u64)> {
60        if bytes.len() > PDFIUM_MAX_BYTES {
61            return Err(crate::MemvidError::ExtractionFailed {
62                reason: format!(
63                    "pdfium payload exceeds limit ({} bytes > {} bytes)",
64                    bytes.len(),
65                    PDFIUM_MAX_BYTES
66                )
67                .into(),
68            });
69        }
70        let pdfium = Pdfium::bind_to_system_library()
71            .map(Pdfium::new)
72            .map_err(|err| crate::MemvidError::ExtractionFailed {
73                reason: format!("failed to bind pdfium: {err}").into(),
74            })?;
75        let start = Instant::now();
76        let document = pdfium
77            .load_pdf_from_byte_slice(bytes, None)
78            .map_err(|err| crate::MemvidError::ExtractionFailed {
79                reason: format!("pdfium failed to load pdf: {err}").into(),
80            })?;
81
82        let mut combined = String::new();
83        let mut pages = 0u32;
84
85        for index in 0..document.pages().len() {
86            if pages >= PDFIUM_MAX_PAGES {
87                return Err(crate::MemvidError::ExtractionFailed {
88                    reason: format!("pdfium page limit reached (>{} pages)", PDFIUM_MAX_PAGES)
89                        .into(),
90                });
91            }
92            let page = document.pages().get(index).map_err(|err| {
93                crate::MemvidError::ExtractionFailed {
94                    reason: format!("pdfium failed to access page {index}: {err}").into(),
95                }
96            })?;
97            let page_text = page
98                .text()
99                .map_err(|err| crate::MemvidError::ExtractionFailed {
100                    reason: format!("pdfium failed to extract page {index} text: {err}").into(),
101                })?;
102            let chunk = page_text.all();
103            combined.push_str(&chunk);
104            combined.push('\n');
105            pages += 1;
106        }
107
108        let duration_ms = start.elapsed().as_millis().try_into().unwrap_or(u64::MAX);
109        let trimmed = combined.trim();
110        if trimmed.is_empty() {
111            return Err(crate::MemvidError::ExtractionFailed {
112                reason: "pdfium produced no textual content".into(),
113            });
114        }
115
116        Ok((trimmed.to_string(), pages, duration_ms))
117    }
118}
119
120impl DocumentReader for PdfReader {
121    fn name(&self) -> &'static str {
122        "pdf"
123    }
124
125    fn supports(&self, hint: &ReaderHint<'_>) -> bool {
126        matches!(hint.format, Some(DocumentFormat::Pdf))
127            || Self::supports_mime(hint.mime)
128            || Self::supports_magic(hint.magic_bytes)
129    }
130
131    fn extract(&self, bytes: &[u8], hint: &ReaderHint<'_>) -> Result<ReaderOutput> {
132        #[cfg(feature = "pdfium")]
133        {
134            let result = Self::extract_with_pdfium(bytes);
135            let output = match result {
136                Ok((text, pages, duration_ms)) => {
137                    let mut base = PassthroughReader.extract(bytes, hint)?;
138                    base.reader_name = self.name().to_string();
139                    base.document.text = Some(text);
140                    base.diagnostics.duration_ms = Some(duration_ms);
141                    base.diagnostics.pages_processed = Some(pages);
142                    base.diagnostics.extra_metadata = json!({
143                        "pages": pages,
144                        "reader": "pdfium",
145                        "duration_ms": duration_ms,
146                    });
147                    if Duration::from_millis(duration_ms) > PDFIUM_MAX_DURATION {
148                        base.diagnostics.track_warning(format!(
149                            "pdfium extraction exceeded timeout {:?}",
150                            PDFIUM_MAX_DURATION
151                        ));
152                    }
153                    base
154                }
155                Err(err) => {
156                    let mut fallback = PassthroughReader.extract(bytes, hint)?;
157                    fallback.reader_name = self.name().to_string();
158                    fallback
159                        .diagnostics
160                        .track_warning(format!("pdfium extraction failed: {err}"));
161                    fallback
162                }
163            };
164            return Ok(output);
165        }
166
167        #[cfg(not(feature = "pdfium"))]
168        {
169            let _ = hint;
170            let document = Self::processor().extract_from_bytes(bytes)?;
171            Ok(ReaderOutput::new(document, self.name())
172                .with_diagnostics(ReaderDiagnostics::default()))
173        }
174    }
175}