memvid_core/reader/
pdf.rs1#[cfg(not(feature = "pdfium"))]
2use std::sync::OnceLock;
3
4use crate::{DocumentFormat, DocumentReader, ReaderHint, ReaderOutput, Result};
5
6#[cfg(not(feature = "pdfium"))]
7use crate::{DocumentProcessor, ReaderDiagnostics};
8
9#[cfg(feature = "pdfium")]
10use crate::PassthroughReader;
11#[cfg(feature = "pdfium")]
12use pdfium_render::prelude::*;
13#[cfg(feature = "pdfium")]
14use serde_json::json;
15#[cfg(feature = "pdfium")]
16use std::time::{Duration, Instant};
17
/// Document reader for PDF input.
///
/// The type carries no state. Its `DocumentReader` implementation extracts
/// text through pdfium when the `pdfium` feature is enabled and through the
/// crate's `DocumentProcessor` otherwise.
pub struct PdfReader;
21
/// Largest number of pages `extract_with_pdfium` accepts; documents with more
/// pages make the extraction fail with an error.
#[cfg(feature = "pdfium")]
const PDFIUM_MAX_PAGES: u32 = 4_096;
/// Soft time budget for a pdfium extraction. Exceeding it does not abort the
/// extraction; `extract` only records a diagnostics warning afterwards.
#[cfg(feature = "pdfium")]
const PDFIUM_MAX_DURATION: Duration = Duration::from_secs(10);
/// Largest payload, in bytes (128 MiB), that is handed to pdfium; bigger
/// inputs are rejected before the library is bound.
#[cfg(feature = "pdfium")]
const PDFIUM_MAX_BYTES: usize = 128 * 1024 * 1024;
28
29impl PdfReader {
30 #[cfg(not(feature = "pdfium"))]
31 fn processor() -> &'static DocumentProcessor {
32 static PROCESSOR: OnceLock<DocumentProcessor> = OnceLock::new();
33 PROCESSOR.get_or_init(DocumentProcessor::default)
34 }
35
/// Reports whether `mime` names the PDF media type.
///
/// Only the media-type essence is compared: anything after the first `;`
/// (parameters such as `charset=binary`) and surrounding whitespace are
/// ignored, and the comparison is ASCII case-insensitive. `None` yields
/// `false`.
fn supports_mime(mime: Option<&str>) -> bool {
    mime.is_some_and(|raw| {
        // `split` always yields at least one item, so the fallback is never used.
        let essence = raw.split(';').next().unwrap_or(raw).trim();
        essence.eq_ignore_ascii_case("application/pdf")
    })
}
40
/// Reports whether the leading bytes look like the start of a PDF file.
///
/// An optional UTF-8 byte-order mark at the very beginning is skipped, then
/// any run of ASCII whitespace, and the bytes that follow must begin with
/// `%PDF`. `None` and empty input yield `false`.
fn supports_magic(magic: Option<&[u8]>) -> bool {
    const UTF8_BOM: &[u8] = &[0xEF, 0xBB, 0xBF];
    const PDF_HEADER: &[u8] = b"%PDF";

    let Some(bytes) = magic else {
        return false;
    };
    let bytes = bytes.strip_prefix(UTF8_BOM).unwrap_or(bytes);
    // Compare exactly the header-sized window that follows the whitespace;
    // a shorter window (truncated input) can never be equal.
    bytes
        .iter()
        .skip_while(|byte| byte.is_ascii_whitespace())
        .take(PDF_HEADER.len())
        .eq(PDF_HEADER)
}
58
/// Extracts the text of every page of a PDF through the system pdfium library.
///
/// On success returns `(text, pages, duration_ms)`: the page texts joined with
/// newlines and trimmed, the number of pages read, and the time spent loading
/// and extracting in milliseconds (binding the library is not timed).
///
/// # Errors
///
/// Returns `MemvidError::ExtractionFailed` when
/// - `bytes` is larger than `PDFIUM_MAX_BYTES`,
/// - the pdfium library cannot be bound or cannot load the document,
/// - the document has more than `PDFIUM_MAX_PAGES` pages,
/// - a page or its text cannot be accessed, or
/// - the document yields no text besides whitespace.
#[cfg(feature = "pdfium")]
fn extract_with_pdfium(bytes: &[u8]) -> Result<(String, u32, u64)> {
    // Every formatted failure in this function maps onto the same error variant.
    let fail = |reason: String| crate::MemvidError::ExtractionFailed {
        reason: reason.into(),
    };

    if bytes.len() > PDFIUM_MAX_BYTES {
        return Err(fail(format!(
            "pdfium payload exceeds limit ({} bytes > {} bytes)",
            bytes.len(),
            PDFIUM_MAX_BYTES
        )));
    }

    let pdfium = Pdfium::bind_to_system_library()
        .map(Pdfium::new)
        .map_err(|err| fail(format!("failed to bind pdfium: {err}")))?;
    let start = Instant::now();
    let document = pdfium
        .load_pdf_from_byte_slice(bytes, None)
        .map_err(|err| fail(format!("pdfium failed to load pdf: {err}")))?;

    // Enforce the page cap before touching any page, so an oversized document
    // is rejected immediately instead of after extracting PDFIUM_MAX_PAGES
    // pages whose text would then be thrown away. Counting through the index
    // range keeps this independent of the integer type pdfium uses for page
    // indices.
    let page_count = (0..document.pages().len()).count();
    let pages = match u32::try_from(page_count) {
        Ok(count) if count <= PDFIUM_MAX_PAGES => count,
        _ => {
            return Err(fail(format!(
                "pdfium page limit reached (>{} pages)",
                PDFIUM_MAX_PAGES
            )));
        }
    };

    let mut combined = String::new();
    for index in 0..document.pages().len() {
        let page = document
            .pages()
            .get(index)
            .map_err(|err| fail(format!("pdfium failed to access page {index}: {err}")))?;
        let page_text = page
            .text()
            .map_err(|err| fail(format!("pdfium failed to extract page {index} text: {err}")))?;
        combined.push_str(&page_text.all());
        combined.push('\n');
    }

    // Saturate rather than silently truncate the u128 millisecond count.
    let duration_ms = u64::try_from(start.elapsed().as_millis()).unwrap_or(u64::MAX);
    let trimmed = combined.trim();
    if trimmed.is_empty() {
        return Err(crate::MemvidError::ExtractionFailed {
            reason: "pdfium produced no textual content".into(),
        });
    }

    Ok((trimmed.to_string(), pages, duration_ms))
}
119}
120
121impl DocumentReader for PdfReader {
122 fn name(&self) -> &'static str {
123 "pdf"
124 }
125
126 fn supports(&self, hint: &ReaderHint<'_>) -> bool {
127 matches!(hint.format, Some(DocumentFormat::Pdf))
128 || Self::supports_mime(hint.mime)
129 || Self::supports_magic(hint.magic_bytes)
130 }
131
132 fn extract(&self, bytes: &[u8], hint: &ReaderHint<'_>) -> Result<ReaderOutput> {
133 #[cfg(feature = "pdfium")]
134 {
135 let result = Self::extract_with_pdfium(bytes);
136 let output = match result {
137 Ok((text, pages, duration_ms)) => {
138 let mut base = PassthroughReader.extract(bytes, hint)?;
139 base.reader_name = self.name().to_string();
140 base.document.text = Some(text);
141 base.diagnostics.duration_ms = Some(duration_ms);
142 base.diagnostics.pages_processed = Some(pages);
143 base.diagnostics.extra_metadata = json!({
144 "pages": pages,
145 "reader": "pdfium",
146 "duration_ms": duration_ms,
147 });
148 if Duration::from_millis(duration_ms) > PDFIUM_MAX_DURATION {
149 base.diagnostics.track_warning(format!(
150 "pdfium extraction exceeded timeout {:?}",
151 PDFIUM_MAX_DURATION
152 ));
153 }
154 base
155 }
156 Err(err) => {
157 let mut fallback = PassthroughReader.extract(bytes, hint)?;
158 fallback.reader_name = self.name().to_string();
159 fallback
160 .diagnostics
161 .track_warning(format!("pdfium extraction failed: {err}"));
162 fallback
163 }
164 };
165 return Ok(output);
166 }
167
168 #[cfg(not(feature = "pdfium"))]
169 {
170 let _ = hint;
171 let document = Self::processor().extract_from_bytes(bytes)?;
172 return Ok(ReaderOutput::new(document, self.name())
173 .with_diagnostics(ReaderDiagnostics::default()));
174 }
175 }
176}