1use std::fs;
2use std::path::Path;
3
4use crate::{Result, error::MemvidError, text::truncate_at_grapheme_boundary};
5#[cfg(feature = "extractous")]
6use log::LevelFilter;
7#[cfg(feature = "extractous")]
8use lopdf::Document as LopdfDocument;
9use serde_json::{Value, json};
10
11#[cfg(feature = "extractous")]
12use extractous::Extractor;
13#[cfg(feature = "extractous")]
14use std::collections::HashMap;
15#[cfg(feature = "extractous")]
16use std::sync::{Mutex, OnceLock};
17
/// Result of one extraction attempt: recovered text plus whatever the
/// extractor reported about the source.
#[derive(Debug, Clone)]
pub struct ExtractedDocument {
    /// Extracted plain text; `None` when nothing (or only whitespace) was
    /// recovered.
    pub text: Option<String>,
    /// Extractor-supplied metadata as JSON; `Value::Null` when unavailable.
    pub metadata: Value,
    /// MIME type taken from the metadata's `Content-Type` entry, if present.
    pub mime_type: Option<String>,
}
26
27impl ExtractedDocument {
28 pub fn empty() -> Self {
29 Self {
30 text: None,
31 metadata: Value::Null,
32 mime_type: None,
33 }
34 }
35}
36
/// Tuning knobs for `DocumentProcessor`.
#[derive(Debug, Clone, Copy)]
pub struct ProcessorConfig {
    /// Upper bound on retained text; longer content is truncated at a
    /// grapheme boundary. NOTE(review): the limit is compared against byte
    /// length (`content.len()`) despite the `chars` name — confirm intent.
    pub max_text_chars: usize,
}
41
42impl Default for ProcessorConfig {
43 fn default() -> Self {
44 Self {
45 max_text_chars: 2_000_000,
46 }
47 }
48}
49
/// Text/metadata extractor backed by extractous, with a lopdf-based fallback
/// for PDFs that extractous mishandles.
#[cfg(feature = "extractous")]
#[derive(Debug)]
pub struct DocumentProcessor {
    /// The extractous engine, wrapped in a `Mutex` so `&self` methods can be
    /// called from multiple threads.
    extractor: Mutex<Extractor>,
    /// Maximum text size retained (from `ProcessorConfig::max_text_chars`).
    max_length: usize,
}
60
#[cfg(feature = "extractous")]
impl Default for DocumentProcessor {
    /// Equivalent to `DocumentProcessor::new(ProcessorConfig::default())`.
    fn default() -> Self {
        DocumentProcessor::new(ProcessorConfig::default())
    }
}
67
/// Process-wide memoization of `extract_from_bytes` results, keyed by the
/// blake3 hash of the input bytes. Lazily initialized; never evicted, so it
/// grows for the lifetime of the process.
#[cfg(feature = "extractous")]
static EXTRACTION_CACHE: OnceLock<Mutex<HashMap<blake3::Hash, ExtractedDocument>>> =
    OnceLock::new();
71
#[cfg(feature = "extractous")]
impl DocumentProcessor {
    /// Builds a processor backed by an extractous `Extractor`.
    ///
    /// The configured `max_text_chars` is clamped to `i32::MAX` before being
    /// handed to extractous (whose limit is an `i32`, as forced by the
    /// `try_into().unwrap_or(i32::MAX)` below); the unclamped value is kept
    /// in `max_length` for this type's own truncation in `into_document`.
    pub fn new(config: ProcessorConfig) -> Self {
        let capped = config
            .max_text_chars
            .min(i32::MAX as usize)
            .try_into()
            .unwrap_or(i32::MAX);
        let mut extractor = Extractor::new().set_extract_string_max_length(capped);
        // Plain-text output, not the XML/XHTML representation.
        extractor = extractor.set_xml_output(false);
        Self {
            extractor: Mutex::new(extractor),
            max_length: config.max_text_chars,
        }
    }

    /// Extracts text and metadata from the file at `path`.
    ///
    /// The primary path is extractous; if it errors, or it "succeeds" but
    /// returns empty text or a raw PDF structure dump, a lopdf fallback
    /// re-reads the file bytes. NOTE(review): unlike `extract_from_bytes`,
    /// this path does not consult `EXTRACTION_CACHE` — confirm deliberate.
    pub fn extract_from_path(&self, path: &Path) -> Result<ExtractedDocument> {
        // extractous takes the path as &str, so non-UTF-8 paths are rejected.
        let path_str = path.to_str().ok_or_else(|| MemvidError::ExtractionFailed {
            reason: "input path contains invalid UTF-8".into(),
        })?;

        // Scope both guards to the extraction call: the mutex guard so the
        // extractor is released promptly, and the log guard so `log`-facade
        // output is silenced only for the duration of the call.
        let extraction = {
            let extractor = self.locked()?;
            let _log_guard = ScopedLogLevel::lowered(LevelFilter::Off);
            extractor.extract_file_to_string(path_str)
        };

        match extraction {
            Ok((mut content, metadata)) => {
                // extractous returned, but the content may still be unusable;
                // try the lopdf fallback. Read/fallback errors are ignored
                // here on purpose — the primary content stands.
                if needs_pdf_fallback(&content) {
                    if let Ok(bytes) = fs::read(path) {
                        if let Ok(Some(fallback_text)) = pdf_text_fallback(&bytes) {
                            content = fallback_text;
                        }
                    }
                }
                Ok(self.into_document(content, metadata))
            }
            Err(err) => {
                let primary_reason = err.to_string();
                // Primary extraction failed outright; the lopdf fallback is
                // the only remaining chance to recover text.
                if let Ok(bytes) = fs::read(path) {
                    match pdf_text_fallback(&bytes) {
                        Ok(Some(fallback_text)) => {
                            return Ok(self.into_document(fallback_text, pdf_fallback_metadata()));
                        }
                        // Not a PDF (or no text): fall through and surface
                        // the primary error below.
                        Ok(None) => {}
                        Err(fallback_err) => {
                            // Both attempts failed; report both reasons.
                            let reason = format!(
                                "primary extractor error: {}; PDF fallback error: {}",
                                primary_reason, fallback_err
                            );
                            return Err(MemvidError::ExtractionFailed {
                                reason: reason.into(),
                            });
                        }
                    }
                }
                Err(MemvidError::ExtractionFailed {
                    reason: primary_reason.into(),
                })
            }
        }
    }

    /// Extracts text and metadata from an in-memory byte buffer.
    ///
    /// Results are memoized in `EXTRACTION_CACHE` keyed by the blake3 hash of
    /// `bytes`, so repeated extraction of identical content is served from
    /// the cache. On primary failure or unusable output the lopdf fallback
    /// is attempted, mirroring `extract_from_path`.
    pub fn extract_from_bytes(&self, bytes: &[u8]) -> Result<ExtractedDocument> {
        let hash = blake3::hash(bytes);
        if let Some(cached) = cache_lookup(&hash) {
            // NOTE(review): `target = ...` records a *field* named `target`,
            // unlike the `target: ...` directive used by every other tracing
            // call in this impl — likely unintended; verify.
            tracing::debug!(target = "memvid::extract", reader = "cache", "cache hit");
            return Ok(cached);
        }

        // Same guard scoping as extract_from_path: hold the extractor lock
        // and silence `log` output only for the extraction call itself.
        let extraction = {
            let extractor = self.locked()?;
            let _log_guard = ScopedLogLevel::lowered(LevelFilter::Off);
            extractor.extract_bytes_to_string(bytes)
        };

        let document = match extraction {
            Ok((mut content, metadata)) => {
                let pdf_needed = needs_pdf_fallback(&content);
                tracing::debug!(
                    target: "memvid::extract",
                    content_len = content.len(),
                    pdf_fallback_needed = pdf_needed,
                    "extractous returned content"
                );
                if pdf_needed {
                    match pdf_text_fallback(bytes) {
                        Ok(Some(fallback_text)) => {
                            tracing::debug!(
                                target: "memvid::extract",
                                fallback_len = fallback_text.len(),
                                "lopdf fallback succeeded"
                            );
                            content = fallback_text;
                        }
                        Ok(None) => {
                            tracing::debug!(
                                target: "memvid::extract",
                                "lopdf fallback returned None"
                            );
                            // Unusable primary content and no fallback text:
                            // drop the content so the document carries None.
                            content = String::new();
                        }
                        Err(e) => {
                            tracing::debug!(
                                target: "memvid::extract",
                                error = %e,
                                "lopdf fallback failed"
                            );
                            // Fallback error after a "successful" primary run
                            // is non-fatal; the content is simply discarded.
                            content = String::new();
                        }
                    }
                }
                self.into_document(content, metadata)
            }
            Err(err) => {
                let primary_reason = err.to_string();
                match pdf_text_fallback(bytes) {
                    Ok(Some(fallback_text)) => {
                        self.into_document(fallback_text, pdf_fallback_metadata())
                    }
                    // Not a PDF (or no text): surface the primary error.
                    // Note the early returns here skip the cache_store below.
                    Ok(None) => {
                        return Err(MemvidError::ExtractionFailed {
                            reason: primary_reason.into(),
                        });
                    }
                    Err(fallback_err) => {
                        let reason = format!(
                            "primary extractor error: {}; PDF fallback error: {}",
                            primary_reason, fallback_err
                        );
                        return Err(MemvidError::ExtractionFailed {
                            reason: reason.into(),
                        });
                    }
                }
            }
        };

        cache_store(hash, &document);
        Ok(document)
    }

    /// Acquires the extractor mutex, mapping a poisoned lock to an
    /// `ExtractionFailed` error instead of panicking.
    fn locked(&self) -> Result<std::sync::MutexGuard<'_, Extractor>> {
        self.extractor
            .lock()
            .map_err(|_| MemvidError::ExtractionFailed {
                reason: "extractor mutex poisoned".into(),
            })
    }

    /// Normalizes raw extractor output into an `ExtractedDocument`.
    ///
    /// Serializes `metadata` to JSON (falling back to `Value::Null` on
    /// serialization failure), derives the MIME type from its `Content-Type`
    /// entry, maps whitespace-only content to `text: None`, and truncates
    /// content longer than `max_length` bytes at a grapheme boundary.
    fn into_document<M>(&self, content: String, metadata: M) -> ExtractedDocument
    where
        M: serde::Serialize,
    {
        let metadata_value = serde_json::to_value(metadata).unwrap_or(Value::Null);
        let mime_type = metadata_value.get("Content-Type").and_then(value_to_mime);

        let text = if content.trim().is_empty() {
            tracing::debug!(
                target: "memvid::extract",
                "into_document: content is empty, returning text=None"
            );
            None
        } else {
            // Byte-length comparison; truncate_at_grapheme_boundary keeps the
            // cut from splitting a grapheme cluster.
            let final_text = if content.len() > self.max_length {
                let end = truncate_at_grapheme_boundary(&content, self.max_length);
                content[..end].to_string()
            } else {
                content
            };
            tracing::debug!(
                target: "memvid::extract",
                text_len = final_text.len(),
                starts_with_pdf = final_text.starts_with("%PDF"),
                "into_document: returning text"
            );
            Some(final_text)
        };

        ExtractedDocument {
            text,
            metadata: metadata_value,
            mime_type,
        }
    }
}
262
/// Minimal fallback processor used when the `extractous` feature is off:
/// it only recognizes plain UTF-8 text and treats everything else as an
/// opaque binary.
#[cfg(not(feature = "extractous"))]
#[derive(Debug)]
pub struct DocumentProcessor {
    /// Maximum text size retained (from `ProcessorConfig::max_text_chars`).
    max_length: usize,
}
272
273#[cfg(not(feature = "extractous"))]
274impl Default for DocumentProcessor {
275 fn default() -> Self {
276 Self::new(Default::default())
277 }
278}
279
280#[cfg(not(feature = "extractous"))]
281impl DocumentProcessor {
282 pub fn new(config: ProcessorConfig) -> Self {
283 Self {
284 max_length: config.max_text_chars,
285 }
286 }
287
288 pub fn extract_from_path(&self, path: &Path) -> Result<ExtractedDocument> {
289 let bytes = fs::read(path).map_err(|e| MemvidError::ExtractionFailed {
291 reason: format!("failed to read file: {e}").into(),
292 })?;
293 self.extract_from_bytes(&bytes)
294 }
295
296 pub fn extract_from_bytes(&self, bytes: &[u8]) -> Result<ExtractedDocument> {
297 if let Ok(text) = std::str::from_utf8(bytes) {
300 let sample = &bytes[..bytes.len().min(8192)];
302 if !sample.contains(&0) {
303 let truncate_len = truncate_at_grapheme_boundary(text, self.max_length);
304 let truncated = &text[..truncate_len];
305 return Ok(ExtractedDocument {
306 text: Some(truncated.to_string()),
307 metadata: json!({}),
308 mime_type: Some("text/plain".to_string()),
309 });
310 }
311 }
312
313 Ok(ExtractedDocument {
317 text: None,
318 metadata: json!({}),
319 mime_type: Some("application/octet-stream".to_string()),
320 })
321 }
322}
323
/// Decides whether extractous output should be replaced by the lopdf
/// fallback: either nothing usable was extracted, or the "text" is actually
/// a raw PDF structure dump.
#[cfg(feature = "extractous")]
fn needs_pdf_fallback(content: &str) -> bool {
    content.trim().is_empty() || looks_like_pdf_structure_dump(content)
}
331
/// Synthetic metadata attached to documents recovered via the lopdf
/// fallback (there is no extractous metadata to carry over).
#[cfg(feature = "extractous")]
fn pdf_fallback_metadata() -> Value {
    let mut metadata = serde_json::Map::new();
    metadata.insert("Content-Type".to_string(), Value::from("application/pdf"));
    metadata.insert("extraction".to_string(), Value::from("lopdf_fallback"));
    Value::Object(metadata)
}
339
/// Hard cap on input size the lopdf fallback will attempt (64 MiB).
#[cfg(feature = "extractous")]
const PDF_FALLBACK_MAX_BYTES: usize = 64 * 1024 * 1024;

/// Hard cap on the number of pages the lopdf fallback will process.
#[cfg(feature = "extractous")]
const PDF_FALLBACK_MAX_PAGES: usize = 4_096;
344
/// Best-effort text extraction from `bytes` using lopdf, used when the
/// primary extractor fails or returns unusable output.
///
/// Returns `Ok(None)` when the input does not look like a PDF, has no
/// pages, or yields only whitespace. Returns an `ExtractionFailed` error
/// when the input exceeds the byte/page safety limits, fails to load,
/// cannot be decrypted with an empty password, or text extraction fails.
#[cfg(feature = "extractous")]
fn pdf_text_fallback(bytes: &[u8]) -> Result<Option<String>> {
    if !is_probably_pdf(bytes) {
        return Ok(None);
    }

    // Refuse pathologically large inputs before handing them to lopdf.
    if bytes.len() > PDF_FALLBACK_MAX_BYTES {
        return Err(MemvidError::ExtractionFailed {
            reason: format!(
                "pdf fallback aborted: {} bytes exceeds limit of {} bytes",
                bytes.len(),
                PDF_FALLBACK_MAX_BYTES
            )
            .into(),
        });
    }

    // Silence `log`-facade output while lopdf parses the document.
    let _log_guard = ScopedLogLevel::lowered(LevelFilter::Off);
    let mut document =
        LopdfDocument::load_mem(bytes).map_err(|err| MemvidError::ExtractionFailed {
            reason: format!("pdf fallback failed to load document: {err}").into(),
        })?;

    // Only PDFs encrypted with the empty password can be read here.
    // (Collapsed from a nested `if` — clippy::collapsible_if.)
    if document.is_encrypted() && document.decrypt("").is_err() {
        return Err(MemvidError::ExtractionFailed {
            reason: "pdf fallback cannot decrypt password-protected file".into(),
        });
    }

    // Best effort: extraction can still work on streams that failed to
    // decompress, so the error is intentionally ignored.
    let _ = document.decompress();

    let mut page_numbers: Vec<u32> = document.get_pages().keys().copied().collect();
    if page_numbers.is_empty() {
        return Ok(None);
    }
    page_numbers.sort_unstable();

    if page_numbers.len() > PDF_FALLBACK_MAX_PAGES {
        return Err(MemvidError::ExtractionFailed {
            reason: format!(
                "pdf fallback aborted: page count {} exceeds limit of {}",
                page_numbers.len(),
                PDF_FALLBACK_MAX_PAGES
            )
            .into(),
        });
    }

    match document.extract_text(&page_numbers) {
        Ok(text) => {
            let trimmed = text.trim();
            if trimmed.is_empty() {
                Ok(None)
            } else {
                Ok(Some(trimmed.to_string()))
            }
        }
        Err(err) => Err(MemvidError::ExtractionFailed {
            reason: format!("pdf fallback failed to extract text: {err}").into(),
        }),
    }
}
409
/// RAII guard that temporarily lowers the global `log` max level and
/// restores the previous level when dropped.
#[cfg(feature = "extractous")]
struct ScopedLogLevel {
    /// Level in effect before the guard was created.
    previous: LevelFilter,
    /// Whether the level was actually lowered (and so must be restored).
    changed: bool,
}
415
#[cfg(feature = "extractous")]
impl ScopedLogLevel {
    /// Lowers the global `log` max level to `level` if it is currently
    /// noisier; the returned guard restores the prior level on drop.
    fn lowered(level: LevelFilter) -> Self {
        let previous = log::max_level();
        let changed = level < previous;
        if changed {
            log::set_max_level(level);
        }
        Self { previous, changed }
    }
}
434
#[cfg(feature = "extractous")]
impl Drop for ScopedLogLevel {
    /// Restores the prior global log level, but only if `lowered` changed it.
    fn drop(&mut self) {
        if self.changed {
            log::set_max_level(self.previous);
        }
    }
}
443
/// Heuristic PDF sniff: after an optional UTF-8 BOM and any leading NUL or
/// ASCII-whitespace bytes, the data must start with the `%PDF` magic.
#[cfg(feature = "extractous")]
fn is_probably_pdf(bytes: &[u8]) -> bool {
    if bytes.is_empty() {
        return false;
    }
    let without_bom = bytes.strip_prefix(&[0xEF, 0xBB, 0xBF]).unwrap_or(bytes);
    let start = without_bom
        .iter()
        .position(|b| *b != 0 && !b.is_ascii_whitespace())
        .unwrap_or(without_bom.len());
    without_bom[start..].starts_with(b"%PDF")
}
462
/// Detects extractor output that is actually a raw PDF object dump rather
/// than readable text: inputs of at least 256 bytes whose first (up to)
/// 8 KiB contain two `endobj` markers plus another PDF structure token.
#[cfg(feature = "extractous")]
fn looks_like_pdf_structure_dump(content: &str) -> bool {
    if content.len() < 256 {
        return false;
    }
    // Clamp the sample to a grapheme boundary so slicing cannot panic.
    let safe_len = truncate_at_grapheme_boundary(content, content.len().min(8_192));
    let sample = &content[..safe_len];
    if sample.matches("endobj").take(3).count() < 2 {
        return false;
    }
    let has_obj = [" 0 obj", "\n0 obj", "\r0 obj"]
        .iter()
        .any(|marker| sample.contains(*marker));
    has_obj || sample.contains("endstream") || sample.contains("/Type /Page")
}
482
/// Extracts a MIME string from a metadata value: either a plain string, or
/// the first string entry inside an array.
#[cfg(feature = "extractous")]
fn value_to_mime(value: &Value) -> Option<String> {
    match value {
        Value::String(mime) => Some(mime.clone()),
        Value::Array(entries) => entries
            .iter()
            .find_map(|entry| entry.as_str().map(str::to_string)),
        _ => None,
    }
}
497
/// Returns a clone of the cached extraction for `hash`, if one exists.
/// A poisoned cache lock is treated as a miss.
#[cfg(feature = "extractous")]
fn cache_lookup(hash: &blake3::Hash) -> Option<ExtractedDocument> {
    EXTRACTION_CACHE
        .get_or_init(Mutex::default)
        .lock()
        .ok()
        .and_then(|map| map.get(hash).cloned())
}
503
/// Stores a clone of `document` under `hash`. Silently a no-op when the
/// cache lock is poisoned.
#[cfg(feature = "extractous")]
fn cache_store(hash: blake3::Hash, document: &ExtractedDocument) {
    if let Ok(mut map) = EXTRACTION_CACHE.get_or_init(Mutex::default).lock() {
        map.insert(hash, document.clone());
    }
}
511
#[cfg(all(test, feature = "extractous"))]
mod tests {
    use super::*;

    // A repeated PDF-object snippet (well over the 256-byte floor) must be
    // classified as a structure dump so the lopdf fallback kicks in.
    #[test]
    fn detects_pdf_like_dump() {
        let snippet = "binary %PDF snippet endobj endobj endstream 0 obj /Type /Page endobj ";
        let dump = snippet.repeat(12);
        assert!(looks_like_pdf_structure_dump(&dump));
    }

    // Ordinary prose (also shorter than the 256-byte minimum) must not
    // trigger the fallback heuristic.
    #[test]
    fn skips_normal_text() {
        let text = "This is a perfectly normal paragraph that should not trigger the PDF fallback.";
        assert!(!looks_like_pdf_structure_dump(text));
    }

    // `%PDF` magic is recognized directly and after leading whitespace, but
    // not in unrelated content such as HTML.
    #[test]
    fn identifies_pdf_magic() {
        let bytes = b"%PDF-1.7 some data";
        assert!(is_probably_pdf(bytes));
        let padded = b"\n\n%PDF-1.5";
        assert!(is_probably_pdf(padded));
        let not_pdf = b"<!doctype html>";
        assert!(!is_probably_pdf(not_pdf));
    }
}
539
#[cfg(all(test, feature = "extractous"))]
mod pdf_fix_tests {
    use super::*;

    // Regression guard: raw PDF bytes and structure dumps must both be
    // detected so raw PDF internals are never indexed as document text.
    #[test]
    fn test_pdf_structure_dump_detection_prevents_raw_indexing() {
        // Raw PDF header bytes are sniffed as a PDF.
        let pdf_structure = b"%PDF-1.4\n%\xff\xff\xff\xff\n1 0 obj\n<</Type/Catalog>>\nendobj\n";

        assert!(is_probably_pdf(pdf_structure));

        // Extractor output that is really a PDF object dump is flagged.
        let structure_dump =
            "binary %PDF snippet endobj endobj endstream 0 obj /Type /Page endobj ".repeat(12);
        assert!(looks_like_pdf_structure_dump(&structure_dump));

        // Plain extracted prose passes through untouched.
        let normal_text = "This is perfectly normal extracted text from a document.";
        assert!(!looks_like_pdf_structure_dump(normal_text));
    }
}