1use std::fs;
2use std::path::Path;
3
4use crate::{Result, error::MemvidError, text::truncate_at_grapheme_boundary};
5#[cfg(feature = "extractous")]
6use log::LevelFilter;
7use lopdf::Document as LopdfDocument;
8use serde_json::{Value, json};
9
10#[cfg(feature = "extractous")]
11use extractous::Extractor;
12#[cfg(feature = "extractous")]
13use std::collections::HashMap;
14#[cfg(feature = "extractous")]
15use std::sync::{Mutex, OnceLock};
16
17#[derive(Debug, Clone)]
20pub struct ExtractedDocument {
21 pub text: Option<String>,
22 pub metadata: Value,
23 pub mime_type: Option<String>,
24}
25
26impl ExtractedDocument {
27 pub fn empty() -> Self {
28 Self {
29 text: None,
30 metadata: Value::Null,
31 mime_type: None,
32 }
33 }
34}
35
/// Tunable limits for [`DocumentProcessor`].
#[derive(Clone, Copy, Debug)]
pub struct ProcessorConfig {
    /// Upper bound on the amount of extracted text kept per document.
    ///
    /// NOTE(review): despite the name, the processors in this file compare this
    /// value against `String::len()` (bytes) before truncating; confirm the
    /// intended unit against `truncate_at_grapheme_boundary`.
    pub max_text_chars: usize,
}

impl Default for ProcessorConfig {
    /// Uses a limit of 2,000,000.
    fn default() -> Self {
        ProcessorConfig {
            max_text_chars: 2_000_000,
        }
    }
}
48
/// Text extractor used when the `extractous` feature is enabled.
///
/// Wraps an `extractous::Extractor` and falls back to lopdf for PDFs the
/// primary extractor cannot read.
#[cfg(feature = "extractous")]
#[derive(Debug)]
pub struct DocumentProcessor {
    /// Primary extractor; every use goes through this mutex.
    extractor: Mutex<Extractor>,
    /// Limit copied from [`ProcessorConfig::max_text_chars`], applied when
    /// truncating extracted text.
    max_length: usize,
}

#[cfg(feature = "extractous")]
impl Default for DocumentProcessor {
    /// Builds a processor from [`ProcessorConfig::default`].
    fn default() -> Self {
        Self::new(ProcessorConfig::default())
    }
}
66
/// Map type stored behind [`EXTRACTION_CACHE`]: BLAKE3 digest -> extracted document.
#[cfg(feature = "extractous")]
type ExtractionCache = HashMap<blake3::Hash, ExtractedDocument>;

/// Process-wide, lazily initialised cache of extraction results.
///
/// Shared by every `DocumentProcessor`. Nothing in this file ever removes an
/// entry, so the map grows for the lifetime of the process.
#[cfg(feature = "extractous")]
static EXTRACTION_CACHE: OnceLock<Mutex<ExtractionCache>> = OnceLock::new();
70
#[cfg(feature = "extractous")]
impl DocumentProcessor {
    /// Creates a processor that keeps at most `config.max_text_chars` of text.
    pub fn new(config: ProcessorConfig) -> Self {
        // The extractor takes an `i32` limit, so larger values saturate at `i32::MAX`.
        let capped = i32::try_from(config.max_text_chars).unwrap_or(i32::MAX);
        let extractor = Extractor::new()
            .set_extract_string_max_length(capped)
            .set_xml_output(false);
        Self {
            extractor: Mutex::new(extractor),
            max_length: config.max_text_chars,
        }
    }

    /// Extracts text and metadata from the file at `path`.
    ///
    /// When the primary extractor yields blank or PDF-structure-like output the
    /// file is re-read and passed to the lopdf fallback. If that fallback cannot
    /// produce text, the suspicious output is discarded (`text` becomes `None`),
    /// matching [`Self::extract_from_bytes`]. Results are not cached.
    ///
    /// # Errors
    ///
    /// Returns `MemvidError::ExtractionFailed` when `path` is not valid UTF-8,
    /// the extractor mutex is poisoned, or the primary extractor fails and the
    /// lopdf fallback cannot recover any text.
    pub fn extract_from_path(&self, path: &Path) -> Result<ExtractedDocument> {
        let path_str = path.to_str().ok_or_else(|| MemvidError::ExtractionFailed {
            reason: "input path contains invalid UTF-8".into(),
        })?;

        let extraction = {
            let extractor = self.locked()?;
            // Silence `log` output while the extractor runs.
            let _log_guard = ScopedLogLevel::lowered(LevelFilter::Off);
            extractor.extract_file_to_string(path_str)
        };

        match extraction {
            Ok((mut content, metadata)) => {
                if needs_pdf_fallback(&content) {
                    let fallback = fs::read(path)
                        .map_err(|err| MemvidError::ExtractionFailed {
                            reason: format!("failed to re-read file for PDF fallback: {err}")
                                .into(),
                        })
                        .and_then(|bytes| pdf_text_fallback(&bytes));
                    content = Self::text_after_pdf_fallback(fallback);
                }
                Ok(self.into_document(content, metadata))
            }
            Err(err) => {
                let primary_reason = err.to_string();
                match fs::read(path) {
                    Ok(bytes) => self.recover_with_pdf_fallback(primary_reason, &bytes),
                    // The file cannot be re-read, so only the primary error is reported.
                    Err(_) => Err(MemvidError::ExtractionFailed {
                        reason: primary_reason.into(),
                    }),
                }
            }
        }
    }

    /// Extracts text and metadata from an in-memory document.
    ///
    /// Successful results are stored in the process-wide cache. The cache key
    /// covers both the input bytes and this processor's text limit, so
    /// processors with different limits never share truncated results.
    ///
    /// # Errors
    ///
    /// Returns `MemvidError::ExtractionFailed` when the extractor mutex is
    /// poisoned, or the primary extractor fails and the lopdf fallback cannot
    /// recover any text.
    pub fn extract_from_bytes(&self, bytes: &[u8]) -> Result<ExtractedDocument> {
        let hash = self.cache_key(bytes);
        if let Some(cached) = cache_lookup(&hash) {
            tracing::debug!(target: "memvid::extract", reader = "cache", "cache hit");
            return Ok(cached);
        }

        let extraction = {
            let extractor = self.locked()?;
            // Silence `log` output while the extractor runs.
            let _log_guard = ScopedLogLevel::lowered(LevelFilter::Off);
            extractor.extract_bytes_to_string(bytes)
        };

        let document = match extraction {
            Ok((mut content, metadata)) => {
                let pdf_needed = needs_pdf_fallback(&content);
                tracing::debug!(
                    target: "memvid::extract",
                    content_len = content.len(),
                    pdf_fallback_needed = pdf_needed,
                    "extractous returned content"
                );
                if pdf_needed {
                    content = Self::text_after_pdf_fallback(pdf_text_fallback(bytes));
                }
                self.into_document(content, metadata)
            }
            Err(err) => self.recover_with_pdf_fallback(err.to_string(), bytes)?,
        };

        cache_store(hash, &document);
        Ok(document)
    }

    /// Derives the cache key for `bytes` under this processor's text limit.
    ///
    /// Cached documents are already truncated to `max_length`, so the limit is
    /// part of the key. The limit is appended as a fixed-width suffix, which
    /// keeps distinct `(bytes, limit)` pairs from feeding the hasher the same
    /// stream.
    fn cache_key(&self, bytes: &[u8]) -> blake3::Hash {
        let mut hasher = blake3::Hasher::new();
        hasher.update(bytes);
        // Widening cast: `usize` is at most 64 bits on supported targets.
        hasher.update(&(self.max_length as u64).to_le_bytes());
        hasher.finalize()
    }

    /// Picks the text to keep once the primary output was judged blank or dump-like.
    ///
    /// A failed or empty fallback yields an empty string rather than the
    /// primary output, so raw PDF structure is never returned as text.
    fn text_after_pdf_fallback(fallback: Result<Option<String>>) -> String {
        match fallback {
            Ok(Some(fallback_text)) => {
                tracing::debug!(
                    target: "memvid::extract",
                    fallback_len = fallback_text.len(),
                    "lopdf fallback succeeded"
                );
                fallback_text
            }
            Ok(None) => {
                tracing::debug!(
                    target: "memvid::extract",
                    "lopdf fallback returned None"
                );
                String::new()
            }
            Err(e) => {
                tracing::debug!(
                    target: "memvid::extract",
                    error = %e,
                    "lopdf fallback failed"
                );
                String::new()
            }
        }
    }

    /// Tries the lopdf fallback after the primary extractor failed outright.
    ///
    /// Returns the fallback document when lopdf finds text. Otherwise reports
    /// the primary error, combined with the fallback error when there is one.
    fn recover_with_pdf_fallback(
        &self,
        primary_reason: String,
        bytes: &[u8],
    ) -> Result<ExtractedDocument> {
        match pdf_text_fallback(bytes) {
            Ok(Some(fallback_text)) => {
                Ok(self.into_document(fallback_text, pdf_fallback_metadata()))
            }
            Ok(None) => Err(MemvidError::ExtractionFailed {
                reason: primary_reason.into(),
            }),
            Err(fallback_err) => {
                let reason = format!(
                    "primary extractor error: {}; PDF fallback error: {}",
                    primary_reason, fallback_err
                );
                Err(MemvidError::ExtractionFailed {
                    reason: reason.into(),
                })
            }
        }
    }

    /// Locks the extractor, mapping a poisoned mutex to `ExtractionFailed`.
    fn locked(&self) -> Result<std::sync::MutexGuard<'_, Extractor>> {
        self.extractor
            .lock()
            .map_err(|_| MemvidError::ExtractionFailed {
                reason: "extractor mutex poisoned".into(),
            })
    }

    /// Assembles an [`ExtractedDocument`] from raw text and serialisable metadata.
    ///
    /// Blank text becomes `None`; longer text is cut at the boundary reported by
    /// `truncate_at_grapheme_boundary`. The MIME type is read from the
    /// `Content-Type` metadata entry. Metadata that fails to serialise is
    /// replaced by `Value::Null`.
    fn into_document<M>(&self, content: String, metadata: M) -> ExtractedDocument
    where
        M: serde::Serialize,
    {
        let metadata_value = serde_json::to_value(metadata).unwrap_or(Value::Null);
        let mime_type = metadata_value.get("Content-Type").and_then(value_to_mime);

        let text = if content.trim().is_empty() {
            tracing::debug!(
                target: "memvid::extract",
                "into_document: content is empty, returning text=None"
            );
            None
        } else {
            let final_text = if content.len() > self.max_length {
                let end = truncate_at_grapheme_boundary(&content, self.max_length);
                content[..end].to_string()
            } else {
                content
            };
            tracing::debug!(
                target: "memvid::extract",
                text_len = final_text.len(),
                starts_with_pdf = final_text.starts_with("%PDF"),
                "into_document: returning text"
            );
            Some(final_text)
        };

        ExtractedDocument {
            text,
            metadata: metadata_value,
            mime_type,
        }
    }
}
261
262#[cfg(not(feature = "extractous"))]
267#[derive(Debug)]
268pub struct DocumentProcessor {
269 max_length: usize,
270}
271
272#[cfg(not(feature = "extractous"))]
273impl Default for DocumentProcessor {
274 fn default() -> Self {
275 Self::new(Default::default())
276 }
277}
278
279#[cfg(not(feature = "extractous"))]
280impl DocumentProcessor {
281 pub fn new(config: ProcessorConfig) -> Self {
282 Self {
283 max_length: config.max_text_chars,
284 }
285 }
286
287 pub fn extract_from_path(&self, path: &Path) -> Result<ExtractedDocument> {
288 let bytes = fs::read(path).map_err(|e| MemvidError::ExtractionFailed {
290 reason: format!("failed to read file: {e}").into(),
291 })?;
292 self.extract_from_bytes(&bytes)
293 }
294
295 pub fn extract_from_bytes(&self, bytes: &[u8]) -> Result<ExtractedDocument> {
296 if is_probably_pdf_simple(bytes) {
298 match pdf_text_extract_best(bytes) {
299 Ok(Some((text, extractor))) => {
300 let truncate_len = truncate_at_grapheme_boundary(&text, self.max_length);
301 let truncated = &text[..truncate_len];
302 return Ok(ExtractedDocument {
303 text: Some(truncated.to_string()),
304 metadata: json!({
305 "Content-Type": "application/pdf",
306 "extraction": extractor,
307 }),
308 mime_type: Some("application/pdf".to_string()),
309 });
310 }
311 Ok(None) => {
312 return Ok(ExtractedDocument {
314 text: None,
315 metadata: json!({
316 "Content-Type": "application/pdf",
317 "extraction": "no_text",
318 }),
319 mime_type: Some("application/pdf".to_string()),
320 });
321 }
322 Err(e) => {
323 tracing::warn!(target: "memvid::extract", error = %e, "PDF extraction failed");
324 }
326 }
327 }
328
329 if let Ok(text) = std::str::from_utf8(bytes) {
332 let sample = &bytes[..bytes.len().min(8192)];
334 if !sample.contains(&0) {
335 let truncate_len = truncate_at_grapheme_boundary(text, self.max_length);
336 let truncated = &text[..truncate_len];
337 return Ok(ExtractedDocument {
338 text: Some(truncated.to_string()),
339 metadata: json!({}),
340 mime_type: Some("text/plain".to_string()),
341 });
342 }
343 }
344
345 Ok(ExtractedDocument {
349 text: None,
350 metadata: json!({}),
351 mime_type: Some("application/octet-stream".to_string()),
352 })
353 }
354}
355
/// Reports whether primary-extractor output should be replaced by the lopdf fallback.
///
/// True for blank output and for output that looks like a dump of raw PDF
/// object structure.
#[cfg(feature = "extractous")]
fn needs_pdf_fallback(content: &str) -> bool {
    content.trim().is_empty() || looks_like_pdf_structure_dump(content)
}
363
/// Metadata attached to documents whose text came from the lopdf fallback.
#[cfg(feature = "extractous")]
fn pdf_fallback_metadata() -> Value {
    const CONTENT_TYPE: &str = "application/pdf";
    const EXTRACTION: &str = "lopdf_fallback";
    json!({
        "Content-Type": CONTENT_TYPE,
        "extraction": EXTRACTION,
    })
}
371
/// Largest input, in bytes, that the lopdf fallback will try to load (64 MiB).
#[cfg(feature = "extractous")]
const PDF_FALLBACK_MAX_BYTES: usize = 64 * 1024 * 1024;

/// Highest page count the lopdf fallback will try to extract.
#[cfg(feature = "extractous")]
const PDF_FALLBACK_MAX_PAGES: usize = 4_096;
376
/// Extracts text from PDF `bytes` with lopdf, as a fallback for the primary extractor.
///
/// Returns `Ok(None)` when the input does not look like a PDF, has no pages,
/// or yields only whitespace.
///
/// # Errors
///
/// Returns `MemvidError::ExtractionFailed` when the input exceeds
/// `PDF_FALLBACK_MAX_BYTES` or `PDF_FALLBACK_MAX_PAGES`, cannot be loaded,
/// is encrypted with a non-empty password, or text extraction fails.
#[cfg(feature = "extractous")]
fn pdf_text_fallback(bytes: &[u8]) -> Result<Option<String>> {
    if !is_probably_pdf(bytes) {
        return Ok(None);
    }

    let size = bytes.len();
    if size > PDF_FALLBACK_MAX_BYTES {
        let reason = format!(
            "pdf fallback aborted: {} bytes exceeds limit of {} bytes",
            size, PDF_FALLBACK_MAX_BYTES
        );
        return Err(MemvidError::ExtractionFailed {
            reason: reason.into(),
        });
    }

    // Silence `log` output for the rest of this function.
    let _log_guard = ScopedLogLevel::lowered(LevelFilter::Off);
    let mut document = match LopdfDocument::load_mem(bytes) {
        Ok(loaded) => loaded,
        Err(err) => {
            return Err(MemvidError::ExtractionFailed {
                reason: format!("pdf fallback failed to load document: {err}").into(),
            });
        }
    };

    // Only the empty password is tried on encrypted files.
    if document.is_encrypted() && document.decrypt("").is_err() {
        return Err(MemvidError::ExtractionFailed {
            reason: "pdf fallback cannot decrypt password-protected file".into(),
        });
    }

    // Best effort: the result of decompression is deliberately ignored.
    let _ = document.decompress();

    let mut page_numbers: Vec<u32> = document.get_pages().keys().copied().collect();
    let page_count = page_numbers.len();
    if page_count == 0 {
        return Ok(None);
    }
    page_numbers.sort_unstable();

    if page_count > PDF_FALLBACK_MAX_PAGES {
        let reason = format!(
            "pdf fallback aborted: page count {} exceeds limit of {}",
            page_count, PDF_FALLBACK_MAX_PAGES
        );
        return Err(MemvidError::ExtractionFailed {
            reason: reason.into(),
        });
    }

    let text = document
        .extract_text(&page_numbers)
        .map_err(|err| MemvidError::ExtractionFailed {
            reason: format!("pdf fallback failed to extract text: {err}").into(),
        })?;
    let trimmed = text.trim();
    Ok((!trimmed.is_empty()).then(|| trimmed.to_string()))
}
441
/// Guard that lowers the `log` crate's max level and restores it when dropped.
///
/// NOTE(review): the level is read and written without synchronisation, so
/// overlapping guards on different threads may restore a stale level; confirm
/// whether that matters to callers.
#[cfg(feature = "extractous")]
struct ScopedLogLevel {
    /// Level that was active when the guard was created.
    previous: LevelFilter,
    /// Whether this guard changed the level and therefore must restore it.
    changed: bool,
}

#[cfg(feature = "extractous")]
impl ScopedLogLevel {
    /// Lowers the max level to `level` only if it is stricter than the current one.
    fn lowered(level: LevelFilter) -> Self {
        let previous = log::max_level();
        let changed = level < previous;
        if changed {
            log::set_max_level(level);
        }
        Self { previous, changed }
    }
}

#[cfg(feature = "extractous")]
impl Drop for ScopedLogLevel {
    /// Restores the previous level when this guard was the one that lowered it.
    fn drop(&mut self) {
        if !self.changed {
            return;
        }
        log::set_max_level(self.previous);
    }
}
475
476#[cfg(feature = "extractous")]
/// Reports whether `bytes` begin with the `%PDF` magic.
///
/// An optional UTF-8 BOM at the very start is skipped, followed by any run of
/// NUL bytes and ASCII whitespace, before the magic is checked.
fn is_probably_pdf(bytes: &[u8]) -> bool {
    const UTF8_BOM: &[u8] = &[0xEF, 0xBB, 0xBF];

    let body = bytes.strip_prefix(UTF8_BOM).unwrap_or(bytes);
    let padding = body
        .iter()
        .take_while(|byte| **byte == 0 || byte.is_ascii_whitespace())
        .count();
    body[padding..].starts_with(b"%PDF")
}
494
/// Heuristically detects text that is really a dump of raw PDF object structure.
///
/// Only the leading part of `content` (up to 8192 bytes) is inspected. Input
/// shorter than 256 bytes is never flagged. The sample must contain `endobj`
/// at least twice plus one more PDF structural marker.
#[cfg(feature = "extractous")]
fn looks_like_pdf_structure_dump(content: &str) -> bool {
    const STRUCTURE_MARKERS: [&str; 5] =
        [" 0 obj", "\n0 obj", "\r0 obj", "endstream", "/Type /Page"];

    if content.len() < 256 {
        return false;
    }

    let window = content.len().min(8_192);
    let sample = &content[..truncate_at_grapheme_boundary(content, window)];

    // A second `endobj` must exist before any other marker is considered.
    if sample.matches("endobj").nth(1).is_none() {
        return false;
    }

    STRUCTURE_MARKERS
        .iter()
        .any(|marker| sample.contains(*marker))
}
514
/// Reads a MIME type from a metadata value.
///
/// Accepts either a JSON string or an array, in which case the first string
/// element is used. Anything else yields `None`.
#[cfg(feature = "extractous")]
fn value_to_mime(value: &Value) -> Option<String> {
    match value {
        Value::String(mime) => Some(mime.clone()),
        Value::Array(entries) => entries
            .iter()
            .find_map(|entry| entry.as_str())
            .map(str::to_owned),
        _ => None,
    }
}
529
/// Returns a clone of the cached document for `hash`, if any.
///
/// A poisoned cache mutex is treated as a miss.
#[cfg(feature = "extractous")]
fn cache_lookup(hash: &blake3::Hash) -> Option<ExtractedDocument> {
    let entries = EXTRACTION_CACHE
        .get_or_init(|| Mutex::new(HashMap::new()))
        .lock()
        .ok()?;
    entries.get(hash).cloned()
}
535
/// Stores a clone of `document` under `hash`, replacing any existing entry.
///
/// Best effort: if the cache mutex is poisoned, nothing is stored.
#[cfg(feature = "extractous")]
fn cache_store(hash: blake3::Hash, document: &ExtractedDocument) {
    let Ok(mut entries) = EXTRACTION_CACHE
        .get_or_init(|| Mutex::new(HashMap::new()))
        .lock()
    else {
        return;
    };
    entries.insert(hash, document.clone());
}
543
/// Largest PDF, in bytes, that `pdf_text_extract_lopdf` will try to load (64 MiB).
// Only referenced by `pdf_text_extract_lopdf`, which is itself marked `allow(dead_code)`.
#[allow(dead_code)]
const PDF_LOPDF_MAX_BYTES: usize = 64 * 1024 * 1024;

/// Highest page count `pdf_text_extract_lopdf` will try to extract.
// Only referenced by `pdf_text_extract_lopdf`, which is itself marked `allow(dead_code)`.
#[allow(dead_code)]
const PDF_LOPDF_MAX_PAGES: usize = 4_096;
553
554#[allow(dead_code)]
557fn pdf_text_extract_best(bytes: &[u8]) -> Result<Option<(String, &'static str)>> {
558 let mut best_text: Option<String> = None;
559 let mut best_source: &'static str = "";
560
561 let _min_good_chars = (bytes.len() / 100).clamp(500, 5000);
564
565 #[cfg(feature = "pdf_extract")]
567 {
568 match pdf_extract::extract_text_from_mem(bytes) {
569 Ok(text) => {
570 let trimmed = text.trim();
571 if !trimmed.is_empty() {
572 if trimmed.len() >= min_good_chars {
573 tracing::debug!(
575 target: "memvid::extract",
576 len = trimmed.len(),
577 "pdf_extract succeeded with good result"
578 );
579 return Ok(Some((trimmed.to_string(), "pdf_extract")));
580 }
581 tracing::debug!(
583 target: "memvid::extract",
584 len = trimmed.len(),
585 min_good = min_good_chars,
586 "pdf_extract returned partial result, trying lopdf"
587 );
588 best_text = Some(trimmed.to_string());
589 best_source = "pdf_extract";
590 }
591 }
592 Err(e) => {
593 tracing::debug!(target: "memvid::extract", error = %e, "pdf_extract failed");
594 }
595 }
596 }
597
598 match pdf_text_extract_lopdf(bytes) {
600 Ok(Some(text)) => {
601 let trimmed = text.trim();
602 if !trimmed.is_empty() {
603 if best_text.as_ref().map_or(true, |prev| trimmed.len() > prev.len()) {
605 tracing::debug!(
606 target: "memvid::extract",
607 len = trimmed.len(),
608 "lopdf extracted more text"
609 );
610 best_text = Some(trimmed.to_string());
611 best_source = "lopdf";
612 }
613 }
614 }
615 Ok(None) => {
616 tracing::debug!(target: "memvid::extract", "lopdf returned no text");
617 }
618 Err(e) => {
619 tracing::debug!(target: "memvid::extract", error = %e, "lopdf failed");
620 }
621 }
622
623 Ok(best_text.map(|t| (t, best_source)))
624}
625
/// Reports whether `bytes` begin with the `%PDF` magic.
///
/// An optional UTF-8 BOM at the very start is skipped, followed by any run of
/// NUL bytes and ASCII whitespace, before the magic is checked.
// Only called from the processor built without `extractous`.
#[allow(dead_code)]
fn is_probably_pdf_simple(bytes: &[u8]) -> bool {
    let after_bom = match bytes {
        [0xEF, 0xBB, 0xBF, rest @ ..] => rest,
        other => other,
    };
    let magic_at = after_bom
        .iter()
        .position(|&byte| byte != 0 && !byte.is_ascii_whitespace())
        .unwrap_or(after_bom.len());
    after_bom[magic_at..].starts_with(b"%PDF")
}
647
648#[allow(dead_code)]
650fn pdf_text_extract_lopdf(bytes: &[u8]) -> Result<Option<String>> {
651 if bytes.len() > PDF_LOPDF_MAX_BYTES {
652 return Err(MemvidError::ExtractionFailed {
653 reason: format!(
654 "PDF too large: {} bytes exceeds limit of {} bytes",
655 bytes.len(),
656 PDF_LOPDF_MAX_BYTES
657 )
658 .into(),
659 });
660 }
661
662 let mut document = LopdfDocument::load_mem(bytes).map_err(|err| MemvidError::ExtractionFailed {
663 reason: format!("failed to load PDF: {err}").into(),
664 })?;
665
666 if document.is_encrypted() {
668 if document.decrypt("").is_err() {
669 return Err(MemvidError::ExtractionFailed {
670 reason: "cannot decrypt password-protected PDF".into(),
671 });
672 }
673 }
674
675 let _ = document.decompress();
677
678 let mut page_numbers: Vec<u32> = document.get_pages().keys().copied().collect();
679 if page_numbers.is_empty() {
680 return Ok(None);
681 }
682 page_numbers.sort_unstable();
683
684 if page_numbers.len() > PDF_LOPDF_MAX_PAGES {
685 return Err(MemvidError::ExtractionFailed {
686 reason: format!(
687 "PDF has too many pages: {} exceeds limit of {}",
688 page_numbers.len(),
689 PDF_LOPDF_MAX_PAGES
690 )
691 .into(),
692 });
693 }
694
695 match document.extract_text(&page_numbers) {
696 Ok(text) => {
697 let trimmed = text.trim();
698 if trimmed.is_empty() {
699 Ok(None)
700 } else {
701 Ok(Some(trimmed.to_string()))
702 }
703 }
704 Err(err) => Err(MemvidError::ExtractionFailed {
705 reason: format!("failed to extract text from PDF: {err}").into(),
706 }),
707 }
708}
709
#[cfg(all(test, feature = "extractous"))]
mod tests {
    use super::*;

    /// Text packed with PDF object markers is flagged as a structure dump.
    #[test]
    fn detects_pdf_like_dump() {
        let dump =
            "binary %PDF snippet endobj endobj endstream 0 obj /Type /Page endobj ".repeat(12);
        assert!(looks_like_pdf_structure_dump(&dump));
    }

    /// Ordinary prose is not mistaken for a structure dump.
    #[test]
    fn skips_normal_text() {
        let prose =
            "This is a perfectly normal paragraph that should not trigger the PDF fallback.";
        assert!(!looks_like_pdf_structure_dump(prose));
    }

    /// The `%PDF` magic is found with or without leading whitespace, and HTML is rejected.
    #[test]
    fn identifies_pdf_magic() {
        let plain = b"%PDF-1.7 some data";
        let padded = b"\n\n%PDF-1.5";
        let html = b"<!doctype html>";

        assert!(is_probably_pdf(plain));
        assert!(is_probably_pdf(padded));
        assert!(!is_probably_pdf(html));
    }
}
737
#[cfg(all(test, feature = "extractous"))]
mod pdf_fix_tests {
    use super::*;

    /// Exercises the two detectors that keep raw PDF structure from being
    /// returned as extracted text: the magic-byte check and the dump heuristic.
    #[test]
    fn test_pdf_structure_dump_detection_prevents_raw_indexing() {
        // Raw PDF bytes are recognised by their magic.
        let raw_pdf = b"%PDF-1.4\n%\xff\xff\xff\xff\n1 0 obj\n<</Type/Catalog>>\nendobj\n";
        assert!(is_probably_pdf(raw_pdf));

        // Text full of object markers is flagged as a dump.
        let dump_like =
            "binary %PDF snippet endobj endobj endstream 0 obj /Type /Page endobj ".repeat(12);
        assert!(looks_like_pdf_structure_dump(&dump_like));

        // Ordinary extracted prose passes through.
        let prose = "This is perfectly normal extracted text from a document.";
        assert!(!looks_like_pdf_structure_dump(prose));
    }
}