1use std::fs;
2use std::path::Path;
3
4use crate::{Result, error::MemvidError, text::truncate_at_grapheme_boundary};
5#[cfg(feature = "symspell_cleanup")]
7use crate::symspell_cleanup::fix_pdf_text as fix_pdf_spacing;
8#[cfg(not(feature = "symspell_cleanup"))]
9use crate::text::fix_pdf_spacing;
10
11#[cfg(feature = "extractous")]
12use log::LevelFilter;
13use lopdf::Document as LopdfDocument;
14use serde_json::{Value, json};
15
16#[cfg(feature = "extractous")]
17use extractous::Extractor;
18#[cfg(feature = "extractous")]
19use std::collections::HashMap;
20#[cfg(feature = "extractous")]
21use std::sync::{Mutex, OnceLock};
22
23#[derive(Debug, Clone)]
26pub struct ExtractedDocument {
27 pub text: Option<String>,
28 pub metadata: Value,
29 pub mime_type: Option<String>,
30}
31
32impl ExtractedDocument {
33 pub fn empty() -> Self {
34 Self {
35 text: None,
36 metadata: Value::Null,
37 mime_type: None,
38 }
39 }
40}
41
/// Tunables for a `DocumentProcessor`.
#[derive(Debug, Clone, Copy)]
pub struct ProcessorConfig {
    /// Upper bound on the length of the extracted text that is kept.
    pub max_text_chars: usize,
}

impl Default for ProcessorConfig {
    /// Default configuration: keep at most 2,000,000 units of extracted text.
    fn default() -> Self {
        let max_text_chars = 2_000_000;
        Self { max_text_chars }
    }
}
54
/// Document processor backed by the `extractous` extractor.
///
/// The extractor is kept behind a mutex and locked for each extraction call.
#[cfg(feature = "extractous")]
#[derive(Debug)]
pub struct DocumentProcessor {
    /// Shared extractor instance, locked for the duration of each extraction.
    extractor: Mutex<Extractor>,
    /// Limit applied to the final text in `into_document`.
    max_length: usize,
}

#[cfg(feature = "extractous")]
impl Default for DocumentProcessor {
    /// Builds a processor from the default [`ProcessorConfig`].
    fn default() -> Self {
        let config = ProcessorConfig::default();
        Self::new(config)
    }
}
72
/// Map type stored in [`EXTRACTION_CACHE`]: input hash to the document extracted from it.
#[cfg(feature = "extractous")]
type ExtractionCache = Mutex<HashMap<blake3::Hash, ExtractedDocument>>;

/// Process-wide cache of extraction results, created lazily on first use
/// by `cache_lookup` / `cache_store`.
#[cfg(feature = "extractous")]
static EXTRACTION_CACHE: OnceLock<ExtractionCache> = OnceLock::new();
76
#[cfg(feature = "extractous")]
impl DocumentProcessor {
    /// Creates a processor whose extractor output is capped at `config.max_text_chars`.
    ///
    /// The extractor takes its limit as an `i32`, so larger values are clamped to
    /// `i32::MAX` for the extractor; the unclamped value is still used for the final
    /// truncation in `into_document`.
    pub fn new(config: ProcessorConfig) -> Self {
        let capped = i32::try_from(config.max_text_chars).unwrap_or(i32::MAX);
        let extractor = Extractor::new()
            .set_extract_string_max_length(capped)
            .set_xml_output(false);
        Self {
            extractor: Mutex::new(extractor),
            max_length: config.max_text_chars,
        }
    }

    /// Extracts text and metadata from the file at `path`.
    ///
    /// When the primary extractor returns blank text or what looks like a raw PDF
    /// structure dump, the lopdf fallback is tried on the file's bytes. If that
    /// fallback yields nothing, the text is cleared rather than kept as a dump,
    /// matching [`Self::extract_from_bytes`].
    ///
    /// # Errors
    ///
    /// Returns `MemvidError::ExtractionFailed` when the path is not valid UTF-8, the
    /// extractor mutex is poisoned, or the primary extractor fails and the PDF
    /// fallback cannot recover any text.
    pub fn extract_from_path(&self, path: &Path) -> Result<ExtractedDocument> {
        let path_str = path.to_str().ok_or_else(|| MemvidError::ExtractionFailed {
            reason: "input path contains invalid UTF-8".into(),
        })?;

        let extraction = {
            let extractor = self.locked()?;
            // Silence `log` output for the duration of the extractor call.
            let _log_guard = ScopedLogLevel::lowered(LevelFilter::Off);
            extractor.extract_file_to_string(path_str)
        };

        match extraction {
            Ok((mut content, metadata)) => {
                if needs_pdf_fallback(&content) {
                    content = match fs::read(path) {
                        Ok(bytes) => Self::pdf_fallback_text_or_empty(&bytes),
                        Err(err) => {
                            tracing::debug!(
                                target: "memvid::extract",
                                error = %err,
                                "could not re-read file for lopdf fallback"
                            );
                            String::new()
                        }
                    };
                }
                Ok(self.into_document(content, metadata))
            }
            Err(err) => {
                let primary_reason = err.to_string();
                match fs::read(path) {
                    Ok(bytes) => self.recover_with_pdf_fallback(primary_reason, &bytes),
                    Err(_) => Err(MemvidError::ExtractionFailed {
                        reason: primary_reason.into(),
                    }),
                }
            }
        }
    }

    /// Extracts text and metadata from an in-memory document.
    ///
    /// Results are cached process-wide under the blake3 hash of `bytes`; a cache hit
    /// returns the stored document without running the extractor.
    ///
    /// NOTE(review): the cache key does not include `max_length`, so a hit may return
    /// a document truncated under a different processor's limit — confirm this is
    /// acceptable for callers that use several configurations.
    ///
    /// # Errors
    ///
    /// Returns `MemvidError::ExtractionFailed` when the extractor mutex is poisoned,
    /// or when the primary extractor fails and the PDF fallback cannot recover text.
    pub fn extract_from_bytes(&self, bytes: &[u8]) -> Result<ExtractedDocument> {
        let hash = blake3::hash(bytes);
        if let Some(cached) = cache_lookup(&hash) {
            tracing::debug!(target: "memvid::extract", reader = "cache", "cache hit");
            return Ok(cached);
        }

        let extraction = {
            let extractor = self.locked()?;
            // Silence `log` output for the duration of the extractor call.
            let _log_guard = ScopedLogLevel::lowered(LevelFilter::Off);
            extractor.extract_bytes_to_string(bytes)
        };

        let document = match extraction {
            Ok((mut content, metadata)) => {
                let pdf_needed = needs_pdf_fallback(&content);
                tracing::debug!(
                    target: "memvid::extract",
                    content_len = content.len(),
                    pdf_fallback_needed = pdf_needed,
                    "extractous returned content"
                );
                if pdf_needed {
                    content = Self::pdf_fallback_text_or_empty(bytes);
                }
                self.into_document(content, metadata)
            }
            Err(err) => self.recover_with_pdf_fallback(err.to_string(), bytes)?,
        };

        cache_store(hash, &document);
        Ok(document)
    }

    /// Runs the lopdf fallback and returns its text, or an empty string when it
    /// produces nothing or fails, so a raw structure dump is never kept as text.
    fn pdf_fallback_text_or_empty(bytes: &[u8]) -> String {
        match pdf_text_fallback(bytes) {
            Ok(Some(fallback_text)) => {
                tracing::debug!(
                    target: "memvid::extract",
                    fallback_len = fallback_text.len(),
                    "lopdf fallback succeeded"
                );
                fallback_text
            }
            Ok(None) => {
                tracing::debug!(
                    target: "memvid::extract",
                    "lopdf fallback returned None"
                );
                String::new()
            }
            Err(e) => {
                tracing::debug!(
                    target: "memvid::extract",
                    error = %e,
                    "lopdf fallback failed"
                );
                String::new()
            }
        }
    }

    /// Tries the lopdf fallback after the primary extractor failed with `primary_reason`.
    ///
    /// Returns the fallback document on success; otherwise an `ExtractionFailed` error
    /// carrying the primary reason, combined with the fallback error when there is one.
    fn recover_with_pdf_fallback(
        &self,
        primary_reason: String,
        bytes: &[u8],
    ) -> Result<ExtractedDocument> {
        match pdf_text_fallback(bytes) {
            Ok(Some(fallback_text)) => {
                Ok(self.into_document(fallback_text, pdf_fallback_metadata()))
            }
            Ok(None) => Err(MemvidError::ExtractionFailed {
                reason: primary_reason.into(),
            }),
            Err(fallback_err) => {
                let reason = format!(
                    "primary extractor error: {}; PDF fallback error: {}",
                    primary_reason, fallback_err
                );
                Err(MemvidError::ExtractionFailed {
                    reason: reason.into(),
                })
            }
        }
    }

    /// Locks the shared extractor, mapping a poisoned mutex to `ExtractionFailed`.
    fn locked(&self) -> Result<std::sync::MutexGuard<'_, Extractor>> {
        self.extractor
            .lock()
            .map_err(|_| MemvidError::ExtractionFailed {
                reason: "extractor mutex poisoned".into(),
            })
    }

    /// Builds an [`ExtractedDocument`] from raw extractor output.
    ///
    /// Metadata that cannot be serialized becomes `Value::Null`. The MIME type is read
    /// from the metadata's `Content-Type` entry. Blank content yields `text: None`;
    /// content longer than `max_length` bytes is cut at the boundary reported by
    /// `truncate_at_grapheme_boundary`.
    fn into_document<M>(&self, mut content: String, metadata: M) -> ExtractedDocument
    where
        M: serde::Serialize,
    {
        let metadata_value = serde_json::to_value(metadata).unwrap_or(Value::Null);
        let mime_type = metadata_value.get("Content-Type").and_then(value_to_mime);

        let text = if content.trim().is_empty() {
            tracing::debug!(
                target: "memvid::extract",
                "into_document: content is empty, returning text=None"
            );
            None
        } else {
            if content.len() > self.max_length {
                let end = truncate_at_grapheme_boundary(&content, self.max_length);
                // Shorten in place instead of copying the retained prefix.
                content.truncate(end);
            }
            tracing::debug!(
                target: "memvid::extract",
                text_len = content.len(),
                starts_with_pdf = content.starts_with("%PDF"),
                "into_document: returning text"
            );
            Some(content)
        };

        ExtractedDocument {
            text,
            metadata: metadata_value,
            mime_type,
        }
    }
}
267
268#[cfg(not(feature = "extractous"))]
273#[derive(Debug)]
274pub struct DocumentProcessor {
275 max_length: usize,
276}
277
278#[cfg(not(feature = "extractous"))]
279impl Default for DocumentProcessor {
280 fn default() -> Self {
281 Self::new(Default::default())
282 }
283}
284
285#[cfg(not(feature = "extractous"))]
286impl DocumentProcessor {
287 pub fn new(config: ProcessorConfig) -> Self {
288 Self {
289 max_length: config.max_text_chars,
290 }
291 }
292
293 pub fn extract_from_path(&self, path: &Path) -> Result<ExtractedDocument> {
294 let bytes = fs::read(path).map_err(|e| MemvidError::ExtractionFailed {
296 reason: format!("failed to read file: {e}").into(),
297 })?;
298 self.extract_from_bytes(&bytes)
299 }
300
301 pub fn extract_from_bytes(&self, bytes: &[u8]) -> Result<ExtractedDocument> {
302 if is_probably_pdf_simple(bytes) {
304 match pdf_text_extract_best(bytes) {
305 Ok(Some((text, extractor))) => {
306 let truncate_len = truncate_at_grapheme_boundary(&text, self.max_length);
307 let truncated = &text[..truncate_len];
308 return Ok(ExtractedDocument {
309 text: Some(truncated.to_string()),
310 metadata: json!({
311 "Content-Type": "application/pdf",
312 "extraction": extractor,
313 }),
314 mime_type: Some("application/pdf".to_string()),
315 });
316 }
317 Ok(None) => {
318 return Ok(ExtractedDocument {
320 text: None,
321 metadata: json!({
322 "Content-Type": "application/pdf",
323 "extraction": "no_text",
324 }),
325 mime_type: Some("application/pdf".to_string()),
326 });
327 }
328 Err(e) => {
329 tracing::warn!(target: "memvid::extract", error = %e, "PDF extraction failed");
330 }
332 }
333 }
334
335 if let Ok(text) = std::str::from_utf8(bytes) {
338 let sample = &bytes[..bytes.len().min(8192)];
340 if !sample.contains(&0) {
341 let truncate_len = truncate_at_grapheme_boundary(text, self.max_length);
342 let truncated = &text[..truncate_len];
343 return Ok(ExtractedDocument {
344 text: Some(truncated.to_string()),
345 metadata: json!({}),
346 mime_type: Some("text/plain".to_string()),
347 });
348 }
349 }
350
351 Ok(ExtractedDocument {
355 text: None,
356 metadata: json!({}),
357 mime_type: Some("application/octet-stream".to_string()),
358 })
359 }
360}
361
/// Returns `true` when `content` is blank or looks like a raw PDF structure dump,
/// i.e. when the lopdf fallback should be tried.
#[cfg(feature = "extractous")]
fn needs_pdf_fallback(content: &str) -> bool {
    let is_blank = content.trim().is_empty();
    is_blank || looks_like_pdf_structure_dump(content)
}
369
/// Metadata attached to documents whose text came from the lopdf fallback.
#[cfg(feature = "extractous")]
fn pdf_fallback_metadata() -> Value {
    let content_type = "application/pdf";
    let extraction = "lopdf_fallback";
    json!({
        "Content-Type": content_type,
        "extraction": extraction,
    })
}
377
/// Largest input, in bytes, that `pdf_text_fallback` will try to parse (64 MiB).
#[cfg(feature = "extractous")]
const PDF_FALLBACK_MAX_BYTES: usize = 64 << 20;

/// Largest page count that `pdf_text_fallback` will try to extract.
#[cfg(feature = "extractous")]
const PDF_FALLBACK_MAX_PAGES: usize = 4_096;
382
/// Extracts text from PDF bytes with lopdf, for use when the primary extractor
/// produced nothing usable.
///
/// Returns `Ok(None)` when the input does not look like a PDF, has no pages, or
/// yields only whitespace.
///
/// # Errors
///
/// Returns `MemvidError::ExtractionFailed` when the input exceeds the byte or page
/// limits, cannot be loaded, cannot be decrypted with an empty password, or text
/// extraction fails.
#[cfg(feature = "extractous")]
fn pdf_text_fallback(bytes: &[u8]) -> Result<Option<String>> {
    // Wraps a formatted message in the module's error variant.
    let failure = |reason: String| MemvidError::ExtractionFailed {
        reason: reason.into(),
    };

    if !is_probably_pdf(bytes) {
        return Ok(None);
    }

    let size = bytes.len();
    if size > PDF_FALLBACK_MAX_BYTES {
        return Err(failure(format!(
            "pdf fallback aborted: {} bytes exceeds limit of {} bytes",
            size, PDF_FALLBACK_MAX_BYTES
        )));
    }

    // Silence `log` output while lopdf works on the document.
    let _log_guard = ScopedLogLevel::lowered(LevelFilter::Off);
    let mut document = LopdfDocument::load_mem(bytes)
        .map_err(|err| failure(format!("pdf fallback failed to load document: {err}")))?;

    // Only an empty password is attempted on encrypted files.
    if document.is_encrypted() && document.decrypt("").is_err() {
        return Err(MemvidError::ExtractionFailed {
            reason: "pdf fallback cannot decrypt password-protected file".into(),
        });
    }

    // Best effort: the outcome of decompression is deliberately ignored.
    let _ = document.decompress();

    let mut page_numbers: Vec<u32> = document.get_pages().keys().copied().collect();
    if page_numbers.is_empty() {
        return Ok(None);
    }
    page_numbers.sort_unstable();

    let page_count = page_numbers.len();
    if page_count > PDF_FALLBACK_MAX_PAGES {
        return Err(failure(format!(
            "pdf fallback aborted: page count {} exceeds limit of {}",
            page_count, PDF_FALLBACK_MAX_PAGES
        )));
    }

    let text = document
        .extract_text(&page_numbers)
        .map_err(|err| failure(format!("pdf fallback failed to extract text: {err}")))?;
    let trimmed = text.trim();
    Ok((!trimmed.is_empty()).then(|| fix_pdf_spacing(trimmed)))
}
448
/// Guard that lowers the global `log` max level and restores it when dropped.
#[cfg(feature = "extractous")]
struct ScopedLogLevel {
    /// Max level that was in effect when the guard was created.
    previous: LevelFilter,
    /// Whether this guard actually changed the level and so must restore it.
    changed: bool,
}

#[cfg(feature = "extractous")]
impl ScopedLogLevel {
    /// Lowers the global max level to `level` if that is stricter than the current
    /// one; otherwise leaves the level untouched.
    fn lowered(level: LevelFilter) -> Self {
        let previous = log::max_level();
        let changed = level < previous;
        if changed {
            log::set_max_level(level);
        }
        Self { previous, changed }
    }
}

#[cfg(feature = "extractous")]
impl Drop for ScopedLogLevel {
    /// Restores the previous max level, but only if this guard changed it.
    fn drop(&mut self) {
        if !self.changed {
            return;
        }
        log::set_max_level(self.previous);
    }
}
482
/// Returns `true` when `bytes` starts with the `%PDF` magic, allowing for an
/// optional UTF-8 BOM followed by any run of NUL or ASCII-whitespace bytes.
#[cfg(feature = "extractous")]
fn is_probably_pdf(bytes: &[u8]) -> bool {
    const UTF8_BOM: [u8; 3] = [0xEF, 0xBB, 0xBF];

    let body = bytes.strip_prefix(&UTF8_BOM).unwrap_or(bytes);
    // Index of the first byte that is neither NUL nor ASCII whitespace.
    let first_significant = body
        .iter()
        .position(|byte| *byte != 0 && !byte.is_ascii_whitespace());

    match first_significant {
        Some(start) => body[start..].starts_with(b"%PDF"),
        None => false,
    }
}
501
/// Heuristic: does `content` look like raw PDF object syntax rather than text?
///
/// Only inputs of at least 256 bytes are considered, and only their first 8,192
/// bytes (cut back by `truncate_at_grapheme_boundary`) are sampled. The sample must
/// contain `endobj` at least twice plus one more PDF structure marker.
#[cfg(feature = "extractous")]
fn looks_like_pdf_structure_dump(content: &str) -> bool {
    const MIN_CONTENT_LEN: usize = 256;
    const SAMPLE_LIMIT: usize = 8_192;
    const STRUCTURE_MARKERS: [&str; 5] =
        [" 0 obj", "\n0 obj", "\r0 obj", "endstream", "/Type /Page"];

    if content.len() < MIN_CONTENT_LEN {
        return false;
    }

    let cut = truncate_at_grapheme_boundary(content, content.len().min(SAMPLE_LIMIT));
    let sample = &content[..cut];

    // A second `endobj` occurrence is required.
    if sample.matches("endobj").nth(1).is_none() {
        return false;
    }

    STRUCTURE_MARKERS
        .iter()
        .any(|marker| sample.contains(*marker))
}
521
/// Reads a MIME type from a metadata value: the string itself, or the first string
/// entry when the value is an array. Any other shape yields `None`.
#[cfg(feature = "extractous")]
fn value_to_mime(value: &Value) -> Option<String> {
    match value {
        Value::String(mime) => Some(mime.clone()),
        Value::Array(entries) => entries
            .iter()
            .find_map(|entry| entry.as_str().map(str::to_owned)),
        _ => None,
    }
}
536
/// Returns a clone of the cached document for `hash`, if any.
///
/// A poisoned cache mutex is treated as a miss.
#[cfg(feature = "extractous")]
fn cache_lookup(hash: &blake3::Hash) -> Option<ExtractedDocument> {
    let entries = EXTRACTION_CACHE
        .get_or_init(|| Mutex::new(HashMap::new()))
        .lock()
        .ok()?;
    entries.get(hash).cloned()
}
542
/// Stores a clone of `document` in the cache under `hash`.
///
/// Nothing is stored when the cache mutex is poisoned.
#[cfg(feature = "extractous")]
fn cache_store(hash: blake3::Hash, document: &ExtractedDocument) {
    let Ok(mut entries) = EXTRACTION_CACHE
        .get_or_init(|| Mutex::new(HashMap::new()))
        .lock()
    else {
        return;
    };
    entries.insert(hash, document.clone());
}
550
/// Largest input, in bytes, that `pdf_text_extract_lopdf` will try to parse (64 MiB).
#[allow(dead_code)]
const PDF_LOPDF_MAX_BYTES: usize = 64 << 20;

/// Largest page count that `pdf_text_extract_lopdf` will try to extract.
#[allow(dead_code)]
const PDF_LOPDF_MAX_PAGES: usize = 4_096;
560
561#[allow(dead_code)]
565fn pdf_text_extract_best(bytes: &[u8]) -> Result<Option<(String, &'static str)>> {
566 let mut best_text: Option<String> = None;
567 let mut best_source: &'static str = "";
568
569 #[cfg(any(feature = "pdf_oxide", feature = "pdf_extract"))]
571 let min_good_chars = (bytes.len() / 100).clamp(500, 5000);
572
573 #[cfg(feature = "pdf_oxide")]
576 {
577 let bytes_clone = bytes.to_vec();
578 let oxide_result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
579 pdf_text_extract_oxide(&bytes_clone)
580 }));
581
582 match oxide_result {
583 Ok(Ok(Some(text))) => {
584 let trimmed = text.trim();
585 if !trimmed.is_empty() {
586 if trimmed.len() >= min_good_chars {
587 tracing::debug!(
588 target: "memvid::extract",
589 len = trimmed.len(),
590 "pdf_oxide succeeded with good result"
591 );
592 return Ok(Some((trimmed.to_string(), "pdf_oxide")));
593 }
594 tracing::debug!(
595 target: "memvid::extract",
596 len = trimmed.len(),
597 min_good = min_good_chars,
598 "pdf_oxide returned partial result, trying fallbacks"
599 );
600 best_text = Some(trimmed.to_string());
601 best_source = "pdf_oxide";
602 }
603 }
604 Ok(Ok(None)) => {
605 tracing::debug!(target: "memvid::extract", "pdf_oxide returned no text");
606 }
607 Ok(Err(e)) => {
608 tracing::debug!(target: "memvid::extract", error = %e, "pdf_oxide failed");
609 }
610 Err(_) => {
611 tracing::warn!(target: "memvid::extract", "pdf_oxide panicked (likely font parsing issue), falling back to other extractors");
612 }
613 }
614 }
615
616 #[cfg(feature = "pdf_extract")]
618 {
619 match pdf_extract::extract_text_from_mem(bytes) {
620 Ok(text) => {
621 let trimmed = text.trim();
622 if !trimmed.is_empty() {
623 if best_text.is_none() && trimmed.len() >= min_good_chars {
624 tracing::debug!(
625 target: "memvid::extract",
626 len = trimmed.len(),
627 "pdf_extract succeeded with good result"
628 );
629 return Ok(Some((trimmed.to_string(), "pdf_extract")));
630 }
631 if best_text
633 .as_ref()
634 .map_or(true, |prev| trimmed.len() > prev.len())
635 {
636 best_text = Some(trimmed.to_string());
637 best_source = "pdf_extract";
638 }
639 }
640 }
641 Err(e) => {
642 tracing::debug!(target: "memvid::extract", error = %e, "pdf_extract failed");
643 }
644 }
645 }
646
647 match pdf_text_extract_lopdf(bytes) {
649 Ok(Some(text)) => {
650 let trimmed = text.trim();
651 if !trimmed.is_empty() {
652 if best_text
654 .as_ref()
655 .map_or(true, |prev| trimmed.len() > prev.len())
656 {
657 tracing::debug!(
658 target: "memvid::extract",
659 len = trimmed.len(),
660 "lopdf extracted more text"
661 );
662 best_text = Some(trimmed.to_string());
663 best_source = "lopdf";
664 }
665 }
666 }
667 Ok(None) => {
668 tracing::debug!(target: "memvid::extract", "lopdf returned no text");
669 }
670 Err(e) => {
671 tracing::debug!(target: "memvid::extract", error = %e, "lopdf failed");
672 }
673 }
674
675 Ok(best_text.map(|t| (fix_pdf_spacing(&t), best_source)))
677}
678
/// Extracts text from PDF bytes with `pdf_oxide`.
///
/// The bytes are written to a temporary file, which is then opened by path. Pages
/// that fail to extract are logged and skipped; non-empty page texts are joined
/// with a newline. Returns `Ok(None)` for a document with no pages or only
/// whitespace.
///
/// # Errors
///
/// Returns `MemvidError::ExtractionFailed` when the temporary file cannot be
/// created, written or flushed, or when the PDF cannot be opened or its page count
/// read.
#[cfg(feature = "pdf_oxide")]
#[allow(dead_code)]
fn pdf_text_extract_oxide(bytes: &[u8]) -> Result<Option<String>> {
    use pdf_oxide::PdfDocument;
    use std::io::Write;
    use tempfile::NamedTempFile;

    // Every failure here reads "pdf_oxide failed to <step>: <cause>".
    let failed_to = |step: &str, cause: &dyn std::fmt::Display| MemvidError::ExtractionFailed {
        reason: format!("pdf_oxide failed to {step}: {cause}").into(),
    };

    let mut temp_file =
        NamedTempFile::new().map_err(|err| failed_to("create temp file", &err))?;
    temp_file
        .write_all(bytes)
        .map_err(|err| failed_to("write temp file", &err))?;
    temp_file
        .flush()
        .map_err(|err| failed_to("flush temp file", &err))?;

    // `temp_file` stays bound until the end of the function, after `doc` is done.
    let mut doc =
        PdfDocument::open(temp_file.path()).map_err(|err| failed_to("load PDF", &err))?;

    let page_count = doc
        .page_count()
        .map_err(|err| failed_to("get page count", &err))?;
    if page_count == 0 {
        return Ok(None);
    }

    let mut combined = String::new();
    for page_idx in 0..page_count {
        match doc.extract_text(page_idx) {
            Ok(text) if text.is_empty() => {}
            Ok(text) => {
                if !combined.is_empty() {
                    combined.push('\n');
                }
                combined.push_str(&text);
            }
            Err(e) => {
                tracing::debug!(
                    target: "memvid::extract",
                    page = page_idx,
                    error = %e,
                    "pdf_oxide failed to extract page"
                );
            }
        }
    }

    let trimmed = combined.trim();
    Ok((!trimmed.is_empty()).then(|| trimmed.to_string()))
}
748
/// Returns `true` when `bytes` starts with the `%PDF` magic, allowing for an
/// optional UTF-8 BOM followed by any run of NUL or ASCII-whitespace bytes.
#[allow(dead_code)]
fn is_probably_pdf_simple(bytes: &[u8]) -> bool {
    // Drop a leading UTF-8 byte-order mark, if present.
    let after_bom = match bytes {
        [0xEF, 0xBB, 0xBF, rest @ ..] => rest,
        other => other,
    };

    // Skip padding, then compare the next four bytes against the magic.
    after_bom
        .iter()
        .skip_while(|byte| **byte == 0 || byte.is_ascii_whitespace())
        .take(4)
        .eq(b"%PDF".iter())
}
770
771#[allow(dead_code)]
773fn pdf_text_extract_lopdf(bytes: &[u8]) -> Result<Option<String>> {
774 if bytes.len() > PDF_LOPDF_MAX_BYTES {
775 return Err(MemvidError::ExtractionFailed {
776 reason: format!(
777 "PDF too large: {} bytes exceeds limit of {} bytes",
778 bytes.len(),
779 PDF_LOPDF_MAX_BYTES
780 )
781 .into(),
782 });
783 }
784
785 let mut document =
786 LopdfDocument::load_mem(bytes).map_err(|err| MemvidError::ExtractionFailed {
787 reason: format!("failed to load PDF: {err}").into(),
788 })?;
789
790 if document.is_encrypted() {
792 if document.decrypt("").is_err() {
793 return Err(MemvidError::ExtractionFailed {
794 reason: "cannot decrypt password-protected PDF".into(),
795 });
796 }
797 }
798
799 let _ = document.decompress();
801
802 let mut page_numbers: Vec<u32> = document.get_pages().keys().copied().collect();
803 if page_numbers.is_empty() {
804 return Ok(None);
805 }
806 page_numbers.sort_unstable();
807
808 if page_numbers.len() > PDF_LOPDF_MAX_PAGES {
809 return Err(MemvidError::ExtractionFailed {
810 reason: format!(
811 "PDF has too many pages: {} exceeds limit of {}",
812 page_numbers.len(),
813 PDF_LOPDF_MAX_PAGES
814 )
815 .into(),
816 });
817 }
818
819 match document.extract_text(&page_numbers) {
820 Ok(text) => {
821 let trimmed = text.trim();
822 if trimmed.is_empty() {
823 Ok(None)
824 } else {
825 Ok(Some(trimmed.to_string()))
826 }
827 }
828 Err(err) => Err(MemvidError::ExtractionFailed {
829 reason: format!("failed to extract text from PDF: {err}").into(),
830 }),
831 }
832}
833
#[cfg(all(test, feature = "extractous"))]
mod tests {
    use super::*;

    /// Repeated PDF object markers are recognised as a structure dump.
    #[test]
    fn detects_pdf_like_dump() {
        let dump =
            "binary %PDF snippet endobj endobj endstream 0 obj /Type /Page endobj ".repeat(12);
        assert!(looks_like_pdf_structure_dump(&dump));
    }

    /// Ordinary prose does not trigger the structure-dump heuristic.
    #[test]
    fn skips_normal_text() {
        let prose =
            "This is a perfectly normal paragraph that should not trigger the PDF fallback.";
        assert!(!looks_like_pdf_structure_dump(prose));
    }

    /// The `%PDF` magic is found with or without leading whitespace, and not in HTML.
    #[test]
    fn identifies_pdf_magic() {
        let cases: [(&[u8], bool); 3] = [
            (b"%PDF-1.7 some data", true),
            (b"\n\n%PDF-1.5", true),
            (b"<!doctype html>", false),
        ];
        for (input, expected) in cases {
            assert_eq!(is_probably_pdf(input), expected);
        }
    }
}
861
#[cfg(all(test, feature = "extractous"))]
mod pdf_fix_tests {
    use super::*;

    /// Exercises the two detectors that gate the lopdf fallback: the magic-byte
    /// check on raw bytes and the structure-dump heuristic on extracted text.
    #[test]
    fn test_pdf_structure_dump_detection_prevents_raw_indexing() {
        // Raw bytes with a PDF header are detected as a PDF.
        let raw_pdf = b"%PDF-1.4\n%\xff\xff\xff\xff\n1 0 obj\n<</Type/Catalog>>\nendobj\n";
        assert!(is_probably_pdf(raw_pdf));

        // Text made of PDF object markers is flagged as a dump.
        let dumped =
            "binary %PDF snippet endobj endobj endstream 0 obj /Type /Page endobj ".repeat(12);
        assert!(looks_like_pdf_structure_dump(&dumped));

        // Regular extracted text is left alone.
        let regular = "This is perfectly normal extracted text from a document.";
        assert!(!looks_like_pdf_structure_dump(regular));
    }
}