memvid_core/
extract.rs

1use std::fs;
2use std::path::Path;
3
4use crate::{Result, error::MemvidError, text::truncate_at_grapheme_boundary};
5#[cfg(feature = "extractous")]
6use log::LevelFilter;
7use lopdf::Document as LopdfDocument;
8use serde_json::{Value, json};
9
10#[cfg(feature = "extractous")]
11use extractous::Extractor;
12#[cfg(feature = "extractous")]
13use std::collections::HashMap;
14#[cfg(feature = "extractous")]
15use std::sync::{Mutex, OnceLock};
16
17/// Structured result produced by [`DocumentProcessor`] after running
18/// Extractous over an input document.
19#[derive(Debug, Clone)]
20pub struct ExtractedDocument {
21    pub text: Option<String>,
22    pub metadata: Value,
23    pub mime_type: Option<String>,
24}
25
26impl ExtractedDocument {
27    pub fn empty() -> Self {
28        Self {
29            text: None,
30            metadata: Value::Null,
31            mime_type: None,
32        }
33    }
34}
35
/// Tunable limits for a [`DocumentProcessor`].
#[derive(Debug, Clone, Copy)]
pub struct ProcessorConfig {
    /// Upper bound applied to the amount of extracted text that is kept.
    pub max_text_chars: usize,
}

impl Default for ProcessorConfig {
    /// Uses a text limit of two million.
    fn default() -> Self {
        const DEFAULT_MAX_TEXT_CHARS: usize = 2_000_000;
        Self {
            max_text_chars: DEFAULT_MAX_TEXT_CHARS,
        }
    }
}
48
49// ============================================================================
50// DocumentProcessor - only available with extractous feature
51// ============================================================================
52
/// Document text extractor backed by Extractous (requires the `extractous` feature).
#[cfg(feature = "extractous")]
#[derive(Debug)]
pub struct DocumentProcessor {
    /// Shared extractor; the mutex serialises extraction calls.
    extractor: Mutex<Extractor>,
    /// Limit applied to the text of every produced document.
    max_length: usize,
}

#[cfg(feature = "extractous")]
impl Default for DocumentProcessor {
    /// Builds a processor from [`ProcessorConfig::default`].
    fn default() -> Self {
        let config = ProcessorConfig::default();
        Self::new(config)
    }
}
66
/// Map type stored in [`EXTRACTION_CACHE`]: extraction results keyed by a BLAKE3 hash.
#[cfg(feature = "extractous")]
type ExtractionCacheMap = HashMap<blake3::Hash, ExtractedDocument>;

/// Process-wide, lazily initialised cache of extraction results.
#[cfg(feature = "extractous")]
static EXTRACTION_CACHE: OnceLock<Mutex<ExtractionCacheMap>> = OnceLock::new();
70
#[cfg(feature = "extractous")]
impl DocumentProcessor {
    /// Creates a processor that caps extracted text at `config.max_text_chars`.
    ///
    /// The limit is passed to the Extractous extractor (saturated to `i32::MAX`,
    /// the widest value it accepts) and applied again when a document is assembled.
    pub fn new(config: ProcessorConfig) -> Self {
        let capped = i32::try_from(config.max_text_chars).unwrap_or(i32::MAX);
        let extractor = Extractor::new()
            .set_extract_string_max_length(capped)
            .set_xml_output(false);
        Self {
            extractor: Mutex::new(extractor),
            max_length: config.max_text_chars,
        }
    }

    /// Extracts text and metadata from the file at `path`.
    ///
    /// When the primary extractor yields nothing useful (see `needs_pdf_fallback`)
    /// or fails outright, the file is re-read and handed to the lopdf fallback.
    ///
    /// # Errors
    ///
    /// Returns [`MemvidError::ExtractionFailed`] when the path is not valid UTF-8,
    /// the extractor mutex is poisoned, or the primary extractor fails and the
    /// fallback does not produce text.
    pub fn extract_from_path(&self, path: &Path) -> Result<ExtractedDocument> {
        let path_str = path.to_str().ok_or_else(|| MemvidError::ExtractionFailed {
            reason: "input path contains invalid UTF-8".into(),
        })?;

        let extraction = {
            let extractor = self.locked()?;
            // Silence `log` output for the duration of the extractor call.
            let _log_guard = ScopedLogLevel::lowered(LevelFilter::Off);
            extractor.extract_file_to_string(path_str)
        };

        match extraction {
            Ok((mut content, metadata)) => {
                if needs_pdf_fallback(&content) {
                    // Best effort: if the file cannot be re-read or the fallback
                    // yields nothing, the primary extractor's output is kept.
                    // NOTE(review): `extract_from_bytes` discards the content in
                    // that situation instead; confirm which behaviour is intended.
                    let fallback = fs::read(path)
                        .ok()
                        .and_then(|bytes| pdf_text_fallback(&bytes).ok().flatten());
                    if let Some(fallback_text) = fallback {
                        content = fallback_text;
                    }
                }
                Ok(self.into_document(content, metadata))
            }
            Err(err) => {
                let primary_reason = err.to_string();
                let reason = match fs::read(path).map(|bytes| pdf_text_fallback(&bytes)) {
                    Ok(Ok(Some(fallback_text))) => {
                        return Ok(self.into_document(fallback_text, pdf_fallback_metadata()));
                    }
                    Ok(Err(fallback_err)) => format!(
                        "primary extractor error: {}; PDF fallback error: {}",
                        primary_reason, fallback_err
                    ),
                    // Unreadable file or no fallback text: report the primary error.
                    Ok(Ok(None)) | Err(_) => primary_reason,
                };
                Err(MemvidError::ExtractionFailed {
                    reason: reason.into(),
                })
            }
        }
    }

    /// Extracts text and metadata from an in-memory document.
    ///
    /// Successful results are stored in the process-wide extraction cache and
    /// returned from it on later calls with the same bytes and text limit.
    ///
    /// # Errors
    ///
    /// Returns [`MemvidError::ExtractionFailed`] when the extractor mutex is
    /// poisoned, or when the primary extractor fails and the lopdf fallback does
    /// not produce text.
    pub fn extract_from_bytes(&self, bytes: &[u8]) -> Result<ExtractedDocument> {
        let hash = self.cache_key(bytes);
        if let Some(cached) = cache_lookup(&hash) {
            tracing::debug!(target: "memvid::extract", reader = "cache", "cache hit");
            return Ok(cached);
        }

        let extraction = {
            let extractor = self.locked()?;
            // Silence `log` output for the duration of the extractor call.
            let _log_guard = ScopedLogLevel::lowered(LevelFilter::Off);
            extractor.extract_bytes_to_string(bytes)
        };

        let document = match extraction {
            Ok((content, metadata)) => {
                let pdf_needed = needs_pdf_fallback(&content);
                tracing::debug!(
                    target: "memvid::extract",
                    content_len = content.len(),
                    pdf_fallback_needed = pdf_needed,
                    "extractous returned content"
                );
                let content = if pdf_needed {
                    match pdf_text_fallback(bytes) {
                        Ok(Some(fallback_text)) => {
                            tracing::debug!(
                                target: "memvid::extract",
                                fallback_len = fallback_text.len(),
                                "lopdf fallback succeeded"
                            );
                            fallback_text
                        }
                        Ok(None) => {
                            tracing::debug!(
                                target: "memvid::extract",
                                "lopdf fallback returned None"
                            );
                            // No text recovered: return empty rather than raw PDF bytes.
                            String::new()
                        }
                        Err(e) => {
                            tracing::debug!(
                                target: "memvid::extract",
                                error = %e,
                                "lopdf fallback failed"
                            );
                            // Fallback failed: return empty rather than raw PDF bytes.
                            String::new()
                        }
                    }
                } else {
                    content
                };
                self.into_document(content, metadata)
            }
            Err(err) => {
                let primary_reason = err.to_string();
                match pdf_text_fallback(bytes) {
                    Ok(Some(fallback_text)) => {
                        self.into_document(fallback_text, pdf_fallback_metadata())
                    }
                    Ok(None) => {
                        return Err(MemvidError::ExtractionFailed {
                            reason: primary_reason.into(),
                        });
                    }
                    Err(fallback_err) => {
                        let reason = format!(
                            "primary extractor error: {}; PDF fallback error: {}",
                            primary_reason, fallback_err
                        );
                        return Err(MemvidError::ExtractionFailed {
                            reason: reason.into(),
                        });
                    }
                }
            }
        };

        cache_store(hash, &document);
        Ok(document)
    }

    /// Derives the cache key for `bytes` under this processor's text limit.
    ///
    /// The cached document depends on `max_length` (it controls truncation), so
    /// the limit is hashed together with the content. Hashing the content alone
    /// would let processors with different limits serve each other's results.
    fn cache_key(&self, bytes: &[u8]) -> blake3::Hash {
        let mut hasher = blake3::Hasher::new();
        hasher.update(bytes);
        hasher.update(&self.max_length.to_le_bytes());
        hasher.finalize()
    }

    /// Locks the shared extractor, mapping a poisoned mutex to an extraction error.
    fn locked(&self) -> Result<std::sync::MutexGuard<'_, Extractor>> {
        self.extractor
            .lock()
            .map_err(|_| MemvidError::ExtractionFailed {
                reason: "extractor mutex poisoned".into(),
            })
    }

    /// Assembles an [`ExtractedDocument`] from raw extractor output.
    ///
    /// `metadata` is serialised to JSON (falling back to `Value::Null`), the MIME
    /// type is read from its `Content-Type` entry, whitespace-only content becomes
    /// `text: None`, and longer content is cut to `max_length`.
    fn into_document<M>(&self, mut content: String, metadata: M) -> ExtractedDocument
    where
        M: serde::Serialize,
    {
        let metadata_value = serde_json::to_value(metadata).unwrap_or(Value::Null);
        let mime_type = metadata_value.get("Content-Type").and_then(value_to_mime);

        let text = if content.trim().is_empty() {
            tracing::debug!(
                target: "memvid::extract",
                "into_document: content is empty, returning text=None"
            );
            None
        } else {
            if content.len() > self.max_length {
                // Truncate in place instead of copying the prefix into a new string.
                let end = truncate_at_grapheme_boundary(&content, self.max_length);
                content.truncate(end);
            }
            tracing::debug!(
                target: "memvid::extract",
                text_len = content.len(),
                starts_with_pdf = content.starts_with("%PDF"),
                "into_document: returning text"
            );
            Some(content)
        };

        ExtractedDocument {
            text,
            metadata: metadata_value,
            mime_type,
        }
    }
}
261
262// ============================================================================
263// Stub DocumentProcessor when extractous is disabled - returns clear error
264// ============================================================================
265
266#[cfg(not(feature = "extractous"))]
267#[derive(Debug)]
268pub struct DocumentProcessor {
269    max_length: usize,
270}
271
272#[cfg(not(feature = "extractous"))]
273impl Default for DocumentProcessor {
274    fn default() -> Self {
275        Self::new(Default::default())
276    }
277}
278
279#[cfg(not(feature = "extractous"))]
280impl DocumentProcessor {
281    pub fn new(config: ProcessorConfig) -> Self {
282        Self {
283            max_length: config.max_text_chars,
284        }
285    }
286
287    pub fn extract_from_path(&self, path: &Path) -> Result<ExtractedDocument> {
288        // Without extractous, we can still handle plain text files
289        let bytes = fs::read(path).map_err(|e| MemvidError::ExtractionFailed {
290            reason: format!("failed to read file: {e}").into(),
291        })?;
292        self.extract_from_bytes(&bytes)
293    }
294
295    pub fn extract_from_bytes(&self, bytes: &[u8]) -> Result<ExtractedDocument> {
296        // Check if this is a PDF - extract text using pdf_extract (if available) or lopdf
297        if is_probably_pdf_simple(bytes) {
298            match pdf_text_extract_best(bytes) {
299                Ok(Some((text, extractor))) => {
300                    let truncate_len = truncate_at_grapheme_boundary(&text, self.max_length);
301                    let truncated = &text[..truncate_len];
302                    return Ok(ExtractedDocument {
303                        text: Some(truncated.to_string()),
304                        metadata: json!({
305                            "Content-Type": "application/pdf",
306                            "extraction": extractor,
307                        }),
308                        mime_type: Some("application/pdf".to_string()),
309                    });
310                }
311                Ok(None) => {
312                    // PDF detected but no text could be extracted (image-only PDF)
313                    return Ok(ExtractedDocument {
314                        text: None,
315                        metadata: json!({
316                            "Content-Type": "application/pdf",
317                            "extraction": "no_text",
318                        }),
319                        mime_type: Some("application/pdf".to_string()),
320                    });
321                }
322                Err(e) => {
323                    tracing::warn!(target: "memvid::extract", error = %e, "PDF extraction failed");
324                    // Fall through to binary handling
325                }
326            }
327        }
328
329        // Without extractous, we can still handle plain text files and common text-based formats
330        // Try to interpret as UTF-8 text first
331        if let Ok(text) = std::str::from_utf8(bytes) {
332            // Check if it's likely text (no null bytes in first 8KB)
333            let sample = &bytes[..bytes.len().min(8192)];
334            if !sample.contains(&0) {
335                let truncate_len = truncate_at_grapheme_boundary(text, self.max_length);
336                let truncated = &text[..truncate_len];
337                return Ok(ExtractedDocument {
338                    text: Some(truncated.to_string()),
339                    metadata: json!({}),
340                    mime_type: Some("text/plain".to_string()),
341                });
342            }
343        }
344
345        // For binary content (video, audio, images, etc.), return success with no text.
346        // This allows binary blobs to be stored without requiring the extractous feature.
347        // The caller can still store the blob; there just won't be extracted text for search.
348        Ok(ExtractedDocument {
349            text: None,
350            metadata: json!({}),
351            mime_type: Some("application/octet-stream".to_string()),
352        })
353    }
354}
355
/// Reports whether extractor output is unusable and the lopdf fallback should run:
/// the content is blank, or it looks like a dump of raw PDF object syntax.
#[cfg(feature = "extractous")]
fn needs_pdf_fallback(content: &str) -> bool {
    let is_blank = content.trim().is_empty();
    is_blank || looks_like_pdf_structure_dump(content)
}
363
/// Metadata attached to documents whose text came from the lopdf fallback.
#[cfg(feature = "extractous")]
fn pdf_fallback_metadata() -> Value {
    let content_type = "application/pdf";
    let extraction = "lopdf_fallback";
    json!({ "Content-Type": content_type, "extraction": extraction })
}
371
/// Largest input, in bytes, that the lopdf fallback will attempt to parse (64 MiB).
#[cfg(feature = "extractous")]
const PDF_FALLBACK_MAX_BYTES: usize = 64 << 20;
/// Largest page count the lopdf fallback will attempt to extract.
#[cfg(feature = "extractous")]
const PDF_FALLBACK_MAX_PAGES: usize = 4_096;
376
/// Extracts text from a PDF with lopdf, used when the primary extractor falls short.
///
/// Returns `Ok(None)` when `bytes` do not look like a PDF, the document has no
/// pages, or the extracted text is blank.
///
/// # Errors
///
/// Returns [`MemvidError::ExtractionFailed`] when the input exceeds the size or
/// page limits, cannot be loaded, cannot be decrypted with an empty password, or
/// text extraction itself fails.
#[cfg(feature = "extractous")]
fn pdf_text_fallback(bytes: &[u8]) -> Result<Option<String>> {
    if !is_probably_pdf(bytes) {
        return Ok(None);
    }

    let size = bytes.len();
    if size > PDF_FALLBACK_MAX_BYTES {
        let reason = format!(
            "pdf fallback aborted: {} bytes exceeds limit of {} bytes",
            size, PDF_FALLBACK_MAX_BYTES
        );
        return Err(MemvidError::ExtractionFailed {
            reason: reason.into(),
        });
    }

    // Keep `log` output quiet while lopdf works on the document.
    let _log_guard = ScopedLogLevel::lowered(LevelFilter::Off);
    let mut document = match LopdfDocument::load_mem(bytes) {
        Ok(loaded) => loaded,
        Err(err) => {
            return Err(MemvidError::ExtractionFailed {
                reason: format!("pdf fallback failed to load document: {err}").into(),
            });
        }
    };

    // Encrypted files are only readable here if the empty password unlocks them.
    if document.is_encrypted() && document.decrypt("").is_err() {
        return Err(MemvidError::ExtractionFailed {
            reason: "pdf fallback cannot decrypt password-protected file".into(),
        });
    }

    let _ = document.decompress();

    let mut page_numbers: Vec<u32> = document.get_pages().into_keys().collect();
    if page_numbers.is_empty() {
        return Ok(None);
    }
    page_numbers.sort_unstable();

    let page_count = page_numbers.len();
    if page_count > PDF_FALLBACK_MAX_PAGES {
        let reason = format!(
            "pdf fallback aborted: page count {} exceeds limit of {}",
            page_count, PDF_FALLBACK_MAX_PAGES
        );
        return Err(MemvidError::ExtractionFailed {
            reason: reason.into(),
        });
    }

    let text = document
        .extract_text(&page_numbers)
        .map_err(|err| MemvidError::ExtractionFailed {
            reason: format!("pdf fallback failed to extract text: {err}").into(),
        })?;
    let trimmed = text.trim();
    Ok((!trimmed.is_empty()).then(|| trimmed.to_string()))
}
441
/// Guard that lowers the `log` crate's max level and restores it when dropped.
#[cfg(feature = "extractous")]
struct ScopedLogLevel {
    /// Max level that was in effect when the guard was created.
    previous: LevelFilter,
    /// Whether the guard actually changed the level (and so must restore it).
    changed: bool,
}

#[cfg(feature = "extractous")]
impl ScopedLogLevel {
    /// Lowers the max level to `level` if that is stricter than the current one.
    ///
    /// A `level` that is not below the current max level leaves logging untouched.
    fn lowered(level: LevelFilter) -> Self {
        let previous = log::max_level();
        let changed = level < previous;
        if changed {
            log::set_max_level(level);
        }
        Self { previous, changed }
    }
}

#[cfg(feature = "extractous")]
impl Drop for ScopedLogLevel {
    /// Restores the saved max level, but only if this guard changed it.
    fn drop(&mut self) {
        if !self.changed {
            return;
        }
        log::set_max_level(self.previous);
    }
}
475
476#[cfg(feature = "extractous")]
/// Returns `true` when `bytes` start with the `%PDF` magic, after skipping an
/// optional UTF-8 byte-order mark followed by any run of NUL or ASCII
/// whitespace bytes.
fn is_probably_pdf(bytes: &[u8]) -> bool {
    let without_bom = bytes
        .strip_prefix(&[0xEF_u8, 0xBB, 0xBF][..])
        .unwrap_or(bytes);
    // Index of the first byte that is neither NUL nor ASCII whitespace.
    let start = without_bom
        .iter()
        .position(|byte| *byte != 0 && !byte.is_ascii_whitespace())
        .unwrap_or(without_bom.len());
    without_bom[start..].starts_with(b"%PDF")
}
494
/// Heuristic for extractor output that is really a dump of raw PDF object syntax.
///
/// Content shorter than 256 bytes is never flagged. Otherwise roughly the first
/// 8 KiB are sampled, and the content is flagged when the sample holds at least
/// two `endobj` markers plus one more structural marker (an object header,
/// `endstream`, or `/Type /Page`).
#[cfg(feature = "extractous")]
fn looks_like_pdf_structure_dump(content: &str) -> bool {
    const MIN_CONTENT_BYTES: usize = 256;
    const SAMPLE_BYTES: usize = 8_192;
    const STRUCTURE_MARKERS: [&str; 5] =
        [" 0 obj", "\n0 obj", "\r0 obj", "endstream", "/Type /Page"];

    if content.len() < MIN_CONTENT_BYTES {
        return false;
    }

    // Cut the sample at a position that is safe to slice the string at.
    let cut = truncate_at_grapheme_boundary(content, content.len().min(SAMPLE_BYTES));
    let sample = &content[..cut];

    // Fewer than two `endobj` occurrences: not a structure dump.
    if sample.matches("endobj").nth(1).is_none() {
        return false;
    }

    STRUCTURE_MARKERS
        .iter()
        .any(|marker| sample.contains(*marker))
}
514
/// Pulls a MIME type out of a metadata value: the value itself when it is a
/// string, otherwise the first string entry when it is an array.
#[cfg(feature = "extractous")]
fn value_to_mime(value: &Value) -> Option<String> {
    match value {
        Value::String(mime) => Some(mime.clone()),
        Value::Array(entries) => entries
            .iter()
            .find_map(|entry| entry.as_str().map(str::to_owned)),
        _ => None,
    }
}
529
/// Returns a clone of the cached document stored under `hash`, if any.
///
/// A poisoned cache mutex is treated as a cache miss.
#[cfg(feature = "extractous")]
fn cache_lookup(hash: &blake3::Hash) -> Option<ExtractedDocument> {
    let entries = EXTRACTION_CACHE
        .get_or_init(|| Mutex::new(HashMap::new()))
        .lock()
        .ok()?;
    entries.get(hash).cloned()
}
535
/// Stores a clone of `document` in the extraction cache under `hash`.
///
/// Nothing is stored when the cache mutex is poisoned. This function never
/// removes existing entries.
#[cfg(feature = "extractous")]
fn cache_store(hash: blake3::Hash, document: &ExtractedDocument) {
    let cache = EXTRACTION_CACHE.get_or_init(|| Mutex::new(HashMap::new()));
    let Ok(mut entries) = cache.lock() else {
        return;
    };
    entries.insert(hash, document.clone());
}
543
544// ============================================================================
545// PDF extraction helpers (available without extractous feature)
546// Uses pdf_extract crate when available (better quality), falls back to lopdf
547// ============================================================================
548
/// Largest PDF, in bytes, that `pdf_text_extract_lopdf` will attempt to parse (64 MiB).
#[allow(dead_code)]
const PDF_LOPDF_MAX_BYTES: usize = 64 << 20;
/// Largest page count `pdf_text_extract_lopdf` will attempt to extract.
#[allow(dead_code)]
const PDF_LOPDF_MAX_PAGES: usize = 4_096;
553
554/// Try multiple PDF extractors and return the best result
555/// Returns (text, extractor_name) or None if no text found
556#[allow(dead_code)]
557fn pdf_text_extract_best(bytes: &[u8]) -> Result<Option<(String, &'static str)>> {
558    let mut best_text: Option<String> = None;
559    let mut best_source: &'static str = "";
560
561    // Calculate minimum "good" threshold based on file size
562    // Larger PDFs should have more text, so scale the threshold
563    let _min_good_chars = (bytes.len() / 100).clamp(500, 5000);
564
565    // Try pdf_extract first (if feature enabled) - often extracts better
566    #[cfg(feature = "pdf_extract")]
567    {
568        match pdf_extract::extract_text_from_mem(bytes) {
569            Ok(text) => {
570                let trimmed = text.trim();
571                if !trimmed.is_empty() {
572                    if trimmed.len() >= min_good_chars {
573                        // Good extraction, use it immediately
574                        tracing::debug!(
575                            target: "memvid::extract",
576                            len = trimmed.len(),
577                            "pdf_extract succeeded with good result"
578                        );
579                        return Ok(Some((trimmed.to_string(), "pdf_extract")));
580                    }
581                    // Partial result, save and try lopdf too
582                    tracing::debug!(
583                        target: "memvid::extract",
584                        len = trimmed.len(),
585                        min_good = min_good_chars,
586                        "pdf_extract returned partial result, trying lopdf"
587                    );
588                    best_text = Some(trimmed.to_string());
589                    best_source = "pdf_extract";
590                }
591            }
592            Err(e) => {
593                tracing::debug!(target: "memvid::extract", error = %e, "pdf_extract failed");
594            }
595        }
596    }
597
598    // Try lopdf (pure Rust, always available)
599    match pdf_text_extract_lopdf(bytes) {
600        Ok(Some(text)) => {
601            let trimmed = text.trim();
602            if !trimmed.is_empty() {
603                // Use lopdf result if better than pdf_extract
604                if best_text.as_ref().map_or(true, |prev| trimmed.len() > prev.len()) {
605                    tracing::debug!(
606                        target: "memvid::extract",
607                        len = trimmed.len(),
608                        "lopdf extracted more text"
609                    );
610                    best_text = Some(trimmed.to_string());
611                    best_source = "lopdf";
612                }
613            }
614        }
615        Ok(None) => {
616            tracing::debug!(target: "memvid::extract", "lopdf returned no text");
617        }
618        Err(e) => {
619            tracing::debug!(target: "memvid::extract", error = %e, "lopdf failed");
620        }
621    }
622
623    Ok(best_text.map(|t| (t, best_source)))
624}
625
/// Magic-byte check: does `bytes` begin with `%PDF`?
///
/// An optional UTF-8 byte-order mark, then any run of NUL or ASCII whitespace
/// bytes, is ignored before the comparison.
#[allow(dead_code)]
fn is_probably_pdf_simple(bytes: &[u8]) -> bool {
    const UTF8_BOM: &[u8] = &[0xEF, 0xBB, 0xBF];
    const PDF_MAGIC: &[u8] = b"%PDF";

    let body = bytes.strip_prefix(UTF8_BOM).unwrap_or(bytes);
    // Walk past leading NUL/whitespace bytes, then compare byte by byte.
    let mut significant = body
        .iter()
        .skip_while(|byte| **byte == 0 || byte.is_ascii_whitespace());
    PDF_MAGIC
        .iter()
        .all(|expected| significant.next() == Some(expected))
}
647
648/// Extract text from a PDF using lopdf (pure Rust, no external dependencies)
649#[allow(dead_code)]
650fn pdf_text_extract_lopdf(bytes: &[u8]) -> Result<Option<String>> {
651    if bytes.len() > PDF_LOPDF_MAX_BYTES {
652        return Err(MemvidError::ExtractionFailed {
653            reason: format!(
654                "PDF too large: {} bytes exceeds limit of {} bytes",
655                bytes.len(),
656                PDF_LOPDF_MAX_BYTES
657            )
658            .into(),
659        });
660    }
661
662    let mut document = LopdfDocument::load_mem(bytes).map_err(|err| MemvidError::ExtractionFailed {
663        reason: format!("failed to load PDF: {err}").into(),
664    })?;
665
666    // Try to decrypt if encrypted (empty password for unprotected PDFs)
667    if document.is_encrypted() {
668        if document.decrypt("").is_err() {
669            return Err(MemvidError::ExtractionFailed {
670                reason: "cannot decrypt password-protected PDF".into(),
671            });
672        }
673    }
674
675    // Decompress streams for better text extraction
676    let _ = document.decompress();
677
678    let mut page_numbers: Vec<u32> = document.get_pages().keys().copied().collect();
679    if page_numbers.is_empty() {
680        return Ok(None);
681    }
682    page_numbers.sort_unstable();
683
684    if page_numbers.len() > PDF_LOPDF_MAX_PAGES {
685        return Err(MemvidError::ExtractionFailed {
686            reason: format!(
687                "PDF has too many pages: {} exceeds limit of {}",
688                page_numbers.len(),
689                PDF_LOPDF_MAX_PAGES
690            )
691            .into(),
692        });
693    }
694
695    match document.extract_text(&page_numbers) {
696        Ok(text) => {
697            let trimmed = text.trim();
698            if trimmed.is_empty() {
699                Ok(None)
700            } else {
701                Ok(Some(trimmed.to_string()))
702            }
703        }
704        Err(err) => Err(MemvidError::ExtractionFailed {
705            reason: format!("failed to extract text from PDF: {err}").into(),
706        }),
707    }
708}
709
#[cfg(all(test, feature = "extractous"))]
mod tests {
    use super::*;

    /// A long run of PDF object markers is classified as a structure dump.
    #[test]
    fn detects_pdf_like_dump() {
        let dump = "binary %PDF snippet endobj endobj endstream 0 obj /Type /Page endobj "
            .repeat(12);
        assert!(looks_like_pdf_structure_dump(&dump));
    }

    /// Ordinary prose is not mistaken for a structure dump.
    #[test]
    fn skips_normal_text() {
        let prose =
            "This is a perfectly normal paragraph that should not trigger the PDF fallback.";
        assert!(!looks_like_pdf_structure_dump(prose));
    }

    /// The `%PDF` magic is found with or without leading whitespace.
    #[test]
    fn identifies_pdf_magic() {
        let pdf_inputs: [&[u8]; 2] = [b"%PDF-1.7 some data", b"\n\n%PDF-1.5"];
        for input in pdf_inputs {
            assert!(is_probably_pdf(input));
        }
        assert!(!is_probably_pdf(b"<!doctype html>"));
    }
}
737
#[cfg(all(test, feature = "extractous"))]
mod pdf_fix_tests {
    use super::*;

    /// Checks the detection helpers that guard against indexing raw PDF bytes:
    /// PDF input is recognised by its magic bytes, dumped PDF object syntax is
    /// flagged as a structure dump, and ordinary text is left alone.
    #[test]
    fn test_pdf_structure_dump_detection_prevents_raw_indexing() {
        // Synthetic PDF header and catalog object: recognised as a PDF.
        let pdf_bytes = b"%PDF-1.4\n%\xff\xff\xff\xff\n1 0 obj\n<</Type/Catalog>>\nendobj\n";
        assert!(is_probably_pdf(pdf_bytes));

        // Repeated object markers: recognised as a structure dump.
        let dumped = "binary %PDF snippet endobj endobj endstream 0 obj /Type /Page endobj "
            .repeat(12);
        assert!(looks_like_pdf_structure_dump(&dumped));

        // Plain prose: not a structure dump.
        let prose = "This is perfectly normal extracted text from a document.";
        assert!(!looks_like_pdf_structure_dump(prose));
    }
}
763}