lc/readers/
pdf.rs

1use super::FileReader;
2use anyhow::{Context, Result};
3use std::io::Read;
4
5#[cfg(feature = "pdf")]
6extern crate pdf_extract;
7
/// Reader that extracts plain text from PDF documents.
///
/// The type is a stateless unit struct, so it is trivially copyable and can be
/// built with either [`PdfReader::new`] or [`Default::default`].
#[derive(Debug, Clone, Copy, Default)]
pub struct PdfReader;
9
10impl PdfReader {
11    pub fn new() -> Self {
12        Self
13    }
14
15    /// Internal method to extract text from PDF bytes with comprehensive error handling
16    fn extract_text_from_bytes_internal(&self, bytes: &[u8]) -> Result<String> {
17        #[cfg(feature = "pdf")]
18        {
19            // Attempt to extract text using pdf-extract
20            match pdf_extract::extract_text_from_mem(bytes) {
21                Ok(text) => {
22                    // Check if the extracted text is mostly empty or contains only whitespace
23                    let cleaned_text = text.trim();
24                    if cleaned_text.is_empty() {
25                        // This might be a bitmap-only PDF
26                        return Ok("[image page]".to_string());
27                    }
28
29                    // Preserve page breaks by converting form feed characters
30                    let formatted_text = text.replace('\x0C', "\u{000C}");
31
32                    // Ensure UTF-8 encoding
33                    Ok(formatted_text)
34                }
35                Err(e) => {
36                    // Check if this might be an encrypted PDF
37                    let error_msg = e.to_string().to_lowercase();
38                    if error_msg.contains("encrypt")
39                        || error_msg.contains("password")
40                        || error_msg.contains("security")
41                    {
42                        // Try passwordless decryption attempt (pdf-extract handles this internally)
43                        // If it fails, return appropriate error
44                        Err(anyhow::anyhow!(
45                            "PDF appears to be encrypted and requires a password for text extraction. \
46                            Error: {}", e
47                        ))
48                    } else {
49                        // Check if this might be a bitmap-only PDF
50                        if error_msg.contains("no text")
51                            || error_msg.contains("image")
52                            || error_msg.contains("scan")
53                        {
54                            Ok("[image page]".to_string())
55                        } else {
56                            Err(anyhow::anyhow!("Failed to extract text from PDF: {}", e))
57                        }
58                    }
59                }
60            }
61        }
62        #[cfg(not(feature = "pdf"))]
63        {
64            let _ = bytes; // Suppress unused parameter warning
65            Err(anyhow::anyhow!(
66                "PDF support is not enabled. Please compile with the 'pdf' feature flag to enable PDF processing."
67            ))
68        }
69    }
70}
71
72impl FileReader for PdfReader {
73    fn read_as_text(&self, file_path: &str) -> Result<String> {
74        let bytes = std::fs::read(file_path)
75            .with_context(|| format!("Failed to read PDF file: {}", file_path))?;
76
77        self.read_as_text_from_bytes(&bytes)
78            .with_context(|| format!("Failed to extract text from PDF file: {}", file_path))
79    }
80
81    fn read_as_text_from_bytes(&self, bytes: &[u8]) -> Result<String> {
82        self.extract_text_from_bytes_internal(bytes)
83    }
84
85    fn read_as_text_from_reader(&self, mut reader: Box<dyn Read>) -> Result<String> {
86        let mut bytes = Vec::new();
87        reader
88            .read_to_end(&mut bytes)
89            .with_context(|| "Failed to read bytes from reader")?;
90
91        self.read_as_text_from_bytes(&bytes)
92    }
93
94    fn can_handle(&self, extension: &str) -> bool {
95        extension.to_lowercase() == "pdf"
96    }
97}