1use super::FileReader;
2use anyhow::{Context, Result};
3use std::io::Read;
4
5#[cfg(feature = "pdf")]
6extern crate pdf_extract;
7
/// Stateless reader that extracts plain text from PDF documents.
///
/// The type carries no data, so `PdfReader::default()` and `PdfReader::new()`
/// produce equivalent values.
#[derive(Debug, Clone, Default)]
pub struct PdfReader;
9
10impl PdfReader {
11 pub fn new() -> Self {
12 Self
13 }
14
15 fn extract_text_from_bytes_internal(&self, bytes: &[u8]) -> Result<String> {
17 #[cfg(feature = "pdf")]
18 {
19 match pdf_extract::extract_text_from_mem(bytes) {
21 Ok(text) => {
22 let cleaned_text = text.trim();
24 if cleaned_text.is_empty() {
25 return Ok("[image page]".to_string());
27 }
28
29 let formatted_text = text.replace('\x0C', "\u{000C}");
31
32 Ok(formatted_text)
34 }
35 Err(e) => {
36 let error_msg = e.to_string().to_lowercase();
38 if error_msg.contains("encrypt")
39 || error_msg.contains("password")
40 || error_msg.contains("security")
41 {
42 Err(anyhow::anyhow!(
45 "PDF appears to be encrypted and requires a password for text extraction. \
46 Error: {}", e
47 ))
48 } else {
49 if error_msg.contains("no text")
51 || error_msg.contains("image")
52 || error_msg.contains("scan")
53 {
54 Ok("[image page]".to_string())
55 } else {
56 Err(anyhow::anyhow!("Failed to extract text from PDF: {}", e))
57 }
58 }
59 }
60 }
61 }
62 #[cfg(not(feature = "pdf"))]
63 {
64 let _ = bytes; Err(anyhow::anyhow!(
66 "PDF support is not enabled. Please compile with the 'pdf' feature flag to enable PDF processing."
67 ))
68 }
69 }
70}
71
72impl FileReader for PdfReader {
73 fn read_as_text(&self, file_path: &str) -> Result<String> {
74 let bytes = std::fs::read(file_path)
75 .with_context(|| format!("Failed to read PDF file: {}", file_path))?;
76
77 self.read_as_text_from_bytes(&bytes)
78 .with_context(|| format!("Failed to extract text from PDF file: {}", file_path))
79 }
80
81 fn read_as_text_from_bytes(&self, bytes: &[u8]) -> Result<String> {
82 self.extract_text_from_bytes_internal(bytes)
83 }
84
85 fn read_as_text_from_reader(&self, mut reader: Box<dyn Read>) -> Result<String> {
86 let mut bytes = Vec::new();
87 reader
88 .read_to_end(&mut bytes)
89 .with_context(|| "Failed to read bytes from reader")?;
90
91 self.read_as_text_from_bytes(&bytes)
92 }
93
94 fn can_handle(&self, extension: &str) -> bool {
95 extension.to_lowercase() == "pdf"
96 }
97}