1use crate::{config::ExtractorConfig, error::Result, ExtractError, ExtractResult, Extractor};
4use lopdf::Document;
5use std::path::Path;
6
7pub struct PdfExtractor {
9 config: ExtractorConfig,
10}
11
12impl Default for PdfExtractor {
13 fn default() -> Self {
14 Self::new(ExtractorConfig::default())
15 }
16}
17
18impl PdfExtractor {
19 pub fn new(config: ExtractorConfig) -> Self {
21 Self { config }
22 }
23
24 pub fn extract_from_file(&self, path: &Path) -> Result<ExtractResult> {
26 let doc = Document::load(path)?;
27 self.extract_from_document(&doc, path.to_string_lossy().to_string())
28 }
29
30 pub fn extract_from_bytes(&self, bytes: &[u8], source: String) -> Result<ExtractResult> {
32 let doc = Document::load_mem(bytes)?;
33 self.extract_from_document(&doc, source)
34 }
35
36 fn extract_from_document(&self, doc: &Document, source: String) -> Result<ExtractResult> {
38 let mut text_parts: Vec<String> = Vec::new();
39 let pages = doc.get_pages();
40
41 for (page_num, _) in pages.iter() {
42 if let Ok(page_text) = doc.extract_text(&[*page_num]) {
43 let cleaned = self.clean_text(&page_text);
44 if !cleaned.is_empty() {
45 text_parts.push(cleaned);
46 }
47 }
48 }
49
50 let text = text_parts.join("\n\n");
51
52 if text.len() > self.config.max_length {
53 return Err(ExtractError::ContentTooLarge {
54 size: text.len(),
55 max: self.config.max_length,
56 });
57 }
58
59 let mut result = ExtractResult::new(text, source)
60 .with_content_type("application/pdf")
61 .with_metadata("page_count", pages.len().to_string());
62
63 if let Ok(catalog) = doc.catalog() {
65 if let Ok(info_ref) = catalog.get(b"Info") {
66 if let Ok(info) = doc.get_object(info_ref.as_reference().unwrap_or_default()) {
67 if let Ok(info_dict) = info.as_dict() {
68 if let Ok(title) = info_dict.get(b"Title") {
69 if let Ok(title_bytes) = title.as_str() {
70 if let Ok(title_str) = std::str::from_utf8(title_bytes) {
71 result = result.with_title(title_str);
72 }
73 }
74 }
75 if let Ok(author) = info_dict.get(b"Author") {
76 if let Ok(author_bytes) = author.as_str() {
77 if let Ok(author_str) = std::str::from_utf8(author_bytes) {
78 result = result.with_metadata("author", author_str);
79 }
80 }
81 }
82 }
83 }
84 }
85 }
86
87 Ok(result)
88 }
89
90 fn clean_text(&self, text: &str) -> String {
92 let mut result = String::with_capacity(text.len());
93 let mut prev_was_whitespace = false;
94
95 for c in text.chars() {
96 if c.is_whitespace() {
97 if !prev_was_whitespace {
98 result.push(if c == '\n' { '\n' } else { ' ' });
99 prev_was_whitespace = true;
100 }
101 } else {
102 result.push(c);
103 prev_was_whitespace = false;
104 }
105 }
106
107 result.trim().to_string()
108 }
109}
110
111#[async_trait::async_trait]
112impl Extractor for PdfExtractor {
113 async fn extract(&self, source: &str) -> Result<ExtractResult> {
115 if source.starts_with("http://") || source.starts_with("https://") {
117 #[cfg(feature = "web")]
118 {
119 let client = reqwest::Client::builder()
121 .timeout(std::time::Duration::from_secs(self.config.timeout_secs))
122 .build()
123 .map_err(|e| ExtractError::Network(e.to_string()))?;
124
125 let response = client
126 .get(source)
127 .send()
128 .await
129 .map_err(|e| ExtractError::Network(e.to_string()))?;
130
131 if !response.status().is_success() {
132 return Err(ExtractError::Http {
133 status: response.status().as_u16(),
134 message: response.status().to_string(),
135 });
136 }
137
138 let bytes = response
139 .bytes()
140 .await
141 .map_err(|e| ExtractError::Network(e.to_string()))?;
142
143 return self.extract_from_bytes(&bytes, source.to_string());
144 }
145
146 #[cfg(not(feature = "web"))]
147 {
148 return Err(ExtractError::Other(
149 "URL extraction requires 'web' feature".to_string(),
150 ));
151 }
152 }
153
154 let path = Path::new(source);
156 if !path.exists() {
157 return Err(ExtractError::Io(std::io::Error::new(
158 std::io::ErrorKind::NotFound,
159 format!("File not found: {source}"),
160 )));
161 }
162
163 self.extract_from_file(path)
164 }
165
166 #[cfg(feature = "sanitize")]
167 async fn extract_sanitized(&self, source: &str) -> Result<ExtractResult> {
168 let result = self.extract(source).await?;
169 crate::sanitize::sanitize_result(result, &self.config).await
170 }
171}
172
#[cfg(test)]
mod tests {
    use super::*;

    /// Whitespace runs collapse to a single character and the ends are trimmed.
    #[test]
    fn test_clean_text() {
        let cleaned = PdfExtractor::default().clean_text(" Hello World \n\n Test ");
        assert_eq!(cleaned, "Hello World Test");
    }
}