hanzo_extract/
pdf.rs

//! PDF document content extraction
2
3use crate::{config::ExtractorConfig, error::Result, ExtractError, ExtractResult, Extractor};
4use lopdf::Document;
5use std::path::Path;
6
7/// PDF document content extractor
8pub struct PdfExtractor {
9    config: ExtractorConfig,
10}
11
12impl Default for PdfExtractor {
13    fn default() -> Self {
14        Self::new(ExtractorConfig::default())
15    }
16}
17
18impl PdfExtractor {
19    /// Create a new PDF extractor with the given configuration
20    pub fn new(config: ExtractorConfig) -> Self {
21        Self { config }
22    }
23
24    /// Extract text from a PDF file path
25    pub fn extract_from_file(&self, path: &Path) -> Result<ExtractResult> {
26        let doc = Document::load(path)?;
27        self.extract_from_document(&doc, path.to_string_lossy().to_string())
28    }
29
30    /// Extract text from PDF bytes
31    pub fn extract_from_bytes(&self, bytes: &[u8], source: String) -> Result<ExtractResult> {
32        let doc = Document::load_mem(bytes)?;
33        self.extract_from_document(&doc, source)
34    }
35
36    /// Extract text from a lopdf Document
37    fn extract_from_document(&self, doc: &Document, source: String) -> Result<ExtractResult> {
38        let mut text_parts: Vec<String> = Vec::new();
39        let pages = doc.get_pages();
40
41        for (page_num, _) in pages.iter() {
42            if let Ok(page_text) = doc.extract_text(&[*page_num]) {
43                let cleaned = self.clean_text(&page_text);
44                if !cleaned.is_empty() {
45                    text_parts.push(cleaned);
46                }
47            }
48        }
49
50        let text = text_parts.join("\n\n");
51
52        if text.len() > self.config.max_length {
53            return Err(ExtractError::ContentTooLarge {
54                size: text.len(),
55                max: self.config.max_length,
56            });
57        }
58
59        let mut result = ExtractResult::new(text, source)
60            .with_content_type("application/pdf")
61            .with_metadata("page_count", pages.len().to_string());
62
63        // Extract title from PDF metadata if available
64        if let Ok(catalog) = doc.catalog() {
65            if let Ok(info_ref) = catalog.get(b"Info") {
66                if let Ok(info) = doc.get_object(info_ref.as_reference().unwrap_or_default()) {
67                    if let Ok(info_dict) = info.as_dict() {
68                        if let Ok(title) = info_dict.get(b"Title") {
69                            if let Ok(title_bytes) = title.as_str() {
70                                if let Ok(title_str) = std::str::from_utf8(title_bytes) {
71                                    result = result.with_title(title_str);
72                                }
73                            }
74                        }
75                        if let Ok(author) = info_dict.get(b"Author") {
76                            if let Ok(author_bytes) = author.as_str() {
77                                if let Ok(author_str) = std::str::from_utf8(author_bytes) {
78                                    result = result.with_metadata("author", author_str);
79                                }
80                            }
81                        }
82                    }
83                }
84            }
85        }
86
87        Ok(result)
88    }
89
90    /// Clean extracted text
91    fn clean_text(&self, text: &str) -> String {
92        let mut result = String::with_capacity(text.len());
93        let mut prev_was_whitespace = false;
94
95        for c in text.chars() {
96            if c.is_whitespace() {
97                if !prev_was_whitespace {
98                    result.push(if c == '\n' { '\n' } else { ' ' });
99                    prev_was_whitespace = true;
100                }
101            } else {
102                result.push(c);
103                prev_was_whitespace = false;
104            }
105        }
106
107        result.trim().to_string()
108    }
109}
110
111#[async_trait::async_trait]
112impl Extractor for PdfExtractor {
113    /// Extract text from a PDF source (file path or URL)
114    async fn extract(&self, source: &str) -> Result<ExtractResult> {
115        // Check if source is a URL
116        if source.starts_with("http://") || source.starts_with("https://") {
117            #[cfg(feature = "web")]
118            {
119                // Fetch PDF from URL
120                let client = reqwest::Client::builder()
121                    .timeout(std::time::Duration::from_secs(self.config.timeout_secs))
122                    .build()
123                    .map_err(|e| ExtractError::Network(e.to_string()))?;
124
125                let response = client
126                    .get(source)
127                    .send()
128                    .await
129                    .map_err(|e| ExtractError::Network(e.to_string()))?;
130
131                if !response.status().is_success() {
132                    return Err(ExtractError::Http {
133                        status: response.status().as_u16(),
134                        message: response.status().to_string(),
135                    });
136                }
137
138                let bytes = response
139                    .bytes()
140                    .await
141                    .map_err(|e| ExtractError::Network(e.to_string()))?;
142
143                return self.extract_from_bytes(&bytes, source.to_string());
144            }
145
146            #[cfg(not(feature = "web"))]
147            {
148                return Err(ExtractError::Other(
149                    "URL extraction requires 'web' feature".to_string(),
150                ));
151            }
152        }
153
154        // Treat as file path
155        let path = Path::new(source);
156        if !path.exists() {
157            return Err(ExtractError::Io(std::io::Error::new(
158                std::io::ErrorKind::NotFound,
159                format!("File not found: {source}"),
160            )));
161        }
162
163        self.extract_from_file(path)
164    }
165
166    #[cfg(feature = "sanitize")]
167    async fn extract_sanitized(&self, source: &str) -> Result<ExtractResult> {
168        let result = self.extract(source).await?;
169        crate::sanitize::sanitize_result(result, &self.config).await
170    }
171}
172
#[cfg(test)]
mod tests {
    use super::*;

    /// Pins the whitespace normalisation performed by `clean_text`.
    #[test]
    fn test_clean_text() {
        let extractor = PdfExtractor::default();

        // Every whitespace run here begins with a space, so each collapses to
        // a single space; leading and trailing whitespace is trimmed.
        let input = "  Hello   World  \n\n  Test  ";
        assert_eq!(extractor.clean_text(input), "Hello World Test");

        // A run that begins with a newline collapses to a single newline.
        assert_eq!(extractor.clean_text("Hello\n\n  World"), "Hello\nWorld");
    }
}
hanzo_extract/pdf.rs

hanzo_extract/
pdf.rs