agentroot_core/providers/
pdf.rs

1//! PDF Provider for extracting text from PDF files
2
3use crate::db::hash_content;
4use crate::error::{AgentRootError, Result};
5use crate::providers::{ProviderConfig, SourceItem, SourceProvider};
6use async_trait::async_trait;
7use std::fs;
8use std::path::{Path, PathBuf};
9use walkdir::WalkDir;
10
/// Provider for extracting text from PDF files.
///
/// This is a stateless unit struct: it carries no configuration of its own, so
/// every instance behaves identically. `Debug` and `Clone` are derived so the
/// provider can be logged and stored inside containers that derive them.
#[derive(Debug, Clone)]
pub struct PDFProvider;
13
14impl Default for PDFProvider {
15    fn default() -> Self {
16        Self::new()
17    }
18}
19
20impl PDFProvider {
21    /// Create a new PDFProvider
22    pub fn new() -> Self {
23        Self
24    }
25
26    /// Extract text from a PDF file
27    fn extract_text_from_pdf(&self, path: &Path) -> Result<String> {
28        let bytes = fs::read(path).map_err(|e| {
29            AgentRootError::Io(std::io::Error::new(
30                e.kind(),
31                format!("Failed to read PDF file {:?}: {}", path, e),
32            ))
33        })?;
34
35        let text = pdf_extract::extract_text_from_mem(&bytes).map_err(|e| {
36            AgentRootError::Parse(format!("Failed to extract text from PDF {:?}: {}", path, e))
37        })?;
38
39        if text.trim().is_empty() {
40            return Err(AgentRootError::Parse(format!(
41                "PDF file {:?} contains no extractable text (may be image-based)",
42                path
43            )));
44        }
45
46        Ok(text)
47    }
48
49    /// Extract title from PDF text content
50    fn extract_title(&self, content: &str, filename: &str) -> String {
51        let first_line = content
52            .lines()
53            .map(|l| l.trim())
54            .find(|l| !l.is_empty())
55            .unwrap_or("");
56
57        if !first_line.is_empty() && first_line.len() < 200 {
58            return first_line.to_string();
59        }
60
61        Path::new(filename)
62            .file_stem()
63            .and_then(|s| s.to_str())
64            .map(|s| s.replace(['_', '-'], " "))
65            .unwrap_or_else(|| "Untitled PDF".to_string())
66    }
67
68    /// Scan directory for PDF files matching pattern
69    fn scan_directory(&self, base_path: &Path, pattern: &str) -> Result<Vec<PathBuf>> {
70        let glob_pattern = glob::Pattern::new(pattern)?;
71        let mut pdf_files = Vec::new();
72
73        for entry in WalkDir::new(base_path)
74            .follow_links(true)
75            .into_iter()
76            .filter_entry(|e| {
77                let name = e.file_name().to_string_lossy();
78                !name.starts_with('.')
79                    && !matches!(
80                        name.as_ref(),
81                        "node_modules" | ".git" | ".cache" | "target" | "dist" | "build"
82                    )
83            })
84        {
85            let entry = entry?;
86            if !entry.file_type().is_file() {
87                continue;
88            }
89
90            let path = entry.path();
91            if let Some(ext) = path.extension() {
92                if ext.eq_ignore_ascii_case("pdf") {
93                    if let Ok(relative) = path.strip_prefix(base_path) {
94                        let relative_str = relative.to_string_lossy();
95                        if glob_pattern.matches(&relative_str) {
96                            pdf_files.push(path.to_path_buf());
97                        }
98                    }
99                }
100            }
101        }
102
103        Ok(pdf_files)
104    }
105}
106
107#[async_trait]
108impl SourceProvider for PDFProvider {
109    fn provider_type(&self) -> &'static str {
110        "pdf"
111    }
112
113    async fn list_items(&self, config: &ProviderConfig) -> Result<Vec<SourceItem>> {
114        let base_path = Path::new(&config.base_path);
115        if !base_path.exists() {
116            return Err(AgentRootError::InvalidInput(format!(
117                "Path does not exist: {}",
118                config.base_path
119            )));
120        }
121
122        let pdf_files = if base_path.is_file() {
123            if base_path.extension().and_then(|e| e.to_str()) == Some("pdf") {
124                vec![base_path.to_path_buf()]
125            } else {
126                return Err(AgentRootError::InvalidInput(format!(
127                    "File is not a PDF: {}",
128                    config.base_path
129                )));
130            }
131        } else {
132            self.scan_directory(base_path, &config.pattern)?
133        };
134
135        let mut items = Vec::new();
136        for pdf_path in pdf_files {
137            match self.extract_text_from_pdf(&pdf_path) {
138                Ok(content) => {
139                    let filename = pdf_path.to_string_lossy().to_string();
140                    let title = self.extract_title(&content, &filename);
141                    let hash = hash_content(&content);
142
143                    let mut item =
144                        SourceItem::new(filename.clone(), title, content, hash, "pdf".to_string());
145                    item.metadata
146                        .insert("file_path".to_string(), filename.clone());
147                    if let Some(stem) = pdf_path.file_stem() {
148                        item.metadata
149                            .insert("filename".to_string(), stem.to_string_lossy().to_string());
150                    }
151
152                    items.push(item);
153                }
154                Err(e) => {
155                    tracing::warn!("Skipping PDF {:?}: {}", pdf_path, e);
156                }
157            }
158        }
159
160        Ok(items)
161    }
162
163    async fn fetch_item(&self, uri: &str) -> Result<SourceItem> {
164        let path = Path::new(uri);
165        let content = self.extract_text_from_pdf(path)?;
166        let title = self.extract_title(&content, uri);
167        let hash = hash_content(&content);
168
169        let mut item = SourceItem::new(uri.to_string(), title, content, hash, "pdf".to_string());
170        item.metadata
171            .insert("file_path".to_string(), uri.to_string());
172        if let Some(stem) = path.file_stem() {
173            item.metadata
174                .insert("filename".to_string(), stem.to_string_lossy().to_string());
175        }
176
177        Ok(item)
178    }
179}
180
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs title extraction on a freshly constructed provider.
    fn title_for(content: &str, filename: &str) -> String {
        PDFProvider::new().extract_title(content, filename)
    }

    #[test]
    fn test_provider_type() {
        assert_eq!(PDFProvider::new().provider_type(), "pdf");
    }

    #[test]
    fn test_extract_title_from_content() {
        // Leading whitespace-only lines are skipped; the first real line wins.
        let text = "   \n\nDocument Title\n\nSome content here...";
        assert_eq!(title_for(text, "test.pdf"), "Document Title");
    }

    #[test]
    fn test_extract_title_from_filename() {
        // With no content, underscores in the file stem become spaces.
        assert_eq!(
            title_for("", "my_important_document.pdf"),
            "my important document"
        );
    }

    #[test]
    fn test_extract_title_with_dashes() {
        // Dashes in the file stem are treated like underscores.
        assert_eq!(title_for("", "user-guide-v2.pdf"), "user guide v2");
    }

    #[test]
    fn test_extract_title_long_first_line() {
        // An over-long first line is rejected in favour of the file stem.
        let text = format!("{}\n\nMore content", "a".repeat(250));
        assert_eq!(title_for(&text, "document.pdf"), "document");
    }
}