agentroot_core/providers/
pdf.rs1use crate::db::hash_content;
4use crate::error::{AgentRootError, Result};
5use crate::providers::{ProviderConfig, SourceItem, SourceProvider};
6use async_trait::async_trait;
7use std::fs;
8use std::path::{Path, PathBuf};
9use walkdir::WalkDir;
10
/// Source provider that reads PDF files from the local filesystem and exposes
/// their extracted text as source items.
///
/// The provider is a stateless unit struct: all inputs arrive through the
/// arguments of its methods. `Debug` and `Clone` are derived so the provider
/// can be logged and duplicated like any other plain value type.
#[derive(Debug, Clone)]
pub struct PDFProvider;
13
14impl Default for PDFProvider {
15 fn default() -> Self {
16 Self::new()
17 }
18}
19
20impl PDFProvider {
21 pub fn new() -> Self {
23 Self
24 }
25
26 fn extract_text_from_pdf(&self, path: &Path) -> Result<String> {
28 let bytes = fs::read(path).map_err(|e| {
29 AgentRootError::Io(std::io::Error::new(
30 e.kind(),
31 format!("Failed to read PDF file {:?}: {}", path, e),
32 ))
33 })?;
34
35 let text = pdf_extract::extract_text_from_mem(&bytes).map_err(|e| {
36 AgentRootError::Parse(format!("Failed to extract text from PDF {:?}: {}", path, e))
37 })?;
38
39 if text.trim().is_empty() {
40 return Err(AgentRootError::Parse(format!(
41 "PDF file {:?} contains no extractable text (may be image-based)",
42 path
43 )));
44 }
45
46 Ok(text)
47 }
48
49 fn extract_title(&self, content: &str, filename: &str) -> String {
51 let first_line = content
52 .lines()
53 .map(|l| l.trim())
54 .find(|l| !l.is_empty())
55 .unwrap_or("");
56
57 if !first_line.is_empty() && first_line.len() < 200 {
58 return first_line.to_string();
59 }
60
61 Path::new(filename)
62 .file_stem()
63 .and_then(|s| s.to_str())
64 .map(|s| s.replace(['_', '-'], " "))
65 .unwrap_or_else(|| "Untitled PDF".to_string())
66 }
67
68 fn scan_directory(&self, base_path: &Path, pattern: &str) -> Result<Vec<PathBuf>> {
70 let glob_pattern = glob::Pattern::new(pattern)?;
71 let mut pdf_files = Vec::new();
72
73 for entry in WalkDir::new(base_path)
74 .follow_links(true)
75 .into_iter()
76 .filter_entry(|e| {
77 let name = e.file_name().to_string_lossy();
78 !name.starts_with('.')
79 && !matches!(
80 name.as_ref(),
81 "node_modules" | ".git" | ".cache" | "target" | "dist" | "build"
82 )
83 })
84 {
85 let entry = entry?;
86 if !entry.file_type().is_file() {
87 continue;
88 }
89
90 let path = entry.path();
91 if let Some(ext) = path.extension() {
92 if ext.eq_ignore_ascii_case("pdf") {
93 if let Ok(relative) = path.strip_prefix(base_path) {
94 let relative_str = relative.to_string_lossy();
95 if glob_pattern.matches(&relative_str) {
96 pdf_files.push(path.to_path_buf());
97 }
98 }
99 }
100 }
101 }
102
103 Ok(pdf_files)
104 }
105}
106
107#[async_trait]
108impl SourceProvider for PDFProvider {
109 fn provider_type(&self) -> &'static str {
110 "pdf"
111 }
112
113 async fn list_items(&self, config: &ProviderConfig) -> Result<Vec<SourceItem>> {
114 let base_path = Path::new(&config.base_path);
115 if !base_path.exists() {
116 return Err(AgentRootError::InvalidInput(format!(
117 "Path does not exist: {}",
118 config.base_path
119 )));
120 }
121
122 let pdf_files = if base_path.is_file() {
123 if base_path.extension().and_then(|e| e.to_str()) == Some("pdf") {
124 vec![base_path.to_path_buf()]
125 } else {
126 return Err(AgentRootError::InvalidInput(format!(
127 "File is not a PDF: {}",
128 config.base_path
129 )));
130 }
131 } else {
132 self.scan_directory(base_path, &config.pattern)?
133 };
134
135 let mut items = Vec::new();
136 for pdf_path in pdf_files {
137 match self.extract_text_from_pdf(&pdf_path) {
138 Ok(content) => {
139 let filename = pdf_path.to_string_lossy().to_string();
140 let title = self.extract_title(&content, &filename);
141 let hash = hash_content(&content);
142
143 let mut item =
144 SourceItem::new(filename.clone(), title, content, hash, "pdf".to_string());
145 item.metadata
146 .insert("file_path".to_string(), filename.clone());
147 if let Some(stem) = pdf_path.file_stem() {
148 item.metadata
149 .insert("filename".to_string(), stem.to_string_lossy().to_string());
150 }
151
152 items.push(item);
153 }
154 Err(e) => {
155 tracing::warn!("Skipping PDF {:?}: {}", pdf_path, e);
156 }
157 }
158 }
159
160 Ok(items)
161 }
162
163 async fn fetch_item(&self, uri: &str) -> Result<SourceItem> {
164 let path = Path::new(uri);
165 let content = self.extract_text_from_pdf(path)?;
166 let title = self.extract_title(&content, uri);
167 let hash = hash_content(&content);
168
169 let mut item = SourceItem::new(uri.to_string(), title, content, hash, "pdf".to_string());
170 item.metadata
171 .insert("file_path".to_string(), uri.to_string());
172 if let Some(stem) = path.file_stem() {
173 item.metadata
174 .insert("filename".to_string(), stem.to_string_lossy().to_string());
175 }
176
177 Ok(item)
178 }
179}
180
#[cfg(test)]
mod tests {
    use super::*;

    /// The provider reports the fixed type identifier `"pdf"`.
    #[test]
    fn test_provider_type() {
        assert_eq!(PDFProvider::new().provider_type(), "pdf");
    }

    /// Leading blank lines are skipped and the first non-empty line is used.
    #[test]
    fn test_extract_title_from_content() {
        let text = " \n\nDocument Title\n\nSome content here...";
        let derived = PDFProvider::new().extract_title(text, "test.pdf");
        assert_eq!(derived, "Document Title");
    }

    /// With no content, the file stem is used and underscores become spaces.
    #[test]
    fn test_extract_title_from_filename() {
        let derived = PDFProvider::new().extract_title("", "my_important_document.pdf");
        assert_eq!(derived, "my important document");
    }

    /// With no content, dashes in the file stem become spaces as well.
    #[test]
    fn test_extract_title_with_dashes() {
        let derived = PDFProvider::new().extract_title("", "user-guide-v2.pdf");
        assert_eq!(derived, "user guide v2");
    }

    /// A first line of 250 bytes is too long for a title, so the file stem wins.
    #[test]
    fn test_extract_title_long_first_line() {
        let text = format!("{}\n\nMore content", "a".repeat(250));
        let derived = PDFProvider::new().extract_title(&text, "document.pdf");
        assert_eq!(derived, "document");
    }
}