stygian_graph/adapters/
document.rs1use async_trait::async_trait;
26use serde_json::{Value, json};
27use std::path::{Path, PathBuf};
28use tokio::fs;
29
30use crate::domain::error::{Result, ServiceError, StygianError};
31use crate::ports::document_source::{Document, DocumentQuery, DocumentSourcePort};
32use crate::ports::{ScrapingService, ServiceInput, ServiceOutput};
33
/// Filesystem-backed document source.
///
/// The type carries no state. Its single private unit field prevents
/// construction by struct literal outside this module, so callers go through
/// [`DocumentSource::new`] or [`Default`].
#[derive(Debug, Clone)]
pub struct DocumentSource {
    _priv: (),
}
54
55impl DocumentSource {
56 #[must_use]
66 pub const fn new() -> Self {
67 Self { _priv: () }
68 }
69
70 #[must_use]
72 pub const fn source_name(&self) -> &'static str {
73 "filesystem"
74 }
75
76 async fn collect_paths(query: &DocumentQuery) -> Result<Vec<PathBuf>> {
78 let meta = fs::metadata(&query.path).await.map_err(|e| {
79 StygianError::Service(ServiceError::Unavailable(format!(
80 "cannot access {}: {e}",
81 query.path.display()
82 )))
83 })?;
84
85 if meta.is_file() {
86 return Ok(vec![query.path.clone()]);
87 }
88
89 let mut paths = Vec::new();
90 Self::walk_dir(
91 &query.path,
92 query.recursive,
93 query.glob_pattern.as_deref(),
94 &mut paths,
95 )
96 .await?;
97 Ok(paths)
98 }
99
100 async fn walk_dir(
102 dir: &Path,
103 recursive: bool,
104 glob: Option<&str>,
105 out: &mut Vec<PathBuf>,
106 ) -> Result<()> {
107 let mut entries = fs::read_dir(dir).await.map_err(|e| {
108 StygianError::Service(ServiceError::Unavailable(format!(
109 "cannot read directory {}: {e}",
110 dir.display()
111 )))
112 })?;
113
114 while let Some(entry) = entries.next_entry().await.map_err(|e| {
115 StygianError::Service(ServiceError::Unavailable(format!("readdir error: {e}")))
116 })? {
117 let path = entry.path();
118 let ft = entry.file_type().await.map_err(|e| {
119 StygianError::Service(ServiceError::Unavailable(format!("file_type error: {e}")))
120 })?;
121
122 if ft.is_symlink() {
123 match fs::metadata(&path).await {
125 Ok(meta) if meta.is_dir() && recursive => {
126 Box::pin(Self::walk_dir(&path, recursive, glob, out)).await?;
127 }
128 Ok(meta) if meta.is_file() => {
129 if let Some(pattern) = glob {
130 if let Some(name) = path.file_name().and_then(|n| n.to_str())
131 && Self::glob_matches(pattern, name)
132 {
133 out.push(path);
134 }
135 } else {
136 out.push(path);
137 }
138 }
139 _ => {}
141 }
142 } else if ft.is_dir() && recursive {
143 Box::pin(Self::walk_dir(&path, recursive, glob, out)).await?;
144 } else if ft.is_file() {
145 if let Some(pattern) = glob {
146 if let Some(name) = path.file_name().and_then(|n| n.to_str())
147 && Self::glob_matches(pattern, name)
148 {
149 out.push(path);
150 }
151 } else {
152 out.push(path);
153 }
154 }
155 }
156
157 Ok(())
158 }
159
/// Matches a file name against a simple, ASCII-case-insensitive glob.
///
/// `*` matches any run of characters (including none) and may appear
/// anywhere in the pattern, any number of times: `*`, `*.csv`, `report*`,
/// `*_report.csv`, `data*.json` and `*2024*` all work. Every other
/// character is matched literally; there is no `?` or `[...]` support.
/// A pattern without `*` must equal the whole name.
fn glob_matches(pattern: &str, name: &str) -> bool {
    let pattern = pattern.to_ascii_lowercase();
    let name = name.to_ascii_lowercase();

    // The literal pieces between the stars. `split` always yields at least
    // one piece, so `first` is the text before the first `*` (possibly empty).
    let mut pieces = pattern.split('*');
    let first = pieces.next().unwrap_or("");
    let Some(mut rest) = name.strip_prefix(first) else {
        return false;
    };

    // No further piece means the pattern had no `*`: require an exact match.
    let Some(last) = pieces.next_back() else {
        return rest.is_empty();
    };

    // Middle pieces must appear in order; taking the leftmost occurrence
    // leaves the most room for the pieces that follow.
    for piece in pieces {
        match rest.split_once(piece) {
            Some((_, after)) => rest = after,
            None => return false,
        }
    }

    // Whatever is left must end with the text after the final `*`.
    rest.ends_with(last)
}
178
/// Guesses a MIME type from the extension of `path`.
///
/// The extension is compared ASCII-case-insensitively against a fixed
/// table. Returns `None` when the path has no extension, the extension is
/// not valid UTF-8, or it is not in the table.
fn infer_mime(path: &Path) -> Option<String> {
    /// Known extensions (lowercase) and the MIME type reported for each.
    const MIME_BY_EXTENSION: &[(&str, &str)] = &[
        ("json", "application/json"),
        ("csv", "text/csv"),
        ("xml", "application/xml"),
        ("html", "text/html"),
        ("htm", "text/html"),
        ("md", "text/markdown"),
        ("markdown", "text/markdown"),
        ("txt", "text/plain"),
        ("yaml", "application/yaml"),
        ("yml", "application/yaml"),
        ("toml", "application/toml"),
        ("pdf", "application/pdf"),
        ("png", "image/png"),
        ("jpg", "image/jpeg"),
        ("jpeg", "image/jpeg"),
        ("gif", "image/gif"),
        ("svg", "image/svg+xml"),
    ];

    let extension = path
        .extension()
        .and_then(|raw| raw.to_str())?
        .to_ascii_lowercase();

    MIME_BY_EXTENSION
        .iter()
        .find(|(known, _)| *known == extension.as_str())
        .map(|(_, mime)| (*mime).to_string())
}
204}
205
206impl Default for DocumentSource {
207 fn default() -> Self {
208 Self::new()
209 }
210}
211
212#[async_trait]
217impl DocumentSourcePort for DocumentSource {
218 async fn read_documents(&self, query: DocumentQuery) -> Result<Vec<Document>> {
219 let paths = Self::collect_paths(&query).await?;
220 let mut docs = Vec::with_capacity(paths.len());
221
222 for path in paths {
223 let content = fs::read_to_string(&path).await.map_err(|e| {
224 StygianError::Service(ServiceError::InvalidResponse(format!(
225 "cannot read {}: {e}",
226 path.display()
227 )))
228 })?;
229 let size_bytes = content.len() as u64;
230 let mime_type = Self::infer_mime(&path);
231
232 docs.push(Document {
233 path,
234 content,
235 mime_type,
236 size_bytes,
237 });
238 }
239
240 Ok(docs)
241 }
242
243 fn source_name(&self) -> &'static str {
244 "filesystem"
245 }
246}
247
248#[async_trait]
253impl ScrapingService for DocumentSource {
254 async fn execute(&self, input: ServiceInput) -> Result<ServiceOutput> {
277 let path_str = input
278 .params
279 .get("path")
280 .and_then(Value::as_str)
281 .ok_or_else(|| {
282 StygianError::Service(ServiceError::InvalidResponse(
283 "missing 'path' in params".into(),
284 ))
285 })?;
286
287 let query = DocumentQuery {
288 path: PathBuf::from(path_str),
289 recursive: input
290 .params
291 .get("recursive")
292 .and_then(Value::as_bool)
293 .unwrap_or(false),
294 glob_pattern: input
295 .params
296 .get("glob_pattern")
297 .and_then(Value::as_str)
298 .map(String::from),
299 };
300
301 let docs = self.read_documents(query).await?;
302 let doc_count = docs.len();
303
304 Ok(ServiceOutput {
305 data: serde_json::to_string(&docs).unwrap_or_default(),
306 metadata: json!({
307 "source": "filesystem",
308 "document_count": doc_count,
309 }),
310 })
311 }
312
313 fn name(&self) -> &'static str {
314 "document"
315 }
316}
317
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks `glob_matches(pattern, name)` against `expected` for each case.
    fn check_glob(cases: &[(&str, &str, bool)]) {
        for &(pattern, name, expected) in cases {
            assert_eq!(
                DocumentSource::glob_matches(pattern, name),
                expected,
                "pattern {pattern:?} against name {name:?}"
            );
        }
    }

    /// Shorthand for running `infer_mime` on a bare file name.
    fn mime_of(file_name: &str) -> Option<String> {
        DocumentSource::infer_mime(Path::new(file_name))
    }

    #[test]
    fn glob_matches_exact() {
        check_glob(&[
            ("report.csv", "report.csv", true),
            ("report.csv", "other.csv", false),
        ]);
    }

    #[test]
    fn glob_matches_extension() {
        check_glob(&[("*.csv", "data.csv", true), ("*.csv", "data.json", false)]);
    }

    #[test]
    fn glob_matches_case_insensitive() {
        check_glob(&[
            ("*.JSON", "data.json", true),
            ("*.csv", "DATA.CSV", true),
            ("Report*", "report_2024.csv", true),
        ]);
    }

    #[test]
    fn glob_matches_prefix() {
        check_glob(&[
            ("report*", "report_2024.csv", true),
            ("report*", "data.csv", false),
        ]);
    }

    #[test]
    fn glob_matches_star() {
        check_glob(&[("*", "anything.txt", true)]);
    }

    #[test]
    fn infer_mime_known_types() {
        let expectations = [
            ("data.json", "application/json"),
            ("data.csv", "text/csv"),
            ("doc.pdf", "application/pdf"),
        ];
        for (file_name, mime) in expectations {
            assert_eq!(mime_of(file_name), Some(mime.to_string()));
        }
    }

    #[test]
    fn infer_mime_case_insensitive() {
        let expectations = [
            ("DATA.JSON", "application/json"),
            ("photo.JPG", "image/jpeg"),
        ];
        for (file_name, mime) in expectations {
            assert_eq!(mime_of(file_name), Some(mime.to_string()));
        }
    }

    #[test]
    fn infer_mime_unknown() {
        assert!(mime_of("data.xyz").is_none());
    }

    #[tokio::test]
    async fn read_nonexistent_path_returns_error() {
        let missing = DocumentQuery {
            path: PathBuf::from("/nonexistent/path/that/does/not/exist"),
            recursive: false,
            glob_pattern: None,
        };
        let outcome = DocumentSource::new().read_documents(missing).await;
        assert!(outcome.is_err());
    }
}