stygian_graph/ports/document_source.rs
1//! Document source port — read local files as pipeline data sources.
2//!
3//! Defines the [`DocumentSourcePort`] trait for reading documents (CSV, JSON,
4//! Markdown, plain text, etc.) from the local file system and returning their
5//! content for downstream processing.
6//!
7//! # Architecture
8//!
//! ```text
//! stygian-graph
//!   ├─ DocumentSourcePort (this file)   ← always compiled
//!   └─ Adapters (adapters/)
//!        └─ DocumentSource  →  std::fs / tokio::fs
//! ```
15//!
16//! # Example
17//!
//! ```no_run
//! use stygian_graph::ports::document_source::{DocumentSourcePort, DocumentQuery};
//! use std::path::PathBuf;
//!
//! async fn read_docs<D: DocumentSourcePort>(source: &D) {
//!     let query = DocumentQuery {
//!         path: PathBuf::from("data/input.csv"),
//!         recursive: false,
//!         glob_pattern: None,
//!     };
//!     let docs = source.read_documents(query).await.unwrap();
//!     for doc in &docs {
//!         println!("{}: {} bytes", doc.path.display(), doc.content.len());
//!     }
//! }
//! ```
34
35use crate::domain::error::Result;
36use async_trait::async_trait;
37use serde::{Deserialize, Serialize};
38use std::path::PathBuf;
39
40// ─────────────────────────────────────────────────────────────────────────────
41// Document / DocumentQuery
42// ─────────────────────────────────────────────────────────────────────────────
43
44/// Query parameters for reading documents from the file system.
45///
46/// # Example
47///
48/// ```
49/// use stygian_graph::ports::document_source::DocumentQuery;
50/// use std::path::PathBuf;
51///
52/// let query = DocumentQuery {
53/// path: PathBuf::from("data/"),
54/// recursive: true,
55/// glob_pattern: Some("*.csv".into()),
56/// };
57/// ```
58#[derive(Debug, Clone, Serialize, Deserialize)]
59pub struct DocumentQuery {
60 /// Path to a file or directory
61 pub path: PathBuf,
62 /// If `path` is a directory, whether to recurse into subdirectories
63 pub recursive: bool,
64 /// Optional glob pattern to filter files (e.g. `"*.json"`)
65 pub glob_pattern: Option<String>,
66}
67
68/// A document read from the file system.
69///
70/// # Example
71///
72/// ```
73/// use stygian_graph::ports::document_source::Document;
74/// use std::path::PathBuf;
75///
76/// let doc = Document {
77/// path: PathBuf::from("data/input.csv"),
78/// content: "id,name\n1,Alice\n".into(),
79/// mime_type: Some("text/csv".into()),
80/// size_bytes: 17,
81/// };
82/// assert_eq!(doc.size_bytes, 17);
83/// ```
84#[derive(Debug, Clone, Serialize, Deserialize)]
85pub struct Document {
86 /// Absolute or relative path to the file
87 pub path: PathBuf,
88 /// File content (text; binary files should be base64-encoded)
89 pub content: String,
90 /// Detected or inferred MIME type
91 pub mime_type: Option<String>,
92 /// File size in bytes
93 pub size_bytes: u64,
94}
95
96// ─────────────────────────────────────────────────────────────────────────────
97// DocumentSourcePort
98// ─────────────────────────────────────────────────────────────────────────────
99
100/// Port: read documents from the local file system.
101///
102/// Implementations handle file enumeration, glob filtering, and content
103/// reading. Binary files (PDFs, images) should be base64-encoded in the
104/// `content` field and can be further processed by the multimodal adapter.
105///
106/// # Example
107///
108/// ```no_run
109/// use stygian_graph::ports::document_source::{DocumentSourcePort, DocumentQuery, Document};
110/// use stygian_graph::domain::error::Result;
111/// use async_trait::async_trait;
112/// use std::path::PathBuf;
113///
114/// struct MockDocs;
115///
116/// #[async_trait]
117/// impl DocumentSourcePort for MockDocs {
118/// async fn read_documents(&self, query: DocumentQuery) -> Result<Vec<Document>> {
119/// Ok(vec![Document {
120/// path: query.path,
121/// content: "hello".into(),
122/// mime_type: Some("text/plain".into()),
123/// size_bytes: 5,
124/// }])
125/// }
126///
127/// fn source_name(&self) -> &str {
128/// "mock-docs"
129/// }
130/// }
131/// ```
132#[async_trait]
133pub trait DocumentSourcePort: Send + Sync {
134 /// Read documents matching the query.
135 ///
136 /// # Arguments
137 ///
138 /// * `query` - Path, recursion flag, and optional glob filter
139 ///
140 /// # Returns
141 ///
142 /// * `Ok(Vec<Document>)` - Matched documents with content
143 /// * `Err(StygianError)` - I/O or permission error
144 ///
145 /// # Example
146 ///
147 /// ```no_run
148 /// # use stygian_graph::ports::document_source::{DocumentSourcePort, DocumentQuery};
149 /// # use std::path::PathBuf;
150 /// # async fn example(source: impl DocumentSourcePort) {
151 /// let query = DocumentQuery {
152 /// path: PathBuf::from("data/report.json"),
153 /// recursive: false,
154 /// glob_pattern: None,
155 /// };
156 /// let docs = source.read_documents(query).await.unwrap();
157 /// # }
158 /// ```
159 async fn read_documents(&self, query: DocumentQuery) -> Result<Vec<Document>>;
160
161 /// Human-readable name of this document source.
162 ///
163 /// # Example
164 ///
165 /// ```no_run
166 /// # use stygian_graph::ports::document_source::DocumentSourcePort;
167 /// # fn example(source: impl DocumentSourcePort) {
168 /// println!("Source: {}", source.source_name());
169 /// # }
170 /// ```
171 fn source_name(&self) -> &str;
172}