Skip to main content

stygian_graph/ports/
document_source.rs

1//! Document source port — read local files as pipeline data sources.
2//!
3//! Defines the [`DocumentSourcePort`] trait for reading documents (CSV, JSON,
4//! Markdown, plain text, etc.) from the local file system and returning their
5//! content for downstream processing.
6//!
7//! # Architecture
8//!
9//! ```text
10//! stygian-graph
11//!   ├─ DocumentSourcePort (this file)      ← always compiled
12//!   └─ Adapters (adapters/)
13//!        └─ DocumentSource                 → std::fs / tokio::fs
14//! ```
15//!
16//! # Example
17//!
18//! ```no_run
19//! use stygian_graph::ports::document_source::{DocumentSourcePort, DocumentQuery};
20//! use std::path::PathBuf;
21//!
22//! async fn read_docs<D: DocumentSourcePort>(source: &D) {
23//!     let query = DocumentQuery {
24//!         path: PathBuf::from("data/input.csv"),
25//!         recursive: false,
26//!         glob_pattern: None,
27//!     };
28//!     let docs = source.read_documents(query).await.unwrap();
29//!     for doc in &docs {
30//!         println!("{}: {} bytes", doc.path.display(), doc.content.len());
31//!     }
32//! }
33//! ```
34
35use crate::domain::error::Result;
36use async_trait::async_trait;
37use serde::{Deserialize, Serialize};
38use std::path::PathBuf;
39
40// ─────────────────────────────────────────────────────────────────────────────
41// Document / DocumentQuery
42// ─────────────────────────────────────────────────────────────────────────────
43
44/// Describes which files a [`DocumentSourcePort`] should read: a path, a recursion flag, and an optional glob filter.
45///
46/// # Example
47///
48/// ```
49/// use stygian_graph::ports::document_source::DocumentQuery;
50/// use std::path::PathBuf;
51///
52/// let query = DocumentQuery {
53///     path: PathBuf::from("data/"),
54///     recursive: true,
55///     glob_pattern: Some("*.csv".into()),
56/// };
57/// ```
58#[derive(Debug, Clone, Serialize, Deserialize)]
59pub struct DocumentQuery {
60    /// File or directory to read from.
61    pub path: PathBuf,
62    /// Whether to recurse into subdirectories when `path` is a directory.
63    pub recursive: bool,
64    /// Optional glob pattern restricting which files are read (e.g. `"*.json"`). NOTE(review): `None` presumably disables filtering — confirm against the adapter.
65    pub glob_pattern: Option<String>,
66}
67
68/// A single document read from the file system, together with its path and basic metadata.
69///
70/// # Example
71///
72/// ```
73/// use stygian_graph::ports::document_source::Document;
74/// use std::path::PathBuf;
75///
76/// let doc = Document {
77///     path: PathBuf::from("data/input.csv"),
78///     content: "id,name\n1,Alice\n".into(),
79///     mime_type: Some("text/csv".into()),
80///     size_bytes: 16,
81/// };
82/// assert_eq!(doc.size_bytes, 16);
83/// ```
84#[derive(Debug, Clone, Serialize, Deserialize)]
85pub struct Document {
86    /// Path of the file this document was read from (absolute or relative).
87    pub path: PathBuf,
88    /// File content as text; binary files should be base64-encoded.
89    pub content: String,
90    /// Detected or inferred MIME type, e.g. `"text/csv"`.
91    pub mime_type: Option<String>,
92    /// File size in bytes. NOTE(review): for base64-encoded binaries it is unclear whether this is the on-disk size or `content.len()` — confirm.
93    pub size_bytes: u64,
94}
95
96// ─────────────────────────────────────────────────────────────────────────────
97// DocumentSourcePort
98// ─────────────────────────────────────────────────────────────────────────────
99
100/// Port: read documents from the local file system; implementors must be `Send + Sync`.
101///
102/// Implementations handle file enumeration, glob filtering, and content
103/// reading. Binary files (PDFs, images) should be base64-encoded in the
104/// [`Document::content`] field and can be further processed by the multimodal adapter.
105///
106/// # Example
107///
108/// ```no_run
109/// use stygian_graph::ports::document_source::{DocumentSourcePort, DocumentQuery, Document};
110/// use stygian_graph::domain::error::Result;
111/// use async_trait::async_trait;
112/// use std::path::PathBuf;
113///
114/// struct MockDocs;
115///
116/// #[async_trait]
117/// impl DocumentSourcePort for MockDocs {
118///     async fn read_documents(&self, query: DocumentQuery) -> Result<Vec<Document>> {
119///         Ok(vec![Document {
120///             path: query.path,
121///             content: "hello".into(),
122///             mime_type: Some("text/plain".into()),
123///             size_bytes: 5,
124///         }])
125///     }
126///
127///     fn source_name(&self) -> &str {
128///         "mock-docs"
129///     }
130/// }
131/// ```
132#[async_trait]
133pub trait DocumentSourcePort: Send + Sync {
134    /// Read every document selected by `query` and return it together with its content.
135    ///
136    /// # Arguments
137    ///
138    /// * `query` - Path, recursion flag, and optional glob filter (see [`DocumentQuery`])
139    ///
140    /// # Returns
141    ///
142    /// * `Ok(Vec<Document>)` - Matched documents with content
143    /// * `Err(StygianError)` - I/O or permission error
144    ///
145    /// # Example
146    ///
147    /// ```no_run
148    /// # use stygian_graph::ports::document_source::{DocumentSourcePort, DocumentQuery};
149    /// # use std::path::PathBuf;
150    /// # async fn example(source: impl DocumentSourcePort) {
151    /// let query = DocumentQuery {
152    ///     path: PathBuf::from("data/report.json"),
153    ///     recursive: false,
154    ///     glob_pattern: None,
155    /// };
156    /// let docs = source.read_documents(query).await.unwrap();
157    /// # }
158    /// ```
159    async fn read_documents(&self, query: DocumentQuery) -> Result<Vec<Document>>;
160
161    /// Human-readable name of this document source (e.g. `"mock-docs"`).
162    ///
163    /// # Example
164    ///
165    /// ```no_run
166    /// # use stygian_graph::ports::document_source::DocumentSourcePort;
167    /// # fn example(source: impl DocumentSourcePort) {
168    /// println!("Source: {}", source.source_name());
169    /// # }
170    /// ```
171    fn source_name(&self) -> &str;
172}