Skip to main content

cognis_rag/loaders/
mod.rs

//! Document loaders — read sources into [`Document`]s.
//!
//! Each loader returns either:
//! - `Result<Vec<Document>>` — for small, bounded sources (one file).
//! - `RunnableStream<Document>` — for unbounded / large sources (a directory walk).
//!
//! The unifying trait is [`DocumentLoader`]: its async `load()` resolves to a
//! `Result<DocumentStream>`, where [`DocumentStream`] is a boxed
//! `Stream<Item = Result<Document>>`, so callers can compose with `futures`
//! combinators (e.g. `.take(100)`, `.filter`).
10
11use async_trait::async_trait;
12use futures::Stream;
13
14use cognis_core::Result;
15
16use crate::document::Document;
17
18#[cfg(feature = "csv-loader")]
19pub mod csv_loader;
20pub mod directory;
21#[cfg(feature = "html-loader")]
22pub mod html;
23pub mod json;
24pub mod markdown;
25#[cfg(feature = "pdf-loader")]
26pub mod pdf;
27pub mod text;
28#[cfg(feature = "toml-loader")]
29pub mod toml_loader;
30#[cfg(feature = "web-loader")]
31pub mod web;
32#[cfg(feature = "yaml-loader")]
33pub mod yaml;
34
35#[cfg(feature = "csv-loader")]
36pub use csv_loader::CsvLoader;
37pub use directory::DirectoryLoader;
38#[cfg(feature = "html-loader")]
39pub use html::HtmlLoader;
40pub use json::JsonLoader;
41pub use markdown::MarkdownLoader;
42#[cfg(feature = "pdf-loader")]
43pub use pdf::PdfLoader;
44pub use text::TextLoader;
45#[cfg(feature = "toml-loader")]
46pub use toml_loader::TomlLoader;
47#[cfg(feature = "web-loader")]
48pub use web::WebLoader;
49#[cfg(feature = "yaml-loader")]
50pub use yaml::YamlLoader;
51
52/// A document loader.
53///
54/// Implementations stream documents — preferred over collecting into a
55/// `Vec` so callers can early-terminate large sources.
56#[async_trait]
57pub trait DocumentLoader: Send + Sync {
58    /// Stream of documents. Errors are per-item so a partial-failure source
59    /// can still yield successful items.
60    async fn load(&self) -> Result<DocumentStream>;
61
62    /// Convenience: collect every yielded document. Stops at the first error.
63    async fn load_all(&self) -> Result<Vec<Document>> {
64        use futures::StreamExt;
65        let mut s = self.load().await?;
66        let mut out = Vec::new();
67        while let Some(doc) = s.next().await {
68            out.push(doc?);
69        }
70        Ok(out)
71    }
72}
73
74/// Boxed stream returned by [`DocumentLoader::load`].
75pub type DocumentStream = std::pin::Pin<Box<dyn Stream<Item = Result<Document>> + Send>>;