Skip to main content

cognis_rag/loaders/
text.rs

1//! Plain-text file loader.
2
3use std::path::{Path, PathBuf};
4
5use async_trait::async_trait;
6use futures::stream;
7
8use cognis_core::{CognisError, Result};
9
10use crate::document::Document;
11
12use super::{DocumentLoader, DocumentStream};
13
/// Loads a single UTF-8 text file as one [`Document`].
///
/// The path is stored exactly as it was handed to [`TextLoader::new`]; it is
/// not canonicalized or made absolute. On load, the path's display form is
/// recorded under the `source` metadata key of the produced document.
#[derive(Debug, Clone)]
pub struct TextLoader {
    /// Location of the file to read, kept as given by the caller.
    path: PathBuf,
    /// When `true`, the file must be valid UTF-8 or loading fails; when
    /// `false`, invalid byte sequences are decoded lossily instead.
    encoding_check: bool,
}
21
22impl TextLoader {
23    /// Construct a loader for the file at `path`.
24    pub fn new(path: impl AsRef<Path>) -> Self {
25        Self {
26            path: path.as_ref().to_path_buf(),
27            encoding_check: true,
28        }
29    }
30
31    /// Skip the UTF-8 validity check (loader will lossily decode).
32    pub fn lossy(mut self) -> Self {
33        self.encoding_check = false;
34        self
35    }
36}
37
38#[async_trait]
39impl DocumentLoader for TextLoader {
40    async fn load(&self) -> Result<DocumentStream> {
41        let bytes = tokio::fs::read(&self.path).await.map_err(|e| {
42            CognisError::Configuration(format!("TextLoader: read `{}`: {e}", self.path.display()))
43        })?;
44        let content = if self.encoding_check {
45            String::from_utf8(bytes).map_err(|e| {
46                CognisError::Serialization(format!(
47                    "TextLoader: `{}` is not valid UTF-8: {e}",
48                    self.path.display()
49                ))
50            })?
51        } else {
52            String::from_utf8_lossy(&bytes).into_owned()
53        };
54
55        let doc = Document::new(content).with_metadata("source", self.path.display().to_string());
56        Ok(Box::pin(stream::iter(vec![Ok(doc)])))
57    }
58}
59
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Create a temporary file whose contents are exactly `bytes`.
    fn temp_file_with(bytes: &[u8]) -> NamedTempFile {
        let mut f = NamedTempFile::new().unwrap();
        f.write_all(bytes).unwrap();
        f.flush().unwrap();
        f
    }

    /// A valid UTF-8 file yields exactly one document carrying its text and
    /// a `source` metadata entry.
    #[tokio::test]
    async fn loads_text_file() {
        let f = temp_file_with(b"hello\n");
        let loader = TextLoader::new(f.path());
        let docs = loader.load_all().await.unwrap();
        assert_eq!(docs.len(), 1);
        assert!(docs[0].content.contains("hello"));
        assert!(docs[0].metadata.contains_key("source"));
    }

    /// A path that does not exist surfaces as an error, not a panic.
    #[tokio::test]
    async fn missing_file_errors() {
        let loader = TextLoader::new("/no/such/file/__test__");
        assert!(loader.load_all().await.is_err());
    }

    /// With the default (strict) setting, invalid UTF-8 is rejected.
    #[tokio::test]
    async fn invalid_utf8_errors_by_default() {
        let f = temp_file_with(b"ok\xFFend");
        let loader = TextLoader::new(f.path());
        assert!(loader.load_all().await.is_err());
    }

    /// In lossy mode, invalid bytes become U+FFFD and the rest is preserved.
    #[tokio::test]
    async fn lossy_replaces_invalid_bytes() {
        let f = temp_file_with(b"ok\xFFend");
        let loader = TextLoader::new(f.path()).lossy();
        let docs = loader.load_all().await.unwrap();
        assert_eq!(docs.len(), 1);
        assert!(docs[0].content.contains("ok"));
        assert!(docs[0].content.contains("end"));
        assert!(docs[0].content.contains('\u{FFFD}'));
    }
}