cognis_rag/loaders/
text.rs1use std::path::{Path, PathBuf};
4
5use async_trait::async_trait;
6use futures::stream;
7
8use cognis_core::{CognisError, Result};
9
10use crate::document::Document;
11
12use super::{DocumentLoader, DocumentStream};
13
/// Loader that reads one file from disk and yields its contents as a single
/// [`Document`].
///
/// The file's bytes are decoded as UTF-8. Decoding is strict by default, so a
/// file containing invalid UTF-8 makes loading fail; [`TextLoader::lossy`]
/// switches to lossy decoding instead.
#[derive(Debug, Clone)]
pub struct TextLoader {
    /// Location of the file to read.
    path: PathBuf,
    /// When `true`, invalid UTF-8 is reported as an error; when `false`,
    /// invalid sequences are replaced during decoding.
    encoding_check: bool,
}
21
22impl TextLoader {
23 pub fn new(path: impl AsRef<Path>) -> Self {
25 Self {
26 path: path.as_ref().to_path_buf(),
27 encoding_check: true,
28 }
29 }
30
31 pub fn lossy(mut self) -> Self {
33 self.encoding_check = false;
34 self
35 }
36}
37
38#[async_trait]
39impl DocumentLoader for TextLoader {
40 async fn load(&self) -> Result<DocumentStream> {
41 let bytes = tokio::fs::read(&self.path).await.map_err(|e| {
42 CognisError::Configuration(format!("TextLoader: read `{}`: {e}", self.path.display()))
43 })?;
44 let content = if self.encoding_check {
45 String::from_utf8(bytes).map_err(|e| {
46 CognisError::Serialization(format!(
47 "TextLoader: `{}` is not valid UTF-8: {e}",
48 self.path.display()
49 ))
50 })?
51 } else {
52 String::from_utf8_lossy(&bytes).into_owned()
53 };
54
55 let doc = Document::new(content).with_metadata("source", self.path.display().to_string());
56 Ok(Box::pin(stream::iter(vec![Ok(doc)])))
57 }
58}
59
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// A valid UTF-8 file loads as one document carrying `source` metadata.
    #[tokio::test]
    async fn loads_text_file() {
        let mut f = NamedTempFile::new().unwrap();
        writeln!(f, "hello").unwrap();
        let loader = TextLoader::new(f.path());
        let docs = loader.load_all().await.unwrap();
        assert_eq!(docs.len(), 1);
        assert!(docs[0].content.contains("hello"));
        assert!(docs[0].metadata.contains_key("source"));
    }

    /// A path that does not exist surfaces an error instead of panicking.
    #[tokio::test]
    async fn missing_file_errors() {
        let loader = TextLoader::new("/no/such/file/__test__");
        assert!(loader.load_all().await.is_err());
    }

    /// With the default strict mode, invalid UTF-8 bytes are rejected.
    #[tokio::test]
    async fn invalid_utf8_errors_by_default() {
        let mut f = NamedTempFile::new().unwrap();
        f.write_all(b"ok \xff\xfe end").unwrap();
        let loader = TextLoader::new(f.path());
        assert!(loader.load_all().await.is_err());
    }

    /// In lossy mode, invalid bytes become U+FFFD and the valid text around
    /// them is kept.
    #[tokio::test]
    async fn lossy_replaces_invalid_utf8() {
        let mut f = NamedTempFile::new().unwrap();
        f.write_all(b"ok \xff\xfe end").unwrap();
        let loader = TextLoader::new(f.path()).lossy();
        let docs = loader.load_all().await.unwrap();
        assert_eq!(docs.len(), 1);
        assert!(docs[0].content.starts_with("ok "));
        assert!(docs[0].content.contains('\u{FFFD}'));
        assert!(docs[0].content.ends_with(" end"));
    }
}