mini_rag/
document.rs

1use std::{io::Read, path::Path};
2
3use anyhow::Result;
4
5use rayon::prelude::*;
6use serde::{Deserialize, Serialize};
7
8#[derive(Clone, Debug, Serialize, Deserialize)]
9pub struct Document {
10    path: String,
11    ident: String,
12    #[serde(skip_deserializing, skip_serializing)]
13    data: Option<String>,
14}
15
16impl Document {
17    pub fn from_reader<R>(path: &Path, mut reader: R) -> Result<Self>
18    where
19        R: Read,
20    {
21        let path = std::fs::canonicalize(path.display().to_string())?
22            .display()
23            .to_string();
24
25        let mut data = String::new();
26
27        reader.read_to_string(&mut data)?;
28
29        let data = Some(data);
30        let ident = sha256::digest(data.as_ref().unwrap());
31        Ok(Self { path, data, ident })
32    }
33
34    pub fn get_ident(&self) -> &str {
35        &self.ident
36    }
37
38    pub fn get_path(&self) -> &str {
39        &self.path
40    }
41
42    pub fn get_data(&mut self) -> Result<&str> {
43        if self.data.is_none() {
44            self.data = Some(std::fs::read_to_string(&self.path)?);
45        }
46
47        Ok(self.data.as_ref().unwrap())
48    }
49
50    pub fn drop_data(&mut self) {
51        self.data = None;
52    }
53
54    pub fn get_byte_size(&mut self) -> Result<usize> {
55        Ok(self.get_data()?.as_bytes().len())
56    }
57
58    pub fn chunks(mut self, chunk_size: usize) -> Result<Vec<Document>> {
59        return Ok(self
60            .get_data()?
61            .chars()
62            .collect::<Vec<char>>()
63            .par_chunks(chunk_size)
64            .enumerate()
65            .map(|(idx, chunk)| Document {
66                ident: format!("{}@{}", self.ident, idx),
67                path: format!("{}", self.path),
68                data: Some(chunk.iter().collect::<String>()),
69            })
70            .collect());
71    }
72}