1use std::{io::Read, path::Path};
2
3use anyhow::Result;
4
5use rayon::prelude::*;
6use serde::{Deserialize, Serialize};
7
8#[derive(Clone, Debug, Serialize, Deserialize)]
9pub struct Document {
10 path: String,
11 ident: String,
12 #[serde(skip_deserializing, skip_serializing)]
13 data: Option<String>,
14}
15
16impl Document {
17 pub fn from_reader<R>(path: &Path, mut reader: R) -> Result<Self>
18 where
19 R: Read,
20 {
21 let path = std::fs::canonicalize(path.display().to_string())?
22 .display()
23 .to_string();
24
25 let mut data = String::new();
26
27 reader.read_to_string(&mut data)?;
28
29 let data = Some(data);
30 let ident = sha256::digest(data.as_ref().unwrap());
31 Ok(Self { path, data, ident })
32 }
33
34 pub fn get_ident(&self) -> &str {
35 &self.ident
36 }
37
38 pub fn get_path(&self) -> &str {
39 &self.path
40 }
41
42 pub fn get_data(&mut self) -> Result<&str> {
43 if self.data.is_none() {
44 self.data = Some(std::fs::read_to_string(&self.path)?);
45 }
46
47 Ok(self.data.as_ref().unwrap())
48 }
49
50 pub fn drop_data(&mut self) {
51 self.data = None;
52 }
53
54 pub fn get_byte_size(&mut self) -> Result<usize> {
55 Ok(self.get_data()?.as_bytes().len())
56 }
57
58 pub fn chunks(mut self, chunk_size: usize) -> Result<Vec<Document>> {
59 return Ok(self
60 .get_data()?
61 .chars()
62 .collect::<Vec<char>>()
63 .par_chunks(chunk_size)
64 .enumerate()
65 .map(|(idx, chunk)| Document {
66 ident: format!("{}@{}", self.ident, idx),
67 path: format!("{}", self.path),
68 data: Some(chunk.iter().collect::<String>()),
69 })
70 .collect());
71 }
72}