1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
use std::{io::Read, path::Path};

use anyhow::Result;

use rayon::prelude::*;
use serde::{Deserialize, Serialize};

/// A text document identified by a content digest and the path it was loaded from.
///
/// Only `path` and `ident` take part in (de)serialization; the text itself is
/// kept out of the serialized form and is `None` on a freshly deserialized value.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct Document {
    /// Location of the document, stored as a string.
    path: String,
    /// Identifier of the document.
    ident: String,
    /// In-memory copy of the text, if currently loaded. Never serialized, and
    /// defaulted to `None` when deserializing.
    #[serde(skip)]
    data: Option<String>,
}

impl Document {
    pub fn from_reader<R>(path: &Path, mut reader: R) -> Result<Self>
    where
        R: Read,
    {
        let path = std::fs::canonicalize(path.display().to_string())?
            .display()
            .to_string();

        let mut data = String::new();

        reader.read_to_string(&mut data)?;

        let data = Some(data);
        let ident = sha256::digest(data.as_ref().unwrap());
        Ok(Self { path, data, ident })
    }

    pub fn get_ident(&self) -> &str {
        &self.ident
    }

    pub fn get_path(&self) -> &str {
        &self.path
    }

    pub fn get_data(&mut self) -> Result<&str> {
        if self.data.is_none() {
            self.data = Some(std::fs::read_to_string(&self.path)?);
        }

        Ok(self.data.as_ref().unwrap())
    }

    pub fn drop_data(&mut self) {
        self.data = None;
    }

    pub fn get_byte_size(&mut self) -> Result<usize> {
        Ok(self.get_data()?.as_bytes().len())
    }

    pub fn chunks(mut self, chunk_size: usize) -> Result<Vec<Document>> {
        return Ok(self
            .get_data()?
            .chars()
            .collect::<Vec<char>>()
            .par_chunks(chunk_size)
            .enumerate()
            .map(|(idx, chunk)| Document {
                ident: format!("{}@{}", self.ident, idx),
                path: format!("{}@{}", self.path, idx),
                data: Some(chunk.iter().collect::<String>()),
            })
            .collect());
    }
}