Skip to main content

alith_knowledge/
pdf.rs

1use lopdf::Document;
2use std::path::{Path, PathBuf};
3
4use alith_core::{
5    chunking::{ChunkError, Chunker, chunk_text},
6    knowledge::{FileKnowledge, Knowledge, KnowledgeError},
7};
8
/// A knowledge source that refers to a PDF file by its filesystem path.
///
/// The struct stores only the path; the trait implementations in this file
/// open and read the document each time they are called.
#[derive(Debug, Clone)]
pub struct PdfFileKnowledge {
    /// Location of the PDF file handed to `lopdf::Document::load`.
    pub path: PathBuf,
}
12
13impl PdfFileKnowledge {
14    pub fn new<P: AsRef<Path>>(path: P) -> Self {
15        Self {
16            path: path.as_ref().to_path_buf(),
17        }
18    }
19}
20
21impl Chunker for PdfFileKnowledge {
22    fn chunk(&self) -> std::result::Result<Vec<String>, ChunkError> {
23        Ok(chunk_text(
24            &self
25                .load()
26                .map_err(|err| ChunkError::Normal(err.to_string()))?,
27            self.chunk_size() as u32,
28            self.overlap_percent(),
29        )
30        .map_err(|err| ChunkError::Normal(err.to_string()))?
31        .unwrap_or_default())
32    }
33}
34
35impl Knowledge for PdfFileKnowledge {
36    fn load(&self) -> Result<String, KnowledgeError> {
37        let doc =
38            Document::load(&self.path).map_err(|err| KnowledgeError::LoadError(err.to_string()))?;
39        Ok(doc
40            .page_iter()
41            .enumerate()
42            .map(|(page_no, _)| {
43                doc.extract_text(&[page_no as u32 + 1])
44                    .map_err(|err| KnowledgeError::LoadError(err.to_string()))
45            })
46            .collect::<Result<Vec<String>, KnowledgeError>>()?
47            .into_iter()
48            .collect::<String>())
49    }
50
51    fn enrich(&self, _input: &str) -> Result<String, KnowledgeError> {
52        Ok(format!("<pdffile>{}</pdffile>", self.load()?))
53    }
54}
55
56impl FileKnowledge for PdfFileKnowledge {
57    fn load_with_path(&self) -> Result<(PathBuf, String), KnowledgeError> {
58        let content = self.load()?;
59        Ok((self.path.clone(), content))
60    }
61}