1use lopdf::Document;
2use std::path::{Path, PathBuf};
3
4use alith_core::{
5 chunking::{ChunkError, Chunker, chunk_text},
6 knowledge::{FileKnowledge, Knowledge, KnowledgeError},
7};
8
/// A knowledge source backed by a PDF file on disk.
///
/// The value only stores the file location; constructing it performs no I/O.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PdfFileKnowledge {
    /// Location of the PDF file.
    pub path: PathBuf,
}

impl PdfFileKnowledge {
    /// Creates a knowledge source for the PDF located at `path`.
    ///
    /// Accepts anything that can be viewed as a [`Path`] (`&str`, `String`,
    /// `&Path`, `PathBuf`, ...). The path is copied into an owned [`PathBuf`];
    /// it is not checked for existence here.
    pub fn new<P: AsRef<Path>>(path: P) -> Self {
        Self {
            path: path.as_ref().to_path_buf(),
        }
    }
}
20
21impl Chunker for PdfFileKnowledge {
22 fn chunk(&self) -> std::result::Result<Vec<String>, ChunkError> {
23 Ok(chunk_text(
24 &self
25 .load()
26 .map_err(|err| ChunkError::Normal(err.to_string()))?,
27 self.chunk_size() as u32,
28 self.overlap_percent(),
29 )
30 .map_err(|err| ChunkError::Normal(err.to_string()))?
31 .unwrap_or_default())
32 }
33}
34
35impl Knowledge for PdfFileKnowledge {
36 fn load(&self) -> Result<String, KnowledgeError> {
37 let doc =
38 Document::load(&self.path).map_err(|err| KnowledgeError::LoadError(err.to_string()))?;
39 Ok(doc
40 .page_iter()
41 .enumerate()
42 .map(|(page_no, _)| {
43 doc.extract_text(&[page_no as u32 + 1])
44 .map_err(|err| KnowledgeError::LoadError(err.to_string()))
45 })
46 .collect::<Result<Vec<String>, KnowledgeError>>()?
47 .into_iter()
48 .collect::<String>())
49 }
50
51 fn enrich(&self, _input: &str) -> Result<String, KnowledgeError> {
52 Ok(format!("<pdffile>{}</pdffile>", self.load()?))
53 }
54}
55
56impl FileKnowledge for PdfFileKnowledge {
57 fn load_with_path(&self) -> Result<(PathBuf, String), KnowledgeError> {
58 let content = self.load()?;
59 Ok((self.path.clone(), content))
60 }
61}