// processors_rs/pdf/pdf_processor.rs

1use crate::markdown_processor::MarkdownProcessor;
2use crate::pdf::tesseract::input::{Args, Image};
3use crate::processor::{Document, DocumentProcessor, FileProcessor};
4use anyhow::Error;
5use image::DynamicImage;
6use pdf2image::{Pages, RenderOptionsBuilder, PDF};
7use std::path::Path;
8use text_splitter::ChunkConfigError;
9
/// Selects the text-extraction backend used when OCR is disabled.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum PdfBackend {
    /// Extract the text with `pdf_extract::extract_text`.
    LoPdf,
}
14
15/// A struct for processing PDF files.
16pub struct PdfProcessor {
17    markdown_processor: MarkdownProcessor,
18    ocr_config: OcrConfig,
19    backend: PdfBackend,
20}
21
/// OCR settings for [`PdfProcessor`].
///
/// The [`Default`] value disables OCR and sets no tesseract path.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct OcrConfig {
    /// When `true`, the PDF pages are rendered to images and read through OCR
    /// instead of the direct text-extraction backend.
    pub use_ocr: bool,
    /// Optional path forwarded to the tesseract argument builder.
    /// NOTE(review): `None` presumably means "use the default tesseract
    /// lookup" — confirm against the tesseract `Args` implementation.
    pub tesseract_path: Option<String>,
}
26
27impl PdfProcessor {
28    pub fn new(
29        chunk_size: usize,
30        overlap: usize,
31        ocr_config: OcrConfig,
32        backend: PdfBackend,
33    ) -> Result<PdfProcessor, ChunkConfigError> {
34        let markdown_processor = MarkdownProcessor::new(chunk_size, overlap)?;
35        Ok(PdfProcessor {
36            markdown_processor,
37            ocr_config,
38            backend,
39        })
40    }
41}
42
43impl FileProcessor for PdfProcessor {
44    fn process_file(&self, path: impl AsRef<Path>) -> anyhow::Result<Document> {
45        let content = if self.ocr_config.use_ocr {
46            let tesseract_path = self.ocr_config.tesseract_path.as_deref();
47            extract_text_with_ocr(&path, tesseract_path)?
48        } else {
49            match self.backend {
50                PdfBackend::LoPdf => {
51                    pdf_extract::extract_text(path.as_ref()).map_err(|e| anyhow::anyhow!(e))?
52                }
53            }
54        };
55
56        self.markdown_processor.process_document(&content)
57    }
58}
59
60fn get_images_from_pdf<T: AsRef<Path>>(file_path: &T) -> Result<Vec<DynamicImage>, Error> {
61    let pdf = PDF::from_file(file_path)?;
62    let page_count = pdf.page_count();
63    let pages = pdf.render(
64        Pages::Range(1..=page_count),
65        RenderOptionsBuilder::default().build()?,
66    )?;
67    Ok(pages)
68}
69
70fn extract_text_from_image(image: &DynamicImage, args: &Args) -> Result<String, Error> {
71    let image = Image::from_dynamic_image(image)?;
72    let text = crate::pdf::tesseract::command::image_to_string(&image, args)?;
73    Ok(text)
74}
75
76fn extract_text_with_ocr<T: AsRef<Path>>(
77    file_path: &T,
78    tesseract_path: Option<&str>,
79) -> Result<String, Error> {
80    let images = get_images_from_pdf(file_path)?;
81    let texts: Result<Vec<String>, Error> = images
82        .iter()
83        .map(|image| extract_text_from_image(image, &Args::default().with_path(tesseract_path)))
84        .collect();
85
86    // Join the texts and clean up empty lines
87    let text = texts?.join("\n");
88    let cleaned_text = text
89        .lines()
90        .filter(|line| !line.trim().is_empty())
91        .collect::<Vec<&str>>()
92        .join("\n");
93
94    Ok(cleaned_text)
95}
96
#[cfg(test)]
mod tests {
    use super::*;

    /// PDF fixture shared by both tests (path as used by the original tests).
    const TEST_PDF: &str = "../test_files/test.pdf";

    /// Text extraction without OCR yields the expected number of chunks.
    #[test]
    fn test_extract_text() {
        let processor = PdfProcessor::new(
            128,
            0,
            OcrConfig {
                use_ocr: false,
                tesseract_path: None,
            },
            PdfBackend::LoPdf,
        )
        .unwrap();

        let text = processor.process_file(TEST_PDF).unwrap();
        assert_eq!(text.chunks.len(), 4271);
    }

    /// OCR extraction runs to completion on the fixture; the text is only printed.
    #[test]
    fn test_extract_text_with_ocr() {
        let path = Path::new(TEST_PDF);

        // Fail early with a clear message if the fixture is missing.
        assert!(path.exists(), "File does not exist: {}", path.display());

        // Print the absolute path
        println!("Absolute path: {}", path.canonicalize().unwrap().display());

        let text = extract_text_with_ocr(&TEST_PDF, None).unwrap();

        println!("Text: {}", text);
    }
}