processors_rs/pdf/
pdf_processor.rs1use crate::markdown_processor::MarkdownProcessor;
2use crate::pdf::tesseract::input::{Args, Image};
3use crate::processor::{Document, DocumentProcessor, FileProcessor};
4use anyhow::Error;
5use image::DynamicImage;
6use pdf2image::{Pages, RenderOptionsBuilder, PDF};
7use std::path::Path;
8use text_splitter::ChunkConfigError;
9
/// Selects the library used to pull embedded text out of a PDF when OCR is
/// not requested.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum PdfBackend {
    /// Extract text by calling `pdf_extract::extract_text` on the file.
    LoPdf,
}
14
15pub struct PdfProcessor {
17 markdown_processor: MarkdownProcessor,
18 ocr_config: OcrConfig,
19 backend: PdfBackend,
20}
21
/// OCR settings for [`PdfProcessor`].
///
/// The `Default` value disables OCR and leaves the Tesseract path unset.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct OcrConfig {
    /// When `true`, pages are rendered to images and read with Tesseract
    /// instead of extracting the PDF's embedded text.
    pub use_ocr: bool,
    /// Path forwarded to `Args::with_path` for the Tesseract invocation.
    /// NOTE(review): `None` presumably falls back to the default `tesseract`
    /// lookup; confirm against `Args::with_path`.
    pub tesseract_path: Option<String>,
}
26
27impl PdfProcessor {
28 pub fn new(
29 chunk_size: usize,
30 overlap: usize,
31 ocr_config: OcrConfig,
32 backend: PdfBackend,
33 ) -> Result<PdfProcessor, ChunkConfigError> {
34 let markdown_processor = MarkdownProcessor::new(chunk_size, overlap)?;
35 Ok(PdfProcessor {
36 markdown_processor,
37 ocr_config,
38 backend,
39 })
40 }
41}
42
43impl FileProcessor for PdfProcessor {
44 fn process_file(&self, path: impl AsRef<Path>) -> anyhow::Result<Document> {
45 let content = if self.ocr_config.use_ocr {
46 let tesseract_path = self.ocr_config.tesseract_path.as_deref();
47 extract_text_with_ocr(&path, tesseract_path)?
48 } else {
49 match self.backend {
50 PdfBackend::LoPdf => {
51 pdf_extract::extract_text(path.as_ref()).map_err(|e| anyhow::anyhow!(e))?
52 }
53 }
54 };
55
56 self.markdown_processor.process_document(&content)
57 }
58}
59
60fn get_images_from_pdf<T: AsRef<Path>>(file_path: &T) -> Result<Vec<DynamicImage>, Error> {
61 let pdf = PDF::from_file(file_path)?;
62 let page_count = pdf.page_count();
63 let pages = pdf.render(
64 Pages::Range(1..=page_count),
65 RenderOptionsBuilder::default().build()?,
66 )?;
67 Ok(pages)
68}
69
70fn extract_text_from_image(image: &DynamicImage, args: &Args) -> Result<String, Error> {
71 let image = Image::from_dynamic_image(image)?;
72 let text = crate::pdf::tesseract::command::image_to_string(&image, args)?;
73 Ok(text)
74}
75
76fn extract_text_with_ocr<T: AsRef<Path>>(
77 file_path: &T,
78 tesseract_path: Option<&str>,
79) -> Result<String, Error> {
80 let images = get_images_from_pdf(file_path)?;
81 let texts: Result<Vec<String>, Error> = images
82 .iter()
83 .map(|image| extract_text_from_image(image, &Args::default().with_path(tesseract_path)))
84 .collect();
85
86 let text = texts?.join("\n");
88 let cleaned_text = text
89 .lines()
90 .filter(|line| !line.trim().is_empty())
91 .collect::<Vec<&str>>()
92 .join("\n");
93
94 Ok(cleaned_text)
95}
96
#[cfg(test)]
mod tests {
    use super::*;

    /// Sample PDF shared by both tests, relative to the directory the tests
    /// are run from.
    const TEST_PDF: &str = "../test_files/test.pdf";

    /// Plain text extraction of the sample PDF yields the expected chunk count.
    #[test]
    fn test_extract_text() {
        let processor = PdfProcessor::new(
            128,
            0,
            OcrConfig {
                use_ocr: false,
                tesseract_path: None,
            },
            PdfBackend::LoPdf,
        )
        .unwrap();

        let text = processor.process_file(TEST_PDF).unwrap();
        assert_eq!(text.chunks.len(), 4271);
    }

    /// OCR extraction of the sample PDF succeeds and returns no blank lines.
    #[test]
    fn test_extract_text_with_ocr() {
        let path = Path::new(TEST_PDF);
        assert!(path.exists(), "File does not exist: {}", path.display());

        println!("Absolute path: {}", path.canonicalize().unwrap().display());

        let text = extract_text_with_ocr(&TEST_PDF, None).unwrap();

        println!("Text: {}", text);

        // The function filters out whitespace-only lines, so none may remain.
        assert!(text.lines().all(|line| !line.trim().is_empty()));
    }
}