1use crate::types::OcrResult;
8use crate::VisionProvider;
9use serde::{Deserialize, Serialize};
10use std::sync::Arc;
11
12#[derive(Debug, Clone, Serialize, Deserialize)]
14pub struct PdfPage {
15 pub page_number: usize,
17 pub ocr_result: OcrResult,
19 pub dimensions: (f32, f32),
21 pub rotation: i32,
23}
24
25#[derive(Debug, Clone, Serialize, Deserialize)]
27pub struct PdfDocumentResult {
28 pub pages: Vec<PdfPage>,
30 pub total_pages: usize,
32 pub metadata: PdfMetadata,
34 pub full_text: String,
36}
37
38#[derive(Debug, Clone, Serialize, Deserialize, Default)]
40pub struct PdfMetadata {
41 pub title: Option<String>,
43 pub author: Option<String>,
45 pub subject: Option<String>,
47 pub creation_date: Option<String>,
49 pub modification_date: Option<String>,
51 pub producer: Option<String>,
53}
54
/// Options controlling how a [`PdfProcessor`] handles a document.
#[derive(Clone, Debug)]
pub struct PdfProcessingConfig {
    /// Optional upper bound on the number of pages to process.
    pub max_pages: Option<usize>,
    /// Optional page range; `PdfProcessor::process_page_range` stores its
    /// `(start_page, end_page)` arguments here.
    // NOTE(review): whether the bounds are inclusive is not visible here — confirm.
    pub page_range: Option<(usize, usize)>,
    /// Whether a table of contents should be generated.
    pub generate_toc: bool,
    /// Whether the per-page text is joined into `PdfDocumentResult::full_text`.
    pub combine_text: bool,
    /// Resolution, in DPI, used when rendering pages.
    pub render_dpi: u32,
}
69
70impl Default for PdfProcessingConfig {
71 fn default() -> Self {
72 Self {
73 max_pages: None,
74 page_range: None,
75 generate_toc: true,
76 combine_text: true,
77 render_dpi: 300,
78 }
79 }
80}
81
82pub struct PdfProcessor {
84 config: PdfProcessingConfig,
85}
86
87impl PdfProcessor {
88 pub fn new() -> Self {
90 Self {
91 config: PdfProcessingConfig::default(),
92 }
93 }
94
95 pub fn with_config(config: PdfProcessingConfig) -> Self {
97 Self { config }
98 }
99
100 pub async fn process_pdf(
105 &self,
106 _pdf_data: &[u8],
107 _provider: Arc<dyn VisionProvider>,
108 ) -> crate::Result<PdfDocumentResult> {
109 let metadata = PdfMetadata::default();
118
119 let pages = vec![PdfPage {
120 page_number: 1,
121 ocr_result: OcrResult::from_text("[PDF processing requires pdf library]"),
122 dimensions: (612.0, 792.0), rotation: 0,
124 }];
125
126 let full_text = if self.config.combine_text {
127 pages
128 .iter()
129 .map(|p| p.ocr_result.text.clone())
130 .collect::<Vec<_>>()
131 .join("\n\n")
132 } else {
133 String::new()
134 };
135
136 Ok(PdfDocumentResult {
137 total_pages: pages.len(),
138 pages,
139 metadata,
140 full_text,
141 })
142 }
143
144 pub async fn process_page_range(
146 &self,
147 pdf_data: &[u8],
148 provider: Arc<dyn VisionProvider>,
149 start_page: usize,
150 end_page: usize,
151 ) -> crate::Result<PdfDocumentResult> {
152 let mut config = self.config.clone();
153 config.page_range = Some((start_page, end_page));
154
155 let processor = Self::with_config(config);
156 processor.process_pdf(pdf_data, provider).await
157 }
158
159 pub fn extract_metadata(&self, _pdf_data: &[u8]) -> crate::Result<PdfMetadata> {
161 Ok(PdfMetadata::default())
163 }
164}
165
166impl Default for PdfProcessor {
167 fn default() -> Self {
168 Self::new()
169 }
170}
171
172impl PdfDocumentResult {
173 pub fn get_page(&self, page_number: usize) -> Option<&PdfPage> {
175 self.pages.iter().find(|p| p.page_number == page_number)
176 }
177
178 pub fn generate_toc(&self) -> Vec<TocEntry> {
180 let mut toc = Vec::new();
181
182 for page in &self.pages {
183 for block in &page.ocr_result.blocks {
185 if matches!(block.role, crate::types::BlockRole::Header) {
186 toc.push(TocEntry {
187 title: block.text.clone(),
188 page_number: page.page_number,
189 level: 1, });
191 }
192 }
193 }
194
195 toc
196 }
197
198 pub fn to_markdown(&self) -> String {
200 let mut output = String::new();
201
202 if let Some(ref title) = self.metadata.title {
204 output.push_str(&format!("# {}\n\n", title));
205 }
206
207 for page in &self.pages {
209 output.push_str(&format!("## Page {}\n\n", page.page_number));
210 output.push_str(&page.ocr_result.markdown);
211 output.push_str("\n\n");
212 }
213
214 output
215 }
216
217 pub fn to_html(&self) -> String {
219 let mut output = String::from("<!DOCTYPE html>\n<html>\n<head>\n");
220
221 if let Some(ref title) = self.metadata.title {
222 output.push_str(&format!(" <title>{}</title>\n", title));
223 }
224
225 output.push_str("</head>\n<body>\n");
226
227 for page in &self.pages {
228 output.push_str(&format!(
229 " <div class=\"page\" data-page=\"{}\">\n",
230 page.page_number
231 ));
232 output.push_str(&format!(" <h2>Page {}</h2>\n", page.page_number));
233 output.push_str(" <div class=\"content\">\n");
234 output.push_str(&format!(" {}\n", page.ocr_result.text));
235 output.push_str(" </div>\n");
236 output.push_str(" </div>\n");
237 }
238
239 output.push_str("</body>\n</html>");
240 output
241 }
242
243 pub fn search(&self, query: &str) -> Vec<SearchResult> {
245 let mut results = Vec::new();
246
247 for page in &self.pages {
248 if page
249 .ocr_result
250 .text
251 .to_lowercase()
252 .contains(&query.to_lowercase())
253 {
254 results.push(SearchResult {
255 page_number: page.page_number,
256 context: self.extract_context(&page.ocr_result.text, query, 50),
257 });
258 }
259 }
260
261 results
262 }
263
264 fn extract_context(&self, text: &str, query: &str, context_chars: usize) -> String {
266 let lower_text = text.to_lowercase();
267 let lower_query = query.to_lowercase();
268
269 if let Some(pos) = lower_text.find(&lower_query) {
270 let start = pos.saturating_sub(context_chars);
271 let end = (pos + query.len() + context_chars).min(text.len());
272
273 let mut context = text[start..end].to_string();
274
275 if start > 0 {
276 context = format!("...{}", context);
277 }
278 if end < text.len() {
279 context.push_str("...");
280 }
281
282 context
283 } else {
284 String::new()
285 }
286 }
287}
288
289#[derive(Debug, Clone, Serialize, Deserialize)]
291pub struct TocEntry {
292 pub title: String,
294 pub page_number: usize,
296 pub level: usize,
298}
299
300#[derive(Debug, Clone, Serialize, Deserialize)]
302pub struct SearchResult {
303 pub page_number: usize,
305 pub context: String,
307}
308
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds an unrotated 612.0 x 792.0 page whose OCR text is `text`.
    fn page(page_number: usize, text: &str) -> PdfPage {
        PdfPage {
            page_number,
            ocr_result: OcrResult::from_text(text),
            dimensions: (612.0, 792.0),
            rotation: 0,
        }
    }

    /// Wraps `pages` in a document with default metadata and no combined text.
    fn document(pages: Vec<PdfPage>) -> PdfDocumentResult {
        PdfDocumentResult {
            total_pages: pages.len(),
            pages,
            metadata: PdfMetadata::default(),
            full_text: String::new(),
        }
    }

    #[test]
    fn test_pdf_processing_config_default() {
        let PdfProcessingConfig {
            render_dpi,
            generate_toc,
            combine_text,
            ..
        } = PdfProcessingConfig::default();

        assert_eq!(render_dpi, 300);
        assert!(generate_toc);
        assert!(combine_text);
    }

    #[test]
    fn test_pdf_page_creation() {
        let created = page(1, "Test page");

        assert_eq!(created.page_number, 1);
        assert_eq!(created.ocr_result.text, "Test page");
    }

    #[test]
    fn test_pdf_document_get_page() {
        let doc = document(vec![page(1, "Page 1"), page(2, "Page 2")]);

        let first = doc.get_page(1).expect("page 1 should be present");
        assert_eq!(first.ocr_result.text, "Page 1");

        assert!(doc.get_page(3).is_none());
    }

    #[test]
    fn test_pdf_document_search() {
        let doc = document(vec![
            page(1, "Hello world from page 1"),
            page(2, "Different content on page 2"),
        ]);

        let hits = doc.search("world");
        let matching_pages: Vec<usize> = hits.iter().map(|hit| hit.page_number).collect();
        assert_eq!(matching_pages, vec![1]);
    }

    #[test]
    fn test_toc_entry_creation() {
        let entry = TocEntry {
            title: String::from("Chapter 1"),
            page_number: 5,
            level: 1,
        };

        assert_eq!(entry.title, "Chapter 1");
        assert_eq!(entry.page_number, 5);
        assert_eq!(entry.level, 1);
    }

    #[test]
    fn test_pdf_metadata() {
        let metadata = PdfMetadata {
            title: Some(String::from("Test Document")),
            author: Some(String::from("Test Author")),
            producer: Some(String::from("Test Producer")),
            ..PdfMetadata::default()
        };

        assert_eq!(metadata.title.as_deref(), Some("Test Document"));
        assert_eq!(metadata.author.as_deref(), Some("Test Author"));
    }
}