1use crate::engine::{OcrEngine, OcrPageResult};
7use crate::error::{OcrError, Result};
8use lopdf::content::{Content, Operation};
9use lopdf::{dictionary, Document, Object, ObjectId, Stream};
10
/// Settings that steer a [`make_searchable`] run.
#[derive(Clone, Debug)]
pub struct OcrConfig {
    /// Resolution handed to both the page renderer and the OCR engine.
    pub dpi: u32,
    /// Minimum number of text-string bytes a page must already contain to be
    /// treated as "has text"; pages below this count are sent through OCR.
    pub text_threshold: usize,
    /// 1-based page numbers to examine. An empty list selects every page;
    /// a `0` or a number past the last page makes the run fail.
    pub pages: Vec<u32>,
}
21
22impl Default for OcrConfig {
23 fn default() -> Self {
24 Self {
25 dpi: 300,
26 text_threshold: 10,
27 pages: Vec::new(),
28 }
29 }
30}
31
32#[derive(Debug, Clone)]
34pub struct OcrReport {
35 pub pages: Vec<OcrPageReport>,
37 pub pages_processed: usize,
39 pub total_words: usize,
41}
42
/// Per-page entry of an [`OcrReport`].
#[derive(Clone, Debug)]
pub struct OcrPageReport {
    /// 1-based page number.
    pub page: u32,
    /// `false` when the page already carried enough text and was skipped.
    pub ocr_needed: bool,
    /// Words returned by the engine for this page (`0` for skipped pages).
    pub words_recognized: usize,
    /// Page confidence as reported by the engine; skipped pages report `1.0`.
    pub confidence: f32,
}
55
56pub fn make_searchable<
64 E: OcrEngine,
65 R: Fn(&Document, u32, u32) -> std::result::Result<(Vec<u8>, u32, u32), String>,
66>(
67 doc: &mut Document,
68 engine: &E,
69 config: &OcrConfig,
70 render_fn: R,
71) -> Result<OcrReport> {
72 let pages = doc.get_pages();
73 let total = pages.len() as u32;
74
75 let page_nums: Vec<u32> = if config.pages.is_empty() {
77 (1..=total).collect()
78 } else {
79 for &p in &config.pages {
81 if p == 0 || p > total {
82 return Err(OcrError::PageOutOfRange(p, total));
83 }
84 }
85 config.pages.clone()
86 };
87
88 let mut report = OcrReport {
89 pages: Vec::new(),
90 pages_processed: 0,
91 total_words: 0,
92 };
93
94 for &page_num in &page_nums {
95 let page_id = match pages.get(&page_num) {
96 Some(&id) => id,
97 None => continue,
98 };
99
100 let needs_ocr = page_needs_ocr(doc, page_id, config.text_threshold);
101
102 if !needs_ocr {
103 report.pages.push(OcrPageReport {
104 page: page_num,
105 ocr_needed: false,
106 words_recognized: 0,
107 confidence: 1.0,
108 });
109 continue;
110 }
111
112 let (image_data, width, height) =
114 render_fn(doc, page_num, config.dpi).map_err(OcrError::Render)?;
115
116 let ocr_result = engine
118 .recognize(&image_data, width, height, config.dpi)
119 .map_err(OcrError::Engine)?;
120
121 let words_count = ocr_result.words.len();
122 let confidence = ocr_result.confidence;
123
124 if !ocr_result.words.is_empty() {
126 let media_box = get_media_box(doc, page_id);
127 insert_invisible_text_layer(doc, page_id, &ocr_result, &media_box, config.dpi)?;
128 }
129
130 report.pages.push(OcrPageReport {
131 page: page_num,
132 ocr_needed: true,
133 words_recognized: words_count,
134 confidence,
135 });
136 report.pages_processed += 1;
137 report.total_words += words_count;
138 }
139
140 Ok(report)
141}
142
143fn page_needs_ocr(doc: &Document, page_id: ObjectId, threshold: usize) -> bool {
145 let content_bytes = match get_page_content_bytes(doc, page_id) {
146 Some(bytes) => bytes,
147 None => return true, };
149
150 let content = match Content::decode(&content_bytes) {
151 Ok(c) => c,
152 Err(_) => return true,
153 };
154
155 let mut char_count = 0;
156 for op in &content.operations {
157 match op.operator.as_str() {
158 "Tj" => {
159 for operand in &op.operands {
160 if let Object::String(bytes, _) = operand {
161 char_count += bytes.len();
162 }
163 }
164 }
165 "TJ" => {
166 for operand in &op.operands {
167 if let Object::Array(arr) = operand {
168 for item in arr {
169 if let Object::String(bytes, _) = item {
170 char_count += bytes.len();
171 }
172 }
173 }
174 }
175 }
176 "'" | "\"" => {
177 for operand in &op.operands {
178 if let Object::String(bytes, _) = operand {
179 char_count += bytes.len();
180 }
181 }
182 }
183 _ => {}
184 }
185 }
186
187 char_count < threshold
188}
189
190fn insert_invisible_text_layer(
194 doc: &mut Document,
195 page_id: ObjectId,
196 ocr_result: &OcrPageResult,
197 media_box: &[f64; 4],
198 _dpi: u32,
199) -> Result<()> {
200 let page_width = media_box[2] - media_box[0];
201 let page_height = media_box[3] - media_box[1];
202 let img_w = ocr_result.image_width as f64;
203 let img_h = ocr_result.image_height as f64;
204
205 let scale_x = page_width / img_w;
206 let scale_y = page_height / img_h;
207
208 let mut ops = vec![
209 Operation::new("BT", vec![]),
210 Operation::new("Tr", vec![Object::Integer(3)]),
212 Operation::new(
213 "Tf",
214 vec![Object::Name(b"Helvetica".to_vec()), Object::Real(10.0)],
215 ),
216 ];
217
218 for word in &ocr_result.words {
219 let [px0, py0, px1, _py1] = word.bbox_px;
220
221 let pdf_x = media_box[0] + (px0 as f64) * scale_x;
223 let pdf_y = media_box[3] - (py0 as f64) * scale_y;
225
226 let word_width_px = (px1 - px0) as f64;
227 let word_width_pdf = word_width_px * scale_x;
228
229 let natural_width = word.text.len() as f64 * 10.0 * 0.5;
231 let h_scale = if natural_width > 0.0 {
232 (word_width_pdf / natural_width) * 100.0
233 } else {
234 100.0
235 };
236
237 ops.push(Operation::new(
239 "Tm",
240 vec![
241 Object::Real(1.0),
242 Object::Real(0.0),
243 Object::Real(0.0),
244 Object::Real(1.0),
245 Object::Real(pdf_x as f32),
246 Object::Real(pdf_y as f32),
247 ],
248 ));
249 ops.push(Operation::new("Tz", vec![Object::Real(h_scale as f32)]));
251 ops.push(Operation::new(
253 "Tj",
254 vec![Object::String(
255 word.text.as_bytes().to_vec(),
256 lopdf::StringFormat::Literal,
257 )],
258 ));
259 }
260
261 ops.push(Operation::new("ET", vec![]));
262
263 let content = Content { operations: ops };
264 let encoded = content
265 .encode()
266 .map_err(|e| OcrError::Other(format!("failed to encode text layer: {e}")))?;
267
268 let text_stream = Stream::new(dictionary! {}, encoded);
269 let text_id = doc.add_object(Object::Stream(text_stream));
270
271 let existing = {
273 let page_obj = match doc.get_object(page_id) {
274 Ok(obj) => obj,
275 Err(_) => return Ok(()),
276 };
277 let page_dict = match page_obj {
278 Object::Dictionary(ref d) => d,
279 _ => return Ok(()),
280 };
281 page_dict.get(b"Contents").ok().cloned()
282 };
283
284 let new_contents = match existing {
285 Some(Object::Reference(existing_id)) => Object::Array(vec![
286 Object::Reference(existing_id),
287 Object::Reference(text_id),
288 ]),
289 Some(Object::Array(mut arr)) => {
290 arr.push(Object::Reference(text_id));
291 Object::Array(arr)
292 }
293 _ => Object::Reference(text_id),
294 };
295
296 if let Ok(Object::Dictionary(ref mut d)) = doc.get_object_mut(page_id) {
297 d.set("Contents", new_contents);
298 }
299
300 Ok(())
301}
302
303fn get_media_box(doc: &Document, page_id: ObjectId) -> [f64; 4] {
305 let default_box = [0.0, 0.0, 612.0, 792.0];
306
307 let page_obj = match doc.get_object(page_id) {
308 Ok(obj) => obj,
309 Err(_) => return default_box,
310 };
311
312 let page_dict = match page_obj {
313 Object::Dictionary(ref d) => d,
314 _ => return default_box,
315 };
316
317 match page_dict.get(b"MediaBox") {
318 Ok(Object::Array(arr)) => {
319 if arr.len() >= 4 {
320 let vals: Vec<f64> = arr
321 .iter()
322 .filter_map(|v| match v {
323 Object::Integer(i) => Some(*i as f64),
324 Object::Real(f) => Some(*f as f64),
325 _ => None,
326 })
327 .collect();
328 if vals.len() >= 4 {
329 [vals[0], vals[1], vals[2], vals[3]]
330 } else {
331 default_box
332 }
333 } else {
334 default_box
335 }
336 }
337 _ => default_box,
338 }
339}
340
341fn get_page_content_bytes(doc: &Document, page_id: ObjectId) -> Option<Vec<u8>> {
343 doc.get_page_content(page_id).ok()
344}
345
#[cfg(test)]
mod tests {
    use super::*;
    use crate::engine::{NoOpEngine, OcrPageResult, OcrWord};

    /// Content of an image-only page: paints XObject `Im0`, shows no text.
    const SCANNED_CONTENT: &[u8] = b"q 612 0 0 792 0 0 cm /Im0 Do Q";
    /// Same as [`SCANNED_CONTENT`] but painting XObject `Im1`.
    const SCANNED_CONTENT_ALT: &[u8] = b"q 612 0 0 792 0 0 cm /Im1 Do Q";
    /// Content of the single-page text fixture; well above the default threshold.
    const TEXT_CONTENT: &[u8] =
        b"BT /F1 12 Tf (This is a text page with enough characters to pass threshold) Tj ET";
    /// Content of the text page inside the mixed fixture.
    const MIXED_TEXT_CONTENT: &[u8] =
        b"BT /F1 12 Tf (Enough text content to pass the threshold) Tj ET";

    /// Engine that returns a fixed, pre-built result for every page.
    struct MockEngine {
        result: OcrPageResult,
    }

    impl MockEngine {
        /// Builds a result for a 600x800 image whose confidence is the mean
        /// word confidence (`0.0` without words).
        fn new(words: Vec<OcrWord>) -> Self {
            let confidence = if words.is_empty() {
                0.0
            } else {
                words.iter().map(|w| w.confidence).sum::<f32>() / words.len() as f32
            };
            Self {
                result: OcrPageResult {
                    words,
                    confidence,
                    image_width: 600,
                    image_height: 800,
                },
            }
        }
    }

    impl OcrEngine for MockEngine {
        fn recognize(
            &self,
            _image_data: &[u8],
            _width: u32,
            _height: u32,
            _dpi: u32,
        ) -> std::result::Result<OcrPageResult, String> {
            Ok(self.result.clone())
        }

        fn supported_languages(&self) -> Vec<String> {
            vec!["eng".to_string()]
        }
    }

    /// Builds a PDF 1.7 document with one US Letter page per content stream,
    /// wired into a flat page tree and a catalog.
    fn build_doc(page_contents: &[&[u8]]) -> Document {
        let mut doc = Document::with_version("1.7");

        let page_ids: Vec<ObjectId> = page_contents
            .iter()
            .map(|bytes| {
                let content_id =
                    doc.add_object(Object::Stream(Stream::new(dictionary! {}, bytes.to_vec())));
                doc.add_object(Object::Dictionary(dictionary! {
                    "Type" => "Page",
                    "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
                    "Contents" => Object::Reference(content_id),
                }))
            })
            .collect();

        let kids: Vec<Object> = page_ids.iter().map(|id| Object::Reference(*id)).collect();
        let pages_id = doc.add_object(Object::Dictionary(dictionary! {
            "Type" => "Pages",
            "Kids" => kids,
            "Count" => Object::Integer(page_ids.len() as i64),
        }));

        // The pages node only exists now, so the back-links are set afterwards.
        for &pid in &page_ids {
            if let Ok(Object::Dictionary(ref mut d)) = doc.get_object_mut(pid) {
                d.set("Parent", Object::Reference(pages_id));
            }
        }

        let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
            "Type" => "Catalog",
            "Pages" => Object::Reference(pages_id),
        }));
        doc.trailer.set("Root", Object::Reference(catalog_id));

        doc
    }

    /// One image-only page.
    fn make_scanned_doc() -> Document {
        build_doc(&[SCANNED_CONTENT])
    }

    /// One page with real text.
    fn make_text_doc() -> Document {
        build_doc(&[TEXT_CONTENT])
    }

    /// Three pages: scanned, text, scanned.
    fn make_mixed_doc() -> Document {
        build_doc(&[SCANNED_CONTENT, MIXED_TEXT_CONTENT, SCANNED_CONTENT_ALT])
    }

    /// Renderer stub returning a blank 600x800 RGB buffer.
    fn dummy_render(
        _doc: &Document,
        _page_num: u32,
        _dpi: u32,
    ) -> std::result::Result<(Vec<u8>, u32, u32), String> {
        Ok((vec![0u8; 600 * 800 * 3], 600, 800))
    }

    /// `true` when `needle` (non-empty) occurs somewhere in `haystack`.
    fn contains_bytes(haystack: &[u8], needle: &[u8]) -> bool {
        haystack.windows(needle.len()).any(|window| window == needle)
    }

    #[test]
    fn scanned_page_needs_ocr() {
        let doc = make_scanned_doc();
        let pages = doc.get_pages();
        let page_id = *pages.get(&1).unwrap();
        assert!(page_needs_ocr(&doc, page_id, 10));
    }

    #[test]
    fn text_page_does_not_need_ocr() {
        let doc = make_text_doc();
        let pages = doc.get_pages();
        let page_id = *pages.get(&1).unwrap();
        assert!(!page_needs_ocr(&doc, page_id, 10));
    }

    #[test]
    fn noop_engine_processes_scanned() {
        let mut doc = make_scanned_doc();
        let engine = NoOpEngine;
        let config = OcrConfig::default();

        let report = make_searchable(&mut doc, &engine, &config, dummy_render).unwrap();
        assert_eq!(report.pages.len(), 1);
        assert!(report.pages[0].ocr_needed);
        assert_eq!(report.pages[0].words_recognized, 0);
    }

    #[test]
    fn text_page_skipped_by_pipeline() {
        let mut doc = make_text_doc();
        let engine = NoOpEngine;
        let config = OcrConfig::default();

        let report = make_searchable(&mut doc, &engine, &config, dummy_render).unwrap();
        assert_eq!(report.pages.len(), 1);
        assert!(!report.pages[0].ocr_needed);
        assert_eq!(report.pages[0].words_recognized, 0);
        assert_eq!(report.pages_processed, 0);
    }

    #[test]
    fn mock_engine_inserts_invisible_text() {
        let mut doc = make_scanned_doc();
        let engine = MockEngine::new(vec![
            OcrWord {
                text: "Hello".to_string(),
                bbox_px: [10, 20, 100, 40],
                confidence: 0.95,
            },
            OcrWord {
                text: "World".to_string(),
                bbox_px: [110, 20, 200, 40],
                confidence: 0.90,
            },
        ]);
        let config = OcrConfig::default();

        let report = make_searchable(&mut doc, &engine, &config, dummy_render).unwrap();
        assert_eq!(report.pages_processed, 1);
        assert_eq!(report.total_words, 2);
        assert!(report.pages[0].ocr_needed);
        assert_eq!(report.pages[0].words_recognized, 2);

        // The document itself must have changed: the page now lists the
        // original stream plus the text layer, and the words are in its content.
        let page_id = *doc.get_pages().get(&1).unwrap();
        match doc.get_object(page_id) {
            Ok(Object::Dictionary(page)) => match page.get(b"Contents") {
                Ok(Object::Array(streams)) => assert_eq!(streams.len(), 2),
                _ => panic!("expected Contents to be an array of two streams"),
            },
            _ => panic!("page object should be a dictionary"),
        }
        let content = doc.get_page_content(page_id).unwrap();
        assert!(contains_bytes(&content, b"Hello"));
        assert!(contains_bytes(&content, b"World"));
    }

    #[test]
    fn ocr_specific_pages() {
        let mut doc = make_mixed_doc();
        let engine = NoOpEngine;
        let config = OcrConfig {
            pages: vec![1],
            ..Default::default()
        };

        let report = make_searchable(&mut doc, &engine, &config, dummy_render).unwrap();
        assert_eq!(report.pages.len(), 1);
        assert_eq!(report.pages[0].page, 1);
    }

    #[test]
    fn mixed_doc_reports_every_page() {
        let mut doc = make_mixed_doc();
        let engine = NoOpEngine;
        let config = OcrConfig::default();

        let report = make_searchable(&mut doc, &engine, &config, dummy_render).unwrap();
        let flags: Vec<(u32, bool)> = report
            .pages
            .iter()
            .map(|entry| (entry.page, entry.ocr_needed))
            .collect();
        assert_eq!(flags, vec![(1, true), (2, false), (3, true)]);
        assert_eq!(report.pages_processed, 2);
    }

    #[test]
    fn ocr_page_out_of_range() {
        let mut doc = make_scanned_doc();
        let engine = NoOpEngine;
        let config = OcrConfig {
            pages: vec![5],
            ..Default::default()
        };

        let result = make_searchable(&mut doc, &engine, &config, dummy_render);
        assert!(matches!(result, Err(OcrError::PageOutOfRange(5, 1))));
    }
}