Skip to main content

pdf_ocr/
pipeline.rs

//! OCR pipeline: detect scanned pages, run OCR on them, and insert invisible text layers.
//!
//! The pipeline takes a render callback to rasterize pages, which avoids a hard
//! dependency on any particular rendering engine.
5
6use crate::engine::{OcrEngine, OcrPageResult};
7use crate::error::{OcrError, Result};
8use lopdf::content::{Content, Operation};
9use lopdf::{dictionary, Document, Object, ObjectId, Stream};
10
/// Settings that control which pages the OCR pipeline visits and how they are rasterized.
#[derive(Debug, Clone)]
pub struct OcrConfig {
    /// Rendering resolution in dots per inch; handed to both the render callback and the engine.
    pub dpi: u32,
    /// A page whose content stream shows fewer than this many text bytes is treated as scanned.
    pub text_threshold: usize,
    /// 1-based page numbers to process; leave empty to process every page.
    pub pages: Vec<u32>,
}

impl Default for OcrConfig {
    /// Defaults: 300 DPI, a text threshold of 10, and no page restriction.
    fn default() -> Self {
        let dpi = 300;
        let text_threshold = 10;
        OcrConfig {
            dpi,
            text_threshold,
            pages: vec![],
        }
    }
}
31
32/// Report for the entire OCR process.
33#[derive(Debug, Clone)]
34pub struct OcrReport {
35    /// Per-page reports.
36    pub pages: Vec<OcrPageReport>,
37    /// Total number of pages processed.
38    pub pages_processed: usize,
39    /// Total number of words recognized.
40    pub total_words: usize,
41}
42
/// Outcome of the pipeline for one page.
#[derive(Debug, Clone)]
pub struct OcrPageReport {
    /// 1-based page number.
    pub page: u32,
    /// `true` when the page was judged to be scanned and was sent through OCR.
    pub ocr_needed: bool,
    /// Count of words the engine returned for this page (0 for skipped pages).
    pub words_recognized: usize,
    /// Confidence reported by the engine; skipped pages are recorded as `1.0`.
    pub confidence: f32,
}
55
56/// Make a PDF searchable by running OCR on scanned pages.
57///
58/// # Arguments
59/// * `doc` - The PDF document to process.
60/// * `engine` - The OCR engine to use.
61/// * `config` - OCR configuration.
62/// * `render_fn` - A callback to render a page to an image: `(doc, page_num, dpi) -> (pixels, width, height)`.
63pub fn make_searchable<
64    E: OcrEngine,
65    R: Fn(&Document, u32, u32) -> std::result::Result<(Vec<u8>, u32, u32), String>,
66>(
67    doc: &mut Document,
68    engine: &E,
69    config: &OcrConfig,
70    render_fn: R,
71) -> Result<OcrReport> {
72    let pages = doc.get_pages();
73    let total = pages.len() as u32;
74
75    // Determine which pages to process.
76    let page_nums: Vec<u32> = if config.pages.is_empty() {
77        (1..=total).collect()
78    } else {
79        // Validate page numbers.
80        for &p in &config.pages {
81            if p == 0 || p > total {
82                return Err(OcrError::PageOutOfRange(p, total));
83            }
84        }
85        config.pages.clone()
86    };
87
88    let mut report = OcrReport {
89        pages: Vec::new(),
90        pages_processed: 0,
91        total_words: 0,
92    };
93
94    for &page_num in &page_nums {
95        let page_id = match pages.get(&page_num) {
96            Some(&id) => id,
97            None => continue,
98        };
99
100        let needs_ocr = page_needs_ocr(doc, page_id, config.text_threshold);
101
102        if !needs_ocr {
103            report.pages.push(OcrPageReport {
104                page: page_num,
105                ocr_needed: false,
106                words_recognized: 0,
107                confidence: 1.0,
108            });
109            continue;
110        }
111
112        // Render page to image.
113        let (image_data, width, height) =
114            render_fn(doc, page_num, config.dpi).map_err(OcrError::Render)?;
115
116        // Run OCR.
117        let ocr_result = engine
118            .recognize(&image_data, width, height, config.dpi)
119            .map_err(OcrError::Engine)?;
120
121        let words_count = ocr_result.words.len();
122        let confidence = ocr_result.confidence;
123
124        // Insert invisible text layer.
125        if !ocr_result.words.is_empty() {
126            let media_box = get_media_box(doc, page_id);
127            insert_invisible_text_layer(doc, page_id, &ocr_result, &media_box, config.dpi)?;
128        }
129
130        report.pages.push(OcrPageReport {
131            page: page_num,
132            ocr_needed: true,
133            words_recognized: words_count,
134            confidence,
135        });
136        report.pages_processed += 1;
137        report.total_words += words_count;
138    }
139
140    Ok(report)
141}
142
143/// Check whether a page needs OCR by counting text characters in the content stream.
144fn page_needs_ocr(doc: &Document, page_id: ObjectId, threshold: usize) -> bool {
145    let content_bytes = match get_page_content_bytes(doc, page_id) {
146        Some(bytes) => bytes,
147        None => return true, // No content stream = likely scanned.
148    };
149
150    let content = match Content::decode(&content_bytes) {
151        Ok(c) => c,
152        Err(_) => return true,
153    };
154
155    let mut char_count = 0;
156    for op in &content.operations {
157        match op.operator.as_str() {
158            "Tj" => {
159                for operand in &op.operands {
160                    if let Object::String(bytes, _) = operand {
161                        char_count += bytes.len();
162                    }
163                }
164            }
165            "TJ" => {
166                for operand in &op.operands {
167                    if let Object::Array(arr) = operand {
168                        for item in arr {
169                            if let Object::String(bytes, _) = item {
170                                char_count += bytes.len();
171                            }
172                        }
173                    }
174                }
175            }
176            "'" | "\"" => {
177                for operand in &op.operands {
178                    if let Object::String(bytes, _) = operand {
179                        char_count += bytes.len();
180                    }
181                }
182            }
183            _ => {}
184        }
185    }
186
187    char_count < threshold
188}
189
190/// Insert an invisible text layer on a page using OCR results.
191///
192/// Uses rendering mode 3 (invisible) so text is searchable but not visible.
193fn insert_invisible_text_layer(
194    doc: &mut Document,
195    page_id: ObjectId,
196    ocr_result: &OcrPageResult,
197    media_box: &[f64; 4],
198    _dpi: u32,
199) -> Result<()> {
200    let page_width = media_box[2] - media_box[0];
201    let page_height = media_box[3] - media_box[1];
202    let img_w = ocr_result.image_width as f64;
203    let img_h = ocr_result.image_height as f64;
204
205    let scale_x = page_width / img_w;
206    let scale_y = page_height / img_h;
207
208    let mut ops = vec![
209        Operation::new("BT", vec![]),
210        // Set rendering mode 3 (invisible).
211        Operation::new("Tr", vec![Object::Integer(3)]),
212        Operation::new(
213            "Tf",
214            vec![Object::Name(b"Helvetica".to_vec()), Object::Real(10.0)],
215        ),
216    ];
217
218    for word in &ocr_result.words {
219        let [px0, py0, px1, _py1] = word.bbox_px;
220
221        // Convert pixel coordinates to PDF coordinates.
222        let pdf_x = media_box[0] + (px0 as f64) * scale_x;
223        // PDF y-axis is bottom-up, image y-axis is top-down.
224        let pdf_y = media_box[3] - (py0 as f64) * scale_y;
225
226        let word_width_px = (px1 - px0) as f64;
227        let word_width_pdf = word_width_px * scale_x;
228
229        // Approximate the natural width of the text at 10pt.
230        let natural_width = word.text.len() as f64 * 10.0 * 0.5;
231        let h_scale = if natural_width > 0.0 {
232            (word_width_pdf / natural_width) * 100.0
233        } else {
234            100.0
235        };
236
237        // Set text position with Tm.
238        ops.push(Operation::new(
239            "Tm",
240            vec![
241                Object::Real(1.0),
242                Object::Real(0.0),
243                Object::Real(0.0),
244                Object::Real(1.0),
245                Object::Real(pdf_x as f32),
246                Object::Real(pdf_y as f32),
247            ],
248        ));
249        // Set horizontal scaling.
250        ops.push(Operation::new("Tz", vec![Object::Real(h_scale as f32)]));
251        // Show the text.
252        ops.push(Operation::new(
253            "Tj",
254            vec![Object::String(
255                word.text.as_bytes().to_vec(),
256                lopdf::StringFormat::Literal,
257            )],
258        ));
259    }
260
261    ops.push(Operation::new("ET", vec![]));
262
263    let content = Content { operations: ops };
264    let encoded = content
265        .encode()
266        .map_err(|e| OcrError::Other(format!("failed to encode text layer: {e}")))?;
267
268    let text_stream = Stream::new(dictionary! {}, encoded);
269    let text_id = doc.add_object(Object::Stream(text_stream));
270
271    // Append to page contents.
272    let existing = {
273        let page_obj = match doc.get_object(page_id) {
274            Ok(obj) => obj,
275            Err(_) => return Ok(()),
276        };
277        let page_dict = match page_obj {
278            Object::Dictionary(ref d) => d,
279            _ => return Ok(()),
280        };
281        page_dict.get(b"Contents").ok().cloned()
282    };
283
284    let new_contents = match existing {
285        Some(Object::Reference(existing_id)) => Object::Array(vec![
286            Object::Reference(existing_id),
287            Object::Reference(text_id),
288        ]),
289        Some(Object::Array(mut arr)) => {
290            arr.push(Object::Reference(text_id));
291            Object::Array(arr)
292        }
293        _ => Object::Reference(text_id),
294    };
295
296    if let Ok(Object::Dictionary(ref mut d)) = doc.get_object_mut(page_id) {
297        d.set("Contents", new_contents);
298    }
299
300    Ok(())
301}
302
303/// Get the MediaBox for a page.
304fn get_media_box(doc: &Document, page_id: ObjectId) -> [f64; 4] {
305    let default_box = [0.0, 0.0, 612.0, 792.0];
306
307    let page_obj = match doc.get_object(page_id) {
308        Ok(obj) => obj,
309        Err(_) => return default_box,
310    };
311
312    let page_dict = match page_obj {
313        Object::Dictionary(ref d) => d,
314        _ => return default_box,
315    };
316
317    match page_dict.get(b"MediaBox") {
318        Ok(Object::Array(arr)) => {
319            if arr.len() >= 4 {
320                let vals: Vec<f64> = arr
321                    .iter()
322                    .filter_map(|v| match v {
323                        Object::Integer(i) => Some(*i as f64),
324                        Object::Real(f) => Some(*f as f64),
325                        _ => None,
326                    })
327                    .collect();
328                if vals.len() >= 4 {
329                    [vals[0], vals[1], vals[2], vals[3]]
330                } else {
331                    default_box
332                }
333            } else {
334                default_box
335            }
336        }
337        _ => default_box,
338    }
339}
340
341/// Get content stream bytes for a page.
342fn get_page_content_bytes(doc: &Document, page_id: ObjectId) -> Option<Vec<u8>> {
343    doc.get_page_content(page_id).ok()
344}
345
#[cfg(test)]
mod tests {
    use super::*;
    use crate::engine::{NoOpEngine, OcrPageResult, OcrWord};

    /// Content of an image-only page: graphics operators, no text operators.
    const SCANNED_PAGE: &[u8] = b"q 612 0 0 792 0 0 cm /Im0 Do Q";
    /// Second image-only page, drawing a different XObject.
    const SCANNED_PAGE_ALT: &[u8] = b"q 612 0 0 792 0 0 cm /Im1 Do Q";
    /// Page whose shown text is well above the default threshold.
    const TEXT_PAGE: &[u8] =
        b"BT /F1 12 Tf (This is a text page with enough characters to pass threshold) Tj ET";
    /// Another text page, used inside the mixed document.
    const TEXT_PAGE_ALT: &[u8] =
        b"BT /F1 12 Tf (Enough text content to pass the threshold) Tj ET";

    /// A mock OCR engine that returns predetermined results.
    struct MockEngine {
        result: OcrPageResult,
    }

    impl MockEngine {
        /// Build an engine that always answers with `words`; the page confidence is the
        /// mean word confidence (0.0 when there are no words).
        fn new(words: Vec<OcrWord>) -> Self {
            let confidence = if words.is_empty() {
                0.0
            } else {
                words.iter().map(|w| w.confidence).sum::<f32>() / words.len() as f32
            };
            Self {
                result: OcrPageResult {
                    words,
                    confidence,
                    image_width: 600,
                    image_height: 800,
                },
            }
        }
    }

    impl OcrEngine for MockEngine {
        fn recognize(
            &self,
            _image_data: &[u8],
            _width: u32,
            _height: u32,
            _dpi: u32,
        ) -> std::result::Result<OcrPageResult, String> {
            Ok(self.result.clone())
        }

        fn supported_languages(&self) -> Vec<String> {
            vec!["eng".to_string()]
        }
    }

    /// Helper: build a minimal document with one Letter-sized page per content stream given.
    fn build_doc(page_contents: &[&[u8]]) -> Document {
        let mut doc = Document::with_version("1.7");
        let mut page_ids = Vec::with_capacity(page_contents.len());

        for content in page_contents {
            let content_stream = Stream::new(dictionary! {}, content.to_vec());
            let content_id = doc.add_object(Object::Stream(content_stream));

            let page_dict = dictionary! {
                "Type" => "Page",
                "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
                "Contents" => Object::Reference(content_id),
            };
            page_ids.push(doc.add_object(Object::Dictionary(page_dict)));
        }

        let kids: Vec<Object> = page_ids.iter().map(|id| Object::Reference(*id)).collect();
        let pages_dict = dictionary! {
            "Type" => "Pages",
            "Kids" => kids,
            "Count" => Object::Integer(page_ids.len() as i64),
        };
        let pages_id = doc.add_object(Object::Dictionary(pages_dict));

        // The page tree root is only known now, so back-fill each page's Parent link.
        for &pid in &page_ids {
            if let Ok(Object::Dictionary(ref mut d)) = doc.get_object_mut(pid) {
                d.set("Parent", Object::Reference(pages_id));
            }
        }

        let catalog = dictionary! {
            "Type" => "Catalog",
            "Pages" => Object::Reference(pages_id),
        };
        let catalog_id = doc.add_object(Object::Dictionary(catalog));
        doc.trailer.set("Root", Object::Reference(catalog_id));

        doc
    }

    /// Helper: create a doc with a scanned page (no text content).
    fn make_scanned_doc() -> Document {
        build_doc(&[SCANNED_PAGE])
    }

    /// Helper: create a doc with a text page.
    fn make_text_doc() -> Document {
        build_doc(&[TEXT_PAGE])
    }

    /// Helper: create a multi-page doc (scanned, text, scanned).
    fn make_mixed_doc() -> Document {
        build_doc(&[SCANNED_PAGE, TEXT_PAGE_ALT, SCANNED_PAGE_ALT])
    }

    /// Helper: decode the last stream listed in a page's `Contents` array, which is where the
    /// pipeline appends its text layer.
    fn last_content_stream_ops(doc: &Document, page_num: u32) -> Vec<Operation> {
        let page_id = *doc.get_pages().get(&page_num).unwrap();
        let contents = match doc.get_object(page_id) {
            Ok(Object::Dictionary(page)) => page.get(b"Contents").unwrap().clone(),
            _ => panic!("page object is not a dictionary"),
        };
        let layer_id = match contents {
            Object::Array(items) => match items.last() {
                Some(Object::Reference(id)) => *id,
                _ => panic!("Contents array does not end with a reference"),
            },
            other => panic!("expected Contents to be an array, got {other:?}"),
        };
        match doc.get_object(layer_id) {
            Ok(Object::Stream(stream)) => Content::decode(&stream.content).unwrap().operations,
            _ => panic!("text layer object is not a stream"),
        }
    }

    /// Dummy render function for tests.
    fn dummy_render(
        _doc: &Document,
        _page_num: u32,
        _dpi: u32,
    ) -> std::result::Result<(Vec<u8>, u32, u32), String> {
        Ok((vec![0u8; 600 * 800 * 3], 600, 800))
    }

    #[test]
    fn scanned_page_needs_ocr() {
        let doc = make_scanned_doc();
        let pages = doc.get_pages();
        let page_id = *pages.get(&1).unwrap();
        assert!(page_needs_ocr(&doc, page_id, 10));
    }

    #[test]
    fn text_page_does_not_need_ocr() {
        let doc = make_text_doc();
        let pages = doc.get_pages();
        let page_id = *pages.get(&1).unwrap();
        assert!(!page_needs_ocr(&doc, page_id, 10));
    }

    #[test]
    fn noop_engine_processes_scanned() {
        let mut doc = make_scanned_doc();
        let engine = NoOpEngine;
        let config = OcrConfig::default();

        let report = make_searchable(&mut doc, &engine, &config, dummy_render).unwrap();
        assert_eq!(report.pages.len(), 1);
        assert!(report.pages[0].ocr_needed);
        assert_eq!(report.pages[0].words_recognized, 0);
    }

    #[test]
    fn text_page_skipped_by_pipeline() {
        let mut doc = make_text_doc();
        let engine = NoOpEngine;
        let config = OcrConfig::default();

        let report = make_searchable(&mut doc, &engine, &config, dummy_render).unwrap();
        assert_eq!(report.pages.len(), 1);
        assert!(!report.pages[0].ocr_needed);
        assert_eq!(report.pages_processed, 0);
    }

    #[test]
    fn mock_engine_inserts_invisible_text() {
        let mut doc = make_scanned_doc();
        let engine = MockEngine::new(vec![
            OcrWord {
                text: "Hello".to_string(),
                bbox_px: [10, 20, 100, 40],
                confidence: 0.95,
            },
            OcrWord {
                text: "World".to_string(),
                bbox_px: [110, 20, 200, 40],
                confidence: 0.90,
            },
        ]);
        let config = OcrConfig::default();

        let report = make_searchable(&mut doc, &engine, &config, dummy_render).unwrap();
        assert_eq!(report.pages_processed, 1);
        assert_eq!(report.total_words, 2);
        assert!(report.pages[0].ocr_needed);
        assert_eq!(report.pages[0].words_recognized, 2);

        // The report alone does not prove anything was written: inspect the appended stream.
        let ops = last_content_stream_ops(&doc, 1);
        assert!(
            ops.iter().any(|op| op.operator == "Tr"
                && matches!(op.operands.first(), Some(Object::Integer(3)))),
            "text layer must use invisible rendering mode 3"
        );
        let shown: Vec<&[u8]> = ops
            .iter()
            .filter(|op| op.operator == "Tj")
            .filter_map(|op| match op.operands.first() {
                Some(Object::String(bytes, _)) => Some(bytes.as_slice()),
                _ => None,
            })
            .collect();
        assert_eq!(shown, vec![&b"Hello"[..], &b"World"[..]]);
    }

    #[test]
    fn ocr_specific_pages() {
        let mut doc = make_mixed_doc();
        let engine = NoOpEngine;
        let config = OcrConfig {
            pages: vec![1],
            ..Default::default()
        };

        let report = make_searchable(&mut doc, &engine, &config, dummy_render).unwrap();
        // Only page 1 should be in the report.
        assert_eq!(report.pages.len(), 1);
        assert_eq!(report.pages[0].page, 1);
    }

    #[test]
    fn ocr_page_out_of_range() {
        let mut doc = make_scanned_doc();
        let engine = NoOpEngine;
        let config = OcrConfig {
            pages: vec![5],
            ..Default::default()
        };

        let result = make_searchable(&mut doc, &engine, &config, dummy_render);
        // Pin the variant and its payload (requested page, page count), not just "some error".
        assert!(matches!(result, Err(OcrError::PageOutOfRange(5, 1))));
    }
}