1use std::path::PathBuf;
2use std::sync::Arc;
3
4use cli_image_to_ascii::{RenderConfig, render_half_block};
5use hygg_shared::normalize_file_path;
6
7use crate::sanitize::sanitize_layout_text;
8
9pub struct PdfStream {
23 canonical_path: PathBuf,
24 doc: pdf_oxide::PdfDocument,
25 total_pages: usize,
26 #[cfg(feature = "pdf-ocr-bundled")]
27 ocr_engine: Option<pdf_oxide::ocr::OcrEngine>,
28}
29
/// Tags each rendered output line with the kind of content it carries.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PdfLineKind {
  /// A line of page text.
  Text,
  /// A line of image output containing ANSI escape sequences.
  AnsiArt,
}
35
36#[derive(Clone, Debug)]
37pub struct PdfRenderedPage {
38 pub raw_text: String,
39 pub lines: Vec<String>,
40 pub line_kinds: Vec<PdfLineKind>,
41 pub contains_images: bool,
42}
43
44impl PdfStream {
45 pub fn open(pdf_path: &str) -> Result<Self, Box<dyn std::error::Error>> {
47 Self::open_with_optional_ocr(pdf_path, false)
48 }
49
50 pub fn open_with_bundled_ocr(
51 pdf_path: &str,
52 ) -> Result<Self, Box<dyn std::error::Error>> {
53 Self::open_with_optional_ocr(pdf_path, true)
54 }
55
56 fn open_with_optional_ocr(
57 pdf_path: &str,
58 enable_ocr: bool,
59 ) -> Result<Self, Box<dyn std::error::Error>> {
60 let canonical_path = normalize_file_path(pdf_path)?;
61 let doc = pdf_oxide::PdfDocument::open(&canonical_path)
62 .map_err(|e| format!("pdf_oxide open failed: {e:?}"))?;
63 let total_pages = doc
64 .page_count()
65 .map_err(|e| format!("pdf_oxide page_count failed: {e:?}"))?;
66 #[cfg(feature = "pdf-ocr-bundled")]
67 let ocr_engine =
68 if enable_ocr { Some(crate::ocr::bundled_ocr_engine()?) } else { None };
69 #[cfg(not(feature = "pdf-ocr-bundled"))]
70 if enable_ocr {
71 return Err(
72 "OCR support is not available in this build. Rebuild with `--features pdf-ocr-bundled` to use the bundled English OCR engine."
73 .into(),
74 );
75 }
76 Ok(Self {
77 canonical_path,
78 doc,
79 total_pages,
80 #[cfg(feature = "pdf-ocr-bundled")]
81 ocr_engine,
82 })
83 }
84
85 pub fn total_pages(&self) -> usize {
86 self.total_pages
87 }
88
89 pub fn canonical_path(&self) -> &std::path::Path {
90 &self.canonical_path
91 }
92
93 pub fn extract_page(&self, page_index: usize) -> Option<String> {
110 if page_index == 0 || page_index > self.total_pages {
111 return None;
112 }
113 let doc = &self.doc;
114 let page_0based = page_index - 1;
115 let raw = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
116 extract_page_text_lines(doc, page_0based)
117 }))
118 .ok()
119 .flatten()?;
120 if raw.trim().is_empty() {
121 return None;
122 }
123 Some(sanitize_layout_text(&raw))
124 }
125
126 pub fn extract_page_with_images(
127 &self,
128 page_index: usize,
129 col: usize,
130 ) -> Option<PdfRenderedPage> {
131 if page_index == 0 || page_index > self.total_pages {
132 return None;
133 }
134
135 let raw_text = self.extract_page(page_index).unwrap_or_default();
136 let page_0based = page_index - 1;
137 let images = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
138 self.doc.extract_images(page_0based)
139 }))
140 .ok()
141 .and_then(Result::ok)
142 .unwrap_or_default();
143
144 let text_rows = positioned_visual_text_rows(&self.doc, page_0based);
145 #[cfg(feature = "pdf-ocr-bundled")]
146 let allow_unlabeled_vector_regions = self.ocr_engine.is_some();
147 #[cfg(not(feature = "pdf-ocr-bundled"))]
148 let allow_unlabeled_vector_regions = false;
149
150 let mut image_rows =
151 render_pdf_images(&self.doc, page_0based, col, images.as_slice());
152 image_rows.extend(render_vector_diagram_regions(
153 &self.doc,
154 page_0based,
155 col,
156 &text_rows,
157 allow_unlabeled_vector_regions,
158 ));
159
160 #[cfg(feature = "pdf-ocr-bundled")]
161 let text_rows = {
162 let mut text_rows = text_rows;
163 if let Some(engine) = self.ocr_engine.as_ref() {
164 let ocr_rows = ocr_visual_text_rows(
165 &self.doc,
166 page_0based,
167 images.as_slice(),
168 engine,
169 &text_rows,
170 );
171 let native_rows = text_rows.clone();
172 text_rows.extend(
173 ocr_rows
174 .into_iter()
175 .filter(|row| !has_near_duplicate_visual_text(&native_rows, row)),
176 );
177 }
178 text_rows
179 };
180 if image_rows.is_empty() {
181 let PdfPageForAnsi { lines, line_kinds } = if text_rows.is_empty() {
182 text_only_page_lines(&raw_text, col)
183 } else {
184 compose_visual_page(text_rows, Vec::new(), col)
185 };
186 return Some(PdfRenderedPage {
187 raw_text,
188 lines,
189 line_kinds,
190 contains_images: false,
191 });
192 }
193
194 let PdfPageForAnsi { lines, line_kinds } =
195 compose_visual_page(text_rows, image_rows, col);
196 Some(PdfRenderedPage { raw_text, lines, line_kinds, contains_images: true })
197 }
198}
199
200struct PdfPageForAnsi {
201 lines: Vec<String>,
202 line_kinds: Vec<PdfLineKind>,
203}
204
/// A row of text anchored at a position on the page.
#[derive(Debug, Clone)]
struct VisualTextRow {
  /// Vertical anchor of the row; larger values are emitted first when a
  /// page is composed.
  top: f32,
  /// Horizontal anchor of the row's first character.
  left: f32,
  /// Row content.
  text: String,
}
211
212struct VisualImageRows {
213 top: f32,
214 left_cells: usize,
215 width_cells: usize,
216 region: PdfRegion,
217 lines: Vec<String>,
218}
219
/// Axis-aligned rectangle on a page, stored as lower-left corner plus size.
#[derive(Debug, Clone, Copy)]
struct PdfRegion {
  /// Left edge.
  left: f32,
  /// Lower edge.
  bottom: f32,
  /// Horizontal extent.
  width: f32,
  /// Vertical extent.
  height: f32,
}
227
228impl PdfRegion {
229 fn top(&self) -> f32 {
230 self.bottom + self.height
231 }
232}
233
234fn text_only_page_lines(raw_text: &str, col: usize) -> PdfPageForAnsi {
235 let lines = cli_justify::justify_pdf_page(raw_text, col).lines;
236 let line_kinds = vec![PdfLineKind::Text; lines.len()];
237 PdfPageForAnsi { lines, line_kinds }
238}
239
240fn render_pdf_images(
241 doc: &pdf_oxide::PdfDocument,
242 page_0based: usize,
243 col: usize,
244 images: &[pdf_oxide::extractors::PdfImage],
245) -> Vec<VisualImageRows> {
246 if col == 0 {
247 return Vec::new();
248 }
249 let (page_left, page_width) = doc
250 .get_page_media_box(page_0based)
251 .ok()
252 .map(|(llx, _, urx, _)| (llx, (urx - llx).abs()))
253 .filter(|(_, w)| *w > 0.0)
254 .unwrap_or((0.0, 612.0));
255
256 let mut out = Vec::new();
257 for image in images {
258 let Some(bbox) = image.bbox() else {
259 continue;
260 };
261 if bbox.width <= 0.0 || bbox.height <= 0.0 {
262 continue;
263 }
264 let Ok(dynamic_image) = image.to_dynamic_image() else {
265 continue;
266 };
267 if let Some(rows) = render_dynamic_image_region(
268 &dynamic_image,
269 PdfRegion {
270 left: bbox.left(),
271 bottom: bbox.top(),
272 width: bbox.width,
273 height: bbox.height,
274 },
275 page_left,
276 page_width,
277 col,
278 ) {
279 out.push(rows);
280 }
281 }
282 out
283}
284
285fn render_dynamic_image_region(
286 dynamic_image: &image::DynamicImage,
287 region: PdfRegion,
288 page_left: f32,
289 page_width: f32,
290 col: usize,
291) -> Option<VisualImageRows> {
292 let left_cells = pdf_x_to_cells(region.left, page_left, page_width, col);
293 let left_cells = left_cells.min(col.saturating_sub(1));
294 let width_cells = pdf_width_to_cells(region.width, page_width, col);
295 let width_cells = width_cells.max(1).min(col.saturating_sub(left_cells));
296 if width_cells == 0 {
297 return None;
298 }
299 let height_rows =
300 pdf_image_height_rows(region.width, region.height, width_cells);
301 let lines = render_half_block(
302 dynamic_image,
303 RenderConfig::new(Some(width_cells as u32), Some(height_rows as u32)),
304 );
305 if lines.is_empty() {
306 return None;
307 }
308 Some(VisualImageRows {
309 top: region.top(),
310 left_cells,
311 width_cells,
312 region,
313 lines,
314 })
315}
316
/// Runs OCR over the page's embedded images and detected vector regions and
/// returns the recognised text as positioned rows.
///
/// A region is only processed when `should_ocr_image_region` accepts it.
#[cfg(feature = "pdf-ocr-bundled")]
fn ocr_visual_text_rows(
  doc: &pdf_oxide::PdfDocument,
  page_0based: usize,
  images: &[pdf_oxide::extractors::PdfImage],
  engine: &pdf_oxide::ocr::OcrEngine,
  native_rows: &[VisualTextRow],
) -> Vec<VisualTextRow> {
  let mut recognised = Vec::new();

  // Embedded images first.
  for image in images {
    let region = match image.bbox() {
      Some(bbox) if !(bbox.width <= 0.0 || bbox.height <= 0.0) => PdfRegion {
        left: bbox.left(),
        bottom: bbox.top(),
        width: bbox.width,
        height: bbox.height,
      },
      _ => continue,
    };
    if !should_ocr_image_region(region, native_rows) {
      continue;
    }
    if let Ok(dynamic_image) = image.to_dynamic_image() {
      recognised.extend(ocr_dynamic_image_text_rows(
        engine,
        &dynamic_image,
        region,
      ));
    }
  }

  // Then vector regions rendered to images.
  let vector_images =
    render_vector_diagram_images(doc, page_0based, native_rows);
  for (region, dynamic_image) in vector_images {
    if should_ocr_image_region(region, native_rows) {
      recognised.extend(ocr_dynamic_image_text_rows(
        engine,
        &dynamic_image,
        region,
      ));
    }
  }

  recognised
}
359
/// Decides whether OCR should run on `region`.
///
/// OCR is skipped when the native text inside the region is already
/// sufficient. Otherwise it runs when the page has no native rows at all,
/// or when a figure caption lies near the region.
#[cfg(feature = "pdf-ocr-bundled")]
fn should_ocr_image_region(
  region: PdfRegion,
  native_rows: &[VisualTextRow],
) -> bool {
  let already_covered =
    native_text_is_sufficient_in_region(native_rows, region);
  !already_covered
    && (native_rows.is_empty()
      || has_nearby_figure_caption(region, native_rows))
}
373
/// Reports whether the native rows overlapping `region` already carry
/// enough text that OCR is unnecessary.
///
/// "Enough" means at least eight normalised (alphanumeric, lower-cased)
/// characters in total. Characters are counted, not UTF-8 bytes, so
/// non-ASCII text is held to the same threshold as ASCII text.
#[cfg(feature = "pdf-ocr-bundled")]
fn native_text_is_sufficient_in_region(
  native_rows: &[VisualTextRow],
  region: PdfRegion,
) -> bool {
  const MIN_NATIVE_CHARS: usize = 8;
  let native_chars: usize = native_rows
    .iter()
    .filter(|row| visual_text_row_overlaps_region(row, region))
    .map(|row| normalized_visual_text(&row.text).chars().count())
    .sum();
  native_chars >= MIN_NATIVE_CHARS
}
387
/// Tests whether a text row touches `region`, allowing 6 units of slack on
/// every side.
///
/// The row's right edge is estimated as 5 units per character.
#[cfg(any(feature = "pdf-rendering", feature = "pdf-ocr-bundled", test))]
fn visual_text_row_overlaps_region(
  row: &VisualTextRow,
  region: PdfRegion,
) -> bool {
  let region_right = region.left + region.width;
  let estimated_row_right = row.left + row.text.chars().count() as f32 * 5.0;

  let within_vertical_band =
    row.top <= region.top() + 6.0 && row.top >= region.bottom - 6.0;
  let within_horizontal_band =
    row.left <= region_right + 6.0 && estimated_row_right >= region.left - 6.0;

  within_vertical_band && within_horizontal_band
}
400
/// Runs the OCR engine on one image and maps every recognised span back to
/// page coordinates inside `pdf_region`.
///
/// An OCR failure yields an empty list. Spans that are blank after
/// normalisation, or whose polygon has a non-finite coordinate, are dropped.
#[cfg(feature = "pdf-ocr-bundled")]
fn ocr_dynamic_image_text_rows(
  engine: &pdf_oxide::ocr::OcrEngine,
  image: &image::DynamicImage,
  pdf_region: PdfRegion,
) -> Vec<VisualTextRow> {
  let output = match engine.ocr_image(image) {
    Ok(output) => output,
    Err(_) => return Vec::new(),
  };
  // Guard against zero-sized images so the scaling never divides by zero.
  let image_width = image.width().max(1) as f32;
  let image_height = image.height().max(1) as f32;

  let mut rows = Vec::new();
  for span in output.spans {
    let text = normalize_visual_text_row(span.text.trim());
    if text.trim().is_empty() {
      continue;
    }
    let anchor = ocr_polygon_pdf_anchor(
      &span.polygon,
      pdf_region,
      image_width,
      image_height,
    );
    if let Some((left, top)) = anchor {
      rows.push(VisualTextRow { top, left, text });
    }
  }
  rows
}
431
/// Maps the smallest x and y of an OCR polygon (image pixels) to a
/// `(left, top)` anchor inside `pdf_region`.
///
/// The pixel y axis is flipped: pixel row 0 maps to the region's upper
/// edge. Returns `None` when any polygon coordinate is not finite.
#[cfg(feature = "pdf-ocr-bundled")]
fn ocr_polygon_pdf_anchor(
  polygon: &[[f32; 2]; 4],
  pdf_region: PdfRegion,
  image_width: f32,
  image_height: f32,
) -> Option<(f32, f32)> {
  if polygon.iter().flatten().any(|value| !value.is_finite()) {
    return None;
  }
  let (min_x, min_y) = polygon.iter().fold(
    (f32::INFINITY, f32::INFINITY),
    |(lowest_x, lowest_y), point| {
      (lowest_x.min(point[0]), lowest_y.min(point[1]))
    },
  );
  if !(min_x.is_finite() && min_y.is_finite()) {
    return None;
  }

  let x_fraction = min_x / image_width;
  let y_fraction = min_y / image_height;
  let left = pdf_region.left + x_fraction * pdf_region.width;
  let top = pdf_region.top() - y_fraction * pdf_region.height;
  Some((left, top))
}
455
/// Detects vector-drawn regions on a page and rasterises each OCR-worthy
/// one at 120 DPI.
///
/// Regions that fail to render or decode are skipped; a failure or panic
/// while extracting paths is treated as "no paths".
#[cfg(feature = "pdf-ocr-bundled")]
fn render_vector_diagram_images(
  doc: &pdf_oxide::PdfDocument,
  page_0based: usize,
  native_rows: &[VisualTextRow],
) -> Vec<(PdfRegion, image::DynamicImage)> {
  let (page_left, page_top, page_width, page_height) =
    page_metrics(doc, page_0based);
  let extracted = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
    doc.extract_paths(page_0based)
  }));
  let paths = extracted.ok().and_then(Result::ok).unwrap_or_default();

  let candidates = detect_vector_diagram_regions(
    &paths,
    page_left,
    page_top,
    page_width,
    page_height,
    native_rows,
    true,
  );
  let options = pdf_oxide::rendering::RenderOptions::with_dpi(120);

  let mut rendered_regions = Vec::new();
  for region in candidates {
    if !should_ocr_image_region(region, native_rows) {
      continue;
    }
    let Ok(rendered) = pdf_oxide::rendering::render_page_region(
      doc,
      page_0based,
      (region.left, region.bottom, region.width, region.height),
      &options,
    ) else {
      continue;
    };
    if let Ok(dynamic_image) = image::load_from_memory(&rendered.data) {
      rendered_regions.push((region, dynamic_image));
    }
  }
  rendered_regions
}
497
/// Detects vector-drawn regions on a page, rasterises them at 120 DPI and
/// converts each to half-block terminal rows.
///
/// Returns an empty list when `col` is zero. Regions that fail to render,
/// decode, or fit are skipped.
#[cfg(feature = "pdf-rendering")]
fn render_vector_diagram_regions(
  doc: &pdf_oxide::PdfDocument,
  page_0based: usize,
  col: usize,
  native_rows: &[VisualTextRow],
  allow_missing_native_text: bool,
) -> Vec<VisualImageRows> {
  if col == 0 {
    return Vec::new();
  }

  let (page_left, page_top, page_width, page_height) =
    page_metrics(doc, page_0based);
  // A failure or panic while extracting paths is treated as "no paths".
  let extracted = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
    doc.extract_paths(page_0based)
  }));
  let paths = extracted.ok().and_then(Result::ok).unwrap_or_default();

  let candidates = detect_vector_diagram_regions(
    &paths,
    page_left,
    page_top,
    page_width,
    page_height,
    native_rows,
    allow_missing_native_text,
  );
  let options = pdf_oxide::rendering::RenderOptions::with_dpi(120);

  candidates
    .into_iter()
    .filter_map(|region| {
      let rendered = pdf_oxide::rendering::render_page_region(
        doc,
        page_0based,
        (region.left, region.bottom, region.width, region.height),
        &options,
      )
      .ok()?;
      let dynamic_image = image::load_from_memory(&rendered.data).ok()?;
      render_dynamic_image_region(
        &dynamic_image,
        region,
        page_left,
        page_width,
        col,
      )
    })
    .collect()
}
555
/// Returns `(min_x, min_y, width, height)` of the page's media box.
///
/// The corners are normalised so the origin is always the smaller
/// coordinate. When the media box is unavailable or has a zero extent, a
/// 612 x 792 page at the origin is assumed.
#[cfg(any(feature = "pdf-rendering", feature = "pdf-ocr-bundled"))]
fn page_metrics(
  doc: &pdf_oxide::PdfDocument,
  page_0based: usize,
) -> (f32, f32, f32, f32) {
  const FALLBACK: (f32, f32, f32, f32) = (0.0, 0.0, 612.0, 792.0);

  match doc.get_page_media_box(page_0based) {
    Ok((llx, lly, urx, ury)) => {
      let width = (urx - llx).abs();
      let height = (ury - lly).abs();
      if width > 0.0 && height > 0.0 {
        (llx.min(urx), lly.min(ury), width, height)
      } else {
        FALLBACK
      }
    }
    Err(_) => FALLBACK,
  }
}
570
571#[cfg(not(feature = "pdf-rendering"))]
572fn render_vector_diagram_regions(
573 _doc: &pdf_oxide::PdfDocument,
574 _page_0based: usize,
575 _col: usize,
576 _native_rows: &[VisualTextRow],
577 _allow_missing_native_text: bool,
578) -> Vec<VisualImageRows> {
579 Vec::new()
580}
581
/// Groups nearby vector paths into clusters and returns the padded bounding
/// regions that qualify as diagrams.
///
/// A path is considered only when `is_table_primitive()` holds, its bbox is
/// finite and not degenerate in both dimensions, and it spans at most 95%
/// of the page in each dimension. A cluster qualifies when it holds at
/// least three paths, its padded region is at least 24 x 24, and
/// `should_render_vector_diagram_region` accepts it.
///
/// NOTE(review): `page_top` is combined with `page_height` to form the
/// opposite page edge, and callers in this file pass the media box's
/// smaller y for it — the name follows the screen-style edge naming used
/// for bboxes here.
#[cfg(any(feature = "pdf-rendering", test))]
fn detect_vector_diagram_regions(
  paths: &[pdf_oxide::elements::PathContent],
  page_left: f32,
  page_top: f32,
  page_width: f32,
  page_height: f32,
  native_rows: &[VisualTextRow],
  allow_missing_native_text: bool,
) -> Vec<PdfRegion> {
  let max_path_width = page_width * 0.95;
  let max_path_height = page_height * 0.95;

  let mut clusters: Vec<VectorPathCluster> = Vec::new();
  for path in paths {
    let bbox = path.bbox;
    let geometry_is_finite = bbox.x.is_finite()
      && bbox.y.is_finite()
      && bbox.width.is_finite()
      && bbox.height.is_finite();
    let is_degenerate = bbox.width <= 0.0 && bbox.height <= 0.0;
    let is_page_sized =
      bbox.width > max_path_width || bbox.height > max_path_height;
    if !path.is_table_primitive()
      || !geometry_is_finite
      || is_degenerate
      || is_page_sized
    {
      continue;
    }

    add_vector_path_to_clusters(
      &mut clusters,
      VectorPathBounds {
        left: bbox.left(),
        bottom: bbox.top(),
        right: bbox.right(),
        top: bbox.bottom(),
      },
    );
  }

  let page_right = page_left + page_width;
  let page_bottom = page_top + page_height;

  let mut regions = Vec::new();
  for cluster in clusters {
    if cluster.count < 3 {
      continue;
    }
    let Some(region) =
      cluster.region_with_padding(page_left, page_top, page_right, page_bottom)
    else {
      continue;
    };
    let large_enough = region.width >= 24.0 && region.height >= 24.0;
    if large_enough
      && should_render_vector_diagram_region(
        region,
        native_rows,
        allow_missing_native_text,
      )
    {
      regions.push(region);
    }
  }
  regions
}
635
/// Edge coordinates of a single vector path's bounding box.
#[cfg(any(feature = "pdf-rendering", test))]
#[derive(Debug, Clone, Copy)]
struct VectorPathBounds {
  /// Smallest x.
  left: f32,
  /// Smallest y.
  bottom: f32,
  /// Largest x.
  right: f32,
  /// Largest y.
  top: f32,
}
644
/// Running bounding box of a group of vector paths that lie near each
/// other, together with the number of paths merged into it.
#[cfg(any(feature = "pdf-rendering", test))]
#[derive(Debug, Clone, Copy)]
struct VectorPathCluster {
  /// Number of paths merged into this cluster.
  count: usize,
  /// Smallest x over all member paths.
  left: f32,
  /// Smallest y over all member paths.
  bottom: f32,
  /// Largest x over all member paths.
  right: f32,
  /// Largest y over all member paths.
  top: f32,
}
654
#[cfg(any(feature = "pdf-rendering", test))]
impl VectorPathCluster {
  /// Starts a cluster containing exactly one path.
  fn new(bounds: VectorPathBounds) -> Self {
    let VectorPathBounds { left, bottom, right, top } = bounds;
    Self { count: 1, left, bottom, right, top }
  }

  /// Reports whether `bounds` overlaps this cluster once the cluster is
  /// grown by 48 units on every side.
  fn is_near(&self, bounds: VectorPathBounds) -> bool {
    const CLUSTER_TOLERANCE: f32 = 48.0;
    let close_horizontally = bounds.left <= self.right + CLUSTER_TOLERANCE
      && bounds.right >= self.left - CLUSTER_TOLERANCE;
    let close_vertically = bounds.bottom <= self.top + CLUSTER_TOLERANCE
      && bounds.top >= self.bottom - CLUSTER_TOLERANCE;
    close_horizontally && close_vertically
  }

  /// Adds one path to the cluster and grows the bounding box to cover it.
  fn merge_bounds(&mut self, bounds: VectorPathBounds) {
    self.count += 1;
    self.grow_to(bounds.left, bounds.bottom, bounds.right, bounds.top);
  }

  /// Absorbs another cluster: path counts add up and the boxes are united.
  fn merge_cluster(&mut self, other: Self) {
    self.count += other.count;
    self.grow_to(other.left, other.bottom, other.right, other.top);
  }

  /// Extends the bounding box so it also covers the given edges.
  fn grow_to(&mut self, left: f32, bottom: f32, right: f32, top: f32) {
    self.left = self.left.min(left);
    self.bottom = self.bottom.min(bottom);
    self.right = self.right.max(right);
    self.top = self.top.max(top);
  }

  /// Returns the cluster's box grown by 4 units per side and clamped to
  /// the page rectangle, or `None` when the cluster's left or bottom edge
  /// is not finite.
  ///
  /// `page_top`/`page_bottom` are used as the lower and upper clamp for
  /// the y axis respectively.
  fn region_with_padding(
    &self,
    page_left: f32,
    page_top: f32,
    page_right: f32,
    page_bottom: f32,
  ) -> Option<PdfRegion> {
    const PAD: f32 = 4.0;
    if !(self.left.is_finite() && self.bottom.is_finite()) {
      return None;
    }
    let left = (self.left - PAD).max(page_left);
    let bottom = (self.bottom - PAD).max(page_top);
    let right = (self.right + PAD).min(page_right);
    let top = (self.top + PAD).min(page_bottom);
    // Clamping can invert the edges; never report a negative size.
    let width = (right - left).max(0.0);
    let height = (top - bottom).max(0.0);
    Some(PdfRegion { left, bottom, width, height })
  }
}
714
/// Adds one path to the cluster list.
///
/// The path joins the first cluster it is near, or starts a new cluster
/// when none is near. After joining, every other cluster that is now near
/// the grown cluster is merged into it. The scan restarts after each merge
/// because the merged cluster's box may have grown enough to reach
/// clusters that were already examined.
#[cfg(any(feature = "pdf-rendering", test))]
fn add_vector_path_to_clusters(
  clusters: &mut Vec<VectorPathCluster>,
  bounds: VectorPathBounds,
) {
  let Some(mut cluster_idx) =
    clusters.iter().position(|cluster| cluster.is_near(bounds))
  else {
    clusters.push(VectorPathCluster::new(bounds));
    return;
  };

  clusters[cluster_idx].merge_bounds(bounds);

  let mut idx = 0;
  while idx < clusters.len() {
    if idx == cluster_idx {
      idx += 1;
      continue;
    }
    let candidate_bounds = VectorPathBounds {
      left: clusters[idx].left,
      bottom: clusters[idx].bottom,
      right: clusters[idx].right,
      top: clusters[idx].top,
    };
    if !clusters[cluster_idx].is_near(candidate_bounds) {
      idx += 1;
      continue;
    }

    let absorbed = clusters.remove(idx);
    // Removing an earlier element shifts the target one slot to the left.
    if idx < cluster_idx {
      cluster_idx -= 1;
    }
    clusters[cluster_idx].merge_cluster(absorbed);
    // The target grew: re-examine every cluster, including earlier ones.
    // Each restart follows a removal, so the loop terminates.
    idx = 0;
  }
}
748
/// Decides whether a detected vector region should be rendered.
///
/// A nearby figure caption is always required. In addition, the region
/// must contain native text unless `allow_missing_native_text` is set.
#[cfg(any(feature = "pdf-rendering", test))]
fn should_render_vector_diagram_region(
  region: PdfRegion,
  native_rows: &[VisualTextRow],
  allow_missing_native_text: bool,
) -> bool {
  has_nearby_figure_caption(region, native_rows)
    && (allow_missing_native_text
      || has_native_text_inside_region(region, native_rows))
}
761
/// Reports whether any native row is a figure caption lying close to
/// `region`: within 80 units horizontally (row width estimated at 5 units
/// per character) and within 90 units vertically.
#[cfg(any(feature = "pdf-rendering", feature = "pdf-ocr-bundled", test))]
fn has_nearby_figure_caption(
  region: PdfRegion,
  native_rows: &[VisualTextRow],
) -> bool {
  for row in native_rows {
    if !is_figure_caption(&row.text) {
      continue;
    }
    let starts_before_right_margin =
      row.left <= region.left + region.width + 80.0;
    let ends_after_left_margin = row.left
      + row.text.chars().count() as f32 * 5.0
      >= region.left - 80.0;
    if starts_before_right_margin
      && ends_after_left_margin
      && vertical_distance_to_region(region, row.top) <= 90.0
    {
      return true;
    }
  }
  false
}
774
/// Reports whether any non-caption native row with at least two
/// alphanumeric characters overlaps `region`.
#[cfg(any(feature = "pdf-rendering", feature = "pdf-ocr-bundled", test))]
fn has_native_text_inside_region(
  region: PdfRegion,
  native_rows: &[VisualTextRow],
) -> bool {
  for row in native_rows {
    let is_label_text =
      !is_figure_caption(&row.text) && visual_alnum_len(&row.text) >= 2;
    if is_label_text && visual_text_row_overlaps_region(row, region) {
      return true;
    }
  }
  false
}
786
/// Counts the alphanumeric characters (Unicode-aware) in `text`.
#[cfg(any(feature = "pdf-rendering", feature = "pdf-ocr-bundled", test))]
fn visual_alnum_len(text: &str) -> usize {
  let mut total = 0usize;
  for ch in text.chars() {
    if ch.is_alphanumeric() {
      total += 1;
    }
  }
  total
}
791
/// Reports whether `text`, ignoring leading whitespace, starts with
/// `"Figure "` immediately followed by an ASCII digit.
#[cfg(any(feature = "pdf-rendering", feature = "pdf-ocr-bundled", test))]
fn is_figure_caption(text: &str) -> bool {
  text
    .trim_start()
    .strip_prefix("Figure ")
    .and_then(|rest| rest.chars().next())
    .is_some_and(|first| first.is_ascii_digit())
}
800
/// Vertical gap between `y` and `region`: zero when `y` lies between the
/// region's lower and upper edges, otherwise the distance to the nearer
/// edge.
#[cfg(any(feature = "pdf-rendering", feature = "pdf-ocr-bundled", test))]
fn vertical_distance_to_region(region: PdfRegion, y: f32) -> f32 {
  if y < region.bottom {
    return region.bottom - y;
  }
  let upper_edge = region.top();
  if y > upper_edge {
    return y - upper_edge;
  }
  0.0
}
811
/// Converts a page x coordinate to a terminal cell index by scaling its
/// offset from `page_left` to `col` columns.
///
/// Coordinates left of the page map to cell 0. Returns 0 when the page
/// width is not positive or `col` is zero.
fn pdf_x_to_cells(
  x: f32,
  page_left: f32,
  page_width: f32,
  col: usize,
) -> usize {
  if col == 0 || page_width <= 0.0 {
    return 0;
  }
  let offset = (x - page_left).max(0.0);
  let fraction = offset / page_width;
  (fraction * col as f32).round() as usize
}
823
/// Converts a page-space width to a number of terminal cells, scaling by
/// `col / page_width` and rounding to the nearest cell.
///
/// Negative widths count as zero. Returns 0 when the page width is not
/// positive or `col` is zero.
fn pdf_width_to_cells(width: f32, page_width: f32, col: usize) -> usize {
  if col == 0 || page_width <= 0.0 {
    return 0;
  }
  let fraction = width.max(0.0) / page_width;
  let cells = fraction * col as f32;
  cells.round() as usize
}
830
/// Computes how many rows an image needs so that its height-to-width ratio
/// is kept for a width of `width_cells` cells.
///
/// The result is never less than 1; degenerate inputs (non-positive size
/// or zero cells) also yield 1.
fn pdf_image_height_rows(
  bbox_width: f32,
  bbox_height: f32,
  width_cells: usize,
) -> usize {
  let degenerate = bbox_width <= 0.0 || bbox_height <= 0.0 || width_cells == 0;
  if degenerate {
    return 1;
  }
  let aspect = bbox_height / bbox_width;
  let rows = (aspect * width_cells as f32).round();
  rows.max(1.0) as usize
}
841
842fn compose_visual_page(
843 text_rows: Vec<VisualTextRow>,
844 mut image_rows: Vec<VisualImageRows>,
845 col: usize,
846) -> PdfPageForAnsi {
847 enum Event {
848 Text(VisualTextRow),
849 Image(VisualImageRows),
850 }
851
852 let text_rows = overlay_text_rows_on_images(text_rows, &mut image_rows);
853 let mut events: Vec<Event> =
854 Vec::with_capacity(text_rows.len() + image_rows.len());
855 events.extend(text_rows.into_iter().map(Event::Text));
856 events.extend(image_rows.into_iter().map(Event::Image));
857 events.sort_by(|a, b| {
858 let a_top = match a {
859 Event::Text(row) => row.top,
860 Event::Image(row) => row.top,
861 };
862 let b_top = match b {
863 Event::Text(row) => row.top,
864 Event::Image(row) => row.top,
865 };
866 b_top.partial_cmp(&a_top).unwrap_or(std::cmp::Ordering::Equal)
867 });
868
869 let page_left = events
870 .iter()
871 .filter_map(|event| match event {
872 Event::Text(row) if !row.text.trim().is_empty() => Some(row.left),
873 _ => None,
874 })
875 .fold(f32::INFINITY, f32::min);
876 let page_left = if page_left.is_finite() { page_left } else { 0.0 };
877
878 let mut lines = Vec::new();
879 let mut line_kinds = Vec::new();
880 for event in events {
881 match event {
882 Event::Text(row) => {
883 if row.text.trim().is_empty() {
884 continue;
885 }
886 let indent =
887 (((row.left - page_left) / 5.0).round()).clamp(0.0, 20.0) as usize;
888 let text_width = col.saturating_sub(indent).max(1);
889 let wrapped_lines = if row.text.chars().count() <= text_width {
890 vec![row.text]
891 } else {
892 cli_justify::justify(&row.text, text_width)
893 };
894 for wrapped in wrapped_lines {
895 lines.push(format!("{}{}", " ".repeat(indent), wrapped));
896 line_kinds.push(PdfLineKind::Text);
897 }
898 }
899 Event::Image(row) => {
900 let indent = " ".repeat(row.left_cells);
901 for line in row.lines {
902 lines.push(format!("{indent}{line}\x1b[0m"));
903 line_kinds.push(PdfLineKind::AnsiArt);
904 }
905 }
906 }
907 }
908
909 if lines.is_empty() {
910 lines.push(String::new());
911 line_kinds.push(PdfLineKind::Text);
912 }
913
914 PdfPageForAnsi { lines, line_kinds }
915}
916
917fn overlay_text_rows_on_images(
918 text_rows: Vec<VisualTextRow>,
919 image_rows: &mut [VisualImageRows],
920) -> Vec<VisualTextRow> {
921 let mut remaining = Vec::new();
922 for row in text_rows {
923 if !overlay_text_row_on_first_matching_image(&row, image_rows) {
924 remaining.push(row);
925 }
926 }
927 remaining
928}
929
930fn overlay_text_row_on_first_matching_image(
931 row: &VisualTextRow,
932 image_rows: &mut [VisualImageRows],
933) -> bool {
934 for image in image_rows {
935 if !image_contains_text_row(image, row) {
936 continue;
937 }
938 let line_idx = image_text_line_index(image, row.top);
939 let col_idx = image_text_col_index(image, row.left);
940 let Some(line) = image.lines.get_mut(line_idx) else {
941 return false;
942 };
943 *line = overlay_text_on_ansi_line(line, col_idx, row.text.trim());
944 return true;
945 }
946 false
947}
948
949fn image_contains_text_row(
950 image: &VisualImageRows,
951 row: &VisualTextRow,
952) -> bool {
953 let right = image.region.left + image.region.width;
954 let bottom = image.region.bottom;
955 let top = image.region.top();
956 let vertical_pad = (image.region.height / image.lines.len().max(1) as f32
957 * 0.5)
958 .clamp(2.0, 6.0);
959 row.top <= top + vertical_pad
960 && row.top >= bottom - vertical_pad
961 && row.left <= right
962 && row.left + row.text.chars().count() as f32 * 5.0 >= image.region.left
963}
964
965fn image_text_line_index(image: &VisualImageRows, text_top: f32) -> usize {
966 if image.lines.is_empty() || image.region.height <= 0.0 {
967 return 0;
968 }
969 let rel = ((image.region.top() - text_top) / image.region.height)
970 .clamp(0.0, 0.999_999);
971 (rel * image.lines.len() as f32).floor() as usize
972}
973
974fn image_text_col_index(image: &VisualImageRows, text_left: f32) -> usize {
975 if image.region.width <= 0.0 || image.width_cells == 0 {
976 return 0;
977 }
978 let rel =
979 ((text_left - image.region.left) / image.region.width).clamp(0.0, 1.0);
980 (rel * image.width_cells as f32).round() as usize
981}
982
983fn overlay_text_on_ansi_line(
984 line: &str,
985 start_col: usize,
986 text: &str,
987) -> String {
988 let available = ansi_visible_width(line).saturating_sub(start_col);
989 if available == 0 {
990 return line.to_string();
991 }
992 let text: String =
993 text.chars().filter(|ch| !ch.is_control()).take(available).collect();
994 if text.is_empty() {
995 return line.to_string();
996 }
997 let overlay_width = text.chars().count();
998 let mut out = String::with_capacity(line.len() + text.len() + 8);
999 let mut chars = line.chars().peekable();
1000 let mut visible_col = 0usize;
1001 let mut inserted = false;
1002
1003 while let Some(ch) = chars.next() {
1004 if ch == '\x1b' {
1005 out.push(ch);
1006 for next in chars.by_ref() {
1007 out.push(next);
1008 if next == 'm' {
1009 break;
1010 }
1011 }
1012 continue;
1013 }
1014
1015 if !inserted && visible_col >= start_col {
1016 out.push_str("\x1b[0m");
1017 out.push_str(&text);
1018 out.push_str("\x1b[0m");
1019 inserted = true;
1020 }
1021
1022 if inserted
1023 && visible_col >= start_col
1024 && visible_col < start_col + overlay_width
1025 {
1026 visible_col += 1;
1027 continue;
1028 }
1029
1030 out.push(ch);
1031 visible_col += 1;
1032 }
1033
1034 if !inserted {
1035 out.push_str(&" ".repeat(start_col.saturating_sub(visible_col)));
1036 out.push_str("\x1b[0m");
1037 out.push_str(&text);
1038 }
1039
1040 out
1041}
1042
/// Counts the characters of `line` that are not part of an escape
/// sequence, where a sequence runs from ESC up to and including the next
/// `m` (or to the end of the line when no `m` follows).
fn ansi_visible_width(line: &str) -> usize {
  let mut width = 0usize;
  let mut inside_escape = false;
  for ch in line.chars() {
    if inside_escape {
      if ch == 'm' {
        inside_escape = false;
      }
    } else if ch == '\x1b' {
      inside_escape = true;
    } else {
      width += 1;
    }
  }
  width
}
1059
/// Reports whether an OCR row repeats text that a native row already
/// provides at roughly the same position.
///
/// Rows are compared on their normalised (alphanumeric, lower-cased) text;
/// a duplicate is a native row within 12 units vertically and 24 units
/// horizontally whose normalised text contains, or is contained in, the
/// OCR row's. An OCR row with no alphanumeric content is always treated as
/// a duplicate so that it is dropped.
#[cfg(feature = "pdf-ocr-bundled")]
fn has_near_duplicate_visual_text(
  native_rows: &[VisualTextRow],
  ocr_row: &VisualTextRow,
) -> bool {
  let ocr_norm = normalized_visual_text(&ocr_row.text);
  if ocr_norm.is_empty() {
    return true;
  }
  native_rows.iter().any(|native| {
    let is_close = (native.top - ocr_row.top).abs() <= 12.0
      && (native.left - ocr_row.left).abs() <= 24.0;
    if !is_close {
      return false;
    }
    let native_norm = normalized_visual_text(&native.text);
    // An empty pattern is contained in every string, so a punctuation-only
    // native row must not be allowed to "match" and swallow the OCR text.
    !native_norm.is_empty()
      && (native_norm.contains(&ocr_norm) || ocr_norm.contains(&native_norm))
  })
}
1078
/// Reduces `text` to its alphanumeric characters, lower-cased, for fuzzy
/// comparison between native and OCR text.
#[cfg(feature = "pdf-ocr-bundled")]
fn normalized_visual_text(text: &str) -> String {
  let mut normalized = String::new();
  for ch in text.chars() {
    if ch.is_alphanumeric() {
      normalized.extend(ch.to_lowercase());
    }
  }
  normalized
}
1087
/// Test helper: justifies `raw_text` to `col` columns and pairs each
/// resulting line with the position of the extracted row at the same index.
///
/// Lines beyond the last extracted row reuse that row's position, moved
/// down by one unit per extra line. When no rows can be extracted, lines
/// are stacked at `left = 0` with `top = -index`.
#[cfg(test)]
fn positioned_sanitized_text_rows(
  doc: &pdf_oxide::PdfDocument,
  page_0based: usize,
  raw_text: &str,
  col: usize,
) -> Vec<VisualTextRow> {
  let sanitized_lines = cli_justify::justify_pdf_page(raw_text, col).lines;
  let extracted = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
    extract_visual_text_rows(doc, page_0based)
  }));
  let anchors = extracted.ok().flatten().unwrap_or_default();
  let last_anchor_idx = anchors.len().saturating_sub(1);

  let mut rows = Vec::with_capacity(sanitized_lines.len());
  for (idx, text) in sanitized_lines.into_iter().enumerate() {
    let row = match anchors.get(idx).or_else(|| anchors.last()) {
      Some(anchor) => {
        let extra = idx.saturating_sub(last_anchor_idx) as f32;
        VisualTextRow { top: anchor.top - extra, left: anchor.left, text }
      }
      // No anchors at all: synthesise a simple top-to-bottom stack.
      None => VisualTextRow { top: -(idx as f32), left: 0.0, text },
    };
    rows.push(row);
  }
  rows
}
1124
1125fn positioned_visual_text_rows(
1126 doc: &pdf_oxide::PdfDocument,
1127 page_0based: usize,
1128) -> Vec<VisualTextRow> {
1129 std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
1130 extract_visual_text_rows(doc, page_0based)
1131 }))
1132 .ok()
1133 .flatten()
1134 .map(filter_visual_text_rows)
1135 .unwrap_or_default()
1136}
1137
1138fn filter_visual_text_rows(rows: Vec<VisualTextRow>) -> Vec<VisualTextRow> {
1139 let mut rows: Vec<VisualTextRow> = rows
1140 .into_iter()
1141 .filter_map(|mut row| {
1142 row.text = normalize_visual_text_row(&row.text);
1143 if row.text.trim().is_empty() || is_visual_running_header(&row.text) {
1144 None
1145 } else {
1146 Some(row)
1147 }
1148 })
1149 .collect();
1150
1151 const ISOLATED_GAP: f32 = 30.0;
1152 while rows.len() >= 2
1153 && is_digits_only(&rows[0].text)
1154 && (rows[0].top - rows[1].top).abs() > ISOLATED_GAP
1155 {
1156 rows.remove(0);
1157 }
1158 while rows.len() >= 2 {
1159 let last = rows.len() - 1;
1160 if is_digits_only(&rows[last].text)
1161 && (rows[last - 1].top - rows[last].top).abs() > ISOLATED_GAP
1162 {
1163 rows.remove(last);
1164 } else {
1165 break;
1166 }
1167 }
1168
1169 rows
1170}
1171
1172fn normalize_visual_text_row(text: &str) -> String {
1173 let mut normalized = String::with_capacity(text.len());
1174 for ch in text.chars() {
1175 if is_private_use_or_format_char(ch) {
1176 continue;
1177 }
1178 if ch == '\u{00A0}' {
1179 normalized.push(' ');
1180 } else {
1181 normalized.push(ch);
1182 }
1183 }
1184 normalized
1185}
1186
1187fn is_private_use_or_format_char(ch: char) -> bool {
1188 matches!(
1189 ch,
1190 '\u{E000}'..='\u{F8FF}'
1191 | '\u{F0000}'..='\u{FFFFD}'
1192 | '\u{100000}'..='\u{10FFFD}'
1193 | '\u{FEFF}'
1194 | '\u{200B}'..='\u{200D}'
1195 | '\u{2060}'
1196 )
1197}
1198
1199fn is_visual_running_header(text: &str) -> bool {
1200 let trimmed = text.trim();
1201 if trimmed.is_empty() {
1202 return false;
1203 }
1204
1205 is_chapter_section_visual_header(trimmed)
1206}
1207
1208fn is_chapter_section_visual_header(trimmed: &str) -> bool {
1209 let tokens: Vec<&str> = trimmed.split_whitespace().collect();
1210 if tokens.len() < 3 || tokens.len() > 6 {
1211 return false;
1212 }
1213
1214 let label = tokens[0];
1215 if !matches!(label, "CHAPTER" | "SECTION" | "APPENDIX" | "PART") {
1216 return false;
1217 }
1218
1219 let number = tokens[1];
1220 if number.is_empty() || number.len() > 8 {
1221 return false;
1222 }
1223 if !number.chars().all(|ch| ch.is_ascii_alphanumeric() || ch == '.') {
1224 return false;
1225 }
1226
1227 let looks_like_section_id = number.chars().any(|ch| ch.is_ascii_digit())
1228 || number.chars().all(|ch| ch.is_ascii_uppercase());
1229 if !looks_like_section_id {
1230 return false;
1231 }
1232
1233 let last = tokens[tokens.len() - 1];
1234 if last.chars().all(|ch| ch.is_ascii_digit()) {
1235 return false;
1236 }
1237
1238 has_visual_wide_gap_between(trimmed, number, last)
1239}
1240
/// Reports whether at least ten ASCII spaces lie between `first` and `last`
/// inside `trimmed`.
///
/// `first` is located as a whole whitespace-delimited token when such an
/// occurrence exists, so an identifier like `A` is not mistaken for the `A`
/// inside `APPENDIX`. Without this, the span would start inside the label and
/// the label/identifier separator would be counted as part of the gap. If
/// `first` never appears as a standalone token, the first plain substring
/// match is used instead. `last` is located by its final occurrence.
///
/// Returns `false` when either string is missing or when `last` does not
/// start after the end of `first`. The count is the total number of spaces in
/// the span, not the length of a single contiguous run.
fn has_visual_wide_gap_between(trimmed: &str, first: &str, last: &str) -> bool {
  const MIN_GAP_SPACES: usize = 10;

  // A neighbor position is a token boundary when it is whitespace or lies
  // outside the string.
  let is_boundary = |neighbor: Option<char>| match neighbor {
    Some(ch) => ch.is_whitespace(),
    None => true,
  };
  let token_start =
    trimmed.match_indices(first).map(|(start, _)| start).find(|&start| {
      // An empty needle has no token identity; leave it to the fallback.
      !first.is_empty()
        && is_boundary(trimmed[..start].chars().next_back())
        && is_boundary(trimmed[start + first.len()..].chars().next())
    });

  let Some(first_idx) = token_start.or_else(|| trimmed.find(first)) else {
    return false;
  };
  let first_end = first_idx + first.len();
  let Some(last_start) = trimmed.rfind(last) else {
    return false;
  };
  if last_start <= first_end {
    return false;
  }

  trimmed[first_end..last_start].chars().filter(|ch| *ch == ' ').count()
    >= MIN_GAP_SPACES
}
1254
1255fn extract_visual_text_rows(
1256 doc: &pdf_oxide::PdfDocument,
1257 page_0based: usize,
1258) -> Option<Vec<VisualTextRow>> {
1259 let mut lines = doc.extract_text_lines(page_0based).ok()?;
1260 if lines.is_empty() {
1261 return None;
1262 }
1263
1264 lines.sort_by(|a, b| {
1265 b.bbox
1266 .top()
1267 .partial_cmp(&a.bbox.top())
1268 .unwrap_or(std::cmp::Ordering::Equal)
1269 .then_with(|| {
1270 a.bbox
1271 .left()
1272 .partial_cmp(&b.bbox.left())
1273 .unwrap_or(std::cmp::Ordering::Equal)
1274 })
1275 });
1276
1277 const SAME_ROW_TOL: f32 = 3.0;
1278 const PT_PER_CHAR: f32 = 5.0;
1279
1280 let mut rows = Vec::new();
1281 let mut row_start = 0usize;
1282 let mut row_anchor_y = lines[0].bbox.top();
1283 for i in 1..=lines.len() {
1284 let break_row = i == lines.len()
1285 || (row_anchor_y - lines[i].bbox.top()).abs() > SAME_ROW_TOL;
1286 if break_row {
1287 let mut row: Vec<&pdf_oxide::layout::TextLine> =
1288 lines[row_start..i].iter().collect();
1289 row.sort_by(|a, b| {
1290 a.bbox
1291 .left()
1292 .partial_cmp(&b.bbox.left())
1293 .unwrap_or(std::cmp::Ordering::Equal)
1294 });
1295 let row_left =
1296 row.iter().map(|l| l.bbox.left()).fold(f32::INFINITY, f32::min);
1297 let mut body = String::new();
1298 let mut prev_right: Option<f32> = None;
1299 for line in row {
1300 for word in &line.words {
1301 if let Some(pr) = prev_right {
1302 let gap_pt = (word.bbox.left() - pr).max(0.0);
1303 let gap_chars = ((gap_pt / PT_PER_CHAR).round() as usize).max(1);
1304 for _ in 0..gap_chars {
1305 body.push(' ');
1306 }
1307 }
1308 body.push_str(&word.text);
1309 prev_right = Some(word.bbox.right());
1310 }
1311 }
1312 rows.push(VisualTextRow {
1313 top: row_anchor_y,
1314 left: row_left,
1315 text: body,
1316 });
1317 row_start = i;
1318 if i < lines.len() {
1319 row_anchor_y = lines[i].bbox.top();
1320 }
1321 }
1322 }
1323
1324 Some(rows)
1325}
1326
1327fn extract_page_text_lines(
1335 doc: &pdf_oxide::PdfDocument,
1336 page_0based: usize,
1337) -> Option<String> {
1338 let mut lines = doc.extract_text_lines(page_0based).ok()?;
1339 if lines.is_empty() {
1340 return None;
1341 }
1342
1343 lines.sort_by(|a, b| {
1345 b.bbox
1346 .top()
1347 .partial_cmp(&a.bbox.top())
1348 .unwrap_or(std::cmp::Ordering::Equal)
1349 .then_with(|| {
1350 a.bbox
1351 .left()
1352 .partial_cmp(&b.bbox.left())
1353 .unwrap_or(std::cmp::Ordering::Equal)
1354 })
1355 });
1356
1357 const SAME_ROW_TOL: f32 = 3.0;
1362
1363 const PT_PER_CHAR: f32 = 5.0;
1368 const MAX_INDENT_CHARS: usize = 20;
1369
1370 let mut rows: Vec<(f32, f32, String)> = Vec::new();
1378 let mut row_start = 0usize;
1379 let mut row_anchor_y = lines[0].bbox.top();
1380 for i in 1..=lines.len() {
1381 let break_row = i == lines.len()
1382 || (row_anchor_y - lines[i].bbox.top()).abs() > SAME_ROW_TOL;
1383 if break_row {
1384 let mut row: Vec<&pdf_oxide::layout::TextLine> =
1385 lines[row_start..i].iter().collect();
1386 row.sort_by(|a, b| {
1387 a.bbox
1388 .left()
1389 .partial_cmp(&b.bbox.left())
1390 .unwrap_or(std::cmp::Ordering::Equal)
1391 });
1392 let row_left =
1393 row.iter().map(|l| l.bbox.left()).fold(f32::INFINITY, f32::min);
1394 let mut body = String::with_capacity(64);
1401 let mut prev_right: Option<f32> = None;
1402 for line in row.iter() {
1403 for word in &line.words {
1404 if let Some(pr) = prev_right {
1405 let gap_pt = (word.bbox.left() - pr).max(0.0);
1406 let gap_chars = ((gap_pt / PT_PER_CHAR).round() as usize).max(1);
1407 for _ in 0..gap_chars {
1408 body.push(' ');
1409 }
1410 }
1411 body.push_str(&word.text);
1412 prev_right = Some(word.bbox.right());
1413 }
1414 }
1415 rows.push((row_anchor_y, row_left, body));
1416 row_start = i;
1417 if i < lines.len() {
1418 row_anchor_y = lines[i].bbox.top();
1419 }
1420 }
1421 }
1422
1423 const ISOLATED_GAP: f32 = 30.0;
1435 while rows.len() >= 2
1438 && is_digits_only(&rows[0].2)
1439 && (rows[0].0 - rows[1].0).abs() > ISOLATED_GAP
1440 {
1441 rows.remove(0);
1442 }
1443 while rows.len() >= 2 {
1444 let last = rows.len() - 1;
1445 if is_digits_only(&rows[last].2)
1446 && (rows[last - 1].0 - rows[last].0).abs() > ISOLATED_GAP
1447 {
1448 rows.remove(last);
1449 } else {
1450 break;
1451 }
1452 }
1453
1454 let mut buckets: std::collections::HashMap<i32, usize> =
1463 std::collections::HashMap::new();
1464 for (_, row_left, _) in &rows {
1465 let key = row_left.round() as i32;
1466 *buckets.entry(key).or_insert(0) += 1;
1467 }
1468 let popular_min = buckets
1469 .iter()
1470 .filter(|(_, count)| **count >= 2)
1471 .map(|(k, _)| *k as f32)
1472 .fold(f32::INFINITY, f32::min);
1473 let page_left = if popular_min.is_finite() {
1474 popular_min
1475 } else {
1476 rows.iter().map(|(_, x, _)| *x).fold(f32::INFINITY, f32::min)
1477 };
1478
1479 let gaps: Vec<f32> =
1493 rows.windows(2).map(|w| (w[0].0 - w[1].0).max(0.0)).collect();
1494 let para_threshold = paragraph_gap_threshold(&gaps);
1495
1496 let mut output =
1497 String::with_capacity(rows.iter().map(|(_, _, s)| s.len() + 8).sum());
1498 for i in 0..rows.len() {
1499 if i > 0 && gaps[i - 1] > para_threshold {
1500 output.push('\n');
1501 }
1502 let (_, row_left, body) = &rows[i];
1503 let indent_chars =
1504 (((row_left - page_left) / PT_PER_CHAR).round()).max(0.0) as usize;
1505 let indent_chars = indent_chars.min(MAX_INDENT_CHARS);
1506 for _ in 0..indent_chars {
1507 output.push(' ');
1508 }
1509 output.push_str(body);
1510 output.push('\n');
1511 }
1512 Some(output)
1513}
1514
/// Returns `true` when `s`, ignoring surrounding whitespace, is non-empty and
/// consists solely of ASCII digits `0`-`9`.
fn is_digits_only(s: &str) -> bool {
  let core = s.trim();
  if core.is_empty() {
    return false;
  }
  // Every byte of a non-ASCII character is >= 0x80 and therefore never an
  // ASCII digit, so a byte-wise check gives the same answer as a char-wise
  // one.
  core.bytes().all(|b| b.is_ascii_digit())
}
1519
/// Derives the vertical gap above which two rows count as separate
/// paragraphs.
///
/// Gaps of at least `MIN_LINE_GAP` are grouped into buckets `BUCKET_WIDTH`
/// wide. The most frequent bucket is taken as the page's usual line spacing,
/// or `DEFAULT_LINE_GAP` when no gap qualifies. The threshold is that spacing
/// times `PARAGRAPH_FACTOR`, clamped to `MIN_THRESHOLD..=MAX_THRESHOLD`.
///
/// When several buckets are equally frequent the smallest gap wins. Relying
/// on `HashMap` iteration order for the tie-break would make the result vary
/// from run to run. The smaller spacing is chosen on the assumption that
/// ordinary line pitch is smaller than paragraph spacing.
fn paragraph_gap_threshold(gaps: &[f32]) -> f32 {
  const MIN_LINE_GAP: f32 = 5.0;
  const BUCKET_WIDTH: f32 = 2.0;
  const DEFAULT_LINE_GAP: f32 = 14.0;
  const PARAGRAPH_FACTOR: f32 = 1.7;
  const MIN_THRESHOLD: f32 = 20.0;
  const MAX_THRESHOLD: f32 = 50.0;

  let mut buckets: std::collections::HashMap<i32, usize> =
    std::collections::HashMap::new();
  // NaN fails the `>=` comparison and is therefore ignored as well.
  for &g in gaps {
    if g >= MIN_LINE_GAP {
      let key = (g / BUCKET_WIDTH).round() as i32;
      *buckets.entry(key).or_insert(0) += 1;
    }
  }

  let mode_gap = buckets
    .iter()
    .max_by(|(gap_a, count_a), (gap_b, count_b)| {
      // Higher count first; on equal counts the smaller bucket ranks higher.
      count_a.cmp(count_b).then_with(|| gap_b.cmp(gap_a))
    })
    .map(|(key, _)| (*key as f32) * BUCKET_WIDTH)
    .unwrap_or(DEFAULT_LINE_GAP);

  (mode_gap * PARAGRAPH_FACTOR).clamp(MIN_THRESHOLD, MAX_THRESHOLD)
}
1544
/// A [`PdfStream`] behind an [`Arc`], i.e. a reference-counted handle that
/// can be cloned cheaply to share one opened stream.
pub type SharedPdfStream = Arc<PdfStream>;
1547
// Unit tests for page extraction and visual composition. Tests that need a
// PDF fixture under `../test-data/pdf/` return early, without asserting
// anything, when that file is absent.
#[cfg(test)]
mod tests {
  use super::*;
  use std::path::Path;

  /// Opens `progit-1-50.pdf` and checks that at least one of the first (up
  /// to five) pages yields non-blank text.
  #[test]
  fn opens_and_extracts_individual_pages() {
    let pdf_path = Path::new(env!("CARGO_MANIFEST_DIR"))
      .join("../test-data/pdf/progit-1-50.pdf");
    if !pdf_path.exists() {
      return;
    }
    let stream = PdfStream::open(pdf_path.to_str().expect("utf-8 path"))
      .expect("PdfStream should open valid test PDF");
    assert!(stream.total_pages() > 0, "test PDF should report pages");

    let scan_upto = stream.total_pages().min(5);
    let mut any_non_empty = false;
    for p in 1..=scan_upto {
      if let Some(text) = stream.extract_page(p)
        && !text.trim().is_empty()
      {
        any_non_empty = true;
        break;
      }
    }
    assert!(
      any_non_empty,
      "at least one of the first {scan_upto} pages should extract non-empty text"
    );
  }

  /// On page 43 of `progit.pdf`: no output line is just `37`, and the
  /// sentence starting "Alternatively, you can type your commit message" is
  /// preceded by the end of the previous paragraph plus at least two
  /// newlines.
  #[test]
  fn progit_paragraph_breaks_and_page_footer() {
    let pdf_path =
      Path::new(env!("CARGO_MANIFEST_DIR")).join("../test-data/pdf/progit.pdf");
    if !pdf_path.exists() {
      return;
    }
    let stream = PdfStream::open(pdf_path.to_str().expect("utf-8 path"))
      .expect("PdfStream should open progit");
    let text =
      stream.extract_page(43).expect("progit page 43 should produce text");

    let lines: Vec<&str> = text.lines().collect();
    assert!(
      !lines.iter().any(|l| l.trim() == "37"),
      "isolated page-number footer '37' should be stripped, got:\n{text}"
    );

    let alt_pos = text
      .find("Alternatively, you can type your commit message")
      .expect("expected sentence on page 43");
    let before = &text[..alt_pos];
    assert!(
      before.trim_end().ends_with("and diff stripped out)."),
      "text immediately before 'Alternatively…' should end the previous \
       paragraph, got:\n…{}…",
      &before[before.len().saturating_sub(80)..]
    );
    // Count the newlines directly in front of the sentence.
    let trailing_newlines =
      before.as_bytes().iter().rev().take_while(|&&b| b == b'\n').count();
    assert!(
      trailing_newlines >= 2,
      "expected at least one blank line before 'Alternatively…' \
       (a paragraph break), got {trailing_newlines} trailing newlines"
    );
  }

  /// On page 5 of `pdfreference1.7old.pdf`: the entries for sections 1.3 and
  /// 1.4 each occupy their own line (compared after collapsing whitespace)
  /// and are not glued together.
  #[test]
  fn toc_section_labels_stay_separate() {
    let pdf_path = Path::new(env!("CARGO_MANIFEST_DIR"))
      .join("../test-data/pdf/pdfreference1.7old.pdf");
    if !pdf_path.exists() {
      return;
    }
    let stream = PdfStream::open(pdf_path.to_str().expect("utf-8 path"))
      .expect("PdfStream should open the reference PDF");
    let text = stream.extract_page(5).expect("page 5 should produce text");
    let lines: Vec<&str> = text.lines().collect();
    // Collapses every whitespace run to a single space.
    let normalize_spaces =
      |s: &str| s.split_whitespace().collect::<Vec<_>>().join(" ");
    assert!(
      lines
        .iter()
        .any(|l| normalize_spaces(l.trim()) == "1.3 Related Publications 31"),
      "section 1.3 should be on its own line, got:\n{text}"
    );
    assert!(
      lines
        .iter()
        .any(|l| normalize_spaces(l.trim()) == "1.4 Intellectual Property 32"),
      "section 1.4 should be on its own line, got:\n{text}"
    );
    assert!(
      !text.contains("1.3 Related Publications1.4"),
      "section labels must not be concatenated, got:\n{text}"
    );
  }

  /// Composes two text rows (tops 200 and 90) with one image row (top 150)
  /// and expects the output order text, ANSI art, text, with matching
  /// `line_kinds` and the art line keeping its escape sequences.
  #[test]
  fn visual_composition_orders_text_and_ansi_art_with_metadata() {
    let text_rows = vec![
      VisualTextRow { top: 90.0, left: 50.0, text: "after image".to_string() },
      VisualTextRow {
        top: 200.0,
        left: 50.0,
        text: "before image".to_string(),
      },
    ];
    let image_rows = vec![VisualImageRows {
      top: 150.0,
      left_cells: 4,
      width_cells: 20,
      region: PdfRegion {
        left: 0.0,
        bottom: 125.0,
        width: 100.0,
        height: 25.0,
      },
      lines: vec!["\x1b[38;2;1;2;3m\x1b[48;2;4;5;6m▀\x1b[0m".into()],
    }];

    let page = compose_visual_page(text_rows, image_rows, 80);

    assert_eq!(
      page.line_kinds,
      vec![PdfLineKind::Text, PdfLineKind::AnsiArt, PdfLineKind::Text,]
    );
    assert_eq!(page.lines[0], "before image");
    assert!(page.lines[1].starts_with(" \x1b[38;2;1;2;3m"));
    assert!(page.lines[1].ends_with("\x1b[0m"));
    assert_eq!(page.lines[2], "after image");
  }

  /// A text row whose top (140) falls inside the image region (bottom 100,
  /// height 50) must not produce a separate text line: both output lines are
  /// ANSI art and one of them contains the label.
  #[test]
  fn visual_text_inside_image_region_overlays_ansi_art() {
    let text_rows = vec![VisualTextRow {
      top: 140.0,
      left: 25.0,
      text: "diagram label".to_string(),
    }];
    let image_rows = vec![VisualImageRows {
      top: 150.0,
      left_cells: 0,
      width_cells: 40,
      region: PdfRegion {
        left: 0.0,
        bottom: 100.0,
        width: 100.0,
        height: 50.0,
      },
      lines: vec![
        format!("\x1b[38;2;1;2;3m{}\x1b[0m", "▀".repeat(40)),
        format!("\x1b[38;2;1;2;3m{}\x1b[0m", "▀".repeat(40)),
      ],
    }];

    let page = compose_visual_page(text_rows, image_rows, 80);

    assert_eq!(
      page.line_kinds,
      vec![PdfLineKind::AnsiArt, PdfLineKind::AnsiArt]
    );
    assert!(
      page.lines.iter().any(|line| line.contains("diagram label")),
      "text should be painted into the ANSI art lines: {:?}",
      page.lines
    );
  }

  /// Runs the bundled OCR engine on a generated "HELLO OCR" image, then
  /// composes the recognized rows with six ANSI-art lines. Every output line
  /// must stay ANSI art and the rendered text must contain "hello" or "ocr"
  /// after normalization.
  #[test]
  #[cfg(feature = "pdf-ocr-bundled")]
  fn ocr_text_rows_overlay_existing_ansi_art() {
    let engine =
      crate::ocr::bundled_ocr_engine().expect("bundled OCR should initialize");
    let image = generated_ocr_fixture("HELLO OCR");
    let text_rows = ocr_dynamic_image_text_rows(
      &engine,
      &image,
      PdfRegion { left: 0.0, bottom: 100.0, width: 300.0, height: 80.0 },
    );
    assert!(
      text_rows.iter().any(|row| {
        let normalized = normalized_visual_text(&row.text);
        normalized.contains("hello") || normalized.contains("ocr")
      }),
      "OCR should produce overlayable text rows, got {:?}",
      text_rows
    );
    let image_rows = vec![VisualImageRows {
      top: 180.0,
      left_cells: 0,
      width_cells: 60,
      region: PdfRegion {
        left: 0.0,
        bottom: 100.0,
        width: 300.0,
        height: 80.0,
      },
      lines: (0..6)
        .map(|_| format!("\x1b[38;2;1;2;3m{}\x1b[0m", "▀".repeat(60)))
        .collect(),
    }];

    let page = compose_visual_page(text_rows, image_rows, 80);
    let rendered = page.lines.join("\n");
    let normalized = normalized_visual_text(&rendered);

    assert!(page.line_kinds.iter().all(|kind| *kind == PdfLineKind::AnsiArt));
    assert!(
      normalized.contains("hello") || normalized.contains("ocr"),
      "OCR text should be overlaid into ANSI art, got {rendered:?}"
    );
  }

  /// A text row whose top (75) lies below the image region (bottom 100) is
  /// emitted as its own text line after the ANSI-art line.
  #[test]
  fn visual_text_outside_image_region_stays_separate() {
    let text_rows = vec![VisualTextRow {
      top: 75.0,
      left: 25.0,
      text: "caption below".to_string(),
    }];
    let image_rows = vec![VisualImageRows {
      top: 150.0,
      left_cells: 0,
      width_cells: 40,
      region: PdfRegion {
        left: 0.0,
        bottom: 100.0,
        width: 100.0,
        height: 50.0,
      },
      lines: vec!["\x1b[38;2;1;2;3m▀▀▀▀▀▀▀▀▀▀\x1b[0m".into()],
    }];

    let page = compose_visual_page(text_rows, image_rows, 80);

    assert_eq!(page.line_kinds, vec![PdfLineKind::AnsiArt, PdfLineKind::Text]);
    assert_eq!(page.lines[1], "caption below");
  }

  /// `text_only_page_lines` must return at least one line and mark every
  /// line as `PdfLineKind::Text`.
  #[test]
  fn text_only_ansi_page_keeps_every_line_text_marked() {
    let page = text_only_page_lines("one two three", 10);

    assert!(!page.lines.is_empty());
    assert_eq!(page.line_kinds, vec![PdfLineKind::Text; page.lines.len()]);
  }

  /// With no image rows, composition returns the given text row unchanged
  /// as a single text line.
  #[test]
  fn visual_page_without_art_uses_native_rows_before_sanitized_fallback() {
    let text_rows = vec![VisualTextRow {
      top: 100.0,
      left: 20.0,
      text: "diagram label".to_string(),
    }];

    let page = compose_visual_page(text_rows, Vec::new(), 80);

    assert_eq!(page.lines, vec!["diagram label"]);
    assert_eq!(page.line_kinds, vec![PdfLineKind::Text]);
  }

  /// For page 2 of `progit-1-50.pdf` (0-based index 1), the first row from
  /// `positioned_sanitized_text_rows` carries the same `top` and `left` as
  /// the first row from `extract_visual_text_rows`.
  #[test]
  fn sanitized_text_rows_keep_pdf_position_anchors() {
    let pdf_path = Path::new(env!("CARGO_MANIFEST_DIR"))
      .join("../test-data/pdf/progit-1-50.pdf");
    if !pdf_path.exists() {
      return;
    }
    let stream = PdfStream::open(pdf_path.to_str().expect("utf-8 path"))
      .expect("PdfStream should open valid test PDF");
    let raw_text = stream.extract_page(2).expect("page should produce text");
    let anchors = extract_visual_text_rows(&stream.doc, 1)
      .expect("page should produce positioned rows");
    let rows = positioned_sanitized_text_rows(&stream.doc, 1, &raw_text, 80);

    assert!(!rows.is_empty());
    assert_eq!(rows[0].top, anchors[0].top);
    assert_eq!(rows[0].left, anchors[0].left);
  }

  /// On 0-based page 22 of `progit-1-50.pdf`: the first extracted image has
  /// a nearby figure caption, and no non-caption text row overlaps the
  /// image's region.
  #[test]
  fn progit_figure_images_do_not_expose_internal_native_labels() {
    let pdf_path = Path::new(env!("CARGO_MANIFEST_DIR"))
      .join("../test-data/pdf/progit-1-50.pdf");
    if !pdf_path.exists() {
      return;
    }
    let stream = PdfStream::open(pdf_path.to_str().expect("utf-8 path"))
      .expect("PdfStream should open valid test PDF");
    let page_0based = 22;
    let rows = positioned_visual_text_rows(&stream.doc, page_0based);
    let images = stream
      .doc
      .extract_images(page_0based)
      .expect("page should extract images");
    let bbox = images[0].bbox().expect("figure image should have a bbox");
    // NOTE(review): `bottom` is filled from `bbox.top()`; presumably `top()`
    // is the lower edge in this coordinate convention — confirm against
    // pdf_oxide.
    let region = PdfRegion {
      left: bbox.left(),
      bottom: bbox.top(),
      width: bbox.width,
      height: bbox.height,
    };

    assert!(has_nearby_figure_caption(region, &rows));
    assert!(
      !rows.iter().any(|row| {
        !is_figure_caption(&row.text)
          && visual_text_row_overlaps_region(row, region)
      }),
      "ProGit figure labels are embedded in the image and require OCR"
    );
  }

  /// With bundled OCR enabled, rendering page 34 of `progit-1-50.pdf` at
  /// width 100 must contain at least one of the labels "Untracked",
  /// "Unmodified", "Modified" or "Staged".
  #[cfg(feature = "pdf-ocr-bundled")]
  #[test]
  fn progit_figure_ocr_overlays_embedded_image_labels() {
    let pdf_path = Path::new(env!("CARGO_MANIFEST_DIR"))
      .join("../test-data/pdf/progit-1-50.pdf");
    if !pdf_path.exists() {
      return;
    }
    let stream =
      PdfStream::open_with_bundled_ocr(pdf_path.to_str().expect("utf-8 path"))
        .expect("PdfStream should open valid test PDF");

    let page = stream
      .extract_page_with_images(34, 100)
      .expect("page should render with image rows");
    let rendered = page.lines.join("\n");

    assert!(
      ["Untracked", "Unmodified", "Modified", "Staged"]
        .iter()
        .any(|label| rendered.contains(label)),
      "OCR should recover at least one embedded figure label, got {rendered:?}"
    );
  }

  /// `filter_visual_text_rows` keeps body text and diagram labels in order
  /// while dropping the digits-only first and last rows ("12", "13") and the
  /// row made of the private-use character U+F05A.
  #[test]
  fn visual_text_rows_preserve_native_diagram_labels() {
    let rows = vec![
      VisualTextRow { top: 800.0, left: 300.0, text: "12".to_string() },
      VisualTextRow {
        top: 700.0,
        left: 72.0,
        text: "Body text before figure.".to_string(),
      },
      VisualTextRow { top: 660.0, left: 250.0, text: "Acrobat".to_string() },
      VisualTextRow {
        top: 645.0,
        left: 90.0,
        text: "Macintosh application Windows application".to_string(),
      },
      VisualTextRow { top: 630.0, left: 275.0, text: "Adobe PDF".to_string() },
      VisualTextRow { top: 615.0, left: 320.0, text: "printer".to_string() },
      VisualTextRow { top: 600.0, left: 72.0, text: "\u{f05a}".to_string() },
      VisualTextRow {
        top: 560.0,
        left: 72.0,
        text: "Body text after figure.".to_string(),
      },
      VisualTextRow { top: 40.0, left: 300.0, text: "13".to_string() },
    ];

    let filtered = filter_visual_text_rows(rows);
    let texts: Vec<&str> =
      filtered.iter().map(|row| row.text.as_str()).collect();

    assert_eq!(
      texts,
      vec![
        "Body text before figure.",
        "Acrobat",
        "Macintosh application Windows application",
        "Adobe PDF",
        "printer",
        "Body text after figure.",
      ]
    );
    assert!(filtered.iter().all(|row| row.text.trim() != "\u{f05a}"));
  }

  /// Three box paths plus a "Figure 1." caption row yield exactly one
  /// detected region that encloses the boxes.
  // NOTE(review): the numeric arguments to `detect_vector_diagram_regions`
  // look like a page-box origin and size and the final flag like an OCR
  // switch (judging by the sibling test names) — confirm at the definition.
  #[test]
  fn detects_vector_diagram_region_from_box_primitives() {
    let paths = vec![
      pdf_oxide::elements::PathContent::new(pdf_oxide::geometry::Rect::new(
        100.0, 200.0, 80.0, 40.0,
      )),
      pdf_oxide::elements::PathContent::new(pdf_oxide::geometry::Rect::new(
        220.0, 200.0, 80.0, 40.0,
      )),
      pdf_oxide::elements::PathContent::new(pdf_oxide::geometry::Rect::new(
        160.0, 280.0, 80.0, 40.0,
      )),
    ];

    let text_rows = vec![VisualTextRow {
      top: 180.0,
      left: 100.0,
      text: "Figure 1. Test diagram".to_string(),
    }];
    let regions = detect_vector_diagram_regions(
      &paths, 0.0, 0.0, 612.0, 792.0, &text_rows, true,
    );

    assert_eq!(regions.len(), 1);
    assert!(regions[0].left <= 100.0);
    assert!(regions[0].bottom <= 200.0);
    assert!(regions[0].width >= 200.0);
    assert!(regions[0].height >= 120.0);
  }

  /// A single 612x1 path and no text rows produce no diagram region.
  #[test]
  fn ignores_single_full_width_vector_rule() {
    let paths = vec![pdf_oxide::elements::PathContent::new(
      pdf_oxide::geometry::Rect::new(0.0, 700.0, 612.0, 1.0),
    )];

    assert!(
      detect_vector_diagram_regions(&paths, 0.0, 0.0, 612.0, 792.0, &[], true)
        .is_empty()
    );
  }

  /// The three box paths alone, with no text rows at all, produce no diagram
  /// region.
  #[test]
  fn ignores_vector_regions_without_nearby_figure_caption() {
    let paths = vec![
      pdf_oxide::elements::PathContent::new(pdf_oxide::geometry::Rect::new(
        100.0, 200.0, 80.0, 40.0,
      )),
      pdf_oxide::elements::PathContent::new(pdf_oxide::geometry::Rect::new(
        220.0, 200.0, 80.0, 40.0,
      )),
      pdf_oxide::elements::PathContent::new(pdf_oxide::geometry::Rect::new(
        160.0, 280.0, 80.0, 40.0,
      )),
    ];

    assert!(
      detect_vector_diagram_regions(&paths, 0.0, 0.0, 612.0, 792.0, &[], true)
        .is_empty()
    );
  }

  /// Same boxes and caption as the detection test, but with the final flag
  /// set to `false`: no region is reported.
  #[test]
  fn ignores_unlabeled_vector_regions_without_ocr() {
    let paths = vec![
      pdf_oxide::elements::PathContent::new(pdf_oxide::geometry::Rect::new(
        100.0, 200.0, 80.0, 40.0,
      )),
      pdf_oxide::elements::PathContent::new(pdf_oxide::geometry::Rect::new(
        220.0, 200.0, 80.0, 40.0,
      )),
      pdf_oxide::elements::PathContent::new(pdf_oxide::geometry::Rect::new(
        160.0, 280.0, 80.0, 40.0,
      )),
    ];
    let text_rows = vec![VisualTextRow {
      top: 180.0,
      left: 100.0,
      text: "Figure 1. Test diagram".to_string(),
    }];

    assert!(
      detect_vector_diagram_regions(
        &paths, 0.0, 0.0, 612.0, 792.0, &text_rows, false,
      )
      .is_empty()
    );
  }

  /// With the final flag `false` but an extra "Native label" text row
  /// positioned among the boxes, one region is still reported.
  #[test]
  fn keeps_vector_regions_with_native_overlay_text_without_ocr() {
    let paths = vec![
      pdf_oxide::elements::PathContent::new(pdf_oxide::geometry::Rect::new(
        100.0, 200.0, 80.0, 40.0,
      )),
      pdf_oxide::elements::PathContent::new(pdf_oxide::geometry::Rect::new(
        220.0, 200.0, 80.0, 40.0,
      )),
      pdf_oxide::elements::PathContent::new(pdf_oxide::geometry::Rect::new(
        160.0, 280.0, 80.0, 40.0,
      )),
    ];
    let text_rows = vec![
      VisualTextRow {
        top: 180.0,
        left: 100.0,
        text: "Figure 1. Test diagram".to_string(),
      },
      VisualTextRow {
        top: 220.0,
        left: 120.0,
        text: "Native label".to_string(),
      },
    ];

    let regions = detect_vector_diagram_regions(
      &paths, 0.0, 0.0, 612.0, 792.0, &text_rows, false,
    );

    assert_eq!(regions.len(), 1);
  }

  /// With the box arguments `100, 200, 500, 500`, the detected region's
  /// `left` stays within 100..=110 and its `bottom` within 200..=210.
  #[test]
  fn vector_diagram_region_clamps_to_media_box_origin() {
    let paths = vec![
      pdf_oxide::elements::PathContent::new(pdf_oxide::geometry::Rect::new(
        110.0, 210.0, 80.0, 40.0,
      )),
      pdf_oxide::elements::PathContent::new(pdf_oxide::geometry::Rect::new(
        230.0, 210.0, 80.0, 40.0,
      )),
      pdf_oxide::elements::PathContent::new(pdf_oxide::geometry::Rect::new(
        170.0, 290.0, 80.0, 40.0,
      )),
    ];

    let text_rows = vec![VisualTextRow {
      top: 206.0,
      left: 110.0,
      text: "Figure 1. Test diagram".to_string(),
    }];
    let regions = detect_vector_diagram_regions(
      &paths, 100.0, 200.0, 500.0, 500.0, &text_rows, true,
    );

    assert_eq!(regions.len(), 1);
    assert!(regions[0].left >= 100.0);
    assert!(regions[0].bottom >= 200.0);
    assert!(regions[0].left <= 110.0);
    assert!(regions[0].bottom <= 210.0);
  }

  /// Same scenario with negative coordinates (box arguments `-300, -200,
  /// 500, 500`): one region is found, it does not extend past -300 / -200,
  /// and it is at least 200 wide and 120 high.
  #[test]
  fn vector_diagram_region_handles_negative_media_box_origin() {
    let paths = vec![
      pdf_oxide::elements::PathContent::new(pdf_oxide::geometry::Rect::new(
        -290.0, -190.0, 80.0, 40.0,
      )),
      pdf_oxide::elements::PathContent::new(pdf_oxide::geometry::Rect::new(
        -170.0, -190.0, 80.0, 40.0,
      )),
      pdf_oxide::elements::PathContent::new(pdf_oxide::geometry::Rect::new(
        -230.0, -110.0, 80.0, 40.0,
      )),
    ];

    let text_rows = vec![VisualTextRow {
      top: -194.0,
      left: -290.0,
      text: "Figure 1. Test diagram".to_string(),
    }];
    let regions = detect_vector_diagram_regions(
      &paths, -300.0, -200.0, 500.0, 500.0, &text_rows, true,
    );

    assert_eq!(regions.len(), 1);
    assert!(regions[0].left >= -300.0);
    assert!(regions[0].bottom >= -200.0);
    assert!(regions[0].width >= 200.0);
    assert!(regions[0].height >= 120.0);
  }

  /// Pins three concrete results of `pdf_x_to_cells` and
  /// `pdf_width_to_cells` for a box starting at x = 100 with width 500
  /// mapped onto 80 cells.
  #[test]
  fn pdf_cell_mapping_accounts_for_media_box_origin() {
    assert_eq!(pdf_x_to_cells(100.0, 100.0, 500.0, 80), 0);
    assert_eq!(pdf_x_to_cells(350.0, 100.0, 500.0, 80), 40);
    assert_eq!(pdf_width_to_cells(125.0, 500.0, 80), 20);
  }

  /// Pins three concrete results of `pdf_image_height_rows`, including the
  /// zero first-argument case, which yields 1.
  #[test]
  fn pdf_image_height_uses_display_bbox_aspect_ratio() {
    assert_eq!(pdf_image_height_rows(100.0, 50.0, 20), 10);
    assert_eq!(pdf_image_height_rows(100.0, 200.0, 20), 40);
    assert_eq!(pdf_image_height_rows(0.0, 200.0, 20), 1);
  }

  /// With no native text rows at all, `should_ocr_image_region` says yes.
  #[cfg(feature = "pdf-ocr-bundled")]
  #[test]
  fn ocrs_images_when_page_has_no_native_text() {
    let region =
      PdfRegion { left: 0.0, bottom: 0.0, width: 100.0, height: 100.0 };

    assert!(should_ocr_image_region(region, &[]));
  }

  /// A "Figure 8." caption row just below the region (top 285 vs. bottom
  /// 300), with no text row inside the region, makes
  /// `should_ocr_image_region` say yes.
  #[cfg(feature = "pdf-ocr-bundled")]
  #[test]
  fn ocrs_captioned_images_without_native_text() {
    let region =
      PdfRegion { left: 48.0, bottom: 300.0, width: 500.0, height: 200.0 };
    let native_rows = vec![
      VisualTextRow {
        top: 285.0,
        left: 48.0,
        text: "Figure 8. The lifecycle of the status of your files".to_string(),
      },
      VisualTextRow {
        top: 250.0,
        left: 48.0,
        text: "Checking the Status of Your Files".to_string(),
      },
    ];

    assert!(should_ocr_image_region(region, &native_rows));
  }

  /// When the only native row is plain body text below the region,
  /// `should_ocr_image_region` says no.
  #[cfg(feature = "pdf-ocr-bundled")]
  #[test]
  fn skips_uncaptioned_images_on_native_text_pages() {
    let region =
      PdfRegion { left: 48.0, bottom: 300.0, width: 500.0, height: 200.0 };
    let native_rows = vec![VisualTextRow {
      top: 250.0,
      left: 48.0,
      text: "Body text below an unrelated decorative image".to_string(),
    }];

    assert!(!should_ocr_image_region(region, &native_rows));
  }

  /// A native text row inside the region (top 400) makes
  /// `should_ocr_image_region` say no even though a "Figure 1." caption row
  /// is present.
  #[cfg(feature = "pdf-ocr-bundled")]
  #[test]
  fn skips_ocr_when_native_text_already_covers_region() {
    let region =
      PdfRegion { left: 48.0, bottom: 300.0, width: 500.0, height: 200.0 };
    let native_rows = vec![
      VisualTextRow {
        top: 400.0,
        left: 100.0,
        text: "Native label".to_string(),
      },
      VisualTextRow {
        top: 285.0,
        left: 48.0,
        text: "Figure 1. Native diagram".to_string(),
      },
    ];

    assert!(!should_ocr_image_region(region, &native_rows));
  }

  /// Builds a white RGBA image with `text` drawn in black 5x7 block glyphs,
  /// each glyph pixel scaled to 12x12, with 24 pixels of padding. Spaces and
  /// characters without a pattern leave their cell blank.
  #[cfg(feature = "pdf-ocr-bundled")]
  fn generated_ocr_fixture(text: &str) -> image::DynamicImage {
    let scale = 12u32;
    let glyph_width = 5u32;
    let glyph_height = 7u32;
    let spacing = 2u32;
    let padding = 24u32;
    let width = padding * 2
      + text.chars().count() as u32 * (glyph_width + spacing) * scale;
    let height = padding * 2 + glyph_height * scale;
    let mut image = image::RgbaImage::from_pixel(
      width,
      height,
      image::Rgba([255, 255, 255, 255]),
    );

    // `x` is the left edge of the current glyph cell.
    let mut x = padding;
    for ch in text.chars() {
      if ch == ' ' {
        x += (glyph_width + spacing) * scale;
        continue;
      }
      draw_glyph(&mut image, x, padding, scale, ch);
      x += (glyph_width + spacing) * scale;
    }

    image::DynamicImage::ImageRgba8(image)
  }

  /// Paints the pattern for `ch` in black with its top-left corner at
  /// (`x`, `y`), drawing each `'1'` cell as a `scale` x `scale` square. Does
  /// nothing when `glyph_pattern` has no entry for `ch`.
  #[cfg(feature = "pdf-ocr-bundled")]
  fn draw_glyph(
    image: &mut image::RgbaImage,
    x: u32,
    y: u32,
    scale: u32,
    ch: char,
  ) {
    let Some(pattern) = glyph_pattern(ch) else {
      return;
    };
    for (row, bits) in pattern.iter().enumerate() {
      for (col, bit) in bits.chars().enumerate() {
        if bit != '1' {
          continue;
        }
        for dy in 0..scale {
          for dx in 0..scale {
            image.put_pixel(
              x + col as u32 * scale + dx,
              y + row as u32 * scale + dy,
              image::Rgba([0, 0, 0, 255]),
            );
          }
        }
      }
    }
  }

  /// Returns the 5x7 bitmap for `ch` as seven row strings of `'0'`/`'1'`.
  /// Only the letters C, E, H, L, O and R are defined; anything else yields
  /// `None`.
  #[cfg(feature = "pdf-ocr-bundled")]
  fn glyph_pattern(ch: char) -> Option<[&'static str; 7]> {
    match ch {
      'C' => {
        Some(["01111", "10000", "10000", "10000", "10000", "10000", "01111"])
      }
      'E' => {
        Some(["11111", "10000", "10000", "11110", "10000", "10000", "11111"])
      }
      'H' => {
        Some(["10001", "10001", "10001", "11111", "10001", "10001", "10001"])
      }
      'L' => {
        Some(["10000", "10000", "10000", "10000", "10000", "10000", "11111"])
      }
      'O' => {
        Some(["01110", "10001", "10001", "10001", "10001", "10001", "01110"])
      }
      'R' => {
        Some(["11110", "10001", "10001", "11110", "10100", "10010", "10001"])
      }
      _ => None,
    }
  }
}