cli_pdf_to_text/
stream.rs

1use std::path::PathBuf;
2use std::sync::Arc;
3
4use cli_image_to_ascii::{RenderConfig, render_half_block};
5use hygg_shared::normalize_file_path;
6
7use crate::sanitize::sanitize_layout_text;
8
/// On-demand page extractor backed by a single parsed `pdf_oxide::PdfDocument`.
///
/// pdf_oxide parses the file lazily — `open` does the xref + catalog and
/// returns in tens of milliseconds even on the 31 MB / 1310-page PDF
/// reference, where `lopdf::Document::load` (the old backend) took ~40 s
/// because it eagerly decompressed every content stream. Per-page
/// extraction is sub-millisecond warm, hundreds of micros cold.
///
/// `pdf_oxide::PdfDocument` is `Send + Sync` (its interior-mutable caches
/// are `Mutex`-guarded), so a `PdfStream` can be wrapped in `Arc` and
/// shared between the main thread (rendering the first visible page) and
/// the background loader thread (extracting the rest of the document) the
/// same way the lopdf-backed version was.
pub struct PdfStream {
  /// Path produced by `normalize_file_path` for the PDF that was opened.
  canonical_path: PathBuf,
  /// Parsed document handle every extraction call goes through.
  doc: pdf_oxide::PdfDocument,
  /// Page count read once at open time; upper bound for 1-based indices.
  total_pages: usize,
  /// Bundled OCR engine; `Some` only when the stream was opened with OCR.
  #[cfg(feature = "pdf-ocr-bundled")]
  ocr_engine: Option<pdf_oxide::ocr::OcrEngine>,
}
29
/// Classifies one output line of a rendered page.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum PdfLineKind {
  /// Line that came from page text (indented and wrapped as needed).
  Text,
  /// Line produced by the image renderer; it carries ANSI escape sequences.
  AnsiArt,
}
35
/// Result of rendering one page for terminal display.
#[derive(Clone, Debug)]
pub struct PdfRenderedPage {
  /// Sanitized page text from `extract_page`; empty when extraction
  /// yielded nothing.
  pub raw_text: String,
  /// Display lines in output order.
  pub lines: Vec<String>,
  /// Kind of each entry in `lines`; the two vectors are filled in lockstep.
  pub line_kinds: Vec<PdfLineKind>,
  /// `true` when at least one image or vector region was rendered.
  pub contains_images: bool,
}
43
44impl PdfStream {
45  /// Open a PDF and parse its catalog. Does not extract any page text.
46  pub fn open(pdf_path: &str) -> Result<Self, Box<dyn std::error::Error>> {
47    Self::open_with_optional_ocr(pdf_path, false)
48  }
49
50  pub fn open_with_bundled_ocr(
51    pdf_path: &str,
52  ) -> Result<Self, Box<dyn std::error::Error>> {
53    Self::open_with_optional_ocr(pdf_path, true)
54  }
55
56  fn open_with_optional_ocr(
57    pdf_path: &str,
58    enable_ocr: bool,
59  ) -> Result<Self, Box<dyn std::error::Error>> {
60    let canonical_path = normalize_file_path(pdf_path)?;
61    let doc = pdf_oxide::PdfDocument::open(&canonical_path)
62      .map_err(|e| format!("pdf_oxide open failed: {e:?}"))?;
63    let total_pages = doc
64      .page_count()
65      .map_err(|e| format!("pdf_oxide page_count failed: {e:?}"))?;
66    #[cfg(feature = "pdf-ocr-bundled")]
67    let ocr_engine =
68      if enable_ocr { Some(crate::ocr::bundled_ocr_engine()?) } else { None };
69    #[cfg(not(feature = "pdf-ocr-bundled"))]
70    if enable_ocr {
71      return Err(
72        "OCR support is not available in this build. Rebuild with `--features pdf-ocr-bundled` to use the bundled English OCR engine."
73          .into(),
74      );
75    }
76    Ok(Self {
77      canonical_path,
78      doc,
79      total_pages,
80      #[cfg(feature = "pdf-ocr-bundled")]
81      ocr_engine,
82    })
83  }
84
85  pub fn total_pages(&self) -> usize {
86    self.total_pages
87  }
88
89  pub fn canonical_path(&self) -> &std::path::Path {
90    &self.canonical_path
91  }
92
93  /// Extract sanitized text for a single page.
94  ///
95  /// `page_index` is 1-based to match the historical lopdf-backed API
96  /// (the rest of hygg counts pages from 1 in saved progress, status
97  /// line, etc.). Returns `None` if the index is out of range, the page
98  /// has no extractable text, or extraction panicked. pdf_oxide claims a
99  /// 100 % pass rate on its 3 830-PDF corpus, but we still wrap in
100  /// `catch_unwind` so a misbehaving page can't take down the background
101  /// loader thread and leave every later page stuck on "loading".
102  ///
103  /// Uses pdf_oxide's positional `extract_text_lines` rather than the
104  /// simpler `extract_text`. The former returns each visual line with
105  /// its bounding box; we group lines that share a row (overlapping y
106  /// ranges) and join them left-to-right. Without that step pdf_oxide
107  /// can interleave adjacent TOC entries — "1.3 Foo1.4 Bar 3231" — and
108  /// the downstream sanitizer can't recover them.
109  pub fn extract_page(&self, page_index: usize) -> Option<String> {
110    if page_index == 0 || page_index > self.total_pages {
111      return None;
112    }
113    let doc = &self.doc;
114    let page_0based = page_index - 1;
115    let raw = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
116      extract_page_text_lines(doc, page_0based)
117    }))
118    .ok()
119    .flatten()?;
120    if raw.trim().is_empty() {
121      return None;
122    }
123    Some(sanitize_layout_text(&raw))
124  }
125
126  pub fn extract_page_with_images(
127    &self,
128    page_index: usize,
129    col: usize,
130  ) -> Option<PdfRenderedPage> {
131    if page_index == 0 || page_index > self.total_pages {
132      return None;
133    }
134
135    let raw_text = self.extract_page(page_index).unwrap_or_default();
136    let page_0based = page_index - 1;
137    let images = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
138      self.doc.extract_images(page_0based)
139    }))
140    .ok()
141    .and_then(Result::ok)
142    .unwrap_or_default();
143
144    let text_rows = positioned_visual_text_rows(&self.doc, page_0based);
145    #[cfg(feature = "pdf-ocr-bundled")]
146    let allow_unlabeled_vector_regions = self.ocr_engine.is_some();
147    #[cfg(not(feature = "pdf-ocr-bundled"))]
148    let allow_unlabeled_vector_regions = false;
149
150    let mut image_rows =
151      render_pdf_images(&self.doc, page_0based, col, images.as_slice());
152    image_rows.extend(render_vector_diagram_regions(
153      &self.doc,
154      page_0based,
155      col,
156      &text_rows,
157      allow_unlabeled_vector_regions,
158    ));
159
160    #[cfg(feature = "pdf-ocr-bundled")]
161    let text_rows = {
162      let mut text_rows = text_rows;
163      if let Some(engine) = self.ocr_engine.as_ref() {
164        let ocr_rows = ocr_visual_text_rows(
165          &self.doc,
166          page_0based,
167          images.as_slice(),
168          engine,
169          &text_rows,
170        );
171        let native_rows = text_rows.clone();
172        text_rows.extend(
173          ocr_rows
174            .into_iter()
175            .filter(|row| !has_near_duplicate_visual_text(&native_rows, row)),
176        );
177      }
178      text_rows
179    };
180    if image_rows.is_empty() {
181      let PdfPageForAnsi { lines, line_kinds } = if text_rows.is_empty() {
182        text_only_page_lines(&raw_text, col)
183      } else {
184        compose_visual_page(text_rows, Vec::new(), col)
185      };
186      return Some(PdfRenderedPage {
187        raw_text,
188        lines,
189        line_kinds,
190        contains_images: false,
191      });
192    }
193
194    let PdfPageForAnsi { lines, line_kinds } =
195      compose_visual_page(text_rows, image_rows, col);
196    Some(PdfRenderedPage { raw_text, lines, line_kinds, contains_images: true })
197  }
198}
199
/// Internal carrier for a composed page: display lines plus their kinds.
struct PdfPageForAnsi {
  /// Display lines in output order.
  lines: Vec<String>,
  /// Kind of each entry in `lines`, pushed in lockstep with it.
  line_kinds: Vec<PdfLineKind>,
}
204
/// One row of text anchored at a position on the page.
///
/// `top` and `left` are compared directly against `PdfRegion` edges, so
/// they live in the same coordinate space (presumably PDF points — TODO
/// confirm). Rows with a larger `top` are emitted earlier when a page is
/// composed.
#[derive(Clone, Debug)]
struct VisualTextRow {
  /// Vertical anchor of the row.
  top: f32,
  /// Horizontal anchor (left edge) of the row.
  left: f32,
  /// Text content of the row.
  text: String,
}
211
/// An image (or rendered vector region) already converted to ANSI lines.
struct VisualImageRows {
  /// Upper edge of `region`; used to order the image among text rows.
  top: f32,
  /// Number of blank cells to indent each line by.
  left_cells: usize,
  /// Width the image was rendered at, in terminal cells.
  width_cells: usize,
  /// Page-space rectangle the image occupies.
  region: PdfRegion,
  /// Rendered lines, top to bottom; text rows may be overlaid onto them.
  lines: Vec<String>,
}
219
/// Axis-aligned rectangle described by its lower-left corner and size.
///
/// `bottom` is the smaller y value and the upper edge is derived from it,
/// i.e. y grows upwards (presumably PDF user-space points — TODO confirm).
#[derive(Clone, Copy, Debug)]
struct PdfRegion {
  /// x coordinate of the left edge.
  left: f32,
  /// y coordinate of the lower edge.
  bottom: f32,
  /// Horizontal extent.
  width: f32,
  /// Vertical extent.
  height: f32,
}

impl PdfRegion {
  /// y coordinate of the upper edge (`bottom + height`).
  fn top(&self) -> f32 {
    let Self { bottom, height, .. } = *self;
    bottom + height
  }
}
233
234fn text_only_page_lines(raw_text: &str, col: usize) -> PdfPageForAnsi {
235  let lines = cli_justify::justify_pdf_page(raw_text, col).lines;
236  let line_kinds = vec![PdfLineKind::Text; lines.len()];
237  PdfPageForAnsi { lines, line_kinds }
238}
239
240fn render_pdf_images(
241  doc: &pdf_oxide::PdfDocument,
242  page_0based: usize,
243  col: usize,
244  images: &[pdf_oxide::extractors::PdfImage],
245) -> Vec<VisualImageRows> {
246  if col == 0 {
247    return Vec::new();
248  }
249  let (page_left, page_width) = doc
250    .get_page_media_box(page_0based)
251    .ok()
252    .map(|(llx, _, urx, _)| (llx, (urx - llx).abs()))
253    .filter(|(_, w)| *w > 0.0)
254    .unwrap_or((0.0, 612.0));
255
256  let mut out = Vec::new();
257  for image in images {
258    let Some(bbox) = image.bbox() else {
259      continue;
260    };
261    if bbox.width <= 0.0 || bbox.height <= 0.0 {
262      continue;
263    }
264    let Ok(dynamic_image) = image.to_dynamic_image() else {
265      continue;
266    };
267    if let Some(rows) = render_dynamic_image_region(
268      &dynamic_image,
269      PdfRegion {
270        left: bbox.left(),
271        bottom: bbox.top(),
272        width: bbox.width,
273        height: bbox.height,
274      },
275      page_left,
276      page_width,
277      col,
278    ) {
279      out.push(rows);
280    }
281  }
282  out
283}
284
285fn render_dynamic_image_region(
286  dynamic_image: &image::DynamicImage,
287  region: PdfRegion,
288  page_left: f32,
289  page_width: f32,
290  col: usize,
291) -> Option<VisualImageRows> {
292  let left_cells = pdf_x_to_cells(region.left, page_left, page_width, col);
293  let left_cells = left_cells.min(col.saturating_sub(1));
294  let width_cells = pdf_width_to_cells(region.width, page_width, col);
295  let width_cells = width_cells.max(1).min(col.saturating_sub(left_cells));
296  if width_cells == 0 {
297    return None;
298  }
299  let height_rows =
300    pdf_image_height_rows(region.width, region.height, width_cells);
301  let lines = render_half_block(
302    dynamic_image,
303    RenderConfig::new(Some(width_cells as u32), Some(height_rows as u32)),
304  );
305  if lines.is_empty() {
306    return None;
307  }
308  Some(VisualImageRows {
309    top: region.top(),
310    left_cells,
311    width_cells,
312    region,
313    lines,
314  })
315}
316
/// Run OCR over the page's images and vector-diagram regions.
///
/// Embedded images are processed first, then regions rendered by
/// `render_vector_diagram_images`. A region is only sent to the OCR
/// engine when `should_ocr_image_region` approves it given `native_rows`.
#[cfg(feature = "pdf-ocr-bundled")]
fn ocr_visual_text_rows(
  doc: &pdf_oxide::PdfDocument,
  page_0based: usize,
  images: &[pdf_oxide::extractors::PdfImage],
  engine: &pdf_oxide::ocr::OcrEngine,
  native_rows: &[VisualTextRow],
) -> Vec<VisualTextRow> {
  // Embedded raster images.
  let mut out: Vec<VisualTextRow> = images
    .iter()
    .filter_map(|image| {
      let bbox = image.bbox()?;
      if bbox.width <= 0.0 || bbox.height <= 0.0 {
        return None;
      }
      let region = PdfRegion {
        left: bbox.left(),
        bottom: bbox.top(),
        width: bbox.width,
        height: bbox.height,
      };
      if !should_ocr_image_region(region, native_rows) {
        return None;
      }
      let dynamic_image = image.to_dynamic_image().ok()?;
      Some(ocr_dynamic_image_text_rows(engine, &dynamic_image, region))
    })
    .flatten()
    .collect();

  // Vector diagrams rasterised on demand.
  let vector_images =
    render_vector_diagram_images(doc, page_0based, native_rows);
  for (region, dynamic_image) in vector_images {
    if should_ocr_image_region(region, native_rows) {
      out.extend(ocr_dynamic_image_text_rows(engine, &dynamic_image, region));
    }
  }

  out
}
359
/// Decide whether a region is worth sending to the OCR engine.
///
/// OCR is skipped when native text already covers the region. Otherwise
/// it runs when the page has no native text at all, or when a figure
/// caption sits near the region.
#[cfg(feature = "pdf-ocr-bundled")]
fn should_ocr_image_region(
  region: PdfRegion,
  native_rows: &[VisualTextRow],
) -> bool {
  let already_covered =
    native_text_is_sufficient_in_region(native_rows, region);
  !already_covered
    && (native_rows.is_empty()
      || has_nearby_figure_caption(region, native_rows))
}
373
/// `true` when the native rows overlapping `region` hold enough text.
///
/// "Enough" means the normalized text of those rows (lower-cased
/// alphanumerics only) totals at least 8 bytes.
#[cfg(feature = "pdf-ocr-bundled")]
fn native_text_is_sufficient_in_region(
  native_rows: &[VisualTextRow],
  region: PdfRegion,
) -> bool {
  // Normalization drops separators, so summing per-row lengths equals
  // the length of the normalized, space-joined text.
  let normalized_len: usize = native_rows
    .iter()
    .filter(|row| visual_text_row_overlaps_region(row, region))
    .map(|row| normalized_visual_text(&row.text).len())
    .sum();
  normalized_len >= 8
}
387
/// `true` when a text row touches `region`, with 6 units of slack on
/// every side.
///
/// The row's right edge is estimated as 5 units per character because
/// rows carry no measured width.
#[cfg(any(feature = "pdf-rendering", feature = "pdf-ocr-bundled", test))]
fn visual_text_row_overlaps_region(
  row: &VisualTextRow,
  region: PdfRegion,
) -> bool {
  const SLACK: f32 = 6.0;
  const ESTIMATED_CHAR_WIDTH: f32 = 5.0;

  let region_right = region.left + region.width;
  let row_right =
    row.left + row.text.chars().count() as f32 * ESTIMATED_CHAR_WIDTH;

  let within_vertical =
    row.top <= region.top() + SLACK && row.top >= region.bottom - SLACK;
  let within_horizontal =
    row.left <= region_right + SLACK && row_right >= region.left - SLACK;
  within_vertical && within_horizontal
}
400
/// OCR one image and map each recognised span back into page space.
///
/// Returns an empty vector when the engine fails. Spans whose text is
/// blank after normalization, or whose polygon cannot be anchored, are
/// dropped.
#[cfg(feature = "pdf-ocr-bundled")]
fn ocr_dynamic_image_text_rows(
  engine: &pdf_oxide::ocr::OcrEngine,
  image: &image::DynamicImage,
  pdf_region: PdfRegion,
) -> Vec<VisualTextRow> {
  let output = match engine.ocr_image(image) {
    Ok(output) => output,
    Err(_) => return Vec::new(),
  };
  // Guard against zero-sized images before dividing by the dimensions.
  let image_width = image.width().max(1) as f32;
  let image_height = image.height().max(1) as f32;

  let mut rows = Vec::new();
  for span in output.spans {
    let text = normalize_visual_text_row(span.text.trim());
    if text.trim().is_empty() {
      continue;
    }
    let Some((left, top)) = ocr_polygon_pdf_anchor(
      &span.polygon,
      pdf_region,
      image_width,
      image_height,
    ) else {
      continue;
    };
    rows.push(VisualTextRow { top, left, text });
  }
  rows
}
431
/// Map the top-left corner of an OCR polygon into page coordinates.
///
/// `polygon` holds four `[x, y]` image-pixel points with y growing
/// downwards, so the smallest y is the polygon's top. Returns
/// `(left, top)` in the space of `pdf_region`, or `None` when any
/// coordinate is not finite.
#[cfg(feature = "pdf-ocr-bundled")]
fn ocr_polygon_pdf_anchor(
  polygon: &[[f32; 2]; 4],
  pdf_region: PdfRegion,
  image_width: f32,
  image_height: f32,
) -> Option<(f32, f32)> {
  if polygon.iter().flatten().any(|value| !value.is_finite()) {
    return None;
  }
  // All eight coordinates are finite here, so both minima are finite too.
  let min_x = polygon.iter().map(|point| point[0]).fold(f32::INFINITY, f32::min);
  let min_y = polygon.iter().map(|point| point[1]).fold(f32::INFINITY, f32::min);

  let left = pdf_region.left + (min_x / image_width) * pdf_region.width;
  // Image y grows downwards while region y grows upwards: flip the axis.
  let top = pdf_region.top() - (min_y / image_height) * pdf_region.height;
  Some((left, top))
}
455
/// Rasterise the page's vector-diagram regions for OCR.
///
/// Paths are extracted best effort (a panic or error yields no paths).
/// Regions are detected with missing native text allowed. Each region
/// that passes `should_ocr_image_region` is rendered at 120 dpi and
/// decoded; regions that fail either step are skipped.
///
/// NOTE(review): this calls `detect_vector_diagram_regions`, which is
/// compiled under `pdf-rendering`/`test`; presumably `pdf-ocr-bundled`
/// implies `pdf-rendering` in the manifest — confirm.
#[cfg(feature = "pdf-ocr-bundled")]
fn render_vector_diagram_images(
  doc: &pdf_oxide::PdfDocument,
  page_0based: usize,
  native_rows: &[VisualTextRow],
) -> Vec<(PdfRegion, image::DynamicImage)> {
  let (page_left, page_top, page_width, page_height) =
    page_metrics(doc, page_0based);

  let extraction =
    std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
      doc.extract_paths(page_0based)
    }));
  let paths = match extraction {
    Ok(Ok(paths)) => paths,
    _ => Default::default(),
  };

  let regions = detect_vector_diagram_regions(
    &paths,
    page_left,
    page_top,
    page_width,
    page_height,
    native_rows,
    true,
  );
  let options = pdf_oxide::rendering::RenderOptions::with_dpi(120);

  let mut rendered_regions = Vec::new();
  for region in regions {
    // Check before rendering so rejected regions cost nothing.
    if !should_ocr_image_region(region, native_rows) {
      continue;
    }
    let Ok(rendered) = pdf_oxide::rendering::render_page_region(
      doc,
      page_0based,
      (region.left, region.bottom, region.width, region.height),
      &options,
    ) else {
      continue;
    };
    let Ok(dynamic_image) = image::load_from_memory(&rendered.data) else {
      continue;
    };
    rendered_regions.push((region, dynamic_image));
  }
  rendered_regions
}
497
/// Render the page's vector-diagram regions as ANSI art rows.
///
/// Paths are extracted best effort (a panic or error yields no paths).
/// Detected regions are rasterised at 120 dpi and converted with
/// `render_dynamic_image_region`; regions that fail to render or decode
/// are skipped. Returns an empty vector when `col` is zero.
#[cfg(feature = "pdf-rendering")]
fn render_vector_diagram_regions(
  doc: &pdf_oxide::PdfDocument,
  page_0based: usize,
  col: usize,
  native_rows: &[VisualTextRow],
  allow_missing_native_text: bool,
) -> Vec<VisualImageRows> {
  if col == 0 {
    return Vec::new();
  }

  let (page_left, page_top, page_width, page_height) =
    page_metrics(doc, page_0based);

  let extraction =
    std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
      doc.extract_paths(page_0based)
    }));
  let paths = match extraction {
    Ok(Ok(paths)) => paths,
    _ => Default::default(),
  };

  let regions = detect_vector_diagram_regions(
    &paths,
    page_left,
    page_top,
    page_width,
    page_height,
    native_rows,
    allow_missing_native_text,
  );
  let options = pdf_oxide::rendering::RenderOptions::with_dpi(120);

  regions
    .into_iter()
    .filter_map(|region| {
      let rendered = pdf_oxide::rendering::render_page_region(
        doc,
        page_0based,
        (region.left, region.bottom, region.width, region.height),
        &options,
      )
      .ok()?;
      let dynamic_image = image::load_from_memory(&rendered.data).ok()?;
      render_dynamic_image_region(
        &dynamic_image,
        region,
        page_left,
        page_width,
        col,
      )
    })
    .collect()
}
555
/// Page origin and size as `(min_x, min_y, width, height)`.
///
/// The media box corners are normalised so the origin is the smaller
/// coordinate on each axis. Falls back to `(0, 0, 612, 792)` — US Letter
/// in points — when the box is unavailable or has no area.
#[cfg(any(feature = "pdf-rendering", feature = "pdf-ocr-bundled"))]
fn page_metrics(
  doc: &pdf_oxide::PdfDocument,
  page_0based: usize,
) -> (f32, f32, f32, f32) {
  const FALLBACK: (f32, f32, f32, f32) = (0.0, 0.0, 612.0, 792.0);

  let Ok((llx, lly, urx, ury)) = doc.get_page_media_box(page_0based) else {
    return FALLBACK;
  };
  let width = (urx - llx).abs();
  let height = (ury - lly).abs();
  if width > 0.0 && height > 0.0 {
    (llx.min(urx), lly.min(ury), width, height)
  } else {
    FALLBACK
  }
}
570
/// Stand-in used when the `pdf-rendering` feature is disabled.
///
/// Keeps the same signature as the rendering-enabled version so callers
/// need no `cfg`; all arguments are ignored and no regions are produced.
#[cfg(not(feature = "pdf-rendering"))]
fn render_vector_diagram_regions(
  _doc: &pdf_oxide::PdfDocument,
  _page_0based: usize,
  _col: usize,
  _native_rows: &[VisualTextRow],
  _allow_missing_native_text: bool,
) -> Vec<VisualImageRows> {
  Vec::new()
}
581
/// Group vector paths into candidate diagram regions.
///
/// Only table-primitive paths with a finite bounding box are used; a box
/// is skipped when both its width and height are non-positive, or when
/// it spans more than 95 % of the page in either direction. Surviving
/// paths are clustered by proximity. A cluster becomes a region when it
/// holds at least 3 paths, measures at least 24 × 24 after padding, and
/// passes `should_render_vector_diagram_region`.
///
/// NOTE(review): `page_top` is used as the *lower* clamp (minimum y) and
/// `page_top + page_height` as the upper clamp, so despite its name it
/// looks like the page's bottom edge — confirm against the callers.
#[cfg(any(feature = "pdf-rendering", test))]
fn detect_vector_diagram_regions(
  paths: &[pdf_oxide::elements::PathContent],
  page_left: f32,
  page_top: f32,
  page_width: f32,
  page_height: f32,
  native_rows: &[VisualTextRow],
  allow_missing_native_text: bool,
) -> Vec<PdfRegion> {
  let is_unusable = |path: &pdf_oxide::elements::PathContent| -> bool {
    let bbox = path.bbox;
    !path.is_table_primitive()
      || !bbox.x.is_finite()
      || !bbox.y.is_finite()
      || !bbox.width.is_finite()
      || !bbox.height.is_finite()
      || (bbox.width <= 0.0 && bbox.height <= 0.0)
      || bbox.width > page_width * 0.95
      || bbox.height > page_height * 0.95
  };

  let mut clusters: Vec<VectorPathCluster> = Vec::new();
  for path in paths.iter().filter(|path| !is_unusable(path)) {
    let bbox = path.bbox;
    // The bbox's `top()`/`bottom()` are swapped on purpose: its `top()`
    // is stored as the lower edge and its `bottom()` as the upper edge.
    add_vector_path_to_clusters(
      &mut clusters,
      VectorPathBounds {
        left: bbox.left(),
        bottom: bbox.top(),
        right: bbox.right(),
        top: bbox.bottom(),
      },
    );
  }

  let page_right = page_left + page_width;
  let page_bottom = page_top + page_height;

  let mut regions = Vec::new();
  for cluster in clusters {
    if cluster.count < 3 {
      continue;
    }
    let Some(region) =
      cluster.region_with_padding(page_left, page_top, page_right, page_bottom)
    else {
      continue;
    };
    let big_enough = region.width >= 24.0 && region.height >= 24.0;
    if big_enough
      && should_render_vector_diagram_region(
        region,
        native_rows,
        allow_missing_native_text,
      )
    {
      regions.push(region);
    }
  }
  regions
}
635
/// Edge coordinates of a single vector path's bounding box.
///
/// `bottom` is the smaller y and `top` the larger one, matching
/// `PdfRegion`'s orientation.
#[cfg(any(feature = "pdf-rendering", test))]
#[derive(Clone, Copy, Debug)]
struct VectorPathBounds {
  /// x of the left edge.
  left: f32,
  /// y of the lower edge.
  bottom: f32,
  /// x of the right edge.
  right: f32,
  /// y of the upper edge.
  top: f32,
}
644
/// Running bounding box of a group of nearby vector paths.
#[cfg(any(feature = "pdf-rendering", test))]
#[derive(Clone, Copy, Debug)]
struct VectorPathCluster {
  /// Number of paths merged into this cluster.
  count: usize,
  /// x of the left edge of the union box.
  left: f32,
  /// y of the lower edge of the union box.
  bottom: f32,
  /// x of the right edge of the union box.
  right: f32,
  /// y of the upper edge of the union box.
  top: f32,
}

#[cfg(any(feature = "pdf-rendering", test))]
impl VectorPathCluster {
  /// Start a cluster from a single path.
  fn new(bounds: VectorPathBounds) -> Self {
    let VectorPathBounds { left, bottom, right, top } = bounds;
    Self { count: 1, left, bottom, right, top }
  }

  /// `true` when `bounds` overlaps this cluster's box after growing the
  /// box by the clustering tolerance on every side.
  fn is_near(&self, bounds: VectorPathBounds) -> bool {
    const CLUSTER_TOLERANCE: f32 = 48.0;
    let overlaps_x = bounds.left <= self.right + CLUSTER_TOLERANCE
      && bounds.right >= self.left - CLUSTER_TOLERANCE;
    let overlaps_y = bounds.bottom <= self.top + CLUSTER_TOLERANCE
      && bounds.top >= self.bottom - CLUSTER_TOLERANCE;
    overlaps_x && overlaps_y
  }

  /// Add one path to the cluster.
  fn merge_bounds(&mut self, bounds: VectorPathBounds) {
    self.count += 1;
    self.grow(bounds.left, bounds.bottom, bounds.right, bounds.top);
  }

  /// Absorb another cluster, summing the path counts.
  fn merge_cluster(&mut self, other: Self) {
    self.count += other.count;
    self.grow(other.left, other.bottom, other.right, other.top);
  }

  /// Extend the union box so it also covers the given edges.
  fn grow(&mut self, left: f32, bottom: f32, right: f32, top: f32) {
    self.left = self.left.min(left);
    self.bottom = self.bottom.min(bottom);
    self.right = self.right.max(right);
    self.top = self.top.max(top);
  }

  /// Cluster box padded by 4 units and clamped to the page.
  ///
  /// `page_top` is the lower clamp and `page_bottom` the upper clamp
  /// (the names follow the callers). Returns `None` when the left or
  /// bottom edge is not finite; width and height never go below zero.
  fn region_with_padding(
    &self,
    page_left: f32,
    page_top: f32,
    page_right: f32,
    page_bottom: f32,
  ) -> Option<PdfRegion> {
    const PAD: f32 = 4.0;

    if !(self.left.is_finite() && self.bottom.is_finite()) {
      return None;
    }
    let left = (self.left - PAD).max(page_left);
    let bottom = (self.bottom - PAD).max(page_top);
    let right = (self.right + PAD).min(page_right);
    let top = (self.top + PAD).min(page_bottom);
    Some(PdfRegion {
      left,
      bottom,
      width: (right - left).max(0.0),
      height: (top - bottom).max(0.0),
    })
  }
}
714
/// Insert one path into the cluster list.
///
/// The path joins the first cluster it is near, or starts a new cluster
/// when none is near. After joining, every other cluster that is near
/// the (now larger) target is folded into it in a single left-to-right
/// pass.
///
/// NOTE(review): the pass does not restart after a merge, so a cluster
/// visited earlier that only becomes near once the target has grown
/// stays separate until a later path touches it.
#[cfg(any(feature = "pdf-rendering", test))]
fn add_vector_path_to_clusters(
  clusters: &mut Vec<VectorPathCluster>,
  bounds: VectorPathBounds,
) {
  let mut target = match clusters.iter().position(|c| c.is_near(bounds)) {
    Some(found) => found,
    None => {
      clusters.push(VectorPathCluster::new(bounds));
      return;
    }
  };
  clusters[target].merge_bounds(bounds);

  let mut cursor = 0;
  while cursor < clusters.len() {
    let candidate = clusters[cursor];
    let candidate_bounds = VectorPathBounds {
      left: candidate.left,
      bottom: candidate.bottom,
      right: candidate.right,
      top: candidate.top,
    };
    if cursor == target || !clusters[target].is_near(candidate_bounds) {
      cursor += 1;
      continue;
    }

    // Removing shifts later elements left; keep `target` pointing at the
    // same cluster and re-examine the element now at `cursor`.
    let absorbed = clusters.remove(cursor);
    if cursor < target {
      target -= 1;
    }
    clusters[target].merge_cluster(absorbed);
  }
}
748
/// Decide whether a detected vector region should be drawn.
///
/// A nearby figure caption is mandatory. In addition the region must
/// contain native text, unless `allow_missing_native_text` waives that.
#[cfg(any(feature = "pdf-rendering", test))]
fn should_render_vector_diagram_region(
  region: PdfRegion,
  native_rows: &[VisualTextRow],
  allow_missing_native_text: bool,
) -> bool {
  has_nearby_figure_caption(region, native_rows)
    && (allow_missing_native_text
      || has_native_text_inside_region(region, native_rows))
}
761
/// `true` when some row is a figure caption located close to `region`.
///
/// "Close" means within 80 units horizontally (row width estimated at 5
/// units per character) and within 90 units vertically.
#[cfg(any(feature = "pdf-rendering", feature = "pdf-ocr-bundled", test))]
fn has_nearby_figure_caption(
  region: PdfRegion,
  native_rows: &[VisualTextRow],
) -> bool {
  let region_right = region.left + region.width;
  for row in native_rows {
    if !is_figure_caption(&row.text) {
      continue;
    }
    let row_right = row.left + row.text.chars().count() as f32 * 5.0;
    let horizontally_close =
      row.left <= region_right + 80.0 && row_right >= region.left - 80.0;
    if horizontally_close
      && vertical_distance_to_region(region, row.top) <= 90.0
    {
      return true;
    }
  }
  false
}
774
/// `true` when a non-caption row with at least two alphanumeric
/// characters overlaps `region`.
#[cfg(any(feature = "pdf-rendering", feature = "pdf-ocr-bundled", test))]
fn has_native_text_inside_region(
  region: PdfRegion,
  native_rows: &[VisualTextRow],
) -> bool {
  native_rows
    .iter()
    .filter(|row| !is_figure_caption(&row.text))
    .filter(|row| visual_alnum_len(&row.text) >= 2)
    .any(|row| visual_text_row_overlaps_region(row, region))
}
786
/// Number of alphanumeric characters (Unicode-aware) in `text`.
#[cfg(any(feature = "pdf-rendering", feature = "pdf-ocr-bundled", test))]
fn visual_alnum_len(text: &str) -> usize {
  let mut total = 0;
  for ch in text.chars() {
    if ch.is_alphanumeric() {
      total += 1;
    }
  }
  total
}
791
/// `true` when `text`, ignoring leading whitespace, starts with
/// `"Figure "` immediately followed by an ASCII digit.
#[cfg(any(feature = "pdf-rendering", feature = "pdf-ocr-bundled", test))]
fn is_figure_caption(text: &str) -> bool {
  text
    .trim_start()
    .strip_prefix("Figure ")
    .and_then(|rest| rest.chars().next())
    .is_some_and(|first| first.is_ascii_digit())
}
800
/// Vertical gap between `y` and `region`; `0.0` when `y` lies within the
/// region's bottom-to-top span.
#[cfg(any(feature = "pdf-rendering", feature = "pdf-ocr-bundled", test))]
fn vertical_distance_to_region(region: PdfRegion, y: f32) -> f32 {
  match y {
    below if below < region.bottom => region.bottom - below,
    above if above > region.top() => above - region.top(),
    _ => 0.0,
  }
}
811
/// Map a page x coordinate to a terminal column.
///
/// The offset of `x` from `page_left` (never negative) is scaled from
/// `page_width` to `col` cells and rounded. Returns 0 when the page
/// width is not positive or `col` is zero.
fn pdf_x_to_cells(
  x: f32,
  page_left: f32,
  page_width: f32,
  col: usize,
) -> usize {
  if col == 0 || page_width <= 0.0 {
    return 0;
  }
  let offset = (x - page_left).max(0.0);
  let fraction = offset / page_width;
  (fraction * col as f32).round() as usize
}
823
/// Scale a page-space width to terminal cells, rounding to nearest.
///
/// Negative widths count as zero. Returns 0 when the page width is not
/// positive or `col` is zero.
fn pdf_width_to_cells(width: f32, page_width: f32, col: usize) -> usize {
  if col == 0 || page_width <= 0.0 {
    return 0;
  }
  let fraction = width.max(0.0) / page_width;
  (fraction * col as f32).round() as usize
}
830
/// Row count that keeps the box's height/width ratio at `width_cells`.
///
/// The result is rounded and never below 1. Degenerate input (a
/// non-positive dimension or zero cells) yields 1.
fn pdf_image_height_rows(
  bbox_width: f32,
  bbox_height: f32,
  width_cells: usize,
) -> usize {
  let degenerate =
    bbox_width <= 0.0 || bbox_height <= 0.0 || width_cells == 0;
  if degenerate {
    return 1;
  }
  let aspect = bbox_height / bbox_width;
  let rows = (aspect * width_cells as f32).round();
  rows.max(1.0) as usize
}
841
842fn compose_visual_page(
843  text_rows: Vec<VisualTextRow>,
844  mut image_rows: Vec<VisualImageRows>,
845  col: usize,
846) -> PdfPageForAnsi {
847  enum Event {
848    Text(VisualTextRow),
849    Image(VisualImageRows),
850  }
851
852  let text_rows = overlay_text_rows_on_images(text_rows, &mut image_rows);
853  let mut events: Vec<Event> =
854    Vec::with_capacity(text_rows.len() + image_rows.len());
855  events.extend(text_rows.into_iter().map(Event::Text));
856  events.extend(image_rows.into_iter().map(Event::Image));
857  events.sort_by(|a, b| {
858    let a_top = match a {
859      Event::Text(row) => row.top,
860      Event::Image(row) => row.top,
861    };
862    let b_top = match b {
863      Event::Text(row) => row.top,
864      Event::Image(row) => row.top,
865    };
866    b_top.partial_cmp(&a_top).unwrap_or(std::cmp::Ordering::Equal)
867  });
868
869  let page_left = events
870    .iter()
871    .filter_map(|event| match event {
872      Event::Text(row) if !row.text.trim().is_empty() => Some(row.left),
873      _ => None,
874    })
875    .fold(f32::INFINITY, f32::min);
876  let page_left = if page_left.is_finite() { page_left } else { 0.0 };
877
878  let mut lines = Vec::new();
879  let mut line_kinds = Vec::new();
880  for event in events {
881    match event {
882      Event::Text(row) => {
883        if row.text.trim().is_empty() {
884          continue;
885        }
886        let indent =
887          (((row.left - page_left) / 5.0).round()).clamp(0.0, 20.0) as usize;
888        let text_width = col.saturating_sub(indent).max(1);
889        let wrapped_lines = if row.text.chars().count() <= text_width {
890          vec![row.text]
891        } else {
892          cli_justify::justify(&row.text, text_width)
893        };
894        for wrapped in wrapped_lines {
895          lines.push(format!("{}{}", " ".repeat(indent), wrapped));
896          line_kinds.push(PdfLineKind::Text);
897        }
898      }
899      Event::Image(row) => {
900        let indent = " ".repeat(row.left_cells);
901        for line in row.lines {
902          lines.push(format!("{indent}{line}\x1b[0m"));
903          line_kinds.push(PdfLineKind::AnsiArt);
904        }
905      }
906    }
907  }
908
909  if lines.is_empty() {
910    lines.push(String::new());
911    line_kinds.push(PdfLineKind::Text);
912  }
913
914  PdfPageForAnsi { lines, line_kinds }
915}
916
917fn overlay_text_rows_on_images(
918  text_rows: Vec<VisualTextRow>,
919  image_rows: &mut [VisualImageRows],
920) -> Vec<VisualTextRow> {
921  let mut remaining = Vec::new();
922  for row in text_rows {
923    if !overlay_text_row_on_first_matching_image(&row, image_rows) {
924      remaining.push(row);
925    }
926  }
927  remaining
928}
929
930fn overlay_text_row_on_first_matching_image(
931  row: &VisualTextRow,
932  image_rows: &mut [VisualImageRows],
933) -> bool {
934  for image in image_rows {
935    if !image_contains_text_row(image, row) {
936      continue;
937    }
938    let line_idx = image_text_line_index(image, row.top);
939    let col_idx = image_text_col_index(image, row.left);
940    let Some(line) = image.lines.get_mut(line_idx) else {
941      return false;
942    };
943    *line = overlay_text_on_ansi_line(line, col_idx, row.text.trim());
944    return true;
945  }
946  false
947}
948
949fn image_contains_text_row(
950  image: &VisualImageRows,
951  row: &VisualTextRow,
952) -> bool {
953  let right = image.region.left + image.region.width;
954  let bottom = image.region.bottom;
955  let top = image.region.top();
956  let vertical_pad = (image.region.height / image.lines.len().max(1) as f32
957    * 0.5)
958    .clamp(2.0, 6.0);
959  row.top <= top + vertical_pad
960    && row.top >= bottom - vertical_pad
961    && row.left <= right
962    && row.left + row.text.chars().count() as f32 * 5.0 >= image.region.left
963}
964
965fn image_text_line_index(image: &VisualImageRows, text_top: f32) -> usize {
966  if image.lines.is_empty() || image.region.height <= 0.0 {
967    return 0;
968  }
969  let rel = ((image.region.top() - text_top) / image.region.height)
970    .clamp(0.0, 0.999_999);
971  (rel * image.lines.len() as f32).floor() as usize
972}
973
974fn image_text_col_index(image: &VisualImageRows, text_left: f32) -> usize {
975  if image.region.width <= 0.0 || image.width_cells == 0 {
976    return 0;
977  }
978  let rel =
979    ((text_left - image.region.left) / image.region.width).clamp(0.0, 1.0);
980  (rel * image.width_cells as f32).round() as usize
981}
982
983fn overlay_text_on_ansi_line(
984  line: &str,
985  start_col: usize,
986  text: &str,
987) -> String {
988  let available = ansi_visible_width(line).saturating_sub(start_col);
989  if available == 0 {
990    return line.to_string();
991  }
992  let text: String =
993    text.chars().filter(|ch| !ch.is_control()).take(available).collect();
994  if text.is_empty() {
995    return line.to_string();
996  }
997  let overlay_width = text.chars().count();
998  let mut out = String::with_capacity(line.len() + text.len() + 8);
999  let mut chars = line.chars().peekable();
1000  let mut visible_col = 0usize;
1001  let mut inserted = false;
1002
1003  while let Some(ch) = chars.next() {
1004    if ch == '\x1b' {
1005      out.push(ch);
1006      for next in chars.by_ref() {
1007        out.push(next);
1008        if next == 'm' {
1009          break;
1010        }
1011      }
1012      continue;
1013    }
1014
1015    if !inserted && visible_col >= start_col {
1016      out.push_str("\x1b[0m");
1017      out.push_str(&text);
1018      out.push_str("\x1b[0m");
1019      inserted = true;
1020    }
1021
1022    if inserted
1023      && visible_col >= start_col
1024      && visible_col < start_col + overlay_width
1025    {
1026      visible_col += 1;
1027      continue;
1028    }
1029
1030    out.push(ch);
1031    visible_col += 1;
1032  }
1033
1034  if !inserted {
1035    out.push_str(&" ".repeat(start_col.saturating_sub(visible_col)));
1036    out.push_str("\x1b[0m");
1037    out.push_str(&text);
1038  }
1039
1040  out
1041}
1042
/// Number of characters in `line` that are not part of an escape
/// sequence.
///
/// An escape sequence runs from `ESC` through the next `m` (or to the
/// end of the line when no `m` follows). Every other `char` counts as
/// one column.
fn ansi_visible_width(line: &str) -> usize {
  let mut inside_escape = false;
  let mut width = 0usize;
  for ch in line.chars() {
    if inside_escape {
      if ch == 'm' {
        inside_escape = false;
      }
    } else if ch == '\x1b' {
      inside_escape = true;
    } else {
      width += 1;
    }
  }
  width
}
1059
/// `true` when `ocr_row` repeats text a native row already provides.
///
/// An OCR row with no alphanumeric content is always reported as a
/// duplicate so that it gets dropped. Otherwise a native row matches
/// when it sits within 12 units vertically and 24 units horizontally of
/// the OCR row and one normalized text contains the other.
///
/// Native rows that normalize to the empty string (punctuation or
/// bullets only) are ignored: the empty string is contained in every
/// string, so without this guard such a row would suppress any OCR text
/// next to it.
#[cfg(feature = "pdf-ocr-bundled")]
fn has_near_duplicate_visual_text(
  native_rows: &[VisualTextRow],
  ocr_row: &VisualTextRow,
) -> bool {
  let ocr_norm = normalized_visual_text(&ocr_row.text);
  if ocr_norm.is_empty() {
    return true;
  }
  native_rows
    .iter()
    .filter(|native| {
      (native.top - ocr_row.top).abs() <= 12.0
        && (native.left - ocr_row.left).abs() <= 24.0
    })
    .any(|native| {
      let native_norm = normalized_visual_text(&native.text);
      !native_norm.is_empty()
        && (native_norm.contains(&ocr_norm)
          || ocr_norm.contains(&native_norm))
    })
}
1078
1079#[cfg(feature = "pdf-ocr-bundled")]
/// Reduces `text` to a comparison key: only alphanumeric characters are
/// kept, each replaced by its lowercase mapping (which may expand to more
/// than one `char`).
fn normalized_visual_text(text: &str) -> String {
  let mut key = String::new();
  for ch in text.chars() {
    if ch.is_alphanumeric() {
      key.extend(ch.to_lowercase());
    }
  }
  key
}
1087
/// Test-only helper: pairs each justified line of `raw_text` with the
/// position of the visual row at the same index on the page.
///
/// Extraction runs under `catch_unwind`; a panic, an error or an empty
/// result all mean "no anchors". Without anchors every line gets
/// `left = 0.0` and `top = -(index)`. With anchors, lines beyond the last
/// anchor reuse its position, stepping `top` down by one per extra line.
#[cfg(test)]
fn positioned_sanitized_text_rows(
  doc: &pdf_oxide::PdfDocument,
  page_0based: usize,
  raw_text: &str,
  col: usize,
) -> Vec<VisualTextRow> {
  let justified = cli_justify::justify_pdf_page(raw_text, col).lines;
  let guarded = std::panic::AssertUnwindSafe(|| {
    extract_visual_text_rows(doc, page_0based)
  });
  let anchors = match std::panic::catch_unwind(guarded) {
    Ok(Some(found)) => found,
    Ok(None) | Err(_) => Vec::new(),
  };

  // `checked_sub` doubles as the emptiness test: no last index, no anchors.
  let Some(final_idx) = anchors.len().checked_sub(1) else {
    return justified
      .into_iter()
      .enumerate()
      .map(|(idx, text)| VisualTextRow { top: -(idx as f32), left: 0.0, text })
      .collect();
  };

  justified
    .into_iter()
    .enumerate()
    .map(|(idx, text)| {
      let anchor = &anchors[idx.min(final_idx)];
      let overflow = idx.saturating_sub(final_idx) as f32;
      VisualTextRow { top: anchor.top - overflow, left: anchor.left, text }
    })
    .collect()
}
1124
1125fn positioned_visual_text_rows(
1126  doc: &pdf_oxide::PdfDocument,
1127  page_0based: usize,
1128) -> Vec<VisualTextRow> {
1129  std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
1130    extract_visual_text_rows(doc, page_0based)
1131  }))
1132  .ok()
1133  .flatten()
1134  .map(filter_visual_text_rows)
1135  .unwrap_or_default()
1136}
1137
1138fn filter_visual_text_rows(rows: Vec<VisualTextRow>) -> Vec<VisualTextRow> {
1139  let mut rows: Vec<VisualTextRow> = rows
1140    .into_iter()
1141    .filter_map(|mut row| {
1142      row.text = normalize_visual_text_row(&row.text);
1143      if row.text.trim().is_empty() || is_visual_running_header(&row.text) {
1144        None
1145      } else {
1146        Some(row)
1147      }
1148    })
1149    .collect();
1150
1151  const ISOLATED_GAP: f32 = 30.0;
1152  while rows.len() >= 2
1153    && is_digits_only(&rows[0].text)
1154    && (rows[0].top - rows[1].top).abs() > ISOLATED_GAP
1155  {
1156    rows.remove(0);
1157  }
1158  while rows.len() >= 2 {
1159    let last = rows.len() - 1;
1160    if is_digits_only(&rows[last].text)
1161      && (rows[last - 1].top - rows[last].top).abs() > ISOLATED_GAP
1162    {
1163      rows.remove(last);
1164    } else {
1165      break;
1166    }
1167  }
1168
1169  rows
1170}
1171
1172fn normalize_visual_text_row(text: &str) -> String {
1173  let mut normalized = String::with_capacity(text.len());
1174  for ch in text.chars() {
1175    if is_private_use_or_format_char(ch) {
1176      continue;
1177    }
1178    if ch == '\u{00A0}' {
1179      normalized.push(' ');
1180    } else {
1181      normalized.push(ch);
1182    }
1183  }
1184  normalized
1185}
1186
/// True for characters that are stripped from visual rows: the three
/// private-use ranges (U+E000–U+F8FF, U+F0000–U+FFFFD, U+100000–U+10FFFD)
/// and a handful of invisible characters (U+FEFF, U+200B–U+200D, U+2060).
fn is_private_use_or_format_char(ch: char) -> bool {
  let cp = u32::from(ch);

  let private_use = (0xE000..=0xF8FF).contains(&cp)
    || (0xF0000..=0xFFFFD).contains(&cp)
    || (0x10_0000..=0x10_FFFD).contains(&cp);

  let invisible =
    cp == 0xFEFF || (0x200B..=0x200D).contains(&cp) || cp == 0x2060;

  private_use || invisible
}
1198
1199fn is_visual_running_header(text: &str) -> bool {
1200  let trimmed = text.trim();
1201  if trimmed.is_empty() {
1202    return false;
1203  }
1204
1205  is_chapter_section_visual_header(trimmed)
1206}
1207
1208fn is_chapter_section_visual_header(trimmed: &str) -> bool {
1209  let tokens: Vec<&str> = trimmed.split_whitespace().collect();
1210  if tokens.len() < 3 || tokens.len() > 6 {
1211    return false;
1212  }
1213
1214  let label = tokens[0];
1215  if !matches!(label, "CHAPTER" | "SECTION" | "APPENDIX" | "PART") {
1216    return false;
1217  }
1218
1219  let number = tokens[1];
1220  if number.is_empty() || number.len() > 8 {
1221    return false;
1222  }
1223  if !number.chars().all(|ch| ch.is_ascii_alphanumeric() || ch == '.') {
1224    return false;
1225  }
1226
1227  let looks_like_section_id = number.chars().any(|ch| ch.is_ascii_digit())
1228    || number.chars().all(|ch| ch.is_ascii_uppercase());
1229  if !looks_like_section_id {
1230    return false;
1231  }
1232
1233  let last = tokens[tokens.len() - 1];
1234  if last.chars().all(|ch| ch.is_ascii_digit()) {
1235    return false;
1236  }
1237
1238  has_visual_wide_gap_between(trimmed, number, last)
1239}
1240
/// Reports whether at least 10 space characters lie between the end of
/// `first` and the start of the last occurrence of `last` inside `trimmed`.
///
/// `first` is located as a whole token — an occurrence bounded by
/// whitespace or the string edges — so a short identifier such as `A` is
/// not matched inside a preceding word like `PART` or `APPENDIX` (which
/// would count the label-to-identifier spacing as part of the gap). When no
/// whole-token occurrence exists, the first plain substring occurrence is
/// used instead.
///
/// Returns `false` when either string is missing or when `last` does not
/// start strictly after `first` ends. Only `' '` is counted; the spaces
/// need not be contiguous.
fn has_visual_wide_gap_between(trimmed: &str, first: &str, last: &str) -> bool {
  const MIN_GAP_SPACES: usize = 10;

  // `start` always comes from `match_indices`, so both slice points below
  // fall on char boundaries.
  let is_whole_token = |start: usize| {
    let end = start + first.len();
    let bounded_left =
      start == 0 || trimmed[..start].ends_with(char::is_whitespace);
    let bounded_right =
      end == trimmed.len() || trimmed[end..].starts_with(char::is_whitespace);
    bounded_left && bounded_right
  };

  let located = trimmed
    .match_indices(first)
    .map(|(start, _)| start)
    .find(|&start| is_whole_token(start))
    .or_else(|| trimmed.find(first));
  let Some(first_idx) = located else {
    return false;
  };
  let first_end = first_idx + first.len();

  let Some(last_start) = trimmed.rfind(last) else {
    return false;
  };
  if last_start <= first_end {
    return false;
  }

  let spaces =
    trimmed[first_end..last_start].chars().filter(|ch| *ch == ' ').count();
  spaces >= MIN_GAP_SPACES
}
1254
1255fn extract_visual_text_rows(
1256  doc: &pdf_oxide::PdfDocument,
1257  page_0based: usize,
1258) -> Option<Vec<VisualTextRow>> {
1259  let mut lines = doc.extract_text_lines(page_0based).ok()?;
1260  if lines.is_empty() {
1261    return None;
1262  }
1263
1264  lines.sort_by(|a, b| {
1265    b.bbox
1266      .top()
1267      .partial_cmp(&a.bbox.top())
1268      .unwrap_or(std::cmp::Ordering::Equal)
1269      .then_with(|| {
1270        a.bbox
1271          .left()
1272          .partial_cmp(&b.bbox.left())
1273          .unwrap_or(std::cmp::Ordering::Equal)
1274      })
1275  });
1276
1277  const SAME_ROW_TOL: f32 = 3.0;
1278  const PT_PER_CHAR: f32 = 5.0;
1279
1280  let mut rows = Vec::new();
1281  let mut row_start = 0usize;
1282  let mut row_anchor_y = lines[0].bbox.top();
1283  for i in 1..=lines.len() {
1284    let break_row = i == lines.len()
1285      || (row_anchor_y - lines[i].bbox.top()).abs() > SAME_ROW_TOL;
1286    if break_row {
1287      let mut row: Vec<&pdf_oxide::layout::TextLine> =
1288        lines[row_start..i].iter().collect();
1289      row.sort_by(|a, b| {
1290        a.bbox
1291          .left()
1292          .partial_cmp(&b.bbox.left())
1293          .unwrap_or(std::cmp::Ordering::Equal)
1294      });
1295      let row_left =
1296        row.iter().map(|l| l.bbox.left()).fold(f32::INFINITY, f32::min);
1297      let mut body = String::new();
1298      let mut prev_right: Option<f32> = None;
1299      for line in row {
1300        for word in &line.words {
1301          if let Some(pr) = prev_right {
1302            let gap_pt = (word.bbox.left() - pr).max(0.0);
1303            let gap_chars = ((gap_pt / PT_PER_CHAR).round() as usize).max(1);
1304            for _ in 0..gap_chars {
1305              body.push(' ');
1306            }
1307          }
1308          body.push_str(&word.text);
1309          prev_right = Some(word.bbox.right());
1310        }
1311      }
1312      rows.push(VisualTextRow {
1313        top: row_anchor_y,
1314        left: row_left,
1315        text: body,
1316      });
1317      row_start = i;
1318      if i < lines.len() {
1319        row_anchor_y = lines[i].bbox.top();
1320      }
1321    }
1322  }
1323
1324  Some(rows)
1325}
1326
1327/// Build a text blob from pdf_oxide's positional `TextLine` output.
1328///
1329/// Lines are returned in a roughly visual order but adjacent rows can
1330/// collide when text is laid out in cells (table rows) or columns. We
1331/// sort by y descending (PDF origin is bottom-left, so top of page is the
1332/// largest y), then walk the list collecting lines that share a row into
1333/// a single output line, sorted left-to-right within the row.
1334fn extract_page_text_lines(
1335  doc: &pdf_oxide::PdfDocument,
1336  page_0based: usize,
1337) -> Option<String> {
1338  let mut lines = doc.extract_text_lines(page_0based).ok()?;
1339  if lines.is_empty() {
1340    return None;
1341  }
1342
1343  // Sort top-to-bottom, then left-to-right.
1344  lines.sort_by(|a, b| {
1345    b.bbox
1346      .top()
1347      .partial_cmp(&a.bbox.top())
1348      .unwrap_or(std::cmp::Ordering::Equal)
1349      .then_with(|| {
1350        a.bbox
1351          .left()
1352          .partial_cmp(&b.bbox.left())
1353          .unwrap_or(std::cmp::Ordering::Equal)
1354      })
1355  });
1356
1357  // Threshold below which two lines are considered to be on the same row.
1358  // pdf_oxide's line bboxes for the same baseline tend to differ by < 1pt
1359  // even with mixed font sizes; 3pt comfortably absorbs that noise without
1360  // merging adjacent rows (which are typically separated by 10+pt).
1361  const SAME_ROW_TOL: f32 = 3.0;
1362
1363  // ~5 pt per char is a rough monospace approximation that lands within
1364  // a column or two of correct on body fonts in the PDFs we test against.
1365  // Cap the resulting indent so an outlier x-coordinate can't produce a
1366  // multi-line waste of whitespace.
1367  const PT_PER_CHAR: f32 = 5.0;
1368  const MAX_INDENT_CHARS: usize = 20;
1369
1370  // Build rows first as `(anchor_y, row_left, body_text)` so we can
1371  // post-process before producing the final string (drop isolated
1372  // page-number rows, drop running headers, recompute page_left after
1373  // dropping outliers, insert paragraph-break blank lines, etc.).
1374  // `body_text` is the row content WITHOUT its leading indent — the indent
1375  // is applied later from `(row_left - page_left)` once page_left has been
1376  // settled.
1377  let mut rows: Vec<(f32, f32, String)> = Vec::new();
1378  let mut row_start = 0usize;
1379  let mut row_anchor_y = lines[0].bbox.top();
1380  for i in 1..=lines.len() {
1381    let break_row = i == lines.len()
1382      || (row_anchor_y - lines[i].bbox.top()).abs() > SAME_ROW_TOL;
1383    if break_row {
1384      let mut row: Vec<&pdf_oxide::layout::TextLine> =
1385        lines[row_start..i].iter().collect();
1386      row.sort_by(|a, b| {
1387        a.bbox
1388          .left()
1389          .partial_cmp(&b.bbox.left())
1390          .unwrap_or(std::cmp::Ordering::Equal)
1391      });
1392      let row_left =
1393        row.iter().map(|l| l.bbox.left()).fold(f32::INFINITY, f32::min);
1394      // Walk every word across every TextLine in this row left-to-right
1395      // and insert spacing proportional to the bbox gap between adjacent
1396      // words. `TextLine::text` joins words with a single space and so
1397      // collapses the wide column gaps that TOC pages depend on — without
1398      // those gaps `parse_aligned_toc_row_start` can't split a row like
1399      // `1.1     About This Book     25` into prefix/title/page-number.
1400      let mut body = String::with_capacity(64);
1401      let mut prev_right: Option<f32> = None;
1402      for line in row.iter() {
1403        for word in &line.words {
1404          if let Some(pr) = prev_right {
1405            let gap_pt = (word.bbox.left() - pr).max(0.0);
1406            let gap_chars = ((gap_pt / PT_PER_CHAR).round() as usize).max(1);
1407            for _ in 0..gap_chars {
1408              body.push(' ');
1409            }
1410          }
1411          body.push_str(&word.text);
1412          prev_right = Some(word.bbox.right());
1413        }
1414      }
1415      rows.push((row_anchor_y, row_left, body));
1416      row_start = i;
1417      if i < lines.len() {
1418        row_anchor_y = lines[i].bbox.top();
1419      }
1420    }
1421  }
1422
1423  // Drop the top/bottom row if it's an isolated digits-only run — almost
1424  // certainly the page-number header/footer. The old `>=20 leading ws`
1425  // sanitize rule didn't survive positional extraction (we recompute
1426  // indents ourselves), so this is the only thing standing between the
1427  // page-number "5" / "6" / "7" rows and the reader.
1428  //
1429  // We deliberately do NOT drop short alphabetic running headers
1430  // (`Contents`, `Figures`, etc.) here — on some pages those same words
1431  // are the actual centered chapter title, and we can't tell them apart
1432  // by isolation alone. The sanitize pass dedups them by exact text after
1433  // the first occurrence is registered as a centered heading.
1434  const ISOLATED_GAP: f32 = 30.0;
1435  // Loop so a page that has BOTH a "6" page-number AND a centered title
1436  // above the body still strips the page number; the title stays.
1437  while rows.len() >= 2
1438    && is_digits_only(&rows[0].2)
1439    && (rows[0].0 - rows[1].0).abs() > ISOLATED_GAP
1440  {
1441    rows.remove(0);
1442  }
1443  while rows.len() >= 2 {
1444    let last = rows.len() - 1;
1445    if is_digits_only(&rows[last].2)
1446      && (rows[last - 1].0 - rows[last].0).abs() > ISOLATED_GAP
1447    {
1448      rows.remove(last);
1449    } else {
1450      break;
1451    }
1452  }
1453
1454  // Page body left margin. We need a value that's stable across pages so
1455  // facing-page TOCs (where the running header lives in a different x
1456  // column than body content) don't produce different indents for the
1457  // same logical content. Strategy: take the leftmost x that's "popular"
1458  // — bucket every row's left edge at 1pt resolution and use the smallest
1459  // bucket that has more than one row. Singleton positions (centered
1460  // titles, lone running headers, isolated captions) get filtered out and
1461  // can no longer pull the margin to the left.
1462  let mut buckets: std::collections::HashMap<i32, usize> =
1463    std::collections::HashMap::new();
1464  for (_, row_left, _) in &rows {
1465    let key = row_left.round() as i32;
1466    *buckets.entry(key).or_insert(0) += 1;
1467  }
1468  let popular_min = buckets
1469    .iter()
1470    .filter(|(_, count)| **count >= 2)
1471    .map(|(k, _)| *k as f32)
1472    .fold(f32::INFINITY, f32::min);
1473  let page_left = if popular_min.is_finite() {
1474    popular_min
1475  } else {
1476    rows.iter().map(|(_, x, _)| *x).fold(f32::INFINITY, f32::min)
1477  };
1478
1479  // Paragraph / code-block boundaries: pdf_oxide gives us no signal for
1480  // these — adjacent rows just have their y values, and runs of body text
1481  // sit ~13-16pt apart while a paragraph break or heading-to-prose
1482  // transition leaves a 25-35pt gap. Emit a blank line at every gap
1483  // that's noticeably larger than the page's *typical* line gap, so
1484  // downstream re-justification (and the reader visually) gets the same
1485  // paragraph shape pdf-extract used to produce.
1486  //
1487  // "Typical" here is the *mode* of the gap distribution bucketed at 2pt,
1488  // not the median or mean — within-block line spacing is by far the most
1489  // common gap (most rows are body text on most pages), so the mode tracks
1490  // it directly. Mean and median both pick up an upward bias from the few
1491  // legitimate paragraph breaks they're trying to detect.
1492  let gaps: Vec<f32> =
1493    rows.windows(2).map(|w| (w[0].0 - w[1].0).max(0.0)).collect();
1494  let para_threshold = paragraph_gap_threshold(&gaps);
1495
1496  let mut output =
1497    String::with_capacity(rows.iter().map(|(_, _, s)| s.len() + 8).sum());
1498  for i in 0..rows.len() {
1499    if i > 0 && gaps[i - 1] > para_threshold {
1500      output.push('\n');
1501    }
1502    let (_, row_left, body) = &rows[i];
1503    let indent_chars =
1504      (((row_left - page_left) / PT_PER_CHAR).round()).max(0.0) as usize;
1505    let indent_chars = indent_chars.min(MAX_INDENT_CHARS);
1506    for _ in 0..indent_chars {
1507      output.push(' ');
1508    }
1509    output.push_str(body);
1510    output.push('\n');
1511  }
1512  Some(output)
1513}
1514
/// True when `s`, ignoring surrounding whitespace, is a non-empty run of
/// ASCII digits.
fn is_digits_only(s: &str) -> bool {
  match s.trim().as_bytes() {
    [] => false,
    // Every byte of a non-ASCII char is >= 0x80, so a byte-wise check
    // rejects exactly the same strings as a char-wise one.
    bytes => bytes.iter().all(u8::is_ascii_digit),
  }
}
1519
/// Returns "anything bigger than this is a paragraph / block break"
/// derived from the distribution of gaps on the page.
///
/// Strategy: bucket gaps at 2pt resolution, take the most-popular bucket
/// as the within-block line spacing, then return 1.7× that. We ignore
/// gaps under 5pt (intra-row noise from the row-grouping tolerance) when
/// computing the mode. Clamped to [20, 50] pt so a degenerate page (one
/// row, all-equal gaps, etc.) still produces a sane threshold.
///
/// When several buckets are equally popular the smallest gap wins. This
/// keeps the result deterministic (hash-map iteration order is not), and
/// the smaller candidate is the likelier line spacing. With no usable gaps
/// the mode defaults to 14pt.
fn paragraph_gap_threshold(gaps: &[f32]) -> f32 {
  let mut buckets: std::collections::HashMap<i32, usize> =
    std::collections::HashMap::new();
  for &g in gaps {
    // NaN fails this comparison and is skipped along with the small gaps.
    if g >= 5.0 {
      let key = (g / 2.0).round() as i32;
      *buckets.entry(key).or_insert(0) += 1;
    }
  }
  let mode_gap = buckets
    .iter()
    // Highest count first; among equal counts, `Reverse` makes the
    // smallest bucket key compare greatest.
    .max_by_key(|&(&key, &count)| (count, std::cmp::Reverse(key)))
    .map(|(&key, _)| (key as f32) * 2.0)
    .unwrap_or(14.0);
  (mode_gap * 1.7).clamp(20.0, 50.0)
}
1544
1545/// Convenience wrapper so callers can hold a cheap shared handle.
1546pub type SharedPdfStream = Arc<PdfStream>;
1547
1548#[cfg(test)]
1549mod tests {
1550  use super::*;
1551  use std::path::Path;
1552
1553  #[test]
1554  fn opens_and_extracts_individual_pages() {
1555    let pdf_path = Path::new(env!("CARGO_MANIFEST_DIR"))
1556      .join("../test-data/pdf/progit-1-50.pdf");
1557    if !pdf_path.exists() {
1558      return;
1559    }
1560    let stream = PdfStream::open(pdf_path.to_str().expect("utf-8 path"))
1561      .expect("PdfStream should open valid test PDF");
1562    assert!(stream.total_pages() > 0, "test PDF should report pages");
1563
1564    // Scan a few early pages — at least one should produce real text.
1565    // (The first page of progit is a title/cover with minimal text.)
1566    let scan_upto = stream.total_pages().min(5);
1567    let mut any_non_empty = false;
1568    for p in 1..=scan_upto {
1569      if let Some(text) = stream.extract_page(p)
1570        && !text.trim().is_empty()
1571      {
1572        any_non_empty = true;
1573        break;
1574      }
1575    }
1576    assert!(
1577      any_non_empty,
1578      "at least one of the first {scan_upto} pages should extract non-empty text"
1579    );
1580  }
1581
1582  /// Regression: progit page 43 (the "Skipping the Staging Area" page)
1583  /// used to lose all paragraph breaks because pdf_oxide's text-line API
1584  /// doesn't signal them — and the standalone "37" page-number footer
1585  /// used to leak into content because the existing sanitize.rs heuristic
1586  /// for footer numbers requires ≥20 chars of leading whitespace, which
1587  /// our positional row builder strips. Verify both stay fixed.
1588  #[test]
1589  fn progit_paragraph_breaks_and_page_footer() {
1590    let pdf_path =
1591      Path::new(env!("CARGO_MANIFEST_DIR")).join("../test-data/pdf/progit.pdf");
1592    if !pdf_path.exists() {
1593      return;
1594    }
1595    let stream = PdfStream::open(pdf_path.to_str().expect("utf-8 path"))
1596      .expect("PdfStream should open progit");
1597    let text =
1598      stream.extract_page(43).expect("progit page 43 should produce text");
1599
1600    // Page-number footer must not leak through.
1601    let lines: Vec<&str> = text.lines().collect();
1602    assert!(
1603      !lines.iter().any(|l| l.trim() == "37"),
1604      "isolated page-number footer '37' should be stripped, got:\n{text}"
1605    );
1606
1607    // The "Alternatively, you can type your commit message" sentence
1608    // starts a new paragraph after "and diff stripped out)." — there
1609    // should be a blank line between them so the reflowed output keeps
1610    // paragraph structure.
1611    let alt_pos = text
1612      .find("Alternatively, you can type your commit message")
1613      .expect("expected sentence on page 43");
1614    let before = &text[..alt_pos];
1615    assert!(
1616      before.trim_end().ends_with("and diff stripped out)."),
1617      "text immediately before 'Alternatively…' should end the previous \
1618       paragraph, got:\n…{}…",
1619      &before[before.len().saturating_sub(80)..]
1620    );
1621    let trailing_newlines =
1622      before.as_bytes().iter().rev().take_while(|&&b| b == b'\n').count();
1623    assert!(
1624      trailing_newlines >= 2,
1625      "expected at least one blank line before 'Alternatively…' \
1626       (a paragraph break), got {trailing_newlines} trailing newlines"
1627    );
1628  }
1629
1630  /// Regression: the pdf reference 1.7 TOC interleaves two adjacent
1631  /// section headers because `extract_text` collapses lines without
1632  /// regard to their bounding boxes. `extract_text_lines` + the
1633  /// row-grouping in `extract_page_text_lines` is what fixes it, so make
1634  /// sure section labels stay on their own lines for a TOC-shaped page.
1635  #[test]
1636  fn toc_section_labels_stay_separate() {
1637    let pdf_path = Path::new(env!("CARGO_MANIFEST_DIR"))
1638      .join("../test-data/pdf/pdfreference1.7old.pdf");
1639    if !pdf_path.exists() {
1640      return;
1641    }
1642    let stream = PdfStream::open(pdf_path.to_str().expect("utf-8 path"))
1643      .expect("PdfStream should open the reference PDF");
1644    // Page 5 (1-based) is the contents page.
1645    let text = stream.extract_page(5).expect("page 5 should produce text");
1646    let lines: Vec<&str> = text.lines().collect();
1647    // Word-bbox-derived spacing now preserves the wide TOC gap between the
1648    // section title and its trailing page number, so the trimmed row keeps
1649    // multiple spaces between them. Match either spacing shape.
1650    let normalize_spaces =
1651      |s: &str| s.split_whitespace().collect::<Vec<_>>().join(" ");
1652    assert!(
1653      lines
1654        .iter()
1655        .any(|l| normalize_spaces(l.trim()) == "1.3 Related Publications 31"),
1656      "section 1.3 should be on its own line, got:\n{text}"
1657    );
1658    assert!(
1659      lines
1660        .iter()
1661        .any(|l| normalize_spaces(l.trim()) == "1.4 Intellectual Property 32"),
1662      "section 1.4 should be on its own line, got:\n{text}"
1663    );
1664    // The collapsing bug previously produced this run-on string.
1665    assert!(
1666      !text.contains("1.3 Related Publications1.4"),
1667      "section labels must not be concatenated, got:\n{text}"
1668    );
1669  }
1670
1671  #[test]
1672  fn visual_composition_orders_text_and_ansi_art_with_metadata() {
1673    let text_rows = vec![
1674      VisualTextRow { top: 90.0, left: 50.0, text: "after image".to_string() },
1675      VisualTextRow {
1676        top: 200.0,
1677        left: 50.0,
1678        text: "before image".to_string(),
1679      },
1680    ];
1681    let image_rows = vec![VisualImageRows {
1682      top: 150.0,
1683      left_cells: 4,
1684      width_cells: 20,
1685      region: PdfRegion {
1686        left: 0.0,
1687        bottom: 125.0,
1688        width: 100.0,
1689        height: 25.0,
1690      },
1691      lines: vec!["\x1b[38;2;1;2;3m\x1b[48;2;4;5;6m▀\x1b[0m".into()],
1692    }];
1693
1694    let page = compose_visual_page(text_rows, image_rows, 80);
1695
1696    assert_eq!(
1697      page.line_kinds,
1698      vec![PdfLineKind::Text, PdfLineKind::AnsiArt, PdfLineKind::Text,]
1699    );
1700    assert_eq!(page.lines[0], "before image");
1701    assert!(page.lines[1].starts_with("    \x1b[38;2;1;2;3m"));
1702    assert!(page.lines[1].ends_with("\x1b[0m"));
1703    assert_eq!(page.lines[2], "after image");
1704  }
1705
1706  #[test]
1707  fn visual_text_inside_image_region_overlays_ansi_art() {
1708    let text_rows = vec![VisualTextRow {
1709      top: 140.0,
1710      left: 25.0,
1711      text: "diagram label".to_string(),
1712    }];
1713    let image_rows = vec![VisualImageRows {
1714      top: 150.0,
1715      left_cells: 0,
1716      width_cells: 40,
1717      region: PdfRegion {
1718        left: 0.0,
1719        bottom: 100.0,
1720        width: 100.0,
1721        height: 50.0,
1722      },
1723      lines: vec![
1724        format!("\x1b[38;2;1;2;3m{}\x1b[0m", "▀".repeat(40)),
1725        format!("\x1b[38;2;1;2;3m{}\x1b[0m", "▀".repeat(40)),
1726      ],
1727    }];
1728
1729    let page = compose_visual_page(text_rows, image_rows, 80);
1730
1731    assert_eq!(
1732      page.line_kinds,
1733      vec![PdfLineKind::AnsiArt, PdfLineKind::AnsiArt]
1734    );
1735    assert!(
1736      page.lines.iter().any(|line| line.contains("diagram label")),
1737      "text should be painted into the ANSI art lines: {:?}",
1738      page.lines
1739    );
1740  }
1741
1742  #[test]
1743  #[cfg(feature = "pdf-ocr-bundled")]
1744  fn ocr_text_rows_overlay_existing_ansi_art() {
1745    let engine =
1746      crate::ocr::bundled_ocr_engine().expect("bundled OCR should initialize");
1747    let image = generated_ocr_fixture("HELLO OCR");
1748    let text_rows = ocr_dynamic_image_text_rows(
1749      &engine,
1750      &image,
1751      PdfRegion { left: 0.0, bottom: 100.0, width: 300.0, height: 80.0 },
1752    );
1753    assert!(
1754      text_rows.iter().any(|row| {
1755        let normalized = normalized_visual_text(&row.text);
1756        normalized.contains("hello") || normalized.contains("ocr")
1757      }),
1758      "OCR should produce overlayable text rows, got {:?}",
1759      text_rows
1760    );
1761    let image_rows = vec![VisualImageRows {
1762      top: 180.0,
1763      left_cells: 0,
1764      width_cells: 60,
1765      region: PdfRegion {
1766        left: 0.0,
1767        bottom: 100.0,
1768        width: 300.0,
1769        height: 80.0,
1770      },
1771      lines: (0..6)
1772        .map(|_| format!("\x1b[38;2;1;2;3m{}\x1b[0m", "▀".repeat(60)))
1773        .collect(),
1774    }];
1775
1776    let page = compose_visual_page(text_rows, image_rows, 80);
1777    let rendered = page.lines.join("\n");
1778    let normalized = normalized_visual_text(&rendered);
1779
1780    assert!(page.line_kinds.iter().all(|kind| *kind == PdfLineKind::AnsiArt));
1781    assert!(
1782      normalized.contains("hello") || normalized.contains("ocr"),
1783      "OCR text should be overlaid into ANSI art, got {rendered:?}"
1784    );
1785  }
1786
1787  #[test]
1788  fn visual_text_outside_image_region_stays_separate() {
1789    let text_rows = vec![VisualTextRow {
1790      top: 75.0,
1791      left: 25.0,
1792      text: "caption below".to_string(),
1793    }];
1794    let image_rows = vec![VisualImageRows {
1795      top: 150.0,
1796      left_cells: 0,
1797      width_cells: 40,
1798      region: PdfRegion {
1799        left: 0.0,
1800        bottom: 100.0,
1801        width: 100.0,
1802        height: 50.0,
1803      },
1804      lines: vec!["\x1b[38;2;1;2;3m▀▀▀▀▀▀▀▀▀▀\x1b[0m".into()],
1805    }];
1806
1807    let page = compose_visual_page(text_rows, image_rows, 80);
1808
1809    assert_eq!(page.line_kinds, vec![PdfLineKind::AnsiArt, PdfLineKind::Text]);
1810    assert_eq!(page.lines[1], "caption below");
1811  }
1812
1813  #[test]
1814  fn text_only_ansi_page_keeps_every_line_text_marked() {
1815    let page = text_only_page_lines("one two three", 10);
1816
1817    assert!(!page.lines.is_empty());
1818    assert_eq!(page.line_kinds, vec![PdfLineKind::Text; page.lines.len()]);
1819  }
1820
1821  #[test]
1822  fn visual_page_without_art_uses_native_rows_before_sanitized_fallback() {
1823    let text_rows = vec![VisualTextRow {
1824      top: 100.0,
1825      left: 20.0,
1826      text: "diagram label".to_string(),
1827    }];
1828
1829    let page = compose_visual_page(text_rows, Vec::new(), 80);
1830
1831    assert_eq!(page.lines, vec!["diagram label"]);
1832    assert_eq!(page.line_kinds, vec![PdfLineKind::Text]);
1833  }
1834
1835  #[test]
1836  fn sanitized_text_rows_keep_pdf_position_anchors() {
1837    let pdf_path = Path::new(env!("CARGO_MANIFEST_DIR"))
1838      .join("../test-data/pdf/progit-1-50.pdf");
1839    if !pdf_path.exists() {
1840      return;
1841    }
1842    let stream = PdfStream::open(pdf_path.to_str().expect("utf-8 path"))
1843      .expect("PdfStream should open valid test PDF");
1844    let raw_text = stream.extract_page(2).expect("page should produce text");
1845    let anchors = extract_visual_text_rows(&stream.doc, 1)
1846      .expect("page should produce positioned rows");
1847    let rows = positioned_sanitized_text_rows(&stream.doc, 1, &raw_text, 80);
1848
1849    assert!(!rows.is_empty());
1850    assert_eq!(rows[0].top, anchors[0].top);
1851    assert_eq!(rows[0].left, anchors[0].left);
1852  }
1853
1854  #[test]
1855  fn progit_figure_images_do_not_expose_internal_native_labels() {
1856    let pdf_path = Path::new(env!("CARGO_MANIFEST_DIR"))
1857      .join("../test-data/pdf/progit-1-50.pdf");
1858    if !pdf_path.exists() {
1859      return;
1860    }
1861    let stream = PdfStream::open(pdf_path.to_str().expect("utf-8 path"))
1862      .expect("PdfStream should open valid test PDF");
1863    let page_0based = 22;
1864    let rows = positioned_visual_text_rows(&stream.doc, page_0based);
1865    let images = stream
1866      .doc
1867      .extract_images(page_0based)
1868      .expect("page should extract images");
1869    let bbox = images[0].bbox().expect("figure image should have a bbox");
1870    let region = PdfRegion {
1871      left: bbox.left(),
1872      bottom: bbox.top(),
1873      width: bbox.width,
1874      height: bbox.height,
1875    };
1876
1877    assert!(has_nearby_figure_caption(region, &rows));
1878    assert!(
1879      !rows.iter().any(|row| {
1880        !is_figure_caption(&row.text)
1881          && visual_text_row_overlaps_region(row, region)
1882      }),
1883      "ProGit figure labels are embedded in the image and require OCR"
1884    );
1885  }
1886
1887  #[cfg(feature = "pdf-ocr-bundled")]
1888  #[test]
1889  fn progit_figure_ocr_overlays_embedded_image_labels() {
1890    let pdf_path = Path::new(env!("CARGO_MANIFEST_DIR"))
1891      .join("../test-data/pdf/progit-1-50.pdf");
1892    if !pdf_path.exists() {
1893      return;
1894    }
1895    let stream =
1896      PdfStream::open_with_bundled_ocr(pdf_path.to_str().expect("utf-8 path"))
1897        .expect("PdfStream should open valid test PDF");
1898
1899    let page = stream
1900      .extract_page_with_images(34, 100)
1901      .expect("page should render with image rows");
1902    let rendered = page.lines.join("\n");
1903
1904    assert!(
1905      ["Untracked", "Unmodified", "Modified", "Staged"]
1906        .iter()
1907        .any(|label| rendered.contains(label)),
1908      "OCR should recover at least one embedded figure label, got {rendered:?}"
1909    );
1910  }
1911
1912  #[test]
1913  fn visual_text_rows_preserve_native_diagram_labels() {
1914    let rows = vec![
1915      VisualTextRow { top: 800.0, left: 300.0, text: "12".to_string() },
1916      VisualTextRow {
1917        top: 700.0,
1918        left: 72.0,
1919        text: "Body text before figure.".to_string(),
1920      },
1921      VisualTextRow { top: 660.0, left: 250.0, text: "Acrobat".to_string() },
1922      VisualTextRow {
1923        top: 645.0,
1924        left: 90.0,
1925        text: "Macintosh application Windows application".to_string(),
1926      },
1927      VisualTextRow { top: 630.0, left: 275.0, text: "Adobe PDF".to_string() },
1928      VisualTextRow { top: 615.0, left: 320.0, text: "printer".to_string() },
1929      VisualTextRow { top: 600.0, left: 72.0, text: "\u{f05a}".to_string() },
1930      VisualTextRow {
1931        top: 560.0,
1932        left: 72.0,
1933        text: "Body text after figure.".to_string(),
1934      },
1935      VisualTextRow { top: 40.0, left: 300.0, text: "13".to_string() },
1936    ];
1937
1938    let filtered = filter_visual_text_rows(rows);
1939    let texts: Vec<&str> =
1940      filtered.iter().map(|row| row.text.as_str()).collect();
1941
1942    assert_eq!(
1943      texts,
1944      vec![
1945        "Body text before figure.",
1946        "Acrobat",
1947        "Macintosh application Windows application",
1948        "Adobe PDF",
1949        "printer",
1950        "Body text after figure.",
1951      ]
1952    );
1953    assert!(filtered.iter().all(|row| row.text.trim() != "\u{f05a}"));
1954  }
1955
1956  #[test]
1957  fn detects_vector_diagram_region_from_box_primitives() {
1958    let paths = vec![
1959      pdf_oxide::elements::PathContent::new(pdf_oxide::geometry::Rect::new(
1960        100.0, 200.0, 80.0, 40.0,
1961      )),
1962      pdf_oxide::elements::PathContent::new(pdf_oxide::geometry::Rect::new(
1963        220.0, 200.0, 80.0, 40.0,
1964      )),
1965      pdf_oxide::elements::PathContent::new(pdf_oxide::geometry::Rect::new(
1966        160.0, 280.0, 80.0, 40.0,
1967      )),
1968    ];
1969
1970    let text_rows = vec![VisualTextRow {
1971      top: 180.0,
1972      left: 100.0,
1973      text: "Figure 1. Test diagram".to_string(),
1974    }];
1975    let regions = detect_vector_diagram_regions(
1976      &paths, 0.0, 0.0, 612.0, 792.0, &text_rows, true,
1977    );
1978
1979    assert_eq!(regions.len(), 1);
1980    assert!(regions[0].left <= 100.0);
1981    assert!(regions[0].bottom <= 200.0);
1982    assert!(regions[0].width >= 200.0);
1983    assert!(regions[0].height >= 120.0);
1984  }
1985
1986  #[test]
1987  fn ignores_single_full_width_vector_rule() {
1988    let paths = vec![pdf_oxide::elements::PathContent::new(
1989      pdf_oxide::geometry::Rect::new(0.0, 700.0, 612.0, 1.0),
1990    )];
1991
1992    assert!(
1993      detect_vector_diagram_regions(&paths, 0.0, 0.0, 612.0, 792.0, &[], true)
1994        .is_empty()
1995    );
1996  }
1997
1998  #[test]
1999  fn ignores_vector_regions_without_nearby_figure_caption() {
2000    let paths = vec![
2001      pdf_oxide::elements::PathContent::new(pdf_oxide::geometry::Rect::new(
2002        100.0, 200.0, 80.0, 40.0,
2003      )),
2004      pdf_oxide::elements::PathContent::new(pdf_oxide::geometry::Rect::new(
2005        220.0, 200.0, 80.0, 40.0,
2006      )),
2007      pdf_oxide::elements::PathContent::new(pdf_oxide::geometry::Rect::new(
2008        160.0, 280.0, 80.0, 40.0,
2009      )),
2010    ];
2011
2012    assert!(
2013      detect_vector_diagram_regions(&paths, 0.0, 0.0, 612.0, 792.0, &[], true)
2014        .is_empty()
2015    );
2016  }
2017
2018  #[test]
2019  fn ignores_unlabeled_vector_regions_without_ocr() {
2020    let paths = vec![
2021      pdf_oxide::elements::PathContent::new(pdf_oxide::geometry::Rect::new(
2022        100.0, 200.0, 80.0, 40.0,
2023      )),
2024      pdf_oxide::elements::PathContent::new(pdf_oxide::geometry::Rect::new(
2025        220.0, 200.0, 80.0, 40.0,
2026      )),
2027      pdf_oxide::elements::PathContent::new(pdf_oxide::geometry::Rect::new(
2028        160.0, 280.0, 80.0, 40.0,
2029      )),
2030    ];
2031    let text_rows = vec![VisualTextRow {
2032      top: 180.0,
2033      left: 100.0,
2034      text: "Figure 1. Test diagram".to_string(),
2035    }];
2036
2037    assert!(
2038      detect_vector_diagram_regions(
2039        &paths, 0.0, 0.0, 612.0, 792.0, &text_rows, false,
2040      )
2041      .is_empty()
2042    );
2043  }
2044
2045  #[test]
2046  fn keeps_vector_regions_with_native_overlay_text_without_ocr() {
2047    let paths = vec![
2048      pdf_oxide::elements::PathContent::new(pdf_oxide::geometry::Rect::new(
2049        100.0, 200.0, 80.0, 40.0,
2050      )),
2051      pdf_oxide::elements::PathContent::new(pdf_oxide::geometry::Rect::new(
2052        220.0, 200.0, 80.0, 40.0,
2053      )),
2054      pdf_oxide::elements::PathContent::new(pdf_oxide::geometry::Rect::new(
2055        160.0, 280.0, 80.0, 40.0,
2056      )),
2057    ];
2058    let text_rows = vec![
2059      VisualTextRow {
2060        top: 180.0,
2061        left: 100.0,
2062        text: "Figure 1. Test diagram".to_string(),
2063      },
2064      VisualTextRow {
2065        top: 220.0,
2066        left: 120.0,
2067        text: "Native label".to_string(),
2068      },
2069    ];
2070
2071    let regions = detect_vector_diagram_regions(
2072      &paths, 0.0, 0.0, 612.0, 792.0, &text_rows, false,
2073    );
2074
2075    assert_eq!(regions.len(), 1);
2076  }
2077
2078  #[test]
2079  fn vector_diagram_region_clamps_to_media_box_origin() {
2080    let paths = vec![
2081      pdf_oxide::elements::PathContent::new(pdf_oxide::geometry::Rect::new(
2082        110.0, 210.0, 80.0, 40.0,
2083      )),
2084      pdf_oxide::elements::PathContent::new(pdf_oxide::geometry::Rect::new(
2085        230.0, 210.0, 80.0, 40.0,
2086      )),
2087      pdf_oxide::elements::PathContent::new(pdf_oxide::geometry::Rect::new(
2088        170.0, 290.0, 80.0, 40.0,
2089      )),
2090    ];
2091
2092    let text_rows = vec![VisualTextRow {
2093      top: 206.0,
2094      left: 110.0,
2095      text: "Figure 1. Test diagram".to_string(),
2096    }];
2097    let regions = detect_vector_diagram_regions(
2098      &paths, 100.0, 200.0, 500.0, 500.0, &text_rows, true,
2099    );
2100
2101    assert_eq!(regions.len(), 1);
2102    assert!(regions[0].left >= 100.0);
2103    assert!(regions[0].bottom >= 200.0);
2104    assert!(regions[0].left <= 110.0);
2105    assert!(regions[0].bottom <= 210.0);
2106  }
2107
2108  #[test]
2109  fn vector_diagram_region_handles_negative_media_box_origin() {
2110    let paths = vec![
2111      pdf_oxide::elements::PathContent::new(pdf_oxide::geometry::Rect::new(
2112        -290.0, -190.0, 80.0, 40.0,
2113      )),
2114      pdf_oxide::elements::PathContent::new(pdf_oxide::geometry::Rect::new(
2115        -170.0, -190.0, 80.0, 40.0,
2116      )),
2117      pdf_oxide::elements::PathContent::new(pdf_oxide::geometry::Rect::new(
2118        -230.0, -110.0, 80.0, 40.0,
2119      )),
2120    ];
2121
2122    let text_rows = vec![VisualTextRow {
2123      top: -194.0,
2124      left: -290.0,
2125      text: "Figure 1. Test diagram".to_string(),
2126    }];
2127    let regions = detect_vector_diagram_regions(
2128      &paths, -300.0, -200.0, 500.0, 500.0, &text_rows, true,
2129    );
2130
2131    assert_eq!(regions.len(), 1);
2132    assert!(regions[0].left >= -300.0);
2133    assert!(regions[0].bottom >= -200.0);
2134    assert!(regions[0].width >= 200.0);
2135    assert!(regions[0].height >= 120.0);
2136  }
2137
2138  #[test]
2139  fn pdf_cell_mapping_accounts_for_media_box_origin() {
2140    assert_eq!(pdf_x_to_cells(100.0, 100.0, 500.0, 80), 0);
2141    assert_eq!(pdf_x_to_cells(350.0, 100.0, 500.0, 80), 40);
2142    assert_eq!(pdf_width_to_cells(125.0, 500.0, 80), 20);
2143  }
2144
2145  #[test]
2146  fn pdf_image_height_uses_display_bbox_aspect_ratio() {
2147    assert_eq!(pdf_image_height_rows(100.0, 50.0, 20), 10);
2148    assert_eq!(pdf_image_height_rows(100.0, 200.0, 20), 40);
2149    assert_eq!(pdf_image_height_rows(0.0, 200.0, 20), 1);
2150  }
2151
2152  #[cfg(feature = "pdf-ocr-bundled")]
2153  #[test]
2154  fn ocrs_images_when_page_has_no_native_text() {
2155    let region =
2156      PdfRegion { left: 0.0, bottom: 0.0, width: 100.0, height: 100.0 };
2157
2158    assert!(should_ocr_image_region(region, &[]));
2159  }
2160
2161  #[cfg(feature = "pdf-ocr-bundled")]
2162  #[test]
2163  fn ocrs_captioned_images_without_native_text() {
2164    let region =
2165      PdfRegion { left: 48.0, bottom: 300.0, width: 500.0, height: 200.0 };
2166    let native_rows = vec![
2167      VisualTextRow {
2168        top: 285.0,
2169        left: 48.0,
2170        text: "Figure 8. The lifecycle of the status of your files".to_string(),
2171      },
2172      VisualTextRow {
2173        top: 250.0,
2174        left: 48.0,
2175        text: "Checking the Status of Your Files".to_string(),
2176      },
2177    ];
2178
2179    assert!(should_ocr_image_region(region, &native_rows));
2180  }
2181
2182  #[cfg(feature = "pdf-ocr-bundled")]
2183  #[test]
2184  fn skips_uncaptioned_images_on_native_text_pages() {
2185    let region =
2186      PdfRegion { left: 48.0, bottom: 300.0, width: 500.0, height: 200.0 };
2187    let native_rows = vec![VisualTextRow {
2188      top: 250.0,
2189      left: 48.0,
2190      text: "Body text below an unrelated decorative image".to_string(),
2191    }];
2192
2193    assert!(!should_ocr_image_region(region, &native_rows));
2194  }
2195
2196  #[cfg(feature = "pdf-ocr-bundled")]
2197  #[test]
2198  fn skips_ocr_when_native_text_already_covers_region() {
2199    let region =
2200      PdfRegion { left: 48.0, bottom: 300.0, width: 500.0, height: 200.0 };
2201    let native_rows = vec![
2202      VisualTextRow {
2203        top: 400.0,
2204        left: 100.0,
2205        text: "Native label".to_string(),
2206      },
2207      VisualTextRow {
2208        top: 285.0,
2209        left: 48.0,
2210        text: "Figure 1. Native diagram".to_string(),
2211      },
2212    ];
2213
2214    assert!(!should_ocr_image_region(region, &native_rows));
2215  }
2216
2217  #[cfg(feature = "pdf-ocr-bundled")]
2218  fn generated_ocr_fixture(text: &str) -> image::DynamicImage {
2219    let scale = 12u32;
2220    let glyph_width = 5u32;
2221    let glyph_height = 7u32;
2222    let spacing = 2u32;
2223    let padding = 24u32;
2224    let width = padding * 2
2225      + text.chars().count() as u32 * (glyph_width + spacing) * scale;
2226    let height = padding * 2 + glyph_height * scale;
2227    let mut image = image::RgbaImage::from_pixel(
2228      width,
2229      height,
2230      image::Rgba([255, 255, 255, 255]),
2231    );
2232
2233    let mut x = padding;
2234    for ch in text.chars() {
2235      if ch == ' ' {
2236        x += (glyph_width + spacing) * scale;
2237        continue;
2238      }
2239      draw_glyph(&mut image, x, padding, scale, ch);
2240      x += (glyph_width + spacing) * scale;
2241    }
2242
2243    image::DynamicImage::ImageRgba8(image)
2244  }
2245
2246  #[cfg(feature = "pdf-ocr-bundled")]
2247  fn draw_glyph(
2248    image: &mut image::RgbaImage,
2249    x: u32,
2250    y: u32,
2251    scale: u32,
2252    ch: char,
2253  ) {
2254    let Some(pattern) = glyph_pattern(ch) else {
2255      return;
2256    };
2257    for (row, bits) in pattern.iter().enumerate() {
2258      for (col, bit) in bits.chars().enumerate() {
2259        if bit != '1' {
2260          continue;
2261        }
2262        for dy in 0..scale {
2263          for dx in 0..scale {
2264            image.put_pixel(
2265              x + col as u32 * scale + dx,
2266              y + row as u32 * scale + dy,
2267              image::Rgba([0, 0, 0, 255]),
2268            );
2269          }
2270        }
2271      }
2272    }
2273  }
2274
2275  #[cfg(feature = "pdf-ocr-bundled")]
2276  fn glyph_pattern(ch: char) -> Option<[&'static str; 7]> {
2277    match ch {
2278      'C' => {
2279        Some(["01111", "10000", "10000", "10000", "10000", "10000", "01111"])
2280      }
2281      'E' => {
2282        Some(["11111", "10000", "10000", "11110", "10000", "10000", "11111"])
2283      }
2284      'H' => {
2285        Some(["10001", "10001", "10001", "11111", "10001", "10001", "10001"])
2286      }
2287      'L' => {
2288        Some(["10000", "10000", "10000", "10000", "10000", "10000", "11111"])
2289      }
2290      'O' => {
2291        Some(["01110", "10001", "10001", "10001", "10001", "10001", "01110"])
2292      }
2293      'R' => {
2294        Some(["11110", "10001", "10001", "11110", "10100", "10010", "10001"])
2295      }
2296      _ => None,
2297    }
2298  }
2299}
cli_pdf_to_text/stream.rs

cli_pdf_to_text/
stream.rs