1use kurbo::{Affine, BezPath, Shape};
4use pdf_render::pdf_interpret::cmap::BfString;
5use pdf_render::pdf_interpret::font::Glyph;
6use pdf_render::pdf_interpret::{
7 BlendMode, ClipPath, Device, GlyphDrawMode, Image, Paint, PathDrawMode, SoftMask,
8};
9use std::cmp::Ordering;
10
/// Absolute floor for vertical tolerances: `draw_glyph` uses it as the minimum
/// "font size" in its same-line test, and `PageStats::from_spans` uses it as the
/// minimum band tolerance.
const BAND_Y_TOLERANCE: f64 = 5.0;
/// Fraction of the median font size used as the band tolerance in
/// `PageStats::from_spans` (before the `BAND_Y_TOLERANCE` floor is applied).
const BAND_Y_FRACTION: f64 = 0.30;
/// NOTE(review): not referenced in the visible part of this file; presumably a
/// multiple of the typical line spacing that marks a paragraph break — confirm.
const PARAGRAPH_BREAK_LINE_SPACING_MULTIPLIER: f64 = 1.8;

/// Magnitude of the accumulated `text_adjustment` offset at or above which the
/// adjustment counts as a space signal in `evaluate_space_consensus`.
const TJ_SPACE_THRESHOLD_UNITS: f32 = 100.0;
/// Confidence contributed by the text-adjustment signal.
const TJ_SIGNAL_WEIGHT: f64 = 0.95;
/// Confidence contributed by the geometric gap signal.
const GAP_SIGNAL_WEIGHT: f64 = 0.80;
/// Confidence contributed by the character-class heuristic (case / digit change).
const HEURISTIC_SIGNAL_WEIGHT: f64 = 0.60;
/// Total confidence required before a space is inserted between glyphs.
const SPACE_CONSENSUS_THRESHOLD: f64 = 0.75;
/// Gap threshold as a fraction of the cached median glyph width.
const GAP_TO_MEDIAN_CHAR_FRACTION: f64 = 0.30;
/// Gap threshold as a fraction of the font size, used while no median glyph
/// width is available yet.
const GAP_TO_FONT_SIZE_FALLBACK_FRACTION: f64 = 0.15;

/// Lower clamp for the threshold returned by `compute_adaptive_column_gap`.
const COLUMN_GAP_THRESHOLD_MIN: f64 = 10.0;
/// Upper clamp for the threshold returned by `compute_adaptive_column_gap`.
const COLUMN_GAP_THRESHOLD_MAX: f64 = 40.0;
/// Multiplier applied to the median gap when no clear break in the gap
/// distribution is found.
const COLUMN_GAP_MEDIAN_MULTIPLIER: f64 = 3.0;
/// Threshold returned by `compute_adaptive_column_gap` when no positive gaps exist.
const COLUMN_GAP_THRESHOLD_FALLBACK: f64 = 20.0;
/// NOTE(review): not referenced in the visible part of this file; presumably the
/// tolerance for matching gap positions across bands — confirm.
const COLUMN_GAP_MATCH_TOLERANCE: f64 = 12.0;
/// NOTE(review): not referenced in the visible part of this file; presumably the
/// minimum number of bands that must show a column gap — confirm.
const MIN_COLUMN_GAPPED_BANDS: usize = 3;
/// NOTE(review): not referenced in the visible part of this file; presumably the
/// minimum share of bands that must support a column boundary — confirm.
const MIN_COLUMN_GAP_SUPPORT: f64 = 0.80;
/// NOTE(review): not referenced in the visible part of this file; presumably a
/// density ratio used when validating column slices — confirm.
const MIN_DENSE_SLICE_RATIO: f64 = 0.35;
70
/// Where a span's width came from: a font advance metric or a font-size guess.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
pub enum WidthSource {
    /// Every glyph width in the span came from the font's advance width.
    Metric,
    /// At least one glyph width was a fallback estimate (also the default).
    #[default]
    Estimate,
}

impl WidthSource {
    /// Returns the variant name as a static string (`"Metric"` / `"Estimate"`).
    pub fn as_str(&self) -> &'static str {
        match *self {
            Self::Estimate => "Estimate",
            Self::Metric => "Metric",
        }
    }
}
93
/// Selects how much per-glyph geometry the extraction device collects.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum GeometryMode {
    /// Default mode: only advance-based character boxes are recorded.
    #[default]
    Basic,
    /// Additionally records per-glyph data so tight bounds can be computed
    /// when the device is consumed.
    RichGeometry,
}
103
/// Provenance of a glyph or span bounding box.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum BoundsSource {
    /// Default; the value every new span starts with.
    #[default]
    Advance,
    /// Derived from the glyph outline's bounding box mapped through the glyph transform.
    Tight,
    /// No outline bounding box was available; the box was built from the glyph
    /// origin, its advance width and the font size.
    Estimate,
}
115
/// Vertical font metrics copied from the font that produced a span.
///
/// NOTE(review): units and sign conventions are whatever the font backend
/// reports; this file does not rescale them — confirm against the backend.
#[derive(Debug, Clone, Copy, PartialEq, Default)]
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
pub struct FontMetrics {
    /// Font ascent.
    pub ascent: f64,
    /// Font descent.
    pub descent: f64,
    /// Cap height, when the font reports one (serialized as `capHeight`, omitted if absent).
    #[cfg_attr(
        feature = "serde",
        serde(rename = "capHeight", skip_serializing_if = "Option::is_none")
    )]
    pub cap_height: Option<f64>,
    /// x-height, when the font reports one (serialized as `xHeight`, omitted if absent).
    #[cfg_attr(
        feature = "serde",
        serde(rename = "xHeight", skip_serializing_if = "Option::is_none")
    )]
    pub x_height: Option<f64>,
}
141
/// A run of adjacent, identically styled glyphs merged into one piece of text.
#[derive(Debug, Clone, Default)]
pub struct TextSpan {
    /// Unicode text of the span, including any spaces inserted during merging.
    pub text: String,
    /// X translation of the first glyph's composed transform.
    pub x: f64,
    /// Y translation of the first glyph's composed transform.
    pub y: f64,
    /// Distance from `x` to the furthest glyph end seen so far.
    pub width: f64,
    /// Largest glyph font size merged into the span.
    pub height: f64,
    /// Font size derived from the first glyph's transform scale.
    pub font_size: f64,

    /// PostScript font name with any subset prefix removed; `None` when empty.
    pub font_name: Option<String>,
    /// Bold, from the font weight (>= 700) or name hints.
    pub is_bold: bool,
    /// Italic, from the font flag or name hints.
    pub is_italic: bool,
    /// Fill colour as RGBA8; `None` for pattern paints.
    pub color: Option<[u8; 4]>,

    /// `Estimate` as soon as any glyph in the span used an estimated width.
    pub width_source: WidthSource,
    /// One `[x, y, x + advance, y + font_size]` box per glyph, in draw order.
    pub char_bounds: Vec<[f64; 4]>,

    /// Geometry mode of the device that produced the span.
    #[doc(hidden)]
    pub geometry_mode: GeometryMode,
    /// Aggregate provenance of `tight_char_bounds`.
    #[doc(hidden)]
    pub bounds_source: BoundsSource,
    /// Per-glyph boxes filled in by `compute_tight_bounds` (rich geometry only).
    #[doc(hidden)]
    pub tight_char_bounds: Vec<[f64; 4]>,
    /// Per-glyph advance widths, filled alongside `tight_char_bounds`.
    #[doc(hidden)]
    pub glyph_advances: Vec<f64>,
    /// Per-glyph provenance, parallel to `tight_char_bounds`.
    #[doc(hidden)]
    pub glyph_bounds_sources: Vec<BoundsSource>,

    /// Affine coefficients of the first glyph's composed transform.
    pub transform: Option<[f64; 6]>,
    /// Numeric font weight clamped to 1..=1000, when the font data provides one.
    pub font_weight: Option<u16>,
    /// Serif flag from the font data, when available.
    pub is_serif: Option<bool>,
    /// Monospace flag from the font data, when available.
    pub is_monospace: Option<bool>,
    /// Render mode code: 0 = fill, 1 = stroke, 3 = invisible.
    pub render_mode: Option<u8>,

    /// Vertical metrics of the font, when available.
    pub font_metrics: Option<FontMetrics>,
}
222
223impl TextSpan {
224 fn right(&self) -> f64 {
227 self.x + self.width.max(self.estimated_width())
228 }
229
230 fn measured_right(&self) -> f64 {
232 self.x + self.width
233 }
234
235 fn estimated_width(&self) -> f64 {
236 let char_count = self.text.chars().count() as f64;
237 if char_count <= 0.0 {
238 self.font_size * 0.5
239 } else {
240 self.font_size * 0.5 * char_count
241 }
242 }
243}
244
245#[derive(Debug, Clone)]
247pub struct TextBlock {
248 pub spans: Vec<TextSpan>,
250}
251
252impl TextBlock {
253 pub fn text(&self) -> String {
259 if self.spans.is_empty() {
260 return String::new();
261 }
262 let mut result = self.spans[0].text.clone();
263 for pair in self.spans.windows(2) {
264 let prev = &pair[0];
265 let curr = &pair[1];
266 let expected_end = prev.measured_right();
267 let gap = curr.x - expected_end;
268 if gap <= prev.font_size * 0.12 {
269 if let Some(trimmed) = trim_overlapping_word_prefix(&prev.text, &curr.text) {
270 result.push_str(&trimmed);
271 continue;
272 }
273 }
274 if gap > prev.font_size * 0.25 {
275 result.push(' ');
276 }
277 result.push_str(&curr.text);
278 }
279 result
280 }
281}
282
283#[derive(Debug, Clone)]
284struct TextBand {
285 y: f64,
286 spans: Vec<TextSpan>,
287}
288
289impl TextBand {
290 fn new(span: TextSpan) -> Self {
291 Self {
292 y: span.y,
293 spans: vec![span],
294 }
295 }
296
297 fn sort_spans(&mut self) {
298 self.spans.sort_by(|a, b| {
299 a.x.partial_cmp(&b.x)
300 .unwrap_or(Ordering::Equal)
301 .then_with(|| b.y.partial_cmp(&a.y).unwrap_or(Ordering::Equal))
302 });
303 collapse_overprinted_spans(&mut self.spans);
304 }
305
306 fn row_block(&self) -> TextBlock {
307 let mut spans = self.spans.clone();
308 spans.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
309 TextBlock { spans }
310 }
311
312 fn left(&self) -> f64 {
313 self.spans
314 .iter()
315 .map(|span| span.x)
316 .fold(f64::INFINITY, f64::min)
317 }
318
319 fn right(&self) -> f64 {
320 self.spans
321 .iter()
322 .map(TextSpan::right)
323 .fold(f64::NEG_INFINITY, f64::max)
324 }
325
326 fn width(&self) -> f64 {
327 (self.right() - self.left()).max(0.0)
328 }
329
330 fn gap_midpoints(&self, column_gap_threshold: f64) -> Vec<f64> {
331 self.gaps(column_gap_threshold)
332 .into_iter()
333 .map(|gap| (gap.start + gap.end) * 0.5)
334 .collect()
335 }
336
337 fn gaps(&self, column_gap_threshold: f64) -> Vec<BandGap> {
338 if self.spans.len() < 2 {
339 return Vec::new();
340 }
341
342 let mut spans = self.spans.clone();
343 spans.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
344
345 let mut gaps = Vec::new();
346 let mut prev_right = spans[0].right();
347 for span in spans.iter().skip(1) {
348 let gap = span.x - prev_right;
349 if gap >= column_gap_threshold {
350 gaps.push(BandGap {
351 start: prev_right,
352 end: span.x,
353 });
354 }
355 prev_right = prev_right.max(span.right());
356 }
357
358 gaps
359 }
360
361 fn split_by_boundaries(&self, boundaries: &[f64]) -> Vec<Vec<TextSpan>> {
362 let mut columns = vec![Vec::new(); boundaries.len() + 1];
363 for span in &self.spans {
364 let center_x = span.x + span.width.max(span.estimated_width()) * 0.5;
365 let column_idx = boundaries
366 .iter()
367 .position(|boundary| center_x < *boundary)
368 .unwrap_or(boundaries.len());
369 columns[column_idx].push(span.clone());
370 }
371
372 for spans in &mut columns {
373 spans.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
374 }
375
376 columns
377 }
378
379 fn fits_single_column(
380 &self,
381 boundaries: &[f64],
382 region_left: f64,
383 region_right: f64,
384 ) -> Option<usize> {
385 let mut column_idx: Option<usize> = None;
386 for span in &self.spans {
387 let left = span.x;
388 let right = span.right();
389 if boundaries
390 .iter()
391 .any(|boundary| left < *boundary && right > *boundary)
392 {
393 return None;
394 }
395
396 let center_x = left + (right - left) * 0.5;
397 let idx = boundaries
398 .iter()
399 .position(|boundary| center_x < *boundary)
400 .unwrap_or(boundaries.len());
401 match column_idx {
402 Some(existing) if existing != idx => return None,
403 Some(_) => {}
404 None => column_idx = Some(idx),
405 }
406 }
407 let idx = column_idx?;
408 let mut edges = Vec::with_capacity(boundaries.len() + 2);
409 edges.push(region_left);
410 edges.extend_from_slice(boundaries);
411 edges.push(region_right);
412
413 let column_width = (edges[idx + 1] - edges[idx]).max(0.0);
414 if column_width <= 0.0 || self.width() > column_width * 0.8 {
415 return None;
416 }
417
418 Some(idx)
419 }
420}
421
/// A horizontal gap between spans within one band.
#[derive(Debug, Clone, Copy)]
struct BandGap {
    /// Right edge of the content to the left of the gap.
    start: f64,
    /// Left edge (`x`) of the span that closes the gap.
    end: f64,
}
427
/// Rendering device that ignores all drawing except glyphs and accumulates
/// them into [`TextSpan`]s.
#[doc(hidden)]
pub struct TextExtractionDevice {
    /// Spans collected so far, in draw order.
    spans: Vec<TextSpan>,
    /// Y position of the most recently drawn glyph.
    last_y: f64,
    /// End X (origin + advance) of the most recently drawn glyph.
    last_end_x: f64,
    /// Sum of `text_adjustment` amounts since the last glyph; reset on every glyph.
    pending_tj_offset: f32,
    /// Sampled glyph widths (capped at 4096 entries) used for the median.
    glyph_widths: Vec<f64>,
    /// Median of `glyph_widths`, refreshed every `MEDIAN_REFRESH` samples; 0.0 until then.
    cached_median_char_width: f64,
    /// Geometry mode chosen at construction.
    geometry_mode: GeometryMode,
    /// Per-glyph records consumed by `compute_tight_bounds` (rich geometry only).
    deferred_rich_glyphs: Vec<DeferredGlyph>,
}
473
/// Per-glyph data captured at draw time so tight bounds can be computed later.
#[derive(Clone)]
struct DeferredGlyph {
    /// Coefficients of the glyph's composed transform.
    coeffs: [f64; 6],
    /// Font size derived from the transform scale.
    font_size: f64,
    /// Advance width used for the glyph.
    glyph_width: f64,
    /// True when the transform is rotated or sheared (per the epsilon tests).
    needs_exact: bool,
    /// Glyph outline, kept only when `needs_exact` is set and the outline is non-degenerate.
    outline: Option<BezPath>,
    /// Bounding box of the outline in glyph space; `None` for Type3 or degenerate outlines.
    font_bbox: Option<kurbo::Rect>,
}
495
/// The cached median glyph width is recomputed each time the number of
/// sampled widths reaches a multiple of this value.
const MEDIAN_REFRESH: usize = 32;
497
498impl Default for TextExtractionDevice {
499 fn default() -> Self {
500 Self::new()
501 }
502}
503
504impl TextExtractionDevice {
505 pub fn new() -> Self {
507 Self::with_mode(GeometryMode::Basic)
508 }
509
510 pub fn with_mode(geometry_mode: GeometryMode) -> Self {
512 Self {
513 spans: Vec::new(),
514 last_y: f64::NEG_INFINITY,
515 last_end_x: f64::NEG_INFINITY,
516 pending_tj_offset: 0.0,
517 glyph_widths: Vec::new(),
518 cached_median_char_width: 0.0,
519 geometry_mode,
520 deferred_rich_glyphs: Vec::new(),
521 }
522 }
523
524 fn refresh_median_char_width(&mut self) {
527 if self.glyph_widths.is_empty() {
528 self.cached_median_char_width = 0.0;
529 return;
530 }
531 let mut sorted = self.glyph_widths.clone();
532 sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
533 self.cached_median_char_width = sorted[sorted.len() / 2];
534 }
535
536 fn evaluate_space_consensus(
539 &self,
540 gap: f64,
541 font_size: f64,
542 prev_text: &str,
543 next_text: &str,
544 ) -> bool {
545 let mut confidence = 0.0;
546
547 if self.pending_tj_offset.abs() >= TJ_SPACE_THRESHOLD_UNITS {
550 confidence += TJ_SIGNAL_WEIGHT;
551 }
552
553 let gap_reference = if self.cached_median_char_width > 0.0 {
557 self.cached_median_char_width * GAP_TO_MEDIAN_CHAR_FRACTION
558 } else {
559 font_size * GAP_TO_FONT_SIZE_FALLBACK_FRACTION
560 };
561 if gap > gap_reference {
562 confidence += GAP_SIGNAL_WEIGHT;
563 }
564
565 if let (Some(prev_last), Some(next_first)) =
569 (prev_text.chars().last(), next_text.chars().next())
570 {
571 let camel = prev_last.is_lowercase() && next_first.is_uppercase();
572 let digit_to_letter = prev_last.is_ascii_digit() && next_first.is_alphabetic();
573 let letter_to_digit = prev_last.is_alphabetic() && next_first.is_ascii_digit();
574 if camel || digit_to_letter || letter_to_digit {
575 confidence += HEURISTIC_SIGNAL_WEIGHT;
576 }
577 }
578
579 confidence >= SPACE_CONSENSUS_THRESHOLD
580 }
581
582 pub fn into_text(mut self) -> String {
584 if self.geometry_mode == GeometryMode::RichGeometry {
585 self.compute_tight_bounds();
586 }
587 let blocks = group_spans_into_blocks(self.spans);
588 let lines: Vec<String> = blocks.iter().map(|b| b.text()).collect();
589 let stitched = stitch_hyphenated_lines(&lines);
590 normalize_text_output(&stitched)
591 }
592
593 pub fn into_blocks(mut self) -> Vec<TextBlock> {
595 if self.geometry_mode == GeometryMode::RichGeometry {
596 self.compute_tight_bounds();
597 }
598 group_spans_into_blocks(self.spans)
599 }
600
601 #[allow(dead_code)]
603 pub(crate) fn into_spans(mut self) -> Vec<TextSpan> {
604 if self.geometry_mode == GeometryMode::RichGeometry {
605 self.compute_tight_bounds();
606 }
607 self.spans
608 }
609
610 fn record_deferred_glyph(
613 &mut self,
614 glyph: &Glyph<'_>,
615 composed: &Affine,
616 font_size: f64,
617 glyph_width: f64,
618 ) {
619 let coeffs = composed.as_coeffs();
620 let is_rotated = coeffs[1].abs() > ROTATION_EPSILON || coeffs[2].abs() > ROTATION_EPSILON;
621 let is_sheared = (coeffs[0] - coeffs[3]).abs() > SHEAR_EPSILON;
622 let needs_exact = is_rotated || is_sheared;
623
624 let (outline, font_bbox) = match glyph {
630 Glyph::Outline(o) => {
631 let path = o.outline();
632 let bb = path.bounding_box();
633 if bb.width() <= 0.0 && bb.height() <= 0.0 {
634 (None, None)
635 } else if needs_exact {
636 (Some(path), Some(bb))
637 } else {
638 (None, Some(bb))
639 }
640 }
641 Glyph::Type3(_) => (None, None),
642 };
643
644 self.deferred_rich_glyphs.push(DeferredGlyph {
645 coeffs,
646 font_size,
647 glyph_width,
648 needs_exact,
649 outline,
650 font_bbox,
651 });
652 }
653
654 fn compute_tight_bounds(&mut self) {
657 let deferred = std::mem::take(&mut self.deferred_rich_glyphs);
658 if deferred.is_empty() {
659 return;
660 }
661
662 let mut idx = 0usize;
663
664 for span in &mut self.spans {
665 let n = span.char_bounds.len();
666 if n == 0 {
667 continue;
668 }
669
670 let mut span_bounds_source = BoundsSource::Advance;
671
672 for _gi in 0..n {
673 if idx >= deferred.len() {
674 break;
675 }
676 let dg = &deferred[idx];
677 idx += 1;
678
679 let composed = Affine::new(dg.coeffs);
680 let (tight_bound, source) = if let Some(font_bbox) = dg.font_bbox {
681 if dg.needs_exact {
687 let page_bbox = match dg.outline {
692 Some(ref path) => (composed * path.clone()).bounding_box(),
693 None => {
694 let raw = [font_bbox.x0, font_bbox.y0, font_bbox.x1, font_bbox.y1];
695 let b = transform_bbox_corners(&raw, &composed);
696 kurbo::Rect::new(b[0], b[1], b[2], b[3])
697 }
698 };
699 (
700 [page_bbox.x0, page_bbox.y0, page_bbox.x1, page_bbox.y1],
701 BoundsSource::Tight,
702 )
703 } else {
704 let raw = [font_bbox.x0, font_bbox.y0, font_bbox.x1, font_bbox.y1];
707 (transform_bbox_corners(&raw, &composed), BoundsSource::Tight)
708 }
709 } else {
710 let x = dg.coeffs[4];
714 let y = dg.coeffs[5];
715 (
716 [x, y, x + dg.glyph_width, y + dg.font_size],
717 BoundsSource::Estimate,
718 )
719 };
720
721 span.tight_char_bounds.push(tight_bound);
722 span.glyph_advances.push(dg.glyph_width);
723 span.glyph_bounds_sources.push(source);
724
725 if source == BoundsSource::Tight && span_bounds_source != BoundsSource::Estimate {
726 span_bounds_source = BoundsSource::Tight;
727 } else if source == BoundsSource::Estimate {
728 span_bounds_source = BoundsSource::Estimate;
729 }
730 }
731
732 span.bounds_source = span_bounds_source;
733 }
734 }
735}
736
/// Device implementation that records glyphs and ignores every other drawing
/// operation.
impl Device<'_> for TextExtractionDevice {
    // Non-text operations carry no information for text extraction.
    fn set_soft_mask(&mut self, _: Option<SoftMask<'_>>) {}
    fn set_blend_mode(&mut self, _: BlendMode) {}
    fn draw_path(&mut self, _: &BezPath, _: Affine, _: &Paint<'_>, _: &PathDrawMode) {}
    fn push_clip_path(&mut self, _: &ClipPath) {}
    fn push_transparency_group(&mut self, _: f32, _: Option<SoftMask<'_>>, _: BlendMode) {}
    fn draw_image(&mut self, _: Image<'_, '_>, _: Affine) {}
    fn pop_clip_path(&mut self) {}
    fn pop_transparency_group(&mut self) {}

    /// Records one glyph: either appends it to the last span (same line,
    /// small gap, identical style) or starts a new span.
    ///
    /// Glyphs without a Unicode mapping are dropped entirely.
    fn draw_glyph(
        &mut self,
        glyph: &Glyph<'_>,
        transform: Affine,
        glyph_transform: Affine,
        paint: &Paint<'_>,
        draw_mode: &GlyphDrawMode,
    ) {
        let text = match glyph.as_unicode() {
            Some(BfString::Char(c)) => c.to_string(),
            Some(BfString::String(s)) => s,
            None => return,
        };

        // Position is the translation of the composed transform; the font
        // size is the length of its first column scaled by 1000.
        // NOTE(review): the 1000 factor presumably reflects a 1/1000-em glyph
        // space — confirm against the interpreter.
        let composed = transform * glyph_transform;
        let coeffs = composed.as_coeffs();
        let x = coeffs[4];
        let y = coeffs[5];
        let glyph_scale = (coeffs[0].powi(2) + coeffs[1].powi(2)).sqrt().abs();
        let font_size = glyph_scale * 1000.0;

        let (glyph_width, glyph_ws) = glyph_width_and_source(glyph, font_size);
        let glyph_end_x = x + glyph_width;
        let glyph_bound = [x, y, glyph_end_x, y + font_size];

        let style = derive_glyph_style(glyph);
        let color = paint_to_rgba(paint);

        // One deferred record per glyph keeps the queue aligned with
        // `char_bounds` for `compute_tight_bounds`.
        if self.geometry_mode == GeometryMode::RichGeometry {
            self.record_deferred_glyph(glyph, &composed, font_size, glyph_width);
        }

        // Sample at most 4096 widths; refresh the median every MEDIAN_REFRESH samples.
        if self.glyph_widths.len() < 4096 {
            self.glyph_widths.push(glyph_width);
            if self.glyph_widths.len().is_multiple_of(MEDIAN_REFRESH) {
                self.refresh_median_char_width();
            }
        }

        // Adjacent = same baseline (within 35% of the font size, floored by
        // BAND_Y_TOLERANCE) and a gap between -25% and +50% of the font size.
        let same_line = (y - self.last_y).abs() <= font_size.max(BAND_Y_TOLERANCE) * 0.35;
        let gap = x - self.last_end_x;
        let adjacent = same_line && gap >= -font_size * 0.25 && gap < font_size * 0.5;

        // Spans never mix font name, bold/italic flags or colour.
        let style_matches = self
            .spans
            .last()
            .map(|last| {
                last.font_name == style.font_name
                    && last.is_bold == style.is_bold
                    && last.is_italic == style.is_italic
                    && last.color == color
            })
            .unwrap_or(false);

        if adjacent && !self.spans.is_empty() && style_matches {
            // Decide on the space before taking the mutable borrow, because
            // the consensus check needs `&self`.
            let want_space = {
                let last = self.spans.last().expect("checked non-empty");
                !last.text.ends_with(' ')
                    && !text.starts_with(' ')
                    && self.evaluate_space_consensus(gap, font_size, &last.text, &text)
            };
            let last = self.spans.last_mut().expect("checked non-empty");
            if want_space {
                last.text.push(' ');
            }
            last.text.push_str(&text);
            last.width = last.width.max(glyph_end_x - last.x);
            last.height = last.height.max(font_size);
            last.char_bounds.push(glyph_bound);
            // A single estimated glyph downgrades the whole span.
            if glyph_ws == WidthSource::Estimate {
                last.width_source = WidthSource::Estimate;
            }
            self.last_y = y;
            self.last_end_x = glyph_end_x;
            // The pending adjustment has been consumed by this glyph.
            self.pending_tj_offset = 0.0;
            return;
        }

        self.last_y = y;
        self.last_end_x = glyph_end_x;
        self.pending_tj_offset = 0.0;

        let span = TextSpan {
            text,
            x,
            y,
            width: glyph_width,
            height: font_size,
            font_size,
            font_name: style.font_name,
            is_bold: style.is_bold,
            is_italic: style.is_italic,
            color,
            width_source: glyph_ws,
            char_bounds: vec![glyph_bound],
            transform: Some(coeffs),
            font_weight: style.font_weight,
            is_serif: style.is_serif,
            is_monospace: style.is_monospace,
            render_mode: Some(render_mode_from_draw_mode(draw_mode)),
            font_metrics: style.font_metrics,
            geometry_mode: self.geometry_mode,
            bounds_source: BoundsSource::Advance,
            tight_char_bounds: Vec::new(),
            glyph_advances: Vec::new(),
            glyph_bounds_sources: Vec::new(),
        };

        self.spans.push(span);
    }

    /// Accumulates a text-positioning adjustment until the next glyph is drawn.
    fn text_adjustment(&mut self, amount: f32) {
        self.pending_tj_offset += amount;
    }
}
887
/// Style attributes derived from a glyph's font; the default value (all
/// `None` / `false`) is used for Type3 glyphs.
#[derive(Debug, Default, Clone)]
struct GlyphStyle {
    /// PostScript name without subset prefix; `None` when empty.
    font_name: Option<String>,
    /// Bold, from weight >= 700 or name hints.
    is_bold: bool,
    /// Italic, from the font flag or name hints.
    is_italic: bool,
    /// Font weight clamped to 1..=1000, when font data is available.
    font_weight: Option<u16>,
    /// Serif flag, when font data is available.
    is_serif: Option<bool>,
    /// Monospace flag, when font data is available.
    is_monospace: Option<bool>,
    /// Vertical metrics, when the font provides them.
    font_metrics: Option<FontMetrics>,
}
903
/// Removes a PDF font-subset tag (`ABCDEF+`) from the front of a font name.
///
/// A subset tag is exactly six uppercase ASCII letters followed by `+`.
/// Names whose text before the first `+` does not have that shape — for
/// example `Roboto+Condensed` — are returned unchanged, so a legitimate `+`
/// in a font name is never mistaken for a subset marker.
fn strip_subset_prefix(name: &str) -> &str {
    match name.split_once('+') {
        Some((tag, rest)) if tag.len() == 6 && tag.bytes().all(|b| b.is_ascii_uppercase()) => rest,
        _ => name,
    }
}
911
/// Guesses `(bold, italic)` from substrings of a font name, ignoring ASCII case.
fn name_style_hints(name: &str) -> (bool, bool) {
    const BOLD_MARKERS: [&str; 5] = ["bold", "demi", "semibold", "heavy", "black"];
    const ITALIC_MARKERS: [&str; 3] = ["italic", "oblique", "slant"];

    let folded = name.to_ascii_lowercase();
    let mentions_any = |markers: &[&str]| markers.iter().any(|marker| folded.contains(*marker));

    (mentions_any(&BOLD_MARKERS), mentions_any(&ITALIC_MARKERS))
}
925
926fn derive_glyph_style(glyph: &Glyph<'_>) -> GlyphStyle {
927 match glyph {
928 Glyph::Outline(outline) => {
929 if let Some(data) = outline.font_data() {
930 let raw = data.postscript_name.as_deref().unwrap_or("");
931 let name = strip_subset_prefix(raw).to_string();
932 let weight_bold = data.weight.is_some_and(|w| w >= 700);
933 let (name_bold, name_italic) = name_style_hints(&name);
934 GlyphStyle {
935 font_name: if name.is_empty() { None } else { Some(name) },
936 is_bold: weight_bold || name_bold,
937 is_italic: data.is_italic || name_italic,
938 font_weight: data.weight.map(|w| w.clamp(1, 1000) as u16),
939 is_serif: Some(data.is_serif),
940 is_monospace: Some(data.is_monospace),
941 font_metrics: match (data.ascent, data.descent) {
942 (Some(ascent), Some(descent)) => Some(FontMetrics {
943 ascent,
944 descent,
945 cap_height: data.cap_height,
946 x_height: data.x_height,
947 }),
948 _ => None,
949 },
950 }
951 } else {
952 let raw = outline.postscript_name().unwrap_or_default();
958 let name = strip_subset_prefix(&raw).to_string();
959 let (name_bold, name_italic) = name_style_hints(&name);
960 let metrics = outline.font_metrics().map(|(a, d, c, x)| FontMetrics {
961 ascent: a,
962 descent: d,
963 cap_height: c,
964 x_height: x,
965 });
966 GlyphStyle {
967 font_name: if name.is_empty() { None } else { Some(name) },
968 is_bold: name_bold,
969 is_italic: name_italic,
970 font_weight: None,
972 is_serif: None,
973 is_monospace: None,
974 font_metrics: metrics,
975 }
976 }
977 }
978 Glyph::Type3(_) => GlyphStyle::default(),
979 }
980}
981
982fn paint_to_rgba(paint: &Paint<'_>) -> Option<[u8; 4]> {
983 match paint {
984 Paint::Color(c) => Some(c.to_rgba().to_rgba8()),
985 Paint::Pattern(_) => None,
986 }
987}
988
989fn render_mode_from_draw_mode(mode: &GlyphDrawMode) -> u8 {
998 match mode {
999 GlyphDrawMode::Fill => 0,
1000 GlyphDrawMode::Stroke(_) => 1,
1001 GlyphDrawMode::Invisible => 3,
1002 }
1003}
1004
#[cfg(test)]
mod render_mode_tests {
    use super::render_mode_from_draw_mode;
    use pdf_render::pdf_interpret::{GlyphDrawMode, StrokeProps};

    /// Each draw mode maps to its expected code, and no code outside
    /// {0, 1, 3} is ever produced.
    #[test]
    fn render_mode_is_only_zero_one_three() {
        let expectations = [
            (GlyphDrawMode::Fill, 0u8),
            (GlyphDrawMode::Stroke(StrokeProps::default()), 1u8),
            (GlyphDrawMode::Invisible, 3u8),
        ];
        for (mode, expected) in &expectations {
            assert_eq!(render_mode_from_draw_mode(mode), *expected);
        }
        for (mode, _) in &expectations {
            assert!(matches!(render_mode_from_draw_mode(mode), 0 | 1 | 3));
        }
    }
}
1028
1029fn glyph_width_and_source(glyph: &Glyph<'_>, font_size: f64) -> (f64, WidthSource) {
1036 match glyph {
1037 Glyph::Outline(outline) => {
1038 if let Some(w) = outline.advance_width() {
1039 let advance = (w as f64 / 1000.0 * font_size).max(font_size * 0.25);
1040 (advance, WidthSource::Metric)
1041 } else {
1042 (font_size * 0.5, WidthSource::Estimate)
1043 }
1044 }
1045 Glyph::Type3(_) => (font_size * 0.5, WidthSource::Estimate),
1046 }
1047}
1048
/// Off-diagonal transform coefficients larger than this (in magnitude) mark a
/// glyph as rotated in `record_deferred_glyph`.
const ROTATION_EPSILON: f64 = 1e-6;
/// A difference between the two diagonal transform coefficients larger than
/// this marks a glyph as sheared in `record_deferred_glyph`.
const SHEAR_EPSILON: f64 = 1e-3;
1054
1055fn transform_bbox_corners(local_bbox: &[f64; 4], affine: &Affine) -> [f64; 4] {
1058 use kurbo::Point;
1059 let corners = [
1060 *affine * Point::new(local_bbox[0], local_bbox[1]),
1061 *affine * Point::new(local_bbox[2], local_bbox[1]),
1062 *affine * Point::new(local_bbox[2], local_bbox[3]),
1063 *affine * Point::new(local_bbox[0], local_bbox[3]),
1064 ];
1065 let x0 = corners.iter().map(|p| p.x).fold(f64::INFINITY, f64::min);
1066 let y0 = corners.iter().map(|p| p.y).fold(f64::INFINITY, f64::min);
1067 let x1 = corners
1068 .iter()
1069 .map(|p| p.x)
1070 .fold(f64::NEG_INFINITY, f64::max);
1071 let y1 = corners
1072 .iter()
1073 .map(|p| p.y)
1074 .fold(f64::NEG_INFINITY, f64::max);
1075 [x0, y0, x1, y1]
1076}
1077
1078fn collapse_overprinted_spans(spans: &mut Vec<TextSpan>) {
1084 if spans.len() < 2 {
1085 return;
1086 }
1087
1088 let mut deduped: Vec<TextSpan> = Vec::with_capacity(spans.len());
1089 for span in spans.drain(..) {
1090 if let Some(last) = deduped.last_mut() {
1091 if spans_are_overprint_duplicates(last, &span) {
1092 let choose_incoming = span.text.chars().count() > last.text.chars().count()
1093 || (span.text.chars().count() == last.text.chars().count()
1094 && span.width > last.width);
1095 let preferred_text = if choose_incoming {
1096 span.text.clone()
1097 } else {
1098 last.text.clone()
1099 };
1100 let left = last.x.min(span.x);
1101 let right = last.right().max(span.right());
1102 last.x = left;
1103 last.y = (last.y + span.y) * 0.5;
1104 last.width = (right - left).max(last.width).max(span.width);
1105 last.height = last.height.max(span.height);
1106 last.font_size = last.font_size.max(span.font_size);
1107 last.text = preferred_text;
1108 continue;
1109 }
1110 }
1111
1112 deduped.push(span);
1113 }
1114
1115 *spans = deduped;
1116}
1117
1118fn spans_are_overprint_duplicates(lhs: &TextSpan, rhs: &TextSpan) -> bool {
1119 let lhs_text = lhs.text.trim();
1120 let rhs_text = rhs.text.trim();
1121 if lhs_text.is_empty() || rhs_text.is_empty() {
1122 return false;
1123 }
1124
1125 let same_baseline = (lhs.y - rhs.y).abs() <= lhs.font_size.max(rhs.font_size) * 0.12;
1126 if !same_baseline {
1127 return false;
1128 }
1129
1130 let lhs_left = lhs.x;
1131 let lhs_right = lhs.right();
1132 let rhs_left = rhs.x;
1133 let rhs_right = rhs.right();
1134 let overlap = (lhs_right.min(rhs_right) - lhs_left.max(rhs_left)).max(0.0);
1135 let min_width = (lhs_right - lhs_left).min(rhs_right - rhs_left).max(1.0);
1136 let heavily_overlaps = overlap / min_width >= 0.85;
1137 if !heavily_overlaps {
1138 return false;
1139 }
1140
1141 lhs_text == rhs_text || lhs_text.starts_with(rhs_text) || rhs_text.starts_with(lhs_text)
1142}
1143
/// Removes from `curr` a leading word that repeats the trailing word of `prev`.
///
/// Both strings are trimmed at the seam first. The longest overlap of at
/// least four alphanumeric characters that sits on word boundaries on both
/// sides wins; the return value is what remains of (trimmed) `curr` after the
/// overlap. `None` means no such overlap exists.
fn trim_overlapping_word_prefix(prev: &str, curr: &str) -> Option<String> {
    let tail: Vec<char> = prev.trim_end().chars().collect();
    let head: Vec<char> = curr.trim_start().chars().collect();
    let longest = tail.len().min(head.len());

    (4..=longest).rev().find_map(|overlap| {
        let split = tail.len() - overlap;
        let (shared, rest) = head.split_at(overlap);
        let is_word_char = |ch: &char| ch.is_alphanumeric();

        let repeats = tail[split..] == *shared && shared.iter().all(is_word_char);
        let starts_word = split == 0 || !is_word_char(&tail[split - 1]);
        let ends_word = rest.first().map_or(true, |ch| !is_word_char(ch));

        (repeats && starts_word && ends_word).then(|| rest.iter().collect())
    })
}
1170
1171fn compute_adaptive_column_gap(bands: &[TextBand]) -> f64 {
1178 let mut all_gaps: Vec<f64> = Vec::new();
1179
1180 for band in bands {
1181 if band.spans.len() < 2 {
1182 continue;
1183 }
1184 let mut sorted = band.spans.clone();
1185 sorted.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
1186 let mut prev_right = sorted[0].right();
1187 for span in sorted.iter().skip(1) {
1188 let gap = span.x - prev_right;
1189 if gap > 0.0 {
1190 all_gaps.push(gap);
1191 }
1192 prev_right = prev_right.max(span.right());
1193 }
1194 }
1195
1196 if all_gaps.is_empty() {
1197 return COLUMN_GAP_THRESHOLD_FALLBACK;
1198 }
1199
1200 all_gaps.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
1201
1202 let min_gap = all_gaps[0];
1203
1204 if min_gap > COLUMN_GAP_THRESHOLD_MIN {
1209 return (min_gap * 0.75).clamp(COLUMN_GAP_THRESHOLD_MIN, COLUMN_GAP_THRESHOLD_MAX);
1210 }
1211
1212 let mut best_break_threshold = 0.0f64;
1215 let mut best_ratio = 1.5f64; for pair in all_gaps.windows(2) {
1217 if pair[0] > 0.5 {
1218 let ratio = pair[1] / pair[0];
1219 if ratio > best_ratio {
1220 best_ratio = ratio;
1221 best_break_threshold = (pair[0] + pair[1]) * 0.5;
1222 }
1223 }
1224 }
1225
1226 if best_break_threshold > 0.0 {
1227 return best_break_threshold.clamp(COLUMN_GAP_THRESHOLD_MIN, COLUMN_GAP_THRESHOLD_MAX);
1228 }
1229
1230 let mid = all_gaps.len() / 2;
1232 let median = if all_gaps.len().is_multiple_of(2) {
1233 (all_gaps[mid - 1] + all_gaps[mid]) * 0.5
1234 } else {
1235 all_gaps[mid]
1236 };
1237
1238 (median * COLUMN_GAP_MEDIAN_MULTIPLIER)
1239 .clamp(COLUMN_GAP_THRESHOLD_MIN, COLUMN_GAP_THRESHOLD_MAX)
1240}
1241
/// Summary statistics over all spans handed to layout analysis.
#[derive(Debug, Clone, Copy)]
struct PageStats {
    /// Median span font size (12.0 when there are no spans).
    median_font_size: f64,
    /// Median of per-span `width / character count` (6.0 when there are no spans).
    #[allow(dead_code)]
    median_char_width: f64,
    /// Typical distance between neighbouring baselines; despite the name this
    /// is the first-quartile spacing, and 0.0 when fewer than two bands exist.
    median_line_spacing: f64,
}
1266
1267impl PageStats {
1268 fn from_spans(spans: &[TextSpan]) -> Self {
1269 if spans.is_empty() {
1270 return Self {
1271 median_font_size: 12.0,
1272 median_char_width: 6.0,
1273 median_line_spacing: 0.0,
1274 };
1275 }
1276
1277 let mut sizes: Vec<f64> = spans.iter().map(|s| s.font_size).collect();
1279 sizes.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
1280 let median_font_size = sizes[sizes.len() / 2];
1281
1282 let mut char_widths: Vec<f64> = spans
1284 .iter()
1285 .filter_map(|s| {
1286 let chars = s.text.chars().count();
1287 if chars > 0 && s.width > 0.0 {
1288 Some(s.width / chars as f64)
1289 } else {
1290 None
1291 }
1292 })
1293 .collect();
1294 let median_char_width = if char_widths.is_empty() {
1295 median_font_size * 0.5
1296 } else {
1297 char_widths.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
1298 char_widths[char_widths.len() / 2]
1299 };
1300
1301 let band_tolerance = (median_font_size * BAND_Y_FRACTION).max(BAND_Y_TOLERANCE);
1304 let mut ys: Vec<f64> = spans.iter().map(|s| s.y).collect();
1305 ys.sort_by(|a, b| b.partial_cmp(a).unwrap_or(Ordering::Equal));
1306 let mut band_ys: Vec<f64> = Vec::new();
1307 for y in ys {
1308 if band_ys
1309 .last()
1310 .map(|prev: &f64| (prev - y).abs() > band_tolerance)
1311 .unwrap_or(true)
1312 {
1313 band_ys.push(y);
1314 }
1315 }
1316 let median_line_spacing = if band_ys.len() < 2 {
1325 0.0
1326 } else {
1327 let mut spacings: Vec<f64> = band_ys
1328 .windows(2)
1329 .map(|pair| (pair[0] - pair[1]).abs())
1330 .collect();
1331 spacings.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
1332 let q1_index = spacings.len() / 4;
1333 spacings[q1_index]
1334 };
1335
1336 Self {
1337 median_font_size,
1338 median_char_width,
1339 median_line_spacing,
1340 }
1341 }
1342}
1343
/// Recursion depth at which `xy_cut_recursive` stops cutting and falls back
/// to band-based grouping.
const XY_CUT_MAX_DEPTH: usize = 12;
/// Fraction of the region width that feeds the vertical-cut gap floor in
/// `try_vertical_cut`.
const XY_CUT_VERTICAL_GAP_REGION_FRACTION: f64 = 0.04;
/// Absolute minimum gap width for a vertical cut.
const XY_CUT_VERTICAL_GAP_FLOOR: f64 = 10.0;
/// NOTE(review): not referenced in the visible part of this file; presumably
/// the font-size multiple required for a horizontal cut — confirm.
const XY_CUT_HORIZONTAL_GAP_FONT_MULTIPLIER: f64 = 1.8;
/// A vertical cut is only attempted when the region has at least twice this
/// many spans.
const XY_CUT_MIN_SPANS_PER_COLUMN: usize = 2;
/// NOTE(review): not referenced in the visible part of this file; presumably a
/// minimum characters-per-band density for accepting columns — confirm.
const XY_CUT_MIN_CHARS_PER_BAND: f64 = 8.0;
1366
1367fn group_spans_into_blocks(spans: Vec<TextSpan>) -> Vec<TextBlock> {
1371 if spans.is_empty() {
1372 return Vec::new();
1373 }
1374 let stats = PageStats::from_spans(&spans);
1375 xy_cut_recursive(spans, 0, &stats)
1376}
1377
1378fn xy_cut_recursive(spans: Vec<TextSpan>, depth: usize, stats: &PageStats) -> Vec<TextBlock> {
1379 if spans.is_empty() {
1380 return Vec::new();
1381 }
1382 if depth >= XY_CUT_MAX_DEPTH {
1383 return band_based_blocks(spans, stats);
1384 }
1385
1386 let vcut = try_vertical_cut(&spans, stats);
1393 let hcut = try_horizontal_cut(&spans, stats);
1394
1395 let (chosen, _) = match (vcut, hcut) {
1396 (Some((v_groups, v_gap)), Some((h_groups, h_gap))) => {
1397 if v_gap >= h_gap {
1398 (Some(v_groups), v_gap)
1399 } else {
1400 (Some(h_groups), h_gap)
1401 }
1402 }
1403 (Some((v_groups, v_gap)), None) => (Some(v_groups), v_gap),
1404 (None, Some((h_groups, h_gap))) => (Some(h_groups), h_gap),
1405 (None, None) => (None, 0.0),
1406 };
1407
1408 if let Some(groups) = chosen {
1409 let mut out = Vec::new();
1410 for group in groups {
1411 out.extend(xy_cut_recursive(group, depth + 1, stats));
1412 }
1413 return out;
1414 }
1415
1416 band_based_blocks(spans, stats)
1417}
1418
/// Leaf grouping for regions that are not cut any further; delegates to
/// `group_spans_into_blocks_legacy_with_stats`.
fn band_based_blocks(spans: Vec<TextSpan>, stats: &PageStats) -> Vec<TextBlock> {
    group_spans_into_blocks_legacy_with_stats(spans, stats)
}
1429
1430#[allow(dead_code)]
1433fn median_font_size(spans: &[TextSpan]) -> f64 {
1434 if spans.is_empty() {
1435 return 12.0;
1436 }
1437 let mut sizes: Vec<f64> = spans.iter().map(|s| s.font_size).collect();
1438 sizes.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
1439 sizes[sizes.len() / 2]
1440}
1441
1442fn try_vertical_cut(spans: &[TextSpan], stats: &PageStats) -> Option<(Vec<Vec<TextSpan>>, f64)> {
1459 if spans.len() < 2 * XY_CUT_MIN_SPANS_PER_COLUMN {
1460 return None;
1461 }
1462
1463 let region_left = spans.iter().map(|s| s.x).fold(f64::INFINITY, f64::min);
1464 let region_right = spans
1465 .iter()
1466 .map(TextSpan::right)
1467 .fold(f64::NEG_INFINITY, f64::max);
1468 let region_width = region_right - region_left;
1469 if region_width <= 0.0 {
1470 return None;
1471 }
1472
1473 let bands = group_spans_into_bands_with_stats(spans.to_vec(), stats);
1482 let adaptive = compute_adaptive_column_gap(&bands);
1483 let floor = stats
1484 .median_font_size
1485 .max(region_width * XY_CUT_VERTICAL_GAP_REGION_FRACTION)
1486 .max(XY_CUT_VERTICAL_GAP_FLOOR);
1487 let min_gap = adaptive.min(floor).max(XY_CUT_VERTICAL_GAP_FLOOR);
1488
1489 let mut intervals: Vec<(f64, f64)> = spans
1492 .iter()
1493 .map(|s| (s.x, s.right().max(s.x + 0.001)))
1494 .collect();
1495 intervals.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(Ordering::Equal));
1496
1497 let mut cursor = intervals[0].1;
1498 let mut best_gap: Option<(f64, f64)> = None; for (left, right) in intervals.iter().skip(1) {
1500 if *left > cursor {
1501 let gap = *left - cursor;
1502 if gap >= min_gap {
1503 match best_gap {
1504 Some((best, _)) if best >= gap => {}
1505 _ => {
1506 let cut_x = (cursor + *left) * 0.5;
1507 best_gap = Some((gap, cut_x));
1508 }
1509 }
1510 }
1511 }
1512 cursor = cursor.max(*right);
1513 }
1514
1515 let (gap_size, cut_x) = best_gap?;
1516
1517 let mut left_group = Vec::new();
1520 let mut right_group = Vec::new();
1521 for span in spans {
1522 let midpoint = span.x + (span.right() - span.x) * 0.5;
1523 if midpoint < cut_x {
1524 left_group.push(span.clone());
1525 } else {
1526 right_group.push(span.clone());
1527 }
1528 }
1529
1530 if !columns_are_dense(&left_group, &right_group, stats) {
1531 return None;
1532 }
1533 if !columns_are_band_aligned(spans, cut_x, region_left, region_right, stats) {
1534 return None;
1535 }
1536
1537 Some((vec![left_group, right_group], gap_size))
1538}
1539
1540fn columns_are_band_aligned(
1546 spans: &[TextSpan],
1547 cut_x: f64,
1548 region_left: f64,
1549 region_right: f64,
1550 stats: &PageStats,
1551) -> bool {
1552 let left_width = (cut_x - region_left).max(1.0);
1553 let right_width = (region_right - cut_x).max(1.0);
1554
1555 const MAX_SINGLE_SIDE_FRACTION: f64 = 0.70;
1560
1561 let bands = group_spans_into_bands_with_stats(spans.to_vec(), stats);
1562 for band in &bands {
1563 let mut has_left = false;
1564 let mut has_right = false;
1565 for span in &band.spans {
1566 let midpoint = span.x + (span.right() - span.x) * 0.5;
1567 if midpoint < cut_x {
1568 has_left = true;
1569 } else {
1570 has_right = true;
1571 }
1572 }
1573 if has_left && has_right {
1574 continue; }
1576 let band_width = band.width();
1577 if has_left && band_width > left_width * MAX_SINGLE_SIDE_FRACTION {
1578 return false;
1579 }
1580 if has_right && band_width > right_width * MAX_SINGLE_SIDE_FRACTION {
1581 return false;
1582 }
1583 }
1584 true
1585}
1586
1587fn columns_are_dense(left: &[TextSpan], right: &[TextSpan], stats: &PageStats) -> bool {
1592 for col in [left, right] {
1593 if col.len() < XY_CUT_MIN_SPANS_PER_COLUMN {
1594 return false;
1595 }
1596 let bands = group_spans_into_bands_with_stats(col.to_vec(), stats);
1597 if bands.is_empty() {
1598 return false;
1599 }
1600 let total_chars: usize = col.iter().map(|s| s.text.chars().count()).sum();
1601 let chars_per_band = total_chars as f64 / bands.len() as f64;
1602 if chars_per_band < XY_CUT_MIN_CHARS_PER_BAND {
1603 return false;
1604 }
1605 }
1606 true
1607}
1608
1609fn try_horizontal_cut(spans: &[TextSpan], stats: &PageStats) -> Option<(Vec<Vec<TextSpan>>, f64)> {
1613 if spans.len() < 2 {
1614 return None;
1615 }
1616 let mut sorted = spans.to_vec();
1618 sorted.sort_by(|a, b| {
1619 b.y.partial_cmp(&a.y)
1620 .unwrap_or(Ordering::Equal)
1621 .then_with(|| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal))
1622 });
1623
1624 let min_gap = if stats.median_line_spacing > 0.0 {
1630 stats.median_line_spacing * PARAGRAPH_BREAK_LINE_SPACING_MULTIPLIER
1631 } else {
1632 stats.median_font_size * XY_CUT_HORIZONTAL_GAP_FONT_MULTIPLIER
1633 };
1634
1635 let mut best: Option<(f64, f64)> = None; let tolerance = stats.median_font_size * BAND_Y_FRACTION;
1638 let mut band_bottom = sorted[0].y;
1639
1640 for span in sorted.iter().skip(1) {
1641 if (band_bottom - span.y).abs() <= tolerance {
1642 band_bottom = band_bottom.min(span.y);
1643 continue;
1644 }
1645 let gap = band_bottom - span.y;
1646 if gap >= min_gap {
1647 let cut_y = (band_bottom + span.y) * 0.5;
1648 match best {
1649 Some((best_gap, _)) if best_gap >= gap => {}
1650 _ => best = Some((gap, cut_y)),
1651 }
1652 }
1653 band_bottom = span.y;
1654 }
1655
1656 let (gap_size, cut_y) = best?;
1657
1658 let mut top_group = Vec::new();
1659 let mut bottom_group = Vec::new();
1660 for span in spans {
1661 if span.y > cut_y {
1662 top_group.push(span.clone());
1663 } else {
1664 bottom_group.push(span.clone());
1665 }
1666 }
1667 if top_group.is_empty() || bottom_group.is_empty() {
1668 return None;
1669 }
1670 Some((vec![top_group, bottom_group], gap_size))
1671}
1672
1673#[allow(dead_code)]
1677fn group_spans_into_blocks_legacy(spans: Vec<TextSpan>) -> Vec<TextBlock> {
1678 let bands = group_spans_into_bands(spans);
1679 group_spans_into_blocks_legacy_from_bands(bands)
1680}
1681
1682fn group_spans_into_blocks_legacy_with_stats(
1683 spans: Vec<TextSpan>,
1684 stats: &PageStats,
1685) -> Vec<TextBlock> {
1686 let bands = group_spans_into_bands_with_stats(spans, stats);
1687 group_spans_into_blocks_legacy_from_bands(bands)
1688}
1689
1690fn group_spans_into_blocks_legacy_from_bands(bands: Vec<TextBand>) -> Vec<TextBlock> {
1691 if bands.is_empty() {
1692 return Vec::new();
1693 }
1694
1695 let column_gap_threshold = compute_adaptive_column_gap(&bands);
1696
1697 let mut blocks = Vec::new();
1698 let mut idx = 0;
1699
1700 while idx < bands.len() {
1701 let gap_midpoints = bands[idx].gap_midpoints(column_gap_threshold);
1702 if gap_midpoints.is_empty() {
1703 blocks.push(bands[idx].row_block());
1704 idx += 1;
1705 continue;
1706 }
1707
1708 let mut boundaries = gap_midpoints.clone();
1709 let mut band_indices = vec![idx];
1710 let mut gapped_band_count = 1usize;
1711 let mut region_left = bands[idx].left();
1712 let mut region_right = bands[idx].right();
1713 let mut next_idx = idx + 1;
1714
1715 while next_idx < bands.len() {
1716 let next_band = &bands[next_idx];
1717 let next_gap_midpoints = next_band.gap_midpoints(column_gap_threshold);
1718 if next_gap_midpoints.is_empty() {
1719 if next_band
1720 .fits_single_column(&boundaries, region_left, region_right)
1721 .is_some()
1722 {
1723 band_indices.push(next_idx);
1724 next_idx += 1;
1725 continue;
1726 }
1727 break;
1728 }
1729
1730 if !boundaries_match(&boundaries, &next_gap_midpoints, column_gap_threshold) {
1731 break;
1732 }
1733
1734 update_boundaries(&mut boundaries, &next_gap_midpoints, gapped_band_count);
1735 gapped_band_count += 1;
1736 band_indices.push(next_idx);
1737 region_left = region_left.min(next_band.left());
1738 region_right = region_right.max(next_band.right());
1739 next_idx += 1;
1740 }
1741
1742 if region_is_columnar(&bands, &band_indices, &boundaries, gapped_band_count) {
1743 append_column_region_blocks(&bands, &band_indices, &boundaries, &mut blocks);
1744 idx = next_idx;
1745 } else {
1746 blocks.push(bands[idx].row_block());
1747 idx += 1;
1748 }
1749 }
1750
1751 blocks
1752}
1753
/// Groups `spans` into bands using page statistics computed from the spans
/// themselves.
///
/// Convenience wrapper around [`group_spans_into_bands_with_stats`] for
/// callers that have no precomputed [`PageStats`].
fn group_spans_into_bands(spans: Vec<TextSpan>) -> Vec<TextBand> {
    // The stats must be computed before `spans` is moved into the callee.
    let stats = PageStats::from_spans(&spans);
    group_spans_into_bands_with_stats(spans, &stats)
}
1761
1762fn group_spans_into_bands_with_stats(mut spans: Vec<TextSpan>, stats: &PageStats) -> Vec<TextBand> {
1763 if spans.is_empty() {
1764 return Vec::new();
1765 }
1766
1767 spans.sort_by(|a, b| {
1768 b.y.partial_cmp(&a.y)
1769 .unwrap_or(Ordering::Equal)
1770 .then_with(|| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal))
1771 });
1772
1773 let page_tolerance = (stats.median_font_size * BAND_Y_FRACTION).max(BAND_Y_TOLERANCE);
1779
1780 let mut bands: Vec<TextBand> = Vec::new();
1781
1782 for span in spans {
1783 let tolerance = (span.height * BAND_Y_FRACTION)
1784 .max(page_tolerance)
1785 .max(BAND_Y_TOLERANCE);
1786 if let Some(band) = bands
1787 .iter_mut()
1788 .find(|band| (band.y - span.y).abs() <= tolerance)
1789 {
1790 let span_count = band.spans.len() as f64;
1791 band.y = (band.y * span_count + span.y) / (span_count + 1.0);
1792 band.spans.push(span);
1793 } else {
1794 bands.push(TextBand::new(span));
1795 }
1796 }
1797
1798 for band in &mut bands {
1799 band.sort_spans();
1800 }
1801
1802 bands.sort_by(|a, b| b.y.partial_cmp(&a.y).unwrap_or(Ordering::Equal));
1803 bands
1804}
1805
1806fn boundaries_match(boundaries: &[f64], gap_midpoints: &[f64], column_gap_threshold: f64) -> bool {
1807 let tolerance = (column_gap_threshold * 1.5).clamp(COLUMN_GAP_MATCH_TOLERANCE, 60.0);
1808 boundaries.len() == gap_midpoints.len()
1809 && boundaries
1810 .iter()
1811 .zip(gap_midpoints)
1812 .all(|(lhs, rhs)| (lhs - rhs).abs() <= tolerance)
1813}
1814
/// Folds one more band's gap midpoints into the running-average column
/// boundaries, in place.
///
/// `seen_gapped_bands` is the number of bands already averaged into
/// `boundaries`; each boundary becomes
/// `(boundary * seen + midpoint) / (seen + 1)`. If the slices differ in
/// length, only the common prefix is updated.
fn update_boundaries(boundaries: &mut [f64], gap_midpoints: &[f64], seen_gapped_bands: usize) {
    let prior_weight = seen_gapped_bands as f64;
    let new_weight = prior_weight + 1.0;
    boundaries
        .iter_mut()
        .zip(gap_midpoints)
        .for_each(|(running, &observed)| {
            *running = (*running * prior_weight + observed) / new_weight;
        });
}
1821
1822fn region_is_columnar(
1823 bands: &[TextBand],
1824 band_indices: &[usize],
1825 boundaries: &[f64],
1826 gapped_band_count: usize,
1827) -> bool {
1828 if boundaries.is_empty()
1829 || gapped_band_count < MIN_COLUMN_GAPPED_BANDS
1830 || band_indices.is_empty()
1831 || (gapped_band_count as f64 / band_indices.len() as f64) < MIN_COLUMN_GAP_SUPPORT
1832 {
1833 return false;
1834 }
1835
1836 let mut non_empty_slices = 0usize;
1837 let mut dense_slices = 0usize;
1838 let mut slices_per_column = vec![0usize; boundaries.len() + 1];
1839
1840 for &band_idx in band_indices {
1841 let slices = bands[band_idx].split_by_boundaries(boundaries);
1842 for (column_idx, slice) in slices.iter().enumerate() {
1843 if slice.is_empty() {
1844 continue;
1845 }
1846
1847 non_empty_slices += 1;
1848 slices_per_column[column_idx] += 1;
1849
1850 let char_count = slice
1851 .iter()
1852 .map(|span| span.text.chars().count())
1853 .sum::<usize>();
1854 if slice.len() >= 2 || char_count >= 8 {
1855 dense_slices += 1;
1856 }
1857 }
1858 }
1859
1860 if non_empty_slices < boundaries.len() + 2 {
1861 return false;
1862 }
1863
1864 if slices_per_column.contains(&0) {
1865 return false;
1866 }
1867
1868 (dense_slices as f64 / non_empty_slices as f64) >= MIN_DENSE_SLICE_RATIO
1869}
1870
1871fn append_column_region_blocks(
1872 bands: &[TextBand],
1873 band_indices: &[usize],
1874 boundaries: &[f64],
1875 blocks: &mut Vec<TextBlock>,
1876) {
1877 let column_count = boundaries.len() + 1;
1878 let mut column_bands = vec![Vec::<TextSpan>::new(); column_count];
1879
1880 for &band_idx in band_indices {
1881 let slices = bands[band_idx].split_by_boundaries(boundaries);
1882 for (column_idx, slice) in slices.into_iter().enumerate() {
1883 if slice.is_empty() {
1884 continue;
1885 }
1886 column_bands[column_idx].push(TextSpan::default());
1887 let marker_idx = column_bands[column_idx].len() - 1;
1888 column_bands[column_idx][marker_idx] = TextSpan {
1889 x: f64::NEG_INFINITY,
1890 y: bands[band_idx].y,
1891 ..TextSpan::default()
1892 };
1893 column_bands[column_idx].extend(slice);
1894 }
1895 }
1896
1897 for spans in column_bands {
1898 let mut current: Vec<TextSpan> = Vec::new();
1899 for span in spans {
1900 if span.x == f64::NEG_INFINITY {
1901 if !current.is_empty() {
1902 current.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
1903 blocks.push(TextBlock {
1904 spans: std::mem::take(&mut current),
1905 });
1906 }
1907 continue;
1908 }
1909 current.push(span);
1910 }
1911 if !current.is_empty() {
1912 current.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
1913 blocks.push(TextBlock { spans: current });
1914 }
1915 }
1916}
1917
1918fn stitch_hyphenated_lines(lines: &[String]) -> String {
1933 let mut out = String::new();
1934 for (idx, line) in lines.iter().enumerate() {
1935 if idx == 0 {
1936 out.push_str(line);
1937 continue;
1938 }
1939
1940 let next_trimmed = line.trim_start();
1941
1942 let should_merge = is_hyphen_wrap_candidate(&out, next_trimmed);
1944
1945 if should_merge {
1946 out.pop(); out.push_str(next_trimmed);
1948 } else {
1949 out.push('\n');
1950 out.push_str(line);
1951 }
1952 }
1953 out
1954}
1955
/// Decides whether a trailing `-` in `accumulated` is a line-wrap hyphen that
/// should be removed when `next_trimmed` is appended.
///
/// All of the following must hold:
/// - `accumulated` ends with `'-'`,
/// - the hyphen is directly preceded by at least three alphabetic characters,
/// - `next_trimmed` starts with at least three lowercase letters.
///
/// Both checks are Unicode-aware, so continuations containing non-ASCII
/// lowercase letters (for example `unver-` / `ändert`) are recognised just
/// like ASCII ones. Uppercase continuations, digits, bullets and short
/// fragments are rejected.
fn is_hyphen_wrap_candidate(accumulated: &str, next_trimmed: &str) -> bool {
    /// Minimum number of letters required on each side of the hyphen.
    const MIN_FRAGMENT_LETTERS: usize = 3;

    let Some(stem) = accumulated.strip_suffix('-') else {
        return false;
    };

    // Letters immediately before the hyphen; counting stops at the first
    // non-letter and never needs to go past the minimum.
    let stem_letters = stem
        .chars()
        .rev()
        .take_while(|c| c.is_alphabetic())
        .take(MIN_FRAGMENT_LETTERS)
        .count();
    if stem_letters < MIN_FRAGMENT_LETTERS {
        return false;
    }

    // Lowercase letters at the start of the continuation.
    let continuation_letters = next_trimmed
        .chars()
        .take_while(|c| c.is_lowercase())
        .take(MIN_FRAGMENT_LETTERS)
        .count();
    continuation_letters >= MIN_FRAGMENT_LETTERS
}
1998
/// Normalizes extracted text for output.
///
/// - Trailing whitespace is removed from every line, except that form feeds
///   (`'\x0C'`) are kept: trimming stops at a form feed, so a form feed at
///   the end of a line or alone on a line survives.
/// - Trailing empty lines are dropped.
/// - Runs of empty lines are capped at two.
/// - Non-empty output always ends with exactly one `'\n'`.
///
/// Input that is empty or consists only of non-form-feed whitespace yields
/// an empty string.
pub(crate) fn normalize_text_output(text: &str) -> String {
    const FORM_FEED: char = '\x0C';
    const MAX_CONSECUTIVE_EMPTY: u32 = 2;

    if text.is_empty() {
        return String::new();
    }

    // `str::trim_end` would also strip form feeds (U+000C is whitespace),
    // so trim with an explicit predicate that spares them.
    let mut lines: Vec<&str> = text
        .split('\n')
        .map(|line| line.trim_end_matches(|c: char| c.is_whitespace() && c != FORM_FEED))
        .collect();

    while lines.last().is_some_and(|line| line.is_empty()) {
        lines.pop();
    }
    if lines.is_empty() {
        return String::new();
    }

    let mut result = String::with_capacity(text.len() + 1);
    let mut consecutive_empty = 0u32;
    let last_idx = lines.len() - 1;

    for (i, line) in lines.iter().enumerate() {
        if line.is_empty() {
            consecutive_empty += 1;
            if consecutive_empty <= MAX_CONSECUTIVE_EMPTY {
                result.push('\n');
            }
        } else {
            // Text lines and form-feed-only lines are emitted alike.
            consecutive_empty = 0;
            result.push_str(line);
            if i < last_idx {
                result.push('\n');
            }
        }
    }

    if !result.is_empty() && !result.ends_with('\n') {
        result.push('\n');
    }

    result
}
2060
2061#[cfg(test)]
2062mod tests {
2063 use super::*;
2064
    /// Test fixture: builds a [`TextSpan`] with the given text, origin and
    /// width, a `height` and `font_size` of `12.0`, and defaults elsewhere.
    fn span(text: &str, x: f64, y: f64, width: f64) -> TextSpan {
        TextSpan {
            text: text.into(),
            x,
            y,
            width,
            height: 12.0,
            font_size: 12.0,
            ..TextSpan::default()
        }
    }
2076
    /// Test helper: runs `group_spans_into_blocks` on `spans` and returns the
    /// text of each resulting block, in output order.
    fn block_texts(spans: Vec<TextSpan>) -> Vec<String> {
        group_spans_into_blocks(spans)
            .into_iter()
            .map(|block| block.text())
            .collect()
    }
2083
    /// A freshly constructed device that received no glyphs yields empty text.
    #[test]
    fn empty_device_produces_empty_text() {
        let dev = TextExtractionDevice::new();
        assert!(dev.into_text().is_empty());
    }
2089
    /// Three stacked lines at the same x come out as three blocks, top to
    /// bottom.
    #[test]
    fn single_column_stays_row_major() {
        let texts = block_texts(vec![
            span("Single Column Line 1", 40.0, 700.0, 140.0),
            span("Single Column Line 2", 40.0, 684.0, 140.0),
            span("Single Column Line 3", 40.0, 668.0, 140.0),
        ]);

        assert_eq!(
            texts,
            vec![
                "Single Column Line 1",
                "Single Column Line 2",
                "Single Column Line 3",
            ]
        );
    }
2107
    /// A header, a three-line two-column body and a footer: the body must be
    /// read left column first, then right column, between header and footer.
    #[test]
    fn two_column_region_reads_column_major() {
        let texts = block_texts(vec![
            span("Header", 200.0, 740.0, 80.0),
            span("Left column line one", 40.0, 700.0, 115.0),
            span("Right column line one", 320.0, 700.0, 120.0),
            span("Left column line two", 40.0, 684.0, 115.0),
            span("Right column line two", 320.0, 684.0, 120.0),
            span("Left column line three", 40.0, 668.0, 125.0),
            span("Right column line three", 320.0, 668.0, 130.0),
            span("Footer", 200.0, 620.0, 80.0),
        ]);

        assert_eq!(
            texts,
            vec![
                "Header",
                "Left column line one",
                "Left column line two",
                "Left column line three",
                "Right column line one",
                "Right column line two",
                "Right column line three",
                "Footer",
            ]
        );
    }
2135
    /// Single-column intro and outro paragraphs around a two-column body:
    /// intro first, then the left column, then the right column, then outro.
    #[test]
    fn mixed_single_and_multi_column_regions_preserve_shared_bands() {
        let texts = block_texts(vec![
            span("Intro paragraph", 40.0, 740.0, 180.0),
            span("L1 words here", 40.0, 700.0, 110.0),
            span("R1 words here", 320.0, 700.0, 110.0),
            span("L2 words here", 40.0, 684.0, 110.0),
            span("R2 words here", 320.0, 684.0, 110.0),
            span("L3 words here", 40.0, 668.0, 110.0),
            span("R3 words here", 320.0, 668.0, 110.0),
            span("Outro paragraph", 40.0, 620.0, 180.0),
        ]);

        assert_eq!(
            texts,
            vec![
                "Intro paragraph",
                "L1 words here",
                "L2 words here",
                "L3 words here",
                "R1 words here",
                "R2 words here",
                "R3 words here",
                "Outro paragraph",
            ]
        );
    }
2163
    /// Short table cells separated by a wide gap must not be treated as
    /// columns: each row is emitted as one block with its cells joined.
    #[test]
    fn short_table_like_rows_fall_back_to_row_major() {
        let texts = block_texts(vec![
            span("Name", 40.0, 700.0, 30.0),
            span("Age", 320.0, 700.0, 20.0),
            span("Alice", 40.0, 684.0, 35.0),
            span("30", 320.0, 684.0, 15.0),
            span("Bob", 40.0, 668.0, 24.0),
            span("25", 320.0, 668.0, 15.0),
        ]);

        assert_eq!(texts, vec!["Name Age", "Alice 30", "Bob 25"]);
    }
2177
    /// Three columns of three lines each are read column by column, left to
    /// right.
    #[test]
    fn three_column_regions_are_supported() {
        let texts = block_texts(vec![
            span("Column one line one", 40.0, 700.0, 105.0),
            span("Column two line one", 220.0, 700.0, 105.0),
            span("Column three line one", 400.0, 700.0, 120.0),
            span("Column one line two", 40.0, 684.0, 105.0),
            span("Column two line two", 220.0, 684.0, 105.0),
            span("Column three line two", 400.0, 684.0, 120.0),
            span("Column one line three", 40.0, 668.0, 120.0),
            span("Column two line three", 220.0, 668.0, 120.0),
            span("Column three line three", 400.0, 668.0, 135.0),
        ]);

        assert_eq!(
            texts,
            vec![
                "Column one line one",
                "Column one line two",
                "Column one line three",
                "Column two line one",
                "Column two line two",
                "Column two line three",
                "Column three line one",
                "Column three line two",
                "Column three line three",
            ]
        );
    }
2207
    /// Two spans on the same line with a visible gap between them (A ends at
    /// x = 6, B starts at x = 20) are joined with a single space.
    #[test]
    fn text_block_concatenation_spaced() {
        let block = TextBlock {
            spans: vec![span("A", 0.0, 0.0, 6.0), span("B", 20.0, 0.0, 6.0)],
        };
        assert_eq!(block.text(), "A B");
    }
2215
    /// With one span per band there are no intra-band gaps to measure, so the
    /// threshold must equal `COLUMN_GAP_THRESHOLD_FALLBACK`.
    #[test]
    fn adaptive_column_gap_fallback_for_no_gaps() {
        let bands = vec![
            TextBand::new(span("Hello", 40.0, 700.0, 80.0)),
            TextBand::new(span("World", 40.0, 684.0, 80.0)),
        ];
        let threshold = compute_adaptive_column_gap(&bands);
        assert!((threshold - COLUMN_GAP_THRESHOLD_FALLBACK).abs() < 0.01);
    }
2226
    /// Three bands whose words are all 4 units apart must give a threshold of
    /// about 12.
    #[test]
    fn adaptive_column_gap_uses_median() {
        let mut bands = Vec::new();
        for y in [700.0, 684.0, 668.0] {
            let mut band = TextBand::new(span("word1", 40.0, y, 30.0));
            // word1 ends at x = 70, so this leaves a gap of 4.
            band.spans.push(span("word2", 74.0, y, 30.0));
            // word2 ends at x = 104, so this leaves a gap of 4 as well.
            band.spans.push(span("word3", 108.0, y, 30.0));
            bands.push(band);
        }
        let threshold = compute_adaptive_column_gap(&bands);
        // NOTE(review): presumably median gap (4) x COLUMN_GAP_MEDIAN_MULTIPLIER
        // (3.0) = 12 - confirm against `compute_adaptive_column_gap`.
        assert!(
            (10.0..=14.0).contains(&threshold),
            "expected ~12, got {threshold}"
        );
    }
2244
    /// Very small word gaps (2 units) must not push the threshold below
    /// `COLUMN_GAP_THRESHOLD_MIN`.
    #[test]
    fn adaptive_column_gap_clamps_to_min() {
        let mut bands = Vec::new();
        for y in [700.0, 684.0, 668.0, 652.0] {
            let mut band = TextBand::new(span("abc", 0.0, y, 18.0));
            // "abc" ends at x = 18, so this leaves a gap of 2.
            band.spans.push(span("def", 20.0, y, 18.0));
            bands.push(band);
        }
        let threshold = compute_adaptive_column_gap(&bands);
        assert!(
            (threshold - COLUMN_GAP_THRESHOLD_MIN).abs() < 0.01,
            "expected {COLUMN_GAP_THRESHOLD_MIN}, got {threshold}"
        );
    }
2261
    /// When the only measured gap is large (50 units), the threshold must be
    /// 37.5, i.e. 0.75 x 50 as stated in the assertion message.
    #[test]
    fn adaptive_column_gap_all_large_gaps_uses_fraction_of_min() {
        let mut band = TextBand::new(span("Left", 0.0, 700.0, 30.0));
        // "Left" ends at x = 30, so this leaves a gap of 50.
        band.spans.push(span("Right", 80.0, 700.0, 30.0));
        let bands = vec![band];
        let threshold = compute_adaptive_column_gap(&bands);
        assert!(
            (threshold - 37.5).abs() < 0.01,
            "expected 37.5 (0.75×50), got {threshold}"
        );
    }
2274
    /// Trailing spaces are removed from every line.
    #[test]
    fn normalize_trims_trailing_whitespace_per_line() {
        assert_eq!(
            normalize_text_output("hello   \nworld  \n"),
            "hello\nworld\n"
        );
    }
2282
    /// Four consecutive empty lines are reduced to two.
    #[test]
    fn normalize_collapses_excess_newlines() {
        assert_eq!(
            normalize_text_output("hello\n\n\n\n\nworld\n"),
            "hello\n\n\nworld\n"
        );
    }
2291
    /// A single empty line between paragraphs is left untouched.
    #[test]
    fn normalize_preserves_double_newline() {
        assert_eq!(
            normalize_text_output("paragraph one\n\nparagraph two\n"),
            "paragraph one\n\nparagraph two\n"
        );
    }
2299
    /// A form feed at the start of a line survives normalization.
    #[test]
    fn normalize_preserves_form_feed() {
        assert_eq!(
            normalize_text_output("page1\n\n\x0Cpage2\n"),
            "page1\n\n\x0Cpage2\n"
        );
    }
2307
    /// Output that lacks a final newline gets one appended.
    #[test]
    fn normalize_adds_trailing_newline() {
        assert_eq!(normalize_text_output("hello"), "hello\n");
    }
2312
    /// Empty input stays empty; no newline is added.
    #[test]
    fn normalize_empty_input() {
        assert_eq!(normalize_text_output(""), "");
    }
2317
    /// Input made only of spaces and newlines normalizes to the empty string.
    #[test]
    fn normalize_only_whitespace() {
        assert_eq!(normalize_text_output("   \n  \n"), "");
    }
2322
    /// A word hyphenated across a line break is re-joined without the hyphen.
    #[test]
    fn hyphen_stitch_joins_wrapped_word() {
        let lines = vec!["the aver-".into(), "age rainfall".into()];
        assert_eq!(stitch_hyphenated_lines(&lines), "the average rainfall");
    }
2330
    /// Leading whitespace on the continuation line is dropped when stitching.
    #[test]
    fn hyphen_stitch_handles_leading_whitespace() {
        let lines = vec!["pre-".into(), "  dict the outcome".into()];
        assert_eq!(stitch_hyphenated_lines(&lines), "predict the outcome");
    }
2337
    /// A continuation starting with an uppercase letter is not stitched.
    #[test]
    fn hyphen_stitch_capital_continuation_not_stitched() {
        let lines = vec!["Section three-".into(), "Summary here".into()];
        assert_eq!(
            stitch_hyphenated_lines(&lines),
            "Section three-\nSummary here"
        );
    }
2346
    /// A lone dash on its own line (no letters before it) is kept as-is.
    #[test]
    fn hyphen_stitch_bullet_dash_not_stitched() {
        let lines = vec!["Items:".into(), "-".into(), "milk".into()];
        assert_eq!(stitch_hyphenated_lines(&lines), "Items:\n-\nmilk");
    }
2353
    /// A hyphen preceded by digits rather than letters is not stitched.
    #[test]
    fn hyphen_stitch_numeric_range_not_stitched() {
        let lines = vec!["page 42-".into(), "seventy".into()];
        assert_eq!(stitch_hyphenated_lines(&lines), "page 42-\nseventy");
    }
2360
    /// A prefix with fewer than three letters before the hyphen is not
    /// stitched.
    #[test]
    fn hyphen_stitch_short_prefix_not_stitched() {
        let lines = vec!["re-".into(), "organize".into()];
        assert_eq!(stitch_hyphenated_lines(&lines), "re-\norganize");
    }
2367
    /// A continuation whose first word has fewer than three lowercase letters
    /// is not stitched.
    #[test]
    fn hyphen_stitch_short_continuation_not_stitched() {
        let lines = vec!["counter-".into(), "an example".into()];
        assert_eq!(stitch_hyphenated_lines(&lines), "counter-\nan example");
    }
2374
    /// A hyphen in the middle of a line is never touched.
    #[test]
    fn hyphen_stitch_compound_word_midline_preserved() {
        let lines = vec!["real-time system".into()];
        assert_eq!(stitch_hyphenated_lines(&lines), "real-time system");
    }
2382
    /// A single line is returned unchanged.
    #[test]
    fn hyphen_stitch_single_line_unchanged() {
        let lines = vec!["only line".into()];
        assert_eq!(stitch_hyphenated_lines(&lines), "only line");
    }
2388
    /// No lines in, empty string out.
    #[test]
    fn hyphen_stitch_empty_input() {
        let lines: Vec<String> = vec![];
        assert_eq!(stitch_hyphenated_lines(&lines), "");
    }
2394
    /// Test helper: builds a device whose cached median character width is
    /// `median`, by recording `MEDIAN_REFRESH` identical glyph widths and
    /// refreshing the cache. Panics if the cache does not end up at `median`.
    fn make_device_with_median(median: f64) -> TextExtractionDevice {
        let mut dev = TextExtractionDevice::new();
        for _ in 0..MEDIAN_REFRESH {
            dev.glyph_widths.push(median);
        }
        dev.refresh_median_char_width();
        assert!((dev.cached_median_char_width - median).abs() < 1e-9);
        dev
    }
2407
    /// A pending TJ offset of 250 is enough to insert a space even though the
    /// first argument (0.5) is small.
    #[test]
    fn consensus_inserts_space_on_strong_tj_offset_alone() {
        let mut dev = make_device_with_median(6.0);
        dev.pending_tj_offset = 250.0;
        assert!(dev.evaluate_space_consensus(0.5, 12.0, "Hello", "World"));
    }
2416
    /// With no TJ offset set, a first argument of 2.5 against a median
    /// character width of 6.0 inserts a space.
    #[test]
    fn consensus_inserts_space_on_geometric_gap_alone() {
        let dev = make_device_with_median(6.0);
        assert!(dev.evaluate_space_consensus(2.5, 12.0, "hello", "world"));
    }
2425
    /// A first argument of 0.5 with no other signal must not insert a space.
    #[test]
    fn consensus_no_space_on_kerning_gap() {
        let dev = make_device_with_median(6.0);
        assert!(!dev.evaluate_space_consensus(0.5, 12.0, "fi", "lm"));
    }
2434
    /// A lowercase-to-uppercase text transition together with a first
    /// argument of 2.5 inserts a space.
    #[test]
    fn consensus_inserts_space_on_camel_case_plus_gap() {
        let dev = make_device_with_median(6.0);
        assert!(dev.evaluate_space_consensus(2.5, 12.0, "helloWorld", "Inc"));
    }
2443
    /// A digit-to-letter text transition together with a first argument of
    /// 2.5 inserts a space.
    #[test]
    fn consensus_inserts_space_on_digit_letter_transition_with_gap() {
        let dev = make_device_with_median(6.0);
        assert!(dev.evaluate_space_consensus(2.5, 12.0, "123", "abc"));
    }
2449
    /// A lowercase-to-uppercase text transition with a first argument of only
    /// 0.5 must not insert a space.
    #[test]
    fn consensus_heuristic_alone_is_insufficient() {
        let dev = make_device_with_median(6.0);
        assert!(!dev.evaluate_space_consensus(0.5, 12.0, "camel", "Case"));
    }
2459
    /// On a fresh device with no glyph widths recorded, 1.9 inserts a space
    /// and 1.5 does not.
    #[test]
    fn consensus_falls_back_to_font_size_when_no_median() {
        let dev = TextExtractionDevice::new();
        // NOTE(review): presumably the cut-off is
        // 12.0 x GAP_TO_FONT_SIZE_FALLBACK_FRACTION (0.15) = 1.8 - confirm
        // against `evaluate_space_consensus`.
        assert!(dev.evaluate_space_consensus(1.9, 12.0, "a", "b"));
        assert!(!dev.evaluate_space_consensus(1.5, 12.0, "a", "b"));
    }
2469
    /// A pending TJ offset of 50 (below `TJ_SPACE_THRESHOLD_UNITS`, which is
    /// 100) does not insert a space on its own.
    #[test]
    fn consensus_ignores_tiny_tj_offsets() {
        let mut dev = make_device_with_median(6.0);
        dev.pending_tj_offset = 50.0;
        assert!(!dev.evaluate_space_consensus(0.5, 12.0, "Hello", "World"));
    }
2477
    /// A pending TJ offset of -250 inserts a space just like +250 does.
    #[test]
    fn consensus_accepts_negative_tj_offsets() {
        let mut dev = make_device_with_median(6.0);
        dev.pending_tj_offset = -250.0;
        assert!(dev.evaluate_space_consensus(0.5, 12.0, "Hello", "World"));
    }
2486
    /// Successive `text_adjustment` calls add up in `pending_tj_offset`
    /// (120 + 140 = 260).
    #[test]
    fn text_adjustment_accumulates_until_glyph() {
        let mut dev = TextExtractionDevice::new();
        dev.text_adjustment(120.0);
        dev.text_adjustment(140.0);
        assert!((dev.pending_tj_offset - 260.0).abs() < 1e-6);
    }
2494
    /// Headline, two-column body and footer: the headline must come first,
    /// the footer last, and the last left-column line must precede the first
    /// right-column line.
    #[test]
    fn xy_cut_header_body_footer_with_two_columns() {
        let texts = block_texts(vec![
            span("HEADLINE TITLE", 180.0, 760.0, 120.0),
            span("Left col line A", 40.0, 700.0, 110.0),
            span("Right col line A", 320.0, 700.0, 115.0),
            span("Left col line B", 40.0, 684.0, 110.0),
            span("Right col line B", 320.0, 684.0, 115.0),
            span("Left col line C", 40.0, 668.0, 110.0),
            span("Right col line C", 320.0, 668.0, 115.0),
            span("FOOTER LINE TEXT", 180.0, 600.0, 120.0),
        ]);
        assert_eq!(texts.first().map(String::as_str), Some("HEADLINE TITLE"));
        assert_eq!(texts.last().map(String::as_str), Some("FOOTER LINE TEXT"));
        let left_c_idx = texts.iter().position(|s| s == "Left col line C").unwrap();
        let right_a_idx = texts.iter().position(|s| s == "Right col line A").unwrap();
        assert!(
            left_c_idx < right_a_idx,
            "expected column-major ordering in body: {texts:?}"
        );
    }
2524
    /// Two short table rows must stay row-major: one block per row with the
    /// cells joined.
    #[test]
    fn xy_cut_rejects_column_split_on_table_rows() {
        let texts = block_texts(vec![
            span("Name", 40.0, 700.0, 30.0),
            span("Age", 320.0, 700.0, 20.0),
            span("Alice", 40.0, 684.0, 35.0),
            span("30", 320.0, 684.0, 15.0),
        ]);
        assert_eq!(texts, vec!["Name Age", "Alice 30"]);
    }
2537
    /// A full-width intro line above two short columns must be the first
    /// block in the output.
    #[test]
    fn xy_cut_rejects_column_split_when_one_band_is_full_width() {
        let texts = block_texts(vec![
            span(
                "Full width intro spanning both columns here",
                40.0,
                740.0,
                360.0,
            ),
            span("Left A", 40.0, 700.0, 50.0),
            span("Right A", 320.0, 700.0, 50.0),
            span("Left B", 40.0, 684.0, 50.0),
            span("Right B", 320.0, 684.0, 50.0),
        ]);
        assert!(
            texts[0].contains("Full width intro"),
            "expected full-width intro first: {texts:?}"
        );
    }
2560
    /// Two lines 60 units apart vertically come out as two separate blocks,
    /// upper one first.
    #[test]
    fn xy_cut_horizontal_split_for_zone_boundaries() {
        let texts = block_texts(vec![
            span("First paragraph body text", 40.0, 740.0, 200.0),
            span("Second paragraph body", 40.0, 680.0, 180.0),
        ]);
        assert_eq!(texts.len(), 2);
        assert!(texts[0].starts_with("First"));
        assert!(texts[1].starts_with("Second"));
    }
2574
    /// A page with a single span yields exactly that span as the only block.
    #[test]
    fn xy_cut_recursion_terminates_with_single_span() {
        let texts = block_texts(vec![span("Only one span on the page", 40.0, 700.0, 180.0)]);
        assert_eq!(texts, vec!["Only one span on the page"]);
    }
2580
    /// For font sizes 8, 12 and 24 the median is 12.
    #[test]
    fn median_font_size_handles_mixed_sizes() {
        let spans = vec![
            TextSpan {
                text: "small".into(),
                width: 10.0,
                height: 8.0,
                font_size: 8.0,
                ..TextSpan::default()
            },
            TextSpan {
                text: "medium".into(),
                width: 10.0,
                height: 12.0,
                font_size: 12.0,
                ..TextSpan::default()
            },
            TextSpan {
                text: "large".into(),
                width: 10.0,
                height: 24.0,
                font_size: 24.0,
                ..TextSpan::default()
            },
        ];
        assert!((median_font_size(&spans) - 12.0).abs() < 1e-9);
    }
2608
    /// Every band has a span on each side of the cut at x = 200, so the cut
    /// is accepted.
    #[test]
    fn columns_band_aligned_accepts_aligned_columns() {
        let spans = vec![
            span("L1", 40.0, 700.0, 60.0),
            span("R1", 300.0, 700.0, 60.0),
            span("L2", 40.0, 684.0, 60.0),
            span("R2", 300.0, 684.0, 60.0),
        ];
        let stats = PageStats::from_spans(&spans);
        assert!(columns_are_band_aligned(&spans, 200.0, 40.0, 360.0, &stats));
    }
2621
    /// The banner's midpoint (x = 180) lies left of the cut at x = 200, and
    /// the banner is alone in its band, so the band counts as left-only.
    /// Its width of 280 exceeds 70% of the left side (160 wide), so the cut
    /// is rejected.
    #[test]
    fn columns_band_aligned_rejects_wide_single_side_band() {
        let spans = vec![
            span("Wide banner line across top", 40.0, 740.0, 280.0),
            span("L1", 40.0, 700.0, 60.0),
            span("R1", 300.0, 700.0, 60.0),
        ];
        let stats = PageStats::from_spans(&spans);
        assert!(!columns_are_band_aligned(
            &spans, 200.0, 40.0, 360.0, &stats
        ));
    }
2636
    /// Checks the three medians `PageStats::from_spans` reports for three
    /// stacked 12.0-size lines spaced 20 units apart.
    #[test]
    fn page_stats_computes_median_values() {
        let spans = vec![
            span("one", 40.0, 700.0, 30.0),
            span("two", 40.0, 680.0, 30.0),
            span("three", 40.0, 660.0, 50.0),
        ];
        let stats = PageStats::from_spans(&spans);
        assert!((stats.median_font_size - 12.0).abs() < 1e-9);
        // Each fixture has width / character count = 10 (30/3, 30/3, 50/5).
        assert!((stats.median_char_width - 10.0).abs() < 1e-9);
        // Consecutive y values differ by 20.
        assert!((stats.median_line_spacing - 20.0).abs() < 1e-9);
    }
2651
    /// With no spans the stats fall back to font size 12.0, character width
    /// 6.0 and line spacing 0.0.
    #[test]
    fn page_stats_handles_empty_input() {
        let stats = PageStats::from_spans(&[]);
        assert!((stats.median_font_size - 12.0).abs() < 1e-9);
        assert!((stats.median_char_width - 6.0).abs() < 1e-9);
        assert_eq!(stats.median_line_spacing, 0.0);
    }
2659
    /// Two columns separated by a narrow 12-unit gutter (x = 224 to 236),
    /// with 4-unit word gaps inside each column, must still be split: at
    /// least six blocks are expected and the first must come from the left
    /// column.
    #[test]
    fn narrow_gutter_detected_with_adaptive_threshold() {
        let mut spans = Vec::new();
        for y in [700.0, 684.0, 668.0] {
            // Left column: 40..140 and 144..224 (word gap of 4).
            spans.push(span("Lorem ipsum", 40.0, y, 100.0));
            spans.push(span("dolor sit", 144.0, y, 80.0));
            // Right column: 236..336 and 340..420 (word gap of 4).
            spans.push(span("amet consec", 236.0, y, 100.0));
            spans.push(span("tetur adipi", 340.0, y, 80.0));
        }
        let texts = block_texts(spans);
        assert!(
            texts.len() >= 6,
            "expected column-major output, got {texts:?}"
        );
        assert!(
            texts[0].contains("Lorem"),
            "first block should be left column: {texts:?}"
        );
    }
2686
    /// A two-span banner line above three ragged columns: the banner is one
    /// joined block, followed by the left, middle and right columns in turn.
    /// The column start positions vary from line to line.
    #[test]
    fn xy_cut_leaf_falls_back_to_legacy_columns_for_header_plus_three_columns() {
        let texts = block_texts(vec![
            span("73022", 45.0, 750.0, 70.0),
            span("Federal Register banner", 125.6, 750.0, 260.0),
            span("Left column line one", 45.0, 725.0, 140.0),
            span("Middle column line one", 222.0, 725.0, 140.0),
            span("Right column line one", 399.0, 725.0, 120.0),
            span("Left column line two", 45.0, 715.0, 140.0),
            span("Middle column line two", 210.0, 715.0, 152.0),
            span("Right column line two", 388.0, 715.0, 132.0),
            span("Left column line three", 45.0, 705.0, 140.0),
            span("Middle column line three", 235.0, 705.0, 135.0),
            span("Right column line three", 408.0, 705.0, 118.0),
        ]);

        assert_eq!(
            texts,
            vec![
                "73022 Federal Register banner",
                "Left column line one",
                "Left column line two",
                "Left column line three",
                "Middle column line one",
                "Middle column line two",
                "Middle column line three",
                "Right column line one",
                "Right column line two",
                "Right column line three",
            ]
        );
    }
2719
    /// Identical text drawn several times at slightly shifted x positions
    /// must appear only once in the output, both for a whole line and for a
    /// single repeated word inside a line.
    #[test]
    fn overlapping_fake_bold_spans_collapse_to_single_copy() {
        let texts = block_texts(vec![
            span("1 This is fakebold text.", 25.9, 785.3, 320.0),
            span("1 This is fakebold text.", 26.2, 785.3, 320.0),
            span("1 This is fakebold text.", 26.4, 785.3, 320.0),
            span("1 This is fakebold text.", 26.7, 785.3, 320.0),
            span("2 This is a fakebold", 27.0, 714.8, 142.0),
            span(" fakebold", 169.8, 714.8, 70.0),
            span(" fakebold", 170.1, 714.8, 70.0),
            span(" fakebold word.", 170.4, 714.8, 110.0),
        ]);

        assert_eq!(
            texts,
            vec!["1 This is fakebold text.", "2 This is a fakebold word.",]
        );
    }
2738
/// A default-constructed `TextSpan` has no font name, no colour, and
/// neither the bold nor the italic flag set.
#[test]
fn g1_default_text_span_has_empty_metadata() {
    let blank = TextSpan::default();

    // Optional metadata starts out absent.
    assert_eq!(blank.font_name, None);
    assert_eq!(blank.color, None);

    // Style flags start out cleared.
    assert!(!blank.is_bold);
    assert!(!blank.is_italic);
}
2749
/// `strip_subset_prefix` drops a six-letter `XXXXXX+` prefix and leaves
/// names with a shorter prefix, or with no `+` at all, untouched.
#[test]
fn g1_strip_subset_prefix_handles_six_char_prefix() {
    // (input font name, expected result)
    let cases = [
        ("AAAAAA+Helvetica", "Helvetica"),
        ("ABC+Helvetica", "ABC+Helvetica"),
        ("Helvetica-Bold", "Helvetica-Bold"),
    ];

    for &(raw_name, expected) in cases.iter() {
        assert_eq!(strip_subset_prefix(raw_name), expected);
    }
}
2758
/// `name_style_hints` returns a `(bold, italic)` pair derived from the font
/// name; this table pins the expected pair for each sample name.
#[test]
fn g1_name_style_hints_match_pdf_interpret_rules() {
    // (font name, (bold, italic))
    let cases = [
        ("Helvetica-Bold", (true, false)),
        ("Times-Italic", (false, true)),
        ("MyFont-BoldOblique", (true, true)),
        ("Helvetica", (false, false)),
        ("Roboto-DemiBold", (true, false)),
        ("Roboto-Black", (true, false)),
        ("Roboto-Oblique", (false, true)),
        ("MyFont-Slanted", (false, true)),
    ];

    for &(font_name, expected) in cases.iter() {
        assert_eq!(name_style_hints(font_name), expected);
    }
}
2772
/// A default `TextSpan` reports `WidthSource::Estimate` and has no
/// per-character bounds.
#[test]
fn g2_default_text_span_has_estimate_width_source() {
    let blank = TextSpan::default();
    assert!(blank.char_bounds.is_empty());
    assert_eq!(blank.width_source, WidthSource::Estimate);
}
2781
/// Builds a one-glyph span by hand and checks that its single char bound
/// starts at x = 10, ends at x = 17.22, and is `font_size` tall.
#[test]
fn g2_single_glyph_span_has_one_char_bound() {
    let glyph_span = TextSpan {
        text: "A".into(),
        x: 10.0,
        y: 100.0,
        width: 7.22,
        height: 10.0,
        font_size: 10.0,
        width_source: WidthSource::Metric,
        char_bounds: vec![[10.0, 100.0, 17.22, 110.0]],
        ..Default::default()
    };

    assert_eq!(glyph_span.char_bounds.len(), 1);

    let [left, bottom, right, top] = glyph_span.char_bounds[0];
    let box_height = top - bottom;
    assert!((left - 10.0).abs() < 0.001);
    assert!((right - 17.22).abs() < 0.001);
    assert!((box_height - glyph_span.font_size).abs() < 0.001);
}
2804
/// Starts from a `Metric` span with one char bound, appends a second bound,
/// sets `width_source` to `Estimate`, and checks the resulting state.
///
/// NOTE(review): the append and the downgrade are both done by hand here;
/// no merge routine is called, so this pins the data shape only.
#[test]
fn g2_merged_span_degrades_width_source_on_estimate() {
    let mut merged = TextSpan {
        width_source: WidthSource::Metric,
        char_bounds: vec![[0.0, 0.0, 7.0, 10.0]],
        ..Default::default()
    };

    // Simulate absorbing a second glyph.
    merged.char_bounds.push([7.0, 0.0, 12.0, 10.0]);
    merged.width_source = WidthSource::Estimate;

    assert_eq!(merged.width_source, WidthSource::Estimate);
    assert_eq!(merged.char_bounds.len(), 2);
}
2822
/// Pins the `Debug` names of both `WidthSource` variants, their inequality,
/// and `Estimate` as the default.
#[test]
fn g2_width_source_variants_are_correct() {
    let metric = WidthSource::Metric;
    let estimate = WidthSource::Estimate;

    assert_eq!(format!("{metric:?}"), "Metric");
    assert_eq!(format!("{estimate:?}"), "Estimate");
    assert_ne!(metric, estimate);
    assert_eq!(WidthSource::default(), estimate);
}
2831
/// A default `TextSpan` is in `Basic` geometry mode with `Advance` bounds
/// and carries neither tight char bounds nor glyph advances.
#[test]
fn m3_default_text_span_has_basic_mode() {
    let blank = TextSpan::default();

    assert_eq!(blank.geometry_mode, GeometryMode::Basic);
    assert_eq!(blank.bounds_source, BoundsSource::Advance);

    // The rich-geometry vectors stay empty by default.
    assert!(blank.tight_char_bounds.is_empty());
    assert!(blank.glyph_advances.is_empty());
}
2842
/// `GeometryMode::default()` is `Basic`.
#[test]
fn m3_geometry_mode_default_is_basic() {
    let default_mode = GeometryMode::default();
    assert_eq!(default_mode, GeometryMode::Basic);
}
2847
/// `BoundsSource::default()` is `Advance`.
#[test]
fn m3_bounds_source_default_is_advance() {
    let default_source = BoundsSource::default();
    assert_eq!(default_source, BoundsSource::Advance);
}
2852
/// The two `GeometryMode` variants compare unequal and keep their `Debug`
/// names.
#[test]
fn m3_geometry_mode_variants_are_distinct() {
    let basic = GeometryMode::Basic;
    let rich = GeometryMode::RichGeometry;

    assert_ne!(basic, rich);
    assert_eq!(format!("{basic:?}"), "Basic");
    assert_eq!(format!("{rich:?}"), "RichGeometry");
}
2859
/// Checks that `Advance != Tight` and `Tight != Estimate`, and pins the
/// `Debug` name of every `BoundsSource` variant.
#[test]
fn m3_bounds_source_variants_are_distinct() {
    let advance = BoundsSource::Advance;
    let tight = BoundsSource::Tight;
    let estimate = BoundsSource::Estimate;

    assert_ne!(advance, tight);
    assert_ne!(tight, estimate);

    assert_eq!(format!("{advance:?}"), "Advance");
    assert_eq!(format!("{tight:?}"), "Tight");
    assert_eq!(format!("{estimate:?}"), "Estimate");
}
2868
/// Populates a `TextSpan` with both the pre-existing fields and the
/// rich-geometry fields, then reads both groups back to confirm they
/// coexist on the same value.
#[test]
fn m3_text_span_rich_geometry_preserves_existing_fields() {
    let rich = TextSpan {
        text: "Hello".into(),
        x: 10.0,
        y: 100.0,
        width: 30.0,
        height: 12.0,
        font_size: 12.0,
        font_name: Some("Helvetica".into()),
        is_bold: true,
        is_italic: false,
        color: Some([0, 0, 0, 255]),
        width_source: WidthSource::Metric,
        char_bounds: vec![[10.0, 100.0, 40.0, 112.0]],
        geometry_mode: GeometryMode::RichGeometry,
        bounds_source: BoundsSource::Tight,
        tight_char_bounds: vec![[8.5, 98.5, 41.5, 114.0]],
        glyph_advances: vec![30.0],
        glyph_bounds_sources: vec![BoundsSource::Tight],
        ..Default::default()
    };

    // Pre-existing fields.
    assert_eq!(rich.text, "Hello");
    assert!((rich.x - 10.0).abs() < 0.001);
    assert!((rich.y - 100.0).abs() < 0.001);
    assert_eq!(rich.width_source, WidthSource::Metric);
    assert_eq!(rich.char_bounds.len(), 1);

    // Rich-geometry fields.
    assert_eq!(rich.geometry_mode, GeometryMode::RichGeometry);
    assert_eq!(rich.bounds_source, BoundsSource::Tight);
    assert_eq!(rich.tight_char_bounds.len(), 1);
    assert_eq!(rich.glyph_advances.len(), 1);
    let only_advance = rich.glyph_advances[0];
    assert!((only_advance - 30.0).abs() < 0.001);
}
2906
/// A span built with `Basic` mode and `Advance` bounds, leaving the rest to
/// `Default`, has empty `tight_char_bounds` and `glyph_advances`.
#[test]
fn m3_text_span_basic_mode_does_not_populate_tight_fields() {
    let basic = TextSpan {
        text: "A".into(),
        x: 0.0,
        y: 0.0,
        width: 7.0,
        height: 10.0,
        font_size: 10.0,
        geometry_mode: GeometryMode::Basic,
        bounds_source: BoundsSource::Advance,
        ..Default::default()
    };

    assert!(basic.glyph_advances.is_empty());
    assert!(basic.tight_char_bounds.is_empty());
}
2924
/// A `Basic`-mode span can be built without supplying any tight bounds or
/// glyph advances; both vectors are then empty.
#[test]
fn m3_tight_char_bounds_are_optional_in_basic_mode() {
    let basic = TextSpan {
        text: "test".into(),
        x: 0.0,
        y: 0.0,
        width: 40.0,
        height: 12.0,
        font_size: 12.0,
        geometry_mode: GeometryMode::Basic,
        ..Default::default()
    };

    assert!(basic.glyph_advances.is_empty());
    assert!(basic.tight_char_bounds.is_empty());
}
2941
/// The `span` test helper yields a span in `Basic` mode with `Advance`
/// bounds and no tight char bounds.
#[test]
fn m3_test_span_constructor_preserves_all_fields() {
    let built = span("hello", 10.0, 100.0, 30.0);

    assert_eq!(built.geometry_mode, GeometryMode::Basic);
    assert_eq!(built.bounds_source, BoundsSource::Advance);
    assert!(built.tight_char_bounds.is_empty());
}
2950
/// `WidthSource` still has two distinct variants with their expected
/// `as_str` names, and `Estimate` is still the default.
///
/// Variants are compared with each other and with their string names rather
/// than with themselves: a self-comparison can never fail and is rejected
/// by clippy's `eq_op` lint.
#[test]
fn m3_width_source_unchanged_by_m3() {
    assert_ne!(WidthSource::Metric, WidthSource::Estimate);
    assert_eq!(WidthSource::Metric.as_str(), "Metric");
    assert_eq!(WidthSource::Estimate.as_str(), "Estimate");
    assert_eq!(WidthSource::default(), WidthSource::Estimate);
}
2958
/// Under the identity affine, `transform_bbox_corners` returns the input
/// box unchanged (to within 0.001).
#[test]
fn m3_transform_bbox_corners_identity() {
    let bbox = [0.0, 0.0, 10.0, 12.0];
    let mapped = transform_bbox_corners(&bbox, &kurbo::Affine::IDENTITY);

    let expected = [0.0, 0.0, 10.0, 12.0];
    for (idx, &want) in expected.iter().enumerate() {
        assert!((mapped[idx] - want).abs() < 0.001);
    }
}
2969
/// Translating by (50, 100) shifts every corner of the box by that offset.
#[test]
fn m3_transform_bbox_corners_translation() {
    let bbox = [0.0, 0.0, 10.0, 12.0];
    let shift = kurbo::Affine::translate((50.0, 100.0));
    let mapped = transform_bbox_corners(&bbox, &shift);

    let expected = [50.0, 100.0, 60.0, 112.0];
    for (idx, &want) in expected.iter().enumerate() {
        assert!((mapped[idx] - want).abs() < 0.001);
    }
}
2980
/// A uniform scale of 2 doubles each coordinate of the box.
#[test]
fn m3_transform_bbox_corners_scale() {
    let bbox = [0.0, 0.0, 10.0, 12.0];
    let doubling = kurbo::Affine::scale(2.0);
    let mapped = transform_bbox_corners(&bbox, &doubling);

    let expected = [0.0, 0.0, 20.0, 24.0];
    for (idx, &want) in expected.iter().enumerate() {
        assert!((mapped[idx] - want).abs() < 0.001);
    }
}
2991
/// A box with negative minimum coordinates, scaled uniformly by 0.012,
/// keeps its sign and ordering; results are compared within 0.01.
#[test]
fn m3_transform_bbox_corners_with_negative_bbox() {
    let bbox = [-5.0, -200.0, 15.0, 800.0];
    let shrink = kurbo::Affine::scale(0.012);
    let mapped = transform_bbox_corners(&bbox, &shrink);

    let expected = [-0.06, -2.4, 0.18, 9.6];
    for (idx, &want) in expected.iter().enumerate() {
        assert!((mapped[idx] - want).abs() < 0.01);
    }
}
3003
/// Hand-built "g" span whose tight box bottom (197.5) lies below the span's
/// `y` (200.0); the assertion checks exactly that relationship.
#[test]
fn m3_tight_char_bounds_descender_detection() {
    let glyph_span = TextSpan {
        text: "g".into(),
        x: 100.0,
        y: 200.0,
        width: 8.0,
        height: 12.0,
        font_size: 12.0,
        geometry_mode: GeometryMode::RichGeometry,
        bounds_source: BoundsSource::Tight,
        tight_char_bounds: vec![[99.0, 197.5, 109.0, 210.0]],
        char_bounds: vec![[100.0, 200.0, 108.0, 212.0]],
        glyph_advances: vec![8.0],
        ..Default::default()
    };

    let tight_bottom = glyph_span.tight_char_bounds[0][1];
    assert!(
        tight_bottom < glyph_span.y,
        "descender should extend below baseline"
    );
}
3028
/// Hand-built "f" span whose tight box top (214.0) lies above
/// `y + height` (212.0); the assertion checks exactly that relationship.
#[test]
fn m3_tight_char_bounds_ascender_detection() {
    let glyph_span = TextSpan {
        text: "f".into(),
        x: 100.0,
        y: 200.0,
        width: 7.0,
        height: 12.0,
        font_size: 12.0,
        geometry_mode: GeometryMode::RichGeometry,
        bounds_source: BoundsSource::Tight,
        tight_char_bounds: vec![[95.0, 200.0, 108.0, 214.0]],
        char_bounds: vec![[100.0, 200.0, 107.0, 212.0]],
        glyph_advances: vec![7.0],
        ..Default::default()
    };

    let tight_top = glyph_span.tight_char_bounds[0][3];
    let advance_top = glyph_span.y + glyph_span.height;
    assert!(
        tight_top > advance_top,
        "ascender should extend above advance bounds"
    );
}
3054
/// Hand-built "A" span whose tight box and advance box are deliberately
/// different; passes when at least one coordinate differs by more than 0.01.
#[test]
fn m3_tight_char_bounds_differ_from_advance() {
    let glyph_span = TextSpan {
        text: "A".into(),
        x: 10.0,
        y: 100.0,
        width: 7.22,
        height: 10.0,
        font_size: 10.0,
        geometry_mode: GeometryMode::RichGeometry,
        bounds_source: BoundsSource::Tight,
        char_bounds: vec![[10.0, 100.0, 17.22, 110.0]],
        tight_char_bounds: vec![[9.5, 100.2, 17.5, 109.5]],
        glyph_advances: vec![7.22],
        ..Default::default()
    };

    let advance_box = glyph_span.char_bounds[0];
    let tight_box = glyph_span.tight_char_bounds[0];
    // Compare the two boxes coordinate by coordinate.
    let differs = advance_box
        .iter()
        .zip(tight_box.iter())
        .any(|(adv, tight)| (adv - tight).abs() > 0.01);
    assert!(differs, "tight bounds should differ from advance bounds");
}
3081
/// `TextExtractionDevice::new()` starts in `Basic` mode with no deferred
/// rich glyphs queued.
#[test]
fn m3_device_new_uses_basic_mode() {
    let device = TextExtractionDevice::new();

    assert!(device.deferred_rich_glyphs.is_empty());
    assert_eq!(device.geometry_mode, GeometryMode::Basic);
}
3088
/// `TextExtractionDevice::with_mode(RichGeometry)` records the requested
/// mode and starts with no deferred rich glyphs queued.
#[test]
fn m3_device_with_mode_rich_geometry() {
    let requested = GeometryMode::RichGeometry;
    let device = TextExtractionDevice::with_mode(requested);

    assert!(device.deferred_rich_glyphs.is_empty());
    assert_eq!(device.geometry_mode, GeometryMode::RichGeometry);
}
3095
/// Test fixture: a closed rectangular outline with corners (100, 0),
/// (600, 0), (600, 700) and (100, 700), returned together with its
/// bounding box.
#[cfg(test)]
fn synthetic_glyph_rect() -> (BezPath, kurbo::Rect) {
    let mut outline = BezPath::new();
    outline.move_to((100.0, 0.0));
    // Remaining corners, in the order they are visited.
    for corner in [(600.0, 0.0), (600.0, 700.0), (100.0, 700.0)] {
        outline.line_to(corner);
    }
    outline.close_path();

    let bounds = outline.bounding_box();
    (outline, bounds)
}
3113
/// A span pushed onto a `Basic`-mode device comes back from `into_spans`
/// with no tight bounds, no per-glyph bounds sources, and `Advance` as its
/// bounds source.
#[test]
fn m3_basic_mode_skips_tight_bounds_computation() {
    let mut device = TextExtractionDevice::new();
    let seeded = TextSpan {
        text: "A".into(),
        char_bounds: vec![[0.0, 0.0, 7.0, 10.0]],
        ..Default::default()
    };
    device.spans.push(seeded);

    let spans = device.into_spans();
    assert_eq!(spans.len(), 1);

    let only = &spans[0];
    assert!(only.tight_char_bounds.is_empty());
    assert!(only.glyph_bounds_sources.is_empty());
    assert_eq!(only.bounds_source, BoundsSource::Advance);
}
3129
/// In `RichGeometry` mode, one span plus one deferred glyph carrying the
/// synthetic rectangle outline must come back from `into_spans` with tight
/// bounds equal to the bounding box of that outline under the same affine
/// (here a uniform 0.01 scale), and marked `Tight`.
#[test]
fn m3_rich_geometry_tight_bounds_match_rendered_outline() {
    let (outline, font_box) = synthetic_glyph_rect();
    let glyph_to_page = Affine::scale(0.01);
    // Reference answer: transform the outline directly and take its bbox.
    let oracle = (glyph_to_page * outline.clone()).bounding_box();

    let mut device = TextExtractionDevice::with_mode(GeometryMode::RichGeometry);
    let seeded_span = TextSpan {
        text: "A".into(),
        char_bounds: vec![[0.0, 0.0, 7.0, 10.0]],
        geometry_mode: GeometryMode::RichGeometry,
        ..Default::default()
    };
    device.spans.push(seeded_span);
    let pending_glyph = DeferredGlyph {
        coeffs: glyph_to_page.as_coeffs(),
        font_size: 10.0,
        glyph_width: 7.0,
        needs_exact: false,
        outline: Some(outline),
        font_bbox: Some(font_box),
    };
    device.deferred_rich_glyphs.push(pending_glyph);

    let spans = device.into_spans();
    assert_eq!(spans[0].glyph_bounds_sources, vec![BoundsSource::Tight]);
    assert_eq!(spans[0].bounds_source, BoundsSource::Tight);

    let tb = spans[0].tight_char_bounds[0];
    let wanted = [
        ("x0", oracle.x0),
        ("y0", oracle.y0),
        ("x1", oracle.x1),
        ("y1", oracle.y1),
    ];
    for (idx, &(label, want)) in wanted.iter().enumerate() {
        let got = tb[idx];
        assert!(
            (got - want).abs() < 1e-6,
            "{}: got {} want {}",
            label,
            got,
            want
        );
    }

    let tight_height = tb[3] - tb[1];
    assert!(
        tight_height > 1.0,
        "tight height must be commensurate with font size"
    );
}
3191
/// Same setup as the unrotated case, but the affine is a 0.01 scale
/// composed with a quarter-turn rotation and the deferred glyph has
/// `needs_exact` set. The tight bounds must still equal the bounding box of
/// the transformed outline.
#[test]
fn m3_rich_geometry_tight_bounds_rotation_is_exact() {
    let (outline, font_box) = synthetic_glyph_rect();
    let quarter_turn = Affine::rotate(std::f64::consts::FRAC_PI_2);
    let glyph_to_page = Affine::scale(0.01) * quarter_turn;
    // Reference answer: transform the outline directly and take its bbox.
    let oracle = (glyph_to_page * outline.clone()).bounding_box();

    let mut device = TextExtractionDevice::with_mode(GeometryMode::RichGeometry);
    let seeded_span = TextSpan {
        text: "A".into(),
        char_bounds: vec![[0.0, 0.0, 7.0, 10.0]],
        geometry_mode: GeometryMode::RichGeometry,
        ..Default::default()
    };
    device.spans.push(seeded_span);
    let pending_glyph = DeferredGlyph {
        coeffs: glyph_to_page.as_coeffs(),
        font_size: 10.0,
        glyph_width: 7.0,
        needs_exact: true,
        outline: Some(outline),
        font_bbox: Some(font_box),
    };
    device.deferred_rich_glyphs.push(pending_glyph);

    let spans = device.into_spans();
    assert_eq!(spans[0].glyph_bounds_sources, vec![BoundsSource::Tight]);

    let tb = spans[0].tight_char_bounds[0];
    let wanted = [
        ("x0", oracle.x0),
        ("y0", oracle.y0),
        ("x1", oracle.x1),
        ("y1", oracle.y1),
    ];
    for (idx, &(label, want)) in wanted.iter().enumerate() {
        let got = tb[idx];
        assert!(
            (got - want).abs() < 1e-6,
            "{}: got {} want {}",
            label,
            got,
            want
        );
    }
}
3244
/// Calling `into_text` on a `RichGeometry` device that received nothing
/// returns an empty result without panicking.
#[test]
fn m3_empty_device_rich_geometry_no_panic() {
    let untouched = TextExtractionDevice::with_mode(GeometryMode::RichGeometry);
    let extracted = untouched.into_text();
    assert!(extracted.is_empty());
}
3251
/// A `RichGeometry` span may carry `BoundsSource::Estimate` while leaving
/// `tight_char_bounds` empty.
#[test]
fn m3_bounds_source_estimate_fallback() {
    let estimated = TextSpan {
        geometry_mode: GeometryMode::RichGeometry,
        bounds_source: BoundsSource::Estimate,
        ..Default::default()
    };

    assert_eq!(estimated.geometry_mode, GeometryMode::RichGeometry);
    assert_eq!(estimated.bounds_source, BoundsSource::Estimate);
    assert!(estimated.tight_char_bounds.is_empty());
}
3265
/// Hand-built three-glyph span: the tight bounds, advance bounds and glyph
/// advances must each have three entries, and no advance may be negative.
#[test]
fn m3_multiple_glyphs_tight_bounds_consistent_count() {
    let word = TextSpan {
        text: "abc".into(),
        x: 0.0,
        y: 0.0,
        width: 25.0,
        height: 12.0,
        font_size: 12.0,
        geometry_mode: GeometryMode::RichGeometry,
        bounds_source: BoundsSource::Tight,
        char_bounds: vec![[0., 0., 8., 12.], [8., 0., 16., 12.], [16., 0., 25., 12.]],
        tight_char_bounds: vec![
            [-1., -2., 9., 14.],
            [7., -2., 17., 14.],
            [15., -2., 26., 14.],
        ],
        glyph_advances: vec![8.0, 8.0, 9.0],
        ..Default::default()
    };

    // One entry per glyph in every parallel vector.
    assert_eq!(word.tight_char_bounds.len(), 3);
    assert_eq!(word.glyph_advances.len(), 3);
    assert_eq!(word.char_bounds.len(), 3);

    assert!(
        word.glyph_advances.iter().all(|adv| *adv >= 0.0),
        "glyph advance must be non-negative"
    );
}
3297}