1use kurbo::{Affine, BezPath};
4use pdf_render::pdf_interpret::cmap::BfString;
5use pdf_render::pdf_interpret::font::Glyph;
6use pdf_render::pdf_interpret::{
7 BlendMode, ClipPath, Device, GlyphDrawMode, Image, Paint, PathDrawMode, SoftMask,
8};
9use std::cmp::Ordering;
10
11const BAND_Y_TOLERANCE: f64 = 5.0;
15const BAND_Y_FRACTION: f64 = 0.30;
20const PARAGRAPH_BREAK_LINE_SPACING_MULTIPLIER: f64 = 1.8;
24
25const TJ_SPACE_THRESHOLD_UNITS: f32 = 100.0;
39const TJ_SIGNAL_WEIGHT: f64 = 0.95;
41const GAP_SIGNAL_WEIGHT: f64 = 0.80;
43const HEURISTIC_SIGNAL_WEIGHT: f64 = 0.60;
45const SPACE_CONSENSUS_THRESHOLD: f64 = 0.75;
47const GAP_TO_MEDIAN_CHAR_FRACTION: f64 = 0.30;
50const GAP_TO_FONT_SIZE_FALLBACK_FRACTION: f64 = 0.15;
53
54const COLUMN_GAP_THRESHOLD_MIN: f64 = 10.0;
56const COLUMN_GAP_THRESHOLD_MAX: f64 = 40.0;
58const COLUMN_GAP_MEDIAN_MULTIPLIER: f64 = 3.0;
60const COLUMN_GAP_THRESHOLD_FALLBACK: f64 = 20.0;
62const COLUMN_GAP_MATCH_TOLERANCE: f64 = 12.0;
64const MIN_COLUMN_GAPPED_BANDS: usize = 3;
66const MIN_COLUMN_GAP_SUPPORT: f64 = 0.80;
68const MIN_DENSE_SLICE_RATIO: f64 = 0.35;
70
71#[derive(Debug, Clone)]
73pub struct TextSpan {
74 pub text: String,
76 pub x: f64,
78 pub y: f64,
80 pub width: f64,
82 pub height: f64,
84 pub font_size: f64,
86}
87
88impl TextSpan {
89 fn right(&self) -> f64 {
92 self.x + self.width.max(self.estimated_width())
93 }
94
95 fn measured_right(&self) -> f64 {
97 self.x + self.width
98 }
99
100 fn estimated_width(&self) -> f64 {
101 let char_count = self.text.chars().count() as f64;
102 if char_count <= 0.0 {
103 self.font_size * 0.5
104 } else {
105 self.font_size * 0.5 * char_count
106 }
107 }
108}
109
110#[derive(Debug, Clone)]
112pub struct TextBlock {
113 pub spans: Vec<TextSpan>,
115}
116
117impl TextBlock {
118 pub fn text(&self) -> String {
124 if self.spans.is_empty() {
125 return String::new();
126 }
127 let mut result = self.spans[0].text.clone();
128 for pair in self.spans.windows(2) {
129 let prev = &pair[0];
130 let curr = &pair[1];
131 let expected_end = prev.measured_right();
132 let gap = curr.x - expected_end;
133 if gap <= prev.font_size * 0.12 {
134 if let Some(trimmed) = trim_overlapping_word_prefix(&prev.text, &curr.text) {
135 result.push_str(&trimmed);
136 continue;
137 }
138 }
139 if gap > prev.font_size * 0.25 {
140 result.push(' ');
141 }
142 result.push_str(&curr.text);
143 }
144 result
145 }
146}
147
148#[derive(Debug, Clone)]
149struct TextBand {
150 y: f64,
151 spans: Vec<TextSpan>,
152}
153
154impl TextBand {
155 fn new(span: TextSpan) -> Self {
156 Self {
157 y: span.y,
158 spans: vec![span],
159 }
160 }
161
162 fn sort_spans(&mut self) {
163 self.spans.sort_by(|a, b| {
164 a.x.partial_cmp(&b.x)
165 .unwrap_or(Ordering::Equal)
166 .then_with(|| b.y.partial_cmp(&a.y).unwrap_or(Ordering::Equal))
167 });
168 collapse_overprinted_spans(&mut self.spans);
169 }
170
171 fn row_block(&self) -> TextBlock {
172 let mut spans = self.spans.clone();
173 spans.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
174 TextBlock { spans }
175 }
176
177 fn left(&self) -> f64 {
178 self.spans
179 .iter()
180 .map(|span| span.x)
181 .fold(f64::INFINITY, f64::min)
182 }
183
184 fn right(&self) -> f64 {
185 self.spans
186 .iter()
187 .map(TextSpan::right)
188 .fold(f64::NEG_INFINITY, f64::max)
189 }
190
191 fn width(&self) -> f64 {
192 (self.right() - self.left()).max(0.0)
193 }
194
195 fn gap_midpoints(&self, column_gap_threshold: f64) -> Vec<f64> {
196 self.gaps(column_gap_threshold)
197 .into_iter()
198 .map(|gap| (gap.start + gap.end) * 0.5)
199 .collect()
200 }
201
202 fn gaps(&self, column_gap_threshold: f64) -> Vec<BandGap> {
203 if self.spans.len() < 2 {
204 return Vec::new();
205 }
206
207 let mut spans = self.spans.clone();
208 spans.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
209
210 let mut gaps = Vec::new();
211 let mut prev_right = spans[0].right();
212 for span in spans.iter().skip(1) {
213 let gap = span.x - prev_right;
214 if gap >= column_gap_threshold {
215 gaps.push(BandGap {
216 start: prev_right,
217 end: span.x,
218 });
219 }
220 prev_right = prev_right.max(span.right());
221 }
222
223 gaps
224 }
225
226 fn split_by_boundaries(&self, boundaries: &[f64]) -> Vec<Vec<TextSpan>> {
227 let mut columns = vec![Vec::new(); boundaries.len() + 1];
228 for span in &self.spans {
229 let center_x = span.x + span.width.max(span.estimated_width()) * 0.5;
230 let column_idx = boundaries
231 .iter()
232 .position(|boundary| center_x < *boundary)
233 .unwrap_or(boundaries.len());
234 columns[column_idx].push(span.clone());
235 }
236
237 for spans in &mut columns {
238 spans.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
239 }
240
241 columns
242 }
243
244 fn fits_single_column(
245 &self,
246 boundaries: &[f64],
247 region_left: f64,
248 region_right: f64,
249 ) -> Option<usize> {
250 let mut column_idx: Option<usize> = None;
251 for span in &self.spans {
252 let left = span.x;
253 let right = span.right();
254 if boundaries
255 .iter()
256 .any(|boundary| left < *boundary && right > *boundary)
257 {
258 return None;
259 }
260
261 let center_x = left + (right - left) * 0.5;
262 let idx = boundaries
263 .iter()
264 .position(|boundary| center_x < *boundary)
265 .unwrap_or(boundaries.len());
266 match column_idx {
267 Some(existing) if existing != idx => return None,
268 Some(_) => {}
269 None => column_idx = Some(idx),
270 }
271 }
272 let idx = column_idx?;
273 let mut edges = Vec::with_capacity(boundaries.len() + 2);
274 edges.push(region_left);
275 edges.extend_from_slice(boundaries);
276 edges.push(region_right);
277
278 let column_width = (edges[idx + 1] - edges[idx]).max(0.0);
279 if column_width <= 0.0 || self.width() > column_width * 0.8 {
280 return None;
281 }
282
283 Some(idx)
284 }
285}
286
287#[derive(Debug, Clone, Copy)]
288struct BandGap {
289 start: f64,
290 end: f64,
291}
292
293pub(crate) struct TextExtractionDevice {
315 spans: Vec<TextSpan>,
316 last_y: f64,
317 last_end_x: f64,
318 pending_tj_offset: f32,
322 glyph_widths: Vec<f64>,
326 cached_median_char_width: f64,
330}
331
332const MEDIAN_REFRESH: usize = 32;
333
334impl Default for TextExtractionDevice {
335 fn default() -> Self {
336 Self::new()
337 }
338}
339
340impl TextExtractionDevice {
341 pub fn new() -> Self {
343 Self {
344 spans: Vec::new(),
345 last_y: f64::NEG_INFINITY,
346 last_end_x: f64::NEG_INFINITY,
347 pending_tj_offset: 0.0,
348 glyph_widths: Vec::new(),
349 cached_median_char_width: 0.0,
350 }
351 }
352
353 fn refresh_median_char_width(&mut self) {
356 if self.glyph_widths.is_empty() {
357 self.cached_median_char_width = 0.0;
358 return;
359 }
360 let mut sorted = self.glyph_widths.clone();
361 sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
362 self.cached_median_char_width = sorted[sorted.len() / 2];
363 }
364
365 fn evaluate_space_consensus(
368 &self,
369 gap: f64,
370 font_size: f64,
371 prev_text: &str,
372 next_text: &str,
373 ) -> bool {
374 let mut confidence = 0.0;
375
376 if self.pending_tj_offset.abs() >= TJ_SPACE_THRESHOLD_UNITS {
379 confidence += TJ_SIGNAL_WEIGHT;
380 }
381
382 let gap_reference = if self.cached_median_char_width > 0.0 {
386 self.cached_median_char_width * GAP_TO_MEDIAN_CHAR_FRACTION
387 } else {
388 font_size * GAP_TO_FONT_SIZE_FALLBACK_FRACTION
389 };
390 if gap > gap_reference {
391 confidence += GAP_SIGNAL_WEIGHT;
392 }
393
394 if let (Some(prev_last), Some(next_first)) =
398 (prev_text.chars().last(), next_text.chars().next())
399 {
400 let camel = prev_last.is_lowercase() && next_first.is_uppercase();
401 let digit_to_letter = prev_last.is_ascii_digit() && next_first.is_alphabetic();
402 let letter_to_digit = prev_last.is_alphabetic() && next_first.is_ascii_digit();
403 if camel || digit_to_letter || letter_to_digit {
404 confidence += HEURISTIC_SIGNAL_WEIGHT;
405 }
406 }
407
408 confidence >= SPACE_CONSENSUS_THRESHOLD
409 }
410
411 pub fn into_text(self) -> String {
413 let blocks = group_spans_into_blocks(self.spans);
414 let lines: Vec<String> = blocks.iter().map(|b| b.text()).collect();
415 let stitched = stitch_hyphenated_lines(&lines);
416 normalize_text_output(&stitched)
417 }
418
419 pub fn into_blocks(self) -> Vec<TextBlock> {
421 group_spans_into_blocks(self.spans)
422 }
423
424 #[allow(dead_code)]
426 pub(crate) fn into_spans(self) -> Vec<TextSpan> {
427 self.spans
428 }
429}
430
431impl Device<'_> for TextExtractionDevice {
432 fn set_soft_mask(&mut self, _: Option<SoftMask<'_>>) {}
433 fn set_blend_mode(&mut self, _: BlendMode) {}
434 fn draw_path(&mut self, _: &BezPath, _: Affine, _: &Paint<'_>, _: &PathDrawMode) {}
435 fn push_clip_path(&mut self, _: &ClipPath) {}
436 fn push_transparency_group(&mut self, _: f32, _: Option<SoftMask<'_>>, _: BlendMode) {}
437 fn draw_image(&mut self, _: Image<'_, '_>, _: Affine) {}
438 fn pop_clip_path(&mut self) {}
439 fn pop_transparency_group(&mut self) {}
440
441 fn draw_glyph(
442 &mut self,
443 glyph: &Glyph<'_>,
444 transform: Affine,
445 glyph_transform: Affine,
446 _paint: &Paint<'_>,
447 _draw_mode: &GlyphDrawMode,
448 ) {
449 let text = match glyph.as_unicode() {
450 Some(BfString::Char(c)) => c.to_string(),
451 Some(BfString::String(s)) => s,
452 None => return,
453 };
454
455 let composed = transform * glyph_transform;
456 let coeffs = composed.as_coeffs();
457 let x = coeffs[4];
458 let y = coeffs[5];
459 let glyph_scale = (coeffs[0].powi(2) + coeffs[1].powi(2)).sqrt().abs();
460 let font_size = glyph_scale * 1000.0;
461 let glyph_width = estimate_glyph_width(glyph, font_size).max(font_size * 0.25);
462 let glyph_end_x = x + glyph_width;
463
464 if self.glyph_widths.len() < 4096 {
468 self.glyph_widths.push(glyph_width);
469 if self.glyph_widths.len().is_multiple_of(MEDIAN_REFRESH) {
470 self.refresh_median_char_width();
471 }
472 }
473
474 let same_line = (y - self.last_y).abs() <= font_size.max(BAND_Y_TOLERANCE) * 0.35;
475 let gap = x - self.last_end_x;
476 let adjacent = same_line && gap >= -font_size * 0.25 && gap < font_size * 0.5;
477
478 if adjacent && !self.spans.is_empty() {
479 let want_space = {
487 let last = self.spans.last().expect("checked non-empty");
488 !last.text.ends_with(' ')
489 && !text.starts_with(' ')
490 && self.evaluate_space_consensus(gap, font_size, &last.text, &text)
491 };
492 let last = self.spans.last_mut().expect("checked non-empty");
493 if want_space {
494 last.text.push(' ');
495 }
496 last.text.push_str(&text);
497 last.width = last.width.max(glyph_end_x - last.x);
498 last.height = last.height.max(font_size);
499 self.last_y = y;
500 self.last_end_x = glyph_end_x;
501 self.pending_tj_offset = 0.0;
504 return;
505 }
506
507 self.last_y = y;
508 self.last_end_x = glyph_end_x;
509 self.pending_tj_offset = 0.0;
513
514 self.spans.push(TextSpan {
515 text,
516 x,
517 y,
518 width: glyph_width,
519 height: font_size,
520 font_size,
521 });
522 }
523
524 fn text_adjustment(&mut self, amount: f32) {
529 self.pending_tj_offset += amount;
530 }
531}
532
533fn estimate_glyph_width(glyph: &Glyph<'_>, font_size: f64) -> f64 {
534 match glyph {
535 Glyph::Outline(outline) => outline
536 .advance_width()
537 .map(|width| width as f64 / 1000.0 * font_size)
538 .unwrap_or(font_size * 0.5),
539 Glyph::Type3(_) => font_size * 0.5,
540 }
541}
542
543fn collapse_overprinted_spans(spans: &mut Vec<TextSpan>) {
549 if spans.len() < 2 {
550 return;
551 }
552
553 let mut deduped: Vec<TextSpan> = Vec::with_capacity(spans.len());
554 for span in spans.drain(..) {
555 if let Some(last) = deduped.last_mut() {
556 if spans_are_overprint_duplicates(last, &span) {
557 let choose_incoming = span.text.chars().count() > last.text.chars().count()
558 || (span.text.chars().count() == last.text.chars().count()
559 && span.width > last.width);
560 let preferred_text = if choose_incoming {
561 span.text.clone()
562 } else {
563 last.text.clone()
564 };
565 let left = last.x.min(span.x);
566 let right = last.right().max(span.right());
567 last.x = left;
568 last.y = (last.y + span.y) * 0.5;
569 last.width = (right - left).max(last.width).max(span.width);
570 last.height = last.height.max(span.height);
571 last.font_size = last.font_size.max(span.font_size);
572 last.text = preferred_text;
573 continue;
574 }
575 }
576
577 deduped.push(span);
578 }
579
580 *spans = deduped;
581}
582
583fn spans_are_overprint_duplicates(lhs: &TextSpan, rhs: &TextSpan) -> bool {
584 let lhs_text = lhs.text.trim();
585 let rhs_text = rhs.text.trim();
586 if lhs_text.is_empty() || rhs_text.is_empty() {
587 return false;
588 }
589
590 let same_baseline = (lhs.y - rhs.y).abs() <= lhs.font_size.max(rhs.font_size) * 0.12;
591 if !same_baseline {
592 return false;
593 }
594
595 let lhs_left = lhs.x;
596 let lhs_right = lhs.right();
597 let rhs_left = rhs.x;
598 let rhs_right = rhs.right();
599 let overlap = (lhs_right.min(rhs_right) - lhs_left.max(rhs_left)).max(0.0);
600 let min_width = (lhs_right - lhs_left).min(rhs_right - rhs_left).max(1.0);
601 let heavily_overlaps = overlap / min_width >= 0.85;
602 if !heavily_overlaps {
603 return false;
604 }
605
606 lhs_text == rhs_text || lhs_text.starts_with(rhs_text) || rhs_text.starts_with(lhs_text)
607}
608
609fn trim_overlapping_word_prefix(prev: &str, curr: &str) -> Option<String> {
610 let prev_chars: Vec<char> = prev.trim_end().chars().collect();
611 let curr_chars: Vec<char> = curr.trim_start().chars().collect();
612 let max = prev_chars.len().min(curr_chars.len());
613
614 for len in (4..=max).rev() {
615 let prev_start = prev_chars.len() - len;
616 if prev_chars[prev_start..] != curr_chars[..len] {
617 continue;
618 }
619
620 if !curr_chars[..len].iter().all(|ch| ch.is_alphanumeric()) {
621 continue;
622 }
623
624 let prev_boundary = prev_start == 0 || !prev_chars[prev_start - 1].is_alphanumeric();
625 let curr_boundary = len == curr_chars.len() || !curr_chars[len].is_alphanumeric();
626 if !prev_boundary || !curr_boundary {
627 continue;
628 }
629
630 return Some(curr_chars[len..].iter().collect());
631 }
632
633 None
634}
635
636fn compute_adaptive_column_gap(bands: &[TextBand]) -> f64 {
643 let mut all_gaps: Vec<f64> = Vec::new();
644
645 for band in bands {
646 if band.spans.len() < 2 {
647 continue;
648 }
649 let mut sorted = band.spans.clone();
650 sorted.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
651 let mut prev_right = sorted[0].right();
652 for span in sorted.iter().skip(1) {
653 let gap = span.x - prev_right;
654 if gap > 0.0 {
655 all_gaps.push(gap);
656 }
657 prev_right = prev_right.max(span.right());
658 }
659 }
660
661 if all_gaps.is_empty() {
662 return COLUMN_GAP_THRESHOLD_FALLBACK;
663 }
664
665 all_gaps.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
666
667 let min_gap = all_gaps[0];
668
669 if min_gap > COLUMN_GAP_THRESHOLD_MIN {
674 return (min_gap * 0.75).clamp(COLUMN_GAP_THRESHOLD_MIN, COLUMN_GAP_THRESHOLD_MAX);
675 }
676
677 let mut best_break_threshold = 0.0f64;
680 let mut best_ratio = 1.5f64; for pair in all_gaps.windows(2) {
682 if pair[0] > 0.5 {
683 let ratio = pair[1] / pair[0];
684 if ratio > best_ratio {
685 best_ratio = ratio;
686 best_break_threshold = (pair[0] + pair[1]) * 0.5;
687 }
688 }
689 }
690
691 if best_break_threshold > 0.0 {
692 return best_break_threshold.clamp(COLUMN_GAP_THRESHOLD_MIN, COLUMN_GAP_THRESHOLD_MAX);
693 }
694
695 let mid = all_gaps.len() / 2;
697 let median = if all_gaps.len().is_multiple_of(2) {
698 (all_gaps[mid - 1] + all_gaps[mid]) * 0.5
699 } else {
700 all_gaps[mid]
701 };
702
703 (median * COLUMN_GAP_MEDIAN_MULTIPLIER)
704 .clamp(COLUMN_GAP_THRESHOLD_MIN, COLUMN_GAP_THRESHOLD_MAX)
705}
706
707#[derive(Debug, Clone, Copy)]
715struct PageStats {
716 median_font_size: f64,
718 #[allow(dead_code)]
723 median_char_width: f64,
724 median_line_spacing: f64,
730}
731
732impl PageStats {
733 fn from_spans(spans: &[TextSpan]) -> Self {
734 if spans.is_empty() {
735 return Self {
736 median_font_size: 12.0,
737 median_char_width: 6.0,
738 median_line_spacing: 0.0,
739 };
740 }
741
742 let mut sizes: Vec<f64> = spans.iter().map(|s| s.font_size).collect();
744 sizes.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
745 let median_font_size = sizes[sizes.len() / 2];
746
747 let mut char_widths: Vec<f64> = spans
749 .iter()
750 .filter_map(|s| {
751 let chars = s.text.chars().count();
752 if chars > 0 && s.width > 0.0 {
753 Some(s.width / chars as f64)
754 } else {
755 None
756 }
757 })
758 .collect();
759 let median_char_width = if char_widths.is_empty() {
760 median_font_size * 0.5
761 } else {
762 char_widths.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
763 char_widths[char_widths.len() / 2]
764 };
765
766 let band_tolerance = (median_font_size * BAND_Y_FRACTION).max(BAND_Y_TOLERANCE);
769 let mut ys: Vec<f64> = spans.iter().map(|s| s.y).collect();
770 ys.sort_by(|a, b| b.partial_cmp(a).unwrap_or(Ordering::Equal));
771 let mut band_ys: Vec<f64> = Vec::new();
772 for y in ys {
773 if band_ys
774 .last()
775 .map(|prev: &f64| (prev - y).abs() > band_tolerance)
776 .unwrap_or(true)
777 {
778 band_ys.push(y);
779 }
780 }
781 let median_line_spacing = if band_ys.len() < 2 {
790 0.0
791 } else {
792 let mut spacings: Vec<f64> = band_ys
793 .windows(2)
794 .map(|pair| (pair[0] - pair[1]).abs())
795 .collect();
796 spacings.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
797 let q1_index = spacings.len() / 4;
798 spacings[q1_index]
799 };
800
801 Self {
802 median_font_size,
803 median_char_width,
804 median_line_spacing,
805 }
806 }
807}
808
809const XY_CUT_MAX_DEPTH: usize = 12;
814const XY_CUT_VERTICAL_GAP_REGION_FRACTION: f64 = 0.04;
817const XY_CUT_VERTICAL_GAP_FLOOR: f64 = 10.0;
821const XY_CUT_HORIZONTAL_GAP_FONT_MULTIPLIER: f64 = 1.8;
824const XY_CUT_MIN_SPANS_PER_COLUMN: usize = 2;
828const XY_CUT_MIN_CHARS_PER_BAND: f64 = 8.0;
831
832fn group_spans_into_blocks(spans: Vec<TextSpan>) -> Vec<TextBlock> {
836 if spans.is_empty() {
837 return Vec::new();
838 }
839 let stats = PageStats::from_spans(&spans);
840 xy_cut_recursive(spans, 0, &stats)
841}
842
843fn xy_cut_recursive(spans: Vec<TextSpan>, depth: usize, stats: &PageStats) -> Vec<TextBlock> {
844 if spans.is_empty() {
845 return Vec::new();
846 }
847 if depth >= XY_CUT_MAX_DEPTH {
848 return band_based_blocks(spans, stats);
849 }
850
851 let vcut = try_vertical_cut(&spans, stats);
858 let hcut = try_horizontal_cut(&spans, stats);
859
860 let (chosen, _) = match (vcut, hcut) {
861 (Some((v_groups, v_gap)), Some((h_groups, h_gap))) => {
862 if v_gap >= h_gap {
863 (Some(v_groups), v_gap)
864 } else {
865 (Some(h_groups), h_gap)
866 }
867 }
868 (Some((v_groups, v_gap)), None) => (Some(v_groups), v_gap),
869 (None, Some((h_groups, h_gap))) => (Some(h_groups), h_gap),
870 (None, None) => (None, 0.0),
871 };
872
873 if let Some(groups) = chosen {
874 let mut out = Vec::new();
875 for group in groups {
876 out.extend(xy_cut_recursive(group, depth + 1, stats));
877 }
878 return out;
879 }
880
881 band_based_blocks(spans, stats)
882}
883
884fn band_based_blocks(spans: Vec<TextSpan>, stats: &PageStats) -> Vec<TextBlock> {
888 group_spans_into_blocks_legacy_with_stats(spans, stats)
893}
894
895#[allow(dead_code)]
898fn median_font_size(spans: &[TextSpan]) -> f64 {
899 if spans.is_empty() {
900 return 12.0;
901 }
902 let mut sizes: Vec<f64> = spans.iter().map(|s| s.font_size).collect();
903 sizes.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
904 sizes[sizes.len() / 2]
905}
906
907fn try_vertical_cut(spans: &[TextSpan], stats: &PageStats) -> Option<(Vec<Vec<TextSpan>>, f64)> {
924 if spans.len() < 2 * XY_CUT_MIN_SPANS_PER_COLUMN {
925 return None;
926 }
927
928 let region_left = spans.iter().map(|s| s.x).fold(f64::INFINITY, f64::min);
929 let region_right = spans
930 .iter()
931 .map(TextSpan::right)
932 .fold(f64::NEG_INFINITY, f64::max);
933 let region_width = region_right - region_left;
934 if region_width <= 0.0 {
935 return None;
936 }
937
938 let bands = group_spans_into_bands_with_stats(spans.to_vec(), stats);
947 let adaptive = compute_adaptive_column_gap(&bands);
948 let floor = stats
949 .median_font_size
950 .max(region_width * XY_CUT_VERTICAL_GAP_REGION_FRACTION)
951 .max(XY_CUT_VERTICAL_GAP_FLOOR);
952 let min_gap = adaptive.min(floor).max(XY_CUT_VERTICAL_GAP_FLOOR);
953
954 let mut intervals: Vec<(f64, f64)> = spans
957 .iter()
958 .map(|s| (s.x, s.right().max(s.x + 0.001)))
959 .collect();
960 intervals.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(Ordering::Equal));
961
962 let mut cursor = intervals[0].1;
963 let mut best_gap: Option<(f64, f64)> = None; for (left, right) in intervals.iter().skip(1) {
965 if *left > cursor {
966 let gap = *left - cursor;
967 if gap >= min_gap {
968 match best_gap {
969 Some((best, _)) if best >= gap => {}
970 _ => {
971 let cut_x = (cursor + *left) * 0.5;
972 best_gap = Some((gap, cut_x));
973 }
974 }
975 }
976 }
977 cursor = cursor.max(*right);
978 }
979
980 let (gap_size, cut_x) = best_gap?;
981
982 let mut left_group = Vec::new();
985 let mut right_group = Vec::new();
986 for span in spans {
987 let midpoint = span.x + (span.right() - span.x) * 0.5;
988 if midpoint < cut_x {
989 left_group.push(span.clone());
990 } else {
991 right_group.push(span.clone());
992 }
993 }
994
995 if !columns_are_dense(&left_group, &right_group, stats) {
996 return None;
997 }
998 if !columns_are_band_aligned(spans, cut_x, region_left, region_right, stats) {
999 return None;
1000 }
1001
1002 Some((vec![left_group, right_group], gap_size))
1003}
1004
1005fn columns_are_band_aligned(
1011 spans: &[TextSpan],
1012 cut_x: f64,
1013 region_left: f64,
1014 region_right: f64,
1015 stats: &PageStats,
1016) -> bool {
1017 let left_width = (cut_x - region_left).max(1.0);
1018 let right_width = (region_right - cut_x).max(1.0);
1019
1020 const MAX_SINGLE_SIDE_FRACTION: f64 = 0.70;
1025
1026 let bands = group_spans_into_bands_with_stats(spans.to_vec(), stats);
1027 for band in &bands {
1028 let mut has_left = false;
1029 let mut has_right = false;
1030 for span in &band.spans {
1031 let midpoint = span.x + (span.right() - span.x) * 0.5;
1032 if midpoint < cut_x {
1033 has_left = true;
1034 } else {
1035 has_right = true;
1036 }
1037 }
1038 if has_left && has_right {
1039 continue; }
1041 let band_width = band.width();
1042 if has_left && band_width > left_width * MAX_SINGLE_SIDE_FRACTION {
1043 return false;
1044 }
1045 if has_right && band_width > right_width * MAX_SINGLE_SIDE_FRACTION {
1046 return false;
1047 }
1048 }
1049 true
1050}
1051
1052fn columns_are_dense(left: &[TextSpan], right: &[TextSpan], stats: &PageStats) -> bool {
1057 for col in [left, right] {
1058 if col.len() < XY_CUT_MIN_SPANS_PER_COLUMN {
1059 return false;
1060 }
1061 let bands = group_spans_into_bands_with_stats(col.to_vec(), stats);
1062 if bands.is_empty() {
1063 return false;
1064 }
1065 let total_chars: usize = col.iter().map(|s| s.text.chars().count()).sum();
1066 let chars_per_band = total_chars as f64 / bands.len() as f64;
1067 if chars_per_band < XY_CUT_MIN_CHARS_PER_BAND {
1068 return false;
1069 }
1070 }
1071 true
1072}
1073
1074fn try_horizontal_cut(spans: &[TextSpan], stats: &PageStats) -> Option<(Vec<Vec<TextSpan>>, f64)> {
1078 if spans.len() < 2 {
1079 return None;
1080 }
1081 let mut sorted = spans.to_vec();
1083 sorted.sort_by(|a, b| {
1084 b.y.partial_cmp(&a.y)
1085 .unwrap_or(Ordering::Equal)
1086 .then_with(|| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal))
1087 });
1088
1089 let min_gap = if stats.median_line_spacing > 0.0 {
1095 stats.median_line_spacing * PARAGRAPH_BREAK_LINE_SPACING_MULTIPLIER
1096 } else {
1097 stats.median_font_size * XY_CUT_HORIZONTAL_GAP_FONT_MULTIPLIER
1098 };
1099
1100 let mut best: Option<(f64, f64)> = None; let tolerance = stats.median_font_size * BAND_Y_FRACTION;
1103 let mut band_bottom = sorted[0].y;
1104
1105 for span in sorted.iter().skip(1) {
1106 if (band_bottom - span.y).abs() <= tolerance {
1107 band_bottom = band_bottom.min(span.y);
1108 continue;
1109 }
1110 let gap = band_bottom - span.y;
1111 if gap >= min_gap {
1112 let cut_y = (band_bottom + span.y) * 0.5;
1113 match best {
1114 Some((best_gap, _)) if best_gap >= gap => {}
1115 _ => best = Some((gap, cut_y)),
1116 }
1117 }
1118 band_bottom = span.y;
1119 }
1120
1121 let (gap_size, cut_y) = best?;
1122
1123 let mut top_group = Vec::new();
1124 let mut bottom_group = Vec::new();
1125 for span in spans {
1126 if span.y > cut_y {
1127 top_group.push(span.clone());
1128 } else {
1129 bottom_group.push(span.clone());
1130 }
1131 }
1132 if top_group.is_empty() || bottom_group.is_empty() {
1133 return None;
1134 }
1135 Some((vec![top_group, bottom_group], gap_size))
1136}
1137
1138#[allow(dead_code)]
1142fn group_spans_into_blocks_legacy(spans: Vec<TextSpan>) -> Vec<TextBlock> {
1143 let bands = group_spans_into_bands(spans);
1144 group_spans_into_blocks_legacy_from_bands(bands)
1145}
1146
1147fn group_spans_into_blocks_legacy_with_stats(
1148 spans: Vec<TextSpan>,
1149 stats: &PageStats,
1150) -> Vec<TextBlock> {
1151 let bands = group_spans_into_bands_with_stats(spans, stats);
1152 group_spans_into_blocks_legacy_from_bands(bands)
1153}
1154
1155fn group_spans_into_blocks_legacy_from_bands(bands: Vec<TextBand>) -> Vec<TextBlock> {
1156 if bands.is_empty() {
1157 return Vec::new();
1158 }
1159
1160 let column_gap_threshold = compute_adaptive_column_gap(&bands);
1161
1162 let mut blocks = Vec::new();
1163 let mut idx = 0;
1164
1165 while idx < bands.len() {
1166 let gap_midpoints = bands[idx].gap_midpoints(column_gap_threshold);
1167 if gap_midpoints.is_empty() {
1168 blocks.push(bands[idx].row_block());
1169 idx += 1;
1170 continue;
1171 }
1172
1173 let mut boundaries = gap_midpoints.clone();
1174 let mut band_indices = vec![idx];
1175 let mut gapped_band_count = 1usize;
1176 let mut region_left = bands[idx].left();
1177 let mut region_right = bands[idx].right();
1178 let mut next_idx = idx + 1;
1179
1180 while next_idx < bands.len() {
1181 let next_band = &bands[next_idx];
1182 let next_gap_midpoints = next_band.gap_midpoints(column_gap_threshold);
1183 if next_gap_midpoints.is_empty() {
1184 if next_band
1185 .fits_single_column(&boundaries, region_left, region_right)
1186 .is_some()
1187 {
1188 band_indices.push(next_idx);
1189 next_idx += 1;
1190 continue;
1191 }
1192 break;
1193 }
1194
1195 if !boundaries_match(&boundaries, &next_gap_midpoints, column_gap_threshold) {
1196 break;
1197 }
1198
1199 update_boundaries(&mut boundaries, &next_gap_midpoints, gapped_band_count);
1200 gapped_band_count += 1;
1201 band_indices.push(next_idx);
1202 region_left = region_left.min(next_band.left());
1203 region_right = region_right.max(next_band.right());
1204 next_idx += 1;
1205 }
1206
1207 if region_is_columnar(&bands, &band_indices, &boundaries, gapped_band_count) {
1208 append_column_region_blocks(&bands, &band_indices, &boundaries, &mut blocks);
1209 idx = next_idx;
1210 } else {
1211 blocks.push(bands[idx].row_block());
1212 idx += 1;
1213 }
1214 }
1215
1216 blocks
1217}
1218
1219fn group_spans_into_bands(spans: Vec<TextSpan>) -> Vec<TextBand> {
1223 let stats = PageStats::from_spans(&spans);
1224 group_spans_into_bands_with_stats(spans, &stats)
1225}
1226
1227fn group_spans_into_bands_with_stats(mut spans: Vec<TextSpan>, stats: &PageStats) -> Vec<TextBand> {
1228 if spans.is_empty() {
1229 return Vec::new();
1230 }
1231
1232 spans.sort_by(|a, b| {
1233 b.y.partial_cmp(&a.y)
1234 .unwrap_or(Ordering::Equal)
1235 .then_with(|| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal))
1236 });
1237
1238 let page_tolerance = (stats.median_font_size * BAND_Y_FRACTION).max(BAND_Y_TOLERANCE);
1244
1245 let mut bands: Vec<TextBand> = Vec::new();
1246
1247 for span in spans {
1248 let tolerance = (span.height * BAND_Y_FRACTION)
1249 .max(page_tolerance)
1250 .max(BAND_Y_TOLERANCE);
1251 if let Some(band) = bands
1252 .iter_mut()
1253 .find(|band| (band.y - span.y).abs() <= tolerance)
1254 {
1255 let span_count = band.spans.len() as f64;
1256 band.y = (band.y * span_count + span.y) / (span_count + 1.0);
1257 band.spans.push(span);
1258 } else {
1259 bands.push(TextBand::new(span));
1260 }
1261 }
1262
1263 for band in &mut bands {
1264 band.sort_spans();
1265 }
1266
1267 bands.sort_by(|a, b| b.y.partial_cmp(&a.y).unwrap_or(Ordering::Equal));
1268 bands
1269}
1270
1271fn boundaries_match(boundaries: &[f64], gap_midpoints: &[f64], column_gap_threshold: f64) -> bool {
1272 let tolerance = (column_gap_threshold * 1.5).clamp(COLUMN_GAP_MATCH_TOLERANCE, 60.0);
1273 boundaries.len() == gap_midpoints.len()
1274 && boundaries
1275 .iter()
1276 .zip(gap_midpoints)
1277 .all(|(lhs, rhs)| (lhs - rhs).abs() <= tolerance)
1278}
1279
1280fn update_boundaries(boundaries: &mut [f64], gap_midpoints: &[f64], seen_gapped_bands: usize) {
1281 for (boundary, midpoint) in boundaries.iter_mut().zip(gap_midpoints) {
1282 *boundary =
1283 (*boundary * seen_gapped_bands as f64 + midpoint) / (seen_gapped_bands as f64 + 1.0);
1284 }
1285}
1286
1287fn region_is_columnar(
1288 bands: &[TextBand],
1289 band_indices: &[usize],
1290 boundaries: &[f64],
1291 gapped_band_count: usize,
1292) -> bool {
1293 if boundaries.is_empty()
1294 || gapped_band_count < MIN_COLUMN_GAPPED_BANDS
1295 || band_indices.is_empty()
1296 || (gapped_band_count as f64 / band_indices.len() as f64) < MIN_COLUMN_GAP_SUPPORT
1297 {
1298 return false;
1299 }
1300
1301 let mut non_empty_slices = 0usize;
1302 let mut dense_slices = 0usize;
1303 let mut slices_per_column = vec![0usize; boundaries.len() + 1];
1304
1305 for &band_idx in band_indices {
1306 let slices = bands[band_idx].split_by_boundaries(boundaries);
1307 for (column_idx, slice) in slices.iter().enumerate() {
1308 if slice.is_empty() {
1309 continue;
1310 }
1311
1312 non_empty_slices += 1;
1313 slices_per_column[column_idx] += 1;
1314
1315 let char_count = slice
1316 .iter()
1317 .map(|span| span.text.chars().count())
1318 .sum::<usize>();
1319 if slice.len() >= 2 || char_count >= 8 {
1320 dense_slices += 1;
1321 }
1322 }
1323 }
1324
1325 if non_empty_slices < boundaries.len() + 2 {
1326 return false;
1327 }
1328
1329 if slices_per_column.contains(&0) {
1330 return false;
1331 }
1332
1333 (dense_slices as f64 / non_empty_slices as f64) >= MIN_DENSE_SLICE_RATIO
1334}
1335
1336fn append_column_region_blocks(
1337 bands: &[TextBand],
1338 band_indices: &[usize],
1339 boundaries: &[f64],
1340 blocks: &mut Vec<TextBlock>,
1341) {
1342 let column_count = boundaries.len() + 1;
1343 let mut column_bands = vec![Vec::<TextSpan>::new(); column_count];
1344
1345 for &band_idx in band_indices {
1346 let slices = bands[band_idx].split_by_boundaries(boundaries);
1347 for (column_idx, slice) in slices.into_iter().enumerate() {
1348 if slice.is_empty() {
1349 continue;
1350 }
1351 column_bands[column_idx].push(TextSpan {
1352 text: String::new(),
1353 x: 0.0,
1354 y: 0.0,
1355 width: 0.0,
1356 height: 0.0,
1357 font_size: 0.0,
1358 });
1359 let marker_idx = column_bands[column_idx].len() - 1;
1360 column_bands[column_idx][marker_idx] = TextSpan {
1361 text: String::new(),
1362 x: f64::NEG_INFINITY,
1363 y: bands[band_idx].y,
1364 width: 0.0,
1365 height: 0.0,
1366 font_size: 0.0,
1367 };
1368 column_bands[column_idx].extend(slice);
1369 }
1370 }
1371
1372 for spans in column_bands {
1373 let mut current: Vec<TextSpan> = Vec::new();
1374 for span in spans {
1375 if span.x == f64::NEG_INFINITY {
1376 if !current.is_empty() {
1377 current.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
1378 blocks.push(TextBlock {
1379 spans: std::mem::take(&mut current),
1380 });
1381 }
1382 continue;
1383 }
1384 current.push(span);
1385 }
1386 if !current.is_empty() {
1387 current.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
1388 blocks.push(TextBlock { spans: current });
1389 }
1390 }
1391}
1392
1393fn stitch_hyphenated_lines(lines: &[String]) -> String {
1408 let mut out = String::new();
1409 for (idx, line) in lines.iter().enumerate() {
1410 if idx == 0 {
1411 out.push_str(line);
1412 continue;
1413 }
1414
1415 let next_trimmed = line.trim_start();
1416
1417 let should_merge = is_hyphen_wrap_candidate(&out, next_trimmed);
1419
1420 if should_merge {
1421 out.pop(); out.push_str(next_trimmed);
1423 } else {
1424 out.push('\n');
1425 out.push_str(line);
1426 }
1427 }
1428 out
1429}
1430
1431fn is_hyphen_wrap_candidate(accumulated: &str, next_trimmed: &str) -> bool {
1434 if !accumulated.ends_with('-') {
1436 return false;
1437 }
1438
1439 let before_hyphen = accumulated.chars().rev().nth(1);
1441 if !before_hyphen.is_some_and(|c| c.is_alphabetic()) {
1442 return false;
1443 }
1444
1445 let alpha_prefix_len = accumulated
1447 .chars()
1448 .rev()
1449 .skip(1) .take_while(|c| c.is_alphabetic())
1451 .count();
1452 if alpha_prefix_len < 3 {
1453 return false;
1454 }
1455
1456 let first_next = next_trimmed.chars().next();
1458 if !first_next.is_some_and(|c| c.is_ascii_lowercase()) {
1459 return false;
1460 }
1461
1462 let next_alpha_len = next_trimmed
1464 .chars()
1465 .take_while(|c| c.is_ascii_lowercase())
1466 .count();
1467 if next_alpha_len < 3 {
1468 return false;
1469 }
1470
1471 true
1472}
1473
1474pub(crate) fn normalize_text_output(text: &str) -> String {
1481 if text.is_empty() {
1482 return String::new();
1483 }
1484
1485 let mut lines: Vec<&str> = Vec::new();
1486 for line in text.split('\n') {
1487 lines.push(line.trim_end());
1488 }
1489
1490 while lines.last() == Some(&"") {
1492 lines.pop();
1493 }
1494
1495 if lines.is_empty() {
1496 return String::new();
1497 }
1498
1499 let mut result = String::with_capacity(text.len());
1500 let mut consecutive_empty = 0u32;
1501
1502 for (i, line) in lines.iter().enumerate() {
1503 if line.is_empty() || *line == "\x0C" {
1504 if line.is_empty() {
1505 consecutive_empty += 1;
1506 if consecutive_empty <= 2 {
1508 result.push('\n');
1509 }
1510 } else {
1511 consecutive_empty = 0;
1513 result.push_str(line);
1514 if i + 1 < lines.len() {
1515 result.push('\n');
1516 }
1517 }
1518 } else {
1519 consecutive_empty = 0;
1521 result.push_str(line);
1522 if i + 1 < lines.len() {
1523 result.push('\n');
1524 }
1525 }
1526 }
1527
1528 if !result.is_empty() && !result.ends_with('\n') {
1530 result.push('\n');
1531 }
1532
1533 result
1534}
1535
1536#[cfg(test)]
1537mod tests {
1538 use super::*;
1539
1540 fn span(text: &str, x: f64, y: f64, width: f64) -> TextSpan {
1541 TextSpan {
1542 text: text.into(),
1543 x,
1544 y,
1545 width,
1546 height: 12.0,
1547 font_size: 12.0,
1548 }
1549 }
1550
1551 fn block_texts(spans: Vec<TextSpan>) -> Vec<String> {
1552 group_spans_into_blocks(spans)
1553 .into_iter()
1554 .map(|block| block.text())
1555 .collect()
1556 }
1557
1558 #[test]
1559 fn empty_device_produces_empty_text() {
1560 let dev = TextExtractionDevice::new();
1561 assert!(dev.into_text().is_empty());
1562 }
1563
1564 #[test]
1565 fn single_column_stays_row_major() {
1566 let texts = block_texts(vec![
1567 span("Single Column Line 1", 40.0, 700.0, 140.0),
1568 span("Single Column Line 2", 40.0, 684.0, 140.0),
1569 span("Single Column Line 3", 40.0, 668.0, 140.0),
1570 ]);
1571
1572 assert_eq!(
1573 texts,
1574 vec![
1575 "Single Column Line 1",
1576 "Single Column Line 2",
1577 "Single Column Line 3",
1578 ]
1579 );
1580 }
1581
1582 #[test]
1583 fn two_column_region_reads_column_major() {
1584 let texts = block_texts(vec![
1585 span("Header", 200.0, 740.0, 80.0),
1586 span("Left column line one", 40.0, 700.0, 115.0),
1587 span("Right column line one", 320.0, 700.0, 120.0),
1588 span("Left column line two", 40.0, 684.0, 115.0),
1589 span("Right column line two", 320.0, 684.0, 120.0),
1590 span("Left column line three", 40.0, 668.0, 125.0),
1591 span("Right column line three", 320.0, 668.0, 130.0),
1592 span("Footer", 200.0, 620.0, 80.0),
1593 ]);
1594
1595 assert_eq!(
1596 texts,
1597 vec![
1598 "Header",
1599 "Left column line one",
1600 "Left column line two",
1601 "Left column line three",
1602 "Right column line one",
1603 "Right column line two",
1604 "Right column line three",
1605 "Footer",
1606 ]
1607 );
1608 }
1609
1610 #[test]
1611 fn mixed_single_and_multi_column_regions_preserve_shared_bands() {
1612 let texts = block_texts(vec![
1613 span("Intro paragraph", 40.0, 740.0, 180.0),
1614 span("L1 words here", 40.0, 700.0, 110.0),
1615 span("R1 words here", 320.0, 700.0, 110.0),
1616 span("L2 words here", 40.0, 684.0, 110.0),
1617 span("R2 words here", 320.0, 684.0, 110.0),
1618 span("L3 words here", 40.0, 668.0, 110.0),
1619 span("R3 words here", 320.0, 668.0, 110.0),
1620 span("Outro paragraph", 40.0, 620.0, 180.0),
1621 ]);
1622
1623 assert_eq!(
1624 texts,
1625 vec![
1626 "Intro paragraph",
1627 "L1 words here",
1628 "L2 words here",
1629 "L3 words here",
1630 "R1 words here",
1631 "R2 words here",
1632 "R3 words here",
1633 "Outro paragraph",
1634 ]
1635 );
1636 }
1637
1638 #[test]
1639 fn short_table_like_rows_fall_back_to_row_major() {
1640 let texts = block_texts(vec![
1641 span("Name", 40.0, 700.0, 30.0),
1642 span("Age", 320.0, 700.0, 20.0),
1643 span("Alice", 40.0, 684.0, 35.0),
1644 span("30", 320.0, 684.0, 15.0),
1645 span("Bob", 40.0, 668.0, 24.0),
1646 span("25", 320.0, 668.0, 15.0),
1647 ]);
1648
1649 assert_eq!(texts, vec!["Name Age", "Alice 30", "Bob 25"]);
1650 }
1651
1652 #[test]
1653 fn three_column_regions_are_supported() {
1654 let texts = block_texts(vec![
1655 span("Column one line one", 40.0, 700.0, 105.0),
1656 span("Column two line one", 220.0, 700.0, 105.0),
1657 span("Column three line one", 400.0, 700.0, 120.0),
1658 span("Column one line two", 40.0, 684.0, 105.0),
1659 span("Column two line two", 220.0, 684.0, 105.0),
1660 span("Column three line two", 400.0, 684.0, 120.0),
1661 span("Column one line three", 40.0, 668.0, 120.0),
1662 span("Column two line three", 220.0, 668.0, 120.0),
1663 span("Column three line three", 400.0, 668.0, 135.0),
1664 ]);
1665
1666 assert_eq!(
1667 texts,
1668 vec![
1669 "Column one line one",
1670 "Column one line two",
1671 "Column one line three",
1672 "Column two line one",
1673 "Column two line two",
1674 "Column two line three",
1675 "Column three line one",
1676 "Column three line two",
1677 "Column three line three",
1678 ]
1679 );
1680 }
1681
1682 #[test]
1683 fn text_block_concatenation_spaced() {
1684 let block = TextBlock {
1685 spans: vec![span("A", 0.0, 0.0, 6.0), span("B", 20.0, 0.0, 6.0)],
1686 };
1687 assert_eq!(block.text(), "A B");
1688 }
1689
1690 #[test]
1691 fn adaptive_column_gap_fallback_for_no_gaps() {
1692 let bands = vec![
1694 TextBand::new(span("Hello", 40.0, 700.0, 80.0)),
1695 TextBand::new(span("World", 40.0, 684.0, 80.0)),
1696 ];
1697 let threshold = compute_adaptive_column_gap(&bands);
1698 assert!((threshold - COLUMN_GAP_THRESHOLD_FALLBACK).abs() < 0.01);
1699 }
1700
1701 #[test]
1702 fn adaptive_column_gap_uses_median() {
1703 let mut bands = Vec::new();
1705 for y in [700.0, 684.0, 668.0] {
1706 let mut band = TextBand::new(span("word1", 40.0, y, 30.0));
1707 band.spans.push(span("word2", 74.0, y, 30.0)); band.spans.push(span("word3", 108.0, y, 30.0)); bands.push(band);
1710 }
1711 let threshold = compute_adaptive_column_gap(&bands);
1712 assert!(
1714 threshold >= 10.0 && threshold <= 14.0,
1715 "expected ~12, got {threshold}"
1716 );
1717 }
1718
1719 #[test]
1720 fn adaptive_column_gap_clamps_to_min() {
1721 let mut bands = Vec::new();
1723 for y in [700.0, 684.0, 668.0, 652.0] {
1724 let mut band = TextBand::new(span("abc", 0.0, y, 18.0));
1725 band.spans.push(span("def", 20.0, y, 18.0));
1727 bands.push(band);
1728 }
1729 let threshold = compute_adaptive_column_gap(&bands);
1730 assert!(
1731 (threshold - COLUMN_GAP_THRESHOLD_MIN).abs() < 0.01,
1732 "expected {COLUMN_GAP_THRESHOLD_MIN}, got {threshold}"
1733 );
1734 }
1735
1736 #[test]
1737 fn adaptive_column_gap_all_large_gaps_uses_fraction_of_min() {
1738 let mut band = TextBand::new(span("Left", 0.0, 700.0, 30.0));
1740 band.spans.push(span("Right", 80.0, 700.0, 30.0)); let bands = vec![band];
1742 let threshold = compute_adaptive_column_gap(&bands);
1743 assert!(
1744 (threshold - 37.5).abs() < 0.01,
1745 "expected 37.5 (0.75×50), got {threshold}"
1746 );
1747 }
1748
1749 #[test]
1750 fn normalize_trims_trailing_whitespace_per_line() {
1751 assert_eq!(
1752 normalize_text_output("hello \nworld \n"),
1753 "hello\nworld\n"
1754 );
1755 }
1756
1757 #[test]
1758 fn normalize_collapses_excess_newlines() {
1759 assert_eq!(
1761 normalize_text_output("hello\n\n\n\n\nworld\n"),
1762 "hello\n\n\nworld\n"
1763 );
1764 }
1765
1766 #[test]
1767 fn normalize_preserves_double_newline() {
1768 assert_eq!(
1769 normalize_text_output("paragraph one\n\nparagraph two\n"),
1770 "paragraph one\n\nparagraph two\n"
1771 );
1772 }
1773
1774 #[test]
1775 fn normalize_preserves_form_feed() {
1776 assert_eq!(
1777 normalize_text_output("page1\n\n\x0Cpage2\n"),
1778 "page1\n\n\x0Cpage2\n"
1779 );
1780 }
1781
1782 #[test]
1783 fn normalize_adds_trailing_newline() {
1784 assert_eq!(normalize_text_output("hello"), "hello\n");
1785 }
1786
1787 #[test]
1788 fn normalize_empty_input() {
1789 assert_eq!(normalize_text_output(""), "");
1790 }
1791
1792 #[test]
1793 fn normalize_only_whitespace() {
1794 assert_eq!(normalize_text_output(" \n \n"), "");
1795 }
1796
1797 #[test]
1800 fn hyphen_stitch_joins_wrapped_word() {
1801 let lines = vec!["the aver-".into(), "age rainfall".into()];
1802 assert_eq!(stitch_hyphenated_lines(&lines), "the average rainfall");
1803 }
1804
1805 #[test]
1806 fn hyphen_stitch_handles_leading_whitespace() {
1807 let lines = vec!["pre-".into(), " dict the outcome".into()];
1808 assert_eq!(stitch_hyphenated_lines(&lines), "predict the outcome");
1810 }
1811
1812 #[test]
1813 fn hyphen_stitch_capital_continuation_not_stitched() {
1814 let lines = vec!["Section three-".into(), "Summary here".into()];
1815 assert_eq!(
1816 stitch_hyphenated_lines(&lines),
1817 "Section three-\nSummary here"
1818 );
1819 }
1820
1821 #[test]
1822 fn hyphen_stitch_bullet_dash_not_stitched() {
1823 let lines = vec!["Items:".into(), "-".into(), "milk".into()];
1825 assert_eq!(stitch_hyphenated_lines(&lines), "Items:\n-\nmilk");
1826 }
1827
1828 #[test]
1829 fn hyphen_stitch_numeric_range_not_stitched() {
1830 let lines = vec!["page 42-".into(), "seventy".into()];
1832 assert_eq!(stitch_hyphenated_lines(&lines), "page 42-\nseventy");
1833 }
1834
1835 #[test]
1836 fn hyphen_stitch_short_prefix_not_stitched() {
1837 let lines = vec!["re-".into(), "organize".into()];
1839 assert_eq!(stitch_hyphenated_lines(&lines), "re-\norganize");
1840 }
1841
1842 #[test]
1843 fn hyphen_stitch_short_continuation_not_stitched() {
1844 let lines = vec!["counter-".into(), "an example".into()];
1846 assert_eq!(stitch_hyphenated_lines(&lines), "counter-\nan example");
1847 }
1848
1849 #[test]
1850 fn hyphen_stitch_compound_word_midline_preserved() {
1851 let lines = vec!["real-time system".into()];
1854 assert_eq!(stitch_hyphenated_lines(&lines), "real-time system");
1855 }
1856
1857 #[test]
1858 fn hyphen_stitch_single_line_unchanged() {
1859 let lines = vec!["only line".into()];
1860 assert_eq!(stitch_hyphenated_lines(&lines), "only line");
1861 }
1862
1863 #[test]
1864 fn hyphen_stitch_empty_input() {
1865 let lines: Vec<String> = vec![];
1866 assert_eq!(stitch_hyphenated_lines(&lines), "");
1867 }
1868
1869 fn make_device_with_median(median: f64) -> TextExtractionDevice {
1872 let mut dev = TextExtractionDevice::new();
1873 for _ in 0..MEDIAN_REFRESH {
1875 dev.glyph_widths.push(median);
1876 }
1877 dev.refresh_median_char_width();
1878 assert!((dev.cached_median_char_width - median).abs() < 1e-9);
1879 dev
1880 }
1881
1882 #[test]
1883 fn consensus_inserts_space_on_strong_tj_offset_alone() {
1884 let mut dev = make_device_with_median(6.0);
1887 dev.pending_tj_offset = 250.0; assert!(dev.evaluate_space_consensus(0.5, 12.0, "Hello", "World"));
1889 }
1890
1891 #[test]
1892 fn consensus_inserts_space_on_geometric_gap_alone() {
1893 let dev = make_device_with_median(6.0);
1895 assert!(dev.evaluate_space_consensus(2.5, 12.0, "hello", "world"));
1898 }
1899
1900 #[test]
1901 fn consensus_no_space_on_kerning_gap() {
1902 let dev = make_device_with_median(6.0);
1906 assert!(!dev.evaluate_space_consensus(0.5, 12.0, "fi", "lm"));
1907 }
1908
1909 #[test]
1910 fn consensus_inserts_space_on_camel_case_plus_gap() {
1911 let dev = make_device_with_median(6.0);
1915 assert!(dev.evaluate_space_consensus(2.5, 12.0, "helloWorld", "Inc"));
1916 }
1917
1918 #[test]
1919 fn consensus_inserts_space_on_digit_letter_transition_with_gap() {
1920 let dev = make_device_with_median(6.0);
1921 assert!(dev.evaluate_space_consensus(2.5, 12.0, "123", "abc"));
1922 }
1923
1924 #[test]
1925 fn consensus_heuristic_alone_is_insufficient() {
1926 let dev = make_device_with_median(6.0);
1931 assert!(!dev.evaluate_space_consensus(0.5, 12.0, "camel", "Case"));
1932 }
1933
1934 #[test]
1935 fn consensus_falls_back_to_font_size_when_no_median() {
1936 let dev = TextExtractionDevice::new();
1938 assert!(dev.evaluate_space_consensus(1.9, 12.0, "a", "b"));
1940 assert!(!dev.evaluate_space_consensus(1.5, 12.0, "a", "b"));
1942 }
1943
1944 #[test]
1945 fn consensus_ignores_tiny_tj_offsets() {
1946 let mut dev = make_device_with_median(6.0);
1948 dev.pending_tj_offset = 50.0;
1949 assert!(!dev.evaluate_space_consensus(0.5, 12.0, "Hello", "World"));
1950 }
1951
1952 #[test]
1953 fn consensus_accepts_negative_tj_offsets() {
1954 let mut dev = make_device_with_median(6.0);
1957 dev.pending_tj_offset = -250.0;
1958 assert!(dev.evaluate_space_consensus(0.5, 12.0, "Hello", "World"));
1959 }
1960
1961 #[test]
1962 fn text_adjustment_accumulates_until_glyph() {
1963 let mut dev = TextExtractionDevice::new();
1964 dev.text_adjustment(120.0);
1965 dev.text_adjustment(140.0);
1966 assert!((dev.pending_tj_offset - 260.0).abs() < 1e-6);
1967 }
1968
1969 #[test]
1972 fn xy_cut_header_body_footer_with_two_columns() {
1973 let texts = block_texts(vec![
1979 span("HEADLINE TITLE", 180.0, 760.0, 120.0),
1980 span("Left col line A", 40.0, 700.0, 110.0),
1981 span("Right col line A", 320.0, 700.0, 115.0),
1982 span("Left col line B", 40.0, 684.0, 110.0),
1983 span("Right col line B", 320.0, 684.0, 115.0),
1984 span("Left col line C", 40.0, 668.0, 110.0),
1985 span("Right col line C", 320.0, 668.0, 115.0),
1986 span("FOOTER LINE TEXT", 180.0, 600.0, 120.0),
1987 ]);
1988 assert_eq!(texts.first().map(String::as_str), Some("HEADLINE TITLE"));
1989 assert_eq!(texts.last().map(String::as_str), Some("FOOTER LINE TEXT"));
1990 let left_c_idx = texts.iter().position(|s| s == "Left col line C").unwrap();
1992 let right_a_idx = texts.iter().position(|s| s == "Right col line A").unwrap();
1993 assert!(
1994 left_c_idx < right_a_idx,
1995 "expected column-major ordering in body: {texts:?}"
1996 );
1997 }
1998
1999 #[test]
2000 fn xy_cut_rejects_column_split_on_table_rows() {
2001 let texts = block_texts(vec![
2004 span("Name", 40.0, 700.0, 30.0),
2005 span("Age", 320.0, 700.0, 20.0),
2006 span("Alice", 40.0, 684.0, 35.0),
2007 span("30", 320.0, 684.0, 15.0),
2008 ]);
2009 assert_eq!(texts, vec!["Name Age", "Alice 30"]);
2010 }
2011
2012 #[test]
2013 fn xy_cut_rejects_column_split_when_one_band_is_full_width() {
2014 let texts = block_texts(vec![
2018 span(
2019 "Full width intro spanning both columns here",
2020 40.0,
2021 740.0,
2022 360.0,
2023 ),
2024 span("Left A", 40.0, 700.0, 50.0),
2025 span("Right A", 320.0, 700.0, 50.0),
2026 span("Left B", 40.0, 684.0, 50.0),
2027 span("Right B", 320.0, 684.0, 50.0),
2028 ]);
2029 assert!(
2030 texts[0].contains("Full width intro"),
2031 "expected full-width intro first: {texts:?}"
2032 );
2033 }
2034
2035 #[test]
2036 fn xy_cut_horizontal_split_for_zone_boundaries() {
2037 let texts = block_texts(vec![
2041 span("First paragraph body text", 40.0, 740.0, 200.0),
2042 span("Second paragraph body", 40.0, 680.0, 180.0),
2043 ]);
2044 assert_eq!(texts.len(), 2);
2045 assert!(texts[0].starts_with("First"));
2046 assert!(texts[1].starts_with("Second"));
2047 }
2048
2049 #[test]
2050 fn xy_cut_recursion_terminates_with_single_span() {
2051 let texts = block_texts(vec![span("Only one span on the page", 40.0, 700.0, 180.0)]);
2052 assert_eq!(texts, vec!["Only one span on the page"]);
2053 }
2054
2055 #[test]
2056 fn median_font_size_handles_mixed_sizes() {
2057 let spans = vec![
2058 TextSpan {
2059 text: "small".into(),
2060 x: 0.0,
2061 y: 0.0,
2062 width: 10.0,
2063 height: 8.0,
2064 font_size: 8.0,
2065 },
2066 TextSpan {
2067 text: "medium".into(),
2068 x: 0.0,
2069 y: 0.0,
2070 width: 10.0,
2071 height: 12.0,
2072 font_size: 12.0,
2073 },
2074 TextSpan {
2075 text: "large".into(),
2076 x: 0.0,
2077 y: 0.0,
2078 width: 10.0,
2079 height: 24.0,
2080 font_size: 24.0,
2081 },
2082 ];
2083 assert!((median_font_size(&spans) - 12.0).abs() < 1e-9);
2084 }
2085
2086 #[test]
2087 fn columns_band_aligned_accepts_aligned_columns() {
2088 let spans = vec![
2089 span("L1", 40.0, 700.0, 60.0),
2090 span("R1", 300.0, 700.0, 60.0),
2091 span("L2", 40.0, 684.0, 60.0),
2092 span("R2", 300.0, 684.0, 60.0),
2093 ];
2094 let stats = PageStats::from_spans(&spans);
2095 assert!(columns_are_band_aligned(&spans, 200.0, 40.0, 360.0, &stats));
2097 }
2098
2099 #[test]
2100 fn columns_band_aligned_rejects_wide_single_side_band() {
2101 let spans = vec![
2102 span("Wide banner line across top", 40.0, 740.0, 280.0),
2103 span("L1", 40.0, 700.0, 60.0),
2104 span("R1", 300.0, 700.0, 60.0),
2105 ];
2106 let stats = PageStats::from_spans(&spans);
2107 assert!(!columns_are_band_aligned(
2110 &spans, 200.0, 40.0, 360.0, &stats
2111 ));
2112 }
2113
2114 #[test]
2115 fn page_stats_computes_median_values() {
2116 let spans = vec![
2117 span("one", 40.0, 700.0, 30.0),
2118 span("two", 40.0, 680.0, 30.0),
2119 span("three", 40.0, 660.0, 50.0),
2120 ];
2121 let stats = PageStats::from_spans(&spans);
2122 assert!((stats.median_font_size - 12.0).abs() < 1e-9);
2123 assert!((stats.median_char_width - 10.0).abs() < 1e-9);
2125 assert!((stats.median_line_spacing - 20.0).abs() < 1e-9);
2127 }
2128
2129 #[test]
2130 fn page_stats_handles_empty_input() {
2131 let stats = PageStats::from_spans(&[]);
2132 assert!((stats.median_font_size - 12.0).abs() < 1e-9);
2133 assert!((stats.median_char_width - 6.0).abs() < 1e-9);
2134 assert_eq!(stats.median_line_spacing, 0.0);
2135 }
2136
2137 #[test]
2138 fn narrow_gutter_detected_with_adaptive_threshold() {
2139 let mut spans = Vec::new();
2143 for y in [700.0, 684.0, 668.0] {
2144 spans.push(span("Lorem ipsum", 40.0, y, 100.0));
2146 spans.push(span("dolor sit", 144.0, y, 80.0));
2147 spans.push(span("amet consec", 236.0, y, 100.0));
2149 spans.push(span("tetur adipi", 340.0, y, 80.0));
2150 }
2151 let texts = block_texts(spans);
2152 assert!(
2154 texts.len() >= 6,
2155 "expected column-major output, got {texts:?}"
2156 );
2157 assert!(
2159 texts[0].contains("Lorem"),
2160 "first block should be left column: {texts:?}"
2161 );
2162 }
2163
2164 #[test]
2165 fn xy_cut_leaf_falls_back_to_legacy_columns_for_header_plus_three_columns() {
2166 let texts = block_texts(vec![
2167 span("73022", 45.0, 750.0, 70.0),
2168 span("Federal Register banner", 125.6, 750.0, 260.0),
2169 span("Left column line one", 45.0, 725.0, 140.0),
2170 span("Middle column line one", 222.0, 725.0, 140.0),
2171 span("Right column line one", 399.0, 725.0, 120.0),
2172 span("Left column line two", 45.0, 715.0, 140.0),
2173 span("Middle column line two", 210.0, 715.0, 152.0),
2174 span("Right column line two", 388.0, 715.0, 132.0),
2175 span("Left column line three", 45.0, 705.0, 140.0),
2176 span("Middle column line three", 235.0, 705.0, 135.0),
2177 span("Right column line three", 408.0, 705.0, 118.0),
2178 ]);
2179
2180 assert_eq!(
2181 texts,
2182 vec![
2183 "73022 Federal Register banner",
2184 "Left column line one",
2185 "Left column line two",
2186 "Left column line three",
2187 "Middle column line one",
2188 "Middle column line two",
2189 "Middle column line three",
2190 "Right column line one",
2191 "Right column line two",
2192 "Right column line three",
2193 ]
2194 );
2195 }
2196
2197 #[test]
2198 fn overlapping_fake_bold_spans_collapse_to_single_copy() {
2199 let texts = block_texts(vec![
2200 span("1 This is fakebold text.", 25.9, 785.3, 320.0),
2201 span("1 This is fakebold text.", 26.2, 785.3, 320.0),
2202 span("1 This is fakebold text.", 26.4, 785.3, 320.0),
2203 span("1 This is fakebold text.", 26.7, 785.3, 320.0),
2204 span("2 This is a fakebold", 27.0, 714.8, 142.0),
2205 span(" fakebold", 169.8, 714.8, 70.0),
2206 span(" fakebold", 170.1, 714.8, 70.0),
2207 span(" fakebold word.", 170.4, 714.8, 110.0),
2208 ]);
2209
2210 assert_eq!(
2211 texts,
2212 vec!["1 This is fakebold text.", "2 This is a fakebold word.",]
2213 );
2214 }
2215}