1use kurbo::{Affine, BezPath};
4use pdf_render::pdf_interpret::cmap::BfString;
5use pdf_render::pdf_interpret::font::Glyph;
6use pdf_render::pdf_interpret::{
7 BlendMode, ClipPath, Device, GlyphDrawMode, Image, Paint, PathDrawMode, SoftMask,
8};
9use std::cmp::Ordering;
10
11const BAND_Y_TOLERANCE: f64 = 5.0;
15const BAND_Y_FRACTION: f64 = 0.30;
20const PARAGRAPH_BREAK_LINE_SPACING_MULTIPLIER: f64 = 1.8;
24
25const TJ_SPACE_THRESHOLD_UNITS: f32 = 100.0;
39const TJ_SIGNAL_WEIGHT: f64 = 0.95;
41const GAP_SIGNAL_WEIGHT: f64 = 0.80;
43const HEURISTIC_SIGNAL_WEIGHT: f64 = 0.60;
45const SPACE_CONSENSUS_THRESHOLD: f64 = 0.75;
47const GAP_TO_MEDIAN_CHAR_FRACTION: f64 = 0.30;
50const GAP_TO_FONT_SIZE_FALLBACK_FRACTION: f64 = 0.15;
53
54const COLUMN_GAP_THRESHOLD_MIN: f64 = 10.0;
56const COLUMN_GAP_THRESHOLD_MAX: f64 = 40.0;
58const COLUMN_GAP_MEDIAN_MULTIPLIER: f64 = 3.0;
60const COLUMN_GAP_THRESHOLD_FALLBACK: f64 = 20.0;
62const COLUMN_GAP_MATCH_TOLERANCE: f64 = 12.0;
64const MIN_COLUMN_GAPPED_BANDS: usize = 3;
66const MIN_COLUMN_GAP_SUPPORT: f64 = 0.80;
68const MIN_DENSE_SLICE_RATIO: f64 = 0.35;
70
71#[derive(Debug, Clone)]
73pub struct TextSpan {
74 pub text: String,
76 pub x: f64,
78 pub y: f64,
80 pub width: f64,
82 pub height: f64,
84 pub font_size: f64,
86}
87
88impl TextSpan {
89 fn right(&self) -> f64 {
92 self.x + self.width.max(self.estimated_width())
93 }
94
95 fn measured_right(&self) -> f64 {
97 self.x + self.width
98 }
99
100 fn estimated_width(&self) -> f64 {
101 let char_count = self.text.chars().count() as f64;
102 if char_count <= 0.0 {
103 self.font_size * 0.5
104 } else {
105 self.font_size * 0.5 * char_count
106 }
107 }
108}
109
110#[derive(Debug, Clone)]
112pub struct TextBlock {
113 pub spans: Vec<TextSpan>,
115}
116
117impl TextBlock {
118 pub fn text(&self) -> String {
124 if self.spans.is_empty() {
125 return String::new();
126 }
127 let mut result = self.spans[0].text.clone();
128 for pair in self.spans.windows(2) {
129 let prev = &pair[0];
130 let curr = &pair[1];
131 let expected_end = prev.measured_right();
132 let gap = curr.x - expected_end;
133 if gap <= prev.font_size * 0.12 {
134 if let Some(trimmed) = trim_overlapping_word_prefix(&prev.text, &curr.text) {
135 result.push_str(&trimmed);
136 continue;
137 }
138 }
139 if gap > prev.font_size * 0.25 {
140 result.push(' ');
141 }
142 result.push_str(&curr.text);
143 }
144 result
145 }
146}
147
148#[derive(Debug, Clone)]
149struct TextBand {
150 y: f64,
151 spans: Vec<TextSpan>,
152}
153
154impl TextBand {
155 fn new(span: TextSpan) -> Self {
156 Self {
157 y: span.y,
158 spans: vec![span],
159 }
160 }
161
162 fn sort_spans(&mut self) {
163 self.spans.sort_by(|a, b| {
164 a.x.partial_cmp(&b.x)
165 .unwrap_or(Ordering::Equal)
166 .then_with(|| b.y.partial_cmp(&a.y).unwrap_or(Ordering::Equal))
167 });
168 collapse_overprinted_spans(&mut self.spans);
169 }
170
171 fn row_block(&self) -> TextBlock {
172 let mut spans = self.spans.clone();
173 spans.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
174 TextBlock { spans }
175 }
176
177 fn left(&self) -> f64 {
178 self.spans
179 .iter()
180 .map(|span| span.x)
181 .fold(f64::INFINITY, f64::min)
182 }
183
184 fn right(&self) -> f64 {
185 self.spans
186 .iter()
187 .map(TextSpan::right)
188 .fold(f64::NEG_INFINITY, f64::max)
189 }
190
191 fn width(&self) -> f64 {
192 (self.right() - self.left()).max(0.0)
193 }
194
195 fn gap_midpoints(&self, column_gap_threshold: f64) -> Vec<f64> {
196 self.gaps(column_gap_threshold)
197 .into_iter()
198 .map(|gap| (gap.start + gap.end) * 0.5)
199 .collect()
200 }
201
202 fn gaps(&self, column_gap_threshold: f64) -> Vec<BandGap> {
203 if self.spans.len() < 2 {
204 return Vec::new();
205 }
206
207 let mut spans = self.spans.clone();
208 spans.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
209
210 let mut gaps = Vec::new();
211 let mut prev_right = spans[0].right();
212 for span in spans.iter().skip(1) {
213 let gap = span.x - prev_right;
214 if gap >= column_gap_threshold {
215 gaps.push(BandGap {
216 start: prev_right,
217 end: span.x,
218 });
219 }
220 prev_right = prev_right.max(span.right());
221 }
222
223 gaps
224 }
225
226 fn split_by_boundaries(&self, boundaries: &[f64]) -> Vec<Vec<TextSpan>> {
227 let mut columns = vec![Vec::new(); boundaries.len() + 1];
228 for span in &self.spans {
229 let center_x = span.x + span.width.max(span.estimated_width()) * 0.5;
230 let column_idx = boundaries
231 .iter()
232 .position(|boundary| center_x < *boundary)
233 .unwrap_or(boundaries.len());
234 columns[column_idx].push(span.clone());
235 }
236
237 for spans in &mut columns {
238 spans.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
239 }
240
241 columns
242 }
243
244 fn fits_single_column(
245 &self,
246 boundaries: &[f64],
247 region_left: f64,
248 region_right: f64,
249 ) -> Option<usize> {
250 let mut column_idx: Option<usize> = None;
251 for span in &self.spans {
252 let left = span.x;
253 let right = span.right();
254 if boundaries
255 .iter()
256 .any(|boundary| left < *boundary && right > *boundary)
257 {
258 return None;
259 }
260
261 let center_x = left + (right - left) * 0.5;
262 let idx = boundaries
263 .iter()
264 .position(|boundary| center_x < *boundary)
265 .unwrap_or(boundaries.len());
266 match column_idx {
267 Some(existing) if existing != idx => return None,
268 Some(_) => {}
269 None => column_idx = Some(idx),
270 }
271 }
272 let idx = column_idx?;
273 let mut edges = Vec::with_capacity(boundaries.len() + 2);
274 edges.push(region_left);
275 edges.extend_from_slice(boundaries);
276 edges.push(region_right);
277
278 let column_width = (edges[idx + 1] - edges[idx]).max(0.0);
279 if column_width <= 0.0 || self.width() > column_width * 0.8 {
280 return None;
281 }
282
283 Some(idx)
284 }
285}
286
287#[derive(Debug, Clone, Copy)]
288struct BandGap {
289 start: f64,
290 end: f64,
291}
292
293pub(crate) struct TextExtractionDevice {
315 spans: Vec<TextSpan>,
316 last_y: f64,
317 last_end_x: f64,
318 pending_tj_offset: f32,
322 glyph_widths: Vec<f64>,
326 cached_median_char_width: f64,
330}
331
332const MEDIAN_REFRESH: usize = 32;
333
334impl Default for TextExtractionDevice {
335 fn default() -> Self {
336 Self::new()
337 }
338}
339
340impl TextExtractionDevice {
341 pub fn new() -> Self {
343 Self {
344 spans: Vec::new(),
345 last_y: f64::NEG_INFINITY,
346 last_end_x: f64::NEG_INFINITY,
347 pending_tj_offset: 0.0,
348 glyph_widths: Vec::new(),
349 cached_median_char_width: 0.0,
350 }
351 }
352
353 fn refresh_median_char_width(&mut self) {
356 if self.glyph_widths.is_empty() {
357 self.cached_median_char_width = 0.0;
358 return;
359 }
360 let mut sorted = self.glyph_widths.clone();
361 sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
362 self.cached_median_char_width = sorted[sorted.len() / 2];
363 }
364
365 fn evaluate_space_consensus(
368 &self,
369 gap: f64,
370 font_size: f64,
371 prev_text: &str,
372 next_text: &str,
373 ) -> bool {
374 let mut confidence = 0.0;
375
376 if self.pending_tj_offset.abs() >= TJ_SPACE_THRESHOLD_UNITS {
379 confidence += TJ_SIGNAL_WEIGHT;
380 }
381
382 let gap_reference = if self.cached_median_char_width > 0.0 {
386 self.cached_median_char_width * GAP_TO_MEDIAN_CHAR_FRACTION
387 } else {
388 font_size * GAP_TO_FONT_SIZE_FALLBACK_FRACTION
389 };
390 if gap > gap_reference {
391 confidence += GAP_SIGNAL_WEIGHT;
392 }
393
394 if let (Some(prev_last), Some(next_first)) =
398 (prev_text.chars().last(), next_text.chars().next())
399 {
400 let camel = prev_last.is_lowercase() && next_first.is_uppercase();
401 let digit_to_letter = prev_last.is_ascii_digit() && next_first.is_alphabetic();
402 let letter_to_digit = prev_last.is_alphabetic() && next_first.is_ascii_digit();
403 if camel || digit_to_letter || letter_to_digit {
404 confidence += HEURISTIC_SIGNAL_WEIGHT;
405 }
406 }
407
408 confidence >= SPACE_CONSENSUS_THRESHOLD
409 }
410
411 pub fn into_text(self) -> String {
413 let blocks = group_spans_into_blocks(self.spans);
414 let lines: Vec<String> = blocks.iter().map(|b| b.text()).collect();
415 let stitched = stitch_hyphenated_lines(&lines);
416 normalize_text_output(&stitched)
417 }
418
419 pub fn into_blocks(self) -> Vec<TextBlock> {
421 group_spans_into_blocks(self.spans)
422 }
423
424 #[allow(dead_code)]
426 pub(crate) fn into_spans(self) -> Vec<TextSpan> {
427 self.spans
428 }
429}
430
431impl Device<'_> for TextExtractionDevice {
432 fn set_soft_mask(&mut self, _: Option<SoftMask<'_>>) {}
433 fn set_blend_mode(&mut self, _: BlendMode) {}
434 fn draw_path(&mut self, _: &BezPath, _: Affine, _: &Paint<'_>, _: &PathDrawMode) {}
435 fn push_clip_path(&mut self, _: &ClipPath) {}
436 fn push_transparency_group(&mut self, _: f32, _: Option<SoftMask<'_>>, _: BlendMode) {}
437 fn draw_image(&mut self, _: Image<'_, '_>, _: Affine) {}
438 fn pop_clip_path(&mut self) {}
439 fn pop_transparency_group(&mut self) {}
440
441 fn draw_glyph(
442 &mut self,
443 glyph: &Glyph<'_>,
444 transform: Affine,
445 glyph_transform: Affine,
446 _paint: &Paint<'_>,
447 _draw_mode: &GlyphDrawMode,
448 ) {
449 let text = match glyph.as_unicode() {
450 Some(BfString::Char(c)) => c.to_string(),
451 Some(BfString::String(s)) => s,
452 None => return,
453 };
454
455 let composed = transform * glyph_transform;
456 let coeffs = composed.as_coeffs();
457 let x = coeffs[4];
458 let y = coeffs[5];
459 let glyph_scale = (coeffs[0].powi(2) + coeffs[1].powi(2)).sqrt().abs();
460 let font_size = glyph_scale * 1000.0;
461 let glyph_width = estimate_glyph_width(glyph, font_size).max(font_size * 0.25);
462 let glyph_end_x = x + glyph_width;
463
464 if self.glyph_widths.len() < 4096 {
468 self.glyph_widths.push(glyph_width);
469 if self.glyph_widths.len().is_multiple_of(MEDIAN_REFRESH) {
470 self.refresh_median_char_width();
471 }
472 }
473
474 let same_line = (y - self.last_y).abs() <= font_size.max(BAND_Y_TOLERANCE) * 0.35;
475 let gap = x - self.last_end_x;
476 let adjacent = same_line && gap >= -font_size * 0.25 && gap < font_size * 0.5;
477
478 if adjacent && !self.spans.is_empty() {
479 let want_space = {
487 let last = self.spans.last().expect("checked non-empty");
488 !last.text.ends_with(' ')
489 && !text.starts_with(' ')
490 && self.evaluate_space_consensus(gap, font_size, &last.text, &text)
491 };
492 let last = self.spans.last_mut().expect("checked non-empty");
493 if want_space {
494 last.text.push(' ');
495 }
496 last.text.push_str(&text);
497 last.width = last.width.max(glyph_end_x - last.x);
498 last.height = last.height.max(font_size);
499 self.last_y = y;
500 self.last_end_x = glyph_end_x;
501 self.pending_tj_offset = 0.0;
504 return;
505 }
506
507 self.last_y = y;
508 self.last_end_x = glyph_end_x;
509 self.pending_tj_offset = 0.0;
513
514 self.spans.push(TextSpan {
515 text,
516 x,
517 y,
518 width: glyph_width,
519 height: font_size,
520 font_size,
521 });
522 }
523
524 fn text_adjustment(&mut self, amount: f32) {
529 self.pending_tj_offset += amount;
530 }
531}
532
533fn estimate_glyph_width(glyph: &Glyph<'_>, font_size: f64) -> f64 {
534 match glyph {
535 Glyph::Outline(outline) => outline
536 .advance_width()
537 .map(|width| width as f64 / 1000.0 * font_size)
538 .unwrap_or(font_size * 0.5),
539 Glyph::Type3(_) => font_size * 0.5,
540 }
541}
542
543fn collapse_overprinted_spans(spans: &mut Vec<TextSpan>) {
549 if spans.len() < 2 {
550 return;
551 }
552
553 let mut deduped: Vec<TextSpan> = Vec::with_capacity(spans.len());
554 for span in spans.drain(..) {
555 if let Some(last) = deduped.last_mut() {
556 if spans_are_overprint_duplicates(last, &span) {
557 let choose_incoming = span.text.chars().count() > last.text.chars().count()
558 || (span.text.chars().count() == last.text.chars().count()
559 && span.width > last.width);
560 let preferred_text = if choose_incoming {
561 span.text.clone()
562 } else {
563 last.text.clone()
564 };
565 let left = last.x.min(span.x);
566 let right = last.right().max(span.right());
567 last.x = left;
568 last.y = (last.y + span.y) * 0.5;
569 last.width = (right - left).max(last.width).max(span.width);
570 last.height = last.height.max(span.height);
571 last.font_size = last.font_size.max(span.font_size);
572 last.text = preferred_text;
573 continue;
574 }
575 }
576
577 deduped.push(span);
578 }
579
580 *spans = deduped;
581}
582
583fn spans_are_overprint_duplicates(lhs: &TextSpan, rhs: &TextSpan) -> bool {
584 let lhs_text = lhs.text.trim();
585 let rhs_text = rhs.text.trim();
586 if lhs_text.is_empty() || rhs_text.is_empty() {
587 return false;
588 }
589
590 let same_baseline = (lhs.y - rhs.y).abs() <= lhs.font_size.max(rhs.font_size) * 0.12;
591 if !same_baseline {
592 return false;
593 }
594
595 let lhs_left = lhs.x;
596 let lhs_right = lhs.right();
597 let rhs_left = rhs.x;
598 let rhs_right = rhs.right();
599 let overlap = (lhs_right.min(rhs_right) - lhs_left.max(rhs_left)).max(0.0);
600 let min_width = (lhs_right - lhs_left).min(rhs_right - rhs_left).max(1.0);
601 let heavily_overlaps = overlap / min_width >= 0.85;
602 if !heavily_overlaps {
603 return false;
604 }
605
606 lhs_text == rhs_text || lhs_text.starts_with(rhs_text) || rhs_text.starts_with(lhs_text)
607}
608
609fn trim_overlapping_word_prefix(prev: &str, curr: &str) -> Option<String> {
610 let prev_chars: Vec<char> = prev.trim_end().chars().collect();
611 let curr_chars: Vec<char> = curr.trim_start().chars().collect();
612 let max = prev_chars.len().min(curr_chars.len());
613
614 for len in (4..=max).rev() {
615 let prev_start = prev_chars.len() - len;
616 if prev_chars[prev_start..] != curr_chars[..len] {
617 continue;
618 }
619
620 if !curr_chars[..len].iter().all(|ch| ch.is_alphanumeric()) {
621 continue;
622 }
623
624 let prev_boundary = prev_start == 0 || !prev_chars[prev_start - 1].is_alphanumeric();
625 let curr_boundary = len == curr_chars.len() || !curr_chars[len].is_alphanumeric();
626 if !prev_boundary || !curr_boundary {
627 continue;
628 }
629
630 return Some(curr_chars[len..].iter().collect());
631 }
632
633 None
634}
635
636fn compute_adaptive_column_gap(bands: &[TextBand]) -> f64 {
643 let mut all_gaps: Vec<f64> = Vec::new();
644
645 for band in bands {
646 if band.spans.len() < 2 {
647 continue;
648 }
649 let mut sorted = band.spans.clone();
650 sorted.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
651 let mut prev_right = sorted[0].right();
652 for span in sorted.iter().skip(1) {
653 let gap = span.x - prev_right;
654 if gap > 0.0 {
655 all_gaps.push(gap);
656 }
657 prev_right = prev_right.max(span.right());
658 }
659 }
660
661 if all_gaps.is_empty() {
662 return COLUMN_GAP_THRESHOLD_FALLBACK;
663 }
664
665 all_gaps.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
666
667 let min_gap = all_gaps[0];
668
669 if min_gap > COLUMN_GAP_THRESHOLD_MIN {
674 return (min_gap * 0.75).clamp(COLUMN_GAP_THRESHOLD_MIN, COLUMN_GAP_THRESHOLD_MAX);
675 }
676
677 let mut best_break_threshold = 0.0f64;
680 let mut best_ratio = 1.5f64; for pair in all_gaps.windows(2) {
682 if pair[0] > 0.5 {
683 let ratio = pair[1] / pair[0];
684 if ratio > best_ratio {
685 best_ratio = ratio;
686 best_break_threshold = (pair[0] + pair[1]) * 0.5;
687 }
688 }
689 }
690
691 if best_break_threshold > 0.0 {
692 return best_break_threshold.clamp(COLUMN_GAP_THRESHOLD_MIN, COLUMN_GAP_THRESHOLD_MAX);
693 }
694
695 let mid = all_gaps.len() / 2;
697 let median = if all_gaps.len().is_multiple_of(2) {
698 (all_gaps[mid - 1] + all_gaps[mid]) * 0.5
699 } else {
700 all_gaps[mid]
701 };
702
703 (median * COLUMN_GAP_MEDIAN_MULTIPLIER)
704 .clamp(COLUMN_GAP_THRESHOLD_MIN, COLUMN_GAP_THRESHOLD_MAX)
705}
706
707#[derive(Debug, Clone, Copy)]
715struct PageStats {
716 median_font_size: f64,
718 #[allow(dead_code)]
723 median_char_width: f64,
724 median_line_spacing: f64,
730}
731
732impl PageStats {
733 fn from_spans(spans: &[TextSpan]) -> Self {
734 if spans.is_empty() {
735 return Self {
736 median_font_size: 12.0,
737 median_char_width: 6.0,
738 median_line_spacing: 0.0,
739 };
740 }
741
742 let mut sizes: Vec<f64> = spans.iter().map(|s| s.font_size).collect();
744 sizes.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
745 let median_font_size = sizes[sizes.len() / 2];
746
747 let mut char_widths: Vec<f64> = spans
749 .iter()
750 .filter_map(|s| {
751 let chars = s.text.chars().count();
752 if chars > 0 && s.width > 0.0 {
753 Some(s.width / chars as f64)
754 } else {
755 None
756 }
757 })
758 .collect();
759 let median_char_width = if char_widths.is_empty() {
760 median_font_size * 0.5
761 } else {
762 char_widths.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
763 char_widths[char_widths.len() / 2]
764 };
765
766 let band_tolerance = (median_font_size * BAND_Y_FRACTION).max(BAND_Y_TOLERANCE);
769 let mut ys: Vec<f64> = spans.iter().map(|s| s.y).collect();
770 ys.sort_by(|a, b| b.partial_cmp(a).unwrap_or(Ordering::Equal));
771 let mut band_ys: Vec<f64> = Vec::new();
772 for y in ys {
773 if band_ys
774 .last()
775 .map(|prev: &f64| (prev - y).abs() > band_tolerance)
776 .unwrap_or(true)
777 {
778 band_ys.push(y);
779 }
780 }
781 let median_line_spacing = if band_ys.len() < 2 {
790 0.0
791 } else {
792 let mut spacings: Vec<f64> = band_ys
793 .windows(2)
794 .map(|pair| (pair[0] - pair[1]).abs())
795 .collect();
796 spacings.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
797 let q1_index = spacings.len() / 4;
798 spacings[q1_index]
799 };
800
801 Self {
802 median_font_size,
803 median_char_width,
804 median_line_spacing,
805 }
806 }
807}
808
809const XY_CUT_MAX_DEPTH: usize = 12;
814const XY_CUT_VERTICAL_GAP_REGION_FRACTION: f64 = 0.04;
817const XY_CUT_VERTICAL_GAP_FLOOR: f64 = 10.0;
821const XY_CUT_HORIZONTAL_GAP_FONT_MULTIPLIER: f64 = 1.8;
824const XY_CUT_MIN_SPANS_PER_COLUMN: usize = 2;
828const XY_CUT_MIN_CHARS_PER_BAND: f64 = 8.0;
831
832fn group_spans_into_blocks(spans: Vec<TextSpan>) -> Vec<TextBlock> {
836 if spans.is_empty() {
837 return Vec::new();
838 }
839 let stats = PageStats::from_spans(&spans);
840 xy_cut_recursive(spans, 0, &stats)
841}
842
843fn xy_cut_recursive(spans: Vec<TextSpan>, depth: usize, stats: &PageStats) -> Vec<TextBlock> {
844 if spans.is_empty() {
845 return Vec::new();
846 }
847 if depth >= XY_CUT_MAX_DEPTH {
848 return band_based_blocks(spans, stats);
849 }
850
851 let vcut = try_vertical_cut(&spans, stats);
858 let hcut = try_horizontal_cut(&spans, stats);
859
860 let (chosen, _) = match (vcut, hcut) {
861 (Some((v_groups, v_gap)), Some((h_groups, h_gap))) => {
862 if v_gap >= h_gap {
863 (Some(v_groups), v_gap)
864 } else {
865 (Some(h_groups), h_gap)
866 }
867 }
868 (Some((v_groups, v_gap)), None) => (Some(v_groups), v_gap),
869 (None, Some((h_groups, h_gap))) => (Some(h_groups), h_gap),
870 (None, None) => (None, 0.0),
871 };
872
873 if let Some(groups) = chosen {
874 let mut out = Vec::new();
875 for group in groups {
876 out.extend(xy_cut_recursive(group, depth + 1, stats));
877 }
878 return out;
879 }
880
881 band_based_blocks(spans, stats)
882}
883
884fn band_based_blocks(spans: Vec<TextSpan>, stats: &PageStats) -> Vec<TextBlock> {
888 group_spans_into_blocks_legacy_with_stats(spans, stats)
893}
894
895#[allow(dead_code)]
898fn median_font_size(spans: &[TextSpan]) -> f64 {
899 if spans.is_empty() {
900 return 12.0;
901 }
902 let mut sizes: Vec<f64> = spans.iter().map(|s| s.font_size).collect();
903 sizes.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
904 sizes[sizes.len() / 2]
905}
906
907fn try_vertical_cut(spans: &[TextSpan], stats: &PageStats) -> Option<(Vec<Vec<TextSpan>>, f64)> {
924 if spans.len() < 2 * XY_CUT_MIN_SPANS_PER_COLUMN {
925 return None;
926 }
927
928 let region_left = spans.iter().map(|s| s.x).fold(f64::INFINITY, f64::min);
929 let region_right = spans
930 .iter()
931 .map(TextSpan::right)
932 .fold(f64::NEG_INFINITY, f64::max);
933 let region_width = region_right - region_left;
934 if region_width <= 0.0 {
935 return None;
936 }
937
938 let bands = group_spans_into_bands_with_stats(spans.to_vec(), stats);
947 let adaptive = compute_adaptive_column_gap(&bands);
948 let floor = stats
949 .median_font_size
950 .max(region_width * XY_CUT_VERTICAL_GAP_REGION_FRACTION)
951 .max(XY_CUT_VERTICAL_GAP_FLOOR);
952 let min_gap = adaptive.min(floor).max(XY_CUT_VERTICAL_GAP_FLOOR);
953
954 let mut intervals: Vec<(f64, f64)> = spans
957 .iter()
958 .map(|s| (s.x, s.right().max(s.x + 0.001)))
959 .collect();
960 intervals.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(Ordering::Equal));
961
962 let mut cursor = intervals[0].1;
963 let mut best_gap: Option<(f64, f64)> = None; for (left, right) in intervals.iter().skip(1) {
965 if *left > cursor {
966 let gap = *left - cursor;
967 if gap >= min_gap {
968 match best_gap {
969 Some((best, _)) if best >= gap => {}
970 _ => {
971 let cut_x = (cursor + *left) * 0.5;
972 best_gap = Some((gap, cut_x));
973 }
974 }
975 }
976 }
977 cursor = cursor.max(*right);
978 }
979
980 let (gap_size, cut_x) = best_gap?;
981
982 let mut left_group = Vec::new();
985 let mut right_group = Vec::new();
986 for span in spans {
987 let midpoint = span.x + (span.right() - span.x) * 0.5;
988 if midpoint < cut_x {
989 left_group.push(span.clone());
990 } else {
991 right_group.push(span.clone());
992 }
993 }
994
995 if !columns_are_dense(&left_group, &right_group, stats) {
996 return None;
997 }
998 if !columns_are_band_aligned(spans, cut_x, region_left, region_right, stats) {
999 return None;
1000 }
1001
1002 Some((vec![left_group, right_group], gap_size))
1003}
1004
1005fn columns_are_band_aligned(
1011 spans: &[TextSpan],
1012 cut_x: f64,
1013 region_left: f64,
1014 region_right: f64,
1015 stats: &PageStats,
1016) -> bool {
1017 let left_width = (cut_x - region_left).max(1.0);
1018 let right_width = (region_right - cut_x).max(1.0);
1019
1020 const MAX_SINGLE_SIDE_FRACTION: f64 = 0.70;
1028
1029 let bands = group_spans_into_bands_with_stats(spans.to_vec(), stats);
1030 for band in &bands {
1031 let mut has_left = false;
1032 let mut has_right = false;
1033 for span in &band.spans {
1034 let midpoint = span.x + (span.right() - span.x) * 0.5;
1035 if midpoint < cut_x {
1036 has_left = true;
1037 } else {
1038 has_right = true;
1039 }
1040 }
1041 if has_left && has_right {
1042 continue; }
1044 let band_width = band.width();
1045 if has_left && band_width > left_width * MAX_SINGLE_SIDE_FRACTION {
1046 return false;
1047 }
1048 if has_right && band_width > right_width * MAX_SINGLE_SIDE_FRACTION {
1049 return false;
1050 }
1051 }
1052 true
1053}
1054
1055fn columns_are_dense(left: &[TextSpan], right: &[TextSpan], stats: &PageStats) -> bool {
1060 for col in [left, right] {
1061 if col.len() < XY_CUT_MIN_SPANS_PER_COLUMN {
1062 return false;
1063 }
1064 let bands = group_spans_into_bands_with_stats(col.to_vec(), stats);
1065 if bands.is_empty() {
1066 return false;
1067 }
1068 let total_chars: usize = col.iter().map(|s| s.text.chars().count()).sum();
1069 let chars_per_band = total_chars as f64 / bands.len() as f64;
1070 if chars_per_band < XY_CUT_MIN_CHARS_PER_BAND {
1071 return false;
1072 }
1073 }
1074 true
1075}
1076
1077fn try_horizontal_cut(spans: &[TextSpan], stats: &PageStats) -> Option<(Vec<Vec<TextSpan>>, f64)> {
1081 if spans.len() < 2 {
1082 return None;
1083 }
1084 let mut sorted = spans.to_vec();
1086 sorted.sort_by(|a, b| {
1087 b.y.partial_cmp(&a.y)
1088 .unwrap_or(Ordering::Equal)
1089 .then_with(|| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal))
1090 });
1091
1092 let min_gap = if stats.median_line_spacing > 0.0 {
1098 stats.median_line_spacing * PARAGRAPH_BREAK_LINE_SPACING_MULTIPLIER
1099 } else {
1100 stats.median_font_size * XY_CUT_HORIZONTAL_GAP_FONT_MULTIPLIER
1101 };
1102
1103 let mut best: Option<(f64, f64)> = None; let tolerance = stats.median_font_size * BAND_Y_FRACTION;
1106 let mut band_bottom = sorted[0].y;
1107
1108 for span in sorted.iter().skip(1) {
1109 if (band_bottom - span.y).abs() <= tolerance {
1110 band_bottom = band_bottom.min(span.y);
1111 continue;
1112 }
1113 let gap = band_bottom - span.y;
1114 if gap >= min_gap {
1115 let cut_y = (band_bottom + span.y) * 0.5;
1116 match best {
1117 Some((best_gap, _)) if best_gap >= gap => {}
1118 _ => best = Some((gap, cut_y)),
1119 }
1120 }
1121 band_bottom = span.y;
1122 }
1123
1124 let (gap_size, cut_y) = best?;
1125
1126 let mut top_group = Vec::new();
1127 let mut bottom_group = Vec::new();
1128 for span in spans {
1129 if span.y > cut_y {
1130 top_group.push(span.clone());
1131 } else {
1132 bottom_group.push(span.clone());
1133 }
1134 }
1135 if top_group.is_empty() || bottom_group.is_empty() {
1136 return None;
1137 }
1138 Some((vec![top_group, bottom_group], gap_size))
1139}
1140
1141#[allow(dead_code)]
1145fn group_spans_into_blocks_legacy(spans: Vec<TextSpan>) -> Vec<TextBlock> {
1146 let bands = group_spans_into_bands(spans);
1147 group_spans_into_blocks_legacy_from_bands(bands)
1148}
1149
1150fn group_spans_into_blocks_legacy_with_stats(
1151 spans: Vec<TextSpan>,
1152 stats: &PageStats,
1153) -> Vec<TextBlock> {
1154 let bands = group_spans_into_bands_with_stats(spans, stats);
1155 group_spans_into_blocks_legacy_from_bands(bands)
1156}
1157
1158fn group_spans_into_blocks_legacy_from_bands(bands: Vec<TextBand>) -> Vec<TextBlock> {
1159 if bands.is_empty() {
1160 return Vec::new();
1161 }
1162
1163 let column_gap_threshold = compute_adaptive_column_gap(&bands);
1164
1165 let mut blocks = Vec::new();
1166 let mut idx = 0;
1167
1168 while idx < bands.len() {
1169 let gap_midpoints = bands[idx].gap_midpoints(column_gap_threshold);
1170 if gap_midpoints.is_empty() {
1171 blocks.push(bands[idx].row_block());
1172 idx += 1;
1173 continue;
1174 }
1175
1176 let mut boundaries = gap_midpoints.clone();
1177 let mut band_indices = vec![idx];
1178 let mut gapped_band_count = 1usize;
1179 let mut region_left = bands[idx].left();
1180 let mut region_right = bands[idx].right();
1181 let mut next_idx = idx + 1;
1182
1183 while next_idx < bands.len() {
1184 let next_band = &bands[next_idx];
1185 let next_gap_midpoints = next_band.gap_midpoints(column_gap_threshold);
1186 if next_gap_midpoints.is_empty() {
1187 if next_band
1188 .fits_single_column(&boundaries, region_left, region_right)
1189 .is_some()
1190 {
1191 band_indices.push(next_idx);
1192 next_idx += 1;
1193 continue;
1194 }
1195 break;
1196 }
1197
1198 if !boundaries_match(&boundaries, &next_gap_midpoints, column_gap_threshold) {
1199 break;
1200 }
1201
1202 update_boundaries(&mut boundaries, &next_gap_midpoints, gapped_band_count);
1203 gapped_band_count += 1;
1204 band_indices.push(next_idx);
1205 region_left = region_left.min(next_band.left());
1206 region_right = region_right.max(next_band.right());
1207 next_idx += 1;
1208 }
1209
1210 if region_is_columnar(&bands, &band_indices, &boundaries, gapped_band_count) {
1211 append_column_region_blocks(&bands, &band_indices, &boundaries, &mut blocks);
1212 idx = next_idx;
1213 } else {
1214 blocks.push(bands[idx].row_block());
1215 idx += 1;
1216 }
1217 }
1218
1219 blocks
1220}
1221
1222fn group_spans_into_bands(spans: Vec<TextSpan>) -> Vec<TextBand> {
1226 let stats = PageStats::from_spans(&spans);
1227 group_spans_into_bands_with_stats(spans, &stats)
1228}
1229
1230fn group_spans_into_bands_with_stats(mut spans: Vec<TextSpan>, stats: &PageStats) -> Vec<TextBand> {
1231 if spans.is_empty() {
1232 return Vec::new();
1233 }
1234
1235 spans.sort_by(|a, b| {
1236 b.y.partial_cmp(&a.y)
1237 .unwrap_or(Ordering::Equal)
1238 .then_with(|| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal))
1239 });
1240
1241 let page_tolerance = (stats.median_font_size * BAND_Y_FRACTION).max(BAND_Y_TOLERANCE);
1247
1248 let mut bands: Vec<TextBand> = Vec::new();
1249
1250 for span in spans {
1251 let tolerance = (span.height * BAND_Y_FRACTION)
1252 .max(page_tolerance)
1253 .max(BAND_Y_TOLERANCE);
1254 if let Some(band) = bands
1255 .iter_mut()
1256 .find(|band| (band.y - span.y).abs() <= tolerance)
1257 {
1258 let span_count = band.spans.len() as f64;
1259 band.y = (band.y * span_count + span.y) / (span_count + 1.0);
1260 band.spans.push(span);
1261 } else {
1262 bands.push(TextBand::new(span));
1263 }
1264 }
1265
1266 for band in &mut bands {
1267 band.sort_spans();
1268 }
1269
1270 bands.sort_by(|a, b| b.y.partial_cmp(&a.y).unwrap_or(Ordering::Equal));
1271 bands
1272}
1273
1274fn boundaries_match(boundaries: &[f64], gap_midpoints: &[f64], column_gap_threshold: f64) -> bool {
1275 let tolerance = (column_gap_threshold * 1.5).clamp(COLUMN_GAP_MATCH_TOLERANCE, 60.0);
1276 boundaries.len() == gap_midpoints.len()
1277 && boundaries
1278 .iter()
1279 .zip(gap_midpoints)
1280 .all(|(lhs, rhs)| (lhs - rhs).abs() <= tolerance)
1281}
1282
1283fn update_boundaries(boundaries: &mut [f64], gap_midpoints: &[f64], seen_gapped_bands: usize) {
1284 for (boundary, midpoint) in boundaries.iter_mut().zip(gap_midpoints) {
1285 *boundary =
1286 (*boundary * seen_gapped_bands as f64 + midpoint) / (seen_gapped_bands as f64 + 1.0);
1287 }
1288}
1289
1290fn region_is_columnar(
1291 bands: &[TextBand],
1292 band_indices: &[usize],
1293 boundaries: &[f64],
1294 gapped_band_count: usize,
1295) -> bool {
1296 if boundaries.is_empty()
1297 || gapped_band_count < MIN_COLUMN_GAPPED_BANDS
1298 || band_indices.is_empty()
1299 || (gapped_band_count as f64 / band_indices.len() as f64) < MIN_COLUMN_GAP_SUPPORT
1300 {
1301 return false;
1302 }
1303
1304 let mut non_empty_slices = 0usize;
1305 let mut dense_slices = 0usize;
1306 let mut slices_per_column = vec![0usize; boundaries.len() + 1];
1307
1308 for &band_idx in band_indices {
1309 let slices = bands[band_idx].split_by_boundaries(boundaries);
1310 for (column_idx, slice) in slices.iter().enumerate() {
1311 if slice.is_empty() {
1312 continue;
1313 }
1314
1315 non_empty_slices += 1;
1316 slices_per_column[column_idx] += 1;
1317
1318 let char_count = slice
1319 .iter()
1320 .map(|span| span.text.chars().count())
1321 .sum::<usize>();
1322 if slice.len() >= 2 || char_count >= 8 {
1323 dense_slices += 1;
1324 }
1325 }
1326 }
1327
1328 if non_empty_slices < boundaries.len() + 2 {
1329 return false;
1330 }
1331
1332 if slices_per_column.contains(&0) {
1333 return false;
1334 }
1335
1336 (dense_slices as f64 / non_empty_slices as f64) >= MIN_DENSE_SLICE_RATIO
1337}
1338
1339fn append_column_region_blocks(
1340 bands: &[TextBand],
1341 band_indices: &[usize],
1342 boundaries: &[f64],
1343 blocks: &mut Vec<TextBlock>,
1344) {
1345 let column_count = boundaries.len() + 1;
1346 let mut column_bands = vec![Vec::<TextSpan>::new(); column_count];
1347
1348 for &band_idx in band_indices {
1349 let slices = bands[band_idx].split_by_boundaries(boundaries);
1350 for (column_idx, slice) in slices.into_iter().enumerate() {
1351 if slice.is_empty() {
1352 continue;
1353 }
1354 column_bands[column_idx].push(TextSpan {
1355 text: String::new(),
1356 x: 0.0,
1357 y: 0.0,
1358 width: 0.0,
1359 height: 0.0,
1360 font_size: 0.0,
1361 });
1362 let marker_idx = column_bands[column_idx].len() - 1;
1363 column_bands[column_idx][marker_idx] = TextSpan {
1364 text: String::new(),
1365 x: f64::NEG_INFINITY,
1366 y: bands[band_idx].y,
1367 width: 0.0,
1368 height: 0.0,
1369 font_size: 0.0,
1370 };
1371 column_bands[column_idx].extend(slice);
1372 }
1373 }
1374
1375 for spans in column_bands {
1376 let mut current: Vec<TextSpan> = Vec::new();
1377 for span in spans {
1378 if span.x == f64::NEG_INFINITY {
1379 if !current.is_empty() {
1380 current.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
1381 blocks.push(TextBlock {
1382 spans: std::mem::take(&mut current),
1383 });
1384 }
1385 continue;
1386 }
1387 current.push(span);
1388 }
1389 if !current.is_empty() {
1390 current.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
1391 blocks.push(TextBlock { spans: current });
1392 }
1393 }
1394}
1395
1396fn stitch_hyphenated_lines(lines: &[String]) -> String {
1411 let mut out = String::new();
1412 for (idx, line) in lines.iter().enumerate() {
1413 if idx == 0 {
1414 out.push_str(line);
1415 continue;
1416 }
1417
1418 let next_trimmed = line.trim_start();
1419
1420 let should_merge = is_hyphen_wrap_candidate(&out, next_trimmed);
1422
1423 if should_merge {
1424 out.pop(); out.push_str(next_trimmed);
1426 } else {
1427 out.push('\n');
1428 out.push_str(line);
1429 }
1430 }
1431 out
1432}
1433
1434fn is_hyphen_wrap_candidate(accumulated: &str, next_trimmed: &str) -> bool {
1437 if !accumulated.ends_with('-') {
1439 return false;
1440 }
1441
1442 let before_hyphen = accumulated.chars().rev().nth(1);
1444 if !before_hyphen.is_some_and(|c| c.is_alphabetic()) {
1445 return false;
1446 }
1447
1448 let alpha_prefix_len = accumulated
1450 .chars()
1451 .rev()
1452 .skip(1) .take_while(|c| c.is_alphabetic())
1454 .count();
1455 if alpha_prefix_len < 3 {
1456 return false;
1457 }
1458
1459 let first_next = next_trimmed.chars().next();
1461 if !first_next.is_some_and(|c| c.is_ascii_lowercase()) {
1462 return false;
1463 }
1464
1465 let next_alpha_len = next_trimmed
1467 .chars()
1468 .take_while(|c| c.is_ascii_lowercase())
1469 .count();
1470 if next_alpha_len < 3 {
1471 return false;
1472 }
1473
1474 true
1475}
1476
1477pub(crate) fn normalize_text_output(text: &str) -> String {
1484 if text.is_empty() {
1485 return String::new();
1486 }
1487
1488 let mut lines: Vec<&str> = Vec::new();
1489 for line in text.split('\n') {
1490 lines.push(line.trim_end());
1491 }
1492
1493 while lines.last() == Some(&"") {
1495 lines.pop();
1496 }
1497
1498 if lines.is_empty() {
1499 return String::new();
1500 }
1501
1502 let mut result = String::with_capacity(text.len());
1503 let mut consecutive_empty = 0u32;
1504
1505 for (i, line) in lines.iter().enumerate() {
1506 if line.is_empty() || *line == "\x0C" {
1507 if line.is_empty() {
1508 consecutive_empty += 1;
1509 if consecutive_empty <= 2 {
1511 result.push('\n');
1512 }
1513 } else {
1514 consecutive_empty = 0;
1516 result.push_str(line);
1517 if i + 1 < lines.len() {
1518 result.push('\n');
1519 }
1520 }
1521 } else {
1522 consecutive_empty = 0;
1524 result.push_str(line);
1525 if i + 1 < lines.len() {
1526 result.push('\n');
1527 }
1528 }
1529 }
1530
1531 if !result.is_empty() && !result.ends_with('\n') {
1533 result.push('\n');
1534 }
1535
1536 result
1537}
1538
1539#[cfg(test)]
1540mod tests {
1541 use super::*;
1542
1543 fn span(text: &str, x: f64, y: f64, width: f64) -> TextSpan {
1544 TextSpan {
1545 text: text.into(),
1546 x,
1547 y,
1548 width,
1549 height: 12.0,
1550 font_size: 12.0,
1551 }
1552 }
1553
1554 fn block_texts(spans: Vec<TextSpan>) -> Vec<String> {
1555 group_spans_into_blocks(spans)
1556 .into_iter()
1557 .map(|block| block.text())
1558 .collect()
1559 }
1560
1561 #[test]
1562 fn empty_device_produces_empty_text() {
1563 let dev = TextExtractionDevice::new();
1564 assert!(dev.into_text().is_empty());
1565 }
1566
1567 #[test]
1568 fn single_column_stays_row_major() {
1569 let texts = block_texts(vec![
1570 span("Single Column Line 1", 40.0, 700.0, 140.0),
1571 span("Single Column Line 2", 40.0, 684.0, 140.0),
1572 span("Single Column Line 3", 40.0, 668.0, 140.0),
1573 ]);
1574
1575 assert_eq!(
1576 texts,
1577 vec![
1578 "Single Column Line 1",
1579 "Single Column Line 2",
1580 "Single Column Line 3",
1581 ]
1582 );
1583 }
1584
1585 #[test]
1586 fn two_column_region_reads_column_major() {
1587 let texts = block_texts(vec![
1588 span("Header", 200.0, 740.0, 80.0),
1589 span("Left column line one", 40.0, 700.0, 115.0),
1590 span("Right column line one", 320.0, 700.0, 120.0),
1591 span("Left column line two", 40.0, 684.0, 115.0),
1592 span("Right column line two", 320.0, 684.0, 120.0),
1593 span("Left column line three", 40.0, 668.0, 125.0),
1594 span("Right column line three", 320.0, 668.0, 130.0),
1595 span("Footer", 200.0, 620.0, 80.0),
1596 ]);
1597
1598 assert_eq!(
1599 texts,
1600 vec![
1601 "Header",
1602 "Left column line one",
1603 "Left column line two",
1604 "Left column line three",
1605 "Right column line one",
1606 "Right column line two",
1607 "Right column line three",
1608 "Footer",
1609 ]
1610 );
1611 }
1612
1613 #[test]
1614 fn mixed_single_and_multi_column_regions_preserve_shared_bands() {
1615 let texts = block_texts(vec![
1616 span("Intro paragraph", 40.0, 740.0, 180.0),
1617 span("L1 words here", 40.0, 700.0, 110.0),
1618 span("R1 words here", 320.0, 700.0, 110.0),
1619 span("L2 words here", 40.0, 684.0, 110.0),
1620 span("R2 words here", 320.0, 684.0, 110.0),
1621 span("L3 words here", 40.0, 668.0, 110.0),
1622 span("R3 words here", 320.0, 668.0, 110.0),
1623 span("Outro paragraph", 40.0, 620.0, 180.0),
1624 ]);
1625
1626 assert_eq!(
1627 texts,
1628 vec![
1629 "Intro paragraph",
1630 "L1 words here",
1631 "L2 words here",
1632 "L3 words here",
1633 "R1 words here",
1634 "R2 words here",
1635 "R3 words here",
1636 "Outro paragraph",
1637 ]
1638 );
1639 }
1640
1641 #[test]
1642 fn short_table_like_rows_fall_back_to_row_major() {
1643 let texts = block_texts(vec![
1644 span("Name", 40.0, 700.0, 30.0),
1645 span("Age", 320.0, 700.0, 20.0),
1646 span("Alice", 40.0, 684.0, 35.0),
1647 span("30", 320.0, 684.0, 15.0),
1648 span("Bob", 40.0, 668.0, 24.0),
1649 span("25", 320.0, 668.0, 15.0),
1650 ]);
1651
1652 assert_eq!(texts, vec!["Name Age", "Alice 30", "Bob 25"]);
1653 }
1654
1655 #[test]
1656 fn three_column_regions_are_supported() {
1657 let texts = block_texts(vec![
1658 span("Column one line one", 40.0, 700.0, 105.0),
1659 span("Column two line one", 220.0, 700.0, 105.0),
1660 span("Column three line one", 400.0, 700.0, 120.0),
1661 span("Column one line two", 40.0, 684.0, 105.0),
1662 span("Column two line two", 220.0, 684.0, 105.0),
1663 span("Column three line two", 400.0, 684.0, 120.0),
1664 span("Column one line three", 40.0, 668.0, 120.0),
1665 span("Column two line three", 220.0, 668.0, 120.0),
1666 span("Column three line three", 400.0, 668.0, 135.0),
1667 ]);
1668
1669 assert_eq!(
1670 texts,
1671 vec![
1672 "Column one line one",
1673 "Column one line two",
1674 "Column one line three",
1675 "Column two line one",
1676 "Column two line two",
1677 "Column two line three",
1678 "Column three line one",
1679 "Column three line two",
1680 "Column three line three",
1681 ]
1682 );
1683 }
1684
1685 #[test]
1686 fn text_block_concatenation_spaced() {
1687 let block = TextBlock {
1688 spans: vec![span("A", 0.0, 0.0, 6.0), span("B", 20.0, 0.0, 6.0)],
1689 };
1690 assert_eq!(block.text(), "A B");
1691 }
1692
1693 #[test]
1694 fn adaptive_column_gap_fallback_for_no_gaps() {
1695 let bands = vec![
1697 TextBand::new(span("Hello", 40.0, 700.0, 80.0)),
1698 TextBand::new(span("World", 40.0, 684.0, 80.0)),
1699 ];
1700 let threshold = compute_adaptive_column_gap(&bands);
1701 assert!((threshold - COLUMN_GAP_THRESHOLD_FALLBACK).abs() < 0.01);
1702 }
1703
1704 #[test]
1705 fn adaptive_column_gap_uses_median() {
1706 let mut bands = Vec::new();
1708 for y in [700.0, 684.0, 668.0] {
1709 let mut band = TextBand::new(span("word1", 40.0, y, 30.0));
1710 band.spans.push(span("word2", 74.0, y, 30.0)); band.spans.push(span("word3", 108.0, y, 30.0)); bands.push(band);
1713 }
1714 let threshold = compute_adaptive_column_gap(&bands);
1715 assert!(
1717 threshold >= 10.0 && threshold <= 14.0,
1718 "expected ~12, got {threshold}"
1719 );
1720 }
1721
1722 #[test]
1723 fn adaptive_column_gap_clamps_to_min() {
1724 let mut bands = Vec::new();
1726 for y in [700.0, 684.0, 668.0, 652.0] {
1727 let mut band = TextBand::new(span("abc", 0.0, y, 18.0));
1728 band.spans.push(span("def", 20.0, y, 18.0));
1730 bands.push(band);
1731 }
1732 let threshold = compute_adaptive_column_gap(&bands);
1733 assert!(
1734 (threshold - COLUMN_GAP_THRESHOLD_MIN).abs() < 0.01,
1735 "expected {COLUMN_GAP_THRESHOLD_MIN}, got {threshold}"
1736 );
1737 }
1738
1739 #[test]
1740 fn adaptive_column_gap_all_large_gaps_uses_fraction_of_min() {
1741 let mut band = TextBand::new(span("Left", 0.0, 700.0, 30.0));
1743 band.spans.push(span("Right", 80.0, 700.0, 30.0)); let bands = vec![band];
1745 let threshold = compute_adaptive_column_gap(&bands);
1746 assert!(
1747 (threshold - 37.5).abs() < 0.01,
1748 "expected 37.5 (0.75×50), got {threshold}"
1749 );
1750 }
1751
1752 #[test]
1753 fn normalize_trims_trailing_whitespace_per_line() {
1754 assert_eq!(
1755 normalize_text_output("hello \nworld \n"),
1756 "hello\nworld\n"
1757 );
1758 }
1759
1760 #[test]
1761 fn normalize_collapses_excess_newlines() {
1762 assert_eq!(
1764 normalize_text_output("hello\n\n\n\n\nworld\n"),
1765 "hello\n\n\nworld\n"
1766 );
1767 }
1768
1769 #[test]
1770 fn normalize_preserves_double_newline() {
1771 assert_eq!(
1772 normalize_text_output("paragraph one\n\nparagraph two\n"),
1773 "paragraph one\n\nparagraph two\n"
1774 );
1775 }
1776
1777 #[test]
1778 fn normalize_preserves_form_feed() {
1779 assert_eq!(
1780 normalize_text_output("page1\n\n\x0Cpage2\n"),
1781 "page1\n\n\x0Cpage2\n"
1782 );
1783 }
1784
1785 #[test]
1786 fn normalize_adds_trailing_newline() {
1787 assert_eq!(normalize_text_output("hello"), "hello\n");
1788 }
1789
1790 #[test]
1791 fn normalize_empty_input() {
1792 assert_eq!(normalize_text_output(""), "");
1793 }
1794
1795 #[test]
1796 fn normalize_only_whitespace() {
1797 assert_eq!(normalize_text_output(" \n \n"), "");
1798 }
1799
1800 #[test]
1803 fn hyphen_stitch_joins_wrapped_word() {
1804 let lines = vec!["the aver-".into(), "age rainfall".into()];
1805 assert_eq!(stitch_hyphenated_lines(&lines), "the average rainfall");
1806 }
1807
1808 #[test]
1809 fn hyphen_stitch_handles_leading_whitespace() {
1810 let lines = vec!["pre-".into(), " dict the outcome".into()];
1811 assert_eq!(stitch_hyphenated_lines(&lines), "predict the outcome");
1813 }
1814
1815 #[test]
1816 fn hyphen_stitch_capital_continuation_not_stitched() {
1817 let lines = vec!["Section three-".into(), "Summary here".into()];
1818 assert_eq!(
1819 stitch_hyphenated_lines(&lines),
1820 "Section three-\nSummary here"
1821 );
1822 }
1823
1824 #[test]
1825 fn hyphen_stitch_bullet_dash_not_stitched() {
1826 let lines = vec!["Items:".into(), "-".into(), "milk".into()];
1828 assert_eq!(stitch_hyphenated_lines(&lines), "Items:\n-\nmilk");
1829 }
1830
1831 #[test]
1832 fn hyphen_stitch_numeric_range_not_stitched() {
1833 let lines = vec!["page 42-".into(), "seventy".into()];
1835 assert_eq!(stitch_hyphenated_lines(&lines), "page 42-\nseventy");
1836 }
1837
1838 #[test]
1839 fn hyphen_stitch_short_prefix_not_stitched() {
1840 let lines = vec!["re-".into(), "organize".into()];
1842 assert_eq!(stitch_hyphenated_lines(&lines), "re-\norganize");
1843 }
1844
1845 #[test]
1846 fn hyphen_stitch_short_continuation_not_stitched() {
1847 let lines = vec!["counter-".into(), "an example".into()];
1849 assert_eq!(stitch_hyphenated_lines(&lines), "counter-\nan example");
1850 }
1851
1852 #[test]
1853 fn hyphen_stitch_compound_word_midline_preserved() {
1854 let lines = vec!["real-time system".into()];
1857 assert_eq!(stitch_hyphenated_lines(&lines), "real-time system");
1858 }
1859
1860 #[test]
1861 fn hyphen_stitch_single_line_unchanged() {
1862 let lines = vec!["only line".into()];
1863 assert_eq!(stitch_hyphenated_lines(&lines), "only line");
1864 }
1865
1866 #[test]
1867 fn hyphen_stitch_empty_input() {
1868 let lines: Vec<String> = vec![];
1869 assert_eq!(stitch_hyphenated_lines(&lines), "");
1870 }
1871
1872 fn make_device_with_median(median: f64) -> TextExtractionDevice {
1875 let mut dev = TextExtractionDevice::new();
1876 for _ in 0..MEDIAN_REFRESH {
1878 dev.glyph_widths.push(median);
1879 }
1880 dev.refresh_median_char_width();
1881 assert!((dev.cached_median_char_width - median).abs() < 1e-9);
1882 dev
1883 }
1884
1885 #[test]
1886 fn consensus_inserts_space_on_strong_tj_offset_alone() {
1887 let mut dev = make_device_with_median(6.0);
1890 dev.pending_tj_offset = 250.0; assert!(dev.evaluate_space_consensus(0.5, 12.0, "Hello", "World"));
1892 }
1893
1894 #[test]
1895 fn consensus_inserts_space_on_geometric_gap_alone() {
1896 let dev = make_device_with_median(6.0);
1898 assert!(dev.evaluate_space_consensus(2.5, 12.0, "hello", "world"));
1901 }
1902
1903 #[test]
1904 fn consensus_no_space_on_kerning_gap() {
1905 let dev = make_device_with_median(6.0);
1909 assert!(!dev.evaluate_space_consensus(0.5, 12.0, "fi", "lm"));
1910 }
1911
1912 #[test]
1913 fn consensus_inserts_space_on_camel_case_plus_gap() {
1914 let dev = make_device_with_median(6.0);
1918 assert!(dev.evaluate_space_consensus(2.5, 12.0, "helloWorld", "Inc"));
1919 }
1920
1921 #[test]
1922 fn consensus_inserts_space_on_digit_letter_transition_with_gap() {
1923 let dev = make_device_with_median(6.0);
1924 assert!(dev.evaluate_space_consensus(2.5, 12.0, "123", "abc"));
1925 }
1926
1927 #[test]
1928 fn consensus_heuristic_alone_is_insufficient() {
1929 let dev = make_device_with_median(6.0);
1934 assert!(!dev.evaluate_space_consensus(0.5, 12.0, "camel", "Case"));
1935 }
1936
1937 #[test]
1938 fn consensus_falls_back_to_font_size_when_no_median() {
1939 let dev = TextExtractionDevice::new();
1941 assert!(dev.evaluate_space_consensus(1.9, 12.0, "a", "b"));
1943 assert!(!dev.evaluate_space_consensus(1.5, 12.0, "a", "b"));
1945 }
1946
1947 #[test]
1948 fn consensus_ignores_tiny_tj_offsets() {
1949 let mut dev = make_device_with_median(6.0);
1951 dev.pending_tj_offset = 50.0;
1952 assert!(!dev.evaluate_space_consensus(0.5, 12.0, "Hello", "World"));
1953 }
1954
1955 #[test]
1956 fn consensus_accepts_negative_tj_offsets() {
1957 let mut dev = make_device_with_median(6.0);
1960 dev.pending_tj_offset = -250.0;
1961 assert!(dev.evaluate_space_consensus(0.5, 12.0, "Hello", "World"));
1962 }
1963
1964 #[test]
1965 fn text_adjustment_accumulates_until_glyph() {
1966 let mut dev = TextExtractionDevice::new();
1967 dev.text_adjustment(120.0);
1968 dev.text_adjustment(140.0);
1969 assert!((dev.pending_tj_offset - 260.0).abs() < 1e-6);
1970 }
1971
1972 #[test]
1975 fn xy_cut_header_body_footer_with_two_columns() {
1976 let texts = block_texts(vec![
1982 span("HEADLINE TITLE", 180.0, 760.0, 120.0),
1983 span("Left col line A", 40.0, 700.0, 110.0),
1984 span("Right col line A", 320.0, 700.0, 115.0),
1985 span("Left col line B", 40.0, 684.0, 110.0),
1986 span("Right col line B", 320.0, 684.0, 115.0),
1987 span("Left col line C", 40.0, 668.0, 110.0),
1988 span("Right col line C", 320.0, 668.0, 115.0),
1989 span("FOOTER LINE TEXT", 180.0, 600.0, 120.0),
1990 ]);
1991 assert_eq!(texts.first().map(String::as_str), Some("HEADLINE TITLE"));
1992 assert_eq!(texts.last().map(String::as_str), Some("FOOTER LINE TEXT"));
1993 let left_c_idx = texts.iter().position(|s| s == "Left col line C").unwrap();
1995 let right_a_idx = texts.iter().position(|s| s == "Right col line A").unwrap();
1996 assert!(
1997 left_c_idx < right_a_idx,
1998 "expected column-major ordering in body: {texts:?}"
1999 );
2000 }
2001
2002 #[test]
2003 fn xy_cut_rejects_column_split_on_table_rows() {
2004 let texts = block_texts(vec![
2007 span("Name", 40.0, 700.0, 30.0),
2008 span("Age", 320.0, 700.0, 20.0),
2009 span("Alice", 40.0, 684.0, 35.0),
2010 span("30", 320.0, 684.0, 15.0),
2011 ]);
2012 assert_eq!(texts, vec!["Name Age", "Alice 30"]);
2013 }
2014
2015 #[test]
2016 fn xy_cut_rejects_column_split_when_one_band_is_full_width() {
2017 let texts = block_texts(vec![
2021 span(
2022 "Full width intro spanning both columns here",
2023 40.0,
2024 740.0,
2025 360.0,
2026 ),
2027 span("Left A", 40.0, 700.0, 50.0),
2028 span("Right A", 320.0, 700.0, 50.0),
2029 span("Left B", 40.0, 684.0, 50.0),
2030 span("Right B", 320.0, 684.0, 50.0),
2031 ]);
2032 assert!(
2033 texts[0].contains("Full width intro"),
2034 "expected full-width intro first: {texts:?}"
2035 );
2036 }
2037
2038 #[test]
2039 fn xy_cut_horizontal_split_for_zone_boundaries() {
2040 let texts = block_texts(vec![
2044 span("First paragraph body text", 40.0, 740.0, 200.0),
2045 span("Second paragraph body", 40.0, 680.0, 180.0),
2046 ]);
2047 assert_eq!(texts.len(), 2);
2048 assert!(texts[0].starts_with("First"));
2049 assert!(texts[1].starts_with("Second"));
2050 }
2051
2052 #[test]
2053 fn xy_cut_recursion_terminates_with_single_span() {
2054 let texts = block_texts(vec![span("Only one span on the page", 40.0, 700.0, 180.0)]);
2055 assert_eq!(texts, vec!["Only one span on the page"]);
2056 }
2057
2058 #[test]
2059 fn median_font_size_handles_mixed_sizes() {
2060 let spans = vec![
2061 TextSpan {
2062 text: "small".into(),
2063 x: 0.0,
2064 y: 0.0,
2065 width: 10.0,
2066 height: 8.0,
2067 font_size: 8.0,
2068 },
2069 TextSpan {
2070 text: "medium".into(),
2071 x: 0.0,
2072 y: 0.0,
2073 width: 10.0,
2074 height: 12.0,
2075 font_size: 12.0,
2076 },
2077 TextSpan {
2078 text: "large".into(),
2079 x: 0.0,
2080 y: 0.0,
2081 width: 10.0,
2082 height: 24.0,
2083 font_size: 24.0,
2084 },
2085 ];
2086 assert!((median_font_size(&spans) - 12.0).abs() < 1e-9);
2087 }
2088
2089 #[test]
2090 fn columns_band_aligned_accepts_aligned_columns() {
2091 let spans = vec![
2092 span("L1", 40.0, 700.0, 60.0),
2093 span("R1", 300.0, 700.0, 60.0),
2094 span("L2", 40.0, 684.0, 60.0),
2095 span("R2", 300.0, 684.0, 60.0),
2096 ];
2097 let stats = PageStats::from_spans(&spans);
2098 assert!(columns_are_band_aligned(&spans, 200.0, 40.0, 360.0, &stats));
2100 }
2101
2102 #[test]
2103 fn columns_band_aligned_rejects_wide_single_side_band() {
2104 let spans = vec![
2105 span("Wide banner line across top", 40.0, 740.0, 280.0),
2106 span("L1", 40.0, 700.0, 60.0),
2107 span("R1", 300.0, 700.0, 60.0),
2108 ];
2109 let stats = PageStats::from_spans(&spans);
2110 assert!(!columns_are_band_aligned(
2113 &spans, 200.0, 40.0, 360.0, &stats
2114 ));
2115 }
2116
2117 #[test]
2118 fn page_stats_computes_median_values() {
2119 let spans = vec![
2120 span("one", 40.0, 700.0, 30.0),
2121 span("two", 40.0, 680.0, 30.0),
2122 span("three", 40.0, 660.0, 50.0),
2123 ];
2124 let stats = PageStats::from_spans(&spans);
2125 assert!((stats.median_font_size - 12.0).abs() < 1e-9);
2126 assert!((stats.median_char_width - 10.0).abs() < 1e-9);
2128 assert!((stats.median_line_spacing - 20.0).abs() < 1e-9);
2130 }
2131
2132 #[test]
2133 fn page_stats_handles_empty_input() {
2134 let stats = PageStats::from_spans(&[]);
2135 assert!((stats.median_font_size - 12.0).abs() < 1e-9);
2136 assert!((stats.median_char_width - 6.0).abs() < 1e-9);
2137 assert_eq!(stats.median_line_spacing, 0.0);
2138 }
2139
2140 #[test]
2141 fn narrow_gutter_detected_with_adaptive_threshold() {
2142 let mut spans = Vec::new();
2146 for y in [700.0, 684.0, 668.0] {
2147 spans.push(span("Lorem ipsum", 40.0, y, 100.0));
2149 spans.push(span("dolor sit", 144.0, y, 80.0));
2150 spans.push(span("amet consec", 236.0, y, 100.0));
2152 spans.push(span("tetur adipi", 340.0, y, 80.0));
2153 }
2154 let texts = block_texts(spans);
2155 assert!(
2157 texts.len() >= 6,
2158 "expected column-major output, got {texts:?}"
2159 );
2160 assert!(
2162 texts[0].contains("Lorem"),
2163 "first block should be left column: {texts:?}"
2164 );
2165 }
2166
2167 #[test]
2168 fn xy_cut_leaf_falls_back_to_legacy_columns_for_header_plus_three_columns() {
2169 let texts = block_texts(vec![
2170 span("73022", 45.0, 750.0, 70.0),
2171 span("Federal Register banner", 125.6, 750.0, 260.0),
2172 span("Left column line one", 45.0, 725.0, 140.0),
2173 span("Middle column line one", 222.0, 725.0, 140.0),
2174 span("Right column line one", 399.0, 725.0, 120.0),
2175 span("Left column line two", 45.0, 715.0, 140.0),
2176 span("Middle column line two", 210.0, 715.0, 152.0),
2177 span("Right column line two", 388.0, 715.0, 132.0),
2178 span("Left column line three", 45.0, 705.0, 140.0),
2179 span("Middle column line three", 235.0, 705.0, 135.0),
2180 span("Right column line three", 408.0, 705.0, 118.0),
2181 ]);
2182
2183 assert_eq!(
2184 texts,
2185 vec![
2186 "73022 Federal Register banner",
2187 "Left column line one",
2188 "Left column line two",
2189 "Left column line three",
2190 "Middle column line one",
2191 "Middle column line two",
2192 "Middle column line three",
2193 "Right column line one",
2194 "Right column line two",
2195 "Right column line three",
2196 ]
2197 );
2198 }
2199
2200 #[test]
2201 fn overlapping_fake_bold_spans_collapse_to_single_copy() {
2202 let texts = block_texts(vec![
2203 span("1 This is fakebold text.", 25.9, 785.3, 320.0),
2204 span("1 This is fakebold text.", 26.2, 785.3, 320.0),
2205 span("1 This is fakebold text.", 26.4, 785.3, 320.0),
2206 span("1 This is fakebold text.", 26.7, 785.3, 320.0),
2207 span("2 This is a fakebold", 27.0, 714.8, 142.0),
2208 span(" fakebold", 169.8, 714.8, 70.0),
2209 span(" fakebold", 170.1, 714.8, 70.0),
2210 span(" fakebold word.", 170.4, 714.8, 110.0),
2211 ]);
2212
2213 assert_eq!(
2214 texts,
2215 vec!["1 This is fakebold text.", "2 This is a fakebold word.",]
2216 );
2217 }
2218}