1use kurbo::{Affine, BezPath};
4use pdf_render::pdf_interpret::cmap::BfString;
5use pdf_render::pdf_interpret::font::Glyph;
6use pdf_render::pdf_interpret::{
7 BlendMode, ClipPath, Device, GlyphDrawMode, Image, Paint, PathDrawMode, SoftMask,
8};
9use std::cmp::Ordering;
10
11const BAND_Y_TOLERANCE: f64 = 5.0;
15const BAND_Y_FRACTION: f64 = 0.30;
20const PARAGRAPH_BREAK_LINE_SPACING_MULTIPLIER: f64 = 1.8;
24
25const TJ_SPACE_THRESHOLD_UNITS: f32 = 100.0;
39const TJ_SIGNAL_WEIGHT: f64 = 0.95;
41const GAP_SIGNAL_WEIGHT: f64 = 0.80;
43const HEURISTIC_SIGNAL_WEIGHT: f64 = 0.60;
45const SPACE_CONSENSUS_THRESHOLD: f64 = 0.75;
47const GAP_TO_MEDIAN_CHAR_FRACTION: f64 = 0.30;
50const GAP_TO_FONT_SIZE_FALLBACK_FRACTION: f64 = 0.15;
53
54const COLUMN_GAP_THRESHOLD_MIN: f64 = 10.0;
56const COLUMN_GAP_THRESHOLD_MAX: f64 = 40.0;
58const COLUMN_GAP_MEDIAN_MULTIPLIER: f64 = 3.0;
60const COLUMN_GAP_THRESHOLD_FALLBACK: f64 = 20.0;
62const COLUMN_GAP_MATCH_TOLERANCE: f64 = 12.0;
64const MIN_COLUMN_GAPPED_BANDS: usize = 3;
66const MIN_COLUMN_GAP_SUPPORT: f64 = 0.80;
68const MIN_DENSE_SLICE_RATIO: f64 = 0.35;
70
71#[derive(Debug, Clone)]
73pub struct TextSpan {
74 pub text: String,
76 pub x: f64,
78 pub y: f64,
80 pub width: f64,
82 pub height: f64,
84 pub font_size: f64,
86}
87
88impl TextSpan {
89 fn right(&self) -> f64 {
92 self.x + self.width.max(self.estimated_width())
93 }
94
95 fn measured_right(&self) -> f64 {
97 self.x + self.width
98 }
99
100 fn estimated_width(&self) -> f64 {
101 let char_count = self.text.chars().count() as f64;
102 if char_count <= 0.0 {
103 self.font_size * 0.5
104 } else {
105 self.font_size * 0.5 * char_count
106 }
107 }
108}
109
110#[derive(Debug, Clone)]
112pub struct TextBlock {
113 pub spans: Vec<TextSpan>,
115}
116
117impl TextBlock {
118 pub fn text(&self) -> String {
124 if self.spans.is_empty() {
125 return String::new();
126 }
127 let mut result = self.spans[0].text.clone();
128 for pair in self.spans.windows(2) {
129 let prev = &pair[0];
130 let curr = &pair[1];
131 let expected_end = prev.measured_right();
132 let gap = curr.x - expected_end;
133 if gap <= prev.font_size * 0.12 {
134 if let Some(trimmed) = trim_overlapping_word_prefix(&prev.text, &curr.text) {
135 result.push_str(&trimmed);
136 continue;
137 }
138 }
139 if gap > prev.font_size * 0.25 {
140 result.push(' ');
141 }
142 result.push_str(&curr.text);
143 }
144 result
145 }
146}
147
148#[derive(Debug, Clone)]
149struct TextBand {
150 y: f64,
151 spans: Vec<TextSpan>,
152}
153
154impl TextBand {
155 fn new(span: TextSpan) -> Self {
156 Self {
157 y: span.y,
158 spans: vec![span],
159 }
160 }
161
162 fn sort_spans(&mut self) {
163 self.spans.sort_by(|a, b| {
164 a.x.partial_cmp(&b.x)
165 .unwrap_or(Ordering::Equal)
166 .then_with(|| b.y.partial_cmp(&a.y).unwrap_or(Ordering::Equal))
167 });
168 collapse_overprinted_spans(&mut self.spans);
169 }
170
171 fn row_block(&self) -> TextBlock {
172 let mut spans = self.spans.clone();
173 spans.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
174 TextBlock { spans }
175 }
176
177 fn left(&self) -> f64 {
178 self.spans
179 .iter()
180 .map(|span| span.x)
181 .fold(f64::INFINITY, f64::min)
182 }
183
184 fn right(&self) -> f64 {
185 self.spans
186 .iter()
187 .map(TextSpan::right)
188 .fold(f64::NEG_INFINITY, f64::max)
189 }
190
191 fn width(&self) -> f64 {
192 (self.right() - self.left()).max(0.0)
193 }
194
195 fn gap_midpoints(&self, column_gap_threshold: f64) -> Vec<f64> {
196 self.gaps(column_gap_threshold)
197 .into_iter()
198 .map(|gap| (gap.start + gap.end) * 0.5)
199 .collect()
200 }
201
202 fn gaps(&self, column_gap_threshold: f64) -> Vec<BandGap> {
203 if self.spans.len() < 2 {
204 return Vec::new();
205 }
206
207 let mut spans = self.spans.clone();
208 spans.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
209
210 let mut gaps = Vec::new();
211 let mut prev_right = spans[0].right();
212 for span in spans.iter().skip(1) {
213 let gap = span.x - prev_right;
214 if gap >= column_gap_threshold {
215 gaps.push(BandGap {
216 start: prev_right,
217 end: span.x,
218 });
219 }
220 prev_right = prev_right.max(span.right());
221 }
222
223 gaps
224 }
225
226 fn split_by_boundaries(&self, boundaries: &[f64]) -> Vec<Vec<TextSpan>> {
227 let mut columns = vec![Vec::new(); boundaries.len() + 1];
228 for span in &self.spans {
229 let center_x = span.x + span.width.max(span.estimated_width()) * 0.5;
230 let column_idx = boundaries
231 .iter()
232 .position(|boundary| center_x < *boundary)
233 .unwrap_or(boundaries.len());
234 columns[column_idx].push(span.clone());
235 }
236
237 for spans in &mut columns {
238 spans.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
239 }
240
241 columns
242 }
243
244 fn fits_single_column(
245 &self,
246 boundaries: &[f64],
247 region_left: f64,
248 region_right: f64,
249 ) -> Option<usize> {
250 let mut column_idx: Option<usize> = None;
251 for span in &self.spans {
252 let left = span.x;
253 let right = span.right();
254 if boundaries
255 .iter()
256 .any(|boundary| left < *boundary && right > *boundary)
257 {
258 return None;
259 }
260
261 let center_x = left + (right - left) * 0.5;
262 let idx = boundaries
263 .iter()
264 .position(|boundary| center_x < *boundary)
265 .unwrap_or(boundaries.len());
266 match column_idx {
267 Some(existing) if existing != idx => return None,
268 Some(_) => {}
269 None => column_idx = Some(idx),
270 }
271 }
272 let idx = column_idx?;
273 let mut edges = Vec::with_capacity(boundaries.len() + 2);
274 edges.push(region_left);
275 edges.extend_from_slice(boundaries);
276 edges.push(region_right);
277
278 let column_width = (edges[idx + 1] - edges[idx]).max(0.0);
279 if column_width <= 0.0 || self.width() > column_width * 0.8 {
280 return None;
281 }
282
283 Some(idx)
284 }
285}
286
287#[derive(Debug, Clone, Copy)]
288struct BandGap {
289 start: f64,
290 end: f64,
291}
292
293pub(crate) struct TextExtractionDevice {
315 spans: Vec<TextSpan>,
316 last_y: f64,
317 last_end_x: f64,
318 pending_tj_offset: f32,
322 glyph_widths: Vec<f64>,
326 cached_median_char_width: f64,
330}
331
332const MEDIAN_REFRESH: usize = 32;
333
334impl Default for TextExtractionDevice {
335 fn default() -> Self {
336 Self::new()
337 }
338}
339
340impl TextExtractionDevice {
341 pub fn new() -> Self {
343 Self {
344 spans: Vec::new(),
345 last_y: f64::NEG_INFINITY,
346 last_end_x: f64::NEG_INFINITY,
347 pending_tj_offset: 0.0,
348 glyph_widths: Vec::new(),
349 cached_median_char_width: 0.0,
350 }
351 }
352
353 fn refresh_median_char_width(&mut self) {
356 if self.glyph_widths.is_empty() {
357 self.cached_median_char_width = 0.0;
358 return;
359 }
360 let mut sorted = self.glyph_widths.clone();
361 sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
362 self.cached_median_char_width = sorted[sorted.len() / 2];
363 }
364
365 fn evaluate_space_consensus(
368 &self,
369 gap: f64,
370 font_size: f64,
371 prev_text: &str,
372 next_text: &str,
373 ) -> bool {
374 let mut confidence = 0.0;
375
376 if self.pending_tj_offset.abs() >= TJ_SPACE_THRESHOLD_UNITS {
379 confidence += TJ_SIGNAL_WEIGHT;
380 }
381
382 let gap_reference = if self.cached_median_char_width > 0.0 {
386 self.cached_median_char_width * GAP_TO_MEDIAN_CHAR_FRACTION
387 } else {
388 font_size * GAP_TO_FONT_SIZE_FALLBACK_FRACTION
389 };
390 if gap > gap_reference {
391 confidence += GAP_SIGNAL_WEIGHT;
392 }
393
394 if let (Some(prev_last), Some(next_first)) =
398 (prev_text.chars().last(), next_text.chars().next())
399 {
400 let camel = prev_last.is_lowercase() && next_first.is_uppercase();
401 let digit_to_letter = prev_last.is_ascii_digit() && next_first.is_alphabetic();
402 let letter_to_digit = prev_last.is_alphabetic() && next_first.is_ascii_digit();
403 if camel || digit_to_letter || letter_to_digit {
404 confidence += HEURISTIC_SIGNAL_WEIGHT;
405 }
406 }
407
408 confidence >= SPACE_CONSENSUS_THRESHOLD
409 }
410
411 pub fn into_text(self) -> String {
413 let blocks = group_spans_into_blocks(self.spans);
414 let lines: Vec<String> = blocks.iter().map(|b| b.text()).collect();
415 let stitched = stitch_hyphenated_lines(&lines);
416 normalize_text_output(&stitched)
417 }
418
419 pub fn into_blocks(self) -> Vec<TextBlock> {
421 group_spans_into_blocks(self.spans)
422 }
423
424 #[allow(dead_code)]
426 pub(crate) fn into_spans(self) -> Vec<TextSpan> {
427 self.spans
428 }
429}
430
431impl Device<'_> for TextExtractionDevice {
432 fn set_soft_mask(&mut self, _: Option<SoftMask<'_>>) {}
433 fn set_blend_mode(&mut self, _: BlendMode) {}
434 fn draw_path(&mut self, _: &BezPath, _: Affine, _: &Paint<'_>, _: &PathDrawMode) {}
435 fn push_clip_path(&mut self, _: &ClipPath) {}
436 fn push_transparency_group(&mut self, _: f32, _: Option<SoftMask<'_>>, _: BlendMode) {}
437 fn draw_image(&mut self, _: Image<'_, '_>, _: Affine) {}
438 fn pop_clip_path(&mut self) {}
439 fn pop_transparency_group(&mut self) {}
440
441 fn draw_glyph(
442 &mut self,
443 glyph: &Glyph<'_>,
444 transform: Affine,
445 glyph_transform: Affine,
446 _paint: &Paint<'_>,
447 _draw_mode: &GlyphDrawMode,
448 ) {
449 let text = match glyph.as_unicode() {
450 Some(BfString::Char(c)) => c.to_string(),
451 Some(BfString::String(s)) => s,
452 None => return,
453 };
454
455 let composed = transform * glyph_transform;
456 let coeffs = composed.as_coeffs();
457 let x = coeffs[4];
458 let y = coeffs[5];
459 let glyph_scale = (coeffs[0].powi(2) + coeffs[1].powi(2)).sqrt().abs();
460 let font_size = glyph_scale * 1000.0;
461 let glyph_width = estimate_glyph_width(glyph, font_size).max(font_size * 0.25);
462 let glyph_end_x = x + glyph_width;
463
464 if self.glyph_widths.len() < 4096 {
468 self.glyph_widths.push(glyph_width);
469 if self.glyph_widths.len() % MEDIAN_REFRESH == 0 {
470 self.refresh_median_char_width();
471 }
472 }
473
474 let same_line = (y - self.last_y).abs() <= font_size.max(BAND_Y_TOLERANCE) * 0.35;
475 let gap = x - self.last_end_x;
476 let adjacent = same_line && gap >= -font_size * 0.25 && gap < font_size * 0.5;
477
478 if adjacent && !self.spans.is_empty() {
479 let want_space = {
487 let last = self.spans.last().expect("checked non-empty");
488 !last.text.ends_with(' ')
489 && !text.starts_with(' ')
490 && self.evaluate_space_consensus(gap, font_size, &last.text, &text)
491 };
492 let last = self.spans.last_mut().expect("checked non-empty");
493 if want_space {
494 last.text.push(' ');
495 }
496 last.text.push_str(&text);
497 last.width = last.width.max(glyph_end_x - last.x);
498 last.height = last.height.max(font_size);
499 self.last_y = y;
500 self.last_end_x = glyph_end_x;
501 self.pending_tj_offset = 0.0;
504 return;
505 }
506
507 self.last_y = y;
508 self.last_end_x = glyph_end_x;
509 self.pending_tj_offset = 0.0;
513
514 self.spans.push(TextSpan {
515 text,
516 x,
517 y,
518 width: glyph_width,
519 height: font_size,
520 font_size,
521 });
522 }
523
524 fn text_adjustment(&mut self, amount: f32) {
529 self.pending_tj_offset += amount;
530 }
531}
532
533fn estimate_glyph_width(glyph: &Glyph<'_>, font_size: f64) -> f64 {
534 match glyph {
535 Glyph::Outline(outline) => outline
536 .advance_width()
537 .map(|width| width as f64 / 1000.0 * font_size)
538 .unwrap_or(font_size * 0.5),
539 Glyph::Type3(_) => font_size * 0.5,
540 }
541}
542
543fn collapse_overprinted_spans(spans: &mut Vec<TextSpan>) {
549 if spans.len() < 2 {
550 return;
551 }
552
553 let mut deduped: Vec<TextSpan> = Vec::with_capacity(spans.len());
554 for span in spans.drain(..) {
555 if let Some(last) = deduped.last_mut() {
556 if spans_are_overprint_duplicates(last, &span) {
557 let choose_incoming = span.text.chars().count() > last.text.chars().count()
558 || (span.text.chars().count() == last.text.chars().count()
559 && span.width > last.width);
560 let preferred_text = if choose_incoming {
561 span.text.clone()
562 } else {
563 last.text.clone()
564 };
565 let left = last.x.min(span.x);
566 let right = last.right().max(span.right());
567 last.x = left;
568 last.y = (last.y + span.y) * 0.5;
569 last.width = (right - left).max(last.width).max(span.width);
570 last.height = last.height.max(span.height);
571 last.font_size = last.font_size.max(span.font_size);
572 last.text = preferred_text;
573 continue;
574 }
575 }
576
577 deduped.push(span);
578 }
579
580 *spans = deduped;
581}
582
583fn spans_are_overprint_duplicates(lhs: &TextSpan, rhs: &TextSpan) -> bool {
584 let lhs_text = lhs.text.trim();
585 let rhs_text = rhs.text.trim();
586 if lhs_text.is_empty() || rhs_text.is_empty() {
587 return false;
588 }
589
590 let same_baseline = (lhs.y - rhs.y).abs() <= lhs.font_size.max(rhs.font_size) * 0.12;
591 if !same_baseline {
592 return false;
593 }
594
595 let lhs_left = lhs.x;
596 let lhs_right = lhs.right();
597 let rhs_left = rhs.x;
598 let rhs_right = rhs.right();
599 let overlap = (lhs_right.min(rhs_right) - lhs_left.max(rhs_left)).max(0.0);
600 let min_width = (lhs_right - lhs_left).min(rhs_right - rhs_left).max(1.0);
601 let heavily_overlaps = overlap / min_width >= 0.85;
602 if !heavily_overlaps {
603 return false;
604 }
605
606 lhs_text == rhs_text || lhs_text.starts_with(rhs_text) || rhs_text.starts_with(lhs_text)
607}
608
609fn trim_overlapping_word_prefix(prev: &str, curr: &str) -> Option<String> {
610 let prev_chars: Vec<char> = prev.trim_end().chars().collect();
611 let curr_chars: Vec<char> = curr.trim_start().chars().collect();
612 let max = prev_chars.len().min(curr_chars.len());
613
614 for len in (4..=max).rev() {
615 let prev_start = prev_chars.len() - len;
616 if prev_chars[prev_start..] != curr_chars[..len] {
617 continue;
618 }
619
620 if !curr_chars[..len].iter().all(|ch| ch.is_alphanumeric()) {
621 continue;
622 }
623
624 let prev_boundary = prev_start == 0 || !prev_chars[prev_start - 1].is_alphanumeric();
625 let curr_boundary = len == curr_chars.len() || !curr_chars[len].is_alphanumeric();
626 if !prev_boundary || !curr_boundary {
627 continue;
628 }
629
630 return Some(curr_chars[len..].iter().collect());
631 }
632
633 None
634}
635
636fn compute_adaptive_column_gap(bands: &[TextBand]) -> f64 {
643 let mut all_gaps: Vec<f64> = Vec::new();
644
645 for band in bands {
646 if band.spans.len() < 2 {
647 continue;
648 }
649 let mut sorted = band.spans.clone();
650 sorted.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
651 let mut prev_right = sorted[0].right();
652 for span in sorted.iter().skip(1) {
653 let gap = span.x - prev_right;
654 if gap > 0.0 {
655 all_gaps.push(gap);
656 }
657 prev_right = prev_right.max(span.right());
658 }
659 }
660
661 if all_gaps.is_empty() {
662 return COLUMN_GAP_THRESHOLD_FALLBACK;
663 }
664
665 all_gaps.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
666
667 let min_gap = all_gaps[0];
668
669 if min_gap > COLUMN_GAP_THRESHOLD_MIN {
674 return (min_gap * 0.75).clamp(COLUMN_GAP_THRESHOLD_MIN, COLUMN_GAP_THRESHOLD_MAX);
675 }
676
677 let mut best_break_threshold = 0.0f64;
680 let mut best_ratio = 1.5f64; for pair in all_gaps.windows(2) {
682 if pair[0] > 0.5 {
683 let ratio = pair[1] / pair[0];
684 if ratio > best_ratio {
685 best_ratio = ratio;
686 best_break_threshold = (pair[0] + pair[1]) * 0.5;
687 }
688 }
689 }
690
691 if best_break_threshold > 0.0 {
692 return best_break_threshold.clamp(COLUMN_GAP_THRESHOLD_MIN, COLUMN_GAP_THRESHOLD_MAX);
693 }
694
695 let mid = all_gaps.len() / 2;
697 let median = if all_gaps.len() % 2 == 0 {
698 (all_gaps[mid - 1] + all_gaps[mid]) * 0.5
699 } else {
700 all_gaps[mid]
701 };
702
703 (median * COLUMN_GAP_MEDIAN_MULTIPLIER)
704 .clamp(COLUMN_GAP_THRESHOLD_MIN, COLUMN_GAP_THRESHOLD_MAX)
705}
706
707#[derive(Debug, Clone, Copy)]
715struct PageStats {
716 median_font_size: f64,
718 #[allow(dead_code)]
723 median_char_width: f64,
724 median_line_spacing: f64,
730}
731
732impl PageStats {
733 fn from_spans(spans: &[TextSpan]) -> Self {
734 if spans.is_empty() {
735 return Self {
736 median_font_size: 12.0,
737 median_char_width: 6.0,
738 median_line_spacing: 0.0,
739 };
740 }
741
742 let mut sizes: Vec<f64> = spans.iter().map(|s| s.font_size).collect();
744 sizes.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
745 let median_font_size = sizes[sizes.len() / 2];
746
747 let mut char_widths: Vec<f64> = spans
749 .iter()
750 .filter_map(|s| {
751 let chars = s.text.chars().count();
752 if chars > 0 && s.width > 0.0 {
753 Some(s.width / chars as f64)
754 } else {
755 None
756 }
757 })
758 .collect();
759 let median_char_width = if char_widths.is_empty() {
760 median_font_size * 0.5
761 } else {
762 char_widths.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
763 char_widths[char_widths.len() / 2]
764 };
765
766 let band_tolerance = (median_font_size * BAND_Y_FRACTION).max(BAND_Y_TOLERANCE);
769 let mut ys: Vec<f64> = spans.iter().map(|s| s.y).collect();
770 ys.sort_by(|a, b| b.partial_cmp(a).unwrap_or(Ordering::Equal));
771 let mut band_ys: Vec<f64> = Vec::new();
772 for y in ys {
773 if band_ys
774 .last()
775 .map(|prev: &f64| (prev - y).abs() > band_tolerance)
776 .unwrap_or(true)
777 {
778 band_ys.push(y);
779 }
780 }
781 let median_line_spacing = if band_ys.len() < 2 {
790 0.0
791 } else {
792 let mut spacings: Vec<f64> = band_ys
793 .windows(2)
794 .map(|pair| (pair[0] - pair[1]).abs())
795 .collect();
796 spacings.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
797 let q1_index = spacings.len() / 4;
798 spacings[q1_index]
799 };
800
801 Self {
802 median_font_size,
803 median_char_width,
804 median_line_spacing,
805 }
806 }
807}
808
809const XY_CUT_MAX_DEPTH: usize = 12;
814const XY_CUT_VERTICAL_GAP_REGION_FRACTION: f64 = 0.04;
817const XY_CUT_VERTICAL_GAP_FLOOR: f64 = 10.0;
821const XY_CUT_HORIZONTAL_GAP_FONT_MULTIPLIER: f64 = 1.8;
824const XY_CUT_MIN_SPANS_PER_COLUMN: usize = 2;
828const XY_CUT_MIN_CHARS_PER_BAND: f64 = 8.0;
831
832fn group_spans_into_blocks(spans: Vec<TextSpan>) -> Vec<TextBlock> {
836 if spans.is_empty() {
837 return Vec::new();
838 }
839 let stats = PageStats::from_spans(&spans);
840 xy_cut_recursive(spans, 0, &stats)
841}
842
843fn xy_cut_recursive(spans: Vec<TextSpan>, depth: usize, stats: &PageStats) -> Vec<TextBlock> {
844 if spans.is_empty() {
845 return Vec::new();
846 }
847 if depth >= XY_CUT_MAX_DEPTH {
848 return band_based_blocks(spans, stats);
849 }
850
851 let vcut = try_vertical_cut(&spans, stats);
858 let hcut = try_horizontal_cut(&spans, stats);
859
860 let (chosen, _) = match (vcut, hcut) {
861 (Some((v_groups, v_gap)), Some((h_groups, h_gap))) => {
862 if v_gap >= h_gap {
863 (Some(v_groups), v_gap)
864 } else {
865 (Some(h_groups), h_gap)
866 }
867 }
868 (Some((v_groups, v_gap)), None) => (Some(v_groups), v_gap),
869 (None, Some((h_groups, h_gap))) => (Some(h_groups), h_gap),
870 (None, None) => (None, 0.0),
871 };
872
873 if let Some(groups) = chosen {
874 let mut out = Vec::new();
875 for group in groups {
876 out.extend(xy_cut_recursive(group, depth + 1, stats));
877 }
878 return out;
879 }
880
881 band_based_blocks(spans, stats)
882}
883
884fn band_based_blocks(spans: Vec<TextSpan>, stats: &PageStats) -> Vec<TextBlock> {
888 group_spans_into_blocks_legacy_with_stats(spans, stats)
893}
894
895#[allow(dead_code)]
898fn median_font_size(spans: &[TextSpan]) -> f64 {
899 if spans.is_empty() {
900 return 12.0;
901 }
902 let mut sizes: Vec<f64> = spans.iter().map(|s| s.font_size).collect();
903 sizes.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
904 sizes[sizes.len() / 2]
905}
906
907fn try_vertical_cut(spans: &[TextSpan], stats: &PageStats) -> Option<(Vec<Vec<TextSpan>>, f64)> {
924 if spans.len() < 2 * XY_CUT_MIN_SPANS_PER_COLUMN {
925 return None;
926 }
927
928 let region_left = spans.iter().map(|s| s.x).fold(f64::INFINITY, f64::min);
929 let region_right = spans
930 .iter()
931 .map(TextSpan::right)
932 .fold(f64::NEG_INFINITY, f64::max);
933 let region_width = region_right - region_left;
934 if region_width <= 0.0 {
935 return None;
936 }
937
938 let bands = group_spans_into_bands_with_stats(spans.to_vec(), stats);
947 let adaptive = compute_adaptive_column_gap(&bands);
948 let floor = stats
949 .median_font_size
950 .max(region_width * XY_CUT_VERTICAL_GAP_REGION_FRACTION)
951 .max(XY_CUT_VERTICAL_GAP_FLOOR);
952 let min_gap = adaptive.min(floor).max(XY_CUT_VERTICAL_GAP_FLOOR);
953
954 let mut intervals: Vec<(f64, f64)> = spans
957 .iter()
958 .map(|s| (s.x, s.right().max(s.x + 0.001)))
959 .collect();
960 intervals.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(Ordering::Equal));
961
962 let mut cursor = intervals[0].1;
963 let mut best_gap: Option<(f64, f64)> = None; for (left, right) in intervals.iter().skip(1) {
965 if *left > cursor {
966 let gap = *left - cursor;
967 if gap >= min_gap {
968 match best_gap {
969 Some((best, _)) if best >= gap => {}
970 _ => {
971 let cut_x = (cursor + *left) * 0.5;
972 best_gap = Some((gap, cut_x));
973 }
974 }
975 }
976 }
977 cursor = cursor.max(*right);
978 }
979
980 let (gap_size, cut_x) = best_gap?;
981
982 let mut left_group = Vec::new();
985 let mut right_group = Vec::new();
986 for span in spans {
987 let midpoint = span.x + (span.right() - span.x) * 0.5;
988 if midpoint < cut_x {
989 left_group.push(span.clone());
990 } else {
991 right_group.push(span.clone());
992 }
993 }
994
995 if !columns_are_dense(&left_group, &right_group, stats) {
996 return None;
997 }
998 if !columns_are_band_aligned(spans, cut_x, region_left, region_right, stats) {
999 return None;
1000 }
1001
1002 Some((vec![left_group, right_group], gap_size))
1003}
1004
1005fn columns_are_band_aligned(
1011 spans: &[TextSpan],
1012 cut_x: f64,
1013 region_left: f64,
1014 region_right: f64,
1015 stats: &PageStats,
1016) -> bool {
1017 let left_width = (cut_x - region_left).max(1.0);
1018 let right_width = (region_right - cut_x).max(1.0);
1019
1020 const MAX_SINGLE_SIDE_FRACTION: f64 = 0.70;
1025
1026 let bands = group_spans_into_bands_with_stats(spans.to_vec(), stats);
1027 for band in &bands {
1028 let mut has_left = false;
1029 let mut has_right = false;
1030 for span in &band.spans {
1031 let midpoint = span.x + (span.right() - span.x) * 0.5;
1032 if midpoint < cut_x {
1033 has_left = true;
1034 } else {
1035 has_right = true;
1036 }
1037 }
1038 if has_left && has_right {
1039 continue; }
1041 let band_width = band.width();
1042 if has_left && band_width > left_width * MAX_SINGLE_SIDE_FRACTION {
1043 return false;
1044 }
1045 if has_right && band_width > right_width * MAX_SINGLE_SIDE_FRACTION {
1046 return false;
1047 }
1048 }
1049 true
1050}
1051
1052fn columns_are_dense(left: &[TextSpan], right: &[TextSpan], stats: &PageStats) -> bool {
1057 for col in [left, right] {
1058 if col.len() < XY_CUT_MIN_SPANS_PER_COLUMN {
1059 return false;
1060 }
1061 let bands = group_spans_into_bands_with_stats(col.to_vec(), stats);
1062 if bands.is_empty() {
1063 return false;
1064 }
1065 let total_chars: usize = col.iter().map(|s| s.text.chars().count()).sum();
1066 let chars_per_band = total_chars as f64 / bands.len() as f64;
1067 if chars_per_band < XY_CUT_MIN_CHARS_PER_BAND {
1068 return false;
1069 }
1070 }
1071 true
1072}
1073
1074fn try_horizontal_cut(spans: &[TextSpan], stats: &PageStats) -> Option<(Vec<Vec<TextSpan>>, f64)> {
1078 if spans.len() < 2 {
1079 return None;
1080 }
1081 let mut sorted = spans.to_vec();
1083 sorted.sort_by(|a, b| {
1084 b.y.partial_cmp(&a.y)
1085 .unwrap_or(Ordering::Equal)
1086 .then_with(|| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal))
1087 });
1088
1089 let min_gap = if stats.median_line_spacing > 0.0 {
1095 stats.median_line_spacing * PARAGRAPH_BREAK_LINE_SPACING_MULTIPLIER
1096 } else {
1097 stats.median_font_size * XY_CUT_HORIZONTAL_GAP_FONT_MULTIPLIER
1098 };
1099
1100 let mut best: Option<(f64, f64)> = None; let tolerance = stats.median_font_size * BAND_Y_FRACTION;
1103 let mut band_bottom = sorted[0].y;
1104
1105 for span in sorted.iter().skip(1) {
1106 if (band_bottom - span.y).abs() <= tolerance {
1107 band_bottom = band_bottom.min(span.y);
1108 continue;
1109 }
1110 let gap = band_bottom - span.y;
1111 if gap >= min_gap {
1112 let cut_y = (band_bottom + span.y) * 0.5;
1113 match best {
1114 Some((best_gap, _)) if best_gap >= gap => {}
1115 _ => best = Some((gap, cut_y)),
1116 }
1117 }
1118 band_bottom = span.y;
1119 }
1120
1121 let (gap_size, cut_y) = best?;
1122
1123 let mut top_group = Vec::new();
1124 let mut bottom_group = Vec::new();
1125 for span in spans {
1126 if span.y > cut_y {
1127 top_group.push(span.clone());
1128 } else {
1129 bottom_group.push(span.clone());
1130 }
1131 }
1132 if top_group.is_empty() || bottom_group.is_empty() {
1133 return None;
1134 }
1135 Some((vec![top_group, bottom_group], gap_size))
1136}
1137
1138#[allow(dead_code)]
1142fn group_spans_into_blocks_legacy(spans: Vec<TextSpan>) -> Vec<TextBlock> {
1143 let bands = group_spans_into_bands(spans);
1144 group_spans_into_blocks_legacy_from_bands(bands)
1145}
1146
1147fn group_spans_into_blocks_legacy_with_stats(
1148 spans: Vec<TextSpan>,
1149 stats: &PageStats,
1150) -> Vec<TextBlock> {
1151 let bands = group_spans_into_bands_with_stats(spans, stats);
1152 group_spans_into_blocks_legacy_from_bands(bands)
1153}
1154
1155fn group_spans_into_blocks_legacy_from_bands(bands: Vec<TextBand>) -> Vec<TextBlock> {
1156 if bands.is_empty() {
1157 return Vec::new();
1158 }
1159
1160 let column_gap_threshold = compute_adaptive_column_gap(&bands);
1161
1162 let mut blocks = Vec::new();
1163 let mut idx = 0;
1164
1165 while idx < bands.len() {
1166 let gap_midpoints = bands[idx].gap_midpoints(column_gap_threshold);
1167 if gap_midpoints.is_empty() {
1168 blocks.push(bands[idx].row_block());
1169 idx += 1;
1170 continue;
1171 }
1172
1173 let mut boundaries = gap_midpoints.clone();
1174 let mut band_indices = vec![idx];
1175 let mut gapped_band_count = 1usize;
1176 let mut region_left = bands[idx].left();
1177 let mut region_right = bands[idx].right();
1178 let mut next_idx = idx + 1;
1179
1180 while next_idx < bands.len() {
1181 let next_band = &bands[next_idx];
1182 let next_gap_midpoints = next_band.gap_midpoints(column_gap_threshold);
1183 if next_gap_midpoints.is_empty() {
1184 if next_band
1185 .fits_single_column(&boundaries, region_left, region_right)
1186 .is_some()
1187 {
1188 band_indices.push(next_idx);
1189 next_idx += 1;
1190 continue;
1191 }
1192 break;
1193 }
1194
1195 if !boundaries_match(&boundaries, &next_gap_midpoints, column_gap_threshold) {
1196 break;
1197 }
1198
1199 update_boundaries(&mut boundaries, &next_gap_midpoints, gapped_band_count);
1200 gapped_band_count += 1;
1201 band_indices.push(next_idx);
1202 region_left = region_left.min(next_band.left());
1203 region_right = region_right.max(next_band.right());
1204 next_idx += 1;
1205 }
1206
1207 if region_is_columnar(&bands, &band_indices, &boundaries, gapped_band_count) {
1208 append_column_region_blocks(&bands, &band_indices, &boundaries, &mut blocks);
1209 idx = next_idx;
1210 } else {
1211 blocks.push(bands[idx].row_block());
1212 idx += 1;
1213 }
1214 }
1215
1216 blocks
1217}
1218
1219fn group_spans_into_bands(spans: Vec<TextSpan>) -> Vec<TextBand> {
1223 let stats = PageStats::from_spans(&spans);
1224 group_spans_into_bands_with_stats(spans, &stats)
1225}
1226
1227fn group_spans_into_bands_with_stats(mut spans: Vec<TextSpan>, stats: &PageStats) -> Vec<TextBand> {
1228 if spans.is_empty() {
1229 return Vec::new();
1230 }
1231
1232 spans.sort_by(|a, b| {
1233 b.y.partial_cmp(&a.y)
1234 .unwrap_or(Ordering::Equal)
1235 .then_with(|| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal))
1236 });
1237
1238 let page_tolerance = (stats.median_font_size * BAND_Y_FRACTION).max(BAND_Y_TOLERANCE);
1244
1245 let mut bands: Vec<TextBand> = Vec::new();
1246
1247 for span in spans {
1248 let tolerance = (span.height * BAND_Y_FRACTION)
1249 .max(page_tolerance)
1250 .max(BAND_Y_TOLERANCE);
1251 if let Some(band) = bands
1252 .iter_mut()
1253 .find(|band| (band.y - span.y).abs() <= tolerance)
1254 {
1255 let span_count = band.spans.len() as f64;
1256 band.y = (band.y * span_count + span.y) / (span_count + 1.0);
1257 band.spans.push(span);
1258 } else {
1259 bands.push(TextBand::new(span));
1260 }
1261 }
1262
1263 for band in &mut bands {
1264 band.sort_spans();
1265 }
1266
1267 bands.sort_by(|a, b| b.y.partial_cmp(&a.y).unwrap_or(Ordering::Equal));
1268 bands
1269}
1270
1271fn boundaries_match(boundaries: &[f64], gap_midpoints: &[f64], column_gap_threshold: f64) -> bool {
1272 let tolerance = (column_gap_threshold * 1.5)
1273 .max(COLUMN_GAP_MATCH_TOLERANCE)
1274 .min(60.0);
1275 boundaries.len() == gap_midpoints.len()
1276 && boundaries
1277 .iter()
1278 .zip(gap_midpoints)
1279 .all(|(lhs, rhs)| (lhs - rhs).abs() <= tolerance)
1280}
1281
1282fn update_boundaries(boundaries: &mut [f64], gap_midpoints: &[f64], seen_gapped_bands: usize) {
1283 for (boundary, midpoint) in boundaries.iter_mut().zip(gap_midpoints) {
1284 *boundary =
1285 (*boundary * seen_gapped_bands as f64 + midpoint) / (seen_gapped_bands as f64 + 1.0);
1286 }
1287}
1288
1289fn region_is_columnar(
1290 bands: &[TextBand],
1291 band_indices: &[usize],
1292 boundaries: &[f64],
1293 gapped_band_count: usize,
1294) -> bool {
1295 if boundaries.is_empty()
1296 || gapped_band_count < MIN_COLUMN_GAPPED_BANDS
1297 || band_indices.is_empty()
1298 || (gapped_band_count as f64 / band_indices.len() as f64) < MIN_COLUMN_GAP_SUPPORT
1299 {
1300 return false;
1301 }
1302
1303 let mut non_empty_slices = 0usize;
1304 let mut dense_slices = 0usize;
1305 let mut slices_per_column = vec![0usize; boundaries.len() + 1];
1306
1307 for &band_idx in band_indices {
1308 let slices = bands[band_idx].split_by_boundaries(boundaries);
1309 for (column_idx, slice) in slices.iter().enumerate() {
1310 if slice.is_empty() {
1311 continue;
1312 }
1313
1314 non_empty_slices += 1;
1315 slices_per_column[column_idx] += 1;
1316
1317 let char_count = slice
1318 .iter()
1319 .map(|span| span.text.chars().count())
1320 .sum::<usize>();
1321 if slice.len() >= 2 || char_count >= 8 {
1322 dense_slices += 1;
1323 }
1324 }
1325 }
1326
1327 if non_empty_slices < boundaries.len() + 2 {
1328 return false;
1329 }
1330
1331 if slices_per_column.contains(&0) {
1332 return false;
1333 }
1334
1335 (dense_slices as f64 / non_empty_slices as f64) >= MIN_DENSE_SLICE_RATIO
1336}
1337
1338fn append_column_region_blocks(
1339 bands: &[TextBand],
1340 band_indices: &[usize],
1341 boundaries: &[f64],
1342 blocks: &mut Vec<TextBlock>,
1343) {
1344 let column_count = boundaries.len() + 1;
1345 let mut column_bands = vec![Vec::<TextSpan>::new(); column_count];
1346
1347 for &band_idx in band_indices {
1348 let slices = bands[band_idx].split_by_boundaries(boundaries);
1349 for (column_idx, slice) in slices.into_iter().enumerate() {
1350 if slice.is_empty() {
1351 continue;
1352 }
1353 column_bands[column_idx].push(TextSpan {
1354 text: String::new(),
1355 x: 0.0,
1356 y: 0.0,
1357 width: 0.0,
1358 height: 0.0,
1359 font_size: 0.0,
1360 });
1361 let marker_idx = column_bands[column_idx].len() - 1;
1362 column_bands[column_idx][marker_idx] = TextSpan {
1363 text: String::new(),
1364 x: f64::NEG_INFINITY,
1365 y: bands[band_idx].y,
1366 width: 0.0,
1367 height: 0.0,
1368 font_size: 0.0,
1369 };
1370 column_bands[column_idx].extend(slice);
1371 }
1372 }
1373
1374 for spans in column_bands {
1375 let mut current: Vec<TextSpan> = Vec::new();
1376 for span in spans {
1377 if span.x == f64::NEG_INFINITY {
1378 if !current.is_empty() {
1379 current.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
1380 blocks.push(TextBlock {
1381 spans: std::mem::take(&mut current),
1382 });
1383 }
1384 continue;
1385 }
1386 current.push(span);
1387 }
1388 if !current.is_empty() {
1389 current.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
1390 blocks.push(TextBlock { spans: current });
1391 }
1392 }
1393}
1394
1395fn stitch_hyphenated_lines(lines: &[String]) -> String {
1410 let mut out = String::new();
1411 for (idx, line) in lines.iter().enumerate() {
1412 if idx == 0 {
1413 out.push_str(line);
1414 continue;
1415 }
1416
1417 let next_trimmed = line.trim_start();
1418
1419 let should_merge = is_hyphen_wrap_candidate(&out, next_trimmed);
1421
1422 if should_merge {
1423 out.pop(); out.push_str(next_trimmed);
1425 } else {
1426 out.push('\n');
1427 out.push_str(line);
1428 }
1429 }
1430 out
1431}
1432
1433fn is_hyphen_wrap_candidate(accumulated: &str, next_trimmed: &str) -> bool {
1436 if !accumulated.ends_with('-') {
1438 return false;
1439 }
1440
1441 let before_hyphen = accumulated.chars().rev().nth(1);
1443 if !before_hyphen.is_some_and(|c| c.is_alphabetic()) {
1444 return false;
1445 }
1446
1447 let alpha_prefix_len = accumulated
1449 .chars()
1450 .rev()
1451 .skip(1) .take_while(|c| c.is_alphabetic())
1453 .count();
1454 if alpha_prefix_len < 3 {
1455 return false;
1456 }
1457
1458 let first_next = next_trimmed.chars().next();
1460 if !first_next.is_some_and(|c| c.is_ascii_lowercase()) {
1461 return false;
1462 }
1463
1464 let next_alpha_len = next_trimmed
1466 .chars()
1467 .take_while(|c| c.is_ascii_lowercase())
1468 .count();
1469 if next_alpha_len < 3 {
1470 return false;
1471 }
1472
1473 true
1474}
1475
1476pub(crate) fn normalize_text_output(text: &str) -> String {
1483 if text.is_empty() {
1484 return String::new();
1485 }
1486
1487 let mut lines: Vec<&str> = Vec::new();
1488 for line in text.split('\n') {
1489 lines.push(line.trim_end());
1490 }
1491
1492 while lines.last() == Some(&"") {
1494 lines.pop();
1495 }
1496
1497 if lines.is_empty() {
1498 return String::new();
1499 }
1500
1501 let mut result = String::with_capacity(text.len());
1502 let mut consecutive_empty = 0u32;
1503
1504 for (i, line) in lines.iter().enumerate() {
1505 if line.is_empty() || *line == "\x0C" {
1506 if line.is_empty() {
1507 consecutive_empty += 1;
1508 if consecutive_empty <= 2 {
1510 result.push('\n');
1511 }
1512 } else {
1513 consecutive_empty = 0;
1515 result.push_str(line);
1516 if i + 1 < lines.len() {
1517 result.push('\n');
1518 }
1519 }
1520 } else {
1521 if line.starts_with('\x0C') {
1523 consecutive_empty = 0;
1524 result.push_str(line);
1525 } else {
1526 consecutive_empty = 0;
1527 result.push_str(line);
1528 }
1529 if i + 1 < lines.len() {
1530 result.push('\n');
1531 }
1532 }
1533 }
1534
1535 if !result.is_empty() && !result.ends_with('\n') {
1537 result.push('\n');
1538 }
1539
1540 result
1541}
1542
1543#[cfg(test)]
1544mod tests {
1545 use super::*;
1546
1547 fn span(text: &str, x: f64, y: f64, width: f64) -> TextSpan {
1548 TextSpan {
1549 text: text.into(),
1550 x,
1551 y,
1552 width,
1553 height: 12.0,
1554 font_size: 12.0,
1555 }
1556 }
1557
1558 fn block_texts(spans: Vec<TextSpan>) -> Vec<String> {
1559 group_spans_into_blocks(spans)
1560 .into_iter()
1561 .map(|block| block.text())
1562 .collect()
1563 }
1564
1565 #[test]
1566 fn empty_device_produces_empty_text() {
1567 let dev = TextExtractionDevice::new();
1568 assert!(dev.into_text().is_empty());
1569 }
1570
1571 #[test]
1572 fn single_column_stays_row_major() {
1573 let texts = block_texts(vec![
1574 span("Single Column Line 1", 40.0, 700.0, 140.0),
1575 span("Single Column Line 2", 40.0, 684.0, 140.0),
1576 span("Single Column Line 3", 40.0, 668.0, 140.0),
1577 ]);
1578
1579 assert_eq!(
1580 texts,
1581 vec![
1582 "Single Column Line 1",
1583 "Single Column Line 2",
1584 "Single Column Line 3",
1585 ]
1586 );
1587 }
1588
1589 #[test]
1590 fn two_column_region_reads_column_major() {
1591 let texts = block_texts(vec![
1592 span("Header", 200.0, 740.0, 80.0),
1593 span("Left column line one", 40.0, 700.0, 115.0),
1594 span("Right column line one", 320.0, 700.0, 120.0),
1595 span("Left column line two", 40.0, 684.0, 115.0),
1596 span("Right column line two", 320.0, 684.0, 120.0),
1597 span("Left column line three", 40.0, 668.0, 125.0),
1598 span("Right column line three", 320.0, 668.0, 130.0),
1599 span("Footer", 200.0, 620.0, 80.0),
1600 ]);
1601
1602 assert_eq!(
1603 texts,
1604 vec![
1605 "Header",
1606 "Left column line one",
1607 "Left column line two",
1608 "Left column line three",
1609 "Right column line one",
1610 "Right column line two",
1611 "Right column line three",
1612 "Footer",
1613 ]
1614 );
1615 }
1616
1617 #[test]
1618 fn mixed_single_and_multi_column_regions_preserve_shared_bands() {
1619 let texts = block_texts(vec![
1620 span("Intro paragraph", 40.0, 740.0, 180.0),
1621 span("L1 words here", 40.0, 700.0, 110.0),
1622 span("R1 words here", 320.0, 700.0, 110.0),
1623 span("L2 words here", 40.0, 684.0, 110.0),
1624 span("R2 words here", 320.0, 684.0, 110.0),
1625 span("L3 words here", 40.0, 668.0, 110.0),
1626 span("R3 words here", 320.0, 668.0, 110.0),
1627 span("Outro paragraph", 40.0, 620.0, 180.0),
1628 ]);
1629
1630 assert_eq!(
1631 texts,
1632 vec![
1633 "Intro paragraph",
1634 "L1 words here",
1635 "L2 words here",
1636 "L3 words here",
1637 "R1 words here",
1638 "R2 words here",
1639 "R3 words here",
1640 "Outro paragraph",
1641 ]
1642 );
1643 }
1644
1645 #[test]
1646 fn short_table_like_rows_fall_back_to_row_major() {
1647 let texts = block_texts(vec![
1648 span("Name", 40.0, 700.0, 30.0),
1649 span("Age", 320.0, 700.0, 20.0),
1650 span("Alice", 40.0, 684.0, 35.0),
1651 span("30", 320.0, 684.0, 15.0),
1652 span("Bob", 40.0, 668.0, 24.0),
1653 span("25", 320.0, 668.0, 15.0),
1654 ]);
1655
1656 assert_eq!(texts, vec!["Name Age", "Alice 30", "Bob 25"]);
1657 }
1658
1659 #[test]
1660 fn three_column_regions_are_supported() {
1661 let texts = block_texts(vec![
1662 span("Column one line one", 40.0, 700.0, 105.0),
1663 span("Column two line one", 220.0, 700.0, 105.0),
1664 span("Column three line one", 400.0, 700.0, 120.0),
1665 span("Column one line two", 40.0, 684.0, 105.0),
1666 span("Column two line two", 220.0, 684.0, 105.0),
1667 span("Column three line two", 400.0, 684.0, 120.0),
1668 span("Column one line three", 40.0, 668.0, 120.0),
1669 span("Column two line three", 220.0, 668.0, 120.0),
1670 span("Column three line three", 400.0, 668.0, 135.0),
1671 ]);
1672
1673 assert_eq!(
1674 texts,
1675 vec![
1676 "Column one line one",
1677 "Column one line two",
1678 "Column one line three",
1679 "Column two line one",
1680 "Column two line two",
1681 "Column two line three",
1682 "Column three line one",
1683 "Column three line two",
1684 "Column three line three",
1685 ]
1686 );
1687 }
1688
1689 #[test]
1690 fn text_block_concatenation_spaced() {
1691 let block = TextBlock {
1692 spans: vec![span("A", 0.0, 0.0, 6.0), span("B", 20.0, 0.0, 6.0)],
1693 };
1694 assert_eq!(block.text(), "A B");
1695 }
1696
1697 #[test]
1698 fn adaptive_column_gap_fallback_for_no_gaps() {
1699 let bands = vec![
1701 TextBand::new(span("Hello", 40.0, 700.0, 80.0)),
1702 TextBand::new(span("World", 40.0, 684.0, 80.0)),
1703 ];
1704 let threshold = compute_adaptive_column_gap(&bands);
1705 assert!((threshold - COLUMN_GAP_THRESHOLD_FALLBACK).abs() < 0.01);
1706 }
1707
1708 #[test]
1709 fn adaptive_column_gap_uses_median() {
1710 let mut bands = Vec::new();
1712 for y in [700.0, 684.0, 668.0] {
1713 let mut band = TextBand::new(span("word1", 40.0, y, 30.0));
1714 band.spans.push(span("word2", 74.0, y, 30.0)); band.spans.push(span("word3", 108.0, y, 30.0)); bands.push(band);
1717 }
1718 let threshold = compute_adaptive_column_gap(&bands);
1719 assert!(
1721 threshold >= 10.0 && threshold <= 14.0,
1722 "expected ~12, got {threshold}"
1723 );
1724 }
1725
1726 #[test]
1727 fn adaptive_column_gap_clamps_to_min() {
1728 let mut bands = Vec::new();
1730 for y in [700.0, 684.0, 668.0, 652.0] {
1731 let mut band = TextBand::new(span("abc", 0.0, y, 18.0));
1732 band.spans.push(span("def", 20.0, y, 18.0));
1734 bands.push(band);
1735 }
1736 let threshold = compute_adaptive_column_gap(&bands);
1737 assert!(
1738 (threshold - COLUMN_GAP_THRESHOLD_MIN).abs() < 0.01,
1739 "expected {COLUMN_GAP_THRESHOLD_MIN}, got {threshold}"
1740 );
1741 }
1742
1743 #[test]
1744 fn adaptive_column_gap_all_large_gaps_uses_fraction_of_min() {
1745 let mut band = TextBand::new(span("Left", 0.0, 700.0, 30.0));
1747 band.spans.push(span("Right", 80.0, 700.0, 30.0)); let bands = vec![band];
1749 let threshold = compute_adaptive_column_gap(&bands);
1750 assert!(
1751 (threshold - 37.5).abs() < 0.01,
1752 "expected 37.5 (0.75×50), got {threshold}"
1753 );
1754 }
1755
1756 #[test]
1757 fn normalize_trims_trailing_whitespace_per_line() {
1758 assert_eq!(
1759 normalize_text_output("hello \nworld \n"),
1760 "hello\nworld\n"
1761 );
1762 }
1763
1764 #[test]
1765 fn normalize_collapses_excess_newlines() {
1766 assert_eq!(
1768 normalize_text_output("hello\n\n\n\n\nworld\n"),
1769 "hello\n\n\nworld\n"
1770 );
1771 }
1772
1773 #[test]
1774 fn normalize_preserves_double_newline() {
1775 assert_eq!(
1776 normalize_text_output("paragraph one\n\nparagraph two\n"),
1777 "paragraph one\n\nparagraph two\n"
1778 );
1779 }
1780
1781 #[test]
1782 fn normalize_preserves_form_feed() {
1783 assert_eq!(
1784 normalize_text_output("page1\n\n\x0Cpage2\n"),
1785 "page1\n\n\x0Cpage2\n"
1786 );
1787 }
1788
1789 #[test]
1790 fn normalize_adds_trailing_newline() {
1791 assert_eq!(normalize_text_output("hello"), "hello\n");
1792 }
1793
1794 #[test]
1795 fn normalize_empty_input() {
1796 assert_eq!(normalize_text_output(""), "");
1797 }
1798
1799 #[test]
1800 fn normalize_only_whitespace() {
1801 assert_eq!(normalize_text_output(" \n \n"), "");
1802 }
1803
1804 #[test]
1807 fn hyphen_stitch_joins_wrapped_word() {
1808 let lines = vec!["the aver-".into(), "age rainfall".into()];
1809 assert_eq!(stitch_hyphenated_lines(&lines), "the average rainfall");
1810 }
1811
1812 #[test]
1813 fn hyphen_stitch_handles_leading_whitespace() {
1814 let lines = vec!["pre-".into(), " dict the outcome".into()];
1815 assert_eq!(stitch_hyphenated_lines(&lines), "predict the outcome");
1817 }
1818
1819 #[test]
1820 fn hyphen_stitch_capital_continuation_not_stitched() {
1821 let lines = vec!["Section three-".into(), "Summary here".into()];
1822 assert_eq!(
1823 stitch_hyphenated_lines(&lines),
1824 "Section three-\nSummary here"
1825 );
1826 }
1827
1828 #[test]
1829 fn hyphen_stitch_bullet_dash_not_stitched() {
1830 let lines = vec!["Items:".into(), "-".into(), "milk".into()];
1832 assert_eq!(stitch_hyphenated_lines(&lines), "Items:\n-\nmilk");
1833 }
1834
1835 #[test]
1836 fn hyphen_stitch_numeric_range_not_stitched() {
1837 let lines = vec!["page 42-".into(), "seventy".into()];
1839 assert_eq!(stitch_hyphenated_lines(&lines), "page 42-\nseventy");
1840 }
1841
1842 #[test]
1843 fn hyphen_stitch_short_prefix_not_stitched() {
1844 let lines = vec!["re-".into(), "organize".into()];
1846 assert_eq!(stitch_hyphenated_lines(&lines), "re-\norganize");
1847 }
1848
1849 #[test]
1850 fn hyphen_stitch_short_continuation_not_stitched() {
1851 let lines = vec!["counter-".into(), "an example".into()];
1853 assert_eq!(stitch_hyphenated_lines(&lines), "counter-\nan example");
1854 }
1855
1856 #[test]
1857 fn hyphen_stitch_compound_word_midline_preserved() {
1858 let lines = vec!["real-time system".into()];
1861 assert_eq!(stitch_hyphenated_lines(&lines), "real-time system");
1862 }
1863
1864 #[test]
1865 fn hyphen_stitch_single_line_unchanged() {
1866 let lines = vec!["only line".into()];
1867 assert_eq!(stitch_hyphenated_lines(&lines), "only line");
1868 }
1869
1870 #[test]
1871 fn hyphen_stitch_empty_input() {
1872 let lines: Vec<String> = vec![];
1873 assert_eq!(stitch_hyphenated_lines(&lines), "");
1874 }
1875
1876 fn make_device_with_median(median: f64) -> TextExtractionDevice {
1879 let mut dev = TextExtractionDevice::new();
1880 for _ in 0..MEDIAN_REFRESH {
1882 dev.glyph_widths.push(median);
1883 }
1884 dev.refresh_median_char_width();
1885 assert!((dev.cached_median_char_width - median).abs() < 1e-9);
1886 dev
1887 }
1888
1889 #[test]
1890 fn consensus_inserts_space_on_strong_tj_offset_alone() {
1891 let mut dev = make_device_with_median(6.0);
1894 dev.pending_tj_offset = 250.0; assert!(dev.evaluate_space_consensus(0.5, 12.0, "Hello", "World"));
1896 }
1897
1898 #[test]
1899 fn consensus_inserts_space_on_geometric_gap_alone() {
1900 let dev = make_device_with_median(6.0);
1902 assert!(dev.evaluate_space_consensus(2.5, 12.0, "hello", "world"));
1905 }
1906
1907 #[test]
1908 fn consensus_no_space_on_kerning_gap() {
1909 let dev = make_device_with_median(6.0);
1913 assert!(!dev.evaluate_space_consensus(0.5, 12.0, "fi", "lm"));
1914 }
1915
1916 #[test]
1917 fn consensus_inserts_space_on_camel_case_plus_gap() {
1918 let dev = make_device_with_median(6.0);
1922 assert!(dev.evaluate_space_consensus(2.5, 12.0, "helloWorld", "Inc"));
1923 }
1924
1925 #[test]
1926 fn consensus_inserts_space_on_digit_letter_transition_with_gap() {
1927 let dev = make_device_with_median(6.0);
1928 assert!(dev.evaluate_space_consensus(2.5, 12.0, "123", "abc"));
1929 }
1930
1931 #[test]
1932 fn consensus_heuristic_alone_is_insufficient() {
1933 let dev = make_device_with_median(6.0);
1938 assert!(!dev.evaluate_space_consensus(0.5, 12.0, "camel", "Case"));
1939 }
1940
1941 #[test]
1942 fn consensus_falls_back_to_font_size_when_no_median() {
1943 let dev = TextExtractionDevice::new();
1945 assert!(dev.evaluate_space_consensus(1.9, 12.0, "a", "b"));
1947 assert!(!dev.evaluate_space_consensus(1.5, 12.0, "a", "b"));
1949 }
1950
1951 #[test]
1952 fn consensus_ignores_tiny_tj_offsets() {
1953 let mut dev = make_device_with_median(6.0);
1955 dev.pending_tj_offset = 50.0;
1956 assert!(!dev.evaluate_space_consensus(0.5, 12.0, "Hello", "World"));
1957 }
1958
1959 #[test]
1960 fn consensus_accepts_negative_tj_offsets() {
1961 let mut dev = make_device_with_median(6.0);
1964 dev.pending_tj_offset = -250.0;
1965 assert!(dev.evaluate_space_consensus(0.5, 12.0, "Hello", "World"));
1966 }
1967
1968 #[test]
1969 fn text_adjustment_accumulates_until_glyph() {
1970 let mut dev = TextExtractionDevice::new();
1971 dev.text_adjustment(120.0);
1972 dev.text_adjustment(140.0);
1973 assert!((dev.pending_tj_offset - 260.0).abs() < 1e-6);
1974 }
1975
1976 #[test]
1979 fn xy_cut_header_body_footer_with_two_columns() {
1980 let texts = block_texts(vec![
1986 span("HEADLINE TITLE", 180.0, 760.0, 120.0),
1987 span("Left col line A", 40.0, 700.0, 110.0),
1988 span("Right col line A", 320.0, 700.0, 115.0),
1989 span("Left col line B", 40.0, 684.0, 110.0),
1990 span("Right col line B", 320.0, 684.0, 115.0),
1991 span("Left col line C", 40.0, 668.0, 110.0),
1992 span("Right col line C", 320.0, 668.0, 115.0),
1993 span("FOOTER LINE TEXT", 180.0, 600.0, 120.0),
1994 ]);
1995 assert_eq!(texts.first().map(String::as_str), Some("HEADLINE TITLE"));
1996 assert_eq!(texts.last().map(String::as_str), Some("FOOTER LINE TEXT"));
1997 let left_c_idx = texts.iter().position(|s| s == "Left col line C").unwrap();
1999 let right_a_idx = texts.iter().position(|s| s == "Right col line A").unwrap();
2000 assert!(
2001 left_c_idx < right_a_idx,
2002 "expected column-major ordering in body: {texts:?}"
2003 );
2004 }
2005
2006 #[test]
2007 fn xy_cut_rejects_column_split_on_table_rows() {
2008 let texts = block_texts(vec![
2011 span("Name", 40.0, 700.0, 30.0),
2012 span("Age", 320.0, 700.0, 20.0),
2013 span("Alice", 40.0, 684.0, 35.0),
2014 span("30", 320.0, 684.0, 15.0),
2015 ]);
2016 assert_eq!(texts, vec!["Name Age", "Alice 30"]);
2017 }
2018
2019 #[test]
2020 fn xy_cut_rejects_column_split_when_one_band_is_full_width() {
2021 let texts = block_texts(vec![
2025 span(
2026 "Full width intro spanning both columns here",
2027 40.0,
2028 740.0,
2029 360.0,
2030 ),
2031 span("Left A", 40.0, 700.0, 50.0),
2032 span("Right A", 320.0, 700.0, 50.0),
2033 span("Left B", 40.0, 684.0, 50.0),
2034 span("Right B", 320.0, 684.0, 50.0),
2035 ]);
2036 assert!(
2037 texts[0].contains("Full width intro"),
2038 "expected full-width intro first: {texts:?}"
2039 );
2040 }
2041
2042 #[test]
2043 fn xy_cut_horizontal_split_for_zone_boundaries() {
2044 let texts = block_texts(vec![
2048 span("First paragraph body text", 40.0, 740.0, 200.0),
2049 span("Second paragraph body", 40.0, 680.0, 180.0),
2050 ]);
2051 assert_eq!(texts.len(), 2);
2052 assert!(texts[0].starts_with("First"));
2053 assert!(texts[1].starts_with("Second"));
2054 }
2055
2056 #[test]
2057 fn xy_cut_recursion_terminates_with_single_span() {
2058 let texts = block_texts(vec![span("Only one span on the page", 40.0, 700.0, 180.0)]);
2059 assert_eq!(texts, vec!["Only one span on the page"]);
2060 }
2061
2062 #[test]
2063 fn median_font_size_handles_mixed_sizes() {
2064 let spans = vec![
2065 TextSpan {
2066 text: "small".into(),
2067 x: 0.0,
2068 y: 0.0,
2069 width: 10.0,
2070 height: 8.0,
2071 font_size: 8.0,
2072 },
2073 TextSpan {
2074 text: "medium".into(),
2075 x: 0.0,
2076 y: 0.0,
2077 width: 10.0,
2078 height: 12.0,
2079 font_size: 12.0,
2080 },
2081 TextSpan {
2082 text: "large".into(),
2083 x: 0.0,
2084 y: 0.0,
2085 width: 10.0,
2086 height: 24.0,
2087 font_size: 24.0,
2088 },
2089 ];
2090 assert!((median_font_size(&spans) - 12.0).abs() < 1e-9);
2091 }
2092
2093 #[test]
2094 fn columns_band_aligned_accepts_aligned_columns() {
2095 let spans = vec![
2096 span("L1", 40.0, 700.0, 60.0),
2097 span("R1", 300.0, 700.0, 60.0),
2098 span("L2", 40.0, 684.0, 60.0),
2099 span("R2", 300.0, 684.0, 60.0),
2100 ];
2101 let stats = PageStats::from_spans(&spans);
2102 assert!(columns_are_band_aligned(&spans, 200.0, 40.0, 360.0, &stats));
2104 }
2105
2106 #[test]
2107 fn columns_band_aligned_rejects_wide_single_side_band() {
2108 let spans = vec![
2109 span("Wide banner line across top", 40.0, 740.0, 280.0),
2110 span("L1", 40.0, 700.0, 60.0),
2111 span("R1", 300.0, 700.0, 60.0),
2112 ];
2113 let stats = PageStats::from_spans(&spans);
2114 assert!(!columns_are_band_aligned(
2117 &spans, 200.0, 40.0, 360.0, &stats
2118 ));
2119 }
2120
2121 #[test]
2122 fn page_stats_computes_median_values() {
2123 let spans = vec![
2124 span("one", 40.0, 700.0, 30.0),
2125 span("two", 40.0, 680.0, 30.0),
2126 span("three", 40.0, 660.0, 50.0),
2127 ];
2128 let stats = PageStats::from_spans(&spans);
2129 assert!((stats.median_font_size - 12.0).abs() < 1e-9);
2130 assert!((stats.median_char_width - 10.0).abs() < 1e-9);
2132 assert!((stats.median_line_spacing - 20.0).abs() < 1e-9);
2134 }
2135
2136 #[test]
2137 fn page_stats_handles_empty_input() {
2138 let stats = PageStats::from_spans(&[]);
2139 assert!((stats.median_font_size - 12.0).abs() < 1e-9);
2140 assert!((stats.median_char_width - 6.0).abs() < 1e-9);
2141 assert_eq!(stats.median_line_spacing, 0.0);
2142 }
2143
2144 #[test]
2145 fn narrow_gutter_detected_with_adaptive_threshold() {
2146 let mut spans = Vec::new();
2150 for y in [700.0, 684.0, 668.0] {
2151 spans.push(span("Lorem ipsum", 40.0, y, 100.0));
2153 spans.push(span("dolor sit", 144.0, y, 80.0));
2154 spans.push(span("amet consec", 236.0, y, 100.0));
2156 spans.push(span("tetur adipi", 340.0, y, 80.0));
2157 }
2158 let texts = block_texts(spans);
2159 assert!(
2161 texts.len() >= 6,
2162 "expected column-major output, got {texts:?}"
2163 );
2164 assert!(
2166 texts[0].contains("Lorem"),
2167 "first block should be left column: {texts:?}"
2168 );
2169 }
2170
2171 #[test]
2172 fn xy_cut_leaf_falls_back_to_legacy_columns_for_header_plus_three_columns() {
2173 let texts = block_texts(vec![
2174 span("73022", 45.0, 750.0, 70.0),
2175 span("Federal Register banner", 125.6, 750.0, 260.0),
2176 span("Left column line one", 45.0, 725.0, 140.0),
2177 span("Middle column line one", 222.0, 725.0, 140.0),
2178 span("Right column line one", 399.0, 725.0, 120.0),
2179 span("Left column line two", 45.0, 715.0, 140.0),
2180 span("Middle column line two", 210.0, 715.0, 152.0),
2181 span("Right column line two", 388.0, 715.0, 132.0),
2182 span("Left column line three", 45.0, 705.0, 140.0),
2183 span("Middle column line three", 235.0, 705.0, 135.0),
2184 span("Right column line three", 408.0, 705.0, 118.0),
2185 ]);
2186
2187 assert_eq!(
2188 texts,
2189 vec![
2190 "73022 Federal Register banner",
2191 "Left column line one",
2192 "Left column line two",
2193 "Left column line three",
2194 "Middle column line one",
2195 "Middle column line two",
2196 "Middle column line three",
2197 "Right column line one",
2198 "Right column line two",
2199 "Right column line three",
2200 ]
2201 );
2202 }
2203
2204 #[test]
2205 fn overlapping_fake_bold_spans_collapse_to_single_copy() {
2206 let texts = block_texts(vec![
2207 span("1 This is fakebold text.", 25.9, 785.3, 320.0),
2208 span("1 This is fakebold text.", 26.2, 785.3, 320.0),
2209 span("1 This is fakebold text.", 26.4, 785.3, 320.0),
2210 span("1 This is fakebold text.", 26.7, 785.3, 320.0),
2211 span("2 This is a fakebold", 27.0, 714.8, 142.0),
2212 span(" fakebold", 169.8, 714.8, 70.0),
2213 span(" fakebold", 170.1, 714.8, 70.0),
2214 span(" fakebold word.", 170.4, 714.8, 110.0),
2215 ]);
2216
2217 assert_eq!(
2218 texts,
2219 vec!["1 This is fakebold text.", "2 This is a fakebold word.",]
2220 );
2221 }
2222}