1use kurbo::{Affine, BezPath};
4use pdf_render::pdf_interpret::cmap::BfString;
5use pdf_render::pdf_interpret::font::Glyph;
6use pdf_render::pdf_interpret::{
7 BlendMode, ClipPath, Device, GlyphDrawMode, Image, Paint, PathDrawMode, SoftMask,
8};
9use std::cmp::Ordering;
10
/// Absolute floor for vertical tolerances: band grouping never uses a
/// tolerance below this, and the same-line check in `draw_glyph` uses it as
/// the minimum reference size.
const BAND_Y_TOLERANCE: f64 = 5.0;
/// Fraction of a font size (or span height) used as the vertical tolerance
/// when deciding whether two spans belong to the same band.
const BAND_Y_FRACTION: f64 = 0.30;
/// A vertical gap of at least this multiple of the page's typical line
/// spacing qualifies as a horizontal cut in `try_horizontal_cut`.
const PARAGRAPH_BREAK_LINE_SPACING_MULTIPLIER: f64 = 1.8;
24
/// Magnitude of the accumulated text adjustment at or above which the
/// adjustment counts as evidence of a word space.
/// NOTE(review): presumably in PDF `TJ` units (thousandths of text space) — confirm.
const TJ_SPACE_THRESHOLD_UNITS: f32 = 100.0;
/// Confidence added when the text-adjustment signal fires.
const TJ_SIGNAL_WEIGHT: f64 = 0.95;
/// Confidence added when the geometric gap exceeds its reference width.
const GAP_SIGNAL_WEIGHT: f64 = 0.80;
/// Confidence added by the character-class heuristic (lower→upper case,
/// digit↔letter transitions). Below the threshold on its own.
const HEURISTIC_SIGNAL_WEIGHT: f64 = 0.60;
/// Total confidence required before a space is inserted between glyphs.
const SPACE_CONSENSUS_THRESHOLD: f64 = 0.75;
/// Gap reference as a fraction of the cached median glyph width.
const GAP_TO_MEDIAN_CHAR_FRACTION: f64 = 0.30;
/// Gap reference as a fraction of the font size, used while no median glyph
/// width is available yet.
const GAP_TO_FONT_SIZE_FALLBACK_FRACTION: f64 = 0.15;
53
/// Lower clamp for the adaptive column-gap threshold.
const COLUMN_GAP_THRESHOLD_MIN: f64 = 10.0;
/// Upper clamp for the adaptive column-gap threshold.
const COLUMN_GAP_THRESHOLD_MAX: f64 = 40.0;
/// Multiplier applied to the median gap when no clear break between gap
/// sizes is found.
const COLUMN_GAP_MEDIAN_MULTIPLIER: f64 = 3.0;
/// Threshold returned when no positive inter-span gap exists at all.
const COLUMN_GAP_THRESHOLD_FALLBACK: f64 = 20.0;
/// Minimum tolerance when matching a band's gap midpoints against the
/// running column boundaries.
const COLUMN_GAP_MATCH_TOLERANCE: f64 = 12.0;
/// Minimum number of bands with matching gaps for a region to be columnar.
const MIN_COLUMN_GAPPED_BANDS: usize = 3;
/// Minimum share of a region's bands that must carry the column gaps.
const MIN_COLUMN_GAP_SUPPORT: f64 = 0.80;
/// Minimum share of non-empty column slices that must be "dense"
/// (two or more spans, or at least eight characters).
const MIN_DENSE_SLICE_RATIO: f64 = 0.35;
70
/// Records how a span's width was obtained.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum WidthSource {
    /// Width derived from the glyph's advance width.
    Metric,
    /// Width guessed from the font size. This is the default, and a span is
    /// downgraded to it as soon as one estimated glyph is merged into it.
    #[default]
    Estimate,
}
80
/// A run of text that shares one line, style and colour.
#[derive(Debug, Clone, Default)]
pub struct TextSpan {
    /// Decoded text of the run.
    pub text: String,
    /// X origin, taken from the composed glyph transform of the first glyph.
    pub x: f64,
    /// Y origin, taken from the composed glyph transform of the first glyph.
    pub y: f64,
    /// Horizontal extent from `x` to the end of the furthest glyph merged so far.
    pub width: f64,
    /// Largest font size among the merged glyphs.
    pub height: f64,
    /// Font size of the glyph that started the span.
    pub font_size: f64,

    /// PostScript font name with any subset prefix removed, if one is known.
    pub font_name: Option<String>,
    /// Whether the font weight or name indicates bold.
    pub is_bold: bool,
    /// Whether the font data or name indicates italic.
    pub is_italic: bool,
    /// RGBA fill colour for solid paints; `None` for pattern paints.
    pub color: Option<[u8; 4]>,

    /// Whether `width` is backed by font metrics or by an estimate.
    pub width_source: WidthSource,
    /// One `[x, y, end_x, y + font_size]` box per glyph merged into the span.
    pub char_bounds: Vec<[f64; 4]>,
}
123
124impl TextSpan {
125 fn right(&self) -> f64 {
128 self.x + self.width.max(self.estimated_width())
129 }
130
131 fn measured_right(&self) -> f64 {
133 self.x + self.width
134 }
135
136 fn estimated_width(&self) -> f64 {
137 let char_count = self.text.chars().count() as f64;
138 if char_count <= 0.0 {
139 self.font_size * 0.5
140 } else {
141 self.font_size * 0.5 * char_count
142 }
143 }
144}
145
/// An ordered group of spans that is read out as one piece of text.
#[derive(Debug, Clone)]
pub struct TextBlock {
    /// Spans in the order they are joined by [`TextBlock::text`].
    pub spans: Vec<TextSpan>,
}
152
153impl TextBlock {
154 pub fn text(&self) -> String {
160 if self.spans.is_empty() {
161 return String::new();
162 }
163 let mut result = self.spans[0].text.clone();
164 for pair in self.spans.windows(2) {
165 let prev = &pair[0];
166 let curr = &pair[1];
167 let expected_end = prev.measured_right();
168 let gap = curr.x - expected_end;
169 if gap <= prev.font_size * 0.12 {
170 if let Some(trimmed) = trim_overlapping_word_prefix(&prev.text, &curr.text) {
171 result.push_str(&trimmed);
172 continue;
173 }
174 }
175 if gap > prev.font_size * 0.25 {
176 result.push(' ');
177 }
178 result.push_str(&curr.text);
179 }
180 result
181 }
182}
183
/// A horizontal band: spans whose `y` values fall within a shared tolerance.
#[derive(Debug, Clone)]
struct TextBand {
    /// Representative `y` of the band (a running mean while the band is built).
    y: f64,
    /// Spans assigned to this band.
    spans: Vec<TextSpan>,
}
189
190impl TextBand {
191 fn new(span: TextSpan) -> Self {
192 Self {
193 y: span.y,
194 spans: vec![span],
195 }
196 }
197
198 fn sort_spans(&mut self) {
199 self.spans.sort_by(|a, b| {
200 a.x.partial_cmp(&b.x)
201 .unwrap_or(Ordering::Equal)
202 .then_with(|| b.y.partial_cmp(&a.y).unwrap_or(Ordering::Equal))
203 });
204 collapse_overprinted_spans(&mut self.spans);
205 }
206
207 fn row_block(&self) -> TextBlock {
208 let mut spans = self.spans.clone();
209 spans.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
210 TextBlock { spans }
211 }
212
213 fn left(&self) -> f64 {
214 self.spans
215 .iter()
216 .map(|span| span.x)
217 .fold(f64::INFINITY, f64::min)
218 }
219
220 fn right(&self) -> f64 {
221 self.spans
222 .iter()
223 .map(TextSpan::right)
224 .fold(f64::NEG_INFINITY, f64::max)
225 }
226
227 fn width(&self) -> f64 {
228 (self.right() - self.left()).max(0.0)
229 }
230
231 fn gap_midpoints(&self, column_gap_threshold: f64) -> Vec<f64> {
232 self.gaps(column_gap_threshold)
233 .into_iter()
234 .map(|gap| (gap.start + gap.end) * 0.5)
235 .collect()
236 }
237
238 fn gaps(&self, column_gap_threshold: f64) -> Vec<BandGap> {
239 if self.spans.len() < 2 {
240 return Vec::new();
241 }
242
243 let mut spans = self.spans.clone();
244 spans.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
245
246 let mut gaps = Vec::new();
247 let mut prev_right = spans[0].right();
248 for span in spans.iter().skip(1) {
249 let gap = span.x - prev_right;
250 if gap >= column_gap_threshold {
251 gaps.push(BandGap {
252 start: prev_right,
253 end: span.x,
254 });
255 }
256 prev_right = prev_right.max(span.right());
257 }
258
259 gaps
260 }
261
262 fn split_by_boundaries(&self, boundaries: &[f64]) -> Vec<Vec<TextSpan>> {
263 let mut columns = vec![Vec::new(); boundaries.len() + 1];
264 for span in &self.spans {
265 let center_x = span.x + span.width.max(span.estimated_width()) * 0.5;
266 let column_idx = boundaries
267 .iter()
268 .position(|boundary| center_x < *boundary)
269 .unwrap_or(boundaries.len());
270 columns[column_idx].push(span.clone());
271 }
272
273 for spans in &mut columns {
274 spans.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
275 }
276
277 columns
278 }
279
280 fn fits_single_column(
281 &self,
282 boundaries: &[f64],
283 region_left: f64,
284 region_right: f64,
285 ) -> Option<usize> {
286 let mut column_idx: Option<usize> = None;
287 for span in &self.spans {
288 let left = span.x;
289 let right = span.right();
290 if boundaries
291 .iter()
292 .any(|boundary| left < *boundary && right > *boundary)
293 {
294 return None;
295 }
296
297 let center_x = left + (right - left) * 0.5;
298 let idx = boundaries
299 .iter()
300 .position(|boundary| center_x < *boundary)
301 .unwrap_or(boundaries.len());
302 match column_idx {
303 Some(existing) if existing != idx => return None,
304 Some(_) => {}
305 None => column_idx = Some(idx),
306 }
307 }
308 let idx = column_idx?;
309 let mut edges = Vec::with_capacity(boundaries.len() + 2);
310 edges.push(region_left);
311 edges.extend_from_slice(boundaries);
312 edges.push(region_right);
313
314 let column_width = (edges[idx + 1] - edges[idx]).max(0.0);
315 if column_width <= 0.0 || self.width() > column_width * 0.8 {
316 return None;
317 }
318
319 Some(idx)
320 }
321}
322
/// A horizontal gap inside a band.
#[derive(Debug, Clone, Copy)]
struct BandGap {
    /// Right edge of the content to the left of the gap.
    start: f64,
    /// Left edge (`x`) of the span that closes the gap.
    end: f64,
}
328
/// Render device that records text instead of drawing it.
pub(crate) struct TextExtractionDevice {
    /// Spans collected so far, in drawing order.
    spans: Vec<TextSpan>,
    /// `y` of the most recently drawn glyph.
    last_y: f64,
    /// End `x` (origin plus width) of the most recently drawn glyph.
    last_end_x: f64,
    /// Sum of text adjustments received since the last glyph; reset to zero
    /// after every glyph.
    pending_tj_offset: f32,
    /// Sample of glyph widths used for the median; growth is capped in `draw_glyph`.
    glyph_widths: Vec<f64>,
    /// Median of `glyph_widths` as of the last refresh; `0.0` until the first refresh.
    cached_median_char_width: f64,
}
367
/// The cached median glyph width is recomputed each time the number of
/// recorded glyph widths reaches a multiple of this value.
const MEDIAN_REFRESH: usize = 32;
369
impl Default for TextExtractionDevice {
    /// Equivalent to [`TextExtractionDevice::new`].
    fn default() -> Self {
        Self::new()
    }
}
375
376impl TextExtractionDevice {
377 pub fn new() -> Self {
379 Self {
380 spans: Vec::new(),
381 last_y: f64::NEG_INFINITY,
382 last_end_x: f64::NEG_INFINITY,
383 pending_tj_offset: 0.0,
384 glyph_widths: Vec::new(),
385 cached_median_char_width: 0.0,
386 }
387 }
388
389 fn refresh_median_char_width(&mut self) {
392 if self.glyph_widths.is_empty() {
393 self.cached_median_char_width = 0.0;
394 return;
395 }
396 let mut sorted = self.glyph_widths.clone();
397 sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
398 self.cached_median_char_width = sorted[sorted.len() / 2];
399 }
400
401 fn evaluate_space_consensus(
404 &self,
405 gap: f64,
406 font_size: f64,
407 prev_text: &str,
408 next_text: &str,
409 ) -> bool {
410 let mut confidence = 0.0;
411
412 if self.pending_tj_offset.abs() >= TJ_SPACE_THRESHOLD_UNITS {
415 confidence += TJ_SIGNAL_WEIGHT;
416 }
417
418 let gap_reference = if self.cached_median_char_width > 0.0 {
422 self.cached_median_char_width * GAP_TO_MEDIAN_CHAR_FRACTION
423 } else {
424 font_size * GAP_TO_FONT_SIZE_FALLBACK_FRACTION
425 };
426 if gap > gap_reference {
427 confidence += GAP_SIGNAL_WEIGHT;
428 }
429
430 if let (Some(prev_last), Some(next_first)) =
434 (prev_text.chars().last(), next_text.chars().next())
435 {
436 let camel = prev_last.is_lowercase() && next_first.is_uppercase();
437 let digit_to_letter = prev_last.is_ascii_digit() && next_first.is_alphabetic();
438 let letter_to_digit = prev_last.is_alphabetic() && next_first.is_ascii_digit();
439 if camel || digit_to_letter || letter_to_digit {
440 confidence += HEURISTIC_SIGNAL_WEIGHT;
441 }
442 }
443
444 confidence >= SPACE_CONSENSUS_THRESHOLD
445 }
446
447 pub fn into_text(self) -> String {
449 let blocks = group_spans_into_blocks(self.spans);
450 let lines: Vec<String> = blocks.iter().map(|b| b.text()).collect();
451 let stitched = stitch_hyphenated_lines(&lines);
452 normalize_text_output(&stitched)
453 }
454
455 pub fn into_blocks(self) -> Vec<TextBlock> {
457 group_spans_into_blocks(self.spans)
458 }
459
460 #[allow(dead_code)]
462 pub(crate) fn into_spans(self) -> Vec<TextSpan> {
463 self.spans
464 }
465}
466
467impl Device<'_> for TextExtractionDevice {
468 fn set_soft_mask(&mut self, _: Option<SoftMask<'_>>) {}
469 fn set_blend_mode(&mut self, _: BlendMode) {}
470 fn draw_path(&mut self, _: &BezPath, _: Affine, _: &Paint<'_>, _: &PathDrawMode) {}
471 fn push_clip_path(&mut self, _: &ClipPath) {}
472 fn push_transparency_group(&mut self, _: f32, _: Option<SoftMask<'_>>, _: BlendMode) {}
473 fn draw_image(&mut self, _: Image<'_, '_>, _: Affine) {}
474 fn pop_clip_path(&mut self) {}
475 fn pop_transparency_group(&mut self) {}
476
477 fn draw_glyph(
478 &mut self,
479 glyph: &Glyph<'_>,
480 transform: Affine,
481 glyph_transform: Affine,
482 paint: &Paint<'_>,
483 _draw_mode: &GlyphDrawMode,
484 ) {
485 let text = match glyph.as_unicode() {
486 Some(BfString::Char(c)) => c.to_string(),
487 Some(BfString::String(s)) => s,
488 None => return,
489 };
490
491 let composed = transform * glyph_transform;
492 let coeffs = composed.as_coeffs();
493 let x = coeffs[4];
494 let y = coeffs[5];
495 let glyph_scale = (coeffs[0].powi(2) + coeffs[1].powi(2)).sqrt().abs();
496 let font_size = glyph_scale * 1000.0;
497
498 let (glyph_width, glyph_ws) = glyph_width_and_source(glyph, font_size);
500 let glyph_end_x = x + glyph_width;
501 let glyph_bound = [x, y, glyph_end_x, y + font_size];
502
503 let style = derive_glyph_style(glyph);
504 let color = paint_to_rgba(paint);
505
506 if self.glyph_widths.len() < 4096 {
510 self.glyph_widths.push(glyph_width);
511 if self.glyph_widths.len().is_multiple_of(MEDIAN_REFRESH) {
512 self.refresh_median_char_width();
513 }
514 }
515
516 let same_line = (y - self.last_y).abs() <= font_size.max(BAND_Y_TOLERANCE) * 0.35;
517 let gap = x - self.last_end_x;
518 let adjacent = same_line && gap >= -font_size * 0.25 && gap < font_size * 0.5;
519
520 let style_matches = self
524 .spans
525 .last()
526 .map(|last| {
527 last.font_name == style.font_name
528 && last.is_bold == style.is_bold
529 && last.is_italic == style.is_italic
530 && last.color == color
531 })
532 .unwrap_or(false);
533
534 if adjacent && !self.spans.is_empty() && style_matches {
535 let want_space = {
543 let last = self.spans.last().expect("checked non-empty");
544 !last.text.ends_with(' ')
545 && !text.starts_with(' ')
546 && self.evaluate_space_consensus(gap, font_size, &last.text, &text)
547 };
548 let last = self.spans.last_mut().expect("checked non-empty");
549 if want_space {
550 last.text.push(' ');
551 }
552 last.text.push_str(&text);
553 last.width = last.width.max(glyph_end_x - last.x);
554 last.height = last.height.max(font_size);
555 last.char_bounds.push(glyph_bound);
557 if glyph_ws == WidthSource::Estimate {
558 last.width_source = WidthSource::Estimate;
559 }
560 self.last_y = y;
561 self.last_end_x = glyph_end_x;
562 self.pending_tj_offset = 0.0;
565 return;
566 }
567
568 self.last_y = y;
569 self.last_end_x = glyph_end_x;
570 self.pending_tj_offset = 0.0;
574
575 self.spans.push(TextSpan {
576 text,
577 x,
578 y,
579 width: glyph_width,
580 height: font_size,
581 font_size,
582 font_name: style.font_name,
583 is_bold: style.is_bold,
584 is_italic: style.is_italic,
585 color,
586 width_source: glyph_ws,
587 char_bounds: vec![glyph_bound],
588 });
589 }
590
591 fn text_adjustment(&mut self, amount: f32) {
596 self.pending_tj_offset += amount;
597 }
598}
599
/// Style attributes derived for a single glyph.
#[derive(Debug, Default, Clone)]
struct GlyphStyle {
    /// Font name with the subset prefix removed; `None` when the name is empty.
    font_name: Option<String>,
    /// Bold according to the font weight or a keyword in the name.
    is_bold: bool,
    /// Italic according to the font data or a keyword in the name.
    is_italic: bool,
}
607
/// Removes a font-subset tag (`ABCDEF+`) from the front of a font name.
///
/// A subset tag is exactly six uppercase ASCII letters followed by `+`.
/// Anything else before a `+` is treated as part of the real name and left
/// alone, so a name such as `Nimbus+Sans` is returned unchanged.
fn strip_subset_prefix(name: &str) -> &str {
    match name.split_once('+') {
        Some((tag, rest)) if tag.len() == 6 && tag.bytes().all(|b| b.is_ascii_uppercase()) => {
            rest
        }
        _ => name,
    }
}
615
/// Looks for style keywords in a font name, ignoring ASCII case.
///
/// Returns `(bold, italic)`.
fn name_style_hints(name: &str) -> (bool, bool) {
    const BOLD_MARKERS: [&str; 5] = ["bold", "demi", "semibold", "heavy", "black"];
    const ITALIC_MARKERS: [&str; 3] = ["italic", "oblique", "slant"];

    let lowered = name.to_ascii_lowercase();
    let mentions_any = |markers: &[&str]| markers.iter().any(|marker| lowered.contains(*marker));

    (mentions_any(&BOLD_MARKERS), mentions_any(&ITALIC_MARKERS))
}
629
630fn derive_glyph_style(glyph: &Glyph<'_>) -> GlyphStyle {
631 match glyph {
632 Glyph::Outline(outline) => {
633 if let Some(data) = outline.font_data() {
634 let raw = data.postscript_name.as_deref().unwrap_or("");
635 let name = strip_subset_prefix(raw).to_string();
636 let weight_bold = data.weight.is_some_and(|w| w >= 700);
637 let (name_bold, name_italic) = name_style_hints(&name);
638 GlyphStyle {
639 font_name: if name.is_empty() { None } else { Some(name) },
640 is_bold: weight_bold || name_bold,
641 is_italic: data.is_italic || name_italic,
642 }
643 } else {
644 let raw = outline.postscript_name().unwrap_or_default();
648 let name = strip_subset_prefix(&raw).to_string();
649 let (name_bold, name_italic) = name_style_hints(&name);
650 GlyphStyle {
651 font_name: if name.is_empty() { None } else { Some(name) },
652 is_bold: name_bold,
653 is_italic: name_italic,
654 }
655 }
656 }
657 Glyph::Type3(_) => GlyphStyle::default(),
658 }
659}
660
661fn paint_to_rgba(paint: &Paint<'_>) -> Option<[u8; 4]> {
662 match paint {
663 Paint::Color(c) => Some(c.to_rgba().to_rgba8()),
664 Paint::Pattern(_) => None,
665 }
666}
667
668fn glyph_width_and_source(glyph: &Glyph<'_>, font_size: f64) -> (f64, WidthSource) {
675 match glyph {
676 Glyph::Outline(outline) => {
677 if let Some(w) = outline.advance_width() {
678 let advance = (w as f64 / 1000.0 * font_size).max(font_size * 0.25);
679 (advance, WidthSource::Metric)
680 } else {
681 (font_size * 0.5, WidthSource::Estimate)
682 }
683 }
684 Glyph::Type3(_) => (font_size * 0.5, WidthSource::Estimate),
685 }
686}
687
688fn collapse_overprinted_spans(spans: &mut Vec<TextSpan>) {
694 if spans.len() < 2 {
695 return;
696 }
697
698 let mut deduped: Vec<TextSpan> = Vec::with_capacity(spans.len());
699 for span in spans.drain(..) {
700 if let Some(last) = deduped.last_mut() {
701 if spans_are_overprint_duplicates(last, &span) {
702 let choose_incoming = span.text.chars().count() > last.text.chars().count()
703 || (span.text.chars().count() == last.text.chars().count()
704 && span.width > last.width);
705 let preferred_text = if choose_incoming {
706 span.text.clone()
707 } else {
708 last.text.clone()
709 };
710 let left = last.x.min(span.x);
711 let right = last.right().max(span.right());
712 last.x = left;
713 last.y = (last.y + span.y) * 0.5;
714 last.width = (right - left).max(last.width).max(span.width);
715 last.height = last.height.max(span.height);
716 last.font_size = last.font_size.max(span.font_size);
717 last.text = preferred_text;
718 continue;
719 }
720 }
721
722 deduped.push(span);
723 }
724
725 *spans = deduped;
726}
727
728fn spans_are_overprint_duplicates(lhs: &TextSpan, rhs: &TextSpan) -> bool {
729 let lhs_text = lhs.text.trim();
730 let rhs_text = rhs.text.trim();
731 if lhs_text.is_empty() || rhs_text.is_empty() {
732 return false;
733 }
734
735 let same_baseline = (lhs.y - rhs.y).abs() <= lhs.font_size.max(rhs.font_size) * 0.12;
736 if !same_baseline {
737 return false;
738 }
739
740 let lhs_left = lhs.x;
741 let lhs_right = lhs.right();
742 let rhs_left = rhs.x;
743 let rhs_right = rhs.right();
744 let overlap = (lhs_right.min(rhs_right) - lhs_left.max(rhs_left)).max(0.0);
745 let min_width = (lhs_right - lhs_left).min(rhs_right - rhs_left).max(1.0);
746 let heavily_overlaps = overlap / min_width >= 0.85;
747 if !heavily_overlaps {
748 return false;
749 }
750
751 lhs_text == rhs_text || lhs_text.starts_with(rhs_text) || rhs_text.starts_with(lhs_text)
752}
753
/// Detects a whole word repeated at the seam between two strings.
///
/// Looks for the longest run of at least four alphanumeric characters that
/// ends `prev` (ignoring trailing whitespace) and starts `curr` (ignoring
/// leading whitespace), and that is a complete word on both sides. Returns
/// what remains of `curr` after that word, or `None` if there is no such run.
fn trim_overlapping_word_prefix(prev: &str, curr: &str) -> Option<String> {
    let tail: Vec<char> = prev.trim_end().chars().collect();
    let head: Vec<char> = curr.trim_start().chars().collect();
    let longest = tail.len().min(head.len());

    // Longest candidate first; the range is empty when fewer than four
    // characters are available.
    (4..=longest).rev().find_map(|len| {
        let (before, suffix) = tail.split_at(tail.len() - len);
        let (prefix, after) = head.split_at(len);

        let same_word = suffix == prefix && prefix.iter().all(|ch| ch.is_alphanumeric());
        let starts_at_boundary = before.last().map_or(true, |ch| !ch.is_alphanumeric());
        let ends_at_boundary = after.first().map_or(true, |ch| !ch.is_alphanumeric());

        (same_word && starts_at_boundary && ends_at_boundary)
            .then(|| after.iter().collect::<String>())
    })
}
780
781fn compute_adaptive_column_gap(bands: &[TextBand]) -> f64 {
788 let mut all_gaps: Vec<f64> = Vec::new();
789
790 for band in bands {
791 if band.spans.len() < 2 {
792 continue;
793 }
794 let mut sorted = band.spans.clone();
795 sorted.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
796 let mut prev_right = sorted[0].right();
797 for span in sorted.iter().skip(1) {
798 let gap = span.x - prev_right;
799 if gap > 0.0 {
800 all_gaps.push(gap);
801 }
802 prev_right = prev_right.max(span.right());
803 }
804 }
805
806 if all_gaps.is_empty() {
807 return COLUMN_GAP_THRESHOLD_FALLBACK;
808 }
809
810 all_gaps.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
811
812 let min_gap = all_gaps[0];
813
814 if min_gap > COLUMN_GAP_THRESHOLD_MIN {
819 return (min_gap * 0.75).clamp(COLUMN_GAP_THRESHOLD_MIN, COLUMN_GAP_THRESHOLD_MAX);
820 }
821
822 let mut best_break_threshold = 0.0f64;
825 let mut best_ratio = 1.5f64; for pair in all_gaps.windows(2) {
827 if pair[0] > 0.5 {
828 let ratio = pair[1] / pair[0];
829 if ratio > best_ratio {
830 best_ratio = ratio;
831 best_break_threshold = (pair[0] + pair[1]) * 0.5;
832 }
833 }
834 }
835
836 if best_break_threshold > 0.0 {
837 return best_break_threshold.clamp(COLUMN_GAP_THRESHOLD_MIN, COLUMN_GAP_THRESHOLD_MAX);
838 }
839
840 let mid = all_gaps.len() / 2;
842 let median = if all_gaps.len().is_multiple_of(2) {
843 (all_gaps[mid - 1] + all_gaps[mid]) * 0.5
844 } else {
845 all_gaps[mid]
846 };
847
848 (median * COLUMN_GAP_MEDIAN_MULTIPLIER)
849 .clamp(COLUMN_GAP_THRESHOLD_MIN, COLUMN_GAP_THRESHOLD_MAX)
850}
851
/// Summary statistics over a set of spans, computed once and shared by the
/// layout passes.
#[derive(Debug, Clone, Copy)]
struct PageStats {
    /// Middle value (upper median) of the spans' font sizes.
    median_font_size: f64,
    /// Middle value of per-span `width / character count`.
    #[allow(dead_code)]
    median_char_width: f64,
    /// Typical distance between adjacent bands. Despite the name, it is
    /// computed as the first-quartile spacing; `0.0` when fewer than two
    /// bands exist.
    median_line_spacing: f64,
}
876
877impl PageStats {
878 fn from_spans(spans: &[TextSpan]) -> Self {
879 if spans.is_empty() {
880 return Self {
881 median_font_size: 12.0,
882 median_char_width: 6.0,
883 median_line_spacing: 0.0,
884 };
885 }
886
887 let mut sizes: Vec<f64> = spans.iter().map(|s| s.font_size).collect();
889 sizes.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
890 let median_font_size = sizes[sizes.len() / 2];
891
892 let mut char_widths: Vec<f64> = spans
894 .iter()
895 .filter_map(|s| {
896 let chars = s.text.chars().count();
897 if chars > 0 && s.width > 0.0 {
898 Some(s.width / chars as f64)
899 } else {
900 None
901 }
902 })
903 .collect();
904 let median_char_width = if char_widths.is_empty() {
905 median_font_size * 0.5
906 } else {
907 char_widths.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
908 char_widths[char_widths.len() / 2]
909 };
910
911 let band_tolerance = (median_font_size * BAND_Y_FRACTION).max(BAND_Y_TOLERANCE);
914 let mut ys: Vec<f64> = spans.iter().map(|s| s.y).collect();
915 ys.sort_by(|a, b| b.partial_cmp(a).unwrap_or(Ordering::Equal));
916 let mut band_ys: Vec<f64> = Vec::new();
917 for y in ys {
918 if band_ys
919 .last()
920 .map(|prev: &f64| (prev - y).abs() > band_tolerance)
921 .unwrap_or(true)
922 {
923 band_ys.push(y);
924 }
925 }
926 let median_line_spacing = if band_ys.len() < 2 {
935 0.0
936 } else {
937 let mut spacings: Vec<f64> = band_ys
938 .windows(2)
939 .map(|pair| (pair[0] - pair[1]).abs())
940 .collect();
941 spacings.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
942 let q1_index = spacings.len() / 4;
943 spacings[q1_index]
944 };
945
946 Self {
947 median_font_size,
948 median_char_width,
949 median_line_spacing,
950 }
951 }
952}
953
/// Recursion depth at which the XY cut stops splitting and falls back to
/// band-based grouping.
const XY_CUT_MAX_DEPTH: usize = 12;
/// Fraction of the region width that feeds into the minimum vertical-cut gap.
const XY_CUT_VERTICAL_GAP_REGION_FRACTION: f64 = 0.04;
/// Absolute lower bound for the minimum vertical-cut gap.
const XY_CUT_VERTICAL_GAP_FLOOR: f64 = 10.0;
/// Multiple of the median font size used as the minimum horizontal-cut gap
/// when no line spacing is known.
const XY_CUT_HORIZONTAL_GAP_FONT_MULTIPLIER: f64 = 1.8;
/// Minimum number of spans each side of a vertical cut must contain.
const XY_CUT_MIN_SPANS_PER_COLUMN: usize = 2;
/// Minimum average number of characters per band on each side of a vertical cut.
const XY_CUT_MIN_CHARS_PER_BAND: f64 = 8.0;
976
977fn group_spans_into_blocks(spans: Vec<TextSpan>) -> Vec<TextBlock> {
981 if spans.is_empty() {
982 return Vec::new();
983 }
984 let stats = PageStats::from_spans(&spans);
985 xy_cut_recursive(spans, 0, &stats)
986}
987
988fn xy_cut_recursive(spans: Vec<TextSpan>, depth: usize, stats: &PageStats) -> Vec<TextBlock> {
989 if spans.is_empty() {
990 return Vec::new();
991 }
992 if depth >= XY_CUT_MAX_DEPTH {
993 return band_based_blocks(spans, stats);
994 }
995
996 let vcut = try_vertical_cut(&spans, stats);
1003 let hcut = try_horizontal_cut(&spans, stats);
1004
1005 let (chosen, _) = match (vcut, hcut) {
1006 (Some((v_groups, v_gap)), Some((h_groups, h_gap))) => {
1007 if v_gap >= h_gap {
1008 (Some(v_groups), v_gap)
1009 } else {
1010 (Some(h_groups), h_gap)
1011 }
1012 }
1013 (Some((v_groups, v_gap)), None) => (Some(v_groups), v_gap),
1014 (None, Some((h_groups, h_gap))) => (Some(h_groups), h_gap),
1015 (None, None) => (None, 0.0),
1016 };
1017
1018 if let Some(groups) = chosen {
1019 let mut out = Vec::new();
1020 for group in groups {
1021 out.extend(xy_cut_recursive(group, depth + 1, stats));
1022 }
1023 return out;
1024 }
1025
1026 band_based_blocks(spans, stats)
1027}
1028
/// Leaf step of the XY cut: groups the spans of a region that is not split
/// any further by delegating to the band-based (legacy) grouping, reusing
/// the caller's statistics.
fn band_based_blocks(spans: Vec<TextSpan>, stats: &PageStats) -> Vec<TextBlock> {
    group_spans_into_blocks_legacy_with_stats(spans, stats)
}
1039
1040#[allow(dead_code)]
1043fn median_font_size(spans: &[TextSpan]) -> f64 {
1044 if spans.is_empty() {
1045 return 12.0;
1046 }
1047 let mut sizes: Vec<f64> = spans.iter().map(|s| s.font_size).collect();
1048 sizes.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
1049 sizes[sizes.len() / 2]
1050}
1051
1052fn try_vertical_cut(spans: &[TextSpan], stats: &PageStats) -> Option<(Vec<Vec<TextSpan>>, f64)> {
1069 if spans.len() < 2 * XY_CUT_MIN_SPANS_PER_COLUMN {
1070 return None;
1071 }
1072
1073 let region_left = spans.iter().map(|s| s.x).fold(f64::INFINITY, f64::min);
1074 let region_right = spans
1075 .iter()
1076 .map(TextSpan::right)
1077 .fold(f64::NEG_INFINITY, f64::max);
1078 let region_width = region_right - region_left;
1079 if region_width <= 0.0 {
1080 return None;
1081 }
1082
1083 let bands = group_spans_into_bands_with_stats(spans.to_vec(), stats);
1092 let adaptive = compute_adaptive_column_gap(&bands);
1093 let floor = stats
1094 .median_font_size
1095 .max(region_width * XY_CUT_VERTICAL_GAP_REGION_FRACTION)
1096 .max(XY_CUT_VERTICAL_GAP_FLOOR);
1097 let min_gap = adaptive.min(floor).max(XY_CUT_VERTICAL_GAP_FLOOR);
1098
1099 let mut intervals: Vec<(f64, f64)> = spans
1102 .iter()
1103 .map(|s| (s.x, s.right().max(s.x + 0.001)))
1104 .collect();
1105 intervals.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(Ordering::Equal));
1106
1107 let mut cursor = intervals[0].1;
1108 let mut best_gap: Option<(f64, f64)> = None; for (left, right) in intervals.iter().skip(1) {
1110 if *left > cursor {
1111 let gap = *left - cursor;
1112 if gap >= min_gap {
1113 match best_gap {
1114 Some((best, _)) if best >= gap => {}
1115 _ => {
1116 let cut_x = (cursor + *left) * 0.5;
1117 best_gap = Some((gap, cut_x));
1118 }
1119 }
1120 }
1121 }
1122 cursor = cursor.max(*right);
1123 }
1124
1125 let (gap_size, cut_x) = best_gap?;
1126
1127 let mut left_group = Vec::new();
1130 let mut right_group = Vec::new();
1131 for span in spans {
1132 let midpoint = span.x + (span.right() - span.x) * 0.5;
1133 if midpoint < cut_x {
1134 left_group.push(span.clone());
1135 } else {
1136 right_group.push(span.clone());
1137 }
1138 }
1139
1140 if !columns_are_dense(&left_group, &right_group, stats) {
1141 return None;
1142 }
1143 if !columns_are_band_aligned(spans, cut_x, region_left, region_right, stats) {
1144 return None;
1145 }
1146
1147 Some((vec![left_group, right_group], gap_size))
1148}
1149
1150fn columns_are_band_aligned(
1156 spans: &[TextSpan],
1157 cut_x: f64,
1158 region_left: f64,
1159 region_right: f64,
1160 stats: &PageStats,
1161) -> bool {
1162 let left_width = (cut_x - region_left).max(1.0);
1163 let right_width = (region_right - cut_x).max(1.0);
1164
1165 const MAX_SINGLE_SIDE_FRACTION: f64 = 0.70;
1170
1171 let bands = group_spans_into_bands_with_stats(spans.to_vec(), stats);
1172 for band in &bands {
1173 let mut has_left = false;
1174 let mut has_right = false;
1175 for span in &band.spans {
1176 let midpoint = span.x + (span.right() - span.x) * 0.5;
1177 if midpoint < cut_x {
1178 has_left = true;
1179 } else {
1180 has_right = true;
1181 }
1182 }
1183 if has_left && has_right {
1184 continue; }
1186 let band_width = band.width();
1187 if has_left && band_width > left_width * MAX_SINGLE_SIDE_FRACTION {
1188 return false;
1189 }
1190 if has_right && band_width > right_width * MAX_SINGLE_SIDE_FRACTION {
1191 return false;
1192 }
1193 }
1194 true
1195}
1196
1197fn columns_are_dense(left: &[TextSpan], right: &[TextSpan], stats: &PageStats) -> bool {
1202 for col in [left, right] {
1203 if col.len() < XY_CUT_MIN_SPANS_PER_COLUMN {
1204 return false;
1205 }
1206 let bands = group_spans_into_bands_with_stats(col.to_vec(), stats);
1207 if bands.is_empty() {
1208 return false;
1209 }
1210 let total_chars: usize = col.iter().map(|s| s.text.chars().count()).sum();
1211 let chars_per_band = total_chars as f64 / bands.len() as f64;
1212 if chars_per_band < XY_CUT_MIN_CHARS_PER_BAND {
1213 return false;
1214 }
1215 }
1216 true
1217}
1218
1219fn try_horizontal_cut(spans: &[TextSpan], stats: &PageStats) -> Option<(Vec<Vec<TextSpan>>, f64)> {
1223 if spans.len() < 2 {
1224 return None;
1225 }
1226 let mut sorted = spans.to_vec();
1228 sorted.sort_by(|a, b| {
1229 b.y.partial_cmp(&a.y)
1230 .unwrap_or(Ordering::Equal)
1231 .then_with(|| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal))
1232 });
1233
1234 let min_gap = if stats.median_line_spacing > 0.0 {
1240 stats.median_line_spacing * PARAGRAPH_BREAK_LINE_SPACING_MULTIPLIER
1241 } else {
1242 stats.median_font_size * XY_CUT_HORIZONTAL_GAP_FONT_MULTIPLIER
1243 };
1244
1245 let mut best: Option<(f64, f64)> = None; let tolerance = stats.median_font_size * BAND_Y_FRACTION;
1248 let mut band_bottom = sorted[0].y;
1249
1250 for span in sorted.iter().skip(1) {
1251 if (band_bottom - span.y).abs() <= tolerance {
1252 band_bottom = band_bottom.min(span.y);
1253 continue;
1254 }
1255 let gap = band_bottom - span.y;
1256 if gap >= min_gap {
1257 let cut_y = (band_bottom + span.y) * 0.5;
1258 match best {
1259 Some((best_gap, _)) if best_gap >= gap => {}
1260 _ => best = Some((gap, cut_y)),
1261 }
1262 }
1263 band_bottom = span.y;
1264 }
1265
1266 let (gap_size, cut_y) = best?;
1267
1268 let mut top_group = Vec::new();
1269 let mut bottom_group = Vec::new();
1270 for span in spans {
1271 if span.y > cut_y {
1272 top_group.push(span.clone());
1273 } else {
1274 bottom_group.push(span.clone());
1275 }
1276 }
1277 if top_group.is_empty() || bottom_group.is_empty() {
1278 return None;
1279 }
1280 Some((vec![top_group, bottom_group], gap_size))
1281}
1282
1283#[allow(dead_code)]
1287fn group_spans_into_blocks_legacy(spans: Vec<TextSpan>) -> Vec<TextBlock> {
1288 let bands = group_spans_into_bands(spans);
1289 group_spans_into_blocks_legacy_from_bands(bands)
1290}
1291
1292fn group_spans_into_blocks_legacy_with_stats(
1293 spans: Vec<TextSpan>,
1294 stats: &PageStats,
1295) -> Vec<TextBlock> {
1296 let bands = group_spans_into_bands_with_stats(spans, stats);
1297 group_spans_into_blocks_legacy_from_bands(bands)
1298}
1299
/// Turns bands into blocks, detecting runs of bands that share column gaps.
///
/// Bands are walked in order. A band without a column-sized gap becomes one
/// row block. A band with gaps seeds a candidate column region, which is
/// extended over the following bands for as long as they either show
/// matching gaps or fit inside a single column. If the candidate passes
/// `region_is_columnar`, its blocks are emitted per column; otherwise only
/// the seeding band is emitted as a row and scanning resumes at the next band.
fn group_spans_into_blocks_legacy_from_bands(bands: Vec<TextBand>) -> Vec<TextBlock> {
    if bands.is_empty() {
        return Vec::new();
    }

    let column_gap_threshold = compute_adaptive_column_gap(&bands);

    let mut blocks = Vec::new();
    let mut idx = 0;

    while idx < bands.len() {
        let gap_midpoints = bands[idx].gap_midpoints(column_gap_threshold);
        if gap_midpoints.is_empty() {
            // No column-sized gap: plain row.
            blocks.push(bands[idx].row_block());
            idx += 1;
            continue;
        }

        // Seed a candidate region from this band's gaps.
        let mut boundaries = gap_midpoints.clone();
        let mut band_indices = vec![idx];
        let mut gapped_band_count = 1usize;
        let mut region_left = bands[idx].left();
        let mut region_right = bands[idx].right();
        let mut next_idx = idx + 1;

        while next_idx < bands.len() {
            let next_band = &bands[next_idx];
            let next_gap_midpoints = next_band.gap_midpoints(column_gap_threshold);
            if next_gap_midpoints.is_empty() {
                // A gapless band may still belong to the region if it sits
                // entirely inside one column. It does not widen the region
                // or count as gap support.
                if next_band
                    .fits_single_column(&boundaries, region_left, region_right)
                    .is_some()
                {
                    band_indices.push(next_idx);
                    next_idx += 1;
                    continue;
                }
                break;
            }

            if !boundaries_match(&boundaries, &next_gap_midpoints, column_gap_threshold) {
                break;
            }

            // Fold the new midpoints into the running average; this must
            // happen before the count is incremented.
            update_boundaries(&mut boundaries, &next_gap_midpoints, gapped_band_count);
            gapped_band_count += 1;
            band_indices.push(next_idx);
            region_left = region_left.min(next_band.left());
            region_right = region_right.max(next_band.right());
            next_idx += 1;
        }

        if region_is_columnar(&bands, &band_indices, &boundaries, gapped_band_count) {
            append_column_region_blocks(&bands, &band_indices, &boundaries, &mut blocks);
            idx = next_idx;
        } else {
            // Not convincing as columns: emit just the seeding band and let
            // the following bands be re-examined on their own.
            blocks.push(bands[idx].row_block());
            idx += 1;
        }
    }

    blocks
}
1363
1364fn group_spans_into_bands(spans: Vec<TextSpan>) -> Vec<TextBand> {
1368 let stats = PageStats::from_spans(&spans);
1369 group_spans_into_bands_with_stats(spans, &stats)
1370}
1371
1372fn group_spans_into_bands_with_stats(mut spans: Vec<TextSpan>, stats: &PageStats) -> Vec<TextBand> {
1373 if spans.is_empty() {
1374 return Vec::new();
1375 }
1376
1377 spans.sort_by(|a, b| {
1378 b.y.partial_cmp(&a.y)
1379 .unwrap_or(Ordering::Equal)
1380 .then_with(|| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal))
1381 });
1382
1383 let page_tolerance = (stats.median_font_size * BAND_Y_FRACTION).max(BAND_Y_TOLERANCE);
1389
1390 let mut bands: Vec<TextBand> = Vec::new();
1391
1392 for span in spans {
1393 let tolerance = (span.height * BAND_Y_FRACTION)
1394 .max(page_tolerance)
1395 .max(BAND_Y_TOLERANCE);
1396 if let Some(band) = bands
1397 .iter_mut()
1398 .find(|band| (band.y - span.y).abs() <= tolerance)
1399 {
1400 let span_count = band.spans.len() as f64;
1401 band.y = (band.y * span_count + span.y) / (span_count + 1.0);
1402 band.spans.push(span);
1403 } else {
1404 bands.push(TextBand::new(span));
1405 }
1406 }
1407
1408 for band in &mut bands {
1409 band.sort_spans();
1410 }
1411
1412 bands.sort_by(|a, b| b.y.partial_cmp(&a.y).unwrap_or(Ordering::Equal));
1413 bands
1414}
1415
1416fn boundaries_match(boundaries: &[f64], gap_midpoints: &[f64], column_gap_threshold: f64) -> bool {
1417 let tolerance = (column_gap_threshold * 1.5).clamp(COLUMN_GAP_MATCH_TOLERANCE, 60.0);
1418 boundaries.len() == gap_midpoints.len()
1419 && boundaries
1420 .iter()
1421 .zip(gap_midpoints)
1422 .all(|(lhs, rhs)| (lhs - rhs).abs() <= tolerance)
1423}
1424
/// Folds a new set of gap midpoints into the running-average boundaries.
///
/// `seen_gapped_bands` is the number of bands already averaged into
/// `boundaries`; each boundary becomes the mean over one more sample.
/// Midpoints beyond the number of boundaries are ignored.
fn update_boundaries(boundaries: &mut [f64], gap_midpoints: &[f64], seen_gapped_bands: usize) {
    let prior_weight = seen_gapped_bands as f64;
    let total_weight = prior_weight + 1.0;

    boundaries
        .iter_mut()
        .zip(gap_midpoints)
        .for_each(|(boundary, midpoint)| {
            *boundary = (*boundary * prior_weight + midpoint) / total_weight;
        });
}
1431
1432fn region_is_columnar(
1433 bands: &[TextBand],
1434 band_indices: &[usize],
1435 boundaries: &[f64],
1436 gapped_band_count: usize,
1437) -> bool {
1438 if boundaries.is_empty()
1439 || gapped_band_count < MIN_COLUMN_GAPPED_BANDS
1440 || band_indices.is_empty()
1441 || (gapped_band_count as f64 / band_indices.len() as f64) < MIN_COLUMN_GAP_SUPPORT
1442 {
1443 return false;
1444 }
1445
1446 let mut non_empty_slices = 0usize;
1447 let mut dense_slices = 0usize;
1448 let mut slices_per_column = vec![0usize; boundaries.len() + 1];
1449
1450 for &band_idx in band_indices {
1451 let slices = bands[band_idx].split_by_boundaries(boundaries);
1452 for (column_idx, slice) in slices.iter().enumerate() {
1453 if slice.is_empty() {
1454 continue;
1455 }
1456
1457 non_empty_slices += 1;
1458 slices_per_column[column_idx] += 1;
1459
1460 let char_count = slice
1461 .iter()
1462 .map(|span| span.text.chars().count())
1463 .sum::<usize>();
1464 if slice.len() >= 2 || char_count >= 8 {
1465 dense_slices += 1;
1466 }
1467 }
1468 }
1469
1470 if non_empty_slices < boundaries.len() + 2 {
1471 return false;
1472 }
1473
1474 if slices_per_column.contains(&0) {
1475 return false;
1476 }
1477
1478 (dense_slices as f64 / non_empty_slices as f64) >= MIN_DENSE_SLICE_RATIO
1479}
1480
1481fn append_column_region_blocks(
1482 bands: &[TextBand],
1483 band_indices: &[usize],
1484 boundaries: &[f64],
1485 blocks: &mut Vec<TextBlock>,
1486) {
1487 let column_count = boundaries.len() + 1;
1488 let mut column_bands = vec![Vec::<TextSpan>::new(); column_count];
1489
1490 for &band_idx in band_indices {
1491 let slices = bands[band_idx].split_by_boundaries(boundaries);
1492 for (column_idx, slice) in slices.into_iter().enumerate() {
1493 if slice.is_empty() {
1494 continue;
1495 }
1496 column_bands[column_idx].push(TextSpan::default());
1497 let marker_idx = column_bands[column_idx].len() - 1;
1498 column_bands[column_idx][marker_idx] = TextSpan {
1499 x: f64::NEG_INFINITY,
1500 y: bands[band_idx].y,
1501 ..TextSpan::default()
1502 };
1503 column_bands[column_idx].extend(slice);
1504 }
1505 }
1506
1507 for spans in column_bands {
1508 let mut current: Vec<TextSpan> = Vec::new();
1509 for span in spans {
1510 if span.x == f64::NEG_INFINITY {
1511 if !current.is_empty() {
1512 current.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
1513 blocks.push(TextBlock {
1514 spans: std::mem::take(&mut current),
1515 });
1516 }
1517 continue;
1518 }
1519 current.push(span);
1520 }
1521 if !current.is_empty() {
1522 current.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
1523 blocks.push(TextBlock { spans: current });
1524 }
1525 }
1526}
1527
1528fn stitch_hyphenated_lines(lines: &[String]) -> String {
1543 let mut out = String::new();
1544 for (idx, line) in lines.iter().enumerate() {
1545 if idx == 0 {
1546 out.push_str(line);
1547 continue;
1548 }
1549
1550 let next_trimmed = line.trim_start();
1551
1552 let should_merge = is_hyphen_wrap_candidate(&out, next_trimmed);
1554
1555 if should_merge {
1556 out.pop(); out.push_str(next_trimmed);
1558 } else {
1559 out.push('\n');
1560 out.push_str(line);
1561 }
1562 }
1563 out
1564}
1565
/// Heuristic: does `accumulated` end in a word fragment that was hyphenated
/// at a line break and continues with `next_trimmed`?
///
/// All of the following must hold:
/// * `accumulated` ends with `'-'`;
/// * the hyphen is directly preceded by a run of at least three alphabetic
///   characters (Unicode-aware);
/// * `next_trimmed` starts with a run of at least three ASCII lowercase
///   letters.
///
/// The length requirements subsume the "character before the hyphen is a
/// letter" and "continuation starts lowercase" conditions: a run of length
/// zero fails the minimum of three.
fn is_hyphen_wrap_candidate(accumulated: &str, next_trimmed: &str) -> bool {
    let Some(stem) = accumulated.strip_suffix('-') else {
        return false;
    };

    let trailing_letters = stem
        .chars()
        .rev()
        .take_while(|c| c.is_alphabetic())
        .count();
    let leading_lowercase = next_trimmed
        .chars()
        .take_while(char::is_ascii_lowercase)
        .count();

    trailing_letters >= 3 && leading_lowercase >= 3
}
1608
/// Normalizes extracted text for output.
///
/// * Trailing whitespace is removed from every line, except form feeds
///   (`U+000C`), which are page-break markers and are always kept.
/// * Runs of blank lines are capped at two.
/// * Trailing blank lines are dropped.
/// * Non-empty output always ends with exactly one `'\n'`.
///
/// Returns an empty string when the input is empty or only whitespace
/// (other than form feeds).
///
/// `str::trim_end` cannot be used for the per-line trim: form feed is Unicode
/// whitespace, so a line holding only a page break would be trimmed to `""`
/// and treated as a blank line, silently losing the page break.
pub(crate) fn normalize_text_output(text: &str) -> String {
    const FORM_FEED: char = '\x0C';

    let mut lines: Vec<&str> = text
        .split('\n')
        .map(|line| line.trim_end_matches(|c: char| c.is_whitespace() && c != FORM_FEED))
        .collect();

    while lines.last().is_some_and(|line| line.is_empty()) {
        lines.pop();
    }

    // +1 leaves room for the terminating newline added to an unterminated input.
    let mut result = String::with_capacity(text.len() + 1);
    let mut consecutive_empty = 0u32;

    for line in lines {
        if line.is_empty() {
            consecutive_empty = consecutive_empty.saturating_add(1);
            // Keep at most two blank lines in a row.
            if consecutive_empty <= 2 {
                result.push('\n');
            }
        } else {
            consecutive_empty = 0;
            result.push_str(line);
            // Trailing blank lines were popped above, so the last line handled
            // here is non-empty and this newline is the output's terminator.
            result.push('\n');
        }
    }

    result
}
1670
// Unit tests for band/column grouping, adaptive gap thresholds, output
// normalization, hyphen stitching, space-consensus scoring and span metadata.
#[cfg(test)]
mod tests {
    use super::*;

    // Builds a span with the given text and geometry; height and font size are
    // fixed at 12.0 and every other field takes its default.
    fn span(text: &str, x: f64, y: f64, width: f64) -> TextSpan {
        TextSpan {
            text: text.into(),
            x,
            y,
            width,
            height: 12.0,
            font_size: 12.0,
            ..TextSpan::default()
        }
    }

    // Runs the full grouping pipeline and returns the text of each block in order.
    fn block_texts(spans: Vec<TextSpan>) -> Vec<String> {
        group_spans_into_blocks(spans)
            .into_iter()
            .map(|block| block.text())
            .collect()
    }

    // A freshly constructed device yields empty text.
    #[test]
    fn empty_device_produces_empty_text() {
        let dev = TextExtractionDevice::new();
        assert!(dev.into_text().is_empty());
    }

    // Three stacked lines at one x position come out in top-to-bottom order.
    #[test]
    fn single_column_stays_row_major() {
        let texts = block_texts(vec![
            span("Single Column Line 1", 40.0, 700.0, 140.0),
            span("Single Column Line 2", 40.0, 684.0, 140.0),
            span("Single Column Line 3", 40.0, 668.0, 140.0),
        ]);

        assert_eq!(
            texts,
            vec![
                "Single Column Line 1",
                "Single Column Line 2",
                "Single Column Line 3",
            ]
        );
    }

    // Header, three two-column rows, footer: the body is read left column
    // first, then right column.
    #[test]
    fn two_column_region_reads_column_major() {
        let texts = block_texts(vec![
            span("Header", 200.0, 740.0, 80.0),
            span("Left column line one", 40.0, 700.0, 115.0),
            span("Right column line one", 320.0, 700.0, 120.0),
            span("Left column line two", 40.0, 684.0, 115.0),
            span("Right column line two", 320.0, 684.0, 120.0),
            span("Left column line three", 40.0, 668.0, 125.0),
            span("Right column line three", 320.0, 668.0, 130.0),
            span("Footer", 200.0, 620.0, 80.0),
        ]);

        assert_eq!(
            texts,
            vec![
                "Header",
                "Left column line one",
                "Left column line two",
                "Left column line three",
                "Right column line one",
                "Right column line two",
                "Right column line three",
                "Footer",
            ]
        );
    }

    // Single-column intro/outro bands stay in place around a column-major body.
    #[test]
    fn mixed_single_and_multi_column_regions_preserve_shared_bands() {
        let texts = block_texts(vec![
            span("Intro paragraph", 40.0, 740.0, 180.0),
            span("L1 words here", 40.0, 700.0, 110.0),
            span("R1 words here", 320.0, 700.0, 110.0),
            span("L2 words here", 40.0, 684.0, 110.0),
            span("R2 words here", 320.0, 684.0, 110.0),
            span("L3 words here", 40.0, 668.0, 110.0),
            span("R3 words here", 320.0, 668.0, 110.0),
            span("Outro paragraph", 40.0, 620.0, 180.0),
        ]);

        assert_eq!(
            texts,
            vec![
                "Intro paragraph",
                "L1 words here",
                "L2 words here",
                "L3 words here",
                "R1 words here",
                "R2 words here",
                "R3 words here",
                "Outro paragraph",
            ]
        );
    }

    // Short table-like cells are kept as rows rather than split into columns.
    #[test]
    fn short_table_like_rows_fall_back_to_row_major() {
        let texts = block_texts(vec![
            span("Name", 40.0, 700.0, 30.0),
            span("Age", 320.0, 700.0, 20.0),
            span("Alice", 40.0, 684.0, 35.0),
            span("30", 320.0, 684.0, 15.0),
            span("Bob", 40.0, 668.0, 24.0),
            span("25", 320.0, 668.0, 15.0),
        ]);

        assert_eq!(texts, vec!["Name Age", "Alice 30", "Bob 25"]);
    }

    // Three aligned columns over three rows are emitted column by column.
    #[test]
    fn three_column_regions_are_supported() {
        let texts = block_texts(vec![
            span("Column one line one", 40.0, 700.0, 105.0),
            span("Column two line one", 220.0, 700.0, 105.0),
            span("Column three line one", 400.0, 700.0, 120.0),
            span("Column one line two", 40.0, 684.0, 105.0),
            span("Column two line two", 220.0, 684.0, 105.0),
            span("Column three line two", 400.0, 684.0, 120.0),
            span("Column one line three", 40.0, 668.0, 120.0),
            span("Column two line three", 220.0, 668.0, 120.0),
            span("Column three line three", 400.0, 668.0, 135.0),
        ]);

        assert_eq!(
            texts,
            vec![
                "Column one line one",
                "Column one line two",
                "Column one line three",
                "Column two line one",
                "Column two line two",
                "Column two line three",
                "Column three line one",
                "Column three line two",
                "Column three line three",
            ]
        );
    }

    // Two spans separated by a visible gap are joined with a single space.
    #[test]
    fn text_block_concatenation_spaced() {
        let block = TextBlock {
            spans: vec![span("A", 0.0, 0.0, 6.0), span("B", 20.0, 0.0, 6.0)],
        };
        assert_eq!(block.text(), "A B");
    }

    // Bands with a single span each offer no inter-span gaps, so the fallback
    // threshold is expected.
    #[test]
    fn adaptive_column_gap_fallback_for_no_gaps() {
        let bands = vec![
            TextBand::new(span("Hello", 40.0, 700.0, 80.0)),
            TextBand::new(span("World", 40.0, 684.0, 80.0)),
        ];
        let threshold = compute_adaptive_column_gap(&bands);
        assert!((threshold - COLUMN_GAP_THRESHOLD_FALLBACK).abs() < 0.01);
    }

    // Three bands of 30-wide words starting 34 apart (gaps of 4); the
    // assertion expects a threshold of roughly 12.
    #[test]
    fn adaptive_column_gap_uses_median() {
        let mut bands = Vec::new();
        for y in [700.0, 684.0, 668.0] {
            let mut band = TextBand::new(span("word1", 40.0, y, 30.0));
            band.spans.push(span("word2", 74.0, y, 30.0));
            band.spans.push(span("word3", 108.0, y, 30.0));
            bands.push(band);
        }
        let threshold = compute_adaptive_column_gap(&bands);
        assert!(
            (10.0..=14.0).contains(&threshold),
            "expected ~12, got {threshold}"
        );
    }

    // Very tight gaps (2 units) must not push the threshold below the minimum.
    #[test]
    fn adaptive_column_gap_clamps_to_min() {
        let mut bands = Vec::new();
        for y in [700.0, 684.0, 668.0, 652.0] {
            let mut band = TextBand::new(span("abc", 0.0, y, 18.0));
            band.spans.push(span("def", 20.0, y, 18.0));
            bands.push(band);
        }
        let threshold = compute_adaptive_column_gap(&bands);
        assert!(
            (threshold - COLUMN_GAP_THRESHOLD_MIN).abs() < 0.01,
            "expected {COLUMN_GAP_THRESHOLD_MIN}, got {threshold}"
        );
    }

    // A single 50-unit gap (30-wide span at 0, next span at 80): the expected
    // threshold is 0.75 x 50 = 37.5.
    #[test]
    fn adaptive_column_gap_all_large_gaps_uses_fraction_of_min() {
        let mut band = TextBand::new(span("Left", 0.0, 700.0, 30.0));
        band.spans.push(span("Right", 80.0, 700.0, 30.0));
        let bands = vec![band];
        let threshold = compute_adaptive_column_gap(&bands);
        assert!(
            (threshold - 37.5).abs() < 0.01,
            "expected 37.5 (0.75×50), got {threshold}"
        );
    }

    // Trailing spaces are removed from every line.
    #[test]
    fn normalize_trims_trailing_whitespace_per_line() {
        assert_eq!(
            normalize_text_output("hello   \nworld  \n"),
            "hello\nworld\n"
        );
    }

    // Four blank lines between paragraphs collapse to two.
    #[test]
    fn normalize_collapses_excess_newlines() {
        assert_eq!(
            normalize_text_output("hello\n\n\n\n\nworld\n"),
            "hello\n\n\nworld\n"
        );
    }

    // A single blank line between paragraphs is left alone.
    #[test]
    fn normalize_preserves_double_newline() {
        assert_eq!(
            normalize_text_output("paragraph one\n\nparagraph two\n"),
            "paragraph one\n\nparagraph two\n"
        );
    }

    // A form feed at the start of a content line survives normalization.
    #[test]
    fn normalize_preserves_form_feed() {
        assert_eq!(
            normalize_text_output("page1\n\n\x0Cpage2\n"),
            "page1\n\n\x0Cpage2\n"
        );
    }

    // Unterminated input gains a final newline.
    #[test]
    fn normalize_adds_trailing_newline() {
        assert_eq!(normalize_text_output("hello"), "hello\n");
    }

    // Empty input stays empty.
    #[test]
    fn normalize_empty_input() {
        assert_eq!(normalize_text_output(""), "");
    }

    // Whitespace-only input normalizes to the empty string.
    #[test]
    fn normalize_only_whitespace() {
        assert_eq!(normalize_text_output("   \n  \n"), "");
    }

    // "aver-" + "age" is rejoined into "average" without a newline.
    #[test]
    fn hyphen_stitch_joins_wrapped_word() {
        let lines = vec!["the aver-".into(), "age rainfall".into()];
        assert_eq!(stitch_hyphenated_lines(&lines), "the average rainfall");
    }

    // Leading whitespace on the continuation line is dropped when stitching.
    #[test]
    fn hyphen_stitch_handles_leading_whitespace() {
        let lines = vec!["pre-".into(), "  dict the outcome".into()];
        assert_eq!(stitch_hyphenated_lines(&lines), "predict the outcome");
    }

    // A continuation that starts with an uppercase letter is not stitched.
    #[test]
    fn hyphen_stitch_capital_continuation_not_stitched() {
        let lines = vec!["Section three-".into(), "Summary here".into()];
        assert_eq!(
            stitch_hyphenated_lines(&lines),
            "Section three-\nSummary here"
        );
    }

    // A bare "-" line (no letters before the hyphen) is left untouched.
    #[test]
    fn hyphen_stitch_bullet_dash_not_stitched() {
        let lines = vec!["Items:".into(), "-".into(), "milk".into()];
        assert_eq!(stitch_hyphenated_lines(&lines), "Items:\n-\nmilk");
    }

    // A digit before the hyphen prevents stitching.
    #[test]
    fn hyphen_stitch_numeric_range_not_stitched() {
        let lines = vec!["page 42-".into(), "seventy".into()];
        assert_eq!(stitch_hyphenated_lines(&lines), "page 42-\nseventy");
    }

    // Fewer than three letters before the hyphen prevents stitching.
    #[test]
    fn hyphen_stitch_short_prefix_not_stitched() {
        let lines = vec!["re-".into(), "organize".into()];
        assert_eq!(stitch_hyphenated_lines(&lines), "re-\norganize");
    }

    // A continuation word shorter than three letters prevents stitching.
    #[test]
    fn hyphen_stitch_short_continuation_not_stitched() {
        let lines = vec!["counter-".into(), "an example".into()];
        assert_eq!(stitch_hyphenated_lines(&lines), "counter-\nan example");
    }

    // Hyphens inside a line are never touched; only line-final ones are considered.
    #[test]
    fn hyphen_stitch_compound_word_midline_preserved() {
        let lines = vec!["real-time system".into()];
        assert_eq!(stitch_hyphenated_lines(&lines), "real-time system");
    }

    // A single line is returned as-is.
    #[test]
    fn hyphen_stitch_single_line_unchanged() {
        let lines = vec!["only line".into()];
        assert_eq!(stitch_hyphenated_lines(&lines), "only line");
    }

    // No lines in, empty string out.
    #[test]
    fn hyphen_stitch_empty_input() {
        let lines: Vec<String> = vec![];
        assert_eq!(stitch_hyphenated_lines(&lines), "");
    }

    // Builds a device whose cached median glyph width equals `median` by
    // feeding MEDIAN_REFRESH identical samples and refreshing the cache.
    fn make_device_with_median(median: f64) -> TextExtractionDevice {
        let mut dev = TextExtractionDevice::new();
        for _ in 0..MEDIAN_REFRESH {
            dev.glyph_widths.push(median);
        }
        dev.refresh_median_char_width();
        assert!((dev.cached_median_char_width - median).abs() < 1e-9);
        dev
    }

    // A pending TJ offset of 250 is enough for a space despite a tiny 0.5 gap.
    #[test]
    fn consensus_inserts_space_on_strong_tj_offset_alone() {
        let mut dev = make_device_with_median(6.0);
        dev.pending_tj_offset = 250.0;
        assert!(dev.evaluate_space_consensus(0.5, 12.0, "Hello", "World"));
    }

    // A 2.5 gap against a median glyph width of 6.0 is enough on its own.
    #[test]
    fn consensus_inserts_space_on_geometric_gap_alone() {
        let dev = make_device_with_median(6.0);
        assert!(dev.evaluate_space_consensus(2.5, 12.0, "hello", "world"));
    }

    // A 0.5 gap with no TJ offset must not produce a space.
    #[test]
    fn consensus_no_space_on_kerning_gap() {
        let dev = make_device_with_median(6.0);
        assert!(!dev.evaluate_space_consensus(0.5, 12.0, "fi", "lm"));
    }

    // Lowercase-to-uppercase transition combined with a 2.5 gap yields a space.
    #[test]
    fn consensus_inserts_space_on_camel_case_plus_gap() {
        let dev = make_device_with_median(6.0);
        assert!(dev.evaluate_space_consensus(2.5, 12.0, "helloWorld", "Inc"));
    }

    // Digit-to-letter transition combined with a 2.5 gap yields a space.
    #[test]
    fn consensus_inserts_space_on_digit_letter_transition_with_gap() {
        let dev = make_device_with_median(6.0);
        assert!(dev.evaluate_space_consensus(2.5, 12.0, "123", "abc"));
    }

    // A case transition with only a 0.5 gap is not enough for a space.
    #[test]
    fn consensus_heuristic_alone_is_insufficient() {
        let dev = make_device_with_median(6.0);
        assert!(!dev.evaluate_space_consensus(0.5, 12.0, "camel", "Case"));
    }

    // With no glyph-width samples recorded, a 1.9 gap at 12pt counts as a
    // space while 1.5 does not.
    // NOTE(review): presumably the cut-off is font_size x
    // GAP_TO_FONT_SIZE_FALLBACK_FRACTION = 1.8 - confirm against the scorer.
    #[test]
    fn consensus_falls_back_to_font_size_when_no_median() {
        let dev = TextExtractionDevice::new();
        assert!(dev.evaluate_space_consensus(1.9, 12.0, "a", "b"));
        assert!(!dev.evaluate_space_consensus(1.5, 12.0, "a", "b"));
    }

    // A TJ offset of 50 does not trigger a space.
    #[test]
    fn consensus_ignores_tiny_tj_offsets() {
        let mut dev = make_device_with_median(6.0);
        dev.pending_tj_offset = 50.0;
        assert!(!dev.evaluate_space_consensus(0.5, 12.0, "Hello", "World"));
    }

    // A TJ offset of -250 triggers a space just like +250 does.
    #[test]
    fn consensus_accepts_negative_tj_offsets() {
        let mut dev = make_device_with_median(6.0);
        dev.pending_tj_offset = -250.0;
        assert!(dev.evaluate_space_consensus(0.5, 12.0, "Hello", "World"));
    }

    // Successive text adjustments add up in `pending_tj_offset`.
    #[test]
    fn text_adjustment_accumulates_until_glyph() {
        let mut dev = TextExtractionDevice::new();
        dev.text_adjustment(120.0);
        dev.text_adjustment(140.0);
        assert!((dev.pending_tj_offset - 260.0).abs() < 1e-6);
    }

    // Headline first, footer last, and the whole left column before the right
    // column in between.
    #[test]
    fn xy_cut_header_body_footer_with_two_columns() {
        let texts = block_texts(vec![
            span("HEADLINE TITLE", 180.0, 760.0, 120.0),
            span("Left col line A", 40.0, 700.0, 110.0),
            span("Right col line A", 320.0, 700.0, 115.0),
            span("Left col line B", 40.0, 684.0, 110.0),
            span("Right col line B", 320.0, 684.0, 115.0),
            span("Left col line C", 40.0, 668.0, 110.0),
            span("Right col line C", 320.0, 668.0, 115.0),
            span("FOOTER LINE TEXT", 180.0, 600.0, 120.0),
        ]);
        assert_eq!(texts.first().map(String::as_str), Some("HEADLINE TITLE"));
        assert_eq!(texts.last().map(String::as_str), Some("FOOTER LINE TEXT"));
        let left_c_idx = texts.iter().position(|s| s == "Left col line C").unwrap();
        let right_a_idx = texts.iter().position(|s| s == "Right col line A").unwrap();
        assert!(
            left_c_idx < right_a_idx,
            "expected column-major ordering in body: {texts:?}"
        );
    }

    // A two-row table with short cells stays row-major.
    #[test]
    fn xy_cut_rejects_column_split_on_table_rows() {
        let texts = block_texts(vec![
            span("Name", 40.0, 700.0, 30.0),
            span("Age", 320.0, 700.0, 20.0),
            span("Alice", 40.0, 684.0, 35.0),
            span("30", 320.0, 684.0, 15.0),
        ]);
        assert_eq!(texts, vec!["Name Age", "Alice 30"]);
    }

    // A full-width intro line must come out first, ahead of the split rows below it.
    #[test]
    fn xy_cut_rejects_column_split_when_one_band_is_full_width() {
        let texts = block_texts(vec![
            span(
                "Full width intro spanning both columns here",
                40.0,
                740.0,
                360.0,
            ),
            span("Left A", 40.0, 700.0, 50.0),
            span("Right A", 320.0, 700.0, 50.0),
            span("Left B", 40.0, 684.0, 50.0),
            span("Right B", 320.0, 684.0, 50.0),
        ]);
        assert!(
            texts[0].contains("Full width intro"),
            "expected full-width intro first: {texts:?}"
        );
    }

    // Two paragraphs 60 units apart vertically become two blocks, top one first.
    #[test]
    fn xy_cut_horizontal_split_for_zone_boundaries() {
        let texts = block_texts(vec![
            span("First paragraph body text", 40.0, 740.0, 200.0),
            span("Second paragraph body", 40.0, 680.0, 180.0),
        ]);
        assert_eq!(texts.len(), 2);
        assert!(texts[0].starts_with("First"));
        assert!(texts[1].starts_with("Second"));
    }

    // A page with one span yields exactly that one block.
    #[test]
    fn xy_cut_recursion_terminates_with_single_span() {
        let texts = block_texts(vec![span("Only one span on the page", 40.0, 700.0, 180.0)]);
        assert_eq!(texts, vec!["Only one span on the page"]);
    }

    // Font sizes 8, 12 and 24 have a median of 12.
    #[test]
    fn median_font_size_handles_mixed_sizes() {
        let spans = vec![
            TextSpan {
                text: "small".into(),
                width: 10.0,
                height: 8.0,
                font_size: 8.0,
                ..TextSpan::default()
            },
            TextSpan {
                text: "medium".into(),
                width: 10.0,
                height: 12.0,
                font_size: 12.0,
                ..TextSpan::default()
            },
            TextSpan {
                text: "large".into(),
                width: 10.0,
                height: 24.0,
                font_size: 24.0,
                ..TextSpan::default()
            },
        ];
        assert!((median_font_size(&spans) - 12.0).abs() < 1e-9);
    }

    // Two rows that each have a left and a right span are accepted as
    // band-aligned around x = 200.
    #[test]
    fn columns_band_aligned_accepts_aligned_columns() {
        let spans = vec![
            span("L1", 40.0, 700.0, 60.0),
            span("R1", 300.0, 700.0, 60.0),
            span("L2", 40.0, 684.0, 60.0),
            span("R2", 300.0, 684.0, 60.0),
        ];
        let stats = PageStats::from_spans(&spans);
        assert!(columns_are_band_aligned(&spans, 200.0, 40.0, 360.0, &stats));
    }

    // A 280-wide banner that crosses x = 200 defeats the alignment check.
    #[test]
    fn columns_band_aligned_rejects_wide_single_side_band() {
        let spans = vec![
            span("Wide banner line across top", 40.0, 740.0, 280.0),
            span("L1", 40.0, 700.0, 60.0),
            span("R1", 300.0, 700.0, 60.0),
        ];
        let stats = PageStats::from_spans(&spans);
        assert!(!columns_are_band_aligned(
            &spans, 200.0, 40.0, 360.0, &stats
        ));
    }

    // Fixture: all spans are 12pt, every span is 10 units wide per character
    // (30/3, 30/3, 50/5) and the lines are 20 units apart.
    #[test]
    fn page_stats_computes_median_values() {
        let spans = vec![
            span("one", 40.0, 700.0, 30.0),
            span("two", 40.0, 680.0, 30.0),
            span("three", 40.0, 660.0, 50.0),
        ];
        let stats = PageStats::from_spans(&spans);
        assert!((stats.median_font_size - 12.0).abs() < 1e-9);
        assert!((stats.median_char_width - 10.0).abs() < 1e-9);
        assert!((stats.median_line_spacing - 20.0).abs() < 1e-9);
    }

    // With no spans the stats fall back to 12.0 / 6.0 / 0.0.
    #[test]
    fn page_stats_handles_empty_input() {
        let stats = PageStats::from_spans(&[]);
        assert!((stats.median_font_size - 12.0).abs() < 1e-9);
        assert!((stats.median_char_width - 6.0).abs() < 1e-9);
        assert_eq!(stats.median_line_spacing, 0.0);
    }

    // Word gaps of 4 inside each column versus a 12-unit gutter between
    // columns (spans end at 224, next starts at 236): the gutter must still
    // be detected.
    #[test]
    fn narrow_gutter_detected_with_adaptive_threshold() {
        let mut spans = Vec::new();
        for y in [700.0, 684.0, 668.0] {
            spans.push(span("Lorem ipsum", 40.0, y, 100.0));
            spans.push(span("dolor sit", 144.0, y, 80.0));
            spans.push(span("amet consec", 236.0, y, 100.0));
            spans.push(span("tetur adipi", 340.0, y, 80.0));
        }
        let texts = block_texts(spans);
        assert!(
            texts.len() >= 6,
            "expected column-major output, got {texts:?}"
        );
        assert!(
            texts[0].contains("Lorem"),
            "first block should be left column: {texts:?}"
        );
    }

    // A banner row followed by three ragged (not perfectly aligned) columns
    // is still read column by column.
    #[test]
    fn xy_cut_leaf_falls_back_to_legacy_columns_for_header_plus_three_columns() {
        let texts = block_texts(vec![
            span("73022", 45.0, 750.0, 70.0),
            span("Federal Register banner", 125.6, 750.0, 260.0),
            span("Left column line one", 45.0, 725.0, 140.0),
            span("Middle column line one", 222.0, 725.0, 140.0),
            span("Right column line one", 399.0, 725.0, 120.0),
            span("Left column line two", 45.0, 715.0, 140.0),
            span("Middle column line two", 210.0, 715.0, 152.0),
            span("Right column line two", 388.0, 715.0, 132.0),
            span("Left column line three", 45.0, 705.0, 140.0),
            span("Middle column line three", 235.0, 705.0, 135.0),
            span("Right column line three", 408.0, 705.0, 118.0),
        ]);

        assert_eq!(
            texts,
            vec![
                "73022 Federal Register banner",
                "Left column line one",
                "Left column line two",
                "Left column line three",
                "Middle column line one",
                "Middle column line two",
                "Middle column line three",
                "Right column line one",
                "Right column line two",
                "Right column line three",
            ]
        );
    }

    // Copies of the same text drawn at sub-point x offsets (a "fake bold"
    // effect) must appear only once in the output.
    #[test]
    fn overlapping_fake_bold_spans_collapse_to_single_copy() {
        let texts = block_texts(vec![
            span("1 This is fakebold text.", 25.9, 785.3, 320.0),
            span("1 This is fakebold text.", 26.2, 785.3, 320.0),
            span("1 This is fakebold text.", 26.4, 785.3, 320.0),
            span("1 This is fakebold text.", 26.7, 785.3, 320.0),
            span("2 This is a fakebold", 27.0, 714.8, 142.0),
            span(" fakebold", 169.8, 714.8, 70.0),
            span(" fakebold", 170.1, 714.8, 70.0),
            span(" fakebold word.", 170.4, 714.8, 110.0),
        ]);

        assert_eq!(
            texts,
            vec!["1 This is fakebold text.", "2 This is a fakebold word.",]
        );
    }

    // A default span carries no font name, style flags or colour.
    #[test]
    fn g1_default_text_span_has_empty_metadata() {
        let s = TextSpan::default();
        assert_eq!(s.font_name, None);
        assert!(!s.is_bold);
        assert!(!s.is_italic);
        assert_eq!(s.color, None);
    }

    // Only a six-letter "XXXXXX+" subset tag is stripped; shorter prefixes and
    // plain names pass through unchanged.
    #[test]
    fn g1_strip_subset_prefix_handles_six_char_prefix() {
        assert_eq!(strip_subset_prefix("AAAAAA+Helvetica"), "Helvetica");
        assert_eq!(strip_subset_prefix("ABC+Helvetica"), "ABC+Helvetica");
        assert_eq!(strip_subset_prefix("Helvetica-Bold"), "Helvetica-Bold");
    }

    // Font-name suffixes map to (is_bold, is_italic) hints.
    #[test]
    fn g1_name_style_hints_match_pdf_interpret_rules() {
        assert_eq!(name_style_hints("Helvetica-Bold"), (true, false));
        assert_eq!(name_style_hints("Times-Italic"), (false, true));
        assert_eq!(name_style_hints("MyFont-BoldOblique"), (true, true));
        assert_eq!(name_style_hints("Helvetica"), (false, false));
        assert_eq!(name_style_hints("Roboto-DemiBold"), (true, false));
        assert_eq!(name_style_hints("Roboto-Black"), (true, false));
        assert_eq!(name_style_hints("Roboto-Oblique"), (false, true));
        assert_eq!(name_style_hints("MyFont-Slanted"), (false, true));
    }

    // A default span reports an estimated width and has no per-character bounds.
    #[test]
    fn g2_default_text_span_has_estimate_width_source() {
        let s = TextSpan::default();
        assert_eq!(s.width_source, WidthSource::Estimate);
        assert!(s.char_bounds.is_empty());
    }

    // Sanity check on a hand-built single-glyph span: one [x0, y0, x1, y1]
    // box whose height equals the font size.
    #[test]
    fn g2_single_glyph_span_has_one_char_bound() {
        let s = TextSpan {
            text: "A".into(),
            x: 10.0,
            y: 100.0,
            width: 7.22,
            height: 10.0,
            font_size: 10.0,
            width_source: WidthSource::Metric,
            char_bounds: vec![[10.0, 100.0, 17.22, 110.0]],
            ..Default::default()
        };

        assert_eq!(s.char_bounds.len(), 1);
        let [x0, y0, x1, y1] = s.char_bounds[0];
        assert!((x0 - 10.0).abs() < 0.001);
        assert!((x1 - 17.22).abs() < 0.001);
        assert!((y1 - y0 - s.font_size).abs() < 0.001);
    }

    // Mimics a merge by hand: appending an estimated glyph box and downgrading
    // the width source. Only the field updates performed here are checked.
    #[test]
    fn g2_merged_span_degrades_width_source_on_estimate() {
        let mut s = TextSpan {
            width_source: WidthSource::Metric,
            char_bounds: vec![[0.0, 0.0, 7.0, 10.0]],
            ..Default::default()
        };

        s.char_bounds.push([7.0, 0.0, 12.0, 10.0]);
        s.width_source = WidthSource::Estimate;
        assert_eq!(s.width_source, WidthSource::Estimate);
        assert_eq!(s.char_bounds.len(), 2);
    }

    // Debug names, inequality of the two variants, and the Estimate default.
    #[test]
    fn g2_width_source_variants_are_correct() {
        assert_eq!(format!("{:?}", WidthSource::Metric), "Metric");
        assert_eq!(format!("{:?}", WidthSource::Estimate), "Estimate");
        assert_ne!(WidthSource::Metric, WidthSource::Estimate);
        assert_eq!(WidthSource::default(), WidthSource::Estimate);
    }
}