1use crate::models::bbox::BoundingBox;
18use crate::models::content::ContentElement;
19
20pub fn xycut_sort(elements: &mut [ContentElement], page_bbox: &BoundingBox) {
28 if elements.len() <= 1 {
29 return;
30 }
31 xycut_recursive(elements, page_bbox);
32}
33
34fn xycut_recursive(elements: &mut [ContentElement], region: &BoundingBox) {
35 if elements.len() <= 1 {
36 return;
37 }
38
39 let h_gap = find_horizontal_gap_size(elements);
41 let v_gap = find_vertical_gap_size(elements);
42 let vertical_analysis = v_gap.and_then(|(split_x, v_size)| {
43 analyze_vertical_cut(
44 elements,
45 split_x,
46 h_gap.map(|(_, gap)| gap).unwrap_or(0.0),
47 v_size,
48 )
49 .map(|analysis| (split_x, analysis))
50 });
51
52 if std::env::var("XYCUT_DEBUG").is_ok() {
53 eprintln!(
54 "[XYCUT] n={} h_gap={:?} v_gap={:?}",
55 elements.len(),
56 h_gap.map(|(y, g)| format!("y={:.1} gap={:.1}", y, g)),
57 v_gap.map(|(x, g)| format!("x={:.1} gap={:.1}", x, g))
58 );
59 for e in elements.iter() {
60 let b = e.bbox();
61 eprintln!(
62 " el: l={:.1} r={:.1} top={:.1} bot={:.1}",
63 b.left_x, b.right_x, b.top_y, b.bottom_y
64 );
65 }
66 }
67
68 let prefer_vertical = vertical_analysis.is_some();
74
75 if prefer_vertical {
76 if let Some((split_x, analysis)) = vertical_analysis.as_ref() {
78 if reorder_vertical_bands(elements, *split_x, analysis, region) {
79 return;
80 }
81 }
82 if let Some((split_x, _)) = v_gap {
83 let (left, right) = partition_by_x(elements, split_x);
84 if !left.is_empty() && !right.is_empty() {
85 xycut_recursive(left, region);
86 xycut_recursive(right, region);
87 return;
88 }
89 }
90 if let Some((split_y, _)) = h_gap {
92 let (top, bottom) = partition_by_y(elements, split_y);
93 if !top.is_empty() && !bottom.is_empty() {
94 xycut_recursive(top, region);
95 xycut_recursive(bottom, region);
96 return;
97 }
98 }
99 } else {
100 if let Some((split_y, _)) = h_gap {
102 let (top, bottom) = partition_by_y(elements, split_y);
103 if !top.is_empty() && !bottom.is_empty() {
104 xycut_recursive(top, region);
105 xycut_recursive(bottom, region);
106 return;
107 }
108 }
109 if let Some((split_x, _)) = v_gap {
111 let (left, right) = partition_by_x(elements, split_x);
112 if !left.is_empty() && !right.is_empty() {
113 xycut_recursive(left, region);
114 xycut_recursive(right, region);
115 return;
116 }
117 }
118 }
119
120 const BUCKET_PT: f64 = 4.0;
135 elements.sort_by_key(|e| {
136 let y_bucket = -((e.bbox().top_y / BUCKET_PT).round() as i64); let x_key = (e.bbox().left_x * 10.0).round() as i64; (y_bucket, x_key)
139 });
140}
141
142#[derive(Debug, Clone, Copy)]
143struct VerticalCutAnalysis {
144 shared_top: f64,
145 shared_bottom: f64,
146}
147
148fn analyze_vertical_cut(
149 elements: &[ContentElement],
150 split_x: f64,
151 horizontal_gap: f64,
152 vertical_gap: f64,
153) -> Option<VerticalCutAnalysis> {
154 if elements.len() < 4 {
155 return None;
156 }
157
158 let min_x = elements
159 .iter()
160 .map(|e| e.bbox().left_x)
161 .fold(f64::INFINITY, f64::min);
162 let max_x = elements
163 .iter()
164 .map(|e| e.bbox().right_x)
165 .fold(f64::NEG_INFINITY, f64::max);
166 let content_width = (max_x - min_x).max(1.0);
167 let spanning_width = content_width * 0.55;
168 let split_margin = 6.0;
169 let band_tolerance = 8.0;
170
171 let mut left_count = 0usize;
172 let mut right_count = 0usize;
173 let mut left_top = f64::NEG_INFINITY;
174 let mut left_bottom = f64::INFINITY;
175 let mut right_top = f64::NEG_INFINITY;
176 let mut right_bottom = f64::INFINITY;
177 let mut crossing_bands: Vec<(f64, f64)> = Vec::new();
178
179 for elem in elements {
180 let bbox = elem.bbox();
181 let width = bbox.right_x - bbox.left_x;
182 let crosses_split =
183 bbox.left_x < split_x - split_margin && bbox.right_x > split_x + split_margin;
184
185 if crosses_split && width >= spanning_width {
186 crossing_bands.push((bbox.top_y, bbox.bottom_y));
187 continue;
188 }
189
190 if bbox.center_x() < split_x {
191 left_count += 1;
192 left_top = left_top.max(bbox.top_y);
193 left_bottom = left_bottom.min(bbox.bottom_y);
194 } else {
195 right_count += 1;
196 right_top = right_top.max(bbox.top_y);
197 right_bottom = right_bottom.min(bbox.bottom_y);
198 }
199 }
200
201 if left_count < 2 || right_count < 2 {
202 return None;
203 }
204
205 let left_height = (left_top - left_bottom).max(0.0);
206 let right_height = (right_top - right_bottom).max(0.0);
207 if left_height <= 0.0 || right_height <= 0.0 {
208 return None;
209 }
210
211 let overlap = (left_top.min(right_top) - left_bottom.max(right_bottom)).max(0.0);
212 let overlap_ratio = overlap / left_height.min(right_height);
213 if overlap_ratio < 0.35 || vertical_gap < horizontal_gap * 0.5 {
214 return None;
215 }
216
217 let shared_top = left_top.min(right_top);
218 let shared_bottom = left_bottom.max(right_bottom);
219 if shared_top <= shared_bottom {
220 return None;
221 }
222
223 let ambiguous_crossing = crossing_bands
224 .iter()
225 .filter(|(top_y, bottom_y)| {
226 *bottom_y < shared_top - band_tolerance && *top_y > shared_bottom + band_tolerance
227 })
228 .count();
229 if ambiguous_crossing > 0 {
230 return None;
231 }
232
233 Some(VerticalCutAnalysis {
234 shared_top,
235 shared_bottom,
236 })
237}
238
239fn reorder_vertical_bands(
240 elements: &mut [ContentElement],
241 split_x: f64,
242 analysis: &VerticalCutAnalysis,
243 region: &BoundingBox,
244) -> bool {
245 const SPLIT_MARGIN: f64 = 6.0;
246 const BAND_TOLERANCE: f64 = 8.0;
247
248 let mut top_spanning = Vec::new();
249 let mut left = Vec::new();
250 let mut right = Vec::new();
251 let mut bottom_spanning = Vec::new();
252
253 for element in elements.iter() {
254 let bbox = element.bbox();
255 let crosses_split =
256 bbox.left_x < split_x - SPLIT_MARGIN && bbox.right_x > split_x + SPLIT_MARGIN;
257
258 if crosses_split {
259 if bbox.bottom_y >= analysis.shared_top - BAND_TOLERANCE {
260 top_spanning.push(element.clone());
261 continue;
262 }
263 if bbox.top_y <= analysis.shared_bottom + BAND_TOLERANCE {
264 bottom_spanning.push(element.clone());
265 continue;
266 }
267 return false;
268 }
269
270 if bbox.center_x() < split_x {
271 left.push(element.clone());
272 } else {
273 right.push(element.clone());
274 }
275 }
276
277 if left.is_empty() || right.is_empty() {
278 return false;
279 }
280
281 if top_spanning.len() > 1 {
282 xycut_recursive(top_spanning.as_mut_slice(), region);
283 }
284 xycut_recursive(left.as_mut_slice(), region);
285 xycut_recursive(right.as_mut_slice(), region);
286 if bottom_spanning.len() > 1 {
287 xycut_recursive(bottom_spanning.as_mut_slice(), region);
288 }
289
290 let mut ordered = Vec::with_capacity(elements.len());
291 ordered.extend(top_spanning);
292 ordered.extend(left);
293 ordered.extend(right);
294 ordered.extend(bottom_spanning);
295
296 if ordered.len() != elements.len() {
297 return false;
298 }
299
300 for (dst, src) in elements.iter_mut().zip(ordered.into_iter()) {
301 *dst = src;
302 }
303
304 true
305}
306
307const MIN_GAP_THRESHOLD: f64 = 5.0;
309
310fn find_horizontal_gap_size(elements: &[ContentElement]) -> Option<(f64, f64)> {
316 if elements.len() < 2 {
317 return None;
318 }
319
320 let mut sorted: Vec<(f64, f64)> = elements
322 .iter()
323 .map(|e| (e.bbox().bottom_y, e.bbox().top_y))
324 .collect();
325 sorted.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
326
327 let mut best_gap = 0.0f64;
328 let mut best_y = None;
329 let mut prev_bottom = sorted[0].0; for &(bottom, top) in &sorted[1..] {
332 if prev_bottom > top {
333 let gap = prev_bottom - top;
335 if gap > best_gap && gap > MIN_GAP_THRESHOLD {
336 best_gap = gap;
337 best_y = Some((prev_bottom + top) / 2.0);
338 }
339 }
340 prev_bottom = prev_bottom.min(bottom); }
342
343 best_y.map(|y| (y, best_gap))
344}
345
346fn find_vertical_gap_size(elements: &[ContentElement]) -> Option<(f64, f64)> {
358 if elements.len() < 2 {
359 return None;
360 }
361
362 let min_x = elements
364 .iter()
365 .map(|e| e.bbox().left_x)
366 .fold(f64::INFINITY, f64::min);
367 let max_x = elements
368 .iter()
369 .map(|e| e.bbox().right_x)
370 .fold(f64::NEG_INFINITY, f64::max);
371 let content_width = (max_x - min_x).max(1.0);
372
373 let span_threshold = content_width * 0.70;
375
376 const BBOX_SHRINK: f64 = 3.0;
382
383 let mut intervals: Vec<(f64, f64)> = elements
385 .iter()
386 .filter_map(|e| {
387 let b = e.bbox();
388 let w = b.right_x - b.left_x;
389 if w > span_threshold {
390 None
391 } else {
392 let eff_left = b.left_x + BBOX_SHRINK;
393 let eff_right = b.right_x - BBOX_SHRINK;
394 if eff_right > eff_left {
395 Some((eff_left, eff_right))
396 } else {
397 None }
399 }
400 })
401 .collect();
402
403 if intervals.len() < 2 {
406 intervals = elements
407 .iter()
408 .filter_map(|e| {
409 let b = e.bbox();
410 let eff_left = b.left_x + BBOX_SHRINK;
411 let eff_right = b.right_x - BBOX_SHRINK;
412 if eff_right > eff_left {
413 Some((eff_left, eff_right))
414 } else {
415 None
416 }
417 })
418 .collect();
419 }
420
421 intervals.sort_by(|a, b| {
422 a.0.partial_cmp(&b.0)
423 .unwrap_or(std::cmp::Ordering::Equal)
424 .then_with(|| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal))
425 });
426
427 let mut merged: Vec<(f64, f64)> = Vec::new();
432 let merge_tolerance = 0.5; for (left, right) in intervals {
434 if let Some(last) = merged.last_mut() {
435 if left <= last.1 + merge_tolerance {
436 last.1 = last.1.max(right);
438 continue;
439 }
440 }
441 merged.push((left, right));
442 }
443
444 let mut best_gap = 0.0f64;
446 let mut best_x = None;
447 for w in merged.windows(2) {
448 let gap = w[1].0 - w[0].1;
449 if gap > best_gap && gap > MIN_GAP_THRESHOLD {
450 best_gap = gap;
451 best_x = Some((w[0].1 + w[1].0) / 2.0);
452 }
453 }
454
455 if best_x.is_none() {
476 const MIN_CENTER_GAP: f64 = 20.0;
477 const MIN_COL_ELEMENT_WIDTH: f64 = 50.0;
480 let mut centers: Vec<f64> = elements
481 .iter()
482 .filter_map(|e| {
483 let b = e.bbox();
484 let w = b.right_x - b.left_x;
485 if w > span_threshold || w < MIN_COL_ELEMENT_WIDTH {
486 None } else {
488 Some((b.left_x + b.right_x) * 0.5)
489 }
490 })
491 .collect();
492 if centers.len() >= 2 {
493 centers.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
494 let mut candidate_x: Option<f64> = None;
495 let mut candidate_gap = 0.0f64;
496 for pair in centers.windows(2) {
497 let gap = pair[1] - pair[0];
498 if gap > candidate_gap && gap >= MIN_CENTER_GAP {
499 candidate_gap = gap;
500 candidate_x = Some((pair[0] + pair[1]) * 0.5);
501 }
502 }
503 if let Some(sx) = candidate_x {
506 let right_min_left_x = elements
507 .iter()
508 .filter(|e| {
509 let b = e.bbox();
510 (b.left_x + b.right_x) * 0.5 >= sx
511 })
512 .map(|e| e.bbox().left_x)
513 .fold(f64::INFINITY, f64::min);
514 if std::env::var("XYCUT_DEBUG").is_ok() {
515 eprintln!("[XYCUT] center_x candidate_x={:.1} candidate_gap={:.1} right_min_left_x={:.1} threshold={:.1} pass={}",
516 sx, candidate_gap, right_min_left_x, sx * 0.85, right_min_left_x >= sx * 0.85);
517 }
518 if right_min_left_x >= sx * 0.85 {
519 best_x = candidate_x;
520 best_gap = candidate_gap;
521 }
522 }
523 }
524 }
525
526 best_x.map(|x| (x, best_gap))
527}
528
529fn partition_by_y(
531 elements: &mut [ContentElement],
532 split_y: f64,
533) -> (&mut [ContentElement], &mut [ContentElement]) {
534 let pivot = itertools_partition(elements, |e| e.bbox().center_y() >= split_y);
535 elements.split_at_mut(pivot)
536}
537
538fn partition_by_x(
540 elements: &mut [ContentElement],
541 split_x: f64,
542) -> (&mut [ContentElement], &mut [ContentElement]) {
543 let pivot = itertools_partition(elements, |e| e.bbox().center_x() < split_x);
544 elements.split_at_mut(pivot)
545}
546
547fn itertools_partition<T, F>(data: &mut [T], pred: F) -> usize
550where
551 F: Fn(&T) -> bool,
552{
553 let mut pivot = 0;
554 for i in 0..data.len() {
555 if pred(&data[i]) {
556 data.swap(i, pivot);
557 pivot += 1;
558 }
559 }
560 pivot
561}
562
563#[cfg(test)]
564mod tests {
565 use super::*;
566 use crate::models::chunks::ImageChunk;
567
568 fn make_element(left: f64, bottom: f64, right: f64, top: f64) -> ContentElement {
569 ContentElement::Image(ImageChunk {
570 bbox: BoundingBox::new(Some(1), left, bottom, right, top),
571 index: None,
572 level: None,
573 })
574 }
575
576 #[test]
577 fn test_xycut_sort_single() {
578 let mut elements = vec![make_element(0.0, 0.0, 100.0, 50.0)];
579 xycut_sort(
580 &mut elements,
581 &BoundingBox::new(Some(1), 0.0, 0.0, 595.0, 842.0),
582 );
583 assert_eq!(elements.len(), 1);
584 }
585
586 #[test]
587 fn test_xycut_sort_two_rows() {
588 let mut elements = vec![
589 make_element(50.0, 100.0, 200.0, 150.0), make_element(50.0, 500.0, 200.0, 550.0), ];
592 let page = BoundingBox::new(Some(1), 0.0, 0.0, 595.0, 842.0);
593 xycut_sort(&mut elements, &page);
594 assert!(elements[0].bbox().top_y > elements[1].bbox().top_y);
596 }
597
598 #[test]
599 fn test_xycut_sort_two_columns_left_first() {
600 let mut elements = vec![
604 make_element(300.0, 500.0, 500.0, 550.0), make_element(50.0, 500.0, 250.0, 550.0), make_element(300.0, 400.0, 500.0, 450.0), make_element(50.0, 400.0, 250.0, 450.0), make_element(300.0, 300.0, 500.0, 350.0), make_element(50.0, 300.0, 250.0, 350.0), ];
611 let page = BoundingBox::new(Some(1), 0.0, 0.0, 595.0, 842.0);
612 xycut_sort(&mut elements, &page);
613
614 assert!(
617 elements[0].bbox().left_x < 260.0,
618 "First element should be left column"
619 );
620 assert!(
621 elements[1].bbox().left_x < 260.0,
622 "Second element should be left column"
623 );
624 assert!(
625 elements[2].bbox().left_x < 260.0,
626 "Third element should be left column"
627 );
628 assert!(
629 elements[3].bbox().left_x > 260.0,
630 "Fourth element should be right column"
631 );
632 assert!(
633 elements[4].bbox().left_x > 260.0,
634 "Fifth element should be right column"
635 );
636 assert!(
637 elements[5].bbox().left_x > 260.0,
638 "Sixth element should be right column"
639 );
640
641 assert!(elements[0].bbox().top_y > elements[1].bbox().top_y);
643 assert!(elements[1].bbox().top_y > elements[2].bbox().top_y);
644 assert!(elements[3].bbox().top_y > elements[4].bbox().top_y);
646 assert!(elements[4].bbox().top_y > elements[5].bbox().top_y);
647 }
648
649 #[test]
650 fn test_xycut_sort_header_then_two_columns() {
651 let mut elements = vec![
653 make_element(50.0, 700.0, 500.0, 750.0), make_element(300.0, 500.0, 500.0, 550.0), make_element(50.0, 500.0, 250.0, 550.0), make_element(300.0, 400.0, 500.0, 450.0), make_element(50.0, 400.0, 250.0, 450.0), ];
659 let page = BoundingBox::new(Some(1), 0.0, 0.0, 595.0, 842.0);
660 xycut_sort(&mut elements, &page);
661
662 assert!(
664 elements[0].bbox().top_y >= 750.0,
665 "Header should come first"
666 );
667 assert!(
669 elements[1].bbox().left_x < 260.0,
670 "Left col should come after header"
671 );
672 assert!(elements[2].bbox().left_x < 260.0);
673 assert!(
674 elements[3].bbox().left_x > 260.0,
675 "Right col should come last"
676 );
677 assert!(elements[4].bbox().left_x > 260.0);
678 }
679
680 #[test]
681 fn test_projection_gap_with_overlapping_elements() {
682 let mut elements = vec![
687 make_element(50.0, 500.0, 250.0, 550.0), make_element(60.0, 400.0, 240.0, 450.0), make_element(310.0, 500.0, 500.0, 550.0), make_element(310.0, 400.0, 500.0, 450.0), ];
692 let page = BoundingBox::new(Some(1), 0.0, 0.0, 595.0, 842.0);
693 xycut_sort(&mut elements, &page);
694
695 assert!(elements[0].bbox().left_x < 260.0);
697 assert!(elements[1].bbox().left_x < 260.0);
698 assert!(elements[2].bbox().left_x > 260.0);
699 assert!(elements[3].bbox().left_x > 260.0);
700 }
701
702 #[test]
703 fn test_xycut_prefers_columns_for_staggered_two_column_layout() {
704 let mut elements = vec![
705 make_element(50.0, 680.0, 250.0, 740.0), make_element(50.0, 420.0, 250.0, 660.0), make_element(320.0, 600.0, 520.0, 760.0), make_element(320.0, 360.0, 520.0, 580.0), ];
710 let page = BoundingBox::new(Some(1), 0.0, 0.0, 595.0, 842.0);
711 xycut_sort(&mut elements, &page);
712
713 assert!(elements[0].bbox().left_x < 260.0);
714 assert!(elements[1].bbox().left_x < 260.0);
715 assert!(elements[2].bbox().left_x > 260.0);
716 assert!(elements[3].bbox().left_x > 260.0);
717 }
718
719 #[test]
720 fn test_xycut_keeps_spanning_header_and_footer_outside_columns() {
721 let mut elements = vec![
722 make_element(40.0, 760.0, 540.0, 810.0), make_element(50.0, 640.0, 250.0, 700.0), make_element(50.0, 520.0, 250.0, 620.0), make_element(320.0, 640.0, 520.0, 700.0), make_element(320.0, 520.0, 520.0, 620.0), make_element(40.0, 430.0, 540.0, 480.0), ];
729 let page = BoundingBox::new(Some(1), 0.0, 0.0, 595.0, 842.0);
730 xycut_sort(&mut elements, &page);
731
732 assert!(
733 elements[0].bbox().top_y >= 800.0,
734 "header should stay first"
735 );
736 assert!(elements[1].bbox().left_x < 260.0);
737 assert!(elements[2].bbox().left_x < 260.0);
738 assert!(elements[3].bbox().left_x > 260.0);
739 assert!(elements[4].bbox().left_x > 260.0);
740 assert!(elements[5].bbox().top_y <= 480.0, "footer should stay last");
741 }
742
743 #[test]
744 fn test_xycut_rejects_vertical_cut_when_spanning_band_sits_between_columns() {
745 let mut elements = vec![
746 make_element(50.0, 700.0, 250.0, 760.0), make_element(320.0, 700.0, 520.0, 760.0), make_element(40.0, 610.0, 540.0, 680.0), make_element(50.0, 500.0, 250.0, 580.0), make_element(320.0, 500.0, 520.0, 580.0), ];
752 let page = BoundingBox::new(Some(1), 0.0, 0.0, 595.0, 842.0);
753 xycut_sort(&mut elements, &page);
754
755 assert!(elements[0].bbox().top_y >= 760.0);
756 assert!(elements[1].bbox().top_y >= 760.0);
757 assert!(elements[2].bbox().top_y >= 680.0 && elements[2].bbox().bottom_y <= 610.0);
758 assert!(elements[3].bbox().top_y <= 580.0);
759 assert!(elements[4].bbox().top_y <= 580.0);
760 }
761}