1use crate::models::bbox::BoundingBox;
18use crate::models::content::ContentElement;
19
20pub fn xycut_sort(elements: &mut [ContentElement], page_bbox: &BoundingBox) {
28 if elements.len() <= 1 {
29 return;
30 }
31 xycut_recursive(elements, page_bbox);
32}
33
34fn xycut_recursive(elements: &mut [ContentElement], _region: &BoundingBox) {
35 if elements.len() <= 1 {
36 return;
37 }
38
39 let h_gap = find_horizontal_gap_size(elements);
41 let v_gap = find_vertical_gap_size(elements);
42
43 if std::env::var("XYCUT_DEBUG").is_ok() {
44 eprintln!(
45 "[XYCUT] n={} h_gap={:?} v_gap={:?}",
46 elements.len(),
47 h_gap.map(|(y, g)| format!("y={:.1} gap={:.1}", y, g)),
48 v_gap.map(|(x, g)| format!("x={:.1} gap={:.1}", x, g))
49 );
50 for e in elements.iter() {
51 let b = e.bbox();
52 eprintln!(
53 " el: l={:.1} r={:.1} top={:.1} bot={:.1}",
54 b.left_x, b.right_x, b.top_y, b.bottom_y
55 );
56 }
57 }
58
59 let prefer_vertical = match (h_gap, v_gap) {
65 (None, Some(_)) => true,
66 (Some((_, h_size)), Some((split_x, v_size))) => {
67 should_prefer_vertical_cut(elements, split_x, h_size, v_size)
68 }
69 _ => false,
70 };
71
72 if prefer_vertical {
73 if let Some((split_x, _)) = v_gap {
75 let (left, right) = partition_by_x(elements, split_x);
76 if !left.is_empty() && !right.is_empty() {
77 xycut_recursive(left, _region);
78 xycut_recursive(right, _region);
79 return;
80 }
81 }
82 if let Some((split_y, _)) = h_gap {
84 let (top, bottom) = partition_by_y(elements, split_y);
85 if !top.is_empty() && !bottom.is_empty() {
86 xycut_recursive(top, _region);
87 xycut_recursive(bottom, _region);
88 return;
89 }
90 }
91 } else {
92 if let Some((split_y, _)) = h_gap {
94 let (top, bottom) = partition_by_y(elements, split_y);
95 if !top.is_empty() && !bottom.is_empty() {
96 xycut_recursive(top, _region);
97 xycut_recursive(bottom, _region);
98 return;
99 }
100 }
101 if let Some((split_x, _)) = v_gap {
103 let (left, right) = partition_by_x(elements, split_x);
104 if !left.is_empty() && !right.is_empty() {
105 xycut_recursive(left, _region);
106 xycut_recursive(right, _region);
107 return;
108 }
109 }
110 }
111
112 const BUCKET_PT: f64 = 4.0;
127 elements.sort_by_key(|e| {
128 let y_bucket = -((e.bbox().top_y / BUCKET_PT).round() as i64); let x_key = (e.bbox().left_x * 10.0).round() as i64; (y_bucket, x_key)
131 });
132}
133
134fn should_prefer_vertical_cut(
135 elements: &[ContentElement],
136 split_x: f64,
137 horizontal_gap: f64,
138 vertical_gap: f64,
139) -> bool {
140 if elements.len() < 4 {
141 return false;
142 }
143
144 let min_x = elements
145 .iter()
146 .map(|e| e.bbox().left_x)
147 .fold(f64::INFINITY, f64::min);
148 let max_x = elements
149 .iter()
150 .map(|e| e.bbox().right_x)
151 .fold(f64::NEG_INFINITY, f64::max);
152 let content_width = (max_x - min_x).max(1.0);
153 let spanning_width = content_width * 0.55;
154 let split_margin = 6.0;
155
156 let mut left_count = 0usize;
157 let mut right_count = 0usize;
158 let mut left_top = f64::NEG_INFINITY;
159 let mut left_bottom = f64::INFINITY;
160 let mut right_top = f64::NEG_INFINITY;
161 let mut right_bottom = f64::INFINITY;
162 let mut wide_crossing = 0usize;
163
164 for elem in elements {
165 let bbox = elem.bbox();
166 let width = bbox.right_x - bbox.left_x;
167 let crosses_split =
168 bbox.left_x < split_x - split_margin && bbox.right_x > split_x + split_margin;
169
170 if crosses_split && width >= spanning_width {
171 wide_crossing += 1;
172 continue;
173 }
174
175 if bbox.center_x() < split_x {
176 left_count += 1;
177 left_top = left_top.max(bbox.top_y);
178 left_bottom = left_bottom.min(bbox.bottom_y);
179 } else {
180 right_count += 1;
181 right_top = right_top.max(bbox.top_y);
182 right_bottom = right_bottom.min(bbox.bottom_y);
183 }
184 }
185
186 if left_count < 2 || right_count < 2 {
187 return false;
188 }
189 if wide_crossing > 0 {
190 return false;
191 }
192
193 let left_height = (left_top - left_bottom).max(0.0);
194 let right_height = (right_top - right_bottom).max(0.0);
195 if left_height <= 0.0 || right_height <= 0.0 {
196 return false;
197 }
198
199 let overlap = (left_top.min(right_top) - left_bottom.max(right_bottom)).max(0.0);
200 let overlap_ratio = overlap / left_height.min(right_height);
201
202 overlap_ratio >= 0.35 && vertical_gap >= horizontal_gap * 0.5
203}
204
205const MIN_GAP_THRESHOLD: f64 = 5.0;
207
208fn find_horizontal_gap_size(elements: &[ContentElement]) -> Option<(f64, f64)> {
214 if elements.len() < 2 {
215 return None;
216 }
217
218 let mut sorted: Vec<(f64, f64)> = elements
220 .iter()
221 .map(|e| (e.bbox().bottom_y, e.bbox().top_y))
222 .collect();
223 sorted.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
224
225 let mut best_gap = 0.0f64;
226 let mut best_y = None;
227 let mut prev_bottom = sorted[0].0; for &(bottom, top) in &sorted[1..] {
230 if prev_bottom > top {
231 let gap = prev_bottom - top;
233 if gap > best_gap && gap > MIN_GAP_THRESHOLD {
234 best_gap = gap;
235 best_y = Some((prev_bottom + top) / 2.0);
236 }
237 }
238 prev_bottom = prev_bottom.min(bottom); }
240
241 best_y.map(|y| (y, best_gap))
242}
243
244fn find_vertical_gap_size(elements: &[ContentElement]) -> Option<(f64, f64)> {
256 if elements.len() < 2 {
257 return None;
258 }
259
260 let min_x = elements
262 .iter()
263 .map(|e| e.bbox().left_x)
264 .fold(f64::INFINITY, f64::min);
265 let max_x = elements
266 .iter()
267 .map(|e| e.bbox().right_x)
268 .fold(f64::NEG_INFINITY, f64::max);
269 let content_width = (max_x - min_x).max(1.0);
270
271 let span_threshold = content_width * 0.70;
273
274 const BBOX_SHRINK: f64 = 3.0;
280
281 let mut intervals: Vec<(f64, f64)> = elements
283 .iter()
284 .filter_map(|e| {
285 let b = e.bbox();
286 let w = b.right_x - b.left_x;
287 if w > span_threshold {
288 None
289 } else {
290 let eff_left = b.left_x + BBOX_SHRINK;
291 let eff_right = b.right_x - BBOX_SHRINK;
292 if eff_right > eff_left {
293 Some((eff_left, eff_right))
294 } else {
295 None }
297 }
298 })
299 .collect();
300
301 if intervals.len() < 2 {
304 intervals = elements
305 .iter()
306 .filter_map(|e| {
307 let b = e.bbox();
308 let eff_left = b.left_x + BBOX_SHRINK;
309 let eff_right = b.right_x - BBOX_SHRINK;
310 if eff_right > eff_left {
311 Some((eff_left, eff_right))
312 } else {
313 None
314 }
315 })
316 .collect();
317 }
318
319 intervals.sort_by(|a, b| {
320 a.0.partial_cmp(&b.0)
321 .unwrap_or(std::cmp::Ordering::Equal)
322 .then_with(|| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal))
323 });
324
325 let mut merged: Vec<(f64, f64)> = Vec::new();
330 let merge_tolerance = 0.5; for (left, right) in intervals {
332 if let Some(last) = merged.last_mut() {
333 if left <= last.1 + merge_tolerance {
334 last.1 = last.1.max(right);
336 continue;
337 }
338 }
339 merged.push((left, right));
340 }
341
342 let mut best_gap = 0.0f64;
344 let mut best_x = None;
345 for w in merged.windows(2) {
346 let gap = w[1].0 - w[0].1;
347 if gap > best_gap && gap > MIN_GAP_THRESHOLD {
348 best_gap = gap;
349 best_x = Some((w[0].1 + w[1].0) / 2.0);
350 }
351 }
352
353 if best_x.is_none() {
374 const MIN_CENTER_GAP: f64 = 20.0;
375 const MIN_COL_ELEMENT_WIDTH: f64 = 50.0;
378 let mut centers: Vec<f64> = elements
379 .iter()
380 .filter_map(|e| {
381 let b = e.bbox();
382 let w = b.right_x - b.left_x;
383 if w > span_threshold || w < MIN_COL_ELEMENT_WIDTH {
384 None } else {
386 Some((b.left_x + b.right_x) * 0.5)
387 }
388 })
389 .collect();
390 if centers.len() >= 2 {
391 centers.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
392 let mut candidate_x: Option<f64> = None;
393 let mut candidate_gap = 0.0f64;
394 for pair in centers.windows(2) {
395 let gap = pair[1] - pair[0];
396 if gap > candidate_gap && gap >= MIN_CENTER_GAP {
397 candidate_gap = gap;
398 candidate_x = Some((pair[0] + pair[1]) * 0.5);
399 }
400 }
401 if let Some(sx) = candidate_x {
404 let right_min_left_x = elements
405 .iter()
406 .filter(|e| {
407 let b = e.bbox();
408 (b.left_x + b.right_x) * 0.5 >= sx
409 })
410 .map(|e| e.bbox().left_x)
411 .fold(f64::INFINITY, f64::min);
412 if std::env::var("XYCUT_DEBUG").is_ok() {
413 eprintln!("[XYCUT] center_x candidate_x={:.1} candidate_gap={:.1} right_min_left_x={:.1} threshold={:.1} pass={}",
414 sx, candidate_gap, right_min_left_x, sx * 0.85, right_min_left_x >= sx * 0.85);
415 }
416 if right_min_left_x >= sx * 0.85 {
417 best_x = candidate_x;
418 best_gap = candidate_gap;
419 }
420 }
421 }
422 }
423
424 best_x.map(|x| (x, best_gap))
425}
426
427fn partition_by_y(
429 elements: &mut [ContentElement],
430 split_y: f64,
431) -> (&mut [ContentElement], &mut [ContentElement]) {
432 let pivot = itertools_partition(elements, |e| e.bbox().center_y() >= split_y);
433 elements.split_at_mut(pivot)
434}
435
436fn partition_by_x(
438 elements: &mut [ContentElement],
439 split_x: f64,
440) -> (&mut [ContentElement], &mut [ContentElement]) {
441 let pivot = itertools_partition(elements, |e| e.bbox().center_x() < split_x);
442 elements.split_at_mut(pivot)
443}
444
445fn itertools_partition<T, F>(data: &mut [T], pred: F) -> usize
448where
449 F: Fn(&T) -> bool,
450{
451 let mut pivot = 0;
452 for i in 0..data.len() {
453 if pred(&data[i]) {
454 data.swap(i, pivot);
455 pivot += 1;
456 }
457 }
458 pivot
459}
460
461#[cfg(test)]
462mod tests {
463 use super::*;
464 use crate::models::chunks::ImageChunk;
465
466 fn make_element(left: f64, bottom: f64, right: f64, top: f64) -> ContentElement {
467 ContentElement::Image(ImageChunk {
468 bbox: BoundingBox::new(Some(1), left, bottom, right, top),
469 index: None,
470 level: None,
471 })
472 }
473
474 #[test]
475 fn test_xycut_sort_single() {
476 let mut elements = vec![make_element(0.0, 0.0, 100.0, 50.0)];
477 xycut_sort(
478 &mut elements,
479 &BoundingBox::new(Some(1), 0.0, 0.0, 595.0, 842.0),
480 );
481 assert_eq!(elements.len(), 1);
482 }
483
484 #[test]
485 fn test_xycut_sort_two_rows() {
486 let mut elements = vec![
487 make_element(50.0, 100.0, 200.0, 150.0), make_element(50.0, 500.0, 200.0, 550.0), ];
490 let page = BoundingBox::new(Some(1), 0.0, 0.0, 595.0, 842.0);
491 xycut_sort(&mut elements, &page);
492 assert!(elements[0].bbox().top_y > elements[1].bbox().top_y);
494 }
495
496 #[test]
497 fn test_xycut_sort_two_columns_left_first() {
498 let mut elements = vec![
502 make_element(300.0, 500.0, 500.0, 550.0), make_element(50.0, 500.0, 250.0, 550.0), make_element(300.0, 400.0, 500.0, 450.0), make_element(50.0, 400.0, 250.0, 450.0), make_element(300.0, 300.0, 500.0, 350.0), make_element(50.0, 300.0, 250.0, 350.0), ];
509 let page = BoundingBox::new(Some(1), 0.0, 0.0, 595.0, 842.0);
510 xycut_sort(&mut elements, &page);
511
512 assert!(
515 elements[0].bbox().left_x < 260.0,
516 "First element should be left column"
517 );
518 assert!(
519 elements[1].bbox().left_x < 260.0,
520 "Second element should be left column"
521 );
522 assert!(
523 elements[2].bbox().left_x < 260.0,
524 "Third element should be left column"
525 );
526 assert!(
527 elements[3].bbox().left_x > 260.0,
528 "Fourth element should be right column"
529 );
530 assert!(
531 elements[4].bbox().left_x > 260.0,
532 "Fifth element should be right column"
533 );
534 assert!(
535 elements[5].bbox().left_x > 260.0,
536 "Sixth element should be right column"
537 );
538
539 assert!(elements[0].bbox().top_y > elements[1].bbox().top_y);
541 assert!(elements[1].bbox().top_y > elements[2].bbox().top_y);
542 assert!(elements[3].bbox().top_y > elements[4].bbox().top_y);
544 assert!(elements[4].bbox().top_y > elements[5].bbox().top_y);
545 }
546
547 #[test]
548 fn test_xycut_sort_header_then_two_columns() {
549 let mut elements = vec![
551 make_element(50.0, 700.0, 500.0, 750.0), make_element(300.0, 500.0, 500.0, 550.0), make_element(50.0, 500.0, 250.0, 550.0), make_element(300.0, 400.0, 500.0, 450.0), make_element(50.0, 400.0, 250.0, 450.0), ];
557 let page = BoundingBox::new(Some(1), 0.0, 0.0, 595.0, 842.0);
558 xycut_sort(&mut elements, &page);
559
560 assert!(
562 elements[0].bbox().top_y >= 750.0,
563 "Header should come first"
564 );
565 assert!(
567 elements[1].bbox().left_x < 260.0,
568 "Left col should come after header"
569 );
570 assert!(elements[2].bbox().left_x < 260.0);
571 assert!(
572 elements[3].bbox().left_x > 260.0,
573 "Right col should come last"
574 );
575 assert!(elements[4].bbox().left_x > 260.0);
576 }
577
578 #[test]
579 fn test_projection_gap_with_overlapping_elements() {
580 let mut elements = vec![
585 make_element(50.0, 500.0, 250.0, 550.0), make_element(60.0, 400.0, 240.0, 450.0), make_element(310.0, 500.0, 500.0, 550.0), make_element(310.0, 400.0, 500.0, 450.0), ];
590 let page = BoundingBox::new(Some(1), 0.0, 0.0, 595.0, 842.0);
591 xycut_sort(&mut elements, &page);
592
593 assert!(elements[0].bbox().left_x < 260.0);
595 assert!(elements[1].bbox().left_x < 260.0);
596 assert!(elements[2].bbox().left_x > 260.0);
597 assert!(elements[3].bbox().left_x > 260.0);
598 }
599
600 #[test]
601 fn test_xycut_prefers_columns_for_staggered_two_column_layout() {
602 let mut elements = vec![
603 make_element(50.0, 680.0, 250.0, 740.0), make_element(50.0, 420.0, 250.0, 660.0), make_element(320.0, 600.0, 520.0, 760.0), make_element(320.0, 360.0, 520.0, 580.0), ];
608 let page = BoundingBox::new(Some(1), 0.0, 0.0, 595.0, 842.0);
609 xycut_sort(&mut elements, &page);
610
611 assert!(elements[0].bbox().left_x < 260.0);
612 assert!(elements[1].bbox().left_x < 260.0);
613 assert!(elements[2].bbox().left_x > 260.0);
614 assert!(elements[3].bbox().left_x > 260.0);
615 }
616}