Skip to main content

edgeparse_core/utils/
xycut.rs

1//! XY-Cut++ reading order algorithm.
2//!
3//! ```text
4//!   ┌──────────────────────────┐
5//!   │  Col A      │   Col B    │
6//!   │  ┌────────┐ │ ┌────────┐ │
7//!   │  │ Para 1 │ │ │ Para 3 │ │    1. Find largest vertical gap
8//!   │  └────────┘ │ └────────┘ │       → split left / right
9//!   │  ┌────────┐ │ ┌────────┐ │    2. Recurse on each half
10//!   │  │ Para 2 │ │ │ Para 4 │ │    3. Within half: find horizontal gap
11//!   │  └────────┘ │ └────────┘ │       → split top / bottom
12//!   └──────────────────────────┘    4. Order: top→bottom, left→right
13//!
14//!   Result: Para 1, Para 2, Para 3, Para 4
15//! ```
16
17use crate::models::bbox::BoundingBox;
18use crate::models::content::ContentElement;
19
20/// Sort content elements using the XY-Cut++ algorithm.
21///
22/// The algorithm recursively partitions the page:
23/// 1. Find largest horizontal gap → split into top/bottom
24/// 2. Find largest vertical gap → split into left/right
25/// 3. Recurse on each partition
26/// 4. Order: top-to-bottom, left-to-right within each partition
27pub fn xycut_sort(elements: &mut [ContentElement], page_bbox: &BoundingBox) {
28    if elements.len() <= 1 {
29        return;
30    }
31    xycut_recursive(elements, page_bbox);
32}
33
34fn xycut_recursive(elements: &mut [ContentElement], region: &BoundingBox) {
35    if elements.len() <= 1 {
36        return;
37    }
38
39    // Find both possible cuts
40    let h_gap = find_horizontal_gap_size(elements);
41    let v_gap = find_vertical_gap_size(elements);
42    let vertical_analysis = v_gap.and_then(|(split_x, v_size)| {
43        analyze_vertical_cut(
44            elements,
45            split_x,
46            h_gap.map(|(_, gap)| gap).unwrap_or(0.0),
47            v_size,
48        )
49        .map(|analysis| (split_x, analysis))
50    });
51
52    if std::env::var("XYCUT_DEBUG").is_ok() {
53        eprintln!(
54            "[XYCUT] n={} h_gap={:?} v_gap={:?}",
55            elements.len(),
56            h_gap.map(|(y, g)| format!("y={:.1} gap={:.1}", y, g)),
57            v_gap.map(|(x, g)| format!("x={:.1} gap={:.1}", x, g))
58        );
59        for e in elements.iter() {
60            let b = e.bbox();
61            eprintln!(
62                "  el: l={:.1} r={:.1} top={:.1} bot={:.1}",
63                b.left_x, b.right_x, b.top_y, b.bottom_y
64            );
65        }
66    }
67
68    // Prefer a vertical split only when the page really behaves like concurrent
69    // columns.  This avoids the old "always horizontal first" bias that
70    // interleaved figure-heavy two-column pages row-by-row, while still keeping
71    // full-width spanning elements (titles, section headings, footers) outside
72    // the column subtrees.
73    let prefer_vertical = vertical_analysis.is_some();
74
75    if prefer_vertical {
76        // Try vertical cut first (column split)
77        if let Some((split_x, analysis)) = vertical_analysis.as_ref() {
78            if reorder_vertical_bands(elements, *split_x, analysis, region) {
79                return;
80            }
81        }
82        if let Some((split_x, _)) = v_gap {
83            let (left, right) = partition_by_x(elements, split_x);
84            if !left.is_empty() && !right.is_empty() {
85                xycut_recursive(left, region);
86                xycut_recursive(right, region);
87                return;
88            }
89        }
90        // Fallback to horizontal cut
91        if let Some((split_y, _)) = h_gap {
92            let (top, bottom) = partition_by_y(elements, split_y);
93            if !top.is_empty() && !bottom.is_empty() {
94                xycut_recursive(top, region);
95                xycut_recursive(bottom, region);
96                return;
97            }
98        }
99    } else {
100        // Try horizontal cut first (row split)
101        if let Some((split_y, _)) = h_gap {
102            let (top, bottom) = partition_by_y(elements, split_y);
103            if !top.is_empty() && !bottom.is_empty() {
104                xycut_recursive(top, region);
105                xycut_recursive(bottom, region);
106                return;
107            }
108        }
109        // Fallback to vertical cut
110        if let Some((split_x, _)) = v_gap {
111            let (left, right) = partition_by_x(elements, split_x);
112            if !left.is_empty() && !right.is_empty() {
113                xycut_recursive(left, region);
114                xycut_recursive(right, region);
115                return;
116            }
117        }
118    }
119
120    // No split found — sort by quantized Y (top-to-bottom) then by left_x (column order).
121    //
122    // Rationale: in 2-column documents where the column gap is 0 pt (columns are
123    // perfectly adjacent with no whitespace), the vertical-gap detector cannot find
124    // a split, and the fallback sort takes over.  The left and right column elements
125    // at the same visual row often differ by only a fraction of a point in top_y due
126    // to PDF coordinate rounding.  If the right column element's top_y is even 0.5 pt
127    // higher, the old comparison by raw top_y would place the right column first,
128    // reversing the reading order.
129    //
130    // Quantizing top_y to 4 pt buckets groups elements that are visually on the same
131    // text row (sub-line-height Y difference) into the same bucket, then sorts within
132    // the bucket by left_x (left-column-first).  Typical body text lines are 10–14 pt
133    // apart, so a 4 pt bucket never merges elements from adjacent text lines.
134    const BUCKET_PT: f64 = 4.0;
135    elements.sort_by_key(|e| {
136        let y_bucket = -((e.bbox().top_y / BUCKET_PT).round() as i64); // descending
137        let x_key = (e.bbox().left_x * 10.0).round() as i64; // ascending
138        (y_bucket, x_key)
139    });
140}
141
142#[derive(Debug, Clone, Copy)]
143struct VerticalCutAnalysis {
144    shared_top: f64,
145    shared_bottom: f64,
146}
147
148fn analyze_vertical_cut(
149    elements: &[ContentElement],
150    split_x: f64,
151    horizontal_gap: f64,
152    vertical_gap: f64,
153) -> Option<VerticalCutAnalysis> {
154    if elements.len() < 4 {
155        return None;
156    }
157
158    let min_x = elements
159        .iter()
160        .map(|e| e.bbox().left_x)
161        .fold(f64::INFINITY, f64::min);
162    let max_x = elements
163        .iter()
164        .map(|e| e.bbox().right_x)
165        .fold(f64::NEG_INFINITY, f64::max);
166    let content_width = (max_x - min_x).max(1.0);
167    let spanning_width = content_width * 0.55;
168    let split_margin = 6.0;
169    let band_tolerance = 8.0;
170
171    let mut left_count = 0usize;
172    let mut right_count = 0usize;
173    let mut left_top = f64::NEG_INFINITY;
174    let mut left_bottom = f64::INFINITY;
175    let mut right_top = f64::NEG_INFINITY;
176    let mut right_bottom = f64::INFINITY;
177    let mut crossing_bands: Vec<(f64, f64)> = Vec::new();
178
179    for elem in elements {
180        let bbox = elem.bbox();
181        let width = bbox.right_x - bbox.left_x;
182        let crosses_split =
183            bbox.left_x < split_x - split_margin && bbox.right_x > split_x + split_margin;
184
185        if crosses_split && width >= spanning_width {
186            crossing_bands.push((bbox.top_y, bbox.bottom_y));
187            continue;
188        }
189
190        if bbox.center_x() < split_x {
191            left_count += 1;
192            left_top = left_top.max(bbox.top_y);
193            left_bottom = left_bottom.min(bbox.bottom_y);
194        } else {
195            right_count += 1;
196            right_top = right_top.max(bbox.top_y);
197            right_bottom = right_bottom.min(bbox.bottom_y);
198        }
199    }
200
201    if left_count < 2 || right_count < 2 {
202        return None;
203    }
204
205    let left_height = (left_top - left_bottom).max(0.0);
206    let right_height = (right_top - right_bottom).max(0.0);
207    if left_height <= 0.0 || right_height <= 0.0 {
208        return None;
209    }
210
211    let overlap = (left_top.min(right_top) - left_bottom.max(right_bottom)).max(0.0);
212    let overlap_ratio = overlap / left_height.min(right_height);
213    if overlap_ratio < 0.35 || vertical_gap < horizontal_gap * 0.5 {
214        return None;
215    }
216
217    let shared_top = left_top.min(right_top);
218    let shared_bottom = left_bottom.max(right_bottom);
219    if shared_top <= shared_bottom {
220        return None;
221    }
222
223    let ambiguous_crossing = crossing_bands
224        .iter()
225        .filter(|(top_y, bottom_y)| {
226            *bottom_y < shared_top - band_tolerance && *top_y > shared_bottom + band_tolerance
227        })
228        .count();
229    if ambiguous_crossing > 0 {
230        return None;
231    }
232
233    Some(VerticalCutAnalysis {
234        shared_top,
235        shared_bottom,
236    })
237}
238
239fn reorder_vertical_bands(
240    elements: &mut [ContentElement],
241    split_x: f64,
242    analysis: &VerticalCutAnalysis,
243    region: &BoundingBox,
244) -> bool {
245    const SPLIT_MARGIN: f64 = 6.0;
246    const BAND_TOLERANCE: f64 = 8.0;
247
248    let mut top_spanning = Vec::new();
249    let mut left = Vec::new();
250    let mut right = Vec::new();
251    let mut bottom_spanning = Vec::new();
252
253    for element in elements.iter() {
254        let bbox = element.bbox();
255        let crosses_split =
256            bbox.left_x < split_x - SPLIT_MARGIN && bbox.right_x > split_x + SPLIT_MARGIN;
257
258        if crosses_split {
259            if bbox.bottom_y >= analysis.shared_top - BAND_TOLERANCE {
260                top_spanning.push(element.clone());
261                continue;
262            }
263            if bbox.top_y <= analysis.shared_bottom + BAND_TOLERANCE {
264                bottom_spanning.push(element.clone());
265                continue;
266            }
267            return false;
268        }
269
270        if bbox.center_x() < split_x {
271            left.push(element.clone());
272        } else {
273            right.push(element.clone());
274        }
275    }
276
277    if left.is_empty() || right.is_empty() {
278        return false;
279    }
280
281    if top_spanning.len() > 1 {
282        xycut_recursive(top_spanning.as_mut_slice(), region);
283    }
284    xycut_recursive(left.as_mut_slice(), region);
285    xycut_recursive(right.as_mut_slice(), region);
286    if bottom_spanning.len() > 1 {
287        xycut_recursive(bottom_spanning.as_mut_slice(), region);
288    }
289
290    let mut ordered = Vec::with_capacity(elements.len());
291    ordered.extend(top_spanning);
292    ordered.extend(left);
293    ordered.extend(right);
294    ordered.extend(bottom_spanning);
295
296    if ordered.len() != elements.len() {
297        return false;
298    }
299
300    for (dst, src) in elements.iter_mut().zip(ordered.into_iter()) {
301        *dst = src;
302    }
303
304    true
305}
306
307/// Minimum gap size (points) required to consider a cut.
308const MIN_GAP_THRESHOLD: f64 = 5.0;
309
310/// Find the largest horizontal gap (between rows of elements)
311/// using projection-profile gap detection.
312/// Sorts elements by top_y descending (top of page first) and tracks
313/// the accumulated minimum bottom edge to find true gaps.
314/// Returns (split_y, gap_size).
315fn find_horizontal_gap_size(elements: &[ContentElement]) -> Option<(f64, f64)> {
316    if elements.len() < 2 {
317        return None;
318    }
319
320    // Sort by top_y descending (scan from top of page downward)
321    let mut sorted: Vec<(f64, f64)> = elements
322        .iter()
323        .map(|e| (e.bbox().bottom_y, e.bbox().top_y))
324        .collect();
325    sorted.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
326
327    let mut best_gap = 0.0f64;
328    let mut best_y = None;
329    let mut prev_bottom = sorted[0].0; // track lowest bottom seen so far
330
331    for &(bottom, top) in &sorted[1..] {
332        if prev_bottom > top {
333            // Gap exists between previous accumulated bottom and this element's top
334            let gap = prev_bottom - top;
335            if gap > best_gap && gap > MIN_GAP_THRESHOLD {
336                best_gap = gap;
337                best_y = Some((prev_bottom + top) / 2.0);
338            }
339        }
340        prev_bottom = prev_bottom.min(bottom); // accumulate lowest bottom
341    }
342
343    best_y.map(|y| (y, best_gap))
344}
345
346/// Find the largest vertical gap (between columns of elements)
347/// using interval-merge gap detection.
348///
349/// Merges overlapping X-ranges into consolidated intervals, then finds the
350/// largest gap between consecutive intervals.  Elements significantly wider
351/// than the median width are excluded from gap computation — they are likely
352/// full-width spanning elements (titles, footers) that would otherwise
353/// swallow the column gap.  This pre-masking mirrors the approach used in
354/// column_detector.rs and the reference implementation XYCutPlusPlusSorter's cross-layout logic.
355///
356/// Returns (split_x, gap_size).
357fn find_vertical_gap_size(elements: &[ContentElement]) -> Option<(f64, f64)> {
358    if elements.len() < 2 {
359        return None;
360    }
361
362    // Compute content width extents to identify spanning elements
363    let min_x = elements
364        .iter()
365        .map(|e| e.bbox().left_x)
366        .fold(f64::INFINITY, f64::min);
367    let max_x = elements
368        .iter()
369        .map(|e| e.bbox().right_x)
370        .fold(f64::NEG_INFINITY, f64::max);
371    let content_width = (max_x - min_x).max(1.0);
372
373    // Exclude elements spanning > 70% of content width from gap computation
374    let span_threshold = content_width * 0.70;
375
376    // Erode each element's bbox by BBOX_SHRINK before building intervals.
377    // PDF TextChunk right_x is often inflated by advance-width calculations
378    // (overshooting the actual visible glyph by 3–7 pt), causing the interval
379    // to bridge a real column gutter and masking the gap.  Matching the 3 pt
380    // erosion used in column_detector.rs makes the gap appear ~6 pt wider here.
381    const BBOX_SHRINK: f64 = 3.0;
382
383    // Build sorted list of (left_x, right_x) intervals, excluding spanning elements
384    let mut intervals: Vec<(f64, f64)> = elements
385        .iter()
386        .filter_map(|e| {
387            let b = e.bbox();
388            let w = b.right_x - b.left_x;
389            if w > span_threshold {
390                None
391            } else {
392                let eff_left = b.left_x + BBOX_SHRINK;
393                let eff_right = b.right_x - BBOX_SHRINK;
394                if eff_right > eff_left {
395                    Some((eff_left, eff_right))
396                } else {
397                    None // element too narrow after erosion
398                }
399            }
400        })
401        .collect();
402
403    // If all elements are spanning (or too few remain), fall back to using all
404    // with erosion applied (raw bboxes are not needed for the gap calculation).
405    if intervals.len() < 2 {
406        intervals = elements
407            .iter()
408            .filter_map(|e| {
409                let b = e.bbox();
410                let eff_left = b.left_x + BBOX_SHRINK;
411                let eff_right = b.right_x - BBOX_SHRINK;
412                if eff_right > eff_left {
413                    Some((eff_left, eff_right))
414                } else {
415                    None
416                }
417            })
418            .collect();
419    }
420
421    intervals.sort_by(|a, b| {
422        a.0.partial_cmp(&b.0)
423            .unwrap_or(std::cmp::Ordering::Equal)
424            .then_with(|| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal))
425    });
426
427    // Merge overlapping intervals (with a small tolerance for near-touching
428    // elements whose bounding boxes slightly overlap due to PDF coordinate
429    // imprecision).  Kept at 0.5 pt — large enough for true glyph jitter
430    // but small enough not to swallow genuine column gaps that touch at 0 pt.
431    let mut merged: Vec<(f64, f64)> = Vec::new();
432    let merge_tolerance = 0.5; // points — tight to preserve genuinely adjacent columns
433    for (left, right) in intervals {
434        if let Some(last) = merged.last_mut() {
435            if left <= last.1 + merge_tolerance {
436                // Overlapping or within tolerance — extend
437                last.1 = last.1.max(right);
438                continue;
439            }
440        }
441        merged.push((left, right));
442    }
443
444    // Find the largest gap between consecutive merged intervals
445    let mut best_gap = 0.0f64;
446    let mut best_x = None;
447    for w in merged.windows(2) {
448        let gap = w[1].0 - w[0].1;
449        if gap > best_gap && gap > MIN_GAP_THRESHOLD {
450            best_gap = gap;
451            best_x = Some((w[0].1 + w[1].0) / 2.0);
452        }
453    }
454
455    // Fallback: center_x clustering.
456    //
457    // When bbox inflation causes left-column right_x to overshoot the right-
458    // column left_x (making the above interval method fail), we look for a
459    // large gap between consecutive element center_x values.  This is immune
460    // to bbox inflation because center_x ≈ visual center of the content.
461    //
462    // Two guards keep this fallback precise:
463    //  1. MIN_COL_ELEMENT_WIDTH: only elements at least 50pt wide are used to
464    //     compute center_x candidates.  Tiny chart bars, bullet dots, or legend
465    //     markers (typically < 30pt wide) would otherwise fill in the space
466    //     between two text columns, hiding the column gap from the clustering.
467    //  2. Right-partition left_x guard: after finding a candidate split_x,
468    //     verify that the right-partition elements actually start on the right
469    //     side of the page (min left_x of right-partition elements >= split_x *
470    //     0.85).  In genuine two-column layouts the right column's left margin
471    //     is close to or above split_x.  In single-column documents with
472    //     width-varying elements (headings, indented text, spanning paragraphs),
473    //     spanning elements are forced into the right partition by their center_x
474    //     but their left_x is still at the left margin — this guard rejects them.
475    if best_x.is_none() {
476        const MIN_CENTER_GAP: f64 = 20.0;
477        // Minimum element width to be used for center_x computation.
478        // Filters out tiny chart bars, bullet markers, and legend dots.
479        const MIN_COL_ELEMENT_WIDTH: f64 = 50.0;
480        let mut centers: Vec<f64> = elements
481            .iter()
482            .filter_map(|e| {
483                let b = e.bbox();
484                let w = b.right_x - b.left_x;
485                if w > span_threshold || w < MIN_COL_ELEMENT_WIDTH {
486                    None // skip spanning elements and tiny elements
487                } else {
488                    Some((b.left_x + b.right_x) * 0.5)
489                }
490            })
491            .collect();
492        if centers.len() >= 2 {
493            centers.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
494            let mut candidate_x: Option<f64> = None;
495            let mut candidate_gap = 0.0f64;
496            for pair in centers.windows(2) {
497                let gap = pair[1] - pair[0];
498                if gap > candidate_gap && gap >= MIN_CENTER_GAP {
499                    candidate_gap = gap;
500                    candidate_x = Some((pair[0] + pair[1]) * 0.5);
501                }
502            }
503            // Right-partition left_x guard: reject if right-partition elements
504            // don't genuinely start on the right side of the split.
505            if let Some(sx) = candidate_x {
506                let right_min_left_x = elements
507                    .iter()
508                    .filter(|e| {
509                        let b = e.bbox();
510                        (b.left_x + b.right_x) * 0.5 >= sx
511                    })
512                    .map(|e| e.bbox().left_x)
513                    .fold(f64::INFINITY, f64::min);
514                if std::env::var("XYCUT_DEBUG").is_ok() {
515                    eprintln!("[XYCUT] center_x candidate_x={:.1} candidate_gap={:.1} right_min_left_x={:.1} threshold={:.1} pass={}",
516                        sx, candidate_gap, right_min_left_x, sx * 0.85, right_min_left_x >= sx * 0.85);
517                }
518                if right_min_left_x >= sx * 0.85 {
519                    best_x = candidate_x;
520                    best_gap = candidate_gap;
521                }
522            }
523        }
524    }
525
526    best_x.map(|x| (x, best_gap))
527}
528
529/// Partition elements by Y coordinate (above/below split).
530fn partition_by_y(
531    elements: &mut [ContentElement],
532    split_y: f64,
533) -> (&mut [ContentElement], &mut [ContentElement]) {
534    let pivot = itertools_partition(elements, |e| e.bbox().center_y() >= split_y);
535    elements.split_at_mut(pivot)
536}
537
538/// Partition elements by X coordinate (left/right of split).
539fn partition_by_x(
540    elements: &mut [ContentElement],
541    split_x: f64,
542) -> (&mut [ContentElement], &mut [ContentElement]) {
543    let pivot = itertools_partition(elements, |e| e.bbox().center_x() < split_x);
544    elements.split_at_mut(pivot)
545}
546
547/// Partition a slice in-place: elements satisfying predicate come first.
548/// Returns the index where false elements start.
549fn itertools_partition<T, F>(data: &mut [T], pred: F) -> usize
550where
551    F: Fn(&T) -> bool,
552{
553    let mut pivot = 0;
554    for i in 0..data.len() {
555        if pred(&data[i]) {
556            data.swap(i, pivot);
557            pivot += 1;
558        }
559    }
560    pivot
561}
562
563#[cfg(test)]
564mod tests {
565    use super::*;
566    use crate::models::chunks::ImageChunk;
567
568    fn make_element(left: f64, bottom: f64, right: f64, top: f64) -> ContentElement {
569        ContentElement::Image(ImageChunk {
570            bbox: BoundingBox::new(Some(1), left, bottom, right, top),
571            index: None,
572            level: None,
573        })
574    }
575
576    #[test]
577    fn test_xycut_sort_single() {
578        let mut elements = vec![make_element(0.0, 0.0, 100.0, 50.0)];
579        xycut_sort(
580            &mut elements,
581            &BoundingBox::new(Some(1), 0.0, 0.0, 595.0, 842.0),
582        );
583        assert_eq!(elements.len(), 1);
584    }
585
586    #[test]
587    fn test_xycut_sort_two_rows() {
588        let mut elements = vec![
589            make_element(50.0, 100.0, 200.0, 150.0), // lower row
590            make_element(50.0, 500.0, 200.0, 550.0), // upper row
591        ];
592        let page = BoundingBox::new(Some(1), 0.0, 0.0, 595.0, 842.0);
593        xycut_sort(&mut elements, &page);
594        // Upper row should come first (higher Y = top of page in PDF)
595        assert!(elements[0].bbox().top_y > elements[1].bbox().top_y);
596    }
597
598    #[test]
599    fn test_xycut_sort_two_columns_left_first() {
600        // Two-column layout with clear X gap (> 5pt)
601        // Left column: x=[50, 250], Right column: x=[300, 500]
602        // Column gap = 50pt, much larger than any row gap
603        let mut elements = vec![
604            make_element(300.0, 500.0, 500.0, 550.0), // right col, top
605            make_element(50.0, 500.0, 250.0, 550.0),  // left col, top
606            make_element(300.0, 400.0, 500.0, 450.0), // right col, mid
607            make_element(50.0, 400.0, 250.0, 450.0),  // left col, mid
608            make_element(300.0, 300.0, 500.0, 350.0), // right col, bot
609            make_element(50.0, 300.0, 250.0, 350.0),  // left col, bot
610        ];
611        let page = BoundingBox::new(Some(1), 0.0, 0.0, 595.0, 842.0);
612        xycut_sort(&mut elements, &page);
613
614        // Left column (x<=250) should come before right column (x>=300)
615        // Left col: top, mid, bot, then Right col: top, mid, bot
616        assert!(
617            elements[0].bbox().left_x < 260.0,
618            "First element should be left column"
619        );
620        assert!(
621            elements[1].bbox().left_x < 260.0,
622            "Second element should be left column"
623        );
624        assert!(
625            elements[2].bbox().left_x < 260.0,
626            "Third element should be left column"
627        );
628        assert!(
629            elements[3].bbox().left_x > 260.0,
630            "Fourth element should be right column"
631        );
632        assert!(
633            elements[4].bbox().left_x > 260.0,
634            "Fifth element should be right column"
635        );
636        assert!(
637            elements[5].bbox().left_x > 260.0,
638            "Sixth element should be right column"
639        );
640
641        // Within left column: top-to-bottom
642        assert!(elements[0].bbox().top_y > elements[1].bbox().top_y);
643        assert!(elements[1].bbox().top_y > elements[2].bbox().top_y);
644        // Within right column: top-to-bottom
645        assert!(elements[3].bbox().top_y > elements[4].bbox().top_y);
646        assert!(elements[4].bbox().top_y > elements[5].bbox().top_y);
647    }
648
649    #[test]
650    fn test_xycut_sort_header_then_two_columns() {
651        // Header spanning full width, then two columns
652        let mut elements = vec![
653            make_element(50.0, 700.0, 500.0, 750.0), // header (full width)
654            make_element(300.0, 500.0, 500.0, 550.0), // right col, top
655            make_element(50.0, 500.0, 250.0, 550.0), // left col, top
656            make_element(300.0, 400.0, 500.0, 450.0), // right col, bot
657            make_element(50.0, 400.0, 250.0, 450.0), // left col, bot
658        ];
659        let page = BoundingBox::new(Some(1), 0.0, 0.0, 595.0, 842.0);
660        xycut_sort(&mut elements, &page);
661
662        // Header first (y=750 is highest)
663        assert!(
664            elements[0].bbox().top_y >= 750.0,
665            "Header should come first"
666        );
667        // Then left column, then right column
668        assert!(
669            elements[1].bbox().left_x < 260.0,
670            "Left col should come after header"
671        );
672        assert!(elements[2].bbox().left_x < 260.0);
673        assert!(
674            elements[3].bbox().left_x > 260.0,
675            "Right col should come last"
676        );
677        assert!(elements[4].bbox().left_x > 260.0);
678    }
679
680    #[test]
681    fn test_projection_gap_with_overlapping_elements() {
682        // Elements overlap in X but there's a real column gap
683        // Left col elements: x=[50,250] and x=[60, 240]
684        // Right col elements: x=[310, 500]
685        // Naive adjacent-pair comparison would miss the gap
686        let mut elements = vec![
687            make_element(50.0, 500.0, 250.0, 550.0),  // left col, wide
688            make_element(60.0, 400.0, 240.0, 450.0),  // left col, narrower
689            make_element(310.0, 500.0, 500.0, 550.0), // right col
690            make_element(310.0, 400.0, 500.0, 450.0), // right col
691        ];
692        let page = BoundingBox::new(Some(1), 0.0, 0.0, 595.0, 842.0);
693        xycut_sort(&mut elements, &page);
694
695        // Left column should come before right column
696        assert!(elements[0].bbox().left_x < 260.0);
697        assert!(elements[1].bbox().left_x < 260.0);
698        assert!(elements[2].bbox().left_x > 260.0);
699        assert!(elements[3].bbox().left_x > 260.0);
700    }
701
702    #[test]
703    fn test_xycut_prefers_columns_for_staggered_two_column_layout() {
704        let mut elements = vec![
705            make_element(50.0, 680.0, 250.0, 740.0), // left col, top figure
706            make_element(50.0, 420.0, 250.0, 660.0), // left col, body
707            make_element(320.0, 600.0, 520.0, 760.0), // right col, top figure
708            make_element(320.0, 360.0, 520.0, 580.0), // right col, body
709        ];
710        let page = BoundingBox::new(Some(1), 0.0, 0.0, 595.0, 842.0);
711        xycut_sort(&mut elements, &page);
712
713        assert!(elements[0].bbox().left_x < 260.0);
714        assert!(elements[1].bbox().left_x < 260.0);
715        assert!(elements[2].bbox().left_x > 260.0);
716        assert!(elements[3].bbox().left_x > 260.0);
717    }
718
719    #[test]
720    fn test_xycut_keeps_spanning_header_and_footer_outside_columns() {
721        let mut elements = vec![
722            make_element(40.0, 760.0, 540.0, 810.0),  // spanning header
723            make_element(50.0, 640.0, 250.0, 700.0),  // left col, top
724            make_element(50.0, 520.0, 250.0, 620.0),  // left col, bottom
725            make_element(320.0, 640.0, 520.0, 700.0), // right col, top
726            make_element(320.0, 520.0, 520.0, 620.0), // right col, bottom
727            make_element(40.0, 430.0, 540.0, 480.0),  // spanning footer/source
728        ];
729        let page = BoundingBox::new(Some(1), 0.0, 0.0, 595.0, 842.0);
730        xycut_sort(&mut elements, &page);
731
732        assert!(
733            elements[0].bbox().top_y >= 800.0,
734            "header should stay first"
735        );
736        assert!(elements[1].bbox().left_x < 260.0);
737        assert!(elements[2].bbox().left_x < 260.0);
738        assert!(elements[3].bbox().left_x > 260.0);
739        assert!(elements[4].bbox().left_x > 260.0);
740        assert!(elements[5].bbox().top_y <= 480.0, "footer should stay last");
741    }
742
743    #[test]
744    fn test_xycut_rejects_vertical_cut_when_spanning_band_sits_between_columns() {
745        let mut elements = vec![
746            make_element(50.0, 700.0, 250.0, 760.0),  // left col, top
747            make_element(320.0, 700.0, 520.0, 760.0), // right col, top
748            make_element(40.0, 610.0, 540.0, 680.0),  // spanning mid-band graphic
749            make_element(50.0, 500.0, 250.0, 580.0),  // left col, bottom
750            make_element(320.0, 500.0, 520.0, 580.0), // right col, bottom
751        ];
752        let page = BoundingBox::new(Some(1), 0.0, 0.0, 595.0, 842.0);
753        xycut_sort(&mut elements, &page);
754
755        assert!(elements[0].bbox().top_y >= 760.0);
756        assert!(elements[1].bbox().top_y >= 760.0);
757        assert!(elements[2].bbox().top_y >= 680.0 && elements[2].bbox().bottom_y <= 610.0);
758        assert!(elements[3].bbox().top_y <= 580.0);
759        assert!(elements[4].bbox().top_y <= 580.0);
760    }
761}