Skip to main content

edgeparse_core/utils/
xycut.rs

1//! XY-Cut++ reading order algorithm.
2//!
3//! ```text
4//!   ┌──────────────────────────┐
5//!   │  Col A      │   Col B    │
6//!   │  ┌────────┐ │ ┌────────┐ │
7//!   │  │ Para 1 │ │ │ Para 3 │ │    1. Find largest vertical gap
8//!   │  └────────┘ │ └────────┘ │       → split left / right
9//!   │  ┌────────┐ │ ┌────────┐ │    2. Recurse on each half
10//!   │  │ Para 2 │ │ │ Para 4 │ │    3. Within half: find horizontal gap
11//!   │  └────────┘ │ └────────┘ │       → split top / bottom
12//!   └──────────────────────────┘    4. Order: top→bottom, left→right
13//!
14//!   Result: Para 1, Para 2, Para 3, Para 4
15//! ```
16
17use crate::models::bbox::BoundingBox;
18use crate::models::content::ContentElement;
19
20/// Sort content elements using the XY-Cut++ algorithm.
21///
22/// The algorithm recursively partitions the page:
23/// 1. Find largest horizontal gap → split into top/bottom
24/// 2. Find largest vertical gap → split into left/right
25/// 3. Recurse on each partition
26/// 4. Order: top-to-bottom, left-to-right within each partition
27pub fn xycut_sort(elements: &mut [ContentElement], page_bbox: &BoundingBox) {
28    if elements.len() <= 1 {
29        return;
30    }
31    xycut_recursive(elements, page_bbox);
32}
33
34fn xycut_recursive(elements: &mut [ContentElement], _region: &BoundingBox) {
35    if elements.len() <= 1 {
36        return;
37    }
38
39    // Find both possible cuts
40    let h_gap = find_horizontal_gap_size(elements);
41    let v_gap = find_vertical_gap_size(elements);
42
43    if std::env::var("XYCUT_DEBUG").is_ok() {
44        eprintln!(
45            "[XYCUT] n={} h_gap={:?} v_gap={:?}",
46            elements.len(),
47            h_gap.map(|(y, g)| format!("y={:.1} gap={:.1}", y, g)),
48            v_gap.map(|(x, g)| format!("x={:.1} gap={:.1}", x, g))
49        );
50        for e in elements.iter() {
51            let b = e.bbox();
52            eprintln!(
53                "  el: l={:.1} r={:.1} top={:.1} bot={:.1}",
54                b.left_x, b.right_x, b.top_y, b.bottom_y
55            );
56        }
57    }
58
59    // Prefer a vertical split only when the page really behaves like concurrent
60    // columns.  This avoids the old "always horizontal first" bias that
61    // interleaved figure-heavy two-column pages row-by-row, while still keeping
62    // full-width spanning elements (titles, section headings, footers) outside
63    // the column subtrees.
64    let prefer_vertical = match (h_gap, v_gap) {
65        (None, Some(_)) => true,
66        (Some((_, h_size)), Some((split_x, v_size))) => {
67            should_prefer_vertical_cut(elements, split_x, h_size, v_size)
68        }
69        _ => false,
70    };
71
72    if prefer_vertical {
73        // Try vertical cut first (column split)
74        if let Some((split_x, _)) = v_gap {
75            let (left, right) = partition_by_x(elements, split_x);
76            if !left.is_empty() && !right.is_empty() {
77                xycut_recursive(left, _region);
78                xycut_recursive(right, _region);
79                return;
80            }
81        }
82        // Fallback to horizontal cut
83        if let Some((split_y, _)) = h_gap {
84            let (top, bottom) = partition_by_y(elements, split_y);
85            if !top.is_empty() && !bottom.is_empty() {
86                xycut_recursive(top, _region);
87                xycut_recursive(bottom, _region);
88                return;
89            }
90        }
91    } else {
92        // Try horizontal cut first (row split)
93        if let Some((split_y, _)) = h_gap {
94            let (top, bottom) = partition_by_y(elements, split_y);
95            if !top.is_empty() && !bottom.is_empty() {
96                xycut_recursive(top, _region);
97                xycut_recursive(bottom, _region);
98                return;
99            }
100        }
101        // Fallback to vertical cut
102        if let Some((split_x, _)) = v_gap {
103            let (left, right) = partition_by_x(elements, split_x);
104            if !left.is_empty() && !right.is_empty() {
105                xycut_recursive(left, _region);
106                xycut_recursive(right, _region);
107                return;
108            }
109        }
110    }
111
112    // No split found — sort by quantized Y (top-to-bottom) then by left_x (column order).
113    //
114    // Rationale: in 2-column documents where the column gap is 0 pt (columns are
115    // perfectly adjacent with no whitespace), the vertical-gap detector cannot find
116    // a split, and the fallback sort takes over.  The left and right column elements
117    // at the same visual row often differ by only a fraction of a point in top_y due
118    // to PDF coordinate rounding.  If the right column element's top_y is even 0.5 pt
119    // higher, the old comparison by raw top_y would place the right column first,
120    // reversing the reading order.
121    //
122    // Quantizing top_y to 4 pt buckets groups elements that are visually on the same
123    // text row (sub-line-height Y difference) into the same bucket, then sorts within
124    // the bucket by left_x (left-column-first).  Typical body text lines are 10–14 pt
125    // apart, so a 4 pt bucket never merges elements from adjacent text lines.
126    const BUCKET_PT: f64 = 4.0;
127    elements.sort_by_key(|e| {
128        let y_bucket = -((e.bbox().top_y / BUCKET_PT).round() as i64); // descending
129        let x_key = (e.bbox().left_x * 10.0).round() as i64; // ascending
130        (y_bucket, x_key)
131    });
132}
133
134fn should_prefer_vertical_cut(
135    elements: &[ContentElement],
136    split_x: f64,
137    horizontal_gap: f64,
138    vertical_gap: f64,
139) -> bool {
140    if elements.len() < 4 {
141        return false;
142    }
143
144    let min_x = elements
145        .iter()
146        .map(|e| e.bbox().left_x)
147        .fold(f64::INFINITY, f64::min);
148    let max_x = elements
149        .iter()
150        .map(|e| e.bbox().right_x)
151        .fold(f64::NEG_INFINITY, f64::max);
152    let content_width = (max_x - min_x).max(1.0);
153    let spanning_width = content_width * 0.55;
154    let split_margin = 6.0;
155
156    let mut left_count = 0usize;
157    let mut right_count = 0usize;
158    let mut left_top = f64::NEG_INFINITY;
159    let mut left_bottom = f64::INFINITY;
160    let mut right_top = f64::NEG_INFINITY;
161    let mut right_bottom = f64::INFINITY;
162    let mut wide_crossing = 0usize;
163
164    for elem in elements {
165        let bbox = elem.bbox();
166        let width = bbox.right_x - bbox.left_x;
167        let crosses_split =
168            bbox.left_x < split_x - split_margin && bbox.right_x > split_x + split_margin;
169
170        if crosses_split && width >= spanning_width {
171            wide_crossing += 1;
172            continue;
173        }
174
175        if bbox.center_x() < split_x {
176            left_count += 1;
177            left_top = left_top.max(bbox.top_y);
178            left_bottom = left_bottom.min(bbox.bottom_y);
179        } else {
180            right_count += 1;
181            right_top = right_top.max(bbox.top_y);
182            right_bottom = right_bottom.min(bbox.bottom_y);
183        }
184    }
185
186    if left_count < 2 || right_count < 2 {
187        return false;
188    }
189    if wide_crossing > 0 {
190        return false;
191    }
192
193    let left_height = (left_top - left_bottom).max(0.0);
194    let right_height = (right_top - right_bottom).max(0.0);
195    if left_height <= 0.0 || right_height <= 0.0 {
196        return false;
197    }
198
199    let overlap = (left_top.min(right_top) - left_bottom.max(right_bottom)).max(0.0);
200    let overlap_ratio = overlap / left_height.min(right_height);
201
202    overlap_ratio >= 0.35 && vertical_gap >= horizontal_gap * 0.5
203}
204
205/// Minimum gap size (points) required to consider a cut.
206const MIN_GAP_THRESHOLD: f64 = 5.0;
207
208/// Find the largest horizontal gap (between rows of elements)
209/// using projection-profile gap detection.
210/// Sorts elements by top_y descending (top of page first) and tracks
211/// the accumulated minimum bottom edge to find true gaps.
212/// Returns (split_y, gap_size).
213fn find_horizontal_gap_size(elements: &[ContentElement]) -> Option<(f64, f64)> {
214    if elements.len() < 2 {
215        return None;
216    }
217
218    // Sort by top_y descending (scan from top of page downward)
219    let mut sorted: Vec<(f64, f64)> = elements
220        .iter()
221        .map(|e| (e.bbox().bottom_y, e.bbox().top_y))
222        .collect();
223    sorted.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
224
225    let mut best_gap = 0.0f64;
226    let mut best_y = None;
227    let mut prev_bottom = sorted[0].0; // track lowest bottom seen so far
228
229    for &(bottom, top) in &sorted[1..] {
230        if prev_bottom > top {
231            // Gap exists between previous accumulated bottom and this element's top
232            let gap = prev_bottom - top;
233            if gap > best_gap && gap > MIN_GAP_THRESHOLD {
234                best_gap = gap;
235                best_y = Some((prev_bottom + top) / 2.0);
236            }
237        }
238        prev_bottom = prev_bottom.min(bottom); // accumulate lowest bottom
239    }
240
241    best_y.map(|y| (y, best_gap))
242}
243
244/// Find the largest vertical gap (between columns of elements)
245/// using interval-merge gap detection.
246///
247/// Merges overlapping X-ranges into consolidated intervals, then finds the
248/// largest gap between consecutive intervals.  Elements significantly wider
249/// than the median width are excluded from gap computation — they are likely
250/// full-width spanning elements (titles, footers) that would otherwise
251/// swallow the column gap.  This pre-masking mirrors the approach used in
252/// column_detector.rs and the reference implementation XYCutPlusPlusSorter's cross-layout logic.
253///
254/// Returns (split_x, gap_size).
255fn find_vertical_gap_size(elements: &[ContentElement]) -> Option<(f64, f64)> {
256    if elements.len() < 2 {
257        return None;
258    }
259
260    // Compute content width extents to identify spanning elements
261    let min_x = elements
262        .iter()
263        .map(|e| e.bbox().left_x)
264        .fold(f64::INFINITY, f64::min);
265    let max_x = elements
266        .iter()
267        .map(|e| e.bbox().right_x)
268        .fold(f64::NEG_INFINITY, f64::max);
269    let content_width = (max_x - min_x).max(1.0);
270
271    // Exclude elements spanning > 70% of content width from gap computation
272    let span_threshold = content_width * 0.70;
273
274    // Erode each element's bbox by BBOX_SHRINK before building intervals.
275    // PDF TextChunk right_x is often inflated by advance-width calculations
276    // (overshooting the actual visible glyph by 3–7 pt), causing the interval
277    // to bridge a real column gutter and masking the gap.  Matching the 3 pt
278    // erosion used in column_detector.rs makes the gap appear ~6 pt wider here.
279    const BBOX_SHRINK: f64 = 3.0;
280
281    // Build sorted list of (left_x, right_x) intervals, excluding spanning elements
282    let mut intervals: Vec<(f64, f64)> = elements
283        .iter()
284        .filter_map(|e| {
285            let b = e.bbox();
286            let w = b.right_x - b.left_x;
287            if w > span_threshold {
288                None
289            } else {
290                let eff_left = b.left_x + BBOX_SHRINK;
291                let eff_right = b.right_x - BBOX_SHRINK;
292                if eff_right > eff_left {
293                    Some((eff_left, eff_right))
294                } else {
295                    None // element too narrow after erosion
296                }
297            }
298        })
299        .collect();
300
301    // If all elements are spanning (or too few remain), fall back to using all
302    // with erosion applied (raw bboxes are not needed for the gap calculation).
303    if intervals.len() < 2 {
304        intervals = elements
305            .iter()
306            .filter_map(|e| {
307                let b = e.bbox();
308                let eff_left = b.left_x + BBOX_SHRINK;
309                let eff_right = b.right_x - BBOX_SHRINK;
310                if eff_right > eff_left {
311                    Some((eff_left, eff_right))
312                } else {
313                    None
314                }
315            })
316            .collect();
317    }
318
319    intervals.sort_by(|a, b| {
320        a.0.partial_cmp(&b.0)
321            .unwrap_or(std::cmp::Ordering::Equal)
322            .then_with(|| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal))
323    });
324
325    // Merge overlapping intervals (with a small tolerance for near-touching
326    // elements whose bounding boxes slightly overlap due to PDF coordinate
327    // imprecision).  Kept at 0.5 pt — large enough for true glyph jitter
328    // but small enough not to swallow genuine column gaps that touch at 0 pt.
329    let mut merged: Vec<(f64, f64)> = Vec::new();
330    let merge_tolerance = 0.5; // points — tight to preserve genuinely adjacent columns
331    for (left, right) in intervals {
332        if let Some(last) = merged.last_mut() {
333            if left <= last.1 + merge_tolerance {
334                // Overlapping or within tolerance — extend
335                last.1 = last.1.max(right);
336                continue;
337            }
338        }
339        merged.push((left, right));
340    }
341
342    // Find the largest gap between consecutive merged intervals
343    let mut best_gap = 0.0f64;
344    let mut best_x = None;
345    for w in merged.windows(2) {
346        let gap = w[1].0 - w[0].1;
347        if gap > best_gap && gap > MIN_GAP_THRESHOLD {
348            best_gap = gap;
349            best_x = Some((w[0].1 + w[1].0) / 2.0);
350        }
351    }
352
353    // Fallback: center_x clustering.
354    //
355    // When bbox inflation causes left-column right_x to overshoot the right-
356    // column left_x (making the above interval method fail), we look for a
357    // large gap between consecutive element center_x values.  This is immune
358    // to bbox inflation because center_x ≈ visual center of the content.
359    //
360    // Two guards keep this fallback precise:
361    //  1. MIN_COL_ELEMENT_WIDTH: only elements at least 50pt wide are used to
362    //     compute center_x candidates.  Tiny chart bars, bullet dots, or legend
363    //     markers (typically < 30pt wide) would otherwise fill in the space
364    //     between two text columns, hiding the column gap from the clustering.
365    //  2. Right-partition left_x guard: after finding a candidate split_x,
366    //     verify that the right-partition elements actually start on the right
367    //     side of the page (min left_x of right-partition elements >= split_x *
368    //     0.85).  In genuine two-column layouts the right column's left margin
369    //     is close to or above split_x.  In single-column documents with
370    //     width-varying elements (headings, indented text, spanning paragraphs),
371    //     spanning elements are forced into the right partition by their center_x
372    //     but their left_x is still at the left margin — this guard rejects them.
373    if best_x.is_none() {
374        const MIN_CENTER_GAP: f64 = 20.0;
375        // Minimum element width to be used for center_x computation.
376        // Filters out tiny chart bars, bullet markers, and legend dots.
377        const MIN_COL_ELEMENT_WIDTH: f64 = 50.0;
378        let mut centers: Vec<f64> = elements
379            .iter()
380            .filter_map(|e| {
381                let b = e.bbox();
382                let w = b.right_x - b.left_x;
383                if w > span_threshold || w < MIN_COL_ELEMENT_WIDTH {
384                    None // skip spanning elements and tiny elements
385                } else {
386                    Some((b.left_x + b.right_x) * 0.5)
387                }
388            })
389            .collect();
390        if centers.len() >= 2 {
391            centers.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
392            let mut candidate_x: Option<f64> = None;
393            let mut candidate_gap = 0.0f64;
394            for pair in centers.windows(2) {
395                let gap = pair[1] - pair[0];
396                if gap > candidate_gap && gap >= MIN_CENTER_GAP {
397                    candidate_gap = gap;
398                    candidate_x = Some((pair[0] + pair[1]) * 0.5);
399                }
400            }
401            // Right-partition left_x guard: reject if right-partition elements
402            // don't genuinely start on the right side of the split.
403            if let Some(sx) = candidate_x {
404                let right_min_left_x = elements
405                    .iter()
406                    .filter(|e| {
407                        let b = e.bbox();
408                        (b.left_x + b.right_x) * 0.5 >= sx
409                    })
410                    .map(|e| e.bbox().left_x)
411                    .fold(f64::INFINITY, f64::min);
412                if std::env::var("XYCUT_DEBUG").is_ok() {
413                    eprintln!("[XYCUT] center_x candidate_x={:.1} candidate_gap={:.1} right_min_left_x={:.1} threshold={:.1} pass={}",
414                        sx, candidate_gap, right_min_left_x, sx * 0.85, right_min_left_x >= sx * 0.85);
415                }
416                if right_min_left_x >= sx * 0.85 {
417                    best_x = candidate_x;
418                    best_gap = candidate_gap;
419                }
420            }
421        }
422    }
423
424    best_x.map(|x| (x, best_gap))
425}
426
427/// Partition elements by Y coordinate (above/below split).
428fn partition_by_y(
429    elements: &mut [ContentElement],
430    split_y: f64,
431) -> (&mut [ContentElement], &mut [ContentElement]) {
432    let pivot = itertools_partition(elements, |e| e.bbox().center_y() >= split_y);
433    elements.split_at_mut(pivot)
434}
435
436/// Partition elements by X coordinate (left/right of split).
437fn partition_by_x(
438    elements: &mut [ContentElement],
439    split_x: f64,
440) -> (&mut [ContentElement], &mut [ContentElement]) {
441    let pivot = itertools_partition(elements, |e| e.bbox().center_x() < split_x);
442    elements.split_at_mut(pivot)
443}
444
445/// Partition a slice in-place: elements satisfying predicate come first.
446/// Returns the index where false elements start.
447fn itertools_partition<T, F>(data: &mut [T], pred: F) -> usize
448where
449    F: Fn(&T) -> bool,
450{
451    let mut pivot = 0;
452    for i in 0..data.len() {
453        if pred(&data[i]) {
454            data.swap(i, pivot);
455            pivot += 1;
456        }
457    }
458    pivot
459}
460
461#[cfg(test)]
462mod tests {
463    use super::*;
464    use crate::models::chunks::ImageChunk;
465
466    fn make_element(left: f64, bottom: f64, right: f64, top: f64) -> ContentElement {
467        ContentElement::Image(ImageChunk {
468            bbox: BoundingBox::new(Some(1), left, bottom, right, top),
469            index: None,
470            level: None,
471        })
472    }
473
474    #[test]
475    fn test_xycut_sort_single() {
476        let mut elements = vec![make_element(0.0, 0.0, 100.0, 50.0)];
477        xycut_sort(
478            &mut elements,
479            &BoundingBox::new(Some(1), 0.0, 0.0, 595.0, 842.0),
480        );
481        assert_eq!(elements.len(), 1);
482    }
483
484    #[test]
485    fn test_xycut_sort_two_rows() {
486        let mut elements = vec![
487            make_element(50.0, 100.0, 200.0, 150.0), // lower row
488            make_element(50.0, 500.0, 200.0, 550.0), // upper row
489        ];
490        let page = BoundingBox::new(Some(1), 0.0, 0.0, 595.0, 842.0);
491        xycut_sort(&mut elements, &page);
492        // Upper row should come first (higher Y = top of page in PDF)
493        assert!(elements[0].bbox().top_y > elements[1].bbox().top_y);
494    }
495
496    #[test]
497    fn test_xycut_sort_two_columns_left_first() {
498        // Two-column layout with clear X gap (> 5pt)
499        // Left column: x=[50, 250], Right column: x=[300, 500]
500        // Column gap = 50pt, much larger than any row gap
501        let mut elements = vec![
502            make_element(300.0, 500.0, 500.0, 550.0), // right col, top
503            make_element(50.0, 500.0, 250.0, 550.0),  // left col, top
504            make_element(300.0, 400.0, 500.0, 450.0), // right col, mid
505            make_element(50.0, 400.0, 250.0, 450.0),  // left col, mid
506            make_element(300.0, 300.0, 500.0, 350.0), // right col, bot
507            make_element(50.0, 300.0, 250.0, 350.0),  // left col, bot
508        ];
509        let page = BoundingBox::new(Some(1), 0.0, 0.0, 595.0, 842.0);
510        xycut_sort(&mut elements, &page);
511
512        // Left column (x<=250) should come before right column (x>=300)
513        // Left col: top, mid, bot, then Right col: top, mid, bot
514        assert!(
515            elements[0].bbox().left_x < 260.0,
516            "First element should be left column"
517        );
518        assert!(
519            elements[1].bbox().left_x < 260.0,
520            "Second element should be left column"
521        );
522        assert!(
523            elements[2].bbox().left_x < 260.0,
524            "Third element should be left column"
525        );
526        assert!(
527            elements[3].bbox().left_x > 260.0,
528            "Fourth element should be right column"
529        );
530        assert!(
531            elements[4].bbox().left_x > 260.0,
532            "Fifth element should be right column"
533        );
534        assert!(
535            elements[5].bbox().left_x > 260.0,
536            "Sixth element should be right column"
537        );
538
539        // Within left column: top-to-bottom
540        assert!(elements[0].bbox().top_y > elements[1].bbox().top_y);
541        assert!(elements[1].bbox().top_y > elements[2].bbox().top_y);
542        // Within right column: top-to-bottom
543        assert!(elements[3].bbox().top_y > elements[4].bbox().top_y);
544        assert!(elements[4].bbox().top_y > elements[5].bbox().top_y);
545    }
546
547    #[test]
548    fn test_xycut_sort_header_then_two_columns() {
549        // Header spanning full width, then two columns
550        let mut elements = vec![
551            make_element(50.0, 700.0, 500.0, 750.0), // header (full width)
552            make_element(300.0, 500.0, 500.0, 550.0), // right col, top
553            make_element(50.0, 500.0, 250.0, 550.0), // left col, top
554            make_element(300.0, 400.0, 500.0, 450.0), // right col, bot
555            make_element(50.0, 400.0, 250.0, 450.0), // left col, bot
556        ];
557        let page = BoundingBox::new(Some(1), 0.0, 0.0, 595.0, 842.0);
558        xycut_sort(&mut elements, &page);
559
560        // Header first (y=750 is highest)
561        assert!(
562            elements[0].bbox().top_y >= 750.0,
563            "Header should come first"
564        );
565        // Then left column, then right column
566        assert!(
567            elements[1].bbox().left_x < 260.0,
568            "Left col should come after header"
569        );
570        assert!(elements[2].bbox().left_x < 260.0);
571        assert!(
572            elements[3].bbox().left_x > 260.0,
573            "Right col should come last"
574        );
575        assert!(elements[4].bbox().left_x > 260.0);
576    }
577
578    #[test]
579    fn test_projection_gap_with_overlapping_elements() {
580        // Elements overlap in X but there's a real column gap
581        // Left col elements: x=[50,250] and x=[60, 240]
582        // Right col elements: x=[310, 500]
583        // Naive adjacent-pair comparison would miss the gap
584        let mut elements = vec![
585            make_element(50.0, 500.0, 250.0, 550.0),  // left col, wide
586            make_element(60.0, 400.0, 240.0, 450.0),  // left col, narrower
587            make_element(310.0, 500.0, 500.0, 550.0), // right col
588            make_element(310.0, 400.0, 500.0, 450.0), // right col
589        ];
590        let page = BoundingBox::new(Some(1), 0.0, 0.0, 595.0, 842.0);
591        xycut_sort(&mut elements, &page);
592
593        // Left column should come before right column
594        assert!(elements[0].bbox().left_x < 260.0);
595        assert!(elements[1].bbox().left_x < 260.0);
596        assert!(elements[2].bbox().left_x > 260.0);
597        assert!(elements[3].bbox().left_x > 260.0);
598    }
599
600    #[test]
601    fn test_xycut_prefers_columns_for_staggered_two_column_layout() {
602        let mut elements = vec![
603            make_element(50.0, 680.0, 250.0, 740.0), // left col, top figure
604            make_element(50.0, 420.0, 250.0, 660.0), // left col, body
605            make_element(320.0, 600.0, 520.0, 760.0), // right col, top figure
606            make_element(320.0, 360.0, 520.0, 580.0), // right col, body
607        ];
608        let page = BoundingBox::new(Some(1), 0.0, 0.0, 595.0, 842.0);
609        xycut_sort(&mut elements, &page);
610
611        assert!(elements[0].bbox().left_x < 260.0);
612        assert!(elements[1].bbox().left_x < 260.0);
613        assert!(elements[2].bbox().left_x > 260.0);
614        assert!(elements[3].bbox().left_x > 260.0);
615    }
616}