Skip to main content

pdfplumber_core/
layout.rs

1use crate::geometry::BBox;
2use crate::words::Word;
3
4/// A text line: a sequence of words on the same y-level.
5#[derive(Debug, Clone, PartialEq)]
6#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
7pub struct TextLine {
8    /// Words in this line, sorted left-to-right.
9    pub words: Vec<Word>,
10    /// Bounding box of this line.
11    pub bbox: BBox,
12}
13
14/// A text block: a group of lines forming a coherent paragraph or section.
15#[derive(Debug, Clone, PartialEq)]
16#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
17pub struct TextBlock {
18    /// Lines in this block, sorted top-to-bottom.
19    pub lines: Vec<TextLine>,
20    /// Bounding box of this block.
21    pub bbox: BBox,
22}
23
/// Options for layout-aware text extraction.
///
/// Implements `PartialEq` (like [`TextLine`] and [`TextBlock`]) so option
/// values can be compared directly, e.g. in tests or change detection.
#[derive(Debug, Clone, PartialEq)]
pub struct TextOptions {
    /// If true, use layout-aware extraction (detect blocks and reading order).
    /// If false, simple concatenation of words by spatial order.
    pub layout: bool,
    /// Vertical tolerance for clustering words into the same line (in points).
    pub y_tolerance: f64,
    /// Maximum vertical gap between lines to group into the same block (in points).
    pub y_density: f64,
    /// Minimum horizontal gap to detect column boundaries (in points).
    pub x_density: f64,
}
37
38impl Default for TextOptions {
39    fn default() -> Self {
40        Self {
41            layout: false,
42            y_tolerance: 3.0,
43            y_density: 10.0,
44            x_density: 10.0,
45        }
46    }
47}
48
49/// Cluster words into text lines based on y-proximity.
50///
51/// Words whose vertical midpoints are within `y_tolerance` of a line's
52/// vertical midpoint are grouped into the same line. Words within each
53/// line are sorted left-to-right.
54pub fn cluster_words_into_lines(words: &[Word], y_tolerance: f64) -> Vec<TextLine> {
55    if words.is_empty() {
56        return Vec::new();
57    }
58
59    let mut sorted: Vec<&Word> = words.iter().collect();
60    sorted.sort_by(|a, b| {
61        a.bbox
62            .top
63            .partial_cmp(&b.bbox.top)
64            .unwrap()
65            .then(a.bbox.x0.partial_cmp(&b.bbox.x0).unwrap())
66    });
67
68    let mut lines: Vec<TextLine> = Vec::new();
69
70    for word in sorted {
71        let word_mid_y = (word.bbox.top + word.bbox.bottom) / 2.0;
72
73        // Try to find an existing line this word belongs to
74        let mut found = false;
75        for line in &mut lines {
76            let line_mid_y = (line.bbox.top + line.bbox.bottom) / 2.0;
77            if (word_mid_y - line_mid_y).abs() <= y_tolerance {
78                line.bbox = line.bbox.union(&word.bbox);
79                line.words.push(word.clone());
80                found = true;
81                break;
82            }
83        }
84
85        if !found {
86            lines.push(TextLine {
87                words: vec![word.clone()],
88                bbox: word.bbox,
89            });
90        }
91    }
92
93    // Sort words within each line left-to-right
94    for line in &mut lines {
95        line.words
96            .sort_by(|a, b| a.bbox.x0.partial_cmp(&b.bbox.x0).unwrap());
97    }
98
99    // Sort lines top-to-bottom
100    lines.sort_by(|a, b| a.bbox.top.partial_cmp(&b.bbox.top).unwrap());
101
102    lines
103}
104
105/// Split text lines at large horizontal gaps to detect column boundaries.
106///
107/// Within each line, if consecutive words have a gap larger than `x_density`,
108/// the line is split into separate line segments (one per column).
109pub fn split_lines_at_columns(lines: Vec<TextLine>, x_density: f64) -> Vec<TextLine> {
110    let mut result = Vec::new();
111    for line in lines {
112        if line.words.len() <= 1 {
113            result.push(line);
114            continue;
115        }
116
117        let mut current_words = vec![line.words[0].clone()];
118        let mut current_bbox = line.words[0].bbox;
119
120        for word in line.words.iter().skip(1) {
121            let gap = word.bbox.x0 - current_bbox.x1;
122            if gap > x_density {
123                result.push(TextLine {
124                    words: current_words,
125                    bbox: current_bbox,
126                });
127                current_words = vec![word.clone()];
128                current_bbox = word.bbox;
129            } else {
130                current_bbox = current_bbox.union(&word.bbox);
131                current_words.push(word.clone());
132            }
133        }
134
135        result.push(TextLine {
136            words: current_words,
137            bbox: current_bbox,
138        });
139    }
140
141    // Re-sort by (top, x0) after splitting
142    result.sort_by(|a, b| {
143        a.bbox
144            .top
145            .partial_cmp(&b.bbox.top)
146            .unwrap()
147            .then(a.bbox.x0.partial_cmp(&b.bbox.x0).unwrap())
148    });
149
150    result
151}
152
153/// Cluster text line segments into text blocks based on x-overlap and vertical proximity.
154///
155/// Line segments that vertically follow each other (gap <= `y_density`) and
156/// have overlapping x-ranges are grouped into the same block.
157pub fn cluster_lines_into_blocks(lines: Vec<TextLine>, y_density: f64) -> Vec<TextBlock> {
158    if lines.is_empty() {
159        return Vec::new();
160    }
161
162    let mut blocks: Vec<TextBlock> = Vec::new();
163
164    for line in lines {
165        // Find the best matching block: closest vertically, with x-overlap
166        let mut best_block: Option<usize> = None;
167        let mut best_gap = f64::INFINITY;
168
169        for (i, block) in blocks.iter().enumerate() {
170            let gap = line.bbox.top - block.bbox.bottom;
171            if gap >= 0.0
172                && gap <= y_density
173                && has_x_overlap(&line.bbox, &block.bbox)
174                && gap < best_gap
175            {
176                best_gap = gap;
177                best_block = Some(i);
178            }
179        }
180
181        if let Some(idx) = best_block {
182            blocks[idx].bbox = blocks[idx].bbox.union(&line.bbox);
183            blocks[idx].lines.push(line);
184        } else {
185            blocks.push(TextBlock {
186                bbox: line.bbox,
187                lines: vec![line],
188            });
189        }
190    }
191
192    // Sort lines within each block top-to-bottom
193    for block in &mut blocks {
194        block
195            .lines
196            .sort_by(|a, b| a.bbox.top.partial_cmp(&b.bbox.top).unwrap());
197    }
198
199    blocks
200}
201
202/// Check if two bounding boxes overlap horizontally.
203fn has_x_overlap(a: &BBox, b: &BBox) -> bool {
204    a.x0 < b.x1 && b.x0 < a.x1
205}
206
207/// Sort text blocks in natural reading order.
208///
209/// Sorts blocks by top position first, then by x0 within the same vertical band.
210/// This produces left-to-right, top-to-bottom reading order.
211pub fn sort_blocks_reading_order(blocks: &mut [TextBlock], _x_density: f64) {
212    blocks.sort_by(|a, b| {
213        a.bbox
214            .top
215            .partial_cmp(&b.bbox.top)
216            .unwrap()
217            .then(a.bbox.x0.partial_cmp(&b.bbox.x0).unwrap())
218    });
219}
220
221/// Convert text blocks into a string.
222///
223/// Words within a line are joined by spaces.
224/// Lines within a block are joined by newlines.
225/// Blocks are separated by double newlines.
226pub fn blocks_to_text(blocks: &[TextBlock]) -> String {
227    blocks
228        .iter()
229        .map(|block| {
230            block
231                .lines
232                .iter()
233                .map(|line| {
234                    line.words
235                        .iter()
236                        .map(|w| w.text.as_str())
237                        .collect::<Vec<_>>()
238                        .join(" ")
239                })
240                .collect::<Vec<_>>()
241                .join("\n")
242        })
243        .collect::<Vec<_>>()
244        .join("\n\n")
245}
246
247/// Simple (non-layout) text extraction from words.
248///
249/// Clusters words into lines by y-proximity, then joins with spaces/newlines.
250pub fn words_to_text(words: &[Word], y_tolerance: f64) -> String {
251    let lines = cluster_words_into_lines(words, y_tolerance);
252    lines
253        .iter()
254        .map(|line| {
255            line.words
256                .iter()
257                .map(|w| w.text.as_str())
258                .collect::<Vec<_>>()
259                .join(" ")
260        })
261        .collect::<Vec<_>>()
262        .join("\n")
263}
264
#[cfg(test)]
mod tests {
    use super::*;
    use crate::text::Char;

    // --- Fixtures ---

    /// Build a `Word` backed by a single `Char` covering the same rectangle.
    fn make_word(text: &str, x0: f64, top: f64, x1: f64, bottom: f64) -> Word {
        let bbox = BBox::new(x0, top, x1, bottom);
        let glyph = Char {
            text: text.to_string(),
            bbox,
            fontname: "TestFont".to_string(),
            size: 12.0,
            doctop: top,
            upright: true,
            direction: crate::text::TextDirection::Ltr,
            stroking_color: None,
            non_stroking_color: None,
            ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
            char_code: 0,
            mcid: None,
            tag: None,
        };
        Word {
            text: text.to_string(),
            bbox,
            doctop: top,
            direction: crate::text::TextDirection::Ltr,
            chars: vec![glyph],
        }
    }

    /// A line holding exactly one word, with the word's rectangle as its bbox.
    fn word_line(text: &str, x0: f64, top: f64, x1: f64, bottom: f64) -> TextLine {
        TextLine {
            words: vec![make_word(text, x0, top, x1, bottom)],
            bbox: BBox::new(x0, top, x1, bottom),
        }
    }

    /// A block holding exactly one single-word line.
    fn word_block(text: &str, x0: f64, top: f64, x1: f64, bottom: f64) -> TextBlock {
        TextBlock {
            lines: vec![word_line(text, x0, top, x1, bottom)],
            bbox: BBox::new(x0, top, x1, bottom),
        }
    }

    /// Text of the first word of the first line of `block`.
    fn lead_text(block: &TextBlock) -> &str {
        block.lines[0].words[0].text.as_str()
    }

    /// Run the full layout pipeline with the tolerances used by the
    /// end-to-end tests (y_tolerance 3.0, both densities 10.0).
    fn layout_text(words: &[Word]) -> String {
        let lines = cluster_words_into_lines(words, 3.0);
        let segments = split_lines_at_columns(lines, 10.0);
        let mut blocks = cluster_lines_into_blocks(segments, 10.0);
        sort_blocks_reading_order(&mut blocks, 10.0);
        blocks_to_text(&blocks)
    }

    // --- TextOptions ---

    #[test]
    fn test_text_options_default() {
        let defaults = TextOptions::default();
        assert!(!defaults.layout);
        assert_eq!(defaults.y_tolerance, 3.0);
        assert_eq!(defaults.y_density, 10.0);
        assert_eq!(defaults.x_density, 10.0);
    }

    #[test]
    fn test_text_options_layout_true() {
        let opts = TextOptions {
            layout: true,
            ..TextOptions::default()
        };
        assert!(opts.layout);
    }

    // --- cluster_words_into_lines ---

    #[test]
    fn test_cluster_empty_words() {
        assert!(cluster_words_into_lines(&[], 3.0).is_empty());
    }

    #[test]
    fn test_cluster_single_word() {
        let input = vec![make_word("Hello", 10.0, 100.0, 50.0, 112.0)];
        let lines = cluster_words_into_lines(&input, 3.0);
        assert_eq!(lines.len(), 1);
        let only = &lines[0];
        assert_eq!(only.words.len(), 1);
        assert_eq!(only.words[0].text, "Hello");
        assert_eq!(only.bbox, BBox::new(10.0, 100.0, 50.0, 112.0));
    }

    #[test]
    fn test_cluster_words_same_line() {
        let input = vec![
            make_word("Hello", 10.0, 100.0, 50.0, 112.0),
            make_word("World", 55.0, 100.0, 95.0, 112.0),
        ];
        let lines = cluster_words_into_lines(&input, 3.0);
        assert_eq!(lines.len(), 1);
        let only = &lines[0];
        assert_eq!(only.words.len(), 2);
        assert_eq!(only.words[0].text, "Hello");
        assert_eq!(only.words[1].text, "World");
    }

    #[test]
    fn test_cluster_words_different_lines() {
        let input = vec![
            make_word("Line1", 10.0, 100.0, 50.0, 112.0),
            make_word("Line2", 10.0, 120.0, 50.0, 132.0),
        ];
        let lines = cluster_words_into_lines(&input, 3.0);
        assert_eq!(lines.len(), 2);
        assert_eq!(lines[0].words[0].text, "Line1");
        assert_eq!(lines[1].words[0].text, "Line2");
    }

    #[test]
    fn test_cluster_words_slight_y_variation() {
        // Second word sits 1pt lower than the first, well inside the tolerance.
        let input = vec![
            make_word("Hello", 10.0, 100.0, 50.0, 112.0),
            make_word("World", 55.0, 101.0, 95.0, 113.0),
        ];
        let lines = cluster_words_into_lines(&input, 3.0);
        assert_eq!(lines.len(), 1);
        assert_eq!(lines[0].words.len(), 2);
    }

    #[test]
    fn test_cluster_words_sorted_left_to_right_within_line() {
        // Input deliberately lists the right-hand word first.
        let input = vec![
            make_word("World", 55.0, 100.0, 95.0, 112.0),
            make_word("Hello", 10.0, 100.0, 50.0, 112.0),
        ];
        let lines = cluster_words_into_lines(&input, 3.0);
        assert_eq!(lines[0].words[0].text, "Hello");
        assert_eq!(lines[0].words[1].text, "World");
    }

    #[test]
    fn test_cluster_three_lines() {
        let input = vec![
            make_word("First", 10.0, 100.0, 50.0, 112.0),
            make_word("line", 55.0, 100.0, 85.0, 112.0),
            make_word("Second", 10.0, 120.0, 60.0, 132.0),
            make_word("line", 65.0, 120.0, 95.0, 132.0),
            make_word("Third", 10.0, 140.0, 50.0, 152.0),
            make_word("line", 55.0, 140.0, 85.0, 152.0),
        ];
        let lines = cluster_words_into_lines(&input, 3.0);
        assert_eq!(lines.len(), 3);
        for line in &lines[..3] {
            assert_eq!(line.words.len(), 2);
        }
    }

    #[test]
    fn test_cluster_line_bbox_is_union() {
        let input = vec![
            make_word("A", 10.0, 98.0, 20.0, 112.0),
            make_word("B", 25.0, 100.0, 35.0, 110.0),
        ];
        let lines = cluster_words_into_lines(&input, 3.0);
        assert_eq!(lines[0].bbox, BBox::new(10.0, 98.0, 35.0, 112.0));
    }

    // --- cluster_lines_into_blocks ---

    #[test]
    fn test_cluster_lines_empty() {
        assert!(cluster_lines_into_blocks(vec![], 10.0).is_empty());
    }

    #[test]
    fn test_cluster_lines_single_block() {
        let input = vec![
            word_line("Line1", 10.0, 100.0, 50.0, 112.0),
            word_line("Line2", 10.0, 115.0, 50.0, 127.0),
        ];
        let blocks = cluster_lines_into_blocks(input, 10.0);
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].lines.len(), 2);
        assert_eq!(blocks[0].bbox, BBox::new(10.0, 100.0, 50.0, 127.0));
    }

    #[test]
    fn test_cluster_lines_two_blocks() {
        let input = vec![
            word_line("Block1", 10.0, 100.0, 60.0, 112.0),
            word_line("Still1", 10.0, 115.0, 60.0, 127.0),
            // 127 -> 200 is a 73pt gap, far above the 10pt density.
            word_line("Block2", 10.0, 200.0, 60.0, 212.0),
        ];
        let blocks = cluster_lines_into_blocks(input, 10.0);
        assert_eq!(blocks.len(), 2);
        assert_eq!(blocks[0].lines.len(), 2);
        assert_eq!(blocks[1].lines.len(), 1);
    }

    #[test]
    fn test_cluster_lines_block_bbox() {
        let input = vec![
            word_line("Line1", 10.0, 100.0, 80.0, 112.0),
            word_line("Line2", 5.0, 115.0, 90.0, 127.0),
        ];
        let blocks = cluster_lines_into_blocks(input, 10.0);
        assert_eq!(blocks[0].bbox, BBox::new(5.0, 100.0, 90.0, 127.0));
    }

    // --- sort_blocks_reading_order ---

    #[test]
    fn test_sort_single_column_top_to_bottom() {
        let mut blocks = vec![
            word_block("Second", 10.0, 200.0, 60.0, 212.0),
            word_block("First", 10.0, 100.0, 60.0, 112.0),
        ];
        sort_blocks_reading_order(&mut blocks, 10.0);
        assert_eq!(lead_text(&blocks[0]), "First");
        assert_eq!(lead_text(&blocks[1]), "Second");
    }

    #[test]
    fn test_sort_two_columns() {
        // Left column spans x=10..100, right column x=200..300, at two
        // y-levels; the expected order is by (top, x0).
        let mut blocks = vec![
            word_block("Right1", 200.0, 100.0, 300.0, 112.0),
            word_block("Left1", 10.0, 100.0, 100.0, 112.0),
            word_block("Right2", 200.0, 200.0, 300.0, 212.0),
            word_block("Left2", 10.0, 200.0, 100.0, 212.0),
        ];
        sort_blocks_reading_order(&mut blocks, 10.0);
        // Top-to-bottom, and left-to-right inside one y-level.
        assert_eq!(lead_text(&blocks[0]), "Left1");
        assert_eq!(lead_text(&blocks[1]), "Right1");
        assert_eq!(lead_text(&blocks[2]), "Left2");
        assert_eq!(lead_text(&blocks[3]), "Right2");
    }

    #[test]
    fn test_sort_single_block_unchanged() {
        let mut blocks = vec![word_block("Only", 10.0, 100.0, 50.0, 112.0)];
        sort_blocks_reading_order(&mut blocks, 10.0);
        assert_eq!(lead_text(&blocks[0]), "Only");
    }

    // --- blocks_to_text ---

    #[test]
    fn test_blocks_to_text_single_block_single_line() {
        let extent = BBox::new(10.0, 100.0, 95.0, 112.0);
        let blocks = vec![TextBlock {
            lines: vec![TextLine {
                words: vec![
                    make_word("Hello", 10.0, 100.0, 50.0, 112.0),
                    make_word("World", 55.0, 100.0, 95.0, 112.0),
                ],
                bbox: extent,
            }],
            bbox: extent,
        }];
        assert_eq!(blocks_to_text(&blocks), "Hello World");
    }

    #[test]
    fn test_blocks_to_text_single_block_multi_line() {
        let blocks = vec![TextBlock {
            lines: vec![
                word_line("Line1", 10.0, 100.0, 50.0, 112.0),
                word_line("Line2", 10.0, 115.0, 50.0, 127.0),
            ],
            bbox: BBox::new(10.0, 100.0, 50.0, 127.0),
        }];
        assert_eq!(blocks_to_text(&blocks), "Line1\nLine2");
    }

    #[test]
    fn test_blocks_to_text_two_blocks() {
        let blocks = vec![
            word_block("Block1", 10.0, 100.0, 60.0, 112.0),
            word_block("Block2", 10.0, 200.0, 60.0, 212.0),
        ];
        assert_eq!(blocks_to_text(&blocks), "Block1\n\nBlock2");
    }

    #[test]
    fn test_blocks_to_text_empty() {
        assert_eq!(blocks_to_text(&[]), "");
    }

    // --- words_to_text ---

    #[test]
    fn test_words_to_text_single_line() {
        let input = vec![
            make_word("Hello", 10.0, 100.0, 50.0, 112.0),
            make_word("World", 55.0, 100.0, 95.0, 112.0),
        ];
        assert_eq!(words_to_text(&input, 3.0), "Hello World");
    }

    #[test]
    fn test_words_to_text_multi_line() {
        let input = vec![
            make_word("Line1", 10.0, 100.0, 50.0, 112.0),
            make_word("Line2", 10.0, 120.0, 50.0, 132.0),
        ];
        assert_eq!(words_to_text(&input, 3.0), "Line1\nLine2");
    }

    #[test]
    fn test_words_to_text_empty() {
        assert_eq!(words_to_text(&[], 3.0), "");
    }

    // --- split_lines_at_columns ---

    #[test]
    fn test_split_lines_no_columns() {
        let input = vec![TextLine {
            words: vec![
                make_word("Hello", 10.0, 100.0, 50.0, 112.0),
                make_word("World", 55.0, 100.0, 95.0, 112.0),
            ],
            bbox: BBox::new(10.0, 100.0, 95.0, 112.0),
        }];
        // The 5pt gap is below the 50pt density, so nothing is split.
        let segments = split_lines_at_columns(input, 50.0);
        assert_eq!(segments.len(), 1);
    }

    #[test]
    fn test_split_lines_with_column_gap() {
        let input = vec![TextLine {
            words: vec![
                make_word("Left", 10.0, 100.0, 50.0, 112.0),
                make_word("Right", 200.0, 100.0, 250.0, 112.0),
            ],
            bbox: BBox::new(10.0, 100.0, 250.0, 112.0),
        }];
        let segments = split_lines_at_columns(input, 10.0);
        assert_eq!(segments.len(), 2);
        assert_eq!(segments[0].words[0].text, "Left");
        assert_eq!(segments[1].words[0].text, "Right");
    }

    #[test]
    fn test_split_lines_single_word_line() {
        let input = vec![word_line("Only", 10.0, 100.0, 50.0, 112.0)];
        let segments = split_lines_at_columns(input, 10.0);
        assert_eq!(segments.len(), 1);
    }

    // --- End-to-end layout tests ---

    #[test]
    fn test_end_to_end_single_column() {
        // Two paragraphs stacked in one column.
        let input = vec![
            make_word("Para1", 10.0, 100.0, 50.0, 112.0),
            make_word("line1", 55.0, 100.0, 90.0, 112.0),
            make_word("Para1", 10.0, 115.0, 50.0, 127.0),
            make_word("line2", 55.0, 115.0, 90.0, 127.0),
            // Large vertical gap before the second paragraph.
            make_word("Para2", 10.0, 200.0, 50.0, 212.0),
            make_word("line1", 55.0, 200.0, 90.0, 212.0),
        ];
        assert_eq!(
            layout_text(&input),
            "Para1 line1\nPara1 line2\n\nPara2 line1"
        );
    }

    #[test]
    fn test_end_to_end_two_column_layout() {
        // Left column spans x=10..60, right column x=200..260; two lines each.
        let input = vec![
            // Left column
            make_word("Left", 10.0, 100.0, 40.0, 112.0),
            make_word("L1", 45.0, 100.0, 60.0, 112.0),
            make_word("Left", 10.0, 115.0, 40.0, 127.0),
            make_word("L2", 45.0, 115.0, 60.0, 127.0),
            // Right column
            make_word("Right", 200.0, 100.0, 240.0, 112.0),
            make_word("R1", 245.0, 100.0, 260.0, 112.0),
            make_word("Right", 200.0, 115.0, 240.0, 127.0),
            make_word("R2", 245.0, 115.0, 260.0, 127.0),
        ];
        // Both column blocks start at top=100, so the left one comes first.
        assert_eq!(
            layout_text(&input),
            "Left L1\nLeft L2\n\nRight R1\nRight R2"
        );
    }

    #[test]
    fn test_end_to_end_mixed_blocks() {
        // Header, then a left/right pair on one y-level, then a footer.
        let input = vec![
            make_word("Header", 10.0, 50.0, 100.0, 62.0),
            make_word("Left", 10.0, 100.0, 50.0, 112.0),
            make_word("Right", 200.0, 100.0, 250.0, 112.0),
            make_word("Footer", 10.0, 250.0, 100.0, 262.0),
        ];
        assert_eq!(layout_text(&input), "Header\n\nLeft\n\nRight\n\nFooter");
    }

    #[test]
    fn test_reading_order_top_to_bottom_left_to_right() {
        // Words are supplied out of order; output must follow page order.
        let input = vec![
            make_word("C", 10.0, 300.0, 50.0, 312.0),
            make_word("A", 10.0, 100.0, 50.0, 112.0),
            make_word("B", 10.0, 200.0, 50.0, 212.0),
        ];
        assert_eq!(layout_text(&input), "A\n\nB\n\nC");
    }
}