// File: pdfplumber_core/layout.rs

1use crate::geometry::BBox;
2use crate::words::Word;
3
4/// A text line: a sequence of words on the same y-level.
5#[derive(Debug, Clone, PartialEq)]
6#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
7pub struct TextLine {
8    /// Words in this line, sorted left-to-right.
9    pub words: Vec<Word>,
10    /// Bounding box of this line.
11    pub bbox: BBox,
12}
13
14/// A text block: a group of lines forming a coherent paragraph or section.
15#[derive(Debug, Clone, PartialEq)]
16#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
17pub struct TextBlock {
18    /// Lines in this block, sorted top-to-bottom.
19    pub lines: Vec<TextLine>,
20    /// Bounding box of this block.
21    pub bbox: BBox,
22}
23
/// Options for layout-aware text extraction.
///
/// Derives `PartialEq` and (behind the `serde` feature) the serde traits so
/// that option values can be compared and persisted just like `TextLine` and
/// `TextBlock`.
#[derive(Debug, Clone, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct TextOptions {
    /// If true, use layout-aware extraction (detect blocks and reading order).
    /// If false, simple concatenation of words by spatial order.
    pub layout: bool,
    /// Vertical tolerance for clustering words into the same line (in points).
    pub y_tolerance: f64,
    /// Maximum vertical gap between lines to group into the same block (in points).
    pub y_density: f64,
    /// Minimum horizontal gap to detect column boundaries (in points).
    pub x_density: f64,
}
37
38impl Default for TextOptions {
39    fn default() -> Self {
40        Self {
41            layout: false,
42            y_tolerance: 3.0,
43            y_density: 10.0,
44            x_density: 10.0,
45        }
46    }
47}
48
49/// Cluster words into text lines based on y-proximity.
50///
51/// Words whose vertical midpoints are within `y_tolerance` of a line's
52/// vertical midpoint are grouped into the same line. Words within each
53/// line are sorted left-to-right.
54pub fn cluster_words_into_lines(words: &[Word], y_tolerance: f64) -> Vec<TextLine> {
55    if words.is_empty() {
56        return Vec::new();
57    }
58
59    let mut sorted: Vec<&Word> = words.iter().collect();
60    sorted.sort_by(|a, b| {
61        a.bbox
62            .top
63            .partial_cmp(&b.bbox.top)
64            .unwrap()
65            .then(a.bbox.x0.partial_cmp(&b.bbox.x0).unwrap())
66    });
67
68    let mut lines: Vec<TextLine> = Vec::new();
69
70    for word in sorted {
71        let word_mid_y = (word.bbox.top + word.bbox.bottom) / 2.0;
72
73        // Try to find an existing line this word belongs to
74        let mut found = false;
75        for line in &mut lines {
76            let line_mid_y = (line.bbox.top + line.bbox.bottom) / 2.0;
77            if (word_mid_y - line_mid_y).abs() <= y_tolerance {
78                line.bbox = line.bbox.union(&word.bbox);
79                line.words.push(word.clone());
80                found = true;
81                break;
82            }
83        }
84
85        if !found {
86            lines.push(TextLine {
87                words: vec![word.clone()],
88                bbox: word.bbox,
89            });
90        }
91    }
92
93    // Sort words within each line left-to-right
94    for line in &mut lines {
95        line.words
96            .sort_by(|a, b| a.bbox.x0.partial_cmp(&b.bbox.x0).unwrap());
97    }
98
99    // Sort lines top-to-bottom
100    lines.sort_by(|a, b| a.bbox.top.partial_cmp(&b.bbox.top).unwrap());
101
102    lines
103}
104
105/// Split text lines at large horizontal gaps to detect column boundaries.
106///
107/// Within each line, if consecutive words have a gap larger than `x_density`,
108/// the line is split into separate line segments (one per column).
109pub fn split_lines_at_columns(lines: Vec<TextLine>, x_density: f64) -> Vec<TextLine> {
110    let mut result = Vec::new();
111    for line in lines {
112        if line.words.len() <= 1 {
113            result.push(line);
114            continue;
115        }
116
117        let mut current_words = vec![line.words[0].clone()];
118        let mut current_bbox = line.words[0].bbox;
119
120        for word in line.words.iter().skip(1) {
121            let gap = word.bbox.x0 - current_bbox.x1;
122            if gap > x_density {
123                result.push(TextLine {
124                    words: current_words,
125                    bbox: current_bbox,
126                });
127                current_words = vec![word.clone()];
128                current_bbox = word.bbox;
129            } else {
130                current_bbox = current_bbox.union(&word.bbox);
131                current_words.push(word.clone());
132            }
133        }
134
135        result.push(TextLine {
136            words: current_words,
137            bbox: current_bbox,
138        });
139    }
140
141    // Re-sort by (top, x0) after splitting
142    result.sort_by(|a, b| {
143        a.bbox
144            .top
145            .partial_cmp(&b.bbox.top)
146            .unwrap()
147            .then(a.bbox.x0.partial_cmp(&b.bbox.x0).unwrap())
148    });
149
150    result
151}
152
153/// Cluster text line segments into text blocks based on x-overlap and vertical proximity.
154///
155/// Line segments that vertically follow each other (gap <= `y_density`) and
156/// have overlapping x-ranges are grouped into the same block.
157pub fn cluster_lines_into_blocks(lines: Vec<TextLine>, y_density: f64) -> Vec<TextBlock> {
158    if lines.is_empty() {
159        return Vec::new();
160    }
161
162    let mut blocks: Vec<TextBlock> = Vec::new();
163
164    for line in lines {
165        // Find the best matching block: closest vertically, with x-overlap
166        let mut best_block: Option<usize> = None;
167        let mut best_gap = f64::INFINITY;
168
169        for (i, block) in blocks.iter().enumerate() {
170            let gap = line.bbox.top - block.bbox.bottom;
171            if gap >= 0.0
172                && gap <= y_density
173                && has_x_overlap(&line.bbox, &block.bbox)
174                && gap < best_gap
175            {
176                best_gap = gap;
177                best_block = Some(i);
178            }
179        }
180
181        if let Some(idx) = best_block {
182            blocks[idx].bbox = blocks[idx].bbox.union(&line.bbox);
183            blocks[idx].lines.push(line);
184        } else {
185            blocks.push(TextBlock {
186                bbox: line.bbox,
187                lines: vec![line],
188            });
189        }
190    }
191
192    // Sort lines within each block top-to-bottom
193    for block in &mut blocks {
194        block
195            .lines
196            .sort_by(|a, b| a.bbox.top.partial_cmp(&b.bbox.top).unwrap());
197    }
198
199    blocks
200}
201
202/// Check if two bounding boxes overlap horizontally.
203fn has_x_overlap(a: &BBox, b: &BBox) -> bool {
204    a.x0 < b.x1 && b.x0 < a.x1
205}
206
207/// Sort text blocks in natural reading order.
208///
209/// Sorts blocks by top position first, then by x0 within the same vertical band.
210/// This produces left-to-right, top-to-bottom reading order.
211pub fn sort_blocks_reading_order(blocks: &mut [TextBlock], _x_density: f64) {
212    blocks.sort_by(|a, b| {
213        a.bbox
214            .top
215            .partial_cmp(&b.bbox.top)
216            .unwrap()
217            .then(a.bbox.x0.partial_cmp(&b.bbox.x0).unwrap())
218    });
219}
220
221/// Convert text blocks into a string.
222///
223/// Words within a line are joined by spaces.
224/// Lines within a block are joined by newlines.
225/// Blocks are separated by double newlines.
226pub fn blocks_to_text(blocks: &[TextBlock]) -> String {
227    blocks
228        .iter()
229        .map(|block| {
230            block
231                .lines
232                .iter()
233                .map(|line| {
234                    line.words
235                        .iter()
236                        .map(|w| w.text.as_str())
237                        .collect::<Vec<_>>()
238                        .join(" ")
239                })
240                .collect::<Vec<_>>()
241                .join("\n")
242        })
243        .collect::<Vec<_>>()
244        .join("\n\n")
245}
246
247/// Simple (non-layout) text extraction from words.
248///
249/// Clusters words into lines by y-proximity, then joins with spaces/newlines.
250pub fn words_to_text(words: &[Word], y_tolerance: f64) -> String {
251    let lines = cluster_words_into_lines(words, y_tolerance);
252    lines
253        .iter()
254        .map(|line| {
255            line.words
256                .iter()
257                .map(|w| w.text.as_str())
258                .collect::<Vec<_>>()
259                .join(" ")
260        })
261        .collect::<Vec<_>>()
262        .join("\n")
263}
264
#[cfg(test)]
mod tests {
    use super::*;
    use crate::text::Char;

    // --- Fixtures ---

    /// Builds a `Word` carrying one `Char` entry that mirrors its text and box.
    fn make_word(text: &str, x0: f64, top: f64, x1: f64, bottom: f64) -> Word {
        let glyph = Char {
            text: text.to_string(),
            bbox: BBox::new(x0, top, x1, bottom),
            fontname: "TestFont".to_string(),
            size: 12.0,
            doctop: top,
            upright: true,
            direction: crate::text::TextDirection::Ltr,
            stroking_color: None,
            non_stroking_color: None,
            ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
            char_code: 0,
        };
        Word {
            text: text.to_string(),
            bbox: BBox::new(x0, top, x1, bottom),
            doctop: top,
            direction: crate::text::TextDirection::Ltr,
            chars: vec![glyph],
        }
    }

    /// A line holding exactly one word and sharing that word's box.
    fn one_word_line(text: &str, x0: f64, top: f64, x1: f64, bottom: f64) -> TextLine {
        TextLine {
            words: vec![make_word(text, x0, top, x1, bottom)],
            bbox: BBox::new(x0, top, x1, bottom),
        }
    }

    /// A block holding exactly one single-word line and sharing its box.
    fn one_line_block(text: &str, x0: f64, top: f64, x1: f64, bottom: f64) -> TextBlock {
        TextBlock {
            lines: vec![one_word_line(text, x0, top, x1, bottom)],
            bbox: BBox::new(x0, top, x1, bottom),
        }
    }

    /// Text of the first word of the first line of every block.
    fn leading_texts(blocks: &[TextBlock]) -> Vec<&str> {
        blocks
            .iter()
            .map(|block| block.lines[0].words[0].text.as_str())
            .collect()
    }

    /// Runs the full layout pipeline with the thresholds used throughout these tests.
    fn layout_text(words: &[Word]) -> String {
        let lines = cluster_words_into_lines(words, 3.0);
        let segments = split_lines_at_columns(lines, 10.0);
        let mut blocks = cluster_lines_into_blocks(segments, 10.0);
        sort_blocks_reading_order(&mut blocks, 10.0);
        blocks_to_text(&blocks)
    }

    // --- TextOptions ---

    #[test]
    fn test_text_options_default() {
        let TextOptions {
            layout,
            y_tolerance,
            y_density,
            x_density,
        } = TextOptions::default();
        assert!(!layout);
        assert_eq!(y_tolerance, 3.0);
        assert_eq!(y_density, 10.0);
        assert_eq!(x_density, 10.0);
    }

    #[test]
    fn test_text_options_layout_true() {
        let opts = TextOptions {
            layout: true,
            ..TextOptions::default()
        };
        assert!(opts.layout);
    }

    // --- cluster_words_into_lines ---

    #[test]
    fn test_cluster_empty_words() {
        assert!(cluster_words_into_lines(&[], 3.0).is_empty());
    }

    #[test]
    fn test_cluster_single_word() {
        let words = vec![make_word("Hello", 10.0, 100.0, 50.0, 112.0)];
        let lines = cluster_words_into_lines(&words, 3.0);
        assert_eq!(lines.len(), 1);
        let only = &lines[0];
        assert_eq!(only.words.len(), 1);
        assert_eq!(only.words[0].text, "Hello");
        assert_eq!(only.bbox, BBox::new(10.0, 100.0, 50.0, 112.0));
    }

    #[test]
    fn test_cluster_words_same_line() {
        let words = vec![
            make_word("Hello", 10.0, 100.0, 50.0, 112.0),
            make_word("World", 55.0, 100.0, 95.0, 112.0),
        ];
        let lines = cluster_words_into_lines(&words, 3.0);
        assert_eq!(lines.len(), 1);
        let only = &lines[0];
        assert_eq!(only.words.len(), 2);
        assert_eq!(only.words[0].text, "Hello");
        assert_eq!(only.words[1].text, "World");
    }

    #[test]
    fn test_cluster_words_different_lines() {
        let words = vec![
            make_word("Line1", 10.0, 100.0, 50.0, 112.0),
            make_word("Line2", 10.0, 120.0, 50.0, 132.0),
        ];
        let lines = cluster_words_into_lines(&words, 3.0);
        assert_eq!(lines.len(), 2);
        assert_eq!(lines[0].words[0].text, "Line1");
        assert_eq!(lines[1].words[0].text, "Line2");
    }

    #[test]
    fn test_cluster_words_slight_y_variation() {
        // The second word sits 1pt lower, well inside the 3pt tolerance.
        let words = vec![
            make_word("Hello", 10.0, 100.0, 50.0, 112.0),
            make_word("World", 55.0, 101.0, 95.0, 113.0),
        ];
        let lines = cluster_words_into_lines(&words, 3.0);
        assert_eq!(lines.len(), 1);
        assert_eq!(lines[0].words.len(), 2);
    }

    #[test]
    fn test_cluster_words_sorted_left_to_right_within_line() {
        // Input deliberately lists the right-hand word first.
        let words = vec![
            make_word("World", 55.0, 100.0, 95.0, 112.0),
            make_word("Hello", 10.0, 100.0, 50.0, 112.0),
        ];
        let lines = cluster_words_into_lines(&words, 3.0);
        let first_line = &lines[0];
        assert_eq!(first_line.words[0].text, "Hello");
        assert_eq!(first_line.words[1].text, "World");
    }

    #[test]
    fn test_cluster_three_lines() {
        let words = vec![
            make_word("First", 10.0, 100.0, 50.0, 112.0),
            make_word("line", 55.0, 100.0, 85.0, 112.0),
            make_word("Second", 10.0, 120.0, 60.0, 132.0),
            make_word("line", 65.0, 120.0, 95.0, 132.0),
            make_word("Third", 10.0, 140.0, 50.0, 152.0),
            make_word("line", 55.0, 140.0, 85.0, 152.0),
        ];
        let lines = cluster_words_into_lines(&words, 3.0);
        let words_per_line: Vec<usize> = lines.iter().map(|line| line.words.len()).collect();
        assert_eq!(words_per_line, vec![2, 2, 2]);
    }

    #[test]
    fn test_cluster_line_bbox_is_union() {
        let words = vec![
            make_word("A", 10.0, 98.0, 20.0, 112.0),
            make_word("B", 25.0, 100.0, 35.0, 110.0),
        ];
        let lines = cluster_words_into_lines(&words, 3.0);
        assert_eq!(lines[0].bbox, BBox::new(10.0, 98.0, 35.0, 112.0));
    }

    // --- cluster_lines_into_blocks ---

    #[test]
    fn test_cluster_lines_empty() {
        assert!(cluster_lines_into_blocks(vec![], 10.0).is_empty());
    }

    #[test]
    fn test_cluster_lines_single_block() {
        let lines = vec![
            one_word_line("Line1", 10.0, 100.0, 50.0, 112.0),
            one_word_line("Line2", 10.0, 115.0, 50.0, 127.0),
        ];
        let blocks = cluster_lines_into_blocks(lines, 10.0);
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].lines.len(), 2);
        assert_eq!(blocks[0].bbox, BBox::new(10.0, 100.0, 50.0, 127.0));
    }

    #[test]
    fn test_cluster_lines_two_blocks() {
        let lines = vec![
            one_word_line("Block1", 10.0, 100.0, 60.0, 112.0),
            one_word_line("Still1", 10.0, 115.0, 60.0, 127.0),
            // 127 -> 200 is a 73pt gap, far beyond the 10pt density.
            one_word_line("Block2", 10.0, 200.0, 60.0, 212.0),
        ];
        let blocks = cluster_lines_into_blocks(lines, 10.0);
        let lines_per_block: Vec<usize> = blocks.iter().map(|block| block.lines.len()).collect();
        assert_eq!(lines_per_block, vec![2, 1]);
    }

    #[test]
    fn test_cluster_lines_block_bbox() {
        let lines = vec![
            one_word_line("Line1", 10.0, 100.0, 80.0, 112.0),
            one_word_line("Line2", 5.0, 115.0, 90.0, 127.0),
        ];
        let blocks = cluster_lines_into_blocks(lines, 10.0);
        assert_eq!(blocks[0].bbox, BBox::new(5.0, 100.0, 90.0, 127.0));
    }

    // --- sort_blocks_reading_order ---

    #[test]
    fn test_sort_single_column_top_to_bottom() {
        let mut blocks = vec![
            one_line_block("Second", 10.0, 200.0, 60.0, 212.0),
            one_line_block("First", 10.0, 100.0, 60.0, 112.0),
        ];
        sort_blocks_reading_order(&mut blocks, 10.0);
        assert_eq!(leading_texts(&blocks), vec!["First", "Second"]);
    }

    #[test]
    fn test_sort_two_columns() {
        // Left column spans x=10..100, right column x=200..300; the expected
        // order is by (top, x0).
        let mut blocks = vec![
            one_line_block("Right1", 200.0, 100.0, 300.0, 112.0),
            one_line_block("Left1", 10.0, 100.0, 100.0, 112.0),
            one_line_block("Right2", 200.0, 200.0, 300.0, 212.0),
            one_line_block("Left2", 10.0, 200.0, 100.0, 212.0),
        ];
        sort_blocks_reading_order(&mut blocks, 10.0);
        // Top-to-bottom first, left-to-right among blocks on the same y-level.
        assert_eq!(
            leading_texts(&blocks),
            vec!["Left1", "Right1", "Left2", "Right2"]
        );
    }

    #[test]
    fn test_sort_single_block_unchanged() {
        let mut blocks = vec![one_line_block("Only", 10.0, 100.0, 50.0, 112.0)];
        sort_blocks_reading_order(&mut blocks, 10.0);
        assert_eq!(blocks[0].lines[0].words[0].text, "Only");
    }

    // --- blocks_to_text ---

    #[test]
    fn test_blocks_to_text_single_block_single_line() {
        let line = TextLine {
            words: vec![
                make_word("Hello", 10.0, 100.0, 50.0, 112.0),
                make_word("World", 55.0, 100.0, 95.0, 112.0),
            ],
            bbox: BBox::new(10.0, 100.0, 95.0, 112.0),
        };
        let blocks = vec![TextBlock {
            lines: vec![line],
            bbox: BBox::new(10.0, 100.0, 95.0, 112.0),
        }];
        assert_eq!(blocks_to_text(&blocks), "Hello World");
    }

    #[test]
    fn test_blocks_to_text_single_block_multi_line() {
        let blocks = vec![TextBlock {
            lines: vec![
                one_word_line("Line1", 10.0, 100.0, 50.0, 112.0),
                one_word_line("Line2", 10.0, 115.0, 50.0, 127.0),
            ],
            bbox: BBox::new(10.0, 100.0, 50.0, 127.0),
        }];
        assert_eq!(blocks_to_text(&blocks), "Line1\nLine2");
    }

    #[test]
    fn test_blocks_to_text_two_blocks() {
        let blocks = vec![
            one_line_block("Block1", 10.0, 100.0, 60.0, 112.0),
            one_line_block("Block2", 10.0, 200.0, 60.0, 212.0),
        ];
        assert_eq!(blocks_to_text(&blocks), "Block1\n\nBlock2");
    }

    #[test]
    fn test_blocks_to_text_empty() {
        assert_eq!(blocks_to_text(&[]), "");
    }

    // --- words_to_text ---

    #[test]
    fn test_words_to_text_single_line() {
        let words = vec![
            make_word("Hello", 10.0, 100.0, 50.0, 112.0),
            make_word("World", 55.0, 100.0, 95.0, 112.0),
        ];
        assert_eq!(words_to_text(&words, 3.0), "Hello World");
    }

    #[test]
    fn test_words_to_text_multi_line() {
        let words = vec![
            make_word("Line1", 10.0, 100.0, 50.0, 112.0),
            make_word("Line2", 10.0, 120.0, 50.0, 132.0),
        ];
        assert_eq!(words_to_text(&words, 3.0), "Line1\nLine2");
    }

    #[test]
    fn test_words_to_text_empty() {
        assert_eq!(words_to_text(&[], 3.0), "");
    }

    // --- split_lines_at_columns ---

    #[test]
    fn test_split_lines_no_columns() {
        let lines = vec![TextLine {
            words: vec![
                make_word("Hello", 10.0, 100.0, 50.0, 112.0),
                make_word("World", 55.0, 100.0, 95.0, 112.0),
            ],
            bbox: BBox::new(10.0, 100.0, 95.0, 112.0),
        }];
        // The 5pt gap stays below the 50pt threshold, so nothing is split.
        let result = split_lines_at_columns(lines, 50.0);
        assert_eq!(result.len(), 1);
    }

    #[test]
    fn test_split_lines_with_column_gap() {
        let lines = vec![TextLine {
            words: vec![
                make_word("Left", 10.0, 100.0, 50.0, 112.0),
                make_word("Right", 200.0, 100.0, 250.0, 112.0),
            ],
            bbox: BBox::new(10.0, 100.0, 250.0, 112.0),
        }];
        let result = split_lines_at_columns(lines, 10.0);
        assert_eq!(result.len(), 2);
        assert_eq!(result[0].words[0].text, "Left");
        assert_eq!(result[1].words[0].text, "Right");
    }

    #[test]
    fn test_split_lines_single_word_line() {
        let lines = vec![one_word_line("Only", 10.0, 100.0, 50.0, 112.0)];
        let result = split_lines_at_columns(lines, 10.0);
        assert_eq!(result.len(), 1);
    }

    // --- End-to-end layout tests ---

    #[test]
    fn test_end_to_end_single_column() {
        // Two paragraphs stacked in one column, separated by a large gap.
        let words = vec![
            make_word("Para1", 10.0, 100.0, 50.0, 112.0),
            make_word("line1", 55.0, 100.0, 90.0, 112.0),
            make_word("Para1", 10.0, 115.0, 50.0, 127.0),
            make_word("line2", 55.0, 115.0, 90.0, 127.0),
            make_word("Para2", 10.0, 200.0, 50.0, 212.0),
            make_word("line1", 55.0, 200.0, 90.0, 212.0),
        ];
        assert_eq!(
            layout_text(&words),
            "Para1 line1\nPara1 line2\n\nPara2 line1"
        );
    }

    #[test]
    fn test_end_to_end_two_column_layout() {
        // Left column at x=10..60 and right column at x=200..260, two lines each.
        let words = vec![
            make_word("Left", 10.0, 100.0, 40.0, 112.0),
            make_word("L1", 45.0, 100.0, 60.0, 112.0),
            make_word("Left", 10.0, 115.0, 40.0, 127.0),
            make_word("L2", 45.0, 115.0, 60.0, 127.0),
            make_word("Right", 200.0, 100.0, 240.0, 112.0),
            make_word("R1", 245.0, 100.0, 260.0, 112.0),
            make_word("Right", 200.0, 115.0, 240.0, 127.0),
            make_word("R2", 245.0, 115.0, 260.0, 127.0),
        ];
        // Both column blocks start at top=100, so the left one comes first.
        assert_eq!(
            layout_text(&words),
            "Left L1\nLeft L2\n\nRight R1\nRight R2"
        );
    }

    #[test]
    fn test_end_to_end_mixed_blocks() {
        // A header, one row split into two columns, then a footer.
        let words = vec![
            make_word("Header", 10.0, 50.0, 100.0, 62.0),
            make_word("Left", 10.0, 100.0, 50.0, 112.0),
            make_word("Right", 200.0, 100.0, 250.0, 112.0),
            make_word("Footer", 10.0, 250.0, 100.0, 262.0),
        ];
        assert_eq!(layout_text(&words), "Header\n\nLeft\n\nRight\n\nFooter");
    }

    #[test]
    fn test_reading_order_top_to_bottom_left_to_right() {
        // Words are supplied out of order; the output must follow page order.
        let words = vec![
            make_word("C", 10.0, 300.0, 50.0, 312.0),
            make_word("A", 10.0, 100.0, 50.0, 112.0),
            make_word("B", 10.0, 200.0, 50.0, 212.0),
        ];
        assert_eq!(layout_text(&words), "A\n\nB\n\nC");
    }
}