Skip to main content

bookforge_pdf/
reconstruct.rs

1//! Deterministic layout reconstruction: fragments → lines → columns →
2//! reading order → paragraphs/headings.
3//!
4//! The heuristics here are deliberately simple and inspectable. They are
5//! tuned for the two layouts that matter first (ROADMAP §9b.1):
6//! single-column books and two-column scientific papers. Anything the
7//! heuristics get wrong shows up in the conversion report as a per-page
8//! coverage gap, never as silently dropped text.
9
10use std::collections::HashMap;
11
12use crate::model::{ColumnMode, DocBlock, Fragment, Line, Page, Span};
13
14/// Per-page reconstruction diagnostics for the conversion report.
15#[derive(Debug, Clone, serde::Serialize)]
16pub struct PageStats {
17    pub page: u32,
18    pub lines: usize,
19    pub chars: usize,
20    pub two_column: bool,
21}
22
23pub struct Reconstruction {
24    pub blocks: Vec<DocBlock>,
25    pub pages: Vec<PageStats>,
26}
27
28pub fn reconstruct(pages: &[Page], columns: ColumnMode) -> Reconstruction {
29    let body_size = body_font_size(pages);
30    let heading_levels = heading_levels(pages, body_size);
31
32    let mut blocks: Vec<DocBlock> = Vec::new();
33    let mut stats = Vec::new();
34
35    for page in pages {
36        let lines = merge_fragments_into_lines(page);
37        let two_column = match columns {
38            ColumnMode::Single => false,
39            ColumnMode::Two => true,
40            ColumnMode::Auto => detect_two_columns(page, &lines),
41        };
42        let ordered = if two_column {
43            order_two_column(page, &lines)
44        } else {
45            let mut ordered = lines.clone();
46            ordered.sort_by_key(|line| (line.top, line.left));
47            ordered
48        };
49
50        stats.push(PageStats {
51            page: page.number,
52            lines: ordered.len(),
53            chars: ordered.iter().map(Line::char_count).sum(),
54            two_column,
55        });
56
57        let page_blocks = cluster_paragraphs(&ordered, body_size, &heading_levels);
58        append_with_continuation(&mut blocks, page_blocks);
59    }
60
61    Reconstruction {
62        blocks,
63        pages: stats,
64    }
65}
66
67/// Group fragments that share a baseline into one visual line, joining
68/// fragment gaps with a space when they are visually separated.
69fn merge_fragments_into_lines(page: &Page) -> Vec<Line> {
70    // poppler reports rotated text (arXiv margin watermarks, sidebar
71    // decorations) with zero width. It is not part of the reading flow
72    // and, left in, it skews the column-midline estimate.
73    let mut fragments: Vec<&Fragment> = page
74        .fragments
75        .iter()
76        .filter(|fragment| fragment.width > 0)
77        .collect();
78    fragments.sort_by_key(|fragment| (fragment.top, fragment.left));
79
80    let mut lines: Vec<Line> = Vec::new();
81    for fragment in fragments {
82        let size = page
83            .font_sizes
84            .get(&fragment.font)
85            .copied()
86            .unwrap_or(fragment.height.unsigned_abs());
87        let same_line = lines.last().is_some_and(|line| {
88            let tolerance = (line.height.max(fragment.height) as f32 * 0.5) as i32;
89            // The gap cap keeps same-baseline lines in *different columns*
90            // from merging. Style-split fragments sit flush and justified
91            // word gaps stay under ~1em, while column gutters run wider
92            // than the line height — observed as low as ~1.7× height on
93            // two-column papers, so the cap must stay below that.
94            let max_join_gap = line.height.max(fragment.height) * 5 / 4;
95            (fragment.top - line.top).abs() <= tolerance
96                && fragment.left >= line.right - 2
97                && fragment.left - line.right <= max_join_gap
98        });
99
100        if same_line {
101            let line = lines.last_mut().expect("checked above");
102            let gap = fragment.left - line.right;
103            if gap > 1 && !line.spans.last().is_some_and(|s| s.text.ends_with(' ')) {
104                push_joined(&mut line.spans, " ", false, false);
105            }
106            for span in &fragment.spans {
107                push_joined(&mut line.spans, &span.text, span.bold, span.italic);
108            }
109            line.right = line.right.max(fragment.right());
110            line.height = line.height.max(fragment.height);
111            line.font_size = line.font_size.max(size);
112        } else {
113            lines.push(Line {
114                top: fragment.top,
115                left: fragment.left,
116                right: fragment.right(),
117                height: fragment.height,
118                font_size: size,
119                spans: fragment.spans.clone(),
120            });
121        }
122    }
123
124    lines.retain(|line| !line.text().trim().is_empty());
125    lines
126}
127
128fn push_joined(spans: &mut Vec<Span>, text: &str, bold: bool, italic: bool) {
129    if text.is_empty() {
130        return;
131    }
132    if let Some(last) = spans.last_mut()
133        && last.bold == bold
134        && last.italic == italic
135    {
136        last.text.push_str(text);
137        return;
138    }
139    spans.push(Span {
140        text: text.to_string(),
141        bold,
142        italic,
143    });
144}
145
146/// A page is two-column when most non-full-width lines sit entirely in
147/// the left or right half with a clear gutter and few lines crossing it.
148fn detect_two_columns(page: &Page, lines: &[Line]) -> bool {
149    if lines.len() < 8 {
150        return false;
151    }
152    let content_left = lines.iter().map(|line| line.left).min().unwrap_or(0);
153    let content_right = lines
154        .iter()
155        .map(|line| line.right)
156        .max()
157        .unwrap_or(page.width);
158    let content_width = (content_right - content_left).max(1);
159    let mid = content_left + content_width / 2;
160    let slack = content_width / 20;
161
162    let column_lines: Vec<&Line> = lines
163        .iter()
164        .filter(|line| line.width() < (content_width as f32 * 0.62) as i32)
165        .collect();
166    if column_lines.len() < 6 {
167        return false;
168    }
169
170    let left = column_lines
171        .iter()
172        .filter(|line| line.right <= mid + slack)
173        .count();
174    let right = column_lines
175        .iter()
176        .filter(|line| line.left >= mid - slack)
177        .count();
178    let crossing = column_lines.len().saturating_sub(left + right);
179
180    left >= 3 && right >= 3 && crossing * 10 <= column_lines.len()
181}
182
183/// Reading order for a two-column page: full-width lines act as band
184/// separators (title, abstract, figure rows); within each band the left
185/// column is read before the right.
186fn order_two_column(page: &Page, lines: &[Line]) -> Vec<Line> {
187    let content_left = lines.iter().map(|line| line.left).min().unwrap_or(0);
188    let content_right = lines
189        .iter()
190        .map(|line| line.right)
191        .max()
192        .unwrap_or(page.width);
193    let content_width = (content_right - content_left).max(1);
194    let mid = content_left + content_width / 2;
195
196    let mut sorted: Vec<&Line> = lines.iter().collect();
197    sorted.sort_by_key(|line| (line.top, line.left));
198
199    let mut ordered = Vec::with_capacity(lines.len());
200    let mut band_left: Vec<&Line> = Vec::new();
201    let mut band_right: Vec<&Line> = Vec::new();
202
203    let flush =
204        |ordered: &mut Vec<Line>, band_left: &mut Vec<&Line>, band_right: &mut Vec<&Line>| {
205            for line in band_left.drain(..) {
206                ordered.push(line.clone());
207            }
208            for line in band_right.drain(..) {
209                ordered.push(line.clone());
210            }
211        };
212
213    for line in sorted {
214        if line.width() >= (content_width as f32 * 0.62) as i32 {
215            flush(&mut ordered, &mut band_left, &mut band_right);
216            ordered.push(line.clone());
217        } else if line.left + line.width() / 2 <= mid {
218            band_left.push(line);
219        } else {
220            band_right.push(line);
221        }
222    }
223    flush(&mut ordered, &mut band_left, &mut band_right);
224
225    ordered
226}
227
228/// The dominant body font size, weighted by character volume.
229fn body_font_size(pages: &[Page]) -> u32 {
230    let mut weights: HashMap<u32, usize> = HashMap::new();
231    for page in pages {
232        for fragment in &page.fragments {
233            let size = page
234                .font_sizes
235                .get(&fragment.font)
236                .copied()
237                .unwrap_or(fragment.height.unsigned_abs());
238            *weights.entry(size).or_default() += fragment.char_count();
239        }
240    }
241    weights
242        .into_iter()
243        .max_by_key(|(_, chars)| *chars)
244        .map(|(size, _)| size)
245        .unwrap_or(12)
246}
247
248/// Distinct font sizes clearly larger than body text, ranked largest
249/// first, mapped to heading levels h1..h3.
250fn heading_levels(pages: &[Page], body_size: u32) -> HashMap<u32, u8> {
251    let mut sizes: Vec<u32> = pages
252        .iter()
253        .flat_map(|page| page.font_sizes.values().copied())
254        .filter(|size| *size as f32 >= body_size as f32 * 1.15 && *size >= body_size + 2)
255        .collect();
256    sizes.sort_unstable_by(|a, b| b.cmp(a));
257    sizes.dedup();
258    sizes
259        .into_iter()
260        .take(3)
261        .enumerate()
262        .map(|(rank, size)| (size, rank as u8 + 1))
263        .collect()
264}
265
266/// Cluster ordered lines into paragraphs and headings.
267fn cluster_paragraphs(
268    lines: &[Line],
269    body_size: u32,
270    heading_levels: &HashMap<u32, u8>,
271) -> Vec<DocBlock> {
272    let median_gap = median_line_gap(lines);
273    let mut blocks = Vec::new();
274    let mut current: Vec<Span> = Vec::new();
275    let mut current_heading: Option<u8> = None;
276    let mut previous: Option<&Line> = None;
277
278    let flush = |spans: &mut Vec<Span>, heading: &mut Option<u8>, blocks: &mut Vec<DocBlock>| {
279        if spans.is_empty() {
280            return;
281        }
282        let spans = std::mem::take(spans);
283        match heading.take() {
284            Some(level) => blocks.push(DocBlock::Heading { level, spans }),
285            None => blocks.push(DocBlock::Paragraph { spans }),
286        }
287    };
288
289    for line in lines {
290        let heading = heading_levels.get(&line.font_size).copied();
291        let new_block = match previous {
292            None => true,
293            Some(prev) => {
294                let gap = line.top - prev.top;
295                let size_changed = line.font_size != prev.font_size
296                    && (heading.is_some()
297                        || heading_levels.contains_key(&prev.font_size)
298                        || line.font_size.abs_diff(prev.font_size) >= 2);
299                let large_gap = gap > (median_gap as f32 * 1.8) as i32 || gap < 0;
300                let indented =
301                    line.left > prev.left + (body_size as i32) / 2 && prev.left <= line.left; // fresh indent, not a continuation of one
302                size_changed || large_gap || indented
303            }
304        };
305
306        if new_block {
307            flush(&mut current, &mut current_heading, &mut blocks);
308            current_heading = heading;
309        }
310        join_line_into(&mut current, line);
311        previous = Some(line);
312    }
313    flush(&mut current, &mut current_heading, &mut blocks);
314
315    blocks
316}
317
318/// Append a line's spans to a paragraph buffer, repairing soft hyphens
319/// at the join.
320fn join_line_into(buffer: &mut Vec<Span>, line: &Line) {
321    if buffer.is_empty() {
322        buffer.extend(line.spans.iter().cloned());
323        return;
324    }
325
326    let next_starts_lower = line
327        .text()
328        .chars()
329        .next()
330        .is_some_and(|ch| ch.is_lowercase());
331    let hyphenated = buffer
332        .last()
333        .is_some_and(|span| span.text.trim_end().ends_with('-'));
334
335    if hyphenated && next_starts_lower {
336        if let Some(last) = buffer.last_mut() {
337            let trimmed = last.text.trim_end();
338            let without = trimmed[..trimmed.len() - 1].to_string();
339            last.text = without;
340        }
341    } else if !buffer.last().is_some_and(|span| span.text.ends_with(' ')) {
342        push_joined(buffer, " ", false, false);
343    }
344    for span in &line.spans {
345        push_joined(buffer, &span.text, span.bold, span.italic);
346    }
347}
348
349fn median_line_gap(lines: &[Line]) -> i32 {
350    let mut gaps: Vec<i32> = lines
351        .windows(2)
352        .filter_map(|pair| {
353            let gap = pair[1].top - pair[0].top;
354            let height = pair[0].height.max(1);
355            // Only genuine line advances vote: superscript and subscript
356            // fragments produce micro-gaps that would drag the median
357            // down until ordinary leading looks like a paragraph break,
358            // and figure whitespace would drag it up.
359            (gap >= height / 2 && gap <= height * 4).then_some(gap)
360        })
361        .collect();
362    if gaps.is_empty() {
363        return 16;
364    }
365    gaps.sort_unstable();
366    // Lower median: on short pages the gap list is tiny and the upper
367    // median can land on the paragraph gap itself, hiding the break.
368    gaps[(gaps.len() - 1) / 2]
369}
370
371/// Cross-page paragraph continuation: a page's first paragraph continues
372/// the previous page's last one when the earlier text does not end a
373/// sentence and the new text starts lowercase.
374fn append_with_continuation(blocks: &mut Vec<DocBlock>, mut incoming: Vec<DocBlock>) {
375    if let (Some(DocBlock::Paragraph { spans: tail }), Some(DocBlock::Paragraph { spans: head })) =
376        (blocks.last_mut(), incoming.first_mut())
377    {
378        let tail_text: String = tail.iter().map(|span| span.text.as_str()).collect();
379        let head_text: String = head.iter().map(|span| span.text.as_str()).collect();
380        let continues = !tail_text
381            .trim_end()
382            .ends_with(['.', '!', '?', ':', ';', '"', '\u{201d}', '\u{2019}'])
383            && head_text.chars().next().is_some_and(|ch| ch.is_lowercase());
384        if continues {
385            let hyphenated = tail_text.trim_end().ends_with('-');
386            if hyphenated {
387                if let Some(last) = tail.last_mut() {
388                    let trimmed = last.text.trim_end();
389                    last.text = trimmed[..trimmed.len() - 1].to_string();
390                }
391            } else if !tail.last().is_some_and(|span| span.text.ends_with(' ')) {
392                push_joined(tail, " ", false, false);
393            }
394            for span in head.drain(..) {
395                push_joined(tail, &span.text, span.bold, span.italic);
396            }
397            incoming.remove(0);
398        }
399    }
400    blocks.append(&mut incoming);
401}
402
403#[cfg(test)]
404mod tests {
405    use super::*;
406    use crate::parse::parse_pdf2xml;
407
408    fn fixture_two_column() -> String {
409        // A miniature two-column paper page: full-width title, abstract,
410        // then two columns whose reading order must be left-then-right,
411        // with a hyphenated join inside the left column.
412        let mut texts = String::new();
413        texts.push_str(
414            r#"<text top="80" left="150" width="620" height="26" font="0">A Study of <i>Synthetic</i> Layouts</text>"#,
415        );
416        texts.push_str(
417            r#"<text top="140" left="120" width="680" height="13" font="1">Abstract spanning the full width of the page for testing purposes.</text>"#,
418        );
419        for (i, words) in [
420            "Left column opening line about num-",
421            "bers and continuation of thought.",
422            "Another left line here.",
423        ]
424        .iter()
425        .enumerate()
426        {
427            texts.push_str(&format!(
428                r#"<text top="{}" left="100" width="330" height="12" font="1">{}</text>"#,
429                220 + i * 16,
430                words
431            ));
432        }
433        for (i, words) in [
434            "Right column first line.",
435            "Right column second line.",
436            "Right column third line.",
437        ]
438        .iter()
439        .enumerate()
440        {
441            texts.push_str(&format!(
442                r#"<text top="{}" left="480" width="330" height="12" font="1">{}</text>"#,
443                220 + i * 16,
444                words
445            ));
446        }
447        format!(
448            r#"<?xml version="1.0"?>
449<pdf2xml producer="poppler">
450<page number="1" width="918" height="1188">
451<fontspec id="0" size="22" family="T"/>
452<fontspec id="1" size="11" family="T"/>
453{texts}
454</page>
455</pdf2xml>"#
456        )
457    }
458
459    #[test]
460    fn two_column_page_reads_title_left_then_right() {
461        let pages = parse_pdf2xml(&fixture_two_column()).expect("fixture parses");
462        let result = reconstruct(&pages, ColumnMode::Auto);
463
464        assert!(result.pages[0].two_column, "page must detect two columns");
465        let texts: Vec<String> = result.blocks.iter().map(DocBlock::text).collect();
466
467        assert!(
468            matches!(&result.blocks[0], DocBlock::Heading { level: 1, .. }),
469            "title should be an h1, got {:?}",
470            result.blocks[0]
471        );
472        assert_eq!(texts[0], "A Study of Synthetic Layouts");
473        assert!(texts[1].starts_with("Abstract spanning"));
474        assert!(
475            texts[2].contains("numbers and continuation"),
476            "hyphenated line join must repair the soft hyphen, got: {}",
477            texts[2]
478        );
479        assert!(
480            texts[2].contains("Another left line"),
481            "left column must precede right, got: {texts:?}"
482        );
483        assert!(texts[3].starts_with("Right column first"));
484    }
485
486    #[test]
487    fn single_column_page_clusters_paragraphs_by_gap() {
488        let xml = r#"<?xml version="1.0"?>
489<pdf2xml><page number="1" width="600" height="800">
490<fontspec id="0" size="11" family="T"/>
491<text top="100" left="80" width="440" height="12" font="0">First paragraph line one.</text>
492<text top="116" left="80" width="440" height="12" font="0">First paragraph line two.</text>
493<text top="170" left="80" width="440" height="12" font="0">Second paragraph after a large gap.</text>
494</page></pdf2xml>"#;
495        let pages = parse_pdf2xml(xml).expect("fixture parses");
496        let result = reconstruct(&pages, ColumnMode::Auto);
497
498        let texts: Vec<String> = result.blocks.iter().map(DocBlock::text).collect();
499        assert_eq!(
500            texts,
501            vec![
502                "First paragraph line one. First paragraph line two.".to_string(),
503                "Second paragraph after a large gap.".to_string(),
504            ]
505        );
506    }
507
508    #[test]
509    fn narrow_gutter_does_not_merge_columns_on_aligned_baselines() {
510        // Regression for the BERT page-1 defect: left and right column
511        // lines sharing the exact same `top`, separated by a gutter of
512        // only ~1.7x line height (461 - 435 = 26 at height 15). Merging
513        // them creates a fake full-width line that scrambles band order.
514        let xml = r#"<?xml version="1.0"?>
515<pdf2xml><page number="1" width="892" height="1262">
516<fontspec id="0" size="15" family="T"/>
517<text top="100" left="108" width="327" height="15" font="0">left one alpha beta gamma delta.</text>
518<text top="120" left="108" width="327" height="15" font="0">left two epsilon zeta eta theta.</text>
519<text top="100" left="461" width="327" height="15" font="0">right one iota kappa lambda mu.</text>
520<text top="120" left="461" width="327" height="15" font="0">right two nu xi omicron pi rho.</text>
521<text top="140" left="108" width="327" height="15" font="0">left three sigma tau upsilon phi.</text>
522<text top="140" left="461" width="327" height="15" font="0">right three chi psi omega end.</text>
523<text top="160" left="108" width="327" height="15" font="0">left four extra line for detection.</text>
524<text top="160" left="461" width="327" height="15" font="0">right four extra line as well.</text>
525</page></pdf2xml>"#;
526        let pages = parse_pdf2xml(xml).expect("fixture parses");
527        let result = reconstruct(&pages, ColumnMode::Two);
528
529        let all_text: String = result
530            .blocks
531            .iter()
532            .map(|block| block.text())
533            .collect::<Vec<_>>()
534            .join("\n");
535        let left_pos = all_text.find("left four").expect("left text present");
536        let right_pos = all_text.find("right one").expect("right text present");
537        assert!(
538            left_pos < right_pos,
539            "every left-column line must precede the right column, got:\n{all_text}"
540        );
541        assert!(
542            !all_text.contains("alpha beta gamma delta. right one"),
543            "aligned baselines must not merge across the gutter:\n{all_text}"
544        );
545    }
546
547    #[test]
548    fn paragraph_continues_across_pages() {
549        let xml = r#"<?xml version="1.0"?>
550<pdf2xml>
551<page number="1" width="600" height="800">
552<fontspec id="0" size="11" family="T"/>
553<text top="700" left="80" width="440" height="12" font="0">This sentence does not end</text>
554</page>
555<page number="2" width="600" height="800">
556<fontspec id="0" size="11" family="T"/>
557<text top="80" left="80" width="440" height="12" font="0">until the following page.</text>
558</page>
559</pdf2xml>"#;
560        let pages = parse_pdf2xml(xml).expect("fixture parses");
561        let result = reconstruct(&pages, ColumnMode::Auto);
562
563        let texts: Vec<String> = result.blocks.iter().map(DocBlock::text).collect();
564        assert_eq!(
565            texts,
566            vec!["This sentence does not end until the following page.".to_string()]
567        );
568    }
569}