Skip to main content

pdf_docx/
layout.rs

1//! Spatial grouping of text blocks into lines, paragraphs, and tables.
2
3use pdf_extract::TextBlock;
4
/// Two text blocks are placed on the same line when their y-coordinates
/// differ by less than this many points.
const LINE_Y_TOLERANCE: f64 = 2.0;

/// Paragraph-break threshold, expressed as a multiple of the previous line's
/// font size: a vertical gap larger than `font_size * PARAGRAPH_GAP_FACTOR`
/// starts a new paragraph.
const PARAGRAPH_GAP_FACTOR: f64 = 1.5;

/// Two text blocks are treated as starting in the same table column when
/// their x-coordinates differ by less than this many points.
const TABLE_X_TOLERANCE: f64 = 5.0;
13
/// A span of text that carries a single set of formatting attributes.
///
/// Runs are the building blocks of a [`Paragraph`]. The paragraph grouper in
/// this module creates one run per non-blank text line: the texts of the
/// line's blocks are joined with single spaces and the font name is taken
/// from the first block on the line.
#[derive(Clone, Debug)]
pub struct Run {
    /// Text content of the run.
    pub text: String,
    /// Font name as reported for the source text (for example `Helvetica`
    /// or `TimesNewRomanPS-BoldMT`). [`map_font_name`] translates such names
    /// into a Word font family.
    pub font_name: String,
    /// Font size in points.
    pub font_size: f64,
    /// `true` when the run should be rendered in bold.
    pub bold: bool,
    /// `true` when the run should be rendered in italics.
    pub italic: bool,
}
40
41/// A paragraph composed of one or more [`Run`]s.
42///
43/// Produced by the line-grouping pass: lines whose vertical gap is below
44/// [`PARAGRAPH_GAP_FACTOR`] times the font size are considered part of the
45/// same paragraph; a larger gap ends the paragraph.
46#[derive(Debug, Clone)]
47pub struct Paragraph {
48    /// The runs that make up this paragraph, in reading order. Empty
49    /// paragraphs are valid and represent blank lines.
50    pub runs: Vec<Run>,
51}
52
/// A rectangular grid of cell text recovered from column-aligned blocks.
///
/// Column positions are the distinct block start x-coordinates found on the
/// page, merged when they lie within [`TABLE_X_TOLERANCE`] of one another.
/// Every text line becomes one row with exactly `col_count` cells; a cell
/// for which no block was found holds an empty string.
#[derive(Clone, Debug)]
pub struct Table {
    /// Cell text in row-major order: `rows[r][c]` is row `r`, column `c`.
    /// Each row has [`Self::col_count`] entries.
    pub rows: Vec<Vec<String>>,
    /// Number of columns, i.e. the number of distinct column x-positions
    /// found during detection.
    pub col_count: usize,
}
68
/// Image payload to be embedded in the generated DOCX.
///
/// The encoded bytes are carried unchanged, together with the metadata the
/// DOCX writer needs: pixel dimensions, MIME type, and an identifier.
#[derive(Clone, Debug)]
pub struct DocxImage {
    /// Encoded image bytes, in the format named by [`Self::content_type`]
    /// (typically PNG or JPEG).
    pub data: Vec<u8>,
    /// Width of the image in pixels.
    pub width: u32,
    /// Height of the image in pixels.
    pub height: u32,
    /// MIME type of [`Self::data`], such as `image/png` or `image/jpeg`.
    pub content_type: String,
    /// Identifier for this image, intended to be stable so that an image
    /// occurring on several pages can be deduplicated and referenced from
    /// the document.
    pub id: String,
}
91
92/// One element in the per-page layout: a paragraph, a table, or an image.
93///
94/// Produced by the layout analysis pass. Pages emit a `Vec<PageElement>` in
95/// reading order; the DOCX writer iterates these and produces matching
96/// Word document parts.
97#[derive(Debug, Clone)]
98pub enum PageElement {
99    /// A flowing paragraph of text — see [`Paragraph`].
100    Para(Paragraph),
101    /// A reconstructed table — see [`Table`].
102    Tbl(Table),
103    /// An embedded image — see [`DocxImage`].
104    Img(DocxImage),
105}
106
107/// A line of text (blocks at roughly the same y-coordinate).
108#[derive(Debug)]
109struct Line {
110    y: f64,
111    font_size: f64,
112    blocks: Vec<TextBlock>,
113}
114
115/// Analyze text blocks from a page and group them into paragraphs and tables.
116pub fn analyze_page(blocks: &[TextBlock]) -> Vec<PageElement> {
117    if blocks.is_empty() {
118        return Vec::new();
119    }
120
121    let lines = group_into_lines(blocks);
122    let table = try_detect_table(&lines);
123
124    if let Some(tbl) = table {
125        return vec![PageElement::Tbl(tbl)];
126    }
127
128    group_into_paragraphs(&lines)
129}
130
131/// Group text blocks into lines based on y-coordinate proximity.
132fn group_into_lines(blocks: &[TextBlock]) -> Vec<Line> {
133    let mut sorted: Vec<&TextBlock> = blocks.iter().collect();
134    // Sort by y descending (PDF origin is bottom-left), then x ascending.
135    sorted.sort_by(|a, b| {
136        let y_cmp = b.bbox[1]
137            .partial_cmp(&a.bbox[1])
138            .unwrap_or(std::cmp::Ordering::Equal);
139        if y_cmp == std::cmp::Ordering::Equal {
140            a.bbox[0]
141                .partial_cmp(&b.bbox[0])
142                .unwrap_or(std::cmp::Ordering::Equal)
143        } else {
144            y_cmp
145        }
146    });
147
148    let mut lines: Vec<Line> = Vec::new();
149
150    for block in sorted {
151        let y = block.bbox[1];
152        let matched = lines
153            .iter_mut()
154            .find(|line| (line.y - y).abs() < LINE_Y_TOLERANCE);
155
156        if let Some(line) = matched {
157            line.blocks.push(block.clone());
158        } else {
159            lines.push(Line {
160                y,
161                font_size: block.font_size,
162                blocks: vec![block.clone()],
163            });
164        }
165    }
166
167    // Sort each line's blocks by x-coordinate.
168    for line in &mut lines {
169        line.blocks.sort_by(|a, b| {
170            a.bbox[0]
171                .partial_cmp(&b.bbox[0])
172                .unwrap_or(std::cmp::Ordering::Equal)
173        });
174    }
175
176    lines
177}
178
179/// Try to detect a table from aligned text lines.
180///
181/// A table is detected when multiple lines share the same column structure
182/// (i.e., text blocks start at similar x-positions across lines).
183fn try_detect_table(lines: &[Line]) -> Option<Table> {
184    if lines.len() < 2 {
185        return None;
186    }
187
188    // Collect all unique x-positions across all lines.
189    let mut x_positions: Vec<f64> = Vec::new();
190    for line in lines {
191        for block in &line.blocks {
192            let x = block.bbox[0];
193            if !x_positions
194                .iter()
195                .any(|&px| (px - x).abs() < TABLE_X_TOLERANCE)
196            {
197                x_positions.push(x);
198            }
199        }
200    }
201    x_positions.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
202
203    if x_positions.len() < 2 {
204        return None;
205    }
206
207    // Check if most lines have blocks at multiple column positions.
208    let multi_col_lines = lines
209        .iter()
210        .filter(|line| {
211            let unique_cols = line
212                .blocks
213                .iter()
214                .map(|b| {
215                    x_positions
216                        .iter()
217                        .position(|&px| (px - b.bbox[0]).abs() < TABLE_X_TOLERANCE)
218                        .unwrap_or(0)
219                })
220                .collect::<std::collections::HashSet<_>>();
221            unique_cols.len() >= 2
222        })
223        .count();
224
225    // At least 60% of lines need multiple columns for table detection.
226    if multi_col_lines * 100 / lines.len() < 60 {
227        return None;
228    }
229
230    let col_count = x_positions.len();
231    let mut rows = Vec::new();
232
233    for line in lines {
234        let mut row = vec![String::new(); col_count];
235        for block in &line.blocks {
236            let col_idx = x_positions
237                .iter()
238                .position(|&px| (px - block.bbox[0]).abs() < TABLE_X_TOLERANCE)
239                .unwrap_or(0);
240            if !row[col_idx].is_empty() {
241                row[col_idx].push(' ');
242            }
243            row[col_idx].push_str(&block.text);
244        }
245        rows.push(row);
246    }
247
248    Some(Table { rows, col_count })
249}
250
251/// Group lines into paragraphs based on vertical spacing.
252fn group_into_paragraphs(lines: &[Line]) -> Vec<PageElement> {
253    let mut elements = Vec::new();
254    let mut current_runs: Vec<Run> = Vec::new();
255    let mut prev_y: Option<f64> = None;
256    let mut prev_font_size: f64 = 12.0;
257
258    for line in lines {
259        let line_text: String = line
260            .blocks
261            .iter()
262            .map(|b| b.text.as_str())
263            .collect::<Vec<_>>()
264            .join(" ");
265
266        if line_text.trim().is_empty() {
267            continue;
268        }
269
270        let is_new_paragraph = if let Some(py) = prev_y {
271            let gap = (py - line.y).abs();
272            gap > prev_font_size * PARAGRAPH_GAP_FACTOR
273        } else {
274            false
275        };
276
277        if is_new_paragraph && !current_runs.is_empty() {
278            elements.push(PageElement::Para(Paragraph {
279                runs: std::mem::take(&mut current_runs),
280            }));
281        }
282
283        let font_name = line
284            .blocks
285            .first()
286            .map(|b| b.font_name.clone())
287            .unwrap_or_default();
288        let font_size = line.font_size;
289
290        let bold = font_name.contains("Bold") || font_name.contains("bold");
291        let italic = font_name.contains("Italic")
292            || font_name.contains("italic")
293            || font_name.contains("Oblique");
294
295        current_runs.push(Run {
296            text: line_text,
297            font_name,
298            font_size,
299            bold,
300            italic,
301        });
302
303        prev_y = Some(line.y);
304        prev_font_size = font_size;
305    }
306
307    if !current_runs.is_empty() {
308        elements.push(PageElement::Para(Paragraph { runs: current_runs }));
309    }
310
311    elements
312}
313
/// Map a PDF font name to the Word font family used in the DOCX output.
///
/// Everything up to and including the first `+` is ignored, which removes
/// subset prefixes such as `ABCDEF+`. The remaining name is matched
/// case-sensitively against family hints, in this order:
///
/// 1. `Times` → `Times New Roman`
/// 2. `Courier` or `Mono` → `Courier New`
/// 3. `Arial`, `Helvetica` or `Sans` → `Arial`
/// 4. `Serif` → `Times New Roman`
/// 5. `Symbol` → `Symbol`
///
/// Anything else maps to `Calibri`. The monospace hints are tested before
/// the sans-serif ones and the generic `Serif` hint last, so that names like
/// `DejaVuSansMono` and `MicrosoftSansSerif` resolve to `Courier New` and
/// `Arial` rather than to `Arial` and `Times New Roman`.
pub fn map_font_name(pdf_font: &str) -> &str {
    // Strip subset prefixes like "ABCDEF+"; '+' is one byte, so `pos + 1`
    // is always a valid slice boundary.
    let name = pdf_font
        .find('+')
        .map_or(pdf_font, |pos| &pdf_font[pos + 1..]);

    let has_any = |hints: &[&str]| hints.iter().any(|hint| name.contains(*hint));

    if name.contains("Times") {
        "Times New Roman"
    } else if has_any(&["Courier", "Mono"]) {
        "Courier New"
    } else if has_any(&["Arial", "Helvetica", "Sans"]) {
        "Arial"
    } else if name.contains("Serif") {
        "Times New Roman"
    } else if name.contains("Symbol") {
        "Symbol"
    } else {
        "Calibri"
    }
}
336
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a non-bold, non-italic block for page 1 whose bounding box
    /// starts at `(x, y)`, is `font_size` high and half a font size wide per
    /// byte of `text`.
    fn make_block(text: &str, x: f64, y: f64, font_size: f64) -> TextBlock {
        let width = text.len() as f64 * font_size * 0.5;
        TextBlock {
            text: text.to_string(),
            page: 1,
            bbox: [x, y, x + width, y + font_size],
            font_name: "F1".to_string(),
            font_size,
            actual_text: None,
            base_font: None,
            is_bold: false,
            is_italic: false,
            color: None,
            width_source: Default::default(),
            char_bounds: vec![],
        }
    }

    #[test]
    fn single_line_becomes_paragraph() {
        let elements = analyze_page(&[make_block("Hello World", 72.0, 720.0, 12.0)]);
        assert_eq!(elements.len(), 1);
        assert!(matches!(elements[0], PageElement::Para(_)));
    }

    #[test]
    fn two_close_lines_same_paragraph() {
        // Gap of 14 points stays below the 12 * 1.5 = 18 point threshold.
        let first = make_block("Line 1", 72.0, 720.0, 12.0);
        let second = make_block("Line 2", 72.0, 706.0, 12.0);
        let elements = analyze_page(&[first, second]);
        assert_eq!(elements.len(), 1);
    }

    #[test]
    fn two_distant_lines_different_paragraphs() {
        // Gap of 40 points exceeds the 18 point threshold.
        let first = make_block("Para 1", 72.0, 720.0, 12.0);
        let second = make_block("Para 2", 72.0, 680.0, 12.0);
        let elements = analyze_page(&[first, second]);
        assert_eq!(elements.len(), 2);
    }

    #[test]
    fn table_detection() {
        // Three rows of two cells each, aligned at x = 72 and x = 200.
        let cells = [
            ("Name", 72.0, 700.0),
            ("Age", 200.0, 700.0),
            ("Alice", 72.0, 684.0),
            ("30", 200.0, 684.0),
            ("Bob", 72.0, 668.0),
            ("25", 200.0, 668.0),
        ];
        let blocks: Vec<TextBlock> = cells
            .iter()
            .map(|&(text, x, y)| make_block(text, x, y, 12.0))
            .collect();

        let elements = analyze_page(&blocks);
        assert_eq!(elements.len(), 1);
        match &elements[0] {
            PageElement::Tbl(tbl) => {
                assert_eq!(tbl.rows.len(), 3);
                assert_eq!(tbl.col_count, 2);
            }
            other => panic!("expected a table, got {:?}", other),
        }
    }

    #[test]
    fn empty_blocks_returns_empty() {
        assert!(analyze_page(&[]).is_empty());
    }
}
413}