Skip to main content

pdf_docx/
layout.rs

1//! Spatial grouping of text blocks into lines, paragraphs, and tables.
2
3use pdf_extract::TextBlock;
4
/// Tolerance for grouping text blocks into lines (points).
///
/// A block joins an existing line when its y-coordinate differs from that
/// line's y-coordinate by strictly less than this value.
const LINE_Y_TOLERANCE: f64 = 2.0;

/// Vertical gap threshold for paragraph breaks, as a multiple of font size.
///
/// A new paragraph starts when the distance between two consecutive
/// non-blank lines exceeds the previous line's font size times this factor.
const PARAGRAPH_GAP_FACTOR: f64 = 1.5;

/// Tolerance for column alignment in table detection (points).
///
/// Block left edges closer together than this are treated as the same column.
const TABLE_X_TOLERANCE: f64 = 5.0;
13
/// A run of text with formatting.
///
/// As produced by this module, one run holds the text of a single line with
/// its blocks joined by spaces.
#[derive(Debug, Clone, PartialEq)]
pub struct Run {
    /// The run's text content.
    pub text: String,
    /// Font name as reported for the source text (not yet mapped to a family).
    pub font_name: String,
    /// Font size of the run.
    pub font_size: f64,
    /// Whether the run is bold.
    pub bold: bool,
    /// Whether the run is italic.
    pub italic: bool,
}

/// A paragraph composed of one or more runs.
#[derive(Debug, Clone, PartialEq)]
pub struct Paragraph {
    /// The runs of the paragraph, in reading order.
    pub runs: Vec<Run>,
}

/// A table with rows and columns.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Table {
    /// Cell texts, one inner vector per row.
    pub rows: Vec<Vec<String>>,
    /// Number of columns in the table.
    pub col_count: usize,
}

/// An image to include in the document.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DocxImage {
    /// Raw image bytes.
    pub data: Vec<u8>,
    /// Image width (presumably pixels; confirm against the producer).
    pub width: u32,
    /// Image height (presumably pixels; confirm against the producer).
    pub height: u32,
    /// Content type of `data` (presumably a MIME type such as `image/png`; confirm).
    pub content_type: String,
    /// Identifier for the image (presumably its relationship id in the DOCX; confirm).
    pub id: String,
}

/// Page content after layout analysis.
#[derive(Debug, Clone, PartialEq)]
pub enum PageElement {
    /// A paragraph of text.
    Para(Paragraph),
    /// A table.
    Tbl(Table),
    /// An image.
    Img(DocxImage),
}
54
/// A line of text (blocks at roughly the same y-coordinate).
///
/// Internal intermediate: built by `group_into_lines` and consumed by the
/// table and paragraph detection passes.
#[derive(Debug)]
struct Line {
    /// y-coordinate (`bbox[1]`) of the first block placed on this line; later
    /// blocks are matched against this value.
    y: f64,
    /// Font size of the first block placed on this line.
    font_size: f64,
    /// The blocks on this line, ordered left to right by `bbox[0]`.
    blocks: Vec<TextBlock>,
}
62
63/// Analyze text blocks from a page and group them into paragraphs and tables.
64pub fn analyze_page(blocks: &[TextBlock]) -> Vec<PageElement> {
65    if blocks.is_empty() {
66        return Vec::new();
67    }
68
69    let lines = group_into_lines(blocks);
70    let table = try_detect_table(&lines);
71
72    if let Some(tbl) = table {
73        return vec![PageElement::Tbl(tbl)];
74    }
75
76    group_into_paragraphs(&lines)
77}
78
79/// Group text blocks into lines based on y-coordinate proximity.
80fn group_into_lines(blocks: &[TextBlock]) -> Vec<Line> {
81    let mut sorted: Vec<&TextBlock> = blocks.iter().collect();
82    // Sort by y descending (PDF origin is bottom-left), then x ascending.
83    sorted.sort_by(|a, b| {
84        let y_cmp = b.bbox[1]
85            .partial_cmp(&a.bbox[1])
86            .unwrap_or(std::cmp::Ordering::Equal);
87        if y_cmp == std::cmp::Ordering::Equal {
88            a.bbox[0]
89                .partial_cmp(&b.bbox[0])
90                .unwrap_or(std::cmp::Ordering::Equal)
91        } else {
92            y_cmp
93        }
94    });
95
96    let mut lines: Vec<Line> = Vec::new();
97
98    for block in sorted {
99        let y = block.bbox[1];
100        let matched = lines
101            .iter_mut()
102            .find(|line| (line.y - y).abs() < LINE_Y_TOLERANCE);
103
104        if let Some(line) = matched {
105            line.blocks.push(block.clone());
106        } else {
107            lines.push(Line {
108                y,
109                font_size: block.font_size,
110                blocks: vec![block.clone()],
111            });
112        }
113    }
114
115    // Sort each line's blocks by x-coordinate.
116    for line in &mut lines {
117        line.blocks.sort_by(|a, b| {
118            a.bbox[0]
119                .partial_cmp(&b.bbox[0])
120                .unwrap_or(std::cmp::Ordering::Equal)
121        });
122    }
123
124    lines
125}
126
127/// Try to detect a table from aligned text lines.
128///
129/// A table is detected when multiple lines share the same column structure
130/// (i.e., text blocks start at similar x-positions across lines).
131fn try_detect_table(lines: &[Line]) -> Option<Table> {
132    if lines.len() < 2 {
133        return None;
134    }
135
136    // Collect all unique x-positions across all lines.
137    let mut x_positions: Vec<f64> = Vec::new();
138    for line in lines {
139        for block in &line.blocks {
140            let x = block.bbox[0];
141            if !x_positions
142                .iter()
143                .any(|&px| (px - x).abs() < TABLE_X_TOLERANCE)
144            {
145                x_positions.push(x);
146            }
147        }
148    }
149    x_positions.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
150
151    if x_positions.len() < 2 {
152        return None;
153    }
154
155    // Check if most lines have blocks at multiple column positions.
156    let multi_col_lines = lines
157        .iter()
158        .filter(|line| {
159            let unique_cols = line
160                .blocks
161                .iter()
162                .map(|b| {
163                    x_positions
164                        .iter()
165                        .position(|&px| (px - b.bbox[0]).abs() < TABLE_X_TOLERANCE)
166                        .unwrap_or(0)
167                })
168                .collect::<std::collections::HashSet<_>>();
169            unique_cols.len() >= 2
170        })
171        .count();
172
173    // At least 60% of lines need multiple columns for table detection.
174    if multi_col_lines * 100 / lines.len() < 60 {
175        return None;
176    }
177
178    let col_count = x_positions.len();
179    let mut rows = Vec::new();
180
181    for line in lines {
182        let mut row = vec![String::new(); col_count];
183        for block in &line.blocks {
184            let col_idx = x_positions
185                .iter()
186                .position(|&px| (px - block.bbox[0]).abs() < TABLE_X_TOLERANCE)
187                .unwrap_or(0);
188            if !row[col_idx].is_empty() {
189                row[col_idx].push(' ');
190            }
191            row[col_idx].push_str(&block.text);
192        }
193        rows.push(row);
194    }
195
196    Some(Table { rows, col_count })
197}
198
199/// Group lines into paragraphs based on vertical spacing.
200fn group_into_paragraphs(lines: &[Line]) -> Vec<PageElement> {
201    let mut elements = Vec::new();
202    let mut current_runs: Vec<Run> = Vec::new();
203    let mut prev_y: Option<f64> = None;
204    let mut prev_font_size: f64 = 12.0;
205
206    for line in lines {
207        let line_text: String = line
208            .blocks
209            .iter()
210            .map(|b| b.text.as_str())
211            .collect::<Vec<_>>()
212            .join(" ");
213
214        if line_text.trim().is_empty() {
215            continue;
216        }
217
218        let is_new_paragraph = if let Some(py) = prev_y {
219            let gap = (py - line.y).abs();
220            gap > prev_font_size * PARAGRAPH_GAP_FACTOR
221        } else {
222            false
223        };
224
225        if is_new_paragraph && !current_runs.is_empty() {
226            elements.push(PageElement::Para(Paragraph {
227                runs: std::mem::take(&mut current_runs),
228            }));
229        }
230
231        let font_name = line
232            .blocks
233            .first()
234            .map(|b| b.font_name.clone())
235            .unwrap_or_default();
236        let font_size = line.font_size;
237
238        let bold = font_name.contains("Bold") || font_name.contains("bold");
239        let italic = font_name.contains("Italic")
240            || font_name.contains("italic")
241            || font_name.contains("Oblique");
242
243        current_runs.push(Run {
244            text: line_text,
245            font_name,
246            font_size,
247            bold,
248            italic,
249        });
250
251        prev_y = Some(line.y);
252        prev_font_size = font_size;
253    }
254
255    if !current_runs.is_empty() {
256        elements.push(PageElement::Para(Paragraph { runs: current_runs }));
257    }
258
259    elements
260}
261
/// Map a PDF font name to a commonly available font family name.
///
/// Everything up to and including the first `+` (the subset tag in names
/// such as `ABCDEF+Helvetica`) is ignored. The remaining name is matched
/// case-sensitively against known keywords, in this order of precedence:
///
/// 1. `Courier` or `Mono` → `"Courier New"`
/// 2. `Times` → `"Times New Roman"`
/// 3. `Arial`, `Helvetica` or `Sans` → `"Arial"`
/// 4. `Serif` → `"Times New Roman"`
/// 5. `Symbol` → `"Symbol"`
///
/// Monospace keywords are checked before `Sans`, and `Sans` before `Serif`,
/// so that names like `DejaVuSansMono` and `MS-SansSerif` are not mistaken
/// for proportional sans or serif faces. Anything unrecognised falls back to
/// `"Calibri"`.
pub fn map_font_name(pdf_font: &str) -> &str {
    // Strip common prefixes like "ABCDEF+" used in subset fonts.
    let name = pdf_font
        .split_once('+')
        .map_or(pdf_font, |(_, rest)| rest);

    if name.contains("Courier") || name.contains("Mono") {
        "Courier New"
    } else if name.contains("Times") {
        "Times New Roman"
    } else if name.contains("Arial") || name.contains("Helvetica") || name.contains("Sans") {
        "Arial"
    } else if name.contains("Serif") {
        "Times New Roman"
    } else if name.contains("Symbol") {
        "Symbol"
    } else {
        "Calibri"
    }
}
284
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a test `TextBlock` at `(x, y)` using font "F1"; the box is
    /// half a font size wide per byte of text and one font size tall.
    fn make_block(text: &str, x: f64, y: f64, font_size: f64) -> TextBlock {
        let width = text.len() as f64 * font_size * 0.5;
        TextBlock {
            text: text.to_string(),
            page: 1,
            bbox: [x, y, x + width, y + font_size],
            font_name: "F1".to_string(),
            font_size,
            actual_text: None,
        }
    }

    /// A lone block yields exactly one paragraph element.
    #[test]
    fn single_line_becomes_paragraph() {
        let elements = analyze_page(&[make_block("Hello World", 72.0, 720.0, 12.0)]);
        assert_eq!(elements.len(), 1);
        assert!(matches!(elements[0], PageElement::Para(_)));
    }

    /// Lines 14pt apart stay together: the break threshold is 12 * 1.5 = 18.
    #[test]
    fn two_close_lines_same_paragraph() {
        let first = make_block("Line 1", 72.0, 720.0, 12.0);
        let second = make_block("Line 2", 72.0, 706.0, 12.0);
        let elements = analyze_page(&[first, second]);
        assert_eq!(elements.len(), 1);
    }

    /// Lines 40pt apart exceed the 18pt threshold and split into two paragraphs.
    #[test]
    fn two_distant_lines_different_paragraphs() {
        let first = make_block("Para 1", 72.0, 720.0, 12.0);
        let second = make_block("Para 2", 72.0, 680.0, 12.0);
        let elements = analyze_page(&[first, second]);
        assert_eq!(elements.len(), 2);
    }

    /// Three rows with blocks at x = 72 and x = 200 are detected as a 3x2 table.
    #[test]
    fn table_detection() {
        let cells = [
            ("Name", "Age", 700.0),
            ("Alice", "30", 684.0),
            ("Bob", "25", 668.0),
        ];
        let blocks: Vec<TextBlock> = cells
            .iter()
            .flat_map(|&(left, right, y)| {
                vec![
                    make_block(left, 72.0, y, 12.0),
                    make_block(right, 200.0, y, 12.0),
                ]
            })
            .collect();

        let elements = analyze_page(&blocks);
        assert_eq!(elements.len(), 1);
        assert!(matches!(elements[0], PageElement::Tbl(_)));
        if let PageElement::Tbl(ref tbl) = elements[0] {
            assert_eq!(tbl.rows.len(), 3);
            assert_eq!(tbl.col_count, 2);
        }
    }

    /// No blocks in, no elements out.
    #[test]
    fn empty_blocks_returns_empty() {
        assert!(analyze_page(&[]).is_empty());
    }
}
355}