Skip to main content

edgeparse_core/utils/
layout_analysis.rs

1//! Layout analysis utilities — page geometry classification, margin detection,
2//! and content density analysis for multi-column and complex layouts.
3
4use crate::models::bbox::BoundingBox;
5use crate::models::content::ContentElement;
6
/// High-level classification of how content is arranged on a page.
///
/// This is the result type of [`classify_layout`]. The enum is a small,
/// field-less value type, so it is `Copy`, comparable, and hashable (which
/// allows it to be used as a key, e.g. when tallying layouts across pages).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum LayoutType {
    /// Content forms a single column.
    SingleColumn,
    /// Content forms two side-by-side columns.
    TwoColumn,
    /// Content forms three or more columns.
    MultiColumn,
    /// The page is dominated by tables.
    Tabular,
    /// Irregular layout mixing images/figures with text.
    Mixed,
    /// The page has no content elements (or only decorations).
    Empty,
}
23
/// Margins between the page edges and the outermost content.
///
/// All four values are distances measured inward from the corresponding page
/// edge. The `Default` value has every margin set to `0.0`.
#[derive(Debug, Clone, Copy, PartialEq, Default)]
pub struct PageMargins {
    /// Distance from the top page edge to the content.
    pub top: f64,
    /// Distance from the bottom page edge to the content.
    pub bottom: f64,
    /// Distance from the left page edge to the content.
    pub left: f64,
    /// Distance from the right page edge to the content.
    pub right: f64,
}
36
/// Summary of how much of a page is covered by content.
///
/// The type is a plain value: it can be copied, compared with `==`, and has a
/// `Default` in which every field is zero.
#[derive(Debug, Clone, Copy, PartialEq, Default)]
pub struct ContentDensity {
    /// Sum of the areas of all content elements.
    pub content_area: f64,
    /// Area of the whole page.
    pub page_area: f64,
    /// `content_area` relative to `page_area`, capped at `1.0`.
    pub density: f64,
    /// Number of content elements that were considered.
    pub element_count: usize,
}
49
50/// Classify the layout type of a page based on its content elements.
51pub fn classify_layout(elements: &[ContentElement], page_width: f64) -> LayoutType {
52    if elements.is_empty() {
53        return LayoutType::Empty;
54    }
55
56    // Count element types
57    let table_count = elements
58        .iter()
59        .filter(|e| matches!(e, ContentElement::Table(_)))
60        .count();
61
62    if table_count > 0 && table_count * 3 >= elements.len() {
63        return LayoutType::Tabular;
64    }
65
66    // Detect columns by analyzing X positions
67    let x_ranges: Vec<(f64, f64)> = elements
68        .iter()
69        .map(|e| {
70            let b = e.bbox();
71            (b.left_x, b.right_x)
72        })
73        .collect();
74
75    let columns = detect_column_count(&x_ranges, page_width);
76
77    match columns {
78        0 | 1 => {
79            // Check if mixed (has images interspersed with text)
80            let has_images = elements
81                .iter()
82                .any(|e| matches!(e, ContentElement::Image(_) | ContentElement::Figure(_)));
83            let has_text = elements.iter().any(|e| {
84                matches!(
85                    e,
86                    ContentElement::Paragraph(_)
87                        | ContentElement::TextBlock(_)
88                        | ContentElement::Heading(_)
89                )
90            });
91            if has_images && has_text {
92                LayoutType::Mixed
93            } else {
94                LayoutType::SingleColumn
95            }
96        }
97        2 => LayoutType::TwoColumn,
98        _ => LayoutType::MultiColumn,
99    }
100}
101
102/// Detect page margins from content bounding boxes.
103pub fn detect_margins(
104    elements: &[ContentElement],
105    page_width: f64,
106    page_height: f64,
107) -> PageMargins {
108    if elements.is_empty() {
109        return PageMargins {
110            top: 0.0,
111            bottom: 0.0,
112            left: 0.0,
113            right: 0.0,
114        };
115    }
116
117    let content_bbox = content_bounding_box(elements);
118
119    PageMargins {
120        left: content_bbox.left_x.max(0.0),
121        right: (page_width - content_bbox.right_x).max(0.0),
122        // PDF coordinates: bottom_y is lower, top_y is upper
123        bottom: content_bbox.bottom_y.max(0.0),
124        top: (page_height - content_bbox.top_y).max(0.0),
125    }
126}
127
128/// Compute content density for a page.
129pub fn compute_density(
130    elements: &[ContentElement],
131    page_width: f64,
132    page_height: f64,
133) -> ContentDensity {
134    let page_area = page_width * page_height;
135    if page_area <= 0.0 {
136        return ContentDensity {
137            content_area: 0.0,
138            page_area: 0.0,
139            density: 0.0,
140            element_count: elements.len(),
141        };
142    }
143
144    let content_area: f64 = elements
145        .iter()
146        .map(|e| {
147            let b = e.bbox();
148            b.width() * b.height()
149        })
150        .sum();
151
152    ContentDensity {
153        content_area,
154        page_area,
155        density: (content_area / page_area).min(1.0),
156        element_count: elements.len(),
157    }
158}
159
160/// Compute the bounding box enclosing all elements.
161fn content_bounding_box(elements: &[ContentElement]) -> BoundingBox {
162    let mut min_x = f64::MAX;
163    let mut min_y = f64::MAX;
164    let mut max_x = f64::MIN;
165    let mut max_y = f64::MIN;
166
167    for e in elements {
168        let b = e.bbox();
169        min_x = min_x.min(b.left_x);
170        min_y = min_y.min(b.bottom_y);
171        max_x = max_x.max(b.right_x);
172        max_y = max_y.max(b.top_y);
173    }
174
175    BoundingBox::new(None, min_x, min_y, max_x, max_y)
176}
177
/// Estimate the number of text columns from the horizontal extents of elements.
///
/// The page width is split into 60 equal bins and each `(left, right)` range
/// increments every bin it touches. Bins whose count does not exceed 10% of
/// the number of ranges (truncated to an integer) are treated as empty. The
/// result is the number of runs of empty bins found between the first and
/// last non-empty bin, plus one.
///
/// Returns `0` when `x_ranges` is empty or `page_width` is not positive, and
/// `1` when the content occupies at most one bin.
///
/// A range whose endpoints are reversed (`left > right`) is treated as the
/// same span with the endpoints swapped, so such input cannot cause an
/// out-of-order slice index. Coordinates outside the page are clamped to the
/// first or last bin.
fn detect_column_count(x_ranges: &[(f64, f64)], page_width: f64) -> usize {
    if x_ranges.is_empty() || page_width <= 0.0 {
        return 0;
    }

    // Divide the page into bins and count the elements covering each bin.
    const BIN_COUNT: usize = 60;
    let bin_width = page_width / BIN_COUNT as f64;
    let mut bins = [0u32; BIN_COUNT];

    // Float-to-integer `as` casts saturate (negative and NaN become 0), and
    // the `min` keeps coordinates past the right edge inside the last bin.
    let bin_of = |x: f64| ((x / bin_width) as usize).min(BIN_COUNT - 1);

    for &(left, right) in x_ranges {
        let (a, b) = (bin_of(left), bin_of(right));
        // Order the indices: slicing with start > end would panic.
        let (start_bin, end_bin) = if a <= b { (a, b) } else { (b, a) };
        for bin in &mut bins[start_bin..=end_bin] {
            *bin += 1;
        }
    }

    // Find the first and last bins that have content; this ignores margins.
    let threshold = (x_ranges.len() as f64 * 0.1) as u32;
    let first_content = bins.iter().position(|&c| c > threshold);
    let last_content = bins.iter().rposition(|&c| c > threshold);

    let (first, last) = match (first_content, last_content) {
        (Some(f), Some(l)) if f < l => (f, l),
        _ => return 1, // all content in the same bin, or no bin above threshold
    };

    // Count internal gaps: maximal runs of empty bins between first and last.
    let mut gap_count = 0;
    let mut in_gap = false;
    for &count in &bins[first..=last] {
        if count <= threshold {
            if !in_gap {
                gap_count += 1;
                in_gap = true;
            }
        } else {
            in_gap = false;
        }
    }

    // Number of columns = number of internal gaps + 1.
    gap_count + 1
}
224
#[cfg(test)]
mod tests {
    use super::*;
    use crate::models::chunks::TextChunk;
    use crate::models::enums::{PdfLayer, TextFormat, TextType};

    /// Page width shared by the layout and margin fixtures.
    const PAGE_W: f64 = 612.0;
    /// Page height used by the margin fixture.
    const PAGE_H: f64 = 792.0;

    /// Build a text chunk element whose box starts at `(x, y)` and spans
    /// `w` x `h`; every other field is a fixed placeholder.
    fn make_text_at(x: f64, y: f64, w: f64, h: f64) -> ContentElement {
        let bbox = BoundingBox::new(None, x, y, x + w, y + h);
        let chunk = TextChunk {
            value: "text".to_string(),
            bbox,
            font_name: "F".to_string(),
            font_size: 12.0,
            font_weight: 400.0,
            italic_angle: 0.0,
            font_color: "#000".to_string(),
            contrast_ratio: 21.0,
            symbol_ends: vec![],
            text_format: TextFormat::Normal,
            text_type: TextType::Regular,
            pdf_layer: PdfLayer::Main,
            ocg_visible: true,
            index: None,
            page_number: Some(1),
            level: None,
            mcid: None,
        };
        ContentElement::TextChunk(chunk)
    }

    /// No elements at all must classify as an empty page.
    #[test]
    fn test_classify_empty() {
        let none: [ContentElement; 0] = [];
        assert_eq!(classify_layout(&none, PAGE_W), LayoutType::Empty);
    }

    /// Three full-width lines stacked vertically form a single column.
    #[test]
    fn test_classify_single_column() {
        let elements: Vec<ContentElement> = [100.0, 120.0, 140.0]
            .iter()
            .map(|&y| make_text_at(72.0, y, 468.0, 12.0))
            .collect();

        assert_eq!(classify_layout(&elements, PAGE_W), LayoutType::SingleColumn);
    }

    /// One block inset by 72 on every side yields 72 for all four margins.
    #[test]
    fn test_detect_margins() {
        let elements = vec![make_text_at(72.0, 72.0, 468.0, 648.0)];
        let margins = detect_margins(&elements, PAGE_W, PAGE_H);

        for &side in &[margins.left, margins.right, margins.bottom, margins.top] {
            assert!((side - 72.0).abs() < 0.1);
        }
    }

    /// A 100 x 50 element on a 200 x 100 page covers a quarter of it.
    #[test]
    fn test_compute_density() {
        let elements = vec![make_text_at(0.0, 0.0, 100.0, 50.0)];
        let result = compute_density(&elements, 200.0, 100.0);

        // 5000 / 20000
        assert!((result.density - 0.25).abs() < 0.01);
        assert_eq!(result.element_count, 1);
    }

    /// Two columns: left (72-290) and right (322-540).
    #[test]
    fn test_column_detection() {
        let origins = [(72.0, 100.0), (72.0, 120.0), (322.0, 100.0), (322.0, 120.0)];
        let elements: Vec<ContentElement> = origins
            .iter()
            .map(|&(x, y)| make_text_at(x, y, 218.0, 12.0))
            .collect();

        assert_eq!(classify_layout(&elements, PAGE_W), LayoutType::TwoColumn);
    }
}