1use crate::models::bbox::BoundingBox;
5use crate::models::content::ContentElement;
6
/// Coarse classification of how content is arranged on a page.
///
/// Produced by [`classify_layout`]. `Hash` is derived alongside `Eq` so the
/// value can be used directly as a `HashMap`/`HashSet` key (for example when
/// tallying layouts across the pages of a document).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum LayoutType {
    /// Zero or one column was detected and the page does not mix
    /// image/figure elements with paragraph/text-block/heading elements.
    SingleColumn,
    /// Exactly two columns were detected.
    TwoColumn,
    /// Three or more columns were detected.
    MultiColumn,
    /// Tables make up at least one third of the page's elements.
    Tabular,
    /// Zero or one column was detected and the page contains both
    /// image/figure elements and paragraph/text-block/heading elements.
    Mixed,
    /// The page has no content elements.
    Empty,
}
23
/// Empty space between the page edges and the box enclosing all content.
///
/// Values are in the same units as the page dimensions given to
/// [`detect_margins`] and are never negative there (each side is clamped at
/// zero). `Default` yields all-zero margins, which is also what
/// [`detect_margins`] reports for a page without content.
#[derive(Debug, Clone, PartialEq, Default)]
pub struct PageMargins {
    /// Distance from the content's top edge to the top of the page.
    pub top: f64,
    /// Distance from the bottom of the page to the content's bottom edge.
    pub bottom: f64,
    /// Distance from the left page edge to the content's left edge.
    pub left: f64,
    /// Distance from the content's right edge to the right page edge.
    pub right: f64,
}
36
/// How much of a page is covered by content, as reported by
/// [`compute_density`].
///
/// `PartialEq` is derived (matching [`PageMargins`]) so results can be
/// compared directly, e.g. with `assert_eq!` in tests.
#[derive(Debug, Clone, PartialEq)]
pub struct ContentDensity {
    /// Sum of the bounding-box areas of all elements. Overlapping elements
    /// are counted once each, so this can exceed `page_area`.
    pub content_area: f64,
    /// Page width multiplied by page height (reported as `0.0` when that
    /// product is not positive).
    pub page_area: f64,
    /// `content_area / page_area`, capped at `1.0`.
    pub density: f64,
    /// Number of elements that were measured.
    pub element_count: usize,
}
49
50pub fn classify_layout(elements: &[ContentElement], page_width: f64) -> LayoutType {
52 if elements.is_empty() {
53 return LayoutType::Empty;
54 }
55
56 let table_count = elements
58 .iter()
59 .filter(|e| matches!(e, ContentElement::Table(_)))
60 .count();
61
62 if table_count > 0 && table_count * 3 >= elements.len() {
63 return LayoutType::Tabular;
64 }
65
66 let x_ranges: Vec<(f64, f64)> = elements
68 .iter()
69 .map(|e| {
70 let b = e.bbox();
71 (b.left_x, b.right_x)
72 })
73 .collect();
74
75 let columns = detect_column_count(&x_ranges, page_width);
76
77 match columns {
78 0 | 1 => {
79 let has_images = elements
81 .iter()
82 .any(|e| matches!(e, ContentElement::Image(_) | ContentElement::Figure(_)));
83 let has_text = elements.iter().any(|e| {
84 matches!(
85 e,
86 ContentElement::Paragraph(_)
87 | ContentElement::TextBlock(_)
88 | ContentElement::Heading(_)
89 )
90 });
91 if has_images && has_text {
92 LayoutType::Mixed
93 } else {
94 LayoutType::SingleColumn
95 }
96 }
97 2 => LayoutType::TwoColumn,
98 _ => LayoutType::MultiColumn,
99 }
100}
101
102pub fn detect_margins(
104 elements: &[ContentElement],
105 page_width: f64,
106 page_height: f64,
107) -> PageMargins {
108 if elements.is_empty() {
109 return PageMargins {
110 top: 0.0,
111 bottom: 0.0,
112 left: 0.0,
113 right: 0.0,
114 };
115 }
116
117 let content_bbox = content_bounding_box(elements);
118
119 PageMargins {
120 left: content_bbox.left_x.max(0.0),
121 right: (page_width - content_bbox.right_x).max(0.0),
122 bottom: content_bbox.bottom_y.max(0.0),
124 top: (page_height - content_bbox.top_y).max(0.0),
125 }
126}
127
128pub fn compute_density(
130 elements: &[ContentElement],
131 page_width: f64,
132 page_height: f64,
133) -> ContentDensity {
134 let page_area = page_width * page_height;
135 if page_area <= 0.0 {
136 return ContentDensity {
137 content_area: 0.0,
138 page_area: 0.0,
139 density: 0.0,
140 element_count: elements.len(),
141 };
142 }
143
144 let content_area: f64 = elements
145 .iter()
146 .map(|e| {
147 let b = e.bbox();
148 b.width() * b.height()
149 })
150 .sum();
151
152 ContentDensity {
153 content_area,
154 page_area,
155 density: (content_area / page_area).min(1.0),
156 element_count: elements.len(),
157 }
158}
159
160fn content_bounding_box(elements: &[ContentElement]) -> BoundingBox {
162 let mut min_x = f64::MAX;
163 let mut min_y = f64::MAX;
164 let mut max_x = f64::MIN;
165 let mut max_y = f64::MIN;
166
167 for e in elements {
168 let b = e.bbox();
169 min_x = min_x.min(b.left_x);
170 min_y = min_y.min(b.bottom_y);
171 max_x = max_x.max(b.right_x);
172 max_y = max_y.max(b.top_y);
173 }
174
175 BoundingBox::new(None, min_x, min_y, max_x, max_y)
176}
177
/// Estimates how many text columns a page has from the horizontal extents of
/// its elements.
///
/// The page width is split into a fixed number of equal bins and each
/// `(left, right)` range adds one to every bin it touches. A bin is
/// "occupied" when its count exceeds 10% of the number of ranges (truncated
/// to an integer). Within the span from the first to the last occupied bin,
/// every maximal run of unoccupied bins is a gutter, and the column count is
/// the number of gutters plus one.
///
/// Returns `0` when `x_ranges` is empty or `page_width` is not positive, and
/// `1` when fewer than two distinct occupied bins exist.
///
/// Ranges given in reverse order (`right < left`) are treated like their
/// ordered equivalent rather than panicking on an invalid slice range.
fn detect_column_count(x_ranges: &[(f64, f64)], page_width: f64) -> usize {
    /// Number of histogram bins across the page width.
    const BIN_COUNT: usize = 60;
    /// Fraction of all ranges a bin must exceed to count as occupied.
    const OCCUPANCY_FRACTION: f64 = 0.1;

    if x_ranges.is_empty() || page_width <= 0.0 {
        return 0;
    }

    let bin_width = page_width / BIN_COUNT as f64;
    // Float-to-integer `as` casts saturate (negative and NaN inputs become 0),
    // so together with the upper clamp every coordinate maps to a valid bin.
    let bin_of = |x: f64| ((x / bin_width) as usize).min(BIN_COUNT - 1);

    let mut bins = [0u32; BIN_COUNT];
    for &(left, right) in x_ranges {
        let (a, b) = (bin_of(left), bin_of(right));
        // Order the indices: slicing with start > end + 1 would panic.
        let (start, end) = if a <= b { (a, b) } else { (b, a) };
        for bin in &mut bins[start..=end] {
            *bin += 1;
        }
    }

    // Truncation is intended: with fewer than ten ranges the threshold is 0,
    // so any touched bin counts as occupied.
    let threshold = (x_ranges.len() as f64 * OCCUPANCY_FRACTION) as u32;
    let occupied = |count: u32| count > threshold;

    let first = bins.iter().position(|&count| occupied(count));
    let last = bins.iter().rposition(|&count| occupied(count));
    let (first, last) = match (first, last) {
        (Some(first), Some(last)) if first < last => (first, last),
        // No occupied bin, or all content falls in a single bin.
        _ => return 1,
    };

    // Both ends of the span are occupied, so each gutter begins exactly where
    // an occupied bin is followed by an unoccupied one.
    let gutters = bins[first..=last]
        .windows(2)
        .filter(|pair| occupied(pair[0]) && !occupied(pair[1]))
        .count();

    gutters + 1
}
224
#[cfg(test)]
mod tests {
    use super::*;
    use crate::models::chunks::TextChunk;
    use crate::models::enums::{PdfLayer, TextFormat, TextType};

    /// Builds a text-chunk element whose bounding box spans from `(x, y)` to
    /// `(x + w, y + h)`; every other field is a fixed placeholder.
    fn make_text_at(x: f64, y: f64, w: f64, h: f64) -> ContentElement {
        let chunk = TextChunk {
            value: "text".to_string(),
            bbox: BoundingBox::new(None, x, y, x + w, y + h),
            font_name: "F".to_string(),
            font_size: 12.0,
            font_weight: 400.0,
            italic_angle: 0.0,
            font_color: "#000".to_string(),
            contrast_ratio: 21.0,
            symbol_ends: vec![],
            text_format: TextFormat::Normal,
            text_type: TextType::Regular,
            pdf_layer: PdfLayer::Main,
            ocg_visible: true,
            index: None,
            page_number: Some(1),
            level: None,
            mcid: None,
        };
        ContentElement::TextChunk(chunk)
    }

    /// A page without elements is classified as empty.
    #[test]
    fn test_classify_empty() {
        let no_elements: &[ContentElement] = &[];
        assert_eq!(classify_layout(no_elements, 612.0), LayoutType::Empty);
    }

    /// Full-width lines stacked vertically form a single column.
    #[test]
    fn test_classify_single_column() {
        let elements: Vec<ContentElement> = [100.0, 120.0, 140.0]
            .iter()
            .map(|&y| make_text_at(72.0, y, 468.0, 12.0))
            .collect();

        assert_eq!(classify_layout(&elements, 612.0), LayoutType::SingleColumn);
    }

    /// One block inset by 72 units on every side gives 72-unit margins.
    #[test]
    fn test_detect_margins() {
        let elements = vec![make_text_at(72.0, 72.0, 468.0, 648.0)];
        let margins = detect_margins(&elements, 612.0, 792.0);

        let sides = [
            ("left", margins.left),
            ("right", margins.right),
            ("bottom", margins.bottom),
            ("top", margins.top),
        ];
        for (side, value) in sides {
            assert!((value - 72.0).abs() < 0.1, "unexpected {side} margin: {value}");
        }
    }

    /// A 100x50 block on a 200x100 page covers a quarter of it.
    #[test]
    fn test_compute_density() {
        let elements = vec![make_text_at(0.0, 0.0, 100.0, 50.0)];
        let result = compute_density(&elements, 200.0, 100.0);

        assert!((result.density - 0.25).abs() < 0.01);
        assert_eq!(result.element_count, 1);
    }

    /// Two groups of lines separated by a horizontal gap form two columns.
    #[test]
    fn test_column_detection() {
        let mut elements = Vec::new();
        for &x in &[72.0, 322.0] {
            for &y in &[100.0, 120.0] {
                elements.push(make_text_at(x, y, 218.0, 12.0));
            }
        }

        assert_eq!(classify_layout(&elements, 612.0), LayoutType::TwoColumn);
    }
}