1use pdf_extract::TextBlock;
4
/// Largest difference between a block's `bbox[1]` and a line's `y` for the
/// block to be added to that line (strictly-less-than comparison).
const LINE_Y_TOLERANCE: f64 = 2.0;

/// Multiplier applied to the previous line's font size; a vertical gap that
/// exceeds the product starts a new paragraph.
const PARAGRAPH_GAP_FACTOR: f64 = 1.5;

/// Largest difference between two `bbox[0]` values for them to be treated as
/// the same table column (strictly-less-than comparison).
const TABLE_X_TOLERANCE: f64 = 5.0;
13
/// A piece of text together with the font attributes it should be rendered
/// with.
#[derive(Clone, Debug)]
pub struct Run {
    /// Text content of the run.
    pub text: String,
    /// Font name associated with the run.
    pub font_name: String,
    /// Font size associated with the run.
    pub font_size: f64,
    /// Whether the run is bold.
    pub bold: bool,
    /// Whether the run is italic.
    pub italic: bool,
}
23
24#[derive(Debug, Clone)]
26pub struct Paragraph {
27 pub runs: Vec<Run>,
28}
29
/// A simple grid of text cells.
#[derive(Clone, Debug)]
pub struct Table {
    /// Cell texts, one inner vector per row.
    pub rows: Vec<Vec<String>>,
    /// Number of columns in the table.
    pub col_count: usize,
}
36
/// An image payload together with its metadata.
#[derive(Clone, Debug)]
pub struct DocxImage {
    /// Raw image bytes.
    pub data: Vec<u8>,
    /// Image width (units are not established in this file).
    pub width: u32,
    /// Image height (units are not established in this file).
    pub height: u32,
    /// Content type of `data` — presumably a MIME type; confirm with the
    /// code that constructs this value.
    pub content_type: String,
    /// Identifier of the image — presumably used to reference it from the
    /// generated document; confirm with the consumer.
    pub id: String,
}
46
47#[derive(Debug, Clone)]
49pub enum PageElement {
50 Para(Paragraph),
51 Tbl(Table),
52 Img(DocxImage),
53}
54
55#[derive(Debug)]
57struct Line {
58 y: f64,
59 font_size: f64,
60 blocks: Vec<TextBlock>,
61}
62
63pub fn analyze_page(blocks: &[TextBlock]) -> Vec<PageElement> {
65 if blocks.is_empty() {
66 return Vec::new();
67 }
68
69 let lines = group_into_lines(blocks);
70 let table = try_detect_table(&lines);
71
72 if let Some(tbl) = table {
73 return vec![PageElement::Tbl(tbl)];
74 }
75
76 group_into_paragraphs(&lines)
77}
78
79fn group_into_lines(blocks: &[TextBlock]) -> Vec<Line> {
81 let mut sorted: Vec<&TextBlock> = blocks.iter().collect();
82 sorted.sort_by(|a, b| {
84 let y_cmp = b.bbox[1]
85 .partial_cmp(&a.bbox[1])
86 .unwrap_or(std::cmp::Ordering::Equal);
87 if y_cmp == std::cmp::Ordering::Equal {
88 a.bbox[0]
89 .partial_cmp(&b.bbox[0])
90 .unwrap_or(std::cmp::Ordering::Equal)
91 } else {
92 y_cmp
93 }
94 });
95
96 let mut lines: Vec<Line> = Vec::new();
97
98 for block in sorted {
99 let y = block.bbox[1];
100 let matched = lines
101 .iter_mut()
102 .find(|line| (line.y - y).abs() < LINE_Y_TOLERANCE);
103
104 if let Some(line) = matched {
105 line.blocks.push(block.clone());
106 } else {
107 lines.push(Line {
108 y,
109 font_size: block.font_size,
110 blocks: vec![block.clone()],
111 });
112 }
113 }
114
115 for line in &mut lines {
117 line.blocks.sort_by(|a, b| {
118 a.bbox[0]
119 .partial_cmp(&b.bbox[0])
120 .unwrap_or(std::cmp::Ordering::Equal)
121 });
122 }
123
124 lines
125}
126
127fn try_detect_table(lines: &[Line]) -> Option<Table> {
132 if lines.len() < 2 {
133 return None;
134 }
135
136 let mut x_positions: Vec<f64> = Vec::new();
138 for line in lines {
139 for block in &line.blocks {
140 let x = block.bbox[0];
141 if !x_positions
142 .iter()
143 .any(|&px| (px - x).abs() < TABLE_X_TOLERANCE)
144 {
145 x_positions.push(x);
146 }
147 }
148 }
149 x_positions.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
150
151 if x_positions.len() < 2 {
152 return None;
153 }
154
155 let multi_col_lines = lines
157 .iter()
158 .filter(|line| {
159 let unique_cols = line
160 .blocks
161 .iter()
162 .map(|b| {
163 x_positions
164 .iter()
165 .position(|&px| (px - b.bbox[0]).abs() < TABLE_X_TOLERANCE)
166 .unwrap_or(0)
167 })
168 .collect::<std::collections::HashSet<_>>();
169 unique_cols.len() >= 2
170 })
171 .count();
172
173 if multi_col_lines * 100 / lines.len() < 60 {
175 return None;
176 }
177
178 let col_count = x_positions.len();
179 let mut rows = Vec::new();
180
181 for line in lines {
182 let mut row = vec![String::new(); col_count];
183 for block in &line.blocks {
184 let col_idx = x_positions
185 .iter()
186 .position(|&px| (px - block.bbox[0]).abs() < TABLE_X_TOLERANCE)
187 .unwrap_or(0);
188 if !row[col_idx].is_empty() {
189 row[col_idx].push(' ');
190 }
191 row[col_idx].push_str(&block.text);
192 }
193 rows.push(row);
194 }
195
196 Some(Table { rows, col_count })
197}
198
199fn group_into_paragraphs(lines: &[Line]) -> Vec<PageElement> {
201 let mut elements = Vec::new();
202 let mut current_runs: Vec<Run> = Vec::new();
203 let mut prev_y: Option<f64> = None;
204 let mut prev_font_size: f64 = 12.0;
205
206 for line in lines {
207 let line_text: String = line
208 .blocks
209 .iter()
210 .map(|b| b.text.as_str())
211 .collect::<Vec<_>>()
212 .join(" ");
213
214 if line_text.trim().is_empty() {
215 continue;
216 }
217
218 let is_new_paragraph = if let Some(py) = prev_y {
219 let gap = (py - line.y).abs();
220 gap > prev_font_size * PARAGRAPH_GAP_FACTOR
221 } else {
222 false
223 };
224
225 if is_new_paragraph && !current_runs.is_empty() {
226 elements.push(PageElement::Para(Paragraph {
227 runs: std::mem::take(&mut current_runs),
228 }));
229 }
230
231 let font_name = line
232 .blocks
233 .first()
234 .map(|b| b.font_name.clone())
235 .unwrap_or_default();
236 let font_size = line.font_size;
237
238 let bold = font_name.contains("Bold") || font_name.contains("bold");
239 let italic = font_name.contains("Italic")
240 || font_name.contains("italic")
241 || font_name.contains("Oblique");
242
243 current_runs.push(Run {
244 text: line_text,
245 font_name,
246 font_size,
247 bold,
248 italic,
249 });
250
251 prev_y = Some(line.y);
252 prev_font_size = font_size;
253 }
254
255 if !current_runs.is_empty() {
256 elements.push(PageElement::Para(Paragraph { runs: current_runs }));
257 }
258
259 elements
260}
261
/// Maps a PDF font name to the name of a commonly available replacement font.
///
/// Everything up to and including the first `+` is dropped before matching,
/// which removes a subset tag such as `ABCDEF+`. The remainder is matched by
/// case-sensitive substring search, in this order:
///
/// 1. explicit families: `Times` → "Times New Roman", `Arial` / `Helvetica`
///    → "Arial", `Courier` → "Courier New";
/// 2. generic classes: `Mono` → "Courier New", `Sans` → "Arial", `Serif` →
///    "Times New Roman";
/// 3. `Symbol` → "Symbol";
/// 4. anything else → "Calibri".
///
/// `Mono` is tested before `Sans`, and `Sans` before `Serif`, so that names
/// like `DejaVuSansMono` map to a monospaced font and names like `SansSerif`
/// map to a sans-serif one.
///
/// The returned string is a static literal and does not borrow from
/// `pdf_font`.
pub fn map_font_name(pdf_font: &str) -> &'static str {
    // `+` is a single-byte character, so slicing at `pos + 1` is always on a
    // character boundary.
    let name = pdf_font
        .find('+')
        .map_or(pdf_font, |pos| &pdf_font[pos + 1..]);
    let has = |needle: &str| name.contains(needle);

    if has("Times") {
        "Times New Roman"
    } else if has("Arial") || has("Helvetica") {
        "Arial"
    } else if has("Courier") || has("Mono") {
        "Courier New"
    } else if has("Sans") {
        "Arial"
    } else if has("Serif") {
        "Times New Roman"
    } else if has("Symbol") {
        "Symbol"
    } else {
        "Calibri"
    }
}
284
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a block whose `bbox` starts at (`x`, `y`), is half a font size
    /// wide per byte of `text`, and one font size tall.
    fn make_block(text: &str, x: f64, y: f64, font_size: f64) -> TextBlock {
        let right = x + text.len() as f64 * font_size * 0.5;
        let top = y + font_size;
        TextBlock {
            text: text.to_string(),
            page: 1,
            bbox: [x, y, right, top],
            font_name: "F1".to_string(),
            font_size,
            actual_text: None,
        }
    }

    #[test]
    fn single_line_becomes_paragraph() {
        let elements = analyze_page(&[make_block("Hello World", 72.0, 720.0, 12.0)]);
        assert_eq!(elements.len(), 1);
        assert!(matches!(elements[0], PageElement::Para(_)));
    }

    #[test]
    fn two_close_lines_same_paragraph() {
        // 14 apart: below the 12.0 * 1.5 paragraph threshold.
        let first = make_block("Line 1", 72.0, 720.0, 12.0);
        let second = make_block("Line 2", 72.0, 706.0, 12.0);
        let elements = analyze_page(&[first, second]);
        assert_eq!(elements.len(), 1);
    }

    #[test]
    fn two_distant_lines_different_paragraphs() {
        // 40 apart: above the 12.0 * 1.5 paragraph threshold.
        let first = make_block("Para 1", 72.0, 720.0, 12.0);
        let second = make_block("Para 2", 72.0, 680.0, 12.0);
        let elements = analyze_page(&[first, second]);
        assert_eq!(elements.len(), 2);
    }

    #[test]
    fn table_detection() {
        // Three rows, each with a cell at x = 72 and another at x = 200.
        let cells = [
            ("Name", 72.0, 700.0),
            ("Age", 200.0, 700.0),
            ("Alice", 72.0, 684.0),
            ("30", 200.0, 684.0),
            ("Bob", 72.0, 668.0),
            ("25", 200.0, 668.0),
        ];
        let blocks: Vec<TextBlock> = cells
            .iter()
            .map(|&(text, x, y)| make_block(text, x, y, 12.0))
            .collect();

        let elements = analyze_page(&blocks);
        assert_eq!(elements.len(), 1);
        assert!(matches!(elements[0], PageElement::Tbl(_)));
        if let PageElement::Tbl(ref tbl) = elements[0] {
            assert_eq!(tbl.rows.len(), 3);
            assert_eq!(tbl.col_count, 2);
        }
    }

    #[test]
    fn empty_blocks_returns_empty() {
        assert!(analyze_page(&[]).is_empty());
    }
}
355}