1use pdf_extract::TextBlock;
4
/// Two blocks belong to the same line when their `bbox[1]` values differ by
/// strictly less than this amount.
const LINE_Y_TOLERANCE: f64 = 2.0;

/// A new paragraph starts when the vertical distance to the previous line is
/// strictly greater than the previous line's font size times this factor.
const PARAGRAPH_GAP_FACTOR: f64 = 1.5;

/// Two blocks belong to the same table column when their `bbox[0]` values
/// differ by strictly less than this amount.
const TABLE_X_TOLERANCE: f64 = 5.0;
13
/// A piece of text together with the font attributes it is rendered with.
#[derive(Clone, Debug)]
pub struct Run {
    /// Text content of the run.
    pub text: String,
    /// Font name as reported by the source text block (not yet mapped to a
    /// document font).
    pub font_name: String,
    /// Font size of the run.
    pub font_size: f64,
    /// `true` when the run should be rendered bold.
    pub bold: bool,
    /// `true` when the run should be rendered italic.
    pub italic: bool,
}
40
41#[derive(Debug, Clone)]
47pub struct Paragraph {
48 pub runs: Vec<Run>,
51}
52
/// A simple text table.
#[derive(Clone, Debug)]
pub struct Table {
    /// Cell text, indexed as `rows[row][column]`.
    pub rows: Vec<Vec<String>>,
    /// Number of columns in the table.
    pub col_count: usize,
}
68
/// An image to be placed in the output document.
#[derive(Clone, Debug)]
pub struct DocxImage {
    /// Raw image bytes.
    pub data: Vec<u8>,
    /// Image width. NOTE(review): presumably pixels; confirm with the writer.
    pub width: u32,
    /// Image height. NOTE(review): presumably pixels; confirm with the writer.
    pub height: u32,
    /// Content type of `data`. NOTE(review): presumably a MIME type such as
    /// `image/png`; confirm with the code that fills it in.
    pub content_type: String,
    /// Identifier of the image. NOTE(review): presumably the relationship id
    /// used inside the document package; confirm.
    pub id: String,
}
91
92#[derive(Debug, Clone)]
98pub enum PageElement {
99 Para(Paragraph),
101 Tbl(Table),
103 Img(DocxImage),
105}
106
107#[derive(Debug)]
109struct Line {
110 y: f64,
111 font_size: f64,
112 blocks: Vec<TextBlock>,
113}
114
115pub fn analyze_page(blocks: &[TextBlock]) -> Vec<PageElement> {
117 if blocks.is_empty() {
118 return Vec::new();
119 }
120
121 let lines = group_into_lines(blocks);
122 let table = try_detect_table(&lines);
123
124 if let Some(tbl) = table {
125 return vec![PageElement::Tbl(tbl)];
126 }
127
128 group_into_paragraphs(&lines)
129}
130
131fn group_into_lines(blocks: &[TextBlock]) -> Vec<Line> {
133 let mut sorted: Vec<&TextBlock> = blocks.iter().collect();
134 sorted.sort_by(|a, b| {
136 let y_cmp = b.bbox[1]
137 .partial_cmp(&a.bbox[1])
138 .unwrap_or(std::cmp::Ordering::Equal);
139 if y_cmp == std::cmp::Ordering::Equal {
140 a.bbox[0]
141 .partial_cmp(&b.bbox[0])
142 .unwrap_or(std::cmp::Ordering::Equal)
143 } else {
144 y_cmp
145 }
146 });
147
148 let mut lines: Vec<Line> = Vec::new();
149
150 for block in sorted {
151 let y = block.bbox[1];
152 let matched = lines
153 .iter_mut()
154 .find(|line| (line.y - y).abs() < LINE_Y_TOLERANCE);
155
156 if let Some(line) = matched {
157 line.blocks.push(block.clone());
158 } else {
159 lines.push(Line {
160 y,
161 font_size: block.font_size,
162 blocks: vec![block.clone()],
163 });
164 }
165 }
166
167 for line in &mut lines {
169 line.blocks.sort_by(|a, b| {
170 a.bbox[0]
171 .partial_cmp(&b.bbox[0])
172 .unwrap_or(std::cmp::Ordering::Equal)
173 });
174 }
175
176 lines
177}
178
179fn try_detect_table(lines: &[Line]) -> Option<Table> {
184 if lines.len() < 2 {
185 return None;
186 }
187
188 let mut x_positions: Vec<f64> = Vec::new();
190 for line in lines {
191 for block in &line.blocks {
192 let x = block.bbox[0];
193 if !x_positions
194 .iter()
195 .any(|&px| (px - x).abs() < TABLE_X_TOLERANCE)
196 {
197 x_positions.push(x);
198 }
199 }
200 }
201 x_positions.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
202
203 if x_positions.len() < 2 {
204 return None;
205 }
206
207 let multi_col_lines = lines
209 .iter()
210 .filter(|line| {
211 let unique_cols = line
212 .blocks
213 .iter()
214 .map(|b| {
215 x_positions
216 .iter()
217 .position(|&px| (px - b.bbox[0]).abs() < TABLE_X_TOLERANCE)
218 .unwrap_or(0)
219 })
220 .collect::<std::collections::HashSet<_>>();
221 unique_cols.len() >= 2
222 })
223 .count();
224
225 if multi_col_lines * 100 / lines.len() < 60 {
227 return None;
228 }
229
230 let col_count = x_positions.len();
231 let mut rows = Vec::new();
232
233 for line in lines {
234 let mut row = vec![String::new(); col_count];
235 for block in &line.blocks {
236 let col_idx = x_positions
237 .iter()
238 .position(|&px| (px - block.bbox[0]).abs() < TABLE_X_TOLERANCE)
239 .unwrap_or(0);
240 if !row[col_idx].is_empty() {
241 row[col_idx].push(' ');
242 }
243 row[col_idx].push_str(&block.text);
244 }
245 rows.push(row);
246 }
247
248 Some(Table { rows, col_count })
249}
250
251fn group_into_paragraphs(lines: &[Line]) -> Vec<PageElement> {
253 let mut elements = Vec::new();
254 let mut current_runs: Vec<Run> = Vec::new();
255 let mut prev_y: Option<f64> = None;
256 let mut prev_font_size: f64 = 12.0;
257
258 for line in lines {
259 let line_text: String = line
260 .blocks
261 .iter()
262 .map(|b| b.text.as_str())
263 .collect::<Vec<_>>()
264 .join(" ");
265
266 if line_text.trim().is_empty() {
267 continue;
268 }
269
270 let is_new_paragraph = if let Some(py) = prev_y {
271 let gap = (py - line.y).abs();
272 gap > prev_font_size * PARAGRAPH_GAP_FACTOR
273 } else {
274 false
275 };
276
277 if is_new_paragraph && !current_runs.is_empty() {
278 elements.push(PageElement::Para(Paragraph {
279 runs: std::mem::take(&mut current_runs),
280 }));
281 }
282
283 let font_name = line
284 .blocks
285 .first()
286 .map(|b| b.font_name.clone())
287 .unwrap_or_default();
288 let font_size = line.font_size;
289
290 let bold = font_name.contains("Bold") || font_name.contains("bold");
291 let italic = font_name.contains("Italic")
292 || font_name.contains("italic")
293 || font_name.contains("Oblique");
294
295 current_runs.push(Run {
296 text: line_text,
297 font_name,
298 font_size,
299 bold,
300 italic,
301 });
302
303 prev_y = Some(line.y);
304 prev_font_size = font_size;
305 }
306
307 if !current_runs.is_empty() {
308 elements.push(PageElement::Para(Paragraph { runs: current_runs }));
309 }
310
311 elements
312}
313
/// Maps a PDF font name to the name of a commonly available document font.
///
/// Anything up to and including the first `+` (a font-subset prefix such as
/// `ABCDEF+`) is ignored. The remaining name is classified by substring, most
/// specific family first:
///
/// 1. `Courier` / `Mono` → `"Courier New"`
/// 2. `Arial` / `Helvetica` / `Sans` → `"Arial"`
/// 3. `Times` / `Serif` → `"Times New Roman"`
/// 4. `Symbol` → `"Symbol"`
/// 5. anything else → `"Calibri"`
///
/// Monospace and sans-serif are tested before serif so that names such as
/// `DejaVuSansMono` and `MicrosoftSansSerif` are not misclassified by the
/// `Sans` / `Serif` substrings they also contain.
///
/// The result is always one of the string literals above, so it does not
/// borrow from `pdf_font`.
pub fn map_font_name(pdf_font: &str) -> &'static str {
    let name = pdf_font
        .split_once('+')
        .map_or(pdf_font, |(_subset_tag, rest)| rest);
    let has_any = |needles: &[&str]| needles.iter().any(|needle| name.contains(needle));

    if has_any(&["Courier", "Mono"]) {
        "Courier New"
    } else if has_any(&["Arial", "Helvetica", "Sans"]) {
        "Arial"
    } else if has_any(&["Times", "Serif"]) {
        "Times New Roman"
    } else if name.contains("Symbol") {
        "Symbol"
    } else {
        "Calibri"
    }
}
336
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a test block at (`x`, `y`) whose box is `font_size` tall and
    /// half a font size wide per byte of `text`.
    fn text_block(text: &str, x: f64, y: f64, font_size: f64) -> TextBlock {
        let width = text.len() as f64 * font_size * 0.5;
        TextBlock {
            text: text.to_string(),
            page: 1,
            bbox: [x, y, x + width, y + font_size],
            font_name: "F1".to_string(),
            font_size,
            actual_text: None,
            base_font: None,
            is_bold: false,
            is_italic: false,
            color: None,
            width_source: Default::default(),
            char_bounds: vec![],
        }
    }

    #[test]
    fn single_line_becomes_paragraph() {
        let elements = analyze_page(&[text_block("Hello World", 72.0, 720.0, 12.0)]);
        assert!(matches!(elements.as_slice(), [PageElement::Para(_)]));
    }

    #[test]
    fn two_close_lines_same_paragraph() {
        // 14 units apart: below the 12.0 * 1.5 paragraph gap.
        let blocks = [
            text_block("Line 1", 72.0, 720.0, 12.0),
            text_block("Line 2", 72.0, 706.0, 12.0),
        ];
        assert_eq!(analyze_page(&blocks).len(), 1);
    }

    #[test]
    fn two_distant_lines_different_paragraphs() {
        // 40 units apart: above the 12.0 * 1.5 paragraph gap.
        let blocks = [
            text_block("Para 1", 72.0, 720.0, 12.0),
            text_block("Para 2", 72.0, 680.0, 12.0),
        ];
        assert_eq!(analyze_page(&blocks).len(), 2);
    }

    #[test]
    fn table_detection() {
        // Three rows, two columns at x = 72 and x = 200.
        let cells = [
            ("Name", 72.0, 700.0),
            ("Age", 200.0, 700.0),
            ("Alice", 72.0, 684.0),
            ("30", 200.0, 684.0),
            ("Bob", 72.0, 668.0),
            ("25", 200.0, 668.0),
        ];
        let blocks: Vec<TextBlock> = cells
            .iter()
            .map(|&(text, x, y)| text_block(text, x, y, 12.0))
            .collect();

        let elements = analyze_page(&blocks);
        assert!(matches!(elements.as_slice(), [PageElement::Tbl(_)]));
        if let [PageElement::Tbl(tbl)] = elements.as_slice() {
            assert_eq!(tbl.rows.len(), 3);
            assert_eq!(tbl.col_count, 2);
        }
    }

    #[test]
    fn empty_blocks_returns_empty() {
        assert_eq!(analyze_page(&[]).len(), 0);
    }
}