use pdf_extract::TextBlock;
const LINE_Y_TOLERANCE: f64 = 2.0;
const PARAGRAPH_GAP_FACTOR: f64 = 1.5;
const TABLE_X_TOLERANCE: f64 = 5.0;
#[derive(Debug, Clone)]
pub struct Run {
pub text: String,
pub font_name: String,
pub font_size: f64,
pub bold: bool,
pub italic: bool,
}
#[derive(Debug, Clone)]
pub struct Paragraph {
pub runs: Vec<Run>,
}
#[derive(Debug, Clone)]
pub struct Table {
pub rows: Vec<Vec<String>>,
pub col_count: usize,
}
#[derive(Debug, Clone)]
pub struct DocxImage {
pub data: Vec<u8>,
pub width: u32,
pub height: u32,
pub content_type: String,
pub id: String,
}
#[derive(Debug, Clone)]
pub enum PageElement {
Para(Paragraph),
Tbl(Table),
Img(DocxImage),
}
#[derive(Debug)]
struct Line {
y: f64,
font_size: f64,
blocks: Vec<TextBlock>,
}
pub fn analyze_page(blocks: &[TextBlock]) -> Vec<PageElement> {
if blocks.is_empty() {
return Vec::new();
}
let lines = group_into_lines(blocks);
let table = try_detect_table(&lines);
if let Some(tbl) = table {
return vec![PageElement::Tbl(tbl)];
}
group_into_paragraphs(&lines)
}
fn group_into_lines(blocks: &[TextBlock]) -> Vec<Line> {
let mut sorted: Vec<&TextBlock> = blocks.iter().collect();
sorted.sort_by(|a, b| {
let y_cmp = b.bbox[1]
.partial_cmp(&a.bbox[1])
.unwrap_or(std::cmp::Ordering::Equal);
if y_cmp == std::cmp::Ordering::Equal {
a.bbox[0]
.partial_cmp(&b.bbox[0])
.unwrap_or(std::cmp::Ordering::Equal)
} else {
y_cmp
}
});
let mut lines: Vec<Line> = Vec::new();
for block in sorted {
let y = block.bbox[1];
let matched = lines
.iter_mut()
.find(|line| (line.y - y).abs() < LINE_Y_TOLERANCE);
if let Some(line) = matched {
line.blocks.push(block.clone());
} else {
lines.push(Line {
y,
font_size: block.font_size,
blocks: vec![block.clone()],
});
}
}
for line in &mut lines {
line.blocks.sort_by(|a, b| {
a.bbox[0]
.partial_cmp(&b.bbox[0])
.unwrap_or(std::cmp::Ordering::Equal)
});
}
lines
}
fn try_detect_table(lines: &[Line]) -> Option<Table> {
if lines.len() < 2 {
return None;
}
let mut x_positions: Vec<f64> = Vec::new();
for line in lines {
for block in &line.blocks {
let x = block.bbox[0];
if !x_positions
.iter()
.any(|&px| (px - x).abs() < TABLE_X_TOLERANCE)
{
x_positions.push(x);
}
}
}
x_positions.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
if x_positions.len() < 2 {
return None;
}
let multi_col_lines = lines
.iter()
.filter(|line| {
let unique_cols = line
.blocks
.iter()
.map(|b| {
x_positions
.iter()
.position(|&px| (px - b.bbox[0]).abs() < TABLE_X_TOLERANCE)
.unwrap_or(0)
})
.collect::<std::collections::HashSet<_>>();
unique_cols.len() >= 2
})
.count();
if multi_col_lines * 100 / lines.len() < 60 {
return None;
}
let col_count = x_positions.len();
let mut rows = Vec::new();
for line in lines {
let mut row = vec![String::new(); col_count];
for block in &line.blocks {
let col_idx = x_positions
.iter()
.position(|&px| (px - block.bbox[0]).abs() < TABLE_X_TOLERANCE)
.unwrap_or(0);
if !row[col_idx].is_empty() {
row[col_idx].push(' ');
}
row[col_idx].push_str(&block.text);
}
rows.push(row);
}
Some(Table { rows, col_count })
}
fn group_into_paragraphs(lines: &[Line]) -> Vec<PageElement> {
let mut elements = Vec::new();
let mut current_runs: Vec<Run> = Vec::new();
let mut prev_y: Option<f64> = None;
let mut prev_font_size: f64 = 12.0;
for line in lines {
let line_text: String = line
.blocks
.iter()
.map(|b| b.text.as_str())
.collect::<Vec<_>>()
.join(" ");
if line_text.trim().is_empty() {
continue;
}
let is_new_paragraph = if let Some(py) = prev_y {
let gap = (py - line.y).abs();
gap > prev_font_size * PARAGRAPH_GAP_FACTOR
} else {
false
};
if is_new_paragraph && !current_runs.is_empty() {
elements.push(PageElement::Para(Paragraph {
runs: std::mem::take(&mut current_runs),
}));
}
let font_name = line
.blocks
.first()
.map(|b| b.font_name.clone())
.unwrap_or_default();
let font_size = line.font_size;
let bold = font_name.contains("Bold") || font_name.contains("bold");
let italic = font_name.contains("Italic")
|| font_name.contains("italic")
|| font_name.contains("Oblique");
current_runs.push(Run {
text: line_text,
font_name,
font_size,
bold,
italic,
});
prev_y = Some(line.y);
prev_font_size = font_size;
}
if !current_runs.is_empty() {
elements.push(PageElement::Para(Paragraph { runs: current_runs }));
}
elements
}
pub fn map_font_name(pdf_font: &str) -> &str {
let name = if let Some(pos) = pdf_font.find('+') {
&pdf_font[pos + 1..]
} else {
pdf_font
};
if name.contains("Times") || name.contains("Serif") {
"Times New Roman"
} else if name.contains("Arial") || name.contains("Helvetica") || name.contains("Sans") {
"Arial"
} else if name.contains("Courier") || name.contains("Mono") {
"Courier New"
} else if name.contains("Symbol") {
"Symbol"
} else {
"Calibri"
}
}
#[cfg(test)]
mod tests {
use super::*;
fn make_block(text: &str, x: f64, y: f64, font_size: f64) -> TextBlock {
TextBlock {
text: text.to_string(),
page: 1,
bbox: [x, y, x + text.len() as f64 * font_size * 0.5, y + font_size],
font_name: "F1".to_string(),
font_size,
actual_text: None,
base_font: None,
is_bold: false,
is_italic: false,
color: None,
width_source: Default::default(),
char_bounds: vec![],
}
}
#[test]
fn single_line_becomes_paragraph() {
let blocks = vec![make_block("Hello World", 72.0, 720.0, 12.0)];
let elements = analyze_page(&blocks);
assert_eq!(elements.len(), 1);
assert!(matches!(elements[0], PageElement::Para(_)));
}
#[test]
fn two_close_lines_same_paragraph() {
let blocks = vec![
make_block("Line 1", 72.0, 720.0, 12.0),
make_block("Line 2", 72.0, 706.0, 12.0), ];
let elements = analyze_page(&blocks);
assert_eq!(elements.len(), 1);
}
#[test]
fn two_distant_lines_different_paragraphs() {
let blocks = vec![
make_block("Para 1", 72.0, 720.0, 12.0),
make_block("Para 2", 72.0, 680.0, 12.0), ];
let elements = analyze_page(&blocks);
assert_eq!(elements.len(), 2);
}
#[test]
fn table_detection() {
let blocks = vec![
make_block("Name", 72.0, 700.0, 12.0),
make_block("Age", 200.0, 700.0, 12.0),
make_block("Alice", 72.0, 684.0, 12.0),
make_block("30", 200.0, 684.0, 12.0),
make_block("Bob", 72.0, 668.0, 12.0),
make_block("25", 200.0, 668.0, 12.0),
];
let elements = analyze_page(&blocks);
assert_eq!(elements.len(), 1);
assert!(matches!(elements[0], PageElement::Tbl(_)));
if let PageElement::Tbl(ref tbl) = elements[0] {
assert_eq!(tbl.rows.len(), 3);
assert_eq!(tbl.col_count, 2);
}
}
#[test]
fn empty_blocks_returns_empty() {
let elements = analyze_page(&[]);
assert!(elements.is_empty());
}
}