use super::bounding_box::BoundingBox;
use super::clustering::FontSizeCluster;
use crate::core::config::ExtractionConfig;
use crate::pdf::error::{PdfError, Result};
use pdfium_render::prelude::*;
/// Font size substituted in `extract_chars_with_fonts` when the size reported for a
/// character is not strictly positive.
const DEFAULT_FONT_SIZE: f32 = 12.0;
/// Intersection ratio that must be exceeded for a character box to be merged into
/// the current block regardless of centre distance.
const MERGE_INTERSECTION_THRESHOLD: f32 = 0.05;
/// Multiplier applied to the taller of the two box heights to get the maximum
/// horizontal centre-to-centre distance (exclusive) for a distance-based merge.
const MERGE_X_THRESHOLD_MULTIPLIER: f32 = 2.0;
/// Multiplier applied to the taller of the two box heights to get the maximum
/// vertical centre-to-centre distance (exclusive) for a distance-based merge.
const MERGE_Y_THRESHOLD_MULTIPLIER: f32 = 1.5;
/// A single character together with its position, size and font size.
///
/// `extract_chars_with_fonts` fills these fields from pdfium's loose character
/// bounds; other producers may exist outside this file.
#[derive(Debug, Clone)]
pub struct CharData {
/// The character, stored as an owned string.
pub text: String,
/// Left edge of the character's bounds.
pub x: f32,
/// Bottom edge of the character's bounds (as reported by `bounds.bottom()`).
pub y: f32,
/// Font size of the character; falls back to `DEFAULT_FONT_SIZE` when the
/// reported size is not positive.
pub font_size: f32,
/// Width of the character's bounds.
pub width: f32,
/// Height of the character's bounds.
pub height: f32,
}
/// A run of text with its enclosing bounding box and a representative font size.
///
/// When produced by `merge_chars_into_blocks`, `text` is the concatenation of the
/// merged characters and `font_size` is the mean of their font sizes.
#[derive(Debug, Clone, PartialEq)]
pub struct TextBlock {
/// Text content of the block.
pub text: String,
/// Box enclosing every character of the block.
pub bbox: BoundingBox,
/// Font size associated with the block.
pub font_size: f32,
}
/// Cluster assignment produced by a k-means run over text blocks.
#[derive(Debug, Clone)]
pub struct KMeansResult {
/// Cluster id per block; `assign_hierarchy_levels` pairs `labels[i]` with the
/// i-th block positionally.
pub labels: Vec<u32>,
}
/// Structural role of a text block: one of six heading depths or body text.
///
/// The explicit discriminants give headings their depth (`H1 = 1` … `H6 = 6`)
/// and body text the value `0`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HierarchyLevel {
/// Heading depth 1.
H1 = 1,
/// Heading depth 2.
H2 = 2,
/// Heading depth 3.
H3 = 3,
/// Heading depth 4.
H4 = 4,
/// Heading depth 5.
H5 = 5,
/// Heading depth 6.
H6 = 6,
/// Non-heading text.
Body = 0,
}
/// A text block annotated with the hierarchy level assigned to it.
///
/// `assign_hierarchy_levels` copies `text`, `bbox` and `font_size` from the
/// source `TextBlock` and adds the derived level.
#[derive(Debug, Clone)]
pub struct HierarchyBlock {
/// Text content copied from the source block.
pub text: String,
/// Bounding box copied from the source block.
pub bbox: BoundingBox,
/// Font size copied from the source block.
pub font_size: f32,
/// Heading depth or body classification assigned to the block.
pub hierarchy_level: HierarchyLevel,
}
impl HierarchyLevel {
pub fn from_level(level: usize) -> Self {
match level {
1 => HierarchyLevel::H1,
2 => HierarchyLevel::H2,
3 => HierarchyLevel::H3,
4 => HierarchyLevel::H4,
5 => HierarchyLevel::H5,
6 => HierarchyLevel::H6,
_ => HierarchyLevel::Body,
}
}
}
pub fn assign_hierarchy_levels(blocks: &[TextBlock], kmeans_result: &KMeansResult) -> Vec<HierarchyBlock> {
if blocks.is_empty() || kmeans_result.labels.is_empty() {
return Vec::new();
}
blocks
.iter()
.zip(kmeans_result.labels.iter())
.map(|(block, &cluster_id)| {
let hierarchy_level = match cluster_id {
0 => HierarchyLevel::H1,
1 => HierarchyLevel::H2,
2 => HierarchyLevel::H3,
3 => HierarchyLevel::H4,
4 => HierarchyLevel::H5,
5 => HierarchyLevel::H6,
_ => HierarchyLevel::Body,
};
HierarchyBlock {
text: block.text.clone(),
bbox: block.bbox,
font_size: block.font_size,
hierarchy_level,
}
})
.collect()
}
/// Pairs every block with a hierarchy level based on the first font-size cluster it matches.
///
/// A block matches a cluster when its font size is within `1.0` (exclusive) of the
/// cluster centroid, or when the cluster lists the block among its members. Clusters
/// are tried in slice order and the first match wins. With `n` clusters, the first
/// `min(n - 1, 6)` cluster positions map to `H1`, `H2`, …; later positions, and
/// blocks that match no cluster, map to `Body`.
///
/// Returns an empty vector when either input is empty. A single cluster classifies
/// every block as `Body`.
pub fn assign_hierarchy_levels_from_clusters(
    blocks: &[TextBlock],
    clusters: &[FontSizeCluster],
) -> Vec<(TextBlock, HierarchyLevel)> {
    const MAX_HEADING_LEVELS: usize = 6;

    if blocks.is_empty() || clusters.is_empty() {
        return Vec::new();
    }
    if let [_single] = clusters {
        // One cluster means there is nothing to distinguish headings from body text.
        return blocks
            .iter()
            .cloned()
            .map(|block| (block, HierarchyLevel::Body))
            .collect();
    }

    let heading_slots = (clusters.len() - 1).min(MAX_HEADING_LEVELS);

    let mut paired = Vec::new();
    for block in blocks {
        let matched_slot = clusters
            .iter()
            .position(|cluster| (block.font_size - cluster.centroid).abs() < 1.0 || cluster.members.contains(block));

        let level = match matched_slot {
            Some(slot) if slot < heading_slots => HierarchyLevel::from_level(slot + 1),
            _ => HierarchyLevel::Body,
        };
        paired.push((block.clone(), level));
    }
    paired
}
/// Collects every character on `page` along with its loose bounds and font size.
///
/// Characters are skipped when they cannot be fetched by index, have no Unicode
/// value, or have no loose bounds. A reported font size that is not strictly
/// positive is replaced by `DEFAULT_FONT_SIZE`.
///
/// # Errors
///
/// Returns `PdfError::TextExtractionFailed` when the page's text cannot be loaded.
pub fn extract_chars_with_fonts(page: &PdfPage) -> Result<Vec<CharData>> {
    let page_text = match page.text() {
        Ok(text) => text,
        Err(e) => {
            return Err(PdfError::TextExtractionFailed(format!(
                "Failed to get page text: {}",
                e
            )));
        }
    };

    let chars = page_text.chars();
    let char_count = chars.len();

    let mut char_data_list = Vec::with_capacity(char_count);
    char_data_list.extend((0..char_count).filter_map(|index| {
        let pdf_char = chars.get(index).ok()?;
        let ch = pdf_char.unicode_char()?;

        let reported_size = pdf_char.unscaled_font_size().value;
        let bounds = pdf_char.loose_bounds().ok()?;

        Some(CharData {
            text: ch.to_string(),
            x: bounds.left().value,
            y: bounds.bottom().value,
            width: bounds.width().value,
            height: bounds.height().value,
            font_size: if reported_size > 0.0 {
                reported_size
            } else {
                DEFAULT_FONT_SIZE
            },
        })
    }));

    Ok(char_data_list)
}
/// Decides whether `candidate` should be absorbed into the block covered by `block`.
///
/// The boxes merge when their centres are closer than the X/Y thresholds (each a
/// multiple of the taller box's height), or when their intersection ratio exceeds
/// `MERGE_INTERSECTION_THRESHOLD`.
fn should_merge_boxes(block: &BoundingBox, candidate: &BoundingBox) -> bool {
    // The taller of the two boxes is used as the distance scale.
    let scale = (block.bottom - block.top).max(candidate.bottom - candidate.top);
    let intersection_ratio = block.intersection_ratio(candidate);

    let (block_cx, block_cy) = block.center();
    let (candidate_cx, candidate_cy) = candidate.center();
    let dx = (block_cx - candidate_cx).abs();
    let dy = (block_cy - candidate_cy).abs();

    let close_enough = dx < scale * MERGE_X_THRESHOLD_MULTIPLIER && dy < scale * MERGE_Y_THRESHOLD_MULTIPLIER;
    close_enough || intersection_ratio > MERGE_INTERSECTION_THRESHOLD
}

/// Builds the `TextBlock` for the characters at `members` (indices into `char_boxes`).
///
/// Text is concatenated in `members` order, the box is the union of the member
/// boxes, and the font size is the mean of the member font sizes. `members` must
/// be non-empty.
fn text_block_from_members(char_boxes: &[(CharData, BoundingBox)], members: &[usize]) -> TextBlock {
    let mut text = String::new();
    let mut left = f32::INFINITY;
    let mut top = f32::INFINITY;
    let mut right = f32::NEG_INFINITY;
    let mut bottom = f32::NEG_INFINITY;
    let mut total_font_size = 0.0_f32;

    for &index in members {
        let (char_data, bbox) = &char_boxes[index];
        text.push_str(&char_data.text);
        left = left.min(bbox.left);
        top = top.min(bbox.top);
        right = right.max(bbox.right);
        bottom = bottom.max(bbox.bottom);
        total_font_size += char_data.font_size;
    }

    TextBlock {
        text,
        bbox: BoundingBox {
            left,
            top,
            right,
            bottom,
        },
        font_size: total_font_size / members.len() as f32,
    }
}

/// Groups characters into text blocks by spatial proximity.
///
/// Each character gets a box (`left = x`, `top = y - height`, `right = x + width`,
/// `bottom = y`). Characters are sorted by `top`, then `left`. Starting from each
/// unused character in that order, the block repeatedly absorbs later unused
/// characters whose box is close to or overlaps the block's growing box, until a
/// full pass adds nothing. Characters appear in a block's text in the order they
/// were absorbed.
///
/// Returns an empty vector for empty input.
///
/// Sorting uses `f32::total_cmp`, so NaN coordinates (possible with malformed PDF
/// data) no longer cause a panic; they are ordered deterministically instead.
///
/// NOTE(review): the box construction treats `y` as the bottom edge with `top`
/// being numerically smaller, while `extract_chars_with_fonts` fills `y` from
/// pdfium's `bounds.bottom()` — confirm the intended coordinate convention.
pub fn merge_chars_into_blocks(chars: Vec<CharData>) -> Vec<TextBlock> {
    if chars.is_empty() {
        return Vec::new();
    }

    let mut char_boxes: Vec<(CharData, BoundingBox)> = chars
        .into_iter()
        .map(|char_data| {
            let bbox = BoundingBox {
                left: char_data.x,
                top: char_data.y - char_data.height,
                right: char_data.x + char_data.width,
                bottom: char_data.y,
            };
            (char_data, bbox)
        })
        .collect();

    // Stable sort by (top, left). `total_cmp` is a total order, so NaN cannot panic here.
    char_boxes.sort_by(|(_, a), (_, b)| a.top.total_cmp(&b.top).then_with(|| a.left.total_cmp(&b.left)));

    let mut blocks: Vec<TextBlock> = Vec::new();
    let mut used = vec![false; char_boxes.len()];

    for seed in 0..char_boxes.len() {
        if used[seed] {
            continue;
        }
        used[seed] = true;

        // Indices of the characters in this block, in absorption order.
        let mut members = vec![seed];
        let mut block_bbox = char_boxes[seed].1;

        // Growing the block box can bring previously rejected characters into
        // range, so rescan until a full pass absorbs nothing.
        let mut changed = true;
        while changed {
            changed = false;
            for candidate in (seed + 1)..char_boxes.len() {
                if used[candidate] {
                    continue;
                }
                let candidate_bbox = char_boxes[candidate].1;
                if should_merge_boxes(&block_bbox, &candidate_bbox) {
                    members.push(candidate);
                    block_bbox.left = block_bbox.left.min(candidate_bbox.left);
                    block_bbox.top = block_bbox.top.min(candidate_bbox.top);
                    block_bbox.right = block_bbox.right.max(candidate_bbox.right);
                    block_bbox.bottom = block_bbox.bottom.max(candidate_bbox.bottom);
                    used[candidate] = true;
                    changed = true;
                }
            }
        }

        blocks.push(text_block_from_members(&char_boxes, &members));
    }

    blocks
}
/// Reports whether the extracted text covers too little of the page, so OCR should run.
///
/// Coverage is the sum of the block box areas (negative widths or heights are
/// clamped to zero) divided by the page area. The threshold comes from
/// `config.pdf_options.hierarchy.ocr_coverage_threshold` and defaults to `0.5`
/// when any part of that path is unset. Returns `true` when coverage is below
/// the threshold, and also when the page area is not positive.
pub fn should_trigger_ocr(page: &PdfPage, blocks: &[TextBlock], config: &ExtractionConfig) -> bool {
    let page_area = page.width().value * page.height().value;
    if page_area <= 0.0 {
        // A degenerate page gives no usable coverage ratio.
        return true;
    }

    let block_area = |block: &TextBlock| -> f32 {
        let bbox = &block.bbox;
        let width = (bbox.right - bbox.left).max(0.0);
        let height = (bbox.bottom - bbox.top).max(0.0);
        width * height
    };
    let text_area: f32 = blocks.iter().map(block_area).sum();

    let hierarchy_config = config
        .pdf_options
        .as_ref()
        .and_then(|pdf_config| pdf_config.hierarchy.as_ref());
    let threshold = hierarchy_config
        .and_then(|hierarchy| hierarchy.ocr_coverage_threshold)
        .unwrap_or(0.5);

    text_area / page_area < threshold
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a `CharData` fixture from positional values.
    fn sample_char(text: &str, x: f32, y: f32, font_size: f32, width: f32, height: f32) -> CharData {
        CharData {
            text: text.to_string(),
            x,
            y,
            font_size,
            width,
            height,
        }
    }

    /// Every field of a freshly built `CharData` holds the value it was given.
    #[test]
    fn test_char_data_creation() {
        let CharData {
            text,
            x,
            y,
            font_size,
            width,
            height,
        } = sample_char("A", 100.0, 50.0, 12.0, 10.0, 12.0);

        assert_eq!(text, "A");
        assert_eq!(x, 100.0);
        assert_eq!(y, 50.0);
        assert_eq!(font_size, 12.0);
        assert_eq!(width, 10.0);
        assert_eq!(height, 12.0);
    }

    /// Cloning preserves the text and font size of the original.
    #[test]
    fn test_char_data_clone() {
        let original = sample_char("B", 200.0, 100.0, 14.0, 8.0, 14.0);
        let duplicate = original.clone();

        assert_eq!(duplicate.text, original.text);
        assert_eq!(duplicate.font_size, original.font_size);
    }
}