use super::bounding_box::BoundingBox;
use super::clustering::FontSizeCluster;
use crate::core::config::ExtractionConfig;
use crate::pdf::error::{PdfError, Result};
use pdfium_render::prelude::*;
/// Fallback font size used when pdfium reports an unscaled font size that is
/// not greater than zero.
const DEFAULT_FONT_SIZE: f32 = 12.0;
/// A character is merged into the current block when the value returned by
/// `BoundingBox::intersection_ratio` exceeds this threshold, regardless of how
/// far apart the box centres are.
const MERGE_INTERSECTION_THRESHOLD: f32 = 0.05;
/// Multiplied by the larger of the two box heights to obtain the maximum
/// horizontal centre-to-centre distance allowed for a distance-based merge.
const MERGE_X_THRESHOLD_MULTIPLIER: f32 = 2.0;
/// Multiplied by the larger of the two box heights to obtain the maximum
/// vertical centre-to-centre distance allowed for a distance-based merge.
const MERGE_Y_THRESHOLD_MULTIPLIER: f32 = 1.5;
/// A single character together with its position and font attributes.
///
/// The field descriptions below reflect how `extract_chars_with_fonts`
/// populates the struct; other producers may fill it differently.
#[derive(Debug, Clone)]
pub struct CharData {
/// The character, stored as an owned string.
pub text: String,
/// Left edge of the character's loose bounds.
pub x: f32,
/// Bottom edge of the character's loose bounds.
// NOTE(review): `merge_chars_into_blocks` derives the box top as
// `y - height`; confirm that this matches the coordinate orientation in use.
pub y: f32,
/// Unscaled font size, or `DEFAULT_FONT_SIZE` when the reported size is not
/// positive.
pub font_size: f32,
/// Width of the character's loose bounds.
pub width: f32,
/// Height of the character's loose bounds.
pub height: f32,
/// True when the font's bold flag is set, the font name contains "bold", or
/// the font weight is 700 or heavier.
pub is_bold: bool,
/// True when the font's italic flag is set or the font name contains
/// "italic" or "oblique".
pub is_italic: bool,
/// Y coordinate of the character origin, falling back to the bottom of the
/// loose bounds when the origin is unavailable.
pub baseline_y: f32,
}
/// A run of text with a bounding box and a representative font size.
///
/// As produced by `merge_chars_into_blocks`, a block is a group of characters
/// that were merged because they overlap or lie close together.
#[derive(Debug, Clone, PartialEq)]
pub struct TextBlock {
/// Text of the block; `merge_chars_into_blocks` concatenates the member
/// characters in the order they were merged.
pub text: String,
/// Box enclosing the block's characters.
pub bbox: BoundingBox,
/// Font size of the block; `merge_chars_into_blocks` stores the arithmetic
/// mean of the member characters' font sizes.
pub font_size: f32,
}
/// Cluster labels consumed by `assign_hierarchy_levels`.
#[derive(Debug, Clone)]
pub struct KMeansResult {
/// One cluster id per text block; `assign_hierarchy_levels` pairs
/// `labels[i]` with `blocks[i]`.
pub labels: Vec<u32>,
}
/// Hierarchy level assigned to a text block: one of six heading levels or
/// body text.
///
/// The explicit discriminants equal the heading number (`H1` is 1 through
/// `H6` is 6); `Body` is 0.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HierarchyLevel {
/// Heading level 1.
H1 = 1,
/// Heading level 2.
H2 = 2,
/// Heading level 3.
H3 = 3,
/// Heading level 4.
H4 = 4,
/// Heading level 5.
H5 = 5,
/// Heading level 6.
H6 = 6,
/// Non-heading (body) text.
Body = 0,
}
/// A text block annotated with its hierarchy level.
///
/// `assign_hierarchy_levels` builds these by copying the fields of a
/// `TextBlock` and attaching the level derived from its cluster label.
#[derive(Debug, Clone)]
pub struct HierarchyBlock {
/// Text copied from the source block.
pub text: String,
/// Bounding box copied from the source block.
pub bbox: BoundingBox,
/// Font size copied from the source block.
pub font_size: f32,
/// Heading level (or `Body`) assigned to the block.
pub hierarchy_level: HierarchyLevel,
}
impl HierarchyLevel {
pub fn from_level(level: usize) -> Self {
match level {
1 => HierarchyLevel::H1,
2 => HierarchyLevel::H2,
3 => HierarchyLevel::H3,
4 => HierarchyLevel::H4,
5 => HierarchyLevel::H5,
6 => HierarchyLevel::H6,
_ => HierarchyLevel::Body,
}
}
}
/// Attaches a hierarchy level to each block based on its k-means cluster label.
///
/// `blocks[i]` is paired with `kmeans_result.labels[i]`. Cluster ids `0..=5`
/// become `H1..=H6`; any other id becomes [`HierarchyLevel::Body`].
///
/// Pairing stops at the shorter of the two inputs, so the result is empty when
/// either input is empty and unpaired trailing blocks are not returned.
pub fn assign_hierarchy_levels(blocks: &[TextBlock], kmeans_result: &KMeansResult) -> Vec<HierarchyBlock> {
    let labels = &kmeans_result.labels;
    let mut assigned = Vec::with_capacity(blocks.len().min(labels.len()));

    for (block, &cluster_id) in blocks.iter().zip(labels) {
        // Cluster ids are 0-based while heading numbers are 1-based. An id of
        // `u32::MAX` cannot be incremented and is body text like any other
        // out-of-range id. The widening `as usize` cast is lossless on 32- and
        // 64-bit targets.
        let hierarchy_level = cluster_id
            .checked_add(1)
            .map_or(HierarchyLevel::Body, |level| HierarchyLevel::from_level(level as usize));

        assigned.push(HierarchyBlock {
            text: block.text.clone(),
            bbox: block.bbox,
            font_size: block.font_size,
            hierarchy_level,
        });
    }

    assigned
}
/// Pairs every block with a hierarchy level derived from font-size clusters.
///
/// A block belongs to the first cluster (in slice order) whose centroid is
/// within 1.0 of the block's font size or whose `members` contain the block.
/// The first `min(clusters.len() - 1, 6)` clusters map to `H1`, `H2`, ... in
/// order; later clusters, and blocks matching no cluster, are
/// [`HierarchyLevel::Body`].
///
/// Returns an empty vector when `blocks` or `clusters` is empty. With a single
/// cluster every block is `Body`.
pub fn assign_hierarchy_levels_from_clusters(
    blocks: &[TextBlock],
    clusters: &[FontSizeCluster],
) -> Vec<(TextBlock, HierarchyLevel)> {
    const MAX_HEADING_LEVELS: usize = 6;
    const CENTROID_TOLERANCE: f32 = 1.0;

    if blocks.is_empty() || clusters.is_empty() {
        return Vec::new();
    }

    // The last cluster is always reserved for body text.
    let heading_count = (clusters.len() - 1).min(MAX_HEADING_LEVELS);
    if heading_count == 0 {
        // Only one cluster exists, so there is nothing to rank as a heading.
        return blocks
            .iter()
            .map(|block| (block.clone(), HierarchyLevel::Body))
            .collect();
    }

    blocks
        .iter()
        .map(|block| {
            let level = clusters
                .iter()
                .position(|cluster| {
                    (block.font_size - cluster.centroid).abs() < CENTROID_TOLERANCE
                        || cluster.members.contains(block)
                })
                .filter(|&cluster_idx| cluster_idx < heading_count)
                .map_or(HierarchyLevel::Body, |cluster_idx| {
                    HierarchyLevel::from_level(cluster_idx + 1)
                });
            (block.clone(), level)
        })
        .collect()
}
/// Collects every character on `page` together with its geometry and font
/// attributes.
///
/// Characters are skipped when they cannot be fetched, have no Unicode value,
/// or have no loose bounds.
///
/// # Errors
///
/// Returns [`PdfError::TextExtractionFailed`] when the page text cannot be
/// obtained.
pub fn extract_chars_with_fonts(page: &PdfPage) -> Result<Vec<CharData>> {
    let page_text = page
        .text()
        .map_err(|e| PdfError::TextExtractionFailed(format!("Failed to get page text: {}", e)))?;
    let glyphs = page_text.chars();
    let glyph_count = glyphs.len();
    let mut collected = Vec::with_capacity(glyph_count);

    for index in 0..glyph_count {
        let glyph = match glyphs.get(index) {
            Ok(glyph) => glyph,
            Err(_) => continue,
        };
        let Some(unicode) = glyph.unicode_char() else {
            continue;
        };
        let Ok(bounds) = glyph.loose_bounds() else {
            continue;
        };

        // A non-positive reported size is replaced by the default.
        let reported_size = glyph.unscaled_font_size().value;
        let font_size = if reported_size > 0.0 {
            reported_size
        } else {
            DEFAULT_FONT_SIZE
        };

        let (font_name, flagged_bold, flagged_italic) = glyph.font_info();
        let (is_bold, is_italic) = if flagged_bold && flagged_italic {
            // Both flags are already set, so the name and weight are not consulted.
            (true, true)
        } else {
            let lowered = font_name.to_lowercase();
            let heavy_weight = glyph
                .font_weight()
                .map(|w| {
                    matches!(
                        w,
                        PdfFontWeight::Weight700Bold | PdfFontWeight::Weight800 | PdfFontWeight::Weight900
                    )
                })
                .unwrap_or(false);
            (
                flagged_bold || lowered.contains("bold") || heavy_weight,
                flagged_italic || lowered.contains("italic") || lowered.contains("oblique"),
            )
        };

        // Use the glyph origin as the baseline; fall back to the box bottom.
        let baseline_y = glyph
            .origin()
            .map(|(_x, y)| y.value)
            .unwrap_or(bounds.bottom().value);

        collected.push(CharData {
            text: unicode.to_string(),
            x: bounds.left().value,
            y: bounds.bottom().value,
            width: bounds.width().value,
            height: bounds.height().value,
            font_size,
            is_bold,
            is_italic,
            baseline_y,
        });
    }

    Ok(collected)
}
/// A text segment with its bounds and the font attributes of one sampled
/// character.
///
/// The field descriptions below reflect how `extract_segments_from_page`
/// populates the struct: font attributes come from the first character in the
/// segment that is neither whitespace nor a control character.
#[derive(Debug, Clone)]
pub struct SegmentData {
/// Full text of the segment.
pub text: String,
/// Left edge of the segment bounds.
pub x: f32,
/// Bottom edge of the segment bounds.
pub y: f32,
/// Width of the segment bounds.
pub width: f32,
/// Height of the segment bounds.
pub height: f32,
/// Unscaled font size of the sampled character, or `DEFAULT_FONT_SIZE` when
/// the reported size is not positive.
pub font_size: f32,
/// True when the sampled character's font is flagged bold, has "bold" in its
/// name, or has a weight of 700 or heavier.
pub is_bold: bool,
/// True when the sampled character's font is flagged italic or has "italic"
/// or "oblique" in its name.
pub is_italic: bool,
/// True when `is_monospace_font` recognises the sampled character's
/// lowercased font name.
pub is_monospace: bool,
/// Y coordinate of the sampled character's origin, falling back to the
/// bottom of the segment bounds when the origin is unavailable.
pub baseline_y: f32,
}
/// Collects the text segments of `page`, each annotated with font attributes
/// sampled from one of its characters.
///
/// The sample is the first character in the segment that can be fetched, has a
/// Unicode value, and is neither whitespace nor a control character. Segments
/// are skipped when their text is blank, their characters cannot be obtained,
/// or no character qualifies as a sample.
///
/// # Errors
///
/// Returns [`PdfError::TextExtractionFailed`] when the page text cannot be
/// obtained.
pub fn extract_segments_from_page(page: &PdfPage) -> Result<Vec<SegmentData>> {
    let page_text = page
        .text()
        .map_err(|e| PdfError::TextExtractionFailed(format!("Failed to get page text: {}", e)))?;
    let segments = page_text.segments();
    let segment_count = segments.len();
    let mut collected = Vec::with_capacity(segment_count);

    for segment_idx in 0..segment_count {
        let segment = match segments.get(segment_idx) {
            Ok(segment) => segment,
            Err(_) => continue,
        };
        let text = segment.text();
        if text.trim().is_empty() {
            continue;
        }

        let bounds = segment.bounds();
        let seg_bottom = bounds.bottom().value;

        let Ok(glyphs) = segment.chars() else {
            continue;
        };

        // Take the font attributes from the first visible character.
        let sample = (0..glyphs.len()).find_map(|glyph_idx| {
            let glyph = glyphs.get(glyph_idx).ok()?;
            let unicode = glyph.unicode_char()?;
            if unicode.is_whitespace() || unicode.is_control() {
                return None;
            }

            let reported_size = glyph.unscaled_font_size().value;
            let font_size = if reported_size > 0.0 {
                reported_size
            } else {
                DEFAULT_FONT_SIZE
            };

            let (font_name, flagged_bold, flagged_italic) = glyph.font_info();
            let lowered = font_name.to_lowercase();
            let (is_bold, is_italic) = if flagged_bold && flagged_italic {
                // Both flags are already set, so the name and weight are not consulted.
                (true, true)
            } else {
                let heavy_weight = glyph
                    .font_weight()
                    .map(|w| {
                        matches!(
                            w,
                            PdfFontWeight::Weight700Bold | PdfFontWeight::Weight800 | PdfFontWeight::Weight900
                        )
                    })
                    .unwrap_or(false);
                (
                    flagged_bold || lowered.contains("bold") || heavy_weight,
                    flagged_italic || lowered.contains("italic") || lowered.contains("oblique"),
                )
            };

            let is_monospace = is_monospace_font(&lowered);
            let baseline_y = glyph.origin().map(|(_x, y)| y.value).unwrap_or(seg_bottom);

            Some((font_size, is_bold, is_italic, is_monospace, baseline_y))
        });

        let Some((font_size, is_bold, is_italic, is_monospace, baseline_y)) = sample else {
            continue;
        };

        collected.push(SegmentData {
            text,
            x: bounds.left().value,
            y: seg_bottom,
            width: bounds.width().value,
            height: bounds.height().value,
            font_size,
            is_bold,
            is_italic,
            is_monospace,
            baseline_y,
        });
    }

    Ok(collected)
}
/// Reports whether a lowercased font name looks like a monospace typeface.
///
/// `name_lower` must already be lowercased by the caller. Matching is by
/// substring against a list of well-known monospace family names.
///
/// Multi-word families ("Source Code", "Fira Code", "Lucida Console") are
/// matched with spaces, hyphens and underscores ignored, so PostScript-style
/// names that have no spaces (`sourcecodepro-regular`, `firacode-retina`,
/// `lucidaconsole`) are recognised as well as the spaced spellings.
fn is_monospace_font(name_lower: &str) -> bool {
    /// Single-token markers, matched against the name exactly as given.
    /// "mono" also covers every family whose name ends in "Mono"
    /// (Liberation Mono, DejaVu Sans Mono, Roboto Mono, JetBrains Mono, ...).
    const SINGLE_WORD_PATTERNS: &[&str] = &[
        "mono",
        "courier",
        "consolas",
        "menlo",
        "inconsolata",
        "cascadia",
        "hack",
    ];
    /// Multi-word families, stored without separators and matched against the
    /// separator-free form of the name.
    const MULTI_WORD_PATTERNS: &[&str] = &["sourcecode", "firacode", "lucidaconsole"];

    if SINGLE_WORD_PATTERNS.iter().any(|&pattern| name_lower.contains(pattern)) {
        return true;
    }

    // Separators are removed only for the multi-word families. Single-token
    // markers are never matched across a word boundary, which would otherwise
    // make e.g. "arimo-normal" contain "mono".
    let compact: String = name_lower
        .chars()
        .filter(|c| !matches!(c, ' ' | '-' | '_'))
        .collect();
    MULTI_WORD_PATTERNS.iter().any(|&pattern| compact.contains(pattern))
}
/// Groups characters into text blocks by spatial proximity.
///
/// Each character gets a box spanning `x..x + width` horizontally and
/// `y - height..y` vertically. Characters are sorted by box top, then left
/// edge. Starting from each unclaimed character in that order, later
/// characters are absorbed into the block while they either overlap the
/// block's growing box by more than `MERGE_INTERSECTION_THRESHOLD` or have a
/// centre within the X/Y distance thresholds of the block's centre. Scanning
/// repeats until a pass adds nothing.
///
/// Every resulting block carries the concatenated text of its characters in
/// merge order, the box enclosing them, and their mean font size. Returns an
/// empty vector for empty input.
pub fn merge_chars_into_blocks(chars: Vec<CharData>) -> Vec<TextBlock> {
    use std::cmp::Ordering;

    if chars.is_empty() {
        return Vec::new();
    }

    // Pair every character with its box.
    let mut entries: Vec<(CharData, BoundingBox)> = Vec::with_capacity(chars.len());
    for glyph in chars {
        let bbox = BoundingBox {
            left: glyph.x,
            top: glyph.y - glyph.height,
            right: glyph.x + glyph.width,
            bottom: glyph.y,
        };
        entries.push((glyph, bbox));
    }

    // Order by top edge, then left edge; incomparable values count as equal.
    entries.sort_by(|(_, lhs), (_, rhs)| {
        lhs.top
            .partial_cmp(&rhs.top)
            .unwrap_or(Ordering::Equal)
            .then_with(|| lhs.left.partial_cmp(&rhs.left).unwrap_or(Ordering::Equal))
    });

    // Decides whether `candidate` joins a block whose current extent is `block`.
    let should_merge = |block: BoundingBox, candidate: BoundingBox| -> bool {
        // The taller of the two boxes sets the scale for the distance thresholds.
        let reference_height = (block.bottom - block.top).max(candidate.bottom - candidate.top);
        let overlap = block.intersection_ratio(&candidate);
        let (block_cx, block_cy) = block.center();
        let (candidate_cx, candidate_cy) = candidate.center();
        let dx = (block_cx - candidate_cx).abs();
        let dy = (block_cy - candidate_cy).abs();
        let within_distance = dx < reference_height * MERGE_X_THRESHOLD_MULTIPLIER
            && dy < reference_height * MERGE_Y_THRESHOLD_MULTIPLIER;
        within_distance || overlap > MERGE_INTERSECTION_THRESHOLD
    };

    let total = entries.len();
    let mut claimed = vec![false; total];
    // Each group lists indices into `entries` in the order they were merged.
    let mut groups: Vec<Vec<usize>> = Vec::new();

    for seed in 0..total {
        if claimed[seed] {
            continue;
        }
        claimed[seed] = true;
        let mut members = vec![seed];
        let mut group_bbox = entries[seed].1;

        // Rescan until a full pass adds nothing; a grown box can reach
        // characters that an earlier pass rejected.
        loop {
            let mut grew = false;
            for candidate in (seed + 1)..total {
                if claimed[candidate] {
                    continue;
                }
                let candidate_bbox = entries[candidate].1;
                if !should_merge(group_bbox, candidate_bbox) {
                    continue;
                }
                members.push(candidate);
                group_bbox.left = group_bbox.left.min(candidate_bbox.left);
                group_bbox.top = group_bbox.top.min(candidate_bbox.top);
                group_bbox.right = group_bbox.right.max(candidate_bbox.right);
                group_bbox.bottom = group_bbox.bottom.max(candidate_bbox.bottom);
                claimed[candidate] = true;
                grew = true;
            }
            if !grew {
                break;
            }
        }

        groups.push(members);
    }

    groups
        .into_iter()
        .map(|members| {
            let mut text = String::new();
            let mut left = f32::INFINITY;
            let mut top = f32::INFINITY;
            let mut right = f32::NEG_INFINITY;
            let mut bottom = f32::NEG_INFINITY;
            let mut font_size_sum = 0.0_f32;

            for &idx in &members {
                let glyph = &entries[idx].0;
                text.push_str(&glyph.text);
                left = left.min(glyph.x);
                top = top.min(glyph.y - glyph.height);
                right = right.max(glyph.x + glyph.width);
                bottom = bottom.max(glyph.y);
                font_size_sum += glyph.font_size;
            }

            TextBlock {
                text,
                bbox: BoundingBox {
                    left,
                    top,
                    right,
                    bottom,
                },
                // `members` always holds at least the seed, so this never divides by zero.
                font_size: font_size_sum / members.len() as f32,
            }
        })
        .collect()
}
/// Decides whether OCR should run for `page`, based on how much of the page
/// area the extracted text blocks cover.
///
/// Coverage is the sum of the block box areas (negative widths and heights
/// count as zero) divided by the page area. OCR is triggered when coverage is
/// below `config.pdf_options.hierarchy.ocr_coverage_threshold`, or below 0.5
/// when any part of that setting is absent. A page whose area is not positive
/// always triggers OCR.
pub fn should_trigger_ocr(page: &PdfPage, blocks: &[TextBlock], config: &ExtractionConfig) -> bool {
    /// Threshold used when the configuration does not provide one.
    const FALLBACK_COVERAGE_THRESHOLD: f32 = 0.5;

    let page_area = page.width().value * page.height().value;
    if page_area <= 0.0 {
        return true;
    }

    let hierarchy_config = config
        .pdf_options
        .as_ref()
        .and_then(|pdf_config| pdf_config.hierarchy.as_ref());
    let threshold = match hierarchy_config {
        Some(hierarchy) => hierarchy
            .ocr_coverage_threshold
            .unwrap_or(FALLBACK_COVERAGE_THRESHOLD),
        None => FALLBACK_COVERAGE_THRESHOLD,
    };

    let covered_area: f32 = blocks
        .iter()
        .map(|block| {
            let bbox = &block.bbox;
            (bbox.right - bbox.left).max(0.0) * (bbox.bottom - bbox.top).max(0.0)
        })
        .sum();

    covered_area / page_area < threshold
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Every field of a freshly built `CharData` holds the value it was given.
    #[test]
    fn test_char_data_creation() {
        let sample = CharData {
            text: "A".to_string(),
            x: 100.0,
            y: 50.0,
            font_size: 12.0,
            width: 10.0,
            height: 12.0,
            is_bold: true,
            is_italic: false,
            baseline_y: 48.0,
        };

        // Destructuring makes the test fail to compile if a field is added
        // without being checked here.
        let CharData {
            text,
            x,
            y,
            font_size,
            width,
            height,
            is_bold,
            is_italic,
            baseline_y,
        } = sample;

        assert_eq!(text, "A");
        assert_eq!(x, 100.0);
        assert_eq!(y, 50.0);
        assert_eq!(font_size, 12.0);
        assert_eq!(width, 10.0);
        assert_eq!(height, 12.0);
        assert!(is_bold);
        assert!(!is_italic);
        assert_eq!(baseline_y, 48.0);
    }

    /// Cloning a `CharData` preserves its text and font attributes.
    #[test]
    fn test_char_data_clone() {
        let original = CharData {
            text: "B".to_string(),
            x: 200.0,
            y: 100.0,
            font_size: 14.0,
            width: 8.0,
            height: 14.0,
            is_bold: false,
            is_italic: true,
            baseline_y: 98.0,
        };

        let copy = original.clone();

        assert_eq!(
            (
                copy.text.as_str(),
                copy.font_size,
                copy.is_bold,
                copy.is_italic,
                copy.baseline_y
            ),
            (
                original.text.as_str(),
                original.font_size,
                original.is_bold,
                original.is_italic,
                original.baseline_y
            ),
        );
    }
}