use std::collections::HashMap;
use super::backend::{get_number_from_value, PdfBackend, PdfValue};
use crate::error::{Error, Result};
/// A positioned run of text together with the font it was drawn in.
#[derive(Debug, Clone)]
pub struct TextSpan {
    /// Text content of the span.
    pub text: String,
    /// Horizontal position of the span's start.
    pub x: f32,
    /// Vertical position of the span; `top`/`bottom` offset from it as if it were the baseline.
    pub y: f32,
    /// Horizontal extent of the span. `new` always initialises it to `0.0`.
    pub width: f32,
    /// Font size of the span.
    pub font_size: f32,
    /// Font name the style flags are derived from.
    pub font_name: String,
    /// Set when the font name contains "bold", "black" or "heavy" (case-insensitive).
    pub is_bold: bool,
    /// Set when the font name contains "italic" or "oblique" (case-insensitive).
    pub is_italic: bool,
}

impl TextSpan {
    /// Creates a span with zero width and style flags inferred from `font_name`.
    ///
    /// The inference is a plain case-insensitive substring search on the font name.
    pub fn new(text: String, x: f32, y: f32, font_size: f32, font_name: String) -> Self {
        // Lowercase once so every keyword check shares the same allocation.
        let lowered = font_name.to_lowercase();
        let is_bold = ["bold", "black", "heavy"]
            .iter()
            .any(|keyword| lowered.contains(*keyword));
        let is_italic = ["italic", "oblique"]
            .iter()
            .any(|keyword| lowered.contains(*keyword));
        Self {
            text,
            x,
            y,
            width: 0.0,
            font_size,
            font_name,
            is_bold,
            is_italic,
        }
    }

    /// Estimated lower edge of the span: 20% of the font size below `y`.
    pub fn bottom(&self) -> f32 {
        self.y - self.font_size * 0.2
    }

    /// Estimated upper edge of the span: 80% of the font size above `y`.
    pub fn top(&self) -> f32 {
        self.y + self.font_size * 0.8
    }
}
/// A line of text assembled from one or more [`TextSpan`]s.
#[derive(Debug, Clone)]
pub struct TextLine {
    /// Spans of the line; `from_spans` orders them by ascending `x`.
    pub spans: Vec<TextSpan>,
    /// `y` of the leftmost span (`0.0` for an empty line).
    pub y: f32,
    /// `x` of the leftmost span (`0.0` for an empty line).
    pub x: f32,
    /// Character-weighted average font size of the spans.
    pub font_size: f32,
    /// Whether the line has been classified as a heading.
    pub is_heading: bool,
    /// Heading level assigned by heading detection; `0` until then.
    pub heading_level: u8,
}

impl TextLine {
    /// Builds a line from `spans`, sorting them left to right.
    ///
    /// The line's font size is the average of the span sizes weighted by the number of
    /// characters in each span. An empty input yields an all-zero, span-less line.
    pub fn from_spans(mut spans: Vec<TextSpan>) -> Self {
        if spans.is_empty() {
            return Self {
                spans,
                y: 0.0,
                x: 0.0,
                font_size: 0.0,
                is_heading: false,
                heading_level: 0,
            };
        }
        spans.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(std::cmp::Ordering::Equal));

        // Weight by characters rather than UTF-8 bytes: `str::len` would count a CJK or
        // Cyrillic glyph two to four times and skew the average towards those spans.
        let total_chars: usize = spans.iter().map(|s| s.text.chars().count()).sum();
        let weighted_size: f32 = spans
            .iter()
            .map(|s| s.font_size * s.text.chars().count() as f32)
            .sum();
        let font_size = if total_chars > 0 {
            weighted_size / total_chars as f32
        } else {
            // Every span is empty: fall back to the leftmost span's size.
            spans[0].font_size
        };

        let y = spans[0].y;
        let x = spans[0].x;
        Self {
            spans,
            y,
            x,
            font_size,
            is_heading: false,
            heading_level: 0,
        }
    }

    /// Concatenates the span texts, inserting a space where the horizontal gap between
    /// two neighbouring spans suggests a word break.
    ///
    /// No space is inserted between two characters that `is_spaceless_script_char`
    /// accepts, nor where either side already carries a space or no-break space at the
    /// joint. If the joined text contains right-to-left characters it is passed through
    /// `bidi::reorder_bidi`.
    pub fn text(&self) -> String {
        // NOTE(review): a line with exactly one span is returned as-is and never reaches
        // the bidi reordering applied to multi-span lines below — confirm this is intended.
        if self.spans.len() < 2 {
            return self
                .spans
                .first()
                .map(|only| only.text.clone())
                .unwrap_or_default();
        }

        let mut result = self.spans[0].text.clone();
        for pair in self.spans.windows(2) {
            let (prev_span, span) = (&pair[0], &pair[1]);

            let prev_end = prev_span.x + prev_span.width;
            let gap = span.x - prev_end;

            // Use the measured width when there is one, otherwise assume glyphs are half
            // as wide as the font size.
            let char_count = span.text.chars().count();
            let avg_char_width = if char_count > 0 && span.width > 0.0 {
                span.width / char_count as f32
            } else {
                span.font_size * 0.5
            };
            let space_threshold = avg_char_width * 0.2;

            let should_insert_space = gap > space_threshold && {
                let prev_is_cjk = prev_span
                    .text
                    .chars()
                    .last()
                    .map(is_spaceless_script_char)
                    .unwrap_or(false);
                let curr_is_cjk = span
                    .text
                    .chars()
                    .next()
                    .map(is_spaceless_script_char)
                    .unwrap_or(false);
                !(prev_is_cjk && curr_is_cjk)
            };

            let prev_ends_with_space =
                prev_span.text.ends_with(' ') || prev_span.text.ends_with('\u{00A0}');
            let curr_starts_with_space =
                span.text.starts_with(' ') || span.text.starts_with('\u{00A0}');

            if should_insert_space && !prev_ends_with_space && !curr_starts_with_space {
                result.push(' ');
            }
            result.push_str(&span.text);
        }

        if super::bidi::contains_rtl(&result) {
            result = super::bidi::reorder_bidi(&result);
        }
        result
    }

    /// Returns `true` when more than half of the line's characters belong to bold spans.
    pub fn is_bold(&self) -> bool {
        // Counted in characters (not bytes) so multi-byte scripts are not over-weighted.
        let (bold_chars, total_chars) =
            self.spans
                .iter()
                .fold((0usize, 0usize), |(bold, total), span| {
                    let chars = span.text.chars().count();
                    (bold + if span.is_bold { chars } else { 0 }, total + chars)
                });
        total_chars > 0 && bold_chars as f32 / total_chars as f32 > 0.5
    }

    /// Returns `true` when the line contains at least one alphabetic character and all
    /// of its alphabetic characters are uppercase.
    pub fn is_uppercase(&self) -> bool {
        let text = self.text();
        let mut letters = text.chars().filter(|c| c.is_alphabetic()).peekable();
        letters.peek().is_some() && letters.all(|c| c.is_uppercase())
    }
}
/// A group of consecutive lines that belong together, tagged with a [`BlockType`].
#[derive(Debug, Clone)]
pub struct TextBlock {
/// Lines making up the block, in the order they were grouped.
pub lines: Vec<TextLine>,
/// Classification of the block.
pub block_type: BlockType,
/// Heading level of the block; `TextBlock::new` initialises it to `0`.
pub heading_level: u8,
}
/// A vertical band of the page, bounded horizontally by `left` and `right`.
#[derive(Debug, Clone)]
pub struct Column {
    /// Left boundary (inclusive).
    pub left: f32,
    /// Right boundary (inclusive).
    pub right: f32,
    /// Ordinal of the column; column detection numbers them from `0`, left to right.
    pub index: usize,
}

impl Column {
    /// Returns `true` when `x` lies within `[left, right]`, both ends included.
    pub fn contains(&self, x: f32) -> bool {
        (self.left..=self.right).contains(&x)
    }

    /// Returns `true` when either the start or the horizontal midpoint of `span`
    /// falls inside the column.
    pub fn contains_span(&self, span: &TextSpan) -> bool {
        let probes = [span.x, span.x + span.width / 2.0];
        probes.iter().any(|&probe| self.contains(probe))
    }
}
/// Classification attached to a [`TextBlock`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BlockType {
/// Block containing at least one line flagged as a heading.
Heading,
/// Block with no heading lines.
Paragraph,
/// List entry. NOTE(review): not produced by the block grouping visible in this file.
ListItem,
/// Unclassified block. NOTE(review): not produced by the block grouping visible in this file.
Unknown,
}
impl TextBlock {
    /// Wraps `lines` in a block of the given type with heading level `0`.
    pub fn new(lines: Vec<TextLine>, block_type: BlockType) -> Self {
        Self {
            heading_level: 0,
            block_type,
            lines,
        }
    }

    /// Returns the text of every line, joined by single spaces.
    pub fn text(&self) -> String {
        let mut joined = String::new();
        for (idx, line) in self.lines.iter().enumerate() {
            if idx > 0 {
                joined.push(' ');
            }
            joined.push_str(&line.text());
        }
        joined
    }

    /// Returns `true` when the block has no lines or its text is only whitespace.
    pub fn is_empty(&self) -> bool {
        if self.lines.is_empty() {
            return true;
        }
        self.text().trim().is_empty()
    }
}
/// Turns the content of PDF pages into spans, lines and blocks using a [`PdfBackend`].
pub struct LayoutAnalyzer<'a> {
// Backend used to look up pages, fonts and content and to decode text.
backend: &'a dyn PdfBackend,
// Font-size statistics; `extract_page_blocks` adds every processed page's spans to them.
font_stats: FontStatistics,
}
/// Histogram of observed font sizes and the body/heading sizes derived from it.
#[derive(Debug, Clone, Default)]
pub struct FontStatistics {
    /// Most frequent font size; set by `analyze` (`12.0` when nothing was recorded).
    pub body_size: f32,
    /// Recorded sizes more than 0.5 above `body_size`, largest first; set by `analyze`.
    pub heading_sizes: Vec<f32>,
    /// Occurrence count per size bucket. The key is the size times ten, truncated to an
    /// integer, so sizes are bucketed in steps of 0.1.
    pub size_histogram: HashMap<i32, usize>,
}

impl FontStatistics {
    /// Records one occurrence of `size` in the histogram.
    pub fn add_size(&mut self, size: f32) {
        // The cast truncates towards zero (11.99 lands in the 11.9 bucket).
        let key = (size * 10.0) as i32;
        *self.size_histogram.entry(key).or_insert(0) += 1;
    }

    /// Derives `body_size` and `heading_sizes` from the histogram.
    ///
    /// The body size is the most frequent bucket. When several buckets share the highest
    /// count the smallest size wins, so the result does not depend on `HashMap`
    /// iteration order. With an empty histogram the body size defaults to `12.0` and
    /// `heading_sizes` is left untouched.
    pub fn analyze(&mut self) {
        let body_key = self
            .size_histogram
            .iter()
            .max_by(|(key_a, count_a), (key_b, count_b)| {
                // Higher count wins; on equal counts the smaller key compares as greater.
                count_a.cmp(count_b).then_with(|| key_b.cmp(key_a))
            })
            .map(|(key, _)| *key);
        let body_key = match body_key {
            Some(key) => key,
            None => {
                self.body_size = 12.0;
                return;
            }
        };
        self.body_size = body_key as f32 / 10.0;

        let body_size = self.body_size;
        let mut larger_sizes: Vec<f32> = self
            .size_histogram
            .keys()
            .map(|key| *key as f32 / 10.0)
            .filter(|size| *size > body_size + 0.5)
            .collect();
        larger_sizes.sort_by(|a, b| b.partial_cmp(a).unwrap_or(std::cmp::Ordering::Equal));
        self.heading_sizes = larger_sizes;
    }

    /// Maps a font size to a heading level.
    ///
    /// Returns `0` when the size does not qualify as a heading: it must be at least
    /// `body_size + 2.5`, or at least `body_size + 1.5` when `is_bold` is set. Otherwise
    /// returns a level from `1` (largest) to `4`. Heading sizes are walked from largest
    /// to smallest and a new level starts whenever a size is at least 2.0 away from the
    /// size that opened the previous level; the result is the level of the first size
    /// that `font_size` reaches to within 0.5.
    pub fn get_heading_level(&self, font_size: f32, is_bold: bool) -> u8 {
        let qualifies = font_size >= self.body_size + 2.5
            || (is_bold && font_size >= self.body_size + 1.5);
        if !qualifies {
            return 0;
        }

        let mut tier = 0u8;
        let mut last_tier_size: Option<f32> = None;
        for &heading_size in &self.heading_sizes {
            let starts_new_tier =
                last_tier_size.map_or(true, |prev| (prev - heading_size).abs() >= 2.0);
            if starts_new_tier {
                tier = tier.saturating_add(1);
                last_tier_size = Some(heading_size);
            }
            if font_size >= heading_size - 0.5 {
                return tier.min(4);
            }
        }
        // Qualifies as a heading but is smaller than every recorded heading size.
        4
    }
}
impl<'a> LayoutAnalyzer<'a> {
/// Creates an analyzer that reads from `backend` and starts with empty font statistics.
pub fn new(backend: &'a dyn PdfBackend) -> Self {
Self {
backend,
font_stats: FontStatistics::default(),
}
}
/// Gives mutable access to the analyzer's font statistics.
pub fn font_stats_mut(&mut self) -> &mut FontStatistics {
&mut self.font_stats
}
/// Public wrapper that forwards `spans` to the private `group_spans_into_lines`.
pub fn group_spans_into_lines_pub(&self, spans: Vec<TextSpan>) -> Vec<TextLine> {
self.group_spans_into_lines(spans)
}
/// Public wrapper that forwards `lines` to the private `detect_headings`.
pub fn detect_headings_pub(&self, lines: Vec<TextLine>) -> Vec<TextLine> {
self.detect_headings(lines)
}
/// Public wrapper that forwards `lines` to the private `group_lines_into_blocks`.
pub fn group_lines_into_blocks_pub(&self, lines: Vec<TextLine>) -> Vec<TextBlock> {
self.group_lines_into_blocks(lines)
}
pub fn extract_page_spans(&self, page_num: u32) -> Result<Vec<TextSpan>> {
let pages = self.backend.pages();
let page_id = pages
.get(&page_num)
.ok_or(Error::PageOutOfRange(page_num, pages.len() as u32))?;
let backend_fonts = self.backend.page_fonts(*page_id)?;
let mut fonts = HashMap::new();
for fi in &backend_fonts {
fonts.insert(
fi.name.clone(),
FontInfo {
name: fi.base_font.clone(),
},
);
}
let content = self.backend.page_content(*page_id)?;
self.parse_operations(&content, &fonts, *page_id)
}
/// Extracts the text blocks of page `page_num`.
///
/// Spans are extracted, page numbers in the top and bottom margins are dropped, the
/// remaining font sizes are added to the analyzer's statistics, and the spans are then
/// grouped into lines, scanned for headings and grouped into blocks.
///
/// The font statistics are not reset here, so they include the spans of every page
/// processed so far through this analyzer.
///
/// # Errors
///
/// Returns `Error::PageOutOfRange` when the backend has no page `page_num`, and
/// propagates errors from span extraction.
pub fn extract_page_blocks(&mut self, page_num: u32) -> Result<Vec<TextBlock>> {
    let pages = self.backend.pages();
    let page_id = pages
        .get(&page_num)
        .ok_or(Error::PageOutOfRange(page_num, pages.len() as u32))?;
    let (_, page_height) = self.backend.page_dimensions(*page_id);

    let mut spans = self.extract_page_spans(page_num)?;
    filter_header_footer_spans(&mut spans, page_height);

    for span in spans.iter() {
        self.font_stats.add_size(span.font_size);
    }
    self.font_stats.analyze();

    let grouped = self.group_spans_into_lines(spans);
    let classified = self.detect_headings(grouped);
    Ok(self.group_lines_into_blocks(classified))
}
fn parse_operations(
&self,
content: &[u8],
fonts: &HashMap<Vec<u8>, FontInfo>,
page_id: super::backend::PageId,
) -> Result<Vec<TextSpan>> {
let operations = self.backend.decode_content(content)?;
let mut spans = Vec::new();
let mut current_font = String::new();
let mut current_font_name: Vec<u8> = Vec::new();
let mut current_font_size: f32 = 12.0;
let mut text_matrix = TextMatrix::default();
let mut in_text_block = false;
for op in &operations {
match op.operator.as_str() {
"BT" => {
in_text_block = true;
text_matrix = TextMatrix::default();
}
"ET" => {
in_text_block = false;
}
"Tf" => {
if op.operands.len() >= 2 {
if let PdfValue::Name(font_name) = &op.operands[0] {
current_font_name = font_name.clone();
if let Some(info) = fonts.get(font_name.as_slice()) {
current_font = info.name.clone();
} else {
current_font =
String::from_utf8_lossy(font_name.as_slice()).to_string();
}
}
current_font_size = get_number_from_value(&op.operands[1]).unwrap_or(12.0);
}
}
"Td" | "TD" => {
if op.operands.len() >= 2 {
let tx = get_number_from_value(&op.operands[0]).unwrap_or(0.0);
let ty = get_number_from_value(&op.operands[1]).unwrap_or(0.0);
text_matrix.translate(tx, ty);
}
}
"Tm" => {
if op.operands.len() >= 6 {
text_matrix.set(
get_number_from_value(&op.operands[0]).unwrap_or(1.0),
get_number_from_value(&op.operands[1]).unwrap_or(0.0),
get_number_from_value(&op.operands[2]).unwrap_or(0.0),
get_number_from_value(&op.operands[3]).unwrap_or(1.0),
get_number_from_value(&op.operands[4]).unwrap_or(0.0),
get_number_from_value(&op.operands[5]).unwrap_or(0.0),
);
}
}
"T*" => {
text_matrix.next_line();
}
"Tj" | "TJ" => {
if in_text_block {
let text = if op.operator == "TJ" {
if let Some(PdfValue::Array(arr)) = op.operands.first() {
let mut combined = String::new();
for item in arr {
match item {
PdfValue::Str(bytes) => {
combined.push_str(&self.backend.decode_text(
page_id,
¤t_font_name,
bytes,
));
}
PdfValue::Integer(n) => {
let adjustment = -(*n as f32);
maybe_insert_space_tj(&mut combined, adjustment);
}
PdfValue::Real(n) => {
let adjustment = -n;
maybe_insert_space_tj(&mut combined, adjustment);
}
_ => {}
}
}
combined
} else {
String::new()
}
} else {
if let Some(PdfValue::Str(bytes)) = op.operands.first() {
self.backend.decode_text(page_id, ¤t_font_name, bytes)
} else {
String::new()
}
};
if !text.trim().is_empty() {
let (x, y) = text_matrix.get_position();
let effective_size = current_font_size * text_matrix.get_scale();
spans.push(TextSpan::new(
text,
x,
y,
effective_size,
current_font.clone(),
));
}
}
}
"'" | "\"" => {
text_matrix.next_line();
if in_text_block {
let text_idx = if op.operator == "\"" { 2 } else { 0 };
if let Some(PdfValue::Str(bytes)) = op.operands.get(text_idx) {
let text = self.backend.decode_text(page_id, ¤t_font_name, bytes);
if !text.trim().is_empty() {
let (x, y) = text_matrix.get_position();
let effective_size = current_font_size * text_matrix.get_scale();
spans.push(TextSpan::new(
text,
x,
y,
effective_size,
current_font.clone(),
));
}
}
}
}
_ => {}
}
}
Ok(spans)
}
/// Detects whether `spans` are laid out in one or two columns.
///
/// Builds a horizontal occupancy histogram of the spans, looks for the best empty
/// gap in the middle of the text extent and, if that gap passes the width, column
/// width and span balance checks, returns two columns split at the gap's centre.
/// In every other case a single column covering all spans (padded by 10.0 on each
/// side) is returned. Returns an empty vector when `spans` is empty.
fn detect_columns(&self, spans: &[TextSpan]) -> Vec<Column> {
if spans.is_empty() {
return vec![];
}
// Horizontal extent of the text: leftmost start and rightmost end of any span.
let min_x = spans
.iter()
.map(|s| s.x)
.min_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
.unwrap_or(0.0);
let max_x = spans
.iter()
.map(|s| s.x + s.width)
.max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
.unwrap_or(0.0);
// Despite the name this is the width of the text extent, not of the page.
let page_width = max_x - min_x;
// Too narrow to hold two columns.
if page_width < 250.0 {
return vec![Column {
left: min_x - 10.0,
right: max_x + 10.0,
index: 0,
}];
}
// Divide the extent into 3.0-wide slices and count the spans covering each slice.
let slice_width = 3.0; let num_slices = ((page_width / slice_width) as usize) + 1;
let mut slice_occupancy = vec![0usize; num_slices];
for span in spans {
let start_slice = ((span.x - min_x) / slice_width) as usize;
let end_slice = (((span.x + span.width) - min_x) / slice_width) as usize;
for slot in slice_occupancy
.iter_mut()
.take(end_slice.min(num_slices - 1) + 1)
.skip(start_slice)
{
*slot += 1;
}
}
// Only slices between 15% and 85% of the extent are searched for a gutter.
let search_start = num_slices * 15 / 100; let search_end = num_slices * 85 / 100;
let mut best_gap_start = 0;
let mut best_gap_len = 0;
let mut best_gap_center_dist = f32::MAX;
let page_center = num_slices / 2;
let mut current_gap_start = 0;
let mut current_gap_len = 0;
for (i, &occupancy) in slice_occupancy
.iter()
.enumerate()
.take(search_end)
.skip(search_start)
{
if occupancy == 0 {
if current_gap_len == 0 {
current_gap_start = i;
}
current_gap_len += 1;
} else {
// An occupied slice closes the current run of empty slices; evaluate the run.
if current_gap_len > 0 {
let gap_center = current_gap_start + current_gap_len / 2;
let center_dist = (gap_center as i32 - page_center as i32).abs() as f32;
let current_gap_width = current_gap_len as f32 * slice_width;
if current_gap_width >= 10.0 {
let best_gap_width = best_gap_len as f32 * slice_width;
// A gap replaces the best one when it is more than 1.5x as wide, or at
// least 0.7x as wide and closer to the centre of the extent.
if current_gap_width > best_gap_width * 1.5
|| (current_gap_width >= best_gap_width * 0.7
&& center_dist < best_gap_center_dist)
{
best_gap_start = current_gap_start;
best_gap_len = current_gap_len;
best_gap_center_dist = center_dist;
}
}
}
current_gap_len = 0;
}
}
// A gap still open when the search window ends is evaluated with the same rule.
if current_gap_len > 0 {
let gap_center = current_gap_start + current_gap_len / 2;
let center_dist = (gap_center as i32 - page_center as i32).abs() as f32;
let current_gap_width = current_gap_len as f32 * slice_width;
let best_gap_width = best_gap_len as f32 * slice_width;
if current_gap_width >= 10.0
&& (current_gap_width > best_gap_width * 1.5
|| (current_gap_width >= best_gap_width * 0.7
&& center_dist < best_gap_center_dist))
{
best_gap_start = current_gap_start;
best_gap_len = current_gap_len;
}
}
let gap_width = best_gap_len as f32 * slice_width;
log::debug!(
"Best gap: width={:.1}pt at x={:.1}, page_width={:.1}",
gap_width,
min_x + best_gap_start as f32 * slice_width,
page_width
);
// Reject gutters narrower than 12.0 (also covers the case where no gap was found).
if gap_width < 12.0 {
log::debug!("Gap too small (< 12pt), treating as single column");
return vec![Column {
left: min_x - 10.0,
right: max_x + 10.0,
index: 0,
}];
}
let gutter_center =
min_x + (best_gap_start as f32 + best_gap_len as f32 / 2.0) * slice_width;
let left_col_width = gutter_center - min_x;
let right_col_width = max_x - gutter_center;
log::debug!(
"Column widths: left={:.1}, right={:.1}",
left_col_width,
right_col_width
);
// Each side of the gutter must be at least 80.0 wide.
if left_col_width < 80.0 || right_col_width < 80.0 {
log::debug!("Column too narrow, treating as single column");
return vec![Column {
left: min_x - 10.0,
right: max_x + 10.0,
index: 0,
}];
}
// Assign each span to a side by its horizontal midpoint.
let left_spans = spans
.iter()
.filter(|s| s.x + s.width / 2.0 < gutter_center)
.count();
let right_spans = spans
.iter()
.filter(|s| s.x + s.width / 2.0 >= gutter_center)
.count();
log::debug!(
"Spans: left={}, right={}, total={}",
left_spans,
right_spans,
spans.len()
);
// Each side needs at least a tenth of all spans, and never fewer than two.
let min_spans = spans.len() / 10;
if left_spans < min_spans.max(2) || right_spans < min_spans.max(2) {
log::debug!("Spans too imbalanced, treating as single column");
return vec![Column {
left: min_x - 10.0,
right: max_x + 10.0,
index: 0,
}];
}
// Two columns that meet at the centre of the gutter.
vec![
Column {
left: min_x - 10.0,
right: gutter_center,
index: 0,
},
Column {
left: gutter_center,
right: max_x + 10.0,
index: 1,
},
]
}
/// Groups `spans` into lines, segmenting the page into regions first.
///
/// Every span is handed to `xycut::xycut_segment` as a rectangle whose height is the
/// span's font size. The minimum gaps scale with the median font size: 5x horizontally
/// (at least 60.0) and 3x vertically (at least 36.0). With at most one resulting group
/// all spans are processed as a single column; otherwise each group is processed
/// separately and the lines are concatenated in group order.
fn group_spans_into_lines(&self, spans: Vec<TextSpan>) -> Vec<TextLine> {
    if spans.is_empty() {
        return Vec::new();
    }

    let mut blocks: Vec<super::xycut::Block> = Vec::with_capacity(spans.len());
    for span in &spans {
        blocks.push(super::xycut::Block {
            x: span.x,
            y: span.y,
            width: span.width,
            height: span.font_size,
        });
    }

    let median_font = median_font_size(&spans);
    let min_x_gap = (median_font * 5.0).max(60.0);
    let min_y_gap = (median_font * 3.0).max(36.0);
    let groups = super::xycut::xycut_segment(&blocks, min_x_gap, min_y_gap);
    log::debug!(
        "XY-Cut segmented {} spans into {} groups (median_font={:.1}, min_x_gap={:.1}, min_y_gap={:.1})",
        spans.len(),
        groups.len(),
        median_font,
        min_x_gap,
        min_y_gap,
    );

    if groups.len() <= 1 {
        return self.group_spans_into_lines_single_column(spans);
    }

    let mut all_lines = Vec::new();
    for group in &groups {
        // Spans are matched back to the group's rectangles by position: a span belongs
        // to the group when some rectangle starts within 1.0 of it on both axes.
        let mut members: Vec<TextSpan> = Vec::new();
        for span in &spans {
            let in_group = group
                .iter()
                .any(|b| (span.x - b.x).abs() < 1.0 && (span.y - b.y).abs() < 1.0);
            if in_group {
                members.push(span.clone());
            }
        }
        all_lines.extend(self.group_spans_into_lines_single_column(members));
    }
    all_lines
}
/// Column-based line grouping built on `detect_columns`.
///
/// Each span goes to the first column that contains it (column 0 when none does), each
/// column is grouped into lines on its own, and the lines are returned column by
/// column, ordered by descending `y` within a column.
// Marked `dead_code` because nothing visible in this file calls it.
#[allow(dead_code)]
fn group_spans_into_lines_legacy_columns(&self, spans: Vec<TextSpan>) -> Vec<TextLine> {
    if spans.is_empty() {
        return Vec::new();
    }

    let columns = self.detect_columns(&spans);
    log::debug!("Detected {} columns", columns.len());
    for col in &columns {
        log::debug!(
            " Column {}: left={:.1}, right={:.1}",
            col.index,
            col.left,
            col.right
        );
    }
    if columns.len() <= 1 {
        return self.group_spans_into_lines_single_column(spans);
    }

    // Bucket the spans by column.
    let mut column_spans: Vec<Vec<TextSpan>> = columns.iter().map(|_| Vec::new()).collect();
    for span in spans {
        let col_idx = columns
            .iter()
            .position(|c| c.contains_span(&span))
            .unwrap_or(0);
        column_spans[col_idx].push(span);
    }
    log::debug!(
        "Spans per column: {:?}",
        column_spans.iter().map(|v| v.len()).collect::<Vec<_>>()
    );

    // Group every bucket into lines, remembering which column each line came from.
    let mut all_lines: Vec<(usize, TextLine)> = Vec::new();
    for (col_idx, bucket) in column_spans.into_iter().enumerate() {
        let lines = self.group_spans_into_lines_single_column(bucket);
        all_lines.extend(lines.into_iter().map(|line| (col_idx, line)));
    }

    all_lines.sort_by(|(col_a, line_a), (col_b, line_b)| {
        col_a.cmp(col_b).then_with(|| {
            line_b
                .y
                .partial_cmp(&line_a.y)
                .unwrap_or(std::cmp::Ordering::Equal)
        })
    });
    all_lines.into_iter().map(|(_, line)| line).collect()
}
/// Groups `spans` into lines, treating them as one column.
///
/// Spans are sorted by descending `y` (then ascending `x`), fragmented spans are
/// merged, and consecutive spans are collected into a line while their `y` stays
/// within 30% of their own font size of the `y` of the span that opened the line.
fn group_spans_into_lines_single_column(&self, mut spans: Vec<TextSpan>) -> Vec<TextLine> {
    if spans.is_empty() {
        return Vec::new();
    }

    spans.sort_by(|a, b| {
        b.y.partial_cmp(&a.y)
            .unwrap_or(std::cmp::Ordering::Equal)
            .then_with(|| a.x.partial_cmp(&b.x).unwrap_or(std::cmp::Ordering::Equal))
    });

    let mut lines: Vec<TextLine> = Vec::new();
    let mut pending: Vec<TextSpan> = Vec::new();
    // `y` of the span that opened the line currently being collected.
    let mut anchor_y: Option<f32> = None;

    for span in merge_fragmented_spans(spans) {
        let tolerance = span.font_size * 0.3;
        let continues_line = match anchor_y {
            Some(y) => (span.y - y).abs() <= tolerance,
            None => false,
        };
        if !continues_line {
            if !pending.is_empty() {
                lines.push(TextLine::from_spans(std::mem::take(&mut pending)));
            }
            anchor_y = Some(span.y);
        }
        pending.push(span);
    }

    if !pending.is_empty() {
        lines.push(TextLine::from_spans(pending));
    }
    lines
}
/// Flags heading lines and assigns their levels.
///
/// A line is skipped when it has fewer than three characters that are neither
/// whitespace nor ASCII punctuation, when it starts with a bullet character, when
/// `FontStatistics::get_heading_level` returns `0` for it (uppercase lines are passed
/// as bold), or when a directly neighbouring line has the same font size (within 0.5)
/// and the size is less than 6.0 above the body size.
fn detect_headings(&self, mut lines: Vec<TextLine>) -> Vec<TextLine> {
    const BULLETS: &[char] = &['-', '*', '·', '•', '◦', '▶', '▷', '◎', '☞', '※'];

    // Snapshot of the sizes so neighbours can be inspected while iterating mutably.
    let sizes: Vec<f32> = lines.iter().map(|l| l.font_size).collect();
    let body_size = self.font_stats.body_size;

    for (i, line) in lines.iter_mut().enumerate() {
        let text = line.text();

        let visible_chars = text
            .chars()
            .filter(|c| !c.is_whitespace() && !c.is_ascii_punctuation())
            .count();
        if visible_chars < 3 {
            continue;
        }

        let starts_with_bullet = text
            .trim_start()
            .chars()
            .next()
            .is_some_and(|first| BULLETS.contains(&first));
        if starts_with_bullet {
            continue;
        }

        let size = line.font_size;
        let level = self
            .font_stats
            .get_heading_level(size, line.is_bold() || line.is_uppercase());
        if level == 0 {
            continue;
        }

        // Neighbours sharing this size mean the line sits in a run of similar text.
        let same_size_at = |idx: usize| {
            sizes
                .get(idx)
                .is_some_and(|other| (*other - size).abs() < 0.5)
        };
        let matches_prev = i > 0 && same_size_at(i - 1);
        let matches_next = same_size_at(i + 1);
        if (matches_prev || matches_next) && size < body_size + 6.0 {
            continue;
        }

        line.is_heading = true;
        line.heading_level = level;
    }
    lines
}
/// Groups consecutive lines into blocks, splitting wherever `should_break_block` says so.
///
/// A block containing at least one heading line becomes a `Heading` block whose level
/// is the smallest level among its heading lines; every other block is a `Paragraph`.
fn group_lines_into_blocks(&self, lines: Vec<TextLine>) -> Vec<TextBlock> {
    /// Turns the collected lines into a finished block.
    fn seal(lines: Vec<TextLine>) -> TextBlock {
        let has_heading = lines.iter().any(|l| l.is_heading);
        if !has_heading {
            return TextBlock::new(lines, BlockType::Paragraph);
        }
        let mut block = TextBlock::new(lines, BlockType::Heading);
        block.heading_level = block
            .lines
            .iter()
            .filter(|l| l.is_heading)
            .map(|l| l.heading_level)
            .min()
            .unwrap_or(0);
        block
    }

    if lines.is_empty() {
        return Vec::new();
    }

    let avg_spacing = self.calculate_avg_line_spacing(&lines);
    let mut blocks: Vec<TextBlock> = Vec::new();
    let mut pending: Vec<TextLine> = Vec::new();

    for line in lines {
        // The very first line has no predecessor and always opens the first block.
        let breaks_here = match pending.last() {
            Some(prev_line) => self.should_break_block(prev_line, &line, avg_spacing),
            None => false,
        };
        if breaks_here {
            blocks.push(seal(std::mem::take(&mut pending)));
        }
        pending.push(line);
    }

    if !pending.is_empty() {
        blocks.push(seal(pending));
    }
    blocks
}
/// Average vertical distance between consecutive lines.
///
/// Distances of 0.1 or less are ignored. Returns `12.0` when there are fewer than two
/// lines or no distance above that threshold.
fn calculate_avg_line_spacing(&self, lines: &[TextLine]) -> f32 {
    const FALLBACK_SPACING: f32 = 12.0;

    let spacings: Vec<f32> = lines
        .windows(2)
        .filter_map(|pair| {
            let distance = (pair[0].y - pair[1].y).abs();
            if distance > 0.1 {
                Some(distance)
            } else {
                None
            }
        })
        .collect();

    if spacings.is_empty() {
        FALLBACK_SPACING
    } else {
        spacings.iter().sum::<f32>() / spacings.len() as f32
    }
}
/// Decides whether `curr_line` starts a new block after `prev_line`.
///
/// A heading line always starts a new block, unless the previous line is also a
/// heading of similar size (within 2.0) and close by — then the two are treated as one
/// wrapped heading. Otherwise a break happens after a heading, when the vertical
/// distance exceeds 1.5x `avg_spacing`, when the font size changes by 2.0 or more, or
/// when the left edge moves by more than 40.0.
fn should_break_block(
    &self,
    prev_line: &TextLine,
    curr_line: &TextLine,
    avg_spacing: f32,
) -> bool {
    let size_delta = (prev_line.font_size - curr_line.font_size).abs();
    let vertical_gap = (prev_line.y - curr_line.y).abs();

    if curr_line.is_heading {
        let max_heading_gap = (prev_line.font_size.max(curr_line.font_size) * 2.5)
            .max(avg_spacing * 2.0);
        let continues_heading =
            prev_line.is_heading && size_delta <= 2.0 && vertical_gap <= max_heading_gap;
        return !continues_heading;
    }

    prev_line.is_heading
        || vertical_gap > avg_spacing * 1.5
        || size_delta >= 2.0
        || (prev_line.x - curr_line.x).abs() > 40.0
}
}
/// Removes page-number spans from the top and bottom 5% of the page.
///
/// A span inside either margin band is dropped when its trimmed text is empty, consists
/// only of ASCII digits, or matches `is_page_number_pattern`. Spans outside the bands
/// are always kept. Does nothing when `spans` is empty or `page_height` is not positive.
fn filter_header_footer_spans(spans: &mut Vec<TextSpan>, page_height: f32) {
    if spans.is_empty() || page_height <= 0.0 {
        return;
    }

    let margin = page_height * 0.05;
    let header_floor = page_height - margin;
    let footer_ceiling = margin;

    spans.retain(|span| {
        let in_margin = span.y >= header_floor || span.y <= footer_ceiling;
        if !in_margin {
            return true;
        }
        let text = span.text.trim();
        let removable = text.is_empty()
            || text.chars().all(|c| c.is_ascii_digit())
            || is_page_number_pattern(text);
        !removable
    });
}
/// Returns `true` when `text` looks like a decorated page number.
///
/// Recognised forms (surrounding whitespace is ignored):
/// - a number between two identical dashes: `- 3 -`, `–3–`, `— 3 —`
/// - `Page 3` / `page 3`
/// - `3 of 10`
/// - `3/10` or `3 / 10`
///
/// A "number" is a non-empty run of ASCII digits, so bare separators such as `--`
/// are not treated as page numbers.
fn is_page_number_pattern(text: &str) -> bool {
    /// Non-empty and made of ASCII digits only.
    fn is_number(s: &str) -> bool {
        !s.is_empty() && s.chars().all(|c| c.is_ascii_digit())
    }

    let text = text.trim();

    // "- 3 -" with a hyphen, en dash or em dash on both sides.
    for &dash in &['-', '–', '—'] {
        let between = text
            .strip_prefix(dash)
            .and_then(|rest| rest.trim().strip_suffix(dash));
        if let Some(inner) = between {
            if is_number(inner.trim()) {
                return true;
            }
        }
    }

    // "Page 3" / "page 3".
    let after_label = text
        .strip_prefix("Page ")
        .or_else(|| text.strip_prefix("page "));
    if let Some(rest) = after_label {
        if is_number(rest.trim()) {
            return true;
        }
    }

    // "3 of 10", "3/10", "3 / 10". The slash is a separator, so it never shows up as a
    // token of its own; its presence is checked on the whole text instead.
    let tokens: Vec<&str> = text
        .split(|c: char| c == '/' || c.is_ascii_whitespace())
        .filter(|s| !s.is_empty())
        .collect();
    match tokens.as_slice() {
        [current, "of", total] => is_number(current) && is_number(total),
        [current, total] => is_number(current) && is_number(total) && text.contains('/'),
        _ => false,
    }
}
/// Font details kept per font resource while parsing a page.
#[derive(Debug, Clone)]
struct FontInfo {
// Filled from the backend's `base_font` value; becomes the `font_name` of emitted spans.
name: String,
}
/// Simplified PDF text matrix `[a b c d e f]`.
///
/// A point `(x, y)` maps to `(a*x + c*y + e, b*x + d*y + f)`, so `(e, f)` is the
/// current text position.
#[derive(Debug, Clone)]
struct TextMatrix {
    a: f32,
    b: f32,
    c: f32,
    d: f32,
    /// Horizontal translation.
    e: f32,
    /// Vertical translation.
    f: f32,
    /// `f` as of the last operation that changed the vertical position.
    line_y: f32,
}

impl Default for TextMatrix {
    /// The identity matrix positioned at the origin.
    fn default() -> Self {
        Self {
            a: 1.0,
            b: 0.0,
            c: 0.0,
            d: 1.0,
            e: 0.0,
            f: 0.0,
            line_y: 0.0,
        }
    }
}

impl TextMatrix {
    /// Replaces all six components (the `Tm` operator).
    fn set(&mut self, a: f32, b: f32, c: f32, d: f32, e: f32, f: f32) {
        *self = Self {
            a,
            b,
            c,
            d,
            e,
            f,
            line_y: f,
        };
    }

    /// Moves the position by `(tx, ty)` expressed in text space (the `Td`/`TD` operators).
    fn translate(&mut self, tx: f32, ty: f32) {
        self.e += tx * self.a + ty * self.c;
        self.f += tx * self.b + ty * self.d;
        if ty != 0.0 {
            self.line_y = self.f;
        }
    }

    /// Moves down one line using a fixed leading of 12.0 text-space units.
    // NOTE(review): the leading is hard-coded; this type has no field to hold a
    // document-specified leading.
    fn next_line(&mut self) {
        self.f -= 12.0 * self.d;
        self.line_y = self.f;
    }

    /// Returns the current text position `(e, f)`.
    fn get_position(&self) -> (f32, f32) {
        (self.e, self.f)
    }

    /// Returns the vertical scale factor, used to turn a nominal font size into an
    /// effective one.
    ///
    /// Glyph height follows the image of the unit y vector, which is `(c, d)`, so the
    /// factor is its length. Rotation does not affect it, and a horizontally condensed
    /// or stretched matrix (`a != d`) does not distort the reported font size.
    fn get_scale(&self) -> f32 {
        (self.c * self.c + self.d * self.d).sqrt()
    }
}
/// Appends a space to `text` when a `TJ` adjustment is large enough to be a word gap.
///
/// Nothing is appended when `text` is empty, already ends in a space or no-break space,
/// or ends in a character accepted by `is_spaceless_script_char`. The adjustment must
/// exceed 500.0 after a Hangul character and 200.0 after anything else.
fn maybe_insert_space_tj(text: &mut String, adjustment: f32) {
    let last_char = match text.chars().last() {
        Some(c) => c,
        None => return,
    };
    let already_spaced = last_char == ' ' || last_char == '\u{00A0}';
    if already_spaced || is_spaceless_script_char(last_char) {
        return;
    }

    let threshold = if is_hangul_char(last_char) {
        500.0
    } else {
        200.0
    };
    if adjustment > threshold {
        text.push(' ');
    }
}
/// Returns `true` when `c` falls in one of the Hangul code point ranges listed below.
fn is_hangul_char(c: char) -> bool {
    matches!(
        c,
        '\u{AC00}'..='\u{D7AF}'       // Hangul Syllables
            | '\u{1100}'..='\u{11FF}' // Hangul Jamo
            | '\u{3130}'..='\u{318F}' // Hangul Compatibility Jamo
            | '\u{A960}'..='\u{A97F}' // Hangul Jamo Extended-A
            | '\u{D7B0}'..='\u{D7FF}' // Hangul Jamo Extended-B
    )
}
/// Returns the median font size of `spans`, or `12.0` when there are none.
///
/// For an even number of spans the upper of the two middle values is returned.
fn median_font_size(spans: &[TextSpan]) -> f32 {
    let mut sizes: Vec<f32> = Vec::with_capacity(spans.len());
    sizes.extend(spans.iter().map(|span| span.font_size));
    sizes.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
    // `get` yields `None` only for an empty list, which is the fallback case.
    sizes.get(sizes.len() / 2).copied().unwrap_or(12.0)
}
/// Returns `true` when `c` falls in one of the CJK ideograph, kana or CJK punctuation
/// ranges listed below. Callers use it to avoid inserting spaces between such characters.
fn is_spaceless_script_char(c: char) -> bool {
    matches!(
        c,
        '\u{4E00}'..='\u{9FFF}'         // CJK Unified Ideographs
            | '\u{3400}'..='\u{4DBF}'   // CJK Unified Ideographs Extension A
            | '\u{20000}'..='\u{2A6DF}' // Extension B
            | '\u{2A700}'..='\u{2B73F}' // Extension C
            | '\u{2B740}'..='\u{2B81F}' // Extension D
            | '\u{2B820}'..='\u{2CEAF}' // Extension E
            | '\u{2CEB0}'..='\u{2EBEF}' // Extension F
            | '\u{3040}'..='\u{309F}'   // Hiragana
            | '\u{30A0}'..='\u{30FF}'   // Katakana
            | '\u{3000}'..='\u{303F}'   // CJK Symbols and Punctuation
    )
}
/// Merges runs of tiny adjacent spans into single spans.
///
/// A span is a fragment when it has at most three characters and no measured width.
/// A fragment is appended to the previously kept span when that span also started as a
/// fragment, both share font name and size (within 0.1), their `y` values differ by at
/// most 30% of the font size, and the fragment begins roughly where the previous span
/// ends. Glyph widths are estimated as 0.6 of the font size. The input is expected in
/// reading order; fewer than two spans are returned unchanged.
fn merge_fragmented_spans(spans: Vec<TextSpan>) -> Vec<TextSpan> {
    /// Whether `span` continues `prev` on the same line, in the same font, without a gap.
    fn continues(prev: &TextSpan, span: &TextSpan) -> bool {
        let same_y = (prev.y - span.y).abs() <= span.font_size * 0.3;
        let same_font =
            prev.font_name == span.font_name && (prev.font_size - span.font_size).abs() < 0.1;
        if !(same_y && same_font) {
            return false;
        }
        let est_char_width = prev.font_size * 0.6;
        // Fall back to an estimated end when the previous span has no measured width.
        let prev_width = if prev.width > 0.0 {
            prev.width
        } else {
            est_char_width * prev.text.chars().count() as f32
        };
        let gap = span.x - (prev.x + prev_width);
        gap < est_char_width * 0.4 && gap > -est_char_width * 0.5
    }

    if spans.len() < 2 {
        return spans;
    }

    let mut merged: Vec<TextSpan> = Vec::with_capacity(spans.len());
    // Whether the span currently at the end of `merged` was pushed as a fragment.
    let mut tail_is_fragment = false;

    for span in spans {
        let glyphs = span.text.chars().count();
        let is_fragment = glyphs <= 3 && span.width <= 0.0;

        let absorb = is_fragment
            && tail_is_fragment
            && merged.last().map_or(false, |prev| continues(prev, &span));

        if absorb {
            let prev = merged.last_mut().unwrap();
            // Stretch the kept span to the estimated end of the absorbed fragment.
            let new_end = span.x + span.font_size * 0.6 * glyphs as f32;
            prev.width = new_end - prev.x;
            prev.text.push_str(&span.text);
        } else {
            tail_is_fragment = is_fragment;
            merged.push(span);
        }
    }
    merged
}
// Unit tests for font statistics, style detection, fragment merging and columns.
#[cfg(test)]
mod tests {
use super::*;
// The dominant size becomes the body size; larger sizes are reported as headings.
#[test]
fn test_font_statistics() {
let mut stats = FontStatistics::default();
for _ in 0..100 {
stats.add_size(12.0);
}
for _ in 0..5 {
stats.add_size(18.0);
}
for _ in 0..3 {
stats.add_size(24.0);
}
stats.analyze();
assert!((stats.body_size - 12.0).abs() < 0.1);
assert_eq!(stats.get_heading_level(12.0, false), 0);
assert!(stats.get_heading_level(18.0, false) > 0);
assert!(stats.get_heading_level(24.0, false) > 0);
}
// Bold and italic flags are inferred from the font name.
#[test]
fn test_text_span_bold_detection() {
let span = TextSpan::new(
"Test".to_string(),
0.0,
0.0,
12.0,
"Helvetica-Bold".to_string(),
);
assert!(span.is_bold);
assert!(!span.is_italic);
let span2 = TextSpan::new(
"Test".to_string(),
0.0,
0.0,
12.0,
"Helvetica-Oblique".to_string(),
);
assert!(!span2.is_bold);
assert!(span2.is_italic);
}
// Five single-character spans without width, 6.0 apart, collapse into one span.
#[test]
fn test_merge_fragmented_spans_single_chars() {
let spans: Vec<TextSpan> = "Hello"
.chars()
.enumerate()
.map(|(i, c)| TextSpan {
text: c.to_string(),
x: 100.0 + i as f32 * 6.0,
y: 500.0,
width: 0.0, font_size: 12.0,
font_name: "Helvetica".to_string(),
is_bold: false,
is_italic: false,
})
.collect();
let merged = merge_fragmented_spans(spans);
assert_eq!(merged.len(), 1);
assert_eq!(merged[0].text, "Hello");
}
// Spans with a measured width and more than three characters are never merged.
#[test]
fn test_merge_fragmented_spans_preserves_normal() {
let spans = vec![
TextSpan {
text: "Hello".to_string(),
x: 100.0,
y: 500.0,
width: 30.0,
font_size: 12.0,
font_name: "Helvetica".to_string(),
is_bold: false,
is_italic: false,
},
TextSpan {
text: "World".to_string(),
x: 145.0, y: 500.0,
width: 30.0,
font_size: 12.0,
font_name: "Helvetica".to_string(),
is_bold: false,
is_italic: false,
},
];
let merged = merge_fragmented_spans(spans);
assert_eq!(merged.len(), 2);
}
// Both column boundaries are inclusive.
#[test]
fn test_column_contains() {
let col = Column {
left: 100.0,
right: 200.0,
index: 0,
};
assert!(col.contains(100.0));
assert!(col.contains(150.0));
assert!(col.contains(200.0));
assert!(!col.contains(99.0));
assert!(!col.contains(201.0));
}
// A span belongs to a column when its start or its midpoint lies inside.
#[test]
fn test_column_contains_span() {
let col = Column {
left: 100.0,
right: 200.0,
index: 0,
};
// Starts inside the column.
let span1 = TextSpan::new(
"Test".to_string(),
120.0,
0.0,
12.0,
"Helvetica".to_string(),
);
let span1 = TextSpan {
width: 50.0,
..span1
};
assert!(col.contains_span(&span1));
// Starts left of the column, but its midpoint (110.0) is inside.
let span2 = TextSpan::new("Test".to_string(), 90.0, 0.0, 12.0, "Helvetica".to_string());
let span2 = TextSpan {
width: 40.0,
..span2
}; assert!(col.contains_span(&span2));
// Entirely to the right of the column.
let span3 = TextSpan::new(
"Test".to_string(),
250.0,
0.0,
12.0,
"Helvetica".to_string(),
);
let span3 = TextSpan {
width: 30.0,
..span3
};
assert!(!col.contains_span(&span3));
}
}