use std::borrow::Cow;
use crate::pdf::hierarchy::SegmentData;
use crate::pdf::text_data::ExtractedSegment;
use pdfium_render::prelude::*;
use super::text_repair::{apply_ligature_repairs, build_ligature_repair_map, normalize_text_encoding};
use super::types::PdfParagraph;
use crate::pdf::text_data::{PageTextData, extract_page_text_data};
use pdfium_render::prelude::PdfParagraph as PdfiumParagraph;
/// Location of an image within the document: which page it appears on and
/// its document-wide running index (assigned in discovery order).
#[derive(Debug, Clone)]
pub(super) struct ImagePosition {
    // 0-based page index as supplied by the caller of `objects_to_page_data`.
    pub page_number: usize,
    // Document-wide image counter (continues across pages via `image_offset`).
    pub image_index: usize,
}
/// Removes tiny blocks hugging the page margins (line numbers, gutter marks).
///
/// Returns the input untouched (borrowed) when the page width is degenerate
/// or when fewer than three candidate sidebar blocks exist — a couple of
/// short marginal blocks are more likely real content than a sidebar.
pub(super) fn filter_sidebar_blocks(blocks: &[ExtractedBlock], page_width: f32) -> Cow<'_, [ExtractedBlock]> {
    if page_width <= 0.0 {
        return Cow::Borrowed(blocks);
    }
    // Margin bands: outermost 8% on each side of the page.
    let left_cutoff = page_width * 0.08;
    let right_cutoff = page_width * 0.92;
    match count_sidebar_blocks(blocks, left_cutoff, right_cutoff) {
        0..=2 => Cow::Borrowed(blocks),
        _ => Cow::Owned(filter_blocks_recursive(blocks, left_cutoff, right_cutoff)),
    }
}
/// Counts leaf blocks (recursively) that qualify as sidebar content.
fn count_sidebar_blocks(blocks: &[ExtractedBlock], left_cutoff: f32, right_cutoff: f32) -> usize {
    blocks
        .iter()
        .map(|block| {
            if block.children.is_empty() {
                // Leaf: contributes 1 if it sits in a margin band.
                usize::from(is_sidebar_block(block, left_cutoff, right_cutoff))
            } else {
                // Container: only its leaves are counted.
                count_sidebar_blocks(&block.children, left_cutoff, right_cutoff)
            }
        })
        .sum()
}
/// A sidebar block is short (1-3 visible chars) and lies entirely inside
/// one of the margin bands. Blocks without bounds never qualify.
fn is_sidebar_block(block: &ExtractedBlock, left_cutoff: f32, right_cutoff: f32) -> bool {
    let label = block.text.trim();
    if label.is_empty() || label.chars().count() > 3 {
        return false;
    }
    match &block.bounds {
        Some(bounds) => {
            let left = bounds.left().value;
            let right = bounds.right().value;
            // Fully left of the left band, or fully right of the right band.
            right < left_cutoff || left > right_cutoff
        }
        None => false,
    }
}
/// Produces a copy of `blocks` with sidebar leaves removed. Containers are
/// rebuilt with their filtered children and dropped entirely when empty.
fn filter_blocks_recursive(blocks: &[ExtractedBlock], left_cutoff: f32, right_cutoff: f32) -> Vec<ExtractedBlock> {
    let mut kept = Vec::with_capacity(blocks.len());
    for block in blocks {
        if block.children.is_empty() {
            // Leaf: keep unless it is sidebar content.
            if !is_sidebar_block(block, left_cutoff, right_cutoff) {
                kept.push(block.clone());
            }
        } else {
            // Container: recurse, and drop it if nothing survives.
            let children = filter_blocks_recursive(&block.children, left_cutoff, right_cutoff);
            if !children.is_empty() {
                kept.push(ExtractedBlock {
                    children,
                    ..block.clone()
                });
            }
        }
    }
    kept
}
/// Converts structure-tree blocks to paragraphs by bridging through the
/// intermediate page-content representation.
pub(super) fn extracted_blocks_to_paragraphs(blocks: &[ExtractedBlock]) -> Vec<PdfParagraph> {
    let page_content = super::adapters::from_structure_tree(blocks);
    super::content_convert::content_to_paragraphs(&page_content)
}
/// Extracts text segments and image positions from one PDF page.
///
/// Strategy ("best of two"): when raw page text data is available, two
/// candidate extractions are produced — one from the text DTO
/// (`extract_segments_from_dto`) and one from merged pdfium segments
/// (`extract_segments_merged`) — and the one whose token count is closer to
/// the reference full text wins. If both fail, a per-character fallback
/// (`chars_to_segments_from_data`) is tried. Finally, a paragraph-based
/// extraction over page objects is used as the last resort.
///
/// `image_offset` is a document-wide image counter; it is advanced for every
/// image object found so indices stay unique across pages.
pub(super) fn objects_to_page_data(
    page: &PdfPage,
    page_number: usize,
    image_offset: &mut usize,
) -> (Vec<SegmentData>, Vec<ImagePosition>) {
    let objects: Vec<PdfPageObject> = page.objects().iter().collect();
    // Record every image object, numbering with the running document index.
    let mut images = Vec::new();
    for obj in &objects {
        if obj.as_image_object().is_some() {
            images.push(ImagePosition {
                page_number,
                image_index: *image_offset,
            });
            *image_offset += 1;
        }
    }
    let page_height = page.height().value;
    let page_width = page.width().value;
    if let Some(data) = extract_page_text_data(page) {
        // Reference token count from pdfium's own full-page text.
        let ref_tokens = data.full_text.split_whitespace().count();
        let dto_result = extract_segments_from_dto(&data, page_width);
        let merged_result = extract_segments_merged(page, page_height);
        let dto_tokens = dto_result.as_ref().map_or(0, |segs| {
            segs.iter().map(|s| s.text.split_whitespace().count()).sum::<usize>()
        });
        let merged_tokens = merged_result.as_ref().map_or(0, |segs| {
            segs.iter().map(|s| s.text.split_whitespace().count()).sum::<usize>()
        });
        tracing::debug!(
            page = page_number,
            full_text_len = data.full_text.len(),
            ref_tokens,
            dto_tokens,
            merged_tokens,
            "best-of-two extraction: token comparison"
        );
        if let Some(ref segs) = dto_result {
            for (i, s) in segs.iter().enumerate() {
                tracing::debug!(page = page_number, seg = i, text = %s.text, "dto segment");
            }
        }
        if let Some(ref segs) = merged_result {
            for (i, s) in segs.iter().enumerate() {
                tracing::debug!(page = page_number, seg = i, text = %s.text, "merged segment");
            }
        }
        // Non-whitespace character counts, used to detect spurious extra
        // characters in the DTO extraction.
        let ref_chars = data.full_text.chars().filter(|c| !c.is_whitespace()).count();
        let dto_chars = dto_result.as_ref().map_or(0, |segs| {
            segs.iter()
                .flat_map(|s| s.text.chars())
                .filter(|c| !c.is_whitespace())
                .count()
        });
        let merged_chars = merged_result.as_ref().map_or(0, |segs| {
            segs.iter()
                .flat_map(|s| s.text.chars())
                .filter(|c| !c.is_whitespace())
                .count()
        });
        let best = if ref_tokens == 0 {
            // No reference text to compare against: prefer DTO when present.
            dto_result.or(merged_result)
        } else {
            let dto_diff = (dto_tokens as f64 - ref_tokens as f64).abs();
            let merged_diff = (merged_tokens as f64 - ref_tokens as f64).abs();
            if dto_result.is_some() && merged_result.is_some() {
                let dto_char_diff = (dto_chars as isize - ref_chars as isize).unsigned_abs();
                let merged_char_diff = (merged_chars as isize - ref_chars as isize).unsigned_abs();
                // If the merged result matches the reference exactly by char
                // count and the DTO doesn't, the DTO added junk characters.
                if dto_char_diff > merged_char_diff && merged_char_diff == 0 {
                    tracing::debug!(
                        page = page_number,
                        dto_chars,
                        merged_chars,
                        ref_chars,
                        "preferring merged: DTO has spurious characters"
                    );
                    merged_result
                } else {
                    // Otherwise prefer the DTO unless its token-count error
                    // exceeds the merged error by more than 10%.
                    let dto_within_10pct = dto_diff <= merged_diff * 1.1;
                    if dto_within_10pct { dto_result } else { merged_result }
                }
            } else {
                dto_result.or(merged_result)
            }
        };
        if let Some(mut segments) = best {
            if !data.full_text.is_empty() {
                // Re-join words the winner split mid-word, using the
                // reference text's vocabulary.
                repair_word_breaks_from_full_text(&mut segments, &data.full_text);
            }
            return (segments, images);
        }
        // Both strategies failed: fall back to per-character assembly.
        if let Some(segments) = chars_to_segments_from_data(&data, page_width) {
            return (segments, images);
        }
    }
    // Last resort: paragraph extraction over raw page objects, per column.
    let mut segments = Vec::new();
    let column_groups = super::columns::split_objects_into_columns(&objects);
    let column_vecs = partition_objects_by_columns(objects, &column_groups);
    for column_objects in &column_vecs {
        let paragraphs: Vec<PdfiumParagraph> = PdfiumParagraph::from_objects(column_objects);
        extract_paragraphs_to_segments(paragraphs, &mut segments);
    }
    // Apply ligature repairs (e.g. broken fi/fl glyphs) when a map exists.
    if let Some(repair_map) = build_ligature_repair_map(page) {
        for seg in &mut segments {
            if let Cow::Owned(s) = apply_ligature_repairs(&seg.text, &repair_map) {
                seg.text = s;
            }
        }
    }
    (segments, images)
}
/// One pdfium text segment with its geometry in both coordinate systems
/// plus a font sample taken from its first visible character.
struct TextCell {
    text: String,
    // Bounds in PDF coordinates (origin bottom-left, y increases upward).
    pdf_left: f32,
    pdf_bottom: f32,
    pdf_right: f32,
    pdf_top: f32,
    // Same vertical extent converted to top-down page coordinates
    // (page_height - pdf value); used for row grouping.
    top: f32,
    bottom: f32,
    // Font attributes sampled from the first non-whitespace char.
    font_size: f32,
    is_bold: bool,
    is_italic: bool,
    is_monospace: bool,
    baseline_y: f32,
}
/// A horizontal band of cells judged to lie on the same visual line.
struct TextRow {
    cells: Vec<TextCell>,
    // Extents in top-down coordinates; widened as cells join the row.
    top: f32,
    bottom: f32,
}
impl TextRow {
    /// Vertical extent of the row; `abs` keeps it non-negative regardless
    /// of which coordinate convention populated `top`/`bottom`.
    fn height(&self) -> f32 {
        (self.top - self.bottom).abs()
    }
}
/// Horizontally adjacent cells merged into one unit, with their union
/// bounds in PDF coordinates.
struct MergedCellGroup {
    cells: Vec<TextCell>,
    pdf_left: f32,
    pdf_bottom: f32,
    pdf_right: f32,
    pdf_top: f32,
}
/// Builds row-ordered `SegmentData` by clustering pdfium text segments into
/// visual rows and merging horizontally adjacent cells within each row.
///
/// Returns `None` when the page yields no usable text.
fn extract_segments_merged(page: &PdfPage, page_height: f32) -> Option<Vec<SegmentData>> {
    let text_obj = page.text().ok()?;
    let pdfium_segments = text_obj.segments();
    let seg_count = pdfium_segments.len();
    if seg_count == 0 {
        return None;
    }
    let page_text = text_obj.all();
    // When the page's text looks like inter-word spaces were lost, derive
    // spacing from glyph gaps (factor 0.33) instead of using text as-is.
    let page_needs_respacing = super::text_repair::text_has_broken_word_spacing(&page_text);
    let mut cells: Vec<TextCell> = Vec::with_capacity(seg_count);
    for i in 0..seg_count {
        let seg = match pdfium_segments.get(i) {
            Ok(s) => s,
            Err(_) => continue,
        };
        let text = if page_needs_respacing {
            seg.text_respaced(0.33)
        } else {
            seg.text()
        };
        if text.trim().is_empty() {
            continue;
        }
        let bounds = seg.bounds();
        let pdf_left = bounds.left().value;
        let pdf_bottom = bounds.bottom().value;
        let pdf_right = bounds.right().value;
        let pdf_top = bounds.top().value;
        // Convert to top-down coordinates for row grouping.
        let top = page_height - pdf_top;
        let bottom = page_height - pdf_bottom;
        let (font_size, is_bold, is_italic, is_monospace, baseline_y) = sample_font_from_segment(&seg);
        cells.push(TextCell {
            text,
            pdf_left,
            pdf_bottom,
            pdf_right,
            pdf_top,
            top,
            bottom,
            font_size,
            is_bold,
            is_italic,
            is_monospace,
            baseline_y,
        });
    }
    if cells.is_empty() {
        return None;
    }
    let page_width = page.width().value;
    // Drop cells that end inside the leftmost 5% of the page (margin strip).
    let sidebar_cutoff = page_width * 0.05;
    cells.retain(|c| c.pdf_right > sidebar_cutoff);
    if cells.is_empty() {
        return None;
    }
    let rows = group_cells_into_rows(cells);
    let mut segments = Vec::new();
    for row in rows {
        let merged_groups = merge_cells_in_row(row);
        for group in merged_groups {
            // Font/baseline attributes are taken from the group's first cell.
            let first = &group.cells[0];
            let first_font_size = first.font_size;
            let first_is_bold = first.is_bold;
            let first_is_italic = first.is_italic;
            let first_is_monospace = first.is_monospace;
            let first_baseline_y = first.baseline_y;
            let text = if group.cells.len() == 1 {
                let Some(cell) = group.cells.into_iter().next() else {
                    continue;
                };
                cell.text
            } else {
                // For merged groups, re-extract text from the union rect so
                // pdfium supplies the correct ordering/spacing; fall back to
                // bare concatenation when that yields nothing.
                let rect = PdfRect::new_from_values(group.pdf_bottom, group.pdf_left, group.pdf_top, group.pdf_right);
                let reextracted = text_obj.inside_rect(rect);
                if reextracted.trim().is_empty() {
                    group.cells.iter().map(|c| c.text.as_str()).collect::<Vec<_>>().join("")
                } else {
                    reextracted
                }
            };
            let trimmed = text.trim();
            if trimmed.is_empty() {
                continue;
            }
            let width = group.pdf_right - group.pdf_left;
            let height = group.pdf_top - group.pdf_bottom;
            segments.push(SegmentData {
                text: trimmed.to_string(),
                x: group.pdf_left,
                y: first_baseline_y,
                // Clamp to at least one font-size so degenerate bounds don't
                // produce zero-area segments.
                width: width.max(first_font_size),
                height: height.max(first_font_size),
                font_size: first_font_size,
                is_bold: first_is_bold,
                is_italic: first_is_italic,
                is_monospace: first_is_monospace,
                baseline_y: first_baseline_y,
            });
        }
    }
    if segments.is_empty() { None } else { Some(segments) }
}
/// Samples font attributes from a segment's first non-whitespace character.
///
/// Returns `(font_size, is_bold, is_italic, is_monospace, baseline_y)`;
/// falls back to a 12pt plain sample with the segment's bottom edge as the
/// baseline when no usable character is found.
fn sample_font_from_segment(seg: &pdfium_render::prelude::PdfPageTextSegment<'_>) -> (f32, bool, bool, bool, f32) {
    let bounds = seg.bounds();
    let default_baseline = bounds.bottom().value;
    if let Ok(seg_chars) = seg.chars() {
        for ch in seg_chars.iter() {
            let uv = ch.unicode_value();
            // Skip whitespace chars — their metrics are unrepresentative.
            if let Some(uc) = char::from_u32(uv)
                && uc.is_whitespace()
            {
                continue;
            }
            let scaled = ch.scaled_font_size().value;
            // Some PDFs report zero/negative scaled sizes; use 12pt then.
            let fs = if scaled > 0.0 { scaled } else { 12.0 };
            let info = ch.font_info();
            let mono = crate::pdf::text_data::is_truly_monospace(ch.font_is_fixed_pitch(), &info.0);
            let bl_y = ch.origin().map(|o| o.1.value).unwrap_or(default_baseline);
            return (fs, info.1, info.2, mono, bl_y);
        }
    }
    (12.0, false, false, false, default_baseline)
}
/// Clusters cells into visual rows: a cell joins the first existing row
/// whose top and bottom edges both lie within half the row's height of the
/// cell's edges; otherwise it starts a new row. Rows come back sorted
/// top-to-bottom.
fn group_cells_into_rows(cells: Vec<TextCell>) -> Vec<TextRow> {
    const VERTICAL_THRESHOLD: f32 = 0.5;
    let mut rows: Vec<TextRow> = Vec::new();
    for cell in cells {
        let cell_top = cell.top;
        let cell_bottom = cell.bottom;
        let target = rows.iter_mut().find(|row| {
            let tolerance = row.height().max(1.0) * VERTICAL_THRESHOLD;
            (cell_top - row.top).abs() <= tolerance && (cell_bottom - row.bottom).abs() <= tolerance
        });
        match target {
            Some(row) => {
                // Widen the row to cover the new cell.
                row.top = row.top.min(cell_top);
                row.bottom = row.bottom.max(cell_bottom);
                row.cells.push(cell);
            }
            None => rows.push(TextRow {
                cells: vec![cell],
                top: cell_top,
                bottom: cell_bottom,
            }),
        }
    }
    rows.sort_by(|a, b| a.top.partial_cmp(&b.top).unwrap_or(std::cmp::Ordering::Equal));
    rows
}
/// Merges left-to-right-adjacent cells in a row into groups. Two cells
/// merge when the horizontal gap between them is at most the row's average
/// cell height (threshold factor 1.0).
fn merge_cells_in_row(mut row: TextRow) -> Vec<MergedCellGroup> {
    const HORIZONTAL_THRESHOLD: f32 = 1.0;
    row.cells
        .sort_by(|a, b| a.pdf_left.partial_cmp(&b.pdf_left).unwrap_or(std::cmp::Ordering::Equal));
    // Average cell height approximates one character width for the gap test.
    let avg_height = if row.cells.is_empty() {
        12.0
    } else {
        let total: f32 = row.cells.iter().map(|c| (c.pdf_top - c.pdf_bottom).abs()).sum();
        total / row.cells.len() as f32
    };
    let merge_threshold = avg_height * HORIZONTAL_THRESHOLD;
    let mut groups: Vec<MergedCellGroup> = Vec::new();
    for cell in row.cells {
        match groups.last_mut() {
            Some(group) if cell.pdf_left - group.pdf_right <= merge_threshold => {
                // Close enough: absorb the cell and grow the union bounds.
                group.pdf_left = group.pdf_left.min(cell.pdf_left);
                group.pdf_bottom = group.pdf_bottom.min(cell.pdf_bottom);
                group.pdf_right = group.pdf_right.max(cell.pdf_right);
                group.pdf_top = group.pdf_top.max(cell.pdf_top);
                group.cells.push(cell);
            }
            _ => groups.push(MergedCellGroup {
                pdf_left: cell.pdf_left,
                pdf_bottom: cell.pdf_bottom,
                pdf_right: cell.pdf_right,
                pdf_top: cell.pdf_top,
                cells: vec![cell],
            }),
        }
    }
    groups
}
/// Re-joins words that were split by a spurious space, using the page's
/// reference full text as the vocabulary: "exam ple" becomes "example" only
/// when "example" appears in the full text and "exam" does not appear as a
/// standalone word there.
fn repair_word_breaks_from_full_text(segments: &mut [SegmentData], full_text: &str) {
    // Keep printable chars plus space/newline ('\n' is a control char, so it
    // must be whitelisted explicitly).
    let normalized_full: String = full_text
        .chars()
        .filter(|c| !c.is_control() || *c == ' ' || *c == '\n')
        .collect();
    let full_words: std::collections::HashSet<&str> = normalized_full.split_whitespace().collect();
    for seg in segments.iter_mut() {
        if !seg.text.contains(' ') {
            continue;
        }
        let mut result = String::with_capacity(seg.text.len());
        let words: Vec<&str> = seg.text.split(' ').collect();
        let mut i = 0;
        let mut changed = false;
        while i < words.len() {
            // Try to join this word with the next one.
            if i + 1 < words.len() && !words[i].is_empty() && !words[i + 1].is_empty() {
                let w1_clean: String = words[i].chars().filter(|c| !c.is_control()).collect();
                let w2_clean: String = words[i + 1].chars().filter(|c| !c.is_control()).collect();
                // Only join letter-to-lowercase boundaries — typical of a
                // word split mid-syllable, not of abbreviations or numbers.
                if w1_clean.ends_with(|c: char| c.is_alphabetic()) && w2_clean.starts_with(|c: char| c.is_lowercase()) {
                    let joined = format!("{}{}", w1_clean, w2_clean);
                    if full_words.contains(joined.as_str()) && !full_words.contains(w1_clean.as_str()) {
                        if !result.is_empty() {
                            result.push(' ');
                        }
                        result.push_str(&joined);
                        i += 2;
                        changed = true;
                        continue;
                    }
                }
            }
            if !result.is_empty() {
                result.push(' ');
            }
            result.push_str(words[i]);
            i += 1;
        }
        // Only replace the text when at least one join actually happened.
        if changed {
            seg.text = result;
        }
    }
}
/// A visual row of DTO segments, tracked by index into the filtered list.
struct SegmentRow {
    // Indices into the filtered segment slice, in insertion order.
    segment_indices: Vec<usize>,
    // Row extents in PDF coordinates (top > bottom since y grows upward).
    top: f32,
    bottom: f32,
}
/// Builds row-ordered `SegmentData` from the extracted text DTO: filters out
/// left-margin segments, clusters the rest into rows by vertical overlap,
/// then concatenates each row left-to-right, inserting spaces only where the
/// horizontal gap exceeds a per-segment character-width estimate.
///
/// Returns `None` when no usable segments remain.
fn extract_segments_from_dto(data: &PageTextData, page_width: f32) -> Option<Vec<SegmentData>> {
    if data.segments.is_empty() {
        return None;
    }
    // Drop segments whose horizontal centre lies in the leftmost 6% strip.
    let sidebar_cutoff = page_width * 0.06;
    let filtered: Vec<&ExtractedSegment> = data
        .segments
        .iter()
        .filter(|s| (s.left + s.right) * 0.5 > sidebar_cutoff)
        .filter(|s| !s.text.trim().is_empty())
        .collect();
    if filtered.is_empty() {
        return None;
    }
    let mut rows: Vec<SegmentRow> = Vec::new();
    for (seg_idx, seg) in filtered.iter().enumerate() {
        let seg_top = seg.top;
        let seg_bottom = seg.bottom;
        let seg_height = (seg_top - seg_bottom).abs().max(1.0);
        // A segment joins a row when both edges lie within half its own
        // height of the row's edges.
        let tolerance = seg_height * 0.5;
        let matching_row = rows
            .iter()
            .position(|row| (seg_top - row.top).abs() <= tolerance && (seg_bottom - row.bottom).abs() <= tolerance);
        if let Some(idx) = matching_row {
            // Widen the row (PDF coords: top is the larger y value).
            rows[idx].top = rows[idx].top.max(seg_top);
            rows[idx].bottom = rows[idx].bottom.min(seg_bottom);
            rows[idx].segment_indices.push(seg_idx);
        } else {
            rows.push(SegmentRow {
                segment_indices: vec![seg_idx],
                top: seg_top,
                bottom: seg_bottom,
            });
        }
    }
    // Descending top = top of page first (y grows upward in PDF space).
    rows.sort_by(|a, b| b.top.partial_cmp(&a.top).unwrap_or(std::cmp::Ordering::Equal));
    let mut result: Vec<SegmentData> = Vec::with_capacity(rows.len());
    for row in &rows {
        let mut sorted_segs: Vec<&ExtractedSegment> = row.segment_indices.iter().map(|&i| filtered[i]).collect();
        sorted_segs.sort_by(|a, b| a.left.partial_cmp(&b.left).unwrap_or(std::cmp::Ordering::Equal));
        // Drop exact duplicates (same text at effectively the same x).
        sorted_segs.dedup_by(|b_seg, a_seg| a_seg.text == b_seg.text && (a_seg.left - b_seg.left).abs() < 1.0);
        let mut row_text = String::new();
        let mut row_left = f32::MAX;
        let mut row_right = f32::MIN;
        // Font attributes for the whole row come from its first segment.
        let mut row_font_size = 12.0_f32;
        let mut row_bold = false;
        let mut row_italic = false;
        let mut row_mono = false;
        let mut row_baseline = 0.0_f32;
        let mut prev_right = f32::MIN;
        for seg in &sorted_segs {
            if row_text.is_empty() {
                row_left = seg.left;
                row_font_size = seg.font_size;
                row_bold = seg.is_bold;
                row_italic = seg.is_italic;
                row_mono = seg.is_monospace;
                row_baseline = seg.baseline_y;
            } else {
                // Insert a space only when the gap to the previous segment
                // exceeds a third of this segment's average char width.
                let gap = seg.left - prev_right;
                let seg_width = (seg.right - seg.left).max(0.1);
                let seg_chars = seg.text.chars().count().max(1) as f32;
                let avg_char_w = seg_width / seg_chars;
                let space_threshold = avg_char_w * 0.33;
                if gap > space_threshold {
                    row_text.push(' ');
                }
            }
            row_text.push_str(&seg.text);
            row_right = seg.right;
            prev_right = seg.right;
        }
        let trimmed = row_text.trim();
        if trimmed.is_empty() {
            continue;
        }
        result.push(SegmentData {
            text: trimmed.to_string(),
            x: row_left,
            y: row_baseline,
            // Clamp to at least one font-size to avoid zero-area segments.
            width: (row_right - row_left).max(row_font_size),
            height: (row.top - row.bottom).max(row_font_size),
            font_size: row_font_size,
            is_bold: row_bold,
            is_italic: row_italic,
            is_monospace: row_mono,
            baseline_y: row_baseline,
        });
    }
    if result.is_empty() {
        return None;
    }
    Some(result)
}
fn partition_objects_by_columns<'a>(
objects: Vec<PdfPageObject<'a>>,
column_groups: &[Vec<usize>],
) -> Vec<Vec<PdfPageObject<'a>>> {
if column_groups.len() <= 1 {
return vec![objects];
}
let total = objects.len();
let num_columns = column_groups.len();
let mut col_for_obj = vec![0usize; total];
for (col_idx, group) in column_groups.iter().enumerate() {
for &obj_idx in group {
if obj_idx < total {
col_for_obj[obj_idx] = col_idx;
}
}
}
let mut result: Vec<Vec<PdfPageObject<'a>>> = (0..num_columns).map(|_| Vec::new()).collect();
for (i, obj) in objects.into_iter().enumerate() {
result[col_for_obj[i]].push(obj);
}
result
}
/// A single extracted character with position and font metadata, used by
/// the per-character fallback extraction path.
#[derive(Clone)]
struct CharInfo {
    ch: char,
    // Left edge and baseline position.
    x: f32,
    y: f32,
    font_size: f32,
    // Right edge of the glyph's advance.
    right_x: f32,
    is_bold: bool,
    is_italic: bool,
    is_monospace: bool,
    // True when pdfium could not map the glyph to unicode cleanly.
    has_map_error: bool,
    // True when the glyph comes from a symbolic font (repairs are skipped).
    is_symbolic: bool,
    #[allow(dead_code)]
    is_hyphen: bool,
}
/// Removes characters that look like a vertical margin gutter (e.g. line
/// numbers running down the left edge).
///
/// A margin character population is only removed when all of these hold:
/// it is small (under 5% of non-space chars), it spans at least 30% of the
/// page's text height (a column, not a stray word), and most of its chars
/// do NOT start multi-char words on their line (real prose starting at the
/// margin would).
fn filter_sidebar_characters(char_infos: &mut Vec<CharInfo>, page_width: f32) {
    if char_infos.len() < 20 || page_width <= 0.0 {
        return;
    }
    let total_non_space = char_infos.iter().filter(|c| c.ch != ' ').count();
    if total_non_space < 20 {
        return;
    }
    // Candidate gutter: non-space chars inside the leftmost 6.5% band.
    let margin_band = page_width * 0.065;
    let margin_indices: Vec<usize> = char_infos
        .iter()
        .enumerate()
        .filter(|(_, c)| c.ch != ' ' && c.x < margin_band)
        .map(|(i, _)| i)
        .collect();
    // More than 5% of all chars in the band suggests real content, not a gutter.
    if margin_indices.is_empty() || margin_indices.len() * 20 > total_non_space {
        return;
    }
    let (y_min, y_max) = char_infos
        .iter()
        .filter(|c| c.ch != ' ')
        .fold((f32::INFINITY, f32::NEG_INFINITY), |(lo, hi), c| {
            (lo.min(c.y), hi.max(c.y))
        });
    let page_text_height = (y_max - y_min).max(1.0);
    let (margin_y_min, margin_y_max) =
        margin_indices
            .iter()
            .fold((f32::INFINITY, f32::NEG_INFINITY), |(lo, hi), &i| {
                let y = char_infos[i].y;
                (lo.min(y), hi.max(y))
            });
    let margin_y_span = (margin_y_max - margin_y_min).abs();
    // A gutter runs down the page; a short vertical span is ordinary text.
    if margin_y_span < page_text_height * 0.3 {
        return;
    }
    // Count margin chars immediately followed (same line, close x) by
    // another non-space char — i.e. chars that start a word.
    let mut word_start_count = 0usize;
    for &idx in &margin_indices {
        if idx + 1 < char_infos.len() {
            let curr = &char_infos[idx];
            let next = &char_infos[idx + 1];
            let same_line = (curr.y - next.y).abs() < curr.font_size * 0.5;
            let close_x = (next.x - curr.x) < curr.font_size * 1.2;
            if same_line && close_x && next.ch != ' ' {
                word_start_count += 1;
            }
        }
    }
    // Majority word-starters means the band holds real prose; keep it.
    if word_start_count * 2 > margin_indices.len() {
        return;
    }
    // Compact in place, dropping the flagged margin chars while preserving
    // the relative order of the kept ones.
    let mut keep = vec![true; char_infos.len()];
    for &idx in &margin_indices {
        keep[idx] = false;
    }
    let mut write = 0;
    #[allow(clippy::needless_range_loop)]
    for read in 0..char_infos.len() {
        if keep[read] {
            char_infos.swap(write, read);
            write += 1;
        }
    }
    char_infos.truncate(write);
}
/// Assembles one line of text from positioned chars: applies glyph repairs,
/// removes spaces contradicted by geometry, and inserts spaces where large
/// x-gaps indicate a word boundary with no explicit space char.
fn build_line_text(chars: &[CharInfo], repair_map: Option<&[(char, &str)]>) -> String {
    let mut line_text = String::new();
    for (idx, ci) in chars.iter().enumerate() {
        // Chars that failed unicode mapping (and are not from a symbolic
        // font) get substituted via the repair map when one is provided.
        if ci.has_map_error
            && !ci.is_symbolic
            && let Some(map) = repair_map
            && let Some((_, replacement)) = map.iter().find(|(c, _)| *c == ci.ch)
        {
            line_text.push_str(replacement);
            continue;
        }
        if idx > 0 && ci.ch != ' ' {
            let prev = &chars[idx - 1];
            if prev.ch == ' ' {
                // An explicit space precedes us: validate it against the gap
                // back to the last real (non-space) glyph. If that gap is
                // under half an average char width, the space is spurious —
                // pop it off the output.
                let last_real = chars[..idx - 1].iter().rev().find(|c| c.ch != ' ');
                if let Some(real_prev) = last_real {
                    let gap = ci.x - real_prev.right_x;
                    let real_prev_width = (real_prev.right_x - real_prev.x).max(0.0);
                    let curr_width = (ci.right_x - ci.x).max(0.0);
                    let avg_char_width = if real_prev_width > 0.0 && curr_width > 0.0 {
                        (real_prev_width + curr_width) * 0.5
                    } else {
                        // No usable widths: estimate from font sizes.
                        (ci.font_size + real_prev.font_size) * 0.3
                    };
                    if gap < avg_char_width * 0.5 {
                        line_text.pop();
                    }
                }
            } else {
                // No explicit space: insert one when the x-gap exceeds
                // roughly one average glyph height.
                let gap = ci.x - prev.right_x;
                let avg_height = (ci.font_size + prev.font_size) * 0.5;
                if gap > avg_height {
                    line_text.push(' ');
                }
            }
        }
        line_text.push(ci.ch);
    }
    line_text
}
// Minimum non-space chars each side of a split must hold to count as a column.
const MIN_CHARS_PER_COLUMN: usize = 20;
// Minimum inter-column gap as a fraction of the text block's horizontal span.
const MIN_CHAR_COLUMN_GAP_FRACTION: f32 = 0.04;
// Absolute floor (in page units) for the inter-column gap.
const MIN_CHAR_COLUMN_GAP_ABS: f32 = 20.0;
// Each column must cover at least this fraction of the total vertical span.
const MIN_CHAR_COLUMN_VERTICAL_SPAN: f32 = 0.3;
// Maximum recursion depth when subdividing detected columns further.
const MAX_CHAR_COLUMN_DEPTH: usize = 3;
/// Detects vertical column boundaries from character geometry.
///
/// Finds the widest horizontal gap crossed by no glyph; if the gap is wide
/// enough and both sides hold enough chars spanning enough vertical extent,
/// the gap's midpoint becomes a split, and each side is searched recursively
/// (up to `MAX_CHAR_COLUMN_DEPTH`). Returns the split x-positions in
/// ascending order, or an empty vec for single-column text.
fn detect_char_column_splits(char_infos: &[CharInfo]) -> Vec<f32> {
    fn detect_recursive(chars: &[CharInfo], depth: usize) -> Vec<f32> {
        if depth >= MAX_CHAR_COLUMN_DEPTH {
            return Vec::new();
        }
        let non_space: Vec<&CharInfo> = chars.iter().filter(|c| c.ch != ' ').collect();
        if non_space.len() < MIN_CHARS_PER_COLUMN * 2 {
            return Vec::new();
        }
        let x_min = non_space.iter().map(|c| c.x).fold(f32::MAX, f32::min);
        let x_max = non_space.iter().map(|c| c.right_x).fold(f32::MIN, f32::max);
        let x_span = x_max - x_min;
        if x_span < 1.0 {
            return Vec::new();
        }
        let y_min = non_space.iter().map(|c| c.y).fold(f32::MAX, f32::min);
        let y_max = non_space.iter().map(|c| c.y).fold(f32::MIN, f32::max);
        let y_span = y_max - y_min;
        if y_span < 1.0 {
            return Vec::new();
        }
        // Sweep glyph edges left-to-right, tracking the furthest right edge
        // seen; the largest clear gap between that and the next left edge is
        // the split candidate.
        let mut edges: Vec<(f32, f32)> = non_space.iter().map(|c| (c.x, c.right_x)).collect();
        edges.sort_by(|a, b| a.0.total_cmp(&b.0));
        let mut max_right = f32::MIN;
        let mut best_gap = 0.0_f32;
        let mut best_split: Option<f32> = None;
        for &(left, right) in &edges {
            if max_right > f32::MIN {
                let gap = left - max_right;
                if gap > best_gap {
                    best_gap = gap;
                    best_split = Some((max_right + left) / 2.0);
                }
            }
            max_right = max_right.max(right);
        }
        let min_gap = (x_span * MIN_CHAR_COLUMN_GAP_FRACTION).max(MIN_CHAR_COLUMN_GAP_ABS);
        if best_gap < min_gap {
            return Vec::new();
        }
        let split_x = match best_split {
            Some(x) => x,
            None => return Vec::new(),
        };
        // Validate both sides: enough chars and enough vertical coverage,
        // otherwise the "gap" is just layout noise (e.g. a short table).
        let left_chars: Vec<&CharInfo> = non_space.iter().filter(|c| c.x < split_x).copied().collect();
        let right_chars: Vec<&CharInfo> = non_space.iter().filter(|c| c.x >= split_x).copied().collect();
        if left_chars.len() < MIN_CHARS_PER_COLUMN || right_chars.len() < MIN_CHARS_PER_COLUMN {
            return Vec::new();
        }
        let left_y_min = left_chars.iter().map(|c| c.y).fold(f32::MAX, f32::min);
        let left_y_max = left_chars.iter().map(|c| c.y).fold(f32::MIN, f32::max);
        let right_y_min = right_chars.iter().map(|c| c.y).fold(f32::MAX, f32::min);
        let right_y_max = right_chars.iter().map(|c| c.y).fold(f32::MIN, f32::max);
        let left_y_span = left_y_max - left_y_min;
        let right_y_span = right_y_max - right_y_min;
        if left_y_span < y_span * MIN_CHAR_COLUMN_VERTICAL_SPAN || right_y_span < y_span * MIN_CHAR_COLUMN_VERTICAL_SPAN
        {
            return Vec::new();
        }
        // Recurse into each side (spaces included again) for sub-columns.
        let left_all: Vec<CharInfo> = chars.iter().filter(|c| c.x < split_x).cloned().collect();
        let right_all: Vec<CharInfo> = chars.iter().filter(|c| c.x >= split_x).cloned().collect();
        let mut splits = detect_recursive(&left_all, depth + 1);
        splits.push(split_x);
        splits.extend(detect_recursive(&right_all, depth + 1));
        splits.sort_by(|a, b| a.total_cmp(b));
        splits
    }
    detect_recursive(char_infos, 0)
}
/// Assigns each char to a column (the number of split positions at or left
/// of its x), then sorts every column into reading order.
fn partition_chars_by_columns(chars: Vec<CharInfo>, splits: &[f32]) -> Vec<Vec<CharInfo>> {
    let mut columns: Vec<Vec<CharInfo>> = Vec::with_capacity(splits.len() + 1);
    columns.resize_with(splits.len() + 1, Vec::new);
    for ci in chars {
        // Column index = how many splits lie at or before this char's x.
        let mut col = 0usize;
        for &split_x in splits {
            if ci.x >= split_x {
                col += 1;
            }
        }
        columns[col].push(ci);
    }
    for col in &mut columns {
        sort_chars_reading_order(col);
    }
    columns
}
fn sort_chars_reading_order(chars: &mut [CharInfo]) {
if chars.len() < 2 {
return;
}
let avg_font_size = chars.iter().map(|c| c.font_size).sum::<f32>() / chars.len() as f32;
let y_tolerance = avg_font_size * 0.5;
if y_tolerance <= 0.0 {
return;
}
chars.sort_by(|a, b| {
let a_band = (a.y / y_tolerance).round() as i64;
let b_band = (b.y / y_tolerance).round() as i64;
b_band.cmp(&a_band).then_with(|| a.x.total_cmp(&b.x))
});
}
/// Groups positioned chars into visual lines, merges lines broken by a
/// trailing hyphen, and emits one `SegmentData` per assembled line.
///
/// The line-break threshold is the median vertical jump between adjacent
/// non-space chars (scaled by 0.6), falling back to half the average font
/// size when too few jumps are observed.
fn assemble_segments_from_chars(char_infos: &[CharInfo], repair_map: Option<&[(char, &str)]>) -> Vec<SegmentData> {
    if char_infos.is_empty() {
        return Vec::new();
    }
    // Collect plausible line-height samples: vertical jumps between
    // consecutive non-space chars, ignoring noise (<1) and page-scale
    // jumps (>=200).
    let mut y_jumps: Vec<f32> = Vec::new();
    for i in 1..char_infos.len() {
        if char_infos[i].ch == ' ' || char_infos[i - 1].ch == ' ' {
            continue;
        }
        let dy = (char_infos[i].y - char_infos[i - 1].y).abs();
        if dy > 1.0 && dy < 200.0 {
            y_jumps.push(dy);
        }
    }
    let line_height_threshold = if y_jumps.len() >= 3 {
        y_jumps.sort_by(|a, b| a.total_cmp(b));
        // Median jump, scaled down so slightly-irregular lines still break.
        y_jumps[y_jumps.len() / 2] * 0.6
    } else {
        let avg_fs = char_infos.iter().map(|c| c.font_size).sum::<f32>() / char_infos.len() as f32;
        avg_fs * 0.5
    };
    let line_break_threshold = line_height_threshold.max(2.0);
    // Split chars into [start, end) ranges, one per visual line. A break
    // happens when a non-space char jumps vertically past the threshold
    // relative to the line's first char.
    let mut line_ranges: Vec<(usize, usize)> = Vec::new();
    {
        let mut ls = 0;
        for i in 1..=char_infos.len() {
            let brk = if i == char_infos.len() {
                true
            } else {
                let dy = (char_infos[i].y - char_infos[ls].y).abs();
                dy > line_break_threshold && char_infos[i].ch != ' '
            };
            if brk {
                line_ranges.push((ls, i));
                if i < char_infos.len() {
                    ls = i;
                }
            }
        }
    }
    let _right_margin = compute_right_margin(char_infos, &line_ranges);
    let mut segments = Vec::new();
    // Text accumulated across hyphen-joined lines, and the char index where
    // that accumulated run started.
    let mut pending_text: Option<String> = None;
    let mut pending_start: usize = 0;
    for (range_idx, &(start, end)) in line_ranges.iter().enumerate() {
        let line_text = build_line_text(&char_infos[start..end], repair_map);
        let trimmed = line_text.trim();
        if trimmed.is_empty() {
            continue;
        }
        if let Some(ref mut pending) = pending_text {
            pending.push_str(trimmed);
        } else {
            pending_text = Some(trimmed.to_string());
            pending_start = start;
        }
        // A trailing break-hyphen means the word continues on the next line:
        // strip the hyphen and keep accumulating.
        let merge_with_next = range_idx + 1 < line_ranges.len() && line_ends_with_break_hyphen(&char_infos[start..end]);
        if merge_with_next {
            if let Some(ref mut pending) = pending_text {
                strip_trailing_hyphen(pending);
            }
            continue;
        }
        if let Some(text) = pending_text.take() {
            segments.push(make_line_segment(char_infos, pending_start, end, text));
        }
    }
    // Flush a run left pending when a hyphen merge was followed only by
    // blank lines (the loop never reached a flush point for it).
    if let Some(text) = pending_text.take() {
        let Some(last_range) = line_ranges.last() else {
            return segments;
        };
        segments.push(make_line_segment(char_infos, pending_start, last_range.1, text));
    }
    segments
}

/// Builds the `SegmentData` for an assembled line: position and font come
/// from the first char of the run, the right edge from the last non-space
/// char within `[start, end)` (falling back to the first char).
fn make_line_segment(char_infos: &[CharInfo], start: usize, end: usize, text: String) -> SegmentData {
    let first = &char_infos[start];
    let last_idx = (start..end)
        .rev()
        .find(|&j| char_infos[j].ch != ' ')
        .unwrap_or(start);
    let last = &char_infos[last_idx];
    // Width clamped to at least one font-size to avoid zero-width segments.
    let width = (last.right_x - first.x).max(first.font_size);
    SegmentData {
        text,
        x: first.x,
        y: first.y,
        width,
        height: first.font_size,
        font_size: first.font_size,
        is_bold: first.is_bold,
        is_italic: first.is_italic,
        is_monospace: first.is_monospace,
        baseline_y: first.y,
    }
}
/// Finds the rightmost edge among lines with at least 3 non-space chars
/// (short lines would skew the margin). Returns `f32::MIN` when no line
/// qualifies.
fn compute_right_margin(char_infos: &[CharInfo], line_ranges: &[(usize, usize)]) -> f32 {
    line_ranges
        .iter()
        .filter_map(|&(start, end)| {
            let line = &char_infos[start..end];
            if line.iter().filter(|c| c.ch != ' ').count() < 3 {
                return None;
            }
            // Right edge of the line's last non-space char.
            line.iter().rev().find(|c| c.ch != ' ').map(|c| c.right_x)
        })
        .fold(f32::MIN, f32::max)
}
/// True when the line's last visible char is a word-break hyphen (flagged
/// as a hyphen glyph AND one of the recognized hyphen code points) and the
/// char before it is alphabetic — i.e. a word split across lines, not a
/// standalone dash.
fn line_ends_with_break_hyphen(line_chars: &[CharInfo]) -> bool {
    let mut visible = line_chars.iter().rev().filter(|c| c.ch != ' ');
    let Some(last) = visible.next() else {
        return false;
    };
    if !last.is_hyphen {
        return false;
    }
    // hyphen-minus, hyphen, soft hyphen, non-breaking hyphen
    if !matches!(last.ch, '-' | '\u{2010}' | '\u{00AD}' | '\u{2011}') {
        return false;
    }
    visible.next().is_some_and(|c| c.ch.is_alphabetic())
}
/// Removes a trailing hyphen-like character (hyphen-minus, hyphen, soft
/// hyphen, or non-breaking hyphen) from `text`, if one is present.
fn strip_trailing_hyphen(text: &mut String) {
    const HYPHENS: [char; 4] = ['-', '\u{2010}', '\u{00AD}', '\u{2011}'];
    let ends_with_hyphen = text.chars().next_back().is_some_and(|ch| HYPHENS.contains(&ch));
    if ends_with_hyphen {
        text.pop();
    }
}
/// Per-character fallback extraction: converts raw extracted chars into
/// segments, after filtering sidebar gutters and (when detected) splitting
/// the page into columns so each column is assembled in its own reading
/// order. Returns `None` when nothing usable remains.
fn chars_to_segments_from_data(data: &PageTextData, page_width: f32) -> Option<Vec<SegmentData>> {
    if data.chars.is_empty() {
        return None;
    }
    // Copy the DTO chars into the local working representation.
    let mut char_infos: Vec<CharInfo> = data
        .chars
        .iter()
        .map(|ec| CharInfo {
            ch: ec.ch,
            x: ec.x,
            y: ec.y,
            right_x: ec.right_x,
            font_size: ec.font_size,
            is_bold: ec.is_bold,
            is_italic: ec.is_italic,
            is_monospace: ec.is_monospace,
            has_map_error: ec.has_map_error,
            is_symbolic: ec.is_symbolic,
            is_hyphen: ec.is_hyphen,
        })
        .collect();
    filter_sidebar_characters(&mut char_infos, page_width);
    if char_infos.is_empty() {
        return None;
    }
    let column_splits = detect_char_column_splits(&char_infos);
    let repair_map = data.ligature_repair_map.as_deref();
    let segments = if column_splits.is_empty() {
        assemble_segments_from_chars(&char_infos, repair_map)
    } else {
        // Multi-column page: assemble each column separately so text is not
        // interleaved across columns.
        let columns = partition_chars_by_columns(char_infos, &column_splits);
        columns
            .iter()
            .flat_map(|col| assemble_segments_from_chars(col, repair_map))
            .collect()
    };
    if segments.is_empty() { None } else { Some(segments) }
}
/// Flattens pdfium paragraphs into `SegmentData`, appending to `segments`.
///
/// Fragments carry no per-fragment geometry, so each styled fragment's
/// width is estimated from its glyph count and font size, and an x-cursor
/// advances left-to-right along the line. Non-text fragments and line
/// breaks are skipped.
fn extract_paragraphs_to_segments(paragraphs: Vec<PdfiumParagraph>, segments: &mut Vec<SegmentData>) {
    for para in paragraphs {
        for line in para.into_lines() {
            let line_baseline = line.bottom.value;
            let line_left = line.left.value;
            // Estimated x position of the next fragment on this line.
            let mut running_x = line_left;
            for fragment in &line.fragments {
                match fragment {
                    PdfParagraphFragment::StyledString(styled) => {
                        let text = normalize_text_encoding(styled.text());
                        if text.trim().is_empty() {
                            continue;
                        }
                        let font_size = styled.font_size().value;
                        let is_bold = styled.is_bold();
                        let is_italic = styled.is_italic();
                        let is_monospace = styled.is_monospace();
                        // Use the character count, not `len()`: the UTF-8
                        // byte length overestimates width for non-ASCII text
                        // (each multibyte glyph would count 2-4 times).
                        let glyph_count = text.chars().count() as f32;
                        let estimated_width = glyph_count * font_size * 0.5;
                        segments.push(SegmentData {
                            text: text.into_owned(),
                            x: running_x,
                            y: line_baseline,
                            width: estimated_width,
                            height: font_size,
                            font_size,
                            is_bold,
                            is_italic,
                            is_monospace,
                            baseline_y: line_baseline,
                        });
                        running_x += estimated_width;
                    }
                    PdfParagraphFragment::NonTextObject(_) | PdfParagraphFragment::LineBreak { .. } => {}
                }
            }
        }
    }
}
#[cfg(test)]
mod tests {
use super::*;
fn make_block(role: ContentRole, text: &str) -> ExtractedBlock {
ExtractedBlock {
role,
text: text.to_string(),
bounds: None,
font_size: Some(12.0),
is_bold: false,
is_italic: false,
is_monospace: false,
children: Vec::new(),
}
}
fn make_block_with_font(role: ContentRole, text: &str, font_size: f32) -> ExtractedBlock {
ExtractedBlock {
role,
text: text.to_string(),
bounds: None,
font_size: Some(font_size),
is_bold: false,
is_italic: false,
is_monospace: false,
children: Vec::new(),
}
}
#[test]
fn test_heading_block() {
let blocks = vec![
make_block_with_font(ContentRole::Heading { level: 2 }, "Section Title", 18.0),
make_block_with_font(ContentRole::Paragraph, "Body text line one", 12.0),
make_block_with_font(ContentRole::Paragraph, "Body text line two", 12.0),
make_block_with_font(ContentRole::Paragraph, "Body text line three", 12.0),
];
let paragraphs = extracted_blocks_to_paragraphs(&blocks);
assert_eq!(paragraphs.len(), 4);
assert_eq!(paragraphs[0].heading_level, Some(2));
}
#[test]
fn test_heading_trusted_from_structure_tree() {
let blocks = vec![
make_block(ContentRole::Heading { level: 3 }, "Not really a heading"),
make_block(ContentRole::Paragraph, "Body text"),
make_block(ContentRole::Paragraph, "More body text"),
];
let paragraphs = extracted_blocks_to_paragraphs(&blocks);
assert_eq!(paragraphs.len(), 3);
assert_eq!(paragraphs[0].heading_level, Some(3)); }
#[test]
fn test_body_block() {
let blocks = vec![make_block(ContentRole::Paragraph, "Body text")];
let paragraphs = extracted_blocks_to_paragraphs(&blocks);
assert_eq!(paragraphs.len(), 1);
assert_eq!(paragraphs[0].heading_level, None);
assert!(!paragraphs[0].is_list_item);
}
#[test]
fn test_list_item_block() {
let blocks = vec![ExtractedBlock {
role: ContentRole::ListItem {
label: Some("1.".to_string()),
},
text: "First item".to_string(),
bounds: None,
font_size: Some(12.0),
is_bold: false,
is_italic: false,
is_monospace: false,
children: Vec::new(),
}];
let paragraphs = extracted_blocks_to_paragraphs(&blocks);
assert_eq!(paragraphs.len(), 1);
assert!(paragraphs[0].is_list_item);
let first_seg_text = ¶graphs[0].lines[0].segments[0].text;
assert_eq!(first_seg_text, "1.");
}
#[test]
fn test_empty_text_skipped() {
let blocks = vec![make_block(ContentRole::Paragraph, "")];
let paragraphs = extracted_blocks_to_paragraphs(&blocks);
assert!(paragraphs.is_empty());
}
#[test]
fn test_whitespace_only_skipped() {
let blocks = vec![make_block(ContentRole::Paragraph, " ")];
let paragraphs = extracted_blocks_to_paragraphs(&blocks);
assert!(paragraphs.is_empty());
}
#[test]
fn test_children_processed() {
let blocks = vec![ExtractedBlock {
role: ContentRole::Other("Table".to_string()),
text: String::new(),
bounds: None,
font_size: None,
is_bold: false,
is_italic: false,
is_monospace: false,
children: vec![
make_block(ContentRole::Paragraph, "Cell 1"),
make_block(ContentRole::Paragraph, "Cell 2"),
],
}];
let paragraphs = extracted_blocks_to_paragraphs(&blocks);
assert_eq!(paragraphs.len(), 2);
}
/// Test fixture: a `CharInfo` at (`x`, `y`) whose right edge is
/// approximated as `x + 0.6 * font_size`, with every style/flag field
/// cleared.
fn make_char(ch: char, x: f32, y: f32, font_size: f32) -> CharInfo {
    let right_x = x + font_size * 0.6;
    CharInfo {
        ch,
        x,
        y,
        font_size,
        right_x,
        is_bold: false,
        is_italic: false,
        is_monospace: false,
        has_map_error: false,
        is_symbolic: false,
        is_hyphen: false,
    }
}
/// Issue 431: a large horizontal gap between glyph runs must be rendered
/// as a space ("Main" at x=100..128, "Deck" at x=200..228).
#[test]
fn test_issue_431_build_line_text_inserts_spaces_at_x_gaps() {
    let fs = 12.0;
    let mut chars: Vec<CharInfo> = "Main"
        .chars()
        .enumerate()
        .map(|(i, c)| make_char(c, 100.0 + i as f32 * 7.0, 0.0, fs))
        .collect();
    chars.extend(
        "Deck"
            .chars()
            .enumerate()
            .map(|(i, c)| make_char(c, 200.0 + i as f32 * 7.0, 0.0, fs)),
    );
    let result = build_line_text(&chars, None);
    assert_eq!(result, "Main Deck", "Large X-gap should produce a space between words");
}
/// Issue 431 (negative case): normal 7pt inter-glyph advances at 12pt
/// must not be mistaken for word gaps.
#[test]
fn test_issue_431_build_line_text_no_false_spaces() {
    let fs = 12.0;
    let chars: Vec<CharInfo> = "Hello"
        .chars()
        .enumerate()
        .map(|(i, c)| make_char(c, 100.0 + i as f32 * 7.0, 0.0, fs))
        .collect();
    let result = build_line_text(&chars, None);
    assert_eq!(result, "Hello", "Normal spacing should not insert extra spaces");
}
/// Issue 431: numbers in adjacent table columns ("12,40" then "480" after
/// a wide column gap) must come out space-separated, not concatenated.
#[test]
fn test_issue_431_tabular_numbers_with_gaps() {
    let fs = 12.0;
    let mut chars: Vec<CharInfo> = "12,40"
        .chars()
        .enumerate()
        .map(|(i, c)| make_char(c, 50.0 + i as f32 * 7.0, 0.0, fs))
        .collect();
    chars.extend(
        "480"
            .chars()
            .enumerate()
            .map(|(i, c)| make_char(c, 200.0 + i as f32 * 7.0, 0.0, fs)),
    );
    let result = build_line_text(&chars, None);
    assert_eq!(result, "12,40 480", "Column gap should produce space between numbers");
}
/// Issue 431: when an explicit space glyph already covers the gap, the
/// builder must not insert a second space.
#[test]
fn test_issue_431_preserves_existing_spaces() {
    let fs = 12.0;
    let layout = [('A', 100.0), (' ', 107.0), ('B', 200.0)];
    let chars: Vec<CharInfo> = layout
        .iter()
        .map(|&(c, x)| make_char(c, x, 0.0, fs))
        .collect();
    let result = build_line_text(&chars, None);
    assert_eq!(result, "A B", "Should not insert extra space when space char exists");
}
/// Test fixture: an evenly spaced grid of 'a' glyphs filling the
/// rectangle [x_start, x_end] x [y_start, y_end] — `num_lines` rows of
/// `chars_per_line` characters each. With 0 or 1 items along an axis the
/// step is 0 (everything sits at the start coordinate).
fn make_column_chars(
    x_start: f32,
    x_end: f32,
    y_start: f32,
    y_end: f32,
    chars_per_line: usize,
    num_lines: usize,
    font_size: f32,
) -> Vec<CharInfo> {
    let x_step = match chars_per_line {
        0 | 1 => 0.0,
        n => (x_end - x_start) / (n as f32 - 1.0),
    };
    let y_step = match num_lines {
        0 | 1 => 0.0,
        n => (y_end - y_start) / (n as f32 - 1.0),
    };
    (0..num_lines)
        .flat_map(|row| {
            let y = y_start + row as f32 * y_step;
            (0..chars_per_line)
                .map(move |col| make_char('a', x_start + col as f32 * x_step, y, font_size))
        })
        .collect()
}
/// A single dense column of text must not be split.
#[test]
fn test_detect_no_split_single_column() {
    let grid = make_column_chars(10.0, 80.0, 0.0, 500.0, 10, 10, 12.0);
    let splits = detect_char_column_splits(&grid);
    assert!(splits.is_empty(), "Single column should produce no splits");
}
/// Two columns separated by a wide gutter (200..350) produce exactly one
/// split, located inside the gutter.
#[test]
fn test_detect_two_columns() {
    let mut page = make_column_chars(10.0, 200.0, 0.0, 400.0, 5, 6, 12.0);
    page.extend(make_column_chars(350.0, 540.0, 0.0, 400.0, 5, 6, 12.0));
    let splits = detect_char_column_splits(&page);
    assert_eq!(splits.len(), 1, "Should detect one column split");
    let split_x = splits[0];
    assert!(
        split_x > 200.0 && split_x < 350.0,
        "Split should be between columns"
    );
}
/// Three columns yield two splits, returned in ascending x order.
#[test]
fn test_detect_three_columns() {
    let mut page = make_column_chars(10.0, 120.0, 0.0, 400.0, 4, 6, 12.0);
    for (lo, hi) in [(250.0, 370.0), (500.0, 620.0)] {
        page.extend(make_column_chars(lo, hi, 0.0, 400.0, 4, 6, 12.0));
    }
    let splits = detect_char_column_splits(&page);
    assert_eq!(splits.len(), 2, "Should detect two column splits for 3 columns");
    assert!(splits[0] < splits[1], "Splits should be sorted");
}
/// Wide single/double-line table rows must not be mistaken for columns:
/// the detector needs multiple lines of evidence on each side of a gap.
///
/// Fix: the original test also built a `chars` vector (two 3-line column
/// fixtures) that was never passed to the detector — dead code triggering
/// `unused_mut`/`unused_variables` warnings. It has been removed; the
/// assertion exercised only `total_chars` before and still does.
#[test]
fn test_no_false_split_table() {
    // Two full-width rows only — not enough vertical evidence for a split.
    let total_chars = make_column_chars(10.0, 400.0, 200.0, 210.0, 20, 2, 12.0);
    let splits = detect_char_column_splits(&total_chars);
    assert!(splits.is_empty(), "Single-line table data should not split");
}
/// With only 3x3 characters per side there is too little evidence to
/// declare a column split.
#[test]
fn test_no_false_split_few_chars() {
    let mut page = make_column_chars(10.0, 100.0, 0.0, 400.0, 3, 3, 12.0);
    page.extend(make_column_chars(300.0, 400.0, 0.0, 400.0, 3, 3, 12.0));
    let splits = detect_char_column_splits(&page);
    assert!(splits.is_empty(), "Too few chars per side should not split");
}
/// An ordinary word gap (≈1 char width between "aaaaa" and "bbbbb") that
/// repeats on every line must not be treated as a column gutter.
#[test]
fn test_no_false_split_word_spacing() {
    let fs = 12.0;
    let mut chars = Vec::new();
    for row in 0..10 {
        let y = row as f32 * 15.0;
        chars.extend((0..5).map(|i| make_char('a', 10.0 + i as f32 * 7.0, y, fs)));
        chars.extend((0..5).map(|i| make_char('b', 53.0 + i as f32 * 7.0, y, fs)));
    }
    let splits = detect_char_column_splits(&chars);
    assert!(splits.is_empty(), "Normal word spacing should not trigger column split");
}
/// Three lines at descending y (PDF coordinates: larger y is higher on
/// the page) become three segments in top-to-bottom order.
#[test]
fn test_assemble_segments_basic() {
    let layout = [
        ('H', 10.0, 100.0),
        ('i', 20.0, 100.0),
        ('B', 10.0, 80.0),
        ('y', 20.0, 80.0),
        ('e', 30.0, 80.0),
        ('!', 10.0, 60.0),
    ];
    let chars: Vec<CharInfo> = layout
        .iter()
        .map(|&(c, x, y)| make_char(c, x, y, 12.0))
        .collect();
    let segments = assemble_segments_from_chars(&chars, None);
    assert_eq!(segments.len(), 3, "Should produce 3 segments for 3 lines");
    assert_eq!(segments[0].text, "Hi");
    assert_eq!(segments[1].text, "Bye");
    assert_eq!(segments[2].text, "!");
}
/// Two columns, all left-column chars pushed before all right-column
/// chars: partitioning must yield two clean columns of 5 lines each.
#[test]
fn test_two_column_ordered_segments() {
    let mut chars = Vec::new();
    // Left column: 'L' glyphs starting at x=50; then right column: 'R' at x=350.
    for (glyph, x0) in [('L', 50.0), ('R', 350.0)] {
        for row in 0..5 {
            let y = 300.0 - row as f32 * 20.0;
            chars.extend((0..5).map(|c| make_char(glyph, x0 + c as f32 * 8.0, y, 12.0)));
        }
    }
    let splits = detect_char_column_splits(&chars);
    assert!(!splits.is_empty(), "Should detect column split");
    let columns = partition_chars_by_columns(chars, &splits);
    assert_eq!(columns.len(), 2);
    let left_segs = assemble_segments_from_chars(&columns[0], None);
    let right_segs = assemble_segments_from_chars(&columns[1], None);
    assert_eq!(left_segs.len(), 5, "Left column should have 5 lines");
    assert_eq!(right_segs.len(), 5, "Right column should have 5 lines");
    for seg in &left_segs {
        assert!(
            seg.text.chars().all(|c| c == 'L'),
            "Left column should only have L chars"
        );
    }
    for seg in &right_segs {
        assert!(
            seg.text.chars().all(|c| c == 'R'),
            "Right column should only have R chars"
        );
    }
}
/// Same two-column layout but with left/right chars interleaved per row:
/// after partitioning, each column's segments must still come out in
/// top-to-bottom (descending y) order.
#[test]
fn test_two_column_interleaved_chars_reading_order() {
    let mut chars = Vec::new();
    for row in 0..5 {
        let y = 300.0 - row as f32 * 20.0;
        chars.extend((0..5).map(|c| make_char('L', 50.0 + c as f32 * 8.0, y, 12.0)));
        chars.extend((0..5).map(|c| make_char('R', 350.0 + c as f32 * 8.0, y, 12.0)));
    }
    let splits = detect_char_column_splits(&chars);
    assert!(!splits.is_empty(), "Should detect column split");
    let columns = partition_chars_by_columns(chars, &splits);
    assert_eq!(columns.len(), 2);
    let left_segs = assemble_segments_from_chars(&columns[0], None);
    let right_segs = assemble_segments_from_chars(&columns[1], None);
    assert_eq!(left_segs.len(), 5, "Left column should have 5 lines");
    assert_eq!(right_segs.len(), 5, "Right column should have 5 lines");
    for (i, pair) in left_segs.windows(2).enumerate() {
        assert!(
            pair[0].y >= pair[1].y,
            "Left column segments should be in top-to-bottom order: y[{}]={} < y[{}]={}",
            i,
            pair[0].y,
            i + 1,
            pair[1].y
        );
    }
    for (i, pair) in right_segs.windows(2).enumerate() {
        assert!(
            pair[0].y >= pair[1].y,
            "Right column segments should be in top-to-bottom order: y[{}]={} < y[{}]={}",
            i,
            pair[0].y,
            i + 1,
            pair[1].y
        );
    }
}
/// Chars supplied bottom-to-top (ascending y) must be reordered
/// top-to-bottom: A (y=100) first, C (y=60) last.
#[test]
fn test_sort_chars_reading_order_reversed_y() {
    let mut chars = vec![
        make_char('C', 10.0, 60.0, 12.0),
        make_char('B', 10.0, 80.0, 12.0),
        make_char('A', 10.0, 100.0, 12.0),
    ];
    sort_chars_reading_order(&mut chars);
    assert_eq!(chars[0].ch, 'A');
    assert_eq!(chars[1].ch, 'B');
    assert_eq!(chars[2].ch, 'C');
}
/// Chars on the same baseline supplied out of order must be sorted
/// left-to-right by x.
#[test]
fn test_sort_chars_reading_order_same_line_left_to_right() {
    let mut line = vec![
        make_char('C', 30.0, 100.0, 12.0),
        make_char('A', 10.0, 100.0, 12.0),
        make_char('B', 20.0, 100.0, 12.0),
    ];
    sort_chars_reading_order(&mut line);
    let order: Vec<char> = line.iter().map(|c| c.ch).collect();
    assert_eq!(order, ['A', 'B', 'C']);
}
/// Full-page reading order: left column uses uppercase glyphs (A–E per
/// row), right column lowercase (a–e). Flattening the partitioned
/// columns must put every uppercase segment before any lowercase one.
#[test]
fn test_two_column_full_page_segment_order() {
    let mut chars = Vec::new();
    for row in 0..5u8 {
        let y = 300.0 - row as f32 * 20.0;
        let glyph = char::from(b'A' + row);
        chars.extend((0..5).map(|c| make_char(glyph, 50.0 + c as f32 * 8.0, y, 12.0)));
    }
    for row in 0..5u8 {
        let y = 300.0 - row as f32 * 20.0;
        let glyph = char::from(b'a' + row);
        chars.extend((0..5).map(|c| make_char(glyph, 350.0 + c as f32 * 8.0, y, 12.0)));
    }
    let splits = detect_char_column_splits(&chars);
    assert!(!splits.is_empty(), "Should detect column split");
    let columns = partition_chars_by_columns(chars, &splits);
    let all_segments: Vec<SegmentData> = columns
        .iter()
        .flat_map(|col| assemble_segments_from_chars(col, None))
        .collect();
    assert!(all_segments.len() >= 2, "Should have at least 2 segments");
    // First lowercase segment marks the left/right boundary.
    let boundary = all_segments
        .iter()
        .position(|s| s.text.chars().any(|c| c.is_ascii_lowercase()))
        .unwrap_or(all_segments.len());
    let (left, right) = all_segments.split_at(boundary);
    for seg in left {
        assert!(
            seg.text.chars().all(|c| c.is_ascii_uppercase()),
            "Left column segments should be uppercase, got: {}",
            seg.text
        );
    }
    for seg in right {
        assert!(
            seg.text.chars().all(|c| c.is_ascii_lowercase()),
            "Right column segments should be lowercase, got: {}",
            seg.text
        );
    }
}
/// Test fixture: like `make_char`, but with an explicitly supplied right
/// edge instead of the 0.6-em approximation. All style flags
/// (bold/italic/monospace) and error/hyphen flags are false.
fn make_char_exact(ch: char, x: f32, y: f32, font_size: f32, right_x: f32) -> CharInfo {
    CharInfo {
        ch,
        x,
        y,
        font_size,
        right_x,
        is_bold: false,
        is_italic: false,
        is_monospace: false,
        has_map_error: false,
        is_symbolic: false,
        is_hyphen: false,
    }
}
/// Test fixture: lay `word` out horizontally from `x_start`, one glyph
/// every 0.6 em, each with an exact right edge one advance further on.
fn make_word_chars(word: &str, x_start: f32, y: f32, font_size: f32) -> Vec<CharInfo> {
    let advance = font_size * 0.6;
    let mut laid_out = Vec::with_capacity(word.len());
    for (i, ch) in word.chars().enumerate() {
        let left = x_start + i as f32 * advance;
        laid_out.push(make_char_exact(ch, left, y, font_size, left + advance));
    }
    laid_out
}
/// "this is soft-" ends line 1 at the right margin with a pdfium break
/// hyphen; "ware is great" starts line 2. The assembler should merge the
/// hyphenated halves into "software". Coordinates are exact: the +1/+2/+3
/// offsets simulate small kerning gaps around spaces.
#[test]
fn test_word_break_merge_software() {
    let fs = 12.0;
    // cw = one glyph advance (0.6 em); right_margin_x is the line's right edge.
    let cw = fs * 0.6; let right_margin_x = 300.0;
    let mut chars = Vec::new();
    // Line 1 (y=100): "this is soft-" with "soft" flush against the margin.
    chars.extend(make_word_chars("this", 10.0, 100.0, fs));
    chars.push(make_char_exact(
        ' ',
        10.0 + 4.0 * cw + 1.0,
        100.0,
        fs,
        10.0 + 4.0 * cw + 1.0 + cw,
    ));
    chars.extend(make_word_chars("is", 10.0 + 5.0 * cw + 2.0, 100.0, fs));
    chars.push(make_char_exact(
        ' ',
        10.0 + 7.0 * cw + 3.0,
        100.0,
        fs,
        10.0 + 7.0 * cw + 3.0 + cw,
    ));
    // "soft" plus a break hyphen ends exactly at right_margin_x.
    let soft_start = right_margin_x - 5.0 * cw; chars.extend(make_word_chars("soft", soft_start, 100.0, fs));
    chars.push(make_hyphen_char(
        '-',
        soft_start + 4.0 * cw,
        100.0,
        fs,
        soft_start + 5.0 * cw,
    ));
    // Line 2 (y=80): "ware is great".
    chars.extend(make_word_chars("ware", 10.0, 80.0, fs));
    chars.push(make_char_exact(
        ' ',
        10.0 + 4.0 * cw + 1.0,
        80.0,
        fs,
        10.0 + 4.0 * cw + 1.0 + cw,
    ));
    chars.extend(make_word_chars("is", 10.0 + 5.0 * cw + 2.0, 80.0, fs));
    chars.push(make_char_exact(
        ' ',
        10.0 + 7.0 * cw + 3.0,
        80.0,
        fs,
        10.0 + 7.0 * cw + 3.0 + cw,
    ));
    let great_start = right_margin_x - 5.0 * cw;
    chars.extend(make_word_chars("great", great_start, 80.0, fs));
    let segments = assemble_segments_from_chars(&chars, None);
    let all_text: String = segments.iter().map(|s| s.text.as_str()).collect::<Vec<_>>().join(" ");
    assert!(
        all_text.contains("software"),
        "Expected 'software' in merged text, got: {all_text}",
    );
}
/// A short line ("table") that ends well before the right margin and has
/// no break hyphen must NOT be merged into the following full line.
#[test]
fn test_no_false_merge_short_line() {
    let fs = 12.0;
    let cw = fs * 0.6;
    let mut chars = Vec::new();
    // Line 1 (y=100): just "table" — far from any margin.
    chars.extend(make_word_chars("table", 10.0, 100.0, fs));
    // Line 2 (y=80): a normal full sentence laid out word by word.
    let words = ["structure", "is", "important", "here", "today"];
    let mut x = 10.0;
    for (i, word) in words.iter().enumerate() {
        if i > 0 {
            chars.push(make_char_exact(' ', x, 80.0, fs, x + cw));
            x += cw;
        }
        chars.extend(make_word_chars(word, x, 80.0, fs));
        x += word.len() as f32 * cw;
    }
    let segments = assemble_segments_from_chars(&chars, None);
    assert!(
        segments.len() >= 2,
        "Short line should NOT merge with next: got {} segments",
        segments.len()
    );
    assert_eq!(segments[0].text, "table");
}
/// A line ending in an all-uppercase word at the right margin (a title,
/// not a broken word) must NOT be merged with the next line.
#[test]
fn test_no_merge_uppercase_end() {
    let fs = 12.0;
    let cw = fs * 0.6;
    let right_margin_x = 200.0;
    let mut chars = Vec::new();
    // Line 1 (y=100): "TITLE" flush against the right margin.
    let title_start = right_margin_x - 5.0 * cw;
    chars.extend(make_word_chars("TITLE", title_start, 100.0, fs));
    // Line 2 (y=80): ordinary lowercase body text.
    let words = ["details", "follow", "here"];
    let mut x = 10.0;
    for (i, word) in words.iter().enumerate() {
        if i > 0 {
            chars.push(make_char_exact(' ', x, 80.0, fs, x + cw));
            x += cw;
        }
        chars.extend(make_word_chars(word, x, 80.0, fs));
        x += word.len() as f32 * cw;
    }
    let segments = assemble_segments_from_chars(&chars, None);
    assert!(
        segments.len() >= 2,
        "Uppercase-ending line should NOT merge: got {} segments",
        segments.len()
    );
}
/// A line ending in sentence punctuation ("sentence.") at the right
/// margin must NOT be merged with the following line.
#[test]
fn test_no_merge_punctuation_end() {
    let fs = 12.0;
    let cw = fs * 0.6;
    let right_margin_x = 200.0;
    let mut chars = Vec::new();
    // Line 1 (y=100): "sentence." ending exactly at the margin.
    let word_start = right_margin_x - 9.0 * cw;
    chars.extend(make_word_chars("sentence.", word_start, 100.0, fs));
    // Line 2 (y=80): "next line", with "line" flush right.
    chars.extend(make_word_chars("next", 10.0, 80.0, fs));
    chars.push(make_char_exact(
        ' ',
        10.0 + 4.0 * cw + 1.0,
        80.0,
        fs,
        10.0 + 5.0 * cw + 1.0,
    ));
    let line2_start = right_margin_x - 4.0 * cw;
    chars.extend(make_word_chars("line", line2_start, 80.0, fs));
    let segments = assemble_segments_from_chars(&chars, None);
    assert!(
        segments.len() >= 2,
        "Punctuation-ending line should NOT merge: got {} segments",
        segments.len()
    );
}
/// A break-hyphen merge followed by further short continuation lines:
/// "the recog-" / "nition is" / "great" — the hyphenated halves must be
/// rejoined into "recognition" across the chain.
#[test]
fn test_word_break_merge_chain() {
    let fs = 12.0;
    let cw = fs * 0.6;
    let right_margin_x = 200.0;
    let mut chars = Vec::new();
    // Line 1 (y=100): "the recog-" with the hyphen at the margin.
    chars.extend(make_word_chars("the", 10.0, 100.0, fs));
    chars.push(make_char_exact(
        ' ',
        10.0 + 3.0 * cw + 1.0,
        100.0,
        fs,
        10.0 + 4.0 * cw + 1.0,
    ));
    let recog_start = right_margin_x - 6.0 * cw; chars.extend(make_word_chars("recog", recog_start, 100.0, fs));
    chars.push(make_hyphen_char(
        '-',
        recog_start + 5.0 * cw,
        100.0,
        fs,
        recog_start + 6.0 * cw,
    ));
    // Line 2 (y=80): "nition is", with "is" flush right.
    let ni_start = 10.0;
    chars.extend(make_word_chars("nition", ni_start, 80.0, fs));
    chars.push(make_char_exact(
        ' ',
        ni_start + 6.0 * cw + 1.0,
        80.0,
        fs,
        ni_start + 7.0 * cw + 1.0,
    ));
    let is_start = right_margin_x - 2.0 * cw;
    chars.extend(make_word_chars("is", is_start, 80.0, fs));
    // Line 3 (y=60): "great".
    chars.extend(make_word_chars("great", 10.0, 60.0, fs));
    let segments = assemble_segments_from_chars(&chars, None);
    let all_text: String = segments.iter().map(|s| s.text.as_str()).collect::<Vec<_>>().join(" ");
    assert!(
        all_text.contains("recognition"),
        "Expected 'recognition' in output, got: {all_text}",
    );
}
/// Test fixture: like `make_char_exact` but with `is_hyphen: true`,
/// marking the glyph as a break hyphen — the flag consumed by
/// `line_ends_with_break_hyphen` in the tests above/below.
fn make_hyphen_char(ch: char, x: f32, y: f32, font_size: f32, right_x: f32) -> CharInfo {
    CharInfo {
        ch,
        x,
        y,
        font_size,
        right_x,
        is_bold: false,
        is_italic: false,
        is_monospace: false,
        has_map_error: false,
        is_symbolic: false,
        is_hyphen: true,
    }
}
/// A word followed by a flagged hyphen ends in a break hyphen.
#[test]
fn test_line_ends_with_break_hyphen_basic() {
    let fs = 12.0;
    let cw = fs * 0.6;
    let mut line = make_word_chars("soft", 10.0, 100.0, fs);
    line.push(make_hyphen_char('-', 10.0 + 4.0 * cw, 100.0, fs, 10.0 + 5.0 * cw));
    assert!(line_ends_with_break_hyphen(&line));
}
/// A trailing '-' glyph without the `is_hyphen` flag is an ordinary dash,
/// not a break hyphen.
#[test]
fn test_line_ends_with_break_hyphen_not_flagged() {
    let fs = 12.0;
    let cw = fs * 0.6;
    let mut line = make_word_chars("word", 10.0, 100.0, fs);
    line.push(make_char_exact('-', 10.0 + 4.0 * cw, 100.0, fs, 10.0 + 5.0 * cw));
    assert!(!line_ends_with_break_hyphen(&line));
}
/// A lone dash with no preceding word does not count as a break hyphen.
#[test]
fn test_line_ends_with_break_hyphen_standalone_dash() {
    let fs = 12.0;
    let lone_dash = vec![make_hyphen_char('-', 10.0, 100.0, fs, 10.0 + fs * 0.6)];
    assert!(!line_ends_with_break_hyphen(&lone_dash));
}
/// A trailing em dash (U+2014) is not a word-break hyphen even when the
/// `is_hyphen` flag is set.
#[test]
fn test_line_ends_with_break_hyphen_em_dash_ignored() {
    let fs = 12.0;
    let cw = fs * 0.6;
    let mut line = make_word_chars("word", 10.0, 100.0, fs);
    line.push(make_hyphen_char(
        '\u{2014}',
        10.0 + 4.0 * cw,
        100.0,
        fs,
        10.0 + 5.0 * cw,
    ));
    assert!(!line_ends_with_break_hyphen(&line));
}
/// A trailing ASCII hyphen is removed in place.
#[test]
fn test_strip_trailing_hyphen() {
    let mut word = String::from("soft-");
    strip_trailing_hyphen(&mut word);
    assert_eq!(word, "soft");
}
/// Text with no trailing hyphen is left untouched.
#[test]
fn test_strip_trailing_hyphen_no_hyphen() {
    let mut word = String::from("word");
    strip_trailing_hyphen(&mut word);
    assert_eq!(word, "word");
}
/// A trailing U+00AD SOFT HYPHEN is also stripped.
#[test]
fn test_strip_trailing_soft_hyphen() {
    let mut word = String::from("soft\u{00AD}");
    strip_trailing_hyphen(&mut word);
    assert_eq!(word, "soft");
}
/// "soft-" (pdfium break hyphen) followed by "ware is great" on the next
/// line must be dehyphenated into "software", with the hyphen dropped
/// from the output.
#[test]
fn test_assemble_segments_dehyphenates_pdfium_hyphen() {
    let fs = 12.0;
    let cw = fs * 0.6;
    let mut chars = Vec::new();
    // Line 1 (y=100): "soft" + flagged break hyphen.
    chars.extend(make_word_chars("soft", 10.0, 100.0, fs));
    chars.push(make_hyphen_char('-', 10.0 + 4.0 * cw, 100.0, fs, 10.0 + 5.0 * cw));
    // Line 2 (y=80): "ware is great".
    chars.extend(make_word_chars("ware", 10.0, 80.0, fs));
    chars.push(make_char_exact(
        ' ',
        10.0 + 4.0 * cw + 1.0,
        80.0,
        fs,
        10.0 + 5.0 * cw + 1.0,
    ));
    chars.extend(make_word_chars("is", 10.0 + 5.0 * cw + 2.0, 80.0, fs));
    chars.push(make_char_exact(
        ' ',
        10.0 + 7.0 * cw + 3.0,
        80.0,
        fs,
        10.0 + 8.0 * cw + 3.0,
    ));
    chars.extend(make_word_chars("great", 10.0 + 8.0 * cw + 4.0, 80.0, fs));
    let segments = assemble_segments_from_chars(&chars, None);
    let all_text: String = segments.iter().map(|s| s.text.as_str()).collect::<Vec<_>>().join(" ");
    assert!(
        all_text.contains("software"),
        "Expected 'software' (dehyphenated), got: {all_text}",
    );
    assert!(
        !all_text.contains("soft-"),
        "Trailing hyphen should have been removed, got: {all_text}",
    );
}
/// A trailing '-' WITHOUT the break-hyphen flag (e.g. the compound
/// "well-known" split across lines) must keep its hyphen and must not be
/// merged into "wellknown".
#[test]
fn test_assemble_segments_preserves_non_break_hyphen() {
    let fs = 12.0;
    let cw = fs * 0.6;
    let mut chars = Vec::new();
    // Line 1 (y=100): "well-" with an ordinary (unflagged) hyphen.
    chars.extend(make_word_chars("well", 10.0, 100.0, fs));
    chars.push(make_char_exact('-', 10.0 + 4.0 * cw, 100.0, fs, 10.0 + 5.0 * cw));
    // Line 2 (y=80): "known".
    chars.extend(make_word_chars("known", 10.0, 80.0, fs));
    let segments = assemble_segments_from_chars(&chars, None);
    let all_text: String = segments.iter().map(|s| s.text.as_str()).collect::<Vec<_>>().join(" ");
    assert!(
        all_text.contains("well-"),
        "Non-break hyphen should be preserved, got: {all_text}",
    );
    assert!(
        !all_text.contains("wellknown"),
        "Should NOT merge across non-break hyphen, got: {all_text}",
    );
}
/// Two consecutive break hyphens ("config-" / "ura-" / "tion") must be
/// chained back together into "configuration".
#[test]
fn test_assemble_segments_multi_hyphen_chain() {
    let fs = 12.0;
    let cw = fs * 0.6;
    let mut chars = Vec::new();
    // Three lines at y = 120, 100, 80, each ending in a flagged hyphen
    // except the last.
    chars.extend(make_word_chars("config", 10.0, 120.0, fs));
    chars.push(make_hyphen_char('-', 10.0 + 6.0 * cw, 120.0, fs, 10.0 + 7.0 * cw));
    chars.extend(make_word_chars("ura", 10.0, 100.0, fs));
    chars.push(make_hyphen_char('-', 10.0 + 3.0 * cw, 100.0, fs, 10.0 + 4.0 * cw));
    chars.extend(make_word_chars("tion", 10.0, 80.0, fs));
    let segments = assemble_segments_from_chars(&chars, None);
    let all_text: String = segments.iter().map(|s| s.text.as_str()).collect::<Vec<_>>().join(" ");
    assert!(
        all_text.contains("configuration"),
        "Expected 'configuration' from multi-line dehyphenation, got: {all_text}",
    );
}
/// Test fixture: a `SegmentData` at the origin with fixed 100x12 bounds,
/// 12pt font, and all style flags cleared — only `text` varies.
fn make_seg(text: &str) -> SegmentData {
    SegmentData {
        text: text.to_string(),
        x: 0.0,
        y: 0.0,
        width: 100.0,
        height: 12.0,
        font_size: 12.0,
        is_bold: false,
        is_italic: false,
        is_monospace: false,
        baseline_y: 0.0,
    }
}
/// "docu ment" in a segment is rejoined when the page's full text shows
/// "document" as one word.
#[test]
fn test_repair_word_break_basic() {
    let mut segments = vec![make_seg("given docu ment here")];
    repair_word_breaks_from_full_text(&mut segments, "given document here");
    assert_eq!(segments[0].text, "given document here");
}
/// The repair also works when an STX (\x02) control byte sits at the
/// break point in both the segment and the full text.
#[test]
fn test_repair_word_break_with_stx_marker() {
    let mut segments = vec![make_seg("given docu\x02 ment here")];
    repair_word_breaks_from_full_text(&mut segments, "given docu\x02ment here");
    assert_eq!(segments[0].text, "given document here");
}
/// When segment and full text already agree, nothing is changed.
#[test]
fn test_repair_preserves_real_word_boundaries() {
    let mut segments = vec![make_seg("hello world test")];
    repair_word_breaks_from_full_text(&mut segments, "hello world test");
    assert_eq!(segments[0].text, "hello world test");
}
/// Several broken words in a single segment are all repaired.
#[test]
fn test_repair_multiple_breaks_in_one_segment() {
    let mut segments = vec![make_seg("ad ditional con version")];
    repair_word_breaks_from_full_text(&mut segments, "additional conversion");
    assert_eq!(segments[0].text, "additional conversion");
}
/// When the full text contains the fragments both separated AND joined,
/// the separated form in the segment is kept unchanged.
#[test]
fn test_repair_no_change_when_fragment_is_real_word() {
    let mut segments = vec![make_seg("con version")];
    repair_word_breaks_from_full_text(&mut segments, "con version conversion");
    assert_eq!(segments[0].text, "con version");
}
/// A fragment starting with an uppercase letter ("World") is not joined
/// onto the previous word, even if the joined form appears in full text.
#[test]
fn test_repair_uppercase_not_joined() {
    let mut segments = vec![make_seg("hello World")];
    repair_word_breaks_from_full_text(&mut segments, "helloWorld hello World");
    assert_eq!(segments[0].text, "hello World");
}
/// An empty full text gives the repair nothing to work with; segments
/// are left as-is.
#[test]
fn test_repair_empty_full_text() {
    let mut segments = vec![make_seg("docu ment")];
    repair_word_breaks_from_full_text(&mut segments, "");
    assert_eq!(segments[0].text, "docu ment");
}
/// Each segment is repaired independently against the same full text.
#[test]
fn test_repair_across_multiple_segments() {
    let mut segments = vec![make_seg("first docu ment"), make_seg("second cor recting")];
    repair_word_breaks_from_full_text(&mut segments, "first document second correcting");
    assert_eq!(segments[0].text, "first document");
    assert_eq!(segments[1].text, "second correcting");
}
}