use std::borrow::Cow;
use crate::pdf::error::Result;
use crate::pdf::hierarchy::{BoundingBox, SegmentData, TextBlock, assign_heading_levels_smart, cluster_font_sizes};
use pdfium_render::prelude::*;
#[cfg(not(target_arch = "wasm32"))]
use rayon::prelude::*;
use super::assembly::assemble_internal_document;
use super::bridge::{ImagePosition, extracted_blocks_to_paragraphs, filter_sidebar_blocks, objects_to_page_data};
use super::classify::{
classify_paragraphs, demote_heading_runs, demote_unnumbered_subsections, mark_arxiv_noise,
mark_cross_page_repeating_short_text, mark_cross_page_repeating_text, refine_heading_hierarchy,
};
use super::constants::{
FULL_LINE_FRACTION, MIN_FONT_SIZE, MIN_HEADING_FONT_GAP, MIN_HEADING_FONT_RATIO, PAGE_BOTTOM_MARGIN_FRACTION,
PAGE_TOP_MARGIN_FRACTION,
};
use super::lines::is_cjk_char;
use super::paragraphs::{merge_continuation_paragraphs, split_embedded_list_items};
use super::text_repair::{
apply_ligature_repairs, apply_to_all_segments, build_ligature_repair_map, clean_duplicate_punctuation,
expand_ligatures_with_space_absorption, normalize_text_encoding, normalize_unicode_text,
repair_broken_word_spacing, repair_contextual_ligatures, repair_ligature_spaces, text_has_broken_word_spacing,
text_has_ligature_corruption,
};
use super::types::{LayoutHint, PdfParagraph};
#[allow(clippy::type_complexity)]
fn extract_structure_tree_pages(
pages: &PdfPages,
page_count: PdfPageIndex,
) -> Result<(Vec<Option<Vec<PdfParagraph>>>, Vec<usize>, bool)> {
let mut struct_tree_results: Vec<Option<Vec<PdfParagraph>>> = Vec::with_capacity(page_count as usize);
let mut heuristic_pages: Vec<usize> = Vec::new();
let mut has_font_encoding_issues = false;
for i in 0..page_count {
let page = pages.get(i).map_err(|e| {
crate::pdf::error::PdfError::TextExtractionFailed(format!("Failed to get page {}: {:?}", i, e))
})?;
let page_t = crate::utils::timing::Instant::now();
match extract_page_content(&page) {
Ok(extraction) if extraction.method == ExtractionMethod::StructureTree && !extraction.blocks.is_empty() => {
tracing::trace!(
page = i,
method = ?extraction.method,
block_count = extraction.blocks.len(),
"PDF structure pipeline: page extracted via structure tree"
);
for (bi, block) in extraction.blocks.iter().take(10).enumerate() {
tracing::trace!(
page = i,
block_index = bi,
role = ?block.role,
text_preview = block.text.chars().take(60).collect::<String>(),
font_size = ?block.font_size,
is_bold = block.is_bold,
child_count = block.children.len(),
"PDF structure pipeline: structure tree block"
);
}
let page_width = page.width().value;
let filtered_blocks = filter_sidebar_blocks(&extraction.blocks, page_width);
let mut paragraphs = extracted_blocks_to_paragraphs(&filtered_blocks);
if let Some(repair_map) = build_ligature_repair_map(&page) {
has_font_encoding_issues = true;
apply_to_all_segments(&mut paragraphs, |t| apply_ligature_repairs(t, &repair_map));
}
{
let all_text = build_page_text(¶graphs);
if text_has_ligature_corruption(&all_text) {
apply_to_all_segments(&mut paragraphs, repair_contextual_ligatures);
}
if text_has_broken_word_spacing(&all_text) {
apply_to_all_segments(&mut paragraphs, repair_broken_word_spacing);
}
}
apply_to_all_segments(&mut paragraphs, fused_text_repairs);
let has_positions = paragraphs.iter().any(|p| {
p.lines
.iter()
.any(|l| l.segments.iter().any(|s| s.width > 0.0 || s.x > 0.0))
});
dehyphenate_paragraphs(&mut paragraphs, has_positions);
split_embedded_list_items(&mut paragraphs);
let heading_count = paragraphs.iter().filter(|p| p.heading_level.is_some()).count();
let bold_count = paragraphs.iter().filter(|p| p.is_bold).count();
let has_font_variation = has_font_size_variation(¶graphs);
tracing::trace!(
page = i,
paragraph_count = paragraphs.len(),
heading_count,
bold_count,
has_font_variation,
"PDF structure pipeline: structure tree paragraphs after conversion"
);
if paragraphs.is_empty() {
struct_tree_results.push(None);
heuristic_pages.push(i as usize);
} else if heading_count == 0 && has_font_variation {
tracing::debug!(
page = i,
"PDF structure pipeline: structure tree has font variation but no headings, will classify via font-size clustering"
);
struct_tree_results.push(Some(paragraphs));
heuristic_pages.push(i as usize);
} else {
struct_tree_results.push(Some(paragraphs));
}
}
Ok(_) => {
struct_tree_results.push(None);
heuristic_pages.push(i as usize);
}
Err(_) => {
struct_tree_results.push(None);
heuristic_pages.push(i as usize);
}
}
let page_ms = page_t.elapsed_ms();
if page_ms > 2000.0 {
tracing::warn!(page = i, elapsed_ms = page_ms, "slow structure tree extraction");
}
}
tracing::debug!(
heuristic_page_count = heuristic_pages.len(),
struct_tree_ok = struct_tree_results.iter().filter(|r| r.is_some()).count(),
"PDF structure pipeline: stage 0 complete"
);
Ok((struct_tree_results, heuristic_pages, has_font_encoding_issues))
}
/// Stage 1: pdfium-based heuristic extraction for pages without a usable
/// structure tree.
///
/// For each index in `heuristic_pages` this pulls raw text segments, image
/// positions, and paragraph-gap baselines from pdfium, drops tiny-font noise
/// (only when readable text would remain), and — when no layout hints are
/// present — strips header/footer margin bands and standalone page numbers.
///
/// Returns per-page segments, all image positions, per-page paragraph-gap
/// baselines, and per-page heights, each indexed by absolute page index.
#[allow(clippy::type_complexity, clippy::too_many_arguments)]
fn extract_heuristic_segments(
    pages: &PdfPages,
    page_count: PdfPageIndex,
    heuristic_pages: &[usize],
    top_margin: Option<f32>,
    bottom_margin: Option<f32>,
    has_layout_hints: bool,
    include_headers: bool,
    include_footers: bool,
) -> (Vec<Vec<SegmentData>>, Vec<ImagePosition>, Vec<Vec<f32>>, Vec<f32>) {
    let stage_start = crate::utils::timing::Instant::now();
    let mut segments_by_page: Vec<Vec<SegmentData>> = vec![Vec::new(); page_count as usize];
    let mut gap_ys_by_page: Vec<Vec<f32>> = vec![Vec::new(); page_count as usize];
    let mut heights: Vec<f32> = vec![0.0; page_count as usize];
    let mut images: Vec<ImagePosition> = Vec::new();
    let mut image_offset = 0usize;
    for &page_idx in heuristic_pages {
        let page = match pages.get(page_idx as PdfPageIndex) {
            Ok(p) => p,
            Err(e) => {
                tracing::warn!("Failed to get page {} for heuristic extraction: {:?}", page_idx, e);
                continue;
            }
        };
        heights[page_idx] = page.height().value;
        let timer = crate::utils::timing::Instant::now();
        let (mut segments, image_positions, paragraph_gap_ys) =
            objects_to_page_data(&page, page_idx + 1, &mut image_offset);
        let elapsed = timer.elapsed_ms();
        if elapsed > 1000.0 {
            tracing::warn!(
                "slow objects_to_page_data page {}: {:.0}ms, {} segments",
                page_idx + 1,
                elapsed,
                segments.len()
            );
        }
        // Drop sub-minimum font sizes, but only if some non-empty text at a
        // readable size would survive the cut.
        let has_readable_text = segments
            .iter()
            .any(|s| s.font_size >= MIN_FONT_SIZE && !s.text.trim().is_empty());
        if has_readable_text {
            segments.retain(|s| s.font_size >= MIN_FONT_SIZE);
        }
        if !has_layout_hints {
            // No layout hints: fall back to margin-fraction header/footer stripping.
            let page_height = page.height().value;
            let top_frac = top_margin.unwrap_or(PAGE_TOP_MARGIN_FRACTION).clamp(0.0, 0.5);
            let bottom_frac = bottom_margin.unwrap_or(PAGE_BOTTOM_MARGIN_FRACTION).clamp(0.0, 0.5);
            let top_cutoff = page_height * (1.0 - top_frac);
            let bottom_cutoff = page_height * bottom_frac;
            let skip_top = include_headers;
            let skip_bottom = include_footers;
            if !skip_top || !skip_bottom {
                // Segments with baseline_y == 0.0 have no position data; always keep them.
                let within_margins = |s: &SegmentData| {
                    s.baseline_y == 0.0
                        || ((skip_top || s.baseline_y <= top_cutoff)
                            && (skip_bottom || s.baseline_y >= bottom_cutoff))
                };
                // Only strip the margin bands when doing so leaves some
                // non-empty text behind.
                let would_survive = segments
                    .iter()
                    .any(|s| !s.text.trim().is_empty() && within_margins(s));
                if would_survive {
                    segments.retain(|s| within_margins(s));
                }
            }
            filter_standalone_page_numbers(&mut segments);
        }
        segments_by_page[page_idx] = segments;
        gap_ys_by_page[page_idx] = paragraph_gap_ys;
        images.extend(image_positions);
    }
    tracing::debug!(
        stage1_ms = stage_start.elapsed_ms(),
        total_segments = segments_by_page.iter().map(|s| s.len()).sum::<usize>(),
        "PDF structure pipeline: stage 1 complete"
    );
    (segments_by_page, images, gap_ys_by_page, heights)
}
/// Stage 2: build the document-wide font-size → heading-level map.
///
/// Collects font sizes from every non-empty heuristic-page segment, and from
/// every structure-tree page that shows font-size variation but has no
/// headings yet (those pages are returned so stage 3 can re-classify them).
/// Sizes are clustered into `k_clusters` groups and assigned heading levels.
///
/// Returns the `(font_size, heading_level)` map and the set of
/// structure-tree page indices that still need classification.
#[allow(clippy::type_complexity)]
fn build_heading_map(
    all_page_segments: &[Vec<SegmentData>],
    struct_tree_results: &[Option<Vec<PdfParagraph>>],
    heuristic_pages: &[usize],
    k_clusters: usize,
) -> Result<(Vec<(f32, Option<u8>)>, ahash::AHashSet<usize>)> {
    // Structure-tree pages with varying font sizes but zero headings get
    // their headings assigned from the clustered map later.
    let mut needs_classify: ahash::AHashSet<usize> = ahash::AHashSet::new();
    for (page, result) in struct_tree_results.iter().enumerate() {
        if let Some(paragraphs) = result {
            let any_heading = paragraphs.iter().any(|p| p.heading_level.is_some());
            if !any_heading && has_font_size_variation(paragraphs) {
                needs_classify.insert(page);
            }
        }
    }
    // Only font sizes matter for clustering; text and bbox are placeholders.
    let placeholder = BoundingBox {
        left: 0.0,
        top: 0.0,
        right: 0.0,
        bottom: 0.0,
    };
    let mut font_blocks: Vec<TextBlock> = Vec::new();
    for &page in heuristic_pages {
        font_blocks.extend(
            all_page_segments[page]
                .iter()
                .filter(|seg| !seg.text.trim().is_empty())
                .map(|seg| TextBlock {
                    text: String::new(),
                    bbox: placeholder,
                    font_size: seg.font_size,
                }),
        );
    }
    for &page in &needs_classify {
        if let Some(paragraphs) = &struct_tree_results[page] {
            font_blocks.extend(paragraphs.iter().map(|para| TextBlock {
                text: String::new(),
                bbox: placeholder,
                font_size: para.dominant_font_size,
            }));
        }
    }
    let heading_map = if font_blocks.is_empty() {
        Vec::new()
    } else {
        let clusters = cluster_font_sizes(&font_blocks, k_clusters)?;
        assign_heading_levels_smart(&clusters, MIN_HEADING_FONT_RATIO, MIN_HEADING_FONT_GAP)
    };
    Ok((heading_map, needs_classify))
}
/// Per-page bundle of everything `process_single_page` needs, so pages can be
/// processed independently (and in parallel off-wasm).
struct PageInput {
    /// Absolute 0-based page index (used for tracing).
    page_index: usize,
    /// Paragraphs recovered from the PDF structure tree, if stage 0 succeeded
    /// for this page; `None` routes the page through the heuristic path.
    struct_paragraphs: Option<Vec<PdfParagraph>>,
    /// Raw pdfium text segments, used only when `struct_paragraphs` is `None`.
    heuristic_segments: Vec<SegmentData>,
    /// Layout-detection hints for this page, when available.
    page_hints: Option<Vec<LayoutHint>>,
    /// Bounding boxes of tables already extracted on this page; heuristic
    /// segments falling mostly inside them are suppressed.
    table_bboxes: Vec<crate::types::BoundingBox>,
    /// Region validation results from layout detection; carried along but not
    /// currently read by `process_single_page`.
    #[allow(dead_code)]
    hint_validations: Vec<super::regions::layout_validation::RegionValidation>,
    /// True when stage 0 produced paragraphs with font-size variation but no
    /// headings, so headings must be assigned from the clustered heading map.
    needs_classify: bool,
    /// Baseline y positions of paragraph gaps, consumed by heuristic
    /// paragraph construction.
    paragraph_gap_ys: Vec<f32>,
}
/// Stage 3 worker: turn one page's inputs into classified `PdfParagraph`s.
///
/// Structure-tree pages reuse their stage-0 paragraphs (re-classifying
/// headings via `heading_map` when flagged); heuristic pages build paragraphs
/// from raw segments after suppressing text inside already-extracted table
/// bounding boxes. Layout hints, when present, override the classification.
fn process_single_page(
    input: PageInput,
    heading_map: &[(f32, Option<u8>)],
    doc_body_font_size: Option<f32>,
) -> Vec<PdfParagraph> {
    let PageInput {
        page_index,
        struct_paragraphs,
        heuristic_segments,
        page_hints,
        table_bboxes,
        hint_validations: _,
        needs_classify,
        paragraph_gap_ys,
    } = input;
    match struct_paragraphs {
        Some(mut paragraphs) => {
            if needs_classify {
                tracing::debug!(
                    page = page_index,
                    "PDF structure pipeline: classifying struct tree page via font-size clustering"
                );
                classify_paragraphs(&mut paragraphs, heading_map);
            }
            merge_continuation_paragraphs(&mut paragraphs);
            if let Some(ref hints) = page_hints {
                super::layout_classify::apply_layout_overrides(&mut paragraphs, hints, 0.5, 0.2, doc_body_font_size);
                tracing::debug!(
                    page = page_index,
                    headings = paragraphs.iter().filter(|p| p.heading_level.is_some()).count(),
                    lists = paragraphs.iter().filter(|p| p.is_list_item).count(),
                    furniture = paragraphs.iter().filter(|p| p.is_page_furniture).count(),
                    "layout overrides applied"
                );
                // On the structure-tree path furniture pruning only runs when
                // hints were applied.
                retain_page_furniture_safely(&mut paragraphs);
            }
            paragraphs
        }
        None => {
            tracing::debug!(
                page = page_index,
                segments = heuristic_segments.len(),
                has_layout_hints = page_hints.is_some(),
                "process_single_page: heuristic path"
            );
            // Drop segments already captured by extracted tables.
            let segments = filter_segments_by_table_bboxes(heuristic_segments, &table_bboxes);
            let mut paragraphs = super::bridge::blocks_to_paragraphs(segments, heading_map, &paragraph_gap_ys);
            tracing::debug!(
                page = page_index,
                paragraphs = paragraphs.len(),
                "heuristic paragraphs classified"
            );
            if let Some(ref hints) = page_hints {
                super::layout_classify::apply_layout_overrides(&mut paragraphs, hints, 0.5, 0.2, doc_body_font_size);
                tracing::debug!(
                    page = page_index,
                    headings = paragraphs.iter().filter(|p| p.heading_level.is_some()).count(),
                    lists = paragraphs.iter().filter(|p| p.is_list_item).count(),
                    furniture = paragraphs.iter().filter(|p| p.is_page_furniture).count(),
                    "layout overrides applied"
                );
            }
            retain_page_furniture_safely(&mut paragraphs);
            paragraphs
        }
    }
}
/// Top-level PDF structure pipeline: extracts paragraphs, headings, tables
/// and images from `document` and assembles them into an `InternalDocument`.
///
/// Stages:
/// 0. per-page tagged-PDF structure-tree extraction;
/// 1. heuristic segment extraction (pdfium, or pdf_oxide when that feature
///    is enabled) for pages the structure tree could not handle;
/// 2. font-size clustering into a document-wide heading map;
/// 3. per-page paragraph building/classification (parallel off-wasm);
/// 4. table extraction (SLANet/TATR models or layout-hint heuristics),
///    deduplication, and document assembly.
///
/// Returns the assembled document plus a flag that is true when any page
/// required a ligature repair map (font-encoding issues).
///
/// # Errors
/// Fails when a page cannot be fetched from pdfium, when font clustering
/// fails, or when layout detection found tables but the configured table
/// model is unavailable.
#[allow(clippy::too_many_arguments)]
pub fn extract_document_structure(
    document: &PdfDocument,
    k_clusters: usize,
    tables: &[crate::types::Table],
    top_margin: Option<f32>,
    bottom_margin: Option<f32>,
    layout_hints: Option<&[Vec<LayoutHint>]>,
    #[cfg(feature = "layout-detection")] layout_images: Option<&[image::DynamicImage]>,
    #[cfg(not(feature = "layout-detection"))] _layout_images: Option<()>,
    #[cfg(feature = "layout-detection")] layout_results: Option<&[crate::pdf::layout_runner::PageLayoutResult]>,
    #[cfg(not(feature = "layout-detection"))] _layout_results: Option<()>,
    allow_single_column: bool,
    #[cfg(feature = "layout-detection")] table_model: crate::core::config::layout::TableModel,
    #[cfg(not(feature = "layout-detection"))] _table_model: Option<()>,
    strip_repeating_text: bool,
    include_headers: bool,
    include_footers: bool,
) -> Result<(crate::types::internal::InternalDocument, bool)> {
    let pages = document.pages();
    let page_count = pages.len();
    let total_table_hints = layout_hints
        .map(|h| {
            h.iter()
                .flat_map(|p| p.iter())
                .filter(|hint| matches!(hint.class, super::types::LayoutHintClass::Table))
                .count()
        })
        .unwrap_or(0);
    tracing::trace!(
        has_layout = layout_hints.is_some(),
        total_table_hints,
        "Starting structure with tables pipeline"
    );
    tracing::debug!(page_count, "PDF structure pipeline: starting render");
    let mut has_font_encoding_issues = false;
    // Stage 0: tagged-PDF structure-tree extraction per page.
    let (mut struct_tree_results, heuristic_pages, struct_tree_font_issues) =
        extract_structure_tree_pages(pages, page_count)?;
    has_font_encoding_issues |= struct_tree_font_issues;
    // Optional pdf_oxide text extraction; when it returns Some it replaces
    // pdfium segment extraction for ALL pages.
    #[cfg(feature = "pdf-oxide")]
    let oxide_segments: Option<Vec<Vec<SegmentData>>> = {
        let t = crate::utils::timing::Instant::now();
        let result = crate::pdf::oxide_text::extract_segments_with_oxide(page_count as usize);
        if let Some(ref segs) = result {
            let total: usize = segs.iter().map(|s| s.len()).sum();
            tracing::debug!(
                total_segments = total,
                elapsed_ms = t.elapsed_ms(),
                "pdf_oxide text extraction complete"
            );
        }
        result
    };
    #[cfg(not(feature = "pdf-oxide"))]
    let oxide_segments: Option<Vec<Vec<SegmentData>>> = None;
    // Stage 1: heuristic segments. With oxide segments present, pdfium is
    // still run but only its image positions and page heights are kept, and
    // paragraph-gap lists stay empty (oxide supplies no gap data).
    let (mut all_page_segments, all_image_positions, mut all_page_gap_ys, page_heights) =
        if let Some(oxide_segs) = oxide_segments {
            let mut all_segs = oxide_segs;
            all_segs.resize_with(page_count as usize, Vec::new);
            let has_hints = layout_hints.is_some();
            let (_, image_positions, _, page_heights) = extract_heuristic_segments(
                pages,
                page_count,
                &heuristic_pages,
                top_margin,
                bottom_margin,
                has_hints,
                include_headers,
                include_footers,
            );
            (
                all_segs,
                image_positions,
                vec![Vec::new(); page_count as usize],
                page_heights,
            )
        } else {
            let has_hints = layout_hints.is_some();
            extract_heuristic_segments(
                pages,
                page_count,
                &heuristic_pages,
                top_margin,
                bottom_margin,
                has_hints,
                include_headers,
                include_footers,
            )
        };
    // Heuristic pages bypassed stage 0's ligature check, so probe them here;
    // one hit is enough to set the flag.
    for &i in &heuristic_pages {
        let page = pages.get(i as PdfPageIndex).map_err(|e| {
            crate::pdf::error::PdfError::TextExtractionFailed(format!("Failed to get page {}: {:?}", i, e))
        })?;
        if build_ligature_repair_map(&page).is_some() {
            has_font_encoding_issues = true;
            break;
        }
    }
    // Stage 2: document-wide heading map from font-size clustering.
    let (heading_map, struct_tree_needs_classify) =
        build_heading_map(&all_page_segments, &struct_tree_results, &heuristic_pages, k_clusters)?;
    // The cluster mapped to no heading level is the body-text font size.
    let doc_body_font_size: Option<f32> = heading_map
        .iter()
        .find(|(_, level)| level.is_none())
        .map(|(size, _)| *size);
    let mut layout_tables: Vec<crate::types::Table> = Vec::new();
    // Table extraction runs only when layout hints flagged table regions.
    if let Some(hints_pages) = layout_hints {
        struct TablePageData {
            page_idx: usize,
            words: Vec<crate::pdf::table_reconstruct::HocrWord>,
            page_height: f32,
        }
        // Collect per-page word data for every page with at least one Table hint.
        let mut table_pages: Vec<TablePageData> = Vec::new();
        #[allow(clippy::needless_range_loop)]
        for page_idx in 0..page_count as usize {
            let Some(hints) = hints_pages.get(page_idx) else {
                continue;
            };
            if !hints.iter().any(|h| h.class == super::types::LayoutHintClass::Table) {
                continue;
            }
            let page = pages.get(page_idx as PdfPageIndex).map_err(|e| {
                crate::pdf::error::PdfError::TextExtractionFailed(format!(
                    "Failed to get page {} for table extraction: {:?}",
                    page_idx, e
                ))
            })?;
            let page_height = page.height().value;
            let (words, page_height) = match crate::pdf::table::extract_words_from_page(&page, 0.0) {
                Ok(w) => (w, page_height),
                Err(e) => {
                    tracing::debug!(page = page_idx, error = %e, "table extraction: word extraction failed");
                    continue;
                }
            };
            if words.is_empty() {
                tracing::trace!(page = page_idx, "Table extraction: no words found, skipping");
                continue;
            }
            tracing::trace!(
                page = page_idx,
                word_count = words.len(),
                page_height,
                "Table page prepared"
            );
            table_pages.push(TablePageData {
                page_idx,
                words,
                page_height,
            });
        }
        #[cfg(feature = "layout-detection")]
        {
            use crate::core::config::layout::TableModel;
            use std::cell::RefCell;
            let use_model_inference = table_model != TableModel::Disabled;
            // Models are cached per thread so rayon workers do not re-create
            // them for every page.
            thread_local! {
                static TL_TATR: RefCell<Option<crate::layout::models::tatr::TatrModel>> = const { RefCell::new(None) };
                static TL_SLANET: RefCell<Option<crate::layout::models::slanet::SlanetModel>> = const { RefCell::new(None) };
                static TL_SLANET_ALT: RefCell<Option<crate::layout::models::slanet::SlanetModel>> = const { RefCell::new(None) };
                static TL_CLASSIFIER: RefCell<Option<crate::layout::models::table_classifier::TableClassifier>> = const { RefCell::new(None) };
            }
            // Auto mode starts from the wired variant; the wireless variant and
            // a classifier are loaded alongside it below.
            let slanet_variant = match table_model {
                TableModel::SlanetWired => Some("slanet_wired"),
                TableModel::SlanetWireless => Some("slanet_wireless"),
                TableModel::SlanetPlus => Some("slanet_plus"),
                TableModel::SlanetAuto => Some("slanet_wired"), TableModel::Tatr | TableModel::Disabled => None,
            };
            let is_auto = table_model == TableModel::SlanetAuto;
            // Seed the current thread's model cache and record availability.
            let has_table_model = if !use_model_inference {
                false
            } else if let Some(variant) = slanet_variant {
                let seed = if layout_images.is_some() {
                    crate::layout::take_or_create_slanet(variant)
                } else {
                    None
                };
                let has = seed.is_some();
                if let Some(model) = seed {
                    TL_SLANET.with(|cell| {
                        *cell.borrow_mut() = Some(model);
                    });
                }
                if is_auto && has {
                    if let Some(alt) = crate::layout::take_or_create_slanet("slanet_wireless") {
                        TL_SLANET_ALT.with(|cell| {
                            *cell.borrow_mut() = Some(alt);
                        });
                    }
                    if let Some(cls) = crate::layout::take_or_create_table_classifier() {
                        TL_CLASSIFIER.with(|cell| {
                            *cell.borrow_mut() = Some(cls);
                        });
                    }
                }
                has
            } else {
                let seed = if layout_images.is_some() {
                    crate::layout::take_or_create_tatr()
                } else {
                    None
                };
                let has = seed.is_some();
                if let Some(model) = seed {
                    TL_TATR.with(|cell| {
                        *cell.borrow_mut() = Some(model);
                    });
                }
                has
            };
            tracing::debug!(
                has_table_model,
                table_model = %table_model,
                table_page_count = table_pages.len(),
                "Table extraction phase 2: model availability"
            );
            // Fail loudly rather than silently dropping detected tables.
            if use_model_inference && !has_table_model && !table_pages.is_empty() {
                let model_name = slanet_variant.unwrap_or("tatr");
                return Err(crate::pdf::error::PdfError::TextExtractionFailed(format!(
                    "Layout detection found table regions but {model_name} model is not available. \
                    Ensure the ONNX model is downloaded. Tables cannot be extracted without it."
                )));
            }
            if has_table_model {
                if let (Some(images), Some(results)) = (layout_images, layout_results) {
                    // Parallel (rayon) table recognition; each worker lazily
                    // seeds its own thread-local models.
                    #[cfg(not(target_arch = "wasm32"))]
                    let parallel_tables: Vec<Vec<crate::types::Table>> = table_pages
                        .par_iter()
                        .map(|tp| {
                            if let Some(variant) = slanet_variant {
                                TL_SLANET.with(|cell| {
                                    let mut slanet_ref = cell.borrow_mut();
                                    if slanet_ref.is_none() {
                                        *slanet_ref = crate::layout::take_or_create_slanet(variant);
                                    }
                                });
                                if is_auto {
                                    TL_SLANET_ALT.with(|cell| {
                                        let mut alt_ref = cell.borrow_mut();
                                        if alt_ref.is_none() {
                                            *alt_ref = crate::layout::take_or_create_slanet("slanet_wireless");
                                        }
                                    });
                                    TL_CLASSIFIER.with(|cell| {
                                        let mut cls_ref = cell.borrow_mut();
                                        if cls_ref.is_none() {
                                            *cls_ref = crate::layout::take_or_create_table_classifier();
                                        }
                                    });
                                }
                                TL_SLANET.with(|slanet_cell| {
                                    let mut slanet_ref = slanet_cell.borrow_mut();
                                    let Some(slanet) = slanet_ref.as_mut() else {
                                        tracing::warn!("SLANeXT model unavailable in worker thread");
                                        return Vec::new();
                                    };
                                    if let (Some(page_image), Some(page_result)) =
                                        (images.get(tp.page_idx), results.get(tp.page_idx))
                                    {
                                        let hints = &hints_pages[tp.page_idx];
                                        // Auto mode: temporarily move the classifier +
                                        // alt model out of TLS; both are put back below.
                                        // If only one of the pair exists, restore it and
                                        // run without auto classification.
                                        let mut classifier_pair = if is_auto {
                                            let alt = TL_SLANET_ALT.with(|c| c.borrow_mut().take());
                                            let cls = TL_CLASSIFIER.with(|c| c.borrow_mut().take());
                                            match (cls, alt) {
                                                (Some(c), Some(a)) => Some((c, a)),
                                                (c, a) => {
                                                    if let Some(cls) = c {
                                                        TL_CLASSIFIER.with(|cell| {
                                                            *cell.borrow_mut() = Some(cls);
                                                        });
                                                    }
                                                    if let Some(alt) = a {
                                                        TL_SLANET_ALT.with(|cell| {
                                                            *cell.borrow_mut() = Some(alt);
                                                        });
                                                    }
                                                    None
                                                }
                                            }
                                        } else {
                                            None
                                        };
                                        let classifier_arg =
                                            classifier_pair.as_mut().map(|(cls, alt)| {
                                                (cls as &mut crate::layout::models::table_classifier::TableClassifier,
                                                alt as &mut crate::layout::models::slanet::SlanetModel)
                                            });
                                        let slanet_tables = super::regions::recognize_tables_slanet(
                                            page_image,
                                            hints,
                                            &tp.words,
                                            page_result,
                                            tp.page_height,
                                            tp.page_idx,
                                            slanet,
                                            classifier_arg,
                                        );
                                        // Return the borrowed pair to TLS for the next page.
                                        if let Some((cls, alt)) = classifier_pair {
                                            TL_CLASSIFIER.with(|cell| {
                                                *cell.borrow_mut() = Some(cls);
                                            });
                                            TL_SLANET_ALT.with(|cell| {
                                                *cell.borrow_mut() = Some(alt);
                                            });
                                        }
                                        if !slanet_tables.is_empty() {
                                            return slanet_tables;
                                        }
                                    }
                                    // Model produced nothing: fall back to hint heuristics.
                                    let hints = &hints_pages[tp.page_idx];
                                    super::regions::extract_tables_from_layout_hints(
                                        &tp.words,
                                        hints,
                                        tp.page_idx,
                                        tp.page_height,
                                        0.5,
                                        allow_single_column,
                                    )
                                })
                            } else {
                                TL_TATR.with(|cell| {
                                    let mut tatr_ref = cell.borrow_mut();
                                    if tatr_ref.is_none() {
                                        *tatr_ref = crate::layout::take_or_create_tatr();
                                    }
                                    let Some(tatr) = tatr_ref.as_mut() else {
                                        tracing::warn!("TATR model unavailable in worker thread");
                                        return Vec::new();
                                    };
                                    if let (Some(page_image), Some(page_result)) =
                                        (images.get(tp.page_idx), results.get(tp.page_idx))
                                    {
                                        let hints = &hints_pages[tp.page_idx];
                                        let tatr_tables = super::regions::recognize_tables_for_native_page(
                                            page_image,
                                            hints,
                                            &tp.words,
                                            page_result,
                                            tp.page_height,
                                            tp.page_idx,
                                            tatr,
                                        );
                                        tracing::trace!(
                                            page = tp.page_idx,
                                            tatr_tables = tatr_tables.len(),
                                            "TATR table recognition result"
                                        );
                                        if !tatr_tables.is_empty() {
                                            return tatr_tables;
                                        }
                                    }
                                    let hints = &hints_pages[tp.page_idx];
                                    super::regions::extract_tables_from_layout_hints(
                                        &tp.words,
                                        hints,
                                        tp.page_idx,
                                        tp.page_height,
                                        0.5,
                                        allow_single_column,
                                    )
                                })
                            }
                        })
                        .collect();
                    // Sequential equivalent on wasm (no rayon).
                    #[cfg(target_arch = "wasm32")]
                    let parallel_tables: Vec<Vec<crate::types::Table>> = table_pages
                        .iter()
                        .map(|tp| {
                            if let Some(variant) = slanet_variant {
                                TL_SLANET.with(|cell| {
                                    let mut slanet_ref = cell.borrow_mut();
                                    if slanet_ref.is_none() {
                                        *slanet_ref = crate::layout::take_or_create_slanet(variant);
                                    }
                                });
                                if is_auto {
                                    TL_SLANET_ALT.with(|cell| {
                                        let mut alt_ref = cell.borrow_mut();
                                        if alt_ref.is_none() {
                                            *alt_ref = crate::layout::take_or_create_slanet("slanet_wireless");
                                        }
                                    });
                                    TL_CLASSIFIER.with(|cell| {
                                        let mut cls_ref = cell.borrow_mut();
                                        if cls_ref.is_none() {
                                            *cls_ref = crate::layout::take_or_create_table_classifier();
                                        }
                                    });
                                }
                                TL_SLANET.with(|slanet_cell| {
                                    let mut slanet_ref = slanet_cell.borrow_mut();
                                    let Some(slanet) = slanet_ref.as_mut() else {
                                        tracing::warn!("SLANeXT model unavailable in worker thread");
                                        return Vec::new();
                                    };
                                    if let (Some(page_image), Some(page_result)) =
                                        (images.get(tp.page_idx), results.get(tp.page_idx))
                                    {
                                        let hints = &hints_pages[tp.page_idx];
                                        let mut classifier_pair = if is_auto {
                                            let alt = TL_SLANET_ALT.with(|c| c.borrow_mut().take());
                                            let cls = TL_CLASSIFIER.with(|c| c.borrow_mut().take());
                                            match (cls, alt) {
                                                (Some(c), Some(a)) => Some((c, a)),
                                                (c, a) => {
                                                    if let Some(cls) = c {
                                                        TL_CLASSIFIER.with(|cell| {
                                                            *cell.borrow_mut() = Some(cls);
                                                        });
                                                    }
                                                    if let Some(alt) = a {
                                                        TL_SLANET_ALT.with(|cell| {
                                                            *cell.borrow_mut() = Some(alt);
                                                        });
                                                    }
                                                    None
                                                }
                                            }
                                        } else {
                                            None
                                        };
                                        let classifier_arg =
                                            classifier_pair.as_mut().map(|(cls, alt)| {
                                                (cls as &mut crate::layout::models::table_classifier::TableClassifier,
                                                alt as &mut crate::layout::models::slanet::SlanetModel)
                                            });
                                        let slanet_tables = super::regions::recognize_tables_slanet(
                                            page_image,
                                            hints,
                                            &tp.words,
                                            page_result,
                                            tp.page_height,
                                            tp.page_idx,
                                            slanet,
                                            classifier_arg,
                                        );
                                        if let Some((cls, alt)) = classifier_pair {
                                            TL_CLASSIFIER.with(|cell| {
                                                *cell.borrow_mut() = Some(cls);
                                            });
                                            TL_SLANET_ALT.with(|cell| {
                                                *cell.borrow_mut() = Some(alt);
                                            });
                                        }
                                        if !slanet_tables.is_empty() {
                                            return slanet_tables;
                                        }
                                    }
                                    let hints = &hints_pages[tp.page_idx];
                                    super::regions::extract_tables_from_layout_hints(
                                        &tp.words,
                                        hints,
                                        tp.page_idx,
                                        tp.page_height,
                                        0.5,
                                        allow_single_column,
                                    )
                                })
                            } else {
                                TL_TATR.with(|cell| {
                                    let mut tatr_ref = cell.borrow_mut();
                                    if tatr_ref.is_none() {
                                        *tatr_ref = crate::layout::take_or_create_tatr();
                                    }
                                    let Some(tatr) = tatr_ref.as_mut() else {
                                        tracing::warn!("TATR model unavailable in worker thread");
                                        return Vec::new();
                                    };
                                    if let (Some(page_image), Some(page_result)) =
                                        (images.get(tp.page_idx), results.get(tp.page_idx))
                                    {
                                        let hints = &hints_pages[tp.page_idx];
                                        let tatr_tables = super::regions::recognize_tables_for_native_page(
                                            page_image,
                                            hints,
                                            &tp.words,
                                            page_result,
                                            tp.page_height,
                                            tp.page_idx,
                                            tatr,
                                        );
                                        if !tatr_tables.is_empty() {
                                            return tatr_tables;
                                        }
                                    }
                                    let hints = &hints_pages[tp.page_idx];
                                    super::regions::extract_tables_from_layout_hints(
                                        &tp.words,
                                        hints,
                                        tp.page_idx,
                                        tp.page_height,
                                        0.5,
                                        allow_single_column,
                                    )
                                })
                            }
                        })
                        .collect();
                    for tables in parallel_tables {
                        layout_tables.extend(tables);
                    }
                    // Hand the cached models back to the shared pool for reuse.
                    if let Some(variant) = slanet_variant {
                        TL_SLANET.with(|cell| {
                            if let Some(model) = cell.borrow_mut().take() {
                                crate::layout::return_slanet(variant, model);
                            }
                        });
                        if is_auto {
                            TL_SLANET_ALT.with(|cell| {
                                if let Some(model) = cell.borrow_mut().take() {
                                    crate::layout::return_slanet("slanet_wireless", model);
                                }
                            });
                            TL_CLASSIFIER.with(|cell| {
                                if let Some(model) = cell.borrow_mut().take() {
                                    crate::layout::return_table_classifier(model);
                                }
                            });
                        }
                    } else {
                        TL_TATR.with(|cell| {
                            if let Some(model) = cell.borrow_mut().take() {
                                crate::layout::return_tatr(model);
                            }
                        });
                    }
                }
            } else {
                tracing::debug!(
                    table_page_count = table_pages.len(),
                    "Running heuristic table extraction (no TATR)"
                );
                for tp in &table_pages {
                    let hints = &hints_pages[tp.page_idx];
                    layout_tables.extend(super::regions::extract_tables_from_layout_hints(
                        &tp.words,
                        hints,
                        tp.page_idx,
                        tp.page_height,
                        0.5,
                        allow_single_column,
                    ));
                }
            }
        }
        #[cfg(not(feature = "layout-detection"))]
        {
            for tp in &table_pages {
                let hints = &hints_pages[tp.page_idx];
                layout_tables.extend(super::regions::extract_tables_from_layout_hints(
                    &tp.words,
                    hints,
                    tp.page_idx,
                    tp.page_height,
                    0.5,
                    allow_single_column,
                ));
            }
        }
    }
    tracing::debug!(tables_found = layout_tables.len(), "Table extraction complete");
    // Map 0-based page index -> table bounding boxes so stage 3 can suppress
    // paragraph text that falls inside extracted tables.
    let extracted_table_bboxes_by_page: ahash::AHashMap<usize, Vec<crate::types::BoundingBox>> = {
        let mut map: ahash::AHashMap<usize, Vec<crate::types::BoundingBox>> = ahash::AHashMap::new();
        for table in &layout_tables {
            if let Some(ref bb) = table.bounding_box {
                map.entry(table.page_number.saturating_sub(1)).or_default().push(*bb);
            }
        }
        for table in tables {
            if let Some(ref bb) = table.bounding_box {
                map.entry(table.page_number.saturating_sub(1)).or_default().push(*bb);
            }
        }
        tracing::debug!(
            layout_tables = layout_tables.len(),
            heuristic_tables = tables.len(),
            pages_with_bboxes = map.len(),
            total_bboxes = map.values().map(|v| v.len()).sum::<usize>(),
            "table bbox suppression map built"
        );
        map
    };
    // Per-page layout-region validation (layout-detection builds only).
    #[cfg(feature = "layout-detection")]
    let validations_by_page: ahash::AHashMap<usize, Vec<super::regions::layout_validation::RegionValidation>> = {
        let mut map = ahash::AHashMap::new();
        if let (Some(images), Some(results), Some(hints_pages)) = (layout_images, layout_results, layout_hints) {
            for page_idx in 0..page_count as usize {
                if let (Some(img), Some(res), Some(hints)) =
                    (images.get(page_idx), results.get(page_idx), hints_pages.get(page_idx))
                {
                    let validations = super::regions::layout_validation::validate_page_regions(img, hints, res);
                    if validations.contains(&super::regions::layout_validation::RegionValidation::Empty) {
                        tracing::debug!(
                            page = page_idx,
                            empty_count = validations
                                .iter()
                                .filter(|v| **v == super::regions::layout_validation::RegionValidation::Empty)
                                .count(),
                            "layout validation: found empty regions"
                        );
                    }
                    map.insert(page_idx, validations);
                }
            }
        }
        map
    };
    #[cfg(not(feature = "layout-detection"))]
    let validations_by_page: ahash::AHashMap<usize, Vec<super::regions::layout_validation::RegionValidation>> =
        ahash::AHashMap::new();
    // Stage 3: bundle per-page inputs (moving data out of the stage-0/1
    // buffers) so pages can be processed independently.
    let page_inputs: Vec<PageInput> = (0..page_count as usize)
        .map(|i| PageInput {
            page_index: i,
            struct_paragraphs: struct_tree_results[i].take(),
            heuristic_segments: std::mem::take(&mut all_page_segments[i]),
            page_hints: layout_hints.and_then(|h| h.get(i)).cloned(),
            table_bboxes: extracted_table_bboxes_by_page.get(&i).cloned().unwrap_or_default(),
            hint_validations: validations_by_page.get(&i).cloned().unwrap_or_default(),
            needs_classify: struct_tree_needs_classify.contains(&i),
            paragraph_gap_ys: std::mem::take(&mut all_page_gap_ys[i]),
        })
        .collect();
    #[cfg(not(target_arch = "wasm32"))]
    let mut all_page_paragraphs: Vec<Vec<PdfParagraph>> = page_inputs
        .into_par_iter()
        .map(|input| process_single_page(input, &heading_map, doc_body_font_size))
        .collect();
    #[cfg(target_arch = "wasm32")]
    let mut all_page_paragraphs: Vec<Vec<PdfParagraph>> = page_inputs
        .into_iter()
        .map(|input| process_single_page(input, &heading_map, doc_body_font_size))
        .collect();
    // Cross-page refinement passes over the classified paragraphs.
    refine_heading_hierarchy(&mut all_page_paragraphs);
    demote_unnumbered_subsections(&mut all_page_paragraphs);
    demote_heading_runs(&mut all_page_paragraphs);
    if strip_repeating_text {
        mark_cross_page_repeating_text(&mut all_page_paragraphs, &page_heights);
        mark_cross_page_repeating_short_text(&mut all_page_paragraphs);
    }
    mark_arxiv_noise(&mut all_page_paragraphs);
    for page in &mut all_page_paragraphs {
        retain_page_furniture_safely(page);
    }
    deduplicate_paragraphs(&mut all_page_paragraphs);
    let total_paragraphs: usize = all_page_paragraphs.iter().map(|p| p.len()).sum();
    tracing::debug!(
        heuristic_page_count = heuristic_pages.len(),
        total_paragraphs,
        heading_map_len = heading_map.len(),
        "PDF structure pipeline: stage 3 complete, assembling document"
    );
    // Stage 4: merge caller-supplied and layout-derived tables, then assemble.
    let mut combined_tables: Vec<crate::types::Table> = tables.iter().cloned().chain(layout_tables).collect();
    deduplicate_overlapping_tables(&mut combined_tables);
    let image_pos_pairs: Vec<(usize, usize)> = all_image_positions
        .iter()
        .map(|img| (img.page_number, img.image_index))
        .collect();
    tracing::debug!(
        combined_tables = combined_tables.len(),
        image_positions = image_pos_pairs.len(),
        total_paragraphs = all_page_paragraphs.iter().map(|p| p.len()).sum::<usize>(),
        "stage 4: assembling document"
    );
    let mut doc = assemble_internal_document(all_page_paragraphs, &combined_tables, &image_pos_pairs);
    populate_images_from_pdfium(document, &all_image_positions, &mut doc);
    let element_count = doc.elements.len();
    tracing::debug!(element_count, "PDF structure pipeline: assembly complete");
    // Final text normalization over assembled elements. t3 borrows from t2,
    // which borrows from t1: assign from the outermost stage that actually
    // allocated — a Borrowed result at a later stage means that stage changed
    // nothing.
    for elem in &mut doc.elements {
        if elem.text.is_empty() {
            continue;
        }
        let t1 = repair_contextual_ligatures(&elem.text);
        let t2 = expand_ligatures_with_space_absorption(&t1);
        let t3 = normalize_unicode_text(&t2);
        if let Cow::Owned(normalized) = t3 {
            elem.text = normalized;
        } else if let Cow::Owned(normalized) = t2 {
            elem.text = normalized;
        } else if let Cow::Owned(normalized) = t1 {
            elem.text = normalized;
        }
    }
    Ok((doc, has_font_encoding_issues))
}
/// Drops segments whose area overlaps an extracted table bounding box by 50%
/// or more, so table text is not duplicated as paragraph text.
///
/// Segments with zero/negative area or whitespace-only text are always kept.
fn filter_segments_by_table_bboxes(
    segments: Vec<SegmentData>,
    table_bboxes: &[crate::types::BoundingBox],
) -> Vec<SegmentData> {
    if table_bboxes.is_empty() {
        return segments;
    }
    let mut kept = segments;
    kept.retain(|seg| {
        let seg_area = seg.width * seg.height;
        // No measurable area (or no text) — nothing to suppress.
        if seg_area <= 0.0 || seg.text.trim().is_empty() {
            return true;
        }
        let mostly_inside_table = table_bboxes.iter().any(|bb| {
            // Intersection rectangle between the segment and the table bbox.
            let left = seg.x.max(bb.x0 as f32);
            let right = (seg.x + seg.width).min(bb.x1 as f32);
            let bottom = seg.y.max(bb.y0 as f32);
            let top = (seg.y + seg.height).min(bb.y1 as f32);
            if left >= right || bottom >= top {
                return false;
            }
            let overlap = (right - left) * (top - bottom);
            overlap / seg_area >= 0.5
        });
        !mostly_inside_table
    });
    kept
}
/// Concatenates every segment of every line into one space-separated string,
/// used by the page-wide text-corruption detectors.
fn build_page_text(paragraphs: &[PdfParagraph]) -> String {
    let mut combined = String::new();
    let segments = paragraphs
        .iter()
        .flat_map(|para| para.lines.iter())
        .flat_map(|line| line.segments.iter());
    for seg in segments {
        // Single space between segments; no leading separator.
        if !combined.is_empty() {
            combined.push(' ');
        }
        combined.push_str(&seg.text);
    }
    combined
}
/// Runs the fused text-repair chain (encoding normalization, ligature-space
/// repair, ligature expansion, Unicode normalization, duplicate-punctuation
/// cleanup) over one segment.
///
/// Returns `Cow::Borrowed(text)` when no stage changed anything, so callers
/// can skip re-allocating unchanged segments.
fn fused_text_repairs(text: &str) -> Cow<'_, str> {
    let encoded = normalize_text_encoding(text);
    let spaced = repair_ligature_spaces(&encoded);
    let expanded = expand_ligatures_with_space_absorption(&spaced);
    let normalized = normalize_unicode_text(&expanded);
    let cleaned = clean_duplicate_punctuation(&normalized);
    // Any Owned variant anywhere in the chain means at least one repair fired.
    let changed = matches!(&encoded, Cow::Owned(_))
        || matches!(&spaced, Cow::Owned(_))
        || matches!(&expanded, Cow::Owned(_))
        || matches!(&normalized, Cow::Owned(_))
        || matches!(&cleaned, Cow::Owned(_));
    if changed {
        Cow::Owned(cleaned.into_owned())
    } else {
        Cow::Borrowed(text)
    }
}
/// Removes same-page tables whose bounding boxes overlap the smaller table's
/// area by more than 50%, keeping whichever has more content (cells plus
/// markdown length).
fn deduplicate_overlapping_tables(tables: &mut Vec<crate::types::Table>) {
    if tables.len() < 2 {
        return;
    }
    let mut discard = ahash::AHashSet::new();
    for a_idx in 0..tables.len() {
        if discard.contains(&a_idx) {
            continue;
        }
        for b_idx in (a_idx + 1)..tables.len() {
            if discard.contains(&b_idx) || tables[a_idx].page_number != tables[b_idx].page_number {
                continue;
            }
            let (Some(a), Some(b)) = (&tables[a_idx].bounding_box, &tables[b_idx].bounding_box) else {
                continue;
            };
            // Overlap relative to the smaller of the two boxes.
            let inter_w = (a.x1.min(b.x1) - a.x0.max(b.x0)).max(0.0);
            let inter_h = (a.y1.min(b.y1) - a.y0.max(b.y0)).max(0.0);
            let intersection = inter_w * inter_h;
            let area_a = (a.x1 - a.x0) * (a.y1 - a.y0);
            let area_b = (b.x1 - b.x0) * (b.y1 - b.y0);
            let min_area = area_a.min(area_b);
            if min_area > 0.0 && intersection / min_area > 0.5 {
                let richness_a = tables[a_idx].cells.len() + tables[a_idx].markdown.len();
                let richness_b = tables[b_idx].cells.len() + tables[b_idx].markdown.len();
                // Ties keep the earlier table.
                discard.insert(if richness_a >= richness_b { b_idx } else { a_idx });
            }
        }
    }
    // retain() does not expose indices, so track position manually.
    let mut position = 0;
    tables.retain(|_| {
        let keep = !discard.contains(&position);
        position += 1;
        keep
    });
}
/// Drops paragraphs flagged as page furniture (headers/footers/etc.), with
/// safeguards that clear the flags entirely when the classification looks
/// implausible.
///
/// Safeguards:
/// - every paragraph flagged => clear all flags (a page is never all furniture);
/// - flagged paragraphs hold over 30% of the page's ASCII-alphanumeric
///   characters => clear all flags;
/// - a flagged paragraph with more than 80 alphanumeric characters survives
///   anyway, since that much text is unlikely to be furniture.
fn retain_page_furniture_safely(paragraphs: &mut Vec<PdfParagraph>) {
    let total = paragraphs.len();
    let furniture_count = paragraphs.iter().filter(|p| p.is_page_furniture).count();
    if furniture_count == 0 {
        return;
    }
    let clear_all = |ps: &mut Vec<PdfParagraph>| {
        for p in ps.iter_mut() {
            p.is_page_furniture = false;
        }
    };
    if furniture_count >= total {
        clear_all(paragraphs);
        return;
    }
    let total_alphanum: usize = paragraphs.iter().map(paragraph_alphanum_len).sum();
    if total_alphanum > 0 {
        let furniture_alphanum: usize = paragraphs
            .iter()
            .filter(|p| p.is_page_furniture)
            .map(paragraph_alphanum_len)
            .sum();
        // Integer form of `furniture / total > 0.30` without float rounding.
        if furniture_alphanum * 100 > total_alphanum * 30 {
            clear_all(paragraphs);
            return;
        }
    }
    const MIN_SUBSTANTIVE_CHARS: usize = 80;
    paragraphs
        .retain(|p| !p.is_page_furniture || paragraph_alphanum_len(p) > MIN_SUBSTANTIVE_CHARS);
}
/// Counts the ASCII-alphanumeric bytes across every segment of a paragraph.
/// Used as a cheap proxy for "amount of substantive text".
fn paragraph_alphanum_len(para: &PdfParagraph) -> usize {
    let mut count = 0usize;
    for line in &para.lines {
        for seg in &line.segments {
            count += seg.text.bytes().filter(u8::is_ascii_alphanumeric).count();
        }
    }
    count
}
fn filter_standalone_page_numbers(segments: &mut Vec<SegmentData>) {
if segments.is_empty() {
return;
}
let tolerance = 3.0_f32; let candidates: Vec<usize> = segments
.iter()
.enumerate()
.filter(|(_, s)| {
let trimmed = s.text.trim();
!trimmed.is_empty() && trimmed.len() <= 4 && trimmed.chars().all(|c| c.is_ascii_digit())
})
.filter(|(idx, s)| {
!segments
.iter()
.enumerate()
.any(|(j, other)| j != *idx && (other.baseline_y - s.baseline_y).abs() < tolerance)
})
.map(|(idx, _)| idx)
.collect();
for &idx in candidates.iter().rev() {
segments.remove(idx);
}
}
/// Joins hyphen-broken words across line boundaries in each paragraph.
///
/// Code blocks and single-line paragraphs are left untouched. When position
/// data is available the line-width heuristic is used; otherwise only
/// explicit trailing hyphens are considered.
fn dehyphenate_paragraphs(paragraphs: &mut [PdfParagraph], has_positions: bool) {
    paragraphs
        .iter_mut()
        .filter(|p| !p.is_code_block && p.lines.len() >= 2)
        .for_each(|para| {
            if has_positions {
                dehyphenate_paragraph_lines(para);
            } else {
                dehyphenate_hyphen_only(para);
            }
        });
}
/// Joins hyphen-broken words across line boundaries using segment positions.
///
/// A join is only attempted when the earlier line extends to at least
/// `FULL_LINE_FRACTION` of the paragraph's widest right edge: a short line
/// ending in `-` is likely an intentional hyphen, not a line wrap. Falls
/// back to `dehyphenate_hyphen_only` when no segment has usable positions.
fn dehyphenate_paragraph_lines(para: &mut PdfParagraph) {
    // Widest right edge across all lines defines "full line width".
    let max_right_edge = para
        .lines
        .iter()
        .filter_map(|line| line.segments.last().map(|seg| seg.x + seg.width))
        .fold(0.0_f32, f32::max);
    if max_right_edge <= 0.0 {
        // No position data at all: use the hyphen-only heuristic instead.
        dehyphenate_hyphen_only(para);
        return;
    }
    let threshold = max_right_edge * FULL_LINE_FRACTION;
    let line_count = para.lines.len();
    // Iterate bottom-up so an edit to line i+1 never affects lines still to visit.
    for i in (0..line_count - 1).rev() {
        let line_right = para.lines[i]
            .segments
            .last()
            .map(|seg| seg.x + seg.width)
            .unwrap_or(0.0);
        let is_full_line = line_right >= threshold;
        if !is_full_line {
            continue;
        }
        let trailing_seg_text: &str = match para.lines[i].segments.last() {
            Some(seg) if !seg.text.is_empty() => &seg.text,
            _ => continue,
        };
        let trailing_word = match trailing_seg_text.split_whitespace().next_back() {
            Some(w) => w,
            None => continue,
        };
        let leading_seg_text: &str = match para.lines[i + 1].segments.first() {
            Some(seg) if !seg.text.is_empty() => &seg.text,
            _ => continue,
        };
        let leading_word = match leading_seg_text.split_whitespace().next() {
            Some(w) => w,
            None => continue,
        };
        // CJK text wraps without hyphenation semantics; never join it.
        if trailing_word.chars().any(is_cjk_char) || leading_word.chars().any(is_cjk_char) {
            continue;
        }
        // Join only `stem-` + lowercase continuation; an uppercase start
        // usually signals a new sentence or proper noun, not a split word.
        if let Some(stem) = trailing_word.strip_suffix('-')
            && !stem.is_empty()
            && leading_word.starts_with(|c: char| c.is_lowercase())
        {
            let joined = format!("{}{}", stem, leading_word);
            // Own the words before mutating `para`, since they borrow from it.
            let tw = trailing_word.to_string();
            let lw = leading_word.to_string();
            apply_dehyphenation_join(para, i, &tw, &lw, &joined);
            continue;
        }
    }
}
/// Fallback dehyphenation for paragraphs without position data: joins a line
/// ending in `-` with the next line's first word, mirroring the checks in
/// `dehyphenate_paragraph_lines` minus the full-line-width heuristic.
///
/// Join conditions: the trailing word ends with a hyphen, the stem is
/// non-empty, neither word contains CJK characters, and the continuation
/// starts with a lowercase letter (an uppercase start usually means a new
/// sentence or a proper noun, not a split word).
fn dehyphenate_hyphen_only(para: &mut PdfParagraph) {
    // Guard against an empty/one-line paragraph so `0..len - 1` cannot
    // underflow when this function is called directly.
    if para.lines.len() < 2 {
        return;
    }
    let line_count = para.lines.len();
    // Iterate bottom-up so a join never disturbs lines yet to be visited.
    for i in (0..line_count - 1).rev() {
        let trailing_seg_text: &str = match para.lines[i].segments.last() {
            Some(seg) if !seg.text.is_empty() => &seg.text,
            _ => continue,
        };
        let trailing_word = match trailing_seg_text.split_whitespace().next_back() {
            Some(w) => w,
            None => continue,
        };
        // `strip_suffix` both tests for and removes the hyphen, replacing the
        // manual byte slice `[..len - 1]`.
        let Some(stem) = trailing_word.strip_suffix('-') else {
            continue;
        };
        let leading_seg_text: &str = match para.lines[i + 1].segments.first() {
            Some(seg) if !seg.text.is_empty() => &seg.text,
            _ => continue,
        };
        let leading_word = match leading_seg_text.split_whitespace().next() {
            Some(w) => w,
            None => continue,
        };
        // CJK text wraps without hyphenation semantics; never join it.
        if trailing_word.chars().any(is_cjk_char) || leading_word.chars().any(is_cjk_char) {
            continue;
        }
        if !stem.is_empty() && leading_word.starts_with(|c: char| c.is_lowercase()) {
            let joined = format!("{}{}", stem, leading_word);
            // Own the words before mutating `para`, since they borrow from it.
            let tw = trailing_word.to_string();
            let lw = leading_word.to_string();
            apply_dehyphenation_join(para, i, &tw, &lw, &joined);
        }
    }
}
/// Rewrites the two lines involved in a dehyphenation join.
///
/// In line `line_idx`'s last segment, the last occurrence of the hyphenated
/// `trailing_word` is replaced by `joined`; in line `line_idx + 1`'s first
/// segment, the first occurrence of `leading_word` — plus any whitespace
/// immediately after it — is deleted.
fn apply_dehyphenation_join(
    para: &mut PdfParagraph,
    line_idx: usize,
    trailing_word: &str,
    leading_word: &str,
    joined: &str,
) {
    // `rfind` targets the word at the end of the line even if the same text
    // appears earlier in the segment.
    if let Some(seg) = para.lines[line_idx].segments.last_mut()
        && let Some(pos) = seg.text.rfind(trailing_word)
    {
        seg.text.replace_range(pos..pos + trailing_word.len(), joined);
    }
    // Delete the continuation word and extend the deletion through the
    // whitespace that followed it, so the remaining text starts cleanly.
    if let Some(seg) = para.lines[line_idx + 1].segments.first_mut()
        && let Some(pos) = seg.text.find(leading_word)
    {
        let end = pos + leading_word.len();
        let trim_end = seg.text[end..]
            .find(|c: char| !c.is_whitespace())
            .map_or(seg.text.len(), |off| end + off);
        seg.text.replace_range(pos..trim_end, "");
    }
}
/// Reports whether the paragraphs use visibly different font sizes.
///
/// Non-positive sizes are ignored. Variation means some positive size
/// differs from the first positive size seen by more than 0.5pt.
fn has_font_size_variation(paragraphs: &[PdfParagraph]) -> bool {
    let mut sizes = paragraphs
        .iter()
        .map(|p| p.dominant_font_size)
        .filter(|&s| s > 0.0);
    match sizes.next() {
        Some(reference) => sizes.any(|s| (s - reference).abs() > 0.5),
        None => false,
    }
}
/// Removes duplicated paragraphs within each page.
///
/// Two passes per page:
/// 1. Collapse runs of adjacent paragraphs with identical normalized text
///    (at least 5 characters) down to one.
/// 2. Drop non-adjacent repeats of plain body paragraphs (normalized text of
///    at least 15 characters) already seen on the page; headings, list
///    items, code, formulas, furniture and captions are exempt
///    (see `is_dedup_candidate`).
fn deduplicate_paragraphs(all_pages: &mut [Vec<PdfParagraph>]) {
    for page in all_pages.iter_mut() {
        if page.len() < 2 {
            continue;
        }
        // Pass 1: adjacent duplicates. Cache the left-hand normalized text —
        // it only changes when `i` advances, not when a duplicate is removed,
        // so recomputing it every iteration is wasted work.
        let mut i = 0;
        let mut a_text = paragraph_text_normalized(&page[i]);
        while i + 1 < page.len() {
            let b_text = paragraph_text_normalized(&page[i + 1]);
            if a_text.len() >= 5 && a_text == b_text {
                page.remove(i + 1);
            } else {
                i += 1;
                a_text = b_text;
            }
        }
        // Pass 2: non-adjacent duplicates of substantial body paragraphs.
        let mut seen = ahash::AHashSet::new();
        let mut to_remove = ahash::AHashSet::new();
        for (idx, para) in page.iter().enumerate() {
            if !is_dedup_candidate(para) {
                continue;
            }
            let text = paragraph_text_normalized(para);
            if text.len() < 15 {
                continue;
            }
            if !seen.insert(text) {
                to_remove.insert(idx);
            }
        }
        if !to_remove.is_empty() {
            // Single O(n) retain pass instead of repeated O(n) `Vec::remove`.
            let mut idx = 0;
            page.retain(|_| {
                let keep = !to_remove.contains(&idx);
                idx += 1;
                keep
            });
        }
    }
}
/// Flattens a paragraph into a lowercase, whitespace-normalized string used
/// as a deduplication key: words joined by single spaces, each character
/// lowercased via its full Unicode lowercase mapping.
fn paragraph_text_normalized(p: &PdfParagraph) -> String {
    let mut key = String::new();
    let words = p
        .lines
        .iter()
        .flat_map(|line| line.segments.iter())
        .flat_map(|seg| seg.text.split_whitespace());
    for word in words {
        if !key.is_empty() {
            key.push(' ');
        }
        // `to_lowercase` may expand one char into several; extend handles it.
        key.extend(word.chars().flat_map(char::to_lowercase));
    }
    key
}
/// Only plain body paragraphs participate in non-adjacent deduplication:
/// headings, list items, code, formulas, page furniture and captions can
/// legitimately repeat, so they are excluded.
fn is_dedup_candidate(p: &PdfParagraph) -> bool {
    let special = p.heading_level.is_some()
        || p.is_list_item
        || p.is_code_block
        || p.is_formula
        || p.is_page_furniture
        || p.caption_for.is_some();
    !special
}
/// Decodes the pdfium image objects referenced by `image_positions` and
/// appends one `ExtractedImage` per position to `doc.images`, preserving
/// page-then-index order. Positions whose pixels cannot be decoded receive
/// an empty placeholder so indices stay aligned with `image_positions`.
fn populate_images_from_pdfium(
    document: &PdfDocument,
    image_positions: &[super::bridge::ImagePosition],
    doc: &mut crate::types::internal::InternalDocument,
) {
    use bytes::Bytes;
    use image::ImageEncoder;
    if image_positions.is_empty() {
        return;
    }
    // Group requested image indices by page; BTreeMap keeps page order stable.
    let mut by_page: std::collections::BTreeMap<usize, Vec<usize>> = std::collections::BTreeMap::new();
    for pos in image_positions {
        by_page.entry(pos.page_number).or_default().push(pos.image_index);
    }
    let pages = document.pages();
    let mut extracted_count = 0u32;
    for (&page_num, indices) in &by_page {
        // `page_number` appears to be 1-based while pdfium indices are 0-based.
        let page_idx = page_num.saturating_sub(1) as i32;
        let Ok(page) = pages.get(page_idx) else {
            // Page unavailable: emit placeholders so output stays aligned.
            for &idx in indices {
                doc.images.push(empty_image_placeholder(idx, page_num));
            }
            continue;
        };
        // Global indices on a page are assumed contiguous starting at the
        // smallest requested index — NOTE(review): confirm against the
        // producer of `ImagePosition`.
        let first_idx_on_page = indices.iter().copied().min().unwrap_or(0);
        let mut current_image = 0usize;
        let mut extracted_on_page: std::collections::BTreeMap<usize, crate::types::ExtractedImage> =
            std::collections::BTreeMap::new();
        for obj in page.objects().iter() {
            if let Some(image_obj) = obj.as_image_object() {
                let global_idx = first_idx_on_page + current_image;
                if indices.contains(&global_idx)
                    && let Ok(dynamic_image) = image_obj.get_processed_image(document)
                {
                    let w = dynamic_image.width();
                    let h = dynamic_image.height();
                    // Re-encode every image as an RGBA PNG regardless of its
                    // original colorspace/format.
                    let rgba = dynamic_image.to_rgba8();
                    let mut png_buf: Vec<u8> = Vec::new();
                    if image::codecs::png::PngEncoder::new(&mut png_buf)
                        .write_image(rgba.as_raw(), w, h, image::ExtendedColorType::Rgba8)
                        .is_ok()
                    {
                        extracted_count += 1;
                        extracted_on_page.insert(
                            global_idx,
                            crate::types::ExtractedImage {
                                data: Bytes::from(png_buf),
                                format: std::borrow::Cow::Borrowed("png"),
                                image_index: global_idx,
                                page_number: Some(page_num),
                                width: Some(w),
                                height: Some(h),
                                colorspace: Some("RGBA".to_string()),
                                bits_per_component: Some(8),
                                is_mask: false,
                                description: None,
                                ocr_result: None,
                                bounding_box: None,
                                source_path: None,
                            },
                        );
                    }
                }
                // Count every image object so `global_idx` tracks position
                // even for images we were not asked to extract.
                current_image += 1;
            }
        }
        // Emit in the requested order; failed decodes become placeholders.
        for &idx in indices {
            let img = extracted_on_page
                .remove(&idx)
                .unwrap_or_else(|| empty_image_placeholder(idx, page_num));
            doc.images.push(img);
        }
    }
    tracing::debug!(
        total_positions = image_positions.len(),
        extracted = extracted_count,
        "populated document images from pdfium"
    );
}
fn empty_image_placeholder(idx: usize, page_num: usize) -> crate::types::ExtractedImage {
crate::types::ExtractedImage {
data: bytes::Bytes::new(),
format: std::borrow::Cow::Borrowed("unknown"),
image_index: idx,
page_number: Some(page_num),
width: None,
height: None,
colorspace: None,
bits_per_component: None,
is_mask: false,
description: None,
ocr_result: None,
bounding_box: None,
source_path: None,
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::pdf::hierarchy::SegmentData;
    use crate::pdf::structure::types::{PdfLine, PdfParagraph};

    /// Builds a segment with the given text/x/width and default styling.
    fn seg(text: &str, x: f32, width: f32) -> SegmentData {
        SegmentData {
            text: text.to_string(),
            x,
            y: 0.0,
            width,
            height: 12.0,
            font_size: 12.0,
            is_bold: false,
            is_italic: false,
            is_monospace: false,
            baseline_y: 0.0,
        }
    }

    /// Wraps segments into a line with default metrics.
    fn line(segments: Vec<SegmentData>) -> PdfLine {
        PdfLine {
            segments,
            baseline_y: 0.0,
            dominant_font_size: 12.0,
            is_bold: false,
            is_monospace: false,
        }
    }

    /// Wraps lines into a plain body paragraph (no heading/list/code flags).
    fn para(lines: Vec<PdfLine>) -> PdfParagraph {
        PdfParagraph {
            text: String::new(),
            lines,
            dominant_font_size: 12.0,
            heading_level: None,
            is_bold: false,
            is_list_item: false,
            is_code_block: false,
            is_formula: false,
            is_page_furniture: false,
            layout_class: None,
            caption_for: None,
            block_bbox: None,
        }
    }

    /// Segment wide enough to count as a "full" line for dehyphenation.
    fn full_line_seg(text: &str) -> SegmentData {
        seg(text, 10.0, 490.0)
    }

    /// Segment too narrow to count as a full line.
    fn short_line_seg(text: &str) -> SegmentData {
        seg(text, 10.0, 100.0)
    }

    #[test]
    fn test_case1_trailing_hyphen_full_line() {
        let mut p = para(vec![
            line(vec![full_line_seg("some soft-")]),
            line(vec![seg("ware is great", 10.0, 200.0)]),
        ]);
        dehyphenate_paragraph_lines(&mut p);
        assert_eq!(p.lines[0].segments[0].text, "some software");
        assert_eq!(p.lines[1].segments[0].text, "is great");
    }

    #[test]
    fn test_case2_no_hyphen_full_line_no_join() {
        let mut p = para(vec![
            line(vec![full_line_seg("the soft")]),
            line(vec![seg("ware is great", 10.0, 200.0)]),
        ]);
        dehyphenate_paragraph_lines(&mut p);
        assert_eq!(p.lines[0].segments[0].text, "the soft");
        assert_eq!(p.lines[1].segments[0].text, "ware is great");
    }

    #[test]
    fn test_short_line_no_join() {
        let mut p = para(vec![
            line(vec![short_line_seg("hello")]),
            line(vec![full_line_seg("world and more")]),
        ]);
        let original_trailing = p.lines[0].segments[0].text.clone();
        let original_leading = p.lines[1].segments[0].text.clone();
        dehyphenate_paragraph_lines(&mut p);
        assert_eq!(p.lines[0].segments[0].text, original_trailing);
        assert_eq!(p.lines[1].segments[0].text, original_leading);
    }

    #[test]
    fn test_code_block_not_joined() {
        let mut p = para(vec![
            line(vec![full_line_seg("some soft-")]),
            line(vec![seg("ware is code", 10.0, 200.0)]),
        ]);
        p.is_code_block = true;
        let mut paragraphs = vec![p];
        dehyphenate_paragraphs(&mut paragraphs, true);
        assert_eq!(paragraphs[0].lines[0].segments[0].text, "some soft-");
    }

    #[test]
    fn test_uppercase_leading_not_joined() {
        let mut p = para(vec![
            line(vec![full_line_seg("some text")]),
            line(vec![seg("Next sentence here", 10.0, 200.0)]),
        ]);
        dehyphenate_paragraph_lines(&mut p);
        assert_eq!(p.lines[0].segments[0].text, "some text");
        assert_eq!(p.lines[1].segments[0].text, "Next sentence here");
    }

    #[test]
    fn test_cjk_not_joined() {
        let mut p = para(vec![
            line(vec![full_line_seg("some \u{4E00}-")]),
            line(vec![seg("text here", 10.0, 200.0)]),
        ]);
        dehyphenate_paragraph_lines(&mut p);
        assert_eq!(p.lines[0].segments[0].text, "some \u{4E00}-");
    }

    #[test]
    fn test_real_world_software_no_join_without_hyphen() {
        let mut p = para(vec![
            line(vec![full_line_seg("advanced soft")]),
            line(vec![seg("ware development", 10.0, 200.0)]),
        ]);
        dehyphenate_paragraph_lines(&mut p);
        assert_eq!(p.lines[0].segments[0].text, "advanced soft");
        assert_eq!(p.lines[1].segments[0].text, "ware development");
    }

    #[test]
    fn test_real_world_hardware_no_join_without_hyphen() {
        let mut p = para(vec![
            line(vec![full_line_seg("modern hard")]),
            line(vec![seg("ware components", 10.0, 200.0)]),
        ]);
        dehyphenate_paragraph_lines(&mut p);
        assert_eq!(p.lines[0].segments[0].text, "modern hard");
        assert_eq!(p.lines[1].segments[0].text, "ware components");
    }

    #[test]
    fn test_leading_word_with_trailing_punctuation_no_join() {
        let mut p = para(vec![
            line(vec![full_line_seg("the soft")]),
            line(vec![seg("ware, which is great", 10.0, 200.0)]),
        ]);
        dehyphenate_paragraph_lines(&mut p);
        assert_eq!(p.lines[0].segments[0].text, "the soft");
        assert_eq!(p.lines[1].segments[0].text, "ware, which is great");
    }

    #[test]
    fn test_hyphen_only_fallback() {
        let mut p = para(vec![
            line(vec![seg("some soft-", 0.0, 0.0)]),
            line(vec![seg("ware is great", 0.0, 0.0)]),
        ]);
        dehyphenate_hyphen_only(&mut p);
        assert_eq!(p.lines[0].segments[0].text, "some software");
        assert_eq!(p.lines[1].segments[0].text, "is great");
    }

    #[test]
    fn test_hyphen_only_uppercase_not_joined() {
        let mut p = para(vec![
            line(vec![seg("some well-", 0.0, 0.0)]),
            line(vec![seg("Known thing", 0.0, 0.0)]),
        ]);
        dehyphenate_hyphen_only(&mut p);
        assert_eq!(p.lines[0].segments[0].text, "some well-");
    }

    #[test]
    fn test_single_line_paragraph_skipped() {
        let mut paragraphs = vec![para(vec![line(vec![full_line_seg("single line")])])];
        dehyphenate_paragraphs(&mut paragraphs, true);
        assert_eq!(paragraphs[0].lines[0].segments[0].text, "single line");
    }

    #[test]
    fn test_multi_segment_line_no_join_without_hyphen() {
        let mut p = para(vec![
            line(vec![
                seg("first part", 10.0, 200.0),
                seg("soft", 220.0, 280.0),
            ]),
            line(vec![seg("ware next words", 10.0, 200.0)]),
        ]);
        dehyphenate_paragraph_lines(&mut p);
        assert_eq!(p.lines[0].segments[1].text, "soft");
        assert_eq!(p.lines[1].segments[0].text, "ware next words");
    }

    /// One-line paragraph with the given dominant font size.
    fn para_with_font_size(font_size: f32) -> PdfParagraph {
        PdfParagraph {
            text: String::new(),
            lines: vec![line(vec![seg("text", 0.0, 100.0)])],
            dominant_font_size: font_size,
            heading_level: None,
            is_bold: false,
            is_list_item: false,
            is_code_block: false,
            is_formula: false,
            is_page_furniture: false,
            layout_class: None,
            caption_for: None,
            block_bbox: None,
        }
    }

    #[test]
    fn test_has_font_size_variation_empty() {
        assert!(!has_font_size_variation(&[]));
    }

    // NOTE: the calls below previously read `¶graphs` — mojibake from
    // `&para` being decoded as the HTML entity for the pilcrow sign.
    #[test]
    fn test_has_font_size_variation_single_size() {
        let paragraphs = vec![para_with_font_size(12.0), para_with_font_size(12.0)];
        assert!(!has_font_size_variation(&paragraphs));
    }

    #[test]
    fn test_has_font_size_variation_different_sizes() {
        let paragraphs = vec![para_with_font_size(12.0), para_with_font_size(18.0)];
        assert!(has_font_size_variation(&paragraphs));
    }

    #[test]
    fn test_has_font_size_variation_small_difference_ignored() {
        let paragraphs = vec![para_with_font_size(12.0), para_with_font_size(12.3)];
        assert!(!has_font_size_variation(&paragraphs));
    }

    #[test]
    fn test_has_font_size_variation_zero_sizes_ignored() {
        let paragraphs = vec![para_with_font_size(0.0), para_with_font_size(0.0)];
        assert!(!has_font_size_variation(&paragraphs));
    }
}