use super::constants::{MAX_BOLD_HEADING_WORD_COUNT, MAX_HEADING_DISTANCE_MULTIPLIER, MAX_HEADING_WORD_COUNT};
use super::regions::looks_like_figure_label;
use super::types::PdfParagraph;
/// Assigns `heading_level` to paragraphs based on font-size clusters and bold/italic styling.
///
/// `heading_map` holds `(centroid, level)` pairs; the first entry whose level is `None` supplies
/// the body font size (0.0 when there is no such entry).
///
/// For every paragraph two passes are attempted in order:
///
/// 1. **Font size** – the level returned by [`find_heading_level`] is accepted when the paragraph
///    has at most `MAX_HEADING_WORD_COUNT` words and its text is not separator text. When the
///    layout class is `Text`, the paragraph must also be bold for the level to be considered.
/// 2. **Styling** – a bold or fully italic paragraph that is not a list item and has at most
///    `MAX_BOLD_HEADING_WORD_COUNT` words becomes a level-2 heading unless one of the filters
///    below rejects it. When the layout class is `Text`, the paragraph's font must exceed the
///    body font size by more than 0.5.
///
/// Paragraphs that reach the end of the loop body and are flagged as code blocks get their
/// heading level cleared.
pub(super) fn classify_paragraphs(paragraphs: &mut [PdfParagraph], heading_map: &[(f32, Option<u8>)]) {
    /// Concatenates all segment texts of a paragraph, separated by single spaces.
    fn joined_text(para: &PdfParagraph) -> String {
        para.lines
            .iter()
            .flat_map(|line| line.segments.iter())
            .map(|segment| segment.text.as_str())
            .collect::<Vec<_>>()
            .join(" ")
    }

    let gap_info = precompute_gap_info(heading_map);
    let body_font_size = heading_map
        .iter()
        .find_map(|&(centroid, level)| level.is_none().then_some(centroid))
        .unwrap_or(0.0);

    for para in paragraphs.iter_mut() {
        // Words are counted per segment so that a few long segments cannot pass as a short heading.
        let word_count: usize = para
            .lines
            .iter()
            .flat_map(|line| line.segments.iter())
            .map(|segment| segment.text.split_whitespace().count())
            .sum();
        let layout_says_text = para.layout_class == Some(super::types::LayoutHintClass::Text);

        // Pass 1: font-size based level. A `Text` layout hint vetoes it unless the paragraph is bold.
        let font_level = find_heading_level(para.dominant_font_size, heading_map, &gap_info)
            .filter(|_| para.is_bold || !layout_says_text);
        if let Some(level) = font_level {
            if word_count <= MAX_HEADING_WORD_COUNT
                && !super::layout_classify::is_separator_text(&joined_text(para))
            {
                para.heading_level = Some(level);
                // NOTE(review): this `continue` also skips the code-block reset at the end of the
                // loop body — confirm that is intended.
                continue;
            }
        }

        // Pass 2: short bold / italic paragraphs become level-2 headings.
        let is_italic = !para.lines.is_empty()
            && para
                .lines
                .iter()
                .flat_map(|line| line.segments.iter())
                .all(|segment| segment.is_italic);
        let clearly_above_body = body_font_size > 0.0 && para.dominant_font_size > body_font_size + 0.5;
        let layout_text_overridable = !layout_says_text || clearly_above_body;
        let is_styled = para.is_bold || is_italic;

        if is_styled
            && !para.is_list_item
            && layout_text_overridable
            && word_count <= MAX_BOLD_HEADING_WORD_COUNT
        {
            let text = joined_text(para);
            let t = text.trim();

            // Italic-only text must start with an uppercase character and contain neither '@' nor ','.
            let italic_only = is_italic && !para.is_bold;
            let italic_ok = !italic_only
                || (!t.contains('@') && !t.contains(',') && t.chars().next().is_some_and(|c| c.is_uppercase()));
            // One- or two-word runs at (roughly) body size are not treated as headings.
            let too_short_at_body =
                word_count <= 2 && body_font_size > 0.0 && para.dominant_font_size <= body_font_size + 0.5;
            // A trailing period is only tolerated for section-like patterns.
            let period_ok = !t.ends_with('.') || is_section_pattern(t);

            let rejected = !italic_ok
                || too_short_at_body
                || !period_ok
                || t.ends_with(':')
                || looks_like_figure_label(t)
                || super::layout_classify::is_separator_text(t);
            if !rejected {
                para.heading_level = Some(2);
            }
        }

        if para.is_code_block {
            para.heading_level = None;
        }
    }
}
/// Looks up the heading level of the cluster whose centroid is closest to `font_size`.
///
/// * An empty map yields `None`.
/// * A single-entry map yields that entry's level regardless of distance.
/// * Otherwise the nearest centroid wins (the earliest entry on ties); if it is farther away than
///   `MAX_HEADING_DISTANCE_MULTIPLIER * gap_info.avg_gap`, the size is treated as an outlier and
///   `None` is returned.
pub(super) fn find_heading_level(font_size: f32, heading_map: &[(f32, Option<u8>)], gap_info: &GapInfo) -> Option<u8> {
    match heading_map {
        [] => None,
        [(_, only_level)] => *only_level,
        clusters => {
            // Strict `<` keeps the first of several equally distant centroids.
            let (nearest_distance, nearest_level) =
                clusters
                    .iter()
                    .fold((f32::INFINITY, None), |best, &(centroid, level)| {
                        let distance = (font_size - centroid).abs();
                        if distance < best.0 { (distance, level) } else { best }
                    });
            if nearest_distance > MAX_HEADING_DISTANCE_MULTIPLIER * gap_info.avg_gap {
                None
            } else {
                nearest_level
            }
        }
    }
}
/// Spacing statistics of a heading map, computed once by `precompute_gap_info` and passed to
/// `find_heading_level` so the value is not recomputed for every paragraph.
pub(super) struct GapInfo {
/// Average absolute gap between neighbouring centroids after sorting them;
/// `f32::INFINITY` when the map has at most one entry.
avg_gap: f32,
}
/// Computes the average gap between adjacent centroids of `heading_map`.
///
/// The centroids are sorted with `f32::total_cmp` and the absolute differences of neighbouring
/// values are averaged. Maps with fewer than two entries have no gaps, so `f32::INFINITY` is used.
pub(super) fn precompute_gap_info(heading_map: &[(f32, Option<u8>)]) -> GapInfo {
    let avg_gap = if heading_map.len() < 2 {
        f32::INFINITY
    } else {
        let mut sorted: Vec<f32> = heading_map.iter().map(|&(centroid, _)| centroid).collect();
        sorted.sort_by(f32::total_cmp);
        // With at least two centroids there is always at least one adjacent pair.
        let total: f32 = sorted.windows(2).map(|pair| (pair[1] - pair[0]).abs()).sum();
        total / (sorted.len() - 1) as f32
    };
    GapInfo { avg_gap }
}
pub(super) fn refine_heading_hierarchy(all_pages: &mut [Vec<PdfParagraph>]) {
let h1_count: usize = all_pages
.iter()
.flat_map(|page| page.iter())
.filter(|p| p.heading_level == Some(1))
.count();
if h1_count == 0 {
let has_any_heading = all_pages
.iter()
.flat_map(|page| page.iter())
.any(|p| p.heading_level.is_some());
if has_any_heading {
promote_title_heading(all_pages);
}
}
let h1_count: usize = all_pages
.iter()
.flat_map(|page| page.iter())
.filter(|p| p.heading_level == Some(1))
.count();
if h1_count <= 1 {
return;
}
for page in all_pages.iter_mut() {
merge_consecutive_h1s(page);
}
let h1_count: usize = all_pages
.iter()
.flat_map(|page| page.iter())
.filter(|p| p.heading_level == Some(1))
.count();
if h1_count <= 1 {
return;
}
let first_h1_is_title = all_pages
.iter()
.flat_map(|page| page.iter())
.find(|p| p.heading_level == Some(1))
.is_some_and(|p| !starts_with_section_number(¶graph_plain_text(p)));
if !first_h1_is_title {
return;
}
let mut found_first = false;
for page in all_pages.iter_mut() {
for para in page.iter_mut() {
if para.heading_level == Some(1) {
if !found_first {
found_first = true;
continue;
}
if starts_with_section_number(¶graph_plain_text(para)) {
para.heading_level = Some(2);
}
}
}
}
}
/// Returns `true` when the trimmed text looks like a section heading.
///
/// Any of the following qualifies:
/// * it starts with `§`;
/// * it has at most six whitespace-separated words and none of its alphabetic characters is
///   non-uppercase (text without any alphabetic character therefore also qualifies);
/// * it starts with a section number (see `starts_with_section_number`).
pub(super) fn is_section_pattern(text: &str) -> bool {
    let candidate = text.trim();
    let short_all_caps = || {
        candidate.split_whitespace().count() <= 6
            && !candidate.chars().any(|ch| ch.is_alphabetic() && !ch.is_uppercase())
    };
    candidate.starts_with('§') || short_all_caps() || starts_with_section_number(candidate)
}
/// Returns `true` when the trimmed text begins with one or more ASCII digits that are
/// immediately followed by a space, `.` or `)`.
///
/// Text consisting solely of digits (or with no leading digit at all) does not match.
fn starts_with_section_number(text: &str) -> bool {
    let bytes = text.trim().as_bytes();
    let leading_digits = bytes.iter().take_while(|byte| byte.is_ascii_digit()).count();
    if leading_digits == 0 {
        return false;
    }
    // `get` returns `None` when the digits run to the end of the text.
    matches!(bytes.get(leading_digits), Some(b' ' | b'.' | b')'))
}
/// Demotes unnumbered H2 headings that sit between numbered H2 headings to H3.
///
/// All H2 paragraphs are collected in document order together with a flag telling whether their
/// text starts with a section number. Nothing happens unless at least three of them are numbered.
/// Every unnumbered H2 located between the first and the last numbered H2 is then set to level 3,
/// except when its layout class is `SectionHeader` or `Title`.
pub(super) fn demote_unnumbered_subsections(all_pages: &mut [Vec<PdfParagraph>]) {
    // (page index, paragraph index, starts with a section number)
    let h2_entries: Vec<(usize, usize, bool)> = all_pages
        .iter()
        .enumerate()
        .flat_map(|(page_idx, page)| {
            page.iter()
                .enumerate()
                .filter(|(_, para)| para.heading_level == Some(2))
                .map(move |(para_idx, para)| {
                    let numbered = starts_with_section_number(&paragraph_plain_text(para));
                    (page_idx, para_idx, numbered)
                })
        })
        .collect();

    if h2_entries.iter().filter(|entry| entry.2).count() < 3 {
        return;
    }

    // Only H2s enclosed by numbered H2s on both sides are candidates.
    let (Some(first_numbered), Some(last_numbered)) = (
        h2_entries.iter().position(|entry| entry.2),
        h2_entries.iter().rposition(|entry| entry.2),
    ) else {
        return;
    };

    for &(page_idx, para_idx, numbered) in &h2_entries[first_numbered..=last_numbered] {
        if numbered {
            continue;
        }
        let para = &mut all_pages[page_idx][para_idx];
        let layout_confirmed = matches!(
            para.layout_class,
            Some(super::types::LayoutHintClass::SectionHeader | super::types::LayoutHintClass::Title)
        );
        if !layout_confirmed {
            para.heading_level = Some(3);
        }
    }
}
/// Clears the heading level of over-long runs of same-level headings on a page.
///
/// A run is a maximal sequence of consecutive paragraphs sharing the heading level of the run's
/// first paragraph. When a run is longer than `MAX_CONSECUTIVE`, every paragraph after the first
/// loses its heading level unless its layout class is `SectionHeader` or `Title`.
pub(super) fn demote_heading_runs(all_pages: &mut [Vec<PdfParagraph>]) {
    const MAX_CONSECUTIVE: usize = 3;

    for page in all_pages.iter_mut() {
        let mut cursor = 0;
        while let Some(first) = page.get(cursor) {
            let Some(level) = first.heading_level else {
                cursor += 1;
                continue;
            };

            let followers = page[cursor + 1..]
                .iter()
                .take_while(|para| para.heading_level == Some(level))
                .count();
            let run_len = followers + 1;

            if run_len > MAX_CONSECUTIVE {
                page[cursor + 1..cursor + run_len]
                    .iter_mut()
                    .filter(|para| {
                        !matches!(
                            para.layout_class,
                            Some(
                                super::types::LayoutHintClass::SectionHeader
                                    | super::types::LayoutHintClass::Title
                            )
                        )
                    })
                    .for_each(|para| para.heading_level = None);
            }

            cursor += run_len;
        }
    }
}
/// Returns the text of every segment of every line in `para`, joined with single spaces.
fn paragraph_plain_text(para: &PdfParagraph) -> String {
    let mut plain = String::new();
    let segments = para.lines.iter().flat_map(|line| line.segments.iter());
    for (position, segment) in segments.enumerate() {
        // A separator goes between every pair of segments, even when a segment is empty.
        if position > 0 {
            plain.push(' ');
        }
        plain.push_str(segment.text.as_str());
    }
    plain
}
/// Promotes one existing heading to H1 so that the document gets a title.
///
/// Preference order:
/// 1. The first heading (in document order) whose layout class is `Title`.
/// 2. Otherwise, on the first page only: the sole heading if there is exactly one, or the first
///    heading with the largest font size provided it exceeds the next smaller heading size by at
///    least 1.5 (a missing smaller size counts as 0.0).
///
/// Nothing is changed when no candidate qualifies.
fn promote_title_heading(all_pages: &mut [Vec<PdfParagraph>]) {
    let layout_title = all_pages.iter_mut().flatten().find(|para| {
        para.heading_level.is_some() && para.layout_class == Some(super::types::LayoutHintClass::Title)
    });
    if let Some(title) = layout_title {
        title.heading_level = Some(1);
        return;
    }

    let Some(first_page) = all_pages.first_mut() else {
        return;
    };

    // (paragraph index, dominant font size) of every heading on the first page.
    let candidates: Vec<(usize, f32)> = first_page
        .iter()
        .enumerate()
        .filter(|(_, para)| para.heading_level.is_some())
        .map(|(idx, para)| (idx, para.dominant_font_size))
        .collect();

    match candidates.as_slice() {
        [] => {}
        [(only_idx, _)] => first_page[*only_idx].heading_level = Some(1),
        several => {
            let largest = several.iter().map(|&(_, size)| size).fold(0.0f32, f32::max);
            let runner_up = several
                .iter()
                .map(|&(_, size)| size)
                .filter(|&size| size < largest)
                .fold(0.0f32, f32::max);
            if largest - runner_up >= 1.5 {
                if let Some(&(idx, _)) = several.iter().find(|&&(_, size)| size == largest) {
                    first_page[idx].heading_level = Some(1);
                }
            }
        }
    }
}
/// Merges runs of consecutive H1 paragraphs with (nearly) the same font size into one paragraph.
///
/// Starting at each H1, the following paragraphs are absorbed while they are also H1 and their
/// dominant font size differs from the first one's by less than 0.5. The lines of the absorbed
/// paragraphs are appended to the first paragraph and the absorbed paragraphs are removed from
/// `page`. All other fields of the first paragraph are left as they were.
fn merge_consecutive_h1s(page: &mut Vec<PdfParagraph>) {
    let mut i = 0;
    while i < page.len() {
        if page[i].heading_level != Some(1) {
            i += 1;
            continue;
        }

        let base_fs = page[i].dominant_font_size;
        let absorbed = page[i + 1..]
            .iter()
            .take_while(|para| para.heading_level == Some(1) && (para.dominant_font_size - base_fs).abs() < 0.5)
            .count();

        if absorbed > 0 {
            // Move the lines instead of cloning them: the source paragraphs are dropped right after.
            let (head, tail) = page.split_at_mut(i + 1);
            let target = &mut head[i];
            for follower in &mut tail[..absorbed] {
                target.lines.append(&mut follower.lines);
            }
            page.drain(i + 1..i + 1 + absorbed);
        }

        i += 1;
    }
}
/// Flags short text that repeats on more than half of the pages as page furniture.
///
/// Documents with fewer than four pages are left untouched. For every paragraph that is not
/// already furniture, the plain text is trimmed and lower-cased; texts with one to eight words are
/// counted once per page, both verbatim and under a key made of their alphanumeric characters only
/// (keys shorter than six bytes are ignored). A text is considered repeating when its verbatim
/// form, or the key it maps to, occurs on more than `page count / 2` pages; in the latter case
/// every verbatim variant of that key is included.
///
/// Matching paragraphs get `is_page_furniture = true` and their heading level cleared.
pub(super) fn mark_cross_page_repeating_text(all_pages: &mut [Vec<PdfParagraph>]) {
    let page_total = all_pages.len();
    if page_total < 4 {
        return;
    }

    // Number of pages on which each normalized text / alphanumeric key occurs.
    let mut exact_pages: ahash::AHashMap<String, usize> = ahash::AHashMap::new();
    let mut key_pages: ahash::AHashMap<String, usize> = ahash::AHashMap::new();
    // All normalized spellings observed for each alphanumeric key.
    let mut variants_by_key: ahash::AHashMap<String, ahash::AHashSet<String>> = ahash::AHashMap::new();

    for page in all_pages.iter() {
        // Per-page sets ensure a text is counted at most once per page.
        let mut exact_on_page: ahash::AHashSet<String> = ahash::AHashSet::new();
        let mut keys_on_page: ahash::AHashSet<String> = ahash::AHashSet::new();

        for para in page.iter().filter(|para| !para.is_page_furniture) {
            let normalized = paragraph_plain_text(para).trim().to_lowercase();
            if !(1..=8).contains(&normalized.split_whitespace().count()) {
                continue;
            }

            if exact_on_page.insert(normalized.clone()) {
                *exact_pages.entry(normalized.clone()).or_insert(0) += 1;
            }

            let key: String = normalized.chars().filter(|ch| ch.is_alphanumeric()).collect();
            if key.len() < 6 {
                continue;
            }
            if keys_on_page.insert(key.clone()) {
                *key_pages.entry(key.clone()).or_insert(0) += 1;
            }
            variants_by_key.entry(key).or_default().insert(normalized);
        }
    }

    let majority = page_total / 2;
    let mut furniture_texts: ahash::AHashSet<String> = exact_pages
        .into_iter()
        .filter_map(|(text, pages)| (pages > majority).then_some(text))
        .collect();
    for (key, pages) in key_pages {
        if pages > majority {
            if let Some(variants) = variants_by_key.remove(&key) {
                furniture_texts.extend(variants);
            }
        }
    }

    if furniture_texts.is_empty() {
        return;
    }

    for para in all_pages
        .iter_mut()
        .flatten()
        .filter(|para| !para.is_page_furniture)
    {
        let normalized = paragraph_plain_text(para).trim().to_lowercase();
        if furniture_texts.contains(&normalized) {
            para.is_page_furniture = true;
            para.heading_level = None;
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::pdf::hierarchy::SegmentData;

    /// Builds a single-line, non-bold paragraph with `segment_count` one-word segments
    /// (`word0`, `word1`, ...) at the given font size.
    fn make_paragraph(font_size: f32, segment_count: usize) -> PdfParagraph {
        let mut segments: Vec<SegmentData> = Vec::with_capacity(segment_count);
        for i in 0..segment_count {
            segments.push(SegmentData {
                text: format!("word{}", i),
                x: i as f32 * 50.0,
                y: 700.0,
                width: 40.0,
                height: font_size,
                font_size,
                is_bold: false,
                is_italic: false,
                is_monospace: false,
                baseline_y: 700.0,
            });
        }
        let line = super::super::types::PdfLine {
            segments,
            baseline_y: 700.0,
            dominant_font_size: font_size,
            is_bold: false,
            is_monospace: false,
        };
        PdfParagraph {
            lines: vec![line],
            dominant_font_size: font_size,
            heading_level: None,
            is_bold: false,
            is_list_item: false,
            is_code_block: false,
            is_formula: false,
            is_page_furniture: false,
            layout_class: None,
            caption_for: None,
            block_bbox: None,
        }
    }

    /// Builds a one-segment paragraph at the given font size carrying `text`.
    fn make_text_paragraph(font_size: f32, text: &str) -> PdfParagraph {
        let mut para = make_paragraph(font_size, 1);
        para.lines[0].segments[0].text = text.to_string();
        para
    }

    /// Builds a 12pt one-segment body paragraph carrying `text`.
    fn make_body(text: &str) -> PdfParagraph {
        make_text_paragraph(12.0, text)
    }

    /// Builds a one-segment paragraph already classified as H1.
    fn make_h1(font_size: f32, text: &str) -> PdfParagraph {
        let mut para = make_text_paragraph(font_size, text);
        para.heading_level = Some(1);
        para
    }

    /// Runs `classify_paragraphs` on a single paragraph and returns its resulting heading level.
    fn classify_one(para: PdfParagraph, heading_map: &[(f32, Option<u8>)]) -> Option<u8> {
        let mut paragraphs = vec![para];
        classify_paragraphs(&mut paragraphs, heading_map);
        paragraphs[0].heading_level
    }

    #[test]
    fn test_classify_heading() {
        let level = classify_one(make_paragraph(18.0, 3), &[(18.0, Some(1)), (12.0, None)]);
        assert_eq!(level, Some(1));
    }

    #[test]
    fn test_classify_body() {
        let level = classify_one(make_paragraph(12.0, 5), &[(18.0, Some(1)), (12.0, None)]);
        assert_eq!(level, None);
    }

    #[test]
    fn test_classify_too_many_segments_for_heading() {
        let level = classify_one(make_paragraph(18.0, 21), &[(18.0, Some(1)), (12.0, None)]);
        assert_eq!(level, None);
    }

    #[test]
    fn test_find_heading_level_empty_map() {
        let gap_info = precompute_gap_info(&[]);
        assert_eq!(find_heading_level(12.0, &[], &gap_info), None);
    }

    #[test]
    fn test_find_heading_level_single_entry() {
        let heading_map = [(12.0, Some(1))];
        let gap_info = precompute_gap_info(&heading_map);
        assert_eq!(find_heading_level(12.0, &heading_map, &gap_info), Some(1));
    }

    #[test]
    fn test_find_heading_level_outlier_rejected() {
        let heading_map = [(12.0, None), (16.0, Some(2)), (20.0, Some(1))];
        let gap_info = precompute_gap_info(&heading_map);
        assert_eq!(find_heading_level(50.0, &heading_map, &gap_info), None);
    }

    #[test]
    fn test_find_heading_level_close_match() {
        let heading_map = [(12.0, None), (16.0, Some(2)), (20.0, Some(1))];
        let gap_info = precompute_gap_info(&heading_map);
        assert_eq!(find_heading_level(15.5, &heading_map, &gap_info), Some(2));
    }

    #[test]
    fn test_classify_bold_short_paragraph_promoted_to_heading() {
        let mut para = make_paragraph(12.0, 3);
        para.is_bold = true;
        para.lines[0].is_bold = true;
        assert_eq!(classify_one(para, &[(12.0, None)]), Some(2));
    }

    #[test]
    fn test_classify_bold_long_paragraph_not_promoted() {
        let mut para = make_paragraph(12.0, 20);
        para.is_bold = true;
        assert_eq!(classify_one(para, &[(12.0, None)]), None);
    }

    #[test]
    fn test_classify_bold_list_item_not_promoted() {
        let mut para = make_paragraph(12.0, 3);
        para.is_bold = true;
        para.is_list_item = true;
        assert_eq!(classify_one(para, &[(12.0, None)]), None);
    }

    #[test]
    fn test_classify_few_segments_many_words_not_heading() {
        // Three wide segments of seven words each: few segments, but 21 words in total.
        let mut para = make_paragraph(18.0, 3);
        for (i, segment) in para.lines[0].segments.iter_mut().enumerate() {
            segment.text = "one two three four five six seven".to_string();
            segment.x = i as f32 * 200.0;
            segment.width = 180.0;
        }
        assert_eq!(classify_one(para, &[(18.0, Some(1)), (12.0, None)]), None);
    }

    #[test]
    fn test_merge_consecutive_h1s_same_font() {
        let mut page = vec![
            make_h1(24.0, "KAISUN HOLDINGS"),
            make_h1(24.0, "LIMITED"),
            make_paragraph(12.0, 3),
        ];
        merge_consecutive_h1s(&mut page);
        assert_eq!(page.len(), 2);
        assert_eq!(page[0].heading_level, Some(1));
        assert_eq!(page[0].lines.len(), 2);
    }

    #[test]
    fn test_merge_h1s_different_font_no_merge() {
        let mut page = vec![make_h1(24.0, "Title"), make_h1(18.0, "Subtitle")];
        merge_consecutive_h1s(&mut page);
        assert_eq!(page.len(), 2);
    }

    #[test]
    fn test_cross_page_repeating_text() {
        let mut pages: Vec<Vec<PdfParagraph>> = ["A", "B", "C", "D"]
            .iter()
            .map(|suffix| {
                vec![
                    make_body("Page 1 of 10"),
                    make_body(&format!("Unique content {}", suffix)),
                ]
            })
            .collect();
        mark_cross_page_repeating_text(&mut pages);
        assert!(pages[0][0].is_page_furniture);
        assert!(!pages[0][1].is_page_furniture);
    }

    #[test]
    fn test_cross_page_repeating_marks_repeated_headings_as_furniture() {
        let mut pages: Vec<Vec<PdfParagraph>> = (0..6)
            .map(|_| vec![make_h1(24.0, "Chapter"), make_paragraph(12.0, 3)])
            .collect();
        mark_cross_page_repeating_text(&mut pages);
        assert!(pages[0][0].is_page_furniture);
        assert!(pages[0][0].heading_level.is_none());
    }

    #[test]
    fn test_cross_page_repeating_fuzzy_matches_iso_variants() {
        let spaced = "O ISO 2021 All rights reserved";
        let squashed = "OISO 2021Allrightsreserved";
        let layout = [
            (spaced, "Section content A"),
            (spaced, "Section content B"),
            (spaced, "Section content C"),
            (squashed, "Section content D"),
            (squashed, "Section content E"),
            (squashed, "Section content F"),
        ];
        let mut pages: Vec<Vec<PdfParagraph>> = layout
            .iter()
            .map(|&(footer, content)| vec![make_body(footer), make_body(content)])
            .collect();
        mark_cross_page_repeating_text(&mut pages);
        assert!(
            pages[0][0].is_page_furniture,
            "even-page copyright variant should be furniture"
        );
        assert!(
            pages[3][0].is_page_furniture,
            "odd-page copyright variant should be furniture"
        );
        assert!(!pages[0][1].is_page_furniture);
        assert!(!pages[3][1].is_page_furniture);
    }

    #[test]
    fn test_layout_text_bold_heading_font_promoted() {
        let mut para = make_paragraph(16.0, 3);
        para.is_bold = true;
        para.layout_class = Some(super::super::types::LayoutHintClass::Text);
        assert_eq!(classify_one(para, &[(16.0, Some(2)), (12.0, None)]), Some(2));
    }

    #[test]
    fn test_layout_text_non_bold_heading_font_not_promoted() {
        let mut para = make_paragraph(16.0, 3);
        para.layout_class = Some(super::super::types::LayoutHintClass::Text);
        assert_eq!(classify_one(para, &[(16.0, Some(2)), (12.0, None)]), None);
    }

    #[test]
    fn test_layout_text_bold_body_font_not_promoted_pass1() {
        let mut para = make_paragraph(12.0, 3);
        para.is_bold = true;
        para.layout_class = Some(super::super::types::LayoutHintClass::Text);
        assert_eq!(classify_one(para, &[(16.0, Some(2)), (12.0, None)]), None);
    }

    #[test]
    fn test_layout_text_bold_larger_font_promoted_pass2() {
        let mut para = make_paragraph(14.0, 3);
        para.is_bold = true;
        para.layout_class = Some(super::super::types::LayoutHintClass::Text);
        assert_eq!(classify_one(para, &[(12.0, None)]), Some(2));
    }
}