use crate::types::{OutlineTarget, ParsedPage, ProjectedLine, StructNode};
use super::classify::is_rotated_line;
use super::inline::line_all_bold;
use super::paragraphs::continues_paragraph;
use super::tables::{TABLE_MIN_COLUMNS, split_cells};
/// Margin used by `build_heading_map` for lines whose font size is not
/// estimated: a line counts as a heading candidate only when its heading size
/// is strictly greater than `body_size` plus this value.
pub(super) const HEADING_SIZE_EPSILON: f32 = 0.5;
/// Margin used by `build_heading_map` in place of `HEADING_SIZE_EPSILON` when
/// `font_size_is_estimated` is set on the line.
pub(super) const ESTIMATED_HEADING_SIZE_MARGIN: f32 = 1.5;
/// Maximum number of distinct heading sizes kept by `build_heading_map`; also
/// the upper clamp applied to outline levels in `outline_heading_level`.
pub(super) const MAX_HEADING_LEVELS: usize = 6;
/// An oversized line whose normalized text occurs at least this many times
/// across the document is excluded from the heading-size statistics in
/// `build_heading_map`.
pub(super) const REPEATED_HEADING_LINE_MIN: usize = 3;
/// Builds the key used to detect repeated lines: whitespace runs collapse to a
/// single space, leading/trailing whitespace is dropped, and the result is
/// lowercased as a whole string.
fn normalize_repeat_key(text: &str) -> String {
    let mut collapsed = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    // Lowercase the joined string (not word by word) so context-sensitive
    // mappings see the same input as before.
    collapsed.to_lowercase()
}
/// `heading_level_for` treats a font size as matching a mapped heading size
/// when their absolute difference is strictly below this value.
pub(super) const FONT_SIZE_HEADING_TOLERANCE: f32 = 0.6;
/// Maximum character count accepted by `looks_like_bold_heading` (whole line)
/// and `looks_like_numbered_bold_heading` (text after the number).
pub(super) const BOLD_HEADING_MAX_CHARS: usize = 80;
/// NOTE(review): not referenced in this file; presumably an upper bound on
/// heading text length enforced by a sibling module — confirm against callers.
pub(super) const HEADING_MAX_TEXT_CHARS: usize = 140;
/// Returns `true` when `s` looks like a section-number prefix such as `3`,
/// `3.1`, `A.2`, `§3`, `Sec. 4` or `Chapter 3`.
///
/// After trimming, one optional lead-in (`§`, `Sec.`, `Sec`, `Ch.`, `Ch`,
/// `Chapter`, `Chap.`, `Chap`, `Part`) and any whitespace following it are
/// removed. A bare lead-in with nothing after it is accepted. Otherwise the
/// remainder must consist only of dot-separated components made of ASCII
/// digits, where a component may start with a single ASCII uppercase letter,
/// and must contain at least one digit. Empty input returns `false`.
pub(super) fn is_section_number_prefix(s: &str) -> bool {
    // Ordered so that every lead-in comes before any of its own prefixes:
    // otherwise "Ch" would claim "Chapter 3" and leave "apter 3" behind.
    const LEAD_INS: [&str; 9] = [
        "Chapter", "Chap.", "Chap", "Ch.", "Ch", "Sec.", "Sec", "Part", "§",
    ];

    let t = s.trim();
    if t.is_empty() {
        return false;
    }

    let number = LEAD_INS
        .iter()
        .find_map(|lead| t.strip_prefix(*lead))
        // Whitespace between the lead-in and the number is optional for every
        // lead-in, including "§".
        .map_or(t, |rest| rest.trim_start());
    if number.is_empty() {
        return true;
    }

    let mut saw_digit = false;
    // True at the very beginning and right after a '.', i.e. wherever a new
    // dotted component may start.
    let mut at_component_start = true;
    for c in number.chars() {
        match c {
            '0'..='9' => {
                saw_digit = true;
                at_component_start = false;
            }
            // A letter is only allowed as the first character of a component.
            'A'..='Z' if at_component_start => at_component_start = false,
            // A dot must follow a non-empty component.
            '.' if !at_component_start => at_component_start = true,
            _ => return false,
        }
    }
    saw_digit
}
/// Returns `true` when `text`, ignoring leading whitespace, begins with one of
/// the known attribution/note lead-ins (compared ASCII-case-insensitively).
pub(super) fn is_attribution_line(text: &str) -> bool {
    const LEAD_INS: &[&str] = &[
        "Source:",
        "Sources:",
        "Note:",
        "Notes:",
        "Adapted from",
        "Reproduced from",
        "Reprinted from",
        "Image:",
        "Image source:",
        "Photo:",
        "Photo credit:",
        "Credit:",
        "Caption:",
    ];

    let candidate = text.trim_start();
    LEAD_INS.iter().any(|lead| {
        // `get` yields `None` when the text is shorter than the lead-in or the
        // cut would fall inside a multi-byte character.
        candidate
            .get(..lead.len())
            .is_some_and(|head| head.eq_ignore_ascii_case(lead))
    })
}
pub(super) fn is_caption_line(text: &str) -> bool {
let t = text.trim_start();
const PREFIXES: &[&str] = &[
"Figure",
"Figures",
"Fig.",
"Fig ",
"Table",
"Tables",
"Tab.",
"Tab ",
"Equation",
"Eq.",
"Eq ",
"Scheme",
"Chart",
"Plate",
"Photo",
"Algorithm",
"Listing",
];
let lower_t_first_word: String = t
.chars()
.take_while(|c| c.is_alphabetic() || *c == '.')
.collect();
for p in PREFIXES {
let p_trim = p.trim_end();
if lower_t_first_word.eq_ignore_ascii_case(p_trim) {
let rest = t[lower_t_first_word.len()..].trim_start();
let mut chars = rest.chars();
if let Some(c0) = chars.next()
&& (c0.is_ascii_digit()
|| (c0 == '(' && chars.next().is_some_and(|c| c.is_ascii_digit()))
|| matches!(c0, 'I' | 'V' | 'X' | 'L' | 'C'))
{
return true;
}
}
}
false
}
/// Extracts the trailing arabic page number from a table-of-contents style
/// line such as `"Introduction . . . 41"`.
///
/// Returns `None` unless all of these hold for the trimmed text:
/// * it ends in 1 to 4 ASCII digits;
/// * the digits are preceded by a run of whitespace/dots containing at least
///   one whitespace character;
/// * the text before that run has at least 8 alphabetic characters and does
///   not itself end in a digit.
pub(super) fn toc_entry_arabic_number(text: &str) -> Option<i32> {
    let entry = text.trim();

    // Peel the page number off the end.
    let before_number = entry.trim_end_matches(|c: char| c.is_ascii_digit());
    let page_number = &entry[before_number.len()..];
    if page_number.is_empty() || page_number.len() > 4 {
        return None;
    }

    // Peel the leader (spaces and dots) that separates title from number.
    let title = before_number.trim_end_matches(|c: char| c.is_whitespace() || c == '.');
    let leader = &before_number[title.len()..];
    if !leader.chars().any(|c| c.is_whitespace()) {
        return None;
    }

    let letters = title.chars().filter(|c| c.is_alphabetic()).count();
    if letters < 8 {
        return None;
    }

    let ends_in_digit = title
        .chars()
        .rev()
        .find(|c| !c.is_whitespace())
        .is_some_and(|c| c.is_ascii_digit());
    if ends_in_digit {
        return None;
    }

    page_number.parse::<i32>().ok()
}
/// Returns `true` when `text` looks like a table-of-contents entry: either it
/// carries a trailing arabic page number (see `toc_entry_arabic_number`), or
/// it ends in 2 to 6 roman-numeral letters that are separated from at least 5
/// alphabetic characters of title text by a whitespace/dot leader containing
/// whitespace.
pub(super) fn looks_like_toc_entry(text: &str) -> bool {
    if toc_entry_arabic_number(text).is_some() {
        return true;
    }

    let entry = text.trim();

    // Peel the trailing run of roman-numeral letters (either case).
    let before_numeral = entry.trim_end_matches(|c: char| {
        matches!(
            c,
            'i' | 'v' | 'x' | 'l' | 'c' | 'd' | 'm' | 'I' | 'V' | 'X' | 'L' | 'C' | 'D' | 'M'
        )
    });
    // All peeled characters are ASCII, so the byte difference is a char count.
    let numeral_len = entry.len() - before_numeral.len();
    if numeral_len < 2 || numeral_len > 6 {
        return false;
    }

    let title = before_numeral.trim_end_matches(|c: char| c.is_whitespace() || c == '.');
    let leader = &before_numeral[title.len()..];
    if !leader.chars().any(|c| c.is_whitespace()) {
        return false;
    }

    title.chars().filter(|c| c.is_alphabetic()).count() >= 5
}
/// Returns `true` when `text` is a recognised contents-page title. The text is
/// trimmed, trailing colons are removed, and the comparison is
/// ASCII-case-insensitive.
pub(super) fn is_toc_title(text: &str) -> bool {
    const TITLES: [&str; 8] = [
        "contents",
        "table of contents",
        "table of content",
        "index",
        "list of figures",
        "list of tables",
        "table of figures",
        "toc",
    ];

    let key = text.trim().trim_end_matches(':').to_ascii_lowercase();
    TITLES.contains(&key.as_str())
}
/// Decides whether a page is a table of contents.
///
/// Rotated lines are ignored. The page qualifies when at least 4 lines look
/// like TOC entries, at least 3 of them carry an arabic page number, and at
/// least 70% of consecutive page-number pairs are non-decreasing.
pub(super) fn page_is_toc(page: &ParsedPage) -> bool {
    let mut page_numbers: Vec<i32> = Vec::new();
    let mut toc_like_lines = 0usize;

    for line in &page.projected_lines {
        if is_rotated_line(line) {
            continue;
        }
        match toc_entry_arabic_number(&line.text) {
            Some(number) => {
                page_numbers.push(number);
                toc_like_lines += 1;
            }
            None if looks_like_toc_entry(&line.text) => toc_like_lines += 1,
            None => {}
        }
    }

    if toc_like_lines < 4 || page_numbers.len() < 3 {
        return false;
    }

    let ordered_pairs = page_numbers
        .windows(2)
        .filter(|pair| pair[0] <= pair[1])
        .count();
    let total_pairs = page_numbers.len() - 1;
    ordered_pairs as f32 / total_pairs as f32 >= 0.7
}
/// Heuristic: does `line` look like a stand-alone bold heading?
///
/// `prev` and `next` are the neighbouring lines, if any. The line is accepted
/// only when every check below passes; the checks are applied in order and the
/// first failing one returns `false`.
pub(super) fn looks_like_bold_heading(
line: &ProjectedLine,
prev: Option<&ProjectedLine>,
next: Option<&ProjectedLine>,
) -> bool {
let text = line.text.trim();
// Empty lines and lines longer than BOLD_HEADING_MAX_CHARS are never headings.
if text.is_empty() || text.chars().count() > BOLD_HEADING_MAX_CHARS {
return false;
}
// Captions ("Figure 3 ...") and attribution lines ("Source: ...") are excluded.
if is_caption_line(text) {
return false;
}
if is_attribution_line(text) {
return false;
}
if !line_all_bold(line) {
return false;
}
// A trailing period rejects the line.
if text.ends_with('.') {
return false;
}
// Look for the first ". " (or, failing that, the first ": ") to detect a
// bold run-in lead such as "Definition. The quick brown ..." that continues
// into running text.
let run_in_break = text
.find(". ")
.map(|p| (p, 2))
.or_else(|| text.find(": ").map(|p| (p, 2)));
if let Some((pos, sep_len)) = run_in_break {
let before = &text[..pos];
// "3.1. Title" style numbering before the break is not a run-in lead.
let is_section_number = is_section_number_prefix(before);
let after = text[pos + sep_len..].trim();
let starts_upper = after.chars().next().is_some_and(|c| c.is_ascii_uppercase());
let word_count = after.split_whitespace().count();
let ends_hyphen = text.trim_end().ends_with('-');
// Reject when the part after the break starts uppercase and is either
// hyphen-broken with at least 2 words, or has at least 3 words on a line
// longer than 50 characters.
if !is_section_number
&& starts_upper
&& ((word_count >= 2 && ends_hyphen) || (word_count >= 3 && text.chars().count() > 50))
{
if *super::flags::DEBUG_MD {
eprintln!(
"[MD bold-heading REJECT run-in] '{}' (pos={} word_count={} ends_hyphen={} len={})",
text.chars().take(80).collect::<String>(),
pos,
word_count,
ends_hyphen,
text.chars().count()
);
}
return false;
}
}
// Headings must not start with a lowercase letter.
if text.chars().next().is_some_and(|c| c.is_lowercase()) {
return false;
}
// At least half of the non-whitespace characters must be alphabetic.
if alpha_ratio(text) < 0.5 {
return false;
}
// Lines that split into TABLE_MIN_COLUMNS or more cells are rejected.
if split_cells(line).len() >= TABLE_MIN_COLUMNS {
return false;
}
// NOTE(review): the name suggests this splits on runs of several spaces, but
// the separator literal as it appears here is what decides; with a
// single-space separator any line of TABLE_MIN_COLUMNS or more words is
// rejected — confirm the literal against the upstream file.
let multi_space_tokens = text
.split(" ")
.map(|s| s.trim())
.filter(|s| !s.is_empty())
.count();
if multi_space_tokens >= TABLE_MIN_COLUMNS {
return false;
}
// The line must not be a continuation of the previous line's paragraph.
let gap_above_ok = match prev {
None => true,
Some(p) => !continues_paragraph(p, line),
};
if !gap_above_ok {
return false;
}
// The following line must either not continue this line's paragraph, or be
// entirely bold itself.
match next {
None => true,
Some(n) => !continues_paragraph(line, n) || line_all_bold(n),
}
}
/// Heuristic for a numbered heading whose title is set in bold.
///
/// `rest` is the text following the number. The line qualifies when `rest` is
/// non-empty and at most `BOLD_HEADING_MAX_CHARS` characters, the whole line
/// is not a caption, `rest` is not sentence-like (ending in `.` with two or
/// more `.`/`?`/`!`), every non-marker span is bold and none is monospaced,
/// at least half of `rest`'s non-whitespace characters are alphabetic, and the
/// line does not continue the paragraph of `prev`.
pub(super) fn looks_like_numbered_bold_heading(
    line: &ProjectedLine,
    rest: &str,
    prev: Option<&ProjectedLine>,
) -> bool {
    let title = rest.trim();
    if title.is_empty() || title.chars().count() > BOLD_HEADING_MAX_CHARS {
        return false;
    }
    if is_caption_line(&line.text) {
        return false;
    }

    let terminators = title
        .chars()
        .filter(|c| matches!(c, '.' | '?' | '!'))
        .count();
    if title.ends_with('.') && terminators >= 2 {
        return false;
    }

    // Spans that are blank or consist only of digits, dots and parentheses are
    // the numbering marker and carry no weight.
    let body_spans = line.spans.iter().filter(|span| {
        let content = span.text.trim();
        !content.is_empty()
            && !content
                .chars()
                .all(|c| c.is_ascii_digit() || c == '.' || c == ')' || c == '(')
    });

    let mut bold_spans = 0usize;
    let mut plain_spans = 0usize;
    for span in body_spans {
        if crate::projection::is_mono_item(span) {
            return false;
        }
        if crate::projection::is_bold_item(span) {
            bold_spans += 1;
        } else {
            plain_spans += 1;
        }
    }
    if bold_spans == 0 || plain_spans > 0 {
        return false;
    }

    if alpha_ratio(title) < 0.5 {
        return false;
    }

    prev.map_or(true, |above| !continues_paragraph(above, line))
}
/// A font size counts as co-dominant when its character weight reaches this
/// fraction of the heaviest size's weight.
const BODY_CODOMINANT_FRACTION: f32 = 0.5;

/// Estimates the body-text font size of a document.
///
/// Every non-rotated line with a positive heading size contributes its
/// character count (at least 1) to a bucket keyed by the size rounded to two
/// decimals. The result is the largest size among the buckets whose weight is
/// at least `BODY_CODOMINANT_FRACTION` of the heaviest bucket, or `0.0` when
/// no line contributed.
pub fn compute_body_size(pages: &[ParsedPage]) -> f32 {
    use std::collections::HashMap;

    // key: size * 100 rounded; value: (first size seen for the key, weight)
    let mut buckets: HashMap<u32, (f32, usize)> = HashMap::new();
    for page in pages {
        for line in &page.projected_lines {
            if is_rotated_line(line) {
                continue;
            }
            let size = heading_size_of(line);
            if size <= 0.0 {
                continue;
            }
            let bucket_key = (size * 100.0).round() as u32;
            let (_, weight) = buckets.entry(bucket_key).or_insert((size, 0));
            *weight += line.text.chars().count().max(1);
        }
    }

    let heaviest = match buckets.values().map(|&(_, weight)| weight).max() {
        Some(weight) if weight > 0 => weight,
        _ => return 0.0,
    };
    let cutoff = (heaviest as f32 * BODY_CODOMINANT_FRACTION) as usize;

    let mut body_size = 0.0_f32;
    for &(size, weight) in buckets.values() {
        if weight >= cutoff {
            body_size = body_size.max(size);
        }
    }
    body_size
}
/// A heading size needs at least this many non-whitespace characters overall.
const MIN_HEADING_TOTAL_CHARS: usize = 10;
/// Lower bound on the average non-whitespace characters per line for a size.
const MIN_HEADING_AVG_LINE_CHARS: f32 = 8.0;
/// Upper bound on the average non-whitespace characters per line for a size.
const MAX_HEADING_AVG_LINE_CHARS: f32 = 200.0;
/// Minimum share of alphabetic characters among a size's non-whitespace text.
const MIN_HEADING_ALPHA_RATIO: f32 = 0.5;

/// Builds the font-size to heading-level table for a document.
///
/// Candidate lines are non-rotated, outside figures, not captions, and have a
/// heading size strictly greater than `body_size` plus a margin
/// (`ESTIMATED_HEADING_SIZE_MARGIN` for estimated sizes, otherwise
/// `HEADING_SIZE_EPSILON`). Candidates whose normalized text occurs at least
/// `REPEATED_HEADING_LINE_MIN` times are dropped. The remaining lines are
/// grouped by size (rounded to two decimals); groups failing the
/// `MIN_/MAX_HEADING_*` thresholds are discarded. The surviving sizes are
/// sorted descending, capped at `MAX_HEADING_LEVELS`, and numbered from 1.
pub fn build_heading_map(pages: &[ParsedPage], body_size: f32) -> Vec<(f32, u8)> {
    use std::collections::HashMap;

    /// Aggregated statistics for all candidate lines sharing one font size.
    struct SizeStats {
        size: f32,
        lines: usize,
        glyphs: usize,
        letters: usize,
    }

    let exceeds_body = |line: &ProjectedLine| {
        let margin = if line.font_size_is_estimated {
            ESTIMATED_HEADING_SIZE_MARGIN
        } else {
            HEADING_SIZE_EPSILON
        };
        heading_size_of(line) > body_size + margin
    };
    let eligible = |line: &ProjectedLine| {
        !is_rotated_line(line) && !line.in_figure && !is_caption_line(&line.text)
    };

    // Pass 1: how often does each oversized line's text occur?
    let mut occurrences: HashMap<String, usize> = HashMap::new();
    for page in pages {
        for line in &page.projected_lines {
            if eligible(line) && exceeds_body(line) {
                *occurrences
                    .entry(normalize_repeat_key(&line.text))
                    .or_insert(0) += 1;
            }
        }
    }

    // Pass 2: accumulate per-size statistics, skipping repeated lines.
    let mut buckets: HashMap<u32, SizeStats> = HashMap::new();
    for page in pages {
        for line in &page.projected_lines {
            if !eligible(line) {
                continue;
            }
            let seen = occurrences
                .get(&normalize_repeat_key(&line.text))
                .copied()
                .unwrap_or(0);
            if seen >= REPEATED_HEADING_LINE_MIN || !exceeds_body(line) {
                continue;
            }

            let size = heading_size_of(line);
            let stats = buckets
                .entry((size * 100.0).round() as u32)
                .or_insert(SizeStats {
                    size,
                    lines: 0,
                    glyphs: 0,
                    letters: 0,
                });
            stats.lines += 1;
            for c in line.text.chars().filter(|c| !c.is_whitespace()) {
                stats.glyphs += 1;
                if c.is_alphabetic() {
                    stats.letters += 1;
                }
            }
        }
    }

    let mut heading_sizes: Vec<f32> = buckets
        .into_values()
        .filter(|stats| {
            let letter_share = if stats.glyphs == 0 {
                0.0
            } else {
                stats.letters as f32 / stats.glyphs as f32
            };
            let avg_line_chars = stats.glyphs as f32 / stats.lines.max(1) as f32;
            stats.glyphs >= MIN_HEADING_TOTAL_CHARS
                && avg_line_chars >= MIN_HEADING_AVG_LINE_CHARS
                && avg_line_chars <= MAX_HEADING_AVG_LINE_CHARS
                && letter_share >= MIN_HEADING_ALPHA_RATIO
        })
        .map(|stats| stats.size)
        .collect();

    // Largest size becomes level 1.
    heading_sizes.sort_by(|a, b| b.total_cmp(a));
    heading_sizes.truncate(MAX_HEADING_LEVELS);
    heading_sizes.into_iter().zip(1u8..).collect()
}
/// Fraction of the non-whitespace characters in `text` that are alphabetic;
/// `0.0` when there are no non-whitespace characters.
fn alpha_ratio(text: &str) -> f32 {
    let (letters, visible) = text
        .chars()
        .filter(|c| !c.is_whitespace())
        .fold((0usize, 0usize), |(letters, visible), c| {
            (letters + usize::from(c.is_alphabetic()), visible + 1)
        });

    if visible == 0 {
        0.0
    } else {
        letters as f32 / visible as f32
    }
}
/// Font size to use when judging whether `line` is a heading: the line's
/// `heading_font_size` when present, otherwise its `dominant_font_size`.
pub(super) fn heading_size_of(line: &ProjectedLine) -> f32 {
    match line.heading_font_size {
        Some(size) => size,
        None => line.dominant_font_size,
    }
}
/// Looks up the heading level for a font `size`.
///
/// `heading_map` holds `(font size, level)` pairs. An entry matches when its
/// size differs from `size` by strictly less than
/// `FONT_SIZE_HEADING_TOLERANCE`. Among matching entries the one closest to
/// `size` wins; on an exact tie the entry appearing first in `heading_map` is
/// kept. Returns `None` when nothing matches (including an empty map).
pub(super) fn heading_level_for(size: f32, heading_map: &[(f32, u8)]) -> Option<u8> {
    // (distance, level) of the best candidate so far.
    let mut best: Option<(f32, u8)> = None;
    for &(mapped_size, level) in heading_map {
        let distance = (size - mapped_size).abs();
        if distance < FONT_SIZE_HEADING_TOLERANCE {
            match best {
                // Keep the earlier entry unless this one is strictly closer;
                // taking the first hit would let a neighbouring size within
                // tolerance shadow an exact match further down the map.
                Some((best_distance, _)) if best_distance <= distance => {}
                _ => best = Some((distance, level)),
            }
        }
    }
    best.map(|(_, level)| level)
}
/// Heading level taken from the structure tree, if any.
///
/// Returns `None` when the line has no `mcid`. Otherwise scans `struct_nodes`
/// in order and returns the level of the first node that both lists the
/// line's `mcid` and has a role that parses as a heading (`H1`..`H6`).
pub(super) fn struct_heading_level(
    line: &ProjectedLine,
    struct_nodes: &[StructNode],
) -> Option<u8> {
    let mcid = line.mcid?;
    struct_nodes
        .iter()
        .filter(|node| node.mcids.contains(&mcid))
        .find_map(|node| parse_heading_role(&node.role))
}
/// Parses a structure role of the form `H<n>` / `h<n>` (surrounding whitespace
/// ignored) and returns `n` when it lies in `1..=6`; anything else yields
/// `None`.
fn parse_heading_role(role: &str) -> Option<u8> {
    let role = role.trim();
    let level_text = role
        .strip_prefix('H')
        .or_else(|| role.strip_prefix('h'))?;
    level_text
        .parse::<u8>()
        .ok()
        .filter(|level| (1..=6).contains(level))
}
/// Matches a line against the document outline and returns the outline level
/// (clamped to `MAX_HEADING_LEVELS`) of the first entry that fits.
///
/// An entry fits when:
/// * its normalized title is non-empty;
/// * its `y_pdf`, if present and mapping inside the page
///   (`page_height - y` within `0..=page_height`), lies within 1.5 row heights
///   of `line.bbox.y`; entries without a position or mapping off the page skip
///   this check;
/// * the normalized line is not much longer than the title (at most
///   `max(3 * title_len, 120)` characters), contains fewer than two `". "`
///   breaks, and with one break is at most 15 characters longer than the
///   title;
/// * one of the two normalized texts is a prefix of the other.
pub(super) fn outline_heading_level(
    line: &ProjectedLine,
    page_height: f32,
    outline: &[OutlineTarget],
    line_text: &str,
) -> Option<u8> {
    if outline.is_empty() {
        return None;
    }
    let wanted = normalize_outline_text(line_text);
    if wanted.is_empty() {
        return None;
    }

    let y_tolerance = line.bbox.height.max(super::MIN_ROW_HEIGHT_PT) * 1.5;
    // Properties of the line itself do not change per outline entry.
    let wanted_len = wanted.chars().count();
    let sentence_breaks = wanted.matches(". ").count();

    outline.iter().find_map(|entry| {
        let title = normalize_outline_text(&entry.title);
        if title.is_empty() {
            return None;
        }

        if let Some(y) = entry.y_pdf {
            let y_view = page_height - y;
            let off_page = y_view < 0.0 || y_view > page_height;
            let y_ok = off_page || (y_view - line.bbox.y).abs() <= y_tolerance;
            if !y_ok {
                return None;
            }
        }

        let title_len = title.chars().count();
        let too_long = wanted_len > (title_len * 3).max(120);
        let run_on = sentence_breaks >= 2
            || (sentence_breaks >= 1 && wanted_len > title_len + 15);
        if too_long || run_on {
            return None;
        }

        let prefix_match =
            wanted.starts_with(title.as_str()) || title.starts_with(wanted.as_str());
        if prefix_match {
            Some(entry.level.min(MAX_HEADING_LEVELS as u8))
        } else {
            None
        }
    })
}
/// Normalizes text for outline matching: whitespace runs collapse to a single
/// space, leading/trailing whitespace is removed, and every character is
/// lowercased individually.
fn normalize_outline_text(s: &str) -> String {
    let mut normalized = String::with_capacity(s.len());
    for word in s.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        // Per-character lowercasing, so the mapping never depends on a
        // character's position within the word.
        normalized.extend(word.chars().flat_map(char::to_lowercase));
    }
    normalized
}
#[cfg(test)]
mod tests {
    use super::super::test_helpers::{line, page};
    use super::*;

    /// Trailing page numbers are extracted from plain, numbered and
    /// dot-leader entries.
    #[test]
    fn toc_entry_arabic_extracts_trailing_page_number() {
        let cases = [
            ("Introduction 7", 7),
            ("1. A Fountain in the Square 1", 1),
            ("6. For the Love of Iran . . . 41", 41),
        ];
        for (entry, page_number) in cases {
            assert_eq!(toc_entry_arabic_number(entry), Some(page_number));
        }
    }

    /// Decimals, short titles and digits glued to the title are not page
    /// numbers.
    #[test]
    fn toc_entry_arabic_rejects_decimals_and_axis_labels() {
        for entry in ["OCR-Recall3 7 94.2", "Chapter 7", "Section1"] {
            assert_eq!(toc_entry_arabic_number(entry), None);
        }
    }

    /// Common contents-page titles match regardless of case.
    #[test]
    fn is_toc_title_matches_common_variants() {
        for title in ["Contents", "Table of Contents", "table of contents", "Index"] {
            assert!(is_toc_title(title));
        }
        assert!(!is_toc_title("Introduction"));
    }

    /// A page of entries with increasing page numbers is a TOC; a page of
    /// decimal chart labels is not.
    #[test]
    fn page_is_toc_requires_monotonic_page_numbers() {
        let toc_page = page(vec![
            line("Table of contents", 50.0, 30.0, 18.0, 18.0),
            line("Introduction 7", 50.0, 60.0, 12.0, 12.0),
            line("Part I: New Children 21", 50.0, 72.0, 12.0, 12.0),
            line("Part II: From Solitary 45", 50.0, 84.0, 12.0, 12.0),
            line("Part III: Commercial 71", 50.0, 96.0, 12.0, 12.0),
            line("Conclusion 127", 50.0, 108.0, 12.0, 12.0),
        ]);
        assert!(page_is_toc(&toc_page));

        let chart_page = page(vec![
            line("OCR-Recall is 94.2", 50.0, 60.0, 9.0, 9.0),
            line("Precision rate 89.0", 50.0, 72.0, 9.0, 9.0),
            line("F1 score 80.4", 50.0, 84.0, 9.0, 9.0),
        ]);
        assert!(!page_is_toc(&chart_page));
    }

    /// The body size is the size carrying most of the text.
    #[test]
    fn body_size_picks_most_common() {
        let document = vec![page(vec![
            line("Title", 50.0, 50.0, 18.0, 18.0),
            line("body line one", 50.0, 80.0, 10.0, 10.0),
            line("body line two", 50.0, 92.0, 10.0, 10.0),
            line("body line three", 50.0, 104.0, 10.0, 10.0),
        ])];
        let body = compute_body_size(&document);
        assert!((body - 10.0).abs() < 0.01, "body size = {body}");
    }

    /// Two heading sizes above the body size map to levels 1 and 2, largest
    /// first.
    #[test]
    fn heading_map_descending_levels() {
        let document = vec![page(vec![
            line("The largest heading on the page", 50.0, 50.0, 24.0, 24.0),
            line("A smaller heading right below it", 50.0, 80.0, 18.0, 18.0),
            line("body text line one with plenty of content", 50.0, 110.0, 10.0, 10.0),
            line("body text line two with plenty of content", 50.0, 122.0, 10.0, 10.0),
            line("body text line three with even more content", 50.0, 134.0, 10.0, 10.0),
            line("body text line four with even more content", 50.0, 146.0, 10.0, 10.0),
        ])];
        let body = compute_body_size(&document);
        let levels = build_heading_map(&document, body);

        assert_eq!(levels.len(), 2);
        let (largest, smaller) = (levels[0], levels[1]);
        assert_eq!(largest.1, 1);
        assert_eq!(smaller.1, 2);
        assert!(largest.0 > smaller.0);
    }
}