use crate::types::{OutlineTarget, ParsedPage, ProjectedLine};
use super::blocks::{Block, paragraph_from_accum};
use super::headings::{
HEADING_MAX_TEXT_CHARS, MAX_HEADING_LEVELS, heading_level_for, heading_size_of,
is_caption_line, is_toc_title, looks_like_bold_heading, looks_like_numbered_bold_heading,
outline_heading_level, page_is_toc, struct_heading_level,
};
use super::hr::detect_horizontal_rules;
use super::inline::{
append_inline_continuation, line_uniform_style, render_line_inline, render_list_item_text,
};
use super::lists::{LIST_INDENT_STEP_PT, parse_list_marker};
use super::paragraphs::{
ParaAccum, append_to_paragraph, collapse_whitespace, continues_heading, continues_paragraph,
ends_hyphenated, ends_sentence_final, is_soft_hyphen_break,
};
use super::repetition::is_header_or_footer;
use super::tables::{detect_ruled_tables, detect_tables, merge_table_runs};
/// Non-text content that is spliced into the block stream at a vertical position.
///
/// Throughout this file each value travels as a `(y, Interruption)` pair and is
/// turned into a `Block` at the moment it is emitted.
#[derive(Clone)]
enum Interruption {
/// A horizontal rule; emitted as `Block::HorizontalRule`.
Hr,
/// An image reference; emitted as `Block::Figure` carrying the reference's `id`.
Figure(crate::types::ImageRef),
/// An already-built block (page-level ruled tables are queued this way); emitted unchanged.
Table(super::blocks::Block),
}
/// Returns `true` when at least one span of `line` is rotated away from the
/// upright (0°/360°) or upside-down (180°) orientation by 5° or more.
///
/// The sign of the rotation is ignored and the angle is reduced modulo 360.
pub(super) fn is_rotated_line(line: &ProjectedLine) -> bool {
    line.spans
        .iter()
        .map(|span| span.rotation.abs() % 360.0)
        .any(|angle| {
            let near_full_turn = angle < 5.0 || (355.0..=360.0).contains(&angle);
            let near_half_turn = (175.0..=185.0).contains(&angle);
            // Written as a negation so that a NaN angle (every comparison
            // false) still counts as rotated.
            !(near_full_turn || near_half_turn)
        })
}
/// Test-only shortcut for [`classify_page_with_filters`]: no header/footer
/// strings, no outline, no chrome indices, and placeholder image mode.
#[cfg(test)]
pub(super) fn classify_page(page: &ParsedPage, heading_map: &[(f32, u8)]) -> Vec<Block> {
    let no_header_footer: std::collections::HashSet<String> = std::collections::HashSet::new();
    let no_chrome: std::collections::HashSet<usize> = std::collections::HashSet::new();
    let no_outline: &[OutlineTarget] = &[];
    classify_page_with_filters(
        page,
        heading_map,
        &no_header_footer,
        no_outline,
        crate::config::ImageMode::Placeholder,
        &no_chrome,
    )
}
pub fn classify_page_with_filters(
page: &ParsedPage,
heading_map: &[(f32, u8)],
header_footer: &std::collections::HashSet<String>,
outline: &[OutlineTarget],
image_mode: crate::config::ImageMode,
chrome_indices: &std::collections::HashSet<usize>,
) -> Vec<Block> {
let debug = *super::flags::DEBUG_MD;
let toc_page = page_is_toc(page) || {
let mut saw_title = false;
let mut long_lines = 0usize;
for line in &page.projected_lines {
if is_rotated_line(line) {
continue;
}
let t = line.text.trim();
if t.is_empty() {
continue;
}
if !saw_title {
if is_toc_title(t) {
saw_title = true;
continue;
}
if t.chars().count() > 40 {
break;
}
} else if t.chars().count() >= 80 {
let alpha = t.chars().filter(|c| c.is_alphabetic()).count();
let alpha_ratio = alpha as f32 / t.chars().count() as f32;
if alpha_ratio < 0.3 {
continue;
}
long_lines += 1;
if long_lines >= 3 {
saw_title = false;
break;
}
}
}
saw_title
};
let need_filter = !header_footer.is_empty() || !chrome_indices.is_empty();
let filtered_owned: Vec<ProjectedLine> = if !need_filter {
Vec::new()
} else {
page.projected_lines
.iter()
.enumerate()
.filter(|(idx, l)| {
!chrome_indices.contains(idx) && !is_header_or_footer(l, page, header_footer)
})
.map(|(_, l)| l.clone())
.collect()
};
let lines: &[ProjectedLine] = if !need_filter {
&page.projected_lines
} else {
&filtered_owned
};
let mut global_ruled_tables: Vec<(f32, Block)> = Vec::new();
let mut global_ruled_consumed: std::collections::HashSet<usize> =
std::collections::HashSet::new();
for (run, consumed) in super::tables::detect_ruled_tables_global(
lines,
&page.graphics,
page.page_width,
page.page_height,
) {
let mut groups: std::collections::HashMap<&Vec<u16>, Vec<usize>> =
std::collections::HashMap::new();
for &i in &consumed {
groups.entry(&lines[i].region_path).or_default().push(i);
}
if groups.len() < 2 {
continue;
}
let already_handled = groups.values().any(|idxs| {
if idxs.len() < 2 {
return false;
}
let sub: Vec<ProjectedLine> = idxs.iter().map(|&i| lines[i].clone()).collect();
!super::tables::detect_ruled_tables(
&sub,
&page.graphics,
page.page_width,
page.page_height,
)
.is_empty()
|| !super::tables::detect_tables(&sub).is_empty()
});
if already_handled {
continue;
}
if let Block::Table { header, rows } = &run.block {
let cols = header
.as_ref()
.map(|h| h.len())
.or_else(|| rows.first().map(Vec::len))
.unwrap_or(0);
if cols < 2 {
continue;
}
}
let top_y = consumed
.iter()
.map(|&i| lines[i].bbox.y)
.fold(f32::INFINITY, f32::min);
global_ruled_tables.push((top_y, run.block));
global_ruled_consumed.extend(consumed);
}
let global_ruled_owned: Option<Vec<ProjectedLine>> = if global_ruled_consumed.is_empty() {
None
} else {
Some(
lines
.iter()
.enumerate()
.filter(|(i, _)| !global_ruled_consumed.contains(i))
.map(|(_, l)| l.clone())
.collect(),
)
};
let lines: &[ProjectedLine] = global_ruled_owned.as_deref().unwrap_or(lines);
let mut cross_merged_owned: Option<Vec<ProjectedLine>> = None;
let mut cross_region_runs: Vec<(Vec<u16>, Vec<super::tables::TableRun>)> = Vec::new();
for _ in 0..3 {
if toc_page {
break;
}
let cur: &[ProjectedLine] = cross_merged_owned.as_deref().unwrap_or(lines);
let Some(m) = super::cross_region::find_cross_region_table_merge(cur) else {
break;
};
let mut next: Vec<ProjectedLine> = Vec::with_capacity(cur.len());
next.extend_from_slice(&cur[..m.start]);
let merged_path = m.merged[0].region_path.clone();
next.extend(m.merged);
next.extend_from_slice(&cur[m.end..]);
cross_region_runs.push((merged_path, m.runs));
cross_merged_owned = Some(next);
}
let lines: &[ProjectedLine] = cross_merged_owned.as_deref().unwrap_or(lines);
let mut region_ranges: Vec<(usize, usize)> = Vec::new();
{
let mut s = 0;
while s < lines.len() {
let path = &lines[s].region_path;
let mut e = s + 1;
while e < lines.len() && lines[e].region_path == *path {
e += 1;
}
region_ranges.push((s, e));
s = e;
}
}
let mut all_interruptions: Vec<(f32, Interruption)> = detect_horizontal_rules(page)
.into_iter()
.map(|y| (y, Interruption::Hr))
.collect();
if !matches!(image_mode, crate::config::ImageMode::Off) {
for r in &page.image_refs {
all_interruptions.push((r.bbox.y, Interruption::Figure(r.clone())));
}
}
for (y, block) in global_ruled_tables {
all_interruptions.push((y, Interruption::Table(block)));
}
all_interruptions.sort_by(|a, b| a.0.total_cmp(&b.0));
let mut region_boundaries: Vec<usize> = Vec::new();
let mut interrupt_cursor = 0usize;
let mut blocks: Vec<Block> = Vec::new();
let push_interruption = |blocks: &mut Vec<Block>, kind: Interruption| {
blocks.push(match kind {
Interruption::Hr => Block::HorizontalRule,
Interruption::Figure(r) => Block::Figure { id: r.id },
Interruption::Table(b) => b,
});
};
for (rstart, rend) in region_ranges {
let region_lines = &lines[rstart..rend];
let mut y_min = f32::INFINITY;
let mut y_max = f32::NEG_INFINITY;
for l in region_lines {
y_min = y_min.min(l.bbox.y);
y_max = y_max.max(l.bbox.y + l.bbox.height);
}
while interrupt_cursor < all_interruptions.len()
&& all_interruptions[interrupt_cursor].0 < y_min
{
let (_, kind) = all_interruptions[interrupt_cursor].clone();
push_interruption(&mut blocks, kind);
interrupt_cursor += 1;
}
let mut region_interruptions: Vec<(f32, Interruption)> = Vec::new();
while interrupt_cursor < all_interruptions.len()
&& all_interruptions[interrupt_cursor].0 <= y_max
{
region_interruptions.push(all_interruptions[interrupt_cursor].clone());
interrupt_cursor += 1;
}
let region_start = blocks.len();
let precomputed_tables = cross_region_runs
.iter()
.find(|(path, _)| *path == lines[rstart].region_path)
.map(|(_, runs)| runs.clone());
let region_blocks = classify_region(
region_lines,
region_interruptions,
page,
heading_map,
outline,
toc_page,
debug,
precomputed_tables,
);
if !region_blocks.is_empty() {
region_boundaries.push(region_start);
blocks.extend(region_blocks);
}
}
while interrupt_cursor < all_interruptions.len() {
let (_, kind) = all_interruptions[interrupt_cursor].clone();
push_interruption(&mut blocks, kind);
interrupt_cursor += 1;
}
stitch_regions(blocks, ®ion_boundaries)
}
/// Mutable state carried from line to line while a region is classified.
#[derive(Default)]
struct FlowState {
/// Paragraph currently being accumulated, if any.
paragraph: Option<ParaAccum>,
/// Lines of the code block currently being accumulated, if any.
code: Option<Vec<String>>,
/// `indent_x` of the first item of the current list; nesting levels are measured from it.
list_base_indent: Option<f32>,
/// Index in the output block list of the most recently emitted list item.
last_list_item_idx: Option<usize>,
/// Index in the region's lines of the last line that belonged to a list item.
last_list_line: Option<usize>,
}
impl FlowState {
    /// Forgets the current list context (base indent and last-item bookkeeping).
    fn reset_list(&mut self) {
        (
            self.list_base_indent,
            self.last_list_item_idx,
            self.last_list_line,
        ) = (None, None, None);
    }

    /// Emits the pending paragraph, if there is one and its raw text is not
    /// blank. The pending paragraph is cleared either way.
    fn flush_paragraph(&mut self, blocks: &mut Vec<Block>) {
        let pending = self
            .paragraph
            .take()
            .filter(|acc| !acc.raw.trim().is_empty());
        if let Some(acc) = pending {
            blocks.push(paragraph_from_accum(acc));
        }
    }

    /// Emits the pending code block, if there is one with at least one line,
    /// tagging it with the language guessed by `detect_code_language`. The
    /// pending code is cleared either way.
    fn flush_code(&mut self, blocks: &mut Vec<Block>) {
        let Some(lines) = self.code.take() else {
            return;
        };
        if lines.is_empty() {
            return;
        }
        let lang = detect_code_language(&lines);
        blocks.push(Block::CodeBlock { lines, lang });
    }

    /// Emits every queued interruption whose `y` is not greater than
    /// `before_y`, in queue order. Before each one, the pending paragraph and
    /// code block are flushed and the list context is reset.
    fn emit_before(
        &mut self,
        blocks: &mut Vec<Block>,
        iter: &mut std::iter::Peekable<std::vec::IntoIter<(f32, Interruption)>>,
        before_y: f32,
    ) {
        while let Some((_, kind)) = iter.next_if(|(y, _)| {
            // Stop only when `y` is strictly greater; a NaN `y` is emitted.
            let beyond = *y > before_y;
            !beyond
        }) {
            self.flush_paragraph(blocks);
            self.flush_code(blocks);
            self.reset_list();
            let block = match kind {
                Interruption::Hr => Block::HorizontalRule,
                Interruption::Figure(r) => Block::Figure { id: r.id },
                Interruption::Table(b) => b,
            };
            blocks.push(block);
        }
    }
}
/// Classifies the lines of a single region into blocks.
///
/// `lines` are walked in order. Table runs (ruled runs detected here, merged
/// with `precomputed_tables` when given, otherwise with freshly detected
/// borderless runs) are emitted whole. Every other non-empty, non-rotated line
/// is tried, in this order, as: monospace code, decorative divider, heading
/// continuation, heading (struct tag / outline / font size / TOC title),
/// list item or numbered bold heading, list-item continuation, bold heading,
/// and finally paragraph text.
///
/// `interruptions` are `(y, kind)` pairs. Those inside a table's vertical band
/// are dropped; the rest are emitted ahead of the first line or table whose
/// top `y` is not smaller than theirs, and any left over are emitted at the end.
///
/// When `toc_page` is true, outline-, size- and bold-based headings are
/// suppressed for every line except the first TOC title. `debug` enables
/// trace output on stderr.
#[allow(clippy::too_many_arguments)]
fn classify_region(
lines: &[ProjectedLine],
interruptions: Vec<(f32, Interruption)>,
page: &ParsedPage,
heading_map: &[(f32, u8)],
outline: &[OutlineTarget],
toc_page: bool,
debug: bool,
precomputed_tables: Option<Vec<super::tables::TableRun>>,
) -> Vec<Block> {
let mut blocks: Vec<Block> = Vec::new();
let mut state = FlowState::default();
// (level, line index) of the heading most recently emitted or extended;
// used to glue wrapped heading lines together.
let mut heading_run: Option<(u8, usize)> = None;
let mut toc_title_emitted = false;
let ruled_runs = detect_ruled_tables(lines, &page.graphics, page.page_width, page.page_height);
let borderless_runs = precomputed_tables.unwrap_or_else(|| detect_tables(lines));
let table_runs = merge_table_runs(ruled_runs, borderless_runs);
// Interruptions are also suppressed this many row heights above a table's first line.
const TABLE_HR_SUPPRESS_HEADROOM_ROWS: f32 = 4.0;
let table_y_extents: Vec<(f32, f32)> = table_runs
.iter()
.map(|run| {
let top_line = &lines[run.start];
let row_h = top_line.bbox.height.max(super::MIN_ROW_HEIGHT_PT);
let top = top_line.bbox.y - row_h * TABLE_HR_SUPPRESS_HEADROOM_ROWS;
let last = &lines[run.end.saturating_sub(1).max(run.start)];
let bot = last.bbox.y + last.bbox.height;
(top, bot)
})
.collect();
let mut table_iter = table_runs.into_iter().peekable();
// True when `y` falls inside any table band, widened by 2.0 on each side.
let in_table_band = |y: f32| {
table_y_extents
.iter()
.any(|(top, bot)| y >= *top - 2.0 && y <= *bot + 2.0)
};
let mut region_interruptions: Vec<(f32, Interruption)> = interruptions
.into_iter()
.filter(|(y, _)| !in_table_band(*y))
.collect();
region_interruptions.sort_by(|a, b| a.0.total_cmp(&b.0));
let mut interruptions = region_interruptions.into_iter().peekable();
let mut idx = 0;
while idx < lines.len() {
// A table run starting here replaces all of its lines with one block.
if let Some(run) = table_iter.peek()
&& run.start == idx
{
let table_top = lines[run.start].bbox.y;
state.emit_before(&mut blocks, &mut interruptions, table_top);
state.flush_paragraph(&mut blocks);
state.flush_code(&mut blocks);
state.reset_list();
let run = table_iter.next().unwrap();
blocks.push(run.block);
idx = run.end;
continue;
}
let line_idx = idx;
let line = &lines[line_idx];
state.emit_before(&mut blocks, &mut interruptions, line.bbox.y);
// From here on `idx` points at the NEXT line; `line_idx` is the current one.
idx += 1;
let text = line.text.trim();
if text.is_empty() {
continue;
}
if is_rotated_line(line) {
continue;
}
if debug {
eprintln!(
"[md] y={:.1} h={:.1} size={:.2} anchor={:?} indent={:.1} text={:?}",
line.bbox.y,
line.bbox.height,
line.dominant_font_size,
line.anchor,
line.indent_x,
text
);
}
// Monospace lines accumulate into a code block; only trailing whitespace
// is trimmed so leading indentation survives.
if line.all_mono {
state.flush_paragraph(&mut blocks);
state.reset_list();
state
.code
.get_or_insert_with(Vec::new)
.push(line.text.trim_end().to_string());
continue;
}
state.flush_code(&mut blocks);
// Lines made only of divider symbols: three or more become a rule, fewer are dropped.
if let Some(is_rule) = decorative_divider_kind(text) {
state.flush_paragraph(&mut blocks);
state.reset_list();
heading_run = None;
if is_rule {
blocks.push(Block::HorizontalRule);
}
if debug {
eprintln!(
"[md decorative] {} '{}'",
if is_rule { "rule" } else { "drop" },
text.chars().take(40).collect::<String>(),
);
}
continue;
}
// Heading level candidates, in priority order: structure tag, outline, font size, TOC title.
let tagged_level = struct_heading_level(line, &page.struct_nodes);
let is_first_toc_title = is_toc_title(text) && !toc_title_emitted;
let toc_suppress = toc_page && !is_first_toc_title;
let outline_level = tagged_level.or_else(|| {
if toc_suppress {
None
} else {
outline_heading_level(line, page.page_height, outline, text)
}
});
let size_level =
if is_caption_line(text) || toc_suppress || is_rotated_line(line) || line.in_figure {
None
} else {
heading_level_for(heading_size_of(line), heading_map)
};
// Reject a size-based heading when the line reads as the continuation of
// the previous paragraph or list line.
let size_level = size_level.filter(|_| {
let starts_lower = text.chars().next().is_some_and(|c| c.is_lowercase());
let prev = state
.paragraph
.as_ref()
.map(|p| &p.last)
.or(state.last_list_line.map(|i| &lines[i]));
let prev_hyphen_wrap = prev.is_some_and(|p| ends_hyphenated(&p.text));
let sentence_tail = starts_lower
&& ends_sentence_final(text)
&& prev.is_some_and(|p| !ends_sentence_final(&p.text));
let is_continuation =
prev.is_some_and(|p| continues_paragraph(p, line)) || sentence_tail;
!((starts_lower || prev_hyphen_wrap) && is_continuation)
});
// Further size-based rejections: too long, attribution line, or a
// mid-sentence line that ends in a hyphen.
let size_level = size_level.filter(|_| text.chars().count() <= HEADING_MAX_TEXT_CHARS);
let size_level =
size_level.filter(|_| !crate::markdown_layout::headings::is_attribution_line(text));
let size_level = size_level.filter(|_| {
let has_mid_sentence = text.contains(". ") || text.contains(": ");
let mid_word_wrap = text.trim_end().ends_with('-');
!(has_mid_sentence && mid_word_wrap)
});
let toc_title_level = if is_first_toc_title { Some(1u8) } else { None };
if is_first_toc_title {
toc_title_emitted = true;
}
let level = outline_level
.or(size_level)
.or(toc_title_level)
.map(|l| l.clamp(1, MAX_HEADING_LEVELS as u8));
let mut demoted_heading = false;
// A line with no heading level of its own may still be the wrapped tail
// of the heading just emitted, as long as the merged text stays within
// the heading length limit.
let cont_looks_like_heading = parse_list_marker(text).is_none() && !text.contains(". ");
if level.is_none()
&& cont_looks_like_heading
&& let Some((run_level, run_idx)) = heading_run.as_ref()
&& continues_heading(&lines[*run_idx], line)
&& let Some(Block::Heading {
level: last_level,
text: htext,
}) = blocks.last_mut()
&& *last_level == *run_level
{
let combined_chars = htext.chars().count() + 1 + text.chars().count();
if combined_chars <= HEADING_MAX_TEXT_CHARS {
let run_level = *run_level;
if debug {
eprintln!(
"[MD heading-wrap-uncond] merge h{} '{}' <- '{}' (prev_idx={} cur_idx={} combined={})",
run_level,
htext.chars().take(40).collect::<String>(),
text.chars().take(60).collect::<String>(),
run_idx,
line_idx,
combined_chars
);
}
append_inline_continuation(htext, text, &collapse_whitespace(text));
heading_run = Some((run_level, line_idx));
continue;
}
}
if let Some(level) = level {
// Same-level line continuing the previous heading: merge it, unless the
// result would be too long, in which case the previous heading is
// demoted to the start of a paragraph and this line falls through to
// the non-heading handling below.
if let Some((run_level, run_idx)) = heading_run.as_ref()
&& *run_level == level
&& continues_heading(&lines[*run_idx], line)
&& let Some(Block::Heading {
level: last_level,
text: htext,
}) = blocks.last_mut()
&& *last_level == level
{
let combined_chars = htext.chars().count() + 1 + text.chars().count();
if combined_chars > HEADING_MAX_TEXT_CHARS {
let demoted = std::mem::take(htext);
blocks.pop();
state.paragraph = Some(ParaAccum {
raw: demoted.clone(),
inline: demoted,
last: lines[*run_idx].clone(),
uniform: None,
});
heading_run = None;
demoted_heading = true;
} else {
append_inline_continuation(htext, text, &collapse_whitespace(text));
heading_run = Some((level, line_idx));
continue;
}
}
if !demoted_heading {
state.flush_paragraph(&mut blocks);
state.reset_list();
if debug {
eprintln!(
"[MD heading-emit size/outline] h{} idx={} '{}' size={:.2}",
level,
line_idx,
text.chars().take(80).collect::<String>(),
line.dominant_font_size,
);
}
blocks.push(Block::Heading {
level,
text: collapse_whitespace(text),
});
heading_run = Some((level, line_idx));
continue;
}
}
if let Some((ordered, marker, rest)) = parse_list_marker(text) {
// An ordered marker on a line that looks like a numbered bold heading
// becomes a heading one level below the deepest size-mapped level.
if ordered
&& !toc_suppress
&& looks_like_numbered_bold_heading(
line,
rest,
state
.paragraph
.as_ref()
.map(|p| &p.last)
.or(state.last_list_line.map(|i| &lines[i])),
)
{
state.flush_paragraph(&mut blocks);
state.reset_list();
let level = (heading_map.len() as u8 + 1).clamp(1, MAX_HEADING_LEVELS as u8);
blocks.push(Block::Heading {
level,
text: collapse_whitespace(text),
});
continue;
}
state.flush_paragraph(&mut blocks);
// Nesting level = indent relative to the list's first item, in steps of
// LIST_INDENT_STEP_PT, never negative.
let base = *state.list_base_indent.get_or_insert(line.indent_x);
let level = (((line.indent_x - base) / LIST_INDENT_STEP_PT)
.round()
.max(0.0)) as u8;
state.last_list_item_idx = Some(blocks.len());
state.last_list_line = Some(line_idx);
let item_text = render_list_item_text(line, &marker, rest);
blocks.push(Block::ListItem {
ordered,
marker,
level,
text: item_text,
bold: false,
italic: false,
});
continue;
}
// A marker-less line that continues the last list line is appended to that item.
if let Some(item_idx) = state.last_list_item_idx
&& let Some(prev_idx) = state.last_list_line
&& continues_paragraph(&lines[prev_idx], line)
&& let Some(Block::ListItem {
text: prev_text, ..
}) = blocks.get_mut(item_idx)
{
let cont_inline = render_line_inline(line);
append_inline_continuation(prev_text, text, &cont_inline);
state.last_list_line = Some(line_idx);
continue;
}
// Bold-heading heuristic, given the previous and the next line as context.
let prev_for_gap = state
.paragraph
.as_ref()
.map(|p| &p.last)
.or(state.last_list_line.map(|i| &lines[i]));
let next_for_gap = lines.get(idx);
if !toc_suppress && looks_like_bold_heading(line, prev_for_gap, next_for_gap) {
state.flush_paragraph(&mut blocks);
state.reset_list();
let level = (heading_map.len() as u8 + 1).clamp(1, MAX_HEADING_LEVELS as u8);
if debug {
eprintln!(
"[MD heading-emit bold] h{} idx={} '{}' size={:.2}",
level,
line_idx,
text.chars().take(80).collect::<String>(),
line.dominant_font_size,
);
}
blocks.push(Block::Heading {
level,
text: collapse_whitespace(text),
});
heading_run = Some((level, line_idx));
continue;
}
// Plain text: extend the open paragraph or start a new one.
match state.paragraph.as_mut() {
Some(acc) if continues_paragraph(&acc.last, line) => {
append_to_paragraph(acc, line);
}
_ => {
state.flush_paragraph(&mut blocks);
state.reset_list();
let inline = render_line_inline(line);
let raw = collapse_whitespace(text);
// A uniform style is recorded only when it is not struck through.
let uniform = line_uniform_style(line)
.filter(|s| !s.strike)
.map(|s| (s.bold, s.italic));
state.paragraph = Some(ParaAccum {
raw,
inline,
last: line.clone(),
uniform,
});
}
}
}
// End of region: flush pending text, then every interruption still queued.
state.flush_paragraph(&mut blocks);
state.flush_code(&mut blocks);
state.emit_before(&mut blocks, &mut interruptions, f32::INFINITY);
blocks
}
/// Classifies a line that consists solely of divider symbols and whitespace.
///
/// Returns `None` when the line has any other character or no symbols at all,
/// `Some(true)` when it has three or more symbols, and `Some(false)` when it
/// has one or two.
fn decorative_divider_kind(text: &str) -> Option<bool> {
    let glyphs = text
        .chars()
        .filter(|ch| !ch.is_whitespace())
        .try_fold(0usize, |seen, ch| {
            let decorative = is_divider_symbol(ch) && !ch.is_alphanumeric();
            decorative.then_some(seen + 1)
        })?;
    match glyphs {
        0 => None,
        n => Some(n >= 3),
    }
}
/// Returns `true` for the punctuation and bullet glyphs that may form a
/// decorative divider line.
fn is_divider_symbol(c: char) -> bool {
    const DIVIDER_SYMBOLS: &[char] = &[
        '*', '-', '_', '–', '—', '•', '·', '●', '▪', '■', '◦', '★', '☆',
    ];
    DIVIDER_SYMBOLS.contains(&c)
}
/// Guesses the language of a code block from simple textual signals.
///
/// The checks run in this order and the first match wins:
///
/// * `"json"` — the first non-whitespace character is `{` or `[` and the text
///   contains `":` somewhere.
/// * `"cpp"` — at least two of the C++ signals occur.
/// * `"python"` — at least two Python signals occur, or one signal plus a line
///   that starts with a block keyword and ends with `:`.
///
/// Returns `None` when nothing matches, including for empty input.
fn detect_code_language(lines: &[String]) -> Option<String> {
    // NOTE(review): any text containing "std::" also contains "::", so that
    // one token already reaches the two-signal threshold. Left as is to keep
    // existing results — confirm whether this weighting is intended.
    const CPP_SIGNALS: &[&str] = &[
        "#include",
        "std::",
        "int main",
        "nullptr",
        "->",
        "::",
        "template<",
    ];
    const PY_SIGNALS: &[&str] = &[
        "self.", "import ", "from ", "def ", "class ", "print(", "lambda ", "elif ", "f'", "f\"",
        "__", " for ", "len(", "sorted(", "range(",
    ];
    const PY_BLOCK_KEYWORDS: &[&str] = &[
        "if ", "for ", "while ", "def ", "class ", "elif ", "else", "try", "except", "with ",
    ];

    let body = lines.join("\n");
    let count_hits =
        |signals: &[&str]| signals.iter().filter(|sig| body.contains(**sig)).count();

    // After `trim_start` the first character, if any, is already
    // non-whitespace, and its presence guarantees at least one bracket in the
    // body, so no further search or bracket count is needed.
    let opens_container = matches!(body.trim_start().chars().next(), Some('{' | '['));
    if opens_container && body.contains("\":") {
        return Some("json".to_string());
    }

    if count_hits(CPP_SIGNALS) >= 2 {
        return Some("cpp".to_string());
    }

    let py_hits = count_hits(PY_SIGNALS);
    let py_colon_block = lines.iter().any(|raw| {
        let stmt = raw.trim();
        stmt.ends_with(':') && PY_BLOCK_KEYWORDS.iter().any(|kw| stmt.starts_with(*kw))
    });
    if py_hits >= 2 || (py_hits >= 1 && py_colon_block) {
        return Some("python".to_string());
    }

    None
}
/// Joins paragraphs that were split across a region boundary.
///
/// `region_starts` holds the index in `blocks` at which each region's output
/// begins. At every start except the first, a paragraph with neither `bold`
/// nor `italic` set is folded into the paragraph directly before it when
/// either
///
/// * `is_soft_hyphen_break` reports a hyphenated break: trailing whitespace and
///   the last remaining character are removed from the earlier text and the
///   later text is appended without a separator; or
/// * the earlier text does not end in closing punctuation and the later text
///   starts with a lowercase character: the two are joined with one space.
///
/// All other blocks are passed through unchanged and in order.
fn stitch_regions(blocks: Vec<Block>, region_starts: &[usize]) -> Vec<Block> {
    /// Tries to fold `cur` into `prev`; returns `true` when `cur` was absorbed.
    fn absorb_continuation(prev: &mut Block, cur: &Block) -> bool {
        let (
            Block::Paragraph { text: head, .. },
            Block::Paragraph {
                text: tail,
                bold: false,
                italic: false,
            },
        ) = (prev, cur)
        else {
            return false;
        };

        if is_soft_hyphen_break(head, tail) {
            // Drop trailing whitespace, then the character before it.
            let kept = head.trim_end().len();
            head.truncate(kept);
            head.pop();
            head.push_str(tail.trim_start());
            return true;
        }

        let ends_closed = head.trim_end().ends_with(|c: char| {
            matches!(
                c,
                '.' | '!' | '?' | ':' | ';' | '”' | '"' | ')' | ']' | '。' | '』' | '」'
            )
        });
        let starts_lower = tail
            .trim_start()
            .chars()
            .next()
            .is_some_and(|c| c.is_lowercase());
        if ends_closed || !starts_lower {
            return false;
        }
        head.push(' ');
        head.push_str(tail.trim_start());
        true
    }

    if region_starts.len() <= 1 {
        return blocks;
    }
    // The first region has nothing before it to join with.
    let joinable_starts: std::collections::HashSet<usize> =
        region_starts[1..].iter().copied().collect();

    let mut out: Vec<Block> = Vec::with_capacity(blocks.len());
    for (idx, block) in blocks.into_iter().enumerate() {
        let absorbed = joinable_starts.contains(&idx)
            && out
                .last_mut()
                .is_some_and(|prev| absorb_continuation(prev, &block));
        if !absorbed {
            out.push(block);
        }
    }
    out
}
// Unit tests for page classification. Pages are built with the helpers from
// `test_helpers` and classified through `classify_page` or
// `classify_page_with_filters`.
#[cfg(test)]
mod tests {
use super::super::blocks::{Block, render_blocks};
use super::super::headings::{build_heading_map, compute_body_size};
use super::super::repetition::compute_header_footer_set;
use super::super::test_helpers::{
header_footer_page, line, mono_line, page, page_with_graphics, stroke, styled_line,
};
use super::*;
use crate::types::TextItem;
// A title line followed by body lines yields a level-1 heading, a paragraph
// whose hyphenated line break has been healed, and a second paragraph.
#[test]
fn classify_emits_heading_and_paragraph() {
let p = page(vec![
line("Title of the document goes here", 50.0, 50.0, 18.0, 18.0),
line("First sentence of the para-", 50.0, 80.0, 10.0, 10.0),
line("graph continues here.", 50.0, 92.0, 10.0, 10.0),
line("Another paragraph.", 50.0, 130.0, 10.0, 10.0),
]);
let pages = vec![p];
let body = compute_body_size(&pages);
let map = build_heading_map(&pages, body);
let blocks = classify_page(&pages[0], &map);
assert_eq!(blocks.len(), 3);
match &blocks[0] {
Block::Heading { level, text } => {
assert_eq!(*level, 1);
assert_eq!(text, "Title of the document goes here");
}
other => panic!("expected heading, got {other:?}"),
}
match &blocks[1] {
Block::Paragraph { text: t, .. } => {
assert!(t.contains("paragraph continues"), "got: {t}");
assert!(!t.contains("para-"), "de-hyphenation failed: {t}");
}
other => panic!("expected paragraph, got {other:?}"),
}
match &blocks[2] {
Block::Paragraph { text: t, .. } => assert_eq!(t, "Another paragraph."),
other => panic!("expected paragraph, got {other:?}"),
}
}
// Two lines far apart vertically end up as two separate blocks.
#[test]
fn paragraph_break_on_big_gap() {
let p = page(vec![
line("Line A.", 50.0, 80.0, 10.0, 10.0),
line("Line B.", 50.0, 200.0, 10.0, 10.0),
]);
let pages = vec![p];
let body = compute_body_size(&pages);
let map = build_heading_map(&pages, body);
let blocks = classify_page(&pages[0], &map);
assert_eq!(blocks.len(), 2);
}
// Bulleted lines become list items with the marker stripped; the more
// deeply indented bullet gets nesting level 1.
#[test]
fn classify_emits_list_items() {
let p = page(vec![
line("Intro line.", 50.0, 50.0, 10.0, 10.0),
line("• first bullet", 60.0, 80.0, 10.0, 10.0),
line("• second bullet", 60.0, 92.0, 10.0, 10.0),
line("◦ nested item", 72.0, 104.0, 10.0, 10.0),
line("• back to top", 60.0, 116.0, 10.0, 10.0),
]);
let pages = vec![p];
let body = compute_body_size(&pages);
let map = build_heading_map(&pages, body);
let blocks = classify_page(&pages[0], &map);
let list_items: Vec<&Block> = blocks
.iter()
.filter(|b| matches!(b, Block::ListItem { .. }))
.collect();
assert_eq!(list_items.len(), 4);
if let Block::ListItem { level, text, .. } = list_items[0] {
assert_eq!(*level, 0);
assert_eq!(text, "first bullet");
} else {
panic!();
}
if let Block::ListItem { level, .. } = list_items[2] {
assert_eq!(*level, 1);
} else {
panic!();
}
}
// Consecutive monospace lines form one code block between the surrounding
// paragraphs, and leading indentation is kept in the rendered output.
#[test]
fn classify_emits_code_block() {
let p = page(vec![
line("Intro line.", 50.0, 50.0, 10.0, 10.0),
mono_line("    let x = 1;", 80.0),
mono_line("    let y = x + 2;", 92.0),
line("After the code.", 50.0, 120.0, 10.0, 10.0),
]);
let pages = vec![p];
let body = compute_body_size(&pages);
let map = build_heading_map(&pages, body);
let blocks = classify_page(&pages[0], &map);
assert_eq!(blocks.len(), 3);
match &blocks[1] {
Block::CodeBlock { lines, .. } => {
assert_eq!(lines.len(), 2);
assert!(lines[0].contains("let x = 1;"));
assert!(lines[1].contains("let y = x + 2;"));
}
other => panic!("expected code block, got {other:?}"),
}
let s = render_blocks(&blocks);
assert!(s.contains("```\n    let x = 1;"));
assert!(s.ends_with("After the code."));
}
// Language guessing: Python by signal count, Python by keyword-plus-colon
// line, JSON, C++, and no guess for snippets without known signals.
#[test]
fn detect_code_language_classifies_common_langs() {
let py = vec![
"self.mm_list = sorted([x for x in self.files_list])".to_string(),
"self.mm_total = len(self.mm_list)".to_string(),
];
assert_eq!(detect_code_language(&py).as_deref(), Some("python"));
let py_block = vec![
"if item.total > 0:".to_string(),
"    print('many')".to_string(),
];
assert_eq!(detect_code_language(&py_block).as_deref(), Some("python"));
let json = vec![
"{".to_string(),
"  \"formatVersion\": \"1.0\",".to_string(),
"}".to_string(),
];
assert_eq!(detect_code_language(&json).as_deref(), Some("json"));
let cpp = vec![
"#include <vector>".to_string(),
"std::vector<int> v;".to_string(),
];
assert_eq!(detect_code_language(&cpp).as_deref(), Some("cpp"));
let unknown = vec!["let x = 1;".to_string(), "let y = x + 2;".to_string()];
assert_eq!(detect_code_language(&unknown), None);
}
// When every line of a paragraph is bold, the paragraph block itself is
// marked bold (not italic) and renders wrapped in `**`.
#[test]
fn classify_marks_paragraph_bold_when_all_lines_bold() {
let mut a = line("Bold line one.", 50.0, 50.0, 10.0, 10.0);
let mut b = line("bold continuation.", 50.0, 62.0, 10.0, 10.0);
let bold_span = TextItem {
text: "x".into(),
font_name: Some("Arial-Bold".into()),
..Default::default()
};
a.spans = vec![bold_span.clone()];
b.spans = vec![bold_span];
a.all_bold = true;
b.all_bold = true;
let p = page(vec![a, b]);
let pages = vec![p];
let body = compute_body_size(&pages);
let map = build_heading_map(&pages, body);
let blocks = classify_page(&pages[0], &map);
assert_eq!(blocks.len(), 1);
match &blocks[0] {
Block::Paragraph { bold, italic, .. } => {
assert!(*bold);
assert!(!*italic);
}
other => panic!("expected paragraph, got {other:?}"),
}
let s = render_blocks(&blocks);
assert!(s.starts_with("**") && s.ends_with("**"), "got: {s}");
}
// Three lines with spans aligned in three columns are detected as a single
// header-less table with three rows.
#[test]
fn detects_simple_borderless_table() {
use super::super::test_helpers::line_with_spans;
let lines = vec![
line_with_spans(
&[("Name", 50.0), ("Age", 150.0), ("City", 250.0)],
100.0,
10.0,
),
line_with_spans(
&[("Alice", 50.0), ("30", 150.0), ("NYC", 250.0)],
115.0,
10.0,
),
line_with_spans(&[("Bob", 50.0), ("25", 150.0), ("LA", 250.0)], 130.0, 10.0),
];
let p = page(lines);
let pages = vec![p];
let body = compute_body_size(&pages);
let map = build_heading_map(&pages, body);
let blocks = classify_page(&pages[0], &map);
assert_eq!(blocks.len(), 1, "got: {blocks:?}");
match &blocks[0] {
Block::Table { header, rows } => {
assert!(header.is_none());
assert_eq!(rows.len(), 3);
assert_eq!(rows[0][0], "Name");
assert_eq!(rows[1][2], "NYC");
}
other => panic!("expected table, got {other:?}"),
}
}
// With the header/footer set computed over both pages, the repeated header
// and the page-number footer are removed while the body text is kept.
#[test]
fn full_format_strips_header_footer() {
let pages = vec![
header_footer_page(1, "Acme Confidential", "Page 1 of 2", "First page body."),
header_footer_page(2, "Acme Confidential", "Page 2 of 2", "Second page body."),
];
let body = compute_body_size(&pages);
let map = build_heading_map(&pages, body);
let set = compute_header_footer_set(&pages);
let blocks = classify_page_with_filters(
&pages[0],
&map,
&set,
&[],
crate::config::ImageMode::Placeholder,
&std::collections::HashSet::new(),
);
let s = render_blocks(&blocks);
assert!(!s.contains("Acme Confidential"), "got: {s}");
assert!(!s.contains("Page 1 of 2"), "got: {s}");
assert!(s.contains("First page body."));
}
// A bold span in the middle of a line is rendered inline as `**bold**`
// without marking the whole paragraph bold.
#[test]
fn classify_paragraph_with_mid_line_bold() {
let a = styled_line(
&[
("a sentence with a", 50.0, Some("Arial")),
("bold", 180.0, Some("Arial-Bold")),
("word in it.", 230.0, Some("Arial")),
],
50.0,
10.0,
);
let p = page(vec![a]);
let pages = vec![p];
let body = compute_body_size(&pages);
let map = build_heading_map(&pages, body);
let blocks = classify_page(&pages[0], &map);
assert_eq!(blocks.len(), 1, "got: {blocks:?}");
match &blocks[0] {
Block::Paragraph { text, bold, italic } => {
assert!(!*bold, "mixed-style paragraph shouldn't set block bold");
assert!(!*italic);
assert!(text.contains("**bold**"), "got: {text}");
}
other => panic!("expected paragraph, got {other:?}"),
}
}
// A bold bullet line becomes a list item whose text carries the emphasis
// but not the bullet marker.
#[test]
fn classify_list_item_strips_marker_under_emphasis() {
let l = styled_line(
&[
("•", 60.0, Some("Arial-Bold")),
("important item", 80.0, Some("Arial-Bold")),
],
50.0,
10.0,
);
let p = page(vec![l]);
let pages = vec![p];
let body = compute_body_size(&pages);
let map = build_heading_map(&pages, body);
let blocks = classify_page(&pages[0], &map);
assert_eq!(blocks.len(), 1);
match &blocks[0] {
Block::ListItem { text, .. } => {
assert_eq!(text, "**important item**");
}
other => panic!("expected list item, got {other:?}"),
}
}
// A stroke drawn between two text lines produces a horizontal-rule block
// that is neither the first nor the last block.
#[test]
fn hr_emitted_between_lines_by_y_order() {
let lines = vec![
line("before the rule", 50.0, 100.0, 10.0, 10.0),
line("after the rule", 50.0, 300.0, 10.0, 10.0),
];
let p = page_with_graphics(lines, vec![stroke(50.0, 200.0, 450.0, 200.0, 0.5)]);
let blocks = classify_page(&p, &[]);
let has_hr = blocks
.iter()
.position(|b| matches!(b, Block::HorizontalRule));
assert!(has_hr.is_some(), "expected an HR block, got {blocks:?}");
let pos = has_hr.unwrap();
assert!(pos > 0 && pos < blocks.len() - 1);
}
}