use crate::config::ImageMode;
use crate::markdown_layout::{
build_heading_map, classify_page_with_filters, compute_body_size, compute_header_footer_set,
detect_single_page_chrome, render_blocks,
};
use crate::types::{OutlineTarget, ParsedPage};
pub fn format_markdown(
pages: &[ParsedPage],
outline: &[OutlineTarget],
image_mode: ImageMode,
) -> String {
format_markdown_pages(pages, outline, image_mode).join("\n\n-----\n\n")
}
pub fn format_markdown_pages(
pages: &[ParsedPage],
outline: &[OutlineTarget],
image_mode: ImageMode,
) -> Vec<String> {
if pages.is_empty() {
return Vec::new();
}
let body_size = compute_body_size(pages);
let heading_map = build_heading_map(pages, body_size);
let header_footer = compute_header_footer_set(pages);
pages
.iter()
.map(|page| {
if page.projected_lines.is_empty() {
let mut out = String::from("```text\n");
out.push_str(&page.text);
if !page.text.ends_with('\n') {
out.push('\n');
}
out.push_str("```");
return out;
}
let target_index = (page.page_number as i32).saturating_sub(1);
let page_outline: Vec<OutlineTarget> = outline
.iter()
.filter(|e| e.page_index == target_index)
.cloned()
.collect();
let chrome_indices = detect_single_page_chrome(page, body_size);
let mut blocks = classify_page_with_filters(
page,
&heading_map,
&header_footer,
&page_outline,
image_mode,
&chrome_indices,
);
let has_content = blocks.iter().any(|b| {
!matches!(
b,
crate::markdown_layout::Block::HorizontalRule
| crate::markdown_layout::Block::Figure { .. }
)
});
if !has_content {
blocks.retain(|b| !matches!(b, crate::markdown_layout::Block::HorizontalRule));
}
dedupe_rules(&mut blocks);
render_blocks(&blocks)
})
.collect()
}
fn dedupe_rules(blocks: &mut Vec<crate::markdown_layout::Block>) {
use crate::markdown_layout::Block::HorizontalRule;
while matches!(blocks.first(), Some(HorizontalRule)) {
blocks.remove(0);
}
while matches!(blocks.last(), Some(HorizontalRule)) {
blocks.pop();
}
blocks.dedup_by(|a, b| matches!((a, b), (HorizontalRule, HorizontalRule)));
}
#[cfg(test)]
mod tests {
use super::*;
use crate::types::{Anchor, ProjectedLine, Rect, TextItem};
fn line(text: &str, x: f32, y: f32, h: f32, size: f32) -> ProjectedLine {
ProjectedLine {
text: text.into(),
bbox: Rect {
x,
y,
width: text.chars().count() as f32 * (size * 0.5),
height: h,
},
anchor: Anchor::Left,
indent_x: x,
dominant_font_size: size,
font_size_is_estimated: false,
heading_font_size: None,
dominant_font_name: Some("Arial".into()),
all_bold: false,
all_italic: false,
all_mono: false,
all_strike: false,
spans: vec![TextItem::default()],
region_path: Vec::new(),
mcid: None,
in_figure: false,
}
}
fn page_with(n: usize, lines: Vec<ProjectedLine>) -> ParsedPage {
ParsedPage {
page_number: n,
page_width: 612.0,
page_height: 792.0,
text: "fallback".into(),
markdown: String::new(),
text_items: vec![],
projected_lines: lines,
regions: crate::types::Region::default(),
graphics: vec![],
figures: vec![],
struct_nodes: vec![],
image_refs: vec![],
}
}
#[test]
fn test_empty() {
assert_eq!(format_markdown(&[], &[], ImageMode::Placeholder), "");
}
#[test]
fn dedupe_rules_drops_edges_and_collapses_runs() {
use crate::markdown_layout::Block::{self, HorizontalRule, Paragraph};
let p = |t: &str| Paragraph {
text: t.into(),
bold: false,
italic: false,
};
let mut blocks = vec![
HorizontalRule,
p("a"),
HorizontalRule,
HorizontalRule,
p("b"),
HorizontalRule,
];
dedupe_rules(&mut blocks);
let kinds: Vec<bool> = blocks
.iter()
.map(|b| matches!(b, Block::HorizontalRule))
.collect();
assert_eq!(kinds, vec![false, true, false]);
}
#[test]
fn test_fallback_when_no_projected_lines() {
let p = ParsedPage {
page_number: 1,
page_width: 0.0,
page_height: 0.0,
text: "hello".into(),
markdown: String::new(),
text_items: vec![],
projected_lines: vec![],
regions: crate::types::Region::default(),
graphics: vec![],
figures: vec![],
struct_nodes: vec![],
image_refs: vec![],
};
let out = format_markdown(&[p], &[], ImageMode::Placeholder);
assert!(out.contains("```text"));
assert!(out.contains("hello"));
}
#[test]
fn test_heading_and_paragraph() {
let p = page_with(
1,
vec![
line("My Title For This Test Document", 50.0, 50.0, 18.0, 18.0),
line("First sentence of body prose here.", 50.0, 80.0, 10.0, 10.0),
line(
"Second sentence of body prose here.",
50.0,
92.0,
10.0,
10.0,
),
line(
"Third sentence of body prose here.",
50.0,
104.0,
10.0,
10.0,
),
],
);
let out = format_markdown(&[p], &[], ImageMode::Placeholder);
assert!(out.contains("# My Title For This Test Document"));
assert!(out.contains("First sentence of body prose here."));
}
#[test]
fn test_multi_page_separator() {
let a = page_with(1, vec![line("A page.", 50.0, 80.0, 10.0, 10.0)]);
let b = page_with(2, vec![line("B page.", 50.0, 80.0, 10.0, 10.0)]);
let out = format_markdown(&[a, b], &[], ImageMode::Placeholder);
assert!(out.contains("-----"));
assert!(out.find("A page.").unwrap() < out.find("B page.").unwrap());
}
#[test]
fn test_per_page_matches_joined_document() {
let a = page_with(1, vec![line("A page.", 50.0, 80.0, 10.0, 10.0)]);
let b = page_with(2, vec![line("B page.", 50.0, 80.0, 10.0, 10.0)]);
let pages = [a, b];
let per_page = format_markdown_pages(&pages, &[], ImageMode::Placeholder);
assert_eq!(per_page.len(), 2);
assert!(per_page[0].contains("A page."));
assert!(per_page[1].contains("B page."));
assert!(!per_page[0].contains("-----"));
assert_eq!(
format_markdown(&pages, &[], ImageMode::Placeholder),
per_page.join("\n\n-----\n\n")
);
}
}