use std::borrow::Cow;
use crate::pdf::hierarchy::SegmentData;
use super::lines::needs_space_between;
use super::types::{LayoutHintClass, PdfLine, PdfParagraph};
pub(crate) fn render_paragraph_to_output(para: &PdfParagraph, output: &mut String) {
if let Some(level) = para.heading_level {
let prefix = "#".repeat(level as usize);
let joined = join_line_texts(¶.lines);
let text = escape_html_entities(&joined);
output.push_str(&prefix);
output.push(' ');
output.push_str(&text);
} else if para.is_code_block {
output.push_str("```\n");
for line in ¶.lines {
let line_text = line
.segments
.iter()
.map(|s| s.text.as_str())
.collect::<Vec<_>>()
.join(" ");
let line_text = collapse_inner_spaces(&line_text);
output.push_str(&line_text);
output.push('\n');
}
output.push_str("```");
} else if para.is_formula {
let text = join_line_texts(¶.lines);
output.push_str("$$\n");
output.push_str(&text);
output.push_str("\n$$");
} else if para.is_list_item {
let text = render_paragraph_with_inline_markup(para);
let normalized = normalize_list_prefix(&text);
output.push_str(&normalized);
} else if matches!(para.layout_class, Some(LayoutHintClass::Caption)) {
let joined = join_line_texts(¶.lines);
let text = escape_html_entities(&joined);
let escaped = text.replace('*', "\\*");
output.push('*');
output.push_str(&escaped);
output.push('*');
} else {
let text = render_paragraph_with_inline_markup(para);
output.push_str(&text);
}
}
/// Renders every paragraph and joins the results with a blank line.
///
/// The separator is only inserted once something has actually been written, so leading
/// paragraphs that render to nothing do not produce leading blank lines.
// No caller is visible in this file; the `allow` keeps the build free of unused warnings.
#[allow(dead_code)]
pub(crate) fn render_paragraphs_to_string(paragraphs: &[PdfParagraph]) -> String {
    paragraphs.iter().fold(String::new(), |mut rendered, paragraph| {
        if !rendered.is_empty() {
            rendered.push_str("\n\n");
        }
        render_paragraph_to_output(paragraph, &mut rendered);
        rendered
    })
}
pub fn inject_image_placeholders(markdown: &str, images: &[crate::types::ExtractedImage]) -> String {
if images.is_empty() {
return markdown.to_string();
}
let mut images_by_page: std::collections::BTreeMap<usize, Vec<(usize, &crate::types::ExtractedImage)>> =
std::collections::BTreeMap::new();
for (idx, img) in images.iter().enumerate() {
let page = img.page_number.unwrap_or(0);
images_by_page.entry(page).or_default().push((idx, img));
}
let mut result = markdown.to_string();
for (&page, page_images) in &images_by_page {
for (_idx, img) in page_images {
let ii = img.image_index;
let label = if page > 0 {
format!("", ii, page, page, ii)
} else {
format!("", ii, ii)
};
result.push_str("\n\n");
result.push_str(&label);
if let Some(ref ocr) = img.ocr_result {
let text = ocr.content.trim();
if !text.is_empty() {
result.push_str(&format!("\n> *Image text: {}*", text));
}
}
}
}
result
}
/// Rewrites the leading list marker of `text` into Markdown form.
///
/// Leading whitespace is always dropped. After that:
/// - a typographic bullet or dash glyph, or `"* "`, is replaced by `"- "`;
/// - text already starting with `"- "` is returned as is;
/// - text starting with ASCII digits followed by `.` or `)` is returned as is;
/// - anything else gets `"- "` put in front of it.
fn normalize_list_prefix(text: &str) -> String {
    // Single-character markers that stand in for a bullet. None of them is ASCII, so this
    // check cannot overlap with the `"* "` / `"- "` checks below.
    const GLYPH_MARKERS: &[char] = &[
        '\u{2022}', '\u{00B7}', '–', '—', '−', '‐', '‑', '‒', '―', '➤', '►', '▶', '○', '●', '◦',
    ];

    let body = text.trim_start();

    if let Some(rest) = body.strip_prefix(GLYPH_MARKERS) {
        return format!("- {}", rest.trim_start());
    }
    if let Some(rest) = body.strip_prefix("* ") {
        return format!("- {}", rest.trim_start());
    }
    if body.starts_with("- ") {
        return body.to_owned();
    }

    // Numbered item: at least one digit, directly followed by `.` or `)`.
    let digit_count = body.bytes().take_while(u8::is_ascii_digit).count();
    let is_numbered = digit_count > 0 && matches!(body.as_bytes().get(digit_count), Some(b'.' | b')'));
    if is_numbered {
        body.to_owned()
    } else {
        format!("- {body}")
    }
}
/// Joins the text of all lines into one string, word by word.
///
/// Every segment is split on whitespace and the words of all lines are treated as one
/// sequence. Between two neighbouring words:
/// - if `should_dehyphenate` accepts the pair, the trailing hyphen of the first word is
///   dropped and the words are glued together;
/// - otherwise a single space is inserted when `needs_space_between` asks for one.
///
/// Returns an empty string when there are no words at all.
fn join_line_texts(lines: &[PdfLine]) -> String {
    let mut words = lines
        .iter()
        .flat_map(|line| &line.segments)
        .flat_map(|segment| segment.text.split_whitespace());

    let Some(first) = words.next() else {
        return String::new();
    };

    let mut joined = String::from(first);
    let mut previous = first;
    for word in words {
        if should_dehyphenate(previous, word) {
            // `previous` ends in '-', which is the last character written so far.
            joined.pop();
        } else if needs_space_between(previous, word) {
            joined.push(' ');
        }
        joined.push_str(word);
        previous = word;
    }
    joined
}
/// Test-only entry point for the word-joining rules.
///
/// Delegates to `join_texts_cjk_aware_line_aware`, so the unit tests exercise the same
/// dehyphenation and spacing logic that paragraph rendering uses rather than a copy of it.
#[cfg(test)]
fn join_texts_cjk_aware(texts: &[&str]) -> String {
    join_texts_cjk_aware_line_aware(texts)
}
/// Decides whether `prev` + `next` is one word that was hyphenated at a line break.
///
/// True only when all of the following hold:
/// - `prev` ends with `-`;
/// - the character right before that hyphen is alphabetic (so a lone `-` never qualifies);
/// - `next` starts with a lowercase character.
fn should_dehyphenate(prev: &str, next: &str) -> bool {
    let Some(stem) = prev.strip_suffix('-') else {
        return false;
    };
    let stem_ends_with_letter = stem.chars().next_back().is_some_and(char::is_alphabetic);
    let continues_in_lowercase = next.chars().next().is_some_and(char::is_lowercase);
    stem_ends_with_letter && continues_in_lowercase
}
/// Escapes the characters that would otherwise be read as HTML inside Markdown.
///
/// `&` becomes `&amp;`, `<` becomes `&lt;` and `>` becomes `&gt;`. Every other character,
/// including `_`, is passed through untouched.
///
/// Input that contains none of the three characters is returned borrowed, without
/// allocating. Input is escaped unconditionally: text that already contains an entity such
/// as `&amp;` has its ampersand escaped again.
pub(in crate::pdf::markdown) fn escape_html_entities(text: &str) -> Cow<'_, str> {
    if !text.contains(|c: char| matches!(c, '&' | '<' | '>')) {
        return Cow::Borrowed(text);
    }

    // Slight over-allocation so a handful of replacements does not force a regrow.
    let mut result = String::with_capacity(text.len() + 16);
    for ch in text.chars() {
        match ch {
            '&' => result.push_str("&amp;"),
            '<' => result.push_str("&lt;"),
            '>' => result.push_str("&gt;"),
            _ => result.push(ch),
        }
    }
    Cow::Owned(result)
}
/// Collapses runs of ASCII spaces inside `line` to a single space.
///
/// Leading spaces are kept exactly as they are. Only the space character is affected;
/// tabs and other whitespace are copied through unchanged.
fn collapse_inner_spaces(line: &str) -> String {
    let body = line.trim_start_matches(' ');
    let indent = &line[..line.len() - body.len()];

    let mut collapsed = String::with_capacity(line.len());
    collapsed.push_str(indent);

    let mut last_was_space = false;
    for ch in body.chars() {
        let is_space = ch == ' ';
        // Drop a space only when it directly follows another space.
        if !(is_space && last_was_space) {
            collapsed.push(ch);
        }
        last_was_space = is_space;
    }
    collapsed
}
/// Renders a paragraph's text with bold/italic markup and HTML-escapes the result.
///
/// The segments of all lines are treated as one continuous sequence, so emphasis runs and
/// dehyphenation carry across line breaks.
fn render_paragraph_with_inline_markup(para: &PdfParagraph) -> String {
    let mut segment_refs: Vec<&SegmentData> = Vec::new();
    for line in &para.lines {
        segment_refs.extend(line.segments.iter());
    }
    let marked_up = render_segment_refs_with_markup(&segment_refs);
    escape_html_entities(&marked_up).into_owned()
}
/// Renders segments as text with Markdown emphasis.
///
/// Consecutive segments sharing the same bold/italic flags form one run. Each run's words
/// are joined and wrapped in `***` (bold + italic), `**` (bold) or `*` (italic); unstyled
/// runs are emitted bare.
///
/// Between two runs, the last word of the earlier run and the first word of the later run
/// decide the separator: a hyphenated line-break split is glued back together (the hyphen
/// is removed), otherwise a space is inserted when `needs_space_between` asks for one.
///
/// Runs that contain no words (for example a whitespace-only segment) are skipped
/// entirely, so they neither emit empty emphasis markers nor hide the real previous word
/// from the separator logic.
fn render_segment_refs_with_markup(segments: &[&SegmentData]) -> String {
    let mut result = String::new();
    // Last word of the most recent run that produced text, if any.
    let mut prev_last_word: Option<&str> = None;
    // Byte offset in `result` where that run's text ended, i.e. just before its closing
    // emphasis marker. Needed to find the hyphen when dehyphenating across runs.
    let mut prev_text_end = 0usize;

    let mut i = 0;
    while i < segments.len() {
        let bold = segments[i].is_bold;
        let italic = segments[i].is_italic;
        let run_start = i;
        while i < segments.len() && segments[i].is_bold == bold && segments[i].is_italic == italic {
            i += 1;
        }

        let run_words: Vec<&str> = segments[run_start..i]
            .iter()
            .flat_map(|seg| seg.text.split_whitespace())
            .collect();
        let (Some(&first_word), Some(&last_word)) = (run_words.first(), run_words.last()) else {
            continue;
        };

        if let Some(prev_word) = prev_last_word {
            if should_dehyphenate(prev_word, first_word) {
                // The previous run's text ends with the ASCII '-' of `prev_word`. It sits
                // right before the closing marker, so popping the last character of
                // `result` would damage the marker instead of removing the hyphen.
                result.remove(prev_text_end - 1);
            } else if needs_space_between(prev_word, first_word) {
                result.push(' ');
            }
        }

        let marker = match (bold, italic) {
            (true, true) => "***",
            (true, false) => "**",
            (false, true) => "*",
            (false, false) => "",
        };
        result.push_str(marker);
        result.push_str(&join_texts_cjk_aware_line_aware(&run_words));
        prev_text_end = result.len();
        result.push_str(marker);
        prev_last_word = Some(last_word);
    }
    result
}
/// Joins `texts` into one string, deciding the separator for each neighbouring pair.
///
/// For every pair, in order:
/// - if `should_dehyphenate` accepts it, the last character written so far (the trailing
///   hyphen) is dropped and the next text is appended directly;
/// - otherwise a space is inserted only when `needs_space_between` asks for one.
///
/// Returns an empty string for an empty slice.
fn join_texts_cjk_aware_line_aware(texts: &[&str]) -> String {
    let Some((&head, tail)) = texts.split_first() else {
        return String::new();
    };

    let mut joined = String::from(head);
    let mut previous = head;
    for &current in tail {
        if should_dehyphenate(previous, current) {
            joined.pop();
        } else if needs_space_between(previous, current) {
            joined.push(' ');
        }
        joined.push_str(current);
        previous = current;
    }
    joined
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a segment with the given text and style flags; geometry is fixed filler.
    fn make_segment(text: &str, is_bold: bool, is_italic: bool) -> SegmentData {
        SegmentData {
            text: text.to_string(),
            x: 0.0,
            y: 0.0,
            width: 0.0,
            height: 12.0,
            font_size: 12.0,
            is_bold,
            is_italic,
            is_monospace: false,
            baseline_y: 700.0,
        }
    }

    /// Wraps segments in a line with fixed filler metrics.
    fn make_line(segments: Vec<SegmentData>) -> PdfLine {
        PdfLine {
            segments,
            baseline_y: 700.0,
            dominant_font_size: 12.0,
            is_bold: false,
            is_monospace: false,
        }
    }

    /// A line consisting of a single unstyled segment.
    fn plain_line(text: &str) -> PdfLine {
        make_line(vec![make_segment(text, false, false)])
    }

    /// A 12pt body paragraph with no classification; tests override single fields with
    /// struct-update syntax.
    fn make_para(lines: Vec<PdfLine>) -> PdfParagraph {
        PdfParagraph {
            lines,
            dominant_font_size: 12.0,
            heading_level: None,
            is_bold: false,
            is_list_item: false,
            is_code_block: false,
            is_formula: false,
            is_page_furniture: false,
            layout_class: None,
            caption_for: None,
            block_bbox: None,
        }
    }

    /// Renders one paragraph into a fresh string.
    fn render(para: &PdfParagraph) -> String {
        let mut output = String::new();
        render_paragraph_to_output(para, &mut output);
        output
    }

    /// Renders an unstyled body paragraph given as lines of segment texts.
    fn render_multiline_para(lines: Vec<Vec<&str>>) -> String {
        let pdf_lines: Vec<PdfLine> = lines
            .into_iter()
            .map(|texts| {
                let segments = texts.into_iter().map(|t| make_segment(t, false, false)).collect();
                make_line(segments)
            })
            .collect();
        render(&make_para(pdf_lines))
    }

    #[test]
    fn test_render_plain_paragraph() {
        let para = make_para(vec![make_line(vec![
            make_segment("Hello", false, false),
            make_segment("world", false, false),
        ])]);
        assert_eq!(render(&para), "Hello world");
    }

    #[test]
    fn test_render_heading() {
        let para = PdfParagraph {
            dominant_font_size: 18.0,
            heading_level: Some(2),
            ..make_para(vec![plain_line("Title")])
        };
        assert_eq!(render(&para), "## Title");
    }

    #[test]
    fn test_render_bold_markup() {
        let para = make_para(vec![make_line(vec![
            make_segment("bold", true, false),
            make_segment("text", true, false),
        ])]);
        assert_eq!(render(&para), "**bold text**");
    }

    #[test]
    fn test_render_italic_markup() {
        let para = make_para(vec![make_line(vec![make_segment("italic", false, true)])]);
        assert_eq!(render(&para), "*italic*");
    }

    #[test]
    fn test_render_bold_italic_markup() {
        let para = make_para(vec![make_line(vec![make_segment("both", true, true)])]);
        assert_eq!(render(&para), "***both***");
    }

    #[test]
    fn test_render_mixed_formatting() {
        let para = make_para(vec![make_line(vec![
            make_segment("normal", false, false),
            make_segment("bold", true, false),
            make_segment("normal2", false, false),
        ])]);
        assert_eq!(render(&para), "normal **bold** normal2");
    }

    #[test]
    fn test_inject_image_placeholders_empty() {
        let result = inject_image_placeholders("Hello", &[]);
        assert_eq!(result, "Hello");
    }

    #[test]
    fn test_render_multiword_segments_no_double_space() {
        let para = make_para(vec![make_line(vec![
            make_segment("hello ", false, false),
            make_segment("world", false, false),
        ])]);
        assert_eq!(render(&para), "hello world");
    }

    #[test]
    fn test_render_mixed_formatting_multiword() {
        let para = make_para(vec![make_line(vec![
            make_segment("normal text", false, false),
            make_segment("bold text", true, false),
            make_segment("more normal", false, false),
        ])]);
        assert_eq!(render(&para), "normal text **bold text** more normal");
    }

    #[test]
    fn test_dehyphenate_basic() {
        let words = vec!["syl-", "lable"];
        assert_eq!(join_texts_cjk_aware(&words), "syllable");
    }

    #[test]
    fn test_dehyphenate_in_paragraph() {
        let para = make_para(vec![plain_line("The neglect-"), plain_line("ed buildings are old.")]);
        assert_eq!(render(&para), "The neglected buildings are old.");
    }

    #[test]
    fn test_no_dehyphenate_uppercase_next() {
        let words = vec!["word-", "The"];
        assert_eq!(join_texts_cjk_aware(&words), "word- The");
    }

    #[test]
    fn test_no_dehyphenate_standalone_hyphen() {
        let words = vec!["-", "word"];
        assert_eq!(join_texts_cjk_aware(&words), "- word");
    }

    #[test]
    fn test_no_dehyphenate_number_suffix() {
        let words = vec!["item-", "3"];
        assert_eq!(join_texts_cjk_aware(&words), "item- 3");
    }

    #[test]
    fn test_heading_multiword_segments() {
        let para = PdfParagraph {
            dominant_font_size: 18.0,
            heading_level: Some(1),
            ..make_para(vec![make_line(vec![
                make_segment("Chapter One", false, false),
                make_segment("Title", false, false),
            ])])
        };
        assert_eq!(render(&para), "# Chapter One Title");
    }

    #[test]
    fn test_escape_html_entities_ampersand() {
        assert_eq!(escape_html_entities("a & b"), "a &amp; b");
    }

    #[test]
    fn test_escape_html_entities_lt_gt() {
        assert_eq!(escape_html_entities("a < b > c"), "a &lt; b &gt; c");
    }

    #[test]
    fn test_escape_html_entities_no_double_escape() {
        // Each raw character is escaped exactly once in a single pass.
        assert_eq!(escape_html_entities("a & b < c"), "a &amp; b &lt; c");
    }

    #[test]
    fn test_escape_html_entities_underscore_preserved() {
        assert_eq!(escape_html_entities("foo_bar"), "foo_bar");
        assert_eq!(escape_html_entities("CTC_ARP_01"), "CTC_ARP_01");
    }

    #[test]
    fn test_escape_html_entities_url_preserves_underscore() {
        assert_eq!(
            escape_html_entities("https://example.com/foo_bar"),
            "https://example.com/foo_bar"
        );
    }

    #[test]
    fn test_render_paragraph_html_entities_escaped() {
        let para = make_para(vec![plain_line("a & b < c > d")]);
        assert_eq!(render(&para), "a &amp; b &lt; c &gt; d");
    }

    #[test]
    fn test_render_code_block_no_html_escaping() {
        let para = PdfParagraph {
            is_code_block: true,
            ..make_para(vec![plain_line("a & b < c")])
        };
        assert_eq!(render(&para), "```\na & b < c\n```");
    }

    #[test]
    fn test_render_formula_no_html_escaping() {
        let para = PdfParagraph {
            is_formula: true,
            ..make_para(vec![plain_line("x < y & z > w")])
        };
        assert_eq!(render(&para), "$$\nx < y & z > w\n$$");
    }

    #[test]
    fn test_escape_html_entities_basic() {
        assert_eq!(escape_html_entities("a & b"), "a &amp; b");
        assert_eq!(escape_html_entities("x < y"), "x &lt; y");
        assert_eq!(escape_html_entities("p > q"), "p &gt; q");
        assert_eq!(escape_html_entities("a & b < c > d"), "a &amp; b &lt; c &gt; d");
    }

    #[test]
    fn test_escape_underscores_not_escaped() {
        assert_eq!(escape_html_entities("foo_bar"), "foo_bar");
        assert_eq!(escape_html_entities("a_b_c"), "a_b_c");
        assert_eq!(escape_html_entities("CTC_ARP_01"), "CTC_ARP_01");
        assert_eq!(escape_html_entities("no underscores here"), "no underscores here");
    }

    #[test]
    fn test_escape_preserves_urls() {
        let url = "https://example.com/path_to_resource";
        assert_eq!(escape_html_entities(url), url);
        let proto = "ftp://host/file_name.txt";
        assert_eq!(escape_html_entities(proto), proto);
    }

    #[test]
    fn test_render_caption_layout_class() {
        let para = PdfParagraph {
            dominant_font_size: 10.0,
            layout_class: Some(LayoutHintClass::Caption),
            ..make_para(vec![plain_line("Figure 1. A caption.")])
        };
        assert_eq!(render(&para), "*Figure 1. A caption.*");
    }

    #[test]
    fn test_render_non_caption_layout_class_not_italic() {
        let para = PdfParagraph {
            dominant_font_size: 8.0,
            layout_class: Some(LayoutHintClass::Footnote),
            ..make_para(vec![plain_line("Footnote text.")])
        };
        assert_eq!(render(&para), "Footnote text.");
    }

    #[test]
    fn test_heading_text_is_escaped() {
        let para = PdfParagraph {
            dominant_font_size: 18.0,
            heading_level: Some(2),
            ..make_para(vec![plain_line("Result <unknown>")])
        };
        let output = render(&para);
        assert!(output.contains("&lt;"), "heading should contain &lt; but got: {output}");
        assert!(output.contains("&gt;"), "heading should contain &gt; but got: {output}");
        assert!(!output.contains('<'), "raw < should not appear in heading output");
    }

    // Words split across lines without a hyphen are NOT rejoined: the renderer has no
    // dictionary, so the fragments stay separated by a space.

    #[test]
    fn test_word_break_software() {
        let result = render_multiline_para(vec![vec!["The soft"], vec!["ware is great."]]);
        assert_eq!(result, "The soft ware is great.");
    }

    #[test]
    fn test_word_break_recognition() {
        let result = render_multiline_para(vec![vec!["text recog"], vec!["nition system"]]);
        assert_eq!(result, "text recog nition system");
    }

    #[test]
    fn test_word_break_structures() {
        let result = render_multiline_para(vec![vec!["data struc"], vec!["tures are important"]]);
        assert_eq!(result, "data struc tures are important");
    }

    #[test]
    fn test_word_break_document() {
        let result = render_multiline_para(vec![vec!["the docu"], vec!["ment contains text"]]);
        assert_eq!(result, "the docu ment contains text");
    }

    #[test]
    fn test_word_break_configuration() {
        let result = render_multiline_para(vec![vec!["system config"], vec!["uration file"]]);
        assert_eq!(result, "system config uration file");
    }

    #[test]
    fn test_no_word_break_function_word_the() {
        let result = render_multiline_para(vec![vec!["install the"], vec!["software now"]]);
        assert_eq!(result, "install the software now");
    }

    #[test]
    fn test_no_word_break_function_word_a() {
        let result = render_multiline_para(vec![vec!["this is a"], vec!["test case"]]);
        assert_eq!(result, "this is a test case");
    }

    #[test]
    fn test_no_word_break_function_word_and() {
        let result = render_multiline_para(vec![vec!["cats and"], vec!["dogs are pets"]]);
        assert_eq!(result, "cats and dogs are pets");
    }

    #[test]
    fn test_no_word_break_function_word_for() {
        let result = render_multiline_para(vec![vec!["search for"], vec!["meaning"]]);
        assert_eq!(result, "search for meaning");
    }

    #[test]
    fn test_no_word_break_function_word_with() {
        let result = render_multiline_para(vec![vec!["work with"], vec!["data"]]);
        assert_eq!(result, "work with data");
    }

    #[test]
    fn test_no_word_break_function_word_from() {
        let result = render_multiline_para(vec![vec!["data from"], vec!["sources"]]);
        assert_eq!(result, "data from sources");
    }

    #[test]
    fn test_no_word_break_uppercase_next() {
        let result = render_multiline_para(vec![vec!["some text"], vec!["Another sentence"]]);
        assert_eq!(result, "some text Another sentence");
    }

    #[test]
    fn test_no_word_break_punctuation() {
        let result = render_multiline_para(vec![vec!["value=10"], vec!["units"]]);
        assert_eq!(result, "value=10 units");
    }

    #[test]
    fn test_word_break_with_dehyphenation_still_works() {
        let result = render_multiline_para(vec![vec!["the neglect-"], vec!["ed buildings"]]);
        assert_eq!(result, "the neglected buildings");
    }

    #[test]
    fn test_word_break_multiple_on_same_paragraph() {
        let result = render_multiline_para(vec![vec!["soft"], vec!["ware recog"], vec!["nition"]]);
        assert_eq!(result, "soft ware recog nition");
    }

    #[test]
    fn test_no_word_break_same_line() {
        let result = render_multiline_para(vec![vec!["soft ware"]]);
        assert_eq!(result, "soft ware");
    }

    #[test]
    fn test_word_break_with_inline_markup() {
        let para = make_para(vec![plain_line("the docu"), plain_line("ment is ready")]);
        assert_eq!(render(&para), "the docu ment is ready");
    }

    #[test]
    fn test_word_break_heading() {
        let para = PdfParagraph {
            dominant_font_size: 18.0,
            heading_level: Some(1),
            ..make_para(vec![plain_line("Intro"), plain_line("duction")])
        };
        assert_eq!(render(&para), "# Intro duction");
    }

    #[test]
    fn test_no_word_break_is_and_are() {
        let result = render_multiline_para(vec![vec!["these is"], vec!["correct"]]);
        assert_eq!(result, "these is correct");
        let result = render_multiline_para(vec![vec!["they are"], vec!["here"]]);
        assert_eq!(result, "they are here");
    }

    #[test]
    fn test_escape_preserves_identifiers() {
        assert_eq!(escape_html_entities("CTC_ARP_01"), "CTC_ARP_01");
    }
}