use crate::error::Result;
use crate::layout::FontWeight;
use crate::pipeline::{OrderedTextSpan, StructRole, TextPipelineConfig};
use crate::structure::table_extractor::Table;
use crate::text::HyphenationHandler;
use regex::Regex;
use std::sync::LazyLock;
use super::OutputConverter;
/// Matches `http://` / `https://` URLs. The match stops at whitespace, `<`,
/// `>`, `[` or `]`, and its last character is never one of `. , ! ? ; :`, so
/// sentence punctuation directly after a URL is left out of the match.
static RE_URL: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"(https?://[^\s<>\[\]]*[^\s<>\[\].,!?;:])").unwrap());
/// Matches e-mail-like tokens of the form `local@domain.tld`, where the final
/// label after the last dot is at least two ASCII letters.
static RE_EMAIL: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})").unwrap());
/// Returns `true` when `line` is a Markdown table delimiter row such as
/// `|---|:--:|`.
///
/// After trimming, the line must start and end with `|`, contain at least two
/// cells, and every cell must be non-empty and consist only of `-` and `:`.
fn is_table_separator_line(line: &str) -> bool {
    let trimmed = line.trim();
    // Peel the outer pipes one at a time. A lone "|" both starts and ends with
    // a pipe, and slicing `1..len - 1` on it would panic.
    let Some(inner) = trimmed
        .strip_prefix('|')
        .and_then(|rest| rest.strip_suffix('|'))
    else {
        return false;
    };
    let cells: Vec<&str> = inner.split('|').map(str::trim).collect();
    cells.len() >= 2
        && cells
            .iter()
            .all(|cell| !cell.is_empty() && cell.chars().all(|ch| matches!(ch, '-' | ':')))
}
/// Escapes a leading `|` on every line that is not part of a Markdown table.
///
/// A line is treated as table content when it is a delimiter row (as decided
/// by `is_table_separator_line`), the pipe-led line directly above one, or part
/// of the unbroken run of pipe-led lines directly below one. Any other line
/// whose first non-whitespace character is `|` has that pipe rewritten to `\|`.
fn escape_stray_leading_pipes(s: &str) -> String {
    fn pipe_led(row: &str) -> bool {
        row.trim_start().starts_with('|')
    }

    let rows: Vec<&str> = s.split('\n').collect();

    // Pass 1: flag every row that belongs to a table.
    let mut protected = vec![false; rows.len()];
    for (idx, row) in rows.iter().enumerate() {
        if !is_table_separator_line(row) {
            continue;
        }
        protected[idx] = true;
        if idx > 0 && pipe_led(rows[idx - 1]) {
            protected[idx - 1] = true;
        }
        let body_len = rows[idx + 1..].iter().take_while(|r| pipe_led(r)).count();
        protected[idx + 1..idx + 1 + body_len].fill(true);
    }

    // Pass 2: rebuild the text, escaping unprotected leading pipes.
    let mut out = String::with_capacity(s.len());
    for (idx, row) in rows.iter().enumerate() {
        if idx > 0 {
            out.push('\n');
        }
        let unindented = row.trim_start();
        match unindented.strip_prefix('|') {
            Some(rest) if !protected[idx] => {
                out.push_str(&row[..row.len() - unindented.len()]);
                out.push_str("\\|");
                out.push_str(rest);
            }
            _ => out.push_str(row),
        }
    }
    out
}
/// Heuristically decides whether `second` is the wrapped continuation of the
/// heading text `first`.
///
/// * `first` ending in sentence-final punctuation (Latin, CJK fullwidth or
///   Arabic) is never a wrap.
/// * `first` ending in a continuation mark (comma, semicolon, ideographic
///   comma, middle dot) is always a wrap.
/// * Otherwise it is a wrap only when `second` starts with a lowercase letter.
fn looks_like_heading_wrap(first: &str, second: &str) -> bool {
    if let Some(last) = first.trim_end().chars().last() {
        // U+FF1F / U+FF01 are the fullwidth question and exclamation marks.
        // They are written as escapes so Unicode normalization of this file
        // cannot fold them into their ASCII twins and create dead patterns.
        if matches!(
            last,
            '.' | '?' | '!' | '。' | '\u{FF1F}' | '\u{FF01}' | '\u{061F}'
        ) {
            return false;
        }
        // U+FF1B is the fullwidth semicolon.
        if matches!(last, ',' | ';' | '、' | '\u{FF1B}' | '·') {
            return true;
        }
    }
    second
        .trim_start()
        .chars()
        .next()
        .is_some_and(char::is_lowercase)
}
/// Drops a paragraph when it repeats the paragraph immediately before it.
///
/// Paragraphs are the pieces between `"\n\n"` separators. Two paragraphs are
/// considered equal when their non-empty, trimmed lines joined by a single
/// space are equal. A paragraph with no visible text is always kept and resets
/// the comparison, so duplicates separated by one are both preserved.
#[allow(dead_code)]
fn dedup_consecutive_paragraphs(s: &str) -> String {
    let mut kept: Vec<&str> = Vec::new();
    let mut last_key: Option<String> = None;
    for para in s.split("\n\n") {
        let key = para
            .lines()
            .map(str::trim)
            .filter(|line| !line.is_empty())
            .collect::<Vec<_>>()
            .join(" ");
        let repeats_previous = !key.is_empty() && last_key.as_deref() == Some(key.as_str());
        if repeats_previous {
            continue;
        }
        kept.push(para);
        last_key = if key.is_empty() { None } else { Some(key) };
    }
    kept.join("\n\n")
}
/// Collapses table header rows in which every non-empty cell holds the same
/// text, keeping the text in the first such cell only.
///
/// A header row is a line wrapped in `|...|` that is directly followed by a
/// delimiter row (see `is_table_separator_line`). The rewrite applies only
/// when the row has at least three non-empty cells and all of them are equal:
/// the first is re-emitted as ` text `, later ones become a single space, and
/// empty cells stay empty. All other lines pass through unchanged.
#[allow(dead_code)]
fn dedup_identical_header_cells(s: &str) -> String {
    let lines: Vec<&str> = s.split('\n').collect();
    let mut out: Vec<String> = Vec::with_capacity(lines.len());
    for (i, line) in lines.iter().enumerate() {
        let next_is_sep = lines
            .get(i + 1)
            .is_some_and(|next| is_table_separator_line(next));
        // Peel the outer pipes one at a time. A lone "|" would otherwise be
        // sliced as `1..0` and panic.
        let inner = line
            .trim()
            .strip_prefix('|')
            .and_then(|rest| rest.strip_suffix('|'));
        let cells: Vec<&str> = match inner {
            Some(inner) if next_is_sep => inner.split('|').collect(),
            _ => {
                out.push(line.to_string());
                continue;
            }
        };
        let labels: Vec<&str> = cells
            .iter()
            .map(|cell| cell.trim())
            .filter(|cell| !cell.is_empty())
            .collect();
        let all_repeated = labels.len() >= 3 && labels.iter().all(|label| *label == labels[0]);
        if !all_repeated {
            out.push(line.to_string());
            continue;
        }
        let mut kept_first = false;
        let rebuilt: Vec<String> = cells
            .iter()
            .map(|cell| {
                let label = cell.trim();
                if label.is_empty() {
                    String::new()
                } else if !kept_first {
                    kept_first = true;
                    format!(" {} ", label)
                } else {
                    String::from(" ")
                }
            })
            .collect();
        out.push(format!("|{}|", rebuilt.join("|")));
    }
    out.join("\n")
}
/// Joins runs of same-level ATX headings that look like one heading split
/// across several lines.
///
/// Starting at a heading of level 1-6, following headings of the same level
/// are collected (blank lines in between are skipped; a heading of more than
/// 15 words ends the run). The run is merged into a single heading when it has
/// three or more members that are each at most two words long, or exactly two
/// members for which `looks_like_heading_wrap` holds. On a merge, scanning
/// resumes at the first line that ended the run, so blank lines skipped while
/// looking for it are not re-emitted.
fn merge_consecutive_same_level_headings(s: &str) -> String {
    /// Splits `line` into (number of leading `#`, trimmed heading text) when
    /// the hashes are directly followed by a space.
    fn split_heading(line: &str) -> Option<(usize, &str)> {
        let body = line.trim_start();
        let hashes = body.bytes().take_while(|&b| b == b'#').count();
        (body.as_bytes().get(hashes).copied() == Some(b' '))
            .then(|| (hashes, body[hashes + 1..].trim()))
    }

    let lines: Vec<&str> = s.split('\n').collect();
    let mut merged_lines: Vec<String> = Vec::with_capacity(lines.len());
    let mut pos = 0;
    while let Some(&line) = lines.get(pos) {
        let (level, lead) = match split_heading(line) {
            Some((level, text)) if (1..=6).contains(&level) => (level, text),
            _ => {
                merged_lines.push(line.to_string());
                pos += 1;
                continue;
            }
        };

        // Gather the run of follow-up headings at the same level.
        let mut run: Vec<&str> = vec![lead];
        let mut cursor = pos + 1;
        loop {
            cursor += lines[cursor..]
                .iter()
                .take_while(|l| l.trim().is_empty())
                .count();
            let Some(&candidate) = lines.get(cursor) else {
                break;
            };
            match split_heading(candidate) {
                Some((next_level, text))
                    if next_level == level && text.split_whitespace().count() <= 15 =>
                {
                    run.push(text);
                    cursor += 1;
                }
                _ => break,
            }
        }

        let many_short =
            run.len() >= 3 && run.iter().all(|t| t.split_whitespace().count() <= 2);
        let wrapped_pair = run.len() == 2 && looks_like_heading_wrap(run[0], run[1]);
        if many_short || wrapped_pair {
            merged_lines.push(format!("{} {}", "#".repeat(level), run.join(" ")));
            pos = cursor;
        } else {
            merged_lines.push(line.to_string());
            pos += 1;
        }
    }
    merged_lines.join("\n")
}
/// Currently a pass-through: returns an owned copy of `s` without filtering
/// anything.
#[allow(dead_code)]
fn filter_page_number_lines(s: &str) -> String {
    String::from(s)
}
/// Currently a pass-through: returns an owned copy of `s` without rewriting
/// any glyphs.
#[allow(dead_code)]
fn normalize_bullet_glyphs(s: &str) -> String {
    String::from(s)
}
/// Flattens wide tables that are mostly one-word cells into a single line of
/// space-separated words.
///
/// A table is a pipe-led line followed by a delimiter row and then any run of
/// pipe-led lines. It is flattened only when the header has at least five
/// cells, there are at least two data rows, and at least 60% of the non-empty
/// cells are a single word. A table that passes the size check but has no
/// non-empty cell at all is removed from the output. Everything else is copied
/// through unchanged.
#[allow(dead_code)]
fn simplify_degenerate_tables(s: &str) -> String {
    fn pipe_led(line: &str) -> bool {
        line.trim_start().starts_with('|')
    }

    /// Splits a table row into trimmed cell texts, ignoring outer pipes.
    fn cells_of(row: &str) -> Vec<&str> {
        row.trim()
            .trim_start_matches('|')
            .trim_end_matches('|')
            .split('|')
            .map(str::trim)
            .collect()
    }

    let lines: Vec<&str> = s.split('\n').collect();
    let mut out: Vec<String> = Vec::with_capacity(lines.len());
    let mut at = 0;
    while at < lines.len() {
        let heads_table = pipe_led(lines[at])
            && lines
                .get(at + 1)
                .is_some_and(|next| is_table_separator_line(next));
        if !heads_table {
            out.push(lines[at].to_string());
            at += 1;
            continue;
        }

        // Header, delimiter row, then every directly following pipe-led line.
        let body_len = lines[at + 2..].iter().take_while(|l| pipe_led(l)).count();
        let block = &lines[at..at + 2 + body_len];
        at += block.len();

        let header_cells = cells_of(block[0]);
        let data_rows = &block[2..];
        let big_enough = header_cells.len() >= 5 && data_rows.len() >= 2;

        let filled: Vec<&str> = header_cells
            .iter()
            .copied()
            .chain(data_rows.iter().copied().flat_map(cells_of))
            .filter(|cell| !cell.is_empty())
            .collect();
        if big_enough && filled.is_empty() {
            // A large table with no content at all is dropped.
            continue;
        }

        let single_word = filled
            .iter()
            .filter(|cell| cell.split_whitespace().count() == 1)
            .count();
        let mostly_single_words =
            big_enough && single_word as f32 / filled.len() as f32 >= 0.6;
        if mostly_single_words {
            out.push(filled.join(" "));
        } else {
            out.extend(block.iter().map(|l| l.to_string()));
        }
    }
    out.join("\n")
}
/// Rewrites runs of level-1/2 headings that contain only a numeric value into
/// a bullet list.
///
/// A numeric heading is `#` or `##` followed by an optional currency sign, a
/// number (digits with `, . : - /`) and an optional unit suffix. Starting at
/// such a heading, following numeric headings of the same level are collected
/// (blank lines are skipped; a value longer than 20 characters ends the run).
/// Runs of two or more values are emitted as `- value` lines followed by one
/// empty line; shorter runs are left as they are.
fn collapse_numeric_heading_runs(s: &str) -> String {
    static RE_NUMERIC_HEADING: LazyLock<Regex> = LazyLock::new(|| {
        Regex::new(r"^(#{1,2})\s+([\$£€]?\d[\d,.:\-/]*\s*(?:%|K|M|B|days|day|hrs|hr|min|sec)?)\s*$")
            .unwrap()
    });

    /// Number of leading `#` characters after any indentation.
    fn hash_count(line: &str) -> usize {
        line.trim_start()
            .bytes()
            .take_while(|&b| b == b'#')
            .count()
    }

    let lines: Vec<&str> = s.split('\n').collect();
    let mut out: Vec<String> = Vec::with_capacity(lines.len());
    let mut start = 0;
    while let Some(&first) = lines.get(start) {
        if !RE_NUMERIC_HEADING.is_match(first) {
            out.push(first.to_string());
            start += 1;
            continue;
        }

        let level = hash_count(first);
        let mut values: Vec<String> = Vec::new();
        let mut last_hit = start;
        for (idx, &candidate) in lines.iter().enumerate().skip(start) {
            if candidate.trim().is_empty() {
                continue;
            }
            if hash_count(candidate) != level {
                break;
            }
            let Some(caps) = RE_NUMERIC_HEADING.captures(candidate) else {
                break;
            };
            let value = caps
                .get(2)
                .map_or_else(String::new, |m| m.as_str().trim().to_owned());
            if value.chars().count() > 20 {
                break;
            }
            values.push(value);
            last_hit = idx;
        }

        if values.len() < 2 {
            out.push(first.to_string());
            start += 1;
            continue;
        }
        out.extend(values.iter().map(|value| format!("- {}", value)));
        out.push(String::new());
        start = last_hit + 1;
    }
    out.join("\n")
}
fn coalesce_camelcase_bold_fragments(s: &str) -> String {
static RE_CAMELCASE_BOLD_INLINE: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"\*\*(\p{Lu})\s+(\p{Ll}+\p{Lu}\p{Ll}*)\s+(\p{Ll}+)\*\*").unwrap()
});
static RE_CAMELCASE_BOLD_BOUND: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"\*\*(\p{Lu})\s+(\p{Ll}+\p{Lu}\p{Ll}*)\*\*\s*(\p{Ll}+)").unwrap()
});
let pass1 = RE_CAMELCASE_BOLD_INLINE
.replace_all(s, |caps: ®ex::Captures| {
format!("**{}{}{}**", &caps[1], &caps[2], &caps[3])
})
.to_string();
RE_CAMELCASE_BOLD_BOUND
.replace_all(&pass1, |caps: ®ex::Captures| {
format!("**{}{}{}**", &caps[1], &caps[2], &caps[3])
})
.to_string()
}
/// Output converter that renders ordered text spans (and optionally tables)
/// as Markdown text.
pub struct MarkdownOutputConverter {
// Multiplier applied to the larger font size of two consecutive spans; a
// vertical distance above that product is treated as a paragraph break.
paragraph_gap_ratio: f32,
}
impl MarkdownOutputConverter {
pub fn new() -> Self {
Self {
paragraph_gap_ratio: 1.5,
}
}
/// Creates a converter that uses `ratio` as its paragraph gap ratio (the
/// multiple of the font size above which a vertical gap starts a new
/// paragraph).
pub fn with_paragraph_gap(ratio: f32) -> Self {
    let paragraph_gap_ratio = ratio;
    Self { paragraph_gap_ratio }
}
/// Decides whether `span` should be wrapped in bold markers.
///
/// Only spans with a `Bold`, `Black`, `ExtraBold` or `SemiBold` weight
/// qualify. Under `BoldMarkerBehavior::Conservative` the span must also
/// contain at least one non-whitespace character; under `Aggressive` the
/// weight alone is enough.
fn is_bold(&self, span: &OrderedTextSpan, config: &TextPipelineConfig) -> bool {
    use crate::pipeline::config::BoldMarkerBehavior;

    let heavy_weight = matches!(
        span.span.font_weight,
        FontWeight::Bold | FontWeight::Black | FontWeight::ExtraBold | FontWeight::SemiBold
    );
    if !heavy_weight {
        return false;
    }
    match config.output.bold_marker_behavior {
        BoldMarkerBehavior::Aggressive => true,
        BoldMarkerBehavior::Conservative => !span.span.text.trim().is_empty(),
    }
}
/// Returns `true` for an italic span that contains at least one
/// non-whitespace character.
fn is_italic(&self, span: &OrderedTextSpan) -> bool {
    if !span.span.is_italic {
        return false;
    }
    !span.span.text.trim().is_empty()
}
fn linkify(&self, text: &str) -> String {
let might_have_url = text.contains("://") || text.contains("www.");
let might_have_email = text.contains('@');
if !might_have_url && !might_have_email {
return text.to_string();
}
let mut result = if might_have_url {
RE_URL
.replace_all(text, |caps: ®ex::Captures| {
let url = &caps[0];
format!("[{}]({})", url, url)
})
.to_string()
} else {
text.to_string()
};
if might_have_email {
result = RE_EMAIL
.replace_all(&result, |caps: ®ex::Captures| {
let email = &caps[0];
format!("[{}](mailto:{})", email, email)
})
.to_string();
}
result
}
/// Collapses every run of whitespace in `text` to a single space and removes
/// leading and trailing whitespace.
fn normalize_whitespace(&self, text: &str) -> String {
    let mut collapsed = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
/// Decides whether `current` starts a new paragraph relative to `previous`.
///
/// A break is reported when the vertical distance between the two spans
/// exceeds the larger font size times `paragraph_gap_ratio`, or when
/// `current` sits on a different line (distance above half its font size) and
/// begins with a bullet or ordered-list marker while `previous` does not.
fn is_paragraph_break(&self, current: &OrderedTextSpan, previous: &OrderedTextSpan) -> bool {
    let vertical_gap = (previous.span.bbox.y - current.span.bbox.y).abs();
    let tallest_font = current.span.font_size.max(previous.span.font_size);
    if vertical_gap > tallest_font * self.paragraph_gap_ratio {
        return true;
    }

    let starts_list = |trimmed: &str| {
        Self::is_bullet_span(trimmed)
            || Self::starts_with_bullet(trimmed)
            || Self::is_ordered_list_marker(trimmed).is_some()
    };
    let on_new_line = vertical_gap > current.span.font_size * 0.5;
    on_new_line
        && starts_list(current.span.text.trim_start())
        && !starts_list(previous.span.text.trim_start())
}
/// Recognizes an ordered-list marker at the start of `text` (leading
/// whitespace ignored).
///
/// * One to three ASCII digits followed by `.` or `)` and a space yield the
///   parsed number, e.g. `"12. Foo"` gives `Some(12)`.
/// * Up to four roman-numeral letters (`i v x I V X`) or any single ASCII
///   letter, followed by `.` or `)` and a space, yield `Some(1)`.
/// * Anything else yields `None`.
fn is_ordered_list_marker(text: &str) -> Option<u32> {
    let bytes = text.trim_start().as_bytes();
    let first = *bytes.first()?;

    // True when `bytes[pos]` is `.` or `)` and is followed by a space.
    let closes_marker_at = |pos: usize| {
        matches!(bytes.get(pos).copied(), Some(b'.' | b')'))
            && bytes.get(pos + 1).copied() == Some(b' ')
    };

    let digit_len = bytes
        .iter()
        .take(3)
        .take_while(|b| b.is_ascii_digit())
        .count();
    if digit_len > 0 {
        if !closes_marker_at(digit_len) {
            return None;
        }
        return std::str::from_utf8(&bytes[..digit_len])
            .ok()
            .and_then(|digits| digits.parse::<u32>().ok());
    }

    if bytes.len() < 2 || !first.is_ascii_alphabetic() {
        return None;
    }
    let roman_len = bytes
        .iter()
        .take(4)
        .take_while(|&&b| matches!(b, b'i' | b'v' | b'x' | b'I' | b'V' | b'X'))
        .count();
    let roman_marker = roman_len >= 1 && closes_marker_at(roman_len);
    let single_letter_marker = closes_marker_at(1);
    (roman_marker || single_letter_marker).then_some(1)
}
/// Returns `true` when `text`, once trimmed, is exactly one of the recognized
/// bullet glyphs (including the DEL control character `\x7f`).
fn is_bullet_span(text: &str) -> bool {
    const BULLET_GLYPHS: &[&str] = &[
        "►", "•", "▪", "▸", "‣", "◦", "●", "■", "◆", "○", "□", "❍", "❖", "✓", "✔", "➢", "➤",
        "\x7f",
    ];
    BULLET_GLYPHS.contains(&text.trim())
}
/// Returns `true` when the first non-whitespace character of `text` is one of
/// the recognized bullet glyphs (including the DEL control character `\x7f`).
fn starts_with_bullet(text: &str) -> bool {
    matches!(
        text.trim_start().chars().next(),
        Some(
            '►' | '•'
                | '▪'
                | '▸'
                | '‣'
                | '◦'
                | '●'
                | '■'
                | '◆'
                | '○'
                | '□'
                | '❍'
                | '❖'
                | '✓'
                | '✔'
                | '➢'
                | '➤'
                | '\x7f'
        )
    )
}
/// Filters out text that should not be promoted to a heading.
///
/// The trimmed text is rejected when it
/// * is shorter than 2 or longer than 200 characters,
/// * has more than 20 words,
/// * contains `". "` followed by an ASCII letter (looks like two sentences),
/// * has too few letters (fewer than 2 for texts of up to 8 characters,
///   otherwise less than 30% of all characters), or
/// * starts with an ASCII digit or one of `+ - $ € £ ¥ %`.
fn is_valid_heading_text(text: &str) -> bool {
    let candidate = text.trim();
    let char_len = candidate.chars().count();
    if char_len < 2 || char_len > 200 {
        return false;
    }
    if candidate.split_whitespace().count() > 20 {
        return false;
    }

    let has_sentence_break = candidate
        .as_bytes()
        .windows(3)
        .any(|w| w[0] == b'.' && w[1] == b' ' && w[2].is_ascii_alphabetic());
    if has_sentence_break {
        return false;
    }

    let letters = candidate.chars().filter(|c| c.is_alphabetic()).count();
    let enough_letters = if char_len <= 8 {
        letters >= 2
    } else {
        letters * 10 >= char_len * 3
    };
    if !enough_letters {
        return false;
    }

    !matches!(
        candidate.chars().next(),
        Some(c) if c.is_ascii_digit() || matches!(c, '+' | '-' | '$' | '€' | '£' | '¥' | '%')
    )
}
/// Removes a leading bullet glyph from `text`.
///
/// When the first non-whitespace character is a bullet, the result is the
/// text after that glyph with leading whitespace removed. Otherwise `text` is
/// returned untouched, including any leading whitespace.
fn strip_bullet(text: &str) -> &str {
    let unindented = text.trim_start();
    if !Self::starts_with_bullet(unindented) {
        return text;
    }
    let glyph_len = unindented.chars().next().map_or(0, char::len_utf8);
    unindented[glyph_len..].trim_start()
}
/// Infers a heading level from the span's font size relative to
/// `base_font_size`.
///
/// Returns `None` when the text fails `is_valid_heading_text` or the base
/// size is not positive. Otherwise the size ratio maps to a level: at least
/// 1.8 gives 1, at least 1.4 gives 2, at least 1.2 gives 3, and at least 1.05
/// gives 4 only for bold-weight spans.
fn heading_level_ratio(&self, span: &OrderedTextSpan, base_font_size: f32) -> Option<u8> {
    let usable =
        Self::is_valid_heading_text(span.span.text.trim()) && !(base_font_size <= 0.0);
    if !usable {
        return None;
    }

    let heavy_weight = matches!(
        span.span.font_weight,
        FontWeight::Bold | FontWeight::Black | FontWeight::ExtraBold | FontWeight::SemiBold
    );
    match span.span.font_size / base_font_size {
        ratio if ratio >= 1.8 => Some(1),
        ratio if ratio >= 1.4 => Some(2),
        ratio if ratio >= 1.2 => Some(3),
        ratio if heavy_weight && ratio >= 1.05 => Some(4),
        _ => None,
    }
}
/// Renders `table` as a pipe-delimited Markdown table.
///
/// * Every row is padded with empty cells up to the widest row (colspans
///   counted); a cell with `colspan > 1` is followed by empty filler cells.
/// * Cells with spans are rendered span by span with `**`/`*` emphasis
///   markers; cells without spans fall back to `cell.text`. `|` is escaped
///   and newlines become spaces in both cases.
/// * A `---` delimiter row is written after the last leading header row, or
///   after the first row when the table has no header.
///
/// Returns an empty string for a table without rows.
fn render_table_markdown(&self, table: &Table, config: &TextPipelineConfig) -> String {
    if table.rows.is_empty() {
        return String::new();
    }
    let mut output = String::new();
    // 1-based index of the row after which the delimiter row is written.
    // Clamp to 1: if `has_header` is set but the very first row is not
    // flagged as a header, `position` yields 0 and the delimiter row would
    // never be emitted, leaving the output without a valid table structure.
    let header_end = if table.has_header {
        table
            .rows
            .iter()
            .position(|r| !r.is_header)
            .unwrap_or(1)
            .max(1)
    } else {
        1
    };
    // Width of the widest row, counting each cell's colspan (at least 1).
    let max_cols = table
        .rows
        .iter()
        .map(|row| {
            row.cells
                .iter()
                .map(|c| c.colspan.max(1) as usize)
                .sum::<usize>()
        })
        .max()
        .unwrap_or(0);
    for (row_idx, row) in table.rows.iter().enumerate() {
        output.push('|');
        let mut cols_written: usize = 0;
        for cell in &row.cells {
            output.push(' ');
            let cell_text = if !cell.spans.is_empty() {
                let mut cell_md = String::new();
                let mut active_bold = false;
                let mut active_italic = false;
                for (i, span) in cell.spans.iter().enumerate() {
                    let is_bold = self.is_bold_raw(span, config);
                    let is_italic = span.is_italic;
                    let formatting_changed =
                        is_bold != active_bold || is_italic != active_italic;
                    // Close the previous emphasis before any separating space
                    // so the markers hug the text they belong to.
                    if formatting_changed {
                        if active_italic {
                            cell_md.push('*');
                        }
                        if active_bold {
                            cell_md.push_str("**");
                        }
                    }
                    if i > 0 {
                        let prev = &cell.spans[i - 1];
                        let has_gap = super::has_horizontal_gap(prev, span);
                        let already_has_space =
                            cell_md.ends_with(' ') || span.text.starts_with(' ');
                        if has_gap && !already_has_space {
                            cell_md.push(' ');
                        }
                    }
                    if formatting_changed {
                        if is_bold {
                            cell_md.push_str("**");
                        }
                        if is_italic {
                            cell_md.push('*');
                        }
                        active_bold = is_bold;
                        active_italic = is_italic;
                    }
                    let mut processed_text = String::new();
                    crate::document::PdfDocument::push_span_text(&mut processed_text, span);
                    let escaped = processed_text.replace('|', "\\|").replace('\n', " ");
                    // Directly after an emphasis marker, drop leading spaces so
                    // the marker is not followed by whitespace.
                    let after_marker = (is_bold || is_italic)
                        && (cell_md.ends_with("**") || cell_md.ends_with('*'));
                    let text = if after_marker {
                        escaped.trim_start_matches(' ')
                    } else {
                        escaped.as_str()
                    };
                    cell_md.push_str(text);
                }
                // Close emphasis still open at the end of the cell, placing
                // the markers before any trailing whitespace.
                if active_italic || active_bold {
                    let content_end = cell_md.trim_end().len();
                    let trailing = cell_md[content_end..].to_string();
                    cell_md.truncate(content_end);
                    if active_italic {
                        cell_md.push('*');
                    }
                    if active_bold {
                        cell_md.push_str("**");
                    }
                    cell_md.push_str(&trailing);
                }
                cell_md
            } else {
                cell.text.replace('|', "\\|").replace('\n', " ")
            };
            output.push_str(cell_text.trim());
            output.push(' ');
            // A spanning cell is followed by empty filler cells.
            let span = cell.colspan.max(1) as usize;
            for _ in 1..span {
                output.push_str("| ");
            }
            output.push('|');
            cols_written += span;
        }
        // Pad short rows so every row has the same number of columns.
        for _ in cols_written..max_cols {
            output.push_str(" |");
        }
        output.push('\n');
        if row_idx + 1 == header_end {
            output.push('|');
            let header_cols: usize = row.cells.iter().map(|c| c.colspan.max(1) as usize).sum();
            for _ in 0..max_cols.max(header_cols) {
                output.push_str("---|");
            }
            output.push('\n');
        }
    }
    output
}
/// Same decision as `is_bold`, but for a bare `TextSpan` (as found in table
/// cells).
///
/// Only `Bold`, `Black`, `ExtraBold` and `SemiBold` weights qualify; under
/// `BoldMarkerBehavior::Conservative` the span must also contain at least one
/// non-whitespace character.
fn is_bold_raw(&self, span: &crate::layout::TextSpan, config: &TextPipelineConfig) -> bool {
    use crate::pipeline::config::BoldMarkerBehavior;

    let heavy_weight = matches!(
        span.font_weight,
        FontWeight::Bold | FontWeight::Black | FontWeight::ExtraBold | FontWeight::SemiBold
    );
    if !heavy_weight {
        return false;
    }
    match config.output.bold_marker_behavior {
        BoldMarkerBehavior::Aggressive => true,
        BoldMarkerBehavior::Conservative => !span.text.trim().is_empty(),
    }
}
/// Renders `spans` and `tables` into one Markdown string.
///
/// Spans are processed in `reading_order`. Consecutive spans are accumulated
/// into `current_line`, which is flushed to `result` as a paragraph, heading
/// or list item whenever a break is detected. Each table is emitted where the
/// first span lying inside it is encountered; tables never hit by a span are
/// appended at the end. The assembled text then goes through several
/// post-processing passes (key/value merging, pipe escaping, heading clean-up,
/// optional hyphenation repair, RTL handling).
///
/// Returns `Ok(String::new())` when both `spans` and `tables` are empty.
fn render_spans(
&self,
spans: &[OrderedTextSpan],
tables: &[Table],
config: &TextPipelineConfig,
) -> Result<String> {
if spans.is_empty() && tables.is_empty() {
return Ok(String::new());
}
let mut sorted: Vec<_> = spans.iter().collect();
sorted.sort_by_key(|s| s.reading_order);
// Body font size used by the heading heuristic: the most frequent font
// size among spans of at least 9.0, bucketed to half units. Ties go to the
// smaller size, and the result is capped at 12.0. Without heading
// detection a fixed 12.0 is used.
let base_font_size = if config.output.detect_headings {
let mut size_counts: std::collections::HashMap<u32, usize> =
std::collections::HashMap::new();
for s in sorted.iter() {
let sz = s.span.font_size;
if sz < 9.0 {
continue;
}
*size_counts.entry((sz * 2.0).round() as u32).or_insert(0) += 1;
}
let mode = size_counts
.into_iter()
.max_by(|a, b| a.1.cmp(&b.1).then_with(|| b.0.cmp(&a.0)))
.map(|(bucket, _)| bucket as f32 / 2.0)
.unwrap_or(12.0);
mode.min(12.0)
} else {
12.0
};
// All tables are rendered up front; `tables_rendered` tracks which ones
// have already been written into `result`.
let mut tables_rendered = vec![false; tables.len()];
let table_mds: Vec<String> = tables
.iter()
.map(|t| self.render_table_markdown(t, config))
.collect();
// Spans that fall inside a table are not rendered inline; they are kept
// here per table so text missing from the rendered table can be recovered.
let mut table_skipped_spans: Vec<Vec<&OrderedTextSpan>> = vec![Vec::new(); tables.len()];
let mut result = String::new();
let mut prev_span: Option<&OrderedTextSpan> = None;
let mut current_line = String::new();
let mut active_bold = false;
let mut active_italic = false;
let mut current_heading_level: Option<u8> = None;
// Closes any open emphasis on `line`: `*` for italic first, then `**` for
// bold, inserted before trailing whitespace. Resets both flags.
fn close_formatting(line: &mut String, bold: &mut bool, italic: &mut bool) {
if !*bold && !*italic {
return;
}
let content_end = line.trim_end().len();
let trailing_ws = line[content_end..].to_string();
line.truncate(content_end);
if *italic {
line.push('*');
*italic = false;
}
if *bold {
line.push_str("**");
*bold = false;
}
line.push_str(&trailing_ws);
}
// Returns `s` with every `*` character removed (used for heading text).
fn strip_emphasis(s: &str) -> String {
let mut out = String::with_capacity(s.len());
let chars: Vec<char> = s.chars().collect();
let mut i = 0;
while i < chars.len() {
if chars[i] == '*' {
i += 1;
if i < chars.len() && chars[i] == '*' {
i += 1;
}
continue;
}
out.push(chars[i]);
i += 1;
}
out
}
for span in sorted.iter() {
// Spans marked with an artifact type are not rendered.
if span.span.artifact_type.is_some() {
continue;
}
// Drop one- or two-character spans that contain no alphanumeric character,
// unless they are bullet glyphs.
{
let t = span.span.text.trim();
let char_count = t.chars().count();
if char_count > 0
&& char_count <= 2
&& !t.chars().any(|c| c.is_alphanumeric())
&& !Self::is_bullet_span(t)
&& !Self::starts_with_bullet(t)
{
continue;
}
}
// A span inside a table triggers emission of that table the first time it
// is seen; the span itself is only recorded, never rendered inline.
if !tables.is_empty() {
if let Some(table_idx) = super::span_in_table(span, tables) {
if !tables_rendered[table_idx] {
close_formatting(&mut current_line, &mut active_bold, &mut active_italic);
if !current_line.is_empty() {
result.push_str(current_line.trim());
result.push_str("\n\n");
current_line.clear();
}
result.push_str(&table_mds[table_idx]);
result.push('\n');
tables_rendered[table_idx] = true;
prev_span = None;
}
table_skipped_spans[table_idx].push(span);
continue;
}
}
// A tagged heading role wins (level clamped to 1..=6); otherwise the font
// size heuristic is used when heading detection is enabled.
let span_heading_level = match span.struct_role {
Some(StructRole::Heading(level)) => Some(level.clamp(1, 6)),
_ if config.output.detect_headings => {
self.heading_level_ratio(span, base_font_size)
},
_ => None,
};
let is_list_item_role = matches!(
span.struct_role,
Some(StructRole::ListItemBody)
| Some(StructRole::ListItem)
| Some(StructRole::ListItemLabel)
);
// Same line: vertical offset below half the font size. The first span
// (no predecessor) counts as being on the same line.
let same_line = prev_span
.map(|prev| (span.span.bbox.y - prev.span.bbox.y).abs() < span.span.font_size * 0.5)
.unwrap_or(true);
if let Some(prev) = prev_span {
let group_changed = match (span.group_id, prev.group_id) {
(Some(a), Some(b)) => a != b,
_ => false,
};
let heading_changed = current_heading_level != span_heading_level;
let group_flush = group_changed && !same_line;
let prev_was_list_item = matches!(
prev.struct_role,
Some(StructRole::ListItemBody)
| Some(StructRole::ListItem)
| Some(StructRole::ListItemLabel)
);
let list_item_changed = is_list_item_role != prev_was_list_item;
let column_gap = is_column_gap(prev, span);
let line_truly_continuous = same_line && !column_gap;
// Block and heading changes only break the paragraph when the span does
// not continue the current visual line.
let block_changed = match (span.block_id, prev.block_id) {
(Some(a), Some(b)) => a != b,
_ => false,
} && !line_truly_continuous;
let heading_changed_break = heading_changed && !line_truly_continuous;
// Hard break: flush the accumulated line as a heading or paragraph,
// followed by a blank line.
if group_flush
|| self.is_paragraph_break(span, prev)
|| heading_changed_break
|| list_item_changed
|| block_changed
|| column_gap
{
close_formatting(&mut current_line, &mut active_bold, &mut active_italic);
if !current_line.is_empty() {
if let Some(level) = current_heading_level {
let prefix = "#".repeat(level as usize);
result.push_str(&format!(
"{} {}\n\n",
prefix,
strip_emphasis(current_line.trim())
));
} else {
result.push_str(current_line.trim());
result.push_str("\n\n");
}
current_line.clear();
}
current_heading_level = span_heading_level;
if is_list_item_role {
current_line.push_str("- ");
}
} else if !same_line {
// Soft line change: a new list item ends the current line with a single
// newline; any other line change is joined with a space (or with
// layout-derived padding when `preserve_layout` is set).
let is_bullet = Self::is_bullet_span(&span.span.text)
|| Self::starts_with_bullet(&span.span.text);
let is_ordered =
Self::is_ordered_list_marker(span.span.text.trim_start()).is_some();
let starts_new_list_item = if span.block_id.is_some() && prev.block_id.is_some()
{
is_list_item_role && (list_item_changed || block_changed)
} else {
is_list_item_role
};
if is_bullet || is_ordered || starts_new_list_item {
close_formatting(&mut current_line, &mut active_bold, &mut active_italic);
if !current_line.is_empty() {
if let Some(level) = current_heading_level {
let prefix = "#".repeat(level as usize);
result.push_str(&format!(
"{} {}\n\n",
prefix,
strip_emphasis(current_line.trim())
));
} else {
result.push_str(current_line.trim());
result.push('\n');
}
current_line.clear();
}
current_heading_level = span_heading_level;
if starts_new_list_item {
current_line.push_str("- ");
}
} else {
close_formatting(&mut current_line, &mut active_bold, &mut active_italic);
if config.output.preserve_layout {
// Padding is the horizontal offset between the two spans, capped at 20.
let spacing = (span.span.bbox.x - prev.span.bbox.x).max(0.0) as usize;
for _ in 0..spacing.min(20) {
current_line.push(' ');
}
} else {
current_line.push(' ');
}
}
}
} else {
// First span after the start or after a table.
current_heading_level = span_heading_level;
if is_list_item_role {
current_line.push_str("- ");
}
}
// A span that is only a bullet glyph becomes a `- ` list prefix.
if Self::is_bullet_span(&span.span.text) {
if !current_line.ends_with("- ") {
if !current_line.is_empty() && !current_line.ends_with(' ') {
current_line.push(' ');
}
current_line.push_str("- ");
}
prev_span = Some(span);
continue;
}
let mut text_str = String::new();
crate::document::PdfDocument::push_span_text(&mut text_str, &span.span);
// Replace a leading DEL character or `❍` with `•`, keeping the leading
// whitespace.
let trim_start = text_str.trim_start();
if let Some(first) = trim_start.chars().next() {
if first == '\x7f' || first == '❍' {
let leading_ws_len = text_str.len() - trim_start.len();
let bullet_byte_len = first.len_utf8();
text_str = format!(
"{}•{}",
&text_str[..leading_ws_len],
&text_str[leading_ws_len + bullet_byte_len..]
);
}
}
let mut text = text_str.as_str();
// Text that begins with a bullet glyph: emit `- ` and drop the glyph.
if Self::starts_with_bullet(text) {
let stripped = Self::strip_bullet(text);
if !current_line.ends_with("- ") {
if !current_line.is_empty() && !current_line.ends_with(' ') {
current_line.push(' ');
}
current_line.push_str("- ");
}
text = stripped;
}
// Unless layout is preserved, collapse whitespace but keep one leading
// (same-line continuation only) and one trailing space if the span had them.
let normalized;
if !config.output.preserve_layout {
let had_leading_space =
same_line && prev_span.is_some() && text.starts_with(char::is_whitespace);
let had_trailing_space = text.ends_with(char::is_whitespace);
let mut norm = self.normalize_whitespace(text);
if had_leading_space && !norm.starts_with(' ') {
norm.insert(0, ' ');
}
if had_trailing_space && !norm.ends_with(' ') && !norm.is_empty() {
norm.push(' ');
}
normalized = norm;
text = &normalized;
}
let linkified = self.linkify(text);
let is_bold = self.is_bold(span, config);
let is_italic = self.is_italic(span);
// Insert a separating space between same-line spans when there is a
// visual gap, or when punctuation is followed by an uppercase letter or digit.
if same_line && !current_line.is_empty() {
if let Some(prev) = prev_span {
let no_existing_ws =
!current_line.ends_with(' ') && !linkified.starts_with(' ');
let visual_gap = super::has_horizontal_gap(&prev.span, &span.span);
let punct_boundary = current_line
.chars()
.last()
.is_some_and(|c| matches!(c, '.' | ',' | ';' | ':' | '?' | '!'))
&& linkified
.chars()
.next()
.is_some_and(|c| c.is_ascii_uppercase() || c.is_ascii_digit());
if no_existing_ws && (visual_gap || punct_boundary) {
current_line.push(' ');
}
}
}
// On a formatting change, close the old emphasis and open the new one.
if is_bold != active_bold || is_italic != active_italic {
close_formatting(&mut current_line, &mut active_bold, &mut active_italic);
if is_bold {
current_line.push_str("**");
active_bold = true;
}
if is_italic {
current_line.push('*');
active_italic = true;
}
}
current_line.push_str(&linkified);
prev_span = Some(span);
}
close_formatting(&mut current_line, &mut active_bold, &mut active_italic);
// Recover "orphans": spans skipped because they lie inside a rendered
// table but whose trimmed text does not occur in that table's Markdown.
// They are appended to `result` in reading order.
for (table_idx, skipped) in table_skipped_spans.iter().enumerate() {
if !tables_rendered[table_idx] || skipped.is_empty() {
continue;
}
let rendered = &table_mds[table_idx];
let mut orphans: Vec<&&OrderedTextSpan> = skipped
.iter()
.filter(|s| {
let trimmed = s.span.text.trim();
!trimmed.is_empty() && !rendered.contains(trimmed)
})
.collect();
if !orphans.is_empty() {
orphans.sort_by_key(|s| s.reading_order);
for orphan in orphans {
if !result.ends_with(' ') && !result.ends_with('\n') {
result.push(' ');
}
let mut processed = String::new();
crate::document::PdfDocument::push_span_text(&mut processed, &orphan.span);
result.push_str(&processed);
}
}
}
// Tables that no span triggered are appended now, after flushing any
// pending line.
for (i, table) in tables.iter().enumerate() {
if !tables_rendered[i] && !table.is_empty() {
if !current_line.is_empty() {
if let Some(level) = current_heading_level {
let prefix = "#".repeat(level as usize);
result.push_str(&format!(
"{} {}\n\n",
prefix,
strip_emphasis(current_line.trim())
));
} else {
result.push_str(current_line.trim());
result.push_str("\n\n");
}
current_line.clear();
}
result.push_str(&table_mds[i]);
result.push('\n');
}
}
// Flush whatever is left in `current_line`, terminated by a single newline.
if !current_line.is_empty() {
if let Some(level) = current_heading_level {
let prefix = "#".repeat(level as usize);
result.push_str(&format!("{} {}\n", prefix, strip_emphasis(current_line.trim())));
} else {
result.push_str(current_line.trim());
result.push('\n');
}
}
// Unless layout is preserved, trim each paragraph, drop empty ones and keep
// a trailing newline if the raw result had one.
let mut final_result = if config.output.preserve_layout {
result
} else {
let cleaned = result
.split("\n\n")
.map(|para| para.trim())
.filter(|para| !para.is_empty())
.collect::<Vec<_>>()
.join("\n\n");
if result.ends_with('\n') && !cleaned.ends_with('\n') {
format!("{}\n", cleaned)
} else {
cleaned
}
};
final_result = super::merge_key_value_pairs(&final_result);
// The heading clean-up passes run only when no span carries a structure role.
let is_tagged = sorted.iter().any(|s| s.struct_role.is_some());
final_result = escape_stray_leading_pipes(&final_result);
final_result = coalesce_camelcase_bold_fragments(&final_result);
if !is_tagged {
final_result = collapse_numeric_heading_runs(&final_result);
final_result = merge_consecutive_same_level_headings(&final_result);
}
if config.enable_hyphenation_reconstruction {
let handler = HyphenationHandler::new();
final_result = handler.process_text(&final_result);
}
// For text that looks RTL, strip inline emphasis markers line by line.
// NOTE(review): `lines()` already yields an empty item for each newline
// after the first trailing one, so re-appending all trailing newlines looks
// like it adds one extra newline when the text ends with two or more —
// confirm.
if crate::text::bidi::looks_rtl(&final_result) {
let trailing_newlines: String = final_result
.chars()
.rev()
.take_while(|&c| c == '\n')
.collect::<String>()
.chars()
.rev()
.collect();
final_result = final_result
.lines()
.map(|line| {
if crate::text::bidi::looks_rtl(line) {
strip_inline_emphasis_in_rtl(line)
} else {
line.to_string()
}
})
.collect::<Vec<_>>()
.join("\n");
if !trailing_newlines.is_empty() {
final_result.push_str(&trailing_newlines);
}
}
if crate::text::bidi::looks_rtl(&final_result) {
final_result = wrap_bidi_isolates_per_line(&final_result);
}
Ok(final_result)
}
}
/// Passes every line of `text` that `looks_rtl` through
/// `crate::text::bidi::wrap_rtl_isolates`, leaving other lines untouched.
///
/// Lines are rejoined with `\n`, and the number of trailing newlines of the
/// input is preserved.
fn wrap_bidi_isolates_per_line(text: &str) -> String {
    let mut out = String::with_capacity(text.len() + 16);
    for (i, line) in text.lines().enumerate() {
        if i > 0 {
            out.push('\n');
        }
        if crate::text::bidi::looks_rtl(line) {
            let block_is_rtl = crate::text::bidi::paragraph_is_rtl(line);
            out.push_str(&crate::text::bidi::wrap_rtl_isolates(line, block_is_rtl));
        } else {
            out.push_str(line);
        }
    }
    // `lines()` swallows only the final line terminator; every additional
    // trailing newline already shows up as an empty line above. Restoring a
    // single newline therefore reproduces the input's trailing newlines
    // exactly, whereas re-appending all of them would add one too many.
    if text.ends_with('\n') {
        out.push('\n');
    }
    out
}
/// Removes `**...**` and `*...*` emphasis markers around text that
/// `looks_rtl`, keeping markers around any other text.
///
/// The line is scanned left to right. At each `*`, a `**` pair is tried
/// first, then a single `*` pair; the closing delimiter is the next
/// occurrence found by `find_matching`. A `*` without a closing partner is
/// copied through unchanged.
fn strip_inline_emphasis_in_rtl(line: &str) -> String {
    if !line.contains('*') {
        return line.to_string();
    }
    let bytes = line.as_bytes();
    let mut out = String::with_capacity(line.len());
    let mut cursor = 0;
    // Everything before `flushed` has already been written to `out`.
    let mut flushed = 0;
    while cursor < bytes.len() {
        // (delimiter width, index of the closing delimiter), if any.
        let delimited = if bytes[cursor] != b'*' {
            None
        } else {
            let strong = if bytes.get(cursor + 1).copied() == Some(b'*') {
                find_matching(bytes, cursor + 2, b"**").map(|close| (2, close))
            } else {
                None
            };
            strong.or_else(|| find_matching(bytes, cursor + 1, b"*").map(|close| (1, close)))
        };
        let Some((width, close)) = delimited else {
            cursor += 1;
            continue;
        };

        out.push_str(&line[flushed..cursor]);
        let marker = &line[cursor..cursor + width];
        let inner = &line[cursor + width..close];
        if crate::text::bidi::looks_rtl(inner) {
            out.push_str(inner);
        } else {
            out.push_str(marker);
            out.push_str(inner);
            out.push_str(marker);
        }
        cursor = close + width;
        flushed = cursor;
    }
    out.push_str(&line[flushed..]);
    out
}
/// Returns the index of the first occurrence of `needle` in `bytes` at or
/// after `from`, or `None` when there is none (including when `from` lies
/// beyond the end of `bytes`).
fn find_matching(bytes: &[u8], from: usize, needle: &[u8]) -> Option<usize> {
    (from..)
        .take_while(|&start| start + needle.len() <= bytes.len())
        .find(|&start| bytes[start..].starts_with(needle))
}
/// Detects a column boundary between two consecutive spans.
///
/// Reports a gap when `current` starts more than two font sizes to the left
/// of where `prev` starts, or when the horizontal space between the right
/// edge of `prev` and the left edge of `current` exceeds the larger of three
/// font sizes and 30.0. The font size used is the larger of the two spans',
/// but at least 1.0.
fn is_column_gap(prev: &OrderedTextSpan, current: &OrderedTextSpan) -> bool {
    let font_size = current.span.font_size.max(prev.span.font_size).max(1.0);
    let cur_left = current.span.bbox.x;

    // Jumped back to the left: treated as the start of another column.
    if cur_left + font_size * 2.0 < prev.span.bbox.x {
        return true;
    }

    let prev_right = prev.span.bbox.x + prev.span.bbox.width;
    let gap = cur_left - prev_right;
    gap > 0.0 && gap > (font_size * 3.0).max(30.0)
}
impl Default for MarkdownOutputConverter {
fn default() -> Self {
Self::new()
}
}
impl OutputConverter for MarkdownOutputConverter {
    /// Renders `spans` as Markdown without any tables.
    fn convert(&self, spans: &[OrderedTextSpan], config: &TextPipelineConfig) -> Result<String> {
        self.convert_with_tables(spans, &[], config)
    }

    /// Renders `spans` as Markdown, emitting `tables` as Markdown tables.
    fn convert_with_tables(
        &self,
        spans: &[OrderedTextSpan],
        tables: &[Table],
        config: &TextPipelineConfig,
    ) -> Result<String> {
        self.render_spans(spans, tables, config)
    }

    /// Name reported for this converter.
    fn name(&self) -> &'static str {
        "MarkdownOutputConverter"
    }

    /// MIME type of the produced output.
    fn mime_type(&self) -> &'static str {
        "text/markdown"
    }
}
#[cfg(test)]
mod tests {
use super::*;
use crate::geometry::Rect;
use crate::layout::{Color, TextSpan};
use crate::pipeline::converters::span_in_table;
use crate::pipeline::StructRole;
use crate::structure::table_extractor::{TableCell, TableRow};
/// A span tagged `StructRole::Heading(1)` must appear with a `# ` prefix,
/// and the body span that follows must still be present.
#[test]
fn test_struct_role_heading_emits_markdown_heading() {
    let mut heading = make_span("Document Title", 0.0, 100.0, 12.0, FontWeight::Normal);
    heading.struct_role = Some(StructRole::Heading(1));
    let paragraph = make_span("Body paragraph one.", 0.0, 80.0, 12.0, FontWeight::Normal);

    let markdown = MarkdownOutputConverter::new()
        .convert(&[heading, paragraph], &TextPipelineConfig::default())
        .unwrap();

    assert!(
        markdown.contains("# Document Title"),
        "expected '# Document Title' in output, got:\n{}",
        markdown
    );
    assert!(markdown.contains("Body paragraph one."));
}
/// An explicit `Heading(2)` role must produce a `## ` prefix for an
/// 11.0-size, normal-weight span.
#[test]
fn test_struct_role_h2_overrides_font_size_heuristic() {
    let mut section = make_span("Section Header", 0.0, 100.0, 11.0, FontWeight::Normal);
    section.struct_role = Some(StructRole::Heading(2));

    let rendered = MarkdownOutputConverter::new()
        .convert(&[section], &TextPipelineConfig::default())
        .unwrap();

    assert!(rendered.starts_with("## "), "expected `## ` heading prefix, got:\n{}", rendered);
}
/// Checks which line prefixes `is_ordered_list_marker` accepts and the
/// value it reports for each.
#[test]
fn test_is_ordered_list_marker_recognition() {
// Numeric markers report their number.
assert_eq!(MarkdownOutputConverter::is_ordered_list_marker("1. Foo"), Some(1));
assert_eq!(MarkdownOutputConverter::is_ordered_list_marker("12. Foo"), Some(12));
// Letter and roman-numeral style markers are expected to report 1.
assert_eq!(MarkdownOutputConverter::is_ordered_list_marker("a) Foo"), Some(1));
assert_eq!(MarkdownOutputConverter::is_ordered_list_marker("A. Foo"), Some(1));
assert_eq!(MarkdownOutputConverter::is_ordered_list_marker("iv. Foo"), Some(1));
// Dotted section numbers, bare numbers, and plain words are not markers.
assert!(MarkdownOutputConverter::is_ordered_list_marker("1.1 Foo").is_none());
assert!(MarkdownOutputConverter::is_ordered_list_marker("1986 was").is_none());
assert!(MarkdownOutputConverter::is_ordered_list_marker("Item one").is_none());
}
/// Three numbered items on successive baselines must each begin their own
/// output line.
#[test]
fn test_numbered_list_consecutive_lines_separate() {
    let converter = MarkdownOutputConverter::new();
    let config = TextPipelineConfig::default();
    let entries = ["1. Treasurer", "2. Safeguarding", "3. Volunteering"];
    let baselines: [f32; 3] = [100.0, 88.0, 76.0];
    let spans: Vec<OrderedTextSpan> = entries
        .iter()
        .zip(baselines)
        .map(|(text, y)| make_span(text, 0.0, y, 12.0, FontWeight::Normal))
        .collect();

    let result = converter.convert(&spans, &config).unwrap();

    for marker in entries {
        let found = result.lines().any(|line| line.trim_start().starts_with(marker));
        assert!(found, "expected line starting with `{}`, got:\n{}", marker, result);
    }
}
/// A `•` bullet following a plain sentence must be separated from it by a
/// blank line and rendered as `- `.
#[test]
fn test_bullet_after_paragraph_forces_break() {
    let lead_in = make_span("Intro sentence.", 0.0, 100.0, 12.0, FontWeight::Normal);
    let first_bullet = make_span("• First item", 0.0, 88.0, 12.0, FontWeight::Normal);
    let second_bullet = make_span("• Second item", 0.0, 76.0, 12.0, FontWeight::Normal);

    let markdown = MarkdownOutputConverter::new()
        .convert(&[lead_in, first_bullet, second_bullet], &TextPipelineConfig::default())
        .unwrap();

    assert!(
        markdown.contains("Intro sentence.\n\n- First item"),
        "expected blank line + bullet after intro, got:\n{}",
        markdown
    );
}
/// Every heading level 1 through 6 must be rendered with the matching
/// number of `#` characters.
#[test]
fn test_struct_role_emits_each_heading_level() {
    let converter = MarkdownOutputConverter::new();
    let config = TextPipelineConfig::default();
    for level in 1u8..=6 {
        let title = format!("Title L{}", level);
        let mut heading = make_span(&title, 0.0, 100.0, 12.0, FontWeight::Normal);
        heading.struct_role = Some(StructRole::Heading(level));
        let trailing = make_span("body", 0.0, 80.0, 12.0, FontWeight::Normal);

        let result = converter.convert(&[heading, trailing], &config).unwrap();

        let expected = format!("{} {}", "#".repeat(usize::from(level)), title);
        assert!(result.contains(&expected), "expected `{}`, got:\n{}", expected, result);
    }
}
/// Out-of-range heading levels (0, 7, 99, 250) must still yield between
/// one and six leading `#` characters on the first output line.
#[test]
fn test_struct_role_heading_level_is_clamped() {
    let converter = MarkdownOutputConverter::new();
    let config = TextPipelineConfig::default();
    for raw_level in [0u8, 7, 99, 250] {
        let mut span = make_span("Edgy", 0.0, 100.0, 12.0, FontWeight::Normal);
        span.struct_role = Some(StructRole::Heading(raw_level));
        let result = converter.convert(&[span], &config).unwrap();

        let first_line = result.lines().next().unwrap_or("");
        // `#` is a single byte, so the byte-length difference is the count.
        let hash_count = first_line.len() - first_line.trim_start_matches('#').len();
        assert!(
            hash_count >= 1 && hash_count <= 6,
            "raw_level {} produced {} `#`s in `{}`",
            raw_level,
            hash_count,
            first_line
        );
    }
}
/// Each list-related role (`ListItem`, `ListItemLabel`, `ListItemBody`)
/// must produce at least one line starting with `- `.
#[test]
fn test_struct_role_all_list_variants_emit_bullets() {
    let converter = MarkdownOutputConverter::new();
    let config = TextPipelineConfig::default();
    let list_roles = [
        StructRole::ListItem,
        StructRole::ListItemLabel,
        StructRole::ListItemBody,
    ];
    for role in list_roles {
        let mut item = make_span("Item", 0.0, 100.0, 12.0, FontWeight::Normal);
        item.struct_role = Some(role);
        let result = converter.convert(&[item], &config).unwrap();
        let has_bullet = result.lines().any(|line| line.starts_with("- "));
        assert!(has_bullet, "role {:?} did not emit a bullet, got:\n{}", role, result);
    }
}
/// A heading followed by a list item must render both, without the heading
/// prefix and the bullet marker ending up on the same line.
#[test]
fn test_heading_then_list_item_transition() {
    let mut heading = make_span("Section", 0.0, 100.0, 12.0, FontWeight::Normal);
    heading.struct_role = Some(StructRole::Heading(2));
    let mut item = make_span("First", 0.0, 80.0, 12.0, FontWeight::Normal);
    item.struct_role = Some(StructRole::ListItemBody);

    let markdown = MarkdownOutputConverter::new()
        .convert(&[heading, item], &TextPipelineConfig::default())
        .unwrap();

    assert!(markdown.contains("## Section"));
    assert!(markdown.contains("- First"));
    assert!(
        !markdown.contains("## - "),
        "heading prefix and bullet must not co-occur, got:\n{}",
        markdown
    );
}
/// Three spans with distinct block ids on descending baselines must come
/// out as three blank-line-separated paragraphs.
#[test]
fn test_block_id_three_paragraphs_three_breaks() {
    let converter = MarkdownOutputConverter::new();
    let config = TextPipelineConfig::default();
    let spans: Vec<OrderedTextSpan> = ["alpha", "beta", "gamma"]
        .iter()
        .enumerate()
        .map(|(idx, word)| {
            let baseline = 100.0 - (idx as f32 * 14.0);
            let mut span = make_span(word, 0.0, baseline, 12.0, FontWeight::Normal);
            span.block_id = Some((idx + 1) as u32);
            span
        })
        .collect();

    let result = converter.convert(&spans, &config).unwrap();

    let paras: Vec<&str> = result
        .split("\n\n")
        .map(str::trim)
        .filter(|chunk| !chunk.is_empty())
        .collect();
    assert_eq!(
        paras,
        vec!["alpha", "beta", "gamma"],
        "expected 3 separate paragraphs, got {:?}",
        paras
    );
}
/// When only the second of two spans has a block id assigned, no blank
/// line may be inserted between them.
#[test]
fn test_partial_block_id_does_not_force_break() {
    let untagged = make_span("first", 0.0, 100.0, 12.0, FontWeight::Normal);
    let mut tagged = make_span("second", 0.0, 88.0, 12.0, FontWeight::Normal);
    tagged.block_id = Some(1);

    let markdown = MarkdownOutputConverter::new()
        .convert(&[untagged, tagged], &TextPipelineConfig::default())
        .unwrap();

    assert!(
        !markdown.contains("\n\n"),
        "partial block_id must not introduce paragraph break, got:\n{}",
        markdown
    );
}
/// Additional accept/reject cases for `is_ordered_list_marker`.
#[test]
fn test_is_ordered_list_marker_extras() {
// Accepted markers.
assert_eq!(MarkdownOutputConverter::is_ordered_list_marker("99. Foo"), Some(99));
assert_eq!(MarkdownOutputConverter::is_ordered_list_marker("z) Last"), Some(1));
// Missing space after the delimiter, or a delimiter with no label.
assert!(MarkdownOutputConverter::is_ordered_list_marker("1.Foo").is_none());
assert!(MarkdownOutputConverter::is_ordered_list_marker(". Foo").is_none());
assert!(MarkdownOutputConverter::is_ordered_list_marker(") Foo").is_none());
// Empty and whitespace-only input.
assert!(MarkdownOutputConverter::is_ordered_list_marker("").is_none());
assert!(MarkdownOutputConverter::is_ordered_list_marker(" ").is_none());
// Currency, decimals, and a four-digit number are expected to be rejected.
assert!(MarkdownOutputConverter::is_ordered_list_marker("$1. Total").is_none());
assert!(MarkdownOutputConverter::is_ordered_list_marker("3.14 pi").is_none());
assert!(MarkdownOutputConverter::is_ordered_list_marker("2024. Year").is_none());
}
/// Several small raised spans ("st", "nd", "rd") interleaved with normal
/// text must stay attached to their numbers and never form lines of their
/// own.
#[test]
fn test_multiple_superscripts_one_line() {
    let converter = MarkdownOutputConverter::new();
    let config = TextPipelineConfig::default();
    // (text, x, y, font size)
    let pieces: [(&str, f32, f32, f32); 7] = [
        ("On the 1", 0.0, 100.0, 11.0),
        ("st", 25.0, 102.5, 7.0),
        (", 2", 30.0, 100.0, 11.0),
        ("nd", 40.0, 102.5, 7.0),
        (", and 3", 47.0, 100.0, 11.0),
        ("rd", 70.0, 102.5, 7.0),
        (" days", 75.0, 100.0, 11.0),
    ];
    let parts: Vec<OrderedTextSpan> = pieces
        .iter()
        .map(|&(text, x, y, size)| make_span(text, x, y, size, FontWeight::Normal))
        .collect();

    let result = converter.convert(&parts, &config).unwrap();

    for sup in ["st", "nd", "rd"] {
        let stranded = result.lines().any(|line| line.trim() == sup);
        assert!(!stranded, "bare `{}` line found in:\n{}", sup, result);
    }
    for token in ["1st", "2nd", "3rd"] {
        assert!(result.contains(token), "expected `{}` in output, got:\n{}", token, result);
    }
}
/// With heading detection enabled, a bold line at 11.55 among 11.0 body
/// text must be emitted as a level-3 or level-4 heading.
#[test]
fn test_bold_slight_size_bump_is_heading() {
    let converter = MarkdownOutputConverter::new();
    let mut config = TextPipelineConfig::default();
    config.output.detect_headings = true;

    let spans = vec![
        make_span("First body sentence.", 0.0, 100.0, 11.0, FontWeight::Normal),
        make_span("Second body sentence.", 0.0, 88.0, 11.0, FontWeight::Normal),
        make_span("Section Header", 0.0, 76.0, 11.55, FontWeight::Bold),
        make_span("After-heading body.", 0.0, 64.0, 11.0, FontWeight::Normal),
    ];
    let result = converter.convert(&spans, &config).unwrap();

    let is_h3 = result.contains("### Section Header");
    assert!(
        is_h3 || result.contains("#### Section Header"),
        "expected heading prefix on bold +5% line, got:\n{}",
        result
    );
}
/// Heading pieces that share one baseline but carry different block ids
/// must be merged into a single `# ` line containing all pieces.
#[test]
fn test_same_baseline_blocks_do_not_split_heading() {
    fn heading_piece(text: &str, x: f32, block: u32) -> OrderedTextSpan {
        let mut piece = make_span(text, x, 100.0, 18.0, FontWeight::Bold);
        piece.struct_role = Some(StructRole::Heading(1));
        piece.block_id = Some(block);
        piece
    }

    let converter = MarkdownOutputConverter::new();
    let config = TextPipelineConfig::default();
    let spans = vec![
        heading_piece("Form", 0.0, 1),
        heading_piece("1040", 50.0, 2),
        heading_piece("U.S. Individual Income Tax Return", 100.0, 3),
    ];
    let result = converter.convert(&spans, &config).unwrap();

    let heading_lines: Vec<&str> = result
        .lines()
        .filter(|line| line.trim_start().starts_with("# "))
        .collect();
    assert_eq!(
        heading_lines.len(),
        1,
        "expected one combined heading line, got {} in:\n{}",
        heading_lines.len(),
        result
    );
    let combined = heading_lines[0];
    let all_present = ["Form", "1040", "U.S. Individual Income Tax Return"]
        .iter()
        .all(|piece| combined.contains(*piece));
    assert!(
        all_present,
        "all three pieces must be in the single heading line, got: {}",
        combined
    );
}
/// List-item-body spans sharing one baseline but with different block ids
/// must produce exactly one bullet line.
#[test]
fn test_same_baseline_blocks_do_not_split_list_items() {
    fn list_piece(text: &str, x: f32, block: u32) -> OrderedTextSpan {
        let mut piece = make_span(text, x, 100.0, 12.0, FontWeight::Normal);
        piece.struct_role = Some(StructRole::ListItemBody);
        piece.block_id = Some(block);
        piece
    }

    let converter = MarkdownOutputConverter::new();
    let config = TextPipelineConfig::default();
    let spans = vec![
        list_piece("Apple", 0.0, 1),
        list_piece("Banana", 60.0, 2),
        list_piece("Cherry", 120.0, 3),
    ];
    let result = converter.convert(&spans, &config).unwrap();

    let bullet_count = result.lines().filter(|line| line.starts_with("- ")).count();
    assert_eq!(
        bullet_count,
        1,
        "horizontal list on one line must stay one bullet, got {} in:\n{}",
        bullet_count,
        result
    );
}
/// Spans with different block ids on clearly different baselines must be
/// emitted as separate paragraphs.
#[test]
fn test_different_baseline_blocks_still_split() {
    let converter = MarkdownOutputConverter::new();
    let config = TextPipelineConfig::default();
    // (text, baseline y, block id)
    let layout: [(&str, f32, u32); 3] = [("First.", 100.0, 1), ("Second.", 70.0, 2), ("Third.", 40.0, 3)];
    let spans: Vec<OrderedTextSpan> = layout
        .iter()
        .map(|&(text, y, block)| {
            let mut span = make_span(text, 0.0, y, 12.0, FontWeight::Normal);
            span.block_id = Some(block);
            span
        })
        .collect();

    let result = converter.convert(&spans, &config).unwrap();

    let paras: Vec<&str> = result
        .split("\n\n")
        .map(str::trim)
        .filter(|chunk| !chunk.is_empty())
        .collect();
    assert_eq!(
        paras,
        vec!["First.", "Second.", "Third."],
        "different baselines must still produce three paragraphs"
    );
}
/// Hebrew text followed by a lone `*` must survive conversion intact, and
/// the output must not contain U+0091 or U+00A0 characters.
#[test]
fn test_strip_inline_emphasis_preserves_rtl_chars_around_lone_asterisk() {
let converter = MarkdownOutputConverter::new();
let config = TextPipelineConfig::default();
let span = make_span("בנימין * world", 0.0, 100.0, 12.0, FontWeight::Normal);
let result = converter.convert(&[span], &config).unwrap();
assert!(
result.contains("בנימין"),
"Hebrew letters lost — UTF-8 corruption: {:?}",
result
);
// These code points would show up if individual UTF-8 bytes had been
// reinterpreted as characters.
assert!(
!result
.chars()
.any(|c| (c as u32) == 0x91 || (c as u32) == 0xA0),
"byte-as-char ghost characters present in: {:?}",
result
);
}
/// Table of right-to-left inputs (with and without `*`/`**` emphasis) and
/// the substrings each must still contain after conversion.
#[test]
fn test_arabic_strip_inline_emphasis_matrix() {
let converter = MarkdownOutputConverter::new();
let config = TextPipelineConfig::default();
// Each case is (input text, substrings expected in the output).
let cases: &[(&str, &[&str])] = &[
("اللغة العربية بسيطة", &["اللغة", "العربية", "بسيطة"]),
("בנימין * world", &["בנימין", "* world"]),
("مرحبا *عالم* اليوم", &["مرحبا", "عالم", "اليوم"]),
("مرحبا **عالم** اليوم", &["مرحبا", "عالم", "اليوم"]),
("مرحبا *Hello* اليوم", &["مرحبا", "*Hello*", "اليوم"]),
];
for (input, expected_subs) in cases {
let span = make_span(input, 0.0, 100.0, 12.0, FontWeight::Normal);
let result = converter.convert(&[span], &config).unwrap();
// Drop U+2066..U+2069 (bidi isolate controls) before substring checks.
let result_no_iso: String = result
.chars()
.filter(|c| !matches!(*c, '\u{2066}' | '\u{2067}' | '\u{2068}' | '\u{2069}'))
.collect();
for needle in *expected_subs {
assert!(
result_no_iso.contains(needle),
"input {:?} → expected {:?} in output:\n{}",
input,
needle,
result_no_iso
);
}
// No C1 control characters (U+0080..U+009F) or U+00A0 may appear.
assert!(
!result.chars().any(|c| {
let n = c as u32;
(0x80..=0x9F).contains(&n) || n == 0xA0
}),
"input {:?} produced Latin-1 ghost chars in: {:?}",
input,
result
);
}
}
/// Two list-item-body spans in the same block on consecutive baselines
/// must produce a single bullet line.
#[test]
fn test_wrapped_list_item_body_does_not_emit_extra_bullet() {
    fn item_line(text: &str, y: f32) -> OrderedTextSpan {
        let mut line = make_span(text, 0.0, y, 12.0, FontWeight::Normal);
        line.struct_role = Some(StructRole::ListItemBody);
        line.block_id = Some(7);
        line
    }

    let converter = MarkdownOutputConverter::new();
    let config = TextPipelineConfig::default();
    let spans = [
        item_line("First half of an item", 100.0),
        item_line("that wraps to next line.", 86.0),
    ];
    let result = converter.convert(&spans, &config).unwrap();

    let bullet_count = result.lines().filter(|line| line.starts_with("- ")).count();
    assert_eq!(
        bullet_count,
        1,
        "wrapped list item body must stay one bullet, got {} lines:\n{}",
        bullet_count,
        result
    );
}
/// A Hebrew word must appear in the output in its original character
/// order, not reversed.
#[test]
fn test_logical_hebrew_passes_through_unchanged() {
let converter = MarkdownOutputConverter::new();
let config = TextPipelineConfig::default();
let span = make_span("בנימין", 0.0, 100.0, 12.0, FontWeight::Normal);
let result = converter.convert(&[span], &config).unwrap();
assert!(result.contains("בנימין"), "Hebrew must survive intact; got: {:?}", result);
// The second literal is the same word with its characters reversed.
assert!(
!result.contains("ןימינב"),
"must NOT contain reversed Hebrew; got: {:?}",
result
);
}
/// For a heading whose text is Arabic presentation-form characters, any
/// output line containing that text must start with `#`.
#[test]
fn test_arabic_heading_keeps_hash_at_start() {
let converter = MarkdownOutputConverter::new();
let mut config = TextPipelineConfig::default();
config.output.detect_headings = true;
let mut h = make_span("ﺔﻴﺑﺮﻌﻟا", 0.0, 100.0, 24.0, FontWeight::Bold);
h.struct_role = Some(StructRole::Heading(1));
let result = converter.convert(&[h], &config).unwrap();
// Only lines that still contain the literal input text are checked; if
// no line contains it, the loop asserts nothing.
for line in result.lines() {
if line.contains("ﺔﻴﺑﺮﻌﻟا") {
assert!(
line.trim_start().starts_with('#'),
"heading line must start with `#`, got: {:?}",
line
);
}
}
}
/// A span that jumps far back to the left at nearly the same baseline must
/// start a new paragraph instead of being glued to the previous word.
#[test]
fn test_backward_x_wrap_at_same_baseline_splits_paragraph() {
    let converter = MarkdownOutputConverter::new();
    let config = TextPipelineConfig::default();
    let column_end = make_span("constitution", 976.7, 1013.2, 12.0, FontWeight::Normal);
    let column_start = make_span("Assailing", 192.6, 1011.7, 12.0, FontWeight::Normal);

    let result = converter.convert(&[column_end, column_start], &config).unwrap();

    assert!(
        !result.contains("constitutionAssailing"),
        "column wrap created concatenation, got:\n{}",
        result
    );
    let paragraph_count = result
        .split("\n\n")
        .filter(|chunk| !chunk.trim().is_empty())
        .count();
    assert!(
        paragraph_count >= 2,
        "expected ≥2 paragraphs from column wrap, got {} in:\n{}",
        paragraph_count,
        result
    );
    assert!(result.contains("constitution"));
    assert!(result.contains("Assailing"));
}
/// A small leftward step (x 100.0 to 92.0) on the same baseline must not
/// split the text into more than one paragraph.
#[test]
fn test_minor_x_backwards_within_tolerance_does_not_split() {
    let earlier = make_span("hello", 100.0, 100.0, 12.0, FontWeight::Normal);
    let later = make_span("world", 92.0, 100.0, 12.0, FontWeight::Normal);

    let result = MarkdownOutputConverter::new()
        .convert(&[earlier, later], &TextPipelineConfig::default())
        .unwrap();

    let paragraph_count = result
        .split("\n\n")
        .filter(|chunk| !chunk.trim().is_empty())
        .count();
    assert_eq!(paragraph_count, 1, "minor backstep must stay on one paragraph: {:?}", result);
}
/// A large backward x jump must prevent concatenation whether or not the
/// two spans carry block ids.
#[test]
fn test_backward_x_wrap_works_with_or_without_block_id() {
    let converter = MarkdownOutputConverter::new();
    let config = TextPipelineConfig::default();
    for assign_block in [false, true] {
        let mut left_tail = make_span("end of col1", 800.0, 100.0, 12.0, FontWeight::Normal);
        let mut right_head = make_span("Start of col2", 100.0, 100.0, 12.0, FontWeight::Normal);
        if assign_block {
            left_tail.block_id = Some(1);
            right_head.block_id = Some(2);
        }

        let result = converter.convert(&[left_tail, right_head], &config).unwrap();

        let glued = result.contains("col1Start");
        assert!(!glued, "block_id={}: column wrap concat in:\n{}", assign_block, result);
    }
}
/// A backward x jump combined with a different baseline must not glue the
/// two spans' words together.
#[test]
fn test_backward_x_wrap_on_different_baseline() {
    let first_column = make_span("col1 last", 800.0, 200.0, 12.0, FontWeight::Normal);
    let second_column = make_span("Col2 first", 100.0, 600.0, 12.0, FontWeight::Normal);

    let markdown = MarkdownOutputConverter::new()
        .convert(&[first_column, second_column], &TextPipelineConfig::default())
        .unwrap();

    assert!(!markdown.contains("lastCol2"));
}
/// Five word pairs, each laid out as a far-right span followed by a
/// far-left span one unit lower, must never be concatenated.
#[test]
fn test_all_five_ia_0047_patterns_split() {
    let converter = MarkdownOutputConverter::new();
    let config = TextPipelineConfig::default();
    // (previous word, current word, baseline y, font size)
    let patterns: [(&str, &str, f32, f32); 5] = [
        ("constitution", "Assailing", 1013.0, 12.0),
        ("harvesting", "Senator", 1162.0, 12.0),
        ("humoro", "Spartacus", 950.0, 11.0),
        ("posscssec", "France", 800.0, 12.0),
        ("should", "Satisfy", 600.0, 12.0),
    ];
    for (a, b, baseline, size) in patterns {
        let prev = make_span(a, 800.0, baseline, size, FontWeight::Normal);
        let cur = make_span(b, 150.0, baseline - 1.0, size, FontWeight::Normal);
        let result = converter.convert(&[prev, cur], &config).unwrap();

        let joined = [a, b].concat();
        assert!(
            !result.contains(&joined),
            "pattern {:?}+{:?} created `{}` in:\n{}",
            a,
            b,
            joined,
            result
        );
    }
}
/// Closely spaced heading pieces on one baseline (x = 0, 35, 80) must still
/// be joined into a single `# ` line.
#[test]
fn test_column_wrap_does_not_break_form_heading_join() {
    fn heading_piece(text: &str, x: f32, block: u32) -> OrderedTextSpan {
        let mut piece = make_span(text, x, 100.0, 18.0, FontWeight::Bold);
        piece.struct_role = Some(StructRole::Heading(1));
        piece.block_id = Some(block);
        piece
    }

    let spans = vec![
        heading_piece("Form", 0.0, 1),
        heading_piece("1040", 35.0, 2),
        heading_piece("Title", 80.0, 3),
    ];
    let result = MarkdownOutputConverter::new()
        .convert(&spans, &TextPipelineConfig::default())
        .unwrap();

    let heading_count = result.lines().filter(|line| line.starts_with("# ")).count();
    assert_eq!(heading_count, 1, "form heading still joins: {}", result);
}
/// Table-driven check of `is_column_gap` over previous-span x/width,
/// current-span x, and font size.
#[test]
fn test_is_column_gap_matrix() {
    // (prev x, prev width, current x, font size, expected result)
    let cases: [(f32, f32, f32, f32, bool); 6] = [
        (100.0, 50.0, 154.0, 12.0, false),
        (100.0, 50.0, 186.5, 12.0, true),
        (100.0, 50.0, 160.0, 12.0, false),
        (200.0, 50.0, 100.0, 12.0, true),
        (100.0, 50.0, 92.0, 12.0, false),
        (976.7, 37.8, 192.6, 12.0, true),
    ];
    for (px, pw, cx, font, expected) in cases {
        let mut prev = make_span("p", px, 100.0, font, FontWeight::Normal);
        // `make_span` uses a fixed width; override it for this case.
        prev.span.bbox.width = pw;
        let cur = make_span("c", cx, 100.0, font, FontWeight::Normal);

        let actual = is_column_gap(&prev, &cur);

        assert_eq!(
            actual, expected,
            "(px={}, pw={}, cx={}, font={}) expected {} got {}",
            px, pw, cx, font, expected, actual
        );
    }
}
/// Two spans on one baseline with different block ids and a wide
/// horizontal gap must be split into at least two paragraphs.
#[test]
fn test_column_gap_with_block_change_splits() {
    let converter = MarkdownOutputConverter::new();
    let config = TextPipelineConfig::default();
    let mut left = make_span("and", 0.0, 100.0, 12.0, FontWeight::Normal);
    left.block_id = Some(1);
    let mut right = make_span("might", 180.0, 100.0, 12.0, FontWeight::Normal);
    right.block_id = Some(2);

    let result = converter.convert(&[left, right], &config).unwrap();

    assert!(
        !result.contains("andmight"),
        "column-gap join produced concatenated token, got:\n{}",
        result
    );
    assert!(result.contains("and"));
    assert!(result.contains("might"));
    let paragraph_count = result
        .split("\n\n")
        .filter(|chunk| !chunk.trim().is_empty())
        .count();
    assert!(
        paragraph_count >= 2,
        "expected ≥2 paragraphs separated by column gap, got {} in:\n{}",
        paragraph_count,
        result
    );
}
/// Heading pieces at x = 0, 40, 100 on one baseline must remain on a single
/// `# ` line despite differing block ids.
#[test]
fn test_form_heading_inline_gap_still_joins() {
    fn heading_piece(text: &str, x: f32, block: u32) -> OrderedTextSpan {
        let mut piece = make_span(text, x, 100.0, 18.0, FontWeight::Bold);
        piece.struct_role = Some(StructRole::Heading(1));
        piece.block_id = Some(block);
        piece
    }

    let spans = vec![
        heading_piece("Form", 0.0, 1),
        heading_piece("1040", 40.0, 2),
        heading_piece("U.S.", 100.0, 3),
    ];
    let result = MarkdownOutputConverter::new()
        .convert(&spans, &TextPipelineConfig::default())
        .unwrap();

    let heading_count = result.lines().filter(|line| line.starts_with("# ")).count();
    assert_eq!(
        heading_count,
        1,
        "small-gap form pieces must stay on one heading line, got:\n{}",
        result
    );
}
/// Two spans with different block ids and a moderate horizontal gap on the
/// same baseline must stay in one paragraph.
#[test]
fn test_moderate_gap_does_not_force_column_break() {
    let mut first_field = make_span("First field", 0.0, 100.0, 12.0, FontWeight::Normal);
    first_field.block_id = Some(1);
    let mut second_field = make_span("Second field", 80.0, 100.0, 12.0, FontWeight::Normal);
    second_field.block_id = Some(2);

    let result = MarkdownOutputConverter::new()
        .convert(&[first_field, second_field], &TextPipelineConfig::default())
        .unwrap();

    let paragraph_count = result
        .split("\n\n")
        .filter(|chunk| !chunk.trim().is_empty())
        .count();
    assert_eq!(
        paragraph_count,
        1,
        "moderate gap (≈2.5× font) must keep content on one paragraph, got:\n{}",
        result
    );
}
/// Three widely separated spans on one baseline, each in its own block,
/// must yield three paragraphs.
#[test]
fn test_three_column_layout_splits_into_three_paragraphs() {
    fn column(text: &str, x: f32, block: u32) -> OrderedTextSpan {
        let mut cell = make_span(text, x, 100.0, 12.0, FontWeight::Normal);
        cell.block_id = Some(block);
        cell
    }

    let spans = vec![
        column("col one", 0.0, 1),
        column("col two", 200.0, 2),
        column("col three", 400.0, 3),
    ];
    let result = MarkdownOutputConverter::new()
        .convert(&spans, &TextPipelineConfig::default())
        .unwrap();

    let paragraph_count = result
        .split("\n\n")
        .filter(|chunk| !chunk.trim().is_empty())
        .count();
    assert_eq!(paragraph_count, 3, "three columns must produce three paragraphs, got:\n{}", result);
}
/// Without block ids, two spans separated by a wide gap must both appear
/// and must not be concatenated directly.
#[test]
fn test_column_gap_without_block_id_still_splits() {
    let left = make_span("left column.", 0.0, 100.0, 12.0, FontWeight::Normal);
    let right = make_span("right column.", 200.0, 100.0, 12.0, FontWeight::Normal);

    let result = MarkdownOutputConverter::new()
        .convert(&[left, right], &TextPipelineConfig::default())
        .unwrap();

    let has_left = result.contains("left column");
    assert!(
        has_left && result.contains("right column"),
        "both columns must surface, got:\n{}",
        result
    );
    assert!(
        !result.contains("column.right"),
        "must not concatenate across column gap, got:\n{}",
        result
    );
}
/// Heading pieces whose baselines differ by only ±0.3 must still be joined
/// into one `# ` line.
#[test]
fn test_minor_baseline_jitter_still_joins() {
    fn heading_piece(text: &str, x: f32, y: f32, block: u32) -> OrderedTextSpan {
        let mut piece = make_span(text, x, y, 18.0, FontWeight::Bold);
        piece.struct_role = Some(StructRole::Heading(1));
        piece.block_id = Some(block);
        piece
    }

    let spans = vec![
        heading_piece("A", 0.0, 100.0, 1),
        heading_piece("B", 30.0, 100.3, 2),
        heading_piece("C", 60.0, 99.7, 3),
    ];
    let result = MarkdownOutputConverter::new()
        .convert(&spans, &TextPipelineConfig::default())
        .unwrap();

    let heading_count = result.lines().filter(|line| line.starts_with("# ")).count();
    assert_eq!(heading_count, 1, "tiny jitter must not split heading, got:\n{}", result);
}
/// Two heading spans 30 units apart vertically, in different blocks, must
/// be emitted as two separate `# ` lines.
#[test]
fn test_large_baseline_drop_still_splits_heading() {
    fn heading_at(text: &str, y: f32, block: u32) -> OrderedTextSpan {
        let mut heading = make_span(text, 0.0, y, 18.0, FontWeight::Bold);
        heading.struct_role = Some(StructRole::Heading(1));
        heading.block_id = Some(block);
        heading
    }

    let spans = vec![
        heading_at("First Heading", 100.0, 1),
        heading_at("Second Heading", 70.0, 2),
    ];
    let result = MarkdownOutputConverter::new()
        .convert(&spans, &TextPipelineConfig::default())
        .unwrap();

    let heading_count = result.lines().filter(|line| line.starts_with("# ")).count();
    assert_eq!(
        heading_count,
        2,
        "two visually-separated headings must both surface, got:\n{}",
        result
    );
}
/// A change of block id between two consecutive lines must produce a blank
/// line between them.
#[test]
fn test_block_id_change_forces_paragraph_break() {
    let mut first = make_span("Paragraph one body text.", 0.0, 100.0, 12.0, FontWeight::Normal);
    first.block_id = Some(1);
    let mut second = make_span("Paragraph two starts here.", 0.0, 88.0, 12.0, FontWeight::Normal);
    second.block_id = Some(2);

    let markdown = MarkdownOutputConverter::new()
        .convert(&[first, second], &TextPipelineConfig::default())
        .unwrap();

    assert!(
        markdown.contains("Paragraph one body text.\n\nParagraph two starts here."),
        "expected double newline between block_ids 1→2, got:\n{:?}",
        markdown
    );
}
/// Two consecutive lines sharing a block id must not be separated by a
/// blank line.
#[test]
fn test_same_block_id_keeps_paragraph_continuous() {
    let mut upper = make_span("first line", 0.0, 100.0, 12.0, FontWeight::Normal);
    upper.block_id = Some(7);
    let mut lower = make_span("second line", 0.0, 88.0, 12.0, FontWeight::Normal);
    lower.block_id = Some(7);

    let markdown = MarkdownOutputConverter::new()
        .convert(&[upper, lower], &TextPipelineConfig::default())
        .unwrap();

    assert!(
        !markdown.contains("\n\n"),
        "same block_id must not introduce paragraph break, got:\n{:?}",
        markdown
    );
}
/// A small raised "st" span between two normal spans must be rendered
/// inline ("21st they met") rather than on a line of its own.
#[test]
fn test_superscript_text_rise_does_not_split_line() {
    let spans = [
        make_span("On June 21", 0.0, 100.0, 11.0, FontWeight::Normal),
        make_span("st", 35.0, 102.5, 7.0, FontWeight::Normal),
        make_span(" they met.", 42.0, 100.0, 11.0, FontWeight::Normal),
    ];

    let result = MarkdownOutputConverter::new()
        .convert(&spans, &TextPipelineConfig::default())
        .unwrap();

    assert!(
        result.contains("21st they met"),
        "expected '21st they met' inline, got:\n{}",
        result
    );
    let stranded = result.lines().any(|line| line.trim() == "st");
    assert!(!stranded, "no bare 'st' line allowed, got:\n{}", result);
}
/// Three list-item-body spans on descending baselines must each be
/// rendered with a `- ` prefix.
#[test]
fn test_struct_role_list_items_emit_bullets() {
    let converter = MarkdownOutputConverter::new();
    let config = TextPipelineConfig::default();
    let items: Vec<OrderedTextSpan> = ["Apple", "Banana", "Cherry"]
        .iter()
        .enumerate()
        .map(|(idx, fruit)| {
            let baseline = 100.0 - (idx as f32 * 14.0);
            let mut span = make_span(fruit, 0.0, baseline, 12.0, FontWeight::Normal);
            span.struct_role = Some(StructRole::ListItemBody);
            span
        })
        .collect();

    let result = converter.convert(&items, &config).unwrap();

    for bullet in ["- Apple", "- Banana", "- Cherry"] {
        assert!(result.contains(bullet), "expected `{}` line in output, got:\n{}", bullet, result);
    }
}
/// Test fixture: builds an `OrderedTextSpan` with the given text, position,
/// explicit bounding-box width, font size, and weight.
///
/// The bounding-box height is set to `font_size`; every other `TextSpan`
/// field gets a fixed neutral value. The second argument to
/// `OrderedTextSpan::new` is always `0`.
fn make_span_w(
    text: &str,
    x: f32,
    y: f32,
    width: f32,
    font_size: f32,
    weight: FontWeight,
) -> OrderedTextSpan {
    let raw = TextSpan {
        artifact_type: None,
        text: text.to_owned(),
        bbox: Rect::new(x, y, width, font_size),
        font_name: String::from("Test"),
        font_size,
        font_weight: weight,
        is_italic: false,
        is_monospace: false,
        color: Color::black(),
        mcid: None,
        sequence: 0,
        offset_semantic: false,
        split_boundary_before: false,
        char_spacing: 0.0,
        word_spacing: 0.0,
        horizontal_scaling: 100.0,
        primary_detected: false,
        char_widths: Vec::new(),
        heading_level: None,
    };
    OrderedTextSpan::new(raw, 0)
}
fn make_span(
text: &str,
x: f32,
y: f32,
font_size: f32,
weight: FontWeight,
) -> OrderedTextSpan {
OrderedTextSpan::new(
TextSpan {
artifact_type: None,
text: text.to_string(),
bbox: Rect::new(x, y, 50.0, font_size),
font_name: "Test".to_string(),
font_size,
font_weight: weight,
is_italic: false,
is_monospace: false,
color: Color::black(),
mcid: None,
sequence: 0,
offset_semantic: false,
split_boundary_before: false,
char_spacing: 0.0,
word_spacing: 0.0,
horizontal_scaling: 100.0,
primary_detected: false,
char_widths: vec![],
heading_level: None,
},
0,
)
}
/// Converting an empty span slice must yield an empty string.
#[test]
fn test_empty_spans() {
let converter = MarkdownOutputConverter::new();
let config = TextPipelineConfig::default();
let result = converter.convert(&[], &config).unwrap();
assert_eq!(result, "");
}
/// A single plain span must be rendered as its text followed by one
/// newline.
#[test]
fn test_single_span() {
    let only = make_span("Hello world", 0.0, 100.0, 12.0, FontWeight::Normal);
    let rendered = MarkdownOutputConverter::new()
        .convert(&[only], &TextPipelineConfig::default())
        .unwrap();
    assert_eq!(rendered, "Hello world\n");
}
/// A bold-weight span must be wrapped in `**` markers.
#[test]
fn test_bold_text() {
    let strong = make_span("Bold text", 0.0, 100.0, 12.0, FontWeight::Bold);
    let rendered = MarkdownOutputConverter::new()
        .convert(&[strong], &TextPipelineConfig::default())
        .unwrap();
    assert_eq!(rendered, "**Bold text**\n");
}
/// A bold span consisting only of whitespace must not produce any `**`
/// markers.
#[test]
fn test_whitespace_bold_conservative() {
    let blank = make_span(" ", 0.0, 100.0, 12.0, FontWeight::Bold);
    let rendered = MarkdownOutputConverter::new()
        .convert(&[blank], &TextPipelineConfig::default())
        .unwrap();
    assert!(!rendered.contains("**"));
}
/// `convert_with_tables` with no spans and one 2×2 table (header + data
/// row) must emit the header cells, a separator, and the data cells.
#[test]
fn test_convert_with_tables_renders_markdown_table() {
    let converter = MarkdownOutputConverter::new();
    let config = TextPipelineConfig::default();

    let mut grid = Table::new();
    grid.bbox = Some(Rect::new(10.0, 50.0, 200.0, 100.0));
    grid.col_count = 2;
    grid.has_header = true;

    let mut header_row = TableRow::new(true);
    for label in ["Name", "Value"] {
        header_row.add_cell(TableCell::new(label.to_string(), true));
    }
    grid.add_row(header_row);

    let mut body_row = TableRow::new(false);
    for value in ["A", "1"] {
        body_row.add_cell(TableCell::new(value.to_string(), false));
    }
    grid.add_row(body_row);

    let result = converter.convert_with_tables(&[], &[grid], &config).unwrap();

    assert!(result.contains("| Name |"));
    assert!(result.contains("| Value |"));
    assert!(result.contains("---|"));
    assert!(result.contains("| A |"));
    assert!(result.contains("| 1 |"));
}
/// Rendering a table with no rows must return an empty string.
#[test]
fn test_render_table_markdown_empty() {
    let empty_table = Table::new();
    let config = crate::pipeline::TextPipelineConfig::default();
    let rendered = MarkdownOutputConverter::new().render_table_markdown(&empty_table, &config);
    assert_eq!(rendered, "");
}
/// A single non-header row must still be rendered with its cells and a
/// separator line.
#[test]
fn test_render_table_markdown_single_row_no_header() {
    let mut only_row = TableRow::new(false);
    for text in ["A", "B"] {
        only_row.add_cell(TableCell::new(text.to_string(), false));
    }
    let mut table = Table::new();
    table.add_row(only_row);

    let config = crate::pipeline::TextPipelineConfig::default();
    let result = MarkdownOutputConverter::new().render_table_markdown(&table, &config);

    assert!(result.contains("| A |"));
    assert!(result.contains("| B |"));
    assert!(result.contains("---|"));
}
/// A header cell with colspan 2 above a two-cell data row must render the
/// header text and a two-column separator.
#[test]
fn test_render_table_markdown_with_colspan() {
    let mut table = Table::new();
    table.has_header = true;

    let mut header_row = TableRow::new(true);
    header_row.add_cell(TableCell::new("Wide".to_string(), true).with_colspan(2));
    table.add_row(header_row);

    let mut body_row = TableRow::new(false);
    for text in ["Left", "Right"] {
        body_row.add_cell(TableCell::new(text.to_string(), false));
    }
    table.add_row(body_row);

    let config = crate::pipeline::TextPipelineConfig::default();
    let result = MarkdownOutputConverter::new().render_table_markdown(&table, &config);

    assert!(result.contains("| Wide |"));
    assert!(result.contains("---|---|"));
}
/// A `|` inside cell text must be emitted as `\|`.
#[test]
fn test_render_table_markdown_escapes_pipes() {
    let mut piped_row = TableRow::new(false);
    piped_row.add_cell(TableCell::new("A|B".to_string(), false));
    let mut table = Table::new();
    table.add_row(piped_row);

    let config = crate::pipeline::TextPipelineConfig::default();
    let result = MarkdownOutputConverter::new().render_table_markdown(&table, &config);

    assert!(result.contains("A\\|B"), "Pipes should be backslash-escaped: {}", result);
}
/// A newline inside cell text must be replaced so the two parts appear
/// space-separated on one line.
#[test]
fn test_render_table_markdown_replaces_newlines() {
    let mut multiline_row = TableRow::new(false);
    multiline_row.add_cell(TableCell::new("Line1\nLine2".to_string(), false));
    let mut table = Table::new();
    table.add_row(multiline_row);

    let config = crate::pipeline::TextPipelineConfig::default();
    let result = MarkdownOutputConverter::new().render_table_markdown(&table, &config);

    assert!(!result.contains("Line1\nLine2"), "Newlines in cells should be replaced");
    assert!(result.contains("Line1 Line2"));
}
/// Leading and trailing whitespace in cell text must be trimmed in the
/// rendered cell.
#[test]
fn test_render_table_markdown_trims_whitespace() {
    let mut padded_row = TableRow::new(false);
    padded_row.add_cell(TableCell::new(" padded ".to_string(), false));
    let mut table = Table::new();
    table.add_row(padded_row);

    let config = crate::pipeline::TextPipelineConfig::default();
    let result = MarkdownOutputConverter::new().render_table_markdown(&table, &config);

    assert!(result.contains("| padded |"));
}
/// With two header rows and one data row, the output must have four lines and
/// the separator must sit on the third line (index 2).
#[test]
fn test_render_table_markdown_multiple_header_rows() {
    let mut table = Table::new();
    table.has_header = true;
    // (cell text, whether the row and its cell are header-flagged)
    for (text, is_header) in [("H1", true), ("H2", true), ("D1", false)] {
        let mut row = TableRow::new(is_header);
        row.add_cell(TableCell::new(text.to_string(), is_header));
        table.add_row(row);
    }

    let config = crate::pipeline::TextPipelineConfig::default();
    let rendered = MarkdownOutputConverter::new().render_table_markdown(&table, &config);

    let rendered_lines: Vec<&str> = rendered.lines().collect();
    assert_eq!(rendered_lines.len(), 4);
    assert!(rendered_lines[2].contains("---|"));
}
/// A span positioned inside the only table's bounding box resolves to index 0.
#[test]
fn test_span_in_table_match() {
    let mut table = Table::new();
    table.bbox = Some(Rect::new(10.0, 50.0, 200.0, 100.0));
    let tables = [table];

    let candidate = make_span("text", 50.0, 70.0, 12.0, FontWeight::Normal);

    assert_eq!(span_in_table(&candidate, &tables), Some(0));
}
/// A span placed at (500, 500), far from the table's bounding box, matches no table.
#[test]
fn test_span_in_table_no_match() {
    let mut table = Table::new();
    table.bbox = Some(Rect::new(10.0, 50.0, 200.0, 100.0));
    let tables = [table];

    let outsider = make_span("text", 500.0, 500.0, 12.0, FontWeight::Normal);

    assert_eq!(span_in_table(&outsider, &tables), None);
}
/// A freshly constructed table (no bbox assigned by this test) never claims a span.
#[test]
fn test_span_in_table_none_bbox() {
    let tables = [Table::new()];
    let candidate = make_span("text", 50.0, 70.0, 12.0, FontWeight::Normal);

    assert_eq!(span_in_table(&candidate, &tables), None);
}
/// A span created at (8.5, 48.5) against a bbox created at (10, 50) is still
/// expected to be attributed to the table.
#[test]
fn test_span_in_table_tolerance() {
    let mut table = Table::new();
    table.bbox = Some(Rect::new(10.0, 50.0, 200.0, 100.0));
    let tables = [table];

    let near_edge = make_span("text", 8.5, 48.5, 12.0, FontWeight::Normal);

    assert_eq!(span_in_table(&near_edge, &tables), Some(0), "Should match within tolerance");
}
/// With two tables side by side, a span at x = 350 must resolve to the second
/// table (index 1), whose bbox is created at x = 300.
#[test]
fn test_span_in_table_multiple_tables() {
    let table_at = |left: f32| {
        let mut table = Table::new();
        table.bbox = Some(Rect::new(left, 50.0, 200.0, 100.0));
        table
    };
    let tables = [table_at(10.0), table_at(300.0)];

    let candidate = make_span("text", 350.0, 70.0, 12.0, FontWeight::Normal);

    assert_eq!(span_in_table(&candidate, &tables), Some(1));
}
/// Converts three spans (one before, one inside, one after a table's bbox)
/// together with the table, and expects both the surrounding text and the
/// rendered table to appear in the output.
#[test]
fn test_convert_with_tables_mixed_content() {
    let mut above = make_span("Before table", 10.0, 200.0, 12.0, FontWeight::Normal);
    above.reading_order = 0;
    let mut inside = make_span("Val", 50.0, 70.0, 12.0, FontWeight::Normal);
    inside.reading_order = 1;
    let mut below = make_span("After table", 10.0, 20.0, 12.0, FontWeight::Normal);
    below.reading_order = 2;

    let mut table = Table::new();
    table.bbox = Some(Rect::new(10.0, 50.0, 200.0, 100.0));
    table.has_header = true;
    // One header row ("Col") followed by one data row ("Val").
    for (text, is_header) in [("Col", true), ("Val", false)] {
        let mut row = TableRow::new(is_header);
        row.add_cell(TableCell::new(text.to_string(), is_header));
        table.add_row(row);
    }

    let config = TextPipelineConfig::default();
    let output = MarkdownOutputConverter::new()
        .convert_with_tables(&[above, inside, below], &[table], &config)
        .unwrap();

    assert!(output.contains("Before table"), "Should contain text before table");
    assert!(output.contains("| Col |"), "Should contain table");
    assert!(output.contains("After table"), "Should contain text after table");
}
/// With an empty table list, `convert_with_tables` must produce exactly the
/// same output as `convert` for the same spans.
#[test]
fn test_convert_with_tables_no_tables_is_same_as_convert() {
    let config = TextPipelineConfig::default();
    let converter = MarkdownOutputConverter::new();
    let spans = vec![make_span("Hello", 0.0, 100.0, 12.0, FontWeight::Normal)];

    let plain = converter.convert(&spans, &config).unwrap();
    let table_aware = converter.convert_with_tables(&spans, &[], &config).unwrap();

    assert_eq!(plain, table_aware);
}
/// Two single-cell tables and no spans: both tables must appear in the output.
#[test]
fn test_convert_with_tables_multiple_tables() {
    // Builds a one-row, one-cell table whose bbox is created at the given x.
    let single_cell_table = |left: f32, label: &str| -> Table {
        let mut only_row = TableRow::new(false);
        only_row.add_cell(TableCell::new(label.to_string(), false));

        let mut table = Table::new();
        table.bbox = Some(Rect::new(left, 50.0, 100.0, 50.0));
        table.add_row(only_row);
        table
    };
    let tables = [single_cell_table(10.0, "T1"), single_cell_table(200.0, "T2")];

    let config = TextPipelineConfig::default();
    let output = MarkdownOutputConverter::new()
        .convert_with_tables(&[], &tables, &config)
        .unwrap();

    assert!(output.contains("| T1 |"), "Should contain first table");
    assert!(output.contains("| T2 |"), "Should contain second table");
}
/// `is_bullet_span` accepts text that is only a bullet glyph (optionally
/// padded with spaces) and rejects plain text, bullet-plus-text, and "".
#[test]
fn test_is_bullet_span() {
    for bullet_only in ["►", "•", "▪", " ► "] {
        assert!(MarkdownOutputConverter::is_bullet_span(bullet_only));
    }
    for not_a_bullet in ["text", "►text", ""] {
        assert!(!MarkdownOutputConverter::is_bullet_span(not_a_bullet));
    }
}
/// `starts_with_bullet` accepts text that begins with a bullet glyph (leading
/// spaces allowed) and rejects plain text and the empty string.
#[test]
fn test_starts_with_bullet() {
    for bulleted in ["►text", "• item", " ► indented"] {
        assert!(MarkdownOutputConverter::starts_with_bullet(bulleted));
    }
    for plain in ["text", ""] {
        assert!(!MarkdownOutputConverter::starts_with_bullet(plain));
    }
}
/// `strip_bullet` removes a leading bullet glyph (and the space after it, when
/// present) and leaves unbulleted text untouched.
#[test]
fn test_strip_bullet() {
    let cases = [
        ("► text", "text"),
        ("•item", "item"),
        ("no bullet", "no bullet"),
    ];
    for (input, expected) in cases {
        assert_eq!(MarkdownOutputConverter::strip_bullet(input), expected);
    }
}
/// Standalone `►` spans followed by text spans on the same baseline must be
/// emitted as `- ` list items, and the raw glyph must not survive.
#[test]
fn test_bullet_spans_become_list_items() {
    // Creates a span and stamps its reading order in one step.
    let ordered = |text: &str, x: f32, y: f32, size, weight: FontWeight, order: usize| {
        let mut span = make_span(text, x, y, size, weight);
        span.reading_order = order;
        span
    };
    let spans = vec![
        ordered("FEATURES", 50.0, 660.0, 11.0, FontWeight::Bold, 0),
        ordered("►", 50.0, 640.0, 8.8, FontWeight::Normal, 1),
        ordered("Analog input", 60.0, 640.0, 11.0, FontWeight::Normal, 2),
        ordered("►", 50.0, 626.0, 8.8, FontWeight::Normal, 3),
        ordered("16-bit ADC", 60.0, 626.0, 11.0, FontWeight::Normal, 4),
    ];

    let config = TextPipelineConfig::default();
    let markdown = MarkdownOutputConverter::new().convert(&spans, &config).unwrap();

    assert!(
        markdown.contains("- Analog input"),
        "Should convert bullet to list item: {}",
        markdown
    );
    assert!(markdown.contains("- 16-bit ADC"), "Should convert second bullet: {}", markdown);
    assert!(!markdown.contains("►"), "Should not contain raw bullet character: {}", markdown);
}
/// A span whose text itself starts with `► ` (after a bold title span) must be
/// rendered as a `- ` list item.
#[test]
fn test_inline_bullet_becomes_list_item() {
    let mut heading = make_span("TITLE", 50.0, 660.0, 11.0, FontWeight::Bold);
    heading.reading_order = 0;
    let mut item = make_span("► Analog input", 50.0, 640.0, 11.0, FontWeight::Normal);
    item.reading_order = 1;

    let config = TextPipelineConfig::default();
    let markdown = MarkdownOutputConverter::new()
        .convert(&vec![heading, item], &config)
        .unwrap();

    assert!(
        markdown.contains("- Analog input"),
        "Should convert inline bullet to list item: {}",
        markdown
    );
}
/// Inline-bullet spans must become list items even when the very first span
/// of the input is one of them.
#[test]
fn test_first_span_inline_bullet() {
    let mut first = make_span("► First item", 50.0, 660.0, 11.0, FontWeight::Normal);
    first.reading_order = 0;
    let mut second = make_span("► Second item", 50.0, 646.0, 11.0, FontWeight::Normal);
    second.reading_order = 1;
    let spans = vec![first, second];

    let config = TextPipelineConfig::default();
    let markdown = MarkdownOutputConverter::new().convert(&spans, &config).unwrap();

    assert!(
        markdown.contains("- First item"),
        "First-span inline bullet should become list item: {}",
        markdown
    );
    assert!(
        markdown.contains("- Second item"),
        "Second inline bullet should become list item: {}",
        markdown
    );
}
/// Test helper: the default pipeline config with `output.detect_headings`
/// switched on.
fn config_with_headings() -> TextPipelineConfig {
    let mut heading_config = TextPipelineConfig::default();
    heading_config.output.detect_headings = true;
    heading_config
}
/// Ten 8.8pt bullet glyph spans plus ten 11pt bold text spans: the 11pt bold
/// text must not be rendered as a `###` heading.
#[test]
fn test_heading_base_font_excludes_small_spans() {
    let mut spans = Vec::new();

    // Bullet glyphs first; reading order is simply the insertion index.
    for row in 0..10 {
        let baseline = 600.0 - (row as f32) * 14.0;
        let mut glyph = make_span("►", 50.0, baseline, 8.8, FontWeight::Normal);
        glyph.reading_order = spans.len();
        spans.push(glyph);
    }
    // Then the bold text spans on the same baselines, continuing the numbering.
    for row in 0..10 {
        let baseline = 600.0 - (row as f32) * 14.0;
        let mut text = make_span("body text content", 60.0, baseline, 11.0, FontWeight::Bold);
        text.reading_order = spans.len();
        spans.push(text);
    }

    let config = config_with_headings();
    let markdown = MarkdownOutputConverter::new().convert(&spans, &config).unwrap();

    assert!(
        !markdown.contains("### body text content"),
        "11pt bold text should not be heading when base is 11pt: {}",
        markdown
    );
}
/// Test helper: builds an `OrderedTextSpan` with an explicit bounding-box
/// width.
///
/// The bbox is `Rect::new(x, y, width, font_size)`, the font name is `"Test"`,
/// and every remaining `TextSpan` field is set to a neutral value. `order` is
/// passed to `OrderedTextSpan::new` and also written to `reading_order`.
fn make_span_with_width(
    text: &str,
    x: f32,
    y: f32,
    width: f32,
    font_size: f32,
    weight: FontWeight,
    order: usize,
) -> OrderedTextSpan {
    let raw = TextSpan {
        // Content and geometry.
        text: text.to_string(),
        bbox: Rect::new(x, y, width, font_size),
        // Font attributes.
        font_name: "Test".to_string(),
        font_size,
        font_weight: weight,
        is_italic: false,
        is_monospace: false,
        color: Color::black(),
        // Spacing.
        char_spacing: 0.0,
        word_spacing: 0.0,
        horizontal_scaling: 100.0,
        char_widths: vec![],
        // Structure and bookkeeping.
        artifact_type: None,
        mcid: None,
        sequence: 0,
        offset_semantic: false,
        split_boundary_before: false,
        primary_detected: false,
        heading_level: None,
    };

    let mut ordered = OrderedTextSpan::new(raw, order);
    ordered.reading_order = order;
    ordered
}
/// Issue 260: three single-word spans on one baseline, each starting a few
/// units after the previous span's right edge, must be joined with spaces.
#[test]
fn test_issue_260_single_word_bt_et_blocks_get_spaces() {
    // (text, x, width) on the shared baseline y = 500.
    let words: [(&str, f32, f32); 3] = [
        ("The", 72.0, 20.0),
        ("quick", 96.0, 30.0),
        ("brown", 130.0, 33.0),
    ];
    let spans: Vec<OrderedTextSpan> = words
        .iter()
        .enumerate()
        .map(|(order, &(text, x, width))| {
            make_span_with_width(text, x, 500.0, width, 12.0, FontWeight::Normal, order)
        })
        .collect();

    let config = TextPipelineConfig::default();
    let markdown = MarkdownOutputConverter::new().convert(&spans, &config).unwrap();

    assert!(
        markdown.contains("The quick brown"),
        "Single-word BT/ET spans with gaps should have spaces inserted: got {:?}",
        markdown
    );
}
/// Issue 260 counterpart: when the second span starts exactly where the first
/// ends (72 + 18 = 90), the fragments must be joined without a space.
#[test]
fn test_issue_260_no_space_for_tight_spans() {
    let head = make_span_with_width("Hel", 72.0, 500.0, 18.0, 12.0, FontWeight::Normal, 0);
    let tail = make_span_with_width("lo", 90.0, 500.0, 12.0, 12.0, FontWeight::Normal, 1);
    let spans = vec![head, tail];

    let config = TextPipelineConfig::default();
    let markdown = MarkdownOutputConverter::new().convert(&spans, &config).unwrap();

    assert!(
        markdown.contains("Hello"),
        "Tight spans should be merged without space: got {:?}",
        markdown
    );
}
/// With heading detection enabled, a 24pt bold span above 11pt body text must
/// be rendered as a level-1 heading.
///
/// The check is anchored to the start of a line: a plain substring search for
/// `"# BIG HEADING"` would also match `"## BIG HEADING"` or deeper levels and
/// so could not distinguish H1 from H2/H3.
#[test]
fn test_heading_detection_still_works_for_large_fonts() {
    let converter = MarkdownOutputConverter::new();
    let config = config_with_headings();
    let mut heading = make_span("BIG HEADING", 50.0, 100.0, 24.0, FontWeight::Bold);
    heading.reading_order = 0;
    let mut body = make_span("Body text", 50.0, 70.0, 11.0, FontWeight::Normal);
    body.reading_order = 1;
    let spans = vec![heading, body];
    let result = converter.convert(&spans, &config).unwrap();
    // Exactly one leading '#' followed by a space: H1 and nothing deeper.
    let has_h1 = result
        .lines()
        .any(|line| line.trim_start().starts_with("# BIG HEADING"));
    assert!(has_h1, "24pt text should be H1: {}", result);
}
/// Three adjacent bold spans on one baseline must be wrapped in a single pair
/// of `**` markers rather than one pair per word.
#[test]
fn test_bold_consolidation_adjacent_bold_spans() {
    let mut first = make_span_w("ACME", 72.0, 700.0, 55.0, 12.0, FontWeight::Bold);
    first.reading_order = 0;
    let mut second = make_span_w("GLOBAL", 130.0, 700.0, 42.0, 12.0, FontWeight::Bold);
    second.reading_order = 1;
    let mut third = make_span_w("LTD.", 175.0, 700.0, 24.0, 12.0, FontWeight::Bold);
    third.reading_order = 2;
    let spans = vec![first, second, third];

    let config = TextPipelineConfig::default();
    let markdown = MarkdownOutputConverter::new().convert(&spans, &config).unwrap();

    assert!(
        markdown.contains("**ACME GLOBAL LTD.**"),
        "Adjacent bold spans should be consolidated into one bold block, got: {}",
        markdown
    );
    assert!(
        !markdown.contains("**ACME** **GLOBAL**"),
        "Should not wrap each word individually in bold markers, got: {}",
        markdown
    );
}
/// A 4-column header plus one 4-cell data row: every data value and the
/// `Reference` header must appear in the rendered markdown table.
#[test]
fn test_render_table_markdown_all_cells_present() {
    let mut table = Table::new();
    table.has_header = true;
    table.col_count = 4;

    let mut header_row = TableRow::new(true);
    for title in ["Account No.", "Reference", "Tax ID", "Confirmation"] {
        header_row.add_cell(TableCell::new(title.to_string(), true));
    }
    table.add_row(header_row);

    let mut value_row = TableRow::new(false);
    for value in ["20003035", "403852", "123 456 789", "4351966"] {
        value_row.add_cell(TableCell::new(value.to_string(), false));
    }
    table.add_row(value_row);

    let config = crate::pipeline::TextPipelineConfig::default();
    let rendered = MarkdownOutputConverter::new().render_table_markdown(&table, &config);

    assert!(
        rendered.contains("403852"),
        "Reference value '403852' must be present in markdown table: {}",
        rendered
    );
    assert!(rendered.contains("20003035"), "Account No. value must be present: {}", rendered);
    assert!(rendered.contains("123 456 789"), "Tax ID value must be present: {}", rendered);
    assert!(rendered.contains("4351966"), "Confirmation value must be present: {}", rendered);
    assert!(rendered.contains("Reference"), "Header must be present: {}", rendered);
    assert!(rendered.contains("|"), "Must be markdown table format with pipe separators");
}
/// A data row with fewer cells (2) than the header (4) must still render with
/// the same number of `|` separators as the header line.
#[test]
fn test_render_table_markdown_short_row_padded() {
    let mut table = Table::new();
    table.has_header = true;
    table.col_count = 4;

    let mut header_row = TableRow::new(true);
    for title in ["A", "B", "C", "D"] {
        header_row.add_cell(TableCell::new(title.to_string(), true));
    }
    table.add_row(header_row);

    let mut short_row = TableRow::new(false);
    for value in ["1", "2"] {
        short_row.add_cell(TableCell::new(value.to_string(), false));
    }
    table.add_row(short_row);

    let config = crate::pipeline::TextPipelineConfig::default();
    let rendered = MarkdownOutputConverter::new().render_table_markdown(&table, &config);

    let lines: Vec<&str> = rendered.lines().collect();
    assert!(lines.len() >= 3, "Must have header, separator, and data row: {}", rendered);

    // Line 0 is the header, line 1 the separator, line 2 the data row.
    let (header_line, data_line) = (lines[0], lines[2]);
    let header_pipes = header_line.matches('|').count();
    let data_pipes = data_line.matches('|').count();
    assert_eq!(
        header_pipes, data_pipes,
        "Header and data rows must have same number of pipe separators.\nHeader ({}): {}\nData ({}): {}",
        header_pipes, header_line, data_pipes, data_line
    );
}
/// A header with fewer cells (2) than the data row (3) must render with the
/// same number of `|` separators as the data line, and the third data cell
/// must not be dropped.
#[test]
fn test_render_table_markdown_short_header_padded() {
    let mut table = Table::new();
    table.has_header = true;
    table.col_count = 3;

    let mut short_header = TableRow::new(true);
    for title in ["X", "Y"] {
        short_header.add_cell(TableCell::new(title.to_string(), true));
    }
    table.add_row(short_header);

    let mut full_row = TableRow::new(false);
    for value in ["1", "2", "3"] {
        full_row.add_cell(TableCell::new(value.to_string(), false));
    }
    table.add_row(full_row);

    let config = crate::pipeline::TextPipelineConfig::default();
    let rendered = MarkdownOutputConverter::new().render_table_markdown(&table, &config);

    let lines: Vec<&str> = rendered.lines().collect();
    assert!(lines.len() >= 3, "Must have header, separator, and data row: {}", rendered);

    // Line 0 is the header, line 1 the separator, line 2 the data row.
    let (header_line, data_line) = (lines[0], lines[2]);
    let header_pipes = header_line.matches('|').count();
    let data_pipes = data_line.matches('|').count();
    assert_eq!(
        header_pipes, data_pipes,
        "Header and data rows must have same number of pipe separators.\nHeader ({}): {}\nData ({}): {}",
        header_pipes, header_line, data_pipes, data_line
    );
    assert!(rendered.contains("| 3 |"), "Third cell in data row must be present: {}", rendered);
}
/// A label span and a value span in different groups, offset both
/// horizontally and by 15 units vertically, are expected to end up on one
/// output line as `Grand Total $750.00`.
#[test]
fn test_key_value_pair_merging_in_markdown() {
    let mut label = make_span("Grand Total", 50.0, 200.0, 12.0, FontWeight::Normal);
    label.reading_order = 0;
    label.group_id = Some(0);

    let mut amount = make_span("$750.00", 300.0, 185.0, 12.0, FontWeight::Normal);
    amount.reading_order = 1;
    amount.group_id = Some(1);

    let config = TextPipelineConfig::default();
    let markdown = MarkdownOutputConverter::new()
        .convert(&vec![label, amount], &config)
        .unwrap();

    assert!(
        markdown.contains("Grand Total $750.00"),
        "Should merge label with value on same line: {:?}",
        markdown,
    );
}
/// A bold Arabic span must be emitted without any `**` markers, and its text
/// must be preserved.
#[test]
fn test_arabic_bold_span_no_spurious_bold_markers() {
    let arabic_bold = make_span("مرحبا", 0.0, 100.0, 12.0, FontWeight::Bold);

    let config = TextPipelineConfig::default();
    let markdown = MarkdownOutputConverter::new()
        .convert(&[arabic_bold], &config)
        .unwrap();

    assert!(
        !markdown.contains("**"),
        "spurious bold markers found in Arabic output: {:?}",
        markdown
    );
    assert!(markdown.contains("مرحبا"), "Arabic text lost in output: {:?}", markdown);
}
/// `looks_rtl` must report Arabic, Arabic Presentation Forms-B, and Hebrew
/// text as RTL, and ASCII text and the empty string as not RTL.
#[test]
fn test_rtl_detection_arabic_and_ascii() {
    use crate::text::bidi::looks_rtl;

    // Right-to-left scripts.
    assert!(looks_rtl("مرحبا"), "Arabic U+0600-U+06FF must be RTL");
    assert!(
        looks_rtl("\u{FE80}"),
        "Arabic Presentation Forms-B U+FE80 must be RTL"
    );
    assert!(looks_rtl("שלום"), "Hebrew U+0590-U+05FF must be RTL");

    // Left-to-right and empty input.
    assert!(!looks_rtl("hello world"), "ASCII must not be RTL");
    assert!(!looks_rtl(""), "empty string must not be RTL");
}
/// `strip_inline_emphasis_in_rtl` must drop `**` / `*` markers around Arabic
/// text, keep emphasis on LTR text, and leave marker-free input unchanged.
#[test]
fn test_strip_inline_emphasis_removes_rtl_markers() {
    // Bold markers in the middle of an Arabic word.
    let bold_stripped = strip_inline_emphasis_in_rtl("**مرح**با");
    assert!(
        !bold_stripped.contains("**"),
        "bold markers must be stripped from Arabic: {:?}",
        bold_stripped
    );
    assert!(
        bold_stripped.contains("مرح") && bold_stripped.contains("با"),
        "Arabic chars must survive stripping: {:?}",
        bold_stripped
    );

    // Italic markers around an Arabic word.
    let italic_stripped = strip_inline_emphasis_in_rtl("*مرحبا*");
    assert!(
        !italic_stripped.contains('*'),
        "italic markers must be stripped from Arabic: {:?}",
        italic_stripped
    );
    assert!(italic_stripped.contains("مرحبا"), "Arabic text lost: {:?}", italic_stripped);

    // LTR emphasis is left alone.
    let ltr_kept = strip_inline_emphasis_in_rtl("*Hello*");
    assert_eq!(ltr_kept, "*Hello*", "LTR emphasis must be preserved: {:?}", ltr_kept);

    // No asterisks at all: output equals input.
    let untouched = strip_inline_emphasis_in_rtl("مرحبا");
    assert_eq!(untouched, "مرحبا", "no-asterisk path must be identity: {:?}", untouched);
}
/// Two Arabic spans in separate blocks: both words must survive conversion
/// and the output must still end with a newline.
#[test]
fn test_rtl_cleanup_preserves_trailing_newline() {
    let mut upper = make_span("مرحبا", 0.0, 200.0, 12.0, FontWeight::Normal);
    upper.block_id = Some(1);
    let mut lower = make_span("عالم", 0.0, 100.0, 12.0, FontWeight::Normal);
    lower.block_id = Some(2);

    let config = TextPipelineConfig::default();
    let markdown = MarkdownOutputConverter::new()
        .convert(&[upper, lower], &config)
        .unwrap();

    assert!(markdown.contains("مرحبا"), "first Arabic word lost: {:?}", markdown);
    assert!(markdown.contains("عالم"), "second Arabic word lost: {:?}", markdown);
    assert!(
        markdown.ends_with('\n'),
        "trailing newline was dropped by RTL cleanup: {:?}",
        markdown
    );
}
/// Issue 10: lines that begin with `|` but are not part of a table (there is
/// no separator row in the input) must have that pipe escaped as `\|`.
#[test]
fn test_issue10_escape_stray_leading_pipes_basic() {
    let escaped =
        escape_stray_leading_pipes("| Finished Goods\n| Internal Use Only\nPage 1 of 12\n");

    assert!(
        escaped.contains("\\| Finished Goods"),
        "stray pipe must be escaped, got:\n{}",
        escaped
    );
    assert!(
        escaped.contains("\\| Internal Use Only"),
        "second stray pipe must be escaped, got:\n{}",
        escaped
    );
}
/// Issue 10 counterpart: header, separator, and data rows of a real table
/// must pass through without any `\|` escaping.
#[test]
fn test_issue10_preserves_real_tables() {
    let real_table = "| Col A | Col B |\n|---|---|\n| 1 | 2 |\n";
    let processed = escape_stray_leading_pipes(real_table);

    assert!(
        !processed.contains("\\|"),
        "real table rows must not be escaped, got:\n{}",
        processed
    );
}
/// Regression: a 6-column table containing several empty cells must still be
/// emitted as a markdown table (a separator row or the `Indonesia` header
/// cell has to be present in the output).
#[test]
fn test_regression_real_sparse_table_not_flattened() {
    let mut table = Table::new();

    let mut header_row = TableRow::new(true);
    for title in ["", "Indonesia", "Germany", "Austria", "France", "Vatican"] {
        header_row.add_cell(TableCell::new(title.to_string(), true));
    }
    table.add_row(header_row);

    // Each body row: the row label first, then one value per country.
    let body = [
        ["Continent", "Asia", "", "Europe", "", ""],
        ["Capital", "Jakarta", "Berlin", "Vienna", "Paris", "Vatican City"],
    ];
    for cells in body {
        let mut row = TableRow::new(false);
        for text in cells {
            row.add_cell(TableCell::new(text.to_string(), false));
        }
        table.add_row(row);
    }

    let config = TextPipelineConfig::default();
    let output = MarkdownOutputConverter::new()
        .convert_with_tables(&[], &[table], &config)
        .unwrap();

    let still_a_table = output.contains("|---|") || output.contains("| Indonesia |");
    assert!(
        still_a_table,
        "real sparse table must survive as a table, got:\n{}",
        output
    );
}
/// Regression: three spans with identical text at different y positions must
/// all appear in the output; none may be dropped as a duplicate.
#[test]
fn test_regression_repeated_identical_paragraphs_preserved() {
    let baselines: [f32; 3] = [100.0, 80.0, 60.0];
    let spans: Vec<_> = baselines
        .iter()
        .map(|&y| make_span("Radio button, unselected", 0.0, y, 12.0, FontWeight::Normal))
        .collect();

    let config = TextPipelineConfig::default();
    let markdown = MarkdownOutputConverter::new().convert(&spans, &config).unwrap();

    let survivors = markdown.matches("Radio button, unselected").count();
    assert_eq!(
        survivors, 3,
        "three distinct identical-label widgets must all survive, got {}:\n{}",
        survivors, markdown
    );
}
/// Three spans each tagged `StructRole::Heading(1)` must yield three separate
/// `# ` lines; tagged headings are not to be fused together.
#[test]
fn test_tagged_distinct_headings_are_not_merged() {
    // An 18pt bold span carrying an explicit H1 structure role.
    let tagged_h1 = |title: &str, y: f32| {
        let mut span = make_span(title, 0.0, y, 18.0, FontWeight::Bold);
        span.struct_role = Some(StructRole::Heading(1));
        span
    };
    let spans = vec![
        tagged_h1("Alpha", 100.0),
        tagged_h1("Beta", 60.0),
        tagged_h1("Gamma", 20.0),
    ];

    let config = TextPipelineConfig::default();
    let markdown = MarkdownOutputConverter::new().convert(&spans, &config).unwrap();

    let h1_count = markdown
        .lines()
        .filter(|line| line.starts_with("# "))
        .count();
    assert_eq!(
        h1_count, 3,
        "tagged distinct H1 elements must NOT be merged (spec §14.8.4.3.2), got:\n{}",
        markdown
    );
}
/// Issue 1: a run of three one-word H1 headings must be merged into a single
/// H1 whose words are joined by spaces.
#[test]
fn test_issue1_merge_word_per_heading_runs() {
    let merged =
        merge_consecutive_same_level_headings("# Quarterly\n\n# Inventory\n\n# Review\n");

    assert_eq!(
        merged.trim(),
        "# Quarterly Inventory Review",
        "three same-level short H1s must merge, got:\n{}",
        merged
    );
}
/// Issue 4: an H2 ending in a comma followed directly by another H2 is a
/// wrapped heading and must be fused into one line.
#[test]
fn test_issue4_merge_wrapped_heading_trailing_comma() {
    let wrapped = "## Despite seasonal slowdown,\n## warehouse maintained throughput\n";
    let merged = merge_consecutive_same_level_headings(wrapped);

    assert!(
        merged.contains("## Despite seasonal slowdown, warehouse maintained throughput"),
        "wrapped heading with trailing comma must fuse, got:\n{}",
        merged
    );
}
/// Issue 4: an H1 followed by an H1 that opens with the connector word "and"
/// must be fused into one heading.
#[test]
fn test_issue4_merge_wrapped_heading_connector_opener() {
    let wrapped = "# Architecture\n# and Implementation\n";
    let merged = merge_consecutive_same_level_headings(wrapped);

    assert!(
        merged.contains("# Architecture and Implementation"),
        "wrapped heading with connector opener must fuse, got:\n{}",
        merged
    );
}
/// Issue 4 counterpart: two adjacent H1 lines with no wrap signal (no
/// trailing comma, no connector opener) must stay as two headings.
#[test]
fn test_issue4_does_not_fuse_ambiguous_two_headings() {
    let merged = merge_consecutive_same_level_headings("# First Heading\n# Second Heading\n");

    let h1_lines = merged
        .lines()
        .filter(|line| line.starts_with("# "))
        .count();
    assert_eq!(
        h1_lines, 2,
        "ambiguous 2-fragment same-level headings must NOT fuse, got:\n{}",
        merged
    );
}
/// Issue 1 counterpart: two long H1 headings separated by a blank line must
/// both survive verbatim.
#[test]
fn test_issue1_does_not_fuse_long_distinct_headings() {
    let first = "# Annual Sales Performance Across Every Region in Detail";
    let second = "# Q1 Highlights and Outlook for the Year";
    let document = format!("{}\n\n{}\n", first, second);

    let merged = merge_consecutive_same_level_headings(&document);

    let both_present = merged.contains(first) && merged.contains(second);
    assert!(
        both_present,
        "two long distinct headings must remain separate, got:\n{}",
        merged
    );
}
/// Issue 3: a "table" whose cells are single words of running prose must lose
/// its separator row and have its header words flattened to a sentence.
#[test]
fn test_issue3_degenerate_table_collapses_to_paragraph() {
    let word_per_cell_table = concat!(
        "| Q1 | Warehouse | throughput | increased | 15% |\n",
        "|---|---|---|---|---|\n",
        "| quarter | over | quarter | to | 23,500 |\n",
        "| units | per | day | strong | demand |\n",
    );

    let simplified = simplify_degenerate_tables(word_per_cell_table);

    assert!(
        !simplified.contains("|---|"),
        "separator row should be gone, got:\n{}",
        simplified
    );
    assert!(
        simplified.contains("Q1 Warehouse throughput increased 15%"),
        "header words flattened to prose, got:\n{}",
        simplified
    );
}
/// Issue 3 counterpart: a genuine table with multi-word cells must keep its
/// separator row and its cells.
#[test]
fn test_issue3_preserves_legitimate_multi_word_tables() {
    let revenue_table = concat!(
        "| Region | Revenue Q1 | Revenue Q2 | Revenue Q3 | Revenue Q4 |\n",
        "|---|---|---|---|---|\n",
        "| North America Sales | 1.2 M | 1.5 M | 1.7 M | 1.9 M |\n",
        "| Europe Sales Total | 0.8 M | 0.9 M | 1.1 M | 1.3 M |\n",
    );

    let simplified = simplify_degenerate_tables(revenue_table);

    assert!(
        simplified.contains("|---|"),
        "real table must keep separator, got:\n{}",
        simplified
    );
    assert!(
        simplified.contains("| North America Sales |"),
        "real table cells must remain, got:\n{}",
        simplified
    );
}
/// Issue 9: a standalone `Page 1 of 12` span between two prose spans must be
/// kept in the output along with the prose.
#[test]
fn test_issue9_preserves_page_number_shaped_lines() {
    let before = make_span("Some text.", 0.0, 100.0, 12.0, FontWeight::Normal);
    let page_marker = make_span("Page 1 of 12", 0.0, 80.0, 10.0, FontWeight::Normal);
    let after = make_span("More text.", 0.0, 60.0, 12.0, FontWeight::Normal);
    let spans = vec![before, page_marker, after];

    let config = TextPipelineConfig::default();
    let markdown = MarkdownOutputConverter::new().convert(&spans, &config).unwrap();

    assert!(markdown.contains("Page 1 of 12"), "page-N text must survive, got:\n{}", markdown);
    assert!(markdown.contains("Some text."), "prose must survive, got:\n{}", markdown);
    assert!(markdown.contains("More text."), "prose must survive, got:\n{}", markdown);
}
/// Issue 9: a `Page N` reference inside a sentence must be left in place.
#[test]
fn test_issue9_preserves_page_in_prose() {
    let sentence = "See Page 3 for details about the change.";
    let spans = vec![make_span(sentence, 0.0, 100.0, 12.0, FontWeight::Normal)];

    let config = TextPipelineConfig::default();
    let markdown = MarkdownOutputConverter::new().convert(&spans, &config).unwrap();

    assert!(
        markdown.contains("See Page 3 for details"),
        "in-prose 'Page N' must not be dropped, got:\n{}",
        markdown
    );
}
/// Issue 13: spans that start with U+274D or U+25E6 must keep the text that
/// follows the glyph.
#[test]
fn test_issue13_preserves_bullet_text_content() {
    let shadowed_circle_item =
        make_span("\u{274D} First item", 0.0, 100.0, 12.0, FontWeight::Normal);
    let white_bullet_item =
        make_span("\u{25E6} Second item", 0.0, 80.0, 12.0, FontWeight::Normal);
    let spans = vec![shadowed_circle_item, white_bullet_item];

    let config = TextPipelineConfig::default();
    let markdown = MarkdownOutputConverter::new().convert(&spans, &config).unwrap();

    assert!(markdown.contains("First item"), "list-item text must survive: {}", markdown);
    assert!(markdown.contains("Second item"), "list-item text must survive: {}", markdown);
}
/// Issue 13: U+274D appearing in the middle of a sentence is ordinary content
/// and must be present in the output.
#[test]
fn test_issue13_preserves_mid_prose_bullet_codepoint() {
    let sentence = "The symbol \u{274D} indicates a shadow circle.";
    let spans = vec![make_span(sentence, 0.0, 100.0, 12.0, FontWeight::Normal)];

    let config = TextPipelineConfig::default();
    let markdown = MarkdownOutputConverter::new().convert(&spans, &config).unwrap();

    assert!(
        markdown.contains("\u{274D}"),
        "mid-prose U+274D must survive verbatim, got:\n{}",
        markdown
    );
}
/// Issue 11: a run of four numeric-looking H1 headings must be rewritten as
/// `- ` list items, with the `# ` form removed.
#[test]
fn test_issue11_collapses_numeric_heading_run() {
    let collapsed = collapse_numeric_heading_runs("# 23,500\n\n# 99.2%\n\n# 87%\n\n# 4.2 days\n");

    let expected_items = ["- 23,500", "- 99.2%", "- 87%", "- 4.2 days"];
    for item in expected_items {
        assert!(collapsed.contains(item), "expected `{}` in output, got:\n{}", item, collapsed);
    }
    assert!(!collapsed.contains("# 23,500"), "H1 form must be gone, got:\n{}", collapsed);
}
/// Issue 11 counterpart: a lone heading that merely starts with a number
/// (`# 2024 Annual Report`) must come back unchanged.
#[test]
fn test_issue11_preserves_single_numeric_heading() {
    let lone_heading = "# 2024 Annual Report\n";
    let processed = collapse_numeric_heading_runs(lone_heading);

    assert_eq!(
        processed, lone_heading,
        "single non-numeric heading must be untouched: {}",
        processed
    );
}
/// Issue 12: a bold CamelCase word split by stray spaces (`S alesF orce`)
/// must be rejoined into `**SalesForce**`.
#[test]
fn test_issue12_coalesces_inline_camelcase_bold() {
    let fragmented = "**S alesF orce** is great.\n";
    let coalesced = coalesce_camelcase_bold_fragments(fragmented);

    assert!(
        coalesced.contains("**SalesForce**"),
        "inline CamelCase bold must coalesce, got:\n{}",
        coalesced
    );
}
/// Issue 12 counterpart: ordinary multi-word bold phrases must keep their
/// spaces.
#[test]
fn test_issue12_preserves_normal_multi_word_bold() {
    let ordinary_bold = "**John Smith** wrote.\n**USB Type C** cable.\n";
    let processed = coalesce_camelcase_bold_fragments(ordinary_bold);

    assert!(
        processed.contains("**John Smith**"),
        "two-word person bold must not be merged, got:\n{}",
        processed
    );
    assert!(
        processed.contains("**USB Type C**"),
        "three-word product bold must not be merged, got:\n{}",
        processed
    );
}
/// Issue 12: when the closing `**` lands mid-word (`**N orthW** ind`), any of
/// three coalesced spellings is accepted.
#[test]
fn test_issue12_bound_camelcase_bold_coalesces() {
    let coalesced = coalesce_camelcase_bold_fragments("**N orthW** ind");

    let accepted_forms = ["**NorthWind**", "**NorthW**ind", "**N**orthWind"];
    let acceptable = accepted_forms.iter().any(|form| coalesced.contains(*form));

    assert!(
        acceptable,
        "bound CamelCase bold (closing ** mid-word) should coalesce, got:\n{}",
        coalesced
    );
}
/// Issue 8: a table cell whose `spans` list contains a bold `TextSpan` must
/// be rendered with `**` markers around the cell text.
#[test]
fn test_issue8_table_cell_renders_bold_marker() {
    let bold_span = TextSpan {
        artifact_type: None,
        text: "Critical".to_string(),
        bbox: Rect::new(0.0, 0.0, 50.0, 12.0),
        font_name: "Test-Bold".to_string(),
        font_size: 12.0,
        font_weight: FontWeight::Bold,
        is_italic: false,
        is_monospace: false,
        color: Color::black(),
        mcid: None,
        sequence: 0,
        offset_semantic: false,
        split_boundary_before: false,
        char_spacing: 0.0,
        word_spacing: 0.0,
        horizontal_scaling: 100.0,
        primary_detected: false,
        char_widths: vec![],
        heading_level: None,
    };
    let mut cell = TableCell::new("Critical".to_string(), false);
    // The span is not used again, so move it into the cell instead of cloning.
    cell.spans.push(bold_span);
    let mut row = TableRow::new(false);
    row.add_cell(cell);
    let mut table = Table::new();
    table.add_row(row);
    let result = MarkdownOutputConverter::new()
        .render_table_markdown(&table, &TextPipelineConfig::default());
    assert!(
        result.contains("**Critical**"),
        "bold marker must appear in rendered cell, got:\n{}",
        result
    );
}
/// Issue 2: two identical paragraphs in a row must collapse to one, while the
/// paragraph that follows is kept.
#[test]
fn test_issue2_dedup_consecutive_duplicate_paragraphs() {
    let doubled = "Revenue grew by 15%.\n\nRevenue grew by 15%.\n\nNext paragraph here.\n";
    let deduped = dedup_consecutive_paragraphs(doubled);

    let occurrences = deduped.matches("Revenue grew by 15%.").count();
    assert_eq!(
        occurrences, 1,
        "exact-duplicate consecutive paragraph must collapse, got:\n{}",
        deduped
    );
    assert!(
        deduped.contains("Next paragraph here."),
        "subsequent paragraph must survive, got:\n{}",
        deduped
    );
}
/// Issue 2 counterpart: a paragraph repeated with other content in between
/// must appear twice in the output.
#[test]
fn test_issue2_preserves_nonconsecutive_repeats() {
    let deduped =
        dedup_consecutive_paragraphs("Important note.\n\nOther content.\n\nImportant note.\n");

    let occurrences = deduped.matches("Important note.").count();
    assert_eq!(occurrences, 2, "non-consecutive repeat must survive, got:\n{}", deduped);
}
/// Issue 5: a header row whose four cells all read `Q1'25` must be reduced to
/// a single occurrence, leaving the data row intact.
#[test]
fn test_issue5_dedups_identical_header_cells() {
    let repeated_header =
        "| Q1'25 | Q1'25 | Q1'25 | Q1'25 |\n|---|---|---|---|\n| Zone A | | | |\n";
    let deduped = dedup_identical_header_cells(repeated_header);

    let q1_count = deduped.matches("Q1'25").count();
    assert_eq!(
        q1_count, 1,
        "all-identical header cells must dedup to one, got {} in:\n{}",
        q1_count, deduped
    );
    assert!(deduped.contains("Zone A"), "data row must remain intact, got:\n{}", deduped);
}
/// Issue 5 counterpart: four distinct header cells must all remain present.
#[test]
fn test_issue5_preserves_real_distinct_headers() {
    let distinct_header = "| North | South | East | West |\n|---|---|---|---|\n| 1 | 2 | 3 | 4 |\n";
    let processed = dedup_identical_header_cells(distinct_header);

    let columns = ["North", "South", "East", "West"];
    for column in columns {
        assert!(
            processed.contains(column),
            "distinct header `{}` must survive: {}",
            column,
            processed
        );
    }
}
/// Issue 7: spans from two blocks (left column = block 1, right column =
/// block 2) are supplied row by row; the output must not place `Left A.` and
/// `Right A.` next to each other on one line.
#[test]
fn test_issue7_no_column_interleaving() {
    // A normal-weight 12pt span assigned to the given block.
    let in_block = |text: &str, x: f32, y: f32, block: u32| {
        let mut span = make_span(text, x, y, 12.0, FontWeight::Normal);
        span.block_id = Some(block);
        span
    };
    let spans = vec![
        in_block("Left A.", 0.0, 100.0, 1),
        in_block("Right A.", 300.0, 100.0, 2),
        in_block("Left B.", 0.0, 88.0, 1),
        in_block("Right B.", 300.0, 88.0, 2),
    ];

    let config = TextPipelineConfig::default();
    let markdown = MarkdownOutputConverter::new().convert(&spans, &config).unwrap();

    let left_column_present = markdown.contains("Left A.") && markdown.contains("Left B.");
    assert!(
        left_column_present,
        "left column must surface, got:\n{}",
        markdown
    );
    assert!(
        !markdown.contains("Left A. Right A."),
        "columns must not interleave at the line level, got:\n{}",
        markdown
    );
}
/// A Hebrew run embedded in an English sentence must be wrapped with
/// U+2067 (RLI) ... U+2069 (PDI), and no U+2066 (LRI) may be emitted.
#[test]
fn markdown_wraps_rtl_run_with_rli_pdi() {
    let mixed = make_span("The article שלום עולם is greetings.", 0.0, 100.0, 12.0, FontWeight::Normal);

    let config = TextPipelineConfig::default();
    let markdown = MarkdownOutputConverter::new().convert(&[mixed], &config).unwrap();

    assert!(
        markdown.contains('\u{2067}'),
        "expected U+2067 (RLI) in markdown output, got:\n{:?}",
        markdown
    );
    assert!(
        markdown.contains('\u{2069}'),
        "expected U+2069 (PDI) in markdown output, got:\n{:?}",
        markdown
    );
    assert!(
        !markdown.contains('\u{2066}'),
        "unexpected U+2066 (LRI) in LTR-block output:\n{:?}",
        markdown
    );
}
/// A Latin token embedded in a Hebrew sentence must be wrapped with
/// U+2066 (LRI) ... U+2069 (PDI), and no U+2067 (RLI) may be emitted.
#[test]
fn markdown_wraps_ltr_run_inside_rtl_block_with_lri_pdi() {
    let mixed = make_span("הספר Microsoft חדש", 0.0, 100.0, 12.0, FontWeight::Normal);

    let config = TextPipelineConfig::default();
    let markdown = MarkdownOutputConverter::new().convert(&[mixed], &config).unwrap();

    assert!(
        markdown.contains('\u{2066}'),
        "expected U+2066 (LRI) wrapping the embedded LTR token, got:\n{:?}",
        markdown
    );
    assert!(
        markdown.contains('\u{2069}'),
        "expected U+2069 (PDI) closing the LRI, got:\n{:?}",
        markdown
    );
    assert!(
        !markdown.contains('\u{2067}'),
        "unexpected U+2067 (RLI) in RTL-block output:\n{:?}",
        markdown
    );
}
/// Purely left-to-right input must not gain any directional isolate
/// characters (U+2066 through U+2069).
#[test]
fn markdown_leaves_pure_ltr_unchanged() {
    let first = make_span("The first paragraph.", 0.0, 100.0, 12.0, FontWeight::Normal);
    let second = make_span("A second sentence.", 0.0, 84.0, 12.0, FontWeight::Normal);
    let third = make_span("Numbers 123 and (parens) too.", 0.0, 68.0, 12.0, FontWeight::Normal);
    let spans = vec![first, second, third];

    let config = TextPipelineConfig::default();
    let markdown = MarkdownOutputConverter::new().convert(&spans, &config).unwrap();

    let isolates = ['\u{2066}', '\u{2067}', '\u{2068}', '\u{2069}'];
    for isolate in isolates {
        assert!(
            !markdown.contains(isolate),
            "pure-LTR output must not contain U+{:04X}, got:\n{:?}",
            isolate as u32,
            markdown
        );
    }
}
}