use crate::error::Result;
use crate::layout::FontWeight;
use crate::pipeline::{OrderedTextSpan, TextPipelineConfig};
use crate::structure::table_extractor::ExtractedTable;
use crate::text::HyphenationHandler;
use lazy_static::lazy_static;
use regex::Regex;
use super::OutputConverter;
// Regexes shared by `MarkdownOutputConverter::linkify`. `lazy_static!` compiles
// each pattern on first use; `unwrap()` would panic there only if a literal
// pattern below were invalid.
lazy_static! {
// Matches `http://` / `https://` URLs. The URL body excludes whitespace and
// `<`, `>`, `[`, `]`; the final character additionally excludes `.,!?;:` so
// sentence punctuation that follows a URL is not captured as part of it.
static ref RE_URL: Regex = Regex::new(r"(https?://[^\s<>\[\]]*[^\s<>\[\].,!?;:])").unwrap();
// Matches `local@domain.tld` style addresses whose final label is at least two
// ASCII letters.
static ref RE_EMAIL: Regex = Regex::new(r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})").unwrap();
}
/// Converts ordered text spans (and optionally extracted tables) to Markdown.
pub struct MarkdownOutputConverter {
// Multiplier applied to the larger font size of two consecutive spans; a
// vertical gap above that product is treated as a paragraph break
// (see `is_paragraph_break`).
paragraph_gap_ratio: f32,
}
impl MarkdownOutputConverter {
/// Creates a converter that uses the default paragraph gap ratio of `1.5`.
pub fn new() -> Self {
    const DEFAULT_PARAGRAPH_GAP_RATIO: f32 = 1.5;
    Self {
        paragraph_gap_ratio: DEFAULT_PARAGRAPH_GAP_RATIO,
    }
}
/// Creates a converter with a caller-supplied paragraph gap ratio.
///
/// `ratio` is stored unchanged and later multiplied by the line height when
/// deciding whether two spans are separated by a paragraph break.
pub fn with_paragraph_gap(ratio: f32) -> Self {
    let paragraph_gap_ratio = ratio;
    Self { paragraph_gap_ratio }
}
/// Decides whether `span` should be wrapped in Markdown bold markers.
///
/// Only spans whose font weight is SemiBold, Bold, ExtraBold or Black
/// qualify. In `Aggressive` mode every such span is bold; in `Conservative`
/// mode the span must also contain at least one non-whitespace character.
fn is_bold(&self, span: &OrderedTextSpan, config: &TextPipelineConfig) -> bool {
    use crate::pipeline::config::BoldMarkerBehavior;

    let heavy_weight = matches!(
        span.span.font_weight,
        FontWeight::Bold | FontWeight::Black | FontWeight::ExtraBold | FontWeight::SemiBold
    );
    if !heavy_weight {
        return false;
    }

    match config.output.bold_marker_behavior {
        BoldMarkerBehavior::Aggressive => true,
        BoldMarkerBehavior::Conservative => !span.span.text.chars().all(|c| c.is_whitespace()),
    }
}
/// Returns `true` when `span` is flagged italic and has visible content.
///
/// Whitespace-only italic spans are rejected so that no empty `**` pair is
/// emitted for them.
fn is_italic(&self, span: &OrderedTextSpan) -> bool {
    if !span.span.is_italic {
        return false;
    }
    !span.span.text.chars().all(|c| c.is_whitespace())
}
fn linkify(&self, text: &str) -> String {
let might_have_url = text.contains("://") || text.contains("www.");
let might_have_email = text.contains('@');
if !might_have_url && !might_have_email {
return text.to_string();
}
let mut result = if might_have_url {
RE_URL
.replace_all(text, |caps: ®ex::Captures| {
let url = &caps[0];
format!("[{}]({})", url, url)
})
.to_string()
} else {
text.to_string()
};
if might_have_email {
result = RE_EMAIL
.replace_all(&result, |caps: ®ex::Captures| {
let email = &caps[0];
format!("[{}](mailto:{})", email, email)
})
.to_string();
}
result
}
/// Collapses every run of whitespace in `text` to a single space and drops
/// leading and trailing whitespace.
fn normalize_whitespace(&self, text: &str) -> String {
    let mut collapsed = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        // Words are never empty, so an empty buffer means "first word".
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
/// Returns `true` when the vertical distance between two consecutive spans
/// exceeds `paragraph_gap_ratio` times the larger of their font sizes.
fn is_paragraph_break(&self, current: &OrderedTextSpan, previous: &OrderedTextSpan) -> bool {
    let (cur, prev) = (&current.span, &previous.span);
    let vertical_gap = (prev.bbox.y - cur.bbox.y).abs();
    let threshold = cur.font_size.max(prev.font_size) * self.paragraph_gap_ratio;
    vertical_gap > threshold
}
/// Returns `true` when `text`, ignoring surrounding whitespace, consists of
/// exactly one recognised bullet glyph and nothing else.
fn is_bullet_span(text: &str) -> bool {
    const BULLETS: [char; 11] = ['►', '•', '▪', '▸', '‣', '◦', '●', '■', '◆', '○', '□'];

    let mut glyphs = text.trim().chars();
    match (glyphs.next(), glyphs.next()) {
        // Exactly one character after trimming: check it against the set.
        (Some(only), None) => BULLETS.contains(&only),
        _ => false,
    }
}
/// Returns `true` when the first non-whitespace character of `text` is one of
/// the recognised bullet glyphs.
fn starts_with_bullet(text: &str) -> bool {
    match text.trim_start().chars().next() {
        Some(first) => matches!(
            first,
            '►' | '•' | '▪' | '▸' | '‣' | '◦' | '●' | '■' | '◆' | '○' | '□'
        ),
        None => false,
    }
}
/// Removes a leading bullet glyph (and the whitespace around it) from `text`.
///
/// When `text` does not start with a bullet it is returned untouched,
/// including any leading whitespace.
fn strip_bullet(text: &str) -> &str {
    let candidate = text.trim_start();
    if !Self::starts_with_bullet(candidate) {
        return text;
    }
    // Skip exactly one character (the bullet), respecting its UTF-8 width.
    let marker_len = candidate.chars().next().map_or(0, |c| c.len_utf8());
    candidate[marker_len..].trim_start()
}
/// Maps a span's absolute font size to a Markdown heading level.
///
/// Returns `Some(1)` for sizes of 24 and above, `Some(2)` for 18 and above,
/// `Some(3)` for 16 and above, and `None` otherwise.
///
/// Spans whose trimmed text is shorter than 2 or longer than 200 characters
/// are never headings. The length is counted in characters, not bytes, so a
/// lone multi-byte symbol is not mistaken for a two-character heading and
/// non-ASCII headings are not cut off at a fraction of the intended limit.
fn heading_level_absolute(&self, span: &OrderedTextSpan) -> Option<u8> {
    let text_len = span.span.text.trim().chars().count();
    if !(2..=200).contains(&text_len) {
        return None;
    }

    let size = span.span.font_size;
    if size >= 24.0 {
        Some(1)
    } else if size >= 18.0 {
        Some(2)
    } else if size >= 16.0 {
        Some(3)
    } else {
        None
    }
}
/// Maps a span's font size, relative to `base_font_size`, to a heading level.
///
/// Ratios of 2.0 and above give `Some(1)`, 1.5 and above `Some(2)`, 1.3 and
/// above `Some(3)`. A span with a heavy font weight (SemiBold, Bold,
/// ExtraBold, Black) also yields `Some(3)` from a ratio of 1.15.
///
/// Returns `None` when the trimmed text is shorter than 2 or longer than 200
/// characters (counted in characters, not bytes), or when `base_font_size`
/// is not a positive number, since the ratio would be meaningless.
fn heading_level_ratio(&self, span: &OrderedTextSpan, base_font_size: f32) -> Option<u8> {
    let text_len = span.span.text.trim().chars().count();
    if !(2..=200).contains(&text_len) {
        return None;
    }
    // Guard the division: a zero base would turn every span into an H1.
    if base_font_size.is_nan() || base_font_size <= 0.0 {
        return None;
    }

    let size_ratio = span.span.font_size / base_font_size;
    let is_bold = matches!(
        span.span.font_weight,
        FontWeight::Bold | FontWeight::Black | FontWeight::ExtraBold | FontWeight::SemiBold
    );

    if size_ratio >= 2.0 {
        Some(1)
    } else if size_ratio >= 1.5 {
        Some(2)
    } else if size_ratio >= 1.3 {
        Some(3)
    } else if is_bold && size_ratio >= 1.15 {
        Some(3)
    } else {
        None
    }
}
/// Renders `table` as a pipe-delimited Markdown table.
///
/// * An empty table renders as an empty string.
/// * Every row is padded with empty cells up to the widest row, where a
///   cell's width is its `colspan` (treated as at least 1). A cell spanning
///   several columns is followed by empty cells for the extra columns.
/// * `|` inside cell text is escaped as `\|`; line breaks (`\n` and `\r`)
///   are replaced by spaces so a cell cannot split its row.
/// * The `---|` delimiter row is written after the last leading header row
///   when `has_header` is set, otherwise after the first row. It is always
///   written after at least the first row, so the output has a delimiter
///   even if the table claims a header but its first row is not flagged as
///   one.
fn render_table_markdown(table: &ExtractedTable) -> String {
    if table.rows.is_empty() {
        return String::new();
    }

    // 1-based index of the row after which the delimiter row is emitted.
    let header_end = if table.has_header {
        table
            .rows
            .iter()
            .position(|r| !r.is_header)
            .unwrap_or(1)
            // `position` yields 0 when the first row is not a header row;
            // without the clamp no delimiter would ever be written.
            .max(1)
    } else {
        1
    };

    let max_cols = table
        .rows
        .iter()
        .map(|row| {
            row.cells
                .iter()
                .map(|c| c.colspan.max(1) as usize)
                .sum::<usize>()
        })
        .max()
        .unwrap_or(0);

    let mut output = String::new();
    for (row_idx, row) in table.rows.iter().enumerate() {
        output.push('|');
        let mut cols_written: usize = 0;
        for cell in &row.cells {
            let text = cell
                .text
                .replace('|', "\\|")
                .replace(|c: char| c == '\n' || c == '\r', " ");
            output.push(' ');
            output.push_str(text.trim());
            output.push(' ');

            // Emit empty cells for the additional columns a cell spans.
            let span = cell.colspan.max(1) as usize;
            for _ in 1..span {
                output.push_str("| ");
            }
            output.push('|');
            cols_written += span;
        }
        // Pad short rows so every row has the same number of columns.
        for _ in cols_written..max_cols {
            output.push_str(" |");
        }
        output.push('\n');

        if row_idx + 1 == header_end {
            output.push('|');
            for _ in 0..max_cols {
                output.push_str("---|");
            }
            output.push('\n');
        }
    }
    output
}
/// Renders `spans` (and any `tables`) into a Markdown string.
///
/// Processing steps:
///
/// 1. Spans are sorted by `reading_order`.
/// 2. When heading detection is enabled, the base font size is the median of
///    all span sizes of at least 9.0 (falling back to 12.0, never below 8.0).
/// 3. Each span is visited in order:
///    * Spans that `super::span_in_table` assigns to a table are skipped; the
///      table's Markdown is emitted the first time one of its spans is seen.
///    * A change of `group_id` or a paragraph-sized vertical gap ends the
///      current paragraph; a new line that starts with a bullet ends the
///      current line; any other new line is joined with a space (or with
///      layout-derived spacing when `preserve_layout` is set).
///    * Bullet glyphs become `- ` list markers.
///    * Heading candidates are written as `#`-prefixed lines.
///    * Remaining text is whitespace-normalised (unless `preserve_layout`),
///      linkified, and wrapped in `**` / `*` markers that stay open across
///      consecutive spans sharing the same formatting.
/// 4. Text that was skipped as table content but does not occur in the
///    rendered table ("orphans") is appended as its own paragraph, so that
///    whatever is written afterwards is not glued onto it.
/// 5. Tables that no span triggered are appended, followed by the pending
///    line.
/// 6. Unless `preserve_layout` is set, empty paragraphs are dropped and
///    paragraphs are trimmed. The text is then passed through
///    `super::merge_key_value_pairs` and, when
///    `enable_hyphenation_reconstruction` is set, through
///    `HyphenationHandler::process_text`.
///
/// Returns an empty string when both `spans` and `tables` are empty.
fn render_spans(
    &self,
    spans: &[OrderedTextSpan],
    tables: &[ExtractedTable],
    config: &TextPipelineConfig,
) -> Result<String> {
    if spans.is_empty() && tables.is_empty() {
        return Ok(String::new());
    }

    let mut sorted: Vec<_> = spans.iter().collect();
    sorted.sort_by_key(|s| s.reading_order);

    // Median font size of body-sized spans; sizes below 9.0 are ignored so
    // that small glyph spans do not drag the baseline down.
    let base_font_size = if config.output.detect_headings {
        let mut sizes_sorted: Vec<f32> = sorted
            .iter()
            .map(|s| s.span.font_size)
            .filter(|&s| s >= 9.0)
            .collect();
        sizes_sorted.sort_by(|a, b| crate::utils::safe_float_cmp(*a, *b));
        sizes_sorted
            .get(sizes_sorted.len() / 2)
            .copied()
            .unwrap_or(12.0)
            .max(8.0)
    } else {
        12.0
    };

    let mut tables_rendered = vec![false; tables.len()];
    let table_mds: Vec<String> = tables.iter().map(Self::render_table_markdown).collect();
    // Spans swallowed by each table, kept so orphaned text can be recovered.
    let mut table_skipped_spans: Vec<Vec<&OrderedTextSpan>> = vec![Vec::new(); tables.len()];

    let mut result = String::new();
    let mut prev_span: Option<&OrderedTextSpan> = None;
    let mut current_line = String::new();
    let mut active_bold = false;
    let mut active_italic = false;

    /// Closes any open italic/bold markers on `line`, placing them before
    /// trailing whitespace so the closing marker hugs the text.
    fn close_formatting(line: &mut String, bold: &mut bool, italic: &mut bool) {
        if !*bold && !*italic {
            return;
        }
        let content_end = line.trim_end().len();
        let trailing_ws = line[content_end..].to_string();
        line.truncate(content_end);
        // Italic was opened last, so it is closed first.
        if *italic {
            line.push('*');
            *italic = false;
        }
        if *bold {
            line.push_str("**");
            *bold = false;
        }
        line.push_str(&trailing_ws);
    }

    for span in sorted.iter() {
        // --- Table content: emit the table once, skip its spans. ---
        if !tables.is_empty() {
            if let Some(table_idx) = super::span_in_table(span, tables) {
                if !tables_rendered[table_idx] {
                    close_formatting(&mut current_line, &mut active_bold, &mut active_italic);
                    if !current_line.is_empty() {
                        result.push_str(current_line.trim());
                        result.push_str("\n\n");
                        current_line.clear();
                    }
                    result.push_str(&table_mds[table_idx]);
                    result.push('\n');
                    tables_rendered[table_idx] = true;
                    prev_span = None;
                }
                table_skipped_spans[table_idx].push(span);
                continue;
            }
        }

        // Spans whose baselines differ by less than half a font size share a line.
        let same_line = prev_span
            .map(|prev| (span.span.bbox.y - prev.span.bbox.y).abs() < span.span.font_size * 0.5)
            .unwrap_or(true);

        // --- Paragraph / line transitions relative to the previous span. ---
        if let Some(prev) = prev_span {
            let group_changed = match (span.group_id, prev.group_id) {
                (Some(a), Some(b)) => a != b,
                _ => false,
            };
            if group_changed || self.is_paragraph_break(span, prev) {
                close_formatting(&mut current_line, &mut active_bold, &mut active_italic);
                if !current_line.is_empty() {
                    result.push_str(current_line.trim());
                    result.push_str("\n\n");
                    current_line.clear();
                }
            } else if !same_line {
                let is_bullet = Self::is_bullet_span(&span.span.text)
                    || Self::starts_with_bullet(&span.span.text);
                if is_bullet {
                    // A bullet on a new line starts a new list item.
                    close_formatting(&mut current_line, &mut active_bold, &mut active_italic);
                    if !current_line.is_empty() {
                        result.push_str(current_line.trim());
                        result.push('\n');
                        current_line.clear();
                    }
                } else {
                    // Soft line wrap: continue the paragraph.
                    close_formatting(&mut current_line, &mut active_bold, &mut active_italic);
                    if config.output.preserve_layout {
                        let spacing = (span.span.bbox.x - prev.span.bbox.x).max(0.0) as usize;
                        for _ in 0..spacing.min(20) {
                            current_line.push(' ');
                        }
                    } else {
                        current_line.push(' ');
                    }
                }
            }
        }

        // --- Stand-alone bullet glyph. ---
        if Self::is_bullet_span(&span.span.text) {
            if same_line && !current_line.is_empty() && !current_line.ends_with("- ") {
                // Mid-line glyph: keep it verbatim.
                current_line.push_str(&span.span.text);
            } else if !current_line.ends_with("- ") {
                current_line.push_str("- ");
            }
            prev_span = Some(span);
            continue;
        }

        // --- Text that begins with a bullet glyph at the start of a line. ---
        if Self::starts_with_bullet(&span.span.text) && (!same_line || prev_span.is_none()) {
            let stripped = Self::strip_bullet(&span.span.text);
            if !current_line.ends_with("- ") {
                current_line.push_str("- ");
            }
            let normalized_bullet;
            let mut text = stripped;
            if !config.output.preserve_layout {
                normalized_bullet = self.normalize_whitespace(text);
                text = &normalized_bullet;
            }
            let linkified = self.linkify(text);
            let is_bold = self.is_bold(span, config);
            let is_italic = self.is_italic(span);
            if is_bold != active_bold || is_italic != active_italic {
                close_formatting(&mut current_line, &mut active_bold, &mut active_italic);
                if is_bold {
                    current_line.push_str("**");
                    active_bold = true;
                }
                if is_italic {
                    current_line.push('*');
                    active_italic = true;
                }
            }
            current_line.push_str(&linkified);
            prev_span = Some(span);
            continue;
        }

        // --- Headings. ---
        if config.output.detect_headings {
            // When both detectors fire, the more prominent (smaller) level wins.
            let level = match (
                self.heading_level_absolute(span),
                self.heading_level_ratio(span, base_font_size),
            ) {
                (Some(a), Some(b)) => Some(a.min(b)),
                (a, b) => a.or(b),
            };
            if let Some(level) = level {
                close_formatting(&mut current_line, &mut active_bold, &mut active_italic);
                if !current_line.is_empty() {
                    result.push_str(current_line.trim());
                    result.push_str("\n\n");
                    current_line.clear();
                }
                let prefix = "#".repeat(level as usize);
                result.push_str(&format!("{} {}\n\n", prefix, span.span.text.trim()));
                prev_span = None;
                continue;
            }
        }

        // --- Ordinary text. ---
        let mut text = span.span.text.as_str();
        let normalized;
        if !config.output.preserve_layout {
            // Normalisation trims the span; remember boundary whitespace so
            // words in adjacent spans are not fused together.
            let had_leading_space =
                same_line && prev_span.is_some() && text.starts_with(char::is_whitespace);
            let had_trailing_space = text.ends_with(char::is_whitespace);
            let mut norm = self.normalize_whitespace(text);
            if had_leading_space && !norm.starts_with(' ') {
                norm.insert(0, ' ');
            }
            if had_trailing_space && !norm.ends_with(' ') && !norm.is_empty() {
                norm.push(' ');
            }
            normalized = norm;
            text = &normalized;
        }
        let linkified = self.linkify(text);
        let is_bold = self.is_bold(span, config);
        let is_italic = self.is_italic(span);

        // Insert a space when the geometry reports a horizontal gap but
        // neither side carries whitespace.
        if same_line && !current_line.is_empty() {
            if let Some(prev) = prev_span {
                let needs_gap_space = !current_line.ends_with(' ')
                    && !linkified.starts_with(' ')
                    && super::has_horizontal_gap(&prev.span, &span.span);
                if needs_gap_space {
                    current_line.push(' ');
                }
            }
        }

        // Only touch markers when the formatting state actually changes, so
        // adjacent spans with the same style share one marker pair.
        if is_bold != active_bold || is_italic != active_italic {
            close_formatting(&mut current_line, &mut active_bold, &mut active_italic);
            if is_bold {
                current_line.push_str("**");
                active_bold = true;
            }
            if is_italic {
                current_line.push('*');
                active_italic = true;
            }
        }
        current_line.push_str(&linkified);
        prev_span = Some(span);
    }

    close_formatting(&mut current_line, &mut active_bold, &mut active_italic);

    // --- Recover text that was skipped as table content but is not in the
    // rendered table. ---
    for (table_idx, skipped) in table_skipped_spans.iter().enumerate() {
        if !tables_rendered[table_idx] || skipped.is_empty() {
            continue;
        }
        let rendered = &table_mds[table_idx];
        let mut orphans: Vec<&&OrderedTextSpan> = skipped
            .iter()
            .filter(|s| {
                let trimmed = s.span.text.trim();
                !trimmed.is_empty() && !rendered.contains(trimmed)
            })
            .collect();
        if !orphans.is_empty() {
            orphans.sort_by_key(|s| s.reading_order);
            for orphan in orphans {
                if !result.ends_with(' ') && !result.ends_with('\n') {
                    result.push(' ');
                }
                result.push_str(&orphan.span.text);
            }
            // End the orphan paragraph; otherwise the pending line or a
            // deferred table below would be glued directly onto this text.
            result.push_str("\n\n");
        }
    }

    // --- Tables that no span triggered are appended at the end. ---
    for (i, table) in tables.iter().enumerate() {
        if !tables_rendered[i] && !table.is_empty() {
            if !current_line.is_empty() {
                result.push_str(current_line.trim());
                result.push_str("\n\n");
                current_line.clear();
            }
            result.push_str(&table_mds[i]);
            result.push('\n');
        }
    }

    if !current_line.is_empty() {
        result.push_str(current_line.trim());
        result.push('\n');
    }

    // --- Final clean-up. ---
    let mut final_result = if config.output.preserve_layout {
        result
    } else {
        let cleaned = result
            .split("\n\n")
            .map(|para| para.trim())
            .filter(|para| !para.is_empty())
            .collect::<Vec<_>>()
            .join("\n\n");
        // Trimming removed the final newline; restore it if there was one.
        if result.ends_with('\n') && !cleaned.ends_with('\n') {
            format!("{}\n", cleaned)
        } else {
            cleaned
        }
    };

    final_result = super::merge_key_value_pairs(&final_result);

    if config.enable_hyphenation_reconstruction {
        let handler = HyphenationHandler::new();
        final_result = handler.process_text(&final_result);
    }

    Ok(final_result)
}
}
impl Default for MarkdownOutputConverter {
fn default() -> Self {
Self::new()
}
}
/// `OutputConverter` implementation producing Markdown text.
impl OutputConverter for MarkdownOutputConverter {
    /// Converts `spans` to Markdown without any table information.
    fn convert(&self, spans: &[OrderedTextSpan], config: &TextPipelineConfig) -> Result<String> {
        let no_tables: &[ExtractedTable] = &[];
        self.render_spans(spans, no_tables, config)
    }

    /// Converts `spans` to Markdown, rendering `tables` as Markdown tables
    /// in place of the spans that fall inside them.
    fn convert_with_tables(
        &self,
        spans: &[OrderedTextSpan],
        tables: &[ExtractedTable],
        config: &TextPipelineConfig,
    ) -> Result<String> {
        self.render_spans(spans, tables, config)
    }

    /// Identifier of this converter.
    fn name(&self) -> &'static str {
        const NAME: &str = "MarkdownOutputConverter";
        NAME
    }

    /// MIME type of the produced output.
    fn mime_type(&self) -> &'static str {
        const MIME: &str = "text/markdown";
        MIME
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::geometry::Rect;
    use crate::layout::{Color, TextSpan};
    use crate::pipeline::converters::span_in_table;
    use crate::structure::table_extractor::{TableCell, TableRow};

    // ------------------------------------------------------------------
    // Span builders
    // ------------------------------------------------------------------

    /// Single place where the test `TextSpan` literal is spelled out; the
    /// public-facing helpers below only differ in which values they fix.
    /// `order` is forwarded to `OrderedTextSpan::new`.
    fn build_span(
        text: &str,
        x: f32,
        y: f32,
        width: f32,
        font_size: f32,
        weight: FontWeight,
        order: usize,
    ) -> OrderedTextSpan {
        OrderedTextSpan::new(
            TextSpan {
                artifact_type: None,
                text: text.to_string(),
                bbox: Rect::new(x, y, width, font_size),
                font_name: "Test".to_string(),
                font_size,
                font_weight: weight,
                is_italic: false,
                is_monospace: false,
                color: Color::black(),
                mcid: None,
                sequence: 0,
                offset_semantic: false,
                split_boundary_before: false,
                char_spacing: 0.0,
                word_spacing: 0.0,
                horizontal_scaling: 100.0,
                primary_detected: false,
                char_widths: vec![],
            },
            order,
        )
    }

    /// Span with an explicit width, created with order 0.
    fn make_span_w(
        text: &str,
        x: f32,
        y: f32,
        width: f32,
        font_size: f32,
        weight: FontWeight,
    ) -> OrderedTextSpan {
        build_span(text, x, y, width, font_size, weight, 0)
    }

    /// Span with a fixed width of 50.0, created with order 0.
    fn make_span(
        text: &str,
        x: f32,
        y: f32,
        font_size: f32,
        weight: FontWeight,
    ) -> OrderedTextSpan {
        build_span(text, x, y, 50.0, font_size, weight, 0)
    }

    /// Span with an explicit width and reading order.
    fn make_span_with_width(
        text: &str,
        x: f32,
        y: f32,
        width: f32,
        font_size: f32,
        weight: FontWeight,
        order: usize,
    ) -> OrderedTextSpan {
        let mut s = build_span(text, x, y, width, font_size, weight, order);
        s.reading_order = order;
        s
    }

    /// Default pipeline config with heading detection switched on.
    fn config_with_headings() -> TextPipelineConfig {
        let mut config = TextPipelineConfig::default();
        config.output.detect_headings = true;
        config
    }

    // ------------------------------------------------------------------
    // Basic conversion
    // ------------------------------------------------------------------

    #[test]
    fn test_empty_spans() {
        let converter = MarkdownOutputConverter::new();
        let config = TextPipelineConfig::default();
        let result = converter.convert(&[], &config).unwrap();
        assert_eq!(result, "");
    }

    #[test]
    fn test_single_span() {
        let converter = MarkdownOutputConverter::new();
        let config = TextPipelineConfig::default();
        let spans = vec![make_span(
            "Hello world",
            0.0,
            100.0,
            12.0,
            FontWeight::Normal,
        )];
        let result = converter.convert(&spans, &config).unwrap();
        assert_eq!(result, "Hello world\n");
    }

    #[test]
    fn test_bold_text() {
        let converter = MarkdownOutputConverter::new();
        let config = TextPipelineConfig::default();
        let spans = vec![make_span("Bold text", 0.0, 100.0, 12.0, FontWeight::Bold)];
        let result = converter.convert(&spans, &config).unwrap();
        assert_eq!(result, "**Bold text**\n");
    }

    #[test]
    fn test_whitespace_bold_conservative() {
        let converter = MarkdownOutputConverter::new();
        let config = TextPipelineConfig::default();
        let spans = vec![make_span(" ", 0.0, 100.0, 12.0, FontWeight::Bold)];
        let result = converter.convert(&spans, &config).unwrap();
        assert!(!result.contains("**"));
    }

    // ------------------------------------------------------------------
    // Table rendering
    // ------------------------------------------------------------------

    #[test]
    fn test_convert_with_tables_renders_markdown_table() {
        let converter = MarkdownOutputConverter::new();
        let config = TextPipelineConfig::default();
        let mut table = ExtractedTable::new();
        table.bbox = Some(Rect::new(10.0, 50.0, 200.0, 100.0));
        table.col_count = 2;
        table.has_header = true;
        let mut header = TableRow::new(true);
        header.add_cell(TableCell::new("Name".to_string(), true));
        header.add_cell(TableCell::new("Value".to_string(), true));
        table.add_row(header);
        let mut data = TableRow::new(false);
        data.add_cell(TableCell::new("A".to_string(), false));
        data.add_cell(TableCell::new("1".to_string(), false));
        table.add_row(data);
        let result = converter
            .convert_with_tables(&[], &[table], &config)
            .unwrap();
        assert!(result.contains("| Name |"));
        assert!(result.contains("| Value |"));
        assert!(result.contains("---|"));
        assert!(result.contains("| A |"));
        assert!(result.contains("| 1 |"));
    }

    #[test]
    fn test_render_table_markdown_empty() {
        let table = ExtractedTable::new();
        let result = MarkdownOutputConverter::render_table_markdown(&table);
        assert_eq!(result, "");
    }

    #[test]
    fn test_render_table_markdown_single_row_no_header() {
        let mut table = ExtractedTable::new();
        let mut row = TableRow::new(false);
        row.add_cell(TableCell::new("A".to_string(), false));
        row.add_cell(TableCell::new("B".to_string(), false));
        table.add_row(row);
        let result = MarkdownOutputConverter::render_table_markdown(&table);
        assert!(result.contains("| A |"));
        assert!(result.contains("| B |"));
        assert!(result.contains("---|"));
    }

    #[test]
    fn test_render_table_markdown_with_colspan() {
        let mut table = ExtractedTable::new();
        table.has_header = true;
        let mut header = TableRow::new(true);
        header.add_cell(TableCell::new("Wide".to_string(), true).with_colspan(2));
        table.add_row(header);
        let mut data = TableRow::new(false);
        data.add_cell(TableCell::new("Left".to_string(), false));
        data.add_cell(TableCell::new("Right".to_string(), false));
        table.add_row(data);
        let result = MarkdownOutputConverter::render_table_markdown(&table);
        assert!(result.contains("| Wide |"));
        assert!(result.contains("---|---|"));
    }

    #[test]
    fn test_render_table_markdown_escapes_pipes() {
        let mut table = ExtractedTable::new();
        let mut row = TableRow::new(false);
        row.add_cell(TableCell::new("A|B".to_string(), false));
        table.add_row(row);
        let result = MarkdownOutputConverter::render_table_markdown(&table);
        assert!(result.contains("A\\|B"), "Pipes should be escaped: {}", result);
    }

    #[test]
    fn test_render_table_markdown_replaces_newlines() {
        let mut table = ExtractedTable::new();
        let mut row = TableRow::new(false);
        row.add_cell(TableCell::new("Line1\nLine2".to_string(), false));
        table.add_row(row);
        let result = MarkdownOutputConverter::render_table_markdown(&table);
        assert!(!result.contains("Line1\nLine2"), "Newlines in cells should be replaced");
        assert!(result.contains("Line1 Line2"));
    }

    #[test]
    fn test_render_table_markdown_trims_whitespace() {
        let mut table = ExtractedTable::new();
        let mut row = TableRow::new(false);
        row.add_cell(TableCell::new(" padded ".to_string(), false));
        table.add_row(row);
        let result = MarkdownOutputConverter::render_table_markdown(&table);
        assert!(result.contains("| padded |"));
    }

    #[test]
    fn test_render_table_markdown_multiple_header_rows() {
        let mut table = ExtractedTable::new();
        table.has_header = true;
        let mut h1 = TableRow::new(true);
        h1.add_cell(TableCell::new("H1".to_string(), true));
        table.add_row(h1);
        let mut h2 = TableRow::new(true);
        h2.add_cell(TableCell::new("H2".to_string(), true));
        table.add_row(h2);
        let mut d1 = TableRow::new(false);
        d1.add_cell(TableCell::new("D1".to_string(), false));
        table.add_row(d1);
        let result = MarkdownOutputConverter::render_table_markdown(&table);
        let lines: Vec<&str> = result.lines().collect();
        // Two header rows, the delimiter row, one data row.
        assert_eq!(lines.len(), 4);
        assert!(lines[2].contains("---|"));
    }

    // ------------------------------------------------------------------
    // span_in_table
    // ------------------------------------------------------------------

    #[test]
    fn test_span_in_table_match() {
        let span = make_span("text", 50.0, 70.0, 12.0, FontWeight::Normal);
        let mut table = ExtractedTable::new();
        table.bbox = Some(Rect::new(10.0, 50.0, 200.0, 100.0));
        assert_eq!(span_in_table(&span, &[table]), Some(0));
    }

    #[test]
    fn test_span_in_table_no_match() {
        let span = make_span("text", 500.0, 500.0, 12.0, FontWeight::Normal);
        let mut table = ExtractedTable::new();
        table.bbox = Some(Rect::new(10.0, 50.0, 200.0, 100.0));
        assert_eq!(span_in_table(&span, &[table]), None);
    }

    #[test]
    fn test_span_in_table_none_bbox() {
        let span = make_span("text", 50.0, 70.0, 12.0, FontWeight::Normal);
        // A freshly created table has no bounding box assigned here.
        let table = ExtractedTable::new();
        assert_eq!(span_in_table(&span, &[table]), None);
    }

    #[test]
    fn test_span_in_table_tolerance() {
        let span = make_span("text", 8.5, 48.5, 12.0, FontWeight::Normal);
        let mut table = ExtractedTable::new();
        table.bbox = Some(Rect::new(10.0, 50.0, 200.0, 100.0));
        assert_eq!(span_in_table(&span, &[table]), Some(0), "Should match within tolerance");
    }

    #[test]
    fn test_span_in_table_multiple_tables() {
        let span = make_span("text", 350.0, 70.0, 12.0, FontWeight::Normal);
        let mut t1 = ExtractedTable::new();
        t1.bbox = Some(Rect::new(10.0, 50.0, 200.0, 100.0));
        let mut t2 = ExtractedTable::new();
        t2.bbox = Some(Rect::new(300.0, 50.0, 200.0, 100.0));
        assert_eq!(span_in_table(&span, &[t1, t2]), Some(1));
    }

    // ------------------------------------------------------------------
    // Spans and tables together
    // ------------------------------------------------------------------

    #[test]
    fn test_convert_with_tables_mixed_content() {
        let converter = MarkdownOutputConverter::new();
        let config = TextPipelineConfig::default();
        let mut span_before = make_span("Before table", 10.0, 200.0, 12.0, FontWeight::Normal);
        span_before.reading_order = 0;
        let mut span_after = make_span("After table", 10.0, 20.0, 12.0, FontWeight::Normal);
        span_after.reading_order = 2;
        let mut span_in_table = make_span("Val", 50.0, 70.0, 12.0, FontWeight::Normal);
        span_in_table.reading_order = 1;
        let mut table = ExtractedTable::new();
        table.bbox = Some(Rect::new(10.0, 50.0, 200.0, 100.0));
        table.has_header = true;
        let mut header = TableRow::new(true);
        header.add_cell(TableCell::new("Col".to_string(), true));
        table.add_row(header);
        let mut data = TableRow::new(false);
        data.add_cell(TableCell::new("Val".to_string(), false));
        table.add_row(data);
        let result = converter
            .convert_with_tables(&[span_before, span_in_table, span_after], &[table], &config)
            .unwrap();
        assert!(result.contains("Before table"), "Should contain text before table");
        assert!(result.contains("| Col |"), "Should contain table");
        assert!(result.contains("After table"), "Should contain text after table");
    }

    #[test]
    fn test_convert_with_tables_no_tables_is_same_as_convert() {
        let converter = MarkdownOutputConverter::new();
        let config = TextPipelineConfig::default();
        let spans = vec![make_span("Hello", 0.0, 100.0, 12.0, FontWeight::Normal)];
        let result_convert = converter.convert(&spans, &config).unwrap();
        let result_with_tables = converter.convert_with_tables(&spans, &[], &config).unwrap();
        assert_eq!(result_convert, result_with_tables);
    }

    #[test]
    fn test_convert_with_tables_multiple_tables() {
        let converter = MarkdownOutputConverter::new();
        let config = TextPipelineConfig::default();
        let make_table = |x: f32, text: &str| -> ExtractedTable {
            let mut t = ExtractedTable::new();
            t.bbox = Some(Rect::new(x, 50.0, 100.0, 50.0));
            let mut row = TableRow::new(false);
            row.add_cell(TableCell::new(text.to_string(), false));
            t.add_row(row);
            t
        };
        let result = converter
            .convert_with_tables(&[], &[make_table(10.0, "T1"), make_table(200.0, "T2")], &config)
            .unwrap();
        assert!(result.contains("| T1 |"), "Should contain first table");
        assert!(result.contains("| T2 |"), "Should contain second table");
    }

    // ------------------------------------------------------------------
    // Bullets
    // ------------------------------------------------------------------

    #[test]
    fn test_is_bullet_span() {
        assert!(MarkdownOutputConverter::is_bullet_span("►"));
        assert!(MarkdownOutputConverter::is_bullet_span("•"));
        assert!(MarkdownOutputConverter::is_bullet_span("▪"));
        assert!(MarkdownOutputConverter::is_bullet_span(" ► "));
        assert!(!MarkdownOutputConverter::is_bullet_span("text"));
        assert!(!MarkdownOutputConverter::is_bullet_span("►text"));
        assert!(!MarkdownOutputConverter::is_bullet_span(""));
    }

    #[test]
    fn test_starts_with_bullet() {
        assert!(MarkdownOutputConverter::starts_with_bullet("►text"));
        assert!(MarkdownOutputConverter::starts_with_bullet("• item"));
        assert!(MarkdownOutputConverter::starts_with_bullet(" ► indented"));
        assert!(!MarkdownOutputConverter::starts_with_bullet("text"));
        assert!(!MarkdownOutputConverter::starts_with_bullet(""));
    }

    #[test]
    fn test_strip_bullet() {
        assert_eq!(MarkdownOutputConverter::strip_bullet("► text"), "text");
        assert_eq!(MarkdownOutputConverter::strip_bullet("•item"), "item");
        assert_eq!(MarkdownOutputConverter::strip_bullet("no bullet"), "no bullet");
    }

    #[test]
    fn test_bullet_spans_become_list_items() {
        let converter = MarkdownOutputConverter::new();
        let config = TextPipelineConfig::default();
        let mut title = make_span("FEATURES", 50.0, 660.0, 11.0, FontWeight::Bold);
        title.reading_order = 0;
        let mut bullet = make_span("►", 50.0, 640.0, 8.8, FontWeight::Normal);
        bullet.reading_order = 1;
        let mut text = make_span("Analog input", 60.0, 640.0, 11.0, FontWeight::Normal);
        text.reading_order = 2;
        let mut bullet2 = make_span("►", 50.0, 626.0, 8.8, FontWeight::Normal);
        bullet2.reading_order = 3;
        let mut text2 = make_span("16-bit ADC", 60.0, 626.0, 11.0, FontWeight::Normal);
        text2.reading_order = 4;
        let spans = vec![title, bullet, text, bullet2, text2];
        let result = converter.convert(&spans, &config).unwrap();
        assert!(
            result.contains("- Analog input"),
            "Should convert bullet to list item: {}",
            result
        );
        assert!(result.contains("- 16-bit ADC"), "Should convert second bullet: {}", result);
        assert!(!result.contains("►"), "Should not contain raw bullet character: {}", result);
    }

    #[test]
    fn test_inline_bullet_becomes_list_item() {
        let converter = MarkdownOutputConverter::new();
        let config = TextPipelineConfig::default();
        let mut title = make_span("TITLE", 50.0, 660.0, 11.0, FontWeight::Bold);
        title.reading_order = 0;
        let mut bullet_text = make_span("► Analog input", 50.0, 640.0, 11.0, FontWeight::Normal);
        bullet_text.reading_order = 1;
        let spans = vec![title, bullet_text];
        let result = converter.convert(&spans, &config).unwrap();
        assert!(
            result.contains("- Analog input"),
            "Should convert inline bullet to list item: {}",
            result
        );
    }

    #[test]
    fn test_first_span_inline_bullet() {
        let converter = MarkdownOutputConverter::new();
        let config = TextPipelineConfig::default();
        let mut bullet_text = make_span("► First item", 50.0, 660.0, 11.0, FontWeight::Normal);
        bullet_text.reading_order = 0;
        let mut bullet_text2 = make_span("► Second item", 50.0, 646.0, 11.0, FontWeight::Normal);
        bullet_text2.reading_order = 1;
        let spans = vec![bullet_text, bullet_text2];
        let result = converter.convert(&spans, &config).unwrap();
        assert!(
            result.contains("- First item"),
            "First-span inline bullet should become list item: {}",
            result
        );
        assert!(
            result.contains("- Second item"),
            "Second inline bullet should become list item: {}",
            result
        );
    }

    // ------------------------------------------------------------------
    // Headings
    // ------------------------------------------------------------------

    #[test]
    fn test_heading_base_font_excludes_small_spans() {
        let converter = MarkdownOutputConverter::new();
        let config = config_with_headings();
        let mut spans = Vec::new();
        let mut order = 0;
        // Ten small bullet glyph spans ...
        for i in 0..10 {
            let mut s = make_span("►", 50.0, 600.0 - (i as f32) * 14.0, 8.8, FontWeight::Normal);
            s.reading_order = order;
            order += 1;
            spans.push(s);
        }
        // ... followed by ten bold 11pt body spans.
        for i in 0..10 {
            let mut s = make_span(
                "body text content",
                60.0,
                600.0 - (i as f32) * 14.0,
                11.0,
                FontWeight::Bold,
            );
            s.reading_order = order;
            order += 1;
            spans.push(s);
        }
        let result = converter.convert(&spans, &config).unwrap();
        assert!(
            !result.contains("### body text content"),
            "11pt bold text should not be heading when base is 11pt: {}",
            result
        );
    }

    #[test]
    fn test_heading_detection_still_works_for_large_fonts() {
        let converter = MarkdownOutputConverter::new();
        let config = config_with_headings();
        let mut heading = make_span("BIG HEADING", 50.0, 100.0, 24.0, FontWeight::Bold);
        heading.reading_order = 0;
        let mut body = make_span("Body text", 50.0, 70.0, 11.0, FontWeight::Normal);
        body.reading_order = 1;
        let spans = vec![heading, body];
        let result = converter.convert(&spans, &config).unwrap();
        assert!(result.contains("# BIG HEADING"), "24pt text should be H1: {}", result);
    }

    // ------------------------------------------------------------------
    // Inter-span spacing and formatting consolidation
    // ------------------------------------------------------------------

    #[test]
    fn test_issue_260_single_word_bt_et_blocks_get_spaces() {
        let converter = MarkdownOutputConverter::new();
        let config = TextPipelineConfig::default();
        let spans = vec![
            make_span_with_width("The", 72.0, 500.0, 20.0, 12.0, FontWeight::Normal, 0),
            make_span_with_width("quick", 96.0, 500.0, 30.0, 12.0, FontWeight::Normal, 1),
            make_span_with_width("brown", 130.0, 500.0, 33.0, 12.0, FontWeight::Normal, 2),
        ];
        let result = converter.convert(&spans, &config).unwrap();
        assert!(
            result.contains("The quick brown"),
            "Single-word BT/ET spans with gaps should have spaces inserted: got {:?}",
            result
        );
    }

    #[test]
    fn test_issue_260_no_space_for_tight_spans() {
        let converter = MarkdownOutputConverter::new();
        let config = TextPipelineConfig::default();
        let spans = vec![
            make_span_with_width("Hel", 72.0, 500.0, 18.0, 12.0, FontWeight::Normal, 0),
            make_span_with_width("lo", 90.0, 500.0, 12.0, 12.0, FontWeight::Normal, 1),
        ];
        let result = converter.convert(&spans, &config).unwrap();
        assert!(
            result.contains("Hello"),
            "Tight spans should be merged without space: got {:?}",
            result
        );
    }

    #[test]
    fn test_bold_consolidation_adjacent_bold_spans() {
        let converter = MarkdownOutputConverter::new();
        let config = TextPipelineConfig::default();
        let mut s1 = make_span_w("ACME", 72.0, 700.0, 55.0, 12.0, FontWeight::Bold);
        s1.reading_order = 0;
        let mut s2 = make_span_w("GLOBAL", 130.0, 700.0, 42.0, 12.0, FontWeight::Bold);
        s2.reading_order = 1;
        let mut s3 = make_span_w("LTD.", 175.0, 700.0, 24.0, 12.0, FontWeight::Bold);
        s3.reading_order = 2;
        let spans = vec![s1, s2, s3];
        let result = converter.convert(&spans, &config).unwrap();
        assert!(
            result.contains("**ACME GLOBAL LTD.**"),
            "Adjacent bold spans should be consolidated into one bold block, got: {}",
            result
        );
        assert!(
            !result.contains("**ACME** **GLOBAL**"),
            "Should not wrap each word individually in bold markers, got: {}",
            result
        );
    }

    // ------------------------------------------------------------------
    // Table column padding
    // ------------------------------------------------------------------

    #[test]
    fn test_render_table_markdown_all_cells_present() {
        let mut table = ExtractedTable::new();
        table.has_header = true;
        table.col_count = 4;
        let mut header = TableRow::new(true);
        header.add_cell(TableCell::new("Account No.".to_string(), true));
        header.add_cell(TableCell::new("Reference".to_string(), true));
        header.add_cell(TableCell::new("Tax ID".to_string(), true));
        header.add_cell(TableCell::new("Confirmation".to_string(), true));
        table.add_row(header);
        let mut data = TableRow::new(false);
        data.add_cell(TableCell::new("20003035".to_string(), false));
        data.add_cell(TableCell::new("403852".to_string(), false));
        data.add_cell(TableCell::new("123 456 789".to_string(), false));
        data.add_cell(TableCell::new("4351966".to_string(), false));
        table.add_row(data);
        let result = MarkdownOutputConverter::render_table_markdown(&table);
        assert!(
            result.contains("403852"),
            "Reference value '403852' must be present in markdown table: {}",
            result
        );
        assert!(result.contains("20003035"), "Account No. value must be present: {}", result);
        assert!(result.contains("123 456 789"), "Tax ID value must be present: {}", result);
        assert!(result.contains("4351966"), "Confirmation value must be present: {}", result);
        assert!(result.contains("Reference"), "Header must be present: {}", result);
        assert!(result.contains("|"), "Must be markdown table format with pipe separators");
    }

    #[test]
    fn test_render_table_markdown_short_row_padded() {
        let mut table = ExtractedTable::new();
        table.has_header = true;
        table.col_count = 4;
        let mut header = TableRow::new(true);
        header.add_cell(TableCell::new("A".to_string(), true));
        header.add_cell(TableCell::new("B".to_string(), true));
        header.add_cell(TableCell::new("C".to_string(), true));
        header.add_cell(TableCell::new("D".to_string(), true));
        table.add_row(header);
        let mut data = TableRow::new(false);
        data.add_cell(TableCell::new("1".to_string(), false));
        data.add_cell(TableCell::new("2".to_string(), false));
        table.add_row(data);
        let result = MarkdownOutputConverter::render_table_markdown(&table);
        let lines: Vec<&str> = result.lines().collect();
        assert!(lines.len() >= 3, "Must have header, separator, and data row: {}", result);
        let header_pipes = lines[0].matches('|').count();
        let data_pipes = lines[2].matches('|').count();
        assert_eq!(
            header_pipes, data_pipes,
            "Header and data rows must have same number of pipe separators.\nHeader ({}): {}\nData ({}): {}",
            header_pipes, lines[0], data_pipes, lines[2]
        );
    }

    #[test]
    fn test_render_table_markdown_short_header_padded() {
        let mut table = ExtractedTable::new();
        table.has_header = true;
        table.col_count = 3;
        let mut header = TableRow::new(true);
        header.add_cell(TableCell::new("X".to_string(), true));
        header.add_cell(TableCell::new("Y".to_string(), true));
        table.add_row(header);
        let mut data = TableRow::new(false);
        data.add_cell(TableCell::new("1".to_string(), false));
        data.add_cell(TableCell::new("2".to_string(), false));
        data.add_cell(TableCell::new("3".to_string(), false));
        table.add_row(data);
        let result = MarkdownOutputConverter::render_table_markdown(&table);
        let lines: Vec<&str> = result.lines().collect();
        assert!(lines.len() >= 3, "Must have header, separator, and data row: {}", result);
        let header_pipes = lines[0].matches('|').count();
        let data_pipes = lines[2].matches('|').count();
        assert_eq!(
            header_pipes, data_pipes,
            "Header and data rows must have same number of pipe separators.\nHeader ({}): {}\nData ({}): {}",
            header_pipes, lines[0], data_pipes, lines[2]
        );
        assert!(result.contains("| 3 |"), "Third cell in data row must be present: {}", result);
    }

    // ------------------------------------------------------------------
    // Post-processing
    // ------------------------------------------------------------------

    #[test]
    fn test_key_value_pair_merging_in_markdown() {
        let converter = MarkdownOutputConverter::new();
        let config = TextPipelineConfig::default();
        let mut s0 = make_span("Grand Total", 50.0, 200.0, 12.0, FontWeight::Normal);
        s0.reading_order = 0;
        s0.group_id = Some(0);
        let mut s1 = make_span("$750.00", 300.0, 185.0, 12.0, FontWeight::Normal);
        s1.reading_order = 1;
        s1.group_id = Some(1);
        let spans = vec![s0, s1];
        let result = converter.convert(&spans, &config).unwrap();
        assert!(
            result.contains("Grand Total $750.00"),
            "Should merge label with value on same line: {:?}",
            result,
        );
    }
}