use crate::converters::text_post_processor::TextPostProcessor;
use crate::converters::whitespace::cleanup_markdown;
use crate::converters::{BoldMarkerBehavior, ConversionOptions, ReadingOrderMode};
use crate::error::Result;
use crate::extractors::SpacingConfig;
use crate::geometry::Rect;
use crate::layout::clustering::{cluster_chars_into_words, cluster_words_into_lines};
use crate::layout::document_analyzer::{AdaptiveLayoutParams, DocumentProperties};
use crate::layout::reading_order::graph_based_reading_order;
use crate::layout::{
BoldGroup, BoldMarkerDecision, BoldMarkerValidator, Color, FontWeight, TextBlock, TextChar,
TextSpan,
};
use crate::structure::spatial_table_detector::SpatialTableDetector;
use crate::structure::table_extractor::{ExtractedTable, TableRow};
use crate::XYCutStrategy;
use lazy_static::lazy_static;
use regex::{Captures, Regex};
// Regular expressions shared by the link-formatting and spacing helpers below.
// They are compiled once on first use; the patterns are literals, so the
// `unwrap()` calls can only fail if a pattern itself is edited into an invalid one.
lazy_static! {
// `http://` or `https://` followed by characters that are not whitespace,
// `<`, `>`, `[` or `]`; the final character additionally may not be one of `.,!?;:`.
static ref RE_URL: Regex = Regex::new(r"(https?://[^\s<>\[\]]*[^\s<>\[\].,!?;:])").unwrap();
// `local@domain.tld` where the TLD is at least two ASCII letters.
static ref RE_EMAIL: Regex = Regex::new(r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})").unwrap();
// Digit, whitespace, then an en/em dash immediately followed by a digit.
static ref RE_DASH_BEFORE: Regex = Regex::new(r"(\d)\s+(–|—)(\d)").unwrap();
// Digit immediately followed by an en/em dash, whitespace, then a digit.
static ref RE_DASH_AFTER: Regex = Regex::new(r"(\d)(–|—)\s+(\d)").unwrap();
// One of `.!?;:,` immediately followed by an ASCII letter.
static ref RE_PUNCT_SPACE: Regex = Regex::new(r"([.!?;:,])([A-Za-z])").unwrap();
}
/// Stateless converter that turns extracted page text (characters or spans)
/// into Markdown.
///
/// Deprecated since 0.2.0; the deprecation note below names the replacement.
#[derive(Debug)]
#[deprecated(
since = "0.2.0",
note = "Use `pdf_oxide::pipeline::converters::MarkdownOutputConverter` instead. \
The new converter is part of the unified TextPipeline architecture and \
provides better feature support and maintainability."
)]
pub struct MarkdownConverter;
#[allow(deprecated)]
impl MarkdownConverter {
pub fn new() -> Self {
Self
}
/// Merges consecutive blocks that look like fragments of the same word.
///
/// Two neighbouring blocks (in input order) are merged when all of the
/// following hold:
/// - their `bbox.y` values differ by less than 2.0,
/// - they share `dominant_font`, and `avg_font_size` differs by less than 0.5,
/// - they share BOTH `is_bold` and `is_italic` (so a merge never drops a style),
/// - neither block is whitespace-only,
/// - the horizontal gap between them is below 18% of the left block's font size.
///
/// On a merge the right block's text is appended to the left block and the
/// left block's width is stretched to the right block's right edge; all other
/// fields of the left block are kept. Input order is otherwise preserved.
fn merge_adjacent_char_spans(blocks: Vec<TextBlock>) -> Vec<TextBlock> {
    let mut merged: Vec<TextBlock> = Vec::with_capacity(blocks.len());

    for block in blocks {
        if let Some(prev) = merged.last_mut() {
            let same_line = (prev.bbox.y - block.bbox.y).abs() < 2.0;
            let same_font = prev.dominant_font == block.dominant_font;
            let same_size = (prev.avg_font_size - block.avg_font_size).abs() < 0.5;
            // Italic must match as well: the renderer groups on both flags, so
            // merging across an italic boundary would silently lose styling.
            let same_style = prev.is_bold == block.is_bold && prev.is_italic == block.is_italic;

            // Whitespace-only blocks are never glued onto a neighbour.
            let both_visible = !prev.text.trim().is_empty() && !block.text.trim().is_empty();

            let prev_right = prev.bbox.x + prev.bbox.width;
            let gap = block.bbox.x - prev_right;
            let merge_threshold = prev.avg_font_size * 0.18;

            if same_line
                && same_font
                && same_size
                && same_style
                && both_visible
                && gap < merge_threshold
            {
                prev.text.push_str(&block.text);
                prev.bbox.width = (block.bbox.x + block.bbox.width) - prev.bbox.x;
                continue;
            }
        }
        merged.push(block);
    }

    merged
}
pub fn convert_page_from_spans(
&self,
spans: &[crate::layout::TextSpan],
options: &ConversionOptions,
) -> Result<String> {
use crate::layout::TextBlock;
if spans.is_empty() {
return Ok(String::new());
}
let detector = crate::fonts::non_text_detection::NonTextDetector::default();
let span_classifications = detector.mark_non_text_spans(spans);
let figures_detected = span_classifications
.iter()
.filter(|c| c.is_non_text)
.count();
if figures_detected > 0 {
log::debug!("Detected {} figure(s) out of {} spans", figures_detected, spans.len());
}
let detected_tables = if options.extract_tables {
let detector_config = options.table_detection_config.clone().unwrap_or_default();
let table_detector = SpatialTableDetector::with_config(detector_config);
let tables = table_detector.detect_tables(spans);
if !tables.is_empty() {
log::debug!("Detected {} table(s) from {} spans", tables.len(), spans.len());
}
tables
} else {
Vec::new()
};
let _table_span_indices: std::collections::HashSet<usize> = detected_tables
.iter()
.flat_map(|table| &table.span_indices)
.copied()
.collect();
let mut blocks: Vec<TextBlock> = spans
.iter()
.enumerate()
.filter_map(|(idx, span)| {
let classification = &span_classifications[idx];
if classification.is_non_text {
log::debug!(
"Filtering out non-text span: '{}...' (confidence: {:.2})",
span.text.chars().take(20).collect::<String>(),
classification.confidence
);
return None; }
Some(TextBlock {
chars: vec![], bbox: span.bbox,
text: span.text.clone(),
avg_font_size: span.font_size,
dominant_font: span.font_name.clone(),
is_bold: span.font_weight.is_bold(),
is_italic: span.is_italic,
mcid: span.mcid,
})
})
.collect();
blocks.sort_by(|a, b| {
let y_cmp = crate::utils::safe_float_cmp(a.bbox.y, b.bbox.y);
if y_cmp != std::cmp::Ordering::Equal {
return y_cmp;
}
crate::utils::safe_float_cmp(a.bbox.x, b.bbox.x)
});
let initial_count = blocks.len();
let mut whitespace_count = 0;
blocks.retain(|block| {
let is_whitespace = block.text.trim().is_empty();
if is_whitespace {
whitespace_count += 1;
}
!is_whitespace
});
let filtered_count = blocks.len();
log::debug!(
"Pre-grouping whitespace filter: removed {} whitespace-only blocks ({} → {})",
whitespace_count,
initial_count,
filtered_count
);
let mut neutralized_count = 0;
for block in &mut blocks {
let has_alphanumeric = block.text.chars().any(|c| c.is_alphanumeric());
let has_non_whitespace = block.text.chars().any(|c| !c.is_whitespace());
let should_neutralize = if !has_alphanumeric {
true
} else if has_non_whitespace && block.text.len() == 1 {
let ch = match block.text.chars().next() {
Some(c) => c,
None => continue,
};
!ch.is_alphabetic() && ch != ' ' && ch != '\t' && ch != '\n'
} else {
false
};
if should_neutralize && block.is_bold {
log::debug!("Neutralizing bold on non-word block: '{}'", block.text);
block.is_bold = false;
neutralized_count += 1;
}
}
if neutralized_count > 0 {
log::debug!("Neutralized {} bold flags on non-word blocks", neutralized_count);
}
blocks = Self::merge_adjacent_char_spans(blocks);
let ordered_indices =
self.determine_reading_order(&blocks, ReadingOrderMode::TopToBottomLeftToRight, None);
let mut markdown = String::new();
let mut current_line: Vec<usize> = Vec::new();
let mut current_y: Option<f32> = None;
let render_line = |line_indices: &[usize], markdown: &mut String| {
if line_indices.is_empty() {
return;
}
let mut i = 0;
while i < line_indices.len() {
let idx = line_indices[i];
let block = &blocks[idx];
let is_bold = block.is_bold;
let is_italic = block.is_italic;
let mut j = i + 1;
while j < line_indices.len()
&& blocks[line_indices[j]].is_bold == is_bold
&& blocks[line_indices[j]].is_italic == is_italic
{
j += 1;
}
let prev_char = if markdown.is_empty() {
None
} else {
markdown.chars().last()
};
let next_char_after_group = if j < line_indices.len() {
blocks[line_indices[j]].text.chars().next()
} else {
None
};
let spacing_config = SpacingConfig::default();
let mut group_text = String::new();
for k in i..j {
let block_idx = line_indices[k];
let current_block = &blocks[block_idx];
if !group_text.is_empty() && k > i {
let prev_block = &blocks[line_indices[k - 1]];
let gap = current_block.bbox.left() - prev_block.bbox.right();
let char_size = prev_block.bbox.width.max(prev_block.bbox.height);
let threshold = spacing_config.word_margin * char_size;
let prev_ends_space = prev_block
.text
.chars()
.last()
.is_some_and(|c| c.is_whitespace());
let curr_starts_space = current_block
.text
.chars()
.next()
.is_some_and(|c| c.is_whitespace());
if gap > threshold && !prev_ends_space && !curr_starts_space {
group_text.push(' ');
}
}
group_text.push_str(¤t_block.text);
}
let formatted_text = Self::format_links(&group_text);
let cleaned_text = Self::clean_reference_spacing(&formatted_text);
if cleaned_text.trim().is_empty() {
log::debug!(
"Skipping bold markers: content became whitespace-only after formatting"
);
markdown.push_str(&cleaned_text);
continue;
}
let first_char_in_group = cleaned_text.chars().next();
let last_char_in_group = cleaned_text.chars().last();
let can_insert_open = should_insert_bold_marker(prev_char, first_char_in_group);
let can_insert_close =
should_insert_bold_marker(last_char_in_group, next_char_after_group);
let should_render_bold_markers = match options.bold_marker_behavior {
BoldMarkerBehavior::Aggressive => true,
BoldMarkerBehavior::Conservative => is_content_block(&cleaned_text),
};
let group = BoldGroup {
text: cleaned_text.clone(),
is_bold,
first_char_in_group,
last_char_in_group,
};
let should_check_validator =
is_bold && can_insert_open && can_insert_close && should_render_bold_markers;
let marker_decision = if should_check_validator {
BoldMarkerValidator::can_insert_markers(&group)
} else {
BoldMarkerDecision::Skip(
crate::layout::bold_validation::ValidatorError::NotBold,
)
};
let should_insert_bold_markers =
matches!(marker_decision, BoldMarkerDecision::Insert);
if should_insert_bold_markers {
match (is_bold, is_italic) {
(true, true) => markdown.push_str("***"), (true, false) => markdown.push_str("**"), (false, true) => markdown.push('*'), (false, false) => {}, }
} else if let BoldMarkerDecision::Skip(reason) = &marker_decision {
log::debug!(
"Skipping bold markers: {:?} for '{}'",
reason,
group.text.chars().take(20).collect::<String>()
);
}
markdown.push_str(&group.text);
if should_insert_bold_markers {
match (is_bold, is_italic) {
(true, true) => markdown.push_str("***"), (true, false) => markdown.push_str("**"), (false, true) => markdown.push('*'), (false, false) => {}, }
}
i = j;
}
markdown.push('\n');
};
for &idx in &ordered_indices {
let block = &blocks[idx];
let block_y = block.bbox.y;
match current_y {
Some(y) if (y - block_y).abs() < 2.0 => {
current_line.push(idx);
},
_ => {
render_line(¤t_line, &mut markdown);
current_line.clear();
current_line.push(idx);
current_y = Some(block_y);
},
}
}
render_line(¤t_line, &mut markdown);
let spaced = Self::insert_missing_punctuation_spaces(&markdown);
let cleaned = cleanup_markdown(&spaced);
let post_processed = TextPostProcessor::process(&cleaned);
Ok(post_processed)
}
/// Converts the individual characters of one page into Markdown.
///
/// Characters are sorted, clustered into words (epsilon = 0.8 × median font
/// size) and then into lines (threshold 5.0). The lines are ordered according
/// to `options.reading_order_mode`, passed through link formatting and
/// dash-spacing cleanup, joined with newlines and post-processed.
///
/// Returns an empty string when there are no characters, no words or no lines.
pub fn convert_page(&self, chars: &[TextChar], options: &ConversionOptions) -> Result<String> {
    if chars.is_empty() {
        return Ok(String::new());
    }

    let mut sorted_chars = chars.to_vec();
    sorted_chars.sort_by(|lhs, rhs| {
        // Primary key is y with the operands swapped, secondary key is x.
        crate::utils::safe_float_cmp(rhs.bbox.y, lhs.bbox.y)
            .then_with(|| crate::utils::safe_float_cmp(lhs.bbox.x, rhs.bbox.x))
    });

    let word_epsilon = Self::compute_median_font_size(&sorted_chars) * 0.8;
    let word_clusters = cluster_chars_into_words(&sorted_chars, word_epsilon);

    let mut words = Vec::new();
    for cluster in &word_clusters {
        let members: Vec<TextChar> = cluster
            .iter()
            .map(|&char_idx| sorted_chars[char_idx].clone())
            .collect();
        if members.is_empty() {
            continue;
        }
        words.push(TextBlock::from_chars(members));
    }
    if words.is_empty() {
        return Ok(String::new());
    }

    let line_clusters = cluster_words_into_lines(&words, 5.0);
    let mut consumed = vec![false; words.len()];
    let mut lines = Vec::new();
    for cluster in &line_clusters {
        if cluster.is_empty() {
            continue;
        }
        let mut line_chars: Vec<TextChar> = Vec::new();
        for &word_idx in cluster {
            // Each word donates its characters to at most one line.
            if std::mem::replace(&mut consumed[word_idx], true) {
                continue;
            }
            line_chars.append(&mut words[word_idx].chars);
        }
        if !line_chars.is_empty() {
            lines.push(TextBlock::from_chars(line_chars));
        }
    }
    if lines.is_empty() {
        return Ok(String::new());
    }

    let page_bbox = Self::calculate_bounding_box(&lines);
    // Analysis failure is not fatal: ordering simply runs without adaptive parameters.
    let adaptive_params = DocumentProperties::analyze(&sorted_chars, page_bbox)
        .ok()
        .map(|props| AdaptiveLayoutParams::from_properties(&props));
    let _heading_levels = vec![(); lines.len()];

    let reading_order = self.determine_reading_order(
        &lines,
        options.reading_order_mode.clone(),
        adaptive_params.as_ref(),
    );

    let mut markdown = String::new();
    for &line_idx in &reading_order {
        let linked = Self::format_links(&lines[line_idx].text);
        markdown.push_str(&Self::clean_reference_spacing(&linked));
        markdown.push('\n');
    }

    let tidied = cleanup_markdown(&markdown);
    Ok(TextPostProcessor::process(&tidied))
}
/// Returns the indices of `blocks` in the order they should be read.
///
/// - `TopToBottomLeftToRight`: stable sort on `bbox.y` (compared with the
///   operands swapped, so larger `y` comes first), ties broken by `bbox.x`.
/// - `ColumnAware`: delegates to the XY-Cut based ordering.
/// - `StructureTreeFirst`: orders by the supplied MCID sequence, or falls
///   back to `graph_based_reading_order` when that sequence is empty.
///
/// `_adaptive_params` is accepted but not used. Empty input yields an empty vector.
fn determine_reading_order(
    &self,
    blocks: &[TextBlock],
    mode: ReadingOrderMode,
    _adaptive_params: Option<&AdaptiveLayoutParams>,
) -> Vec<usize> {
    if blocks.is_empty() {
        return Vec::new();
    }

    match mode {
        ReadingOrderMode::TopToBottomLeftToRight => {
            let mut order: Vec<usize> = (0..blocks.len()).collect();
            order.sort_by(|&lhs, &rhs| {
                let (first, second) = (&blocks[lhs].bbox, &blocks[rhs].bbox);
                crate::utils::safe_float_cmp(second.y, first.y)
                    .then_with(|| crate::utils::safe_float_cmp(first.x, second.x))
            });
            order
        },
        ReadingOrderMode::ColumnAware => {
            let order = Self::xycut_reading_order(blocks);
            log::info!("Using XY-Cut algorithm for column-aware reading order");
            order
        },
        ReadingOrderMode::StructureTreeFirst { ref mcid_order } => {
            if mcid_order.is_empty() {
                log::info!("No MCIDs found, falling back to graph-based reading order");
                graph_based_reading_order(blocks)
            } else {
                let order = Self::reorder_by_mcid(blocks, mcid_order);
                log::info!("Using structure tree for reading order (Tagged PDF)");
                order
            }
        },
    }
}
/// Orders block indices by the MCID sequence taken from the structure tree.
///
/// Blocks are emitted in the order their MCID first appears in `mcid_order`;
/// blocks sharing an MCID are ordered by `bbox.y` (operands swapped, larger
/// `y` first) and then `bbox.x`. Every block that was not placed this way —
/// because it has no MCID or its MCID is absent from `mcid_order` — is
/// appended afterwards in its original order, so no text is lost.
///
/// The result contains each index of `blocks` exactly once, even when
/// `mcid_order` repeats an MCID.
fn reorder_by_mcid(blocks: &[TextBlock], mcid_order: &[u32]) -> Vec<usize> {
    use std::collections::HashMap;

    let mut mcid_to_blocks: HashMap<u32, Vec<usize>> = HashMap::new();
    for (idx, block) in blocks.iter().enumerate() {
        if let Some(mcid) = block.mcid {
            mcid_to_blocks.entry(mcid).or_default().push(idx);
        }
    }

    let mut ordered_indices = Vec::with_capacity(blocks.len());
    let mut placed = vec![false; blocks.len()];

    for mcid in mcid_order {
        // `remove` (not `get`) so a repeated MCID cannot emit its blocks twice.
        let Some(mut group) = mcid_to_blocks.remove(mcid) else {
            continue;
        };
        group.sort_by(|&a, &b| {
            let (first, second) = (&blocks[a].bbox, &blocks[b].bbox);
            crate::utils::safe_float_cmp(second.y, first.y)
                .then_with(|| crate::utils::safe_float_cmp(first.x, second.x))
        });
        for idx in group {
            placed[idx] = true;
            ordered_indices.push(idx);
        }
    }

    // Leftovers: blocks without an MCID and blocks whose MCID the structure
    // tree did not list. Walking indices keeps this deterministic (iterating
    // the map would not be).
    ordered_indices.extend((0..blocks.len()).filter(|&idx| !placed[idx]));
    ordered_indices
}
/// Orders blocks with the XY-Cut partitioning strategy.
///
/// Each block is wrapped in a temporary `TextSpan` whose `sequence` is the
/// block's index; the strategy (valley threshold 0.25, minimum valley width
/// 12.0) partitions those spans into groups, and the sequences are read back
/// group by group.
fn xycut_reading_order(blocks: &[TextBlock]) -> Vec<usize> {
    if blocks.is_empty() {
        return Vec::new();
    }

    // Only text, geometry, font and MCID come from the block; every other
    // span field is a fixed placeholder value.
    let to_span = |(seq, block): (usize, &TextBlock)| {
        let font_weight = if block.is_bold {
            FontWeight::Bold
        } else {
            FontWeight::Normal
        };
        TextSpan {
            text: block.text.clone(),
            bbox: block.bbox,
            font_name: block.dominant_font.clone(),
            font_size: block.avg_font_size,
            font_weight,
            is_italic: block.is_italic,
            is_monospace: false,
            color: Color::black(),
            mcid: block.mcid,
            sequence: seq,
            split_boundary_before: false,
            offset_semantic: false,
            char_spacing: 0.0,
            word_spacing: 0.0,
            horizontal_scaling: 100.0,
            primary_detected: false,
            artifact_type: None,
            char_widths: vec![],
        }
    };
    let spans: Vec<TextSpan> = blocks.iter().enumerate().map(to_span).collect();

    let strategy = XYCutStrategy::new()
        .with_valley_threshold(0.25)
        .with_min_valley_width(12.0);
    let groups = strategy.partition_region(&spans);

    groups
        .iter()
        .flatten()
        .map(|span| span.sequence)
        .collect()
}
/// Returns the median `font_size` of `chars`, or 12.0 when `chars` is empty.
///
/// For an even number of characters the two middle values are averaged.
fn compute_median_font_size(chars: &[TextChar]) -> f32 {
    if chars.is_empty() {
        return 12.0;
    }

    let mut sizes: Vec<f32> = Vec::with_capacity(chars.len());
    sizes.extend(chars.iter().map(|glyph| glyph.font_size));
    sizes.sort_by(|lhs, rhs| crate::utils::safe_float_cmp(*lhs, *rhs));

    let upper = sizes.len() / 2;
    if !sizes.len().is_multiple_of(2) {
        return sizes[upper];
    }
    (sizes[upper - 1] + sizes[upper]) / 2.0
}
/// Returns the smallest rectangle enclosing every block's `bbox`.
///
/// An empty slice yields `Rect::new(0.0, 0.0, 0.0, 0.0)`.
fn calculate_bounding_box(blocks: &[TextBlock]) -> Rect {
    if blocks.is_empty() {
        return Rect::new(0.0, 0.0, 0.0, 0.0);
    }

    let start = (f32::INFINITY, f32::INFINITY, f32::NEG_INFINITY, f32::NEG_INFINITY);
    let (min_x, min_y, max_x, max_y) =
        blocks
            .iter()
            .fold(start, |(lo_x, lo_y, hi_x, hi_y), block| {
                (
                    lo_x.min(block.bbox.left()),
                    lo_y.min(block.bbox.bottom()),
                    hi_x.max(block.bbox.right()),
                    hi_y.max(block.bbox.top()),
                )
            });

    Rect::from_points(min_x, min_y, max_x, max_y)
}
/// Wraps bare URLs and e-mail addresses in Markdown link syntax.
///
/// A URL becomes `[url](url)` unless the input already contains `[url]`.
/// An address becomes `[addr](mailto:addr)` unless the URL-processed text
/// already contains `[addr]` or `//addr` (the address is part of a URL).
/// The regexes only run when the text contains `://`, `www.` or `@`.
fn format_links(text: &str) -> String {
    let url_candidate = text.contains("://") || text.contains("www.");
    let email_candidate = text.contains('@');

    let mut result = text.to_string();

    if url_candidate {
        result = RE_URL
            .replace_all(&result, |caps: &Captures| {
                let url = &caps[1];
                let bracketed = format!("[{}]", url);
                // The "already linked" check looks at the untouched input.
                if text.contains(&bracketed) {
                    url.to_string()
                } else {
                    format!("[{}]({})", url, url)
                }
            })
            .into_owned();
    }

    if email_candidate {
        result = RE_EMAIL
            .replace_all(&result, |caps: &Captures| {
                let email = &caps[1];
                let already_linked = result.contains(&format!("[{}]", email))
                    || result.contains(&format!("//{}", email));
                if already_linked {
                    email.to_string()
                } else {
                    format!("[{}](mailto:{})", email, email)
                }
            })
            .into_owned();
    }

    result
}
/// Removes whitespace on one side of an en/em dash that sits between two digits.
///
/// First pass: `digit, whitespace, dash, digit`; second pass:
/// `digit, dash, whitespace, digit`. Both keep the digits and the dash.
fn clean_reference_spacing(text: &str) -> String {
    let without_leading_gap = RE_DASH_BEFORE.replace_all(text, "$1$2$3");
    RE_DASH_AFTER
        .replace_all(&without_leading_gap, "$1$2$3")
        .into_owned()
}
/// Inserts a space between `.!?;:,` and an ASCII letter that follows it
/// directly (e.g. `end.Next` becomes `end. Next`).
///
/// The text is processed one whitespace-delimited token at a time; all
/// whitespace is copied through unchanged. Tokens that look like links —
/// containing `://`, `www.` or `@` — are left untouched, because this runs
/// after link formatting and would otherwise turn `example.com` into
/// `example. com` inside generated links and addresses.
fn insert_missing_punctuation_spaces(text: &str) -> String {
    let mut spaced = String::with_capacity(text.len());
    let mut rest = text;

    while !rest.is_empty() {
        // Next run of non-whitespace characters (empty if `rest` starts with whitespace).
        let token_len = rest.find(char::is_whitespace).unwrap_or(rest.len());
        let (token, after_token) = rest.split_at(token_len);

        let is_link_like =
            token.contains("://") || token.contains("www.") || token.contains('@');
        if is_link_like {
            spaced.push_str(token);
        } else {
            // A match never spans whitespace, so per-token replacement gives
            // the same result as running the regex over the whole text.
            spaced.push_str(&RE_PUNCT_SPACE.replace_all(token, "${1} ${2}"));
        }

        // Following run of whitespace, copied verbatim.
        let gap_len = after_token
            .find(|c: char| !c.is_whitespace())
            .unwrap_or(after_token.len());
        let (gap, remainder) = after_token.split_at(gap_len);
        spaced.push_str(gap);
        rest = remainder;
    }

    spaced
}
}
#[allow(deprecated)]
impl Default for MarkdownConverter {
fn default() -> Self {
Self::new()
}
}
/// Returns `true` when `text` contains at least one non-whitespace character.
///
/// The empty string and whitespace-only strings are not content.
pub fn is_content_block(text: &str) -> bool {
    !text.chars().all(char::is_whitespace)
}
/// Decides whether a bold marker may be placed between `prev_char` and `next_char`.
///
/// Returns `false` in two situations:
/// - both neighbours are alphanumeric (the marker would land inside a word);
/// - a closing `)`, `]` or `}` is directly followed by one of
///   `= - + < > * / & | ^`.
///
/// Everything else — including a missing neighbour on either side — returns `true`.
fn should_insert_bold_marker(prev_char: Option<char>, next_char: Option<char>) -> bool {
    let (Some(before), Some(after)) = (prev_char, next_char) else {
        return true;
    };

    let inside_word = before.is_alphanumeric() && after.is_alphanumeric();
    let closer_then_operator = matches!(before, ')' | ']' | '}')
        && matches!(after, '=' | '-' | '+' | '<' | '>' | '*' | '/' | '&' | '|' | '^');

    !(inside_word || closer_then_operator)
}
/// Renders an extracted table as a Markdown pipe table.
///
/// The leading run of rows flagged `is_header` forms the header; when the
/// first row is not a header it is used as the header anyway. A `|---|…`
/// separator with `col_count` columns follows the header, then the remaining
/// rows. A table without rows renders as an empty string.
#[allow(dead_code)]
fn render_markdown_table(table: &ExtractedTable) -> String {
    let mut md = String::new();
    if table.rows.is_empty() {
        return md;
    }

    let leading_headers = table.rows.iter().take_while(|row| row.is_header).count();
    // Markdown needs a header row, so promote the first row if none is flagged.
    let header_row_count = leading_headers.max(1);
    let (header_rows, body_rows) = table.rows.split_at(header_row_count);

    for row in header_rows {
        md.push_str(&render_table_row(row));
        md.push('\n');
    }

    md.push('|');
    for _ in 0..table.col_count {
        md.push_str("---|");
    }
    md.push('\n');

    for row in body_rows {
        md.push_str(&render_table_row(row));
        md.push('\n');
    }
    md
}
/// Renders one table row as `| a | b |`.
///
/// Pipes inside a cell are escaped as `\|` and the cell text is trimmed.
/// A cell is written once per spanned column (`colspan` times).
#[allow(dead_code)]
fn render_table_row(row: &TableRow) -> String {
    let mut line = String::from("|");
    for cell in &row.cells {
        let escaped = cell.text.replace('|', "\\|");
        let rendered = format!(" {} |", escaped.trim());
        for _ in 0..cell.colspan {
            line.push_str(&rendered);
        }
    }
    line
}
#[cfg(test)]
#[allow(deprecated)]
mod tests {
use super::*;
use crate::geometry::Rect;
use crate::layout::bold_validation::ValidatorError;
use crate::layout::{Color, FontWeight};
/// Builds a test character at `(x, y)` in an 8.0-wide, `font_size`-high box,
/// set in "Times", with origin and advance width taken from that box.
fn mock_char(c: char, x: f32, y: f32, font_size: f32, bold: bool) -> TextChar {
    let font_weight = if bold {
        FontWeight::Bold
    } else {
        FontWeight::Normal
    };
    let bbox = Rect::new(x, y, 8.0, font_size);
    let (origin_x, origin_y, advance_width) = (bbox.x, bbox.y, bbox.width);

    TextChar {
        char: c,
        bbox,
        font_name: String::from("Times"),
        font_size,
        font_weight,
        is_italic: false,
        is_monospace: false,
        color: Color::black(),
        mcid: None,
        origin_x,
        origin_y,
        rotation_degrees: 0.0,
        advance_width,
        matrix: None,
    }
}
/// Builds one test character per char of `text`, advancing 7.0 in x per character.
fn mock_word(text: &str, x: f32, y: f32, font_size: f32, bold: bool) -> Vec<TextChar> {
    let mut glyphs = Vec::new();
    for (position, symbol) in text.chars().enumerate() {
        let glyph_x = x + (position as f32 * 7.0);
        glyphs.push(mock_char(symbol, glyph_x, y, font_size, bold));
    }
    glyphs
}
// `new()` yields a value whose Debug output names the type.
#[test]
fn test_markdown_converter_new() {
    let debug_repr = format!("{:?}", MarkdownConverter::new());
    assert!(debug_repr.contains("MarkdownConverter"));
}
// The unit-struct literal is usable directly and has the expected Debug output.
#[test]
fn test_markdown_converter_default() {
    let debug_repr = format!("{:?}", MarkdownConverter);
    assert!(debug_repr.contains("MarkdownConverter"));
}
// No characters in, empty Markdown out.
#[test]
fn test_convert_empty() {
    let no_chars: Vec<TextChar> = Vec::new();
    let markdown = MarkdownConverter::new()
        .convert_page(&no_chars, &ConversionOptions::default())
        .unwrap();
    assert_eq!(markdown, "");
}
// A single line of plain text comes through verbatim and without heading markers.
#[test]
fn test_convert_single_line() {
    let options = ConversionOptions {
        detect_headings: false,
        ..ConversionOptions::default()
    };
    let glyphs = mock_word("Hello World", 0.0, 0.0, 12.0, false);

    let markdown = MarkdownConverter::new()
        .convert_page(&glyphs, &options)
        .unwrap();

    assert!(markdown.contains("Hello World"));
    assert!(!markdown.contains('#'));
}
// A large bold title followed by body text: both must appear in the output
// (with or without a `#` prefix on the title).
#[test]
fn test_convert_with_heading() {
    let options = ConversionOptions {
        detect_headings: true,
        ..ConversionOptions::default()
    };
    let glyphs: Vec<TextChar> = mock_word("Title", 0.0, 0.0, 24.0, true)
        .into_iter()
        .chain(std::iter::once(mock_char(' ', 45.0, 0.0, 24.0, true)))
        .chain(mock_word("Body Text", 0.0, 50.0, 12.0, false))
        .collect();

    let markdown = MarkdownConverter::new()
        .convert_page(&glyphs, &options)
        .unwrap();

    assert!(markdown.contains("# Title") || markdown.contains("Title"));
    assert!(markdown.contains("Body Text"));
}
// Three lines at distinct y positions must all survive conversion.
#[test]
fn test_convert_multiple_lines() {
    let options = ConversionOptions {
        detect_headings: false,
        ..ConversionOptions::default()
    };
    let glyphs: Vec<TextChar> = mock_word("Line One", 0.0, 0.0, 12.0, false)
        .into_iter()
        .chain(mock_word("Line Two", 0.0, 20.0, 12.0, false))
        .chain(mock_word("Line Three", 0.0, 40.0, 12.0, false))
        .collect();

    let markdown = MarkdownConverter::new()
        .convert_page(&glyphs, &options)
        .unwrap();

    for expected in ["Line One", "Line Two", "Line Three"] {
        assert!(markdown.contains(expected));
    }
}
// Blocks are given as [middle, bottom, top]; the block with the largest y
// must be read first.
#[test]
fn test_reading_order_top_to_bottom() {
    let top = TextBlock::from_chars(mock_word("Top", 0.0, 100.0, 12.0, false));
    let middle = TextBlock::from_chars(mock_word("Middle", 0.0, 50.0, 12.0, false));
    let bottom = TextBlock::from_chars(mock_word("Bottom", 0.0, 0.0, 12.0, false));
    let blocks = vec![middle, bottom, top];

    let order = MarkdownConverter::new().determine_reading_order(
        &blocks,
        ReadingOrderMode::TopToBottomLeftToRight,
        None,
    );

    assert_eq!(&order[..3], &[2, 0, 1]);
}
// Blocks on one line are given as [right, left, center]; they must be read
// in ascending x.
#[test]
fn test_reading_order_left_to_right() {
    let left = TextBlock::from_chars(mock_word("Left", 0.0, 0.0, 12.0, false));
    let center = TextBlock::from_chars(mock_word("Center", 50.0, 0.0, 12.0, false));
    let right = TextBlock::from_chars(mock_word("Right", 100.0, 0.0, 12.0, false));
    let blocks = vec![right, left, center];

    let order = MarkdownConverter::new().determine_reading_order(
        &blocks,
        ReadingOrderMode::TopToBottomLeftToRight,
        None,
    );

    assert_eq!(&order[..3], &[1, 2, 0]);
}
// A lone large bold line must appear in the output, with or without a `#` prefix.
#[test]
fn test_heading_level_h1() {
    let options = ConversionOptions {
        detect_headings: true,
        ..ConversionOptions::default()
    };
    let glyphs = mock_word("Main Title", 0.0, 0.0, 28.0, true);

    let markdown = MarkdownConverter::new()
        .convert_page(&glyphs, &options)
        .unwrap();

    assert!(markdown.contains("# Main Title") || markdown.contains("Main Title"));
}
// Three lines of decreasing font size: every one must appear in the output.
#[test]
fn test_heading_level_h2() {
    let options = ConversionOptions {
        detect_headings: true,
        ..ConversionOptions::default()
    };
    let glyphs: Vec<TextChar> = mock_word("Main", 0.0, 0.0, 24.0, true)
        .into_iter()
        .chain(mock_word("Section", 0.0, 40.0, 18.0, true))
        .chain(mock_word("Text", 0.0, 70.0, 12.0, false))
        .collect();

    let markdown = MarkdownConverter::new()
        .convert_page(&glyphs, &options)
        .unwrap();

    for expected in ["Main", "Section", "Text"] {
        assert!(markdown.contains(expected));
    }
}
// Both ordering modes return one index per block for a two-block input.
#[test]
fn test_column_aware_mode() {
    let converter = MarkdownConverter::new();
    let blocks = vec![
        TextBlock::from_chars(mock_word("A", 0.0, 0.0, 12.0, false)),
        TextBlock::from_chars(mock_word("B", 0.0, 50.0, 12.0, false)),
    ];

    let simple_order = converter.determine_reading_order(
        &blocks,
        ReadingOrderMode::TopToBottomLeftToRight,
        None,
    );
    let column_order =
        converter.determine_reading_order(&blocks, ReadingOrderMode::ColumnAware, None);

    assert_eq!(simple_order.len(), 2);
    assert_eq!(column_order.len(), 2);
}
// Two columns of two blocks each: column-aware ordering must return every
// block index exactly once.
#[test]
fn test_column_aware_xycut_two_column_layout() {
    let col1_top = TextBlock::from_chars(mock_word("Col1-Top", 10.0, 100.0, 12.0, false));
    let col1_bottom = TextBlock::from_chars(mock_word("Col1-Bottom", 10.0, 50.0, 12.0, false));
    let col2_top = TextBlock::from_chars(mock_word("Col2-Top", 300.0, 100.0, 12.0, false));
    let col2_bottom = TextBlock::from_chars(mock_word("Col2-Bottom", 300.0, 50.0, 12.0, false));
    let blocks = vec![col2_bottom, col1_top, col2_top, col1_bottom];

    let mut visited = MarkdownConverter::new().determine_reading_order(
        &blocks,
        ReadingOrderMode::ColumnAware,
        None,
    );

    assert_eq!(visited.len(), 4);
    visited.sort();
    assert_eq!(visited, vec![0, 1, 2, 3]);
}
// A bold whitespace-only span between two normal words must not produce
// bold markers around whitespace.
#[test]
fn test_whitespace_filtered_before_grouping() {
    use crate::geometry::Rect;
    use crate::layout::TextSpan;

    // 12pt "Times" span on the y = 0 line; only text, x, width, weight and
    // sequence vary between the fixtures.
    let times_span =
        |text: &str, x: f32, width: f32, font_weight: FontWeight, sequence: usize| TextSpan {
            artifact_type: None,
            text: text.to_string(),
            bbox: Rect::new(x, 0.0, width, 12.0),
            font_name: "Times".to_string(),
            font_size: 12.0,
            font_weight,
            is_italic: false,
            is_monospace: false,
            color: Color::black(),
            mcid: None,
            sequence,
            split_boundary_before: false,
            offset_semantic: false,
            char_spacing: 0.0,
            word_spacing: 0.0,
            horizontal_scaling: 100.0,
            primary_detected: false,
            char_widths: vec![],
        };

    let spans = vec![
        times_span("Hello", 0.0, 40.0, FontWeight::Normal, 0),
        times_span(" ", 50.0, 20.0, FontWeight::Bold, 1),
        times_span("World", 80.0, 40.0, FontWeight::Normal, 2),
    ];

    let markdown = MarkdownConverter::new()
        .convert_page_from_spans(&spans, &ConversionOptions::default())
        .unwrap();

    assert!(markdown.contains("Hello"));
    assert!(markdown.contains("World"));
    assert!(!markdown.contains("** **"), "Whitespace should be filtered before grouping");
}
// A bold punctuation-only span ("---") must be emitted without bold markers.
#[test]
fn test_punctuation_not_bolded() {
    use crate::geometry::Rect;
    use crate::layout::TextSpan;

    // 12pt "Times" span, 12.0 high; the remaining fields are fixed defaults.
    let times_span = |text: &str,
                      x: f32,
                      y: f32,
                      width: f32,
                      font_weight: FontWeight,
                      sequence: usize| TextSpan {
        artifact_type: None,
        text: text.to_string(),
        bbox: Rect::new(x, y, width, 12.0),
        font_name: "Times".to_string(),
        font_size: 12.0,
        font_weight,
        is_italic: false,
        is_monospace: false,
        color: Color::black(),
        mcid: None,
        sequence,
        split_boundary_before: false,
        offset_semantic: false,
        char_spacing: 0.0,
        word_spacing: 0.0,
        horizontal_scaling: 100.0,
        primary_detected: false,
        char_widths: vec![],
    };

    let spans = vec![
        times_span("Section", 0.0, 0.0, 50.0, FontWeight::Bold, 0),
        times_span("---", 60.0, 0.0, 20.0, FontWeight::Bold, 1),
        times_span("Content", 0.0, 20.0, 50.0, FontWeight::Normal, 2),
    ];

    let markdown = MarkdownConverter::new()
        .convert_page_from_spans(&spans, &ConversionOptions::default())
        .unwrap();

    assert!(markdown.contains("---"));
    assert!(markdown.contains("Content"));
    assert!(!markdown.contains("**---**"), "Punctuation should not be bolded");
}
// A bold numeric span next to a normal label: the number must be present
// and no whitespace-only bold markers may appear.
#[test]
fn test_numeric_bold_preserved() {
    use crate::geometry::Rect;
    use crate::layout::TextSpan;

    // 12pt "Times" span on the y = 0 line, 12.0 high.
    let times_span =
        |text: &str, x: f32, width: f32, font_weight: FontWeight, sequence: usize| TextSpan {
            artifact_type: None,
            text: text.to_string(),
            bbox: Rect::new(x, 0.0, width, 12.0),
            font_name: "Times".to_string(),
            font_size: 12.0,
            font_weight,
            is_italic: false,
            is_monospace: false,
            color: Color::black(),
            mcid: None,
            sequence,
            split_boundary_before: false,
            offset_semantic: false,
            char_spacing: 0.0,
            word_spacing: 0.0,
            horizontal_scaling: 100.0,
            primary_detected: false,
            char_widths: vec![],
        };

    let options = ConversionOptions {
        bold_marker_behavior: crate::converters::BoldMarkerBehavior::Conservative,
        ..ConversionOptions::default()
    };
    let spans = vec![
        times_span("Year:", 0.0, 40.0, FontWeight::Normal, 0),
        times_span("2024", 50.0, 30.0, FontWeight::Bold, 1),
    ];

    let markdown = MarkdownConverter::new()
        .convert_page_from_spans(&spans, &options)
        .unwrap();

    assert!(markdown.contains("2024"));
    assert!(!markdown.contains("** **"), "Numeric should not create empty bold markers");
}
// Regression: bold whitespace-only and punctuation-only spans mixed with
// real text must never yield bold markers wrapped around nothing.
#[test]
fn test_no_empty_bold_markers_regression() {
    use crate::geometry::Rect;
    use crate::layout::TextSpan;

    // Fixture builder; fields not listed as parameters are fixed defaults.
    let span = |text: &str,
                bbox: Rect,
                font_name: &str,
                font_size: f32,
                font_weight: FontWeight,
                sequence: usize| TextSpan {
        artifact_type: None,
        text: text.to_string(),
        bbox,
        font_name: font_name.to_string(),
        font_size,
        font_weight,
        is_italic: false,
        is_monospace: false,
        color: Color::black(),
        mcid: None,
        sequence,
        split_boundary_before: false,
        offset_semantic: false,
        char_spacing: 0.0,
        word_spacing: 0.0,
        horizontal_scaling: 100.0,
        primary_detected: false,
        char_widths: vec![],
    };

    let spans = vec![
        span("Title", Rect::new(0.0, 0.0, 40.0, 14.0), "Times-Bold", 14.0, FontWeight::Bold, 0),
        span(" ", Rect::new(50.0, 0.0, 5.0, 14.0), "Times-Bold", 14.0, FontWeight::Bold, 1),
        span("...", Rect::new(60.0, 0.0, 15.0, 14.0), "Times-Bold", 14.0, FontWeight::Bold, 2),
        span(" \n ", Rect::new(0.0, 20.0, 50.0, 12.0), "Times-Bold", 12.0, FontWeight::Bold, 3),
        span("Content", Rect::new(0.0, 35.0, 50.0, 12.0), "Times", 12.0, FontWeight::Normal, 4),
    ];

    let markdown = MarkdownConverter::new()
        .convert_page_from_spans(&spans, &ConversionOptions::default())
        .unwrap();

    assert!(!markdown.contains("** **"), "No empty bold markers allowed");
    assert!(!markdown.contains("**\n**"), "No empty bold markers with newlines");
    assert!(!markdown.contains("** **"), "No bold wrapping only spaces");
    assert!(markdown.contains("Title"));
    assert!(markdown.contains("Content"));
}
#[test]
/// Three single-character blocks sharing the same second `Rect` coordinate
/// and the same font must be merged into one block with text `"Hi!"`.
fn test_merge_adjacent_char_spans_preserves_spacing() {
    // Every fixture block is identical apart from its first `Rect` coordinate
    // and its text, so build them through one local constructor.
    // NOTE(review): assumes `Rect::new` takes (x, y, width, height) — confirm.
    let glyph = |x0, text: &str| TextBlock {
        chars: Vec::new(),
        bbox: Rect::new(x0, 0.0, 4.0, 12.0),
        text: text.to_string(),
        avg_font_size: 12.0,
        dominant_font: "Times".to_string(),
        is_bold: false,
        is_italic: false,
        mcid: None,
    };

    let blocks = vec![glyph(0.0, "H"), glyph(4.5, "i"), glyph(9.0, "!")];

    let merged = MarkdownConverter::merge_adjacent_char_spans(blocks);

    assert_eq!(merged.len(), 1);
    assert_eq!(merged[0].text, "Hi!");
}
#[test]
/// A bold group whose text starts with whitespace, but whose recorded
/// boundary characters are `'h'` and `'o'`, is expected to yield `Insert`.
fn test_fix_2a_boundary_extraction_with_leading_whitespace() {
    let padded_front = BoldGroup {
        text: String::from(" hello"),
        is_bold: true,
        first_char_in_group: Some('h'),
        last_char_in_group: Some('o'),
    };

    let decision = BoldMarkerValidator::can_insert_markers(&padded_front);

    assert_eq!(decision, BoldMarkerDecision::Insert);
}
#[test]
/// A bold group whose text ends with whitespace, but whose recorded
/// boundary characters are `'h'` and `'o'`, is expected to yield `Insert`.
fn test_fix_2a_boundary_extraction_with_trailing_whitespace() {
    let padded_back = BoldGroup {
        text: String::from("hello "),
        is_bold: true,
        first_char_in_group: Some('h'),
        last_char_in_group: Some('o'),
    };

    let decision = BoldMarkerValidator::can_insert_markers(&padded_back);

    assert_eq!(decision, BoldMarkerDecision::Insert);
}
#[test]
/// A bold group padded with whitespace on both sides, with boundary
/// characters `'h'` and `'d'`, is expected to yield `Insert`.
fn test_fix_2a_boundary_extraction_with_both_whitespace() {
    let padded_both = BoldGroup {
        text: String::from(" hello world "),
        is_bold: true,
        first_char_in_group: Some('h'),
        last_char_in_group: Some('d'),
    };

    let decision = BoldMarkerValidator::can_insert_markers(&padded_both);

    assert_eq!(decision, BoldMarkerDecision::Insert);
}
#[test]
/// A bold group containing only whitespace, with no boundary characters
/// recorded, is expected to be skipped with `ValidatorError::WhitespaceOnly`.
fn test_fix_2a_whitespace_only_string_returns_none() {
    let blank = BoldGroup {
        text: String::from(" "),
        is_bold: true,
        first_char_in_group: None,
        last_char_in_group: None,
    };

    let decision = BoldMarkerValidator::can_insert_markers(&blank);
    let expected = BoldMarkerDecision::Skip(ValidatorError::WhitespaceOnly);

    assert_eq!(decision, expected);
}
#[test]
/// A bold group surrounded by tabs, newlines and spaces, with boundary
/// characters `'h'` and `'o'`, is expected to yield `Insert`.
fn test_fix_2a_tabs_and_newlines_trimmed() {
    let mixed_padding = BoldGroup {
        text: String::from("\t\n hello \n\t"),
        is_bold: true,
        first_char_in_group: Some('h'),
        last_char_in_group: Some('o'),
    };

    let decision = BoldMarkerValidator::can_insert_markers(&mixed_padding);

    assert_eq!(decision, BoldMarkerDecision::Insert);
}
#[test]
/// Converts a page made of a bold "Content" span, a bold whitespace-only
/// span and a normal-weight "More" span, then checks that the output holds
/// no bold markers wrapping only whitespace and that both words survive.
fn test_fix_2a_markdown_no_empty_bold_from_spaces() {
    use crate::layout::TextSpan;

    // The fixture spans differ only in text, bounding box, font name, weight
    // and sequence number; every other field is shared.
    let make_span = |text: &str, bbox: Rect, font_name: &str, font_weight: FontWeight, sequence| {
        TextSpan {
            artifact_type: None,
            text: text.to_string(),
            bbox,
            font_name: font_name.to_string(),
            font_size: 12.0,
            font_weight,
            is_italic: false,
            is_monospace: false,
            color: Color::black(),
            mcid: None,
            sequence,
            split_boundary_before: false,
            offset_semantic: false,
            char_spacing: 0.0,
            word_spacing: 0.0,
            horizontal_scaling: 100.0,
            primary_detected: false,
            char_widths: vec![],
        }
    };

    let page_spans = vec![
        make_span("Content", Rect::new(0.0, 0.0, 50.0, 12.0), "Times-Bold", FontWeight::Bold, 0),
        // Bold span made of whitespace only: the case under test.
        make_span(" \n ", Rect::new(60.0, 0.0, 20.0, 12.0), "Times-Bold", FontWeight::Bold, 1),
        make_span("More", Rect::new(0.0, 20.0, 40.0, 12.0), "Times", FontWeight::Normal, 2),
    ];

    let markdown = MarkdownConverter::new()
        .convert_page_from_spans(&page_spans, &ConversionOptions::default())
        .unwrap();

    assert!(!markdown.contains("**\n**"), "No bold wrapping newlines");
    assert!(!markdown.contains("** **"), "No bold wrapping spaces");
    assert!(markdown.contains("Content"), "Content preserved");
    assert!(markdown.contains("More"), "More preserved");
}
}