use crate::pdf::error::Result;
use crate::pdf::hierarchy::{
BoundingBox, CharData, TextBlock, assign_heading_levels_smart, cluster_font_sizes, extract_chars_with_fonts,
};
use pdfium_render::prelude::*;
/// Two glyphs or words are treated as sharing a line when their baselines
/// differ by less than this fraction of the smaller font size involved.
const BASELINE_Y_TOLERANCE_FRACTION: f32 = 0.5;
/// A horizontal gap between adjacent glyphs that exceeds this fraction of
/// their average font size starts a new word.
const WORD_GAP_FRACTION: f32 = 0.3;
/// A vertical gap larger than the base line spacing times this factor always
/// starts a new paragraph.
const PARAGRAPH_GAP_MULTIPLIER: f32 = 1.5;
/// A difference in dominant font size between consecutive lines must exceed
/// this value to count as a font change when deciding paragraph breaks.
const FONT_SIZE_CHANGE_THRESHOLD: f32 = 1.5;
/// A difference between the left edges of consecutive lines must exceed this
/// value to count as an indent change when deciding paragraph breaks.
const LEFT_INDENT_CHANGE_THRESHOLD: f32 = 10.0;
/// Paragraphs with more words than this are never promoted to headings.
const MAX_HEADING_WORD_COUNT: usize = 12;
/// A run of empty histogram bins must be at least this many average glyph
/// widths wide to be considered a column gutter.
const MIN_GUTTER_WIDTH_MULTIPLIER: f32 = 2.0;
/// The baselines on at least one side of a gutter must span this fraction of
/// the page height for the gutter to be accepted.
const MIN_GUTTER_HEIGHT_FRACTION: f32 = 0.6;
/// Width of one bin of the horizontal occupancy histogram used for column
/// detection, in the same units as the glyph x coordinates.
const COLUMN_HISTOGRAM_BIN_WIDTH: f32 = 5.0;
/// Paragraphs with more lines than this are never flagged as list items.
const MAX_LIST_ITEM_LINES: usize = 5;
/// A font size farther from its nearest cluster centroid than this multiple
/// of the average gap between centroids receives no heading level.
const MAX_HEADING_DISTANCE_MULTIPLIER: f32 = 2.0;
/// Horizontal extent of one detected text column on a page.
#[derive(Debug, Clone)]
struct ColumnRegion {
/// Left edge of the column; glyph assignment treats it as inclusive.
x_min: f32,
/// Right edge of the column; glyph assignment treats it as exclusive.
x_max: f32,
}
/// A run of glyphs that was grouped into a single word.
#[derive(Debug, Clone)]
struct PdfWord {
/// Concatenated text of the glyphs in the word.
text: String,
/// Smallest glyph x coordinate in the word.
x_start: f32,
// Populated when the word is built, but not read by the code visible here.
#[allow(dead_code)]
/// Largest `x + width` over the glyphs in the word.
x_end: f32,
/// Mean baseline of the glyphs in the word.
baseline_y: f32,
/// Mean font size of the glyphs in the word.
font_size: f32,
/// Whether the word is rendered with bold emphasis.
is_bold: bool,
/// Whether the word is rendered with italic emphasis.
is_italic: bool,
}
/// Words that share a baseline, ordered left to right.
#[derive(Debug, Clone)]
struct PdfLine {
/// Words of the line, sorted by `x_start`.
words: Vec<PdfWord>,
/// Mean baseline of the words in the line.
baseline_y: f32,
// Populated when the line is built, but not read by the code visible here.
#[allow(dead_code)]
/// Minimum of `baseline_y - font_size` over the words in the line.
y_top: f32,
// Populated when the line is built, but not read by the code visible here.
#[allow(dead_code)]
/// Maximum word baseline in the line.
y_bottom: f32,
/// Most common word font size in the line, bucketed to half units.
dominant_font_size: f32,
/// True when at least half of the words (rounded up) are bold.
is_bold: bool,
/// True when at least half of the words (rounded up) are italic.
is_italic: bool,
}
/// Consecutive lines grouped into one paragraph, plus its classification.
#[derive(Debug, Clone)]
struct PdfParagraph {
/// Lines of the paragraph in reading order.
lines: Vec<PdfLine>,
/// Most common line font size in the paragraph, bucketed to half units.
dominant_font_size: f32,
/// Markdown heading level, or `None` for non-heading paragraphs.
heading_level: Option<u8>,
// Populated when the paragraph is built, but not read by the code visible here.
#[allow(dead_code)]
/// True when at least half of the lines (rounded up) are bold.
is_bold: bool,
// Populated when the paragraph is built, but not read by the code visible here.
#[allow(dead_code)]
/// True when at least half of the lines (rounded up) are italic.
is_italic: bool,
/// True when the paragraph is short and its first word looks like a list marker.
is_list_item: bool,
}
/// Detects text columns on a page from the horizontal distribution of glyphs.
///
/// The page width is cut into bins of `COLUMN_HISTOGRAM_BIN_WIDTH`, and every
/// non-whitespace glyph marks the bins it overlaps. A run of empty bins that is
/// followed by an occupied bin becomes a gutter when it is at least
/// `MIN_GUTTER_WIDTH_MULTIPLIER` average glyph widths wide and the baselines on
/// its left or right side span at least `MIN_GUTTER_HEIGHT_FRACTION` of
/// `page_height`. The stretches between gutters that contain the start of at
/// least one visible glyph are returned as columns, left to right.
///
/// A single region covering `0.0..page_width` is returned when `chars` is
/// empty, a page dimension is not positive, no gutter qualifies, or no
/// candidate column contains a glyph.
fn detect_columns(chars: &[CharData], page_width: f32, page_height: f32) -> Vec<ColumnRegion> {
    // Fallback result: the whole page as one column.
    let whole_page = || {
        vec![ColumnRegion {
            x_min: 0.0,
            x_max: page_width,
        }]
    };
    if chars.is_empty() || page_width <= 0.0 || page_height <= 0.0 {
        return whole_page();
    }

    let is_visible = |c: &CharData| !c.text.trim().is_empty();

    let glyph_widths: Vec<f32> = chars.iter().filter(|c| is_visible(*c)).map(|c| c.width).collect();
    let avg_char_width = glyph_widths.iter().sum::<f32>() / glyph_widths.len().max(1) as f32;
    let min_gutter_width = avg_char_width * MIN_GUTTER_WIDTH_MULTIPLIER;

    // Per-bin occupancy count and baseline extent.
    let num_bins = ((page_width / COLUMN_HISTOGRAM_BIN_WIDTH).ceil() as usize).max(1);
    let mut bin_y_min = vec![f32::INFINITY; num_bins];
    let mut bin_y_max = vec![f32::NEG_INFINITY; num_bins];
    let mut bin_count = vec![0u32; num_bins];
    for glyph in chars.iter().filter(|c| is_visible(*c)) {
        let first_bin = ((glyph.x / COLUMN_HISTOGRAM_BIN_WIDTH).floor() as usize).min(num_bins - 1);
        let end_bin = (((glyph.x + glyph.width) / COLUMN_HISTOGRAM_BIN_WIDTH).ceil() as usize).min(num_bins);
        for bin in first_bin..end_bin {
            bin_y_min[bin] = bin_y_min[bin].min(glyph.baseline_y);
            bin_y_max[bin] = bin_y_max[bin].max(glyph.baseline_y);
            bin_count[bin] += 1;
        }
    }

    // Baseline extent covered by a group of bins; 0.0 when the group is empty.
    let vertical_span = |lows: &[f32], highs: &[f32]| -> f32 {
        let lowest = lows.iter().copied().fold(f32::INFINITY, f32::min);
        let highest = highs.iter().copied().fold(f32::NEG_INFINITY, f32::max);
        if highest > lowest {
            (highest - lowest).abs()
        } else {
            0.0
        }
    };

    let mut gutters: Vec<(f32, f32)> = Vec::new();
    let mut empty_run_start: Option<usize> = None;
    for (bin, &count) in bin_count.iter().enumerate() {
        if count == 0 {
            // Remember where the current run of empty bins began.
            empty_run_start = empty_run_start.or(Some(bin));
            continue;
        }
        // An occupied bin closes the pending empty run, if there is one.
        let Some(start) = empty_run_start.take() else {
            continue;
        };
        let x_start = start as f32 * COLUMN_HISTOGRAM_BIN_WIDTH;
        let x_end = bin as f32 * COLUMN_HISTOGRAM_BIN_WIDTH;
        if x_end - x_start < min_gutter_width {
            continue;
        }
        let left_span = vertical_span(&bin_y_min[..start], &bin_y_max[..start]);
        let right_span = vertical_span(&bin_y_min[bin..], &bin_y_max[bin..]);
        if left_span.max(right_span) >= page_height * MIN_GUTTER_HEIGHT_FRACTION {
            gutters.push((x_start, x_end));
        }
    }
    if gutters.is_empty() {
        return whole_page();
    }

    // Columns are the stretches between consecutive gutters.
    let mut columns: Vec<ColumnRegion> = Vec::new();
    let mut cursor = 0.0_f32;
    for &(gutter_left, gutter_right) in &gutters {
        if gutter_left > cursor {
            columns.push(ColumnRegion {
                x_min: cursor,
                x_max: gutter_left,
            });
        }
        cursor = gutter_right;
    }
    if cursor < page_width {
        columns.push(ColumnRegion {
            x_min: cursor,
            x_max: page_width,
        });
    }

    // Discard candidate columns in which no visible glyph starts.
    columns.retain(|col| {
        chars
            .iter()
            .any(|c| is_visible(c) && c.x >= col.x_min && c.x < col.x_max)
    });

    if columns.is_empty() {
        whole_page()
    } else {
        columns
    }
}
/// Distributes the non-whitespace glyphs of a page over the given columns.
///
/// A glyph goes to the first column whose `[x_min, x_max)` range contains the
/// glyph's horizontal center. Glyphs whose center lies in no column (for
/// example inside a gutter) go to the column whose midpoint is closest.
///
/// Returns one vector per column, in the same order as `columns`, with glyphs
/// in their original order. When `columns` is empty the result is empty.
fn split_chars_by_columns<'a>(chars: &'a [CharData], columns: &[ColumnRegion]) -> Vec<Vec<&'a CharData>> {
    // With no columns there is no bucket to index into; returning early avoids
    // an out-of-bounds panic in the fallback assignment below.
    if columns.is_empty() {
        return Vec::new();
    }

    let mut column_chars: Vec<Vec<&CharData>> = vec![Vec::new(); columns.len()];
    for ch in chars.iter().filter(|c| !c.text.trim().is_empty()) {
        let center_x = ch.x + ch.width / 2.0;
        let target = columns
            .iter()
            .position(|col| center_x >= col.x_min && center_x < col.x_max)
            .or_else(|| {
                // No column contains the center: fall back to the nearest midpoint.
                columns
                    .iter()
                    .enumerate()
                    .min_by(|(_, a), (_, b)| {
                        let da = (center_x - (a.x_min + a.x_max) / 2.0).abs();
                        let db = (center_x - (b.x_min + b.x_max) / 2.0).abs();
                        da.partial_cmp(&db).unwrap_or(std::cmp::Ordering::Equal)
                    })
                    .map(|(i, _)| i)
            })
            .unwrap_or(0);
        column_chars[target].push(ch);
    }
    column_chars
}
/// Renders `document` as Markdown without any table information.
///
/// Delegates to [`render_document_as_markdown_with_tables`] with an empty
/// table slice; `k_clusters` is forwarded unchanged.
///
/// # Errors
///
/// Propagates any error returned by the delegate.
pub fn render_document_as_markdown(document: &PdfDocument, k_clusters: usize) -> Result<String> {
render_document_as_markdown_with_tables(document, k_clusters, &[])
}
pub fn render_document_as_markdown_with_tables(
document: &PdfDocument,
k_clusters: usize,
tables: &[crate::types::Table],
) -> Result<String> {
let pages = document.pages();
let page_count = pages.len();
let mut all_page_chars: Vec<Vec<CharData>> = Vec::with_capacity(page_count as usize);
let mut page_dimensions: Vec<(f32, f32)> = Vec::with_capacity(page_count as usize);
for i in 0..page_count {
let page = pages.get(i).map_err(|e| {
crate::pdf::error::PdfError::TextExtractionFailed(format!("Failed to get page {}: {:?}", i, e))
})?;
let mut chars = extract_chars_with_fonts(&page)?;
let (page_w, page_h) = (page.width().value, page.height().value);
page_dimensions.push((page_w, page_h));
let page_tables: Vec<&crate::types::Table> =
tables.iter().filter(|t| t.page_number == (i as usize) + 1).collect();
if !page_tables.is_empty() {
chars.retain(|ch| {
!page_tables.iter().any(|t| {
if let Some(ref bbox) = t.bounding_box {
let char_center_x = ch.x + ch.width / 2.0;
char_center_x >= bbox.x0 as f32
&& char_center_x <= bbox.x1 as f32
&& ch.baseline_y >= bbox.y0 as f32
&& ch.baseline_y <= bbox.y1 as f32
} else {
false
}
})
});
}
all_page_chars.push(chars);
}
let mut all_blocks: Vec<TextBlock> = Vec::new();
let empty_bbox = BoundingBox {
left: 0.0,
top: 0.0,
right: 0.0,
bottom: 0.0,
};
for page_chars in &all_page_chars {
for ch in page_chars {
if ch.text.trim().is_empty() || ch.text.chars().any(|c| c.is_control()) {
continue; }
all_blocks.push(TextBlock {
text: String::new(),
bbox: empty_bbox,
font_size: ch.font_size,
});
}
}
let heading_map = if all_blocks.is_empty() {
Vec::new()
} else {
let clusters = cluster_font_sizes(&all_blocks, k_clusters)?;
assign_heading_levels_smart(&clusters)
};
let mut all_page_paragraphs: Vec<Vec<PdfParagraph>> = Vec::new();
for (page_idx, page_chars) in all_page_chars.iter().enumerate() {
let (page_w, page_h) = page_dimensions[page_idx];
let columns = detect_columns(page_chars, page_w, page_h);
let mut page_paragraphs: Vec<PdfParagraph> = Vec::new();
if columns.len() <= 1 {
let words = chars_to_words(page_chars);
let lines = words_to_lines(words);
let mut paragraphs = lines_to_paragraphs(lines);
classify_paragraphs(&mut paragraphs, &heading_map);
page_paragraphs = paragraphs;
} else {
let column_char_groups = split_chars_by_columns(page_chars, &columns);
for col_chars in &column_char_groups {
if col_chars.is_empty() {
continue;
}
let owned: Vec<CharData> = col_chars.iter().map(|c| (*c).clone()).collect();
let words = chars_to_words(&owned);
let lines = words_to_lines(words);
let mut paragraphs = lines_to_paragraphs(lines);
classify_paragraphs(&mut paragraphs, &heading_map);
page_paragraphs.extend(paragraphs);
}
}
all_page_paragraphs.push(page_paragraphs);
}
Ok(assemble_markdown_with_tables(all_page_paragraphs, tables))
}
/// Returns `true` when `c` belongs to one of the CJK code point ranges this
/// module treats as unspaced scripts.
fn is_cjk_char(c: char) -> bool {
    matches!(
        c,
        '\u{4E00}'..='\u{9FFF}'       // CJK Unified Ideographs
        | '\u{3040}'..='\u{309F}'     // Hiragana
        | '\u{30A0}'..='\u{30FF}'     // Katakana
        | '\u{AC00}'..='\u{D7AF}'     // Hangul Syllables
        | '\u{3400}'..='\u{4DBF}'     // CJK Unified Ideographs Extension A
        | '\u{F900}'..='\u{FAFF}'     // CJK Compatibility Ideographs
        | '\u{20000}'..='\u{2A6DF}'   // Extension B
        | '\u{2A700}'..='\u{2B73F}'   // Extension C
        | '\u{2B740}'..='\u{2B81F}'   // Extension D
        | '\u{2B820}'..='\u{2CEAF}'   // Extension E
        | '\u{2CEB0}'..='\u{2EBEF}'   // Extension F
        | '\u{30000}'..='\u{3134F}'   // Extension G
        | '\u{31350}'..='\u{323AF}'   // Extension H
        | '\u{2F800}'..='\u{2FA1F}'   // CJK Compatibility Ideographs Supplement
    )
}
fn needs_space_between(prev: &str, next: &str) -> bool {
let prev_ends_cjk = prev.chars().last().is_some_and(is_cjk_char);
let next_starts_cjk = next.chars().next().is_some_and(is_cjk_char);
!(prev_ends_cjk && next_starts_cjk)
}
/// Groups glyphs into words.
///
/// Glyphs whose text contains a control character are ignored. The rest are
/// ordered by descending baseline and then by ascending x, and walked once:
/// - a whitespace glyph ends the current word and is itself discarded;
/// - a glyph on a different line than the previous one (baseline difference
///   of at least `BASELINE_Y_TOLERANCE_FRACTION` of the smaller font size)
///   starts a new word;
/// - on the same line, a CJK glyph on either side starts a new word, and
///   otherwise a gap wider than `WORD_GAP_FRACTION` of the average font size does.
///
/// Font sizes used in these thresholds are floored at 1.0.
fn chars_to_words(chars: &[CharData]) -> Vec<PdfWord> {
    let mut ordered: Vec<&CharData> = chars
        .iter()
        .filter(|c| !c.text.chars().any(char::is_control))
        .collect();
    if ordered.is_empty() {
        return Vec::new();
    }
    ordered.sort_by(|a, b| {
        b.baseline_y
            .partial_cmp(&a.baseline_y)
            .unwrap_or(std::cmp::Ordering::Equal)
            .then_with(|| a.x.partial_cmp(&b.x).unwrap_or(std::cmp::Ordering::Equal))
    });

    let mut words: Vec<PdfWord> = Vec::new();
    // Glyphs of the word currently being assembled.
    let mut pending: Vec<&CharData> = Vec::new();

    for &glyph in &ordered {
        if glyph.text.trim().is_empty() {
            // Whitespace terminates the word in progress.
            if !pending.is_empty() {
                words.push(finalize_word(&pending));
                pending.clear();
            }
            continue;
        }

        let starts_new_word = match pending.last() {
            None => false,
            Some(prev) => {
                let min_fs = prev.font_size.min(glyph.font_size).max(1.0);
                let same_line =
                    (prev.baseline_y - glyph.baseline_y).abs() < BASELINE_Y_TOLERANCE_FRACTION * min_fs;
                if !same_line {
                    true
                } else if prev.text.chars().any(is_cjk_char) || glyph.text.chars().any(is_cjk_char) {
                    // CJK glyphs always stand as their own word.
                    true
                } else {
                    let gap = glyph.x - (prev.x + prev.width);
                    let avg_fs = ((prev.font_size + glyph.font_size) / 2.0).max(1.0);
                    gap > WORD_GAP_FRACTION * avg_fs
                }
            }
        };

        if starts_new_word {
            words.push(finalize_word(&pending));
            pending.clear();
        }
        pending.push(glyph);
    }

    if !pending.is_empty() {
        words.push(finalize_word(&pending));
    }
    words
}
/// Collapses the glyphs of one word into a [`PdfWord`].
///
/// The text is the concatenation of the glyph texts, the horizontal extent is
/// the union of the glyph extents, and baseline and font size are arithmetic
/// means. A style flag is set only when strictly more than half of the glyphs
/// (integer division) carry it.
fn finalize_word(chars: &[&CharData]) -> PdfWord {
    let glyph_count = chars.len();

    let mut text = String::new();
    let mut x_start = f32::INFINITY;
    let mut x_end = f32::NEG_INFINITY;
    let mut bold_votes = 0usize;
    let mut italic_votes = 0usize;
    for glyph in chars {
        text.push_str(&glyph.text);
        x_start = x_start.min(glyph.x);
        x_end = x_end.max(glyph.x + glyph.width);
        if glyph.is_bold {
            bold_votes += 1;
        }
        if glyph.is_italic {
            italic_votes += 1;
        }
    }

    let baseline_y = chars.iter().map(|c| c.baseline_y).sum::<f32>() / glyph_count as f32;
    let font_size = chars.iter().map(|c| c.font_size).sum::<f32>() / glyph_count as f32;
    let half = glyph_count / 2;

    PdfWord {
        text,
        x_start,
        x_end,
        baseline_y,
        font_size,
        is_bold: bold_votes > half,
        is_italic: italic_votes > half,
    }
}
/// Groups words into lines.
///
/// Words are ordered by descending baseline and then by ascending `x_start`.
/// A word joins the line in progress when its baseline is within
/// `BASELINE_Y_TOLERANCE_FRACTION` of the smallest font size involved (floored
/// at 1.0) from the mean baseline of that line; otherwise it opens a new line.
fn words_to_lines(words: Vec<PdfWord>) -> Vec<PdfLine> {
    let mut ordered = words;
    ordered.sort_by(|a, b| {
        b.baseline_y
            .partial_cmp(&a.baseline_y)
            .unwrap_or(std::cmp::Ordering::Equal)
            .then_with(|| a.x_start.partial_cmp(&b.x_start).unwrap_or(std::cmp::Ordering::Equal))
    });

    let mut remaining = ordered.into_iter();
    let Some(first) = remaining.next() else {
        return Vec::new();
    };

    let mut lines: Vec<PdfLine> = Vec::new();
    let mut pending: Vec<PdfWord> = vec![first];
    for word in remaining {
        let mean_baseline = pending.iter().map(|w| w.baseline_y).sum::<f32>() / pending.len() as f32;
        let smallest_font = pending
            .iter()
            .map(|w| w.font_size)
            .fold(f32::INFINITY, f32::min)
            .min(word.font_size)
            .max(1.0);
        let on_same_line = (word.baseline_y - mean_baseline).abs() < BASELINE_Y_TOLERANCE_FRACTION * smallest_font;
        if !on_same_line {
            // Close the line in progress before starting the next one.
            lines.push(finalize_line(std::mem::take(&mut pending)));
        }
        pending.push(word);
    }
    // `pending` always holds at least one word at this point.
    lines.push(finalize_line(pending));
    lines
}
/// Builds a [`PdfLine`] from the words that were grouped onto one line.
///
/// The words are sorted left to right, the baseline is their mean baseline,
/// and a style flag is set when at least half of the words (rounded up)
/// carry it.
fn finalize_line(mut words: Vec<PdfWord>) -> PdfLine {
    words.sort_by(|a, b| a.x_start.partial_cmp(&b.x_start).unwrap_or(std::cmp::Ordering::Equal));

    let word_count = words.len();
    let baseline_y = words.iter().map(|w| w.baseline_y).sum::<f32>() / word_count as f32;

    let mut y_top = f32::INFINITY;
    let mut y_bottom = f32::NEG_INFINITY;
    let mut bold_votes = 0usize;
    let mut italic_votes = 0usize;
    for word in &words {
        y_top = y_top.min(word.baseline_y - word.font_size);
        y_bottom = y_bottom.max(word.baseline_y);
        if word.is_bold {
            bold_votes += 1;
        }
        if word.is_italic {
            italic_votes += 1;
        }
    }

    let dominant_font_size = dominant_font_size_of_words(&words);
    let needed_votes = word_count.div_ceil(2);

    PdfLine {
        baseline_y,
        y_top,
        y_bottom,
        dominant_font_size,
        is_bold: bold_votes >= needed_votes,
        is_italic: italic_votes >= needed_votes,
        words,
    }
}
/// Returns the most frequent font size among `words`, bucketed to half units.
///
/// Each size is rounded to the nearest 0.5 before counting. When several
/// buckets tie, the one encountered first wins. Returns `0.0` for an empty slice.
fn dominant_font_size_of_words(words: &[PdfWord]) -> f32 {
    // (bucket, votes) pairs in first-seen order.
    let mut tallies: Vec<(i32, usize)> = Vec::new();
    for word in words {
        let bucket = (word.font_size * 2.0).round() as i32;
        match tallies.iter_mut().find(|(key, _)| *key == bucket) {
            Some(entry) => entry.1 += 1,
            None => tallies.push((bucket, 1)),
        }
    }

    // A strictly-greater comparison keeps the earliest bucket on ties.
    let mut winner: Option<(i32, usize)> = None;
    for &(bucket, votes) in &tallies {
        let beats_current = match winner {
            Some((_, best_votes)) => votes > best_votes,
            None => true,
        };
        if beats_current {
            winner = Some((bucket, votes));
        }
    }

    match winner {
        Some((bucket, _)) => bucket as f32 / 2.0,
        None => 0.0,
    }
}
/// Groups consecutive lines into paragraphs.
///
/// The base line spacing is the smallest baseline distance between neighbouring
/// lines that exceeds 0.4 of the average dominant font size (falling back to
/// that average when no distance qualifies). A new paragraph starts when the
/// distance to the previous line exceeds the base spacing times
/// `PARAGRAPH_GAP_MULTIPLIER`, or when it exceeds 0.8 of the base spacing and
/// the font size or left indent changes by more than its threshold.
fn lines_to_paragraphs(lines: Vec<PdfLine>) -> Vec<PdfParagraph> {
    match lines.len() {
        0 => return Vec::new(),
        1 => return vec![finalize_paragraph(lines)],
        _ => {}
    }

    let avg_font_size = lines.iter().map(|l| l.dominant_font_size).sum::<f32>() / lines.len() as f32;
    let base_spacing = lines
        .windows(2)
        .map(|pair| (pair[1].baseline_y - pair[0].baseline_y).abs())
        .filter(|gap| *gap > avg_font_size * 0.4)
        .reduce(f32::min)
        .unwrap_or(avg_font_size);
    let paragraph_gap_threshold = base_spacing * PARAGRAPH_GAP_MULTIPLIER;

    let left_edge = |line: &PdfLine| line.words.first().map(|w| w.x_start).unwrap_or(0.0);

    let mut paragraphs: Vec<PdfParagraph> = Vec::new();
    let mut pending: Vec<PdfLine> = Vec::new();
    for line in lines {
        let is_paragraph_break = match pending.last() {
            // The very first line simply opens the first paragraph.
            None => false,
            Some(prev) => {
                let vertical_gap = (line.baseline_y - prev.baseline_y).abs();
                let font_size_change = (line.dominant_font_size - prev.dominant_font_size).abs();
                let indent_change = (left_edge(&line) - left_edge(prev)).abs();

                let has_significant_gap = vertical_gap > paragraph_gap_threshold;
                let has_some_gap = vertical_gap > base_spacing * 0.8;
                let has_font_change = font_size_change > FONT_SIZE_CHANGE_THRESHOLD;
                let has_indent_change = indent_change > LEFT_INDENT_CHANGE_THRESHOLD;
                has_significant_gap || (has_some_gap && (has_font_change || has_indent_change))
            }
        };
        if is_paragraph_break {
            paragraphs.push(finalize_paragraph(std::mem::take(&mut pending)));
        }
        pending.push(line);
    }
    // At least two lines were consumed, so `pending` is never empty here.
    paragraphs.push(finalize_paragraph(pending));
    paragraphs
}
/// Builds a [`PdfParagraph`] from the lines that were grouped together.
///
/// The dominant font size is the most frequent line font size bucketed to
/// half units (first-seen bucket wins ties, `0.0` for no lines). A style flag
/// is set when at least half of the lines (rounded up) carry it. The paragraph
/// is flagged as a list item when it has at most `MAX_LIST_ITEM_LINES` lines
/// and its first word is a list marker. The heading level starts as `None`.
fn finalize_paragraph(lines: Vec<PdfLine>) -> PdfParagraph {
    // (bucket, votes) pairs in first-seen order, plus style vote counts.
    let mut tallies: Vec<(i32, usize)> = Vec::new();
    let mut bold_lines = 0usize;
    let mut italic_lines = 0usize;
    for line in &lines {
        let bucket = (line.dominant_font_size * 2.0).round() as i32;
        match tallies.iter_mut().find(|(key, _)| *key == bucket) {
            Some(entry) => entry.1 += 1,
            None => tallies.push((bucket, 1)),
        }
        if line.is_bold {
            bold_lines += 1;
        }
        if line.is_italic {
            italic_lines += 1;
        }
    }

    // Keep the current best unless a later bucket has strictly more votes.
    let dominant_font_size = tallies
        .iter()
        .fold(None::<(i32, usize)>, |best, &(bucket, votes)| match best {
            Some((_, top_votes)) if top_votes >= votes => best,
            _ => Some((bucket, votes)),
        })
        .map_or(0.0, |(bucket, _)| bucket as f32 / 2.0);

    let needed_votes = lines.len().div_ceil(2);
    let starts_with_marker = lines
        .first()
        .and_then(|l| l.words.first())
        .is_some_and(|w| is_list_prefix(&w.text));
    let is_list_item = lines.len() <= MAX_LIST_ITEM_LINES && starts_with_marker;

    PdfParagraph {
        dominant_font_size,
        heading_level: None,
        is_bold: bold_lines >= needed_votes,
        is_italic: italic_lines >= needed_votes,
        is_list_item,
        lines,
    }
}
/// Returns `true` when `text` (ignoring surrounding whitespace) is a list marker.
///
/// Recognised markers are the bullets `-`, `*` and `•`, and ordered markers
/// made of one or more ASCII digits followed by exactly one `.` or `)`,
/// such as `1.` or `10)`.
///
/// The closing character must end the token, so numbers like `3.14` and
/// section references like `1.2.3` are not mistaken for list markers.
fn is_list_prefix(text: &str) -> bool {
    let token = text.trim();
    if matches!(token, "-" | "*" | "\u{2022}") {
        return true;
    }

    let Some(number) = token.strip_suffix('.').or_else(|| token.strip_suffix(')')) else {
        return false;
    };
    !number.is_empty() && number.bytes().all(|b| b.is_ascii_digit())
}
/// Assigns heading levels to paragraphs in place.
///
/// A paragraph becomes a heading when it has at most `MAX_HEADING_WORD_COUNT`
/// words and `find_heading_level` yields a level for its dominant font size.
/// Paragraphs that do not qualify keep their current `heading_level`.
fn classify_paragraphs(paragraphs: &mut [PdfParagraph], heading_map: &[(f32, Option<u8>)]) {
    for para in paragraphs.iter_mut() {
        let word_count: usize = para.lines.iter().map(|l| l.words.len()).sum();
        if word_count > MAX_HEADING_WORD_COUNT {
            continue;
        }
        if let Some(level) = find_heading_level(para.dominant_font_size, heading_map) {
            para.heading_level = Some(level);
        }
    }
}
/// Looks up the heading level for `font_size` in `heading_map`.
///
/// `heading_map` holds `(centroid, level)` pairs. An empty map yields `None`
/// and a single-entry map yields that entry's level regardless of distance.
/// Otherwise the level of the nearest centroid is returned (the first entry
/// wins ties), unless the distance to it exceeds
/// `MAX_HEADING_DISTANCE_MULTIPLIER` times the average gap between
/// neighbouring centroids, in which case the result is `None`.
fn find_heading_level(font_size: f32, heading_map: &[(f32, Option<u8>)]) -> Option<u8> {
    match heading_map {
        [] => return None,
        [(_, only_level)] => return *only_level,
        _ => {}
    }

    let (nearest_distance, nearest_level) =
        heading_map
            .iter()
            .fold((f32::INFINITY, None), |closest, &(centroid, level)| {
                let distance = (font_size - centroid).abs();
                if distance < closest.0 {
                    (distance, level)
                } else {
                    closest
                }
            });

    let mut sorted_centroids: Vec<f32> = heading_map.iter().map(|&(centroid, _)| centroid).collect();
    sorted_centroids.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
    let gap_count = sorted_centroids.len() - 1;
    let avg_gap = if gap_count == 0 {
        f32::INFINITY
    } else {
        let gap_total = sorted_centroids
            .windows(2)
            .map(|pair| (pair[1] - pair[0]).abs())
            .sum::<f32>();
        gap_total / gap_count as f32
    };

    if nearest_distance > MAX_HEADING_DISTANCE_MULTIPLIER * avg_gap {
        None
    } else {
        nearest_level
    }
}
fn assemble_markdown(pages: Vec<Vec<PdfParagraph>>) -> String {
let mut output = String::new();
for (page_idx, paragraphs) in pages.iter().enumerate() {
if page_idx > 0 && !output.is_empty() {
output.push_str("\n\n");
}
for (para_idx, para) in paragraphs.iter().enumerate() {
if para_idx > 0 {
output.push_str("\n\n");
}
if let Some(level) = para.heading_level {
let prefix = "#".repeat(level as usize);
let text = join_line_texts(¶.lines);
output.push_str(&prefix);
output.push(' ');
output.push_str(&text);
} else if para.is_list_item {
for (line_idx, line) in para.lines.iter().enumerate() {
if line_idx > 0 {
output.push('\n');
}
let text = render_line_with_inline_markup(line);
output.push_str(&text);
}
} else {
let text = render_paragraph_with_inline_markup(para);
output.push_str(&text);
}
}
}
output
}
/// Renders paragraphs and tables, grouped by page, into one Markdown string.
///
/// When no table carries a bounding box the result is exactly
/// `assemble_markdown(pages)`. Otherwise, for each page (numbered from 1):
/// - if the page has tables with a bounding box, paragraphs and those tables
///   are merged and ordered by descending y (a paragraph uses the baseline of
///   its first line, a table uses `y1` of its box);
/// - otherwise the paragraphs keep their given order;
/// - tables of the page without a bounding box are appended afterwards.
///
/// Items and pages are separated by a blank line.
fn assemble_markdown_with_tables(pages: Vec<Vec<PdfParagraph>>, tables: &[crate::types::Table]) -> String {
    let any_positioned = tables.iter().any(|t| t.bounding_box.is_some());
    if !any_positioned {
        return assemble_markdown(pages);
    }

    // One renderable item of a page.
    enum PageItem<'a> {
        Paragraph(&'a PdfParagraph),
        Table(&'a crate::types::Table),
    }

    let mut output = String::new();
    for (page_idx, paragraphs) in pages.iter().enumerate() {
        if page_idx > 0 && !output.is_empty() {
            output.push_str("\n\n");
        }

        let page_number = page_idx + 1;
        let (positioned_tables, unpositioned_tables): (Vec<&crate::types::Table>, Vec<&crate::types::Table>) = tables
            .iter()
            .filter(|t| t.page_number == page_number)
            .partition(|t| t.bounding_box.is_some());

        let mut items: Vec<(f32, PageItem)> = Vec::with_capacity(paragraphs.len() + positioned_tables.len());
        for para in paragraphs {
            let y = para.lines.first().map(|l| l.baseline_y).unwrap_or(0.0);
            items.push((y, PageItem::Paragraph(para)));
        }
        for &table in &positioned_tables {
            let y = table.bounding_box.as_ref().map(|b| b.y1 as f32).unwrap_or(0.0);
            items.push((y, PageItem::Table(table)));
        }
        // Paragraphs are only reordered when there is a table to interleave.
        if !positioned_tables.is_empty() {
            items.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
        }

        for (position, (_, item)) in items.iter().enumerate() {
            if position > 0 {
                output.push_str("\n\n");
            }
            match item {
                PageItem::Paragraph(para) => render_paragraph_to_output(para, &mut output),
                PageItem::Table(table) => output.push_str(table.markdown.trim()),
            }
        }

        for table in &unpositioned_tables {
            output.push_str("\n\n");
            output.push_str(table.markdown.trim());
        }
    }
    output
}
fn render_paragraph_to_output(para: &PdfParagraph, output: &mut String) {
if let Some(level) = para.heading_level {
let prefix = "#".repeat(level as usize);
let text = join_line_texts(¶.lines);
output.push_str(&prefix);
output.push(' ');
output.push_str(&text);
} else if para.is_list_item {
for (line_idx, line) in para.lines.iter().enumerate() {
if line_idx > 0 {
output.push('\n');
}
let text = render_line_with_inline_markup(line);
output.push_str(&text);
}
} else {
let text = render_paragraph_with_inline_markup(para);
output.push_str(&text);
}
}
pub fn inject_image_placeholders(markdown: &str, images: &[crate::types::ExtractedImage]) -> String {
if images.is_empty() {
return markdown.to_string();
}
let mut images_by_page: std::collections::BTreeMap<usize, Vec<(usize, &crate::types::ExtractedImage)>> =
std::collections::BTreeMap::new();
for (idx, img) in images.iter().enumerate() {
let page = img.page_number.unwrap_or(0);
images_by_page.entry(page).or_default().push((idx, img));
}
if images_by_page.keys().all(|&k| k == 0) {
let mut result = markdown.to_string();
for img in images {
let ii = img.image_index;
result.push_str(&format!("\n\n", ii, ii));
if let Some(ref ocr) = img.ocr_result {
let text = ocr.content.trim();
if !text.is_empty() {
result.push_str(&format!("\n> *Image text: {}*", text));
}
}
}
return result;
}
let mut result = markdown.to_string();
for (&page, page_images) in &images_by_page {
for (_idx, img) in page_images {
let ii = img.image_index;
let label = if page > 0 {
format!("", ii, page, page, ii)
} else {
format!("", ii, ii)
};
result.push_str("\n\n");
result.push_str(&label);
if let Some(ref ocr) = img.ocr_result {
let text = ocr.content.trim();
if !text.is_empty() {
result.push_str(&format!("\n> *Image text: {}*", text));
}
}
}
}
result
}
/// Joins the words of all `lines`, in order, into plain text without inline
/// markup, using the CJK-aware spacing of `join_words_cjk_aware`.
fn join_line_texts(lines: &[PdfLine]) -> String {
    let mut tokens: Vec<&str> = Vec::new();
    for line in lines {
        tokens.extend(line.words.iter().map(|w| w.text.as_str()));
    }
    join_words_cjk_aware(&tokens)
}
/// Concatenates `words`, inserting a single space between neighbours except
/// where `needs_space_between` says none is needed.
///
/// Returns an empty string for an empty slice.
fn join_words_cjk_aware(words: &[&str]) -> String {
    let mut joined = String::new();
    let mut previous: Option<&str> = None;
    for &word in words {
        if let Some(prev) = previous {
            if needs_space_between(prev, word) {
                joined.push(' ');
            }
        }
        joined.push_str(word);
        previous = Some(word);
    }
    joined
}
/// Renders the words of a single line with inline bold/italic markup by
/// delegating to `render_words_with_markup`.
fn render_line_with_inline_markup(line: &PdfLine) -> String {
render_words_with_markup(&line.words)
}
/// Renders every word of the paragraph, across all of its lines, as one block
/// of text with inline bold/italic markup.
fn render_paragraph_with_inline_markup(para: &PdfParagraph) -> String {
    let mut word_refs: Vec<&PdfWord> = Vec::new();
    for line in &para.lines {
        word_refs.extend(line.words.iter());
    }
    render_words_with_markup_refs(&word_refs)
}
/// Renders owned words with inline bold/italic markup.
///
/// Adapter that borrows each word and forwards to `render_words_with_markup_refs`.
fn render_words_with_markup(words: &[PdfWord]) -> String {
    let mut borrowed: Vec<&PdfWord> = Vec::with_capacity(words.len());
    borrowed.extend(words);
    render_words_with_markup_refs(&borrowed)
}
/// Renders words with inline Markdown emphasis.
///
/// Consecutive words with the same bold/italic combination form a run. Each
/// run is joined with CJK-aware spacing and wrapped in `***` (bold and
/// italic), `**` (bold), `*` (italic) or nothing. Runs are separated by a
/// space unless the words on both sides of the boundary are CJK-adjacent; no
/// separator is written while the output is still empty.
fn render_words_with_markup_refs(words: &[&PdfWord]) -> String {
    let mut rendered = String::new();
    let mut run_start = 0;
    while run_start < words.len() {
        let bold = words[run_start].is_bold;
        let italic = words[run_start].is_italic;
        let run_len = words[run_start..]
            .iter()
            .take_while(|w| w.is_bold == bold && w.is_italic == italic)
            .count();
        let run_end = run_start + run_len;

        // A non-empty output implies an earlier run exists, so `run_start - 1` is valid.
        if !rendered.is_empty() {
            let before = words[run_start - 1].text.as_str();
            let after = words[run_start].text.as_str();
            if needs_space_between(before, after) {
                rendered.push(' ');
            }
        }

        let run_words: Vec<&str> = words[run_start..run_end].iter().map(|w| w.text.as_str()).collect();
        let run_text = join_words_cjk_aware(&run_words);
        let marker = match (bold, italic) {
            (true, true) => "***",
            (true, false) => "**",
            (false, true) => "*",
            (false, false) => "",
        };
        rendered.push_str(marker);
        rendered.push_str(&run_text);
        rendered.push_str(marker);

        run_start = run_end;
    }
    rendered
}
#[cfg(test)]
mod tests {
use super::*;
/// Builds a synthetic glyph for tests: 0.6 em wide, one em tall, with `y`
/// placed 0.2 em above the baseline value.
fn make_char(text: &str, x: f32, baseline_y: f32, font_size: f32, is_bold: bool, is_italic: bool) -> CharData {
    let width = font_size * 0.6;
    let y = baseline_y + font_size * 0.2;
    CharData {
        text: text.to_owned(),
        x,
        y,
        font_size,
        width,
        height: font_size,
        is_bold,
        is_italic,
        baseline_y,
    }
}
/// Shorthand for `make_char` with both the bold and italic flags cleared.
fn plain_char(text: &str, x: f32, baseline_y: f32, font_size: f32) -> CharData {
make_char(text, x, baseline_y, font_size, false, false)
}
/// Two tightly set glyphs followed by a 5-unit gap and five more glyphs must
/// come out as the words "Hi" and "there".
#[test]
fn test_chars_to_words() {
    let fs = 12.0;
    let cw = fs * 0.6;
    let layout: [(&str, f32); 7] = [
        ("H", 0.0),
        ("i", cw),
        ("t", cw * 2.0 + 5.0),
        ("h", cw * 3.0 + 5.0),
        ("e", cw * 4.0 + 5.0),
        ("r", cw * 5.0 + 5.0),
        ("e", cw * 6.0 + 5.0),
    ];
    let chars: Vec<CharData> = layout
        .iter()
        .map(|&(text, x)| plain_char(text, x, 100.0, fs))
        .collect();

    let words = chars_to_words(&chars);

    assert_eq!(words.len(), 2, "Expected 2 words, got {}", words.len());
    assert_eq!(words[0].text, "Hi");
    assert_eq!(words[1].text, "there");
}
/// Words on baselines 100 and 115 must form two lines, with the higher
/// baseline first and words ordered left to right within a line.
#[test]
fn test_words_to_lines() {
    fn word(text: &str, x_start: f32, x_end: f32, baseline_y: f32) -> PdfWord {
        PdfWord {
            text: text.to_string(),
            x_start,
            x_end,
            baseline_y,
            font_size: 12.0,
            is_bold: false,
            is_italic: false,
        }
    }

    let words = vec![
        word("Hello", 0.0, 30.0, 100.0),
        word("world", 35.0, 65.0, 100.0),
        word("Next", 0.0, 25.0, 115.0),
    ];

    let lines = words_to_lines(words);

    assert_eq!(lines.len(), 2, "Expected 2 lines, got {}", lines.len());
    let first: Vec<&str> = lines[0].words.iter().map(|w| w.text.as_str()).collect();
    let second: Vec<&str> = lines[1].words.iter().map(|w| w.text.as_str()).collect();
    assert_eq!(first, vec!["Next"]);
    assert_eq!(second, vec!["Hello", "world"]);
}
/// Three lines 14 units apart followed by a line 50 units further on must
/// split into a three-line paragraph and a one-line paragraph.
#[test]
fn test_lines_to_paragraphs() {
    let make_line = |text: &str, baseline: f32| PdfLine {
        words: vec![PdfWord {
            text: text.to_string(),
            x_start: 0.0,
            x_end: 30.0,
            baseline_y: baseline,
            font_size: 12.0,
            is_bold: false,
            is_italic: false,
        }],
        baseline_y: baseline,
        y_top: baseline - 12.0,
        y_bottom: baseline,
        dominant_font_size: 12.0,
        is_bold: false,
        is_italic: false,
    };
    let specs: [(&str, f32); 4] = [("First", 100.0), ("second", 114.0), ("third", 128.0), ("New para", 178.0)];
    let lines: Vec<PdfLine> = specs.iter().map(|&(text, baseline)| make_line(text, baseline)).collect();

    let paragraphs = lines_to_paragraphs(lines);

    assert_eq!(paragraphs.len(), 2, "Expected 2 paragraphs, got {}", paragraphs.len());
    assert_eq!(paragraphs[0].lines.len(), 3);
    assert_eq!(paragraphs[1].lines.len(), 1);
}
/// A small 24pt cluster next to a large 12pt cluster must map to H1 and body
/// respectively.
#[test]
fn test_heading_classification() {
    fn block(text: String, top: f32, right: f32, bottom: f32, font_size: f32) -> TextBlock {
        TextBlock {
            text,
            bbox: BoundingBox {
                left: 0.0,
                top,
                right,
                bottom,
            },
            font_size,
        }
    }

    let title_cluster = crate::pdf::hierarchy::FontSizeCluster {
        centroid: 24.0,
        members: vec![
            block("Title".to_string(), 0.0, 100.0, 24.0, 24.0),
            block("Subtitle".to_string(), 30.0, 100.0, 54.0, 24.0),
        ],
    };
    let body_cluster = crate::pdf::hierarchy::FontSizeCluster {
        centroid: 12.0,
        members: (0..20)
            .map(|i| block(format!("body {}", i), 60.0 + i as f32 * 14.0, 400.0, 72.0 + i as f32 * 14.0, 12.0))
            .collect(),
    };
    let clusters = vec![title_cluster, body_cluster];

    let heading_map = assign_heading_levels_smart(&clusters);

    assert_eq!(heading_map.len(), 2);
    let level_near = |size: f32| heading_map.iter().find(|(c, _)| (*c - size).abs() < 0.1);
    let h24 = level_near(24.0);
    assert!(h24.is_some(), "Should find 24pt cluster");
    assert_eq!(h24.unwrap().1, Some(1), "24pt should be H1");
    let h12 = level_near(12.0);
    assert!(h12.is_some(), "Should find 12pt cluster");
    assert_eq!(h12.unwrap().1, None, "12pt should be Body");
}
/// With only one font-size cluster, that cluster must be classified as body.
#[test]
fn test_single_font_size_no_headings() {
    let members: Vec<TextBlock> = (0..10)
        .map(|i| {
            let top = i as f32 * 14.0;
            TextBlock {
                text: format!("text {}", i),
                bbox: BoundingBox {
                    left: 0.0,
                    top,
                    right: 100.0,
                    bottom: 12.0 + i as f32 * 14.0,
                },
                font_size: 12.0,
            }
        })
        .collect();
    let clusters = vec![crate::pdf::hierarchy::FontSizeCluster {
        centroid: 12.0,
        members,
    }];

    let heading_map = assign_heading_levels_smart(&clusters);

    assert_eq!(heading_map.len(), 1);
    assert_eq!(heading_map[0].1, None, "Single cluster should be body");
}
/// Two adjacent bold words between plain words must be wrapped in one `**` pair.
#[test]
fn test_inline_bold_markup() {
    fn word(text: &str, x_start: f32, x_end: f32, is_bold: bool) -> PdfWord {
        PdfWord {
            text: text.to_string(),
            x_start,
            x_end,
            baseline_y: 100.0,
            font_size: 12.0,
            is_bold,
            is_italic: false,
        }
    }

    let words = vec![
        word("Hello", 0.0, 30.0, false),
        word("bold", 35.0, 55.0, true),
        word("text", 60.0, 80.0, true),
        word("end", 85.0, 105.0, false),
    ];

    let result = render_words_with_markup(&words);

    assert_eq!(result, "Hello **bold text** end");
}
/// Italic words must be wrapped in `*` and bold-italic words in `***`.
#[test]
fn test_inline_italic_and_bold_italic_markup() {
    fn word(text: &str, x_start: f32, x_end: f32, is_bold: bool, is_italic: bool) -> PdfWord {
        PdfWord {
            text: text.to_string(),
            x_start,
            x_end,
            baseline_y: 100.0,
            font_size: 12.0,
            is_bold,
            is_italic,
        }
    }

    let words = vec![
        word("normal", 0.0, 30.0, false, false),
        word("italic", 35.0, 65.0, false, true),
        word("both", 70.0, 90.0, true, true),
    ];

    let result = render_words_with_markup(&words);

    assert_eq!(result, "normal *italic* ***both***");
}
/// A level-1 heading followed by a two-line body paragraph must render as a
/// `#` heading, a blank line, and the body joined into one line.
#[test]
fn test_markdown_assembly() {
    fn word(text: &str, x_start: f32, x_end: f32, baseline_y: f32, font_size: f32, is_bold: bool) -> PdfWord {
        PdfWord {
            text: text.to_string(),
            x_start,
            x_end,
            baseline_y,
            font_size,
            is_bold,
            is_italic: false,
        }
    }
    fn line(words: Vec<PdfWord>, baseline_y: f32, y_top: f32, font_size: f32, is_bold: bool) -> PdfLine {
        PdfLine {
            words,
            baseline_y,
            y_top,
            y_bottom: baseline_y,
            dominant_font_size: font_size,
            is_bold,
            is_italic: false,
        }
    }

    let heading_para = PdfParagraph {
        lines: vec![line(
            vec![word("Introduction", 0.0, 80.0, 50.0, 24.0, true)],
            50.0,
            26.0,
            24.0,
            true,
        )],
        dominant_font_size: 24.0,
        heading_level: Some(1),
        is_bold: true,
        is_italic: false,
        is_list_item: false,
    };
    let body_para = PdfParagraph {
        lines: vec![
            line(
                vec![
                    word("This", 0.0, 25.0, 80.0, 12.0, false),
                    word("is", 30.0, 40.0, 80.0, 12.0, false),
                ],
                80.0,
                68.0,
                12.0,
                false,
            ),
            line(vec![word("body.", 0.0, 30.0, 94.0, 12.0, false)], 94.0, 82.0, 12.0, false),
        ],
        dominant_font_size: 12.0,
        heading_level: None,
        is_bold: false,
        is_italic: false,
        is_list_item: false,
    };

    let markdown = assemble_markdown(vec![vec![heading_para, body_para]]);

    assert_eq!(markdown, "# Introduction\n\nThis is body.");
}
/// Bullets and numbered markers are list prefixes; ordinary words and the
/// empty string are not.
#[test]
fn test_list_item_detection() {
    for marker in ["-", "*", "\u{2022}", "1.", "10)"] {
        assert!(is_list_prefix(marker));
    }
    for not_a_marker in ["Hello", ""] {
        assert!(!is_list_prefix(not_a_marker));
    }
}
/// A document with one page and no paragraphs must render as an empty string.
#[test]
fn test_empty_document() {
    let single_empty_page: Vec<Vec<PdfParagraph>> = vec![Vec::new()];
    assert_eq!(assemble_markdown(single_empty_page), "");
}
/// Glyphs on two baselines must produce one word per line, with the larger
/// baseline value first.
#[test]
fn test_chars_to_words_multiline() {
    let fs = 12.0;
    let cw = fs * 0.6;
    let layout: [(&str, f32, f32); 4] = [("A", 0.0, 100.0), ("B", cw, 100.0), ("C", 0.0, 120.0), ("D", cw, 120.0)];
    let chars: Vec<CharData> = layout
        .iter()
        .map(|&(text, x, baseline)| plain_char(text, x, baseline, fs))
        .collect();

    let words = chars_to_words(&chars);

    assert_eq!(words.len(), 2, "Expected 2 words on different lines");
    assert_eq!(words[0].text, "CD");
    assert_eq!(words[1].text, "AB");
}
/// The most populous cluster (12pt) must be body text, and a smaller 10pt
/// cluster must not be promoted to a heading.
#[test]
fn test_body_is_most_frequent_cluster() {
    fn block(text: String, top: f32, right: f32, bottom: f32, font_size: f32) -> TextBlock {
        TextBlock {
            text,
            bbox: BoundingBox {
                left: 0.0,
                top,
                right,
                bottom,
            },
            font_size,
        }
    }

    let body_cluster = crate::pdf::hierarchy::FontSizeCluster {
        centroid: 12.0,
        members: (0..50)
            .map(|i| block(format!("body {}", i), i as f32 * 14.0, 400.0, 12.0 + i as f32 * 14.0, 12.0))
            .collect(),
    };
    let caption_cluster = crate::pdf::hierarchy::FontSizeCluster {
        centroid: 10.0,
        members: (0..5)
            .map(|i| block(format!("caption {}", i), 700.0 + i as f32 * 12.0, 200.0, 710.0 + i as f32 * 12.0, 10.0))
            .collect(),
    };
    let clusters = vec![body_cluster, caption_cluster];

    let heading_map = assign_heading_levels_smart(&clusters);

    let level_near = |size: f32| heading_map.iter().find(|(c, _)| (*c - size).abs() < 0.1);
    let h12 = level_near(12.0);
    assert_eq!(h12.unwrap().1, None, "12pt (most frequent) should be body");
    let h10 = level_near(10.0);
    assert_eq!(h10.unwrap().1, None, "10pt (smaller than body) should NOT be a heading");
}
#[test]
fn test_detect_columns_single_column() {
    // Twenty identical glyphs on a single row, one every 20 units.
    let row_y = 500.0;
    let mut chars: Vec<CharData> = Vec::with_capacity(20);
    for slot in 0..20 {
        chars.push(CharData {
            text: "x".to_string(),
            x: slot as f32 * 20.0,
            y: row_y,
            font_size: 12.0,
            width: 7.0,
            height: 12.0,
            is_bold: false,
            is_italic: false,
            baseline_y: row_y,
        });
    }

    let columns = detect_columns(&chars, 400.0, 800.0);
    assert_eq!(columns.len(), 1);
}
#[test]
fn test_detect_columns_two_columns() {
    // Builds one 12pt glyph; its baseline equals its y coordinate.
    let glyph = |label: &str, x: f32, y: f32| CharData {
        text: label.to_string(),
        x,
        y,
        font_size: 12.0,
        width: 7.0,
        height: 12.0,
        is_bold: false,
        is_italic: false,
        baseline_y: y,
    };

    // 30 rows; each row has ten "a" glyphs starting at x = 10 followed by
    // ten "b" glyphs starting at x = 300, leaving a wide empty band between.
    let mut chars: Vec<CharData> = Vec::new();
    for row in 0..30 {
        let y = 700.0 - row as f32 * 20.0;
        chars.extend((0..10).map(|col| glyph("a", 10.0 + col as f32 * 18.0, y)));
        chars.extend((0..10).map(|col| glyph("b", 300.0 + col as f32 * 18.0, y)));
    }

    let columns = detect_columns(&chars, 500.0, 800.0);
    let found = columns.len();
    assert!(found >= 2, "Should detect at least 2 columns, got {}", found);
}
#[test]
fn test_detect_columns_empty() {
    // With no characters at all, exactly one column region is still returned.
    let no_chars: &[CharData] = &[];
    let columns = detect_columns(no_chars, 400.0, 800.0);
    assert_eq!(columns.len(), 1);
}
#[test]
fn test_find_heading_level_outlier_rejected() {
    let heading_map = vec![(24.0, Some(1)), (12.0, None)];
    // 100pt is far from both the 24pt and the 12pt entries, so no level is expected.
    let level = find_heading_level(100.0, &heading_map);
    assert_eq!(level, None);
}
#[test]
fn test_find_heading_level_close_match() {
    let heading_map = vec![(24.0, Some(1)), (12.0, None)];
    // 23.5pt sits just below the 24pt entry and should pick up its level.
    let level = find_heading_level(23.5, &heading_map);
    assert_eq!(level, Some(1));
}
#[test]
fn test_is_cjk_char() {
    // Han ideograph, hiragana, katakana and Hangul samples.
    for cjk in ['中', 'あ', 'ア', '한'] {
        assert!(is_cjk_char(cjk), "{:?} should be classified as CJK", cjk);
    }
    // Latin letter, digit and space.
    for other in ['A', '1', ' '] {
        assert!(!is_cjk_char(other), "{:?} should not be classified as CJK", other);
    }
}
#[test]
fn test_chars_to_words_cjk_boundary() {
    let size = 12.0;
    let advance = size * 0.6;
    // Builds one glyph on the y = 100 row at the given x offset.
    let glyph = |text: &str, x| CharData {
        text: text.to_string(),
        x,
        y: 100.0,
        font_size: size,
        width: advance,
        height: size,
        is_bold: false,
        is_italic: false,
        baseline_y: 100.0,
    };

    // Three ideographs placed edge to edge with no gap between them.
    let chars = vec![
        glyph("中", 0.0),
        glyph("文", advance),
        glyph("字", advance * 2.0),
    ];

    let words = chars_to_words(&chars);
    assert_eq!(words.len(), 3, "Expected 3 CJK words, each character separate");
    for (word, expected) in words.iter().zip(["中", "文", "字"]) {
        assert_eq!(word.text, expected);
    }
}
#[test]
fn test_chars_to_words_cjk_latin_mixing() {
    let size = 12.0;
    let advance = size * 0.6;
    // Builds one glyph on the y = 100 row at the given x offset.
    let glyph = |text: &str, x| CharData {
        text: text.to_string(),
        x,
        y: 100.0,
        font_size: size,
        width: advance,
        height: size,
        is_bold: false,
        is_italic: false,
        baseline_y: 100.0,
    };

    // "A", "B", one ideograph, then "C", all placed edge to edge.
    let chars = vec![
        glyph("A", 0.0),
        glyph("B", advance),
        glyph("中", advance * 2.0),
        glyph("C", advance * 3.0),
    ];

    let words = chars_to_words(&chars);
    assert_eq!(words.len(), 3, "Expected 3 words (AB, 中, C)");

    let expectations = [
        ("AB", "Latin characters should stay together"),
        ("中", "CJK character should be separate"),
        ("C", "Latin after CJK should be separate"),
    ];
    for (word, (text, why)) in words.iter().zip(expectations) {
        assert_eq!(word.text, text, "{}", why);
    }
}
#[test]
fn test_needs_space_between() {
    // (left, right, whether a separating space is expected)
    let cases = [
        ("中", "文", false),
        ("あ", "い", false),
        ("hello", "world", true),
        ("中", "hello", true),
        ("hello", "中", true),
    ];
    for (left, right, expected) in cases {
        assert_eq!(
            needs_space_between(left, right),
            expected,
            "needs_space_between({:?}, {:?})",
            left,
            right
        );
    }
}
#[test]
fn test_join_words_cjk_aware() {
    // Joins `words` and compares the outcome with `expected`.
    let check = |words: &[&str], expected: &str| {
        assert_eq!(join_words_cjk_aware(words), expected);
    };

    // CJK-only and Latin-only runs.
    check(&["中", "文", "字"], "中文字");
    check(&["hello", "world"], "hello world");
    // Mixed scripts, in both orders.
    check(&["中", "文", "test"], "中文 test");
    check(&["test", "中", "文"], "test 中文");
    // Single word and empty input.
    check(&["hello"], "hello");
    check(&[], "");
}
#[test]
fn test_inject_image_placeholders_empty() {
    // With no images supplied the markdown must come back unchanged.
    let source = "# Hello\n\nSome text.";
    let output = inject_image_placeholders(source, &[]);
    assert_eq!(output, source);
}
#[test]
fn test_inject_image_placeholders_uses_image_index() {
    use bytes::Bytes;
    use std::borrow::Cow;

    // A single JPEG on page 1 whose image_index is 5.
    let image = crate::types::ExtractedImage {
        data: Bytes::from_static(&[0xFF]),
        format: Cow::Borrowed("jpeg"),
        image_index: 5,
        page_number: Some(1),
        width: None,
        height: None,
        colorspace: None,
        bits_per_component: None,
        is_mask: false,
        description: None,
        ocr_result: None,
        bounding_box: None,
    };
    let images = vec![image];
    let md = "# Page 1 content";

    let result = inject_image_placeholders(md, &images);

    // Both the embedded reference and the alt text must carry index 5.
    for fragment in ["embedded:p1_i5", "![Image 5 (page 1)]"] {
        assert!(
            result.contains(fragment),
            "Should use image_index 5, got: {}",
            result
        );
    }
}
#[test]
fn test_inject_image_placeholders_with_ocr() {
    use bytes::Bytes;
    use std::borrow::Cow;

    // OCR payload attached to the image; only `content` carries real data.
    let ocr = crate::types::ExtractionResult {
        content: "OCR detected text".to_string(),
        mime_type: Cow::Borrowed("text/plain"),
        metadata: crate::types::Metadata::default(),
        tables: vec![],
        detected_languages: None,
        chunks: None,
        images: None,
        djot_content: None,
        pages: None,
        elements: None,
        ocr_elements: None,
        document: None,
        #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
        extracted_keywords: None,
        quality_score: None,
        processing_warnings: Vec::new(),
    };

    // A PNG with no page number that owns the OCR payload above.
    let image = crate::types::ExtractedImage {
        data: Bytes::from_static(&[0x89]),
        format: Cow::Borrowed("png"),
        image_index: 0,
        page_number: None,
        width: None,
        height: None,
        colorspace: None,
        bits_per_component: None,
        is_mask: false,
        description: None,
        ocr_result: Some(Box::new(ocr)),
        bounding_box: None,
    };
    let images = vec![image];
    let md = "Content here";

    let result = inject_image_placeholders(md, &images);
    let has_ocr_text = result.contains("Image text: OCR detected text");
    assert!(has_ocr_text, "Should include OCR text, got: {}", result);
}
#[test]
fn test_assemble_markdown_with_tables_no_tables() {
    // Build a one-word, one-line paragraph from the inside out.
    let word = PdfWord {
        text: "Hello".to_string(),
        x_start: 0.0,
        x_end: 30.0,
        baseline_y: 700.0,
        font_size: 12.0,
        is_bold: false,
        is_italic: false,
    };
    let line = PdfLine {
        words: vec![word],
        baseline_y: 700.0,
        y_top: 688.0,
        y_bottom: 700.0,
        dominant_font_size: 12.0,
        is_bold: false,
        is_italic: false,
    };
    let paragraph = PdfParagraph {
        lines: vec![line],
        dominant_font_size: 12.0,
        heading_level: None,
        is_bold: false,
        is_italic: false,
        is_list_item: false,
    };

    // With an empty table list the output is just the paragraph text.
    let result = assemble_markdown_with_tables(vec![vec![paragraph]], &[]);
    assert_eq!(result, "Hello");
}
#[test]
fn test_assemble_markdown_with_tables_interleaves() {
    // Builds a plain 12pt paragraph holding one single-word line whose top
    // edge lies 12 units above its baseline.
    let one_word_paragraph = |text: &str, x_end: f32, baseline: f32| PdfParagraph {
        lines: vec![PdfLine {
            words: vec![PdfWord {
                text: text.to_string(),
                x_start: 0.0,
                x_end,
                baseline_y: baseline,
                font_size: 12.0,
                is_bold: false,
                is_italic: false,
            }],
            baseline_y: baseline,
            y_top: baseline - 12.0,
            y_bottom: baseline,
            dominant_font_size: 12.0,
            is_bold: false,
            is_italic: false,
        }],
        dominant_font_size: 12.0,
        heading_level: None,
        is_bold: false,
        is_italic: false,
        is_list_item: false,
    };

    // One paragraph at y = 700 and one at y = 200, with a table whose
    // bounding box spans y = 400..500 in between.
    let paragraphs = vec![vec![
        one_word_paragraph("Top", 30.0, 700.0),
        one_word_paragraph("Bottom", 50.0, 200.0),
    ]];
    let table = crate::types::Table {
        cells: vec![vec!["A".to_string(), "B".to_string()]],
        markdown: "| A | B |".to_string(),
        page_number: 1,
        bounding_box: Some(crate::types::BoundingBox {
            x0: 50.0,
            y0: 400.0,
            x1: 500.0,
            y1: 500.0,
        }),
    };
    let tables = vec![table];

    let result = assemble_markdown_with_tables(paragraphs, &tables);

    // All three fragments must be present, in Top -> table -> Bottom order.
    let offset_of = |needle: &str| result.find(needle).unwrap();
    let top_pos = offset_of("Top");
    let table_pos = offset_of("| A | B |");
    let bottom_pos = offset_of("Bottom");
    assert!(top_pos < table_pos, "Top should come before table");
    assert!(table_pos < bottom_pos, "Table should come before Bottom");
}
}