use crate::error::Result;
use crate::layout::FontWeight;
use crate::pipeline::{OrderedTextSpan, TextPipelineConfig};
use crate::structure::table_extractor::ExtractedTable;
use crate::text::HyphenationHandler;
use super::OutputConverter;
/// Output converter that renders ordered text spans (and, optionally, extracted tables)
/// as HTML.
///
/// Two rendering styles are implemented by this type's methods: an absolutely positioned
/// "layout" mode and a "semantic" mode built from paragraphs, headings and tables.
#[derive(Debug, Clone)]
pub struct HtmlOutputConverter {
    /// Multiplier applied to the larger font size of two consecutive spans; a vertical
    /// distance between them that exceeds the product is treated as a paragraph break.
    paragraph_gap_ratio: f32,
}
impl HtmlOutputConverter {
/// Creates a converter using the default paragraph-gap ratio of `1.5`.
pub fn new() -> Self {
    const DEFAULT_PARAGRAPH_GAP_RATIO: f32 = 1.5;
    Self {
        paragraph_gap_ratio: DEFAULT_PARAGRAPH_GAP_RATIO,
    }
}
/// Returns `true` when the span's font weight is semi-bold or heavier
/// (`SemiBold`, `Bold`, `ExtraBold` or `Black`).
fn is_bold(&self, span: &OrderedTextSpan) -> bool {
    let weight = &span.span.font_weight;
    matches!(
        weight,
        FontWeight::SemiBold | FontWeight::Bold | FontWeight::ExtraBold | FontWeight::Black
    )
}
/// Returns the italic flag carried by the underlying text span.
fn is_italic(&self, span: &OrderedTextSpan) -> bool {
    let text_span = &span.span;
    text_span.is_italic
}
/// Decides whether `current` starts a new paragraph relative to `previous`.
///
/// The absolute difference of the two bounding-box `y` values is compared against the
/// larger of the two font sizes scaled by `paragraph_gap_ratio`; a strictly larger
/// distance counts as a break.
fn is_paragraph_break(&self, current: &OrderedTextSpan, previous: &OrderedTextSpan) -> bool {
    let vertical_gap = (previous.span.bbox.y - current.span.bbox.y).abs();
    let taller_font = current.span.font_size.max(previous.span.font_size);
    let break_threshold = taller_font * self.paragraph_gap_ratio;
    vertical_gap > break_threshold
}
/// Classifies a span as an HTML heading level (1–3) or as regular text (`None`).
///
/// A span qualifies only when its trimmed text is 2–120 characters long, has at most
/// 12 whitespace-separated words and is not rejected by `looks_like_non_heading`.
/// The level then follows from the ratio of the span's font size to `base_font_size`:
///
/// * ratio >= 2.0 → level 1
/// * ratio >= 1.5 → level 2
/// * ratio >= 1.3, or a bold span with ratio >= 1.15 → level 3
///
/// Returns `None` when `base_font_size` is zero, negative or NaN, since no meaningful
/// ratio can be computed from it.
fn heading_level(&self, span: &OrderedTextSpan, base_font_size: f32) -> Option<u8> {
    // Without this guard a zero baseline yields an infinite ratio and every span with a
    // positive font size would be promoted to a level-1 heading.
    if base_font_size.is_nan() || base_font_size <= 0.0 {
        return None;
    }

    let text = span.span.text.trim();

    // Measure in characters rather than bytes so that the limits mean the same thing for
    // non-ASCII text (and a lone multi-byte glyph such as a bullet is not long enough).
    let char_count = text.chars().count();
    if !(2..=120).contains(&char_count) {
        return None;
    }
    if text.split_whitespace().count() > 12 {
        return None;
    }
    if Self::looks_like_non_heading(text) {
        return None;
    }

    let size_ratio = span.span.font_size / base_font_size;
    if size_ratio >= 2.0 {
        Some(1)
    } else if size_ratio >= 1.5 {
        Some(2)
    } else if size_ratio >= 1.3 || (size_ratio >= 1.15 && self.is_bold(span)) {
        Some(3)
    } else {
        None
    }
}
/// Heuristic filter for text that should never become a heading even when it is set in
/// a large or bold font.
///
/// Returns `true` when the trimmed text matches any of these shapes:
///
/// 1. A currency amount: contains `$`, `€` or `£` and, once digits, `.`, `,`, spaces and
///    those symbols are removed, at most two characters remain.
/// 2. Purely numeric: non-empty and made up only of digits, `.`, `,`, spaces and `-`.
/// 3. A short label followed by a number: exactly two words, the first alphabetic and at
///    most 10 characters, the second made of digits, `.` and `-` (e.g. `Box 14`).
/// 4. An address-like line: starts with a digit, has 3–8 words and fewer than 80
///    characters, the first word consists of digits/`-`, and at least two of the
///    remaining words contain a letter (e.g. `123 Main Street`).
fn looks_like_non_heading(text: &str) -> bool {
    const CURRENCY_SYMBOLS: [char; 3] = ['$', '\u{20AC}', '\u{00A3}'];

    let trimmed = text.trim();

    // Rule 1: currency amounts. The leftover is counted in characters (not bytes) so
    // that accented or other multi-byte letters are not weighted more than ASCII ones.
    if trimmed.chars().any(|c| CURRENCY_SYMBOLS.contains(&c)) {
        let leftover_chars = trimmed
            .chars()
            .filter(|&c| {
                !(c.is_ascii_digit()
                    || matches!(c, '.' | ',' | ' ')
                    || CURRENCY_SYMBOLS.contains(&c))
            })
            .count();
        if leftover_chars <= 2 {
            return true;
        }
    }

    // Rule 2: nothing but digits and numeric punctuation.
    let only_numeric = trimmed
        .chars()
        .all(|c| c.is_ascii_digit() || matches!(c, '.' | ',' | ' ' | '-'));
    if only_numeric && !trimmed.is_empty() {
        return true;
    }

    let words: Vec<&str> = trimmed.split_whitespace().collect();

    // Rule 3: "<short word> <number>".
    if let [label, number] = words.as_slice() {
        let short_alpha_label =
            label.chars().count() <= 10 && label.chars().all(char::is_alphabetic);
        let numeric_tail = number
            .chars()
            .all(|c| c.is_ascii_digit() || c == '.' || c == '-');
        if short_alpha_label && numeric_tail {
            return true;
        }
    }

    // Rule 4: "<number> <word> <word> ..." such as a street address.
    let starts_with_digit = trimmed.starts_with(|c: char| c.is_ascii_digit());
    if starts_with_digit && (3..=8).contains(&words.len()) && trimmed.chars().count() < 80 {
        let leading_number = words[0].chars().all(|c| c.is_ascii_digit() || c == '-');
        let alpha_words = words[1..]
            .iter()
            .filter(|word| word.chars().any(char::is_alphabetic))
            .count();
        if leading_number && alpha_words >= 2 {
            return true;
        }
    }

    false
}
/// Escapes text for safe inclusion in HTML element content and double-quoted attribute
/// values.
///
/// `&`, `<`, `>` and `"` are replaced by `&amp;`, `&lt;`, `&gt;` and `&quot;`; every
/// other character is copied unchanged. The input is scanned once, so an ampersand that
/// is produced by an escape is never escaped a second time.
fn escape_html(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());
    for ch in text.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            other => escaped.push(other),
        }
    }
    escaped
}
/// HTML-escapes `text` and wraps it in emphasis tags according to the span's style.
///
/// Italic spans are wrapped in `<em>`, bold spans in `<strong>`; when both apply the
/// `<strong>` element is the outer one.
fn format_span_with_styles(&self, span: &OrderedTextSpan, text: &str) -> String {
    let escaped = Self::escape_html(text);
    let emphasized = if self.is_italic(span) {
        format!("<em>{}</em>", escaped)
    } else {
        escaped
    };
    if self.is_bold(span) {
        format!("<strong>{}</strong>", emphasized)
    } else {
        emphasized
    }
}
/// Returns the span's colour as a lowercase `#rrggbb` string, or `None` for black.
///
/// Black is reported as `None` so that callers can omit the colour declaration for it.
/// Each channel is scaled by 255 and rounded to the nearest integer; plain truncation
/// would turn a channel stored as `v / 255` into `v - 1` whenever the float
/// representation lands just below the exact value. The `as u8` cast saturates, so
/// out-of-range channels clamp to 0 or 255 and NaN becomes 0.
// NOTE(review): assumes colour channels are nominally in 0.0..=1.0 — confirm against
// the `Color` type.
fn format_color(&self, span: &OrderedTextSpan) -> Option<String> {
    let color = &span.span.color;
    let r = (color.r * 255.0).round() as u8;
    let g = (color.g * 255.0).round() as u8;
    let b = (color.b * 255.0).round() as u8;
    if (r, g, b) == (0, 0, 0) {
        None
    } else {
        Some(format!("#{:02x}{:02x}{:02x}", r, g, b))
    }
}
}
impl Default for HtmlOutputConverter {
fn default() -> Self {
Self::new()
}
}
impl OutputConverter for HtmlOutputConverter {
    /// Converts `spans` to HTML without any table information.
    ///
    /// Equivalent to `convert_with_tables` called with an empty table slice.
    fn convert(&self, spans: &[OrderedTextSpan], config: &TextPipelineConfig) -> Result<String> {
        self.convert_with_tables(spans, &[], config)
    }

    /// Converts `spans` to HTML, rendering `tables` as `<table>` elements in semantic mode.
    ///
    /// When `config.output.preserve_layout` is set the layout renderer is used and the
    /// tables are not consulted.
    fn convert_with_tables(
        &self,
        spans: &[OrderedTextSpan],
        tables: &[ExtractedTable],
        config: &TextPipelineConfig,
    ) -> Result<String> {
        if config.output.preserve_layout {
            return self.convert_layout_mode(spans, config);
        }
        self.convert_semantic_mode(spans, tables, config)
    }

    /// Human-readable name of this converter.
    fn name(&self) -> &'static str {
        "HtmlOutputConverter"
    }

    /// MIME type of the produced output.
    fn mime_type(&self) -> &'static str {
        "text/html"
    }
}
impl HtmlOutputConverter {
/// Renders every span as an absolutely positioned `<div>`, in reading order.
///
/// Each `<div>` carries the span's bounding-box `x`/`y` as `left`/`top`, its font size,
/// and a `color` declaration when `format_color` yields one. Returns an empty string for
/// empty input. When `config.enable_hyphenation_reconstruction` is set, the finished
/// output is passed through `HyphenationHandler::process_text`.
fn convert_layout_mode(
    &self,
    spans: &[OrderedTextSpan],
    config: &TextPipelineConfig,
) -> Result<String> {
    if spans.is_empty() {
        return Ok(String::new());
    }

    let mut ordered: Vec<&OrderedTextSpan> = spans.iter().collect();
    ordered.sort_by_key(|entry| entry.reading_order);

    let mut html = String::new();
    for entry in ordered {
        let inner = &entry.span;
        let mut css = format!(
            "position:absolute;left:{}pt;top:{}pt;font-size:{}pt;",
            inner.bbox.x, inner.bbox.y, inner.font_size
        );
        if let Some(hex) = self.format_color(entry) {
            css.push_str(&format!("color:{};", hex));
        }
        let body = self.format_span_with_styles(entry, &inner.text);
        html.push_str(&format!("<div style=\"{}\">{}</div>\n", css, body));
    }

    if !config.enable_hyphenation_reconstruction {
        return Ok(html);
    }
    let handler = HyphenationHandler::new();
    Ok(handler.process_text(&html))
}
/// Renders spans and tables as semantic HTML: `<p>`, `<h1>`–`<h3>` and `<table>`.
///
/// Spans are processed in reading order:
///
/// * A span that `span_in_table` assigns to a table is dropped from the text flow; the
///   first such span for a given table emits that table in its place.
/// * A paragraph break (see `is_paragraph_break`) closes the pending paragraph.
/// * With `config.output.detect_headings` set, spans classified by `heading_level`
///   become heading elements. The baseline font size is the median span font size
///   (12.0 when there are no spans or heading detection is off).
/// * All other spans are appended to the pending paragraph, with a single space inserted
///   between same-line spans that are separated by a horizontal gap.
///
/// Tables that no span referred to are appended after the text, skipping empty ones.
/// Returns an empty string when there are neither spans nor tables. When
/// `config.enable_hyphenation_reconstruction` is set, the finished output is passed
/// through `HyphenationHandler::process_text`.
fn convert_semantic_mode(
    &self,
    spans: &[OrderedTextSpan],
    tables: &[ExtractedTable],
    config: &TextPipelineConfig,
) -> Result<String> {
    // Baseline used when heading detection is off or there is no span to measure.
    const FALLBACK_BASE_FONT_SIZE: f32 = 12.0;

    // Writes the pending paragraph (if any) as a `<p>` element and resets the buffer.
    // A non-empty buffer is the only state needed: text is appended to it solely while
    // a paragraph is open, and it is cleared whenever the paragraph is closed.
    fn flush_paragraph(html: &mut String, pending: &mut String) {
        if !pending.is_empty() {
            html.push_str(&format!("<p>{}</p>\n", pending.trim()));
            pending.clear();
        }
    }

    if spans.is_empty() && tables.is_empty() {
        return Ok(String::new());
    }

    let mut sorted: Vec<&OrderedTextSpan> = spans.iter().collect();
    sorted.sort_by_key(|s| s.reading_order);

    let base_font_size = if config.output.detect_headings {
        // Median font size (upper median for an even number of spans).
        let mut sizes: Vec<f32> = sorted.iter().map(|s| s.span.font_size).collect();
        sizes.sort_by(|a, b| crate::utils::safe_float_cmp(*a, *b));
        sizes
            .get(sizes.len() / 2)
            .copied()
            .unwrap_or(FALLBACK_BASE_FONT_SIZE)
    } else {
        FALLBACK_BASE_FONT_SIZE
    };

    let mut tables_rendered = vec![false; tables.len()];
    let mut result = String::new();
    let mut current_content = String::new();
    let mut prev_span: Option<&OrderedTextSpan> = None;

    for &span in &sorted {
        // Spans covered by a table are replaced by the rendered table itself.
        if !tables.is_empty() {
            if let Some(table_idx) = super::span_in_table(span, tables) {
                if !tables_rendered[table_idx] {
                    flush_paragraph(&mut result, &mut current_content);
                    result.push_str(&Self::render_table_html(&tables[table_idx]));
                    tables_rendered[table_idx] = true;
                    // Text after a table must not be measured against text before it.
                    prev_span = None;
                }
                continue;
            }
        }

        if let Some(prev) = prev_span {
            if self.is_paragraph_break(span, prev) {
                flush_paragraph(&mut result, &mut current_content);
            }
        }

        if config.output.detect_headings {
            if let Some(level) = self.heading_level(span, base_font_size) {
                flush_paragraph(&mut result, &mut current_content);
                let text = self.format_span_with_styles(span, span.span.text.trim());
                result.push_str(&format!("<h{}>{}</h{}>\n", level, text, level));
                prev_span = Some(span);
                continue;
            }
        }

        if let Some(prev) = prev_span {
            let same_line =
                (span.span.bbox.y - prev.span.bbox.y).abs() < span.span.font_size * 0.5;
            // Only add a separator when neither side already provides one.
            if same_line
                && !current_content.is_empty()
                && !current_content.ends_with(' ')
                && !span.span.text.starts_with(' ')
                && super::has_horizontal_gap(&prev.span, &span.span)
            {
                current_content.push(' ');
            }
        }

        current_content.push_str(&self.format_span_with_styles(span, &span.span.text));
        prev_span = Some(span);
    }

    // Tables that never matched a span still belong in the output.
    for (table, rendered) in tables.iter().zip(&tables_rendered) {
        if !*rendered && !table.is_empty() {
            flush_paragraph(&mut result, &mut current_content);
            result.push_str(&Self::render_table_html(table));
        }
    }
    flush_paragraph(&mut result, &mut current_content);

    if config.enable_hyphenation_reconstruction {
        let handler = HyphenationHandler::new();
        result = handler.process_text(&result);
    }
    Ok(result)
}
/// Renders an extracted table as an HTML `<table>` element.
///
/// Returns an empty string for a table without rows. When the table is flagged as having
/// a header (or its first row is a header row), the leading run of header rows is
/// emitted inside `<thead>` using `<th>` cells; all remaining rows go into `<tbody>`
/// using `<td>` cells. A section with no rows is omitted. `colspan`/`rowspan` attributes
/// are written only for values greater than 1, and cell text is trimmed and HTML-escaped.
fn render_table_html(table: &ExtractedTable) -> String {
    if table.rows.is_empty() {
        return String::new();
    }

    let starts_with_header =
        table.has_header || table.rows.first().is_some_and(|row| row.is_header);
    // Number of consecutive header rows at the top of the table.
    let header_end = if starts_with_header {
        table.rows.iter().take_while(|row| row.is_header).count()
    } else {
        0
    };

    let sections = [
        ("thead", "th", &table.rows[..header_end]),
        ("tbody", "td", &table.rows[header_end..]),
    ];

    let mut html = String::from("<table>\n");
    for (section_tag, cell_tag, rows) in sections {
        if rows.is_empty() {
            continue;
        }
        html.push_str(&format!("<{}>\n", section_tag));
        for row in rows {
            html.push_str("<tr>");
            for cell in &row.cells {
                let mut attrs = String::new();
                if cell.colspan > 1 {
                    attrs.push_str(&format!(" colspan=\"{}\"", cell.colspan));
                }
                if cell.rowspan > 1 {
                    attrs.push_str(&format!(" rowspan=\"{}\"", cell.rowspan));
                }
                let text = Self::escape_html(cell.text.trim());
                html.push_str(&format!("<{}{}>{}</{}>", cell_tag, attrs, text, cell_tag));
            }
            html.push_str("</tr>\n");
        }
        html.push_str(&format!("</{}>\n", section_tag));
    }
    html.push_str("</table>\n");
    html
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::geometry::Rect;
    use crate::layout::{Color, TextSpan};
    use crate::pipeline::converters::span_in_table;
    use crate::structure::table_extractor::{TableCell, TableRow};

    /// Builds a black, non-italic span with a 50pt-wide bounding box at `(x, y)` and
    /// reading order 0.
    fn make_span(
        text: &str,
        x: f32,
        y: f32,
        font_size: f32,
        weight: FontWeight,
    ) -> OrderedTextSpan {
        OrderedTextSpan::new(
            TextSpan {
                artifact_type: None,
                text: text.to_string(),
                bbox: Rect::new(x, y, 50.0, font_size),
                font_name: "Test".to_string(),
                font_size,
                font_weight: weight,
                is_italic: false,
                is_monospace: false,
                color: Color::black(),
                mcid: None,
                sequence: 0,
                offset_semantic: false,
                split_boundary_before: false,
                char_spacing: 0.0,
                word_spacing: 0.0,
                horizontal_scaling: 100.0,
                primary_detected: false,
                char_widths: vec![],
            },
            0,
        )
    }

    /// No spans produce no output.
    #[test]
    fn test_empty_spans() {
        let converter = HtmlOutputConverter::new();
        let config = TextPipelineConfig::default();
        let result = converter.convert(&[], &config).unwrap();
        assert_eq!(result, "");
    }

    /// A single regular span becomes one paragraph.
    #[test]
    fn test_single_paragraph() {
        let converter = HtmlOutputConverter::new();
        let config = TextPipelineConfig::default();
        let spans = vec![make_span(
            "Hello world",
            0.0,
            100.0,
            12.0,
            FontWeight::Normal,
        )];
        let result = converter.convert(&spans, &config).unwrap();
        assert_eq!(result, "<p>Hello world</p>\n");
    }

    /// Bold spans are wrapped in `<strong>`.
    #[test]
    fn test_bold_text() {
        let converter = HtmlOutputConverter::new();
        let config = TextPipelineConfig::default();
        let spans = vec![make_span("Bold", 0.0, 100.0, 12.0, FontWeight::Bold)];
        let result = converter.convert(&spans, &config).unwrap();
        assert_eq!(result, "<p><strong>Bold</strong></p>\n");
    }

    /// Markup in span text must be escaped, never emitted verbatim.
    #[test]
    fn test_html_escaping() {
        let converter = HtmlOutputConverter::new();
        let config = TextPipelineConfig::default();
        let spans = vec![make_span(
            "<script>alert('XSS')</script>",
            0.0,
            100.0,
            12.0,
            FontWeight::Normal,
        )];
        let result = converter.convert(&spans, &config).unwrap();
        assert!(result.contains("&lt;script&gt;"));
        assert!(!result.contains("<script>"));
    }

    /// A table without rows renders as an empty string.
    #[test]
    fn test_render_table_html_empty() {
        let table = ExtractedTable::new();
        let result = HtmlOutputConverter::render_table_html(&table);
        assert_eq!(result, "");
    }

    /// Header rows go to `<thead>`/`<th>`, data rows to `<tbody>`/`<td>`.
    #[test]
    fn test_render_table_html_basic() {
        let mut table = ExtractedTable::new();
        table.has_header = true;
        let mut header = TableRow::new(true);
        header.add_cell(TableCell::new("Name".to_string(), true));
        header.add_cell(TableCell::new("Age".to_string(), true));
        table.add_row(header);
        let mut data = TableRow::new(false);
        data.add_cell(TableCell::new("Alice".to_string(), false));
        data.add_cell(TableCell::new("30".to_string(), false));
        table.add_row(data);
        let result = HtmlOutputConverter::render_table_html(&table);
        assert!(result.contains("<table>"));
        assert!(result.contains("</table>"));
        assert!(result.contains("<thead>"));
        assert!(result.contains("</thead>"));
        assert!(result.contains("<tbody>"));
        assert!(result.contains("</tbody>"));
        assert!(result.contains("<th>Name</th>"));
        assert!(result.contains("<th>Age</th>"));
        assert!(result.contains("<td>Alice</td>"));
        assert!(result.contains("<td>30</td>"));
    }

    /// Without header rows no `<thead>` section is emitted.
    #[test]
    fn test_render_table_html_no_header() {
        let mut table = ExtractedTable::new();
        let mut row = TableRow::new(false);
        row.add_cell(TableCell::new("A".to_string(), false));
        table.add_row(row);
        let result = HtmlOutputConverter::render_table_html(&table);
        assert!(result.contains("<table>"));
        assert!(!result.contains("<thead>"), "Should not have thead when no header");
        assert!(result.contains("<tbody>"));
        assert!(result.contains("<td>A</td>"));
    }

    /// A column span greater than one is written as a `colspan` attribute.
    #[test]
    fn test_render_table_html_colspan() {
        let mut table = ExtractedTable::new();
        let mut row = TableRow::new(false);
        row.add_cell(TableCell::new("Wide".to_string(), false).with_colspan(3));
        table.add_row(row);
        let result = HtmlOutputConverter::render_table_html(&table);
        assert!(result.contains("colspan=\"3\""), "Should have colspan attribute: {}", result);
    }

    /// A row span greater than one is written as a `rowspan` attribute.
    #[test]
    fn test_render_table_html_rowspan() {
        let mut table = ExtractedTable::new();
        let mut row = TableRow::new(false);
        row.add_cell(TableCell::new("Tall".to_string(), false).with_rowspan(2));
        table.add_row(row);
        let result = HtmlOutputConverter::render_table_html(&table);
        assert!(result.contains("rowspan=\"2\""), "Should have rowspan attribute: {}", result);
    }

    /// Cell text is HTML-escaped.
    #[test]
    fn test_render_table_html_escapes_content() {
        let mut table = ExtractedTable::new();
        let mut row = TableRow::new(false);
        row.add_cell(TableCell::new("<b>bold</b>".to_string(), false));
        row.add_cell(TableCell::new("A & B".to_string(), false));
        table.add_row(row);
        let result = HtmlOutputConverter::render_table_html(&table);
        assert!(
            result.contains("&lt;b&gt;bold&lt;/b&gt;"),
            "HTML should be escaped: {}",
            result
        );
        assert!(result.contains("A &amp; B"), "Ampersand should be escaped: {}", result);
        assert!(!result.contains("<b>bold</b>"), "Raw HTML should not appear");
    }

    /// A table consisting only of header rows has a `<thead>` but no `<tbody>`.
    #[test]
    fn test_render_table_html_all_header_rows() {
        let mut table = ExtractedTable::new();
        table.has_header = true;
        let mut h1 = TableRow::new(true);
        h1.add_cell(TableCell::new("H1".to_string(), true));
        table.add_row(h1);
        let mut h2 = TableRow::new(true);
        h2.add_cell(TableCell::new("H2".to_string(), true));
        table.add_row(h2);
        let result = HtmlOutputConverter::render_table_html(&table);
        assert!(result.contains("<thead>"));
        assert!(result.contains("<th>H1</th>"));
        assert!(result.contains("<th>H2</th>"));
        assert!(!result.contains("<tbody>"));
    }

    /// Tables are rendered even when there are no spans at all.
    #[test]
    fn test_convert_with_tables_renders_html_table() {
        let converter = HtmlOutputConverter::new();
        let config = TextPipelineConfig::default();
        let mut table = ExtractedTable::new();
        table.bbox = Some(Rect::new(10.0, 50.0, 200.0, 100.0));
        table.has_header = true;
        let mut header = TableRow::new(true);
        header.add_cell(TableCell::new("X".to_string(), true));
        table.add_row(header);
        let mut data = TableRow::new(false);
        data.add_cell(TableCell::new("Y".to_string(), false));
        table.add_row(data);
        let result = converter
            .convert_with_tables(&[], &[table], &config)
            .unwrap();
        assert!(result.contains("<table>"), "Should contain HTML table: {}", result);
        assert!(result.contains("<th>X</th>"));
        assert!(result.contains("<td>Y</td>"));
    }

    /// Text outside a table stays a paragraph; text inside the table region is replaced
    /// by the rendered table.
    #[test]
    fn test_convert_with_tables_mixed_content() {
        let converter = HtmlOutputConverter::new();
        let config = TextPipelineConfig::default();
        let mut span_before = make_span("Intro", 10.0, 200.0, 12.0, FontWeight::Normal);
        span_before.reading_order = 0;
        let mut table_span = make_span("Inside", 50.0, 70.0, 12.0, FontWeight::Normal);
        table_span.reading_order = 1;
        let mut table = ExtractedTable::new();
        table.bbox = Some(Rect::new(10.0, 50.0, 200.0, 100.0));
        let mut row = TableRow::new(false);
        row.add_cell(TableCell::new("Cell".to_string(), false));
        table.add_row(row);
        let result = converter
            .convert_with_tables(&[span_before, table_span], &[table], &config)
            .unwrap();
        assert!(result.contains("<p>Intro</p>"), "Should contain paragraph: {}", result);
        assert!(result.contains("<table>"), "Should contain table: {}", result);
        assert!(!result.contains("Inside"), "Should exclude span in table region");
    }

    /// With no tables, `convert_with_tables` and `convert` agree.
    #[test]
    fn test_convert_with_tables_no_tables_same_as_convert() {
        let converter = HtmlOutputConverter::new();
        let config = TextPipelineConfig::default();
        let spans = vec![make_span("Hello", 0.0, 100.0, 12.0, FontWeight::Normal)];
        let result_convert = converter.convert(&spans, &config).unwrap();
        let result_with_tables = converter.convert_with_tables(&spans, &[], &config).unwrap();
        assert_eq!(result_convert, result_with_tables);
    }

    /// Large text that is an address, a box label, a currency amount or a long sentence
    /// must not be promoted to a heading.
    #[test]
    fn test_heading_not_assigned_to_non_heading_content() {
        let converter = HtmlOutputConverter::new();
        let mut config = TextPipelineConfig::default();
        config.output.detect_headings = true;
        let mut body1 = make_span("Gross revenue", 10.0, 200.0, 10.0, FontWeight::Normal);
        body1.reading_order = 4;
        let mut body2 = make_span("Operating expenses", 10.0, 220.0, 10.0, FontWeight::Normal);
        body2.reading_order = 5;
        let mut body3 = make_span("Net income", 10.0, 240.0, 10.0, FontWeight::Normal);
        body3.reading_order = 6;
        let mut body4 = make_span("Interest paid", 10.0, 260.0, 10.0, FontWeight::Normal);
        body4.reading_order = 7;
        let mut body5 = make_span("Depreciation", 10.0, 280.0, 10.0, FontWeight::Normal);
        body5.reading_order = 8;
        let mut address = make_span("123 Main Street", 10.0, 20.0, 24.0, FontWeight::Normal);
        address.reading_order = 0;
        let mut box_label = make_span("Box 14", 10.0, 60.0, 20.0, FontWeight::Normal);
        box_label.reading_order = 1;
        let mut amount = make_span("$65,700.00", 10.0, 100.0, 24.0, FontWeight::Normal);
        amount.reading_order = 2;
        let mut long_text = make_span(
            "This is a very long paragraph of text that goes on and on and contains many words and should never be classified as a heading because headings are short descriptive labels",
            10.0, 140.0, 24.0, FontWeight::Normal,
        );
        long_text.reading_order = 3;
        let spans = vec![
            address, box_label, amount, long_text, body1, body2, body3, body4, body5,
        ];
        let result = converter
            .convert_semantic_mode(&spans, &[], &config)
            .unwrap();
        assert!(!result.contains("<h1>123 Main Street"), "Address should not be h1: {}", result);
        assert!(
            !result.contains("<h2>Box 14") && !result.contains("<h1>Box 14"),
            "Box label should not be a heading: {}",
            result
        );
        assert!(
            !result.contains("<h1>$65,700.00") && !result.contains("<h2>$65,700.00"),
            "Currency amount should not be a heading: {}",
            result
        );
        assert!(
            !result.contains("<h1>This is a very long"),
            "Long text should not be a heading: {}",
            result
        );
        assert!(result.contains("<p>"), "Content should be in <p> tags: {}", result);
    }

    /// A short, large, bold title above body text is detected as a heading.
    #[test]
    fn test_heading_assigned_to_real_headings() {
        let converter = HtmlOutputConverter::new();
        let mut config = TextPipelineConfig::default();
        config.output.detect_headings = true;
        let mut heading = make_span("Introduction", 10.0, 20.0, 24.0, FontWeight::Bold);
        heading.reading_order = 0;
        let mut body1 = make_span(
            "This is the body text of the document.",
            10.0,
            60.0,
            10.0,
            FontWeight::Normal,
        );
        body1.reading_order = 1;
        let mut body2 =
            make_span("More body text follows here.", 10.0, 80.0, 10.0, FontWeight::Normal);
        body2.reading_order = 2;
        let mut body3 = make_span("And even more content.", 10.0, 100.0, 10.0, FontWeight::Normal);
        body3.reading_order = 3;
        let spans = vec![heading, body1, body2, body3];
        let result = converter
            .convert_semantic_mode(&spans, &[], &config)
            .unwrap();
        assert!(
            result.contains("<h1>") || result.contains("<h2>") || result.contains("<h3>"),
            "Real heading should be detected: {}",
            result
        );
        assert!(result.contains("Introduction"), "Heading text should appear: {}", result);
    }

    /// `span_in_table` reports the index of the table whose bounding box contains the
    /// span, and `None` for a span outside every table.
    #[test]
    fn test_span_in_table_html() {
        let mut table = ExtractedTable::new();
        table.bbox = Some(Rect::new(10.0, 50.0, 200.0, 100.0));
        let inside = make_span("inside", 50.0, 70.0, 12.0, FontWeight::Normal);
        let outside = make_span("outside", 500.0, 500.0, 12.0, FontWeight::Normal);
        assert_eq!(span_in_table(&inside, &[table.clone()]), Some(0));
        assert_eq!(span_in_table(&outside, &[table]), None);
    }
}