mod html;
mod markdown;
mod plain_text;
pub mod toc_detector;
pub use html::HtmlOutputConverter;
pub use markdown::MarkdownOutputConverter;
pub use plain_text::PlainTextConverter;
pub use toc_detector::{TocDetector, TocEntry};
use crate::error::Result;
use crate::layout::TextSpan;
use crate::pipeline::{OrderedTextSpan, TextPipelineConfig};
use crate::structure::table_extractor::Table;
/// Common interface implemented by every output back-end in this module
/// (the converters re-exported above and returned by [`create_converter`]).
///
/// Implementors must be `Send + Sync` so a boxed converter can be shared
/// across threads.
pub trait OutputConverter: Send + Sync {
    /// Converts `spans` into this converter's output format.
    ///
    /// # Errors
    ///
    /// Returns the crate-level error type when the implementation fails.
    fn convert(&self, spans: &[OrderedTextSpan], config: &TextPipelineConfig) -> Result<String>;

    /// Converts `spans` with detected `tables` made available to the converter.
    ///
    /// The provided implementation does not use `tables`: it forwards to
    /// [`OutputConverter::convert`] unchanged. Implementors that can render
    /// tables override this method.
    ///
    /// # Errors
    ///
    /// Propagates whatever [`OutputConverter::convert`] returns.
    fn convert_with_tables(
        &self,
        spans: &[OrderedTextSpan],
        tables: &[Table],
        config: &TextPipelineConfig,
    ) -> Result<String> {
        // Deliberately unused here; bound to an underscore name so the default
        // implementation compiles without an unused-parameter warning.
        let _unused_tables = tables;
        self.convert(spans, config)
    }

    /// Static identifier of the converter (e.g. `"MarkdownOutputConverter"`).
    fn name(&self) -> &'static str;

    /// MIME type of the produced output (e.g. `"text/markdown"`).
    fn mime_type(&self) -> &'static str;
}
/// Returns `true` when `c` lies in one of the CJK code-point ranges recognised
/// by this module.
fn is_cjk_char(c: char) -> bool {
    // Inclusive code-point ranges in ascending order.
    const CJK_RANGES: &[(u32, u32)] = &[
        (0x3040, 0x30FF),   // Hiragana and Katakana (two adjacent blocks)
        (0x3400, 0x4DBF),   // CJK Unified Ideographs Extension A
        (0x4E00, 0x9FFF),   // CJK Unified Ideographs
        (0xAC00, 0xD7AF),   // Hangul Syllables
        (0x20000, 0x2A6DF), // CJK Unified Ideographs Extension B
    ];

    let code_point = u32::from(c);
    CJK_RANGES
        .iter()
        .any(|&(first, last)| (first..=last).contains(&code_point))
}
/// Returns `true` when `c` is one of the fullwidth punctuation marks, math
/// operators or micro/mu signs listed below.
fn is_fullwidth_or_math_op(c: char) -> bool {
    const OPERATORS: &[char] = &[
        // Fullwidth forms: plus, hyphen-minus, colon, semicolon, <, =, >
        '\u{FF0B}', '\u{FF0D}', '\u{FF1A}', '\u{FF1B}', '\u{FF1C}', '\u{FF1D}', '\u{FF1E}',
        // Relations: not-equal, almost-equal, less-or-equal, greater-or-equal
        '\u{2260}', '\u{2248}', '\u{2264}', '\u{2265}',
        // Micro sign and Greek small letter mu
        '\u{00B5}', '\u{03BC}',
        // Plus-minus, multiplication, division
        '\u{00B1}', '\u{00D7}', '\u{00F7}',
    ];

    OPERATORS.contains(&c)
}
/// Decides whether the horizontal distance between two spans is a real gap.
///
/// The distance is measured from the right edge of `prev`
/// (`bbox.x + bbox.width`) to the left edge of `current` (`bbox.x`). It counts
/// as a gap only when it exceeds 15% of the larger of the two font sizes, with
/// that font size clamped to at least `1.0`.
///
/// Even above the threshold the result is `false` when the last character of
/// `prev` and the first character of `current` are each either a CJK character
/// or a fullwidth/math operator, and at least one of them is CJK. Two operators
/// with no CJK character involved are still judged on distance alone.
pub(crate) fn has_horizontal_gap(prev: &TextSpan, current: &TextSpan) -> bool {
    let reference_size = prev.font_size.max(current.font_size).max(1.0);
    let min_gap = reference_size * 0.15;

    let distance = current.bbox.x - (prev.bbox.x + prev.bbox.width);
    if distance <= min_gap {
        return false;
    }

    match (prev.text.chars().next_back(), current.text.chars().next()) {
        (Some(left), Some(right)) => {
            let left_cjk = is_cjk_char(left);
            let right_cjk = is_cjk_char(right);
            let left_tight = left_cjk || is_fullwidth_or_math_op(left);
            let right_tight = right_cjk || is_fullwidth_or_math_op(right);
            // Suppress only when both sides are "tight" and CJK is involved.
            !(left_tight && right_tight && (left_cjk || right_cjk))
        }
        // At least one span has no text: distance alone decides.
        _ => true,
    }
}
pub(crate) fn span_in_table(span: &OrderedTextSpan, tables: &[Table]) -> Option<usize> {
let sx = span.span.bbox.x;
let sy = span.span.bbox.y;
for (i, table) in tables.iter().enumerate() {
let Some(ref bbox) = table.bbox else { continue };
let tolerance = 2.0;
let in_outer_bbox = sx >= bbox.x - tolerance
&& sx <= bbox.x + bbox.width + tolerance
&& sy >= bbox.y - tolerance
&& sy <= bbox.y + bbox.height + tolerance;
if !in_outer_bbox {
continue;
}
let has_any_cell_bbox = table
.rows
.iter()
.any(|row| row.cells.iter().any(|c| c.bbox.is_some()));
if !has_any_cell_bbox {
return Some(i);
}
let span_owned = table.rows.iter().any(|row| {
row.cells.iter().any(|cell| {
let Some(cb) = cell.bbox else { return false };
sx >= cb.x - tolerance
&& sx <= cb.x + cb.width + tolerance
&& sy >= cb.y - tolerance
&& sy <= cb.y + cb.height + tolerance
})
});
if span_owned {
return Some(i);
}
}
None
}
/// Joins a label line with the short value line that follows it.
///
/// Text extracted from forms and invoices often puts a label and its value on
/// separate lines (`"Grand Total\n$750.00"`). This pass rewrites such pairs as
/// a single line (`"Grand Total $750.00"`).
///
/// * A **value line** is non-empty after trimming, at most 30 characters long,
///   and starts with an ASCII digit, a currency sign (`$`, `€`, `£`, `¥`),
///   `(`, `-` or `.`.
/// * A **label line** is non-empty after trimming, is not itself a value line,
///   and ends with an alphanumeric character, `)` or `:`.
///
/// A label is merged with the value on the next line, or with the value two
/// lines below when exactly one blank line separates them (the blank line is
/// dropped). All other lines are copied through unchanged. The number of
/// trailing `'\n'` characters of `text` is preserved. Lines are split with
/// [`str::lines`], so a `"\r\n"` line ending comes out as `"\n"`.
///
/// Input with fewer than two lines is returned unchanged.
pub(crate) fn merge_key_value_pairs(text: &str) -> String {
    /// Longest value, in characters, that still counts as a short value.
    const MAX_VALUE_CHARS: usize = 30;

    /// True when `line` looks like a short numeric or monetary value.
    fn is_value_line(line: &str) -> bool {
        let trimmed = line.trim();
        // Measure in characters, not bytes: the accepted currency signs
        // (€, £, ¥) are multi-byte in UTF-8, so a byte limit would reject
        // values that are visibly short. `nth` stops after 31 characters
        // instead of walking a long paragraph.
        if trimmed.chars().nth(MAX_VALUE_CHARS).is_some() {
            return false;
        }
        matches!(
            trimmed.chars().next(),
            Some('0'..='9' | '$' | '€' | '£' | '¥' | '(' | '-' | '.')
        )
    }

    /// True when `line` can act as the label of a following value.
    fn is_label_line(line: &str) -> bool {
        if is_value_line(line) {
            return false;
        }
        matches!(
            line.trim().chars().next_back(),
            Some(last) if last.is_alphanumeric() || last == ')' || last == ':'
        )
    }

    let lines: Vec<&str> = text.lines().collect();
    if lines.len() < 2 {
        return text.to_string();
    }

    let mut result = String::with_capacity(text.len());
    let mut i = 0;
    while i < lines.len() {
        // Index of the value line that belongs to `lines[i]`, if any.
        let value_idx = if !is_label_line(lines[i]) {
            None
        } else if matches!(lines.get(i + 1), Some(next) if is_value_line(next)) {
            Some(i + 1)
        } else if matches!(lines.get(i + 1), Some(gap) if gap.trim().is_empty())
            && matches!(lines.get(i + 2), Some(next) if is_value_line(next))
        {
            Some(i + 2)
        } else {
            None
        };

        match value_idx {
            Some(v) => {
                result.push_str(lines[i].trim_end());
                result.push(' ');
                result.push_str(lines[v].trim_start());
                result.push('\n');
                i = v + 1;
            }
            None => {
                result.push_str(lines[i]);
                result.push('\n');
                i += 1;
            }
        }
    }

    // Restore exactly the trailing newlines the input had. '\n' is one byte,
    // so the byte-length difference equals the newline count.
    let trailing_newlines = text.len() - text.trim_end_matches('\n').len();
    let body_len = result.trim_end_matches('\n').len();
    result.truncate(body_len);
    result.extend(std::iter::repeat('\n').take(trailing_newlines));
    result
}
/// Builds the converter registered under `format`.
///
/// Matching is case-insensitive (the name is lowercased first). Recognised
/// names:
///
/// * `"markdown"`, `"md"` → [`MarkdownOutputConverter`]
/// * `"html"` → [`HtmlOutputConverter`]
/// * `"text"`, `"plain"`, `"txt"` → [`PlainTextConverter`]
///
/// Returns `None` for any other name.
pub fn create_converter(format: &str) -> Option<Box<dyn OutputConverter>> {
    let normalized = format.to_lowercase();
    let converter: Box<dyn OutputConverter> = match normalized.as_str() {
        "md" | "markdown" => Box::new(MarkdownOutputConverter::new()),
        "html" => Box::new(HtmlOutputConverter::new()),
        "txt" | "text" | "plain" => Box::new(PlainTextConverter::new()),
        _ => return None,
    };
    Some(converter)
}
#[cfg(test)]
mod tests {
    use super::*;

    // ---------------------------------------------------------------------
    // create_converter
    // ---------------------------------------------------------------------

    #[test]
    fn test_create_converter_markdown() {
        let converter = create_converter("markdown").unwrap();
        assert_eq!(converter.name(), "MarkdownOutputConverter");
        assert_eq!(converter.mime_type(), "text/markdown");
    }

    #[test]
    fn test_create_converter_html() {
        let converter = create_converter("html").unwrap();
        assert_eq!(converter.name(), "HtmlOutputConverter");
        assert_eq!(converter.mime_type(), "text/html");
    }

    #[test]
    fn test_create_converter_text() {
        let converter = create_converter("text").unwrap();
        assert_eq!(converter.name(), "PlainTextConverter");
        assert_eq!(converter.mime_type(), "text/plain");
    }

    #[test]
    fn test_create_converter_unknown() {
        assert!(create_converter("unknown").is_none());
    }

    // ---------------------------------------------------------------------
    // merge_key_value_pairs
    // ---------------------------------------------------------------------

    #[test]
    fn test_key_value_pair_merging_basic() {
        let input = "Grand Total\n$750.00\nNet Amount\n$250.00\n";
        let expected = "Grand Total $750.00\nNet Amount $250.00\n";
        assert_eq!(merge_key_value_pairs(input), expected);
    }

    /// A line ending in '.' is not a label, so nothing is merged.
    #[test]
    fn test_key_value_pair_merging_no_false_positive_on_sentences() {
        let input = "This is a sentence.\n$100.00\n";
        assert_eq!(merge_key_value_pairs(input), input);
    }

    #[test]
    fn test_key_value_pair_merging_negative_numbers() {
        let input = "Balance Due\n-$42.50\n";
        let expected = "Balance Due -$42.50\n";
        assert_eq!(merge_key_value_pairs(input), expected);
    }

    #[test]
    fn test_key_value_pair_merging_plain_numbers() {
        let input = "Account Number\n434508032\n";
        let expected = "Account Number 434508032\n";
        assert_eq!(merge_key_value_pairs(input), expected);
    }

    /// The second line does not start like a value, so it is left alone.
    #[test]
    fn test_key_value_pair_merging_skips_long_values() {
        let input = "Introduction\nThis is a full paragraph of text that continues.\n";
        assert_eq!(merge_key_value_pairs(input), input);
    }

    /// A blank line that is not between a label and a value is kept.
    #[test]
    fn test_key_value_pair_merging_preserves_blank_lines() {
        let input = "Section A\n\nTotal\n$100\n";
        let expected = "Section A\n\nTotal $100\n";
        assert_eq!(merge_key_value_pairs(input), expected);
    }

    #[test]
    fn test_key_value_pair_merging_consecutive_pairs() {
        let input = "Subtotal\n$200.00\nTax\n$18.00\nTotal\n$218.00\n";
        let expected = "Subtotal $200.00\nTax $18.00\nTotal $218.00\n";
        assert_eq!(merge_key_value_pairs(input), expected);
    }

    #[test]
    fn test_key_value_pair_merging_euro_and_pound() {
        let input = "Price\n€49.99\nShipping\n£5.00\n";
        let expected = "Price €49.99\nShipping £5.00\n";
        assert_eq!(merge_key_value_pairs(input), expected);
    }

    #[test]
    fn test_key_value_pair_merging_parenthesized_negative() {
        let input = "Net Loss\n(1,234.56)\n";
        let expected = "Net Loss (1,234.56)\n";
        assert_eq!(merge_key_value_pairs(input), expected);
    }

    /// Two value lines in a row: the first is not a label, so no merge.
    #[test]
    fn test_key_value_pair_merging_no_merge_value_value() {
        let input = "$100\n$200\n";
        assert_eq!(merge_key_value_pairs(input), input);
    }

    #[test]
    fn test_key_value_pair_merging_empty_input() {
        assert_eq!(merge_key_value_pairs(""), "");
        assert_eq!(merge_key_value_pairs("single line\n"), "single line\n");
    }

    // ---------------------------------------------------------------------
    // has_horizontal_gap
    // ---------------------------------------------------------------------

    /// Builds a span at `x` with width `w`, height 10 and font size 10, so the
    /// gap threshold in these tests is 1.5.
    fn make_span(x: f32, w: f32, text: &str) -> crate::layout::TextSpan {
        crate::layout::TextSpan {
            text: text.to_string(),
            bbox: crate::geometry::Rect::new(x, 0.0, w, 10.0),
            font_size: 10.0,
            ..Default::default()
        }
    }

    #[test]
    fn test_has_horizontal_gap_cjk_cjk_suppressed() {
        let prev = make_span(0.0, 10.0, "数");
        let curr = make_span(12.0, 10.0, "学");
        assert!(!has_horizontal_gap(&prev, &curr), "CJK→CJK should suppress space insertion");
    }

    #[test]
    fn test_has_horizontal_gap_cjk_fullwidth_suppressed() {
        let prev_cjk = make_span(0.0, 10.0, "量");
        // U+FF1C FULLWIDTH LESS-THAN SIGN, written as an escape: ASCII '<' is
        // not in the operator set and would make this assertion fail.
        let curr = make_span(12.0, 10.0, "\u{FF1C}");
        assert!(
            !has_horizontal_gap(&prev_cjk, &curr),
            "CJK→fullwidth-op should suppress space insertion"
        );
    }

    #[test]
    fn test_has_horizontal_gap_fullwidth_cjk_suppressed() {
        let prev = make_span(0.0, 10.0, "≤");
        let curr_cjk = make_span(12.0, 10.0, "量");
        assert!(
            !has_horizontal_gap(&prev, &curr_cjk),
            "fullwidth-op→CJK should suppress space insertion"
        );
    }

    #[test]
    fn test_has_horizontal_gap_latin_latin_unchanged() {
        let prev = make_span(0.0, 10.0, "hello");
        let curr = make_span(12.0, 10.0, "world");
        assert!(
            has_horizontal_gap(&prev, &curr),
            "Latin→Latin with gap > threshold should still insert space"
        );
    }

    #[test]
    fn test_has_horizontal_gap_latin_latin_no_gap() {
        let prev = make_span(0.0, 10.0, "hello");
        let curr = make_span(11.0, 10.0, "world");
        assert!(
            !has_horizontal_gap(&prev, &curr),
            "Latin→Latin below threshold should not insert space"
        );
    }

    #[test]
    fn test_has_horizontal_gap_two_pure_math_ops_unchanged() {
        let prev = make_span(0.0, 10.0, "≤");
        let curr = make_span(12.0, 10.0, "≥");
        assert!(
            has_horizontal_gap(&prev, &curr),
            "math-op→math-op (no CJK) should still apply gap-based logic"
        );
    }

    // ---------------------------------------------------------------------
    // span_in_table
    // ---------------------------------------------------------------------

    /// Table with an outer bbox and no rows or cells.
    fn make_table_no_cells(x: f32, y: f32, width: f32, height: f32) -> Table {
        let mut t = Table::new();
        t.bbox = Some(crate::geometry::Rect::new(x, y, width, height));
        t
    }

    /// Table with an outer bbox and a single one-cell row carrying `cell_bbox`.
    /// Both tuples are `(x, y, width, height)`.
    fn make_table_with_cell(
        table_bbox: (f32, f32, f32, f32),
        cell_bbox: (f32, f32, f32, f32),
    ) -> Table {
        use crate::structure::table_extractor::{TableCell, TableRow};

        let mut t = Table::new();
        t.bbox = Some(crate::geometry::Rect::new(
            table_bbox.0,
            table_bbox.1,
            table_bbox.2,
            table_bbox.3,
        ));
        let mut row = TableRow::new(false);
        let mut cell = TableCell::new(String::new(), false);
        cell.bbox =
            Some(crate::geometry::Rect::new(cell_bbox.0, cell_bbox.1, cell_bbox.2, cell_bbox.3));
        row.cells.push(cell);
        t.rows.push(row);
        t.col_count = 1;
        t
    }

    /// Span whose bbox origin is `(x, y)`.
    fn make_ordered_span(x: f32, y: f32) -> crate::pipeline::OrderedTextSpan {
        let span = crate::layout::TextSpan {
            text: "test".to_string(),
            bbox: crate::geometry::Rect::new(x, y, 5.0, 10.0),
            font_size: 10.0,
            ..Default::default()
        };
        crate::pipeline::OrderedTextSpan::new(span, 0)
    }

    #[test]
    fn span_in_table_no_cells_legacy_passthrough() {
        let table = make_table_no_cells(10.0, 50.0, 200.0, 100.0);
        let span = make_ordered_span(50.0, 70.0);
        assert_eq!(
            span_in_table(&span, &[table]),
            Some(0),
            "no-cell Table preserves legacy outer-bbox contract"
        );
    }

    #[test]
    fn span_in_table_owned_by_cell() {
        let table = make_table_with_cell((10.0, 50.0, 200.0, 100.0), (40.0, 60.0, 100.0, 20.0));
        let span = make_ordered_span(50.0, 70.0);
        assert_eq!(span_in_table(&span, &[table]), Some(0));
    }

    #[test]
    fn span_in_table_outer_bbox_only_returns_none() {
        // The only cell covers x in 10..60; the span sits at x = 150.
        let table = make_table_with_cell((10.0, 50.0, 200.0, 100.0), (10.0, 50.0, 50.0, 100.0));
        let span = make_ordered_span(150.0, 70.0);
        assert_eq!(
            span_in_table(&span, &[table]),
            None,
            "span outside every cell must NOT be marked in_table — \
             paragraph flow needs to pick it up instead of dropping"
        );
    }

    #[test]
    fn span_in_table_outside_all_tables() {
        let table = make_table_with_cell((10.0, 50.0, 200.0, 100.0), (40.0, 60.0, 100.0, 20.0));
        let span = make_ordered_span(500.0, 500.0);
        assert_eq!(span_in_table(&span, &[table]), None);
    }
}