mod html;
mod markdown;
mod plain_text;
pub mod toc_detector;
pub use html::HtmlOutputConverter;
pub use markdown::MarkdownOutputConverter;
pub use plain_text::PlainTextConverter;
pub use toc_detector::{TocDetector, TocEntry};
use crate::error::Result;
use crate::layout::TextSpan;
use crate::pipeline::{OrderedTextSpan, TextPipelineConfig};
use crate::structure::table_extractor::Table;
/// Shared interface for serializing ordered text spans into an output document.
///
/// Implementors must be `Send + Sync`, so a converter can be shared across threads.
pub trait OutputConverter: Send + Sync {
    /// Renders `spans` into a single output string, honoring `config`.
    ///
    /// # Errors
    ///
    /// Returns whatever error the concrete converter reports.
    fn convert(&self, spans: &[OrderedTextSpan], config: &TextPipelineConfig) -> Result<String>;

    /// Renders `spans` with the detected `tables` made available to the converter.
    ///
    /// The provided implementation does not use `tables`: it simply forwards to
    /// [`OutputConverter::convert`]. Converters that can render tables are
    /// expected to override this method.
    ///
    /// # Errors
    ///
    /// Returns whatever error the concrete converter reports.
    fn convert_with_tables(
        &self,
        spans: &[OrderedTextSpan],
        tables: &[Table],
        config: &TextPipelineConfig,
    ) -> Result<String> {
        // Deliberately unused here; bound only to keep the parameter name meaningful.
        let _unused_tables = tables;
        self.convert(spans, config)
    }

    /// Returns a static name identifying the converter
    /// (the tests in this file expect e.g. `"MarkdownOutputConverter"`).
    fn name(&self) -> &'static str;

    /// Returns the MIME type of the produced output
    /// (the tests in this file expect e.g. `"text/markdown"`).
    fn mime_type(&self) -> &'static str;
}
/// Reports whether a moderate horizontal gap separates `prev` from `current`.
///
/// The gap is `current.bbox.x - (prev.bbox.x + prev.bbox.width)`. It counts as
/// a gap when it is strictly greater than 0.15x and strictly less than 5x the
/// reference font size, where the reference is the larger of the two spans'
/// font sizes, floored at `1.0`.
pub(crate) fn has_horizontal_gap(prev: &TextSpan, current: &TextSpan) -> bool {
    // Floor at 1.0 so spans with tiny or zero font sizes still get usable bounds.
    let reference_size = prev.font_size.max(current.font_size).max(1.0);
    let gap = current.bbox.x - (prev.bbox.x + prev.bbox.width);

    let lower_bound = reference_size * 0.15;
    let upper_bound = reference_size * 5.0;

    lower_bound < gap && gap < upper_bound
}
/// Finds the first table whose bounding box contains the span's `(x, y)` point.
///
/// Each table's `bbox` is widened by a fixed tolerance of `2.0` on every side
/// before the test. Tables without a `bbox` never match. Only the span's
/// `bbox.x` / `bbox.y` point is tested, not the span's full extent.
///
/// Returns the index into `tables` of the first match, or `None`.
pub(crate) fn span_in_table(span: &OrderedTextSpan, tables: &[Table]) -> Option<usize> {
    let (point_x, point_y) = (span.span.bbox.x, span.span.bbox.y);

    tables.iter().position(|table| match table.bbox {
        Some(ref area) => {
            let tolerance = 2.0;
            let left = area.x - tolerance;
            let right = area.x + area.width + tolerance;
            let top = area.y - tolerance;
            let bottom = area.y + area.height + tolerance;

            left <= point_x && point_x <= right && top <= point_y && point_y <= bottom
        }
        None => false,
    })
}
/// Joins "label" lines with the short "value" line that follows them.
///
/// A label directly followed by a value, or separated from it by exactly one
/// blank line, is rewritten as `"<label> <value>"` on a single line. All other
/// lines are copied through unchanged. Input with fewer than two lines is
/// returned as-is.
///
/// * A *value* line is non-empty after trimming, at most 30 bytes long, and
///   starts with an ASCII digit or one of `$ € £ ¥ ( - .`.
/// * A *label* line is non-empty after trimming, is not a value line, and ends
///   with an alphanumeric character, `)` or `:`.
///
/// The output carries the same number of trailing `'\n'` characters as the
/// input. Lines are split with [`str::lines`], so output lines are always
/// terminated by `'\n'`.
pub(crate) fn merge_key_value_pairs(text: &str) -> String {
    /// True when the trimmed line is short and starts like a number or amount.
    fn looks_like_value(line: &str) -> bool {
        let candidate = line.trim();
        // Length is measured in bytes, matching `str::len`.
        if candidate.len() > 30 {
            return false;
        }
        match candidate.chars().next() {
            Some(lead) => matches!(lead, '0'..='9' | '$' | '€' | '£' | '¥' | '(' | '-' | '.'),
            None => false,
        }
    }

    /// True when the line is not a value and ends like a caption or field name.
    fn looks_like_label(line: &str) -> bool {
        if looks_like_value(line) {
            return false;
        }
        match line.trim().chars().next_back() {
            Some(tail) => tail.is_alphanumeric() || matches!(tail, ')' | ':'),
            None => false,
        }
    }

    let rows: Vec<&str> = text.lines().collect();
    if rows.len() < 2 {
        return text.to_string();
    }

    let mut merged = String::with_capacity(text.len());
    let mut remaining: &[&str] = &rows;

    while let Some((&head, tail)) = remaining.split_first() {
        let head_is_label = looks_like_label(head);

        // Decide which following line (if any) is the value, and how many
        // input lines this step consumes.
        let (value, consumed) = match tail {
            [next, ..] if head_is_label && looks_like_value(next) => (Some(*next), 2),
            [gap, next, ..]
                if head_is_label && gap.trim().is_empty() && looks_like_value(next) =>
            {
                (Some(*next), 3)
            }
            _ => (None, 1),
        };

        match value {
            Some(value) => {
                merged.push_str(head.trim_end());
                merged.push(' ');
                merged.push_str(value.trim_start());
            }
            None => merged.push_str(head),
        }
        merged.push('\n');

        remaining = &remaining[consumed..];
    }

    // Re-create exactly as many trailing newlines as the input had.
    let trailing_newlines = text.len() - text.trim_end_matches('\n').len();
    let kept = merged.trim_end_matches('\n').len();
    merged.truncate(kept);
    merged.extend(std::iter::repeat('\n').take(trailing_newlines));

    merged
}
/// Builds the converter registered for `format`, matched case-insensitively.
///
/// Recognized names:
/// * `"markdown"`, `"md"` produce a [`MarkdownOutputConverter`]
/// * `"html"` produces an [`HtmlOutputConverter`]
/// * `"text"`, `"plain"`, `"txt"` produce a [`PlainTextConverter`]
///
/// Returns `None` for any other name. The input is not trimmed.
pub fn create_converter(format: &str) -> Option<Box<dyn OutputConverter>> {
    let normalized = format.to_lowercase();

    let converter: Box<dyn OutputConverter> = match normalized.as_str() {
        "markdown" | "md" => Box::new(MarkdownOutputConverter::new()),
        "html" => Box::new(HtmlOutputConverter::new()),
        "text" | "plain" | "txt" => Box::new(PlainTextConverter::new()),
        _ => return None,
    };

    Some(converter)
}
/// Unit tests for the converter factory and the key/value line merger.
#[cfg(test)]
mod tests {
    use super::*;

    // ---- create_converter -------------------------------------------------

    #[test]
    fn test_create_converter_markdown() {
        let converter = create_converter("markdown").unwrap();
        assert_eq!(converter.name(), "MarkdownOutputConverter");
        assert_eq!(converter.mime_type(), "text/markdown");
    }

    #[test]
    fn test_create_converter_html() {
        let converter = create_converter("html").unwrap();
        assert_eq!(converter.name(), "HtmlOutputConverter");
        assert_eq!(converter.mime_type(), "text/html");
    }

    #[test]
    fn test_create_converter_text() {
        let converter = create_converter("text").unwrap();
        assert_eq!(converter.name(), "PlainTextConverter");
        assert_eq!(converter.mime_type(), "text/plain");
    }

    #[test]
    fn test_create_converter_unknown() {
        assert!(create_converter("unknown").is_none());
    }

    /// Every short alias must resolve to the same converter as its long name.
    #[test]
    fn test_create_converter_aliases() {
        assert_eq!(
            create_converter("md").unwrap().name(),
            "MarkdownOutputConverter"
        );
        assert_eq!(
            create_converter("plain").unwrap().name(),
            "PlainTextConverter"
        );
        assert_eq!(
            create_converter("txt").unwrap().name(),
            "PlainTextConverter"
        );
    }

    /// Format names are lower-cased before matching.
    #[test]
    fn test_create_converter_case_insensitive() {
        assert_eq!(
            create_converter("Markdown").unwrap().name(),
            "MarkdownOutputConverter"
        );
        assert_eq!(
            create_converter("HTML").unwrap().name(),
            "HtmlOutputConverter"
        );
        assert_eq!(
            create_converter("TXT").unwrap().name(),
            "PlainTextConverter"
        );
    }

    // ---- merge_key_value_pairs --------------------------------------------

    #[test]
    fn test_key_value_pair_merging_basic() {
        let input = "Grand Total\n$750.00\nNet Amount\n$250.00\n";
        let expected = "Grand Total $750.00\nNet Amount $250.00\n";
        assert_eq!(merge_key_value_pairs(input), expected);
    }

    #[test]
    fn test_key_value_pair_merging_no_false_positive_on_sentences() {
        // A line ending in '.' is not a label, so nothing is merged.
        let input = "This is a sentence.\n$100.00\n";
        assert_eq!(merge_key_value_pairs(input), input);
    }

    #[test]
    fn test_key_value_pair_merging_negative_numbers() {
        let input = "Balance Due\n-$42.50\n";
        let expected = "Balance Due -$42.50\n";
        assert_eq!(merge_key_value_pairs(input), expected);
    }

    #[test]
    fn test_key_value_pair_merging_plain_numbers() {
        let input = "Account Number\n434508032\n";
        let expected = "Account Number 434508032\n";
        assert_eq!(merge_key_value_pairs(input), expected);
    }

    #[test]
    fn test_key_value_pair_merging_skips_long_values() {
        let input = "Introduction\nThis is a full paragraph of text that continues.\n";
        assert_eq!(merge_key_value_pairs(input), input);
    }

    #[test]
    fn test_key_value_pair_merging_preserves_blank_lines() {
        // The blank line follows a label but is not followed by a value,
        // so it must survive.
        let input = "Section A\n\nTotal\n$100\n";
        let expected = "Section A\n\nTotal $100\n";
        assert_eq!(merge_key_value_pairs(input), expected);
    }

    /// A label and its value separated by exactly one blank line are joined,
    /// and the blank line is dropped.
    #[test]
    fn test_key_value_pair_merging_across_blank_line() {
        let input = "Total\n\n$100\n";
        let expected = "Total $100\n";
        assert_eq!(merge_key_value_pairs(input), expected);
    }

    #[test]
    fn test_key_value_pair_merging_consecutive_pairs() {
        let input = "Subtotal\n$200.00\nTax\n$18.00\nTotal\n$218.00\n";
        let expected = "Subtotal $200.00\nTax $18.00\nTotal $218.00\n";
        assert_eq!(merge_key_value_pairs(input), expected);
    }

    #[test]
    fn test_key_value_pair_merging_euro_and_pound() {
        let input = "Price\n€49.99\nShipping\n£5.00\n";
        let expected = "Price €49.99\nShipping £5.00\n";
        assert_eq!(merge_key_value_pairs(input), expected);
    }

    #[test]
    fn test_key_value_pair_merging_parenthesized_negative() {
        let input = "Net Loss\n(1,234.56)\n";
        let expected = "Net Loss (1,234.56)\n";
        assert_eq!(merge_key_value_pairs(input), expected);
    }

    #[test]
    fn test_key_value_pair_merging_no_merge_value_value() {
        let input = "$100\n$200\n";
        assert_eq!(merge_key_value_pairs(input), input);
    }

    /// The number of trailing newlines in the input is reproduced exactly,
    /// including the case where there are none.
    #[test]
    fn test_key_value_pair_merging_trailing_newlines() {
        assert_eq!(merge_key_value_pairs("Total\n$100"), "Total $100");
        assert_eq!(merge_key_value_pairs("Total\n$100\n\n\n"), "Total $100\n\n\n");
    }

    #[test]
    fn test_key_value_pair_merging_empty_input() {
        assert_eq!(merge_key_value_pairs(""), "");
        assert_eq!(merge_key_value_pairs("single line\n"), "single line\n");
    }
}