pub mod formula_renderer;
pub mod html;
pub mod markdown;
pub mod office;
pub mod table_formatter;
pub mod text_post_processor;
pub mod whitespace;
pub use formula_renderer::{FormulaRenderer, RenderedFormula};
#[allow(deprecated)]
pub use html::HtmlConverter;
#[allow(deprecated)]
pub use markdown::MarkdownConverter;
pub use table_formatter::MarkdownTableFormatter;
pub use text_post_processor::TextPostProcessor;
pub use whitespace::{cleanup_markdown, normalize_whitespace, remove_page_artifacts};
#[cfg(feature = "office")]
pub use office::{DocxConverter, PptxConverter, XlsxConverter};
pub use office::{Margins, OfficeConfig, OfficeConverter};
pub use crate::pipeline::config::BoldMarkerBehavior;
/// Settings describing how a table should be formatted.
///
/// Three presets are provided (`default`, `compact`, `detailed`) plus `with_*` builder
/// methods for adjusting single fields.
///
/// NOTE(review): the formatter that consumes this config is not part of this file; the
/// per-field notes paraphrase the field names and list the preset values.
#[derive(Debug, Clone)]
pub struct TableFormatConfig {
    /// Header-separator toggle; `true` in every preset.
    pub include_header_separator: bool,
    /// Cell padding amount: 1 (default), 0 (compact), 2 (detailed).
    pub cell_padding: usize,
    /// Minimum column width: 3 (default), 1 (compact), 5 (detailed).
    pub min_column_width: usize,
    /// Toggle for merging adjacent empty cells; only the detailed preset turns it off.
    pub merge_adjacent_empty_cells: bool,
    /// Toggle for keeping cell formatting; only the compact preset turns it off.
    pub preserve_cell_formatting: bool,
    /// Text associated with empty cells: `"-"` (default), `""` (compact), `"—"` (detailed).
    pub empty_cell_text: String,
}

impl TableFormatConfig {
    /// Returns the standard preset.
    ///
    /// This inherent function takes precedence over `Default::default` for
    /// `TableFormatConfig::default()` path calls; both yield the same value.
    pub fn default() -> Self {
        <Self as Default>::default()
    }

    /// Returns the tightest preset: no padding, width 1, cell formatting not preserved,
    /// and an empty string as the empty-cell text.
    pub fn compact() -> Self {
        Self {
            empty_cell_text: String::new(),
            preserve_cell_formatting: false,
            merge_adjacent_empty_cells: true,
            min_column_width: 1,
            cell_padding: 0,
            include_header_separator: true,
        }
    }

    /// Returns the roomiest preset: padding 2, width 5, empty cells not merged, and an
    /// em dash as the empty-cell text.
    pub fn detailed() -> Self {
        Self {
            empty_cell_text: String::from("—"),
            preserve_cell_formatting: true,
            merge_adjacent_empty_cells: false,
            min_column_width: 5,
            cell_padding: 2,
            include_header_separator: true,
        }
    }

    /// Starting point for a hand-tuned configuration; identical to the standard preset.
    pub fn custom() -> Self {
        <Self as Default>::default()
    }

    /// Returns the config with `cell_padding` replaced by `padding`.
    pub fn with_cell_padding(self, padding: usize) -> Self {
        Self {
            cell_padding: padding,
            ..self
        }
    }

    /// Returns the config with `min_column_width` replaced by `width`.
    pub fn with_min_column_width(self, width: usize) -> Self {
        Self {
            min_column_width: width,
            ..self
        }
    }

    /// Returns the config with `empty_cell_text` replaced by an owned copy of `text`.
    pub fn with_empty_cell_text(self, text: &str) -> Self {
        Self {
            empty_cell_text: text.to_owned(),
            ..self
        }
    }
}

impl Default for TableFormatConfig {
    /// Builds the standard preset: separator on, padding 1, width 3, merging and
    /// formatting preservation on, `"-"` as the empty-cell text.
    fn default() -> Self {
        Self {
            empty_cell_text: String::from("-"),
            preserve_cell_formatting: true,
            merge_adjacent_empty_cells: true,
            min_column_width: 3,
            cell_padding: 1,
            include_header_separator: true,
        }
    }
}
/// Options controlling a document conversion.
///
/// NOTE(review): the code that interprets these fields lives outside this file, so the
/// per-field notes only restate the field names and the values set by the `Default` impl.
#[derive(Debug, Clone, PartialEq)]
pub struct ConversionOptions {
/// Layout-preservation toggle; `false` by default.
pub preserve_layout: bool,
/// Heading-detection toggle; `true` by default.
pub detect_headings: bool,
/// Table-extraction toggle; `true` by default and forced to `true` by
/// `with_table_detection` / `with_default_table_detection`.
pub extract_tables: bool,
/// Image-inclusion toggle; `false` by default (the tests describe this as a guard
/// against bloated output).
pub include_images: bool,
/// Optional image output directory; `None` by default.
pub image_output_dir: Option<String>,
/// Image-embedding toggle; `true` by default.
pub embed_images: bool,
/// Reading-order strategy; defaults to `StructureTreeFirst` with an empty `mcid_order`.
pub reading_order_mode: ReadingOrderMode,
/// Bold-marker handling; defaults to `BoldMarkerBehavior::Conservative`.
pub bold_marker_behavior: BoldMarkerBehavior,
/// Optional table-detection settings; `None` by default.
/// NOTE(review): `None` presumably selects the detector's built-in defaults — confirm.
pub table_detection_config: Option<crate::structure::TableDetectionConfig>,
/// Formula-rendering toggle; `false` by default.
pub render_formulas: bool,
/// Optional list of page image paths; `None` by default.
pub page_images: Option<Vec<std::path::PathBuf>>,
/// Optional page dimensions; `None` by default.
/// NOTE(review): presumably `(width, height)`; order and units are not visible here.
pub page_dimensions: Option<(f32, f32)>,
/// Form-field inclusion toggle; `true` by default.
pub include_form_fields: bool,
/// Optional image pixel limit; `None` by default.
/// NOTE(review): `None` presumably means "no limit" — confirm against the consumer.
pub max_image_pixels: Option<u64>,
}
impl Default for ConversionOptions {
    /// Builds the baseline option set.
    ///
    /// Heading detection, table extraction, image embedding and form fields start
    /// enabled; layout preservation, image inclusion and formula rendering start
    /// disabled; every optional setting starts as `None`.
    fn default() -> Self {
        // Structure-tree ordering is the default, with no MCID order supplied yet.
        let reading_order_mode = ReadingOrderMode::StructureTreeFirst {
            mcid_order: Vec::new(),
        };

        Self {
            // Toggles that start enabled.
            detect_headings: true,
            extract_tables: true,
            embed_images: true,
            include_form_fields: true,

            // Toggles that start disabled.
            preserve_layout: false,
            include_images: false,
            render_formulas: false,

            // Strategy selections.
            reading_order_mode,
            bold_marker_behavior: BoldMarkerBehavior::Conservative,

            // Optional settings, all unset.
            image_output_dir: None,
            table_detection_config: None,
            page_images: None,
            page_dimensions: None,
            max_image_pixels: None,
        }
    }
}
impl ConversionOptions {
    /// Returns the options with table extraction switched on and `config` stored as the
    /// table-detection settings.
    pub fn with_table_detection(self, config: crate::structure::TableDetectionConfig) -> Self {
        Self {
            extract_tables: true,
            table_detection_config: Some(config),
            ..self
        }
    }

    /// Returns the options with table extraction switched on and any previously stored
    /// table-detection settings cleared (`None`).
    pub fn with_default_table_detection(self) -> Self {
        Self {
            extract_tables: true,
            table_detection_config: None,
            ..self
        }
    }
}
/// Selects the strategy used to order content.
///
/// NOTE(review): the ordering logic is implemented outside this file; the variant notes
/// below are paraphrased from the variant names.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ReadingOrderMode {
    /// Plain top-to-bottom, left-to-right ordering.
    TopToBottomLeftToRight,
    /// Ordering that takes columns into account.
    ColumnAware,
    /// Ordering that consults the structure tree first.
    StructureTreeFirst {
        /// MCID sequence to follow; may be empty.
        mcid_order: Vec<u32>,
    },
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Pins every field produced by `ConversionOptions::default()`.
    #[test]
    fn test_conversion_options_default() {
        let opts = ConversionOptions::default();
        assert!(!opts.preserve_layout);
        assert!(opts.detect_headings);
        assert!(opts.extract_tables);
        assert!(!opts.include_images);
        assert_eq!(opts.image_output_dir, None);
        assert!(opts.embed_images);
        assert_eq!(
            opts.reading_order_mode,
            ReadingOrderMode::StructureTreeFirst { mcid_order: vec![] }
        );
        assert_eq!(opts.bold_marker_behavior, BoldMarkerBehavior::Conservative);
        assert!(opts.table_detection_config.is_none());
        assert!(!opts.render_formulas);
        assert!(opts.page_images.is_none());
        assert_eq!(opts.page_dimensions, None);
        assert!(opts.include_form_fields);
        assert_eq!(opts.max_image_pixels, None);
    }

    /// `embed_images` defaults to on and can be switched off together with an output dir.
    #[test]
    fn test_conversion_options_embed_images() {
        let opts = ConversionOptions::default();
        assert!(opts.embed_images);

        let opts = ConversionOptions {
            embed_images: false,
            image_output_dir: Some("images/".to_string()),
            ..Default::default()
        };
        assert!(!opts.embed_images);
        assert_eq!(opts.image_output_dir, Some("images/".to_string()));
    }

    /// Every explicitly overridden field must be observable on the resulting value.
    #[test]
    fn test_conversion_options_custom() {
        let opts = ConversionOptions {
            preserve_layout: true,
            detect_headings: false,
            extract_tables: false,
            include_images: false,
            image_output_dir: Some("output/".to_string()),
            reading_order_mode: ReadingOrderMode::ColumnAware,
            bold_marker_behavior: BoldMarkerBehavior::Aggressive,
            table_detection_config: None,
            ..Default::default()
        };
        assert!(opts.preserve_layout);
        assert!(!opts.detect_headings);
        // Previously set but never checked.
        assert!(!opts.extract_tables);
        assert!(!opts.include_images);
        assert_eq!(opts.image_output_dir, Some("output/".to_string()));
        assert_eq!(opts.reading_order_mode, ReadingOrderMode::ColumnAware);
        assert_eq!(opts.bold_marker_behavior, BoldMarkerBehavior::Aggressive);
        assert!(opts.table_detection_config.is_none());
    }

    /// Unit variants compare equal to themselves and unequal to each other.
    #[test]
    fn test_reading_order_mode_equality() {
        assert_eq!(
            ReadingOrderMode::TopToBottomLeftToRight,
            ReadingOrderMode::TopToBottomLeftToRight
        );
        assert_ne!(
            ReadingOrderMode::TopToBottomLeftToRight,
            ReadingOrderMode::ColumnAware
        );
    }

    /// A clone compares equal to its source.
    #[test]
    fn test_conversion_options_clone() {
        let opts1 = ConversionOptions::default();
        let opts2 = opts1.clone();
        assert_eq!(opts1, opts2);
    }

    /// The `Debug` output names the type.
    #[test]
    fn test_conversion_options_debug() {
        let opts = ConversionOptions::default();
        let debug_str = format!("{:?}", opts);
        assert!(debug_str.contains("ConversionOptions"));
    }

    /// `BoldMarkerBehavior::default()` is the conservative variant.
    #[test]
    fn test_bold_marker_behavior_default() {
        assert_eq!(BoldMarkerBehavior::default(), BoldMarkerBehavior::Conservative);
    }

    /// Variants compare equal to themselves and unequal to each other.
    #[test]
    fn test_bold_marker_behavior_equality() {
        assert_eq!(BoldMarkerBehavior::Conservative, BoldMarkerBehavior::Conservative);
        assert_eq!(BoldMarkerBehavior::Aggressive, BoldMarkerBehavior::Aggressive);
        assert_ne!(BoldMarkerBehavior::Conservative, BoldMarkerBehavior::Aggressive);
    }

    /// `BoldMarkerBehavior` is `Copy`: the source stays usable after assignment.
    #[test]
    fn test_bold_marker_behavior_copy_clone() {
        let behavior = BoldMarkerBehavior::Aggressive;
        let copied = behavior;
        assert_eq!(behavior, copied);
    }

    /// The default-detection builder enables extraction and leaves the config unset.
    #[test]
    fn test_with_default_table_detection() {
        let opts = ConversionOptions::default().with_default_table_detection();
        assert!(opts.extract_tables);
        assert!(opts.table_detection_config.is_none());
    }

    /// The explicit-detection builder enables extraction and stores the given config.
    #[test]
    fn test_with_table_detection() {
        let config = crate::structure::TableDetectionConfig::strict();
        let opts = ConversionOptions::default().with_table_detection(config);
        assert!(opts.extract_tables);
        assert!(opts.table_detection_config.is_some());

        let cfg = opts
            .table_detection_config
            .expect("with_table_detection must store the supplied config");
        assert_eq!(cfg.min_table_columns, 3);
        assert_eq!(cfg.column_tolerance, 2.0);
    }

    /// Out of the box, tables are extracted without an explicit detection config.
    #[test]
    fn test_conversion_options_default_table_config() {
        let opts = ConversionOptions::default();
        assert!(opts.extract_tables);
        assert!(opts.table_detection_config.is_none());
    }

    /// Images are opt-in.
    #[test]
    fn test_include_images_default_false() {
        let opts = ConversionOptions::default();
        assert!(
            !opts.include_images,
            "include_images should default to false to prevent bloated output"
        );
    }

    /// Opting in to images leaves embedding at its default (on).
    #[test]
    fn test_include_images_opt_in() {
        let opts = ConversionOptions {
            include_images: true,
            ..Default::default()
        };
        assert!(opts.include_images);
        assert!(opts.embed_images);
    }
}