/// Verbosity threshold for pipeline logging.
///
/// Variants are declared from the least verbose (`Error`) to the most
/// verbose (`Trace`). `Info` is the default level.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum LogLevel {
    /// Least verbose level.
    Error,
    /// One step more verbose than `Error`.
    Warn,
    /// Default level.
    #[default]
    Info,
    /// One step more verbose than `Info`.
    Debug,
    /// Most verbose level.
    Trace,
}
/// Category of document used to select a pipeline configuration preset.
///
/// A value is either chosen explicitly by the caller or guessed from a text
/// sample with `DocumentType::detect_from_sample`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DocumentType {
    /// Academic material.
    Academic,
    /// Business material.
    Business,
    /// Narrative prose.
    Novel,
    /// Text written in Chinese, Japanese or Korean scripts.
    Cjk,
    /// Text written in right-to-left scripts (Hebrew, Arabic).
    Rtl,
    /// Anything else; maps to `TextPipelineConfig::default()`.
    Generic,
}
impl DocumentType {
pub fn create_config(&self) -> TextPipelineConfig {
match self {
Self::Academic => Self::academic_config(),
Self::Business => Self::business_config(),
Self::Novel => Self::novel_config(),
Self::Cjk => Self::cjk_config(),
Self::Rtl => Self::rtl_config(),
Self::Generic => TextPipelineConfig::default(),
}
}
/// Preset for academic documents.
///
/// Uses an adaptive TJ threshold, structure-tree-first reading order,
/// layout preservation, table extraction, `Primary` word-boundary mode and
/// hyphenation reconstruction.
fn academic_config() -> TextPipelineConfig {
    let spacing = SpacingConfig { word_margin: 0.1 };
    let tj_threshold = TjThresholdConfig {
        space_insertion_threshold: -120.0,
        use_adaptive: true,
    };
    let reading_order = ReadingOrderConfig {
        strategy: ReadingOrderStrategyType::StructureTreeFirst,
    };
    let output = OutputConfig {
        detect_headings: false,
        include_images: false,
        bold_marker_behavior: BoldMarkerBehavior::Conservative,
        preserve_layout: true,
        extract_tables: true,
        image_output_dir: None,
        embed_images: true,
        include_form_fields: true,
    };

    TextPipelineConfig {
        spacing,
        tj_threshold,
        reading_order,
        output,
        word_boundary_mode: WordBoundaryMode::Primary,
        enable_hyphenation_reconstruction: true,
        log_level: LogLevel::Info,
        collect_metrics: false,
    }
}
/// Preset for business documents.
///
/// Matches the academic preset except that the reading order strategy is
/// `XYCut`.
fn business_config() -> TextPipelineConfig {
    let spacing = SpacingConfig { word_margin: 0.1 };
    let tj_threshold = TjThresholdConfig {
        space_insertion_threshold: -120.0,
        use_adaptive: true,
    };
    let reading_order = ReadingOrderConfig {
        strategy: ReadingOrderStrategyType::XYCut,
    };
    let output = OutputConfig {
        detect_headings: false,
        include_images: false,
        bold_marker_behavior: BoldMarkerBehavior::Conservative,
        preserve_layout: true,
        extract_tables: true,
        image_output_dir: None,
        embed_images: true,
        include_form_fields: true,
    };

    TextPipelineConfig {
        spacing,
        tj_threshold,
        reading_order,
        output,
        word_boundary_mode: WordBoundaryMode::Primary,
        enable_hyphenation_reconstruction: true,
        log_level: LogLevel::Info,
        collect_metrics: false,
    }
}
/// Preset for narrative prose.
///
/// Uses a wider word margin (0.15), a fixed (non-adaptive) TJ threshold of
/// -100.0 and the `Simple` reading order. Layout preservation and table
/// extraction are off; hyphenation reconstruction stays on.
fn novel_config() -> TextPipelineConfig {
    let spacing = SpacingConfig { word_margin: 0.15 };
    let tj_threshold = TjThresholdConfig {
        space_insertion_threshold: -100.0,
        use_adaptive: false,
    };
    let reading_order = ReadingOrderConfig {
        strategy: ReadingOrderStrategyType::Simple,
    };
    let output = OutputConfig {
        detect_headings: false,
        include_images: false,
        bold_marker_behavior: BoldMarkerBehavior::Conservative,
        preserve_layout: false,
        extract_tables: false,
        image_output_dir: None,
        embed_images: true,
        include_form_fields: true,
    };

    TextPipelineConfig {
        spacing,
        tj_threshold,
        reading_order,
        output,
        word_boundary_mode: WordBoundaryMode::Tiebreaker,
        enable_hyphenation_reconstruction: true,
        log_level: LogLevel::Info,
        collect_metrics: false,
    }
}
/// Preset for Chinese/Japanese/Korean documents.
///
/// Uses a narrow word margin (0.05), an adaptive TJ threshold starting at
/// -80.0, structure-tree-first reading order and `Primary` word-boundary
/// mode. Hyphenation reconstruction is disabled.
fn cjk_config() -> TextPipelineConfig {
    let spacing = SpacingConfig { word_margin: 0.05 };
    let tj_threshold = TjThresholdConfig {
        space_insertion_threshold: -80.0,
        use_adaptive: true,
    };
    let reading_order = ReadingOrderConfig {
        strategy: ReadingOrderStrategyType::StructureTreeFirst,
    };
    let output = OutputConfig {
        detect_headings: false,
        include_images: false,
        bold_marker_behavior: BoldMarkerBehavior::Conservative,
        preserve_layout: true,
        extract_tables: true,
        image_output_dir: None,
        embed_images: true,
        include_form_fields: true,
    };

    TextPipelineConfig {
        spacing,
        tj_threshold,
        reading_order,
        output,
        word_boundary_mode: WordBoundaryMode::Primary,
        enable_hyphenation_reconstruction: false,
        log_level: LogLevel::Info,
        collect_metrics: false,
    }
}
/// Preset for right-to-left documents.
///
/// Uses the standard word margin (0.1), an adaptive TJ threshold starting at
/// -120.0, structure-tree-first reading order and `Tiebreaker` word-boundary
/// mode. Hyphenation reconstruction is disabled.
fn rtl_config() -> TextPipelineConfig {
    let spacing = SpacingConfig { word_margin: 0.1 };
    let tj_threshold = TjThresholdConfig {
        space_insertion_threshold: -120.0,
        use_adaptive: true,
    };
    let reading_order = ReadingOrderConfig {
        strategy: ReadingOrderStrategyType::StructureTreeFirst,
    };
    let output = OutputConfig {
        detect_headings: false,
        include_images: false,
        bold_marker_behavior: BoldMarkerBehavior::Conservative,
        preserve_layout: true,
        extract_tables: true,
        image_output_dir: None,
        embed_images: true,
        include_form_fields: true,
    };

    TextPipelineConfig {
        spacing,
        tj_threshold,
        reading_order,
        output,
        word_boundary_mode: WordBoundaryMode::Tiebreaker,
        enable_hyphenation_reconstruction: false,
        log_level: LogLevel::Info,
        collect_metrics: false,
    }
}
/// Guesses the document type from a text sample.
///
/// The checks run in a fixed order and the first match wins:
///
/// 1. more than 10% of the characters are CJK -> `Cjk`
/// 2. more than 20% of the characters are Hebrew/Arabic -> `Rtl`
/// 3. the sample contains a business keyword -> `Business`
/// 4. at least 8% of the characters are special symbols -> `Academic`
/// 5. the sample looks like narrative prose -> `Novel`
///
/// Anything else, including an empty sample, yields `Generic`.
///
/// Ratios are taken over the number of characters in the sample, not its
/// UTF-8 byte length. Dividing a character count by a byte length would
/// understate the share of multi-byte scripts by a factor of two to three.
pub fn detect_from_sample(sample: &str) -> Self {
    let total_chars = sample.chars().count();
    if total_chars == 0 {
        return Self::Generic;
    }

    // Numerator and denominator are both character counts.
    let total = total_chars as f32;
    let share_of = |count: usize| count as f32 / total;

    if share_of(Self::count_cjk_chars(sample)) > 0.1 {
        return Self::Cjk;
    }
    if share_of(Self::count_rtl_chars(sample)) > 0.2 {
        return Self::Rtl;
    }
    if Self::looks_like_business(sample) {
        return Self::Business;
    }
    if share_of(Self::count_special_chars(sample)) >= 0.08 {
        return Self::Academic;
    }
    if Self::looks_like_narrative(sample) {
        return Self::Novel;
    }
    Self::Generic
}
/// Counts the characters of `text` that fall in one of the CJK code point
/// ranges recognised by the detector.
fn count_cjk_chars(text: &str) -> usize {
    /// True when `ch` lies in one of the recognised CJK ranges.
    fn is_cjk(ch: char) -> bool {
        matches!(
            ch,
            '\u{3040}'..='\u{309F}'       // Hiragana
                | '\u{30A0}'..='\u{30FF}' // Katakana
                | '\u{3400}'..='\u{4DBF}' // CJK Unified Ideographs Extension A
                | '\u{4E00}'..='\u{9FFF}' // CJK Unified Ideographs
                | '\u{AC00}'..='\u{D7AF}' // Hangul Syllables
        )
    }

    text.chars().filter(|&ch| is_cjk(ch)).count()
}
/// Counts the characters of `text` that fall in one of the right-to-left
/// code point ranges recognised by the detector.
fn count_rtl_chars(text: &str) -> usize {
    /// True when `ch` lies in one of the recognised right-to-left ranges.
    fn is_rtl(ch: char) -> bool {
        matches!(
            ch,
            '\u{0590}'..='\u{05FF}'       // Hebrew
                | '\u{0600}'..='\u{06FF}' // Arabic
                | '\u{0750}'..='\u{077F}' // Arabic Supplement
        )
    }

    text.chars().filter(|&ch| is_rtl(ch)).count()
}
/// Counts the characters of `text` that appear in a fixed table of
/// typographic, currency, mathematical and arrow symbols.
fn count_special_chars(text: &str) -> usize {
    // Lookup table of every symbol the detector treats as "special".
    const SPECIAL_SYMBOLS: [char; 24] = [
        '©', '®', '™', '§', '¶', '†', '‡', '€', '£', '¥', '¢', '±', '×', '÷', '√', '∞', '∫', '←',
        '→', '↑', '↓', '°', '′', '″',
    ];

    text.chars()
        .filter(|ch| SPECIAL_SYMBOLS.contains(ch))
        .count()
}
/// Heuristic check for narrative prose.
///
/// Returns `true` only when all of the following hold:
/// - lowercase characters outnumber uppercase characters by more than 5:1,
/// - the number of ASCII digits is below `text.len() / 20`,
/// - the text contains one of a few narrative cue words or more than two
///   periods.
fn looks_like_narrative(text: &str) -> bool {
    const NARRATIVE_CUES: [&str; 5] = ["was ", "were ", "walked ", "said ", "went "];

    // Gather all character statistics in a single pass.
    let mut lowercase = 0usize;
    let mut uppercase = 0usize;
    let mut digits = 0usize;
    let mut periods = 0usize;
    for ch in text.chars() {
        if ch.is_lowercase() {
            lowercase += 1;
        }
        if ch.is_uppercase() {
            uppercase += 1;
        }
        if ch.is_ascii_digit() {
            digits += 1;
        }
        if ch == '.' {
            periods += 1;
        }
    }

    let mostly_lowercase = lowercase > uppercase * 5;
    let few_digits = digits < text.len() / 20;
    let has_cue_word = NARRATIVE_CUES.iter().any(|&cue| text.contains(cue));

    mostly_lowercase && few_digits && (has_cue_word || periods > 2)
}
/// Heuristic check for business documents: returns `true` when `text`
/// contains any of a small set of keywords. Matching is case-sensitive.
fn looks_like_business(text: &str) -> bool {
    const BUSINESS_KEYWORDS: [&str; 5] = ["Table", "Figure", "report", "document", "agreement"];

    BUSINESS_KEYWORDS
        .iter()
        .any(|&keyword| text.contains(keyword))
}
}
impl LogLevel {
pub fn should_log(&self, level: LogLevel) -> bool {
match (*self, level) {
(Self::Error, Self::Error) => true,
(Self::Warn, Self::Error | Self::Warn) => true,
(Self::Info, Self::Error | Self::Warn | Self::Info) => true,
(Self::Debug, Self::Error | Self::Warn | Self::Info | Self::Debug) => true,
(Self::Trace, _) => true,
_ => false,
}
}
}
/// Selects how word-boundary detection participates in the pipeline.
///
/// NOTE(review): the exact semantics of each mode live in the code that
/// consumes this value — confirm there before relying on the names.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum WordBoundaryMode {
    /// Default mode.
    #[default]
    Tiebreaker,
    /// Alternative mode; used by the academic, business and CJK presets.
    Primary,
}
/// Top-level configuration of the text extraction pipeline.
///
/// Groups the per-stage settings (spacing, TJ threshold, reading order,
/// output) together with a few pipeline-wide switches.
#[derive(Debug, Clone)]
pub struct TextPipelineConfig {
    /// Word spacing settings.
    pub spacing: SpacingConfig,
    /// TJ threshold settings.
    pub tj_threshold: TjThresholdConfig,
    /// Reading order settings.
    pub reading_order: ReadingOrderConfig,
    /// Output settings.
    pub output: OutputConfig,
    /// Word-boundary mode.
    pub word_boundary_mode: WordBoundaryMode,
    /// Whether hyphenation reconstruction is enabled.
    pub enable_hyphenation_reconstruction: bool,
    /// Logging verbosity threshold.
    pub log_level: LogLevel,
    /// Whether metrics are collected.
    pub collect_metrics: bool,
}

impl Default for TextPipelineConfig {
    /// Combines every sub-config's own default with hyphenation
    /// reconstruction enabled and metrics collection disabled.
    fn default() -> Self {
        let spacing = SpacingConfig::default();
        let tj_threshold = TjThresholdConfig::default();
        let reading_order = ReadingOrderConfig::default();
        let output = OutputConfig::default();
        let word_boundary_mode = WordBoundaryMode::default();
        let log_level = LogLevel::default();

        TextPipelineConfig {
            spacing,
            tj_threshold,
            reading_order,
            output,
            word_boundary_mode,
            enable_hyphenation_reconstruction: true,
            log_level,
            collect_metrics: false,
        }
    }
}
impl TextPipelineConfig {
pub fn for_document_type(doc_type: DocumentType) -> Self {
doc_type.create_config()
}
pub fn detect_and_optimize(sample: &str) -> Self {
let doc_type = DocumentType::detect_from_sample(sample);
doc_type.create_config()
}
pub fn pdfplumber_compatible() -> Self {
Self {
spacing: SpacingConfig { word_margin: 0.1 },
tj_threshold: TjThresholdConfig {
space_insertion_threshold: -120.0,
use_adaptive: false,
},
reading_order: ReadingOrderConfig {
strategy: ReadingOrderStrategyType::Simple,
},
output: OutputConfig::default(),
word_boundary_mode: WordBoundaryMode::Tiebreaker,
enable_hyphenation_reconstruction: true,
log_level: LogLevel::default(),
collect_metrics: false,
}
}
pub fn from_conversion_options(opts: &crate::converters::ConversionOptions) -> Self {
use crate::converters::BoldMarkerBehavior as OldBMB;
use crate::converters::ReadingOrderMode;
let strategy = match &opts.reading_order_mode {
ReadingOrderMode::TopToBottomLeftToRight => ReadingOrderStrategyType::Simple,
ReadingOrderMode::ColumnAware => ReadingOrderStrategyType::XYCut,
ReadingOrderMode::StructureTreeFirst { .. } => {
ReadingOrderStrategyType::StructureTreeFirst
},
};
let bold_marker_behavior = match opts.bold_marker_behavior {
OldBMB::Aggressive => BoldMarkerBehavior::Aggressive,
OldBMB::Conservative => BoldMarkerBehavior::Conservative,
};
Self {
spacing: SpacingConfig::default(),
tj_threshold: TjThresholdConfig::default(),
reading_order: ReadingOrderConfig { strategy },
output: OutputConfig {
detect_headings: opts.detect_headings,
include_images: opts.include_images,
bold_marker_behavior,
preserve_layout: opts.preserve_layout,
extract_tables: opts.extract_tables,
image_output_dir: opts.image_output_dir.clone(),
embed_images: opts.embed_images,
include_form_fields: opts.include_form_fields,
},
word_boundary_mode: WordBoundaryMode::Tiebreaker, enable_hyphenation_reconstruction: true,
log_level: LogLevel::default(),
collect_metrics: false,
}
}
pub fn with_word_boundary_mode(mut self, mode: WordBoundaryMode) -> Self {
self.word_boundary_mode = mode;
self
}
pub fn with_hyphenation_reconstruction(mut self, enabled: bool) -> Self {
self.enable_hyphenation_reconstruction = enabled;
self
}
pub fn with_log_level(mut self, level: LogLevel) -> Self {
self.log_level = level;
self
}
pub fn with_metrics_collection(mut self, enabled: bool) -> Self {
self.collect_metrics = enabled;
self
}
}
// Unit tests for this module: LogLevel filtering, the per-document-type
// presets, sample-based document type detection and its helper functions,
// and the TextPipelineConfig builder methods.
#[cfg(test)]
mod tests {
use super::*;
// --- LogLevel defaults, filtering and the log-level builder ---
#[test]
fn test_log_level_default() {
assert_eq!(LogLevel::default(), LogLevel::Info);
}
#[test]
fn test_log_level_should_log() {
// An Info threshold admits Error/Warn/Info and rejects Debug/Trace.
let info_level = LogLevel::Info;
assert!(info_level.should_log(LogLevel::Error));
assert!(info_level.should_log(LogLevel::Warn));
assert!(info_level.should_log(LogLevel::Info));
assert!(!info_level.should_log(LogLevel::Debug));
assert!(!info_level.should_log(LogLevel::Trace));
// A Debug threshold rejects only Trace.
let debug_level = LogLevel::Debug;
assert!(debug_level.should_log(LogLevel::Error));
assert!(debug_level.should_log(LogLevel::Warn));
assert!(debug_level.should_log(LogLevel::Info));
assert!(debug_level.should_log(LogLevel::Debug));
assert!(!debug_level.should_log(LogLevel::Trace));
// A Trace threshold admits every level.
let trace_level = LogLevel::Trace;
assert!(trace_level.should_log(LogLevel::Error));
assert!(trace_level.should_log(LogLevel::Warn));
assert!(trace_level.should_log(LogLevel::Info));
assert!(trace_level.should_log(LogLevel::Debug));
assert!(trace_level.should_log(LogLevel::Trace));
}
#[test]
fn test_config_log_level_default() {
let config = TextPipelineConfig::default();
assert_eq!(config.log_level, LogLevel::Info);
}
#[test]
fn test_config_with_log_level() {
let config = TextPipelineConfig::default().with_log_level(LogLevel::Debug);
assert_eq!(config.log_level, LogLevel::Debug);
}
#[test]
fn test_config_with_log_level_trace() {
let config = TextPipelineConfig::default().with_log_level(LogLevel::Trace);
assert_eq!(config.log_level, LogLevel::Trace);
}
#[test]
fn test_config_with_log_level_error() {
let config = TextPipelineConfig::default().with_log_level(LogLevel::Error);
assert_eq!(config.log_level, LogLevel::Error);
}
#[test]
fn test_pdfplumber_compatible_has_log_level() {
let config = TextPipelineConfig::pdfplumber_compatible();
assert_eq!(config.log_level, LogLevel::Info);
}
// --- Per-document-type presets ---
#[test]
fn test_document_type_academic_config() {
let config = DocumentType::Academic.create_config();
assert!(config.enable_hyphenation_reconstruction);
assert_eq!(config.log_level, LogLevel::Info);
assert!(config.output.preserve_layout);
assert!(config.output.extract_tables);
assert!(config.tj_threshold.use_adaptive);
}
#[test]
fn test_document_type_business_config() {
let config = DocumentType::Business.create_config();
assert!(config.enable_hyphenation_reconstruction);
assert_eq!(config.log_level, LogLevel::Info);
assert!(config.output.preserve_layout);
assert!(config.output.extract_tables);
assert_eq!(config.reading_order.strategy, ReadingOrderStrategyType::XYCut);
}
#[test]
fn test_document_type_novel_config() {
let config = DocumentType::Novel.create_config();
assert!(config.enable_hyphenation_reconstruction);
assert!(!config.output.preserve_layout);
assert!(!config.output.extract_tables);
assert_eq!(config.reading_order.strategy, ReadingOrderStrategyType::Simple);
assert!(!config.tj_threshold.use_adaptive);
}
#[test]
fn test_document_type_cjk_config() {
let config = DocumentType::Cjk.create_config();
assert!(!config.enable_hyphenation_reconstruction);
assert_eq!(config.log_level, LogLevel::Info);
assert!(config.output.preserve_layout);
assert!(config.output.extract_tables);
assert!(config.tj_threshold.use_adaptive);
assert_eq!(config.word_boundary_mode, WordBoundaryMode::Primary);
}
#[test]
fn test_document_type_rtl_config() {
let config = DocumentType::Rtl.create_config();
assert!(!config.enable_hyphenation_reconstruction);
assert!(config.output.preserve_layout);
assert!(config.output.extract_tables);
assert_eq!(config.word_boundary_mode, WordBoundaryMode::Tiebreaker);
}
#[test]
fn test_document_type_generic_config() {
// Generic maps to TextPipelineConfig::default().
let config = DocumentType::Generic.create_config();
assert_eq!(config.log_level, LogLevel::default());
assert_eq!(config.word_boundary_mode, WordBoundaryMode::default());
assert!(config.enable_hyphenation_reconstruction);
}
// --- Sample-based document type detection ---
#[test]
fn test_detect_empty_sample() {
let doc_type = DocumentType::detect_from_sample("");
assert_eq!(doc_type, DocumentType::Generic);
}
#[test]
fn test_detect_cjk_sample() {
let sample = "これは日本語です。This is bilingual text.";
let doc_type = DocumentType::detect_from_sample(sample);
assert_eq!(doc_type, DocumentType::Cjk);
}
#[test]
fn test_detect_cjk_chinese() {
let sample = "这是中文文本。";
let doc_type = DocumentType::detect_from_sample(sample);
assert_eq!(doc_type, DocumentType::Cjk);
}
#[test]
fn test_detect_cjk_korean() {
let sample = "이것은 한국어 텍스트입니다.";
let doc_type = DocumentType::detect_from_sample(sample);
assert_eq!(doc_type, DocumentType::Cjk);
}
#[test]
fn test_detect_rtl_sample() {
let sample = "مرحبا بك في النص العربي";
let doc_type = DocumentType::detect_from_sample(sample);
assert_eq!(doc_type, DocumentType::Rtl);
}
#[test]
fn test_detect_rtl_hebrew() {
let sample = "זה טקסט בעברית";
let doc_type = DocumentType::detect_from_sample(sample);
assert_eq!(doc_type, DocumentType::Rtl);
}
#[test]
fn test_detect_academic_sample() {
// Note: '∑' is not in count_special_chars' symbol list, so it does not
// contribute to the special-character ratio.
let sample =
"The ∫∞√∑ equations © research shows ± evidence × mathematical ÷ concepts ® article";
let doc_type = DocumentType::detect_from_sample(sample);
assert_eq!(doc_type, DocumentType::Academic);
}
#[test]
fn test_detect_academic_with_symbols() {
let sample = "Consider the integral ∫ from a to b and the summation ∑ with limit n → ∞ © 2024 ® ± × ÷ √ Author";
let doc_type = DocumentType::detect_from_sample(sample);
assert_eq!(doc_type, DocumentType::Academic);
}
#[test]
fn test_detect_novel_sample() {
let sample = "The quick brown fox jumps over the lazy dog. She walked through the forest, listening to the birds singing their morning songs.";
let doc_type = DocumentType::detect_from_sample(sample);
assert_eq!(doc_type, DocumentType::Novel);
}
#[test]
fn test_detect_novel_narrative() {
let sample = "Once upon a time, there was a kingdom far away. The princess walked through the castle gardens every morning, admiring the flowers and trees.";
let doc_type = DocumentType::detect_from_sample(sample);
assert_eq!(doc_type, DocumentType::Novel);
}
#[test]
fn test_detect_business_sample() {
let sample = "Table 1 shows the results. Figure 2 displays the report findings. The document contains the agreement terms with key provisions.";
let doc_type = DocumentType::detect_from_sample(sample);
assert_eq!(doc_type, DocumentType::Business);
}
#[test]
fn test_detect_generic_mixed_text() {
// Many uppercase letters and digits: no detection rule matches.
let sample = "ABC DEF GHI JKL MNO PQR STU VWX YZ are letters. Numbers like 1234567890 appear here too.";
let doc_type = DocumentType::detect_from_sample(sample);
assert_eq!(doc_type, DocumentType::Generic);
}
// --- TextPipelineConfig constructors driven by document type ---
#[test]
fn test_for_document_type_builder() {
let config = TextPipelineConfig::for_document_type(DocumentType::Business);
assert!(config.enable_hyphenation_reconstruction);
assert!(config.output.extract_tables);
}
#[test]
fn test_detect_and_optimize() {
// A Japanese sample is detected as CJK, whose preset disables hyphenation reconstruction.
let sample = "これは日本語です。";
let config = TextPipelineConfig::detect_and_optimize(sample);
assert_eq!(config.log_level, LogLevel::Info);
assert!(!config.enable_hyphenation_reconstruction); }
#[test]
fn test_for_document_type_academic() {
let config = TextPipelineConfig::for_document_type(DocumentType::Academic);
assert!(config.tj_threshold.use_adaptive);
assert_eq!(config.word_boundary_mode, WordBoundaryMode::Primary);
}
#[test]
fn test_for_document_type_cjk_spacing() {
let config = TextPipelineConfig::for_document_type(DocumentType::Cjk);
assert!(config.spacing.word_margin < 0.1);
}
#[test]
fn test_for_document_type_novel_spacing() {
let config = TextPipelineConfig::for_document_type(DocumentType::Novel);
assert!(config.spacing.word_margin > 0.1);
}
#[test]
fn test_detect_sample_with_high_cjk_ratio() {
let sample = "これはひらがなですあ。カタカナです。テスト。";
let doc_type = DocumentType::detect_from_sample(sample);
assert_eq!(doc_type, DocumentType::Cjk);
}
#[test]
fn test_detect_sample_with_low_cjk_ratio() {
// Only three CJK characters in an otherwise English sentence: below the 0.1 threshold.
let sample = "This is mostly English text with some 日本語 mixed in.";
let doc_type = DocumentType::detect_from_sample(sample);
assert_ne!(doc_type, DocumentType::Cjk);
}
// --- Detection helper functions ---
#[test]
fn test_count_cjk_chars() {
let text = "これは日本語です";
let count = DocumentType::count_cjk_chars(text);
assert!(count > 0);
}
#[test]
fn test_count_rtl_chars() {
let text = "مرحبا بك";
let count = DocumentType::count_rtl_chars(text);
assert!(count > 0);
}
#[test]
fn test_count_special_chars() {
let text = "Equation: ∫√∞ with © symbol";
let count = DocumentType::count_special_chars(text);
assert!(count > 0);
}
#[test]
fn test_looks_like_narrative() {
let text = "she was running through the forest. she walked past the trees. they said hello to her.";
assert!(DocumentType::looks_like_narrative(text));
}
#[test]
fn test_looks_not_like_narrative_high_digits() {
let text = "1234567890 ABC DEF GHIJ 1234567890 KLMN";
assert!(!DocumentType::looks_like_narrative(text));
}
#[test]
fn test_looks_like_business() {
let text = "This Table shows the Figure in our report and document with agreement details";
assert!(DocumentType::looks_like_business(text));
}
#[test]
fn test_looks_not_like_business() {
let text = "This is a simple story about a dog and a cat in the forest";
assert!(!DocumentType::looks_like_business(text));
}
// --- Metrics collection flag ---
#[test]
fn test_collect_metrics_default_disabled() {
let config = TextPipelineConfig::default();
assert!(!config.collect_metrics);
}
#[test]
fn test_collect_metrics_enabled() {
let config = TextPipelineConfig::default().with_metrics_collection(true);
assert!(config.collect_metrics);
}
#[test]
fn test_collect_metrics_disabled_explicitly() {
let config = TextPipelineConfig::default().with_metrics_collection(false);
assert!(!config.collect_metrics);
}
#[test]
fn test_collect_metrics_builder_chain() {
// Builder methods can be chained; each one changes only its own field.
let config = TextPipelineConfig::default()
.with_log_level(LogLevel::Debug)
.with_metrics_collection(true);
assert!(config.collect_metrics);
assert_eq!(config.log_level, LogLevel::Debug);
}
}
/// Word spacing settings.
#[derive(Debug, Clone, Copy)]
pub struct SpacingConfig {
    /// Word margin; defaults to 0.1.
    /// NOTE(review): unit and exact meaning are defined by the consumer — confirm there.
    pub word_margin: f32,
}

impl Default for SpacingConfig {
    /// Returns a configuration with a word margin of 0.1.
    fn default() -> Self {
        let word_margin = 0.1;
        SpacingConfig { word_margin }
    }
}
/// TJ threshold settings.
#[derive(Debug, Clone)]
pub struct TjThresholdConfig {
    /// Threshold for space insertion; defaults to -120.0.
    /// NOTE(review): unit and sign convention are defined by the consumer — confirm there.
    pub space_insertion_threshold: f32,
    /// Whether the adaptive threshold is used; off by default.
    pub use_adaptive: bool,
}

impl Default for TjThresholdConfig {
    /// Returns a fixed (non-adaptive) threshold of -120.0.
    fn default() -> Self {
        let space_insertion_threshold = -120.0;
        let use_adaptive = false;
        TjThresholdConfig {
            space_insertion_threshold,
            use_adaptive,
        }
    }
}
/// Reading order settings.
#[derive(Debug, Clone)]
pub struct ReadingOrderConfig {
    /// Strategy used to order content; defaults to `StructureTreeFirst`.
    pub strategy: ReadingOrderStrategyType,
}

impl Default for ReadingOrderConfig {
    /// Returns a configuration using the `StructureTreeFirst` strategy.
    fn default() -> Self {
        let strategy = ReadingOrderStrategyType::StructureTreeFirst;
        ReadingOrderConfig { strategy }
    }
}
/// Available reading order strategies.
///
/// NOTE(review): the algorithm behind each variant is implemented outside
/// this block; the names are the only description available here.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ReadingOrderStrategyType {
    /// Default strategy of `ReadingOrderConfig`.
    StructureTreeFirst,
    /// Geometric strategy.
    Geometric,
    /// XY-cut strategy; used by the business preset.
    XYCut,
    /// Simple strategy; used by the novel and pdfplumber-compatible presets.
    Simple,
}
/// Output settings.
#[derive(Debug, Clone)]
pub struct OutputConfig {
    /// Whether headings are detected; off by default.
    pub detect_headings: bool,
    /// Whether images are included; off by default.
    pub include_images: bool,
    /// Bold marker behavior; `Conservative` by default.
    pub bold_marker_behavior: BoldMarkerBehavior,
    /// Whether layout is preserved; off by default.
    pub preserve_layout: bool,
    /// Whether tables are extracted; off by default.
    pub extract_tables: bool,
    /// Optional image output directory; `None` by default.
    pub image_output_dir: Option<String>,
    /// Whether images are embedded; on by default.
    pub embed_images: bool,
    /// Whether form fields are included; on by default.
    pub include_form_fields: bool,
}

impl Default for OutputConfig {
    /// Returns the default output settings: image embedding and form fields
    /// on, every other switch off, conservative bold markers and no image
    /// output directory.
    fn default() -> Self {
        OutputConfig {
            // Switches that are off by default.
            detect_headings: false,
            include_images: false,
            preserve_layout: false,
            extract_tables: false,
            // Switches that are on by default.
            embed_images: true,
            include_form_fields: true,
            // Non-boolean settings.
            bold_marker_behavior: BoldMarkerBehavior::Conservative,
            image_output_dir: None,
        }
    }
}
/// Policy for emitting bold markers in the output.
///
/// `Default` is derived with `#[default]`, matching how `LogLevel` and
/// `WordBoundaryMode` declare their defaults in this file.
///
/// NOTE(review): the precise difference between the two policies is defined
/// by the code that consumes this value — confirm there.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum BoldMarkerBehavior {
    /// Default policy.
    #[default]
    Conservative,
    /// Alternative policy.
    Aggressive,
}