pub mod config;
pub mod converters;
pub mod logging;
pub mod metrics;
pub mod ordered_span;
pub mod page_order;
pub mod reading_order;
pub mod text_processing;
pub use config::{
BoldMarkerBehavior, LogLevel, OutputConfig, ReadingOrderConfig, ReadingOrderStrategyType,
SpacingConfig, TextPipelineConfig, TjThresholdConfig, WordBoundaryMode,
};
pub use converters::{
HtmlOutputConverter, MarkdownOutputConverter, OutputConverter, PlainTextConverter,
};
pub use logging::{
extract_log_debug, extract_log_error, extract_log_info, extract_log_trace, extract_log_warn,
};
pub use metrics::{BatchMetrics, ExtractionMetrics};
pub use ordered_span::{
OrderedSpans, OrderedTextSpan, ReadingOrderInfo, ReadingOrderSource, StructRole,
};
pub use page_order::{page_reading_order, page_reading_order_no_artifacts};
pub use reading_order::{ReadingOrderContext, ReadingOrderStrategy, XYCutStrategy};
pub use text_processing::WhitespaceNormalizer;
use crate::error::Result;
use crate::layout::TextSpan;
use reading_order::create_strategy;
pub struct TextPipeline {
config: TextPipelineConfig,
reading_order_strategy: Box<dyn ReadingOrderStrategy>,
}
impl TextPipeline {
pub fn new() -> Self {
Self::with_config(TextPipelineConfig::default())
}
pub fn with_config(config: TextPipelineConfig) -> Self {
let strategy = create_strategy(&config.reading_order);
Self {
config,
reading_order_strategy: strategy,
}
}
pub fn process(
&self,
spans: Vec<TextSpan>,
context: ReadingOrderContext,
) -> Result<Vec<OrderedTextSpan>> {
let mut ordered = self.reading_order_strategy.apply(spans, &context)?;
reorder_rtl_word_runs(&mut ordered);
Ok(ordered)
}
pub fn config(&self) -> &TextPipelineConfig {
&self.config
}
}
impl Default for TextPipeline {
fn default() -> Self {
Self::new()
}
}
fn reorder_rtl_word_runs(ordered: &mut [OrderedTextSpan]) {
use crate::text::rtl_detector::is_rtl_text;
ordered.sort_by_key(|o| o.reading_order);
let is_space = |o: &OrderedTextSpan| o.span.text.trim().is_empty();
let is_rtl_word = |o: &OrderedTextSpan| {
let mut has_rtl = false;
for c in o.span.text.chars() {
if c.is_ascii_alphabetic() {
return false; }
if is_rtl_text(c as u32) {
has_rtl = true;
}
}
has_rtl
};
let mut i = 0;
let mut changed = false;
while i < ordered.len() {
if !is_rtl_word(&ordered[i]) {
i += 1;
continue;
}
let y = ordered[i].span.bbox.y;
let start = i;
let mut end = i + 1;
while end < ordered.len()
&& (ordered[end].span.bbox.y - y).abs() < 2.0
&& (is_rtl_word(&ordered[end]) || is_space(&ordered[end]))
{
end += 1;
}
let mut last = end;
while last > start + 1 && is_space(&ordered[last - 1]) {
last -= 1;
}
if last - start >= 2 {
ordered[start..last].reverse();
changed = true;
}
i = end;
}
if changed {
for (idx, o) in ordered.iter_mut().enumerate() {
o.reading_order = idx;
}
}
}
#[cfg(test)]
mod rtl_reorder_tests {
use super::*;
use crate::geometry::Rect;
use crate::layout::TextSpan;
fn ordered(text: &str, x: f32, y: f32, order: usize) -> OrderedTextSpan {
let span = TextSpan {
text: text.to_string(),
bbox: Rect::new(x, y, 10.0, 12.0),
font_size: 12.0,
..TextSpan::default()
};
OrderedTextSpan::new(span, order)
}
#[test]
fn reorder_rtl_word_runs_reverses_word_order() {
let mut v = vec![
ordered("اﻟﻌﺮﺑﻴﺔ", 160.0, 700.0, 0),
ordered(" ", 277.0, 700.0, 1),
ordered("اﳋﻄﻮط", 288.0, 700.0, 2),
ordered(" ", 409.0, 700.0, 3),
ordered("اﻧﻮاع", 420.0, 700.0, 4),
];
reorder_rtl_word_runs(&mut v);
let texts: Vec<&str> = v.iter().map(|o| o.span.text.as_str()).collect();
assert_eq!(texts, vec!["اﻧﻮاع", " ", "اﳋﻄﻮط", " ", "اﻟﻌﺮﺑﻴﺔ"]);
assert!(v.iter().enumerate().all(|(i, o)| o.reading_order == i));
}
#[test]
fn reorder_rtl_word_runs_leaves_ltr_alone() {
let mut v = vec![
ordered("The", 10.0, 700.0, 0),
ordered(" ", 30.0, 700.0, 1),
ordered("quick", 40.0, 700.0, 2),
];
reorder_rtl_word_runs(&mut v);
let texts: Vec<&str> = v.iter().map(|o| o.span.text.as_str()).collect();
assert_eq!(texts, vec!["The", " ", "quick"]);
}
}