pub mod config;
pub mod converters;
pub mod logging;
pub mod metrics;
pub mod ordered_span;
pub mod reading_order;
pub mod text_processing;
pub use config::{
BoldMarkerBehavior, LogLevel, OutputConfig, ReadingOrderConfig, ReadingOrderStrategyType,
SpacingConfig, TextPipelineConfig, TjThresholdConfig, WordBoundaryMode,
};
pub use converters::{
HtmlOutputConverter, MarkdownOutputConverter, OutputConverter, PlainTextConverter,
};
pub use logging::{
extract_log_debug, extract_log_error, extract_log_info, extract_log_trace, extract_log_warn,
};
pub use metrics::{BatchMetrics, ExtractionMetrics};
pub use ordered_span::{OrderedSpans, OrderedTextSpan, ReadingOrderInfo, ReadingOrderSource};
pub use reading_order::{ReadingOrderContext, ReadingOrderStrategy, XYCutStrategy};
pub use text_processing::WhitespaceNormalizer;
use crate::error::Result;
use crate::layout::TextSpan;
use reading_order::create_strategy;
/// Text pipeline that pairs a [`TextPipelineConfig`] with the reading-order
/// strategy built from it.
///
/// The strategy is stored as a boxed trait object, so the concrete
/// [`ReadingOrderStrategy`] implementation is chosen when the pipeline is
/// constructed rather than at compile time.
pub struct TextPipeline {
    /// Configuration this pipeline was constructed with.
    config: TextPipelineConfig,
    /// Strategy that [`TextPipeline::process`] delegates to.
    reading_order_strategy: Box<dyn ReadingOrderStrategy>,
}
impl TextPipeline {
    /// Creates a pipeline from the default [`TextPipelineConfig`].
    pub fn new() -> Self {
        Self::with_config(Default::default())
    }

    /// Creates a pipeline from an explicit configuration.
    ///
    /// The reading-order strategy is obtained from `create_strategy`, which is
    /// handed a borrow of `config.reading_order`; the configuration itself is
    /// then stored in the pipeline.
    pub fn with_config(config: TextPipelineConfig) -> Self {
        Self {
            // Listed first on purpose: the strategy only borrows `config`, so
            // it has to be built before `config` is moved into its own field.
            reading_order_strategy: create_strategy(&config.reading_order),
            config,
        }
    }

    /// Runs `spans` through the configured reading-order strategy.
    ///
    /// Both `spans` and `context` are taken by value; the strategy receives the
    /// spans and a shared borrow of the context.
    ///
    /// # Errors
    ///
    /// Returns whatever error the strategy's `apply` call returns, unchanged.
    pub fn process(
        &self,
        spans: Vec<TextSpan>,
        context: ReadingOrderContext,
    ) -> Result<Vec<OrderedTextSpan>> {
        let context_ref = &context;
        self.reading_order_strategy.apply(spans, context_ref)
    }

    /// Returns a shared reference to the configuration held by this pipeline.
    pub fn config(&self) -> &TextPipelineConfig {
        let Self { config, .. } = self;
        config
    }
}
impl Default for TextPipeline {
fn default() -> Self {
Self::new()
}
}