mod cache;
mod execution;
mod features;
mod format;
mod initialization;
#[cfg(test)]
mod tests;
pub use cache::clear_processor_cache;
pub use format::apply_output_format;
use crate::Result;
use crate::core::config::ExtractionConfig;
use crate::types::ExtractionResult;
use execution::{execute_processors, execute_validators};
use features::{execute_chunking, execute_language_detection, execute_token_reduction};
use initialization::{get_processors_from_cache, initialize_features, initialize_processor_cache};
/// Runs the asynchronous post-processing pipeline over an extraction result.
///
/// Stages, in order:
///
/// 1. Post-processors — only when `config.postprocessor` is `None` or its
///    `enabled` flag is `true`. The feature set and processor cache are
///    initialized first, then the early, middle and late processor groups
///    fetched from the cache are handed to `execute_processors`.
/// 2. The feature stages: `execute_chunking`, `execute_language_detection`
///    and `execute_token_reduction`.
/// 3. `execute_validators`.
/// 4. Output shaping: optional element / document-structure transforms,
///    optional Unicode NFC normalization, and `apply_output_format`.
///
/// Takes ownership of `result` and returns the processed value.
///
/// # Errors
///
/// Returns the first error propagated from processor-cache initialization or
/// lookup, processor execution, any feature stage, or the validators.
#[cfg_attr(feature = "otel", tracing::instrument(
    skip(result, config),
    fields(
        pipeline.stage = "post_processing",
        content.length = result.content.len(),
    )
))]
pub async fn run_pipeline(mut result: ExtractionResult, config: &ExtractionConfig) -> Result<ExtractionResult> {
    let pp_config = config.postprocessor.as_ref();
    // A missing post-processor config counts as "enabled".
    let postprocessing_enabled = pp_config.is_none_or(|c| c.enabled);
    if postprocessing_enabled {
        initialize_features();
        initialize_processor_cache()?;
        let (early_processors, middle_processors, late_processors) = get_processors_from_cache()?;
        execute_processors(
            &mut result,
            config,
            &pp_config,
            early_processors,
            middle_processors,
            late_processors,
        )
        .await?;
    }

    run_feature_stages(&mut result, config)?;
    execute_validators(&result, config).await?;
    finalize_result(&mut result, config);
    Ok(result)
}

/// Runs the synchronous variant of the post-processing pipeline.
///
/// Compiled only when the `tokio-runtime` feature is disabled. Unlike
/// `run_pipeline`, this variant does not invoke the post-processors or the
/// validators; it runs the feature stages and the output shaping only.
///
/// Takes ownership of `result` and returns the processed value.
///
/// # Errors
///
/// Returns the first error propagated from any feature stage.
#[cfg(not(feature = "tokio-runtime"))]
pub fn run_pipeline_sync(mut result: ExtractionResult, config: &ExtractionConfig) -> Result<ExtractionResult> {
    run_feature_stages(&mut result, config)?;
    finalize_result(&mut result, config);
    Ok(result)
}

/// Calls `execute_chunking`, `execute_language_detection` and
/// `execute_token_reduction` on `result`, in that order, stopping at the
/// first error.
///
/// Shared by both pipeline entry points so the stage order cannot drift
/// between them.
fn run_feature_stages(result: &mut ExtractionResult, config: &ExtractionConfig) -> Result<()> {
    execute_chunking(result, config)?;
    execute_language_detection(result, config)?;
    execute_token_reduction(result, config)?;
    Ok(())
}

/// Applies the infallible output-shaping steps that close both pipelines.
///
/// In order:
///
/// 1. Populates `result.elements` when `config.result_format` is
///    `OutputFormat::ElementBased`.
/// 2. Populates `result.document` when `config.include_document_structure`
///    is set and no document is present yet (an existing one is kept).
/// 3. With the `quality` feature, normalizes `result.content` and every
///    page's `content` to Unicode NFC.
/// 4. Calls `apply_output_format` with `config.output_format`.
fn finalize_result(result: &mut ExtractionResult, config: &ExtractionConfig) {
    if config.result_format == crate::types::OutputFormat::ElementBased {
        let elements = crate::extraction::transform::transform_extraction_result_to_elements(&*result);
        result.elements = Some(elements);
    }

    if config.include_document_structure && result.document.is_none() {
        let document = crate::extraction::transform::transform_to_document_structure(&*result);
        result.document = Some(document);
    }

    // NOTE(review): normalization runs after chunking and after the element /
    // document transforms above, so those were derived from the
    // pre-normalization text — confirm this ordering is intended.
    #[cfg(feature = "quality")]
    {
        use unicode_normalization::UnicodeNormalization;
        result.content = result.content.nfc().collect();
        if let Some(pages) = result.pages.as_mut() {
            for page in pages.iter_mut() {
                page.content = page.content.nfc().collect();
            }
        }
    }

    // Format conversion is deliberately last so it sees the final content.
    apply_output_format(result, config.output_format);
}