mod cache;
mod execution;
mod features;
mod format;
mod initialization;
#[cfg(test)]
mod tests;
pub use cache::clear_processor_cache;
pub use format::apply_output_format;
use crate::Result;
use crate::core::config::ExtractionConfig;
use crate::types::ExtractionResult;
use crate::types::internal::InternalDocument;
use execution::{execute_processors, execute_validators};
use features::{execute_chunking, execute_language_detection, execute_token_reduction};
use initialization::{get_processors_from_cache, initialize_features, initialize_processor_cache};
#[cfg_attr(feature = "otel", tracing::instrument(
skip(doc, config),
fields(
pipeline.stage = "post_processing",
content.element_count = doc.elements.len(),
)
))]
pub async fn run_pipeline(doc: InternalDocument, config: &ExtractionConfig) -> Result<ExtractionResult> {
#[cfg(all(feature = "ocr", feature = "tokio-runtime"))]
let mut doc = doc;
#[cfg(feature = "chunking")]
let chunker_heading_source = {
let needs_markdown = config.chunking.as_ref().is_some_and(|c| {
c.chunker_type == crate::core::config::ChunkerType::Markdown
|| c.resolve_preset().chunker_type == crate::core::config::ChunkerType::Markdown
}) && config.output_format == crate::core::config::OutputFormat::Plain;
if needs_markdown {
Some(crate::rendering::render_markdown(&doc))
} else {
None
}
};
#[cfg(feature = "html")]
let styled_html_prerender: Option<String> = {
use crate::plugins::Renderer as _;
if config.output_format == crate::core::config::OutputFormat::Html {
config.html_output.as_ref().and_then(|html_cfg| {
match crate::rendering::StyledHtmlRenderer::new(html_cfg.clone()) {
Ok(renderer) => match renderer.render(&doc) {
Ok(html) => Some(html),
Err(e) => {
tracing::warn!("StyledHtmlRenderer render failed, falling back to default HTML: {e}");
None
}
},
Err(e) => {
tracing::warn!("StyledHtmlRenderer construction failed, falling back to default HTML: {e}");
None
}
}
})
} else {
None
}
};
#[cfg(all(feature = "ocr", feature = "tokio-runtime"))]
if config.ocr.is_some() && !doc.images.is_empty() {
let images_to_process = std::mem::take(&mut doc.images);
match crate::extraction::image_ocr::process_images_with_ocr(
images_to_process,
config,
&mut doc.processing_warnings,
)
.await
{
Ok(processed) => {
doc.images = processed;
}
Err(e) => {
doc.processing_warnings.push(crate::types::ProcessingWarning {
source: std::borrow::Cow::Borrowed("image_ocr"),
message: std::borrow::Cow::Owned(format!("Image OCR failed: {e}")),
});
}
}
}
let include_structure = config.include_document_structure;
let mut result =
crate::extraction::derive::derive_extraction_result(doc, include_structure, config.output_format.clone());
#[cfg(feature = "html")]
if let Some(html) = styled_html_prerender {
result.formatted_content = Some(html);
}
#[cfg(feature = "chunking")]
let chunker_only_markdown = result.formatted_content.is_none();
#[cfg(feature = "chunking")]
if chunker_only_markdown && let Some(md) = chunker_heading_source {
result.formatted_content = Some(md);
}
let pp_config = config.postprocessor.as_ref();
let postprocessing_enabled = pp_config.is_none_or(|c| c.enabled);
if postprocessing_enabled {
initialize_features();
initialize_processor_cache()?;
let (early_processors, middle_processors, late_processors) = get_processors_from_cache()?;
execute_processors(
&mut result,
config,
&pp_config,
early_processors,
middle_processors,
late_processors,
)
.await?;
}
execute_chunking(&mut result, config)?;
#[cfg(feature = "chunking")]
if chunker_only_markdown {
result.formatted_content = None;
}
execute_language_detection(&mut result, config)?;
execute_token_reduction(&mut result, config)?;
execute_validators(&result, config).await?;
apply_element_transform(&mut result, config);
normalize_nfc(&mut result);
#[cfg(feature = "liter-llm")]
if let Some(ref structured_config) = config.structured_extraction {
match crate::llm::structured::extract_structured(&result.content, structured_config).await {
Ok((output, usage)) => {
result.structured_output = Some(output);
crate::llm::usage::push_llm_usage(&mut result, usage);
}
Err(e) => {
tracing::warn!("Structured extraction failed: {e}");
result.processing_warnings.push(crate::types::ProcessingWarning {
source: std::borrow::Cow::Borrowed("structured_extraction"),
message: std::borrow::Cow::Owned(format!("Structured extraction failed: {e}")),
});
}
}
}
#[cfg(not(feature = "liter-llm"))]
if config.structured_extraction.is_some() {
result.processing_warnings.push(crate::types::ProcessingWarning {
source: std::borrow::Cow::Borrowed("structured_extraction"),
message: std::borrow::Cow::Borrowed("Structured extraction requires the 'liter-llm' feature"),
});
}
apply_output_format(&mut result, config.output_format.clone());
Ok(result)
}
#[cfg(not(feature = "tokio-runtime"))]
pub fn run_pipeline_sync(doc: InternalDocument, config: &ExtractionConfig) -> Result<ExtractionResult> {
#[cfg(feature = "chunking")]
let chunker_heading_source = {
let needs_markdown = config.chunking.as_ref().is_some_and(|c| {
c.chunker_type == crate::core::config::ChunkerType::Markdown
|| c.resolve_preset().chunker_type == crate::core::config::ChunkerType::Markdown
}) && config.output_format == crate::core::config::OutputFormat::Plain;
if needs_markdown {
Some(crate::rendering::render_markdown(&doc))
} else {
None
}
};
#[cfg(feature = "html")]
let styled_html_prerender: Option<String> = {
use crate::plugins::Renderer as _;
if config.output_format == crate::core::config::OutputFormat::Html {
config.html_output.as_ref().and_then(|html_cfg| {
match crate::rendering::StyledHtmlRenderer::new(html_cfg.clone()) {
Ok(renderer) => match renderer.render(&doc) {
Ok(html) => Some(html),
Err(e) => {
tracing::warn!("StyledHtmlRenderer render failed, falling back to default HTML: {e}");
None
}
},
Err(e) => {
tracing::warn!("StyledHtmlRenderer construction failed, falling back to default HTML: {e}");
None
}
}
})
} else {
None
}
};
let include_structure = config.include_document_structure;
let mut result =
crate::extraction::derive::derive_extraction_result(doc, include_structure, config.output_format.clone());
#[cfg(feature = "html")]
if let Some(html) = styled_html_prerender {
result.formatted_content = Some(html);
}
#[cfg(feature = "chunking")]
let chunker_only_markdown = result.formatted_content.is_none();
#[cfg(feature = "chunking")]
if chunker_only_markdown && let Some(md) = chunker_heading_source {
result.formatted_content = Some(md);
}
execute_chunking(&mut result, config)?;
#[cfg(feature = "chunking")]
if chunker_only_markdown {
result.formatted_content = None;
}
execute_language_detection(&mut result, config)?;
execute_token_reduction(&mut result, config)?;
apply_element_transform(&mut result, config);
normalize_nfc(&mut result);
apply_output_format(&mut result, config.output_format.clone());
Ok(result)
}
fn apply_element_transform(result: &mut ExtractionResult, config: &ExtractionConfig) {
if config.result_format == crate::types::OutputFormat::ElementBased {
result.elements = Some(crate::extraction::transform::transform_extraction_result_to_elements(
result,
));
}
}
fn normalize_nfc(result: &mut ExtractionResult) {
#[cfg(feature = "quality")]
{
use unicode_normalization::UnicodeNormalization;
result.content = result.content.nfc().collect();
if let Some(pages) = result.pages.as_mut() {
for page in pages.iter_mut() {
page.content = page.content.nfc().collect();
}
}
}
let _ = result;
}