use serde::{Deserialize, Serialize};
use super::super::acceleration::AccelerationConfig;
use super::super::formats::OutputFormat;
use super::super::ocr::OcrConfig;
use super::super::page::PageConfig;
use super::super::processing::{ChunkingConfig, PostProcessorConfig};
use super::file_config::FileExtractionConfig;
use super::types::{ImageExtractionConfig, LanguageDetectionConfig, TokenReductionConfig};
/// Top-level configuration for an extraction run.
///
/// The struct is serde (de)serializable and every field carries a
/// `#[serde(default ...)]` attribute, so a partial document can be
/// deserialized: omitted fields fall back to their serde default.
///
/// Several fields only exist when the matching Cargo feature is enabled
/// (`pdf`, `keywords-yake` / `keywords-rake`, `html`, `archives`,
/// `layout-detection`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractionConfig {
/// Cache toggle. Deserializes to `true` when omitted.
/// NOTE(review): presumably controls the extraction result cache — confirm against the consumer.
#[serde(default = "default_true")]
pub use_cache: bool,
/// Quality-processing toggle. Deserializes to `true` when omitted.
#[serde(default = "default_true")]
pub enable_quality_processing: bool,
/// Optional OCR settings; `None` when omitted.
/// `needs_image_processing` treats a present value as "OCR enabled".
#[serde(default)]
pub ocr: Option<OcrConfig>,
/// Force-OCR flag; `false` when omitted.
/// `needs_image_processing` also treats this flag as "OCR enabled".
#[serde(default)]
pub force_ocr: bool,
/// Optional chunking settings; `None` when omitted.
#[serde(default)]
pub chunking: Option<ChunkingConfig>,
/// Optional image-extraction settings; `None` when omitted.
/// Its `extract_images` flag is consulted by `needs_image_processing`.
#[serde(default)]
pub images: Option<ImageExtractionConfig>,
/// Optional PDF-specific settings. Only present with the `pdf` feature.
#[cfg(feature = "pdf")]
#[serde(default)]
pub pdf_options: Option<super::super::pdf::PdfConfig>,
/// Optional token-reduction settings; `None` when omitted.
#[serde(default)]
pub token_reduction: Option<TokenReductionConfig>,
/// Optional language-detection settings; `None` when omitted.
#[serde(default)]
pub language_detection: Option<LanguageDetectionConfig>,
/// Optional page-level settings; `None` when omitted.
#[serde(default)]
pub pages: Option<PageConfig>,
/// Optional keyword-extraction settings. Only present when at least one of
/// the `keywords-yake` / `keywords-rake` features is enabled.
#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
#[serde(default)]
pub keywords: Option<crate::keywords::KeywordConfig>,
/// Optional post-processor settings; `None` when omitted.
#[serde(default)]
pub postprocessor: Option<PostProcessorConfig>,
/// Optional HTML conversion options (type supplied by `html_to_markdown_rs`).
/// Only present with the `html` feature.
#[cfg(feature = "html")]
#[serde(default)]
pub html_options: Option<html_to_markdown_rs::ConversionOptions>,
/// Optional concurrency cap; `None` when omitted.
/// NOTE(review): how `None` is interpreted (e.g. a computed default) is decided by the consumer — confirm.
#[serde(default)]
pub max_concurrent_extractions: Option<usize>,
/// Result format, typed as `crate::types::OutputFormat`.
/// This is a different type from the one used by `output_format` below.
/// When omitted, serde uses that type's own `Default`.
#[serde(default)]
pub result_format: crate::types::OutputFormat,
/// Optional security limits. Only present with the `archives` feature.
#[cfg(feature = "archives")]
#[serde(default)]
pub security_limits: Option<crate::extractors::security::SecurityLimits>,
/// Output format, typed as the `OutputFormat` imported from the `formats` module.
/// When omitted, serde uses that type's own `Default`.
#[serde(default)]
pub output_format: OutputFormat,
/// Optional layout-detection settings. Only present with the
/// `layout-detection` feature; a present value makes
/// `needs_image_processing` return `true`.
#[cfg(feature = "layout-detection")]
#[serde(default)]
pub layout: Option<super::super::layout::LayoutDetectionConfig>,
/// Document-structure flag; `false` when omitted.
#[serde(default)]
pub include_document_structure: bool,
/// Optional acceleration settings; `None` when omitted.
#[serde(default)]
pub acceleration: Option<AccelerationConfig>,
/// Optional email-specific settings; `None` when omitted.
#[serde(default)]
pub email: Option<super::super::email::EmailConfig>,
/// Optional concurrency settings; `None` when omitted.
#[serde(default)]
pub concurrency: Option<super::super::concurrency::ConcurrencyConfig>,
}
impl Default for ExtractionConfig {
fn default() -> Self {
Self {
use_cache: true,
enable_quality_processing: true,
ocr: None,
force_ocr: false,
chunking: None,
images: None,
#[cfg(feature = "pdf")]
pdf_options: None,
token_reduction: None,
language_detection: None,
pages: None,
#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
keywords: None,
postprocessor: None,
#[cfg(feature = "html")]
html_options: None,
max_concurrent_extractions: None,
#[cfg(feature = "archives")]
security_limits: None,
#[cfg(feature = "layout-detection")]
layout: None,
result_format: crate::types::OutputFormat::Unified,
output_format: OutputFormat::Plain,
include_document_structure: false,
acceleration: None,
email: None,
concurrency: None,
}
}
}
impl ExtractionConfig {
pub fn with_file_overrides(&self, overrides: &FileExtractionConfig) -> Self {
let FileExtractionConfig {
ref enable_quality_processing,
ref ocr,
ref force_ocr,
ref chunking,
ref images,
#[cfg(feature = "pdf")]
ref pdf_options,
ref token_reduction,
ref language_detection,
ref pages,
#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
ref keywords,
ref postprocessor,
#[cfg(feature = "html")]
ref html_options,
ref result_format,
ref output_format,
ref include_document_structure,
#[cfg(feature = "layout-detection")]
ref layout,
} = *overrides;
let mut config = self.clone();
if let Some(v) = enable_quality_processing {
config.enable_quality_processing = *v;
}
if let Some(v) = ocr {
config.ocr = Some(v.clone());
}
if let Some(v) = force_ocr {
config.force_ocr = *v;
}
if let Some(v) = chunking {
config.chunking = Some(v.clone());
}
if let Some(v) = images {
config.images = Some(v.clone());
}
#[cfg(feature = "pdf")]
if let Some(v) = pdf_options {
config.pdf_options = Some(v.clone());
}
if let Some(v) = token_reduction {
config.token_reduction = Some(v.clone());
}
if let Some(v) = language_detection {
config.language_detection = Some(v.clone());
}
if let Some(v) = pages {
config.pages = Some(v.clone());
}
#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
if let Some(v) = keywords {
config.keywords = Some(v.clone());
}
if let Some(v) = postprocessor {
config.postprocessor = Some(v.clone());
}
#[cfg(feature = "html")]
if let Some(v) = html_options {
config.html_options = Some(v.clone());
}
if let Some(v) = result_format {
config.result_format = *v;
}
if let Some(v) = output_format {
config.output_format = *v;
}
if let Some(v) = include_document_structure {
config.include_document_structure = *v;
}
#[cfg(feature = "layout-detection")]
if let Some(v) = layout {
config.layout = Some(v.clone());
}
config
}
pub fn needs_image_processing(&self) -> bool {
let ocr_enabled = self.ocr.is_some() || self.force_ocr;
let image_extraction_enabled = self.images.as_ref().map(|i| i.extract_images).unwrap_or(false);
#[cfg(feature = "layout-detection")]
let layout_enabled = self.layout.is_some();
#[cfg(not(feature = "layout-detection"))]
let layout_enabled = false;
ocr_enabled || image_extraction_enabled || layout_enabled
}
}
/// Serde default provider that always yields `true`.
///
/// Referenced by name from the `#[serde(default = "default_true")]`
/// attributes on `use_cache` and `enable_quality_processing`, because a bare
/// `#[serde(default)]` on a `bool` would produce `false`.
fn default_true() -> bool {
true
}