use serde::{Deserialize, Serialize};
use super::super::formats::OutputFormat;
use super::super::ocr::OcrConfig;
use super::super::page::PageConfig;
use super::super::processing::{ChunkingConfig, PostProcessorConfig};
use super::types::{ImageExtractionConfig, LanguageDetectionConfig, TokenReductionConfig};
/// Top-level configuration for an extraction run.
///
/// Every field carries a serde default, so a serialized configuration may
/// omit any of them. Feature-gated fields only exist when the corresponding
/// Cargo feature is enabled.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractionConfig {
/// Cache toggle. Deserializes to `true` when omitted.
#[serde(default = "default_true")]
pub use_cache: bool,
/// Quality-processing toggle. Deserializes to `true` when omitted.
#[serde(default = "default_true")]
pub enable_quality_processing: bool,
/// OCR settings; `None` when omitted.
///
/// `needs_image_processing` treats any `Some` value here as "OCR enabled".
#[serde(default)]
pub ocr: Option<OcrConfig>,
/// Force-OCR flag; `false` when omitted.
// NOTE(review): how this interacts with `ocr` is decided by consumers of
// this config and is not visible in this file.
#[serde(default)]
pub force_ocr: bool,
/// Chunking settings; `None` when omitted.
#[serde(default)]
pub chunking: Option<ChunkingConfig>,
/// Image-extraction settings; `None` when omitted.
///
/// `needs_image_processing` reads the `extract_images` flag of this value.
#[serde(default)]
pub images: Option<ImageExtractionConfig>,
/// PDF-specific settings; `None` when omitted.
///
/// Only available with the `pdf` feature.
#[cfg(feature = "pdf")]
#[serde(default)]
pub pdf_options: Option<super::super::pdf::PdfConfig>,
/// Token-reduction settings; `None` when omitted.
#[serde(default)]
pub token_reduction: Option<TokenReductionConfig>,
/// Language-detection settings; `None` when omitted.
#[serde(default)]
pub language_detection: Option<LanguageDetectionConfig>,
/// Page-related settings; `None` when omitted.
#[serde(default)]
pub pages: Option<PageConfig>,
/// Keyword settings; `None` when omitted.
///
/// Only available with the `keywords-yake` or `keywords-rake` feature.
#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
#[serde(default)]
pub keywords: Option<crate::keywords::KeywordConfig>,
/// Post-processor settings; `None` when omitted.
#[serde(default)]
pub postprocessor: Option<PostProcessorConfig>,
/// HTML conversion options, expressed as
/// `html_to_markdown_rs::ConversionOptions`; `None` when omitted.
///
/// Only available with the `html` feature.
#[cfg(feature = "html")]
#[serde(default)]
pub html_options: Option<html_to_markdown_rs::ConversionOptions>,
/// Optional limit on concurrent extractions; `None` when omitted.
// NOTE(review): presumably `None` lets the runtime pick its own limit —
// the fallback is not visible in this file; confirm against the caller.
#[serde(default)]
pub max_concurrent_extractions: Option<usize>,
/// Result format selector, typed as `crate::types::OutputFormat`.
///
/// This is a different type from the one used by `output_format`.
// NOTE(review): when omitted, serde uses the type's own `Default`, while
// `ExtractionConfig::default()` sets `Unified` explicitly — confirm the two
// agree.
#[serde(default)]
pub result_format: crate::types::OutputFormat,
/// Security limits; `None` when omitted.
///
/// Only available with the `archives` feature.
#[cfg(feature = "archives")]
#[serde(default)]
pub security_limits: Option<crate::extractors::security::SecurityLimits>,
/// Output format selector, typed as the `OutputFormat` imported from the
/// `formats` module.
// NOTE(review): when omitted, serde uses the type's own `Default`, while
// `ExtractionConfig::default()` sets `Plain` explicitly — confirm the two
// agree.
#[serde(default)]
pub output_format: OutputFormat,
/// Document-structure toggle; `false` when omitted.
#[serde(default)]
pub include_document_structure: bool,
}
impl Default for ExtractionConfig {
fn default() -> Self {
Self {
use_cache: true,
enable_quality_processing: true,
ocr: None,
force_ocr: false,
chunking: None,
images: None,
#[cfg(feature = "pdf")]
pdf_options: None,
token_reduction: None,
language_detection: None,
pages: None,
#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
keywords: None,
postprocessor: None,
#[cfg(feature = "html")]
html_options: None,
max_concurrent_extractions: None,
#[cfg(feature = "archives")]
security_limits: None,
result_format: crate::types::OutputFormat::Unified,
output_format: OutputFormat::Plain,
include_document_structure: false,
}
}
}
impl ExtractionConfig {
    /// Reports whether this configuration calls for image processing.
    ///
    /// Returns `true` when OCR settings are present (`ocr` is `Some`), or when
    /// image-extraction settings are present and their `extract_images` flag
    /// is set. Returns `false` otherwise.
    pub fn needs_image_processing(&self) -> bool {
        // Any OCR configuration is enough on its own.
        if self.ocr.is_some() {
            return true;
        }

        // Otherwise images matter only if extraction is explicitly turned on.
        matches!(&self.images, Some(image_cfg) if image_cfg.extract_images)
    }
}
/// Serde default provider that always yields `true`.
///
/// Referenced by `#[serde(default = "default_true")]` on boolean fields that
/// should be enabled when the key is missing from the input.
fn default_true() -> bool {
    true
}