use strum_macros::{Display, EnumString};
/// OCR strategy that can be stored in a `PdfParserConfig` via
/// `PdfParserConfig::set_ocr_strategy`.
///
/// `Display` and `EnumString` are derived through `strum_macros`, so values can be
/// formatted to and parsed from strings (presumably using the variant names verbatim;
/// confirm against the strum documentation).
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash, Display, EnumString)]
// NOTE(review): the SCREAMING_SNAKE_CASE variant names presumably mirror identifiers that the
// consumer of this config expects as strings; confirm before renaming. The naming lint is
// silenced so those names can be kept as they are.
#[allow(non_camel_case_types)]
pub enum PdfOcrStrategy {
/// Presumably: never run OCR. TODO confirm against the code that consumes this enum.
NO_OCR,
/// Presumably: run OCR only, without regular text extraction. TODO confirm.
OCR_ONLY,
/// Presumably: run both OCR and regular text extraction. TODO confirm.
OCR_AND_TEXT_EXTRACTION,
/// The `Default` variant; `PdfParserConfig::default()` also selects it explicitly.
#[default]
AUTO,
}
/// Options for PDF parsing.
///
/// Build one with `PdfParserConfig::new()` (or `Default::default()`) and chain the
/// consuming `set_*` methods. The fields are only stored here; what each option does is
/// decided by the code that reads this struct, which is not part of this file section.
#[derive(Debug, Clone, PartialEq)]
pub struct PdfParserConfig {
/// Selected OCR strategy; defaults to `PdfOcrStrategy::AUTO`.
pub(crate) ocr_strategy: PdfOcrStrategy,
/// Defaults to `false`. Presumably toggles extraction of inline images (verify with the consumer).
pub(crate) extract_inline_images: bool,
/// Defaults to `false`. Presumably restricts inline image extraction to unique images (verify).
pub(crate) extract_unique_inline_images_only: bool,
/// Defaults to `false`. Presumably toggles extraction of marked content (verify).
pub(crate) extract_marked_content: bool,
/// Defaults to `true`. Presumably toggles extraction of annotation text (verify).
pub(crate) extract_annotation_text: bool,
}
impl Default for PdfParserConfig {
    /// Builds the baseline PDF configuration: `AUTO` OCR strategy, annotation text
    /// enabled, and every other extraction flag disabled.
    fn default() -> Self {
        let ocr_strategy = PdfOcrStrategy::AUTO;
        // The only boolean option that starts out enabled.
        let extract_annotation_text = true;

        Self {
            ocr_strategy,
            extract_annotation_text,
            // Everything below is opt-in.
            extract_marked_content: false,
            extract_unique_inline_images_only: false,
            extract_inline_images: false,
        }
    }
}
impl PdfParserConfig {
    /// Creates a configuration populated with the [`Default`] values.
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Stores `val` as the OCR strategy and returns the updated configuration.
    ///
    /// Like every `set_*` method here, this consumes `self`; the returned value is the
    /// only copy carrying the change, which is why dropping it triggers a `must_use` warning.
    #[must_use]
    pub fn set_ocr_strategy(mut self, val: PdfOcrStrategy) -> Self {
        self.ocr_strategy = val;
        self
    }

    /// Stores `val` in `extract_inline_images` and returns the updated configuration.
    #[must_use]
    pub fn set_extract_inline_images(mut self, val: bool) -> Self {
        self.extract_inline_images = val;
        self
    }

    /// Stores `val` in `extract_unique_inline_images_only` and returns the updated
    /// configuration.
    #[must_use]
    pub fn set_extract_unique_inline_images_only(mut self, val: bool) -> Self {
        self.extract_unique_inline_images_only = val;
        self
    }

    /// Stores `val` in `extract_marked_content` and returns the updated configuration.
    #[must_use]
    pub fn set_extract_marked_content(mut self, val: bool) -> Self {
        self.extract_marked_content = val;
        self
    }

    /// Stores `val` in `extract_annotation_text` and returns the updated configuration.
    #[must_use]
    pub fn set_extract_annotation_text(mut self, val: bool) -> Self {
        self.extract_annotation_text = val;
        self
    }
}
/// Options for parsing office documents.
///
/// Build one with `OfficeParserConfig::new()` (or `Default::default()`) and chain the
/// consuming `set_*` methods. The fields are only stored here; what each option does is
/// decided by the code that reads this struct, which is not part of this file section.
#[derive(Debug, Clone, PartialEq)]
pub struct OfficeParserConfig {
/// Defaults to `false`. Presumably toggles macro extraction (verify with the consumer).
pub(crate) extract_macros: bool,
/// Defaults to `false`. Presumably includes content marked as deleted (verify).
pub(crate) include_deleted_content: bool,
/// Defaults to `false`. Presumably includes "move from" tracked content (verify).
pub(crate) include_move_from_content: bool,
/// Defaults to `true`. Presumably includes content held in shapes (verify).
pub(crate) include_shape_based_content: bool,
/// Defaults to `false`. Presumably includes headers and footers (verify).
pub(crate) include_headers_and_footers: bool,
/// Defaults to `false`. Presumably includes rows missing from a sheet (verify).
pub(crate) include_missing_rows: bool,
/// Defaults to `true`. Presumably includes slide notes (verify).
pub(crate) include_slide_notes: bool,
/// Defaults to `true`. Presumably includes slide master content (verify).
pub(crate) include_slide_master_content: bool,
/// Defaults to `true`. Presumably concatenates phonetic runs (verify).
pub(crate) concatenate_phonetic_runs: bool,
/// Defaults to `false`. Presumably extracts every alternative body from MSG files (verify).
pub(crate) extract_all_alternatives_from_msg: bool,
}
impl Default for OfficeParserConfig {
    /// Builds the baseline office configuration; the initialisers are grouped by
    /// whether the option starts enabled or disabled.
    fn default() -> Self {
        Self {
            // Enabled out of the box.
            include_shape_based_content: true,
            include_slide_notes: true,
            include_slide_master_content: true,
            concatenate_phonetic_runs: true,

            // Opt-in only.
            extract_macros: false,
            include_deleted_content: false,
            include_move_from_content: false,
            include_headers_and_footers: false,
            include_missing_rows: false,
            extract_all_alternatives_from_msg: false,
        }
    }
}
impl OfficeParserConfig {
    /// Creates a configuration populated with the [`Default`] values.
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Stores `val` in `extract_macros` and returns the updated configuration.
    ///
    /// Like every `set_*` method here, this consumes `self`; the returned value is the
    /// only copy carrying the change, which is why dropping it triggers a `must_use` warning.
    #[must_use]
    pub fn set_extract_macros(mut self, val: bool) -> Self {
        self.extract_macros = val;
        self
    }

    /// Stores `val` in `include_deleted_content` and returns the updated configuration.
    #[must_use]
    pub fn set_include_deleted_content(mut self, val: bool) -> Self {
        self.include_deleted_content = val;
        self
    }

    /// Stores `val` in `include_move_from_content` and returns the updated configuration.
    #[must_use]
    pub fn set_include_move_from_content(mut self, val: bool) -> Self {
        self.include_move_from_content = val;
        self
    }

    /// Stores `val` in `include_shape_based_content` and returns the updated configuration.
    #[must_use]
    pub fn set_include_shape_based_content(mut self, val: bool) -> Self {
        self.include_shape_based_content = val;
        self
    }

    /// Stores `val` in `include_headers_and_footers` and returns the updated configuration.
    #[must_use]
    pub fn set_include_headers_and_footers(mut self, val: bool) -> Self {
        self.include_headers_and_footers = val;
        self
    }

    /// Stores `val` in `include_missing_rows` and returns the updated configuration.
    #[must_use]
    pub fn set_include_missing_rows(mut self, val: bool) -> Self {
        self.include_missing_rows = val;
        self
    }

    /// Stores `val` in `include_slide_notes` and returns the updated configuration.
    #[must_use]
    pub fn set_include_slide_notes(mut self, val: bool) -> Self {
        self.include_slide_notes = val;
        self
    }

    /// Stores `val` in `include_slide_master_content` and returns the updated configuration.
    #[must_use]
    pub fn set_include_slide_master_content(mut self, val: bool) -> Self {
        self.include_slide_master_content = val;
        self
    }

    /// Stores `val` in `concatenate_phonetic_runs` and returns the updated configuration.
    #[must_use]
    pub fn set_concatenate_phonetic_runs(mut self, val: bool) -> Self {
        self.concatenate_phonetic_runs = val;
        self
    }

    /// Stores `val` in `extract_all_alternatives_from_msg` and returns the updated
    /// configuration.
    #[must_use]
    pub fn set_extract_all_alternatives_from_msg(mut self, val: bool) -> Self {
        self.extract_all_alternatives_from_msg = val;
        self
    }
}
/// Options for Tesseract-based OCR.
///
/// Build one with `TesseractOcrConfig::new()` (or `Default::default()`) and chain the
/// consuming `set_*` methods. The fields are only stored here; what each option does is
/// decided by the code that reads this struct, which is not part of this file section.
#[derive(Debug, Clone, PartialEq)]
pub struct TesseractOcrConfig {
/// Defaults to `300`. Presumably the rendering density in DPI (TODO confirm the unit).
pub(crate) density: i32,
/// Defaults to `4`. Presumably a bit depth (TODO confirm).
pub(crate) depth: i32,
/// Defaults to `130`. Timeout in seconds, per the field name; the value is not validated here.
pub(crate) timeout_seconds: i32,
/// Defaults to `false`. Presumably enables image preprocessing before OCR (verify).
pub(crate) enable_image_preprocessing: bool,
/// Defaults to `false`. Presumably enables rotation handling (verify).
pub(crate) apply_rotation: bool,
/// Defaults to `"eng"`. Stored as given; presumably a Tesseract language code (verify).
pub(crate) language: String,
}
impl Default for TesseractOcrConfig {
    /// Builds the baseline OCR configuration: language `"eng"`, density 300, depth 4,
    /// a 130 second timeout, and both boolean options disabled.
    fn default() -> Self {
        let language = String::from("eng");
        let (density, depth, timeout_seconds) = (300, 4, 130);

        Self {
            language,
            density,
            depth,
            timeout_seconds,
            apply_rotation: false,
            enable_image_preprocessing: false,
        }
    }
}
impl TesseractOcrConfig {
    /// Creates a configuration populated with the [`Default`] values.
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Stores `val` in `apply_rotation` and returns the updated configuration.
    ///
    /// Like every `set_*` method here, this consumes `self`; the returned value is the
    /// only copy carrying the change, which is why dropping it triggers a `must_use` warning.
    #[must_use]
    pub fn set_apply_rotation(mut self, val: bool) -> Self {
        self.apply_rotation = val;
        self
    }

    /// Stores `val` in `density` and returns the updated configuration.
    ///
    /// The value is stored as given; no range check is performed here.
    #[must_use]
    pub fn set_density(mut self, val: i32) -> Self {
        self.density = val;
        self
    }

    /// Stores `val` in `depth` and returns the updated configuration.
    ///
    /// The value is stored as given; no range check is performed here.
    #[must_use]
    pub fn set_depth(mut self, val: i32) -> Self {
        self.depth = val;
        self
    }

    /// Stores `val` in `enable_image_preprocessing` and returns the updated configuration.
    #[must_use]
    pub fn set_enable_image_preprocessing(mut self, val: bool) -> Self {
        self.enable_image_preprocessing = val;
        self
    }

    /// Replaces `language` with an owned copy of `val` and returns the updated
    /// configuration.
    ///
    /// The string is stored verbatim; it is not validated here.
    #[must_use]
    pub fn set_language(mut self, val: &str) -> Self {
        self.language = val.to_string();
        self
    }

    /// Stores `val` in `timeout_seconds` and returns the updated configuration.
    ///
    /// The value is stored as given; no range check is performed here.
    #[must_use]
    pub fn set_timeout_seconds(mut self, val: i32) -> Self {
        self.timeout_seconds = val;
        self
    }
}