Skip to main content

edgeparse_core/api/
config.rs

1//! Processing configuration for EdgeParse.
2
3use serde::{Deserialize, Serialize};
4
5/// Output format selection.
6#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
7pub enum OutputFormat {
8    /// JSON structured output
9    Json,
10    /// Plain text
11    Text,
12    /// HTML5
13    Html,
14    /// Annotated PDF with bounding boxes
15    Pdf,
16    /// Markdown
17    Markdown,
18    /// Markdown with HTML tables
19    MarkdownWithHtml,
20    /// Markdown with embedded images
21    MarkdownWithImages,
22}
23
24/// Table detection method.
25#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
26pub enum TableMethod {
27    /// Border-based detection only
28    Default,
29    /// Border + cluster detection
30    Cluster,
31}
32
33/// Reading order algorithm.
34#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
35pub enum ReadingOrder {
36    /// No reading order sorting
37    Off,
38    /// XY-Cut++ algorithm
39    XyCut,
40}
41
42/// Image output mode.
43#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
44pub enum ImageOutput {
45    /// No image extraction
46    Off,
47    /// Base64 data URIs embedded in output
48    Embedded,
49    /// External file references
50    External,
51}
52
53/// Image format for extracted images.
54#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
55pub enum ImageFormat {
56    /// PNG format
57    Png,
58    /// JPEG format
59    Jpeg,
60}
61
62/// Hybrid backend selection.
63#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
64pub enum HybridBackend {
65    /// No hybrid processing
66    Off,
67    /// Docling Fast server
68    DoclingFast,
69}
70
71/// Hybrid triage mode.
72#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
73pub enum HybridMode {
74    /// Dynamic triage — route pages based on complexity
75    Auto,
76    /// Skip triage — send all pages to backend
77    Full,
78}
79
80/// Main processing configuration (24 options from CLI).
81#[derive(Debug, Clone, Serialize, Deserialize)]
82pub struct ProcessingConfig {
83    /// Output directory (default: input file directory)
84    pub output_dir: Option<String>,
85    /// Password for encrypted PDFs
86    pub password: Option<String>,
87    /// Output formats
88    pub formats: Vec<OutputFormat>,
89    /// Suppress console logging
90    pub quiet: bool,
91    /// Content safety filter configuration
92    pub filter_config: super::filter::FilterConfig,
93    /// Enable PII sanitization
94    pub sanitize: bool,
95    /// Preserve original line breaks
96    pub keep_line_breaks: bool,
97    /// Replacement character for invalid characters
98    pub replace_invalid_chars: String,
99    /// Use PDF structure tree for tagged PDFs
100    pub use_struct_tree: bool,
101    /// Table detection method
102    pub table_method: TableMethod,
103    /// Reading order algorithm
104    pub reading_order: ReadingOrder,
105    /// Page separator for Markdown output
106    pub markdown_page_separator: Option<String>,
107    /// Page separator for text output
108    pub text_page_separator: Option<String>,
109    /// Page separator for HTML output
110    pub html_page_separator: Option<String>,
111    /// Image output mode
112    pub image_output: ImageOutput,
113    /// Image format
114    pub image_format: ImageFormat,
115    /// Directory for extracted images
116    pub image_dir: Option<String>,
117    /// Enable raster table OCR recovery on image-based tables
118    pub raster_table_ocr: bool,
119    /// Pages to extract (e.g., "1,3,5-7")
120    pub pages: Option<String>,
121    /// Include headers/footers in output
122    pub include_header_footer: bool,
123    /// Hybrid backend
124    pub hybrid: HybridBackend,
125    /// Hybrid triage mode
126    pub hybrid_mode: HybridMode,
127    /// Hybrid backend URL
128    pub hybrid_url: Option<String>,
129    /// Hybrid timeout in milliseconds
130    pub hybrid_timeout: u64,
131    /// Enable local fallback on hybrid error
132    pub hybrid_fallback: bool,
133}
134
135impl Default for ProcessingConfig {
136    fn default() -> Self {
137        Self {
138            output_dir: None,
139            password: None,
140            formats: vec![OutputFormat::Json],
141            quiet: false,
142            filter_config: super::filter::FilterConfig::default(),
143            sanitize: false,
144            keep_line_breaks: false,
145            replace_invalid_chars: " ".to_string(),
146            use_struct_tree: false,
147            table_method: TableMethod::Default,
148            reading_order: ReadingOrder::XyCut,
149            markdown_page_separator: None,
150            text_page_separator: None,
151            html_page_separator: None,
152            image_output: ImageOutput::External,
153            image_format: ImageFormat::Png,
154            image_dir: None,
155            raster_table_ocr: true,
156            pages: None,
157            include_header_footer: false,
158            hybrid: HybridBackend::Off,
159            hybrid_mode: HybridMode::Auto,
160            hybrid_url: None,
161            hybrid_timeout: 30000,
162            hybrid_fallback: false,
163        }
164    }
165}
166
#[cfg(test)]
mod tests {
    use super::*;

    /// Pins a selection of the values produced by `ProcessingConfig::default()`.
    #[test]
    fn test_default_config() {
        // Pull out only the fields under test; the rest are ignored.
        let ProcessingConfig {
            formats,
            quiet,
            sanitize,
            reading_order,
            table_method,
            image_output,
            image_format,
            raster_table_ocr,
            hybrid,
            hybrid_timeout,
            ..
        } = ProcessingConfig::default();

        assert_eq!(formats, vec![OutputFormat::Json]);
        assert!(!quiet);
        assert!(!sanitize);
        assert_eq!(reading_order, ReadingOrder::XyCut);
        assert_eq!(table_method, TableMethod::Default);
        assert_eq!(image_output, ImageOutput::External);
        assert_eq!(image_format, ImageFormat::Png);
        assert!(raster_table_ocr);
        assert_eq!(hybrid, HybridBackend::Off);
        assert_eq!(hybrid_timeout, 30000);
    }
}