Skip to main content

edgeparse_core/api/
config.rs

1//! Processing configuration for EdgeParse.
2
3use serde::{Deserialize, Serialize};
4
5/// Output format selection.
6#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
7pub enum OutputFormat {
8    /// JSON structured output
9    Json,
10    /// Plain text
11    Text,
12    /// HTML5
13    Html,
14    /// Annotated PDF with bounding boxes
15    Pdf,
16    /// Markdown
17    Markdown,
18    /// Markdown with HTML tables
19    MarkdownWithHtml,
20    /// Markdown with embedded images
21    MarkdownWithImages,
22}
23
24/// Table detection method.
25#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
26pub enum TableMethod {
27    /// Border-based detection only
28    Default,
29    /// Border + cluster detection
30    Cluster,
31}
32
33/// Reading order algorithm.
34#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
35pub enum ReadingOrder {
36    /// No reading order sorting
37    Off,
38    /// XY-Cut++ algorithm
39    XyCut,
40}
41
42/// Image output mode.
43#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
44pub enum ImageOutput {
45    /// No image extraction
46    Off,
47    /// Base64 data URIs embedded in output
48    Embedded,
49    /// External file references
50    External,
51}
52
53/// Image format for extracted images.
54#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
55pub enum ImageFormat {
56    /// PNG format
57    Png,
58    /// JPEG format
59    Jpeg,
60}
61
62/// Hybrid backend selection.
63#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
64pub enum HybridBackend {
65    /// No hybrid processing
66    Off,
67    /// Docling Fast server
68    DoclingFast,
69}
70
71/// Hybrid triage mode.
72#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
73pub enum HybridMode {
74    /// Dynamic triage — route pages based on complexity
75    Auto,
76    /// Skip triage — send all pages to backend
77    Full,
78}
79
80/// Main processing configuration (24 options from CLI).
81#[derive(Debug, Clone, Serialize, Deserialize)]
82pub struct ProcessingConfig {
83    /// Output directory (default: input file directory)
84    pub output_dir: Option<String>,
85    /// Password for encrypted PDFs
86    pub password: Option<String>,
87    /// Output formats
88    pub formats: Vec<OutputFormat>,
89    /// Suppress console logging
90    pub quiet: bool,
91    /// Content safety filter configuration
92    pub filter_config: super::filter::FilterConfig,
93    /// Enable PII sanitization
94    pub sanitize: bool,
95    /// Preserve original line breaks
96    pub keep_line_breaks: bool,
97    /// Replacement character for invalid characters
98    pub replace_invalid_chars: String,
99    /// Use PDF structure tree for tagged PDFs
100    pub use_struct_tree: bool,
101    /// Table detection method
102    pub table_method: TableMethod,
103    /// Reading order algorithm
104    pub reading_order: ReadingOrder,
105    /// Page separator for Markdown output
106    pub markdown_page_separator: Option<String>,
107    /// Page separator for text output
108    pub text_page_separator: Option<String>,
109    /// Page separator for HTML output
110    pub html_page_separator: Option<String>,
111    /// Image output mode
112    pub image_output: ImageOutput,
113    /// Image format
114    pub image_format: ImageFormat,
115    /// Directory for extracted images
116    pub image_dir: Option<String>,
117    /// Pages to extract (e.g., "1,3,5-7")
118    pub pages: Option<String>,
119    /// Include headers/footers in output
120    pub include_header_footer: bool,
121    /// Hybrid backend
122    pub hybrid: HybridBackend,
123    /// Hybrid triage mode
124    pub hybrid_mode: HybridMode,
125    /// Hybrid backend URL
126    pub hybrid_url: Option<String>,
127    /// Hybrid timeout in milliseconds
128    pub hybrid_timeout: u64,
129    /// Enable local fallback on hybrid error
130    pub hybrid_fallback: bool,
131}
132
133impl Default for ProcessingConfig {
134    fn default() -> Self {
135        Self {
136            output_dir: None,
137            password: None,
138            formats: vec![OutputFormat::Json],
139            quiet: false,
140            filter_config: super::filter::FilterConfig::default(),
141            sanitize: false,
142            keep_line_breaks: false,
143            replace_invalid_chars: " ".to_string(),
144            use_struct_tree: false,
145            table_method: TableMethod::Default,
146            reading_order: ReadingOrder::XyCut,
147            markdown_page_separator: None,
148            text_page_separator: None,
149            html_page_separator: None,
150            image_output: ImageOutput::External,
151            image_format: ImageFormat::Png,
152            image_dir: None,
153            pages: None,
154            include_header_footer: false,
155            hybrid: HybridBackend::Off,
156            hybrid_mode: HybridMode::Auto,
157            hybrid_url: None,
158            hybrid_timeout: 30000,
159            hybrid_fallback: false,
160        }
161    }
162}
163
#[cfg(test)]
mod tests {
    use super::*;

    /// Pins the defaults returned by `ProcessingConfig::default()`.
    ///
    /// Every field is checked except `filter_config`: whether `FilterConfig`
    /// implements `PartialEq` is not visible from this module, so it is left
    /// to that type's own tests.
    #[test]
    fn test_default_config() {
        let config = ProcessingConfig::default();

        // Destination, credentials and requested outputs.
        assert!(config.output_dir.is_none());
        assert!(config.password.is_none());
        assert_eq!(config.formats, vec![OutputFormat::Json]);
        assert!(!config.quiet);

        // Text clean-up.
        assert!(!config.sanitize);
        assert!(!config.keep_line_breaks);
        assert_eq!(config.replace_invalid_chars, " ");

        // Layout analysis.
        assert!(!config.use_struct_tree);
        assert_eq!(config.table_method, TableMethod::Default);
        assert_eq!(config.reading_order, ReadingOrder::XyCut);

        // Page separators.
        assert!(config.markdown_page_separator.is_none());
        assert!(config.text_page_separator.is_none());
        assert!(config.html_page_separator.is_none());

        // Image extraction.
        assert_eq!(config.image_output, ImageOutput::External);
        assert_eq!(config.image_format, ImageFormat::Png);
        assert!(config.image_dir.is_none());

        // Page selection.
        assert!(config.pages.is_none());
        assert!(!config.include_header_footer);

        // Hybrid backend.
        assert_eq!(config.hybrid, HybridBackend::Off);
        assert_eq!(config.hybrid_mode, HybridMode::Auto);
        assert!(config.hybrid_url.is_none());
        assert_eq!(config.hybrid_timeout, 30000);
        assert!(!config.hybrid_fallback);
    }
}