1use serde::{Deserialize, Serialize};
4
5#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
9#[serde(rename_all = "lowercase")]
10pub enum OutputFormat {
11 #[default]
13 Json,
14 Text,
16}
17
18#[derive(Debug, Clone, Default, Serialize, Deserialize)]
21#[serde(rename_all = "camelCase", default)]
22pub struct DebugConfig {
23 pub enabled: bool,
24 #[serde(skip_serializing_if = "Option::is_none")]
25 pub trace: Option<bool>,
26 #[serde(skip_serializing_if = "Option::is_none")]
27 pub visualize: Option<bool>,
28 #[serde(skip_serializing_if = "Option::is_none")]
29 pub visualize_path: Option<String>,
30 #[serde(skip_serializing_if = "Option::is_none")]
31 pub output_path: Option<String>,
32 #[serde(skip_serializing_if = "Option::is_none")]
33 pub text_filter: Option<Vec<String>>,
34 #[serde(skip_serializing_if = "Option::is_none")]
35 pub page_filter: Option<u32>,
36 #[serde(skip_serializing_if = "Option::is_none")]
37 pub region_filter: Option<RegionFilter>,
38}
39
40#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
41#[serde(rename_all = "camelCase")]
42pub struct RegionFilter {
43 pub x1: f64,
44 pub y1: f64,
45 pub x2: f64,
46 pub y2: f64,
47}
48
49#[derive(Debug, Clone, Serialize, Deserialize)]
51#[serde(untagged)]
52pub enum Language {
53 Single(String),
54 Multiple(Vec<String>),
55}
56
57impl Default for Language {
58 fn default() -> Self {
59 Language::Single("en".into())
60 }
61}
62
63impl Language {
64 pub fn as_strings(&self) -> Vec<&str> {
65 match self {
66 Language::Single(s) => vec![s.as_str()],
67 Language::Multiple(v) => v.iter().map(String::as_str).collect(),
68 }
69 }
70}
71
72#[derive(Debug, Clone, Serialize, Deserialize)]
75#[serde(rename_all = "camelCase")]
76pub struct ParseConfig {
77 pub ocr_language: Language,
78 pub ocr_enabled: bool,
79 #[serde(skip_serializing_if = "Option::is_none")]
80 pub ocr_server_url: Option<String>,
81 #[serde(skip_serializing_if = "Option::is_none")]
82 pub tessdata_path: Option<String>,
83 pub num_workers: usize,
84 pub max_pages: u32,
85 #[serde(skip_serializing_if = "Option::is_none")]
86 pub target_pages: Option<String>,
87 pub dpi: u32,
88 pub output_format: OutputFormat,
89 pub precise_bounding_box: bool,
90 pub preserve_very_small_text: bool,
91 pub preserve_layout_alignment_across_pages: bool,
92 #[serde(skip_serializing_if = "Option::is_none")]
93 pub password: Option<String>,
94 #[serde(skip_serializing_if = "Option::is_none")]
99 pub timeout_secs: Option<u64>,
100 #[serde(skip_serializing_if = "Option::is_none")]
103 pub max_input_bytes: Option<u64>,
104 #[serde(skip_serializing_if = "Option::is_none")]
105 pub debug: Option<DebugConfig>,
106}
107
108impl Default for ParseConfig {
109 fn default() -> Self {
110 Self {
111 ocr_language: Language::default(),
112 ocr_enabled: true,
113 ocr_server_url: None,
114 tessdata_path: None,
115 num_workers: 4,
116 max_pages: 1000,
117 target_pages: None,
118 dpi: 150,
119 output_format: OutputFormat::Json,
120 precise_bounding_box: true,
121 preserve_very_small_text: false,
122 preserve_layout_alignment_across_pages: false,
123 password: None,
124 timeout_secs: None,
125 max_input_bytes: None,
126 debug: None,
127 }
128 }
129}
130
#[cfg(test)]
mod tests {
    use super::*;

    /// Pins every value produced by `ParseConfig::default()`.
    #[test]
    fn defaults_match_liteparse() {
        let c = ParseConfig::default();
        assert!(c.ocr_enabled);
        assert_eq!(c.num_workers, 4);
        assert_eq!(c.max_pages, 1000);
        assert_eq!(c.dpi, 150);
        assert_eq!(c.output_format, OutputFormat::Json);
        assert!(c.precise_bounding_box);

        // The remaining defaults, so a silent change to any of them is caught.
        assert_eq!(c.ocr_language.as_strings(), vec!["en"]);
        assert!(!c.preserve_very_small_text);
        assert!(!c.preserve_layout_alignment_across_pages);
        assert!(c.ocr_server_url.is_none());
        assert!(c.tessdata_path.is_none());
        assert!(c.target_pages.is_none());
        assert!(c.password.is_none());
        assert!(c.timeout_secs.is_none());
        assert!(c.max_input_bytes.is_none());
        assert!(c.debug.is_none());
    }

    /// Parses a partial config document and checks that its values map onto
    /// the config types.
    #[test]
    fn partial_config_deserializes() {
        let json = r#"{"ocrLanguage":"fra","dpi":300,"outputFormat":"text"}"#;
        let patch: serde_json::Value = serde_json::from_str(json).unwrap();
        assert_eq!(patch["dpi"], 300);

        // Parsing into `Value` alone never touches the config types, so also
        // run the individual values through their serde mappings.
        let lang: Language = serde_json::from_value(patch["ocrLanguage"].clone())
            .expect("a bare JSON string is a valid Language");
        assert_eq!(lang.as_strings(), vec!["fra"]);

        let format: OutputFormat = serde_json::from_value(patch["outputFormat"].clone())
            .expect("\"text\" is a valid OutputFormat");
        assert_eq!(format, OutputFormat::Text);
    }
}