Skip to main content

spdf_types/
config.rs

1//! Configuration types. Defaults match `liteparse/src/core/config.ts`.
2
3use serde::{Deserialize, Serialize};
4
5/// Output format.
6///
7/// Mirrors `OutputFormat` in [`liteparse/src/core/types.ts`].
8#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
9#[serde(rename_all = "lowercase")]
10pub enum OutputFormat {
11    /// Structured JSON with per-page text items, bounding boxes, and metadata.
12    #[default]
13    Json,
14    /// Plain text with spatial layout preserved.
15    Text,
16}
17
18/// Grid projection debug knobs. Replaces the bespoke `gridDebugLogger` with a
19/// `tracing`-friendly config surface.
20#[derive(Debug, Clone, Default, Serialize, Deserialize)]
21#[serde(rename_all = "camelCase", default)]
22pub struct DebugConfig {
23    pub enabled: bool,
24    #[serde(skip_serializing_if = "Option::is_none")]
25    pub trace: Option<bool>,
26    #[serde(skip_serializing_if = "Option::is_none")]
27    pub visualize: Option<bool>,
28    #[serde(skip_serializing_if = "Option::is_none")]
29    pub visualize_path: Option<String>,
30    #[serde(skip_serializing_if = "Option::is_none")]
31    pub output_path: Option<String>,
32    #[serde(skip_serializing_if = "Option::is_none")]
33    pub text_filter: Option<Vec<String>>,
34    #[serde(skip_serializing_if = "Option::is_none")]
35    pub page_filter: Option<u32>,
36    #[serde(skip_serializing_if = "Option::is_none")]
37    pub region_filter: Option<RegionFilter>,
38}
39
40#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
41#[serde(rename_all = "camelCase")]
42pub struct RegionFilter {
43    pub x1: f64,
44    pub y1: f64,
45    pub x2: f64,
46    pub y2: f64,
47}
48
49/// One or more language codes.
50#[derive(Debug, Clone, Serialize, Deserialize)]
51#[serde(untagged)]
52pub enum Language {
53    Single(String),
54    Multiple(Vec<String>),
55}
56
57impl Default for Language {
58    fn default() -> Self {
59        Language::Single("en".into())
60    }
61}
62
63impl Language {
64    pub fn as_strings(&self) -> Vec<&str> {
65        match self {
66            Language::Single(s) => vec![s.as_str()],
67            Language::Multiple(v) => v.iter().map(String::as_str).collect(),
68        }
69    }
70}
71
72/// Full parser configuration. Defaults are identical to
73/// `DEFAULT_CONFIG` in [`liteparse/src/core/config.ts`].
74#[derive(Debug, Clone, Serialize, Deserialize)]
75#[serde(rename_all = "camelCase")]
76pub struct ParseConfig {
77    pub ocr_language: Language,
78    pub ocr_enabled: bool,
79    #[serde(skip_serializing_if = "Option::is_none")]
80    pub ocr_server_url: Option<String>,
81    #[serde(skip_serializing_if = "Option::is_none")]
82    pub tessdata_path: Option<String>,
83    pub num_workers: usize,
84    pub max_pages: u32,
85    #[serde(skip_serializing_if = "Option::is_none")]
86    pub target_pages: Option<String>,
87    pub dpi: u32,
88    pub output_format: OutputFormat,
89    pub precise_bounding_box: bool,
90    pub preserve_very_small_text: bool,
91    pub preserve_layout_alignment_across_pages: bool,
92    #[serde(skip_serializing_if = "Option::is_none")]
93    pub password: Option<String>,
94    /// Fail `parse()` with [`SpdfError::InvalidInput`] once wall-clock
95    /// work exceeds this many seconds. `None` = no deadline. Intended
96    /// as a defensive guard against pathological adversarial PDFs;
97    /// legitimate documents should never hit this.
98    #[serde(skip_serializing_if = "Option::is_none")]
99    pub timeout_secs: Option<u64>,
100    /// Hard cap on the size of an input blob accepted by `parse`. `None`
101    /// = no cap. Paths are not checked; only `ParseInput::Bytes`.
102    #[serde(skip_serializing_if = "Option::is_none")]
103    pub max_input_bytes: Option<u64>,
104    #[serde(skip_serializing_if = "Option::is_none")]
105    pub debug: Option<DebugConfig>,
106}
107
108impl Default for ParseConfig {
109    fn default() -> Self {
110        Self {
111            ocr_language: Language::default(),
112            ocr_enabled: true,
113            ocr_server_url: None,
114            tessdata_path: None,
115            num_workers: 4,
116            max_pages: 1000,
117            target_pages: None,
118            dpi: 150,
119            output_format: OutputFormat::Json,
120            precise_bounding_box: true,
121            preserve_very_small_text: false,
122            preserve_layout_alignment_across_pages: false,
123            password: None,
124            timeout_secs: None,
125            max_input_bytes: None,
126            debug: None,
127        }
128    }
129}
130
#[cfg(test)]
mod tests {
    use super::*;

    /// Pins every value produced by `ParseConfig::default()`.
    #[test]
    fn defaults_match_liteparse() {
        let c = ParseConfig::default();
        assert_eq!(c.ocr_language.as_strings(), vec!["en"]);
        assert!(c.ocr_enabled);
        assert!(c.ocr_server_url.is_none());
        assert!(c.tessdata_path.is_none());
        assert_eq!(c.num_workers, 4);
        assert_eq!(c.max_pages, 1000);
        assert!(c.target_pages.is_none());
        assert_eq!(c.dpi, 150);
        assert_eq!(c.output_format, OutputFormat::Json);
        assert!(c.precise_bounding_box);
        assert!(!c.preserve_very_small_text);
        assert!(!c.preserve_layout_alignment_across_pages);
        assert!(c.password.is_none());
        assert!(c.timeout_secs.is_none());
        assert!(c.max_input_bytes.is_none());
        assert!(c.debug.is_none());
    }

    /// A partial patch is not a complete config on its own, but it
    /// deserializes once overlaid on the serialized defaults.
    #[test]
    fn partial_config_deserializes() {
        let json = r#"{"ocrLanguage":"fra","dpi":300,"outputFormat":"text"}"#;
        let patch: serde_json::Value = serde_json::from_str(json).unwrap();
        assert_eq!(patch["dpi"], 300);

        // Non-optional ParseConfig fields are required on the wire, so the
        // bare patch must be rejected.
        assert!(serde_json::from_str::<ParseConfig>(json).is_err());

        // Overlay the patch on the defaults, then deserialize the merged
        // object to check the camelCase keys land on the right fields.
        let mut merged = serde_json::to_value(ParseConfig::default()).unwrap();
        let target = merged
            .as_object_mut()
            .expect("ParseConfig serializes to a JSON object");
        for (key, value) in patch.as_object().expect("patch is a JSON object") {
            target.insert(key.clone(), value.clone());
        }

        let cfg: ParseConfig = serde_json::from_value(merged).unwrap();
        assert_eq!(cfg.ocr_language.as_strings(), vec!["fra"]);
        assert_eq!(cfg.dpi, 300);
        assert_eq!(cfg.output_format, OutputFormat::Text);
        // Fields the patch does not mention keep their defaults.
        assert!(cfg.ocr_enabled);
        assert_eq!(cfg.num_workers, 4);
        assert_eq!(cfg.max_pages, 1000);
    }

    /// The default config serializes with camelCase keys, omits unset
    /// optional fields, and deserializes back to the same values.
    #[test]
    fn default_config_round_trips() {
        let wire = serde_json::to_value(ParseConfig::default()).unwrap();
        assert_eq!(wire["ocrLanguage"], "en");
        assert_eq!(wire["numWorkers"], 4);
        assert_eq!(wire["outputFormat"], "json");
        assert!(wire.get("password").is_none());
        assert!(wire.get("debug").is_none());

        let back: ParseConfig = serde_json::from_value(wire).unwrap();
        assert_eq!(back.ocr_language.as_strings(), vec!["en"]);
        assert_eq!(back.dpi, 150);
        assert_eq!(back.output_format, OutputFormat::Json);
        assert!(back.password.is_none());
        assert!(back.debug.is_none());
    }
}