oxipdf-ir 0.1.0

Intermediate representation types for the oxipdf PDF engine
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
//! Pipeline configuration: resource limits, policies, and mode flags.
//!
//! `RenderConfig` is the single top-level configuration struct that consumers
//! pass to the oxipdf pipeline. It lives in `oxipdf-ir` so that all downstream
//! crates can read the fields relevant to their stage.

use crate::page_template::PageTemplate;
use crate::units::Pt;

/// Top-level configuration for a render pipeline invocation.
///
/// Consumers construct this once and pass it (or relevant subsets) to each
/// pipeline stage. All fields have safe defaults via [`Default`].
#[derive(Debug, Clone)]
pub struct RenderConfig {
    /// Hard resource limits enforced during IR ingestion and pipeline execution.
    pub resource_limits: ResourceLimits,
    /// Policy for handling missing glyphs during text shaping.
    pub missing_glyph_policy: MissingGlyphPolicy,
    /// When `true`, the pipeline produces byte-identical output for identical
    /// input: deterministic object ordering, normalized metadata (fixed
    /// creation date, stable producer string), and no platform-dependent
    /// variation. When `false`, the pipeline may embed timestamps and use
    /// arbitrary (but still correct) object ordering for speed.
    pub deterministic_mode: bool,
    /// Page width for the output document. Default: A4 width (595.276 pt).
    pub page_width: Pt,
    /// Page height for the output document. Default: A4 height (841.890 pt).
    pub page_height: Pt,
    /// DPI threshold for downsampling raster images during SVG emission.
    ///
    /// Images with DPI above this threshold may be downsampled to reduce
    /// output size. Set to `f64::INFINITY` to disable downsampling.
    /// Default: 300.0 DPI (print-quality threshold).
    pub dpi_downsampling_threshold: f64,
    /// Page template controlling margins, headers, and footers.
    ///
    /// When `Some`, paginated render functions use the content area
    /// (page size minus margins and header/footer heights) for page splitting
    /// and offset body content accordingly. Single-page renders ignore this.
    /// Default: `None`.
    pub page_template: Option<PageTemplate>,
    /// Hyphenation configuration. When enabled, the line breaker considers
    /// hyphenation points within words as secondary break opportunities.
    pub hyphenation: HyphenationConfig,
    /// Per-section page configuration overrides (page size, margins,
    /// headers/footers). Default: empty (no per-section overrides).
    pub page_sections: Vec<PageSectionConfig>,
    /// Footnote rendering configuration.
    pub footnote: crate::node::FootnoteConfig,
    /// Table of contents configuration.
    pub toc: TocConfig,
    /// PDF document metadata (Info dict + XMP).
    pub metadata: PdfMetadata,
    /// PDF output intent for color management (PDF/A compliance).
    /// Default: `None` (no output intent is configured).
    pub output_intent: Option<OutputIntentConfig>,
}

/// PDF document metadata written to the Info dictionary and XMP stream.
///
/// Every field is optional. The derived [`Default`] leaves all `Option`
/// fields as `None` and `custom` empty.
#[derive(Debug, Clone, Default)]
pub struct PdfMetadata {
    /// Document title.
    pub title: Option<String>,
    /// Document author(s).
    pub author: Option<String>,
    /// Document subject / description.
    pub subject: Option<String>,
    /// Keywords, stored as one string (e.g., comma-separated).
    pub keywords: Option<String>,
    /// Creator application (e.g., "oxidoc 1.0").
    pub creator: Option<String>,
    /// Producer (defaults to "oxipdf" in deterministic mode).
    pub producer: Option<String>,
    /// Custom metadata key-value pairs (written to Info dict).
    pub custom: Vec<(String, String)>,
    /// Creation date. When `None`, uses current time (or fixed date in
    /// deterministic mode).
    // NOTE(review): the tuple is presumably (year, month, day) — confirm
    // against the code that serializes it.
    pub creation_date: Option<(u16, u8, u8)>,
    /// Modification date. When `None`, uses current time (or fixed date
    /// in deterministic mode).
    // NOTE(review): presumably the same (year, month, day) layout as
    // `creation_date` — confirm.
    pub modification_date: Option<(u16, u8, u8)>,
}

/// Output-intent settings used for PDF color management.
///
/// Needed for PDF/A compliance: names the color space the document is
/// intended to be rendered in.
#[derive(Debug, Clone)]
pub struct OutputIntentConfig {
    /// Identifier of the output condition (e.g., "sRGB IEC61966-2.1").
    pub condition_identifier: String,
    /// Description of the output condition, meant for human readers.
    pub condition: String,
    /// Name of the registry that defines the condition
    /// (e.g., "http://www.color.org").
    pub registry_name: String,
    /// Raw bytes of an ICC profile to embed (sRGB profile bytes).
    /// With `None`, only the identifier is used and no profile is embedded.
    pub icc_profile: Option<Vec<u8>>,
}

impl OutputIntentConfig {
    /// Build the standard sRGB output intent.
    ///
    /// No ICC profile is embedded: `icc_profile` is `None`.
    #[must_use]
    pub fn srgb() -> Self {
        let condition_identifier = String::from("sRGB IEC61966-2.1");
        let condition = String::from("sRGB");
        let registry_name = String::from("http://www.color.org");
        Self {
            condition_identifier,
            condition,
            registry_name,
            icc_profile: None,
        }
    }
}

/// Table of contents generation configuration.
///
/// The [`Default`] implementation leaves generation disabled.
#[derive(Debug, Clone)]
pub struct TocConfig {
    /// Whether to generate a table of contents. Default: `false`.
    pub enabled: bool,
    /// Maximum heading level to include (1–6). Default: 3 (H1, H2, H3).
    pub max_level: u8,
    /// Whether to use dot leaders between title and page number.
    /// Default: `true`.
    pub dot_leaders: bool,
    /// Title for the TOC section. `None` for no title.
    /// Default: `Some("Contents")`.
    pub title: Option<String>,
    /// Indentation increment per heading level (in points). Default: 15.0.
    pub indent_per_level: Pt,
    /// Target width for TOC entries (for dot leader alignment).
    /// Defaults to page content width. Default value: `None`.
    pub entry_width: Option<Pt>,
}

impl Default for TocConfig {
    fn default() -> Self {
        Self {
            enabled: false,
            max_level: 3,
            dot_leaders: true,
            title: Some("Contents".to_string()),
            indent_per_level: Pt::new(15.0),
            entry_width: None,
        }
    }
}

/// Per-section page configuration.
///
/// Defines overrides that take effect at a specific section boundary. The
/// boundary is identified by `start_element_id`. Page width and height are
/// always given explicitly; the remaining overrides are optional.
#[derive(Debug, Clone)]
pub struct PageSectionConfig {
    /// Element ID marking the start of this section.
    pub start_element_id: String,
    /// Page width for this section.
    pub width: Pt,
    /// Page height for this section.
    pub height: Pt,
    /// Optional margin overrides for this section.
    pub margins: Option<crate::page_template::PageMargins>,
    /// Optional header tree for this section (overrides global header).
    pub header: Option<crate::tree::StyledTree>,
    /// Optional footer tree for this section (overrides global footer).
    pub footer: Option<crate::tree::StyledTree>,
    /// Page number style for this section. `None` inherits from previous section.
    ///
    /// Note that `Option::None` here is distinct from
    /// `Some(PageNumberStyle::None)`, which formats page numbers as an
    /// empty string.
    pub page_number_style: Option<PageNumberStyle>,
    /// Whether to restart page numbering at 1 when this section begins.
    pub restart_numbering: bool,
}

/// Page number formatting style.
///
/// The default style is [`PageNumberStyle::Arabic`]. Use
/// [`PageNumberStyle::format`] to render a page number in a given style.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub enum PageNumberStyle {
    /// 1, 2, 3, ...
    #[default]
    Arabic,
    /// i, ii, iii, iv, ...
    RomanLower,
    /// I, II, III, IV, ...
    RomanUpper,
    /// a, b, c, ..., z, aa, ab, ...
    AlphaLower,
    /// Suppress page numbers (empty string).
    ///
    /// Not to be confused with `Option::None`; always write this variant
    /// with its enum path (`PageNumberStyle::None`).
    None,
}

impl PageNumberStyle {
    /// Render page number `n` as text in this style.
    ///
    /// * `Arabic` yields the decimal digits.
    /// * `RomanUpper` / `RomanLower` delegate to `to_roman`, lowercasing the
    ///   result for the latter.
    /// * `AlphaLower` delegates to `format_alpha_lower`.
    /// * `None` always yields an empty string.
    #[must_use]
    pub fn format(&self, n: u32) -> String {
        match *self {
            Self::None => String::new(),
            Self::Arabic => n.to_string(),
            Self::RomanUpper => to_roman(n),
            // `to_roman` emits only ASCII, so ASCII lowercasing is exact.
            Self::RomanLower => to_roman(n).to_ascii_lowercase(),
            Self::AlphaLower => format_alpha_lower(n),
        }
    }
}

/// Render `n` as an uppercase Roman numeral.
///
/// Uses the usual subtractive forms (IV, IX, XL, XC, CD, CM). Roman numerals
/// have no zero, so `0` is rendered as the string `"0"`. Values of 4000 and
/// above simply repeat `M` as many times as needed.
fn to_roman(mut n: u32) -> String {
    // Value/symbol pairs in descending order; greedy consumption from the
    // largest value down produces the canonical numeral.
    const NUMERALS: [(u32, &str); 13] = [
        (1000, "M"),
        (900, "CM"),
        (500, "D"),
        (400, "CD"),
        (100, "C"),
        (90, "XC"),
        (50, "L"),
        (40, "XL"),
        (10, "X"),
        (9, "IX"),
        (5, "V"),
        (4, "IV"),
        (1, "I"),
    ];

    if n == 0 {
        return String::from("0");
    }

    let mut numeral = String::new();
    for (value, symbol) in NUMERALS.iter().copied() {
        // Emit this symbol once for every whole multiple of its value,
        // then continue with what is left over.
        for _ in 0..n / value {
            numeral.push_str(symbol);
        }
        n %= value;
    }
    numeral
}

/// Render `n` as lowercase letters: 1→a, 26→z, 27→aa, 28→ab, ...
///
/// This is a bijective base-26 numbering, so there is no letter for zero and
/// `0` is rendered as an empty string.
fn format_alpha_lower(n: u32) -> String {
    // Letters are produced least-significant first and reversed at the end.
    let mut letters: Vec<u8> = Vec::new();
    let mut remaining = n;
    while remaining > 0 {
        // Shift to a zero-based digit before taking the remainder; this is
        // what makes the numbering bijective (z is followed by aa, not ba).
        remaining -= 1;
        letters.push(b'a' + (remaining % 26) as u8);
        remaining /= 26;
    }
    letters.reverse();
    letters.into_iter().map(char::from).collect()
}

/// Settings that control hyphenation in the line breaking pipeline.
///
/// Hyphenation is disabled in the [`Default`] configuration.
#[derive(Debug, Clone)]
pub struct HyphenationConfig {
    /// Turns hyphenation on or off. Default: `false`.
    pub enabled: bool,
    /// Shortest word length for which hyphenation is attempted; shorter
    /// words are never hyphenated. Default: 5.
    pub min_word_length: u32,
    /// Fewest characters that must precede the hyphenation point. Default: 2.
    pub min_left: u32,
    /// Fewest characters that must follow the hyphenation point. Default: 2.
    pub min_right: u32,
}

impl Default for HyphenationConfig {
    /// Hyphenation off, with a 5-character minimum word length and at least
    /// 2 characters on each side of a break.
    fn default() -> Self {
        HyphenationConfig {
            min_right: 2,
            min_left: 2,
            min_word_length: 5,
            enabled: false,
        }
    }
}

impl Default for RenderConfig {
    fn default() -> Self {
        Self {
            resource_limits: ResourceLimits::default(),
            missing_glyph_policy: MissingGlyphPolicy::default(),
            deterministic_mode: false,
            page_width: Pt::new(595.276),  // A4
            page_height: Pt::new(841.890), // A4
            dpi_downsampling_threshold: 300.0,
            page_template: None,
            hyphenation: HyphenationConfig::default(),
            page_sections: Vec::new(),
            footnote: crate::node::FootnoteConfig::default(),
            toc: TocConfig::default(),
            metadata: PdfMetadata::default(),
            output_intent: None,
        }
    }
}

/// Hard caps on resource usage, checked before and during pipeline
/// execution (§8).
///
/// They guard against pathological or malicious input. Going over any cap
/// yields an [`InputValidationError::ResourceLimitExceeded`](crate::error::InputValidationError::ResourceLimitExceeded)
/// or a `FragmentationError` from `oxipdf-fragment` — output is never
/// silently truncated.
///
/// The defaults are intentionally roomy for ordinary documents while still
/// stopping runaway resource consumption on adversarial input.
#[derive(Debug, Clone)]
pub struct ResourceLimits {
    /// Upper bound on the number of nodes in the `StyledTree`.
    /// Default: 1,000,000 (sufficient for very large technical documents).
    pub max_node_count: u32,

    /// Upper bound on tree depth, counting the root as depth 0.
    /// Default: 256 (prevents stack overflow in recursive traversals).
    pub max_tree_depth: u32,

    /// Upper bound on the combined byte size of text across all `Text` nodes.
    /// Default: 104,857,600 (100 MiB).
    pub max_text_bytes: u64,

    /// Upper bound on the number of SVG paths in a single SVG node.
    /// Default: 50,000 (prevents SVG complexity bombs).
    pub max_svg_path_count: u32,

    /// Upper bound on the number of pages the fragmenter may produce.
    /// Default: 10,000.
    pub max_pages: u32,

    /// Per-page time budget, in milliseconds, for layout and fragmentation.
    /// When it runs out, fragmentation aborts with
    /// `FragmentationError::PageTimeBudgetExceeded`.
    /// Default: 500 ms.
    pub page_time_budget_ms: u64,
}

impl Default for ResourceLimits {
    /// The generous-but-bounded limits described on each field.
    fn default() -> Self {
        const MIB: u64 = 1024 * 1024;
        ResourceLimits {
            max_node_count: 1_000_000,
            max_tree_depth: 256,
            max_text_bytes: 100 * MIB,
            max_svg_path_count: 50_000,
            max_pages: 10_000,
            page_time_budget_ms: 500,
        }
    }
}

impl ResourceLimits {
    /// Build limits that impose no restrictions: every field is set to the
    /// maximum value of its integer type. Handy for trusted input in
    /// controlled environments. **Not recommended for untrusted input.**
    #[must_use]
    pub fn unlimited() -> Self {
        let no_count_cap = u32::MAX;
        let no_size_cap = u64::MAX;
        Self {
            max_node_count: no_count_cap,
            max_tree_depth: no_count_cap,
            max_text_bytes: no_size_cap,
            max_svg_path_count: no_count_cap,
            max_pages: no_count_cap,
            page_time_budget_ms: no_size_cap,
        }
    }
}

/// Policy for handling missing glyphs during text shaping (§7).
///
/// When the shaping pipeline encounters a codepoint that has no glyph in
/// the resolved font (and all fallback fonts), this policy determines behavior.
///
/// The default policy is [`MissingGlyphPolicy::Fallback`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub enum MissingGlyphPolicy {
    /// Replace the missing glyph with `.notdef` (typically a rectangular box)
    /// and continue shaping. A diagnostic warning is emitted but rendering
    /// proceeds. This is the default for document preview workflows.
    #[default]
    Fallback,

    /// Abort shaping with a `ShapingError::MissingGlyph` error. No PDF is
    /// produced. This is appropriate for production builds where missing
    /// glyphs indicate a font configuration problem.
    Fail,
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Pins every default limit so an accidental change is caught.
    #[test]
    fn default_limits_are_sane() {
        let ResourceLimits {
            max_node_count,
            max_tree_depth,
            max_text_bytes,
            max_svg_path_count,
            max_pages,
            page_time_budget_ms,
        } = ResourceLimits::default();

        assert_eq!(max_node_count, 1_000_000);
        assert_eq!(max_tree_depth, 256);
        assert_eq!(max_text_bytes, 100 * 1024 * 1024);
        assert_eq!(max_svg_path_count, 50_000);
        assert_eq!(max_pages, 10_000);
        assert_eq!(page_time_budget_ms, 500);
    }

    /// Spot-checks that `unlimited()` saturates both integer widths in use.
    #[test]
    fn unlimited_is_max() {
        let unbounded = ResourceLimits::unlimited();
        assert_eq!(unbounded.max_node_count, u32::MAX);
        assert_eq!(unbounded.max_text_bytes, u64::MAX);
    }

    /// Checks the headline defaults of the top-level configuration.
    #[test]
    fn default_config() {
        let cfg = RenderConfig::default();
        assert_eq!(cfg.missing_glyph_policy, MissingGlyphPolicy::Fallback);
        assert!(!cfg.deterministic_mode);
        let threshold_error = (cfg.dpi_downsampling_threshold - 300.0).abs();
        assert!(threshold_error < f64::EPSILON);
    }

    /// The derived default policy must be the lenient one.
    #[test]
    fn missing_glyph_policy_default_is_fallback() {
        let policy = MissingGlyphPolicy::default();
        assert_eq!(policy, MissingGlyphPolicy::Fallback);
    }
}