oxipdf-ir 0.1.0

Intermediate representation types for the oxipdf PDF engine
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
//! Content type payloads for IR nodes.
//!
//! Each [`super::ContentVariant`] carries a payload struct defined here.
//! Leaf content types (e.g. Text, Image, Svg, Math, Form) have their own data;
//! Container and Link are structural and carry minimal or no data.
//! Payloads for tables, footnotes, and index entries are defined here as well.

use super::NodeId;
use crate::units::Pt;

/// Payload of a `Text` node: the string to be typeset.
#[derive(Debug, Clone, PartialEq)]
pub struct TextContent {
    /// The text string. Must be valid Unicode. May contain explicit
    /// line break characters (`\n`) which are honored by the line breaker.
    pub text: String,
}

impl TextContent {
    /// Build a text payload from anything convertible into a `String`
    /// (for example a `&str` or an owned `String`).
    #[must_use]
    pub fn new(text: impl Into<String>) -> Self {
        let text = text.into();
        Self { text }
    }
}

/// The format of a raster image.
///
/// Each variant documents the PDF embedding strategy used for that format.
/// The enum is `#[non_exhaustive]`, so `match` expressions outside this
/// crate must include a wildcard arm.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum ImageFormat {
    /// JPEG/JFIF image (lossy compression). Embedded directly in PDF
    /// as a DCTDecode stream — no transcoding required.
    Jpeg,
    /// PNG image. Non-alpha variants are embedded with FlateDecode and
    /// PNG predictor for zero-copy efficiency. Alpha PNGs are decoded
    /// and split into a color image + soft mask.
    Png,
    /// WebP image (lossy or lossless). Decoded to raw pixels, then
    /// re-compressed with FlateDecode for PDF embedding. Alpha channel
    /// separated into a soft mask when present.
    Webp,
}

/// Raster image content for an `Image` node.
///
/// Holds the encoded image bytes together with optional sizing and
/// accessibility metadata. Note that this type derives only `Debug` and
/// `Clone`; it is not comparable with `==`.
#[derive(Debug, Clone)]
pub struct ImageContent {
    /// Raw image bytes in the format specified by `format`.
    pub data: Vec<u8>,
    /// Image format (determines the PDF embedding strategy).
    pub format: ImageFormat,
    /// Intrinsic width in points, if known. When `None`, the image's
    /// pixel width is used at 72 DPI (1 pixel = 1 point).
    pub intrinsic_width: Option<Pt>,
    /// Intrinsic height in points, if known.
    // NOTE(review): the `None` fallback is only spelled out for the width;
    // presumably the height falls back the same way — confirm in the layout code.
    pub intrinsic_height: Option<Pt>,
    /// Alternative text for accessibility (PDF `/Alt` entry).
    /// Used by screen readers to describe the image content.
    pub alt_text: Option<String>,
}

impl ImageContent {
    /// Wrap encoded image bytes of the given format.
    ///
    /// Both intrinsic dimensions and the alternative text start out as `None`.
    #[must_use]
    pub fn new(data: Vec<u8>, format: ImageFormat) -> Self {
        Self {
            alt_text: None,
            intrinsic_height: None,
            intrinsic_width: None,
            format,
            data,
        }
    }

    /// Wrap encoded image bytes and record an explicit intrinsic size.
    ///
    /// Equivalent to [`ImageContent::new`] with both intrinsic dimensions set.
    #[must_use]
    pub fn with_dimensions(data: Vec<u8>, format: ImageFormat, width: Pt, height: Pt) -> Self {
        Self {
            intrinsic_width: Some(width),
            intrinsic_height: Some(height),
            ..Self::new(data, format)
        }
    }

    /// Builder-style setter: attach alternative text for accessibility,
    /// replacing any text set previously.
    #[must_use]
    pub fn with_alt_text(self, alt: impl Into<String>) -> Self {
        Self {
            alt_text: Some(alt.into()),
            ..self
        }
    }
}

/// SVG graphic content for an `Svg` node.
///
/// Derives only `Debug` and `Clone`; it is not comparable with `==`.
#[derive(Debug, Clone)]
pub struct SvgContent {
    /// Raw SVG document bytes (UTF-8 XML).
    pub data: Vec<u8>,
    /// Intrinsic width, if specified in the SVG root element.
    pub intrinsic_width: Option<Pt>,
    /// Intrinsic height, if specified in the SVG root element.
    pub intrinsic_height: Option<Pt>,
}

/// The format of a mathematical expression.
///
/// The enum is `#[non_exhaustive]`, so `match` expressions outside this
/// crate must include a wildcard arm.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum MathFormat {
    /// MathML markup.
    MathMl,
    /// LaTeX math markup.
    Latex,
}

/// Mathematical expression content for a `Math` node.
#[derive(Debug, Clone, PartialEq)]
pub struct MathContent {
    /// Source text of the expression.
    // NOTE(review): presumably parsed according to `format` — confirm in the math renderer.
    pub markup: String,
    /// Markup language the expression is written in.
    pub format: MathFormat,
}

/// The target of a hyperlink.
///
/// Both variants wrap a `String`; the variant tells the two kinds of
/// destination apart, so targets with different variants never compare equal.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum LinkTarget {
    /// An external URL.
    External(String),
    /// An internal cross-reference by element ID (§5.3).
    Internal(String),
}

/// Link content for a `Link` node.
#[derive(Debug, Clone, PartialEq)]
pub struct LinkContent {
    /// Destination the link points at (external URL or internal element ID).
    pub target: LinkTarget,
}

/// Form field types supported in the IR.
///
/// The enum is `#[non_exhaustive]`, so `match` expressions outside this
/// crate must include a wildcard arm.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub enum FormField {
    /// A text entry field.
    TextInput {
        /// Field name.
        name: String,
        /// Initial value, if any.
        default_value: Option<String>,
    },
    /// A checkbox.
    Checkbox {
        /// Field name.
        name: String,
        /// Whether the box is ticked.
        checked: bool,
    },
    /// A radio button.
    // NOTE(review): presumably buttons sharing a `name` form one group — confirm in the form writer.
    Radio {
        /// Field name.
        name: String,
        /// Value associated with this button.
        value: String,
        /// Whether this button is the selected one.
        selected: bool,
    },
    /// A drop-down list.
    Dropdown {
        /// Field name.
        name: String,
        /// The selectable entries.
        options: Vec<String>,
        /// The selected entry, if any.
        // NOTE(review): presumably an index into `options` — confirm; no bounds check is visible here.
        selected: Option<usize>,
    },
}

/// Form content for a `Form` node.
#[derive(Debug, Clone, PartialEq)]
pub struct FormContent {
    /// The form field this node represents.
    pub field: FormField,
}

/// Marker for one entry of a back-of-book index.
///
/// The node is invisible and sits inline in the text flow. It renders
/// nothing itself; after fragmentation the engine gathers every index entry
/// and emits an index section that lists each term with its page numbers.
#[derive(Debug, Clone, PartialEq)]
pub struct IndexEntryContent {
    /// The term as it appears in the index (e.g., "Algorithm").
    pub term: String,
    /// Optional sort key for custom ordering (e.g., for CJK terms or
    /// terms starting with special characters). When `None`, sorts by `term`.
    pub sort_key: Option<String>,
    /// Optional "see also" cross-reference to another term.
    /// Rendered as "term, see also other_term" in the index.
    pub see_also: Option<String>,
}

impl IndexEntryContent {
    /// Start an entry for `term` with no sort key and no cross-reference.
    #[must_use]
    pub fn new(term: impl Into<String>) -> Self {
        let term = term.into();
        Self {
            term,
            sort_key: None,
            see_also: None,
        }
    }

    /// Builder-style setter: override the key used to order this entry.
    #[must_use]
    pub fn with_sort_key(self, key: impl Into<String>) -> Self {
        Self {
            sort_key: Some(key.into()),
            ..self
        }
    }

    /// Builder-style setter: attach a "see also" cross-reference to another term.
    #[must_use]
    pub fn with_see_also(self, term: impl Into<String>) -> Self {
        Self {
            see_also: Some(term.into()),
            ..self
        }
    }
}

/// Payload of a `Footnote` node.
///
/// The node sits inline in the text flow and shows up as a superscript
/// marker (e.g., "¹"). The node's children in the `StyledTree` make up the
/// footnote body, which the engine places at the bottom of the page on which
/// the marker appears.
///
/// Building the body as child nodes is the consumer's job. Marker rendering,
/// page-bottom placement, separator rules, and the page height budget are
/// handled by the engine.
#[derive(Debug, Clone, PartialEq)]
pub struct FootnoteContent {
    /// Marker text. When `None`, the engine auto-numbers sequentially.
    pub marker: Option<String>,
}

impl FootnoteContent {
    /// Footnote whose marker is left to the engine's automatic numbering.
    #[must_use]
    pub fn auto() -> Self {
        let marker = None;
        Self { marker }
    }

    /// Footnote that uses the given string as its marker.
    #[must_use]
    pub fn with_marker(marker: impl Into<String>) -> Self {
        let marker = Some(marker.into());
        Self { marker }
    }
}

/// Configuration for footnote rendering.
///
/// Covers the separator rule drawn above the footnote area, the gaps around
/// it, the auto-numbering style, and where footnote bodies are placed.
/// A `Default` implementation is provided.
#[derive(Debug, Clone, PartialEq)]
pub struct FootnoteConfig {
    /// Width of the separator rule between body content and footnotes (0 = no rule).
    // NOTE(review): "width" here is presumably the stroke thickness, since the
    // horizontal extent is governed by `separator_fraction` — confirm.
    pub separator_width: Pt,
    /// Fraction of page width the separator rule spans (0.0–1.0).
    pub separator_fraction: f64,
    /// Vertical gap between body content and the separator rule.
    pub separator_gap: Pt,
    /// Vertical gap between separator rule and first footnote.
    pub footnote_gap: Pt,
    /// Numbering style for auto-numbered footnotes.
    pub numbering_style: FootnoteNumberingStyle,
    /// Where footnote bodies are placed.
    pub position: FootnotePosition,
}

impl Default for FootnoteConfig {
    /// Default footnote settings: a 0.5pt separator rule spanning 0.33 of the
    /// page width, a 6pt gap above the rule and a 4pt gap below it, numeric
    /// markers, and bodies placed at the bottom of the page.
    fn default() -> Self {
        let numbering_style = FootnoteNumberingStyle::Numeric;
        let position = FootnotePosition::PageBottom;
        Self {
            numbering_style,
            position,
            footnote_gap: Pt::new(4.0),
            separator_gap: Pt::new(6.0),
            separator_fraction: 0.33,
            separator_width: Pt::new(0.5),
        }
    }
}

/// Numbering style for auto-numbered footnotes.
///
/// The default style is [`FootnoteNumberingStyle::Numeric`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub enum FootnoteNumberingStyle {
    /// 1, 2, 3, ...
    #[default]
    Numeric,
    /// a, b, c, ..., z, aa, ab, ...
    AlphaLower,
    /// A, B, C, ..., Z, AA, AB, ...
    AlphaUpper,
    /// *, †, ‡, §, ‖, ¶ (cycles with doubling: **, ††, ...)
    Symbol,
}

impl FootnoteNumberingStyle {
    /// Render the 1-based footnote counter `n` as marker text in this style.
    ///
    /// `Numeric` yields the decimal digits of `n`; the alphabetic and symbol
    /// styles delegate to `format_alpha` and `format_symbol`.
    #[must_use]
    pub fn format(&self, n: u32) -> String {
        match *self {
            Self::Symbol => format_symbol(n),
            Self::AlphaUpper => format_alpha(n, b'A'),
            Self::AlphaLower => format_alpha(n, b'a'),
            Self::Numeric => n.to_string(),
        }
    }
}

/// Render a 1-based counter as a bijective base-26 letter sequence:
/// 1→a, 26→z, 27→aa, 28→ab, ...
///
/// `base` is the byte of the first letter (`b'a'` or `b'A'`). A counter of
/// `0` has no representation and produces an empty string.
fn format_alpha(n: u32, base: u8) -> String {
    let mut remaining = match n.checked_sub(1) {
        Some(zero_based) => zero_based,
        None => return String::new(),
    };

    // Letters come out least-significant first; they are reversed at the end.
    let mut letters = Vec::new();
    loop {
        let offset = (remaining % 26) as u8;
        letters.push((base + offset) as char);
        if remaining < 26 {
            break;
        }
        // There is no "zero" letter, so step down by one when carrying
        // into the next position.
        remaining = remaining / 26 - 1;
    }
    letters.into_iter().rev().collect()
}

/// Format a 1-based counter as a traditional footnote symbol.
///
/// Cycle: * † ‡ § ‖ ¶, then the same symbols doubled (**, ††, ‡‡, ...),
/// tripled, and so on. A counter of `0` has no representation and produces
/// an empty string.
fn format_symbol(n: u32) -> String {
    // Written as Unicode escapes so the table survives tooling that drops
    // non-ASCII characters.
    const SYMBOLS: [char; 6] = [
        '*',
        '\u{2020}', // † dagger
        '\u{2021}', // ‡ double dagger
        '\u{00A7}', // § section sign
        '\u{2016}', // ‖ double vertical line
        '\u{00B6}', // ¶ pilcrow
    ];
    if n == 0 {
        return String::new();
    }
    let zero_based = n - 1;
    // Position within the six-symbol cycle.
    let idx = (zero_based % 6) as usize;
    // Each completed cycle adds one more repetition of the symbol.
    let repeat = (zero_based / 6) as usize + 1;
    SYMBOLS[idx].to_string().repeat(repeat)
}

/// Where footnote bodies are placed in the output document.
///
/// The default is [`FootnotePosition::PageBottom`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub enum FootnotePosition {
    /// At the bottom of the page where the reference appears (traditional footnotes).
    #[default]
    PageBottom,
    /// Collected and rendered as a section at the end of the document (endnotes).
    DocumentEnd,
    /// Collected and rendered at the end of each section (chapter endnotes).
    SectionEnd,
}

/// Border collapse model for tables.
///
/// The default is [`BorderCollapse::Separate`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub enum BorderCollapse {
    /// Cells have their own borders with spacing between them.
    #[default]
    Separate,
    /// Adjacent cell borders merge into a single shared border.
    Collapse,
}

/// Table layout algorithm selection.
///
/// The default is [`TableLayoutMode::Auto`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub enum TableLayoutMode {
    /// Column widths depend on content (CSS table-layout: auto).
    #[default]
    Auto,
    /// Column widths set by column definitions only (CSS table-layout: fixed).
    Fixed,
}

/// Column width constraint for a table column.
///
/// Derives `PartialEq` but not `Eq`/`Hash`, because `Percent` carries an `f64`.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum TableColumnWidth {
    /// Auto: sized to content.
    Auto,
    /// Fixed width in points.
    Fixed(Pt),
    /// Percentage of table width.
    // NOTE(review): the scale (0–100 vs. 0.0–1.0) is not established in this
    // file — confirm against the table layout code.
    Percent(f64),
}

/// A column definition in a table.
///
/// Currently carries only the width constraint for the column.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct TableColumn {
    /// Column width constraint.
    pub width: TableColumnWidth,
}

/// Classification of a row group.
///
/// Used as the `kind` of a [`TableRowGroup`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TableRowGroupKind {
    /// Header rows — repeated on continuation pages when the table spans pages.
    Header,
    /// Body rows — the main content of the table.
    Body,
    /// Footer rows — rendered at the bottom of the table (not repeated).
    Footer,
}

/// A group of rows (header, body, or footer).
#[derive(Debug, Clone, PartialEq)]
pub struct TableRowGroup {
    /// Whether these rows form a header, body, or footer section.
    pub kind: TableRowGroupKind,
    /// The rows belonging to this group.
    pub rows: Vec<TableRow>,
}

/// A single table row.
#[derive(Debug, Clone, PartialEq)]
pub struct TableRow {
    /// Cells in this row. Each cell references a child `NodeId` of the table node.
    // NOTE(review): presumably listed in column order — confirm against the grid builder.
    pub cells: Vec<TableCell>,
}

/// A cell within a table row.
///
/// Stores a reference to the content node rather than the content itself,
/// plus the cell's span in both grid directions.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct TableCell {
    /// The `NodeId` of the cell content node (must be a child of the table node).
    pub content_node: NodeId,
    /// Number of columns this cell spans (1 = no span).
    pub colspan: u32,
    /// Number of rows this cell spans (1 = no span).
    pub rowspan: u32,
}

/// Table content describing the grid structure.
///
/// The table's child nodes in the `StyledTree` are the cell content nodes.
/// This struct describes the grid topology: how those child nodes map to
/// rows, columns, and cell spans. Cell content is laid out independently
/// within computed cell dimensions.
///
/// Helper methods on this type iterate over rows (`all_rows`, `header_rows`)
/// and count them (`total_rows`).
#[derive(Debug, Clone, PartialEq)]
pub struct TableContent {
    /// Column definitions (width constraints).
    pub columns: Vec<TableColumn>,
    /// Row groups: header, body, footer.
    pub row_groups: Vec<TableRowGroup>,
    /// Border collapse model.
    pub border_collapse: BorderCollapse,
    /// Horizontal spacing between cells (Separate model only).
    pub cell_spacing_h: Pt,
    /// Vertical spacing between cells (Separate model only).
    pub cell_spacing_v: Pt,
    /// Table layout algorithm.
    pub table_layout: TableLayoutMode,
}

impl TableContent {
    /// Walk every row of the table, group by group, in the order the
    /// groups are stored in `row_groups`.
    pub fn all_rows(&self) -> impl Iterator<Item = &TableRow> {
        self.row_groups.iter().flat_map(|group| &group.rows)
    }

    /// Count the rows of the table, summed over all row groups.
    pub fn total_rows(&self) -> usize {
        self.row_groups
            .iter()
            .fold(0, |count, group| count + group.rows.len())
    }

    /// Walk only the rows that belong to groups of kind `Header`.
    pub fn header_rows(&self) -> impl Iterator<Item = &TableRow> {
        self.row_groups
            .iter()
            .filter(|group| matches!(group.kind, TableRowGroupKind::Header))
            .flat_map(|group| &group.rows)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// `TextContent::new` stores the supplied string unchanged.
    #[test]
    fn text_content_creation() {
        let expected = "Hello, world!";
        let content = TextContent::new(expected);
        assert_eq!(content.text, expected);
    }

    /// An external and an internal target never compare equal.
    #[test]
    fn link_target_variants() {
        let external = LinkTarget::External(String::from("https://example.com"));
        let internal = LinkTarget::Internal(String::from("section-1"));
        assert_ne!(external, internal);
    }
}