printpdf 0.9.1

Rust library for reading and writing PDF files
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
use serde_derive::{Deserialize, Serialize};

pub use crate::text::TextItem;
use crate::{
    color::Color,
    font::ParsedFont,
    graphics::{
        Line, LineCapStyle, LineDashPattern, LineJoinStyle, Point, Polygon, Rect, TextRenderingMode,
    },
    matrix::{CurTransMat, TextMatrix},
    units::{Mm, Pt},
    BuiltinFont, DictItem, ExtendedGraphicsStateId, FontId, LayerInternalId, LinkAnnotation,
    PdfResources, PdfToSvgOptions, PdfWarnMsg, RenderingIntent, XObjectId, XObjectTransform,
};

/// Font resource reference - either a builtin PDF font or an external font
///
/// Serialized by serde as an adjacently tagged enum: the kebab-case variant
/// name is stored under `"type"` and the wrapped value under `"data"`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case", tag = "type", content = "data")]
pub enum PdfFontHandle {
    /// Builtin PDF font (F1-F14)
    Builtin(BuiltinFont),
    /// External font loaded from file, identified by its `FontId`
    External(FontId),
}

impl PdfFontHandle {
    /// Get the PDF resource name for this font
    pub fn get_resource_name(&self) -> String {
        match self {
            PdfFontHandle::Builtin(bf) => bf.get_pdf_id().to_string(),
            PdfFontHandle::External(fid) => fid.0.clone(),
        }
    }
}

/// A single page: its boundary boxes plus the list of operations on it.
///
/// Serialized by serde with camelCase field names (`mediaBox`, `trimBox`,
/// `cropBox`, `ops`).
#[derive(Debug, PartialEq, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct PdfPage {
    /// Media box of the page (`PdfPage::new` sets it to the full page size)
    pub media_box: Rect,
    /// Trim box of the page (`PdfPage::new` sets it to the full page size)
    pub trim_box: Rect,
    /// Crop box of the page (`PdfPage::new` sets it to the full page size)
    pub crop_box: Rect,
    /// Operations belonging to this page; `extract_text`, `get_layers` and
    /// `get_xobject_ids` iterate over this list
    pub ops: Vec<Op>,
}

impl PdfPage {
    /// Creates a page of `width` x `height` that contains the given operations.
    ///
    /// The media, trim and crop boxes all start out as the same rectangle,
    /// built with `Rect::from_wh(width, height)`.
    pub fn new(width: Mm, height: Mm, ops: Vec<Op>) -> Self {
        Self {
            media_box: Rect::from_wh(width.into(), height.into()),
            trim_box: Rect::from_wh(width.into(), height.into()),
            crop_box: Rect::from_wh(width.into(), height.into()),
            ops,
        }
    }

    /// Converts the media box into a `lopdf::Object` (via `Rect::to_array`).
    pub(crate) fn get_media_box(&self) -> lopdf::Object {
        self.media_box.to_array().into()
    }

    /// Converts the trim box into a `lopdf::Object` (via `Rect::to_array`).
    pub(crate) fn get_trim_box(&self) -> lopdf::Object {
        self.trim_box.to_array().into()
    }

    /// Converts the crop box into a `lopdf::Object` (via `Rect::to_array`).
    pub(crate) fn get_crop_box(&self) -> lopdf::Object {
        self.crop_box.to_array().into()
    }

    /// Render the page to an SVG string.
    ///
    /// Delegates to `crate::render::render_to_svg`; `warnings` is handed to
    /// the renderer as a mutable sink.
    pub fn to_svg(
        &self,
        resources: &PdfResources,
        opts: &PdfToSvgOptions,
        warnings: &mut Vec<PdfWarnMsg>,
    ) -> String {
        crate::render::render_to_svg(self, resources, opts, warnings)
    }

    /// Async counterpart of [`PdfPage::to_svg`].
    ///
    /// Awaits `crate::render::render_to_svg_async` with the same arguments.
    pub async fn to_svg_async(
        &self,
        resources: &PdfResources,
        opts: &PdfToSvgOptions,
        warnings: &mut Vec<PdfWarnMsg>,
    ) -> String {
        crate::render::render_to_svg_async(self, resources, opts, warnings).await
    }

    /// Returns the id of every `Op::UseXobject` on this page, in op order.
    ///
    /// An XObject that is used several times is listed several times.
    pub fn get_xobject_ids(&self) -> Vec<XObjectId> {
        self.ops
            .iter()
            .filter_map(|s| match s {
                Op::UseXobject { id, .. } => Some(id.clone()),
                _ => None,
            })
            .collect()
    }

    /// Returns the ids of all external fonts selected on this page.
    ///
    /// The ids are taken from `Op::SetFont` operations that carry a
    /// `PdfFontHandle::External`. Each id is reported once, in order of
    /// first use. Builtin fonts are not included.
    pub fn get_external_font_ids(&self) -> Vec<FontId> {
        let mut font_ids: Vec<FontId> = Vec::new();
        for op in &self.ops {
            if let Op::SetFont {
                font: PdfFontHandle::External(font_id),
                ..
            } = op
            {
                // A page uses only a handful of fonts, so a linear
                // duplicate check is sufficient here.
                if !font_ids.contains(font_id) {
                    font_ids.push(font_id.clone());
                }
            }
        }
        font_ids
    }

    /// Returns the layer id of every `Op::BeginLayer` on this page, in op order.
    pub fn get_layers(&self) -> Vec<LayerInternalId> {
        self.ops
            .iter()
            .filter_map(|s| match s {
                Op::BeginLayer { layer_id } => Some(layer_id.clone()),
                _ => None,
            })
            .collect()
    }

    /// Extracts text from a PDF page.
    ///
    /// This function processes text-related operations (ShowText) to extract text content.
    /// It handles both `TextItem::Text` and `TextItem::GlyphIds`: for glyph IDs the
    /// glyph's own `cid` string is preferred, then a reverse lookup through the
    /// currently selected external font, and finally U+FFFD as a replacement.
    /// If no external font is selected, a `[glyphs]` placeholder is emitted instead.
    ///
    /// Line breaks (`"\r\n"`) are inserted for `SetTextMatrix`, for `SetTextCursor`
    /// moves of more than 3pt in `y`, and for the line-advancing text operators.
    /// `SetTextMatrix` / `SetTextCursor` are honoured outside of text sections too;
    /// this is harmless because every chunk is trimmed and empty chunks are dropped.
    ///
    /// Note: This implementation doesn't fully handle complex text positioning or
    /// layout features such as columns or tables.
    ///
    /// # Arguments
    /// * `resources` - The PDF resources containing font information
    ///
    /// # Returns
    /// A vector of non-empty, trimmed text chunks, one per text section
    pub fn extract_text(&self, resources: &PdfResources) -> Vec<String> {
        let mut text_chunks = Vec::new();
        let mut current_chunk = String::new();
        let mut in_text_section = false;
        let mut cur_text_cursor = Point {
            x: Pt(0.0),
            y: Pt(0.0),
        };
        let mut current_font: Option<&ParsedFont> = None;
        // Memoized reverse lookups (glyph id -> char) for `current_font`.
        // Each miss costs a scan over the whole BMP, so repeated glyphs must
        // not pay for it again. Cleared whenever the font changes.
        let mut glyph_cache: std::collections::HashMap<u16, Option<char>> =
            std::collections::HashMap::new();

        for op in &self.ops {
            match op {
                Op::StartTextSection => {
                    in_text_section = true;
                }
                Op::EndTextSection => {
                    in_text_section = false;
                    if !current_chunk.is_empty() {
                        text_chunks.push(current_chunk.trim().to_string());
                        current_chunk.clear();
                    }
                }
                Op::SetFont { font, size: _ } => {
                    // Track the current font for glyph ID decoding. A builtin
                    // font (or an unknown font id) has no parsed font, so the
                    // previously selected external font must not linger.
                    current_font = match font {
                        PdfFontHandle::External(font_id) => {
                            resources.fonts.map.get(font_id).map(|pf| &pf.parsed_font)
                        }
                        PdfFontHandle::Builtin(_) => None,
                    };
                    glyph_cache.clear();
                }
                Op::SetTextMatrix { .. } => {
                    current_chunk.push_str("\r\n");
                }
                Op::SetTextCursor { pos } => {
                    // Compare the signed coordinates: a jump from y = -5 to
                    // y = +5 is a 10pt move, not a 0pt one.
                    if (cur_text_cursor.y.0 - pos.y.0).abs() > 3.0 {
                        current_chunk.push_str("\r\n");
                    }
                    cur_text_cursor = *pos;
                }
                Op::ShowText { items } if in_text_section => {
                    for item in items {
                        match item {
                            TextItem::Offset(o) => {
                                // Large negative offsets are treated as word gaps
                                if *o < -100.0 {
                                    current_chunk.push(' ');
                                }
                            }
                            TextItem::Text(t) => current_chunk.push_str(t),
                            TextItem::GlyphIds(glyphs) => {
                                if let Some(font) = current_font {
                                    for glyph in glyphs {
                                        let decoded = glyph
                                            .cid
                                            .as_ref()
                                            .and_then(|s| s.chars().next())
                                            .or_else(|| {
                                                *glyph_cache.entry(glyph.gid).or_insert_with(|| {
                                                    Self::glyph_id_to_char(font, glyph.gid)
                                                })
                                            })
                                            .unwrap_or('\u{FFFD}');
                                        current_chunk.push(decoded);
                                    }
                                } else {
                                    // No font set, use placeholder
                                    current_chunk.push_str("[glyphs]");
                                }
                            }
                        }
                    }
                }
                Op::AddLineBreak if in_text_section => {
                    current_chunk.push_str("\r\n");
                }
                Op::MoveTextCursorAndSetLeading { .. } if in_text_section => {
                    current_chunk.push_str("\r\n");
                }
                Op::MoveToNextLineShowText { text } if in_text_section => {
                    current_chunk.push_str("\r\n");
                    current_chunk.push_str(text);
                    current_chunk.push(' ');
                }
                Op::SetSpacingMoveAndShowText { text, .. } if in_text_section => {
                    current_chunk.push_str("\r\n");
                    current_chunk.push_str(text);
                    current_chunk.push(' ');
                }
                _ => {}
            }
        }

        // A text section that was never closed still contributes its text
        if !current_chunk.is_empty() {
            text_chunks.push(current_chunk.trim().to_string());
        }

        // Trimming may have produced empty chunks (whitespace-only sections)
        text_chunks.retain(|chunk| !chunk.is_empty());
        text_chunks
    }

    /// Helper function to convert glyph ID to character using font's cmap
    ///
    /// This is a reverse lookup: every Unicode scalar value in the Basic
    /// Multilingual Plane (U+0000..=U+FFFF, surrogates excluded) is looked up
    /// in ascending order and the first one that maps to `gid` is returned.
    /// Returns `None` if no such character exists. A call that finds nothing
    /// performs one font lookup per BMP scalar value, so callers should cache
    /// the result.
    fn glyph_id_to_char(font: &ParsedFont, gid: u16) -> Option<char> {
        (0..0x10000u32)
            // Skips the surrogate range, which has no `char` representation
            .filter_map(char::from_u32)
            .find(|&candidate| {
                font.lookup_glyph_index(candidate as u32)
                    .map_or(false, |mapped_gid| mapped_gid == gid)
            })
    }
}

/// Intent of a [`Layer`].
///
/// Serialized by serde in kebab-case (`"view"` / `"design"`); the names
/// returned by `LayerIntent::to_string` are `"View"` / `"Design"`.
#[derive(Debug, PartialEq, Clone, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case")]
pub enum LayerIntent {
    /// The "View" intent
    View,
    /// The "Design" intent (used by `Layer::new`)
    Design,
}

impl LayerIntent {
    pub fn to_string(&self) -> &'static str {
        match self {
            LayerIntent::View => "View",
            LayerIntent::Design => "Design",
        }
    }
}

/// Usage subtype of a [`Layer`].
///
/// Serialized by serde in kebab-case (`"artwork"`); the name returned by
/// `LayerSubtype::to_string` is `"Artwork"`.
#[derive(Debug, PartialEq, Clone, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case")]
pub enum LayerSubtype {
    /// The "Artwork" subtype (used by `Layer::new`)
    Artwork,
}

impl LayerSubtype {
    pub fn to_string(&self) -> &'static str {
        match self {
            LayerSubtype::Artwork => "Artwork",
        }
    }
}

/// Description of a layer.
///
/// Serialized by serde with camelCase field names.
#[derive(Debug, PartialEq, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct Layer {
    /// Name of the layer
    pub name: String,
    /// Creator string (`Layer::new` sets it to `"Adobe Illustrator 14.0"`)
    pub creator: String,
    /// Intent of the layer (`Layer::new` sets it to `LayerIntent::Design`)
    pub intent: LayerIntent,
    /// Usage subtype of the layer (`Layer::new` sets it to `LayerSubtype::Artwork`)
    pub usage: LayerSubtype,
}

impl Layer {
    pub fn new(name: &str) -> Self {
        Self {
            name: name.to_string(),
            creator: "Adobe Illustrator 14.0".to_string(),
            intent: LayerIntent::Design,
            usage: LayerSubtype::Artwork,
        }
    }
}

/// Operations that can occur in a PDF page
///
/// Serialized by serde as an adjacently tagged enum: the kebab-case variant
/// name is stored under `"type"` and the variant's fields under `"data"`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case", tag = "type", content = "data")]
pub enum Op {
    /// Debugging or section marker (arbitrary id can mark a certain point in a stream of
    /// operations)
    Marker { id: String },
    /// "CS" operator, sets the color space for stroking operations
    SetColorSpaceStroke { id: String },
    /// "cs" operator, sets the color space for fill operations
    SetColorSpaceFill { id: String },
    /// Starts a layer (`PdfPage::get_layers` collects the `layer_id` of every `BeginLayer`)
    BeginLayer { layer_id: LayerInternalId },
    /// Ends a layer (is inserted if missing at the page end)
    EndLayer,
    /// Saves the graphics configuration on the stack (line thickness, colors, overprint, etc.)
    SaveGraphicsState,
    /// Pops the last graphics configuration state off the stack
    RestoreGraphicsState,
    /// Loads a specific graphics state (necessary for describing extended graphics)
    LoadGraphicsState { gs: ExtendedGraphicsStateId },
    /// `BT` - Starts a section of text
    StartTextSection,
    /// `ET` - Ends a text section (inserted by default at the page end)
    EndTextSection,
    
    // --- New 1:1 PDF operations (preferred) ---
    /// `Tf` operator: Set font and size
    /// This maps 1:1 to PDF and should be used instead of SetFontSize/SetFontSizeBuiltinFont
    SetFont { font: PdfFontHandle, size: Pt },
    /// `Tj`/`TJ` operators: Show text at current position
    /// Font must be set first with SetFont. This maps 1:1 to PDF.
    ///
    /// `PdfPage::extract_text` only reads this operation between
    /// `StartTextSection` and `EndTextSection`.
    ShowText { items: Vec<TextItem> },
    /// `T*` Adds a line break to the text, depends on the line height
    AddLineBreak,
    /// `TL` - Sets the line height for the text
    SetLineHeight { lh: Pt },
    /// `Tw`: Sets the word spacing in point (default: 100.0)
    ///
    /// NOTE(review): the PDF specification lists 0 as the initial word spacing -
    /// confirm which default is meant here.
    SetWordSpacing { pt: Pt },
    /// Positions the text cursor in the page from the bottom left corner (can be manipulated
    /// further with `SetTextMatrix`)
    SetTextCursor { pos: Point },
    /// Sets the fill color for texts / polygons
    SetFillColor { col: Color },
    /// Sets the outline color for texts / polygons
    SetOutlineColor { col: Color },
    /// Sets the outline thickness for texts / lines / polygons
    SetOutlineThickness { pt: Pt },
    /// Sets the outline dash pattern
    SetLineDashPattern { dash: LineDashPattern },
    /// Line join style: miter, round or limit
    SetLineJoinStyle { join: LineJoinStyle },
    /// Line cap style: butt, round, or projecting-square
    SetLineCapStyle { cap: LineCapStyle },
    /// Set a miter limit in Pt
    SetMiterLimit { limit: Pt },
    /// Sets the text rendering mode (fill, stroke, fill-stroke, clip, fill-clip)
    SetTextRenderingMode { mode: TextRenderingMode },
    /// Sets the character spacing (default: 1.0)
    ///
    /// NOTE(review): the PDF specification lists 0 as the initial character
    /// spacing - confirm how `multiplier` is interpreted.
    SetCharacterSpacing { multiplier: f32 },
    /// `Ts`: Sets the line offset (default: 1.0)
    ///
    /// NOTE(review): the PDF specification lists 0 as the initial text rise -
    /// confirm how `multiplier` is interpreted.
    SetLineOffset { multiplier: f32 },
    /// Draw a line (colors, dashes configured earlier)
    DrawLine { line: Line },
    /// Draw a rectangle
    DrawRectangle { rectangle: Rect },
    /// Draw a polygon
    DrawPolygon { polygon: Polygon },
    /// Set the transformation matrix for this page. Make sure to save the old graphics state
    /// before invoking!
    SetTransformationMatrix { matrix: CurTransMat },
    /// Sets a matrix that only affects subsequent text objects.
    SetTextMatrix { matrix: TextMatrix },
    /// Adds a link annotation (use `PdfDocument::add_link` to register the `LinkAnnotation` on the
    /// document)
    LinkAnnotation { link: LinkAnnotation },
    /// Instantiates an XObject with a given transform (if the XObject has a width / height).
    /// Use `PdfDocument::add_xobject` to register the object and get the ID.
    /// (`PdfPage::get_xobject_ids` collects the `id` of every `UseXobject`.)
    UseXobject {
        id: XObjectId,
        transform: XObjectTransform,
    },
    /// `TD` operation (`PdfPage::extract_text` treats it as a line break)
    MoveTextCursorAndSetLeading { tx: f32, ty: f32 },
    /// `ri` operation
    SetRenderingIntent { intent: RenderingIntent },
    /// `Tz` operation
    SetHorizontalScaling { percent: f32 },
    /// Begins an inline image object.
    BeginInlineImage,
    /// Begins the inline image data.
    BeginInlineImageData,
    /// Ends the inline image object.
    EndInlineImage,
    /// Begins a marked content sequence.
    BeginMarkedContent { tag: String },
    /// Begins a marked content sequence with an accompanying property list.
    BeginMarkedContentWithProperties {
        tag: String,
        properties: DictItem,
    },
    /// Starts an optional content layer
    BeginOptionalContent { layer_id: LayerInternalId },
    /// Defines a marked content point with properties.
    DefineMarkedContentPoint {
        tag: String,
        properties: Vec<DictItem>,
    },
    /// Ends the current marked-content sequence.
    EndMarkedContent,
    /// Ends the current marked-content sequence (presumably the counterpart of
    /// `BeginMarkedContentWithProperties` - TODO confirm).
    EndMarkedContentWithProperties,
    /// Ends the current optional content sequence.
    EndOptionalContent,
    /// Begins a compatibility section (operators inside are ignored).
    BeginCompatibilitySection,
    /// Ends a compatibility section.
    EndCompatibilitySection,
    /// Moves to the next line and shows text (the `'` operator).
    MoveToNextLineShowText { text: String },
    /// Sets spacing, moves to the next line, and shows text (the `"` operator).
    SetSpacingMoveAndShowText {
        word_spacing: f32,
        char_spacing: f32,
        text: String,
    },
    /// Unknown, custom key / value operation
    Unknown { key: String, value: Vec<DictItem> },
}