rpdfium 7676.6.0

A faithful Rust port of Google's PDFium PDF rendering engine
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
//! Arc-based wrappers for WASM and async use cases.
//!
//! These types wrap the core engine in `Arc` so they can be sent across
//! threads and held in `'static` contexts (e.g. `wasm_bindgen` futures,
//! async runtimes). All types are `Send + Sync + 'static`.

use std::sync::{Arc, OnceLock};

use rpdfium_core::error::{ObjectId, PdfError};
use rpdfium_core::{Name, OpenOptions, Rect};
use rpdfium_doc::{Annotation, Bookmark, DocumentMetadata, PageStructure, StructTree};
use rpdfium_font::DashMapFontCache;
use rpdfium_page::display::{DisplayTree, walk};
use rpdfium_page::{InterpreterContext, collect_page_ids, interpret, resolve_resources};
use rpdfium_parser::{ObjectStore, tokenize_content_stream};
use rpdfium_text::{TextExtractor, TextPage};

use crate::{
    Error, FontCacheBridge, RenderConfig, Result, decode_page_contents, parse_annotations,
    parse_bookmarks, parse_metadata, parse_rect, parse_rect_from_obj,
};

// ---------------------------------------------------------------------------
// ArcLibrary
// ---------------------------------------------------------------------------

/// Arc-wrapped library instance for WASM/async contexts.
///
/// The derived `Clone` only clones the inner `Arc`, so every clone shares the
/// same `LibraryInner` allocation.
#[derive(Clone)]
pub struct ArcLibrary {
    // Not read by any code visible in this module, hence the `dead_code`
    // allowance; the field only keeps the shared inner state alive.
    #[allow(dead_code)]
    inner: Arc<LibraryInner>,
}

/// Shared state behind [`ArcLibrary`]; it currently carries no data.
struct LibraryInner {
    // Unit placeholder: the struct has no real fields yet.
    _private: (),
}

impl ArcLibrary {
    /// Build a fresh library handle backed by its own shared inner state.
    pub fn new() -> Self {
        let inner = Arc::new(LibraryInner { _private: () });
        Self { inner }
    }
}

impl Default for ArcLibrary {
    fn default() -> Self {
        Self::new()
    }
}

// ---------------------------------------------------------------------------
// ArcDocument
// ---------------------------------------------------------------------------

/// Arc-wrapped document for WASM/async contexts.
///
/// Unlike the borrowing [`Document`](crate::Document), `ArcDocument` keeps
/// its state behind an `Arc` and is `Send + Sync + 'static`. The derived
/// `Clone` only clones that `Arc`, so clones share one `DocumentInner`.
#[derive(Clone)]
pub struct ArcDocument {
    // Document state shared by every clone of this handle.
    inner: Arc<DocumentInner>,
}

/// State shared by every clone of an [`ArcDocument`]; built once in
/// `ArcDocument::open`.
struct DocumentInner {
    /// Object store opened over the shared PDF bytes.
    store: ObjectStore<Arc<[u8]>>,
    /// Font cache handed to the interpreter through `FontCacheBridge`.
    font_cache: DashMapFontCache,
    /// Page dictionary ids from `collect_page_ids`, indexed by zero-based page index.
    page_ids: Vec<ObjectId>,
    /// Object id taken from the trailer's `root` entry.
    catalog_id: ObjectId,
    /// Clone of the options the document was opened with.
    options: OpenOptions,
    /// Result of `OCContext::from_catalog`; passed to the interpreter when present.
    oc_context: Option<rpdfium_page::OCContext>,
}

impl ArcDocument {
    /// Open a PDF document from in-memory data.
    ///
    /// Parses the file structure, resolves the page tree, and prepares
    /// the document for page access.
    pub fn open(_library: &ArcLibrary, data: Vec<u8>, options: &OpenOptions) -> Result<Self> {
        let arc_data: Arc<[u8]> = Arc::from(data);
        let store = ObjectStore::open_with_password(
            arc_data,
            options.parsing_mode,
            options.password.as_deref(),
        )?;
        let page_ids = collect_page_ids(&store)?;
        let catalog_id = store.trailer().root;
        let font_cache = DashMapFontCache::new();
        let oc_context = rpdfium_page::OCContext::from_catalog(&store, catalog_id);

        Ok(ArcDocument {
            inner: Arc::new(DocumentInner {
                store,
                font_cache,
                page_ids,
                catalog_id,
                options: options.clone(),
                oc_context,
            }),
        })
    }

    /// Open a PDF document stored at `path`.
    ///
    /// The whole file is read into memory and then handed to
    /// [`ArcDocument::open()`].
    ///
    /// # Errors
    ///
    /// A failed read is reported through `PdfError::Io`; any other failure
    /// comes from [`ArcDocument::open()`].
    ///
    /// # Examples
    ///
    /// ```no_run
    /// # use rpdfium::{ArcLibrary, OpenOptions};
    /// let lib = ArcLibrary::new();
    /// let opts = OpenOptions::default();
    /// let doc = rpdfium::ArcDocument::open_file(&lib, "document.pdf", &opts)?;
    /// # Ok::<(), rpdfium::Error>(())
    /// ```
    pub fn open_file(
        library: &ArcLibrary,
        path: impl AsRef<std::path::Path>,
        options: &OpenOptions,
    ) -> Result<Self> {
        let bytes = std::fs::read(path.as_ref()).map_err(PdfError::Io)?;
        Self::open(library, bytes, options)
    }

    /// Returns the number of pages in the document.
    pub fn page_count(&self) -> u32 {
        self.inner.page_ids.len() as u32
    }

    /// Look up a page by zero-based `index`.
    ///
    /// The page's media box is resolved here: first from the page dictionary
    /// itself, then from whatever `find_inherited_entry` locates, and finally
    /// from the fallback rectangle `(0, 0, 612, 792)`.
    ///
    /// # Errors
    ///
    /// Returns [`Error::PageOutOfRange`] when `index` is not below
    /// [`page_count`](Self::page_count), and an error when the page object
    /// cannot be resolved or is not a dictionary.
    pub fn page(&self, index: u32) -> Result<ArcPage> {
        let count = self.page_count();
        if index >= count {
            return Err(Error::PageOutOfRange { index, count });
        }

        let store = &self.inner.store;
        let page_dict_id = self.inner.page_ids[index as usize];
        let page_obj = store.resolve(page_dict_id)?;
        let page_dict = page_obj
            .as_dict()
            .ok_or(PdfError::UnknownObject(page_dict_id))?;

        // Lookup errors on the inherited path are deliberately swallowed so a
        // damaged page tree still yields a usable (fallback) media box.
        let media_box = match parse_rect(page_dict, &Name::media_box(), store) {
            Some(own) => own,
            None => rpdfium_page::find_inherited_entry(store, page_dict, &Name::media_box())
                .ok()
                .flatten()
                .and_then(|inherited| parse_rect_from_obj(&inherited))
                .unwrap_or(Rect::new(0.0, 0.0, 612.0, 792.0)),
        };

        Ok(ArcPage {
            doc: self.clone(),
            page_index: index,
            page_dict_id,
            media_box,
            display_tree: OnceLock::new(),
        })
    }

    /// Parse the document metadata referenced by the trailer's `info` entry.
    ///
    /// Returns `Ok(None)` when the trailer has no such entry.
    pub fn metadata(&self) -> Result<Option<DocumentMetadata>> {
        let store = &self.inner.store;
        let Some(info_id) = store.trailer().info else {
            return Ok(None);
        };
        let info_obj = store.resolve(info_id)?;
        Ok(Some(parse_metadata(info_obj, store)?))
    }

    /// Parse the bookmark (outline) tree reachable from the document catalog.
    pub fn bookmarks(&self) -> Result<Vec<Bookmark>> {
        let store = &self.inner.store;
        let catalog_obj = store.resolve(self.inner.catalog_id)?;
        Ok(parse_bookmarks(catalog_obj, store)?)
    }

    /// Returns a reference to the underlying object store.
    pub fn store(&self) -> &ObjectStore<Arc<[u8]>> {
        &self.inner.store
    }

    /// Render the requested pages in parallel using rayon.
    ///
    /// Every index in `page_indices` is looked up, interpreted and rendered
    /// on its own. A failure is recorded in that page's slot only, so the
    /// remaining pages still render. The output has one entry per requested
    /// index, in the same order.
    pub fn render_pages_parallel(
        &self,
        page_indices: &[u32],
        config: &RenderConfig,
    ) -> Vec<Result<rpdfium_graphics::Bitmap>> {
        use rayon::prelude::*;

        // One self-contained unit of work per page.
        let render_one = |&page_idx: &u32| -> Result<rpdfium_graphics::Bitmap> {
            let page = self.page(page_idx)?;
            let tree = page.interpret()?;
            let decoder = crate::image_decode::PdfImageDecoder::new(&page.doc.inner.store);
            Ok(rpdfium_render::render_with_images(tree, config, &decoder)?)
        };

        page_indices.par_iter().map(render_one).collect()
    }

    /// Render every page of the document in parallel.
    ///
    /// Delegates to [`render_pages_parallel`](Self::render_pages_parallel)
    /// with the indices `0..page_count()`.
    pub fn render_all_pages_parallel(
        &self,
        config: &RenderConfig,
    ) -> Vec<Result<rpdfium_graphics::Bitmap>> {
        let every_index: Vec<u32> = (0..self.page_count()).collect();
        self.render_pages_parallel(&every_index, config)
    }
}

// ---------------------------------------------------------------------------
// ArcPage
// ---------------------------------------------------------------------------

/// Arc-wrapped page for WASM/async contexts.
///
/// Unlike [`Page`](crate::Page) which borrows from [`Document`](crate::Document),
/// `ArcPage` holds a cloned `ArcDocument` and is `Send + Sync + 'static`.
///
/// The derived `Clone` clones every field, including the `OnceLock` cache, so
/// a clone made after interpretation carries its own copy of the display tree.
#[derive(Clone)]
pub struct ArcPage {
    /// Handle to the document this page was obtained from.
    doc: ArcDocument,
    /// Zero-based index of this page within the document.
    page_index: u32,
    /// Object id of the page dictionary.
    page_dict_id: ObjectId,
    /// Media box resolved when the page handle was created.
    media_box: Rect,
    /// Cache for the interpreted display tree; filled on first successful `interpret`.
    display_tree: OnceLock<DisplayTree>,
}

impl ArcPage {
    /// Returns the page's media box (the bounding box of the physical medium).
    pub fn media_box(&self) -> Rect {
        self.media_box
    }

    /// Crop box read from the page's own dictionary.
    ///
    /// Returns `Ok(None)` when `parse_rect` finds no usable crop box entry
    /// there.
    ///
    /// # Errors
    ///
    /// Fails when the page object cannot be resolved or is not a dictionary.
    pub fn crop_box(&self) -> Result<Option<Rect>> {
        let store = &self.doc.inner.store;
        let id = self.page_dict_id;
        let resolved = store.resolve(id)?;
        let Some(dict) = resolved.as_dict() else {
            return Err(PdfError::UnknownObject(id).into());
        };
        Ok(parse_rect(dict, &Name::crop_box(), store))
    }

    /// Returns the page rotation in degrees (0, 90, 180, or 270).
    pub fn rotation(&self) -> Result<u32> {
        let store = &self.doc.inner.store;
        let page_obj = store.resolve(self.page_dict_id)?;
        let page_dict = page_obj
            .as_dict()
            .ok_or(PdfError::UnknownObject(self.page_dict_id))?;
        let rotation = page_dict
            .get(&Name::rotate())
            .and_then(|obj| store.deep_resolve(obj).ok().and_then(|o| o.as_i64()))
            .unwrap_or(0);
        // Normalize to 0-359 range (handles negative values from malformed PDFs)
        Ok(rotation.rem_euclid(360) as u32)
    }

    /// Interpret the page content stream into a display tree.
    ///
    /// The first successful result is stored in the page's `OnceLock`, and
    /// later calls hand back that cached tree. A failed interpretation stores
    /// nothing, so a later call tries again.
    pub fn interpret(&self) -> Result<&DisplayTree> {
        if let Some(cached) = self.display_tree.get() {
            return Ok(cached);
        }

        let fresh = self.interpret_inner()?;

        // If another thread filled the cell in the meantime, its tree wins
        // and ours is simply dropped.
        Ok(self.display_tree.get_or_init(|| fresh))
    }

    /// Internal interpretation logic.
    fn interpret_inner(&self) -> Result<DisplayTree> {
        let store = &self.doc.inner.store;
        let page_obj = store.resolve(self.page_dict_id)?;
        let page_dict = page_obj
            .as_dict()
            .ok_or(PdfError::UnknownObject(self.page_dict_id))?;

        let content_bytes = decode_page_contents(page_dict, store)?;
        let operators = tokenize_content_stream(&content_bytes)?;
        let resources = resolve_resources(store, page_dict)?;

        let bridge = FontCacheBridge {
            font_cache: &self.doc.inner.font_cache,
            store,
            resources: &resources,
        };

        let ctx = InterpreterContext {
            store,
            font_cache: &bridge,
            mode: self.doc.inner.options.parsing_mode,
            oc_context: self.doc.inner.oc_context.as_ref(),
        };

        let tree = interpret(
            &operators,
            &ctx,
            &resources,
            self.doc.inner.options.max_operators_per_page,
        )?;
        Ok(tree)
    }

    /// Render the page to a bitmap using `config`.
    ///
    /// The (cached) display tree is rendered with an image decoder backed by
    /// the document's object store.
    pub fn render(&self, config: &RenderConfig) -> Result<rpdfium_graphics::Bitmap> {
        let display_tree = self.interpret()?;
        let store = &self.doc.inner.store;
        let image_decoder = crate::image_decode::PdfImageDecoder::new(store);
        Ok(rpdfium_render::render_with_images(
            display_tree,
            config,
            &image_decoder,
        )?)
    }

    /// Extract the page's text.
    ///
    /// Walks the (cached) display tree with a `TextExtractor` and wraps the
    /// collected characters and run ids in a [`TextPage`].
    pub fn text(&self) -> Result<TextPage> {
        let display_tree = self.interpret()?;

        let mut collector = TextExtractor::new();
        walk(display_tree, &mut collector);

        let (chars, runs) = collector.into_characters();
        let page = TextPage::new_with_run_ids(chars, runs, false);
        Ok(page)
    }

    /// Parse the annotations listed in this page's dictionary.
    ///
    /// A page without an annotations entry yields an empty vector.
    pub fn annotations(&self) -> Result<Vec<Annotation>> {
        let store = &self.doc.inner.store;
        let page_obj = store.resolve(self.page_dict_id)?;
        let page_dict = page_obj
            .as_dict()
            .ok_or(PdfError::UnknownObject(self.page_dict_id))?;

        let annots_key = Name::annots();
        let Some(annots_obj) = page_dict.get(&annots_key) else {
            return Ok(Vec::new());
        };
        Ok(parse_annotations(annots_obj, store)?)
    }

    /// Structure-tree elements that belong to this page.
    ///
    /// Returns `None` when the catalog cannot be resolved, is not a
    /// dictionary, or `StructTree::from_catalog` yields no tree (including
    /// when it fails). Otherwise returns the `PageStructure` built for this
    /// page's dictionary id.
    pub fn page_structure(&self) -> Option<PageStructure> {
        let store = &self.doc.inner.store;

        // Every failure on the way to the tree collapses into `None`.
        let catalog_obj = store.resolve(self.doc.inner.catalog_id).ok()?;
        let catalog_dict = catalog_obj.as_dict()?;
        let struct_tree = StructTree::from_catalog(catalog_dict, store).ok()??;

        Some(PageStructure::for_page(&struct_tree, self.page_dict_id))
    }

    /// Returns the zero-based page index.
    pub fn index(&self) -> u32 {
        self.page_index
    }

    /// Returns a reference to the parent document.
    pub fn document(&self) -> &ArcDocument {
        &self.doc
    }
}

// Compile-time assertions: all Arc types must be Send + Sync.
// Nothing here runs; the build simply fails if a bound is not satisfied.
#[allow(dead_code)]
const _: () = {
    fn assert_send_sync<T: Send + Sync>() {}
    fn assertions() {
        assert_send_sync::<ArcLibrary>();
        assert_send_sync::<ArcDocument>();
        assert_send_sync::<ArcPage>();
    }
};