Skip to main content

pdf_dump/
lib.rs

/// Line-writing counterpart of `writeln!` that never hands an error back to the caller.
///
/// The arguments are forwarded to `writeln!` unchanged. If the write fails:
/// - with `ErrorKind::BrokenPipe`, the process exits with status 0;
/// - with any other error, `Write error: <error>` is printed to stderr and the
///   process exits with status 1.
#[macro_export]
macro_rules! wln {
    // A single arm covers both `wln!(dst)` and `wln!(dst, fmt, args...)`:
    // the format arguments are an optional trailing group.
    ($dst:expr $(, $($arg:tt)*)?) => {
        match writeln!($dst $(, $($arg)*)?) {
            Ok(_) => {}
            Err(e) if e.kind() == ::std::io::ErrorKind::BrokenPipe => ::std::process::exit(0),
            Err(e) => {
                eprintln!("Write error: {}", e);
                ::std::process::exit(1)
            }
        }
    };
}
19
/// Counterpart of `write!` (no trailing newline) that never hands an error back to the caller.
///
/// The arguments are forwarded to `write!` unchanged. If the write fails:
/// - with `ErrorKind::BrokenPipe`, the process exits with status 0;
/// - with any other error, `Write error: <error>` is printed to stderr and the
///   process exits with status 1.
#[macro_export]
macro_rules! w {
    ($dst:expr, $($arg:tt)*) => {
        match write!($dst, $($arg)*) {
            Ok(_) => {}
            Err(e) if e.kind() == ::std::io::ErrorKind::BrokenPipe => ::std::process::exit(0),
            Err(e) => {
                eprintln!("Write error: {}", e);
                ::std::process::exit(1)
            }
        }
    };
}
31
32pub(crate) mod annotations;
33pub(crate) mod bookmarks;
34pub(crate) mod embedded;
35pub(crate) mod find_text;
36pub(crate) mod fonts;
37pub(crate) mod forms;
38pub(crate) mod helpers;
39pub(crate) mod images;
40pub(crate) mod inspect;
41pub(crate) mod layers;
42pub(crate) mod object;
43pub(crate) mod operators;
44pub(crate) mod page_info;
45pub(crate) mod page_labels;
46pub(crate) mod refs;
47pub(crate) mod resources;
48pub(crate) mod search;
49pub(crate) mod security;
50pub(crate) mod stream;
51pub(crate) mod structure;
52pub(crate) mod summary;
53pub(crate) mod text;
54pub(crate) mod tree;
55pub(crate) mod types;
56pub(crate) mod validate;
57
58use clap::Parser;
59use lopdf::{Document, Object};
60use serde_json::Value;
61use std::io::{self, Write};
62
63use helpers::json_pretty;
64use types::{Args, DocMode, DumpConfig, PageSpec, ResolvedMode, StandaloneMode};
65
66pub fn run() {
67    let args = Args::parse();
68
69    let resolved = args.resolve_mode().unwrap_or_else(|e| {
70        eprintln!("Error: {}", e);
71        std::process::exit(1);
72    });
73
74    // Modifier validation
75    if args.raw {
76        if !matches!(
77            resolved,
78            ResolvedMode::Standalone(StandaloneMode::Object { .. })
79        ) {
80            eprintln!("Error: --raw requires --object.");
81            std::process::exit(1);
82        }
83        if args.decode {
84            eprintln!("Error: --raw and --decode cannot be used together.");
85            std::process::exit(1);
86        }
87    }
88
89    let doc = match Document::load(&args.file) {
90        Ok(doc) => doc,
91        Err(e) => {
92            eprintln!("Error: Failed to load PDF file '{}'.", args.file.display());
93            eprintln!("Reason: {}", e);
94            std::process::exit(1);
95        }
96    };
97
98    let config = DumpConfig {
99        decode: args.decode,
100        truncate: args.truncate,
101        json: args.json,
102        hex: args.hex,
103        depth: args.depth,
104        deref: args.deref,
105        raw: args.raw,
106    };
107
108    let page_spec = args.page.as_deref().map(|s| {
109        PageSpec::parse(s).unwrap_or_else(|e| {
110            eprintln!("Error: {}", e);
111            std::process::exit(1);
112        })
113    });
114
115    let mut out = io::stdout().lock();
116
117    match resolved {
118        ResolvedMode::Default => {
119            dispatch_default(&mut out, &doc, &config, page_spec.as_ref());
120        }
121        ResolvedMode::Standalone(mode) => {
122            dispatch_standalone(&mut out, &doc, &config, mode);
123        }
124        ResolvedMode::Combined(modes) => {
125            dispatch_combined(&mut out, &doc, &config, page_spec.as_ref(), &args, &modes);
126        }
127    }
128}
129
130fn dispatch_default(
131    out: &mut impl Write,
132    doc: &Document,
133    config: &DumpConfig,
134    page_spec: Option<&PageSpec>,
135) {
136    if let Some(spec) = page_spec {
137        if config.json {
138            page_info::print_page_info_json(out, doc, spec);
139        } else {
140            page_info::print_page_info(out, doc, spec);
141        }
142    } else if config.json {
143        summary::print_overview_json(out, doc, config.decode);
144    } else {
145        summary::print_overview(out, doc, config.decode);
146    }
147}
148
149fn dispatch_standalone(
150    out: &mut impl Write,
151    doc: &Document,
152    config: &DumpConfig,
153    mode: StandaloneMode,
154) {
155    match mode {
156        StandaloneMode::ExtractStream {
157            obj_num,
158            ref output,
159        } => {
160            let object_id = (obj_num, 0);
161            match doc.get_object(object_id) {
162                Ok(Object::Stream(s)) => {
163                    let (decoded_content, warning) = stream::decode_stream(s);
164                    if let Some(warn) = &warning {
165                        eprintln!("Warning: {}", warn);
166                    }
167                    if let Err(e) = std::fs::write(output, &*decoded_content) {
168                        eprintln!("Error writing to output file: {}", e);
169                        std::process::exit(1);
170                    }
171                    wln!(
172                        out,
173                        "Successfully extracted object {} to '{}'.",
174                        obj_num,
175                        output.display()
176                    );
177                }
178                Ok(_) => {
179                    eprintln!(
180                        "Error: Object {} is not a stream and cannot be extracted to a file.",
181                        obj_num
182                    );
183                    std::process::exit(1);
184                }
185                Err(_) => {
186                    eprintln!("Error: Object {} not found in the document.", obj_num);
187                    std::process::exit(1);
188                }
189            }
190        }
191        StandaloneMode::Object { ref nums } => {
192            if config.json {
193                object::print_objects_json(out, doc, nums, config);
194            } else {
195                object::print_objects(out, doc, nums, config);
196            }
197        }
198        StandaloneMode::Inspect { obj_num } => {
199            if config.json {
200                inspect::print_info_json(out, doc, obj_num, config);
201            } else {
202                inspect::print_info(out, doc, obj_num);
203            }
204        }
205        StandaloneMode::Search {
206            ref expr,
207            list_modifier,
208        } => {
209            let conditions = match search::parse_search_expr(expr) {
210                Ok(c) => c,
211                Err(e) => {
212                    eprintln!("Error: Invalid search expression: {}", e);
213                    std::process::exit(1);
214                }
215            };
216            if config.json {
217                search::search_objects_json(out, doc, expr, &conditions, config);
218            } else {
219                search::search_objects(out, doc, &conditions, config, list_modifier);
220            }
221        }
222    }
223}
224
225fn dispatch_combined(
226    out: &mut impl Write,
227    doc: &Document,
228    config: &DumpConfig,
229    page_spec: Option<&PageSpec>,
230    args: &Args,
231    modes: &[DocMode],
232) {
233    let multi = modes.len() > 1;
234
235    if config.json {
236        if multi {
237            // Multiple modes: wrap in { "key": value, ... }
238            let mut map = serde_json::Map::new();
239            for mode in modes {
240                let value = build_mode_json_value(mode, doc, config, page_spec, args);
241                map.insert(mode.json_key().to_string(), value);
242            }
243            let output = Value::Object(map);
244            wln!(out, "{}", json_pretty(&output));
245        } else {
246            // Single mode: output directly (unchanged schema)
247            let value = build_mode_json_value(&modes[0], doc, config, page_spec, args);
248            wln!(out, "{}", json_pretty(&value));
249        }
250    } else {
251        for (i, mode) in modes.iter().enumerate() {
252            if multi {
253                if i > 0 {
254                    wln!(out);
255                }
256                wln!(out, "=== {} ===", mode.label());
257            }
258            dispatch_mode_text(out, mode, doc, config, page_spec, args);
259        }
260    }
261}
262
263fn build_mode_json_value(
264    mode: &DocMode,
265    doc: &Document,
266    config: &DumpConfig,
267    page_spec: Option<&PageSpec>,
268    args: &Args,
269) -> Value {
270    match mode {
271        DocMode::List => summary::list_json_value(doc),
272        DocMode::Validate => validate::validation_json_value(doc),
273        DocMode::Fonts => fonts::fonts_json_value(doc),
274        DocMode::Images => images::images_json_value(doc),
275        DocMode::Forms => forms::forms_json_value(doc),
276        DocMode::Bookmarks => bookmarks::bookmarks_json_value(doc),
277        DocMode::Annotations => annotations::annotations_json_value(doc, page_spec),
278        DocMode::Text => text::text_json_value(doc, page_spec),
279        DocMode::Operators => operators::operators_json_value(doc, page_spec),
280        DocMode::Tags => structure::structure_json_value(doc, config),
281        DocMode::Tree => tree::tree_json_value(doc, config),
282        DocMode::FindText => {
283            find_text::find_text_json_value(doc, args.find_text.as_deref().unwrap_or(""), page_spec)
284        }
285        DocMode::Detail(sub) => match sub {
286            types::DetailSub::Security => security::security_json_value(doc, &args.file),
287            types::DetailSub::Embedded => embedded::embedded_json_value(doc),
288            types::DetailSub::Labels => page_labels::labels_json_value(doc),
289            types::DetailSub::Layers => layers::layers_json_value(doc),
290        },
291    }
292}
293
294fn dispatch_mode_text(
295    out: &mut impl Write,
296    mode: &DocMode,
297    doc: &Document,
298    config: &DumpConfig,
299    page_spec: Option<&PageSpec>,
300    args: &Args,
301) {
302    match mode {
303        DocMode::List => summary::print_list(out, doc),
304        DocMode::Validate => validate::print_validation(out, doc),
305        DocMode::Fonts => fonts::print_fonts(out, doc),
306        DocMode::Images => images::print_images(out, doc),
307        DocMode::Forms => forms::print_forms(out, doc),
308        DocMode::Bookmarks => bookmarks::print_bookmarks(out, doc),
309        DocMode::Annotations => annotations::print_annotations(out, doc, page_spec),
310        DocMode::Text => text::print_text(out, doc, page_spec),
311        DocMode::Operators => operators::print_operators(out, doc, page_spec),
312        DocMode::Tags => structure::print_structure(out, doc, config),
313        DocMode::FindText => {
314            find_text::print_find_text(out, doc, args.find_text.as_deref().unwrap_or(""), page_spec)
315        }
316        DocMode::Tree => {
317            if args.dot {
318                tree::print_tree_dot(out, doc, config);
319            } else {
320                tree::print_tree(out, doc, config);
321            }
322        }
323        DocMode::Detail(sub) => match sub {
324            types::DetailSub::Security => security::print_security(out, doc, &args.file),
325            types::DetailSub::Embedded => embedded::print_embedded_files(out, doc),
326            types::DetailSub::Labels => page_labels::print_page_labels(out, doc),
327            types::DetailSub::Layers => layers::print_layers(out, doc),
328        },
329    }
330}
331
/// Shared fixtures and helpers for the crate's unit tests.
#[cfg(test)]
pub(crate) mod test_utils {
    use crate::types::DumpConfig;
    use flate2::Compression;
    use flate2::write::ZlibEncoder;
    use lopdf::{Dictionary, Document, Object, ObjectId, Stream};
    use std::io::Write;

    /// Wraps a byte string as a PDF name object.
    fn name(bytes: &[u8]) -> Object {
        Object::Name(bytes.to_vec())
    }

    /// The `[0 0 612 792]` MediaBox array used by the page fixtures.
    fn letter_media_box() -> Object {
        Object::Array(
            vec![0, 0, 612, 792]
                .into_iter()
                .map(Object::Integer)
                .collect(),
        )
    }

    /// Page dictionary with Type, Parent, Contents, Resources and MediaBox
    /// entries, set in that order.
    fn page_with_contents(parent: ObjectId, contents: ObjectId, resources: ObjectId) -> Dictionary {
        let mut page = Dictionary::new();
        page.set("Type", name(b"Page"));
        page.set("Parent", Object::Reference(parent));
        page.set("Contents", Object::Reference(contents));
        page.set("Resources", Object::Reference(resources));
        page.set("MediaBox", letter_media_box());
        page
    }

    /// Runs `f` against an in-memory buffer and returns what it wrote as a
    /// `String`. Panics if the bytes are not valid UTF-8.
    pub fn output_of(f: impl FnOnce(&mut Vec<u8>)) -> String {
        let mut captured: Vec<u8> = Vec::new();
        f(&mut captured);
        String::from_utf8(captured).unwrap()
    }

    /// A new `Document` whose version is set to "1.5".
    pub fn empty_doc() -> Document {
        let mut document = Document::new();
        document.version = String::from("1.5");
        document
    }

    /// A `DumpConfig` with every flag off and every optional setting `None`.
    pub fn default_config() -> DumpConfig {
        DumpConfig {
            decode: false,
            truncate: None,
            json: false,
            hex: false,
            depth: None,
            deref: false,
            raw: false,
        }
    }

    /// Builds a stream holding `content`; when `filter` is given it is stored
    /// under the `Filter` key of the stream dictionary.
    pub fn make_stream(filter: Option<Object>, content: Vec<u8>) -> Stream {
        let mut stream_dict = Dictionary::new();
        if let Some(filter_obj) = filter {
            stream_dict.set("Filter", filter_obj);
        }
        Stream::new(stream_dict, content)
    }

    /// Compresses `data` with zlib at the default compression level.
    pub fn zlib_compress(data: &[u8]) -> Vec<u8> {
        let mut zlib = ZlibEncoder::new(Vec::new(), Compression::default());
        zlib.write_all(data).unwrap();
        zlib.finish().unwrap()
    }

    /// Same as [`default_config`], but with `json` switched on.
    pub fn json_config() -> DumpConfig {
        DumpConfig {
            json: true,
            ..default_config()
        }
    }

    /// Builds a document with two pages that share one resources dictionary
    /// (a single Helvetica font entry `F1`). Each page has its own content
    /// stream and a `[0 0 612 792]` MediaBox.
    pub fn build_two_page_doc() -> Document {
        let mut doc = Document::new();

        // Objects are added in a fixed order: content streams, font,
        // resources, page tree, pages, catalog.
        let first_stream = Stream::new(Dictionary::new(), b"BT /F1 12 Tf (Page1) Tj ET".to_vec());
        let first_contents = doc.add_object(Object::Stream(first_stream));
        let second_stream = Stream::new(Dictionary::new(), b"BT /F1 12 Tf (Page2) Tj ET".to_vec());
        let second_contents = doc.add_object(Object::Stream(second_stream));

        let mut font = Dictionary::new();
        font.set("Type", name(b"Font"));
        font.set("BaseFont", name(b"Helvetica"));
        let font_id = doc.add_object(Object::Dictionary(font));

        let mut font_map = Dictionary::new();
        font_map.set("F1", Object::Reference(font_id));
        let mut resources = Dictionary::new();
        resources.set("Font", Object::Dictionary(font_map));
        let resources_id = doc.add_object(Object::Dictionary(resources));

        // The page tree is added with empty Kids first because the pages need
        // its id for their Parent entry; Kids is filled in below.
        let mut page_tree = Dictionary::new();
        page_tree.set("Type", name(b"Pages"));
        page_tree.set("Count", Object::Integer(2));
        page_tree.set("Kids", Object::Array(vec![]));
        let pages_id = doc.add_object(Object::Dictionary(page_tree));

        let first_page = page_with_contents(pages_id, first_contents, resources_id);
        let first_page_id = doc.add_object(Object::Dictionary(first_page));
        let second_page = page_with_contents(pages_id, second_contents, resources_id);
        let second_page_id = doc.add_object(Object::Dictionary(second_page));

        if let Ok(Object::Dictionary(tree)) = doc.get_object_mut(pages_id) {
            let kids = vec![
                Object::Reference(first_page_id),
                Object::Reference(second_page_id),
            ];
            tree.set("Kids", Object::Array(kids));
        }

        let mut catalog = Dictionary::new();
        catalog.set("Type", name(b"Catalog"));
        catalog.set("Pages", Object::Reference(pages_id));
        let catalog_id = doc.add_object(Object::Dictionary(catalog));
        doc.trailer.set("Root", Object::Reference(catalog_id));

        doc
    }

    /// Builds a single-page document whose content stream is `content`.
    ///
    /// Objects are inserted under fixed ids: (1, 0) content stream, (2, 0)
    /// page, (3, 0) page tree, (4, 0) catalog.
    pub fn build_page_doc_with_content(content: &[u8]) -> Document {
        let contents_id: ObjectId = (1, 0);
        let page_id: ObjectId = (2, 0);
        let pages_id: ObjectId = (3, 0);
        let catalog_id: ObjectId = (4, 0);

        let mut doc = Document::new();

        let content_stream = Stream::new(Dictionary::new(), content.to_vec());
        doc.objects.insert(contents_id, Object::Stream(content_stream));

        let mut page = Dictionary::new();
        page.set("Type", name(b"Page"));
        page.set("Contents", Object::Reference(contents_id));
        page.set("Parent", Object::Reference(pages_id));
        doc.objects.insert(page_id, Object::Dictionary(page));

        let mut page_tree = Dictionary::new();
        page_tree.set("Type", name(b"Pages"));
        page_tree.set("Count", Object::Integer(1));
        page_tree.set("Kids", Object::Array(vec![Object::Reference(page_id)]));
        doc.objects.insert(pages_id, Object::Dictionary(page_tree));

        let mut catalog = Dictionary::new();
        catalog.set("Type", name(b"Catalog"));
        catalog.set("Pages", Object::Reference(pages_id));
        doc.objects.insert(catalog_id, Object::Dictionary(catalog));

        doc.trailer.set("Root", Object::Reference(catalog_id));
        doc
    }

    /// Inserts a page dictionary at `page_id` with the given parent, a
    /// `[0 0 612 792]` MediaBox, and an `Annots` array referencing `annot_ids`.
    pub fn make_page_with_annots(
        doc: &mut Document,
        page_id: lopdf::ObjectId,
        parent_id: lopdf::ObjectId,
        annot_ids: Vec<lopdf::ObjectId>,
    ) {
        let mut page = Dictionary::new();
        page.set("Type", name(b"Page"));
        page.set("Parent", Object::Reference(parent_id));
        page.set("MediaBox", letter_media_box());

        let annot_refs: Vec<Object> = annot_ids.into_iter().map(Object::Reference).collect();
        page.set("Annots", Object::Array(annot_refs));

        doc.objects.insert(page_id, Object::Dictionary(page));
    }
}
513}