Skip to main content

pdf_dump/
lib.rs

/// `writeln!` wrapper for command-line output that never panics on a failed write.
///
/// Accepts the same arguments as `writeln!` (a destination, then an optional
/// format string and arguments). If the write fails with
/// `ErrorKind::BrokenPipe` the process exits with status 0; any other write
/// error is reported on stderr and the process exits with status 1.
#[macro_export]
macro_rules! wln {
    ($dst:expr $(, $($arg:tt)*)?) => {
        match writeln!($dst $(, $($arg)*)?) {
            Ok(()) => {}
            // The reader closed the pipe (e.g. `| head`): stop quietly.
            Err(e) if e.kind() == ::std::io::ErrorKind::BrokenPipe => ::std::process::exit(0),
            Err(e) => {
                eprintln!("Write error: {}", e);
                ::std::process::exit(1);
            }
        }
    };
}
19
/// `write!` wrapper for command-line output that never panics on a failed write.
///
/// Takes a destination followed by a format string and arguments, exactly
/// like `write!`. If the write fails with `ErrorKind::BrokenPipe` the process
/// exits with status 0; any other write error is reported on stderr and the
/// process exits with status 1.
#[macro_export]
macro_rules! w {
    ($dst:expr, $($arg:tt)*) => {
        match write!($dst, $($arg)*) {
            Ok(()) => {}
            // The reader closed the pipe (e.g. `| head`): stop quietly.
            Err(e) if e.kind() == ::std::io::ErrorKind::BrokenPipe => ::std::process::exit(0),
            Err(e) => {
                eprintln!("Write error: {}", e);
                ::std::process::exit(1);
            }
        }
    };
}
31
32pub(crate) mod types;
33pub(crate) mod stream;
34pub(crate) mod helpers;
35pub(crate) mod object;
36pub(crate) mod refs;
37pub(crate) mod summary;
38pub(crate) mod search;
39pub(crate) mod text;
40pub(crate) mod operators;
41pub(crate) mod resources;
42pub(crate) mod forms;
43pub(crate) mod fonts;
44pub(crate) mod images;
45pub(crate) mod validate;
46pub(crate) mod bookmarks;
47pub(crate) mod annotations;
48pub(crate) mod security;
49pub(crate) mod embedded;
50pub(crate) mod page_labels;
51pub(crate) mod tree;
52pub(crate) mod layers;
53pub(crate) mod structure;
54pub(crate) mod inspect;
55pub(crate) mod page_info;
56pub(crate) mod find_text;
57
58use clap::Parser;
59use lopdf::{Document, Object};
60use serde_json::Value;
61use std::io::{self, Write};
62
63use helpers::json_pretty;
64use types::{Args, DocMode, DumpConfig, PageSpec, ResolvedMode, StandaloneMode};
65
66pub fn run() {
67    let args = Args::parse();
68
69    let resolved = args.resolve_mode().unwrap_or_else(|e| {
70        eprintln!("Error: {}", e);
71        std::process::exit(1);
72    });
73
74    // Modifier validation
75    if args.raw {
76        if !matches!(resolved, ResolvedMode::Standalone(StandaloneMode::Object { .. })) {
77            eprintln!("Error: --raw requires --object.");
78            std::process::exit(1);
79        }
80        if args.decode {
81            eprintln!("Error: --raw and --decode cannot be used together.");
82            std::process::exit(1);
83        }
84    }
85
86    let doc = match Document::load(&args.file) {
87        Ok(doc) => doc,
88        Err(e) => {
89            eprintln!("Error: Failed to load PDF file '{}'.", args.file.display());
90            eprintln!("Reason: {}", e);
91            std::process::exit(1);
92        }
93    };
94
95    let config = DumpConfig {
96        decode: args.decode,
97        truncate: args.truncate,
98        json: args.json,
99        hex: args.hex,
100        depth: args.depth,
101        deref: args.deref,
102        raw: args.raw,
103    };
104
105    let page_spec = args.page.as_deref().map(|s| {
106        PageSpec::parse(s).unwrap_or_else(|e| {
107            eprintln!("Error: {}", e);
108            std::process::exit(1);
109        })
110    });
111
112    let mut out = io::stdout().lock();
113
114    match resolved {
115        ResolvedMode::Default => {
116            dispatch_default(&mut out, &doc, &config, page_spec.as_ref());
117        }
118        ResolvedMode::Standalone(mode) => {
119            dispatch_standalone(&mut out, &doc, &config, mode);
120        }
121        ResolvedMode::Combined(modes) => {
122            dispatch_combined(&mut out, &doc, &config, page_spec.as_ref(), &args, &modes);
123        }
124    }
125}
126
127fn dispatch_default(
128    out: &mut impl Write,
129    doc: &Document,
130    config: &DumpConfig,
131    page_spec: Option<&PageSpec>,
132) {
133    if let Some(spec) = page_spec {
134        if config.json {
135            page_info::print_page_info_json(out, doc, spec);
136        } else {
137            page_info::print_page_info(out, doc, spec);
138        }
139    } else if config.json {
140        summary::print_overview_json(out, doc, config.decode);
141    } else {
142        summary::print_overview(out, doc, config.decode);
143    }
144}
145
146fn dispatch_standalone(
147    out: &mut impl Write,
148    doc: &Document,
149    config: &DumpConfig,
150    mode: StandaloneMode,
151) {
152    match mode {
153        StandaloneMode::ExtractStream { obj_num, ref output } => {
154            let object_id = (obj_num, 0);
155            match doc.get_object(object_id) {
156                Ok(Object::Stream(s)) => {
157                    let (decoded_content, warning) = stream::decode_stream(s);
158                    if let Some(warn) = &warning {
159                        eprintln!("Warning: {}", warn);
160                    }
161                    if let Err(e) = std::fs::write(output, &*decoded_content) {
162                        eprintln!("Error writing to output file: {}", e);
163                        std::process::exit(1);
164                    }
165                    wln!(out, "Successfully extracted object {} to '{}'.", obj_num, output.display());
166                }
167                Ok(_) => {
168                    eprintln!("Error: Object {} is not a stream and cannot be extracted to a file.", obj_num);
169                    std::process::exit(1);
170                }
171                Err(_) => {
172                    eprintln!("Error: Object {} not found in the document.", obj_num);
173                    std::process::exit(1);
174                }
175            }
176        }
177        StandaloneMode::Object { ref nums } => {
178            if config.json {
179                object::print_objects_json(out, doc, nums, config);
180            } else {
181                object::print_objects(out, doc, nums, config);
182            }
183        }
184        StandaloneMode::Inspect { obj_num } => {
185            if config.json {
186                inspect::print_info_json(out, doc, obj_num, config);
187            } else {
188                inspect::print_info(out, doc, obj_num);
189            }
190        }
191        StandaloneMode::Search { ref expr, list_modifier } => {
192            let conditions = match search::parse_search_expr(expr) {
193                Ok(c) => c,
194                Err(e) => {
195                    eprintln!("Error: Invalid search expression: {}", e);
196                    std::process::exit(1);
197                }
198            };
199            if config.json {
200                search::search_objects_json(out, doc, expr, &conditions, config);
201            } else {
202                search::search_objects(out, doc, &conditions, config, list_modifier);
203            }
204        }
205    }
206}
207
208fn dispatch_combined(
209    out: &mut impl Write,
210    doc: &Document,
211    config: &DumpConfig,
212    page_spec: Option<&PageSpec>,
213    args: &Args,
214    modes: &[DocMode],
215) {
216    let multi = modes.len() > 1;
217
218    if config.json {
219        if multi {
220            // Multiple modes: wrap in { "key": value, ... }
221            let mut map = serde_json::Map::new();
222            for mode in modes {
223                let value = build_mode_json_value(mode, doc, config, page_spec, args);
224                map.insert(mode.json_key().to_string(), value);
225            }
226            let output = Value::Object(map);
227            wln!(out, "{}", json_pretty(&output));
228        } else {
229            // Single mode: output directly (unchanged schema)
230            let value = build_mode_json_value(&modes[0], doc, config, page_spec, args);
231            wln!(out, "{}", json_pretty(&value));
232        }
233    } else {
234        for (i, mode) in modes.iter().enumerate() {
235            if multi {
236                if i > 0 {
237                    wln!(out);
238                }
239                wln!(out, "=== {} ===", mode.label());
240            }
241            dispatch_mode_text(out, mode, doc, config, page_spec, args);
242        }
243    }
244}
245
246fn build_mode_json_value(
247    mode: &DocMode,
248    doc: &Document,
249    config: &DumpConfig,
250    page_spec: Option<&PageSpec>,
251    args: &Args,
252) -> Value {
253    match mode {
254        DocMode::List => summary::list_json_value(doc),
255        DocMode::Validate => validate::validation_json_value(doc),
256        DocMode::Fonts => fonts::fonts_json_value(doc),
257        DocMode::Images => images::images_json_value(doc),
258        DocMode::Forms => forms::forms_json_value(doc),
259        DocMode::Bookmarks => bookmarks::bookmarks_json_value(doc),
260        DocMode::Annotations => annotations::annotations_json_value(doc, page_spec),
261        DocMode::Text => text::text_json_value(doc, page_spec),
262        DocMode::Operators => operators::operators_json_value(doc, page_spec),
263        DocMode::Tags => structure::structure_json_value(doc, config),
264        DocMode::Tree => tree::tree_json_value(doc, config),
265        DocMode::FindText => find_text::find_text_json_value(doc, args.find_text.as_deref().unwrap_or(""), page_spec),
266        DocMode::Detail(sub) => match sub {
267            types::DetailSub::Security => security::security_json_value(doc, &args.file),
268            types::DetailSub::Embedded => embedded::embedded_json_value(doc),
269            types::DetailSub::Labels => page_labels::labels_json_value(doc),
270            types::DetailSub::Layers => layers::layers_json_value(doc),
271        },
272    }
273}
274
275fn dispatch_mode_text(
276    out: &mut impl Write,
277    mode: &DocMode,
278    doc: &Document,
279    config: &DumpConfig,
280    page_spec: Option<&PageSpec>,
281    args: &Args,
282) {
283    match mode {
284        DocMode::List => summary::print_list(out, doc),
285        DocMode::Validate => validate::print_validation(out, doc),
286        DocMode::Fonts => fonts::print_fonts(out, doc),
287        DocMode::Images => images::print_images(out, doc),
288        DocMode::Forms => forms::print_forms(out, doc),
289        DocMode::Bookmarks => bookmarks::print_bookmarks(out, doc),
290        DocMode::Annotations => annotations::print_annotations(out, doc, page_spec),
291        DocMode::Text => text::print_text(out, doc, page_spec),
292        DocMode::Operators => operators::print_operators(out, doc, page_spec),
293        DocMode::Tags => structure::print_structure(out, doc, config),
294        DocMode::FindText => find_text::print_find_text(out, doc, args.find_text.as_deref().unwrap_or(""), page_spec),
295        DocMode::Tree => {
296            if args.dot {
297                tree::print_tree_dot(out, doc, config);
298            } else {
299                tree::print_tree(out, doc, config);
300            }
301        }
302        DocMode::Detail(sub) => match sub {
303            types::DetailSub::Security => security::print_security(out, doc, &args.file),
304            types::DetailSub::Embedded => embedded::print_embedded_files(out, doc),
305            types::DetailSub::Labels => page_labels::print_page_labels(out, doc),
306            types::DetailSub::Layers => layers::print_layers(out, doc),
307        },
308    }
309}
310
#[cfg(test)]
pub(crate) mod test_utils {
    //! Fixtures shared by the crate's unit tests: output capture, canned
    //! `DumpConfig` values, and small in-memory PDF documents.

    use crate::types::DumpConfig;
    use flate2::write::ZlibEncoder;
    use flate2::Compression;
    use lopdf::{Dictionary, Document, Object, ObjectId, Stream};
    use std::io::Write;

    /// Wraps raw bytes in a PDF name object.
    fn name_obj(raw: &[u8]) -> Object {
        Object::Name(raw.to_vec())
    }

    /// The `[0 0 612 792]` media box used by the page fixtures.
    fn letter_media_box() -> Object {
        Object::Array(
            [0i64, 0, 612, 792]
                .iter()
                .map(|&edge| Object::Integer(edge))
                .collect(),
        )
    }

    /// Runs `f` against an in-memory buffer and returns what it wrote as a
    /// `String`. Panics if the written bytes are not valid UTF-8.
    pub fn output_of(f: impl FnOnce(&mut Vec<u8>)) -> String {
        let mut captured: Vec<u8> = Vec::new();
        f(&mut captured);
        String::from_utf8(captured).unwrap()
    }

    /// A fresh `Document` whose version is set to "1.5".
    pub fn empty_doc() -> Document {
        let mut blank = Document::new();
        blank.version = String::from("1.5");
        blank
    }

    /// A `DumpConfig` with every flag off and every optional setting unset.
    pub fn default_config() -> DumpConfig {
        DumpConfig {
            decode: false,
            truncate: None,
            json: false,
            hex: false,
            depth: None,
            deref: false,
            raw: false,
        }
    }

    /// Builds a stream holding `content`; when `filter` is given it is stored
    /// under the dictionary's `Filter` key.
    pub fn make_stream(filter: Option<Object>, content: Vec<u8>) -> Stream {
        let mut header = Dictionary::new();
        if let Some(filter_obj) = filter {
            header.set("Filter", filter_obj);
        }
        Stream::new(header, content)
    }

    /// Compresses `data` with zlib at the default compression level.
    pub fn zlib_compress(data: &[u8]) -> Vec<u8> {
        let mut zlib = ZlibEncoder::new(Vec::new(), Compression::default());
        zlib.write_all(data).unwrap();
        zlib.finish().unwrap()
    }

    /// Same as `default_config`, but with `json` switched on.
    pub fn json_config() -> DumpConfig {
        DumpConfig {
            json: true,
            ..default_config()
        }
    }

    /// Builds a two-page document: each page has its own content stream and
    /// both share one resources dictionary that maps `F1` to a Helvetica font.
    ///
    /// Objects are added in the order: content streams, font, resources,
    /// pages node, page 1, page 2, catalog.
    pub fn build_two_page_doc() -> Document {
        let mut doc = Document::new();

        // One content stream per page, added before anything else.
        let mut content_ids: Vec<ObjectId> = Vec::with_capacity(2);
        for ops in [
            &b"BT /F1 12 Tf (Page1) Tj ET"[..],
            &b"BT /F1 12 Tf (Page2) Tj ET"[..],
        ]
        .iter()
        {
            let stream = Stream::new(Dictionary::new(), ops.to_vec());
            content_ids.push(doc.add_object(Object::Stream(stream)));
        }

        let mut font = Dictionary::new();
        font.set("Type", name_obj(b"Font"));
        font.set("BaseFont", name_obj(b"Helvetica"));
        let font_id = doc.add_object(Object::Dictionary(font));

        let mut font_map = Dictionary::new();
        font_map.set("F1", Object::Reference(font_id));
        let mut resources = Dictionary::new();
        resources.set("Font", Object::Dictionary(font_map));
        let resources_id = doc.add_object(Object::Dictionary(resources));

        // The pages node is added with an empty Kids array first because the
        // page objects need its id as their Parent; Kids is filled in below.
        let mut pages = Dictionary::new();
        pages.set("Type", name_obj(b"Pages"));
        pages.set("Count", Object::Integer(2));
        pages.set("Kids", Object::Array(vec![]));
        let pages_id = doc.add_object(Object::Dictionary(pages));

        let mut kids: Vec<Object> = Vec::with_capacity(content_ids.len());
        for contents_id in content_ids {
            let mut page = Dictionary::new();
            page.set("Type", name_obj(b"Page"));
            page.set("Parent", Object::Reference(pages_id));
            page.set("Contents", Object::Reference(contents_id));
            page.set("Resources", Object::Reference(resources_id));
            page.set("MediaBox", letter_media_box());
            kids.push(Object::Reference(doc.add_object(Object::Dictionary(page))));
        }

        if let Ok(Object::Dictionary(pages_dict)) = doc.get_object_mut(pages_id) {
            pages_dict.set("Kids", Object::Array(kids));
        }

        let mut catalog = Dictionary::new();
        catalog.set("Type", name_obj(b"Catalog"));
        catalog.set("Pages", Object::Reference(pages_id));
        let catalog_id = doc.add_object(Object::Dictionary(catalog));
        doc.trailer.set("Root", Object::Reference(catalog_id));

        doc
    }

    /// Builds a one-page document whose content stream is `content`.
    ///
    /// Objects are inserted under fixed ids: 1 = content stream, 2 = page,
    /// 3 = pages node, 4 = catalog (all generation 0).
    pub fn build_page_doc_with_content(content: &[u8]) -> Document {
        let contents_id: ObjectId = (1, 0);
        let page_id: ObjectId = (2, 0);
        let pages_id: ObjectId = (3, 0);
        let catalog_id: ObjectId = (4, 0);

        let mut page = Dictionary::new();
        page.set("Type", name_obj(b"Page"));
        page.set("Contents", Object::Reference(contents_id));
        page.set("Parent", Object::Reference(pages_id));

        let mut pages = Dictionary::new();
        pages.set("Type", name_obj(b"Pages"));
        pages.set("Count", Object::Integer(1));
        pages.set("Kids", Object::Array(vec![Object::Reference(page_id)]));

        let mut catalog = Dictionary::new();
        catalog.set("Type", name_obj(b"Catalog"));
        catalog.set("Pages", Object::Reference(pages_id));

        let mut doc = Document::new();
        doc.objects.insert(
            contents_id,
            Object::Stream(Stream::new(Dictionary::new(), content.to_vec())),
        );
        doc.objects.insert(page_id, Object::Dictionary(page));
        doc.objects.insert(pages_id, Object::Dictionary(pages));
        doc.objects.insert(catalog_id, Object::Dictionary(catalog));
        doc.trailer.set("Root", Object::Reference(catalog_id));
        doc
    }

    /// Inserts a page dictionary at `page_id` whose `Parent` is `parent_id`
    /// and whose `Annots` array references each id in `annot_ids`, in order.
    pub fn make_page_with_annots(doc: &mut Document, page_id: lopdf::ObjectId, parent_id: lopdf::ObjectId, annot_ids: Vec<lopdf::ObjectId>) {
        let annot_refs: Vec<Object> = annot_ids.into_iter().map(Object::Reference).collect();

        let mut page = Dictionary::new();
        page.set("Type", name_obj(b"Page"));
        page.set("Parent", Object::Reference(parent_id));
        page.set("MediaBox", letter_media_box());
        page.set("Annots", Object::Array(annot_refs));
        doc.objects.insert(page_id, Object::Dictionary(page));
    }
}