Skip to main content

justpdf_core/
embedded_file.rs

//! PDF embedded files (attachments) — PDF spec section 7.11 (file specifications).
2//!
3//! Supports reading embedded files from the catalog name tree, extracting
4//! file data, and adding new embedded files to a document.
5
6use md5::{Digest, Md5};
7
8use crate::error::{JustPdfError, Result};
9use crate::object::{IndirectRef, PdfDict, PdfObject};
10use crate::parser::PdfDocument;
11use crate::stream;
12use crate::writer::encode::make_stream;
13use crate::writer::modify::DocumentModifier;
14
15// ---------------------------------------------------------------------------
16// Types
17// ---------------------------------------------------------------------------
18
19/// A parsed file specification (PDF FileSpec dictionary).
20#[derive(Debug, Clone)]
21pub struct FileSpec {
22    /// The filename (from /UF or /F).
23    pub filename: String,
24    /// Optional description (/Desc).
25    pub description: Option<String>,
26    /// MIME type (from the EF stream /Subtype, e.g. "application/pdf").
27    pub mime_type: Option<String>,
28    /// Uncompressed file size in bytes (/Params -> /Size).
29    pub size: Option<usize>,
30    /// MD5 checksum of the uncompressed data (/Params -> /CheckSum).
31    pub checksum: Option<Vec<u8>>,
32    /// Creation date string (/Params -> /CreationDate).
33    pub creation_date: Option<String>,
34    /// Modification date string (/Params -> /ModDate).
35    pub mod_date: Option<String>,
36    /// Reference to the embedded file stream object (/EF -> /F).
37    pub ef_stream_ref: Option<IndirectRef>,
38}
39
40// ---------------------------------------------------------------------------
41// Parsing helpers
42// ---------------------------------------------------------------------------
43
44/// Extract a UTF-8 string from a `PdfObject::String`.
45fn obj_to_string(obj: &PdfObject) -> Option<String> {
46    match obj {
47        PdfObject::String(bytes) => {
48            // Handle BOM-prefixed UTF-16BE strings
49            if bytes.len() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF {
50                let chars: Vec<u16> = bytes[2..]
51                    .chunks(2)
52                    .filter_map(|c| {
53                        if c.len() == 2 {
54                            Some(u16::from_be_bytes([c[0], c[1]]))
55                        } else {
56                            None
57                        }
58                    })
59                    .collect();
60                String::from_utf16(&chars).ok()
61            } else {
62                Some(String::from_utf8_lossy(bytes).into_owned())
63            }
64        }
65        _ => None,
66    }
67}
68
69/// Parse a single FileSpec dictionary into a `FileSpec`.
70fn parse_file_spec_dict(
71    doc: &PdfDocument,
72    dict: &PdfDict,
73) -> Result<FileSpec> {
74    // Filename: prefer /UF (Unicode), fall back to /F
75    let filename = dict
76        .get(b"UF")
77        .and_then(obj_to_string)
78        .or_else(|| dict.get(b"F").and_then(obj_to_string))
79        .unwrap_or_default();
80
81    let description = dict.get(b"Desc").and_then(obj_to_string);
82
83    // /EF dict -> /F (reference to the embedded file stream)
84    let mut ef_stream_ref: Option<IndirectRef> = None;
85    let mut mime_type: Option<String> = None;
86    let mut size: Option<usize> = None;
87    let mut checksum: Option<Vec<u8>> = None;
88    let mut creation_date: Option<String> = None;
89    let mut mod_date: Option<String> = None;
90
91    if let Some(ef_dict) = resolve_dict(doc, dict, b"EF")? {
92        // The /F entry inside /EF is a reference to the embedded stream
93        if let Some(r) = ef_dict.get_ref(b"F") {
94            let stream_ref = r.clone();
95
96            // Resolve the stream to extract params
97            if let Ok(stream_obj) = doc.resolve(&stream_ref) {
98                if let PdfObject::Stream { dict: s_dict, .. } = stream_obj {
99                    // MIME type from /Subtype
100                    if let Some(name) = s_dict.get_name(b"Subtype") {
101                        let raw = String::from_utf8_lossy(name).into_owned();
102                        // PDF uses #2F for '/' in names
103                        mime_type = Some(raw.replace("#2F", "/"));
104                    }
105
106                    // /Params sub-dictionary
107                    if let Some(params) = s_dict.get_dict(b"Params") {
108                        size = params.get_i64(b"Size").map(|v| v as usize);
109                        checksum = params.get_string(b"CheckSum").map(|b| b.to_vec());
110                        creation_date = params.get(b"CreationDate").and_then(obj_to_string);
111                        mod_date = params.get(b"ModDate").and_then(obj_to_string);
112                    }
113                }
114            }
115
116            ef_stream_ref = Some(stream_ref);
117        }
118    }
119
120    Ok(FileSpec {
121        filename,
122        description,
123        mime_type,
124        size,
125        checksum,
126        creation_date,
127        mod_date,
128        ef_stream_ref,
129    })
130}
131
132/// Resolve a dict entry that might be an indirect reference to a dict.
133fn resolve_dict<'a>(
134    doc: &'a PdfDocument,
135    parent: &PdfDict,
136    key: &[u8],
137) -> Result<Option<PdfDict>> {
138    match parent.get(key) {
139        Some(PdfObject::Dict(d)) => Ok(Some(d.clone())),
140        Some(PdfObject::Reference(r)) => {
141            let r = r.clone();
142            let obj = doc.resolve(&r)?;
143            match obj {
144                PdfObject::Dict(d) => Ok(Some(d)),
145                _ => Ok(None),
146            }
147        }
148        _ => Ok(None),
149    }
150}
151
152// ---------------------------------------------------------------------------
153// Parsing: read embedded files from catalog
154// ---------------------------------------------------------------------------
155
156/// Read all embedded file specifications from the document catalog.
157///
158/// Parses the Catalog -> /Names -> /EmbeddedFiles name tree and returns
159/// a `Vec<FileSpec>` for each attachment found.
160pub fn read_embedded_files(doc: &PdfDocument) -> Result<Vec<FileSpec>> {
161    // Get catalog
162    let catalog_ref = match doc.catalog_ref() {
163        Some(r) => r.clone(),
164        None => return Ok(Vec::new()),
165    };
166    let catalog = match doc.resolve(&catalog_ref)? {
167        PdfObject::Dict(d) => d,
168        _ => return Ok(Vec::new()),
169    };
170
171    // Catalog -> /Names
172    let names_dict = match resolve_dict(doc, &catalog, b"Names")? {
173        Some(d) => d,
174        None => return Ok(Vec::new()),
175    };
176
177    // /Names -> /EmbeddedFiles (name tree root)
178    let ef_tree = match resolve_dict(doc, &names_dict, b"EmbeddedFiles")? {
179        Some(d) => d,
180        None => return Ok(Vec::new()),
181    };
182
183    // Collect leaf values from the name tree
184    let mut file_specs = Vec::new();
185    collect_name_tree_values(doc, &ef_tree, &mut file_specs)?;
186
187    Ok(file_specs)
188}
189
190/// Recursively collect FileSpec values from a name tree node.
191fn collect_name_tree_values(
192    doc: &PdfDocument,
193    node: &PdfDict,
194    out: &mut Vec<FileSpec>,
195) -> Result<()> {
196    // Leaf node: /Names array of [name1, value1, name2, value2, ...]
197    if let Some(names_arr) = node.get_array(b"Names") {
198        let pairs: Vec<PdfObject> = names_arr.to_vec();
199        let mut i = 0;
200        while i + 1 < pairs.len() {
201            // pairs[i] is the name key (string), pairs[i+1] is the value (dict or ref)
202            let value = &pairs[i + 1];
203            let fs_dict = match value {
204                PdfObject::Dict(d) => Some(d.clone()),
205                PdfObject::Reference(r) => {
206                    let r = r.clone();
207                    match doc.resolve(&r)? {
208                        PdfObject::Dict(d) => Some(d),
209                        _ => None,
210                    }
211                }
212                _ => None,
213            };
214            if let Some(d) = fs_dict {
215                out.push(parse_file_spec_dict(doc, &d)?);
216            }
217            i += 2;
218        }
219    }
220
221    // Intermediate node: /Kids array of child node references
222    if let Some(kids_arr) = node.get_array(b"Kids") {
223        let kids: Vec<PdfObject> = kids_arr.to_vec();
224        for kid in &kids {
225            if let PdfObject::Reference(r) = kid {
226                let r = r.clone();
227                let child = doc.resolve(&r)?;
228                if let PdfObject::Dict(d) = child {
229                    collect_name_tree_values(doc, &d, out)?;
230                }
231            }
232        }
233    }
234
235    Ok(())
236}
237
238// ---------------------------------------------------------------------------
239// Extraction
240// ---------------------------------------------------------------------------
241
242/// Extract the raw (decoded) file data for an embedded file.
243///
244/// Resolves the EF stream reference, decodes the stream through its filter
245/// chain, and optionally verifies the MD5 checksum when present.
246pub fn extract_file(doc: &PdfDocument, file_spec: &FileSpec) -> Result<Vec<u8>> {
247    let stream_ref = file_spec.ef_stream_ref.as_ref().ok_or_else(|| {
248        JustPdfError::StreamDecode {
249            filter: String::new(),
250            detail: "FileSpec has no embedded file stream reference".into(),
251        }
252    })?;
253
254    let stream_obj = doc.resolve(stream_ref)?;
255    let (dict, raw_data) = match &stream_obj {
256        PdfObject::Stream { dict, data } => (dict, data.as_slice()),
257        _ => {
258            return Err(JustPdfError::StreamDecode {
259                filter: String::new(),
260                detail: "EF stream reference does not point to a stream object".into(),
261            });
262        }
263    };
264
265    let decoded = stream::decode_stream(raw_data, dict)?;
266
267    // Verify checksum if present
268    if let Some(expected) = &file_spec.checksum {
269        let mut hasher = Md5::new();
270        hasher.update(&decoded);
271        let computed = hasher.finalize();
272        if computed.as_slice() != expected.as_slice() {
273            return Err(JustPdfError::StreamDecode {
274                filter: String::new(),
275                detail: "embedded file MD5 checksum mismatch".into(),
276            });
277        }
278    }
279
280    Ok(decoded)
281}
282
283// ---------------------------------------------------------------------------
284// Builder: add embedded file
285// ---------------------------------------------------------------------------
286
287/// Add an embedded file (attachment) to the document.
288///
289/// Creates the embedded file stream, a FileSpec dictionary, and wires it
290/// into the Catalog -> /Names -> /EmbeddedFiles name tree. Returns the
291/// `IndirectRef` of the new FileSpec dictionary object.
292pub fn add_embedded_file(
293    modifier: &mut DocumentModifier,
294    filename: &str,
295    data: &[u8],
296    mime_type: Option<&str>,
297    description: Option<&str>,
298) -> Result<IndirectRef> {
299    // 1. Compute MD5 checksum of uncompressed data
300    let mut hasher = Md5::new();
301    hasher.update(data);
302    let checksum = hasher.finalize().to_vec();
303
304    // 2. Build the embedded file stream with FlateDecode compression
305    let (mut stream_dict, compressed) = make_stream(data, true);
306
307    // /Type /EmbeddedFile
308    stream_dict.insert(b"Type".to_vec(), PdfObject::Name(b"EmbeddedFile".to_vec()));
309
310    // /Subtype (MIME type encoded as a PDF name, with '/' -> '#2F')
311    if let Some(mt) = mime_type {
312        let name_encoded = mt.replace('/', "#2F");
313        stream_dict.insert(
314            b"Subtype".to_vec(),
315            PdfObject::Name(name_encoded.into_bytes()),
316        );
317    }
318
319    // /Params dict
320    let mut params = PdfDict::new();
321    params.insert(b"Size".to_vec(), PdfObject::Integer(data.len() as i64));
322    params.insert(b"CheckSum".to_vec(), PdfObject::String(checksum));
323    stream_dict.insert(b"Params".to_vec(), PdfObject::Dict(params));
324
325    let stream_ref = modifier.add_object(PdfObject::Stream {
326        dict: stream_dict,
327        data: compressed,
328    });
329
330    // 3. Build the FileSpec dictionary
331    let mut fs_dict = PdfDict::new();
332    fs_dict.insert(b"Type".to_vec(), PdfObject::Name(b"Filespec".to_vec()));
333    fs_dict.insert(
334        b"F".to_vec(),
335        PdfObject::String(filename.as_bytes().to_vec()),
336    );
337    fs_dict.insert(
338        b"UF".to_vec(),
339        PdfObject::String(filename.as_bytes().to_vec()),
340    );
341
342    if let Some(desc) = description {
343        fs_dict.insert(
344            b"Desc".to_vec(),
345            PdfObject::String(desc.as_bytes().to_vec()),
346        );
347    }
348
349    // /EF << /F stream_ref >>
350    let mut ef_dict = PdfDict::new();
351    ef_dict.insert(
352        b"F".to_vec(),
353        PdfObject::Reference(stream_ref),
354    );
355    fs_dict.insert(b"EF".to_vec(), PdfObject::Dict(ef_dict));
356
357    let fs_ref = modifier.add_object(PdfObject::Dict(fs_dict));
358
359    // 4. Wire into Catalog -> /Names -> /EmbeddedFiles
360    wire_into_name_tree(modifier, filename, &fs_ref)?;
361
362    Ok(fs_ref)
363}
364
365/// Ensure the catalog has a /Names -> /EmbeddedFiles name tree and append
366/// the new entry to it.
367fn wire_into_name_tree(
368    modifier: &mut DocumentModifier,
369    filename: &str,
370    fs_ref: &IndirectRef,
371) -> Result<()> {
372    let catalog_obj_num = modifier.catalog_ref().obj_num;
373
374    // Load catalog dict
375    let mut catalog = match modifier.find_object_pub(catalog_obj_num) {
376        Some(PdfObject::Dict(d)) => d.clone(),
377        _ => PdfDict::new(),
378    };
379
380    // Get or create /Names dict
381    let (names_obj_num, mut names_dict) = match catalog.get(b"Names") {
382        Some(PdfObject::Reference(r)) => {
383            let num = r.obj_num;
384            match modifier.find_object_pub(num) {
385                Some(PdfObject::Dict(d)) => (Some(num), d.clone()),
386                _ => (Some(num), PdfDict::new()),
387            }
388        }
389        Some(PdfObject::Dict(d)) => (None, d.clone()),
390        _ => (None, PdfDict::new()),
391    };
392
393    // Get or create /EmbeddedFiles name tree root
394    let (ef_obj_num, mut ef_dict) = match names_dict.get(b"EmbeddedFiles") {
395        Some(PdfObject::Reference(r)) => {
396            let num = r.obj_num;
397            match modifier.find_object_pub(num) {
398                Some(PdfObject::Dict(d)) => (Some(num), d.clone()),
399                _ => (Some(num), PdfDict::new()),
400            }
401        }
402        Some(PdfObject::Dict(d)) => (None, d.clone()),
403        _ => (None, PdfDict::new()),
404    };
405
406    // Append to the /Names array inside the EmbeddedFiles tree root
407    let mut names_arr = match ef_dict.get(b"Names") {
408        Some(PdfObject::Array(a)) => a.clone(),
409        _ => Vec::new(),
410    };
411    names_arr.push(PdfObject::String(filename.as_bytes().to_vec()));
412    names_arr.push(PdfObject::Reference(fs_ref.clone()));
413    ef_dict.insert(b"Names".to_vec(), PdfObject::Array(names_arr));
414
415    // Store the EmbeddedFiles dict (as indirect or inline)
416    match ef_obj_num {
417        Some(num) => {
418            modifier.set_object(num, PdfObject::Dict(ef_dict));
419        }
420        None => {
421            let ef_ref = modifier.add_object(PdfObject::Dict(ef_dict));
422            names_dict.insert(
423                b"EmbeddedFiles".to_vec(),
424                PdfObject::Reference(ef_ref),
425            );
426        }
427    }
428
429    // Store the Names dict
430    match names_obj_num {
431        Some(num) => {
432            modifier.set_object(num, PdfObject::Dict(names_dict));
433        }
434        None => {
435            let names_ref = modifier.add_object(PdfObject::Dict(names_dict));
436            catalog.insert(b"Names".to_vec(), PdfObject::Reference(names_ref));
437        }
438    }
439
440    // Update catalog
441    modifier.set_object(catalog_obj_num, PdfObject::Dict(catalog));
442
443    Ok(())
444}
445
446// ---------------------------------------------------------------------------
447// Tests
448// ---------------------------------------------------------------------------
449
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for a PDF string object holding the raw bytes of `text`.
    fn pdf_str(text: &str) -> PdfObject {
        PdfObject::String(text.as_bytes().to_vec())
    }

    /// Helper: assemble a FileSpec *dictionary* by hand. `/Desc` and `/EF`
    /// are only added when the corresponding argument is given.
    fn make_sample_fs_dict(
        filename: &str,
        desc: Option<&str>,
        stream_ref: Option<IndirectRef>,
    ) -> PdfDict {
        let mut fs = PdfDict::new();
        fs.insert(b"Type".to_vec(), PdfObject::Name(b"Filespec".to_vec()));
        fs.insert(b"UF".to_vec(), pdf_str(filename));
        fs.insert(b"F".to_vec(), pdf_str(filename));

        if let Some(text) = desc {
            fs.insert(b"Desc".to_vec(), pdf_str(text));
        }

        if let Some(target) = stream_ref {
            let mut ef = PdfDict::new();
            ef.insert(b"F".to_vec(), PdfObject::Reference(target));
            fs.insert(b"EF".to_vec(), PdfObject::Dict(ef));
        }

        fs
    }

    #[test]
    fn test_parse_file_spec_minimal() {
        // Smallest useful FileSpec: only /F is present.
        let mut dict = PdfDict::new();
        dict.insert(b"F".to_vec(), pdf_str("report.pdf"));

        // parse_file_spec_dict needs a PdfDocument, so the filename lookup
        // is replayed here with `obj_to_string` and plain dict access.
        let filename = match dict.get(b"UF").and_then(obj_to_string) {
            Some(name) => name,
            None => dict
                .get(b"F")
                .and_then(obj_to_string)
                .unwrap_or_default(),
        };

        assert_eq!(filename, "report.pdf");
        assert!(dict.get(b"Desc").is_none());
        assert!(dict.get(b"EF").is_none());
    }

    #[test]
    fn test_parse_file_spec_all_fields() {
        let target = IndirectRef { obj_num: 42, gen_num: 0 };
        let dict = make_sample_fs_dict(
            "attachment.txt",
            Some("A text attachment"),
            Some(target),
        );

        // /UF carries the filename.
        let filename = dict.get(b"UF").and_then(obj_to_string);
        assert_eq!(filename.unwrap(), "attachment.txt");

        // /Desc carries the description.
        let desc = dict.get(b"Desc").and_then(obj_to_string);
        assert_eq!(desc.unwrap(), "A text attachment");

        // /EF -> /F points at the stream object.
        let ef = dict.get_dict(b"EF").unwrap();
        let stream_ref = ef.get_ref(b"F").unwrap();
        assert_eq!(stream_ref.obj_num, 42);
        assert_eq!(stream_ref.gen_num, 0);
    }

    #[test]
    fn test_empty_embedded_files_list() {
        // A name tree root whose /Names array is empty has no pairs.
        let mut tree_root = PdfDict::new();
        tree_root.insert(b"Names".to_vec(), PdfObject::Array(Vec::new()));

        let entries = tree_root.get_array(b"Names").unwrap();
        assert!(entries.is_empty());
    }

    #[test]
    fn test_file_spec_struct_defaults() {
        let fs = FileSpec {
            filename: "test.pdf".into(),
            description: None,
            mime_type: None,
            size: None,
            checksum: None,
            creation_date: None,
            mod_date: None,
            ef_stream_ref: None,
        };

        assert_eq!(fs.filename, "test.pdf");
        // Every optional field was left unset.
        assert!(fs.description.is_none());
        assert!(fs.mime_type.is_none());
        assert!(fs.size.is_none());
        assert!(fs.checksum.is_none());
        assert!(fs.creation_date.is_none());
        assert!(fs.mod_date.is_none());
        assert!(fs.ef_stream_ref.is_none());
    }

    #[test]
    fn test_file_spec_struct_all_populated() {
        let digest = vec![0xAB, 0xCD, 0xEF, 0x01];
        let fs = FileSpec {
            filename: "data.csv".into(),
            description: Some("CSV export".into()),
            mime_type: Some("text/csv".into()),
            size: Some(1024),
            checksum: Some(digest.clone()),
            creation_date: Some("D:20260101120000".into()),
            mod_date: Some("D:20260315090000".into()),
            ef_stream_ref: Some(IndirectRef { obj_num: 99, gen_num: 0 }),
        };

        assert_eq!(fs.filename, "data.csv");
        assert_eq!(fs.description.as_deref(), Some("CSV export"));
        assert_eq!(fs.mime_type.as_deref(), Some("text/csv"));
        assert_eq!(fs.size, Some(1024));
        assert_eq!(fs.checksum.as_deref(), Some(digest.as_slice()));
        assert_eq!(fs.creation_date.as_deref(), Some("D:20260101120000"));
        assert_eq!(fs.mod_date.as_deref(), Some("D:20260315090000"));
        assert_eq!(fs.ef_stream_ref.as_ref().unwrap().obj_num, 99);
    }

    #[test]
    fn test_obj_to_string_latin() {
        let decoded = obj_to_string(&pdf_str("hello.txt"));
        assert_eq!(decoded, Some("hello.txt".into()));
    }

    #[test]
    fn test_obj_to_string_utf16be() {
        // BOM (FE FF) followed by "AB" as UTF-16BE code units.
        let obj = PdfObject::String(vec![0xFE, 0xFF, 0x00, 0x41, 0x00, 0x42]);
        assert_eq!(obj_to_string(&obj), Some("AB".into()));
    }

    #[test]
    fn test_obj_to_string_non_string() {
        assert_eq!(obj_to_string(&PdfObject::Integer(42)), None);
    }

    #[test]
    fn test_mime_type_name_encoding() {
        // '/' <-> '#2F' must round-trip.
        let mime = "application/pdf";

        let encoded = mime.replace('/', "#2F");
        assert_eq!(encoded, "application#2Fpdf");

        let decoded = encoded.replace("#2F", "/");
        assert_eq!(decoded, mime);
    }

    #[test]
    fn test_md5_checksum_computation() {
        let data = b"Hello, embedded file!";

        let mut first = Md5::new();
        first.update(data);
        let first_digest = first.finalize();

        // An MD5 digest is 16 bytes long.
        assert_eq!(first_digest.len(), 16);

        // Hashing the same bytes again gives the same digest.
        let mut second = Md5::new();
        second.update(data);
        let second_digest = second.finalize();
        assert_eq!(first_digest.as_slice(), second_digest.as_slice());
    }

    #[test]
    fn test_make_sample_fs_dict_structure() {
        let dict = make_sample_fs_dict("test.pdf", Some("Test"), None);

        assert_eq!(dict.get_name(b"Type"), Some(&b"Filespec"[..]));
        assert_eq!(dict.get_string(b"UF"), Some(&b"test.pdf"[..]));
        assert_eq!(dict.get_string(b"F"), Some(&b"test.pdf"[..]));
        assert_eq!(dict.get_string(b"Desc"), Some(&b"Test"[..]));
        assert!(dict.get(b"EF").is_none());
    }

    #[test]
    fn test_make_sample_fs_dict_with_ef() {
        let target = IndirectRef { obj_num: 7, gen_num: 0 };
        let dict = make_sample_fs_dict("data.bin", None, Some(target));

        assert!(dict.get(b"Desc").is_none());
        let ef = dict.get_dict(b"EF").unwrap();
        let r = ef.get_ref(b"F").unwrap();
        assert_eq!(r.obj_num, 7);
    }
}