1use md5::{Digest, Md5};
7
8use crate::error::{JustPdfError, Result};
9use crate::object::{IndirectRef, PdfDict, PdfObject};
10use crate::parser::PdfDocument;
11use crate::stream;
12use crate::writer::encode::make_stream;
13use crate::writer::modify::DocumentModifier;
14
/// Metadata describing one file attached to a PDF document.
///
/// Instances are built from a file specification dictionary and, when
/// present, the embedded file stream it points to. Every field other than
/// `filename` is optional because the corresponding PDF entry may be absent.
#[derive(Debug, Clone)]
pub struct FileSpec {
    /// File name decoded from the `/UF` entry, falling back to `/F`; empty
    /// when neither entry yields a string.
    pub filename: String,
    /// Text of the `/Desc` entry, if present.
    pub description: Option<String>,
    /// MIME type read from the embedded stream's `/Subtype` name, with the
    /// `#2F` escape turned back into `/`.
    pub mime_type: Option<String>,
    /// Value of `/Params /Size` on the embedded stream.
    pub size: Option<usize>,
    /// Raw bytes of `/Params /CheckSum`; `extract_file` compares them with
    /// the MD5 digest of the decoded stream data.
    pub checksum: Option<Vec<u8>>,
    /// `/Params /CreationDate` as a string; the date is not parsed.
    pub creation_date: Option<String>,
    /// `/Params /ModDate` as a string; the date is not parsed.
    pub mod_date: Option<String>,
    /// Reference to the embedded file stream taken from `/EF /F`; `None`
    /// when the file specification embeds no stream.
    pub ef_stream_ref: Option<IndirectRef>,
}
39
40fn obj_to_string(obj: &PdfObject) -> Option<String> {
46 match obj {
47 PdfObject::String(bytes) => {
48 if bytes.len() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF {
50 let chars: Vec<u16> = bytes[2..]
51 .chunks(2)
52 .filter_map(|c| {
53 if c.len() == 2 {
54 Some(u16::from_be_bytes([c[0], c[1]]))
55 } else {
56 None
57 }
58 })
59 .collect();
60 String::from_utf16(&chars).ok()
61 } else {
62 Some(String::from_utf8_lossy(bytes).into_owned())
63 }
64 }
65 _ => None,
66 }
67}
68
69fn parse_file_spec_dict(
71 doc: &PdfDocument,
72 dict: &PdfDict,
73) -> Result<FileSpec> {
74 let filename = dict
76 .get(b"UF")
77 .and_then(obj_to_string)
78 .or_else(|| dict.get(b"F").and_then(obj_to_string))
79 .unwrap_or_default();
80
81 let description = dict.get(b"Desc").and_then(obj_to_string);
82
83 let mut ef_stream_ref: Option<IndirectRef> = None;
85 let mut mime_type: Option<String> = None;
86 let mut size: Option<usize> = None;
87 let mut checksum: Option<Vec<u8>> = None;
88 let mut creation_date: Option<String> = None;
89 let mut mod_date: Option<String> = None;
90
91 if let Some(ef_dict) = resolve_dict(doc, dict, b"EF")? {
92 if let Some(r) = ef_dict.get_ref(b"F") {
94 let stream_ref = r.clone();
95
96 if let Ok(stream_obj) = doc.resolve(&stream_ref) {
98 if let PdfObject::Stream { dict: s_dict, .. } = stream_obj {
99 if let Some(name) = s_dict.get_name(b"Subtype") {
101 let raw = String::from_utf8_lossy(name).into_owned();
102 mime_type = Some(raw.replace("#2F", "/"));
104 }
105
106 if let Some(params) = s_dict.get_dict(b"Params") {
108 size = params.get_i64(b"Size").map(|v| v as usize);
109 checksum = params.get_string(b"CheckSum").map(|b| b.to_vec());
110 creation_date = params.get(b"CreationDate").and_then(obj_to_string);
111 mod_date = params.get(b"ModDate").and_then(obj_to_string);
112 }
113 }
114 }
115
116 ef_stream_ref = Some(stream_ref);
117 }
118 }
119
120 Ok(FileSpec {
121 filename,
122 description,
123 mime_type,
124 size,
125 checksum,
126 creation_date,
127 mod_date,
128 ef_stream_ref,
129 })
130}
131
132fn resolve_dict<'a>(
134 doc: &'a PdfDocument,
135 parent: &PdfDict,
136 key: &[u8],
137) -> Result<Option<PdfDict>> {
138 match parent.get(key) {
139 Some(PdfObject::Dict(d)) => Ok(Some(d.clone())),
140 Some(PdfObject::Reference(r)) => {
141 let r = r.clone();
142 let obj = doc.resolve(&r)?;
143 match obj {
144 PdfObject::Dict(d) => Ok(Some(d)),
145 _ => Ok(None),
146 }
147 }
148 _ => Ok(None),
149 }
150}
151
152pub fn read_embedded_files(doc: &PdfDocument) -> Result<Vec<FileSpec>> {
161 let catalog_ref = match doc.catalog_ref() {
163 Some(r) => r.clone(),
164 None => return Ok(Vec::new()),
165 };
166 let catalog = match doc.resolve(&catalog_ref)? {
167 PdfObject::Dict(d) => d,
168 _ => return Ok(Vec::new()),
169 };
170
171 let names_dict = match resolve_dict(doc, &catalog, b"Names")? {
173 Some(d) => d,
174 None => return Ok(Vec::new()),
175 };
176
177 let ef_tree = match resolve_dict(doc, &names_dict, b"EmbeddedFiles")? {
179 Some(d) => d,
180 None => return Ok(Vec::new()),
181 };
182
183 let mut file_specs = Vec::new();
185 collect_name_tree_values(doc, &ef_tree, &mut file_specs)?;
186
187 Ok(file_specs)
188}
189
190fn collect_name_tree_values(
192 doc: &PdfDocument,
193 node: &PdfDict,
194 out: &mut Vec<FileSpec>,
195) -> Result<()> {
196 if let Some(names_arr) = node.get_array(b"Names") {
198 let pairs: Vec<PdfObject> = names_arr.to_vec();
199 let mut i = 0;
200 while i + 1 < pairs.len() {
201 let value = &pairs[i + 1];
203 let fs_dict = match value {
204 PdfObject::Dict(d) => Some(d.clone()),
205 PdfObject::Reference(r) => {
206 let r = r.clone();
207 match doc.resolve(&r)? {
208 PdfObject::Dict(d) => Some(d),
209 _ => None,
210 }
211 }
212 _ => None,
213 };
214 if let Some(d) = fs_dict {
215 out.push(parse_file_spec_dict(doc, &d)?);
216 }
217 i += 2;
218 }
219 }
220
221 if let Some(kids_arr) = node.get_array(b"Kids") {
223 let kids: Vec<PdfObject> = kids_arr.to_vec();
224 for kid in &kids {
225 if let PdfObject::Reference(r) = kid {
226 let r = r.clone();
227 let child = doc.resolve(&r)?;
228 if let PdfObject::Dict(d) = child {
229 collect_name_tree_values(doc, &d, out)?;
230 }
231 }
232 }
233 }
234
235 Ok(())
236}
237
238pub fn extract_file(doc: &PdfDocument, file_spec: &FileSpec) -> Result<Vec<u8>> {
247 let stream_ref = file_spec.ef_stream_ref.as_ref().ok_or_else(|| {
248 JustPdfError::StreamDecode {
249 filter: String::new(),
250 detail: "FileSpec has no embedded file stream reference".into(),
251 }
252 })?;
253
254 let stream_obj = doc.resolve(stream_ref)?;
255 let (dict, raw_data) = match &stream_obj {
256 PdfObject::Stream { dict, data } => (dict, data.as_slice()),
257 _ => {
258 return Err(JustPdfError::StreamDecode {
259 filter: String::new(),
260 detail: "EF stream reference does not point to a stream object".into(),
261 });
262 }
263 };
264
265 let decoded = stream::decode_stream(raw_data, dict)?;
266
267 if let Some(expected) = &file_spec.checksum {
269 let mut hasher = Md5::new();
270 hasher.update(&decoded);
271 let computed = hasher.finalize();
272 if computed.as_slice() != expected.as_slice() {
273 return Err(JustPdfError::StreamDecode {
274 filter: String::new(),
275 detail: "embedded file MD5 checksum mismatch".into(),
276 });
277 }
278 }
279
280 Ok(decoded)
281}
282
283pub fn add_embedded_file(
293 modifier: &mut DocumentModifier,
294 filename: &str,
295 data: &[u8],
296 mime_type: Option<&str>,
297 description: Option<&str>,
298) -> Result<IndirectRef> {
299 let mut hasher = Md5::new();
301 hasher.update(data);
302 let checksum = hasher.finalize().to_vec();
303
304 let (mut stream_dict, compressed) = make_stream(data, true);
306
307 stream_dict.insert(b"Type".to_vec(), PdfObject::Name(b"EmbeddedFile".to_vec()));
309
310 if let Some(mt) = mime_type {
312 let name_encoded = mt.replace('/', "#2F");
313 stream_dict.insert(
314 b"Subtype".to_vec(),
315 PdfObject::Name(name_encoded.into_bytes()),
316 );
317 }
318
319 let mut params = PdfDict::new();
321 params.insert(b"Size".to_vec(), PdfObject::Integer(data.len() as i64));
322 params.insert(b"CheckSum".to_vec(), PdfObject::String(checksum));
323 stream_dict.insert(b"Params".to_vec(), PdfObject::Dict(params));
324
325 let stream_ref = modifier.add_object(PdfObject::Stream {
326 dict: stream_dict,
327 data: compressed,
328 });
329
330 let mut fs_dict = PdfDict::new();
332 fs_dict.insert(b"Type".to_vec(), PdfObject::Name(b"Filespec".to_vec()));
333 fs_dict.insert(
334 b"F".to_vec(),
335 PdfObject::String(filename.as_bytes().to_vec()),
336 );
337 fs_dict.insert(
338 b"UF".to_vec(),
339 PdfObject::String(filename.as_bytes().to_vec()),
340 );
341
342 if let Some(desc) = description {
343 fs_dict.insert(
344 b"Desc".to_vec(),
345 PdfObject::String(desc.as_bytes().to_vec()),
346 );
347 }
348
349 let mut ef_dict = PdfDict::new();
351 ef_dict.insert(
352 b"F".to_vec(),
353 PdfObject::Reference(stream_ref),
354 );
355 fs_dict.insert(b"EF".to_vec(), PdfObject::Dict(ef_dict));
356
357 let fs_ref = modifier.add_object(PdfObject::Dict(fs_dict));
358
359 wire_into_name_tree(modifier, filename, &fs_ref)?;
361
362 Ok(fs_ref)
363}
364
365fn wire_into_name_tree(
368 modifier: &mut DocumentModifier,
369 filename: &str,
370 fs_ref: &IndirectRef,
371) -> Result<()> {
372 let catalog_obj_num = modifier.catalog_ref().obj_num;
373
374 let mut catalog = match modifier.find_object_pub(catalog_obj_num) {
376 Some(PdfObject::Dict(d)) => d.clone(),
377 _ => PdfDict::new(),
378 };
379
380 let (names_obj_num, mut names_dict) = match catalog.get(b"Names") {
382 Some(PdfObject::Reference(r)) => {
383 let num = r.obj_num;
384 match modifier.find_object_pub(num) {
385 Some(PdfObject::Dict(d)) => (Some(num), d.clone()),
386 _ => (Some(num), PdfDict::new()),
387 }
388 }
389 Some(PdfObject::Dict(d)) => (None, d.clone()),
390 _ => (None, PdfDict::new()),
391 };
392
393 let (ef_obj_num, mut ef_dict) = match names_dict.get(b"EmbeddedFiles") {
395 Some(PdfObject::Reference(r)) => {
396 let num = r.obj_num;
397 match modifier.find_object_pub(num) {
398 Some(PdfObject::Dict(d)) => (Some(num), d.clone()),
399 _ => (Some(num), PdfDict::new()),
400 }
401 }
402 Some(PdfObject::Dict(d)) => (None, d.clone()),
403 _ => (None, PdfDict::new()),
404 };
405
406 let mut names_arr = match ef_dict.get(b"Names") {
408 Some(PdfObject::Array(a)) => a.clone(),
409 _ => Vec::new(),
410 };
411 names_arr.push(PdfObject::String(filename.as_bytes().to_vec()));
412 names_arr.push(PdfObject::Reference(fs_ref.clone()));
413 ef_dict.insert(b"Names".to_vec(), PdfObject::Array(names_arr));
414
415 match ef_obj_num {
417 Some(num) => {
418 modifier.set_object(num, PdfObject::Dict(ef_dict));
419 }
420 None => {
421 let ef_ref = modifier.add_object(PdfObject::Dict(ef_dict));
422 names_dict.insert(
423 b"EmbeddedFiles".to_vec(),
424 PdfObject::Reference(ef_ref),
425 );
426 }
427 }
428
429 match names_obj_num {
431 Some(num) => {
432 modifier.set_object(num, PdfObject::Dict(names_dict));
433 }
434 None => {
435 let names_ref = modifier.add_object(PdfObject::Dict(names_dict));
436 catalog.insert(b"Names".to_vec(), PdfObject::Reference(names_ref));
437 }
438 }
439
440 modifier.set_object(catalog_obj_num, PdfObject::Dict(catalog));
442
443 Ok(())
444}
445
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a `/Filespec` dictionary whose `/UF` and `/F` both hold
    /// `filename`, with an optional `/Desc` and an optional `/EF` dictionary
    /// pointing at `stream_ref`.
    fn make_sample_fs_dict(
        filename: &str,
        desc: Option<&str>,
        stream_ref: Option<IndirectRef>,
    ) -> PdfDict {
        let name_bytes = filename.as_bytes().to_vec();

        let mut fs = PdfDict::new();
        fs.insert(b"Type".to_vec(), PdfObject::Name(b"Filespec".to_vec()));
        fs.insert(b"UF".to_vec(), PdfObject::String(name_bytes.clone()));
        fs.insert(b"F".to_vec(), PdfObject::String(name_bytes));

        if let Some(text) = desc {
            fs.insert(b"Desc".to_vec(), PdfObject::String(text.as_bytes().to_vec()));
        }

        if let Some(target) = stream_ref {
            let mut embedded = PdfDict::new();
            embedded.insert(b"F".to_vec(), PdfObject::Reference(target));
            fs.insert(b"EF".to_vec(), PdfObject::Dict(embedded));
        }

        fs
    }

    /// With only `/F` present, the `/UF`-then-`/F` lookup yields `/F`.
    #[test]
    fn test_parse_file_spec_minimal() {
        let mut dict = PdfDict::new();
        dict.insert(b"F".to_vec(), PdfObject::String(b"report.pdf".to_vec()));

        let filename = match dict.get(b"UF").and_then(obj_to_string) {
            Some(name) => name,
            None => dict.get(b"F").and_then(obj_to_string).unwrap_or_default(),
        };

        assert_eq!(filename, "report.pdf");
        assert!(dict.get(b"Desc").is_none());
        assert!(dict.get(b"EF").is_none());
    }

    /// Every entry written by the sample builder can be read back.
    #[test]
    fn test_parse_file_spec_all_fields() {
        let target = IndirectRef { obj_num: 42, gen_num: 0 };
        let dict = make_sample_fs_dict("attachment.txt", Some("A text attachment"), Some(target));

        let filename = dict.get(b"UF").and_then(obj_to_string).unwrap();
        assert_eq!(filename, "attachment.txt");

        let desc = dict.get(b"Desc").and_then(obj_to_string).unwrap();
        assert_eq!(desc, "A text attachment");

        let stream_ref = dict.get_dict(b"EF").unwrap().get_ref(b"F").unwrap();
        assert_eq!(stream_ref.obj_num, 42);
        assert_eq!(stream_ref.gen_num, 0);
    }

    /// An empty `/Names` array is returned as an empty array.
    #[test]
    fn test_empty_embedded_files_list() {
        let mut ef_tree = PdfDict::new();
        ef_tree.insert(b"Names".to_vec(), PdfObject::Array(Vec::new()));

        assert!(ef_tree.get_array(b"Names").unwrap().is_empty());
    }

    /// A `FileSpec` with only a file name has every optional field unset.
    #[test]
    fn test_file_spec_struct_defaults() {
        let fs = FileSpec {
            filename: String::from("test.pdf"),
            description: None,
            mime_type: None,
            size: None,
            checksum: None,
            creation_date: None,
            mod_date: None,
            ef_stream_ref: None,
        };

        assert_eq!(fs.filename, "test.pdf");
        assert!(fs.description.is_none());
        assert!(fs.mime_type.is_none());
        assert!(fs.size.is_none());
        assert!(fs.checksum.is_none());
        assert!(fs.creation_date.is_none());
        assert!(fs.mod_date.is_none());
        assert!(fs.ef_stream_ref.is_none());
    }

    /// A fully populated `FileSpec` exposes every value it was given.
    #[test]
    fn test_file_spec_struct_all_populated() {
        let checksum = vec![0xAB, 0xCD, 0xEF, 0x01];
        let fs = FileSpec {
            filename: String::from("data.csv"),
            description: Some(String::from("CSV export")),
            mime_type: Some(String::from("text/csv")),
            size: Some(1024),
            checksum: Some(checksum.clone()),
            creation_date: Some(String::from("D:20260101120000")),
            mod_date: Some(String::from("D:20260315090000")),
            ef_stream_ref: Some(IndirectRef { obj_num: 99, gen_num: 0 }),
        };

        assert_eq!(fs.filename, "data.csv");
        assert_eq!(fs.description.as_deref(), Some("CSV export"));
        assert_eq!(fs.mime_type.as_deref(), Some("text/csv"));
        assert_eq!(fs.size, Some(1024));
        assert_eq!(fs.checksum.as_deref(), Some(checksum.as_slice()));
        assert_eq!(fs.creation_date.as_deref(), Some("D:20260101120000"));
        assert_eq!(fs.mod_date.as_deref(), Some("D:20260315090000"));
        assert_eq!(fs.ef_stream_ref.as_ref().unwrap().obj_num, 99);
    }

    /// Plain ASCII bytes decode to the same text.
    #[test]
    fn test_obj_to_string_latin() {
        let decoded = obj_to_string(&PdfObject::String(b"hello.txt".to_vec()));
        assert_eq!(decoded, Some(String::from("hello.txt")));
    }

    /// Bytes behind an `FE FF` marker decode as UTF-16BE.
    #[test]
    fn test_obj_to_string_utf16be() {
        let obj = PdfObject::String(vec![0xFE, 0xFF, 0x00, 0x41, 0x00, 0x42]);
        assert_eq!(obj_to_string(&obj), Some(String::from("AB")));
    }

    /// Objects that are not strings decode to `None`.
    #[test]
    fn test_obj_to_string_non_string() {
        assert_eq!(obj_to_string(&PdfObject::Integer(42)), None);
    }

    /// The `/` <-> `#2F` substitution used for MIME names round-trips.
    #[test]
    fn test_mime_type_name_encoding() {
        let mime = "application/pdf";

        let encoded = mime.replace('/', "#2F");
        assert_eq!(encoded, "application#2Fpdf");

        let decoded = encoded.replace("#2F", "/");
        assert_eq!(decoded, mime);
    }

    /// MD5 digests are 16 bytes long and repeatable for the same input.
    #[test]
    fn test_md5_checksum_computation() {
        let data = b"Hello, embedded file!";
        let hash_once = || {
            let mut hasher = Md5::new();
            hasher.update(data);
            hasher.finalize()
        };

        let first = hash_once();
        assert_eq!(first.len(), 16);

        let second = hash_once();
        assert_eq!(first.as_slice(), second.as_slice());
    }

    /// Without a stream reference the sample dictionary has no `/EF` entry.
    #[test]
    fn test_make_sample_fs_dict_structure() {
        let dict = make_sample_fs_dict("test.pdf", Some("Test"), None);

        assert_eq!(dict.get_name(b"Type"), Some(b"Filespec".as_slice()));
        assert_eq!(dict.get_string(b"UF"), Some(b"test.pdf".as_slice()));
        assert_eq!(dict.get_string(b"F"), Some(b"test.pdf".as_slice()));
        assert_eq!(dict.get_string(b"Desc"), Some(b"Test".as_slice()));
        assert!(dict.get(b"EF").is_none());
    }

    /// With a stream reference the sample dictionary exposes it via `/EF /F`.
    #[test]
    fn test_make_sample_fs_dict_with_ef() {
        let target = IndirectRef { obj_num: 7, gen_num: 0 };
        let dict = make_sample_fs_dict("data.bin", None, Some(target));

        assert!(dict.get(b"Desc").is_none());
        let embedded = dict.get_dict(b"EF").unwrap();
        assert_eq!(embedded.get_ref(b"F").unwrap().obj_num, 7);
    }
}