Skip to main content

rpdfium_doc/
file_spec.rs

1//! PDF file specification dictionary (ISO 32000-2 section 7.11).
2//!
3//! A file specification dictionary identifies a file, either external or
4//! embedded, and can provide platform-specific filenames plus an embedded
5//! file stream reference.
6
7use std::collections::HashMap;
8
9use rpdfium_core::{Name, PdfSource};
10use rpdfium_parser::{Object, ObjectId, ObjectStore};
11
12use crate::error::{DocError, DocResult};
13use crate::name_tree::NameTree;
14
15/// A parsed PDF file specification dictionary.
16#[derive(Debug, Clone)]
17pub struct FileSpec {
18    /// File system name (`/FS`).
19    pub file_system: Option<String>,
20    /// Platform-independent filename (`/F`).
21    pub filename: Option<String>,
22    /// Unicode filename (`/UF`).
23    pub unicode_filename: Option<String>,
24    /// DOS filename (`/DOS`).
25    pub dos_filename: Option<String>,
26    /// Unix filename (`/Unix`).
27    pub unix_filename: Option<String>,
28    /// Indirect reference to the embedded file stream (from `/EF` sub-dict `/F`).
29    pub embedded_file: Option<ObjectId>,
30    /// Description of the file (`/Desc`).
31    pub description: Option<String>,
32    /// Decoded bytes of the embedded file stream, if available.
33    ///
34    /// Populated during parsing from the `/EF /F` stream when the
35    /// ObjectStore can decode it.  Corresponds to the buffer returned by
36    /// `FPDFAttachment_GetUnderlyingFile`.
37    pub data: Option<Vec<u8>>,
38}
39
40/// Parse a file specification dictionary.
41///
42/// Returns `None` if the object is not a valid file specification dictionary.
43pub fn parse_file_spec<S: PdfSource>(obj: &Object, store: &ObjectStore<S>) -> Option<FileSpec> {
44    let resolved = store.deep_resolve(obj).ok()?;
45    let dict = resolved.as_dict()?;
46
47    let file_system = dict
48        .get(&Name::fs())
49        .and_then(|o| store.deep_resolve(o).ok())
50        .and_then(|o| o.as_name().map(|n| n.as_str().into_owned()));
51
52    let filename = extract_string(dict, &Name::f(), store);
53
54    let unicode_filename = extract_string(dict, &Name::uf(), store);
55
56    let dos_filename = extract_string(dict, &Name::dos(), store);
57
58    let unix_filename = extract_string(dict, &Name::unix_name(), store);
59
60    let ef_resolved = dict
61        .get(&Name::ef())
62        .and_then(|o| store.deep_resolve(o).ok());
63
64    let embedded_file = ef_resolved
65        .as_ref()
66        .and_then(|o| o.as_dict().cloned())
67        .and_then(|ef_dict| ef_dict.get(&Name::f()).and_then(|o| o.as_reference()));
68
69    // Attempt to decode the embedded file stream bytes.
70    let data: Option<Vec<u8>> = embedded_file.and_then(|stream_id| {
71        let stream_obj = store.resolve(stream_id).ok()?;
72        store.decode_stream(stream_obj).ok()
73    });
74
75    let description = extract_string(dict, &Name::desc(), store);
76
77    Some(FileSpec {
78        file_system,
79        filename,
80        unicode_filename,
81        dos_filename,
82        unix_filename,
83        embedded_file,
84        description,
85        data,
86    })
87}
88
89impl FileSpec {
90    /// Returns the best available filename for this attachment.
91    ///
92    /// Prefers the Unicode filename (`/UF`) over the platform-encoded filename
93    /// (`/F`), with further fallbacks to Unix and DOS filenames.
94    ///
95    /// Corresponds to `FPDFAttachment_GetName`.
96    pub fn name(&self) -> Option<&str> {
97        self.unicode_filename
98            .as_deref()
99            .or(self.filename.as_deref())
100            .or(self.unix_filename.as_deref())
101            .or(self.dos_filename.as_deref())
102    }
103
104    /// ADR-019 T2 alias for [`name()`](Self::name).
105    ///
106    /// Corresponds to `FPDFAttachment_GetName`.
107    #[inline]
108    pub fn attachment_get_name(&self) -> Option<&str> {
109        self.name()
110    }
111
112    /// Deprecated — use [`attachment_get_name()`](Self::attachment_get_name).
113    ///
114    /// Corresponds to `FPDFAttachment_GetName`.
115    #[deprecated(note = "use `attachment_get_name()` — matches upstream `FPDFAttachment_GetName`")]
116    #[inline]
117    pub fn get_name(&self) -> Option<&str> {
118        self.name()
119    }
120
121    /// Returns the decoded bytes of the embedded file stream, if available.
122    ///
123    /// This is the primary data accessor for the embedded file content.
124    /// Returns `None` if no embedded file data is present or if decoding
125    /// failed during parsing.
126    ///
127    /// Corresponds to `FPDFAttachment_GetFile`.
128    pub fn file_data(&self) -> Option<&[u8]> {
129        self.data.as_deref()
130    }
131
132    /// ADR-019 T2 alias for [`file_data()`](Self::file_data).
133    ///
134    /// Corresponds to `FPDFAttachment_GetFile`.
135    #[inline]
136    pub fn attachment_get_file(&self) -> Option<&[u8]> {
137        self.file_data()
138    }
139
140    /// Deprecated — use [`attachment_get_file()`](Self::attachment_get_file).
141    ///
142    /// Corresponds to `FPDFAttachment_GetFile`.
143    #[deprecated(note = "use `attachment_get_file()` — matches upstream `FPDFAttachment_GetFile`")]
144    #[inline]
145    pub fn get_file(&self) -> Option<&[u8]> {
146        self.file_data()
147    }
148
149    /// Returns the MIME type (Subtype) of the embedded file, if present.
150    ///
151    /// Reads the `/Subtype` entry from the embedded file stream dictionary.
152    /// Returns `None` if no subtype is recorded.
153    ///
154    /// Corresponds to `FPDFAttachment_GetSubtype`.
155    ///
156    /// Note: `FileSpec` is parsed from the file specification dictionary; the
157    /// subtype lives in the embedded file stream (`/EF /F` stream dict `/Subtype`).
158    /// This field is not currently extracted during parsing — `None` is always
159    /// returned in this release.
160    pub fn subtype(&self) -> Option<&str> {
161        None
162    }
163
164    /// Deprecated — use [`subtype()`](Self::subtype) — no public `FPDFAttachment_GetSubtype` API.
165    #[deprecated(note = "use `subtype()` — there is no public `FPDFAttachment_GetSubtype` API")]
166    #[inline]
167    pub fn get_subtype(&self) -> Option<&str> {
168        self.subtype()
169    }
170
171    /// Returns the raw decoded bytes of the embedded file, if available.
172    ///
173    /// This is populated during document parsing when the embedded file stream
174    /// can be decoded.  Returns `None` if no embedded file data is present or
175    /// if decoding failed during parsing.
176    ///
177    /// Corresponds to `FPDFAttachment_GetUnderlyingFile`.
178    pub fn underlying_bytes(&self) -> Option<&[u8]> {
179        self.data.as_deref()
180    }
181
182    /// Deprecated — use [`underlying_bytes()`](Self::underlying_bytes) — no public FPDF_* API.
183    #[deprecated(
184        note = "use `underlying_bytes()` — there is no public `FPDFAttachment_GetUnderlyingFile` API"
185    )]
186    #[inline]
187    pub fn get_underlying_bytes(&self) -> Option<&[u8]> {
188        self.underlying_bytes()
189    }
190
191    /// Set the filename.
192    ///
193    /// Updates both the PDF-encoded `/F` filename and the Unicode `/UF` filename
194    /// in memory. To persist the change to a PDF file, use `EditDocument` in
195    /// rpdfium-edit.
196    pub fn set_filename(&mut self, filename: &str) -> DocResult<()> {
197        self.filename = Some(encode_filename(filename));
198        self.unicode_filename = Some(filename.to_string());
199        Ok(())
200    }
201
202    /// Returns the best available filename, preferring Unicode over platform-specific.
203    ///
204    /// Deprecated: use [`name()`](Self::name) instead (primary) or
205    /// [`get_name()`](Self::get_name) (upstream alias).
206    #[deprecated(since = "0.1.0", note = "use name() instead")]
207    #[inline]
208    pub fn best_filename(&self) -> Option<&str> {
209        self.name()
210    }
211}
212
/// Encode a platform path to PDF file specification format.
///
/// Every `\` is converted to `/`. If the result starts with a Windows drive
/// specifier — a single ASCII letter followed by `:` — the specifier is
/// rewritten as a leading path component
/// (e.g., `C:\dir\file.pdf` → `/C/dir/file.pdf`).
///
/// Two details keep the drive handling from corrupting other inputs:
///
/// * A `:` in second position only counts as a drive specifier when the first
///   byte is an ASCII letter, matching the check in [`decode_filename`].
///   Something like `1:30.txt` is returned with only its separators
///   normalised.
/// * When the drive specifier is not followed by a separator
///   (`C:file.pdf`), a `/` is inserted so the drive letter does not fuse with
///   the first path component (`/C/file.pdf`, not `/Cfile.pdf`).
///
/// A bare drive (`C:`) encodes to `/C`.
pub fn encode_filename(path: &str) -> String {
    let normalized = path.replace('\\', "/");

    let has_drive = matches!(
        normalized.as_bytes(),
        [letter, b':', ..] if letter.is_ascii_alphabetic()
    );
    if !has_drive {
        return normalized;
    }

    // The first two bytes are ASCII here, so byte offsets 1 and 2 are
    // guaranteed to fall on character boundaries.
    let drive = &normalized[..1];
    let rest = &normalized[2..];
    let separator = if rest.is_empty() || rest.starts_with('/') {
        ""
    } else {
        "/"
    };
    format!("/{drive}{separator}{rest}")
}
228
/// Decode a PDF file specification path back to a drive-letter path.
///
/// A path of the shape `/X/...`, where `X` is a single ASCII letter, is
/// treated as an encoded drive and rewritten to `X:/...`
/// (e.g., `/C/Users/doc.pdf` → `C:/Users/doc.pdf`). Separators are left as
/// `/`; no conversion to a platform-specific separator takes place.
///
/// Any other input — relative paths, absolute paths whose first component is
/// longer than one letter, strings shorter than three bytes — is returned
/// unchanged as an owned `String`.
pub fn decode_filename(path: &str) -> String {
    match path.as_bytes() {
        // All three matched bytes are ASCII, so slicing at offsets 1 and 2
        // cannot split a multi-byte character.
        [b'/', letter, b'/', ..] if letter.is_ascii_alphabetic() => {
            let (drive, tail) = (&path[1..2], &path[2..]);
            format!("{drive}:{tail}")
        }
        _ => path.to_owned(),
    }
}
246
247/// Collect all embedded file attachments from the document catalog.
248///
249/// Walks the `/Root/Names/EmbeddedFiles` name tree and returns all
250/// [`FileSpec`] entries. Returns an empty `Vec` if the document has no
251/// attachments or the names tree is absent.
252///
253/// Corresponds to `FPDFDoc_GetAttachmentCount` / `FPDFDoc_GetAttachment` in
254/// PDFium's `fpdf_attachment.h`.
255pub fn collect_attachments<S: PdfSource>(
256    catalog: &Object,
257    store: &ObjectStore<S>,
258) -> DocResult<Vec<FileSpec>> {
259    // /Root/Names/EmbeddedFiles
260    let catalog_dict = match catalog.as_dict() {
261        Some(d) => d,
262        None => return Ok(Vec::new()),
263    };
264
265    let names_obj = match catalog_dict
266        .get(&Name::names())
267        .and_then(|o| store.deep_resolve(o).ok())
268    {
269        Some(o) => o,
270        None => return Ok(Vec::new()),
271    };
272
273    let names_dict = match names_obj.as_dict() {
274        Some(d) => d,
275        None => return Ok(Vec::new()),
276    };
277
278    let ef_obj = match names_dict
279        .get(&Name::embedded_files())
280        .and_then(|o| store.deep_resolve(o).ok())
281    {
282        Some(o) => o,
283        None => return Ok(Vec::new()),
284    };
285
286    // Parse the EmbeddedFiles name tree.  Values are file specification dicts.
287    let tree = NameTree::parse(ef_obj, store, |val_obj| {
288        parse_file_spec(val_obj, store).ok_or(DocError::UnexpectedType)
289    })?;
290
291    Ok(tree.entries().iter().map(|(_, v)| v.clone()).collect())
292}
293
294/// Extract a string value from a dictionary key.
295fn extract_string<S: PdfSource>(
296    dict: &HashMap<Name, Object>,
297    key: &Name,
298    store: &ObjectStore<S>,
299) -> Option<String> {
300    dict.get(key)
301        .and_then(|o| store.deep_resolve(o).ok())
302        .and_then(|o| o.as_string().map(|s| s.to_string_lossy()))
303}
304
#[cfg(test)]
mod tests {
    use super::*;
    use rpdfium_core::PdfString;

    // -----------------------------------------------------------------------
    // Fixtures
    // -----------------------------------------------------------------------

    /// Serialises a two-object PDF (catalog plus empty page tree) with a
    /// classic cross-reference table.
    fn build_minimal_pdf() -> Vec<u8> {
        let bodies: [&[u8]; 2] = [
            b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n",
            b"2 0 obj\n<< /Type /Pages /Kids [] /Count 0 >>\nendobj\n",
        ];

        let mut pdf = b"%PDF-1.4\n".to_vec();

        // Record where each object starts so the xref table can point at it.
        let mut offsets = Vec::with_capacity(bodies.len());
        for &body in bodies.iter() {
            offsets.push(pdf.len());
            pdf.extend_from_slice(body);
        }

        let xref_offset = pdf.len();
        pdf.extend_from_slice(b"xref\n0 3\n");
        pdf.extend_from_slice(b"0000000000 65535 f \r\n");
        for offset in &offsets {
            pdf.extend_from_slice(format!("{:010} 00000 n \r\n", offset).as_bytes());
        }
        pdf.extend_from_slice(b"trailer\n<< /Size 3 /Root 1 0 R >>\n");
        pdf.extend_from_slice(format!("startxref\n{}\n%%EOF", xref_offset).as_bytes());
        pdf
    }

    /// Opens an `ObjectStore` over the minimal PDF in lenient mode.
    fn build_store() -> ObjectStore<Vec<u8>> {
        ObjectStore::open(build_minimal_pdf(), rpdfium_core::ParsingMode::Lenient).unwrap()
    }

    /// Wraps `s` in a PDF string object.
    fn str_obj(s: &str) -> Object {
        Object::String(PdfString::from_bytes(Vec::from(s.as_bytes())))
    }

    /// Builds an `/EF` dictionary object whose `/F` entry references `target`.
    fn ef_with_f(target: ObjectId) -> Object {
        let mut ef = HashMap::new();
        ef.insert(Name::f(), Object::Reference(target));
        Object::Dictionary(ef)
    }

    /// A `FileSpec` with every field unset, used as the base for
    /// struct-update syntax in the tests below.
    fn blank_spec() -> FileSpec {
        FileSpec {
            file_system: None,
            filename: None,
            unicode_filename: None,
            dos_filename: None,
            unix_filename: None,
            embedded_file: None,
            description: None,
            data: None,
        }
    }

    // -----------------------------------------------------------------------
    // parse_file_spec
    // -----------------------------------------------------------------------

    #[test]
    fn test_parse_file_spec_full() {
        let store = build_store();
        let target = ObjectId::new(10, 0);

        let entries = vec![
            (Name::fs(), Object::Name(Name::from("URL"))),
            (Name::f(), str_obj("report.pdf")),
            (Name::uf(), str_obj("report.pdf")),
            (Name::dos(), str_obj("REPORT.PDF")),
            (Name::unix_name(), str_obj("/home/user/report.pdf")),
            (Name::ef(), ef_with_f(target)),
            (Name::desc(), str_obj("Annual report")),
        ];
        let obj = Object::Dictionary(entries.into_iter().collect());

        let spec = parse_file_spec(&obj, &store).unwrap();

        assert_eq!(spec.file_system.as_deref(), Some("URL"));
        assert_eq!(spec.filename.as_deref(), Some("report.pdf"));
        assert_eq!(spec.unicode_filename.as_deref(), Some("report.pdf"));
        assert_eq!(spec.dos_filename.as_deref(), Some("REPORT.PDF"));
        assert_eq!(spec.unix_filename.as_deref(), Some("/home/user/report.pdf"));
        assert_eq!(spec.embedded_file, Some(target));
        assert_eq!(spec.description.as_deref(), Some("Annual report"));
    }

    #[test]
    fn test_parse_file_spec_minimal() {
        let store = build_store();

        let entries = vec![(Name::f(), str_obj("data.txt"))];
        let obj = Object::Dictionary(entries.into_iter().collect());

        let spec = parse_file_spec(&obj, &store).unwrap();

        assert!(spec.file_system.is_none());
        assert_eq!(spec.filename.as_deref(), Some("data.txt"));
        assert!(spec.unicode_filename.is_none());
        assert!(spec.embedded_file.is_none());
    }

    #[test]
    fn test_parse_file_spec_not_dict_returns_none() {
        let store = build_store();
        assert!(parse_file_spec(&Object::Integer(42), &store).is_none());
    }

    /// Upstream: TEST(CPDFFileSpecTest, GetFileStream)
    ///
    /// Tests embedded file stream retrieval from the /EF dictionary.
    /// The upstream test builds an /EF dict with keys in precedence order
    /// (Unix, Mac, DOS, F, UF) and verifies the highest-precedence stream
    /// is returned. Here we verify that the `embedded_file` field is
    /// populated when /EF contains a /F reference, and is `None` when /EF
    /// is missing or empty.
    #[test]
    fn test_cpdf_file_spec_get_file_stream() {
        let store = build_store();

        // Case 1: No /EF dict => no embedded file
        let mut without_ef = HashMap::new();
        without_ef.insert(Name::f(), str_obj("test.pdf"));
        let spec = parse_file_spec(&Object::Dictionary(without_ef), &store).unwrap();
        assert!(spec.embedded_file.is_none());

        // Case 2: Empty /EF dict => no embedded file
        let mut empty_ef = HashMap::new();
        empty_ef.insert(Name::f(), str_obj("test.pdf"));
        empty_ef.insert(Name::ef(), Object::Dictionary(HashMap::new()));
        let spec = parse_file_spec(&Object::Dictionary(empty_ef), &store).unwrap();
        assert!(spec.embedded_file.is_none());

        // Case 3: /EF dict with /F reference
        let target = ObjectId::new(10, 0);
        let mut with_ef = HashMap::new();
        with_ef.insert(Name::f(), str_obj("test.pdf"));
        with_ef.insert(Name::ef(), ef_with_f(target));
        let spec = parse_file_spec(&Object::Dictionary(with_ef), &store).unwrap();
        assert_eq!(spec.embedded_file, Some(target));
    }

    /// Upstream: TEST(CPDFFileSpecTest, GetParamsDict)
    ///
    /// Tests /Params dictionary retrieval from an embedded file stream.
    /// Since rpdfium's FileSpec doesn't directly expose the /Params dict,
    /// this test verifies the related behavior: when the stream cannot be
    /// found in the store, `file_data()` returns `None` while the reference
    /// itself is still recorded.
    #[test]
    fn test_cpdf_file_spec_get_params_dict() {
        let store = build_store();

        // Non-dict object => parse_file_spec returns None
        let not_a_dict = Object::Name(Name::from("test.pdf"));
        assert!(parse_file_spec(&not_a_dict, &store).is_none());

        // Dict with /EF but stream not in store => data is None
        let missing = ObjectId::new(999, 0);
        let mut dict = HashMap::new();
        dict.insert(Name::uf(), str_obj("test.pdf"));
        dict.insert(Name::ef(), ef_with_f(missing));
        let spec = parse_file_spec(&Object::Dictionary(dict), &store).unwrap();

        // The reference points to a non-existent object, so data is None
        assert!(spec.file_data().is_none());
        assert_eq!(spec.embedded_file, Some(missing));
    }

    // -----------------------------------------------------------------------
    // set_filename / name
    // -----------------------------------------------------------------------

    #[test]
    fn test_set_filename_updates_in_memory() {
        let mut spec = FileSpec {
            filename: Some("test.pdf".into()),
            ..blank_spec()
        };
        spec.set_filename("new.pdf").unwrap();
        assert_eq!(spec.filename.as_deref(), Some("new.pdf"));
        assert_eq!(spec.unicode_filename.as_deref(), Some("new.pdf"));
    }

    #[test]
    fn test_best_filename_prefers_unicode() {
        let spec = FileSpec {
            filename: Some("fallback.pdf".into()),
            unicode_filename: Some("unicode.pdf".into()),
            ..blank_spec()
        };
        assert_eq!(spec.name(), Some("unicode.pdf"));
    }

    #[test]
    fn test_best_filename_falls_back() {
        let spec = FileSpec {
            dos_filename: Some("DOS.PDF".into()),
            ..blank_spec()
        };
        assert_eq!(spec.name(), Some("DOS.PDF"));
    }

    #[test]
    fn test_best_filename_none() {
        assert!(blank_spec().name().is_none());
    }

    // -----------------------------------------------------------------------
    // encode_filename / decode_filename
    // -----------------------------------------------------------------------

    #[test]
    fn test_encode_filename_unix() {
        assert_eq!(encode_filename("/home/user/doc.pdf"), "/home/user/doc.pdf");
    }

    #[test]
    fn test_encode_filename_windows() {
        assert_eq!(encode_filename("C:\\Users\\doc.pdf"), "/C/Users/doc.pdf");
    }

    #[test]
    fn test_encode_filename_already_pdf() {
        assert_eq!(encode_filename("/path/to/file.pdf"), "/path/to/file.pdf");
    }

    #[test]
    fn test_decode_filename_drive_letter() {
        assert_eq!(decode_filename("/C/Users/doc.pdf"), "C:/Users/doc.pdf");
    }

    #[test]
    fn test_decode_filename_unix() {
        assert_eq!(decode_filename("/home/user/doc.pdf"), "/home/user/doc.pdf");
    }

    #[test]
    fn test_decode_filename_no_drive() {
        assert_eq!(decode_filename("relative/path.pdf"), "relative/path.pdf");
    }

    // -----------------------------------------------------------------------
    // underlying_bytes
    // -----------------------------------------------------------------------

    #[test]
    fn test_underlying_bytes_returns_none_when_no_data() {
        let spec = FileSpec {
            filename: Some("report.pdf".into()),
            ..blank_spec()
        };
        assert!(spec.underlying_bytes().is_none());
    }

    #[test]
    fn test_underlying_bytes_returns_data_when_present() {
        let payload = b"Hello, embedded file!".to_vec();
        let spec = FileSpec {
            filename: Some("doc.txt".into()),
            data: Some(payload.clone()),
            ..blank_spec()
        };
        assert_eq!(spec.underlying_bytes(), Some(payload.as_slice()));
    }
}