Skip to main content

folio_doc/
info.rs

1//! Document info dictionary.
2
3use folio_cos::PdfObject;
4use indexmap::IndexMap;
5
6/// Document metadata from the Info dictionary.
7#[derive(Debug, Clone, Default)]
8pub struct DocInfo {
9    dict: IndexMap<Vec<u8>, PdfObject>,
10}
11
12impl DocInfo {
13    pub(crate) fn from_dict(dict: IndexMap<Vec<u8>, PdfObject>) -> Self {
14        Self { dict }
15    }
16
17    fn get_text(&self, key: &[u8]) -> Option<String> {
18        self.dict.get(key).and_then(|obj| match obj {
19            PdfObject::Str(s) => Some(decode_pdf_text(s)),
20            _ => None,
21        })
22    }
23
24    /// Get the document title.
25    pub fn title(&self) -> Option<String> {
26        self.get_text(b"Title")
27    }
28
29    /// Get the document author.
30    pub fn author(&self) -> Option<String> {
31        self.get_text(b"Author")
32    }
33
34    /// Get the document subject.
35    pub fn subject(&self) -> Option<String> {
36        self.get_text(b"Subject")
37    }
38
39    /// Get the document keywords.
40    pub fn keywords(&self) -> Option<String> {
41        self.get_text(b"Keywords")
42    }
43
44    /// Get the creator (application that created the original document).
45    pub fn creator(&self) -> Option<String> {
46        self.get_text(b"Creator")
47    }
48
49    /// Get the producer (application that created the PDF).
50    pub fn producer(&self) -> Option<String> {
51        self.get_text(b"Producer")
52    }
53
54    /// Get the creation date as a raw string.
55    pub fn creation_date(&self) -> Option<String> {
56        self.get_text(b"CreationDate")
57    }
58
59    /// Get the modification date as a raw string.
60    pub fn mod_date(&self) -> Option<String> {
61        self.get_text(b"ModDate")
62    }
63}
64
/// Decode a PDF text string to a Rust `String`.
///
/// The encoding is selected from the leading bytes:
/// - `FE FF`: UTF-16BE. Unpaired surrogates become U+FFFD, and a dangling
///   final byte (odd payload length) is ignored.
/// - `EF BB BF`: UTF-8 with BOM. Invalid sequences become U+FFFD.
/// - anything else: no BOM. The bytes are first tried as strict UTF-8, which
///   covers pure ASCII and keeps BOM-less UTF-8 input readable. If they are
///   not well-formed UTF-8, every byte is decoded through the PDFDocEncoding
///   table, so Latin-1 letters, typographic quotes, the bullet, the Euro
///   sign, etc. are preserved instead of being replaced by U+FFFD.
///
/// The BOM itself is never included in the returned string.
fn decode_pdf_text(data: &[u8]) -> String {
    match data {
        [0xFE, 0xFF, payload @ ..] => {
            // `chunks_exact` yields only complete big-endian code units; a
            // trailing odd byte cannot form a unit and is left out.
            let units: Vec<u16> = payload
                .chunks_exact(2)
                .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
                .collect();
            String::from_utf16_lossy(&units)
        }
        [0xEF, 0xBB, 0xBF, payload @ ..] => String::from_utf8_lossy(payload).into_owned(),
        _ => match std::str::from_utf8(data) {
            Ok(text) => text.to_owned(),
            // Not UTF-8: interpret each byte as one PDFDocEncoding character.
            Err(_) => data.iter().map(|&byte| pdf_doc_char(byte)).collect(),
        },
    }
}

/// Map a single PDFDocEncoding byte to its Unicode character.
///
/// Codes that PDFDocEncoding leaves undefined (0x7F, 0x9F, 0xAD) map to
/// U+FFFD. Bytes below 0x18 are passed through unchanged.
fn pdf_doc_char(byte: u8) -> char {
    // 0x18..=0x1F: spacing accent characters.
    const ACCENTS: [char; 8] = [
        '\u{02D8}', '\u{02C7}', '\u{02C6}', '\u{02D9}', '\u{02DD}', '\u{02DB}', '\u{02DA}',
        '\u{02DC}',
    ];
    // 0x80..=0xA0: punctuation, ligatures and extra Latin letters.
    const HIGH: [char; 33] = [
        // 0x80..=0x87
        '\u{2022}', '\u{2020}', '\u{2021}', '\u{2026}', '\u{2014}', '\u{2013}', '\u{0192}',
        '\u{2044}',
        // 0x88..=0x8F
        '\u{2039}', '\u{203A}', '\u{2212}', '\u{2030}', '\u{201E}', '\u{201C}', '\u{201D}',
        '\u{2018}',
        // 0x90..=0x97
        '\u{2019}', '\u{201A}', '\u{2122}', '\u{FB01}', '\u{FB02}', '\u{0141}', '\u{0152}',
        '\u{0160}',
        // 0x98..=0x9F (0x9F is undefined)
        '\u{0178}', '\u{017D}', '\u{0131}', '\u{0142}', '\u{0153}', '\u{0161}', '\u{017E}',
        '\u{FFFD}',
        // 0xA0
        '\u{20AC}',
    ];

    match byte {
        0x18..=0x1F => ACCENTS[usize::from(byte - 0x18)],
        0x80..=0xA0 => HIGH[usize::from(byte - 0x80)],
        0x7F | 0xAD => '\u{FFFD}',
        // Remaining codes coincide with ASCII / Latin-1 code points.
        _ => char::from(byte),
    }
}