Skip to main content

folio_font/
font.rs

1//! PDF font representation — loads font information from PDF dictionaries.
2
3use crate::cmap::ToUnicodeCMap;
4use crate::encoding::Encoding;
5use folio_core::Result;
6use folio_cos::{CosDoc, PdfObject};
7use std::collections::HashMap;
8
/// PDF font types, as identified by the `/Subtype` entry of a font dictionary.
///
/// `Unknown` is used for any subtype string that is not one of the names
/// listed here. The enum is a plain field-less tag, so it derives `Eq` and
/// `Hash` in addition to `PartialEq`, which lets it be used as a map/set key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum FontType {
    /// `/Type1` simple font.
    Type1,
    /// `/TrueType` simple font.
    TrueType,
    /// `/Type0` composite font.
    Type0,
    /// `/Type3` font.
    Type3,
    /// `/CIDFontType0` descendant font.
    CIDFontType0,
    /// `/CIDFontType2` descendant font.
    CIDFontType2,
    /// `/MMType1` font.
    MMType1,
    /// Any subtype not recognised above (including a missing subtype).
    Unknown,
}
21
22/// A loaded PDF font with encoding and metrics.
23#[derive(Debug)]
24pub struct PdfFont {
25    /// Font type.
26    pub font_type: FontType,
27    /// Base font name (e.g., "Helvetica", "ABCDEF+ArialMT").
28    pub base_font: String,
29    /// Text encoding for simple fonts.
30    pub encoding: Encoding,
31    /// ToUnicode CMap (if present).
32    pub to_unicode: Option<ToUnicodeCMap>,
33    /// Character widths: code -> width in 1/1000 units.
34    pub widths: HashMap<u32, f64>,
35    /// Default width for missing characters.
36    pub default_width: f64,
37    /// First character code in the Widths array.
38    pub first_char: u32,
39    /// Whether this is a CID (multi-byte) font.
40    pub is_cid: bool,
41}
42
43impl PdfFont {
44    /// Load a font from a PDF font dictionary.
45    pub fn from_dict(dict: &PdfObject, doc: &mut CosDoc) -> Result<Self> {
46        let subtype = dict.dict_get_name_str(b"Subtype").unwrap_or_default();
47        let base_font = dict.dict_get_name_str(b"BaseFont").unwrap_or_default();
48
49        let font_type = match subtype.as_str() {
50            "Type1" => FontType::Type1,
51            "TrueType" => FontType::TrueType,
52            "Type0" => FontType::Type0,
53            "Type3" => FontType::Type3,
54            "CIDFontType0" => FontType::CIDFontType0,
55            "CIDFontType2" => FontType::CIDFontType2,
56            "MMType1" => FontType::MMType1,
57            _ => FontType::Unknown,
58        };
59
60        let is_cid = matches!(font_type, FontType::Type0);
61
62        // Load encoding
63        let encoding = load_encoding(dict, doc);
64
65        // Load ToUnicode CMap
66        let to_unicode = load_tounicode(dict, doc);
67
68        // Load widths
69        let (widths, first_char, default_width) = if is_cid {
70            load_cid_widths(dict, doc)
71        } else {
72            load_simple_widths(dict)
73        };
74
75        Ok(PdfFont {
76            font_type,
77            base_font,
78            encoding,
79            to_unicode,
80            widths,
81            default_width,
82            first_char,
83            is_cid,
84        })
85    }
86
87    /// Get the width of a character code in 1/1000 units.
88    pub fn char_width(&self, code: u32) -> f64 {
89        self.widths
90            .get(&code)
91            .copied()
92            .unwrap_or(self.default_width)
93    }
94
95    /// Decode a byte string to Unicode using this font's encoding.
96    pub fn decode_text(&self, data: &[u8]) -> String {
97        crate::encoding::decode_text(data, &self.encoding, self.to_unicode.as_ref())
98    }
99}
100
101fn load_encoding(dict: &PdfObject, doc: &mut CosDoc) -> Encoding {
102    let base_font = dict.dict_get_name_str(b"BaseFont").unwrap_or_default();
103    let subtype = dict.dict_get_name_str(b"Subtype").unwrap_or_default();
104
105    // ZapfDingbats and Symbol have their own built-in encodings
106    let is_zapf = base_font == "ZapfDingbats" || base_font.ends_with("+ZapfDingbats");
107    let is_symbol = base_font == "Symbol" || base_font.ends_with("+Symbol");
108
109    match dict.dict_get(b"Encoding") {
110        Some(PdfObject::Name(name)) => Encoding::from_name(name),
111        Some(PdfObject::Dict(d)) => {
112            let default_base = if is_zapf {
113                b"ZapfDingbatsEncoding".as_slice()
114            } else if is_symbol {
115                b"SymbolEncoding".as_slice()
116            } else {
117                b"WinAnsiEncoding".as_slice()
118            };
119            let base_name = d
120                .get(b"BaseEncoding".as_slice())
121                .and_then(|o| o.as_name())
122                .unwrap_or(default_base);
123            let mut enc = Encoding::from_name(base_name);
124
125            if let Some(PdfObject::Array(diffs)) = d.get(b"Differences".as_slice()) {
126                enc.apply_differences(diffs);
127            }
128
129            enc
130        }
131        Some(PdfObject::Reference(id)) => {
132            if let Ok(Some(obj)) = doc.get_object(id.num) {
133                let obj = obj.clone();
134                return load_encoding_from_obj(&obj, doc, is_zapf, is_symbol);
135            }
136            Encoding::win_ansi()
137        }
138        None => {
139            // No explicit encoding — choose default based on font type
140            if is_zapf {
141                Encoding::zapf_dingbats()
142            } else if is_symbol {
143                Encoding::symbol()
144            } else if subtype == "TrueType" && is_subset_font(&base_font) {
145                // Subsetted TrueType fonts without encoding often use MacRoman
146                // (common in PDFs from Mac-based tools)
147                Encoding::mac_roman()
148            } else {
149                Encoding::win_ansi()
150            }
151        }
152        _ => Encoding::win_ansi(),
153    }
154}
155
156fn load_encoding_from_obj(
157    obj: &PdfObject,
158    _doc: &mut CosDoc,
159    is_zapf: bool,
160    is_symbol: bool,
161) -> Encoding {
162    match obj {
163        PdfObject::Name(name) => Encoding::from_name(name),
164        PdfObject::Dict(d) => {
165            let default_base = if is_zapf {
166                b"ZapfDingbatsEncoding".as_slice()
167            } else if is_symbol {
168                b"SymbolEncoding".as_slice()
169            } else {
170                b"WinAnsiEncoding".as_slice()
171            };
172            let base_name = d
173                .get(b"BaseEncoding".as_slice())
174                .and_then(|o| o.as_name())
175                .unwrap_or(default_base);
176            let mut enc = Encoding::from_name(base_name);
177            if let Some(PdfObject::Array(diffs)) = d.get(b"Differences".as_slice()) {
178                enc.apply_differences(diffs);
179            }
180            enc
181        }
182        _ => Encoding::win_ansi(),
183    }
184}
185
186fn load_tounicode(dict: &PdfObject, doc: &mut CosDoc) -> Option<ToUnicodeCMap> {
187    let tu_ref = dict.dict_get(b"ToUnicode")?;
188
189    let stream = match tu_ref {
190        PdfObject::Reference(id) => {
191            let obj = doc.get_object(id.num).ok()??;
192            obj.clone()
193        }
194        other => other.clone(),
195    };
196
197    let stream_data = match &stream {
198        PdfObject::Stream(s) => doc.decode_stream(s).ok()?,
199        _ => return None,
200    };
201
202    ToUnicodeCMap::parse(&stream_data).ok()
203}
204
205fn load_simple_widths(dict: &PdfObject) -> (HashMap<u32, f64>, u32, f64) {
206    let first_char = dict.dict_get_i64(b"FirstChar").unwrap_or(0) as u32;
207    let default_width = dict.dict_get_f64(b"MissingWidth").unwrap_or(1000.0);
208
209    let mut widths = HashMap::new();
210
211    if let Some(PdfObject::Array(w_arr)) = dict.dict_get(b"Widths") {
212        for (i, w) in w_arr.iter().enumerate() {
213            if let Some(width) = w.as_f64() {
214                widths.insert(first_char + i as u32, width);
215            }
216        }
217    }
218
219    (widths, first_char, default_width)
220}
221
222fn load_cid_widths(dict: &PdfObject, doc: &mut CosDoc) -> (HashMap<u32, f64>, u32, f64) {
223    // For Type0 fonts, widths are in the descendant CIDFont
224    let descendant = dict
225        .dict_get(b"DescendantFonts")
226        .and_then(|o| o.as_array())
227        .and_then(|a| a.first())
228        .cloned();
229
230    let cid_dict = match descendant {
231        Some(PdfObject::Reference(id)) => doc.get_object(id.num).ok().flatten().cloned(),
232        Some(obj) => Some(obj),
233        None => None,
234    };
235
236    let cid_dict = match cid_dict {
237        Some(d) => d,
238        None => return (HashMap::new(), 0, 1000.0),
239    };
240
241    let default_width = cid_dict.dict_get_f64(b"DW").unwrap_or(1000.0);
242    let mut widths = HashMap::new();
243
244    // Parse /W array: [cid [w1 w2 ...]] or [cid_start cid_end w]
245    if let Some(PdfObject::Array(w_arr)) = cid_dict.dict_get(b"W") {
246        let mut i = 0;
247        while i < w_arr.len() {
248            let cid_start = match w_arr[i].as_i64() {
249                Some(n) => n as u32,
250                None => {
251                    i += 1;
252                    continue;
253                }
254            };
255
256            if i + 1 < w_arr.len() {
257                match &w_arr[i + 1] {
258                    PdfObject::Array(widths_arr) => {
259                        // [cid [w1 w2 w3 ...]]
260                        for (j, w) in widths_arr.iter().enumerate() {
261                            if let Some(width) = w.as_f64() {
262                                widths.insert(cid_start + j as u32, width);
263                            }
264                        }
265                        i += 2;
266                    }
267                    PdfObject::Integer(_) | PdfObject::Real(_) if i + 2 < w_arr.len() => {
268                        // [cid_start cid_end w]
269                        let cid_end = w_arr[i + 1].as_i64().unwrap_or(0) as u32;
270                        let width = w_arr[i + 2].as_f64().unwrap_or(default_width);
271                        for cid in cid_start..=cid_end {
272                            widths.insert(cid, width);
273                        }
274                        i += 3;
275                    }
276                    _ => {
277                        i += 1;
278                    }
279                }
280            } else {
281                i += 1;
282            }
283        }
284    }
285
286    (widths, 0, default_width)
287}
288
/// Check if a font name has a subset prefix (e.g., "ABCDEF+FontName").
///
/// A subset prefix is exactly six ASCII uppercase letters followed by `+`,
/// with at least one more byte after it. The check works on bytes, so names
/// containing multi-byte UTF-8 characters return `false` instead of panicking
/// on a string slice that is not on a character boundary.
fn is_subset_font(name: &str) -> bool {
    let bytes = name.as_bytes();
    bytes.len() >= 8
        && bytes[6] == b'+'
        && bytes[..6].iter().all(u8::is_ascii_uppercase)
}