Skip to main content

read_fonts/tables/
name.rs

1//! The [name (Naming)](https://docs.microsoft.com/en-us/typography/opentype/spec/name) table
2
3include!("../../generated/generated_name.rs");
4
5pub use types::NameId;
6
7impl<'a> Name<'a> {
8    /// The FontData containing the encoded name strings.
9    pub fn string_data(&self) -> FontData<'a> {
10        let base = self.offset_data();
11        let off = self.storage_offset();
12        base.split_off(off as usize).unwrap_or_default()
13    }
14}
15
16impl NameRecord {
17    /// Return a type that can decode the string data for this name entry.
18    ///
19    /// The `data` argument should be the name table's data section, which can
20    /// be retrieved via [`Name::string_data`].
21    pub fn string<'a>(&self, data: FontData<'a>) -> Result<NameString<'a>, ReadError> {
22        let start = self.string_offset().non_null().unwrap_or(0);
23        let end = start + self.length() as usize;
24
25        let data = data
26            .as_bytes()
27            .get(start..end)
28            .ok_or(ReadError::OutOfBounds)?;
29
30        let encoding = Encoding::new(self.platform_id(), self.encoding_id());
31        Ok(NameString { data, encoding })
32    }
33
34    // reference from fonttools:
35    // https://github.com/fonttools/fonttools/blob/c2119229cfb02cdb7c5a63374ef29d3d514259e8/Lib/fontTools/ttLib/tables/_n_a_m_e.py#L509
36    pub fn is_unicode(&self) -> bool {
37        self.platform_id() == 0
38            || (self.platform_id() == 3 && [0, 1, 10].contains(&self.encoding_id()))
39    }
40}
41
42impl LangTagRecord {
43    /// Return a type that can decode the string data for this name entry.
44    pub fn lang_tag<'a>(&self, data: FontData<'a>) -> Result<NameString<'a>, ReadError> {
45        let start = self.lang_tag_offset().non_null().unwrap_or(0);
46        let end = start + self.length() as usize;
47
48        let data = data
49            .as_bytes()
50            .get(start..end)
51            .ok_or(ReadError::OutOfBounds)?;
52
53        let encoding = Encoding::Utf16Be;
54        Ok(NameString { data, encoding })
55    }
56}
57
58//-- all this is from pinot https://github.com/dfrg/pinot/blob/eff5239018ca50290fb890a84da3dd51505da364/src/name.rs
59/// Entry for a name in the naming table.
60///
61/// This provides an iterator over characters.
62#[derive(Copy, Clone, PartialEq, Eq)]
63pub struct NameString<'a> {
64    data: &'a [u8],
65    encoding: Encoding,
66}
67
68impl<'a> NameString<'a> {
69    /// An iterator over the `char`s in this name.
70    pub fn chars(&self) -> CharIter<'a> {
71        CharIter {
72            data: self.data,
73            encoding: self.encoding,
74            pos: 0,
75        }
76    }
77}
78
#[cfg(feature = "experimental_traverse")]
impl<'a> traversal::SomeString<'a> for NameString<'a> {
    /// Exposes the decoded characters of this name as a boxed iterator.
    fn iter_chars(&self) -> Box<dyn Iterator<Item = char> + 'a> {
        let decoded = self.chars();
        Box::new(decoded)
    }
}
85
#[cfg(feature = "experimental_traverse")]
impl NameRecord {
    /// Packages this record's string offset, together with the outcome of
    /// resolving it against `data`, as a traversal field.
    fn traverse_string<'a>(&self, data: FontData<'a>) -> traversal::FieldType<'a> {
        let resolved = traversal::StringOffset {
            offset: self.string_offset().into(),
            target: self.string(data).map(|s| Box::new(s) as _),
        };
        FieldType::StringOffset(resolved)
    }
}
95
#[cfg(feature = "experimental_traverse")]
impl LangTagRecord {
    /// Packages this record's language-tag offset, together with the outcome
    /// of resolving it against `data`, as a traversal field.
    fn traverse_lang_tag<'a>(&self, data: FontData<'a>) -> traversal::FieldType<'a> {
        let resolved = traversal::StringOffset {
            offset: self.lang_tag_offset().into(),
            target: self.lang_tag(data).map(|s| Box::new(s) as _),
        };
        FieldType::StringOffset(resolved)
    }
}
105
106impl<'a> IntoIterator for NameString<'a> {
107    type Item = char;
108    type IntoIter = CharIter<'a>;
109    fn into_iter(self) -> Self::IntoIter {
110        self.chars()
111    }
112}
113
114impl std::fmt::Display for NameString<'_> {
115    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
116        for c in self.chars() {
117            c.fmt(f)?;
118        }
119        Ok(())
120    }
121}
122
123impl std::fmt::Debug for NameString<'_> {
124    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
125        write!(f, "\"{self}\"")
126    }
127}
128
129/// An iterator over the chars of a name record.
130#[derive(Clone)]
131pub struct CharIter<'a> {
132    data: &'a [u8],
133    encoding: Encoding,
134    pos: usize,
135}
136
137impl CharIter<'_> {
138    fn bump_u16(&mut self) -> Option<u16> {
139        let result = self
140            .data
141            .get(self.pos..self.pos + 2)
142            .map(|x| u16::from_be_bytes(x.try_into().unwrap()))?;
143        self.pos += 2;
144        Some(result)
145    }
146
147    fn bump_u8(&mut self) -> Option<u8> {
148        let result = self.data.get(self.pos)?;
149        self.pos += 1;
150        Some(*result)
151    }
152}
153
154impl Iterator for CharIter<'_> {
155    type Item = char;
156
157    fn next(&mut self) -> Option<Self::Item> {
158        if self.pos >= self.data.len() {
159            return None;
160        }
161        let rep = core::char::REPLACEMENT_CHARACTER;
162        let raw_c = match self.encoding {
163            Encoding::Utf16Be => {
164                let c1 = self.bump_u16()? as u32;
165                if (0xD800..0xDC00).contains(&c1) {
166                    let Some(c2) = self.bump_u16() else {
167                        return Some(rep);
168                    };
169                    if !(0xDC00..=0xDFFF).contains(&c2) {
170                        // c1 is an unpaired high surrogate; rewind so c2 is
171                        // decoded on its own rather than folded into a bogus
172                        // scalar.
173                        self.pos -= 2;
174                        return Some(rep);
175                    }
176                    ((c1 & 0x3FF) << 10) + (c2 as u32 & 0x3FF) + 0x10000
177                } else {
178                    c1
179                }
180            }
181            Encoding::MacRoman => {
182                let c = self.bump_u8()?;
183                MacRomanMapping.decode(c) as u32
184            }
185            _ => return None,
186        };
187        Some(std::char::from_u32(raw_c).unwrap_or(rep))
188    }
189}
190
/// The encoding used by a string in the name table.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
pub enum Encoding {
    /// Big-endian UTF-16.
    Utf16Be,
    /// Mac OS Roman, one byte per character.
    MacRoman,
    /// Any platform/encoding pair not recognized by [`Encoding::new`].
    Unknown,
}

impl Encoding {
    /// Determine the encoding from a record's platform and encoding ids.
    ///
    /// * platform 0 (any encoding id) and platform 3 with encoding id 0, 1
    ///   or 10 map to [`Encoding::Utf16Be`];
    /// * platform 1 with encoding id 0 maps to [`Encoding::MacRoman`];
    /// * every other pair maps to [`Encoding::Unknown`].
    pub fn new(platform_id: u16, encoding_id: u16) -> Encoding {
        match (platform_id, encoding_id) {
            (0, _) | (3, 0 | 1 | 10) => Encoding::Utf16Be,
            (1, 0) => Encoding::MacRoman,
            _ => Encoding::Unknown,
        }
    }
}
212
/// A helper for encoding and decoding Mac OS Roman encoded strings.
///
/// This is a zero-sized, `Copy` handle: its methods take `self` by value, so
/// a single bound instance can be used for any number of calls.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub struct MacRomanMapping;

impl MacRomanMapping {
    /// First byte value that does not map to the identical code point.
    const START_REMAP: u8 = 128;

    /// Convert from a mac-roman encoded byte to a `char`.
    ///
    /// Bytes below 128 map to the code point of the same value; bytes
    /// `128..=255` are looked up in `MAC_ROMAN_DECODE`.
    pub fn decode(self, raw: u8) -> char {
        if raw < Self::START_REMAP {
            char::from(raw)
        } else {
            let idx = usize::from(raw - Self::START_REMAP);
            char::from_u32(u32::from(MAC_ROMAN_DECODE[idx]))
                .expect("MAC_ROMAN_DECODE only holds valid Unicode scalar values")
        }
    }

    /// Convert from a `char` to a mac-roman encoded byte.
    ///
    /// Returns `None` if the char is not in the mac-roman charset.
    pub fn encode(self, c: char) -> Option<u8> {
        // Every mac-roman character lies in the BMP, so anything that does
        // not fit in a `u16` cannot be encoded.
        let code = u16::try_from(u32::from(c)).ok()?;
        if code < u16::from(Self::START_REMAP) {
            // Lossless: `code` is below 128.
            return Some(code as u8);
        }
        MAC_ROMAN_ENCODE
            .binary_search_by_key(&code, |&(unicode, _)| unicode)
            .ok()
            .map(|idx| MAC_ROMAN_ENCODE[idx].1)
    }
}

/// A lookup table for the Mac Roman encoding. This matches the values `128..=255`
/// to specific Unicode values.
#[rustfmt::skip]
static MAC_ROMAN_DECODE: [u16; 128] = [
    196, 197, 199, 201, 209, 214, 220, 225, 224, 226, 228, 227, 229, 231, 233,
    232, 234, 235, 237, 236, 238, 239, 241, 243, 242, 244, 246, 245, 250, 249,
    251, 252, 8224, 176, 162, 163, 167, 8226, 182, 223, 174, 169, 8482, 180,
    168, 8800, 198, 216, 8734, 177, 8804, 8805, 165, 181, 8706, 8721, 8719,
    960, 8747, 170, 186, 937, 230, 248, 191, 161, 172, 8730, 402, 8776, 8710,
    171, 187, 8230, 160, 192, 195, 213, 338, 339, 8211, 8212, 8220, 8221, 8216,
    8217, 247, 9674, 255, 376, 8260, 8364, 8249, 8250, 64257, 64258, 8225, 183,
    8218, 8222, 8240, 194, 202, 193, 203, 200, 205, 206, 207, 204, 211, 212,
    63743, 210, 218, 219, 217, 305, 710, 732, 175, 728, 729, 730, 184, 733,
    731, 711,
];

/// A lookup pairing (sorted) Unicode values to Mac Roman values.
///
/// The entries must stay sorted by Unicode value: `MacRomanMapping::encode`
/// binary-searches this table.
#[rustfmt::skip]
static MAC_ROMAN_ENCODE: [(u16, u8); 128] = [
    (160, 202), (161, 193), (162, 162), (163, 163),
    (165, 180), (167, 164), (168, 172), (169, 169),
    (170, 187), (171, 199), (172, 194), (174, 168),
    (175, 248), (176, 161), (177, 177), (180, 171),
    (181, 181), (182, 166), (183, 225), (184, 252),
    (186, 188), (187, 200), (191, 192), (192, 203),
    (193, 231), (194, 229), (195, 204), (196, 128),
    (197, 129), (198, 174), (199, 130), (200, 233),
    (201, 131), (202, 230), (203, 232), (204, 237),
    (205, 234), (206, 235), (207, 236), (209, 132),
    (210, 241), (211, 238), (212, 239), (213, 205),
    (214, 133), (216, 175), (217, 244), (218, 242),
    (219, 243), (220, 134), (223, 167), (224, 136),
    (225, 135), (226, 137), (227, 139), (228, 138),
    (229, 140), (230, 190), (231, 141), (232, 143),
    (233, 142), (234, 144), (235, 145), (236, 147),
    (237, 146), (238, 148), (239, 149), (241, 150),
    (242, 152), (243, 151), (244, 153), (245, 155),
    (246, 154), (247, 214), (248, 191), (249, 157),
    (250, 156), (251, 158), (252, 159), (255, 216),
    (305, 245), (338, 206), (339, 207), (376, 217),
    (402, 196), (710, 246), (711, 255), (728, 249),
    (729, 250), (730, 251), (731, 254), (732, 247),
    (733, 253), (937, 189), (960, 185), (8211, 208),
    (8212, 209), (8216, 212), (8217, 213), (8218, 226),
    (8220, 210), (8221, 211), (8222, 227), (8224, 160),
    (8225, 224), (8226, 165), (8230, 201), (8240, 228),
    (8249, 220), (8250, 221), (8260, 218), (8364, 219),
    (8482, 170), (8706, 182), (8710, 198), (8719, 184),
    (8721, 183), (8730, 195), (8734, 176), (8747, 186),
    (8776, 197), (8800, 173), (8804, 178), (8805, 179),
    (9674, 215), (63743, 240), (64257, 222), (64258, 223),
];
295
#[cfg(test)]
mod tests {
    use super::*;

    const REPLACEMENT: char = std::char::REPLACEMENT_CHARACTER;

    /// Decodes `data` as big-endian UTF-16 from the first byte and collects
    /// every yielded char.
    fn decode_utf16be(data: &[u8]) -> Vec<char> {
        let iter = CharIter {
            data,
            encoding: Encoding::Utf16Be,
            pos: 0,
        };
        iter.collect()
    }

    /// Every char of the sample survives an encode/decode round trip.
    #[test]
    fn mac_roman() {
        let input = "Joachim Müller-Lancé";
        for original in input.chars() {
            let byte = MacRomanMapping.encode(original).unwrap();
            assert_eq!(MacRomanMapping.decode(byte), original);
        }
    }

    /// A high surrogate with nothing after it becomes U+FFFD.
    #[test]
    fn lone_surrogate_at_end() {
        // DEVANAGARI LETTER SHORT A (U+0904), then an unpaired high
        // surrogate (0xD800).
        let decoded = decode_utf16be(&[0x09, 0x04, 0xD8, 0x00]);
        assert_eq!(decoded, ['ऄ', REPLACEMENT]);
    }

    /// A high surrogate followed by something other than a low surrogate
    /// becomes U+FFFD without swallowing the unit that follows it.
    #[test]
    fn high_surrogate_followed_by_non_low() {
        // 0xD800 then 'A' (U+0041): 'A' must still be decoded.
        let decoded = decode_utf16be(&[0xD8, 0x00, 0x00, 0x41]);
        assert_eq!(decoded, [REPLACEMENT, 'A']);

        // 0xD800 then the valid pair for U+1F600: the pair must decode
        // intact.
        let decoded = decode_utf16be(&[0xD8, 0x00, 0xD8, 0x3D, 0xDE, 0x00]);
        assert_eq!(decoded, [REPLACEMENT, '😀']);
    }
}