informa/sfnt/tables/cmap/
mod.rs

1//! # cmap — Character to Glyph Index Mapping Table
2//!
3//! Implementation of the `cmap` table.
4//!
5//! *Specification:*
6//! [OpenType](https://docs.microsoft.com/en-us/typography/opentype/spec/cmap),
7//! [TrueType](https://developer.apple.com/fonts/TrueType-Reference-Manual/RM06/Chap6cmap.html).
8//!
9//! The `cmap` table provides mappings from character codes to glyph ids.
10//! This allows typesetting systems using the font to resolve the glyphs needed to display a string of characters.
11//!
12//! Not all glyphs need be covered by `cmap`; some glyphs are accessible only using smart font technology or a glyph picker.
13//! Conversely, some glyphs may be mapped to by multiple character codes.
14//!
15//! A `cmap` table consists of one or multiple subtables.
16//! Subtables are differentiated by their format.
17//! Formats differ in character code encoding, compression strategies, and codepoint space size.
18//!
19//! Subtables are accessed by encoding records.
20//! A record describes the format of its associated subtable, the intended platform, and the encoding.
21//! Multiple records using the same subtable format (but possibly with different platforms and encodings) need not store the same subtable multiple times and instead share the common subtable storage space.
22//!
23//! # Usage
24//!
25//! Use the [`compile`] function to convert a character map to a `cmap` table.
26//! The [`Context::cmap_encoding_records`] field can be set to customize the encoding records used by [`compile`].
27//!
28//! # Example
29//!
30//! ```
31//! # use informa::ctx::Context;
32//! # use informa::data::CharacterMap;
33//! # use informa::sfnt::tables::cmap::compile;
34//! let ctx = Context::default();
35//! let mut map = CharacterMap::new();
36//! map.insert('A', 1);
37//! map.insert('B', 2);
38//! map.insert('C', 3);
39//! let table = compile(&map, &ctx);
40//! ```
41
42pub mod format12;
43pub mod format4;
44
45use crate::ctx::Context;
46use crate::data::CharacterMap;
47use bytes::{BufMut, Bytes, BytesMut};
48use itertools::Itertools;
49use lazy_static::lazy_static;
50use std::cmp;
51use std::collections::HashMap;
52use std::fmt;
53use std::mem::size_of;
54
/// The size in bytes of the `version` field of the table header.
const VERSION_FIELD_SIZE: usize = size_of::<u16>();
/// The size in bytes of the `numTables` field of the table header.
const NUM_TABLES_FIELD_SIZE: usize = size_of::<u16>();
/// The size in bytes of the `platformID` field of an encoding record.
const PLATFORM_ID_FIELD_SIZE: usize = size_of::<u16>();
/// The size in bytes of the `encodingID` field of an encoding record.
const ENCODING_ID_FIELD_SIZE: usize = size_of::<u16>();
/// The size in bytes of the `subtableOffset` field of an encoding record.
const SUBTABLE_OFFSET_FIELD_SIZE: usize = size_of::<u32>();
/// The size in bytes of a single encoding record, i.e. the sum of its three fields.
const ENCODING_RECORD_SIZE: usize =
    PLATFORM_ID_FIELD_SIZE + ENCODING_ID_FIELD_SIZE + SUBTABLE_OFFSET_FIELD_SIZE;
/// The size in bytes of the table header (`version` followed by `numTables`).
const CONSTANT_SIZE: usize = VERSION_FIELD_SIZE + NUM_TABLES_FIELD_SIZE;
/// The largest Unicode scalar value that is part of the Basic Multilingual Plane (BMP).
const MAX_BMP_SCALER: char = '\u{FFFF}';
72
73lazy_static! {
74    /// The default records used to represent BMP only character maps in order of `Ord`.
75    static ref DEFAULT_BMP_RECORDS: Vec<EncodingRecord> = vec![
76        EncodingRecord {
77            encoding: Encoding::Unicode(UnicodeEncoding::Bmp),
78            format: RecordFormat::Format4,
79        },
80        EncodingRecord {
81            encoding: Encoding::Windows(WindowsEncoding::Bmp),
82            format: RecordFormat::Format4,
83        },
84    ];
85    /// The default records used to represent full Unicode character maps in order of `Ord`.
86    static ref DEFAULT_FULL_RECORDS: Vec<EncodingRecord> = vec![
87        EncodingRecord {
88            encoding: Encoding::Unicode(UnicodeEncoding::Bmp),
89            format: RecordFormat::Format4,
90        },
91        EncodingRecord {
92            encoding: Encoding::Unicode(UnicodeEncoding::Full),
93            format: RecordFormat::Format12,
94        },
95        EncodingRecord {
96            encoding: Encoding::Windows(WindowsEncoding::Bmp),
97            format: RecordFormat::Format4,
98        },
99        EncodingRecord {
100            encoding: Encoding::Windows(WindowsEncoding::Full),
101            format: RecordFormat::Format12,
102        },
103    ];
104}
105
106/// Returns a `cmap` table for the given character map.
107///
108/// # Example
109///
110/// ```
111/// # use informa::ctx::Context;
112/// # use informa::data::CharacterMap;
113/// # use informa::sfnt::tables::cmap::compile;
114/// let ctx = Context::default();
115/// let mut map = CharacterMap::new();
116/// map.insert('A', 1);
117/// map.insert('B', 2);
118/// map.insert('C', 3);
119/// let table = compile(&map, &ctx);
120/// ```
121pub fn compile(map: &CharacterMap, ctx: &Context) -> Bytes {
122    let records = ctx.cmap_encoding_records.as_ref().unwrap_or_else(|| {
123        let exceeds_bmp = map.keys().last().map_or(false, |&x| x > MAX_BMP_SCALER);
124        if exceeds_bmp {
125            &DEFAULT_FULL_RECORDS
126        } else {
127            &DEFAULT_BMP_RECORDS
128        }
129    });
130
131    let mut subtables: HashMap<RecordFormat, Bytes> = records
132        .iter()
133        .map(|x| x.format)
134        .unique()
135        .map(|format| {
136            let subtable = match format {
137                RecordFormat::Format4 => format4::compile(map, ctx),
138                RecordFormat::Format12 => format12::compile(map),
139            };
140
141            (format, subtable)
142        })
143        .collect();
144
145    let fixed_size = CONSTANT_SIZE + (records.len() * ENCODING_RECORD_SIZE);
146    let length = fixed_size + subtables.values().map(Bytes::len).sum::<usize>();
147    let mut buf = BytesMut::with_capacity(length);
148
149    let version: u16 = 0;
150    buf.put_u16(version);
151
152    let num_tables: u16 = records.len() as u16;
153    buf.put_u16(num_tables);
154
155    let mut subtable_offsets: HashMap<RecordFormat, u32> = HashMap::new();
156    let mut subtable_offset: u32 = fixed_size as u32;
157
158    for record in records {
159        let platform_id = record.encoding.platform_id();
160        buf.put_u16(platform_id);
161
162        let encoding_id = record.encoding.encoding_id();
163        buf.put_u16(encoding_id);
164
165        if let Some(&offset) = subtable_offsets.get(&record.format) {
166            // reuse already registered subtable
167            buf.put_u32(offset);
168        } else {
169            // register subtable format
170            buf.put_u32(subtable_offset);
171            subtable_offsets.insert(record.format, subtable_offset);
172            subtable_offset += subtables[&record.format].len() as u32;
173        }
174    }
175
176    for format in records.iter().map(|x| x.format).unique() {
177        let subtable = subtables.get_mut(&format).unwrap();
178        buf.put(subtable);
179    }
180
181    assert_eq!(length, buf.len());
182
183    buf.freeze()
184}
185
186/// An encoding record describes a `cmap` subtable.
187#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy)]
188pub struct EncodingRecord {
189    /// The encoding used by the subtable.
190    pub encoding: Encoding,
191    /// The subtable format.
192    pub format: RecordFormat,
193}
194
195impl cmp::PartialOrd for EncodingRecord {
196    fn partial_cmp(&self, other: &Self) -> Option<cmp::Ordering> {
197        self.encoding.partial_cmp(&other.encoding)
198    }
199}
200
201impl cmp::Ord for EncodingRecord {
202    fn cmp(&self, other: &Self) -> cmp::Ordering {
203        self.encoding.cmp(&other.encoding)
204    }
205}
206
/// The ways in which reading an encoding record can fail.
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy)]
pub enum ReadError {
    /// The overall syntax of the input is invalid.
    InvalidSyntax,
    /// The platform id could not be parsed.
    InvalidPlatform,
    /// The encoding id could not be parsed.
    InvalidEncoding,
    /// The format id could not be parsed.
    InvalidFormat,
    /// The platform id was parsed but is not supported.
    UnsupportedPlatform,
    /// The encoding id was parsed but is not supported.
    UnsupportedEncoding,
    /// The format was parsed but is not supported.
    UnsupportedFormat,
}

impl fmt::Display for ReadError {
    /// Writes a one-sentence, human-readable description of the error.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(match *self {
            ReadError::InvalidSyntax => "The syntax is invalid.",
            ReadError::InvalidPlatform => "The platform id can not be parsed.",
            ReadError::InvalidEncoding => "The encoding id can not be parsed.",
            ReadError::InvalidFormat => "The format id can not be parsed.",
            ReadError::UnsupportedPlatform => "The platform id is not supported by Informa.",
            ReadError::UnsupportedEncoding => "The encoding id is not supported by Informa.",
            ReadError::UnsupportedFormat => "The format is not supported by Informa.",
        })
    }
}

// A `ReadError` never wraps another error, so the provided `source` (which returns `None`) is used.
impl std::error::Error for ReadError {}
246
247impl EncodingRecord {
248    /// Creates an `EncodingRecord` from an encoding-record code.
249    ///
250    /// Codes are formatted as `<platform_id> "/" <encoding_id> "=" <format_id>`.
251    /// Such a code is also written using the `Display` formatting of an `EncodingRecord`.
252    ///
253    /// ## Example
254    ///
255    /// ```
256    /// # use informa::sfnt::tables::cmap::EncodingRecord;
257    /// # use informa::sfnt::tables::cmap::Encoding;
258    /// # use informa::sfnt::tables::cmap::UnicodeEncoding;
259    /// # use informa::sfnt::tables::cmap::RecordFormat;
260    /// assert_eq!(
261    ///     EncodingRecord::from_code("0/3=4"),
262    ///     Ok(EncodingRecord {
263    ///         encoding: Encoding::Unicode(UnicodeEncoding::Bmp),
264    ///         format: RecordFormat::Format4
265    ///     })
266    /// );
267    /// ```
268    pub fn from_code(code: &str) -> Result<EncodingRecord, ReadError> {
269        // NOTE: use `split_once` once stable
270        // currently, inputs such as "0/3/abc=4=xyz" are accepted since
271        // only the first two items of each split are considered
272        let (selection, format) = code
273            .split("=")
274            .next_tuple::<(&str, &str)>()
275            .ok_or(ReadError::InvalidSyntax)?;
276        let (platform, encoding) = selection
277            .split("/")
278            .next_tuple::<(&str, &str)>()
279            .ok_or(ReadError::InvalidSyntax)?;
280        let platform_id = platform
281            .parse::<u16>()
282            .ok()
283            .ok_or(ReadError::InvalidPlatform)?;
284        let encoding_id = encoding
285            .parse::<u16>()
286            .ok()
287            .ok_or(ReadError::InvalidEncoding)?;
288        let format_id = format.parse::<u16>().ok().ok_or(ReadError::InvalidFormat)?;
289        let encoding = Encoding::from_ids(platform_id, encoding_id)?;
290        let format = RecordFormat::from_id(format_id).ok_or(ReadError::UnsupportedFormat)?;
291
292        Ok(EncodingRecord { encoding, format })
293    }
294}
295
296impl fmt::Display for EncodingRecord {
297    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
298        write!(f, "{}={}", self.encoding, self.format)
299    }
300}
301
302/// The combination of a platform id and a matching encoding id.
303#[derive(Debug, PartialOrd, Ord, PartialEq, Eq, Hash, Clone, Copy)]
304pub enum Encoding {
305    /// The Unicode platform.
306    Unicode(UnicodeEncoding),
307    /// The Windows platform.
308    Windows(WindowsEncoding),
309}
310
311impl Encoding {
312    /// Creates an encoding from a given `platformID` and `encodingID`.
313    ///
314    /// # Errors
315    ///
316    /// This function returns an error in case the platform id is unsupported or the encoding id is unsupported.
317    pub fn from_ids(platform_id: u16, encoding_id: u16) -> Result<Self, ReadError> {
318        match platform_id {
319            0 => {
320                let encoding =
321                    UnicodeEncoding::from_id(encoding_id).ok_or(ReadError::UnsupportedEncoding)?;
322                Ok(Encoding::Unicode(encoding))
323            }
324            3 => {
325                let encoding =
326                    WindowsEncoding::from_id(encoding_id).ok_or(ReadError::UnsupportedEncoding)?;
327                Ok(Encoding::Windows(encoding))
328            }
329            _ => Err(ReadError::UnsupportedPlatform)?,
330        }
331    }
332
333    /// The `platformID` of the encoding.
334    pub fn platform_id(&self) -> u16 {
335        match self {
336            Self::Unicode(_) => 0,
337            Self::Windows(_) => 3,
338        }
339    }
340
341    /// The `encodingID` of the encoding.
342    pub fn encoding_id(&self) -> u16 {
343        match self {
344            Self::Unicode(encoding) => encoding.id(),
345            Self::Windows(encoding) => encoding.id(),
346        }
347    }
348}
349
350impl fmt::Display for Encoding {
351    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
352        write!(f, "{}/{}", self.platform_id(), self.encoding_id())
353    }
354}
355
/// The encodings of the Unicode platform that are supported.
#[derive(Debug, PartialOrd, Ord, PartialEq, Eq, Hash, Clone, Copy)]
pub enum UnicodeEncoding {
    /// The Unicode Basic Multilingual Plane codespace.
    Bmp,
    /// The full Unicode codespace.
    Full,
}

impl UnicodeEncoding {
    /// Looks up the `UnicodeEncoding` whose `encodingID` equals `id`.
    ///
    /// Returns `None` if no supported encoding has that id.
    pub fn from_id(id: u16) -> Option<Self> {
        [Self::Bmp, Self::Full]
            .iter()
            .copied()
            .find(|encoding| encoding.id() == id)
    }

    /// Returns the `encodingID` of the encoding.
    pub fn id(&self) -> u16 {
        match *self {
            UnicodeEncoding::Bmp => 3,
            UnicodeEncoding::Full => 4,
        }
    }
}
383
/// The encodings of the Windows platform that are supported.
#[derive(Debug, PartialOrd, Ord, PartialEq, Eq, Hash, Clone, Copy)]
pub enum WindowsEncoding {
    /// The Unicode Basic Multilingual Plane codespace.
    Bmp,
    /// The full Unicode codespace.
    Full,
}

impl WindowsEncoding {
    /// Looks up the `WindowsEncoding` whose `encodingID` equals `id`.
    ///
    /// Returns `None` if no supported encoding has that id.
    pub fn from_id(id: u16) -> Option<Self> {
        [Self::Bmp, Self::Full]
            .iter()
            .copied()
            .find(|encoding| encoding.id() == id)
    }

    /// Returns the `encodingID` of the encoding.
    pub fn id(&self) -> u16 {
        match *self {
            WindowsEncoding::Bmp => 1,
            WindowsEncoding::Full => 10,
        }
    }
}
411
/// The `cmap` subtable formats that are supported.
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy)]
pub enum RecordFormat {
    /// The subtable format 4.
    Format4,
    /// The subtable format 12.
    Format12,
}

impl RecordFormat {
    /// Looks up the `RecordFormat` whose subtable format id equals `id`.
    ///
    /// Returns `None` if no supported format has that id.
    pub fn from_id(id: u16) -> Option<Self> {
        [Self::Format4, Self::Format12]
            .iter()
            .copied()
            .find(|format| format.id() == id)
    }

    /// Returns the id of the format.
    pub fn id(&self) -> u16 {
        match *self {
            RecordFormat::Format4 => 4,
            RecordFormat::Format12 => 12,
        }
    }
}

impl fmt::Display for RecordFormat {
    /// Writes the numeric id of the format.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let id = self.id();
        write!(f, "{}", id)
    }
}
444}