write_fonts/tables/
cmap.rs

1//! the [cmap] table
2//!
3//! [cmap]: https://docs.microsoft.com/en-us/typography/opentype/spec/cmap
4
5include!("../../generated/generated_cmap.rs");
6
7use std::collections::HashMap;
8
9use crate::util::SearchRange;
10
11// https://learn.microsoft.com/en-us/typography/opentype/spec/cmap#windows-platform-platform-id--3
12const WINDOWS_BMP_ENCODING: u16 = 1;
13const WINDOWS_FULL_REPERTOIRE_ENCODING: u16 = 10;
14
15// https://learn.microsoft.com/en-us/typography/opentype/spec/cmap#unicode-platform-platform-id--0
16const UNICODE_BMP_ENCODING: u16 = 3;
17const UNICODE_FULL_REPERTOIRE_ENCODING: u16 = 4;
18
19impl CmapSubtable {
20    /// Create a new format 4 subtable
21    ///
22    /// Returns `None` if none of the input chars are in the BMP (i.e. have
23    /// codepoints <= 0xFFFF.)
24    ///
25    /// Invariants:
26    ///
27    /// - Inputs must be sorted and deduplicated.
28    /// - All `GlyphId`s must be 16-bit
29    fn create_format_4(mappings: &[(char, GlyphId)]) -> Option<Self> {
30        let mut end_code = Vec::with_capacity(mappings.len() + 1);
31        let mut start_code = Vec::with_capacity(mappings.len() + 1);
32        let mut id_deltas = Vec::with_capacity(mappings.len() + 1);
33        let mut id_range_offsets = Vec::with_capacity(mappings.len() + 1);
34        let mut glyph_ids = Vec::new();
35
36        let segments = Format4SegmentComputer::new(mappings).compute();
37        assert!(mappings.iter().all(|(_, g)| g.to_u32() <= 0xFFFF));
38        if segments.is_empty() {
39            // no chars in BMP
40            return None;
41        }
42        let n_segments = segments.len() + 1;
43        for (i, segment) in segments.into_iter().enumerate() {
44            let start = mappings[segment.start_ix].0;
45            let end = mappings[segment.end_ix].0;
46            start_code.push(start as u32 as u16);
47            end_code.push(end as u32 as u16);
48            if let Some(delta) = segment.id_delta {
49                // "The idDelta arithmetic is modulo 65536":
50                let delta = i16::try_from(delta)
51                    .unwrap_or_else(|_| delta.rem_euclid(0x10000).try_into().unwrap());
52                id_deltas.push(delta);
53                id_range_offsets.push(0u16);
54            } else {
55                // if the deltas for a range are not identical, we rely on the
56                // explicit glyph_ids array.
57                //
58                // The logic here is based on the memory layout of the table:
59                // because the glyph_id array follows the id_range_offsets array,
60                // the id_range_offsets array essentially stores a memory offset.
61                let current_n_ids = glyph_ids.len();
62                let n_following_segments = n_segments - i;
63                // number of bytes from the id_range_offset value to the glyph id
64                // for this segment, in the glyph_ids array
65                let id_range_offset = (n_following_segments + current_n_ids) * u16::RAW_BYTE_LEN;
66                id_deltas.push(0);
67                id_range_offsets.push(id_range_offset.try_into().unwrap());
68                glyph_ids.extend(
69                    mappings[segment.start_ix..=segment.end_ix]
70                        .iter()
71                        .map(|(_, gid)| u16::try_from(gid.to_u32()).expect("checked before now")),
72                )
73            }
74        }
75
76        // add the final segment:
77        end_code.push(0xFFFF);
78        start_code.push(0xFFFF);
79        id_deltas.push(1);
80        id_range_offsets.push(0);
81
82        Some(Self::format_4(
83            0,
84            end_code,
85            start_code,
86            id_deltas,
87            id_range_offsets,
88            glyph_ids,
89        ))
90    }
91
92    /// Create a new format 12 `CmapSubtable` from a list of `(char, GlyphId)` pairs.
93    ///
94    /// The pairs are expected to be already sorted by chars.
95    /// In case of duplicate chars, the last one wins.
96    fn create_format_12(mappings: &[(char, GlyphId)]) -> Self {
97        let (mut char_codes, gids): (Vec<u32>, Vec<u32>) = mappings
98            .iter()
99            .map(|(cp, gid)| (*cp as u32, gid.to_u32()))
100            .unzip();
101        let cmap: HashMap<_, _> = char_codes.iter().cloned().zip(gids).collect();
102        char_codes.dedup();
103
104        // we know we have at least one non-BMP char_code > 0xFFFF so unwrap is safe
105        let mut start_char_code = *char_codes.first().unwrap();
106        let mut start_glyph_id = cmap[&start_char_code];
107        let mut last_glyph_id = start_glyph_id.wrapping_sub(1);
108        let mut last_char_code = start_char_code.wrapping_sub(1);
109        let mut groups = Vec::new();
110        for char_code in char_codes {
111            let glyph_id = cmap[&char_code];
112            if glyph_id != last_glyph_id.wrapping_add(1)
113                || char_code != last_char_code.wrapping_add(1)
114            {
115                groups.push((start_char_code, last_char_code, start_glyph_id));
116                start_char_code = char_code;
117                start_glyph_id = glyph_id;
118            }
119            last_glyph_id = glyph_id;
120            last_char_code = char_code;
121        }
122        groups.push((start_char_code, last_char_code, start_glyph_id));
123
124        let seq_map_groups = groups
125            .into_iter()
126            .map(|(start_char, end_char, gid)| SequentialMapGroup::new(start_char, end_char, gid))
127            .collect::<Vec<_>>();
128        CmapSubtable::format_12(
129            0, // 'lang' set to zero for all 'cmap' subtables whose platform IDs are other than Macintosh
130            seq_map_groups,
131        )
132    }
133}
134
135/// A conflicting Cmap definition, one char is mapped to multiple distinct GlyphIds.
136///
137/// If there are multiple conflicting mappings, one is chosen arbitrarily.
138/// gid1 is less than gid2.
139#[derive(Debug, Clone, PartialEq, Eq)]
140pub struct CmapConflict {
141    ch: char,
142    gid1: GlyphId,
143    gid2: GlyphId,
144}
145
146impl std::fmt::Display for CmapConflict {
147    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
148        let ch32 = self.ch as u32;
149        write!(
150            f,
151            "Cannot map {:?} (U+{ch32:04X}) to two different glyph ids: {} and {}",
152            self.ch, self.gid1, self.gid2
153        )
154    }
155}
156
157impl std::error::Error for CmapConflict {}
158
159impl Cmap {
160    /// Generates a ['cmap'] that is expected to work in most modern environments.
161    ///
162    /// The input is not required to be sorted.
163    ///
164    /// This emits [format 4] and [format 12] subtables, respectively for the
165    /// Basic Multilingual Plane and Full Unicode Repertoire.
166    ///
167    /// Also see: <https://learn.microsoft.com/en-us/typography/opentype/spec/recom#cmap-table>
168    ///
169    /// [`cmap`]: https://learn.microsoft.com/en-us/typography/opentype/spec/cmap
170    /// [format 4]: https://learn.microsoft.com/en-us/typography/opentype/spec/cmap#format-4-segment-mapping-to-delta-values
171    /// [format 12]: https://learn.microsoft.com/en-us/typography/opentype/spec/cmap#format-12-segmented-coverage
172    pub fn from_mappings(
173        mappings: impl IntoIterator<Item = (char, GlyphId)>,
174    ) -> Result<Cmap, CmapConflict> {
175        let mut mappings: Vec<_> = mappings.into_iter().collect();
176        mappings.sort();
177        mappings.dedup();
178        if let Some((ch, gid1, gid2)) =
179            mappings
180                .iter()
181                .zip(mappings.iter().skip(1))
182                .find_map(|((c1, g1), (c2, g2))| {
183                    (c1 == c2 && g1 != g2).then(|| (*c1, *g1.min(g2), *g1.max(g2)))
184                })
185        {
186            return Err(CmapConflict { ch, gid1, gid2 });
187        }
188
189        let mut uni_records = Vec::new(); // platform 0
190        let mut win_records = Vec::new(); // platform 3
191
192        // if there are characters in the Unicode Basic Multilingual Plane (U+0000 to U+FFFF)
193        // we need to emit format 4 subtables
194        let bmp_subtable = CmapSubtable::create_format_4(&mappings);
195        if let Some(bmp_subtable) = bmp_subtable {
196            // Absent a strong signal to do otherwise, match fontmake/fonttools
197            // Since both Windows and Unicode platform tables use the same subtable they are
198            // almost entirely byte-shared
199            // See https://github.com/googlefonts/fontmake-rs/issues/251
200            uni_records.push(EncodingRecord::new(
201                PlatformId::Unicode,
202                UNICODE_BMP_ENCODING,
203                bmp_subtable.clone(),
204            ));
205            win_records.push(EncodingRecord::new(
206                PlatformId::Windows,
207                WINDOWS_BMP_ENCODING,
208                bmp_subtable,
209            ));
210        }
211
212        // If there are any supplementary-plane characters (U+10000 to U+10FFFF) we also
213        // emit format 12 subtables
214        if mappings.iter().any(|(cp, _)| *cp > '\u{FFFF}') {
215            let full_repertoire_subtable = CmapSubtable::create_format_12(&mappings);
216            // format 12 subtables are also going to be byte-shared, just like above
217            uni_records.push(EncodingRecord::new(
218                PlatformId::Unicode,
219                UNICODE_FULL_REPERTOIRE_ENCODING,
220                full_repertoire_subtable.clone(),
221            ));
222            win_records.push(EncodingRecord::new(
223                PlatformId::Windows,
224                WINDOWS_FULL_REPERTOIRE_ENCODING,
225                full_repertoire_subtable,
226            ));
227        }
228
229        // put encoding records in order of (platform id, encoding id):
230        // - Unicode (0), BMP (3)
231        // - Unicode (0), full repertoire (4)
232        // - Windows (3), BMP (1)
233        // - Windows (3), full repertoire (10)
234        Ok(Cmap::new(
235            uni_records.into_iter().chain(win_records).collect(),
236        ))
237    }
238}
239
240// a helper for computing efficient segments for cmap format 4
241struct Format4SegmentComputer<'a> {
242    mappings: &'a [(char, GlyphId)],
243    /// The start index of the current segment, during iteration
244    seg_start: usize,
245    /// tracks whether the current segment has ordered gids
246    gids_in_order: bool,
247}
248
249#[derive(Clone, Copy, Debug)]
250struct Format4Segment {
251    // indices are into the source mappings
252    start_ix: usize,
253    end_ix: usize,
254    start_char: char,
255    end_char: char,
256    id_delta: Option<i32>,
257}
258
259impl Format4Segment {
260    fn len(&self) -> usize {
261        self.end_ix - self.start_ix + 1
262    }
263
264    // cost in bytes of this segment.
265    fn cost(&self) -> usize {
266        // a segment always costs 4 u16s (end, start, delta_id, id_range_offset)
267        const BASE_COST: usize = 4 * u16::RAW_BYTE_LEN;
268
269        if self.id_delta.is_some() {
270            BASE_COST
271        } else {
272            // and if there is not a common id_delta, we also need to add an item
273            // to the glyph_id_array for each char in the segment
274            BASE_COST + self.len() * u16::RAW_BYTE_LEN
275        }
276    }
277
278    /// `true` if we can merge other into self (other must follow self)
279    fn can_combine(&self, next: &Self) -> bool {
280        self.end_char as u32 + 1 == next.start_char as u32
281    }
282
283    /// Return `true` if we should combine this segment with the previous one.
284    ///
285    /// The case that matters here is when there is a segment with contiguous
286    /// GIDs and with a char range that is immediately adjacent to the previous
287    /// segment.
288    fn should_combine(&self, prev: &Self, next: Option<&Self>) -> bool {
289        if !prev.can_combine(self) {
290            return false;
291        }
292
293        // first we just consider the previous item. If our combined cost
294        // is lower than our separate cost, we will merge.
295        let combined_cost = prev.combine(self).cost();
296        let separate_cost = prev.cost() + self.cost();
297
298        if combined_cost < separate_cost {
299            return true;
300        }
301
302        // finally, if we are also char-contiguous with the next segment,
303        // then by construction it means if we merge now we will also merge
304        // with the next segment (since this current gid-contiguous segment
305        // is the reason we aren't all one big segment already) and so we need
306        // to also check that.
307        //
308        // Although the implementation is different, the logic is very similar in
309        // fonttools: https://github.com/fonttools/fonttools/blob/081d6a27ab8/Lib/fontTools/ttLib/tables/_c_m_a_p.py#L828
310        //
311        // As an example, consider a segment with 5 contiguous gids.
312        //
313        // This segment costs 8 bytes to encode; because the gids are contiguous
314        // we can use the `id_delta` field to represent them all.
315        //
316        // As an example, consider the following three segments:
317        //
318        // chrs [1 2] [3 4 5 6 7] [8 9]
319        // GIDs [3 1] [4 5 6 7 8] [2 9]
320        // cost   12       8       12
321        //
322        // the first and last segments each have len == 2. The GIDs are not
323        // contiguous, so they have to be encoded individually, which costs
324        // 2 bytes each. This means the total cost of these segments is 12:
325        // 8-bytes for the segment data, and 4 bytes for the gids.
326        //
327        // The middle segment has len == 5, but the GIDs are contiguous. This
328        // means that we can represent all the gids using the delta_id part of
329        // the segment, and encode the whole segment for 8 bytes.
330        //
331        // If we combine the first two segments, the new segment costs 22:
332        // 8 bytes for the segment, and 14 bytes for the 7 glyphs. This is
333        // more than the 20 bytes they cost separately.
334        //
335        // If we combine all three, though, the total cost is 26 (we add two
336        // more entries to the glyph_id array), which is better than the 32 bytes
337        // they cost separately.
338        //
339        // (note that we don't need to explicitly combine the next segment;
340        // it will happen automatically during the next loop)
341        if let Some(next) = next.filter(|next| self.can_combine(next)) {
342            let combined_cost = prev.combine(self).combine(next).cost();
343            let separate_cost = separate_cost + next.cost();
344            return combined_cost < separate_cost;
345        }
346
347        false
348    }
349
350    /// Combine this segment with one that immediately follows it.
351    ///
352    /// The caller must ensure that the two segments are contiguous.
353    fn combine(&self, next: &Format4Segment) -> Format4Segment {
354        assert_eq!(next.start_ix, self.end_ix + 1,);
355        Format4Segment {
356            start_ix: self.start_ix,
357            start_char: self.start_char,
358            end_char: next.end_char,
359            end_ix: next.end_ix,
360            id_delta: None,
361        }
362    }
363}
364
365impl<'a> Format4SegmentComputer<'a> {
366    fn new(mappings: &'a [(char, GlyphId)]) -> Self {
367        // ignore chars above BMP:
368        let mappings = mappings
369            .iter()
370            .position(|(c, _)| u16::try_from(*c as u32).is_err())
371            .map(|bad_idx| &mappings[..bad_idx])
372            .unwrap_or(mappings);
373        Self {
374            mappings,
375            seg_start: 0,
376            gids_in_order: false,
377        }
378    }
379
380    /// a convenience method called from our iter in the various cases where
381    /// we emit a segment.
382    ///
383    /// a 'seg_len' of 0 means start == end, e.g. a segment of one glyph.
384    fn make_segment(&mut self, seg_len: usize) -> Format4Segment {
385        // if start == end, we should always use a delta.
386        let use_delta = self.gids_in_order || seg_len == 0;
387        let start_ix = self.seg_start;
388        let end_ix = self.seg_start + seg_len;
389        let start_char = self.mappings[start_ix].0;
390        let end_char = self.mappings[end_ix].0;
391        let result = Format4Segment {
392            start_ix,
393            end_ix,
394            start_char,
395            end_char,
396            id_delta: self
397                .mappings
398                .get(self.seg_start)
399                .map(|(cp, gid)| gid.to_u32() as i32 - *cp as u32 as i32)
400                .filter(|_| use_delta),
401        };
402        self.seg_start += seg_len + 1;
403        self.gids_in_order = false;
404        result
405    }
406
407    /// Find the next possible segment.
408    ///
409    /// A segment _must_ be a contiguous range of chars, but we where such a range
410    /// contains subranges that are also contiguous ranges of glyph ids, we will
411    /// split those subranges into separate segments.
412    fn next_possible_segment(&mut self) -> Option<Format4Segment> {
413        if self.seg_start == self.mappings.len() {
414            return None;
415        }
416
417        let Some(((mut prev_cp, mut prev_gid), rest)) =
418            self.mappings[self.seg_start..].split_first()
419        else {
420            // if this is the last element, make a final segment
421            return Some(self.make_segment(0));
422        };
423
424        for (i, (cp, gid)) in rest.iter().enumerate() {
425            // first: all segments must be a contiguous range of codepoints
426            if *cp as u32 != prev_cp as u32 + 1 {
427                return Some(self.make_segment(i));
428            }
429            let next_gid_is_in_order = prev_gid.to_u32() + 1 == gid.to_u32();
430            if !next_gid_is_in_order {
431                // next: if prev gids were ordered but this one isn't, end prev segment
432                if self.gids_in_order {
433                    return Some(self.make_segment(i));
434                }
435            // and the funny case:
436            // if gids were not previously ordered but are now:
437            // - if i == 0, then this is the first item in a new segment;
438            //   set gids_in_order and continue
439            // - if i > 0, we need to back up one
440            } else if !self.gids_in_order {
441                if i == 0 {
442                    self.gids_in_order = true;
443                } else {
444                    return Some(self.make_segment(i - 1));
445                }
446            }
447            prev_cp = *cp;
448            prev_gid = *gid;
449        }
450
451        // if we're done looping then create the last segment:
452        let last_idx = self.mappings.len() - 1;
453        Some(self.make_segment(last_idx - self.seg_start))
454    }
455
456    /// Compute an efficient set of segments.
457    ///
458    /// - A segment is a contiguous range of chars.
459    /// - If all the chars in a segment share a common delta to their glyph ids,
460    ///   we can encode them much more efficiently
461    /// - it's possible for a contiguous range of chars to contain a subrange
462    ///   that share a common delta, where the overall range does not, e.g.
463    ///
464    ///   ```text
465    ///   [a b c d e f g]
466    ///   [9 3 6 7 8 2 1]
467    ///   ```
468    ///   (here a-g is a range containing the subrange c-e, which have a common
469    ///   delta.)
470    ///
471    /// This leads us to a reasonably intuitive algorithm: we start by greedily
472    /// splitting ranges up so we can consider all subranges with common deltas;
473    /// then we look at these one at a time, and combine them back together if
474    /// doing so saves space.
475    ///
476    /// This differs from the python, which starts from larger segments and then
477    /// subdivides them, but the overall idea is the same.
478    ///
479    /// <https://github.com/fonttools/fonttools/blob/f1d3e116d54f/Lib/fontTools/ttLib/tables/_c_m_a_p.py#L783>
480    fn compute(mut self) -> Vec<Format4Segment> {
481        let Some(first) = self.next_possible_segment() else {
482            return Default::default();
483        };
484
485        let mut result = vec![first];
486
487        // now we want to collect the segments, combining smaller segments where
488        // that leads to a size savings.
489        let mut next = self.next_possible_segment();
490
491        while let Some(current) = next.take() {
492            next = self.next_possible_segment();
493            let prev = result.last_mut().unwrap();
494            if current.should_combine(prev, next.as_ref()) {
495                *prev = prev.combine(&current);
496                continue;
497            }
498
499            result.push(current);
500        }
501        result
502    }
503}
504
505impl Cmap4 {
506    fn compute_length(&self) -> u16 {
507        // https://learn.microsoft.com/en-us/typography/opentype/spec/cmap#format-4-segment-mapping-to-delta-values
508        // there are always 8 u16 fields
509        const FIXED_SIZE: usize = 8 * u16::RAW_BYTE_LEN;
510        const PER_SEGMENT_LEN: usize = 4 * u16::RAW_BYTE_LEN;
511
512        let segment_len = self.end_code.len() * PER_SEGMENT_LEN;
513        let gid_len = self.glyph_id_array.len() * u16::RAW_BYTE_LEN;
514
515        (FIXED_SIZE + segment_len + gid_len)
516            .try_into()
517            .expect("cmap4 overflow")
518    }
519
520    fn compute_search_range(&self) -> u16 {
521        SearchRange::compute(self.end_code.len(), u16::RAW_BYTE_LEN).search_range
522    }
523
524    fn compute_entry_selector(&self) -> u16 {
525        SearchRange::compute(self.end_code.len(), u16::RAW_BYTE_LEN).entry_selector
526    }
527
528    fn compute_range_shift(&self) -> u16 {
529        SearchRange::compute(self.end_code.len(), u16::RAW_BYTE_LEN).range_shift
530    }
531}
532
533impl Cmap12 {
534    fn compute_length(&self) -> u32 {
535        // https://learn.microsoft.com/en-us/typography/opentype/spec/cmap#format-12-segmented-coverage
536        const FIXED_SIZE: usize = 2 * u16::RAW_BYTE_LEN + 3 * u32::RAW_BYTE_LEN;
537        const PER_SEGMENT_LEN: usize = 3 * u32::RAW_BYTE_LEN;
538
539        (FIXED_SIZE + PER_SEGMENT_LEN * self.groups.len())
540            .try_into()
541            .unwrap()
542    }
543}
544
545#[cfg(test)]
546mod tests {
547    use std::ops::RangeInclusive;
548
549    use font_types::GlyphId;
550    use read_fonts::{
551        tables::cmap::{Cmap, CmapSubtable, PlatformId},
552        FontData, FontRead,
553    };
554
555    use crate::{
556        dump_table,
557        tables::cmap::{
558            self as write, CmapConflict, UNICODE_BMP_ENCODING, UNICODE_FULL_REPERTOIRE_ENCODING,
559            WINDOWS_BMP_ENCODING, WINDOWS_FULL_REPERTOIRE_ENCODING,
560        },
561    };
562
563    use super::{Cmap12, SequentialMapGroup};
564
565    fn assert_generates_simple_cmap(mappings: Vec<(char, GlyphId)>) {
566        let cmap = write::Cmap::from_mappings(mappings).unwrap();
567
568        let bytes = dump_table(&cmap).unwrap();
569        let font_data = FontData::new(&bytes);
570        let cmap = Cmap::read(font_data).unwrap();
571
572        assert_eq!(2, cmap.encoding_records().len(), "{cmap:?}");
573        assert_eq!(
574            vec![
575                (PlatformId::Unicode, UNICODE_BMP_ENCODING),
576                (PlatformId::Windows, WINDOWS_BMP_ENCODING)
577            ],
578            cmap.encoding_records()
579                .iter()
580                .map(|er| (er.platform_id(), er.encoding_id()))
581                .collect::<Vec<_>>(),
582            "{cmap:?}"
583        );
584
585        for encoding_record in cmap.encoding_records() {
586            let CmapSubtable::Format4(cmap4) = encoding_record.subtable(font_data).unwrap() else {
587                panic!("Expected a cmap4 in {encoding_record:?}");
588            };
589
590            // The spec example says entry_selector 4 but the calculation it gives seems to yield 2 (?)
591            assert_eq!(
592                (8, 8, 2, 0),
593                (
594                    cmap4.seg_count_x2(),
595                    cmap4.search_range(),
596                    cmap4.entry_selector(),
597                    cmap4.range_shift()
598                )
599            );
600            assert_eq!(cmap4.start_code(), &[10u16, 30u16, 153u16, 0xffffu16]);
601            assert_eq!(cmap4.end_code(), &[20u16, 90u16, 480u16, 0xffffu16]);
602            // The example starts at gid 1, we're starting at 0
603            assert_eq!(cmap4.id_delta(), &[-10i16, -19i16, -81i16, 1i16]);
604            assert_eq!(cmap4.id_range_offsets(), &[0u16, 0u16, 0u16, 0u16]);
605        }
606    }
607
608    fn simple_cmap_mappings() -> Vec<(char, GlyphId)> {
609        (10..=20)
610            .chain(30..=90)
611            .chain(153..=480)
612            .enumerate()
613            .map(|(idx, codepoint)| (char::from_u32(codepoint).unwrap(), GlyphId::new(idx as u32)))
614            .collect()
615    }
616
617    // https://learn.microsoft.com/en-us/typography/opentype/spec/cmap#format-4-segment-mapping-to-delta-values
618    // "map characters 10-20, 30-90, and 153-480 onto a contiguous range of glyph indices"
619    #[test]
620    fn generate_simple_cmap4() {
621        let mappings = simple_cmap_mappings();
622        assert_generates_simple_cmap(mappings);
623    }
624
625    #[test]
626    fn generate_cmap4_out_of_order_input() {
627        let mut ordered = simple_cmap_mappings();
628        let mut disordered = Vec::new();
629        while !ordered.is_empty() {
630            if ordered.len() % 2 == 0 {
631                disordered.insert(0, ordered.remove(0));
632            } else {
633                disordered.push(ordered.remove(0));
634            }
635        }
636        assert_ne!(ordered, disordered);
637        assert_generates_simple_cmap(disordered);
638    }
639
640    #[test]
641    fn generate_cmap4_large_values() {
642        let mut mappings = simple_cmap_mappings();
643        // Example from Texturina.
644        let codepoint = char::from_u32(0xa78b).unwrap();
645        let gid = GlyphId::new(153);
646        mappings.push((codepoint, gid));
647
648        let cmap = write::Cmap::from_mappings(mappings).unwrap();
649
650        let bytes = dump_table(&cmap).unwrap();
651        let font_data = FontData::new(&bytes);
652        let cmap = Cmap::read(font_data).unwrap();
653        assert_eq!(cmap.map_codepoint(codepoint), Some(gid));
654    }
655
656    #[test]
657    fn bytes_are_reused() {
658        // We emit extra encoding records assuming it's cheap. Make sure.
659        let mappings = simple_cmap_mappings();
660        let cmap_both = write::Cmap::from_mappings(mappings).unwrap();
661        assert_eq!(2, cmap_both.encoding_records.len(), "{cmap_both:?}");
662
663        let bytes_for_both = dump_table(&cmap_both).unwrap().len();
664
665        for i in 0..cmap_both.encoding_records.len() {
666            let mut cmap = cmap_both.clone();
667            cmap.encoding_records.remove(i);
668            let bytes_for_one = dump_table(&cmap).unwrap().len();
669            assert_eq!(bytes_for_one + 8, bytes_for_both);
670        }
671    }
672
673    fn non_bmp_cmap_mappings() -> Vec<(char, GlyphId)> {
674        // contains four sequential map groups
675        vec![
676            // first group
677            ('\u{1f12f}', GlyphId::new(481)),
678            ('\u{1f130}', GlyphId::new(482)),
679            // char 0x1f131 skipped, starts second group
680            ('\u{1f132}', GlyphId::new(483)),
681            ('\u{1f133}', GlyphId::new(484)),
682            // gid 485 skipped, starts third group
683            ('\u{1f134}', GlyphId::new(486)),
684            // char 0x1f135 skipped, starts fourth group. identical duplicate bindings are fine
685            ('\u{1f136}', GlyphId::new(488)),
686            ('\u{1f136}', GlyphId::new(488)),
687        ]
688    }
689
690    fn bmp_and_non_bmp_cmap_mappings() -> Vec<(char, GlyphId)> {
691        let mut mappings = simple_cmap_mappings();
692        mappings.extend(non_bmp_cmap_mappings());
693        mappings
694    }
695
696    fn assert_cmap12_groups(
697        font_data: FontData,
698        cmap: &Cmap,
699        record_index: usize,
700        expected: &[(u32, u32, u32)],
701    ) {
702        let rec = &cmap.encoding_records()[record_index];
703        let CmapSubtable::Format12(subtable) = rec.subtable(font_data).unwrap() else {
704            panic!("Expected a cmap12 in {rec:?}");
705        };
706        let groups = subtable
707            .groups()
708            .iter()
709            .map(|g| (g.start_char_code(), g.end_char_code(), g.start_glyph_id()))
710            .collect::<Vec<_>>();
711        assert_eq!(groups.len(), expected.len());
712        assert_eq!(groups, expected);
713    }
714
715    #[test]
716    fn generate_cmap4_and_12() {
717        let mappings = bmp_and_non_bmp_cmap_mappings();
718
719        let cmap = write::Cmap::from_mappings(mappings).unwrap();
720
721        let bytes = dump_table(&cmap).unwrap();
722        let font_data = FontData::new(&bytes);
723        let cmap = Cmap::read(font_data).unwrap();
724
725        assert_eq!(4, cmap.encoding_records().len(), "{cmap:?}");
726        assert_eq!(
727            vec![
728                (PlatformId::Unicode, UNICODE_BMP_ENCODING),
729                (PlatformId::Unicode, UNICODE_FULL_REPERTOIRE_ENCODING),
730                (PlatformId::Windows, WINDOWS_BMP_ENCODING),
731                (PlatformId::Windows, WINDOWS_FULL_REPERTOIRE_ENCODING)
732            ],
733            cmap.encoding_records()
734                .iter()
735                .map(|er| (er.platform_id(), er.encoding_id()))
736                .collect::<Vec<_>>(),
737            "{cmap:?}"
738        );
739
740        let encoding_records = cmap.encoding_records();
741        let first_rec = &encoding_records[0];
742        assert!(
743            matches!(
744                first_rec.subtable(font_data).unwrap(),
745                CmapSubtable::Format4(_)
746            ),
747            "Expected a cmap4 in {first_rec:?}"
748        );
749
750        // (start_char_code, end_char_code, start_glyph_id)
751        let expected_groups = vec![
752            (10, 20, 0),
753            (30, 90, 11),
754            (153, 480, 72),
755            (0x1f12f, 0x1f130, 481),
756            (0x1f132, 0x1f133, 483),
757            (0x1f134, 0x1f134, 486),
758            (0x1f136, 0x1f136, 488),
759        ];
760        assert_cmap12_groups(font_data, &cmap, 1, &expected_groups);
761        assert_cmap12_groups(font_data, &cmap, 3, &expected_groups);
762    }
763
764    #[test]
765    fn generate_cmap12_only() {
766        let mappings = non_bmp_cmap_mappings();
767
768        let cmap = write::Cmap::from_mappings(mappings).unwrap();
769
770        let bytes = dump_table(&cmap).unwrap();
771        let font_data = FontData::new(&bytes);
772        let cmap = Cmap::read(font_data).unwrap();
773
774        assert_eq!(2, cmap.encoding_records().len(), "{cmap:?}");
775        assert_eq!(
776            vec![
777                (PlatformId::Unicode, UNICODE_FULL_REPERTOIRE_ENCODING),
778                (PlatformId::Windows, WINDOWS_FULL_REPERTOIRE_ENCODING)
779            ],
780            cmap.encoding_records()
781                .iter()
782                .map(|er| (er.platform_id(), er.encoding_id()))
783                .collect::<Vec<_>>(),
784            "{cmap:?}"
785        );
786
787        // (start_char_code, end_char_code, start_glyph_id)
788        let expected_groups = vec![
789            (0x1f12f, 0x1f130, 481),
790            (0x1f132, 0x1f133, 483),
791            (0x1f134, 0x1f134, 486),
792            (0x1f136, 0x1f136, 488),
793        ];
794        assert_cmap12_groups(font_data, &cmap, 0, &expected_groups);
795        assert_cmap12_groups(font_data, &cmap, 1, &expected_groups);
796    }
797
798    #[test]
799    fn multiple_mappings_fails() {
800        let mut mappings = non_bmp_cmap_mappings();
801        // add an additional mapping to a different glyphId
802        let (ch, gid1) = mappings[0];
803        let gid2 = GlyphId::new(gid1.to_u32() + 1);
804        mappings.push((ch, gid2));
805
806        let result = write::Cmap::from_mappings(mappings);
807
808        assert_eq!(result, Err(CmapConflict { ch, gid1, gid2 }))
809    }
810
811    struct MappingBuilder {
812        mappings: Vec<(char, GlyphId)>,
813        next_gid: u16,
814    }
815
816    impl Default for MappingBuilder {
817        fn default() -> Self {
818            Self {
819                mappings: Default::default(),
820                next_gid: 1,
821            }
822        }
823    }
824
825    impl MappingBuilder {
826        fn extend(mut self, range: impl IntoIterator<Item = char>) -> Self {
827            for c in range {
828                let gid = GlyphId::new(self.next_gid as _);
829                self.mappings.push((c, gid));
830                self.next_gid += 1;
831            }
832            self
833        }
834
835        // compute the segments for the mapping
836        fn compute(&mut self) -> Vec<RangeInclusive<char>> {
837            self.mappings.sort();
838            super::Format4SegmentComputer::new(&self.mappings)
839                .compute()
840                .into_iter()
841                .map(|seg| self.mappings[seg.start_ix].0..=self.mappings[seg.end_ix].0)
842                .collect()
843        }
844
845        fn build(mut self) -> Vec<(char, GlyphId)> {
846            self.mappings.sort();
847            self.mappings
848        }
849    }
850
851    #[test]
852    fn f4_segments_simple() {
853        let mut one_big_discontiguous_mapping = MappingBuilder::default().extend(('a'..='z').rev());
854        assert_eq!(one_big_discontiguous_mapping.compute(), ['a'..='z']);
855    }
856
857    #[test]
858    fn f4_segments_combine_small() {
859        let mut mapping = MappingBuilder::default()
860            // backwards so gids are not contiguous
861            .extend(['e', 'd', 'c', 'b', 'a'])
862            // these two contiguous ranges aren't worth the cost, should merge
863            // into the first and last respectively
864            .extend('f'..='g')
865            .extend('m'..='n')
866            .extend(('o'..='z').rev());
867
868        assert_eq!(mapping.compute(), ['a'..='g', 'm'..='z']);
869    }
870
871    #[test]
872    fn f4_segments_keep() {
873        let mut mapping = MappingBuilder::default()
874            .extend('a'..='m')
875            .extend(['o', 'n']);
876
877        assert_eq!(mapping.compute(), ['a'..='m', 'n'..='o']);
878    }
879
880    fn expect_f4(mapping: &[(char, GlyphId)]) -> super::Cmap4 {
881        let format4 = super::CmapSubtable::create_format_4(mapping).unwrap();
882        let super::CmapSubtable::Format4(format4) = format4 else {
883            panic!("O_o")
884        };
885        format4
886    }
887
888    // roundtrip the mapping from read-fonts
889    fn get_read_mapping(table: &super::Cmap4) -> Vec<(char, GlyphId)> {
890        let bytes = dump_table(table).unwrap();
891        let readcmap = read_fonts::tables::cmap::Cmap4::read(bytes.as_slice().into()).unwrap();
892
893        let mut mapping = readcmap
894            .iter()
895            .map(|(c, gid)| (char::from_u32(c).unwrap(), gid))
896            .collect::<Vec<_>>();
897        // cmap4 always ends with a 65535 => notdef entry. Sanity check
898        // this and then pop it to avoid messing with tests
899        assert_eq!(mapping.pop(), Some(('\u{FFFF}', GlyphId::NOTDEF)));
900        mapping
901    }
902
903    #[test]
904    fn f4_segment_len_one_uses_delta() {
905        // if a segment is length one, we should always use the delta, since it's free.
906        let mapping = MappingBuilder::default()
907            .extend(['a', 'z', '1', '9'])
908            .build();
909
910        let format4 = expect_f4(&mapping);
911        assert_eq!(format4.end_code.len(), 5); // 4 + 0xffff
912        assert!(format4.glyph_id_array.is_empty());
913        assert!(format4.id_delta.iter().all(|d| *d != 0));
914    }
915
916    #[test]
917    fn f4_efficiency() {
918        // one of these ranges should use id_delta, the other should use glyph id array
919        let mapping = MappingBuilder::default()
920            .extend('A'..='Z')
921            .extend(('a'..='z').rev())
922            .build();
923
924        let format4 = expect_f4(&mapping);
925
926        assert_eq!(
927            format4.start_code,
928            ['A' as u32 as u16, 'a' as u32 as u16, 0xffff]
929        );
930
931        assert_eq!(
932            format4.end_code,
933            ['Z' as u32 as u16, 'z' as u32 as u16, 0xffff]
934        );
935
936        assert_eq!(format4.id_delta, [-64, 0, 1]);
937        assert_eq!(format4.id_range_offsets, [0, 4, 0]);
938
939        let read_mapping = get_read_mapping(&format4);
940        assert_eq!(mapping.len(), read_mapping.len());
941        assert!(mapping == read_mapping);
942    }
943
944    #[test]
945    fn f4_kinda_real_world() {
946        // based on the first few hundred glyphs of oswald
947        let mapping = MappingBuilder::default()
948            .extend(['\r']) // CR
949            .extend('\x20'..='\x7e') // ascii space to tilde
950            .extend('\u{a0}'..='\u{ac}') // nbspace to logical not
951            .extend('\u{ae}'..='\u{17f}') // registered to long s
952            .extend(['\u{18f}', '\u{192}'])
953            .build();
954
955        let format4 = expect_f4(&mapping);
956        // we added 3 ranges + 3 individual glyphs above, + the final 0xffff
957        assert_eq!(format4.end_code.len(), 7);
958        let read_mapping = get_read_mapping(&format4);
959
960        assert_eq!(mapping.len(), read_mapping.len());
961        assert!(mapping == read_mapping);
962    }
963
964    #[test]
965    // a small ordered segment between two larger unordered segments;
966    // merging this correctly requires us to consider the next segment as well
967    fn f4_sandwich_segment() {
968        let mapping = MappingBuilder::default()
969            .extend(['\r'])
970            .extend(('\x20'..='\x27').rev()) // cost = 8*2 + 8 = 24
971            .extend('\x28'..='\x2c') // cost = 8
972            .extend(('\x2d'..='\x34').rev()) // cost = 6*2 + 8 = 20
973            // combined =
974            // (8 + 5 + 6) * 2 + 8 = 46
975            .extend('\x35'..='\x3e')
976            .build();
977
978        let format4 = expect_f4(&mapping);
979        assert_eq!(format4.end_code.len(), 4);
980    }
981
982    // test that we correctly encode array lengths exceeding u16::MAX
983    #[test]
984    fn cmap12_length_calculation() {
985        let more_than_16_bits = u16::MAX as u32 + 5;
986        let groups = (0..more_than_16_bits)
987            .map(|i| SequentialMapGroup::new(i, i, i))
988            .collect();
989        let cmap12 = Cmap12::new(0, groups);
990        let bytes = crate::dump_table(&cmap12).unwrap();
991        let read_it_back = Cmap12::read(bytes.as_slice().into()).unwrap();
992        assert_eq!(read_it_back.groups.len() as u32, more_than_16_bits);
993    }
994}