Skip to main content

pdf_font/cmap/
mod.rs

/*!
A parser for cmap files, as they are found in PDFs.

This crate provides a parser for cmap files and allows you to:
- Map character codes from text-showing operators to CIDs.
- Map CIDs to Unicode characters or strings.

## Safety
This crate forbids unsafe code via a crate-level attribute.
*/
11
12#[cfg(feature = "embed-cmaps")]
13mod bcmap;
14mod parse;
15
16#[cfg(feature = "embed-cmaps")]
17pub use bcmap::load_embedded;
18
/// Look up an embedded binary cmap by name.
///
/// This is the fallback that is compiled when the `embed-cmaps` feature is
/// not enabled: the name is ignored and the result is always `None`.
#[cfg(not(feature = "embed-cmaps"))]
pub fn load_embedded(_name: CMapName<'_>) -> Option<&'static [u8]> {
    None
}
26
27use alloc::boxed::Box;
28use alloc::string::String;
29use alloc::vec::Vec;
30
/// A CID (Character Identifier), stored as an unsigned 32-bit integer.
pub type Cid = u32;
33
/// The name of a cmap.
///
/// Every predefined name has its own variant; the raw byte spelling of each
/// variant is given in its documentation and is what `from_bytes` /
/// `to_bytes` convert from and to. Any other name is carried verbatim in
/// [`CMapName::Custom`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CMapName<'a> {
    // ── Adobe-Japan1 (Japanese) ── Shift-JIS (RKSJ) encodings ──
    /// `83pv-RKSJ-H` — Adobe-Japan1, Mac Shift-JIS (`KanjiTalk` 6), horizontal.
    N83pvRksjH,
    /// `90ms-RKSJ-H` — Adobe-Japan1, Windows Shift-JIS (code page 932), horizontal.
    N90msRksjH,
    /// `90ms-RKSJ-V` — Adobe-Japan1, Windows Shift-JIS (code page 932), vertical.
    N90msRksjV,
    /// `90msp-RKSJ-H` — Adobe-Japan1, Windows Shift-JIS with proportional Roman, horizontal.
    N90mspRksjH,
    /// `90msp-RKSJ-V` — Adobe-Japan1, Windows Shift-JIS with proportional Roman, vertical.
    N90mspRksjV,
    /// `90pv-RKSJ-H` — Adobe-Japan1, Mac Shift-JIS (`KanjiTalk` 7), horizontal.
    N90pvRksjH,
    /// `Add-RKSJ-H` — Adobe-Japan1, Fujitsu FMR Shift-JIS, horizontal.
    AddRksjH,
    /// `Add-RKSJ-V` — Adobe-Japan1, Fujitsu FMR Shift-JIS, vertical.
    AddRksjV,

    // ── Adobe UCS2 (CID → Unicode) ──
    // Note that these are not part of the predefined CMaps, but we use them
    // so we can more easily map CIDs back to Unicode.
    /// `Adobe-CNS1-UCS2` — Adobe-CNS1, CID-to-Unicode mapping.
    AdobeCns1Ucs2,
    /// `Adobe-GB1-UCS2` — Adobe-GB1, CID-to-Unicode mapping.
    AdobeGb1Ucs2,
    /// `Adobe-Japan1-UCS2` — Adobe-Japan1, CID-to-Unicode mapping.
    AdobeJapan1Ucs2,
    /// `Adobe-Korea1-UCS2` — Adobe-Korea1, CID-to-Unicode mapping.
    AdobeKorea1Ucs2,

    // ── Adobe-CNS1 (Traditional Chinese) ── Big Five encodings ──
    /// `B5pc-H` — Adobe-CNS1, Mac Big Five (`ETen` extensions), horizontal.
    B5pcH,
    /// `B5pc-V` — Adobe-CNS1, Mac Big Five (`ETen` extensions), vertical.
    B5pcV,
    /// `CNS-EUC-H` — Adobe-CNS1, CNS 11643 EUC encoding, horizontal.
    CnsEucH,
    /// `CNS-EUC-V` — Adobe-CNS1, CNS 11643 EUC encoding, vertical.
    CnsEucV,
    /// `ETen-B5-H` — Adobe-CNS1, `ETen` Big Five extensions, horizontal.
    ETenB5H,
    /// `ETen-B5-V` — Adobe-CNS1, `ETen` Big Five extensions, vertical.
    ETenB5V,
    /// `ETenms-B5-H` — Adobe-CNS1, `ETen` Big Five with Microsoft symbol extensions, horizontal.
    ETenmsB5H,
    /// `ETenms-B5-V` — Adobe-CNS1, `ETen` Big Five with Microsoft symbol extensions, vertical.
    ETenmsB5V,

    // ── Adobe-Japan1 (Japanese) ── EUC and extended Shift-JIS encodings ──
    /// `EUC-H` — Adobe-Japan1, JIS X 0208 EUC-JP encoding, horizontal.
    EucH,
    /// `EUC-V` — Adobe-Japan1, JIS X 0208 EUC-JP encoding, vertical.
    EucV,
    /// `Ext-RKSJ-H` — Adobe-Japan1, Shift-JIS with NEC/IBM extensions, horizontal.
    ExtRksjH,
    /// `Ext-RKSJ-V` — Adobe-Japan1, Shift-JIS with NEC/IBM extensions, vertical.
    ExtRksjV,

    // ── Adobe-GB1 (Simplified Chinese) ──
    /// `GB-EUC-H` — Adobe-GB1, GB 2312-80 EUC encoding, horizontal.
    GbEucH,
    /// `GB-EUC-V` — Adobe-GB1, GB 2312-80 EUC encoding, vertical.
    GbEucV,
    /// `GBK-EUC-H` — Adobe-GB1, GBK encoding (Microsoft code page 936), horizontal.
    GbkEucH,
    /// `GBK-EUC-V` — Adobe-GB1, GBK encoding (Microsoft code page 936), vertical.
    GbkEucV,
    /// `GBK2K-H` — Adobe-GB1, GB 18030-2000 encoding, horizontal.
    Gbk2kH,
    /// `GBK2K-V` — Adobe-GB1, GB 18030-2000 encoding, vertical.
    Gbk2kV,
    /// `GBKp-EUC-H` — Adobe-GB1, GBK with proportional Roman, horizontal.
    GbkpEucH,
    /// `GBKp-EUC-V` — Adobe-GB1, GBK with proportional Roman, vertical.
    GbkpEucV,
    /// `GBpc-EUC-H` — Adobe-GB1, Mac GB 2312 (simplified) EUC encoding, horizontal.
    GbpcEucH,
    /// `GBpc-EUC-V` — Adobe-GB1, Mac GB 2312 (simplified) EUC encoding, vertical.
    GbpcEucV,

    // ── Adobe-Japan1 (Japanese) ── JIS encoding ──
    /// `H` — Adobe-Japan1, JIS X 0208 row-cell encoding, horizontal.
    H,

    // ── Adobe-CNS1 (Traditional Chinese) ── Hong Kong SCS ──
    /// `HKscs-B5-H` — Adobe-CNS1, Hong Kong SCS (Big Five with HKSCS extensions), horizontal.
    HKscsB5H,
    /// `HKscs-B5-V` — Adobe-CNS1, Hong Kong SCS (Big Five with HKSCS extensions), vertical.
    HKscsB5V,

    // ── Adobe-Identity ──
    /// `Identity-H` — Adobe-Identity, two-byte identity mapping, horizontal.
    /// Character codes map directly to CIDs (i.e. CID = character code).
    IdentityH,
    /// `Identity-V` — Adobe-Identity, two-byte identity mapping, vertical.
    /// Character codes map directly to CIDs (i.e. CID = character code).
    IdentityV,

    // ── Adobe-Korea1 (Korean) ──
    /// `KSC-EUC-H` — Adobe-Korea1, KS X 1001:1992 EUC-KR encoding, horizontal.
    KscEucH,
    /// `KSC-EUC-V` — Adobe-Korea1, KS X 1001:1992 EUC-KR encoding, vertical.
    KscEucV,
    /// `KSCms-UHC-H` — Adobe-Korea1, Microsoft UHC (Unified Hangul Code, code page 949), horizontal.
    KscmsUhcH,
    /// `KSCms-UHC-HW-H` — Adobe-Korea1, Microsoft UHC with half-width Roman, horizontal.
    KscmsUhcHwH,
    /// `KSCms-UHC-HW-V` — Adobe-Korea1, Microsoft UHC with half-width Roman, vertical.
    KscmsUhcHwV,
    /// `KSCms-UHC-V` — Adobe-Korea1, Microsoft UHC (Unified Hangul Code, code page 949), vertical.
    KscmsUhcV,
    /// `KSCpc-EUC-H` — Adobe-Korea1, Mac KS X 1001:1992 EUC-KR encoding, horizontal.
    KscpcEucH,

    // ── Adobe-CNS1 (Traditional Chinese) ── Unicode encodings ──
    /// `UniCNS-UCS2-H` — Adobe-CNS1, Unicode UCS-2 encoding, horizontal.
    UniCnsUcs2H,
    /// `UniCNS-UCS2-V` — Adobe-CNS1, Unicode UCS-2 encoding, vertical.
    UniCnsUcs2V,
    /// `UniCNS-UTF16-H` — Adobe-CNS1, Unicode UTF-16 encoding, horizontal.
    UniCnsUtf16H,
    /// `UniCNS-UTF16-V` — Adobe-CNS1, Unicode UTF-16 encoding, vertical.
    UniCnsUtf16V,

    // ── Adobe-GB1 (Simplified Chinese) ── Unicode encodings ──
    /// `UniGB-UCS2-H` — Adobe-GB1, Unicode UCS-2 encoding, horizontal.
    UniGbUcs2H,
    /// `UniGB-UCS2-V` — Adobe-GB1, Unicode UCS-2 encoding, vertical.
    UniGbUcs2V,
    /// `UniGB-UTF16-H` — Adobe-GB1, Unicode UTF-16 encoding, horizontal.
    UniGbUtf16H,
    /// `UniGB-UTF16-V` — Adobe-GB1, Unicode UTF-16 encoding, vertical.
    UniGbUtf16V,

    // ── Adobe-Japan1 (Japanese) ── Unicode encodings ──
    /// `UniJIS-UCS2-H` — Adobe-Japan1, Unicode UCS-2 encoding, horizontal.
    UniJisUcs2H,
    /// `UniJIS-UCS2-HW-H` — Adobe-Japan1, Unicode UCS-2 with half-width Roman, horizontal.
    UniJisUcs2HwH,
    /// `UniJIS-UCS2-HW-V` — Adobe-Japan1, Unicode UCS-2 with half-width Roman, vertical.
    UniJisUcs2HwV,
    /// `UniJIS-UCS2-V` — Adobe-Japan1, Unicode UCS-2 encoding, vertical.
    UniJisUcs2V,
    /// `UniJIS-UTF16-H` — Adobe-Japan1, Unicode UTF-16 encoding, horizontal.
    UniJisUtf16H,
    /// `UniJIS-UTF16-V` — Adobe-Japan1, Unicode UTF-16 encoding, vertical.
    UniJisUtf16V,

    // ── Adobe-Korea1 (Korean) ── Unicode encodings ──
    /// `UniKS-UCS2-H` — Adobe-Korea1, Unicode UCS-2 encoding, horizontal.
    UniKsUcs2H,
    /// `UniKS-UCS2-V` — Adobe-Korea1, Unicode UCS-2 encoding, vertical.
    UniKsUcs2V,
    /// `UniKS-UTF16-H` — Adobe-Korea1, Unicode UTF-16 encoding, horizontal.
    UniKsUtf16H,
    /// `UniKS-UTF16-V` — Adobe-Korea1, Unicode UTF-16 encoding, vertical.
    UniKsUtf16V,

    // ── Adobe-Japan1 (Japanese) ── JIS encoding ──
    /// `V` — Adobe-Japan1, JIS X 0208 row-cell encoding, vertical.
    V,

    /// A custom (non-predefined) `CMap` name, borrowed as raw bytes.
    Custom(&'a [u8]),
}
202
203impl<'a> CMapName<'a> {
204    /// Create a `CMapType` from raw bytes.
205    pub fn from_bytes(name: &'a [u8]) -> Self {
206        match name {
207            b"83pv-RKSJ-H" => Self::N83pvRksjH,
208            b"90ms-RKSJ-H" => Self::N90msRksjH,
209            b"90ms-RKSJ-V" => Self::N90msRksjV,
210            b"90msp-RKSJ-H" => Self::N90mspRksjH,
211            b"90msp-RKSJ-V" => Self::N90mspRksjV,
212            b"90pv-RKSJ-H" => Self::N90pvRksjH,
213            b"Add-RKSJ-H" => Self::AddRksjH,
214            b"Add-RKSJ-V" => Self::AddRksjV,
215            b"Adobe-CNS1-UCS2" => Self::AdobeCns1Ucs2,
216            b"Adobe-GB1-UCS2" => Self::AdobeGb1Ucs2,
217            b"Adobe-Japan1-UCS2" => Self::AdobeJapan1Ucs2,
218            b"Adobe-Korea1-UCS2" => Self::AdobeKorea1Ucs2,
219            b"B5pc-H" => Self::B5pcH,
220            b"B5pc-V" => Self::B5pcV,
221            b"CNS-EUC-H" => Self::CnsEucH,
222            b"CNS-EUC-V" => Self::CnsEucV,
223            b"ETen-B5-H" => Self::ETenB5H,
224            b"ETen-B5-V" => Self::ETenB5V,
225            b"ETenms-B5-H" => Self::ETenmsB5H,
226            b"ETenms-B5-V" => Self::ETenmsB5V,
227            b"EUC-H" => Self::EucH,
228            b"EUC-V" => Self::EucV,
229            b"Ext-RKSJ-H" => Self::ExtRksjH,
230            b"Ext-RKSJ-V" => Self::ExtRksjV,
231            b"GB-EUC-H" => Self::GbEucH,
232            b"GB-EUC-V" => Self::GbEucV,
233            b"GBK-EUC-H" => Self::GbkEucH,
234            b"GBK-EUC-V" => Self::GbkEucV,
235            b"GBK2K-H" => Self::Gbk2kH,
236            b"GBK2K-V" => Self::Gbk2kV,
237            b"GBKp-EUC-H" => Self::GbkpEucH,
238            b"GBKp-EUC-V" => Self::GbkpEucV,
239            b"GBpc-EUC-H" => Self::GbpcEucH,
240            b"GBpc-EUC-V" => Self::GbpcEucV,
241            b"H" => Self::H,
242            b"HKscs-B5-H" => Self::HKscsB5H,
243            b"HKscs-B5-V" => Self::HKscsB5V,
244            b"Identity-H" => Self::IdentityH,
245            b"Identity-V" => Self::IdentityV,
246            b"KSC-EUC-H" => Self::KscEucH,
247            b"KSC-EUC-V" => Self::KscEucV,
248            b"KSCms-UHC-H" => Self::KscmsUhcH,
249            b"KSCms-UHC-HW-H" => Self::KscmsUhcHwH,
250            b"KSCms-UHC-HW-V" => Self::KscmsUhcHwV,
251            b"KSCms-UHC-V" => Self::KscmsUhcV,
252            b"KSCpc-EUC-H" => Self::KscpcEucH,
253            b"UniCNS-UCS2-H" => Self::UniCnsUcs2H,
254            b"UniCNS-UCS2-V" => Self::UniCnsUcs2V,
255            b"UniCNS-UTF16-H" => Self::UniCnsUtf16H,
256            b"UniCNS-UTF16-V" => Self::UniCnsUtf16V,
257            b"UniGB-UCS2-H" => Self::UniGbUcs2H,
258            b"UniGB-UCS2-V" => Self::UniGbUcs2V,
259            b"UniGB-UTF16-H" => Self::UniGbUtf16H,
260            b"UniGB-UTF16-V" => Self::UniGbUtf16V,
261            b"UniJIS-UCS2-H" => Self::UniJisUcs2H,
262            b"UniJIS-UCS2-HW-H" => Self::UniJisUcs2HwH,
263            b"UniJIS-UCS2-HW-V" => Self::UniJisUcs2HwV,
264            b"UniJIS-UCS2-V" => Self::UniJisUcs2V,
265            b"UniJIS-UTF16-H" => Self::UniJisUtf16H,
266            b"UniJIS-UTF16-V" => Self::UniJisUtf16V,
267            b"UniKS-UCS2-H" => Self::UniKsUcs2H,
268            b"UniKS-UCS2-V" => Self::UniKsUcs2V,
269            b"UniKS-UTF16-H" => Self::UniKsUtf16H,
270            b"UniKS-UTF16-V" => Self::UniKsUtf16V,
271            b"V" => Self::V,
272            _ => Self::Custom(name),
273        }
274    }
275
276    /// Convert the `CMapType` back to its raw byte representation.
277    pub fn to_bytes(&self) -> &[u8] {
278        match self {
279            Self::N83pvRksjH => b"83pv-RKSJ-H",
280            Self::N90msRksjH => b"90ms-RKSJ-H",
281            Self::N90msRksjV => b"90ms-RKSJ-V",
282            Self::N90mspRksjH => b"90msp-RKSJ-H",
283            Self::N90mspRksjV => b"90msp-RKSJ-V",
284            Self::N90pvRksjH => b"90pv-RKSJ-H",
285            Self::AddRksjH => b"Add-RKSJ-H",
286            Self::AddRksjV => b"Add-RKSJ-V",
287            Self::AdobeCns1Ucs2 => b"Adobe-CNS1-UCS2",
288            Self::AdobeGb1Ucs2 => b"Adobe-GB1-UCS2",
289            Self::AdobeJapan1Ucs2 => b"Adobe-Japan1-UCS2",
290            Self::AdobeKorea1Ucs2 => b"Adobe-Korea1-UCS2",
291            Self::B5pcH => b"B5pc-H",
292            Self::B5pcV => b"B5pc-V",
293            Self::CnsEucH => b"CNS-EUC-H",
294            Self::CnsEucV => b"CNS-EUC-V",
295            Self::ETenB5H => b"ETen-B5-H",
296            Self::ETenB5V => b"ETen-B5-V",
297            Self::ETenmsB5H => b"ETenms-B5-H",
298            Self::ETenmsB5V => b"ETenms-B5-V",
299            Self::EucH => b"EUC-H",
300            Self::EucV => b"EUC-V",
301            Self::ExtRksjH => b"Ext-RKSJ-H",
302            Self::ExtRksjV => b"Ext-RKSJ-V",
303            Self::GbEucH => b"GB-EUC-H",
304            Self::GbEucV => b"GB-EUC-V",
305            Self::GbkEucH => b"GBK-EUC-H",
306            Self::GbkEucV => b"GBK-EUC-V",
307            Self::Gbk2kH => b"GBK2K-H",
308            Self::Gbk2kV => b"GBK2K-V",
309            Self::GbkpEucH => b"GBKp-EUC-H",
310            Self::GbkpEucV => b"GBKp-EUC-V",
311            Self::GbpcEucH => b"GBpc-EUC-H",
312            Self::GbpcEucV => b"GBpc-EUC-V",
313            Self::H => b"H",
314            Self::HKscsB5H => b"HKscs-B5-H",
315            Self::HKscsB5V => b"HKscs-B5-V",
316            Self::IdentityH => b"Identity-H",
317            Self::IdentityV => b"Identity-V",
318            Self::KscEucH => b"KSC-EUC-H",
319            Self::KscEucV => b"KSC-EUC-V",
320            Self::KscmsUhcH => b"KSCms-UHC-H",
321            Self::KscmsUhcHwH => b"KSCms-UHC-HW-H",
322            Self::KscmsUhcHwV => b"KSCms-UHC-HW-V",
323            Self::KscmsUhcV => b"KSCms-UHC-V",
324            Self::KscpcEucH => b"KSCpc-EUC-H",
325            Self::UniCnsUcs2H => b"UniCNS-UCS2-H",
326            Self::UniCnsUcs2V => b"UniCNS-UCS2-V",
327            Self::UniCnsUtf16H => b"UniCNS-UTF16-H",
328            Self::UniCnsUtf16V => b"UniCNS-UTF16-V",
329            Self::UniGbUcs2H => b"UniGB-UCS2-H",
330            Self::UniGbUcs2V => b"UniGB-UCS2-V",
331            Self::UniGbUtf16H => b"UniGB-UTF16-H",
332            Self::UniGbUtf16V => b"UniGB-UTF16-V",
333            Self::UniJisUcs2H => b"UniJIS-UCS2-H",
334            Self::UniJisUcs2HwH => b"UniJIS-UCS2-HW-H",
335            Self::UniJisUcs2HwV => b"UniJIS-UCS2-HW-V",
336            Self::UniJisUcs2V => b"UniJIS-UCS2-V",
337            Self::UniJisUtf16H => b"UniJIS-UTF16-H",
338            Self::UniJisUtf16V => b"UniJIS-UTF16-V",
339            Self::UniKsUcs2H => b"UniKS-UCS2-H",
340            Self::UniKsUcs2V => b"UniKS-UCS2-V",
341            Self::UniKsUtf16H => b"UniKS-UTF16-H",
342            Self::UniKsUtf16V => b"UniKS-UTF16-V",
343            Self::V => b"V",
344            Self::Custom(name) => name,
345        }
346    }
347}
348
/// Upper bound (16) on how deeply `usecmap` references may be nested.
// NOTE(review): not referenced in the visible part of this file; presumably
// enforced by the `parse` module — confirm.
const MAX_NESTING_DEPTH: u32 = 16;
351
/// A parsed cmap.
///
/// Lookups consult the tables of this cmap first and fall back to `base`
/// when nothing matches.
#[derive(Debug, Clone)]
pub struct CMap {
    /// Character collection, name and writing mode; see [`CMap::metadata`].
    metadata: Metadata,
    // Note that we don't actually use this, because Acrobat _seems_ to ignore
    // it, too.
    _codespace_ranges: Vec<CodespaceRange>,
    /// Code → CID ranges, bucketed by the byte length of the code.
    cid_ranges: PartitionedRanges,
    /// `.notdef` ranges: every code inside a range maps to the same CID.
    notdef_ranges: PartitionedRanges,
    /// Code → UTF-16 destination ranges. Searched with a binary search, so
    /// the entries are presumably kept sorted by the parser — confirm.
    bf_entries: Vec<BfRange>,
    /// Fallback cmap consulted when a lookup misses in this one
    /// (presumably the cmap referenced via `usecmap` — confirm).
    base: Option<Box<Self>>,
}
364
365impl CMap {
366    /// Parse a cmap from raw bytes.
367    ///
368    /// The `get_cmap` callback is used to recursively resolve cmaps that
369    /// are referenced via `usecmap`.
370    pub fn parse<'a>(
371        data: &[u8],
372        get_cmap: impl Fn(CMapName<'_>) -> Option<&'a [u8]> + Clone + 'a,
373    ) -> Option<Self> {
374        parse::parse_inner(data, get_cmap, 0)
375    }
376
377    /// Create an Identity-H cmap.
378    pub fn identity_h() -> Self {
379        Self::identity(WritingMode::Horizontal, b"Identity-H")
380    }
381
382    /// Create an Identity-V cmap.
383    pub fn identity_v() -> Self {
384        Self::identity(WritingMode::Vertical, b"Identity-V")
385    }
386
387    fn identity(writing_mode: WritingMode, name: &[u8]) -> Self {
388        Self {
389            metadata: Metadata {
390                character_collection: Some(CharacterCollection {
391                    family: CidFamily::AdobeIdentity,
392                    supplement: 0,
393                }),
394                name: Some(Vec::from(name)),
395                writing_mode: Some(writing_mode),
396            },
397            _codespace_ranges: Vec::new(),
398            cid_ranges: {
399                let mut r = PartitionedRanges::new();
400                r.push(
401                    2,
402                    CidRange {
403                        range: Range {
404                            start: 0,
405                            end: 0xFFFF,
406                        },
407                        cid_start: 0,
408                    },
409                );
410                r
411            },
412            notdef_ranges: PartitionedRanges::new(),
413            bf_entries: Vec::new(),
414            base: None,
415        }
416    }
417
418    /// Return the metadata of this cmap.
419    pub fn metadata(&self) -> &Metadata {
420        &self.metadata
421    }
422
423    /// Look up the CID code of a character code.
424    ///
425    /// Returns `None` if the code does not match any range for the given byte length.
426    pub fn lookup_cid_code(&self, code: u32, byte_len: u8) -> Option<Cid> {
427        // Note that, in theory, we are supposed to first check the code space range
428        // whether the entry exists in the first place. However, from my experiments
429        // Acrobat mostly seems to ignore this, so we do that as well.
430
431        let cid_ranges = self.cid_ranges.get(byte_len)?;
432        let notdef_ranges = self.notdef_ranges.get(byte_len)?;
433
434        if let Some(entry) = find_in_ranges(cid_ranges, code) {
435            let offset = code.checked_sub(entry.range.start)?;
436            return entry.cid_start.checked_add(offset);
437        } else if let Some(entry) = find_in_ranges(notdef_ranges, code) {
438            // For `.notdef` ranges, all codes map to the same `.notdef` CID, so
439            // no adding of the offset here.
440            return Some(entry.cid_start);
441        }
442
443        // See pdfjs_bug920426, it uses bf chars for encoding the characters
444        // of a text-showing operator, so try that as well. We don't want to
445        // recurse though, only check the bf chars of this level.
446        if let Some(BfString::Char(lookup)) = self.lookup_bf_string_inner(code, false) {
447            return Some(lookup as u32);
448        }
449
450        // If we still haven't found anything, check the base cmap.
451        self.base
452            .as_ref()
453            .and_then(|b| b.lookup_cid_code(code, byte_len))
454    }
455
456    /// Look up a bf string in the cmap. This is usually
457    /// used for mapping character codes to Unicode codepoints in a
458    /// `ToUnicode` cmap.
459    ///
460    /// Returns `None` if no mapping is available.
461    pub fn lookup_bf_string(&self, code: u32) -> Option<BfString> {
462        self.lookup_bf_string_inner(code, true)
463    }
464
465    fn lookup_bf_string_inner(&self, code: u32, recurse: bool) -> Option<BfString> {
466        if let Some(entry) = find_in_ranges(&self.bf_entries, code) {
467            let offset = u16::try_from(code - entry.range.start).ok()?;
468
469            fn decode_utf16(units: &[u16]) -> Option<BfString> {
470                let mut iter = core::char::decode_utf16(units.iter().copied());
471                let first = iter.next()?.ok()?;
472
473                if iter.next().is_none() {
474                    Some(BfString::Char(first))
475                } else {
476                    let s = String::from_utf16(units).ok()?;
477                    Some(BfString::String(s))
478                }
479            }
480
481            return if offset == 0 {
482                Some(decode_utf16(&entry.dst_base)?)
483            } else {
484                let mut units = entry.dst_base.clone();
485                *units.last_mut()? = units.last()?.checked_add(offset)?;
486                Some(decode_utf16(&units)?)
487            };
488        }
489
490        if recurse {
491            self.base.as_ref()?.lookup_bf_string(code)
492        } else {
493            None
494        }
495    }
496}
497
/// Implemented by table entries that cover a range of character codes, so
/// that `find_in_ranges` can search any of them.
trait HasRange {
    /// The code range covered by this entry.
    fn range(&self) -> &Range;
}
501
502fn find_in_ranges<T: HasRange>(entries: &[T], code: u32) -> Option<&T> {
503    let idx = entries
504        .binary_search_by(|entry| {
505            let r = entry.range();
506            if code < r.start {
507                core::cmp::Ordering::Greater
508            } else if code > r.end {
509                core::cmp::Ordering::Less
510            } else {
511                core::cmp::Ordering::Equal
512            }
513        })
514        .ok()?;
515
516    Some(&entries[idx])
517}
518
/// A range with a start and end code.
///
/// Both bounds are inclusive: `find_in_ranges` treats `start` and `end`
/// themselves as part of the range.
#[derive(Debug, Clone)]
pub(crate) struct Range {
    /// First code of the range.
    pub(crate) start: u32,
    /// Last code of the range (inclusive).
    pub(crate) end: u32,
}
525
/// A range of character codes mapped to CIDs.
#[derive(Debug, Clone)]
pub struct CidRange {
    /// The character codes covered by this entry.
    pub(crate) range: Range,
    /// The CID of `range.start`. For regular ranges, code `range.start + k`
    /// maps to `cid_start + k`; for `.notdef` ranges every code maps to
    /// `cid_start` itself.
    pub(crate) cid_start: Cid,
}
532
// Makes `CidRange` searchable with `find_in_ranges`.
impl HasRange for CidRange {
    fn range(&self) -> &Range {
        &self.range
    }
}
538
/// CID ranges partitioned by the byte length of the character code.
#[derive(Debug, Clone)]
pub(crate) struct PartitionedRanges {
    /// `buckets[i]` holds the ranges for codes that are `i + 1` bytes long
    /// (byte lengths 1 through 4).
    buckets: [Vec<CidRange>; 4],
}
543
544impl PartitionedRanges {
545    pub(crate) fn new() -> Self {
546        Self {
547            buckets: [Vec::new(), Vec::new(), Vec::new(), Vec::new()],
548        }
549    }
550
551    pub(crate) fn push(&mut self, byte_len: usize, range: CidRange) {
552        if let Some(bucket) = byte_len
553            .checked_sub(1)
554            .and_then(|i| self.buckets.get_mut(i))
555        {
556            bucket.push(range);
557        }
558    }
559
560    pub(crate) fn get(&self, byte_len: u8) -> Option<&[CidRange]> {
561        let idx = (byte_len as usize).checked_sub(1)?;
562        self.buckets.get(idx).map(|v| v.as_slice())
563    }
564
565    pub(crate) fn sort(&mut self) {
566        for bucket in &mut self.buckets {
567            bucket.sort_by(|a, b| a.range.start.cmp(&b.range.start));
568        }
569    }
570}
571
/// A range of character codes mapped to UTF-16 destinations.
#[derive(Debug, Clone)]
pub(crate) struct BfRange {
    /// The character codes covered by this entry.
    pub(crate) range: Range,
    /// UTF-16 code units of the destination for `range.start`. For a later
    /// code in the range, its offset from `range.start` is added to the
    /// last code unit.
    pub(crate) dst_base: Vec<u16>,
}
577
// Makes `BfRange` searchable with `find_in_ranges`.
impl HasRange for BfRange {
    fn range(&self) -> &Range {
        &self.range
    }
}
583
/// A codespace range defining valid character code byte sequences.
#[derive(Debug, Clone)]
// The fields are stored but never read: `CMap` keeps its codespace ranges
// without consulting them during lookups.
#[allow(dead_code)]
pub(crate) struct CodespaceRange {
    /// Presumably the byte length of the codes in this range — confirm
    /// against the parser.
    pub(crate) number_bytes: u8,
    /// Presumably the lowest code of the range — confirm.
    pub(crate) low: u32,
    /// Presumably the highest code of the range — confirm.
    pub(crate) high: u32,
}
592
/// A Unicode value decoded from a cmap.
///
/// Returned by `CMap::lookup_bf_string`, which decodes the UTF-16
/// destination of a matching bf entry.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum BfString {
    /// A single character: the destination decoded to exactly one `char`.
    Char(char),
    /// A string: the destination decoded to more than one `char`.
    String(String),
}
601
/// Metadata extracted from a cmap file.
///
/// Every field is optional; presumably a field is `None` when the cmap file
/// does not define it — confirm against the parser.
#[derive(Debug, Clone)]
pub struct Metadata {
    /// The referenced character collection.
    pub character_collection: Option<CharacterCollection>,
    /// The cmap name, as raw bytes.
    pub name: Option<Vec<u8>>,
    /// The writing mode.
    pub writing_mode: Option<WritingMode>,
}
612
/// The registry+ordering family of a CID character collection.
///
/// The predefined families all use the registry `Adobe`; any other
/// registry/ordering pair is kept verbatim in [`CidFamily::Custom`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CidFamily {
    /// Adobe-Japan1 (Japanese)
    AdobeJapan1,
    /// Adobe-GB1 (Simplified Chinese)
    AdobeGB1,
    /// Adobe-CNS1 (Traditional Chinese)
    AdobeCNS1,
    /// Adobe-Korea1 (Korean)
    AdobeKorea1,
    /// Adobe-Identity
    AdobeIdentity,
    /// A non-predefined registry/ordering pair.
    Custom {
        /// The registry name.
        registry: Vec<u8>,
        /// The ordering name.
        ordering: Vec<u8>,
    },
}
634
635impl CidFamily {
636    /// Create a `CidFamily` from raw registry and ordering byte strings.
637    pub fn from_registry_ordering(registry: &[u8], ordering: &[u8]) -> Self {
638        if registry == b"Adobe" {
639            match ordering {
640                b"Japan1" => Self::AdobeJapan1,
641                b"GB1" => Self::AdobeGB1,
642                b"CNS1" => Self::AdobeCNS1,
643                b"Korea1" => Self::AdobeKorea1,
644                b"Identity" => Self::AdobeIdentity,
645                _ => Self::Custom {
646                    registry: registry.to_vec(),
647                    ordering: ordering.to_vec(),
648                },
649            }
650        } else {
651            Self::Custom {
652                registry: registry.to_vec(),
653                ordering: ordering.to_vec(),
654            }
655        }
656    }
657
658    /// Return the predefined UTF-16 `CMap` that maps Unicode code points to CIDs
659    /// for this family, if one exists.
660    pub fn unicode_cmap(&self) -> Option<CMapName<'static>> {
661        match self {
662            Self::AdobeJapan1 => Some(CMapName::UniJisUtf16H),
663            Self::AdobeGB1 => Some(CMapName::UniGbUtf16H),
664            Self::AdobeCNS1 => Some(CMapName::UniCnsUtf16H),
665            Self::AdobeKorea1 => Some(CMapName::UniKsUtf16H),
666            Self::AdobeIdentity | Self::Custom { .. } => None,
667        }
668    }
669
670    /// Return the `UCS2` cmap corresponding to the given character collection.
671    pub fn ucs2_cmap(&self) -> Option<CMapName<'static>> {
672        match self {
673            Self::AdobeJapan1 => Some(CMapName::AdobeJapan1Ucs2),
674            Self::AdobeGB1 => Some(CMapName::AdobeGb1Ucs2),
675            Self::AdobeCNS1 => Some(CMapName::AdobeCns1Ucs2),
676            Self::AdobeKorea1 => Some(CMapName::AdobeKorea1Ucs2),
677            Self::AdobeIdentity | Self::Custom { .. } => None,
678        }
679    }
680
681    /// Returns `true` for the four predefined CJK character collections
682    /// (Simplified Chinese, Traditional Chinese, Japanese, Korean).
683    pub fn is_cjk(&self) -> bool {
684        matches!(
685            self,
686            Self::AdobeGB1 | Self::AdobeCNS1 | Self::AdobeJapan1 | Self::AdobeKorea1
687        )
688    }
689}
690
/// A CID character collection identifying the character set and ordering.
///
/// Consists of a registry+ordering family plus a supplement number.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CharacterCollection {
    /// The registry+ordering family.
    pub family: CidFamily,
    /// The supplement number.
    pub supplement: i32,
}
699
/// The writing mode of a cmap.
///
/// [`WritingMode::Horizontal`] is the `Default` value.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum WritingMode {
    /// Horizontal writing mode.
    #[default]
    Horizontal,
    /// Vertical writing mode.
    Vertical,
}
709
710#[cfg(test)]
711mod tests {
712    use super::*;
713
714    // Note that those cmaps might not be completely valid according to the rules
715    // of cmap/Postscript, but since our parser is very lenient and doesn't run a real
716    // interpreter we can shorten them by a lot.
717
718    const PREAMBLE: &[u8] = br#"/CIDSystemInfo 3 dict dup begin
719  /Registry (Adobe) def
720  /Ordering (Japan1) def
721  /Supplement 0 def
722end def
723/CMapName /Test def
724/WMode 0 def
7252 begincodespacerange
726<00> <FF>
727<0000> <FFFF>
728endcodespacerange
729"#;
730
731    fn parse_with_preamble(body: &[u8]) -> CMap {
732        let mut data = Vec::new();
733        data.extend_from_slice(PREAMBLE);
734        data.extend_from_slice(body);
735        CMap::parse(&data, |_| None).unwrap()
736    }
737
738    #[test]
739    fn metadata_parsing() {
740        let data = br#"
741/CIDInit /ProcSet findresource begin
74212 dict begin
743begincmap
744/CIDSystemInfo 3 dict dup begin
745  /Registry (Adobe) def
746  /Ordering (Japan1) def
747  /Supplement 6 def
748end def
749/CMapName /Adobe-Japan1-H def
750/CMapType 1 def
751/WMode 0 def
752endcmap"#;
753
754        let cmap = CMap::parse(data, |_| None).unwrap();
755        let cc = cmap.metadata().character_collection.as_ref().unwrap();
756        assert_eq!(cc.family, CidFamily::AdobeJapan1);
757        assert_eq!(cc.supplement, 6);
758        assert_eq!(
759            cmap.metadata().name.as_deref(),
760            Some(b"Adobe-Japan1-H".as_slice())
761        );
762        assert_eq!(cmap.metadata().writing_mode, Some(WritingMode::Horizontal));
763    }
764
765    #[test]
766    fn vertical_writing_mode() {
767        let data = br#"
768/CIDSystemInfo 3 dict dup begin
769  /Registry (Adobe) def
770  /Ordering (Japan1) def
771  /Supplement 6 def
772end def
773/CMapName /Adobe-Japan1-V def
774/WMode 1 def
775"#;
776
777        let cmap = CMap::parse(data, |_| None).unwrap();
778        assert_eq!(cmap.metadata().writing_mode, Some(WritingMode::Vertical));
779        assert_eq!(
780            cmap.metadata().name.as_deref(),
781            Some(b"Adobe-Japan1-V".as_slice())
782        );
783    }
784
785    #[test]
786    fn cid_range_lookup() {
787        let cmap = parse_with_preamble(
788            br#"
7893 begincidrange
790<0000> <00FF> 0
791<0100> <01FF> 256
792<8140> <817E> 633
793endcidrange
794"#,
795        );
796
797        assert_eq!(cmap.lookup_cid_code(0x0000, 2), Some(0));
798        assert_eq!(cmap.lookup_cid_code(0x0042, 2), Some(0x42));
799        assert_eq!(cmap.lookup_cid_code(0x00FF, 2), Some(0xFF));
800
801        assert_eq!(cmap.lookup_cid_code(0x0100, 2), Some(256));
802        assert_eq!(cmap.lookup_cid_code(0x01FF, 2), Some(511));
803
804        assert_eq!(cmap.lookup_cid_code(0x8140, 2), Some(633));
805        assert_eq!(cmap.lookup_cid_code(0x817E, 2), Some(633 + 62));
806    }
807
808    #[test]
809    fn cid_char_lookup() {
810        let cmap = parse_with_preamble(
811            br#"
8123 begincidchar
813<03> 1
814<04> 2
815<20> 50
816endcidchar
817"#,
818        );
819
820        assert_eq!(cmap.lookup_cid_code(0x03, 1), Some(1));
821        assert_eq!(cmap.lookup_cid_code(0x04, 1), Some(2));
822        assert_eq!(cmap.lookup_cid_code(0x20, 1), Some(50));
823    }
824
825    #[test]
826    fn lookup_miss() {
827        let cmap = parse_with_preamble(
828            br#"
8291 begincidrange
830<0100> <01FF> 0
831endcidrange
832"#,
833        );
834
835        assert_eq!(cmap.lookup_cid_code(0x00FF, 2), None);
836        assert_eq!(cmap.lookup_cid_code(0x0200, 2), None);
837        assert_eq!(cmap.lookup_cid_code(0xFFFF, 2), None);
838    }
839
840    #[test]
841    fn multiple_sections() {
842        let cmap = parse_with_preamble(
843            br#"
8442 begincidrange
845<0000> <00FF> 0
846<0100> <01FF> 256
847endcidrange
8481 begincidchar
849<0200> 600
850endcidchar
8511 begincidrange
852<8140> <817E> 633
853endcidrange
854"#,
855        );
856
857        assert_eq!(cmap.lookup_cid_code(0x0000, 2), Some(0));
858        assert_eq!(cmap.lookup_cid_code(0x0100, 2), Some(256));
859        assert_eq!(cmap.lookup_cid_code(0x0200, 2), Some(600));
860        assert_eq!(cmap.lookup_cid_code(0x8140, 2), Some(633));
861    }
862
863    #[test]
864    fn single_byte_codes() {
865        let cmap = parse_with_preamble(
866            br#"
8672 begincidrange
868<00> <7F> 0
869<80> <FF> 200
870endcidrange
871"#,
872        );
873
874        assert_eq!(cmap.lookup_cid_code(0x00, 1), Some(0));
875        assert_eq!(cmap.lookup_cid_code(0x41, 1), Some(0x41));
876        assert_eq!(cmap.lookup_cid_code(0x80, 1), Some(200));
877        assert_eq!(cmap.lookup_cid_code(0xFF, 1), Some(200 + 127));
878    }
879
880    #[test]
881    fn dict_style_cidsysteminfo() {
882        let data = br#"
883/CIDInit /ProcSet findresource begin
88410 dict begin
885begincmap
886/CIDSystemInfo
887<< /Registry (Adobe)
888/Ordering (UCS)
889/Supplement 0
890>> def
891/CMapName /Adobe-Identity-UCS def
892/CMapType 2 def
8931 begincodespacerange
894<0000> <FFFF>
895endcodespacerange
8962 beginbfchar
897<001F> <F049>
898<002A> <F055>
899endbfchar
900endcmap
901CMapName currentdict /CMap defineresource pop
902end
903end
904"#;
905        let cmap = CMap::parse(data, |_| None).unwrap();
906        let cc = cmap.metadata().character_collection.as_ref().unwrap();
907        assert_eq!(
908            cc.family,
909            CidFamily::Custom {
910                registry: b"Adobe".to_vec(),
911                ordering: b"UCS".to_vec(),
912            }
913        );
914        assert_eq!(cc.supplement, 0);
915        assert_eq!(
916            cmap.metadata().name.as_deref(),
917            Some(b"Adobe-Identity-UCS".as_slice())
918        );
919        assert_eq!(
920            cmap.lookup_bf_string(0x001F),
921            Some(BfString::Char('\u{F049}'))
922        );
923        assert_eq!(
924            cmap.lookup_bf_string(0x002A),
925            Some(BfString::Char('\u{F055}'))
926        );
927    }
928
929    #[test]
930    fn usecmap_chaining() {
931        let base_data = br#"
932/CIDSystemInfo 3 dict dup begin
933  /Registry (Adobe) def
934  /Ordering (Japan1) def
935  /Supplement 0 def
936end def
937/CMapName /Base def
938/WMode 0 def
9391 begincodespacerange
940<0000> <FFFF>
941endcodespacerange
9421 begincidrange
943<0000> <00FF> 0
944endcidrange
945"#;
946
947        let child_data = br#"
948/Base usecmap
949/CIDSystemInfo 3 dict dup begin
950  /Registry (Adobe) def
951  /Ordering (Japan1) def
952  /Supplement 0 def
953end def
954/CMapName /Child def
955/WMode 0 def
9561 begincodespacerange
957<0000> <FFFF>
958endcodespacerange
9591 begincidrange
960<0100> <01FF> 256
961endcidrange
962"#;
963
964        let cmap = CMap::parse(child_data, |name| {
965            if name.to_bytes() == b"Base" {
966                Some(base_data.as_slice())
967            } else {
968                None
969            }
970        })
971        .unwrap();
972
973        assert_eq!(cmap.lookup_cid_code(0x0100, 2), Some(256));
974        assert_eq!(cmap.lookup_cid_code(0x01FF, 2), Some(511));
975        assert_eq!(cmap.lookup_cid_code(0x0000, 2), Some(0));
976        assert_eq!(cmap.lookup_cid_code(0x00FF, 2), Some(0xFF));
977
978        assert_eq!(cmap.lookup_cid_code(0x0200, 2), None);
979    }
980
981    #[test]
982    fn usecmap_partial_override() {
983        let base_data = br#"
984/CIDSystemInfo 3 dict dup begin
985  /Registry (Adobe) def
986  /Ordering (Japan1) def
987  /Supplement 0 def
988end def
989/CMapName /Base def
990/WMode 0 def
9911 begincodespacerange
992<0000> <FFFF>
993endcodespacerange
9941 begincidrange
995<0000> <00FF> 0
996endcidrange
997"#;
998
999        let child_data = br#"
1000/Base usecmap
1001/CIDSystemInfo 3 dict dup begin
1002  /Registry (Adobe) def
1003  /Ordering (Japan1) def
1004  /Supplement 0 def
1005end def
1006/CMapName /Child def
1007/WMode 0 def
10081 begincodespacerange
1009<0000> <FFFF>
1010endcodespacerange
10111 begincidrange
1012<0040> <007F> 500
1013endcidrange
1014"#;
1015
1016        let cmap = CMap::parse(child_data, |name| {
1017            if name.to_bytes() == b"Base" {
1018                Some(base_data.as_slice())
1019            } else {
1020                None
1021            }
1022        })
1023        .unwrap();
1024
1025        assert_eq!(cmap.lookup_cid_code(0x0000, 2), Some(0));
1026        assert_eq!(cmap.lookup_cid_code(0x003F, 2), Some(0x3F));
1027        assert_eq!(cmap.lookup_cid_code(0x0040, 2), Some(500));
1028        assert_eq!(cmap.lookup_cid_code(0x007F, 2), Some(563));
1029        assert_eq!(cmap.lookup_cid_code(0x0080, 2), Some(0x80));
1030        assert_eq!(cmap.lookup_cid_code(0x00FF, 2), Some(0xFF));
1031    }
1032
1033    #[test]
1034    fn notdef_char_lookup() {
1035        let cmap = parse_with_preamble(
1036            br#"
10372 beginnotdefchar
1038<03> 10
1039<20> 20
1040endnotdefchar
1041"#,
1042        );
1043
1044        assert_eq!(cmap.lookup_cid_code(0x03, 1), Some(10));
1045        assert_eq!(cmap.lookup_cid_code(0x20, 1), Some(20));
1046        assert_eq!(cmap.lookup_cid_code(0x04, 1), None);
1047    }
1048
1049    #[test]
1050    fn notdef_range_lookup() {
1051        let cmap = parse_with_preamble(
1052            br#"
10531 beginnotdefrange
1054<0000> <001F> 100
1055endnotdefrange
1056"#,
1057        );
1058
1059        assert_eq!(cmap.lookup_cid_code(0x0000, 2), Some(100));
1060        assert_eq!(cmap.lookup_cid_code(0x0001, 2), Some(100));
1061        assert_eq!(cmap.lookup_cid_code(0x001F, 2), Some(100));
1062        assert_eq!(cmap.lookup_cid_code(0x0020, 2), None);
1063    }
1064
1065    #[test]
1066    fn bfchar_lookup() {
1067        let cmap = parse_with_preamble(
1068            br#"
10692 beginbfchar
1070<0041> <0048>
1071<0042> <0065>
1072endbfchar
1073"#,
1074        );
1075
1076        assert_eq!(cmap.lookup_bf_string(0x0041), Some(BfString::Char('H')));
1077        assert_eq!(cmap.lookup_bf_string(0x0042), Some(BfString::Char('e')));
1078        assert_eq!(cmap.lookup_bf_string(0x0043), None);
1079    }
1080
1081    #[test]
1082    fn bfchar_ligature() {
1083        let cmap = parse_with_preamble(
1084            br#"
10851 beginbfchar
1086<005F> <00660066>
1087endbfchar
1088"#,
1089        );
1090
1091        assert_eq!(
1092            cmap.lookup_bf_string(0x005F),
1093            Some(BfString::String(String::from("ff")))
1094        );
1095    }
1096
1097    #[test]
1098    fn bfchar_surrogate_pair() {
1099        let cmap = parse_with_preamble(
1100            br#"
11011 beginbfchar
1102<3A51> <D840DC3E>
1103endbfchar
1104"#,
1105        );
1106
1107        assert_eq!(
1108            cmap.lookup_bf_string(0x3A51),
1109            Some(BfString::Char('\u{2003E}'))
1110        );
1111    }
1112
1113    #[test]
1114    fn bfrange_incrementing() {
1115        let cmap = parse_with_preamble(
1116            br#"
11171 beginbfrange
1118<0000> <0004> <0041>
1119endbfrange
1120"#,
1121        );
1122
1123        assert_eq!(cmap.lookup_bf_string(0x0000), Some(BfString::Char('A')));
1124        assert_eq!(cmap.lookup_bf_string(0x0001), Some(BfString::Char('B')));
1125        assert_eq!(cmap.lookup_bf_string(0x0004), Some(BfString::Char('E')));
1126        assert_eq!(cmap.lookup_bf_string(0x0005), None);
1127    }
1128
1129    #[test]
1130    fn bfrange_array() {
1131        let cmap = parse_with_preamble(
1132            br#"
11331 beginbfrange
1134<005F> <0061> [<00660066> <00660069> <0066006C>]
1135endbfrange
1136"#,
1137        );
1138
1139        // ff, fi, fl ligatures
1140        assert_eq!(
1141            cmap.lookup_bf_string(0x005F),
1142            Some(BfString::String(String::from("ff")))
1143        );
1144        assert_eq!(
1145            cmap.lookup_bf_string(0x0060),
1146            Some(BfString::String(String::from("fi")))
1147        );
1148        assert_eq!(
1149            cmap.lookup_bf_string(0x0061),
1150            Some(BfString::String(String::from("fl")))
1151        );
1152    }
1153
1154    #[test]
1155    fn unicode_lookup_miss() {
1156        let cmap = parse_with_preamble(
1157            br#"
11581 beginbfchar
1159<0041> <0048>
1160endbfchar
1161"#,
1162        );
1163
1164        assert_eq!(cmap.lookup_bf_string(0x0000), None);
1165        assert_eq!(cmap.lookup_bf_string(0x0042), None);
1166    }
1167
1168    #[test]
1169    fn identity_h() {
1170        let cmap = CMap::identity_h();
1171        assert_eq!(
1172            cmap.metadata().name.as_deref(),
1173            Some(b"Identity-H".as_slice())
1174        );
1175        assert_eq!(cmap.metadata().writing_mode, Some(WritingMode::Horizontal));
1176
1177        assert_eq!(cmap.lookup_cid_code(0x0041, 2), Some(0x0041));
1178        assert_eq!(cmap.lookup_cid_code(0x1234, 2), Some(0x1234));
1179        assert_eq!(cmap.lookup_cid_code(0xFFFF, 2), Some(0xFFFF));
1180
1181        assert_eq!(cmap.lookup_cid_code(0x0041, 1), None);
1182        assert_eq!(cmap.lookup_cid_code(0x0041, 3), None);
1183    }
1184
1185    #[test]
1186    fn identity_v() {
1187        let cmap = CMap::identity_v();
1188        assert_eq!(
1189            cmap.metadata().name.as_deref(),
1190            Some(b"Identity-V".as_slice())
1191        );
1192        assert_eq!(cmap.metadata().writing_mode, Some(WritingMode::Vertical));
1193
1194        assert_eq!(cmap.lookup_cid_code(0x0041, 2), Some(0x0041));
1195        assert_eq!(cmap.lookup_cid_code(0xFFFF, 2), Some(0xFFFF));
1196    }
1197
1198    #[test]
1199    fn minimal_cmap_no_name_no_wmode() {
1200        // Extracted from corpus PDF 0500013.
1201        let data = br#"
1202/CIDInit /ProcSet findresource begin
120312 dict begin
1204begincmap
1205/CMapType 2 def
12061 begincodespacerange
1207<00><ff>
1208endcodespacerange
12091 beginbfrange
1210<01><01><0020>
1211endbfrange
1212endcmap
1213"#;
1214        let cmap = CMap::parse(data, |_| None).unwrap();
1215        assert_eq!(cmap.metadata().name, None);
1216        assert_eq!(cmap.metadata().character_collection, None);
1217        assert_eq!(cmap.metadata().writing_mode, None);
1218        assert_eq!(cmap.lookup_bf_string(0x01), Some(BfString::Char(' ')));
1219    }
1220
1221    #[test]
1222    fn registry_as_name() {
1223        // Extracted from corpus PDF 0875241, uses names instead of strings for
1224        // strings for metadata.
1225        let data = br#"
1226/CIDSystemInfo
1227<< /Registry /ABCDEF+SimSun
1228/Ordering (pdfbeaninc)
1229/Supplement 0
1230>> def
1231/CMapName /ABCDEF+SimSun def
1232/CMapType 2 def
12331 begincodespacerange
1234<0000> <FFFF>
1235endcodespacerange
12361 beginbfrange
1237<0000> <0000> <6881>
1238endbfrange
1239"#;
1240        let cmap = CMap::parse(data, |_| None).unwrap();
1241        let cc = cmap.metadata().character_collection.as_ref().unwrap();
1242        assert_eq!(
1243            cc.family,
1244            CidFamily::Custom {
1245                registry: b"ABCDEF+SimSun".to_vec(),
1246                ordering: b"pdfbeaninc".to_vec(),
1247            }
1248        );
1249        assert_eq!(cc.supplement, 0);
1250        assert_eq!(
1251            cmap.lookup_bf_string(0x0000),
1252            Some(BfString::Char('\u{6881}'))
1253        );
1254    }
1255
1256    #[test]
1257    fn single_byte_bfrange_destination() {
1258        // See for example corpus test case 0625248.
1259        let cmap = parse_with_preamble(
1260            br#"
12611 beginbfrange
1262<00> <7f> <00>
1263endbfrange
1264"#,
1265        );
1266
1267        assert_eq!(
1268            cmap.lookup_bf_string(0x00),
1269            Some(BfString::Char('\u{0000}'))
1270        );
1271        assert_eq!(cmap.lookup_bf_string(0x41), Some(BfString::Char('A')));
1272        assert_eq!(
1273            cmap.lookup_bf_string(0x7f),
1274            Some(BfString::Char('\u{007F}'))
1275        );
1276    }
1277}
1278
/// Tests that load the embedded cmaps through `load_embedded` and parse them
/// with `CMap::parse`. Only built when the `embed-cmaps` feature is enabled.
#[cfg(all(test, feature = "embed-cmaps"))]
mod bcmap_tests {
    use super::*;

    /// Resolver passed to `CMap::parse`; simply forwards to `load_embedded`.
    fn get_embedded_cmap(name: CMapName<'_>) -> Option<&'static [u8]> {
        load_embedded(name)
    }

    #[test]
    fn embedded_h_cmap() {
        let raw = load_embedded(CMapName::H).expect("embedded H cmap not found");
        let cmap = CMap::parse(raw, get_embedded_cmap).expect("failed to parse H cmap");
        let metadata = cmap.metadata();

        assert_eq!(metadata.writing_mode, None);

        let expected_collection = CharacterCollection {
            family: CidFamily::AdobeJapan1,
            supplement: 1,
        };
        assert_eq!(metadata.character_collection, Some(expected_collection));

        for (code, cid) in [(0x2121, 633), (0x2221, 727)] {
            assert_eq!(cmap.lookup_cid_code(code, 2), Some(cid));
        }
    }

    #[test]
    fn embedded_90ms_rksj_h() {
        let raw =
            load_embedded(CMapName::N90msRksjH).expect("embedded 90ms-RKSJ-H cmap not found");
        let cmap = CMap::parse(raw, get_embedded_cmap).expect("failed to parse 90ms-RKSJ-H cmap");

        let expected_collection = CharacterCollection {
            family: CidFamily::AdobeJapan1,
            supplement: 2,
        };
        assert_eq!(
            cmap.metadata().character_collection,
            Some(expected_collection)
        );

        // (code, byte length, expected CID)
        for (code, byte_len, cid) in [(0x20, 1, 231), (0x8140, 2, 633)] {
            assert_eq!(cmap.lookup_cid_code(code, byte_len), Some(cid));
        }
    }

    #[test]
    fn embedded_v_cmap() {
        let raw = load_embedded(CMapName::V).expect("embedded V cmap not found");
        let cmap = CMap::parse(raw, get_embedded_cmap).expect("failed to parse V cmap");
        let metadata = cmap.metadata();

        assert_eq!(metadata.writing_mode, Some(WritingMode::Vertical));

        let expected_collection = CharacterCollection {
            family: CidFamily::AdobeJapan1,
            supplement: 1,
        };
        assert_eq!(metadata.character_collection, Some(expected_collection));

        // Inherited from base "H" via usecmap.
        assert_eq!(cmap.lookup_cid_code(0x2121, 2), Some(633));
    }

    #[test]
    fn embedded_gbk_euc_h() {
        let raw = load_embedded(CMapName::GbkEucH).expect("embedded GBK-EUC-H not found");
        let cmap = CMap::parse(raw, get_embedded_cmap).expect("failed to parse GBK-EUC-H");

        let expected_collection = CharacterCollection {
            family: CidFamily::AdobeGB1,
            supplement: 2,
        };
        assert_eq!(
            cmap.metadata().character_collection,
            Some(expected_collection)
        );

        // (code, byte length, expected CID)
        for (code, byte_len, cid) in [
            (0x20, 1, 7716),
            (0x21, 1, 814),
            (0x7E, 1, 814 + 0x7E - 0x21),
            (0xFE80, 2, 22094),
            (0xFEA0, 2, 22094 + 0xA0 - 0x80),
        ] {
            assert_eq!(cmap.lookup_cid_code(code, byte_len), Some(cid));
        }
    }

    #[test]
    fn embedded_ksc_euc_h() {
        let raw = load_embedded(CMapName::KscEucH).unwrap();
        let cmap = CMap::parse(raw, get_embedded_cmap).unwrap();

        let expected_collection = CharacterCollection {
            family: CidFamily::AdobeKorea1,
            supplement: 0,
        };
        assert_eq!(
            cmap.metadata().character_collection,
            Some(expected_collection)
        );

        // (code, byte length, expected CID)
        for (code, byte_len, cid) in [
            (0x20, 1, 8094),
            (0x41, 1, 8094 + 0x41 - 0x20),
            (0xA1A1, 2, 101),
            (0xA1FE, 2, 101 + 0xFE - 0xA1),
            (0xFDA1, 2, 7962),
        ] {
            assert_eq!(cmap.lookup_cid_code(code, byte_len), Some(cid));
        }
    }

    #[test]
    fn embedded_b5pc_h() {
        let raw = load_embedded(CMapName::B5pcH).unwrap();
        let cmap = CMap::parse(raw, get_embedded_cmap).unwrap();

        let expected_collection = CharacterCollection {
            family: CidFamily::AdobeCNS1,
            supplement: 0,
        };
        assert_eq!(
            cmap.metadata().character_collection,
            Some(expected_collection)
        );

        // (code, byte length, expected CID)
        for (code, byte_len, cid) in [
            (0x20, 1, 1),
            (0x41, 1, 1 + 0x41 - 0x20),
            (0x80, 1, 61),
            (0xD140, 2, 7251),
            (0xD17E, 2, 7251 + 0x7E - 0x40),
            (0xF9D5, 2, 13642 + 3),
        ] {
            assert_eq!(cmap.lookup_cid_code(code, byte_len), Some(cid));
        }
    }

    #[test]
    fn embedded_unijis_utf16_h() {
        let raw = load_embedded(CMapName::UniJisUtf16H).unwrap();
        let cmap = CMap::parse(raw, get_embedded_cmap).unwrap();

        let expected_collection = CharacterCollection {
            family: CidFamily::AdobeJapan1,
            supplement: 7,
        };
        assert_eq!(
            cmap.metadata().character_collection,
            Some(expected_collection)
        );

        // (code, byte length, expected CID); the last probe is a four-byte code.
        for (code, byte_len, cid) in [
            (0x20, 2, 1),
            (0x41, 2, 1 + 0x41 - 0x20),
            (0x5C, 2, 97),
            (0x5BCC, 2, 3531),
            (0xD884DF50, 4, 19130),
        ] {
            assert_eq!(cmap.lookup_cid_code(code, byte_len), Some(cid));
        }
    }

    #[test]
    fn embedded_identity_h() {
        let raw = load_embedded(CMapName::IdentityH).expect("embedded Identity-H not found");
        let cmap = CMap::parse(raw, get_embedded_cmap).expect("failed to parse Identity-H cmap");
        let metadata = cmap.metadata();

        assert_eq!(metadata.writing_mode, None);

        let expected_collection = CharacterCollection {
            family: CidFamily::AdobeIdentity,
            supplement: 0,
        };
        assert_eq!(metadata.character_collection, Some(expected_collection));

        for (code, cid) in [(0x0000, 0), (0x0041, 0x41), (0xFFFF, 0xFFFF)] {
            assert_eq!(cmap.lookup_cid_code(code, 2), Some(cid));
        }
    }

    #[test]
    fn embedded_identity_v() {
        let raw = load_embedded(CMapName::IdentityV).expect("embedded Identity-V not found");
        let cmap = CMap::parse(raw, get_embedded_cmap).expect("failed to parse Identity-V cmap");
        let metadata = cmap.metadata();

        assert_eq!(metadata.writing_mode, Some(WritingMode::Vertical));

        let expected_collection = CharacterCollection {
            family: CidFamily::AdobeIdentity,
            supplement: 0,
        };
        assert_eq!(metadata.character_collection, Some(expected_collection));

        for (code, cid) in [(0x0000, 0), (0x0041, 0x41), (0xFFFF, 0xFFFF)] {
            assert_eq!(cmap.lookup_cid_code(code, 2), Some(cid));
        }
    }

    // Note: some of these tests are AI-generated and have not been
    // double-checked.

    #[test]
    fn embedded_adobe_gb1_ucs2() {
        let raw = load_embedded(CMapName::AdobeGb1Ucs2).unwrap();
        let cmap = CMap::parse(raw, get_embedded_cmap).unwrap();

        let expected_collection = CharacterCollection {
            family: CidFamily::Custom {
                registry: b"Adobe".to_vec(),
                ordering: b"Adobe_GB1_UCS2".to_vec(),
            },
            supplement: 5,
        };
        assert_eq!(
            cmap.metadata().character_collection,
            Some(expected_collection)
        );

        // (code, expected Unicode mapping)
        for (code, expected) in [
            (0x0063, BfString::Char('\u{00B7}')),
            (0x0001, BfString::Char(' ')),
            (0x0022, BfString::Char('A')),
            (0x005F, BfString::Char('~')),
            (
                0x560F,
                BfString::String(String::from("\u{90CE}\u{FE00}")),
            ),
        ] {
            assert_eq!(cmap.lookup_bf_string(code), Some(expected));
        }
    }

    #[test]
    fn embedded_adobe_japan1_ucs2() {
        let raw = load_embedded(CMapName::AdobeJapan1Ucs2).unwrap();
        let cmap = CMap::parse(raw, get_embedded_cmap).unwrap();

        let expected_collection = CharacterCollection {
            family: CidFamily::Custom {
                registry: b"Adobe".to_vec(),
                ordering: b"Adobe_Japan1_UCS2".to_vec(),
            },
            supplement: 7,
        };
        assert_eq!(
            cmap.metadata().character_collection,
            Some(expected_collection)
        );

        // (code, expected Unicode mapping)
        for (code, expected) in [
            (0x003D, BfString::Char('\u{00A5}')),
            (0x0001, BfString::Char(' ')),
            (0x003C, BfString::Char('[')),
            (
                0x0476,
                BfString::String(String::from("\u{82A6}\u{E0100}")),
            ),
            (0x265B, BfString::String(String::from("10/11"))),
            (
                0x2E6B,
                BfString::String(String::from("オングストローム")),
            ),
        ] {
            assert_eq!(cmap.lookup_bf_string(code), Some(expected));
        }
    }

    #[test]
    fn embedded_adobe_korea1_ucs2() {
        let raw = load_embedded(CMapName::AdobeKorea1Ucs2).unwrap();
        let cmap = CMap::parse(raw, get_embedded_cmap).unwrap();

        let expected_collection = CharacterCollection {
            family: CidFamily::Custom {
                registry: b"Adobe".to_vec(),
                ordering: b"Adobe_Korea1_UCS2".to_vec(),
            },
            supplement: 2,
        };
        assert_eq!(
            cmap.metadata().character_collection,
            Some(expected_collection)
        );

        // (code, expected Unicode mapping)
        for (code, expected) in [
            (0x0060, BfString::Char('\u{20A9}')),
            (0x2072, BfString::String(String::from("[10]"))),
            (0x2073, BfString::String(String::from("[11]"))),
        ] {
            assert_eq!(cmap.lookup_bf_string(code), Some(expected));
        }
    }

    #[test]
    fn embedded_adobe_cns1_ucs2() {
        let raw = load_embedded(CMapName::AdobeCns1Ucs2).unwrap();
        let cmap = CMap::parse(raw, get_embedded_cmap).unwrap();

        let expected_collection = CharacterCollection {
            family: CidFamily::Custom {
                registry: b"Adobe".to_vec(),
                ordering: b"Adobe_CNS1_UCS2".to_vec(),
            },
            supplement: 7,
        };
        assert_eq!(
            cmap.metadata().character_collection,
            Some(expected_collection)
        );

        // (code, expected Unicode character)
        for (code, ch) in [
            (0x0060, '\u{00A9}'),
            (0x0001, ' '),
            (0x005F, '~'),
            (0x36B0, '\u{200CC}'),
        ] {
            assert_eq!(cmap.lookup_bf_string(code), Some(BfString::Char(ch)));
        }
    }

    #[test]
    fn embedded_eten_b5_h() {
        let raw = load_embedded(CMapName::ETenB5H).unwrap();
        let cmap = CMap::parse(raw, get_embedded_cmap).unwrap();

        let expected_collection = CharacterCollection {
            family: CidFamily::AdobeCNS1,
            supplement: 0,
        };
        assert_eq!(
            cmap.metadata().character_collection,
            Some(expected_collection)
        );

        // (code, byte length, expected CID)
        for (code, byte_len, cid) in [
            (0x20, 1, 13648),
            (0x7E, 1, 13648 + 0x7E - 0x20),
            (0xA140, 2, 99),
            (0xA158, 2, 99 + 0x58 - 0x40),
            (0xD040, 2, 7094),
            (0xF9FE, 2, 14056 + 0xFE - 0xD6),
        ] {
            assert_eq!(cmap.lookup_cid_code(code, byte_len), Some(cid));
        }
    }
}