pdfplumber_parse/
cmap.rs

1//! ToUnicode CMap parser for mapping character codes to Unicode strings.
2//!
3//! Parses CMap data embedded in PDF `/ToUnicode` streams to convert glyph codes
4//! to Unicode text. Supports `beginbfchar`/`endbfchar` (single mappings) and
5//! `beginbfrange`/`endbfrange` (range mappings) with UTF-16BE encoded values.
6
7use std::collections::HashMap;
8
9use crate::error::BackendError;
10
11/// A parsed ToUnicode CMap that maps character codes to Unicode strings.
12///
13/// Character codes are typically 1 or 2 bytes from the PDF font encoding.
14/// Unicode values may be single characters or multi-character strings
15/// (e.g., ligatures like "fi" → "fi").
16#[derive(Debug, Clone)]
17pub struct CMap {
18    /// Mapping from character code to Unicode string.
19    mappings: HashMap<u32, String>,
20}
21
22impl CMap {
23    /// Parse a ToUnicode CMap from its raw byte content.
24    ///
25    /// Extracts `beginbfchar`/`endbfchar` and `beginbfrange`/`endbfrange`
26    /// sections to build the character code → Unicode mapping table.
27    pub fn parse(data: &[u8]) -> Result<Self, BackendError> {
28        let text = String::from_utf8_lossy(data);
29        let mut mappings = HashMap::new();
30
31        // Parse all beginbfchar...endbfchar sections
32        let mut search_from = 0;
33        while let Some(start) = text[search_from..].find("beginbfchar") {
34            let section_start = search_from + start + "beginbfchar".len();
35            if let Some(end) = text[section_start..].find("endbfchar") {
36                let section = &text[section_start..section_start + end];
37                parse_bfchar_section(section, &mut mappings)?;
38                search_from = section_start + end + "endbfchar".len();
39            } else {
40                break;
41            }
42        }
43
44        // Parse all beginbfrange...endbfrange sections
45        search_from = 0;
46        while let Some(start) = text[search_from..].find("beginbfrange") {
47            let section_start = search_from + start + "beginbfrange".len();
48            if let Some(end) = text[section_start..].find("endbfrange") {
49                let section = &text[section_start..section_start + end];
50                parse_bfrange_section(section, &mut mappings)?;
51                search_from = section_start + end + "endbfrange".len();
52            } else {
53                break;
54            }
55        }
56
57        Ok(CMap { mappings })
58    }
59
60    /// Look up the Unicode string for a character code.
61    ///
62    /// Returns `None` if the code has no mapping in this CMap.
63    pub fn lookup(&self, code: u32) -> Option<&str> {
64        self.mappings.get(&code).map(|s| s.as_str())
65    }
66
67    /// Look up the Unicode string for a character code, with fallback.
68    ///
69    /// If no mapping is found, returns U+FFFD (REPLACEMENT CHARACTER).
70    pub fn lookup_or_replacement(&self, code: u32) -> String {
71        self.lookup(code)
72            .map(|s| s.to_string())
73            .unwrap_or_else(|| "\u{FFFD}".to_string())
74    }
75
76    /// Returns the number of mappings in this CMap.
77    pub fn len(&self) -> usize {
78        self.mappings.len()
79    }
80
81    /// Returns true if this CMap has no mappings.
82    pub fn is_empty(&self) -> bool {
83        self.mappings.is_empty()
84    }
85}
86
87/// A parsed CID CMap that maps character codes to CIDs.
88///
89/// Used by predefined CMaps (e.g., Adobe-Japan1) and embedded CID CMaps
90/// that use `begincidchar`/`endcidchar` and `begincidrange`/`endcidrange`
91/// sections. Unlike [`CMap`] which maps to Unicode strings, this maps
92/// character codes to numeric CID values.
93#[derive(Debug, Clone)]
94pub struct CidCMap {
95    /// Mapping from character code to CID.
96    cid_mappings: HashMap<u32, u32>,
97    /// CMap name (e.g., "Adobe-Japan1-6").
98    name: Option<String>,
99    /// Writing mode: 0 = horizontal, 1 = vertical.
100    writing_mode: u8,
101}
102
103impl CidCMap {
104    /// Parse a CID CMap from its raw byte content.
105    ///
106    /// Extracts `begincidchar`/`endcidchar` and `begincidrange`/`endcidrange`
107    /// sections to build the character code → CID mapping table.
108    pub fn parse(data: &[u8]) -> Result<Self, BackendError> {
109        let text = String::from_utf8_lossy(data);
110        let mut cid_mappings = HashMap::new();
111
112        // Parse CMap name
113        let name = parse_cmap_name(&text);
114
115        // Parse writing mode (/WMode)
116        let writing_mode = parse_writing_mode(&text);
117
118        // Parse all begincidchar...endcidchar sections
119        let mut search_from = 0;
120        while let Some(start) = text[search_from..].find("begincidchar") {
121            let section_start = search_from + start + "begincidchar".len();
122            if let Some(end) = text[section_start..].find("endcidchar") {
123                let section = &text[section_start..section_start + end];
124                parse_cidchar_section(section, &mut cid_mappings)?;
125                search_from = section_start + end + "endcidchar".len();
126            } else {
127                break;
128            }
129        }
130
131        // Parse all begincidrange...endcidrange sections
132        search_from = 0;
133        while let Some(start) = text[search_from..].find("begincidrange") {
134            let section_start = search_from + start + "begincidrange".len();
135            if let Some(end) = text[section_start..].find("endcidrange") {
136                let section = &text[section_start..section_start + end];
137                parse_cidrange_section(section, &mut cid_mappings)?;
138                search_from = section_start + end + "endcidrange".len();
139            } else {
140                break;
141            }
142        }
143
144        Ok(CidCMap {
145            cid_mappings,
146            name,
147            writing_mode,
148        })
149    }
150
151    /// Look up the CID for a character code.
152    pub fn lookup(&self, code: u32) -> Option<u32> {
153        self.cid_mappings.get(&code).copied()
154    }
155
156    /// Returns the number of mappings in this CID CMap.
157    pub fn len(&self) -> usize {
158        self.cid_mappings.len()
159    }
160
161    /// Returns true if this CID CMap has no mappings.
162    pub fn is_empty(&self) -> bool {
163        self.cid_mappings.is_empty()
164    }
165
166    /// CMap name.
167    pub fn name(&self) -> Option<&str> {
168        self.name.as_deref()
169    }
170
171    /// Writing mode: 0 = horizontal, 1 = vertical.
172    pub fn writing_mode(&self) -> u8 {
173        self.writing_mode
174    }
175}
176
177/// Parse a begincidchar...endcidchar section.
178///
179/// Each line has format: `<srcCode> CID`
180fn parse_cidchar_section(
181    section: &str,
182    mappings: &mut HashMap<u32, u32>,
183) -> Result<(), BackendError> {
184    for line in section.lines() {
185        let trimmed = line.trim();
186        if trimmed.is_empty() || !trimmed.contains('<') {
187            continue;
188        }
189
190        let tokens = extract_hex_tokens(trimmed);
191        if tokens.is_empty() {
192            continue;
193        }
194        let src_code = parse_hex_code(tokens[0])?;
195
196        // CID is a decimal number after the hex token
197        let after_hex = trimmed
198            .rfind('>')
199            .map(|pos| &trimmed[pos + 1..])
200            .unwrap_or("");
201        if let Ok(cid) = after_hex.trim().parse::<u32>() {
202            mappings.insert(src_code, cid);
203        }
204    }
205    Ok(())
206}
207
208/// Parse a begincidrange...endcidrange section.
209///
210/// Each line has format: `<srcLow> <srcHigh> CID_start`
211fn parse_cidrange_section(
212    section: &str,
213    mappings: &mut HashMap<u32, u32>,
214) -> Result<(), BackendError> {
215    for line in section.lines() {
216        let trimmed = line.trim();
217        if trimmed.is_empty() || !trimmed.contains('<') {
218            continue;
219        }
220
221        let tokens = extract_hex_tokens(trimmed);
222        if tokens.len() < 2 {
223            continue;
224        }
225        let src_low = parse_hex_code(tokens[0])?;
226        let src_high = parse_hex_code(tokens[1])?;
227
228        // CID start is a decimal number after the last hex token
229        let after_last_hex = trimmed
230            .rfind('>')
231            .map(|pos| &trimmed[pos + 1..])
232            .unwrap_or("");
233        if let Ok(cid_start) = after_last_hex.trim().parse::<u32>() {
234            for offset in 0..=(src_high.saturating_sub(src_low)) {
235                mappings.insert(src_low + offset, cid_start + offset);
236            }
237        }
238    }
239    Ok(())
240}
241
/// Extract the CMap name from a `/CMapName /SomeName def` entry.
///
/// Uses the first occurrence of `/CMapName`. The value must be a name object
/// (it must start with `/` after optional whitespace); the returned name runs
/// up to the next whitespace character or the end of the text. Returns `None`
/// when the key is missing or its value is not a name object.
fn parse_cmap_name(text: &str) -> Option<String> {
    let (_, after_key) = text.split_once("/CMapName")?;
    let value = after_key.trim_start().strip_prefix('/')?;
    // `split` always yields at least one (possibly empty) piece.
    value.split(char::is_whitespace).next().map(str::to_string)
}
255
/// Extract the writing mode from a `/WMode N def` entry.
///
/// Uses the first occurrence of `/WMode` and looks only at the first
/// non-whitespace character after it: `1` means vertical (returns 1); anything
/// else, including a missing key, means horizontal (returns 0).
fn parse_writing_mode(text: &str) -> u8 {
    let first_value_char = text
        .split_once("/WMode")
        .and_then(|(_, tail)| tail.trim_start().chars().next());

    match first_value_char {
        Some('1') => 1,
        _ => 0, // default horizontal
    }
}
270
271/// Parse a hex string like "0041" into a u32 character code.
272fn parse_hex_code(hex: &str) -> Result<u32, BackendError> {
273    u32::from_str_radix(hex, 16)
274        .map_err(|e| BackendError::Parse(format!("invalid hex code '{hex}': {e}")))
275}
276
277/// Decode a hex string as UTF-16BE bytes into a Unicode string.
278///
279/// The hex string represents UTF-16BE encoded code units. For BMP characters,
280/// this is a single 2-byte value. For supplementary characters, this is a
281/// surrogate pair (4 bytes). For multi-character mappings (ligatures), this
282/// can be multiple 2-byte values.
283fn decode_utf16be_hex(hex: &str) -> Result<String, BackendError> {
284    if hex.len() % 4 != 0 {
285        // Pad to even number of hex digits (groups of 4 for UTF-16 code units)
286        // For 2-digit hex like "41", treat as single-byte padded to "0041"
287        if hex.len() == 2 {
288            let padded = format!("00{hex}");
289            return decode_utf16be_hex(&padded);
290        }
291        return Err(BackendError::Parse(format!(
292            "UTF-16BE hex string must have length divisible by 4, got '{hex}' (len={})",
293            hex.len()
294        )));
295    }
296
297    // Parse hex string into u16 code units
298    let mut code_units = Vec::with_capacity(hex.len() / 4);
299    for chunk in hex.as_bytes().chunks(4) {
300        let chunk_str = std::str::from_utf8(chunk)
301            .map_err(|e| BackendError::Parse(format!("invalid UTF-8 in hex: {e}")))?;
302        let unit = u16::from_str_radix(chunk_str, 16).map_err(|e| {
303            BackendError::Parse(format!("invalid hex in UTF-16BE '{chunk_str}': {e}"))
304        })?;
305        code_units.push(unit);
306    }
307
308    // Decode UTF-16BE code units to String
309    String::from_utf16(&code_units)
310        .map_err(|e| BackendError::Parse(format!("invalid UTF-16BE sequence: {e}")))
311}
312
/// Collect the contents of every `<…>` token in `text`, in order of appearance.
///
/// A token runs from a `<` to the first `>` after it; the angle brackets are
/// not part of the returned slices. Scanning stops at a `<` that has no
/// closing `>`.
fn extract_hex_tokens(text: &str) -> Vec<&str> {
    let mut tokens = Vec::new();
    let mut remaining = text;
    while let Some((_, after_open)) = remaining.split_once('<') {
        match after_open.split_once('>') {
            Some((token, after_close)) => {
                tokens.push(token);
                remaining = after_close;
            }
            None => break,
        }
    }
    tokens
}
328
329/// Parse a beginbfchar...endbfchar section.
330///
331/// Each line has format: `<srcCode> <dstUnicode>`
332fn parse_bfchar_section(
333    section: &str,
334    mappings: &mut HashMap<u32, String>,
335) -> Result<(), BackendError> {
336    for line in section.lines() {
337        let trimmed = line.trim();
338        if trimmed.is_empty() || !trimmed.contains('<') {
339            continue;
340        }
341
342        let tokens = extract_hex_tokens(trimmed);
343        if tokens.len() >= 2 {
344            let src_code = parse_hex_code(tokens[0])?;
345            let unicode_str = decode_utf16be_hex(tokens[1])?;
346            mappings.insert(src_code, unicode_str);
347        }
348    }
349    Ok(())
350}
351
352/// Parse a beginbfrange...endbfrange section.
353///
354/// Each line has format: `<srcLow> <srcHigh> <dstStart>`
355/// or: `<srcLow> <srcHigh> [<str1> <str2> ...]`
356fn parse_bfrange_section(
357    section: &str,
358    mappings: &mut HashMap<u32, String>,
359) -> Result<(), BackendError> {
360    for line in section.lines() {
361        let trimmed = line.trim();
362        if trimmed.is_empty() || !trimmed.contains('<') {
363            continue;
364        }
365
366        // Check if destination is an array: [<hex> <hex> ...]
367        if let Some(bracket_start) = trimmed.find('[') {
368            // Array form: <srcLow> <srcHigh> [<str1> <str2> ...]
369            let before_bracket = &trimmed[..bracket_start];
370            let src_tokens = extract_hex_tokens(before_bracket);
371            if src_tokens.len() < 2 {
372                continue;
373            }
374            let src_low = parse_hex_code(src_tokens[0])?;
375            let src_high = parse_hex_code(src_tokens[1])?;
376
377            // Extract hex tokens from inside the brackets
378            let bracket_end = trimmed.rfind(']').unwrap_or(trimmed.len());
379            let array_content = &trimmed[bracket_start + 1..bracket_end];
380            let dst_tokens = extract_hex_tokens(array_content);
381
382            for (i, dst_hex) in dst_tokens.iter().enumerate() {
383                let code = src_low + i as u32;
384                if code > src_high {
385                    break;
386                }
387                let unicode_str = decode_utf16be_hex(dst_hex)?;
388                mappings.insert(code, unicode_str);
389            }
390        } else {
391            // Standard form: <srcLow> <srcHigh> <dstStart>
392            let tokens = extract_hex_tokens(trimmed);
393            if tokens.len() < 3 {
394                continue;
395            }
396            let src_low = parse_hex_code(tokens[0])?;
397            let src_high = parse_hex_code(tokens[1])?;
398            let dst_start = parse_hex_code(tokens[2])?;
399
400            for offset in 0..=(src_high - src_low) {
401                let code = src_low + offset;
402                let unicode_cp = dst_start + offset;
403                if let Some(ch) = char::from_u32(unicode_cp) {
404                    mappings.insert(code, ch.to_string());
405                }
406            }
407        }
408    }
409    Ok(())
410}
411
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `src` as a ToUnicode CMap, panicking if parsing fails.
    fn to_unicode(src: &str) -> CMap {
        CMap::parse(src.as_bytes()).unwrap()
    }

    /// Parse `src` as a CID CMap, panicking if parsing fails.
    fn to_cid(src: &str) -> CidCMap {
        CidCMap::parse(src.as_bytes()).unwrap()
    }

    /// Assert that every `(code, text)` pair is present in `cmap`.
    fn assert_unicode(cmap: &CMap, expected: &[(u32, &str)]) {
        for &(code, text) in expected {
            assert_eq!(cmap.lookup(code), Some(text));
        }
    }

    /// Assert that every `(code, cid)` pair is present in `cmap`.
    fn assert_cids(cmap: &CidCMap, expected: &[(u32, u32)]) {
        for &(code, cid) in expected {
            assert_eq!(cmap.lookup(code), Some(cid));
        }
    }

    /// Minimal CMap with a single bfchar entry mapping 0x0041 to "A".
    const SINGLE_A: &str = "beginbfchar\n<0041> <0041>\nendbfchar\n";

    /// Minimal CID CMap with a single cidchar entry mapping 0x0041 to CID 100.
    const SINGLE_CID: &str = "begincidchar\n<0041> 100\nendcidchar\n";

    // --- CMap construction and basic lookup ---

    #[test]
    fn empty_cmap_returns_none() {
        let cmap = to_unicode("");
        assert!(cmap.is_empty());
        assert_eq!(cmap.len(), 0);
        assert_eq!(cmap.lookup(0x0041), None);
    }

    #[test]
    fn lookup_or_replacement_returns_fffd_for_missing() {
        assert_eq!(to_unicode("").lookup_or_replacement(0x0041), "\u{FFFD}");
    }

    // --- beginbfchar / endbfchar ---

    #[test]
    fn bfchar_single_mapping() {
        assert_unicode(&to_unicode(SINGLE_A), &[(0x0041, "A")]);
    }

    #[test]
    fn bfchar_multiple_mappings() {
        let cmap = to_unicode(concat!(
            "beginbfchar\n",
            "<0041> <0041>\n",
            "<0042> <0042>\n",
            "<0043> <0043>\n",
            "endbfchar\n",
        ));
        assert_unicode(&cmap, &[(0x0041, "A"), (0x0042, "B"), (0x0043, "C")]);
        assert_eq!(cmap.len(), 3);
    }

    #[test]
    fn bfchar_single_byte_source_code() {
        // The source code is written with a single byte.
        let cmap = to_unicode("beginbfchar\n<41> <0041>\nendbfchar\n");
        assert_unicode(&cmap, &[(0x41, "A")]);
    }

    #[test]
    fn bfchar_remapped_codes() {
        // Codes that differ from their Unicode values: 0x01 → 'A', 0x02 → 'B'.
        let cmap = to_unicode("beginbfchar\n<01> <0041>\n<02> <0042>\nendbfchar\n");
        assert_unicode(&cmap, &[(0x01, "A"), (0x02, "B")]);
    }

    #[test]
    fn bfchar_multi_char_unicode_ligature() {
        // The ﬁ ligature code expands to the two characters "fi".
        let cmap = to_unicode("beginbfchar\n<FB01> <00660069>\nendbfchar\n");
        assert_unicode(&cmap, &[(0xFB01, "fi")]);
    }

    #[test]
    fn bfchar_non_bmp_character() {
        // U+1F600 is written as the UTF-16BE surrogate pair D83D DE00.
        let cmap = to_unicode("beginbfchar\n<0001> <D83DDE00>\nendbfchar\n");
        assert_unicode(&cmap, &[(0x0001, "\u{1F600}")]);
    }

    #[test]
    fn bfchar_with_surrounding_cmap_boilerplate() {
        let cmap = to_unicode(concat!(
            "/CIDInit /ProcSet findresource begin\n",
            "12 dict begin\n",
            "begincmap\n",
            "/CIDSystemInfo << /Registry (Adobe) /Ordering (UCS) /Supplement 0 >> def\n",
            "/CMapName /Adobe-Identity-UCS def\n",
            "/CMapType 2 def\n",
            "1 begincodespacerange\n",
            "<0000> <FFFF>\n",
            "endcodespacerange\n",
            "2 beginbfchar\n",
            "<0041> <0041>\n",
            "<0042> <0042>\n",
            "endbfchar\n",
            "endcmap\n",
            "CMapName currentdict /CMap defineresource pop\n",
            "end\n",
            "end\n",
        ));
        assert_unicode(&cmap, &[(0x0041, "A"), (0x0042, "B")]);
        assert_eq!(cmap.len(), 2);
    }

    // --- beginbfrange / endbfrange ---

    #[test]
    fn bfrange_simple_range() {
        let cmap = to_unicode("beginbfrange\n<0041> <0043> <0041>\nendbfrange\n");
        assert_unicode(&cmap, &[(0x0041, "A"), (0x0042, "B"), (0x0043, "C")]);
        assert_eq!(cmap.len(), 3);
    }

    #[test]
    fn bfrange_offset_mapping() {
        // Source codes 0x01-0x03 map to U+0041-U+0043.
        let cmap = to_unicode("beginbfrange\n<01> <03> <0041>\nendbfrange\n");
        assert_unicode(&cmap, &[(0x01, "A"), (0x02, "B"), (0x03, "C")]);
    }

    #[test]
    fn bfrange_single_code_range() {
        // low == high, so the range holds exactly one mapping (to U+0061 = 'a').
        let cmap = to_unicode("beginbfrange\n<0041> <0041> <0061>\nendbfrange\n");
        assert_unicode(&cmap, &[(0x0041, "a")]);
        assert_eq!(cmap.len(), 1);
    }

    #[test]
    fn bfrange_multiple_ranges() {
        let cmap = to_unicode(concat!(
            "beginbfrange\n",
            "<0041> <0043> <0041>\n",
            "<0061> <0063> <0061>\n",
            "endbfrange\n",
        ));
        assert_unicode(
            &cmap,
            &[(0x0041, "A"), (0x0043, "C"), (0x0061, "a"), (0x0063, "c")],
        );
        assert_eq!(cmap.len(), 6);
    }

    #[test]
    fn bfrange_with_array_destination() {
        // Each code in the range gets its own entry from the array.
        let cmap = to_unicode("beginbfrange\n<0041> <0043> [<0058> <0059> <005A>]\nendbfrange\n");
        assert_unicode(&cmap, &[(0x0041, "X"), (0x0042, "Y"), (0x0043, "Z")]);
    }

    // --- Combined bfchar + bfrange ---

    #[test]
    fn combined_bfchar_and_bfrange() {
        let cmap = to_unicode(concat!(
            "2 beginbfchar\n",
            "<0001> <0041>\n",
            "<0002> <0042>\n",
            "endbfchar\n",
            "1 beginbfrange\n",
            "<0003> <0005> <0043>\n",
            "endbfrange\n",
        ));
        assert_unicode(
            &cmap,
            &[
                (0x0001, "A"),
                (0x0002, "B"),
                (0x0003, "C"),
                (0x0004, "D"),
                (0x0005, "E"),
            ],
        );
        assert_eq!(cmap.len(), 5);
    }

    // --- Multiple bfchar/bfrange sections ---

    #[test]
    fn multiple_bfchar_sections() {
        let cmap = to_unicode(concat!(
            "1 beginbfchar\n",
            "<0041> <0041>\n",
            "endbfchar\n",
            "1 beginbfchar\n",
            "<0042> <0042>\n",
            "endbfchar\n",
        ));
        assert_unicode(&cmap, &[(0x0041, "A"), (0x0042, "B")]);
        assert_eq!(cmap.len(), 2);
    }

    // --- UTF-16BE encoding ---

    #[test]
    fn utf16be_basic_latin() {
        // ASCII 'A' is 0x0041 in UTF-16BE.
        let cmap = to_unicode("beginbfchar\n<41> <0041>\nendbfchar\n");
        assert_unicode(&cmap, &[(0x41, "A")]);
    }

    #[test]
    fn utf16be_cjk_character() {
        // U+4E2D (中) is the single code unit 4E2D in UTF-16BE.
        let cmap = to_unicode("beginbfchar\n<01> <4E2D>\nendbfchar\n");
        assert_unicode(&cmap, &[(0x01, "中")]);
    }

    #[test]
    fn utf16be_surrogate_pair() {
        // U+10400 is the surrogate pair D801 DC00 in UTF-16BE.
        let cmap = to_unicode("beginbfchar\n<01> <D801DC00>\nendbfchar\n");
        assert_unicode(&cmap, &[(0x01, "\u{10400}")]);
    }

    // --- Edge cases ---

    #[test]
    fn whitespace_variations() {
        // Tabs around and between the tokens.
        let cmap = to_unicode("beginbfchar\n\t<0041>\t<0041>\t\nendbfchar\n");
        assert_unicode(&cmap, &[(0x0041, "A")]);
    }

    #[test]
    fn crlf_line_endings() {
        let cmap = to_unicode("beginbfchar\r\n<0041> <0041>\r\nendbfchar\r\n");
        assert_unicode(&cmap, &[(0x0041, "A")]);
    }

    #[test]
    fn missing_mapping_returns_none() {
        assert_eq!(to_unicode(SINGLE_A).lookup(0x9999), None);
    }

    #[test]
    fn lookup_or_replacement_with_valid_mapping() {
        assert_eq!(to_unicode(SINGLE_A).lookup_or_replacement(0x0041), "A");
    }

    #[test]
    fn lookup_or_replacement_with_missing_mapping() {
        assert_eq!(
            to_unicode(SINGLE_A).lookup_or_replacement(0x9999),
            "\u{FFFD}"
        );
    }

    // --- CidCMap tests ---

    #[test]
    fn cid_cmap_empty() {
        let cmap = to_cid("");
        assert!(cmap.is_empty());
        assert_eq!(cmap.len(), 0);
        assert_eq!(cmap.lookup(0), None);
    }

    #[test]
    fn cid_cmap_cidchar_single() {
        assert_cids(&to_cid(SINGLE_CID), &[(0x0041, 100)]);
    }

    #[test]
    fn cid_cmap_cidchar_multiple() {
        let cmap = to_cid(concat!(
            "begincidchar\n",
            "<0041> 100\n",
            "<0042> 101\n",
            "<0043> 102\n",
            "endcidchar\n",
        ));
        assert_cids(&cmap, &[(0x0041, 100), (0x0042, 101), (0x0043, 102)]);
        assert_eq!(cmap.len(), 3);
    }

    #[test]
    fn cid_cmap_cidrange_simple() {
        let cmap = to_cid("begincidrange\n<0041> <0043> 100\nendcidrange\n");
        assert_cids(&cmap, &[(0x0041, 100), (0x0042, 101), (0x0043, 102)]);
        assert_eq!(cmap.len(), 3);
    }

    #[test]
    fn cid_cmap_cidrange_single_code() {
        let cmap = to_cid("begincidrange\n<0041> <0041> 50\nendcidrange\n");
        assert_cids(&cmap, &[(0x0041, 50)]);
        assert_eq!(cmap.len(), 1);
    }

    #[test]
    fn cid_cmap_combined_cidchar_and_cidrange() {
        let cmap = to_cid(concat!(
            "1 begincidchar\n",
            "<0001> 1\n",
            "endcidchar\n",
            "1 begincidrange\n",
            "<0010> <0012> 100\n",
            "endcidrange\n",
        ));
        assert_cids(
            &cmap,
            &[(0x0001, 1), (0x0010, 100), (0x0011, 101), (0x0012, 102)],
        );
        assert_eq!(cmap.len(), 4);
    }

    #[test]
    fn cid_cmap_parses_name() {
        let cmap = to_cid(concat!(
            "/CMapName /Adobe-Japan1-6 def\n",
            "begincidrange\n",
            "<0041> <0043> 100\n",
            "endcidrange\n",
        ));
        assert_eq!(cmap.name(), Some("Adobe-Japan1-6"));
    }

    #[test]
    fn cid_cmap_parses_writing_mode_horizontal() {
        let cmap = to_cid("/WMode 0 def\nbegincidchar\n<0041> 1\nendcidchar\n");
        assert_eq!(cmap.writing_mode(), 0);
    }

    #[test]
    fn cid_cmap_parses_writing_mode_vertical() {
        let cmap = to_cid("/WMode 1 def\nbegincidchar\n<0041> 1\nendcidchar\n");
        assert_eq!(cmap.writing_mode(), 1);
    }

    #[test]
    fn cid_cmap_default_writing_mode_horizontal() {
        let cmap = to_cid("begincidchar\n<0041> 1\nendcidchar\n");
        assert_eq!(cmap.writing_mode(), 0);
    }

    #[test]
    fn cid_cmap_with_full_boilerplate() {
        let cmap = to_cid(concat!(
            "/CIDInit /ProcSet findresource begin\n",
            "12 dict begin\n",
            "begincmap\n",
            "/CIDSystemInfo << /Registry (Adobe) /Ordering (Japan1) /Supplement 6 >> def\n",
            "/CMapName /Adobe-Japan1-6 def\n",
            "/CMapType 1 def\n",
            "/WMode 0 def\n",
            "1 begincodespacerange\n",
            "<0000> <FFFF>\n",
            "endcodespacerange\n",
            "2 begincidchar\n",
            "<0041> 100\n",
            "<0042> 101\n",
            "endcidchar\n",
            "1 begincidrange\n",
            "<0100> <010F> 200\n",
            "endcidrange\n",
            "endcmap\n",
        ));
        assert_eq!(cmap.name(), Some("Adobe-Japan1-6"));
        assert_eq!(cmap.writing_mode(), 0);
        assert_cids(
            &cmap,
            &[
                (0x0041, 100),
                (0x0042, 101),
                (0x0100, 200),
                (0x010F, 215), // 200 + 15
            ],
        );
        assert_eq!(cmap.len(), 18); // 2 cidchar entries + 16 range entries
    }

    #[test]
    fn cid_cmap_missing_lookup_returns_none() {
        assert_eq!(to_cid(SINGLE_CID).lookup(0x9999), None);
    }
}
pdfplumber_parse/cmap.rs

pdfplumber_parse/
cmap.rs