Skip to main content

oxidize_pdf/text/
cmap.rs

1//! CMap and ToUnicode support for text extraction
2//!
3//! This module implements CMap parsing and ToUnicode mappings according to
4//! ISO 32000-1:2008 Section 9.10 (Extraction of Text Content) and Section 9.7.5 (CMaps).
5//!
6//! CMaps define the mapping from character codes to character selectors (CIDs, character names, or Unicode values).
7
8use crate::parser::{ParseError, ParseResult};
9use std::collections::HashMap;
10
/// CMap type enumeration
///
/// Describes what kind of destination values a CMap produces. Equality is
/// total (`Eq`), so values can be compared freely and used wherever an `Eq`
/// bound is required.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CMapType {
    /// Maps character codes to CIDs (Character IDs)
    CIDMap,
    /// Maps character codes to Unicode values
    ToUnicode,
    /// Predefined CMap (e.g., Identity-H, Identity-V); carries the CMap name
    Predefined(String),
}
21
/// Character code range mapping
///
/// A codespace range is described by two byte strings of equal length. A code
/// belongs to the range when it has that same length and every byte lies
/// between the corresponding bytes of `start` and `end`.
#[derive(Debug, Clone)]
pub struct CodeRange {
    /// Start of the code range (lower bound for each byte position)
    pub start: Vec<u8>,
    /// End of the code range (upper bound for each byte position)
    pub end: Vec<u8>,
}

impl CodeRange {
    /// Check if a code is within this range
    ///
    /// Returns `false` when `code` differs in length from either bound.
    /// Otherwise each byte is checked against the bounds at the same
    /// position. A plain lexicographic comparison would wrongly accept codes
    /// such as `<8200>` for the range `<8140> <9FFC>`, where the second byte
    /// must stay within `40..=FC`.
    pub fn contains(&self, code: &[u8]) -> bool {
        if code.len() != self.start.len() || code.len() != self.end.len() {
            return false;
        }

        code.iter()
            .zip(self.start.iter().zip(self.end.iter()))
            .all(|(byte, (low, high))| *low <= *byte && *byte <= *high)
    }
}
41
/// CMap mapping entry
///
/// Either a one-to-one mapping or a contiguous source range whose
/// destinations are derived from a starting destination value. Entries
/// implement `PartialEq`/`Eq` so parsed results can be compared directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CMapEntry {
    /// Single character mapping
    Single {
        /// Source character code
        src: Vec<u8>,
        /// Destination (CID or Unicode)
        dst: Vec<u8>,
    },
    /// Range mapping
    Range {
        /// Start of source range
        src_start: Vec<u8>,
        /// End of source range
        src_end: Vec<u8>,
        /// Start of destination range
        dst_start: Vec<u8>,
    },
}
62
/// CMap structure for character code mappings
///
/// Filled in by `CMap::parse` or by the predefined `identity_h` /
/// `identity_v` constructors, and queried through `CMap::map`.
#[derive(Debug, Clone)]
pub struct CMap {
    /// CMap name (taken from a `/CMapName` line when parsing)
    pub name: Option<String>,
    /// CMap type
    pub cmap_type: CMapType,
    /// Writing mode (0 = horizontal, 1 = vertical)
    pub wmode: u8,
    /// Code space ranges; `map` rejects codes that fall in none of them
    pub codespace_ranges: Vec<CodeRange>,
    /// Character mappings, in the order they were parsed
    pub mappings: Vec<CMapEntry>,
    /// Cached single mappings for fast lookup
    ///
    /// Keyed by source code. `parse` inserts every `CMapEntry::Single` here in
    /// addition to `mappings`, and `map` consults this table before scanning
    /// the range entries.
    single_mappings: HashMap<Vec<u8>, Vec<u8>>,
}
79
80impl Default for CMap {
81    fn default() -> Self {
82        Self::new()
83    }
84}
85
86impl CMap {
87    /// Create a new empty CMap
88    pub fn new() -> Self {
89        Self {
90            name: None,
91            cmap_type: CMapType::ToUnicode,
92            wmode: 0,
93            codespace_ranges: Vec::new(),
94            mappings: Vec::new(),
95            single_mappings: HashMap::new(),
96        }
97    }
98
99    /// Create a predefined Identity CMap
100    pub fn identity_h() -> Self {
101        Self {
102            name: Some("Identity-H".to_string()),
103            cmap_type: CMapType::Predefined("Identity-H".to_string()),
104            wmode: 0,
105            codespace_ranges: vec![CodeRange {
106                start: vec![0x00, 0x00],
107                end: vec![0xFF, 0xFF],
108            }],
109            mappings: Vec::new(),
110            single_mappings: HashMap::new(),
111        }
112    }
113
114    /// Create a predefined Identity-V CMap
115    pub fn identity_v() -> Self {
116        Self {
117            name: Some("Identity-V".to_string()),
118            cmap_type: CMapType::Predefined("Identity-V".to_string()),
119            wmode: 1,
120            codespace_ranges: vec![CodeRange {
121                start: vec![0x00, 0x00],
122                end: vec![0xFF, 0xFF],
123            }],
124            mappings: Vec::new(),
125            single_mappings: HashMap::new(),
126        }
127    }
128
129    /// Parse a CMap from data
130    pub fn parse(data: &[u8]) -> ParseResult<Self> {
131        let mut cmap = Self::new();
132        let content =
133            std::str::from_utf8(data).map_err(|e| ParseError::CharacterEncodingError {
134                position: 0,
135                message: format!("Invalid UTF-8 in CMap: {e}"),
136            })?;
137
138        let lines = content.lines();
139        let mut in_codespace_range = false;
140        let mut in_bf_char = false;
141        let mut in_bf_range = false;
142
143        for line in lines {
144            let line = line.trim();
145
146            // Skip comments
147            if line.starts_with('%') {
148                continue;
149            }
150
151            // CMap name
152            if line.starts_with("/CMapName") {
153                if let Some(name) = extract_name(line) {
154                    cmap.name = Some(name);
155                }
156            }
157            // Writing mode
158            else if line.starts_with("/WMode") {
159                if let Some(wmode) = extract_number(line) {
160                    cmap.wmode = wmode as u8;
161                }
162            }
163            // Code space range
164            else if line.contains("begincodespacerange") {
165                in_codespace_range = true;
166            } else if line == "endcodespacerange" {
167                in_codespace_range = false;
168            } else if in_codespace_range {
169                if let Some((start, end)) = parse_hex_range(line) {
170                    cmap.codespace_ranges.push(CodeRange { start, end });
171                }
172            }
173            // BF char mappings
174            else if line.contains("beginbfchar") {
175                in_bf_char = true;
176            } else if line == "endbfchar" {
177                in_bf_char = false;
178            } else if in_bf_char {
179                if let Some((src, dst)) = parse_bf_char(line) {
180                    cmap.single_mappings.insert(src.clone(), dst.clone());
181                    cmap.mappings.push(CMapEntry::Single { src, dst });
182                }
183            }
184            // BF range mappings
185            else if line.contains("beginbfrange") {
186                in_bf_range = true;
187            } else if line == "endbfrange" {
188                in_bf_range = false;
189            } else if in_bf_range {
190                // Handle both simple format and array format
191                if let Some(entries) = parse_bf_range_entries(line) {
192                    for entry in entries {
193                        if let CMapEntry::Single { ref src, ref dst } = entry {
194                            cmap.single_mappings.insert(src.clone(), dst.clone());
195                        }
196                        cmap.mappings.push(entry);
197                    }
198                }
199            }
200        }
201
202        Ok(cmap)
203    }
204
205    /// Map a character code to its destination
206    pub fn map(&self, code: &[u8]) -> Option<Vec<u8>> {
207        // Check if code is in valid codespace
208        if !self.is_valid_code(code) {
209            return None;
210        }
211
212        // For predefined Identity CMaps
213        if let CMapType::Predefined(name) = &self.cmap_type {
214            if name.starts_with("Identity") {
215                return Some(code.to_vec());
216            }
217        }
218
219        // Check single mappings first (cached)
220        if let Some(dst) = self.single_mappings.get(code) {
221            return Some(dst.clone());
222        }
223
224        // Check range mappings
225        for mapping in &self.mappings {
226            if let CMapEntry::Range {
227                src_start,
228                src_end,
229                dst_start,
230            } = mapping
231            {
232                if code.len() == src_start.len() && code >= &src_start[..] && code <= &src_end[..] {
233                    // Calculate offset within range
234                    let offset = calculate_offset(code, src_start);
235                    let mut result = dst_start.clone();
236
237                    // Add offset to destination
238                    if let Some(last) = result.last_mut() {
239                        *last = last.wrapping_add(offset as u8);
240                    }
241
242                    return Some(result);
243                }
244            }
245        }
246
247        None
248    }
249
250    /// Check if a code is in valid codespace
251    pub fn is_valid_code(&self, code: &[u8]) -> bool {
252        for range in &self.codespace_ranges {
253            if range.contains(code) {
254                return true;
255            }
256        }
257        false
258    }
259
260    /// Convert mapped value to Unicode string
261    pub fn to_unicode(&self, mapped: &[u8]) -> Option<String> {
262        match self.cmap_type {
263            CMapType::ToUnicode => {
264                // Interpret as UTF-16BE
265                if mapped.len() % 2 == 0 {
266                    let utf16_values: Vec<u16> = mapped
267                        .chunks(2)
268                        .map(|chunk| u16::from_be_bytes([chunk[0], chunk[1]]))
269                        .collect();
270                    String::from_utf16(&utf16_values).ok()
271                } else {
272                    // Try as UTF-8
273                    String::from_utf8(mapped.to_vec()).ok()
274                }
275            }
276            _ => None,
277        }
278    }
279}
280
/// Pull the name out of a line such as `/CMapName /Adobe-Identity-UCS def`.
///
/// The second whitespace-separated token must start with `/`; the text after
/// that slash is returned. Anything else yields `None`.
fn extract_name(line: &str) -> Option<String> {
    line.split_whitespace()
        .nth(1)
        .and_then(|token| token.strip_prefix('/'))
        .map(str::to_owned)
}
290
/// Read the integer from a line such as `/WMode 0 def`.
///
/// The second whitespace-separated token is parsed as an `i32`; a missing or
/// non-numeric token yields `None`.
fn extract_number(line: &str) -> Option<i32> {
    let value = line.split_whitespace().nth(1)?;
    value.parse::<i32>().ok()
}
300
/// Parse hex string to bytes
///
/// Leading `<` and trailing `>` delimiters are stripped, then the remaining
/// text is decoded two hex digits per byte (upper or lower case).
///
/// Returns `None` when the digit count is odd or any character is not a hex
/// digit. Decoding works on raw bytes, so non-ASCII input is rejected instead
/// of panicking on a string slice boundary, and sign characters such as `+`
/// are not accepted as part of a byte.
fn parse_hex(s: &str) -> Option<Vec<u8>> {
    /// Value of a single ASCII hex digit.
    fn nibble(digit: u8) -> Option<u8> {
        match digit {
            b'0'..=b'9' => Some(digit - b'0'),
            b'a'..=b'f' => Some(digit - b'a' + 10),
            b'A'..=b'F' => Some(digit - b'A' + 10),
            _ => None,
        }
    }

    let digits = s
        .trim_start_matches('<')
        .trim_end_matches('>')
        .as_bytes();
    if digits.len() % 2 != 0 {
        return None;
    }

    digits
        .chunks_exact(2)
        .map(|pair| Some(nibble(pair[0])? << 4 | nibble(pair[1])?))
        .collect()
}
318
319/// Parse a hex range like "<0000> <FFFF>"
320fn parse_hex_range(line: &str) -> Option<(Vec<u8>, Vec<u8>)> {
321    let parts: Vec<&str> = line.split_whitespace().collect();
322    if parts.len() >= 2 {
323        if let (Some(start), Some(end)) = (parse_hex(parts[0]), parse_hex(parts[1])) {
324            return Some((start, end));
325        }
326    }
327    None
328}
329
330/// Parse a bfchar line like "<0001> <0041>"
331fn parse_bf_char(line: &str) -> Option<(Vec<u8>, Vec<u8>)> {
332    parse_hex_range(line)
333}
334
335/// Parse a bfrange line - returns Vec of entries (array format creates multiple entries)
336fn parse_bf_range_entries(line: &str) -> Option<Vec<CMapEntry>> {
337    // Check if line contains an array (format: <start> <end> [<dst1> <dst2> ...])
338    if line.contains('[') {
339        // Parse array format: <srcStart> <srcEnd> [<dst1> <dst2> <dst3> ...]
340        if let Some(array_start) = line.find('[') {
341            let before_array = &line[..array_start];
342            let parts: Vec<&str> = before_array.split_whitespace().collect();
343
344            if parts.len() >= 2 {
345                if let (Some(src_start), Some(src_end)) = (parse_hex(parts[0]), parse_hex(parts[1]))
346                {
347                    // Extract array values
348                    let after_bracket = &line[array_start + 1..];
349                    if let Some(array_end) = after_bracket.find(']') {
350                        let array_content = &after_bracket[..array_end];
351
352                        // Parse each hex value in the array
353                        let hex_values: Vec<Vec<u8>> = array_content
354                            .split_whitespace()
355                            .filter_map(parse_hex)
356                            .collect();
357
358                        // Create individual Single entries for each mapping
359                        let mut entries = Vec::new();
360                        let mut current_src = src_start;
361
362                        for dst in hex_values {
363                            entries.push(CMapEntry::Single {
364                                src: current_src.clone(),
365                                dst,
366                            });
367
368                            // Increment source code
369                            if let Some(last) = current_src.last_mut() {
370                                *last = last.wrapping_add(1);
371                            }
372
373                            // Stop if we've reached src_end
374                            if current_src > src_end {
375                                break;
376                            }
377                        }
378
379                        return Some(entries);
380                    }
381                }
382            }
383        }
384        return None;
385    }
386
387    // Original simple format: <start> <end> <dst>
388    let parts: Vec<&str> = line.split_whitespace().collect();
389    if parts.len() >= 3 {
390        if let (Some(start), Some(end), Some(dst)) = (
391            parse_hex(parts[0]),
392            parse_hex(parts[1]),
393            parse_hex(parts[2]),
394        ) {
395            return Some(vec![CMapEntry::Range {
396                src_start: start,
397                src_end: end,
398                dst_start: dst,
399            }]);
400        }
401    }
402    None
403}
404
/// Calculate offset between two byte arrays
///
/// Both slices are read as big-endian integers of the same width and the
/// result is `code - start`. Callers are expected to pass slices of equal
/// length; if they differ, only the leading bytes common to both are used.
///
/// The difference is accumulated in a signed wide integer, so a low byte of
/// `code` being smaller than the matching byte of `start` (for example
/// `<0100>` relative to `<0005>`) borrows correctly instead of underflowing.
/// Returns `0` when `code` precedes `start`, and saturates at `usize::MAX`
/// if the distance does not fit.
fn calculate_offset(code: &[u8], start: &[u8]) -> usize {
    let diff = code.iter().zip(start).fold(0i128, |acc, (&c, &s)| {
        acc.wrapping_mul(256)
            .wrapping_add(i128::from(c) - i128::from(s))
    });

    if diff <= 0 {
        0
    } else if diff > usize::MAX as i128 {
        usize::MAX
    } else {
        diff as usize
    }
}
414
415/// ToUnicode CMap builder for creating custom mappings
416#[derive(Debug, Clone)]
417pub struct ToUnicodeCMapBuilder {
418    /// Character to Unicode mappings
419    mappings: HashMap<Vec<u8>, String>,
420    /// Code length in bytes
421    code_length: usize,
422}
423
424impl ToUnicodeCMapBuilder {
425    /// Create a new ToUnicode CMap builder
426    pub fn new(code_length: usize) -> Self {
427        Self {
428            mappings: HashMap::new(),
429            code_length,
430        }
431    }
432
433    /// Add a character mapping
434    pub fn add_mapping(&mut self, char_code: Vec<u8>, unicode: &str) {
435        self.mappings.insert(char_code, unicode.to_string());
436    }
437
438    /// Add a mapping from a single byte code
439    pub fn add_single_byte_mapping(&mut self, char_code: u8, unicode: char) {
440        let code = if self.code_length == 1 {
441            vec![char_code]
442        } else {
443            // Pad with zeros for multi-byte codes
444            let mut code = vec![0; self.code_length - 1];
445            code.push(char_code);
446            code
447        };
448        self.mappings.insert(code, unicode.to_string());
449    }
450
451    /// Build the ToUnicode CMap content
452    pub fn build(&self) -> Vec<u8> {
453        let mut content = String::new();
454
455        // CMap header
456        content.push_str("/CIDInit /ProcSet findresource begin\n");
457        content.push_str("12 dict begin\n");
458        content.push_str("begincmap\n");
459        content.push_str("/CIDSystemInfo\n");
460        content.push_str("<< /Registry (Adobe)\n");
461        content.push_str("   /Ordering (UCS)\n");
462        content.push_str("   /Supplement 0\n");
463        content.push_str(">> def\n");
464        content.push_str("/CMapName /Adobe-Identity-UCS def\n");
465        content.push_str("/CMapType 2 def\n");
466
467        // Code space range
468        content.push_str("1 begincodespacerange\n");
469        if self.code_length == 1 {
470            content.push_str("<00> <FF>\n");
471        } else {
472            let start = vec![0x00; self.code_length];
473            let end = vec![0xFF; self.code_length];
474            content.push_str(&format!(
475                "<{}> <{}>\n",
476                hex_string(&start),
477                hex_string(&end)
478            ));
479        }
480        content.push_str("endcodespacerange\n");
481
482        // Character mappings
483        if !self.mappings.is_empty() {
484            // Group mappings by consecutive ranges
485            let mut sorted_mappings: Vec<_> = self.mappings.iter().collect();
486            sorted_mappings.sort_by_key(|(k, _)| *k);
487
488            // Output single character mappings
489            let mut single_mappings = Vec::new();
490            for (code, unicode) in &sorted_mappings {
491                let utf16_bytes = string_to_utf16_be_bytes(unicode);
492                single_mappings.push((code, utf16_bytes));
493            }
494
495            // Write bfchar mappings in chunks of 100
496            for chunk in single_mappings.chunks(100) {
497                content.push_str(&format!("{} beginbfchar\n", chunk.len()));
498                for (code, unicode_bytes) in chunk {
499                    content.push_str(&format!(
500                        "<{}> <{}>\n",
501                        hex_string(code),
502                        hex_string(unicode_bytes)
503                    ));
504                }
505                content.push_str("endbfchar\n");
506            }
507        }
508
509        // CMap footer
510        content.push_str("endcmap\n");
511        content.push_str("CMapName currentdict /CMap defineresource pop\n");
512        content.push_str("end\n");
513        content.push_str("end\n");
514
515        content.into_bytes()
516    }
517}
518
/// Encode `s` as UTF-16 and return the code units as big-endian bytes.
///
/// Characters outside the Basic Multilingual Plane become surrogate pairs, so
/// they occupy four bytes.
pub fn string_to_utf16_be_bytes(s: &str) -> Vec<u8> {
    s.encode_utf16()
        .flat_map(|unit| unit.to_be_bytes())
        .collect()
}
527
/// Render `bytes` as upper-case hexadecimal, two digits per byte and no
/// separators.
pub fn hex_string(bytes: &[u8]) -> String {
    const DIGITS: &[u8; 16] = b"0123456789ABCDEF";

    let mut rendered = String::with_capacity(bytes.len() * 2);
    for &byte in bytes {
        rendered.push(DIGITS[usize::from(byte >> 4)] as char);
        rendered.push(DIGITS[usize::from(byte & 0x0F)] as char);
    }
    rendered
}
532
// Unit tests for the CMap parser, lookup logic and the ToUnicode builder.
#[cfg(test)]
mod tests {
    use super::*;

    /// A one-byte codespace range accepts every one-byte code and rejects
    /// codes of a different length.
    #[test]
    fn test_code_range() {
        let range = CodeRange {
            start: vec![0x00],
            end: vec![0xFF],
        };

        assert!(range.contains(&[0x00]));
        assert!(range.contains(&[0x80]));
        assert!(range.contains(&[0xFF]));
        assert!(!range.contains(&[0x00, 0x00])); // Wrong length
    }

    /// The predefined Identity-H CMap carries its name, horizontal writing
    /// mode, and maps a two-byte code to itself.
    #[test]
    fn test_identity_cmap() {
        let cmap = CMap::identity_h();
        assert_eq!(cmap.name, Some("Identity-H".to_string()));
        assert_eq!(cmap.wmode, 0);

        // Identity mapping returns the same code
        let code = vec![0x00, 0x41];
        assert_eq!(cmap.map(&code), Some(code.clone()));
    }

    /// `parse_hex` decodes delimited hex strings and rejects non-hex text.
    #[test]
    fn test_parse_hex() {
        assert_eq!(parse_hex("<00>"), Some(vec![0x00]));
        assert_eq!(parse_hex("<FF>"), Some(vec![0xFF]));
        assert_eq!(parse_hex("<0041>"), Some(vec![0x00, 0x41]));
        assert_eq!(parse_hex("<FEFF>"), Some(vec![0xFE, 0xFF]));
        assert_eq!(parse_hex("invalid"), None);
    }

    /// `calculate_offset` treats its arguments as big-endian integers.
    #[test]
    fn test_calculate_offset() {
        assert_eq!(calculate_offset(&[0x00, 0x05], &[0x00, 0x00]), 5);
        assert_eq!(calculate_offset(&[0x01, 0x00], &[0x00, 0x00]), 256);
        assert_eq!(calculate_offset(&[0xFF], &[0x00]), 255);
    }

    /// A one-byte builder emits the CMap name, a `<00> <FF>` codespace range
    /// and a `bfchar` block.
    #[test]
    fn test_tounicode_builder() {
        let mut builder = ToUnicodeCMapBuilder::new(1);
        builder.add_single_byte_mapping(0x41, 'A');
        builder.add_single_byte_mapping(0x42, 'B');

        let content = builder.build();
        let content_str = String::from_utf8(content).unwrap();

        assert!(content_str.contains("/CMapName /Adobe-Identity-UCS def"));
        assert!(content_str.contains("begincodespacerange"));
        assert!(content_str.contains("<00> <FF>"));
        assert!(content_str.contains("beginbfchar"));
    }

    /// Parses a minimal CMap with one codespace range and two `bfchar`
    /// entries, then looks both entries up.
    #[test]
    fn test_simple_cmap_parsing() {
        let cmap_data = br#"
%!PS-Adobe-3.0 Resource-CMap
%%DocumentNeededResources: ProcSet (CIDInit)
%%IncludeResource: ProcSet (CIDInit)
%%BeginResource: CMap (Custom)
%%Title: (Custom Adobe UCS 0)
%%Version: 1.000
%%EndComments

/CIDInit /ProcSet findresource begin
12 dict begin
begincmap
/CIDSystemInfo
<< /Registry (Adobe)
   /Ordering (UCS)
   /Supplement 0
>> def
/CMapName /Custom def
/CMapType 2 def
1 begincodespacerange
<00> <FF>
endcodespacerange
2 beginbfchar
<20> <0020>
<41> <0041>
endbfchar
endcmap
"#;

        let cmap = CMap::parse(cmap_data).unwrap();
        assert_eq!(cmap.name, Some("Custom".to_string()));
        assert_eq!(cmap.codespace_ranges.len(), 1);
        assert_eq!(cmap.map(&[0x20]), Some(vec![0x00, 0x20]));
        assert_eq!(cmap.map(&[0x41]), Some(vec![0x00, 0x41]));
    }

    /// `to_unicode` decodes even-length destinations as UTF-16BE.
    #[test]
    fn test_cmap_to_unicode() {
        let mut cmap = CMap::new();
        cmap.cmap_type = CMapType::ToUnicode;

        // UTF-16BE for 'A'
        let unicode_a = vec![0x00, 0x41];
        assert_eq!(cmap.to_unicode(&unicode_a), Some("A".to_string()));

        // UTF-16BE for '中' (U+4E2D)
        let unicode_cjk = vec![0x4E, 0x2D];
        assert_eq!(cmap.to_unicode(&unicode_cjk), Some("中".to_string()));
    }

    /// A `Range` entry maps each code in the source range to the destination
    /// start plus the code's offset; codes past the range end are unmapped.
    #[test]
    fn test_bf_range_mapping() {
        let mut cmap = CMap::new();
        cmap.codespace_ranges.push(CodeRange {
            start: vec![0x00],
            end: vec![0xFF],
        });
        cmap.mappings.push(CMapEntry::Range {
            src_start: vec![0x20],
            src_end: vec![0x7E],
            dst_start: vec![0x00, 0x20],
        });

        // Test range mapping
        assert_eq!(cmap.map(&[0x20]), Some(vec![0x00, 0x20])); // Space
        assert_eq!(cmap.map(&[0x41]), Some(vec![0x00, 0x41])); // 'A'
        assert_eq!(cmap.map(&[0x7E]), Some(vec![0x00, 0x7E])); // '~'
        assert_eq!(cmap.map(&[0x7F]), None); // Out of range
    }

    /// A two-byte builder emits a `<0000> <FFFF>` codespace range and writes
    /// the two-byte codes it was given.
    #[test]
    fn test_multibyte_mapping() {
        let mut builder = ToUnicodeCMapBuilder::new(2);
        builder.add_mapping(vec![0x00, 0x41], "A");
        builder.add_mapping(vec![0x00, 0x42], "B");

        let content = builder.build();
        let content_str = String::from_utf8(content).unwrap();

        assert!(content_str.contains("<0000> <FFFF>"));
        assert!(content_str.contains("<0041>"));
        assert!(content_str.contains("<0042>"));
    }
}
677}