Skip to main content

folio_font/
cmap.rs

1//! ToUnicode CMap parsing — maps character codes to Unicode strings.
2//!
3//! ToUnicode CMaps are the primary mechanism for extracting Unicode text
4//! from PDFs. They are embedded as streams in the font dictionary.
5
6use folio_core::Result;
7use std::collections::HashMap;
8
/// In-memory form of a font's ToUnicode CMap.
///
/// Holds the two kinds of entry the parser collects: one-off code mappings
/// and contiguous code ranges.
#[derive(Clone, Debug, Default)]
pub struct ToUnicodeCMap {
    /// Single-code entries: character code -> Unicode text.
    mappings: HashMap<u32, String>,
    /// Contiguous runs stored as `(start_code, end_code, start_unicode)`.
    ranges: Vec<(u32, u32, u32)>,
}
17
18impl ToUnicodeCMap {
19    /// Parse a ToUnicode CMap from its text content.
20    pub fn parse(data: &[u8]) -> Result<Self> {
21        let text = String::from_utf8_lossy(data);
22        let mut cmap = ToUnicodeCMap::default();
23
24        let mut lines = text.lines().peekable();
25
26        while let Some(line) = lines.next() {
27            let line = line.trim();
28
29            if line.ends_with("beginbfchar") {
30                // Parse individual character mappings
31                while let Some(mapping_line) = lines.next() {
32                    let mapping_line = mapping_line.trim();
33                    if mapping_line.contains("endbfchar") {
34                        break;
35                    }
36                    if let Some((code, unicode)) = parse_bfchar_line(mapping_line) {
37                        cmap.mappings.insert(code, unicode);
38                    }
39                }
40            } else if line.ends_with("beginbfrange") {
41                // Parse range mappings
42                while let Some(range_line) = lines.next() {
43                    let range_line = range_line.trim();
44                    if range_line.contains("endbfrange") {
45                        break;
46                    }
47                    if let Some((start, end, unicode_start)) = parse_bfrange_line(range_line) {
48                        cmap.ranges.push((start, end, unicode_start));
49                    }
50                }
51            }
52        }
53
54        Ok(cmap)
55    }
56
57    /// Look up a character code in this CMap.
58    pub fn lookup(&self, code: u32) -> Option<String> {
59        // Check direct mappings first
60        if let Some(s) = self.mappings.get(&code) {
61            return Some(s.clone());
62        }
63
64        // Check range mappings
65        for &(start, end, unicode_start) in &self.ranges {
66            if code >= start && code <= end {
67                let offset = code - start;
68                if let Some(ch) = char::from_u32(unicode_start + offset) {
69                    return Some(ch.to_string());
70                }
71            }
72        }
73
74        None
75    }
76
77    /// Decode a byte sequence using this CMap.
78    ///
79    /// Tries 2-byte codes first, falls back to 1-byte.
80    pub fn decode(&self, data: &[u8]) -> String {
81        let mut result = String::new();
82        let mut i = 0;
83
84        while i < data.len() {
85            // Try 2-byte code first (common in CID fonts)
86            if i + 1 < data.len() {
87                let code2 = ((data[i] as u32) << 8) | (data[i + 1] as u32);
88                if let Some(s) = self.lookup(code2) {
89                    result.push_str(&s);
90                    i += 2;
91                    continue;
92                }
93            }
94
95            // Try 1-byte code
96            let code1 = data[i] as u32;
97            if let Some(s) = self.lookup(code1) {
98                result.push_str(&s);
99            } else {
100                // Fallback: try as ASCII
101                if data[i] >= 0x20 && data[i] <= 0x7E {
102                    result.push(data[i] as char);
103                }
104            }
105            i += 1;
106        }
107
108        result
109    }
110
111    /// Whether this CMap has any mappings.
112    pub fn is_empty(&self) -> bool {
113        self.mappings.is_empty() && self.ranges.is_empty()
114    }
115}
116
117/// Parse a single bfchar mapping line: <XXXX> <YYYY>
118fn parse_bfchar_line(line: &str) -> Option<(u32, String)> {
119    let parts: Vec<&str> = line.split('<').filter(|s| !s.is_empty()).collect();
120    if parts.len() < 2 {
121        return None;
122    }
123
124    let code_hex = parts[0].split('>').next()?;
125    let unicode_hex = parts[1].split('>').next()?;
126
127    let code = u32::from_str_radix(code_hex.trim(), 16).ok()?;
128    let unicode_str = hex_to_unicode_string(unicode_hex.trim())?;
129
130    Some((code, unicode_str))
131}
132
133/// Parse a bfrange line: <XXXX> <YYYY> <ZZZZ>
134fn parse_bfrange_line(line: &str) -> Option<(u32, u32, u32)> {
135    let parts: Vec<&str> = line.split('<').filter(|s| !s.is_empty()).collect();
136    if parts.len() < 3 {
137        return None;
138    }
139
140    let start_hex = parts[0].split('>').next()?;
141    let end_hex = parts[1].split('>').next()?;
142    let unicode_hex = parts[2].split('>').next()?;
143
144    let start = u32::from_str_radix(start_hex.trim(), 16).ok()?;
145    let end = u32::from_str_radix(end_hex.trim(), 16).ok()?;
146    let unicode_start = u32::from_str_radix(unicode_hex.trim(), 16).ok()?;
147
148    Some((start, end, unicode_start))
149}
150
/// Convert the hex digits of a ToUnicode destination string into text.
///
/// The digits are read as big-endian UTF-16 code units, four hex digits per
/// unit, so a surrogate pair decodes to a single supplementary-plane
/// character. E.g., "00480065" -> "He" and "D83DDE00" -> U+1F600.
///
/// An input of at most four digits is parsed as one code point (so "48" ->
/// "H"). For longer inputs, trailing digits that do not fill a whole unit,
/// units that are not valid hex, and unpaired surrogates are skipped.
///
/// Returns `None` when no character could be decoded.
fn hex_to_unicode_string(hex: &str) -> Option<String> {
    let hex = hex.trim();

    if hex.len() <= 4 {
        // Single code point
        let cp = u32::from_str_radix(hex, 16).ok()?;
        return char::from_u32(cp).map(|c| c.to_string());
    }

    // Chunk the raw bytes instead of slicing the `str`: the caller's text was
    // decoded lossily, so it may contain multi-byte characters, and slicing a
    // `str` at a fixed byte offset would panic inside one of them.
    let units = hex.as_bytes().chunks_exact(4).filter_map(|chunk| {
        let digits = std::str::from_utf8(chunk).ok()?;
        u16::from_str_radix(digits, 16).ok()
    });

    // Decoding as UTF-16 combines surrogate pairs; unpaired surrogates come
    // back as errors and are dropped.
    let text: String = std::char::decode_utf16(units)
        .filter_map(|unit| unit.ok())
        .collect();

    if text.is_empty() {
        None
    } else {
        Some(text)
    }
}
178
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `source` as a CMap, panicking if the parser reports an error.
    fn parsed(source: &[u8]) -> ToUnicodeCMap {
        ToUnicodeCMap::parse(source).unwrap()
    }

    /// `bfchar` entries inside a complete CMap program resolve one by one.
    #[test]
    fn test_parse_bfchar() {
        let cmap = parsed(
            br#"
/CIDInit /ProcSet findresource begin
12 dict begin
begincmap
1 begincodespacerange
<00> <FF>
endcodespacerange
3 beginbfchar
<01> <0048>
<02> <0065>
<03> <006C>
endbfchar
endcmap
"#,
        );

        let expected: &[(u32, &str)] = &[(1, "H"), (2, "e"), (3, "l")];
        for &(code, text) in expected {
            assert_eq!(cmap.lookup(code), Some(text.into()));
        }
    }

    /// A `bfrange` entry covers both endpoints and nothing past the end.
    #[test]
    fn test_parse_bfrange() {
        let cmap = parsed(
            br#"
1 beginbfrange
<0041> <005A> <0041>
endbfrange
"#,
        );

        let inside: &[(u32, &str)] = &[(0x41, "A"), (0x42, "B"), (0x5A, "Z")];
        for &(code, text) in inside {
            assert_eq!(cmap.lookup(code), Some(text.into()));
        }
        assert_eq!(cmap.lookup(0x5B), None);
    }

    /// Single-byte codes decode through the 1-byte fallback path.
    #[test]
    fn test_decode() {
        let cmap = parsed(
            br#"
3 beginbfchar
<48> <0048>
<65> <0065>
<6C> <006C>
endbfchar
"#,
        );

        assert_eq!(cmap.decode(b"Hel"), "Hel");
    }
}