justpdf_core/font/
cmap.rs

1use std::collections::HashMap;
2
/// In-memory form of a ToUnicode CMap: a table from character codes to Unicode text.
///
/// Lookups consult the explicit per-code entries first and fall back to the
/// numeric ranges.
#[derive(Clone, Debug)]
pub struct ToUnicodeCMap {
    /// Explicit entries: one character code → the Unicode string it stands for.
    mappings: HashMap<u32, String>,
    /// Contiguous runs stored as `(start_code, end_code, start_unicode)`.
    /// A code inside `start_code..=end_code` maps to the code point
    /// `start_unicode + (code - start_code)`.
    ranges: Vec<(u32, u32, u32)>,
}
11
12impl ToUnicodeCMap {
13    /// Parse a ToUnicode CMap from its raw stream data.
14    pub fn parse(data: &[u8]) -> Self {
15        let mut cmap = Self {
16            mappings: HashMap::new(),
17            ranges: Vec::new(),
18        };
19
20        let text = String::from_utf8_lossy(data);
21
22        // Parse "beginbfchar" sections: <src> <dst>
23        let mut pos = 0;
24        while let Some(start) = text[pos..].find("beginbfchar") {
25            let section_start = pos + start + "beginbfchar".len();
26            let section_end = text[section_start..]
27                .find("endbfchar")
28                .map(|i| section_start + i)
29                .unwrap_or(text.len());
30
31            let section = &text[section_start..section_end];
32            parse_bfchar_section(section, &mut cmap.mappings);
33
34            pos = section_end;
35        }
36
37        // Parse "beginbfrange" sections: <start> <end> <dst>
38        pos = 0;
39        while let Some(start) = text[pos..].find("beginbfrange") {
40            let section_start = pos + start + "beginbfrange".len();
41            let section_end = text[section_start..]
42                .find("endbfrange")
43                .map(|i| section_start + i)
44                .unwrap_or(text.len());
45
46            let section = &text[section_start..section_end];
47            parse_bfrange_section(section, &mut cmap.mappings, &mut cmap.ranges);
48
49            pos = section_end;
50        }
51
52        cmap
53    }
54
55    /// Look up a character code and return the Unicode string.
56    pub fn lookup(&self, code: u32) -> Option<String> {
57        // Check direct mappings first
58        if let Some(s) = self.mappings.get(&code) {
59            return Some(s.clone());
60        }
61
62        // Check range mappings
63        for &(start, end, start_unicode) in &self.ranges {
64            if code >= start && code <= end {
65                let offset = code - start;
66                if let Some(c) = char::from_u32(start_unicode + offset) {
67                    return Some(c.to_string());
68                }
69            }
70        }
71
72        None
73    }
74
75    /// Number of mappings.
76    pub fn len(&self) -> usize {
77        self.mappings.len() + self.ranges.len()
78    }
79
80    pub fn is_empty(&self) -> bool {
81        self.mappings.is_empty() && self.ranges.is_empty()
82    }
83}
84
85/// Parse a bfchar section: pairs of <srcCode> <dstString>.
86fn parse_bfchar_section(section: &str, mappings: &mut HashMap<u32, String>) {
87    let hex_values = extract_hex_values(section);
88    for pair in hex_values.chunks(2) {
89        if pair.len() == 2 {
90            let src_code = u32::from_str_radix(&pair[0], 16).unwrap_or(0);
91            let dst_str = hex_to_unicode_string(&pair[1]);
92            mappings.insert(src_code, dst_str);
93        }
94    }
95}
96
97/// Parse a bfrange section: triples of <startCode> <endCode> <dstStartOrArray>.
98fn parse_bfrange_section(
99    section: &str,
100    mappings: &mut HashMap<u32, String>,
101    ranges: &mut Vec<(u32, u32, u32)>,
102) {
103    let mut chars = section.chars().peekable();
104    loop {
105        // Skip to next '<'
106        skip_until(&mut chars, '<');
107        let start_hex = read_hex_token(&mut chars);
108        if start_hex.is_empty() {
109            break;
110        }
111
112        skip_until(&mut chars, '<');
113        let end_hex = read_hex_token(&mut chars);
114        if end_hex.is_empty() {
115            break;
116        }
117
118        let start_code = u32::from_str_radix(&start_hex, 16).unwrap_or(0);
119        let end_code = u32::from_str_radix(&end_hex, 16).unwrap_or(0);
120
121        // Next could be <hex> or [ <hex> <hex> ... ]
122        skip_whitespace(&mut chars);
123        match chars.peek() {
124            Some('<') => {
125                chars.next(); // skip '<'
126                let dst_hex = read_hex_token(&mut chars);
127                let dst_code = u32::from_str_radix(&dst_hex, 16).unwrap_or(0);
128                ranges.push((start_code, end_code, dst_code));
129            }
130            Some('[') => {
131                chars.next(); // skip '['
132                let mut code = start_code;
133                loop {
134                    skip_whitespace(&mut chars);
135                    match chars.peek() {
136                        Some(']') => {
137                            chars.next();
138                            break;
139                        }
140                        Some('<') => {
141                            chars.next();
142                            let hex = read_hex_token(&mut chars);
143                            let dst_str = hex_to_unicode_string(&hex);
144                            mappings.insert(code, dst_str);
145                            code += 1;
146                        }
147                        None => break,
148                        _ => {
149                            chars.next();
150                        }
151                    }
152                }
153            }
154            _ => break,
155        }
156    }
157}
158
/// Collect the contents of every `<...>` token in `text`, in order.
///
/// Only ASCII hex digits inside a token are kept; anything else between the
/// angle brackets is dropped. A `<` seen while a token is already open
/// restarts that token, a `>` with no open token is ignored, and a token that
/// is never closed is discarded.
fn extract_hex_values(text: &str) -> Vec<String> {
    let mut tokens = Vec::new();
    // `Some(buffer)` while inside `<...>`, `None` otherwise.
    let mut open: Option<String> = None;

    for ch in text.chars() {
        if ch == '<' {
            open = Some(String::new());
        } else if ch == '>' {
            if let Some(token) = open.take() {
                tokens.push(token);
            }
        } else if ch.is_ascii_hexdigit() {
            if let Some(buffer) = open.as_mut() {
                buffer.push(ch);
            }
        }
    }

    tokens
}
186
/// Convert a hex string to a Unicode string.
///
/// Each pair of hex digits is one byte, and the bytes are read as UTF-16BE
/// code units:
/// * a trailing unpaired hex digit is ignored;
/// * an odd number of bytes gets a leading zero byte, so a short destination
///   such as `41` decodes as U+0041 instead of being lost;
/// * surrogate pairs are combined wherever they occur in the string;
/// * unpaired surrogates are dropped.
///
/// Returns an empty string when nothing decodes.
fn hex_to_unicode_string(hex: &str) -> String {
    // Slice on bytes rather than on `str` indices so that unexpected
    // non-ASCII input can never split a character and panic.
    let mut bytes: Vec<u8> = hex
        .as_bytes()
        .chunks_exact(2)
        .filter_map(|pair| {
            let digits = std::str::from_utf8(pair).ok()?;
            u8::from_str_radix(digits, 16).ok()
        })
        .collect();

    // Odd byte count: treat the leading zero byte as omitted.
    if bytes.len() % 2 == 1 {
        bytes.insert(0, 0);
    }

    let units = bytes
        .chunks_exact(2)
        .map(|unit| u16::from_be_bytes([unit[0], unit[1]]));

    // `decode_utf16` pairs up surrogates; `Err` items are lone surrogates.
    std::char::decode_utf16(units)
        .filter_map(Result::ok)
        .collect()
}
245
/// Advance `chars` past the first occurrence of `target`.
///
/// Everything up to and including `target` is consumed. If `target` never
/// appears, the iterator is drained.
fn skip_until(chars: &mut std::iter::Peekable<std::str::Chars<'_>>, target: char) {
    for ch in chars.by_ref() {
        if ch == target {
            break;
        }
    }
}
255
/// Consume leading ASCII whitespace, leaving `chars` at the first character
/// that is not ASCII whitespace (or at the end of input).
fn skip_whitespace(chars: &mut std::iter::Peekable<std::str::Chars<'_>>) {
    while chars.peek().map_or(false, |ch| ch.is_ascii_whitespace()) {
        chars.next();
    }
}
265
/// Read the remainder of a `<...>` token whose opening `<` was already consumed.
///
/// Returns the ASCII hex digits found before the closing `>`; any other
/// character is skipped. The `>` itself is consumed. If no `>` follows, the
/// rest of the input is consumed.
fn read_hex_token(chars: &mut std::iter::Peekable<std::str::Chars<'_>>) -> String {
    // `take_while` also swallows the first character that fails its test,
    // which is exactly the closing '>'.
    chars
        .by_ref()
        .take_while(|&ch| ch != '>')
        .filter(|ch| ch.is_ascii_hexdigit())
        .collect()
}
280
#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that `cmap` resolves each `(code, text)` pair in `expected`.
    fn assert_maps(cmap: &ToUnicodeCMap, expected: &[(u32, &str)]) {
        for &(code, text) in expected {
            assert_eq!(cmap.lookup(code), Some(text.to_string()));
        }
    }

    /// Assert that `cmap` has no entry for `code`.
    fn assert_unmapped(cmap: &ToUnicodeCMap, code: u32) {
        assert_eq!(cmap.lookup(code), None);
    }

    // Explicit per-code entries from a bfchar section.
    #[test]
    fn test_parse_bfchar() {
        let data = br#"
/CIDInit /ProcSet findresource begin
1 begincodespacerange
<0000> <FFFF>
endcodespacerange
3 beginbfchar
<0003> <0020>
<0011> <002E>
<0024> <0041>
endbfchar
endcmap
"#;
        let cmap = ToUnicodeCMap::parse(data);
        assert_maps(&cmap, &[(0x0003, " "), (0x0011, "."), (0x0024, "A")]);
        assert_unmapped(&cmap, 0x9999);
    }

    // A numeric range: first, middle and last code, plus one just outside.
    #[test]
    fn test_parse_bfrange() {
        let data = br#"
1 begincodespacerange
<00> <FF>
endcodespacerange
1 beginbfrange
<41> <5A> <0041>
endbfrange
"#;
        let cmap = ToUnicodeCMap::parse(data);
        assert_maps(&cmap, &[(0x41, "A"), (0x42, "B"), (0x5A, "Z")]);
        assert_unmapped(&cmap, 0x40); // before range
    }

    // A range whose destination is an array of per-code strings.
    #[test]
    fn test_parse_bfrange_with_array() {
        let data = br#"
1 beginbfrange
<01> <03> [<0041> <0042> <0043>]
endbfrange
"#;
        let cmap = ToUnicodeCMap::parse(data);
        assert_maps(&cmap, &[(0x01, "A"), (0x02, "B"), (0x03, "C")]);
    }

    // Empty input produces an empty map.
    #[test]
    fn test_empty_cmap() {
        assert!(ToUnicodeCMap::parse(b"").is_empty());
    }

    // Single UTF-16BE code units.
    #[test]
    fn test_hex_to_unicode() {
        let cases = [
            ("0041", "A"),
            ("0048", "H"),
            ("AC00", "가"), // Korean
        ];
        for &(hex, text) in cases.iter() {
            assert_eq!(hex_to_unicode_string(hex), text);
        }
    }

    // Two code units that are two separate code points.
    #[test]
    fn test_multibyte_unicode() {
        assert_eq!(hex_to_unicode_string("00480069"), "Hi");
    }
}
justpdf_core/font/cmap.rs

justpdf_core/font/
cmap.rs