justpdf_core/font/
cmap.rs1use std::collections::HashMap;
2
/// Character-code to Unicode table built from a ToUnicode CMap.
///
/// Explicit per-code entries live in `mappings`; contiguous runs are kept
/// compactly in `ranges`.
#[derive(Clone, Debug)]
pub struct ToUnicodeCMap {
    /// Explicit `code -> text` entries (from `bfchar` sections and from the
    /// array form of `bfrange` entries).
    mappings: HashMap<u32, String>,
    /// `(start_code, end_code, start_unicode)` triples: a code inside the
    /// inclusive span `start_code..=end_code` maps to `start_unicode` plus its
    /// offset from `start_code`.
    ranges: Vec<(u32, u32, u32)>,
}
11
12impl ToUnicodeCMap {
13 pub fn parse(data: &[u8]) -> Self {
15 let mut cmap = Self {
16 mappings: HashMap::new(),
17 ranges: Vec::new(),
18 };
19
20 let text = String::from_utf8_lossy(data);
21
22 let mut pos = 0;
24 while let Some(start) = text[pos..].find("beginbfchar") {
25 let section_start = pos + start + "beginbfchar".len();
26 let section_end = text[section_start..]
27 .find("endbfchar")
28 .map(|i| section_start + i)
29 .unwrap_or(text.len());
30
31 let section = &text[section_start..section_end];
32 parse_bfchar_section(section, &mut cmap.mappings);
33
34 pos = section_end;
35 }
36
37 pos = 0;
39 while let Some(start) = text[pos..].find("beginbfrange") {
40 let section_start = pos + start + "beginbfrange".len();
41 let section_end = text[section_start..]
42 .find("endbfrange")
43 .map(|i| section_start + i)
44 .unwrap_or(text.len());
45
46 let section = &text[section_start..section_end];
47 parse_bfrange_section(section, &mut cmap.mappings, &mut cmap.ranges);
48
49 pos = section_end;
50 }
51
52 cmap
53 }
54
55 pub fn lookup(&self, code: u32) -> Option<String> {
57 if let Some(s) = self.mappings.get(&code) {
59 return Some(s.clone());
60 }
61
62 for &(start, end, start_unicode) in &self.ranges {
64 if code >= start && code <= end {
65 let offset = code - start;
66 if let Some(c) = char::from_u32(start_unicode + offset) {
67 return Some(c.to_string());
68 }
69 }
70 }
71
72 None
73 }
74
75 pub fn len(&self) -> usize {
77 self.mappings.len() + self.ranges.len()
78 }
79
80 pub fn is_empty(&self) -> bool {
81 self.mappings.is_empty() && self.ranges.is_empty()
82 }
83}
84
85fn parse_bfchar_section(section: &str, mappings: &mut HashMap<u32, String>) {
87 let hex_values = extract_hex_values(section);
88 for pair in hex_values.chunks(2) {
89 if pair.len() == 2 {
90 let src_code = u32::from_str_radix(&pair[0], 16).unwrap_or(0);
91 let dst_str = hex_to_unicode_string(&pair[1]);
92 mappings.insert(src_code, dst_str);
93 }
94 }
95}
96
97fn parse_bfrange_section(
99 section: &str,
100 mappings: &mut HashMap<u32, String>,
101 ranges: &mut Vec<(u32, u32, u32)>,
102) {
103 let mut chars = section.chars().peekable();
104 loop {
105 skip_until(&mut chars, '<');
107 let start_hex = read_hex_token(&mut chars);
108 if start_hex.is_empty() {
109 break;
110 }
111
112 skip_until(&mut chars, '<');
113 let end_hex = read_hex_token(&mut chars);
114 if end_hex.is_empty() {
115 break;
116 }
117
118 let start_code = u32::from_str_radix(&start_hex, 16).unwrap_or(0);
119 let end_code = u32::from_str_radix(&end_hex, 16).unwrap_or(0);
120
121 skip_whitespace(&mut chars);
123 match chars.peek() {
124 Some('<') => {
125 chars.next(); let dst_hex = read_hex_token(&mut chars);
127 let dst_code = u32::from_str_radix(&dst_hex, 16).unwrap_or(0);
128 ranges.push((start_code, end_code, dst_code));
129 }
130 Some('[') => {
131 chars.next(); let mut code = start_code;
133 loop {
134 skip_whitespace(&mut chars);
135 match chars.peek() {
136 Some(']') => {
137 chars.next();
138 break;
139 }
140 Some('<') => {
141 chars.next();
142 let hex = read_hex_token(&mut chars);
143 let dst_str = hex_to_unicode_string(&hex);
144 mappings.insert(code, dst_str);
145 code += 1;
146 }
147 None => break,
148 _ => {
149 chars.next();
150 }
151 }
152 }
153 }
154 _ => break,
155 }
156 }
157}
158
/// Collects the hex digits of every `<…>` token in `text`, in order.
///
/// Inside a token only ASCII hex digits are kept; any other character is
/// ignored. A `<` seen while a token is already open restarts it, and a token
/// that is never closed by `>` is discarded.
fn extract_hex_values(text: &str) -> Vec<String> {
    let mut tokens = Vec::new();
    // `Some(buf)` while inside `<…>`, `None` outside.
    let mut open: Option<String> = None;

    for ch in text.chars() {
        if ch == '<' {
            open = Some(String::new());
        } else if ch == '>' {
            if let Some(done) = open.take() {
                tokens.push(done);
            }
        } else if ch.is_ascii_hexdigit() {
            if let Some(buf) = open.as_mut() {
                buf.push(ch);
            }
        }
    }

    tokens
}
186
/// Decodes a string of hex digits as big-endian UTF-16 text.
///
/// Each pair of hex digits is one byte and each pair of bytes is one UTF-16
/// code unit. Surrogate pairs are combined wherever they occur in the
/// sequence, not only when the value is exactly four bytes long. Unpaired
/// surrogates, digit pairs that are not valid hex, a trailing odd digit and a
/// trailing odd byte are all dropped.
///
/// Never panics, including on non-ASCII input (the digits are read bytewise
/// rather than by slicing the `str`).
fn hex_to_unicode_string(hex: &str) -> String {
    let bytes: Vec<u8> = hex
        .as_bytes()
        .chunks_exact(2)
        .filter_map(|pair| {
            let digits = std::str::from_utf8(pair).ok()?;
            u8::from_str_radix(digits, 16).ok()
        })
        .collect();

    let units = bytes
        .chunks_exact(2)
        .map(|unit| u16::from_be_bytes([unit[0], unit[1]]));

    std::char::decode_utf16(units)
        .filter_map(Result::ok)
        .collect()
}
245
/// Advances `chars` past the first occurrence of `target`.
///
/// The matching character itself is consumed. If `target` never occurs, the
/// iterator is drained.
fn skip_until(chars: &mut std::iter::Peekable<std::str::Chars<'_>>, target: char) {
    for ch in chars.by_ref() {
        if ch == target {
            break;
        }
    }
}
255
/// Consumes leading ASCII whitespace from `chars`, stopping in front of the
/// first non-whitespace character (or at the end of input).
fn skip_whitespace(chars: &mut std::iter::Peekable<std::str::Chars<'_>>) {
    while chars.peek().map_or(false, char::is_ascii_whitespace) {
        chars.next();
    }
}
265
/// Reads characters up to and including the next `>` and returns the ASCII
/// hex digits seen before it.
///
/// Non-hex characters before the `>` are consumed but not returned. If no `>`
/// follows, the rest of the input is consumed.
fn read_hex_token(chars: &mut std::iter::Peekable<std::str::Chars<'_>>) -> String {
    chars
        .by_ref()
        // `take_while` also consumes the terminating `>`.
        .take_while(|&ch| ch != '>')
        .filter(char::is_ascii_hexdigit)
        .collect()
}
280
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the value `lookup` returns for a code that maps to `text`.
    fn hit(text: &str) -> Option<String> {
        Some(text.to_string())
    }

    /// Explicit `bfchar` entries resolve; unknown codes do not.
    #[test]
    fn test_parse_bfchar() {
        let data = br#"
/CIDInit /ProcSet findresource begin
1 begincodespacerange
<0000> <FFFF>
endcodespacerange
3 beginbfchar
<0003> <0020>
<0011> <002E>
<0024> <0041>
endbfchar
endcmap
"#;
        let cmap = ToUnicodeCMap::parse(data);
        for &(code, text) in &[(0x0003, " "), (0x0011, "."), (0x0024, "A")] {
            assert_eq!(cmap.lookup(code), hit(text));
        }
        assert_eq!(cmap.lookup(0x9999), None);
    }

    /// A `<start> <end> <dst>` range covers both endpoints and nothing
    /// outside them.
    #[test]
    fn test_parse_bfrange() {
        let data = br#"
1 begincodespacerange
<00> <FF>
endcodespacerange
1 beginbfrange
<41> <5A> <0041>
endbfrange
"#;
        let cmap = ToUnicodeCMap::parse(data);
        for &(code, text) in &[(0x41, "A"), (0x42, "B"), (0x5A, "Z")] {
            assert_eq!(cmap.lookup(code), hit(text));
        }
        assert_eq!(cmap.lookup(0x40), None);
    }

    /// The array form assigns one destination per consecutive code.
    #[test]
    fn test_parse_bfrange_with_array() {
        let data = br#"
1 beginbfrange
<01> <03> [<0041> <0042> <0043>]
endbfrange
"#;
        let cmap = ToUnicodeCMap::parse(data);
        for &(code, text) in &[(0x01, "A"), (0x02, "B"), (0x03, "C")] {
            assert_eq!(cmap.lookup(code), hit(text));
        }
    }

    /// Empty input produces an empty table.
    #[test]
    fn test_empty_cmap() {
        let cmap = ToUnicodeCMap::parse(b"");
        assert!(cmap.is_empty());
    }

    /// Single code units decode to the matching character.
    #[test]
    fn test_hex_to_unicode() {
        for &(hex, text) in &[("0041", "A"), ("0048", "H"), ("AC00", "가")] {
            assert_eq!(hex_to_unicode_string(hex), text);
        }
    }

    /// Two code units decode to a two-character string.
    #[test]
    fn test_multibyte_unicode() {
        assert_eq!(hex_to_unicode_string("00480069"), "Hi");
    }
}