1use crate::parser::{ParseError, ParseResult};
9use std::collections::HashMap;
10
/// The kind of translation a `CMap` performs.
#[derive(Clone, Debug, PartialEq)]
pub enum CMapType {
    /// Not constructed anywhere in this file; presumably a map from character
    /// codes to CIDs (confirm against callers).
    CIDMap,
    /// A map whose destinations are text; this is the type `CMap::new` assigns
    /// and the only one for which `CMap::to_unicode` produces a string.
    ToUnicode,
    /// A CMap known by name only, such as `Identity-H` or `Identity-V`.
    Predefined(String),
}
21
/// An inclusive codespace range: the set of character codes whose bytes each
/// fall between the corresponding bytes of `start` and `end`.
#[derive(Debug, Clone)]
pub struct CodeRange {
    /// Lower bound, one entry per byte of the code.
    pub start: Vec<u8>,
    /// Upper bound, one entry per byte of the code.
    pub end: Vec<u8>,
}

impl CodeRange {
    /// Returns `true` when `code` belongs to this range.
    ///
    /// The code must have exactly as many bytes as both bounds, and every
    /// byte must lie within the bounds at the same position. The comparison
    /// is deliberately per byte rather than lexicographic: a codespace range
    /// such as `<8140> <9FFC>` describes a rectangle, so `<8210>` is outside
    /// it even though it sorts between the two bounds.
    pub fn contains(&self, code: &[u8]) -> bool {
        code.len() == self.start.len()
            && code.len() == self.end.len()
            && code
                .iter()
                .zip(self.start.iter().zip(self.end.iter()))
                .all(|(&byte, (&low, &high))| low <= byte && byte <= high)
    }
}
41
/// One mapping rule stored in a `CMap`.
#[derive(Clone, Debug)]
pub enum CMapEntry {
    /// Maps exactly one source code to one destination value.
    Single {
        /// Source character code.
        src: Vec<u8>,
        /// Destination bytes the source code maps to.
        dst: Vec<u8>,
    },
    /// Maps a contiguous run of source codes onto consecutive destinations.
    Range {
        /// First source code covered by the range (inclusive).
        src_start: Vec<u8>,
        /// Last source code covered by the range (inclusive).
        src_end: Vec<u8>,
        /// Destination assigned to `src_start`; later codes are offset from it.
        dst_start: Vec<u8>,
    },
}
62
63#[derive(Debug, Clone)]
65pub struct CMap {
66 pub name: Option<String>,
68 pub cmap_type: CMapType,
70 pub wmode: u8,
72 pub codespace_ranges: Vec<CodeRange>,
74 pub mappings: Vec<CMapEntry>,
76 single_mappings: HashMap<Vec<u8>, Vec<u8>>,
78}
79
80impl Default for CMap {
81 fn default() -> Self {
82 Self::new()
83 }
84}
85
86impl CMap {
87 pub fn new() -> Self {
89 Self {
90 name: None,
91 cmap_type: CMapType::ToUnicode,
92 wmode: 0,
93 codespace_ranges: Vec::new(),
94 mappings: Vec::new(),
95 single_mappings: HashMap::new(),
96 }
97 }
98
99 pub fn identity_h() -> Self {
101 Self {
102 name: Some("Identity-H".to_string()),
103 cmap_type: CMapType::Predefined("Identity-H".to_string()),
104 wmode: 0,
105 codespace_ranges: vec![CodeRange {
106 start: vec![0x00, 0x00],
107 end: vec![0xFF, 0xFF],
108 }],
109 mappings: Vec::new(),
110 single_mappings: HashMap::new(),
111 }
112 }
113
114 pub fn identity_v() -> Self {
116 Self {
117 name: Some("Identity-V".to_string()),
118 cmap_type: CMapType::Predefined("Identity-V".to_string()),
119 wmode: 1,
120 codespace_ranges: vec![CodeRange {
121 start: vec![0x00, 0x00],
122 end: vec![0xFF, 0xFF],
123 }],
124 mappings: Vec::new(),
125 single_mappings: HashMap::new(),
126 }
127 }
128
129 pub fn parse(data: &[u8]) -> ParseResult<Self> {
131 let mut cmap = Self::new();
132 let content =
133 std::str::from_utf8(data).map_err(|e| ParseError::CharacterEncodingError {
134 position: 0,
135 message: format!("Invalid UTF-8 in CMap: {e}"),
136 })?;
137
138 let lines = content.lines();
139 let mut in_codespace_range = false;
140 let mut in_bf_char = false;
141 let mut in_bf_range = false;
142
143 for line in lines {
144 let line = line.trim();
145
146 if line.starts_with('%') {
148 continue;
149 }
150
151 if line.starts_with("/CMapName") {
153 if let Some(name) = extract_name(line) {
154 cmap.name = Some(name);
155 }
156 }
157 else if line.starts_with("/WMode") {
159 if let Some(wmode) = extract_number(line) {
160 cmap.wmode = wmode as u8;
161 }
162 }
163 else if line.contains("begincodespacerange") {
165 in_codespace_range = true;
166 } else if line == "endcodespacerange" {
167 in_codespace_range = false;
168 } else if in_codespace_range {
169 if let Some((start, end)) = parse_hex_range(line) {
170 cmap.codespace_ranges.push(CodeRange { start, end });
171 }
172 }
173 else if line.contains("beginbfchar") {
175 in_bf_char = true;
176 } else if line == "endbfchar" {
177 in_bf_char = false;
178 } else if in_bf_char {
179 if let Some((src, dst)) = parse_bf_char(line) {
180 cmap.single_mappings.insert(src.clone(), dst.clone());
181 cmap.mappings.push(CMapEntry::Single { src, dst });
182 }
183 }
184 else if line.contains("beginbfrange") {
186 in_bf_range = true;
187 } else if line == "endbfrange" {
188 in_bf_range = false;
189 } else if in_bf_range {
190 if let Some(entries) = parse_bf_range_entries(line) {
192 for entry in entries {
193 if let CMapEntry::Single { ref src, ref dst } = entry {
194 cmap.single_mappings.insert(src.clone(), dst.clone());
195 }
196 cmap.mappings.push(entry);
197 }
198 }
199 }
200 }
201
202 Ok(cmap)
203 }
204
205 pub fn map(&self, code: &[u8]) -> Option<Vec<u8>> {
207 if !self.is_valid_code(code) {
209 return None;
210 }
211
212 if let CMapType::Predefined(name) = &self.cmap_type {
214 if name.starts_with("Identity") {
215 return Some(code.to_vec());
216 }
217 }
218
219 if let Some(dst) = self.single_mappings.get(code) {
221 return Some(dst.clone());
222 }
223
224 for mapping in &self.mappings {
226 if let CMapEntry::Range {
227 src_start,
228 src_end,
229 dst_start,
230 } = mapping
231 {
232 if code.len() == src_start.len() && code >= &src_start[..] && code <= &src_end[..] {
233 let offset = calculate_offset(code, src_start);
235 let mut result = dst_start.clone();
236
237 if let Some(last) = result.last_mut() {
239 *last = last.wrapping_add(offset as u8);
240 }
241
242 return Some(result);
243 }
244 }
245 }
246
247 None
248 }
249
250 pub fn is_valid_code(&self, code: &[u8]) -> bool {
252 for range in &self.codespace_ranges {
253 if range.contains(code) {
254 return true;
255 }
256 }
257 false
258 }
259
260 pub fn to_unicode(&self, mapped: &[u8]) -> Option<String> {
262 match self.cmap_type {
263 CMapType::ToUnicode => {
264 if mapped.len() % 2 == 0 {
266 let utf16_values: Vec<u16> = mapped
267 .chunks(2)
268 .map(|chunk| u16::from_be_bytes([chunk[0], chunk[1]]))
269 .collect();
270 String::from_utf16(&utf16_values).ok()
271 } else {
272 String::from_utf8(mapped.to_vec()).ok()
274 }
275 }
276 _ => None,
277 }
278 }
279}
280
/// Returns the second whitespace-separated token of `line` without its
/// leading `/`, or `None` when that token is missing or is not a name.
fn extract_name(line: &str) -> Option<String> {
    line.split_whitespace()
        .nth(1)
        .and_then(|token| token.strip_prefix('/'))
        .map(str::to_string)
}
290
/// Parses the second whitespace-separated token of `line` as an `i32`.
///
/// Returns `None` when the token is missing or is not a valid integer.
fn extract_number(line: &str) -> Option<i32> {
    let token = line.split_whitespace().nth(1)?;
    token.parse().ok()
}
300
/// Decodes a hexadecimal string such as `<0041>` into its bytes.
///
/// Any leading `<` and trailing `>` characters are removed first. Returns
/// `None` when the remaining text has an odd length or contains anything
/// other than hexadecimal digits; an empty string decodes to an empty vector.
fn parse_hex(s: &str) -> Option<Vec<u8>> {
    // Work on bytes: slicing the `&str` two bytes at a time would panic when
    // a multi-byte character straddles a pair boundary.
    let digits = s.trim_start_matches('<').trim_end_matches('>').as_bytes();
    if digits.len() % 2 != 0 {
        return None;
    }

    digits
        .chunks_exact(2)
        .map(|pair| {
            // `to_digit` accepts hexadecimal digits only, so signs such as
            // `+` are rejected rather than parsed.
            let high = char::from(pair[0]).to_digit(16)?;
            let low = char::from(pair[1]).to_digit(16)?;
            // Both nibbles are below 16, so the value always fits in a byte.
            Some(((high << 4) | low) as u8)
        })
        .collect()
}
318
319fn parse_hex_range(line: &str) -> Option<(Vec<u8>, Vec<u8>)> {
321 let parts: Vec<&str> = line.split_whitespace().collect();
322 if parts.len() >= 2 {
323 if let (Some(start), Some(end)) = (parse_hex(parts[0]), parse_hex(parts[1])) {
324 return Some((start, end));
325 }
326 }
327 None
328}
329
330fn parse_bf_char(line: &str) -> Option<(Vec<u8>, Vec<u8>)> {
332 parse_hex_range(line)
333}
334
335fn parse_bf_range_entries(line: &str) -> Option<Vec<CMapEntry>> {
337 if line.contains('[') {
339 if let Some(array_start) = line.find('[') {
341 let before_array = &line[..array_start];
342 let parts: Vec<&str> = before_array.split_whitespace().collect();
343
344 if parts.len() >= 2 {
345 if let (Some(src_start), Some(src_end)) = (parse_hex(parts[0]), parse_hex(parts[1]))
346 {
347 let after_bracket = &line[array_start + 1..];
349 if let Some(array_end) = after_bracket.find(']') {
350 let array_content = &after_bracket[..array_end];
351
352 let hex_values: Vec<Vec<u8>> = array_content
354 .split_whitespace()
355 .filter_map(parse_hex)
356 .collect();
357
358 let mut entries = Vec::new();
360 let mut current_src = src_start;
361
362 for dst in hex_values {
363 entries.push(CMapEntry::Single {
364 src: current_src.clone(),
365 dst,
366 });
367
368 if let Some(last) = current_src.last_mut() {
370 *last = last.wrapping_add(1);
371 }
372
373 if current_src > src_end {
375 break;
376 }
377 }
378
379 return Some(entries);
380 }
381 }
382 }
383 }
384 return None;
385 }
386
387 let parts: Vec<&str> = line.split_whitespace().collect();
389 if parts.len() >= 3 {
390 if let (Some(start), Some(end), Some(dst)) = (
391 parse_hex(parts[0]),
392 parse_hex(parts[1]),
393 parse_hex(parts[2]),
394 ) {
395 return Some(vec![CMapEntry::Range {
396 src_start: start,
397 src_end: end,
398 dst_start: dst,
399 }]);
400 }
401 }
402 None
403}
404
/// Returns how far `code` lies above `start`, reading both as big-endian
/// unsigned integers.
///
/// The result is `0` when `code` is not greater than `start`. Only the low
/// `usize` worth of bytes contributes for inputs longer than a `usize`.
fn calculate_offset(code: &[u8], start: &[u8]) -> usize {
    /// Folds `bytes` into an integer, most significant byte first.
    fn big_endian_value(bytes: &[u8]) -> usize {
        bytes
            .iter()
            .fold(0, |value, &byte| (value << 8) | usize::from(byte))
    }

    // Subtracting whole values (rather than byte by byte) handles borrows,
    // e.g. `0100 - 0080`, without underflowing.
    big_endian_value(code).saturating_sub(big_endian_value(start))
}
414
415#[derive(Debug, Clone)]
417pub struct ToUnicodeCMapBuilder {
418 mappings: HashMap<Vec<u8>, String>,
420 code_length: usize,
422}
423
424impl ToUnicodeCMapBuilder {
425 pub fn new(code_length: usize) -> Self {
427 Self {
428 mappings: HashMap::new(),
429 code_length,
430 }
431 }
432
433 pub fn add_mapping(&mut self, char_code: Vec<u8>, unicode: &str) {
435 self.mappings.insert(char_code, unicode.to_string());
436 }
437
438 pub fn add_single_byte_mapping(&mut self, char_code: u8, unicode: char) {
440 let code = if self.code_length == 1 {
441 vec![char_code]
442 } else {
443 let mut code = vec![0; self.code_length - 1];
445 code.push(char_code);
446 code
447 };
448 self.mappings.insert(code, unicode.to_string());
449 }
450
451 pub fn build(&self) -> Vec<u8> {
453 let mut content = String::new();
454
455 content.push_str("/CIDInit /ProcSet findresource begin\n");
457 content.push_str("12 dict begin\n");
458 content.push_str("begincmap\n");
459 content.push_str("/CIDSystemInfo\n");
460 content.push_str("<< /Registry (Adobe)\n");
461 content.push_str(" /Ordering (UCS)\n");
462 content.push_str(" /Supplement 0\n");
463 content.push_str(">> def\n");
464 content.push_str("/CMapName /Adobe-Identity-UCS def\n");
465 content.push_str("/CMapType 2 def\n");
466
467 content.push_str("1 begincodespacerange\n");
469 if self.code_length == 1 {
470 content.push_str("<00> <FF>\n");
471 } else {
472 let start = vec![0x00; self.code_length];
473 let end = vec![0xFF; self.code_length];
474 content.push_str(&format!(
475 "<{}> <{}>\n",
476 hex_string(&start),
477 hex_string(&end)
478 ));
479 }
480 content.push_str("endcodespacerange\n");
481
482 if !self.mappings.is_empty() {
484 let mut sorted_mappings: Vec<_> = self.mappings.iter().collect();
486 sorted_mappings.sort_by_key(|(k, _)| *k);
487
488 let mut single_mappings = Vec::new();
490 for (code, unicode) in &sorted_mappings {
491 let utf16_bytes = string_to_utf16_be_bytes(unicode);
492 single_mappings.push((code, utf16_bytes));
493 }
494
495 for chunk in single_mappings.chunks(100) {
497 content.push_str(&format!("{} beginbfchar\n", chunk.len()));
498 for (code, unicode_bytes) in chunk {
499 content.push_str(&format!(
500 "<{}> <{}>\n",
501 hex_string(code),
502 hex_string(unicode_bytes)
503 ));
504 }
505 content.push_str("endbfchar\n");
506 }
507 }
508
509 content.push_str("endcmap\n");
511 content.push_str("CMapName currentdict /CMap defineresource pop\n");
512 content.push_str("end\n");
513 content.push_str("end\n");
514
515 content.into_bytes()
516 }
517}
518
/// Encodes `s` as UTF-16 and returns the code units as big-endian bytes,
/// without a byte-order mark.
pub fn string_to_utf16_be_bytes(s: &str) -> Vec<u8> {
    s.encode_utf16()
        .flat_map(|unit| unit.to_be_bytes())
        .collect()
}
527
/// Formats `bytes` as uppercase hexadecimal, two digits per byte and no
/// separators.
pub fn hex_string(bytes: &[u8]) -> String {
    const DIGITS: &[u8; 16] = b"0123456789ABCDEF";

    let mut text = String::with_capacity(bytes.len() * 2);
    for &byte in bytes {
        text.push(char::from(DIGITS[usize::from(byte >> 4)]));
        text.push(char::from(DIGITS[usize::from(byte & 0x0F)]));
    }
    text
}
532
#[cfg(test)]
mod tests {
    use super::*;

    /// A one-byte range accepts one-byte codes and rejects longer ones.
    #[test]
    fn test_code_range() {
        let whole_byte = CodeRange {
            start: vec![0x00],
            end: vec![0xFF],
        };

        assert!(whole_byte.contains(&[0x00]));
        assert!(whole_byte.contains(&[0x80]));
        assert!(whole_byte.contains(&[0xFF]));
        assert!(!whole_byte.contains(&[0x00, 0x00]));
    }

    /// `Identity-H` carries its name and writing mode and maps codes to
    /// themselves.
    #[test]
    fn test_identity_cmap() {
        let identity = CMap::identity_h();
        assert_eq!(identity.name, Some("Identity-H".to_string()));
        assert_eq!(identity.wmode, 0);

        let input = vec![0x00, 0x41];
        assert_eq!(identity.map(&input), Some(input.clone()));
    }

    /// Bracketed hex strings decode to bytes; non-hex text is rejected.
    #[test]
    fn test_parse_hex() {
        assert_eq!(parse_hex("<00>"), Some(vec![0x00]));
        assert_eq!(parse_hex("<FF>"), Some(vec![0xFF]));
        assert_eq!(parse_hex("<0041>"), Some(vec![0x00, 0x41]));
        assert_eq!(parse_hex("<FEFF>"), Some(vec![0xFE, 0xFF]));
        assert_eq!(parse_hex("invalid"), None);
    }

    /// Offsets are the big-endian distance between code and range start.
    #[test]
    fn test_calculate_offset() {
        assert_eq!(calculate_offset(&[0x00, 0x05], &[0x00, 0x00]), 5);
        assert_eq!(calculate_offset(&[0x01, 0x00], &[0x00, 0x00]), 256);
        assert_eq!(calculate_offset(&[0xFF], &[0x00]), 255);
    }

    /// A one-byte builder emits the expected header, codespace and section.
    #[test]
    fn test_tounicode_builder() {
        let mut builder = ToUnicodeCMapBuilder::new(1);
        builder.add_single_byte_mapping(0x41, 'A');
        builder.add_single_byte_mapping(0x42, 'B');

        let output = String::from_utf8(builder.build()).unwrap();

        assert!(output.contains("/CMapName /Adobe-Identity-UCS def"));
        assert!(output.contains("begincodespacerange"));
        assert!(output.contains("<00> <FF>"));
        assert!(output.contains("beginbfchar"));
    }

    /// A small hand-written CMap parses into name, codespace and mappings.
    #[test]
    fn test_simple_cmap_parsing() {
        let cmap_data = br#"
%!PS-Adobe-3.0 Resource-CMap
%%DocumentNeededResources: ProcSet (CIDInit)
%%IncludeResource: ProcSet (CIDInit)
%%BeginResource: CMap (Custom)
%%Title: (Custom Adobe UCS 0)
%%Version: 1.000
%%EndComments

/CIDInit /ProcSet findresource begin
12 dict begin
begincmap
/CIDSystemInfo
<< /Registry (Adobe)
 /Ordering (UCS)
 /Supplement 0
>> def
/CMapName /Custom def
/CMapType 2 def
1 begincodespacerange
<00> <FF>
endcodespacerange
2 beginbfchar
<20> <0020>
<41> <0041>
endbfchar
endcmap
"#;

        let parsed = CMap::parse(cmap_data).unwrap();
        assert_eq!(parsed.name, Some("Custom".to_string()));
        assert_eq!(parsed.codespace_ranges.len(), 1);
        assert_eq!(parsed.map(&[0x20]), Some(vec![0x00, 0x20]));
        assert_eq!(parsed.map(&[0x41]), Some(vec![0x00, 0x41]));
    }

    /// Two-byte destinations decode as big-endian UTF-16.
    #[test]
    fn test_cmap_to_unicode() {
        let mut cmap = CMap::new();
        cmap.cmap_type = CMapType::ToUnicode;

        let latin_a = vec![0x00, 0x41];
        assert_eq!(cmap.to_unicode(&latin_a), Some("A".to_string()));

        let cjk = vec![0x4E, 0x2D];
        assert_eq!(cmap.to_unicode(&cjk), Some("中".to_string()));
    }

    /// A range entry maps every covered code and nothing past its end.
    #[test]
    fn test_bf_range_mapping() {
        let mut cmap = CMap::new();
        cmap.codespace_ranges.push(CodeRange {
            start: vec![0x00],
            end: vec![0xFF],
        });
        cmap.mappings.push(CMapEntry::Range {
            src_start: vec![0x20],
            src_end: vec![0x7E],
            dst_start: vec![0x00, 0x20],
        });

        assert_eq!(cmap.map(&[0x20]), Some(vec![0x00, 0x20]));
        assert_eq!(cmap.map(&[0x41]), Some(vec![0x00, 0x41]));
        assert_eq!(cmap.map(&[0x7E]), Some(vec![0x00, 0x7E]));
        assert_eq!(cmap.map(&[0x7F]), None);
    }

    /// A two-byte builder emits a two-byte codespace and two-byte codes.
    #[test]
    fn test_multibyte_mapping() {
        let mut builder = ToUnicodeCMapBuilder::new(2);
        builder.add_mapping(vec![0x00, 0x41], "A");
        builder.add_mapping(vec![0x00, 0x42], "B");

        let output = String::from_utf8(builder.build()).unwrap();

        assert!(output.contains("<0000> <FFFF>"));
        assert!(output.contains("<0041>"));
        assert!(output.contains("<0042>"));
    }
}