Skip to main content

pdf_ast/parser/
cmap.rs

1use crate::ast::{AstNode, NodeId, NodeType, PdfAstGraph};
2use crate::parser::reference_resolver::ObjectNodeMap;
3use crate::types::{PdfStream, PdfValue};
4use std::collections::HashMap;
5
6#[derive(Debug, Clone)]
7pub struct CMap {
8    pub name: String,
9    pub cid_system_info: CIDSystemInfo,
10    pub wmode: i32,
11    pub code_space_ranges: Vec<CodeSpaceRange>,
12    pub mappings: CMapMappings,
13    pub usecmap: Option<String>,
14}
15
/// The `/Registry`, `/Ordering` and `/Supplement` entries of a CMap's
/// `CIDSystemInfo` dictionary.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CIDSystemInfo {
    /// Contents of the `/Registry` string, without the parentheses.
    pub registry: String,
    /// Contents of the `/Ordering` string, without the parentheses.
    pub ordering: String,
    /// Number following `/Supplement`.
    pub supplement: i32,
}
22
/// One entry of a `begincodespacerange` section: the decoded bytes of the
/// first and second hex strings on the entry line.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CodeSpaceRange {
    /// Decoded bytes of the first hex string.
    pub start: Vec<u8>,
    /// Decoded bytes of the second hex string.
    pub end: Vec<u8>,
}
28
29#[derive(Debug, Clone)]
30pub enum CMapMappings {
31    Char(HashMap<Vec<u8>, Vec<u8>>), // bfchar mappings
32    Range(Vec<CharRangeMapping>),    // bfrange mappings
33    CID(HashMap<Vec<u8>, u32>),      // cidchar mappings
34    CIDRange(Vec<CIDRangeMapping>),  // cidrange mappings
35    Mixed {
36        chars: HashMap<Vec<u8>, Vec<u8>>,
37        ranges: Vec<CharRangeMapping>,
38        cid_chars: HashMap<Vec<u8>, u32>,
39        cid_ranges: Vec<CIDRangeMapping>,
40    },
41}
42
43#[derive(Debug, Clone)]
44pub struct CharRangeMapping {
45    pub start: Vec<u8>,
46    pub end: Vec<u8>,
47    pub dest: RangeDest,
48}
49
/// Destination of a `bfrange` entry.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum RangeDest {
    /// A single hex string: the destination of the first code in the range;
    /// later codes are derived from it by their offset into the range.
    Single(Vec<u8>),
    /// A bracketed array of hex strings, indexed by the code's offset into
    /// the range.
    Array(Vec<Vec<u8>>),
}
55
/// One `cidrange` entry: a range of source codes and the CID given on the
/// entry line.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CIDRangeMapping {
    /// Decoded bytes of the first source code in the range.
    pub start: Vec<u8>,
    /// Decoded bytes of the last source code in the range.
    pub end: Vec<u8>,
    /// The decimal CID that follows the two hex strings.
    pub cid: u32,
}
62
63#[allow(dead_code)]
64pub struct CMapParser<'a> {
65    ast: &'a mut PdfAstGraph,
66    resolver: &'a ObjectNodeMap,
67}
68
69impl<'a> CMapParser<'a> {
70    pub fn new(ast: &'a mut PdfAstGraph, resolver: &'a ObjectNodeMap) -> Self {
71        CMapParser { ast, resolver }
72    }
73
74    pub fn parse_cmap_stream(&mut self, stream: &PdfStream) -> Option<(NodeId, CMap)> {
75        let data = stream.decode().ok()?;
76        let cmap = self.parse_cmap_data(&data)?;
77
78        // Create CMap node
79        let mut node = AstNode::new(
80            self.ast.next_node_id(),
81            NodeType::CMap,
82            PdfValue::Stream(stream.clone()),
83        );
84
85        // Add metadata
86        node.metadata
87            .set_property("cmap_name".to_string(), cmap.name.clone());
88        node.metadata.set_property(
89            "registry".to_string(),
90            cmap.cid_system_info.registry.clone(),
91        );
92        node.metadata.set_property(
93            "ordering".to_string(),
94            cmap.cid_system_info.ordering.clone(),
95        );
96        node.metadata.set_property(
97            "supplement".to_string(),
98            cmap.cid_system_info.supplement.to_string(),
99        );
100        node.metadata
101            .set_property("wmode".to_string(), cmap.wmode.to_string());
102
103        let node_id = self.ast.add_node(node);
104
105        Some((node_id, cmap))
106    }
107
108    pub fn parse_tounicode_stream(&mut self, stream: &PdfStream) -> Option<NodeId> {
109        let data = stream.decode().ok()?;
110        let cmap = self.parse_cmap_data(&data)?;
111
112        // Create ToUnicode node
113        let mut node = AstNode::new(
114            self.ast.next_node_id(),
115            NodeType::ToUnicode,
116            PdfValue::Stream(stream.clone()),
117        );
118
119        // Add metadata
120        node.metadata
121            .set_property("cmap_name".to_string(), cmap.name.clone());
122
123        // Count mappings
124        let mapping_count = match &cmap.mappings {
125            CMapMappings::Char(m) => m.len(),
126            CMapMappings::Range(r) => r.len(),
127            CMapMappings::CID(m) => m.len(),
128            CMapMappings::CIDRange(r) => r.len(),
129            CMapMappings::Mixed {
130                chars,
131                ranges,
132                cid_chars,
133                cid_ranges,
134            } => chars.len() + ranges.len() + cid_chars.len() + cid_ranges.len(),
135        };
136
137        node.metadata
138            .set_property("mapping_count".to_string(), mapping_count.to_string());
139
140        let node_id = self.ast.add_node(node);
141
142        Some(node_id)
143    }
144
145    fn parse_cmap_data(&self, data: &[u8]) -> Option<CMap> {
146        let content = String::from_utf8_lossy(data);
147        let mut cmap = CMap {
148            name: String::new(),
149            cid_system_info: CIDSystemInfo {
150                registry: String::new(),
151                ordering: String::new(),
152                supplement: 0,
153            },
154            wmode: 0,
155            code_space_ranges: Vec::new(),
156            mappings: CMapMappings::Char(HashMap::new()),
157            usecmap: None,
158        };
159
160        let mut chars = HashMap::new();
161        let mut ranges = Vec::new();
162        let mut cid_chars = HashMap::new();
163        let mut cid_ranges = Vec::new();
164
165        let lines: Vec<&str> = content.lines().collect();
166        let mut i = 0;
167
168        while i < lines.len() {
169            let line = lines[i].trim();
170
171            // CMapName
172            if line.starts_with("/CMapName") {
173                if let Some(name) = self.extract_name(line) {
174                    cmap.name = name;
175                }
176            }
177            // CIDSystemInfo
178            else if line.contains("CIDSystemInfo") {
179                i += 1;
180                while i < lines.len() && !lines[i].contains(">>") {
181                    let info_line = lines[i].trim();
182                    if info_line.starts_with("/Registry") {
183                        if let Some(reg) = self.extract_string(info_line) {
184                            cmap.cid_system_info.registry = reg;
185                        }
186                    } else if info_line.starts_with("/Ordering") {
187                        if let Some(ord) = self.extract_string(info_line) {
188                            cmap.cid_system_info.ordering = ord;
189                        }
190                    } else if info_line.starts_with("/Supplement") {
191                        if let Some(sup) = self.extract_number(info_line) {
192                            cmap.cid_system_info.supplement = sup as i32;
193                        }
194                    }
195                    i += 1;
196                }
197            }
198            // WMode
199            else if line.starts_with("/WMode") {
200                if let Some(wmode) = self.extract_number(line) {
201                    cmap.wmode = wmode as i32;
202                }
203            }
204            // UseCMap
205            else if line.starts_with("/UseCMap") {
206                if let Some(usecmap) = self.extract_name(line) {
207                    cmap.usecmap = Some(usecmap);
208                }
209            }
210            // Code space ranges
211            else if line.contains("begincodespacerange") {
212                let count = self.extract_count(line).unwrap_or(0);
213                i += 1;
214                for _ in 0..count {
215                    if i >= lines.len() {
216                        break;
217                    }
218                    let range_line = lines[i].trim();
219                    if let Some((start, end)) = self.parse_hex_range(range_line) {
220                        cmap.code_space_ranges.push(CodeSpaceRange { start, end });
221                    }
222                    i += 1;
223                }
224            }
225            // Character mappings
226            else if line.contains("beginbfchar") {
227                let count = self.extract_count(line).unwrap_or(0);
228                i += 1;
229                for _ in 0..count {
230                    if i >= lines.len() {
231                        break;
232                    }
233                    let char_line = lines[i].trim();
234                    if let Some((src, dst)) = self.parse_char_mapping(char_line) {
235                        chars.insert(src, dst);
236                    }
237                    i += 1;
238                }
239            }
240            // Range mappings
241            else if line.contains("beginbfrange") {
242                let count = self.extract_count(line).unwrap_or(0);
243                i += 1;
244                for _ in 0..count {
245                    if i >= lines.len() {
246                        break;
247                    }
248                    let range_line = lines[i].trim();
249                    if let Some(mapping) = self.parse_range_mapping(range_line) {
250                        ranges.push(mapping);
251                    }
252                    i += 1;
253                }
254            }
255            // CID character mappings
256            else if line.contains("begincidchar") {
257                let count = self.extract_count(line).unwrap_or(0);
258                i += 1;
259                for _ in 0..count {
260                    if i >= lines.len() {
261                        break;
262                    }
263                    let cid_line = lines[i].trim();
264                    if let Some((src, cid)) = self.parse_cid_char(cid_line) {
265                        cid_chars.insert(src, cid);
266                    }
267                    i += 1;
268                }
269            }
270            // CID range mappings
271            else if line.contains("begincidrange") {
272                let count = self.extract_count(line).unwrap_or(0);
273                i += 1;
274                for _ in 0..count {
275                    if i >= lines.len() {
276                        break;
277                    }
278                    let cid_range_line = lines[i].trim();
279                    if let Some(mapping) = self.parse_cid_range(cid_range_line) {
280                        cid_ranges.push(mapping);
281                    }
282                    i += 1;
283                }
284            }
285
286            i += 1;
287        }
288
289        // Determine mapping type
290        cmap.mappings = if !chars.is_empty()
291            && ranges.is_empty()
292            && cid_chars.is_empty()
293            && cid_ranges.is_empty()
294        {
295            CMapMappings::Char(chars)
296        } else if chars.is_empty()
297            && !ranges.is_empty()
298            && cid_chars.is_empty()
299            && cid_ranges.is_empty()
300        {
301            CMapMappings::Range(ranges)
302        } else if chars.is_empty()
303            && ranges.is_empty()
304            && !cid_chars.is_empty()
305            && cid_ranges.is_empty()
306        {
307            CMapMappings::CID(cid_chars)
308        } else if chars.is_empty()
309            && ranges.is_empty()
310            && cid_chars.is_empty()
311            && !cid_ranges.is_empty()
312        {
313            CMapMappings::CIDRange(cid_ranges)
314        } else {
315            CMapMappings::Mixed {
316                chars,
317                ranges,
318                cid_chars,
319                cid_ranges,
320            }
321        };
322
323        Some(cmap)
324    }
325
326    fn extract_name(&self, line: &str) -> Option<String> {
327        let parts: Vec<&str> = line.split_whitespace().collect();
328        parts.get(1).map(|s| s.trim_start_matches('/').to_string())
329    }
330
331    fn extract_string(&self, line: &str) -> Option<String> {
332        if let Some(start) = line.find('(') {
333            if let Some(end) = line.rfind(')') {
334                return Some(line[start + 1..end].to_string());
335            }
336        }
337        None
338    }
339
340    fn extract_number(&self, line: &str) -> Option<i64> {
341        let parts: Vec<&str> = line.split_whitespace().collect();
342        parts.get(1).and_then(|s| s.parse().ok())
343    }
344
345    fn extract_count(&self, line: &str) -> Option<usize> {
346        let parts: Vec<&str> = line.split_whitespace().collect();
347        parts.first().and_then(|s| s.parse().ok())
348    }
349
350    fn parse_hex_range(&self, line: &str) -> Option<(Vec<u8>, Vec<u8>)> {
351        let parts: Vec<&str> = line.split_whitespace().collect();
352        if parts.len() >= 2 {
353            let start = self.hex_to_bytes(parts[0])?;
354            let end = self.hex_to_bytes(parts[1])?;
355            return Some((start, end));
356        }
357        None
358    }
359
360    fn parse_char_mapping(&self, line: &str) -> Option<(Vec<u8>, Vec<u8>)> {
361        let parts: Vec<&str> = line.split_whitespace().collect();
362        if parts.len() >= 2 {
363            let src = self.hex_to_bytes(parts[0])?;
364            let dst = self.hex_to_bytes(parts[1])?;
365            return Some((src, dst));
366        }
367        None
368    }
369
370    fn parse_range_mapping(&self, line: &str) -> Option<CharRangeMapping> {
371        let parts: Vec<&str> = line.split_whitespace().collect();
372        if parts.len() >= 3 {
373            let start = self.hex_to_bytes(parts[0])?;
374            let end = self.hex_to_bytes(parts[1])?;
375
376            // Check if destination is array
377            if parts[2].starts_with('[') {
378                // Parse array of destinations
379                let mut array_dests = Vec::new();
380                let array_str = parts[2..].join(" ");
381                let array_content = array_str.trim_start_matches('[').trim_end_matches(']');
382
383                for hex in array_content.split_whitespace() {
384                    if let Some(bytes) = self.hex_to_bytes(hex) {
385                        array_dests.push(bytes);
386                    }
387                }
388
389                return Some(CharRangeMapping {
390                    start,
391                    end,
392                    dest: RangeDest::Array(array_dests),
393                });
394            } else {
395                // Single destination
396                let dest = self.hex_to_bytes(parts[2])?;
397                return Some(CharRangeMapping {
398                    start,
399                    end,
400                    dest: RangeDest::Single(dest),
401                });
402            }
403        }
404        None
405    }
406
407    fn parse_cid_char(&self, line: &str) -> Option<(Vec<u8>, u32)> {
408        let parts: Vec<&str> = line.split_whitespace().collect();
409        if parts.len() >= 2 {
410            let src = self.hex_to_bytes(parts[0])?;
411            let cid = parts[1].parse().ok()?;
412            return Some((src, cid));
413        }
414        None
415    }
416
417    fn parse_cid_range(&self, line: &str) -> Option<CIDRangeMapping> {
418        let parts: Vec<&str> = line.split_whitespace().collect();
419        if parts.len() >= 3 {
420            let start = self.hex_to_bytes(parts[0])?;
421            let end = self.hex_to_bytes(parts[1])?;
422            let cid = parts[2].parse().ok()?;
423            return Some(CIDRangeMapping { start, end, cid });
424        }
425        None
426    }
427
428    fn hex_to_bytes(&self, hex: &str) -> Option<Vec<u8>> {
429        let hex = hex.trim_start_matches('<').trim_end_matches('>');
430        if hex.len() % 2 != 0 {
431            return None;
432        }
433
434        let mut bytes = Vec::new();
435        for i in (0..hex.len()).step_by(2) {
436            let byte_str = &hex[i..i + 2];
437            if let Ok(byte) = u8::from_str_radix(byte_str, 16) {
438                bytes.push(byte);
439            } else {
440                return None;
441            }
442        }
443
444        Some(bytes)
445    }
446
447    pub fn map_code_to_unicode(&self, cmap: &CMap, code: &[u8]) -> Option<String> {
448        match &cmap.mappings {
449            CMapMappings::Char(chars) => chars
450                .get(code)
451                .and_then(|bytes| self.bytes_to_unicode(bytes)),
452            CMapMappings::Range(ranges) => {
453                for range in ranges {
454                    if self.in_range(code, &range.start, &range.end) {
455                        return self.map_range_to_unicode(code, &range.start, &range.dest);
456                    }
457                }
458                None
459            }
460            CMapMappings::Mixed { chars, ranges, .. } => {
461                // Try direct mapping first
462                if let Some(unicode) = chars
463                    .get(code)
464                    .and_then(|bytes| self.bytes_to_unicode(bytes))
465                {
466                    return Some(unicode);
467                }
468
469                // Try range mappings
470                for range in ranges {
471                    if self.in_range(code, &range.start, &range.end) {
472                        return self.map_range_to_unicode(code, &range.start, &range.dest);
473                    }
474                }
475
476                None
477            }
478            _ => None,
479        }
480    }
481
482    fn in_range(&self, code: &[u8], start: &[u8], end: &[u8]) -> bool {
483        if code.len() != start.len() || code.len() != end.len() {
484            return false;
485        }
486
487        code >= start && code <= end
488    }
489
490    fn map_range_to_unicode(&self, code: &[u8], start: &[u8], dest: &RangeDest) -> Option<String> {
491        match dest {
492            RangeDest::Single(base) => {
493                // Calculate offset
494                let offset = self.bytes_to_u32(code)? - self.bytes_to_u32(start)?;
495                let unicode_value = self.bytes_to_u32(base)? + offset;
496
497                // Convert to Unicode character
498                char::from_u32(unicode_value).map(|c| c.to_string())
499            }
500            RangeDest::Array(array) => {
501                // Calculate index
502                let index = (self.bytes_to_u32(code)? - self.bytes_to_u32(start)?) as usize;
503                array
504                    .get(index)
505                    .and_then(|bytes| self.bytes_to_unicode(bytes))
506            }
507        }
508    }
509
510    fn bytes_to_unicode(&self, bytes: &[u8]) -> Option<String> {
511        // Interpret bytes as UTF-16BE Unicode value
512        if bytes.len() == 2 {
513            let value = ((bytes[0] as u32) << 8) | (bytes[1] as u32);
514            char::from_u32(value).map(|c| c.to_string())
515        } else if bytes.len() == 4 {
516            // Surrogate pair or direct UTF-32
517            let value = ((bytes[0] as u32) << 24)
518                | ((bytes[1] as u32) << 16)
519                | ((bytes[2] as u32) << 8)
520                | (bytes[3] as u32);
521            char::from_u32(value).map(|c| c.to_string())
522        } else {
523            None
524        }
525    }
526
527    fn bytes_to_u32(&self, bytes: &[u8]) -> Option<u32> {
528        if bytes.is_empty() || bytes.len() > 4 {
529            return None;
530        }
531
532        let mut value = 0u32;
533        for byte in bytes {
534            value = (value << 8) | (*byte as u32);
535        }
536
537        Some(value)
538    }
539}