1use crate::ast::{AstNode, NodeId, NodeType, PdfAstGraph};
2use crate::parser::reference_resolver::ObjectNodeMap;
3use crate::types::{PdfStream, PdfValue};
4use std::collections::HashMap;
5
6#[derive(Debug, Clone)]
7pub struct CMap {
8 pub name: String,
9 pub cid_system_info: CIDSystemInfo,
10 pub wmode: i32,
11 pub code_space_ranges: Vec<CodeSpaceRange>,
12 pub mappings: CMapMappings,
13 pub usecmap: Option<String>,
14}
15
/// Character-collection identification read from a CMap's `CIDSystemInfo` dictionary.
#[derive(Debug, Clone)]
pub struct CIDSystemInfo {
    /// Contents of the `/Registry` string.
    pub registry: String,
    /// Contents of the `/Ordering` string.
    pub ordering: String,
    /// Number following `/Supplement`.
    pub supplement: i32,
}
22
/// One `<start> <end>` pair taken from a `begincodespacerange` section.
#[derive(Debug, Clone)]
pub struct CodeSpaceRange {
    /// Decoded bytes of the first hex string on the line.
    pub start: Vec<u8>,
    /// Decoded bytes of the second hex string on the line.
    pub end: Vec<u8>,
}
28
29#[derive(Debug, Clone)]
30pub enum CMapMappings {
31 Char(HashMap<Vec<u8>, Vec<u8>>), Range(Vec<CharRangeMapping>), CID(HashMap<Vec<u8>, u32>), CIDRange(Vec<CIDRangeMapping>), Mixed {
36 chars: HashMap<Vec<u8>, Vec<u8>>,
37 ranges: Vec<CharRangeMapping>,
38 cid_chars: HashMap<Vec<u8>, u32>,
39 cid_ranges: Vec<CIDRangeMapping>,
40 },
41}
42
43#[derive(Debug, Clone)]
44pub struct CharRangeMapping {
45 pub start: Vec<u8>,
46 pub end: Vec<u8>,
47 pub dest: RangeDest,
48}
49
/// Destination side of a `beginbfrange` entry.
#[derive(Debug, Clone)]
pub enum RangeDest {
    /// A single base value; a code maps to the base plus its offset from the range start.
    Single(Vec<u8>),
    /// One explicit destination per code, indexed by the code's offset from the range start.
    Array(Vec<Vec<u8>>),
}
55
/// One entry of a `begincidrange` section: an inclusive source-code range and a CID.
#[derive(Debug, Clone)]
pub struct CIDRangeMapping {
    /// First source code of the range.
    pub start: Vec<u8>,
    /// Last source code of the range (inclusive).
    pub end: Vec<u8>,
    /// Decimal CID given as the third token of the entry.
    pub cid: u32,
}
62
63#[allow(dead_code)]
64pub struct CMapParser<'a> {
65 ast: &'a mut PdfAstGraph,
66 resolver: &'a ObjectNodeMap,
67}
68
69impl<'a> CMapParser<'a> {
70 pub fn new(ast: &'a mut PdfAstGraph, resolver: &'a ObjectNodeMap) -> Self {
71 CMapParser { ast, resolver }
72 }
73
74 pub fn parse_cmap_stream(&mut self, stream: &PdfStream) -> Option<(NodeId, CMap)> {
75 let data = stream.decode().ok()?;
76 let cmap = self.parse_cmap_data(&data)?;
77
78 let mut node = AstNode::new(
80 self.ast.next_node_id(),
81 NodeType::CMap,
82 PdfValue::Stream(stream.clone()),
83 );
84
85 node.metadata
87 .set_property("cmap_name".to_string(), cmap.name.clone());
88 node.metadata.set_property(
89 "registry".to_string(),
90 cmap.cid_system_info.registry.clone(),
91 );
92 node.metadata.set_property(
93 "ordering".to_string(),
94 cmap.cid_system_info.ordering.clone(),
95 );
96 node.metadata.set_property(
97 "supplement".to_string(),
98 cmap.cid_system_info.supplement.to_string(),
99 );
100 node.metadata
101 .set_property("wmode".to_string(), cmap.wmode.to_string());
102
103 let node_id = self.ast.add_node(node);
104
105 Some((node_id, cmap))
106 }
107
108 pub fn parse_tounicode_stream(&mut self, stream: &PdfStream) -> Option<NodeId> {
109 let data = stream.decode().ok()?;
110 let cmap = self.parse_cmap_data(&data)?;
111
112 let mut node = AstNode::new(
114 self.ast.next_node_id(),
115 NodeType::ToUnicode,
116 PdfValue::Stream(stream.clone()),
117 );
118
119 node.metadata
121 .set_property("cmap_name".to_string(), cmap.name.clone());
122
123 let mapping_count = match &cmap.mappings {
125 CMapMappings::Char(m) => m.len(),
126 CMapMappings::Range(r) => r.len(),
127 CMapMappings::CID(m) => m.len(),
128 CMapMappings::CIDRange(r) => r.len(),
129 CMapMappings::Mixed {
130 chars,
131 ranges,
132 cid_chars,
133 cid_ranges,
134 } => chars.len() + ranges.len() + cid_chars.len() + cid_ranges.len(),
135 };
136
137 node.metadata
138 .set_property("mapping_count".to_string(), mapping_count.to_string());
139
140 let node_id = self.ast.add_node(node);
141
142 Some(node_id)
143 }
144
145 fn parse_cmap_data(&self, data: &[u8]) -> Option<CMap> {
146 let content = String::from_utf8_lossy(data);
147 let mut cmap = CMap {
148 name: String::new(),
149 cid_system_info: CIDSystemInfo {
150 registry: String::new(),
151 ordering: String::new(),
152 supplement: 0,
153 },
154 wmode: 0,
155 code_space_ranges: Vec::new(),
156 mappings: CMapMappings::Char(HashMap::new()),
157 usecmap: None,
158 };
159
160 let mut chars = HashMap::new();
161 let mut ranges = Vec::new();
162 let mut cid_chars = HashMap::new();
163 let mut cid_ranges = Vec::new();
164
165 let lines: Vec<&str> = content.lines().collect();
166 let mut i = 0;
167
168 while i < lines.len() {
169 let line = lines[i].trim();
170
171 if line.starts_with("/CMapName") {
173 if let Some(name) = self.extract_name(line) {
174 cmap.name = name;
175 }
176 }
177 else if line.contains("CIDSystemInfo") {
179 i += 1;
180 while i < lines.len() && !lines[i].contains(">>") {
181 let info_line = lines[i].trim();
182 if info_line.starts_with("/Registry") {
183 if let Some(reg) = self.extract_string(info_line) {
184 cmap.cid_system_info.registry = reg;
185 }
186 } else if info_line.starts_with("/Ordering") {
187 if let Some(ord) = self.extract_string(info_line) {
188 cmap.cid_system_info.ordering = ord;
189 }
190 } else if info_line.starts_with("/Supplement") {
191 if let Some(sup) = self.extract_number(info_line) {
192 cmap.cid_system_info.supplement = sup as i32;
193 }
194 }
195 i += 1;
196 }
197 }
198 else if line.starts_with("/WMode") {
200 if let Some(wmode) = self.extract_number(line) {
201 cmap.wmode = wmode as i32;
202 }
203 }
204 else if line.starts_with("/UseCMap") {
206 if let Some(usecmap) = self.extract_name(line) {
207 cmap.usecmap = Some(usecmap);
208 }
209 }
210 else if line.contains("begincodespacerange") {
212 let count = self.extract_count(line).unwrap_or(0);
213 i += 1;
214 for _ in 0..count {
215 if i >= lines.len() {
216 break;
217 }
218 let range_line = lines[i].trim();
219 if let Some((start, end)) = self.parse_hex_range(range_line) {
220 cmap.code_space_ranges.push(CodeSpaceRange { start, end });
221 }
222 i += 1;
223 }
224 }
225 else if line.contains("beginbfchar") {
227 let count = self.extract_count(line).unwrap_or(0);
228 i += 1;
229 for _ in 0..count {
230 if i >= lines.len() {
231 break;
232 }
233 let char_line = lines[i].trim();
234 if let Some((src, dst)) = self.parse_char_mapping(char_line) {
235 chars.insert(src, dst);
236 }
237 i += 1;
238 }
239 }
240 else if line.contains("beginbfrange") {
242 let count = self.extract_count(line).unwrap_or(0);
243 i += 1;
244 for _ in 0..count {
245 if i >= lines.len() {
246 break;
247 }
248 let range_line = lines[i].trim();
249 if let Some(mapping) = self.parse_range_mapping(range_line) {
250 ranges.push(mapping);
251 }
252 i += 1;
253 }
254 }
255 else if line.contains("begincidchar") {
257 let count = self.extract_count(line).unwrap_or(0);
258 i += 1;
259 for _ in 0..count {
260 if i >= lines.len() {
261 break;
262 }
263 let cid_line = lines[i].trim();
264 if let Some((src, cid)) = self.parse_cid_char(cid_line) {
265 cid_chars.insert(src, cid);
266 }
267 i += 1;
268 }
269 }
270 else if line.contains("begincidrange") {
272 let count = self.extract_count(line).unwrap_or(0);
273 i += 1;
274 for _ in 0..count {
275 if i >= lines.len() {
276 break;
277 }
278 let cid_range_line = lines[i].trim();
279 if let Some(mapping) = self.parse_cid_range(cid_range_line) {
280 cid_ranges.push(mapping);
281 }
282 i += 1;
283 }
284 }
285
286 i += 1;
287 }
288
289 cmap.mappings = if !chars.is_empty()
291 && ranges.is_empty()
292 && cid_chars.is_empty()
293 && cid_ranges.is_empty()
294 {
295 CMapMappings::Char(chars)
296 } else if chars.is_empty()
297 && !ranges.is_empty()
298 && cid_chars.is_empty()
299 && cid_ranges.is_empty()
300 {
301 CMapMappings::Range(ranges)
302 } else if chars.is_empty()
303 && ranges.is_empty()
304 && !cid_chars.is_empty()
305 && cid_ranges.is_empty()
306 {
307 CMapMappings::CID(cid_chars)
308 } else if chars.is_empty()
309 && ranges.is_empty()
310 && cid_chars.is_empty()
311 && !cid_ranges.is_empty()
312 {
313 CMapMappings::CIDRange(cid_ranges)
314 } else {
315 CMapMappings::Mixed {
316 chars,
317 ranges,
318 cid_chars,
319 cid_ranges,
320 }
321 };
322
323 Some(cmap)
324 }
325
326 fn extract_name(&self, line: &str) -> Option<String> {
327 let parts: Vec<&str> = line.split_whitespace().collect();
328 parts.get(1).map(|s| s.trim_start_matches('/').to_string())
329 }
330
331 fn extract_string(&self, line: &str) -> Option<String> {
332 if let Some(start) = line.find('(') {
333 if let Some(end) = line.rfind(')') {
334 return Some(line[start + 1..end].to_string());
335 }
336 }
337 None
338 }
339
340 fn extract_number(&self, line: &str) -> Option<i64> {
341 let parts: Vec<&str> = line.split_whitespace().collect();
342 parts.get(1).and_then(|s| s.parse().ok())
343 }
344
345 fn extract_count(&self, line: &str) -> Option<usize> {
346 let parts: Vec<&str> = line.split_whitespace().collect();
347 parts.first().and_then(|s| s.parse().ok())
348 }
349
350 fn parse_hex_range(&self, line: &str) -> Option<(Vec<u8>, Vec<u8>)> {
351 let parts: Vec<&str> = line.split_whitespace().collect();
352 if parts.len() >= 2 {
353 let start = self.hex_to_bytes(parts[0])?;
354 let end = self.hex_to_bytes(parts[1])?;
355 return Some((start, end));
356 }
357 None
358 }
359
360 fn parse_char_mapping(&self, line: &str) -> Option<(Vec<u8>, Vec<u8>)> {
361 let parts: Vec<&str> = line.split_whitespace().collect();
362 if parts.len() >= 2 {
363 let src = self.hex_to_bytes(parts[0])?;
364 let dst = self.hex_to_bytes(parts[1])?;
365 return Some((src, dst));
366 }
367 None
368 }
369
370 fn parse_range_mapping(&self, line: &str) -> Option<CharRangeMapping> {
371 let parts: Vec<&str> = line.split_whitespace().collect();
372 if parts.len() >= 3 {
373 let start = self.hex_to_bytes(parts[0])?;
374 let end = self.hex_to_bytes(parts[1])?;
375
376 if parts[2].starts_with('[') {
378 let mut array_dests = Vec::new();
380 let array_str = parts[2..].join(" ");
381 let array_content = array_str.trim_start_matches('[').trim_end_matches(']');
382
383 for hex in array_content.split_whitespace() {
384 if let Some(bytes) = self.hex_to_bytes(hex) {
385 array_dests.push(bytes);
386 }
387 }
388
389 return Some(CharRangeMapping {
390 start,
391 end,
392 dest: RangeDest::Array(array_dests),
393 });
394 } else {
395 let dest = self.hex_to_bytes(parts[2])?;
397 return Some(CharRangeMapping {
398 start,
399 end,
400 dest: RangeDest::Single(dest),
401 });
402 }
403 }
404 None
405 }
406
407 fn parse_cid_char(&self, line: &str) -> Option<(Vec<u8>, u32)> {
408 let parts: Vec<&str> = line.split_whitespace().collect();
409 if parts.len() >= 2 {
410 let src = self.hex_to_bytes(parts[0])?;
411 let cid = parts[1].parse().ok()?;
412 return Some((src, cid));
413 }
414 None
415 }
416
417 fn parse_cid_range(&self, line: &str) -> Option<CIDRangeMapping> {
418 let parts: Vec<&str> = line.split_whitespace().collect();
419 if parts.len() >= 3 {
420 let start = self.hex_to_bytes(parts[0])?;
421 let end = self.hex_to_bytes(parts[1])?;
422 let cid = parts[2].parse().ok()?;
423 return Some(CIDRangeMapping { start, end, cid });
424 }
425 None
426 }
427
428 fn hex_to_bytes(&self, hex: &str) -> Option<Vec<u8>> {
429 let hex = hex.trim_start_matches('<').trim_end_matches('>');
430 if hex.len() % 2 != 0 {
431 return None;
432 }
433
434 let mut bytes = Vec::new();
435 for i in (0..hex.len()).step_by(2) {
436 let byte_str = &hex[i..i + 2];
437 if let Ok(byte) = u8::from_str_radix(byte_str, 16) {
438 bytes.push(byte);
439 } else {
440 return None;
441 }
442 }
443
444 Some(bytes)
445 }
446
447 pub fn map_code_to_unicode(&self, cmap: &CMap, code: &[u8]) -> Option<String> {
448 match &cmap.mappings {
449 CMapMappings::Char(chars) => chars
450 .get(code)
451 .and_then(|bytes| self.bytes_to_unicode(bytes)),
452 CMapMappings::Range(ranges) => {
453 for range in ranges {
454 if self.in_range(code, &range.start, &range.end) {
455 return self.map_range_to_unicode(code, &range.start, &range.dest);
456 }
457 }
458 None
459 }
460 CMapMappings::Mixed { chars, ranges, .. } => {
461 if let Some(unicode) = chars
463 .get(code)
464 .and_then(|bytes| self.bytes_to_unicode(bytes))
465 {
466 return Some(unicode);
467 }
468
469 for range in ranges {
471 if self.in_range(code, &range.start, &range.end) {
472 return self.map_range_to_unicode(code, &range.start, &range.dest);
473 }
474 }
475
476 None
477 }
478 _ => None,
479 }
480 }
481
482 fn in_range(&self, code: &[u8], start: &[u8], end: &[u8]) -> bool {
483 if code.len() != start.len() || code.len() != end.len() {
484 return false;
485 }
486
487 code >= start && code <= end
488 }
489
490 fn map_range_to_unicode(&self, code: &[u8], start: &[u8], dest: &RangeDest) -> Option<String> {
491 match dest {
492 RangeDest::Single(base) => {
493 let offset = self.bytes_to_u32(code)? - self.bytes_to_u32(start)?;
495 let unicode_value = self.bytes_to_u32(base)? + offset;
496
497 char::from_u32(unicode_value).map(|c| c.to_string())
499 }
500 RangeDest::Array(array) => {
501 let index = (self.bytes_to_u32(code)? - self.bytes_to_u32(start)?) as usize;
503 array
504 .get(index)
505 .and_then(|bytes| self.bytes_to_unicode(bytes))
506 }
507 }
508 }
509
510 fn bytes_to_unicode(&self, bytes: &[u8]) -> Option<String> {
511 if bytes.len() == 2 {
513 let value = ((bytes[0] as u32) << 8) | (bytes[1] as u32);
514 char::from_u32(value).map(|c| c.to_string())
515 } else if bytes.len() == 4 {
516 let value = ((bytes[0] as u32) << 24)
518 | ((bytes[1] as u32) << 16)
519 | ((bytes[2] as u32) << 8)
520 | (bytes[3] as u32);
521 char::from_u32(value).map(|c| c.to_string())
522 } else {
523 None
524 }
525 }
526
527 fn bytes_to_u32(&self, bytes: &[u8]) -> Option<u32> {
528 if bytes.is_empty() || bytes.len() > 4 {
529 return None;
530 }
531
532 let mut value = 0u32;
533 for byte in bytes {
534 value = (value << 8) | (*byte as u32);
535 }
536
537 Some(value)
538 }
539}