1use folio_core::Result;
7use std::collections::HashMap;
8
/// Lookup table from character codes to Unicode text, populated by
/// [`ToUnicodeCMap::parse`].
///
/// Individual mappings take precedence over ranges when both cover a code
/// (see [`ToUnicodeCMap::lookup`]).
#[derive(Debug, Clone, Default)]
pub struct ToUnicodeCMap {
    /// Single code → text entries collected from `beginbfchar` sections.
    mappings: HashMap<u32, String>,
    /// `(start, end, unicode_start)` triples collected from `beginbfrange`
    /// sections; code `start + k` resolves to code point `unicode_start + k`.
    ranges: Vec<(u32, u32, u32)>,
}
17
18impl ToUnicodeCMap {
19 pub fn parse(data: &[u8]) -> Result<Self> {
21 let text = String::from_utf8_lossy(data);
22 let mut cmap = ToUnicodeCMap::default();
23
24 let mut lines = text.lines().peekable();
25
26 while let Some(line) = lines.next() {
27 let line = line.trim();
28
29 if line.ends_with("beginbfchar") {
30 while let Some(mapping_line) = lines.next() {
32 let mapping_line = mapping_line.trim();
33 if mapping_line.contains("endbfchar") {
34 break;
35 }
36 if let Some((code, unicode)) = parse_bfchar_line(mapping_line) {
37 cmap.mappings.insert(code, unicode);
38 }
39 }
40 } else if line.ends_with("beginbfrange") {
41 while let Some(range_line) = lines.next() {
43 let range_line = range_line.trim();
44 if range_line.contains("endbfrange") {
45 break;
46 }
47 if let Some((start, end, unicode_start)) = parse_bfrange_line(range_line) {
48 cmap.ranges.push((start, end, unicode_start));
49 }
50 }
51 }
52 }
53
54 Ok(cmap)
55 }
56
57 pub fn lookup(&self, code: u32) -> Option<String> {
59 if let Some(s) = self.mappings.get(&code) {
61 return Some(s.clone());
62 }
63
64 for &(start, end, unicode_start) in &self.ranges {
66 if code >= start && code <= end {
67 let offset = code - start;
68 if let Some(ch) = char::from_u32(unicode_start + offset) {
69 return Some(ch.to_string());
70 }
71 }
72 }
73
74 None
75 }
76
77 pub fn decode(&self, data: &[u8]) -> String {
81 let mut result = String::new();
82 let mut i = 0;
83
84 while i < data.len() {
85 if i + 1 < data.len() {
87 let code2 = ((data[i] as u32) << 8) | (data[i + 1] as u32);
88 if let Some(s) = self.lookup(code2) {
89 result.push_str(&s);
90 i += 2;
91 continue;
92 }
93 }
94
95 let code1 = data[i] as u32;
97 if let Some(s) = self.lookup(code1) {
98 result.push_str(&s);
99 } else {
100 if data[i] >= 0x20 && data[i] <= 0x7E {
102 result.push(data[i] as char);
103 }
104 }
105 i += 1;
106 }
107
108 result
109 }
110
111 pub fn is_empty(&self) -> bool {
113 self.mappings.is_empty() && self.ranges.is_empty()
114 }
115}
116
117fn parse_bfchar_line(line: &str) -> Option<(u32, String)> {
119 let parts: Vec<&str> = line.split('<').filter(|s| !s.is_empty()).collect();
120 if parts.len() < 2 {
121 return None;
122 }
123
124 let code_hex = parts[0].split('>').next()?;
125 let unicode_hex = parts[1].split('>').next()?;
126
127 let code = u32::from_str_radix(code_hex.trim(), 16).ok()?;
128 let unicode_str = hex_to_unicode_string(unicode_hex.trim())?;
129
130 Some((code, unicode_str))
131}
132
/// Parses one `bfrange` entry of the form `<start> <end> <unicode_start>`.
///
/// The first three `<…>` fields on the line are read as hexadecimal numbers
/// and returned as `(start, end, unicode_start)`. Returns `None` when fewer
/// than three fields are present or any of them is not valid hexadecimal.
///
/// A destination that is a UTF-16 surrogate pair (for example `<D835DC00>`)
/// is combined into the single code point it encodes, so ranges that target
/// characters above U+FFFF resolve correctly. Any other destination is
/// returned as the raw number.
///
/// NOTE(review): the array form `<start> <end> [<u1> <u2> …]` is not
/// modelled by the return type; only its first element is picked up as
/// `unicode_start`.
fn parse_bfrange_line(line: &str) -> Option<(u32, u32, u32)> {
    // Each fragment after a '<' looks like "HEX> …"; keep only the hex part.
    let mut fields = line
        .split('<')
        .filter(|chunk| !chunk.is_empty())
        .map(|chunk| chunk.split('>').next().unwrap_or(chunk).trim());

    let start = u32::from_str_radix(fields.next()?, 16).ok()?;
    let end = u32::from_str_radix(fields.next()?, 16).ok()?;
    let raw_dest = u32::from_str_radix(fields.next()?, 16).ok()?;

    // Destinations are UTF-16BE: two code units forming a high/low surrogate
    // pair denote one supplementary-plane scalar value.
    let (high, low) = (raw_dest >> 16, raw_dest & 0xFFFF);
    let unicode_start = if (0xD800..=0xDBFF).contains(&high) && (0xDC00..=0xDFFF).contains(&low)
    {
        0x1_0000 + ((high - 0xD800) << 10) + (low - 0xDC00)
    } else {
        raw_dest
    };

    Some((start, end, unicode_start))
}
150
/// Converts the hex digits of a CMap destination into text.
///
/// Input of at most four digits is read as a single code point. Longer input
/// is read as UTF-16BE code units of four digits each, so a high/low
/// surrogate pair becomes one supplementary-plane character. Groups that are
/// not valid hexadecimal, unpaired surrogates, and a trailing group shorter
/// than four digits are skipped.
///
/// Returns `None` when no character could be decoded.
fn hex_to_unicode_string(hex: &str) -> Option<String> {
    let hex = hex.trim();
    if hex.len() <= 4 {
        let cp = u32::from_str_radix(hex, 16).ok()?;
        return char::from_u32(cp).map(|c| c.to_string());
    }

    // Walk the bytes in groups of four instead of slicing the `str`: input
    // may contain non-ASCII characters (for example U+FFFD from a lossy
    // conversion), and slicing inside one of those would panic.
    let units = hex.as_bytes().chunks_exact(4).filter_map(|group| {
        let digits = std::str::from_utf8(group).ok()?;
        u16::from_str_radix(digits, 16).ok()
    });

    let decoded: String = char::decode_utf16(units)
        .filter_map(|unit| unit.ok())
        .collect();

    if decoded.is_empty() {
        None
    } else {
        Some(decoded)
    }
}
178
// Unit tests for CMap parsing, lookup and decoding.
#[cfg(test)]
mod tests {
    use super::*;

    // A CMap with surrounding boilerplate and a codespace range: only the
    // three entries inside the bfchar section should become mappings.
    #[test]
    fn test_parse_bfchar() {
        let cmap_data = br#"
/CIDInit /ProcSet findresource begin
12 dict begin
begincmap
1 begincodespacerange
<00> <FF>
endcodespacerange
3 beginbfchar
<01> <0048>
<02> <0065>
<03> <006C>
endbfchar
endcmap
"#;
        let cmap = ToUnicodeCMap::parse(cmap_data).unwrap();
        assert_eq!(cmap.lookup(1), Some("H".into()));
        assert_eq!(cmap.lookup(2), Some("e".into()));
        assert_eq!(cmap.lookup(3), Some("l".into()));
    }

    // A single range covering 0x41..=0x5A: both endpoints and an interior
    // code resolve, and the code just past the end does not.
    #[test]
    fn test_parse_bfrange() {
        let cmap_data = br#"
1 beginbfrange
<0041> <005A> <0041>
endbfrange
"#;
        let cmap = ToUnicodeCMap::parse(cmap_data).unwrap();
        assert_eq!(cmap.lookup(0x41), Some("A".into()));
        assert_eq!(cmap.lookup(0x42), Some("B".into()));
        assert_eq!(cmap.lookup(0x5A), Some("Z".into()));
        assert_eq!(cmap.lookup(0x5B), None);
    }

    // Only one-byte codes are mapped here, so every two-byte attempt in
    // `decode` misses and each byte is resolved on its own.
    #[test]
    fn test_decode() {
        let cmap_data = br#"
3 beginbfchar
<48> <0048>
<65> <0065>
<6C> <006C>
endbfchar
"#;
        let cmap = ToUnicodeCMap::parse(cmap_data).unwrap();
        assert_eq!(cmap.decode(b"Hel"), "Hel");
    }
}