1pub mod cmap;
2mod glyphnames;
3mod mappings;
4
5use crate::Error;
6use crate::Result;
7use cmap::ToUnicodeCMap;
8use encoding_rs::UTF_16BE;
9use log::debug;
10use crate::parser_aux::substr;
11pub use self::mappings::*;
12
13pub fn bytes_to_string(encoding: &CodedCharacterSet, bytes: &[u8]) -> String {
14 let code_points = bytes
15 .iter()
16 .filter_map(|&byte| encoding[byte as usize])
17 .collect::<Vec<u16>>();
18 String::from_utf16(&code_points).expect("decoded string should only contain valid UTF16")
19}
20
21pub fn string_to_bytes(encoding: &CodedCharacterSet, text: &str) -> Vec<u8> {
22 text.encode_utf16()
23 .filter_map(|ch| encoding.iter().position(|&code| code == Some(ch)))
24 .map(|byte| byte as u8)
25 .collect()
26}
27
/// The ways this module can convert between raw bytes and text.
pub enum Encoding<'a> {
    /// A borrowed lookup table that is indexed by byte value (see the free
    /// functions `bytes_to_string` / `string_to_bytes` in this file).
    OneByteEncoding(&'a CodedCharacterSet),
    /// An encoding identified by its name as raw bytes. The methods in this
    /// file recognise `UniGB-UCS2-H`, `UniGB-UTF16-H` and `WinAnsiEncoding`.
    SimpleEncoding(&'a [u8]),
    /// An owned `ToUnicodeCMap` used to translate between source codes and
    /// UTF-16 code units.
    UnicodeMapEncoding(ToUnicodeCMap),
}
33
34impl std::fmt::Debug for Encoding<'_> {
35 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
36 match self {
37 Self::OneByteEncoding(_arg0) => f.debug_tuple("OneByteEncoding").finish(),
39 Self::SimpleEncoding(arg0) => f.debug_tuple("SimpleEncoding").field(arg0).finish(),
40 Self::UnicodeMapEncoding(_arg0) => f.debug_tuple("UnicodeMapEncoding").finish(),
41 }
42 }
43}
44
45impl Encoding<'_> {
46 pub fn bytes_to_string(&self, bytes: &[u8]) -> Result<String> {
47 match self {
48 Self::OneByteEncoding(map) => Ok(bytes_to_string(map, bytes)),
49 Self::SimpleEncoding(b"UniGB-UCS2-H") | Self::SimpleEncoding(b"UniGB-UTF16-H") => {
50 Ok(UTF_16BE.decode(bytes).0.to_string())
51 }
52 Self::UnicodeMapEncoding(unicode_map) => {
53 let mut output_bytes = Vec::new();
54
55 let mut bytes_in_considered_code = 0u8;
57 let mut considered_source_code = 0u32;
58 for byte in bytes {
59 if bytes_in_considered_code == 4 {
60 let mut value = unicode_map.get_or_replacement_char(considered_source_code, 4);
61 considered_source_code = 0;
62 bytes_in_considered_code = 0;
63 output_bytes.append(&mut value);
64 }
65 bytes_in_considered_code += 1;
66 considered_source_code = considered_source_code * 256 + *byte as u32;
67 if let Some(mut value) = unicode_map.get(considered_source_code, bytes_in_considered_code) {
68 considered_source_code = 0;
69 bytes_in_considered_code = 0;
70 output_bytes.append(&mut value);
71 }
72 }
73 if bytes_in_considered_code > 0 {
74 let mut value =
75 unicode_map.get_or_replacement_char(considered_source_code, bytes_in_considered_code);
76 output_bytes.append(&mut value);
77 }
78 let utf16_str: Vec<u8> = output_bytes
79 .iter()
80 .flat_map(|it| [(it / 256) as u8, (it % 256) as u8])
81 .collect();
82 Ok(UTF_16BE.decode(&utf16_str).0.to_string())
83 }
84 Self::SimpleEncoding(b"WinAnsiEncoding") => Ok(bytes_to_string(&WIN_ANSI_ENCODING, bytes)),
85 Self::SimpleEncoding(_) => Err(Error::CharacterEncoding),
86 }
87 }
88
89 pub fn string_to_bytes(&self, text: &str) -> Vec<u8> {
90 match self {
91 Self::OneByteEncoding(map) => string_to_bytes(map, text),
92 Self::SimpleEncoding(b"UniGB-UCS2-H") | Self::SimpleEncoding(b"UniGB-UTF16-H") => encode_utf16_be(text),
93 Self::SimpleEncoding(b"WinAnsiEncoding") => string_to_bytes(&WIN_ANSI_ENCODING, text),
94 Self::UnicodeMapEncoding(unicode_map) => {
95 let mut result_bytes = Vec::new();
96
97 let mut i = 0;
98 while i < text.chars().count() {
99 let current_unicode_seq: Vec<u16> = substr(text, i, 1).encode_utf16().collect();
100
101 if let Some(entries) = unicode_map.get_source_codes_for_unicode(¤t_unicode_seq) {
102 if let Some(entry) = entries.first() {
103 let mut bytes_for_code = Vec::new();
105 let val = entry.source_code;
106 match entry.code_len {
107 1 => bytes_for_code.push(val as u8),
108 2 => bytes_for_code.extend_from_slice(&(val as u16).to_be_bytes()),
109 3 => {
110 bytes_for_code.push((val >> 16) as u8);
111 bytes_for_code.push((val >> 8) as u8);
112 bytes_for_code.push(val as u8);
113 }
114 4 => bytes_for_code.extend_from_slice(&val.to_be_bytes()),
115 _ => { }
116 }
117 result_bytes.extend(bytes_for_code);
118 } else {
119 log::warn!(
121 "Unicode sequence {current_unicode_seq:04X?} found in map but no entries, skipping."
122 );
123 }
124 } else {
125 log::warn!(
127 "Unicode sequence {current_unicode_seq:04X?} not found in ToUnicode CMap, skipping."
128 );
129 }
130 i += 1;
131 }
132 result_bytes
133 }
134 Self::SimpleEncoding(_) => {
135 debug!("Unknown encoding used to encode text {self:?}");
136 text.as_bytes().to_vec()
137 }
138 }
139 }
140}
141
/// Encodes `text` as big-endian UTF-16, prefixed with the byte order mark
/// `FE FF`.
pub fn encode_utf16_be(text: &str) -> Vec<u8> {
    const BOM: u16 = 0xFEFF;
    std::iter::once(BOM)
        .chain(text.encode_utf16())
        .flat_map(|unit| unit.to_be_bytes())
        .collect()
}
153
/// Returns the UTF-8 bytes of `text`, prefixed with the UTF-8 byte order mark
/// `EF BB BF`.
pub fn encode_utf8(text: &str) -> Vec<u8> {
    const BOM: [u8; 3] = [0xEF, 0xBB, 0xBF];
    let mut out = Vec::with_capacity(BOM.len() + text.len());
    out.extend_from_slice(&BOM);
    out.extend_from_slice(text.as_bytes());
    out
}
164
#[cfg(test)]
mod tests {
    use super::*;

    /// With only two-byte codes in the CMap, `00 24` must be decoded as one
    /// code rather than as two single bytes.
    #[test]
    fn unicode_with_2byte_code_does_not_convert_single_bytes() {
        let mut to_unicode = ToUnicodeCMap::new();
        to_unicode.put(0x0000, 0x0002, 2, cmap::BfRangeTarget::UTF16CodePoint { offset: 0 });
        to_unicode.put(0x0024, 0x0025, 2, cmap::BfRangeTarget::UTF16CodePoint { offset: 0 });

        let encoding = Encoding::UnicodeMapEncoding(to_unicode);
        let input: [u8; 2] = [0x00, 0x24];
        let decoded = encoding.bytes_to_string(&input);

        assert_eq!(decoded.unwrap(), "\u{0024}");
    }

    /// Decoding through the named `WinAnsiEncoding` yields the expected
    /// accented characters.
    #[test]
    fn winansi_bytes_to_string() {
        let input = [0x41, 0xe9, 0x42, 0xfc, 0xdf];
        let encoding = Encoding::SimpleEncoding(b"WinAnsiEncoding");
        let decoded = encoding
            .bytes_to_string(&input)
            .expect("WinAnsi decode should succeed");
        assert_eq!(decoded, "AéBüß");
    }

    /// Text encoded via the named `WinAnsiEncoding` decodes back to itself
    /// through the `WIN_ANSI_ENCODING` table.
    #[test]
    fn winansi_string_to_bytes() {
        let original = "Sébastien 0,019€ ü ÄÖÜ ß";
        let encoded = Encoding::SimpleEncoding(b"WinAnsiEncoding").string_to_bytes(original);
        let table_encoding = Encoding::OneByteEncoding(&WIN_ANSI_ENCODING);
        let round_tripped = table_encoding
            .bytes_to_string(&encoded)
            .expect("WinAnsi decode should succeed");
        assert_eq!(round_tripped, original);
    }
}