pdfluent_lopdf/encodings/
mod.rs1pub mod cmap;
2mod glyphnames;
3mod mappings;
4
5pub use self::mappings::*;
6use crate::Error;
7use crate::Result;
8use crate::parser_aux::substr;
9use cmap::ToUnicodeCMap;
10use encoding_rs::UTF_16BE;
11use log::debug;
12
13pub fn bytes_to_string(encoding: &CodedCharacterSet, bytes: &[u8]) -> String {
14 let code_points = bytes
15 .iter()
16 .filter_map(|&byte| encoding[byte as usize])
17 .collect::<Vec<u16>>();
18 String::from_utf16(&code_points).expect("decoded string should only contain valid UTF16")
19}
20
21pub fn string_to_bytes(encoding: &CodedCharacterSet, text: &str) -> Vec<u8> {
22 text.encode_utf16()
23 .filter_map(|ch| encoding.iter().position(|&code| code == Some(ch)))
24 .map(|byte| byte as u8)
25 .collect()
26}
27
/// The ways this module can translate between raw string bytes and text.
pub enum Encoding<'a> {
    /// A borrowed lookup table indexed by byte value; conversion goes through
    /// the module-level `bytes_to_string` / `string_to_bytes` functions.
    OneByteEncoding(&'a CodedCharacterSet),
    /// An encoding identified only by its name bytes. The methods on this
    /// type special-case `UniGB-UCS2-H` and `UniGB-UTF16-H`; every other name
    /// is treated as unknown.
    SimpleEncoding(&'a [u8]),
    /// An owned `ToUnicodeCMap` used to map source codes to and from Unicode.
    UnicodeMapEncoding(ToUnicodeCMap),
}
33
34impl std::fmt::Debug for Encoding<'_> {
35 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
36 match self {
37 Self::OneByteEncoding(_arg0) => f.debug_tuple("OneByteEncoding").finish(),
39 Self::SimpleEncoding(arg0) => f.debug_tuple("SimpleEncoding").field(arg0).finish(),
40 Self::UnicodeMapEncoding(_arg0) => f.debug_tuple("UnicodeMapEncoding").finish(),
41 }
42 }
43}
44
45impl Encoding<'_> {
46 pub fn bytes_to_string(&self, bytes: &[u8]) -> Result<String> {
47 match self {
48 Self::OneByteEncoding(map) => Ok(bytes_to_string(map, bytes)),
49 Self::SimpleEncoding(b"UniGB-UCS2-H") | Self::SimpleEncoding(b"UniGB-UTF16-H") => {
50 Ok(UTF_16BE.decode(bytes).0.to_string())
51 }
52 Self::UnicodeMapEncoding(unicode_map) => {
53 let mut output_bytes = Vec::new();
54
55 let mut bytes_in_considered_code = 0u8;
57 let mut considered_source_code = 0u32;
58 for byte in bytes {
59 if bytes_in_considered_code == 4 {
60 let mut value =
61 unicode_map.get_or_replacement_char(considered_source_code, 4);
62 considered_source_code = 0;
63 bytes_in_considered_code = 0;
64 output_bytes.append(&mut value);
65 }
66 bytes_in_considered_code += 1;
67 considered_source_code = considered_source_code * 256 + *byte as u32;
68 if let Some(mut value) =
69 unicode_map.get(considered_source_code, bytes_in_considered_code)
70 {
71 considered_source_code = 0;
72 bytes_in_considered_code = 0;
73 output_bytes.append(&mut value);
74 }
75 }
76 if bytes_in_considered_code > 0 {
77 let mut value = unicode_map
78 .get_or_replacement_char(considered_source_code, bytes_in_considered_code);
79 output_bytes.append(&mut value);
80 }
81 let utf16_str: Vec<u8> = output_bytes
82 .iter()
83 .flat_map(|it| [(it / 256) as u8, (it % 256) as u8])
84 .collect();
85 Ok(UTF_16BE.decode(&utf16_str).0.to_string())
86 }
87 Self::SimpleEncoding(_) => Err(Error::CharacterEncoding),
88 }
89 }
90
91 pub fn string_to_bytes(&self, text: &str) -> Vec<u8> {
92 match self {
93 Self::OneByteEncoding(map) => string_to_bytes(map, text),
94 Self::SimpleEncoding(b"UniGB-UCS2-H") | Self::SimpleEncoding(b"UniGB-UTF16-H") => {
95 encode_utf16_be(text)
96 }
97 Self::UnicodeMapEncoding(unicode_map) => {
98 let mut result_bytes = Vec::new();
99
100 let mut i = 0;
101 while i < text.chars().count() {
102 let current_unicode_seq: Vec<u16> = substr(text, i, 1).encode_utf16().collect();
103
104 if let Some(entries) =
105 unicode_map.get_source_codes_for_unicode(¤t_unicode_seq)
106 {
107 if let Some(entry) = entries.first() {
108 let mut bytes_for_code = Vec::new();
110 let val = entry.source_code;
111 match entry.code_len {
112 1 => bytes_for_code.push(val as u8),
113 2 => bytes_for_code.extend_from_slice(&(val as u16).to_be_bytes()),
114 3 => {
115 bytes_for_code.push((val >> 16) as u8);
116 bytes_for_code.push((val >> 8) as u8);
117 bytes_for_code.push(val as u8);
118 }
119 4 => bytes_for_code.extend_from_slice(&val.to_be_bytes()),
120 _ => { }
121 }
122 result_bytes.extend(bytes_for_code);
123 } else {
124 log::warn!(
126 "Unicode sequence {current_unicode_seq:04X?} found in map but no entries, skipping."
127 );
128 }
129 } else {
130 log::warn!(
132 "Unicode sequence {current_unicode_seq:04X?} not found in ToUnicode CMap, skipping."
133 );
134 }
135 i += 1;
136 }
137 result_bytes
138 }
139 Self::SimpleEncoding(_) => {
140 debug!("Unknown encoding used to encode text {self:?}");
141 text.as_bytes().to_vec()
142 }
143 }
144 }
145}
146
/// Encodes `text` as big-endian UTF-16, prefixed with the byte-order mark
/// `FE FF`.
pub fn encode_utf16_be(text: &str) -> Vec<u8> {
    const BOM: u16 = 0xFEFF;
    // The BOM is just one more code unit in front of the text's own units.
    std::iter::once(BOM)
        .chain(text.encode_utf16())
        .flat_map(u16::to_be_bytes)
        .collect()
}
158
/// Returns the UTF-8 bytes of `text`, prefixed with the three-byte UTF-8
/// byte-order mark `EF BB BF`.
pub fn encode_utf8(text: &str) -> Vec<u8> {
    const BOM: [u8; 3] = [0xEF, 0xBB, 0xBF];
    [&BOM[..], text.as_bytes()].concat()
}
169
#[cfg(test)]
mod tests {
    use super::*;

    /// With only two-byte codes registered, the input `00 24` must decode as
    /// the single code `0x0024` rather than as two one-byte codes.
    #[test]
    fn unicode_with_2byte_code_does_not_convert_single_bytes() {
        let mut to_unicode = ToUnicodeCMap::new();

        // Two ranges of two-byte source codes, each registered with offset 0.
        for (first, last) in [(0x0000, 0x0002), (0x0024, 0x0025)] {
            to_unicode.put(
                first,
                last,
                2,
                cmap::BfRangeTarget::UTF16CodePoint { offset: 0 },
            );
        }

        let input: [u8; 2] = [0x00, 0x24];
        let decoded = Encoding::UnicodeMapEncoding(to_unicode)
            .bytes_to_string(&input)
            .unwrap();

        assert_eq!(decoded, "\u{0024}");
    }
}