Skip to main content

lopdf/encodings/
mod.rs

1pub mod cmap;
2mod glyphnames;
3mod mappings;
4
5use crate::Error;
6use crate::Result;
7use cmap::ToUnicodeCMap;
8use encoding_rs::UTF_16BE;
9use log::debug;
10use crate::parser_aux::substr;
11pub use self::mappings::*;
12
13pub fn bytes_to_string(encoding: &CodedCharacterSet, bytes: &[u8]) -> String {
14    let code_points = bytes
15        .iter()
16        .filter_map(|&byte| encoding[byte as usize])
17        .collect::<Vec<u16>>();
18    String::from_utf16(&code_points).expect("decoded string should only contain valid UTF16")
19}
20
21pub fn string_to_bytes(encoding: &CodedCharacterSet, text: &str) -> Vec<u8> {
22    text.encode_utf16()
23        .filter_map(|ch| encoding.iter().position(|&code| code == Some(ch)))
24        .map(|byte| byte as u8)
25        .collect()
26}
27
/// The ways text bytes can be converted to and from Rust strings in this module.
pub enum Encoding<'a> {
    /// A borrowed single-byte lookup table; conversion goes through the free
    /// functions `bytes_to_string` / `string_to_bytes`.
    OneByteEncoding(&'a CodedCharacterSet),
    /// An encoding identified only by its name bytes (for example
    /// `b"WinAnsiEncoding"` or `b"UniGB-UCS2-H"`).
    SimpleEncoding(&'a [u8]),
    /// An owned `ToUnicodeCMap` that maps source codes to UTF-16 sequences.
    UnicodeMapEncoding(ToUnicodeCMap),
}
33
34impl std::fmt::Debug for Encoding<'_> {
35    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
36        match self {
37            // UnicodeCMap and Bytes encoding ommitted to not bloat debug log
38            Self::OneByteEncoding(_arg0) => f.debug_tuple("OneByteEncoding").finish(),
39            Self::SimpleEncoding(arg0) => f.debug_tuple("SimpleEncoding").field(arg0).finish(),
40            Self::UnicodeMapEncoding(_arg0) => f.debug_tuple("UnicodeMapEncoding").finish(),
41        }
42    }
43}
44
45impl Encoding<'_> {
46    pub fn bytes_to_string(&self, bytes: &[u8]) -> Result<String> {
47        match self {
48            Self::OneByteEncoding(map) => Ok(bytes_to_string(map, bytes)),
49            Self::SimpleEncoding(b"UniGB-UCS2-H") | Self::SimpleEncoding(b"UniGB-UTF16-H") => {
50                Ok(UTF_16BE.decode(bytes).0.to_string())
51            }
52            Self::UnicodeMapEncoding(unicode_map) => {
53                let mut output_bytes = Vec::new();
54
55                // source codes can have a variadic length from 1 to 4 bytes
56                let mut bytes_in_considered_code = 0u8;
57                let mut considered_source_code = 0u32;
58                for byte in bytes {
59                    if bytes_in_considered_code == 4 {
60                        let mut value = unicode_map.get_or_replacement_char(considered_source_code, 4);
61                        considered_source_code = 0;
62                        bytes_in_considered_code = 0;
63                        output_bytes.append(&mut value);
64                    }
65                    bytes_in_considered_code += 1;
66                    considered_source_code = considered_source_code * 256 + *byte as u32;
67                    if let Some(mut value) = unicode_map.get(considered_source_code, bytes_in_considered_code) {
68                        considered_source_code = 0;
69                        bytes_in_considered_code = 0;
70                        output_bytes.append(&mut value);
71                    }
72                }
73                if bytes_in_considered_code > 0 {
74                    let mut value =
75                        unicode_map.get_or_replacement_char(considered_source_code, bytes_in_considered_code);
76                    output_bytes.append(&mut value);
77                }
78                let utf16_str: Vec<u8> = output_bytes
79                    .iter()
80                    .flat_map(|it| [(it / 256) as u8, (it % 256) as u8])
81                    .collect();
82                Ok(UTF_16BE.decode(&utf16_str).0.to_string())
83            }
84            Self::SimpleEncoding(b"WinAnsiEncoding") => Ok(bytes_to_string(&WIN_ANSI_ENCODING, bytes)),
85            Self::SimpleEncoding(_) => Err(Error::CharacterEncoding),
86        }
87    }
88
89    pub fn string_to_bytes(&self, text: &str) -> Vec<u8> {
90        match self {
91            Self::OneByteEncoding(map) => string_to_bytes(map, text),
92            Self::SimpleEncoding(b"UniGB-UCS2-H") | Self::SimpleEncoding(b"UniGB-UTF16-H") => encode_utf16_be(text),
93            Self::SimpleEncoding(b"WinAnsiEncoding") => string_to_bytes(&WIN_ANSI_ENCODING, text),
94            Self::UnicodeMapEncoding(unicode_map) => {
95                let mut result_bytes = Vec::new();
96
97                let mut i = 0;
98                while i < text.chars().count() {
99                    let current_unicode_seq: Vec<u16> = substr(text, i, 1).encode_utf16().collect();
100
101                    if let Some(entries) = unicode_map.get_source_codes_for_unicode(&current_unicode_seq) {
102                        if let Some(entry) = entries.first() {
103                            // TODO: Add logic to pick the best entry if multiple
104                            let mut bytes_for_code = Vec::new();
105                            let val = entry.source_code;
106                            match entry.code_len {
107                                1 => bytes_for_code.push(val as u8),
108                                2 => bytes_for_code.extend_from_slice(&(val as u16).to_be_bytes()),
109                                3 => {
110                                    bytes_for_code.push((val >> 16) as u8);
111                                    bytes_for_code.push((val >> 8) as u8);
112                                    bytes_for_code.push(val as u8);
113                                }
114                                4 => bytes_for_code.extend_from_slice(&val.to_be_bytes()),
115                                _ => { /* Should not happen */ }
116                            }
117                            result_bytes.extend(bytes_for_code);
118                        } else {
119                            // No specific entry, handle as unmappable
120                            log::warn!(
121                                "Unicode sequence {current_unicode_seq:04X?} found in map but no entries, skipping."
122                            );
123                        }
124                    } else {
125                        // Character or sequence not found in CMap
126                        log::warn!(
127                            "Unicode sequence {current_unicode_seq:04X?} not found in ToUnicode CMap, skipping."
128                        );
129                    }
130                    i += 1;
131                }
132                result_bytes
133            }
134            Self::SimpleEncoding(_) => {
135                debug!("Unknown encoding used to encode text {self:?}");
136                text.as_bytes().to_vec()
137            }
138        }
139    }
140}
141
/// Encodes the given `str` as UTF-16BE, preceded by the byte order mark `FE FF`.
///
/// This is the recommended way to encode text strings: it covers all of
/// Unicode and is supported by all major PDF readers.
pub fn encode_utf16_be(text: &str) -> Vec<u8> {
    const BOM: u16 = 0xFEFF;
    // BOM first, then every UTF-16 code unit of the text, each as two big-endian bytes.
    std::iter::once(BOM)
        .chain(text.encode_utf16())
        .flat_map(u16::to_be_bytes)
        .collect()
}
153
/// Encodes the given `str` as UTF-8, preceded by the byte order mark `EF BB BF`.
///
/// This way of encoding text strings was first specified in PDF 2.0 and reader
/// support is still lacking (notably, Adobe Acrobat Reader doesn't support it
/// at the time of writing). Thus, using it is **NOT RECOMMENDED**.
pub fn encode_utf8(text: &str) -> Vec<u8> {
    const BOM: [u8; 3] = [0xEF, 0xBB, 0xBF];
    // The text is already UTF-8 in memory, so only the BOM has to be put in front.
    [&BOM[..], text.as_bytes()].concat()
}
164
#[cfg(test)]
mod tests {
    use super::*;

    /// With only two-byte codes registered, the bytes `00 24` must decode as
    /// the single code `0x0024`.
    #[test]
    fn unicode_with_2byte_code_does_not_convert_single_bytes() {
        let mut to_unicode = ToUnicodeCMap::new();
        to_unicode.put(0x0000, 0x0002, 2, cmap::BfRangeTarget::UTF16CodePoint { offset: 0 });
        to_unicode.put(0x0024, 0x0025, 2, cmap::BfRangeTarget::UTF16CodePoint { offset: 0 });

        let encoding = Encoding::UnicodeMapEncoding(to_unicode);
        let decoded = encoding.bytes_to_string(&[0x00, 0x24]);

        assert_eq!(decoded.unwrap(), "\u{0024}");
    }

    /// Decoding by encoding name: 0x41 = A, 0xe9 = é, 0x42 = B, 0xfc = ü, 0xdf = ß.
    #[test]
    fn winansi_bytes_to_string() {
        let encoding = Encoding::SimpleEncoding(b"WinAnsiEncoding");
        let decoded = encoding
            .bytes_to_string(&[0x41, 0xe9, 0x42, 0xfc, 0xdf])
            .expect("WinAnsi decode should succeed");
        assert_eq!(decoded, "AéBüß");
    }

    /// Encodes by encoding name, then decodes the bytes again through the
    /// WinAnsi table and expects the original text back.
    #[test]
    fn winansi_string_to_bytes() {
        let original = "Sébastien 0,019€ ü ÄÖÜ ß";
        let encoded = Encoding::SimpleEncoding(b"WinAnsiEncoding").string_to_bytes(original);
        let round_tripped = Encoding::OneByteEncoding(&WIN_ANSI_ENCODING)
            .bytes_to_string(&encoded)
            .expect("WinAnsi decode should succeed");
        assert_eq!(round_tripped, original);
    }
}