Skip to main content

pdfluent_lopdf/encodings/
mod.rs

1pub mod cmap;
2mod glyphnames;
3mod mappings;
4
5pub use self::mappings::*;
6use crate::Error;
7use crate::Result;
8use crate::parser_aux::substr;
9use cmap::ToUnicodeCMap;
10use encoding_rs::UTF_16BE;
11use log::debug;
12
13pub fn bytes_to_string(encoding: &CodedCharacterSet, bytes: &[u8]) -> String {
14    let code_points = bytes
15        .iter()
16        .filter_map(|&byte| encoding[byte as usize])
17        .collect::<Vec<u16>>();
18    String::from_utf16(&code_points).expect("decoded string should only contain valid UTF16")
19}
20
21pub fn string_to_bytes(encoding: &CodedCharacterSet, text: &str) -> Vec<u8> {
22    text.encode_utf16()
23        .filter_map(|ch| encoding.iter().position(|&code| code == Some(ch)))
24        .map(|byte| byte as u8)
25        .collect()
26}
27
28pub enum Encoding<'a> {
29    OneByteEncoding(&'a CodedCharacterSet),
30    SimpleEncoding(&'a [u8]),
31    UnicodeMapEncoding(ToUnicodeCMap),
32}
33
34impl std::fmt::Debug for Encoding<'_> {
35    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
36        match self {
37            // UnicodeCMap and Bytes encoding ommitted to not bloat debug log
38            Self::OneByteEncoding(_arg0) => f.debug_tuple("OneByteEncoding").finish(),
39            Self::SimpleEncoding(arg0) => f.debug_tuple("SimpleEncoding").field(arg0).finish(),
40            Self::UnicodeMapEncoding(_arg0) => f.debug_tuple("UnicodeMapEncoding").finish(),
41        }
42    }
43}
44
45impl Encoding<'_> {
46    pub fn bytes_to_string(&self, bytes: &[u8]) -> Result<String> {
47        match self {
48            Self::OneByteEncoding(map) => Ok(bytes_to_string(map, bytes)),
49            Self::SimpleEncoding(b"UniGB-UCS2-H") | Self::SimpleEncoding(b"UniGB-UTF16-H") => {
50                Ok(UTF_16BE.decode(bytes).0.to_string())
51            }
52            Self::UnicodeMapEncoding(unicode_map) => {
53                let mut output_bytes = Vec::new();
54
55                // source codes can have a variadic length from 1 to 4 bytes
56                let mut bytes_in_considered_code = 0u8;
57                let mut considered_source_code = 0u32;
58                for byte in bytes {
59                    if bytes_in_considered_code == 4 {
60                        let mut value =
61                            unicode_map.get_or_replacement_char(considered_source_code, 4);
62                        considered_source_code = 0;
63                        bytes_in_considered_code = 0;
64                        output_bytes.append(&mut value);
65                    }
66                    bytes_in_considered_code += 1;
67                    considered_source_code = considered_source_code * 256 + *byte as u32;
68                    if let Some(mut value) =
69                        unicode_map.get(considered_source_code, bytes_in_considered_code)
70                    {
71                        considered_source_code = 0;
72                        bytes_in_considered_code = 0;
73                        output_bytes.append(&mut value);
74                    }
75                }
76                if bytes_in_considered_code > 0 {
77                    let mut value = unicode_map
78                        .get_or_replacement_char(considered_source_code, bytes_in_considered_code);
79                    output_bytes.append(&mut value);
80                }
81                let utf16_str: Vec<u8> = output_bytes
82                    .iter()
83                    .flat_map(|it| [(it / 256) as u8, (it % 256) as u8])
84                    .collect();
85                Ok(UTF_16BE.decode(&utf16_str).0.to_string())
86            }
87            Self::SimpleEncoding(_) => Err(Error::CharacterEncoding),
88        }
89    }
90
91    pub fn string_to_bytes(&self, text: &str) -> Vec<u8> {
92        match self {
93            Self::OneByteEncoding(map) => string_to_bytes(map, text),
94            Self::SimpleEncoding(b"UniGB-UCS2-H") | Self::SimpleEncoding(b"UniGB-UTF16-H") => {
95                encode_utf16_be(text)
96            }
97            Self::UnicodeMapEncoding(unicode_map) => {
98                let mut result_bytes = Vec::new();
99
100                let mut i = 0;
101                while i < text.chars().count() {
102                    let current_unicode_seq: Vec<u16> = substr(text, i, 1).encode_utf16().collect();
103
104                    if let Some(entries) =
105                        unicode_map.get_source_codes_for_unicode(&current_unicode_seq)
106                    {
107                        if let Some(entry) = entries.first() {
108                            // TODO: Add logic to pick the best entry if multiple
109                            let mut bytes_for_code = Vec::new();
110                            let val = entry.source_code;
111                            match entry.code_len {
112                                1 => bytes_for_code.push(val as u8),
113                                2 => bytes_for_code.extend_from_slice(&(val as u16).to_be_bytes()),
114                                3 => {
115                                    bytes_for_code.push((val >> 16) as u8);
116                                    bytes_for_code.push((val >> 8) as u8);
117                                    bytes_for_code.push(val as u8);
118                                }
119                                4 => bytes_for_code.extend_from_slice(&val.to_be_bytes()),
120                                _ => { /* Should not happen */ }
121                            }
122                            result_bytes.extend(bytes_for_code);
123                        } else {
124                            // No specific entry, handle as unmappable
125                            log::warn!(
126                                "Unicode sequence {current_unicode_seq:04X?} found in map but no entries, skipping."
127                            );
128                        }
129                    } else {
130                        // Character or sequence not found in CMap
131                        log::warn!(
132                            "Unicode sequence {current_unicode_seq:04X?} not found in ToUnicode CMap, skipping."
133                        );
134                    }
135                    i += 1;
136                }
137                result_bytes
138            }
139            Self::SimpleEncoding(_) => {
140                debug!("Unknown encoding used to encode text {self:?}");
141                text.as_bytes().to_vec()
142            }
143        }
144    }
145}
146
/// Encodes `text` as UTF-16BE, prefixed with the byte order mark `FE FF`.
///
/// This is the recommended way to encode text strings: it covers all of
/// Unicode and all major PDF readers support it.
pub fn encode_utf16_be(text: &str) -> Vec<u8> {
    // The BOM is emitted as the first code unit so readers recognise the
    // string as UTF-16BE.
    std::iter::once(0xFEFF_u16)
        .chain(text.encode_utf16())
        .flat_map(u16::to_be_bytes)
        .collect()
}
158
/// Encodes `text` as UTF-8, prefixed with the byte order mark `EF BB BF`.
///
/// UTF-8 text strings were first specified in PDF 2.0 and reader support is
/// still lacking (notably, Adobe Acrobat Reader doesn't support it at the time
/// of writing). Thus, using it is **NOT RECOMMENDED**.
pub fn encode_utf8(text: &str) -> Vec<u8> {
    // The BOM marks the string as UTF-8 encoded.
    const UTF8_BOM: &[u8] = b"\xEF\xBB\xBF";
    [UTF8_BOM, text.as_bytes()].concat()
}
169
#[cfg(test)]
mod tests {
    use super::*;

    /// A two-byte source code must be resolved as a whole: the leading `0x00`
    /// byte on its own must not produce output.
    #[test]
    fn unicode_with_2byte_code_does_not_convert_single_bytes() {
        let mut cmap = ToUnicodeCMap::new();
        // Two ranges of two-byte codes, each mapped with a zero offset.
        for (first, last) in [(0x0000, 0x0002), (0x0024, 0x0025)] {
            cmap.put(
                first,
                last,
                2,
                cmap::BfRangeTarget::UTF16CodePoint { offset: 0 },
            );
        }

        let encoding = Encoding::UnicodeMapEncoding(cmap);
        let decoded = encoding.bytes_to_string(&[0x00, 0x24]).unwrap();

        assert_eq!(decoded, "\u{0024}");
    }
}