lopdf/common_data_structures/
mod.rs

1use crate::{
2    encodings::{self, bytes_to_string},
3    Error, Object, Result, StringFormat,
4};
5
6/// Creates a text string.
7/// If the input only contains ASCII characters, the string is encoded
8/// in PDFDocEncoding, otherwise in UTF-16BE.
9pub fn text_string(text: &str) -> Object {
10    if text.is_ascii() {
11        return Object::String(text.into(), StringFormat::Literal);
12    }
13    Object::String(encodings::encode_utf16_be(text), StringFormat::Hexadecimal)
14}
15
16/// Decodes a text string.
17/// Depending on the BOM at the start of the string, a different encoding is chosen.
18/// All encodings specified in PDF2.0 are supported (PDFDocEncoding, UTF-16BE,
19/// and UTF-8).
20pub fn decode_text_string(obj: &Object) -> Result<String> {
21    let s = obj.as_str()?;
22    if s.starts_with(b"\xFE\xFF") {
23        // Detected UTF-16BE BOM
24        String::from_utf16(
25            &s[2..]
26                .chunks(2)
27                .map(|c| {
28                    if c.len() == 1 {
29                        u16::from_be_bytes([c[0], 0])
30                    } else {
31                        u16::from_be_bytes(c.try_into().unwrap())
32                    }
33                })
34                .collect::<Vec<u16>>(),
35        )
36        .map_err(|_| Error::TextStringDecode)
37    } else if s.starts_with(b"\xEF\xBB\xBF") {
38        // Detected UTF-8 BOM
39        String::from_utf8(s.to_vec()).map_err(|_| Error::TextStringDecode)
40    } else {
41        // If neither BOM is detected, PDFDocEncoding is used
42        Ok(bytes_to_string(&encodings::PDF_DOC_ENCODING, s))
43    }
44}
45
#[cfg(test)]
mod test {
    use crate::{
        common_data_structures::decode_text_string, encodings, parser::ParserInput, text_string, writer::Writer,
        Object, StringFormat,
    };

    /// Wraps `value` in a dictionary under the name `Key` and returns the
    /// bytes produced by the writer for that dictionary.
    fn write_key_dict(value: Object) -> Vec<u8> {
        let dict = Object::Dictionary(dictionary!(
            "Key" => value,
        ));
        let mut out = vec![];
        Writer::write_object(&mut out, &dict).unwrap();
        out
    }

    /// Parses `raw` as a direct dictionary object and decodes the text string
    /// stored under the name `Key`.
    fn decode_key(raw: &[u8]) -> String {
        let parsed = crate::parser::direct_object(ParserInput::new_extra(raw, "")).unwrap();
        let dict = parsed.as_dict().unwrap();
        decode_text_string(dict.get(b"Key").unwrap()).unwrap()
    }

    #[test]
    fn spec_example1_encode() {
        let encoded = encodings::string_to_bytes(&encodings::PDF_DOC_ENCODING, "text‰");
        let written = write_key_dict(Object::String(encoded, StringFormat::Literal));
        // The raw byte "\x8B" is equivalent to the escaped form "\\213" used
        // in the original example.
        let expected = b"<</Key(text\x8B)>>";
        assert_eq!(written.as_slice(), expected);
    }

    #[test]
    fn spec_example1_decode() {
        let decoded = decode_key(b"<</Key(text\\213)>>");
        assert_eq!(decoded, "text‰");
    }

    #[test]
    fn spec_example2_encode() {
        // Russian for "test"
        let written = write_key_dict(text_string("тест"));
        let expected = b"<</Key<FEFF0442043504410442>>>";
        assert_eq!(written.as_slice(), expected);
    }

    #[test]
    fn spec_example2_decode() {
        let decoded = decode_key(b"<</Key<FEFF0442043504410442>>>");
        // Russian for "test"
        assert_eq!(decoded, "тест");
    }
}