Skip to main content

pdf_lib_rs/core/objects/
pdf_string.rs

1use std::fmt;
2use crate::core::syntax::CharCodes;
3use crate::utils::{
4    copy_string_into_buffer, has_utf16_bom, pdf_doc_encoding_decode, utf16_decode,
5};
6use super::pdf_object::PdfObjectTrait;
7
/// A PDF literal string object, e.g., `(Hello World)`.
///
/// Only the text between the enclosing parentheses is stored. It is kept
/// exactly as supplied: escape sequences such as `\n` or `\101` stay in their
/// escaped, textual form and are only interpreted on demand by the decoding
/// methods.
///
/// Equality and hashing compare the raw stored text, so two strings that
/// decode to the same bytes but are escaped differently are *not* equal.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct PdfString {
    /// Raw contents of the literal string, without the surrounding parentheses.
    value: String,
}
13
14impl PdfString {
15    pub fn of(value: &str) -> Self {
16        PdfString {
17            value: value.to_string(),
18        }
19    }
20
21    /// Get the raw string value (without parentheses).
22    pub fn as_string(&self) -> &str {
23        &self.value
24    }
25
26    /// Convert the string to raw bytes, interpreting escape sequences.
27    /// Characters are treated as Latin-1 code points (0..=255), not UTF-8.
28    pub fn as_bytes_decoded(&self) -> Vec<u8> {
29        let mut bytes = Vec::new();
30        let chars: Vec<u8> = self.value.chars().map(|c| c as u8).collect();
31        let mut i = 0;
32        let mut escaped = false;
33        let mut octal = String::new();
34
35        while i < chars.len() {
36            let byte = chars[i];
37            let next_byte = chars.get(i + 1).copied();
38
39            if !escaped {
40                if byte == CharCodes::BackSlash {
41                    escaped = true;
42                } else {
43                    bytes.push(byte);
44                }
45            } else {
46                match byte {
47                    CharCodes::Newline | CharCodes::CarriageReturn => {
48                        // Escaped line break - ignore
49                        escaped = false;
50                    }
51                    b'n' => {
52                        bytes.push(CharCodes::Newline);
53                        escaped = false;
54                    }
55                    b'r' => {
56                        bytes.push(CharCodes::CarriageReturn);
57                        escaped = false;
58                    }
59                    b't' => {
60                        bytes.push(CharCodes::Tab);
61                        escaped = false;
62                    }
63                    b'b' => {
64                        bytes.push(CharCodes::Backspace);
65                        escaped = false;
66                    }
67                    b'f' => {
68                        bytes.push(CharCodes::FormFeed);
69                        escaped = false;
70                    }
71                    CharCodes::LeftParen => {
72                        bytes.push(CharCodes::LeftParen);
73                        escaped = false;
74                    }
75                    CharCodes::RightParen => {
76                        bytes.push(CharCodes::RightParen);
77                        escaped = false;
78                    }
79                    CharCodes::BackSlash => {
80                        bytes.push(CharCodes::BackSlash);
81                        escaped = false;
82                    }
83                    b'0'..=b'7' => {
84                        octal.push(byte as char);
85                        if octal.len() == 3
86                            || !matches!(next_byte, Some(b'0'..=b'7'))
87                        {
88                            if let Ok(val) = u8::from_str_radix(&octal, 8) {
89                                bytes.push(val);
90                            }
91                            octal.clear();
92                            escaped = false;
93                        }
94                    }
95                    _ => {
96                        bytes.push(byte);
97                        escaped = false;
98                    }
99                }
100            }
101            i += 1;
102        }
103
104        bytes
105    }
106
107    /// Decode the string to a text String, handling UTF-16 and PDFDocEncoding.
108    pub fn decode_text(&self) -> String {
109        let bytes = self.as_bytes_decoded();
110        if has_utf16_bom(&bytes) {
111            utf16_decode(&bytes)
112        } else {
113            pdf_doc_encoding_decode(&bytes)
114        }
115    }
116}
117
118impl fmt::Display for PdfString {
119    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
120        write!(f, "({})", self.value)
121    }
122}
123
124impl PdfObjectTrait for PdfString {
125    fn size_in_bytes(&self) -> usize {
126        self.value.len() + 2
127    }
128
129    fn copy_bytes_into(&self, buffer: &mut [u8], offset: usize) -> usize {
130        let mut off = offset;
131        buffer[off] = CharCodes::LeftParen;
132        off += 1;
133        off += copy_string_into_buffer(&self.value, buffer, off);
134        buffer[off] = CharCodes::RightParen;
135        self.value.len() + 2
136    }
137}
138
#[cfg(test)]
mod tests {
    use super::*;
    use crate::utils::typed_array_for;

    /// Construction accepts plain text as well as unbalanced parentheses
    /// and backslashes.
    #[test]
    fn can_be_constructed() {
        for raw in ["foobar", " (foo(bar))", ")b\\a/z("] {
            let _ = PdfString::of(raw);
        }
    }

    /// `as_string` hands back exactly what was stored.
    #[test]
    fn can_be_converted_to_raw_string() {
        let pdf_string = PdfString::of("foobar");
        assert_eq!(pdf_string.as_string(), "foobar");
    }

    /// A clone renders the same as its source.
    #[test]
    fn can_be_cloned() {
        let source = PdfString::of(")b\\a/z(");
        let duplicate = source.clone();
        assert_eq!(duplicate.to_string(), source.to_string());
    }

    /// `Display` wraps the value in parentheses.
    #[test]
    fn can_be_converted_to_string() {
        let rendered = PdfString::of("foobar").to_string();
        assert_eq!(rendered, "(foobar)");
    }

    /// Backslashes pass through `Display` untouched.
    #[test]
    fn does_not_escape_backslashes() {
        let rendered = PdfString::of("Foo\\Bar\\Qux").to_string();
        assert_eq!(rendered, "(Foo\\Bar\\Qux)");
    }

    /// Nested parentheses pass through `Display` untouched.
    #[test]
    fn does_not_escape_nested_parenthesis() {
        let rendered = PdfString::of("(Foo((Bar))Qux)").to_string();
        assert_eq!(rendered, "((Foo((Bar))Qux))");
    }

    /// Three-digit octal escapes decode to the corresponding byte values.
    #[test]
    fn can_interpret_escaped_octal_codes() {
        let escaped =
            "\\376\\377\\000\\105\\000\\147\\000\\147\\000\\040\\330\\074\\337\\163";
        let expected: Vec<u8> = vec![
            0o376, 0o377, 0o000, 0o105, 0o000, 0o147, 0o000, 0o147, 0o000, 0o040,
            0o330, 0o074, 0o337, 0o163,
        ];
        assert_eq!(PdfString::of(escaped).as_bytes_decoded(), expected);
    }

    /// Raw line breaks are kept, escaped line breaks are dropped.
    #[test]
    fn can_interpret_eols_and_line_breaks() {
        let escaped = "a\nb\rc\\\nd\\\re";
        let expected: Vec<u8> = vec![b'a', b'\n', b'b', b'\r', b'c', b'd', b'e'];
        assert_eq!(PdfString::of(escaped).as_bytes_decoded(), expected);
    }

    /// An unknown escape drops the backslash and keeps the character.
    #[test]
    fn can_interpret_invalid_escapes() {
        let escaped = "a\nb\rc\\xd\\;";
        let expected: Vec<u8> = vec![b'a', b'\n', b'b', b'\r', b'c', b'x', b'd', b';'];
        assert_eq!(PdfString::of(escaped).as_bytes_decoded(), expected);
    }

    /// The reported size is the value length plus the two delimiters.
    #[test]
    fn can_provide_size_in_bytes() {
        let cases = [("foobar", 8), (" (foo(bar))", 13), (")b\\a/z(", 9)];
        for (raw, expected_size) in cases {
            assert_eq!(PdfString::of(raw).size_in_bytes(), expected_size);
        }
    }

    /// Serialization writes at the requested offset and leaves the rest of
    /// the buffer alone.
    #[test]
    fn can_be_serialized() {
        let mut buffer = vec![b' '; 20];
        let written = PdfString::of(")(b\\a/))z(").copy_bytes_into(&mut buffer, 3);
        assert_eq!(written, 12);
        assert_eq!(buffer, typed_array_for("   ()(b\\a/))z()     "));
    }

    /// A leading byte-order mark selects UTF-16 decoding.
    #[test]
    fn can_decode_utf16be_strings() {
        let escaped =
            "\\376\\377\\000\\105\\000\\147\\000\\147\\000\\040\\330\\074\\337\\163";
        let decoded = PdfString::of(escaped).decode_text();
        assert_eq!(decoded, "Egg 🍳");
    }

    /// Without a byte-order mark the bytes are read as PDFDocEncoding.
    #[test]
    fn can_decode_pdfdocencoded_strings() {
        let escaped = "a\\105b\\163\\0b6";
        let decoded = PdfString::of(escaped).decode_text();
        assert_eq!(decoded, "aEbs\0b6");
    }
}