Skip to main content

pdf_syntax/object/
string.rs

1//! Strings.
2
3use crate::crypto::DecryptionTarget;
4use crate::filter::ascii_hex;
5use crate::object::Object;
6use crate::object::macros::object;
7use crate::reader::Reader;
8use crate::reader::{Readable, ReaderContext, ReaderExt, Skippable};
9use core::ops::Deref;
10use log::warn;
11use smallvec::SmallVec;
12
/// Backing storage for [`String`]: a byte vector with 23 bytes of inline
/// capacity, so short strings do not need a heap allocation.
type StringInner = SmallVec<[u8; 23]>;
14
/// A PDF string object.
///
/// Wraps the string's bytes as an owned buffer. Both literal (`(...)`) and
/// hexadecimal (`<...>`) source forms end up in this same representation, so
/// equality and hashing operate on the stored bytes only.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub struct String(StringInner);
18
19impl String {
20    /// Returns the string data as a byte slice.
21    pub fn as_bytes(&self) -> &[u8] {
22        &self.0
23    }
24}
25
26impl Deref for String {
27    type Target = [u8];
28
29    fn deref(&self) -> &Self::Target {
30        &self.0
31    }
32}
33
34impl AsRef<[u8]> for String {
35    fn as_ref(&self) -> &[u8] {
36        &self.0
37    }
38}
39
// Invokes the crate-local `object!` macro for `String`.
// NOTE(review): presumably this wires `String` into the `Object` enum
// (conversions/casts) — confirm against `crate::object::macros`.
object!(String, String);
41
42impl Skippable for String {
43    fn skip(r: &mut Reader<'_>, _: bool) -> Option<()> {
44        match r.peek_byte()? {
45            b'<' => skip_hex(r),
46            b'(' => skip_literal(r),
47            _ => None,
48        }
49    }
50}
51
52impl Readable<'_> for String {
53    fn read(r: &mut Reader<'_>, ctx: &ReaderContext<'_>) -> Option<Self> {
54        let decoded = match r.peek_byte()? {
55            b'<' => read_hex(r)?,
56            b'(' => read_literal(r)?,
57            _ => return None,
58        };
59
60        // Apply decryption if needed.
61        let final_data = if ctx.xref().needs_decryption(ctx) {
62            if let Some(obj_number) = ctx.obj_number() {
63                ctx.xref()
64                    .decrypt(obj_number, &decoded, DecryptionTarget::String)
65                    .map(SmallVec::from_vec)
66                    .unwrap_or(decoded)
67            } else {
68                decoded
69            }
70        } else {
71            decoded
72        };
73
74        Some(Self(final_data))
75    }
76}
77
78fn skip_hex(r: &mut Reader<'_>) -> Option<()> {
79    r.forward_tag(b"<")?;
80    // Consume all bytes until '>' — non-hex bytes are tolerated per Adobe Reader
81    // behaviour. Some PDFs embed binary data in /ID hex strings (e.g. bytes like
82    // 0xAE, 'I') which are technically invalid but must not break dict parsing.
83    loop {
84        match r.peek_byte()? {
85            b'>' => break,
86            _ => {
87                r.read_byte()?;
88            }
89        }
90    }
91    r.forward_tag(b">")?;
92
93    Some(())
94}
95
96fn read_hex(r: &mut Reader<'_>) -> Option<StringInner> {
97    let start = r.offset();
98    skip_hex(r)?;
99    let end = r.offset();
100
101    // Exclude outer brackets.
102    let raw = r.range(start + 1..end - 1)?;
103    // Non-hex bytes produce None from ascii_hex::decode; return empty string
104    // rather than failing so dict parsing stays intact (e.g. corrupt /ID arrays).
105    let decoded = ascii_hex::decode(raw).unwrap_or_default();
106
107    Some(SmallVec::from_vec(decoded))
108}
109
110fn skip_literal(r: &mut Reader<'_>) -> Option<()> {
111    r.forward_tag(b"(")?;
112    let mut bracket_counter = 1;
113
114    while bracket_counter > 0 {
115        let byte = r.read_byte()?;
116
117        match byte {
118            b'\\' => {
119                let _ = r.read_byte()?;
120            }
121            b'(' => bracket_counter += 1,
122            b')' => bracket_counter -= 1,
123            _ => {}
124        };
125    }
126
127    Some(())
128}
129
130fn read_literal(r: &mut Reader<'_>) -> Option<StringInner> {
131    let start = r.offset();
132    skip_literal(r)?;
133    let end = r.offset();
134
135    // Exclude outer parentheses.
136    let data = r.range(start + 1..end - 1)?;
137
138    let mut r = Reader::new(data);
139    let mut result = SmallVec::new();
140
141    while let Some(byte) = r.read_byte() {
142        match byte {
143            b'\\' => {
144                let next = r.read_byte()?;
145
146                if is_octal_digit(next) {
147                    let second = r.read_byte();
148                    let third = r.read_byte();
149
150                    let bytes = match (second, third) {
151                        (Some(n1), Some(n2)) => match (is_octal_digit(n1), is_octal_digit(n2)) {
152                            (true, true) => [next, n1, n2],
153                            (true, _) => {
154                                r.jump(r.offset() - 1);
155                                [b'0', next, n1]
156                            }
157                            _ => {
158                                r.jump(r.offset() - 2);
159                                [b'0', b'0', next]
160                            }
161                        },
162                        (Some(n1), None) => {
163                            if is_octal_digit(n1) {
164                                [b'0', next, n1]
165                            } else {
166                                r.jump(r.offset() - 1);
167                                [b'0', b'0', next]
168                            }
169                        }
170                        _ => [b'0', b'0', next],
171                    };
172
173                    // All bytes are octal digits (b'0'..=b'7'): compute value directly.
174                    let value = bytes
175                        .iter()
176                        .fold(0u16, |acc, &b| acc * 8 + (b - b'0') as u16);
177                    if value <= 255 {
178                        result.push(value as u8);
179                    } else {
180                        warn!("overflow occurred while parsing octal literal string");
181                    }
182                } else {
183                    match next {
184                        b'n' => result.push(0xA),
185                        b'r' => result.push(0xD),
186                        b't' => result.push(0x9),
187                        b'b' => result.push(0x8),
188                        b'f' => result.push(0xC),
189                        b'(' => result.push(b'('),
190                        b')' => result.push(b')'),
191                        b'\\' => result.push(b'\\'),
192                        b'\n' | b'\r' => {
193                            // A conforming reader shall disregard the REVERSE SOLIDUS
194                            // and the end-of-line marker following it when reading
195                            // the string; the resulting string value shall be
196                            // identical to that which would be read if the string
197                            // were not split.
198                            r.skip_eol_characters();
199                        }
200                        _ => result.push(next),
201                    }
202                }
203            }
204            b'(' | b')' => result.push(byte),
205            // An end-of-line marker appearing within a literal string
206            // without a preceding REVERSE SOLIDUS shall be treated as
207            // a byte value of (0Ah), irrespective of whether the end-of-line
208            // marker was a CARRIAGE RETURN (0Dh), a LINE FEED (0Ah), or both.
209            b'\n' | b'\r' => {
210                result.push(b'\n');
211                r.skip_eol_characters();
212            }
213            other => result.push(other),
214        }
215    }
216
217    Some(result)
218}
219
/// Returns `true` if `byte` is one of the ASCII octal digits `'0'` to `'7'`.
fn is_octal_digit(byte: u8) -> bool {
    (b'0'..=b'7').contains(&byte)
}
223
#[cfg(test)]
mod tests {
    use crate::object::String;
    use crate::reader::Reader;
    use crate::reader::ReaderExt;

    /// Parses `input` as a string object without any reader context.
    fn parse(input: &[u8]) -> Option<String> {
        Reader::new(input).read_without_context::<String>()
    }

    /// Parses `input` and checks that the resulting bytes equal `expected`.
    #[track_caller]
    fn assert_decodes(input: &[u8], expected: &[u8]) {
        assert_eq!(parse(input).unwrap().as_bytes(), expected);
    }

    #[test]
    fn hex_string_empty() {
        assert_decodes(b"<>", b"");
    }

    #[test]
    fn hex_string_1() {
        assert_decodes(b"<00010203>", &[0x00, 0x01, 0x02, 0x03]);
    }

    #[test]
    fn hex_string_2() {
        assert_decodes(b"<000102034>", &[0x00, 0x01, 0x02, 0x03, 0x40]);
    }

    #[test]
    fn hex_string_trailing_1() {
        assert_decodes(b"<000102034>dfgfg4", &[0x00, 0x01, 0x02, 0x03, 0x40]);
    }

    #[test]
    fn hex_string_trailing_2() {
        assert_decodes(b"<1  3 4>dfgfg4", &[0x13, 0x40]);
    }

    #[test]
    fn hex_string_trailing_3() {
        assert_decodes(b"<1>dfgfg4", &[0x10]);
    }

    #[test]
    fn hex_string_invalid_1() {
        assert!(parse(b"<").is_none());
    }

    #[test]
    fn hex_string_invalid_2() {
        assert!(parse(b"34AD").is_none());
    }

    #[test]
    fn literal_string_empty() {
        assert_decodes(b"()", b"");
    }

    #[test]
    fn literal_string_1() {
        assert_decodes(b"(Hi there.)", b"Hi there.");
    }

    #[test]
    fn literal_string_2() {
        assert!(parse(b"(Hi \\777)").is_some());
    }

    #[test]
    fn literal_string_3() {
        assert_decodes(b"(Hi ) there.)", b"Hi ");
    }

    #[test]
    fn literal_string_4() {
        assert_decodes(b"(Hi (()) there)", b"Hi (()) there");
    }

    #[test]
    fn literal_string_5() {
        assert_decodes(b"(Hi \\()", b"Hi (");
    }

    #[test]
    fn literal_string_6() {
        assert_decodes(b"(Hi \\\nthere)", b"Hi there");
    }

    #[test]
    fn literal_string_7() {
        assert_decodes(b"(Hi \\05354)", b"Hi +54");
    }

    #[test]
    fn literal_string_8() {
        assert_decodes(b"(\\3)", b"\x03");
    }

    #[test]
    fn literal_string_9() {
        assert_decodes(b"(\\36)", b"\x1e");
    }

    #[test]
    fn literal_string_10() {
        assert_decodes(b"(\\36ab)", b"\x1eab");
    }

    #[test]
    fn literal_string_11() {
        assert_decodes(b"(\\00Y)", b"\0Y");
    }

    #[test]
    fn literal_string_12() {
        assert_decodes(b"(\\0Y)", b"\0Y");
    }

    #[test]
    fn literal_string_trailing() {
        assert_decodes(b"(Hi there.)abcde", b"Hi there.");
    }

    #[test]
    fn literal_string_invalid() {
        assert_decodes(b"(Hi \\778)", b"Hi \x3F8");
    }

    #[test]
    fn string_1() {
        assert_decodes(b"(Hi there.)", b"Hi there.");
    }

    #[test]
    fn string_2() {
        assert_decodes(b"<00010203>", &[0x00, 0x01, 0x02, 0x03]);
    }
}