Skip to main content

hayro_syntax/object/
string.rs

1//! Strings.
2
3use crate::crypto::DecryptionTarget;
4use crate::filter::ascii_hex;
5use crate::object::Object;
6use crate::object::macros::object;
7use crate::reader::Reader;
8use crate::reader::{Readable, ReaderContext, ReaderExt, Skippable};
9use crate::trivia::is_white_space_character;
10use alloc::vec::Vec;
11use core::borrow::Borrow;
12use core::hash::{Hash, Hasher};
13use core::ops::Deref;
14use smallvec::SmallVec;
15
/// Backing storage for a [`String`].
///
/// Either a view into the input data or an owned buffer holding bytes that
/// had to be produced by decoding.
#[derive(Clone)]
enum StringInner<'a> {
    /// Bytes borrowed directly from the input; used when the raw bytes can be
    /// taken as-is without any decoding.
    Borrowed(&'a [u8]),
    /// Owned bytes produced by decoding (hex strings, literal strings with
    /// escapes, decrypted data). Up to 23 bytes are stored inline.
    Owned(SmallVec<[u8; 23]>),
}
21
22impl AsRef<[u8]> for StringInner<'_> {
23    fn as_ref(&self) -> &[u8] {
24        match self {
25            Self::Borrowed(data) => data,
26            Self::Owned(data) => data,
27        }
28    }
29}
30
/// A PDF string object.
///
/// Holds the string's bytes (see [`String::as_bytes`]); they are either
/// borrowed from the input or owned, depending on how the string was read.
/// Equality and hashing are based on the bytes only.
#[derive(Clone)]
pub struct String<'a>(StringInner<'a>);
34
35impl<'a> String<'a> {
36    /// Returns the string data as a byte slice.
37    pub fn as_bytes(&self) -> &[u8] {
38        self.as_ref()
39    }
40}
41
42impl Deref for String<'_> {
43    type Target = [u8];
44
45    fn deref(&self) -> &Self::Target {
46        self.as_ref()
47    }
48}
49
50impl AsRef<[u8]> for String<'_> {
51    fn as_ref(&self) -> &[u8] {
52        match &self.0 {
53            StringInner::Borrowed(data) => data,
54            StringInner::Owned(data) => data,
55        }
56    }
57}
58
59impl Borrow<[u8]> for String<'_> {
60    fn borrow(&self) -> &[u8] {
61        self.as_ref()
62    }
63}
64
65impl PartialEq for String<'_> {
66    fn eq(&self, other: &Self) -> bool {
67        self.as_ref() == other.as_ref()
68    }
69}
70
71impl Eq for String<'_> {}
72
73impl Hash for String<'_> {
74    fn hash<H: Hasher>(&self, state: &mut H) {
75        self.as_ref().hash(state);
76    }
77}
78
79impl core::fmt::Debug for String<'_> {
80    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
81        <[u8] as core::fmt::Debug>::fmt(self.as_ref(), f)
82    }
83}
84
// Invokes the crate's `object!` macro for `String<'a>` under the name `String`.
// NOTE(review): presumably this wires the type into `Object` (conversions /
// casting) — confirm against the macro definition in `crate::object::macros`.
object!(String<'a>, String);
86
87impl Skippable for String<'_> {
88    fn skip(r: &mut Reader<'_>, _: bool) -> Option<()> {
89        match r.peek_byte()? {
90            b'<' => skip_hex(r),
91            b'(' => skip_literal(r),
92            _ => None,
93        }
94    }
95}
96
97impl<'a> Readable<'a> for String<'a> {
98    fn read(r: &mut Reader<'a>, ctx: &ReaderContext<'a>) -> Option<Self> {
99        let decoded = match r.peek_byte()? {
100            b'<' => StringInner::Owned(read_hex(r)?),
101            b'(' => read_literal(r)?,
102            _ => return None,
103        };
104
105        // Apply decryption if needed.
106        let final_data = if ctx.xref().needs_decryption(ctx) {
107            if let Some(obj_number) = ctx.obj_number() {
108                ctx.xref()
109                    .decrypt(obj_number, decoded.as_ref(), DecryptionTarget::String)
110                    .map(StringInner::from)
111                    .unwrap_or(decoded)
112            } else {
113                decoded
114            }
115        } else {
116            decoded
117        };
118
119        Some(Self(final_data))
120    }
121}
122
123impl From<Vec<u8>> for StringInner<'_> {
124    fn from(value: Vec<u8>) -> Self {
125        Self::Owned(SmallVec::from_vec(value))
126    }
127}
128
129fn skip_hex(r: &mut Reader<'_>) -> Option<()> {
130    r.forward_tag(b"<")?;
131    while let Some(b) = r.peek_byte() {
132        let is_hex = b.is_ascii_hexdigit();
133        let is_whitespace = is_white_space_character(b);
134
135        if !is_hex && !is_whitespace {
136            break;
137        }
138
139        r.read_byte()?;
140    }
141    r.forward_tag(b">")?;
142
143    Some(())
144}
145
146fn read_hex(r: &mut Reader<'_>) -> Option<SmallVec<[u8; 23]>> {
147    let start = r.offset();
148    skip_hex(r)?;
149    let end = r.offset();
150
151    // Exclude outer brackets.
152    let raw = r.range(start + 1..end - 1)?;
153    let decoded = ascii_hex::decode_into(raw)?;
154
155    Some(decoded)
156}
157
158fn skip_literal(r: &mut Reader<'_>) -> Option<()> {
159    r.forward_tag(b"(")?;
160    let mut bracket_counter = 1;
161
162    while bracket_counter > 0 {
163        let byte = r.read_byte()?;
164
165        match byte {
166            b'\\' => {
167                let _ = r.read_byte()?;
168            }
169            b'(' => bracket_counter += 1,
170            b')' => bracket_counter -= 1,
171            _ => {}
172        };
173    }
174
175    Some(())
176}
177
178fn read_literal<'a>(r: &mut Reader<'a>) -> Option<StringInner<'a>> {
179    let start = r.offset();
180    skip_literal(r)?;
181    let end = r.offset();
182
183    // Exclude outer parentheses.
184    let data = r.range(start + 1..end - 1)?;
185
186    if !data.iter().any(|b| matches!(b, b'\\' | b'\n' | b'\r')) {
187        return Some(StringInner::Borrowed(data));
188    }
189
190    let mut r = Reader::new(data);
191    let mut result = SmallVec::new();
192
193    while let Some(byte) = r.read_byte() {
194        match byte {
195            b'\\' => {
196                let next = r.read_byte()?;
197
198                if is_octal_digit(next) {
199                    let second = r.read_byte();
200                    let third = r.read_byte();
201
202                    let bytes = match (second, third) {
203                        (Some(n1), Some(n2)) => match (is_octal_digit(n1), is_octal_digit(n2)) {
204                            (true, true) => [next, n1, n2],
205                            (true, _) => {
206                                r.jump(r.offset() - 1);
207                                [b'0', next, n1]
208                            }
209                            _ => {
210                                r.jump(r.offset() - 2);
211                                [b'0', b'0', next]
212                            }
213                        },
214                        (Some(n1), None) => {
215                            if is_octal_digit(n1) {
216                                [b'0', next, n1]
217                            } else {
218                                r.jump(r.offset() - 1);
219                                [b'0', b'0', next]
220                            }
221                        }
222                        _ => [b'0', b'0', next],
223                    };
224
225                    let str = core::str::from_utf8(&bytes).unwrap();
226
227                    if let Ok(num) = u8::from_str_radix(str, 8) {
228                        result.push(num);
229                    } else {
230                        warn!("overflow occurred while parsing octal literal string");
231                    }
232                } else {
233                    match next {
234                        b'n' => result.push(0xA),
235                        b'r' => result.push(0xD),
236                        b't' => result.push(0x9),
237                        b'b' => result.push(0x8),
238                        b'f' => result.push(0xC),
239                        b'(' => result.push(b'('),
240                        b')' => result.push(b')'),
241                        b'\\' => result.push(b'\\'),
242                        b'\n' | b'\r' => {
243                            // A conforming reader shall disregard the REVERSE SOLIDUS
244                            // and the end-of-line marker following it when reading
245                            // the string; the resulting string value shall be
246                            // identical to that which would be read if the string
247                            // were not split.
248                            r.skip_eol_characters();
249                        }
250                        _ => result.push(next),
251                    }
252                }
253            }
254            b'(' | b')' => result.push(byte),
255            // An end-of-line marker appearing within a literal string
256            // without a preceding REVERSE SOLIDUS shall be treated as
257            // a byte value of (0Ah), irrespective of whether the end-of-line
258            // marker was a CARRIAGE RETURN (0Dh), a LINE FEED (0Ah), or both.
259            b'\n' | b'\r' => {
260                result.push(b'\n');
261                r.skip_eol_characters();
262            }
263            other => result.push(other),
264        }
265    }
266
267    Some(StringInner::Owned(result))
268}
269
/// Returns `true` if `byte` is an ASCII octal digit (`0` through `7`).
fn is_octal_digit(byte: u8) -> bool {
    (b'0'..=b'7').contains(&byte)
}
273
#[cfg(test)]
mod tests {
    use crate::object::String;
    use crate::reader::Reader;
    use crate::reader::ReaderExt;

    /// Reads a single PDF string from `input` without a reader context.
    fn parse(input: &[u8]) -> Option<String<'_>> {
        Reader::new(input).read_without_context::<String<'_>>()
    }

    // Hex strings.

    #[test]
    fn hex_string_empty() {
        assert_eq!(parse(b"<>").unwrap().as_bytes(), b"");
    }

    #[test]
    fn hex_string_1() {
        assert_eq!(
            parse(b"<00010203>").unwrap().as_bytes(),
            &[0x00, 0x01, 0x02, 0x03]
        );
    }

    // An odd number of digits: the last digit is padded with a zero nibble.
    #[test]
    fn hex_string_2() {
        assert_eq!(
            parse(b"<000102034>").unwrap().as_bytes(),
            &[0x00, 0x01, 0x02, 0x03, 0x40]
        );
    }

    #[test]
    fn hex_string_trailing_1() {
        assert_eq!(
            parse(b"<000102034>dfgfg4").unwrap().as_bytes(),
            &[0x00, 0x01, 0x02, 0x03, 0x40]
        );
    }

    // White space between digits is ignored.
    #[test]
    fn hex_string_trailing_2() {
        assert_eq!(
            parse(b"<1  3 4>dfgfg4").unwrap().as_bytes(),
            &[0x13, 0x40]
        );
    }

    #[test]
    fn hex_string_trailing_3() {
        assert_eq!(parse(b"<1>dfgfg4").unwrap().as_bytes(), &[0x10]);
    }

    #[test]
    fn hex_string_invalid_1() {
        assert!(parse(b"<").is_none());
    }

    #[test]
    fn hex_string_invalid_2() {
        assert!(parse(b"34AD").is_none());
    }

    // Literal strings.

    #[test]
    fn literal_string_empty() {
        assert_eq!(parse(b"()").unwrap().as_bytes(), b"");
    }

    #[test]
    fn literal_string_1() {
        assert_eq!(parse(b"(Hi there.)").unwrap().as_bytes(), b"Hi there.");
    }

    // An out-of-range octal escape must not make the whole string fail.
    #[test]
    fn literal_string_2() {
        assert!(parse(b"(Hi \\777)").is_some());
    }

    // The first unbalanced `)` terminates the string.
    #[test]
    fn literal_string_3() {
        assert_eq!(parse(b"(Hi ) there.)").unwrap().as_bytes(), b"Hi ");
    }

    // Balanced parentheses are part of the string.
    #[test]
    fn literal_string_4() {
        assert_eq!(
            parse(b"(Hi (()) there)").unwrap().as_bytes(),
            b"Hi (()) there"
        );
    }

    #[test]
    fn literal_string_5() {
        assert_eq!(parse(b"(Hi \\()").unwrap().as_bytes(), b"Hi (");
    }

    // Backslash followed by a newline is a line continuation.
    #[test]
    fn literal_string_6() {
        assert_eq!(parse(b"(Hi \\\nthere)").unwrap().as_bytes(), b"Hi there");
    }

    // Only three digits belong to the octal escape.
    #[test]
    fn literal_string_7() {
        assert_eq!(parse(b"(Hi \\05354)").unwrap().as_bytes(), b"Hi +54");
    }

    #[test]
    fn literal_string_8() {
        assert_eq!(parse(b"(\\3)").unwrap().as_bytes(), b"\x03");
    }

    #[test]
    fn literal_string_9() {
        assert_eq!(parse(b"(\\36)").unwrap().as_bytes(), b"\x1e");
    }

    #[test]
    fn literal_string_10() {
        assert_eq!(parse(b"(\\36ab)").unwrap().as_bytes(), b"\x1eab");
    }

    #[test]
    fn literal_string_11() {
        assert_eq!(parse(b"(\\00Y)").unwrap().as_bytes(), b"\0Y");
    }

    #[test]
    fn literal_string_12() {
        assert_eq!(parse(b"(\\0Y)").unwrap().as_bytes(), b"\0Y");
    }

    #[test]
    fn literal_string_trailing() {
        assert_eq!(
            parse(b"(Hi there.)abcde").unwrap().as_bytes(),
            b"Hi there."
        );
    }

    // `8` is not an octal digit, so the escape ends after `77`.
    #[test]
    fn literal_string_invalid() {
        assert_eq!(parse(b"(Hi \\778)").unwrap().as_bytes(), b"Hi \x3F8");
    }

    // Both syntaxes through the common entry point.

    #[test]
    fn string_1() {
        assert_eq!(parse(b"(Hi there.)").unwrap().as_bytes(), b"Hi there.");
    }

    #[test]
    fn string_2() {
        assert_eq!(
            parse(b"<00010203>").unwrap().as_bytes(),
            &[0x00, 0x01, 0x02, 0x03]
        );
    }
}