hayro_syntax/object/
string.rs

1//! Strings.
2
3use crate::crypto::DecryptionTarget;
4use crate::filter::ascii_hex::decode_hex_string;
5use crate::object::macros::object;
6use crate::object::{Object, ObjectLike};
7use crate::reader::Reader;
8use crate::reader::{Readable, ReaderContext, ReaderExt, Skippable};
9use crate::trivia::is_white_space_character;
10use log::warn;
11use std::borrow::Cow;
12use std::hash::{Hash, Hasher};
13// TODO: Make `HexString` and `LiteralString` own their values.
14
15/// A hex-encoded string.
16#[derive(Clone, Debug)]
17struct HexString<'a>(&'a [u8], bool, ReaderContext<'a>);
18
19impl HexString<'_> {
20    /// Returns the content of the string.
21    fn get(&self) -> Vec<u8> {
22        let decoded = if self.1 {
23            let mut cleaned = Vec::with_capacity(self.0.len() + 1);
24
25            for b in self.0.iter().copied() {
26                if !is_white_space_character(b) {
27                    cleaned.push(b);
28                }
29            }
30
31            if cleaned.len() % 2 != 0 {
32                cleaned.push(b'0');
33            }
34
35            // We made sure while parsing that it is a valid hex string.
36            decode_hex_string(&cleaned).unwrap()
37        } else {
38            // We made sure while parsing that it is a valid hex string.
39            decode_hex_string(self.0).unwrap()
40        };
41
42        if self.2.xref.needs_decryption(&self.2) {
43            self.2
44                .xref
45                .decrypt(
46                    self.2.obj_number.unwrap(),
47                    &decoded,
48                    DecryptionTarget::String,
49                )
50                .unwrap_or_default()
51        } else {
52            decoded
53        }
54    }
55}
56
57impl PartialEq for HexString<'_> {
58    fn eq(&self, other: &Self) -> bool {
59        // TODO: We probably want to ignore escapes.
60        self.0 == other.0 && self.1 == other.1
61    }
62}
63
64impl Skippable for HexString<'_> {
65    fn skip(r: &mut Reader<'_>, _: bool) -> Option<()> {
66        parse_hex(r).map(|_| {})
67    }
68}
69
70impl<'a> Readable<'a> for HexString<'a> {
71    fn read(r: &mut Reader<'a>, ctx: &ReaderContext<'a>) -> Option<Self> {
72        let start = r.offset();
73        let mut dirty = parse_hex(r)?;
74        let end = r.offset();
75
76        // Exclude outer brackets.
77        let result = r.range(start + 1..end - 1).unwrap();
78        dirty |= !result.len().is_multiple_of(2);
79
80        Some(HexString(result, dirty, ctx.clone()))
81    }
82}
83
84impl<'a> TryFrom<Object<'a>> for HexString<'a> {
85    type Error = ();
86
87    fn try_from(value: Object<'a>) -> Result<Self, Self::Error> {
88        match value {
89            Object::String(String(InnerString::Hex(h))) => Ok(h),
90            _ => Err(()),
91        }
92    }
93}
94
95impl<'a> ObjectLike<'a> for HexString<'a> {}
96
97fn parse_hex(r: &mut Reader<'_>) -> Option<bool> {
98    let mut has_whitespace = false;
99
100    r.forward_tag(b"<")?;
101    while let Some(b) = r.peek_byte() {
102        let is_hex = b.is_ascii_hexdigit();
103        let is_whitespace = is_white_space_character(b);
104        has_whitespace |= is_whitespace;
105
106        if !is_hex && !is_whitespace {
107            break;
108        }
109
110        r.read_byte()?;
111    }
112    r.forward_tag(b">")?;
113
114    Some(has_whitespace)
115}
116
117/// A literal string.
118#[derive(Debug, Clone)]
119struct LiteralString<'a>(&'a [u8], bool, ReaderContext<'a>);
120
121impl<'a> LiteralString<'a> {
122    /// Returns the content of the string.
123    fn get(&self) -> Cow<'a, [u8]> {
124        let decoded = if self.1 {
125            let mut cleaned = vec![];
126            let mut r = Reader::new(self.0);
127
128            while let Some(byte) = r.read_byte() {
129                match byte {
130                    b'\\' => {
131                        let next = r.read_byte().unwrap();
132
133                        if is_octal_digit(next) {
134                            let second = r.read_byte();
135                            let third = r.read_byte();
136
137                            let bytes = match (second, third) {
138                                (Some(n1), Some(n2)) => {
139                                    match (is_octal_digit(n1), is_octal_digit(n2)) {
140                                        (true, true) => [next, n1, n2],
141                                        (true, _) => {
142                                            r.jump(r.offset() - 1);
143                                            [b'0', next, n1]
144                                        }
145                                        _ => {
146                                            r.jump(r.offset() - 2);
147                                            [b'0', b'0', next]
148                                        }
149                                    }
150                                }
151                                (Some(n1), None) => {
152                                    if is_octal_digit(n1) {
153                                        [b'0', next, n1]
154                                    } else {
155                                        r.jump(r.offset() - 1);
156                                        [b'0', b'0', next]
157                                    }
158                                }
159                                _ => [b'0', b'0', next],
160                            };
161
162                            let str = std::str::from_utf8(&bytes).unwrap();
163
164                            if let Ok(num) = u8::from_str_radix(str, 8) {
165                                cleaned.push(num);
166                            } else {
167                                warn!("overflow occurred while parsing octal literal string");
168                            }
169                        } else {
170                            match next {
171                                b'n' => cleaned.push(0xA),
172                                b'r' => cleaned.push(0xD),
173                                b't' => cleaned.push(0x9),
174                                b'b' => cleaned.push(0x8),
175                                b'f' => cleaned.push(0xC),
176                                b'(' => cleaned.push(b'('),
177                                b')' => cleaned.push(b')'),
178                                b'\\' => cleaned.push(b'\\'),
179                                b'\n' | b'\r' => {
180                                    // A conforming reader shall disregard the REVERSE SOLIDUS
181                                    // and the end-of-line marker following it when reading
182                                    // the string; the resulting string value shall be
183                                    // identical to that which would be read if the string
184                                    // were not split.
185                                    r.skip_eol_characters();
186                                }
187                                _ => cleaned.push(next),
188                            }
189                        }
190                    }
191                    // An end-of-line marker appearing within a literal string
192                    // without a preceding REVERSE SOLIDUS shall be treated as
193                    // a byte value of (0Ah), irrespective of whether the end-of-line
194                    // marker was a CARRIAGE RETURN (0Dh), a LINE FEED (0Ah), or both.
195                    b'\n' | b'\r' => {
196                        cleaned.push(b'\n');
197                        r.skip_eol_characters();
198                    }
199                    other => cleaned.push(other),
200                }
201            }
202
203            Cow::Owned(cleaned)
204        } else {
205            Cow::Borrowed(self.0)
206        };
207
208        if self.2.xref.needs_decryption(&self.2) {
209            // This might be `None` for example when reading metadata
210            // from the trailer dictionary.
211            if let Some(obj_number) = self.2.obj_number {
212                Cow::Owned(
213                    self.2
214                        .xref
215                        .decrypt(obj_number, &decoded, DecryptionTarget::String)
216                        .unwrap_or_default(),
217                )
218            } else {
219                decoded
220            }
221        } else {
222            decoded
223        }
224    }
225}
226
227impl Hash for LiteralString<'_> {
228    fn hash<H: Hasher>(&self, state: &mut H) {
229        self.0.hash(state);
230        self.1.hash(state);
231    }
232}
233
234impl PartialEq for LiteralString<'_> {
235    fn eq(&self, other: &Self) -> bool {
236        self.0.eq(other.0) && self.1.eq(&other.1)
237    }
238}
239
240impl Skippable for LiteralString<'_> {
241    fn skip(r: &mut Reader<'_>, _: bool) -> Option<()> {
242        parse_literal(r).map(|_| ())
243    }
244}
245
246impl<'a> Readable<'a> for LiteralString<'a> {
247    fn read(r: &mut Reader<'a>, ctx: &ReaderContext<'a>) -> Option<Self> {
248        let start = r.offset();
249        let dirty = parse_literal(r)?;
250        let end = r.offset();
251
252        // Exclude outer brackets
253        let result = r.range(start + 1..end - 1).unwrap();
254
255        Some(LiteralString(result, dirty, ctx.clone()))
256    }
257}
258
259impl<'a> TryFrom<Object<'a>> for LiteralString<'a> {
260    type Error = ();
261
262    fn try_from(value: Object<'a>) -> Result<Self, Self::Error> {
263        match value {
264            Object::String(String(InnerString::Literal(l))) => Ok(l),
265            _ => Err(()),
266        }
267    }
268}
269
270impl<'a> ObjectLike<'a> for LiteralString<'a> {}
271
272fn parse_literal(r: &mut Reader<'_>) -> Option<bool> {
273    r.forward_tag(b"(")?;
274    let mut bracket_counter = 1;
275    let mut dirty = false;
276
277    while bracket_counter > 0 {
278        let byte = r.read_byte()?;
279
280        match byte {
281            b'\\' => {
282                dirty = true;
283
284                let _ = r.read_byte()?;
285            }
286            b'(' => bracket_counter += 1,
287            b')' => bracket_counter -= 1,
288            b'\n' | b'\r' => dirty = true,
289            _ => {}
290        };
291    }
292
293    Some(dirty)
294}
295
296#[derive(Clone, Debug, PartialEq)]
297enum InnerString<'a> {
298    Hex(HexString<'a>),
299    Literal(LiteralString<'a>),
300}
301
302/// A string.
303#[derive(Clone, Debug, PartialEq)]
304pub struct String<'a>(InnerString<'a>);
305
306impl<'a> String<'a> {
307    /// Returns the content of the string.
308    pub fn get(&self) -> Cow<'a, [u8]> {
309        match &self.0 {
310            InnerString::Hex(hex) => Cow::Owned(hex.get()),
311            InnerString::Literal(lit) => lit.get(),
312        }
313    }
314}
315
316impl<'a> From<HexString<'a>> for String<'a> {
317    fn from(value: HexString<'a>) -> Self {
318        Self(InnerString::Hex(value))
319    }
320}
321
322impl<'a> From<LiteralString<'a>> for String<'a> {
323    fn from(value: LiteralString<'a>) -> Self {
324        Self(InnerString::Literal(value))
325    }
326}
327
328object!(String<'a>, String);
329
330impl Skippable for String<'_> {
331    fn skip(r: &mut Reader<'_>, is_content_stream: bool) -> Option<()> {
332        match r.peek_byte()? {
333            b'<' => HexString::skip(r, is_content_stream),
334            b'(' => LiteralString::skip(r, is_content_stream),
335            _ => None,
336        }
337    }
338}
339
340impl<'a> Readable<'a> for String<'a> {
341    fn read(r: &mut Reader<'a>, ctx: &ReaderContext<'a>) -> Option<Self> {
342        let inner = match r.peek_byte()? {
343            b'<' => InnerString::Hex(r.read::<HexString<'_>>(ctx)?),
344            b'(' => InnerString::Literal(r.read::<LiteralString<'_>>(ctx)?),
345            _ => return None,
346        };
347
348        Some(String(inner))
349    }
350}
351
/// Returns `true` if `byte` is one of the ASCII digits `0` through `7`.
fn is_octal_digit(byte: u8) -> bool {
    (b'0'..=b'7').contains(&byte)
}
355
#[cfg(test)]
mod tests {
    use crate::object::string::{HexString, LiteralString, String};
    use crate::reader::Reader;
    use crate::reader::ReaderExt;

    /// Reads a `HexString` from the start of `src` and returns its content,
    /// or `None` if reading fails.
    fn hex(src: &str) -> Option<Vec<u8>> {
        let mut reader = Reader::new(src.as_bytes());
        let parsed = reader.read_without_context::<HexString<'_>>()?;
        Some(parsed.get())
    }

    /// Reads a `LiteralString` from the start of `src` and returns its
    /// content, or `None` if reading fails.
    fn literal(src: &str) -> Option<Vec<u8>> {
        let mut reader = Reader::new(src.as_bytes());
        let parsed = reader.read_without_context::<LiteralString<'_>>()?;
        Some(parsed.get().to_vec())
    }

    /// Reports whether a `LiteralString` can be read from the start of `src`,
    /// without retrieving its content.
    fn literal_is_readable(src: &str) -> bool {
        let mut reader = Reader::new(src.as_bytes());
        reader.read_without_context::<LiteralString<'_>>().is_some()
    }

    /// Reads a `String` from the start of `src` and returns its content, or
    /// `None` if reading fails.
    fn string(src: &str) -> Option<Vec<u8>> {
        let mut reader = Reader::new(src.as_bytes());
        let parsed = reader.read_without_context::<String<'_>>()?;
        Some(parsed.get().to_vec())
    }

    #[test]
    fn hex_string_empty() {
        assert_eq!(hex("<>"), Some(vec![]));
    }

    #[test]
    fn hex_string_1() {
        assert_eq!(hex("<00010203>"), Some(vec![0x00, 0x01, 0x02, 0x03]));
    }

    #[test]
    fn hex_string_2() {
        assert_eq!(
            hex("<000102034>"),
            Some(vec![0x00, 0x01, 0x02, 0x03, 0x40])
        );
    }

    #[test]
    fn hex_string_trailing_1() {
        assert_eq!(
            hex("<000102034>dfgfg4"),
            Some(vec![0x00, 0x01, 0x02, 0x03, 0x40])
        );
    }

    #[test]
    fn hex_string_trailing_2() {
        assert_eq!(hex("<1  3 4>dfgfg4"), Some(vec![0x13, 0x40]));
    }

    #[test]
    fn hex_string_trailing_3() {
        assert_eq!(hex("<1>dfgfg4"), Some(vec![0x10]));
    }

    #[test]
    fn hex_string_invalid_1() {
        assert_eq!(hex("<"), None);
    }

    #[test]
    fn hex_string_invalid_2() {
        assert_eq!(hex("34AD"), None);
    }

    #[test]
    fn literal_string_empty() {
        assert_eq!(literal("()"), Some(b"".to_vec()));
    }

    #[test]
    fn literal_string_1() {
        assert_eq!(literal("(Hi there.)"), Some(b"Hi there.".to_vec()));
    }

    #[test]
    fn literal_string_2() {
        assert!(literal_is_readable("(Hi \\777)"));
    }

    #[test]
    fn literal_string_3() {
        assert_eq!(literal("(Hi ) there.)"), Some(b"Hi ".to_vec()));
    }

    #[test]
    fn literal_string_4() {
        assert_eq!(
            literal("(Hi (()) there)"),
            Some(b"Hi (()) there".to_vec())
        );
    }

    #[test]
    fn literal_string_5() {
        assert_eq!(literal("(Hi \\()"), Some(b"Hi (".to_vec()));
    }

    #[test]
    fn literal_string_6() {
        assert_eq!(literal("(Hi \\\nthere)"), Some(b"Hi there".to_vec()));
    }

    #[test]
    fn literal_string_7() {
        assert_eq!(literal("(Hi \\05354)"), Some(b"Hi +54".to_vec()));
    }

    #[test]
    fn literal_string_8() {
        assert_eq!(string("(\\3)"), Some(b"\x03".to_vec()));
    }

    #[test]
    fn literal_string_9() {
        assert_eq!(string("(\\36)"), Some(b"\x1e".to_vec()));
    }

    #[test]
    fn literal_string_10() {
        assert_eq!(string("(\\36ab)"), Some(b"\x1eab".to_vec()));
    }

    #[test]
    fn literal_string_11() {
        assert_eq!(string("(\\00Y)"), Some(b"\0Y".to_vec()));
    }

    #[test]
    fn literal_string_12() {
        assert_eq!(string("(\\0Y)"), Some(b"\0Y".to_vec()));
    }

    #[test]
    fn literal_string_trailing() {
        assert_eq!(literal("(Hi there.)abcde"), Some(b"Hi there.".to_vec()));
    }

    #[test]
    fn literal_string_invalid() {
        assert_eq!(literal("(Hi \\778)"), Some(b"Hi \x3F8".to_vec()));
    }

    #[test]
    fn string_1() {
        assert_eq!(string("(Hi there.)"), Some(b"Hi there.".to_vec()));
    }

    #[test]
    fn string_2() {
        assert_eq!(string("<00010203>"), Some(vec![0x00, 0x01, 0x02, 0x03]));
    }
}
638        );
639    }
640}