hayro_syntax/object/
string.rs

1//! Strings.
2
3use crate::crypto::DecryptionTarget;
4use crate::filter::ascii_hex::decode_hex_string;
5use crate::object::macros::object;
6use crate::object::{Object, ObjectLike};
7use crate::reader::{Readable, Reader, ReaderContext, Skippable};
8use crate::trivia::is_white_space_character;
9use log::warn;
10use std::borrow::Cow;
11use std::hash::{Hash, Hasher};
12// TODO: Make `HexString` and `LiteralString` own their values.
13
/// A hex-encoded string.
///
/// Fields, in order:
/// - the raw, still hex-encoded bytes found between the enclosing `<` and `>`;
/// - a "dirty" flag, set when the raw bytes contain white space or have an odd
///   length and therefore need cleaning before they can be decoded;
/// - the reader context the string was read with, consulted when decrypting.
#[derive(Clone, Debug)]
struct HexString<'a>(&'a [u8], bool, ReaderContext<'a>);
17
18impl HexString<'_> {
19    /// Returns the content of the string.
20    fn get(&self) -> Vec<u8> {
21        let decoded = if self.1 {
22            let mut cleaned = Vec::with_capacity(self.0.len() + 1);
23
24            for b in self.0.iter().copied() {
25                if !is_white_space_character(b) {
26                    cleaned.push(b);
27                }
28            }
29
30            if cleaned.len() % 2 != 0 {
31                cleaned.push(b'0');
32            }
33
34            // We made sure while parsing that it is a valid hex string.
35            decode_hex_string(&cleaned).unwrap()
36        } else {
37            // We made sure while parsing that it is a valid hex string.
38            decode_hex_string(self.0).unwrap()
39        };
40
41        if self.2.xref.needs_decryption(&self.2) {
42            self.2
43                .xref
44                .decrypt(
45                    self.2.obj_number.unwrap(),
46                    &decoded,
47                    DecryptionTarget::String,
48                )
49                .unwrap_or_default()
50        } else {
51            decoded
52        }
53    }
54}
55
56impl PartialEq for HexString<'_> {
57    fn eq(&self, other: &Self) -> bool {
58        // TODO: We probably want to ignore escapes.
59        self.0 == other.0 && self.1 == other.1
60    }
61}
62
63impl Skippable for HexString<'_> {
64    fn skip(r: &mut Reader<'_>, _: bool) -> Option<()> {
65        parse_hex(r).map(|_| {})
66    }
67}
68
69impl<'a> Readable<'a> for HexString<'a> {
70    fn read(r: &mut Reader<'a>, ctx: &ReaderContext<'a>) -> Option<Self> {
71        let start = r.offset();
72        let mut dirty = parse_hex(r)?;
73        let end = r.offset();
74
75        // Exclude outer brackets.
76        let result = r.range(start + 1..end - 1).unwrap();
77        dirty |= !result.len().is_multiple_of(2);
78
79        Some(HexString(result, dirty, ctx.clone()))
80    }
81}
82
83impl<'a> TryFrom<Object<'a>> for HexString<'a> {
84    type Error = ();
85
86    fn try_from(value: Object<'a>) -> Result<Self, Self::Error> {
87        match value {
88            Object::String(String(InnerString::Hex(h))) => Ok(h),
89            _ => Err(()),
90        }
91    }
92}
93
// Opts `HexString` into `ObjectLike`; this impl adds no items of its own.
impl<'a> ObjectLike<'a> for HexString<'a> {}
95
96fn parse_hex(r: &mut Reader<'_>) -> Option<bool> {
97    let mut has_whitespace = false;
98
99    r.forward_tag(b"<")?;
100    while let Some(b) = r.peek_byte() {
101        let is_hex = b.is_ascii_hexdigit();
102        let is_whitespace = is_white_space_character(b);
103        has_whitespace |= is_whitespace;
104
105        if !is_hex && !is_whitespace {
106            break;
107        }
108
109        r.read_byte()?;
110    }
111    r.forward_tag(b">")?;
112
113    Some(has_whitespace)
114}
115
/// A literal string.
///
/// Fields, in order:
/// - the raw bytes found between the outermost `(` and `)`, escapes included;
/// - a "dirty" flag, set when the raw bytes contain a backslash or an
///   end-of-line byte and therefore need processing before use;
/// - the reader context the string was read with, consulted when decrypting.
#[derive(Debug, Clone)]
struct LiteralString<'a>(&'a [u8], bool, ReaderContext<'a>);
119
120impl<'a> LiteralString<'a> {
121    /// Returns the content of the string.
122    fn get(&self) -> Cow<'a, [u8]> {
123        let decoded = if self.1 {
124            let mut cleaned = vec![];
125            let mut r = Reader::new(self.0);
126
127            while let Some(byte) = r.read_byte() {
128                match byte {
129                    b'\\' => {
130                        let next = r.read_byte().unwrap();
131
132                        if is_octal_digit(next) {
133                            let second = r.read_byte();
134                            let third = r.read_byte();
135
136                            let bytes = match (second, third) {
137                                (Some(n1), Some(n2)) => {
138                                    match (is_octal_digit(n1), is_octal_digit(n2)) {
139                                        (true, true) => [next, n1, n2],
140                                        (true, _) => {
141                                            r.jump(r.offset() - 1);
142                                            [b'0', next, n1]
143                                        }
144                                        _ => {
145                                            r.jump(r.offset() - 2);
146                                            [b'0', b'0', next]
147                                        }
148                                    }
149                                }
150                                (Some(n1), None) => {
151                                    if is_octal_digit(n1) {
152                                        [b'0', next, n1]
153                                    } else {
154                                        r.jump(r.offset() - 1);
155                                        [b'0', b'0', next]
156                                    }
157                                }
158                                _ => [b'0', b'0', next],
159                            };
160
161                            let str = std::str::from_utf8(&bytes).unwrap();
162
163                            if let Ok(num) = u8::from_str_radix(str, 8) {
164                                cleaned.push(num);
165                            } else {
166                                warn!("overflow occurred while parsing octal literal string");
167                            }
168                        } else {
169                            match next {
170                                b'n' => cleaned.push(0xA),
171                                b'r' => cleaned.push(0xD),
172                                b't' => cleaned.push(0x9),
173                                b'b' => cleaned.push(0x8),
174                                b'f' => cleaned.push(0xC),
175                                b'(' => cleaned.push(b'('),
176                                b')' => cleaned.push(b')'),
177                                b'\\' => cleaned.push(b'\\'),
178                                b'\n' | b'\r' => {
179                                    // A conforming reader shall disregard the REVERSE SOLIDUS
180                                    // and the end-of-line marker following it when reading
181                                    // the string; the resulting string value shall be
182                                    // identical to that which would be read if the string
183                                    // were not split.
184                                    r.skip_eol_characters();
185                                }
186                                _ => cleaned.push(next),
187                            }
188                        }
189                    }
190                    // An end-of-line marker appearing within a literal string
191                    // without a preceding REVERSE SOLIDUS shall be treated as
192                    // a byte value of (0Ah), irrespective of whether the end-of-line
193                    // marker was a CARRIAGE RETURN (0Dh), a LINE FEED (0Ah), or both.
194                    b'\n' | b'\r' => {
195                        cleaned.push(b'\n');
196                        r.skip_eol_characters();
197                    }
198                    other => cleaned.push(other),
199                }
200            }
201
202            Cow::Owned(cleaned)
203        } else {
204            Cow::Borrowed(self.0)
205        };
206
207        if self.2.xref.needs_decryption(&self.2) {
208            Cow::Owned(
209                self.2
210                    .xref
211                    .decrypt(
212                        self.2.obj_number.unwrap(),
213                        &decoded,
214                        DecryptionTarget::String,
215                    )
216                    .unwrap_or_default(),
217            )
218        } else {
219            decoded
220        }
221    }
222}
223
224impl Hash for LiteralString<'_> {
225    fn hash<H: Hasher>(&self, state: &mut H) {
226        self.0.hash(state);
227        self.1.hash(state);
228    }
229}
230
231impl PartialEq for LiteralString<'_> {
232    fn eq(&self, other: &Self) -> bool {
233        self.0.eq(other.0) && self.1.eq(&other.1)
234    }
235}
236
237impl Skippable for LiteralString<'_> {
238    fn skip(r: &mut Reader<'_>, _: bool) -> Option<()> {
239        parse_literal(r).map(|_| ())
240    }
241}
242
243impl<'a> Readable<'a> for LiteralString<'a> {
244    fn read(r: &mut Reader<'a>, ctx: &ReaderContext<'a>) -> Option<Self> {
245        let start = r.offset();
246        let dirty = parse_literal(r)?;
247        let end = r.offset();
248
249        // Exclude outer brackets
250        let result = r.range(start + 1..end - 1).unwrap();
251
252        Some(LiteralString(result, dirty, ctx.clone()))
253    }
254}
255
256impl<'a> TryFrom<Object<'a>> for LiteralString<'a> {
257    type Error = ();
258
259    fn try_from(value: Object<'a>) -> Result<Self, Self::Error> {
260        match value {
261            Object::String(String(InnerString::Literal(l))) => Ok(l),
262            _ => Err(()),
263        }
264    }
265}
266
// Opts `LiteralString` into `ObjectLike`; this impl adds no items of its own.
impl<'a> ObjectLike<'a> for LiteralString<'a> {}
268
269fn parse_literal(r: &mut Reader<'_>) -> Option<bool> {
270    r.forward_tag(b"(")?;
271    let mut bracket_counter = 1;
272    let mut dirty = false;
273
274    while bracket_counter > 0 {
275        let byte = r.read_byte()?;
276
277        match byte {
278            b'\\' => {
279                dirty = true;
280
281                let _ = r.read_byte()?;
282            }
283            b'(' => bracket_counter += 1,
284            b')' => bracket_counter -= 1,
285            b'\n' | b'\r' => dirty = true,
286            _ => {}
287        };
288    }
289
290    Some(dirty)
291}
292
/// The two representations a string can have: a `<...>` hex string or a
/// `(...)` literal string.
#[derive(Clone, Debug, PartialEq)]
enum InnerString<'a> {
    /// A hex-encoded string.
    Hex(HexString<'a>),
    /// A literal string.
    Literal(LiteralString<'a>),
}
298
/// A string.
///
/// Wraps either a hex-encoded or a literal string; use [`String::get`] to
/// obtain the decoded bytes regardless of the underlying representation.
#[derive(Clone, Debug, PartialEq)]
pub struct String<'a>(InnerString<'a>);
302
303impl<'a> String<'a> {
304    /// Returns the content of the string.
305    pub fn get(&self) -> Cow<'a, [u8]> {
306        match &self.0 {
307            InnerString::Hex(hex) => Cow::Owned(hex.get()),
308            InnerString::Literal(lit) => lit.get(),
309        }
310    }
311}
312
313impl<'a> From<HexString<'a>> for String<'a> {
314    fn from(value: HexString<'a>) -> Self {
315        Self(InnerString::Hex(value))
316    }
317}
318
319impl<'a> From<LiteralString<'a>> for String<'a> {
320    fn from(value: LiteralString<'a>) -> Self {
321        Self(InnerString::Literal(value))
322    }
323}
324
// Invokes the crate's `object!` macro for `String`.
// NOTE(review): presumably this generates the boilerplate tying `String<'a>`
// to the `Object::String` variant — confirm against `object::macros`.
object!(String<'a>, String);
326
327impl Skippable for String<'_> {
328    fn skip(r: &mut Reader<'_>, is_content_stream: bool) -> Option<()> {
329        match r.peek_byte()? {
330            b'<' => HexString::skip(r, is_content_stream),
331            b'(' => LiteralString::skip(r, is_content_stream),
332            _ => None,
333        }
334    }
335}
336
337impl<'a> Readable<'a> for String<'a> {
338    fn read(r: &mut Reader<'a>, _: &ReaderContext) -> Option<Self> {
339        let inner = match r.peek_byte()? {
340            b'<' => InnerString::Hex(r.read_without_context::<HexString>()?),
341            b'(' => InnerString::Literal(r.read_without_context::<LiteralString>()?),
342            _ => return None,
343        };
344
345        Some(String(inner))
346    }
347}
348
/// Returns `true` if `byte` is an ASCII octal digit (`0` through `7`).
fn is_octal_digit(byte: u8) -> bool {
    (b'0'..=b'7').contains(&byte)
}
352
#[cfg(test)]
mod tests {
    use crate::object::string::{HexString, LiteralString, String};
    use crate::reader::Reader;

    /// Reads `input` as a `HexString` and returns its content, or `None` if
    /// it cannot be parsed.
    fn hex(input: &str) -> Option<Vec<u8>> {
        let mut reader = Reader::new(input.as_bytes());
        let parsed = reader.read_without_context::<HexString>()?;
        Some(parsed.get())
    }

    /// Reads `input` as a `LiteralString` and returns its content, or `None`
    /// if it cannot be parsed.
    fn literal(input: &str) -> Option<Vec<u8>> {
        let mut reader = Reader::new(input.as_bytes());
        let parsed = reader.read_without_context::<LiteralString>()?;
        Some(parsed.get().to_vec())
    }

    /// Reads `input` as a `String` and returns its content, or `None` if it
    /// cannot be parsed.
    fn string(input: &str) -> Option<Vec<u8>> {
        let mut reader = Reader::new(input.as_bytes());
        let parsed = reader.read_without_context::<String>()?;
        Some(parsed.get().into_owned())
    }

    #[test]
    fn hex_string_empty() {
        assert_eq!(hex("<>"), Some(vec![]));
    }

    #[test]
    fn hex_string_1() {
        assert_eq!(hex("<00010203>"), Some(vec![0x00, 0x01, 0x02, 0x03]));
    }

    #[test]
    fn hex_string_2() {
        assert_eq!(
            hex("<000102034>"),
            Some(vec![0x00, 0x01, 0x02, 0x03, 0x40])
        );
    }

    #[test]
    fn hex_string_trailing_1() {
        assert_eq!(
            hex("<000102034>dfgfg4"),
            Some(vec![0x00, 0x01, 0x02, 0x03, 0x40])
        );
    }

    #[test]
    fn hex_string_trailing_2() {
        assert_eq!(hex("<1  3 4>dfgfg4"), Some(vec![0x13, 0x40]));
    }

    #[test]
    fn hex_string_trailing_3() {
        assert_eq!(hex("<1>dfgfg4"), Some(vec![0x10]));
    }

    #[test]
    fn hex_string_invalid_1() {
        assert_eq!(hex("<"), None);
    }

    #[test]
    fn hex_string_invalid_2() {
        assert_eq!(hex("34AD"), None);
    }

    #[test]
    fn literal_string_empty() {
        assert_eq!(literal("()"), Some(b"".to_vec()));
    }

    #[test]
    fn literal_string_1() {
        assert_eq!(literal("(Hi there.)"), Some(b"Hi there.".to_vec()));
    }

    #[test]
    fn literal_string_2() {
        // Only parsing is checked here; the content is deliberately not read.
        let mut reader = Reader::new("(Hi \\777)".as_bytes());
        assert!(reader.read_without_context::<LiteralString>().is_some());
    }

    #[test]
    fn literal_string_3() {
        assert_eq!(literal("(Hi ) there.)"), Some(b"Hi ".to_vec()));
    }

    #[test]
    fn literal_string_4() {
        assert_eq!(
            literal("(Hi (()) there)"),
            Some(b"Hi (()) there".to_vec())
        );
    }

    #[test]
    fn literal_string_5() {
        assert_eq!(literal("(Hi \\()"), Some(b"Hi (".to_vec()));
    }

    #[test]
    fn literal_string_6() {
        assert_eq!(literal("(Hi \\\nthere)"), Some(b"Hi there".to_vec()));
    }

    #[test]
    fn literal_string_7() {
        assert_eq!(literal("(Hi \\05354)"), Some(b"Hi +54".to_vec()));
    }

    #[test]
    fn literal_string_8() {
        assert_eq!(string("(\\3)"), Some(b"\x03".to_vec()));
    }

    #[test]
    fn literal_string_9() {
        assert_eq!(string("(\\36)"), Some(b"\x1e".to_vec()));
    }

    #[test]
    fn literal_string_10() {
        assert_eq!(string("(\\36ab)"), Some(b"\x1eab".to_vec()));
    }

    #[test]
    fn literal_string_11() {
        assert_eq!(string("(\\00Y)"), Some(b"\0Y".to_vec()));
    }

    #[test]
    fn literal_string_12() {
        assert_eq!(string("(\\0Y)"), Some(b"\0Y".to_vec()));
    }

    #[test]
    fn literal_string_trailing() {
        assert_eq!(literal("(Hi there.)abcde"), Some(b"Hi there.".to_vec()));
    }

    #[test]
    fn literal_string_invalid() {
        assert_eq!(literal("(Hi \\778)"), Some(b"Hi \x3F8".to_vec()));
    }

    #[test]
    fn string_1() {
        assert_eq!(string("(Hi there.)"), Some(b"Hi there.".to_vec()));
    }

    #[test]
    fn string_2() {
        assert_eq!(string("<00010203>"), Some(vec![0x00, 0x01, 0x02, 0x03]));
    }
}
634        );
635    }
636}