hayro_syntax/object/
string.rs

1//! Strings.
2
3use crate::filter::ascii_hex::decode_hex_string;
4use crate::object::macros::object;
5use crate::object::{Object, ObjectLike};
6use crate::reader::{Readable, Reader, ReaderContext, Skippable};
7use crate::trivia::is_white_space_character;
8use log::warn;
9use std::borrow::Cow;
10
/// A hex-encoded string.
///
/// Field `0` holds the raw bytes found between the angle brackets. Field `1`
/// is a "dirty" flag: when set, the raw bytes contain white space or an odd
/// number of characters and have to be cleaned up before decoding.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
struct HexString<'a>(&'a [u8], bool);
14
15impl HexString<'_> {
16    /// Returns the content of the string.
17    fn get(&self) -> Vec<u8> {
18        if self.1 {
19            let mut cleaned = Vec::with_capacity(self.0.len() + 1);
20
21            for b in self.0.iter().copied() {
22                if !is_white_space_character(b) {
23                    cleaned.push(b);
24                }
25            }
26
27            if cleaned.len() % 2 != 0 {
28                cleaned.push(b'0');
29            }
30
31            // We made sure while parsing that it is a valid hex string.
32            decode_hex_string(&cleaned).unwrap()
33        } else {
34            // We made sure while parsing that it is a valid hex string.
35            decode_hex_string(self.0).unwrap()
36        }
37    }
38}
39
40impl Skippable for HexString<'_> {
41    fn skip(r: &mut Reader<'_>, _: bool) -> Option<()> {
42        parse_hex(r).map(|_| {})
43    }
44}
45
46impl<'a> Readable<'a> for HexString<'a> {
47    fn read(r: &mut Reader<'a>, _: ReaderContext) -> Option<Self> {
48        let start = r.offset();
49        let mut dirty = parse_hex(r)?;
50        let end = r.offset();
51
52        // Exclude outer brackets.
53        let result = r.range(start + 1..end - 1).unwrap();
54        dirty |= result.len() % 2 != 0;
55
56        Some(HexString(result, dirty))
57    }
58}
59
60impl<'a> TryFrom<Object<'a>> for HexString<'a> {
61    type Error = ();
62
63    fn try_from(value: Object<'a>) -> Result<Self, Self::Error> {
64        match value {
65            Object::String(String(InnerString::Hex(h))) => Ok(h),
66            _ => Err(()),
67        }
68    }
69}
70
71impl<'a> ObjectLike<'a> for HexString<'a> {}
72
73fn parse_hex(r: &mut Reader<'_>) -> Option<bool> {
74    let mut has_whitespace = false;
75
76    r.forward_tag(b"<")?;
77    while let Some(b) = r.peek_byte() {
78        let is_hex = b.is_ascii_hexdigit();
79        let is_whitespace = is_white_space_character(b);
80        has_whitespace |= is_whitespace;
81
82        if !is_hex && !is_whitespace {
83            break;
84        }
85
86        r.read_byte()?;
87    }
88    r.forward_tag(b">")?;
89
90    Some(has_whitespace)
91}
92
/// A literal string.
///
/// Field `0` holds the raw bytes found between the outer parentheses. Field
/// `1` is a "dirty" flag: when set, the raw bytes contain escape sequences or
/// end-of-line markers that must be resolved before the content can be used.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
struct LiteralString<'a>(&'a [u8], bool);
96
97impl<'a> LiteralString<'a> {
98    /// Returns the content of the string.
99    fn get(&self) -> Cow<'a, [u8]> {
100        if self.1 {
101            let mut cleaned = vec![];
102            let mut r = Reader::new(self.0);
103
104            while let Some(byte) = r.read_byte() {
105                match byte {
106                    b'\\' => {
107                        let next = r.read_byte().unwrap();
108
109                        if is_octal_digit(next) {
110                            let second = r.read_byte();
111                            let third = r.read_byte();
112
113                            match (second, third) {
114                                (Some(n1), Some(n2)) => {
115                                    if is_octal_digit(n1) && is_octal_digit(n2) {
116                                        let bytes = [next, n1, n2];
117                                        let str = std::str::from_utf8(&bytes).unwrap();
118
119                                        if let Ok(num) = u8::from_str_radix(str, 8) {
120                                            cleaned.push(num);
121                                        } else {
122                                            warn!(
123                                                "overflow occurred while parsing octal literal string"
124                                            );
125                                        }
126                                    } else {
127                                        // Ignore the solidus and treat as normal characters.
128                                        cleaned.push(next);
129                                        cleaned.push(n1);
130                                        cleaned.push(n2);
131                                    }
132                                }
133                                (Some(n1), None) => {
134                                    cleaned.push(next);
135                                    cleaned.push(n1);
136                                }
137                                _ => cleaned.push(next),
138                            }
139                        } else {
140                            match next {
141                                b'n' => cleaned.push(0xA),
142                                b'r' => cleaned.push(0xD),
143                                b't' => cleaned.push(0x9),
144                                b'b' => cleaned.push(0x8),
145                                b'f' => cleaned.push(0xC),
146                                b'(' => cleaned.push(b'('),
147                                b')' => cleaned.push(b')'),
148                                b'\\' => cleaned.push(b'\\'),
149                                b'\n' | b'\r' => {
150                                    // A conforming reader shall disregard the REVERSE SOLIDUS
151                                    // and the end-of-line marker following it when reading
152                                    // the string; the resulting string value shall be
153                                    // identical to that which would be read if the string
154                                    // were not split.
155                                    r.skip_eol_characters();
156                                }
157                                _ => cleaned.push(next),
158                            }
159                        }
160                    }
161                    // An end-of-line marker appearing within a literal string
162                    // without a preceding REVERSE SOLIDUS shall be treated as
163                    // a byte value of (0Ah), irrespective of whether the end-of-line
164                    // marker was a CARRIAGE RETURN (0Dh), a LINE FEED (0Ah), or both.
165                    b'\n' | b'\r' => {
166                        cleaned.push(b'\n');
167                        r.skip_eol_characters();
168                    }
169                    other => cleaned.push(other),
170                }
171            }
172
173            Cow::Owned(cleaned)
174        } else {
175            Cow::Borrowed(self.0)
176        }
177    }
178}
179
180impl Skippable for LiteralString<'_> {
181    fn skip(r: &mut Reader<'_>, _: bool) -> Option<()> {
182        parse_literal(r).map(|_| ())
183    }
184}
185
186impl<'a> Readable<'a> for LiteralString<'a> {
187    fn read(r: &mut Reader<'a>, _: ReaderContext) -> Option<Self> {
188        let start = r.offset();
189        let dirty = parse_literal(r)?;
190        let end = r.offset();
191
192        // Exclude outer brackets
193        let result = r.range(start + 1..end - 1).unwrap();
194
195        Some(LiteralString(result, dirty))
196    }
197}
198
199impl<'a> TryFrom<Object<'a>> for LiteralString<'a> {
200    type Error = ();
201
202    fn try_from(value: Object<'a>) -> Result<Self, Self::Error> {
203        match value {
204            Object::String(String(InnerString::Literal(l))) => Ok(l),
205            _ => Err(()),
206        }
207    }
208}
209
210impl<'a> ObjectLike<'a> for LiteralString<'a> {}
211
212fn parse_literal(r: &mut Reader<'_>) -> Option<bool> {
213    r.forward_tag(b"(")?;
214    let mut bracket_counter = 1;
215    let mut dirty = false;
216
217    while bracket_counter > 0 {
218        let byte = r.read_byte()?;
219
220        match byte {
221            b'\\' => {
222                dirty = true;
223
224                let _ = r.read_byte()?;
225            }
226            b'(' => bracket_counter += 1,
227            b')' => bracket_counter -= 1,
228            b'\n' | b'\r' => dirty = true,
229            _ => {}
230        };
231    }
232
233    Some(dirty)
234}
235
236#[derive(Clone, Debug, PartialEq)]
237enum InnerString<'a> {
238    Hex(HexString<'a>),
239    Literal(LiteralString<'a>),
240}
241
242/// A string.
243#[derive(Clone, Debug, PartialEq)]
244pub struct String<'a>(InnerString<'a>);
245
246impl<'a> String<'a> {
247    /// Returns the content of the string.
248    pub fn get(&self) -> Cow<'a, [u8]> {
249        match &self.0 {
250            InnerString::Hex(hex) => Cow::Owned(hex.get()),
251            InnerString::Literal(lit) => lit.get(),
252        }
253    }
254}
255
256impl<'a> From<HexString<'a>> for String<'a> {
257    fn from(value: HexString<'a>) -> Self {
258        Self(InnerString::Hex(value))
259    }
260}
261
262impl<'a> From<LiteralString<'a>> for String<'a> {
263    fn from(value: LiteralString<'a>) -> Self {
264        Self(InnerString::Literal(value))
265    }
266}
267
268object!(String<'a>, String);
269
270impl Skippable for String<'_> {
271    fn skip(r: &mut Reader<'_>, is_content_stream: bool) -> Option<()> {
272        match r.peek_byte()? {
273            b'<' => HexString::skip(r, is_content_stream),
274            b'(' => LiteralString::skip(r, is_content_stream),
275            _ => None,
276        }
277    }
278}
279
280impl<'a> Readable<'a> for String<'a> {
281    fn read(r: &mut Reader<'a>, _: ReaderContext) -> Option<Self> {
282        let inner = match r.peek_byte()? {
283            b'<' => InnerString::Hex(r.read_without_context::<HexString>()?),
284            b'(' => InnerString::Literal(r.read_without_context::<LiteralString>()?),
285            _ => return None,
286        };
287
288        Some(String(inner))
289    }
290}
291
/// Returns `true` if `byte` is one of the ASCII digits `0` through `7`.
fn is_octal_digit(byte: u8) -> bool {
    (b'0'..=b'7').contains(&byte)
}
295
#[cfg(test)]
mod tests {
    use crate::object::string::{HexString, LiteralString, String};
    use crate::reader::Reader;

    /// Reads a `HexString` from the start of `input` and returns its content.
    /// Panics if the input cannot be read as a hex string.
    fn hex_bytes(input: &str) -> Vec<u8> {
        Reader::new(input.as_bytes())
            .read_without_context::<HexString>()
            .unwrap()
            .get()
    }

    /// Reads a `LiteralString` from the start of `input` and returns its
    /// content. Panics if the input cannot be read as a literal string.
    fn literal_bytes(input: &str) -> Vec<u8> {
        Reader::new(input.as_bytes())
            .read_without_context::<LiteralString>()
            .unwrap()
            .get()
            .to_vec()
    }

    /// Reads a `String` from the start of `input` and returns its content.
    /// Panics if the input cannot be read as a string.
    fn string_bytes(input: &str) -> Vec<u8> {
        Reader::new(input.as_bytes())
            .read_without_context::<String>()
            .unwrap()
            .get()
            .to_vec()
    }

    #[test]
    fn hex_string_empty() {
        assert_eq!(hex_bytes("<>"), Vec::<u8>::new());
    }

    #[test]
    fn hex_string_1() {
        assert_eq!(hex_bytes("<00010203>"), vec![0x00, 0x01, 0x02, 0x03]);
    }

    #[test]
    fn hex_string_2() {
        // The missing final digit is taken to be zero.
        assert_eq!(
            hex_bytes("<000102034>"),
            vec![0x00, 0x01, 0x02, 0x03, 0x40]
        );
    }

    #[test]
    fn hex_string_trailing_1() {
        assert_eq!(
            hex_bytes("<000102034>dfgfg4"),
            vec![0x00, 0x01, 0x02, 0x03, 0x40]
        );
    }

    #[test]
    fn hex_string_trailing_2() {
        // White space between digits is ignored.
        assert_eq!(hex_bytes("<1  3 4>dfgfg4"), vec![0x13, 0x40]);
    }

    #[test]
    fn hex_string_trailing_3() {
        assert_eq!(hex_bytes("<1>dfgfg4"), vec![0x10]);
    }

    #[test]
    fn hex_string_invalid_1() {
        let parsed = Reader::new("<".as_bytes()).read_without_context::<HexString>();
        assert!(parsed.is_none());
    }

    #[test]
    fn hex_string_invalid_2() {
        let parsed = Reader::new("34AD".as_bytes()).read_without_context::<HexString>();
        assert!(parsed.is_none());
    }

    #[test]
    fn literal_string_empty() {
        assert_eq!(literal_bytes("()"), b"");
    }

    #[test]
    fn literal_string_1() {
        assert_eq!(literal_bytes("(Hi there.)"), b"Hi there.");
    }

    #[test]
    fn literal_string_2() {
        // Only checks that reading succeeds; the content is not inspected.
        let parsed =
            Reader::new("(Hi \\777)".as_bytes()).read_without_context::<LiteralString>();
        assert!(parsed.is_some());
    }

    #[test]
    fn literal_string_3() {
        // The first unbalanced closing parenthesis ends the string.
        assert_eq!(literal_bytes("(Hi ) there.)"), b"Hi ");
    }

    #[test]
    fn literal_string_4() {
        assert_eq!(literal_bytes("(Hi (()) there)"), b"Hi (()) there");
    }

    #[test]
    fn literal_string_5() {
        assert_eq!(literal_bytes("(Hi \\()"), b"Hi (");
    }

    #[test]
    fn literal_string_6() {
        // A backslash followed by a line break joins the two lines.
        assert_eq!(literal_bytes("(Hi \\\nthere)"), b"Hi there");
    }

    #[test]
    fn literal_string_7() {
        assert_eq!(literal_bytes("(Hi \\05354)"), b"Hi +54");
    }

    #[test]
    fn literal_string_8() {
        assert_eq!(string_bytes("(\\3)"), b"3");
    }

    #[test]
    fn literal_string_9() {
        assert_eq!(string_bytes("(\\36)"), b"36");
    }

    #[test]
    fn literal_string_10() {
        assert_eq!(string_bytes("(\\36ab)"), b"36ab");
    }

    #[test]
    fn literal_string_trailing() {
        assert_eq!(literal_bytes("(Hi there.)abcde"), b"Hi there.");
    }

    #[test]
    fn literal_string_invalid() {
        // In this case, we just ignore the solidus and treat it as literal numbers.
        assert_eq!(literal_bytes("(Hi \\778)"), b"Hi 778");
    }

    #[test]
    fn string_1() {
        assert_eq!(string_bytes("(Hi there.)"), b"Hi there.");
    }

    #[test]
    fn string_2() {
        assert_eq!(string_bytes("<00010203>"), vec![0x00, 0x01, 0x02, 0x03]);
    }
}