hayro_syntax/object/
string.rs

1//! Strings.
2
3use crate::filter::ascii_hex::decode_hex_string;
4use crate::object::macros::object;
5use crate::object::{Object, ObjectLike};
6use crate::reader::{Readable, Reader, ReaderContext, Skippable};
7use crate::trivia::is_white_space_character;
8use std::borrow::Cow;
9
/// A hex-encoded string.
///
/// Field `0` holds the raw bytes found between the enclosing `<` and `>`.
/// Field `1` is a "dirty" flag: it is set when those bytes contain white
/// space or have an odd length, in which case they are cleaned up before
/// being decoded.
#[derive(PartialEq, Eq, Copy, Clone, Debug)]
struct HexString<'a>(&'a [u8], bool);
13
14impl HexString<'_> {
15    /// Returns the content of the string.
16    fn get(&self) -> Vec<u8> {
17        if self.1 {
18            let mut cleaned = Vec::with_capacity(self.0.len() + 1);
19
20            for b in self.0.iter().copied() {
21                if !is_white_space_character(b) {
22                    cleaned.push(b);
23                }
24            }
25
26            if cleaned.len() % 2 != 0 {
27                cleaned.push(b'0');
28            }
29
30            // We made sure while parsing that it is a valid hex string.
31            decode_hex_string(&cleaned).unwrap()
32        } else {
33            // We made sure while parsing that it is a valid hex string.
34            decode_hex_string(self.0).unwrap()
35        }
36    }
37}
38
39impl Skippable for HexString<'_> {
40    fn skip(r: &mut Reader<'_>, _: bool) -> Option<()> {
41        parse_hex(r).map(|_| {})
42    }
43}
44
45impl<'a> Readable<'a> for HexString<'a> {
46    fn read(r: &mut Reader<'a>, _: ReaderContext) -> Option<Self> {
47        let start = r.offset();
48        let mut dirty = parse_hex(r)?;
49        let end = r.offset();
50
51        // Exclude outer brackets.
52        let result = r.range(start + 1..end - 1).unwrap();
53        dirty |= result.len() % 2 != 0;
54
55        Some(HexString(result, dirty))
56    }
57}
58
59impl<'a> TryFrom<Object<'a>> for HexString<'a> {
60    type Error = ();
61
62    fn try_from(value: Object<'a>) -> Result<Self, Self::Error> {
63        match value {
64            Object::String(String(InnerString::Hex(h))) => Ok(h),
65            _ => Err(()),
66        }
67    }
68}
69
// `ObjectLike` is implemented with an empty body; nothing is overridden here.
impl<'a> ObjectLike<'a> for HexString<'a> {}
71
72fn parse_hex(r: &mut Reader<'_>) -> Option<bool> {
73    let mut has_whitespace = false;
74
75    r.forward_tag(b"<")?;
76    while let Some(b) = r.peek_byte() {
77        let is_hex = b.is_ascii_hexdigit();
78        let is_whitespace = is_white_space_character(b);
79        has_whitespace |= is_whitespace;
80
81        if !is_hex && !is_whitespace {
82            break;
83        }
84
85        r.read_byte()?;
86    }
87    r.forward_tag(b">")?;
88
89    Some(has_whitespace)
90}
91
/// A literal string.
///
/// Field `0` holds the raw bytes found between the outer parentheses.
/// Field `1` is a "dirty" flag: it is set when those bytes contain a
/// REVERSE SOLIDUS or an end-of-line byte and therefore have to be
/// processed before the content can be used.
#[derive(Debug, Eq, PartialEq, Clone, Copy, Hash)]
struct LiteralString<'a>(&'a [u8], bool);
95
96impl<'a> LiteralString<'a> {
97    /// Returns the content of the string.
98    fn get(&self) -> Cow<'a, [u8]> {
99        if self.1 {
100            let mut cleaned = vec![];
101            let mut r = Reader::new(self.0);
102
103            while let Some(byte) = r.read_byte() {
104                match byte {
105                    b'\\' => {
106                        let next = r.read_byte().unwrap();
107
108                        if is_octal_digit(next) {
109                            let second = r.read_byte();
110                            let third = r.read_byte();
111
112                            match (second, third) {
113                                (Some(n1), Some(n2)) => {
114                                    if is_octal_digit(n1) && is_octal_digit(n2) {
115                                        let bytes = [next, n1, n2];
116                                        let str = std::str::from_utf8(&bytes).unwrap();
117                                        let num = u8::from_str_radix(str, 8).unwrap();
118                                        cleaned.push(num);
119                                    } else {
120                                        // Ignore the solidus and treat as normal characters.
121                                        cleaned.push(next);
122                                        cleaned.push(n1);
123                                        cleaned.push(n2);
124                                    }
125                                }
126                                (Some(n1), None) => {
127                                    cleaned.push(next);
128                                    cleaned.push(n1);
129                                }
130                                _ => cleaned.push(next),
131                            }
132                        } else {
133                            match next {
134                                b'n' => cleaned.push(0xA),
135                                b'r' => cleaned.push(0xD),
136                                b't' => cleaned.push(0x9),
137                                b'b' => cleaned.push(0x8),
138                                b'f' => cleaned.push(0xC),
139                                b'(' => cleaned.push(b'('),
140                                b')' => cleaned.push(b')'),
141                                b'\\' => cleaned.push(b'\\'),
142                                b'\n' | b'\r' => {
143                                    // A conforming reader shall disregard the REVERSE SOLIDUS
144                                    // and the end-of-line marker following it when reading
145                                    // the string; the resulting string value shall be
146                                    // identical to that which would be read if the string
147                                    // were not split.
148                                    r.skip_eol_characters();
149                                }
150                                _ => cleaned.push(next),
151                            }
152                        }
153                    }
154                    // An end-of-line marker appearing within a literal string
155                    // without a preceding REVERSE SOLIDUS shall be treated as
156                    // a byte value of (0Ah), irrespective of whether the end-of-line
157                    // marker was a CARRIAGE RETURN (0Dh), a LINE FEED (0Ah), or both.
158                    b'\n' | b'\r' => {
159                        cleaned.push(b'\n');
160                        r.skip_eol_characters();
161                    }
162                    other => cleaned.push(other),
163                }
164            }
165
166            Cow::Owned(cleaned)
167        } else {
168            Cow::Borrowed(self.0)
169        }
170    }
171}
172
173impl Skippable for LiteralString<'_> {
174    fn skip(r: &mut Reader<'_>, _: bool) -> Option<()> {
175        parse_literal(r).map(|_| ())
176    }
177}
178
179impl<'a> Readable<'a> for LiteralString<'a> {
180    fn read(r: &mut Reader<'a>, _: ReaderContext) -> Option<Self> {
181        let start = r.offset();
182        let dirty = parse_literal(r)?;
183        let end = r.offset();
184
185        // Exclude outer brackets
186        let result = r.range(start + 1..end - 1).unwrap();
187
188        Some(LiteralString(result, dirty))
189    }
190}
191
192impl<'a> TryFrom<Object<'a>> for LiteralString<'a> {
193    type Error = ();
194
195    fn try_from(value: Object<'a>) -> Result<Self, Self::Error> {
196        match value {
197            Object::String(String(InnerString::Literal(l))) => Ok(l),
198            _ => Err(()),
199        }
200    }
201}
202
// `ObjectLike` is implemented with an empty body; nothing is overridden here.
impl<'a> ObjectLike<'a> for LiteralString<'a> {}
204
205fn parse_literal(r: &mut Reader<'_>) -> Option<bool> {
206    r.forward_tag(b"(")?;
207    let mut bracket_counter = 1;
208    let mut dirty = false;
209
210    while bracket_counter > 0 {
211        let byte = r.read_byte()?;
212
213        match byte {
214            b'\\' => {
215                dirty = true;
216
217                let _ = r.read_byte()?;
218            }
219            b'(' => bracket_counter += 1,
220            b')' => bracket_counter -= 1,
221            b'\n' | b'\r' => dirty = true,
222            _ => {}
223        };
224    }
225
226    Some(dirty)
227}
228
/// The two syntactic forms a string can be written in.
#[derive(Clone, Debug, PartialEq)]
enum InnerString<'a> {
    /// A string written as hex digits between `<` and `>`.
    Hex(HexString<'a>),
    /// A string written between parentheses.
    Literal(LiteralString<'a>),
}
234
/// A string.
///
/// Wraps either a hex string or a literal string; use [`String::get`] to
/// obtain the decoded bytes regardless of how the string was written.
#[derive(Clone, Debug, PartialEq)]
pub struct String<'a>(InnerString<'a>);
238
239impl<'a> String<'a> {
240    /// Returns the content of the string.
241    pub fn get(&self) -> Cow<'a, [u8]> {
242        match &self.0 {
243            InnerString::Hex(hex) => Cow::Owned(hex.get()),
244            InnerString::Literal(lit) => lit.get(),
245        }
246    }
247}
248
249impl<'a> From<HexString<'a>> for String<'a> {
250    fn from(value: HexString<'a>) -> Self {
251        Self(InnerString::Hex(value))
252    }
253}
254
255impl<'a> From<LiteralString<'a>> for String<'a> {
256    fn from(value: LiteralString<'a>) -> Self {
257        Self(InnerString::Literal(value))
258    }
259}
260
// Invokes the crate's `object!` macro for `String`.
// NOTE(review): the macro body is not visible in this file; presumably it
// generates the glue between `String` and the `Object::String` variant —
// confirm in `crate::object::macros`.
object!(String<'a>, String);
262
263impl Skippable for String<'_> {
264    fn skip(r: &mut Reader<'_>, is_content_stream: bool) -> Option<()> {
265        match r.peek_byte()? {
266            b'<' => HexString::skip(r, is_content_stream),
267            b'(' => LiteralString::skip(r, is_content_stream),
268            _ => None,
269        }
270    }
271}
272
273impl<'a> Readable<'a> for String<'a> {
274    fn read(r: &mut Reader<'a>, _: ReaderContext) -> Option<Self> {
275        let inner = match r.peek_byte()? {
276            b'<' => InnerString::Hex(r.read_without_context::<HexString>()?),
277            b'(' => InnerString::Literal(r.read_without_context::<LiteralString>()?),
278            _ => return None,
279        };
280
281        Some(String(inner))
282    }
283}
284
/// Returns `true` if `byte` is one of the ASCII digits `0` through `7`.
fn is_octal_digit(byte: u8) -> bool {
    (b'0'..=b'7').contains(&byte)
}
288
#[cfg(test)]
mod tests {
    use crate::object::string::{HexString, LiteralString, String};
    use crate::reader::Reader;

    /// Parses `input` as a hex string and returns its decoded bytes.
    fn hex(input: &str) -> Option<Vec<u8>> {
        Reader::new(input.as_bytes())
            .read_without_context::<HexString>()
            .map(|parsed| parsed.get())
    }

    /// Parses `input` as a literal string and returns its decoded bytes.
    fn literal(input: &str) -> Option<Vec<u8>> {
        Reader::new(input.as_bytes())
            .read_without_context::<LiteralString>()
            .map(|parsed| parsed.get().to_vec())
    }

    /// Parses `input` as a string of either form and returns its decoded bytes.
    fn string(input: &str) -> Option<Vec<u8>> {
        Reader::new(input.as_bytes())
            .read_without_context::<String>()
            .map(|parsed| parsed.get().to_vec())
    }

    #[test]
    fn hex_string_empty() {
        assert_eq!(hex("<>"), Some(vec![]));
    }

    #[test]
    fn hex_string_1() {
        assert_eq!(hex("<00010203>"), Some(vec![0x00, 0x01, 0x02, 0x03]));
    }

    #[test]
    fn hex_string_2() {
        assert_eq!(
            hex("<000102034>"),
            Some(vec![0x00, 0x01, 0x02, 0x03, 0x40])
        );
    }

    #[test]
    fn hex_string_trailing_1() {
        assert_eq!(
            hex("<000102034>dfgfg4"),
            Some(vec![0x00, 0x01, 0x02, 0x03, 0x40])
        );
    }

    #[test]
    fn hex_string_trailing_2() {
        assert_eq!(hex("<1  3 4>dfgfg4"), Some(vec![0x13, 0x40]));
    }

    #[test]
    fn hex_string_trailing_3() {
        assert_eq!(hex("<1>dfgfg4"), Some(vec![0x10]));
    }

    #[test]
    fn hex_string_invalid_1() {
        assert!(hex("<").is_none());
    }

    #[test]
    fn hex_string_invalid_2() {
        assert!(hex("34AD").is_none());
    }

    #[test]
    fn literal_string_empty() {
        assert_eq!(literal("()"), Some(b"".to_vec()));
    }

    #[test]
    fn literal_string_1() {
        assert_eq!(literal("(Hi there.)"), Some(b"Hi there.".to_vec()));
    }

    #[test]
    fn literal_string_2() {
        // Only checks that parsing succeeds; the content is not decoded here.
        assert!(
            Reader::new("(Hi \\777)".as_bytes())
                .read_without_context::<LiteralString>()
                .is_some()
        );
    }

    #[test]
    fn literal_string_3() {
        assert_eq!(literal("(Hi ) there.)"), Some(b"Hi ".to_vec()));
    }

    #[test]
    fn literal_string_4() {
        assert_eq!(
            literal("(Hi (()) there)"),
            Some(b"Hi (()) there".to_vec())
        );
    }

    #[test]
    fn literal_string_5() {
        assert_eq!(literal("(Hi \\()"), Some(b"Hi (".to_vec()));
    }

    #[test]
    fn literal_string_6() {
        assert_eq!(literal("(Hi \\\nthere)"), Some(b"Hi there".to_vec()));
    }

    #[test]
    fn literal_string_7() {
        assert_eq!(literal("(Hi \\05354)"), Some(b"Hi +54".to_vec()));
    }

    #[test]
    fn literal_string_8() {
        assert_eq!(string("(\\3)"), Some(b"3".to_vec()));
    }

    #[test]
    fn literal_string_9() {
        assert_eq!(string("(\\36)"), Some(b"36".to_vec()));
    }

    #[test]
    fn literal_string_10() {
        assert_eq!(string("(\\36ab)"), Some(b"36ab".to_vec()));
    }

    #[test]
    fn literal_string_trailing() {
        assert_eq!(literal("(Hi there.)abcde"), Some(b"Hi there.".to_vec()));
    }

    #[test]
    fn literal_string_invalid() {
        // In this case, we just ignore the solidus and treat it as literal numbers.
        assert_eq!(literal("(Hi \\778)"), Some(b"Hi 778".to_vec()));
    }

    #[test]
    fn string_1() {
        assert_eq!(string("(Hi there.)"), Some(b"Hi there.".to_vec()));
    }

    #[test]
    fn string_2() {
        assert_eq!(string("<00010203>"), Some(vec![0x00, 0x01, 0x02, 0x03]));
    }
}