hayro_syntax/object/
string.rs

1//! String objects.
2
3use crate::object::macros::object;
4use crate::object::{Object, ObjectLike};
5use crate::reader::{Readable, Reader, Skippable};
6use crate::trivia::is_white_space_character;
7use crate::xref::XRef;
8use std::borrow::Cow;
9
/// A hex-encoded string.
///
/// The first field holds the raw bytes found between the enclosing `<` and
/// `>`. The second field is a flag that is set when those bytes cannot be
/// hex-decoded as they are, because they contain white-space characters or
/// have an odd length; `get` cleans them up before decoding in that case.
#[derive(PartialEq, Eq, Copy, Clone, Debug)]
pub struct HexString<'a>(&'a [u8], bool);
13
14impl HexString<'_> {
15    /// Returns the content of the string.
16    pub fn get(&self) -> Vec<u8> {
17        if self.1 {
18            let mut cleaned = Vec::with_capacity(self.0.len() + 1);
19
20            for b in self.0.iter().copied() {
21                if !is_white_space_character(b) {
22                    cleaned.push(b);
23                }
24            }
25
26            if cleaned.len() % 2 != 0 {
27                cleaned.push(b'0');
28            }
29
30            // We made sure while parsing that it is a valid hex string.
31            hex::decode(cleaned).unwrap()
32        } else {
33            // We made sure while parsing that it is a valid hex string.
34            hex::decode(self.0).unwrap()
35        }
36    }
37}
38
39impl Skippable for HexString<'_> {
40    fn skip<const PLAIN: bool>(r: &mut Reader<'_>) -> Option<()> {
41        parse_hex(r).map(|_| {})
42    }
43}
44
45impl<'a> Readable<'a> for HexString<'a> {
46    fn read<const PLAIN: bool>(r: &mut Reader<'a>, _: &'a XRef) -> Option<Self> {
47        let start = r.offset();
48        let mut dirty = parse_hex(r)?;
49        let end = r.offset();
50
51        // Exclude outer brackets.
52        let result = r.range(start + 1..end - 1).unwrap();
53        dirty |= result.len() % 2 != 0;
54
55        Some(HexString(result, dirty))
56    }
57}
58
59impl<'a> TryFrom<Object<'a>> for HexString<'a> {
60    type Error = ();
61
62    fn try_from(value: Object<'a>) -> Result<Self, Self::Error> {
63        match value {
64            Object::String(String(InnerString::Hex(h))) => Ok(h),
65            _ => Err(()),
66        }
67    }
68}
69
// Empty impl: `HexString` opts into `ObjectLike` without defining any items
// of its own here.
impl<'a> ObjectLike<'a> for HexString<'a> {}
71
72fn parse_hex(r: &mut Reader<'_>) -> Option<bool> {
73    let mut has_whitespace = false;
74
75    r.forward_tag(b"<")?;
76    while let Some(b) = r.peek_byte() {
77        let is_hex = b.is_ascii_hexdigit();
78        let is_whitespace = is_white_space_character(b);
79        has_whitespace |= is_whitespace;
80
81        if !is_hex && !is_whitespace {
82            break;
83        }
84
85        r.read_byte()?;
86    }
87    r.forward_tag(b">")?;
88
89    Some(has_whitespace)
90}
91
/// A literal string.
///
/// The first field holds the raw bytes found between the outermost `(` and
/// `)`. The second field is a flag that is set when parsing encountered a
/// REVERSE SOLIDUS or an end-of-line byte, meaning the raw bytes have to be
/// processed by `get` before they represent the string's value.
#[derive(Debug, Eq, PartialEq, Clone, Copy, Hash)]
pub struct LiteralString<'a>(&'a [u8], bool);
95
96impl<'a> LiteralString<'a> {
97    /// Returns the content of the string.
98    pub fn get(&self) -> Cow<'a, [u8]> {
99        if self.1 {
100            let mut cleaned = vec![];
101            let mut r = Reader::new(self.0);
102
103            while let Some(byte) = r.read_byte() {
104                match byte {
105                    b'\\' => {
106                        let next = r.read_byte().unwrap();
107
108                        if is_octal_digit(next) {
109                            let second = r.read_byte().unwrap();
110                            let third = r.read_byte().unwrap();
111                            let bytes = [next, second, third];
112                            let str = std::str::from_utf8(&bytes).unwrap();
113                            let num = u8::from_str_radix(str, 8).unwrap();
114                            cleaned.push(num);
115                        } else {
116                            match next {
117                                b'n' => cleaned.push(0xA),
118                                b'r' => cleaned.push(0xD),
119                                b't' => cleaned.push(0x9),
120                                b'b' => cleaned.push(0x8),
121                                b'f' => cleaned.push(0xC),
122                                b'(' => cleaned.push(b'('),
123                                b')' => cleaned.push(b')'),
124                                b'\\' => cleaned.push(b'\\'),
125                                b'\n' | b'\r' => {
126                                    // A conforming reader shall disregard the REVERSE SOLIDUS
127                                    // and the end-of-line marker following it when reading
128                                    // the string; the resulting string value shall be
129                                    // identical to that which would be read if the string
130                                    // were not split.
131                                    r.skip_eol_characters();
132                                }
133                                _ => unreachable!(),
134                            }
135                        }
136                    }
137                    // An end-of-line marker appearing within a literal string
138                    // without a preceding REVERSE SOLIDUS shall be treated as
139                    // a byte value of (0Ah), irrespective of whether the end-of-line
140                    // marker was a CARRIAGE RETURN (0Dh), a LINE FEED (0Ah), or both.
141                    b'\n' | b'\r' => {
142                        cleaned.push(b'\n');
143                        r.skip_eol_characters();
144                    }
145                    other => cleaned.push(other),
146                }
147            }
148
149            Cow::Owned(cleaned)
150        } else {
151            Cow::Borrowed(self.0)
152        }
153    }
154}
155
156impl Skippable for LiteralString<'_> {
157    fn skip<const PLAIN: bool>(r: &mut Reader<'_>) -> Option<()> {
158        parse_literal(r).map(|_| ())
159    }
160}
161
162impl<'a> Readable<'a> for LiteralString<'a> {
163    fn read<const PLAIN: bool>(r: &mut Reader<'a>, _: &XRef) -> Option<Self> {
164        let start = r.offset();
165        let dirty = parse_literal(r)?;
166        let end = r.offset();
167
168        // Exclude outer brackets
169        let result = r.range(start + 1..end - 1).unwrap();
170
171        Some(LiteralString(result, dirty))
172    }
173}
174
175impl<'a> TryFrom<Object<'a>> for LiteralString<'a> {
176    type Error = ();
177
178    fn try_from(value: Object<'a>) -> Result<Self, Self::Error> {
179        match value {
180            Object::String(String(InnerString::Literal(l))) => Ok(l),
181            _ => Err(()),
182        }
183    }
184}
185
// Empty impl: `LiteralString` opts into `ObjectLike` without defining any
// items of its own here.
impl<'a> ObjectLike<'a> for LiteralString<'a> {}
187
188fn parse_literal(r: &mut Reader<'_>) -> Option<bool> {
189    r.forward_tag(b"(")?;
190    let mut bracket_counter = 1;
191    let mut dirty = false;
192
193    while bracket_counter > 0 {
194        let byte = r.read_byte()?;
195
196        match byte {
197            b'\\' => {
198                dirty = true;
199
200                let next = r.read_byte()?;
201                if is_octal_digit(next) {
202                    r.eat(|b| is_octal_digit(b))?;
203                    r.eat(|b| is_octal_digit(b))?;
204                } else if !matches!(
205                    next,
206                    b'n' | b'r' | b't' | b'b' | b'f' | b'(' | b')' | b'\\' | b'\n' | b'\r'
207                ) {
208                    return None;
209                }
210            }
211            b'(' => bracket_counter += 1,
212            b')' => bracket_counter -= 1,
213            b'\n' | b'\r' => dirty = true,
214            _ => {}
215        };
216    }
217
218    Some(dirty)
219}
220
/// The two forms in which a string can be stored.
#[derive(Clone, Debug, PartialEq)]
enum InnerString<'a> {
    /// A string read in hex form (`<...>`).
    Hex(HexString<'a>),
    /// A string read in literal form (`(...)`).
    Literal(LiteralString<'a>),
}
226
/// A string.
///
/// Wraps either a [`HexString`] or a [`LiteralString`]. [`String::get`]
/// returns the decoded bytes regardless of which form is stored.
#[derive(Clone, Debug, PartialEq)]
pub struct String<'a>(InnerString<'a>);
230
231impl<'a> String<'a> {
232    /// Returns the content of the string.
233    pub fn get(&self) -> Cow<'a, [u8]> {
234        match &self.0 {
235            InnerString::Hex(hex) => Cow::Owned(hex.get()),
236            InnerString::Literal(lit) => lit.get(),
237        }
238    }
239}
240
241impl<'a> From<HexString<'a>> for String<'a> {
242    fn from(value: HexString<'a>) -> Self {
243        Self(InnerString::Hex(value))
244    }
245}
246
247impl<'a> From<LiteralString<'a>> for String<'a> {
248    fn from(value: LiteralString<'a>) -> Self {
249        Self(InnerString::Literal(value))
250    }
251}
252
// Invokes the `object!` macro imported from `crate::object::macros` for
// `String`, passing `String` as the second argument as well.
// NOTE(review): presumably this wires `String` up with the `Object::String`
// variant; confirm against the macro definition.
object!(String<'a>, String);
254
255impl Skippable for String<'_> {
256    fn skip<const PLAIN: bool>(r: &mut Reader<'_>) -> Option<()> {
257        match r.peek_byte()? {
258            b'<' => HexString::skip::<true>(r),
259            b'(' => LiteralString::skip::<true>(r),
260            _ => None,
261        }
262    }
263}
264
265impl<'a> Readable<'a> for String<'a> {
266    fn read<const PLAIN: bool>(r: &mut Reader<'a>, _: &'a XRef) -> Option<Self> {
267        let inner = match r.peek_byte()? {
268            b'<' => InnerString::Hex(r.read_without_xref::<HexString>()?),
269            b'(' => InnerString::Literal(r.read_without_xref::<LiteralString>()?),
270            _ => return None,
271        };
272
273        Some(String(inner))
274    }
275}
276
/// Returns `true` if `byte` is one of the ASCII digits `0` through `7`.
fn is_octal_digit(byte: u8) -> bool {
    (b'0'..=b'7').contains(&byte)
}
280
#[cfg(test)]
mod tests {
    use crate::object::string::{HexString, LiteralString, String};
    use crate::reader::Reader;

    /// Reads a `HexString` from the start of `input` and decodes it.
    fn hex(input: &str) -> Option<Vec<u8>> {
        Reader::new(input.as_bytes())
            .read_without_xref::<HexString>()
            .map(|s| s.get())
    }

    /// Reads a `LiteralString` from the start of `input` and decodes it.
    fn literal(input: &str) -> Option<Vec<u8>> {
        Reader::new(input.as_bytes())
            .read_without_xref::<LiteralString>()
            .map(|s| s.get().to_vec())
    }

    /// Reports whether a `LiteralString` can be read from the start of
    /// `input`, without decoding its content.
    fn literal_parses(input: &str) -> bool {
        Reader::new(input.as_bytes())
            .read_without_xref::<LiteralString>()
            .is_some()
    }

    /// Reads a `String` from the start of `input` and decodes it.
    fn string(input: &str) -> Option<Vec<u8>> {
        Reader::new(input.as_bytes())
            .read_without_xref::<String>()
            .map(|s| s.get().to_vec())
    }

    #[test]
    fn hex_string_empty() {
        assert_eq!(hex("<>"), Some(vec![]));
    }

    #[test]
    fn hex_string_1() {
        assert_eq!(hex("<00010203>"), Some(vec![0x00, 0x01, 0x02, 0x03]));
    }

    #[test]
    fn hex_string_2() {
        assert_eq!(
            hex("<000102034>"),
            Some(vec![0x00, 0x01, 0x02, 0x03, 0x40])
        );
    }

    #[test]
    fn hex_string_trailing_1() {
        assert_eq!(
            hex("<000102034>dfgfg4"),
            Some(vec![0x00, 0x01, 0x02, 0x03, 0x40])
        );
    }

    #[test]
    fn hex_string_trailing_2() {
        assert_eq!(hex("<1  3 4>dfgfg4"), Some(vec![0x13, 0x40]));
    }

    #[test]
    fn hex_string_trailing_3() {
        assert_eq!(hex("<1>dfgfg4"), Some(vec![0x10]));
    }

    #[test]
    fn hex_string_invalid_1() {
        assert_eq!(hex("<"), None);
    }

    #[test]
    fn hex_string_invalid_2() {
        assert_eq!(hex("34AD"), None);
    }

    #[test]
    fn literal_string_empty() {
        assert_eq!(literal("()"), Some(b"".to_vec()));
    }

    #[test]
    fn literal_string_1() {
        assert_eq!(literal("(Hi there.)"), Some(b"Hi there.".to_vec()));
    }

    #[test]
    fn literal_string_2() {
        // Only reading is checked here; the content is not decoded.
        assert!(literal_parses("(Hi \\777)"));
    }

    #[test]
    fn literal_string_3() {
        assert_eq!(literal("(Hi ) there.)"), Some(b"Hi ".to_vec()));
    }

    #[test]
    fn literal_string_4() {
        assert_eq!(
            literal("(Hi (()) there)"),
            Some(b"Hi (()) there".to_vec())
        );
    }

    #[test]
    fn literal_string_5() {
        assert_eq!(literal("(Hi \\()"), Some(b"Hi (".to_vec()));
    }

    #[test]
    fn literal_string_6() {
        assert_eq!(literal("(Hi \\\nthere)"), Some(b"Hi there".to_vec()));
    }

    #[test]
    fn literal_string_7() {
        assert_eq!(literal("(Hi \\05354)"), Some(b"Hi +54".to_vec()));
    }

    #[test]
    fn literal_string_trailing() {
        assert_eq!(literal("(Hi there.)abcde"), Some(b"Hi there.".to_vec()));
    }

    #[test]
    fn literal_string_invalid() {
        assert!(!literal_parses("(Hi \\778)"));
    }

    #[test]
    fn string_1() {
        assert_eq!(string("(Hi there.)"), Some(b"Hi there.".to_vec()));
    }

    #[test]
    fn string_2() {
        assert_eq!(string("<00010203>"), Some(vec![0x00, 0x01, 0x02, 0x03]));
    }
}