Skip to main content

cbor_core/
parse.rs

1//! CBOR diagnostic notation parser (Section 2.3.6 of draft-rundgren-cbor-core-25).
2//!
3//! Parses diagnostic-notation strings into [`Value`]. Exposed through the
4//! standard [`FromStr`] trait: `"42".parse::<Value>()`.
5
6use std::{collections::BTreeMap, str::FromStr};
7
8use crate::{
9    Error, Float, SimpleValue, Value,
10    error::WithEof,
11    float::Inner,
12    io::{MyReader, SliceReader},
13    limits, tag,
14    util::{trim_leading_zeros, u8_from_base64_digit, u8_from_hex_digit, u64_from_slice},
15};
16
17impl FromStr for Value {
18    type Err = Error;
19
20    fn from_str(s: &str) -> Result<Self, Error> {
21        let mut parser = Parser::new(SliceReader(s.as_bytes()), limits::RECURSION_LIMIT);
22        parser.parse_complete()
23    }
24}
25
26// The underlying reader is forward-only, but the parser needs arbitrary
27// lookahead. Bytes pulled for peeking are held in `buf` until consumed,
28// so the stream is never read past the last byte the parser actually
29// inspects on a successful match.
30pub(crate) struct Parser<R: MyReader> {
31    reader: R,
32    buf: [u8; 16],
33    buf_len: usize,
34    depth: u16,
35}
36
37impl<R: MyReader> Parser<R> {
38    pub(crate) fn new(inner: R, recursion_limit: u16) -> Self {
39        Self {
40            reader: inner,
41            buf: [0; _],
42            buf_len: 0,
43            depth: recursion_limit,
44        }
45    }
46
47    /// Parse a single value and require that the input is then fully
48    /// consumed (trailing whitespace and comments are accepted, nothing
49    /// else). Used by in-memory decode paths.
50    pub(crate) fn parse_complete(&mut self) -> Result<Value, R::Error> {
51        self.skip_whitespace()?;
52        let value = self.parse_value()?;
53        self.skip_whitespace()?;
54        if !self.at_end()? {
55            Err(Error::InvalidFormat.into())
56        } else {
57            Ok(value)
58        }
59    }
60
61    /// Parse a single value from a stream. After the value, trailing
62    /// whitespace and comments are consumed up to either EOF or a
63    /// top-level separator comma (the comma is consumed). Anything
64    /// else is rejected. Used by [`DecodeOptions::read_from`] so the
65    /// caller can pull successive elements of a CBOR sequence by
66    /// calling `read_from` repeatedly.
67    pub(crate) fn parse_stream_item(&mut self) -> Result<Value, R::Error> {
68        self.skip_whitespace()?;
69        let value = self.parse_value()?;
70        self.consume_trailing_separator()?;
71        Ok(value)
72    }
73
74    /// Pull the next value of a sequence. Returns `Ok(None)` at a clean
75    /// end of input (including a trailing comma). After returning a
76    /// value, any trailing top-level comma is consumed, ready for the
77    /// next call.
78    pub(crate) fn parse_seq_item(&mut self) -> Result<Option<Value>, R::Error> {
79        self.skip_whitespace()?;
80        if self.at_end()? {
81            Ok(None)
82        } else {
83            let value = self.parse_value()?;
84            self.consume_trailing_separator()?;
85            Ok(Some(value))
86        }
87    }
88
89    /// After a value has been parsed, consume whitespace and comments
90    /// up to either EOF or a top-level comma (which is also consumed).
91    /// Anything else is a syntax error.
92    fn consume_trailing_separator(&mut self) -> Result<(), R::Error> {
93        self.skip_whitespace()?;
94        if self.at_end()? || self.eat(b',')? {
95            Ok(())
96        } else {
97            Err(Error::InvalidFormat.into())
98        }
99    }
100
101    fn enter(&mut self) -> Result<(), R::Error> {
102        self.depth = self.depth.checked_sub(1).ok_or(Error::NestingTooDeep)?;
103        Ok(())
104    }
105
106    fn leave(&mut self) {
107        self.depth += 1;
108    }
109
110    fn ensure(&mut self, n: usize) -> Result<(), R::Error> {
111        while self.buf_len < n {
112            let [b] = self.reader.read_bytes::<1>()?;
113            self.buf[self.buf_len] = b;
114            self.buf_len += 1;
115        }
116        Ok(())
117    }
118
119    fn peek(&mut self) -> Result<Option<u8>, R::Error> {
120        self.peek_at(0)
121    }
122
123    fn peek_at(&mut self, offset: usize) -> Result<Option<u8>, R::Error> {
124        match self.ensure(offset + 1) {
125            Ok(()) => Ok(Some(self.buf[offset])),
126            Err(e) if e.is_eof() => Ok(None),
127            Err(e) => Err(e),
128        }
129    }
130
131    fn advance(&mut self) -> Result<u8, R::Error> {
132        self.ensure(1)?;
133        let byte = self.buf[0];
134        self.buf.copy_within(1..self.buf_len, 0);
135        self.buf_len -= 1;
136        Ok(byte)
137    }
138
139    fn skip(&mut self, len: usize) -> Result<(), R::Error> {
140        debug_assert!(len <= self.buf_len);
141        self.buf.copy_within(len..self.buf_len, 0);
142        self.buf_len -= len;
143        Ok(())
144    }
145
146    fn eat(&mut self, byte: u8) -> Result<bool, R::Error> {
147        if self.peek()? == Some(byte) {
148            self.skip(1)?;
149            Ok(true)
150        } else {
151            Ok(false)
152        }
153    }
154
155    fn expect(&mut self, byte: u8) -> Result<(), R::Error> {
156        if self.eat(byte)? {
157            Ok(())
158        } else {
159            Err(Error::InvalidFormat.into())
160        }
161    }
162
163    // Tentatively match `prefix` byte-by-byte.
164    fn consume(&mut self, prefix: &[u8]) -> Result<bool, R::Error> {
165        for (i, &b) in prefix.iter().enumerate() {
166            if self.peek_at(i)? != Some(b) {
167                return Ok(false);
168            }
169        }
170        self.skip(prefix.len())?;
171        Ok(true)
172    }
173
174    fn skip_whitespace(&mut self) -> Result<(), R::Error> {
175        loop {
176            while matches!(self.peek()?, Some(b' ' | b'\t' | b'\r' | b'\n')) {
177                self.skip(1)?;
178            }
179
180            if self.eat(b'#')? {
181                while let Some(b) = self.peek()?
182                    && b != b'\n'
183                {
184                    self.skip(1)?;
185                }
186            } else if self.eat(b'/')? {
187                while self.advance()? != b'/' {}
188            } else {
189                return Ok(());
190            }
191        }
192    }
193
194    fn at_end(&mut self) -> Result<bool, R::Error> {
195        Ok(self.peek()?.is_none())
196    }
197
198    fn parse_value(&mut self) -> Result<Value, R::Error> {
199        self.skip_whitespace()?;
200        let byte = self.peek()?.ok_or(Error::UnexpectedEof)?;
201        match byte {
202            b'[' => self.parse_array(),
203            b'{' => self.parse_map(),
204            b'"' => self.parse_text_string(),
205            b'\'' => self.parse_single_quoted_bstr(),
206            b'<' => self.parse_embedded_bstr(),
207            b'-' => {
208                if self.consume(b"-Infinity")? {
209                    Ok(Value::float(f64::NEG_INFINITY))
210                } else {
211                    self.parse_number_or_tag()
212                }
213            }
214            b'0'..=b'9' => self.parse_number_or_tag(),
215            b'N' if self.consume(b"NaN")? => Ok(Value::Float(Float(Inner::F16(0x7e00)))),
216            b'I' if self.consume(b"Infinity")? => Ok(Value::float(f64::INFINITY)),
217            b't' if self.consume(b"true")? => Ok(Value::from(true)),
218            b'f' if self.consume(b"false")? => Ok(Value::from(false)),
219            b'n' if self.consume(b"null")? => Ok(Value::null()),
220            b's' if self.consume(b"simple(")? => self.parse_simple_tail(),
221            b'h' if self.consume(b"h\'")? => self.parse_hex_bstr_tail(),
222            b'b' if self.consume(b"b64'")? => self.parse_b64_bstr_tail(),
223            b'f' if self.consume(b"float'")? => self.parse_float_hex_tail(),
224            _ => Err(Error::InvalidFormat.into()),
225        }
226    }
227
228    fn parse_array(&mut self) -> Result<Value, R::Error> {
229        self.expect(b'[')?;
230        self.skip_whitespace()?;
231        let mut items = Vec::new();
232        if self.eat(b']')? {
233            Ok(Value::Array(items))
234        } else {
235            self.enter()?;
236            let result = loop {
237                items.push(self.parse_value()?);
238                self.skip_whitespace()?;
239                if self.eat(b',')? {
240                    continue;
241                } else if self.eat(b']')? {
242                    break Ok(Value::Array(items));
243                } else {
244                    break Err(Error::InvalidFormat.into());
245                }
246            };
247            self.leave();
248            result
249        }
250    }
251
252    fn parse_map(&mut self) -> Result<Value, R::Error> {
253        self.expect(b'{')?;
254        self.skip_whitespace()?;
255        let mut map: BTreeMap<Value, Value> = BTreeMap::new();
256        if self.eat(b'}')? {
257            Ok(Value::Map(map))
258        } else {
259            self.enter()?;
260            let result = loop {
261                let key = self.parse_value()?;
262                self.skip_whitespace()?;
263                if let Err(error) = self.expect(b':') {
264                    break Err(error);
265                }
266                let value = self.parse_value()?;
267                if map.insert(key, value).is_some() {
268                    break Err(Error::NonDeterministic.into());
269                }
270                self.skip_whitespace()?;
271                if self.eat(b',')? {
272                    continue;
273                } else if self.eat(b'}')? {
274                    break Ok(Value::Map(map));
275                } else {
276                    break Err(Error::InvalidFormat.into());
277                }
278            };
279            self.leave();
280            result
281        }
282    }
283
284    fn parse_number_or_tag(&mut self) -> Result<Value, R::Error> {
285        let negative = self.eat(b'-')?;
286
287        let value = if self.peek()? == Some(b'0') {
288            match self.peek_at(1)? {
289                Some(b'b' | b'B') => {
290                    self.skip(2)?;
291                    self.parse_integer_base(negative, 2)?
292                }
293                Some(b'o' | b'O') => {
294                    self.skip(2)?;
295                    self.parse_integer_base(negative, 8)?
296                }
297                Some(b'x' | b'X') => {
298                    self.skip(2)?;
299                    self.parse_integer_base(negative, 16)?
300                }
301                _ => self.parse_decimal(negative)?,
302            }
303        } else {
304            self.parse_decimal(negative)?
305        };
306
307        self.skip_whitespace()?;
308
309        if self.eat(b'(')? {
310            let Value::Unsigned(tag_number) = value else {
311                return Err(Error::InvalidFormat.into());
312            };
313            self.enter()?;
314            let inner = self.parse_value();
315            self.leave();
316            let inner = inner?;
317            self.skip_whitespace()?;
318            self.expect(b')')?;
319            Ok(Value::tag(tag_number, inner))
320        } else {
321            Ok(value)
322        }
323    }
324
325    fn parse_decimal(&mut self, negative: bool) -> Result<Value, R::Error> {
326        let mut int_digits: Vec<u8> = Vec::new();
327        while let Some(b) = self.peek()?
328            && b.is_ascii_digit()
329        {
330            int_digits.push(b);
331            self.skip(1)?;
332        }
333        if int_digits.is_empty() {
334            return Err(Error::InvalidFormat.into());
335        }
336        if self.peek()? == Some(b'.') {
337            let mut text: Vec<u8> = int_digits;
338            text.push(self.advance()?);
339            let frac_start = text.len();
340            while let Some(b) = self.peek()?
341                && b.is_ascii_digit()
342            {
343                text.push(b);
344                self.skip(1)?;
345            }
346            if text.len() == frac_start {
347                return Err(Error::InvalidFormat.into());
348            }
349            if matches!(self.peek()?, Some(b'e' | b'E')) {
350                text.push(self.advance()?);
351                if matches!(self.peek()?, Some(b'+' | b'-')) {
352                    text.push(self.advance()?);
353                }
354                let exp_start = text.len();
355                while let Some(b) = self.peek()?
356                    && b.is_ascii_digit()
357                {
358                    text.push(b);
359                    self.skip(1)?;
360                }
361                if text.len() == exp_start {
362                    return Err(Error::InvalidFormat.into());
363                }
364            }
365            let text = std::str::from_utf8(&text).unwrap();
366            let mut parsed: f64 = text.parse().map_err(|_| Error::InvalidFormat)?;
367            if negative {
368                parsed = -parsed;
369            }
370            return Ok(Value::float(parsed));
371        }
372
373        let bytes = digits_to_be_bytes(&int_digits, 10)?;
374        Ok(be_bytes_to_value(&bytes, negative)?)
375    }
376
377    fn parse_integer_base(&mut self, negative: bool, base: u32) -> Result<Value, R::Error> {
378        let mut digits: Vec<u8> = Vec::new();
379        let mut last_was_digit = false;
380        while let Some(b) = self.peek()? {
381            if b == b'_' {
382                if !last_was_digit {
383                    return Err(Error::InvalidFormat.into());
384                } else {
385                    self.skip(1)?;
386                    last_was_digit = false;
387                    continue;
388                }
389            } else {
390                let is_valid = match base {
391                    2 => matches!(b, b'0' | b'1'),
392                    8 => matches!(b, b'0'..=b'7'),
393                    16 => b.is_ascii_hexdigit(),
394                    _ => unreachable!(),
395                };
396                if !is_valid {
397                    break;
398                }
399                digits.push(b);
400                last_was_digit = true;
401                self.skip(1)?;
402            }
403        }
404        if digits.is_empty() || !last_was_digit {
405            Err(Error::InvalidFormat.into())
406        } else {
407            let bytes = digits_to_be_bytes(&digits, base)?;
408            Ok(be_bytes_to_value(&bytes, negative)?)
409        }
410    }
411
412    fn parse_simple_tail(&mut self) -> Result<Value, R::Error> {
413        self.skip_whitespace()?;
414        let mut digits: Vec<u8> = Vec::new();
415        while let Some(b) = self.peek()?
416            && b.is_ascii_digit()
417        {
418            digits.push(b);
419            self.skip(1)?;
420        }
421        if digits.is_empty() {
422            Err(Error::InvalidFormat.into())
423        } else {
424            let text = std::str::from_utf8(&digits).unwrap();
425            let number: u8 = text.parse().map_err(|_| Error::InvalidFormat)?;
426            self.skip_whitespace()?;
427            self.expect(b')')?;
428            Ok(Value::from(SimpleValue::try_from(number)?))
429        }
430    }
431
432    fn parse_float_hex_tail(&mut self) -> Result<Value, R::Error> {
433        let mut hex: Vec<u8> = Vec::new();
434        while let Some(b) = self.peek()?
435            && b != b'\''
436        {
437            hex.push(b);
438            self.skip(1)?;
439        }
440        self.expect(b'\'')?;
441        let mut bits: u64 = 0;
442        for &byte in &hex {
443            let digit = u8_from_hex_digit(byte)? as u64;
444            bits = (bits << 4) | digit;
445        }
446        match hex.len() {
447            4 => Ok(Value::Float(Float::from_bits_u16(bits as u16))),
448            8 => Ok(Value::Float(Float::from_bits_u32(bits as u32)?)),
449            16 => Ok(Value::Float(Float::from_bits_u64(bits)?)),
450            _ => Err(Error::InvalidFormat.into()),
451        }
452    }
453
454    fn parse_hex_bstr_tail(&mut self) -> Result<Value, R::Error> {
455        let mut bytes = Vec::new();
456        let mut half: Option<u8> = None;
457        loop {
458            match self.advance()? {
459                b'\'' => {
460                    if half.is_some() {
461                        return Err(Error::InvalidFormat.into());
462                    } else {
463                        return Ok(Value::ByteString(bytes));
464                    }
465                }
466                b' ' | b'\t' | b'\r' | b'\n' => continue,
467                byte => {
468                    let digit = u8_from_hex_digit(byte)?;
469                    match half.take() {
470                        None => half = Some(digit),
471                        Some(high) => bytes.push((high << 4) | digit),
472                    }
473                }
474            }
475        }
476    }
477
478    fn parse_b64_bstr_tail(&mut self) -> Result<Value, R::Error> {
479        let mut data: Vec<u8> = Vec::new();
480        loop {
481            match self.advance()? {
482                b'\'' => return Ok(Value::ByteString(decode_base64(&data)?)),
483                b' ' | b'\t' | b'\r' | b'\n' => continue,
484                byte => data.push(byte),
485            }
486        }
487    }
488
489    fn parse_text_string(&mut self) -> Result<Value, R::Error> {
490        self.expect(b'"')?;
491        let mut buf: Vec<u8> = Vec::new();
492        loop {
493            match self.advance()? {
494                b'"' => {
495                    let text = String::try_from(buf).map_err(|_| Error::InvalidUtf8)?;
496                    return Ok(Value::from(text));
497                }
498                b'\r' => {
499                    self.eat(b'\n')?;
500                    buf.push(b'\n');
501                }
502                b'\\' => {
503                    self.read_escape_into_string(&mut buf)?;
504                }
505                byte => {
506                    buf.push(byte);
507                }
508            }
509        }
510    }
511
512    fn parse_single_quoted_bstr(&mut self) -> Result<Value, R::Error> {
513        self.expect(b'\'')?;
514        let mut bytes: Vec<u8> = Vec::new();
515        loop {
516            match self.advance()? {
517                b'\'' => {
518                    return Ok(Value::ByteString(bytes));
519                }
520                b'\r' => {
521                    self.eat(b'\n')?;
522                    bytes.push(b'\n');
523                }
524                b'\\' => {
525                    self.read_escape_into_string(&mut bytes)?;
526                }
527                byte => {
528                    bytes.push(byte);
529                }
530            }
531        }
532    }
533
534    /// Consume an escape sequence (after the leading backslash) and append
535    /// its decoded value to `out`. Returns `false` if the escape was a
536    /// line continuation that produces no output.
537    fn read_escape_into_string(&mut self, out: &mut Vec<u8>) -> Result<bool, R::Error> {
538        let byte = self.advance()?;
539        let ch = match byte {
540            b'\'' => '\'',
541            b'"' => '"',
542            b'\\' => '\\',
543            b'b' => '\u{08}',
544            b'f' => '\u{0C}',
545            b'n' => '\n',
546            b'r' => '\r',
547            b't' => '\t',
548            b'u' => self.read_u_escape()?,
549            b'\n' => return Ok(false),
550            b'\r' => {
551                self.eat(b'\n')?;
552                return Ok(false);
553            }
554            _ => return Err(Error::InvalidFormat.into()),
555        };
556        let mut buf = [0; 4];
557        let s = ch.encode_utf8(&mut buf);
558        out.extend_from_slice(s.as_bytes());
559
560        // out.push(ch);
561        Ok(true)
562    }
563
564    fn read_u_escape(&mut self) -> Result<char, R::Error> {
565        let high = self.read_4_hex()?;
566        if (0xD800..=0xDBFF).contains(&high) {
567            if !self.consume(b"\\u")? {
568                return Err(Error::InvalidFormat.into());
569            }
570            let low = self.read_4_hex()?;
571            if !(0xDC00..=0xDFFF).contains(&low) {
572                return Err(Error::InvalidFormat.into());
573            }
574            let code = 0x10000 + ((high - 0xD800) << 10) + (low - 0xDC00);
575            char::from_u32(code).ok_or_else(|| Error::InvalidFormat.into())
576        } else if (0xDC00..=0xDFFF).contains(&high) {
577            Err(Error::InvalidFormat.into())
578        } else {
579            char::from_u32(high).ok_or_else(|| Error::InvalidFormat.into())
580        }
581    }
582
583    fn read_4_hex(&mut self) -> Result<u32, R::Error> {
584        let mut code: u32 = 0;
585        for _ in 0..4 {
586            let byte = self.advance()?;
587            let digit = u8_from_hex_digit(byte)? as u32;
588            code = (code << 4) | digit;
589        }
590        Ok(code)
591    }
592
593    fn parse_embedded_bstr(&mut self) -> Result<Value, R::Error> {
594        self.expect(b'<')?;
595        self.expect(b'<')?;
596        let mut buf = Vec::new();
597        self.skip_whitespace()?;
598        if self.consume(b">>")? {
599            Ok(Value::ByteString(buf))
600        } else {
601            self.enter()?;
602            let result = loop {
603                let value = self.parse_value()?;
604                buf.extend(value.encode());
605                self.skip_whitespace()?;
606                if self.eat(b',')? {
607                    continue;
608                } else if self.consume(b">>")? {
609                    break Ok(Value::ByteString(buf));
610                } else {
611                    break Err(Error::InvalidFormat.into());
612                }
613            };
614            self.leave();
615            result
616        }
617    }
618}
619
620fn decode_base64(input: &[u8]) -> Result<Vec<u8>, Error> {
621    let mut data = input;
622    while let Some(stripped) = data.strip_suffix(b"=") {
623        data = stripped;
624    }
625
626    if data.len() % 4 == 1 {
627        return Err(Error::InvalidFormat);
628    }
629
630    let mut out = Vec::with_capacity(data.len() * 3 / 4);
631    let mut buf: u32 = 0;
632    let mut bits: u32 = 0;
633
634    for &byte in data {
635        let value = u8_from_base64_digit(byte)? as u32;
636        buf = (buf << 6) | value;
637        bits += 6;
638        if bits >= 8 {
639            bits -= 8;
640            out.push((buf >> bits) as u8);
641            buf &= (1 << bits) - 1;
642        }
643    }
644
645    if buf == 0 { Ok(out) } else { Err(Error::InvalidFormat) }
646}
647
648/// Convert ASCII digits in the given base to a big-endian byte representation
649/// of the magnitude.
650fn digits_to_be_bytes(digits: &[u8], base: u32) -> Result<Vec<u8>, Error> {
651    let mut result = vec![0u8];
652
653    for &digit in digits {
654        let value = match digit {
655            b'0'..=b'9' => (digit - b'0') as u32,
656            b'a'..=b'f' => (digit - b'a' + 10) as u32,
657            b'A'..=b'F' => (digit - b'A' + 10) as u32,
658            _ => return Err(Error::InvalidFormat),
659        };
660
661        if value >= base {
662            return Err(Error::InvalidFormat);
663        }
664
665        let mut carry = value;
666
667        for byte in result.iter_mut().rev() {
668            let product = (*byte as u32) * base + carry;
669            *byte = product as u8;
670            carry = product >> 8;
671        }
672
673        while carry > 0 {
674            result.insert(0, carry as u8);
675            carry >>= 8;
676        }
677    }
678
679    Ok(result)
680}
681
682/// Construct a CBOR integer value from a big-endian magnitude and a sign.
683fn be_bytes_to_value(bytes: &[u8], negative: bool) -> Result<Value, Error> {
684    let bytes = trim_leading_zeros(bytes);
685
686    if bytes.is_empty() {
687        Ok(Value::Unsigned(0))
688    } else if !negative {
689        if bytes.len() <= 8 {
690            Ok(Value::Unsigned(u64_from_slice(bytes)?))
691        } else {
692            Ok(Value::tag(tag::POS_BIG_INT, Value::from(bytes)))
693        }
694    } else {
695        let mut sub = bytes.to_vec();
696        let mut idx = sub.len();
697        loop {
698            idx -= 1;
699            if sub[idx] > 0 {
700                sub[idx] -= 1;
701                break;
702            } else {
703                sub[idx] = 0xff;
704            }
705        }
706        let sub = trim_leading_zeros(&sub);
707        if sub.len() <= 8 {
708            Ok(Value::Negative(u64_from_slice(sub)?))
709        } else {
710            Ok(Value::tag(tag::NEG_BIG_INT, Value::from(sub)))
711        }
712    }
713}