Skip to main content

cbor_core/
parse.rs

//! CBOR diagnostic notation parser (Section 2.3.6 of draft-rundgren-cbor-core-25).
//!
//! Parses diagnostic-notation strings into [`Value`]. Exposed through the
//! standard [`FromStr`] trait: `"42".parse::<Value>()`.
5
6use std::{collections::BTreeMap, str::FromStr};
7
8use crate::{
9    Error, Float, Result, SimpleValue, Value,
10    float::Inner,
11    tag,
12    util::{trim_leading_zeros, u8_from_base64_digit, u8_from_hex_digit, u64_from_slice},
13};
14
15impl FromStr for Value {
16    type Err = Error;
17
18    fn from_str(s: &str) -> Result<Self> {
19        let mut parser = Parser::new(s.as_bytes());
20        parser.skip_ws()?;
21        let value = parser.parse_value()?;
22        parser.skip_ws()?;
23        if parser.pos != parser.src.len() {
24            return Err(Error::InvalidFormat);
25        }
26        Ok(value)
27    }
28}
29
/// Byte-oriented cursor over a diagnostic-notation text.
///
/// The parsing methods read from `src` and move `pos` forward as they
/// consume input.
struct Parser<'a> {
    /// The complete input, as raw bytes.
    src: &'a [u8],
    /// Index into `src` of the next byte to be read.
    pos: usize,
}
34
35impl<'a> Parser<'a> {
36    fn new(src: &'a [u8]) -> Self {
37        Self { src, pos: 0 }
38    }
39
40    fn peek(&self) -> Option<u8> {
41        self.src.get(self.pos).copied()
42    }
43
44    fn peek_at(&self, offset: usize) -> Option<u8> {
45        self.src.get(self.pos + offset).copied()
46    }
47
48    fn advance(&mut self) -> Result<u8> {
49        let byte = self.peek().ok_or(Error::InvalidFormat)?;
50        self.pos += 1;
51        Ok(byte)
52    }
53
54    fn eat(&mut self, byte: u8) -> bool {
55        let found = self.peek() == Some(byte);
56        if found {
57            self.pos += 1
58        }
59        found
60    }
61
62    fn expect(&mut self, byte: u8) -> Result<()> {
63        if self.eat(byte) {
64            Ok(())
65        } else {
66            Err(Error::InvalidFormat)
67        }
68    }
69
70    fn starts_with(&self, prefix: &[u8]) -> bool {
71        self.src[self.pos..].starts_with(prefix)
72    }
73
74    fn consume(&mut self, prefix: &[u8]) -> bool {
75        let found = self.starts_with(prefix);
76        if found {
77            self.pos += prefix.len();
78        }
79        found
80    }
81
82    fn skip_ws(&mut self) -> Result<()> {
83        loop {
84            match self.peek() {
85                Some(b' ' | b'\t' | b'\r' | b'\n') => self.pos += 1,
86                Some(b'#') => {
87                    while let Some(b) = self.peek() {
88                        self.pos += 1;
89                        if b == b'\n' {
90                            break;
91                        }
92                    }
93                }
94                Some(b'/') => {
95                    self.pos += 1;
96                    loop {
97                        match self.peek() {
98                            Some(b'/') => {
99                                self.pos += 1;
100                                break;
101                            }
102                            Some(_) => self.pos += 1,
103                            None => return Err(Error::InvalidFormat),
104                        }
105                    }
106                }
107                _ => return Ok(()),
108            }
109        }
110    }
111
112    fn parse_value(&mut self) -> Result<Value> {
113        self.skip_ws()?;
114        let byte = self.peek().ok_or(Error::InvalidFormat)?;
115        match byte {
116            b'[' => self.parse_array(),
117            b'{' => self.parse_map(),
118            b'"' => self.parse_text_string(),
119            b'\'' => self.parse_single_quoted_bstr(),
120            b'<' => self.parse_embedded_bstr(),
121            b'-' => {
122                if self.consume(b"-Infinity") {
123                    Ok(Value::float(f64::NEG_INFINITY))
124                } else {
125                    self.parse_number_or_tag()
126                }
127            }
128            b'0'..=b'9' => self.parse_number_or_tag(),
129            b'N' if self.consume(b"NaN") => Ok(Value::Float(Float(Inner::F16(0x7e00)))),
130            b'I' if self.consume(b"Infinity") => Ok(Value::float(f64::INFINITY)),
131            b't' if self.consume(b"true") => Ok(Value::from(true)),
132            b'n' if self.consume(b"null") => Ok(Value::null()),
133            b's' if self.consume(b"simple(") => self.parse_simple_tail(),
134            b'h' if self.peek_at(1) == Some(b'\'') => {
135                self.pos += 2;
136                self.parse_hex_bstr_tail()
137            }
138            b'b' if self.consume(b"b64'") => self.parse_b64_bstr_tail(),
139            b'f' => {
140                if self.consume(b"false") {
141                    Ok(Value::from(false))
142                } else if self.consume(b"float'") {
143                    self.parse_float_hex_tail()
144                } else {
145                    Err(Error::InvalidFormat)
146                }
147            }
148            _ => Err(Error::InvalidFormat),
149        }
150    }
151
152    fn parse_array(&mut self) -> Result<Value> {
153        self.expect(b'[')?;
154        self.skip_ws()?;
155        let mut items = Vec::new();
156        if self.eat(b']') {
157            Ok(Value::Array(items))
158        } else {
159            loop {
160                items.push(self.parse_value()?);
161                self.skip_ws()?;
162                if self.eat(b',') {
163                    continue;
164                } else if self.eat(b']') {
165                    break Ok(Value::Array(items));
166                } else {
167                    break Err(Error::InvalidFormat);
168                }
169            }
170        }
171    }
172
173    fn parse_map(&mut self) -> Result<Value> {
174        self.expect(b'{')?;
175        self.skip_ws()?;
176        let mut map: BTreeMap<Value, Value> = BTreeMap::new();
177        if self.eat(b'}') {
178            Ok(Value::Map(map))
179        } else {
180            loop {
181                let key = self.parse_value()?;
182                self.skip_ws()?;
183                self.expect(b':')?;
184                let value = self.parse_value()?;
185                if map.insert(key, value).is_some() {
186                    return Err(Error::NonDeterministic);
187                }
188                self.skip_ws()?;
189                if self.eat(b',') {
190                    continue;
191                } else if self.eat(b'}') {
192                    break Ok(Value::Map(map));
193                } else {
194                    break Err(Error::InvalidFormat);
195                }
196            }
197        }
198    }
199
200    fn parse_number_or_tag(&mut self) -> Result<Value> {
201        let negative = self.eat(b'-');
202        let value = if self.peek() == Some(b'0') {
203            match self.peek_at(1) {
204                Some(b'b' | b'B') => {
205                    self.pos += 2;
206                    self.parse_integer_base(negative, 2)?
207                }
208                Some(b'o' | b'O') => {
209                    self.pos += 2;
210                    self.parse_integer_base(negative, 8)?
211                }
212                Some(b'x' | b'X') => {
213                    self.pos += 2;
214                    self.parse_integer_base(negative, 16)?
215                }
216                _ => self.parse_decimal(negative)?,
217            }
218        } else {
219            self.parse_decimal(negative)?
220        };
221
222        self.skip_ws()?;
223        if self.peek() == Some(b'(') {
224            self.pos += 1;
225            let Value::Unsigned(tag_number) = value else {
226                return Err(Error::InvalidFormat);
227            };
228            let inner = self.parse_value()?;
229            self.skip_ws()?;
230            self.expect(b')')?;
231            Ok(Value::tag(tag_number, inner))
232        } else {
233            Ok(value)
234        }
235    }
236
237    fn parse_decimal(&mut self, negative: bool) -> Result<Value> {
238        let start = self.pos;
239        while let Some(b) = self.peek()
240            && b.is_ascii_digit()
241        {
242            self.pos += 1;
243        }
244        if self.pos == start {
245            return Err(Error::InvalidFormat);
246        }
247        let int_end = self.pos;
248        if self.peek() == Some(b'.') {
249            self.pos += 1;
250            let frac_start = self.pos;
251            while let Some(b) = self.peek()
252                && b.is_ascii_digit()
253            {
254                self.pos += 1;
255            }
256            if self.pos == frac_start {
257                return Err(Error::InvalidFormat);
258            }
259            if matches!(self.peek(), Some(b'e' | b'E')) {
260                self.pos += 1;
261                if matches!(self.peek(), Some(b'+' | b'-')) {
262                    self.pos += 1;
263                }
264                let exp_start = self.pos;
265                while let Some(b) = self.peek()
266                    && b.is_ascii_digit()
267                {
268                    self.pos += 1;
269                }
270                if self.pos == exp_start {
271                    return Err(Error::InvalidFormat);
272                }
273            }
274            let text = std::str::from_utf8(&self.src[start..self.pos]).unwrap();
275            let mut parsed: f64 = text.parse().map_err(|_| Error::InvalidFormat)?;
276            if negative {
277                parsed = -parsed;
278            }
279            return Ok(Value::float(parsed));
280        }
281
282        let digits = &self.src[start..int_end];
283        let bytes = digits_to_be_bytes(digits, 10)?;
284        be_bytes_to_value(&bytes, negative)
285    }
286
287    fn parse_integer_base(&mut self, negative: bool, base: u32) -> Result<Value> {
288        let mut digits: Vec<u8> = Vec::new();
289        let mut last_was_digit = false;
290        while let Some(b) = self.peek() {
291            if b == b'_' {
292                if !last_was_digit {
293                    return Err(Error::InvalidFormat);
294                } else {
295                    self.pos += 1;
296                    last_was_digit = false;
297                    continue;
298                }
299            } else {
300                let is_valid = match base {
301                    2 => matches!(b, b'0' | b'1'),
302                    8 => matches!(b, b'0'..=b'7'),
303                    16 => b.is_ascii_hexdigit(),
304                    _ => unreachable!(),
305                };
306                if !is_valid {
307                    break;
308                }
309                digits.push(b);
310                last_was_digit = true;
311                self.pos += 1;
312            }
313        }
314        if digits.is_empty() || !last_was_digit {
315            Err(Error::InvalidFormat)
316        } else {
317            let bytes = digits_to_be_bytes(&digits, base)?;
318            be_bytes_to_value(&bytes, negative)
319        }
320    }
321
322    fn parse_simple_tail(&mut self) -> Result<Value> {
323        self.skip_ws()?;
324        let start = self.pos;
325        while let Some(b) = self.peek()
326            && b.is_ascii_digit()
327        {
328            self.pos += 1;
329        }
330        if self.pos == start {
331            Err(Error::InvalidFormat)
332        } else {
333            let text = std::str::from_utf8(&self.src[start..self.pos]).unwrap();
334            let number: u8 = text.parse().map_err(|_| Error::InvalidFormat)?;
335            self.skip_ws()?;
336            self.expect(b')')?;
337            Ok(Value::from(SimpleValue::try_from(number)?))
338        }
339    }
340
341    fn parse_float_hex_tail(&mut self) -> Result<Value> {
342        let start = self.pos;
343        while let Some(b) = self.peek()
344            && b != b'\''
345        {
346            self.pos += 1;
347        }
348        let hex = &self.src[start..self.pos];
349        self.expect(b'\'')?;
350        let mut bits: u64 = 0;
351        for &byte in hex {
352            let digit = u8_from_hex_digit(byte)? as u64;
353            bits = (bits << 4) | digit;
354        }
355        match hex.len() {
356            4 => Ok(Value::Float(Float::from_u16(bits as u16))),
357            8 => Ok(Value::Float(Float::from_u32(bits as u32)?)),
358            16 => Ok(Value::Float(Float::from_u64(bits)?)),
359            _ => Err(Error::InvalidFormat),
360        }
361    }
362
363    fn parse_hex_bstr_tail(&mut self) -> Result<Value> {
364        let mut bytes = Vec::new();
365        let mut half: Option<u8> = None;
366        loop {
367            let byte = self.advance()?;
368            match byte {
369                b'\'' => {
370                    if half.is_some() {
371                        return Err(Error::InvalidFormat);
372                    } else {
373                        return Ok(Value::ByteString(bytes));
374                    }
375                }
376                b' ' | b'\t' | b'\r' | b'\n' => continue,
377                _ => {
378                    let digit = u8_from_hex_digit(byte)?;
379                    match half.take() {
380                        None => half = Some(digit),
381                        Some(high) => bytes.push((high << 4) | digit),
382                    }
383                }
384            }
385        }
386    }
387
388    fn parse_b64_bstr_tail(&mut self) -> Result<Value> {
389        let mut data: Vec<u8> = Vec::new();
390        loop {
391            let byte = self.advance()?;
392            match byte {
393                b'\'' => return Ok(Value::ByteString(decode_base64(&data)?)),
394                b' ' | b'\t' | b'\r' | b'\n' => continue,
395                _ => data.push(byte),
396            }
397        }
398    }
399
400    fn parse_text_string(&mut self) -> Result<Value> {
401        self.expect(b'"')?;
402        let mut out = String::new();
403        loop {
404            let start = self.pos;
405            while let Some(b) = self.peek()
406                && !matches!(b, b'"' | b'\\' | b'\r')
407            {
408                self.pos += 1;
409            }
410            let slice = std::str::from_utf8(&self.src[start..self.pos]).map_err(|_| Error::InvalidUtf8)?;
411            out.push_str(slice);
412            let byte = self.peek().ok_or(Error::InvalidFormat)?;
413            match byte {
414                b'"' => {
415                    self.pos += 1;
416                    return Ok(Value::from(out));
417                }
418                b'\r' => {
419                    self.pos += 1;
420                    self.eat(b'\n');
421                    out.push('\n');
422                }
423                b'\\' => {
424                    self.pos += 1;
425                    if !self.read_escape_into_string(&mut out)? {
426                        // line continuation, no output
427                    }
428                }
429                _ => unreachable!(),
430            }
431        }
432    }
433
434    fn parse_single_quoted_bstr(&mut self) -> Result<Value> {
435        self.expect(b'\'')?;
436        let mut out: Vec<u8> = Vec::new();
437        loop {
438            let start = self.pos;
439            while let Some(b) = self.peek()
440                && !matches!(b, b'\'' | b'\\' | b'\r')
441            {
442                self.pos += 1;
443            }
444            out.extend_from_slice(&self.src[start..self.pos]);
445            let byte = self.peek().ok_or(Error::InvalidFormat)?;
446            match byte {
447                b'\'' => {
448                    self.pos += 1;
449                    return Ok(Value::ByteString(out));
450                }
451                b'\r' => {
452                    self.pos += 1;
453                    self.eat(b'\n');
454                    out.push(b'\n');
455                }
456                b'\\' => {
457                    self.pos += 1;
458                    let mut tmp = String::new();
459                    if self.read_escape_into_string(&mut tmp)? {
460                        out.extend_from_slice(tmp.as_bytes());
461                    }
462                }
463                _ => unreachable!(),
464            }
465        }
466    }
467
468    /// Consume an escape sequence (after the leading backslash) and append
469    /// its decoded value to `out`. Returns `false` if the escape was a
470    /// line continuation that produces no output.
471    fn read_escape_into_string(&mut self, out: &mut String) -> Result<bool> {
472        let byte = self.advance()?;
473        let ch = match byte {
474            b'\'' => '\'',
475            b'"' => '"',
476            b'\\' => '\\',
477            b'b' => '\u{08}',
478            b'f' => '\u{0C}',
479            b'n' => '\n',
480            b'r' => '\r',
481            b't' => '\t',
482            b'u' => self.read_u_escape()?,
483            b'\n' => return Ok(false),
484            b'\r' => {
485                self.eat(b'\n');
486                return Ok(false);
487            }
488            _ => return Err(Error::InvalidFormat),
489        };
490        out.push(ch);
491        Ok(true)
492    }
493
494    fn read_u_escape(&mut self) -> Result<char> {
495        let high = self.read_4_hex()?;
496        if (0xD800..=0xDBFF).contains(&high) {
497            if !self.consume(b"\\u") {
498                return Err(Error::InvalidFormat);
499            }
500            let low = self.read_4_hex()?;
501            if !(0xDC00..=0xDFFF).contains(&low) {
502                return Err(Error::InvalidFormat);
503            }
504            let code = 0x10000 + ((high - 0xD800) << 10) + (low - 0xDC00);
505            char::from_u32(code).ok_or(Error::InvalidFormat)
506        } else if (0xDC00..=0xDFFF).contains(&high) {
507            Err(Error::InvalidFormat)
508        } else {
509            char::from_u32(high).ok_or(Error::InvalidFormat)
510        }
511    }
512
513    fn read_4_hex(&mut self) -> Result<u32> {
514        let mut code: u32 = 0;
515        for _ in 0..4 {
516            let byte = self.advance()?;
517            let digit = u8_from_hex_digit(byte)? as u32;
518            code = (code << 4) | digit;
519        }
520        Ok(code)
521    }
522
523    fn parse_embedded_bstr(&mut self) -> Result<Value> {
524        self.expect(b'<')?;
525        self.expect(b'<')?;
526        let mut buf = Vec::new();
527        self.skip_ws()?;
528        if self.consume(b">>") {
529            Ok(Value::ByteString(buf))
530        } else {
531            loop {
532                let value = self.parse_value()?;
533                buf.extend(value.encode());
534                self.skip_ws()?;
535                if self.eat(b',') {
536                    continue;
537                } else if self.consume(b">>") {
538                    return Ok(Value::ByteString(buf));
539                } else {
540                    return Err(Error::InvalidFormat);
541                }
542            }
543        }
544    }
545}
546
547fn decode_base64(input: &[u8]) -> Result<Vec<u8>> {
548    let mut data = input;
549    while let Some(stripped) = data.strip_suffix(b"=") {
550        data = stripped;
551    }
552
553    if data.len() % 4 == 1 {
554        return Err(Error::InvalidFormat);
555    }
556
557    let mut out = Vec::with_capacity(data.len() * 3 / 4);
558    let mut buf: u32 = 0;
559    let mut bits: u32 = 0;
560
561    for &byte in data {
562        let value = u8_from_base64_digit(byte)? as u32;
563        buf = (buf << 6) | value;
564        bits += 6;
565        if bits >= 8 {
566            bits -= 8;
567            out.push((buf >> bits) as u8);
568            buf &= (1 << bits) - 1;
569        }
570    }
571
572    if buf == 0 { Ok(out) } else { Err(Error::InvalidFormat) }
573}
574
575/// Convert ASCII digits in the given base to a big-endian byte representation
576/// of the magnitude.
577fn digits_to_be_bytes(digits: &[u8], base: u32) -> Result<Vec<u8>> {
578    let mut result = vec![0u8];
579
580    for &digit in digits {
581        let value = match digit {
582            b'0'..=b'9' => (digit - b'0') as u32,
583            b'a'..=b'f' => (digit - b'a' + 10) as u32,
584            b'A'..=b'F' => (digit - b'A' + 10) as u32,
585            _ => return Err(Error::InvalidFormat),
586        };
587
588        if value >= base {
589            return Err(Error::InvalidFormat);
590        }
591
592        let mut carry = value;
593
594        for byte in result.iter_mut().rev() {
595            let product = (*byte as u32) * base + carry;
596            *byte = product as u8;
597            carry = product >> 8;
598        }
599
600        while carry > 0 {
601            result.insert(0, carry as u8);
602            carry >>= 8;
603        }
604    }
605
606    Ok(result)
607}
608
609/// Construct a CBOR integer value from a big-endian magnitude and a sign.
610fn be_bytes_to_value(bytes: &[u8], negative: bool) -> Result<Value> {
611    let bytes = trim_leading_zeros(bytes);
612
613    if bytes.is_empty() {
614        Ok(Value::Unsigned(0))
615    } else if !negative {
616        if bytes.len() <= 8 {
617            Ok(Value::Unsigned(u64_from_slice(bytes)?))
618        } else {
619            Ok(Value::tag(tag::POS_BIG_INT, Value::from(bytes)))
620        }
621    } else {
622        let mut sub = bytes.to_vec();
623        let mut idx = sub.len();
624        loop {
625            idx -= 1;
626            if sub[idx] > 0 {
627                sub[idx] -= 1;
628                break;
629            } else {
630                sub[idx] = 0xff;
631            }
632        }
633        let sub = trim_leading_zeros(&sub);
634        if sub.len() <= 8 {
635            Ok(Value::Negative(u64_from_slice(sub)?))
636        } else {
637            Ok(Value::tag(tag::NEG_BIG_INT, Value::from(sub)))
638        }
639    }
640}