dcbor_parse/
parse.rs

1use base64::Engine as _;
2use bc_ur::prelude::*;
3use known_values::KnownValue;
4use logos::{ Lexer, Logos, Span };
5use thiserror::Error;
6
/// Errors produced while lexing and parsing dCBOR diagnostic notation.
///
/// Variants that carry a [`Span`] record the byte range of the source text the
/// error refers to; [`Error::full_message`] uses that range to render an
/// annotated excerpt of the input.
#[derive(Debug, Error, Clone, PartialEq)]
#[rustfmt::skip]
pub enum Error {
    /// The input ended before any token was read.
    #[error("Empty input")]
    EmptyInput,
    /// The lexer ran out of tokens while the parser still expected one.
    #[error("Unexpected end of input")]
    UnexpectedEndOfInput,
    /// A complete item was parsed but more tokens follow it.
    #[error("Extra data at end of input")]
    ExtraData(Span),
    /// A well-formed token appeared where it is not allowed.
    #[error("Unexpected token {0:?}")]
    UnexpectedToken(Box<Token>, Span),
    /// The lexer could not match the input; also the `Default` value.
    #[error("Unrecognized token")]
    UnrecognizedToken(Span),
    /// An array or map element was followed by something other than a comma.
    #[error("Expected comma")]
    ExpectedComma(Span),
    /// A map key was not followed by a colon.
    #[error("Expected colon")]
    ExpectedColon(Span),
    /// A tagged item was not closed with `)`.
    #[error("Unmatched parentheses")]
    UnmatchedParentheses(Span),
    /// The input ended inside a map before its closing `}`.
    #[error("Unmatched braces")]
    UnmatchedBraces(Span),
    /// Reported by the map parser when `}` appears where the value after a
    /// `:` was expected.
    #[error("Expected map key")]
    ExpectedMapKey(Span),
    /// The digits before `(` could not be parsed as a tag value.
    #[error("Invalid tag value '{0}'")]
    InvalidTagValue(String, Span),
    /// The identifier before `(` has no registered tag.
    #[error("Unknown tag name '{0}'")]
    UnknownTagName(String, Span),
    /// An `h'…'` literal has an odd number of digits or failed to decode.
    #[error("Invalid hex string")]
    InvalidHexString(Span),
    /// A `b64'…'` literal failed to decode.
    #[error("Invalid base64 string")]
    InvalidBase64String(Span),
    /// The type component of a `ur:` literal has no registered tag.
    #[error("Unknown UR type '{0}'")]
    UnknownUrType(String, Span),
    /// A `ur:` literal failed to decode; the string is the decoder's message.
    #[error("Invalid UR '{0}'")]
    InvalidUr(String, Span),
    /// A single-quoted integer could not be parsed as a known value number.
    #[error("Invalid known value '{0}'")]
    InvalidKnownValue(String, Span),
    /// A single-quoted name is not a registered known value.
    #[error("Unknown known value name '{0}'")]
    UnknownKnownValueName(String, Span),
}
47
48impl Error {
49    pub fn is_default(&self) -> bool {
50        matches!(self, Error::UnrecognizedToken(_))
51    }
52
53    fn format_message(message: &dyn ToString, source: &str, range: &Span) -> String {
54        let message = message.to_string();
55        let start = range.start;
56        let end = range.end;
57        // Walk through the bytes up to `start` to find line number and line start offset
58        let mut line_number = 1;
59        let mut line_start = 0;
60        for (idx, ch) in source.char_indices() {
61            if idx >= start {
62                break;
63            }
64            if ch == '\n' {
65                line_number += 1;
66                line_start = idx + 1;
67            }
68        }
69        // Grab the exact line text (or empty if out of bounds)
70        let line = source
71            .lines()
72            .nth(line_number - 1)
73            .unwrap_or("");
74        // Column is byte-offset into that line
75        let column = start.saturating_sub(line_start);
76        // Underline at least one caret, even for zero-width spans
77        let underline_len = end.saturating_sub(start).max(1);
78        let caret = " ".repeat(column) + &"^".repeat(underline_len);
79        format!("line {line_number}: {message}\n{line}\n{caret}")
80    }
81
82    #[rustfmt::skip]
83    pub fn full_message(&self, source: &str) -> String {
84        match self {
85            Error::EmptyInput => Self::format_message(self, source, &Span::default()),
86            Error::UnexpectedEndOfInput => Self::format_message(self, source, &(source.len()..source.len())),
87            Error::ExtraData(range) => Self::format_message(self, source, range),
88            Error::UnexpectedToken(_, range) => Self::format_message(self, source, range),
89            Error::UnrecognizedToken(range) => Self::format_message(self, source, range),
90            Error::UnknownUrType(_, range) => Self::format_message(self, source, range),
91            Error::UnmatchedParentheses(range) => Self::format_message(self, source, range),
92            Error::ExpectedComma(range) => Self::format_message(self, source, range),
93            Error::ExpectedColon(range) => Self::format_message(self, source, range),
94            Error::ExpectedMapKey(range) => Self::format_message(self, source, range),
95            Error::UnmatchedBraces(range) => Self::format_message(self, source, range),
96            Error::UnknownTagName(_, range) => Self::format_message(self, source, range),
97            Error::InvalidHexString(range) => Self::format_message(self, source, range),
98            Error::InvalidBase64String(range) => Self::format_message(self, source, range),
99            Error::InvalidTagValue(_, range) => Self::format_message(self, source, range),
100            Error::InvalidUr(_, range) => Self::format_message(self, source, range),
101            Error::InvalidKnownValue(_, range) => Self::format_message(self, source, range),
102            Error::UnknownKnownValueName(_, range) => Self::format_message(self, source, range),
103        }
104    }
105}
106
107impl Default for Error {
108    fn default() -> Self {
109        Error::UnrecognizedToken(Span::default())
110    }
111}
112
/// Result alias used throughout this module, with [`Error`] as the error type.
pub type Result<T> = std::result::Result<T, Error>;
114
115/// Parses a dCBOR item from a string input.
116///
117/// This function takes a string slice containing a dCBOR diagnostic notation
118/// encoded value and attempts to parse it into a `CBOR` object. If the input
119/// contains extra tokens after a valid item, an error is returned.
120///
121/// # Arguments
122///
123/// * `src` - A string slice containing the dCBOR-encoded data.
124///
125/// # Returns
126///
127/// * `Ok(CBOR)` if parsing is successful and the input contains exactly one
128///   valid dCBOR item, which itself might be an atomic value like a number or
129///   string, or a complex value like an array or map.
130/// * `Err(Error)` if parsing fails or if extra tokens are found after the item.
131///
132/// # Errors
133///
134/// Returns an error if the input is invalid, contains extra tokens, or if any
135/// token cannot be parsed as expected.
136///
137/// # Example
138///
139/// ```rust
140/// # use dcbor_parse::parse_dcbor_item;
141/// let cbor = parse_dcbor_item("[1, 2, 3]").unwrap();
142/// assert_eq!(cbor.diagnostic(), "[1, 2, 3]");
143/// ```
144pub fn parse_dcbor_item(src: &str) -> Result<CBOR> {
145    let mut lexer = Token::lexer(src);
146    let first_token = expect_token(&mut lexer);
147    match first_token {
148        Ok(token) => {
149            parse_item_token(&token, &mut lexer).and_then(|cbor| {
150                if lexer.next().is_some() { Err(Error::ExtraData(lexer.span())) } else { Ok(cbor) }
151            })
152        }
153        Err(e) => {
154            if e == Error::UnexpectedEndOfInput {
155                return Err(Error::EmptyInput);
156            }
157            return Err(e);
158        }
159    }
160}
161
162//
163// === Private Functions ===
164//
165
166fn parse_item(lexer: &mut Lexer<'_, Token>) -> Result<CBOR> {
167    let token = expect_token(lexer)?;
168    parse_item_token(&token, lexer)
169}
170
171fn expect_token(lexer: &mut Lexer<'_, Token>) -> Result<Token> {
172    let span = lexer.span();
173    match lexer.next() {
174        Some(token_or_err) => {
175            match token_or_err {
176                Ok(token) => { Ok(token) }
177                Err(e) => {
178                    if e.is_default() { Err(Error::UnrecognizedToken(span)) } else { Err(e) }
179                }
180            }
181        }
182        None => Err(Error::UnexpectedEndOfInput),
183    }
184}
185
186fn parse_item_token(token: &Token, lexer: &mut Lexer<'_, Token>) -> Result<CBOR> {
187    // Handle embedded lexing errors in token payloads
188    if let Token::ByteStringHex(Err(e)) = token {
189        return Err(e.clone());
190    }
191    if let Token::ByteStringBase64(Err(e)) = token {
192        return Err(e.clone());
193    }
194    if let Token::TagValue(Err(e)) = token {
195        return Err(e.clone());
196    }
197    if let Token::UR(Err(e)) = token {
198        return Err(e.clone());
199    }
200    if let Token::KnownValueNumber(Err(e)) = token {
201        return Err(e.clone());
202    }
203
204    match token {
205        Token::Bool(b) => Ok((*b).into()),
206        Token::Null => Ok(CBOR::null()),
207        Token::ByteStringHex(Ok(bytes)) => Ok(CBOR::to_byte_string(bytes)),
208        Token::ByteStringBase64(Ok(bytes)) => Ok(CBOR::to_byte_string(bytes)),
209        Token::Number(num) => Ok((*num).into()),
210        Token::NaN => Ok(f64::NAN.into()),
211        Token::Infinity => Ok(f64::INFINITY.into()),
212        Token::NegInfinity => Ok(f64::NEG_INFINITY.into()),
213        Token::String(s) => parse_string(s, lexer.span()),
214        Token::UR(Ok(ur)) => parse_ur(ur, lexer.span()),
215        Token::TagValue(Ok(tag_value)) => parse_number_tag(*tag_value, lexer),
216        Token::TagName(name) => parse_name_tag(&name, lexer),
217        Token::KnownValueNumber(Ok(value)) => Ok(KnownValue::new(*value).into()),
218        Token::KnownValueName(name) => {
219            if let Some(known_value) = known_value_for_name(&name) {
220                Ok(known_value.into())
221            } else {
222                let span = lexer.span().start + 1..lexer.span().end - 1;
223                Err(Error::UnknownKnownValueName(name.clone(), span))
224            }
225        }
226        Token::Unit => Ok(KnownValue::new(0).into()),
227        Token::BracketOpen => parse_array(lexer),
228        Token::BraceOpen => parse_map(lexer),
229        _ => Err(Error::UnexpectedToken(Box::new(token.clone()), lexer.span())),
230    }
231}
232
233fn parse_string(s: &str, span: Span) -> Result<CBOR> {
234    if s.starts_with('"') && s.ends_with('"') {
235        let s = &s[1..s.len() - 1];
236        Ok(s.into())
237    } else {
238        Err(Error::UnrecognizedToken(span))
239    }
240}
241
242fn tag_for_name(name: &str) -> Option<Tag> {
243    with_tags!(|tags: &TagsStore| tags.tag_for_name(name))
244}
245
246fn known_value_for_name(name: &str) -> Option<KnownValue> {
247    let binding = known_values::KNOWN_VALUES.get();
248    let known_values = binding.as_ref().unwrap();
249    known_values.known_value_named(name).cloned()
250}
251
252fn parse_ur(ur: &UR, span: Span) -> Result<CBOR> {
253    let ur_type = ur.ur_type_str();
254    if let Some(tag) = tag_for_name(ur_type) {
255        Ok(CBOR::to_tagged_value(tag, ur.cbor()))
256    } else {
257        Err(
258            Error::UnknownUrType(
259                ur_type.to_string(),
260                span.start + 3..span.start + 3 + ur_type.len()
261            )
262        )
263    }
264}
265
266fn parse_number_tag(tag_value: TagValue, lexer: &mut Lexer<'_, Token>) -> Result<CBOR> {
267    let item = parse_item(lexer)?;
268    match expect_token(lexer) {
269        Ok(Token::ParenthesisClose) => Ok(CBOR::to_tagged_value(tag_value, item)),
270        Ok(_) => Err(Error::UnmatchedParentheses(lexer.span())),
271        Err(e) => {
272            if e == Error::UnexpectedEndOfInput {
273                return Err(Error::UnmatchedParentheses(lexer.span()));
274            }
275            return Err(e);
276        }
277    }
278}
279
280fn parse_name_tag(name: &str, lexer: &mut Lexer<'_, Token>) -> Result<CBOR> {
281    let span = lexer.span().start..lexer.span().end - 1;
282    let item = parse_item(lexer)?;
283    match expect_token(lexer)? {
284        Token::ParenthesisClose => {
285            if let Some(tag) = tag_for_name(name) {
286                Ok(CBOR::to_tagged_value(tag, item))
287            } else {
288                Err(Error::UnknownTagName(name.to_string(), span))
289            }
290        }
291        _ => { Err(Error::UnmatchedParentheses(lexer.span())) }
292    }
293}
294
295fn parse_array(lexer: &mut Lexer<'_, Token>) -> Result<CBOR> {
296    let mut items = Vec::new();
297    let mut awaits_comma = false;
298    let mut awaits_item = false;
299
300    loop {
301        match expect_token(lexer)? {
302            Token::Bool(b) if !awaits_comma => {
303                items.push(b.into());
304                awaits_item = false;
305            }
306            Token::Null if !awaits_comma => {
307                items.push(CBOR::null());
308                awaits_item = false;
309            }
310            Token::ByteStringHex(Ok(bytes)) if !awaits_comma => {
311                items.push(CBOR::to_byte_string(bytes));
312                awaits_item = false;
313            }
314            Token::ByteStringBase64(Ok(bytes)) if !awaits_comma => {
315                items.push(CBOR::to_byte_string(bytes));
316                awaits_item = false;
317            }
318            Token::Number(num) if !awaits_comma => {
319                items.push(num.into());
320                awaits_item = false;
321            }
322            Token::NaN if !awaits_comma => {
323                items.push(f64::NAN.into());
324                awaits_item = false;
325            }
326            Token::Infinity if !awaits_comma => {
327                items.push(f64::INFINITY.into());
328                awaits_item = false;
329            }
330            Token::NegInfinity if !awaits_comma => {
331                items.push(f64::NEG_INFINITY.into());
332                awaits_item = false;
333            }
334            Token::String(s) if !awaits_comma => {
335                items.push(parse_string(&s, lexer.span())?);
336                awaits_item = false;
337            }
338            Token::UR(Ok(ur)) if !awaits_comma => {
339                items.push(parse_ur(&ur, lexer.span())?);
340                awaits_item = false;
341            }
342            Token::TagValue(Ok(tag_value)) if !awaits_comma => {
343                items.push(parse_number_tag(tag_value, lexer)?);
344                awaits_item = false;
345            }
346            Token::TagName(name) if !awaits_comma => {
347                items.push(parse_name_tag(&name, lexer)?);
348                awaits_item = false;
349            }
350            Token::KnownValueNumber(Ok(value)) if !awaits_comma => {
351                items.push(KnownValue::new(value).into());
352                awaits_item = false;
353            }
354            Token::KnownValueName(name) if !awaits_comma => {
355                if let Some(known_value) = known_value_for_name(&name) {
356                    items.push(known_value.into());
357                } else {
358                    return Err(Error::UnknownKnownValueName(name, lexer.span()));
359                }
360                awaits_item = false;
361            }
362            Token::BracketOpen if !awaits_comma => {
363                items.push(parse_array(lexer)?);
364                awaits_item = false;
365            }
366            Token::BraceOpen if !awaits_comma => {
367                items.push(parse_map(lexer)?);
368                awaits_item = false;
369            }
370            Token::Comma if awaits_comma => {
371                awaits_item = true;
372            }
373            Token::BracketClose if !awaits_item => {
374                return Ok(items.into());
375            }
376            token => {
377                if awaits_comma {
378                    return Err(Error::ExpectedComma(lexer.span()));
379                }
380                return Err(Error::UnexpectedToken(Box::new(token), lexer.span()));
381            }
382        }
383        awaits_comma = !awaits_item;
384    }
385}
386
387fn parse_map(lexer: &mut Lexer<'_, Token>) -> Result<CBOR> {
388    let mut map = Map::new();
389    let mut awaits_comma = false;
390    let mut awaits_key = false;
391
392    loop {
393        let token = match expect_token(lexer) {
394            Ok(tok) => tok,
395            Err(e) if e == Error::UnexpectedEndOfInput => {
396                return Err(Error::UnmatchedBraces(lexer.span()));
397            }
398            Err(e) => {
399                return Err(e);
400            }
401        };
402        match token {
403            Token::BraceClose if !awaits_key => {
404                return Ok(map.into());
405            }
406            Token::Comma if awaits_comma => {
407                awaits_key = true;
408            }
409            _ => {
410                if awaits_comma {
411                    return Err(Error::ExpectedComma(lexer.span()));
412                }
413                let key = parse_item_token(&token, lexer)?;
414                if let Some(Token::Colon) = expect_token(lexer).ok() {
415                    let value = match parse_item(lexer) {
416                        Err(Error::UnexpectedToken(token, span)) if *token == Token::BraceClose => {
417                            return Err(Error::ExpectedMapKey(span));
418                        }
419                        other => other?,
420                    };
421                    map.insert(key, value);
422                    awaits_key = false;
423                } else {
424                    return Err(Error::ExpectedColon(lexer.span()));
425                }
426            }
427        }
428        awaits_comma = !awaits_key;
429    }
430}
431
/// Lexical tokens of dCBOR diagnostic notation, generated by `logos`.
///
/// The skip pattern discards whitespace, inline comments delimited by a pair
/// of `/` characters, and `#` comments running to the end of the line.
///
/// Several variants carry a `Result` payload: their lexer callbacks decode
/// the matched text and store either the decoded value or the decoding error
/// inside the token, leaving it to the parser to surface the error.
#[derive(Debug, Clone, Logos, PartialEq)]
#[rustfmt::skip]
#[logos(error = Error)]
#[logos(skip r"(?:[ \t\r\n\f]|/[^/]*/|#[^\n]*)+")]
pub enum Token {
    /// Boolean literal `false` or `true`.
    #[token("false", |_| false)]
    #[token("true", |_| true)]
    Bool(bool),

    /// Opening brace `{`, which starts a map.
    #[token("{")]
    BraceOpen,

    /// Closing brace `}`.
    #[token("}")]
    BraceClose,

    /// Opening bracket `[`, which starts an array.
    #[token("[")]
    BracketOpen,

    /// Closing bracket `]`.
    #[token("]")]
    BracketClose,

    /// A bare opening parenthesis `(` that is not part of a tag token.
    #[token("(")]
    ParenthesisOpen,

    /// Closing parenthesis `)`, which ends a tagged item.
    #[token(")")]
    ParenthesisClose,

    /// Colon `:` separating a map key from its value.
    #[token(":")]
    Colon,

    /// Comma `,` separating array elements or map entries.
    #[token(",")]
    Comma,

    /// The literal `null`.
    #[token("null")]
    Null,

    /// The literal `NaN`.
    #[token("NaN")]
    NaN,

    /// The literal `Infinity`.
    #[token("Infinity")]
    Infinity,

    /// The literal `-Infinity`.
    #[token("-Infinity")]
    NegInfinity,

    /// Binary string in hex format: `h'…'` with zero or more hex digits.
    ///
    /// The payload is the decoded bytes, or `Error::InvalidHexString` when
    /// the digit count is odd or decoding fails.
    #[regex(r"h'[0-9a-fA-F]*'", |lex| {
        let hex = lex.slice();
        let raw_hex = hex[2..hex.len() - 1].as_bytes();
        if raw_hex.len() % 2 != 0 {
            return Err(Error::InvalidHexString(lex.span()));
        }
        hex::decode(raw_hex)
            .map_err(|_|
                Error::InvalidHexString(lex.span())
            )
    })]
    ByteStringHex(Result<Vec<u8>>),

    /// Binary string in base64 format: `b64'…'` with at least two characters
    /// from the base64 alphabet (including `=`).
    ///
    /// The payload is the bytes decoded with the standard base64 engine, or
    /// `Error::InvalidBase64String` when decoding fails.
    #[regex(r"b64'([A-Za-z0-9+/=]{2,})'", |lex| {
        let base64 = lex.slice();
        let s = &base64[4..base64.len() - 1];
        base64::engine::general_purpose::STANDARD
        .decode(s)
        .map_err(|_| Error::InvalidBase64String(lex.span()))
    })]
    ByteStringBase64(Result<Vec<u8>>),

    /// JavaScript-style number: optional minus sign, integer part without
    /// leading zeros, optional fraction and optional exponent. Parsed as
    /// `f64`.
    #[regex(r"-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?", |lex|
        lex.slice().parse::<f64>().unwrap()
    )]
    Number(f64),

    /// JavaScript-style string in double quotes.
    ///
    /// The payload is the raw matched slice, including the surrounding quotes
    /// and any backslash escapes exactly as written.
    #[regex(r#""([^"\\\x00-\x1F]|\\(["\\bnfrt/]|u[a-fA-F0-9]{4}))*""#, |lex|
        lex.slice().to_owned()
    )]
    String(String),

    /// Integer followed immediately by an opening parenthesis.
    ///
    /// The payload is the parsed tag value, or `Error::InvalidTagValue` (with
    /// a span that excludes the `(`) when the digits do not parse.
    #[regex(r#"0\(|[1-9][0-9]*\("#, |lex|
        let span = (lex.span().start)..(lex.span().end - 1);
        let stripped = lex.slice().strip_suffix('(').unwrap();
        stripped.parse::<TagValue>().map_err(|_|
                Error::InvalidTagValue(stripped.to_string(), span)
            )
    )]
    TagValue(Result<TagValue>),

    /// Tag name followed immediately by an opening parenthesis.
    ///
    /// The payload is the name without the trailing `(`.
    #[regex(r#"[a-zA-Z_][a-zA-Z0-9_-]*\("#, |lex|
        // safe to drop the trailing '('
        lex.slice()[..lex.slice().len()-1].to_string()
    )]
    TagName(String),

    /// Integer (same regex as TagValue) enclosed in single quotes.
    ///
    /// The payload is the parsed number, or `Error::InvalidKnownValue` (with
    /// a span that excludes the quotes) when the digits do not parse.
    #[regex(r#"'0'|'[1-9][0-9]*'"#, |lex|
        let span = (lex.span().start + 1)..(lex.span().end - 1);
        let slice = lex.slice();
        let stripped = slice[1..slice.len() - 1].to_string();
        stripped.parse::<TagValue>().map_err(|_|
                Error::InvalidKnownValue(stripped, span)
            )
    )]
    KnownValueNumber(Result<u64>),

    /// Single-quoted empty string (i.e., `''`) (Unit) or Identifier (same regex
    /// as for tag names) enclosed in single quotes.
    ///
    /// The payload is the text between the quotes.
    #[regex(r#"''|'[a-zA-Z_][a-zA-Z0-9_-]*'"#, |lex|
        lex.slice()[1..lex.slice().len()-1].to_string()
    )]
    KnownValueName(String),

    /// The _unit_ known value `40000(0)`.
    ///
    /// Matched by the bare word `Unit`.
    #[token("Unit")]
    Unit,

    /// A `ur:<type>/<payload>` literal, where the payload is at least eight
    /// ASCII letters.
    ///
    /// The payload is the result of `UR::from_ur_string` on the matched text;
    /// failures are stored as `Error::InvalidUr` with the decoder's message.
    #[regex(r#"ur:([a-zA-Z0-9][a-zA-Z0-9-]*)/([a-zA-Z]{8,})"#, |lex|
        let s = lex.slice();
        let ur = UR::from_ur_string(s);
        ur.map_err(|e| {
            Error::InvalidUr(e.to_string(), lex.span())
        })
    )]
    UR(Result<UR>),
}