candid_parser 0.3.1

Candid is an interface description language (IDL) for interacting with canisters running on the Internet Computer. This crate contains the parser and the binding generator for Candid.
Documentation
use std::{cell::RefCell, collections::HashMap, mem, rc::Rc};

use lalrpop_util::ParseError;
use logos::{Lexer, Logos};

const LINE_COMMENT_PREFIX: &str = "//";

#[derive(Logos, Debug, Clone, PartialEq, Eq, Ord, PartialOrd)]
#[logos(skip r"[ \t\r\n]+")]
pub enum Token {
    #[regex(r"//[^\n]*")] // must start with `LINE_COMMENT_PREFIX`
    LineComment,
    #[token("/*")]
    StartComment,
    #[token("=")]
    Equals,
    #[token("(")]
    LParen,
    #[token(")")]
    RParen,
    #[token("{")]
    LBrace,
    #[token("}")]
    RBrace,
    #[token(";")]
    Semi,
    #[token(",")]
    Comma,
    #[token(".", priority = 10)]
    Dot,
    #[token(":")]
    Colon,
    #[token("->")]
    Arrow,
    #[token("null")]
    Null,
    #[token("vec")]
    Vec,
    #[token("record")]
    Record,
    #[token("variant")]
    Variant,
    #[token("func")]
    Func,
    #[token("service")]
    Service,
    #[token("oneway")]
    Oneway,
    #[token("query")]
    Query,
    #[token("composite_query")]
    CompositeQuery,
    #[token("blob")]
    Blob,
    #[token("type")]
    Type,
    #[token("import")]
    Import,
    #[token("opt")]
    Opt,
    #[token("==")]
    TestEqual,
    #[token("!=")]
    NotEqual,
    #[token("!:")]
    NotDecode,
    #[token("principal")]
    Principal,
    #[regex("[a-zA-Z_][a-zA-Z0-9_]*", |lex| lex.slice().to_string())]
    Id(String),
    #[token("\"")]
    StartString,
    // This token is not derived. Stores the unescaped string
    Text(String),
    #[regex("[+-]", |lex| lex.slice().chars().next())]
    Sign(char),
    #[regex("[0-9][_0-9]*", parse_number)]
    Decimal(String),
    #[regex("0[xX][0-9a-fA-F][_0-9a-fA-F]*", parse_number)]
    Hex(String),
    #[regex("[0-9]*\\.[0-9]*", parse_number)]
    #[regex("[0-9]+(\\.[0-9]*)?[eE][+-]?[0-9]+", parse_number)]
    Float(String),
    #[regex("true|false", |lex| lex.slice().parse().map_err(|_| ()))]
    Boolean(bool),
}

#[derive(Logos, Debug, Clone, PartialEq, Eq)]
enum Comment {
    #[token("*/")]
    End,
    #[token("/*")]
    Start,
}

#[derive(Logos, Debug, Clone, PartialEq, Eq)]
#[allow(clippy::enum_variant_names)]
enum Text {
    #[regex(r#"[^\\"]+"#)]
    Text,
    #[regex(r"\\.")]
    EscapeCharacter,
    #[regex(r"\\u\{[0-9a-fA-F][_0-9a-fA-F]*\}")]
    Codepoint,
    #[regex(r"\\[0-9a-fA-F][0-9a-fA-F]")]
    Byte,
    #[token("\"")]
    EndString,
}

impl std::fmt::Display for Token {
    fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(fmt, "{self:?}")
    }
}

fn parse_number(lex: &mut Lexer<Token>) -> String {
    let iter = lex.slice().chars().filter(|c| *c != '_');
    if lex.slice().starts_with("0x") {
        iter.skip(2).collect()
    } else {
        iter.collect()
    }
}

fn parse_doc_comment(lex: &Lexer<Token>) -> String {
    lex.slice()
        .trim_start_matches(LINE_COMMENT_PREFIX)
        .trim()
        .to_string()
}

pub type TriviaMap = Rc<RefCell<HashMap<usize, Vec<String>>>>;

pub struct Tokenizer<'input> {
    lex: Lexer<'input, Token>,
    comment_buffer: Vec<String>,
    trivia: Option<TriviaMap>,
    last_comment_end: Option<usize>,
    last_token_line_end: Option<usize>,
}

impl<'input> Tokenizer<'input> {
    pub fn new(input: &'input str) -> Self {
        let lex = Token::lexer(input);
        Tokenizer {
            lex,
            comment_buffer: vec![],
            trivia: None,
            last_comment_end: None,
            last_token_line_end: None,
        }
    }

    pub fn new_with_trivia(input: &'input str, trivia: TriviaMap) -> Self {
        let lex = Token::lexer(input);
        Tokenizer {
            lex,
            comment_buffer: vec![],
            trivia: Some(trivia),
            last_comment_end: None,
            last_token_line_end: None,
        }
    }

    /// Count newlines between two positions in the source
    fn count_newlines_between(&self, start: usize, end: usize) -> usize {
        let source = self.lex.source();
        if start < end && end <= source.len() {
            source[start..end].chars().filter(|&c| c == '\n').count()
        } else {
            0
        }
    }

    /// Check if there's a blank line (or more) between the end of the last comment
    /// and the start of the current token
    fn has_blank_line_before_token(&self, token_start: usize) -> bool {
        self.last_comment_end
            .map(|end| self.count_newlines_between(end, token_start) >= 2)
            .unwrap_or(false)
    }

    /// Check if the comment is on the same line as the previous token (inline comment)
    fn is_inline_comment(&self, comment_start: usize) -> bool {
        self.last_token_line_end
            .map(|line_end| comment_start <= line_end)
            .unwrap_or(false)
    }

    /// Find the end of the line containing the given position
    fn find_line_end(&self, pos: usize) -> usize {
        let source = self.lex.source();
        source[pos..]
            .find('\n')
            .map(|offset| pos + offset)
            .unwrap_or(source.len())
    }
}

pub type Span = std::ops::Range<usize>;

#[derive(Clone, Debug, Eq, PartialEq)]
pub struct LexicalError {
    pub err: String,
    pub span: Span,
}
impl std::fmt::Display for LexicalError {
    fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        if self.span.start == 0 && self.span.end == 0 {
            write!(fmt, "{}", self.err)
        } else {
            write!(fmt, "{} at {:?}", self.err, self.span)
        }
    }
}
impl LexicalError {
    fn new<E: ToString>(err: E, span: Span) -> Self {
        LexicalError {
            err: err.to_string(),
            span,
        }
    }
}

pub(crate) type ParserError = ParseError<usize, Token, LexicalError>;
pub fn error<E: ToString>(err: E) -> ParserError {
    ParseError::User {
        error: LexicalError::new(err, 0..0),
    }
}
pub fn error2<E: ToString>(err: E, span: Span) -> ParserError {
    ParseError::User {
        error: LexicalError::new(err, span),
    }
}

impl Iterator for Tokenizer<'_> {
    type Item = Result<(usize, Token, usize), LexicalError>;
    fn next(&mut self) -> Option<Self::Item> {
        let token = self.lex.next()?;
        let span = self.lex.span();
        match token {
            Err(_) => {
                let err = format!("Unknown token {}", self.lex.slice());
                Some(Err(LexicalError::new(err, span)))
            }
            Ok(Token::LineComment) => {
                if self.trivia.is_some() && !self.is_inline_comment(span.start) {
                    if self.has_blank_line_before_token(span.start) {
                        self.comment_buffer.clear();
                    }
                    self.comment_buffer.push(parse_doc_comment(&self.lex));
                    self.last_comment_end = Some(span.end);
                }
                self.next()
            }
            Ok(Token::StartComment) => {
                let mut lex = self.lex.to_owned().morph::<Comment>();
                let mut nesting = 1;
                loop {
                    match lex.next() {
                        Some(Err(_)) => continue,
                        Some(Ok(Comment::End)) => {
                            nesting -= 1;
                            if nesting == 0 {
                                break;
                            }
                        }
                        Some(Ok(Comment::Start)) => nesting += 1,
                        None => {
                            return Some(Err(LexicalError::new(
                                "Unclosed comment",
                                span.start..lex.span().end,
                            )))
                        }
                    }
                }
                self.lex = lex.morph::<Token>();
                if self.trivia.is_some() {
                    // Update last_comment_end to skip over the block comment.
                    // This prevents block comments from breaking doc comment continuity.
                    self.last_comment_end = Some(self.lex.span().start);
                }
                self.next()
            }
            Ok(Token::StartString) => {
                let mut result = String::new();
                let mut lex = self.lex.to_owned().morph::<Text>();
                loop {
                    use self::Text::*;
                    match lex.next() {
                        Some(Ok(Text)) => result += lex.slice(),
                        Some(Ok(EscapeCharacter)) => match lex.slice().chars().nth(1).unwrap() {
                            'n' => result.push('\n'),
                            'r' => result.push('\r'),
                            't' => result.push('\t'),
                            '\\' => result.push('\\'),
                            '"' => result.push('"'),
                            '\'' => result.push('\''),
                            c => {
                                return Some(Err(LexicalError::new(
                                    format!("Unknown escape character {c}"),
                                    lex.span(),
                                )))
                            }
                        },
                        Some(Ok(Codepoint)) => {
                            let slice = lex.slice();
                            let hex = slice[3..slice.len() - 1].replace('_', "");
                            match u32::from_str_radix(&hex, 16)
                                .map_err(|_| {
                                    LexicalError::new("Not a valid hex escape", lex.span())
                                })
                                .and_then(|c| {
                                    std::char::from_u32(c).ok_or_else(|| {
                                        LexicalError::new(
                                            format!("Unicode escape out of range {hex}"),
                                            lex.span(),
                                        )
                                    })
                                }) {
                                Ok(c) => result.push(c),
                                Err(e) => return Some(Err(e)),
                            }
                        }
                        Some(Ok(Byte)) => {
                            let hex = &lex.slice()[1..];
                            match u8::from_str_radix(hex, 16) {
                                Ok(byte) => {
                                    // According to https://webassembly.github.io/spec/core/text/values.html#strings
                                    // \xx escape can break utf8 unicode.
                                    let bytes = unsafe { result.as_mut_vec() };
                                    bytes.push(byte);
                                }
                                Err(_) => {
                                    return Some(Err(LexicalError::new(
                                        "Not a valid hex escape",
                                        lex.span(),
                                    )))
                                }
                            }
                        }
                        Some(Ok(EndString)) => break,
                        Some(Err(_)) => {
                            return Some(Err(LexicalError::new(
                                format!("Unexpected string {}", lex.slice()),
                                lex.span(),
                            )))
                        }
                        None => {
                            return Some(Err(LexicalError::new(
                                "Unclosed string",
                                span.start..lex.span().end,
                            )))
                        }
                    }
                }
                self.lex = lex.morph::<Token>();
                Some(Ok((span.start, Token::Text(result), self.lex.span().end)))
            }
            Ok(token) => {
                if self.trivia.is_some() {
                    let has_blank_line = self.has_blank_line_before_token(span.start);
                    if let Some(trivia) = &mut self.trivia {
                        if !self.comment_buffer.is_empty() {
                            if !has_blank_line {
                                let content: Vec<String> = mem::take(&mut self.comment_buffer);
                                trivia.borrow_mut().insert(span.start, content);
                            } else {
                                self.comment_buffer.clear();
                            }
                        }
                        self.last_comment_end = None;
                        self.last_token_line_end = Some(self.find_line_end(span.end));
                    }
                }

                Some(Ok((span.start, token, span.end)))
            }
        }
    }
}