// ty-parser 0.1.0
//
// C-syntax parser for the Ty programming language
// Documentation
use crate::token::{Token, keyword};
use crate::{Error, Span};

/// A lexed token together with the source position at which it starts.
pub struct SpannedToken {
    /// The token itself.
    pub token: Token,
    /// Line/column position recorded just before the token was lexed.
    pub span: Span,
}

/// Character-level lexer state: the decoded input plus a cursor that tracks
/// the current index and the line/column of the next unread character.
#[derive(Debug, Clone)]
pub struct Lexer {
    /// Input decoded into `char`s, so `position` indexes whole characters
    /// rather than UTF-8 bytes.
    source: Vec<char>,
    /// Index into `source` of the next unread character.
    position: usize,
    /// Line number of the next unread character (starts at 1).
    line: u32,
    /// Column of the next unread character (starts at 1, reset after `'\n'`).
    column: u32,
}

impl Lexer {
    pub fn new(source: &str) -> Self {
        Self {
            source: source.chars().collect(),
            position: 0,
            line: 1,
            column: 1,
        }
    }

    pub fn tokenize(&mut self) -> crate::Result<Vec<SpannedToken>> {
        let mut tokens = Vec::new();
        loop {
            self.skip_whitespace_and_comments();
            let span = self.span();
            if self.at_end() {
                tokens.push(SpannedToken {
                    token: Token::End,
                    span,
                });
                break;
            }
            let token = self.next_token()?;
            tokens.push(SpannedToken { token, span });
        }
        Ok(tokens)
    }

    fn span(&self) -> Span {
        Span {
            line: self.line,
            column: self.column,
        }
    }

    fn at_end(&self) -> bool {
        self.position >= self.source.len()
    }

    fn peek(&self) -> char {
        self.source.get(self.position).copied().unwrap_or('\0')
    }

    fn peek_next(&self) -> char {
        self.source.get(self.position + 1).copied().unwrap_or('\0')
    }

    fn advance(&mut self) -> char {
        let ch = self.peek();
        self.position += 1;
        if ch == '\n' {
            self.line += 1;
            self.column = 1;
        } else {
            self.column += 1;
        }
        ch
    }

    fn skip_whitespace_and_comments(&mut self) {
        loop {
            while !self.at_end() && self.peek().is_whitespace() {
                self.advance();
            }
            if self.peek() == '/' && self.peek_next() == '/' {
                while !self.at_end() && self.peek() != '\n' {
                    self.advance();
                }
            } else {
                break;
            }
        }
    }

    fn next_token(&mut self) -> crate::Result<Token> {
        let ch = self.peek();

        if ch.is_ascii_digit() {
            return self.lex_number();
        }
        if ch.is_ascii_alphabetic() || ch == '_' {
            return Ok(self.lex_identifier());
        }
        if ch == '"' {
            return self.lex_string();
        }

        let span = self.span();
        self.advance();
        match ch {
            '(' => Ok(Token::LeftParen),
            ')' => Ok(Token::RightParen),
            '{' => Ok(Token::LeftBrace),
            '}' => Ok(Token::RightBrace),
            '[' => Ok(Token::LeftBracket),
            ']' => Ok(Token::RightBracket),
            ';' => Ok(Token::Semicolon),
            ',' => Ok(Token::Comma),
            ':' => {
                if self.peek() == '=' {
                    self.advance();
                    Ok(Token::ColonEqual)
                } else {
                    Ok(Token::Colon)
                }
            }
            '@' => Ok(Token::At),
            '~' => Ok(Token::Tilde),
            '^' => Ok(Token::Caret),
            '%' => Ok(Token::Percent),
            '+' => {
                if self.peek() == '=' {
                    self.advance();
                    Ok(Token::PlusEqual)
                } else {
                    Ok(Token::Plus)
                }
            }
            '/' => {
                if self.peek() == '=' {
                    self.advance();
                    Ok(Token::SlashEqual)
                } else {
                    Ok(Token::Slash)
                }
            }
            '.' => {
                if self.peek() == '.' && self.peek_next() == '.' {
                    self.advance();
                    self.advance();
                    Ok(Token::Ellipsis)
                } else {
                    Ok(Token::Dot)
                }
            }
            '*' => {
                if self.peek() == '=' {
                    self.advance();
                    Ok(Token::StarEqual)
                } else {
                    Ok(Token::Star)
                }
            }
            '-' => {
                if self.peek() == '=' {
                    self.advance();
                    Ok(Token::MinusEqual)
                } else if self.peek() == '>' {
                    self.advance();
                    Ok(Token::Arrow)
                } else {
                    Ok(Token::Minus)
                }
            }
            '=' => {
                if self.peek() == '=' {
                    self.advance();
                    Ok(Token::EqualEqual)
                } else if self.peek() == '>' {
                    self.advance();
                    Ok(Token::FatArrow)
                } else {
                    Ok(Token::Equal)
                }
            }
            '!' => {
                if self.peek() == '=' {
                    self.advance();
                    Ok(Token::BangEqual)
                } else {
                    Ok(Token::Bang)
                }
            }
            '<' => {
                if self.peek() == '=' {
                    self.advance();
                    Ok(Token::LessEqual)
                } else if self.peek() == '<' {
                    self.advance();
                    Ok(Token::LessLess)
                } else {
                    Ok(Token::Less)
                }
            }
            '>' => {
                if self.peek() == '=' {
                    self.advance();
                    Ok(Token::GreaterEqual)
                } else if self.peek() == '>' {
                    self.advance();
                    Ok(Token::GreaterGreater)
                } else {
                    Ok(Token::Greater)
                }
            }
            '&' => {
                if self.peek() == '&' {
                    self.advance();
                    Ok(Token::AmpAmp)
                } else {
                    Ok(Token::Amp)
                }
            }
            '|' => {
                if self.peek() == '|' {
                    self.advance();
                    Ok(Token::PipePipe)
                } else {
                    Ok(Token::Pipe)
                }
            }
            _ => Err(Error {
                message: format!("unexpected character '{ch}'"),
                span: Some(span),
            }),
        }
    }

    fn lex_number(&mut self) -> crate::Result<Token> {
        let span = self.span();
        let first = self.advance();
        if first == '0' && matches!(self.peek(), 'x' | 'b' | 'o') {
            let prefix = self.advance();
            let (radix, valid_digit): (u32, fn(char) -> bool) = match prefix {
                'x' => (16, |c: char| c.is_ascii_hexdigit()),
                'b' => (2, |c: char| c == '0' || c == '1'),
                'o' => (8, |c: char| ('0'..='7').contains(&c)),
                _ => unreachable!(),
            };
            let mut digits = std::string::String::new();
            while !self.at_end() && (valid_digit(self.peek()) || self.peek() == '_') {
                let ch = self.advance();
                if ch != '_' {
                    digits.push(ch);
                }
            }
            let value = u128::from_str_radix(&digits, radix).map_err(|_| Error {
                message: format!("invalid integer literal '0{prefix}{digits}'"),
                span: Some(span),
            })?;
            return Ok(Token::Integer(value));
        }
        let mut number = std::string::String::from(first);
        while !self.at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
            let ch = self.advance();
            if ch != '_' {
                number.push(ch);
            }
        }
        if self.peek() == '.' && self.peek_next().is_ascii_digit() {
            number.push(self.advance());
            while !self.at_end() && (self.peek().is_ascii_digit() || self.peek() == '_') {
                let ch = self.advance();
                if ch != '_' {
                    number.push(ch);
                }
            }
            let value: f64 = number.parse().map_err(|_| Error {
                message: format!("invalid float literal '{number}'"),
                span: Some(span),
            })?;
            Ok(Token::Float(value))
        } else {
            let value: u128 = number.parse().map_err(|_| Error {
                message: format!("invalid integer literal '{number}'"),
                span: Some(span),
            })?;
            Ok(Token::Integer(value))
        }
    }

    fn lex_identifier(&mut self) -> Token {
        let mut name = std::string::String::new();
        while !self.at_end() && (self.peek().is_ascii_alphanumeric() || self.peek() == '_') {
            name.push(self.advance());
        }
        keyword(&name).unwrap_or(Token::Identifier(name))
    }

    fn lex_string(&mut self) -> crate::Result<Token> {
        let span = self.span();
        self.advance();
        let mut bytes = Vec::new();
        loop {
            if self.at_end() {
                return Err(Error {
                    message: "unterminated string literal".into(),
                    span: Some(span),
                });
            }
            let ch = self.advance();
            if ch == '"' {
                break;
            }
            if ch == '\\' {
                if self.at_end() {
                    return Err(Error {
                        message: "unterminated escape sequence".into(),
                        span: Some(self.span()),
                    });
                }
                let escaped = self.advance();
                match escaped {
                    'n' => bytes.push(b'\n'),
                    't' => bytes.push(b'\t'),
                    'r' => bytes.push(b'\r'),
                    '\\' => bytes.push(b'\\'),
                    '"' => bytes.push(b'"'),
                    '0' => bytes.push(0),
                    _ => {
                        return Err(Error {
                            message: format!("unknown escape sequence '\\{escaped}'"),
                            span: Some(self.span()),
                        });
                    }
                }
            } else {
                let mut buffer = [0u8; 4];
                let encoded = ch.encode_utf8(&mut buffer);
                bytes.extend_from_slice(encoded.as_bytes());
            }
        }
        Ok(Token::String(bytes))
    }
}