meta-language 0.46.0

A self-describing links-network core for lossless language representation
use super::error_at;
use crate::grammar::import::GrammarImportError;

#[derive(Clone, Debug, PartialEq, Eq)]
pub(super) struct Token {
    pub(super) kind: TokenKind,
    pub(super) offset: usize,
}

impl Token {
    pub(super) fn ident(&self) -> Option<&str> {
        match &self.kind {
            TokenKind::Ident(value) => Some(value),
            _ => None,
        }
    }

    pub(super) fn is_keyword(&self, keyword: &str) -> bool {
        self.ident() == Some(keyword)
    }

    pub(super) fn text(&self) -> String {
        match &self.kind {
            TokenKind::Ident(value) | TokenKind::Comment(value) => value.clone(),
            TokenKind::String(value) => format!("'{}'", escape_literal(value)),
            TokenKind::CharSet(value) => format!("[{value}]"),
            TokenKind::Action(value) => format!("{{{value}}}"),
            TokenKind::Colon => ":".to_string(),
            TokenKind::Semicolon => ";".to_string(),
            TokenKind::Pipe => "|".to_string(),
            TokenKind::LParen => "(".to_string(),
            TokenKind::RParen => ")".to_string(),
            TokenKind::Question => "?".to_string(),
            TokenKind::Star => "*".to_string(),
            TokenKind::Plus => "+".to_string(),
            TokenKind::Tilde => "~".to_string(),
            TokenKind::Dot => ".".to_string(),
            TokenKind::Equal => "=".to_string(),
            TokenKind::PlusEqual => "+=".to_string(),
            TokenKind::Arrow => "->".to_string(),
            TokenKind::Range => "..".to_string(),
            TokenKind::Comma => ",".to_string(),
        }
    }
}

#[derive(Clone, Debug, PartialEq, Eq)]
pub(super) enum TokenKind {
    Ident(String),
    String(String),
    CharSet(String),
    Action(String),
    Comment(String),
    Colon,
    Semicolon,
    Pipe,
    LParen,
    RParen,
    Question,
    Star,
    Plus,
    Tilde,
    Dot,
    Equal,
    PlusEqual,
    Arrow,
    Range,
    Comma,
}

#[derive(Clone, Debug)]
pub(super) struct Lexer<'text> {
    text: &'text str,
    cursor: usize,
}

impl<'text> Lexer<'text> {
    pub(super) const fn new(text: &'text str) -> Self {
        Self { text, cursor: 0 }
    }

    pub(super) fn tokenize(mut self) -> Result<Vec<Token>, GrammarImportError> {
        let mut tokens = Vec::new();
        while !self.is_end() {
            self.skip_whitespace();
            if self.is_end() {
                break;
            }
            let offset = self.cursor;
            let kind = self.next_token_kind()?;
            tokens.push(Token { kind, offset });
        }
        Ok(tokens)
    }

    fn next_token_kind(&mut self) -> Result<TokenKind, GrammarImportError> {
        if self.starts_with("//") {
            return Ok(TokenKind::Comment(self.line_comment()));
        }
        if self.starts_with("/*") {
            return self.block_comment().map(TokenKind::Comment);
        }

        let Some(character) = self.peek_char() else {
            return Err(error_at(self.cursor, "unexpected end of input"));
        };

        match character {
            '\'' => self.string_literal().map(TokenKind::String),
            '[' => self.char_set().map(TokenKind::CharSet),
            '{' => self.action_block().map(TokenKind::Action),
            ':' => {
                self.advance_char();
                Ok(TokenKind::Colon)
            }
            ';' => {
                self.advance_char();
                Ok(TokenKind::Semicolon)
            }
            '|' => {
                self.advance_char();
                Ok(TokenKind::Pipe)
            }
            '(' => {
                self.advance_char();
                Ok(TokenKind::LParen)
            }
            ')' => {
                self.advance_char();
                Ok(TokenKind::RParen)
            }
            '?' => {
                self.advance_char();
                Ok(TokenKind::Question)
            }
            '*' => {
                self.advance_char();
                Ok(TokenKind::Star)
            }
            '+' if self.starts_with("+=") => {
                self.cursor += 2;
                Ok(TokenKind::PlusEqual)
            }
            '+' => {
                self.advance_char();
                Ok(TokenKind::Plus)
            }
            '~' => {
                self.advance_char();
                Ok(TokenKind::Tilde)
            }
            '.' if self.starts_with("..") => {
                self.cursor += 2;
                Ok(TokenKind::Range)
            }
            '.' => {
                self.advance_char();
                Ok(TokenKind::Dot)
            }
            '=' => {
                self.advance_char();
                Ok(TokenKind::Equal)
            }
            '-' if self.starts_with("->") => {
                self.cursor += 2;
                Ok(TokenKind::Arrow)
            }
            ',' => {
                self.advance_char();
                Ok(TokenKind::Comma)
            }
            character if is_ident_start(character) => Ok(TokenKind::Ident(self.identifier())),
            character => Err(error_at(
                self.cursor,
                format!("unexpected character {character:?}"),
            )),
        }
    }

    fn line_comment(&mut self) -> String {
        let start = self.cursor;
        while let Some(character) = self.peek_char() {
            if character == '\n' || character == '\r' {
                break;
            }
            self.advance_char();
        }
        self.text[start..self.cursor].trim().to_string()
    }

    fn block_comment(&mut self) -> Result<String, GrammarImportError> {
        let start = self.cursor;
        self.cursor += 2;
        while !self.is_end() {
            if self.starts_with("*/") {
                self.cursor += 2;
                return Ok(self.text[start..self.cursor].trim().to_string());
            }
            self.advance_char();
        }
        Err(error_at(start, "unterminated block comment"))
    }

    fn string_literal(&mut self) -> Result<String, GrammarImportError> {
        let start = self.cursor;
        self.advance_char();
        let mut value = String::new();
        while let Some(character) = self.advance_char() {
            match character {
                '\'' => return Ok(value),
                '\\' => value.push(self.escape_sequence(start)?),
                character => value.push(character),
            }
        }
        Err(error_at(start, "unterminated string literal"))
    }

    fn char_set(&mut self) -> Result<String, GrammarImportError> {
        let start = self.cursor;
        self.advance_char();
        let mut escaped = false;
        let content_start = self.cursor;
        while let Some(character) = self.advance_char() {
            if escaped {
                escaped = false;
            } else if character == '\\' {
                escaped = true;
            } else if character == ']' {
                let content_end = self.cursor - character.len_utf8();
                return Ok(self.text[content_start..content_end].to_string());
            }
        }
        Err(error_at(start, "unterminated character set"))
    }

    fn action_block(&mut self) -> Result<String, GrammarImportError> {
        let start = self.cursor;
        self.advance_char();
        let content_start = self.cursor;
        let mut depth = 1_usize;
        while let Some(character) = self.advance_char() {
            match character {
                '\'' | '"' => self.skip_quoted_in_action(character, start)?,
                '/' if self.starts_with("/") => self.skip_line_comment_in_action(),
                '/' if self.starts_with("*") => self.skip_block_comment_in_action(start)?,
                '{' => depth += 1,
                '}' => {
                    depth -= 1;
                    if depth == 0 {
                        let content_end = self.cursor - character.len_utf8();
                        return Ok(self.text[content_start..content_end].to_string());
                    }
                }
                _ => {}
            }
        }
        Err(error_at(start, "unterminated action block"))
    }

    fn skip_quoted_in_action(
        &mut self,
        quote: char,
        action_start: usize,
    ) -> Result<(), GrammarImportError> {
        let mut escaped = false;
        while let Some(character) = self.advance_char() {
            if escaped {
                escaped = false;
            } else if character == '\\' {
                escaped = true;
            } else if character == quote {
                return Ok(());
            }
        }
        Err(error_at(action_start, "unterminated quoted action text"))
    }

    fn skip_line_comment_in_action(&mut self) {
        self.advance_char();
        while let Some(character) = self.peek_char() {
            if character == '\n' || character == '\r' {
                break;
            }
            self.advance_char();
        }
    }

    fn skip_block_comment_in_action(
        &mut self,
        action_start: usize,
    ) -> Result<(), GrammarImportError> {
        self.advance_char();
        while !self.is_end() {
            if self.starts_with("*/") {
                self.cursor += 2;
                return Ok(());
            }
            self.advance_char();
        }
        Err(error_at(
            action_start,
            "unterminated block comment in action",
        ))
    }

    fn escape_sequence(&mut self, start: usize) -> Result<char, GrammarImportError> {
        let Some(character) = self.advance_char() else {
            return Err(error_at(start, "unterminated escape sequence"));
        };
        match character {
            'n' => Ok('\n'),
            'r' => Ok('\r'),
            't' => Ok('\t'),
            'b' => Ok('\u{08}'),
            'f' => Ok('\u{0c}'),
            '\\' | '\'' | '"' | '[' | ']' | '-' => Ok(character),
            'u' => self.unicode_escape(start),
            character => Ok(character),
        }
    }

    fn unicode_escape(&mut self, start: usize) -> Result<char, GrammarImportError> {
        let mut value = 0_u32;
        for _ in 0..4 {
            let Some(character) = self.advance_char() else {
                return Err(error_at(start, "unterminated unicode escape"));
            };
            let Some(digit) = character.to_digit(16) else {
                return Err(error_at(
                    self.cursor - character.len_utf8(),
                    "unicode escape requires hexadecimal digits",
                ));
            };
            value = (value << 4) | digit;
        }
        char::from_u32(value).ok_or_else(|| error_at(start, "invalid unicode escape"))
    }

    fn identifier(&mut self) -> String {
        let start = self.cursor;
        self.advance_char();
        while self.peek_char().is_some_and(is_ident_continue) {
            self.advance_char();
        }
        self.text[start..self.cursor].to_string()
    }

    fn skip_whitespace(&mut self) {
        while self.peek_char().is_some_and(char::is_whitespace) {
            self.advance_char();
        }
    }

    fn starts_with(&self, prefix: &str) -> bool {
        self.text[self.cursor..].starts_with(prefix)
    }

    const fn is_end(&self) -> bool {
        self.cursor >= self.text.len()
    }

    fn peek_char(&self) -> Option<char> {
        self.text[self.cursor..].chars().next()
    }

    fn advance_char(&mut self) -> Option<char> {
        let character = self.peek_char()?;
        self.cursor += character.len_utf8();
        Some(character)
    }
}

fn escape_literal(value: &str) -> String {
    value
        .chars()
        .flat_map(char::escape_default)
        .collect::<String>()
        .replace('\'', "\\'")
}

const fn is_ident_start(character: char) -> bool {
    character == '_' || character.is_ascii_alphabetic()
}

const fn is_ident_continue(character: char) -> bool {
    character == '_' || character.is_ascii_alphanumeric()
}