meta-language 0.46.0

A self-describing links-network core for lossless language representation
use super::{skeleton_error, GrammarSurfaceError};

/// Bracket family that encloses a [`Token::Group`].
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(super) enum Delimiter {
    /// Group opened by `(` and closed by `)`.
    Round,
    /// Group opened by `[` and closed by `]`.
    Square,
    /// Group opened by `{` and closed by `}`.
    Brace,
}

/// Quote character that delimited a [`Token::Quoted`] string.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(super) enum QuoteKind {
    /// Delimited by `'`.
    Single,
    /// Delimited by `"`.
    Double,
    /// Delimited by a backtick.
    Backtick,
}

/// One node of the token tree produced by [`parse_surface_tokens`].
#[derive(Clone, Debug, PartialEq, Eq)]
pub(super) enum Token {
    /// Bare text: either a run of non-delimiter characters or a single
    /// punctuation character such as `:` or `|`.
    Atom(String),
    /// Quoted string literal.
    Quoted {
        /// Text between the quotes, with each doubled quote character
        /// collapsed into a single one.
        value: String,
        /// Which quote character delimited the string.
        quote: QuoteKind,
    },
    /// Bracketed sequence of nested tokens.
    Group {
        /// Bracket family that enclosed the group.
        delimiter: Delimiter,
        /// Tokens found between the opening and closing bracket.
        tokens: Vec<Self>,
    },
}

impl Token {
    /// Borrows the text of a bare atom.
    ///
    /// Quoted strings and groups are not atoms and yield `None`.
    pub(super) fn atom(&self) -> Option<&str> {
        match self {
            Self::Quoted { .. } | Self::Group { .. } => None,
            Self::Atom(text) => Some(text.as_str()),
        }
    }

    /// Reports whether this token is a bare atom spelled exactly `expected`.
    pub(super) fn is_atom(&self, expected: &str) -> bool {
        self.atom().is_some_and(|text| text == expected)
    }
}

/// Tokenizes `text` into a tree of surface tokens.
///
/// # Errors
///
/// Propagates whatever error the tokenizer reports, e.g. for unbalanced
/// delimiters or an unterminated quoted string.
pub(super) fn parse_surface_tokens(text: &str) -> Result<Vec<Token>, GrammarSurfaceError> {
    let tokenizer = SurfaceTokenizer::new(text);
    tokenizer.parse_document()
}

/// Cursor-based scanner that turns surface text into [`Token`]s.
struct SurfaceTokenizer<'input> {
    /// Complete input being tokenized.
    text: &'input str,
    /// Byte offset into `text` of the next unread character.
    cursor: usize,
}

impl<'input> SurfaceTokenizer<'input> {
    /// Creates a tokenizer positioned at the start of `text`.
    const fn new(text: &'input str) -> Self {
        Self { text, cursor: 0 }
    }

    /// Tokenizes the entire input as a top-level token sequence.
    ///
    /// # Errors
    ///
    /// Returns the error built by `skeleton_error` when a delimiter is
    /// unbalanced or a quoted string is never closed.
    fn parse_document(mut self) -> Result<Vec<Token>, GrammarSurfaceError> {
        self.parse_tokens_until(None)
    }

    /// Collects tokens until `closing` has been consumed, or until the end of
    /// the input when `closing` is `None`.
    ///
    /// # Errors
    ///
    /// Fails on a closing bracket other than the expected one, on input that
    /// ends before `closing` is seen, and on any error from a nested group or
    /// quoted string.
    fn parse_tokens_until(
        &mut self,
        closing: Option<char>,
    ) -> Result<Vec<Token>, GrammarSurfaceError> {
        let mut tokens = Vec::new();
        loop {
            self.skip_whitespace();
            let Some(character) = self.current_char() else {
                break;
            };

            // The expected closer ends this level; it must be tested before
            // the generic "unexpected closing delimiter" arm below.
            if Some(character) == closing {
                self.advance(character);
                return Ok(tokens);
            }

            let token = match character {
                '(' => self.parse_group(Delimiter::Round, ')')?,
                '[' => self.parse_group(Delimiter::Square, ']')?,
                '{' => self.parse_group(Delimiter::Brace, '}')?,
                ')' | ']' | '}' => {
                    return Err(skeleton_error(format!(
                        "unexpected closing delimiter {character:?}"
                    )));
                }
                '\'' | '"' | '`' => self.parse_quoted(character)?,
                punctuation if Self::is_punctuation(punctuation) => {
                    self.advance(punctuation);
                    Token::Atom(punctuation.to_string())
                }
                // Every atom boundary was handled above, so `parse_atom`
                // always consumes at least this character.
                _ => self.parse_atom(),
            };
            tokens.push(token);
        }

        match closing {
            Some(closing) => Err(skeleton_error(format!(
                "missing closing delimiter {closing:?}"
            ))),
            None => Ok(tokens),
        }
    }

    /// Parses a bracketed group; the cursor must be on the opening bracket.
    ///
    /// # Errors
    ///
    /// Propagates any error from tokenizing the group's contents, including a
    /// missing `closing` bracket.
    fn parse_group(
        &mut self,
        delimiter: Delimiter,
        closing: char,
    ) -> Result<Token, GrammarSurfaceError> {
        // Every opening bracket is a single-byte ASCII character.
        self.cursor += 1;
        let tokens = self.parse_tokens_until(Some(closing))?;
        Ok(Token::Group { delimiter, tokens })
    }

    /// Parses a quoted string; the cursor must be on the opening `quote`.
    ///
    /// Inside the string, two consecutive `quote` characters stand for one
    /// literal quote character; a lone `quote` terminates the string.
    ///
    /// # Errors
    ///
    /// Fails when the input ends before the terminating quote.
    fn parse_quoted(&mut self, quote: char) -> Result<Token, GrammarSurfaceError> {
        self.advance(quote);

        let mut value = String::new();
        while let Some(character) = self.current_char() {
            self.advance(character);
            if character != quote {
                value.push(character);
                continue;
            }

            // A quote directly followed by another quote is an escaped quote.
            if self.current_char() == Some(quote) {
                self.advance(quote);
                value.push(quote);
                continue;
            }

            return Ok(Token::Quoted {
                value,
                quote: quote_kind(quote),
            });
        }

        Err(skeleton_error(format!(
            "unterminated quoted string starting with {quote:?}"
        )))
    }

    /// Consumes characters up to the next atom boundary and returns them as
    /// an atom.
    fn parse_atom(&mut self) -> Token {
        let text = self.text;
        let rest = &text[self.cursor..];
        let length = rest.find(Self::is_atom_boundary).unwrap_or(rest.len());
        self.cursor += length;
        Token::Atom(rest[..length].to_string())
    }

    /// Moves the cursor past any run of Unicode whitespace.
    fn skip_whitespace(&mut self) {
        let rest = &self.text[self.cursor..];
        self.cursor += rest.len() - rest.trim_start().len();
    }

    /// Returns the character under the cursor, or `None` at end of input.
    fn current_char(&self) -> Option<char> {
        self.text[self.cursor..].chars().next()
    }

    /// Moves the cursor past `character`, which must be the current character.
    fn advance(&mut self, character: char) {
        self.cursor += character.len_utf8();
    }

    /// Reports whether `character` is an operator that always forms a
    /// one-character atom of its own.
    const fn is_punctuation(character: char) -> bool {
        matches!(
            character,
            ':' | ',' | '?' | '*' | '+' | '/' | '|' | '&' | '!' | '.' | '^'
        )
    }

    /// Reports whether `character` ends a multi-character atom: whitespace,
    /// any bracket, any quote character, or punctuation.
    ///
    /// Reusing `is_punctuation` keeps this set and the dispatch in
    /// `parse_tokens_until` from drifting apart.
    fn is_atom_boundary(character: char) -> bool {
        character.is_whitespace()
            || matches!(
                character,
                '(' | ')' | '[' | ']' | '{' | '}' | '\'' | '"' | '`'
            )
            || Self::is_punctuation(character)
    }
}

/// Maps a quote character to the [`QuoteKind`] it denotes.
///
/// # Panics
///
/// Panics if `quote` is not one of `'`, `"`, or a backtick.
fn quote_kind(quote: char) -> QuoteKind {
    match quote {
        '`' => QuoteKind::Backtick,
        '"' => QuoteKind::Double,
        '\'' => QuoteKind::Single,
        _ => unreachable!("only quote characters reach quote_kind"),
    }
}