// antlr-rust-runtime 0.1.0
//
// Clean-room Rust runtime and target support for ANTLR v4 generated parsers
// Documentation
use crate::char_stream::TextInterval;
use std::fmt;
use std::rc::Rc;

/// Token type assigned to end-of-input tokens (see [`CommonToken::eof`]).
pub const TOKEN_EOF: i32 = -1;
/// Token type value reserved for invalid tokens.
///
/// NOTE(review): not referenced elsewhere in this file; presumably mirrors
/// ANTLR's invalid-type constant — confirm against callers.
pub const INVALID_TOKEN_TYPE: i32 = 0;
/// Channel number given to tokens unless another channel is requested.
pub const DEFAULT_CHANNEL: i32 = 0;
/// Channel number represented by [`TokenChannel::Hidden`].
pub const HIDDEN_CHANNEL: i32 = 1;

/// Typed view of a raw token channel number.
///
/// Converts to a raw `i32` with [`TokenChannel::value`] and from one with
/// `From<i32>`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum TokenChannel {
    /// The channel numbered [`DEFAULT_CHANNEL`].
    Default,
    /// The channel numbered [`HIDDEN_CHANNEL`].
    Hidden,
    /// A channel identified by its raw number; `From<i32>` produces this for
    /// every value other than the two named channels.
    Custom(i32),
}

impl TokenChannel {
    /// Returns the raw channel number for this channel.
    ///
    /// The named variants map to [`DEFAULT_CHANNEL`] and [`HIDDEN_CHANNEL`];
    /// a custom channel yields the number it carries.
    pub const fn value(self) -> i32 {
        match self {
            Self::Custom(raw) => raw,
            Self::Hidden => HIDDEN_CHANNEL,
            Self::Default => DEFAULT_CHANNEL,
        }
    }
}

impl From<i32> for TokenChannel {
    fn from(value: i32) -> Self {
        match value {
            DEFAULT_CHANNEL => Self::Default,
            HIDDEN_CHANNEL => Self::Hidden,
            other => Self::Custom(other),
        }
    }
}

/// Read-only view of a token's type, channel, span, position and text.
pub trait Token: fmt::Debug {
    /// Returns the token's type id.
    fn token_type(&self) -> i32;
    /// Returns the raw channel number the token is assigned to.
    fn channel(&self) -> i32;
    /// Returns the start of the token's span.
    fn start(&self) -> usize;
    /// Returns the end of the token's span.
    fn stop(&self) -> usize;
    /// Returns the token's index.
    ///
    /// NOTE(review): presumably the position within a token stream;
    /// `CommonToken` reports `-1` until an index is assigned.
    fn token_index(&self) -> isize;
    /// Returns the line recorded for the token.
    fn line(&self) -> usize;
    /// Returns the column recorded for the token.
    fn column(&self) -> usize;
    /// Returns the token's text, if it has any.
    fn text(&self) -> Option<&str>;
    /// Returns the name of the source the token came from.
    fn source_name(&self) -> &str;

    /// Returns the token's span as a [`TextInterval`] built from
    /// [`Token::start`] and [`Token::stop`].
    fn interval(&self) -> TextInterval {
        let (first, last) = (self.start(), self.stop());
        TextInterval::new(first, last)
    }
}

/// Owned token value implementing [`Token`].
///
/// Built with [`CommonToken::new`] / [`CommonToken::eof`] and refined with the
/// `with_*` builder methods.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct CommonToken {
    /// Token type id; [`TOKEN_EOF`] for end-of-input tokens.
    token_type: i32,
    /// Raw channel number; [`DEFAULT_CHANNEL`] unless overridden.
    channel: i32,
    /// Start of the token's span.
    start: usize,
    /// End of the token's span; `usize::MAX` is displayed as `-1`.
    stop: usize,
    /// Token index; `-1` until set through `set_token_index`.
    token_index: isize,
    /// Line recorded for the token (`1` by default).
    line: usize,
    /// Column recorded for the token (`0` by default).
    column: usize,
    /// Explicit token text, when one has been attached.
    text: Option<String>,
    /// Name of the source the token came from (empty by default).
    source_name: String,
}

/// Bundle of token fields handed to [`TokenFactory::create`].
///
/// It has no token index field; the source name is borrowed for `'a`.
#[derive(Debug)]
pub struct TokenSpec<'a> {
    /// Token type id for the new token.
    pub token_type: i32,
    /// Raw channel number for the new token.
    pub channel: i32,
    /// Start of the token's span.
    pub start: usize,
    /// End of the token's span.
    pub stop: usize,
    /// Line to record on the token.
    pub line: usize,
    /// Column to record on the token.
    pub column: usize,
    /// Optional explicit text for the token.
    pub text: Option<String>,
    /// Name of the source the token comes from.
    pub source_name: &'a str,
}

impl CommonToken {
    /// Creates a token of the given type with default values elsewhere:
    /// default channel, span `0..=0`, index `-1`, line `1`, column `0`,
    /// no text and an empty source name.
    pub const fn new(token_type: i32) -> Self {
        Self {
            token_type,
            text: None,
            source_name: String::new(),
            channel: DEFAULT_CHANNEL,
            token_index: -1,
            start: 0,
            stop: 0,
            line: 1,
            column: 0,
        }
    }

    /// Creates an end-of-input token positioned at `index`.
    ///
    /// The token has type [`TOKEN_EOF`], the default channel, text `<EOF>`
    /// and a span that starts at `index` and stops one position earlier.
    pub fn eof(source_name: impl Into<String>, index: usize, line: usize, column: usize) -> Self {
        // At index 0 there is no earlier position; `usize::MAX` stands in for
        // ANTLR's `-1` stop index and is rendered as such by `Display`.
        let stop = match index {
            0 => usize::MAX,
            position => position - 1,
        };
        Self::new(TOKEN_EOF)
            .with_span(index, stop)
            .with_position(line, column)
            .with_text("<EOF>")
            .with_source_name(source_name)
    }

    /// Returns the token with its text replaced by `text`.
    #[must_use]
    pub fn with_text(self, text: impl Into<String>) -> Self {
        Self {
            text: Some(text.into()),
            ..self
        }
    }

    /// Returns the token with its span replaced by `start` and `stop`.
    #[must_use]
    pub const fn with_span(mut self, start: usize, stop: usize) -> Self {
        self.start = start;
        self.stop = stop;
        self
    }

    /// Returns the token with its line and column replaced.
    #[must_use]
    pub const fn with_position(mut self, line: usize, column: usize) -> Self {
        self.line = line;
        self.column = column;
        self
    }

    /// Returns the token reassigned to the raw channel number `channel`.
    #[must_use]
    pub const fn with_channel(mut self, channel: i32) -> Self {
        self.channel = channel;
        self
    }

    /// Returns the token with its source name replaced by `source_name`.
    #[must_use]
    pub fn with_source_name(self, source_name: impl Into<String>) -> Self {
        Self {
            source_name: source_name.into(),
            ..self
        }
    }

    /// Overwrites the token's index in place.
    pub const fn set_token_index(&mut self, token_index: isize) {
        self.token_index = token_index;
    }
}

/// Field-backed accessors: every method returns the stored value unchanged.
impl Token for CommonToken {
    /// Returns the stored token type id.
    fn token_type(&self) -> i32 {
        self.token_type
    }

    /// Returns the stored raw channel number.
    fn channel(&self) -> i32 {
        self.channel
    }

    /// Returns the stored span start.
    fn start(&self) -> usize {
        self.start
    }

    /// Returns the stored span end.
    fn stop(&self) -> usize {
        self.stop
    }

    /// Returns the stored token index (`-1` until one is set).
    fn token_index(&self) -> isize {
        self.token_index
    }

    /// Returns the stored line.
    fn line(&self) -> usize {
        self.line
    }

    /// Returns the stored column.
    fn column(&self) -> usize {
        self.column
    }

    /// Borrows the stored text, or `None` when no text was attached.
    fn text(&self) -> Option<&str> {
        self.text.as_deref()
    }

    /// Borrows the stored source name.
    fn source_name(&self) -> &str {
        self.source_name.as_str()
    }
}

impl fmt::Display for CommonToken {
    /// Writes the token as `[@index,start:stop='text',<type>,line:column]`.
    ///
    /// A `,channel=N` segment follows the type only when the token is not on
    /// the default channel. Missing text is rendered as an empty string, and
    /// the text and span boundaries pass through `display_text` and
    /// `display_token_boundary` respectively.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let channel_suffix = match self.channel() {
            DEFAULT_CHANNEL => String::new(),
            other => format!(",channel={}", other),
        };
        let first = display_token_boundary(self.start());
        let last = display_token_boundary(self.stop());
        let escaped = display_text(self.text().unwrap_or(""));
        write!(
            f,
            "[@{},{}:{}='{}',<{}>{},{}:{}]",
            self.token_index(),
            first,
            last,
            escaped,
            self.token_type(),
            channel_suffix,
            self.line(),
            self.column()
        )
    }
}

/// Renders a span boundary for token display.
///
/// `usize::MAX` is the in-memory stand-in for ANTLR's `-1` boundary and is
/// printed as `-1`; every other value is printed in decimal.
fn display_token_boundary(value: usize) -> String {
    match value {
        usize::MAX => String::from("-1"),
        boundary => boundary.to_string(),
    }
}

/// Escapes token text for the one-line token display format.
///
/// Only newline, carriage return and tab are rewritten (to `\n`, `\r` and
/// `\t`). Unlike `Debug` escaping, backslashes and quotes are copied through
/// untouched, which is what ANTLR's token display does.
fn display_text(text: &str) -> String {
    text.chars().fold(String::new(), |mut escaped, ch| {
        match ch {
            '\n' => escaped.push_str("\\n"),
            '\r' => escaped.push_str("\\r"),
            '\t' => escaped.push_str("\\t"),
            plain => escaped.push(plain),
        }
        escaped
    })
}

/// Reference-counted shared handle to a [`CommonToken`].
pub type TokenRef = Rc<CommonToken>;

/// Builds [`CommonToken`] values from [`TokenSpec`] descriptions.
pub trait TokenFactory {
    /// Creates a token from `spec`, consuming the spec.
    fn create(&self, spec: TokenSpec<'_>) -> CommonToken;
}

/// Stateless [`TokenFactory`] that copies each [`TokenSpec`] field into a new
/// [`CommonToken`].
#[derive(Clone, Debug, Default)]
pub struct CommonTokenFactory;

impl TokenFactory for CommonTokenFactory {
    fn create(&self, spec: TokenSpec<'_>) -> CommonToken {
        let mut token = CommonToken::new(spec.token_type)
            .with_channel(spec.channel)
            .with_span(spec.start, spec.stop)
            .with_position(spec.line, spec.column)
            .with_source_name(spec.source_name);
        if let Some(text) = spec.text {
            token = token.with_text(text);
        }
        token
    }
}

/// A diagnostic buffered by a token source while it was producing tokens.
///
/// Values of this type are what [`TokenSource::drain_errors`] returns.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct TokenSourceError {
    /// One-based input line where the diagnostic starts.
    pub line: usize,
    /// Zero-based column within `line` where the diagnostic starts.
    pub column: usize,
    /// ANTLR-compatible diagnostic message without the leading line/column.
    pub message: String,
}

impl TokenSourceError {
    /// Builds a diagnostic for the given line and column, converting
    /// `message` into an owned `String`.
    pub fn new(line: usize, column: usize, message: impl Into<String>) -> Self {
        let message = message.into();
        Self {
            message,
            line,
            column,
        }
    }
}

/// A producer of [`CommonToken`] values.
pub trait TokenSource {
    /// Produces the next token from the source.
    fn next_token(&mut self) -> CommonToken;
    /// Returns the line the source currently reports.
    fn line(&self) -> usize;
    /// Returns the column the source currently reports.
    fn column(&self) -> usize;
    /// Returns the name of the underlying source.
    fn source_name(&self) -> &str;

    /// Returns and clears diagnostics emitted while fetching tokens.
    ///
    /// The default implementation buffers nothing and returns an empty list.
    fn drain_errors(&mut self) -> Vec<TokenSourceError> {
        Vec::default()
    }

    /// Serializes lexer DFA cache state when the token source exposes one.
    ///
    /// The default implementation returns an empty string.
    fn lexer_dfa_string(&self) -> String {
        String::default()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Index, span, text, type and position appear in ANTLR's layout.
    #[test]
    fn common_token_display_matches_antlr_shape() {
        let mut tok = CommonToken::new(7)
            .with_span(2, 4)
            .with_position(3, 9)
            .with_text("abc");
        tok.set_token_index(5);

        let rendered = tok.to_string();
        assert_eq!(rendered, "[@5,2:4='abc',<7>,3:9]");
    }

    /// Quotes and backslashes pass through; newlines are escaped.
    #[test]
    fn common_token_display_matches_antlr_escaping() {
        let cases = [
            ("\"", "[@-1,0:0='\"',<1>,1:0]"),
            ("\n", "[@-1,0:0='\\n',<1>,1:0]"),
            ("\\", "[@-1,0:0='\\',<1>,1:0]"),
        ];
        for (text, expected) in cases.iter() {
            let rendered = CommonToken::new(1).with_text(*text).to_string();
            assert_eq!(rendered, *expected);
        }
    }

    /// A non-default channel adds a `,channel=N` segment after the type.
    #[test]
    fn common_token_display_includes_non_default_channel() {
        let rendered = CommonToken::new(2)
            .with_channel(2)
            .with_text("b")
            .to_string();
        assert_eq!(rendered, "[@-1,0:0='b',<2>,channel=2,1:0]");
    }

    /// EOF on empty input stops at ANTLR's `-1` sentinel.
    #[test]
    fn eof_display_uses_antlr_empty_input_stop_index() {
        let rendered = CommonToken::eof("", 0, 1, 0).to_string();
        assert_eq!(rendered, "[@-1,0:-1='<EOF>',<-1>,1:0]");
    }
}