gollum-parser 0.4.0

Parser for the Gollum language
Documentation
//! Logos-based lexer for Gollum source text.
#![allow(missing_docs)]
use logos::Logos;
use std::fmt;

/// A single Gollum token produced by the lexer.
///
/// Whitespace, `%` line comments and `/* ... */` block comments are consumed by the
/// `#[logos(skip ...)]` rules below and never surface as tokens.
///
/// Logos selects the longest match at each position (falling back to pattern priority on
/// ties), so the grouping of variants below is for readability only.
#[derive(Logos, Debug, PartialEq, Clone)]
#[logos(skip r"[ \t\r\n\f]+")] // whitespace
#[logos(skip r"%[^\n]*")] // line comments (% ...)
#[logos(skip r"/\*[^*]*\*+(?:[^/*][^*]*\*+)*/")] // block comments /* ... */
pub enum Token {
    // --- Structural ---
    /// `:-`
    #[token(":-")]
    Neck,
    /// `?-`
    #[token("?-")]
    QueryNeck,
    /// `::`
    #[token("::")]
    ColonColon,
    /// `:`
    #[token(":")]
    Colon,
    /// `,`
    #[token(",")]
    Comma,
    /// `.`
    #[token(".")]
    Dot,
    /// `(`
    #[token("(")]
    LParen,
    /// `)`
    #[token(")")]
    RParen,
    /// `[`
    #[token("[")]
    LBracket,
    /// `]`
    #[token("]")]
    RBracket,
    /// `|`
    #[token("|")]
    Pipe,
    /// `!`
    #[token("!")]
    Cut,
    /// `@`
    #[token("@")]
    At,

    // --- Arithmetic ---
    /// `+`
    #[token("+")]
    Plus,
    /// `-`
    #[token("-")]
    Minus,
    /// `*`
    #[token("*")]
    Star,
    /// `/`
    #[token("/")]
    Slash,
    /// Keyword `mod`.
    #[token("mod")]
    Mod,

    // --- CLP(FD) constraint operators (all share the `#` prefix) ---
    /// Tensor literal open bracket `#[`; shares the `#` prefix with the CLP operators below.
    #[token("#[")]
    TensorOpen,
    /// `#>=`
    #[token("#>=")]
    ClpGte,
    /// `#=<`
    #[token("#=<")]
    ClpLte,
    /// `#\=`
    #[token("#\\=")]
    ClpNeq,
    /// `#>`
    #[token("#>")]
    ClpGt,
    /// `#<`
    #[token("#<")]
    ClpLt,
    /// `#=`
    #[token("#=")]
    ClpEq,

    // --- Range operator ---
    /// `..`
    #[token("..")]
    DotDot,

    // --- Comparison / unification ---
    /// `=:=`
    #[token("=:=")]
    ArithEq,
    /// `=\=`
    #[token("=\\=")]
    ArithNeq,
    /// `\+`
    #[token("\\+")]
    NotPlus,
    /// `\=`
    #[token("\\=")]
    NotEq,
    /// `=<`
    #[token("=<")]
    Lte,
    /// `>=`
    #[token(">=")]
    Gte,
    /// `<`
    #[token("<")]
    Lt,
    /// `>`
    #[token(">")]
    Gt,
    /// Neural soft-unification operator `~=`.
    #[token("~=")]
    NeuralUnify,
    /// `=`
    #[token("=")]
    Eq,

    // --- Modal logic (Unicode) ---
    // The glyphs are spelled as `\u{...}` escapes so they survive encoding-lossy tooling:
    // the literals had previously been reduced to empty strings, which left both variants
    // with the same empty pattern.
    // NOTE(review): U+25A1 / U+25C7 are the conventional box / diamond glyphs; confirm
    // against the Gollum grammar.
    /// Modal box operator, U+25A1 WHITE SQUARE.
    #[token("\u{25A1}")]
    Box,
    /// Modal diamond operator, U+25C7 WHITE DIAMOND.
    #[token("\u{25C7}")]
    Diamond,

    // --- Keywords ---
    /// Keyword `in`.
    #[token("in")]
    In,
    /// Keyword `is`.
    #[token("is")]
    Is,
    /// Keyword `not`.
    #[token("not")]
    Not,
    /// Keyword `before`.
    #[token("before")]
    Before,
    /// Keyword `after`.
    #[token("after")]
    After,
    /// Keyword `during`.
    #[token("during")]
    During,
    /// Keyword `until`.
    #[token("until")]
    Until,
    /// Keyword `using`.
    #[token("using")]
    Using,
    /// Keyword `minimize`.
    #[token("minimize")]
    Minimize,
    /// Keyword `maximize`.
    #[token("maximize")]
    Maximize,

    // --- Literals ---
    /// Float: digits, a dot, digits (e.g. `3.14`). Being the longer match, it wins over
    /// `Integer(3)` + `Dot` + `Integer(14)`. The slice is parsed as `f64`; the callback
    /// returns `None` if that parse fails.
    #[regex(r"[0-9]+\.[0-9]+", |lex| lex.slice().parse::<f64>().ok())]
    Float(f64),

    /// Integer: bare number without unit. The slice is parsed as `i64`; the callback
    /// returns `None` if the value does not fit.
    #[regex(r"[0-9]+", |lex| lex.slice().parse::<i64>().ok())]
    Integer(i64),

    /// Number with time unit (e.g., 100s, -1h, 1500ms).
    ///
    /// The pattern also admits a fractional part and any lowercase suffix; narrowing to an
    /// `i64` magnitude and a known unit name is done by `parse_unit_literal`, which returns
    /// `None` for everything else.
    #[regex(r"-?[0-9]+(\.[0-9]+)?[a-zµμ]+", parse_unit_literal)]
    UnitLiteral((i64, &'static str)),

    // --- Variables and anonymous ---
    /// Anonymous variable `_`
    #[token("_")]
    Anon,

    /// Named variable: uppercase start, or underscore followed by at least one
    /// alphanumeric/underscore character. Carries the full matched text.
    #[regex(r"[A-Z][a-zA-Z0-9_]*|_[a-zA-Z0-9_]+", |lex| lex.slice().to_string())]
    Var(String),

    // --- Atoms ---
    /// Unquoted atom: lowercase start. Carries the full matched text.
    #[regex(r"[a-z][a-zA-Z0-9_]*", |lex| lex.slice().to_string())]
    Atom(String),

    /// Quoted atom: 'foo bar'. The payload excludes the surrounding quotes; the pattern
    /// has no escape syntax, so the text cannot contain `'`.
    #[regex(r"'[^']*'", |lex| { let s = lex.slice(); s[1..s.len()-1].to_string() })]
    QuotedAtom(String),

    /// String literal: "hello". The payload excludes the surrounding quotes; the pattern
    /// has no escape syntax, so the text cannot contain `"`.
    #[regex(r#""[^"]*""#, |lex| { let s = lex.slice(); s[1..s.len()-1].to_string() })]
    Str(String),
}

/// Logos callback for [`Token::UnitLiteral`]: splits the matched slice into a numeric
/// prefix and a unit suffix.
///
/// Returns `None` when the prefix or the suffix is empty, when the prefix does not parse as
/// an `i64`, or when the suffix is not one of the recognised unit names. All three
/// microsecond spellings are normalised to `"us"`.
fn parse_unit_literal(lex: &mut logos::Lexer<Token>) -> Option<(i64, &'static str)> {
    let text = lex.slice();

    // The numeric prefix is the leading run of digits, '-' and '.'. Those are all
    // single-byte characters, so the first other character marks the split point.
    let split = text
        .find(|ch: char| !matches!(ch, '0'..='9' | '-' | '.'))
        .unwrap_or(text.len());
    let (digits, suffix) = text.split_at(split);
    if digits.is_empty() || suffix.is_empty() {
        return None;
    }

    // Map the suffix onto a static name so the token payload needs no allocation.
    let canonical: &'static str = match suffix {
        "ns" => "ns",
        "us" | "µs" | "μs" => "us",
        "ms" => "ms",
        "s" => "s",
        "min" => "min",
        "h" => "h",
        "d" => "d",
        "w" => "w",
        "y" => "y",
        _ => return None,
    };

    let value = digits.parse::<i64>().ok()?;
    Some((value, canonical))
}

/// Renders a token as its variant name, with the payload in parentheses for the variants
/// that carry one (string payloads are wrapped in double quotes).
impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Payload-carrying variants format themselves and return early; every other
        // variant resolves to a fixed name that is written once at the end.
        let name = match self {
            Self::Atom(text) => return write!(f, r#"Atom("{text}")"#),
            Self::Var(text) => return write!(f, r#"Var("{text}")"#),
            Self::QuotedAtom(text) => return write!(f, r#"QuotedAtom("{text}")"#),
            Self::Str(text) => return write!(f, r#"Str("{text}")"#),
            Self::Integer(value) => return write!(f, "Integer({value})"),
            Self::Float(value) => return write!(f, "Float({value})"),
            Self::UnitLiteral((value, unit)) => return write!(f, "UnitLiteral({value}{unit})"),

            // Structural
            Self::Neck => "Neck",
            Self::QueryNeck => "QueryNeck",
            Self::ColonColon => "ColonColon",
            Self::Colon => "Colon",
            Self::Comma => "Comma",
            Self::Dot => "Dot",
            Self::LParen => "LParen",
            Self::RParen => "RParen",
            Self::LBracket => "LBracket",
            Self::RBracket => "RBracket",
            Self::Pipe => "Pipe",
            Self::Cut => "Cut",
            Self::At => "At",

            // Arithmetic
            Self::Plus => "Plus",
            Self::Minus => "Minus",
            Self::Star => "Star",
            Self::Slash => "Slash",
            Self::Mod => "Mod",

            // Tensor / CLP(FD)
            Self::TensorOpen => "TensorOpen",
            Self::ClpGte => "ClpGte",
            Self::ClpLte => "ClpLte",
            Self::ClpNeq => "ClpNeq",
            Self::ClpGt => "ClpGt",
            Self::ClpLt => "ClpLt",
            Self::ClpEq => "ClpEq",

            // Range
            Self::DotDot => "DotDot",

            // Comparison / unification
            Self::ArithEq => "ArithEq",
            Self::ArithNeq => "ArithNeq",
            Self::NotPlus => "NotPlus",
            Self::NotEq => "NotEq",
            Self::Lte => "Lte",
            Self::Gte => "Gte",
            Self::Lt => "Lt",
            Self::Gt => "Gt",
            Self::NeuralUnify => "NeuralUnify",
            Self::Eq => "Eq",

            // Modal logic
            Self::Box => "Box",
            Self::Diamond => "Diamond",

            // Keywords
            Self::In => "In",
            Self::Is => "Is",
            Self::Not => "Not",
            Self::Before => "Before",
            Self::After => "After",
            Self::During => "During",
            Self::Until => "Until",
            Self::Using => "Using",
            Self::Minimize => "Minimize",
            Self::Maximize => "Maximize",

            // Anonymous variable
            Self::Anon => "Anon",
        };
        f.write_str(name)
    }
}

/// Lex an entire Gollum source string in one pass.
///
/// The returned vector holds the lexer's results in source order: `Ok(token)` for each
/// recognised token and `Err(())` wherever the input could not be matched.
pub fn tokenize(source: &str) -> Vec<Result<Token, ()>> {
    let mut results = Vec::new();
    results.extend(Token::lexer(source));
    results
}