thal 0.0.1

Reactive semantic runtime — molecules, reactions, and effect actors for building LLM-backed applications as dataflow programs.
Documentation
use crate::Error;

/// The kind of a lexical token, as produced by [`lex`].
///
/// `PartialEq` is derived but `Eq` is not, because [`Token::Float`] carries an
/// `f64`.
#[derive(Clone, Debug, PartialEq)]
pub enum Token {
    /// A name made of ASCII letters, digits and `_` that is not one of the
    /// keywords below.
    Ident(String),
    /// An integer literal written as plain decimal digits. A leading `-` is
    /// lexed separately as [`Token::Minus`].
    Int(i64),
    /// A numeric literal that has a fractional part and/or an exponent.
    Float(f64),
    /// The text between a pair of double quotes. The lexer stores it exactly
    /// as written: backslash escapes are skipped over when looking for the
    /// closing quote but are not decoded.
    String(String),
    /// An integer literal with an `ms`, `s`, `m` or `h` suffix, normalised to
    /// milliseconds.
    Duration(i64),

    // keywords — each is spelled in lower case in source text
    // (`PrimaryKey` is written `primary_key`)
    Molecule,
    Reaction,
    When,
    Where,
    Rollup,
    By,
    Emit,
    PrimaryKey,
    Merge,
    Default,
    Type,
    Enum,
    Mixin,
    With,
    For,
    In,
    And,
    Or,
    Not,
    True,
    False,
    Null,

    // punctuation and operators
    LBrace,   // `{`
    RBrace,   // `}`
    LParen,   // `(`
    RParen,   // `)`
    LBracket, // `[`
    RBracket, // `]`
    Colon,    // `:`
    Comma,    // `,`
    Pipe,     // `|`
    Dot,      // `.`
    Plus,     // `+`
    Minus,    // `-`
    Eq,       // `=`
    EqEq,     // `==`
    NotEq,    // `!=`
    Lt,       // `<`
    LtEq,     // `<=`
    Gt,       // `>`
    GtEq,     // `>=`
    Question, // `?`
}

/// Location of a token inside the source text.
///
/// The field descriptions below reflect how [`lex`] fills the struct in.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub struct Span {
    /// Byte offset of the token's first byte.
    pub start: usize,
    /// Byte offset one past the token's last byte (exclusive end).
    pub end: usize,
    /// 1-based line number on which the token starts.
    pub line: u32,
    /// 1-based column of the token's first byte, counted in bytes from the
    /// start of its line (not in characters).
    pub col: u32,
}

/// A [`Token`] together with the [`Span`] that locates it in the source text.
///
/// `PartialEq` is derived (both field types already implement it) so that
/// token streams can be compared directly, e.g. with `assert_eq!`.
#[derive(Clone, Debug, PartialEq)]
pub struct Tok {
    /// What kind of token this is, including any literal payload.
    pub token: Token,
    /// Where the token was found.
    pub span: Span,
}

pub fn lex(source: &str) -> Result<Vec<Tok>, Error> {
    let bytes = source.as_bytes();
    let mut out = Vec::new();
    let mut i = 0;
    let mut line: u32 = 1;
    let mut line_start: usize = 0;

    while i < bytes.len() {
        let b = bytes[i];

        // skip whitespace
        if b == b' ' || b == b'\t' || b == b'\r' {
            i += 1;
            continue;
        }
        if b == b'\n' {
            i += 1;
            line += 1;
            line_start = i;
            continue;
        }

        // line comment
        if b == b'/' && i + 1 < bytes.len() && bytes[i + 1] == b'/' {
            while i < bytes.len() && bytes[i] != b'\n' {
                i += 1;
            }
            continue;
        }

        let start = i;
        let col = (start - line_start) as u32 + 1;

        let mk_span = |end: usize| Span {
            start,
            end,
            line,
            col,
        };

        // two-char operators first (==, !=, <=, >=)
        if i + 1 < bytes.len() {
            let two = (bytes[i], bytes[i + 1]);
            let multi = match two {
                (b'=', b'=') => Some(Token::EqEq),
                (b'!', b'=') => Some(Token::NotEq),
                (b'<', b'=') => Some(Token::LtEq),
                (b'>', b'=') => Some(Token::GtEq),
                _ => None,
            };
            if let Some(tok) = multi {
                i += 2;
                out.push(Tok {
                    token: tok,
                    span: mk_span(i),
                });
                continue;
            }
        }

        // single-char punctuation
        let single = match b {
            b'{' => Some(Token::LBrace),
            b'}' => Some(Token::RBrace),
            b'(' => Some(Token::LParen),
            b')' => Some(Token::RParen),
            b'[' => Some(Token::LBracket),
            b']' => Some(Token::RBracket),
            b':' => Some(Token::Colon),
            b',' => Some(Token::Comma),
            b'|' => Some(Token::Pipe),
            b'.' => Some(Token::Dot),
            b'+' => Some(Token::Plus),
            b'-' => Some(Token::Minus),
            b'=' => Some(Token::Eq),
            b'<' => Some(Token::Lt),
            b'>' => Some(Token::Gt),
            b'?' => Some(Token::Question),
            _ => None,
        };
        if let Some(tok) = single {
            i += 1;
            out.push(Tok {
                token: tok,
                span: mk_span(i),
            });
            continue;
        }

        // string literal
        if b == b'"' {
            i += 1;
            let s_start = i;
            while i < bytes.len() && bytes[i] != b'"' {
                if bytes[i] == b'\\' && i + 1 < bytes.len() {
                    i += 2;
                } else {
                    i += 1;
                }
            }
            if i >= bytes.len() {
                return Err(Error::Parse(format!(
                    "unterminated string at line {line}:{col}"
                )));
            }
            let s = std::str::from_utf8(&bytes[s_start..i])
                .map_err(|_| Error::Parse("non-utf8 string".into()))?
                .to_string();
            i += 1;
            out.push(Tok {
                token: Token::String(s),
                span: mk_span(i),
            });
            continue;
        }

        // number: int, float, or int with duration suffix
        if b.is_ascii_digit() {
            let n_start = i;
            while i < bytes.len() && bytes[i].is_ascii_digit() {
                i += 1;
            }

            // optional fractional part — but only if a digit follows the `.`
            // (otherwise it's a field-access token like `t.field`)
            let mut is_float = false;
            if i + 1 < bytes.len() && bytes[i] == b'.' && bytes[i + 1].is_ascii_digit() {
                is_float = true;
                i += 1;
                while i < bytes.len() && bytes[i].is_ascii_digit() {
                    i += 1;
                }
            }

            // optional exponent: [eE][+-]?digits
            if i < bytes.len() && (bytes[i] == b'e' || bytes[i] == b'E') {
                is_float = true;
                i += 1;
                if i < bytes.len() && (bytes[i] == b'+' || bytes[i] == b'-') {
                    i += 1;
                }
                while i < bytes.len() && bytes[i].is_ascii_digit() {
                    i += 1;
                }
            }

            if is_float {
                let f: f64 = std::str::from_utf8(&bytes[n_start..i])
                    .map_err(|_| Error::Parse("bad float".into()))?
                    .parse()
                    .map_err(|_| Error::Parse("float out of range".into()))?;
                out.push(Tok {
                    token: Token::Float(f),
                    span: mk_span(i),
                });
                continue;
            }

            let n: i64 = std::str::from_utf8(&bytes[n_start..i])
                .map_err(|_| Error::Parse("bad number".into()))?
                .parse()
                .map_err(|_| Error::Parse("number out of range".into()))?;

            // duration suffix: ms, s, m, h
            let suffix_start = i;
            while i < bytes.len()
                && (bytes[i].is_ascii_alphabetic() || bytes[i] == b'_')
            {
                i += 1;
            }
            let suffix = &bytes[suffix_start..i];
            if suffix.is_empty() {
                out.push(Tok {
                    token: Token::Int(n),
                    span: mk_span(i),
                });
            } else {
                let ms = match suffix {
                    b"ms" => n,
                    b"s" => n * 1_000,
                    b"m" => n * 60_000,
                    b"h" => n * 3_600_000,
                    _ => {
                        return Err(Error::Parse(format!(
                            "unknown duration suffix at line {line}:{col}"
                        )))
                    }
                };
                out.push(Tok {
                    token: Token::Duration(ms),
                    span: mk_span(i),
                });
            }
            continue;
        }

        // identifier or keyword
        if b.is_ascii_alphabetic() || b == b'_' {
            let id_start = i;
            while i < bytes.len()
                && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'_')
            {
                i += 1;
            }
            let s = std::str::from_utf8(&bytes[id_start..i])
                .map_err(|_| Error::Parse("non-utf8 ident".into()))?;
            let token = match s {
                "molecule" => Token::Molecule,
                "reaction" => Token::Reaction,
                "when" => Token::When,
                "where" => Token::Where,
                "rollup" => Token::Rollup,
                "by" => Token::By,
                "emit" => Token::Emit,
                "primary_key" => Token::PrimaryKey,
                "merge" => Token::Merge,
                "default" => Token::Default,
                "type" => Token::Type,
                "enum" => Token::Enum,
                "mixin" => Token::Mixin,
                "with" => Token::With,
                "for" => Token::For,
                "in" => Token::In,
                "and" => Token::And,
                "or" => Token::Or,
                "not" => Token::Not,
                "true" => Token::True,
                "false" => Token::False,
                "null" => Token::Null,
                other => Token::Ident(other.to_string()),
            };
            out.push(Tok {
                token,
                span: mk_span(i),
            });
            continue;
        }

        return Err(Error::Parse(format!(
            "unexpected character {:?} at line {line}:{col}",
            b as char
        )));
    }

    Ok(out)
}