// plotnik-compiler 0.3.2

//! Syntax kinds for the query language.
//!
//! `SyntaxKind` serves dual roles: token kinds (from lexer) and node kinds (from parser).
//! Logos derives token recognition; node kinds lack token/regex attributes.
//! `QLang` implements Rowan's `Language` trait for tree construction.

#![allow(dead_code)] // Some items are for future use

use logos::Logos;
use rowan::Language;

/// All token and node kinds. Tokens first, then nodes, then `__LAST` sentinel.
///
/// `#[repr(u16)]` enables safe transmute in `kind_from_raw`: only the first
/// variant has an explicit discriminant (`0`), so every following variant is
/// numbered consecutively in declaration order. Do not reorder variants or add
/// anything after `__LAST`.
///
/// Variants carrying a `#[token]`/`#[regex]` attribute are matched by the
/// Logos-derived lexer; variants without one are not matched by it.
#[derive(Logos, Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(u16)]
pub enum SyntaxKind {
    /// `(`
    #[token("(")]
    ParenOpen = 0,

    /// `)`
    #[token(")")]
    ParenClose,

    /// `[`
    #[token("[")]
    BracketOpen,

    /// `]`
    #[token("]")]
    BracketClose,

    /// `{`
    #[token("{")]
    BraceOpen,

    /// `}`
    #[token("}")]
    BraceClose,

    /// `::` for type annotations. Defined before `Colon` for correct precedence.
    #[token("::")]
    DoubleColon,

    /// `:`
    #[token(":")]
    Colon,

    /// `=`
    #[token("=")]
    Equals,

    /// `!`
    #[token("!")]
    Negation,

    /// `-`
    #[token("-")]
    Minus,

    /// `~`
    #[token("~")]
    Tilde,

    /// `_`
    #[token("_")]
    Underscore,

    /// `*` quantifier
    #[token("*")]
    Star,

    /// `+` quantifier
    #[token("+")]
    Plus,

    /// `?` quantifier
    #[token("?")]
    Question,

    /// Non-greedy `*?` quantifier
    #[token("*?")]
    StarQuestion,

    /// Non-greedy `+?` quantifier
    #[token("+?")]
    PlusQuestion,

    /// Non-greedy `??` quantifier
    #[token("??")]
    QuestionQuestion,

    /// Slash for supertype paths: `(expression/binary_expression)`
    #[token("/")]
    Slash,

    /// Comma (invalid separator, for error recovery)
    #[token(",")]
    Comma,

    /// Pipe (invalid separator, for error recovery)
    #[token("|")]
    Pipe,

    // Whole quoted string (double- or single-quoted) with backslash escapes.
    #[regex(r#""(?:[^"\\]|\\.)*""#)]
    #[regex(r"'(?:[^'\\]|\\.)*'")]
    #[doc(hidden)]
    StringLiteral, // Lexer-internal only

    /// `"` delimiter. Has no lexer pattern of its own.
    // NOTE(review): presumably produced by splitting `StringLiteral` — confirm in the lexer.
    DoubleQuote,
    /// `'` delimiter. Has no lexer pattern of its own.
    SingleQuote,
    /// String content between quotes
    StrVal,

    /// `ERROR` keyword
    #[token("ERROR")]
    KwError,

    /// `MISSING` keyword
    #[token("MISSING")]
    KwMissing,

    /// Identifier. Accepts dots/hyphens for tree-sitter compat; parser validates per context.
    /// Defined after keywords so they take precedence.
    #[regex(r"[a-zA-Z][a-zA-Z0-9_.\-]*")]
    Id,

    /// `.`
    #[token(".")]
    Dot,

    /// Regular capture: @name (matches before `At`)
    #[regex(r"@[a-zA-Z][a-zA-Z0-9_]*")]
    CaptureToken,

    /// Suppressive capture: @_ or @_name (matches before `At`)
    #[regex(r"@_[a-zA-Z0-9_]*")]
    SuppressiveCapture,

    /// Bare @ (for error recovery: "capture without target")
    #[token("@")]
    At,

    /// Run of spaces and/or tabs
    #[regex(r"[ \t]+")]
    Whitespace,

    /// Single line break: `\n` or `\r\n`
    #[token("\n")]
    #[token("\r\n")]
    Newline,

    /// Line comment introduced by `//` or `;`, running to (not including) the newline
    #[regex(r"//[^\n]*", allow_greedy = true)]
    #[regex(r";[^\n]*", allow_greedy = true)]
    LineComment,

    /// Block comment `/* ... */`.
    ///
    /// The body may contain runs of `*`, including directly before the closing
    /// `/` (e.g. `/***/` or `/** doc **/`); the comment ends at the first `*/`.
    #[regex(r"/\*(?:[^*]|\*+[^*/])*\*+/")]
    BlockComment,

    /// `==` for predicate equals
    #[token("==")]
    OpEq,

    /// `!=` for predicate not equals
    #[token("!=")]
    OpNe,

    /// `^=` for predicate starts-with
    #[token("^=")]
    OpStartsWith,

    /// `$=` for predicate ends-with
    #[token("$=")]
    OpEndsWith,

    /// `*=` for predicate contains (defined after `Star` for correct precedence)
    #[token("*=")]
    OpContains,

    /// `=~` for predicate regex match (when followed by string or error)
    #[token("=~")]
    OpRegexMatch,

    /// `!~` for predicate regex no-match (when followed by string or error)
    #[token("!~")]
    OpRegexNoMatch,

    /// `=~` followed by regex literal: `=~ /pattern/`
    /// Compound token to avoid `//` being lexed as line comment.
    #[regex(r"=~[ \t\r\n]*/", lex_regex_predicate)]
    RegexPredicateMatch,

    /// `!~` followed by regex literal: `!~ /pattern/`
    #[regex(r"!~[ \t\r\n]*/", lex_regex_predicate)]
    RegexPredicateNoMatch,

    /// Regex literal token (after splitting compound predicate)
    RegexLiteral,

    /// Regex pattern content (between slashes, set by parser)
    RegexContent,

    /// XML-like tags matched as errors (common LLM output)
    #[regex(r"<[a-zA-Z_:][a-zA-Z0-9_:\.\-]*(?:\s+[^>]*)?>")]
    #[regex(r"</[a-zA-Z_:][a-zA-Z0-9_:\.\-]*\s*>")]
    #[regex(r"<[a-zA-Z_:][a-zA-Z0-9_:\.\-]*\s*/\s*>")]
    XMLGarbage,
    /// Tree-sitter predicates (unsupported)
    #[regex(r"#[a-zA-Z_][a-zA-Z0-9_]*[?!]?")]
    TsPredicate,
    /// Coalesced unrecognized characters
    Garbage,
    /// Generic error kind (reported by `is_error`)
    Error,

    // Node kinds (from the parser): none of the variants below carry a
    // lexer attribute.
    Root,
    Tree,
    Ref,
    Str,
    Field,
    Capture,
    Type,
    Quantifier,
    Seq,
    Alt,
    Branch,
    Wildcard,
    Anchor,
    NegatedField,
    Def,
    /// Predicate on a node: `(identifier == "foo")`
    NodePredicate,
    /// Regex literal: `/pattern/`
    Regex,

    // Must be last - used for bounds checking in `kind_from_raw`
    #[doc(hidden)]
    __LAST,
}

use SyntaxKind::*;

/// Logos callback that extends a regex-predicate token over the regex body.
///
/// Invoked once the lexer has matched the operator and the opening slash
/// (`=~\s*/` or `!~\s*/`). Scans the remaining input for the first `/` that is
/// not escaped, i.e. not preceded by an odd-length run of backslashes, and
/// extends the token through it. If no such slash exists, the rest of the
/// input is consumed so the parser can report the unclosed regex.
///
/// Always returns `true`, so the token is always accepted.
fn lex_regex_predicate(lexer: &mut logos::Lexer<SyntaxKind>) -> bool {
    let rest = lexer.remainder();

    // `escaped` is true while the current run of backslashes has odd length.
    let mut escaped = false;
    // Defaults to "everything" for the unclosed-regex case.
    let mut consumed = rest.len();

    for (offset, ch) in rest.char_indices() {
        match ch {
            '/' if !escaped => {
                // `/` is one byte wide, so the token ends right after it.
                consumed = offset + 1;
                break;
            }
            '\\' => escaped = !escaped,
            // Any other character (including an escaped `/`) ends the run.
            _ => escaped = false,
        }
    }

    lexer.bump(consumed);
    true
}

impl SyntaxKind {
    #[inline]
    pub fn is_trivia(self) -> bool {
        matches!(self, Whitespace | Newline | LineComment | BlockComment)
    }

    #[inline]
    pub fn is_error(self) -> bool {
        matches!(self, Error | XMLGarbage | Garbage | TsPredicate)
    }
}

/// Converts a `SyntaxKind` into Rowan's raw kind by wrapping its `u16` discriminant.
impl From<SyntaxKind> for rowan::SyntaxKind {
    #[inline]
    fn from(kind: SyntaxKind) -> Self {
        let discriminant = kind as u16;
        rowan::SyntaxKind(discriminant)
    }
}

/// Language tag for Rowan's tree types.
///
/// The enum has no variants, so it can never be instantiated; it is used only
/// as a type parameter (see the `Language` impl, which maps it to `SyntaxKind`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum QLang {}

impl Language for QLang {
    type Kind = SyntaxKind;

    /// Converts Rowan's raw `u16` kind back into a `SyntaxKind`.
    ///
    /// # Panics
    ///
    /// Panics if the raw value is not below the `__LAST` sentinel.
    fn kind_from_raw(raw: rowan::SyntaxKind) -> Self::Kind {
        assert!(raw.0 < __LAST as u16);
        let discriminant: u16 = raw.0;
        // SAFETY: `SyntaxKind` is `#[repr(u16)]` and the assertion above
        // guarantees `discriminant` is below `__LAST`, the final variant.
        unsafe { std::mem::transmute::<u16, SyntaxKind>(discriminant) }
    }

    /// Converts a `SyntaxKind` into Rowan's raw kind via the `From` impl.
    fn kind_to_raw(kind: Self::Kind) -> rowan::SyntaxKind {
        rowan::SyntaxKind::from(kind)
    }
}

/// Rowan syntax node parameterized by our language ([`QLang`]).
pub type SyntaxNode = rowan::SyntaxNode<QLang>;
/// Rowan syntax token parameterized by our language ([`QLang`]).
pub type SyntaxToken = rowan::SyntaxToken<QLang>;
/// Either a [`SyntaxNode`] or a [`SyntaxToken`].
pub type SyntaxElement = rowan::NodeOrToken<SyntaxNode, SyntaxToken>;

/// Set of `SyntaxKind`s stored as a 128-bit mask for O(1) membership testing.
///
/// Bit `n` is set when the kind whose discriminant is `n` belongs to the set,
/// so only kinds with a discriminant below 128 can be stored.
#[derive(Clone, Copy, PartialEq, Eq)]
pub struct TokenSet(u128);

impl TokenSet {
    /// The set containing no kinds.
    pub const EMPTY: TokenSet = TokenSet(0);

    /// Builds the set containing every kind in `kinds` (duplicates are harmless).
    ///
    /// # Panics
    ///
    /// Panics if any kind's discriminant is >= 128. When evaluated in a const
    /// context this surfaces as a compile-time error.
    #[inline]
    pub const fn new(kinds: &[SyntaxKind]) -> Self {
        // `for` loops are not available in `const fn`, hence the manual index.
        let mut set = Self::EMPTY;
        let mut idx = 0;
        while idx < kinds.len() {
            set = set.union(Self::single(kinds[idx]));
            idx += 1;
        }
        set
    }

    /// Builds the set containing exactly `kind`.
    ///
    /// # Panics
    ///
    /// Panics if the kind's discriminant is >= 128.
    #[inline]
    pub const fn single(kind: SyntaxKind) -> Self {
        let bit = kind as u16;
        assert!(bit < 128, "SyntaxKind value exceeds TokenSet capacity");
        TokenSet(1u128 << bit)
    }

    /// Returns `true` if `kind` is a member. Kinds with a discriminant >= 128
    /// are never members.
    #[inline]
    pub const fn contains(&self, kind: SyntaxKind) -> bool {
        let bit = kind as u16;
        // The range check short-circuits, so the shift amount is always < 128.
        bit < 128 && (self.0 >> bit) & 1 == 1
    }

    /// Returns the set of kinds that are in `self`, in `other`, or in both.
    #[inline]
    pub const fn union(self, other: TokenSet) -> TokenSet {
        TokenSet(self.0 | other.0)
    }
}

impl std::fmt::Debug for TokenSet {
    /// Formats the set as `{KindA, KindB, ...}`, in ascending discriminant order.
    ///
    /// Set bits at or above the `__LAST` sentinel do not correspond to any real
    /// kind and are skipped.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mask = self.0;
        let limit = __LAST as u16;

        let members = (0..128u16)
            .filter(|&raw| raw < limit && mask & (1u128 << raw) != 0)
            // SAFETY: `SyntaxKind` is `#[repr(u16)]` and the filter above only
            // lets through values below `__LAST`, the final variant.
            .map(|raw| unsafe { std::mem::transmute::<u16, SyntaxKind>(raw) });

        f.debug_set().entries(members).finish()
    }
}

/// Pre-defined token sets for the parser.
pub mod token_sets {
    use super::*;

    /// FIRST set of expr. `At` excluded (captures wrap, not start).
    pub const EXPR_FIRST_TOKENS: TokenSet = TokenSet::new(&[
        ParenOpen,
        BracketOpen,
        BraceOpen,
        Underscore,
        Id,
        DoubleQuote,
        SingleQuote,
        Dot,
        Negation,
        Minus,
        KwError,
        KwMissing,
    ]);

    /// FIRST set for root-level expressions. Excludes `Dot`/`Negation` (tree-internal).
    pub const ROOT_EXPR_FIRST_TOKENS: TokenSet = TokenSet::new(&[
        ParenOpen,
        BracketOpen,
        BraceOpen,
        Underscore,
        Id,
        DoubleQuote,
        SingleQuote,
        KwError,
        KwMissing,
    ]);

    pub const QUANTIFIERS: TokenSet = TokenSet::new(&[
        Star,
        Plus,
        Question,
        StarQuestion,
        PlusQuestion,
        QuestionQuestion,
    ]);

    pub const TRIVIA: TokenSet = TokenSet::new(&[Whitespace, Newline, LineComment, BlockComment]);
    pub const SEPARATORS: TokenSet = TokenSet::new(&[Comma, Pipe]);

    pub const TREE_RECOVERY_TOKENS: TokenSet = TokenSet::new(&[ParenOpen, BracketOpen, BraceOpen]);

    pub const ALT_RECOVERY_TOKENS: TokenSet = TokenSet::new(&[ParenClose]);

    pub const FIELD_RECOVERY_TOKENS: TokenSet = TokenSet::new(&[
        ParenClose,
        BracketClose,
        BraceClose,
        CaptureToken,
        SuppressiveCapture,
        Colon,
    ]);

    pub const ROOT_RECOVERY_TOKENS: TokenSet =
        TokenSet::new(&[ParenOpen, BracketOpen, BraceOpen, Id]);

    pub const DEF_RECOVERY_TOKENS: TokenSet =
        TokenSet::new(&[ParenOpen, BracketOpen, BraceOpen, Id, Equals]);

    pub const SEQ_RECOVERY_TOKENS: TokenSet =
        TokenSet::new(&[BraceClose, ParenClose, BracketClose]);

    pub const PREDICATE_OPS: TokenSet = TokenSet::new(&[
        OpEq,
        OpNe,
        OpStartsWith,
        OpEndsWith,
        OpContains,
        OpRegexMatch,
        OpRegexNoMatch,
    ]);
}