amql_predicates/lib.rs

//! General-purpose expression tokenizer and predicate parser.
//!
//! The public API has two layers:
//! 1. **Tokens** — `Token` enum + `tokenize()` produce a flat token stream
//!    from any expression string. Consumers build their own grammar on top
//!    (selectors, queries, filters, etc.).
//! 2. **Predicates** — `Predicate` struct + `parse_predicate()` parse
//!    bracket-content expressions (`name op? value?`).
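//!
//! A minimal usage sketch of both layers (the crate path is assumed from the
//! file layout; not compiled as a doctest):
//!
//! ```ignore
//! use amql_predicates::{parse_predicate, tokenize, PredicateOp, Token};
//!
//! // Token layer: flat stream of lexical tokens.
//! let tokens = tokenize("count >= 5")?;
//! assert_eq!(tokens[1], Token::Gte);
//!
//! // Predicate layer: bracket-content expressions.
//! let pred = parse_predicate(r#"method="GET""#)?;
//! assert_eq!(pred.op, Some(PredicateOp::Eq));
//! ```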

#[cfg(feature = "wasm")]
mod index;

use serde::Serialize;
use std::borrow::Cow;

// ---------------------------------------------------------------------------
// Token layer
// ---------------------------------------------------------------------------

/// Lexical token produced by `tokenize()`.
#[derive(Debug, Clone, PartialEq, Serialize)]
#[cfg_attr(feature = "ts", derive(ts_rs::TS))]
#[cfg_attr(feature = "flow", derive(flowjs_rs::Flow))]
#[cfg_attr(feature = "ts", ts(export))]
#[cfg_attr(feature = "flow", flow(export))]
#[non_exhaustive]
pub enum Token {
    /// Identifier: unquoted name (`method`, `async`, `count`).
    Ident(String),
    /// Quoted string literal.
    Str(String),
    /// Numeric literal (parsed to `f64`).
    Number(f64),
    /// `=`
    Eq,
    /// `!=`
    NotEq,
    /// `~=`
    WordMatch,
    /// `|=`
    PrefixMatch,
    /// `^=`
    StartsWith,
    /// `$=`
    EndsWith,
    /// `*=`
    Contains,
    /// `<`
    Lt,
    /// `>`
    Gt,
    /// `<=`
    Lte,
    /// `>=`
    Gte,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `,`
    Comma,
    /// `.`
    Dot,
    /// `*` (standalone, not `*=`)
    Star,
    /// `#`
    Hash,
    /// `:`
    Colon,
    /// `::`
    DoubleColon,
    /// `+` (standalone, not inside an operator)
    Plus,
    /// `~` (standalone, not `~=`)
    Tilde,
    /// `!` (standalone, not `!=`)
    Bang,
    /// `;`
    Semicolon,
    /// Keyword `null`
    Null,
    /// Keyword `true`
    True,
    /// Keyword `false`
    False,
    /// Keyword `not`
    Not,
    /// Keyword `is`
    Is,
    /// Keyword `and`
    And,
    /// Keyword `or`
    Or,
    /// Consumer-defined token from `Tokenizer::op()` or `Tokenizer::keyword()`.
    Custom(String),
}

// ---------------------------------------------------------------------------
// Configurable tokenizer (Express-style)
// ---------------------------------------------------------------------------

/// Custom operator registration: pattern bytes → token to emit.
struct CustomOp {
    pattern: Vec<u8>,
    token: Token,
}

/// Configurable tokenizer with custom operators, keywords, and middleware.
///
/// Built-in operators and keywords are always available. Custom registrations
/// take priority over built-ins, checked longest-pattern-first.
///
/// ```ignore
/// let tok = Tokenizer::new()
///     .op("=~", Token::Custom("RegexMatch".into()))
///     .keyword("where", Token::Custom("Where".into()))
///     .transform(|tokens| { /* rewrite and return */ tokens });
/// let tokens = tok.tokenize("name =~ 'foo.*'")?;
/// ```
pub struct Tokenizer {
    custom_ops: Vec<CustomOp>,
    custom_keywords: Vec<(String, Token)>,
    transforms: Vec<fn(Vec<Token>) -> Vec<Token>>,
}

impl Tokenizer {
    /// Create a new tokenizer with only built-in operators and keywords.
    pub fn new() -> Self {
        Self {
            custom_ops: Vec::new(),
            custom_keywords: Vec::new(),
            transforms: Vec::new(),
        }
    }

    /// Register a custom operator pattern.
    ///
    /// The pattern is matched byte-for-byte against the input. Custom ops
    /// are checked before built-in ops, longest pattern first.
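    ///
    /// A minimal sketch (mirrors the crate's own tests):
    ///
    /// ```ignore
    /// let tok = Tokenizer::new().op("=~", Token::Custom("RegexMatch".into()));
    /// let tokens = tok.tokenize("name =~ 'foo.*'")?;
    /// assert_eq!(tokens[1], Token::Custom("RegexMatch".into()));
    /// ```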
    pub fn op(mut self, pattern: &str, token: Token) -> Self {
        self.custom_ops.push(CustomOp {
            pattern: pattern.as_bytes().to_vec(),
            token,
        });
        // Sort longest-first so longer patterns match before shorter prefixes
        self.custom_ops
            .sort_by(|a, b| b.pattern.len().cmp(&a.pattern.len()));
        self
    }

    /// Register a custom keyword.
    ///
    /// Keywords are matched case-insensitively against identifier tokens.
    /// Custom keywords override built-in keywords with the same name.
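    ///
    /// A minimal sketch (mirrors the crate's own tests):
    ///
    /// ```ignore
    /// let tok = Tokenizer::new().keyword("where", Token::Custom("Where".into()));
    /// let tokens = tok.tokenize("name WHERE 5")?;
    /// assert_eq!(tokens[1], Token::Custom("Where".into()));
    /// ```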
    pub fn keyword(mut self, word: &str, token: Token) -> Self {
        self.custom_keywords
            .push((word.to_ascii_lowercase(), token));
        self
    }

    /// Register a transform (middleware) that runs after tokenization.
    ///
    /// Transforms run in registration order: each receives the token stream
    /// and returns a new one. Only plain `fn` pointers (non-capturing
    /// functions) are accepted.
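    ///
    /// A minimal sketch (mirrors the crate's own tests):
    ///
    /// ```ignore
    /// fn strip_commas(tokens: Vec<Token>) -> Vec<Token> {
    ///     tokens.into_iter().filter(|t| *t != Token::Comma).collect()
    /// }
    ///
    /// let tok = Tokenizer::new().transform(strip_commas);
    /// assert_eq!(tok.tokenize("a, b, c")?.len(), 3);
    /// ```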
    pub fn transform(mut self, f: fn(Vec<Token>) -> Vec<Token>) -> Self {
        self.transforms.push(f);
        self
    }

    /// Tokenize input using this configuration.
    #[must_use = "tokenizing is useless without inspecting the result"]
    pub fn tokenize(&self, input: &str) -> Result<Vec<Token>, String> {
        let mut tokens = Vec::new();
        let bytes = input.as_bytes();
        let mut pos = 0;

        while pos < bytes.len() {
            if bytes[pos].is_ascii_whitespace() {
                pos += 1;
                continue;
            }

            // Quoted strings
            if bytes[pos] == b'"' || bytes[pos] == b'\'' {
                let (s, new_pos) = lex_quoted_string(bytes, pos)?;
                tokens.push(Token::Str(s));
                pos = new_pos;
                continue;
            }

            // Custom operators (checked first, longest-first)
            if let Some((len, tok)) = self.match_custom_op(&bytes[pos..]) {
                tokens.push(tok);
                pos += len;
                continue;
            }

            // Built-in two-char operators
            if pos + 1 < bytes.len() {
                let two = &bytes[pos..pos + 2];
                let op = match_builtin_two_char(two);
                if let Some(tok) = op {
                    tokens.push(tok);
                    pos += 2;
                    continue;
                }
            }

            // Built-in single-char
            if let Some(tok) = match_builtin_single(bytes[pos]) {
                tokens.push(tok);
                pos += 1;
                continue;
            }

            // Numbers
            if bytes[pos].is_ascii_digit()
                || (bytes[pos] == b'-' && pos + 1 < bytes.len() && bytes[pos + 1].is_ascii_digit())
            {
                let (tok, new_pos) = lex_number(input, bytes, pos)?;
                tokens.push(tok);
                pos = new_pos;
                continue;
            }

            // Identifiers and keywords
            if is_ident_start(bytes[pos]) {
                let start = pos;
                while pos < bytes.len() && is_ident_char(bytes[pos]) {
                    pos += 1;
                }
                let word = &input[start..pos];
                let tok = self.resolve_keyword(word);
                tokens.push(tok);
                continue;
            }

            // Report the full (possibly multi-byte) character rather than a raw byte.
            let ch = input[pos..].chars().next().unwrap_or('?');
            return Err(format!(
                "Unexpected character '{ch}' at position {pos}"
            ));
        }

        // Run middleware transforms in order
        for transform in &self.transforms {
            tokens = transform(tokens);
        }

        Ok(tokens)
    }

    fn match_custom_op(&self, remaining: &[u8]) -> Option<(usize, Token)> {
        for custom in &self.custom_ops {
            if remaining.starts_with(&custom.pattern) {
                return Some((custom.pattern.len(), custom.token.clone()));
            }
        }
        None
    }

    fn resolve_keyword(&self, word: &str) -> Token {
        let lower = word.to_ascii_lowercase();
        // Custom keywords first
        for (kw, tok) in &self.custom_keywords {
            if lower == *kw {
                return tok.clone();
            }
        }
        // Built-in keywords
        match lower.as_str() {
            "null" => Token::Null,
            "true" => Token::True,
            "false" => Token::False,
            "not" => Token::Not,
            "is" => Token::Is,
            "and" => Token::And,
            "or" => Token::Or,
            _ => Token::Ident(word.to_owned()),
        }
    }
}

impl Default for Tokenizer {
    fn default() -> Self {
        Self::new()
    }
}

// ---------------------------------------------------------------------------
// Built-in matching (hardcoded for zero-overhead hot path)
// ---------------------------------------------------------------------------

fn match_builtin_two_char(two: &[u8]) -> Option<Token> {
    match two {
        b"~=" => Some(Token::WordMatch),
        b"|=" => Some(Token::PrefixMatch),
        b"^=" => Some(Token::StartsWith),
        b"$=" => Some(Token::EndsWith),
        b"*=" => Some(Token::Contains),
        b"!=" => Some(Token::NotEq),
        b"<=" => Some(Token::Lte),
        b">=" => Some(Token::Gte),
        b"::" => Some(Token::DoubleColon),
        _ => None,
    }
}

fn match_builtin_single(ch: u8) -> Option<Token> {
    match ch {
        b'=' => Some(Token::Eq),
        b'<' => Some(Token::Lt),
        b'>' => Some(Token::Gt),
        b'[' => Some(Token::LBracket),
        b']' => Some(Token::RBracket),
        b'(' => Some(Token::LParen),
        b')' => Some(Token::RParen),
        b',' => Some(Token::Comma),
        b'.' => Some(Token::Dot),
        b'*' => Some(Token::Star),
        b'#' => Some(Token::Hash),
        b':' => Some(Token::Colon),
        b'+' => Some(Token::Plus),
        b'~' => Some(Token::Tilde),
        b'!' => Some(Token::Bang),
        b';' => Some(Token::Semicolon),
        _ => None,
    }
}

fn lex_number(input: &str, bytes: &[u8], start: usize) -> Result<(Token, usize), String> {
    let mut pos = start;
    if bytes[pos] == b'-' {
        pos += 1;
    }
    while pos < bytes.len() && bytes[pos].is_ascii_digit() {
        pos += 1;
    }
    if pos < bytes.len() && bytes[pos] == b'.' {
        pos += 1;
        while pos < bytes.len() && bytes[pos].is_ascii_digit() {
            pos += 1;
        }
    }
    let num_str = &input[start..pos];
    let n = num_str
        .parse::<f64>()
        .map_err(|e| format!("Invalid number '{num_str}': {e}"))?;
    Ok((Token::Number(n), pos))
}

// ---------------------------------------------------------------------------
// Default tokenize (zero-config convenience)
// ---------------------------------------------------------------------------

/// Tokenize an input string with default (built-in) operators and keywords.
///
/// For custom operators, keywords, or middleware, use `Tokenizer::new()`.
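///
/// A minimal sketch (mirrors the crate's own tests):
///
/// ```ignore
/// let tokens = tokenize("count>=42.5")?;
/// assert_eq!(tokens, vec![Token::Ident("count".into()), Token::Gte, Token::Number(42.5)]);
/// ```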
#[must_use = "tokenizing is useless without inspecting the result"]
pub fn tokenize(input: &str) -> Result<Vec<Token>, String> {
    // The default tokenizer has empty custom_ops/keywords/transforms, so the
    // custom lookups just iterate empty Vecs; the overhead is negligible.
    Tokenizer::new().tokenize(input)
}

fn lex_quoted_string(bytes: &[u8], start: usize) -> Result<(String, usize), String> {
    let quote = bytes[start];
    let mut pos = start + 1;
    // Collect raw bytes and convert once at the end so multi-byte UTF-8
    // sequences inside the literal survive intact.
    let mut raw: Vec<u8> = Vec::new();
    while pos < bytes.len() && bytes[pos] != quote {
        if bytes[pos] == b'\\' {
            // Backslash escapes the next byte literally (e.g. \" or \').
            pos += 1;
            if pos < bytes.len() {
                raw.push(bytes[pos]);
                pos += 1;
            }
        } else {
            raw.push(bytes[pos]);
            pos += 1;
        }
    }
    if pos >= bytes.len() {
        return Err(format!(
            "Unterminated string starting at position {}",
            start
        ));
    }
    pos += 1; // skip closing quote
    Ok((String::from_utf8_lossy(&raw).into_owned(), pos))
}

// ---------------------------------------------------------------------------
// Predicate types
// ---------------------------------------------------------------------------

/// Typed predicate value.
#[cfg_attr(feature = "ts", derive(ts_rs::TS))]
#[cfg_attr(feature = "flow", derive(flowjs_rs::Flow))]
#[cfg_attr(feature = "ts", ts(export))]
#[cfg_attr(feature = "flow", flow(export))]
#[derive(Debug, Clone, PartialEq, Serialize)]
#[non_exhaustive]
pub enum PredicateValue {
    /// String value (quoted in source).
    String(String),
    /// Numeric value.
    Number(f64),
    /// Boolean value (`true` / `false`).
    Bool(bool),
    /// Null literal.
    Null,
}

impl PredicateValue {
    /// Convert to a string representation for comparison.
    pub fn as_str_repr(&self) -> Cow<'_, str> {
        match self {
            PredicateValue::String(s) => Cow::Borrowed(s.as_str()),
            PredicateValue::Number(n) => Cow::Owned(n.to_string()),
            PredicateValue::Bool(b) => Cow::Owned(b.to_string()),
            PredicateValue::Null => Cow::Borrowed(""),
        }
    }

    /// Extract as f64 if numeric, or attempt to parse a string value.
    pub fn as_f64(&self) -> Option<f64> {
        match self {
            PredicateValue::Number(n) => Some(*n),
            PredicateValue::String(s) => s.parse().ok(),
            _ => None,
        }
    }
}

/// A predicate expression: `name`, `name=value`, `name^=value`, etc.
#[cfg_attr(feature = "ts", derive(ts_rs::TS))]
#[cfg_attr(feature = "flow", derive(flowjs_rs::Flow))]
#[cfg_attr(feature = "ts", ts(export))]
#[cfg_attr(feature = "flow", flow(export))]
#[derive(Debug, Clone, PartialEq, Serialize)]
pub struct Predicate {
    /// Name to test.
    pub name: String,
    /// Comparison operator; `None` means presence-only check.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub op: Option<PredicateOp>,
    /// Typed value to compare against, if an operator is present.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub value: Option<PredicateValue>,
}

/// Comparison operator.
#[cfg_attr(feature = "ts", derive(ts_rs::TS))]
#[cfg_attr(feature = "flow", derive(flowjs_rs::Flow))]
#[cfg_attr(feature = "ts", ts(export))]
#[cfg_attr(feature = "flow", flow(export))]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)]
#[non_exhaustive]
pub enum PredicateOp {
    /// `=` — exact match
    Eq,
    /// `~=` — whitespace-separated word match
    WordMatch,
    /// `|=` — exact or hyphenated prefix
    PrefixMatch,
    /// `^=` — starts with
    StartsWith,
    /// `$=` — ends with
    EndsWith,
    /// `*=` — contains substring
    Contains,
    /// `!=` — not equal
    NotEq,
    /// `<` — less than (numeric)
    Lt,
    /// `>` — greater than (numeric)
    Gt,
    /// `<=` — less than or equal (numeric)
    Lte,
    /// `>=` — greater than or equal (numeric)
    Gte,
}

// Backwards-compatible type aliases
/// Alias for `Predicate` (backwards compatibility).
pub type AttrPredicate = Predicate;
/// Alias for `PredicateOp` (backwards compatibility).
pub type AttrOp = PredicateOp;

// ---------------------------------------------------------------------------
// Predicate parser (token-based)
// ---------------------------------------------------------------------------

/// Parse a single predicate expression from bracket content.
///
/// Expects input like `method="GET"`, `async`, `count>=5`.
/// Returns the parsed predicate or an error message.
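///
/// A minimal sketch (mirrors the crate's own tests):
///
/// ```ignore
/// let pred = parse_predicate(r#"method="GET""#)?;
/// assert_eq!(pred.op, Some(PredicateOp::Eq));
/// assert_eq!(pred.value, Some(PredicateValue::String("GET".into())));
///
/// let presence = parse_predicate("async")?;
/// assert_eq!(presence.op, None);
/// ```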
#[must_use = "parsing a predicate is useless without inspecting the result"]
pub fn parse_predicate(input: &str) -> Result<Predicate, String> {
    let tokens = tokenize(input.trim())?;
    let mut pos = 0;
    let pred = parse_one_predicate(&tokens, &mut pos)?;
    if pos < tokens.len() {
        return Err(format!(
            "Unexpected token after predicate: {:?}",
            tokens[pos]
        ));
    }
    Ok(pred)
}

/// Parse a comma-separated list of predicates from bracket content.
///
/// Expects input like `method="GET", async, count>=5`.
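///
/// A minimal sketch (mirrors the crate's own tests):
///
/// ```ignore
/// let preds = parse_predicate_list(r#"method="POST", async, count>=1"#)?;
/// assert_eq!(preds.len(), 3);
/// assert_eq!(preds[2].op, Some(PredicateOp::Gte));
/// ```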
#[must_use = "parsing predicates is useless without inspecting the result"]
pub fn parse_predicate_list(input: &str) -> Result<Vec<Predicate>, String> {
    let tokens = tokenize(input.trim())?;
    if tokens.is_empty() {
        return Ok(Vec::new());
    }

    let mut preds = Vec::new();
    let mut pos = 0;
    loop {
        if pos >= tokens.len() {
            break;
        }
        preds.push(parse_one_predicate(&tokens, &mut pos)?);
        if pos < tokens.len() && tokens[pos] == Token::Comma {
            pos += 1;
        } else {
            break;
        }
    }

    if pos < tokens.len() {
        return Err(format!(
            "Unexpected token at position {}: {:?}",
            pos, tokens[pos]
        ));
    }

    Ok(preds)
}

/// Evaluate a predicate operator string against a node value and a predicate value.
///
/// Parses `op` by constructing a minimal predicate expression, then delegates to `eval_op`.
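///
/// A minimal sketch (illustrative, not a doctest):
///
/// ```ignore
/// assert!(eval_predicate(">=", "10", "5")?);
/// assert!(!eval_predicate("=", "GET", "POST")?);
/// ```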
#[must_use = "evaluating a predicate is useless without inspecting the result"]
pub fn eval_predicate(op: &str, node_val: &str, pred_val: &str) -> Result<bool, String> {
    let pred = parse_predicate(&format!("x{op}{pred_val}"))?;
    let resolved_op = pred.op.ok_or_else(|| "missing operator".to_string())?;
    Ok(eval_op(resolved_op, node_val, pred_val))
}

/// Evaluate a predicate operator against two string values.
///
/// For numeric operators (Lt, Gt, Lte, Gte), attempts to parse both
/// values as `f64`. Returns `false` if either side is not a valid number.
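///
/// A minimal sketch (mirrors the crate's own tests):
///
/// ```ignore
/// assert!(eval_op(PredicateOp::StartsWith, "handleClick", "handle"));
/// assert!(eval_op(PredicateOp::Lt, "3", "5"));
/// assert!(!eval_op(PredicateOp::Lt, "abc", "5")); // non-numeric: false
/// ```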
#[must_use = "evaluating an operator is useless without inspecting the result"]
pub fn eval_op(op: PredicateOp, node_val: &str, pred_val: &str) -> bool {
    match op {
        PredicateOp::Eq => node_val == pred_val,
        PredicateOp::NotEq => node_val != pred_val,
        PredicateOp::WordMatch => node_val.split_whitespace().any(|w| w == pred_val),
        PredicateOp::StartsWith => node_val.starts_with(pred_val),
        PredicateOp::EndsWith => node_val.ends_with(pred_val),
        PredicateOp::Contains => node_val.contains(pred_val),
        PredicateOp::PrefixMatch => {
            node_val == pred_val
                || (node_val.starts_with(pred_val)
                    && node_val.as_bytes().get(pred_val.len()) == Some(&b'-'))
        }
        PredicateOp::Lt | PredicateOp::Gt | PredicateOp::Lte | PredicateOp::Gte => {
            match (node_val.parse::<f64>(), pred_val.parse::<f64>()) {
                (Ok(a), Ok(b)) => match op {
                    PredicateOp::Lt => a < b,
                    PredicateOp::Gt => a > b,
                    PredicateOp::Lte => a <= b,
                    PredicateOp::Gte => a >= b,
                    _ => unreachable!(),
                },
                _ => false,
            }
        }
    }
}

/// Evaluate a predicate operator with typed values.
///
/// Uses `PredicateValue` for the predicate side and a string for the node side
/// (since node values often arrive as strings from JSON or other formats).
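///
/// A minimal sketch (mirrors the crate's own tests):
///
/// ```ignore
/// assert!(eval_op_typed(PredicateOp::Gt, "10", &PredicateValue::Number(5.0)));
/// assert!(!eval_op_typed(PredicateOp::Lt, "abc", &PredicateValue::Number(5.0)));
/// ```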
#[must_use = "evaluating an operator is useless without inspecting the result"]
pub fn eval_op_typed(op: PredicateOp, node_val: &str, pred_val: &PredicateValue) -> bool {
    match op {
        PredicateOp::Lt | PredicateOp::Gt | PredicateOp::Lte | PredicateOp::Gte => {
            let node_num = node_val.parse::<f64>().ok();
            let pred_num = pred_val.as_f64();
            match (node_num, pred_num) {
                (Some(a), Some(b)) => match op {
                    PredicateOp::Lt => a < b,
                    PredicateOp::Gt => a > b,
                    PredicateOp::Lte => a <= b,
                    PredicateOp::Gte => a >= b,
                    _ => unreachable!(),
                },
                _ => false,
            }
        }
        _ => {
            let pred_str = pred_val.as_str_repr();
            eval_op(op, node_val, &pred_str)
        }
    }
}

fn token_to_op(tok: &Token) -> Option<PredicateOp> {
    match tok {
        Token::Eq => Some(PredicateOp::Eq),
        Token::NotEq => Some(PredicateOp::NotEq),
        Token::WordMatch => Some(PredicateOp::WordMatch),
        Token::PrefixMatch => Some(PredicateOp::PrefixMatch),
        Token::StartsWith => Some(PredicateOp::StartsWith),
        Token::EndsWith => Some(PredicateOp::EndsWith),
        Token::Contains => Some(PredicateOp::Contains),
        Token::Lt => Some(PredicateOp::Lt),
        Token::Gt => Some(PredicateOp::Gt),
        Token::Lte => Some(PredicateOp::Lte),
        Token::Gte => Some(PredicateOp::Gte),
        _ => None,
    }
}

fn token_to_value(tok: &Token) -> Option<PredicateValue> {
    match tok {
        Token::Str(s) => Some(PredicateValue::String(s.clone())),
        Token::Number(n) => Some(PredicateValue::Number(*n)),
        Token::True => Some(PredicateValue::Bool(true)),
        Token::False => Some(PredicateValue::Bool(false)),
        Token::Null => Some(PredicateValue::Null),
        Token::Ident(s) => Some(PredicateValue::String(s.clone())),
        _ => None,
    }
}

fn token_as_name(tok: &Token) -> Option<String> {
    match tok {
        Token::Null => Some("null".to_string()),
        Token::True => Some("true".to_string()),
        Token::False => Some("false".to_string()),
        Token::Not => Some("not".to_string()),
        Token::Is => Some("is".to_string()),
        Token::And => Some("and".to_string()),
        Token::Or => Some("or".to_string()),
        _ => None,
    }
}

fn parse_one_predicate(tokens: &[Token], pos: &mut usize) -> Result<Predicate, String> {
    if *pos >= tokens.len() {
        return Err("Expected predicate, got end of input".to_string());
    }

    let name = match &tokens[*pos] {
        Token::Ident(s) => s.clone(),
        tok => match token_as_name(tok) {
            Some(s) => s,
            None => return Err(format!("Expected attribute name, got {:?}", tok)),
        },
    };
    *pos += 1;

    // Check for operator
    if *pos >= tokens.len() {
        return Ok(Predicate {
            name,
            op: None,
            value: None,
        });
    }

    let op = match token_to_op(&tokens[*pos]) {
        Some(op) => op,
        None => {
            return Ok(Predicate {
                name,
                op: None,
                value: None,
            });
        }
    };
    *pos += 1;

    // Parse value
    if *pos >= tokens.len() {
        return Err(format!("Expected value after operator for '{name}'"));
    }

    let value = token_to_value(&tokens[*pos])
        .ok_or_else(|| format!("Expected value, got {:?}", tokens[*pos]))?;
    *pos += 1;

    Ok(Predicate {
        name,
        op: Some(op),
        value: Some(value),
    })
}

fn is_ident_start(ch: u8) -> bool {
    ch.is_ascii_alphabetic() || ch == b'_'
}

fn is_ident_char(ch: u8) -> bool {
    ch.is_ascii_alphanumeric() || ch == b'_' || ch == b'-'
}

#[cfg(test)]
mod tests {
    use super::*;

    // -- tokenizer tests --

    #[test]
    fn tokenizes_basic_predicate() {
        // Arrange & Act
        let tokens = tokenize(r#"method="GET""#).unwrap();

        // Assert
        assert_eq!(tokens.len(), 3, "should produce 3 tokens");
        assert_eq!(tokens[0], Token::Ident("method".to_string()), "ident");
        assert_eq!(tokens[1], Token::Eq, "eq");
        assert_eq!(tokens[2], Token::Str("GET".to_string()), "string value");
    }

    #[test]
    fn tokenizes_numeric_value() {
        // Arrange & Act
        let tokens = tokenize("count>=42.5").unwrap();

        // Assert
        assert_eq!(tokens.len(), 3, "should produce 3 tokens");
        assert_eq!(tokens[0], Token::Ident("count".to_string()), "ident");
        assert_eq!(tokens[1], Token::Gte, "gte op");
        assert_eq!(tokens[2], Token::Number(42.5), "number value");
    }

    #[test]
    fn tokenizes_keywords_case_insensitive() {
        // Arrange & Act
        let lower = tokenize("null true false not is and or").unwrap();
        let upper = tokenize("NULL TRUE FALSE NOT IS AND OR").unwrap();
        let mixed = tokenize("Null True False Not Is And Or").unwrap();

        // Assert
        let expected = vec![
            Token::Null,
            Token::True,
            Token::False,
            Token::Not,
            Token::Is,
            Token::And,
            Token::Or,
        ];
        assert_eq!(lower, expected, "lowercase keywords");
        assert_eq!(upper, expected, "uppercase keywords");
        assert_eq!(mixed, expected, "mixed-case keywords");
    }

    #[test]
    fn tokenizes_keywords_in_expression() {
        // Arrange & Act
        let tokens = tokenize("status is not null and active = true or count < 5").unwrap();

        // Assert
        let expected = vec![
            Token::Ident("status".to_string()),
            Token::Is,
            Token::Not,
            Token::Null,
            Token::And,
            Token::Ident("active".to_string()),
            Token::Eq,
            Token::True,
            Token::Or,
            Token::Ident("count".to_string()),
            Token::Lt,
            Token::Number(5.0),
        ];
        assert_eq!(tokens, expected, "keyword tokenization");
    }

    #[test]
    fn tokenizes_structural_tokens() {
        // Arrange & Act
        let tokens = tokenize("[a.b, c(d)]").unwrap();

        // Assert
        assert_eq!(tokens[0], Token::LBracket, "lbracket");
        assert_eq!(tokens[1], Token::Ident("a".to_string()), "a");
        assert_eq!(tokens[2], Token::Dot, "dot");
        assert_eq!(tokens[3], Token::Ident("b".to_string()), "b");
        assert_eq!(tokens[4], Token::Comma, "comma");
        assert_eq!(tokens[5], Token::Ident("c".to_string()), "c");
        assert_eq!(tokens[6], Token::LParen, "lparen");
        assert_eq!(tokens[7], Token::Ident("d".to_string()), "d");
        assert_eq!(tokens[8], Token::RParen, "rparen");
        assert_eq!(tokens[9], Token::RBracket, "rbracket");
    }

    #[test]
    fn tokenizes_selector_tokens() {
        // Arrange & Act
        let tokens = tokenize("div > .class + #id ~ span::before :hover !important;").unwrap();

        // Assert
        let expected = vec![
            Token::Ident("div".to_string()),
            Token::Gt,
            Token::Dot,
            Token::Ident("class".to_string()),
            Token::Plus,
            Token::Hash,
            Token::Ident("id".to_string()),
            Token::Tilde,
            Token::Ident("span".to_string()),
            Token::DoubleColon,
            Token::Ident("before".to_string()),
            Token::Colon,
            Token::Ident("hover".to_string()),
            Token::Bang,
            Token::Ident("important".to_string()),
            Token::Semicolon,
        ];
        assert_eq!(tokens, expected, "selector-style tokens");
    }

    #[test]
    fn tokenizes_all_operators() {
        // Arrange & Act
        let tokens = tokenize("~= |= ^= $= *= != <= >= < > =").unwrap();

        // Assert
        let expected = vec![
            Token::WordMatch,
            Token::PrefixMatch,
            Token::StartsWith,
            Token::EndsWith,
            Token::Contains,
            Token::NotEq,
            Token::Lte,
            Token::Gte,
            Token::Lt,
            Token::Gt,
            Token::Eq,
        ];
        assert_eq!(tokens, expected, "all operators");
    }

    #[test]
    fn tokenizes_negative_number() {
        // Arrange & Act
        let tokens = tokenize("val>-2.5").unwrap();

        // Assert
        assert_eq!(tokens.len(), 3, "should produce 3 tokens");
        assert_eq!(tokens[2], Token::Number(-2.5), "negative float");
    }

    #[test]
    fn tokenizes_standalone_star() {
        // Arrange & Act
        let tokens = tokenize("* a").unwrap();

        // Assert
        assert_eq!(tokens[0], Token::Star, "standalone star");
        assert_eq!(tokens[1], Token::Ident("a".to_string()), "ident after star");
    }

    // -- predicate parser tests --

    #[test]
    fn parses_presence_only() {
        // Arrange & Act
        let pred = parse_predicate("async").unwrap();

        // Assert
        assert_eq!(pred.name, "async", "name should be parsed");
        assert_eq!(pred.op, None, "no operator for presence check");
        assert_eq!(pred.value, None, "no value for presence check");
    }

    #[test]
    fn parses_eq_string() {
        // Arrange & Act
        let pred = parse_predicate(r#"method="GET""#).unwrap();

        // Assert
        assert_eq!(pred.name, "method", "name");
        assert_eq!(pred.op, Some(PredicateOp::Eq), "operator");
        assert_eq!(
            pred.value,
            Some(PredicateValue::String("GET".to_string())),
            "typed string value"
        );
    }

    #[test]
    fn parses_numeric_value() {
        // Arrange & Act
        let pred = parse_predicate("count>=5").unwrap();

        // Assert
        assert_eq!(pred.name, "count", "name");
        assert_eq!(pred.op, Some(PredicateOp::Gte), "operator");
        assert_eq!(
            pred.value,
            Some(PredicateValue::Number(5.0)),
            "typed numeric value"
        );
    }

    #[test]
    fn parses_bool_value() {
        // Arrange & Act
        let pred = parse_predicate("active=true").unwrap();

        // Assert
        assert_eq!(
            pred.value,
            Some(PredicateValue::Bool(true)),
            "typed bool value"
        );
    }

    #[test]
    fn parses_null_value() {
        // Arrange & Act
        let pred = parse_predicate("status=null").unwrap();

        // Assert
        assert_eq!(pred.value, Some(PredicateValue::Null), "typed null value");
    }

    #[test]
    fn parses_css_string_operators() {
        // Arrange & Act
        let starts = parse_predicate(r#"name^="handle""#).unwrap();
        let ends = parse_predicate(r#"name$="Controller""#).unwrap();
        let contains = parse_predicate(r#"name*="user""#).unwrap();
        let word = parse_predicate(r#"class~="active""#).unwrap();
        let prefix = parse_predicate(r#"lang|="en""#).unwrap();

        // Assert
        assert_eq!(
            starts.op,
            Some(PredicateOp::StartsWith),
            "^= should be StartsWith"
        );
        assert_eq!(
            ends.op,
            Some(PredicateOp::EndsWith),
            "$= should be EndsWith"
        );
        assert_eq!(
            contains.op,
            Some(PredicateOp::Contains),
            "*= should be Contains"
        );
        assert_eq!(
            word.op,
            Some(PredicateOp::WordMatch),
            "~= should be WordMatch"
        );
        assert_eq!(
            prefix.op,
            Some(PredicateOp::PrefixMatch),
            "|= should be PrefixMatch"
        );
    }

    #[test]
    fn parses_numeric_operators() {
        // Arrange & Act
        let lt = parse_predicate("count<5").unwrap();
        let gt = parse_predicate("count>5").unwrap();
        let lte = parse_predicate("count<=5").unwrap();
        let gte = parse_predicate("count>=5").unwrap();
        let neq = parse_predicate("status!=200").unwrap();

        // Assert
        assert_eq!(lt.op, Some(PredicateOp::Lt), "< should be Lt");
        assert_eq!(gt.op, Some(PredicateOp::Gt), "> should be Gt");
        assert_eq!(lte.op, Some(PredicateOp::Lte), "<= should be Lte");
        assert_eq!(gte.op, Some(PredicateOp::Gte), ">= should be Gte");
        assert_eq!(neq.op, Some(PredicateOp::NotEq), "!= should be NotEq");
        assert_eq!(lt.value, Some(PredicateValue::Number(5.0)), "numeric value");
    }

    #[test]
    fn parses_predicate_list() {
        // Arrange & Act
        let preds = parse_predicate_list(r#"method="POST", async, count>=1"#).unwrap();

        // Assert
        assert_eq!(preds.len(), 3, "should parse 3 predicates");
        assert_eq!(preds[0].name, "method", "first predicate name");
        assert_eq!(preds[1].name, "async", "second predicate name");
        assert_eq!(preds[2].name, "count", "third predicate name");
        assert_eq!(preds[2].op, Some(PredicateOp::Gte), "third predicate op");
    }

    #[test]
    fn parses_escape_sequences() {
        // Arrange & Act
        let pred = parse_predicate(r#"name="foo\"bar""#).unwrap();

        // Assert
        assert_eq!(
            pred.value,
            Some(PredicateValue::String(r#"foo"bar"#.to_string())),
            "should handle escaped quotes"
        );
    }

    #[test]
    fn parses_single_quoted_values() {
        // Arrange & Act
        let pred = parse_predicate("method='POST'").unwrap();

        // Assert
        assert_eq!(
            pred.value,
            Some(PredicateValue::String("POST".to_string())),
            "single quotes should work"
        );
    }

    // -- keyword-as-name tests --

    #[test]
    fn keywords_as_attribute_names() {
        // Arrange & Act
        let null_presence = parse_predicate("null").unwrap();
        let true_eq = parse_predicate(r#"true="yes""#).unwrap();
        let not_presence = parse_predicate("not").unwrap();
        let or_eq = parse_predicate("or=1").unwrap();
        let list = parse_predicate_list("null, true, false, not, is, and, or").unwrap();

        // Assert
        assert_eq!(null_presence.name, "null", "null as attr name");
        assert_eq!(null_presence.op, None, "presence check");

        assert_eq!(true_eq.name, "true", "true as attr name");
        assert_eq!(true_eq.op, Some(PredicateOp::Eq), "eq operator");
        assert_eq!(
            true_eq.value,
            Some(PredicateValue::String("yes".to_string())),
            "string value"
        );

        assert_eq!(not_presence.name, "not", "not as attr name");
        assert_eq!(or_eq.name, "or", "or as attr name");

        assert_eq!(list.len(), 7, "all keywords as names");
        let names: Vec<&str> = list.iter().map(|p| p.name.as_str()).collect();
        assert_eq!(
            names,
            vec!["null", "true", "false", "not", "is", "and", "or"],
            "keyword names in list"
        );
    }

    // -- eval tests --

    #[test]
    fn eval_string_operators() {
        // Arrange, Act, and Assert
        assert!(eval_op(PredicateOp::Eq, "GET", "GET"), "exact match");
        assert!(!eval_op(PredicateOp::Eq, "GET", "POST"), "exact mismatch");
        assert!(eval_op(PredicateOp::NotEq, "GET", "POST"), "not equal");
        assert!(
            eval_op(PredicateOp::StartsWith, "handleClick", "handle"),
            "starts with"
        );
        assert!(
            eval_op(PredicateOp::EndsWith, "UserController", "Controller"),
            "ends with"
        );
        assert!(
            eval_op(PredicateOp::Contains, "createUser", "User"),
            "contains"
        );
        assert!(
            eval_op(PredicateOp::WordMatch, "foo bar baz", "bar"),
            "word match"
        );
        assert!(
            eval_op(PredicateOp::PrefixMatch, "en-US", "en"),
            "prefix match with hyphen"
        );
        assert!(
            eval_op(PredicateOp::PrefixMatch, "en", "en"),
            "prefix match exact"
        );
        assert!(
            !eval_op(PredicateOp::PrefixMatch, "energy", "en"),
            "prefix match no hyphen"
        );
    }

    #[test]
    fn eval_numeric_operators() {
        // Arrange, Act, and Assert
        assert!(eval_op(PredicateOp::Lt, "3", "5"), "3 < 5");
        assert!(!eval_op(PredicateOp::Lt, "5", "3"), "5 not < 3");
        assert!(eval_op(PredicateOp::Gt, "5", "3"), "5 > 3");
        assert!(eval_op(PredicateOp::Lte, "5", "5"), "5 <= 5");
        assert!(eval_op(PredicateOp::Gte, "5", "5"), "5 >= 5");
        assert!(
            !eval_op(PredicateOp::Lt, "abc", "5"),
            "non-numeric returns false"
        );
    }

    #[test]
    fn eval_typed_numeric() {
        // Arrange, Act, and Assert
        assert!(
            eval_op_typed(PredicateOp::Gt, "10", &PredicateValue::Number(5.0)),
            "typed numeric comparison"
        );
        assert!(
            eval_op_typed(
                PredicateOp::Eq,
                "hello",
                &PredicateValue::String("hello".to_string())
            ),
            "typed string comparison"
        );
        assert!(
            !eval_op_typed(PredicateOp::Lt, "abc", &PredicateValue::Number(5.0)),
            "non-numeric node returns false"
        );
    }

    // -- configurable tokenizer tests --

    #[test]
    fn custom_operator() {
        // Arrange
        let tok = Tokenizer::new().op("=~", Token::Custom("RegexMatch".into()));

        // Act
        let tokens = tok.tokenize("name =~ 'foo.*'").unwrap();

        // Assert
        assert_eq!(tokens.len(), 3, "should produce 3 tokens");
        assert_eq!(tokens[0], Token::Ident("name".to_string()), "ident");
        assert_eq!(tokens[1], Token::Custom("RegexMatch".into()), "custom op");
        assert_eq!(tokens[2], Token::Str("foo.*".to_string()), "pattern");
    }

    #[test]
    fn custom_operator_overrides_builtin() {
        // Arrange — override `>=` with a custom token
        let tok = Tokenizer::new().op(">=", Token::Custom("GreaterOrEqual".into()));

        // Act
        let tokens = tok.tokenize("count >= 5").unwrap();

        // Assert
        assert_eq!(
            tokens[1],
            Token::Custom("GreaterOrEqual".into()),
            "custom overrides builtin"
        );
    }

    #[test]
    fn custom_keyword() {
        // Arrange
        let tok = Tokenizer::new()
            .keyword("where", Token::Custom("Where".into()))
            .keyword("select", Token::Custom("Select".into()));

        // Act
        let tokens = tok.tokenize("SELECT name WHERE count > 5").unwrap();

        // Assert
        assert_eq!(
            tokens[0],
            Token::Custom("Select".into()),
            "custom keyword select"
        );
        assert_eq!(tokens[1], Token::Ident("name".to_string()), "ident");
        assert_eq!(
            tokens[2],
            Token::Custom("Where".into()),
            "custom keyword where"
        );
    }

    #[test]
    fn transform_middleware() {
        // Arrange — middleware that removes all Comma tokens
        fn strip_commas(tokens: Vec<Token>) -> Vec<Token> {
            tokens.into_iter().filter(|t| *t != Token::Comma).collect()
        }

        let tok = Tokenizer::new().transform(strip_commas);

        // Act
        let tokens = tok.tokenize("a, b, c").unwrap();

        // Assert
        assert_eq!(tokens.len(), 3, "commas removed");
        assert_eq!(tokens[0], Token::Ident("a".to_string()), "a");
        assert_eq!(tokens[1], Token::Ident("b".to_string()), "b");
        assert_eq!(tokens[2], Token::Ident("c".to_string()), "c");
    }

    #[test]
    fn chained_transforms() {
        // Arrange — two middleware: first strips commas, second uppercases idents
        fn strip_commas(tokens: Vec<Token>) -> Vec<Token> {
            tokens.into_iter().filter(|t| *t != Token::Comma).collect()
        }
        fn uppercase_idents(tokens: Vec<Token>) -> Vec<Token> {
            tokens
                .into_iter()
                .map(|tok| match tok {
                    Token::Ident(s) => Token::Ident(s.to_uppercase()),
                    other => other,
                })
                .collect()
        }

        let tok = Tokenizer::new()
            .transform(strip_commas)
            .transform(uppercase_idents);

        // Act
        let tokens = tok.tokenize("foo, bar").unwrap();

        // Assert
        assert_eq!(tokens.len(), 2, "commas stripped");
        assert_eq!(tokens[0], Token::Ident("FOO".to_string()), "uppercased");
        assert_eq!(tokens[1], Token::Ident("BAR".to_string()), "uppercased");
    }

    #[test]
    fn multi_char_custom_op_priority() {
        // Arrange — 3-char op `<=>` should match before `<=` + `>`
        let tok = Tokenizer::new().op("<=>", Token::Custom("Spaceship".into()));

        // Act
        let tokens = tok.tokenize("a <=> b").unwrap();

        // Assert
        assert_eq!(tokens.len(), 3, "should produce 3 tokens");
        assert_eq!(
            tokens[1],
            Token::Custom("Spaceship".into()),
            "3-char custom op"
        );
    }
}