// cyrs_syntax/lexer.rs
1//! Lexer — `logos`-generated DFA producing a stream of [`LexToken`].
2//!
3//! Reference: spec 0001 §4.1. Every significant token carries its byte
4//! range; trivia (whitespace + comments) is surfaced as regular tokens so
5//! the parser can attach it to the enclosing node.
6//!
7//! Case insensitivity is handled per-keyword via `ignore(case)` on the
8//! logos derive; the text of the token preserves the original casing so
9//! the formatter can honour user preference when requested.
10
11use logos::Logos;
12use smol_str::SmolStr;
13use text_size::{TextRange, TextSize};
14
15use crate::SyntaxKind;
16
/// A single lexed token: kind, original text, and byte range.
///
/// Tokens are lossless: concatenating `text` over a full lex of a source
/// string reproduces that string byte-for-byte (exercised by the
/// losslessness tests in this module).
#[derive(Debug, Clone)]
pub struct LexToken {
    /// Classification of the token (keyword, identifier, literal, …).
    pub kind: SyntaxKind,
    /// Exact source text the token spans.  Trivia tokens keep their
    /// whitespace / comment text verbatim.
    pub text: SmolStr,
    /// Byte range the token occupies in the source.
    pub range: TextRange,
}
28
/// A lexer-level diagnostic: a code (matching `DiagCode` discriminants in
/// `cyrs-diag`) plus a message and byte range. Emitted by
/// [`validate_tokens`] for errors that can only be detected after the DFA
/// run — unterminated literals, invalid escape sequences, etc.
///
/// `cyrs-syntax` does not depend on `cyrs-diag` (spec §3.1), so the
/// code is carried as a plain `u16`.
#[derive(Debug, Clone)]
pub struct LexError {
    /// Numeric discriminant of the corresponding `DiagCode` in
    /// `cyrs-diag` (e.g. `4` for `E0004`).
    /// See [`validate_tokens`] for the codes currently emitted.
    pub code: u16,
    /// Human-readable message (rustc-style lower-case initial, no
    /// trailing period).
    pub message: String,
    /// Byte range of the offending lexeme.
    pub range: TextRange,
}
47
48/// Scan a token stream for lex-level errors that the DFA cannot express
49/// directly:
50///
51/// - **E0004** — unterminated string literal (a run of ERROR tokens that
52///   starts with a quote character but never closes).
53/// - **E0005** — unterminated block comment (a run of ERROR tokens that
54///   starts with `/*` but has no matching `*/`).
55/// - **E0046** — invalid escape sequence inside an otherwise-valid
56///   `STRING_LITERAL` token (`\q`, `\X`, etc.).
57///
58/// Returns a (possibly empty) list of [`LexError`]s in source order. The
59/// token stream itself is not modified — every byte remains in a token.
60#[must_use]
61pub fn validate_tokens(src: &str, tokens: &[LexToken]) -> Vec<LexError> {
62    let mut errors: Vec<LexError> = Vec::new();
63
64    // --- E0004 / E0005: unterminated string literal / block comment ---
65    //
66    // A logos `ERROR` token is produced for any byte (or byte sequence)
67    // the DFA cannot recognise. An unterminated string or block comment
68    // produces a run of ERROR tokens starting at the opening delimiter.
69    // We detect these by scanning error-token runs.
70    let mut i = 0;
71    while i < tokens.len() {
72        let tok = &tokens[i];
73        if tok.kind == SyntaxKind::ERROR {
74            let start = tok.range.start();
75            let start_usize = usize::from(start);
76
77            // Collect the contiguous run of ERROR tokens.
78            let mut run_end = tok.range.end();
79            let mut j = i + 1;
80            while j < tokens.len() && tokens[j].kind == SyntaxKind::ERROR {
81                run_end = tokens[j].range.end();
82                j += 1;
83            }
84            let run_src = &src[start_usize..usize::from(run_end)];
85
86            if run_src.starts_with("/*") {
87                // E0005 — unterminated block comment.
88                errors.push(LexError {
89                    code: super::parser::syntax_codes::UNCLOSED_BLOCK_COMMENT,
90                    message: "unterminated block comment".to_owned(),
91                    range: TextRange::new(start, run_end),
92                });
93            } else if run_src.starts_with('"') || run_src.starts_with('\'') {
94                // E0004 — unterminated string literal.
95                errors.push(LexError {
96                    code: super::parser::syntax_codes::UNCLOSED_STRING,
97                    message: "unterminated string literal".to_owned(),
98                    range: TextRange::new(start, run_end),
99                });
100            }
101            // E0002 / E0006 — unexpected / unrecognised token(s) that
102            // are not an unterminated literal.
103            if !run_src.starts_with("/*") && !run_src.starts_with('"') && !run_src.starts_with('\'')
104            {
105                // Heuristic: an error run starting with a digit that
106                // contains a base prefix (`0x`, `0o`, `0b`) followed by
107                // no valid digits is an invalid numeric literal (E0006).
108                let first = run_src.chars().next().unwrap_or('\0');
109                let is_bad_numeric = first.is_ascii_digit()
110                    && (run_src.starts_with("0x")
111                        || run_src.starts_with("0X")
112                        || run_src.starts_with("0o")
113                        || run_src.starts_with("0O")
114                        || run_src.starts_with("0b")
115                        || run_src.starts_with("0B"));
116                if is_bad_numeric {
117                    errors.push(LexError {
118                        code: super::parser::syntax_codes::INVALID_NUMERIC_LITERAL,
119                        message: format!("invalid numeric literal `{run_src}`"),
120                        range: TextRange::new(start, run_end),
121                    });
122                } else {
123                    errors.push(LexError {
124                        code: super::parser::syntax_codes::UNEXPECTED_TOKEN,
125                        message: format!("unexpected token `{first}`"),
126                        range: TextRange::new(start, run_end),
127                    });
128                }
129            }
130            i = j;
131            continue;
132        }
133        i += 1;
134    }
135
136    // --- E0046: invalid escape sequence in string literal ---------------
137    //
138    // The DFA matches any `\.` as a valid escape because verifying the
139    // escape character would require lookahead inside logos patterns. We
140    // perform the semantic check here.
141    let valid_escapes: &[char] = &['n', 't', 'r', '\\', '\'', '"', '0', 'b', 'f', 'u', 'U'];
142    for tok in tokens {
143        if tok.kind != SyntaxKind::STRING_LITERAL {
144            continue;
145        }
146        let text = tok.text.as_str();
147        // Strip surrounding quotes.
148        let inner = if (text.starts_with('"') && text.ends_with('"'))
149            || (text.starts_with('\'') && text.ends_with('\''))
150        {
151            &text[1..text.len() - 1]
152        } else {
153            continue;
154        };
155        let mut chars = inner.char_indices().peekable();
156        while let Some((byte_off, ch)) = chars.next() {
157            if ch == '\\'
158                && let Some(&(_, next_ch)) = chars.peek()
159            {
160                if !valid_escapes.contains(&next_ch) {
161                    let abs_start = usize::from(tok.range.start())
162                        + 1 // opening quote
163                        + byte_off;
164                    let abs_end = abs_start + 1 + next_ch.len_utf8();
165                    let range = TextRange::new(
166                        TextSize::try_from(abs_start).expect("offset fits u32"),
167                        TextSize::try_from(abs_end).expect("offset fits u32"),
168                    );
169                    errors.push(LexError {
170                        code: super::parser::syntax_codes::INVALID_ESCAPE,
171                        message: format!("invalid escape sequence `\\{next_ch}`"),
172                        range,
173                    });
174                }
175                chars.next(); // consume the escape character
176            }
177        }
178    }
179
180    errors
181}
182
183/// Tokenise an entire source string. Unknown bytes become [`SyntaxKind::ERROR`]
184/// tokens that preserve their range; the lexer never panics on input.
185#[must_use]
186pub fn lex(src: &str) -> Vec<LexToken> {
187    let mut out = Vec::new();
188    let mut lex = RawToken::lexer(src);
189    while let Some(raw) = lex.next() {
190        let range = {
191            let span = lex.span();
192            let start = TextSize::try_from(span.start).expect("span.start fits u32");
193            let end = TextSize::try_from(span.end).expect("span.end fits u32");
194            TextRange::new(start, end)
195        };
196        let text = SmolStr::new(lex.slice());
197        let kind = match raw {
198            Ok(tok) => tok.to_syntax_kind(),
199            Err(()) => SyntaxKind::ERROR,
200        };
201        out.push(LexToken { kind, text, range });
202    }
203    out
204}
205
/// Internal logos-generated token enum.
///
/// Keywords use `ignore(case)` per spec §4.1. Identifiers are recognised
/// as a fallback after keywords so `MATCHING` doesn't lex as `MATCH_KW`
/// followed by `ING` — logos resolves to the longest match.
#[derive(Logos, Debug, Clone, Copy, PartialEq, Eq)]
enum RawToken {
    // ---- trivia ------------------------------------------------------
    #[regex(r"[ \t\r\n]+")]
    Whitespace,
    // Line comment runs to (but does not include) the newline.
    #[regex(r"//[^\n\r]*")]
    LineComment,
    // Canonical non-nested C-style block-comment regex. Spec §4.1.
    //   /\*          opening
    //   [^*]*        any non-star run
    //   \*+          one-or-more closing stars
    //   ([^/*] [^*]* \*+)*   non-closing star-runs
    //   /            final slash
    #[regex(r"/\*[^*]*\*+([^/*][^*]*\*+)*/")]
    BlockComment,

    // ---- keywords (case-insensitive) ---------------------------------
    #[token("MATCH", ignore(case))]
    Match,
    #[token("OPTIONAL", ignore(case))]
    Optional,
    #[token("WHERE", ignore(case))]
    Where,
    #[token("WITH", ignore(case))]
    With,
    #[token("RETURN", ignore(case))]
    Return,
    #[token("CREATE", ignore(case))]
    Create,
    #[token("MERGE", ignore(case))]
    Merge,
    #[token("DELETE", ignore(case))]
    Delete,
    #[token("DETACH", ignore(case))]
    Detach,
    #[token("SET", ignore(case))]
    Set,
    #[token("REMOVE", ignore(case))]
    Remove,
    #[token("UNWIND", ignore(case))]
    Unwind,
    #[token("CALL", ignore(case))]
    Call,
    #[token("YIELD", ignore(case))]
    Yield,
    #[token("ON", ignore(case))]
    On,
    #[token("AS", ignore(case))]
    As,
    #[token("AND", ignore(case))]
    And,
    #[token("OR", ignore(case))]
    Or,
    #[token("XOR", ignore(case))]
    Xor,
    #[token("NOT", ignore(case))]
    Not,
    #[token("IN", ignore(case))]
    In,
    #[token("IS", ignore(case))]
    Is,
    #[token("NULL", ignore(case))]
    Null,
    #[token("TRUE", ignore(case))]
    True,
    #[token("FALSE", ignore(case))]
    False,
    #[token("CASE", ignore(case))]
    Case,
    #[token("WHEN", ignore(case))]
    When,
    #[token("THEN", ignore(case))]
    Then,
    #[token("ELSE", ignore(case))]
    Else,
    #[token("END", ignore(case))]
    End,
    #[token("ORDER", ignore(case))]
    Order,
    #[token("BY", ignore(case))]
    By,
    #[token("ASC", ignore(case))]
    Asc,
    #[token("ASCENDING", ignore(case))]
    Ascending,
    #[token("DESC", ignore(case))]
    Desc,
    #[token("DESCENDING", ignore(case))]
    Descending,
    #[token("SKIP", ignore(case))]
    Skip,
    #[token("LIMIT", ignore(case))]
    Limit,
    #[token("DISTINCT", ignore(case))]
    Distinct,
    #[token("UNION", ignore(case))]
    Union,
    #[token("ALL", ignore(case))]
    All,
    #[token("STARTS", ignore(case))]
    Starts,
    #[token("ENDS", ignore(case))]
    Ends,
    #[token("CONTAINS", ignore(case))]
    Contains,
    #[token("DIV", ignore(case))]
    Div,
    #[token("MOD", ignore(case))]
    Mod,
    #[token("COUNT", ignore(case))]
    Count,
    #[token("EXISTS", ignore(case))]
    Exists,
    // Camel-cased in the spec but matched case-insensitively like the rest.
    #[token("shortestPath", ignore(case))]
    ShortestPath,
    #[token("allShortestPaths", ignore(case))]
    AllShortestPaths,
    // List-predicate keywords (cy-8x5). `All` above already carries ALL_KW.
    #[token("ANY", ignore(case))]
    Any,
    #[token("NONE", ignore(case))]
    None,
    #[token("SINGLE", ignore(case))]
    Single,

    // ---- identifiers & parameters ------------------------------------
    // priority = 1 keeps exact keyword spellings lexing as keywords;
    // any longer match still wins as IDENT (see type-level docs).
    #[regex(r"[A-Za-z_][A-Za-z0-9_]*", priority = 1)]
    Ident,
    // Backtick-delimited; a literal backtick is escaped by doubling (``).
    #[regex(r"`(``|[^`])*`")]
    QuotedIdent,
    // `$name` or `$<decimal>`. A bare `$` with neither form falls through
    // to the `Dollar` punctuation token below.
    #[regex(r"\$[A-Za-z_][A-Za-z0-9_]*|\$[0-9]+")]
    Param,

    // ---- numeric literals --------------------------------------------
    // Float first so `1.0` doesn't shadow to `1` + `.` + `0`.
    #[regex(r"[0-9]+\.[0-9]+([eE][+\-]?[0-9]+)?")]
    #[regex(r"[0-9]+[eE][+\-]?[0-9]+")]
    Float,
    #[regex(r"0[xX][0-9A-Fa-f]+")]
    #[regex(r"0[oO][0-7]+")]
    #[regex(r"0[bB][01]+")]
    #[regex(r"[0-9]+")]
    Int,

    // ---- string literals ---------------------------------------------
    // `\.` consumes any escaped character; escape *validity* is checked
    // later in `validate_tokens` (E0046), not by the DFA.
    #[regex(r#""([^"\\]|\\.)*""#)]
    #[regex(r"'([^'\\]|\\.)*'")]
    String,

    // ---- punctuation -------------------------------------------------
    // Multi-char operators are listed next to their single-char prefixes
    // (`::` / `:`, `..` / `.`); logos picks the longest match.
    #[token("(")]
    LParen,
    #[token(")")]
    RParen,
    #[token("[")]
    LBrack,
    #[token("]")]
    RBrack,
    #[token("{")]
    LBrace,
    #[token("}")]
    RBrace,
    #[token(",")]
    Comma,
    #[token(";")]
    Semi,
    #[token("::")]
    DoubleColon,
    #[token(":")]
    Colon,
    #[token("..")]
    DotDot,
    #[token(".")]
    Dot,
    #[token("|")]
    Pipe,
    #[token("*")]
    Star,
    #[token("+")]
    Plus,
    #[token("->")]
    ArrowR,
    #[token("<-")]
    ArrowL,
    #[token("-")]
    Minus,
    #[token("/")]
    Slash,
    #[token("%")]
    Percent,
    #[token("^")]
    Caret,
    #[token("<>")]
    Neq,
    #[token("!=")]
    BangEq,
    #[token("<=")]
    Le,
    #[token(">=")]
    Ge,
    #[token("<")]
    Lt,
    #[token(">")]
    Gt,
    #[token("=~")]
    RegexEq,
    #[token("=")]
    Eq,
    #[token("$")]
    Dollar,
    #[token("!")]
    Bang,
    #[token("&")]
    Amp,
}
426
impl RawToken {
    /// Map this internal logos token to its public [`SyntaxKind`].
    ///
    /// Pure, exhaustive one-arm-per-variant mapping — there is no
    /// catch-all arm, so adding a `RawToken` variant without a
    /// corresponding kind is a compile error here.
    fn to_syntax_kind(self) -> SyntaxKind {
        match self {
            Self::Whitespace => SyntaxKind::WHITESPACE,
            Self::LineComment => SyntaxKind::LINE_COMMENT,
            Self::BlockComment => SyntaxKind::BLOCK_COMMENT,

            Self::Match => SyntaxKind::MATCH_KW,
            Self::Optional => SyntaxKind::OPTIONAL_KW,
            Self::Where => SyntaxKind::WHERE_KW,
            Self::With => SyntaxKind::WITH_KW,
            Self::Return => SyntaxKind::RETURN_KW,
            Self::Create => SyntaxKind::CREATE_KW,
            Self::Merge => SyntaxKind::MERGE_KW,
            Self::Delete => SyntaxKind::DELETE_KW,
            Self::Detach => SyntaxKind::DETACH_KW,
            Self::Set => SyntaxKind::SET_KW,
            Self::Remove => SyntaxKind::REMOVE_KW,
            Self::Unwind => SyntaxKind::UNWIND_KW,
            Self::Call => SyntaxKind::CALL_KW,
            Self::Yield => SyntaxKind::YIELD_KW,
            Self::On => SyntaxKind::ON_KW,
            Self::As => SyntaxKind::AS_KW,
            Self::And => SyntaxKind::AND_KW,
            Self::Or => SyntaxKind::OR_KW,
            Self::Xor => SyntaxKind::XOR_KW,
            Self::Not => SyntaxKind::NOT_KW,
            Self::In => SyntaxKind::IN_KW,
            Self::Is => SyntaxKind::IS_KW,
            Self::Null => SyntaxKind::NULL_KW,
            Self::True => SyntaxKind::TRUE_KW,
            Self::False => SyntaxKind::FALSE_KW,
            Self::Case => SyntaxKind::CASE_KW,
            Self::When => SyntaxKind::WHEN_KW,
            Self::Then => SyntaxKind::THEN_KW,
            Self::Else => SyntaxKind::ELSE_KW,
            Self::End => SyntaxKind::END_KW,
            Self::Order => SyntaxKind::ORDER_KW,
            Self::By => SyntaxKind::BY_KW,
            Self::Asc => SyntaxKind::ASC_KW,
            Self::Ascending => SyntaxKind::ASCENDING_KW,
            Self::Desc => SyntaxKind::DESC_KW,
            Self::Descending => SyntaxKind::DESCENDING_KW,
            Self::Skip => SyntaxKind::SKIP_KW,
            Self::Limit => SyntaxKind::LIMIT_KW,
            Self::Distinct => SyntaxKind::DISTINCT_KW,
            Self::Union => SyntaxKind::UNION_KW,
            Self::All => SyntaxKind::ALL_KW,
            Self::Starts => SyntaxKind::STARTS_KW,
            Self::Ends => SyntaxKind::ENDS_KW,
            Self::Contains => SyntaxKind::CONTAINS_KW,
            Self::Div => SyntaxKind::DIV_KW,
            Self::Mod => SyntaxKind::MOD_KW,
            Self::Count => SyntaxKind::COUNT_KW,
            Self::Exists => SyntaxKind::EXISTS_KW,
            Self::ShortestPath => SyntaxKind::SHORTESTPATH_KW,
            Self::AllShortestPaths => SyntaxKind::ALLSHORTESTPATHS_KW,
            Self::Any => SyntaxKind::ANY_KW,
            Self::None => SyntaxKind::NONE_KW,
            Self::Single => SyntaxKind::SINGLE_KW,

            Self::Ident => SyntaxKind::IDENT,
            Self::QuotedIdent => SyntaxKind::QUOTED_IDENT,
            Self::Param => SyntaxKind::PARAM,

            Self::Int => SyntaxKind::INT_LITERAL,
            Self::Float => SyntaxKind::FLOAT_LITERAL,
            Self::String => SyntaxKind::STRING_LITERAL,

            Self::LParen => SyntaxKind::L_PAREN,
            Self::RParen => SyntaxKind::R_PAREN,
            Self::LBrack => SyntaxKind::L_BRACK,
            Self::RBrack => SyntaxKind::R_BRACK,
            Self::LBrace => SyntaxKind::L_BRACE,
            Self::RBrace => SyntaxKind::R_BRACE,
            Self::Comma => SyntaxKind::COMMA,
            Self::Semi => SyntaxKind::SEMI,
            Self::Colon => SyntaxKind::COLON,
            Self::DoubleColon => SyntaxKind::DOUBLE_COLON,
            Self::Dot => SyntaxKind::DOT,
            Self::DotDot => SyntaxKind::DOT_DOT,
            Self::Pipe => SyntaxKind::PIPE,
            Self::Star => SyntaxKind::STAR,
            Self::Plus => SyntaxKind::PLUS,
            Self::Minus => SyntaxKind::MINUS,
            Self::Slash => SyntaxKind::SLASH,
            Self::Percent => SyntaxKind::PERCENT,
            Self::Caret => SyntaxKind::CARET,
            Self::Eq => SyntaxKind::EQ,
            Self::Neq => SyntaxKind::NEQ,
            Self::BangEq => SyntaxKind::BANG_EQ,
            Self::Lt => SyntaxKind::LT,
            Self::Le => SyntaxKind::LE,
            Self::Gt => SyntaxKind::GT,
            Self::Ge => SyntaxKind::GE,
            Self::ArrowR => SyntaxKind::ARROW_R,
            Self::ArrowL => SyntaxKind::ARROW_L,
            Self::RegexEq => SyntaxKind::REGEX_EQ,
            Self::Dollar => SyntaxKind::DOLLAR,
            Self::Bang => SyntaxKind::BANG,
            Self::Amp => SyntaxKind::AMP,
        }
    }
}
531
// Unit tests for `lex`: token kinds, case-insensitive keywords, literal
// forms, composite punctuation, losslessness, and error recovery.
#[cfg(test)]
mod tests {
    use super::{SyntaxKind, lex};
    use text_size::{TextRange, TextSize};

    // Helper: lex `src` and keep only each token's kind (text/range dropped).
    fn kinds(src: &str) -> Vec<SyntaxKind> {
        lex(src).into_iter().map(|t| t.kind).collect()
    }

    #[test]
    fn lex_empty() {
        assert!(lex("").is_empty());
    }

    #[test]
    fn tokenises_empty_input_to_zero_tokens() {
        // §4.1: empty input yields no tokens (EOF is synthesised by the
        // parser, not the lexer).
        assert_eq!(lex("").len(), 0);
    }

    #[test]
    fn lex_simple_match() {
        let k = kinds("MATCH (n) RETURN n");
        assert_eq!(
            k,
            vec![
                SyntaxKind::MATCH_KW,
                SyntaxKind::WHITESPACE,
                SyntaxKind::L_PAREN,
                SyntaxKind::IDENT,
                SyntaxKind::R_PAREN,
                SyntaxKind::WHITESPACE,
                SyntaxKind::RETURN_KW,
                SyntaxKind::WHITESPACE,
                SyntaxKind::IDENT,
            ]
        );
    }

    #[test]
    fn keywords_are_case_insensitive() {
        assert_eq!(kinds("match")[0], SyntaxKind::MATCH_KW);
        assert_eq!(kinds("MaTcH")[0], SyntaxKind::MATCH_KW);
    }

    #[test]
    fn keywords_case_insensitive_preserves_case() {
        // §4.1: case-insensitive match, original casing preserved in `text`.
        let toks = lex("match MATCH Match");
        let kw_toks: Vec<_> = toks
            .iter()
            .filter(|t| t.kind == SyntaxKind::MATCH_KW)
            .collect();
        assert_eq!(kw_toks.len(), 3);
        assert_eq!(kw_toks[0].text.as_str(), "match");
        assert_eq!(kw_toks[1].text.as_str(), "MATCH");
        assert_eq!(kw_toks[2].text.as_str(), "Match");
    }

    #[test]
    fn identifier_not_shadowed_by_keyword_prefix() {
        // `MATCHING` must lex as a single IDENT, not MATCH_KW + ING.
        assert_eq!(kinds("MATCHING"), vec![SyntaxKind::IDENT]);
    }

    #[test]
    fn identifiers_vs_keywords() {
        // `matching` is a single IDENT; word-boundary prevents MATCH_KW + ing.
        let toks = lex("matching");
        assert_eq!(toks.len(), 1);
        assert_eq!(toks[0].kind, SyntaxKind::IDENT);
        assert_eq!(toks[0].text.as_str(), "matching");
    }

    #[test]
    fn numeric_literals() {
        assert_eq!(kinds("42"), vec![SyntaxKind::INT_LITERAL]);
        assert_eq!(kinds("3.14"), vec![SyntaxKind::FLOAT_LITERAL]);
        assert_eq!(kinds("0xFF"), vec![SyntaxKind::INT_LITERAL]);
        assert_eq!(kinds("0x1f"), vec![SyntaxKind::INT_LITERAL]);
        assert_eq!(kinds("0o17"), vec![SyntaxKind::INT_LITERAL]);
        assert_eq!(kinds("0b10"), vec![SyntaxKind::INT_LITERAL]);
        // Float with exponent.
        assert_eq!(kinds("1.5e10"), vec![SyntaxKind::FLOAT_LITERAL]);
        assert_eq!(kinds("2e-5"), vec![SyntaxKind::FLOAT_LITERAL]);
    }

    #[test]
    fn string_literals() {
        assert_eq!(kinds(r#""hello""#), vec![SyntaxKind::STRING_LITERAL]);
        assert_eq!(kinds("'world'"), vec![SyntaxKind::STRING_LITERAL]);
    }

    #[test]
    fn string_literal_with_escapes() {
        // Single literal covering the full range; escape sequences are
        // syntactically consumed but not decoded at lex time (§4.1).
        let src = "'a\\nb'";
        let toks = lex(src);
        assert_eq!(toks.len(), 1);
        assert_eq!(toks[0].kind, SyntaxKind::STRING_LITERAL);
        assert_eq!(toks[0].text.as_str(), src);
        let end = TextSize::try_from(src.len()).expect("len fits u32");
        assert_eq!(toks[0].range, TextRange::new(TextSize::from(0), end));
        // Double-quoted variant with several escapes also lexes as one token.
        let src2 = r#""tab:\t quote:\" backslash:\\""#;
        let toks2 = lex(src2);
        assert_eq!(toks2.len(), 1);
        assert_eq!(toks2[0].kind, SyntaxKind::STRING_LITERAL);
        assert_eq!(toks2[0].text.as_str(), src2);
    }

    #[test]
    fn parameters() {
        assert_eq!(kinds("$foo"), vec![SyntaxKind::PARAM]);
        assert_eq!(kinds("$0"), vec![SyntaxKind::PARAM]);
    }

    #[test]
    fn param_forms() {
        // `$ident` and `$<decimal>` both lex as a single PARAM token.
        let a = lex("$name");
        assert_eq!(a.len(), 1);
        assert_eq!(a[0].kind, SyntaxKind::PARAM);
        assert_eq!(a[0].text.as_str(), "$name");
        let b = lex("$0");
        assert_eq!(b.len(), 1);
        assert_eq!(b[0].kind, SyntaxKind::PARAM);
        assert_eq!(b[0].text.as_str(), "$0");
    }

    #[test]
    fn quoted_identifier_with_escaped_backtick() {
        // Backtick-delimited, escape by doubling per spec §4.1.
        let src = "`weird``name`";
        let toks = lex(src);
        assert_eq!(toks.len(), 1);
        assert_eq!(toks[0].kind, SyntaxKind::QUOTED_IDENT);
        assert_eq!(toks[0].text.as_str(), src);
    }

    #[test]
    fn punctuation_composite() {
        // Each composite operator must lex as exactly ONE token.
        for (src, expected) in [
            ("<>", SyntaxKind::NEQ),
            ("!=", SyntaxKind::BANG_EQ),
            ("<=", SyntaxKind::LE),
            (">=", SyntaxKind::GE),
            ("->", SyntaxKind::ARROW_R),
            ("<-", SyntaxKind::ARROW_L),
            ("::", SyntaxKind::DOUBLE_COLON),
            ("..", SyntaxKind::DOT_DOT),
            ("=~", SyntaxKind::REGEX_EQ),
        ] {
            let toks = lex(src);
            assert_eq!(toks.len(), 1, "expected 1 token for {src:?}");
            assert_eq!(toks[0].kind, expected, "wrong kind for {src:?}");
            assert_eq!(toks[0].text.as_str(), src);
        }
    }

    #[test]
    fn comments() {
        assert_eq!(kinds("// hi"), vec![SyntaxKind::LINE_COMMENT]);
        assert_eq!(kinds("/* hi */"), vec![SyntaxKind::BLOCK_COMMENT]);
    }

    #[test]
    fn block_comment_and_line_comment() {
        // Each comment is a single trivia token spanning its full range.
        let line = lex("// a comment");
        assert_eq!(line.len(), 1);
        assert_eq!(line[0].kind, SyntaxKind::LINE_COMMENT);
        assert_eq!(line[0].text.as_str(), "// a comment");
        let block = lex("/* multi\nline */");
        assert_eq!(block.len(), 1);
        assert_eq!(block[0].kind, SyntaxKind::BLOCK_COMMENT);
        assert_eq!(block[0].text.as_str(), "/* multi\nline */");
    }

    #[test]
    fn losslessness_invariant_sample() {
        let src = "MATCH (n:Person {name: $nm}) // find\nRETURN n";
        let reassembled: String = lex(src).into_iter().map(|t| t.text.to_string()).collect();
        assert_eq!(reassembled, src);
    }

    #[test]
    fn lossless_concat() {
        // §4.4 losslessness invariant at the lexer level: concatenating
        // every token's text reproduces the source byte-for-byte.
        let src = "MATCH (n:Person {name: 'a\\nb', age: 42})\n// trailing\nRETURN n.age + 1";
        let reassembled: String = lex(src).into_iter().map(|t| t.text.to_string()).collect();
        assert_eq!(reassembled, src);
    }

    #[test]
    fn error_token_for_unknown_bytes() {
        // A stray `@` is not a valid token in v1.
        let toks = lex("@");
        assert_eq!(toks.len(), 1);
        assert_eq!(toks[0].kind, SyntaxKind::ERROR);
    }

    #[test]
    fn unknown_byte_becomes_error_token() {
        // A stray multi-byte codepoint (`§`, U+00A7) is not consumable by
        // the DFA; the lexer emits an ERROR token spanning its bytes and
        // does not panic. Losslessness is preserved: text + range round-trip.
        let src = "§";
        let toks = lex(src);
        assert!(!toks.is_empty());
        assert!(toks.iter().all(|t| t.kind == SyntaxKind::ERROR));
        let reassembled: String = toks.iter().map(|t| t.text.to_string()).collect();
        assert_eq!(reassembled, src);
        // First token starts at offset 0.
        assert_eq!(u32::from(toks[0].range.start()), 0);
        // Final token ends at end of input.
        let last_end = usize::from(toks.last().unwrap().range.end());
        assert_eq!(last_end, src.len());
    }
}
755}