bynk_syntax/
lexer.rs

1//! Lexer for Bynk v0.
2//!
3//! Token kinds correspond to the terminals defined in the grammar (spec §3
4//! and §4). Whitespace is skipped; line comments are emitted as `Comment`
5//! tokens so the formatter can preserve them through round-trips (v1.1 LSP
6//! spec §3.5). Doc blocks (`---`) are emitted as `DocBlock` tokens, lexed
7//! outside of logos (see [`tokenize`]).
8
9use logos::Logos;
10
11use crate::error::CompileError;
12use crate::span::Span;
13
/// Removes the visual `_` digit separators from a numeric literal's lexeme so
/// the remaining text can be handed to a number parser (v0.142, ADR 0166).
///
/// A lexeme without any `_` is returned borrowed and untouched; only a lexeme
/// that actually contains a separator pays for a freshly allocated `String`.
/// Every `_` is dropped regardless of where it sits — restricting separators to
/// "between two digit groups" is the job of the `IntLit`/`FloatLit` token
/// regexes, not of this function.
pub(crate) fn strip_digit_separators(lexeme: &str) -> std::borrow::Cow<'_, str> {
    use std::borrow::Cow;

    match lexeme.find('_') {
        // Common case: nothing to strip, hand the input straight back.
        None => Cow::Borrowed(lexeme),
        Some(_) => Cow::Owned(lexeme.chars().filter(|&c| c != '_').collect()),
    }
}
26
/// Token kinds. Discriminants without payload data; the lexeme is recovered
/// from the source string via the token's [`Span`].
///
/// The logos-generated lexer skips runs of spaces, tabs, carriage returns and
/// newlines between tokens (the `skip` pattern below).
///
/// Note: `--` line comments and `---` doc block markers are handled outside
/// logos (see [`tokenize`]), because doc blocks are delimited by `---` lines
/// containing only the marker and may span multiple source lines. The
/// `DocBlock`, `Comment` and `InterpStr` variants therefore carry no logos
/// pattern and are only ever pushed by [`tokenize`].
#[derive(Logos, Debug, Clone, Copy, PartialEq, Eq)]
#[logos(skip r"[ \t\r\n]+")]
pub enum TokenKind {
    // Keywords
    #[token("commons")]
    Commons,
    #[token("type")]
    Type,
    #[token("fn")]
    Fn,
    #[token("where")]
    Where,
    #[token("and")]
    And,
    #[token("true")]
    True,
    #[token("false")]
    False,
    #[token("Int")]
    Int,
    #[token("String")]
    String,
    #[token("Bool")]
    Bool,
    // v0.21 keyword
    #[token("Float")]
    Float,
    // v0.86 keyword (ADR 0112): the `Duration` base type.
    #[token("Duration")]
    Duration,
    // v0.90 keyword (ADR 0114): the `Instant` base type.
    #[token("Instant")]
    Instant,
    // v0.110 keyword (ADR 0142): the `Bytes` base type.
    #[token("Bytes")]
    Bytes,
    // v0.1 keywords
    #[token("let")]
    Let,
    #[token("if")]
    If,
    #[token("else")]
    Else,
    #[token("Ok")]
    Ok,
    #[token("Err")]
    Err,
    #[token("Result")]
    Result,
    #[token("ValidationError")]
    ValidationError,
    // v0.22b keyword
    #[token("JsonError")]
    JsonError,
    // v0.2 keywords
    #[token("enum")]
    Enum,
    #[token("match")]
    Match,
    #[token("Option")]
    Option,
    #[token("record")]
    Record,
    // Named `Self_` because `Self` is a reserved word in Rust.
    #[token("self")]
    Self_,
    #[token("Some")]
    Some,
    #[token("None")]
    None,
    #[token("is")]
    Is,
    // v0.3 keywords
    #[token("opaque")]
    Opaque,
    #[token("uses")]
    Uses,
    // v0.4 keywords
    #[token("context")]
    Context,
    #[token("consumes")]
    Consumes,
    #[token("exports")]
    Exports,
    #[token("transparent")]
    Transparent,
    // v0.6 keywords
    #[token("as")]
    As,
    // v0.7 keywords (v0.112: `assert`→`expect`, `test`→`suite`/`case`;
    // v0.118: `mocks` retired — test doubles are `provides` at a seam)
    #[token("expect")]
    Expect,
    #[token("suite")]
    Suite,
    #[token("case")]
    Case,
    // v0.114 keyword — generative tests (testing track slice 2). `for` and `all`
    // are deliberately *not* keywords: `all` is a list combinator (`all(xs, p)`)
    // and must stay a usable identifier. The `for all` binder is parsed
    // contextually (two identifiers) inside a `property` body instead.
    #[token("property")]
    Property,
    // v0.17 keywords
    #[token("adapter")]
    Adapter,
    #[token("binding")]
    Binding,
    // v0.5 keywords
    #[token("agent")]
    Agent,
    #[token("capability")]
    Capability,
    #[token("Effect")]
    Effect,
    #[token("given")]
    Given,
    #[token("on")]
    On,
    // v0.9 keyword
    #[token("http")]
    Http,
    // v0.10a keyword
    #[token("cron")]
    Cron,
    // v0.10b keyword
    #[token("queue")]
    Queue,
    // v0.44 keywords: `from` heads a service's protocol clause; `protocol` is
    // reserved (protocols are a closed, compiler-known set — no declaration kind).
    #[token("from")]
    From,
    #[token("protocol")]
    Protocol,
    #[token("provides")]
    Provides,
    #[token("service")]
    Service,
    // v0.45 keywords: `actor` heads a boundary-contract declaration; `by`
    // heads a handler's actor clause.
    #[token("actor")]
    Actor,
    #[token("by")]
    By,
    // v0.80 keywords: `invariant` heads an agent invariant declaration; `implies`
    // is the directional logical-implication operator (`P implies Q` ≡ `!P || Q`).
    #[token("invariant")]
    Invariant,
    #[token("implies")]
    Implies,
    // v0.115 keywords — function contracts (testing track slice 3). `requires`
    // and `ensures` head a contract clause on a `fn` signature (between the
    // return type and the body). `result` is deliberately *not* a keyword: it is
    // the ordinary value name outside a contract, so it stays a usable
    // identifier; inside an `ensures` predicate it is bound contextually as the
    // function's return value (parsed by scope, like `for`/`all` in slice 2).
    // Distinct from ADR 0127's capability `@requires` annotation.
    #[token("requires")]
    Requires,
    #[token("ensures")]
    Ensures,
    // v0.116 keyword — step invariants (testing track slice 4). `transition` heads
    // an agent step-invariant declaration (beside `invariant`), a predicate over
    // the pre- and post-commit state pair. `old` and `new` are deliberately *not*
    // keywords: they stay ordinary value names outside a `transition`, and inside a
    // `transition` predicate they are bound contextually to the old/new state
    // records (parsed by scope, like `result` in an `ensures`).
    #[token("transition")]
    Transition,
    /// `...` — used in record-spread expressions (v0.5).
    #[token("...")]
    DotDotDot,
    /// `<-` — Effect bind operator (v0.5).
    #[token("<-")]
    LArrow,
    /// `~>` — asynchronous fire-and-forget send marker (v0.79). A leading
    /// statement marker, never on the RHS of a `let`; distinct from `<-` so the
    /// call site shows whether the caller waits.
    #[token("~>")]
    TildeArrow,
    /// `:=` — Cell write (v0.81, storage track). A handler statement
    /// `cell := expr`; distinct from `=` (binding) and `:` (annotation). Longer
    /// than `:`/`=` so logos matches it as one token.
    #[token(":=")]
    ColonEq,

    /// A documentation block: `---` line ... `---` line. The token's span
    /// covers the full block including both `---` markers. The body content
    /// is recovered from the source via the span (see [`doc_block_content`]).
    /// Inserted by [`tokenize`]; not lexed by logos directly.
    DocBlock,

    /// A line comment: `-- ...` running to end of line. The span starts at
    /// the `--` marker and runs through the last character before the
    /// terminating newline (exclusive). The trivia body (the text after the
    /// `--` marker) is recovered from the source via the span. Inserted by
    /// [`tokenize`]; not lexed by logos directly so it cannot be mistaken
    /// for an `--` operator sequence.
    Comment,

    // Identifier: an ASCII letter followed by any number of ASCII letters,
    // digits or `_`.
    #[regex(r"[A-Za-z][A-Za-z0-9_]*")]
    Ident,

    // Literals. v0.142 (ADR 0166): an `_` digit separator may appear between
    // digits (`1_048_576`) — never leading, trailing, or doubled (each `_` must
    // sit between two digit groups). The separators are stripped before the value
    // is parsed; they are purely visual.
    #[regex(r"[0-9]+(_[0-9]+)*")]
    IntLit,
    // A float literal: fraction with a digit on both sides of the `.`, an
    // exponent, or both (v0.21 §3). `1.` and `.5` are NOT float literals —
    // the digit-both-sides rule keeps `2.5.round()` / `1.toFloat()` lexing
    // as method calls on numeric literals. Digit separators (v0.142) may appear
    // in any digit group, including the exponent.
    #[regex(
        r"[0-9]+(_[0-9]+)*\.[0-9]+(_[0-9]+)*([eE][+-]?[0-9]+(_[0-9]+)*)?|[0-9]+(_[0-9]+)*[eE][+-]?[0-9]+(_[0-9]+)*"
    )]
    FloatLit,
    // A double-quoted string with simple escapes. The body excludes the closing
    // quote; we accept any non-quote/non-backslash/non-newline char, or a
    // backslash followed by one of the four allowed escapes.
    #[regex(r#""([^"\\\n]|\\[nt"\\])*""#)]
    StrLit,
    // An interpolated string `"… \(expr) …"` (v0.43). Hand-scanned in
    // `tokenize` (logos cannot balance the holes' parens), never produced by
    // the logos lexer — like [`TokenKind::DocBlock`]/[`TokenKind::Comment`].
    // The span covers the whole `"…"`; the parser splits chunks from holes.
    InterpStr,

    // Multi-char operators
    #[token("->")]
    Arrow,
    #[token("==")]
    EqEq,
    #[token("!=")]
    BangEq,
    #[token("<=")]
    LtEq,
    #[token(">=")]
    GtEq,
    #[token("&&")]
    AmpAmp,
    #[token("||")]
    PipePipe,

    // Single-char operators
    #[token("+")]
    Plus,
    #[token("-")]
    Minus,
    #[token("*")]
    Star,
    #[token("/")]
    Slash,
    #[token("!")]
    Bang,
    #[token("=")]
    Eq,
    #[token("<")]
    Lt,
    #[token(">")]
    Gt,
    // v0.1 postfix operator
    #[token("?")]
    Question,
    // v0.2 match-arm arrow
    #[token("=>")]
    FatArrow,
    // v0.2 wildcard pattern.
    // NOTE(review): the `Ident` regex above requires a leading ASCII letter, so
    // `_foo` lexes as `Underscore` followed by `Ident`, not as a single `Ident`
    // — confirm this is the intended behaviour.
    #[token("_")]
    Underscore,
    // v0.2 sum-type variant separator (also used as future bitwise OR);
    // single `|` distinct from `||`.
    #[token("|")]
    Pipe,
    /// `@` — storage-annotation marker (v0.85, storage track; ADR 0111). Leads a
    /// `@name(args)` annotation on a `store` field (`@ttl(…)`/`@indexed(…)`); it
    /// appears only in store-field-declaration position, never as an expression
    /// operator.
    #[token("@")]
    At,

    // Punctuation
    #[token("(")]
    LParen,
    #[token(")")]
    RParen,
    #[token("{")]
    LBrace,
    #[token("}")]
    RBrace,
    #[token("[")]
    LBracket,
    #[token("]")]
    RBracket,
    #[token(",")]
    Comma,
    #[token(":")]
    Colon,
    #[token(".")]
    Dot,
}
336
337impl TokenKind {
338    /// Human-readable display name for diagnostics.
339    pub fn describe(self) -> &'static str {
340        use TokenKind::*;
341        match self {
342            Commons => "`commons`",
343            Type => "`type`",
344            Fn => "`fn`",
345            Where => "`where`",
346            And => "`and`",
347            True => "`true`",
348            False => "`false`",
349            Int => "`Int`",
350            String => "`String`",
351            Bool => "`Bool`",
352            Float => "`Float`",
353            Duration => "`Duration`",
354            Instant => "`Instant`",
355            Bytes => "`Bytes`",
356            Let => "`let`",
357            If => "`if`",
358            Else => "`else`",
359            Ok => "`Ok`",
360            Err => "`Err`",
361            Result => "`Result`",
362            ValidationError => "`ValidationError`",
363            JsonError => "`JsonError`",
364            Enum => "`enum`",
365            Match => "`match`",
366            Option => "`Option`",
367            Record => "`record`",
368            Self_ => "`self`",
369            Some => "`Some`",
370            None => "`None`",
371            Is => "`is`",
372            Opaque => "`opaque`",
373            Uses => "`uses`",
374            Context => "`context`",
375            Consumes => "`consumes`",
376            Exports => "`exports`",
377            Transparent => "`transparent`",
378            As => "`as`",
379            Expect => "`expect`",
380            Suite => "`suite`",
381            Case => "`case`",
382            Property => "`property`",
383            Adapter => "`adapter`",
384            Binding => "`binding`",
385            Agent => "`agent`",
386            Capability => "`capability`",
387            Effect => "`Effect`",
388            Given => "`given`",
389            On => "`on`",
390            Http => "`http`",
391            Cron => "`cron`",
392            Queue => "`queue`",
393            From => "`from`",
394            Protocol => "`protocol`",
395            Provides => "`provides`",
396            Service => "`service`",
397            Actor => "`actor`",
398            By => "`by`",
399            Invariant => "`invariant`",
400            Implies => "`implies`",
401            Requires => "`requires`",
402            Ensures => "`ensures`",
403            Transition => "`transition`",
404            ColonEq => "`:=`",
405            DotDotDot => "`...`",
406            LArrow => "`<-`",
407            TildeArrow => "`~>`",
408            DocBlock => "documentation block",
409            Comment => "line comment",
410            Ident => "identifier",
411            IntLit => "integer literal",
412            FloatLit => "float literal",
413            StrLit => "string literal",
414            InterpStr => "interpolated string",
415            Arrow => "`->`",
416            EqEq => "`==`",
417            BangEq => "`!=`",
418            LtEq => "`<=`",
419            GtEq => "`>=`",
420            AmpAmp => "`&&`",
421            PipePipe => "`||`",
422            Plus => "`+`",
423            Minus => "`-`",
424            Star => "`*`",
425            Slash => "`/`",
426            Bang => "`!`",
427            Eq => "`=`",
428            Lt => "`<`",
429            Gt => "`>`",
430            Question => "`?`",
431            FatArrow => "`=>`",
432            Underscore => "`_`",
433            Pipe => "`|`",
434            At => "`@`",
435            LParen => "`(`",
436            RParen => "`)`",
437            LBrace => "`{`",
438            RBrace => "`}`",
439            LBracket => "`[`",
440            RBracket => "`]`",
441            Comma => "`,`",
442            Colon => "`:`",
443            Dot => "`.`",
444        }
445    }
446}
447
/// A token plus its source span.
///
/// The token stores no text of its own: the lexeme is read back by slicing the
/// source string with `span`.
#[derive(Debug, Clone, Copy)]
pub struct Token {
    /// Which terminal (or trivia class) this token is.
    pub kind: TokenKind,
    /// Start and end offsets of the token within the source string.
    pub span: Span,
}
454
455/// Tokenise a source string. Returns the full token vector or the first
456/// lexical error.
457///
458/// Doc blocks (`---` ... `---`) and line comments (`-- ...`) are recognised
459/// outside the logos-generated lexer: we scan the source one segment at a
460/// time, dispatching to logos for ordinary tokens between non-token spans.
461pub fn tokenize(source: &str) -> Result<Vec<Token>, CompileError> {
462    let mut tokens = Vec::new();
463    let bytes = source.as_bytes();
464    let mut pos = 0;
465    while pos < bytes.len() {
466        // Detect a `---` doc-block marker at the start of a line (the line may
467        // begin with leading whitespace; the marker itself must be alone on
468        // its line).
469        if let Some(open_end) = doc_block_open_at(source, pos) {
470            // Find the matching closing `---` line.
471            match doc_block_close(source, open_end) {
472                Some((close_start, close_end)) => {
473                    let span = Span::new(pos, close_end);
474                    tokens.push(Token {
475                        kind: TokenKind::DocBlock,
476                        span,
477                    });
478                    let _ = close_start;
479                    pos = close_end;
480                    continue;
481                }
482                None => {
483                    return Err(CompileError::new(
484                        "bynk.lex.unclosed_doc_block",
485                        Span::new(pos, open_end),
486                        "documentation block opened but never closed",
487                    )
488                    .with_note(
489                        "a doc block must be terminated by another `---` on a line by itself",
490                    ));
491                }
492            }
493        }
494        // A `--` line comment: emit a `Comment` token covering everything
495        // up to (but not including) the terminating newline. Doc-block
496        // detection above already ruled out a `---` marker at line start
497        // — and once we've consumed past the leading `--`, any further
498        // dashes are part of the comment body. Preserving comments as
499        // trivia tokens lets the parser attach them to declarations so
500        // the formatter can emit them in place (v1.1 LSP spec §3.5).
501        if pos + 1 < bytes.len() && bytes[pos] == b'-' && bytes[pos + 1] == b'-' {
502            let start = pos;
503            while pos < bytes.len() && bytes[pos] != b'\n' {
504                pos += 1;
505            }
506            tokens.push(Token {
507                kind: TokenKind::Comment,
508                span: Span::new(start, pos),
509            });
510            continue;
511        }
512        // Skip ordinary whitespace inline (logos handles it too, but we may
513        // be in the middle of the source between specials).
514        if matches!(bytes[pos], b' ' | b'\t' | b'\r' | b'\n') {
515            pos += 1;
516            continue;
517        }
518        // An interpolated string `"… \(expr) …"` (v0.43): only strings that
519        // actually contain a `\(` hole are hand-scanned here; plain strings
520        // fall through to the logos `StrLit` path unchanged. `\(` is an
521        // invalid escape in the logos grammar, so this never re-routes a
522        // currently-valid literal.
523        if bytes[pos] == b'"' && has_interp_hole(bytes, pos) {
524            let end = scan_str(bytes, source, pos)?;
525            tokens.push(Token {
526                kind: TokenKind::InterpStr,
527                span: Span::new(pos, end),
528            });
529            pos = end;
530            continue;
531        }
532        // Otherwise dispatch a single logos token starting at `pos`.
533        let mut lex = TokenKind::lexer(&source[pos..]);
534        let Some(result) = lex.next() else {
535            // No token at this position; treat as unexpected character so
536            // the user sees something useful.
537            let ch = source[pos..].chars().next().unwrap_or('\0');
538            let span = Span::new(pos, pos + ch.len_utf8());
539            return Err(CompileError::new(
540                "bynk.lex.unexpected_character",
541                span,
542                format!("unexpected character `{ch}`"),
543            ));
544        };
545        let local = lex.span();
546        let span: Span = Span::new(pos + local.start, pos + local.end);
547        match result {
548            Ok(kind) => {
549                if kind == TokenKind::IntLit {
550                    let slice = &source[span.range()];
551                    if strip_digit_separators(slice).parse::<i64>().is_err() {
552                        return Err(CompileError::new(
553                            "bynk.lex.integer_overflow",
554                            span,
555                            format!(
556                                "integer literal `{slice}` is out of range for a 64-bit signed integer"
557                            ),
558                        )
559                        .with_note("the range is -2^63 to 2^63 - 1"));
560                    }
561                }
562                if kind == TokenKind::FloatLit {
563                    let slice = &source[span.range()];
564                    match strip_digit_separators(slice).parse::<f64>() {
565                        Ok(v) if v.is_finite() => {}
566                        _ => {
567                            return Err(CompileError::new(
568                                "bynk.lex.float_literal_overflow",
569                                span,
570                                format!(
571                                    "float literal `{slice}` is out of range for a 64-bit float"
572                                ),
573                            )
574                            .with_note(
575                                "the literal does not fit a finite IEEE 754 double; \
576                                 the largest finite value is ~1.8e308",
577                            ));
578                        }
579                    }
580                }
581                tokens.push(Token { kind, span });
582                pos = span.end;
583            }
584            Err(()) => {
585                let slice = &source[span.range()];
586                let ch = slice.chars().next().unwrap_or('\0');
587                let err = if ch == '"' {
588                    CompileError::new(
589                        "bynk.lex.unterminated_string",
590                        span,
591                        "unterminated string literal",
592                    )
593                    .with_note(
594                        "string literals must close with `\"` on the same line; \
595                         supported escapes are `\\n`, `\\t`, `\\\"`, `\\\\`",
596                    )
597                } else {
598                    CompileError::new(
599                        "bynk.lex.unexpected_character",
600                        span,
601                        format!("unexpected character `{ch}`"),
602                    )
603                };
604                return Err(err);
605            }
606        }
607    }
608    Ok(tokens)
609}
610
611/// Like [`tokenize`], but with every interpolated-string token replaced by the
612/// tokens of its holes — each hole's bytes re-lexed and its token spans rebased
613/// to absolute source positions (the same rebase [`crate::parser`] applies when
614/// parsing a hole), recursing through nested interpolation. Chunk (literal) text
615/// between holes yields no tokens.
616///
617/// An interpolated string lexes to a single opaque `InterpStr` token, so the
618/// LSP's token-based cursor resolution (hover, go-to-definition, references,
619/// semantic tokens) is otherwise blind to identifiers inside `"… \(name) …"`.
620/// Expanding the holes makes those identifiers visible as ordinary `Ident`
621/// tokens with their real spans. (Issue #473.)
622///
623/// On a malformed interpolation (an `InterpStr` whose holes don't split, or a
624/// hole whose bytes don't re-lex) the offending token is kept opaque rather than
625/// dropped, so resolution degrades to the pre-fix behaviour instead of losing
626/// tokens.
627pub fn tokenize_expanding_holes(source: &str) -> Result<Vec<Token>, CompileError> {
628    let mut out = Vec::new();
629    for tok in tokenize(source)? {
630        expand_hole_token(source, tok, &mut out);
631    }
632    Ok(out)
633}
634
635/// Push `tok` onto `out`, expanding it into its holes' tokens if it is an
636/// `InterpStr` (see [`tokenize_expanding_holes`]); otherwise push it as-is.
637fn expand_hole_token(source: &str, tok: Token, out: &mut Vec<Token>) {
638    if tok.kind != TokenKind::InterpStr {
639        out.push(tok);
640        return;
641    }
642    let Ok(segments) = split_interp(source, tok.span) else {
643        out.push(tok); // malformed interpolation — keep the opaque token
644        return;
645    };
646    for segment in segments {
647        let InterpSegment::Hole(hole) = segment else {
648            continue; // chunk text carries no tokens
649        };
650        let Ok(hole_tokens) = tokenize(&source[hole.range()]) else {
651            continue;
652        };
653        for mut t in hole_tokens {
654            // Rebase the hole's local spans to absolute source positions.
655            t.span = Span::new(t.span.start + hole.start, t.span.end + hole.start);
656            expand_hole_token(source, t, out); // recurse for nested interpolation
657        }
658    }
659}
660
/// Cheap routing pre-scan (v0.43): does the string opening at `start` contain a
/// `\(` interpolation hole before it closes (or the line ends)? Decides whether
/// `tokenize` hand-scans the string as an `InterpStr` or defers to logos for a
/// plain `StrLit`. Deliberately tolerant — a malformed string with a hole is
/// routed here so the hole-aware scanner produces the precise error.
///
/// `start` is the index of the opening `"`. A backslash skips the byte it
/// escapes, so an escaped quote (`\"`) does not end the scan. A backslash that
/// is the last byte of the line (or of the input) ends the scan with `false`:
/// the newline terminates the string, so a `\(` on a following line never
/// counts as a hole of this one.
fn has_interp_hole(bytes: &[u8], start: usize) -> bool {
    let mut i = start + 1;
    while let Some(&byte) = bytes.get(i) {
        match byte {
            b'\n' | b'"' => return false,
            b'\\' => match bytes.get(i + 1) {
                Some(b'(') => return true,
                // The line (or the input) ends right after the backslash.
                Some(b'\n') | None => return false,
                // Any other escaped byte: step over the pair.
                Some(_) => i += 2,
            },
            _ => i += 1,
        }
    }
    false
}
682
683/// Scan a double-quoted string starting at `start` (the opening `"`), returning
684/// the byte offset just past the closing `"`. Recognises the four simple
685/// escapes plus `\(…)` interpolation holes, whose parens are balanced (and
686/// whose nested strings are skipped) by [`scan_hole`]. (v0.43.)
687fn scan_str(bytes: &[u8], source: &str, start: usize) -> Result<usize, CompileError> {
688    debug_assert_eq!(bytes[start], b'"');
689    let mut i = start + 1;
690    loop {
691        if i >= bytes.len() || bytes[i] == b'\n' {
692            return Err(CompileError::new(
693                "bynk.lex.unterminated_string",
694                Span::new(start, i.min(bytes.len())),
695                "unterminated string literal",
696            )
697            .with_note(
698                "string literals must close with `\"` on the same line; \
699                 supported escapes are `\\n`, `\\t`, `\\\"`, `\\\\`, and `\\(…)` interpolation",
700            ));
701        }
702        match bytes[i] {
703            b'"' => return Ok(i + 1),
704            b'\\' => match bytes.get(i + 1) {
705                Some(b'n' | b't' | b'"' | b'\\') => i += 2,
706                Some(b'(') => i = scan_hole(bytes, source, i + 2)?,
707                other => {
708                    let shown = other.map(|b| (*b as char).to_string()).unwrap_or_default();
709                    return Err(CompileError::new(
710                        "bynk.lex.bad_escape",
711                        Span::new(i, (i + 2).min(bytes.len())),
712                        format!("invalid escape sequence `\\{shown}` in string literal"),
713                    )
714                    .with_note("supported escapes: \\n \\t \\\" \\\\ \\(…)"));
715                }
716            },
717            // Any other byte advances one position. UTF-8 continuation bytes
718            // are all >= 0x80, so they never collide with the ASCII specials.
719            _ => i += 1,
720        }
721    }
722}
723
724/// Scan an interpolation hole body. `start` points just past the `\(`; returns
725/// the offset just past the matching `)`. Tracks paren depth and skips nested
726/// strings (whose own parens must not close the hole), recursing through
727/// [`scan_str`] so nested interpolation nests correctly. (v0.43.)
728fn scan_hole(bytes: &[u8], source: &str, start: usize) -> Result<usize, CompileError> {
729    let mut i = start;
730    let mut depth = 1usize;
731    loop {
732        if i >= bytes.len() || bytes[i] == b'\n' {
733            return Err(CompileError::new(
734                "bynk.lex.unterminated_interpolation",
735                Span::new(start.saturating_sub(2), i.min(bytes.len())),
736                "unterminated interpolation hole",
737            )
738            .with_note(
739                "an interpolation hole `\\(…)` must close with a matching `)` on the same line",
740            ));
741        }
742        match bytes[i] {
743            b'(' => {
744                depth += 1;
745                i += 1;
746            }
747            b')' => {
748                depth -= 1;
749                i += 1;
750                if depth == 0 {
751                    return Ok(i);
752                }
753            }
754            b'"' => i = scan_str(bytes, source, i)?,
755            _ => i += 1,
756        }
757    }
758}
759
/// One segment of a split interpolated string (v0.43): literal text (escapes
/// resolved) or the absolute source span of a hole's expression (the bytes
/// between `\(` and its matching `)`). The parser turns the latter into a real
/// `Expr`; the lexer owns only the scanning.
pub(crate) enum InterpSegment {
    /// Literal text between holes, with its escape sequences already resolved
    /// to the characters they stand for.
    Chunk(String),
    /// Span of a hole's expression text in the original source: it starts just
    /// after `\(` and ends just before the matching `)`.
    Hole(Span),
}
768
769/// Split an `InterpStr` token (its `span` covers the whole `"…"`) into chunks
770/// and hole spans. Escapes in the chunks are resolved here (mirroring
771/// [`parse_string_literal`]); holes are returned as spans for the parser to
772/// re-lex and parse as expressions. (v0.43.)
773pub(crate) fn split_interp(source: &str, span: Span) -> Result<Vec<InterpSegment>, CompileError> {
774    let bytes = source.as_bytes();
775    let inner_end = span.end - 1; // the closing `"`
776    let mut segments = Vec::new();
777    let mut chunk = String::new();
778    let mut i = span.start + 1; // past the opening `"`
779    while i < inner_end {
780        match bytes[i] {
781            b'\\' => match bytes[i + 1] {
782                b'n' => {
783                    chunk.push('\n');
784                    i += 2;
785                }
786                b't' => {
787                    chunk.push('\t');
788                    i += 2;
789                }
790                b'"' => {
791                    chunk.push('"');
792                    i += 2;
793                }
794                b'\\' => {
795                    chunk.push('\\');
796                    i += 2;
797                }
798                b'(' => {
799                    if !chunk.is_empty() {
800                        segments.push(InterpSegment::Chunk(std::mem::take(&mut chunk)));
801                    }
802                    let hole_start = i + 2;
803                    let after = scan_hole(bytes, source, hole_start)?;
804                    // `after` is one past the matching `)`; the hole body is
805                    // everything up to that `)`.
806                    segments.push(InterpSegment::Hole(Span::new(hole_start, after - 1)));
807                    i = after;
808                }
809                // The lexer already validated every escape, so nothing else
810                // can appear here.
811                other => unreachable!("unvalidated escape `\\{}` in InterpStr", other as char),
812            },
813            _ => {
814                let ch = source[i..].chars().next().unwrap();
815                chunk.push(ch);
816                i += ch.len_utf8();
817            }
818        }
819    }
820    if !chunk.is_empty() {
821        segments.push(InterpSegment::Chunk(chunk));
822    }
823    Ok(segments)
824}
825
826/// If a `---` doc-block marker line starts at or shortly after `pos` (which
827/// must be at a line boundary), return the byte offset just past the marker
828/// line (after the terminating newline, or at EOF). The doc-block grammar
829/// requires the marker to be alone on its line; leading horizontal whitespace
830/// is allowed and ignored.
831fn doc_block_open_at(source: &str, pos: usize) -> Option<usize> {
832    let bytes = source.as_bytes();
833    if !at_line_start(source, pos) {
834        return None;
835    }
836    // Skip leading horizontal whitespace.
837    let mut i = pos;
838    while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
839        i += 1;
840    }
841    if i + 3 > bytes.len() {
842        return None;
843    }
844    if &bytes[i..i + 3] != b"---" {
845        return None;
846    }
847    i += 3;
848    // The marker may have additional trailing dashes (per spec "three or more
849    // consecutive hyphens"). Consume them.
850    while i < bytes.len() && bytes[i] == b'-' {
851        i += 1;
852    }
853    // After the dashes, allow only horizontal whitespace then newline/EOF.
854    while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t' || bytes[i] == b'\r') {
855        i += 1;
856    }
857    if i == bytes.len() {
858        return Some(i);
859    }
860    if bytes[i] == b'\n' {
861        return Some(i + 1);
862    }
863    None
864}
865
866/// Find the next closing `---` line at or after `pos`. Returns
867/// `(start_of_line, end_of_line)` (`end_of_line` is just past the
868/// terminating newline, or at EOF).
869fn doc_block_close(source: &str, mut pos: usize) -> Option<(usize, usize)> {
870    let bytes = source.as_bytes();
871    while pos < bytes.len() {
872        // Advance pos to the start of a line.
873        let line_start = pos;
874        // Find the end of this line.
875        let mut line_end = line_start;
876        while line_end < bytes.len() && bytes[line_end] != b'\n' {
877            line_end += 1;
878        }
879        // Check this line.
880        if let Some(end) = doc_block_open_at(source, line_start) {
881            return Some((line_start, end));
882        }
883        // Move to the next line.
884        pos = if line_end < bytes.len() {
885            line_end + 1
886        } else {
887            line_end
888        };
889    }
890    None
891}
892
/// Reports whether byte offset `pos` sits at column 0: either the very start
/// of `source`, or immediately after a `\n`.
fn at_line_start(source: &str, pos: usize) -> bool {
    match pos.checked_sub(1) {
        // Offset 0 has no preceding byte and always starts a line.
        None => true,
        Some(prev) => source.as_bytes()[prev] == b'\n',
    }
}
901
902/// Extract the body content of a doc-block token from its source span.
903/// Strips the leading and trailing `---` marker lines and returns the body
904/// verbatim. If every non-empty content line begins with the same horizontal
905/// whitespace prefix (e.g., because the doc block sits inside a brace-form
906/// commons body), that common prefix is removed so the body reads naturally
907/// when emitted as JSDoc.
908pub fn doc_block_content(source: &str, span: Span) -> String {
909    let slice = &source[span.range()];
910    // Drop the first line (opening marker).
911    let after_open = match slice.find('\n') {
912        Some(i) => &slice[i + 1..],
913        None => return String::new(),
914    };
915    let bytes = after_open.as_bytes();
916    // Trim the trailing closing-marker line.
917    let mut i = bytes.len();
918    if i > 0 && bytes[i - 1] == b'\n' {
919        i -= 1;
920    }
921    while i > 0 && matches!(bytes[i - 1], b' ' | b'\t' | b'\r') {
922        i -= 1;
923    }
924    while i > 0 && bytes[i - 1] == b'-' {
925        i -= 1;
926    }
927    if i > 0 && bytes[i - 1] == b'\n' {
928        i -= 1;
929    }
930    let body = &after_open[..i];
931
932    // Compute the common leading-whitespace prefix across all non-empty lines
933    // and strip it. This lets writers indent the doc block alongside the
934    // declaration it documents without bleeding the indent into the JSDoc.
935    let common: Option<usize> = body
936        .lines()
937        .filter(|l| !l.trim().is_empty())
938        .map(|l| l.bytes().take_while(|&b| b == b' ' || b == b'\t').count())
939        .min();
940    let strip = common.unwrap_or(0);
941    if strip == 0 {
942        return body.to_string();
943    }
944    let mut out = String::with_capacity(body.len());
945    let mut first = true;
946    for line in body.lines() {
947        if !first {
948            out.push('\n');
949        }
950        first = false;
951        if line.trim().is_empty() {
952            // Preserve blank lines.
953            continue;
954        }
955        let leading: usize = line
956            .bytes()
957            .take_while(|&b| b == b' ' || b == b'\t')
958            .count();
959        let drop = strip.min(leading);
960        out.push_str(&line[drop..]);
961    }
962    out
963}
964
965/// Extract the body of a `Comment` trivia token: everything after the
966/// leading `--` marker, preserving its inline whitespace verbatim. Used by
967/// the parser when attaching comments to declarations.
968pub fn comment_body(source: &str, span: Span) -> &str {
969    let slice = &source[span.range()];
970    // Strip leading "--" if present (defensive — the lexer always emits
971    // Comment tokens whose span begins with `--`).
972    slice.strip_prefix("--").unwrap_or(slice)
973}
974
/// Reports whether a blank line separates byte offsets `from` (inclusive)
/// and `to` (exclusive) of `source`. Used by the parser to detect orphan doc
/// blocks.
///
/// A doc-block token's span ends just past the newline that terminates its
/// closing-marker line. When the next declaration starts on the very next
/// line, the gap holds nothing but optional indentation. A newline in the
/// gap therefore means at least one entirely blank line sits between the doc
/// block and the declaration.
///
/// The gap is read left to right and the first byte that is not a space,
/// tab or `\r` decides: `true` if it is a `\n`, `false` for anything else.
/// An empty or inverted range, or a gap of pure horizontal whitespace, is
/// `false`.
pub fn has_blank_line_between(source: &str, from: usize, to: usize) -> bool {
    let bytes = source.as_bytes();
    let first_significant = (from..to)
        .map(|offset| bytes[offset])
        .find(|&b| !matches!(b, b' ' | b'\t' | b'\r'));
    first_significant == Some(b'\n')
}
1001
#[cfg(test)]
mod tests {
    // Variants are spelled `T::…` rather than glob-imported: several of them
    // (`Some`, `None`, `Ok`, `Err`, `Option`, `Result`, `String`) share names
    // with prelude items.
    use super::TokenKind as T;
    use super::*;

    /// Lexes `source`, which must be valid, and returns every token's kind.
    fn kinds(source: &str) -> Vec<TokenKind> {
        let tokens = tokenize(source).unwrap();
        tokens.iter().map(|tok| tok.kind).collect()
    }

    /// Same as [`kinds`], but through the hole-expanding tokenizer.
    fn expanded_kinds(source: &str) -> Vec<TokenKind> {
        let tokens = tokenize_expanding_holes(source).unwrap();
        tokens.iter().map(|tok| tok.kind).collect()
    }

    /// Asserts that `source` lexes to exactly the `expected` kinds.
    #[track_caller]
    fn assert_kinds(source: &str, expected: &[TokenKind]) {
        assert_eq!(kinds(source), expected);
    }

    /// Asserts that lexing `source` fails with the given error category.
    #[track_caller]
    fn assert_lex_error(source: &str, category: &'static str) {
        let err = tokenize(source).unwrap_err();
        assert_eq!(err.category, category);
    }

    #[test]
    fn keywords_and_idents() {
        assert_kinds(
            "commons type fn where and true false Int String Bool foo bar",
            &[
                T::Commons,
                T::Type,
                T::Fn,
                T::Where,
                T::And,
                T::True,
                T::False,
                T::Int,
                T::String,
                T::Bool,
                T::Ident,
                T::Ident,
            ],
        );
    }

    #[test]
    fn integer_and_string_literals() {
        assert_kinds(
            r#"0 42 "hello" "with\nescape""#,
            &[T::IntLit, T::IntLit, T::StrLit, T::StrLit],
        );
    }

    #[test]
    fn operators() {
        assert_kinds(
            "-> == != <= >= && || + - * / ! = < > ( ) { } [ ] , : . @",
            &[
                T::Arrow,
                T::EqEq,
                T::BangEq,
                T::LtEq,
                T::GtEq,
                T::AmpAmp,
                T::PipePipe,
                T::Plus,
                T::Minus,
                T::Star,
                T::Slash,
                T::Bang,
                T::Eq,
                T::Lt,
                T::Gt,
                T::LParen,
                T::RParen,
                T::LBrace,
                T::RBrace,
                T::LBracket,
                T::RBracket,
                T::Comma,
                T::Colon,
                T::Dot,
                T::At,
            ],
        );
    }

    #[test]
    fn line_comments_emitted_as_trivia() {
        // v1.1: a line comment survives lexing as a `Comment` token, which is
        // what lets the formatter attach and re-emit it.
        assert_kinds(
            "-- a comment\ntype X = Int -- trailing\n",
            &[T::Comment, T::Type, T::Ident, T::Eq, T::Int, T::Comment],
        );
    }

    #[test]
    fn comment_body_extracts_text_after_marker() {
        let src = "-- hello world\n";
        let toks = tokenize(src).unwrap();
        assert_eq!(toks.len(), 1);
        let only = &toks[0];
        assert_eq!(only.kind, T::Comment);
        assert_eq!(comment_body(src, only.span), " hello world");
    }

    #[test]
    fn comment_does_not_consume_newline() {
        // Adjacent comment lines stay two separate tokens: the newline
        // between them belongs to neither comment's span.
        assert_kinds("-- one\n-- two\n", &[T::Comment, T::Comment]);
    }

    #[test]
    fn unterminated_string_is_error() {
        assert_lex_error("\"oops\n", "bynk.lex.unterminated_string");
    }

    #[test]
    fn integer_overflow_is_error() {
        assert_lex_error("99999999999999999999", "bynk.lex.integer_overflow");
    }

    #[test]
    fn digit_separators_lex_as_one_number() {
        // v0.142 (ADR 0166): an `_` between digit groups does not split the
        // literal, for Int and Float alike.
        assert_kinds("1_048_576", &[T::IntLit]);
        assert_kinds("1_000.500_5", &[T::FloatLit]);
        assert_kinds("1_000e1_0", &[T::FloatLit]);
        // An in-range literal with separators is accepted; its value is
        // checked once the separators are stripped.
        assert!(tokenize("9_223_372_036_854_775_807").is_ok());
        // Out-of-range is still detected on the separator-free digits.
        assert_lex_error("9_999_999_999_999_999_999_9", "bynk.lex.integer_overflow");
    }

    #[test]
    fn strip_digit_separators_removes_underscores() {
        for &(lexeme, digits) in &[("1_048_576", "1048576"), ("42", "42")] {
            assert_eq!(strip_digit_separators(lexeme), digits);
        }
    }

    #[test]
    fn unexpected_character_is_error() {
        assert_lex_error("type X = Int $", "bynk.lex.unexpected_character");
    }

    #[test]
    fn v0_1_keywords() {
        assert_kinds(
            "let if else Ok Err Result ValidationError",
            &[
                T::Let,
                T::If,
                T::Else,
                T::Ok,
                T::Err,
                T::Result,
                T::ValidationError,
            ],
        );
    }

    #[test]
    fn question_token() {
        assert_kinds("x?", &[T::Ident, T::Question]);
    }

    #[test]
    fn v0_2_keywords() {
        assert_kinds(
            "enum match Option record self Some None is",
            &[
                T::Enum,
                T::Match,
                T::Option,
                T::Record,
                T::Self_,
                T::Some,
                T::None,
                T::Is,
            ],
        );
    }

    #[test]
    fn pipe_and_pipe_pipe_disambiguated() {
        assert_kinds("| || |", &[T::Pipe, T::PipePipe, T::Pipe]);
    }

    #[test]
    fn v0_7_keywords() {
        assert_kinds("expect suite case", &[T::Expect, T::Suite, T::Case]);
        // v0.118 retired `mocks` and `wires`; they lex as plain identifiers.
        assert_kinds("mocks wires", &[T::Ident, T::Ident]);
    }

    #[test]
    fn fat_arrow_and_underscore() {
        assert_kinds("_ =>", &[T::Underscore, T::FatArrow]);
    }

    // -- v0.43 string interpolation --

    #[test]
    fn interp_string_is_one_token() {
        assert_kinds(r#""Hello, \(name)!""#, &[T::InterpStr]);
        // Without a hole the string takes the logos path and stays a `StrLit`.
        assert_kinds(r#""Hello, world""#, &[T::StrLit]);
    }

    #[test]
    fn interp_balances_nested_parens_and_strings() {
        // The call's own `)` in `f(x)` must not end the hole early.
        assert_kinds(r#""= \(f(x))""#, &[T::InterpStr]);
        // Nor must a `)` that sits inside a string nested in the hole.
        assert_kinds(r#""= \(label(")"))""#, &[T::InterpStr]);
        // An interpolated string nested inside a hole.
        assert_kinds(r#""out \("in \(x)")""#, &[T::InterpStr]);
    }

    // Issue #473: hole-expanding tokenisation makes identifiers inside `\(…)`
    // visible to the LSP's token-based cursor resolution.
    #[test]
    fn expanding_holes_exposes_hole_identifiers() {
        // The opaque `InterpStr` gives way to the tokens of its hole; the
        // surrounding chunk text (`Hello, ` / `!`) contributes none.
        assert_eq!(expanded_kinds(r#""Hello, \(name)!""#), [T::Ident]);
        // A call inside a hole exposes each token of the call expression.
        assert_eq!(
            expanded_kinds(r#""= \(f(x))""#),
            [T::Ident, T::LParen, T::Ident, T::RParen]
        );
        // Nested interpolation recurses down to the innermost identifier.
        assert_eq!(expanded_kinds(r#""out \("in \(x)")""#), [T::Ident]);
        // A string with no hole is left as it was.
        assert_eq!(expanded_kinds(r#""Hello, world""#), [T::StrLit]);
    }

    #[test]
    fn expanding_holes_rebases_spans_to_absolute() {
        let src = r#""Hello, \(name)!""#;
        let toks = tokenize_expanding_holes(src).unwrap();
        let at = toks
            .iter()
            .position(|tok| tok.kind == T::Ident)
            .expect("the hole identifier is exposed");
        let span = toks[at].span;
        // The span addresses `name` in the original source rather than
        // starting from a hole-local offset of 0.
        assert_eq!(&src[span.range()], "name");
        assert_eq!(span.start, src.find("name").unwrap());
    }

    #[test]
    fn escaped_open_paren_is_not_a_hole() {
        // `\\(` reads as an escaped backslash followed by a literal `(`, so
        // there is no hole and the logos path yields a plain `StrLit`.
        assert_kinds(r#""a \\(b) c""#, &[T::StrLit]);
    }

    #[test]
    fn unterminated_hole_is_an_error() {
        // The line ends before the hole's closing `)` shows up.
        assert_lex_error(
            "\"value \\(x + 1\n\"",
            "bynk.lex.unterminated_interpolation",
        );
    }

    #[test]
    fn unterminated_interp_string_is_an_error() {
        // The hole is closed, but a newline arrives before the closing `"`.
        assert_lex_error("\"value \\(x) more\n", "bynk.lex.unterminated_string");
    }

    #[test]
    fn bad_escape_in_interp_string_is_an_error() {
        assert_lex_error(r#""a \q \(x)""#, "bynk.lex.bad_escape");
    }
}
bynk_syntax/lexer.rs

bynk_syntax/
lexer.rs