bynk_syntax/
lexer.rs

1//! Lexer for Bynk v0.
2//!
3//! Token kinds correspond to the terminals defined in the grammar (spec §3
4//! and §4). Whitespace is skipped; line comments are emitted as `Comment`
5//! tokens so the formatter can preserve them through round-trips (v1.1 LSP
6//! spec §3.5). Doc blocks (`---`) are emitted as `DocBlock` tokens, lexed
7//! outside of logos (see [`tokenize`]).
8
9use logos::Logos;
10
11use crate::error::CompileError;
12use crate::span::Span;
13
14/// Token kinds. Discriminants without payload data; the lexeme is recovered
15/// from the source string via the token's [`Span`].
16///
17/// Note: `--` line comments and `---` doc block markers are handled outside
18/// logos (see [`tokenize`]), because doc blocks are delimited by `---` lines
19/// containing only the marker and may span multiple source lines.
20#[derive(Logos, Debug, Clone, Copy, PartialEq, Eq)]
21#[logos(skip r"[ \t\r\n]+")]
22pub enum TokenKind {
23    // Keywords
24    #[token("commons")]
25    Commons,
26    #[token("type")]
27    Type,
28    #[token("fn")]
29    Fn,
30    #[token("where")]
31    Where,
32    #[token("and")]
33    And,
34    #[token("true")]
35    True,
36    #[token("false")]
37    False,
38    #[token("Int")]
39    Int,
40    #[token("String")]
41    String,
42    #[token("Bool")]
43    Bool,
44    // v0.21 keyword
45    #[token("Float")]
46    Float,
47    // v0.86 keyword (ADR 0112): the `Duration` base type.
48    #[token("Duration")]
49    Duration,
50    // v0.90 keyword (ADR 0114): the `Instant` base type.
51    #[token("Instant")]
52    Instant,
53    // v0.110 keyword (ADR 0142): the `Bytes` base type.
54    #[token("Bytes")]
55    Bytes,
56    // v0.1 keywords
57    #[token("let")]
58    Let,
59    #[token("if")]
60    If,
61    #[token("else")]
62    Else,
63    #[token("Ok")]
64    Ok,
65    #[token("Err")]
66    Err,
67    #[token("Result")]
68    Result,
69    #[token("ValidationError")]
70    ValidationError,
71    // v0.22b keyword
72    #[token("JsonError")]
73    JsonError,
74    // v0.2 keywords
75    #[token("enum")]
76    Enum,
77    #[token("match")]
78    Match,
79    #[token("Option")]
80    Option,
81    #[token("record")]
82    Record,
83    #[token("self")]
84    Self_,
85    #[token("Some")]
86    Some,
87    #[token("None")]
88    None,
89    #[token("is")]
90    Is,
91    // v0.3 keywords
92    #[token("opaque")]
93    Opaque,
94    #[token("uses")]
95    Uses,
96    // v0.4 keywords
97    #[token("context")]
98    Context,
99    #[token("consumes")]
100    Consumes,
101    #[token("exports")]
102    Exports,
103    #[token("transparent")]
104    Transparent,
105    // v0.6 keywords
106    #[token("as")]
107    As,
108    // v0.7 keywords (v0.112: `assert`→`expect`, `test`→`suite`/`case`;
109    // v0.118: `mocks` retired — test doubles are `provides` at a seam)
110    #[token("expect")]
111    Expect,
112    #[token("suite")]
113    Suite,
114    #[token("case")]
115    Case,
116    // v0.114 keyword — generative tests (testing track slice 2). `for` and `all`
117    // are deliberately *not* keywords: `all` is a list combinator (`all(xs, p)`)
118    // and must stay a usable identifier. The `for all` binder is parsed
119    // contextually (two identifiers) inside a `property` body instead.
120    #[token("property")]
121    Property,
122    // v0.17 keywords
123    #[token("adapter")]
124    Adapter,
125    #[token("binding")]
126    Binding,
127    // v0.5 keywords
128    #[token("agent")]
129    Agent,
130    #[token("capability")]
131    Capability,
132    #[token("Effect")]
133    Effect,
134    #[token("given")]
135    Given,
136    #[token("on")]
137    On,
138    // v0.9 keyword
139    #[token("http")]
140    Http,
141    // v0.10a keyword
142    #[token("cron")]
143    Cron,
144    // v0.10b keyword
145    #[token("queue")]
146    Queue,
147    // v0.44 keywords: `from` heads a service's protocol clause; `protocol` is
148    // reserved (protocols are a closed, compiler-known set — no declaration kind).
149    #[token("from")]
150    From,
151    #[token("protocol")]
152    Protocol,
153    #[token("provides")]
154    Provides,
155    #[token("service")]
156    Service,
157    // v0.45 keywords: `actor` heads a boundary-contract declaration; `by`
158    // heads a handler's actor clause.
159    #[token("actor")]
160    Actor,
161    #[token("by")]
162    By,
163    // v0.80 keywords: `invariant` heads an agent invariant declaration; `implies`
164    // is the directional logical-implication operator (`P implies Q` ≡ `!P || Q`).
165    #[token("invariant")]
166    Invariant,
167    #[token("implies")]
168    Implies,
169    // v0.115 keywords — function contracts (testing track slice 3). `requires`
170    // and `ensures` head a contract clause on a `fn` signature (between the
171    // return type and the body). `result` is deliberately *not* a keyword: it is
172    // the ordinary value name outside a contract, so it stays a usable
173    // identifier; inside an `ensures` predicate it is bound contextually as the
174    // function's return value (parsed by scope, like `for`/`all` in slice 2).
175    // Distinct from ADR 0127's capability `@requires` annotation.
176    #[token("requires")]
177    Requires,
178    #[token("ensures")]
179    Ensures,
180    // v0.116 keyword — step invariants (testing track slice 4). `transition` heads
181    // an agent step-invariant declaration (beside `invariant`), a predicate over
182    // the pre- and post-commit state pair. `old` and `new` are deliberately *not*
183    // keywords: they stay ordinary value names outside a `transition`, and inside a
184    // `transition` predicate they are bound contextually to the old/new state
185    // records (parsed by scope, like `result` in an `ensures`).
186    #[token("transition")]
187    Transition,
188    /// `...` — used in record-spread expressions (v0.5).
189    #[token("...")]
190    DotDotDot,
191    /// `<-` — Effect bind operator (v0.5).
192    #[token("<-")]
193    LArrow,
194    /// `~>` — asynchronous fire-and-forget send marker (v0.79). A leading
195    /// statement marker, never on the RHS of a `let`; distinct from `<-` so the
196    /// call site shows whether the caller waits.
197    #[token("~>")]
198    TildeArrow,
199    /// `:=` — Cell write (v0.81, storage track). A handler statement
200    /// `cell := expr`; distinct from `=` (binding) and `:` (annotation). Longer
201    /// than `:`/`=` so logos matches it as one token.
202    #[token(":=")]
203    ColonEq,
204
205    /// A documentation block: `---` line ... `---` line. The token's span
206    /// covers the full block including both `---` markers. The body content
207    /// is recovered from the source via the span (see [`doc_block_content`]).
208    /// Inserted by [`tokenize`]; not lexed by logos directly.
209    DocBlock,
210
211    /// A line comment: `-- ...` running to end of line. The span starts at
212    /// the `--` marker and runs through the last character before the
213    /// terminating newline (exclusive). The trivia body (the text after the
214    /// `--` marker) is recovered from the source via the span. Inserted by
215    /// [`tokenize`]; not lexed by logos directly so it cannot be mistaken
216    /// for an `--` operator sequence.
217    Comment,
218
219    // Identifier
220    #[regex(r"[A-Za-z][A-Za-z0-9_]*")]
221    Ident,
222
223    // Literals
224    #[regex(r"[0-9]+")]
225    IntLit,
226    // A float literal: fraction with a digit on both sides of the `.`, an
227    // exponent, or both (v0.21 §3). `1.` and `.5` are NOT float literals —
228    // the digit-both-sides rule keeps `2.5.round()` / `1.toFloat()` lexing
229    // as method calls on numeric literals.
230    #[regex(r"[0-9]+\.[0-9]+([eE][+-]?[0-9]+)?|[0-9]+[eE][+-]?[0-9]+")]
231    FloatLit,
232    // A double-quoted string with simple escapes. The body excludes the closing
233    // quote; we accept any non-quote/non-backslash/non-newline char, or a
234    // backslash followed by one of the four allowed escapes.
235    #[regex(r#""([^"\\\n]|\\[nt"\\])*""#)]
236    StrLit,
237    // An interpolated string `"… \(expr) …"` (v0.43). Hand-scanned in
238    // `tokenize` (logos cannot balance the holes' parens), never produced by
239    // the logos lexer — like [`TokenKind::DocBlock`]/[`TokenKind::Comment`].
240    // The span covers the whole `"…"`; the parser splits chunks from holes.
241    InterpStr,
242
243    // Multi-char operators
244    #[token("->")]
245    Arrow,
246    #[token("==")]
247    EqEq,
248    #[token("!=")]
249    BangEq,
250    #[token("<=")]
251    LtEq,
252    #[token(">=")]
253    GtEq,
254    #[token("&&")]
255    AmpAmp,
256    #[token("||")]
257    PipePipe,
258
259    // Single-char operators
260    #[token("+")]
261    Plus,
262    #[token("-")]
263    Minus,
264    #[token("*")]
265    Star,
266    #[token("/")]
267    Slash,
268    #[token("!")]
269    Bang,
270    #[token("=")]
271    Eq,
272    #[token("<")]
273    Lt,
274    #[token(">")]
275    Gt,
276    // v0.1 postfix operator
277    #[token("?")]
278    Question,
279    // v0.2 match-arm arrow
280    #[token("=>")]
281    FatArrow,
282    // v0.2 wildcard pattern (also valid as identifier start; the lexer
283    // prefers identifier for any longer match, so `_foo` is still Ident).
284    #[token("_")]
285    Underscore,
286    // v0.2 sum-type variant separator (also used as future bitwise OR);
287    // single `|` distinct from `||`.
288    #[token("|")]
289    Pipe,
290    /// `@` — storage-annotation marker (v0.85, storage track; ADR 0111). Leads a
291    /// `@name(args)` annotation on a `store` field (`@ttl(…)`/`@indexed(…)`); it
292    /// appears only in store-field-declaration position, never as an expression
293    /// operator.
294    #[token("@")]
295    At,
296
297    // Punctuation
298    #[token("(")]
299    LParen,
300    #[token(")")]
301    RParen,
302    #[token("{")]
303    LBrace,
304    #[token("}")]
305    RBrace,
306    #[token("[")]
307    LBracket,
308    #[token("]")]
309    RBracket,
310    #[token(",")]
311    Comma,
312    #[token(":")]
313    Colon,
314    #[token(".")]
315    Dot,
316}
317
318impl TokenKind {
319    /// Human-readable display name for diagnostics.
320    pub fn describe(self) -> &'static str {
321        use TokenKind::*;
322        match self {
323            Commons => "`commons`",
324            Type => "`type`",
325            Fn => "`fn`",
326            Where => "`where`",
327            And => "`and`",
328            True => "`true`",
329            False => "`false`",
330            Int => "`Int`",
331            String => "`String`",
332            Bool => "`Bool`",
333            Float => "`Float`",
334            Duration => "`Duration`",
335            Instant => "`Instant`",
336            Bytes => "`Bytes`",
337            Let => "`let`",
338            If => "`if`",
339            Else => "`else`",
340            Ok => "`Ok`",
341            Err => "`Err`",
342            Result => "`Result`",
343            ValidationError => "`ValidationError`",
344            JsonError => "`JsonError`",
345            Enum => "`enum`",
346            Match => "`match`",
347            Option => "`Option`",
348            Record => "`record`",
349            Self_ => "`self`",
350            Some => "`Some`",
351            None => "`None`",
352            Is => "`is`",
353            Opaque => "`opaque`",
354            Uses => "`uses`",
355            Context => "`context`",
356            Consumes => "`consumes`",
357            Exports => "`exports`",
358            Transparent => "`transparent`",
359            As => "`as`",
360            Expect => "`expect`",
361            Suite => "`suite`",
362            Case => "`case`",
363            Property => "`property`",
364            Adapter => "`adapter`",
365            Binding => "`binding`",
366            Agent => "`agent`",
367            Capability => "`capability`",
368            Effect => "`Effect`",
369            Given => "`given`",
370            On => "`on`",
371            Http => "`http`",
372            Cron => "`cron`",
373            Queue => "`queue`",
374            From => "`from`",
375            Protocol => "`protocol`",
376            Provides => "`provides`",
377            Service => "`service`",
378            Actor => "`actor`",
379            By => "`by`",
380            Invariant => "`invariant`",
381            Implies => "`implies`",
382            Requires => "`requires`",
383            Ensures => "`ensures`",
384            Transition => "`transition`",
385            ColonEq => "`:=`",
386            DotDotDot => "`...`",
387            LArrow => "`<-`",
388            TildeArrow => "`~>`",
389            DocBlock => "documentation block",
390            Comment => "line comment",
391            Ident => "identifier",
392            IntLit => "integer literal",
393            FloatLit => "float literal",
394            StrLit => "string literal",
395            InterpStr => "interpolated string",
396            Arrow => "`->`",
397            EqEq => "`==`",
398            BangEq => "`!=`",
399            LtEq => "`<=`",
400            GtEq => "`>=`",
401            AmpAmp => "`&&`",
402            PipePipe => "`||`",
403            Plus => "`+`",
404            Minus => "`-`",
405            Star => "`*`",
406            Slash => "`/`",
407            Bang => "`!`",
408            Eq => "`=`",
409            Lt => "`<`",
410            Gt => "`>`",
411            Question => "`?`",
412            FatArrow => "`=>`",
413            Underscore => "`_`",
414            Pipe => "`|`",
415            At => "`@`",
416            LParen => "`(`",
417            RParen => "`)`",
418            LBrace => "`{`",
419            RBrace => "`}`",
420            LBracket => "`[`",
421            RBracket => "`]`",
422            Comma => "`,`",
423            Colon => "`:`",
424            Dot => "`.`",
425        }
426    }
427}
428
429/// A token plus its source span.
430#[derive(Debug, Clone, Copy)]
431pub struct Token {
432    pub kind: TokenKind,
433    pub span: Span,
434}
435
436/// Tokenise a source string. Returns the full token vector or the first
437/// lexical error.
438///
439/// Doc blocks (`---` ... `---`) and line comments (`-- ...`) are recognised
440/// outside the logos-generated lexer: we scan the source one segment at a
441/// time, dispatching to logos for ordinary tokens between non-token spans.
442pub fn tokenize(source: &str) -> Result<Vec<Token>, CompileError> {
443    let mut tokens = Vec::new();
444    let bytes = source.as_bytes();
445    let mut pos = 0;
446    while pos < bytes.len() {
447        // Detect a `---` doc-block marker at the start of a line (the line may
448        // begin with leading whitespace; the marker itself must be alone on
449        // its line).
450        if let Some(open_end) = doc_block_open_at(source, pos) {
451            // Find the matching closing `---` line.
452            match doc_block_close(source, open_end) {
453                Some((close_start, close_end)) => {
454                    let span = Span::new(pos, close_end);
455                    tokens.push(Token {
456                        kind: TokenKind::DocBlock,
457                        span,
458                    });
459                    let _ = close_start;
460                    pos = close_end;
461                    continue;
462                }
463                None => {
464                    return Err(CompileError::new(
465                        "bynk.lex.unclosed_doc_block",
466                        Span::new(pos, open_end),
467                        "documentation block opened but never closed",
468                    )
469                    .with_note(
470                        "a doc block must be terminated by another `---` on a line by itself",
471                    ));
472                }
473            }
474        }
475        // A `--` line comment: emit a `Comment` token covering everything
476        // up to (but not including) the terminating newline. Doc-block
477        // detection above already ruled out a `---` marker at line start
478        // — and once we've consumed past the leading `--`, any further
479        // dashes are part of the comment body. Preserving comments as
480        // trivia tokens lets the parser attach them to declarations so
481        // the formatter can emit them in place (v1.1 LSP spec §3.5).
482        if pos + 1 < bytes.len() && bytes[pos] == b'-' && bytes[pos + 1] == b'-' {
483            let start = pos;
484            while pos < bytes.len() && bytes[pos] != b'\n' {
485                pos += 1;
486            }
487            tokens.push(Token {
488                kind: TokenKind::Comment,
489                span: Span::new(start, pos),
490            });
491            continue;
492        }
493        // Skip ordinary whitespace inline (logos handles it too, but we may
494        // be in the middle of the source between specials).
495        if matches!(bytes[pos], b' ' | b'\t' | b'\r' | b'\n') {
496            pos += 1;
497            continue;
498        }
499        // An interpolated string `"… \(expr) …"` (v0.43): only strings that
500        // actually contain a `\(` hole are hand-scanned here; plain strings
501        // fall through to the logos `StrLit` path unchanged. `\(` is an
502        // invalid escape in the logos grammar, so this never re-routes a
503        // currently-valid literal.
504        if bytes[pos] == b'"' && has_interp_hole(bytes, pos) {
505            let end = scan_str(bytes, source, pos)?;
506            tokens.push(Token {
507                kind: TokenKind::InterpStr,
508                span: Span::new(pos, end),
509            });
510            pos = end;
511            continue;
512        }
513        // Otherwise dispatch a single logos token starting at `pos`.
514        let mut lex = TokenKind::lexer(&source[pos..]);
515        let Some(result) = lex.next() else {
516            // No token at this position; treat as unexpected character so
517            // the user sees something useful.
518            let ch = source[pos..].chars().next().unwrap_or('\0');
519            let span = Span::new(pos, pos + ch.len_utf8());
520            return Err(CompileError::new(
521                "bynk.lex.unexpected_character",
522                span,
523                format!("unexpected character `{ch}`"),
524            ));
525        };
526        let local = lex.span();
527        let span: Span = Span::new(pos + local.start, pos + local.end);
528        match result {
529            Ok(kind) => {
530                if kind == TokenKind::IntLit {
531                    let slice = &source[span.range()];
532                    if slice.parse::<i64>().is_err() {
533                        return Err(CompileError::new(
534                            "bynk.lex.integer_overflow",
535                            span,
536                            format!(
537                                "integer literal `{slice}` is out of range for a 64-bit signed integer"
538                            ),
539                        )
540                        .with_note("the range is -2^63 to 2^63 - 1"));
541                    }
542                }
543                if kind == TokenKind::FloatLit {
544                    let slice = &source[span.range()];
545                    match slice.parse::<f64>() {
546                        Ok(v) if v.is_finite() => {}
547                        _ => {
548                            return Err(CompileError::new(
549                                "bynk.lex.float_literal_overflow",
550                                span,
551                                format!(
552                                    "float literal `{slice}` is out of range for a 64-bit float"
553                                ),
554                            )
555                            .with_note(
556                                "the literal does not fit a finite IEEE 754 double; \
557                                 the largest finite value is ~1.8e308",
558                            ));
559                        }
560                    }
561                }
562                tokens.push(Token { kind, span });
563                pos = span.end;
564            }
565            Err(()) => {
566                let slice = &source[span.range()];
567                let ch = slice.chars().next().unwrap_or('\0');
568                let err = if ch == '"' {
569                    CompileError::new(
570                        "bynk.lex.unterminated_string",
571                        span,
572                        "unterminated string literal",
573                    )
574                    .with_note(
575                        "string literals must close with `\"` on the same line; \
576                         supported escapes are `\\n`, `\\t`, `\\\"`, `\\\\`",
577                    )
578                } else {
579                    CompileError::new(
580                        "bynk.lex.unexpected_character",
581                        span,
582                        format!("unexpected character `{ch}`"),
583                    )
584                };
585                return Err(err);
586            }
587        }
588    }
589    Ok(tokens)
590}
591
592/// Like [`tokenize`], but with every interpolated-string token replaced by the
593/// tokens of its holes — each hole's bytes re-lexed and its token spans rebased
594/// to absolute source positions (the same rebase [`crate::parser`] applies when
595/// parsing a hole), recursing through nested interpolation. Chunk (literal) text
596/// between holes yields no tokens.
597///
598/// An interpolated string lexes to a single opaque `InterpStr` token, so the
599/// LSP's token-based cursor resolution (hover, go-to-definition, references,
600/// semantic tokens) is otherwise blind to identifiers inside `"… \(name) …"`.
601/// Expanding the holes makes those identifiers visible as ordinary `Ident`
602/// tokens with their real spans. (Issue #473.)
603///
604/// On a malformed interpolation (an `InterpStr` whose holes don't split, or a
605/// hole whose bytes don't re-lex) the offending token is kept opaque rather than
606/// dropped, so resolution degrades to the pre-fix behaviour instead of losing
607/// tokens.
608pub fn tokenize_expanding_holes(source: &str) -> Result<Vec<Token>, CompileError> {
609    let mut out = Vec::new();
610    for tok in tokenize(source)? {
611        expand_hole_token(source, tok, &mut out);
612    }
613    Ok(out)
614}
615
616/// Push `tok` onto `out`, expanding it into its holes' tokens if it is an
617/// `InterpStr` (see [`tokenize_expanding_holes`]); otherwise push it as-is.
618fn expand_hole_token(source: &str, tok: Token, out: &mut Vec<Token>) {
619    if tok.kind != TokenKind::InterpStr {
620        out.push(tok);
621        return;
622    }
623    let Ok(segments) = split_interp(source, tok.span) else {
624        out.push(tok); // malformed interpolation — keep the opaque token
625        return;
626    };
627    for segment in segments {
628        let InterpSegment::Hole(hole) = segment else {
629            continue; // chunk text carries no tokens
630        };
631        let Ok(hole_tokens) = tokenize(&source[hole.range()]) else {
632            continue;
633        };
634        for mut t in hole_tokens {
635            // Rebase the hole's local spans to absolute source positions.
636            t.span = Span::new(t.span.start + hole.start, t.span.end + hole.start);
637            expand_hole_token(source, t, out); // recurse for nested interpolation
638        }
639    }
640}
641
/// Routing pre-scan (v0.43): reports whether the string whose opening quote is
/// at `start` contains a `\(` interpolation hole before its first unescaped
/// `"`, the end of the line, or the end of input.
///
/// [`tokenize`] uses the answer to choose between hand-scanning an `InterpStr`
/// and leaving the string to the logos `StrLit` rule. The scan is deliberately
/// lenient: it validates nothing, so a malformed string that does contain a
/// hole is still routed to the hole-aware scanner, which then reports the
/// precise error.
fn has_interp_hole(bytes: &[u8], start: usize) -> bool {
    let mut cursor = start + 1; // first byte after the opening quote
    while let Some(&byte) = bytes.get(cursor) {
        if byte == b'"' || byte == b'\n' {
            return false;
        }
        if byte != b'\\' {
            cursor += 1;
            continue;
        }
        if bytes.get(cursor + 1).copied() == Some(b'(') {
            return true;
        }
        // Some other escape: step over the backslash and the byte it escapes,
        // so an escaped quote does not end the scan.
        cursor += 2;
    }
    false
}
663
664/// Scan a double-quoted string starting at `start` (the opening `"`), returning
665/// the byte offset just past the closing `"`. Recognises the four simple
666/// escapes plus `\(…)` interpolation holes, whose parens are balanced (and
667/// whose nested strings are skipped) by [`scan_hole`]. (v0.43.)
668fn scan_str(bytes: &[u8], source: &str, start: usize) -> Result<usize, CompileError> {
669    debug_assert_eq!(bytes[start], b'"');
670    let mut i = start + 1;
671    loop {
672        if i >= bytes.len() || bytes[i] == b'\n' {
673            return Err(CompileError::new(
674                "bynk.lex.unterminated_string",
675                Span::new(start, i.min(bytes.len())),
676                "unterminated string literal",
677            )
678            .with_note(
679                "string literals must close with `\"` on the same line; \
680                 supported escapes are `\\n`, `\\t`, `\\\"`, `\\\\`, and `\\(…)` interpolation",
681            ));
682        }
683        match bytes[i] {
684            b'"' => return Ok(i + 1),
685            b'\\' => match bytes.get(i + 1) {
686                Some(b'n' | b't' | b'"' | b'\\') => i += 2,
687                Some(b'(') => i = scan_hole(bytes, source, i + 2)?,
688                other => {
689                    let shown = other.map(|b| (*b as char).to_string()).unwrap_or_default();
690                    return Err(CompileError::new(
691                        "bynk.lex.bad_escape",
692                        Span::new(i, (i + 2).min(bytes.len())),
693                        format!("invalid escape sequence `\\{shown}` in string literal"),
694                    )
695                    .with_note("supported escapes: \\n \\t \\\" \\\\ \\(…)"));
696                }
697            },
698            // Any other byte advances one position. UTF-8 continuation bytes
699            // are all >= 0x80, so they never collide with the ASCII specials.
700            _ => i += 1,
701        }
702    }
703}
704
705/// Scan an interpolation hole body. `start` points just past the `\(`; returns
706/// the offset just past the matching `)`. Tracks paren depth and skips nested
707/// strings (whose own parens must not close the hole), recursing through
708/// [`scan_str`] so nested interpolation nests correctly. (v0.43.)
709fn scan_hole(bytes: &[u8], source: &str, start: usize) -> Result<usize, CompileError> {
710    let mut i = start;
711    let mut depth = 1usize;
712    loop {
713        if i >= bytes.len() || bytes[i] == b'\n' {
714            return Err(CompileError::new(
715                "bynk.lex.unterminated_interpolation",
716                Span::new(start.saturating_sub(2), i.min(bytes.len())),
717                "unterminated interpolation hole",
718            )
719            .with_note(
720                "an interpolation hole `\\(…)` must close with a matching `)` on the same line",
721            ));
722        }
723        match bytes[i] {
724            b'(' => {
725                depth += 1;
726                i += 1;
727            }
728            b')' => {
729                depth -= 1;
730                i += 1;
731                if depth == 0 {
732                    return Ok(i);
733                }
734            }
735            b'"' => i = scan_str(bytes, source, i)?,
736            _ => i += 1,
737        }
738    }
739}
740
741/// One segment of a split interpolated string (v0.43): literal text (escapes
742/// resolved) or the absolute source span of a hole's expression (the bytes
743/// between `\(` and its matching `)`). The parser turns the latter into a real
744/// `Expr`; the lexer owns only the scanning.
745pub(crate) enum InterpSegment {
746    Chunk(String),
747    Hole(Span),
748}
749
750/// Split an `InterpStr` token (its `span` covers the whole `"…"`) into chunks
751/// and hole spans. Escapes in the chunks are resolved here (mirroring
752/// [`parse_string_literal`]); holes are returned as spans for the parser to
753/// re-lex and parse as expressions. (v0.43.)
754pub(crate) fn split_interp(source: &str, span: Span) -> Result<Vec<InterpSegment>, CompileError> {
755    let bytes = source.as_bytes();
756    let inner_end = span.end - 1; // the closing `"`
757    let mut segments = Vec::new();
758    let mut chunk = String::new();
759    let mut i = span.start + 1; // past the opening `"`
760    while i < inner_end {
761        match bytes[i] {
762            b'\\' => match bytes[i + 1] {
763                b'n' => {
764                    chunk.push('\n');
765                    i += 2;
766                }
767                b't' => {
768                    chunk.push('\t');
769                    i += 2;
770                }
771                b'"' => {
772                    chunk.push('"');
773                    i += 2;
774                }
775                b'\\' => {
776                    chunk.push('\\');
777                    i += 2;
778                }
779                b'(' => {
780                    if !chunk.is_empty() {
781                        segments.push(InterpSegment::Chunk(std::mem::take(&mut chunk)));
782                    }
783                    let hole_start = i + 2;
784                    let after = scan_hole(bytes, source, hole_start)?;
785                    // `after` is one past the matching `)`; the hole body is
786                    // everything up to that `)`.
787                    segments.push(InterpSegment::Hole(Span::new(hole_start, after - 1)));
788                    i = after;
789                }
790                // The lexer already validated every escape, so nothing else
791                // can appear here.
792                other => unreachable!("unvalidated escape `\\{}` in InterpStr", other as char),
793            },
794            _ => {
795                let ch = source[i..].chars().next().unwrap();
796                chunk.push(ch);
797                i += ch.len_utf8();
798            }
799        }
800    }
801    if !chunk.is_empty() {
802        segments.push(InterpSegment::Chunk(chunk));
803    }
804    Ok(segments)
805}
806
807/// If a `---` doc-block marker line starts at or shortly after `pos` (which
808/// must be at a line boundary), return the byte offset just past the marker
809/// line (after the terminating newline, or at EOF). The doc-block grammar
810/// requires the marker to be alone on its line; leading horizontal whitespace
811/// is allowed and ignored.
812fn doc_block_open_at(source: &str, pos: usize) -> Option<usize> {
813    let bytes = source.as_bytes();
814    if !at_line_start(source, pos) {
815        return None;
816    }
817    // Skip leading horizontal whitespace.
818    let mut i = pos;
819    while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
820        i += 1;
821    }
822    if i + 3 > bytes.len() {
823        return None;
824    }
825    if &bytes[i..i + 3] != b"---" {
826        return None;
827    }
828    i += 3;
829    // The marker may have additional trailing dashes (per spec "three or more
830    // consecutive hyphens"). Consume them.
831    while i < bytes.len() && bytes[i] == b'-' {
832        i += 1;
833    }
834    // After the dashes, allow only horizontal whitespace then newline/EOF.
835    while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t' || bytes[i] == b'\r') {
836        i += 1;
837    }
838    if i == bytes.len() {
839        return Some(i);
840    }
841    if bytes[i] == b'\n' {
842        return Some(i + 1);
843    }
844    None
845}
846
847/// Find the next closing `---` line at or after `pos`. Returns
848/// `(start_of_line, end_of_line)` (`end_of_line` is just past the
849/// terminating newline, or at EOF).
850fn doc_block_close(source: &str, mut pos: usize) -> Option<(usize, usize)> {
851    let bytes = source.as_bytes();
852    while pos < bytes.len() {
853        // Advance pos to the start of a line.
854        let line_start = pos;
855        // Find the end of this line.
856        let mut line_end = line_start;
857        while line_end < bytes.len() && bytes[line_end] != b'\n' {
858            line_end += 1;
859        }
860        // Check this line.
861        if let Some(end) = doc_block_open_at(source, line_start) {
862            return Some((line_start, end));
863        }
864        // Move to the next line.
865        pos = if line_end < bytes.len() {
866            line_end + 1
867        } else {
868            line_end
869        };
870    }
871    None
872}
873
/// Reports whether byte offset `pos` is the first column of a line: either
/// the very start of `source`, or the byte right after a `\n`.
///
/// Panics if `pos` is greater than `source.len()`.
fn at_line_start(source: &str, pos: usize) -> bool {
    pos == 0 || source.as_bytes()[pos - 1] == b'\n'
}
882
883/// Extract the body content of a doc-block token from its source span.
884/// Strips the leading and trailing `---` marker lines and returns the body
885/// verbatim. If every non-empty content line begins with the same horizontal
886/// whitespace prefix (e.g., because the doc block sits inside a brace-form
887/// commons body), that common prefix is removed so the body reads naturally
888/// when emitted as JSDoc.
889pub fn doc_block_content(source: &str, span: Span) -> String {
890    let slice = &source[span.range()];
891    // Drop the first line (opening marker).
892    let after_open = match slice.find('\n') {
893        Some(i) => &slice[i + 1..],
894        None => return String::new(),
895    };
896    let bytes = after_open.as_bytes();
897    // Trim the trailing closing-marker line.
898    let mut i = bytes.len();
899    if i > 0 && bytes[i - 1] == b'\n' {
900        i -= 1;
901    }
902    while i > 0 && matches!(bytes[i - 1], b' ' | b'\t' | b'\r') {
903        i -= 1;
904    }
905    while i > 0 && bytes[i - 1] == b'-' {
906        i -= 1;
907    }
908    if i > 0 && bytes[i - 1] == b'\n' {
909        i -= 1;
910    }
911    let body = &after_open[..i];
912
913    // Compute the common leading-whitespace prefix across all non-empty lines
914    // and strip it. This lets writers indent the doc block alongside the
915    // declaration it documents without bleeding the indent into the JSDoc.
916    let common: Option<usize> = body
917        .lines()
918        .filter(|l| !l.trim().is_empty())
919        .map(|l| l.bytes().take_while(|&b| b == b' ' || b == b'\t').count())
920        .min();
921    let strip = common.unwrap_or(0);
922    if strip == 0 {
923        return body.to_string();
924    }
925    let mut out = String::with_capacity(body.len());
926    let mut first = true;
927    for line in body.lines() {
928        if !first {
929            out.push('\n');
930        }
931        first = false;
932        if line.trim().is_empty() {
933            // Preserve blank lines.
934            continue;
935        }
936        let leading: usize = line
937            .bytes()
938            .take_while(|&b| b == b' ' || b == b'\t')
939            .count();
940        let drop = strip.min(leading);
941        out.push_str(&line[drop..]);
942    }
943    out
944}
945
946/// Extract the body of a `Comment` trivia token: everything after the
947/// leading `--` marker, preserving its inline whitespace verbatim. Used by
948/// the parser when attaching comments to declarations.
949pub fn comment_body(source: &str, span: Span) -> &str {
950    let slice = &source[span.range()];
951    // Strip leading "--" if present (defensive — the lexer always emits
952    // Comment tokens whose span begins with `--`).
953    slice.strip_prefix("--").unwrap_or(slice)
954}
955
/// Reports whether a blank line separates byte offsets `from` (inclusive)
/// and `to` (exclusive) of `source`. Used by the parser to detect orphan
/// doc blocks.
///
/// The range is scanned left to right: spaces, tabs and carriage returns
/// are skipped, a `\n` reached that way answers `true`, and any other byte
/// answers `false`. An empty or inverted range answers `false`.
///
/// Why a single newline is enough: a doc-block token's span already ends
/// just past the newline of its closing-marker line. If the declaration
/// starts on the very next line, the gap holds only indentation, so a
/// newline inside the gap means a whitespace-only line sits in between.
pub fn has_blank_line_between(source: &str, from: usize, to: usize) -> bool {
    let bytes = source.as_bytes();
    for offset in from..to {
        match bytes[offset] {
            b'\n' => return true,
            b' ' | b'\t' | b'\r' => {}
            _ => return false,
        }
    }
    false
}
982
// Unit tests for the lexer. Each test feeds a small source string to
// `tokenize` (or `tokenize_expanding_holes`) and checks either the resulting
// token kinds or the category of the reported error.
#[cfg(test)]
mod tests {
    use super::*;

    // Test helper: tokenize `source`, panicking on a lex error, and keep only
    // each token's kind.
    fn kinds(source: &str) -> Vec<TokenKind> {
        tokenize(source)
            .unwrap()
            .into_iter()
            .map(|t| t.kind)
            .collect()
    }

    // Keywords get their dedicated kinds; other words lex as `Ident`.
    #[test]
    fn keywords_and_idents() {
        use TokenKind::*;
        assert_eq!(
            kinds("commons type fn where and true false Int String Bool foo bar"),
            vec![
                Commons, Type, Fn, Where, And, True, False, Int, String, Bool, Ident, Ident
            ],
        );
    }

    // Integer literals and plain strings (including one with an escape).
    #[test]
    fn integer_and_string_literals() {
        use TokenKind::*;
        assert_eq!(
            kinds(r#"0 42 "hello" "with\nescape""#),
            vec![IntLit, IntLit, StrLit, StrLit]
        );
    }

    // Every operator and punctuation token, one- and two-character forms.
    #[test]
    fn operators() {
        use TokenKind::*;
        assert_eq!(
            kinds("-> == != <= >= && || + - * / ! = < > ( ) { } [ ] , : . @"),
            vec![
                Arrow, EqEq, BangEq, LtEq, GtEq, AmpAmp, PipePipe, Plus, Minus, Star, Slash, Bang,
                Eq, Lt, Gt, LParen, RParen, LBrace, RBrace, LBracket, RBracket, Comma, Colon, Dot,
                At,
            ],
        );
    }

    #[test]
    fn line_comments_emitted_as_trivia() {
        // v1.1: line comments are preserved as Comment tokens so the
        // formatter can attach and re-emit them.
        use TokenKind::*;
        let src = "-- a comment\ntype X = Int -- trailing\n";
        assert_eq!(kinds(src), vec![Comment, Type, Ident, Eq, Int, Comment],);
    }

    // `comment_body` drops the `--` marker but keeps the space after it; the
    // expected text also shows the trailing newline is not in the span.
    #[test]
    fn comment_body_extracts_text_after_marker() {
        let toks = tokenize("-- hello world\n").unwrap();
        assert_eq!(toks.len(), 1);
        assert_eq!(toks[0].kind, TokenKind::Comment);
        assert_eq!(
            comment_body("-- hello world\n", toks[0].span),
            " hello world"
        );
    }

    #[test]
    fn comment_does_not_consume_newline() {
        // Two adjacent comment lines should produce two distinct tokens
        // — the newline between them is not part of either comment's span.
        let toks = tokenize("-- one\n-- two\n").unwrap();
        assert_eq!(toks.len(), 2);
        assert!(toks.iter().all(|t| t.kind == TokenKind::Comment));
    }

    // A string that hits a newline before its closing quote is rejected.
    #[test]
    fn unterminated_string_is_error() {
        let err = tokenize("\"oops\n").unwrap_err();
        assert_eq!(err.category, "bynk.lex.unterminated_string");
    }

    // A 20-digit literal is reported as an integer overflow.
    #[test]
    fn integer_overflow_is_error() {
        let err = tokenize("99999999999999999999").unwrap_err();
        assert_eq!(err.category, "bynk.lex.integer_overflow");
    }

    // `$` is not part of the token set.
    #[test]
    fn unexpected_character_is_error() {
        let err = tokenize("type X = Int $").unwrap_err();
        assert_eq!(err.category, "bynk.lex.unexpected_character");
    }

    // Keywords introduced in v0.1.
    #[test]
    fn v0_1_keywords() {
        use TokenKind::*;
        assert_eq!(
            kinds("let if else Ok Err Result ValidationError"),
            vec![Let, If, Else, Ok, Err, Result, ValidationError],
        );
    }

    // `?` directly after an identifier is its own `Question` token.
    #[test]
    fn question_token() {
        use TokenKind::*;
        assert_eq!(kinds("x?"), vec![Ident, Question]);
    }

    // Keywords introduced in v0.2.
    #[test]
    fn v0_2_keywords() {
        use TokenKind::*;
        assert_eq!(
            kinds("enum match Option record self Some None is"),
            vec![Enum, Match, Option, Record, Self_, Some, None, Is],
        );
    }

    // A lone `|` is `Pipe`; `||` is the single token `PipePipe`.
    #[test]
    fn pipe_and_pipe_pipe_disambiguated() {
        use TokenKind::*;
        assert_eq!(kinds("| || |"), vec![Pipe, PipePipe, Pipe]);
    }

    // Keywords introduced in v0.7, plus two words that are no longer keywords.
    #[test]
    fn v0_7_keywords() {
        use TokenKind::*;
        assert_eq!(kinds("expect suite case"), vec![Expect, Suite, Case],);
        // v0.118: `mocks` and `wires` are retired — plain identifiers now.
        assert_eq!(kinds("mocks wires"), vec![Ident, Ident]);
    }

    // `_` and `=>` each lex as a dedicated token.
    #[test]
    fn fat_arrow_and_underscore() {
        use TokenKind::*;
        assert_eq!(kinds("_ =>"), vec![Underscore, FatArrow]);
    }

    // -- v0.43 string interpolation --

    // A string containing a `\(…)` hole lexes as a single `InterpStr` token.
    #[test]
    fn interp_string_is_one_token() {
        use TokenKind::*;
        assert_eq!(kinds(r#""Hello, \(name)!""#), vec![InterpStr]);
        // A plain string (no hole) stays a `StrLit`, via the logos path.
        assert_eq!(kinds(r#""Hello, world""#), vec![StrLit]);
    }

    // Parentheses and string literals nested inside a hole do not end the
    // hole or the enclosing string early.
    #[test]
    fn interp_balances_nested_parens_and_strings() {
        use TokenKind::*;
        // The `)` inside `f(x)` must not close the hole early.
        assert_eq!(kinds(r#""= \(f(x))""#), vec![InterpStr]);
        // A `)` inside a nested string inside the hole is also ignored.
        assert_eq!(kinds(r#""= \(label(")"))""#), vec![InterpStr]);
        // A nested interpolated string inside a hole.
        assert_eq!(kinds(r#""out \("in \(x)")""#), vec![InterpStr]);
    }

    // Issue #473: hole-expanding tokenisation makes identifiers inside `\(…)`
    // visible to the LSP's token-based cursor resolution.
    #[test]
    fn expanding_holes_exposes_hole_identifiers() {
        use TokenKind::*;
        // Local helper: token kinds produced by `tokenize_expanding_holes`.
        let expand = |src: &str| {
            tokenize_expanding_holes(src)
                .unwrap()
                .into_iter()
                .map(|t| t.kind)
                .collect::<Vec<_>>()
        };
        // The opaque `InterpStr` is replaced by its hole's tokens; the chunk
        // text (`Hello, ` / `!`) carries none.
        assert_eq!(expand(r#""Hello, \(name)!""#), vec![Ident]);
        // A call hole exposes every token of the call expression.
        assert_eq!(expand(r#""= \(f(x))""#), vec![Ident, LParen, Ident, RParen]);
        // Nested interpolation recurses to the innermost hole's identifier.
        assert_eq!(expand(r#""out \("in \(x)")""#), vec![Ident]);
        // A plain (hole-free) string is untouched.
        assert_eq!(expand(r#""Hello, world""#), vec![StrLit]);
    }

    // Tokens exposed from a hole carry spans that index the original source.
    #[test]
    fn expanding_holes_rebases_spans_to_absolute() {
        let src = r#""Hello, \(name)!""#;
        let toks = tokenize_expanding_holes(src).unwrap();
        let ident = toks
            .iter()
            .find(|t| t.kind == TokenKind::Ident)
            .expect("the hole identifier is exposed");
        // The span points at `name` in the original source, not a hole-local 0.
        assert_eq!(&src[ident.span.range()], "name");
        assert_eq!(ident.span.start, src.find("name").unwrap());
    }

    #[test]
    fn escaped_open_paren_is_not_a_hole() {
        use TokenKind::*;
        // `\\(` is a literal backslash followed by `(` — no hole, so the
        // string lexes as a plain `StrLit` on the logos path.
        assert_eq!(kinds(r#""a \\(b) c""#), vec![StrLit]);
    }

    #[test]
    fn unterminated_hole_is_an_error() {
        // The hole runs to end of line without its closing `)`.
        let err = tokenize("\"value \\(x + 1\n\"").unwrap_err();
        assert_eq!(err.category, "bynk.lex.unterminated_interpolation");
    }

    #[test]
    fn unterminated_interp_string_is_an_error() {
        // A hole closes but the string never does (newline before the `"`).
        let err = tokenize("\"value \\(x) more\n").unwrap_err();
        assert_eq!(err.category, "bynk.lex.unterminated_string");
    }

    // An unknown escape (`\q`) is rejected even when the string also holds a
    // valid hole.
    #[test]
    fn bad_escape_in_interp_string_is_an_error() {
        let err = tokenize(r#""a \q \(x)""#).unwrap_err();
        assert_eq!(err.category, "bynk.lex.bad_escape");
    }
}
bynk_syntax/lexer.rs

bynk_syntax/
lexer.rs