bynk_syntax/
lexer.rs

1//! Lexer for Bynk v0.
2//!
3//! Token kinds correspond to the terminals defined in the grammar (spec §3
4//! and §4). Whitespace is skipped; line comments are emitted as `Comment`
5//! tokens so the formatter can preserve them through round-trips (v1.1 LSP
6//! spec §3.5). Doc blocks (`---`) are emitted as `DocBlock` tokens, lexed
7//! outside of logos (see [`tokenize`]).
8
9use logos::Logos;
10
11use crate::error::CompileError;
12use crate::span::Span;
13
/// Token kinds. Discriminants without payload data; the lexeme is recovered
/// from the source string via the token's [`Span`].
///
/// Variants carrying a `#[token]`/`#[regex]` attribute are matched by the
/// logos-generated lexer. `DocBlock`, `Comment` and `InterpStr` carry no
/// attribute: they are recognised by hand in [`tokenize`] and pushed into the
/// token stream directly.
///
/// Note: `--` line comments and `---` doc block markers are handled outside
/// logos (see [`tokenize`]), because doc blocks are delimited by `---` lines
/// containing only the marker and may span multiple source lines.
#[derive(Logos, Debug, Clone, Copy, PartialEq, Eq)]
#[logos(skip r"[ \t\r\n]+")]
pub enum TokenKind {
    // Keywords
    #[token("commons")]
    Commons,
    #[token("type")]
    Type,
    #[token("fn")]
    Fn,
    #[token("where")]
    Where,
    #[token("and")]
    And,
    #[token("true")]
    True,
    #[token("false")]
    False,
    #[token("Int")]
    Int,
    #[token("String")]
    String,
    #[token("Bool")]
    Bool,
    // v0.21 keyword
    #[token("Float")]
    Float,
    // v0.86 keyword (ADR 0112): the `Duration` base type.
    #[token("Duration")]
    Duration,
    // v0.90 keyword (ADR 0114): the `Instant` base type.
    #[token("Instant")]
    Instant,
    // v0.1 keywords
    #[token("let")]
    Let,
    #[token("if")]
    If,
    #[token("else")]
    Else,
    #[token("Ok")]
    Ok,
    #[token("Err")]
    Err,
    #[token("Result")]
    Result,
    #[token("ValidationError")]
    ValidationError,
    // v0.22b keyword
    #[token("JsonError")]
    JsonError,
    // v0.2 keywords
    #[token("enum")]
    Enum,
    #[token("match")]
    Match,
    #[token("Option")]
    Option,
    #[token("record")]
    Record,
    // Named `Self_` because `Self` is a reserved word in Rust.
    #[token("self")]
    Self_,
    #[token("Some")]
    Some,
    #[token("None")]
    None,
    #[token("is")]
    Is,
    // v0.3 keywords
    #[token("opaque")]
    Opaque,
    #[token("uses")]
    Uses,
    // v0.4 keywords
    #[token("context")]
    Context,
    #[token("consumes")]
    Consumes,
    #[token("exports")]
    Exports,
    #[token("transparent")]
    Transparent,
    // v0.6 keywords
    #[token("as")]
    As,
    // v0.7 keywords
    #[token("assert")]
    Assert,
    #[token("expect")]
    Expect,
    #[token("mocks")]
    Mocks,
    #[token("test")]
    Test,
    // v0.16 keyword
    #[token("wires")]
    Wires,
    // v0.17 keywords
    #[token("adapter")]
    Adapter,
    #[token("binding")]
    Binding,
    // v0.5 keywords
    #[token("agent")]
    Agent,
    #[token("capability")]
    Capability,
    #[token("Effect")]
    Effect,
    #[token("given")]
    Given,
    #[token("on")]
    On,
    // v0.9 keyword
    #[token("http")]
    Http,
    // v0.10a keyword
    #[token("cron")]
    Cron,
    // v0.10b keyword
    #[token("queue")]
    Queue,
    // v0.44 keywords: `from` heads a service's protocol clause; `protocol` is
    // reserved (protocols are a closed, compiler-known set — no declaration kind).
    #[token("from")]
    From,
    #[token("protocol")]
    Protocol,
    #[token("provides")]
    Provides,
    #[token("service")]
    Service,
    // v0.45 keywords: `actor` heads a boundary-contract declaration; `by`
    // heads a handler's actor clause.
    #[token("actor")]
    Actor,
    #[token("by")]
    By,
    // v0.80 keywords: `invariant` heads an agent invariant declaration; `implies`
    // is the directional logical-implication operator (`P implies Q` ≡ `!P || Q`).
    #[token("invariant")]
    Invariant,
    #[token("implies")]
    Implies,
    /// `...` — used in record-spread expressions (v0.5).
    #[token("...")]
    DotDotDot,
    /// `<-` — Effect bind operator (v0.5).
    #[token("<-")]
    LArrow,
    /// `~>` — asynchronous fire-and-forget send marker (v0.79). A leading
    /// statement marker, never on the RHS of a `let`; distinct from `<-` so the
    /// call site shows whether the caller waits.
    #[token("~>")]
    TildeArrow,
    /// `:=` — Cell write (v0.81, storage track). A handler statement
    /// `cell := expr`; distinct from `=` (binding) and `:` (annotation). Longer
    /// than `:`/`=` so logos matches it as one token.
    #[token(":=")]
    ColonEq,

    /// A documentation block: `---` line ... `---` line. The token's span
    /// covers the full block including both `---` markers. The body content
    /// is recovered from the source via the span (see [`doc_block_content`]).
    /// Inserted by [`tokenize`]; not lexed by logos directly.
    DocBlock,

    /// A line comment: `-- ...` running to end of line. The span starts at
    /// the `--` marker and runs through the last character before the
    /// terminating newline (exclusive). The trivia body (the text after the
    /// `--` marker) is recovered from the source via the span. Inserted by
    /// [`tokenize`]; not lexed by logos directly so it cannot be mistaken
    /// for an `--` operator sequence.
    Comment,

    // Identifier: an ASCII letter followed by any number of ASCII letters,
    // digits or underscores. A leading `_` or digit is not accepted here.
    #[regex(r"[A-Za-z][A-Za-z0-9_]*")]
    Ident,

    // Literals
    // An integer literal is a run of ASCII digits with no sign; `tokenize`
    // additionally rejects lexemes that do not parse as an `i64`.
    #[regex(r"[0-9]+")]
    IntLit,
    // A float literal: fraction with a digit on both sides of the `.`, an
    // exponent, or both (v0.21 §3). `1.` and `.5` are NOT float literals —
    // the digit-both-sides rule keeps `2.5.round()` / `1.toFloat()` lexing
    // as method calls on numeric literals.
    #[regex(r"[0-9]+\.[0-9]+([eE][+-]?[0-9]+)?|[0-9]+[eE][+-]?[0-9]+")]
    FloatLit,
    // A double-quoted string with simple escapes. The body excludes the closing
    // quote; we accept any non-quote/non-backslash/non-newline char, or a
    // backslash followed by one of the four allowed escapes.
    #[regex(r#""([^"\\\n]|\\[nt"\\])*""#)]
    StrLit,
    // An interpolated string `"… \(expr) …"` (v0.43). Hand-scanned in
    // `tokenize` (logos cannot balance the holes' parens), never produced by
    // the logos lexer — like [`TokenKind::DocBlock`]/[`TokenKind::Comment`].
    // The span covers the whole `"…"`; the parser splits chunks from holes.
    InterpStr,

    // Multi-char operators
    #[token("->")]
    Arrow,
    #[token("==")]
    EqEq,
    #[token("!=")]
    BangEq,
    #[token("<=")]
    LtEq,
    #[token(">=")]
    GtEq,
    #[token("&&")]
    AmpAmp,
    #[token("||")]
    PipePipe,

    // Single-char operators
    #[token("+")]
    Plus,
    #[token("-")]
    Minus,
    #[token("*")]
    Star,
    #[token("/")]
    Slash,
    #[token("!")]
    Bang,
    #[token("=")]
    Eq,
    #[token("<")]
    Lt,
    #[token(">")]
    Gt,
    // v0.1 postfix operator
    #[token("?")]
    Question,
    // v0.2 match-arm arrow
    #[token("=>")]
    FatArrow,
    // v0.2 wildcard pattern.
    // NOTE(review): this comment previously stated that `_` is also a valid
    // identifier start and that `_foo` lexes as a single `Ident`. The `Ident`
    // regex in this enum (`[A-Za-z][A-Za-z0-9_]*`) does not accept a leading
    // `_`, so as written `_foo` should lex as `Underscore` followed by
    // `Ident` — confirm against the grammar which behaviour is intended.
    #[token("_")]
    Underscore,
    // v0.2 sum-type variant separator (also used as future bitwise OR);
    // single `|` distinct from `||`.
    #[token("|")]
    Pipe,
    /// `@` — storage-annotation marker (v0.85, storage track; ADR 0111). Leads a
    /// `@name(args)` annotation on a `store` field (`@ttl(…)`/`@indexed(…)`); it
    /// appears only in store-field-declaration position, never as an expression
    /// operator.
    #[token("@")]
    At,

    // Punctuation
    #[token("(")]
    LParen,
    #[token(")")]
    RParen,
    #[token("{")]
    LBrace,
    #[token("}")]
    RBrace,
    #[token("[")]
    LBracket,
    #[token("]")]
    RBracket,
    #[token(",")]
    Comma,
    #[token(":")]
    Colon,
    #[token(".")]
    Dot,
}
293
impl TokenKind {
    /// Human-readable display name for diagnostics.
    ///
    /// Keywords, operators and punctuation are rendered as their exact lexeme
    /// wrapped in backticks; token classes whose lexeme varies (identifiers,
    /// literals, comments, doc blocks) are rendered as a plain-English noun
    /// phrase instead. The match is exhaustive with no wildcard arm, so adding
    /// a variant to [`TokenKind`] without a description is a compile error.
    pub fn describe(self) -> &'static str {
        use TokenKind::*;
        match self {
            // Keywords and built-in type names.
            Commons => "`commons`",
            Type => "`type`",
            Fn => "`fn`",
            Where => "`where`",
            And => "`and`",
            True => "`true`",
            False => "`false`",
            Int => "`Int`",
            String => "`String`",
            Bool => "`Bool`",
            Float => "`Float`",
            Duration => "`Duration`",
            Instant => "`Instant`",
            Let => "`let`",
            If => "`if`",
            Else => "`else`",
            Ok => "`Ok`",
            Err => "`Err`",
            Result => "`Result`",
            ValidationError => "`ValidationError`",
            JsonError => "`JsonError`",
            Enum => "`enum`",
            Match => "`match`",
            Option => "`Option`",
            Record => "`record`",
            Self_ => "`self`",
            Some => "`Some`",
            None => "`None`",
            Is => "`is`",
            Opaque => "`opaque`",
            Uses => "`uses`",
            Context => "`context`",
            Consumes => "`consumes`",
            Exports => "`exports`",
            Transparent => "`transparent`",
            As => "`as`",
            Assert => "`assert`",
            Expect => "`expect`",
            Mocks => "`mocks`",
            Test => "`test`",
            Wires => "`wires`",
            Adapter => "`adapter`",
            Binding => "`binding`",
            Agent => "`agent`",
            Capability => "`capability`",
            Effect => "`Effect`",
            Given => "`given`",
            On => "`on`",
            Http => "`http`",
            Cron => "`cron`",
            Queue => "`queue`",
            From => "`from`",
            Protocol => "`protocol`",
            Provides => "`provides`",
            Service => "`service`",
            Actor => "`actor`",
            By => "`by`",
            Invariant => "`invariant`",
            Implies => "`implies`",
            // Multi-character markers declared alongside the keywords.
            ColonEq => "`:=`",
            DotDotDot => "`...`",
            LArrow => "`<-`",
            TildeArrow => "`~>`",
            // Trivia and variable-lexeme tokens: described by class, not text.
            DocBlock => "documentation block",
            Comment => "line comment",
            Ident => "identifier",
            IntLit => "integer literal",
            FloatLit => "float literal",
            StrLit => "string literal",
            InterpStr => "interpolated string",
            // Operators.
            Arrow => "`->`",
            EqEq => "`==`",
            BangEq => "`!=`",
            LtEq => "`<=`",
            GtEq => "`>=`",
            AmpAmp => "`&&`",
            PipePipe => "`||`",
            Plus => "`+`",
            Minus => "`-`",
            Star => "`*`",
            Slash => "`/`",
            Bang => "`!`",
            Eq => "`=`",
            Lt => "`<`",
            Gt => "`>`",
            Question => "`?`",
            FatArrow => "`=>`",
            Underscore => "`_`",
            Pipe => "`|`",
            At => "`@`",
            // Punctuation.
            LParen => "`(`",
            RParen => "`)`",
            LBrace => "`{`",
            RBrace => "`}`",
            LBracket => "`[`",
            RBracket => "`]`",
            Comma => "`,`",
            Colon => "`:`",
            Dot => "`.`",
        }
    }
}
401
/// A token plus its source span.
///
/// The token stores no text of its own; the lexeme is obtained by slicing the
/// original source with `span`.
#[derive(Debug, Clone, Copy)]
pub struct Token {
    /// Which terminal (or trivia class) this token is.
    pub kind: TokenKind,
    /// Byte range of the token within the source string passed to [`tokenize`].
    pub span: Span,
}
408
409/// Tokenise a source string. Returns the full token vector or the first
410/// lexical error.
411///
412/// Doc blocks (`---` ... `---`) and line comments (`-- ...`) are recognised
413/// outside the logos-generated lexer: we scan the source one segment at a
414/// time, dispatching to logos for ordinary tokens between non-token spans.
415pub fn tokenize(source: &str) -> Result<Vec<Token>, CompileError> {
416    let mut tokens = Vec::new();
417    let bytes = source.as_bytes();
418    let mut pos = 0;
419    while pos < bytes.len() {
420        // Detect a `---` doc-block marker at the start of a line (the line may
421        // begin with leading whitespace; the marker itself must be alone on
422        // its line).
423        if let Some(open_end) = doc_block_open_at(source, pos) {
424            // Find the matching closing `---` line.
425            match doc_block_close(source, open_end) {
426                Some((close_start, close_end)) => {
427                    let span = Span::new(pos, close_end);
428                    tokens.push(Token {
429                        kind: TokenKind::DocBlock,
430                        span,
431                    });
432                    let _ = close_start;
433                    pos = close_end;
434                    continue;
435                }
436                None => {
437                    return Err(CompileError::new(
438                        "bynk.lex.unclosed_doc_block",
439                        Span::new(pos, open_end),
440                        "documentation block opened but never closed",
441                    )
442                    .with_note(
443                        "a doc block must be terminated by another `---` on a line by itself",
444                    ));
445                }
446            }
447        }
448        // A `--` line comment: emit a `Comment` token covering everything
449        // up to (but not including) the terminating newline. Doc-block
450        // detection above already ruled out a `---` marker at line start
451        // — and once we've consumed past the leading `--`, any further
452        // dashes are part of the comment body. Preserving comments as
453        // trivia tokens lets the parser attach them to declarations so
454        // the formatter can emit them in place (v1.1 LSP spec §3.5).
455        if pos + 1 < bytes.len() && bytes[pos] == b'-' && bytes[pos + 1] == b'-' {
456            let start = pos;
457            while pos < bytes.len() && bytes[pos] != b'\n' {
458                pos += 1;
459            }
460            tokens.push(Token {
461                kind: TokenKind::Comment,
462                span: Span::new(start, pos),
463            });
464            continue;
465        }
466        // Skip ordinary whitespace inline (logos handles it too, but we may
467        // be in the middle of the source between specials).
468        if matches!(bytes[pos], b' ' | b'\t' | b'\r' | b'\n') {
469            pos += 1;
470            continue;
471        }
472        // An interpolated string `"… \(expr) …"` (v0.43): only strings that
473        // actually contain a `\(` hole are hand-scanned here; plain strings
474        // fall through to the logos `StrLit` path unchanged. `\(` is an
475        // invalid escape in the logos grammar, so this never re-routes a
476        // currently-valid literal.
477        if bytes[pos] == b'"' && has_interp_hole(bytes, pos) {
478            let end = scan_str(bytes, source, pos)?;
479            tokens.push(Token {
480                kind: TokenKind::InterpStr,
481                span: Span::new(pos, end),
482            });
483            pos = end;
484            continue;
485        }
486        // Otherwise dispatch a single logos token starting at `pos`.
487        let mut lex = TokenKind::lexer(&source[pos..]);
488        let Some(result) = lex.next() else {
489            // No token at this position; treat as unexpected character so
490            // the user sees something useful.
491            let ch = source[pos..].chars().next().unwrap_or('\0');
492            let span = Span::new(pos, pos + ch.len_utf8());
493            return Err(CompileError::new(
494                "bynk.lex.unexpected_character",
495                span,
496                format!("unexpected character `{ch}`"),
497            ));
498        };
499        let local = lex.span();
500        let span: Span = Span::new(pos + local.start, pos + local.end);
501        match result {
502            Ok(kind) => {
503                if kind == TokenKind::IntLit {
504                    let slice = &source[span.range()];
505                    if slice.parse::<i64>().is_err() {
506                        return Err(CompileError::new(
507                            "bynk.lex.integer_overflow",
508                            span,
509                            format!(
510                                "integer literal `{slice}` is out of range for a 64-bit signed integer"
511                            ),
512                        )
513                        .with_note("the range is -2^63 to 2^63 - 1"));
514                    }
515                }
516                if kind == TokenKind::FloatLit {
517                    let slice = &source[span.range()];
518                    match slice.parse::<f64>() {
519                        Ok(v) if v.is_finite() => {}
520                        _ => {
521                            return Err(CompileError::new(
522                                "bynk.lex.float_literal_overflow",
523                                span,
524                                format!(
525                                    "float literal `{slice}` is out of range for a 64-bit float"
526                                ),
527                            )
528                            .with_note(
529                                "the literal does not fit a finite IEEE 754 double; \
530                                 the largest finite value is ~1.8e308",
531                            ));
532                        }
533                    }
534                }
535                tokens.push(Token { kind, span });
536                pos = span.end;
537            }
538            Err(()) => {
539                let slice = &source[span.range()];
540                let ch = slice.chars().next().unwrap_or('\0');
541                let err = if ch == '"' {
542                    CompileError::new(
543                        "bynk.lex.unterminated_string",
544                        span,
545                        "unterminated string literal",
546                    )
547                    .with_note(
548                        "string literals must close with `\"` on the same line; \
549                         supported escapes are `\\n`, `\\t`, `\\\"`, `\\\\`",
550                    )
551                } else {
552                    CompileError::new(
553                        "bynk.lex.unexpected_character",
554                        span,
555                        format!("unexpected character `{ch}`"),
556                    )
557                };
558                return Err(err);
559            }
560        }
561    }
562    Ok(tokens)
563}
564
/// Cheap routing pre-scan (v0.43): does the string opening at `start` contain a
/// `\(` interpolation hole before it closes (or the line ends)? Decides whether
/// `tokenize` hand-scans the string as an `InterpStr` or defers to logos for a
/// plain `StrLit`. Deliberately tolerant — a malformed string with a hole is
/// routed here so the hole-aware scanner produces the precise error.
///
/// `start` is the index of the opening `"`. Escape pairs (`\x`) are skipped as
/// a unit so an escaped quote does not end the scan. A backslash that is the
/// last byte on its line (or in the input) ends the scan with `false`: the
/// two-byte skip must not step over the newline, otherwise a `\(` on a later
/// line would be attributed to this string.
fn has_interp_hole(bytes: &[u8], start: usize) -> bool {
    let mut i = start + 1;
    while let Some(&byte) = bytes.get(i) {
        match byte {
            b'\n' | b'"' => return false,
            b'\\' => match bytes.get(i + 1) {
                Some(b'(') => return true,
                // The line (or the input) ends right after the backslash.
                Some(b'\n') | None => return false,
                Some(_) => i += 2,
            },
            _ => i += 1,
        }
    }
    false
}
586
587/// Scan a double-quoted string starting at `start` (the opening `"`), returning
588/// the byte offset just past the closing `"`. Recognises the four simple
589/// escapes plus `\(…)` interpolation holes, whose parens are balanced (and
590/// whose nested strings are skipped) by [`scan_hole`]. (v0.43.)
591fn scan_str(bytes: &[u8], source: &str, start: usize) -> Result<usize, CompileError> {
592    debug_assert_eq!(bytes[start], b'"');
593    let mut i = start + 1;
594    loop {
595        if i >= bytes.len() || bytes[i] == b'\n' {
596            return Err(CompileError::new(
597                "bynk.lex.unterminated_string",
598                Span::new(start, i.min(bytes.len())),
599                "unterminated string literal",
600            )
601            .with_note(
602                "string literals must close with `\"` on the same line; \
603                 supported escapes are `\\n`, `\\t`, `\\\"`, `\\\\`, and `\\(…)` interpolation",
604            ));
605        }
606        match bytes[i] {
607            b'"' => return Ok(i + 1),
608            b'\\' => match bytes.get(i + 1) {
609                Some(b'n' | b't' | b'"' | b'\\') => i += 2,
610                Some(b'(') => i = scan_hole(bytes, source, i + 2)?,
611                other => {
612                    let shown = other.map(|b| (*b as char).to_string()).unwrap_or_default();
613                    return Err(CompileError::new(
614                        "bynk.lex.bad_escape",
615                        Span::new(i, (i + 2).min(bytes.len())),
616                        format!("invalid escape sequence `\\{shown}` in string literal"),
617                    )
618                    .with_note("supported escapes: \\n \\t \\\" \\\\ \\(…)"));
619                }
620            },
621            // Any other byte advances one position. UTF-8 continuation bytes
622            // are all >= 0x80, so they never collide with the ASCII specials.
623            _ => i += 1,
624        }
625    }
626}
627
628/// Scan an interpolation hole body. `start` points just past the `\(`; returns
629/// the offset just past the matching `)`. Tracks paren depth and skips nested
630/// strings (whose own parens must not close the hole), recursing through
631/// [`scan_str`] so nested interpolation nests correctly. (v0.43.)
632fn scan_hole(bytes: &[u8], source: &str, start: usize) -> Result<usize, CompileError> {
633    let mut i = start;
634    let mut depth = 1usize;
635    loop {
636        if i >= bytes.len() || bytes[i] == b'\n' {
637            return Err(CompileError::new(
638                "bynk.lex.unterminated_interpolation",
639                Span::new(start.saturating_sub(2), i.min(bytes.len())),
640                "unterminated interpolation hole",
641            )
642            .with_note(
643                "an interpolation hole `\\(…)` must close with a matching `)` on the same line",
644            ));
645        }
646        match bytes[i] {
647            b'(' => {
648                depth += 1;
649                i += 1;
650            }
651            b')' => {
652                depth -= 1;
653                i += 1;
654                if depth == 0 {
655                    return Ok(i);
656                }
657            }
658            b'"' => i = scan_str(bytes, source, i)?,
659            _ => i += 1,
660        }
661    }
662}
663
/// One segment of a split interpolated string (v0.43): literal text (escapes
/// resolved) or the absolute source span of a hole's expression (the bytes
/// between `\(` and its matching `)`). The parser turns the latter into a real
/// `Expr`; the lexer owns only the scanning.
pub(crate) enum InterpSegment {
    /// A run of literal text with its escape sequences already replaced by
    /// the characters they denote.
    Chunk(String),
    /// Byte range, in the original source, of the expression text inside one
    /// `\(…)` hole (excluding the `\(` and the closing `)`).
    Hole(Span),
}
672
673/// Split an `InterpStr` token (its `span` covers the whole `"…"`) into chunks
674/// and hole spans. Escapes in the chunks are resolved here (mirroring
675/// [`parse_string_literal`]); holes are returned as spans for the parser to
676/// re-lex and parse as expressions. (v0.43.)
677pub(crate) fn split_interp(source: &str, span: Span) -> Result<Vec<InterpSegment>, CompileError> {
678    let bytes = source.as_bytes();
679    let inner_end = span.end - 1; // the closing `"`
680    let mut segments = Vec::new();
681    let mut chunk = String::new();
682    let mut i = span.start + 1; // past the opening `"`
683    while i < inner_end {
684        match bytes[i] {
685            b'\\' => match bytes[i + 1] {
686                b'n' => {
687                    chunk.push('\n');
688                    i += 2;
689                }
690                b't' => {
691                    chunk.push('\t');
692                    i += 2;
693                }
694                b'"' => {
695                    chunk.push('"');
696                    i += 2;
697                }
698                b'\\' => {
699                    chunk.push('\\');
700                    i += 2;
701                }
702                b'(' => {
703                    if !chunk.is_empty() {
704                        segments.push(InterpSegment::Chunk(std::mem::take(&mut chunk)));
705                    }
706                    let hole_start = i + 2;
707                    let after = scan_hole(bytes, source, hole_start)?;
708                    // `after` is one past the matching `)`; the hole body is
709                    // everything up to that `)`.
710                    segments.push(InterpSegment::Hole(Span::new(hole_start, after - 1)));
711                    i = after;
712                }
713                // The lexer already validated every escape, so nothing else
714                // can appear here.
715                other => unreachable!("unvalidated escape `\\{}` in InterpStr", other as char),
716            },
717            _ => {
718                let ch = source[i..].chars().next().unwrap();
719                chunk.push(ch);
720                i += ch.len_utf8();
721            }
722        }
723    }
724    if !chunk.is_empty() {
725        segments.push(InterpSegment::Chunk(chunk));
726    }
727    Ok(segments)
728}
729
730/// If a `---` doc-block marker line starts at or shortly after `pos` (which
731/// must be at a line boundary), return the byte offset just past the marker
732/// line (after the terminating newline, or at EOF). The doc-block grammar
733/// requires the marker to be alone on its line; leading horizontal whitespace
734/// is allowed and ignored.
735fn doc_block_open_at(source: &str, pos: usize) -> Option<usize> {
736    let bytes = source.as_bytes();
737    if !at_line_start(source, pos) {
738        return None;
739    }
740    // Skip leading horizontal whitespace.
741    let mut i = pos;
742    while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
743        i += 1;
744    }
745    if i + 3 > bytes.len() {
746        return None;
747    }
748    if &bytes[i..i + 3] != b"---" {
749        return None;
750    }
751    i += 3;
752    // The marker may have additional trailing dashes (per spec "three or more
753    // consecutive hyphens"). Consume them.
754    while i < bytes.len() && bytes[i] == b'-' {
755        i += 1;
756    }
757    // After the dashes, allow only horizontal whitespace then newline/EOF.
758    while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t' || bytes[i] == b'\r') {
759        i += 1;
760    }
761    if i == bytes.len() {
762        return Some(i);
763    }
764    if bytes[i] == b'\n' {
765        return Some(i + 1);
766    }
767    None
768}
769
770/// Find the next closing `---` line at or after `pos`. Returns
771/// `(start_of_line, end_of_line)` (`end_of_line` is just past the
772/// terminating newline, or at EOF).
773fn doc_block_close(source: &str, mut pos: usize) -> Option<(usize, usize)> {
774    let bytes = source.as_bytes();
775    while pos < bytes.len() {
776        // Advance pos to the start of a line.
777        let line_start = pos;
778        // Find the end of this line.
779        let mut line_end = line_start;
780        while line_end < bytes.len() && bytes[line_end] != b'\n' {
781            line_end += 1;
782        }
783        // Check this line.
784        if let Some(end) = doc_block_open_at(source, line_start) {
785            return Some((line_start, end));
786        }
787        // Move to the next line.
788        pos = if line_end < bytes.len() {
789            line_end + 1
790        } else {
791            line_end
792        };
793    }
794    None
795}
796
/// Reports whether byte offset `pos` is the first byte of a line, i.e. it is
/// offset 0 or the byte immediately before it is a `\n`.
fn at_line_start(source: &str, pos: usize) -> bool {
    pos == 0 || source.as_bytes()[pos - 1] == b'\n'
}
805
806/// Extract the body content of a doc-block token from its source span.
807/// Strips the leading and trailing `---` marker lines and returns the body
808/// verbatim. If every non-empty content line begins with the same horizontal
809/// whitespace prefix (e.g., because the doc block sits inside a brace-form
810/// commons body), that common prefix is removed so the body reads naturally
811/// when emitted as JSDoc.
812pub fn doc_block_content(source: &str, span: Span) -> String {
813    let slice = &source[span.range()];
814    // Drop the first line (opening marker).
815    let after_open = match slice.find('\n') {
816        Some(i) => &slice[i + 1..],
817        None => return String::new(),
818    };
819    let bytes = after_open.as_bytes();
820    // Trim the trailing closing-marker line.
821    let mut i = bytes.len();
822    if i > 0 && bytes[i - 1] == b'\n' {
823        i -= 1;
824    }
825    while i > 0 && matches!(bytes[i - 1], b' ' | b'\t' | b'\r') {
826        i -= 1;
827    }
828    while i > 0 && bytes[i - 1] == b'-' {
829        i -= 1;
830    }
831    if i > 0 && bytes[i - 1] == b'\n' {
832        i -= 1;
833    }
834    let body = &after_open[..i];
835
836    // Compute the common leading-whitespace prefix across all non-empty lines
837    // and strip it. This lets writers indent the doc block alongside the
838    // declaration it documents without bleeding the indent into the JSDoc.
839    let common: Option<usize> = body
840        .lines()
841        .filter(|l| !l.trim().is_empty())
842        .map(|l| l.bytes().take_while(|&b| b == b' ' || b == b'\t').count())
843        .min();
844    let strip = common.unwrap_or(0);
845    if strip == 0 {
846        return body.to_string();
847    }
848    let mut out = String::with_capacity(body.len());
849    let mut first = true;
850    for line in body.lines() {
851        if !first {
852            out.push('\n');
853        }
854        first = false;
855        if line.trim().is_empty() {
856            // Preserve blank lines.
857            continue;
858        }
859        let leading: usize = line
860            .bytes()
861            .take_while(|&b| b == b' ' || b == b'\t')
862            .count();
863        let drop = strip.min(leading);
864        out.push_str(&line[drop..]);
865    }
866    out
867}
868
869/// Extract the body of a `Comment` trivia token: everything after the
870/// leading `--` marker, preserving its inline whitespace verbatim. Used by
871/// the parser when attaching comments to declarations.
872pub fn comment_body(source: &str, span: Span) -> &str {
873    let slice = &source[span.range()];
874    // Strip leading "--" if present (defensive — the lexer always emits
875    // Comment tokens whose span begins with `--`).
876    slice.strip_prefix("--").unwrap_or(slice)
877}
878
/// Returns true if a blank line separates byte offset `from` from byte
/// offset `to` in `source`. Used by the parser to detect orphan doc blocks.
///
/// The bytes in the half-open range `from..to` are scanned in order. The
/// result is `true` as soon as a `\n` is reached having seen only spaces,
/// tabs, or carriage returns. It is `false` if any other byte comes first,
/// if the range contains no newline, or if the range is empty (`to <= from`).
///
/// A doc-block token's span ends just past the closing-marker line's
/// terminating newline. So if the next declaration begins on the immediately
/// following line, the substring between contains no newline (only optional
/// indentation). Any newline in the substring therefore implies at least one
/// entirely-blank line separating the doc from the declaration.
///
/// Offsets beyond the end of `source` are clamped instead of indexed, so an
/// out-of-range span yields `false` rather than a panic.
pub fn has_blank_line_between(source: &str, from: usize, to: usize) -> bool {
    let bytes = source.as_bytes();
    // `get` yields `None` for an inverted range (`from > to`) and for a
    // `from` past the clamped end; an empty range yields an empty window.
    let window = match bytes.get(from..to.min(bytes.len())) {
        Some(window) => window,
        None => return false,
    };
    // Only the first byte that is not inline whitespace decides the answer:
    // a newline means a blank line, anything else means real content.
    window
        .iter()
        .copied()
        .find(|&b| !matches!(b, b' ' | b'\t' | b'\r'))
        == Some(b'\n')
}
905
// Unit tests for the lexer: keyword/operator token kinds, comment trivia,
// lexical error categories, and string interpolation.
#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenize `source` and return only the token kinds, in order.
    /// Panics (via `unwrap`) if lexing fails, so it is only for inputs that
    /// are expected to lex cleanly.
    fn kinds(source: &str) -> Vec<TokenKind> {
        tokenize(source)
            .unwrap()
            .into_iter()
            .map(|t| t.kind)
            .collect()
    }

    // Reserved words lex to their keyword kinds; the other words (`foo`,
    // `bar`) lex as `Ident`.
    #[test]
    fn keywords_and_idents() {
        use TokenKind::*;
        assert_eq!(
            kinds("commons type fn where and true false Int String Bool foo bar"),
            vec![
                Commons, Type, Fn, Where, And, True, False, Int, String, Bool, Ident, Ident
            ],
        );
    }

    // Integer literals and string literals (including one containing a
    // backslash-`n` escape sequence) each lex as a single token.
    #[test]
    fn integer_and_string_literals() {
        use TokenKind::*;
        assert_eq!(
            kinds(r#"0 42 "hello" "with\nescape""#),
            vec![IntLit, IntLit, StrLit, StrLit]
        );
    }

    // Every operator and punctuation lexeme maps to its own token kind;
    // multi-character operators such as `->` and `==` come out as one token.
    #[test]
    fn operators() {
        use TokenKind::*;
        assert_eq!(
            kinds("-> == != <= >= && || + - * / ! = < > ( ) { } [ ] , : . @"),
            vec![
                Arrow, EqEq, BangEq, LtEq, GtEq, AmpAmp, PipePipe, Plus, Minus, Star, Slash, Bang,
                Eq, Lt, Gt, LParen, RParen, LBrace, RBrace, LBracket, RBracket, Comma, Colon, Dot,
                At,
            ],
        );
    }

    #[test]
    fn line_comments_emitted_as_trivia() {
        // v1.1: line comments are preserved as Comment tokens so the
        // formatter can attach and re-emit them.
        use TokenKind::*;
        let src = "-- a comment\ntype X = Int -- trailing\n";
        assert_eq!(kinds(src), vec![Comment, Type, Ident, Eq, Int, Comment],);
    }

    // The body is the lexeme minus the `--` marker: the space after the
    // marker is kept and the trailing newline is not included.
    #[test]
    fn comment_body_extracts_text_after_marker() {
        let toks = tokenize("-- hello world\n").unwrap();
        assert_eq!(toks.len(), 1);
        assert_eq!(toks[0].kind, TokenKind::Comment);
        assert_eq!(
            comment_body("-- hello world\n", toks[0].span),
            " hello world"
        );
    }

    #[test]
    fn comment_does_not_consume_newline() {
        // Two adjacent comment lines should produce two distinct tokens
        // — the newline between them is not part of either comment's span.
        let toks = tokenize("-- one\n-- two\n").unwrap();
        assert_eq!(toks.len(), 2);
        assert!(toks.iter().all(|t| t.kind == TokenKind::Comment));
    }

    // A string whose closing quote is missing before the newline is
    // reported under the unterminated-string category.
    #[test]
    fn unterminated_string_is_error() {
        let err = tokenize("\"oops\n").unwrap_err();
        assert_eq!(err.category, "bynk.lex.unterminated_string");
    }

    // An oversized integer literal is reported under the integer-overflow
    // category instead of being lexed as an `IntLit`.
    #[test]
    fn integer_overflow_is_error() {
        let err = tokenize("99999999999999999999").unwrap_err();
        assert_eq!(err.category, "bynk.lex.integer_overflow");
    }

    // A character with no token rule (`$`) is reported under the
    // unexpected-character category.
    #[test]
    fn unexpected_character_is_error() {
        let err = tokenize("type X = Int $").unwrap_err();
        assert_eq!(err.category, "bynk.lex.unexpected_character");
    }

    // Each keyword introduced in v0.1 lexes to its dedicated token kind.
    #[test]
    fn v0_1_keywords() {
        use TokenKind::*;
        assert_eq!(
            kinds("let if else Ok Err Result ValidationError"),
            vec![Let, If, Else, Ok, Err, Result, ValidationError],
        );
    }

    // A `?` directly after an identifier is its own `Question` token.
    #[test]
    fn question_token() {
        use TokenKind::*;
        assert_eq!(kinds("x?"), vec![Ident, Question]);
    }

    // Each keyword introduced in v0.2 lexes to its dedicated token kind
    // (`self` maps to the `Self_` variant).
    #[test]
    fn v0_2_keywords() {
        use TokenKind::*;
        assert_eq!(
            kinds("enum match Option record self Some None is"),
            vec![Enum, Match, Option, Record, Self_, Some, None, Is],
        );
    }

    // A single `|` and a doubled `||` lex as distinct token kinds.
    #[test]
    fn pipe_and_pipe_pipe_disambiguated() {
        use TokenKind::*;
        assert_eq!(kinds("| || |"), vec![Pipe, PipePipe, Pipe]);
    }

    // Each keyword introduced in v0.7 lexes to its dedicated token kind.
    #[test]
    fn v0_7_keywords() {
        use TokenKind::*;
        assert_eq!(
            kinds("assert expect mocks test"),
            vec![Assert, Expect, Mocks, Test],
        );
    }

    // A lone `_` is `Underscore` and `=>` is a single `FatArrow` token.
    #[test]
    fn fat_arrow_and_underscore() {
        use TokenKind::*;
        assert_eq!(kinds("_ =>"), vec![Underscore, FatArrow]);
    }

    // -- v0.43 string interpolation --

    #[test]
    fn interp_string_is_one_token() {
        use TokenKind::*;
        assert_eq!(kinds(r#""Hello, \(name)!""#), vec![InterpStr]);
        // A plain string (no hole) stays a `StrLit`, via the logos path.
        assert_eq!(kinds(r#""Hello, world""#), vec![StrLit]);
    }

    #[test]
    fn interp_balances_nested_parens_and_strings() {
        use TokenKind::*;
        // The `)` inside `f(x)` must not close the hole early.
        assert_eq!(kinds(r#""= \(f(x))""#), vec![InterpStr]);
        // A `)` inside a nested string inside the hole is also ignored.
        assert_eq!(kinds(r#""= \(label(")"))""#), vec![InterpStr]);
        // A nested interpolated string inside a hole.
        assert_eq!(kinds(r#""out \("in \(x)")""#), vec![InterpStr]);
    }

    #[test]
    fn escaped_open_paren_is_not_a_hole() {
        use TokenKind::*;
        // `\\(` is a literal backslash followed by `(` — no hole, so the
        // string lexes as a plain `StrLit` on the logos path.
        assert_eq!(kinds(r#""a \\(b) c""#), vec![StrLit]);
    }

    #[test]
    fn unterminated_hole_is_an_error() {
        // The hole runs to end of line without its closing `)`.
        let err = tokenize("\"value \\(x + 1\n\"").unwrap_err();
        assert_eq!(err.category, "bynk.lex.unterminated_interpolation");
    }

    #[test]
    fn unterminated_interp_string_is_an_error() {
        // A hole closes but the string never does (newline before the `"`).
        let err = tokenize("\"value \\(x) more\n").unwrap_err();
        assert_eq!(err.category, "bynk.lex.unterminated_string");
    }

    // An unknown escape (`\q`) in a string that also contains a hole is
    // reported under the bad-escape category.
    #[test]
    fn bad_escape_in_interp_string_is_an_error() {
        let err = tokenize(r#""a \q \(x)""#).unwrap_err();
        assert_eq!(err.category, "bynk.lex.bad_escape");
    }
}
bynk_syntax/lexer.rs

bynk_syntax/
lexer.rs