bynk_syntax/
lexer.rs

1//! Lexer for Bynk v0.
2//!
3//! Token kinds correspond to the terminals defined in the grammar (spec §3
4//! and §4). Whitespace is skipped; line comments are emitted as `Comment`
5//! tokens so the formatter can preserve them through round-trips (v1.1 LSP
6//! spec §3.5). Doc blocks (`---`) are emitted as `DocBlock` tokens, lexed
7//! outside of logos (see [`tokenize`]).
8
9use logos::Logos;
10
11use crate::error::CompileError;
12use crate::span::Span;
13
/// Token kinds. Discriminants without payload data; the lexeme is recovered
/// from the source string via the token's [`Span`].
///
/// Variants carrying a `#[token]` / `#[regex]` attribute are matched by the
/// logos-generated lexer. `DocBlock`, `Comment` and `InterpStr` carry no such
/// attribute and are only ever produced by the hand-written scanning in
/// [`tokenize`].
///
/// Note: `--` line comments and `---` doc block markers are handled outside
/// logos (see [`tokenize`]), because doc blocks are delimited by `---` lines
/// containing only the marker and may span multiple source lines.
#[derive(Logos, Debug, Clone, Copy, PartialEq, Eq)]
#[logos(skip r"[ \t\r\n]+")]
pub enum TokenKind {
    // Keywords (v0 core set)
    #[token("commons")]
    Commons,
    #[token("type")]
    Type,
    #[token("fn")]
    Fn,
    #[token("where")]
    Where,
    #[token("and")]
    And,
    #[token("true")]
    True,
    #[token("false")]
    False,
    #[token("Int")]
    Int,
    #[token("String")]
    String,
    #[token("Bool")]
    Bool,
    // v0.21 keyword
    #[token("Float")]
    Float,
    // v0.1 keywords
    #[token("let")]
    Let,
    #[token("if")]
    If,
    #[token("else")]
    Else,
    #[token("Ok")]
    Ok,
    #[token("Err")]
    Err,
    #[token("Result")]
    Result,
    #[token("ValidationError")]
    ValidationError,
    // v0.22b keyword
    #[token("JsonError")]
    JsonError,
    // v0.2 keywords
    #[token("enum")]
    Enum,
    #[token("match")]
    Match,
    #[token("Option")]
    Option,
    #[token("record")]
    Record,
    // Named `Self_` because `Self` is a reserved word in Rust.
    #[token("self")]
    Self_,
    #[token("Some")]
    Some,
    #[token("None")]
    None,
    #[token("is")]
    Is,
    // v0.3 keywords
    #[token("opaque")]
    Opaque,
    #[token("uses")]
    Uses,
    // v0.4 keywords
    #[token("context")]
    Context,
    #[token("consumes")]
    Consumes,
    #[token("exports")]
    Exports,
    #[token("transparent")]
    Transparent,
    // v0.6 keywords
    #[token("as")]
    As,
    // v0.7 keywords
    #[token("assert")]
    Assert,
    #[token("expect")]
    Expect,
    #[token("mocks")]
    Mocks,
    #[token("test")]
    Test,
    // v0.16 keyword
    #[token("wires")]
    Wires,
    // v0.17 keywords
    #[token("adapter")]
    Adapter,
    #[token("binding")]
    Binding,
    // v0.5 keywords
    #[token("agent")]
    Agent,
    #[token("capability")]
    Capability,
    #[token("commit")]
    Commit,
    #[token("Effect")]
    Effect,
    #[token("given")]
    Given,
    #[token("on")]
    On,
    // v0.9 keyword
    #[token("http")]
    Http,
    // v0.10a keyword
    #[token("cron")]
    Cron,
    // v0.10b keyword
    #[token("queue")]
    Queue,
    // v0.44 keywords: `from` heads a service's protocol clause; `protocol` is
    // reserved (protocols are a closed, compiler-known set — no declaration kind).
    #[token("from")]
    From,
    #[token("protocol")]
    Protocol,
    #[token("provides")]
    Provides,
    #[token("service")]
    Service,
    #[token("state")]
    State,
    // v0.45 keywords: `actor` heads a boundary-contract declaration; `by`
    // heads a handler's actor clause.
    #[token("actor")]
    Actor,
    #[token("by")]
    By,
    // v0.80 keywords: `invariant` heads an agent invariant declaration; `implies`
    // is the directional logical-implication operator (`P implies Q` ≡ `!P || Q`).
    #[token("invariant")]
    Invariant,
    #[token("implies")]
    Implies,
    /// `...` — used in record-spread expressions (v0.5).
    #[token("...")]
    DotDotDot,
    /// `<-` — Effect bind operator (v0.5).
    #[token("<-")]
    LArrow,
    /// `~>` — asynchronous fire-and-forget send marker (v0.79). A leading
    /// statement marker, never on the RHS of a `let`; distinct from `<-` so the
    /// call site shows whether the caller waits.
    #[token("~>")]
    TildeArrow,

    /// A documentation block: `---` line ... `---` line. The token's span
    /// covers the full block including both `---` markers. The body content
    /// is recovered from the source via the span (see [`doc_block_content`]).
    /// Inserted by [`tokenize`]; not lexed by logos directly.
    DocBlock,

    /// A line comment: `-- ...` running to end of line. The span starts at
    /// the `--` marker and runs through the last character before the
    /// terminating newline (exclusive). The trivia body (the text after the
    /// `--` marker) is recovered from the source via the span (see
    /// [`comment_body`]). Inserted by [`tokenize`]; not lexed by logos
    /// directly so it cannot be mistaken for an `--` operator sequence.
    Comment,

    // Identifier: an ASCII letter followed by ASCII letters, digits or `_`.
    #[regex(r"[A-Za-z][A-Za-z0-9_]*")]
    Ident,

    // Literals
    // An integer literal is a run of ASCII digits; the 64-bit range check is
    // performed afterwards in `tokenize`, not by this pattern.
    #[regex(r"[0-9]+")]
    IntLit,
    // A float literal: fraction with a digit on both sides of the `.`, an
    // exponent, or both (v0.21 §3). `1.` and `.5` are NOT float literals —
    // the digit-both-sides rule keeps `2.5.round()` / `1.toFloat()` lexing
    // as method calls on numeric literals.
    #[regex(r"[0-9]+\.[0-9]+([eE][+-]?[0-9]+)?|[0-9]+[eE][+-]?[0-9]+")]
    FloatLit,
    // A double-quoted string with simple escapes. Between the quotes we accept
    // any non-quote/non-backslash/non-newline char, or a backslash followed by
    // one of the four allowed escapes (`\n`, `\t`, `\"`, `\\`). The token span
    // includes both quotes.
    #[regex(r#""([^"\\\n]|\\[nt"\\])*""#)]
    StrLit,
    // An interpolated string `"… \(expr) …"` (v0.43). Hand-scanned in
    // `tokenize` (logos cannot balance the holes' parens), never produced by
    // the logos lexer — like [`TokenKind::DocBlock`]/[`TokenKind::Comment`].
    // The span covers the whole `"…"`; the parser splits chunks from holes.
    InterpStr,

    // Multi-char operators
    #[token("->")]
    Arrow,
    #[token("==")]
    EqEq,
    #[token("!=")]
    BangEq,
    #[token("<=")]
    LtEq,
    #[token(">=")]
    GtEq,
    #[token("&&")]
    AmpAmp,
    #[token("||")]
    PipePipe,

    // Single-char operators
    #[token("+")]
    Plus,
    #[token("-")]
    Minus,
    #[token("*")]
    Star,
    #[token("/")]
    Slash,
    #[token("!")]
    Bang,
    #[token("=")]
    Eq,
    #[token("<")]
    Lt,
    #[token(">")]
    Gt,
    // v0.1 postfix operator
    #[token("?")]
    Question,
    // v0.2 match-arm arrow
    #[token("=>")]
    FatArrow,
    // v0.2 wildcard pattern.
    // NOTE(review): the `Ident` regex above requires a leading ASCII letter,
    // so `_foo` is not matched as a single `Ident` by these rules (it would
    // lex as `Underscore` followed by `Ident`). Confirm whether identifiers
    // are meant to allow a leading underscore.
    #[token("_")]
    Underscore,
    // v0.2 sum-type variant separator (also used as future bitwise OR);
    // single `|` distinct from `||`.
    #[token("|")]
    Pipe,

    // Punctuation
    #[token("(")]
    LParen,
    #[token(")")]
    RParen,
    #[token("{")]
    LBrace,
    #[token("}")]
    RBrace,
    #[token("[")]
    LBracket,
    #[token("]")]
    RBracket,
    #[token(",")]
    Comma,
    #[token(":")]
    Colon,
    #[token(".")]
    Dot,
}
280
281impl TokenKind {
282    /// Human-readable display name for diagnostics.
283    pub fn describe(self) -> &'static str {
284        use TokenKind::*;
285        match self {
286            Commons => "`commons`",
287            Type => "`type`",
288            Fn => "`fn`",
289            Where => "`where`",
290            And => "`and`",
291            True => "`true`",
292            False => "`false`",
293            Int => "`Int`",
294            String => "`String`",
295            Bool => "`Bool`",
296            Float => "`Float`",
297            Let => "`let`",
298            If => "`if`",
299            Else => "`else`",
300            Ok => "`Ok`",
301            Err => "`Err`",
302            Result => "`Result`",
303            ValidationError => "`ValidationError`",
304            JsonError => "`JsonError`",
305            Enum => "`enum`",
306            Match => "`match`",
307            Option => "`Option`",
308            Record => "`record`",
309            Self_ => "`self`",
310            Some => "`Some`",
311            None => "`None`",
312            Is => "`is`",
313            Opaque => "`opaque`",
314            Uses => "`uses`",
315            Context => "`context`",
316            Consumes => "`consumes`",
317            Exports => "`exports`",
318            Transparent => "`transparent`",
319            As => "`as`",
320            Assert => "`assert`",
321            Expect => "`expect`",
322            Mocks => "`mocks`",
323            Test => "`test`",
324            Wires => "`wires`",
325            Adapter => "`adapter`",
326            Binding => "`binding`",
327            Agent => "`agent`",
328            Capability => "`capability`",
329            Commit => "`commit`",
330            Effect => "`Effect`",
331            Given => "`given`",
332            On => "`on`",
333            Http => "`http`",
334            Cron => "`cron`",
335            Queue => "`queue`",
336            From => "`from`",
337            Protocol => "`protocol`",
338            Provides => "`provides`",
339            Service => "`service`",
340            State => "`state`",
341            Actor => "`actor`",
342            By => "`by`",
343            Invariant => "`invariant`",
344            Implies => "`implies`",
345            DotDotDot => "`...`",
346            LArrow => "`<-`",
347            TildeArrow => "`~>`",
348            DocBlock => "documentation block",
349            Comment => "line comment",
350            Ident => "identifier",
351            IntLit => "integer literal",
352            FloatLit => "float literal",
353            StrLit => "string literal",
354            InterpStr => "interpolated string",
355            Arrow => "`->`",
356            EqEq => "`==`",
357            BangEq => "`!=`",
358            LtEq => "`<=`",
359            GtEq => "`>=`",
360            AmpAmp => "`&&`",
361            PipePipe => "`||`",
362            Plus => "`+`",
363            Minus => "`-`",
364            Star => "`*`",
365            Slash => "`/`",
366            Bang => "`!`",
367            Eq => "`=`",
368            Lt => "`<`",
369            Gt => "`>`",
370            Question => "`?`",
371            FatArrow => "`=>`",
372            Underscore => "`_`",
373            Pipe => "`|`",
374            LParen => "`(`",
375            RParen => "`)`",
376            LBrace => "`{`",
377            RBrace => "`}`",
378            LBracket => "`[`",
379            RBracket => "`]`",
380            Comma => "`,`",
381            Colon => "`:`",
382            Dot => "`.`",
383        }
384    }
385}
386
/// A token plus its source span.
///
/// The token carries no lexeme text of its own; callers recover it by
/// slicing the original source with `span`.
#[derive(Debug, Clone, Copy)]
pub struct Token {
    /// Which kind of token this is.
    pub kind: TokenKind,
    /// Byte range of the token within the source string passed to
    /// [`tokenize`].
    pub span: Span,
}
393
394/// Tokenise a source string. Returns the full token vector or the first
395/// lexical error.
396///
397/// Doc blocks (`---` ... `---`) and line comments (`-- ...`) are recognised
398/// outside the logos-generated lexer: we scan the source one segment at a
399/// time, dispatching to logos for ordinary tokens between non-token spans.
400pub fn tokenize(source: &str) -> Result<Vec<Token>, CompileError> {
401    let mut tokens = Vec::new();
402    let bytes = source.as_bytes();
403    let mut pos = 0;
404    while pos < bytes.len() {
405        // Detect a `---` doc-block marker at the start of a line (the line may
406        // begin with leading whitespace; the marker itself must be alone on
407        // its line).
408        if let Some(open_end) = doc_block_open_at(source, pos) {
409            // Find the matching closing `---` line.
410            match doc_block_close(source, open_end) {
411                Some((close_start, close_end)) => {
412                    let span = Span::new(pos, close_end);
413                    tokens.push(Token {
414                        kind: TokenKind::DocBlock,
415                        span,
416                    });
417                    let _ = close_start;
418                    pos = close_end;
419                    continue;
420                }
421                None => {
422                    return Err(CompileError::new(
423                        "bynk.lex.unclosed_doc_block",
424                        Span::new(pos, open_end),
425                        "documentation block opened but never closed",
426                    )
427                    .with_note(
428                        "a doc block must be terminated by another `---` on a line by itself",
429                    ));
430                }
431            }
432        }
433        // A `--` line comment: emit a `Comment` token covering everything
434        // up to (but not including) the terminating newline. Doc-block
435        // detection above already ruled out a `---` marker at line start
436        // — and once we've consumed past the leading `--`, any further
437        // dashes are part of the comment body. Preserving comments as
438        // trivia tokens lets the parser attach them to declarations so
439        // the formatter can emit them in place (v1.1 LSP spec §3.5).
440        if pos + 1 < bytes.len() && bytes[pos] == b'-' && bytes[pos + 1] == b'-' {
441            let start = pos;
442            while pos < bytes.len() && bytes[pos] != b'\n' {
443                pos += 1;
444            }
445            tokens.push(Token {
446                kind: TokenKind::Comment,
447                span: Span::new(start, pos),
448            });
449            continue;
450        }
451        // Skip ordinary whitespace inline (logos handles it too, but we may
452        // be in the middle of the source between specials).
453        if matches!(bytes[pos], b' ' | b'\t' | b'\r' | b'\n') {
454            pos += 1;
455            continue;
456        }
457        // An interpolated string `"… \(expr) …"` (v0.43): only strings that
458        // actually contain a `\(` hole are hand-scanned here; plain strings
459        // fall through to the logos `StrLit` path unchanged. `\(` is an
460        // invalid escape in the logos grammar, so this never re-routes a
461        // currently-valid literal.
462        if bytes[pos] == b'"' && has_interp_hole(bytes, pos) {
463            let end = scan_str(bytes, source, pos)?;
464            tokens.push(Token {
465                kind: TokenKind::InterpStr,
466                span: Span::new(pos, end),
467            });
468            pos = end;
469            continue;
470        }
471        // Otherwise dispatch a single logos token starting at `pos`.
472        let mut lex = TokenKind::lexer(&source[pos..]);
473        let Some(result) = lex.next() else {
474            // No token at this position; treat as unexpected character so
475            // the user sees something useful.
476            let ch = source[pos..].chars().next().unwrap_or('\0');
477            let span = Span::new(pos, pos + ch.len_utf8());
478            return Err(CompileError::new(
479                "bynk.lex.unexpected_character",
480                span,
481                format!("unexpected character `{ch}`"),
482            ));
483        };
484        let local = lex.span();
485        let span: Span = Span::new(pos + local.start, pos + local.end);
486        match result {
487            Ok(kind) => {
488                if kind == TokenKind::IntLit {
489                    let slice = &source[span.range()];
490                    if slice.parse::<i64>().is_err() {
491                        return Err(CompileError::new(
492                            "bynk.lex.integer_overflow",
493                            span,
494                            format!(
495                                "integer literal `{slice}` is out of range for a 64-bit signed integer"
496                            ),
497                        )
498                        .with_note("the range is -2^63 to 2^63 - 1"));
499                    }
500                }
501                if kind == TokenKind::FloatLit {
502                    let slice = &source[span.range()];
503                    match slice.parse::<f64>() {
504                        Ok(v) if v.is_finite() => {}
505                        _ => {
506                            return Err(CompileError::new(
507                                "bynk.lex.float_literal_overflow",
508                                span,
509                                format!(
510                                    "float literal `{slice}` is out of range for a 64-bit float"
511                                ),
512                            )
513                            .with_note(
514                                "the literal does not fit a finite IEEE 754 double; \
515                                 the largest finite value is ~1.8e308",
516                            ));
517                        }
518                    }
519                }
520                tokens.push(Token { kind, span });
521                pos = span.end;
522            }
523            Err(()) => {
524                let slice = &source[span.range()];
525                let ch = slice.chars().next().unwrap_or('\0');
526                let err = if ch == '"' {
527                    CompileError::new(
528                        "bynk.lex.unterminated_string",
529                        span,
530                        "unterminated string literal",
531                    )
532                    .with_note(
533                        "string literals must close with `\"` on the same line; \
534                         supported escapes are `\\n`, `\\t`, `\\\"`, `\\\\`",
535                    )
536                } else {
537                    CompileError::new(
538                        "bynk.lex.unexpected_character",
539                        span,
540                        format!("unexpected character `{ch}`"),
541                    )
542                };
543                return Err(err);
544            }
545        }
546    }
547    Ok(tokens)
548}
549
/// Routing pre-scan (v0.43) used by `tokenize` to choose between the
/// hand-written `InterpStr` scanner and the logos `StrLit` rule.
///
/// `start` is the offset of the opening `"`. Returns `true` when a `\(`
/// interpolation hole appears before the first unescaped `"`, the first
/// newline, or the end of input — whichever comes first. Any other
/// backslash pair is stepped over without being validated: the check is
/// deliberately tolerant so that a malformed string containing a hole still
/// reaches the hole-aware scanner, which produces the precise error.
fn has_interp_hole(bytes: &[u8], start: usize) -> bool {
    let mut cursor = start + 1;
    while let Some(&byte) = bytes.get(cursor) {
        match byte {
            // End of the literal (or of the line): no hole was seen.
            b'"' | b'\n' => break,
            b'\\' if bytes.get(cursor + 1) == Some(&b'(') => return true,
            // Some other escape: skip the backslash and the escaped byte.
            b'\\' => cursor += 2,
            _ => cursor += 1,
        }
    }
    false
}
571
572/// Scan a double-quoted string starting at `start` (the opening `"`), returning
573/// the byte offset just past the closing `"`. Recognises the four simple
574/// escapes plus `\(…)` interpolation holes, whose parens are balanced (and
575/// whose nested strings are skipped) by [`scan_hole`]. (v0.43.)
576fn scan_str(bytes: &[u8], source: &str, start: usize) -> Result<usize, CompileError> {
577    debug_assert_eq!(bytes[start], b'"');
578    let mut i = start + 1;
579    loop {
580        if i >= bytes.len() || bytes[i] == b'\n' {
581            return Err(CompileError::new(
582                "bynk.lex.unterminated_string",
583                Span::new(start, i.min(bytes.len())),
584                "unterminated string literal",
585            )
586            .with_note(
587                "string literals must close with `\"` on the same line; \
588                 supported escapes are `\\n`, `\\t`, `\\\"`, `\\\\`, and `\\(…)` interpolation",
589            ));
590        }
591        match bytes[i] {
592            b'"' => return Ok(i + 1),
593            b'\\' => match bytes.get(i + 1) {
594                Some(b'n' | b't' | b'"' | b'\\') => i += 2,
595                Some(b'(') => i = scan_hole(bytes, source, i + 2)?,
596                other => {
597                    let shown = other.map(|b| (*b as char).to_string()).unwrap_or_default();
598                    return Err(CompileError::new(
599                        "bynk.lex.bad_escape",
600                        Span::new(i, (i + 2).min(bytes.len())),
601                        format!("invalid escape sequence `\\{shown}` in string literal"),
602                    )
603                    .with_note("supported escapes: \\n \\t \\\" \\\\ \\(…)"));
604                }
605            },
606            // Any other byte advances one position. UTF-8 continuation bytes
607            // are all >= 0x80, so they never collide with the ASCII specials.
608            _ => i += 1,
609        }
610    }
611}
612
613/// Scan an interpolation hole body. `start` points just past the `\(`; returns
614/// the offset just past the matching `)`. Tracks paren depth and skips nested
615/// strings (whose own parens must not close the hole), recursing through
616/// [`scan_str`] so nested interpolation nests correctly. (v0.43.)
617fn scan_hole(bytes: &[u8], source: &str, start: usize) -> Result<usize, CompileError> {
618    let mut i = start;
619    let mut depth = 1usize;
620    loop {
621        if i >= bytes.len() || bytes[i] == b'\n' {
622            return Err(CompileError::new(
623                "bynk.lex.unterminated_interpolation",
624                Span::new(start.saturating_sub(2), i.min(bytes.len())),
625                "unterminated interpolation hole",
626            )
627            .with_note(
628                "an interpolation hole `\\(…)` must close with a matching `)` on the same line",
629            ));
630        }
631        match bytes[i] {
632            b'(' => {
633                depth += 1;
634                i += 1;
635            }
636            b')' => {
637                depth -= 1;
638                i += 1;
639                if depth == 0 {
640                    return Ok(i);
641                }
642            }
643            b'"' => i = scan_str(bytes, source, i)?,
644            _ => i += 1,
645        }
646    }
647}
648
/// One segment of a split interpolated string (v0.43): literal text (escapes
/// resolved) or the absolute source span of a hole's expression (the bytes
/// between `\(` and its matching `)`). The parser turns the latter into a real
/// `Expr`; the lexer owns only the scanning.
///
/// Produced by [`split_interp`].
pub(crate) enum InterpSegment {
    /// Literal text between holes, with `\n`, `\t`, `\"` and `\\` already
    /// replaced by the characters they denote.
    Chunk(String),
    /// Span of a hole's expression text, excluding the surrounding `\(` and
    /// `)`. Offsets are relative to the whole source, not to the string token.
    Hole(Span),
}
657
658/// Split an `InterpStr` token (its `span` covers the whole `"…"`) into chunks
659/// and hole spans. Escapes in the chunks are resolved here (mirroring
660/// [`parse_string_literal`]); holes are returned as spans for the parser to
661/// re-lex and parse as expressions. (v0.43.)
662pub(crate) fn split_interp(source: &str, span: Span) -> Result<Vec<InterpSegment>, CompileError> {
663    let bytes = source.as_bytes();
664    let inner_end = span.end - 1; // the closing `"`
665    let mut segments = Vec::new();
666    let mut chunk = String::new();
667    let mut i = span.start + 1; // past the opening `"`
668    while i < inner_end {
669        match bytes[i] {
670            b'\\' => match bytes[i + 1] {
671                b'n' => {
672                    chunk.push('\n');
673                    i += 2;
674                }
675                b't' => {
676                    chunk.push('\t');
677                    i += 2;
678                }
679                b'"' => {
680                    chunk.push('"');
681                    i += 2;
682                }
683                b'\\' => {
684                    chunk.push('\\');
685                    i += 2;
686                }
687                b'(' => {
688                    if !chunk.is_empty() {
689                        segments.push(InterpSegment::Chunk(std::mem::take(&mut chunk)));
690                    }
691                    let hole_start = i + 2;
692                    let after = scan_hole(bytes, source, hole_start)?;
693                    // `after` is one past the matching `)`; the hole body is
694                    // everything up to that `)`.
695                    segments.push(InterpSegment::Hole(Span::new(hole_start, after - 1)));
696                    i = after;
697                }
698                // The lexer already validated every escape, so nothing else
699                // can appear here.
700                other => unreachable!("unvalidated escape `\\{}` in InterpStr", other as char),
701            },
702            _ => {
703                let ch = source[i..].chars().next().unwrap();
704                chunk.push(ch);
705                i += ch.len_utf8();
706            }
707        }
708    }
709    if !chunk.is_empty() {
710        segments.push(InterpSegment::Chunk(chunk));
711    }
712    Ok(segments)
713}
714
715/// If a `---` doc-block marker line starts at or shortly after `pos` (which
716/// must be at a line boundary), return the byte offset just past the marker
717/// line (after the terminating newline, or at EOF). The doc-block grammar
718/// requires the marker to be alone on its line; leading horizontal whitespace
719/// is allowed and ignored.
720fn doc_block_open_at(source: &str, pos: usize) -> Option<usize> {
721    let bytes = source.as_bytes();
722    if !at_line_start(source, pos) {
723        return None;
724    }
725    // Skip leading horizontal whitespace.
726    let mut i = pos;
727    while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
728        i += 1;
729    }
730    if i + 3 > bytes.len() {
731        return None;
732    }
733    if &bytes[i..i + 3] != b"---" {
734        return None;
735    }
736    i += 3;
737    // The marker may have additional trailing dashes (per spec "three or more
738    // consecutive hyphens"). Consume them.
739    while i < bytes.len() && bytes[i] == b'-' {
740        i += 1;
741    }
742    // After the dashes, allow only horizontal whitespace then newline/EOF.
743    while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t' || bytes[i] == b'\r') {
744        i += 1;
745    }
746    if i == bytes.len() {
747        return Some(i);
748    }
749    if bytes[i] == b'\n' {
750        return Some(i + 1);
751    }
752    None
753}
754
755/// Find the next closing `---` line at or after `pos`. Returns
756/// `(start_of_line, end_of_line)` (`end_of_line` is just past the
757/// terminating newline, or at EOF).
758fn doc_block_close(source: &str, mut pos: usize) -> Option<(usize, usize)> {
759    let bytes = source.as_bytes();
760    while pos < bytes.len() {
761        // Advance pos to the start of a line.
762        let line_start = pos;
763        // Find the end of this line.
764        let mut line_end = line_start;
765        while line_end < bytes.len() && bytes[line_end] != b'\n' {
766            line_end += 1;
767        }
768        // Check this line.
769        if let Some(end) = doc_block_open_at(source, line_start) {
770            return Some((line_start, end));
771        }
772        // Move to the next line.
773        pos = if line_end < bytes.len() {
774            line_end + 1
775        } else {
776            line_end
777        };
778    }
779    None
780}
781
/// Reports whether byte offset `pos` is the first column of a line: either the
/// very start of `source` or the byte immediately after a `\n`.
fn at_line_start(source: &str, pos: usize) -> bool {
    pos == 0 || source.as_bytes()[pos - 1] == b'\n'
}
790
791/// Extract the body content of a doc-block token from its source span.
792/// Strips the leading and trailing `---` marker lines and returns the body
793/// verbatim. If every non-empty content line begins with the same horizontal
794/// whitespace prefix (e.g., because the doc block sits inside a brace-form
795/// commons body), that common prefix is removed so the body reads naturally
796/// when emitted as JSDoc.
797pub fn doc_block_content(source: &str, span: Span) -> String {
798    let slice = &source[span.range()];
799    // Drop the first line (opening marker).
800    let after_open = match slice.find('\n') {
801        Some(i) => &slice[i + 1..],
802        None => return String::new(),
803    };
804    let bytes = after_open.as_bytes();
805    // Trim the trailing closing-marker line.
806    let mut i = bytes.len();
807    if i > 0 && bytes[i - 1] == b'\n' {
808        i -= 1;
809    }
810    while i > 0 && matches!(bytes[i - 1], b' ' | b'\t' | b'\r') {
811        i -= 1;
812    }
813    while i > 0 && bytes[i - 1] == b'-' {
814        i -= 1;
815    }
816    if i > 0 && bytes[i - 1] == b'\n' {
817        i -= 1;
818    }
819    let body = &after_open[..i];
820
821    // Compute the common leading-whitespace prefix across all non-empty lines
822    // and strip it. This lets writers indent the doc block alongside the
823    // declaration it documents without bleeding the indent into the JSDoc.
824    let common: Option<usize> = body
825        .lines()
826        .filter(|l| !l.trim().is_empty())
827        .map(|l| l.bytes().take_while(|&b| b == b' ' || b == b'\t').count())
828        .min();
829    let strip = common.unwrap_or(0);
830    if strip == 0 {
831        return body.to_string();
832    }
833    let mut out = String::with_capacity(body.len());
834    let mut first = true;
835    for line in body.lines() {
836        if !first {
837            out.push('\n');
838        }
839        first = false;
840        if line.trim().is_empty() {
841            // Preserve blank lines.
842            continue;
843        }
844        let leading: usize = line
845            .bytes()
846            .take_while(|&b| b == b' ' || b == b'\t')
847            .count();
848        let drop = strip.min(leading);
849        out.push_str(&line[drop..]);
850    }
851    out
852}
853
854/// Extract the body of a `Comment` trivia token: everything after the
855/// leading `--` marker, preserving its inline whitespace verbatim. Used by
856/// the parser when attaching comments to declarations.
857pub fn comment_body(source: &str, span: Span) -> &str {
858    let slice = &source[span.range()];
859    // Strip leading "--" if present (defensive — the lexer always emits
860    // Comment tokens whose span begins with `--`).
861    slice.strip_prefix("--").unwrap_or(slice)
862}
863
/// Returns true if the gap `source[from..to)` (`from` inclusive, `to`
/// exclusive) reaches a newline before any non-whitespace byte — that is, if
/// the first byte in the gap that is not a space, tab or `\r` is a `\n`. Used
/// by the parser to detect orphan doc blocks.
///
/// A doc-block token's span ends just past the closing-marker line's
/// terminating newline. So if the next declaration begins on the immediately
/// following line, the substring between contains no newline (only optional
/// indentation). Any newline in the substring therefore implies at least one
/// entirely-blank line separating the doc from the declaration.
///
/// Returns false for an empty or inverted range. Offsets past the end of
/// `source` are clamped rather than causing a panic.
pub fn has_blank_line_between(source: &str, from: usize, to: usize) -> bool {
    let bytes = source.as_bytes();
    // `get` yields `None` for an inverted or out-of-range window, which also
    // covers `to <= from` without a separate check.
    let Some(gap) = bytes.get(from..to.min(bytes.len())) else {
        return false;
    };
    gap.iter()
        .find(|&&b| !matches!(b, b' ' | b'\t' | b'\r'))
        == Some(&b'\n')
}
890
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes `source` (which must lex cleanly) and asserts that the
    /// resulting token kinds are exactly `expected`, in order.
    #[track_caller]
    fn assert_kinds(source: &str, expected: &[TokenKind]) {
        let actual: Vec<TokenKind> = tokenize(source)
            .unwrap()
            .into_iter()
            .map(|token| token.kind)
            .collect();
        assert_eq!(actual, expected);
    }

    /// Asserts that lexing `source` fails and that the reported error
    /// carries the diagnostic category `category`.
    #[track_caller]
    fn assert_lex_error(source: &str, category: &'static str) {
        let err = tokenize(source).unwrap_err();
        assert_eq!(err.category, category);
    }

    #[test]
    fn keywords_and_idents() {
        use TokenKind::*;
        assert_kinds(
            "commons type fn where and true false Int String Bool foo bar",
            &[
                Commons, Type, Fn, Where, And, True, False, Int, String, Bool, Ident, Ident,
            ],
        );
    }

    #[test]
    fn integer_and_string_literals() {
        use TokenKind::*;
        assert_kinds(
            r#"0 42 "hello" "with\nescape""#,
            &[IntLit, IntLit, StrLit, StrLit],
        );
    }

    #[test]
    fn operators() {
        use TokenKind::*;
        assert_kinds(
            "-> == != <= >= && || + - * / ! = < > ( ) { } [ ] , : .",
            &[
                Arrow, EqEq, BangEq, LtEq, GtEq, AmpAmp, PipePipe, Plus, Minus, Star, Slash, Bang,
                Eq, Lt, Gt, LParen, RParen, LBrace, RBrace, LBracket, RBracket, Comma, Colon, Dot,
            ],
        );
    }

    #[test]
    fn line_comments_emitted_as_trivia() {
        // v1.1: comments survive lexing as Comment tokens, which lets the
        // formatter attach them to nodes and print them back out.
        use TokenKind::*;
        assert_kinds(
            "-- a comment\ntype X = Int -- trailing\n",
            &[Comment, Type, Ident, Eq, Int, Comment],
        );
    }

    #[test]
    fn comment_body_extracts_text_after_marker() {
        let src = "-- hello world\n";
        let toks = tokenize(src).unwrap();
        assert_eq!(toks.len(), 1);
        let only = &toks[0];
        assert_eq!(only.kind, TokenKind::Comment);
        assert_eq!(comment_body(src, only.span), " hello world");
    }

    #[test]
    fn comment_does_not_consume_newline() {
        // Back-to-back comment lines must lex as two separate tokens: the
        // newline separating them belongs to neither span.
        let toks = tokenize("-- one\n-- two\n").unwrap();
        assert_eq!(toks.len(), 2);
        for tok in toks.iter() {
            assert_eq!(tok.kind, TokenKind::Comment);
        }
    }

    #[test]
    fn unterminated_string_is_error() {
        assert_lex_error("\"oops\n", "bynk.lex.unterminated_string");
    }

    #[test]
    fn integer_overflow_is_error() {
        assert_lex_error("99999999999999999999", "bynk.lex.integer_overflow");
    }

    #[test]
    fn unexpected_character_is_error() {
        assert_lex_error("type X = Int $", "bynk.lex.unexpected_character");
    }

    #[test]
    fn v0_1_keywords() {
        use TokenKind::*;
        assert_kinds(
            "let if else Ok Err Result ValidationError",
            &[Let, If, Else, Ok, Err, Result, ValidationError],
        );
    }

    #[test]
    fn question_token() {
        use TokenKind::*;
        assert_kinds("x?", &[Ident, Question]);
    }

    #[test]
    fn v0_2_keywords() {
        use TokenKind::*;
        assert_kinds(
            "enum match Option record self Some None is",
            &[Enum, Match, Option, Record, Self_, Some, None, Is],
        );
    }

    #[test]
    fn pipe_and_pipe_pipe_disambiguated() {
        use TokenKind::*;
        assert_kinds("| || |", &[Pipe, PipePipe, Pipe]);
    }

    #[test]
    fn v0_7_keywords() {
        use TokenKind::*;
        assert_kinds("assert expect mocks test", &[Assert, Expect, Mocks, Test]);
    }

    #[test]
    fn fat_arrow_and_underscore() {
        use TokenKind::*;
        assert_kinds("_ =>", &[Underscore, FatArrow]);
    }

    // -- v0.43 string interpolation --

    #[test]
    fn interp_string_is_one_token() {
        use TokenKind::*;
        assert_kinds(r#""Hello, \(name)!""#, &[InterpStr]);
        // Without a hole the string remains a `StrLit` (the logos path).
        assert_kinds(r#""Hello, world""#, &[StrLit]);
    }

    #[test]
    fn interp_balances_nested_parens_and_strings() {
        use TokenKind::*;
        // The call's own `)` in `f(x)` must not terminate the hole.
        assert_kinds(r#""= \(f(x))""#, &[InterpStr]);
        // Nor may a `)` that sits inside a string nested in the hole.
        assert_kinds(r#""= \(label(")"))""#, &[InterpStr]);
        // An interpolated string nested inside a hole.
        assert_kinds(r#""out \("in \(x)")""#, &[InterpStr]);
    }

    #[test]
    fn escaped_open_paren_is_not_a_hole() {
        use TokenKind::*;
        // `\\(` means an escaped backslash and then a plain `(`, so there
        // is no hole and the logos path yields an ordinary `StrLit`.
        assert_kinds(r#""a \\(b) c""#, &[StrLit]);
    }

    #[test]
    fn unterminated_hole_is_an_error() {
        // The line ends before the hole's closing `)` appears.
        assert_lex_error(
            "\"value \\(x + 1\n\"",
            "bynk.lex.unterminated_interpolation",
        );
    }

    #[test]
    fn unterminated_interp_string_is_an_error() {
        // The hole is closed, but a newline arrives before the closing `"`.
        assert_lex_error("\"value \\(x) more\n", "bynk.lex.unterminated_string");
    }

    #[test]
    fn bad_escape_in_interp_string_is_an_error() {
        assert_lex_error(r#""a \q \(x)""#, "bynk.lex.bad_escape");
    }
}
bynk_syntax/lexer.rs

bynk_syntax/
lexer.rs