bynk_syntax/lexer.rs
1//! Lexer for Bynk v0.
2//!
3//! Token kinds correspond to the terminals defined in the grammar (spec §3
4//! and §4). Whitespace is skipped; line comments are emitted as `Comment`
5//! tokens so the formatter can preserve them through round-trips (v1.1 LSP
6//! spec §3.5). Doc blocks (`---`) are emitted as `DocBlock` tokens, lexed
7//! outside of logos (see [`tokenize`]).
8
9use logos::Logos;
10
11use crate::error::CompileError;
12use crate::span::Span;
13
/// v0.142 (ADR 0166): remove the `_` digit separators from a numeric literal's
/// lexeme so the remaining text can be handed to a numeric parser.
///
/// Every `_` in `lexeme` is dropped; no other character is touched. The
/// `IntLit`/`FloatLit` regexes only admit `_` between two digit groups, so for
/// lexer-produced input the result is the literal without its visual grouping.
///
/// Returns `Cow::Borrowed` when the lexeme holds no `_` (no allocation) and
/// `Cow::Owned` otherwise.
pub(crate) fn strip_digit_separators(lexeme: &str) -> std::borrow::Cow<'_, str> {
    if !lexeme.contains('_') {
        return std::borrow::Cow::Borrowed(lexeme);
    }
    let digits: String = lexeme.chars().filter(|&c| c != '_').collect();
    std::borrow::Cow::Owned(digits)
}
26
/// Token kinds. Discriminants without payload data; the lexeme is recovered
/// from the source string via the token's [`Span`].
///
/// Most variants are matched directly by the logos-generated lexer through
/// their `#[token]`/`#[regex]` attribute. Three variants carry no attribute and
/// are never produced by logos: [`TokenKind::DocBlock`], [`TokenKind::Comment`]
/// and [`TokenKind::InterpStr`] are inserted by [`tokenize`], which scans for
/// them by hand.
///
/// Note: `--` line comments and `---` doc block markers are handled outside
/// logos (see [`tokenize`]), because doc blocks are delimited by `---` lines
/// containing only the marker and may span multiple source lines.
#[derive(Logos, Debug, Clone, Copy, PartialEq, Eq)]
#[logos(skip r"[ \t\r\n]+")]
pub enum TokenKind {
    // Keywords
    #[token("commons")]
    Commons,
    #[token("type")]
    Type,
    #[token("fn")]
    Fn,
    #[token("where")]
    Where,
    #[token("and")]
    And,
    #[token("true")]
    True,
    #[token("false")]
    False,
    #[token("Int")]
    Int,
    #[token("String")]
    String,
    #[token("Bool")]
    Bool,
    // v0.21 keyword
    #[token("Float")]
    Float,
    // v0.86 keyword (ADR 0112): the `Duration` base type.
    #[token("Duration")]
    Duration,
    // v0.90 keyword (ADR 0114): the `Instant` base type.
    #[token("Instant")]
    Instant,
    // v0.110 keyword (ADR 0142): the `Bytes` base type.
    #[token("Bytes")]
    Bytes,
    // v0.1 keywords
    #[token("let")]
    Let,
    #[token("if")]
    If,
    #[token("else")]
    Else,
    #[token("Ok")]
    Ok,
    #[token("Err")]
    Err,
    #[token("Result")]
    Result,
    #[token("ValidationError")]
    ValidationError,
    // v0.22b keyword
    #[token("JsonError")]
    JsonError,
    // v0.2 keywords
    #[token("enum")]
    Enum,
    #[token("match")]
    Match,
    #[token("Option")]
    Option,
    #[token("record")]
    Record,
    // Named `Self_` because `Self` is a reserved word in Rust.
    #[token("self")]
    Self_,
    #[token("Some")]
    Some,
    #[token("None")]
    None,
    #[token("is")]
    Is,
    // v0.3 keywords
    #[token("opaque")]
    Opaque,
    #[token("uses")]
    Uses,
    // v0.4 keywords
    #[token("context")]
    Context,
    #[token("consumes")]
    Consumes,
    #[token("exports")]
    Exports,
    #[token("transparent")]
    Transparent,
    // v0.6 keywords
    #[token("as")]
    As,
    // v0.7 keywords (v0.112: `assert`→`expect`, `test`→`suite`/`case`;
    // v0.118: `mocks` retired — test doubles are `provides` at a seam)
    #[token("expect")]
    Expect,
    #[token("suite")]
    Suite,
    #[token("case")]
    Case,
    // v0.114 keyword — generative tests (testing track slice 2). `for` and `all`
    // are deliberately *not* keywords: `all` is a list combinator (`all(xs, p)`)
    // and must stay a usable identifier. The `for all` binder is parsed
    // contextually (two identifiers) inside a `property` body instead.
    #[token("property")]
    Property,
    // v0.17 keywords
    #[token("adapter")]
    Adapter,
    #[token("binding")]
    Binding,
    // v0.5 keywords
    #[token("agent")]
    Agent,
    #[token("capability")]
    Capability,
    #[token("Effect")]
    Effect,
    #[token("given")]
    Given,
    #[token("on")]
    On,
    // v0.9 keyword
    #[token("http")]
    Http,
    // v0.10a keyword
    #[token("cron")]
    Cron,
    // v0.10b keyword
    #[token("queue")]
    Queue,
    // v0.44 keywords: `from` heads a service's protocol clause; `protocol` is
    // reserved (protocols are a closed, compiler-known set — no declaration kind).
    #[token("from")]
    From,
    #[token("protocol")]
    Protocol,
    #[token("provides")]
    Provides,
    #[token("service")]
    Service,
    // v0.45 keywords: `actor` heads a boundary-contract declaration; `by`
    // heads a handler's actor clause.
    #[token("actor")]
    Actor,
    #[token("by")]
    By,
    // v0.80 keywords: `invariant` heads an agent invariant declaration; `implies`
    // is the directional logical-implication operator (`P implies Q` ≡ `!P || Q`).
    #[token("invariant")]
    Invariant,
    #[token("implies")]
    Implies,
    // v0.115 keywords — function contracts (testing track slice 3). `requires`
    // and `ensures` head a contract clause on a `fn` signature (between the
    // return type and the body). `result` is deliberately *not* a keyword: it is
    // the ordinary value name outside a contract, so it stays a usable
    // identifier; inside an `ensures` predicate it is bound contextually as the
    // function's return value (parsed by scope, like `for`/`all` in slice 2).
    // Distinct from ADR 0127's capability `@requires` annotation.
    #[token("requires")]
    Requires,
    #[token("ensures")]
    Ensures,
    // v0.116 keyword — step invariants (testing track slice 4). `transition` heads
    // an agent step-invariant declaration (beside `invariant`), a predicate over
    // the pre- and post-commit state pair. `old` and `new` are deliberately *not*
    // keywords: they stay ordinary value names outside a `transition`, and inside a
    // `transition` predicate they are bound contextually to the old/new state
    // records (parsed by scope, like `result` in an `ensures`).
    #[token("transition")]
    Transition,
    /// `...` — used in record-spread expressions (v0.5).
    #[token("...")]
    DotDotDot,
    /// `<-` — Effect bind operator (v0.5).
    #[token("<-")]
    LArrow,
    /// `~>` — asynchronous fire-and-forget send marker (v0.79). A leading
    /// statement marker, never on the RHS of a `let`; distinct from `<-` so the
    /// call site shows whether the caller waits.
    #[token("~>")]
    TildeArrow,
    /// `:=` — Cell write (v0.81, storage track). A handler statement
    /// `cell := expr`; distinct from `=` (binding) and `:` (annotation). Longer
    /// than `:`/`=` so logos matches it as one token.
    #[token(":=")]
    ColonEq,

    /// A documentation block: `---` line ... `---` line. The token's span
    /// covers the full block including both `---` markers. The body content
    /// is recovered from the source via the span (see [`doc_block_content`]).
    /// Inserted by [`tokenize`]; not lexed by logos directly.
    DocBlock,

    /// A line comment: `-- ...` running to end of line. The span starts at
    /// the `--` marker and runs through the last character before the
    /// terminating newline (exclusive). The trivia body (the text after the
    /// `--` marker) is recovered from the source via the span. Inserted by
    /// [`tokenize`]; not lexed by logos directly so it cannot be mistaken
    /// for an `--` operator sequence.
    Comment,

    // Identifier: an ASCII letter followed by any number of ASCII letters,
    // digits, or `_`.
    #[regex(r"[A-Za-z][A-Za-z0-9_]*")]
    Ident,

    // Literals. v0.142 (ADR 0166): an `_` digit separator may appear between
    // digits (`1_048_576`) — never leading, trailing, or doubled (each `_` must
    // sit between two digit groups). The separators are stripped before the value
    // is parsed; they are purely visual.
    #[regex(r"[0-9]+(_[0-9]+)*")]
    IntLit,
    // A float literal: fraction with a digit on both sides of the `.`, an
    // exponent, or both (v0.21 §3). `1.` and `.5` are NOT float literals —
    // the digit-both-sides rule keeps `2.5.round()` / `1.toFloat()` lexing
    // as method calls on numeric literals. Digit separators (v0.142) may appear
    // in any digit group, including the exponent.
    #[regex(
        r"[0-9]+(_[0-9]+)*\.[0-9]+(_[0-9]+)*([eE][+-]?[0-9]+(_[0-9]+)*)?|[0-9]+(_[0-9]+)*[eE][+-]?[0-9]+(_[0-9]+)*"
    )]
    FloatLit,
    // A double-quoted string with simple escapes. The body excludes the closing
    // quote; we accept any non-quote/non-backslash/non-newline char, or a
    // backslash followed by one of the four allowed escapes (`\n`, `\t`, `\"`,
    // `\\`).
    #[regex(r#""([^"\\\n]|\\[nt"\\])*""#)]
    StrLit,
    // An interpolated string `"… \(expr) …"` (v0.43). Hand-scanned in
    // `tokenize` (logos cannot balance the holes' parens), never produced by
    // the logos lexer — like [`TokenKind::DocBlock`]/[`TokenKind::Comment`].
    // The span covers the whole `"…"`; the parser splits chunks from holes.
    InterpStr,

    // Multi-char operators
    #[token("->")]
    Arrow,
    #[token("==")]
    EqEq,
    #[token("!=")]
    BangEq,
    #[token("<=")]
    LtEq,
    #[token(">=")]
    GtEq,
    #[token("&&")]
    AmpAmp,
    #[token("||")]
    PipePipe,

    // Single-char operators
    #[token("+")]
    Plus,
    #[token("-")]
    Minus,
    #[token("*")]
    Star,
    #[token("/")]
    Slash,
    #[token("!")]
    Bang,
    #[token("=")]
    Eq,
    #[token("<")]
    Lt,
    #[token(">")]
    Gt,
    // v0.1 postfix operator
    #[token("?")]
    Question,
    // v0.2 match-arm arrow
    #[token("=>")]
    FatArrow,
    // v0.2 wildcard pattern.
    // NOTE(review): an earlier version of this comment said `_` is also a valid
    // identifier start and that `_foo` lexes as a single `Ident`. The `Ident`
    // regex above requires a leading ASCII letter, so as written `_foo` lexes
    // as `Underscore` followed by `Ident`. Confirm which behaviour is intended.
    #[token("_")]
    Underscore,
    // v0.2 sum-type variant separator (also used as future bitwise OR);
    // single `|` distinct from `||`.
    #[token("|")]
    Pipe,
    /// `@` — storage-annotation marker (v0.85, storage track; ADR 0111). Leads a
    /// `@name(args)` annotation on a `store` field (`@ttl(…)`/`@indexed(…)`); it
    /// appears only in store-field-declaration position, never as an expression
    /// operator.
    #[token("@")]
    At,

    // Punctuation
    #[token("(")]
    LParen,
    #[token(")")]
    RParen,
    #[token("{")]
    LBrace,
    #[token("}")]
    RBrace,
    #[token("[")]
    LBracket,
    #[token("]")]
    RBracket,
    #[token(",")]
    Comma,
    #[token(":")]
    Colon,
    #[token(".")]
    Dot,
}
336
337impl TokenKind {
338 /// Human-readable display name for diagnostics.
339 pub fn describe(self) -> &'static str {
340 use TokenKind::*;
341 match self {
342 Commons => "`commons`",
343 Type => "`type`",
344 Fn => "`fn`",
345 Where => "`where`",
346 And => "`and`",
347 True => "`true`",
348 False => "`false`",
349 Int => "`Int`",
350 String => "`String`",
351 Bool => "`Bool`",
352 Float => "`Float`",
353 Duration => "`Duration`",
354 Instant => "`Instant`",
355 Bytes => "`Bytes`",
356 Let => "`let`",
357 If => "`if`",
358 Else => "`else`",
359 Ok => "`Ok`",
360 Err => "`Err`",
361 Result => "`Result`",
362 ValidationError => "`ValidationError`",
363 JsonError => "`JsonError`",
364 Enum => "`enum`",
365 Match => "`match`",
366 Option => "`Option`",
367 Record => "`record`",
368 Self_ => "`self`",
369 Some => "`Some`",
370 None => "`None`",
371 Is => "`is`",
372 Opaque => "`opaque`",
373 Uses => "`uses`",
374 Context => "`context`",
375 Consumes => "`consumes`",
376 Exports => "`exports`",
377 Transparent => "`transparent`",
378 As => "`as`",
379 Expect => "`expect`",
380 Suite => "`suite`",
381 Case => "`case`",
382 Property => "`property`",
383 Adapter => "`adapter`",
384 Binding => "`binding`",
385 Agent => "`agent`",
386 Capability => "`capability`",
387 Effect => "`Effect`",
388 Given => "`given`",
389 On => "`on`",
390 Http => "`http`",
391 Cron => "`cron`",
392 Queue => "`queue`",
393 From => "`from`",
394 Protocol => "`protocol`",
395 Provides => "`provides`",
396 Service => "`service`",
397 Actor => "`actor`",
398 By => "`by`",
399 Invariant => "`invariant`",
400 Implies => "`implies`",
401 Requires => "`requires`",
402 Ensures => "`ensures`",
403 Transition => "`transition`",
404 ColonEq => "`:=`",
405 DotDotDot => "`...`",
406 LArrow => "`<-`",
407 TildeArrow => "`~>`",
408 DocBlock => "documentation block",
409 Comment => "line comment",
410 Ident => "identifier",
411 IntLit => "integer literal",
412 FloatLit => "float literal",
413 StrLit => "string literal",
414 InterpStr => "interpolated string",
415 Arrow => "`->`",
416 EqEq => "`==`",
417 BangEq => "`!=`",
418 LtEq => "`<=`",
419 GtEq => "`>=`",
420 AmpAmp => "`&&`",
421 PipePipe => "`||`",
422 Plus => "`+`",
423 Minus => "`-`",
424 Star => "`*`",
425 Slash => "`/`",
426 Bang => "`!`",
427 Eq => "`=`",
428 Lt => "`<`",
429 Gt => "`>`",
430 Question => "`?`",
431 FatArrow => "`=>`",
432 Underscore => "`_`",
433 Pipe => "`|`",
434 At => "`@`",
435 LParen => "`(`",
436 RParen => "`)`",
437 LBrace => "`{`",
438 RBrace => "`}`",
439 LBracket => "`[`",
440 RBracket => "`]`",
441 Comma => "`,`",
442 Colon => "`:`",
443 Dot => "`.`",
444 }
445 }
446}
447
/// A token plus its source span.
///
/// The token carries no text of its own: the lexeme is recovered by slicing
/// the source string with `span`.
#[derive(Debug, Clone, Copy)]
pub struct Token {
    /// Which kind of token this is.
    pub kind: TokenKind,
    /// Byte range of the token within the source string it was lexed from.
    pub span: Span,
}
454
455/// Tokenise a source string. Returns the full token vector or the first
456/// lexical error.
457///
458/// Doc blocks (`---` ... `---`) and line comments (`-- ...`) are recognised
459/// outside the logos-generated lexer: we scan the source one segment at a
460/// time, dispatching to logos for ordinary tokens between non-token spans.
461pub fn tokenize(source: &str) -> Result<Vec<Token>, CompileError> {
462 let mut tokens = Vec::new();
463 let bytes = source.as_bytes();
464 let mut pos = 0;
465 while pos < bytes.len() {
466 // Detect a `---` doc-block marker at the start of a line (the line may
467 // begin with leading whitespace; the marker itself must be alone on
468 // its line).
469 if let Some(open_end) = doc_block_open_at(source, pos) {
470 // Find the matching closing `---` line.
471 match doc_block_close(source, open_end) {
472 Some((close_start, close_end)) => {
473 let span = Span::new(pos, close_end);
474 tokens.push(Token {
475 kind: TokenKind::DocBlock,
476 span,
477 });
478 let _ = close_start;
479 pos = close_end;
480 continue;
481 }
482 None => {
483 return Err(CompileError::new(
484 "bynk.lex.unclosed_doc_block",
485 Span::new(pos, open_end),
486 "documentation block opened but never closed",
487 )
488 .with_note(
489 "a doc block must be terminated by another `---` on a line by itself",
490 ));
491 }
492 }
493 }
494 // A `--` line comment: emit a `Comment` token covering everything
495 // up to (but not including) the terminating newline. Doc-block
496 // detection above already ruled out a `---` marker at line start
497 // — and once we've consumed past the leading `--`, any further
498 // dashes are part of the comment body. Preserving comments as
499 // trivia tokens lets the parser attach them to declarations so
500 // the formatter can emit them in place (v1.1 LSP spec §3.5).
501 if pos + 1 < bytes.len() && bytes[pos] == b'-' && bytes[pos + 1] == b'-' {
502 let start = pos;
503 while pos < bytes.len() && bytes[pos] != b'\n' {
504 pos += 1;
505 }
506 tokens.push(Token {
507 kind: TokenKind::Comment,
508 span: Span::new(start, pos),
509 });
510 continue;
511 }
512 // Skip ordinary whitespace inline (logos handles it too, but we may
513 // be in the middle of the source between specials).
514 if matches!(bytes[pos], b' ' | b'\t' | b'\r' | b'\n') {
515 pos += 1;
516 continue;
517 }
518 // An interpolated string `"… \(expr) …"` (v0.43): only strings that
519 // actually contain a `\(` hole are hand-scanned here; plain strings
520 // fall through to the logos `StrLit` path unchanged. `\(` is an
521 // invalid escape in the logos grammar, so this never re-routes a
522 // currently-valid literal.
523 if bytes[pos] == b'"' && has_interp_hole(bytes, pos) {
524 let end = scan_str(bytes, source, pos)?;
525 tokens.push(Token {
526 kind: TokenKind::InterpStr,
527 span: Span::new(pos, end),
528 });
529 pos = end;
530 continue;
531 }
532 // Otherwise dispatch a single logos token starting at `pos`.
533 let mut lex = TokenKind::lexer(&source[pos..]);
534 let Some(result) = lex.next() else {
535 // No token at this position; treat as unexpected character so
536 // the user sees something useful.
537 let ch = source[pos..].chars().next().unwrap_or('\0');
538 let span = Span::new(pos, pos + ch.len_utf8());
539 return Err(CompileError::new(
540 "bynk.lex.unexpected_character",
541 span,
542 format!("unexpected character `{ch}`"),
543 ));
544 };
545 let local = lex.span();
546 let span: Span = Span::new(pos + local.start, pos + local.end);
547 match result {
548 Ok(kind) => {
549 if kind == TokenKind::IntLit {
550 let slice = &source[span.range()];
551 if strip_digit_separators(slice).parse::<i64>().is_err() {
552 return Err(CompileError::new(
553 "bynk.lex.integer_overflow",
554 span,
555 format!(
556 "integer literal `{slice}` is out of range for a 64-bit signed integer"
557 ),
558 )
559 .with_note("the range is -2^63 to 2^63 - 1"));
560 }
561 }
562 if kind == TokenKind::FloatLit {
563 let slice = &source[span.range()];
564 match strip_digit_separators(slice).parse::<f64>() {
565 Ok(v) if v.is_finite() => {}
566 _ => {
567 return Err(CompileError::new(
568 "bynk.lex.float_literal_overflow",
569 span,
570 format!(
571 "float literal `{slice}` is out of range for a 64-bit float"
572 ),
573 )
574 .with_note(
575 "the literal does not fit a finite IEEE 754 double; \
576 the largest finite value is ~1.8e308",
577 ));
578 }
579 }
580 }
581 tokens.push(Token { kind, span });
582 pos = span.end;
583 }
584 Err(()) => {
585 let slice = &source[span.range()];
586 let ch = slice.chars().next().unwrap_or('\0');
587 let err = if ch == '"' {
588 CompileError::new(
589 "bynk.lex.unterminated_string",
590 span,
591 "unterminated string literal",
592 )
593 .with_note(
594 "string literals must close with `\"` on the same line; \
595 supported escapes are `\\n`, `\\t`, `\\\"`, `\\\\`",
596 )
597 } else {
598 CompileError::new(
599 "bynk.lex.unexpected_character",
600 span,
601 format!("unexpected character `{ch}`"),
602 )
603 };
604 return Err(err);
605 }
606 }
607 }
608 Ok(tokens)
609}
610
611/// Like [`tokenize`], but with every interpolated-string token replaced by the
612/// tokens of its holes — each hole's bytes re-lexed and its token spans rebased
613/// to absolute source positions (the same rebase [`crate::parser`] applies when
614/// parsing a hole), recursing through nested interpolation. Chunk (literal) text
615/// between holes yields no tokens.
616///
617/// An interpolated string lexes to a single opaque `InterpStr` token, so the
618/// LSP's token-based cursor resolution (hover, go-to-definition, references,
619/// semantic tokens) is otherwise blind to identifiers inside `"… \(name) …"`.
620/// Expanding the holes makes those identifiers visible as ordinary `Ident`
621/// tokens with their real spans. (Issue #473.)
622///
623/// On a malformed interpolation (an `InterpStr` whose holes don't split, or a
624/// hole whose bytes don't re-lex) the offending token is kept opaque rather than
625/// dropped, so resolution degrades to the pre-fix behaviour instead of losing
626/// tokens.
627pub fn tokenize_expanding_holes(source: &str) -> Result<Vec<Token>, CompileError> {
628 let mut out = Vec::new();
629 for tok in tokenize(source)? {
630 expand_hole_token(source, tok, &mut out);
631 }
632 Ok(out)
633}
634
635/// Push `tok` onto `out`, expanding it into its holes' tokens if it is an
636/// `InterpStr` (see [`tokenize_expanding_holes`]); otherwise push it as-is.
637fn expand_hole_token(source: &str, tok: Token, out: &mut Vec<Token>) {
638 if tok.kind != TokenKind::InterpStr {
639 out.push(tok);
640 return;
641 }
642 let Ok(segments) = split_interp(source, tok.span) else {
643 out.push(tok); // malformed interpolation — keep the opaque token
644 return;
645 };
646 for segment in segments {
647 let InterpSegment::Hole(hole) = segment else {
648 continue; // chunk text carries no tokens
649 };
650 let Ok(hole_tokens) = tokenize(&source[hole.range()]) else {
651 continue;
652 };
653 for mut t in hole_tokens {
654 // Rebase the hole's local spans to absolute source positions.
655 t.span = Span::new(t.span.start + hole.start, t.span.end + hole.start);
656 expand_hole_token(source, t, out); // recurse for nested interpolation
657 }
658 }
659}
660
/// Cheap routing pre-scan (v0.43): reports whether the string whose opening
/// quote is at `start` contains a `\(` interpolation hole before the string
/// closes or the line ends.
///
/// `tokenize` uses the answer to choose between hand-scanning the string as an
/// `InterpStr` and leaving it to logos as a plain `StrLit`. The scan is
/// deliberately tolerant: it validates nothing, and any backslash that is not
/// followed by `(` simply skips the byte after it. A malformed string that
/// does contain a hole is therefore still routed to the hole-aware scanner,
/// which produces the precise error.
///
/// Returns `false` when a closing `"`, a newline, or the end of input is
/// reached before any `\(`.
fn has_interp_hole(bytes: &[u8], start: usize) -> bool {
    let mut cursor = start + 1; // first byte after the opening quote
    while let Some(&byte) = bytes.get(cursor) {
        cursor += match byte {
            b'\n' | b'"' => return false,
            b'\\' if bytes.get(cursor + 1) == Some(&b'(') => return true,
            // Some other escape: step over the backslash and the escaped byte
            // so an escaped quote does not end the scan.
            b'\\' => 2,
            _ => 1,
        };
    }
    false
}
682
683/// Scan a double-quoted string starting at `start` (the opening `"`), returning
684/// the byte offset just past the closing `"`. Recognises the four simple
685/// escapes plus `\(…)` interpolation holes, whose parens are balanced (and
686/// whose nested strings are skipped) by [`scan_hole`]. (v0.43.)
687fn scan_str(bytes: &[u8], source: &str, start: usize) -> Result<usize, CompileError> {
688 debug_assert_eq!(bytes[start], b'"');
689 let mut i = start + 1;
690 loop {
691 if i >= bytes.len() || bytes[i] == b'\n' {
692 return Err(CompileError::new(
693 "bynk.lex.unterminated_string",
694 Span::new(start, i.min(bytes.len())),
695 "unterminated string literal",
696 )
697 .with_note(
698 "string literals must close with `\"` on the same line; \
699 supported escapes are `\\n`, `\\t`, `\\\"`, `\\\\`, and `\\(…)` interpolation",
700 ));
701 }
702 match bytes[i] {
703 b'"' => return Ok(i + 1),
704 b'\\' => match bytes.get(i + 1) {
705 Some(b'n' | b't' | b'"' | b'\\') => i += 2,
706 Some(b'(') => i = scan_hole(bytes, source, i + 2)?,
707 other => {
708 let shown = other.map(|b| (*b as char).to_string()).unwrap_or_default();
709 return Err(CompileError::new(
710 "bynk.lex.bad_escape",
711 Span::new(i, (i + 2).min(bytes.len())),
712 format!("invalid escape sequence `\\{shown}` in string literal"),
713 )
714 .with_note("supported escapes: \\n \\t \\\" \\\\ \\(…)"));
715 }
716 },
717 // Any other byte advances one position. UTF-8 continuation bytes
718 // are all >= 0x80, so they never collide with the ASCII specials.
719 _ => i += 1,
720 }
721 }
722}
723
724/// Scan an interpolation hole body. `start` points just past the `\(`; returns
725/// the offset just past the matching `)`. Tracks paren depth and skips nested
726/// strings (whose own parens must not close the hole), recursing through
727/// [`scan_str`] so nested interpolation nests correctly. (v0.43.)
728fn scan_hole(bytes: &[u8], source: &str, start: usize) -> Result<usize, CompileError> {
729 let mut i = start;
730 let mut depth = 1usize;
731 loop {
732 if i >= bytes.len() || bytes[i] == b'\n' {
733 return Err(CompileError::new(
734 "bynk.lex.unterminated_interpolation",
735 Span::new(start.saturating_sub(2), i.min(bytes.len())),
736 "unterminated interpolation hole",
737 )
738 .with_note(
739 "an interpolation hole `\\(…)` must close with a matching `)` on the same line",
740 ));
741 }
742 match bytes[i] {
743 b'(' => {
744 depth += 1;
745 i += 1;
746 }
747 b')' => {
748 depth -= 1;
749 i += 1;
750 if depth == 0 {
751 return Ok(i);
752 }
753 }
754 b'"' => i = scan_str(bytes, source, i)?,
755 _ => i += 1,
756 }
757 }
758}
759
/// One segment of a split interpolated string (v0.43): literal text (escapes
/// resolved) or the absolute source span of a hole's expression (the bytes
/// between `\(` and its matching `)`). The parser turns the latter into a real
/// `Expr`; the lexer owns only the scanning.
///
/// Produced by [`split_interp`].
pub(crate) enum InterpSegment {
    /// Literal text between holes, with `\n`, `\t`, `\"` and `\\` already
    /// replaced by the characters they denote.
    Chunk(String),
    /// Span of a hole's body in the source, excluding the `\(` and the
    /// closing `)`.
    Hole(Span),
}
768
769/// Split an `InterpStr` token (its `span` covers the whole `"…"`) into chunks
770/// and hole spans. Escapes in the chunks are resolved here (mirroring
771/// [`parse_string_literal`]); holes are returned as spans for the parser to
772/// re-lex and parse as expressions. (v0.43.)
773pub(crate) fn split_interp(source: &str, span: Span) -> Result<Vec<InterpSegment>, CompileError> {
774 let bytes = source.as_bytes();
775 let inner_end = span.end - 1; // the closing `"`
776 let mut segments = Vec::new();
777 let mut chunk = String::new();
778 let mut i = span.start + 1; // past the opening `"`
779 while i < inner_end {
780 match bytes[i] {
781 b'\\' => match bytes[i + 1] {
782 b'n' => {
783 chunk.push('\n');
784 i += 2;
785 }
786 b't' => {
787 chunk.push('\t');
788 i += 2;
789 }
790 b'"' => {
791 chunk.push('"');
792 i += 2;
793 }
794 b'\\' => {
795 chunk.push('\\');
796 i += 2;
797 }
798 b'(' => {
799 if !chunk.is_empty() {
800 segments.push(InterpSegment::Chunk(std::mem::take(&mut chunk)));
801 }
802 let hole_start = i + 2;
803 let after = scan_hole(bytes, source, hole_start)?;
804 // `after` is one past the matching `)`; the hole body is
805 // everything up to that `)`.
806 segments.push(InterpSegment::Hole(Span::new(hole_start, after - 1)));
807 i = after;
808 }
809 // The lexer already validated every escape, so nothing else
810 // can appear here.
811 other => unreachable!("unvalidated escape `\\{}` in InterpStr", other as char),
812 },
813 _ => {
814 let ch = source[i..].chars().next().unwrap();
815 chunk.push(ch);
816 i += ch.len_utf8();
817 }
818 }
819 }
820 if !chunk.is_empty() {
821 segments.push(InterpSegment::Chunk(chunk));
822 }
823 Ok(segments)
824}
825
826/// If a `---` doc-block marker line starts at or shortly after `pos` (which
827/// must be at a line boundary), return the byte offset just past the marker
828/// line (after the terminating newline, or at EOF). The doc-block grammar
829/// requires the marker to be alone on its line; leading horizontal whitespace
830/// is allowed and ignored.
831fn doc_block_open_at(source: &str, pos: usize) -> Option<usize> {
832 let bytes = source.as_bytes();
833 if !at_line_start(source, pos) {
834 return None;
835 }
836 // Skip leading horizontal whitespace.
837 let mut i = pos;
838 while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
839 i += 1;
840 }
841 if i + 3 > bytes.len() {
842 return None;
843 }
844 if &bytes[i..i + 3] != b"---" {
845 return None;
846 }
847 i += 3;
848 // The marker may have additional trailing dashes (per spec "three or more
849 // consecutive hyphens"). Consume them.
850 while i < bytes.len() && bytes[i] == b'-' {
851 i += 1;
852 }
853 // After the dashes, allow only horizontal whitespace then newline/EOF.
854 while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t' || bytes[i] == b'\r') {
855 i += 1;
856 }
857 if i == bytes.len() {
858 return Some(i);
859 }
860 if bytes[i] == b'\n' {
861 return Some(i + 1);
862 }
863 None
864}
865
866/// Find the next closing `---` line at or after `pos`. Returns
867/// `(start_of_line, end_of_line)` (`end_of_line` is just past the
868/// terminating newline, or at EOF).
869fn doc_block_close(source: &str, mut pos: usize) -> Option<(usize, usize)> {
870 let bytes = source.as_bytes();
871 while pos < bytes.len() {
872 // Advance pos to the start of a line.
873 let line_start = pos;
874 // Find the end of this line.
875 let mut line_end = line_start;
876 while line_end < bytes.len() && bytes[line_end] != b'\n' {
877 line_end += 1;
878 }
879 // Check this line.
880 if let Some(end) = doc_block_open_at(source, line_start) {
881 return Some((line_start, end));
882 }
883 // Move to the next line.
884 pos = if line_end < bytes.len() {
885 line_end + 1
886 } else {
887 line_end
888 };
889 }
890 None
891}
892
/// Reports whether byte offset `pos` is the first byte of a line (column 0):
/// either the very start of `source`, or the byte right after a `\n`.
fn at_line_start(source: &str, pos: usize) -> bool {
    match pos.checked_sub(1) {
        None => true,
        Some(previous) => source.as_bytes()[previous] == b'\n',
    }
}
901
902/// Extract the body content of a doc-block token from its source span.
903/// Strips the leading and trailing `---` marker lines and returns the body
904/// verbatim. If every non-empty content line begins with the same horizontal
905/// whitespace prefix (e.g., because the doc block sits inside a brace-form
906/// commons body), that common prefix is removed so the body reads naturally
907/// when emitted as JSDoc.
908pub fn doc_block_content(source: &str, span: Span) -> String {
909 let slice = &source[span.range()];
910 // Drop the first line (opening marker).
911 let after_open = match slice.find('\n') {
912 Some(i) => &slice[i + 1..],
913 None => return String::new(),
914 };
915 let bytes = after_open.as_bytes();
916 // Trim the trailing closing-marker line.
917 let mut i = bytes.len();
918 if i > 0 && bytes[i - 1] == b'\n' {
919 i -= 1;
920 }
921 while i > 0 && matches!(bytes[i - 1], b' ' | b'\t' | b'\r') {
922 i -= 1;
923 }
924 while i > 0 && bytes[i - 1] == b'-' {
925 i -= 1;
926 }
927 if i > 0 && bytes[i - 1] == b'\n' {
928 i -= 1;
929 }
930 let body = &after_open[..i];
931
932 // Compute the common leading-whitespace prefix across all non-empty lines
933 // and strip it. This lets writers indent the doc block alongside the
934 // declaration it documents without bleeding the indent into the JSDoc.
935 let common: Option<usize> = body
936 .lines()
937 .filter(|l| !l.trim().is_empty())
938 .map(|l| l.bytes().take_while(|&b| b == b' ' || b == b'\t').count())
939 .min();
940 let strip = common.unwrap_or(0);
941 if strip == 0 {
942 return body.to_string();
943 }
944 let mut out = String::with_capacity(body.len());
945 let mut first = true;
946 for line in body.lines() {
947 if !first {
948 out.push('\n');
949 }
950 first = false;
951 if line.trim().is_empty() {
952 // Preserve blank lines.
953 continue;
954 }
955 let leading: usize = line
956 .bytes()
957 .take_while(|&b| b == b' ' || b == b'\t')
958 .count();
959 let drop = strip.min(leading);
960 out.push_str(&line[drop..]);
961 }
962 out
963}
964
965/// Extract the body of a `Comment` trivia token: everything after the
966/// leading `--` marker, preserving its inline whitespace verbatim. Used by
967/// the parser when attaching comments to declarations.
968pub fn comment_body(source: &str, span: Span) -> &str {
969 let slice = &source[span.range()];
970 // Strip leading "--" if present (defensive — the lexer always emits
971 // Comment tokens whose span begins with `--`).
972 slice.strip_prefix("--").unwrap_or(slice)
973}
974
/// Reports whether a blank line separates byte offsets `from` (inclusive) and
/// `to` (exclusive) in `source`. Used by the parser to detect orphan doc
/// blocks.
///
/// The range is scanned left to right, skipping spaces, tabs and carriage
/// returns. The answer is `true` exactly when the first other byte is a
/// newline. An empty or inverted range, a range of only horizontal whitespace,
/// or one whose first significant byte is anything else yields `false`.
///
/// Why a single newline is enough: a doc-block token's span ends just past the
/// closing marker line's terminating newline. If the next declaration begins
/// on the immediately following line, the text in between holds no newline
/// (only optional indentation), so any newline found there implies at least
/// one entirely blank line between the doc block and the declaration.
pub fn has_blank_line_between(source: &str, from: usize, to: usize) -> bool {
    let bytes = source.as_bytes();
    // Bytes are indexed lazily, one at a time, as the scan reaches them.
    let first_significant = (from..to)
        .map(|i| bytes[i])
        .find(|b| !matches!(b, b' ' | b'\t' | b'\r'));
    first_significant == Some(b'\n')
}
1001
#[cfg(test)]
mod tests {
    use super::TokenKind as K;
    use super::*;

    /// Lexes `source`, which must be accepted, and returns only the token
    /// kinds in order.
    fn kinds(source: &str) -> Vec<TokenKind> {
        let tokens = tokenize(source).unwrap();
        tokens.iter().map(|token| token.kind).collect()
    }

    /// Same as [`kinds`], but through the hole-expanding entry point.
    fn expanded_kinds(source: &str) -> Vec<TokenKind> {
        let tokens = tokenize_expanding_holes(source).unwrap();
        tokens.iter().map(|token| token.kind).collect()
    }

    /// Asserts that lexing `source` is rejected with an error whose category
    /// equals `category`. Failures are reported at the calling test's line.
    #[track_caller]
    fn assert_lex_error(source: &str, category: &str) {
        let failure = tokenize(source).unwrap_err();
        assert_eq!(failure.category, category);
    }

    #[test]
    fn keywords_and_idents() {
        let lexed = kinds("commons type fn where and true false Int String Bool foo bar");
        assert_eq!(
            lexed,
            [
                K::Commons,
                K::Type,
                K::Fn,
                K::Where,
                K::And,
                K::True,
                K::False,
                K::Int,
                K::String,
                K::Bool,
                K::Ident,
                K::Ident,
            ],
        );
    }

    #[test]
    fn integer_and_string_literals() {
        let lexed = kinds(r#"0 42 "hello" "with\nescape""#);
        assert_eq!(lexed, [K::IntLit, K::IntLit, K::StrLit, K::StrLit]);
    }

    #[test]
    fn operators() {
        let lexed = kinds("-> == != <= >= && || + - * / ! = < > ( ) { } [ ] , : . @");
        assert_eq!(
            lexed,
            [
                K::Arrow,
                K::EqEq,
                K::BangEq,
                K::LtEq,
                K::GtEq,
                K::AmpAmp,
                K::PipePipe,
                K::Plus,
                K::Minus,
                K::Star,
                K::Slash,
                K::Bang,
                K::Eq,
                K::Lt,
                K::Gt,
                K::LParen,
                K::RParen,
                K::LBrace,
                K::RBrace,
                K::LBracket,
                K::RBracket,
                K::Comma,
                K::Colon,
                K::Dot,
                K::At,
            ],
        );
    }

    #[test]
    fn line_comments_emitted_as_trivia() {
        // v1.1: a `--` comment survives lexing as a `Comment` token, which
        // lets the formatter attach it and write it back out.
        let lexed = kinds("-- a comment\ntype X = Int -- trailing\n");
        assert_eq!(
            lexed,
            [K::Comment, K::Type, K::Ident, K::Eq, K::Int, K::Comment],
        );
    }

    #[test]
    fn comment_body_extracts_text_after_marker() {
        let source = "-- hello world\n";
        let tokens = tokenize(source).unwrap();
        assert_eq!(tokens.len(), 1);
        let only = &tokens[0];
        assert_eq!(only.kind, K::Comment);
        assert_eq!(comment_body(source, only.span), " hello world");
    }

    #[test]
    fn comment_does_not_consume_newline() {
        // Back-to-back comment lines must come out as two separate tokens:
        // the newline between them belongs to neither comment's span.
        assert_eq!(kinds("-- one\n-- two\n"), [K::Comment, K::Comment]);
    }

    #[test]
    fn unterminated_string_is_error() {
        assert_lex_error("\"oops\n", "bynk.lex.unterminated_string");
    }

    #[test]
    fn integer_overflow_is_error() {
        assert_lex_error("99999999999999999999", "bynk.lex.integer_overflow");
    }

    #[test]
    fn digit_separators_lex_as_one_number() {
        // v0.142 (ADR 0166): with `_` between digit groups the literal is
        // still one token, for Int and Float alike.
        assert_eq!(kinds("1_048_576"), [K::IntLit]);
        for float_source in ["1_000.500_5", "1_000e1_0"] {
            assert_eq!(
                kinds(float_source),
                [K::FloatLit],
                "source: {}",
                float_source
            );
        }
        // An in-range literal that carries separators is accepted, because
        // the value is checked only after the separators are stripped.
        assert!(tokenize("9_223_372_036_854_775_807").is_ok());
        // The separator-free value is still what overflow is judged on.
        assert_lex_error("9_999_999_999_999_999_999_9", "bynk.lex.integer_overflow");
    }

    #[test]
    fn strip_digit_separators_removes_underscores() {
        let cases = [("1_048_576", "1048576"), ("42", "42")];
        for (lexeme, stripped) in cases {
            assert_eq!(strip_digit_separators(lexeme), stripped);
        }
    }

    #[test]
    fn unexpected_character_is_error() {
        assert_lex_error("type X = Int $", "bynk.lex.unexpected_character");
    }

    #[test]
    fn v0_1_keywords() {
        let lexed = kinds("let if else Ok Err Result ValidationError");
        assert_eq!(
            lexed,
            [
                K::Let,
                K::If,
                K::Else,
                K::Ok,
                K::Err,
                K::Result,
                K::ValidationError,
            ],
        );
    }

    #[test]
    fn question_token() {
        assert_eq!(kinds("x?"), [K::Ident, K::Question]);
    }

    #[test]
    fn v0_2_keywords() {
        let lexed = kinds("enum match Option record self Some None is");
        assert_eq!(
            lexed,
            [
                K::Enum,
                K::Match,
                K::Option,
                K::Record,
                K::Self_,
                K::Some,
                K::None,
                K::Is,
            ],
        );
    }

    #[test]
    fn pipe_and_pipe_pipe_disambiguated() {
        assert_eq!(kinds("| || |"), [K::Pipe, K::PipePipe, K::Pipe]);
    }

    #[test]
    fn v0_7_keywords() {
        assert_eq!(kinds("expect suite case"), [K::Expect, K::Suite, K::Case]);
        // v0.118: `mocks` and `wires` were retired and now lex as ordinary
        // identifiers.
        assert_eq!(kinds("mocks wires"), [K::Ident, K::Ident]);
    }

    #[test]
    fn fat_arrow_and_underscore() {
        assert_eq!(kinds("_ =>"), [K::Underscore, K::FatArrow]);
    }

    // -- v0.43 string interpolation --

    #[test]
    fn interp_string_is_one_token() {
        assert_eq!(kinds(r#""Hello, \(name)!""#), [K::InterpStr]);
        // Without a hole the string remains a `StrLit`, via the logos path.
        assert_eq!(kinds(r#""Hello, world""#), [K::StrLit]);
    }

    #[test]
    fn interp_balances_nested_parens_and_strings() {
        let sources = [
            // The `)` of `f(x)` must not end the hole prematurely.
            r#""= \(f(x))""#,
            // Nor may a `)` sitting in a string nested inside the hole.
            r#""= \(label(")"))""#,
            // An interpolated string nested inside a hole.
            r#""out \("in \(x)")""#,
        ];
        for source in sources {
            assert_eq!(kinds(source), [K::InterpStr], "source: {}", source);
        }
    }

    // Issue #473: hole-expanding tokenisation makes identifiers inside `\(…)`
    // visible to the LSP's token-based cursor resolution.
    #[test]
    fn expanding_holes_exposes_hole_identifiers() {
        // The opaque `InterpStr` gives way to the tokens of its hole; the
        // surrounding chunk text (`Hello, ` / `!`) contributes none.
        assert_eq!(expanded_kinds(r#""Hello, \(name)!""#), [K::Ident]);
        // For a call hole, each token of the call expression is exposed.
        assert_eq!(
            expanded_kinds(r#""= \(f(x))""#),
            [K::Ident, K::LParen, K::Ident, K::RParen],
        );
        // Nested interpolation is followed down to the innermost hole.
        assert_eq!(expanded_kinds(r#""out \("in \(x)")""#), [K::Ident]);
        // A string with no hole is left alone.
        assert_eq!(expanded_kinds(r#""Hello, world""#), [K::StrLit]);
    }

    #[test]
    fn expanding_holes_rebases_spans_to_absolute() {
        let src = r#""Hello, \(name)!""#;
        let tokens = tokenize_expanding_holes(src).unwrap();
        let ident_at = tokens
            .iter()
            .position(|token| token.kind == K::Ident)
            .expect("the hole identifier is exposed");
        let hole_span = tokens[ident_at].span;
        // The span addresses `name` within the original source rather than
        // starting from a hole-local offset of 0.
        assert_eq!(&src[hole_span.range()], "name");
        assert_eq!(hole_span.start, src.find("name").unwrap());
    }

    #[test]
    fn escaped_open_paren_is_not_a_hole() {
        // `\\(` reads as a literal backslash and then `(`, so there is no
        // hole and the string is an ordinary `StrLit` on the logos path.
        assert_eq!(kinds(r#""a \\(b) c""#), [K::StrLit]);
    }

    #[test]
    fn unterminated_hole_is_an_error() {
        // The line ends before the hole's closing `)` appears.
        assert_lex_error(
            "\"value \\(x + 1\n\"",
            "bynk.lex.unterminated_interpolation",
        );
    }

    #[test]
    fn unterminated_interp_string_is_an_error() {
        // The hole is closed, but a newline arrives before the closing `"`.
        assert_lex_error("\"value \\(x) more\n", "bynk.lex.unterminated_string");
    }

    #[test]
    fn bad_escape_in_interp_string_is_an_error() {
        assert_lex_error(r#""a \q \(x)""#, "bynk.lex.bad_escape");
    }
}