use text_size::{TextRange, TextSize};
use crate::SyntaxKind;
use crate::lexer::RawToken;
const TAB_SIZE: u32 = 4;
#[derive(Debug, Clone)]
struct LambdaCtx {
saved_indent_stack: Vec<u32>,
base: u32,
open_bracket_depth: u32,
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct IndentDiagnostic {
pub range: TextRange,
pub message: String,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum IndentChar {
Tab,
Space,
}
#[must_use]
pub fn run(tokens: &[RawToken], src: &str) -> (Vec<RawToken>, Vec<IndentDiagnostic>) {
let mut p = PrePass {
src,
out: Vec::with_capacity(tokens.len() + 16),
diags: Vec::new(),
indent_stack: vec![0],
bracket_depth: 0,
indent_char: None,
lambda_stack: Vec::new(),
};
p.run_lines(tokens);
(p.out, p.diags)
}
struct PrePass<'s> {
src: &'s str,
out: Vec<RawToken>,
diags: Vec<IndentDiagnostic>,
indent_stack: Vec<u32>,
bracket_depth: u32,
indent_char: Option<IndentChar>,
lambda_stack: Vec<LambdaCtx>,
}
impl PrePass<'_> {
fn run_lines(&mut self, tokens: &[RawToken]) {
let mut start = 0usize;
let mut i = 0usize;
while i < tokens.len() {
if tokens[i].kind == SyntaxKind::NewlinePhys {
self.line(&tokens[start..=i]);
start = i + 1;
}
i += 1;
}
if start < tokens.len() {
self.line(&tokens[start..]); }
self.finish(src_end(self.src));
}
fn line(&mut self, line: &[RawToken]) {
let Some(first) = line.iter().find(|t| !t.kind.is_trivia()) else {
self.copy_verbatim(line);
return;
};
let col = self.column(line);
let at = first.range.start();
if matches!(
first.kind,
SyntaxKind::RParen | SyntaxKind::RBrace | SyntaxKind::RBrack
) && self
.lambda_stack
.last()
.is_some_and(|ctx| ctx.open_bracket_depth >= self.bracket_depth)
{
self.close_lambdas_on_bracket(self.bracket_depth.saturating_sub(1), at);
}
self.close_lambdas(col, at);
let in_lambda = !self.lambda_stack.is_empty();
let suppressed = !in_lambda && self.bracket_depth > 0;
if !suppressed {
self.diagnose_indent(line);
self.emit_indent_dedent(col, at);
}
let mut has_terminator = false;
let mut last_meaningful: Option<SyntaxKind> = None;
for tok in line {
if tok.kind == SyntaxKind::NewlinePhys {
has_terminator = true;
let opens_lambda =
self.bracket_depth > 0 && last_meaningful == Some(SyntaxKind::Colon);
if self.bracket_depth == 0 || in_lambda || opens_lambda {
self.push_marker(SyntaxKind::Newline, tok.range.start());
}
self.out.push(*tok);
} else {
if matches!(
tok.kind,
SyntaxKind::RParen | SyntaxKind::RBrace | SyntaxKind::RBrack
) && !self.lambda_stack.is_empty()
{
let new_depth = self.bracket_depth.saturating_sub(1);
self.close_lambdas_on_bracket(new_depth, tok.range.start());
}
else if tok.kind == SyntaxKind::Comma
&& self
.lambda_stack
.last()
.is_some_and(|ctx| ctx.open_bracket_depth == self.bracket_depth)
{
self.close_lambdas_on_bracket(
self.bracket_depth.saturating_sub(1),
tok.range.start(),
);
}
self.out.push(*tok);
self.track_bracket(tok.kind);
if !tok.kind.is_trivia() {
last_meaningful = Some(tok.kind);
}
}
}
if !has_terminator && (self.bracket_depth == 0 || in_lambda) {
self.push_marker(SyntaxKind::Newline, src_end(self.src));
}
if self.bracket_depth > 0 && last_meaningful == Some(SyntaxKind::Colon) {
let saved = std::mem::replace(&mut self.indent_stack, vec![col]);
self.lambda_stack.push(LambdaCtx {
saved_indent_stack: saved,
base: col,
open_bracket_depth: self.bracket_depth,
});
}
}
fn close_lambdas(&mut self, col: u32, at: TextSize) {
while self.lambda_stack.last().is_some_and(|ctx| col <= ctx.base) {
let base = self.lambda_stack.last().expect("checked").base;
while *self.indent_stack.last().expect("lambda base present") > base {
self.indent_stack.pop();
self.push_marker(SyntaxKind::Dedent, at);
}
let ctx = self.lambda_stack.pop().expect("checked");
self.indent_stack = ctx.saved_indent_stack;
}
}
fn close_lambdas_on_bracket(&mut self, new_depth: u32, at: TextSize) {
while self
.lambda_stack
.last()
.is_some_and(|ctx| ctx.open_bracket_depth > new_depth)
{
let base = self.lambda_stack.last().expect("checked").base;
while *self.indent_stack.last().expect("lambda base present") > base {
self.indent_stack.pop();
self.push_marker(SyntaxKind::Dedent, at);
}
let ctx = self.lambda_stack.pop().expect("checked");
self.indent_stack = ctx.saved_indent_stack;
}
}
fn copy_verbatim(&mut self, line: &[RawToken]) {
for tok in line {
self.out.push(*tok);
if tok.kind != SyntaxKind::NewlinePhys {
self.track_bracket(tok.kind);
}
}
}
fn emit_indent_dedent(&mut self, col: u32, at: TextSize) {
let top = *self.indent_stack.last().expect("indent stack has a base 0");
if col > top {
self.indent_stack.push(col);
self.push_marker(SyntaxKind::Indent, at);
} else if col < top {
while *self.indent_stack.last().expect("base 0 guards the loop") > col {
self.indent_stack.pop();
self.push_marker(SyntaxKind::Dedent, at);
}
if *self.indent_stack.last().expect("non-empty") != col {
self.diags.push(IndentDiagnostic {
range: TextRange::empty(at),
message: "Unindent does not match any outer indentation level.".to_owned(),
});
self.indent_stack.push(col); }
}
}
fn column(&self, line: &[RawToken]) -> u32 {
let Some(ws) = line.first().filter(|t| t.kind == SyntaxKind::Whitespace) else {
return 0;
};
self.src[ws.range]
.bytes()
.fold(0u32, |col, b| col + if b == b'\t' { TAB_SIZE } else { 1 })
}
fn diagnose_indent(&mut self, line: &[RawToken]) {
let Some(ws) = line.first().filter(|t| t.kind == SyntaxKind::Whitespace) else {
return;
};
let text = &self.src[ws.range];
let mut saw_tab = false;
let mut saw_space = false;
for b in text.bytes() {
saw_tab |= b == b'\t';
saw_space |= b == b' ';
}
if saw_tab && saw_space {
self.diags.push(IndentDiagnostic {
range: ws.range,
message: "Mixed use of tabs and spaces for indentation.".to_owned(),
});
} else if let Some(first) = text.bytes().next() {
let this = if first == b'\t' {
IndentChar::Tab
} else {
IndentChar::Space
};
match self.indent_char {
None => self.indent_char = Some(this),
Some(file) if file != this => {
let (used, before) = match this {
IndentChar::Tab => ("tab", "space"),
IndentChar::Space => ("space", "tab"),
};
self.diags.push(IndentDiagnostic {
range: ws.range,
message: format!(
"Used {used} character for indentation instead of {before} as used before in the file."
),
});
}
Some(_) => {}
}
}
}
fn finish(&mut self, at: TextSize) {
self.close_lambdas(0, at); while *self.indent_stack.last().expect("base 0") > 0 {
self.indent_stack.pop();
self.push_marker(SyntaxKind::Dedent, at);
}
}
fn track_bracket(&mut self, kind: SyntaxKind) {
match kind {
SyntaxKind::LParen | SyntaxKind::LBrack | SyntaxKind::LBrace => {
self.bracket_depth += 1;
}
SyntaxKind::RParen | SyntaxKind::RBrack | SyntaxKind::RBrace => {
self.bracket_depth = self.bracket_depth.saturating_sub(1);
}
_ => {}
}
}
fn push_marker(&mut self, kind: SyntaxKind, at: TextSize) {
self.out.push(RawToken {
kind,
range: TextRange::empty(at),
});
}
}
fn src_end(src: &str) -> TextSize {
TextSize::of(src)
}
#[cfg(test)]
mod tests {
use super::*;
use crate::tokenize;
fn prepass(src: &str) -> Vec<RawToken> {
run(&tokenize(src), src).0
}
fn structure(src: &str) -> Vec<SyntaxKind> {
prepass(src)
.into_iter()
.filter(|t| !t.kind.is_trivia())
.map(|t| t.kind)
.collect()
}
fn diagnostics(src: &str) -> Vec<IndentDiagnostic> {
run(&tokenize(src), src).1
}
fn assert_lossless(src: &str) {
let rebuilt: String = prepass(src).iter().map(|t| &src[t.range]).collect();
assert_eq!(rebuilt, src, "prepass not lossless for {src:?}");
}
fn count(src: &str, kind: SyntaxKind) -> usize {
structure(src).into_iter().filter(|&k| k == kind).count()
}
#[test]
fn nested_func_if_drives_indent_dedent() {
use SyntaxKind as S;
let src = "func f():\n\tif x:\n\t\treturn\n";
assert_lossless(src);
assert_eq!(
structure(src),
vec![
S::FuncKw,
S::Ident,
S::LParen,
S::RParen,
S::Colon,
S::Newline,
S::Indent,
S::IfKw,
S::Ident,
S::Colon,
S::Newline,
S::Indent,
S::ReturnKw,
S::Newline,
S::Dedent,
S::Dedent,
]
);
}
#[test]
fn line_continuation_does_not_indent() {
let src = "a = 1 + \\\n 2\n";
assert_lossless(src);
assert_eq!(count(src, SyntaxKind::Indent), 0);
assert_eq!(count(src, SyntaxKind::Newline), 1); }
#[test]
fn multiline_brackets_suppress_indentation() {
let src = "var a = [\n\t1,\n\t2,\n]\n";
assert_lossless(src);
assert_eq!(count(src, SyntaxKind::Indent), 0);
assert_eq!(count(src, SyntaxKind::Dedent), 0);
assert_eq!(count(src, SyntaxKind::Newline), 1); }
#[test]
fn top_level_lambda_body_indents() {
use SyntaxKind as S;
let src = "var f = func():\n\tprint()\nx = 1\n";
assert_lossless(src);
assert_eq!(count(src, S::Indent), 1);
assert_eq!(count(src, S::Dedent), 1);
}
#[test]
fn blank_and_comment_only_lines_keep_state() {
let src = "func f():\n\tx = 1\n\n# top-level comment\n\ty = 2\n";
assert_lossless(src);
assert_eq!(count(src, SyntaxKind::Indent), 1);
assert_eq!(count(src, SyntaxKind::Dedent), 1);
}
#[test]
fn inline_block_has_no_indent() {
let src = "func f(): return 1\n";
assert_lossless(src);
assert_eq!(count(src, SyntaxKind::Indent), 0);
assert_eq!(count(src, SyntaxKind::Newline), 1);
}
#[test]
fn dedent_to_eof_without_trailing_newline() {
use SyntaxKind as S;
let src = "func f():\n\tpass";
assert_lossless(src);
let s = structure(src);
assert_eq!(s.last(), Some(&S::Dedent));
assert_eq!(count(src, S::Indent), 1);
assert_eq!(count(src, S::Dedent), 1);
assert!(s.contains(&S::Newline));
}
#[test]
fn empty_and_comment_only_files() {
assert_lossless("");
assert_eq!(structure(""), Vec::<SyntaxKind>::new());
assert_lossless("# just a comment\n");
assert_eq!(count("# just a comment\n", SyntaxKind::Indent), 0);
}
#[test]
fn mixed_tabs_and_spaces_diagnoses_but_recovers() {
let src = "func f():\n \tpass\n";
assert_lossless(src);
let diags = diagnostics(src);
assert!(
diags
.iter()
.any(|d| d.message.contains("Mixed use of tabs and spaces")),
"expected a mixed-indent diagnostic, got {diags:?}"
);
}
#[test]
fn inconsistent_indent_char_across_lines_is_flagged() {
let src = "func f():\n\ta = 1\nfunc g():\n b = 2\n";
let diags = diagnostics(src);
assert!(
diags.iter().any(|d| d.message.contains("instead of")),
"expected an inconsistent-indent diagnostic, got {diags:?}"
);
}
#[test]
fn match_block_nests() {
use SyntaxKind as S;
let src = "match x:\n\t1:\n\t\tpass\n";
assert_lossless(src);
assert_eq!(count(src, S::Indent), 2);
assert_eq!(count(src, S::Dedent), 2);
assert_eq!(structure(src)[0], S::MatchKw);
}
#[test]
fn multiline_lambda_inside_brackets_indents() {
use SyntaxKind as S;
let src = "arr.sort_custom(func(a, b):\n\treturn a < b\n)\n";
assert_lossless(src);
assert_eq!(count(src, S::Indent), 1, "lambda body should Indent once");
assert_eq!(count(src, S::Dedent), 1, "lambda body should Dedent once");
let s = structure(src);
let colon = s.iter().position(|&k| k == S::Colon).unwrap();
assert_eq!(s[colon + 1], S::Newline);
assert_eq!(s[colon + 2], S::Indent);
let rparen = s.iter().rposition(|&k| k == S::RParen).unwrap();
assert_eq!(s[rparen - 1], S::Dedent);
}
#[test]
fn lambda_inside_multiline_array() {
use SyntaxKind as S;
let src = "var a = [\n\tfunc():\n\t\tprint()\n]\n";
assert_lossless(src);
assert_eq!(count(src, S::Indent), 1);
assert_eq!(count(src, S::Dedent), 1);
}
#[test]
fn nested_lambdas_inside_brackets() {
use SyntaxKind as S;
let src = "outer(func():\n\tinner(func():\n\t\tbody\n\t)\n)\n";
assert_lossless(src);
assert_eq!(count(src, S::Indent), 2, "two nested lambda bodies");
assert_eq!(count(src, S::Dedent), 2);
}
#[test]
fn single_line_lambda_inside_brackets_has_no_indent() {
use SyntaxKind as S;
let src = "arr.map(func(x): x * 2)\n";
assert_lossless(src);
assert_eq!(count(src, S::Indent), 0);
assert_eq!(count(src, S::Dedent), 0);
assert_eq!(count(src, S::Newline), 1);
}
}