nixfmt_rs 0.1.0

//! Hand-written lexer for Nix
//!
//! Ports the comment normalization logic from nixfmt's Lexer.hs

use crate::types::{Token, Trivia};

mod comments;
mod numbers;
mod trivia;

#[cfg(test)]
mod tests;

/// Advance the `line`/`column` counters over `slice`, which has just been
/// consumed from the source.
///
/// * No `\n` in `slice`: `column` grows by the number of chars in `slice`.
/// * Otherwise: `line` grows by the number of `\n` bytes and `column` is
///   reset to the number of chars following the last `\n`.
///
/// Char counting takes an ASCII shortcut (byte length == char count) because
/// Nix source is overwhelmingly ASCII.
///
/// This is a free function rather than a `&mut self` method so that callers
/// can pass a borrow of `self.source` as `slice` while handing out mutable
/// borrows of the two counter fields.
#[inline]
fn bump_line_col(line: &mut usize, column: &mut usize, slice: &str) {
    // Number of chars in `s`, skipping UTF-8 decoding when `s` is pure ASCII.
    let width = |s: &str| if s.is_ascii() { s.len() } else { s.chars().count() };
    let bytes = slice.as_bytes();

    if let Some(last_nl) = memchr::memrchr(b'\n', bytes) {
        *line += memchr::memchr_iter(b'\n', bytes).count();
        // Column restarts after the final newline of the run.
        *column = width(&slice[last_nl + 1..]);
    } else {
        *column += width(slice);
    }
}

/// Intermediate trivia representation during parsing.
///
/// `Lexer::parse_trivia` pushes these into `Lexer::trivia_scratch`; the
/// scratch list is then handed to `trivia::convert_leading` /
/// `trivia::convert_trivia` to obtain the final `Trivia`.
#[derive(Debug, Clone)]
pub enum ParseTrivium {
    /// A run of consecutive end-of-line sequences; the payload is the count
    /// returned by `Lexer::parse_newlines`.
    Newlines(usize),
    /// Line comment with text and column position
    LineComment { text: String, col: usize },
    /// Block comment (`is_doc`, lines)
    BlockComment(bool, Vec<String>),
    /// Language annotation like /* lua */
    LanguageAnnotation(String),
}

/// Cursor-only snapshot of the lexer (no heap state).
///
/// Produced by `Lexer::mark` and consumed by `Lexer::reset`. It captures just
/// the three cursor fields, so it is `Copy` and cheap to store. The type
/// derives `Debug`, `PartialEq` and `Eq` so snapshots can be logged and
/// compared in assertions.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct LexerPos {
    /// Byte offset of the cursor into the source.
    byte_pos: usize,
    /// Line counter at the cursor (the lexer starts counting at 1).
    line: usize,
    /// Column counter at the cursor (chars since the last `\n`).
    column: usize,
}

/// Saved lexer state for backtracking.
///
/// Created by `Lexer::save_state` and consumed by `Lexer::restore_state`.
/// Unlike `LexerPos` it also carries a clone of the pending trivia buffer and
/// the `recent_*` counters. It does not include `trivia_start` or
/// `trivia_scratch`; those lexer fields are left as-is by a restore.
#[derive(Clone)]
pub struct LexerState {
    /// Byte offset of the cursor.
    byte_pos: usize,
    /// Line counter at the cursor.
    line: usize,
    /// Column counter at the cursor.
    column: usize,
    /// Leading trivia accumulated for the next token at save time.
    trivia_buffer: Trivia,
    /// Value of `Lexer::recent_newlines` at save time.
    recent_newlines: usize,
    /// Value of `Lexer::recent_hspace` at save time.
    recent_hspace: usize,
}

/// Byte-oriented lexer over a Nix source string.
///
/// Tracks a cursor (`byte_pos`, `line`, `column`) plus the trivia state that
/// `lexeme()` threads from one token to the next.
pub struct Lexer {
    /// Original source. The lexer scans it byte-wise for ASCII tokens and
    /// only decodes UTF-8 at the cursor when a multi-byte char is observed,
    /// avoiding the up-front `Vec<char>` materialisation.
    source: Box<str>,
    /// Byte offset of the cursor; always on a UTF-8 char boundary.
    byte_pos: usize,
    /// Line counter; starts at 1 and is bumped for every `\n` consumed.
    line: usize,
    /// Column counter; starts at 0, counts chars, and resets to 0 after `\n`.
    pub(crate) column: usize,
    /// Accumulated leading trivia for next token
    pub(crate) trivia_buffer: Trivia,
    /// Newline count recorded by the most recent trivia scan
    /// (`parse_trivia` / `fast_ws_trivia`); zeroed by `rewind_trivia`.
    pub(crate) recent_newlines: usize,
    /// Horizontal-space count recorded by the most recent trivia scan;
    /// zeroed by `rewind_trivia`.
    pub(crate) recent_hspace: usize,
    /// Position before last `parse_trivia()` call, for rewinding
    /// (`fast_ws_trivia` records it as well when it succeeds).
    /// Kept as a single `LexerPos` value so the cursor components
    /// (`byte_pos`, `line`, `column`) can never drift out of sync
    /// (previously independent `Option`s).
    trivia_start: Option<LexerPos>,
    /// Scratch buffer reused by `parse_trivia` so the per-token trivia list
    /// does not allocate on every call.
    trivia_scratch: Vec<ParseTrivium>,
}

impl Lexer {
    /// Create a lexer positioned at the start of `source`.
    ///
    /// The source is copied into an owned `Box<str>`. Counting starts at
    /// line 1, column 0, with no buffered trivia and no rewind point.
    pub(crate) fn new(source: &str) -> Self {
        let source: Box<str> = Box::from(source);
        Self {
            source,
            // Cursor.
            byte_pos: 0,
            line: 1,
            column: 0,
            // Trivia bookkeeping.
            recent_newlines: 0,
            recent_hspace: 0,
            trivia_buffer: Trivia::new(),
            trivia_scratch: Vec::new(),
            trivia_start: None,
        }
    }

    /// Capture everything `restore_state` needs to backtrack to this point:
    /// the cursor, a clone of the pending trivia buffer, and the `recent_*`
    /// counters.
    pub(crate) fn save_state(&self) -> LexerState {
        let pending_trivia = self.trivia_buffer.clone();
        LexerState {
            trivia_buffer: pending_trivia,
            recent_newlines: self.recent_newlines,
            recent_hspace: self.recent_hspace,
            byte_pos: self.byte_pos,
            line: self.line,
            column: self.column,
        }
    }

    /// Restore saved state
    pub(crate) fn restore_state(&mut self, state: LexerState) {
        self.byte_pos = state.byte_pos;
        self.line = state.line;
        self.column = state.column;
        self.trivia_buffer = state.trivia_buffer;
        self.recent_newlines = state.recent_newlines;
        self.recent_hspace = state.recent_hspace;
    }

    /// Parse a lexeme (token with trivia annotations)
    /// This is the main entry point for the parser
    pub(crate) fn lexeme(&mut self) -> crate::error::Result<crate::types::Ann<Token>> {
        let mut leading_trivia = std::mem::take(&mut self.trivia_buffer);

        let _ = self.skip_hspace();

        // Re-sync: when entering expression mode mid-source (after `${` in a
        // string), the lexer has not yet consumed the trivia before the first
        // body token. There is no preceding Nix token here, so treat all of it
        // as leading trivia rather than splitting off a discarded "trailing".
        if matches!(self.peek_byte(), Some(b'\n' | b'\r' | b'#' | b'/')) {
            self.parse_trivia();
            leading_trivia.extend(trivia::convert_leading(&self.trivia_scratch));
            let _ = self.skip_hspace();
        }

        let token_start = self.byte_pos;
        let start_line = self.line;

        // next_token() also skips hspace; redundant here but harmless.
        let token = self.next_token()?;

        let token_end = self.byte_pos;
        let end_line = self.line;
        let token_span =
            crate::types::Span::with_lines(token_start, token_end, start_line, end_line);

        // String/path delimiters: defer trivia so the parser sees raw source content.
        let skip_trivia = matches!(token, Token::TDoubleQuote | Token::TDoubleSingleQuote);

        let trailing_comment;
        if skip_trivia {
            trailing_comment = None;
            self.trivia_buffer = Trivia::new();
        } else if let Some(newlines) = self.fast_ws_trivia() {
            // Fast path hit: only whitespace between this token and the next.
            trailing_comment = None;
            self.trivia_buffer = if newlines > 1 {
                Trivia::one(crate::types::Trivium::EmptyLine())
            } else {
                Trivia::new()
            };
        } else {
            self.parse_trivia();
            let (tc, next) = trivia::convert_trivia(&self.trivia_scratch, self.column);
            trailing_comment = tc;
            self.trivia_buffer = next;
        }

        Ok(crate::types::Ann {
            pre_trivia: leading_trivia,
            span: token_span,
            value: token,
            trail_comment: trailing_comment,
        })
    }

    /// Prime the lexer at the start of a file: consume the trivia in front of
    /// the first token and buffer it as that token's leading trivia.
    pub(crate) fn start_parse(&mut self) {
        self.parse_trivia();
        let leading = trivia::convert_leading(&self.trivia_scratch);
        self.trivia_buffer = leading;
    }

    /// Consume trivia at the cursor and classify it as
    /// `(trailing, next_leading)`.
    ///
    /// Exists so the parser never has to reach into the scratch buffer
    /// itself. The column passed to `convert_trivia` is the cursor column
    /// *after* the trivia has been consumed.
    pub(crate) fn parse_and_convert_trivia(
        &mut self,
    ) -> (Option<crate::types::TrailingComment>, Trivia) {
        self.parse_trivia();
        let column_after = self.column;
        trivia::convert_trivia(&self.trivia_scratch, column_after)
    }

    /// Zero-length span located at the cursor's byte offset.
    pub(crate) const fn current_pos(&self) -> crate::types::Span {
        let offset = self.byte_pos;
        crate::types::Span::point(offset)
    }

    /// Parse next token (without trivia handling)
    /// Trivia should ONLY be managed by `lexeme()`, not by this function.
    /// This matches Haskell nixfmt's `rawSymbol` which parses tokens without trivia.
    ///
    /// Leading spaces/tabs are skipped first. At end of input `Token::Sof`
    /// is returned (it doubles as the EOF marker).
    ///
    /// # Errors
    /// Returns an `UnexpectedToken` error, located at the cursor, for a lone
    /// `&`, `|`, `'` or `$`, and for any byte that starts no token. Errors
    /// from `parse_env_path` are passed through unchanged.
    pub(super) fn next_token(&mut self) -> crate::error::Result<Token> {
        let _ = self.skip_hspace();

        let Some(b) = self.peek_byte() else {
            return Ok(Token::Sof); // Use SOF as EOF token
        };
        // All token-start characters are ASCII; non-ASCII falls through to the
        // error arm which decodes the full codepoint for the message.
        let ch = b as char;

        // Nix identifiers are ASCII-only: [a-zA-Z_][a-zA-Z0-9_'-]*. Must be
        // checked before the punctuation match below.
        if ch.is_ascii_alphabetic() || ch == '_' {
            return Ok(self.parse_ident_or_keyword());
        }

        match ch {
            // Single-character punctuation, and operators that may extend to
            // a second character (`==`, `++`, `->`, `//`, `!=`, `>=`).
            '{' => Ok(self.single(Token::TBraceOpen)),
            '}' => Ok(self.single(Token::TBraceClose)),
            '[' => Ok(self.single(Token::TBrackOpen)),
            ']' => Ok(self.single(Token::TBrackClose)),
            '(' => Ok(self.single(Token::TParenOpen)),
            ')' => Ok(self.single(Token::TParenClose)),
            '=' => Ok(self.try_two_char('=', Token::TEqual, Token::TAssign)),
            '@' => Ok(self.single(Token::TAt)),
            ':' => Ok(self.single(Token::TColon)),
            ',' => Ok(self.single(Token::TComma)),
            ';' => Ok(self.single(Token::TSemicolon)),
            '?' => Ok(self.single(Token::TQuestion)),
            '.' => Ok(self.parse_dot_token()),
            '+' => Ok(self.try_two_char('+', Token::TConcat, Token::TPlus)),
            '-' => Ok(self.try_two_char('>', Token::TImplies, Token::TMinus)),
            '*' => Ok(self.single(Token::TMul)),
            '/' => Ok(self.try_two_char('/', Token::TUpdate, Token::TDiv)),
            '!' => Ok(self.try_two_char('=', Token::TUnequal, Token::TNot)),
            // `<` immediately followed by an alphanumeric char is treated as
            // the start of an env path such as `<nixpkgs>`.
            // NOTE(review): a comparison written without spaces (`1<2`) also
            // takes this arm and surfaces `parse_env_path`'s error instead of
            // lexing `<` — confirm this is intended.
            '<' if self.peek_ahead(1).is_some_and(char::is_alphanumeric) => self.parse_env_path(),
            '<' => {
                // Consume `<`, then look for the two-char forms `<=` / `<|`.
                self.advance();
                Ok(match self.peek() {
                    Some('=') => self.single(Token::TLessEqual),
                    Some('|') => self.single(Token::TPipeBackward),
                    _ => Token::TLess,
                })
            }
            '>' => Ok(self.try_two_char('=', Token::TGreaterEqual, Token::TGreater)),
            '&' => {
                if self.at("&&") {
                    self.advance_by(2);
                    Ok(Token::TAnd)
                } else {
                    // Don't advance: keep the error span on the '&' itself.
                    self.err_unexpected(&["'&&'"], "'&'")
                }
            }
            '|' => {
                if self.at("||") {
                    self.advance_by(2);
                    Ok(Token::TOr)
                } else if self.at("|>") {
                    self.advance_by(2);
                    Ok(Token::TPipeForward)
                } else {
                    // As with '&', the cursor stays on the offending '|'.
                    self.err_unexpected(&["'||'", "'|>'"], "'|'")
                }
            }
            // String openers. `lexeme()` skips trailing-trivia consumption
            // after these two tokens.
            '"' => Ok(self.single(Token::TDoubleQuote)),
            '\'' => {
                if self.at("''") {
                    self.advance_by(2);
                    Ok(Token::TDoubleSingleQuote)
                } else {
                    self.err_unexpected(&["''"], "'")
                }
            }
            '$' => {
                if self.at("${") {
                    self.advance_by(2);
                    Ok(Token::TInterOpen)
                } else {
                    self.err_unexpected(&["'${'"], "'$'")
                }
            }
            '0'..='9' => Ok(self.parse_number()),
            '~' => Ok(self.single(Token::TTilde)),
            _ => {
                // `ch` was derived from a single byte; for the error message
                // decode the actual codepoint so multi-byte input is reported
                // correctly.
                // `peek()` cannot be `None` here: `peek_byte()` returned
                // `Some` above and the cursor has not moved since.
                let ch = self.peek().unwrap();
                self.err_unexpected(&[], &format!("'{ch}'"))
            }
        }
    }

    /// Lex an identifier and map it to a keyword token when it is one.
    ///
    /// Consumes the longest run matching `[a-zA-Z0-9_'-]*` (Nix identifiers
    /// are ASCII-only). The caller must already have checked that the first
    /// byte is a valid identifier start, so the run is non-empty.
    fn parse_ident_or_keyword(&mut self) -> Token {
        let start = self.byte_pos;
        let _ = self.take_ascii_while(|b| {
            matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'-' | b'\'')
        });
        let word = &self.source[start..self.byte_pos];
        let first = self.source.as_bytes()[start];

        // Dispatch on (length, first byte) so a non-keyword is rejected
        // without running a string comparison per keyword.
        match (word.len(), first) {
            (2, b'i') if word == "if" => Token::KIf,
            (2, b'i') if word == "in" => Token::KIn,
            (3, b'l') if word == "let" => Token::KLet,
            (3, b'r') if word == "rec" => Token::KRec,
            (4, b'e') if word == "else" => Token::KElse,
            (4, b't') if word == "then" => Token::KThen,
            (4, b'w') if word == "with" => Token::KWith,
            (6, b'a') if word == "assert" => Token::KAssert,
            (7, b'i') if word == "inherit" => Token::KInherit,
            _ => Token::Identifier(word.into()),
        }
    }

    /// Lex a token beginning with `.`: the ellipsis `...`, a float with a
    /// leading dot (`.5`, optionally with an exponent), or a plain `TDot`.
    fn parse_dot_token(&mut self) -> Token {
        if self.at("...") {
            self.advance_by(3);
            return Token::TEllipsis;
        }

        // Decide before consuming the dot whether a digit follows it.
        let digit_follows = matches!(self.peek_ahead(1), Some(c) if c.is_ascii_digit());
        self.advance(); // consume '.'
        if !digit_follows {
            return Token::TDot;
        }

        let mut literal = String::from(".");
        literal.push_str(&self.consume_digits());
        if let Some(exp) = self.parse_exponent() {
            literal.push_str(&exp);
        }
        Token::Float(literal.into())
    }

    /// Parse an angle-bracket search path such as `<nixpkgs>`.
    ///
    /// The cursor must be on the opening `<`. Characters are collected until
    /// the closing `>`, which is consumed; the text between the brackets
    /// becomes the payload of `Token::EnvPath`.
    ///
    /// Accepted path characters are alphanumerics plus `_`, `-`, `/`, `.`
    /// and `+`. The `+` is part of Nix's `PATH_CHAR` class, so a lookup such
    /// as `<nixpkgs/pkgs/gtk+>` must lex as a single path.
    ///
    /// # Errors
    /// * `InvalidSyntax`, located at the offending character, when a
    ///   character outside the accepted set appears before `>`.
    /// * `UnclosedDelimiter`, carrying the span of the opening `<`, when the
    ///   input ends before `>`.
    fn parse_env_path(&mut self) -> crate::error::Result<Token> {
        let opening_span = self.current_pos();
        self.advance(); // consume '<'

        let mut path = String::new();
        while let Some(ch) = self.peek() {
            match ch {
                '>' => {
                    self.advance();
                    return Ok(Token::EnvPath(path.into()));
                }
                _ if ch.is_alphanumeric() || matches!(ch, '_' | '-' | '/' | '.' | '+') => {
                    // `ch` is exactly the char `advance` steps over, so push
                    // it directly instead of unwrapping the return value.
                    self.advance();
                    path.push(ch);
                }
                _ => {
                    return Err(Box::new(crate::error::ParseError {
                        span: self.current_pos(),
                        kind: crate::error::ErrorKind::InvalidSyntax {
                            description: format!("invalid character '{ch}' in path"),
                            hint: Some("paths can only contain alphanumeric characters, '.', '_', '-', '+', and '/'".to_string()),
                        },
                    }))
                }
            }
        }

        // Ran out of input before seeing the closing '>'.
        Err(Box::new(crate::error::ParseError {
            span: self.current_pos(),
            kind: crate::error::ErrorKind::UnclosedDelimiter {
                delimiter: '<',
                opening_span,
            },
        }))
    }

    /// Build an `UnexpectedToken` error at the current cursor.
    #[cold]
    fn err_unexpected<T>(&self, expected: &[&str], found: &str) -> crate::error::Result<T> {
        Err(Box::new(crate::error::ParseError {
            span: self.current_pos(),
            kind: crate::error::ErrorKind::UnexpectedToken {
                expected: expected
                    .iter()
                    .map(std::string::ToString::to_string)
                    .collect(),
                found: found.to_string(),
            },
        }))
    }

    /// Lex an operator that is either one or two characters long.
    ///
    /// Consumes the current char unconditionally. If the following char
    /// equals `second` it is consumed too and `if_match` is returned;
    /// otherwise the cursor stays after the first char and `if_single` is
    /// returned.
    fn try_two_char(&mut self, second: char, if_match: Token, if_single: Token) -> Token {
        self.advance();
        if self.peek() != Some(second) {
            return if_single;
        }
        self.advance();
        if_match
    }

    /// Remaining input from the cursor to the end of the source.
    ///
    /// Skips the bounds and char-boundary checks of ordinary slicing; the
    /// invariant it relies on is verified in debug builds.
    #[inline]
    fn rest(&self) -> &str {
        // `is_char_boundary` is false for offsets past the end as well as for
        // offsets inside a multi-byte char, so this covers both requirements.
        debug_assert!(
            self.source.is_char_boundary(self.byte_pos),
            "lexer cursor is out of bounds or not on a UTF-8 char boundary"
        );
        // SAFETY: `byte_pos` is always `<= source.len()` and on a UTF-8 char
        // boundary. It starts at 0 and only moves by whole chars, by runs of
        // ASCII bytes, to offsets that were bounds/boundary-checked by a safe
        // slice of `source`, or back to a position saved earlier.
        unsafe { self.source.get_unchecked(self.byte_pos..) }
    }

    /// Byte under the cursor, or `None` at end of input. Does not consume.
    #[inline]
    pub(crate) fn peek_byte(&self) -> Option<u8> {
        let bytes = self.source.as_bytes();
        bytes.get(self.byte_pos).copied()
    }

    /// Char under the cursor, or `None` at end of input. Does not consume.
    ///
    /// ASCII bytes are returned directly; UTF-8 decoding only happens when
    /// the byte under the cursor is part of a multi-byte char.
    #[inline]
    pub(crate) fn peek(&self) -> Option<char> {
        match self.peek_byte()? {
            ascii @ 0x00..=0x7F => Some(char::from(ascii)),
            _ => self.rest().chars().next(),
        }
    }

    /// Char `n` positions past the cursor (`n == 0` is the current char), or
    /// `None` if the input ends first.
    #[inline]
    pub(crate) fn peek_ahead(&self, n: usize) -> Option<char> {
        // Callers use tiny offsets, so walking the chars is cheap enough.
        let mut upcoming = self.rest().chars();
        upcoming.nth(n)
    }

    /// `true` when the input at the cursor starts with `s`, compared
    /// byte-for-byte. Stands in for chains of
    /// `peek() == Some(a) && peek_ahead(1) == Some(b)`.
    #[inline]
    pub(crate) fn at(&self, s: &str) -> bool {
        let remaining = &self.source.as_bytes()[self.byte_pos..];
        remaining.starts_with(s.as_bytes())
    }

    /// Call `advance()` `n` times, i.e. step over up to `n` chars (fewer if
    /// the input ends first).
    #[inline]
    pub(crate) fn advance_by(&mut self, n: usize) {
        let mut remaining = n;
        while remaining > 0 {
            self.advance();
            remaining -= 1;
        }
    }

    /// Record the cursor (`byte_pos`, `line`, `column`) so it can later be
    /// restored with `reset()`. Trivia state is not captured.
    #[inline]
    pub(super) const fn mark(&self) -> LexerPos {
        let (byte_pos, line, column) = (self.byte_pos, self.line, self.column);
        LexerPos {
            byte_pos,
            line,
            column,
        }
    }

    /// Restore the cursor from a snapshot taken by `mark()`.
    #[inline]
    pub(super) const fn reset(&mut self, mark: LexerPos) {
        self.byte_pos = mark.byte_pos;
        self.line = mark.line;
        self.column = mark.column;
    }

    /// Speculatively run `f`, undoing its cursor movement if it yields `None`.
    ///
    /// Only `byte_pos`/`line`/`column` are rewound. `trivia_buffer` and the
    /// `recent_*` counters are NOT restored, so `f` must leave them alone.
    #[inline]
    pub(super) fn try_with_cursor<T>(
        &mut self,
        f: impl FnOnce(&mut Self) -> Option<T>,
    ) -> Option<T> {
        let saved = self.mark();
        match f(self) {
            hit @ Some(_) => hit,
            None => {
                self.reset(saved);
                None
            }
        }
    }

    /// Step over one char and hand back `tok`. Keeps the single-character
    /// arms of `next_token` down to one expression each.
    #[inline]
    fn single(&mut self, tok: Token) -> Token {
        let _ = self.advance();
        tok
    }

    /// Consume the char under the cursor and return it (`None` at end of
    /// input).
    ///
    /// A `\n` bumps `line` and resets `column` to 0; every other char,
    /// ASCII or multi-byte, bumps `column` by one.
    #[inline]
    pub(crate) fn advance(&mut self) -> Option<char> {
        let first = self.peek_byte()?;

        if first >= 0x80 {
            // Multi-byte char: decode it to learn its encoded length.
            let ch = self.rest().chars().next()?;
            self.byte_pos += ch.len_utf8();
            self.column += 1;
            return Some(ch);
        }

        self.byte_pos += 1;
        match first {
            b'\n' => {
                self.line += 1;
                self.column = 0;
            }
            _ => self.column += 1,
        }
        Some(char::from(first))
    }

    /// Consume and return everything up to (not including) the first
    /// occurrence of byte `a`, `b` or `c`, or up to end of input when none of
    /// them occurs. Returns `""` without moving if the cursor is already on
    /// one of the bytes.
    ///
    /// `line`/`column` are updated for any newlines inside the run. The
    /// search uses `memchr3`; this is the string-body scanning primitive.
    #[inline]
    pub(crate) fn scan_until3(&mut self, a: u8, b: u8, c: u8) -> &str {
        let start = self.byte_pos;
        let haystack = &self.source.as_bytes()[start..];
        let run = match memchr::memchr3(a, b, c, haystack) {
            Some(hit) => hit,
            None => haystack.len(),
        };
        if run == 0 {
            return "";
        }

        let end = start + run;
        self.byte_pos = end;
        let consumed = &self.source[start..end];
        bump_line_col(&mut self.line, &mut self.column, consumed);
        consumed
    }

    /// Jump the cursor forward to the absolute byte offset `target`.
    ///
    /// `target` must lie on a char boundary and must not be behind the
    /// cursor. `line`/`column` are brought up to date from the text that was
    /// jumped over. Intended for use after a `memchr` search.
    pub(super) fn seek_to(&mut self, target: usize) {
        debug_assert!(target >= self.byte_pos);
        // Install the new offset and keep the old one for slicing.
        let from = std::mem::replace(&mut self.byte_pos, target);
        let skipped = &self.source[from..target];
        bump_line_col(&mut self.line, &mut self.column, skipped);
    }

    /// Consume the next `len` bytes in one step and return them.
    ///
    /// The bytes must not contain `\n` (checked in debug builds), so only
    /// `column` changes: it grows by the number of *chars* in the run. Lets
    /// string/comment scanners skip the per-char `advance()` loop after a
    /// `memchr` hit.
    #[inline]
    pub(super) fn advance_bytes_no_newline(&mut self, len: usize) -> &str {
        let start = self.byte_pos;
        let end = start + len;
        debug_assert!(self.source.as_bytes()[start..end]
            .iter()
            .all(|&byte| byte != b'\n'));
        self.byte_pos = end;
        let run = &self.source[start..end];
        bump_line_col(&mut self.line, &mut self.column, run);
        run
    }

    /// Consume the longest prefix whose bytes all satisfy `pred` and return
    /// it as a borrow of `self.source`.
    ///
    /// `pred` is expected to accept ASCII bytes only and must never accept
    /// `b'\n'`: `column` is advanced by the byte count and `line` is left
    /// unchanged.
    #[inline]
    pub(super) fn take_ascii_while(&mut self, pred: impl Fn(u8) -> bool) -> &str {
        let start = self.byte_pos;
        let run = self
            .source
            .as_bytes()
            .iter()
            .skip(start)
            .take_while(|&&byte| pred(byte))
            .count();
        let end = start + run;
        self.byte_pos = end;
        self.column += run;
        &self.source[start..end]
    }

    /// `true` once the cursor has reached the end of the source.
    #[inline]
    fn is_eof(&self) -> bool {
        self.source.len() <= self.byte_pos
    }

    /// Consume spaces and tabs (never newlines) at the cursor and return how
    /// many bytes were skipped.
    #[inline]
    fn skip_hspace(&mut self) -> usize {
        let skipped = self.take_ascii_while(|byte| byte == b' ' || byte == b'\t');
        skipped.len()
    }

    /// Fast path for inter-token trivia that consists solely of spaces, tabs
    /// and `\n`.
    ///
    /// On success the whitespace is consumed, the cursor rests on the next
    /// token (or at end of input), and `Some(newlines)` is returned. The
    /// pre-trivia position is stored in `trivia_start`, and `recent_newlines`
    /// / `recent_hspace` are set to the newline count and to the number of
    /// spaces/tabs after the last newline.
    ///
    /// If a `#`, a `/*` or a `\r` is met, `None` is returned and *nothing* is
    /// consumed or modified, so the caller can fall back to `parse_trivia`.
    ///
    /// This covers the overwhelmingly common case and lets `lexeme` skip the
    /// scratch-vector bookkeeping and `convert_trivia`.
    #[inline]
    fn fast_ws_trivia(&mut self) -> Option<usize> {
        let src = self.source.as_bytes();
        let mut cursor = self.byte_pos;
        let mut newline_count = 0usize;
        // Spaces/tabs seen since the most recent newline (or since the start).
        let mut trailing_hspace = 0usize;

        while let Some(&byte) = src.get(cursor) {
            match byte {
                // Comment start (or the rare `\r`): leave it to the slow path.
                b'#' | b'\r' => return None,
                b'/' if src.get(cursor + 1) == Some(&b'*') => return None,
                b' ' | b'\t' => trailing_hspace += 1,
                b'\n' => {
                    newline_count += 1;
                    trailing_hspace = 0;
                }
                _ => break,
            }
            cursor += 1;
        }

        // Commit: remember where the trivia began, then move the cursor.
        self.trivia_start = Some(self.mark());
        if newline_count == 0 {
            self.column += trailing_hspace;
        } else {
            self.line += newline_count;
            self.column = trailing_hspace;
        }
        self.byte_pos = cursor;
        self.recent_newlines = newline_count;
        self.recent_hspace = trailing_hspace;
        Some(newline_count)
    }

    /// Consume all whitespace and comments at the cursor, recording each
    /// piece in `self.trivia_scratch` (which is cleared first).
    ///
    /// Also stores the starting position in `trivia_start` for
    /// `rewind_trivia`, and leaves `recent_hspace` / `recent_newlines`
    /// describing the last horizontal-space run and the last newline run
    /// that were seen.
    fn parse_trivia(&mut self) {
        // Remember where the trivia starts so it can be rewound later.
        self.trivia_start = Some(self.mark());

        self.trivia_scratch.clear();
        self.recent_newlines = 0;
        self.recent_hspace = 0;

        loop {
            self.recent_hspace = self.skip_hspace();

            let item = match self.peek_byte() {
                Some(b'\n' | b'\r') => {
                    let count = self.parse_newlines();
                    self.recent_newlines = count;
                    ParseTrivium::Newlines(count)
                }
                Some(b'#') => self.parse_line_comment(),
                Some(b'/') if self.at("/*") => {
                    // A failed annotation attempt restores the cursor itself,
                    // so the block-comment parser starts from the same spot.
                    match self.try_parse_language_annotation() {
                        Some(annotation) => annotation,
                        None => self.parse_block_comment(),
                    }
                }
                // End of input or the start of a real token.
                _ => break,
            };
            self.trivia_scratch.push(item);
        }
    }

    /// Consume a run of back-to-back end-of-line sequences and return how
    /// many there were (0 if the cursor is not on one).
    fn parse_newlines(&mut self) -> usize {
        let mut eols = 0;
        loop {
            if !self.eat_one_eol() {
                break eols;
            }
            eols += 1;
        }
    }

    /// Try to consume one end-of-line sequence: `\n`, `\r\n`, or a bare `\r`.
    /// Returns whether one was consumed.
    ///
    /// `\n` and `\r\n` bump `line` and reset `column` to 0. A bare `\r` only
    /// bumps `column`, preserving the historical behaviour of
    /// `parse_newlines`.
    #[inline]
    pub(super) fn eat_one_eol(&mut self) -> bool {
        let upcoming = self
            .source
            .as_bytes()
            .get(self.byte_pos..)
            .unwrap_or(&[]);

        match upcoming {
            [b'\n', ..] => {
                self.byte_pos += 1;
                self.line += 1;
                self.column = 0;
                true
            }
            [b'\r', b'\n', ..] => {
                self.byte_pos += 2;
                self.line += 1;
                self.column = 0;
                true
            }
            [b'\r', ..] => {
                self.byte_pos += 1;
                self.column += 1;
                true
            }
            _ => false,
        }
    }

    /// Undo the most recent trivia scan (horizontal space, newlines and
    /// comments) by moving the cursor back to `trivia_start`, if one has been
    /// recorded.
    ///
    /// The trivia buffer and the `recent_*` counters are cleared regardless,
    /// so the rewound trivia is not attached to the next token.
    pub(crate) fn rewind_trivia(&mut self) {
        self.trivia_buffer.clear();
        self.recent_newlines = 0;
        self.recent_hspace = 0;

        if let Some(saved) = self.trivia_start {
            self.reset(saved);
        }
    }
}