// repotoire 0.7.0

//! Token N-gram model for code surprisal analysis
//!
//! Builds a statistical model of "how this project writes code" from token sequences.
//! Lines/functions with high surprisal (low probability under the model) are flagged
//! as unusual — possibly AI-generated, buggy, or inconsistent with project style.
//!
//! Based on: "On the Naturalness of Buggy Code" (Ray, Hellendoorn, et al., ICSE 2016)
//! Key insight: buggy lines have significantly higher entropy than correct code.

use std::collections::HashMap;

/// Default n-gram order used by [`NgramModel::new`]: 3, i.e. trigrams, chosen
/// to balance precision against sparsity.
const DEFAULT_ORDER: usize = 3;

/// Minimum number of trained tokens before the model reports itself as
/// confident. Below this threshold the scoring methods return zero instead of
/// a surprisal value.
const MIN_TOKENS_FOR_CONFIDENCE: usize = 5000;

/// A token n-gram language model that learns project coding patterns.
/// Uses simple smoothed n-gram counts — no ML, no external deps.
///
/// All count tables are keyed by strings; multi-token keys are the tokens
/// joined with a single space.
#[derive(Debug, Clone)]
pub struct NgramModel {
    /// N-gram order (3 = trigrams)
    order: usize,
    /// Counts: space-joined n-gram -> number of times it was seen in training
    counts: HashMap<String, u32>,
    /// Context counts: space-joined (n-1)-gram prefix -> number of n-grams
    /// seen in training that start with that prefix
    context_counts: HashMap<String, u32>,
    /// Unigram counts, used for backoff when a context was never seen
    unigram_counts: HashMap<String, u32>,
    /// Total tokens seen across all accepted training sequences
    total_tokens: usize,
    /// Vocabulary size (number of distinct tokens in `unigram_counts`)
    vocab_size: usize,
    /// Whether model has enough data to be useful
    /// (`total_tokens >= MIN_TOKENS_FOR_CONFIDENCE`)
    confident: bool,
}

impl NgramModel {
    /// Create an empty, untrained model of order [`DEFAULT_ORDER`].
    ///
    /// The model starts with no counts and `confident == false`; feed it data
    /// with [`Self::train_on_tokens`] before scoring.
    pub fn new() -> Self {
        Self {
            order: DEFAULT_ORDER,
            counts: HashMap::new(),
            context_counts: HashMap::new(),
            unigram_counts: HashMap::new(),
            total_tokens: 0,
            vocab_size: 0,
            confident: false,
        }
    }

    /// Fold one token sequence (typically a whole file) into the model's counts.
    ///
    /// Call once per file during calibration. Sequences shorter than the n-gram
    /// order are ignored entirely — they update neither the counts nor
    /// `total_tokens`. N-grams never span two calls, so windows do not cross
    /// file boundaries.
    pub fn train_on_tokens(&mut self, tokens: &[String]) {
        let n = self.order;
        if tokens.len() < n {
            return;
        }

        // Unigram tallies feed the backoff estimate and define the vocabulary.
        for tok in tokens.iter() {
            *self.unigram_counts.entry(tok.clone()).or_default() += 1;
        }

        // Each sliding window yields one n-gram plus its (n-1)-token context.
        for gram in tokens.windows(n) {
            let (prefix, _predicted) = gram.split_at(n - 1);
            *self.context_counts.entry(prefix.join(" ")).or_default() += 1;
            *self.counts.entry(gram.join(" ")).or_default() += 1;
        }

        // Refresh the derived summary fields.
        self.total_tokens += tokens.len();
        self.vocab_size = self.unigram_counts.len();
        self.confident = self.total_tokens >= MIN_TOKENS_FOR_CONFIDENCE;
    }

    /// Split one source line into abstract tokens.
    ///
    /// Literals and identifiers are normalized to placeholder tokens to reduce
    /// sparsity while keywords and punctuation keep the line's structure.
    /// Blank lines and lines whose first non-blank characters are `//` or `#`
    /// yield no tokens.
    pub fn tokenize_line(line: &str) -> Vec<String> {
        let code = line.trim();
        let is_comment = code.starts_with("//") || code.starts_with('#');
        if code.is_empty() || is_comment {
            return Vec::new();
        }

        let mut out = Vec::new();
        let mut stream = code.chars().peekable();
        // Each call consumes at least one character, so this always terminates.
        while let Some(next) = stream.peek().copied() {
            tokenize_next_char(&mut stream, next, &mut out);
        }
        out
    }

    /// Tokenize a whole source file into one flat token sequence, including
    /// the `<EOL>` line-boundary markers.
    ///
    /// In contrast to [`Self::tokenize_line`], tokenization here is **stateful
    /// across newlines**: multi-line strings (Rust `r#"..."#`, Python
    /// `"""..."""`/`'''...'''`) become a single token and block comments
    /// (`/* ... */`) are skipped, rather than having their contents tokenized
    /// as host-language code. This keeps an embedded DSL (say, a Python
    /// template inside a Rust file) from polluting the n-gram model and from
    /// making the enclosing function look "unusual".
    pub fn tokenize_file(content: &str) -> Vec<String> {
        let attributed = tokenize_with_line_attribution(content);
        let mut flat = Vec::with_capacity(attributed.len());
        // Drop the line index; callers of this method only need the tokens.
        for (token, _line) in attributed {
            flat.push(token);
        }
        flat
    }

    /// Calculate the surprisal (negative log probability) of a token sequence.
    /// Higher surprisal = more "surprising" / unusual code.
    /// Uses Kneser-Ney-style backoff with add-k smoothing.
    pub fn surprisal(&self, tokens: &[String]) -> f64 {
        if !self.confident || tokens.len() < self.order {
            return 0.0; // Not enough data to judge
        }

        let smoothing = 0.1;
        let vocab = self.vocab_size.max(1) as f64;
        let mut total_surprisal = 0.0;
        let mut count = 0;

        for window in tokens.windows(self.order) {
            let ngram = window.join(" ");
            let context = window[..self.order - 1].join(" ");

            let ngram_count = *self.counts.get(&ngram).unwrap_or(&0) as f64;
            let context_count = *self.context_counts.get(&context).unwrap_or(&0) as f64;

            // Smoothed probability with backoff
            let prob = if context_count > 0.0 {
                (ngram_count + smoothing) / (context_count + smoothing * vocab)
            } else {
                // Backoff to unigram
                let target = &window[self.order - 1];
                let uni_count = *self.unigram_counts.get(target).unwrap_or(&0) as f64;
                (uni_count + smoothing) / (self.total_tokens as f64 + smoothing * vocab)
            };

            total_surprisal += -prob.log2();
            count += 1;
        }

        if count > 0 {
            total_surprisal / count as f64 // Average bits per token
        } else {
            0.0
        }
    }

    /// Tokenize a single line with [`Self::tokenize_line`] and score it.
    ///
    /// Returns `0.0` when the line has fewer tokens than the n-gram order.
    pub fn line_surprisal(&self, line: &str) -> f64 {
        let tokens = Self::tokenize_line(line);
        if tokens.len() >= self.order {
            self.surprisal(&tokens)
        } else {
            0.0
        }
    }

    /// Score a function given as its source lines.
    ///
    /// Returns `(avg_surprisal, max_line_surprisal, peak_line)`:
    /// - `avg_surprisal` is the mean of the per-line averages over all lines
    ///   that received a positive score;
    /// - `max_line_surprisal` is the largest per-line average;
    /// - `peak_line` is the 0-based index into `lines` of that line.
    ///
    /// The lines are joined and tokenized statefully, so multi-line strings
    /// collapse to one token attributed to the line where they OPENED, and
    /// block comments are skipped. Returns `(0.0, 0.0, 0)` when the model is not
    /// confident, `lines` is empty, or there are fewer tokens than the order.
    pub fn function_surprisal(&self, lines: &[&str]) -> (f64, f64, usize) {
        const NOT_SCORED: (f64, f64, usize) = (0.0, 0.0, 0);
        if lines.is_empty() || !self.confident {
            return NOT_SCORED;
        }

        let n = self.order;
        let stream = tokenize_with_line_attribution(&lines.join("\n"));
        if stream.len() < n {
            return NOT_SCORED;
        }

        let k = 0.1;
        let vocab = self.vocab_size.max(1) as f64;
        let last_line = lines.len() - 1;

        // One (bits, windows) bucket per line. A window is charged to the line
        // of its FINAL token — the token being predicted from prior context.
        let mut buckets = vec![(0.0f64, 0usize); lines.len()];
        for gram in stream.windows(n) {
            let words: Vec<&str> = gram.iter().map(|(tok, _)| tok.as_str()).collect();
            let (predicted, predicted_line) = &gram[n - 1];

            let gram_seen = self.counts.get(&words.join(" ")).copied().unwrap_or(0) as f64;
            let context_seen = self
                .context_counts
                .get(&words[..n - 1].join(" "))
                .copied()
                .unwrap_or(0) as f64;

            let prob = if context_seen > 0.0 {
                (gram_seen + k) / (context_seen + k * vocab)
            } else {
                // Unknown context: back off to the unigram estimate.
                let token_seen = self.unigram_counts.get(predicted).copied().unwrap_or(0) as f64;
                (token_seen + k) / (self.total_tokens as f64 + k * vocab)
            };

            // Clamp defensively in case a token reports a line past the input.
            let bucket = &mut buckets[(*predicted_line).min(last_line)];
            bucket.0 += -prob.log2();
            bucket.1 += 1;
        }

        // Reduce buckets to per-line means, tracking the peak and the overall mean.
        let mut sum_of_means = 0.0;
        let mut lines_scored = 0usize;
        let mut peak = 0.0f64;
        let mut peak_line = 0usize;
        for (idx, &(bits, windows)) in buckets.iter().enumerate() {
            if windows == 0 {
                continue;
            }
            let mean = bits / windows as f64;
            if mean <= 0.0 {
                continue;
            }
            sum_of_means += mean;
            lines_scored += 1;
            if mean > peak {
                peak = mean;
                peak_line = idx;
            }
        }

        let average = if lines_scored > 0 {
            sum_of_means / lines_scored as f64
        } else {
            0.0
        };
        (average, peak, peak_line)
    }

    /// Baseline stats intended to be the mean and stddev of per-line surprisal
    /// across the training data.
    ///
    /// Currently a placeholder: it always returns `(0.0, 0.0)`.
    pub fn baseline_stats(&self) -> (f64, f64) {
        // We don't store per-line scores during training, so this needs to be computed
        // after training by scoring a sample. For now, return a fixed placeholder.
        // A well-fitted n-gram model on code typically has mean ~3-5 bits, stddev ~1-2 bits.
        // NOTE(review): "computed externally" is the original author's note; the
        // external computation is not visible in this file — confirm with callers.
        (0.0, 0.0) // Placeholder — computed externally
    }

    /// Whether the model has been trained on at least
    /// `MIN_TOKENS_FOR_CONFIDENCE` tokens. Scoring returns zero until it has.
    pub fn is_confident(&self) -> bool {
        self.confident
    }

    /// Total number of tokens accepted by [`Self::train_on_tokens`] so far.
    pub fn total_tokens(&self) -> usize {
        self.total_tokens
    }

    /// Number of distinct tokens seen during training.
    pub fn vocab_size(&self) -> usize {
        self.vocab_size
    }

    /// Summarize the model as a JSON value (stats only — the full count tables
    /// are too large to include).
    ///
    /// The object has the keys `order`, `total_tokens`, `vocab_size`,
    /// `ngram_count` (number of distinct n-grams) and `confident`.
    pub fn stats_json(&self) -> serde_json::Value {
        serde_json::json!({
            "order": self.order,
            "total_tokens": self.total_tokens,
            "vocab_size": self.vocab_size,
            "ngram_count": self.counts.len(),
            "confident": self.confident,
        })
    }
}

impl Default for NgramModel {
    fn default() -> Self {
        Self::new()
    }
}

/// Advance past a numeric literal: any run of ASCII letters and digits, `.`
/// and `_`. Letters cover hex prefixes (`0x`), exponents and type suffixes.
fn consume_number(chars: &mut std::iter::Peekable<std::str::Chars>) {
    let is_number_char = |c: &char| c.is_ascii_alphanumeric() || *c == '.' || *c == '_';
    // `next_if` only advances while the predicate holds, leaving the first
    // non-number character in the stream.
    while chars.next_if(is_number_char).is_some() {}
}

/// Consume the next lexical element starting at `ch` (the already-peeked next
/// character of `chars`) and append the resulting token, if any, to `tokens`.
///
/// Spaces and tabs are skipped without producing a token. Every branch
/// advances `chars` by at least one character.
fn tokenize_next_char(
    chars: &mut std::iter::Peekable<std::str::Chars>,
    ch: char,
    tokens: &mut Vec<String>,
) {
    // Whitespace carries no signal.
    if ch == ' ' || ch == '\t' {
        chars.next();
        return;
    }

    let token = if matches!(ch, '"' | '\'' | '`') {
        // String literal: swallow the opening quote and the body, keep a placeholder.
        chars.next();
        consume_string_literal(chars, ch);
        "<STR>".to_string()
    } else if ch.is_ascii_digit() {
        // Numeric literal: the value is irrelevant, only its presence matters.
        consume_number(chars);
        "<NUM>".to_string()
    } else if ch.is_ascii_alphabetic() || ch == '_' {
        // Keyword or identifier.
        classify_identifier(consume_identifier(chars))
    } else {
        // Operators and punctuation are kept verbatim (they carry structure).
        consume_operator(chars)
    };
    tokens.push(token);
}

/// Advance past the body of a string literal whose opening quote has already
/// been consumed, stopping after the closing `quote` or at end of input.
/// A backslash causes the following character to be skipped as well.
fn consume_string_literal(chars: &mut std::iter::Peekable<std::str::Chars>, quote: char) {
    loop {
        match chars.next() {
            // Unterminated literal: nothing left to consume.
            None => return,
            // Closing quote found.
            Some(c) if c == quote => return,
            // Escape sequence: drop the escaped character too.
            Some('\\') => {
                chars.next();
            }
            Some(_) => {}
        }
    }
}

/// Read the longest prefix made of ASCII letters, digits and underscores and
/// return it; the stream is left at the first character that does not belong.
fn consume_identifier(chars: &mut std::iter::Peekable<std::str::Chars>) -> String {
    let mut ident = String::new();
    while let Some(c) = chars.next_if(|c| c.is_ascii_alphanumeric() || *c == '_') {
        ident.push(c);
    }
    ident
}

/// Classify an identifier as keyword, constant, type, or generic identifier token.
///
/// - Keywords (per `is_keyword`) are returned unchanged.
/// - `<CONST>`: SCREAMING_SNAKE_CASE — at least one uppercase letter and nothing
///   but uppercase letters, ASCII digits and underscores (`MAX_SIZE`, `SHA256`).
/// - `<TYPE>`: any other word starting with an uppercase letter (`HashMap`).
/// - `<ID>`: everything else, including underscore-only words such as `_`.
fn classify_identifier(word: String) -> String {
    if is_keyword(&word) {
        return word;
    }

    // Require a real uppercase letter so that `_` / `__` are not constants, and
    // allow digits so that `SHA256` / `MAX_V2` are not mistaken for types.
    let has_upper = word.chars().any(|c| c.is_uppercase());
    let const_shaped = word
        .chars()
        .all(|c| c.is_uppercase() || c.is_ascii_digit() || c == '_');

    let label = if has_upper && const_shaped {
        "<CONST>"
    } else if word.starts_with(|c: char| c.is_uppercase()) {
        "<TYPE>"
    } else {
        "<ID>"
    };
    label.to_string()
}

/// Greedily consume one operator or punctuation token from the char stream.
///
/// Always consumes the first character (returning an empty string only when the
/// stream is already exhausted), then extends the token to a known two-character
/// operator and, if possible, to a known three-character operator. Characters
/// that do not extend the operator are left in the stream.
///
/// The tables include the full compound-assignment family (`+=`, `-=`, `*=`,
/// `/=`, `%=`, `&=`, `|=`, `^=`, `<<=`, `>>=`) and Rust's inclusive range `..=`.
fn consume_operator(chars: &mut std::iter::Peekable<std::str::Chars>) -> String {
    let mut op = String::new();
    let Some(first) = chars.next() else {
        return op;
    };
    op.push(first);

    // Try to grow to a two-character operator; undo the push if it is not one.
    let Some(&second) = chars.peek() else {
        return op;
    };
    op.push(second);
    if !matches!(
        op.as_str(),
        "==" | "!="
            | ">="
            | "<="
            | "&&"
            | "||"
            | "->"
            | "=>"
            | "::"
            | "+="
            | "-="
            | "*="
            | "/="
            | "%="
            | "&="
            | "|="
            | "^="
            | ".."
            | "<<"
            | ">>"
    ) {
        op.pop();
        return op;
    }
    chars.next();

    // Same again for the three-character operators.
    let Some(&third) = chars.peek() else {
        return op;
    };
    op.push(third);
    if matches!(
        op.as_str(),
        "===" | "!==" | "..." | "..=" | ">>>" | "<<=" | ">>="
    ) {
        chars.next();
    } else {
        op.pop();
    }
    op
}

/// Stateful tokenizer that walks an entire source string as a single stream and
/// emits `(token, line_index)` pairs. Tracks state across newlines so that
/// multi-line constructs collapse to single tokens:
///
/// - Block comments `/* ... */` (Rust supports nesting)
/// - Rust raw strings `r"..."`, `r#"..."#`, `r##"..."##`, ...
/// - Python triple-quoted strings `"""..."""` and `'''...'''`
///
/// Single-line strings (`"..."`, `'...'`, `` `...` ``) and line comments (`//`)
/// terminate at `\n` if not closed, preserving the legacy single-line semantics.
/// The one exception is a backslash-escaped newline inside a single-line
/// string: the string continues on the next physical line, and the line
/// counter is advanced so that later tokens keep their correct line index.
///
/// `<EOL>` markers are emitted at every newline encountered while in `Code`
/// state, matching the previous `tokenize_file` line-boundary behavior. They
/// are NOT emitted while inside a multi-line construct, since logically the
/// construct is one token spanning multiple physical lines.
///
/// Tokens are attributed to the line where they STARTED (0-based). For
/// multi-line strings/comments, this is the opening line.
fn tokenize_with_line_attribution(content: &str) -> Vec<(String, usize)> {
    let bytes = content.as_bytes();
    let mut tokens: Vec<(String, usize)> = Vec::new();
    let mut i = 0;
    let mut line = 0usize;
    let mut line_had_code_token = false;

    while i < bytes.len() {
        let b = bytes[i];

        // Newline: emit <EOL> if this line produced any code tokens, advance line.
        if b == b'\n' {
            if line_had_code_token {
                tokens.push(("<EOL>".to_string(), line));
            }
            line += 1;
            line_had_code_token = false;
            i += 1;
            continue;
        }

        // Whitespace inside a line.
        if b == b' ' || b == b'\t' || b == b'\r' {
            i += 1;
            continue;
        }

        // Line comment: skip until newline (the newline itself is handled above).
        if b == b'/' && i + 1 < bytes.len() && bytes[i + 1] == b'/' {
            while i < bytes.len() && bytes[i] != b'\n' {
                i += 1;
            }
            continue;
        }

        // Block comment with nesting.
        if b == b'/' && i + 1 < bytes.len() && bytes[i + 1] == b'*' {
            i += 2;
            let mut depth = 1usize;
            while i < bytes.len() && depth > 0 {
                if bytes[i] == b'\n' {
                    line += 1;
                    i += 1;
                } else if bytes[i] == b'/' && i + 1 < bytes.len() && bytes[i + 1] == b'*' {
                    depth += 1;
                    i += 2;
                } else if bytes[i] == b'*' && i + 1 < bytes.len() && bytes[i + 1] == b'/' {
                    depth -= 1;
                    i += 2;
                } else {
                    i += 1;
                }
            }
            // Block comments produce no token; comment-only lines stay <EOL>-free.
            continue;
        }

        // Rust raw string: r"...", r#"..."#, r##"..."##, ...
        // Only reached at a token start, so an `r` inside an identifier never
        // lands here. If no quote follows the hashes, fall through to the
        // identifier branch below.
        if b == b'r' && i + 1 < bytes.len() {
            let mut k = i + 1;
            let mut hashes = 0usize;
            while k < bytes.len() && bytes[k] == b'#' {
                hashes += 1;
                k += 1;
            }
            if k < bytes.len() && bytes[k] == b'"' {
                let start_line = line;
                i = k + 1;
                loop {
                    if i >= bytes.len() {
                        break;
                    }
                    if bytes[i] == b'\n' {
                        line += 1;
                        i += 1;
                        continue;
                    }
                    if bytes[i] == b'"' {
                        // A quote closes the string only when followed by the
                        // same number of hashes that opened it.
                        let mut closing = 0usize;
                        let mut j = i + 1;
                        while closing < hashes && j < bytes.len() && bytes[j] == b'#' {
                            closing += 1;
                            j += 1;
                        }
                        if closing == hashes {
                            i = j;
                            break;
                        }
                    }
                    i += 1;
                }
                tokens.push(("<STR>".to_string(), start_line));
                line_had_code_token = true;
                continue;
            }
        }

        // Triple-quoted strings (Python): """...""" and '''...'''
        if (b == b'"' || b == b'\'')
            && i + 2 < bytes.len()
            && bytes[i + 1] == b
            && bytes[i + 2] == b
        {
            let quote = b;
            let start_line = line;
            i += 3;
            loop {
                if i >= bytes.len() {
                    break;
                }
                if bytes[i] == b'\n' {
                    line += 1;
                    i += 1;
                    continue;
                }
                if bytes[i] == b'\\' && i + 1 < bytes.len() {
                    if bytes[i + 1] == b'\n' {
                        line += 1;
                    }
                    i += 2;
                    continue;
                }
                if bytes[i] == quote
                    && i + 2 < bytes.len()
                    && bytes[i + 1] == quote
                    && bytes[i + 2] == quote
                {
                    i += 3;
                    break;
                }
                i += 1;
            }
            tokens.push(("<STR>".to_string(), start_line));
            line_had_code_token = true;
            continue;
        }

        // Single-line strings: "...", '...', `...`. Terminate at unescaped quote
        // OR at newline (legacy behavior).
        if b == b'"' || b == b'\'' || b == b'`' {
            let quote = b;
            let start_line = line;
            i += 1;
            while i < bytes.len() && bytes[i] != quote && bytes[i] != b'\n' {
                if bytes[i] == b'\\' && i + 1 < bytes.len() {
                    // An escaped newline is a line continuation: the escape
                    // skips over the `\n`, so the physical line must be counted
                    // here or every later token is attributed one line too early.
                    if bytes[i + 1] == b'\n' {
                        line += 1;
                    }
                    i += 2;
                } else {
                    i += 1;
                }
            }
            if i < bytes.len() && bytes[i] == quote {
                i += 1;
            }
            tokens.push(("<STR>".to_string(), start_line));
            line_had_code_token = true;
            continue;
        }

        // Numbers.
        if b.is_ascii_digit() {
            let start_line = line;
            while i < bytes.len()
                && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'.' || bytes[i] == b'_')
            {
                i += 1;
            }
            tokens.push(("<NUM>".to_string(), start_line));
            line_had_code_token = true;
            continue;
        }

        // Identifiers / keywords.
        if b.is_ascii_alphabetic() || b == b'_' {
            let start_line = line;
            let start = i;
            while i < bytes.len() && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'_') {
                i += 1;
            }
            // The range holds only ASCII bytes, so the conversion cannot fail.
            let word = std::str::from_utf8(&bytes[start..i])
                .unwrap_or("")
                .to_string();
            tokens.push((classify_identifier(word), start_line));
            line_had_code_token = true;
            continue;
        }

        // Operators / punctuation: greedy multi-char match using existing helper.
        // `i` is on a char boundary here: every branch above stops on an ASCII
        // byte or at end of input, and this branch advances by whole characters.
        let start_line = line;
        let remaining = &content[i..];
        let mut chars = remaining.chars().peekable();
        let op = consume_operator(&mut chars);
        if op.is_empty() {
            // Defensive: advance one byte to avoid infinite loop on unexpected input.
            i += 1;
        } else {
            i += op.len();
            tokens.push((op, start_line));
            line_had_code_token = true;
        }
    }

    // Final <EOL> if last line had code without trailing newline.
    if line_had_code_token {
        tokens.push(("<EOL>".to_string(), line));
    }

    tokens
}

/// Report whether `word` is a language keyword or well-known builtin.
///
/// Such words are kept verbatim by the tokenizer because they carry structural
/// signal. The table is the deduplicated union across Rust, Python, JS/TS, Go,
/// Java, C#, Kotlin and C/C++; matching is case-sensitive.
fn is_keyword(word: &str) -> bool {
    match word {
        // Control flow (shared across languages)
        "if" | "else" | "elif" | "for" | "while" | "do" | "loop" | "break" | "continue"
        | "return" | "yield" | "switch" | "case" | "default" | "match" | "when" | "select"
        | "range" => true,

        // Error handling
        "try" | "catch" | "except" | "finally" | "throw" | "throws" | "raise" => true,

        // Declarations
        "fn" | "func" | "def" | "function" | "let" | "var" | "val" | "const" | "static"
        | "auto" | "type" | "typedef" => true,

        // OOP / types
        "class" | "struct" | "enum" | "trait" | "interface" | "impl" | "extends"
        | "implements" | "abstract" | "sealed" | "final" | "override" | "virtual"
        | "explicit" | "friend" | "operator" | "object" | "companion" | "data" => true,

        // Visibility
        "pub" | "private" | "protected" | "public" | "readonly" => true,

        // Modules / imports
        "use" | "mod" | "import" | "export" | "from" | "package" | "as" | "crate" | "super"
        | "namespace" | "include" => true,

        // Memory / ownership (Rust)
        "mut" | "ref" | "move" | "dyn" | "unsafe" | "extern" => true,

        // Async
        "async" | "await" | "defer" | "go" => true,

        // Literals / builtins
        "true" | "false" | "True" | "False" | "null" | "nil" | "None" | "undefined" | "NaN"
        | "Infinity" | "self" | "Self" | "this" | "new" | "delete" | "del" => true,

        // Rust specific types
        "Box" | "Vec" | "Option" | "Result" | "Some" | "Ok" | "Err" => true,

        // Logic operators (Python)
        "and" | "or" | "not" | "is" | "in" => true,

        // Python specific
        "lambda" | "pass" | "assert" | "global" | "nonlocal" | "with" => true,

        // JS/TS specific
        "typeof" | "instanceof" | "void" => true,

        // Go specific
        "chan" | "map" | "make" | "append" | "len" | "cap" => true,

        // Java/C# specific
        "synchronized" | "volatile" | "transient" | "native" => true,

        // C/C++ specific
        "register" | "sizeof" | "union" | "goto" | "inline" | "restrict" | "template"
        | "noexcept" | "constexpr" => true,

        // Preprocessor
        "define" | "ifdef" | "ifndef" | "endif" | "pragma" => true,

        // Misc
        "where" => true,

        _ => false,
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Turn a slice of literals into the owned token vector the model expects.
    fn toks(raw: &[&str]) -> Vec<String> {
        raw.iter().map(|t| (*t).to_owned()).collect()
    }

    /// Feed `pattern` to `model` `rounds` times.
    fn train_repeatedly(model: &mut NgramModel, pattern: &[&str], rounds: usize) {
        let owned = toks(pattern);
        for _ in 0..rounds {
            model.train_on_tokens(&owned);
        }
    }

    /// Number of entries in `tokens` equal to `needle`.
    fn occurrences(tokens: &[String], needle: &str) -> usize {
        tokens.iter().filter(|t| t.as_str() == needle).count()
    }

    #[test]
    fn test_tokenize_line() {
        let tokens = NgramModel::tokenize_line("let mut count = 0;");
        assert_eq!(tokens, toks(&["let", "mut", "<ID>", "=", "<NUM>", ";"]));
    }

    #[test]
    fn test_tokenize_string_literal() {
        let tokens = NgramModel::tokenize_line(r#"println!("hello world");"#);
        assert!(tokens.contains(&"<STR>".to_string()));
    }

    #[test]
    fn test_tokenize_type() {
        let tokens = NgramModel::tokenize_line("let x: HashMap<String, u32> = HashMap::new();");
        assert!(tokens.contains(&"<TYPE>".to_string()));
    }

    #[test]
    fn test_model_training() {
        let mut model = NgramModel::new();

        // Repetitive code: 800 rounds of 7 tokens clears the 5000-token
        // confidence threshold.
        train_repeatedly(
            &mut model,
            &["let", "mut", "<ID>", "=", "<NUM>", ";", "<EOL>"],
            800,
        );

        assert!(model.total_tokens() > 1000);
        assert!(model.is_confident());
    }

    #[test]
    fn test_surprisal_familiar_vs_unusual() {
        let mut model = NgramModel::new();

        // Train on a single statement shape.
        train_repeatedly(
            &mut model,
            &["let", "<ID>", "=", "<ID>", ".", "<ID>", "(", ")", ";", "<EOL>"],
            500,
        );

        // The trained shape should have LOW surprisal...
        let familiar = toks(&["let", "<ID>", "=", "<ID>", ".", "<ID>", "(", ")", ";"]);
        // ...and a shape the model has never seen should have HIGH surprisal.
        let unusual = toks(&["unsafe", "{", "<ID>", "::", "<ID>", "(", "&", "mut", "<ID>"]);

        let s_familiar = model.surprisal(&familiar);
        let s_unusual = model.surprisal(&unusual);

        assert!(
            s_unusual > s_familiar,
            "Unusual code ({:.2}) should be more surprising than familiar code ({:.2})",
            s_unusual,
            s_familiar
        );
    }

    #[test]
    fn test_not_confident_returns_zero() {
        let untrained = NgramModel::new();
        let tokens = toks(&["let", "<ID>", "="]);
        assert_eq!(untrained.surprisal(&tokens), 0.0);
    }

    /// Multi-line raw strings (`r#"..."#`) must collapse to a single `<STR>`
    /// token. Otherwise embedded DSLs (Python templates, SQL, GraphQL, JSX)
    /// leak host-language tokens into the n-gram stream and pollute both
    /// training and surprisal scoring.
    #[test]
    fn test_tokenize_file_collapses_rust_raw_string_across_newlines() {
        let source = "let template = r#\"\ndef foo(x):\n    return x + 1\n\"#;\n";
        let tokens = NgramModel::tokenize_file(source);

        // None of the embedded Python may surface as tokens.
        assert!(
            !tokens.iter().any(|t| t == "def" || t == "return"),
            "Multi-line raw string contents leaked as tokens: {:?}",
            tokens
        );

        // The whole raw string is exactly one <STR>.
        let str_count = occurrences(&tokens, "<STR>");
        assert_eq!(
            str_count, 1,
            "Expected one <STR> token for the raw string, got {} in {:?}",
            str_count, tokens
        );
    }

    /// Python triple-quoted strings span newlines and must collapse to one
    /// `<STR>`, same as Rust raw strings.
    #[test]
    fn test_tokenize_file_collapses_python_triple_string_across_newlines() {
        let source = "msg = \"\"\"\nhello\nworld\n\"\"\"\n";
        let tokens = NgramModel::tokenize_file(source);
        let str_count = occurrences(&tokens, "<STR>");
        assert_eq!(
            str_count, 1,
            "Expected one <STR> for the triple-quoted string, got {} in {:?}",
            str_count, tokens
        );
    }

    /// Block comments (`/* ... */`) must be skipped entirely, even when they
    /// span multiple lines. A line-by-line tokenizer would treat the inner
    /// lines as code (the leading `*` becoming a multiply operator).
    #[test]
    fn test_tokenize_file_skips_multiline_block_comment() {
        let source = "let x = 1;\n/* multi\n * line\n * comment */\nlet y = 2;\n";
        let tokens = NgramModel::tokenize_file(source);

        // Only the two `let` statements (plus their <EOL> markers) contribute.
        let lets = occurrences(&tokens, "let");
        assert_eq!(lets, 2, "Expected 2 `let` tokens, got {:?}", tokens);

        // No stray `*` operators from the comment body.
        let stars = occurrences(&tokens, "*");
        assert_eq!(
            stars, 0,
            "Block-comment `*` leaked as operator: {:?}",
            tokens
        );
    }

    /// `function_surprisal` must be stateful across the function's lines: a
    /// function whose body is a multi-line raw string should be scored against
    /// `<STR>` tokens, not against the embedded language's keywords.
    #[test]
    fn test_function_surprisal_treats_multiline_raw_string_as_one_token() {
        // A confident model that knows only a simple Rust function shape.
        let mut model = NgramModel::new();
        train_repeatedly(
            &mut model,
            &[
                "fn", "<ID>", "(", ")", "->", "<TYPE>", "{", "<EOL>", "<STR>", "<EOL>", "}",
                "<EOL>",
            ],
            1000,
        );
        assert!(model.is_confident());

        // The function body is a multi-line raw string holding Python code.
        let lines = vec![
            "fn template() -> String {",
            "    r#\"",
            "def handler(req):",
            "    if req.error:",
            "        return 500",
            "    return 200",
            "\"#",
            "}",
        ];
        let (avg, _max, _peak) = model.function_surprisal(&lines);

        // Every token is known to the model (`fn`, `<ID>`, `(`, `)`, `->`,
        // `<TYPE>`, `{`, `<STR>`, `}`), so the score should be LOW. If the
        // string were tokenized as code, Python tokens (`def`, `if`, `return`)
        // would dominate and push the surprisal up.
        assert!(
            avg < 5.0,
            "Function with raw-string Python body scored {:.2} bits — \
             stateful tokenizer should collapse the string to <STR>",
            avg
        );
    }
}