lessence 0.4.1 - Docs.rs

use ahash::AHasher;
use anyhow::Result;
use std::hash::{Hash, Hasher};

use crate::config::Config;
use crate::patterns::{
    LogLine, Token, duration::DurationDetector, email::EmailPatternDetector, hash::HashDetector,
    json::JsonDetector, kubernetes::KubernetesDetector, names::NameDetector,
    network::NetworkDetector, path::PathDetector, process::ProcessDetector,
    quoted::QuotedStringDetector, timestamp::TimestampDetector, uuid::UuidDetector,
};

pub struct Normalizer {
    config: Config,
    // Pattern detectors
    email_detector: EmailPatternDetector,
}

impl Normalizer {
    pub fn new(config: Config) -> Self {
        Self {
            config,
            email_detector: EmailPatternDetector::new().unwrap(),
        }
    }

    pub fn normalize_line(&self, original: String) -> Result<LogLine> {
        let mut normalized = original.clone();
        let mut tokens = Vec::with_capacity(8);

        // Apply normalizations in optimized order (most specific to least specific)
        // This prevents conflicts and maximizes pattern detection accuracy

        // 1. TIMESTAMPS (highest priority - most specific format)
        if self.config.normalize_timestamps {
            let (new_normalized, mut new_tokens) =
                TimestampDetector::detect_and_replace(&normalized);
            normalized = new_normalized;
            tokens.append(&mut new_tokens);
        }

        // 2. EMAIL ADDRESSES (before paths to ensure emails in URLs are handled correctly)
        if self.config.normalize_emails && normalized.contains('@') {
            let (new_normalized, mut new_tokens) =
                self.email_detector.detect_and_replace(&normalized);
            normalized = new_normalized;
            tokens.append(&mut new_tokens);
        }

        // 3. PATHS (URLs, file paths, CLI flags - must run early to preserve URL structure)
        // Must run BEFORE network patterns to handle URLs as complete units
        if self.config.normalize_paths {
            let (new_normalized, mut new_tokens) = PathDetector::detect_and_replace(&normalized);
            normalized = new_normalized;
            tokens.append(&mut new_tokens);
        }

        // 4. JSON (structured data, Event objects, K8s objects)
        if self.config.normalize_json && normalized.contains('{') {
            let (new_normalized, mut new_tokens) = JsonDetector::detect_and_replace(&normalized);
            normalized = new_normalized;
            tokens.append(&mut new_tokens);
        }

        // 5. UUIDs (MUST run BEFORE hashes to prevent UUID fragmentation!)
        // UUIDs contain hex segments that could be mistaken for hashes
        if self.config.normalize_uuids {
            let (new_normalized, mut new_tokens) = UuidDetector::detect_and_replace(&normalized);
            normalized = new_normalized;
            tokens.append(&mut new_tokens);
        }

        // 6. NETWORK PATTERNS (IP addresses, ports, FQDNs - very specific formats)
        // Must run AFTER paths to avoid breaking URLs
        if self.config.normalize_ips || self.config.normalize_ports || self.config.normalize_fqdns {
            let (new_normalized, mut new_tokens) = NetworkDetector::detect_and_replace(
                &normalized,
                self.config.normalize_ips,
                self.config.normalize_ports,
                self.config.normalize_fqdns,
            );
            normalized = new_normalized;
            tokens.append(&mut new_tokens);
        }

        // 7. HASHES (specific length and hex pattern)
        // Must run AFTER UUIDs to avoid detecting UUID segments as hashes
        if self.config.normalize_hashes {
            let (new_normalized, mut new_tokens) = HashDetector::detect_and_replace(&normalized);
            normalized = new_normalized;
            tokens.append(&mut new_tokens);
        }

        // 8. PROCESS IDs (specific patterns like [pid=123], (12345))
        if self.config.normalize_pids {
            let (new_normalized, mut new_tokens) = ProcessDetector::detect_and_replace(&normalized);
            normalized = new_normalized;
            tokens.append(&mut new_tokens);
        }

        // 9. KUBERNETES PATTERNS (namespaces, volumes, plugins, pod names)
        // PROTECTED DOMAIN: Must run before generic patterns to prevent pattern theft
        if self.config.normalize_kubernetes {
            let (new_normalized, mut new_tokens) =
                KubernetesDetector::detect_and_replace(&normalized);
            normalized = new_normalized;
            tokens.append(&mut new_tokens);
        }

        // NEW PATTERNS FROM 001-READ-THE-CURRENT (Now correctly placed AFTER Kubernetes)

        // HttpStatusClass - Groups HTTP status codes (200-299 → 2xx, etc.)
        if self.config.normalize_http_status {
            let (new_normalized, mut new_tokens) =
                crate::patterns::http_status::HttpStatusDetector::detect_and_replace(&normalized);
            normalized = new_normalized;
            tokens.append(&mut new_tokens);
        }

        // BracketContext - Detects [error] [mod_jk] style patterns
        if self.config.normalize_brackets && normalized.contains('[') {
            let (new_normalized, mut new_tokens) =
                crate::patterns::bracket_context::BracketContextDetector::detect_and_replace(
                    &normalized,
                );
            normalized = new_normalized;
            tokens.append(&mut new_tokens);
        }

        // KeyValuePair - Detects config=value, metrics patterns
        if self.config.normalize_key_value && normalized.contains('=') {
            let (new_normalized, mut new_tokens) =
                crate::patterns::key_value::KeyValueDetector::detect_and_replace(&normalized);
            normalized = new_normalized;
            tokens.append(&mut new_tokens);
        }

        // LogWithModule - Detects [level] module patterns for Apache/nginx
        if normalized.contains('[') {
            let (new_normalized, mut new_tokens) =
                crate::patterns::log_module::LogWithModuleDetector::detect_and_replace(&normalized);
            normalized = new_normalized;
            tokens.append(&mut new_tokens);
        }

        // StructuredMessage - Detects JSON/logfmt structured logging
        if normalized.contains('{') || normalized.contains('=') {
            let (new_normalized, mut new_tokens) =
                crate::patterns::structured::StructuredMessageDetector::detect_and_replace(
                    &normalized,
                );
            normalized = new_normalized;
            tokens.append(&mut new_tokens);
        }

        // 10. DURATIONS & MEASUREMENTS (broad category including decimals, sizes, percentages, HTTP codes)
        // Runs LATE to avoid conflicts with more specific patterns above
        if self.config.normalize_durations {
            let (new_normalized, mut new_tokens) =
                DurationDetector::detect_and_replace(&normalized);
            normalized = new_normalized;
            tokens.append(&mut new_tokens);
        }

        // 11. NAMES (generic hyphenated component names with variable suffixes)
        // Runs after specific patterns to catch remaining variable names
        if self.config.normalize_names {
            let (new_normalized, mut new_tokens) = NameDetector::detect_and_replace(&normalized);
            normalized = new_normalized;
            tokens.append(&mut new_tokens);
        }

        // 12. QUOTED STRINGS (generic quoted variables - high priority for mount operations)
        // Must run after paths to catch normalized quoted paths properly
        if self.config.normalize_quoted && (normalized.contains('"') || normalized.contains('\'')) {
            let (new_normalized, mut new_tokens) =
                QuotedStringDetector::detect_and_replace(&normalized);
            normalized = new_normalized;
            tokens.append(&mut new_tokens);
        }

        // Generate hash for fast comparison
        let hash = self.calculate_hash(&normalized);

        Ok(LogLine {
            original,
            normalized,
            tokens,
            hash,
        })
    }

    fn calculate_hash(&self, normalized: &str) -> u64 {
        let mut hasher = AHasher::default();
        normalized.hash(&mut hasher);
        hasher.finish()
    }

    #[allow(clippy::cast_precision_loss)] // usize lengths → f64 for ratio calc
    pub fn similarity_score(&self, line1: &LogLine, line2: &LogLine) -> f64 {
        let s1 = &line1.normalized;
        let s2 = &line2.normalized;

        if s1 == s2 {
            return 100.0;
        }

        // Ultra-fast similarity: check length difference first
        let len1 = s1.len();
        let len2 = s2.len();
        let max_len = len1.max(len2);
        let min_len = len1.min(len2);

        if max_len == 0 {
            return 100.0;
        }

        // If length difference is too large, reject quickly
        let length_ratio = min_len as f64 / max_len as f64;
        if length_ratio < 0.7 {
            return length_ratio * 100.0;
        }

        // Fast byte-level overlap check (no allocation — works on &[u8] directly)
        let b1 = s1.as_bytes();
        let b2 = s2.as_bytes();
        let compare_len = min_len;
        let mut matches: u32 = 0;

        for i in 0..compare_len {
            if b1[i] == b2[i] {
                matches += 1;
            }
        }

        (f64::from(matches) / max_len as f64) * 100.0
    }

    pub fn are_similar(&self, line1: &LogLine, line2: &LogLine) -> bool {
        // Quick hash comparison first
        if line1.hash == line2.hash {
            return true;
        }

        // If hashes don't match, check similarity score
        let score = self.similarity_score(line1, line2);
        score >= f64::from(self.config.threshold)
    }

    pub fn format_collapsed_line(&self, first: &LogLine, last: &LogLine, count: usize) -> String {
        if self.config.compact {
            // Compact format: [+N similar, varying: TYPE]
            let variation_types = self.summarize_variation_types(&first.tokens, &last.tokens);
            if variation_types.is_empty() {
                format!("[+{count} similar]")
            } else {
                format!(
                    "[+{} similar, varying: {}]",
                    count,
                    variation_types.join(", ")
                )
            }
        } else {
            format!(
                "[...collapsed {} similar lines from {} to {}...]",
                count,
                self.format_timestamp(first),
                self.format_timestamp(last)
            )
        }
    }

    fn format_timestamp(&self, log_line: &LogLine) -> String {
        // Extract first timestamp string from original line using simple regex
        for token in &log_line.tokens {
            if let Token::Timestamp(ts_str) = token {
                // Extract just the time part for display (HH:MM:SS.mmm or HH:MM:SS,mmm)
                if let Some(time_part) = Self::extract_time_part(ts_str) {
                    return time_part;
                }
            }
        }
        "unknown".to_string()
    }

    fn extract_time_part(timestamp: &str) -> Option<String> {
        // Return the full timestamp string as-is from the original log
        // This preserves user's format and shows meaningful ranges
        Some(timestamp.to_string())
    }

    fn summarize_variation_types(
        &self,
        first_tokens: &[Token],
        last_tokens: &[Token],
    ) -> Vec<String> {
        let mut types = std::collections::HashSet::new();

        // Helper function to get token type name and value
        let get_token_info = |token: &Token| -> (&str, String) {
            match token {
                Token::Timestamp(v) => ("timestamp", v.clone()),
                Token::IPv4(v) => ("IP", v.clone()),
                Token::IPv6(v) => ("IP", v.clone()),
                Token::Port(v) => ("port", v.to_string()),
                Token::Hash(_, v) => ("hash", v.clone()),
                Token::Uuid(v) => ("UUID", v.clone()),
                Token::Pid(v) => ("PID", v.to_string()),
                Token::ThreadID(v) => ("thread", v.clone()),
                Token::Path(v) => ("path", v.clone()),
                Token::Json(v) => ("json", v.clone()),
                Token::Duration(v) => ("duration", v.clone()),
                Token::Size(v) => ("size", v.clone()),
                Token::Number(v) => ("number", v.clone()),
                Token::HttpStatus(v) => ("http_status", v.to_string()),
                Token::QuotedString(v) => ("quoted_string", v.clone()),
                Token::Name(v) => ("name", v.clone()),
                Token::KubernetesNamespace(v) => ("namespace", v.clone()),
                Token::VolumeName(v) => ("volume", v.clone()),
                Token::PluginType(v) => ("plugin", v.clone()),
                Token::PodName(v) => ("pod", v.clone()),
                Token::HttpStatusClass(v) => ("http_status_class", v.clone()),
                Token::BracketContext(v) => ("bracket_context", v.join(",")),
                Token::KeyValuePair { key, value_type } => {
                    ("key_value_pair", format!("{key}={value_type}"))
                }
                Token::Email(v) => ("email", v.clone()),
                Token::LogWithModule { .. } => ("log_with_module", String::new()),
                Token::StructuredMessage { .. } => ("structured_message", String::new()),
            }
        };

        // Create maps of token types to values for first and last
        let mut first_values: std::collections::HashMap<&str, Vec<String>> =
            std::collections::HashMap::new();
        let mut last_values: std::collections::HashMap<&str, Vec<String>> =
            std::collections::HashMap::new();

        for token in first_tokens {
            let (token_type, value) = get_token_info(token);
            first_values.entry(token_type).or_default().push(value);
        }

        for token in last_tokens {
            let (token_type, value) = get_token_info(token);
            last_values.entry(token_type).or_default().push(value);
        }

        // Find token types that actually vary between first and last
        let all_types: std::collections::HashSet<&str> = first_values
            .keys()
            .chain(last_values.keys())
            .copied()
            .collect();

        for token_type in all_types {
            // In essence mode, ignore timestamp variations as they're tokenized for temporal independence
            if self.config.essence_mode && token_type == "timestamp" {
                continue;
            }

            let first_vals = first_values.get(token_type).cloned().unwrap_or_default();
            let last_vals = last_values.get(token_type).cloned().unwrap_or_default();

            // If the sets of values differ, this token type varies
            if first_vals != last_vals {
                types.insert(token_type.to_string());
            }
        }

        let mut result: Vec<String> = types.into_iter().collect();
        result.sort();
        result
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_timestamp_normalization() {
        let config = Config::default();
        let normalizer = Normalizer::new(config);

        let line = normalizer
            .normalize_line("2025-01-20 10:15:30 Error occurred".to_string())
            .unwrap();

        assert_eq!(line.normalized, "<TIMESTAMP> Error occurred");
        assert_eq!(line.tokens.len(), 1);
        assert!(matches!(line.tokens[0], Token::Timestamp(_)));
    }

    #[test]
    fn test_ip_port_normalization() {
        let config = Config::default();
        let normalizer = Normalizer::new(config);

        let line = normalizer
            .normalize_line("Connection to 192.168.1.100:8080 failed".to_string())
            .unwrap();

        assert_eq!(line.normalized, "Connection to <IP>:<PORT> failed");
        assert_eq!(line.tokens.len(), 2);
    }

    #[test]
    fn test_similarity_calculation() {
        let config = Config::default();
        let normalizer = Normalizer::new(config);

        let line1 = normalizer
            .normalize_line(
                "2025-01-20 10:15:01 [pid=12345] Connection failed to 192.168.1.100:8080"
                    .to_string(),
            )
            .unwrap();

        let line2 = normalizer
            .normalize_line(
                "2025-01-20 10:15:02 [pid=12346] Connection failed to 192.168.1.101:8081"
                    .to_string(),
            )
            .unwrap();

        assert!(normalizer.are_similar(&line1, &line2));
        let score = normalizer.similarity_score(&line1, &line2);
        assert!(score >= 85.0);
    }

    #[test]
    fn test_hash_consistency() {
        let config = Config::default();
        let normalizer = Normalizer::new(config);

        let line1 = normalizer
            .normalize_line("<TIMESTAMP> [pid=<PID>] Connection failed to <IP>:<PORT>".to_string())
            .unwrap();

        let line2 = normalizer
            .normalize_line("<TIMESTAMP> [pid=<PID>] Connection failed to <IP>:<PORT>".to_string())
            .unwrap();

        assert_eq!(line1.hash, line2.hash);
    }

    #[test]
    fn test_disabled_normalization() {
        let config = Config {
            normalize_timestamps: false,
            normalize_ips: false,
            normalize_ports: false,
            ..Config::default()
        };

        let normalizer = Normalizer::new(config);

        let line = normalizer
            .normalize_line("2025-01-20 10:15:30 Connection to 192.168.1.100 failed".to_string())
            .unwrap();

        // Even with timestamps/IPs/ports disabled, other always-on patterns
        // (durations, names, etc.) still normalize numbers and decimals
        assert_eq!(
            line.normalized,
            "<NUMBER>-01-20 10:15:30 Connection to <DECIMAL>.<DECIMAL> failed"
        );
    }

    #[test]
    fn test_timestamp_format_preservation() {
        let config = Config::default();
        let normalizer = Normalizer::new(config);

        // Test PostgreSQL comma format
        let line1 = normalizer
            .normalize_line("2025-09-18 13:26:30,188 INFO: test message".to_string())
            .unwrap();

        let formatted = normalizer.format_timestamp(&line1);
        assert_eq!(formatted, "2025-09-18 13:26:30,188");

        // Test PostgreSQL UTC format
        let line2 = normalizer
            .normalize_line("2025-09-18 13:26:53.345 UTC [24] LOG test".to_string())
            .unwrap();

        let formatted2 = normalizer.format_timestamp(&line2);
        assert_eq!(formatted2, "2025-09-18 13:26:53.345 UTC");

        // Test ISO 8601 format
        let line3 = normalizer
            .normalize_line("2025-01-20T10:15:30.123Z INFO test".to_string())
            .unwrap();

        let formatted3 = normalizer.format_timestamp(&line3);
        assert_eq!(formatted3, "2025-01-20T10:15:30.123Z");
    }

    #[test]
    fn test_invalid_timestamp_handling() {
        let config = Config::default();
        let normalizer = Normalizer::new(config);

        // Test invalid date that would crash parsing
        let line = normalizer
            .normalize_line("2025-02-31 25:99:99,999 ERROR: invalid timestamp".to_string())
            .unwrap();

        // Should not crash and should preserve the invalid timestamp
        let formatted = normalizer.format_timestamp(&line);
        assert_eq!(formatted, "2025-02-31 25:99:99,999");
    }

    #[test]
    fn test_no_timestamp_handling() {
        let config = Config::default();
        let normalizer = Normalizer::new(config);

        // Test line with no timestamp
        let line = normalizer
            .normalize_line("Just a log message with no timestamp".to_string())
            .unwrap();

        let formatted = normalizer.format_timestamp(&line);
        assert_eq!(formatted, "unknown");
    }

    #[test]
    fn test_port_detection_vs_timestamps() {
        let config = Config::default();
        let normalizer = Normalizer::new(config);

        // Test that timestamps are NOT detected as ports
        let line1 = normalizer
            .normalize_line("2025-01-20 10:15:30 Connection failed".to_string())
            .unwrap();

        // Should normalize timestamp but NOT detect ports in the time
        assert_eq!(line1.normalized, "<TIMESTAMP> Connection failed");
        assert!(
            line1
                .tokens
                .iter()
                .any(|t| matches!(t, Token::Timestamp(_)))
        );
        assert!(!line1.tokens.iter().any(|t| matches!(t, Token::Port(_))));

        // Test that actual ports ARE detected
        let line2 = normalizer
            .normalize_line("Connection to localhost:8080 failed".to_string())
            .unwrap();

        assert_eq!(line2.normalized, "Connection to localhost:<PORT> failed");
        assert!(line2.tokens.iter().any(|t| matches!(t, Token::Port(8080))));

        // Test that IP:port combinations work
        let line3 = normalizer
            .normalize_line("Connection to 192.168.1.1:3000 failed".to_string())
            .unwrap();

        assert_eq!(line3.normalized, "Connection to <IP>:<PORT> failed");
        assert!(line3.tokens.iter().any(|t| matches!(t, Token::IPv4(_))));
        assert!(line3.tokens.iter().any(|t| matches!(t, Token::Port(3000))));

        // Test that IPv6:port combinations work
        let line4 = normalizer
            .normalize_line("Connection to [2001:db8::1]:8080 failed".to_string())
            .unwrap();

        assert_eq!(line4.normalized, "Connection to [<IP>]:<PORT> failed");
        assert!(line4.tokens.iter().any(|t| matches!(t, Token::IPv6(_))));
        assert!(line4.tokens.iter().any(|t| matches!(t, Token::Port(8080))));
    }

    // --- similarity_score direct tests (mutant kills) ---

    #[test]
    fn test_similarity_score_identical() {
        let normalizer = Normalizer::new(Config::default());
        let line = normalizer
            .normalize_line("hello world".to_string())
            .unwrap();
        let score = normalizer.similarity_score(&line, &line);
        assert!((score - 100.0).abs() < f64::EPSILON);
    }

    #[test]
    fn test_similarity_score_completely_different() {
        let normalizer = Normalizer::new(Config::default());
        let a = normalizer.normalize_line("aaaa".to_string()).unwrap();
        let b = normalizer.normalize_line("zzzz".to_string()).unwrap();
        let score = normalizer.similarity_score(&a, &b);
        assert!(
            score < 1.0,
            "Completely different strings should score near 0, got {score}"
        );
    }

    #[test]
    fn test_similarity_score_partial_match() {
        let normalizer = Normalizer::new(Config::default());
        let a = normalizer.normalize_line("hello".to_string()).unwrap();
        let b = normalizer.normalize_line("hella".to_string()).unwrap();
        let score = normalizer.similarity_score(&a, &b);
        // 4/5 chars match = 80.0
        assert!(
            (score - 80.0).abs() < f64::EPSILON,
            "Expected 80.0, got {score}"
        );
    }

    #[test]
    fn test_similarity_score_length_ratio_rejection() {
        let normalizer = Normalizer::new(Config::default());
        let short = normalizer.normalize_line("ab".to_string()).unwrap();
        let long = normalizer.normalize_line("abcdefghij".to_string()).unwrap();
        let score = normalizer.similarity_score(&short, &long);
        // ratio = 2/10 = 0.2, below 0.7 threshold → returns 0.2 * 100 = 20.0
        assert!(
            (score - 20.0).abs() < f64::EPSILON,
            "Expected 20.0 (ratio rejection), got {score}"
        );
    }

    #[test]
    fn test_similarity_score_empty_strings() {
        let normalizer = Normalizer::new(Config::default());
        let empty = LogLine {
            original: String::new(),
            normalized: String::new(),
            tokens: vec![],
            hash: 0,
        };
        let score = normalizer.similarity_score(&empty, &empty);
        assert!(
            (score - 100.0).abs() < f64::EPSILON,
            "Empty vs empty should be 100.0"
        );
    }

    #[test]
    fn test_similarity_score_at_length_ratio_boundary() {
        let normalizer = Normalizer::new(Config::default());
        let ten_chars = normalizer.normalize_line("abcdefghij".to_string()).unwrap();

        // 7/10 = 0.7, exactly at threshold → NOT rejected → char comparison: 7/10 = 70.0
        let seven_match = normalizer.normalize_line("abcdefg".to_string()).unwrap();
        let score = normalizer.similarity_score(&seven_match, &ten_chars);
        assert!(
            (score - 70.0).abs() < f64::EPSILON,
            "At boundary (0.7), should use char comparison. Got {score}"
        );

        // 6/10 = 0.6, below threshold → rejected early → returns 0.6*100 = 60.0
        let six_match = normalizer.normalize_line("abcdef".to_string()).unwrap();
        let score_below = normalizer.similarity_score(&six_match, &ten_chars);
        assert!(
            (score_below - 60.0).abs() < f64::EPSILON,
            "Below boundary, should return ratio*100=60.0. Got {score_below}"
        );

        // 7 chars but last differs → ratio=0.7, char comparison: 6/10 = 60.0
        let seven_mismatch = normalizer.normalize_line("abcdefz".to_string()).unwrap();
        let score_mismatch = normalizer.similarity_score(&seven_mismatch, &ten_chars);
        assert!(
            (score_mismatch - 60.0).abs() < f64::EPSILON,
            "At boundary with mismatch, char comparison gives 60.0. Got {score_mismatch}"
        );

        // 7 chars, none match → ratio=0.7, NOT rejected, char comparison: 0/10 = 0.0
        let seven_none = normalizer.normalize_line("xyzxyzx".to_string()).unwrap();
        let score_none = normalizer.similarity_score(&seven_none, &ten_chars);
        assert!(
            score_none < 1.0,
            "At boundary with zero char matches, should be ~0. Got {score_none}"
        );
    }

    #[test]
    fn test_similarity_score_one_char_diff() {
        let normalizer = Normalizer::new(Config::default());
        let a = normalizer.normalize_line("abcdefghij".to_string()).unwrap();
        let b = normalizer.normalize_line("abcdefghix".to_string()).unwrap();
        let score = normalizer.similarity_score(&a, &b);
        // 9/10 chars match = 90.0
        assert!(
            (score - 90.0).abs() < f64::EPSILON,
            "Expected 90.0, got {score}"
        );
    }

    // --- similarity_score edge cases for uncaught mutants ---

    #[test]
    fn test_similarity_score_empty_vs_nonempty() {
        // Kills mutant: max_len == 0 → max_len != 0
        // With one empty and one non-empty, max_len > 0, min_len = 0
        // length_ratio = 0/5 = 0.0 < 0.7 → returns 0.0
        let normalizer = Normalizer::new(Config::default());
        let empty = LogLine {
            original: String::new(),
            normalized: String::new(),
            tokens: vec![],
            hash: 0,
        };
        let nonempty = LogLine {
            original: "hello".into(),
            normalized: "hello".into(),
            tokens: vec![],
            hash: 1,
        };
        let score = normalizer.similarity_score(&empty, &nonempty);
        assert!(
            score < 1.0,
            "empty vs non-empty should score near 0, got {score}"
        );
    }

    #[test]
    fn test_similarity_score_min_max_not_swapped() {
        // Kills mutant: min_len ↔ max_len swap in length_ratio calculation
        // len1=3, len2=10: ratio should be 3/10=0.3, NOT 10/3=3.33
        let normalizer = Normalizer::new(Config::default());
        let short = LogLine {
            original: "abc".into(),
            normalized: "abc".into(),
            tokens: vec![],
            hash: 0,
        };
        let long = LogLine {
            original: "abcdefghij".into(),
            normalized: "abcdefghij".into(),
            tokens: vec![],
            hash: 1,
        };
        let score = normalizer.similarity_score(&short, &long);
        // ratio = 3/10 = 0.3 < 0.7 → returns 0.3 * 100 = 30.0
        assert!(
            (score - 30.0).abs() < f64::EPSILON,
            "3/10 ratio should give 30.0, got {score}"
        );
    }

    #[test]
    fn test_similarity_score_division_direction() {
        // Kills mutant: `/ max_len` → `* max_len` or `+ max_len` in line 234
        // 5 chars match out of 10 max → 5/10 * 100 = 50.0 (not 5*10*100)
        let normalizer = Normalizer::new(Config::default());
        let a = LogLine {
            original: "abcdeXXXXX".into(),
            normalized: "abcdeXXXXX".into(),
            tokens: vec![],
            hash: 0,
        };
        let b = LogLine {
            original: "abcdeYYYYY".into(),
            normalized: "abcdeYYYYY".into(),
            tokens: vec![],
            hash: 1,
        };
        let score = normalizer.similarity_score(&a, &b);
        assert!(
            (score - 50.0).abs() < f64::EPSILON,
            "5/10 matching chars should give 50.0, got {score}"
        );
    }

    #[test]
    fn test_are_similar_hash_shortcircuit() {
        let normalizer = Normalizer::new(Config::default());
        let a = LogLine {
            original: "completely different".into(),
            normalized: "completely different".into(),
            tokens: vec![],
            hash: 42,
        };
        let b = LogLine {
            original: "not similar at all really".into(),
            normalized: "not similar at all really".into(),
            tokens: vec![],
            hash: 42, // same hash = shortcircuit to true
        };
        assert!(
            normalizer.are_similar(&a, &b),
            "same hash should shortcircuit to similar"
        );
    }

    // --- summarize_variation_types direct tests (mutant kills) ---

    #[test]
    fn test_variation_types_different_ips() {
        let normalizer = Normalizer::new(Config::default());
        let first = vec![Token::IPv4("10.0.0.1".to_string())];
        let last = vec![Token::IPv4("10.0.0.2".to_string())];
        let types = normalizer.summarize_variation_types(&first, &last);
        assert_eq!(types, vec!["IP"]);
    }

    #[test]
    fn test_variation_types_same_tokens_no_variation() {
        let normalizer = Normalizer::new(Config::default());
        let tokens = vec![Token::IPv4("10.0.0.1".to_string())];
        let types = normalizer.summarize_variation_types(&tokens, &tokens);
        assert!(types.is_empty(), "Same tokens should produce no variation");
    }

    #[test]
    fn test_variation_types_essence_mode_skips_timestamps() {
        let config = Config {
            essence_mode: true,
            ..Config::default()
        };
        let normalizer = Normalizer::new(config);
        let first = vec![Token::Timestamp("2025-01-01T00:00:00Z".to_string())];
        let last = vec![Token::Timestamp("2025-01-02T00:00:00Z".to_string())];
        let types = normalizer.summarize_variation_types(&first, &last);
        assert!(
            types.is_empty(),
            "Essence mode should skip timestamp variations"
        );
    }

    #[test]
    fn test_variation_types_non_essence_includes_timestamps() {
        let normalizer = Normalizer::new(Config::default());
        let first = vec![Token::Timestamp("2025-01-01T00:00:00Z".to_string())];
        let last = vec![Token::Timestamp("2025-01-02T00:00:00Z".to_string())];
        let types = normalizer.summarize_variation_types(&first, &last);
        assert_eq!(types, vec!["timestamp"]);
    }

    #[test]
    fn test_variation_types_multiple_types_sorted() {
        let normalizer = Normalizer::new(Config::default());
        let first = vec![
            Token::IPv4("10.0.0.1".to_string()),
            Token::Uuid("aaa".to_string()),
        ];
        let last = vec![
            Token::IPv4("10.0.0.2".to_string()),
            Token::Uuid("bbb".to_string()),
        ];
        let types = normalizer.summarize_variation_types(&first, &last);
        assert_eq!(types, vec!["IP", "UUID"]);
    }

    // --- normalize_line short-circuit tests (mutant kills) ---

    #[test]
    fn test_normalize_ips_only_flag() {
        let config = Config {
            normalize_ips: true,
            normalize_ports: false,
            normalize_fqdns: false,
            ..Config::default()
        };
        let normalizer = Normalizer::new(config);
        let line = normalizer
            .normalize_line("connect to 10.0.0.1:8080".to_string())
            .unwrap();
        assert!(
            line.tokens.iter().any(|t| matches!(t, Token::IPv4(_))),
            "IPs should be detected"
        );
    }

    #[test]
    fn test_normalize_ports_only_flag() {
        let config = Config {
            normalize_ips: false,
            normalize_ports: true,
            normalize_fqdns: false,
            ..Config::default()
        };
        let normalizer = Normalizer::new(config);
        let line = normalizer
            .normalize_line("connect to localhost:8080".to_string())
            .unwrap();
        assert!(
            line.tokens.iter().any(|t| matches!(t, Token::Port(_))),
            "Ports should be detected"
        );
    }

    // ---- normalize_line: boolean condition tests ----

    #[test]
    fn normalize_line_json_disabled_no_detection() {
        let config = Config {
            normalize_json: false,
            ..Config::default()
        };
        let n = Normalizer::new(config);
        let line = n.normalize_line(r"&Event{Type: Warning}".into()).unwrap();
        // With JSON detection disabled, Event objects should NOT be detected
        assert!(
            !line.tokens.iter().any(|t| matches!(t, Token::Json(_))),
            "JSON detection should be disabled"
        );
    }

    #[test]
    fn structured_detection_brace_only() {
        // Input with { but no = — should still trigger structured detection
        let n = Normalizer::new(Config::default());
        let line = n
            .normalize_line(r#"{"level":"error","component":"web","msg":"fail"}"#.into())
            .unwrap();
        assert!(
            line.tokens
                .iter()
                .any(|t| matches!(t, Token::StructuredMessage { .. })),
            "Brace-only input should trigger structured detection: {:?}",
            line.tokens
        );
    }

    #[test]
    fn structured_detection_equals_only() {
        // Input with = but no { — should still trigger the structured/KV detection path
        // The || ensures both branches (contains '{') and (contains '=') individually pass
        let n = Normalizer::new(Config::default());
        let line = n
            .normalize_line("level=error component=web msg=fail".into())
            .unwrap();
        // Either StructuredMessage or KeyValuePair tokens indicate the = path was taken
        assert!(
            line.tokens.iter().any(|t| matches!(
                t,
                Token::StructuredMessage { .. } | Token::KeyValuePair { .. }
            )),
            "Equals-only input should trigger structured or KV detection: {:?}",
            line.tokens
        );
    }

    // ---- Mutant-killing: normalize_timestamps=false with colon input ----

    #[test]
    fn normalize_timestamps_disabled_with_colon_input() {
        // Kills mutant: `self.config.normalize_timestamps && text.contains(':')` → `||`
        // If mutated to ||, timestamps would be detected even when disabled
        let config = Config {
            normalize_timestamps: false,
            ..Config::default()
        };
        let normalizer = Normalizer::new(config);
        let line = normalizer
            .normalize_line("10:15:30 Error occurred".to_string())
            .unwrap();
        assert!(
            !line.tokens.iter().any(|t| matches!(t, Token::Timestamp(_))),
            "Timestamps should NOT be detected when normalize_timestamps=false"
        );
    }

    // ---- Mutant-killing: normalize_emails=false (line 43) ----

    #[test]
    fn normalize_emails_disabled_no_detection() {
        let config = Config {
            normalize_emails: false,
            ..Config::default()
        };
        let n = Normalizer::new(config);
        let line = n
            .normalize_line("user test@example.com logged in".into())
            .unwrap();
        assert!(
            !line.tokens.iter().any(|t| matches!(t, Token::Email(_))),
            "Emails should NOT be detected when normalize_emails=false: {:?}",
            line.tokens
        );
    }

    // ---- Mutant-killing: quoted string detection (line 173) ----

    #[test]
    fn quoted_detection_single_quote_only() {
        // Input with ' but no " — should still trigger quoted string detection path
        // Kills: || with && on `contains('"') || contains('\'')`
        let n = Normalizer::new(Config::default());
        let line = n
            .normalize_line("mount 'very-long-volume-name-that-exceeds-threshold-ok' done".into())
            .unwrap();
        // The ' path should be entered (if || is correct, either quote type suffices)
        // Just verify no panic — the detection may or may not produce tokens
        let _ = line;
    }

    // ---- Mutant-killing: normalize_json=false with brace input ----

    #[test]
    fn normalize_json_disabled_with_brace_input() {
        // Note: this mutant (normalize.rs:59) is excluded via .cargo/mutants.toml
        // because PathDetector (step 3) already replaces &Event{} with <EVENT_OBJECT>
        // before JsonDetector (step 4) ever runs. The normalize_json guard is
        // structurally unreachable — an equivalent mutant.
        //
        // This test verifies JsonDetector itself works in isolation.
        let (_, direct_tokens) =
            crate::patterns::json::JsonDetector::detect_and_replace("&Event{Type: Warning}");
        assert!(
            direct_tokens.iter().any(|t| matches!(t, Token::Json(_))),
            "JsonDetector should detect Event objects: {direct_tokens:?}"
        );
    }

    // ---- Mutant-killing: --disable-patterns guards at the Normalizer boundary ----
    // Each test uses an input that provably triggers its detector (default on),
    // then asserts the corresponding token type disappears when the guard is off.

    fn run(config_mut: impl FnOnce(&mut Config), input: &str) -> LogLine {
        let mut config = Config::default();
        config_mut(&mut config);
        Normalizer::new(config)
            .normalize_line(input.to_string())
            .unwrap()
    }

    #[test]
    fn normalize_kubernetes_disabled_suppresses_k8s_tokens() {
        let input = "volume \"kube-api-access-abc123\" (projected) failed to mount for pod kube-system/test-pod";
        let on = run(|_| {}, input);
        let off = run(|c| c.normalize_kubernetes = false, input);
        assert!(
            on.tokens.iter().any(|t| matches!(
                t,
                Token::KubernetesNamespace(_) | Token::PodName(_) | Token::VolumeName(_)
            )),
            "expected k8s token with detector ON, got {:?}",
            on.tokens
        );
        assert!(
            !off.tokens.iter().any(|t| matches!(
                t,
                Token::KubernetesNamespace(_) | Token::PodName(_) | Token::VolumeName(_)
            )),
            "expected NO k8s token with detector OFF, got {:?}",
            off.tokens
        );
    }

    #[test]
    fn normalize_names_disabled_suppresses_name_tokens() {
        let input = "service api-deploy-abc123-x1y2 started";
        let on = run(|_| {}, input);
        let off = run(|c| c.normalize_names = false, input);
        assert!(
            on.tokens.iter().any(|t| matches!(t, Token::Name(_))),
            "expected Name token with detector ON, got {:?}",
            on.tokens
        );
        assert!(
            !off.tokens.iter().any(|t| matches!(t, Token::Name(_))),
            "expected NO Name token with detector OFF, got {:?}",
            off.tokens
        );
    }

    #[test]
    fn normalize_quoted_disabled_suppresses_quoted_tokens() {
        let input = "message \"some variable value here\" done";
        let on = run(|_| {}, input);
        let off = run(|c| c.normalize_quoted = false, input);
        assert!(
            on.tokens
                .iter()
                .any(|t| matches!(t, Token::QuotedString(_))),
            "expected QuotedString token with detector ON, got {:?}",
            on.tokens
        );
        assert!(
            !off.tokens
                .iter()
                .any(|t| matches!(t, Token::QuotedString(_))),
            "expected NO QuotedString token with detector OFF, got {:?}",
            off.tokens
        );
    }

    #[test]
    fn normalize_brackets_disabled_suppresses_bracket_context_tokens() {
        let input = "[error] [mod_jk] request failed";
        let on = run(|_| {}, input);
        let off = run(|c| c.normalize_brackets = false, input);
        assert!(
            on.tokens
                .iter()
                .any(|t| matches!(t, Token::BracketContext(_))),
            "expected BracketContext token with detector ON, got {:?}",
            on.tokens
        );
        assert!(
            !off.tokens
                .iter()
                .any(|t| matches!(t, Token::BracketContext(_))),
            "expected NO BracketContext token with detector OFF, got {:?}",
            off.tokens
        );
    }

    #[test]
    fn normalize_key_value_disabled_suppresses_kv_tokens() {
        let input = "level=error status=500 user_id=42";
        let on = run(|_| {}, input);
        let off = run(|c| c.normalize_key_value = false, input);
        assert!(
            on.tokens
                .iter()
                .any(|t| matches!(t, Token::KeyValuePair { .. })),
            "expected KeyValuePair token with detector ON, got {:?}",
            on.tokens
        );
        assert!(
            !off.tokens
                .iter()
                .any(|t| matches!(t, Token::KeyValuePair { .. })),
            "expected NO KeyValuePair token with detector OFF, got {:?}",
            off.tokens
        );
    }

    #[test]
    fn normalize_http_status_disabled_suppresses_http_tokens() {
        let input =
            r#"127.0.0.1 - - [25/Dec/2023:10:15:30 +0000] "POST /api/login HTTP/1.1" 401 256"#;
        let on = run(|_| {}, input);
        let off = run(|c| c.normalize_http_status = false, input);
        assert!(
            on.tokens
                .iter()
                .any(|t| matches!(t, Token::HttpStatus(_) | Token::HttpStatusClass(_))),
            "expected HTTP status token with detector ON, got {:?}",
            on.tokens
        );
        assert!(
            !off.tokens
                .iter()
                .any(|t| matches!(t, Token::HttpStatus(_) | Token::HttpStatusClass(_))),
            "expected NO HTTP status token with detector OFF, got {:?}",
            off.tokens
        );
    }

    // ---- Mutant-killing: similarity_score division vs multiplication ----

    #[test]
    fn similarity_score_division_not_multiplication() {
        // Kills mutant: `min_len as f64 / max_len as f64` → `*`
        // Use strings where:
        //   division: 3/10 = 0.3 < 0.7 → returns 30.0 (quick reject)
        //   multiplication: 3*10 = 30.0 > 0.7 → falls through to char comparison
        // The char comparison for "xyz" vs "abcdefghij" (0 matching bytes) → 0.0
        // So: division returns 30.0, multiplication returns 0.0
        let normalizer = Normalizer::new(Config::default());
        let short = LogLine {
            original: "xyz".into(),
            normalized: "xyz".into(),
            tokens: vec![],
            hash: 0,
        };
        let long = LogLine {
            original: "abcdefghij".into(),
            normalized: "abcdefghij".into(),
            tokens: vec![],
            hash: 1,
        };
        let score = normalizer.similarity_score(&short, &long);
        // With /: ratio = 3/10 = 0.3 < 0.7 → returns 30.0
        // With *: ratio = 30.0 > 0.7 → char comparison (0 matching) → 0.0
        assert!(
            score > 20.0,
            "3/10 ratio should give 30.0, got {score} (if 0.0, division was mutated to *)"
        );
    }
}