use ahash::AHasher;
use anyhow::Result;
use std::hash::{Hash, Hasher};
use crate::config::Config;
use crate::patterns::{
LogLine, Token, duration::DurationDetector, email::EmailPatternDetector, hash::HashDetector,
json::JsonDetector, kubernetes::KubernetesDetector, names::NameDetector,
network::NetworkDetector, path::PathDetector, process::ProcessDetector,
quoted::QuotedStringDetector, timestamp::TimestampDetector, uuid::UuidDetector,
};
/// Rewrites log lines into a normalized form and compares the results.
///
/// Which detectors run, the similarity threshold and the collapsed-line
/// format are all read from `config`.
pub struct Normalizer {
/// Flags and thresholds read while normalizing, comparing and formatting lines.
config: Config,
/// Email detector instance, built once in `new` and reused for every line.
email_detector: EmailPatternDetector,
}
impl Normalizer {
/// Builds a normalizer that follows the given `config`.
///
/// The email detector is constructed here, once, so that `normalize_line`
/// can reuse the same instance for every line.
///
/// # Panics
///
/// Panics if `EmailPatternDetector::new()` reports a failure.
pub fn new(config: Config) -> Self {
    // The signature cannot carry an error, so a failed detector construction
    // still aborts, but now with a message that says what went wrong.
    let email_detector = EmailPatternDetector::new()
        .expect("EmailPatternDetector::new() failed while building Normalizer");
    Self {
        config,
        email_detector,
    }
}
/// Runs every enabled detector over `original`, in a fixed order, and packages the outcome.
///
/// Each detector receives the text produced by the previous one and hands back
/// the rewritten text plus the tokens it extracted; tokens are collected in
/// detector order. Several detectors are additionally skipped when the current
/// text lacks the character they key on (`@`, `{`, `[`, `=`, quotes).
/// The log-module and structured-message passes are not tied to a config flag:
/// they run whenever the current text contains `[` (respectively `{` or `=`).
///
/// Returns a `LogLine` holding the untouched input, the final normalized text,
/// the collected tokens and a hash of the normalized text. Nothing in this
/// function produces an `Err`.
pub fn normalize_line(&self, original: String) -> Result<LogLine> {
    let cfg = &self.config;
    let mut normalized = original.clone();
    let mut tokens = Vec::with_capacity(8);

    // Swaps in a detector's rewritten text and keeps the tokens it found.
    // The detector call is fully evaluated before `normalized` is overwritten.
    macro_rules! apply {
        ($outcome:expr) => {{
            let (rewritten, found) = $outcome;
            normalized = rewritten;
            tokens.extend(found);
        }};
    }

    if cfg.normalize_timestamps {
        apply!(TimestampDetector::detect_and_replace(&normalized));
    }
    if cfg.normalize_emails && normalized.contains('@') {
        apply!(self.email_detector.detect_and_replace(&normalized));
    }
    if cfg.normalize_paths {
        apply!(PathDetector::detect_and_replace(&normalized));
    }
    if cfg.normalize_json && normalized.contains('{') {
        apply!(JsonDetector::detect_and_replace(&normalized));
    }
    if cfg.normalize_uuids {
        apply!(UuidDetector::detect_and_replace(&normalized));
    }
    if cfg.normalize_ips || cfg.normalize_ports || cfg.normalize_fqdns {
        apply!(NetworkDetector::detect_and_replace(
            &normalized,
            cfg.normalize_ips,
            cfg.normalize_ports,
            cfg.normalize_fqdns,
        ));
    }
    if cfg.normalize_hashes {
        apply!(HashDetector::detect_and_replace(&normalized));
    }
    if cfg.normalize_pids {
        apply!(ProcessDetector::detect_and_replace(&normalized));
    }
    if cfg.normalize_kubernetes {
        apply!(KubernetesDetector::detect_and_replace(&normalized));
    }
    if cfg.normalize_http_status {
        apply!(crate::patterns::http_status::HttpStatusDetector::detect_and_replace(&normalized));
    }
    if cfg.normalize_brackets && normalized.contains('[') {
        apply!(
            crate::patterns::bracket_context::BracketContextDetector::detect_and_replace(
                &normalized,
            )
        );
    }
    if cfg.normalize_key_value && normalized.contains('=') {
        apply!(crate::patterns::key_value::KeyValueDetector::detect_and_replace(&normalized));
    }
    // The next two passes have no config switch; only the character check gates them.
    if normalized.contains('[') {
        apply!(crate::patterns::log_module::LogWithModuleDetector::detect_and_replace(&normalized));
    }
    if normalized.contains('{') || normalized.contains('=') {
        apply!(
            crate::patterns::structured::StructuredMessageDetector::detect_and_replace(
                &normalized,
            )
        );
    }
    if cfg.normalize_durations {
        apply!(DurationDetector::detect_and_replace(&normalized));
    }
    if cfg.normalize_names {
        apply!(NameDetector::detect_and_replace(&normalized));
    }
    if cfg.normalize_quoted && (normalized.contains('"') || normalized.contains('\'')) {
        apply!(QuotedStringDetector::detect_and_replace(&normalized));
    }

    // The hash covers the fully normalized text only, not the original line.
    let hash = self.calculate_hash(&normalized);
    Ok(LogLine {
        original,
        normalized,
        tokens,
        hash,
    })
}
/// Hashes the normalized text with a default-constructed `AHasher`.
fn calculate_hash(&self, normalized: &str) -> u64 {
    let mut state = AHasher::default();
    Hash::hash(normalized, &mut state);
    Hasher::finish(&state)
}
/// Scores how alike two normalized lines are, on a 0–100 scale.
///
/// * Identical normalized text (including two empty strings) scores `100.0`.
/// * If the shorter text is under 70% of the longer one's byte length, the
///   score is that length ratio times 100 and no bytes are compared.
/// * Otherwise the score is the number of positions, within the shorter
///   text, that hold the same byte in both lines, divided by the longer
///   length, times 100.
///
/// Lengths and positions are measured in bytes, not characters.
// Lengths and counts are converted to `f64` only to form a ratio.
#[allow(clippy::cast_precision_loss)]
pub fn similarity_score(&self, line1: &LogLine, line2: &LogLine) -> f64 {
    let s1 = &line1.normalized;
    let s2 = &line2.normalized;
    if s1 == s2 {
        return 100.0;
    }

    // The texts differ, so at least one is non-empty and `longest` is never zero.
    let longest = s1.len().max(s2.len());
    let shortest = s1.len().min(s2.len());

    let length_ratio = shortest as f64 / longest as f64;
    if length_ratio < 0.7 {
        return length_ratio * 100.0;
    }

    // `zip` stops at the end of the shorter text, so only the overlapping
    // prefix is compared; the tail of the longer text counts as mismatches.
    let matching = s1
        .as_bytes()
        .iter()
        .zip(s2.as_bytes())
        .filter(|(a, b)| a == b)
        .count();

    (matching as f64 / longest as f64) * 100.0
}
/// Decides whether two lines belong to the same group.
///
/// Equal hashes are accepted immediately without computing a score;
/// otherwise the lines are similar when `similarity_score` reaches
/// `config.threshold`.
pub fn are_similar(&self, line1: &LogLine, line2: &LogLine) -> bool {
    line1.hash == line2.hash
        || self.similarity_score(line1, line2) >= f64::from(self.config.threshold)
}
/// Renders the placeholder line that stands in for `count` collapsed lines.
///
/// In compact mode the output is `[+N similar]`, extended with a
/// `varying: ...` list when the tokens of `first` and `last` differ.
/// Otherwise the output names the count and the timestamps of the first
/// and last collapsed line.
pub fn format_collapsed_line(&self, first: &LogLine, last: &LogLine, count: usize) -> String {
    if !self.config.compact {
        return format!(
            "[...collapsed {} similar lines from {} to {}...]",
            count,
            self.format_timestamp(first),
            self.format_timestamp(last)
        );
    }

    let varying = self.summarize_variation_types(&first.tokens, &last.tokens);
    match varying.as_slice() {
        [] => format!("[+{count} similar]"),
        kinds => format!(
            "[+{} similar, varying: {}]",
            count,
            kinds.join(", ")
        ),
    }
}
/// Returns the display text of the first timestamp token on the line for
/// which `extract_time_part` yields a value, or `"unknown"` if there is none.
fn format_timestamp(&self, log_line: &LogLine) -> String {
    log_line
        .tokens
        .iter()
        .find_map(|token| match token {
            Token::Timestamp(raw) => Self::extract_time_part(raw),
            _ => None,
        })
        .unwrap_or_else(|| "unknown".to_string())
}
/// Returns the text to display for a timestamp token.
///
/// The whole token is returned unchanged, so the result is always `Some`.
fn extract_time_part(timestamp: &str) -> Option<String> {
    let shown = String::from(timestamp);
    Some(shown)
}
/// Lists, sorted by name, the token categories whose values differ between two lines.
///
/// Tokens are grouped by category label (for example both `IPv4` and `IPv6`
/// count as `"IP"`), keeping each group's values in token order. A category
/// is reported when its value list for `first_tokens` is not equal to the
/// one for `last_tokens`; a category present on only one side therefore
/// counts as varying. In essence mode the `"timestamp"` category is never
/// reported.
fn summarize_variation_types(
    &self,
    first_tokens: &[Token],
    last_tokens: &[Token],
) -> Vec<String> {
    /// Maps a token to its category label and a printable form of its value.
    fn token_info(token: &Token) -> (&'static str, String) {
        match token {
            Token::Timestamp(v) => ("timestamp", v.clone()),
            Token::IPv4(v) => ("IP", v.clone()),
            Token::IPv6(v) => ("IP", v.clone()),
            Token::Port(v) => ("port", v.to_string()),
            Token::Hash(_, v) => ("hash", v.clone()),
            Token::Uuid(v) => ("UUID", v.clone()),
            Token::Pid(v) => ("PID", v.to_string()),
            Token::ThreadID(v) => ("thread", v.clone()),
            Token::Path(v) => ("path", v.clone()),
            Token::Json(v) => ("json", v.clone()),
            Token::Duration(v) => ("duration", v.clone()),
            Token::Size(v) => ("size", v.clone()),
            Token::Number(v) => ("number", v.clone()),
            Token::HttpStatus(v) => ("http_status", v.to_string()),
            Token::QuotedString(v) => ("quoted_string", v.clone()),
            Token::Name(v) => ("name", v.clone()),
            Token::KubernetesNamespace(v) => ("namespace", v.clone()),
            Token::VolumeName(v) => ("volume", v.clone()),
            Token::PluginType(v) => ("plugin", v.clone()),
            Token::PodName(v) => ("pod", v.clone()),
            Token::HttpStatusClass(v) => ("http_status_class", v.clone()),
            Token::BracketContext(v) => ("bracket_context", v.join(",")),
            Token::KeyValuePair { key, value_type } => {
                ("key_value_pair", format!("{key}={value_type}"))
            }
            Token::Email(v) => ("email", v.clone()),
            // These two carry no comparable value; only their presence is tracked.
            Token::LogWithModule { .. } => ("log_with_module", String::new()),
            Token::StructuredMessage { .. } => ("structured_message", String::new()),
        }
    }

    /// Groups printable token values by category, preserving token order within each group.
    fn group_by_kind(tokens: &[Token]) -> std::collections::HashMap<&'static str, Vec<String>> {
        let mut grouped: std::collections::HashMap<&'static str, Vec<String>> =
            std::collections::HashMap::new();
        for token in tokens {
            let (kind, value) = token_info(token);
            grouped.entry(kind).or_default().push(value);
        }
        grouped
    }

    let first_values = group_by_kind(first_tokens);
    let last_values = group_by_kind(last_tokens);
    let skip_timestamps = self.config.essence_mode;

    // Union of the categories seen on either side; the set makes each one unique.
    let all_kinds: std::collections::HashSet<&str> = first_values
        .keys()
        .chain(last_values.keys())
        .copied()
        .collect();

    let mut result: Vec<String> = all_kinds
        .into_iter()
        .filter(|&kind| !(skip_timestamps && kind == "timestamp"))
        .filter(|&kind| {
            // A missing category compares as an empty list; values are
            // compared through borrowed slices, so nothing is cloned.
            let before: &[String] = first_values
                .get(kind)
                .map(Vec::as_slice)
                .unwrap_or_default();
            let after: &[String] = last_values
                .get(kind)
                .map(Vec::as_slice)
                .unwrap_or_default();
            before != after
        })
        .map(String::from)
        .collect();

    // Entries are unique, so an unstable sort yields the same order as a stable one.
    result.sort_unstable();
    result
}
}
// Unit tests for `Normalizer`: normalization output, similarity scoring,
// variation summaries and the per-detector config switches.
#[cfg(test)]
mod tests {
use super::*;
// A leading date-time is replaced by `<TIMESTAMP>` and yields exactly one Timestamp token.
#[test]
fn test_timestamp_normalization() {
let config = Config::default();
let normalizer = Normalizer::new(config);
let line = normalizer
.normalize_line("2025-01-20 10:15:30 Error occurred".to_string())
.unwrap();
assert_eq!(line.normalized, "<TIMESTAMP> Error occurred");
assert_eq!(line.tokens.len(), 1);
assert!(matches!(line.tokens[0], Token::Timestamp(_)));
}
// An `ip:port` pair becomes `<IP>:<PORT>` and produces two tokens.
#[test]
fn test_ip_port_normalization() {
let config = Config::default();
let normalizer = Normalizer::new(config);
let line = normalizer
.normalize_line("Connection to 192.168.1.100:8080 failed".to_string())
.unwrap();
assert_eq!(line.normalized, "Connection to <IP>:<PORT> failed");
assert_eq!(line.tokens.len(), 2);
}
// Lines that differ only in timestamp, pid, IP and port are judged similar and score at least 85.
#[test]
fn test_similarity_calculation() {
let config = Config::default();
let normalizer = Normalizer::new(config);
let line1 = normalizer
.normalize_line(
"2025-01-20 10:15:01 [pid=12345] Connection failed to 192.168.1.100:8080"
.to_string(),
)
.unwrap();
let line2 = normalizer
.normalize_line(
"2025-01-20 10:15:02 [pid=12346] Connection failed to 192.168.1.101:8081"
.to_string(),
)
.unwrap();
assert!(normalizer.are_similar(&line1, &line2));
let score = normalizer.similarity_score(&line1, &line2);
assert!(score >= 85.0);
}
// Normalizing the same text twice yields the same hash.
#[test]
fn test_hash_consistency() {
let config = Config::default();
let normalizer = Normalizer::new(config);
let line1 = normalizer
.normalize_line("<TIMESTAMP> [pid=<PID>] Connection failed to <IP>:<PORT>".to_string())
.unwrap();
let line2 = normalizer
.normalize_line("<TIMESTAMP> [pid=<PID>] Connection failed to <IP>:<PORT>".to_string())
.unwrap();
assert_eq!(line1.hash, line2.hash);
}
// With timestamp, IP and port detection switched off, no `<TIMESTAMP>` or `<IP>` appears.
// NOTE(review): the `<NUMBER>` / `<DECIMAL>` placeholders in the expected text come from
// detectors that stay enabled; which ones is not visible from this file.
#[test]
fn test_disabled_normalization() {
let config = Config {
normalize_timestamps: false,
normalize_ips: false,
normalize_ports: false,
..Config::default()
};
let normalizer = Normalizer::new(config);
let line = normalizer
.normalize_line("2025-01-20 10:15:30 Connection to 192.168.1.100 failed".to_string())
.unwrap();
assert_eq!(
line.normalized,
"<NUMBER>-01-20 10:15:30 Connection to <DECIMAL>.<DECIMAL> failed"
);
}
// `format_timestamp` returns the captured timestamp text verbatim for three different formats.
#[test]
fn test_timestamp_format_preservation() {
let config = Config::default();
let normalizer = Normalizer::new(config);
let line1 = normalizer
.normalize_line("2025-09-18 13:26:30,188 INFO: test message".to_string())
.unwrap();
let formatted = normalizer.format_timestamp(&line1);
assert_eq!(formatted, "2025-09-18 13:26:30,188");
let line2 = normalizer
.normalize_line("2025-09-18 13:26:53.345 UTC [24] LOG test".to_string())
.unwrap();
let formatted2 = normalizer.format_timestamp(&line2);
assert_eq!(formatted2, "2025-09-18 13:26:53.345 UTC");
let line3 = normalizer
.normalize_line("2025-01-20T10:15:30.123Z INFO test".to_string())
.unwrap();
let formatted3 = normalizer.format_timestamp(&line3);
assert_eq!(formatted3, "2025-01-20T10:15:30.123Z");
}
// A timestamp-shaped string with out-of-range fields is still captured and returned verbatim.
#[test]
fn test_invalid_timestamp_handling() {
let config = Config::default();
let normalizer = Normalizer::new(config);
let line = normalizer
.normalize_line("2025-02-31 25:99:99,999 ERROR: invalid timestamp".to_string())
.unwrap();
let formatted = normalizer.format_timestamp(&line);
assert_eq!(formatted, "2025-02-31 25:99:99,999");
}
// A line without any timestamp token formats as "unknown".
#[test]
fn test_no_timestamp_handling() {
let config = Config::default();
let normalizer = Normalizer::new(config);
let line = normalizer
.normalize_line("Just a log message with no timestamp".to_string())
.unwrap();
let formatted = normalizer.format_timestamp(&line);
assert_eq!(formatted, "unknown");
}
// The colons inside a timestamp must not produce Port tokens, while
// `host:port`, `ipv4:port` and `[ipv6]:port` must each produce one.
#[test]
fn test_port_detection_vs_timestamps() {
let config = Config::default();
let normalizer = Normalizer::new(config);
let line1 = normalizer
.normalize_line("2025-01-20 10:15:30 Connection failed".to_string())
.unwrap();
assert_eq!(line1.normalized, "<TIMESTAMP> Connection failed");
assert!(
line1
.tokens
.iter()
.any(|t| matches!(t, Token::Timestamp(_)))
);
assert!(!line1.tokens.iter().any(|t| matches!(t, Token::Port(_))));
let line2 = normalizer
.normalize_line("Connection to localhost:8080 failed".to_string())
.unwrap();
assert_eq!(line2.normalized, "Connection to localhost:<PORT> failed");
assert!(line2.tokens.iter().any(|t| matches!(t, Token::Port(8080))));
let line3 = normalizer
.normalize_line("Connection to 192.168.1.1:3000 failed".to_string())
.unwrap();
assert_eq!(line3.normalized, "Connection to <IP>:<PORT> failed");
assert!(line3.tokens.iter().any(|t| matches!(t, Token::IPv4(_))));
assert!(line3.tokens.iter().any(|t| matches!(t, Token::Port(3000))));
let line4 = normalizer
.normalize_line("Connection to [2001:db8::1]:8080 failed".to_string())
.unwrap();
assert_eq!(line4.normalized, "Connection to [<IP>]:<PORT> failed");
assert!(line4.tokens.iter().any(|t| matches!(t, Token::IPv6(_))));
assert!(line4.tokens.iter().any(|t| matches!(t, Token::Port(8080))));
}
// A line compared with itself scores exactly 100.
#[test]
fn test_similarity_score_identical() {
let normalizer = Normalizer::new(Config::default());
let line = normalizer
.normalize_line("hello world".to_string())
.unwrap();
let score = normalizer.similarity_score(&line, &line);
assert!((score - 100.0).abs() < f64::EPSILON);
}
// Equal-length strings with no matching byte score below 1.
#[test]
fn test_similarity_score_completely_different() {
let normalizer = Normalizer::new(Config::default());
let a = normalizer.normalize_line("aaaa".to_string()).unwrap();
let b = normalizer.normalize_line("zzzz".to_string()).unwrap();
let score = normalizer.similarity_score(&a, &b);
assert!(
score < 1.0,
"Completely different strings should score near 0, got {score}"
);
}
// Four of five bytes matching gives 80.
#[test]
fn test_similarity_score_partial_match() {
let normalizer = Normalizer::new(Config::default());
let a = normalizer.normalize_line("hello".to_string()).unwrap();
let b = normalizer.normalize_line("hella".to_string()).unwrap();
let score = normalizer.similarity_score(&a, &b);
assert!(
(score - 80.0).abs() < f64::EPSILON,
"Expected 80.0, got {score}"
);
}
// A 2-vs-10 length ratio is below 0.7, so the ratio itself (20.0) is returned.
#[test]
fn test_similarity_score_length_ratio_rejection() {
let normalizer = Normalizer::new(Config::default());
let short = normalizer.normalize_line("ab".to_string()).unwrap();
let long = normalizer.normalize_line("abcdefghij".to_string()).unwrap();
let score = normalizer.similarity_score(&short, &long);
assert!(
(score - 20.0).abs() < f64::EPSILON,
"Expected 20.0 (ratio rejection), got {score}"
);
}
// Two empty normalized strings score 100.
#[test]
fn test_similarity_score_empty_strings() {
let normalizer = Normalizer::new(Config::default());
let empty = LogLine {
original: String::new(),
normalized: String::new(),
tokens: vec![],
hash: 0,
};
let score = normalizer.similarity_score(&empty, &empty);
assert!(
(score - 100.0).abs() < f64::EPSILON,
"Empty vs empty should be 100.0"
);
}
// Pins the 0.7 cut-off: a ratio of exactly 0.7 goes through the byte comparison,
// while 0.6 returns the ratio without comparing bytes.
#[test]
fn test_similarity_score_at_length_ratio_boundary() {
let normalizer = Normalizer::new(Config::default());
let ten_chars = normalizer.normalize_line("abcdefghij".to_string()).unwrap();
let seven_match = normalizer.normalize_line("abcdefg".to_string()).unwrap();
let score = normalizer.similarity_score(&seven_match, &ten_chars);
assert!(
(score - 70.0).abs() < f64::EPSILON,
"At boundary (0.7), should use char comparison. Got {score}"
);
let six_match = normalizer.normalize_line("abcdef".to_string()).unwrap();
let score_below = normalizer.similarity_score(&six_match, &ten_chars);
assert!(
(score_below - 60.0).abs() < f64::EPSILON,
"Below boundary, should return ratio*100=60.0. Got {score_below}"
);
let seven_mismatch = normalizer.normalize_line("abcdefz".to_string()).unwrap();
let score_mismatch = normalizer.similarity_score(&seven_mismatch, &ten_chars);
assert!(
(score_mismatch - 60.0).abs() < f64::EPSILON,
"At boundary with mismatch, char comparison gives 60.0. Got {score_mismatch}"
);
let seven_none = normalizer.normalize_line("xyzxyzx".to_string()).unwrap();
let score_none = normalizer.similarity_score(&seven_none, &ten_chars);
assert!(
score_none < 1.0,
"At boundary with zero char matches, should be ~0. Got {score_none}"
);
}
// Nine of ten bytes matching gives 90.
#[test]
fn test_similarity_score_one_char_diff() {
let normalizer = Normalizer::new(Config::default());
let a = normalizer.normalize_line("abcdefghij".to_string()).unwrap();
let b = normalizer.normalize_line("abcdefghix".to_string()).unwrap();
let score = normalizer.similarity_score(&a, &b);
assert!(
(score - 90.0).abs() < f64::EPSILON,
"Expected 90.0, got {score}"
);
}
// Empty against non-empty has a length ratio of 0 and must score below 1.
#[test]
fn test_similarity_score_empty_vs_nonempty() {
let normalizer = Normalizer::new(Config::default());
let empty = LogLine {
original: String::new(),
normalized: String::new(),
tokens: vec![],
hash: 0,
};
let nonempty = LogLine {
original: "hello".into(),
normalized: "hello".into(),
tokens: vec![],
hash: 1,
};
let score = normalizer.similarity_score(&empty, &nonempty);
assert!(
score < 1.0,
"empty vs non-empty should score near 0, got {score}"
);
}
// The length ratio is shorter/longer (3/10 = 30.0), not the other way round.
#[test]
fn test_similarity_score_min_max_not_swapped() {
let normalizer = Normalizer::new(Config::default());
let short = LogLine {
original: "abc".into(),
normalized: "abc".into(),
tokens: vec![],
hash: 0,
};
let long = LogLine {
original: "abcdefghij".into(),
normalized: "abcdefghij".into(),
tokens: vec![],
hash: 1,
};
let score = normalizer.similarity_score(&short, &long);
assert!(
(score - 30.0).abs() < f64::EPSILON,
"3/10 ratio should give 30.0, got {score}"
);
}
// Five matching bytes out of ten gives 50: matches are divided by the longer length.
#[test]
fn test_similarity_score_division_direction() {
let normalizer = Normalizer::new(Config::default());
let a = LogLine {
original: "abcdeXXXXX".into(),
normalized: "abcdeXXXXX".into(),
tokens: vec![],
hash: 0,
};
let b = LogLine {
original: "abcdeYYYYY".into(),
normalized: "abcdeYYYYY".into(),
tokens: vec![],
hash: 1,
};
let score = normalizer.similarity_score(&a, &b);
assert!(
(score - 50.0).abs() < f64::EPSILON,
"5/10 matching chars should give 50.0, got {score}"
);
}
// Equal hashes make `are_similar` return true even though the texts differ.
#[test]
fn test_are_similar_hash_shortcircuit() {
let normalizer = Normalizer::new(Config::default());
let a = LogLine {
original: "completely different".into(),
normalized: "completely different".into(),
tokens: vec![],
hash: 42,
};
let b = LogLine {
original: "not similar at all really".into(),
normalized: "not similar at all really".into(),
tokens: vec![],
hash: 42, };
assert!(
normalizer.are_similar(&a, &b),
"same hash should shortcircuit to similar"
);
}
// Two different IPv4 values are reported as the "IP" category.
#[test]
fn test_variation_types_different_ips() {
let normalizer = Normalizer::new(Config::default());
let first = vec![Token::IPv4("10.0.0.1".to_string())];
let last = vec![Token::IPv4("10.0.0.2".to_string())];
let types = normalizer.summarize_variation_types(&first, &last);
assert_eq!(types, vec!["IP"]);
}
// Identical token lists report no variation.
#[test]
fn test_variation_types_same_tokens_no_variation() {
let normalizer = Normalizer::new(Config::default());
let tokens = vec![Token::IPv4("10.0.0.1".to_string())];
let types = normalizer.summarize_variation_types(&tokens, &tokens);
assert!(types.is_empty(), "Same tokens should produce no variation");
}
// With `essence_mode` on, differing timestamps are not reported.
#[test]
fn test_variation_types_essence_mode_skips_timestamps() {
let config = Config {
essence_mode: true,
..Config::default()
};
let normalizer = Normalizer::new(config);
let first = vec![Token::Timestamp("2025-01-01T00:00:00Z".to_string())];
let last = vec![Token::Timestamp("2025-01-02T00:00:00Z".to_string())];
let types = normalizer.summarize_variation_types(&first, &last);
assert!(
types.is_empty(),
"Essence mode should skip timestamp variations"
);
}
// With the default config, differing timestamps are reported.
#[test]
fn test_variation_types_non_essence_includes_timestamps() {
let normalizer = Normalizer::new(Config::default());
let first = vec![Token::Timestamp("2025-01-01T00:00:00Z".to_string())];
let last = vec![Token::Timestamp("2025-01-02T00:00:00Z".to_string())];
let types = normalizer.summarize_variation_types(&first, &last);
assert_eq!(types, vec!["timestamp"]);
}
// Several varying categories come back sorted by label.
#[test]
fn test_variation_types_multiple_types_sorted() {
let normalizer = Normalizer::new(Config::default());
let first = vec![
Token::IPv4("10.0.0.1".to_string()),
Token::Uuid("aaa".to_string()),
];
let last = vec![
Token::IPv4("10.0.0.2".to_string()),
Token::Uuid("bbb".to_string()),
];
let types = normalizer.summarize_variation_types(&first, &last);
assert_eq!(types, vec!["IP", "UUID"]);
}
// The network detector still runs, and finds IPs, when only `normalize_ips` is set.
#[test]
fn test_normalize_ips_only_flag() {
let config = Config {
normalize_ips: true,
normalize_ports: false,
normalize_fqdns: false,
..Config::default()
};
let normalizer = Normalizer::new(config);
let line = normalizer
.normalize_line("connect to 10.0.0.1:8080".to_string())
.unwrap();
assert!(
line.tokens.iter().any(|t| matches!(t, Token::IPv4(_))),
"IPs should be detected"
);
}
// The network detector still runs, and finds ports, when only `normalize_ports` is set.
#[test]
fn test_normalize_ports_only_flag() {
let config = Config {
normalize_ips: false,
normalize_ports: true,
normalize_fqdns: false,
..Config::default()
};
let normalizer = Normalizer::new(config);
let line = normalizer
.normalize_line("connect to localhost:8080".to_string())
.unwrap();
assert!(
line.tokens.iter().any(|t| matches!(t, Token::Port(_))),
"Ports should be detected"
);
}
// With `normalize_json` off, a brace-containing line yields no Json token.
#[test]
fn normalize_line_json_disabled_no_detection() {
let config = Config {
normalize_json: false,
..Config::default()
};
let n = Normalizer::new(config);
let line = n.normalize_line(r"&Event{Type: Warning}".into()).unwrap();
assert!(
!line.tokens.iter().any(|t| matches!(t, Token::Json(_))),
"JSON detection should be disabled"
);
}
// Input containing `{` but no `=` must still produce a StructuredMessage token.
#[test]
fn structured_detection_brace_only() {
let n = Normalizer::new(Config::default());
let line = n
.normalize_line(r#"{"level":"error","component":"web","msg":"fail"}"#.into())
.unwrap();
assert!(
line.tokens
.iter()
.any(|t| matches!(t, Token::StructuredMessage { .. })),
"Brace-only input should trigger structured detection: {:?}",
line.tokens
);
}
// Input containing `=` but no `{` must produce a StructuredMessage or KeyValuePair token.
#[test]
fn structured_detection_equals_only() {
let n = Normalizer::new(Config::default());
let line = n
.normalize_line("level=error component=web msg=fail".into())
.unwrap();
assert!(
line.tokens.iter().any(|t| matches!(
t,
Token::StructuredMessage { .. } | Token::KeyValuePair { .. }
)),
"Equals-only input should trigger structured or KV detection: {:?}",
line.tokens
);
}
// With `normalize_timestamps` off, a time-of-day prefix yields no Timestamp token.
#[test]
fn normalize_timestamps_disabled_with_colon_input() {
let config = Config {
normalize_timestamps: false,
..Config::default()
};
let normalizer = Normalizer::new(config);
let line = normalizer
.normalize_line("10:15:30 Error occurred".to_string())
.unwrap();
assert!(
!line.tokens.iter().any(|t| matches!(t, Token::Timestamp(_))),
"Timestamps should NOT be detected when normalize_timestamps=false"
);
}
// With `normalize_emails` off, an address in the line yields no Email token.
#[test]
fn normalize_emails_disabled_no_detection() {
let config = Config {
normalize_emails: false,
..Config::default()
};
let n = Normalizer::new(config);
let line = n
.normalize_line("user test@example.com logged in".into())
.unwrap();
assert!(
!line.tokens.iter().any(|t| matches!(t, Token::Email(_))),
"Emails should NOT be detected when normalize_emails=false: {:?}",
line.tokens
);
}
// NOTE(review): this test has no assertion; it only checks that normalizing a
// single-quoted string returns `Ok` without panicking.
#[test]
fn quoted_detection_single_quote_only() {
let n = Normalizer::new(Config::default());
let line = n
.normalize_line("mount 'very-long-volume-name-that-exceeds-threshold-ok' done".into())
.unwrap();
let _ = line;
}
// NOTE(review): despite its name, this test never disables JSON; it calls `JsonDetector`
// directly and checks that the detector itself recognises the `&Event{...}` input.
#[test]
fn normalize_json_disabled_with_brace_input() {
let (_, direct_tokens) =
crate::patterns::json::JsonDetector::detect_and_replace("&Event{Type: Warning}");
assert!(
direct_tokens.iter().any(|t| matches!(t, Token::Json(_))),
"JsonDetector should detect Event objects: {direct_tokens:?}"
);
}
// Helper for the on/off tests below: starts from `Config::default()`, lets the
// caller adjust it, then normalizes `input` with a fresh `Normalizer`.
fn run(config_mut: impl FnOnce(&mut Config), input: &str) -> LogLine {
let mut config = Config::default();
config_mut(&mut config);
Normalizer::new(config)
.normalize_line(input.to_string())
.unwrap()
}
// Kubernetes tokens appear with the default config and disappear with `normalize_kubernetes` off.
#[test]
fn normalize_kubernetes_disabled_suppresses_k8s_tokens() {
let input = "volume \"kube-api-access-abc123\" (projected) failed to mount for pod kube-system/test-pod";
let on = run(|_| {}, input);
let off = run(|c| c.normalize_kubernetes = false, input);
assert!(
on.tokens.iter().any(|t| matches!(
t,
Token::KubernetesNamespace(_) | Token::PodName(_) | Token::VolumeName(_)
)),
"expected k8s token with detector ON, got {:?}",
on.tokens
);
assert!(
!off.tokens.iter().any(|t| matches!(
t,
Token::KubernetesNamespace(_) | Token::PodName(_) | Token::VolumeName(_)
)),
"expected NO k8s token with detector OFF, got {:?}",
off.tokens
);
}
// Name tokens appear with the default config and disappear with `normalize_names` off.
#[test]
fn normalize_names_disabled_suppresses_name_tokens() {
let input = "service api-deploy-abc123-x1y2 started";
let on = run(|_| {}, input);
let off = run(|c| c.normalize_names = false, input);
assert!(
on.tokens.iter().any(|t| matches!(t, Token::Name(_))),
"expected Name token with detector ON, got {:?}",
on.tokens
);
assert!(
!off.tokens.iter().any(|t| matches!(t, Token::Name(_))),
"expected NO Name token with detector OFF, got {:?}",
off.tokens
);
}
// QuotedString tokens appear with the default config and disappear with `normalize_quoted` off.
#[test]
fn normalize_quoted_disabled_suppresses_quoted_tokens() {
let input = "message \"some variable value here\" done";
let on = run(|_| {}, input);
let off = run(|c| c.normalize_quoted = false, input);
assert!(
on.tokens
.iter()
.any(|t| matches!(t, Token::QuotedString(_))),
"expected QuotedString token with detector ON, got {:?}",
on.tokens
);
assert!(
!off.tokens
.iter()
.any(|t| matches!(t, Token::QuotedString(_))),
"expected NO QuotedString token with detector OFF, got {:?}",
off.tokens
);
}
// BracketContext tokens appear with the default config and disappear with `normalize_brackets` off.
#[test]
fn normalize_brackets_disabled_suppresses_bracket_context_tokens() {
let input = "[error] [mod_jk] request failed";
let on = run(|_| {}, input);
let off = run(|c| c.normalize_brackets = false, input);
assert!(
on.tokens
.iter()
.any(|t| matches!(t, Token::BracketContext(_))),
"expected BracketContext token with detector ON, got {:?}",
on.tokens
);
assert!(
!off.tokens
.iter()
.any(|t| matches!(t, Token::BracketContext(_))),
"expected NO BracketContext token with detector OFF, got {:?}",
off.tokens
);
}
// KeyValuePair tokens appear with the default config and disappear with `normalize_key_value` off.
#[test]
fn normalize_key_value_disabled_suppresses_kv_tokens() {
let input = "level=error status=500 user_id=42";
let on = run(|_| {}, input);
let off = run(|c| c.normalize_key_value = false, input);
assert!(
on.tokens
.iter()
.any(|t| matches!(t, Token::KeyValuePair { .. })),
"expected KeyValuePair token with detector ON, got {:?}",
on.tokens
);
assert!(
!off.tokens
.iter()
.any(|t| matches!(t, Token::KeyValuePair { .. })),
"expected NO KeyValuePair token with detector OFF, got {:?}",
off.tokens
);
}
// HTTP status tokens appear with the default config and disappear with `normalize_http_status` off.
#[test]
fn normalize_http_status_disabled_suppresses_http_tokens() {
let input =
r#"127.0.0.1 - - [25/Dec/2023:10:15:30 +0000] "POST /api/login HTTP/1.1" 401 256"#;
let on = run(|_| {}, input);
let off = run(|c| c.normalize_http_status = false, input);
assert!(
on.tokens
.iter()
.any(|t| matches!(t, Token::HttpStatus(_) | Token::HttpStatusClass(_))),
"expected HTTP status token with detector ON, got {:?}",
on.tokens
);
assert!(
!off.tokens
.iter()
.any(|t| matches!(t, Token::HttpStatus(_) | Token::HttpStatusClass(_))),
"expected NO HTTP status token with detector OFF, got {:?}",
off.tokens
);
}
// Guards the length-ratio division: 3 vs 10 bytes with no common byte must still
// score above 20 via the ratio path.
#[test]
fn similarity_score_division_not_multiplication() {
let normalizer = Normalizer::new(Config::default());
let short = LogLine {
original: "xyz".into(),
normalized: "xyz".into(),
tokens: vec![],
hash: 0,
};
let long = LogLine {
original: "abcdefghij".into(),
normalized: "abcdefghij".into(),
tokens: vec![],
hash: 1,
};
let score = normalizer.similarity_score(&short, &long);
assert!(
score > 20.0,
"3/10 ratio should give 30.0, got {score} (if 0.0, division was mutated to *)"
);
}
}