use super::Token;
use regex::Regex;
use std::sync::LazyLock;
/// General-purpose `key=value` / `key: value` matcher, compiled on first use.
///
/// Capture group 1 is the key: an ASCII letter followed by letters, digits, `_`, `.` or `-`.
/// Capture group 2 is the value: a run of characters that stops at whitespace, `,`, `;` or `|`.
/// The separator may be `=` or `:` with optional whitespace on either side.
///
/// NOTE(review): the optional unit suffix at the end of group 2 looks redundant, because the
/// preceding greedy character class already accepts every unit character — confirm before
/// relying on it.
static KEY_VALUE_REGEX: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"([a-zA-Z][a-zA-Z0-9_.-]*)\s*[=:]\s*([^\s,;|]+(?:%|ms|s|MB|GB|KB|bytes?)?)")
.unwrap()
});
/// Configuration-style `key=value` matcher, compiled on first use.
///
/// Only `=` is accepted as the separator (optionally surrounded by whitespace). Capture group 1
/// is the key, capture group 2 is the value, which runs up to the next whitespace, `,`, `;`
/// or `|`.
static CONFIG_KV_REGEX: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"([a-zA-Z][a-zA-Z0-9_.-]*)\s*=\s*([^\s,;|]+)").unwrap());
/// Metrics-style `key=value` matcher, compiled on first use.
///
/// Capture group 1 is the key. Capture group 2 is a numeric value: one or more digits, an
/// optional fractional part, and an optional unit out of `%`, `ms`, `s`, `MB`, `GB`, `KB`,
/// `rps` or `qps`. Only `=` is accepted as the separator.
static METRICS_KV_REGEX: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"([a-zA-Z][a-zA-Z0-9_.-]*)\s*=\s*(\d+(?:\.\d+)?(?:%|ms|s|MB|GB|KB|rps|qps)?)")
.unwrap()
});
/// JSON member matcher (`"key": value`), compiled on first use.
///
/// Capture group 1 is the key without its quotes. The value is one of:
/// a non-empty quoted string (capture group 2 holds the text between the quotes),
/// a non-negative integer or decimal number (capture group 3),
/// or one of the bare literals `true`, `false`, `null`, which are matched but not captured.
static JSON_KV_REGEX: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r#""([a-zA-Z][a-zA-Z0-9_.-]*)"\s*:\s*(?:"([^"]+)"|(\d+(?:\.\d+)?)|true|false|null)"#)
.unwrap()
});
/// Stateless entry point for key/value detection in a line of text.
///
/// The type has no fields; everything it offers is exposed as associated functions.
pub struct KeyValueDetector;
impl KeyValueDetector {
/// Replaces key/value pairs found in `text` with `<KEY_VALUE>` placeholders.
///
/// Returns the rewritten text together with one [`Token`] per replaced pair, in the order the
/// pairs were replaced. When the quick pre-filter rejects the line, the text is returned
/// unchanged with an empty token list.
///
/// The passes run in a fixed order — metrics, config, JSON, general — and each pass operates
/// on the output of the previous one.
pub fn detect_and_replace(text: &str) -> (String, Vec<Token>) {
    let mut tokens = Vec::new();
    let mut rewritten = text.to_string();

    if Self::has_key_value_indicators(text) {
        Self::apply_metrics_pattern(&mut rewritten, &mut tokens);
        Self::apply_config_pattern(&mut rewritten, &mut tokens);
        Self::apply_json_pattern(&mut rewritten, &mut tokens);
        Self::apply_general_pattern(&mut rewritten, &mut tokens);
    }

    (rewritten, tokens)
}
/// Cheap pre-filter deciding whether `text` is worth scanning for key/value pairs.
///
/// A line qualifies when it contains `=` or `:` and none of the exclusion markers below.
/// The markers are matched as plain, case-sensitive substrings anywhere in the line.
fn has_key_value_indicators(text: &str) -> bool {
    // Control-flow keywords, SQL and URL schemes that disqualify the whole line.
    const EXCLUDED_MARKERS: [&str; 7] = [
        "if ", "for ", "while ", "SELECT ", "http://", "https://", "ftp://",
    ];

    let has_separator = text.chars().any(|c| matches!(c, '=' | ':'));
    let is_excluded = EXCLUDED_MARKERS
        .iter()
        .any(|marker| text.contains(*marker));

    has_separator && !is_excluded
}
#[mutants::skip] fn apply_metrics_pattern(text: &mut String, tokens: &mut Vec<Token>) {
*text = METRICS_KV_REGEX
.replace_all(text, |caps: ®ex::Captures| {
let key = caps.get(1).unwrap().as_str();
let value = caps.get(2).unwrap().as_str();
if Self::is_metrics_context(text, caps.get(0).unwrap().start()) {
let value_type = Self::classify_value_type(value);
tokens.push(Token::KeyValuePair {
key: key.to_lowercase(),
value_type,
});
"<KEY_VALUE>".to_string()
} else {
caps.get(0).unwrap().as_str().to_string()
}
})
.to_string();
}
fn apply_config_pattern(text: &mut String, tokens: &mut Vec<Token>) {
*text = CONFIG_KV_REGEX
.replace_all(text, |caps: ®ex::Captures| {
let key = caps.get(1).unwrap().as_str();
let value = caps.get(2).unwrap().as_str();
if Self::is_config_context(text, caps.get(0).unwrap().start()) {
let value_type = Self::classify_value_type(value);
tokens.push(Token::KeyValuePair {
key: key.to_lowercase(),
value_type,
});
"<KEY_VALUE>".to_string()
} else {
caps.get(0).unwrap().as_str().to_string()
}
})
.to_string();
}
fn apply_json_pattern(text: &mut String, tokens: &mut Vec<Token>) {
*text = JSON_KV_REGEX
.replace_all(text, |caps: ®ex::Captures| {
let key = caps.get(1).unwrap().as_str();
let value = caps
.get(2)
.or_else(|| caps.get(3))
.map_or("null", |m| m.as_str());
if Self::is_logging_json(text) {
let value_type = Self::classify_value_type(value);
tokens.push(Token::KeyValuePair {
key: key.to_lowercase(),
value_type,
});
format!(r#""{key}": <KEY_VALUE>"#)
} else {
caps.get(0).unwrap().as_str().to_string()
}
})
.to_string();
}
fn apply_general_pattern(text: &mut String, tokens: &mut Vec<Token>) {
*text = KEY_VALUE_REGEX
.replace_all(text, |caps: ®ex::Captures| {
let key = caps.get(1).unwrap().as_str();
let value = caps.get(2).unwrap().as_str();
if Self::is_valid_key_value_context(key, value, text) {
let value_type = Self::classify_value_type(value);
tokens.push(Token::KeyValuePair {
key: key.to_lowercase(),
value_type,
});
"<KEY_VALUE>".to_string()
} else {
caps.get(0).unwrap().as_str().to_string()
}
})
.to_string();
}
/// Maps a raw value string to a coarse type label.
///
/// The checks run in this order and the first hit wins: `"percentage"`, `"duration"`,
/// `"size"`, `"rate"`, `"boolean"`, `"number"`, `"ip"`, `"url"`, and finally `"string"` as the
/// fallback. Unit suffixes are case-sensitive; boolean words are compared case-insensitively.
///
/// NOTE(review): a dotted IPv4 address consists only of digits and dots, so it is caught by
/// the number check and never reaches the IP branch. The empty string is classified as
/// `"number"` for the same reason (the digits-and-dots test is vacuously true).
fn classify_value_type(value: &str) -> String {
    let ends_with_any = |suffixes: &[&str]| suffixes.iter().any(|s| value.ends_with(*s));

    // A trailing `s` counts as seconds only when the character before it is a digit.
    let mut from_end = value.chars().rev();
    let is_seconds =
        from_end.next() == Some('s') && from_end.next().is_some_and(|c| c.is_ascii_digit());

    let lowered = value.to_lowercase();

    let label = if value.ends_with('%') {
        "percentage"
    } else if ends_with_any(&["ms", "us", "ns"]) || is_seconds {
        "duration"
    } else if ends_with_any(&["MB", "GB", "KB", "bytes", "byte"]) {
        "size"
    } else if ends_with_any(&["rps", "qps", "/s", "/min", "/hr"]) {
        "rate"
    } else if matches!(
        lowered.as_str(),
        "true" | "false" | "enabled" | "disabled" | "on" | "off"
    ) {
        "boolean"
    } else if value.bytes().all(|b| b.is_ascii_digit() || b == b'.') {
        "number"
    } else if Self::is_ip_address(value) {
        "ip"
    } else if ["http://", "https://", "ftp://"]
        .iter()
        .any(|scheme| value.starts_with(*scheme))
    {
        "url"
    } else {
        "string"
    };

    label.to_string()
}
/// Reports whether `text` mentions any metrics-related keyword.
///
/// The comparison is case-insensitive and matches plain substrings anywhere in the line.
/// `_position` is accepted for the caller's convenience but is not consulted.
fn is_metrics_context(text: &str, _position: usize) -> bool {
    const METRICS_KEYWORDS: &[&str] = &[
        "metrics",
        "stats",
        "performance",
        "monitor",
        "usage",
        "cpu",
        "memory",
        "disk",
        "network",
        "load",
        "throughput",
        "latency",
        "response_time",
        "error_rate",
        "success_rate",
    ];

    let haystack = text.to_lowercase();
    for keyword in METRICS_KEYWORDS {
        if haystack.contains(*keyword) {
            return true;
        }
    }
    false
}
/// Reports whether `text` mentions any configuration-related keyword.
///
/// The comparison is case-insensitive and matches plain substrings anywhere in the line.
/// `_position` is accepted for the caller's convenience but is not consulted.
fn is_config_context(text: &str, _position: usize) -> bool {
    const CONFIG_KEYWORDS: &[&str] = &[
        "config",
        "configuration",
        "settings",
        "params",
        "parameters",
        "options",
        "properties",
        "environment",
        "variables",
    ];

    let haystack = text.to_lowercase();
    for keyword in CONFIG_KEYWORDS {
        if haystack.contains(*keyword) {
            return true;
        }
    }
    false
}
/// Reports whether `text` contains any field name typical of a JSON log record.
///
/// The names are matched as plain substrings and, unlike the metrics and config checks, the
/// comparison is case-sensitive.
fn is_logging_json(text: &str) -> bool {
    const LOG_FIELD_NAMES: &[&str] = &[
        "level",
        "timestamp",
        "message",
        "msg",
        "component",
        "service",
        "logger",
        "severity",
        "time",
        "ts",
    ];

    for field in LOG_FIELD_NAMES {
        if text.contains(*field) {
            return true;
        }
    }
    false
}
/// Decides whether a `key`/`value` match found in `text` should be treated as a real
/// key/value pair.
///
/// The pair is rejected when the key is a control-flow keyword, when the line contains a
/// space-padded arithmetic operator, or when the line contains an SQL verb followed by a
/// space (checked case-insensitively). Otherwise it is accepted if the key is on the
/// allow-list or the pair matches a common configuration naming/unit pattern.
#[mutants::skip]
fn is_valid_key_value_context(key: &str, value: &str, text: &str) -> bool {
    const KNOWN_KEYS: &[&str] = &[
        "timeout",
        "retries",
        "max_connections",
        "port",
        "host",
        "ssl",
        "debug",
        "verbose",
        "level",
        "user_id",
        "session_id",
        "request_id",
        "attempt_count",
        "failure_rate",
        "success_rate",
        "response_time",
        "cpu_usage",
        "memory_usage",
        "disk_usage",
        "queue_size",
        "buffer_size",
    ];

    if matches!(key, "if" | "for" | "while" | "switch") {
        return false;
    }

    let looks_like_arithmetic = [" + ", " - ", " * ", " / "]
        .iter()
        .any(|operator| text.contains(*operator));
    if looks_like_arithmetic {
        return false;
    }

    // Upper-case once so the SQL verbs are matched regardless of the line's casing.
    let shouted = text.to_uppercase();
    let looks_like_sql = ["SELECT ", "INSERT ", "UPDATE ", "DELETE "]
        .iter()
        .any(|verb| shouted.contains(*verb));
    if looks_like_sql {
        return false;
    }

    KNOWN_KEYS.contains(&key) || Self::is_common_config_pattern(key, value)
}
/// Recognises key/value pairs that follow common configuration conventions.
///
/// A pair qualifies when the key carries one of the listed suffixes or prefixes, or when the
/// value ends in one of the listed units. All comparisons are case-sensitive.
fn is_common_config_pattern(key: &str, value: &str) -> bool {
    let key_has_suffix = ["_timeout", "_limit", "_size", "_count", "_rate", "_usage"]
        .iter()
        .any(|suffix| key.ends_with(*suffix));
    let key_has_prefix = ["max_", "min_"]
        .iter()
        .any(|prefix| key.starts_with(*prefix));
    let value_has_unit = ["ms", "%", "MB", "KB", "GB"]
        .iter()
        .any(|unit| value.ends_with(*unit));

    key_has_suffix || key_has_prefix || value_has_unit
}
/// Reports whether `value` is a syntactically valid IPv4 or IPv6 address.
///
/// Parsing is delegated to the standard library, so look-alikes that merely consist of
/// hex digits and colons (for example the clock time `12:30:45`, or a lone `:`) and octets
/// with a sign prefix (`+1.2.3.4`) are rejected, while compressed IPv6 forms such as `::1`
/// are accepted.
fn is_ip_address(value: &str) -> bool {
    value.parse::<std::net::IpAddr>().is_ok()
}
}
#[cfg(test)]
mod tests {
    use super::*;

    // Thin wrappers that keep the individual assertions short.

    /// Runs the full detection pipeline on `text`.
    fn detect(text: &str) -> (String, Vec<Token>) {
        KeyValueDetector::detect_and_replace(text)
    }

    /// Classifies a raw value string.
    fn classify(value: &str) -> String {
        KeyValueDetector::classify_value_type(value)
    }

    /// Checks the naming/unit heuristics for a key/value pair.
    fn config_like(key: &str, value: &str) -> bool {
        KeyValueDetector::is_common_config_pattern(key, value)
    }

    /// Checks whether a key/value pair is accepted within `text`.
    fn valid_ctx(key: &str, value: &str, text: &str) -> bool {
        KeyValueDetector::is_valid_key_value_context(key, value, text)
    }

    /// Checks whether `value` is recognised as an IP address.
    fn is_ip(value: &str) -> bool {
        KeyValueDetector::is_ip_address(value)
    }

    // End-to-end detection.

    #[test]
    fn test_metrics_detection() {
        let (result, tokens) = detect("Performance metrics: cpu=75%, memory=60%, disk=45%");
        assert!(!tokens.is_empty());
        assert!(result.contains("<KEY_VALUE>"));

        let has_percentage = tokens.iter().any(|token| {
            matches!(token, Token::KeyValuePair { value_type, .. } if value_type == "percentage")
        });
        assert!(has_percentage);
    }

    #[test]
    fn test_config_detection() {
        let (result, tokens) = detect("Database config: host=localhost, port=5432, ssl=true");
        assert!(tokens.len() >= 3);
        assert!(result.contains("<KEY_VALUE>"));
    }

    // Value classification, mixed cases.

    #[test]
    fn test_value_type_classification() {
        assert_eq!(classify("75%"), "percentage");
        assert_eq!(classify("250ms"), "duration");
        assert_eq!(classify("1GB"), "size");
        assert_eq!(classify("true"), "boolean");
        assert_eq!(classify("12345"), "number");
        // Digits and dots only, so the number check wins over the IP check.
        assert_eq!(classify("192.168.1.1"), "number");
    }

    #[test]
    fn classify_duration_single_digit_s() {
        assert_eq!(classify("5s"), "duration");
    }

    #[test]
    fn classify_boolean_off() {
        assert_eq!(classify("off"), "boolean");
    }

    #[test]
    fn classify_number_decimal() {
        assert_eq!(classify("10.5"), "number");
    }

    #[test]
    fn classify_empty_is_number() {
        assert_eq!(classify(""), "number");
    }

    #[test]
    fn classify_rate() {
        assert_eq!(classify("100rps"), "rate");
    }

    // Pre-filter.

    #[test]
    fn kv_indicators_equals() {
        assert!(KeyValueDetector::has_key_value_indicators("key=value"));
    }

    #[test]
    fn kv_indicators_no_equals_or_colon() {
        assert!(!KeyValueDetector::has_key_value_indicators("no kv here"));
    }

    #[test]
    fn kv_indicators_url_excluded() {
        let line = "visit https://example.com";
        assert!(!KeyValueDetector::has_key_value_indicators(line));
    }

    #[test]
    fn test_no_false_positives() {
        let non_kv_cases = [
            "if variable = value then",
            "for i = 1 to 10",
            "SELECT * FROM table WHERE id = 123",
            "Mathematical equation: x=y+z",
        ];
        for case in non_kv_cases {
            let (result, tokens) = detect(case);
            // Only lines carrying one of these markers are required to pass through untouched.
            let guarded = ["if ", "for ", "SELECT "]
                .iter()
                .any(|marker| case.contains(*marker));
            if guarded {
                assert_eq!(tokens.len(), 0);
                assert_eq!(result, case);
            }
        }
    }

    // Value classification, one label per test.

    #[test]
    fn classify_percentage() {
        assert_eq!(classify("75%"), "percentage");
    }

    #[test]
    fn classify_duration_ms() {
        assert_eq!(classify("100ms"), "duration");
    }

    #[test]
    fn classify_duration_us() {
        assert_eq!(classify("50us"), "duration");
    }

    #[test]
    fn classify_duration_ns() {
        assert_eq!(classify("200ns"), "duration");
    }

    #[test]
    fn classify_duration_s() {
        assert_eq!(classify("5s"), "duration");
    }

    #[test]
    fn classify_size_mb() {
        assert_eq!(classify("512MB"), "size");
    }

    #[test]
    fn classify_size_gb() {
        assert_eq!(classify("2GB"), "size");
    }

    #[test]
    fn classify_size_kb() {
        assert_eq!(classify("1024KB"), "size");
    }

    #[test]
    fn classify_size_bytes() {
        assert_eq!(classify("4096bytes"), "size");
    }

    #[test]
    fn classify_size_byte() {
        assert_eq!(classify("1byte"), "size");
    }

    #[test]
    fn classify_rate_rps() {
        assert_eq!(classify("1000rps"), "rate");
    }

    #[test]
    fn classify_rate_qps() {
        assert_eq!(classify("500qps"), "rate");
    }

    #[test]
    fn classify_rate_per_s() {
        assert_eq!(classify("100/s"), "rate");
    }

    #[test]
    fn classify_rate_per_min() {
        assert_eq!(classify("60/min"), "rate");
    }

    #[test]
    fn classify_rate_per_hr() {
        assert_eq!(classify("3600/hr"), "rate");
    }

    #[test]
    fn classify_bool_true() {
        assert_eq!(classify("true"), "boolean");
    }

    #[test]
    fn classify_bool_false() {
        assert_eq!(classify("false"), "boolean");
    }

    #[test]
    fn classify_bool_enabled() {
        assert_eq!(classify("enabled"), "boolean");
    }

    #[test]
    fn classify_bool_disabled() {
        assert_eq!(classify("disabled"), "boolean");
    }

    #[test]
    fn classify_bool_on() {
        assert_eq!(classify("on"), "boolean");
    }

    #[test]
    fn classify_bool_off() {
        assert_eq!(classify("off"), "boolean");
    }

    #[test]
    fn classify_number() {
        assert_eq!(classify("42.5"), "number");
    }

    #[test]
    fn classify_ip_via_is_ip_address() {
        assert!(is_ip("192.168.1.1"));
    }

    #[test]
    fn classify_url_http() {
        assert_eq!(classify("http://example.com"), "url");
    }

    #[test]
    fn classify_url_https() {
        assert_eq!(classify("https://example.com"), "url");
    }

    #[test]
    fn classify_url_ftp() {
        assert_eq!(classify("ftp://files.com"), "url");
    }

    #[test]
    fn classify_string_default() {
        assert_eq!(classify("hello"), "string");
    }

    // Common configuration naming/unit patterns.

    #[test]
    fn config_pattern_timeout_suffix() {
        assert!(config_like("read_timeout", "30s"));
    }

    #[test]
    fn config_pattern_limit_suffix() {
        assert!(config_like("connection_limit", "100"));
    }

    #[test]
    fn config_pattern_size_suffix() {
        assert!(config_like("buffer_size", "4096"));
    }

    #[test]
    fn config_pattern_count_suffix() {
        assert!(config_like("retry_count", "3"));
    }

    #[test]
    fn config_pattern_rate_suffix() {
        assert!(config_like("error_rate", "0.01"));
    }

    #[test]
    fn config_pattern_usage_suffix() {
        assert!(config_like("cpu_usage", "75%"));
    }

    #[test]
    fn config_pattern_max_prefix() {
        assert!(config_like("max_retries", "5"));
    }

    #[test]
    fn config_pattern_min_prefix() {
        assert!(config_like("min_connections", "1"));
    }

    #[test]
    fn config_pattern_value_ms() {
        assert!(config_like("delay", "100ms"));
    }

    #[test]
    fn config_pattern_value_pct() {
        assert!(config_like("threshold", "50%"));
    }

    #[test]
    fn config_pattern_value_mb() {
        assert!(config_like("heap", "512MB"));
    }

    #[test]
    fn config_pattern_value_kb() {
        assert!(config_like("page", "4KB"));
    }

    #[test]
    fn config_pattern_value_gb() {
        assert!(config_like("disk", "100GB"));
    }

    #[test]
    fn config_pattern_negative() {
        assert!(!config_like("foo", "bar"));
    }

    // Key/value context validation: rejections.

    #[test]
    fn kv_ctx_excludes_if() {
        assert!(!valid_ctx("if", "true", "if x=true then"));
    }

    #[test]
    fn kv_ctx_excludes_for() {
        assert!(!valid_ctx("for", "x", "for i=0; i<n"));
    }

    #[test]
    fn kv_ctx_excludes_while() {
        assert!(!valid_ctx("while", "x", "while x=true"));
    }

    #[test]
    fn kv_ctx_excludes_switch() {
        assert!(!valid_ctx("switch", "x", "switch x=val"));
    }

    #[test]
    fn kv_ctx_excludes_math_plus() {
        assert!(!valid_ctx("x", "1", "x + y = 1"));
    }

    #[test]
    fn kv_ctx_excludes_math_minus() {
        assert!(!valid_ctx("x", "1", "x - y = 1"));
    }

    #[test]
    fn kv_ctx_excludes_math_mul() {
        assert!(!valid_ctx("x", "1", "x * y = 1"));
    }

    #[test]
    fn kv_ctx_excludes_math_div() {
        assert!(!valid_ctx("x", "1", "x / y = 1"));
    }

    #[test]
    fn kv_ctx_excludes_select() {
        assert!(!valid_ctx("col", "v", "SELECT col FROM t"));
    }

    #[test]
    fn kv_ctx_excludes_insert() {
        assert!(!valid_ctx("col", "v", "INSERT INTO t"));
    }

    #[test]
    fn kv_ctx_excludes_update() {
        assert!(!valid_ctx("col", "v", "UPDATE t SET col=v"));
    }

    #[test]
    fn kv_ctx_excludes_delete_sql() {
        assert!(!valid_ctx("col", "v", "DELETE FROM t"));
    }

    // Key/value context validation: acceptances.

    #[test]
    fn kv_ctx_valid_key() {
        assert!(valid_ctx("timeout", "30", "timeout=30"));
    }

    #[test]
    fn kv_ctx_config_pattern() {
        assert!(valid_ctx("read_timeout", "100ms", "read_timeout=100ms"));
    }

    // IP address recognition.

    #[test]
    fn ip_addr_valid_ipv4() {
        assert!(is_ip("192.168.1.1"));
    }

    #[test]
    fn ip_addr_invalid_ipv4() {
        assert!(!is_ip("999.999.999.999"));
    }

    #[test]
    fn ip_addr_valid_ipv6() {
        assert!(is_ip("2001:db8:0:0:0:0:0:1"));
    }

    #[test]
    fn ip_addr_short() {
        assert!(!is_ip("abc"));
    }

    #[test]
    fn ip_addr_three_octets() {
        assert!(!is_ip("192.168.1"));
    }

    // Each pass must be able to change the text on its own.

    #[test]
    fn apply_metrics_pattern_modifies_text() {
        let input = "Performance metrics: cpu=75%";
        let (result, tokens) = detect(input);
        assert_ne!(result, input, "metrics pattern should modify text");
        assert!(!tokens.is_empty(), "metrics pattern should produce tokens");
    }

    #[test]
    fn apply_json_pattern_modifies_text() {
        let input = r#"{"level": "info", "message": "hello", "component": "web"}"#;
        let (result, tokens) = detect(input);
        assert_ne!(result, input, "JSON pattern should modify text");
        assert!(!tokens.is_empty(), "JSON pattern should produce tokens");
    }

    #[test]
    fn apply_general_pattern_modifies_text() {
        let input = "timeout=30";
        let (result, tokens) = detect(input);
        assert_ne!(result, input, "general pattern should modify text");
        assert!(!tokens.is_empty(), "general pattern should produce tokens");
    }

    #[test]
    fn config_context_returns_false_for_non_config() {
        let line = "just a plain log line with no keywords";
        assert!(!KeyValueDetector::is_config_context(line, 0));
    }

    // Rejections must come from the line itself even when the key is on the allow-list.

    #[test]
    fn kv_ctx_excludes_select_only() {
        assert!(!valid_ctx("timeout", "30", "SELECT id FROM users"));
    }

    #[test]
    fn kv_ctx_excludes_insert_only() {
        assert!(!valid_ctx("timeout", "30", "INSERT INTO users VALUES (1)"));
    }

    #[test]
    fn kv_ctx_excludes_update_only() {
        assert!(!valid_ctx("timeout", "30", "UPDATE users SET name='x'"));
    }

    #[test]
    fn kv_ctx_excludes_delete_only() {
        assert!(!valid_ctx("timeout", "30", "DELETE FROM users WHERE id=1"));
    }

    #[test]
    fn kv_ctx_excludes_math_plus_only() {
        assert!(!valid_ctx("timeout", "30", "a + b"));
    }

    #[test]
    fn kv_ctx_excludes_math_minus_only() {
        assert!(!valid_ctx("timeout", "30", "a - b"));
    }

    #[test]
    fn kv_ctx_excludes_math_mul_only() {
        assert!(!valid_ctx("timeout", "30", "a * b"));
    }

    #[test]
    fn kv_ctx_excludes_math_div_only() {
        assert!(!valid_ctx("timeout", "30", "a / b"));
    }

    #[test]
    fn kv_ctx_valid_key_from_list_timeout() {
        assert!(valid_ctx("timeout", "30", "setting timeout=30"));
    }

    #[test]
    fn kv_ctx_valid_via_config_pattern() {
        assert!(valid_ctx("read_timeout", "100ms", "read_timeout=100ms"));
    }

    #[test]
    fn kv_ctx_invalid_key_not_in_list_or_pattern() {
        assert!(!valid_ctx("foo", "bar", "foo=bar"));
    }

    // Line-level context checks.

    #[test]
    fn metrics_ctx_positive() {
        assert!(KeyValueDetector::is_metrics_context("cpu usage report", 0));
    }

    #[test]
    fn metrics_ctx_negative() {
        assert!(!KeyValueDetector::is_metrics_context("hello world", 0));
    }

    #[test]
    fn logging_json_positive() {
        let line = r#"{"level":"info","msg":"ok"}"#;
        assert!(KeyValueDetector::is_logging_json(line));
    }

    #[test]
    fn logging_json_negative() {
        assert!(!KeyValueDetector::is_logging_json("no json here"));
    }

    #[test]
    fn metrics_only_input() {
        let (result, tokens) = detect("System stats: cpu=85%");
        assert!(
            !tokens.is_empty(),
            "metrics-only input should produce tokens: {result}"
        );
        assert!(
            result.contains("<KEY_VALUE>"),
            "metrics pattern should modify text: {result}"
        );
    }

    #[test]
    fn kv_ctx_config_pattern_not_in_valid_keys() {
        assert!(valid_ctx("max_retries", "5", "max_retries=5"));
        assert!(!valid_ctx("max_retries", "5", "SELECT max_retries FROM t"));
    }

    #[test]
    fn kv_ctx_valid_key_not_config_pattern() {
        assert!(valid_ctx("timeout", "30", "timeout=30"));
    }
}