use super::Token;
use regex::{Captures, Regex};
use std::sync::LazyLock;
/// Matches a bare decimal number: digits, a dot, then digits, delimited by word boundaries.
static DECIMAL_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\b\d+\.\d+\b").unwrap());
/// Matches a standalone run of three or more digits; one- and two-digit numbers are left alone.
static INTEGER_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\b\d{3,}\b").unwrap());
/// Matches a duration written with a unit suffix. Three unquoted shapes are accepted between
/// word boundaries: hours with optional minutes and seconds (`1h30m15s`), minutes with optional
/// seconds (`2m30s`), and a single number followed by `ms`, `μs`, `ns` or `s` (`1.234s`).
/// The final alternative matches a double-quoted value made of digits, `h`, `m` and `.` that
/// ends in `s` (e.g. `"3.488101038s"`); the surrounding quotes are part of the match.
static DURATION_WITH_UNIT_REGEX: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r#"\b(?:\d+h(?:\d+m)?(?:\d+(?:\.\d+)?s)?|\d+m(?:\d+(?:\.\d+)?s)?|\d+(?:\.\d+)?(?:ms|μs|ns|s))\b|"[0-9h]*[0-9m]*[0-9.]+s""#).unwrap()
});
/// Matches a `key=value` pair whose key ends in `duration`/`Duration` and whose value is an
/// unquoted decimal (e.g. `podStartSLOduration=3.488101038`). The key is part of the match.
static K8S_DURATION_FIELD_REGEX: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"\w*[Dd]uration=\d+\.\d+").unwrap());
/// Matches a number (optionally with a fractional part), optional whitespace, and a size unit:
/// `byte`, `bytes`, `B`, `KB`, `MB`, `GB` or `TB`.
static SIZE_REGEX: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"\b\d+(?:\.\d+)?\s*(?:bytes?|[KMGT]?B)\b").unwrap());
/// Matches a `0x`-prefixed hexadecimal literal such as `0x7fff5fbff8c0`.
static MEMORY_ADDR_REGEX: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"\b0x[a-fA-F0-9]+\b").unwrap());
/// Matches a number (optionally with a fractional part) immediately followed by `%`.
static PERCENTAGE_REGEX: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"\b\d+(?:\.\d+)?%").unwrap());
/// Matches, case-insensitively, one of the keywords `status`, `error`, `code`, `return` or
/// `returned`, followed by whitespace and a three-digit number in the range 100-599.
/// Capture group 1 holds only the number; the keyword and whitespace are in the overall match.
static HTTP_STATUS_REGEX: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"(?i)(?:status|error|code|returned?)\s+([1-5][0-9][0-9])\b").unwrap()
});
pub struct DurationDetector;
impl DurationDetector {
pub fn detect_and_replace(text: &str) -> (String, Vec<Token>) {
if !text.contains('.')
&& !text.contains('%')
&& !text.contains("ms")
&& !text.contains('s')
&& !text.contains('m')
&& !text.contains('h')
&& !text.contains("bytes")
&& !text.contains("KB")
&& !text.contains("MB")
{
return (text.to_string(), Vec::new());
}
let mut result = text.to_string();
let mut tokens = Vec::new();
for cap in K8S_DURATION_FIELD_REGEX.find_iter(text) {
let duration_str = cap.as_str();
tokens.push(Token::Duration(duration_str.to_string()));
}
result = K8S_DURATION_FIELD_REGEX
.replace_all(&result, "<DURATION_FIELD>")
.to_string();
for cap in DURATION_WITH_UNIT_REGEX.find_iter(&result) {
let duration_str = cap.as_str();
tokens.push(Token::Duration(duration_str.to_string()));
}
result = DURATION_WITH_UNIT_REGEX
.replace_all(&result, "<DURATION>")
.to_string();
for cap in MEMORY_ADDR_REGEX.find_iter(&result) {
let addr_str = cap.as_str();
tokens.push(Token::Number(addr_str.to_string()));
}
result = MEMORY_ADDR_REGEX.replace_all(&result, "<ADDR>").to_string();
for cap in SIZE_REGEX.find_iter(&result) {
let size_str = cap.as_str();
tokens.push(Token::Size(size_str.to_string()));
}
result = SIZE_REGEX.replace_all(&result, "<SIZE>").to_string();
for cap in HTTP_STATUS_REGEX.captures_iter(&result) {
if let Some(status_match) = cap.get(1) {
let status_str = status_match.as_str();
if let Ok(status_code) = status_str.parse::<u16>() {
tokens.push(Token::HttpStatus(status_code));
}
}
}
result = HTTP_STATUS_REGEX
.replace_all(&result, |caps: &Captures| {
let context = &caps[0][..caps[0].len() - caps[1].len()]; format!("{context}<HTTP_STATUS>")
})
.to_string();
for cap in PERCENTAGE_REGEX.find_iter(&result) {
let pct_str = cap.as_str();
tokens.push(Token::Number(pct_str.to_string()));
}
result = PERCENTAGE_REGEX.replace_all(&result, "<PCT>").to_string();
for cap in DECIMAL_REGEX.find_iter(&result) {
let decimal_str = cap.as_str();
tokens.push(Token::Duration(decimal_str.to_string()));
}
result = DECIMAL_REGEX.replace_all(&result, "<DECIMAL>").to_string();
for cap in INTEGER_REGEX.find_iter(&result) {
let int_str = cap.as_str();
tokens.push(Token::Number(int_str.to_string()));
}
result = INTEGER_REGEX.replace_all(&result, "<NUMBER>").to_string();
(result, tokens)
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the detector on `input`, prints the transformation, and checks that the output
    /// equals `expected`. When `require_tokens` is set, at least one token must be produced.
    fn check_case(input: &str, expected: &str, require_tokens: bool) {
        let (result, tokens) = DurationDetector::detect_and_replace(input);
        println!("Input: {input} -> Output: {result}");
        assert_eq!(result, expected, "Failed for input: {input}");
        assert!(
            !require_tokens || !tokens.is_empty(),
            "No tokens detected for: {input}"
        );
    }

    /// An unquoted Kubernetes `...duration=<decimal>` field is replaced as a whole.
    #[test]
    fn test_k8s_duration_detection() {
        let text = r#"podStartSLOduration=3.488101038 podStartE2EDuration="3.488101038s""#;
        let (result, tokens) = DurationDetector::detect_and_replace(text);
        println!("Input: {text}");
        println!("Output: {result}");
        println!("Tokens: {tokens:?}");
        assert!(result.contains("<DURATION_FIELD>"));
        assert!(!tokens.is_empty());
    }

    /// Bare decimals become `<DECIMAL>` and yield one token each.
    #[test]
    fn test_decimal_detection() {
        let (result, tokens) =
            DurationDetector::detect_and_replace("value=123.456789 and another=999.111");
        assert_eq!(result, "value=<DECIMAL> and another=<DECIMAL>");
        assert_eq!(tokens.len(), 2);
    }

    /// Hexadecimal addresses become `<ADDR>`.
    #[test]
    fn test_memory_address() {
        let (result, _) =
            DurationDetector::detect_and_replace("pointer at 0x7fff5fbff8c0 in memory");
        assert_eq!(result, "pointer at <ADDR> in memory");
    }

    /// Several percentages in one line are each replaced with `<PCT>`.
    #[test]
    fn test_percentage() {
        let (result, _) = DurationDetector::detect_and_replace("CPU usage: 87.3% memory: 45.2%");
        assert_eq!(result, "CPU usage: <PCT> memory: <PCT>");
    }

    /// Durations with h/m/s/ms/μs/ns units become `<DURATION>`.
    #[test]
    fn test_duration_units() {
        let cases = [
            (
                "request took 1.234s to complete",
                "request took <DURATION> to complete",
            ),
            ("timeout after 523ms", "timeout after <DURATION>"),
            ("elapsed time: 2m30s", "elapsed time: <DURATION>"),
            ("uptime: 1h15m", "uptime: <DURATION>"),
            ("latency 45μs detected", "latency <DURATION> detected"),
            ("process ran for 3ns", "process ran for <DURATION>"),
            ("combined: 1h30m15s total", "combined: <DURATION> total"),
        ];
        for &(input, expected) in &cases {
            check_case(input, expected, true);
        }
    }

    /// Byte sizes become `<SIZE>`; the `2TB` line is expected to come back unchanged.
    #[test]
    fn test_memory_sizes() {
        let cases = [
            ("file size: 1234567 bytes", "file size: <SIZE>", true),
            (
                "allocated 1.2MB of memory",
                "allocated <SIZE> of memory",
                true,
            ),
            (
                "disk usage: 5.6GB available",
                "disk usage: <SIZE> available",
                true,
            ),
            ("buffer: 128KB allocated", "buffer: <SIZE> allocated", true),
            ("downloaded 2TB of data", "downloaded 2TB of data", false),
            ("cache: 512 B total", "cache: <SIZE> total", true),
            (
                "memory usage: 1234567 bytes and 5.6GB disk",
                "memory usage: <SIZE> and <SIZE> disk",
                true,
            ),
        ];
        for &(input, expected, expect_tokens) in &cases {
            check_case(input, expected, expect_tokens);
        }
    }

    /// Status codes after a keyword become `<HTTP_STATUS>`; two inputs are expected to come
    /// back unchanged.
    #[test]
    fn test_http_status_codes() {
        let cases = [
            (
                "POST /login returned 401 Unauthorized",
                "POST /login returned <HTTP_STATUS> Unauthorized",
                true,
            ),
            (
                "Error 404 not found on page",
                "Error 404 not found on page",
                false,
            ),
            (
                "Request completed with status 201",
                "Request completed with status <HTTP_STATUS>",
                true,
            ),
            (
                "HTTP status code 500 internal error",
                "HTTP status code <HTTP_STATUS> internal error",
                true,
            ),
            (
                "Error code 403 forbidden",
                "Error code 403 forbidden",
                false,
            ),
        ];
        for &(input, expected, expect_tokens) in &cases {
            check_case(input, expected, expect_tokens);
        }
    }

    /// Integer and fractional percentages in various positions become `<PCT>`.
    #[test]
    fn test_percentages() {
        let cases = [
            ("CPU usage: 87.3%", "CPU usage: <PCT>"),
            ("Memory at 45%", "Memory at <PCT>"),
            ("Disk full at 98%", "Disk full at <PCT>"),
            ("Progress: 12.5% complete", "Progress: <PCT> complete"),
            (
                "Stats: CPU: 45%, memory: 78%, disk: 92.1%",
                "Stats: CPU: <PCT>, memory: <PCT>, disk: <PCT>",
            ),
            ("Low usage: 3.14% only", "Low usage: <PCT> only"),
        ];
        for &(input, expected) in &cases {
            check_case(input, expected, true);
        }
    }
}