use regex::Regex;
use std::sync::LazyLock;
use super::Token;
use super::{
duration::DurationDetector, hash::HashDetector, names::NameDetector, network::NetworkDetector,
path::PathDetector, process::ProcessDetector, timestamp::TimestampDetector, uuid::UuidDetector,
};
/// Matches one double-quoted string, quotes included.
///
/// Between the opening and closing `"` the pattern accepts any run of characters
/// that are neither `"` nor `\`, interleaved with backslash escape pairs (a `\`
/// followed by one character matched by `.`), so an escaped quote (`\"`) does not
/// terminate the match.
///
/// The regex is compiled lazily on first access; the `unwrap` can only fail if the
/// hard-coded pattern itself is invalid.
static QUOTED_STRING_PATTERN: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r#""(?:[^"\\]|\\.)*""#).unwrap());
/// Field-less marker type that namespaces the quoted-string normalization logic;
/// it is used through its associated function `detect_and_replace`.
pub struct QuotedStringDetector;
impl QuotedStringDetector {
pub fn detect_and_replace(text: &str) -> (String, Vec<Token>) {
if !text.contains('"') {
return (text.to_string(), Vec::new());
}
let mut result = text.to_string();
let mut tokens = Vec::new();
result = QUOTED_STRING_PATTERN
.replace_all(&result, |caps: ®ex::Captures| {
let quoted_string = caps.get(0).unwrap().as_str();
let quoted_content = "ed_string[1..quoted_string.len() - 1]; let mut normalized_content = quoted_content.to_string();
let (new_normalized, _) =
TimestampDetector::detect_and_replace(&normalized_content);
normalized_content = new_normalized;
let (new_normalized, _) = PathDetector::detect_and_replace(&normalized_content);
normalized_content = new_normalized;
let (new_normalized, _) = UuidDetector::detect_and_replace(&normalized_content);
normalized_content = new_normalized;
let (new_normalized, _) =
NetworkDetector::detect_and_replace(&normalized_content, true, true, true);
normalized_content = new_normalized;
let (new_normalized, _) = HashDetector::detect_and_replace(&normalized_content);
normalized_content = new_normalized;
let (new_normalized, _) = ProcessDetector::detect_and_replace(&normalized_content);
normalized_content = new_normalized;
let (new_normalized, _) = DurationDetector::detect_and_replace(&normalized_content);
normalized_content = new_normalized;
let (new_normalized, _) = NameDetector::detect_and_replace(&normalized_content);
normalized_content = new_normalized;
if quoted_content.contains('\\')
&& (quoted_content.contains(':')
|| quoted_content.contains('{')
|| quoted_content.contains('['))
{
tokens.push(Token::QuotedString(quoted_string.to_string()));
"<ESCAPED_JSON>".to_string()
} else if normalized_content != quoted_content {
tokens.push(Token::QuotedString(quoted_string.to_string()));
format!("\"{normalized_content}\"") } else {
if quoted_string.len() > 25 {
tokens.push(Token::QuotedString(quoted_string.to_string()));
"<QUOTED_STRING>".to_string()
} else {
quoted_string.to_string()
}
}
})
.to_string();
(result, tokens)
}
}
#[cfg(test)]
mod tests {
use super::*;
/// A kubelet-style mount log line: the short volume name is expected to stay
/// verbatim while the other three quoted strings are normalized and tokenized.
#[test]
fn test_mount_operation_quoted_strings() {
    let (output, found) = QuotedStringDetector::detect_and_replace(
        r#"operationExecutor.VerifyControllerAttachedVolume started for volume "csi-log" (UniqueName: "kubernetes.io/host-path/01af48d9-3471-4acf-93aa-689c01b31dff-csi-log") pod "csi-rbdplugin-sr56f" (UID: "01af48d9-3471-4acf-93aa-689c01b31dff")"#,
    );

    assert_eq!(found.len(), 3);

    let keeps_volume_name = output.contains("\"csi-log\"");
    assert!(keeps_volume_name, "csi-log should be kept as-is");

    let normalizes_pod_suffix = output.contains("csi-rbdplugin-<SUFFIX>");
    assert!(normalizes_pod_suffix, "pod name suffix should be normalized");

    let normalizes_uuid = output.contains("<UUID>");
    assert!(normalizes_uuid, "UUID should be normalized");
}
/// Short quoted keywords must pass through untouched and produce no tokens.
#[test]
fn test_exclude_keywords() {
    let line = r#"status "started" and "finished" operations"#;

    let (output, found) = QuotedStringDetector::detect_and_replace(line);

    assert_eq!(found.len(), 0);
    assert_eq!(output, line);
}
/// Quoted strings of one, two and twelve characters are all expected to be left
/// as written, with no tokens reported.
#[test]
fn test_exclude_short_strings() {
    let line = r#"value "x" and "ab" but "longer-value""#;

    let (output, found) = QuotedStringDetector::detect_and_replace(line);

    assert_eq!(found.len(), 0);
    assert_eq!(output, line);
}
/// Two messages that differ only in the duration value must normalize to the
/// same text, each reporting exactly one token.
#[test]
fn test_duration_normalization_in_quotes() {
    let [(first, first_tokens), (second, second_tokens)] = [
        r#"error "back-off 5m0s restarting failed""#,
        r#"error "back-off 10m0s restarting failed""#,
    ]
    .map(QuotedStringDetector::detect_and_replace);

    assert_eq!(first, second);
    assert_eq!(first, r#"error "back-off <DURATION> restarting failed""#);
    assert_eq!(first_tokens.len(), 1);
    assert_eq!(second_tokens.len(), 1);
}
/// For each pattern kind (IP, hash, UUID, number) two different concrete values
/// must collapse to the same normalized message and yield a single token.
#[test]
fn test_multiple_patterns_in_quotes() {
    // Each entry pairs two inputs with the one normalized form both must produce.
    let groups = [
        (
            [
                r#"error "connection to 192.168.1.1 failed""#,
                r#"error "connection to 10.0.0.1 failed""#,
            ],
            r#"error "connection to <IP> failed""#,
        ),
        (
            [
                r#"error "volume abc123def456 not found""#,
                r#"error "volume def987fed654321 not found""#,
            ],
            r#"error "volume <HASH> not found""#,
        ),
        (
            [
                r#"error "pod 550e8400-e29b-41d4-a716-446655440000 terminated""#,
                r#"error "pod 660f9511-f39c-52e5-b827-557766551111 terminated""#,
            ],
            r#"error "pod <UUID> terminated""#,
        ),
        (
            [
                r#"error "process 12345 crashed""#,
                r#"error "process 67890 crashed""#,
            ],
            r#"error "process <NUMBER> crashed""#,
        ),
    ];

    for (inputs, expected) in groups {
        for input in inputs {
            let (output, found) = QuotedStringDetector::detect_and_replace(input);
            assert_eq!(output, expected, "Failed for input: {input}");
            assert_eq!(
                found.len(),
                1,
                "Should detect exactly one quoted string token for: {input}"
            );
        }
    }
}
/// Three messages differing only in the IP address must all normalize to one
/// identical string, each with a single token.
#[test]
fn test_comprehensive_pattern_grouping() {
    let normalized_results: Vec<String> = [
        r#"error "connection to 192.168.1.1 failed""#,
        r#"error "connection to 10.0.0.1 failed""#,
        r#"error "connection to 172.16.0.1 failed""#,
    ]
    .into_iter()
    .map(|line| {
        let (output, found) = QuotedStringDetector::detect_and_replace(line);
        assert_eq!(found.len(), 1);
        output
    })
    .collect();

    // Neighbouring results must agree pairwise: [0] == [1], then [1] == [2].
    for pair in normalized_results.windows(2) {
        assert_eq!(pair[0], pair[1]);
    }
    assert_eq!(
        normalized_results[0],
        r#"error "connection to <IP> failed""#
    );
}
/// Timestamps in three different formats inside quotes are each expected to be
/// replaced by `<TIMESTAMP>`, with one token per input.
#[test]
fn test_timestamp_normalization_in_quotes() {
    let cases = [
        (
            r#"error "backup at 2025-01-20 10:15:30 failed""#,
            r#"error "backup at <TIMESTAMP> failed""#,
        ),
        (
            r#"error "event occurred on 2025-01-20T10:15:30Z""#,
            r#"error "event occurred on <TIMESTAMP>""#,
        ),
        (
            r#"error "log from Jan 20 10:15:30""#,
            r#"error "log from <TIMESTAMP>""#,
        ),
    ];

    for (input, expected) in cases {
        let (output, found) = QuotedStringDetector::detect_and_replace(input);
        assert_eq!(output, expected, "Failed for input: {input}");
        assert_eq!(found.len(), 1);
    }
}
/// File paths and a URL inside quotes are expected to become `<PATH>`; the
/// backslash-separated Windows path is expected to be replaced wholesale by
/// `<ESCAPED_JSON>`.
#[test]
fn test_path_normalization_in_quotes() {
    let cases = [
        (
            r#"error "file /var/log/app.log missing""#,
            r#"error "file <PATH> missing""#,
        ),
        (
            r#"error "cannot read /etc/config/settings.yaml""#,
            r#"error "cannot read <PATH>""#,
        ),
        (
            r#"error "http://192.168.1.1:8080/api/v1 unreachable""#,
            r#"error "<PATH> unreachable""#,
        ),
        (
            r#"error "path C:\Windows\System32\config invalid""#,
            "error <ESCAPED_JSON>",
        ),
    ];

    for (input, expected) in cases {
        let (output, found) = QuotedStringDetector::detect_and_replace(input);
        assert_eq!(output, expected, "Failed for input: {input}");
        assert_eq!(found.len(), 1);
    }
}
/// A backslash combined with `[` alone (no `:` or `{`) must be enough to trigger
/// the `<ESCAPED_JSON>` replacement.
#[test]
fn quoted_string_escaped_json_bracket_only() {
    let (result, tokens) =
        QuotedStringDetector::detect_and_replace(r#"data "value\[index\]more" done"#);

    let replaced = result.contains("<ESCAPED_JSON>");
    assert!(
        replaced,
        "backslash+bracket should trigger escaped JSON: {result}, tokens: {tokens:?}"
    );
}
/// A backslash combined with `:` alone (no `{` or `[`) must be enough to trigger
/// the `<ESCAPED_JSON>` replacement.
#[test]
fn quoted_string_escaped_json_colon_only() {
    let (result, tokens) =
        QuotedStringDetector::detect_and_replace(r#"data "key\:value\:end" done"#);

    let replaced = result.contains("<ESCAPED_JSON>");
    assert!(
        replaced,
        "backslash+colon should trigger escaped JSON: {result}, tokens: {tokens:?}"
    );
}
/// A backslash combined with `{` alone (no `:` or `[`) must be enough to trigger
/// the `<ESCAPED_JSON>` replacement.
#[test]
fn quoted_string_escaped_json_brace_only() {
    let (result, tokens) =
        QuotedStringDetector::detect_and_replace(r#"data "obj\{inner\}end" done"#);

    let replaced = result.contains("<ESCAPED_JSON>");
    assert!(
        replaced,
        "backslash+brace should trigger escaped JSON: {result}, tokens: {tokens:?}"
    );
}
/// Pins the length threshold for otherwise-unmodified quoted strings: 25 bytes
/// including the quotes is kept verbatim, 26 bytes is replaced by
/// `<QUOTED_STRING>`.
#[test]
fn quoted_string_long_unmodified_threshold() {
    // Wraps the content in quotes, adding two bytes to the quoted-string length.
    let wrap = |content: &str| format!(r#"x "{content}""#);

    // 23 content bytes + 2 quotes = 25: at the limit, so left alone.
    let at_limit = "it is a very simple tex";
    assert_eq!(at_limit.len(), 23);
    let at_limit_input = wrap(at_limit);
    let (result25, at_limit_tokens) = QuotedStringDetector::detect_and_replace(&at_limit_input);
    assert_eq!(
        at_limit_tokens.len(),
        0,
        "25-char quoted string should NOT produce token: {result25}"
    );
    assert_eq!(result25, at_limit_input);

    // 24 content bytes + 2 quotes = 26: one past the limit, so replaced.
    let over_limit = "it is a very simple text";
    assert_eq!(over_limit.len(), 24);
    let over_limit_input = wrap(over_limit);
    let (result26, over_limit_tokens) =
        QuotedStringDetector::detect_and_replace(&over_limit_input);
    assert_eq!(
        over_limit_tokens.len(),
        1,
        "26-char quoted string should produce token: {result26}"
    );
    let replaced = result26.contains("<QUOTED_STRING>");
    assert!(replaced, "should be replaced: {result26}");
}
/// Guards the order in which nested detectors run: a UUID must not be split
/// into hash placeholders, a URL must not have its IP extracted separately, and
/// a message mixing path, timestamp and UUID must normalize all three.
#[test]
fn test_order_critical_patterns() {
    // Only the rewritten text matters here; tokens are ignored.
    let normalize = |line: &str| QuotedStringDetector::detect_and_replace(line).0;

    let uuid_output = normalize(r#"error "pod 550e8400-e29b-41d4-a716-446655440000 terminated""#);
    assert_eq!(uuid_output, r#"error "pod <UUID> terminated""#);
    let uuid_fragmented = uuid_output.contains("<HASH>");
    assert!(!uuid_fragmented, "UUID should not be fragmented into hashes");

    let url_output = normalize(r#"error "endpoint http://192.168.1.1:8080/api/v1/users down""#);
    assert_eq!(url_output, r#"error "endpoint <PATH> down""#);
    let ip_extracted = url_output.contains("<IP>");
    assert!(!ip_extracted, "URL should be preserved as complete path");

    let mixed_output = normalize(
        r#"error "backup of /data/db at 2025-01-20 10:15:30 for pod 550e8400-e29b-41d4-a716-446655440000 failed""#,
    );
    assert_eq!(
        mixed_output,
        r#"error "backup of <PATH> at <TIMESTAMP> for pod <UUID> failed""#
    );
}
}