use super::{DetectionResult, TimestampMatch, TimestampRegistry, Token};
use std::sync::LazyLock;
/// Stateless namespace type for timestamp detection.
///
/// It carries no data; all functionality is exposed as associated functions
/// (for example `detect_and_replace` and `detect_with_metadata`).
pub struct UnifiedTimestampDetector;
/// Shared pattern registry, constructed by `TimestampRegistry::new` on first
/// access and reused by every subsequent detection call.
static TIMESTAMP_REGISTRY: LazyLock<TimestampRegistry> = LazyLock::new(TimestampRegistry::new);
impl UnifiedTimestampDetector {
pub fn detect_and_replace(text: &str) -> (String, Vec<Token>) {
let result = Self::detect_with_metadata(text);
let tokens = result
.matches
.iter()
.map(|m| Token::Timestamp(m.original.clone()))
.collect();
(result.normalized_text, tokens)
}
pub fn detect_with_metadata(text: &str) -> DetectionResult {
if !Self::has_timestamp_indicators(text) {
return DetectionResult {
normalized_text: text.to_string(),
matches: Vec::new(),
};
}
let registry = &*TIMESTAMP_REGISTRY;
let patterns = registry.get_patterns();
let mut all_matches = Vec::new();
for pattern in patterns {
for regex_match in pattern.regex.find_iter(text) {
let timestamp_match = TimestampMatch {
original: regex_match.as_str().to_string(),
start_pos: regex_match.start(),
end_pos: regex_match.end(),
priority: pattern.priority.clone(),
};
all_matches.push(timestamp_match);
}
}
all_matches.sort_by_key(|m| m.start_pos);
let resolved_matches = Self::resolve_overlaps(all_matches);
let normalized_text = Self::apply_replacements(text, &resolved_matches);
DetectionResult {
normalized_text,
matches: resolved_matches,
}
}
/// Cheap pre-filter that decides whether `text` is worth running through the
/// timestamp patterns at all.
///
/// Returns `true` only when `text` contains a `:` and at least one of the
/// substrings listed in `MARKERS`.
fn has_timestamp_indicators(text: &str) -> bool {
    // NOTE(review): the level-prefixed markers (presumably klog-style
    // `Lmmdd` prefixes) only cover 09 through 12; confirm whether 01-08 are
    // intentionally left out.
    const MARKERS: &[&str] = &[
        // Year prefixes and structural characters.
        "20", "19", "-", "T", "[",
        // Level letter followed by two digits.
        "I09", "W09", "E09", "F09",
        "I10", "W10", "E10", "F10",
        "I11", "W11", "E11", "F11",
        "I12", "W12", "E12", "F12",
        // Abbreviated English month names.
        "Jan", "Feb", "Mar", "Apr", "May", "Jun",
        "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
    ];

    if !text.contains(':') {
        return false;
    }

    MARKERS.iter().any(|&marker| text.contains(marker))
}
/// Reduces `matches` to a set of non-overlapping spans, letting the match
/// with the highest `priority.effective_score()` win each contested region.
///
/// Candidates are visited from highest to lowest score; a candidate is kept
/// only if it does not intersect a span that was already kept. Spans are
/// half-open, so a match ending exactly where another starts does not
/// overlap it. The sort is stable, so equally scored candidates are visited
/// in their input order.
///
/// The surviving matches are returned ordered by `start_pos`.
fn resolve_overlaps(mut matches: Vec<TimestampMatch>) -> Vec<TimestampMatch> {
    // Descending by score: the operands are deliberately `b` then `a`, so the
    // best-scoring candidate claims its span first.
    matches.sort_by(|a, b| {
        b.priority
            .effective_score()
            .cmp(&a.priority.effective_score())
    });

    let mut resolved: Vec<TimestampMatch> = Vec::with_capacity(matches.len());
    for candidate in matches {
        let overlaps = resolved.iter().any(|kept| {
            candidate.start_pos < kept.end_pos && candidate.end_pos > kept.start_pos
        });
        if !overlaps {
            resolved.push(candidate);
        }
    }

    resolved.sort_by_key(|m| m.start_pos);
    resolved
}
fn apply_replacements(text: &str, matches: &[TimestampMatch]) -> String {
if matches.is_empty() {
return text.to_string();
}
let mut result = text.to_string();
let mut sorted_matches = matches.to_vec();
sorted_matches.sort_by_key(|m| std::cmp::Reverse(m.start_pos));
for timestamp_match in sorted_matches {
let range = timestamp_match.start_pos..timestamp_match.end_pos;
if range.end <= result.len() {
result.replace_range(range, "<TIMESTAMP>");
}
}
result
}
}
#[cfg(test)]
mod tests {
    use super::super::priority::{FormatFamily, PatternPriority};
    use super::*;

    /// Shorthand for the indicator pre-check under test.
    fn flagged(text: &str) -> bool {
        UnifiedTimestampDetector::has_timestamp_indicators(text)
    }

    /// Shorthand for the overlap resolver under test.
    fn resolve(matches: Vec<TimestampMatch>) -> Vec<TimestampMatch> {
        UnifiedTimestampDetector::resolve_overlaps(matches)
    }

    /// Builds a match covering `start..end` with a structured-family priority
    /// of the given specificity and an empty `original` string.
    fn make_match(start: usize, end: usize, specificity: u32) -> TimestampMatch {
        let priority = PatternPriority::new(specificity, FormatFamily::Structured);
        TimestampMatch {
            original: String::new(),
            start_pos: start,
            end_pos: end,
            priority,
        }
    }

    // ---- indicator pre-check: general shapes ----

    #[test]
    fn has_timestamp_indicators_year_and_colon() {
        assert!(flagged("2024-01-01 10:00:00"));
    }

    #[test]
    fn has_timestamp_indicators_iso8601() {
        assert!(flagged("foo:barT"));
    }

    #[test]
    fn has_timestamp_indicators_month_name() {
        assert!(flagged("Jan 1 10:00:00"));
    }

    #[test]
    fn has_timestamp_indicators_k8s_level() {
        assert!(flagged("I1025 10:00:00.000"));
    }

    #[test]
    fn has_timestamp_indicators_no_colon_rejects() {
        assert!(!flagged("2024-01-01 no colon"));
    }

    #[test]
    fn has_timestamp_indicators_colon_but_no_date() {
        assert!(!flagged("foo:bar"));
    }

    #[test]
    fn has_timestamp_indicators_empty() {
        assert!(!flagged(""));
    }

    #[test]
    fn has_timestamp_indicators_bracket() {
        assert!(flagged("[10:00:00]"));
    }

    #[test]
    fn ts_ind_year_19() {
        assert!(flagged("1999-01:00"));
    }

    #[test]
    fn ts_ind_dash_only() {
        assert!(flagged("a-b:c"));
    }

    // ---- indicator pre-check: month abbreviations ----

    #[test]
    fn ts_ind_feb() {
        assert!(flagged("Feb 1 10:00"));
    }

    #[test]
    fn ts_ind_mar() {
        assert!(flagged("Mar 1 10:00"));
    }

    #[test]
    fn ts_ind_apr() {
        assert!(flagged("Apr 1 10:00"));
    }

    #[test]
    fn ts_ind_may() {
        assert!(flagged("May 1 10:00"));
    }

    #[test]
    fn ts_ind_jun() {
        assert!(flagged("Jun 1 10:00"));
    }

    #[test]
    fn ts_ind_jul() {
        assert!(flagged("Jul 1 10:00"));
    }

    #[test]
    fn ts_ind_aug() {
        assert!(flagged("Aug 1 10:00"));
    }

    #[test]
    fn ts_ind_sep() {
        assert!(flagged("Sep 1 10:00"));
    }

    #[test]
    fn ts_ind_oct() {
        assert!(flagged("Oct 1 10:00"));
    }

    #[test]
    fn ts_ind_nov() {
        assert!(flagged("Nov 1 10:00"));
    }

    #[test]
    fn ts_ind_dec() {
        assert!(flagged("Dec 1 10:00"));
    }

    // ---- indicator pre-check: level-letter prefixes ----

    #[test]
    fn ts_ind_w09() {
        assert!(flagged("W0929 10:00:00"));
    }

    #[test]
    fn ts_ind_e09() {
        assert!(flagged("E0929 10:00:00"));
    }

    #[test]
    fn ts_ind_f09() {
        assert!(flagged("F0929 10:00:00"));
    }

    #[test]
    fn ts_ind_i09() {
        assert!(flagged("I0929 10:00:00"));
    }

    #[test]
    fn ts_ind_i11() {
        assert!(flagged("I1129 10:00:00"));
    }

    #[test]
    fn ts_ind_w11() {
        assert!(flagged("W1129 10:00:00"));
    }

    #[test]
    fn ts_ind_e11() {
        assert!(flagged("E1129 10:00:00"));
    }

    #[test]
    fn ts_ind_f11() {
        assert!(flagged("F1129 10:00:00"));
    }

    #[test]
    fn ts_ind_i12() {
        assert!(flagged("I1229 10:00:00"));
    }

    #[test]
    fn ts_ind_w12() {
        assert!(flagged("W1229 10:00:00"));
    }

    #[test]
    fn ts_ind_e12() {
        assert!(flagged("E1229 10:00:00"));
    }

    #[test]
    fn ts_ind_f12() {
        assert!(flagged("F1229 10:00:00"));
    }

    #[test]
    fn ts_ind_w10() {
        assert!(flagged("W1029 10:00:00"));
    }

    #[test]
    fn ts_ind_e10() {
        assert!(flagged("E1029 10:00:00"));
    }

    #[test]
    fn ts_ind_f10() {
        assert!(flagged("F1029 10:00:00"));
    }

    // ---- overlap resolution ----

    #[test]
    fn resolve_overlaps_empty() {
        let result = resolve(vec![]);
        assert!(result.is_empty());
    }

    #[test]
    fn resolve_overlaps_no_overlap() {
        let result = resolve(vec![make_match(0, 10, 90), make_match(15, 25, 80)]);
        assert_eq!(result.len(), 2);
    }

    #[test]
    fn resolve_overlaps_overlap_higher_wins() {
        let result = resolve(vec![make_match(0, 20, 90), make_match(10, 30, 50)]);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].start_pos, 0);
        assert_eq!(result[0].end_pos, 20);
    }

    #[test]
    fn resolve_overlaps_adjacent_both_survive() {
        let result = resolve(vec![make_match(0, 10, 90), make_match(10, 20, 80)]);
        assert_eq!(result.len(), 2);
    }

    #[test]
    fn resolve_overlaps_end_equals_start_both_survive() {
        let result = resolve(vec![make_match(5, 15, 90), make_match(0, 5, 50)]);
        assert_eq!(result.len(), 2, "adjacent end==start should not overlap");
    }

    #[test]
    fn ts_ind_requires_colon() {
        assert!(!flagged("2024-01-01 no colon here"));
    }

    #[test]
    fn ts_ind_colon_with_year_20() {
        assert!(flagged("2024:00"));
    }

    #[test]
    fn resolve_overlaps_single_match() {
        let result = resolve(vec![make_match(5, 15, 90)]);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].start_pos, 5);
    }

    #[test]
    fn resolve_overlaps_three_overlapping() {
        let candidates = vec![
            make_match(0, 20, 90),
            make_match(5, 25, 50),
            make_match(10, 30, 30),
        ];
        let result = resolve(candidates);
        assert_eq!(result.len(), 1, "only highest priority should survive");
        assert_eq!(result[0].start_pos, 0);
    }
}