use super::{DetectionResult, TimestampMatch, TimestampRegistry, Token};
use std::sync::LazyLock;
/// Field-less type that groups the timestamp detection routines; everything it offers
/// is exposed as associated functions, so no instance is ever needed.
pub struct UnifiedTimestampDetector;
/// Shared pattern registry, built by `TimestampRegistry::new` the first time it is
/// dereferenced and reused for every later detection call.
static TIMESTAMP_REGISTRY: LazyLock<TimestampRegistry> = LazyLock::new(TimestampRegistry::new);
impl UnifiedTimestampDetector {
pub fn detect_and_replace(text: &str) -> (String, Vec<Token>) {
let result = Self::detect_with_metadata(text);
let tokens = result
.matches
.iter()
.map(|m| Token::Timestamp(m.original.clone()))
.collect();
(result.normalized_text, tokens)
}
pub fn detect_with_metadata(text: &str) -> DetectionResult {
if !Self::has_timestamp_indicators(text) {
return DetectionResult {
normalized_text: text.to_string(),
matches: Vec::new(),
};
}
let registry = &*TIMESTAMP_REGISTRY;
let patterns = registry.get_patterns();
let mut all_matches = Vec::new();
for pattern in patterns {
for regex_match in pattern.regex.find_iter(text) {
let timestamp_match = TimestampMatch {
original: regex_match.as_str().to_string(),
start_pos: regex_match.start(),
end_pos: regex_match.end(),
priority: pattern.priority.clone(),
};
all_matches.push(timestamp_match);
}
}
all_matches.sort_by_key(|m| m.start_pos);
let resolved_matches = Self::resolve_overlaps(all_matches);
let normalized_text = Self::apply_replacements(text, &resolved_matches);
DetectionResult {
normalized_text,
matches: resolved_matches,
}
}
/// Cheap pre-filter deciding whether `text` is worth running the timestamp regexes on.
///
/// Returns `true` only when the text contains a `:` and at least one of:
/// * a generic marker: `20`, `19`, `-`, `T` or `[`;
/// * a severity letter (`I`, `W`, `E`, `F`) directly followed by a two-digit month
///   `01`..=`12` (presumably the glog/klog header form such as `I0915`; confirm
///   against the registered patterns);
/// * an English three-letter month abbreviation (`Jan` .. `Dec`).
///
/// The severity/month check used to be hard-coded for months `09`..`12`, so lines
/// logged from January to August were rejected before any pattern could see them.
/// It now accepts every month, which is a strict superset of the previous behavior.
fn has_timestamp_indicators(text: &str) -> bool {
    const GENERIC_MARKERS: [&str; 5] = ["20", "19", "-", "T", "["];
    const MONTH_ABBREVIATIONS: [&str; 12] = [
        "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
    ];

    // Every accepted shape requires a colon somewhere in the text.
    if !text.contains(':') {
        return false;
    }

    // All bytes of interest are ASCII, so scanning raw bytes is equivalent to a
    // substring search and cannot match inside a multi-byte UTF-8 sequence.
    let has_severity_month_prefix = || {
        text.as_bytes().windows(3).any(|window| {
            matches!(
                window,
                [b'I' | b'W' | b'E' | b'F', b'0', b'1'..=b'9']
                    | [b'I' | b'W' | b'E' | b'F', b'1', b'0'..=b'2']
            )
        })
    };

    GENERIC_MARKERS.iter().any(|&marker| text.contains(marker))
        || has_severity_month_prefix()
        || MONTH_ABBREVIATIONS.iter().any(|&month| text.contains(month))
}
/// Reduces `matches` to a set of pairwise non-overlapping spans.
///
/// Candidates are visited in ascending order of `priority.effective_score()`; the
/// sort is stable, so candidates with equal scores are visited in their incoming
/// order. A candidate is kept unless its half-open byte range
/// `start_pos..end_pos` intersects a span that was already kept. Ranges that merely
/// touch (one ends where the other starts) do not count as overlapping.
///
/// The surviving matches are returned ordered by `start_pos`.
///
/// NOTE(review): because of the ascending order, the candidate with the *lower*
/// score wins a collision. Confirm that a lower `effective_score()` is meant to
/// denote the preferred pattern.
fn resolve_overlaps(mut matches: Vec<TimestampMatch>) -> Vec<TimestampMatch> {
    if matches.is_empty() {
        return matches;
    }

    matches.sort_by(|lhs, rhs| {
        let lhs_score = lhs.priority.effective_score();
        let rhs_score = rhs.priority.effective_score();
        lhs_score.cmp(&rhs_score)
    });

    let mut kept: Vec<TimestampMatch> = Vec::new();
    for candidate in matches {
        // The kept matches double as the record of byte ranges already claimed.
        let collides = kept.iter().any(|winner| {
            candidate.start_pos < winner.end_pos && candidate.end_pos > winner.start_pos
        });
        if collides {
            continue;
        }
        kept.push(candidate);
    }

    kept.sort_by_key(|winner| winner.start_pos);
    kept
}
fn apply_replacements(text: &str, matches: &[TimestampMatch]) -> String {
if matches.is_empty() {
return text.to_string();
}
let mut result = text.to_string();
let mut sorted_matches = matches.to_vec();
sorted_matches.sort_by_key(|m| std::cmp::Reverse(m.start_pos));
for timestamp_match in sorted_matches {
let range = timestamp_match.start_pos..timestamp_match.end_pos;
if range.end <= result.len() {
result.replace_range(range, "<TIMESTAMP>");
}
}
result
}
}