use super::{FormatFamily, PatternPriority, PatternSource, TimestampFormat, TimestampPattern};
use regex::Regex;
use std::collections::HashMap;
/// Collection of the timestamp-detection patterns known to this module.
///
/// Instances are built by [`TimestampRegistry::new`], which loads both
/// built-in pattern sets, merges them, recomputes their priorities and sorts
/// the result.
pub struct TimestampRegistry {
/// Registered patterns, sorted in ascending order of
/// `priority.effective_score()` by the constructor.
patterns: Vec<TimestampPattern>,
}
impl Default for TimestampRegistry {
fn default() -> Self {
Self::new()
}
}
impl TimestampRegistry {
pub fn new() -> Self {
let timestamp_patterns = Self::load_original_timestamp_patterns();
let essence_patterns = Self::load_original_essence_patterns();
let mut merged_patterns =
Self::merge_duplicate_patterns(timestamp_patterns, essence_patterns);
Self::assign_pattern_priorities(&mut merged_patterns);
merged_patterns.sort_by(|a, b| {
a.priority
.effective_score()
.cmp(&b.priority.effective_score())
});
TimestampRegistry {
patterns: merged_patterns,
}
}
fn load_original_timestamp_patterns() -> Vec<TimestampPattern> {
vec![
TimestampPattern {
regex: Regex::new(r"\b\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d{1,9})?(?:Z|[+-]\d{2}:?\d{2}?)\b").unwrap(),
format_type: TimestampFormat::ISO8601Enhanced,
priority: PatternPriority::new(100, FormatFamily::Structured),
source: PatternSource::OriginalTimestamp,
},
TimestampPattern {
regex: Regex::new(r"\b\d{4}-W\d{2}-\d(?:T\d{2}:\d{2}:\d{2}(?:\.\d{1,9})?(?:Z|[+-]\d{2}:?\d{2}?)?)?\b").unwrap(),
format_type: TimestampFormat::WeekDate,
priority: PatternPriority::new(90, FormatFamily::Structured),
source: PatternSource::OriginalTimestamp,
},
TimestampPattern {
regex: Regex::new(r"\b\d{4}-\d{3}(?:T\d{2}:\d{2}:\d{2}(?:\.\d{1,9})?(?:Z|[+-]\d{2}:?\d{2}?)?)?\b").unwrap(),
format_type: TimestampFormat::OrdinalDate,
priority: PatternPriority::new(90, FormatFamily::Structured),
source: PatternSource::OriginalTimestamp,
},
TimestampPattern {
regex: Regex::new(r"\b\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}:\d{2}(?:[.,]\d{1,9})?(?:\s*(?:UTC|GMT|[+-]\d{2}:?\d{2}?))?\b").unwrap(),
format_type: TimestampFormat::ISO8601Full,
priority: PatternPriority::new(85, FormatFamily::Structured),
source: PatternSource::OriginalTimestamp,
},
TimestampPattern {
regex: Regex::new(r"\b\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{1,9}\b").unwrap(),
format_type: TimestampFormat::JavaSimpleDate,
priority: PatternPriority::new(75, FormatFamily::Application),
source: PatternSource::OriginalTimestamp,
},
TimestampPattern {
regex: Regex::new(r"\b\d{4}-\d{2}-\d{2} \d{1,2}:\d{2}:\d{2}(?:\.\d{1,9})?\s*(?:AM|PM|am|pm)\b").unwrap(),
format_type: TimestampFormat::USDate,
priority: PatternPriority::new(70, FormatFamily::Regional),
source: PatternSource::OriginalTimestamp,
},
TimestampPattern {
regex: Regex::new(r"\b\d{6}\s+\d{2}:\d{2}:\d{2}\b").unwrap(),
format_type: TimestampFormat::MySQLTimestamp,
priority: PatternPriority::new(60, FormatFamily::Database),
source: PatternSource::OriginalTimestamp,
},
TimestampPattern {
regex: Regex::new(r"\b\d{2}-[A-Z]{3}-\d{2}\s+\d{2}\.\d{2}\.\d{2}(?:\.\d+)?\s*(?:AM|PM)?").unwrap(),
format_type: TimestampFormat::Oracle,
priority: PatternPriority::new(65, FormatFamily::Database),
source: PatternSource::OriginalTimestamp,
},
TimestampPattern {
regex: Regex::new(r"\b20\d{12}\b").unwrap(),
format_type: TimestampFormat::CompactFormat,
priority: PatternPriority::new(50, FormatFamily::Legacy),
source: PatternSource::OriginalTimestamp,
},
TimestampPattern {
regex: Regex::new(r"\[\d{2}/[A-Z][a-z]{2}/\d{4}:\d{2}:\d{2}:\d{2}\s+[+-]\d{4}\]").unwrap(),
format_type: TimestampFormat::ApacheCommon,
priority: PatternPriority::new(80, FormatFamily::Application),
source: PatternSource::OriginalTimestamp,
},
TimestampPattern {
regex: Regex::new(r"\b\d{2}/[A-Z][a-z]{2}/\d{4}:\d{2}:\d{2}:\d{2}\b").unwrap(),
format_type: TimestampFormat::NginxAccess,
priority: PatternPriority::new(75, FormatFamily::Application),
source: PatternSource::OriginalTimestamp,
},
TimestampPattern {
regex: Regex::new(r"\b\d{1,2}/\d{1,2}/\d{4}\s+\d{1,2}:\d{2}:\d{2}(?:\.\d{1,9})?(?:\s*(?:AM|PM))?\b").unwrap(),
format_type: TimestampFormat::USDate,
priority: PatternPriority::new(60, FormatFamily::Regional),
source: PatternSource::OriginalTimestamp,
},
TimestampPattern {
regex: Regex::new(r"\b\d{1,2}-\d{1,2}-\d{4}\s+\d{2}:\d{2}:\d{2}(?:\.\d{1,9})?\b").unwrap(),
format_type: TimestampFormat::USDateDash,
priority: PatternPriority::new(60, FormatFamily::Regional),
source: PatternSource::OriginalTimestamp,
},
TimestampPattern {
regex: Regex::new(r"\b[0-3]\d/[01]\d/\d{4}\s+\d{2}:\d{2}:\d{2}(?:\.\d{1,9})?\b").unwrap(),
format_type: TimestampFormat::EuropeanDate,
priority: PatternPriority::new(60, FormatFamily::Regional),
source: PatternSource::OriginalTimestamp,
},
TimestampPattern {
regex: Regex::new(r"\b[0-3]\d\.[01]\d\.\d{4}\s+\d{2}:\d{2}:\d{2}(?:\.\d{1,9})?\b").unwrap(),
format_type: TimestampFormat::EuropeanDateDot,
priority: PatternPriority::new(60, FormatFamily::Regional),
source: PatternSource::OriginalTimestamp,
},
TimestampPattern {
regex: Regex::new(r"\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}(?:\.\d{1,9})?\b").unwrap(),
format_type: TimestampFormat::SyslogBSD,
priority: PatternPriority::new(55, FormatFamily::Legacy),
source: PatternSource::OriginalTimestamp,
},
TimestampPattern {
regex: Regex::new(r"\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2}\s+\d{4}\s+\d{2}:\d{2}:\d{2}(?:\.\d{1,9})?\b").unwrap(),
format_type: TimestampFormat::SyslogWithYear,
priority: PatternPriority::new(60, FormatFamily::Legacy),
source: PatternSource::OriginalTimestamp,
},
TimestampPattern {
regex: Regex::new(r"[IWEF]\d{4}\s+\d{2}:\d{2}:\d{2}\.\d+").unwrap(),
format_type: TimestampFormat::KubernetesLog,
priority: PatternPriority::new(85, FormatFamily::Application),
source: PatternSource::OriginalTimestamp,
},
TimestampPattern {
regex: Regex::new(r"\b1[0-9]{9,10}(?:\.\d{1,9})?\b").unwrap(),
format_type: TimestampFormat::UnixTimestamp,
priority: PatternPriority::new(10, FormatFamily::Unix),
source: PatternSource::OriginalTimestamp,
},
TimestampPattern {
regex: Regex::new(r"@1[0-9]{9,10}(?:\.\d{1,9})?\b").unwrap(),
format_type: TimestampFormat::UnixPrefixed,
priority: PatternPriority::new(20, FormatFamily::Unix),
source: PatternSource::OriginalTimestamp,
},
TimestampPattern {
regex: Regex::new(r"\[1[0-9]{9,10}(?:\.\d{1,9})?\]").unwrap(),
format_type: TimestampFormat::UnixBracketed,
priority: PatternPriority::new(25, FormatFamily::Unix),
source: PatternSource::OriginalTimestamp,
},
TimestampPattern {
regex: Regex::new(r"\b\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{9}Z\b").unwrap(),
format_type: TimestampFormat::DockerLog,
priority: PatternPriority::new(95, FormatFamily::Application),
source: PatternSource::OriginalTimestamp,
},
TimestampPattern {
regex: Regex::new(r"\b\d{2}:\d{2}:\d{2}(?:\.\d{1,9})?(?:Z|[+-]\d{2}:?\d{2}?)?\b").unwrap(),
format_type: TimestampFormat::TimeOnly,
priority: PatternPriority::new(30, FormatFamily::Legacy),
source: PatternSource::OriginalTimestamp,
},
TimestampPattern {
regex: Regex::new(r"\bP(?:\d+Y)?(?:\d+M)?(?:\d+D)?(?:T(?:\d+H)?(?:\d+M)?(?:\d+(?:\.\d+)?S)?)?\b").unwrap(),
format_type: TimestampFormat::Duration,
priority: PatternPriority::new(35, FormatFamily::Legacy),
source: PatternSource::OriginalTimestamp,
},
TimestampPattern {
regex: Regex::new(r"\b\d{2}\.\d{3}\s+\d{2}:\d{2}:\d{2}\b").unwrap(),
format_type: TimestampFormat::IBMFormat,
priority: PatternPriority::new(45, FormatFamily::Legacy),
source: PatternSource::OriginalTimestamp,
},
TimestampPattern {
regex: Regex::new(r"\b(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun),\s+\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{4}\s+\d{2}:\d{2}:\d{2}\s+[+-]\d{4}\b").unwrap(),
format_type: TimestampFormat::RFC2822,
priority: PatternPriority::new(85, FormatFamily::Structured),
source: PatternSource::OriginalTimestamp,
},
TimestampPattern {
regex: Regex::new(r"\b\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}\s+\[").unwrap(),
format_type: TimestampFormat::Log4j,
priority: PatternPriority::new(70, FormatFamily::Application),
source: PatternSource::OriginalTimestamp,
},
TimestampPattern {
regex: Regex::new(r"\b\d{2}/\d{2}/\d{4} \d{2}:\d{2}:\d{2}\.\d{1,6}\b").unwrap(),
format_type: TimestampFormat::Splunk,
priority: PatternPriority::new(65, FormatFamily::Application),
source: PatternSource::OriginalTimestamp,
},
]
}
fn load_original_essence_patterns() -> Vec<TimestampPattern> {
vec![
TimestampPattern {
regex: Regex::new(r"\d{1,2}/\d{1,2}/\d{4}\s+\d{1,2}:\d{2}:\d{2}\s+(?:AM|PM)")
.unwrap(),
format_type: TimestampFormat::WindowsEvent,
priority: PatternPriority::new(55, FormatFamily::Regional),
source: PatternSource::OriginalEssence,
},
TimestampPattern {
regex: Regex::new(r"\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}").unwrap(),
format_type: TimestampFormat::WindowsIIS,
priority: PatternPriority::new(50, FormatFamily::Regional),
source: PatternSource::OriginalEssence,
},
TimestampPattern {
regex: Regex::new(r"\w{3}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}\s+\d{4}").unwrap(),
format_type: TimestampFormat::GitCommit,
priority: PatternPriority::new(55, FormatFamily::Legacy),
source: PatternSource::OriginalEssence,
},
TimestampPattern {
regex: Regex::new(r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z").unwrap(),
format_type: TimestampFormat::Aws,
priority: PatternPriority::new(85, FormatFamily::Application),
source: PatternSource::OriginalEssence,
},
TimestampPattern {
regex: Regex::new(r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{6}Z").unwrap(),
format_type: TimestampFormat::Gcp,
priority: PatternPriority::new(85, FormatFamily::Application),
source: PatternSource::OriginalEssence,
},
TimestampPattern {
regex: Regex::new(r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{7}Z").unwrap(),
format_type: TimestampFormat::Azure,
priority: PatternPriority::new(85, FormatFamily::Application),
source: PatternSource::OriginalEssence,
},
TimestampPattern {
regex: Regex::new(r"\w{3}\s+\w{3}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}\s+\d{4}").unwrap(),
format_type: TimestampFormat::Ansic,
priority: PatternPriority::new(50, FormatFamily::Legacy),
source: PatternSource::OriginalEssence,
},
TimestampPattern {
regex: Regex::new(r"\w{3},\s+\d{2}\s+\w{3}\s+\d{4}\s+\d{2}:\d{2}:\d{2}\s+GMT")
.unwrap(),
format_type: TimestampFormat::RFC822,
priority: PatternPriority::new(80, FormatFamily::Structured),
source: PatternSource::OriginalEssence,
},
TimestampPattern {
regex: Regex::new(r"\b\d{13}\b").unwrap(),
format_type: TimestampFormat::UnixTimestampMs,
priority: PatternPriority::new(15, FormatFamily::Unix),
source: PatternSource::OriginalEssence,
},
TimestampPattern {
regex: Regex::new(r"\b\d{19}\b").unwrap(),
format_type: TimestampFormat::UnixTimestampNs,
priority: PatternPriority::new(18, FormatFamily::Unix),
source: PatternSource::OriginalEssence,
},
]
}
/// Combines both pattern sets, keeping one entry per distinct regex string.
///
/// * A timestamp pattern whose regex repeats an earlier timestamp pattern
///   replaces that earlier entry (it keeps the earlier entry's position).
/// * An essence pattern whose regex is already present is dropped and the
///   existing entry's `source` becomes `PatternSource::Merged`.
/// * Any other essence pattern is appended.
///
/// The result is in first-seen order. Returning `HashMap::into_values()`
/// directly would yield an arbitrary, per-process order, and because the
/// caller's sort is stable that would make the relative order of
/// equal-priority patterns differ from run to run.
fn merge_duplicate_patterns(
    timestamp_patterns: Vec<TimestampPattern>,
    essence_patterns: Vec<TimestampPattern>,
) -> Vec<TimestampPattern> {
    use std::collections::hash_map::Entry;

    let capacity = timestamp_patterns.len() + essence_patterns.len();
    let mut merged_patterns: Vec<TimestampPattern> = Vec::with_capacity(capacity);
    // Maps a regex string to the index of its entry in `merged_patterns`.
    let mut index_by_regex: HashMap<String, usize> = HashMap::with_capacity(capacity);

    for pattern in timestamp_patterns {
        match index_by_regex.entry(pattern.regex.as_str().to_string()) {
            // Later duplicate wins, matching the previous `insert` semantics.
            Entry::Occupied(slot) => merged_patterns[*slot.get()] = pattern,
            Entry::Vacant(slot) => {
                slot.insert(merged_patterns.len());
                merged_patterns.push(pattern);
            }
        }
    }

    for essence_pattern in essence_patterns {
        match index_by_regex.entry(essence_pattern.regex.as_str().to_string()) {
            Entry::Occupied(slot) => {
                merged_patterns[*slot.get()].source = PatternSource::Merged;
            }
            Entry::Vacant(slot) => {
                slot.insert(merged_patterns.len());
                merged_patterns.push(essence_pattern);
            }
        }
    }

    merged_patterns
}
/// Overwrites the priority of every pattern with one rebuilt from its
/// `format_type`'s `specificity_score()` and `format_family()`.
fn assign_pattern_priorities(patterns: &mut [TimestampPattern]) {
    patterns.iter_mut().for_each(|entry| {
        let score = entry.format_type.specificity_score();
        let family = entry.format_type.format_family();
        entry.priority = PatternPriority::new(score, family);
    });
}
/// Borrows the registered patterns in their stored (sorted) order.
pub fn get_patterns(&self) -> &[TimestampPattern] {
    self.patterns.as_slice()
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Every registered pattern ends up with a non-zero specificity score.
    #[test]
    fn assign_priorities_sets_nonzero() {
        let registry = TimestampRegistry::new();
        for pattern in registry.get_patterns() {
            assert_ne!(
                pattern.priority.specificity_score, 0,
                "Pattern {:?} should have nonzero specificity",
                pattern.format_type
            );
        }
    }

    /// At least one format that only the essence set defines is present.
    #[test]
    fn essence_patterns_loaded() {
        let registry = TimestampRegistry::new();
        let has_essence_pattern = registry.get_patterns().iter().any(|p| {
            matches!(
                p.format_type,
                TimestampFormat::WindowsEvent
                    | TimestampFormat::GitCommit
                    | TimestampFormat::Aws
                    | TimestampFormat::Gcp
                    | TimestampFormat::Azure
            )
        });
        assert!(
            has_essence_pattern,
            "Registry should contain essence-specific patterns"
        );
    }

    /// The stored specificity is the one derived from the format type, not
    /// the placeholder written in the pattern tables.
    #[test]
    fn assign_priorities_actually_changes_priorities() {
        let registry = TimestampRegistry::new();
        for pattern in registry.get_patterns() {
            let expected_specificity = pattern.format_type.specificity_score();
            assert_eq!(
                pattern.priority.specificity_score, expected_specificity,
                "Pattern {:?} should have specificity {} from assign_pattern_priorities, got {}",
                pattern.format_type, expected_specificity, pattern.priority.specificity_score
            );
        }
    }

    /// Plain Unix timestamps must rank behind ISO 8601 timestamps.
    ///
    /// Both lookups use `expect` so that a missing pattern fails the test
    /// instead of silently skipping the comparison.
    #[test]
    fn assign_priorities_unix_lowest() {
        let registry = TimestampRegistry::new();

        let unix_pattern = registry
            .get_patterns()
            .iter()
            .find(|p| matches!(p.format_type, TimestampFormat::UnixTimestamp))
            .expect("registry should contain a UnixTimestamp pattern");
        let structured_pattern = registry
            .get_patterns()
            .iter()
            .find(|p| matches!(p.format_type, TimestampFormat::ISO8601Full))
            .expect("registry should contain an ISO8601Full pattern");

        assert!(
            unix_pattern.priority.effective_score() > structured_pattern.priority.effective_score(),
            "Unix should have lower priority (higher score) than ISO8601"
        );
    }
}