zeph_tools/filter/
log_dedup.rs1use std::collections::HashMap;
2use std::fmt::Write;
3use std::sync::LazyLock;
4
5use regex::Regex;
6
7use super::{
8 CommandMatcher, FilterConfidence, FilterResult, LogDedupFilterConfig, OutputFilter, make_result,
9};
10
/// Upper bound on the number of distinct normalized patterns tracked during a
/// single filter pass. Once this many patterns are known, a line that would
/// introduce a further pattern is not recorded and the result is flagged as
/// capped.
const MAX_UNIQUE_PATTERNS: usize = 10_000;
12
13static LOG_DEDUP_MATCHER: LazyLock<CommandMatcher> = LazyLock::new(|| {
14 CommandMatcher::Custom(Box::new(|cmd| {
15 let c = cmd.to_lowercase();
16 c.contains("journalctl")
17 || c.contains("tail -f")
18 || c.contains("docker logs")
19 || (c.contains("cat ") && c.contains(".log"))
20 }))
21});
22
23static TIMESTAMP_RE: LazyLock<Regex> = LazyLock::new(|| {
24 Regex::new(r"\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}([.\d]*)?([Z+-][\d:]*)?").unwrap()
25});
26static UUID_RE: LazyLock<Regex> = LazyLock::new(|| {
27 Regex::new(r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}").unwrap()
28});
29static IP_RE: LazyLock<Regex> =
30 LazyLock::new(|| Regex::new(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}").unwrap());
31static PORT_PID_RE: LazyLock<Regex> =
32 LazyLock::new(|| Regex::new(r"(?:port|pid|PID)[=: ]+\d+").unwrap());
33
34pub struct LogDedupFilter;
35
36impl LogDedupFilter {
37 #[must_use]
38 pub fn new(_config: LogDedupFilterConfig) -> Self {
39 Self
40 }
41}
42
43impl OutputFilter for LogDedupFilter {
44 fn name(&self) -> &'static str {
45 "log_dedup"
46 }
47
48 fn matcher(&self) -> &CommandMatcher {
49 &LOG_DEDUP_MATCHER
50 }
51
52 fn filter(&self, _command: &str, raw_output: &str, _exit_code: i32) -> FilterResult {
53 let lines: Vec<&str> = raw_output.lines().collect();
54 if lines.len() < 3 {
55 return make_result(
56 raw_output,
57 raw_output.to_owned(),
58 FilterConfidence::Fallback,
59 );
60 }
61
62 let mut pattern_counts: HashMap<String, (usize, String)> = HashMap::new();
63 let mut order: Vec<String> = Vec::new();
64
65 let mut capped = false;
66 for line in &lines {
67 let normalized = normalize(line);
68 if let Some(entry) = pattern_counts.get_mut(&normalized) {
69 entry.0 += 1;
70 } else if pattern_counts.len() < MAX_UNIQUE_PATTERNS {
71 order.push(normalized.clone());
72 pattern_counts.insert(normalized, (1, (*line).to_owned()));
73 } else {
74 capped = true;
75 }
76 }
77
78 let unique = order.len();
79 let total = lines.len();
80
81 if unique == total && !capped {
82 return make_result(
83 raw_output,
84 raw_output.to_owned(),
85 FilterConfidence::Fallback,
86 );
87 }
88
89 let mut output = String::new();
90 for key in &order {
91 let (count, example) = &pattern_counts[key];
92 if *count > 1 {
93 let _ = writeln!(output, "{example} (x{count})");
94 } else {
95 let _ = writeln!(output, "{example}");
96 }
97 }
98 let _ = write!(output, "{unique} unique patterns ({total} total lines)");
99 if capped {
100 let _ = write!(output, " (capped at {MAX_UNIQUE_PATTERNS})");
101 }
102
103 make_result(raw_output, output, FilterConfidence::Full)
104 }
105}
106
107fn normalize(line: &str) -> String {
108 let s = TIMESTAMP_RE.replace_all(line, "<TS>");
109 let s = UUID_RE.replace_all(&s, "<UUID>");
110 let s = IP_RE.replace_all(&s, "<IP>");
111 PORT_PID_RE.replace_all(&s, "<N>").to_string()
112}
113
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the filter under test from the default configuration.
    fn make_filter() -> LogDedupFilter {
        let config = LogDedupFilterConfig::default();
        LogDedupFilter::new(config)
    }

    /// Asserts that `raw` comes back untouched with fallback confidence.
    fn assert_passthrough(raw: &str) {
        let result = make_filter().filter("cat app.log", raw, 0);
        assert_eq!(result.output, raw);
        assert_eq!(result.confidence, FilterConfidence::Fallback);
    }

    #[test]
    fn matches_log_commands() {
        let filter = make_filter();

        let accepted = [
            "journalctl -u nginx",
            "tail -f /var/log/syslog",
            "docker logs -f container",
            "cat /var/log/app.log",
        ];
        for cmd in accepted {
            assert!(filter.matcher().matches(cmd));
        }

        let rejected = ["cat file.txt", "cargo build"];
        for cmd in rejected {
            assert!(!filter.matcher().matches(cmd));
        }
    }

    #[test]
    fn filter_deduplicates() {
        // Six lines, three distinct patterns once timestamps and IPs are masked.
        let raw = concat!(
            "2024-01-15T12:00:01Z INFO request handled path=/api/health\n",
            "2024-01-15T12:00:02Z INFO request handled path=/api/health\n",
            "2024-01-15T12:00:03Z INFO request handled path=/api/health\n",
            "2024-01-15T12:00:04Z WARN connection timeout addr=10.0.0.1\n",
            "2024-01-15T12:00:05Z WARN connection timeout addr=10.0.0.2\n",
            "2024-01-15T12:00:06Z ERROR database unreachable\n",
        );

        let result = make_filter().filter("journalctl -u app", raw, 0);

        for fragment in ["(x3)", "(x2)", "3 unique patterns (6 total lines)"] {
            assert!(result.output.contains(fragment));
        }
        assert!(result.savings_pct() > 20.0);
        assert_eq!(result.confidence, FilterConfidence::Full);
    }

    #[test]
    fn filter_all_unique_passthrough() {
        assert_passthrough("line one\nline two\nline three");
    }

    #[test]
    fn filter_short_passthrough() {
        assert_passthrough("single line");
    }

    #[test]
    fn normalize_replaces_patterns() {
        let line = "2024-01-15T12:00:00Z req=abc12345-1234-1234-1234-123456789012 addr=192.168.1.1 pid=1234";
        let normalized = normalize(line);
        for placeholder in ["<TS>", "<UUID>", "<IP>", "<N>"] {
            assert!(normalized.contains(placeholder));
        }
    }
}