Skip to main content

zeph_tools/filter/
log_dedup.rs

1use std::collections::HashMap;
2use std::fmt::Write;
3use std::sync::LazyLock;
4
5use regex::Regex;
6
7use super::{
8    CommandMatcher, FilterConfidence, FilterResult, LogDedupFilterConfig, OutputFilter, make_result,
9};
10
/// Upper bound on the number of distinct normalized patterns tracked per
/// filter invocation; lines introducing further patterns are not recorded.
const MAX_UNIQUE_PATTERNS: usize = 10_000;
12
13static LOG_DEDUP_MATCHER: LazyLock<CommandMatcher> = LazyLock::new(|| {
14    CommandMatcher::Custom(Box::new(|cmd| {
15        let c = cmd.to_lowercase();
16        c.contains("journalctl")
17            || c.contains("tail -f")
18            || c.contains("docker logs")
19            || (c.contains("cat ") && c.contains(".log"))
20    }))
21});
22
23static TIMESTAMP_RE: LazyLock<Regex> = LazyLock::new(|| {
24    Regex::new(r"\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}([.\d]*)?([Z+-][\d:]*)?").unwrap()
25});
26static UUID_RE: LazyLock<Regex> = LazyLock::new(|| {
27    Regex::new(r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}").unwrap()
28});
29static IP_RE: LazyLock<Regex> =
30    LazyLock::new(|| Regex::new(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}").unwrap());
31static PORT_PID_RE: LazyLock<Regex> =
32    LazyLock::new(|| Regex::new(r"(?:port|pid|PID)[=: ]+\d+").unwrap());
33
/// Output filter that collapses log lines differing only in volatile fields
/// into one representative line per pattern.
///
/// The type carries no state, so it is cheap to copy.
#[derive(Debug, Clone, Copy)]
pub struct LogDedupFilter;
35
36impl LogDedupFilter {
37    #[must_use]
38    pub fn new(_config: LogDedupFilterConfig) -> Self {
39        Self
40    }
41}
42
43impl OutputFilter for LogDedupFilter {
44    fn name(&self) -> &'static str {
45        "log_dedup"
46    }
47
48    fn matcher(&self) -> &CommandMatcher {
49        &LOG_DEDUP_MATCHER
50    }
51
52    fn filter(&self, _command: &str, raw_output: &str, _exit_code: i32) -> FilterResult {
53        let lines: Vec<&str> = raw_output.lines().collect();
54        if lines.len() < 3 {
55            return make_result(
56                raw_output,
57                raw_output.to_owned(),
58                FilterConfidence::Fallback,
59            );
60        }
61
62        let mut pattern_counts: HashMap<String, (usize, String)> = HashMap::new();
63        let mut order: Vec<String> = Vec::new();
64
65        let mut capped = false;
66        for line in &lines {
67            let normalized = normalize(line);
68            if let Some(entry) = pattern_counts.get_mut(&normalized) {
69                entry.0 += 1;
70            } else if pattern_counts.len() < MAX_UNIQUE_PATTERNS {
71                order.push(normalized.clone());
72                pattern_counts.insert(normalized, (1, (*line).to_owned()));
73            } else {
74                capped = true;
75            }
76        }
77
78        let unique = order.len();
79        let total = lines.len();
80
81        if unique == total && !capped {
82            return make_result(
83                raw_output,
84                raw_output.to_owned(),
85                FilterConfidence::Fallback,
86            );
87        }
88
89        let mut output = String::new();
90        for key in &order {
91            let (count, example) = &pattern_counts[key];
92            if *count > 1 {
93                let _ = writeln!(output, "{example} (x{count})");
94            } else {
95                let _ = writeln!(output, "{example}");
96            }
97        }
98        let _ = write!(output, "{unique} unique patterns ({total} total lines)");
99        if capped {
100            let _ = write!(output, " (capped at {MAX_UNIQUE_PATTERNS})");
101        }
102
103        make_result(raw_output, output, FilterConfidence::Full)
104    }
105}
106
107fn normalize(line: &str) -> String {
108    let s = TIMESTAMP_RE.replace_all(line, "<TS>");
109    let s = UUID_RE.replace_all(&s, "<UUID>");
110    let s = IP_RE.replace_all(&s, "<IP>");
111    PORT_PID_RE.replace_all(&s, "<N>").to_string()
112}
113
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the filter under test from the default configuration.
    fn make_filter() -> LogDedupFilter {
        LogDedupFilter::new(LogDedupFilterConfig::default())
    }

    /// Log-style commands are matched; unrelated commands are not.
    #[test]
    fn matches_log_commands() {
        let filter = make_filter();
        let matcher = filter.matcher();

        let accepted = [
            "journalctl -u nginx",
            "tail -f /var/log/syslog",
            "docker logs -f container",
            "cat /var/log/app.log",
        ];
        for cmd in accepted {
            assert!(matcher.matches(cmd));
        }

        let rejected = ["cat file.txt", "cargo build"];
        for cmd in rejected {
            assert!(!matcher.matches(cmd));
        }
    }

    /// Lines differing only in timestamp or address collapse into counted groups.
    #[test]
    fn filter_deduplicates() {
        let filter = make_filter();
        let raw = "\
2024-01-15T12:00:01Z INFO request handled path=/api/health
2024-01-15T12:00:02Z INFO request handled path=/api/health
2024-01-15T12:00:03Z INFO request handled path=/api/health
2024-01-15T12:00:04Z WARN connection timeout addr=10.0.0.1
2024-01-15T12:00:05Z WARN connection timeout addr=10.0.0.2
2024-01-15T12:00:06Z ERROR database unreachable
";
        let filtered = filter.filter("journalctl -u app", raw, 0);
        let text = &filtered.output;
        assert!(text.contains("(x3)"));
        assert!(text.contains("(x2)"));
        assert!(text.contains("3 unique patterns (6 total lines)"));
        assert!(filtered.savings_pct() > 20.0);
        assert_eq!(filtered.confidence, FilterConfidence::Full);
    }

    /// Input without repeated patterns is returned untouched.
    #[test]
    fn filter_all_unique_passthrough() {
        let filter = make_filter();
        let raw = "line one\nline two\nline three";
        let filtered = filter.filter("cat app.log", raw, 0);
        assert_eq!(filtered.output, raw);
        assert_eq!(filtered.confidence, FilterConfidence::Fallback);
    }

    /// Input shorter than three lines is returned untouched.
    #[test]
    fn filter_short_passthrough() {
        let filter = make_filter();
        let raw = "single line";
        let filtered = filter.filter("cat app.log", raw, 0);
        assert_eq!(filtered.output, raw);
        assert_eq!(filtered.confidence, FilterConfidence::Fallback);
    }

    /// Every placeholder kind appears when its field is present.
    #[test]
    fn normalize_replaces_patterns() {
        let line = "2024-01-15T12:00:00Z req=abc12345-1234-1234-1234-123456789012 addr=192.168.1.1 pid=1234";
        let normalized = normalize(line);
        for placeholder in ["<TS>", "<UUID>", "<IP>", "<N>"] {
            assert!(normalized.contains(placeholder));
        }
    }
}