// armature_analytics/insights.rs

1//! Analytics insights and alerting
2
3use crate::{AnalyticsSnapshot, EndpointMetrics, LatencyMetrics, RequestMetrics};
4use chrono::{DateTime, Utc};
5use serde::{Deserialize, Serialize};
6
7/// Types of insights
/// Types of insights
///
/// Serialized in snake_case (e.g. `high_error_rate`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum InsightType {
    /// Snapshot-wide error rate crossed a configured threshold.
    HighErrorRate,
    /// Snapshot-wide P99 latency crossed a configured threshold.
    HighLatency,
    /// A significant share of requests is being rate limited.
    RateLimitPressure,
    /// Current request rate is well above the configured baseline.
    TrafficSpike,
    /// A single endpoint with high P99 latency.
    SlowEndpoint,
    /// A single endpoint with a high error rate.
    ErrorSpike,
}
18
/// Severity levels
///
/// Variant order matters: the derived `PartialOrd`/`Ord` makes
/// `Info < Warning < Critical`, so severities can be compared directly.
/// Serialized in lowercase (e.g. `"critical"`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum Severity {
    Info,
    Warning,
    Critical,
}
27
/// An analytics insight
///
/// Produced by [`InsightGenerator`]; describes one detected condition
/// together with the measured value and the threshold it crossed.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Insight {
    /// What kind of condition was detected.
    pub insight_type: InsightType,
    /// How urgent the condition is.
    pub severity: Severity,
    /// Short human-readable headline.
    pub title: String,
    /// Longer explanation including the measured numbers.
    pub description: String,
    /// The measured value that triggered the insight (0.0 until set).
    pub value: f64,
    /// The configured threshold that was crossed (0.0 until set).
    pub threshold: f64,
    /// When the insight was created (set in `Insight::new`).
    pub timestamp: DateTime<Utc>,
    /// `"METHOD path"` label for endpoint-scoped insights; omitted from
    /// serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub endpoint: Option<String>,
    /// Suggested remediation; omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub recommendation: Option<String>,
}
43
44impl Insight {
45    pub fn new(
46        insight_type: InsightType,
47        severity: Severity,
48        title: impl Into<String>,
49        description: impl Into<String>,
50    ) -> Self {
51        Self {
52            insight_type,
53            severity,
54            title: title.into(),
55            description: description.into(),
56            value: 0.0,
57            threshold: 0.0,
58            timestamp: Utc::now(),
59            endpoint: None,
60            recommendation: None,
61        }
62    }
63
64    pub fn with_value(mut self, value: f64) -> Self {
65        self.value = value;
66        self
67    }
68
69    pub fn with_threshold(mut self, threshold: f64) -> Self {
70        self.threshold = threshold;
71        self
72    }
73
74    pub fn with_endpoint(mut self, endpoint: impl Into<String>) -> Self {
75        self.endpoint = Some(endpoint.into());
76        self
77    }
78
79    pub fn with_recommendation(mut self, rec: impl Into<String>) -> Self {
80        self.recommendation = Some(rec.into());
81        self
82    }
83}
84
/// Configuration for insight detection
///
/// All rate-style fields are percentages in the 0-100 range; latencies
/// are in milliseconds.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct InsightConfig {
    /// Error rate threshold (percentage)
    pub error_rate_warning: f64,
    /// Error rate (percentage) at or above which the insight is Critical.
    pub error_rate_critical: f64,

    /// Latency thresholds (milliseconds)
    pub p99_latency_warning_ms: f64,
    /// P99 latency (ms) at or above which the insight is Critical.
    pub p99_latency_critical_ms: f64,

    /// Rate limit thresholds (percentage of requests limited)
    pub rate_limit_warning: f64,
    /// Limited-request percentage at or above which the insight is Critical.
    pub rate_limit_critical: f64,

    /// Traffic spike threshold (multiplier over average)
    pub traffic_spike_multiplier: f64,

    /// Minimum requests before generating insights
    pub min_requests: u64,
}
106
107impl Default for InsightConfig {
108    fn default() -> Self {
109        Self {
110            error_rate_warning: 1.0,    // 1% error rate
111            error_rate_critical: 5.0,   // 5% error rate
112            p99_latency_warning_ms: 500.0,
113            p99_latency_critical_ms: 2000.0,
114            rate_limit_warning: 10.0,   // 10% of requests limited
115            rate_limit_critical: 25.0,  // 25% of requests limited
116            traffic_spike_multiplier: 3.0,
117            min_requests: 100,
118        }
119    }
120}
121
122/// Insight generator
123pub struct InsightGenerator {
124    config: InsightConfig,
125    baseline_rps: Option<f64>,
126}
127
128impl InsightGenerator {
129    pub fn new(config: InsightConfig) -> Self {
130        Self {
131            config,
132            baseline_rps: None,
133        }
134    }
135
136    /// Set baseline RPS for traffic spike detection
137    pub fn set_baseline_rps(&mut self, rps: f64) {
138        self.baseline_rps = Some(rps);
139    }
140
141    /// Generate insights from analytics snapshot
142    pub fn generate(&self, snapshot: &AnalyticsSnapshot) -> Vec<Insight> {
143        let mut insights = Vec::new();
144
145        // Skip if not enough data
146        if snapshot.requests.total < self.config.min_requests {
147            return insights;
148        }
149
150        // Check error rate
151        if let Some(insight) = self.check_error_rate(&snapshot.requests) {
152            insights.push(insight);
153        }
154
155        // Check latency
156        if let Some(insight) = self.check_latency(&snapshot.latency) {
157            insights.push(insight);
158        }
159
160        // Check rate limits
161        if let Some(insight) = self.check_rate_limits(snapshot) {
162            insights.push(insight);
163        }
164
165        // Check traffic spike
166        if let Some(insight) = self.check_traffic_spike(snapshot) {
167            insights.push(insight);
168        }
169
170        // Check slow endpoints
171        for insight in self.check_slow_endpoints(&snapshot.endpoints) {
172            insights.push(insight);
173        }
174
175        insights
176    }
177
178    fn check_error_rate(&self, requests: &RequestMetrics) -> Option<Insight> {
179        let error_rate = requests.error_rate();
180
181        if error_rate >= self.config.error_rate_critical {
182            Some(
183                Insight::new(
184                    InsightType::HighErrorRate,
185                    Severity::Critical,
186                    "Critical Error Rate",
187                    format!("Error rate is {:.2}%, above critical threshold of {:.2}%",
188                        error_rate, self.config.error_rate_critical),
189                )
190                .with_value(error_rate)
191                .with_threshold(self.config.error_rate_critical)
192                .with_recommendation("Investigate error logs immediately. Check for deployment issues or upstream service failures."),
193            )
194        } else if error_rate >= self.config.error_rate_warning {
195            Some(
196                Insight::new(
197                    InsightType::HighErrorRate,
198                    Severity::Warning,
199                    "Elevated Error Rate",
200                    format!("Error rate is {:.2}%, above warning threshold of {:.2}%",
201                        error_rate, self.config.error_rate_warning),
202                )
203                .with_value(error_rate)
204                .with_threshold(self.config.error_rate_warning)
205                .with_recommendation("Review error logs and monitor for further increase."),
206            )
207        } else {
208            None
209        }
210    }
211
212    fn check_latency(&self, latency: &LatencyMetrics) -> Option<Insight> {
213        if latency.p99_ms >= self.config.p99_latency_critical_ms {
214            Some(
215                Insight::new(
216                    InsightType::HighLatency,
217                    Severity::Critical,
218                    "Critical Latency",
219                    format!("P99 latency is {:.0}ms, above critical threshold of {:.0}ms",
220                        latency.p99_ms, self.config.p99_latency_critical_ms),
221                )
222                .with_value(latency.p99_ms)
223                .with_threshold(self.config.p99_latency_critical_ms)
224                .with_recommendation("Check database queries, external service calls, and resource utilization."),
225            )
226        } else if latency.p99_ms >= self.config.p99_latency_warning_ms {
227            Some(
228                Insight::new(
229                    InsightType::HighLatency,
230                    Severity::Warning,
231                    "Elevated Latency",
232                    format!("P99 latency is {:.0}ms, above warning threshold of {:.0}ms",
233                        latency.p99_ms, self.config.p99_latency_warning_ms),
234                )
235                .with_value(latency.p99_ms)
236                .with_threshold(self.config.p99_latency_warning_ms)
237                .with_recommendation("Profile slow requests and consider caching or optimization."),
238            )
239        } else {
240            None
241        }
242    }
243
244    fn check_rate_limits(&self, snapshot: &AnalyticsSnapshot) -> Option<Insight> {
245        let rate_limits = &snapshot.rate_limits;
246        if rate_limits.total_checks == 0 {
247            return None;
248        }
249
250        let limited_rate = (rate_limits.limited as f64 / rate_limits.total_checks as f64) * 100.0;
251
252        if limited_rate >= self.config.rate_limit_critical {
253            Some(
254                Insight::new(
255                    InsightType::RateLimitPressure,
256                    Severity::Critical,
257                    "Critical Rate Limit Pressure",
258                    format!("{:.2}% of requests are being rate limited", limited_rate),
259                )
260                .with_value(limited_rate)
261                .with_threshold(self.config.rate_limit_critical)
262                .with_recommendation("Consider increasing rate limits, adding capacity, or implementing request queuing."),
263            )
264        } else if limited_rate >= self.config.rate_limit_warning {
265            Some(
266                Insight::new(
267                    InsightType::RateLimitPressure,
268                    Severity::Warning,
269                    "Rate Limit Pressure",
270                    format!("{:.2}% of requests are being rate limited", limited_rate),
271                )
272                .with_value(limited_rate)
273                .with_threshold(self.config.rate_limit_warning)
274                .with_recommendation("Monitor rate limit usage and consider adjusting limits for legitimate traffic."),
275            )
276        } else {
277            None
278        }
279    }
280
281    fn check_traffic_spike(&self, snapshot: &AnalyticsSnapshot) -> Option<Insight> {
282        let baseline = self.baseline_rps?;
283        let current_rps = snapshot.throughput.requests_per_second;
284
285        if current_rps > baseline * self.config.traffic_spike_multiplier {
286            Some(
287                Insight::new(
288                    InsightType::TrafficSpike,
289                    Severity::Warning,
290                    "Traffic Spike Detected",
291                    format!("Current RPS ({:.1}) is {:.1}x higher than baseline ({:.1})",
292                        current_rps, current_rps / baseline, baseline),
293                )
294                .with_value(current_rps)
295                .with_threshold(baseline * self.config.traffic_spike_multiplier)
296                .with_recommendation("Investigate traffic source. Consider enabling auto-scaling if available."),
297            )
298        } else {
299            None
300        }
301    }
302
303    fn check_slow_endpoints(&self, endpoints: &[EndpointMetrics]) -> Vec<Insight> {
304        let mut insights = Vec::new();
305
306        for endpoint in endpoints {
307            // Skip endpoints with few requests
308            if endpoint.requests < 10 {
309                continue;
310            }
311
312            // Check for slow endpoints
313            if endpoint.p99_latency_ms >= self.config.p99_latency_critical_ms {
314                insights.push(
315                    Insight::new(
316                        InsightType::SlowEndpoint,
317                        Severity::Warning,
318                        "Slow Endpoint",
319                        format!("{} {} has P99 latency of {:.0}ms",
320                            endpoint.method, endpoint.path, endpoint.p99_latency_ms),
321                    )
322                    .with_value(endpoint.p99_latency_ms)
323                    .with_threshold(self.config.p99_latency_critical_ms)
324                    .with_endpoint(format!("{} {}", endpoint.method, endpoint.path))
325                    .with_recommendation("Profile this specific endpoint for optimization opportunities."),
326                );
327            }
328
329            // Check for high error rate endpoints
330            if endpoint.error_rate >= self.config.error_rate_critical {
331                insights.push(
332                    Insight::new(
333                        InsightType::ErrorSpike,
334                        Severity::Warning,
335                        "High Error Rate Endpoint",
336                        format!("{} {} has error rate of {:.2}%",
337                            endpoint.method, endpoint.path, endpoint.error_rate),
338                    )
339                    .with_value(endpoint.error_rate)
340                    .with_threshold(self.config.error_rate_critical)
341                    .with_endpoint(format!("{} {}", endpoint.method, endpoint.path))
342                    .with_recommendation("Investigate errors specific to this endpoint."),
343                );
344            }
345        }
346
347        insights
348    }
349}
350
351impl Default for InsightGenerator {
352    fn default() -> Self {
353        Self::new(InsightConfig::default())
354    }
355}
356
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_insight_creation() {
        // The builder chain should carry every field through unchanged.
        let built = Insight::new(
            InsightType::HighErrorRate,
            Severity::Critical,
            "Test",
            "Description",
        )
        .with_value(5.0)
        .with_threshold(1.0)
        .with_recommendation("Fix it");

        assert_eq!(built.insight_type, InsightType::HighErrorRate);
        assert_eq!(built.severity, Severity::Critical);
        assert_eq!(built.value, 5.0);
    }

    #[test]
    fn test_insight_config_defaults() {
        // Stock thresholds: warn at 1% errors, critical at 5%.
        let defaults = InsightConfig::default();
        assert_eq!(defaults.error_rate_warning, 1.0);
        assert_eq!(defaults.error_rate_critical, 5.0);
    }
}
385