open_lark/core/
error_metrics.rs

1/// 错误统计和监控模块
2///
3/// 提供错误的统计分析和监控功能:
4/// - 错误频率统计
5/// - 错误类型分布
6/// - 性能影响分析
7/// - 趋势分析
8/// - 自动告警
9use std::collections::HashMap;
10use std::{
11    sync::{Arc, Mutex},
12    time::{Duration, SystemTime},
13};
14
15use crate::core::{
16    error::LarkAPIError,
17    error_codes::{ErrorCategory, LarkErrorCode},
18    error_helper::ErrorHandlingCategory,
19};
20
21/// 错误事件记录
22#[derive(Debug, Clone)]
23pub struct ErrorEvent {
24    /// 错误实例
25    pub error: LarkAPIError,
26    /// 发生时间
27    pub timestamp: SystemTime,
28    /// 错误分类
29    pub category: ErrorHandlingCategory,
30    /// 错误码(如果是API错误)
31    pub error_code: Option<LarkErrorCode>,
32    /// 是否可重试
33    pub is_retryable: bool,
34    /// 处理耗时(如果有)
35    pub processing_time: Option<Duration>,
36    /// 上下文信息
37    pub context: HashMap<String, String>,
38}
39
40impl ErrorEvent {
41    /// 从LarkAPIError创建错误事件
42    pub fn from_error(error: LarkAPIError) -> Self {
43        let category = match &error {
44            LarkAPIError::ApiError { code, .. } => {
45                if let Some(error_code) = LarkErrorCode::from_code(*code) {
46                    match error_code.category() {
47                        ErrorCategory::Authentication => ErrorHandlingCategory::Authentication,
48                        ErrorCategory::Permission => ErrorHandlingCategory::Permission,
49                        ErrorCategory::Parameter => ErrorHandlingCategory::ClientError,
50                        ErrorCategory::Resource => ErrorHandlingCategory::ClientError,
51                        ErrorCategory::Server => ErrorHandlingCategory::ServerError,
52                        ErrorCategory::Network => ErrorHandlingCategory::NetworkError,
53                        ErrorCategory::RateLimit => ErrorHandlingCategory::RateLimit,
54                        ErrorCategory::Other => ErrorHandlingCategory::Unknown,
55                    }
56                } else {
57                    ErrorHandlingCategory::Unknown
58                }
59            }
60            LarkAPIError::RequestError(_) => ErrorHandlingCategory::NetworkError,
61            LarkAPIError::MissingAccessToken => ErrorHandlingCategory::Authentication,
62            LarkAPIError::IllegalParamError(_) => ErrorHandlingCategory::ClientError,
63            _ => ErrorHandlingCategory::SystemError,
64        };
65
66        let error_code = match &error {
67            LarkAPIError::ApiError { code, .. } => LarkErrorCode::from_code(*code),
68            _ => None,
69        };
70
71        Self {
72            is_retryable: error.is_retryable(),
73            error,
74            timestamp: SystemTime::now(),
75            category,
76            error_code,
77            processing_time: None,
78            context: HashMap::new(),
79        }
80    }
81
82    /// 添加上下文信息
83    pub fn with_context(mut self, key: &str, value: &str) -> Self {
84        self.context.insert(key.to_string(), value.to_string());
85        self
86    }
87
88    /// 设置处理耗时
89    pub fn with_processing_time(mut self, duration: Duration) -> Self {
90        self.processing_time = Some(duration);
91        self
92    }
93
94    /// 获取错误严重级别
95    pub fn severity_level(&self) -> ErrorSeverity {
96        match &self.category {
97            ErrorHandlingCategory::Authentication => ErrorSeverity::Warning,
98            ErrorHandlingCategory::Permission => ErrorSeverity::Error,
99            ErrorHandlingCategory::ClientError => ErrorSeverity::Warning,
100            ErrorHandlingCategory::ServerError => ErrorSeverity::Critical,
101            ErrorHandlingCategory::NetworkError => ErrorSeverity::Warning,
102            ErrorHandlingCategory::RateLimit => ErrorSeverity::Warning,
103            ErrorHandlingCategory::SystemError => ErrorSeverity::Critical,
104            ErrorHandlingCategory::Unknown => ErrorSeverity::Error,
105        }
106    }
107}
108
/// Severity assigned to a recorded error.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ErrorSeverity {
    /// Informational.
    Info,
    /// Warning.
    Warning,
    /// Error.
    Error,
    /// Critical.
    Critical,
}

impl ErrorSeverity {
    /// Numeric weight used for ordering; a larger value means a more severe level.
    pub fn weight(&self) -> u8 {
        match *self {
            ErrorSeverity::Critical => 4,
            ErrorSeverity::Error => 3,
            ErrorSeverity::Warning => 2,
            ErrorSeverity::Info => 1,
        }
    }

    /// Symbol shown next to the level in printed output.
    pub fn symbol(&self) -> &'static str {
        match *self {
            ErrorSeverity::Critical => "🚨",
            ErrorSeverity::Error => "❌",
            ErrorSeverity::Warning => "⚠️",
            ErrorSeverity::Info => "ℹ️",
        }
    }
}
143
144/// 错误统计数据
145#[derive(Debug, Clone, Default)]
146pub struct ErrorStatistics {
147    /// 总错误数
148    pub total_errors: u64,
149    /// 按类别分组的错误数
150    pub errors_by_category: HashMap<ErrorHandlingCategory, u64>,
151    /// 按错误码分组的错误数
152    pub errors_by_code: HashMap<LarkErrorCode, u64>,
153    /// 按严重级别分组的错误数
154    pub errors_by_severity: HashMap<ErrorSeverity, u64>,
155    /// 可重试错误数
156    pub retryable_errors: u64,
157    /// 平均处理时间
158    pub average_processing_time: Option<Duration>,
159    /// 第一个错误时间
160    pub first_error_time: Option<SystemTime>,
161    /// 最后一个错误时间
162    pub last_error_time: Option<SystemTime>,
163}
164
165impl ErrorStatistics {
166    /// 计算错误率(每分钟)
167    pub fn error_rate_per_minute(&self) -> f64 {
168        if let (Some(first), Some(last)) = (self.first_error_time, self.last_error_time) {
169            if let Ok(duration) = last.duration_since(first) {
170                let minutes = duration.as_secs_f64() / 60.0;
171                if minutes > 0.0 {
172                    return self.total_errors as f64 / minutes;
173                }
174            }
175        }
176        0.0
177    }
178
179    /// 获取最常见的错误类别
180    pub fn most_common_category(&self) -> Option<ErrorHandlingCategory> {
181        self.errors_by_category
182            .iter()
183            .max_by_key(|(_, count)| *count)
184            .map(|(category, _)| *category)
185    }
186
187    /// 获取最严重的错误级别
188    pub fn highest_severity(&self) -> Option<ErrorSeverity> {
189        self.errors_by_severity
190            .keys()
191            .max_by_key(|severity| severity.weight())
192            .copied()
193    }
194
195    /// 计算可重试错误百分比
196    pub fn retryable_percentage(&self) -> f64 {
197        if self.total_errors == 0 {
198            0.0
199        } else {
200            (self.retryable_errors as f64 / self.total_errors as f64) * 100.0
201        }
202    }
203
204    /// 打印统计摘要
205    pub fn print_summary(&self) {
206        println!("📊 错误统计摘要:");
207        println!("   总错误数: {}", self.total_errors);
208        println!("   错误率: {:.2} 错误/分钟", self.error_rate_per_minute());
209        println!(
210            "   可重试错误: {} ({:.1}%)",
211            self.retryable_errors,
212            self.retryable_percentage()
213        );
214
215        if let Some(category) = self.most_common_category() {
216            println!("   最常见类别: {category:?}");
217        }
218
219        if let Some(severity) = self.highest_severity() {
220            println!("   最高严重级别: {} {:?}", severity.symbol(), severity);
221        }
222
223        if let Some(avg_time) = self.average_processing_time {
224            println!("   平均处理时间: {avg_time:?}");
225        }
226    }
227
228    /// 打印详细统计
229    pub fn print_detailed(&self) {
230        self.print_summary();
231
232        println!("\n📈 错误分类统计:");
233        for (category, count) in &self.errors_by_category {
234            let percentage = (*count as f64 / self.total_errors as f64) * 100.0;
235            println!("   {category:?}: {count} ({percentage:.1}%)");
236        }
237
238        println!("\n🔢 错误码统计:");
239        let mut sorted_codes: Vec<_> = self.errors_by_code.iter().collect();
240        sorted_codes.sort_by(|a, b| b.1.cmp(a.1));
241        for (code, count) in sorted_codes.iter().take(10) {
242            let percentage = (**count as f64 / self.total_errors as f64) * 100.0;
243            println!("   {code}: {count} ({percentage:.1}%)");
244        }
245
246        println!("\n⚠️ 严重级别统计:");
247        for severity in [
248            ErrorSeverity::Critical,
249            ErrorSeverity::Error,
250            ErrorSeverity::Warning,
251            ErrorSeverity::Info,
252        ] {
253            if let Some(count) = self.errors_by_severity.get(&severity) {
254                let percentage = (*count as f64 / self.total_errors as f64) * 100.0;
255                println!(
256                    "   {} {:?}: {} ({:.1}%)",
257                    severity.symbol(),
258                    severity,
259                    count,
260                    percentage
261                );
262            }
263        }
264    }
265}
266
267/// 错误监控器
268pub struct ErrorMonitor {
269    /// 错误事件历史
270    events: Arc<Mutex<Vec<ErrorEvent>>>,
271    /// 统计数据
272    statistics: Arc<Mutex<ErrorStatistics>>,
273    /// 配置
274    config: MonitorConfig,
275}
276
277/// 监控配置
278#[derive(Debug, Clone)]
279pub struct MonitorConfig {
280    /// 最大保存事件数量
281    pub max_events: usize,
282    /// 统计时间窗口
283    pub time_window: Duration,
284    /// 是否启用自动清理
285    pub auto_cleanup: bool,
286    /// 告警阈值
287    pub alert_thresholds: AlertThresholds,
288}
289
290impl Default for MonitorConfig {
291    fn default() -> Self {
292        Self {
293            max_events: 1000,
294            time_window: Duration::from_secs(24 * 60 * 60), // 24小时
295            auto_cleanup: true,
296            alert_thresholds: AlertThresholds::default(),
297        }
298    }
299}
300
/// Thresholds at which alerts are raised.
#[derive(Debug, Clone)]
pub struct AlertThresholds {
    /// Error-rate threshold, in errors per minute.
    pub error_rate_per_minute: f64,
    /// Threshold for the number of critical errors.
    pub critical_errors_count: u64,
    /// Threshold for consecutive failures.
    pub consecutive_failures: u32,
}

impl Default for AlertThresholds {
    /// Defaults: 10 errors per minute, 5 critical errors, 3 consecutive failures.
    fn default() -> Self {
        AlertThresholds {
            error_rate_per_minute: 10.0,
            critical_errors_count: 5,
            consecutive_failures: 3,
        }
    }
}
321
322impl Default for ErrorMonitor {
323    fn default() -> Self {
324        Self::new(MonitorConfig::default())
325    }
326}
327
328impl ErrorMonitor {
329    /// 创建新的错误监控器
330    pub fn new(config: MonitorConfig) -> Self {
331        Self {
332            events: Arc::new(Mutex::new(Vec::new())),
333            statistics: Arc::new(Mutex::new(ErrorStatistics::default())),
334            config,
335        }
336    }
337
338    /// 记录错误事件
339    pub fn record_error(&self, error: LarkAPIError) {
340        let event = ErrorEvent::from_error(error);
341        self.record_event(event);
342    }
343
344    /// 记录带上下文的错误事件
345    pub fn record_error_with_context(&self, error: LarkAPIError, context: HashMap<String, String>) {
346        let mut event = ErrorEvent::from_error(error);
347        event.context = context;
348        self.record_event(event);
349    }
350
351    /// 记录错误事件
352    pub fn record_event(&self, event: ErrorEvent) {
353        // 更新统计数据
354        if let Ok(mut stats) = self.statistics.lock() {
355            stats.total_errors += 1;
356
357            // 更新分类统计
358            *stats.errors_by_category.entry(event.category).or_insert(0) += 1;
359
360            // 更新错误码统计
361            if let Some(code) = event.error_code {
362                *stats.errors_by_code.entry(code).or_insert(0) += 1;
363            }
364
365            // 更新严重级别统计
366            let severity = event.severity_level();
367            *stats.errors_by_severity.entry(severity).or_insert(0) += 1;
368
369            // 更新可重试统计
370            if event.is_retryable {
371                stats.retryable_errors += 1;
372            }
373
374            // 更新时间范围
375            if stats.first_error_time.is_none() {
376                stats.first_error_time = Some(event.timestamp);
377            }
378            stats.last_error_time = Some(event.timestamp);
379        }
380
381        // 添加到事件历史
382        if let Ok(mut events) = self.events.lock() {
383            events.push(event);
384
385            // 自动清理旧事件
386            if self.config.auto_cleanup && events.len() > self.config.max_events {
387                let len = events.len();
388                let max_events = self.config.max_events;
389                events.drain(0..(len - max_events));
390            }
391        }
392
393        // 检查告警条件
394        self.check_alerts();
395    }
396
397    /// 获取统计数据
398    pub fn get_statistics(&self) -> ErrorStatistics {
399        self.statistics.lock().unwrap().clone()
400    }
401
402    /// 获取最近的错误事件
403    pub fn get_recent_events(&self, limit: usize) -> Vec<ErrorEvent> {
404        if let Ok(events) = self.events.lock() {
405            events.iter().rev().take(limit).cloned().collect()
406        } else {
407            Vec::new()
408        }
409    }
410
411    /// 清理旧事件
412    pub fn cleanup_old_events(&self) {
413        if let Ok(mut events) = self.events.lock() {
414            let cutoff_time = SystemTime::now() - self.config.time_window;
415            events.retain(|event| event.timestamp >= cutoff_time);
416        }
417    }
418
419    /// 重置统计数据
420    pub fn reset_statistics(&self) {
421        if let Ok(mut stats) = self.statistics.lock() {
422            *stats = ErrorStatistics::default();
423        }
424        if let Ok(mut events) = self.events.lock() {
425            events.clear();
426        }
427    }
428
429    /// 检查告警条件
430    fn check_alerts(&self) {
431        let stats = self.get_statistics();
432
433        // 检查错误率
434        if stats.error_rate_per_minute() > self.config.alert_thresholds.error_rate_per_minute {
435            self.trigger_alert(
436                AlertType::HighErrorRate,
437                format!("错误率过高: {:.2} 错误/分钟", stats.error_rate_per_minute()),
438            );
439        }
440
441        // 检查严重错误
442        if let Some(critical_count) = stats.errors_by_severity.get(&ErrorSeverity::Critical) {
443            if *critical_count >= self.config.alert_thresholds.critical_errors_count {
444                self.trigger_alert(
445                    AlertType::CriticalErrors,
446                    format!("严重错误过多: {critical_count} 个"),
447                );
448            }
449        }
450    }
451
452    /// 触发告警
453    fn trigger_alert(&self, alert_type: AlertType, message: String) {
454        println!("🚨 告警 [{alert_type:?}]: {message}");
455        // 这里可以集成外部告警系统
456    }
457
458    /// 生成错误报告
459    pub fn generate_report(&self) -> ErrorReport {
460        let stats = self.get_statistics();
461        let recent_events = self.get_recent_events(10);
462
463        ErrorReport {
464            statistics: stats,
465            recent_events,
466            generated_at: SystemTime::now(),
467            time_window: self.config.time_window,
468        }
469    }
470}
471
/// Kind of alert that can be raised.
#[derive(Debug)]
enum AlertType {
    /// The error rate exceeded its threshold.
    HighErrorRate,
    /// The number of critical errors reached its threshold.
    CriticalErrors,
    /// Consecutive-failure alert.
    // Allowed as dead code: the variant is declared but never constructed.
    #[allow(dead_code)]
    ConsecutiveFailures,
}
480
481/// 错误报告
482#[derive(Debug)]
483pub struct ErrorReport {
484    /// 统计数据
485    pub statistics: ErrorStatistics,
486    /// 最近事件
487    pub recent_events: Vec<ErrorEvent>,
488    /// 报告生成时间
489    pub generated_at: SystemTime,
490    /// 统计时间窗口
491    pub time_window: Duration,
492}
493
494impl ErrorReport {
495    /// 打印报告
496    pub fn print(&self) {
497        println!("📋 错误监控报告");
498        println!("生成时间: {:?}", self.generated_at);
499        println!("统计窗口: {:?}", self.time_window);
500        println!("{}", "=".repeat(50));
501
502        self.statistics.print_detailed();
503
504        println!("\n🕒 最近错误事件:");
505        for (i, event) in self.recent_events.iter().enumerate() {
506            println!(
507                "   {}. [{:?}] {} {:?}",
508                i + 1,
509                event.timestamp,
510                event.severity_level().symbol(),
511                event.category
512            );
513        }
514    }
515
516    /// 保存到文件
517    pub fn save_to_file(&self, path: &str) -> Result<(), std::io::Error> {
518        use std::{fs::File, io::Write};
519
520        let mut file = File::create(path)?;
521
522        writeln!(file, "错误监控报告")?;
523        writeln!(file, "生成时间: {:?}", self.generated_at)?;
524        writeln!(file, "统计窗口: {:?}", self.time_window)?;
525        writeln!(file, "{}", "=".repeat(50))?;
526
527        writeln!(file, "\n统计摘要:")?;
528        writeln!(file, "总错误数: {}", self.statistics.total_errors)?;
529        writeln!(
530            file,
531            "错误率: {:.2} 错误/分钟",
532            self.statistics.error_rate_per_minute()
533        )?;
534        writeln!(
535            file,
536            "可重试错误: {:.1}%",
537            self.statistics.retryable_percentage()
538        )?;
539
540        Ok(())
541    }
542}
543
#[cfg(test)]
mod tests {
    use super::*;

    /// A 403 API error is classified as a non-retryable permission error.
    #[test]
    fn test_error_event_creation() {
        let event = ErrorEvent::from_error(LarkAPIError::api_error(403, "Forbidden", None));

        assert_eq!(event.category, ErrorHandlingCategory::Permission);
        assert_eq!(event.error_code, Some(LarkErrorCode::Forbidden));
        assert!(!event.is_retryable);
    }

    /// The retryable percentage is derived from the two counters.
    #[test]
    fn test_error_statistics() {
        let stats = ErrorStatistics {
            retryable_errors: 60,
            total_errors: 100,
            ..Default::default()
        };

        assert_eq!(stats.retryable_percentage(), 60.0);
    }

    /// Recording two errors of different categories updates the counters.
    #[test]
    fn test_error_monitor() {
        let monitor = ErrorMonitor::default();

        // Record a couple of errors.
        monitor.record_error(LarkAPIError::api_error(403, "Forbidden", None));
        monitor.record_error(LarkAPIError::api_error(500, "Server Error", None));

        let snapshot = monitor.get_statistics();
        assert_eq!(snapshot.total_errors, 2);
        assert_eq!(snapshot.errors_by_category.len(), 2);
    }

    /// Weights and symbols of the severity levels.
    #[test]
    fn test_error_severity() {
        assert_eq!(ErrorSeverity::Critical.weight(), 4);
        assert_eq!(ErrorSeverity::Warning.weight(), 2);
        assert_eq!(ErrorSeverity::Critical.symbol(), "🚨");
    }
}
588}