1use crate::*;
7use anyhow::Result;
8use serde::{Deserialize, Serialize};
9use std::fmt;
10use uuid::Uuid;
11
/// Configuration for a [`DebugSession`].
///
/// The boolean toggles select which components a session starts and reports
/// on; the nested `*_config` values are handed to the corresponding component
/// constructors in [`DebugSession::new`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DebugConfig {
    /// When `true`, the session starts the tensor inspector and includes a
    /// tensor report in `stop` / `generate_snapshot`.
    pub enable_tensor_inspection: bool,
    /// When `true`, the session starts the gradient debugger and includes a
    /// gradient report in `stop` / `generate_snapshot`.
    pub enable_gradient_debugging: bool,
    /// When `true`, the session starts model diagnostics and includes a
    /// diagnostics report in `stop` / `generate_snapshot`.
    pub enable_model_diagnostics: bool,
    /// Visualization toggle. NOTE(review): not consulted anywhere in this
    /// file — confirm where it is consumed.
    pub enable_visualization: bool,
    /// When `true`, `DebugSession::new` constructs a `MemoryProfiler` from
    /// `memory_profiling_config`; otherwise the session has none.
    pub enable_memory_profiling: bool,
    /// Computation-graph analysis toggle. NOTE(review): not consulted in this
    /// file — the graph analyzer is always constructed and the session report
    /// always carries `None` for the graph report.
    pub enable_computation_graph_analysis: bool,
    /// Presumably an upper bound on the number of tensors tracked at once —
    /// TODO confirm against `TensorInspector`.
    pub max_tracked_tensors: usize,
    /// Presumably an upper bound on retained gradient history entries —
    /// TODO confirm against `GradientDebugger`.
    pub max_gradient_history: usize,
    /// Optional output directory. NOTE(review): not read in this file.
    pub output_dir: Option<String>,
    /// Sampling rate. NOTE(review): semantics are not visible here; presumably
    /// a fraction in `0.0..=1.0` — confirm against the consumers.
    pub sampling_rate: f32,
    /// Passed to `MemoryProfiler::new` when memory profiling is enabled.
    pub memory_profiling_config: MemoryProfilingConfig,
    /// Passed to `ComputationGraphAnalyzer::new`.
    pub graph_analysis_config: GraphAnalysisConfig,
    /// Passed to `architecture_analysis::ArchitectureAnalyzer::new`.
    pub architecture_analysis_config: architecture_analysis::ArchitectureAnalysisConfig,
    /// Passed to `BehaviorAnalyzer::new`.
    pub behavior_analysis_config: BehaviorAnalysisConfig,
    /// Training-dynamics settings. NOTE(review): `DebugSession::new` calls
    /// `TrainingDynamicsAnalyzer::new()` without this value.
    pub training_dynamics_config: TrainingDynamicsConfig,
    /// Passed to `DifferentialDebugger::new`.
    pub differential_debugging_config: DifferentialDebuggingConfig,
    /// Passed to `InterpretabilityAnalyzer::new`.
    pub interpretability_config: InterpretabilityConfig,
    /// When `Some`, `DebugSession::new` constructs a `TransformerDebugger`
    /// from it; when `None`, the session has no transformer debugger.
    pub neural_network_debugging_config: Option<neural_network_debugging::TransformerDebugConfig>,
    /// Passed to `AdvancedMLDebugger::new`.
    pub advanced_ml_debugging_config: AdvancedMLDebuggingConfig,
    /// Its `enable_gpu_profiling` flag gates creation of the advanced GPU
    /// profiler, which is built from its `device_count`.
    pub advanced_gpu_profiling_config: AdvancedGpuProfilingConfig,
    /// Kernel-optimization settings. NOTE(review): `DebugSession::new` calls
    /// `KernelOptimizationAnalyzer::new()` without this value.
    pub kernel_optimization_config: KernelOptimizationConfig,
    /// Its `enable_deep_analysis` flag gates creation of the `AICodeAnalyzer`,
    /// which receives a clone of this config.
    pub ai_code_analysis_config: AIAnalysisConfig,
    /// When `Some`, `DebugSession::new` constructs a `DistributedDebugger`
    /// from it; when `None`, the session has no distributed debugger.
    pub distributed_debugging_config: Option<DistributedDebugConfig>,
    /// Its `enable_carbon_tracking` flag gates creation of the
    /// `EnvironmentalMonitor`, which receives a clone of this config.
    pub environmental_monitoring_config: EnvironmentalConfig,
}
64
65impl Default for DebugConfig {
66 fn default() -> Self {
67 Self {
68 enable_tensor_inspection: true,
69 enable_gradient_debugging: true,
70 enable_model_diagnostics: true,
71 enable_visualization: false,
72 enable_memory_profiling: true,
73 enable_computation_graph_analysis: true,
74 max_tracked_tensors: 1000,
75 max_gradient_history: 100,
76 output_dir: None,
77 sampling_rate: 1.0,
78 memory_profiling_config: MemoryProfilingConfig::default(),
79 graph_analysis_config: GraphAnalysisConfig::default(),
80 architecture_analysis_config:
81 architecture_analysis::ArchitectureAnalysisConfig::default(),
82 behavior_analysis_config: BehaviorAnalysisConfig::default(),
83 training_dynamics_config: TrainingDynamicsConfig::default(),
84 differential_debugging_config: DifferentialDebuggingConfig::default(),
85 interpretability_config: InterpretabilityConfig::default(),
86 neural_network_debugging_config: None,
87 advanced_ml_debugging_config: AdvancedMLDebuggingConfig::default(),
88 advanced_gpu_profiling_config: AdvancedGpuProfilingConfig::default(),
89 kernel_optimization_config: KernelOptimizationConfig::default(),
90 ai_code_analysis_config: AIAnalysisConfig::default(),
91 distributed_debugging_config: None,
92 environmental_monitoring_config: EnvironmentalConfig::default(),
93 }
94 }
95}
96
/// A debugging session that owns one instance of each debugging component.
///
/// Components stored as `Option` are only present when the matching part of
/// the [`DebugConfig`] enables them (see `DebugSession::new`).
#[derive(Debug)]
pub struct DebugSession {
    /// Random (v4) identifier assigned when the session is created.
    id: Uuid,
    /// The configuration the session was created with.
    config: DebugConfig,
    /// Tensor inspection component.
    tensor_inspector: TensorInspector,
    /// Gradient debugging component.
    gradient_debugger: GradientDebugger,
    /// Model diagnostics component.
    model_diagnostics: ModelDiagnostics,
    /// Hook manager.
    hooks: HookManager,
    /// General profiler; always started by `start`.
    profiler: Profiler,
    /// Memory profiler; present only when `config.enable_memory_profiling`.
    memory_profiler: Option<MemoryProfiler>,
    /// Interactive debugger; always started by `start`.
    interactive_debugger: InteractiveDebugger,
    /// Anomaly detector; always started by `start`.
    anomaly_detector: AnomalyDetector,
    /// Computation graph analyzer.
    computation_graph_analyzer: ComputationGraphAnalyzer,
    /// Architecture analyzer.
    architecture_analyzer: architecture_analysis::ArchitectureAnalyzer,
    /// Behavior analyzer.
    behavior_analyzer: BehaviorAnalyzer,
    /// Training dynamics analyzer.
    training_dynamics_analyzer: TrainingDynamicsAnalyzer,
    /// Differential debugger.
    differential_debugger: DifferentialDebugger,
    /// Interpretability analyzer.
    interpretability_analyzer: InterpretabilityAnalyzer,
    /// Health checker.
    health_checker: crate::health_checker::HealthChecker,
    /// Transformer debugger; present only when
    /// `config.neural_network_debugging_config` is `Some`.
    transformer_debugger: Option<neural_network_debugging::TransformerDebugger>,
    /// Advanced ML debugger.
    advanced_ml_debugger: AdvancedMLDebugger,
    /// Advanced GPU memory profiler; present only when GPU profiling is
    /// enabled and its constructor succeeded.
    advanced_gpu_profiler: Option<AdvancedGpuMemoryProfiler>,
    // The kernel optimizer is constructed in `new` but no method in this file
    // reads it, hence the lint suppression.
    #[allow(dead_code)]
    kernel_optimizer: KernelOptimizationAnalyzer,
    /// AI code analyzer; present only when
    /// `config.ai_code_analysis_config.enable_deep_analysis`.
    ai_code_analyzer: Option<AICodeAnalyzer>,
    /// Distributed debugger; present only when
    /// `config.distributed_debugging_config` is `Some`.
    distributed_debugger: Option<DistributedDebugger>,
    /// Environmental monitor; present only when
    /// `config.environmental_monitoring_config.enable_carbon_tracking`.
    environmental_monitor: Option<EnvironmentalMonitor>,
}
126
127impl DebugSession {
128 pub fn new(config: DebugConfig) -> Self {
130 let id = Uuid::new_v4();
131
132 let memory_profiler = if config.enable_memory_profiling {
133 Some(MemoryProfiler::new(config.memory_profiling_config.clone()))
134 } else {
135 None
136 };
137
138 let transformer_debugger =
139 if let Some(ref neural_config) = config.neural_network_debugging_config {
140 Some(neural_network_debugging::TransformerDebugger::new(
141 neural_config.clone(),
142 ))
143 } else {
144 None
145 };
146
147 let advanced_gpu_profiler = if config.advanced_gpu_profiling_config.enable_gpu_profiling {
148 AdvancedGpuMemoryProfiler::new(config.advanced_gpu_profiling_config.device_count).ok()
149 } else {
150 None
151 };
152
153 let ai_code_analyzer = if config.ai_code_analysis_config.enable_deep_analysis {
154 Some(AICodeAnalyzer::new(config.ai_code_analysis_config.clone()))
155 } else {
156 None
157 };
158
159 let distributed_debugger =
160 if let Some(ref dist_config) = config.distributed_debugging_config {
161 let node_id = NodeId::new(0, "debug-node".to_string());
162 Some(DistributedDebugger::new(dist_config.clone(), node_id))
163 } else {
164 None
165 };
166
167 let environmental_monitor = if config.environmental_monitoring_config.enable_carbon_tracking
168 {
169 Some(EnvironmentalMonitor::new(
170 config.environmental_monitoring_config.clone(),
171 ))
172 } else {
173 None
174 };
175
176 Self {
177 id,
178 tensor_inspector: TensorInspector::new(&config),
179 gradient_debugger: GradientDebugger::new(config.clone()),
180 model_diagnostics: ModelDiagnostics::new(&config),
181 hooks: HookManager::new(),
182 profiler: Profiler::new(&config),
183 memory_profiler,
184 interactive_debugger: InteractiveDebugger::new(&config),
185 anomaly_detector: AnomalyDetector::new(&config),
186 computation_graph_analyzer: ComputationGraphAnalyzer::new(
187 config.graph_analysis_config.clone(),
188 ),
189 architecture_analyzer: architecture_analysis::ArchitectureAnalyzer::new(
190 config.architecture_analysis_config.clone(),
191 ),
192 behavior_analyzer: BehaviorAnalyzer::new(config.behavior_analysis_config.clone()),
193 training_dynamics_analyzer: TrainingDynamicsAnalyzer::new(),
194 differential_debugger: DifferentialDebugger::new(
195 config.differential_debugging_config.clone(),
196 ),
197 interpretability_analyzer: InterpretabilityAnalyzer::new(
198 config.interpretability_config.clone(),
199 ),
200 health_checker: crate::health_checker::HealthChecker::new(&config),
201 transformer_debugger,
202 advanced_ml_debugger: AdvancedMLDebugger::new(
203 config.advanced_ml_debugging_config.clone(),
204 ),
205 advanced_gpu_profiler,
206 kernel_optimizer: match KernelOptimizationAnalyzer::new() {
207 Ok(analyzer) => analyzer,
208 Err(e) => {
209 tracing::warn!(
210 "Failed to initialize kernel optimizer: {}, using stub implementation",
211 e
212 );
213 KernelOptimizationAnalyzer::new_stub()
215 },
216 },
217 ai_code_analyzer,
218 distributed_debugger,
219 environmental_monitor,
220 config,
221 }
222 }
223
224 pub fn id(&self) -> Uuid {
226 self.id
227 }
228
229 pub fn config(&self) -> &DebugConfig {
231 &self.config
232 }
233
234 pub fn tensor_inspector(&self) -> &TensorInspector {
236 &self.tensor_inspector
237 }
238
239 pub fn tensor_inspector_mut(&mut self) -> &mut TensorInspector {
241 &mut self.tensor_inspector
242 }
243
244 pub fn gradient_debugger(&self) -> &GradientDebugger {
246 &self.gradient_debugger
247 }
248
249 pub fn gradient_debugger_mut(&mut self) -> &mut GradientDebugger {
251 &mut self.gradient_debugger
252 }
253
254 pub fn model_diagnostics(&self) -> &ModelDiagnostics {
256 &self.model_diagnostics
257 }
258
259 pub fn model_diagnostics_mut(&mut self) -> &mut ModelDiagnostics {
261 &mut self.model_diagnostics
262 }
263
264 pub fn hooks(&self) -> &HookManager {
266 &self.hooks
267 }
268
269 pub fn hooks_mut(&mut self) -> &mut HookManager {
271 &mut self.hooks
272 }
273
274 pub fn profiler(&self) -> &Profiler {
276 &self.profiler
277 }
278
279 pub fn profiler_mut(&mut self) -> &mut Profiler {
281 &mut self.profiler
282 }
283
284 pub fn memory_profiler(&self) -> Option<&MemoryProfiler> {
286 self.memory_profiler.as_ref()
287 }
288
289 pub fn memory_profiler_mut(&mut self) -> Option<&mut MemoryProfiler> {
291 self.memory_profiler.as_mut()
292 }
293
294 pub fn interactive_debugger(&self) -> &InteractiveDebugger {
296 &self.interactive_debugger
297 }
298
299 pub fn interactive_debugger_mut(&mut self) -> &mut InteractiveDebugger {
301 &mut self.interactive_debugger
302 }
303
304 pub fn anomaly_detector(&self) -> &AnomalyDetector {
306 &self.anomaly_detector
307 }
308
309 pub fn anomaly_detector_mut(&mut self) -> &mut AnomalyDetector {
311 &mut self.anomaly_detector
312 }
313
314 pub fn computation_graph_analyzer(&self) -> &ComputationGraphAnalyzer {
316 &self.computation_graph_analyzer
317 }
318
319 pub fn computation_graph_analyzer_mut(&mut self) -> &mut ComputationGraphAnalyzer {
321 &mut self.computation_graph_analyzer
322 }
323
324 pub fn architecture_analyzer(&self) -> &architecture_analysis::ArchitectureAnalyzer {
326 &self.architecture_analyzer
327 }
328
329 pub fn architecture_analyzer_mut(
331 &mut self,
332 ) -> &mut architecture_analysis::ArchitectureAnalyzer {
333 &mut self.architecture_analyzer
334 }
335
336 pub fn behavior_analyzer(&self) -> &BehaviorAnalyzer {
338 &self.behavior_analyzer
339 }
340
341 pub fn behavior_analyzer_mut(&mut self) -> &mut BehaviorAnalyzer {
343 &mut self.behavior_analyzer
344 }
345
346 pub fn training_dynamics_analyzer(&self) -> &TrainingDynamicsAnalyzer {
348 &self.training_dynamics_analyzer
349 }
350
351 pub fn training_dynamics_analyzer_mut(&mut self) -> &mut TrainingDynamicsAnalyzer {
353 &mut self.training_dynamics_analyzer
354 }
355
356 pub fn differential_debugger(&self) -> &DifferentialDebugger {
358 &self.differential_debugger
359 }
360
361 pub fn differential_debugger_mut(&mut self) -> &mut DifferentialDebugger {
363 &mut self.differential_debugger
364 }
365
366 pub fn interpretability_analyzer(&self) -> &InterpretabilityAnalyzer {
368 &self.interpretability_analyzer
369 }
370
371 pub fn interpretability_analyzer_mut(&mut self) -> &mut InterpretabilityAnalyzer {
373 &mut self.interpretability_analyzer
374 }
375
376 pub fn health_checker(&self) -> &crate::health_checker::HealthChecker {
378 &self.health_checker
379 }
380
381 pub fn health_checker_mut(&mut self) -> &mut crate::health_checker::HealthChecker {
383 &mut self.health_checker
384 }
385
386 pub fn transformer_debugger(&self) -> Option<&neural_network_debugging::TransformerDebugger> {
388 self.transformer_debugger.as_ref()
389 }
390
391 pub fn transformer_debugger_mut(
393 &mut self,
394 ) -> Option<&mut neural_network_debugging::TransformerDebugger> {
395 self.transformer_debugger.as_mut()
396 }
397
398 pub fn advanced_ml_debugger(&self) -> &AdvancedMLDebugger {
400 &self.advanced_ml_debugger
401 }
402
403 pub fn advanced_ml_debugger_mut(&mut self) -> &mut AdvancedMLDebugger {
405 &mut self.advanced_ml_debugger
406 }
407
408 pub fn ai_code_analyzer(&self) -> Option<&AICodeAnalyzer> {
410 self.ai_code_analyzer.as_ref()
411 }
412
413 pub fn ai_code_analyzer_mut(&mut self) -> Option<&mut AICodeAnalyzer> {
415 self.ai_code_analyzer.as_mut()
416 }
417
418 pub fn distributed_debugger(&self) -> Option<&DistributedDebugger> {
420 self.distributed_debugger.as_ref()
421 }
422
423 pub fn distributed_debugger_mut(&mut self) -> Option<&mut DistributedDebugger> {
425 self.distributed_debugger.as_mut()
426 }
427
428 pub fn environmental_monitor(&self) -> Option<&EnvironmentalMonitor> {
430 self.environmental_monitor.as_ref()
431 }
432
433 pub fn environmental_monitor_mut(&mut self) -> Option<&mut EnvironmentalMonitor> {
435 self.environmental_monitor.as_mut()
436 }
437
438 pub async fn start(&mut self) -> Result<()> {
440 tracing::info!("Starting debug session {}", self.id);
441
442 if self.config.enable_tensor_inspection {
443 self.tensor_inspector.start().await?;
444 }
445
446 if self.config.enable_gradient_debugging {
447 self.gradient_debugger.start().await?;
448 }
449
450 if self.config.enable_model_diagnostics {
451 self.model_diagnostics.start().await?;
452 }
453
454 self.profiler.start().await?;
455
456 if let Some(ref mut memory_profiler) = self.memory_profiler {
457 memory_profiler.start().await?;
458 }
459
460 self.interactive_debugger.start().await?;
461 self.anomaly_detector.start().await?;
462
463 Ok(())
464 }
465
466 pub async fn stop(&mut self) -> Result<DebugReport> {
468 tracing::info!("Stopping debug session {}", self.id);
469
470 let tensor_report = if self.config.enable_tensor_inspection {
471 Some(self.tensor_inspector.generate_report().await?)
472 } else {
473 None
474 };
475
476 let gradient_report = if self.config.enable_gradient_debugging {
477 Some(self.gradient_debugger.generate_report().await?)
478 } else {
479 None
480 };
481
482 let diagnostics_report = if self.config.enable_model_diagnostics {
483 Some(self.model_diagnostics.generate_report().await?)
484 } else {
485 None
486 };
487
488 let profiler_report = self.profiler.generate_report().await?;
489
490 let memory_profiler_report = if let Some(ref mut memory_profiler) = self.memory_profiler {
491 Some(memory_profiler.stop().await?)
492 } else {
493 None
494 };
495
496 let interactive_debugger_report = self.interactive_debugger.generate_report().await?;
497 let anomaly_report = self.anomaly_detector.generate_report().await?;
498
499 let computation_graph_report = None; let architecture_analysis_report =
504 Some(self.architecture_analyzer.generate_report().await?);
505 let behavior_analysis_report = Some(self.behavior_analyzer.generate_report().await?);
506 let training_dynamics_report =
507 Some(self.training_dynamics_analyzer.generate_report().await?);
508 let differential_debugging_report =
509 Some(self.differential_debugger.generate_report().await?);
510 let interpretability_report = Some(self.interpretability_analyzer.generate_report().await?);
511 let advanced_ml_debugging_report = Some(self.advanced_ml_debugger.generate_report().await?);
512
513 let advanced_gpu_profiling_report = if let Some(ref profiler) = self.advanced_gpu_profiler {
515 Some(profiler.get_memory_analysis_report())
516 } else {
517 None
518 };
519
520 let kernel_optimization_report =
521 Some(self.generate_kernel_optimization_summary_report().await?);
522
523 Ok(DebugReport {
524 session_id: self.id,
525 tensor_report,
526 gradient_report,
527 diagnostics_report,
528 profiler_report,
529 memory_profiler_report,
530 interactive_debugger_report,
531 anomaly_report,
532 computation_graph_report,
533 architecture_analysis_report,
534 behavior_analysis_report,
535 training_dynamics_report,
536 differential_debugging_report,
537 interpretability_report,
538 advanced_ml_debugging_report,
539 advanced_gpu_profiling_report,
540 kernel_optimization_report,
541 config: self.config.clone(),
542 })
543 }
544
545 pub async fn export(&self, path: &str) -> Result<()> {
547 let report = self.generate_snapshot().await?;
548 let json = serde_json::to_string_pretty(&report)?;
549 tokio::fs::write(path, json).await?;
550 Ok(())
551 }
552
553 pub async fn generate_snapshot(&self) -> Result<DebugReport> {
555 let tensor_report = if self.config.enable_tensor_inspection {
556 Some(self.tensor_inspector.generate_report().await?)
557 } else {
558 None
559 };
560
561 let gradient_report = if self.config.enable_gradient_debugging {
562 Some(self.gradient_debugger.generate_report().await?)
563 } else {
564 None
565 };
566
567 let diagnostics_report = if self.config.enable_model_diagnostics {
568 Some(self.model_diagnostics.generate_report().await?)
569 } else {
570 None
571 };
572
573 let profiler_report = self.profiler.generate_report().await?;
574
575 let memory_profiler_report = if let Some(ref _memory_profiler) = self.memory_profiler {
576 None } else {
579 None
580 };
581
582 let interactive_debugger_report = self.interactive_debugger.generate_report().await?;
583 let anomaly_report = self.anomaly_detector.generate_report().await?;
584
585 let computation_graph_report = None; let architecture_analysis_report =
590 Some(self.architecture_analyzer.generate_report().await?);
591 let behavior_analysis_report = Some(self.behavior_analyzer.generate_report().await?);
592 let training_dynamics_report =
593 Some(self.training_dynamics_analyzer.generate_report().await?);
594 let differential_debugging_report =
595 Some(self.differential_debugger.generate_report().await?);
596 let interpretability_report = Some(self.interpretability_analyzer.generate_report().await?);
597 let advanced_ml_debugging_report = Some(self.advanced_ml_debugger.generate_report().await?);
598
599 let advanced_gpu_profiling_report = if let Some(ref profiler) = self.advanced_gpu_profiler {
601 Some(profiler.get_memory_analysis_report())
602 } else {
603 None
604 };
605
606 let kernel_optimization_report =
607 Some(self.generate_kernel_optimization_summary_report().await?);
608
609 Ok(DebugReport {
610 session_id: self.id,
611 tensor_report,
612 gradient_report,
613 diagnostics_report,
614 profiler_report,
615 memory_profiler_report,
616 interactive_debugger_report,
617 anomaly_report,
618 computation_graph_report,
619 architecture_analysis_report,
620 behavior_analysis_report,
621 training_dynamics_report,
622 differential_debugging_report,
623 interpretability_report,
624 advanced_ml_debugging_report,
625 advanced_gpu_profiling_report,
626 kernel_optimization_report,
627 config: self.config.clone(),
628 })
629 }
630
631 pub fn debug_tensor<T>(&mut self, tensor: &ArrayD<T>, name: &str) -> Result<Uuid>
633 where
634 T: Clone + Into<f64> + fmt::Debug + 'static,
635 {
636 self.tensor_inspector.inspect_tensor(tensor, name, None, None)
637 }
638
639 async fn generate_kernel_optimization_summary_report(
641 &self,
642 ) -> Result<KernelOptimizationSummaryReport> {
643 Ok(KernelOptimizationSummaryReport {
646 total_kernels_analyzed: 0,
647 optimization_opportunities_found: 0,
648 high_impact_optimizations: vec![],
649 fusion_opportunities: 0,
650 regression_alerts: 0,
651 overall_optimization_score: 85.0,
652 top_recommendations: vec!["No kernel analysis data available yet".to_string()],
653 })
654 }
655
656 pub fn debug_gradients<T>(&mut self, _layer_name: &str, gradients: &[T]) -> Result<()>
658 where
659 T: Clone + Into<f64> + fmt::Debug + 'static,
660 {
661 use scirs2_core::ndarray::Array; let gradient_array = Array::from_vec(gradients.to_vec()).into_dyn();
664
665 let tensor_id = Uuid::new_v4();
667
668 self.tensor_inspector.inspect_gradients(tensor_id, &gradient_array)
669 }
670}
671
/// The collected output of a [`DebugSession`], as produced by
/// `DebugSession::stop` and `DebugSession::generate_snapshot`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DebugReport {
    /// Identifier of the session that produced this report.
    pub session_id: Uuid,
    /// Tensor inspection report; `None` when tensor inspection is disabled.
    pub tensor_report: Option<TensorInspectionReport>,
    /// Gradient debugging report; `None` when gradient debugging is disabled.
    pub gradient_report: Option<GradientDebugReport>,
    /// Model diagnostics report; `None` when model diagnostics is disabled.
    pub diagnostics_report: Option<ModelDiagnosticsReport>,
    /// Profiler report; always present.
    pub profiler_report: ProfilerReport,
    /// Memory profiler report; only populated by `DebugSession::stop` when the
    /// session has a memory profiler.
    pub memory_profiler_report: Option<MemoryProfilingReport>,
    /// Interactive debugger report; always present.
    pub interactive_debugger_report: InteractiveDebuggerReport,
    /// Anomaly detector report; always present.
    pub anomaly_report: AnomalyDetectorReport,
    /// Computation graph analysis result; the session code in this file always
    /// sets it to `None`.
    pub computation_graph_report: Option<GraphAnalysisResult>,
    /// Architecture analysis report.
    pub architecture_analysis_report: Option<ArchitectureAnalysisReport>,
    /// Behavior analysis report.
    pub behavior_analysis_report: Option<BehaviorAnalysisReport>,
    /// Training dynamics report.
    pub training_dynamics_report: Option<model_diagnostics::training::TrainingDynamicsReport>,
    /// Differential debugging report.
    pub differential_debugging_report: Option<DifferentialDebuggingReport>,
    /// Interpretability report.
    pub interpretability_report: Option<InterpretabilityReport>,
    /// Advanced ML debugging report.
    pub advanced_ml_debugging_report: Option<AdvancedMLDebuggingReport>,
    /// GPU memory analysis; `None` when the session has no advanced GPU profiler.
    pub advanced_gpu_profiling_report: Option<MemoryAnalysisReport>,
    /// Kernel optimization summary.
    pub kernel_optimization_report: Option<KernelOptimizationSummaryReport>,
    /// Copy of the configuration the session ran with.
    pub config: DebugConfig,
}
694
695impl DebugReport {
696 pub fn summary(&self) -> DebugSummary {
698 let mut issues = Vec::new();
699 let mut recommendations = Vec::new();
700
701 if let Some(ref tensor_report) = self.tensor_report {
703 if tensor_report.has_nan_values() {
704 issues.push("NaN values detected in tensors".to_string());
705 recommendations.push("Check input data and model initialization".to_string());
706 }
707
708 if tensor_report.has_inf_values() {
709 issues.push("Infinite values detected in tensors".to_string());
710 recommendations.push("Reduce learning rate or add gradient clipping".to_string());
711 }
712 }
713
714 if let Some(ref gradient_report) = self.gradient_report {
716 if gradient_report.has_vanishing_gradients() {
717 issues.push("Vanishing gradients detected".to_string());
718 recommendations
719 .push("Consider residual connections or gradient scaling".to_string());
720 }
721
722 if gradient_report.has_exploding_gradients() {
723 issues.push("Exploding gradients detected".to_string());
724 recommendations.push("Add gradient clipping".to_string());
725 }
726 }
727
728 DebugSummary {
729 session_id: self.session_id,
730 total_issues: issues.len(),
731 critical_issues: issues
732 .iter()
733 .filter(|i| i.contains("NaN") || i.contains("exploding"))
734 .count(),
735 issues,
736 recommendations,
737 }
738 }
739}
740
/// Condensed view of a [`DebugReport`], produced by `DebugReport::summary`.
#[derive(Debug, Serialize, Deserialize)]
pub struct DebugSummary {
    /// Identifier of the session the summarised report belongs to.
    pub session_id: Uuid,
    /// Number of entries in `issues`.
    pub total_issues: usize,
    /// Number of issues classified as critical.
    pub critical_issues: usize,
    /// Human-readable descriptions of the detected issues.
    pub issues: Vec<String>,
    /// Suggested remedies, one pushed alongside each detected issue.
    pub recommendations: Vec<String>,
}
750
751pub fn debug_session() -> DebugSession {
753 DebugSession::new(DebugConfig::default())
754}
755
/// Creates a [`DebugSession`] from the supplied `config`.
///
/// Convenience wrapper around `DebugSession::new`.
pub fn debug_session_with_config(config: DebugConfig) -> DebugSession {
    DebugSession::new(config)
}
760
761pub fn debug_session_with_transformer() -> DebugSession {
763 let mut config = DebugConfig::default();
764 config.neural_network_debugging_config =
765 Some(neural_network_debugging::TransformerDebugConfig::default());
766 DebugSession::new(config)
767}