1use crate::*;
7use anyhow::Result;
8use serde::{Deserialize, Serialize};
9use std::fmt;
10use uuid::Uuid;
11
/// Configuration for a [`DebugSession`].
///
/// Holds the feature toggles and the per-subsystem configuration values that
/// `DebugSession::new` hands to the individual debugging tools.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DebugConfig {
    /// When `true`, the session starts the tensor inspector and includes its
    /// report in the generated `DebugReport`.
    pub enable_tensor_inspection: bool,
    /// When `true`, the session starts the gradient debugger and includes its
    /// report in the generated `DebugReport`.
    pub enable_gradient_debugging: bool,
    /// When `true`, the session starts model diagnostics and includes their
    /// report in the generated `DebugReport`.
    pub enable_model_diagnostics: bool,
    /// Visualization toggle. NOTE(review): not read by the session code in
    /// this file; the consumer lives elsewhere.
    pub enable_visualization: bool,
    /// When `true`, `DebugSession::new` constructs a `MemoryProfiler` from
    /// `memory_profiling_config`.
    pub enable_memory_profiling: bool,
    /// Computation-graph analysis toggle. NOTE(review): not read by the
    /// session code in this file; the graph analyzer is always constructed.
    pub enable_computation_graph_analysis: bool,
    /// Limit on tracked tensors (presumably enforced by the tensor inspector;
    /// verify against its implementation).
    pub max_tracked_tensors: usize,
    /// Limit on retained gradient history (presumably enforced by the gradient
    /// debugger; verify against its implementation).
    pub max_gradient_history: usize,
    /// Optional output directory. NOTE(review): not read by the session code
    /// in this file.
    pub output_dir: Option<String>,
    /// Sampling rate. NOTE(review): valid range and consumer are not visible
    /// here; confirm before relying on a particular interpretation.
    pub sampling_rate: f32,
    /// Configuration passed to `MemoryProfiler::new` when memory profiling is
    /// enabled.
    pub memory_profiling_config: MemoryProfilingConfig,
    /// Configuration passed to `ComputationGraphAnalyzer::new`.
    pub graph_analysis_config: GraphAnalysisConfig,
    /// Configuration passed to `ArchitectureAnalyzer::new`.
    pub architecture_analysis_config: architecture_analysis::ArchitectureAnalysisConfig,
    /// Configuration passed to `BehaviorAnalyzer::new`.
    pub behavior_analysis_config: BehaviorAnalysisConfig,
    /// Training-dynamics configuration. NOTE(review): `DebugSession::new`
    /// builds the `TrainingDynamicsAnalyzer` without arguments, so this value
    /// is not consumed there.
    pub training_dynamics_config: TrainingDynamicsConfig,
    /// Configuration passed to `DifferentialDebugger::new`.
    pub differential_debugging_config: DifferentialDebuggingConfig,
    /// Configuration passed to `InterpretabilityAnalyzer::new`.
    pub interpretability_config: InterpretabilityConfig,
    /// When `Some`, `DebugSession::new` constructs a `TransformerDebugger`
    /// from it; when `None`, the session has no transformer debugger.
    pub neural_network_debugging_config: Option<neural_network_debugging::TransformerDebugConfig>,
    /// Configuration passed to `AdvancedMLDebugger::new`.
    pub advanced_ml_debugging_config: AdvancedMLDebuggingConfig,
    /// GPU profiling configuration; its `enable_gpu_profiling` flag and
    /// `device_count` decide whether and how the GPU memory profiler is built.
    pub advanced_gpu_profiling_config: AdvancedGpuProfilingConfig,
    /// Kernel optimization configuration. NOTE(review): `DebugSession::new`
    /// builds the `KernelOptimizationAnalyzer` without arguments, so this
    /// value is not consumed there.
    pub kernel_optimization_config: KernelOptimizationConfig,
    /// AI code analysis configuration; its `enable_deep_analysis` flag decides
    /// whether an `AICodeAnalyzer` is constructed.
    pub ai_code_analysis_config: AIAnalysisConfig,
    /// When `Some`, `DebugSession::new` constructs a `DistributedDebugger`
    /// from it; when `None`, the session has no distributed debugger.
    pub distributed_debugging_config: Option<DistributedDebugConfig>,
    /// Environmental monitoring configuration; its `enable_carbon_tracking`
    /// flag decides whether an `EnvironmentalMonitor` is constructed.
    pub environmental_monitoring_config: EnvironmentalConfig,
}
64
impl Default for DebugConfig {
    /// Returns the default configuration: every feature toggle except
    /// visualization is on, both optional sub-configurations are `None`, and
    /// each remaining sub-configuration uses its own default value.
    fn default() -> Self {
        Self {
            enable_tensor_inspection: true,
            enable_gradient_debugging: true,
            enable_model_diagnostics: true,
            // Visualization is the only feature toggle that is off by default.
            enable_visualization: false,
            enable_memory_profiling: true,
            enable_computation_graph_analysis: true,
            max_tracked_tensors: 1000,
            max_gradient_history: 100,
            output_dir: None,
            sampling_rate: 1.0,
            memory_profiling_config: MemoryProfilingConfig::default(),
            graph_analysis_config: GraphAnalysisConfig::default(),
            architecture_analysis_config:
                architecture_analysis::ArchitectureAnalysisConfig::default(),
            behavior_analysis_config: BehaviorAnalysisConfig::default(),
            training_dynamics_config: TrainingDynamicsConfig::default(),
            differential_debugging_config: DifferentialDebuggingConfig::default(),
            // Written as a bare value rather than a `::default()` call
            // (presumably a unit struct; confirm at its definition).
            interpretability_config: InterpretabilityConfig,
            // No transformer debugger unless the caller supplies a config.
            neural_network_debugging_config: None,
            advanced_ml_debugging_config: AdvancedMLDebuggingConfig::default(),
            advanced_gpu_profiling_config: AdvancedGpuProfilingConfig::default(),
            kernel_optimization_config: KernelOptimizationConfig::default(),
            ai_code_analysis_config: AIAnalysisConfig::default(),
            // No distributed debugger unless the caller supplies a config.
            distributed_debugging_config: None,
            environmental_monitoring_config: EnvironmentalConfig::default(),
        }
    }
}
96
/// A debugging session that owns one instance of each debugging tool.
///
/// Tools stored as `Option` are only present when the corresponding part of
/// the [`DebugConfig`] enables them (see `DebugSession::new`).
#[derive(Debug)]
pub struct DebugSession {
    /// Identifier generated with `Uuid::new_v4` when the session is created.
    id: Uuid,
    /// The configuration the session was created with.
    config: DebugConfig,
    tensor_inspector: TensorInspector,
    gradient_debugger: GradientDebugger,
    model_diagnostics: ModelDiagnostics,
    hooks: HookManager,
    profiler: Profiler,
    /// Present only when `config.enable_memory_profiling` was `true`.
    memory_profiler: Option<MemoryProfiler>,
    interactive_debugger: InteractiveDebugger,
    anomaly_detector: AnomalyDetector,
    computation_graph_analyzer: ComputationGraphAnalyzer,
    architecture_analyzer: architecture_analysis::ArchitectureAnalyzer,
    behavior_analyzer: BehaviorAnalyzer,
    training_dynamics_analyzer: TrainingDynamicsAnalyzer,
    differential_debugger: DifferentialDebugger,
    interpretability_analyzer: InterpretabilityAnalyzer,
    health_checker: crate::health_checker::HealthChecker,
    /// Present only when `config.neural_network_debugging_config` was `Some`.
    transformer_debugger: Option<neural_network_debugging::TransformerDebugger>,
    advanced_ml_debugger: AdvancedMLDebugger,
    /// Present only when GPU profiling was enabled and the profiler could be
    /// constructed.
    advanced_gpu_profiler: Option<AdvancedGpuMemoryProfiler>,
    // Constructed in `new` but not read by any method visible in this file,
    // hence the lint suppression.
    #[allow(dead_code)]
    kernel_optimizer: KernelOptimizationAnalyzer,
    /// Present only when `config.ai_code_analysis_config.enable_deep_analysis`
    /// was `true`.
    ai_code_analyzer: Option<AICodeAnalyzer>,
    /// Present only when `config.distributed_debugging_config` was `Some`.
    distributed_debugger: Option<DistributedDebugger>,
    /// Present only when
    /// `config.environmental_monitoring_config.enable_carbon_tracking` was
    /// `true`.
    environmental_monitor: Option<EnvironmentalMonitor>,
}
126
127impl DebugSession {
128 pub fn new(config: DebugConfig) -> Self {
130 let id = Uuid::new_v4();
131
132 let memory_profiler = if config.enable_memory_profiling {
133 Some(MemoryProfiler::new(config.memory_profiling_config.clone()))
134 } else {
135 None
136 };
137
138 let transformer_debugger =
139 config.neural_network_debugging_config.as_ref().map(|neural_config| {
140 neural_network_debugging::TransformerDebugger::new(neural_config.clone())
141 });
142
143 let advanced_gpu_profiler = if config.advanced_gpu_profiling_config.enable_gpu_profiling {
144 AdvancedGpuMemoryProfiler::new(config.advanced_gpu_profiling_config.device_count).ok()
145 } else {
146 None
147 };
148
149 let ai_code_analyzer = if config.ai_code_analysis_config.enable_deep_analysis {
150 Some(AICodeAnalyzer::new(config.ai_code_analysis_config.clone()))
151 } else {
152 None
153 };
154
155 let distributed_debugger =
156 if let Some(ref dist_config) = config.distributed_debugging_config {
157 let node_id = NodeId::new(0, "debug-node".to_string());
158 Some(DistributedDebugger::new(dist_config.clone(), node_id))
159 } else {
160 None
161 };
162
163 let environmental_monitor = if config.environmental_monitoring_config.enable_carbon_tracking
164 {
165 Some(EnvironmentalMonitor::new(
166 config.environmental_monitoring_config.clone(),
167 ))
168 } else {
169 None
170 };
171
172 Self {
173 id,
174 tensor_inspector: TensorInspector::new(&config),
175 gradient_debugger: GradientDebugger::new(config.clone()),
176 model_diagnostics: ModelDiagnostics::new(&config),
177 hooks: HookManager::new(),
178 profiler: Profiler::new(&config),
179 memory_profiler,
180 interactive_debugger: InteractiveDebugger::new(&config),
181 anomaly_detector: AnomalyDetector::new(&config),
182 computation_graph_analyzer: ComputationGraphAnalyzer::new(
183 config.graph_analysis_config.clone(),
184 ),
185 architecture_analyzer: architecture_analysis::ArchitectureAnalyzer::new(
186 config.architecture_analysis_config.clone(),
187 ),
188 behavior_analyzer: BehaviorAnalyzer::new(config.behavior_analysis_config.clone()),
189 training_dynamics_analyzer: TrainingDynamicsAnalyzer::new(),
190 differential_debugger: DifferentialDebugger::new(
191 config.differential_debugging_config.clone(),
192 ),
193 interpretability_analyzer: InterpretabilityAnalyzer::new(
194 config.interpretability_config.clone(),
195 ),
196 health_checker: crate::health_checker::HealthChecker::new(&config),
197 transformer_debugger,
198 advanced_ml_debugger: AdvancedMLDebugger::new(
199 config.advanced_ml_debugging_config.clone(),
200 ),
201 advanced_gpu_profiler,
202 kernel_optimizer: match KernelOptimizationAnalyzer::new() {
203 Ok(analyzer) => analyzer,
204 Err(e) => {
205 tracing::warn!(
206 "Failed to initialize kernel optimizer: {}, using stub implementation",
207 e
208 );
209 KernelOptimizationAnalyzer::new_stub()
211 },
212 },
213 ai_code_analyzer,
214 distributed_debugger,
215 environmental_monitor,
216 config,
217 }
218 }
219
220 pub fn id(&self) -> Uuid {
222 self.id
223 }
224
225 pub fn config(&self) -> &DebugConfig {
227 &self.config
228 }
229
230 pub fn tensor_inspector(&self) -> &TensorInspector {
232 &self.tensor_inspector
233 }
234
235 pub fn tensor_inspector_mut(&mut self) -> &mut TensorInspector {
237 &mut self.tensor_inspector
238 }
239
240 pub fn gradient_debugger(&self) -> &GradientDebugger {
242 &self.gradient_debugger
243 }
244
245 pub fn gradient_debugger_mut(&mut self) -> &mut GradientDebugger {
247 &mut self.gradient_debugger
248 }
249
250 pub fn model_diagnostics(&self) -> &ModelDiagnostics {
252 &self.model_diagnostics
253 }
254
255 pub fn model_diagnostics_mut(&mut self) -> &mut ModelDiagnostics {
257 &mut self.model_diagnostics
258 }
259
260 pub fn hooks(&self) -> &HookManager {
262 &self.hooks
263 }
264
265 pub fn hooks_mut(&mut self) -> &mut HookManager {
267 &mut self.hooks
268 }
269
270 pub fn profiler(&self) -> &Profiler {
272 &self.profiler
273 }
274
275 pub fn profiler_mut(&mut self) -> &mut Profiler {
277 &mut self.profiler
278 }
279
280 pub fn memory_profiler(&self) -> Option<&MemoryProfiler> {
282 self.memory_profiler.as_ref()
283 }
284
285 pub fn memory_profiler_mut(&mut self) -> Option<&mut MemoryProfiler> {
287 self.memory_profiler.as_mut()
288 }
289
290 pub fn interactive_debugger(&self) -> &InteractiveDebugger {
292 &self.interactive_debugger
293 }
294
295 pub fn interactive_debugger_mut(&mut self) -> &mut InteractiveDebugger {
297 &mut self.interactive_debugger
298 }
299
300 pub fn anomaly_detector(&self) -> &AnomalyDetector {
302 &self.anomaly_detector
303 }
304
305 pub fn anomaly_detector_mut(&mut self) -> &mut AnomalyDetector {
307 &mut self.anomaly_detector
308 }
309
310 pub fn computation_graph_analyzer(&self) -> &ComputationGraphAnalyzer {
312 &self.computation_graph_analyzer
313 }
314
315 pub fn computation_graph_analyzer_mut(&mut self) -> &mut ComputationGraphAnalyzer {
317 &mut self.computation_graph_analyzer
318 }
319
320 pub fn architecture_analyzer(&self) -> &architecture_analysis::ArchitectureAnalyzer {
322 &self.architecture_analyzer
323 }
324
325 pub fn architecture_analyzer_mut(
327 &mut self,
328 ) -> &mut architecture_analysis::ArchitectureAnalyzer {
329 &mut self.architecture_analyzer
330 }
331
332 pub fn behavior_analyzer(&self) -> &BehaviorAnalyzer {
334 &self.behavior_analyzer
335 }
336
337 pub fn behavior_analyzer_mut(&mut self) -> &mut BehaviorAnalyzer {
339 &mut self.behavior_analyzer
340 }
341
342 pub fn training_dynamics_analyzer(&self) -> &TrainingDynamicsAnalyzer {
344 &self.training_dynamics_analyzer
345 }
346
347 pub fn training_dynamics_analyzer_mut(&mut self) -> &mut TrainingDynamicsAnalyzer {
349 &mut self.training_dynamics_analyzer
350 }
351
352 pub fn differential_debugger(&self) -> &DifferentialDebugger {
354 &self.differential_debugger
355 }
356
357 pub fn differential_debugger_mut(&mut self) -> &mut DifferentialDebugger {
359 &mut self.differential_debugger
360 }
361
362 pub fn interpretability_analyzer(&self) -> &InterpretabilityAnalyzer {
364 &self.interpretability_analyzer
365 }
366
367 pub fn interpretability_analyzer_mut(&mut self) -> &mut InterpretabilityAnalyzer {
369 &mut self.interpretability_analyzer
370 }
371
372 pub fn health_checker(&self) -> &crate::health_checker::HealthChecker {
374 &self.health_checker
375 }
376
377 pub fn health_checker_mut(&mut self) -> &mut crate::health_checker::HealthChecker {
379 &mut self.health_checker
380 }
381
382 pub fn transformer_debugger(&self) -> Option<&neural_network_debugging::TransformerDebugger> {
384 self.transformer_debugger.as_ref()
385 }
386
387 pub fn transformer_debugger_mut(
389 &mut self,
390 ) -> Option<&mut neural_network_debugging::TransformerDebugger> {
391 self.transformer_debugger.as_mut()
392 }
393
394 pub fn advanced_ml_debugger(&self) -> &AdvancedMLDebugger {
396 &self.advanced_ml_debugger
397 }
398
399 pub fn advanced_ml_debugger_mut(&mut self) -> &mut AdvancedMLDebugger {
401 &mut self.advanced_ml_debugger
402 }
403
404 pub fn ai_code_analyzer(&self) -> Option<&AICodeAnalyzer> {
406 self.ai_code_analyzer.as_ref()
407 }
408
409 pub fn ai_code_analyzer_mut(&mut self) -> Option<&mut AICodeAnalyzer> {
411 self.ai_code_analyzer.as_mut()
412 }
413
414 pub fn distributed_debugger(&self) -> Option<&DistributedDebugger> {
416 self.distributed_debugger.as_ref()
417 }
418
419 pub fn distributed_debugger_mut(&mut self) -> Option<&mut DistributedDebugger> {
421 self.distributed_debugger.as_mut()
422 }
423
424 pub fn environmental_monitor(&self) -> Option<&EnvironmentalMonitor> {
426 self.environmental_monitor.as_ref()
427 }
428
429 pub fn environmental_monitor_mut(&mut self) -> Option<&mut EnvironmentalMonitor> {
431 self.environmental_monitor.as_mut()
432 }
433
434 pub async fn start(&mut self) -> Result<()> {
436 tracing::info!("Starting debug session {}", self.id);
437
438 if self.config.enable_tensor_inspection {
439 self.tensor_inspector.start().await?;
440 }
441
442 if self.config.enable_gradient_debugging {
443 self.gradient_debugger.start().await?;
444 }
445
446 if self.config.enable_model_diagnostics {
447 self.model_diagnostics.start().await?;
448 }
449
450 self.profiler.start().await?;
451
452 if let Some(ref mut memory_profiler) = self.memory_profiler {
453 memory_profiler.start().await?;
454 }
455
456 self.interactive_debugger.start().await?;
457 self.anomaly_detector.start().await?;
458
459 Ok(())
460 }
461
462 pub async fn stop(&mut self) -> Result<DebugReport> {
464 tracing::info!("Stopping debug session {}", self.id);
465
466 let tensor_report = if self.config.enable_tensor_inspection {
467 Some(self.tensor_inspector.generate_report().await?)
468 } else {
469 None
470 };
471
472 let gradient_report = if self.config.enable_gradient_debugging {
473 Some(self.gradient_debugger.generate_report().await?)
474 } else {
475 None
476 };
477
478 let diagnostics_report = if self.config.enable_model_diagnostics {
479 Some(self.model_diagnostics.generate_report().await?)
480 } else {
481 None
482 };
483
484 let profiler_report = self.profiler.generate_report().await?;
485
486 let memory_profiler_report = if let Some(ref mut memory_profiler) = self.memory_profiler {
487 Some(memory_profiler.stop().await?)
488 } else {
489 None
490 };
491
492 let interactive_debugger_report = self.interactive_debugger.generate_report().await?;
493 let anomaly_report = self.anomaly_detector.generate_report().await?;
494
495 let computation_graph_report = None; let architecture_analysis_report =
500 Some(self.architecture_analyzer.generate_report().await?);
501 let behavior_analysis_report = Some(self.behavior_analyzer.generate_report().await?);
502 let training_dynamics_report =
503 Some(self.training_dynamics_analyzer.generate_report().await?);
504 let differential_debugging_report =
505 Some(self.differential_debugger.generate_report().await?);
506 let interpretability_report = Some(self.interpretability_analyzer.generate_report().await?);
507 let advanced_ml_debugging_report = Some(self.advanced_ml_debugger.generate_report().await?);
508
509 let advanced_gpu_profiling_report = self
511 .advanced_gpu_profiler
512 .as_ref()
513 .map(|profiler| profiler.get_memory_analysis_report());
514
515 let kernel_optimization_report =
516 Some(self.generate_kernel_optimization_summary_report().await?);
517
518 Ok(DebugReport {
519 session_id: self.id,
520 tensor_report,
521 gradient_report,
522 diagnostics_report,
523 profiler_report,
524 memory_profiler_report,
525 interactive_debugger_report,
526 anomaly_report,
527 computation_graph_report,
528 architecture_analysis_report,
529 behavior_analysis_report,
530 training_dynamics_report,
531 differential_debugging_report,
532 interpretability_report,
533 advanced_ml_debugging_report,
534 advanced_gpu_profiling_report,
535 kernel_optimization_report,
536 config: self.config.clone(),
537 })
538 }
539
540 pub async fn export(&self, path: &str) -> Result<()> {
542 let report = self.generate_snapshot().await?;
543 let json = serde_json::to_string_pretty(&report)?;
544 tokio::fs::write(path, json).await?;
545 Ok(())
546 }
547
548 pub async fn generate_snapshot(&self) -> Result<DebugReport> {
550 let tensor_report = if self.config.enable_tensor_inspection {
551 Some(self.tensor_inspector.generate_report().await?)
552 } else {
553 None
554 };
555
556 let gradient_report = if self.config.enable_gradient_debugging {
557 Some(self.gradient_debugger.generate_report().await?)
558 } else {
559 None
560 };
561
562 let diagnostics_report = if self.config.enable_model_diagnostics {
563 Some(self.model_diagnostics.generate_report().await?)
564 } else {
565 None
566 };
567
568 let profiler_report = self.profiler.generate_report().await?;
569
570 let memory_profiler_report = if let Some(ref _memory_profiler) = self.memory_profiler {
571 None } else {
574 None
575 };
576
577 let interactive_debugger_report = self.interactive_debugger.generate_report().await?;
578 let anomaly_report = self.anomaly_detector.generate_report().await?;
579
580 let computation_graph_report = None; let architecture_analysis_report =
585 Some(self.architecture_analyzer.generate_report().await?);
586 let behavior_analysis_report = Some(self.behavior_analyzer.generate_report().await?);
587 let training_dynamics_report =
588 Some(self.training_dynamics_analyzer.generate_report().await?);
589 let differential_debugging_report =
590 Some(self.differential_debugger.generate_report().await?);
591 let interpretability_report = Some(self.interpretability_analyzer.generate_report().await?);
592 let advanced_ml_debugging_report = Some(self.advanced_ml_debugger.generate_report().await?);
593
594 let advanced_gpu_profiling_report = self
596 .advanced_gpu_profiler
597 .as_ref()
598 .map(|profiler| profiler.get_memory_analysis_report());
599
600 let kernel_optimization_report =
601 Some(self.generate_kernel_optimization_summary_report().await?);
602
603 Ok(DebugReport {
604 session_id: self.id,
605 tensor_report,
606 gradient_report,
607 diagnostics_report,
608 profiler_report,
609 memory_profiler_report,
610 interactive_debugger_report,
611 anomaly_report,
612 computation_graph_report,
613 architecture_analysis_report,
614 behavior_analysis_report,
615 training_dynamics_report,
616 differential_debugging_report,
617 interpretability_report,
618 advanced_ml_debugging_report,
619 advanced_gpu_profiling_report,
620 kernel_optimization_report,
621 config: self.config.clone(),
622 })
623 }
624
625 pub fn debug_tensor<T>(&mut self, tensor: &ArrayD<T>, name: &str) -> Result<Uuid>
627 where
628 T: Clone + Into<f64> + fmt::Debug + 'static,
629 {
630 self.tensor_inspector.inspect_tensor(tensor, name, None, None)
631 }
632
633 async fn generate_kernel_optimization_summary_report(
635 &self,
636 ) -> Result<KernelOptimizationSummaryReport> {
637 Ok(KernelOptimizationSummaryReport {
640 total_kernels_analyzed: 0,
641 optimization_opportunities_found: 0,
642 high_impact_optimizations: vec![],
643 fusion_opportunities: 0,
644 regression_alerts: 0,
645 overall_optimization_score: 85.0,
646 top_recommendations: vec!["No kernel analysis data available yet".to_string()],
647 })
648 }
649
650 pub fn debug_gradients<T>(&mut self, _layer_name: &str, gradients: &[T]) -> Result<()>
652 where
653 T: Clone + Into<f64> + fmt::Debug + 'static,
654 {
655 use scirs2_core::ndarray::Array; let gradient_array = Array::from_vec(gradients.to_vec()).into_dyn();
658
659 let tensor_id = Uuid::new_v4();
661
662 self.tensor_inspector.inspect_gradients(tensor_id, &gradient_array)
663 }
664}
665
/// Aggregated output of a [`DebugSession`], produced by `DebugSession::stop`
/// or `DebugSession::generate_snapshot`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DebugReport {
    /// Identifier of the session that produced this report.
    pub session_id: Uuid,
    /// Present only when tensor inspection was enabled in the configuration.
    pub tensor_report: Option<TensorInspectionReport>,
    /// Present only when gradient debugging was enabled in the configuration.
    pub gradient_report: Option<GradientDebugReport>,
    /// Present only when model diagnostics were enabled in the configuration.
    pub diagnostics_report: Option<ModelDiagnosticsReport>,
    pub profiler_report: ProfilerReport,
    /// Filled by `DebugSession::stop` when a memory profiler exists; always
    /// `None` in reports from `DebugSession::generate_snapshot`.
    pub memory_profiler_report: Option<MemoryProfilingReport>,
    pub interactive_debugger_report: InteractiveDebuggerReport,
    pub anomaly_report: AnomalyDetectorReport,
    /// Always `None` in reports built by the session code in this file.
    pub computation_graph_report: Option<GraphAnalysisResult>,
    pub architecture_analysis_report: Option<ArchitectureAnalysisReport>,
    pub behavior_analysis_report: Option<BehaviorAnalysisReport>,
    pub training_dynamics_report: Option<model_diagnostics::training::TrainingDynamicsReport>,
    pub differential_debugging_report: Option<DifferentialDebuggingReport>,
    pub interpretability_report: Option<InterpretabilityReport>,
    pub advanced_ml_debugging_report: Option<AdvancedMLDebuggingReport>,
    /// Present only when the session has an advanced GPU profiler.
    pub advanced_gpu_profiling_report: Option<MemoryAnalysisReport>,
    pub kernel_optimization_report: Option<KernelOptimizationSummaryReport>,
    /// Copy of the configuration the session was running with.
    pub config: DebugConfig,
}
688
689impl DebugReport {
690 pub fn summary(&self) -> DebugSummary {
692 let mut issues = Vec::new();
693 let mut recommendations = Vec::new();
694
695 if let Some(ref tensor_report) = self.tensor_report {
697 if tensor_report.has_nan_values() {
698 issues.push("NaN values detected in tensors".to_string());
699 recommendations.push("Check input data and model initialization".to_string());
700 }
701
702 if tensor_report.has_inf_values() {
703 issues.push("Infinite values detected in tensors".to_string());
704 recommendations.push("Reduce learning rate or add gradient clipping".to_string());
705 }
706 }
707
708 if let Some(ref gradient_report) = self.gradient_report {
710 if gradient_report.has_vanishing_gradients() {
711 issues.push("Vanishing gradients detected".to_string());
712 recommendations
713 .push("Consider residual connections or gradient scaling".to_string());
714 }
715
716 if gradient_report.has_exploding_gradients() {
717 issues.push("Exploding gradients detected".to_string());
718 recommendations.push("Add gradient clipping".to_string());
719 }
720 }
721
722 DebugSummary {
723 session_id: self.session_id,
724 total_issues: issues.len(),
725 critical_issues: issues
726 .iter()
727 .filter(|i| i.contains("NaN") || i.contains("exploding"))
728 .count(),
729 issues,
730 recommendations,
731 }
732 }
733}
734
/// Condensed view of a [`DebugReport`], produced by `DebugReport::summary`.
#[derive(Debug, Serialize, Deserialize)]
pub struct DebugSummary {
    /// Identifier of the session the summarized report belongs to.
    pub session_id: Uuid,
    /// Number of entries in `issues`.
    pub total_issues: usize,
    /// Number of issues that `DebugReport::summary` classified as critical.
    pub critical_issues: usize,
    /// Human-readable descriptions of the detected issues.
    pub issues: Vec<String>,
    /// Human-readable suggestions accompanying the detected issues.
    pub recommendations: Vec<String>,
}
744
745pub fn debug_session() -> DebugSession {
747 DebugSession::new(DebugConfig::default())
748}
749
/// Creates a [`DebugSession`] from the given `config`.
///
/// Convenience wrapper around `DebugSession::new`.
pub fn debug_session_with_config(config: DebugConfig) -> DebugSession {
    DebugSession::new(config)
}
754
755pub fn debug_session_with_transformer() -> DebugSession {
757 let config = DebugConfig {
758 neural_network_debugging_config: Some(
759 neural_network_debugging::TransformerDebugConfig::default(),
760 ),
761 ..Default::default()
762 };
763 DebugSession::new(config)
764}