Expand description
Performance profiling for ToRSh
This crate provides comprehensive performance profiling capabilities for the ToRSh deep learning framework, including CPU, GPU, memory, and system profiling.
§Refactored Modular Structure
The profiler has been successfully refactored from a massive 9,517-line monolithic file into a clean, maintainable modular structure:
- core: Core profiling types, event management, and profiler implementation
- platforms: Platform-specific profiling (CPU, GPU, system)
- analysis: Performance analysis and optimization recommendations
- export: Export and reporting functionality with multiple format support
- distributed: Distributed profiling coordination
§Usage Examples
§Basic Profiling
use torsh_profiler::{start_profiling, stop_profiling, profile_scope};
// Start global profiling
start_profiling();
{
profile_scope!("computation");
// Your code here
}
stop_profiling();
§Advanced Profiling with Metrics
use torsh_profiler::{MetricsScope, export_global_events, ExportFormat};
fn main() -> Result<(), Box<dyn std::error::Error>> {
{
let mut scope = MetricsScope::new("training_step");
scope.set_operation_count(1000);
scope.set_flops(50000);
scope.set_bytes_transferred(4096);
// Training code here
}
// Export results
export_global_events(ExportFormat::ChromeTrace, "profile.json")?;
Ok(())
}
§Platform-Specific Profiling
use torsh_profiler::{UnifiedProfiler, CudaProfiler, MemoryProfiler};
fn main() -> Result<(), Box<dyn std::error::Error>> {
let mut profiler = UnifiedProfiler::with_auto_detection();
profiler.start_all()?;
// Your GPU/CPU workload
profiler.stop_all()?;
Ok(())
}
Re-exports§
pub use core::add_global_event;
pub use core::add_global_event as add_event;
pub use core::clear_global_events;
pub use core::get_global_stats;
pub use core::global_profiler;
pub use core::profile_function_with_category;
pub use core::start_profiling;
pub use core::stop_profiling;
pub use core::MetricsScope;
pub use core::Profiler;
pub use core::ScopeGuard;
pub use export::available_format_names;
pub use export::export_chrome_trace_format;
pub use export::export_csv_format;
pub use export::export_events;
pub use export::export_global_events;
pub use export::export_json_format;
pub use export::parse_format;
pub use export::ExportFormat;
pub use prometheus::PrometheusExporter;
pub use prometheus::PrometheusExporterBuilder;
pub use grafana::Dashboard as GrafanaDashboard;
pub use grafana::DashboardTemplates;
pub use grafana::GrafanaDashboardGenerator;
pub use grafana::GridPos;
pub use grafana::Panel;
pub use grafana::Target;
pub use cloudwatch::CloudWatchConfig;
pub use cloudwatch::CloudWatchPublisher;
pub use cloudwatch::CloudWatchPublisherBuilder;
pub use cloudwatch::Dimension;
pub use cloudwatch::MetricDatum;
pub use cloudwatch::StatisticSet;
pub use cloudwatch::Unit as CloudWatchUnit;
pub use streaming::create_high_performance_streaming_engine;
pub use streaming::create_low_latency_streaming_engine;
pub use streaming::create_streaming_engine;
pub use streaming::AdaptiveBitrateConfig;
pub use streaming::AdaptiveRateController;
pub use streaming::AdjustmentReason;
pub use streaming::AdvancedFeatures;
pub use streaming::BitrateAdjustment;
pub use streaming::BufferedEvent;
pub use streaming::CompressionAlgorithm;
pub use streaming::CompressionConfig;
pub use streaming::CompressionManager;
pub use streaming::ConnectionManager;
pub use streaming::ControlMessage;
pub use streaming::EnhancedStreamingEngine;
pub use streaming::EventBuffer;
pub use streaming::EventPriority;
pub use streaming::ProtocolConfig;
pub use streaming::QualityConfig;
pub use streaming::QualityLevel;
pub use streaming::QualityMetricsThreshold;
pub use streaming::SSEConnection;
pub use streaming::StreamConnection;
pub use streaming::StreamingConfig;
pub use streaming::StreamingProtocol;
pub use streaming::StreamingStats;
pub use streaming::StreamingStatsSnapshot;
pub use streaming::TcpConnection;
pub use streaming::UdpConnection;
pub use streaming::WebSocketConnection;
pub use streaming::WebSocketMessage;
pub use alerts::create_alert_manager_with_config;
pub use alerts::get_alert_manager;
pub use alerts::AlertConfig;
pub use alerts::AlertManager;
pub use alerts::NotificationChannel;
pub use attributes::get_registry;
pub use attributes::with_profiling;
pub use attributes::AsyncProfiler;
pub use attributes::AttributeRegistry;
pub use attributes::ConditionalProfiler;
pub use attributes::ProfileAttribute;
pub use attributes::ProfiledFunction;
pub use attributes::ProfiledStruct;
pub use chrome_trace::create_chrome_event;
pub use chrome_trace::export;
pub use chrome_trace::export_to_writer;
pub use chrome_trace::phases;
pub use chrome_trace::scopes;
pub use ci_cd::CiCdConfig;
pub use ci_cd::CiCdIntegration;
pub use ci_cd::CiCdPlatform;
pub use cpu::CpuProfiler;
pub use cpu::ProfileScope;
pub use cuda::get_cuda_device_properties;
pub use cuda::get_cuda_memory_stats;
pub use cuda::CudaEvent;
pub use cuda::CudaMemoryStats;
pub use cuda::CudaProfiler;
pub use cuda::CudaSynchronizationStats;
pub use cuda::NvtxRange;
pub use custom_export::CsvColumn;
pub use custom_export::CsvFormatter;
pub use custom_export::CustomExportFormat;
pub use custom_export::CustomExporter;
pub use custom_export::ExportSchema;
pub use dashboard::alerts::create_alert_manager;
pub use dashboard::create_dashboard;
pub use dashboard::create_dashboard_with_config;
pub use dashboard::export_dashboard_html;
pub use dashboard::generate_3d_landscape;
pub use dashboard::generate_performance_heatmap;
pub use dashboard::Dashboard;
pub use dashboard::DashboardAlert;
pub use dashboard::DashboardAlertSeverity;
pub use dashboard::DashboardConfig;
pub use dashboard::DashboardData;
pub use dashboard::HeatmapCell;
pub use dashboard::MemoryMetrics;
pub use dashboard::OperationSummary;
pub use dashboard::PerformanceHeatmap;
pub use dashboard::PerformanceLandscape;
pub use dashboard::PerformanceMetrics;
pub use dashboard::PerformancePoint3D;
pub use dashboard::SystemMetrics;
pub use dashboard::VisualizationColorScheme;
pub use dashboard::VisualizationConfig;
pub use dashboard::WebSocketConfig;
pub use scirs2_integration::AdvancedProfilingConfig;
pub use scirs2_integration::BenchmarkResults;
pub use scirs2_integration::HistogramStats;
pub use scirs2_integration::MetricsSummary;
pub use scirs2_integration::PerformanceAnalysis;
pub use scirs2_integration::PerformanceTargets;
pub use scirs2_integration::SamplingStrategy;
pub use scirs2_integration::ScirS2EnhancedProfiler;
pub use scirs2_integration::ScirS2ProfilingData;
pub use scirs2_integration::ValidationLevel;
pub use instruments::create_instruments_profiler;
pub use instruments::create_instruments_profiler_with_config;
pub use instruments::export_instruments_json;
pub use instruments::get_instruments_statistics;
pub use instruments::AllocationType;
pub use instruments::EnergyComponent;
pub use instruments::InstrumentsConfig;
pub use instruments::InstrumentsExportData;
pub use instruments::InstrumentsProfiler;
pub use instruments::InstrumentsStats;
pub use instruments::SignpostInterval;
pub use macros::ProfileResult;
pub use memory::FragmentationAnalysis;
pub use memory::LeakDetectionResults;
pub use memory::MemoryBlock;
pub use memory::MemoryEvent;
pub use memory::MemoryEventType;
pub use memory::MemoryLeak;
pub use memory::MemoryProfiler;
pub use memory::MemoryStats;
pub use memory::MemoryTimeline;
pub use memory::SystemMemoryInfo;
pub use memory_optimization::create_memory_optimizer;
pub use memory_optimization::create_memory_optimizer_for_low_memory;
pub use memory_optimization::create_memory_optimizer_with_aggressive_settings;
pub use memory_optimization::AdaptivePoolManager;
pub use memory_optimization::AdvancedMemoryOptimizer;
pub use memory_optimization::MemoryOptimizationConfig;
pub use memory_optimization::MemoryOptimizationStats;
pub use memory_optimization::MemorySnapshot;
pub use memory_optimization::MemoryStrategies;
pub use memory_optimization::MemoryUsagePredictor;
pub use memory_optimization::OptimizationExportData;
pub use memory_optimization::OptimizationStatsSummary;
pub use core::metrics::CorrelationAnalysis;
pub use core::metrics::CorrelationStrength;
pub use core::metrics::CorrelationSummary;
pub use core::metrics::CorrelationType;
pub use core::metrics::MemoryCorrelation;
pub use core::metrics::OperationCorrelation;
pub use core::metrics::PerformanceCorrelation;
pub use core::metrics::TemporalCorrelation;
pub use core::events::*;
pub use core::metrics::*;
pub use export::dashboard::*;
pub use export::formats::*;
pub use export::reporting::*;
pub use platforms::cpu::*;
pub use platforms::gpu::*;
pub use platforms::system::*;
pub use analysis::ml_analysis::*;
pub use analysis::optimization::*;
pub use analysis::regression::*;
pub use distributed::profiling::*;
Modules§
- advanced_visualization - Advanced Visualization Export
- alerts - Alert system for performance monitoring
- amd - AMD Tools Integration
- analysis - Performance analysis and optimization recommendations
- attributes - Attribute-based profiling support
- chrome_trace - Chrome tracing format export
- ci_cd - CI/CD integration for performance profiling
- cloud_providers - Cloud Provider Integrations for ToRSh Profiler
- cloudwatch - AWS CloudWatch metrics integration for torsh-profiler
- core - Core profiling functionality: core profiling types and utilities
- cpu - CPU profiling
- cross_platform - Cross-platform Profiling Support
- cuda - CUDA profiling
- custom_export - Custom export formats for profiling data
- custom_tools - Custom tool APIs for profiler integration
- dashboard - Real-time Performance Dashboard
- distributed - Distributed profiling coordination
- export - Export and reporting functionality
- grafana - Grafana dashboard integration for torsh-profiler
- instruments - Apple Instruments profiling integration
- integrated_profiler - Integrated Profiler System
- kubernetes - Kubernetes Operator for Cloud-Native Profiling
- macros - Convenient macros for profiling operations
- memory - Memory profiling
- memory_optimization - Advanced Memory Optimization Features
- ml_analysis - Machine Learning-based performance analysis
- nsight - NVIDIA Nsight profiling integration
- online_learning - Online Learning Module for Real-time Performance Analysis
- optimization - Performance optimizations and overhead minimization
- platforms - Platform-specific profiling implementations
- power - Power profiling capabilities for energy-efficient performance monitoring
- prelude - Prelude module for convenient imports
- prometheus - Prometheus metrics integration for torsh-profiler
- regression - Performance regression detection system
- reporting - Comprehensive reporting system for performance profiling
- scirs2_integration - SCIRS2 Integration for Advanced Profiling
- streaming - Enhanced Real-time Streaming Capabilities
- tensorboard - TensorBoard export functionality
- thermal - Thermal analysis system for performance profiling
- vtune - Intel VTune profiling integration
- workload_characterization - Workload Characterization
Macros§
- benchmark_scirs2 - Macro for advanced benchmarking with SciRS2
- collect_scirs2_metrics - Macro for advanced metrics collection
- cuda_nvtx_range - Macro for NVTX range profiling
- profile_alloc - Profile memory allocation with tracking
- profile_async - Profile async operations
- profile_attribute - Attribute-like macro for profiling functions
- profile_block - Profile a block of code with automatic naming
- profile_closure - Profile a closure with optional name and category
- profile_compare - Benchmark and profile comparison between different implementations
- profile_cuda - Profile CUDA operations
- profile_current_function - Profile the current function automatically
- profile_function
- profile_if - Conditionally profile based on a feature flag or condition
- profile_loop - Profile loop iterations with automatic batching
- profile_metrics
- profile_sampled - Profile with sampling (only profile every N calls)
- profile_scirs2_comprehensive - Enhanced macro for comprehensive metrics profiling
- profile_scirs2_sampling - Convenient macros for SCIRS2-enhanced profiling
- profile_scirs2_validated - Advanced profiling macro with validation
- profile_scope - Macros for convenient scope profiling
- profile_tensor_op - Profile tensor operations with automatic FLOPS counting
- profile_thread_local - Profile with thread-local storage for reduced overhead
- profile_with_metadata - Profile with custom metadata
- profile_with_overhead - Profile with automatic overhead measurement
- profiled_fn - Helper macro for creating profiled function wrappers
- profiling_scope - Create a profiling scope with automatic cleanup
Structs§
- AnomalyAnalysis - Anomaly analysis result structure
- BottleneckAnalysis - Bottleneck analysis results
- BottleneckEvent - A performance bottleneck event
- EfficiencyIssue - Efficiency issue
- MemoryAnomaly - Memory anomaly data structure
- MemoryHotspot - Memory hotspot information
- OverheadStats - Overhead statistics for profiling operations
- PatternAnalysis - Pattern analysis result structure
- PerformanceAnomaly - Performance anomaly data structure
- PerformancePattern - Performance pattern data structure
- ProfileEvent - Core profiling event structure
- ThreadContentionEvent - Thread contention event
- UnifiedProfiler - Enhanced unified profiler combining all platform profilers with simplified API
Enums§
- BottleneckSeverity - Severity of a bottleneck
- EfficiencyIssueType - Type of efficiency issue
Constants§
Functions§
- analyze_global_correlations - Analyze global correlations with proper implementation
- are_global_stack_traces_enabled - Check if global stack traces are enabled
- create_basic_profiler - Create a basic profiler for development
- create_production_profiler - Create a profiler optimized for production use
- create_unified_profiler - Create a unified profiler with automatic platform detection
- detect_global_anomalies - Detect global anomalies in profiling data (stub implementation)
- detect_global_patterns - Detect global patterns in profiling data (stub implementation)
- export_duration_histogram - Export duration histogram (stub implementation)
- export_global_anomaly_analysis - Export global anomaly analysis (stub implementation)
- export_global_correlation_analysis - Export global correlation analysis (stub implementation)
- export_global_csv
- export_global_custom - Export using a custom format
- export_global_json
- export_global_pattern_analysis - Export global pattern analysis (stub implementation)
- export_global_tensorboard
- export_global_trace - Enhanced global export functions with multiple format support
- export_memory_scatter_plot - Export memory scatter plot (stub implementation)
- export_operation_frequency_chart - Export operation frequency chart (stub implementation)
- export_performance_trend_chart - Export performance trend chart (stub implementation)
- get_global_custom_export_formats - Get available custom export format names
- get_global_overhead_stats
- is_global_overhead_tracking_enabled
- register_global_custom_export_format - Register a custom export format globally
- reset_global_overhead_stats
- set_global_overhead_tracking_enabled - Enhanced overhead tracking
- set_global_stack_traces_enabled - Set global stack traces enabled with enhanced functionality
Type Aliases§
- TorshResult - Convenience type alias for Results in this crate