Expand description
Distributed training support for ToRSh
This crate provides distributed training capabilities including:
- Data parallel training (DDP)
- Model parallel training
- Pipeline parallelism
- Collective communication operations
- RPC framework
Re-exports§
pub use backend::Backend;pub use backend::BackendType;pub use backend::ReduceOp;pub use bottleneck_detection::init_global_bottleneck_detector;pub use bottleneck_detection::run_global_bottleneck_detection;pub use bottleneck_detection::with_global_bottleneck_detector;pub use bottleneck_detection::Bottleneck;pub use bottleneck_detection::BottleneckDetectionConfig;pub use bottleneck_detection::BottleneckDetector;pub use bottleneck_detection::BottleneckSeverity;pub use bottleneck_detection::BottleneckThresholds;pub use bottleneck_detection::BottleneckType;pub use collectives::all_gather;pub use collectives::all_gather_group;pub use collectives::all_reduce;pub use collectives::all_reduce_group;pub use collectives::all_to_all;pub use collectives::barrier;pub use collectives::barrier_group;pub use collectives::broadcast;pub use collectives::broadcast_group;pub use collectives::bucket_all_reduce;pub use collectives::hierarchical_all_reduce;pub use collectives::irecv;pub use collectives::isend;pub use collectives::recv;pub use collectives::recv_group;pub use collectives::reduce;pub use collectives::reduce_group;pub use collectives::reduce_scatter;pub use collectives::ring_all_reduce;pub use collectives::scatter;pub use collectives::send;pub use collectives::send_group;pub use collectives::CommunicationGroup;pub use collectives::GroupManager;pub use communication::deserialize_message;pub use communication::deserialize_tensor;pub use communication::retry_with_backoff;pub use communication::serialize_message;pub use communication::serialize_tensor;pub use communication::validate_backend_initialized;pub use communication::validate_rank;pub use communication::with_backend_read;pub use communication::with_backend_write;pub use communication::wrap_communication_error;pub use communication::CommunicationStats;pub use communication::StatsCollector;pub use communication_scheduler::CommunicationOp;pub use communication_scheduler::CommunicationScheduler;pub use 
communication_scheduler::CommunicationTask;pub use communication_scheduler::Priority;pub use communication_scheduler::SchedulerConfig;pub use communication_scheduler::SchedulerStats;pub use communication_scheduler::SchedulingStrategy;pub use dask_integration::DaskArrayConfig;pub use dask_integration::DaskBagConfig;pub use dask_integration::DaskClusterConfig;pub use dask_integration::DaskClusterType;pub use dask_integration::DaskConfig;pub use dask_integration::DaskDataFrameConfig;pub use dask_integration::DaskDistributedConfig;pub use dask_integration::DaskIntegration;pub use dask_integration::DaskMLConfig;pub use dask_integration::DaskMLSearchMethod;pub use dask_integration::DaskScalingConfig;pub use dask_integration::DaskSchedulerConfig;pub use dask_integration::DaskSecurityConfig;pub use dask_integration::DaskShuffleMethod;pub use dask_integration::DaskStats;pub use dask_integration::DaskWorkerConfig;pub use ddp::BucketConfig;pub use ddp::BucketInfo;pub use ddp::DistributedDataParallel;pub use ddp::GradientSyncStats;pub use ddp::OverlapConfig;pub use debugging::get_global_debugger;pub use debugging::init_global_debugger;pub use debugging::ActiveOperation;pub use debugging::CommunicationState;pub use debugging::DebugConfig;pub use debugging::DebugEvent;pub use debugging::DiagnosticResult;pub use debugging::DistributedDebugger;pub use debugging::LogLevel;pub use debugging::ProcessGroupState;pub use debugging::ResourceState;pub use debugging::SystemStateSnapshot;pub use deepspeed_integration::ActivationCheckpointingConfig;pub use deepspeed_integration::DeepSpeedConfig;pub use deepspeed_integration::DeepSpeedIntegration;pub use deepspeed_integration::DeepSpeedStats;pub use deepspeed_integration::FP16Config;pub use deepspeed_integration::OffloadOptimizerConfig;pub use deepspeed_integration::OffloadParamConfig;pub use deepspeed_integration::ZeroOptimizationConfig;pub use deepspeed_integration::ZeroStage;pub use edge_computing::AdaptiveCommunicationParams;pub use 
edge_computing::AggregationSchedule;pub use edge_computing::AggregationStrategy;pub use edge_computing::BandwidthAdaptationConfig;pub use edge_computing::BandwidthMonitor;pub use edge_computing::ClientSelectionStrategy;pub use edge_computing::CommunicationManager;pub use edge_computing::ComputeCapability;pub use edge_computing::ConnectionType;pub use edge_computing::DataInfo;pub use edge_computing::DataLimits;pub use edge_computing::DeviceDiscoveryConfig;pub use edge_computing::DeviceLocation;pub use edge_computing::DeviceResources;pub use edge_computing::DeviceStatus;pub use edge_computing::DeviceType;pub use edge_computing::DiscoveryProtocol;pub use edge_computing::EdgeComputingConfig;pub use edge_computing::EdgeComputingManager;pub use edge_computing::EdgeDevice;pub use edge_computing::EdgeOptimizationConfig;pub use edge_computing::FederatedAlgorithm;pub use edge_computing::FederatedLearningConfig;pub use edge_computing::FederatedLearningCoordinator;pub use edge_computing::HierarchicalTrainingConfig;pub use edge_computing::HierarchicalTrainingCoordinator;pub use edge_computing::NetworkInfo;pub use edge_computing::PrivacyConfig;pub use edge_computing::PrivacyLevel;pub use edge_computing::PrivacyManager;pub use edge_computing::PrivacyMechanism;pub use edge_computing::ThermalState;pub use edge_computing::TrainingTier;pub use error_recovery::CircuitBreaker;pub use error_recovery::CircuitBreakerConfig;pub use error_recovery::CircuitBreakerState;pub use error_recovery::FailureDetector;pub use error_recovery::HealthChecker;pub use error_recovery::HealthStatus;pub use error_recovery::RetryConfig;pub use error_recovery::RetryExecutor;pub use error_recovery::RetryStats;pub use expert_parallelism::DistributedExpertManager;pub use expert_parallelism::ExpertAssignment;pub use expert_parallelism::ExpertParallelismConfig;pub use expert_parallelism::ExpertParameters;pub use expert_parallelism::ExpertRouter;pub use expert_parallelism::ExpertShardingStrategy;pub use 
expert_parallelism::RoutingDecision;pub use expert_parallelism::RoutingStats;pub use fairscale_integration::FairScaleActivationCheckpointingConfig;pub use fairscale_integration::FairScaleAutoWrapPolicy;pub use fairscale_integration::FairScaleBalanceMode;pub use fairscale_integration::FairScaleCheckpointingStrategy;pub use fairscale_integration::FairScaleConfig;pub use fairscale_integration::FairScaleFsdpConfig;pub use fairscale_integration::FairScaleGradScalerConfig;pub use fairscale_integration::FairScaleIntegration;pub use fairscale_integration::FairScaleMemoryOptimizationConfig;pub use fairscale_integration::FairScaleOssConfig;pub use fairscale_integration::FairScalePipelineConfig;pub use fairscale_integration::FairScalePipelineSchedule;pub use fairscale_integration::FairScaleStats;pub use fault_tolerance::checkpoint_utils;pub use fault_tolerance::CheckpointConfig;pub use fault_tolerance::CheckpointManager;pub use fault_tolerance::DistributedMetadata;pub use fault_tolerance::ElasticConfig;pub use fault_tolerance::ElasticTrainingManager;pub use fault_tolerance::ScalingEvent;pub use fault_tolerance::ScalingState;pub use fault_tolerance::TrainingCheckpoint;pub use fsdp::auto_wrap_modules;pub use fsdp::fsdp_wrap;pub use fsdp::AutoWrapPolicy;pub use fsdp::BackwardPrefetch;pub use fsdp::FsdpConfig;pub use fsdp::FullyShardedDataParallel;pub use fsdp::MemoryConfig;pub use fsdp::MemoryStats;pub use fsdp::MixedPrecisionConfig;pub use fsdp::ShardInfo as FsdpShardInfo;pub use fsdp::ShardingStrategy;pub use gradient_compression::CompressedData;pub use gradient_compression::CompressedGradient;pub use gradient_compression::CompressionConfig;pub use gradient_compression::CompressionMetadata;pub use gradient_compression::CompressionMethod;pub use gradient_compression::CompressionStats;pub use gradient_compression::GradientCompressor;pub use green_computing::CarbonFootprintData;pub use green_computing::DeviceEnergyData;pub use green_computing::GreenComputingConfig;pub use 
green_computing::GreenComputingManager;pub use green_computing::GreenOptimizationStrategy;pub use green_computing::GreenTrainingScheduler;pub use green_computing::PowerManagementStrategy;pub use green_computing::RenewableEnergyData;pub use green_computing::ScheduleAction;pub use green_computing::SustainabilityMetrics;pub use green_computing::SustainabilityReport;pub use green_computing::SustainabilityReportingConfig;pub use green_computing::TrainingScheduleRecommendation;pub use green_computing::TrainingWindow;pub use horovod_integration::HorovodCompressionConfig;pub use horovod_integration::HorovodCompressionType;pub use horovod_integration::HorovodConfig;pub use horovod_integration::HorovodElasticConfig;pub use horovod_integration::HorovodIntegration;pub use horovod_integration::HorovodOptimizerFusionConfig;pub use horovod_integration::HorovodStats;pub use horovod_integration::HorovodTimelineConfig;pub use metrics::get_global_metrics_collector;pub use metrics::init_global_metrics_collector;pub use metrics::start_global_metrics_collection;pub use metrics::stop_global_metrics_collection;pub use metrics::CommunicationMetrics;pub use metrics::MetricsCollector;pub use metrics::MetricsConfig;pub use metrics::PerformanceMetrics;pub use metrics::SystemMetrics;pub use metrics::TimeSeries;pub use metrics::TimeSeriesPoint;pub use metrics::TrainingMetrics;pub use parameter_server::ParameterServer;pub use parameter_server::ParameterServerClient;pub use parameter_server::ParameterServerConfig;pub use parameter_server::ParameterServerMessage;pub use parameter_server::ParameterServerResponse;pub use parameter_server::ParameterServerStats;pub use pipeline::create_pipeline_stages;pub use pipeline::PipelineConfig;pub use pipeline::PipelineParallel;pub use pipeline::PipelineStage;pub use pipeline::PipelineStats;pub use pipeline::ScheduleType;pub use process_group::ProcessGroup;pub use process_group::Rank;pub use process_group::WorldSize;pub use profiling::get_global_profiler;pub use 
profiling::init_global_profiler;pub use profiling::CommunicationEvent;pub use profiling::CommunicationOpType;pub use profiling::CommunicationProfiler;pub use profiling::OperationStats;pub use profiling::ProfilingConfig;pub use profiling::ProfilingTimer;pub use ray_integration::RayCheckpointConfig;pub use ray_integration::RayClusterConfig;pub use ray_integration::RayConfig;pub use ray_integration::RayDataConfig;pub use ray_integration::RayDataFormat;pub use ray_integration::RayFailureConfig;pub use ray_integration::RayFaultToleranceConfig;pub use ray_integration::RayIntegration;pub use ray_integration::RayPlacementGroupStrategy;pub use ray_integration::RayResourceConfig;pub use ray_integration::RayRunConfig;pub use ray_integration::RayScalingConfig;pub use ray_integration::RayScheduler;pub use ray_integration::RaySearchAlgorithm;pub use ray_integration::RayServeConfig;pub use ray_integration::RayStats;pub use ray_integration::RayTrainBackend;pub use ray_integration::RayTrainConfig;pub use ray_integration::RayTuneConfig;pub use rdma_support::CompletionStatus;pub use rdma_support::MemoryAccess;pub use rdma_support::MemoryRegion;pub use rdma_support::MemoryRegistration;pub use rdma_support::RdmaConfig;pub use rdma_support::RdmaConnectionManager;pub use rdma_support::RdmaEndpoint;pub use rdma_support::RdmaError;pub use rdma_support::RdmaMemoryPool;pub use rdma_support::RdmaMemoryPoolConfig;pub use rdma_support::RdmaOperation;pub use rdma_support::RdmaProtocol;pub use rdma_support::RdmaQoS;pub use rdma_support::RdmaResult;pub use rdma_support::RdmaStatistics;pub use rdma_support::RdmaTensorScheduler;pub use rdma_support::WorkCompletion;pub use rdma_support::WorkRequest;pub use rpc::delete_rref;pub use rpc::get_worker_rank;pub use rpc::get_world_size;pub use rpc::init_rpc;pub use rpc::is_initialized;pub use rpc::register_function;pub use rpc::remote;pub use rpc::rpc_async;pub use rpc::shutdown;pub use rpc::RRef;pub use rpc::RpcBackendOptions;pub use rpc::RpcMessage;pub 
use store::create_store;pub use store::FileStore;pub use store::MemoryStore;pub use store::Store;pub use store::StoreBackend;pub use store::StoreConfig;pub use store::StoreValue;pub use tensor_parallel::ShardInfo as TpShardInfo;pub use tensor_parallel::TensorParallel;pub use tensor_parallel::TensorParallelConfig;pub use tensor_parallel::TensorParallelLayer;pub use tensor_parallel::TensorParallelStats;pub use tensor_parallel::TensorParallelStrategy;pub use three_d_parallelism::CommunicationStrategy;pub use three_d_parallelism::LayerShard;pub use three_d_parallelism::LayerType;pub use three_d_parallelism::Memory3DStats;pub use three_d_parallelism::MemoryOptimizationStrategy;pub use three_d_parallelism::ModelShards;pub use three_d_parallelism::Performance3DStats;pub use three_d_parallelism::RankMapping;pub use three_d_parallelism::ThreeDParallelismConfig;pub use three_d_parallelism::ThreeDParallelismCoordinator;pub use training_analytics_dashboard::CommunicationAnalytics;pub use training_analytics_dashboard::CommunicationHotspot;pub use training_analytics_dashboard::CommunicationPatterns;pub use training_analytics_dashboard::ConvergenceAnalytics;pub use training_analytics_dashboard::DashboardConfig;pub use training_analytics_dashboard::DashboardExport;pub use training_analytics_dashboard::EfficiencyAnalytics;pub use training_analytics_dashboard::OptimizationRecommendation;pub use training_analytics_dashboard::RecommendationCategory;pub use training_analytics_dashboard::ResourceBottleneck;pub use training_analytics_dashboard::ResourceUtilizationAnalytics;pub use training_analytics_dashboard::SystemHealthAnalytics;pub use training_analytics_dashboard::TrainingAnalytics;pub use training_analytics_dashboard::TrainingAnalyticsDashboard;pub use training_analytics_dashboard::TrainingPerformanceAnalytics;pub use training_analytics_dashboard::TrainingSummaryReport;pub use visualization::generate_communication_network_html;pub use 
visualization::generate_monitoring_dashboard;pub use visualization::Chart;pub use visualization::ChartSeries;pub use visualization::ChartType;pub use visualization::ColorScheme;pub use visualization::Dashboard;pub use visualization::DashboardLayout;pub use visualization::DataPoint;pub use visualization::VisualizationConfig;pub use visualization::VisualizationGenerator;pub use zero_3_cpu_offload::AutoMemoryStrategy;pub use zero_3_cpu_offload::ConfigModelParameters as ModelParameters;pub use zero_3_cpu_offload::CpuCompressionMethod;pub use zero_3_cpu_offload::Zero3CpuOffloadConfig;pub use zero_3_cpu_offload::Zero3CpuOffloadManager;pub use zero_3_cpu_offload::Zero3MemoryStats;pub use zero_3_cpu_offload::Zero3PerformanceStats;
Modules§
- advanced_monitoring - Advanced Monitoring and Performance Analytics for Distributed Training
- alerting - Real-time Alerting System for Distributed Training
- backend - Distributed backend implementations
- bottleneck_detection - Bottleneck detection algorithms for distributed training
- collectives - Collective communication operations
- communication - Communication utilities for consolidating common patterns across distributed modules
- communication_scheduler - Communication Scheduler for Distributed Training
- dask_integration - Dask integration for ToRSh distributed training
- ddp - Distributed Data Parallel (DDP) implementation
- debugging - Debugging utilities for distributed training systems
- deepspeed_integration - DeepSpeed integration for ToRSh distributed training
- distributed_memory_optimization - Distributed Memory Optimization for Training
- distributed_monitoring - Advanced Distributed Training Monitoring System
- edge_computing - Edge Computing for Distributed Training
- enhanced_benchmarks - Enhanced Benchmarking Suite for Distributed Training Features
- enhanced_fault_tolerance - Enhanced Fault Tolerance for Distributed Training
- error_recovery - Error handling and recovery mechanisms for distributed training
- expert_parallelism - Expert Parallelism Module
- fairscale_integration - FairScale compatibility layer for ToRSh distributed training
- fault_tolerance - Fault tolerance features for distributed training
- fsdp - Fully Sharded Data Parallel (FSDP) implementation
- gradient_compression - Gradient Compression for Distributed Training
- gradient_compression_enhanced - Enhanced Gradient Compression with Performance Optimizations
- green_computing - Green Computing for Distributed Training
- horovod_integration - Horovod compatibility layer for ToRSh distributed training
- metrics - Performance metrics collection for distributed training
- network_aware_compression - Network-Aware Adaptive Gradient Compression
- parameter_server - Parameter Server implementation for distributed training
- pipeline - Pipeline parallelism implementation for distributed training
- prelude - Prelude module for convenient imports
- process_group - Process group management for distributed training
- profiling - Communication profiling and performance monitoring for distributed training
- prometheus_exporter - Prometheus Metrics Exporter for Distributed Training
- ray_integration - Ray integration for ToRSh distributed training
- rdma_support - RDMA (Remote Direct Memory Access) Support for High-Performance Distributed Computing
- rpc - Remote Procedure Call (RPC) framework for distributed training
- store - Distributed store for process coordination
- tensor_parallel - Tensor Parallelism implementation for distributed training
- three_d_parallelism - Modular 3D Parallelism System
- training_analytics_dashboard - Distributed Training Analytics Dashboard
- visualization - Visualization tools for distributed training monitoring
- zero_3_cpu_offload - ZeRO-3 CPU Offloading Module
Macros§
- debug_log - Convenience macros for debugging
- debug_trace_operation
- profile_communication - Convenience macro for profiling communication operations
Enums§
- TorshDistributedError - Distributed training specific errors with detailed context
Constants§
Functions§
- init_process_group - Initialize the distributed process group
- is_available - Check if distributed training is available
- is_gloo_available - Check if Gloo backend is available
- is_mpi_available - Check if MPI backend is available
- is_nccl_available - Check if NCCL backend is available
Type Aliases§
- TorshResult - Type alias for Results with TorshDistributedError