pub struct DistributedTrainer { /* private fields */ }
Expand description
Distributed training coordinator
Implementations§
Source§impl DistributedTrainer
impl DistributedTrainer
Source pub fn new(
gpu_ids: Vec<usize>,
strategy: ParallelismStrategy,
config: TrainingConfig,
) -> RusTorchResult<Self>
pub fn new( gpu_ids: Vec<usize>, strategy: ParallelismStrategy, config: TrainingConfig, ) -> RusTorchResult<Self>
Create new distributed trainer
Source pub fn enable_profiling(&mut self) -> RusTorchResult<()>
pub fn enable_profiling(&mut self) -> RusTorchResult<()>
Enable performance profiling
Source pub fn get_profiling_report(&self) -> Option<PerformanceReport>
pub fn get_profiling_report(&self) -> Option<PerformanceReport>
Get profiling report
Source pub fn get_gpu_count(&self) -> usize
pub fn get_gpu_count(&self) -> usize
Get GPU count
Source pub fn training_step(
&mut self,
model_parameters: &HashMap<String, Tensor<f32>>,
gradients: HashMap<String, Vec<Tensor<f32>>>,
) -> RusTorchResult<HashMap<String, Tensor<f32>>>
pub fn training_step( &mut self, model_parameters: &HashMap<String, Tensor<f32>>, gradients: HashMap<String, Vec<Tensor<f32>>>, ) -> RusTorchResult<HashMap<String, Tensor<f32>>>
Execute training step across all GPUs
Source pub fn barrier_sync(&self) -> RusTorchResult<()>
pub fn barrier_sync(&self) -> RusTorchResult<()>
Synchronize all GPUs before critical operations
Source pub fn get_metrics(&self) -> TrainingMetrics
pub fn get_metrics(&self) -> TrainingMetrics
Get training performance metrics
Source pub fn handle_failure(&mut self, failed_gpu: usize) -> RusTorchResult<()>
pub fn handle_failure(&mut self, failed_gpu: usize) -> RusTorchResult<()>
Handle GPU failure and recovery
Auto Trait Implementations§
impl Freeze for DistributedTrainer
impl RefUnwindSafe for DistributedTrainer
impl Send for DistributedTrainer
impl Sync for DistributedTrainer
impl Unpin for DistributedTrainer
impl UnwindSafe for DistributedTrainer
Blanket Implementations§
Source§impl<T> BorrowMut<T> for T where
T: ?Sized,
impl<T> BorrowMut<T> for T where
T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Mutably borrows from an owned value. Read more
Source§impl<T> IntoEither for T
impl<T> IntoEither for T
Source§fn into_either(self, into_left: bool) -> Either<Self, Self>
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self> if into_left is true.
Converts self into a Right variant of Either<Self, Self> otherwise. Read more
Source§fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true.
Converts self into a Right variant of Either<Self, Self> otherwise. Read more