pub struct MultiNodeTrainer<T: Optimizer> { /* private fields */ }
Multi-node distributed training coordinator
Implementations
impl<T: Optimizer> MultiNodeTrainer<T>
pub fn new(config: MultiNodeConfig, base_optimizer: T) -> Result<Self>
Create a new multi-node trainer
pub fn initialize_environment() -> Result<()>
Initialize MPI environment for multi-node training
pub fn register_parameters(
    &mut self,
    parameters: HashMap<String, Tensor>,
) -> Result<()>
Register parameters for multi-node training
pub fn update_gradients(
    &mut self,
    gradients: HashMap<String, Tensor>,
) -> Result<()>
Update gradients with multi-node synchronization
pub fn synchronize_gradients(&mut self) -> Result<()>
Synchronize gradients across all nodes
pub fn apply_gradients(&mut self, accumulation_steps: usize) -> Result<()>
Apply gradients with multi-node coordination
pub fn optimizer_step(&mut self) -> Result<()>
Perform optimizer step with multi-node coordination
pub fn get_memory_usage(&self) -> HashMap<String, usize>
Get comprehensive memory usage across nodes
pub fn get_training_stats(&self) -> MultiNodeStats
Get multi-node training statistics
pub fn should_save_checkpoint(&self) -> bool
Check if this process should save checkpoints
Auto Trait Implementations
impl<T> Freeze for MultiNodeTrainer<T>
where
    T: Freeze,
impl<T> !RefUnwindSafe for MultiNodeTrainer<T>
impl<T> Send for MultiNodeTrainer<T>
impl<T> Sync for MultiNodeTrainer<T>
impl<T> Unpin for MultiNodeTrainer<T>
where
    T: Unpin,
impl<T> !UnwindSafe for MultiNodeTrainer<T>
Blanket Implementations
impl<T> BorrowMut<T> for T
where
    T: ?Sized,
fn borrow_mut(&mut self) -> &mut T
Mutably borrows from an owned value.
impl<T> Instrument for T
fn instrument(self, span: Span) -> Instrumented<Self>
fn in_current_span(self) -> Instrumented<Self>
impl<T> IntoEither for T
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self> if into_left is true. Converts self into a Right variant of Either<Self, Self> otherwise.
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true. Converts self into a Right variant of Either<Self, Self> otherwise.