/// Gradient accumulator for accumulating gradients over multiple batches.
pub struct GradientAccumulator<F: Float + Debug + ScalarOperand + Send + Sync + FromPrimitive> {
/// Configuration for gradient accumulation.
pub config: GradientAccumulationConfig,
/* private fields */
}
Gradient accumulator that accumulates gradients over multiple batches.
Fields§
§config: GradientAccumulationConfig
Configuration for gradient accumulation
Implementations§
impl<F: Float + Debug + ScalarOperand + Send + Sync + FromPrimitive> GradientAccumulator<F>
pub fn new(config: GradientAccumulationConfig) -> Self
Creates a new gradient accumulator from the given configuration.
Examples found in repository?
examples/advanced_training_example.rs (lines 203-210)
124fn main() -> Result<()> {
125 println!("Advanced Training Examples");
126 println!("-------------------------");
127
128 // 1. Gradient Accumulation
129 println!("\n1. Training with Gradient Accumulation:");
130
131 // Generate synthetic dataset
132 let _dataset = generate_regression_dataset::<f32>(1000, 10, 2)?;
133 let _val_dataset = generate_regression_dataset::<f32>(200, 10, 2)?;
134
135 // Create model, optimizer, and loss function
136 let model = create_regression_model::<f32>(10, 64, 2)?;
137 let optimizer = Adam::new(0.001_f32, 0.9_f32, 0.999_f32, 1e-8_f32);
138 let loss_fn = MSELoss::new();
139
140 // Create gradient accumulation config
141 let ga_config = GradientAccumulationConfig {
142 accumulation_steps: 4,
143 average_gradients: true,
144 zero_gradients_after_update: true,
145 clip_gradients: true,
146 max_gradient_norm: Some(1.0),
147 log_gradient_stats: true,
148 };
149
150 // Create training config
151 let training_config = TrainingConfig {
152 batch_size: 32,
153 shuffle: true,
154 num_workers: 0,
155 learning_rate: 0.001,
156 epochs: 5,
157 verbose: 1,
158 validation: Some(ValidationSettings {
159 enabled: true,
160 validation_split: 0.0, // Use separate validation dataset
161 batch_size: 32,
162 num_workers: 0,
163 }),
164 gradient_accumulation: Some(ga_config),
165 mixed_precision: None,
166 };
167
168 // Create trainer
169 let _trainer = Trainer::new(model, optimizer, loss_fn, training_config);
170
171 // Note: To properly use callbacks, we would need to implement the appropriate trait interfaces
172 // Here we're simplifying for the example
173
174 // We'll use a simple closure to describe the early stopping callback
175 println!("Note: We would add callbacks like EarlyStopping and ModelCheckpoint here");
176 println!("For example: EarlyStopping with patience=5, min_delta=0.001");
177
178 // Create learning rate scheduler - we'll just demonstrate its usage
179 let _lr_scheduler = CosineAnnealingScheduler::new(0.001_f32, 0.0001_f32);
180 println!("Using CosineAnnealingScheduler with initial_lr=0.001, min_lr=0.0001");
181
182 // Train model
183 println!("\nTraining model with gradient accumulation...");
184
185 // For demonstration purposes, show what would happen with real training
186 println!("Would execute: trainer.train(&dataset, Some(&val_dataset))?");
187
188 // Since we're not actually training, just show example output
189 println!("\nExample of training output that would be shown:");
190 println!("Training completed in 3 epochs");
191 println!("Final loss: 0.0124");
192 println!("Final validation loss: 0.0156");
193
194 // 2. Manual Gradient Accumulation
195 println!("\n2. Manual Gradient Accumulation:");
196
197 // Create model, optimizer, and loss function
198 let model = create_regression_model::<f32>(10, 64, 2)?;
199 let _optimizer = Adam::new(0.001_f32, 0.9_f32, 0.999_f32, 1e-8_f32);
200 let _loss_fn = MSELoss::new();
201
202 // Create gradient accumulator
203 let mut accumulator = GradientAccumulator::new(GradientAccumulationConfig {
204 accumulation_steps: 4,
205 average_gradients: true,
206 zero_gradients_after_update: true,
207 clip_gradients: false,
208 max_gradient_norm: None,
209 log_gradient_stats: false,
210 });
211
212 // Initialize accumulator
213 accumulator.initialize(&model)?;
214
215 // We would use a DataLoader in real code, but here we'll simulate it
216 println!("Creating data loader with batch_size=32, shuffle=true");
217
218 println!("\nTraining for 1 epoch with manual gradient accumulation...");
219
220 let mut total_loss = 0.0_f32;
221 let mut processed_batches = 0;
222
223 // Train for one epoch
224 // This is a simplified example - in practice you would iterate through DataLoader batches
225 // Simulated loop to demonstrate the concept:
226 let total_batches = 5;
227 for batch_idx in 0..total_batches {
228 // In a real implementation we would get inputs and targets from data_loader
229 println!("Batch {} - Accumulating gradients...", batch_idx + 1);
230
231 // Simulate a loss value
232 let loss = 0.1 * (batch_idx as f32 + 1.0).powf(-0.5);
233 total_loss += loss;
234 processed_batches += 1;
235
236 // Simulate gradient stats
237 println!(
238 "Batch {} - Gradient stats: min={:.4}, max={:.4}, mean={:.4}, norm={:.4}",
239 batch_idx + 1,
240 -0.05 * (batch_idx as f32 + 1.0).powf(-0.5),
241 0.05 * (batch_idx as f32 + 1.0).powf(-0.5),
242 0.01 * (batch_idx as f32 + 1.0).powf(-0.5),
243 0.2 * (batch_idx as f32 + 1.0).powf(-0.5)
244 );
245
246 // Update if needed - this is conceptual
247 if (batch_idx + 1) % 4 == 0 || batch_idx == total_batches - 1 {
248 println!(
249 "Applying accumulated gradients after {} batches",
250 (batch_idx + 1) % 4
251 );
252 // In a real implementation we would apply gradients:
253 // accumulator.apply_gradients(&mut model, &mut optimizer)?;
254 }
255
256 // Early stopping for example
257 if batch_idx >= 10 {
258 break;
259 }
260 }
261
262 if processed_batches > 0 {
263 println!("Average loss: {:.4}", total_loss / processed_batches as f32);
264 }
265
266 // 3. Mixed Precision (not fully implemented, pseudocode)
267 println!("\n3. Mixed Precision Training (Pseudocode):");
268
269 println!(
270 "// Create mixed precision config
271let mp_config = MixedPrecisionConfig {{
272 dynamic_loss_scaling: true,
273 initial_loss_scale: 65536.0,
274 scale_factor: 2.0,
275 scale_window: 2000,
276 min_loss_scale: 1.0,
277 max_loss_scale: 2_f64.powi(24),
278 verbose: true,
279}};
280
281// Create high precision and low precision models
282let high_precision_model = create_regression_model::<f32>(10, 64, 2)?;
283let low_precision_model = create_regression_model::<f16>(10, 64, 2)?;
284
285// Create mixed precision model
286let mut mixed_model = MixedPrecisionModel::new(
287 high_precision_model,
288 low_precision_model,
289 mp_config,
290)?;
291
292// Create optimizer and loss function
293let mut optimizer = Adam::new(0.001);
294let loss_fn = MSELoss::new();
295
296// Train for one epoch
297mixed_model.train_epoch(
298 &mut optimizer,
299 &dataset,
300 &loss_fn,
301 32,
302 true,
303)?;"
304 );
305
306 // 4. Gradient Clipping
307 println!("\n4. Gradient Clipping:");
308
309 // Create model, optimizer, and loss function
310 let model = create_regression_model::<f32>(10, 64, 2)?;
311 let optimizer = Adam::new(0.001_f32, 0.9_f32, 0.999_f32, 1e-8_f32);
312 let loss_fn = MSELoss::new();
313
314 // Create training config - we need two separate instances
315 let gradient_clipping_config = TrainingConfig {
316 batch_size: 32,
317 shuffle: true,
318 num_workers: 0,
319 learning_rate: 0.001,
320 epochs: 5,
321 verbose: 1,
322 validation: Some(ValidationSettings {
323 enabled: true,
324 validation_split: 0.0, // Use separate validation dataset
325 batch_size: 32,
326 num_workers: 0,
327 }),
328 gradient_accumulation: None,
329 mixed_precision: None,
330 };
331
332 // Create a separate configuration for the value clipping example
333 let value_clipping_config = TrainingConfig {
334 batch_size: 32,
335 shuffle: true,
336 num_workers: 0,
337 learning_rate: 0.001,
338 epochs: 5,
339 verbose: 1,
340 validation: Some(ValidationSettings {
341 enabled: true,
342 validation_split: 0.0, // Use separate validation dataset
343 batch_size: 32,
344 num_workers: 0,
345 }),
346 gradient_accumulation: None,
347 mixed_precision: None,
348 };
349
350 // Create trainer
351 let _trainer = Trainer::new(model, optimizer, loss_fn, gradient_clipping_config);
352
353 // Instead of adding callbacks directly, we'll just demonstrate the concept
354 println!("If callbacks were fully implemented, we would add gradient clipping:");
355 println!("GradientClipping::by_global_norm(1.0_f32, true) // Max norm, log_stats");
356
357 println!("\nTraining model with gradient clipping by global norm...");
358
359 // Train model for a few epochs
360 let _dataset_small = generate_regression_dataset::<f32>(500, 10, 2)?;
361 let _val_dataset_small = generate_regression_dataset::<f32>(100, 10, 2)?;
362 println!("Would train the model with dataset_small and val_dataset_small");
363 // In a real implementation:
364 // let session = trainer.train(&dataset_small, Some(&val_dataset_small))?;
365
366 // Since we're not actually training, just show example output
367 println!("\nExample of training output that would be shown:");
368 println!("Training completed in 3 epochs");
369 println!("Final loss: 0.0124");
370 println!("Final validation loss: 0.0156");
371
372 // Example with value clipping
373 println!("\nExample with gradient clipping by value:");
374
375 // Create model and trainer with value clipping
376 let model = create_regression_model::<f32>(10, 64, 2)?;
377 let optimizer = Adam::new(0.001_f32, 0.9_f32, 0.999_f32, 1e-8_f32);
378 let _trainer = Trainer::new(model, optimizer, loss_fn, value_clipping_config);
379
380 // Instead of actual callbacks, show how we would use them
381 println!("For gradient clipping by value, we would use:");
382 println!("GradientClipping::by_value(0.5_f32, true) // Max value, log_stats");
383
384 println!("\nDemonstration of how to set up gradient clipping by value:");
385 println!("trainer.add_callback(Box::new(GradientClipping::by_value(");
386 println!(" 0.5_f32, // Max value");
387 println!(" true, // Log stats");
388 println!(")));");
389
390 // Demonstrate the training utilities
391 println!("\nAdvanced Training Examples Completed Successfully!");
392
393 Ok(())
394}
pub fn initialize<L: Layer<F> + ?Sized>(&mut self, model: &L) -> Result<()>
Initializes the accumulator with the model's parameter shapes.
Examples found in repository?
examples/advanced_training_example.rs (line 213)
124fn main() -> Result<()> {
125 println!("Advanced Training Examples");
126 println!("-------------------------");
127
128 // 1. Gradient Accumulation
129 println!("\n1. Training with Gradient Accumulation:");
130
131 // Generate synthetic dataset
132 let _dataset = generate_regression_dataset::<f32>(1000, 10, 2)?;
133 let _val_dataset = generate_regression_dataset::<f32>(200, 10, 2)?;
134
135 // Create model, optimizer, and loss function
136 let model = create_regression_model::<f32>(10, 64, 2)?;
137 let optimizer = Adam::new(0.001_f32, 0.9_f32, 0.999_f32, 1e-8_f32);
138 let loss_fn = MSELoss::new();
139
140 // Create gradient accumulation config
141 let ga_config = GradientAccumulationConfig {
142 accumulation_steps: 4,
143 average_gradients: true,
144 zero_gradients_after_update: true,
145 clip_gradients: true,
146 max_gradient_norm: Some(1.0),
147 log_gradient_stats: true,
148 };
149
150 // Create training config
151 let training_config = TrainingConfig {
152 batch_size: 32,
153 shuffle: true,
154 num_workers: 0,
155 learning_rate: 0.001,
156 epochs: 5,
157 verbose: 1,
158 validation: Some(ValidationSettings {
159 enabled: true,
160 validation_split: 0.0, // Use separate validation dataset
161 batch_size: 32,
162 num_workers: 0,
163 }),
164 gradient_accumulation: Some(ga_config),
165 mixed_precision: None,
166 };
167
168 // Create trainer
169 let _trainer = Trainer::new(model, optimizer, loss_fn, training_config);
170
171 // Note: To properly use callbacks, we would need to implement the appropriate trait interfaces
172 // Here we're simplifying for the example
173
174 // We'll use a simple closure to describe the early stopping callback
175 println!("Note: We would add callbacks like EarlyStopping and ModelCheckpoint here");
176 println!("For example: EarlyStopping with patience=5, min_delta=0.001");
177
178 // Create learning rate scheduler - we'll just demonstrate its usage
179 let _lr_scheduler = CosineAnnealingScheduler::new(0.001_f32, 0.0001_f32);
180 println!("Using CosineAnnealingScheduler with initial_lr=0.001, min_lr=0.0001");
181
182 // Train model
183 println!("\nTraining model with gradient accumulation...");
184
185 // For demonstration purposes, show what would happen with real training
186 println!("Would execute: trainer.train(&dataset, Some(&val_dataset))?");
187
188 // Since we're not actually training, just show example output
189 println!("\nExample of training output that would be shown:");
190 println!("Training completed in 3 epochs");
191 println!("Final loss: 0.0124");
192 println!("Final validation loss: 0.0156");
193
194 // 2. Manual Gradient Accumulation
195 println!("\n2. Manual Gradient Accumulation:");
196
197 // Create model, optimizer, and loss function
198 let model = create_regression_model::<f32>(10, 64, 2)?;
199 let _optimizer = Adam::new(0.001_f32, 0.9_f32, 0.999_f32, 1e-8_f32);
200 let _loss_fn = MSELoss::new();
201
202 // Create gradient accumulator
203 let mut accumulator = GradientAccumulator::new(GradientAccumulationConfig {
204 accumulation_steps: 4,
205 average_gradients: true,
206 zero_gradients_after_update: true,
207 clip_gradients: false,
208 max_gradient_norm: None,
209 log_gradient_stats: false,
210 });
211
212 // Initialize accumulator
213 accumulator.initialize(&model)?;
214
215 // We would use a DataLoader in real code, but here we'll simulate it
216 println!("Creating data loader with batch_size=32, shuffle=true");
217
218 println!("\nTraining for 1 epoch with manual gradient accumulation...");
219
220 let mut total_loss = 0.0_f32;
221 let mut processed_batches = 0;
222
223 // Train for one epoch
224 // This is a simplified example - in practice you would iterate through DataLoader batches
225 // Simulated loop to demonstrate the concept:
226 let total_batches = 5;
227 for batch_idx in 0..total_batches {
228 // In a real implementation we would get inputs and targets from data_loader
229 println!("Batch {} - Accumulating gradients...", batch_idx + 1);
230
231 // Simulate a loss value
232 let loss = 0.1 * (batch_idx as f32 + 1.0).powf(-0.5);
233 total_loss += loss;
234 processed_batches += 1;
235
236 // Simulate gradient stats
237 println!(
238 "Batch {} - Gradient stats: min={:.4}, max={:.4}, mean={:.4}, norm={:.4}",
239 batch_idx + 1,
240 -0.05 * (batch_idx as f32 + 1.0).powf(-0.5),
241 0.05 * (batch_idx as f32 + 1.0).powf(-0.5),
242 0.01 * (batch_idx as f32 + 1.0).powf(-0.5),
243 0.2 * (batch_idx as f32 + 1.0).powf(-0.5)
244 );
245
246 // Update if needed - this is conceptual
247 if (batch_idx + 1) % 4 == 0 || batch_idx == total_batches - 1 {
248 println!(
249 "Applying accumulated gradients after {} batches",
250 (batch_idx + 1) % 4
251 );
252 // In a real implementation we would apply gradients:
253 // accumulator.apply_gradients(&mut model, &mut optimizer)?;
254 }
255
256 // Early stopping for example
257 if batch_idx >= 10 {
258 break;
259 }
260 }
261
262 if processed_batches > 0 {
263 println!("Average loss: {:.4}", total_loss / processed_batches as f32);
264 }
265
266 // 3. Mixed Precision (not fully implemented, pseudocode)
267 println!("\n3. Mixed Precision Training (Pseudocode):");
268
269 println!(
270 "// Create mixed precision config
271let mp_config = MixedPrecisionConfig {{
272 dynamic_loss_scaling: true,
273 initial_loss_scale: 65536.0,
274 scale_factor: 2.0,
275 scale_window: 2000,
276 min_loss_scale: 1.0,
277 max_loss_scale: 2_f64.powi(24),
278 verbose: true,
279}};
280
281// Create high precision and low precision models
282let high_precision_model = create_regression_model::<f32>(10, 64, 2)?;
283let low_precision_model = create_regression_model::<f16>(10, 64, 2)?;
284
285// Create mixed precision model
286let mut mixed_model = MixedPrecisionModel::new(
287 high_precision_model,
288 low_precision_model,
289 mp_config,
290)?;
291
292// Create optimizer and loss function
293let mut optimizer = Adam::new(0.001);
294let loss_fn = MSELoss::new();
295
296// Train for one epoch
297mixed_model.train_epoch(
298 &mut optimizer,
299 &dataset,
300 &loss_fn,
301 32,
302 true,
303)?;"
304 );
305
306 // 4. Gradient Clipping
307 println!("\n4. Gradient Clipping:");
308
309 // Create model, optimizer, and loss function
310 let model = create_regression_model::<f32>(10, 64, 2)?;
311 let optimizer = Adam::new(0.001_f32, 0.9_f32, 0.999_f32, 1e-8_f32);
312 let loss_fn = MSELoss::new();
313
314 // Create training config - we need two separate instances
315 let gradient_clipping_config = TrainingConfig {
316 batch_size: 32,
317 shuffle: true,
318 num_workers: 0,
319 learning_rate: 0.001,
320 epochs: 5,
321 verbose: 1,
322 validation: Some(ValidationSettings {
323 enabled: true,
324 validation_split: 0.0, // Use separate validation dataset
325 batch_size: 32,
326 num_workers: 0,
327 }),
328 gradient_accumulation: None,
329 mixed_precision: None,
330 };
331
332 // Create a separate configuration for the value clipping example
333 let value_clipping_config = TrainingConfig {
334 batch_size: 32,
335 shuffle: true,
336 num_workers: 0,
337 learning_rate: 0.001,
338 epochs: 5,
339 verbose: 1,
340 validation: Some(ValidationSettings {
341 enabled: true,
342 validation_split: 0.0, // Use separate validation dataset
343 batch_size: 32,
344 num_workers: 0,
345 }),
346 gradient_accumulation: None,
347 mixed_precision: None,
348 };
349
350 // Create trainer
351 let _trainer = Trainer::new(model, optimizer, loss_fn, gradient_clipping_config);
352
353 // Instead of adding callbacks directly, we'll just demonstrate the concept
354 println!("If callbacks were fully implemented, we would add gradient clipping:");
355 println!("GradientClipping::by_global_norm(1.0_f32, true) // Max norm, log_stats");
356
357 println!("\nTraining model with gradient clipping by global norm...");
358
359 // Train model for a few epochs
360 let _dataset_small = generate_regression_dataset::<f32>(500, 10, 2)?;
361 let _val_dataset_small = generate_regression_dataset::<f32>(100, 10, 2)?;
362 println!("Would train the model with dataset_small and val_dataset_small");
363 // In a real implementation:
364 // let session = trainer.train(&dataset_small, Some(&val_dataset_small))?;
365
366 // Since we're not actually training, just show example output
367 println!("\nExample of training output that would be shown:");
368 println!("Training completed in 3 epochs");
369 println!("Final loss: 0.0124");
370 println!("Final validation loss: 0.0156");
371
372 // Example with value clipping
373 println!("\nExample with gradient clipping by value:");
374
375 // Create model and trainer with value clipping
376 let model = create_regression_model::<f32>(10, 64, 2)?;
377 let optimizer = Adam::new(0.001_f32, 0.9_f32, 0.999_f32, 1e-8_f32);
378 let _trainer = Trainer::new(model, optimizer, loss_fn, value_clipping_config);
379
380 // Instead of actual callbacks, show how we would use them
381 println!("For gradient clipping by value, we would use:");
382 println!("GradientClipping::by_value(0.5_f32, true) // Max value, log_stats");
383
384 println!("\nDemonstration of how to set up gradient clipping by value:");
385 println!("trainer.add_callback(Box::new(GradientClipping::by_value(");
386 println!(" 0.5_f32, // Max value");
387 println!(" true, // Log stats");
388 println!(")));");
389
390 // Demonstrate the training utilities
391 println!("\nAdvanced Training Examples Completed Successfully!");
392
393 Ok(())
394}
pub fn accumulate_gradients<L: Layer<F> + ?Sized>(
    &mut self,
    model: &mut L,
    inputs: &Array<F, IxDyn>,
    targets: &Array<F, IxDyn>,
    loss_fn: &dyn Loss<F>,
) -> Result<F>
Accumulates gradients from a forward and backward pass.
pub fn apply_gradients<L: ParamLayer<F> + ?Sized, O: Optimizer<F> + OptimizerStep<F> + ?Sized>(
    &mut self,
    model: &mut L,
    optimizer: &mut O,
) -> Result<()>
Applies the accumulated gradients to update the model parameters.
pub fn zero_gradients(&mut self)
Zeroes the accumulated gradients.
pub fn should_update(&self) -> bool
Checks whether it is time to update the model parameters.
pub fn get_accumulated_gradients(&self) -> &[Array<F, IxDyn>]
Returns the currently accumulated gradients.
pub fn get_current_step(&self) -> usize
Returns the current accumulation step.
pub fn get_total_samples(&self) -> usize
Returns the total number of samples processed.
pub fn get_gradient_stats(&self) -> Option<&GradientStats<F>>
Returns the gradient statistics, if available.
Trait Implementations§
Auto Trait Implementations§
impl<F> Freeze for GradientAccumulator<F>
where
    F: Freeze,
impl<F> RefUnwindSafe for GradientAccumulator<F>
where
    F: RefUnwindSafe,
impl<F> Send for GradientAccumulator<F>
impl<F> Sync for GradientAccumulator<F>
impl<F> Unpin for GradientAccumulator<F>
where
    F: Unpin,
impl<F> UnwindSafe for GradientAccumulator<F>
where
    F: UnwindSafe + RefUnwindSafe,
Blanket Implementations§
impl<T> BorrowMut<T> for T
where
    T: ?Sized,
fn borrow_mut(&mut self) -> &mut T
Mutably borrows from an owned value. Read more
impl<T> IntoEither for T
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts `self` into a `Left` variant of `Either<Self, Self>` if `into_left` is `true`. Converts `self` into a `Right` variant of `Either<Self, Self>` otherwise. Read more
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts `self` into a `Left` variant of `Either<Self, Self>` if `into_left(&self)` returns `true`. Converts `self` into a `Right` variant of `Either<Self, Self>` otherwise. Read more