// trustformers_optim — crate root (lib.rs)
1// Allow certain clippy warnings at crate level for numeric algorithms
2// These patterns are common and intentional in optimization code
3#![allow(
4    clippy::needless_range_loop,
5    clippy::manual_memcpy,
6    clippy::vec_init_then_push,
7    clippy::borrowed_box
8)]
9
10//! # TrustformeRS Optimization
11//!
12//! This crate provides state-of-the-art optimization algorithms for training transformer models,
13//! including distributed training support and memory-efficient techniques.
14//!
15//! ## Overview
16//!
17//! TrustformeRS Optim includes:
18//! - **Core Optimizers**: Adam, AdamW, SGD, LAMB, AdaFactor
//! - **Cutting-Edge Optimizers (2023–2025)**: HN-Adam, AdEMAMix, Muon, CAME, MicroAdam for state-of-the-art performance
20//! - **Schedule-Free Optimizers**: Schedule-Free SGD and Adam (no LR scheduling needed)
21//! - **Advanced Quantization**: 4-bit optimizers with NF4 and block-wise quantization
22//! - **Memory-Efficient Optimization**: MicroAdam with compressed gradients and low space overhead
23//! - **Learning Rate Schedulers**: Linear, Cosine, Polynomial, Step, Exponential
24//! - **Distributed Training**: ZeRO optimization stages, multi-node support
25//! - **Memory Optimization**: Gradient accumulation, mixed precision, CPU offloading
26//!
27//! ## Optimizers
28//!
29//! ### Adam and AdamW
30//!
31//! Adaptive Moment Estimation with optional weight decay:
32//! ```rust,no_run
33//! use trustformers_optim::{AdamW, OptimizerState};
34//! use trustformers_core::traits::Optimizer;
35//!
36//! let mut optimizer = AdamW::new(
37//!     1e-3,           // learning_rate
38//!     (0.9, 0.999),   // (beta1, beta2)
39//!     1e-8,           // epsilon
40//!     0.01,           // weight_decay
41//! );
42//!
43//! // Ready to use in training loop with .zero_grad(), .update(), and .step()
44//! ```
45//!
46//! ### SGD
47//!
48//! Stochastic Gradient Descent with momentum and Nesterov acceleration:
49//! ```rust,no_run
50//! use trustformers_optim::SGD;
51//!
52//! let optimizer = SGD::new(
53//!     0.1,        // learning_rate
54//!     0.9,        // momentum
55//!     1e-4,       // weight_decay
56//!     true,       // nesterov
57//! );
58//! ```
59//!
60//! ### Schedule-Free Optimizers
61//!
62//! Revolutionary optimizers that eliminate the need for learning rate scheduling:
63//! ```rust,no_run
64//! use trustformers_optim::{ScheduleFreeAdam, ScheduleFreeSGD};
65//! use trustformers_core::traits::Optimizer;
66//!
67//! // Schedule-Free Adam - no learning rate scheduling needed!
68//! let optimizer = ScheduleFreeAdam::for_language_models();
69//!
70//! // Higher learning rates work better (e.g., 0.25-1.0 instead of 0.001)
71//! let optimizer = ScheduleFreeAdam::new(0.5, 0.9, 0.95, 1e-8, 0.1);
72//!
73//! // Schedule-Free SGD for simpler models
74//! let optimizer = ScheduleFreeSGD::for_large_models();
75//!
76//! // No learning rate scheduler needed! Just use .zero_grad(), .update(), .step()
77//! // eval_mode() can be used to switch to average weights
78//! ```
79//!
//! ### Cutting-Edge Optimizers (2023–2025)
81//!
82//! The latest state-of-the-art optimizers for superior performance:
83//!
84//! #### 🌟 **NEW: Latest 2025 Research Algorithms** 🚀
85//!
86//! **Self-Scaled BFGS (SSBFGS)** - Revolutionary quasi-Newton method:
87//! ```rust,no_run
88//! use trustformers_optim::{SSBFGS, SSBFGSConfig};
89//!
90//! // For Physics-Informed Neural Networks (PINNs)
91//! let optimizer = SSBFGS::for_physics_informed();
92//!
93//! // For challenging non-convex problems
94//! let optimizer = SSBFGS::for_non_convex();
95//!
96//! // Custom configuration
97//! let optimizer = SSBFGS::from_config(SSBFGSConfig {
98//!     learning_rate: 0.8,
99//!     history_size: 15,
100//!     scaling_factor: 1.2,
101//!     momentum: 0.95,
102//! });
103//!
104//! // Get optimization statistics
105//! let stats = optimizer.get_stats();
106//! println!("Current scaling factor: {:.3}", stats.current_scaling_factor);
107//! ```
108//!
109//! **Self-Scaled Broyden (SSBroyden)** - Efficient rank-1 updates:
110//! ```rust,no_run
111//! use trustformers_optim::{SSBroyden, SSBroydenConfig};
112//!
113//! // Optimized for PINNs with rank-1 efficiency
114//! let optimizer = SSBroyden::for_physics_informed();
115//!
116//! // More computationally efficient than BFGS
117//! let optimizer = SSBroyden::new(); // Default configuration
118//! ```
119//!
120//! **PDE-aware Optimizer** - Specialized for Physics-Informed Neural Networks:
121//! ```rust,no_run
122//! use trustformers_optim::{PDEAwareOptimizer, PDEAwareConfig};
123//!
124//! // Specialized configurations for different PDEs
125//! let burgers_opt = PDEAwareOptimizer::for_burgers_equation();    // Burgers' equation
126//! let allen_cahn_opt = PDEAwareOptimizer::for_allen_cahn();       // Allen-Cahn equation
127//! let kdv_opt = PDEAwareOptimizer::for_kdv_equation();            // Korteweg-de Vries
128//! let sharp_grad_opt = PDEAwareOptimizer::for_sharp_gradients();  // Sharp gradient regions
129//!
130//! // Get PDE-specific optimization statistics
131//! let stats = sharp_grad_opt.get_pde_stats();
132//! println!("Average residual variance: {:.6}", stats.average_residual_variance);
133//! ```
134//!
135//! **🔬 Research Breakthrough Features:**
136//! - **Orders-of-magnitude improvements** in PINN training accuracy
137//! - **Dynamic rescaling** based on gradient history and PDE residual variance
138//! - **Sharp gradient handling** for challenging PDE optimization landscapes
139//! - **Lower computational cost** than second-order methods like SOAP
140//! - **Specialized presets** for different equation types (Burgers, Allen-Cahn, KdV)
141//!
142//! #### BGE-Adam (2024) - Revolutionary Performance Optimization! 🚀
143//! Enhanced Adam with entropy weighting and adaptive gradient strategy, now featuring **OptimizedBGEAdam** with **3-5x speedup**:
144//! ```rust,no_run
145//! use trustformers_optim::{BGEAdam, OptimizedBGEAdam, BGEAdamConfig, OptimizedBGEAdamConfig};
146//!
147//! // 🚀 RECOMMENDED: Use the optimized version for 3-5x better performance!
148//! let optimizer = OptimizedBGEAdam::new(); // 3-5x faster than original!
149//!
150//! // Performance-optimized presets for different use cases
151//! let llm_optimizer = OptimizedBGEAdam::for_large_models();     // For LLMs (optimized settings)
152//! let vision_optimizer = OptimizedBGEAdam::for_vision();        // For computer vision
153//! let perf_optimizer = OptimizedBGEAdam::for_high_performance(); // Maximum speed
154//!
155//! // Built-in performance monitoring and entropy statistics
156//! println!("{}", optimizer.performance_stats());
157//! let (min_entropy, max_entropy, avg_entropy) = optimizer.get_entropy_stats();
158//!
159//! // Original BGE-Adam still available (but much slower)
160//! let original_optimizer = BGEAdam::new(
161//!     1e-3,        // learning rate
162//!     (0.9, 0.999), // (β1, β2)
163//!     1e-8,        // epsilon
164//!     0.01,        // weight decay
165//!     0.1,         // entropy scaling factor
166//!     0.05,        // β1 adaptation factor
167//!     0.05,        // β2 adaptation factor
168//! );
169//! ```
170//!
171//! **🔥 Performance Improvements in OptimizedBGEAdam:**
172//! - ⚡ **3.4-4.9x faster execution** (16.3ms → 4.7ms per iteration for 50k params)
173//! - 💾 **85-87x memory reduction** through optimized buffer management
174//! - 🔥 **Single-pass processing** eliminates redundant calculations
175//! - 🚀 **Vectorized operations** with SIMD-friendly processing patterns
176//!
177//! #### HN-Adam (2024)
178//! Hybrid Norm Adam with adaptive step size:
179//! ```rust,no_run
180//! use trustformers_optim::{HNAdam, HNAdamConfig};
181//!
182//! // Automatically adjusts step size based on update norms
183//! let optimizer = HNAdam::new(1e-3, (0.9, 0.999), 1e-8, 0.01, 0.1);
184//!
185//! // Or use presets for specific tasks
186//! let transformer_opt = HNAdam::for_transformers(); // Optimized for transformers
187//! let vision_opt = HNAdam::for_vision(); // Optimized for computer vision
188//!
189//! // Better convergence speed and accuracy than standard Adam
190//! ```
191//!
192//! #### AdEMAMix (2024)
193//! Dual EMA system for better gradient utilization:
194//! ```rust,no_run
195//! use trustformers_optim::AdEMAMix;
196//!
197//! // Revolutionary dual EMA optimizer from Apple/EPFL
198//! let optimizer = AdEMAMix::for_llm_training(); // Optimized for LLMs
199//!
200//! // Or for vision tasks
201//! let optimizer = AdEMAMix::for_vision_training();
202//!
203//! // 95% data efficiency improvement demonstrated in research
204//! ```
205//!
206//! #### Muon (2024)
207//! Second-order optimizer for hidden layers:
208//! ```rust,no_run
209//! use trustformers_optim::Muon;
210//!
211//! // Used in NanoGPT and CIFAR-10 speed records
212//! let optimizer = Muon::for_nanogpt(); // <1% FLOP overhead
213//!
214//! // For large language models
215//! let optimizer = Muon::for_large_lm();
216//!
217//! // Automatically chooses 2D optimization for matrices, 1D fallback for vectors
218//! ```
219//!
220//! #### CAME (2023)
221//! Confidence-guided memory efficient optimization:
222//! ```rust,no_run
223//! use trustformers_optim::CAME;
224//!
225//! // Memory efficient with fast convergence
226//! let optimizer = CAME::for_bert_training();
227//!
228//! // For memory-constrained environments
229//! let optimizer = CAME::for_memory_constrained();
230//!
231//! // Check memory savings
232//! println!("Memory savings: {:.1}%", optimizer.memory_savings_ratio() * 100.0);
233//! ```
234//!
235//! #### MicroAdam (NeurIPS 2024)
236//! Memory-efficient Adam with compressed gradients:
237//! ```rust,no_run
238//! use trustformers_optim::MicroAdam;
239//!
240//! // Standard configuration with adaptive compression
241//! let optimizer = MicroAdam::new();
242//!
243//! // For large language models (higher compression)
244//! let optimizer = MicroAdam::for_large_models();
245//!
246//! // Memory-constrained environments (aggressive compression)
247//! let optimizer = MicroAdam::for_memory_constrained();
248//!
249//! // Check compression statistics
250//! println!("{}", optimizer.compression_statistics());
251//! println!("Memory savings: {:.1}%", optimizer.memory_savings_ratio() * 100.0);
252//! ```
253//!
254//! ### Advanced Quantization
255//!
256//! Ultra-low memory usage with 4-bit quantization:
257//! ```rust,no_run
258//! use trustformers_optim::{Adam4bit, AdvancedQuantizationConfig, QuantizationMethod};
259//!
260//! // 4-bit Adam with NF4 quantization (75% memory savings)
261//! let optimizer = Adam4bit::new(0.001, 0.9, 0.999, 1e-8, 0.01);
262//!
263//! // Custom quantization configuration
264//! let quant_config = AdvancedQuantizationConfig {
265//!     method: QuantizationMethod::NF4,
266//!     block_size: 64,
267//!     adaptation_rate: 0.01,
268//!     double_quantization: true,
269//!     ..Default::default()
270//! };
271//!
272//! let optimizer = Adam4bit::with_quantization_config(
273//!     Default::default(),
274//!     quant_config,
275//! );
276//!
277//! // Massive memory savings for large models
278//! println!("Memory savings: {:.1}%", optimizer.memory_savings() * 100.0);
279//! ```
280//!
281//! ## Learning Rate Schedules
282//!
283//! Control learning rate during training:
284//! ```rust,no_run
285//! use trustformers_optim::{AdamW, CosineScheduler, LRScheduler};
286//!
287//! let base_lr = 1e-3;
288//! let optimizer = AdamW::new(base_lr, (0.9, 0.999), 1e-8, 0.01);
289//!
290//! // Cosine annealing with warmup
291//! let scheduler = CosineScheduler::new(
292//!     base_lr,
293//!     1000,   // num_warmup_steps
294//!     10000,  // num_training_steps
295//!     1e-5,   // min_lr
296//! );
297//!
298//! // Update learning rate each step
299//! for step in 0..10000 {
300//!     let current_lr = scheduler.get_lr(step);
301//!     // Use current_lr with optimizer.set_lr(current_lr)
302//! }
303//! ```
304//!
305//! ## ZeRO Optimization
306//!
307//! Memory-efficient distributed training:
308//! ```rust,ignore
309//! // ZeRO distributed training (requires distributed environment)
310//! use trustformers_optim::{AdamW};
311//!
312//! let optimizer = AdamW::new(1e-4, (0.9, 0.999), 1e-8, 0.01);
313//! // ZeRO configuration and distributed setup would go here
314//! ```
315//!
316//! ### ZeRO Stages
317//!
318//! - **Stage 1**: Optimizer state partitioning (4x memory reduction)
319//! - **Stage 2**: Optimizer + gradient partitioning (8x memory reduction)
320//! - **Stage 3**: Full parameter partitioning (Nx memory reduction)
321//!
322//! ## Multi-Node Training
323//!
324//! Scale training across multiple machines:
325//! ```rust,ignore
326//! // Multi-node distributed training setup
327//! // Configuration and training would require distributed environment
328//! // Example: MultiNodeTrainer::new(config)
329//! ```
330//!
331//! ## Advanced Features
332//!
333//! ### Gradient Accumulation
334//! ```rust,ignore
335//! // Example: Accumulate gradients over multiple batches before stepping
336//! // if (step + 1) % accumulation_steps == 0 {
337//! //     optimizer.step(&mut model.parameters())?;
338//! //     optimizer.zero_grad();
339//! // }
340//! ```
341//!
342//! ### Mixed Precision Training
343//! ```rust,ignore
344//! // Mixed precision optimizers can provide memory savings and speed improvements
345//! // Configuration example:
346//! // MixedPrecisionOptimizer::new(base_optimizer, scale_config)
347//! ```
348//!
349//! ## Performance Tips
350//!
351//! 1. **Choose the Right Optimizer**:
352//!    - AdamW for most transformer training
353//!    - SGD for fine-tuning with small learning rates
354//!    - LAMB for large batch training
355//!
356//! 2. **Learning Rate Scheduling**:
357//!    - Use warmup for stable training start
358//!    - Cosine schedule for most cases
359//!    - Linear decay for fine-tuning
360//!
361//! 3. **Memory Optimization**:
362//!    - Enable ZeRO Stage 2 for models > 1B parameters
363//!    - Use gradient accumulation for larger effective batch sizes
364//!    - Consider CPU offloading for very large models
365//!
366//! 4. **Distributed Training**:
367//!    - Use data parallelism for models < 10B parameters
368//!    - Add model parallelism for larger models
369//!    - Enable communication overlap for better throughput
370
371// Allow large error types in Result (TrustformersError is large by design)
372#![allow(clippy::result_large_err)]
373// Allow common patterns in optimizer implementations
374#![allow(clippy::too_many_arguments)]
375#![allow(clippy::type_complexity)]
376#![allow(clippy::excessive_nesting)]
377
378pub mod adafactor_new;
379pub mod adafisher_simple;
380pub mod adam;
381pub mod adam_v2;
382pub mod adamax_plus;
383pub mod adan;
384pub mod adaptive;
385pub mod ademamix;
386pub mod advanced_2025_research;
387pub mod advanced_distributed_features;
388pub mod advanced_features;
389pub mod amacp;
390pub mod async_optim;
391pub mod averaged_adam;
392pub mod bge_adam;
393pub mod bge_adam_optimized;
394pub mod cache_friendly;
395pub mod came;
396pub mod common;
397pub mod compression;
398pub mod continual_learning;
399pub mod convergence;
400pub mod cpu_offload;
401pub mod cross_framework;
402pub mod deep_distributed_qp;
403pub mod enhanced_distributed_training;
404pub mod eva;
405pub mod federated;
406pub mod fusion;
407pub mod genie_stub;
408pub mod gradient_processing;
409pub mod hardware_aware;
410pub mod hierarchical_aggregation;
411pub mod hn_adam;
412pub mod hyperparameter_tuning;
413pub mod jax_compat;
414pub mod kernel_fusion;
415pub mod lamb;
416pub mod lancbio;
417pub mod lion;
418pub mod lookahead;
419pub mod lora;
420pub mod lora_rite_stub;
421pub mod memory_layout;
422pub mod microadam;
423pub mod monitoring;
424pub mod multinode;
425pub mod muon;
426pub mod novograd;
427pub mod onnx_export;
428pub mod optimizer;
429pub mod parallel;
430pub mod pde_aware;
431pub mod performance_validation;
432pub mod prodigy;
433pub mod pytorch_compat;
434pub mod quantized;
435pub mod quantized_advanced;
436pub mod quantum_inspired;
437pub mod schedule_free;
438pub mod scheduler;
439pub mod second_order;
440pub mod sgd;
441pub mod simd_optimizations;
442pub mod sofo_stub;
443pub mod sophia;
444pub mod sparse;
445pub mod task_specific;
446pub mod tensorflow_compat;
447pub mod traits;
448pub mod zero;
449
450#[cfg(test)]
451pub mod tests;
452
453pub use adafactor_new::{AdaFactor, AdaFactorConfig};
454pub use adafisher_simple::{AdaFisher, AdaFisherConfig};
455pub use adam::{AdaBelief, Adam, AdamW, NAdam, RAdam};
456pub use adam_v2::{AdamConfig, StandardizedAdam, StandardizedAdamW};
457pub use adamax_plus::{AdaMaxPlus, AdaMaxPlusConfig};
458pub use adan::{Adan, AdanConfig};
459pub use adaptive::{create_ranger, create_ranger_with_config, AMSBound, AdaBound, Ranger};
460pub use ademamix::{AdEMAMix, AdEMAMixConfig};
461pub use advanced_2025_research::{AdaWin, AdaWinConfig, DiWo, DiWoConfig, MeZOV2, MeZOV2Config};
462pub use advanced_distributed_features::{
463    AutoScaler, AutoScalerConfig, CheckpointConfig as AdvancedCheckpointConfig, CheckpointInfo,
464    CostOptimizer, MLOptimizerConfig, OptimizationResult, OptimizationType, PerformanceMLOptimizer,
465    ScalingDecision, ScalingStrategy, SmartCheckpointManager, WorkloadPredictor,
466};
467pub use advanced_features::{
468    CheckpointConfig, FusedOptimizer, MemoryBandwidthOptimizer, MultiOptimizerStats,
469    MultiOptimizerTrainer, ResourceUtilization, WarmupOptimizer, WarmupStrategy,
470};
471pub use amacp::{AMacP, AMacPConfig, AMacPStats};
472pub use async_optim::{
473    AsyncSGD, AsyncSGDConfig, DelayCompensationMethod, DelayedGradient, DelayedGradientConfig,
474    ElasticAveraging, ElasticAveragingConfig, Hogwild, HogwildConfig, ParameterServer,
475};
476pub use averaged_adam::{AveragedAdam, AveragedAdamConfig};
477pub use bge_adam::{BGEAdam, BGEAdamConfig};
478pub use bge_adam_optimized::{OptimizedBGEAdam, OptimizedBGEAdamConfig};
479pub use cache_friendly::{
480    CacheConfig, CacheFriendlyAdam, CacheFriendlyState, CacheStats, ParameterMetadata,
481};
482pub use came::{CAMEConfig, CAME};
483pub use common::{
484    BiasCorrection, GradientProcessor, OptimizerState, ParameterIds, ParameterUpdate,
485    StateMemoryStats, WeightDecayMode,
486};
487pub use compression::{
488    CompressedAllReduce, CompressedGradient, CompressionMethod, GradientCompressor,
489};
490pub use continual_learning::{
491    AllocationStrategy, EWCConfig, FisherMethod, L2Regularization, L2RegularizationConfig,
492    MemoryReplay, MemoryReplayConfig, MemorySelectionStrategy, PackNet, PackNetConfig,
493    UpdateStrategy, EWC,
494};
495pub use convergence::{
496    AggMo, AggMoConfig, FISTAConfig, HeavyBall, HeavyBallConfig, NesterovAcceleratedGradient,
497    NesterovAcceleratedGradientConfig, QHMConfig, VarianceReduction, VarianceReductionConfig,
498    VarianceReductionMethod, FISTA, QHM,
499};
500pub use cpu_offload::{
501    create_cpu_offloaded_adam, create_cpu_offloaded_adamw, create_cpu_offloaded_sgd,
502    CPUOffloadConfig, CPUOffloadStats, CPUOffloadedOptimizer,
503};
504pub use cross_framework::{
505    ConfigSource, ConfigTarget, CrossFrameworkConverter, Framework, JAXOptimizerConfig,
506    PyTorchOptimizerConfig, TrustformeRSOptimizerConfig, UniversalOptimizerConfig,
507    UniversalOptimizerState,
508};
509pub use deep_distributed_qp::{DeepDistributedQP, DeepDistributedQPConfig};
510pub use enhanced_distributed_training::{
511    Bottleneck, CompressionConfig, CompressionType, DistributedConfig, DistributedTrainingStats,
512    DynamicBatchingConfig, EnhancedDistributedTrainer, FaultToleranceConfig,
513    MemoryOptimizationConfig, MonitoringConfig as DistributedMonitoringConfig,
514    PerformanceMetrics as DistributedPerformanceMetrics, PerformanceTrend, TrainingStepResult,
515};
516pub use eva::{EVAConfig, EVA};
517pub use federated::{
518    ClientInfo, ClientSelectionStrategy, DifferentialPrivacy, DifferentialPrivacyConfig, FedAvg,
519    FedAvgConfig, FedProx, FedProxConfig, NoiseMechanism, SecureAggregation,
520};
521#[cfg(target_arch = "x86_64")]
522pub use fusion::simd;
523pub use fusion::{FusedOperation, FusedOptimizerState, FusionConfig, FusionStats};
524pub use genie_stub::{DomainStats, GENIEConfig, GENIEStats, GENIE};
525pub use gradient_processing::{
526    AdaptiveClippingConfig, GradientProcessedOptimizer, GradientProcessingConfig,
527    HessianApproximationType, HessianPreconditioningConfig, NoiseInjectionConfig, NoiseType,
528    SmoothingConfig,
529};
530pub use hardware_aware::{
531    create_edge_optimizer, create_gpu_adam, create_mobile_optimizer, create_tpu_optimizer,
532    CompressionRatio, EdgeOptimizer, GPUAdam, HardwareAwareConfig, HardwareTarget, MobileOptimizer,
533    TPUOptimizer, TPUVersion,
534};
535pub use hierarchical_aggregation::{
536    AggregationStats, AggregationStrategy, ButterflyStructure, CommunicationGroups, FaultDetector,
537    HierarchicalAggregator, HierarchicalConfig, NodeTopology, RecoveryStrategy, RingStructure,
538    TreeStructure,
539};
540pub use hn_adam::{HNAdam, HNAdamConfig};
541pub use hyperparameter_tuning::{
542    BayesianOptimizer, HyperparameterSample, HyperparameterSpace, HyperparameterTuner,
543    MultiObjectiveOptimizer, OptimizationTask, OptimizerType,
544    PerformanceMetrics as HyperparameterPerformanceMetrics, TaskType as HyperparameterTaskType,
545};
546pub use jax_compat::{
547    JAXAdam, JAXAdamW, JAXChain, JAXCosineDecay, JAXCosineDecaySchedule, JAXExponentialDecay,
548    JAXGradientTransformation, JAXLearningRateSchedule, JAXOptState, JAXOptimizerFactory,
549    JAXOptimizerState, JAXWarmupCosineDecay, JAXSGD,
550};
551pub use kernel_fusion::{
552    CoalescingLevel, FusedGPUState, GPUMemoryStats, KernelFusedAdam, KernelFusionConfig,
553};
554pub use lamb::LAMB;
555pub use lancbio::{LancBiO, LancBiOConfig};
556pub use lion::{Lion, LionConfig};
557pub use lookahead::{
558    Lookahead, LookaheadAdam, LookaheadAdamW, LookaheadNAdam, LookaheadRAdam, LookaheadSGD,
559};
560pub use lora::{
561    create_lora_adam, create_lora_adamw, create_lora_sgd, LoRAAdapter, LoRAConfig, LoRAOptimizer,
562};
563pub use lora_rite_stub::{LoRARITE, LoRARITEConfig, LoRARITEStats, TransformationStats};
564pub use memory_layout::{
565    AlignedAllocator, AlignmentConfig, LayoutOptimizedAdam, LayoutStats, SoAOptimizerState,
566};
567pub use microadam::{MicroAdam, MicroAdamConfig};
568pub use monitoring::{
569    ConvergenceIndicators, ConvergenceSpeed, HyperparameterSensitivity,
570    HyperparameterSensitivityConfig, HyperparameterSensitivityMetrics, MemoryStats, MemoryUsage,
571    MetricStats, MonitoringConfig, OptimizerMetrics, OptimizerMonitor, OptimizerRecommendation,
572    OptimizerSelector, PerformanceStats, PerformanceTier,
573};
574pub use muon::{Muon, MuonConfig};
575pub use pde_aware::{PDEAwareConfig, PDEAwareOptimizer, PDEAwareStats};
576pub use prodigy::{Prodigy, ProdigyConfig};
// pub use optimizer::OptimizerState; // intentionally omitted: `OptimizerState` is already re-exported from `common` above
578pub use performance_validation::{
579    BenchmarkScenario, ConvergenceAnalysisResults, CorrectnessResults,
580    DistributedValidationResults, MathematicalProperty, MathematicalTestCase,
581    MemoryValidationResults, PerformanceBenchmarkResults, PerformanceValidator,
582    RegressionAnalysisResults, StatisticalMetrics, ValidationConfig, ValidationResults,
583};
584pub use pytorch_compat::{
585    PyTorchAdam, PyTorchAdamW, PyTorchLRScheduler, PyTorchOptimizer, PyTorchOptimizerFactory,
586    PyTorchOptimizerState, PyTorchParamGroup, PyTorchSGD,
587};
588pub use quantized::{Adam8bit, AdamW8bit, QuantizationConfig, QuantizedState};
589pub use quantized_advanced::{
590    Adam4bit, Adam4bitOptimizerConfig, AdvancedQuantizationConfig, GradientStatistics,
591    QuantizationMethod, QuantizationUtils, QuantizedTensor,
592};
593pub use quantum_inspired::{
594    QuantumAnnealingConfig, QuantumAnnealingOptimizer, QuantumAnnealingStats,
595};
596pub use schedule_free::{
597    ScheduleFreeAdam, ScheduleFreeAdamConfig, ScheduleFreeSGD, ScheduleFreeSGDConfig,
598};
599pub use scheduler::{
600    AdaptiveScheduler, CompositeScheduler, ConstantWithWarmupScheduler, CosineScheduler,
601    CosineWithRestartsScheduler, CyclicalMode, CyclicalScheduler, DynamicScheduler,
602    ExponentialScheduler, LRScheduler, LinearScheduler, OneCycleScheduler, Phase,
603    PhaseBasedScheduler, PolynomialScheduler, StepScheduler, SwitchCondition,
604    TaskSpecificScheduler, TaskType as SchedulerTaskType,
605};
606pub use second_order::{
607    LineSearchMethod, NewtonCG, SSBFGSConfig, SSBFGSStats, SSBroyden, SSBroydenConfig, LBFGS,
608    SSBFGS,
609};
610pub use sgd::SGD;
611pub use simd_optimizations::{SIMDConfig, SIMDOptimizer, SIMDPerformanceInfo};
612pub use sofo_stub::{
613    ForwardModeStats, MemoryStats as SOFOMemoryStats, SOFOConfig, SOFOStats, SOFO,
614};
615pub use sophia::{Sophia, SophiaConfig};
616pub use sparse::{SparseAdam, SparseConfig, SparseMomentumState, SparseSGD};
617pub use task_specific::{
618    create_bert_optimizer, create_gan_optimizer, create_maml_optimizer, create_ppo_optimizer,
619    BERTOptimizer, GANOptimizer, MetaOptimizer as TaskMetaOptimizer, RLOptimizer,
620};
621pub use tensorflow_compat::{
622    TensorFlowAdam, TensorFlowAdamW, TensorFlowCosineDecay, TensorFlowExponentialDecay,
623    TensorFlowLearningRateSchedule, TensorFlowOptimizer, TensorFlowOptimizerConfig,
624    TensorFlowOptimizerFactory,
625};
626pub use traits::{
627    AdaptiveMomentumOptimizer, AsyncOptimizer, ClassicalMomentumOptimizer, CompositeOptimizer,
628    DistributedOptimizer, FederatedOptimizer, GPUOptimizer, GradientCompressionOptimizer,
629    HardwareOptimizer, HardwareStats, LookaheadOptimizer, MetaOptimizer, MomentumOptimizer,
630    OptimizerFactory, ScheduledOptimizer, SecondOrderOptimizer, SerializableOptimizer,
631    StalenessCompensation, StatefulOptimizer,
632};
633pub use zero::{
634    all_gather_gradients, gather_parameters, partition_gradients, partition_parameters,
635    reduce_scatter_gradients, GradientBuffer, ParameterGroup, ParameterPartition, ZeROConfig,
636    ZeROImplementationStage, ZeROMemoryStats, ZeROOptimizer, ZeROStage, ZeROStage1, ZeROStage2,
637    ZeROStage3, ZeROState,
638};
639
640pub use multinode::{MultiNodeConfig, MultiNodeStats, MultiNodeTrainer};
641pub use novograd::{MemoryEfficiencyStats, NovoGrad, NovoGradConfig, NovoGradStats};
642pub use onnx_export::{
643    ONNXExportConfig, ONNXGraph, ONNXModel, ONNXNode, ONNXOptimizerExporter, ONNXOptimizerMetadata,
644    OptimizerConfig,
645};
646pub use parallel::{BatchUpdate, ParallelAdam, ParallelConfig, ParallelStats};