// trustformers_optim/async_optim.rs
//! # Asynchronous Optimization Methods
//!
//! This module implements asynchronous optimization algorithms for distributed
//! training where workers can update parameters without strict synchronization.
//!
//! ## Available Methods
//!
//! - **Async SGD**: Asynchronous stochastic gradient descent
//! - **Hogwild!**: Lock-free asynchronous SGD for sparse features
//! - **Delayed Gradient**: Methods that handle stale gradients
//! - **Elastic Averaging SGD**: Combines local and global parameter averaging

use std::collections::HashMap;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use std::time::{Duration, Instant};

use anyhow::Result;
use parking_lot::{Mutex, RwLock};
use serde::{Deserialize, Serialize};

use trustformers_core::tensor::Tensor;

22/// Configuration for asynchronous SGD.
23#[derive(Debug, Clone, Serialize, Deserialize)]
24pub struct AsyncSGDConfig {
25    /// Learning rate
26    pub learning_rate: f32,
27    /// Momentum coefficient
28    pub momentum: f32,
29    /// Weight decay
30    pub weight_decay: f32,
31    /// Maximum allowed staleness for gradient updates
32    pub max_staleness: usize,
33    /// Staleness adaptive factor
34    pub staleness_factor: f32,
35}
36
37impl Default for AsyncSGDConfig {
38    fn default() -> Self {
39        Self {
40            learning_rate: 1e-3,
41            momentum: 0.9,
42            weight_decay: 0.0,
43            max_staleness: 10,
44            staleness_factor: 0.9,
45        }
46    }
47}
48
49/// Configuration for Hogwild! optimizer.
50#[derive(Debug, Clone, Serialize, Deserialize)]
51pub struct HogwildConfig {
52    /// Learning rate
53    pub learning_rate: f32,
54    /// Sparse update ratio (fraction of parameters updated per step)
55    pub sparse_ratio: f32,
56    /// Maximum number of concurrent workers
57    pub max_workers: usize,
58}
59
60impl Default for HogwildConfig {
61    fn default() -> Self {
62        Self {
63            learning_rate: 1e-3,
64            sparse_ratio: 0.1,
65            max_workers: 4,
66        }
67    }
68}
69
70/// Configuration for delayed gradient methods.
71#[derive(Debug, Clone, Serialize, Deserialize)]
72pub struct DelayedGradientConfig {
73    /// Base learning rate
74    pub learning_rate: f32,
75    /// Maximum gradient delay (in steps)
76    pub max_delay: usize,
77    /// Delay compensation method
78    pub compensation_method: DelayCompensationMethod,
79    /// Compensation factor
80    pub compensation_factor: f32,
81}
82
83impl Default for DelayedGradientConfig {
84    fn default() -> Self {
85        Self {
86            learning_rate: 1e-3,
87            max_delay: 20,
88            compensation_method: DelayCompensationMethod::LinearDecay,
89            compensation_factor: 0.5,
90        }
91    }
92}
93
94/// Methods for compensating gradient delays.
95#[derive(Debug, Clone, Serialize, Deserialize)]
96pub enum DelayCompensationMethod {
97    /// No compensation
98    None,
99    /// Linear decay based on delay
100    LinearDecay,
101    /// Exponential decay based on delay
102    ExponentialDecay,
103    /// Adaptive compensation
104    Adaptive,
105}
106
107/// Configuration for Elastic Averaging SGD.
108#[derive(Debug, Clone, Serialize, Deserialize)]
109pub struct ElasticAveragingConfig {
110    /// Learning rate
111    pub learning_rate: f32,
112    /// Elastic force coefficient
113    pub alpha: f32,
114    /// Communication period (steps between synchronization)
115    pub tau: usize,
116    /// Beta parameter for moving average
117    pub beta: f32,
118}
119
120impl Default for ElasticAveragingConfig {
121    fn default() -> Self {
122        Self {
123            learning_rate: 1e-3,
124            alpha: 0.6,
125            tau: 10,
126            beta: 0.9,
127        }
128    }
129}
130
131/// Shared parameter server for asynchronous optimization.
132pub struct ParameterServer {
133    /// Global parameters
134    parameters: Arc<RwLock<Vec<Tensor>>>,
135    /// Global step counter
136    global_step: AtomicUsize,
137    /// Parameter version counters
138    version_counters: Arc<Mutex<Vec<usize>>>,
139    /// Worker update timestamps
140    worker_timestamps: Arc<Mutex<HashMap<usize, Instant>>>,
141}
142
143impl ParameterServer {
144    /// Create a new parameter server.
145    pub fn new(initial_parameters: Vec<Tensor>) -> Self {
146        let param_count = initial_parameters.len();
147        Self {
148            parameters: Arc::new(RwLock::new(initial_parameters)),
149            global_step: AtomicUsize::new(0),
150            version_counters: Arc::new(Mutex::new(vec![0; param_count])),
151            worker_timestamps: Arc::new(Mutex::new(HashMap::new())),
152        }
153    }
154
155    /// Get current parameters for a worker.
156    pub fn get_parameters(&self, worker_id: usize) -> Result<(Vec<Tensor>, Vec<usize>)> {
157        let params = self.parameters.read().clone();
158        let versions = self.version_counters.lock().clone();
159
160        // Update worker timestamp
161        let mut timestamps = self.worker_timestamps.lock();
162        timestamps.insert(worker_id, Instant::now());
163
164        Ok((params, versions))
165    }
166
167    /// Update parameters with gradients from a worker.
168    pub fn update_parameters(
169        &self,
170        worker_id: usize,
171        gradients: Vec<Tensor>,
172        param_versions: Vec<usize>,
173        learning_rate: f32,
174    ) -> Result<()> {
175        let _current_step = self.global_step.load(Ordering::SeqCst);
176
177        // Check staleness
178        let staleness = self.compute_staleness(worker_id, &param_versions)?;
179        if staleness > 10 {
180            // Skip very stale updates
181            return Ok(());
182        }
183
184        // Apply staleness compensation
185        let compensated_lr = learning_rate * (1.0 / (1.0 + staleness as f32 * 0.1));
186
187        // Update parameters
188        {
189            let mut params = self.parameters.write();
190            let mut versions = self.version_counters.lock();
191
192            for (i, gradient) in gradients.iter().enumerate() {
193                if i < params.len() {
194                    let update = gradient.mul_scalar(compensated_lr)?;
195                    params[i] = params[i].sub(&update)?;
196                    versions[i] += 1;
197                }
198            }
199        }
200
201        self.global_step.fetch_add(1, Ordering::SeqCst);
202        Ok(())
203    }
204
205    fn compute_staleness(&self, _worker_id: usize, param_versions: &[usize]) -> Result<usize> {
206        let current_versions = self.version_counters.lock();
207        let max_staleness = param_versions
208            .iter()
209            .zip(current_versions.iter())
210            .map(|(old, new)| new.saturating_sub(*old))
211            .max()
212            .unwrap_or(0);
213        Ok(max_staleness)
214    }
215
216    /// Get current global step.
217    pub fn get_global_step(&self) -> usize {
218        self.global_step.load(Ordering::SeqCst)
219    }
220}
221
222/// Asynchronous SGD optimizer.
223pub struct AsyncSGD {
224    config: AsyncSGDConfig,
225    worker_id: usize,
226    parameter_server: Arc<ParameterServer>,
227    momentum_buffers: Vec<Tensor>,
228    local_parameters: Vec<Tensor>,
229    param_versions: Vec<usize>,
230    last_sync_step: usize,
231}
232
233impl AsyncSGD {
234    /// Create a new async SGD optimizer.
235    pub fn new(
236        config: AsyncSGDConfig,
237        worker_id: usize,
238        parameter_server: Arc<ParameterServer>,
239    ) -> Result<Self> {
240        let (params, versions) = parameter_server.get_parameters(worker_id)?;
241        let param_count = params.len();
242
243        Ok(Self {
244            config,
245            worker_id,
246            parameter_server,
247            momentum_buffers: (0..param_count)
248                .map(|i| Tensor::zeros(&params[i].shape()).map_err(anyhow::Error::from))
249                .collect::<Result<Vec<_>>>()?,
250            local_parameters: params,
251            param_versions: versions,
252            last_sync_step: 0,
253        })
254    }
255
256    /// Perform an optimization step.
257    pub fn step(&mut self, gradients: &[Tensor]) -> Result<()> {
258        // Check if we need to sync with parameter server
259        let current_step = self.parameter_server.get_global_step();
260        let staleness = current_step - self.last_sync_step;
261
262        if staleness > self.config.max_staleness {
263            self.sync_with_server()?;
264        }
265
266        // Apply momentum and update local parameters
267        for (i, gradient) in gradients.iter().enumerate() {
268            if i < self.local_parameters.len() {
269                // Apply weight decay
270                let effective_grad = if self.config.weight_decay > 0.0 {
271                    gradient.add(&self.local_parameters[i].mul_scalar(self.config.weight_decay)?)?
272                } else {
273                    gradient.clone()
274                };
275
276                // Update momentum
277                self.momentum_buffers[i] = self.momentum_buffers[i]
278                    .mul_scalar(self.config.momentum)?
279                    .add(&effective_grad)?;
280
281                // Apply staleness compensation
282                let staleness_factor = self.config.staleness_factor.powi(staleness as i32);
283                let compensated_lr = self.config.learning_rate * staleness_factor;
284
285                // Update local parameters
286                let update = self.momentum_buffers[i].mul_scalar(compensated_lr)?;
287                self.local_parameters[i] = self.local_parameters[i].sub(&update)?;
288            }
289        }
290
291        // Send updates to parameter server periodically
292        if current_step % 5 == 0 {
293            self.push_to_server(gradients)?;
294        }
295
296        Ok(())
297    }
298
299    fn sync_with_server(&mut self) -> Result<()> {
300        let (params, versions) = self.parameter_server.get_parameters(self.worker_id)?;
301        self.local_parameters = params;
302        self.param_versions = versions;
303        self.last_sync_step = self.parameter_server.get_global_step();
304        Ok(())
305    }
306
307    fn push_to_server(&self, gradients: &[Tensor]) -> Result<()> {
308        self.parameter_server.update_parameters(
309            self.worker_id,
310            gradients.to_vec(),
311            self.param_versions.clone(),
312            self.config.learning_rate,
313        )
314    }
315
316    /// Get current local parameters.
317    pub fn get_parameters(&self) -> &[Tensor] {
318        &self.local_parameters
319    }
320}
321
322/// Hogwild! optimizer for sparse features.
323pub struct Hogwild {
324    config: HogwildConfig,
325    #[allow(dead_code)]
326    worker_id: usize,
327    shared_parameters: Arc<RwLock<Vec<Tensor>>>,
328    local_step: usize,
329}
330
331impl Hogwild {
332    /// Create a new Hogwild! optimizer.
333    pub fn new(
334        config: HogwildConfig,
335        worker_id: usize,
336        shared_parameters: Arc<RwLock<Vec<Tensor>>>,
337    ) -> Self {
338        Self {
339            config,
340            worker_id,
341            shared_parameters,
342            local_step: 0,
343        }
344    }
345
346    /// Perform sparse parameter update.
347    pub fn sparse_step(&mut self, sparse_gradients: &[(usize, Tensor)]) -> Result<()> {
348        // Lock-free updates for sparse gradients
349        // In practice, this would use atomic operations for true lock-free behavior
350
351        for &(param_idx, ref gradient) in sparse_gradients {
352            {
353                let params = self.shared_parameters.read();
354                if param_idx >= params.len() {
355                    continue;
356                }
357            } // Release read lock
358
359            // This is a simplified version - real Hogwild! uses lock-free atomic updates
360            let mut params_write = self.shared_parameters.write();
361            let update = gradient.mul_scalar(self.config.learning_rate)?;
362            params_write[param_idx] = params_write[param_idx].sub(&update)?;
363        }
364
365        self.local_step += 1;
366        Ok(())
367    }
368
369    /// Generate sparse gradient indices based on sparse ratio.
370    pub fn select_sparse_indices(&self, total_params: usize) -> Vec<usize> {
371        use scirs2_core::random::*; // SciRS2 Integration Policy
372
373        let num_sparse = (total_params as f32 * self.config.sparse_ratio) as usize;
374        let mut indices: Vec<usize> = (0..total_params).collect();
375        let mut rng = thread_rng();
376        indices.shuffle(rng.rng_mut());
377        indices.truncate(num_sparse);
378        indices
379    }
380}
381
382/// Delayed gradient optimizer.
383pub struct DelayedGradient {
384    config: DelayedGradientConfig,
385    parameters: Vec<Tensor>,
386    gradient_buffer: Vec<(Tensor, usize, Instant)>, // (gradient, delay, timestamp)
387    current_step: usize,
388}
389
390impl DelayedGradient {
391    /// Create a new delayed gradient optimizer.
392    pub fn new(config: DelayedGradientConfig, initial_parameters: Vec<Tensor>) -> Self {
393        Self {
394            config,
395            parameters: initial_parameters,
396            gradient_buffer: Vec::new(),
397            current_step: 0,
398        }
399    }
400
401    /// Add a delayed gradient to the buffer.
402    pub fn add_delayed_gradient(&mut self, gradient: Tensor, delay: usize) {
403        self.gradient_buffer.push((gradient, delay, Instant::now()));
404    }
405
406    /// Process delayed gradients and update parameters.
407    pub fn step(&mut self) -> Result<()> {
408        self.current_step += 1;
409
410        // Process gradients that are ready
411        let mut i = 0;
412        while i < self.gradient_buffer.len() {
413            let (ref gradient, delay, timestamp) = &self.gradient_buffer[i];
414            let age = timestamp.elapsed();
415
416            if age >= Duration::from_millis((*delay as u64) * 10) {
417                // Apply delay compensation
418                let compensation = self.compute_delay_compensation(*delay)?;
419                let compensated_lr = self.config.learning_rate * compensation;
420
421                // Update parameters
422                for (j, param) in self.parameters.iter_mut().enumerate() {
423                    if j < 1 {
424                        // Assuming single parameter for simplicity
425                        let update = gradient.mul_scalar(compensated_lr)?;
426                        *param = param.sub(&update)?;
427                    }
428                }
429
430                self.gradient_buffer.remove(i);
431            } else {
432                i += 1;
433            }
434        }
435
436        Ok(())
437    }
438
439    fn compute_delay_compensation(&self, delay: usize) -> Result<f32> {
440        if delay > self.config.max_delay {
441            return Ok(0.0); // Discard very old gradients
442        }
443
444        let delay_ratio = delay as f32 / self.config.max_delay as f32;
445
446        let compensation = match self.config.compensation_method {
447            DelayCompensationMethod::None => 1.0,
448            DelayCompensationMethod::LinearDecay => {
449                1.0 - delay_ratio * self.config.compensation_factor
450            },
451            DelayCompensationMethod::ExponentialDecay => {
452                (-delay_ratio * self.config.compensation_factor).exp()
453            },
454            DelayCompensationMethod::Adaptive => {
455                // Simple adaptive scheme
456                1.0 / (1.0 + delay_ratio * self.config.compensation_factor)
457            },
458        };
459
460        Ok(compensation.max(0.1)) // Minimum 10% of original learning rate
461    }
462
463    /// Get current parameters.
464    pub fn get_parameters(&self) -> &[Tensor] {
465        &self.parameters
466    }
467}
468
469/// Elastic Averaging SGD optimizer.
470pub struct ElasticAveraging {
471    config: ElasticAveragingConfig,
472    #[allow(dead_code)]
473    worker_id: usize,
474    local_parameters: Vec<Tensor>,
475    global_parameters: Arc<RwLock<Vec<Tensor>>>,
476    elastic_force: Vec<Tensor>,
477    local_step: usize,
478    last_communication: usize,
479}
480
481impl ElasticAveraging {
482    /// Create a new Elastic Averaging SGD optimizer.
483    pub fn new(
484        config: ElasticAveragingConfig,
485        worker_id: usize,
486        global_parameters: Arc<RwLock<Vec<Tensor>>>,
487    ) -> Result<Self> {
488        let global_params = global_parameters.read().clone();
489        let param_count = global_params.len();
490
491        Ok(Self {
492            config,
493            worker_id,
494            local_parameters: global_params.clone(),
495            global_parameters,
496            elastic_force: (0..param_count)
497                .map(|i| Tensor::zeros(&global_params[i].shape()).map_err(anyhow::Error::from))
498                .collect::<Result<Vec<_>>>()?,
499            local_step: 0,
500            last_communication: 0,
501        })
502    }
503
504    /// Perform optimization step with elastic averaging.
505    pub fn step(&mut self, gradients: &[Tensor]) -> Result<()> {
506        // Update local parameters with gradients
507        for (i, gradient) in gradients.iter().enumerate() {
508            if i < self.local_parameters.len() {
509                let update = gradient.mul_scalar(self.config.learning_rate)?;
510                self.local_parameters[i] = self.local_parameters[i].sub(&update)?;
511            }
512        }
513
514        // Apply elastic force
515        let global_params = self.global_parameters.read();
516        for i in 0..self.local_parameters.len() {
517            let diff = self.local_parameters[i].sub(&global_params[i])?;
518            self.elastic_force[i] = diff.mul_scalar(self.config.alpha)?;
519            let elastic_update = self.elastic_force[i].mul_scalar(self.config.learning_rate)?;
520            self.local_parameters[i] = self.local_parameters[i].sub(&elastic_update)?;
521        }
522        drop(global_params);
523
524        self.local_step += 1;
525
526        // Communicate with global parameters periodically
527        if self.local_step - self.last_communication >= self.config.tau {
528            self.communicate_with_global()?;
529            self.last_communication = self.local_step;
530        }
531
532        Ok(())
533    }
534
535    fn communicate_with_global(&mut self) -> Result<()> {
536        let mut global_params = self.global_parameters.write();
537
538        // Update global parameters with moving average
539        for i in 0..global_params.len() {
540            let local_contrib = self.local_parameters[i].mul_scalar(1.0 - self.config.beta)?;
541            let global_contrib = global_params[i].mul_scalar(self.config.beta)?;
542            global_params[i] = local_contrib.add(&global_contrib)?;
543        }
544
545        // Update local parameters from global
546        self.local_parameters = global_params.clone();
547
548        Ok(())
549    }
550
551    /// Get current local parameters.
552    pub fn get_parameters(&self) -> &[Tensor] {
553        &self.local_parameters
554    }
555}
556
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_async_sgd_config() {
        // Defaults must match the documented values.
        let cfg = AsyncSGDConfig::default();
        assert_eq!(cfg.learning_rate, 1e-3);
        assert_eq!(cfg.momentum, 0.9);
        assert_eq!(cfg.max_staleness, 10);
    }

    #[test]
    fn test_hogwild_config() {
        let cfg = HogwildConfig::default();
        assert_eq!(cfg.learning_rate, 1e-3);
        assert_eq!(cfg.sparse_ratio, 0.1);
        assert_eq!(cfg.max_workers, 4);
    }

    #[test]
    fn test_delayed_gradient_config() {
        let cfg = DelayedGradientConfig::default();
        assert_eq!(cfg.learning_rate, 1e-3);
        assert_eq!(cfg.max_delay, 20);
        // LinearDecay is the default compensation strategy.
        assert!(matches!(
            cfg.compensation_method,
            DelayCompensationMethod::LinearDecay
        ));
    }

    #[test]
    fn test_parameter_server_creation() {
        // A fresh server starts at global step zero.
        let server = ParameterServer::new(vec![Tensor::zeros(&[10]).unwrap()]);
        assert_eq!(server.get_global_step(), 0);
    }

    #[test]
    fn test_elastic_averaging_config() {
        let cfg = ElasticAveragingConfig::default();
        assert_eq!(cfg.learning_rate, 1e-3);
        assert_eq!(cfg.alpha, 0.6);
        assert_eq!(cfg.tau, 10);
        assert_eq!(cfg.beta, 0.9);
    }
}