tenflowers-core 0.1.1

Core tensor operations and execution engine for TenfloweRS
#![allow(clippy::result_large_err)]

use crate::{Device, Result, Tensor, TensorError};
use std::collections::HashMap;
use std::sync::{Arc, Mutex};

/// Collective operation types for multi-GPU communication
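///
/// A construction sketch (illustrative only, not a doctest):
///
/// ```ignore
/// // Average gradients everywhere, then push parameters out from the CPU.
/// let reduce = CollectiveOp::AllReduce(ReductionOp::Mean);
/// let bcast = CollectiveOp::Broadcast { src_device: Device::Cpu };
/// ```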
#[derive(Debug, Clone)]
pub enum CollectiveOp {
    /// Reduce values across all devices using the specified reduction operation
    AllReduce(ReductionOp),
    /// Broadcast tensor from source device to all devices
    Broadcast { src_device: Device },
    /// Gather tensors from all devices onto every device in the group
    AllGather,
    /// Reduce-scatter: reduce and distribute results across devices
    ReduceScatter(ReductionOp),
    /// Send tensor from one device to another
    Send {
        src_device: Device,
        dst_device: Device,
    },
    /// Receive tensor on one device from another
    Recv {
        src_device: Device,
        dst_device: Device,
    },
}

/// Reduction operations for collective communication
#[derive(Debug, Clone, Copy)]
pub enum ReductionOp {
    Sum,
    Mean,
    Max,
    Min,
    Product,
}

/// Communication group for multi-device operations
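///
/// A minimal usage sketch with a single CPU device (not a doctest):
///
/// ```ignore
/// let group = CommunicationGroup::new(vec![Device::Cpu]);
/// assert_eq!(group.size(), 1);
/// assert_eq!(group.rank(&Device::Cpu), Some(0));
/// assert_eq!(group.device_at_rank(0), Some(Device::Cpu));
/// ```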
#[derive(Debug, Clone)]
pub struct CommunicationGroup {
    devices: Vec<Device>,
    rank_map: HashMap<Device, usize>,
}

impl CommunicationGroup {
    /// Create a new communication group with the specified devices
    pub fn new(devices: Vec<Device>) -> Self {
        let rank_map = devices
            .iter()
            .enumerate()
            .map(|(rank, &device)| (device, rank))
            .collect();

        Self { devices, rank_map }
    }

    /// Get all devices in the group
    pub fn devices(&self) -> &[Device] {
        &self.devices
    }

    /// Get rank of a device in the group
    pub fn rank(&self, device: &Device) -> Option<usize> {
        self.rank_map.get(device).copied()
    }

    /// Get device at a specific rank
    pub fn device_at_rank(&self, rank: usize) -> Option<Device> {
        self.devices.get(rank).copied()
    }

    /// Get number of devices in the group
    pub fn size(&self) -> usize {
        self.devices.len()
    }
}

/// Collective operations manager
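///
/// A minimal setup sketch (not a doctest):
///
/// ```ignore
/// let mut manager = CollectiveManager::new();
/// manager.create_group("workers".to_string(), vec![Device::Cpu])?;
/// // The first group created becomes the default automatically.
/// assert!(manager.get_default_group().is_some());
/// ```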
pub struct CollectiveManager {
    groups: HashMap<String, CommunicationGroup>,
    default_group: Option<String>,
}

impl CollectiveManager {
    pub fn new() -> Self {
        Self {
            groups: HashMap::new(),
            default_group: None,
        }
    }

    /// Create a communication group
    pub fn create_group(&mut self, name: String, devices: Vec<Device>) -> Result<()> {
        if devices.is_empty() {
            return Err(TensorError::invalid_argument(
                "Communication group cannot be empty".to_string(),
            ));
        }

        let group = CommunicationGroup::new(devices);
        self.groups.insert(name.clone(), group);

        if self.default_group.is_none() {
            self.default_group = Some(name);
        }

        Ok(())
    }

    /// Set default communication group
    pub fn set_default_group(&mut self, name: String) -> Result<()> {
        if !self.groups.contains_key(&name) {
            return Err(TensorError::invalid_argument(format!(
                "Group '{name}' does not exist"
            )));
        }

        self.default_group = Some(name);
        Ok(())
    }

    /// Get communication group by name
    pub fn get_group(&self, name: &str) -> Option<&CommunicationGroup> {
        self.groups.get(name)
    }

    /// Get default communication group
    pub fn get_default_group(&self) -> Option<&CommunicationGroup> {
        self.default_group
            .as_ref()
            .and_then(|name| self.groups.get(name))
    }

    /// Perform AllReduce operation
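    ///
    /// A minimal sketch, assuming a `Tensor::<f32>::ones` constructor and a
    /// registered `"workers"` group (not a doctest):
    ///
    /// ```ignore
    /// let grads = Tensor::<f32>::ones(&[4]);
    /// let reduced = manager.all_reduce(&grads, ReductionOp::Mean, Some("workers"))?;
    /// ```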
    pub fn all_reduce<T>(
        &self,
        tensor: &Tensor<T>,
        op: ReductionOp,
        group_name: Option<&str>,
    ) -> Result<Tensor<T>>
    where
        T: Clone
            + Default
            + Send
            + Sync
            + 'static
            + bytemuck::Pod
            + scirs2_core::num_traits::Zero
            + scirs2_core::num_traits::One
            + std::ops::Add<Output = T>
            + PartialOrd
            + std::ops::Mul<Output = T>
            + scirs2_core::num_traits::Float,
    {
        let group = if let Some(name) = group_name {
            self.get_group(name)
                .ok_or_else(|| TensorError::invalid_argument(format!("Group '{name}' not found")))?
        } else {
            self.get_default_group()
                .ok_or_else(|| TensorError::invalid_argument("No default group set".to_string()))?
        };

        // For now, implement a simple CPU-based reduction
        // In a real implementation, this would use optimized device-to-device communication
        self.simple_all_reduce(tensor, op, group)
    }

    /// Broadcast tensor from source device to all devices in group
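    ///
    /// Returns one tensor per device in the group. Sketch (not a doctest):
    ///
    /// ```ignore
    /// let params = Tensor::<f32>::ones(&[4]);
    /// let copies = manager.broadcast(&params, Device::Cpu, Some("workers"))?;
    /// assert_eq!(copies.len(), manager.get_group("workers").unwrap().size());
    /// ```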
    pub fn broadcast<T>(
        &self,
        tensor: &Tensor<T>,
        src_device: Device,
        group_name: Option<&str>,
    ) -> Result<Vec<Tensor<T>>>
    where
        T: Clone
            + Default
            + Send
            + Sync
            + 'static
            + bytemuck::Pod
            + scirs2_core::num_traits::Zero
            + scirs2_core::num_traits::One,
    {
        let group = if let Some(name) = group_name {
            self.get_group(name)
                .ok_or_else(|| TensorError::invalid_argument(format!("Group '{name}' not found")))?
        } else {
            self.get_default_group()
                .ok_or_else(|| TensorError::invalid_argument("No default group set".to_string()))?
        };

        if !group.devices().contains(&src_device) {
            return Err(TensorError::invalid_argument(
                "Source device not in communication group".to_string(),
            ));
        }

        if tensor.device() != &src_device {
            return Err(TensorError::device_mismatch(
                "broadcast",
                &src_device.to_string(),
                &tensor.device().to_string(),
            ));
        }

        // Broadcast to all devices in the group
        let mut results = Vec::new();
        for &device in group.devices() {
            let broadcasted_tensor = tensor.to_device(device)?;
            results.push(broadcasted_tensor);
        }

        Ok(results)
    }

    /// Gather tensors from all devices, leaving a copy on every device in the group
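    ///
    /// Returns one copy per device, staged through the CPU. Sketch (not a doctest):
    ///
    /// ```ignore
    /// let shard = Tensor::<f32>::ones(&[4]);
    /// let gathered = manager.all_gather(&shard, Some("workers"))?;
    /// assert_eq!(gathered.len(), manager.get_group("workers").unwrap().size());
    /// ```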
    pub fn all_gather<T>(
        &self,
        tensor: &Tensor<T>,
        group_name: Option<&str>,
    ) -> Result<Vec<Tensor<T>>>
    where
        T: Clone
            + Default
            + Send
            + Sync
            + 'static
            + bytemuck::Pod
            + scirs2_core::num_traits::Zero
            + scirs2_core::num_traits::One,
    {
        let group = if let Some(name) = group_name {
            self.get_group(name)
                .ok_or_else(|| TensorError::invalid_argument(format!("Group '{name}' not found")))?
        } else {
            self.get_default_group()
                .ok_or_else(|| TensorError::invalid_argument("No default group set".to_string()))?
        };

        // For simplicity, stage the local tensor through the CPU and copy it
        // to every device; a real implementation would exchange tensors
        // between devices directly.
        let cpu_tensor = tensor.to_cpu()?;

        let mut results = Vec::new();
        for &device in group.devices() {
            let device_tensor = cpu_tensor.to_device(device)?;
            results.push(device_tensor);
        }

        Ok(results)
    }

    /// Simplified AllReduce used as a single-process stand-in for real
    /// device-to-device gradient aggregation
    fn simple_all_reduce<T>(
        &self,
        tensor: &Tensor<T>,
        op: ReductionOp,
        group: &CommunicationGroup,
    ) -> Result<Tensor<T>>
    where
        T: Clone
            + Default
            + Send
            + Sync
            + 'static
            + bytemuck::Pod
            + scirs2_core::num_traits::Zero
            + scirs2_core::num_traits::One
            + std::ops::Add<Output = T>
            + PartialOrd
            + std::ops::Mul<Output = T>
            + scirs2_core::num_traits::Float,
    {
        // Move tensor to CPU for reduction
        let cpu_tensor = tensor.to_cpu()?;

        // In a real distributed environment, we would:
        // 1. Collect tensors from all devices in the group
        // 2. Perform the reduction operation
        // 3. Broadcast the result back to all devices

        // For now, we'll implement a basic reduction that can be extended
        // In practice, this would use MPI, NCCL, or similar communication libraries

        let group_size = group.size();
        if group_size <= 1 {
            // No reduction needed for single device
            return cpu_tensor.to_device(tensor.device().clone());
        }

        // In this single-process build there are no peer tensors to collect,
        // so the local tensor stands in for the accumulated result.
        let accumulated_tensor = cpu_tensor.clone();

        match op {
            ReductionOp::Sum => {
                // For gradient aggregation, we typically sum gradients across devices
                // This is a simplified implementation that could be extended
                // In practice, you'd receive tensors from other devices and sum them
                accumulated_tensor.to_device(tensor.device().clone())
            }
            ReductionOp::Mean => {
                // For mean, sum across devices and divide by group size
                // (standard in distributed training). Without the peer-sum
                // phase, this placeholder only scales the local tensor by
                // 1/group_size.
                if let Some(data) = accumulated_tensor.as_slice() {
                    let mean_data: Vec<T> = data
                        .iter()
                        .map(|&x| {
                            x / T::from(group_size)
                                .expect("group_size should convert to numeric type")
                        })
                        .collect();

                    let mean_tensor =
                        Tensor::from_vec(mean_data, accumulated_tensor.shape().dims())?;
                    mean_tensor.to_device(tensor.device().clone())
                } else {
                    // Fallback for non-CPU tensors
                    accumulated_tensor.to_device(tensor.device().clone())
                }
            }
            ReductionOp::Max | ReductionOp::Min | ReductionOp::Product => {
                // Placeholder for the element-wise max/min/product across
                // devices: with only the local tensor available, each of
                // these reductions is the identity.
                accumulated_tensor.to_device(tensor.device().clone())
            }
        }
    }

    /// Perform gradient AllReduce for distributed training
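    ///
    /// Typical data-parallel step, sketched (not a doctest):
    ///
    /// ```ignore
    /// // `local_grads` would come from a backward pass on this device's batch.
    /// let local_grads = vec![Tensor::<f32>::ones(&[4]), Tensor::<f32>::ones(&[2, 2])];
    /// let averaged = manager.all_reduce_gradients(&local_grads, None)?;
    /// assert_eq!(averaged.len(), local_grads.len());
    /// ```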
    pub fn all_reduce_gradients<T>(
        &self,
        gradients: &[Tensor<T>],
        group_name: Option<&str>,
    ) -> Result<Vec<Tensor<T>>>
    where
        T: Clone
            + Default
            + Send
            + Sync
            + 'static
            + bytemuck::Pod
            + scirs2_core::num_traits::Zero
            + scirs2_core::num_traits::One
            + std::ops::Add<Output = T>
            + PartialOrd
            + std::ops::Mul<Output = T>
            + scirs2_core::num_traits::Float,
    {
        let group = if let Some(name) = group_name {
            self.get_group(name)
                .ok_or_else(|| TensorError::invalid_argument(format!("Group '{name}' not found")))?
        } else {
            self.get_default_group()
                .ok_or_else(|| TensorError::invalid_argument("No default group set".to_string()))?
        };

        let mut reduced_gradients = Vec::new();

        for gradient in gradients {
            // Use mean reduction for gradient aggregation (standard in distributed training)
            let reduced_gradient = self.simple_all_reduce(gradient, ReductionOp::Mean, group)?;
            reduced_gradients.push(reduced_gradient);
        }

        Ok(reduced_gradients)
    }

    /// Synchronize parameters across all devices (for initialization)
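    ///
    /// The result is indexed `[parameter][device]`. Sketch (not a doctest):
    ///
    /// ```ignore
    /// let synced = manager.sync_parameters(&params, Device::Cpu, None)?;
    /// // synced[i][r] is parameter `i` on the device at rank `r`.
    /// ```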
    pub fn sync_parameters<T>(
        &self,
        parameters: &[Tensor<T>],
        src_device: Device,
        group_name: Option<&str>,
    ) -> Result<Vec<Vec<Tensor<T>>>>
    where
        T: Clone
            + Default
            + Send
            + Sync
            + 'static
            + bytemuck::Pod
            + scirs2_core::num_traits::Zero
            + scirs2_core::num_traits::One,
    {
        let _group = if let Some(name) = group_name {
            self.get_group(name)
                .ok_or_else(|| TensorError::invalid_argument(format!("Group '{name}' not found")))?
        } else {
            self.get_default_group()
                .ok_or_else(|| TensorError::invalid_argument("No default group set".to_string()))?
        };

        let mut synced_parameters = Vec::new();

        for parameter in parameters {
            // Broadcast parameter from source device to all devices
            let broadcasted = self.broadcast(parameter, src_device, group_name)?;
            synced_parameters.push(broadcasted);
        }

        Ok(synced_parameters)
    }

    /// Reduce gradients using ring AllReduce algorithm (more efficient for large models)
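    ///
    /// With `n` devices and a tensor of `d` elements split into `n` chunks,
    /// each device transfers roughly `2 * d * (n - 1) / n` elements in total:
    /// `(n - 1)` scatter-reduce steps plus `(n - 1)` all-gather steps of
    /// `d / n` elements each. Sketch (not a doctest):
    ///
    /// ```ignore
    /// let reduced = manager.ring_all_reduce(&grads, Some("workers"))?;
    /// ```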
    pub fn ring_all_reduce<T>(
        &self,
        tensor: &Tensor<T>,
        group_name: Option<&str>,
    ) -> Result<Tensor<T>>
    where
        T: Clone
            + Default
            + Send
            + Sync
            + 'static
            + bytemuck::Pod
            + scirs2_core::num_traits::Zero
            + scirs2_core::num_traits::One
            + std::ops::Add<Output = T>
            + PartialOrd
            + std::ops::Mul<Output = T>
            + scirs2_core::num_traits::Float,
    {
        let group = if let Some(name) = group_name {
            self.get_group(name)
                .ok_or_else(|| TensorError::invalid_argument(format!("Group '{name}' not found")))?
        } else {
            self.get_default_group()
                .ok_or_else(|| TensorError::invalid_argument("No default group set".to_string()))?
        };

        // Ring AllReduce is more efficient for large tensors: each device
        // transfers roughly 2 * (n - 1) / n of the tensor regardless of the
        // group size n, versus O(n) full-tensor copies for a naive
        // gather-to-one scheme. This is a simplified implementation - real
        // ring AllReduce would overlap communication with computation.

        let group_size = group.size();
        if group_size <= 1 {
            return Ok(tensor.clone());
        }

        // Simulate ring AllReduce pattern
        // In practice, this would involve:
        // 1. Divide tensor into chunks
        // 2. Scatter-reduce phase: each device reduces one chunk
        // 3. AllGather phase: collect all reduced chunks

        // For now, use the simple reduction
        self.simple_all_reduce(tensor, ReductionOp::Mean, group)
    }
}

impl Default for CollectiveManager {
    fn default() -> Self {
        Self::new()
    }
}

/// Global collective manager, stored behind an `Arc` so every caller shares
/// the same instance and groups registered via `create_process_group` remain
/// visible to later calls
static COLLECTIVE_MANAGER: Mutex<Option<Arc<Mutex<CollectiveManager>>>> = Mutex::new(None);

/// Initialize collective communication
pub fn init_collective() -> Result<()> {
    let mut manager = COLLECTIVE_MANAGER
        .lock()
        .expect("lock should not be poisoned");
    if manager.is_none() {
        *manager = Some(Arc::new(Mutex::new(CollectiveManager::new())));
    }
    Ok(())
}

/// Get the global collective manager
pub fn get_collective_manager() -> Result<Arc<Mutex<CollectiveManager>>> {
    let manager = COLLECTIVE_MANAGER
        .lock()
        .expect("lock should not be poisoned");
    // Hand out a clone of the shared Arc rather than a fresh manager, so
    // callers all observe the same groups.
    manager.clone().ok_or_else(|| {
        TensorError::invalid_argument(
            "Collective not initialized. Call init_collective() first".to_string(),
        )
    })
}

/// Create a communication group for collective operations
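///
/// End-to-end sketch of the module-level API (not a doctest):
///
/// ```ignore
/// create_process_group("dp".to_string(), vec![Device::Cpu])?;
/// let grads = Tensor::<f32>::ones(&[4]);
/// let reduced = all_reduce(&grads, ReductionOp::Mean, Some("dp"))?;
/// ```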
pub fn create_process_group(name: String, devices: Vec<Device>) -> Result<()> {
    init_collective()?;
    let manager = get_collective_manager()?;
    let mut mgr = manager.lock().expect("lock should not be poisoned");
    mgr.create_group(name, devices)
}

/// Perform AllReduce operation on a tensor
pub fn all_reduce<T>(
    tensor: &Tensor<T>,
    op: ReductionOp,
    group_name: Option<&str>,
) -> Result<Tensor<T>>
where
    T: Clone
        + Default
        + Send
        + Sync
        + 'static
        + bytemuck::Pod
        + scirs2_core::num_traits::Zero
        + scirs2_core::num_traits::One
        + std::ops::Add<Output = T>
        + PartialOrd
        + std::ops::Mul<Output = T>
        + scirs2_core::num_traits::Float,
{
    let manager = get_collective_manager()?;
    let mgr = manager.lock().expect("lock should not be poisoned");
    mgr.all_reduce(tensor, op, group_name)
}

/// Broadcast tensor from source device to all devices
pub fn broadcast<T>(
    tensor: &Tensor<T>,
    src_device: Device,
    group_name: Option<&str>,
) -> Result<Vec<Tensor<T>>>
where
    T: Clone
        + Default
        + Send
        + Sync
        + 'static
        + bytemuck::Pod
        + scirs2_core::num_traits::Zero
        + scirs2_core::num_traits::One,
{
    let manager = get_collective_manager()?;
    let mgr = manager.lock().expect("lock should not be poisoned");
    mgr.broadcast(tensor, src_device, group_name)
}

/// Gather tensors from all devices, returning one copy per device
pub fn all_gather<T>(tensor: &Tensor<T>, group_name: Option<&str>) -> Result<Vec<Tensor<T>>>
where
    T: Clone
        + Default
        + Send
        + Sync
        + 'static
        + bytemuck::Pod
        + scirs2_core::num_traits::Zero
        + scirs2_core::num_traits::One,
{
    let manager = get_collective_manager()?;
    let mgr = manager.lock().expect("lock should not be poisoned");
    mgr.all_gather(tensor, group_name)
}

/// Perform gradient AllReduce for distributed training
pub fn all_reduce_gradients<T>(
    gradients: &[Tensor<T>],
    group_name: Option<&str>,
) -> Result<Vec<Tensor<T>>>
where
    T: Clone
        + Default
        + Send
        + Sync
        + 'static
        + bytemuck::Pod
        + scirs2_core::num_traits::Zero
        + scirs2_core::num_traits::One
        + std::ops::Add<Output = T>
        + PartialOrd
        + std::ops::Mul<Output = T>
        + scirs2_core::num_traits::Float,
{
    let manager = get_collective_manager()?;
    let mgr = manager.lock().expect("lock should not be poisoned");
    mgr.all_reduce_gradients(gradients, group_name)
}

/// Synchronize parameters across all devices
pub fn sync_parameters<T>(
    parameters: &[Tensor<T>],
    src_device: Device,
    group_name: Option<&str>,
) -> Result<Vec<Vec<Tensor<T>>>>
where
    T: Clone
        + Default
        + Send
        + Sync
        + 'static
        + bytemuck::Pod
        + scirs2_core::num_traits::Zero
        + scirs2_core::num_traits::One,
{
    let manager = get_collective_manager()?;
    let mgr = manager.lock().expect("lock should not be poisoned");
    mgr.sync_parameters(parameters, src_device, group_name)
}

/// Ring AllReduce for efficient gradient aggregation
pub fn ring_all_reduce<T>(tensor: &Tensor<T>, group_name: Option<&str>) -> Result<Tensor<T>>
where
    T: Clone
        + Default
        + Send
        + Sync
        + 'static
        + bytemuck::Pod
        + scirs2_core::num_traits::Zero
        + scirs2_core::num_traits::One
        + std::ops::Add<Output = T>
        + PartialOrd
        + std::ops::Mul<Output = T>
        + scirs2_core::num_traits::Float,
{
    let manager = get_collective_manager()?;
    let mgr = manager.lock().expect("lock should not be poisoned");
    mgr.ring_all_reduce(tensor, group_name)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_communication_group_creation() {
        #[cfg(feature = "gpu")]
        let devices = vec![Device::Cpu, Device::Gpu(0), Device::Gpu(1)];
        #[cfg(not(feature = "gpu"))]
        let devices = vec![Device::Cpu];

        let group = CommunicationGroup::new(devices.clone());

        #[cfg(feature = "gpu")]
        {
            assert_eq!(group.size(), 3);
            assert_eq!(group.devices(), &devices);
            assert_eq!(group.rank(&Device::Cpu), Some(0));
            assert_eq!(group.rank(&Device::Gpu(0)), Some(1));
            assert_eq!(group.rank(&Device::Gpu(1)), Some(2));
        }
        #[cfg(not(feature = "gpu"))]
        {
            assert_eq!(group.size(), 1);
            assert_eq!(group.devices(), &devices);
            assert_eq!(group.rank(&Device::Cpu), Some(0));
        }
    }

    #[test]
    fn test_collective_manager() {
        let mut manager = CollectiveManager::new();
        #[cfg(feature = "gpu")]
        let devices = vec![Device::Cpu, Device::Gpu(0)];
        #[cfg(not(feature = "gpu"))]
        let devices = vec![Device::Cpu];

        manager
            .create_group("test_group".to_string(), devices)
            .expect("test: operation should succeed");

        let group = manager
            .get_group("test_group")
            .expect("test: get_group should succeed");
        #[cfg(feature = "gpu")]
        assert_eq!(group.size(), 2);
        #[cfg(not(feature = "gpu"))]
        assert_eq!(group.size(), 1);
    }

    #[test]
    fn test_broadcast_operation() {
        let mut manager = CollectiveManager::new();
        let devices = vec![Device::Cpu];
        manager
            .create_group("test_group".to_string(), devices)
            .expect("test: operation should succeed");

        let tensor = Tensor::<f32>::ones(&[2, 2]);
        let results = manager
            .broadcast(&tensor, Device::Cpu, Some("test_group"))
            .expect("test: operation should succeed");

        assert_eq!(results.len(), 1);
        assert_eq!(results[0].device(), &Device::Cpu);
    }
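
    #[test]
    fn test_all_reduce_single_device_identity() {
        // Sketch of the reduction path: with a single-device group,
        // `simple_all_reduce` short-circuits (group_size <= 1), so Mean
        // reduction should hand the tensor back on the same device.
        let mut manager = CollectiveManager::new();
        manager
            .create_group("solo".to_string(), vec![Device::Cpu])
            .expect("test: operation should succeed");

        let tensor = Tensor::<f32>::ones(&[2, 2]);
        let reduced = manager
            .all_reduce(&tensor, ReductionOp::Mean, Some("solo"))
            .expect("test: operation should succeed");
        assert_eq!(reduced.device(), &Device::Cpu);
    }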
}