1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
//! # ulib: General library for universal computing.
//!
//! This library implements the traits and structs needed to hold vectors on the host and on different kinds of devices.
//! It is intended to be used with the ucc builder, which generates wrapper bindings on top of this library.
//!
//! CUDA support must be manually enabled using the
//! feature `cuda`.

#[allow(unused_imports)]
use lazy_static::lazy_static;

// For our derive macros to refer to cust even when cust is
// not listed as a dependency in our dependent crates.
#[cfg(feature = "cuda")]
pub extern crate cust;

pub use ulib_derive::UniversalCopy;

#[cfg(feature = "cuda")]
use cust::memory::{ DeviceCopy, DeviceSlice };

/// Maximum number of CUDA devices handled by this library.
///
/// CUDA device ids must be strictly below this value, and the list of
/// detected GPUs is truncated (with a warning) to this length.
#[cfg(feature = "cuda")]
pub const MAX_NUM_CUDA_DEVICES: usize = 4;
/// Total number of device ids in a CUDA build: one id for the CPU
/// plus [`MAX_NUM_CUDA_DEVICES`] ids for the GPUs.
#[cfg(feature = "cuda")]
pub const MAX_DEVICES: usize = MAX_NUM_CUDA_DEVICES + 1;

/// Total number of device ids in a cpu-only build: the CPU is the
/// only device.
#[cfg(not(feature = "cuda"))]
pub const MAX_DEVICES: usize = 1;

/// All supported device types.
///
/// `CPU` always exists. `CUDA` is only compiled in with the `cuda`
/// feature and carries the index of the GPU it refers to.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum Device {
    CPU,
    #[cfg(feature = "cuda")]
    CUDA(u8 /* device id */)
}

/// A generic device context
///
/// Values of this type are produced by [`Device::get_context`].
/// For the CPU it holds nothing; for a CUDA device it owns the
/// `cust` context that was created for that device.
pub struct DeviceContext {
    // Never read in this file; it is only stored inside the struct.
    #[cfg(feature = "cuda")]
    #[allow(dead_code)]
    cuda_context: Option<cust::context::Context>,
}

impl Device {
    /// Converts the device into its numeric id.
    ///
    /// The CPU is id `0`; `CUDA(c)` is id `c + 1`.
    ///
    /// # Panics
    ///
    /// Panics when the CUDA index is not below `MAX_NUM_CUDA_DEVICES`.
    #[inline]
    fn to_id(self) -> usize {
        match self {
            Device::CPU => 0,
            #[cfg(feature = "cuda")]
            Device::CUDA(gpu) => {
                let gpu = usize::from(gpu);
                assert!(gpu < MAX_NUM_CUDA_DEVICES,
                        "invalid cuda device id");
                gpu + 1
            }
        }
    }

    /// Converts a numeric id back into a device (inverse of `to_id`).
    ///
    /// # Panics
    ///
    /// Panics when `id` does not correspond to any device.
    #[inline]
    fn from_id(id: usize) -> Device {
        match id {
            0 => Device::CPU,
            #[cfg(feature = "cuda")]
            gpu @ 1..=MAX_NUM_CUDA_DEVICES => Device::CUDA((gpu - 1) as u8),
            other => panic!("device id {} is invalid.", other)
        }
    }

    /// Builds a [`DeviceContext`] for this device.
    ///
    /// For the CPU the context is empty. For a CUDA device a new
    /// `cust` context is created on the matching entry of the
    /// device table.
    ///
    /// # Panics
    ///
    /// Panics when the CUDA index is outside the device table or when
    /// the context cannot be created.
    #[inline]
    pub fn get_context(self) -> DeviceContext {
        #[cfg(feature = "cuda")]
        let cuda_context = match self {
            Device::CPU => None,
            Device::CUDA(gpu) => {
                let raw_device = CUDA_DEVICES[usize::from(gpu)].0;
                Some(cust::context::Context::new(raw_device).unwrap())
            }
        };
        DeviceContext {
            #[cfg(feature = "cuda")]
            cuda_context
        }
    }

    /// Waits for the device to finish its pending work.
    ///
    /// This does nothing for the CPU. For a CUDA device a context is
    /// created on that device and the current context is synchronized.
    ///
    /// # Panics
    ///
    /// Panics when the CUDA index is outside the device table, or when
    /// context creation or synchronization fails.
    #[inline]
    pub fn synchronize(self) {
        match self {
            Device::CPU => {}
            #[cfg(feature = "cuda")]
            Device::CUDA(gpu) => {
                let raw_device = CUDA_DEVICES[usize::from(gpu)].0;
                // The binding keeps the context alive until the
                // synchronization call below has returned.
                let _context = cust::context::Context::new(raw_device).unwrap();
                cust::context::CurrentContext::synchronize().unwrap();
            }
        }
    }
}

/// Marker trait for universally bit-copyable elements.
///
/// In a cuda build every type that is both `Copy` and
/// `cust::memory::DeviceCopy` implements this trait automatically
/// through the blanket impl that follows the trait.
/// The derive macro of the same name can be used like this:
///
/// ```
/// use ulib::UniversalCopy;
///
/// #[derive(UniversalCopy, Clone)]
/// struct Test {
///     a: i32,
///     b: usize
/// }
/// ```
#[cfg(feature = "cuda")]
pub trait UniversalCopy: Copy + DeviceCopy {}

/// Blanket impl: anything `Copy + DeviceCopy` is `UniversalCopy`.
#[cfg(feature = "cuda")]
impl<T> UniversalCopy for T
where
    T: Copy + DeviceCopy,
{
}

/// Marker trait for universally bit-copyable elements.
///
/// In a cpu-only build every `Copy` type implements this trait
/// automatically through the blanket impl that follows the trait.
/// The derive macro of the same name can be used like this:
///
/// ```
/// use ulib::UniversalCopy;
///
/// #[derive(UniversalCopy, Clone)]
/// struct Test {
///     a: i32,
///     b: usize
/// }
/// ```
#[cfg(not(feature = "cuda"))]
pub trait UniversalCopy: Copy {}

/// Blanket impl: anything `Copy` is `UniversalCopy`.
#[cfg(not(feature = "cuda"))]
impl<T> UniversalCopy for T
where
    T: Copy,
{
}

#[cfg(feature = "cuda")]
lazy_static! {
    /// vector of all devices and their primary contexts.
    ///
    /// the contexts follow the CUDA Driver API, not the runtime API.
    /// all contexts are kept here so they are never deallocated.
    ///
    /// at most `MAX_NUM_CUDA_DEVICES` devices are kept. when more GPUs
    /// are present, a warning is logged and the extra ones are ignored.
    ///
    /// initialization panics if the driver cannot be initialized, if
    /// the devices cannot be enumerated, or if a context cannot be
    /// created on one of the kept devices.
    static ref CUDA_DEVICES: Vec<(cust::device::Device, cust::context::Context)> = {
        // initialize the CUDA driver here and only here.
        cust::init(cust::CudaFlags::empty()).unwrap();
        let mut devices = cust::device::Device::devices().unwrap()
            .collect::<Result<Vec<_>, _>>()
            .unwrap();
        // truncate BEFORE creating any context, so that no context is
        // created on a device only to be dropped again right away.
        if devices.len() > MAX_NUM_CUDA_DEVICES {
            clilog::warn!(ULIB_CUDA_TRUNC,
                          "the number of available cuda gpus {} \
                           exceed max supported {}, truncated.",
                          devices.len(), MAX_NUM_CUDA_DEVICES);
            devices.truncate(MAX_NUM_CUDA_DEVICES);
        }
        devices.into_iter()
            .map(|d| (d, cust::context::Context::new(d).unwrap()))
            .collect::<Vec<_>>()
    };
    
    /// the number of CUDA devices.
    ///
    /// this is the length of the (possibly truncated) device table,
    /// so it never exceeds `MAX_NUM_CUDA_DEVICES`.
    pub static ref NUM_CUDA_DEVICES: usize = CUDA_DEVICES.len();
}

/// A trait for objects that can be borrowed as an immutable CUDA slice.
///
/// Only available when the crate is built with the `cuda` feature.
#[cfg(feature = "cuda")]
pub trait AsCUDASlice<T: UniversalCopy> {
    /// Get an immutable CUDA slice on a specific GPU.
    ///
    /// `cuda_device` selects the GPU. Presumably it has to be a
    /// `Device::CUDA` value -- TODO confirm against the implementors.
    ///
    /// There is no borrow checker taking place, so nothing
    /// prevents one from using it mutably. Just don't do it.
    /// It would not only lead to data races, but also safety
    /// issues due to the possible Vec-like reallocation.
    ///
    /// If one needs to update the content, use [`AsCUDASliceMut`]
    /// instead as it tracks the dirty flags correctly.
    fn as_cuda_slice(&self, cuda_device: Device) -> DeviceSlice<T>;
}

/// A trait for objects that can be borrowed as a mutable CUDA slice.
///
/// Only available when the crate is built with the `cuda` feature.
#[cfg(feature = "cuda")]
pub trait AsCUDASliceMut<T: UniversalCopy> {
    /// Get a mutable CUDA slice on a specific GPU.
    ///
    /// `cuda_device` selects the GPU. Presumably it has to be a
    /// `Device::CUDA` value -- TODO confirm against the implementors.
    fn as_cuda_slice_mut(&mut self, cuda_device: Device) ->
        DeviceSlice<T>;
}

/// A trait to get a raw pointer for any device.
///
/// Unlike the CUDA slice traits, this trait is available in every
/// build, including cpu-only ones.
pub trait AsUPtr<T: UniversalCopy> {
    /// Get an immutable raw pointer.
    ///
    /// `device` selects the device the pointer is requested for.
    /// NOTE(review): presumably the returned pointer is only meaningful
    /// on that device -- confirm against the implementors.
    fn as_uptr(&self, device: Device) -> *const T;
}

/// A trait to get a mutable raw pointer for any device.
///
/// Unlike the CUDA slice traits, this trait is available in every
/// build, including cpu-only ones.
pub trait AsUPtrMut<T: UniversalCopy> {
    /// Get a mutable raw pointer.
    ///
    /// `device` selects the device the pointer is requested for.
    /// NOTE(review): presumably the returned pointer is only meaningful
    /// on that device -- confirm against the implementors.
    fn as_mut_uptr(&mut self, device: Device) -> *mut T;
}

mod uvec;
pub use uvec::UVec;