//! Pluggable compute backends: a pure-CPU implementation plus an optional
//! GPU implementation behind the `scry-gpu` feature, both unified by the
//! [`ComputeBackend`] trait.

mod cpu;
#[cfg(feature = "scry-gpu")]
mod scry_gpu;
#[cfg(feature = "scry-gpu")]
pub use self::scry_gpu::ScryGpuBackend;
pub use cpu::CpuBackend;
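/// A 2-D row-major tensor with an explicit `(rows, cols)` shape, stored
/// either on the host as `f64` values or, behind the `scry-gpu` feature, on
/// the device as an `f32` buffer from the external `scry_gpu` crate.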
pub enum GpuTensor {
Cpu(Vec<f64>, usize, usize),
#[cfg(feature = "scry-gpu")]
Gpu(::scry_gpu::Buffer<f32>, usize, usize),
}
impl GpuTensor {
pub fn shape(&self) -> (usize, usize) {
match self {
Self::Cpu(_, r, c) => (*r, *c),
#[cfg(feature = "scry-gpu")]
Self::Gpu(_, r, c) => (*r, *c),
}
}
#[allow(dead_code)]
pub fn to_cpu_tensor(&self) -> Self {
let (rows, cols) = self.shape();
Self::Cpu(self.to_cpu(), rows, cols)
}
pub fn to_cpu(&self) -> Vec<f64> {
match self {
Self::Cpu(data, _, _) => data.clone(),
#[cfg(feature = "scry-gpu")]
            Self::Gpu(buf, _, _) => buf
                .download()
                // A failed download would otherwise yield an empty, wrongly
                // shaped tensor; fail loudly instead of returning corrupt data.
                .expect("failed to download GPU buffer to host")
                .iter()
                .map(|&v| f64::from(v))
                .collect(),
}
}
}
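/// Activation functions supported by the batched forward/backward paths.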
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GpuActivation {
Identity,
Relu,
Sigmoid,
Tanh,
}
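/// Borrowed parameters for one forward layer. `weights_t` is the transposed
/// weight matrix with shape `(in_size, out_size)`; `bias` holds `out_size`
/// values added to every row of the pre-activation.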
pub struct GpuForwardLayer<'a> {
pub weights_t: &'a GpuTensor,
pub bias: &'a GpuTensor,
pub activation: GpuActivation,
pub in_size: usize,
pub out_size: usize,
}
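/// Tensors cached for one layer during a training forward pass: the layer
/// input, the pre-activation `z`, the activation output `a`, and the batch
/// size they were computed with.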
pub struct GpuLayerCache {
pub input: GpuTensor,
pub z: GpuTensor,
pub a: GpuTensor,
pub batch: usize,
}
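/// Borrowed inputs for one backward layer. `weights_w` is the untransposed
/// weight matrix with shape `(out_size, in_size)`, used to propagate the
/// gradient to the previous layer; the `*_cache` tensors come from the
/// matching [`GpuLayerCache`].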
pub struct GpuBackwardLayer<'a> {
pub z_cache: &'a GpuTensor,
pub a_cache: &'a GpuTensor,
pub input_cache: &'a GpuTensor,
pub weights_w: &'a GpuTensor,
pub activation: GpuActivation,
pub batch: usize,
pub in_size: usize,
pub out_size: usize,
}
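/// Default forward pass over a stack of layers: each step computes
/// `a = act(input · weights_t + bias)` and, when `training` is true, caches
/// the layer input, pre-activation `z`, and activation `a` for the backward
/// pass. Returns the final activation and the collected caches.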
pub(crate) fn gpu_forward_batch_default<B: ComputeBackend + ?Sized>(
backend: &B,
input: &GpuTensor,
batch: usize,
layers: &[GpuForwardLayer<'_>],
training: bool,
) -> (GpuTensor, Vec<GpuLayerCache>) {
let mut caches = Vec::new();
let mut current = backend.gpu_copy(input);
for layer in layers {
let layer_input = current;
let z = backend.gpu_matmul(
&layer_input,
layer.weights_t,
batch,
layer.in_size,
layer.out_size,
);
let z = backend.gpu_bias_add(&z, layer.bias, batch, layer.out_size);
        // Identity is special-cased: `a == z`, so the cache stores copies of
        // `z` and no activation kernel runs.
        let a = match layer.activation {
GpuActivation::Identity => {
if training {
let z_cache = backend.gpu_copy(&z);
let a_cache = backend.gpu_copy(&z);
caches.push(GpuLayerCache {
input: layer_input,
z: z_cache,
a: a_cache,
batch,
});
}
z
}
act => {
let a = match act {
GpuActivation::Relu => backend.gpu_relu(&z),
GpuActivation::Sigmoid => backend.gpu_sigmoid(&z),
GpuActivation::Tanh => backend.gpu_tanh(&z),
GpuActivation::Identity => unreachable!(),
};
if training {
caches.push(GpuLayerCache {
input: layer_input,
z,
a: backend.gpu_copy(&a),
batch,
});
}
a
}
};
current = a;
}
(current, caches)
}
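/// Default backward pass over cached layers: applies the activation
/// derivative to obtain `delta`, reduces it into bias gradients, forms
/// weight gradients via `deltaᵀ · input` (both averaged over the batch),
/// and propagates the gradient through `weights_w`. Returns one `(dw, db)`
/// pair per layer, in the order the layers were given; for standard backprop
/// the caller supplies `layers` from the output layer back to the input layer.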
pub(crate) fn gpu_backward_batch_default<B: ComputeBackend + ?Sized>(
backend: &B,
grad_output: &GpuTensor,
layers: &[GpuBackwardLayer<'_>],
) -> Vec<(Vec<f64>, Vec<f64>)> {
let mut grads = Vec::with_capacity(layers.len());
let mut current_grad = backend.gpu_copy(grad_output);
for layer in layers {
let batch = layer.batch;
let delta = match layer.activation {
GpuActivation::Identity => current_grad,
            GpuActivation::Relu => backend.gpu_relu_backward(&current_grad, layer.z_cache),
            GpuActivation::Sigmoid => backend.gpu_sigmoid_backward(&current_grad, layer.a_cache),
            GpuActivation::Tanh => backend.gpu_tanh_backward(&current_grad, layer.a_cache),
};
let db_gpu = backend.gpu_reduce_cols(&delta, batch, layer.out_size, 1.0 / batch as f64);
let db = backend.gpu_download(&db_gpu);
let delta_t = backend.gpu_transpose(&delta, batch, layer.out_size);
let dw_gpu = backend.gpu_matmul(
&delta_t,
layer.input_cache,
layer.out_size,
batch,
layer.in_size,
);
let dw_gpu = backend.gpu_scale(&dw_gpu, 1.0 / batch as f64);
let dw = backend.gpu_download(&dw_gpu);
current_grad = backend.gpu_matmul(
&delta,
layer.weights_w,
batch,
layer.out_size,
layer.in_size,
);
grads.push((dw, db));
}
grads
}
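/// Abstraction over compute devices. Implementors supply the dense host
/// kernels (`matmul`, `xtx_xty`, `pairwise_distances_squared`, `name`);
/// every other method has a host-side default built on [`GpuTensor::Cpu`],
/// so a pure-CPU backend works without overriding anything. GPU backends
/// override the `gpu_*` methods to keep data on the device.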
#[allow(dead_code)]
pub trait ComputeBackend {
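    /// Row-major matrix product: `a` is `m x k`, `b` is `k x n`; returns `m x n`.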
fn matmul(&self, a: &[f64], b: &[f64], m: usize, k: usize, n: usize) -> Vec<f64>;
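    /// Normal-equation terms for least squares: returns `XᵀX` and `Xᵀy`,
    /// where column `j` of the design matrix `X` is `features[j]`.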
fn xtx_xty(&self, features: &[Vec<f64>], target: &[f64]) -> (Vec<f64>, Vec<f64>);
fn pairwise_distances_squared(
&self,
queries: &[f64],
train: &[f64],
n_q: usize,
n_t: usize,
dim: usize,
) -> Vec<f64>;
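    /// Like [`Self::xtx_xty`], but takes the design matrix as one
    /// feature-major slice: feature `j` occupies
    /// `data[j * n_samples..(j + 1) * n_samples]`.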
fn xtx_xty_contiguous(
&self,
data: &[f64],
target: &[f64],
n_samples: usize,
n_features: usize,
) -> (Vec<f64>, Vec<f64>) {
let features: Vec<Vec<f64>> = (0..n_features)
.map(|j| data[j * n_samples..(j + 1) * n_samples].to_vec())
.collect();
self.xtx_xty(&features, target)
}
fn name(&self) -> &'static str;
fn gpu_upload(&self, data: &[f64], rows: usize, cols: usize) -> GpuTensor {
GpuTensor::Cpu(data.to_vec(), rows, cols)
}
fn gpu_matmul(&self, a: &GpuTensor, b: &GpuTensor, m: usize, k: usize, n: usize) -> GpuTensor {
let a_data = a.to_cpu();
let b_data = b.to_cpu();
GpuTensor::Cpu(self.matmul(&a_data, &b_data, m, k, n), m, n)
}
fn gpu_bias_add(&self, z: &GpuTensor, bias: &GpuTensor, rows: usize, cols: usize) -> GpuTensor {
let mut data = z.to_cpu();
let b = bias.to_cpu();
for i in 0..rows {
for j in 0..cols {
data[i * cols + j] += b[j];
}
}
GpuTensor::Cpu(data, rows, cols)
}
fn gpu_relu(&self, x: &GpuTensor) -> GpuTensor {
let (rows, cols) = x.shape();
let mut data = x.to_cpu();
for v in &mut data {
if *v < 0.0 {
*v = 0.0;
}
}
GpuTensor::Cpu(data, rows, cols)
}
fn gpu_tanh(&self, x: &GpuTensor) -> GpuTensor {
let (rows, cols) = x.shape();
let mut data = x.to_cpu();
for v in &mut data {
*v = v.tanh();
}
GpuTensor::Cpu(data, rows, cols)
}
fn gpu_sigmoid(&self, x: &GpuTensor) -> GpuTensor {
let (rows, cols) = x.shape();
let mut data = x.to_cpu();
for v in &mut data {
*v = if *v >= 0.0 {
1.0 / (1.0 + (-*v).exp())
} else {
let ex = v.exp();
ex / (1.0 + ex)
};
}
GpuTensor::Cpu(data, rows, cols)
}
fn gpu_download(&self, t: &GpuTensor) -> Vec<f64> {
t.to_cpu()
}
fn gpu_copy(&self, x: &GpuTensor) -> GpuTensor {
let (rows, cols) = x.shape();
GpuTensor::Cpu(x.to_cpu(), rows, cols)
}
fn gpu_relu_backward(&self, grad: &GpuTensor, z: &GpuTensor) -> GpuTensor {
let (rows, cols) = grad.shape();
let g = grad.to_cpu();
let zv = z.to_cpu();
let out: Vec<f64> = g
.iter()
.zip(zv.iter())
.map(|(&gi, &zi)| if zi > 0.0 { gi } else { 0.0 })
.collect();
GpuTensor::Cpu(out, rows, cols)
}
fn gpu_sigmoid_backward(&self, grad: &GpuTensor, activated: &GpuTensor) -> GpuTensor {
let (rows, cols) = grad.shape();
let g = grad.to_cpu();
let a = activated.to_cpu();
let out: Vec<f64> = g
.iter()
.zip(a.iter())
.map(|(&gi, &ai)| gi * ai * (1.0 - ai))
.collect();
GpuTensor::Cpu(out, rows, cols)
}
fn gpu_tanh_backward(&self, grad: &GpuTensor, activated: &GpuTensor) -> GpuTensor {
let (rows, cols) = grad.shape();
let g = grad.to_cpu();
let a = activated.to_cpu();
let out: Vec<f64> = g
.iter()
.zip(a.iter())
.map(|(&gi, &ai)| gi * (1.0 - ai * ai))
.collect();
GpuTensor::Cpu(out, rows, cols)
}
fn gpu_transpose(&self, m: &GpuTensor, rows: usize, cols: usize) -> GpuTensor {
let data = m.to_cpu();
let mut t = vec![0.0; rows * cols];
for i in 0..rows {
for j in 0..cols {
t[j * rows + i] = data[i * cols + j];
}
}
GpuTensor::Cpu(t, cols, rows)
}
fn gpu_scale(&self, x: &GpuTensor, alpha: f64) -> GpuTensor {
let (rows, cols) = x.shape();
let out: Vec<f64> = x.to_cpu().iter().map(|&v| v * alpha).collect();
GpuTensor::Cpu(out, rows, cols)
}
fn gpu_reduce_cols(&self, x: &GpuTensor, rows: usize, cols: usize, scale: f64) -> GpuTensor {
let data = x.to_cpu();
let mut out = vec![0.0; cols];
for i in 0..rows {
for j in 0..cols {
out[j] += data[i * cols + j];
}
}
for v in &mut out {
*v *= scale;
}
GpuTensor::Cpu(out, 1, cols)
}
fn gpu_forward_batch(
&self,
input: &GpuTensor,
batch: usize,
layers: &[GpuForwardLayer<'_>],
training: bool,
) -> (GpuTensor, Vec<GpuLayerCache>) {
gpu_forward_batch_default(self, input, batch, layers, training)
}
fn gpu_backward_batch(
&self,
grad_output: &GpuTensor,
layers: &[GpuBackwardLayer<'_>],
) -> Vec<(Vec<f64>, Vec<f64>)> {
gpu_backward_batch_default(self, grad_output, layers)
}
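    /// Builds per-feature histograms for gradient boosting: for each sample in
    /// `sample_indices`, accumulates `(gradient sum, hessian sum, count)` into
    /// the sample's bin for every feature. Bins `>= n_bins` are ignored.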
fn build_histograms(
&self,
binned: &[Vec<u8>],
gradients: &[f64],
hessians: &[f64],
sample_indices: &[usize],
n_features: usize,
n_bins: usize,
) -> Vec<Vec<(f64, f64, f64)>> {
let mut histograms = vec![vec![(0.0_f64, 0.0_f64, 0.0_f64); n_bins]; n_features];
for &idx in sample_indices {
let g = gradients[idx];
let h = hessians[idx];
for f in 0..n_features {
let bin = binned[f][idx] as usize;
if bin < n_bins {
histograms[f][bin].0 += g;
histograms[f][bin].1 += h;
histograms[f][bin].2 += 1.0;
}
}
}
histograms
}
}
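/// Returns the best available backend: the GPU backend when the `scry-gpu`
/// feature is enabled and device initialization succeeds, otherwise the CPU
/// backend.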
pub fn auto() -> Box<dyn ComputeBackend> {
#[cfg(feature = "scry-gpu")]
{
        // Prefer the GPU backend when it initializes; fall back to CPU otherwise.
        if let Ok(gpu) = ScryGpuBackend::new() {
            return Box::new(gpu);
        }
}
Box::new(CpuBackend)
}
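/// Constructs the CPU backend directly, bypassing GPU detection.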
#[allow(dead_code)]
pub fn cpu() -> CpuBackend {
CpuBackend
}
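
// A minimal sketch of how the trait's default host-side path composes,
// written as a test against a hypothetical `NaiveBackend` defined below.
// `NaiveBackend` and this module are illustrative only, not part of the
// crate's real backends; only `matmul` and `name` are implemented, since the
// other required kernels are not exercised here.
#[cfg(test)]
mod default_path_sketch {
    use super::*;

    struct NaiveBackend;

    impl ComputeBackend for NaiveBackend {
        fn matmul(&self, a: &[f64], b: &[f64], m: usize, k: usize, n: usize) -> Vec<f64> {
            // Textbook row-major triple loop.
            let mut out = vec![0.0; m * n];
            for i in 0..m {
                for p in 0..k {
                    let aip = a[i * k + p];
                    for j in 0..n {
                        out[i * n + j] += aip * b[p * n + j];
                    }
                }
            }
            out
        }

        fn xtx_xty(&self, _features: &[Vec<f64>], _target: &[f64]) -> (Vec<f64>, Vec<f64>) {
            unimplemented!("not needed for this sketch")
        }

        fn pairwise_distances_squared(
            &self,
            _queries: &[f64],
            _train: &[f64],
            _n_q: usize,
            _n_t: usize,
            _dim: usize,
        ) -> Vec<f64> {
            unimplemented!("not needed for this sketch")
        }

        fn name(&self) -> &'static str {
            "naive-sketch"
        }
    }

    #[test]
    fn default_forward_matches_hand_computation() {
        let backend = NaiveBackend;
        // One identity layer: batch = 2, in_size = 3, out_size = 2.
        let input = backend.gpu_upload(&[1.0, 2.0, 3.0, 4.0, 5.0, 6.0], 2, 3);
        // weights_t has shape (in_size, out_size) = (3, 2).
        let w_t = backend.gpu_upload(&[1.0, 0.0, 0.0, 1.0, 1.0, 1.0], 3, 2);
        let bias = backend.gpu_upload(&[1.0, -1.0], 1, 2);
        let layers = [GpuForwardLayer {
            weights_t: &w_t,
            bias: &bias,
            activation: GpuActivation::Identity,
            in_size: 3,
            out_size: 2,
        }];

        // Training mode caches one entry per layer.
        let (out, caches) = backend.gpu_forward_batch(&input, 2, &layers, true);
        assert_eq!(out.shape(), (2, 2));
        assert_eq!(caches.len(), 1);
        // Row 0: [1+3, 2+3] + [1, -1] = [5, 4]; row 1: [4+6, 5+6] + [1, -1] = [11, 10].
        assert_eq!(backend.gpu_download(&out), vec![5.0, 4.0, 11.0, 10.0]);

        // Inference mode skips the caches entirely.
        let (_, no_caches) = backend.gpu_forward_batch(&input, 2, &layers, false);
        assert!(no_caches.is_empty());
    }
}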