1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
use custos::{CLDevice, CPU, opencl::construct_buffer, VecRead};

use crate::Matrix;

/// Runs `f` on a [`CPU`] device against a matrix that was created with an OpenCL device.
///
/// The way `matrix` reaches `f`, and the way the result is turned back into an
/// OpenCL matrix, depends on the device and on the `safe` cargo feature:
///
/// - Unified memory and `safe` disabled: `f` receives a clone of `matrix`
///   and the returned matrix is wrapped via `construct_buffer`.
/// - Unified memory and `safe` enabled: `f` receives a clone of `matrix`
///   and the result is converted with `Matrix::from((device, result))`.
/// - No unified memory: the OpenCL buffer is read back with `device.read`,
///   a CPU matrix is built from that data, and the result is converted with
///   `Matrix::from((device, result))`.
///
/// `f` is invoked exactly once, so any `FnOnce` closure is accepted
/// (every `Fn` / `FnMut` closure qualifies as well).
///
/// # Errors
/// Returns the error reported by `construct_buffer` on the unified memory
/// path with `safe` disabled. The other paths always return `Ok`.
///
/// # Example
/// ```
/// use custos::{CLDevice, VecRead, ClearBuf};
/// use custos_math::{Matrix, opencl::cpu_exec};
/// 
/// fn main() -> custos::Result<()> {
///     let device = CLDevice::new(0)?;
///     let a = Matrix::<i32>::from((&device, 2, 2, [1, 2, 3, 4]));
///     let res = cpu_exec(&device, &a, |cpu, mut x| {cpu.clear(&mut x); x})?;
///     assert_eq!(device.read(&res), vec![0, 0, 0, 0]);
///     Ok(())
/// }
/// ```
pub fn cpu_exec<T, F>(device: &CLDevice, matrix: &Matrix<T>, f: F) -> custos::Result<Matrix<T>>
where
    F: FnOnce(&CPU, Matrix<T>) -> Matrix<T>,
    T: Copy + Default,
{
    let cpu = CPU::new();
    // Queried once; both branches below depend on it.
    let unified_mem = device.unified_mem();

    if unified_mem && !cfg!(feature = "safe") {
        // NOTE(review): presumably the clone still refers to the host pointer of
        // `matrix`, so no data is copied here - confirm against `Matrix::clone`.
        let no_drop = f(&cpu, matrix.clone());
        // Grab the dimensions before the buffer is handed over.
        let no_drop_dims = no_drop.dims();
        // convert host ptr / CPU matrix into a host ptr + OpenCL ptr matrix
        return construct_buffer(device, &cpu, no_drop.to_buf())
            .map(|buf| (buf, no_drop_dims).into());
    }

    let x = if unified_mem {
        matrix.clone()
    } else {
        // convert an OpenCL buffer to a cpu buffer
        Matrix::from((&cpu, matrix.dims(), device.read(matrix.as_buf())))
    };

    Ok(Matrix::from((device, f(&cpu, x))))
}

/// Runs the binary operation `f` on a [`CPU`] device against two matrices that
/// were created with an OpenCL device.
///
/// The way the operands reach `f`, and the way the result is turned back into
/// an OpenCL matrix, depends on the device and on the `safe` cargo feature:
///
/// - Unified memory and `safe` disabled: `f` receives `lhs` and `rhs`
///   as passed in, and the returned matrix is wrapped via `construct_buffer`.
/// - Unified memory and `safe` enabled: `f` receives clones of `lhs` and
///   `rhs`, and the result is converted with `Matrix::from((device, result))`.
/// - No unified memory: both OpenCL buffers are read back with `device.read`,
///   CPU matrices are built from that data, and the result is converted with
///   `Matrix::from((device, result))`.
///
/// `f` is invoked exactly once, so any `FnOnce` closure is accepted
/// (every `Fn` / `FnMut` closure qualifies as well).
///
/// # Errors
/// Returns the error reported by `construct_buffer` on the unified memory
/// path with `safe` disabled. The other paths always return `Ok`.
pub fn cpu_exec_lhs_rhs<T, F>(
    device: &CLDevice,
    lhs: &Matrix<T>,
    rhs: &Matrix<T>,
    f: F,
) -> custos::Result<Matrix<T>>
where
    F: FnOnce(&CPU, &Matrix<T>, &Matrix<T>) -> Matrix<T>,
    T: Copy + Default,
{
    let cpu = CPU::new();
    // Queried once; both branches below depend on it.
    let unified_mem = device.unified_mem();

    if unified_mem && !cfg!(feature = "safe") {
        let no_drop = f(&cpu, lhs, rhs);
        // Grab the dimensions before the buffer is handed over.
        let no_drop_dims = no_drop.dims();
        // convert host ptr / CPU matrix into a host ptr + OpenCL ptr matrix
        return construct_buffer(device, &cpu, no_drop.to_buf())
            .map(|buf| (buf, no_drop_dims).into());
    }

    let (lhs, rhs) = if unified_mem {
        (lhs.clone(), rhs.clone())
    } else {
        // convert an OpenCL buffer to a cpu buffer
        (
            Matrix::from((&cpu, lhs.dims(), device.read(lhs.as_buf()))),
            Matrix::from((&cpu, rhs.dims(), device.read(rhs.as_buf()))),
        )
    };

    Ok(Matrix::from((device, f(&cpu, &lhs, &rhs))))
}

/// Runs `f` on a [`CPU`] device against a matrix that was created with an
/// OpenCL device and returns the scalar that `f` produces.
///
/// If the device reports unified memory, `f` receives a clone of `matrix`.
/// Otherwise the OpenCL buffer is read back with `device.read` and `f`
/// receives a CPU matrix built from that data.
///
/// `f` is invoked exactly once, so any `FnOnce` closure is accepted
/// (every `Fn` / `FnMut` closure qualifies as well).
pub fn cpu_exec_scalar<T, F>(device: &CLDevice, matrix: &Matrix<T>, f: F) -> T
where
    F: FnOnce(&CPU, Matrix<T>) -> T,
    T: Copy + Default,
{
    let cpu = CPU::new();

    let x = if device.unified_mem() {
        matrix.clone()
    } else {
        // convert an OpenCL buffer to a cpu buffer
        Matrix::from((&cpu, matrix.dims(), device.read(matrix.as_buf())))
    };

    f(&cpu, x)
}