use super::{GpuBackend, GpuError, TaylorBatchResult};
use crate::stde::EstimatorResult;
/// Estimates the Laplacian (trace of the Hessian) of a scalar-output tape
/// using one 2nd-order Taylor forward pass per probe direction on the GPU.
///
/// For each direction `v`, the batch kernel returns the 2nd-order Taylor
/// coefficient `c2`; `2 * c2` is the directional sample `vᵀHv`, and the
/// mean over all directions is the trace estimate (unbiased when the
/// directions have identity covariance — not checked here).
///
/// # Errors
/// Returns `GpuError::Other` when the tape is not scalar-output, when
/// `directions` is empty, when any direction's length differs from
/// `x.len()`, or when the batch size does not fit in a `u32`; backend
/// dispatch errors are propagated via `?`.
pub fn laplacian_gpu<B: GpuBackend>(
    backend: &B,
    tape: &B::TapeBuffers,
    x: &[f32],
    directions: &[&[f32]],
) -> Result<EstimatorResult<f32>, GpuError> {
    if backend.num_outputs(tape) != 1 {
        return Err(GpuError::Other(format!(
            "laplacian_gpu requires a scalar-output tape; got {} outputs",
            backend.num_outputs(tape)
        )));
    }
    let n = x.len();
    let s = directions.len();
    if s == 0 {
        return Err(GpuError::Other("no directions provided".into()));
    }
    // Validate caller input up front and report through the Result instead
    // of panicking: bad input is recoverable, and this API already returns
    // GpuError for its other validation failures.
    for (i, dir) in directions.iter().enumerate() {
        if dir.len() != n {
            return Err(GpuError::Other(format!(
                "direction {} has length {}, expected {} to match x",
                i,
                dir.len(),
                n
            )));
        }
    }
    if s > u32::MAX as usize {
        return Err(GpuError::Other(
            "too many directions for GPU dispatch".into(),
        ));
    }
    // Flatten the batch: every lane shares the same primal point x and
    // carries one direction as its tangent seed.
    let mut primals = Vec::with_capacity(s * n);
    let mut seeds = Vec::with_capacity(s * n);
    for dir in directions {
        primals.extend_from_slice(x);
        seeds.extend_from_slice(dir);
    }
    let result = backend.taylor_forward_2nd_batch(tape, &primals, &seeds, s as u32)?;
    Ok(aggregate_laplacian(&result, s))
}
/// Folds the first `s` batch coefficients into a Laplacian estimate with
/// streaming mean/variance statistics (Welford's single-pass algorithm).
fn aggregate_laplacian(result: &TaylorBatchResult<f32>, s: usize) -> EstimatorResult<f32> {
    // Primal output at x; every lane evaluates the same point, so lane 0
    // is representative.
    let value = result.values[0];

    // Welford's online update: numerically stable running mean plus the
    // running sum of squared deviations (m2), one pass over the samples.
    let mut running_mean = 0.0f32;
    let mut sum_sq_dev = 0.0f32;
    for (idx, &c2) in result.c2s[..s].iter().enumerate() {
        // 2 * c2 recovers the second directional derivative vᵀHv
        // (assumes c2 is the 2nd-order Taylor coefficient, i.e. f''/2
        // along the seeded direction — consistent with the batch kernel's
        // name; confirm against the backend contract).
        let sample = 2.0 * c2;
        let count = (idx + 1) as f32;
        let pre_delta = sample - running_mean;
        running_mean += pre_delta / count;
        let post_delta = sample - running_mean;
        sum_sq_dev += pre_delta * post_delta;
    }

    // Bessel-corrected variance of the samples; degenerate (single-sample)
    // batches report zero spread.
    let sample_variance = if s > 1 { sum_sq_dev / (s - 1) as f32 } else { 0.0 };
    let standard_error = if s > 1 {
        (sample_variance / s as f32).sqrt()
    } else {
        0.0
    };

    EstimatorResult {
        value,
        estimate: running_mean,
        sample_variance,
        standard_error,
        num_samples: s,
    }
}
pub fn hessian_diagonal_gpu<B: GpuBackend>(
backend: &B,
tape: &B::TapeBuffers,
x: &[f32],
) -> Result<(f32, Vec<f32>), GpuError> {
let n = x.len();
let mut primals = Vec::with_capacity(n * n);
let mut seeds = vec![0.0f32; n * n];
for j in 0..n {
primals.extend_from_slice(x);
seeds[j * n + j] = 1.0;
}
assert!(n <= u32::MAX as usize, "too many inputs for GPU dispatch");
let result = backend.taylor_forward_2nd_batch(tape, &primals, &seeds, n as u32)?;
let value = result.values[0];
let diag: Vec<f32> = result.c2s.iter().map(|&c2| 2.0 * c2).collect();
Ok((value, diag))
}
/// Estimates the Laplacian with a control variate: `control_diagonal` is a
/// known approximation `D` of the Hessian diagonal. Each stochastic sample
/// is `vᵀHv − vᵀDv`, so only the residual trace `tr(H − D)` is estimated
/// from the random directions while `tr(D)` is added back exactly — this
/// reduces variance when `D` approximates the true diagonal well, without
/// biasing the estimate.
///
/// # Errors
/// Returns `GpuError::Other` when the tape is not scalar-output, when
/// `directions` is empty, when `control_diagonal` or any direction has a
/// length different from `x.len()`, or when the batch size does not fit in
/// a `u32`; backend dispatch errors are propagated via `?`.
pub fn laplacian_with_control_gpu<B: GpuBackend>(
    backend: &B,
    tape: &B::TapeBuffers,
    x: &[f32],
    directions: &[&[f32]],
    control_diagonal: &[f32],
) -> Result<EstimatorResult<f32>, GpuError> {
    if backend.num_outputs(tape) != 1 {
        return Err(GpuError::Other(format!(
            "laplacian_with_control_gpu requires a scalar-output tape; got {} outputs",
            backend.num_outputs(tape)
        )));
    }
    let n = x.len();
    let s = directions.len();
    // Report invalid caller input through the Result instead of panicking:
    // this API already returns GpuError for its other validation failures.
    if control_diagonal.len() != n {
        return Err(GpuError::Other(format!(
            "control diagonal has length {}, expected {} to match x",
            control_diagonal.len(),
            n
        )));
    }
    if s == 0 {
        return Err(GpuError::Other("no directions provided".into()));
    }
    for (i, dir) in directions.iter().enumerate() {
        if dir.len() != n {
            return Err(GpuError::Other(format!(
                "direction {} has length {}, expected {} to match x",
                i,
                dir.len(),
                n
            )));
        }
    }
    if s > u32::MAX as usize {
        return Err(GpuError::Other(
            "too many directions for GPU dispatch".into(),
        ));
    }
    // Flatten the batch: every lane shares the primal point x and carries
    // one direction as its tangent seed.
    let mut primals = Vec::with_capacity(s * n);
    let mut seeds = Vec::with_capacity(s * n);
    for dir in directions {
        primals.extend_from_slice(x);
        seeds.extend_from_slice(dir);
    }
    let result = backend.taylor_forward_2nd_batch(tape, &primals, &seeds, s as u32)?;
    // Exact contribution of the control variate's trace.
    let trace_diag: f32 = control_diagonal.iter().sum();
    let value = result.values[0];
    // Welford's single-pass mean/variance over the residual samples.
    let mut mean = 0.0f32;
    let mut m2 = 0.0f32;
    for (i, dir) in directions.iter().enumerate() {
        // 2 * c2 recovers the directional curvature vᵀHv for lane i.
        let vhv = 2.0 * result.c2s[i];
        // vᵀDv for the diagonal control matrix D.
        let v_diag_v: f32 = dir
            .iter()
            .zip(control_diagonal.iter())
            .map(|(&vj, &dj)| dj * vj * vj)
            .sum();
        let sample = vhv - v_diag_v;
        let count = (i + 1) as f32;
        let delta = sample - mean;
        mean += delta / count;
        let delta2 = sample - mean;
        m2 += delta * delta2;
    }
    // Recombine: exact tr(D) plus the stochastic estimate of tr(H − D).
    let estimate = trace_diag + mean;
    let sample_variance = if s > 1 { m2 / (s - 1) as f32 } else { 0.0 };
    let standard_error = if s > 1 {
        (sample_variance / s as f32).sqrt()
    } else {
        0.0
    };
    Ok(EstimatorResult {
        value,
        estimate,
        sample_variance,
        standard_error,
        num_samples: s,
    })
}
/// Deprecated CUDA-specific entry point kept for backward compatibility;
/// forwards directly to the backend-generic [`laplacian_gpu`].
#[cfg(feature = "gpu-cuda")]
#[deprecated(
    since = "0.5.0",
    note = "use laplacian_gpu() which is now generic over GpuBackend"
)]
pub fn laplacian_gpu_cuda(
    backend: &super::CudaContext,
    tape: &super::CudaTapeBuffers,
    x: &[f32],
    directions: &[&[f32]],
) -> Result<EstimatorResult<f32>, GpuError> {
    // Pure delegation — identical semantics to the generic function.
    laplacian_gpu(backend, tape, x, directions)
}
/// Deprecated CUDA-specific entry point kept for backward compatibility;
/// forwards directly to the backend-generic [`hessian_diagonal_gpu`].
#[cfg(feature = "gpu-cuda")]
#[deprecated(
    since = "0.5.0",
    note = "use hessian_diagonal_gpu() which is now generic over GpuBackend"
)]
pub fn hessian_diagonal_gpu_cuda(
    backend: &super::CudaContext,
    tape: &super::CudaTapeBuffers,
    x: &[f32],
) -> Result<(f32, Vec<f32>), GpuError> {
    // Pure delegation — identical semantics to the generic function.
    hessian_diagonal_gpu(backend, tape, x)
}