use std::sync::Arc;
use std::time::Instant;
use xlog_core::{Result, XlogError};
use super::{CudaKernelProvider, PtxLoadProfile};
use crate::kernel_manifest_data::KERNEL_MODULES;
use crate::CudaDevice;
impl CudaKernelProvider {
    /// Loads every module listed in `KERNEL_MODULES` onto `device`.
    ///
    /// For each entry the module file is resolved through
    /// `super::load_module_from_file` for the detected compute capability and
    /// then loaded on the device under `spec.module_name` with the kernel
    /// names in `spec.kernels`.
    ///
    /// When `profiling` is `true`, each load is followed by a device
    /// synchronize and the wall-clock time of the whole step (file
    /// resolution, load, and synchronize) is recorded per module and summed
    /// into `total_sec`. Each module is also counted as either
    /// `cubin_loaded` or `ptx_fallback` according to the `is_cubin` flag
    /// returned by `load_module_from_file`.
    ///
    /// # Returns
    ///
    /// `Ok(Some(profile))` when `profiling` is `true`, `Ok(None)` otherwise.
    ///
    /// # Errors
    ///
    /// Propagates any error from `super::detect_compute_capability` or
    /// `super::load_module_from_file`. A failed module load or a failed
    /// post-load synchronize is reported as `XlogError::Kernel`. Loading
    /// stops at the first error.
    pub(crate) fn load_all_kernel_modules(
        device: &Arc<CudaDevice>,
        profiling: bool,
    ) -> Result<Option<PtxLoadProfile>> {
        let cc = super::detect_compute_capability(device)?;
        let mut profile = PtxLoadProfile::default();

        for spec in KERNEL_MODULES {
            // The timer exists only in profiling mode; its presence is the
            // single source of truth for "are we measuring this module?".
            let started = profiling.then(Instant::now);

            let (path, is_cubin) = super::load_module_from_file(spec.cu_name, cc)?;
            device
                .inner()
                .load_file(&path, spec.module_name, spec.kernels)
                .map_err(|e| {
                    XlogError::Kernel(format!("Failed to load {} module: {}", spec.cu_name, e))
                })?;

            if let Some(started) = started {
                // Synchronize before reading the clock.
                // NOTE(review): presumably this makes the measurement include
                // any work the load left pending on the device — confirm.
                device.inner().synchronize().map_err(|e| {
                    XlogError::Kernel(format!("sync after {} load: {}", spec.cu_name, e))
                })?;

                let elapsed = started.elapsed().as_secs_f64();
                profile
                    .per_module_sec
                    .push((spec.cu_name.to_string(), elapsed));
                profile.total_sec += elapsed;

                if is_cubin {
                    profile.cubin_loaded += 1;
                } else {
                    profile.ptx_fallback += 1;
                }
            }
        }

        Ok(profiling.then_some(profile))
    }
}