use crate::activations::ActivationKind;
use core::cell::UnsafeCell;
use core::hint::spin_loop;
use core::sync::atomic::{AtomicU8, AtomicUsize, Ordering};
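/// Compute backend selected for kernel dispatch. `Cpu` is the built-in
/// fallback; the accelerator variants are only usable once a contract
/// handler (or, for the GPU, a legacy kernel hook) has been registered.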
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
#[repr(u8)]
pub enum ComputeBackend {
Cpu = 0,
Gpu = 1,
Tpu = 2,
Lpu = 3,
}
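/// Contract ABI version expected from a backend. The CPU path runs
/// in-process and has no FFI contract, so it reports `None`.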
pub fn contract_abi_version_for_backend(backend: ComputeBackend) -> Option<u16> {
match backend {
ComputeBackend::Cpu => None,
ComputeBackend::Gpu => Some(GPU_CONTRACT_ABI_VERSION),
ComputeBackend::Tpu => Some(TPU_CONTRACT_ABI_VERSION),
ComputeBackend::Lpu => Some(LPU_CONTRACT_ABI_VERSION),
}
}
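/// Checks a caller-supplied ABI version against the backend's expected one;
/// e.g. `is_contract_abi_compatible(ComputeBackend::Gpu, 1)` is `true`,
/// while any CPU query is `false`.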
pub fn is_contract_abi_compatible(backend: ComputeBackend, abi_version: u16) -> bool {
contract_abi_version_for_backend(backend)
.map(|v| v == abi_version)
.unwrap_or(false)
}
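// The contract magic values are the ASCII tags "GPUC", "TPUC", and "LPUC"
// packed big-endian into a u32.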
pub const GPU_CONTRACT_MAGIC: u32 = 0x4750_5543;
pub const GPU_CONTRACT_ABI_VERSION: u16 = 1;
pub const TPU_CONTRACT_MAGIC: u32 = 0x5450_5543;
pub const TPU_CONTRACT_ABI_VERSION: u16 = 1;
pub const LPU_CONTRACT_MAGIC: u32 = 0x4C50_5543;
pub const LPU_CONTRACT_ABI_VERSION: u16 = 1;
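/// Fixed 24-byte, C-layout prefix of every contract request. `payload_len`
/// is expected to match the opcode's command struct size, and `flags` should
/// only carry bits from `CONTRACT_KNOWN_FLAGS_MASK`.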
#[repr(C)]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct GpuContractHeader {
pub magic: u32,
pub abi_version: u16,
pub reserved: u16,
pub opcode: u32,
pub payload_len: u32,
pub payload_hash: u32,
pub flags: u32,
}
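/// A full contract request: the header plus a raw pointer to the
/// opcode-specific command payload.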
#[repr(C)]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct GpuContractRequest {
pub header: GpuContractHeader,
pub payload_ptr: *const u8,
}
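/// Contract opcodes. Discriminants start at 1 so that opcode `n` maps to
/// bit `n - 1` of an opcode mask (see `opcode_bit`).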
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
#[repr(u32)]
pub enum GpuOpCode {
F32 = 1,
F64 = 2,
SoftmaxF32 = 3,
SoftmaxF64 = 4,
LayerNormF32 = 5,
LayerNormF64 = 6,
RmsNormF32 = 7,
RmsNormF64 = 8,
AttentionF32 = 9,
AttentionF64 = 10,
QuantizeI8F32 = 11,
QuantizeI8F64 = 12,
DequantizeI8F32 = 13,
DequantizeI8F64 = 14,
SgdF32 = 15,
SgdF64 = 16,
AdamwF32 = 17,
AdamwF64 = 18,
}
pub type TpuOpCode = GpuOpCode;
pub type LpuOpCode = GpuOpCode;
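/// Status code returned by contract handlers; shared verbatim by the TPU
/// and LPU aliases below.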
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
#[repr(u32)]
pub enum GpuDispatchStatus {
Ok = 0,
NotSupported = 1,
Failed = 2,
BadContract = 3,
BadPayload = 4,
}
pub type TpuDispatchStatus = GpuDispatchStatus;
pub type LpuDispatchStatus = GpuDispatchStatus;
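// Capability feature bits derived from a contract's opcode mask.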
pub const CONTRACT_FEATURE_F32: u32 = 1 << 0;
pub const CONTRACT_FEATURE_F64: u32 = 1 << 1;
pub const CONTRACT_FEATURE_INT8: u32 = 1 << 2;
pub const CONTRACT_FEATURE_BATCH: u32 = 1 << 3;
pub const HOST_OS_FLAG_UNKNOWN: u32 = 0;
pub const HOST_OS_FLAG_LINUX: u32 = 1;
pub const HOST_OS_FLAG_WINDOWS: u32 = 2;
pub const HOST_OS_FLAG_MACOS: u32 = 3;
pub const HARDWARE_FLAG_CPU: u32 = 1 << 0;
pub const HARDWARE_FLAG_GPU: u32 = 1 << 1;
pub const HARDWARE_FLAG_TPU: u32 = 1 << 2;
pub const HARDWARE_FLAG_LPU: u32 = 1 << 3;
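// Kernel spinlock lanes: the low bits mark which kernel families are
// guarded; bits 60..=62 are optional host-OS hints.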
pub const KERNEL_SPINLOCK_FLAG: u64 = 1 << 0;
pub const KERNEL_SPINLOCK_FLAG_SOFTMAX: u64 = 1 << 1;
pub const KERNEL_SPINLOCK_FLAG_LAYER_NORM: u64 = 1 << 2;
pub const KERNEL_SPINLOCK_FLAG_RMS_NORM: u64 = 1 << 3;
pub const KERNEL_SPINLOCK_FLAG_ATTENTION: u64 = 1 << 4;
pub const KERNEL_SPINLOCK_FLAG_QUANTIZATION: u64 = 1 << 5;
pub const KERNEL_SPINLOCK_FLAG_OPTIMIZER: u64 = 1 << 6;
pub const KERNEL_SPINLOCK_FLAG_GLOBAL: u64 = 1 << 7;
pub const KERNEL_SPINLOCK_FLAG_OS_LINUX_HINT: u64 = 1 << 60;
pub const KERNEL_SPINLOCK_FLAG_OS_WINDOWS_HINT: u64 = 1 << 61;
pub const KERNEL_SPINLOCK_FLAG_OS_MACOS_HINT: u64 = 1 << 62;
pub const HARDWARE_TIMING_FLAG_CLOCK_HZ_VALID: u64 = 1 << 0;
pub const HARDWARE_TIMING_FLAG_CYCLE_COUNTER_VALID: u64 = 1 << 1;
pub const HARDWARE_TIMING_FLAG_NOMINAL_CYCLES_VALID: u64 = 1 << 2;
pub const HARDWARE_TIMING_FLAG_OBSERVED_CYCLES_VALID: u64 = 1 << 3;
pub const HARDWARE_PROBE_FLAG_HANDLER_REGISTERED: u32 = 1 << 0;
pub const HARDWARE_PROBE_FLAG_SNAPSHOT_AVAILABLE: u32 = 1 << 1;
pub const DISCOVERY_STAGE_SPINLOCK_GUARDED: u32 = 1 << 0;
pub const DISCOVERY_STAGE_STAGING_ACTIVE: u32 = 1 << 1;
pub const DISCOVERY_STAGE_PROGRESSIVE_VALIDATED: u32 = 1 << 2;
pub const DISCOVERY_STAGE_SOFT_OVERHEAD_APPLIED: u32 = 1 << 3;
pub const DISCOVERY_STAGE_FINALIZED: u32 = 1 << 4;
pub const DISCOVERY_STAGE_SOFTBUFFER_STOPPED: u32 = 1 << 5;
pub const SOFTBUFFER_RUNTIME_FLAG_CPU_STOPPED: u32 = 1 << 0;
pub const SOFTBUFFER_RUNTIME_FLAG_GPU_STOPPED: u32 = 1 << 1;
pub const SOFTBUFFER_RUNTIME_FLAG_TPU_STOPPED: u32 = 1 << 2;
pub const SOFTBUFFER_RUNTIME_FLAG_LPU_STOPPED: u32 = 1 << 3;
pub const SOFTBUFFER_RUNTIME_FLAG_SNAPSHOT_MISSING: u32 = 1 << 31;
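/// Union of every stopped lane plus the missing-snapshot marker; this is the
/// state the runtime falls back to whenever no usable hardware snapshot can
/// be produced.
const SOFTBUFFER_ALL_STOPPED_MISSING: u32 = SOFTBUFFER_RUNTIME_FLAG_CPU_STOPPED
    | SOFTBUFFER_RUNTIME_FLAG_GPU_STOPPED
    | SOFTBUFFER_RUNTIME_FLAG_TPU_STOPPED
    | SOFTBUFFER_RUNTIME_FLAG_LPU_STOPPED
    | SOFTBUFFER_RUNTIME_FLAG_SNAPSHOT_MISSING;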
pub const CONTRACT_FLAG_REQUIRE_FINITE_INPUTS: u32 = 1 << 0;
pub const CONTRACT_FLAG_REQUIRE_STRICT_ALIGNMENT: u32 = 1 << 1;
pub const CONTRACT_FLAG_DETERMINISTIC_MATH: u32 = 1 << 2;
pub const CONTRACT_KNOWN_FLAGS_MASK: u32 = CONTRACT_FLAG_REQUIRE_FINITE_INPUTS
| CONTRACT_FLAG_REQUIRE_STRICT_ALIGNMENT
| CONTRACT_FLAG_DETERMINISTIC_MATH;
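// FLOP/s scale divisors: giga (10^9), tera (10^12), and the 10^15 unit
// backing the `lflops_x1000` field.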
pub const FLOPS_SCALE_GIGA: u128 = 1_000_000_000;
pub const FLOPS_SCALE_TERA: u128 = 1_000_000_000_000;
pub const FLOPS_SCALE_LARGE: u128 = 1_000_000_000_000_000;
pub const DEFAULT_GPU_SUSTAINED_FLOPS_PER_SECOND: usize = 1_000_000_000_000;
pub const DEFAULT_TPU_SUSTAINED_FLOPS_PER_SECOND: usize = 2_000_000_000_000;
pub const DEFAULT_LPU_SUSTAINED_FLOPS_PER_SECOND: usize = 250_000_000_000;
pub const DEFAULT_GPU_TOTAL_MEMORY_BYTES: usize = 8 * 1024 * 1024 * 1024;
pub const DEFAULT_TPU_TOTAL_MEMORY_BYTES: usize = 16 * 1024 * 1024 * 1024;
pub const DEFAULT_LPU_TOTAL_MEMORY_BYTES: usize = 4 * 1024 * 1024 * 1024;
pub const DEFAULT_HARDWARE_HEADROOM_PPM: usize = 50_000;
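/// Throughput views of one FLOP/s figure. The `*_x1000` fields are
/// fixed-point with three implied decimal places (e.g. `1_500` = 1.5).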
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct FlopAbstraction {
pub flops_per_second: u128,
pub gflops_x1000: u128,
pub tflops_x1000: u128,
pub lflops_x1000: u128,
}
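/// Byte- and MiB-denominated views of a memory budget; `used_bytes` is
/// derived as `total - available`.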
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct RamAbstraction {
pub total_bytes: u128,
pub available_bytes: u128,
pub used_bytes: u128,
pub total_mib: u128,
pub available_mib: u128,
pub used_mib: u128,
}
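/// Clock and cycle-counter readings for one backend; `timing_flags` records
/// which `HARDWARE_TIMING_FLAG_*` fields are trustworthy.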
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct BackendTimingAbstraction {
pub clock_hz: u64,
pub cycle_counter_hz: u64,
pub nominal_cycles_per_kernel: u64,
pub observed_cycles_per_kernel: u64,
pub timing_flags: u64,
}
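/// Trace of the buffer discovery pipeline: discovered capacity, the staged
/// candidate, the progressively halved validated size, the soft-overhead
/// reserve, and the final approved buffer, with `stage_flags` recording
/// which `DISCOVERY_STAGE_*` steps ran.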
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct BufferStagingProfile {
pub discovered_total_bytes: u64,
pub discovered_available_bytes: u64,
pub staged_candidate_bytes: u64,
pub progressive_validated_bytes: u64,
pub soft_overhead_bytes: u64,
pub final_buffer_bytes: u64,
pub sequence_count: u32,
pub stage_flags: u32,
}
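/// Per-accelerator slice of a hardware snapshot. `present` is a C-friendly
/// boolean (0 or 1); a component with `present == 0` or `device_count == 0`
/// is treated as absent and its remaining fields are ignored.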
#[repr(C)]
#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)]
pub struct HardwareComponentSnapshot {
pub present: u8,
pub reserved0: u8,
pub reserved1: u8,
pub reserved2: u8,
pub device_count: u32,
pub total_memory_bytes: u64,
pub available_memory_bytes: u64,
pub sustained_flops_per_second: u64,
pub kernel_spinlock_flags: u64,
pub clock_hz: u64,
pub cycle_counter_hz: u64,
pub nominal_cycles_per_kernel: u64,
pub observed_cycles_per_kernel: u64,
pub timing_flags: u64,
}
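/// Full hardware snapshot, produced by a registered probe or installed via
/// `set_hardware_snapshot`. CPU fields are flattened at the top level; each
/// accelerator gets one `HardwareComponentSnapshot`.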
#[repr(C)]
#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)]
pub struct HardwareDetectionSnapshot {
pub cpu_logical_cores: u32,
pub os_flags: u32,
pub hardware_flags: u32,
pub system_total_memory_bytes: u64,
pub system_available_memory_bytes: u64,
pub cpu_kernel_spinlock_flags: u64,
pub cpu_clock_hz: u64,
pub cpu_cycle_counter_hz: u64,
pub cpu_nominal_cycles_per_kernel: u64,
pub cpu_observed_cycles_per_kernel: u64,
pub cpu_timing_flags: u64,
pub gpu: HardwareComponentSnapshot,
pub tpu: HardwareComponentSnapshot,
pub lpu: HardwareComponentSnapshot,
}
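/// C-ABI probe callback: fill `out_snapshot` and return non-zero on success,
/// or 0 to signal that no snapshot is available.
///
/// A minimal sketch of a probe an embedder might register (the device
/// figures here are invented for illustration):
///
/// ```ignore
/// extern "C" fn my_probe(out: *mut HardwareDetectionSnapshot, _ctx: usize) -> u32 {
///     let mut snap = HardwareDetectionSnapshot::default();
///     snap.os_flags = HOST_OS_FLAG_LINUX;
///     snap.cpu_logical_cores = 8;
///     snap.system_total_memory_bytes = 32 << 30;
///     snap.system_available_memory_bytes = 24 << 30;
///     snap.gpu.present = 1;
///     snap.gpu.device_count = 1;
///     snap.gpu.total_memory_bytes = 8 << 30;
///     snap.gpu.available_memory_bytes = 6 << 30;
///     snap.gpu.sustained_flops_per_second = 1_000_000_000_000;
///     unsafe { *out = snap };
///     1
/// }
/// ```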
pub type HardwareProbeFn =
extern "C" fn(out_snapshot: *mut HardwareDetectionSnapshot, user_ctx: usize) -> u32;
pub fn build_flop_abstraction_with_l_scale(
flops_per_second: u128,
lflop_scale_divisor: u128,
) -> FlopAbstraction {
let l_scale = if lflop_scale_divisor == 0 {
FLOPS_SCALE_LARGE
} else {
lflop_scale_divisor
};
FlopAbstraction {
flops_per_second,
gflops_x1000: flops_per_second.saturating_mul(1000) / FLOPS_SCALE_GIGA,
tflops_x1000: flops_per_second.saturating_mul(1000) / FLOPS_SCALE_TERA,
lflops_x1000: flops_per_second.saturating_mul(1000) / l_scale,
}
}
pub fn build_flop_abstraction(flops_per_second: u128) -> FlopAbstraction {
build_flop_abstraction_with_l_scale(flops_per_second, FLOPS_SCALE_LARGE)
}
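/// Builds a `RamAbstraction`, clamping `available_bytes` to the total so the
/// derived `used_bytes` can never underflow.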
pub fn build_ram_abstraction(total_bytes: u128, available_bytes: u128) -> RamAbstraction {
let total = total_bytes;
let available = available_bytes.min(total);
let used = total.saturating_sub(available);
let mib = 1024u128 * 1024u128;
RamAbstraction {
total_bytes: total,
available_bytes: available,
used_bytes: used,
total_mib: total / mib,
available_mib: available / mib,
used_mib: used / mib,
}
}
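// Bits 0..=17 cover opcodes 1..=18 (opcode `n` occupies bit `n - 1`).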
const ALL_OPCODE_MASK: u32 = (1u32 << 18) - 1;
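/// Why a contract dispatch was refused; raw values are fixed so they can be
/// reported across the FFI boundary.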
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
#[repr(u32)]
pub enum ContractRejectReason {
NoActiveBackendContract = 1,
OpcodeNotSupported = 2,
PayloadSizeMismatch = 3,
NullPayloadPointer = 4,
NoActiveContractInfo = 5,
NoRegisteredHandler = 6,
HandlerReturnedNonOk = 7,
UnsupportedRequestFlags = 8,
InsufficientSustainedFlops = 9,
InsufficientAvailableMemory = 10,
}
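/// Capability report assembled by `probe_backend_contract`: contract
/// identity (magic/ABI), the supported opcode mask and derived feature
/// flags, size and alignment limits, and the detected throughput/memory
/// profile.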
#[repr(C)]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct BackendContractCapabilities {
pub magic: u32,
pub abi_version: u16,
pub reserved: u16,
pub opcode_mask: u32,
pub feature_flags: u32,
pub max_batch: u32,
pub max_elements: u32,
pub preferred_alignment: u32,
pub sustained_flops_per_second: u64,
pub total_memory_bytes: u64,
pub available_memory_bytes: u64,
}
fn opcode_bit(opcode: GpuOpCode) -> u32 {
1u32 << ((opcode as u32) - 1)
}
fn opcode_from_raw(raw: u32) -> Option<GpuOpCode> {
match raw {
1 => Some(GpuOpCode::F32),
2 => Some(GpuOpCode::F64),
3 => Some(GpuOpCode::SoftmaxF32),
4 => Some(GpuOpCode::SoftmaxF64),
5 => Some(GpuOpCode::LayerNormF32),
6 => Some(GpuOpCode::LayerNormF64),
7 => Some(GpuOpCode::RmsNormF32),
8 => Some(GpuOpCode::RmsNormF64),
9 => Some(GpuOpCode::AttentionF32),
10 => Some(GpuOpCode::AttentionF64),
11 => Some(GpuOpCode::QuantizeI8F32),
12 => Some(GpuOpCode::QuantizeI8F64),
13 => Some(GpuOpCode::DequantizeI8F32),
14 => Some(GpuOpCode::DequantizeI8F64),
15 => Some(GpuOpCode::SgdF32),
16 => Some(GpuOpCode::SgdF64),
17 => Some(GpuOpCode::AdamwF32),
18 => Some(GpuOpCode::AdamwF64),
_ => None,
}
}
fn contract_reject_reason_from_raw(raw: u32) -> Option<ContractRejectReason> {
match raw {
1 => Some(ContractRejectReason::NoActiveBackendContract),
2 => Some(ContractRejectReason::OpcodeNotSupported),
3 => Some(ContractRejectReason::PayloadSizeMismatch),
4 => Some(ContractRejectReason::NullPayloadPointer),
5 => Some(ContractRejectReason::NoActiveContractInfo),
6 => Some(ContractRejectReason::NoRegisteredHandler),
7 => Some(ContractRejectReason::HandlerReturnedNonOk),
8 => Some(ContractRejectReason::UnsupportedRequestFlags),
9 => Some(ContractRejectReason::InsufficientSustainedFlops),
10 => Some(ContractRejectReason::InsufficientAvailableMemory),
_ => None,
}
}
fn backend_from_raw(raw: u8) -> Option<ComputeBackend> {
match raw {
0 => Some(ComputeBackend::Cpu),
1 => Some(ComputeBackend::Gpu),
2 => Some(ComputeBackend::Tpu),
3 => Some(ComputeBackend::Lpu),
_ => None,
}
}
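// Each opcode's payload must be exactly its command struct; dispatch
// validation compares `payload_len` against this.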
fn expected_payload_len(opcode: GpuOpCode) -> usize {
match opcode {
GpuOpCode::F32 => core::mem::size_of::<KernelCmdF32>(),
GpuOpCode::F64 => core::mem::size_of::<KernelCmdF64>(),
GpuOpCode::SoftmaxF32 => core::mem::size_of::<SoftmaxCmdF32>(),
GpuOpCode::SoftmaxF64 => core::mem::size_of::<SoftmaxCmdF64>(),
GpuOpCode::LayerNormF32 => core::mem::size_of::<LayerNormCmdF32>(),
GpuOpCode::LayerNormF64 => core::mem::size_of::<LayerNormCmdF64>(),
GpuOpCode::RmsNormF32 => core::mem::size_of::<RmsNormCmdF32>(),
GpuOpCode::RmsNormF64 => core::mem::size_of::<RmsNormCmdF64>(),
GpuOpCode::AttentionF32 => core::mem::size_of::<AttentionCmdF32>(),
GpuOpCode::AttentionF64 => core::mem::size_of::<AttentionCmdF64>(),
GpuOpCode::QuantizeI8F32 => core::mem::size_of::<QuantizeI8CmdF32>(),
GpuOpCode::QuantizeI8F64 => core::mem::size_of::<QuantizeI8CmdF64>(),
GpuOpCode::DequantizeI8F32 => core::mem::size_of::<DequantizeI8CmdF32>(),
GpuOpCode::DequantizeI8F64 => core::mem::size_of::<DequantizeI8CmdF64>(),
GpuOpCode::SgdF32 => core::mem::size_of::<SgdCmdF32>(),
GpuOpCode::SgdF64 => core::mem::size_of::<SgdCmdF64>(),
GpuOpCode::AdamwF32 => core::mem::size_of::<AdamwCmdF32>(),
GpuOpCode::AdamwF64 => core::mem::size_of::<AdamwCmdF64>(),
}
}
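// Derives CONTRACT_FEATURE_* bits from an opcode mask. Batch execution is
// always advertised regardless of the mask.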
fn feature_flags_from_opcode_mask(opcode_mask: u32) -> u32 {
let has_f32 = (opcode_mask
& (opcode_bit(GpuOpCode::F32)
| opcode_bit(GpuOpCode::SoftmaxF32)
| opcode_bit(GpuOpCode::LayerNormF32)
| opcode_bit(GpuOpCode::RmsNormF32)
| opcode_bit(GpuOpCode::AttentionF32)
| opcode_bit(GpuOpCode::SgdF32)
| opcode_bit(GpuOpCode::AdamwF32)))
!= 0;
let has_f64 = (opcode_mask
& (opcode_bit(GpuOpCode::F64)
| opcode_bit(GpuOpCode::SoftmaxF64)
| opcode_bit(GpuOpCode::LayerNormF64)
| opcode_bit(GpuOpCode::RmsNormF64)
| opcode_bit(GpuOpCode::AttentionF64)
| opcode_bit(GpuOpCode::SgdF64)
| opcode_bit(GpuOpCode::AdamwF64)))
!= 0;
let has_int8 = (opcode_mask
& (opcode_bit(GpuOpCode::QuantizeI8F32)
| opcode_bit(GpuOpCode::QuantizeI8F64)
| opcode_bit(GpuOpCode::DequantizeI8F32)
| opcode_bit(GpuOpCode::DequantizeI8F64)))
!= 0;
let mut flags = 0u32;
if has_f32 {
flags |= CONTRACT_FEATURE_F32;
}
if has_f64 {
flags |= CONTRACT_FEATURE_F64;
}
if has_int8 {
flags |= CONTRACT_FEATURE_INT8;
}
flags | CONTRACT_FEATURE_BATCH
}
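/// Probes one backend's contract. Accelerators are first re-synced against
/// the detected hardware profile and must be compute-present; the CPU has no
/// contract and always reports `None`. The opcode mask prefers an explicitly
/// configured mask, falls back to "all opcodes" for a registered handler,
/// and (GPU only) to the legacy kernel hooks.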
pub fn probe_backend_contract(backend: ComputeBackend) -> Option<BackendContractCapabilities> {
if backend != ComputeBackend::Cpu {
sync_backend_detected_profile(backend);
}
if backend != ComputeBackend::Cpu && !backend_compute_present_from_detected_hardware(backend) {
return None;
}
let (magic, abi_version, opcode_mask, preferred_alignment) = match backend {
ComputeBackend::Cpu => return None,
        ComputeBackend::Gpu => {
            let has_handler = GPU_COMMAND_HANDLER.load(Ordering::SeqCst) != 0;
            let opcode_mask = if has_handler {
                let explicit = GPU_OPCODE_MASK.load(Ordering::SeqCst) as u32;
                if explicit != 0 {
                    explicit & ALL_OPCODE_MASK
                } else {
                    ALL_OPCODE_MASK
                }
            } else {
                // No contract handler: advertise only the legacy
                // per-precision kernel hooks, if any are registered.
                let mut mask = 0u32;
                if GPU_KERNEL_F32.load(Ordering::SeqCst) != 0 {
                    mask |= opcode_bit(GpuOpCode::F32);
                }
                if GPU_KERNEL_F64.load(Ordering::SeqCst) != 0 {
                    mask |= opcode_bit(GpuOpCode::F64);
                }
                mask
            };
            (
                GPU_CONTRACT_MAGIC,
                GPU_CONTRACT_ABI_VERSION,
                opcode_mask,
                32,
            )
        }
ComputeBackend::Tpu => {
let opcode_mask = if TPU_COMMAND_HANDLER.load(Ordering::SeqCst) != 0 {
let explicit = TPU_OPCODE_MASK.load(Ordering::SeqCst) as u32;
if explicit != 0 {
explicit & ALL_OPCODE_MASK
} else {
ALL_OPCODE_MASK
}
} else {
0u32
};
(
TPU_CONTRACT_MAGIC,
TPU_CONTRACT_ABI_VERSION,
opcode_mask,
64,
)
}
ComputeBackend::Lpu => {
let opcode_mask = if LPU_COMMAND_HANDLER.load(Ordering::SeqCst) != 0 {
let explicit = LPU_OPCODE_MASK.load(Ordering::SeqCst) as u32;
if explicit != 0 {
explicit & ALL_OPCODE_MASK
} else {
ALL_OPCODE_MASK
}
} else {
0u32
};
(
LPU_CONTRACT_MAGIC,
LPU_CONTRACT_ABI_VERSION,
opcode_mask,
64,
)
}
};
Some(BackendContractCapabilities {
magic,
abi_version,
reserved: 0,
opcode_mask,
feature_flags: feature_flags_from_opcode_mask(opcode_mask),
max_batch: u32::MAX,
max_elements: u32::MAX,
preferred_alignment,
sustained_flops_per_second: backend_detected_sustained_flops_per_second(backend) as u64,
total_memory_bytes: backend_detected_total_memory_bytes(backend) as u64,
available_memory_bytes: backend_detected_available_memory_bytes(backend) as u64,
})
}
pub fn active_backend_contract_capabilities() -> Option<BackendContractCapabilities> {
probe_backend_contract(get_compute_backend())
}
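/// True when the backend's probed contract advertises `opcode`.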
pub fn backend_supports_opcode(backend: ComputeBackend, opcode: GpuOpCode) -> bool {
probe_backend_contract(backend)
.map(|caps| (caps.opcode_mask & opcode_bit(opcode)) != 0)
.unwrap_or(false)
}
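// C-ABI command handlers. All three backends share the GPU request layout;
// the aliases exist so embedder signatures stay self-describing.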
pub type GpuCommandHandler =
extern "C" fn(request: *const GpuContractRequest, user_ctx: usize) -> u32;
pub type TpuCommandHandler =
extern "C" fn(request: *const GpuContractRequest, user_ctx: usize) -> u32;
pub type LpuCommandHandler =
extern "C" fn(request: *const GpuContractRequest, user_ctx: usize) -> u32;
pub type ContractRejectHandler = extern "C" fn(
reason: ContractRejectReason,
backend: ComputeBackend,
opcode: u32,
user_ctx: usize,
);
pub type TpuContractHeader = GpuContractHeader;
pub type TpuContractRequest = GpuContractRequest;
pub type LpuContractHeader = GpuContractHeader;
pub type LpuContractRequest = GpuContractRequest;
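// Legacy safe-Rust kernel hooks: an older registration path that only
// covers the dense F32/F64 forward kernels.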
pub type GpuKernelF32 = fn(
src: &[f32],
dst: &mut [f32],
batch_size: usize,
stride: usize,
in_size: usize,
out_size: usize,
weights: &[f32],
biases: &[f32],
activation: ActivationKind,
) -> bool;
pub type GpuKernelF64 = fn(
src: &[f64],
dst: &mut [f64],
batch_size: usize,
stride: usize,
in_size: usize,
out_size: usize,
weights: &[f64],
biases: &[f64],
activation: ActivationKind,
) -> bool;
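// C-layout command payloads. Every slice crosses the boundary as a
// pointer/length pair, and enum-like fields (`activation`, `mask`,
// `nesterov`) travel as raw integers.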
#[repr(C)]
pub struct KernelCmdF32 {
pub src_ptr: *const f32,
pub src_len: usize,
pub dst_ptr: *mut f32,
pub dst_len: usize,
pub batch_size: usize,
pub stride: usize,
pub in_size: usize,
pub out_size: usize,
pub weights_ptr: *const f32,
pub weights_len: usize,
pub biases_ptr: *const f32,
pub biases_len: usize,
pub activation: u8,
}
#[repr(C)]
pub struct KernelCmdF64 {
pub src_ptr: *const f64,
pub src_len: usize,
pub dst_ptr: *mut f64,
pub dst_len: usize,
pub batch_size: usize,
pub stride: usize,
pub in_size: usize,
pub out_size: usize,
pub weights_ptr: *const f64,
pub weights_len: usize,
pub biases_ptr: *const f64,
pub biases_len: usize,
pub activation: u8,
}
#[repr(C)]
pub struct SoftmaxCmdF32 {
pub logits_ptr: *const f32,
pub out_ptr: *mut f32,
pub len: usize,
}
#[repr(C)]
pub struct SoftmaxCmdF64 {
pub logits_ptr: *const f64,
pub out_ptr: *mut f64,
pub len: usize,
}
#[repr(C)]
pub struct LayerNormCmdF32 {
pub x_ptr: *mut f32,
pub gamma_ptr: *const f32,
pub beta_ptr: *const f32,
pub len: usize,
pub eps: f32,
}
#[repr(C)]
pub struct LayerNormCmdF64 {
pub x_ptr: *mut f64,
pub gamma_ptr: *const f64,
pub beta_ptr: *const f64,
pub len: usize,
pub eps: f64,
}
#[repr(C)]
pub struct RmsNormCmdF32 {
pub x_ptr: *mut f32,
pub gamma_ptr: *const f32,
pub len: usize,
pub eps: f32,
}
#[repr(C)]
pub struct RmsNormCmdF64 {
pub x_ptr: *mut f64,
pub gamma_ptr: *const f64,
pub len: usize,
pub eps: f64,
}
#[repr(C)]
pub struct AttentionCmdF32 {
pub q_ptr: *const f32,
pub q_len_total: usize,
pub k_ptr: *const f32,
pub k_len_total: usize,
pub v_ptr: *const f32,
pub v_len_total: usize,
pub out_ptr: *mut f32,
pub out_len_total: usize,
pub scratch_scores_ptr: *mut f32,
pub scratch_scores_len: usize,
pub q_len: usize,
pub k_len: usize,
pub d_k: usize,
pub d_v: usize,
pub mask: u8,
}
#[repr(C)]
pub struct AttentionCmdF64 {
pub q_ptr: *const f64,
pub q_len_total: usize,
pub k_ptr: *const f64,
pub k_len_total: usize,
pub v_ptr: *const f64,
pub v_len_total: usize,
pub out_ptr: *mut f64,
pub out_len_total: usize,
pub scratch_scores_ptr: *mut f64,
pub scratch_scores_len: usize,
pub q_len: usize,
pub k_len: usize,
pub d_k: usize,
pub d_v: usize,
pub mask: u8,
}
#[repr(C)]
pub struct QuantizeI8CmdF32 {
pub input_ptr: *const f32,
pub input_len: usize,
pub output_ptr: *mut i8,
pub output_len: usize,
pub scale_out_ptr: *mut f32,
}
#[repr(C)]
pub struct QuantizeI8CmdF64 {
pub input_ptr: *const f64,
pub input_len: usize,
pub output_ptr: *mut i8,
pub output_len: usize,
pub scale_out_ptr: *mut f64,
}
#[repr(C)]
pub struct DequantizeI8CmdF32 {
pub input_ptr: *const i8,
pub input_len: usize,
pub output_ptr: *mut f32,
pub output_len: usize,
pub scale: f32,
}
#[repr(C)]
pub struct DequantizeI8CmdF64 {
pub input_ptr: *const i8,
pub input_len: usize,
pub output_ptr: *mut f64,
pub output_len: usize,
pub scale: f64,
}
#[repr(C)]
pub struct SgdCmdF32 {
pub params_ptr: *mut f32,
pub grads_ptr: *const f32,
pub velocity_ptr: *mut f32,
pub len: usize,
pub learning_rate: f32,
pub momentum: f32,
pub nesterov: u8,
}
#[repr(C)]
pub struct SgdCmdF64 {
pub params_ptr: *mut f64,
pub grads_ptr: *const f64,
pub velocity_ptr: *mut f64,
pub len: usize,
pub learning_rate: f64,
pub momentum: f64,
pub nesterov: u8,
}
#[repr(C)]
pub struct AdamwCmdF32 {
pub params_ptr: *mut f32,
pub grads_ptr: *const f32,
pub m_ptr: *mut f32,
pub v_ptr: *mut f32,
pub len: usize,
pub learning_rate: f32,
pub step: u32,
pub beta1: f32,
pub beta2: f32,
pub eps: f32,
pub weight_decay: f32,
}
#[repr(C)]
pub struct AdamwCmdF64 {
pub params_ptr: *mut f64,
pub grads_ptr: *const f64,
pub m_ptr: *mut f64,
pub v_ptr: *mut f64,
pub len: usize,
pub learning_rate: f64,
pub step: u32,
pub beta1: f64,
pub beta2: f64,
pub eps: f64,
pub weight_decay: f64,
}
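// Borrow-checked invocation views: safe Rust counterparts of the raw
// command structs above.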
pub struct KernelInvokeF32<'a> {
pub src: &'a [f32],
pub dst: &'a mut [f32],
pub batch_size: usize,
pub stride: usize,
pub in_size: usize,
pub out_size: usize,
pub weights: &'a [f32],
pub biases: &'a [f32],
pub activation: ActivationKind,
}
pub struct KernelInvokeF64<'a> {
pub src: &'a [f64],
pub dst: &'a mut [f64],
pub batch_size: usize,
pub stride: usize,
pub in_size: usize,
pub out_size: usize,
pub weights: &'a [f64],
pub biases: &'a [f64],
pub activation: ActivationKind,
}
pub struct AttentionInvokeF32<'a> {
pub q: &'a [f32],
pub k: &'a [f32],
pub v: &'a [f32],
pub out: &'a mut [f32],
pub scratch_scores: &'a mut [f32],
pub q_len: usize,
pub k_len: usize,
pub d_k: usize,
pub d_v: usize,
pub mask: u8,
}
pub struct AttentionInvokeF64<'a> {
pub q: &'a [f64],
pub k: &'a [f64],
pub v: &'a [f64],
pub out: &'a mut [f64],
pub scratch_scores: &'a mut [f64],
pub q_len: usize,
pub k_len: usize,
pub d_k: usize,
pub d_v: usize,
pub mask: u8,
}
pub struct AdamwInvokeF32<'a> {
pub params: &'a mut [f32],
pub grads: &'a [f32],
pub m: &'a mut [f32],
pub v: &'a mut [f32],
pub learning_rate: f32,
pub step: u32,
pub beta1: f32,
pub beta2: f32,
pub eps: f32,
pub weight_decay: f32,
}
pub struct AdamwInvokeF64<'a> {
pub params: &'a mut [f64],
pub grads: &'a [f64],
pub m: &'a mut [f64],
pub v: &'a mut [f64],
pub learning_rate: f64,
pub step: u32,
pub beta1: f64,
pub beta2: f64,
pub eps: f64,
pub weight_decay: f64,
}
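// Global registry. Function pointers are stored as `usize` (0 = vacant) so
// everything fits in lock-free atomics; the `decode_*` helpers below
// transmute them back.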
static BACKEND: AtomicU8 = AtomicU8::new(ComputeBackend::Cpu as u8);
static HARDWARE_CONTRACT_STRICT: AtomicU8 = AtomicU8::new(1);
static CONTRACT_RUNTIME_FLAGS: AtomicUsize = AtomicUsize::new(0);
static CONTRACT_REJECT_HANDLER: AtomicUsize = AtomicUsize::new(0);
static CONTRACT_REJECT_USER_CTX: AtomicUsize = AtomicUsize::new(0);
static LAST_CONTRACT_REJECT_REASON: AtomicUsize = AtomicUsize::new(0);
static LAST_CONTRACT_REJECT_BACKEND: AtomicU8 = AtomicU8::new(ComputeBackend::Cpu as u8);
static LAST_CONTRACT_REJECT_OPCODE: AtomicUsize = AtomicUsize::new(0);
static GPU_KERNEL_F32: AtomicUsize = AtomicUsize::new(0);
static GPU_KERNEL_F64: AtomicUsize = AtomicUsize::new(0);
static GPU_COMMAND_HANDLER: AtomicUsize = AtomicUsize::new(0);
static GPU_COMMAND_USER_CTX: AtomicUsize = AtomicUsize::new(0);
static GPU_OPCODE_MASK: AtomicUsize = AtomicUsize::new(0);
static TPU_COMMAND_HANDLER: AtomicUsize = AtomicUsize::new(0);
static TPU_COMMAND_USER_CTX: AtomicUsize = AtomicUsize::new(0);
static TPU_OPCODE_MASK: AtomicUsize = AtomicUsize::new(0);
static LPU_COMMAND_HANDLER: AtomicUsize = AtomicUsize::new(0);
static LPU_COMMAND_USER_CTX: AtomicUsize = AtomicUsize::new(0);
static LPU_OPCODE_MASK: AtomicUsize = AtomicUsize::new(0);
static GPU_SUSTAINED_FLOPS_PER_SECOND: AtomicUsize = AtomicUsize::new(0);
static TPU_SUSTAINED_FLOPS_PER_SECOND: AtomicUsize = AtomicUsize::new(0);
static LPU_SUSTAINED_FLOPS_PER_SECOND: AtomicUsize = AtomicUsize::new(0);
static GPU_TOTAL_MEMORY_BYTES: AtomicUsize = AtomicUsize::new(0);
static TPU_TOTAL_MEMORY_BYTES: AtomicUsize = AtomicUsize::new(0);
static LPU_TOTAL_MEMORY_BYTES: AtomicUsize = AtomicUsize::new(0);
static GPU_AVAILABLE_MEMORY_BYTES: AtomicUsize = AtomicUsize::new(0);
static TPU_AVAILABLE_MEMORY_BYTES: AtomicUsize = AtomicUsize::new(0);
static LPU_AVAILABLE_MEMORY_BYTES: AtomicUsize = AtomicUsize::new(0);
static CONTRACT_MIN_SUSTAINED_FLOPS_PER_SECOND: AtomicUsize = AtomicUsize::new(0);
static CONTRACT_MIN_AVAILABLE_MEMORY_BYTES: AtomicUsize = AtomicUsize::new(0);
static HARDWARE_HEADROOM_PPM: AtomicUsize = AtomicUsize::new(DEFAULT_HARDWARE_HEADROOM_PPM);
static HARDWARE_PROBE_HANDLER: AtomicUsize = AtomicUsize::new(0);
static HARDWARE_PROBE_USER_CTX: AtomicUsize = AtomicUsize::new(0);
static SOFTBUFFER_RUNTIME_FLAGS: AtomicUsize =
    AtomicUsize::new(SOFTBUFFER_ALL_STOPPED_MISSING as usize);
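/// Spinlock-guarded cell holding the last installed hardware snapshot;
/// `valid` flips to 1 once a snapshot has been stored.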
struct SnapshotStore {
lock: AtomicU8,
valid: AtomicU8,
snapshot: UnsafeCell<HardwareDetectionSnapshot>,
}
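// SAFETY: the inner snapshot is only read or written while `lock` is held
// (see `acquire_snapshot_lock`), so sharing across threads is sound.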
unsafe impl Sync for SnapshotStore {}
/// Zeroed component literal for the `static` initializer below, where the
/// non-const `Default::default()` cannot be called.
const EMPTY_COMPONENT: HardwareComponentSnapshot = HardwareComponentSnapshot {
    present: 0,
    reserved0: 0,
    reserved1: 0,
    reserved2: 0,
    device_count: 0,
    total_memory_bytes: 0,
    available_memory_bytes: 0,
    sustained_flops_per_second: 0,
    kernel_spinlock_flags: 0,
    clock_hz: 0,
    cycle_counter_hz: 0,
    nominal_cycles_per_kernel: 0,
    observed_cycles_per_kernel: 0,
    timing_flags: 0,
};
static SNAPSHOT_STORE: SnapshotStore = SnapshotStore {
    lock: AtomicU8::new(0),
    valid: AtomicU8::new(0),
    snapshot: UnsafeCell::new(HardwareDetectionSnapshot {
        cpu_logical_cores: 0,
        os_flags: HOST_OS_FLAG_UNKNOWN,
        hardware_flags: HARDWARE_FLAG_CPU,
        system_total_memory_bytes: 0,
        system_available_memory_bytes: 0,
        cpu_kernel_spinlock_flags: 0,
        cpu_clock_hz: 0,
        cpu_cycle_counter_hz: 0,
        cpu_nominal_cycles_per_kernel: 0,
        cpu_observed_cycles_per_kernel: 0,
        cpu_timing_flags: 0,
        gpu: EMPTY_COMPONENT,
        tpu: EMPTY_COMPONENT,
        lpu: EMPTY_COMPONENT,
    }),
};
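// Minimal test-and-set spin lock over the snapshot cell.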
#[inline]
fn acquire_snapshot_lock() {
loop {
if SNAPSHOT_STORE
.lock
.compare_exchange(0, 1, Ordering::Acquire, Ordering::Relaxed)
.is_ok()
{
break;
}
spin_loop();
}
}
#[inline]
fn release_snapshot_lock() {
SNAPSHOT_STORE.lock.store(0, Ordering::Release);
}
fn load_snapshot() -> Option<HardwareDetectionSnapshot> {
if SNAPSHOT_STORE.valid.load(Ordering::SeqCst) == 0 {
return None;
}
acquire_snapshot_lock();
let snapshot = unsafe { *SNAPSHOT_STORE.snapshot.get() };
release_snapshot_lock();
Some(snapshot)
}
fn store_snapshot(snapshot: HardwareDetectionSnapshot) {
acquire_snapshot_lock();
unsafe {
*SNAPSHOT_STORE.snapshot.get() = snapshot;
}
SNAPSHOT_STORE.valid.store(1, Ordering::SeqCst);
release_snapshot_lock();
}
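// Reverse the usize encoding used by the registries. SAFETY: a non-zero
// value is only ever written by the matching `register_*` function, which
// stored a valid function pointer of the same type.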
#[inline]
fn decode_contract_reject_handler(ptr: usize) -> Option<ContractRejectHandler> {
if ptr == 0 {
return None;
}
Some(unsafe { core::mem::transmute::<usize, ContractRejectHandler>(ptr) })
}
#[inline]
fn decode_gpu_command_handler(ptr: usize) -> Option<GpuCommandHandler> {
if ptr == 0 {
return None;
}
Some(unsafe { core::mem::transmute::<usize, GpuCommandHandler>(ptr) })
}
#[inline]
fn decode_gpu_kernel_f32(ptr: usize) -> Option<GpuKernelF32> {
if ptr == 0 {
return None;
}
Some(unsafe { core::mem::transmute::<usize, GpuKernelF32>(ptr) })
}
#[inline]
fn decode_gpu_kernel_f64(ptr: usize) -> Option<GpuKernelF64> {
if ptr == 0 {
return None;
}
Some(unsafe { core::mem::transmute::<usize, GpuKernelF64>(ptr) })
}
#[inline]
fn decode_hardware_probe_handler(ptr: usize) -> Option<HardwareProbeFn> {
if ptr == 0 {
return None;
}
Some(unsafe { core::mem::transmute::<usize, HardwareProbeFn>(ptr) })
}
#[inline]
fn sanitize_component(mut component: HardwareComponentSnapshot) -> HardwareComponentSnapshot {
if component.present == 0 || component.device_count == 0 {
component.present = 0;
component.device_count = 0;
component.total_memory_bytes = 0;
component.available_memory_bytes = 0;
component.sustained_flops_per_second = 0;
} else if component.available_memory_bytes > component.total_memory_bytes {
component.available_memory_bytes = component.total_memory_bytes;
}
component
}
fn normalize_hardware_snapshot(
mut snapshot: HardwareDetectionSnapshot,
) -> Option<HardwareDetectionSnapshot> {
if !is_valid_os_flag_value(snapshot.os_flags) {
return None;
}
if snapshot.system_available_memory_bytes > snapshot.system_total_memory_bytes {
snapshot.system_available_memory_bytes = snapshot.system_total_memory_bytes;
}
snapshot.gpu = sanitize_component(snapshot.gpu);
snapshot.tpu = sanitize_component(snapshot.tpu);
snapshot.lpu = sanitize_component(snapshot.lpu);
let mut normalized_flags = HARDWARE_FLAG_CPU;
if component_compute_ready(snapshot.gpu) {
normalized_flags |= HARDWARE_FLAG_GPU;
}
if component_compute_ready(snapshot.tpu) {
normalized_flags |= HARDWARE_FLAG_TPU;
}
if component_compute_ready(snapshot.lpu) {
normalized_flags |= HARDWARE_FLAG_LPU;
}
snapshot.hardware_flags = normalized_flags;
Some(snapshot)
}
#[inline]
fn cpu_buffers_ready(snapshot: &HardwareDetectionSnapshot) -> bool {
snapshot.system_total_memory_bytes > 0
&& snapshot.system_available_memory_bytes > 0
&& snapshot.system_available_memory_bytes <= snapshot.system_total_memory_bytes
}
#[inline]
fn cpu_softbuffer_ready(snapshot: &HardwareDetectionSnapshot) -> bool {
cpu_buffers_ready(snapshot) && snapshot.cpu_kernel_spinlock_flags != 0
}
#[inline]
fn component_softbuffer_ready(component: HardwareComponentSnapshot) -> bool {
component_buffers_ready(component) && component.kernel_spinlock_flags != 0
}
fn compute_softbuffer_runtime_flags(snapshot: &HardwareDetectionSnapshot) -> u32 {
let mut flags = 0u32;
if !cpu_softbuffer_ready(snapshot) {
flags |= SOFTBUFFER_RUNTIME_FLAG_CPU_STOPPED;
}
if !component_softbuffer_ready(snapshot.gpu) {
flags |= SOFTBUFFER_RUNTIME_FLAG_GPU_STOPPED;
}
if !component_softbuffer_ready(snapshot.tpu) {
flags |= SOFTBUFFER_RUNTIME_FLAG_TPU_STOPPED;
}
if !component_softbuffer_ready(snapshot.lpu) {
flags |= SOFTBUFFER_RUNTIME_FLAG_LPU_STOPPED;
}
flags
}
fn set_softbuffer_runtime_flags(flags: u32) {
SOFTBUFFER_RUNTIME_FLAGS.store(flags as usize, Ordering::SeqCst);
}
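/// Produces the current hardware snapshot: a registered probe callback takes
/// precedence, otherwise the stored snapshot is used. As a side effect the
/// softbuffer runtime flags are refreshed, degrading to the all-stopped,
/// snapshot-missing state when no usable snapshot exists.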
fn run_hardware_probe() -> Option<HardwareDetectionSnapshot> {
let handler_ptr = HARDWARE_PROBE_HANDLER.load(Ordering::SeqCst);
if let Some(handler) = decode_hardware_probe_handler(handler_ptr) {
let user_ctx = HARDWARE_PROBE_USER_CTX.load(Ordering::SeqCst);
let mut snapshot = HardwareDetectionSnapshot::default();
let status = handler(&mut snapshot as *mut HardwareDetectionSnapshot, user_ctx);
if status == 0 {
            set_softbuffer_runtime_flags(SOFTBUFFER_ALL_STOPPED_MISSING);
return None;
}
let normalized = normalize_hardware_snapshot(snapshot)?;
set_softbuffer_runtime_flags(compute_softbuffer_runtime_flags(&normalized));
Some(normalized)
} else {
let Some(snapshot) = load_snapshot() else {
            set_softbuffer_runtime_flags(SOFTBUFFER_ALL_STOPPED_MISSING);
return None;
};
let normalized = normalize_hardware_snapshot(snapshot)?;
set_softbuffer_runtime_flags(compute_softbuffer_runtime_flags(&normalized));
Some(normalized)
}
}
#[inline]
fn is_valid_os_flag_value(os_flags: u32) -> bool {
os_flags <= HOST_OS_FLAG_MACOS
}
#[inline]
fn snapshot_component(
snapshot: &HardwareDetectionSnapshot,
backend: ComputeBackend,
) -> Option<HardwareComponentSnapshot> {
match backend {
ComputeBackend::Cpu => None,
ComputeBackend::Gpu => Some(snapshot.gpu),
ComputeBackend::Tpu => Some(snapshot.tpu),
ComputeBackend::Lpu => Some(snapshot.lpu),
}
}
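/// Case-insensitive OS-name lookup, e.g. `"Darwin"` maps to
/// `HOST_OS_FLAG_MACOS`; unrecognized names map to `HOST_OS_FLAG_UNKNOWN`.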
pub fn host_os_flags_from_name(os_name: &str) -> u32 {
if os_name.eq_ignore_ascii_case("linux") {
HOST_OS_FLAG_LINUX
} else if os_name.eq_ignore_ascii_case("windows") {
HOST_OS_FLAG_WINDOWS
} else if os_name.eq_ignore_ascii_case("macos")
|| os_name.eq_ignore_ascii_case("darwin")
|| os_name.eq_ignore_ascii_case("mac")
{
HOST_OS_FLAG_MACOS
} else {
HOST_OS_FLAG_UNKNOWN
}
}
pub fn probe_hardware_os_flags() -> u32 {
run_hardware_probe().map(|s| s.os_flags).unwrap_or(0)
}
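/// Reports probe wiring: whether a handler is registered and whether a
/// snapshot is obtainable, either from the store or from a live probe run.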
pub fn probe_hardware_probe_flags() -> u32 {
let mut flags = 0u32;
if HARDWARE_PROBE_HANDLER.load(Ordering::SeqCst) != 0 {
flags |= HARDWARE_PROBE_FLAG_HANDLER_REGISTERED;
}
if SNAPSHOT_STORE.valid.load(Ordering::SeqCst) != 0 {
flags |= HARDWARE_PROBE_FLAG_SNAPSHOT_AVAILABLE;
}
if run_hardware_probe().is_some() {
flags |= HARDWARE_PROBE_FLAG_SNAPSHOT_AVAILABLE;
}
flags
}
pub fn set_hardware_snapshot(snapshot: HardwareDetectionSnapshot) -> bool {
let Some(normalized) = normalize_hardware_snapshot(snapshot) else {
return false;
};
store_snapshot(normalized);
let _ = sync_detected_hardware_profile_to_contract();
let _ = sync_backend_detected_profile(get_compute_backend());
true
}
pub fn clear_hardware_snapshot() {
SNAPSHOT_STORE.valid.store(0, Ordering::SeqCst);
    set_softbuffer_runtime_flags(SOFTBUFFER_ALL_STOPPED_MISSING);
}
pub fn probe_softbuffer_runtime_flags() -> u32 {
let _ = run_hardware_probe();
SOFTBUFFER_RUNTIME_FLAGS.load(Ordering::SeqCst) as u32
}
pub fn probe_hardware_available_flags() -> u32 {
run_hardware_probe().map(|s| s.hardware_flags).unwrap_or(0)
}
pub fn probe_cpu_kernel_spinlock_flags() -> u64 {
    probe_hardware_kernel_spinlock_flags(ComputeBackend::Cpu)
}
pub fn probe_hardware_kernel_spinlock_flags(backend: ComputeBackend) -> u64 {
let Some(snapshot) = run_hardware_probe() else {
return 0;
};
match backend {
ComputeBackend::Cpu => snapshot.cpu_kernel_spinlock_flags,
ComputeBackend::Gpu => snapshot.gpu.kernel_spinlock_flags,
ComputeBackend::Tpu => snapshot.tpu.kernel_spinlock_flags,
ComputeBackend::Lpu => snapshot.lpu.kernel_spinlock_flags,
}
}
pub fn probe_cpu_timing_abstraction() -> Option<BackendTimingAbstraction> {
    probe_hardware_timing_abstraction(ComputeBackend::Cpu)
}
pub fn probe_hardware_timing_abstraction(
backend: ComputeBackend,
) -> Option<BackendTimingAbstraction> {
let snapshot = run_hardware_probe()?;
match backend {
ComputeBackend::Cpu => Some(BackendTimingAbstraction {
clock_hz: snapshot.cpu_clock_hz,
cycle_counter_hz: snapshot.cpu_cycle_counter_hz,
nominal_cycles_per_kernel: snapshot.cpu_nominal_cycles_per_kernel,
observed_cycles_per_kernel: snapshot.cpu_observed_cycles_per_kernel,
timing_flags: snapshot.cpu_timing_flags,
}),
ComputeBackend::Gpu => Some(BackendTimingAbstraction {
clock_hz: snapshot.gpu.clock_hz,
cycle_counter_hz: snapshot.gpu.cycle_counter_hz,
nominal_cycles_per_kernel: snapshot.gpu.nominal_cycles_per_kernel,
observed_cycles_per_kernel: snapshot.gpu.observed_cycles_per_kernel,
timing_flags: snapshot.gpu.timing_flags,
}),
ComputeBackend::Tpu => Some(BackendTimingAbstraction {
clock_hz: snapshot.tpu.clock_hz,
cycle_counter_hz: snapshot.tpu.cycle_counter_hz,
nominal_cycles_per_kernel: snapshot.tpu.nominal_cycles_per_kernel,
observed_cycles_per_kernel: snapshot.tpu.observed_cycles_per_kernel,
timing_flags: snapshot.tpu.timing_flags,
}),
ComputeBackend::Lpu => Some(BackendTimingAbstraction {
clock_hz: snapshot.lpu.clock_hz,
cycle_counter_hz: snapshot.lpu.cycle_counter_hz,
nominal_cycles_per_kernel: snapshot.lpu.nominal_cycles_per_kernel,
observed_cycles_per_kernel: snapshot.lpu.observed_cycles_per_kernel,
timing_flags: snapshot.lpu.timing_flags,
}),
}
}
#[inline]
fn saturating_u128_to_usize(value: u128) -> usize {
if value > usize::MAX as u128 {
usize::MAX
} else {
value as usize
}
}
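// Removes `headroom_ppm` parts-per-million (capped at 90%) from a value;
// e.g. the default 50_000 ppm keeps 95% of the input.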
#[inline]
fn apply_headroom(value: usize, headroom_ppm: usize) -> usize {
let ppm = headroom_ppm.min(900_000);
let kept_ppm = 1_000_000usize.saturating_sub(ppm) as u128;
((value as u128).saturating_mul(kept_ppm) / 1_000_000u128) as usize
}
pub fn set_hardware_headroom_ppm(headroom_ppm: u32) {
HARDWARE_HEADROOM_PPM.store((headroom_ppm as usize).min(900_000), Ordering::SeqCst);
}
pub fn get_hardware_headroom_ppm() -> u32 {
HARDWARE_HEADROOM_PPM.load(Ordering::SeqCst) as u32
}
pub fn register_hardware_probe(handler: HardwareProbeFn, user_ctx: usize) {
HARDWARE_PROBE_USER_CTX.store(user_ctx, Ordering::SeqCst);
HARDWARE_PROBE_HANDLER.store(handler as usize, Ordering::SeqCst);
let _ = sync_detected_hardware_profile_to_contract();
let _ = sync_backend_detected_profile(get_compute_backend());
}
pub fn clear_hardware_probe() {
HARDWARE_PROBE_HANDLER.store(0, Ordering::SeqCst);
HARDWARE_PROBE_USER_CTX.store(0, Ordering::SeqCst);
}
pub fn probe_hardware_snapshot() -> Option<HardwareDetectionSnapshot> {
run_hardware_probe()
}
fn backend_present_from_detected_hardware(backend: ComputeBackend) -> bool {
if backend == ComputeBackend::Cpu {
return true;
}
let Some(snapshot) = run_hardware_probe() else {
return false;
};
let Some(component) = snapshot_component(&snapshot, backend) else {
return false;
};
component_softbuffer_ready(component)
}
#[inline]
fn component_buffers_ready(component: HardwareComponentSnapshot) -> bool {
component.present != 0
&& component.device_count > 0
&& component.total_memory_bytes > 0
&& component.available_memory_bytes > 0
&& component.available_memory_bytes <= component.total_memory_bytes
}
#[inline]
fn component_compute_ready(component: HardwareComponentSnapshot) -> bool {
component.present != 0 && component.device_count > 0
}
fn backend_compute_present_from_detected_hardware(backend: ComputeBackend) -> bool {
if backend == ComputeBackend::Cpu {
return false;
}
let Some(snapshot) = run_hardware_probe() else {
return false;
};
let Some(component) = snapshot_component(&snapshot, backend) else {
return false;
};
component_compute_ready(component)
}
#[inline]
fn progressive_validate_staged_size(current_bytes: u64) -> u64 {
if current_bytes <= 1 {
current_bytes
} else {
current_bytes / 2
}
}
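/// Staging pipeline for one memory budget: stage the discovered available
/// bytes, progressively halve up to four times, then subtract a
/// soft-overhead reserve. E.g. 1 GiB available with the default 50_000 ppm
/// overhead validates down to 64 MiB and finalizes at roughly 60.8 MiB.
/// Budgets without kernel spinlock coverage get a stopped profile with a
/// zero final buffer.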
fn stopped_buffer_staging_profile(
    discovered_total: u64,
    discovered_available: u64,
) -> BufferStagingProfile {
    BufferStagingProfile {
        discovered_total_bytes: discovered_total,
        discovered_available_bytes: discovered_available,
        staged_candidate_bytes: discovered_available,
        progressive_validated_bytes: discovered_available,
        soft_overhead_bytes: discovered_available,
        final_buffer_bytes: 0,
        sequence_count: 0,
        stage_flags: DISCOVERY_STAGE_STAGING_ACTIVE
            | DISCOVERY_STAGE_SOFTBUFFER_STOPPED
            | DISCOVERY_STAGE_FINALIZED,
    }
}
// Shared tail of the staging pipeline; callers only reach this path with
// kernel spinlock coverage present.
fn finish_buffer_staging_profile(
    discovered_total: u64,
    discovered_available: u64,
    soft_overhead_ppm: u32,
) -> BufferStagingProfile {
    let staged_candidate = discovered_available;
    let mut stage_flags = DISCOVERY_STAGE_STAGING_ACTIVE | DISCOVERY_STAGE_SPINLOCK_GUARDED;
    let mut progressive_validated = staged_candidate;
    let mut sequence_count = 0u32;
    while sequence_count < 4 && progressive_validated > 1 {
        progressive_validated = progressive_validate_staged_size(progressive_validated);
        sequence_count = sequence_count.saturating_add(1);
    }
    if sequence_count > 0 {
        stage_flags |= DISCOVERY_STAGE_PROGRESSIVE_VALIDATED;
    }
    let ppm = soft_overhead_ppm.min(900_000) as u64;
    let soft_overhead_bytes = progressive_validated
        .saturating_mul(ppm)
        .saturating_div(1_000_000);
    let final_buffer_bytes = progressive_validated.saturating_sub(soft_overhead_bytes);
    stage_flags |= DISCOVERY_STAGE_SOFT_OVERHEAD_APPLIED | DISCOVERY_STAGE_FINALIZED;
    BufferStagingProfile {
        discovered_total_bytes: discovered_total,
        discovered_available_bytes: discovered_available,
        staged_candidate_bytes: staged_candidate,
        progressive_validated_bytes: progressive_validated,
        soft_overhead_bytes,
        final_buffer_bytes,
        sequence_count,
        stage_flags,
    }
}
fn build_buffer_staging_profile(
    component: HardwareComponentSnapshot,
    soft_overhead_ppm: u32,
) -> Option<BufferStagingProfile> {
    if !component_buffers_ready(component) {
        return None;
    }
    let discovered_total = component.total_memory_bytes;
    let discovered_available = component.available_memory_bytes.min(discovered_total);
    if component.kernel_spinlock_flags == 0 {
        return Some(stopped_buffer_staging_profile(
            discovered_total,
            discovered_available,
        ));
    }
    Some(finish_buffer_staging_profile(
        discovered_total,
        discovered_available,
        soft_overhead_ppm,
    ))
}
// CPU variant of the staging build, driven by the system-wide memory fields
// and the CPU spinlock flags.
fn build_cpu_buffer_staging_profile(
    snapshot: &HardwareDetectionSnapshot,
    soft_overhead_ppm: u32,
) -> Option<BufferStagingProfile> {
    let discovered_total = snapshot.system_total_memory_bytes;
    let discovered_available = snapshot.system_available_memory_bytes.min(discovered_total);
    if discovered_total == 0 || discovered_available == 0 {
        return None;
    }
    if snapshot.cpu_kernel_spinlock_flags == 0 {
        return Some(stopped_buffer_staging_profile(
            discovered_total,
            discovered_available,
        ));
    }
    Some(finish_buffer_staging_profile(
        discovered_total,
        discovered_available,
        soft_overhead_ppm,
    ))
}
pub fn probe_hardware_buffer_staging_profile(
backend: ComputeBackend,
) -> Option<BufferStagingProfile> {
let snapshot = run_hardware_probe()?;
match backend {
ComputeBackend::Cpu => {
build_cpu_buffer_staging_profile(&snapshot, get_hardware_headroom_ppm())
}
ComputeBackend::Gpu | ComputeBackend::Tpu | ComputeBackend::Lpu => {
let component = snapshot_component(&snapshot, backend)?;
build_buffer_staging_profile(component, get_hardware_headroom_ppm())
}
}
}
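/// Shrink-and-retry buffer negotiation: each attempt re-probes the hardware,
/// requires the backend's quantization spinlock lane, and halves the
/// candidate on any failure, approving at most the staged
/// `final_buffer_bytes`. A sketch of a call site (sizes are illustrative):
///
/// ```ignore
/// // Ask for 256 MiB on the GPU, allowing up to 8 shrink attempts.
/// if let Some(bytes) =
///     request_buffer_with_hardware_approval(ComputeBackend::Gpu, 256 << 20, 8)
/// {
///     // `bytes` is hardware-approved and no larger than the request.
/// }
/// ```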
pub fn request_buffer_with_hardware_approval(
backend: ComputeBackend,
desired_bytes: usize,
max_attempts: u32,
) -> Option<usize> {
if desired_bytes == 0 {
return None;
}
let mut attempt = 0u32;
let mut candidate = desired_bytes as u64;
while attempt < max_attempts && candidate > 0 {
attempt = attempt.saturating_add(1);
let snapshot = match run_hardware_probe() {
Some(s) => s,
None => {
                set_softbuffer_runtime_flags(SOFTBUFFER_ALL_STOPPED_MISSING);
for _ in 0..256 {
spin_loop();
}
candidate /= 2;
continue;
}
};
if backend != ComputeBackend::Cpu && !backend_present_from_detected_hardware(backend) {
candidate /= 2;
continue;
}
let staging_opt = match backend {
ComputeBackend::Cpu => {
build_cpu_buffer_staging_profile(&snapshot, get_hardware_headroom_ppm())
}
ComputeBackend::Gpu | ComputeBackend::Tpu | ComputeBackend::Lpu => {
let comp = snapshot_component(&snapshot, backend)?;
build_buffer_staging_profile(comp, get_hardware_headroom_ppm())
}
};
if let Some(staging) = staging_opt {
let quant_present = match backend {
ComputeBackend::Cpu => {
snapshot.cpu_kernel_spinlock_flags & KERNEL_SPINLOCK_FLAG_QUANTIZATION != 0
}
ComputeBackend::Gpu => {
snapshot.gpu.kernel_spinlock_flags & KERNEL_SPINLOCK_FLAG_QUANTIZATION != 0
}
ComputeBackend::Tpu => {
snapshot.tpu.kernel_spinlock_flags & KERNEL_SPINLOCK_FLAG_QUANTIZATION != 0
}
ComputeBackend::Lpu => {
snapshot.lpu.kernel_spinlock_flags & KERNEL_SPINLOCK_FLAG_QUANTIZATION != 0
}
};
if !quant_present {
candidate = progressive_validate_staged_size(candidate);
continue;
}
if staging.final_buffer_bytes > 0 {
let approved = (candidate as usize).min(staging.final_buffer_bytes as usize);
return Some(approved);
} else {
candidate /= 2;
continue;
}
} else {
candidate /= 2;
continue;
}
}
None
}
/// Shared discovery path for the `_auto` memory getters below.
fn backend_detected_staging_profile(backend: ComputeBackend) -> Option<BufferStagingProfile> {
    let snapshot = run_hardware_probe()?;
    match backend {
        ComputeBackend::Cpu => {
            build_cpu_buffer_staging_profile(&snapshot, get_hardware_headroom_ppm())
        }
        ComputeBackend::Gpu | ComputeBackend::Tpu | ComputeBackend::Lpu => {
            let component = snapshot_component(&snapshot, backend)?;
            build_buffer_staging_profile(component, get_hardware_headroom_ppm())
        }
    }
}
fn backend_detected_total_memory_auto(backend: ComputeBackend) -> usize {
    backend_detected_staging_profile(backend)
        .map(|staging| staging.discovered_total_bytes as usize)
        .unwrap_or(0)
}
fn backend_detected_available_memory_auto(backend: ComputeBackend) -> usize {
    backend_detected_staging_profile(backend)
        .map(|staging| staging.final_buffer_bytes as usize)
        .unwrap_or(0)
}
fn backend_detected_sustained_flops_auto(backend: ComputeBackend) -> usize {
let Some(snapshot) = run_hardware_probe() else {
return 0;
};
match backend {
ComputeBackend::Cpu => 0,
ComputeBackend::Gpu | ComputeBackend::Tpu | ComputeBackend::Lpu => {
snapshot_component(&snapshot, backend)
.map(|x| x.sustained_flops_per_second as usize)
.unwrap_or(0)
}
}
}
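// Resolution order for detected figures: an explicit override wins, then the
// auto-discovered value; sustained FLOPS additionally get the global
// headroom haircut applied.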
fn backend_detected_sustained_flops_per_second(backend: ComputeBackend) -> usize {
let raw = match backend {
ComputeBackend::Cpu => 0,
ComputeBackend::Gpu => GPU_SUSTAINED_FLOPS_PER_SECOND.load(Ordering::SeqCst),
ComputeBackend::Tpu => TPU_SUSTAINED_FLOPS_PER_SECOND.load(Ordering::SeqCst),
ComputeBackend::Lpu => LPU_SUSTAINED_FLOPS_PER_SECOND.load(Ordering::SeqCst),
};
let base = if raw == 0 {
backend_detected_sustained_flops_auto(backend)
} else {
raw
};
apply_headroom(base, HARDWARE_HEADROOM_PPM.load(Ordering::SeqCst))
}
fn backend_detected_total_memory_bytes(backend: ComputeBackend) -> usize {
let raw = match backend {
ComputeBackend::Cpu => 0,
ComputeBackend::Gpu => GPU_TOTAL_MEMORY_BYTES.load(Ordering::SeqCst),
ComputeBackend::Tpu => TPU_TOTAL_MEMORY_BYTES.load(Ordering::SeqCst),
ComputeBackend::Lpu => LPU_TOTAL_MEMORY_BYTES.load(Ordering::SeqCst),
};
if raw == 0 {
backend_detected_total_memory_auto(backend)
} else {
raw
}
}
fn backend_detected_available_memory_bytes(backend: ComputeBackend) -> usize {
let raw = match backend {
ComputeBackend::Cpu => 0,
ComputeBackend::Gpu => GPU_AVAILABLE_MEMORY_BYTES.load(Ordering::SeqCst),
ComputeBackend::Tpu => TPU_AVAILABLE_MEMORY_BYTES.load(Ordering::SeqCst),
ComputeBackend::Lpu => LPU_AVAILABLE_MEMORY_BYTES.load(Ordering::SeqCst),
};
let total = backend_detected_total_memory_bytes(backend);
if raw == 0 {
backend_detected_available_memory_auto(backend).min(total)
} else {
raw.min(total)
}
}
fn clear_backend_detected_profile(backend: ComputeBackend) {
set_backend_detected_sustained_flops_per_second(backend, 0);
set_backend_detected_memory(backend, 0, 0);
}
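/// Switches the active backend and immediately re-syncs its detected profile
/// from the current hardware snapshot.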
pub fn set_compute_backend(backend: ComputeBackend) {
BACKEND.store(backend as u8, Ordering::SeqCst);
sync_backend_detected_profile(backend);
}
pub fn get_compute_backend() -> ComputeBackend {
match BACKEND.load(Ordering::SeqCst) {
1 => ComputeBackend::Gpu,
2 => ComputeBackend::Tpu,
3 => ComputeBackend::Lpu,
_ => ComputeBackend::Cpu,
}
}
pub fn set_hardware_contract_strict(strict: bool) {
HARDWARE_CONTRACT_STRICT.store(if strict { 1 } else { 0 }, Ordering::SeqCst);
}
pub fn is_hardware_contract_strict() -> bool {
HARDWARE_CONTRACT_STRICT.load(Ordering::SeqCst) != 0
}
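/// Stores the runtime contract flags; unknown bits outside
/// `CONTRACT_KNOWN_FLAGS_MASK` are silently dropped.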
pub fn set_contract_runtime_flags(flags: u32) {
CONTRACT_RUNTIME_FLAGS.store(
(flags & CONTRACT_KNOWN_FLAGS_MASK) as usize,
Ordering::SeqCst,
);
}
pub fn get_contract_runtime_flags() -> u32 {
CONTRACT_RUNTIME_FLAGS.load(Ordering::SeqCst) as u32
}
pub fn set_contract_min_sustained_flops_per_second(min_flops_per_second: u128) {
CONTRACT_MIN_SUSTAINED_FLOPS_PER_SECOND.store(
saturating_u128_to_usize(min_flops_per_second),
Ordering::SeqCst,
);
}
pub fn get_contract_min_sustained_flops_per_second() -> u128 {
CONTRACT_MIN_SUSTAINED_FLOPS_PER_SECOND.load(Ordering::SeqCst) as u128
}
pub fn set_contract_min_available_memory_bytes(min_available_memory_bytes: u128) {
CONTRACT_MIN_AVAILABLE_MEMORY_BYTES.store(
saturating_u128_to_usize(min_available_memory_bytes),
Ordering::SeqCst,
);
}
pub fn get_contract_min_available_memory_bytes() -> u128 {
CONTRACT_MIN_AVAILABLE_MEMORY_BYTES.load(Ordering::SeqCst) as u128
}
pub fn set_backend_detected_sustained_flops_per_second(
backend: ComputeBackend,
sustained_flops_per_second: u128,
) {
let value = saturating_u128_to_usize(sustained_flops_per_second);
match backend {
ComputeBackend::Cpu => {}
ComputeBackend::Gpu => GPU_SUSTAINED_FLOPS_PER_SECOND.store(value, Ordering::SeqCst),
ComputeBackend::Tpu => TPU_SUSTAINED_FLOPS_PER_SECOND.store(value, Ordering::SeqCst),
ComputeBackend::Lpu => LPU_SUSTAINED_FLOPS_PER_SECOND.store(value, Ordering::SeqCst),
}
}
pub fn set_backend_detected_memory(
backend: ComputeBackend,
total_memory_bytes: u128,
available_memory_bytes: u128,
) {
let total = saturating_u128_to_usize(total_memory_bytes);
let available = saturating_u128_to_usize(available_memory_bytes).min(total);
match backend {
ComputeBackend::Cpu => {}
ComputeBackend::Gpu => {
GPU_TOTAL_MEMORY_BYTES.store(total, Ordering::SeqCst);
GPU_AVAILABLE_MEMORY_BYTES.store(available, Ordering::SeqCst);
}
ComputeBackend::Tpu => {
TPU_TOTAL_MEMORY_BYTES.store(total, Ordering::SeqCst);
TPU_AVAILABLE_MEMORY_BYTES.store(available, Ordering::SeqCst);
}
ComputeBackend::Lpu => {
LPU_TOTAL_MEMORY_BYTES.store(total, Ordering::SeqCst);
LPU_AVAILABLE_MEMORY_BYTES.store(available, Ordering::SeqCst);
}
}
}
/// Shared per-component body for the two sync entry points below. Returns
/// `None` when a staging profile could not be built (hard failure) and
/// `Some(compute_ready)` otherwise.
fn sync_component_profile(
    backend: ComputeBackend,
    component: HardwareComponentSnapshot,
    headroom_ppm: u32,
) -> Option<bool> {
    if !component_compute_ready(component) {
        clear_backend_detected_profile(backend);
        return Some(false);
    }
    set_backend_detected_sustained_flops_per_second(
        backend,
        component.sustained_flops_per_second as u128,
    );
    if !component_buffers_ready(component) {
        set_backend_detected_memory(backend, 0, 0);
        return Some(true);
    }
    let staging = build_buffer_staging_profile(component, headroom_ppm)?;
    set_backend_detected_memory(
        backend,
        staging.discovered_total_bytes as u128,
        staging.final_buffer_bytes as u128,
    );
    Some(true)
}
/// Pushes every accelerator's detected profile (sustained FLOPS plus the
/// staged memory budget) into the contract state. An absent component merely
/// clears its profile; only a staging failure aborts the sync.
pub fn sync_detected_hardware_profile_to_contract() -> bool {
    let Some(snapshot) = run_hardware_probe() else {
        return false;
    };
    let headroom_ppm = get_hardware_headroom_ppm();
    sync_component_profile(ComputeBackend::Gpu, snapshot.gpu, headroom_ppm).is_some()
        && sync_component_profile(ComputeBackend::Tpu, snapshot.tpu, headroom_ppm).is_some()
        && sync_component_profile(ComputeBackend::Lpu, snapshot.lpu, headroom_ppm).is_some()
}
/// Re-syncs a single backend's detected profile; unlike the bulk sync above,
/// an absent component here reports `false`.
pub fn sync_backend_detected_profile(backend: ComputeBackend) -> bool {
    let Some(snapshot) = run_hardware_probe() else {
        return false;
    };
    let headroom_ppm = get_hardware_headroom_ppm();
    match backend {
        ComputeBackend::Cpu => true,
        ComputeBackend::Gpu => {
            sync_component_profile(backend, snapshot.gpu, headroom_ppm) == Some(true)
        }
        ComputeBackend::Tpu => {
            sync_component_profile(backend, snapshot.tpu, headroom_ppm) == Some(true)
        }
        ComputeBackend::Lpu => {
            sync_component_profile(backend, snapshot.lpu, headroom_ppm) == Some(true)
        }
    }
}
pub fn backend_detected_flop_abstraction(backend: ComputeBackend) -> Option<FlopAbstraction> {
match backend {
ComputeBackend::Cpu => None,
ComputeBackend::Gpu | ComputeBackend::Tpu | ComputeBackend::Lpu => {
if !backend_compute_present_from_detected_hardware(backend) {
return None;
}
Some(build_flop_abstraction(
backend_detected_sustained_flops_per_second(backend) as u128,
))
}
}
}
pub fn active_backend_detected_flop_abstraction() -> Option<FlopAbstraction> {
backend_detected_flop_abstraction(get_compute_backend())
}
pub fn backend_detected_ram_abstraction(backend: ComputeBackend) -> Option<RamAbstraction> {
match backend {
ComputeBackend::Cpu => None,
ComputeBackend::Gpu | ComputeBackend::Tpu | ComputeBackend::Lpu => {
if !backend_present_from_detected_hardware(backend) {
return None;
}
let total = backend_detected_total_memory_bytes(backend) as u128;
let available = backend_detected_available_memory_bytes(backend) as u128;
Some(build_ram_abstraction(total, available))
}
}
}
pub fn active_backend_detected_ram_abstraction() -> Option<RamAbstraction> {
backend_detected_ram_abstraction(get_compute_backend())
}
pub fn backend_supported_request_flags(backend: ComputeBackend) -> u32 {
match backend {
ComputeBackend::Cpu => 0,
ComputeBackend::Gpu | ComputeBackend::Tpu | ComputeBackend::Lpu => {
CONTRACT_KNOWN_FLAGS_MASK
}
}
}
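// Reject-handler registry. The user context is stored before the handler
// pointer so a concurrent dispatch never observes a handler with a stale
// context.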
pub fn register_contract_reject_handler(handler: ContractRejectHandler, user_ctx: usize) {
CONTRACT_REJECT_USER_CTX.store(user_ctx, Ordering::SeqCst);
CONTRACT_REJECT_HANDLER.store(handler as usize, Ordering::SeqCst);
}
pub fn clear_contract_reject_handler() {
CONTRACT_REJECT_HANDLER.store(0, Ordering::SeqCst);
CONTRACT_REJECT_USER_CTX.store(0, Ordering::SeqCst);
}
pub fn clear_last_contract_reject() {
LAST_CONTRACT_REJECT_REASON.store(0, Ordering::SeqCst);
LAST_CONTRACT_REJECT_BACKEND.store(ComputeBackend::Cpu as u8, Ordering::SeqCst);
LAST_CONTRACT_REJECT_OPCODE.store(0, Ordering::SeqCst);
}
pub fn last_contract_reject_reason() -> Option<ContractRejectReason> {
let raw = LAST_CONTRACT_REJECT_REASON.load(Ordering::SeqCst) as u32;
contract_reject_reason_from_raw(raw)
}
pub fn last_contract_reject_backend() -> Option<ComputeBackend> {
let has_reason = LAST_CONTRACT_REJECT_REASON.load(Ordering::SeqCst) != 0;
if !has_reason {
return None;
}
backend_from_raw(LAST_CONTRACT_REJECT_BACKEND.load(Ordering::SeqCst))
}
pub fn last_contract_reject_opcode() -> Option<GpuOpCode> {
let has_reason = LAST_CONTRACT_REJECT_REASON.load(Ordering::SeqCst) != 0;
if !has_reason {
return None;
}
let raw = LAST_CONTRACT_REJECT_OPCODE.load(Ordering::SeqCst) as u32;
opcode_from_raw(raw)
}
pub fn set_gpu_contract_opcode_mask(opcode_mask: u32) {
GPU_OPCODE_MASK.store((opcode_mask & ALL_OPCODE_MASK) as usize, Ordering::SeqCst);
}
pub fn set_tpu_contract_opcode_mask(opcode_mask: u32) {
TPU_OPCODE_MASK.store((opcode_mask & ALL_OPCODE_MASK) as usize, Ordering::SeqCst);
}
pub fn set_lpu_contract_opcode_mask(opcode_mask: u32) {
LPU_OPCODE_MASK.store((opcode_mask & ALL_OPCODE_MASK) as usize, Ordering::SeqCst);
}
pub fn get_gpu_contract_opcode_mask() -> u32 {
GPU_OPCODE_MASK.load(Ordering::SeqCst) as u32
}
pub fn get_tpu_contract_opcode_mask() -> u32 {
TPU_OPCODE_MASK.load(Ordering::SeqCst) as u32
}
pub fn get_lpu_contract_opcode_mask() -> u32 {
LPU_OPCODE_MASK.load(Ordering::SeqCst) as u32
}
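/// Convenience registration: installs the opcode mask first, then the
/// command handler, so the handler never becomes visible with a stale mask.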
pub fn register_gpu_command_handler_with_capabilities(
handler: GpuCommandHandler,
user_ctx: usize,
opcode_mask: u32,
) {
set_gpu_contract_opcode_mask(opcode_mask);
register_gpu_command_handler(handler, user_ctx);
}
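/// Registers a GPU handler together with a detected-hardware profile in a
/// single call. A minimal usage sketch, assuming the handler signature
/// inferred from the call site in `dispatch_contract`; the handler body and
/// the profile numbers below are hypothetical:
///
/// ```ignore
/// fn noop_gpu_handler(_req: *const GpuContractRequest, _ctx: usize) -> u32 {
///     GpuDispatchStatus::Ok as u32
/// }
/// register_gpu_command_handler_with_hardware_profile(
///     noop_gpu_handler,
///     0,                 // user_ctx
///     ALL_OPCODE_MASK,   // advertise every opcode
///     1_000_000_000_000, // sustained FLOPS
///     8 << 30,           // total memory bytes
///     6 << 30,           // available memory bytes
/// );
/// ```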
pub fn register_gpu_command_handler_with_hardware_profile(
handler: GpuCommandHandler,
user_ctx: usize,
opcode_mask: u32,
sustained_flops_per_second: u128,
total_memory_bytes: u128,
available_memory_bytes: u128,
) {
set_backend_detected_sustained_flops_per_second(
ComputeBackend::Gpu,
sustained_flops_per_second,
);
set_backend_detected_memory(
ComputeBackend::Gpu,
total_memory_bytes,
available_memory_bytes,
);
register_gpu_command_handler_with_capabilities(handler, user_ctx, opcode_mask);
}
pub fn register_tpu_command_handler_with_capabilities(
handler: TpuCommandHandler,
user_ctx: usize,
opcode_mask: u32,
) {
set_tpu_contract_opcode_mask(opcode_mask);
register_tpu_command_handler(handler, user_ctx);
}
pub fn register_tpu_command_handler_with_hardware_profile(
handler: TpuCommandHandler,
user_ctx: usize,
opcode_mask: u32,
sustained_flops_per_second: u128,
total_memory_bytes: u128,
available_memory_bytes: u128,
) {
set_backend_detected_sustained_flops_per_second(
ComputeBackend::Tpu,
sustained_flops_per_second,
);
set_backend_detected_memory(
ComputeBackend::Tpu,
total_memory_bytes,
available_memory_bytes,
);
register_tpu_command_handler_with_capabilities(handler, user_ctx, opcode_mask);
}
pub fn register_lpu_command_handler_with_capabilities(
handler: LpuCommandHandler,
user_ctx: usize,
opcode_mask: u32,
) {
set_lpu_contract_opcode_mask(opcode_mask);
register_lpu_command_handler(handler, user_ctx);
}
pub fn register_lpu_command_handler_with_hardware_profile(
handler: LpuCommandHandler,
user_ctx: usize,
opcode_mask: u32,
sustained_flops_per_second: u128,
total_memory_bytes: u128,
available_memory_bytes: u128,
) {
set_backend_detected_sustained_flops_per_second(
ComputeBackend::Lpu,
sustained_flops_per_second,
);
set_backend_detected_memory(
ComputeBackend::Lpu,
total_memory_bytes,
available_memory_bytes,
);
register_lpu_command_handler_with_capabilities(handler, user_ctx, opcode_mask);
}
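/// Records a contract reject and, in strict mode, notifies the registered
/// reject handler. Always returns `false` so callers can hand the result
/// straight back as "not dispatched".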
fn contract_dispatch_reject(
reason: ContractRejectReason,
backend: ComputeBackend,
opcode: GpuOpCode,
) -> bool {
LAST_CONTRACT_REJECT_REASON.store(reason as usize, Ordering::SeqCst);
LAST_CONTRACT_REJECT_BACKEND.store(backend as u8, Ordering::SeqCst);
LAST_CONTRACT_REJECT_OPCODE.store(opcode as usize, Ordering::SeqCst);
if is_hardware_contract_strict() {
let handler_ptr = CONTRACT_REJECT_HANDLER.load(Ordering::SeqCst);
if let Some(handler) = decode_contract_reject_handler(handler_ptr) {
let user_ctx = CONTRACT_REJECT_USER_CTX.load(Ordering::SeqCst);
handler(reason, backend, opcode as u32, user_ctx);
}
}
false
}
pub fn register_gpu_kernel_f32(handler: GpuKernelF32) {
GPU_KERNEL_F32.store(handler as usize, Ordering::SeqCst);
}
pub fn register_gpu_kernel_f64(handler: GpuKernelF64) {
GPU_KERNEL_F64.store(handler as usize, Ordering::SeqCst);
}
pub fn clear_gpu_kernel_f32() {
GPU_KERNEL_F32.store(0, Ordering::SeqCst);
}
pub fn clear_gpu_kernel_f64() {
GPU_KERNEL_F64.store(0, Ordering::SeqCst);
}
pub fn register_gpu_command_handler(handler: GpuCommandHandler, user_ctx: usize) {
GPU_COMMAND_USER_CTX.store(user_ctx, Ordering::SeqCst);
GPU_COMMAND_HANDLER.store(handler as usize, Ordering::SeqCst);
}
pub fn clear_gpu_command_handler() {
GPU_COMMAND_HANDLER.store(0, Ordering::SeqCst);
GPU_COMMAND_USER_CTX.store(0, Ordering::SeqCst);
GPU_OPCODE_MASK.store(0, Ordering::SeqCst);
}
pub fn register_tpu_command_handler(handler: TpuCommandHandler, user_ctx: usize) {
TPU_COMMAND_USER_CTX.store(user_ctx, Ordering::SeqCst);
TPU_COMMAND_HANDLER.store(handler as usize, Ordering::SeqCst);
}
pub fn clear_tpu_command_handler() {
TPU_COMMAND_HANDLER.store(0, Ordering::SeqCst);
TPU_COMMAND_USER_CTX.store(0, Ordering::SeqCst);
TPU_OPCODE_MASK.store(0, Ordering::SeqCst);
}
pub fn register_lpu_command_handler(handler: LpuCommandHandler, user_ctx: usize) {
LPU_COMMAND_USER_CTX.store(user_ctx, Ordering::SeqCst);
LPU_COMMAND_HANDLER.store(handler as usize, Ordering::SeqCst);
}
pub fn clear_lpu_command_handler() {
LPU_COMMAND_HANDLER.store(0, Ordering::SeqCst);
LPU_COMMAND_USER_CTX.store(0, Ordering::SeqCst);
LPU_OPCODE_MASK.store(0, Ordering::SeqCst);
}
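/// 32-bit FNV-1a over the payload bytes (offset basis 0x811C9DC5, prime
/// 0x01000193). The caller must guarantee `payload_ptr` is valid for
/// `payload_len` reads; a zero length never dereferences the pointer.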
fn hash_payload(payload_ptr: *const u8, payload_len: usize) -> u32 {
let mut hash: u32 = 0x811C9DC5;
let mut i = 0usize;
while i < payload_len {
let b = unsafe { *payload_ptr.add(i) };
hash ^= b as u32;
hash = hash.wrapping_mul(0x01000193);
i += 1;
}
hash
}
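// A minimal sketch of a check for the FNV-1a implementation, assuming the
// crate's test profile links the standard test harness; the module name is
// arbitrary and 0xE40C292C is the published 32-bit FNV-1a vector for "a".
#[cfg(test)]
mod hash_payload_fnv1a_tests {
use super::hash_payload;
#[test]
fn matches_known_fnv1a_vectors() {
assert_eq!(hash_payload(core::ptr::null(), 0), 0x811C9DC5);
assert_eq!(hash_payload(b"a".as_ptr(), 1), 0xE40C292C);
}
}
/// Validates a request against a backend's probed contract without
/// dispatching it: magic and ABI version, request flags, capability sanity,
/// configured FLOPS and memory floors, opcode support, payload length,
/// pointer, and payload hash, in that order.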
pub fn validate_contract_request_for_backend(
backend: ComputeBackend,
request: &GpuContractRequest,
) -> GpuDispatchStatus {
let Some(caps) = probe_backend_contract(backend) else {
return GpuDispatchStatus::BadContract;
};
let expected_magic = match backend {
ComputeBackend::Gpu => GPU_CONTRACT_MAGIC,
ComputeBackend::Tpu => TPU_CONTRACT_MAGIC,
ComputeBackend::Lpu => LPU_CONTRACT_MAGIC,
ComputeBackend::Cpu => return GpuDispatchStatus::BadContract,
};
let expected_version = match contract_abi_version_for_backend(backend) {
Some(v) => v,
None => return GpuDispatchStatus::BadContract,
};
if request.header.magic != expected_magic || request.header.abi_version != expected_version {
return GpuDispatchStatus::BadContract;
}
let supported_flags = backend_supported_request_flags(backend);
if (request.header.flags & !supported_flags) != 0 {
return GpuDispatchStatus::BadContract;
}
if caps.total_memory_bytes == 0 || caps.available_memory_bytes > caps.total_memory_bytes {
return GpuDispatchStatus::BadContract;
}
let min_flops = CONTRACT_MIN_SUSTAINED_FLOPS_PER_SECOND.load(Ordering::SeqCst) as u64;
if min_flops > 0 && caps.sustained_flops_per_second < min_flops {
return GpuDispatchStatus::NotSupported;
}
let min_available_memory = CONTRACT_MIN_AVAILABLE_MEMORY_BYTES.load(Ordering::SeqCst) as u64;
if min_available_memory > 0 && caps.available_memory_bytes < min_available_memory {
return GpuDispatchStatus::NotSupported;
}
let Some(opcode) = opcode_from_raw(request.header.opcode) else {
return GpuDispatchStatus::BadContract;
};
if (caps.opcode_mask & opcode_bit(opcode)) == 0 {
return GpuDispatchStatus::NotSupported;
}
let expected_len = expected_payload_len(opcode);
if request.header.payload_len as usize != expected_len {
return GpuDispatchStatus::BadPayload;
}
let payload_len = request.header.payload_len as usize;
if payload_len > 0 && request.payload_ptr.is_null() {
return GpuDispatchStatus::BadPayload;
}
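// An empty payload hashes to the FNV-1a offset basis, matching what
// hash_payload returns for zero-length input.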
let actual_hash = if payload_len == 0 {
0x811C9DC5u32
} else {
hash_payload(request.payload_ptr, payload_len)
};
if request.header.payload_hash != actual_hash {
return GpuDispatchStatus::BadPayload;
}
GpuDispatchStatus::Ok
}
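/// Snapshot of the active backend's contract identity: (magic, ABI version,
/// raw handler pointer, user context). `None` when the CPU backend is
/// active, since it has no command contract.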
fn active_contract_info() -> Option<(u32, u16, usize, usize)> {
match get_compute_backend() {
ComputeBackend::Gpu => Some((
GPU_CONTRACT_MAGIC,
GPU_CONTRACT_ABI_VERSION,
GPU_COMMAND_HANDLER.load(Ordering::SeqCst),
GPU_COMMAND_USER_CTX.load(Ordering::SeqCst),
)),
ComputeBackend::Tpu => Some((
TPU_CONTRACT_MAGIC,
TPU_CONTRACT_ABI_VERSION,
TPU_COMMAND_HANDLER.load(Ordering::SeqCst),
TPU_COMMAND_USER_CTX.load(Ordering::SeqCst),
)),
ComputeBackend::Lpu => Some((
LPU_CONTRACT_MAGIC,
LPU_CONTRACT_ABI_VERSION,
LPU_COMMAND_HANDLER.load(Ordering::SeqCst),
LPU_COMMAND_USER_CTX.load(Ordering::SeqCst),
)),
ComputeBackend::Cpu => None,
}
}
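/// Builds a contract request for the active backend and hands it to the
/// registered handler. Every failure path funnels through
/// `contract_dispatch_reject`, which records the reason and returns `false`;
/// `true` means the handler reported `GpuDispatchStatus::Ok`.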
fn dispatch_contract(opcode: GpuOpCode, payload_ptr: *const u8, payload_len: usize) -> bool {
let backend = get_compute_backend();
let Some(caps) = probe_backend_contract(backend) else {
return contract_dispatch_reject(
ContractRejectReason::NoActiveBackendContract,
backend,
opcode,
);
};
if (caps.opcode_mask & opcode_bit(opcode)) == 0 {
return contract_dispatch_reject(ContractRejectReason::OpcodeNotSupported, backend, opcode);
}
if payload_len != expected_payload_len(opcode) {
return contract_dispatch_reject(
ContractRejectReason::PayloadSizeMismatch,
backend,
opcode,
);
}
if payload_len > 0 && payload_ptr.is_null() {
return contract_dispatch_reject(ContractRejectReason::NullPayloadPointer, backend, opcode);
}
let runtime_flags = get_contract_runtime_flags();
let supported_flags = backend_supported_request_flags(backend);
if (runtime_flags & !supported_flags) != 0 {
return contract_dispatch_reject(
ContractRejectReason::UnsupportedRequestFlags,
backend,
opcode,
);
}
let min_flops = CONTRACT_MIN_SUSTAINED_FLOPS_PER_SECOND.load(Ordering::SeqCst) as u64;
if min_flops > 0 && caps.sustained_flops_per_second < min_flops {
return contract_dispatch_reject(
ContractRejectReason::InsufficientSustainedFlops,
backend,
opcode,
);
}
let min_available_memory = CONTRACT_MIN_AVAILABLE_MEMORY_BYTES.load(Ordering::SeqCst) as u64;
if min_available_memory > 0 && caps.available_memory_bytes < min_available_memory {
return contract_dispatch_reject(
ContractRejectReason::InsufficientAvailableMemory,
backend,
opcode,
);
}
let Some((magic, abi_version, handler_ptr, user_ctx)) = active_contract_info() else {
return contract_dispatch_reject(
ContractRejectReason::NoActiveContractInfo,
backend,
opcode,
);
};
let Some(handler) = decode_gpu_command_handler(handler_ptr) else {
return contract_dispatch_reject(
ContractRejectReason::NoRegisteredHandler,
backend,
opcode,
);
};
let header = GpuContractHeader {
magic,
abi_version,
reserved: 0,
opcode: opcode as u32,
payload_len: payload_len as u32,
payload_hash: hash_payload(payload_ptr, payload_len),
flags: runtime_flags,
};
let request = GpuContractRequest { header, payload_ptr };
let status = handler(&request as *const GpuContractRequest, user_ctx);
if status == GpuDispatchStatus::Ok as u32 {
true
} else {
contract_dispatch_reject(ContractRejectReason::HandlerReturnedNonOk, backend, opcode)
}
}
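/// Dense-kernel entry point: tries the contract path first, then, on the
/// GPU backend only, falls back to a directly registered raw f32 kernel.
/// `try_invoke_gpu_kernel_f64` below mirrors this for f64.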
pub(crate) fn try_invoke_gpu_kernel_f32(args: KernelInvokeF32<'_>) -> bool {
let KernelInvokeF32 {
src,
dst,
batch_size,
stride,
in_size,
out_size,
weights,
biases,
activation,
} = args;
let cmd = KernelCmdF32 {
src_ptr: src.as_ptr(),
src_len: src.len(),
dst_ptr: dst.as_mut_ptr(),
dst_len: dst.len(),
batch_size,
stride,
in_size,
out_size,
weights_ptr: weights.as_ptr(),
weights_len: weights.len(),
biases_ptr: biases.as_ptr(),
biases_len: biases.len(),
activation: activation.to_u8(),
};
if dispatch_contract(
GpuOpCode::F32,
(&cmd as *const KernelCmdF32).cast::<u8>(),
core::mem::size_of::<KernelCmdF32>(),
) {
return true;
}
if get_compute_backend() != ComputeBackend::Gpu {
return false;
}
let ptr = GPU_KERNEL_F32.load(Ordering::SeqCst);
let Some(handler) = decode_gpu_kernel_f32(ptr) else {
return false;
};
handler(
src, dst, batch_size, stride, in_size, out_size, weights, biases, activation,
)
}
pub(crate) fn try_invoke_gpu_kernel_f64(args: KernelInvokeF64<'_>) -> bool {
let KernelInvokeF64 {
src,
dst,
batch_size,
stride,
in_size,
out_size,
weights,
biases,
activation,
} = args;
let cmd = KernelCmdF64 {
src_ptr: src.as_ptr(),
src_len: src.len(),
dst_ptr: dst.as_mut_ptr(),
dst_len: dst.len(),
batch_size,
stride,
in_size,
out_size,
weights_ptr: weights.as_ptr(),
weights_len: weights.len(),
biases_ptr: biases.as_ptr(),
biases_len: biases.len(),
activation: activation.to_u8(),
};
if dispatch_contract(
GpuOpCode::F64,
(&cmd as *const KernelCmdF64).cast::<u8>(),
core::mem::size_of::<KernelCmdF64>(),
) {
return true;
}
if get_compute_backend() != ComputeBackend::Gpu {
return false;
}
let ptr = GPU_KERNEL_F64.load(Ordering::SeqCst);
let Some(handler) = decode_gpu_kernel_f64(ptr) else {
return false;
};
handler(
src, dst, batch_size, stride, in_size, out_size, weights, biases, activation,
)
}
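// The remaining wrappers marshal borrowed slices into #[repr(C)] command
// structs and route exclusively through dispatch_contract; unlike the dense
// kernels above, they have no legacy raw-kernel fallback.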
pub(crate) fn try_invoke_gpu_softmax_f32(logits: &[f32], out: &mut [f32]) -> bool {
let cmd = SoftmaxCmdF32 {
logits_ptr: logits.as_ptr(),
out_ptr: out.as_mut_ptr(),
len: logits.len(),
};
dispatch_contract(
GpuOpCode::SoftmaxF32,
(&cmd as *const SoftmaxCmdF32).cast::<u8>(),
core::mem::size_of::<SoftmaxCmdF32>(),
)
}
pub(crate) fn try_invoke_gpu_softmax_f64(logits: &[f64], out: &mut [f64]) -> bool {
let cmd = SoftmaxCmdF64 {
logits_ptr: logits.as_ptr(),
out_ptr: out.as_mut_ptr(),
len: logits.len(),
};
dispatch_contract(
GpuOpCode::SoftmaxF64,
(&cmd as *const SoftmaxCmdF64).cast::<u8>(),
core::mem::size_of::<SoftmaxCmdF64>(),
)
}
pub(crate) fn try_invoke_gpu_layer_norm_f32(
x: &mut [f32],
gamma: &[f32],
beta: &[f32],
eps: f32,
) -> bool {
let cmd = LayerNormCmdF32 {
x_ptr: x.as_mut_ptr(),
gamma_ptr: gamma.as_ptr(),
beta_ptr: beta.as_ptr(),
len: x.len(),
eps,
};
dispatch_contract(
GpuOpCode::LayerNormF32,
(&cmd as *const LayerNormCmdF32).cast::<u8>(),
core::mem::size_of::<LayerNormCmdF32>(),
)
}
pub(crate) fn try_invoke_gpu_layer_norm_f64(
x: &mut [f64],
gamma: &[f64],
beta: &[f64],
eps: f64,
) -> bool {
let cmd = LayerNormCmdF64 {
x_ptr: x.as_mut_ptr(),
gamma_ptr: gamma.as_ptr(),
beta_ptr: beta.as_ptr(),
len: x.len(),
eps,
};
dispatch_contract(
GpuOpCode::LayerNormF64,
(&cmd as *const LayerNormCmdF64).cast::<u8>(),
core::mem::size_of::<LayerNormCmdF64>(),
)
}
pub(crate) fn try_invoke_gpu_rms_norm_f32(x: &mut [f32], gamma: &[f32], eps: f32) -> bool {
let cmd = RmsNormCmdF32 {
x_ptr: x.as_mut_ptr(),
gamma_ptr: gamma.as_ptr(),
len: x.len(),
eps,
};
dispatch_contract(
GpuOpCode::RmsNormF32,
(&cmd as *const RmsNormCmdF32).cast::<u8>(),
core::mem::size_of::<RmsNormCmdF32>(),
)
}
pub(crate) fn try_invoke_gpu_rms_norm_f64(x: &mut [f64], gamma: &[f64], eps: f64) -> bool {
let cmd = RmsNormCmdF64 {
x_ptr: x.as_mut_ptr(),
gamma_ptr: gamma.as_ptr(),
len: x.len(),
eps,
};
dispatch_contract(
GpuOpCode::RmsNormF64,
(&cmd as *const RmsNormCmdF64).cast::<u8>(),
core::mem::size_of::<RmsNormCmdF64>(),
)
}
pub(crate) fn try_invoke_gpu_attention_f32(args: AttentionInvokeF32<'_>) -> bool {
let AttentionInvokeF32 {
q,
k,
v,
out,
scratch_scores,
q_len,
k_len,
d_k,
d_v,
mask,
} = args;
let cmd = AttentionCmdF32 {
q_ptr: q.as_ptr(),
q_len_total: q.len(),
k_ptr: k.as_ptr(),
k_len_total: k.len(),
v_ptr: v.as_ptr(),
v_len_total: v.len(),
out_ptr: out.as_mut_ptr(),
out_len_total: out.len(),
scratch_scores_ptr: scratch_scores.as_mut_ptr(),
scratch_scores_len: scratch_scores.len(),
q_len,
k_len,
d_k,
d_v,
mask,
};
dispatch_contract(
GpuOpCode::AttentionF32,
(&cmd as *const AttentionCmdF32).cast::<u8>(),
core::mem::size_of::<AttentionCmdF32>(),
)
}
pub(crate) fn try_invoke_gpu_attention_f64(args: AttentionInvokeF64<'_>) -> bool {
let AttentionInvokeF64 {
q,
k,
v,
out,
scratch_scores,
q_len,
k_len,
d_k,
d_v,
mask,
} = args;
let cmd = AttentionCmdF64 {
q_ptr: q.as_ptr(),
q_len_total: q.len(),
k_ptr: k.as_ptr(),
k_len_total: k.len(),
v_ptr: v.as_ptr(),
v_len_total: v.len(),
out_ptr: out.as_mut_ptr(),
out_len_total: out.len(),
scratch_scores_ptr: scratch_scores.as_mut_ptr(),
scratch_scores_len: scratch_scores.len(),
q_len,
k_len,
d_k,
d_v,
mask,
};
dispatch_contract(
GpuOpCode::AttentionF64,
(&cmd as *const AttentionCmdF64).cast::<u8>(),
core::mem::size_of::<AttentionCmdF64>(),
)
}
pub(crate) fn try_invoke_gpu_quantize_i8_f32(
input: &[f32],
output: &mut [i8],
scale_out: &mut f32,
) -> bool {
let cmd = QuantizeI8CmdF32 {
input_ptr: input.as_ptr(),
input_len: input.len(),
output_ptr: output.as_mut_ptr(),
output_len: output.len(),
scale_out_ptr: scale_out as *mut f32,
};
dispatch_contract(
GpuOpCode::QuantizeI8F32,
(&cmd as *const QuantizeI8CmdF32).cast::<u8>(),
core::mem::size_of::<QuantizeI8CmdF32>(),
)
}
pub(crate) fn try_invoke_gpu_quantize_i8_f64(
input: &[f64],
output: &mut [i8],
scale_out: &mut f64,
) -> bool {
let cmd = QuantizeI8CmdF64 {
input_ptr: input.as_ptr(),
input_len: input.len(),
output_ptr: output.as_mut_ptr(),
output_len: output.len(),
scale_out_ptr: scale_out as *mut f64,
};
dispatch_contract(
GpuOpCode::QuantizeI8F64,
(&cmd as *const QuantizeI8CmdF64).cast::<u8>(),
core::mem::size_of::<QuantizeI8CmdF64>(),
)
}
pub(crate) fn try_invoke_gpu_dequantize_i8_f32(
input: &[i8],
output: &mut [f32],
scale: f32,
) -> bool {
let cmd = DequantizeI8CmdF32 {
input_ptr: input.as_ptr(),
input_len: input.len(),
output_ptr: output.as_mut_ptr(),
output_len: output.len(),
scale,
};
dispatch_contract(
GpuOpCode::DequantizeI8F32,
(&cmd as *const DequantizeI8CmdF32).cast::<u8>(),
core::mem::size_of::<DequantizeI8CmdF32>(),
)
}
pub(crate) fn try_invoke_gpu_dequantize_i8_f64(
input: &[i8],
output: &mut [f64],
scale: f64,
) -> bool {
let cmd = DequantizeI8CmdF64 {
input_ptr: input.as_ptr(),
input_len: input.len(),
output_ptr: output.as_mut_ptr(),
output_len: output.len(),
scale,
};
dispatch_contract(
GpuOpCode::DequantizeI8F64,
(&cmd as *const DequantizeI8CmdF64).cast::<u8>(),
core::mem::size_of::<DequantizeI8CmdF64>(),
)
}
pub(crate) fn try_invoke_gpu_sgd_f32(
params: &mut [f32],
grads: &[f32],
velocity: &mut [f32],
learning_rate: f32,
momentum: f32,
nesterov: bool,
) -> bool {
let cmd = SgdCmdF32 {
params_ptr: params.as_mut_ptr(),
grads_ptr: grads.as_ptr(),
velocity_ptr: velocity.as_mut_ptr(),
len: params.len(),
learning_rate,
momentum,
nesterov: if nesterov { 1 } else { 0 },
};
dispatch_contract(
GpuOpCode::SgdF32,
(&cmd as *const SgdCmdF32).cast::<u8>(),
core::mem::size_of::<SgdCmdF32>(),
)
}
pub(crate) fn try_invoke_gpu_sgd_f64(
params: &mut [f64],
grads: &[f64],
velocity: &mut [f64],
learning_rate: f64,
momentum: f64,
nesterov: bool,
) -> bool {
let cmd = SgdCmdF64 {
params_ptr: params.as_mut_ptr(),
grads_ptr: grads.as_ptr(),
velocity_ptr: velocity.as_mut_ptr(),
len: params.len(),
learning_rate,
momentum,
nesterov: if nesterov { 1 } else { 0 },
};
dispatch_contract(
GpuOpCode::SgdF64,
(&cmd as *const SgdCmdF64).cast::<u8>(),
core::mem::size_of::<SgdCmdF64>(),
)
}
pub(crate) fn try_invoke_gpu_adamw_f32(args: AdamwInvokeF32<'_>) -> bool {
let AdamwInvokeF32 {
params,
grads,
m,
v,
learning_rate,
step,
beta1,
beta2,
eps,
weight_decay,
} = args;
let cmd = AdamwCmdF32 {
params_ptr: params.as_mut_ptr(),
grads_ptr: grads.as_ptr(),
m_ptr: m.as_mut_ptr(),
v_ptr: v.as_mut_ptr(),
len: params.len(),
learning_rate,
step,
beta1,
beta2,
eps,
weight_decay,
};
dispatch_contract(
GpuOpCode::AdamwF32,
(&cmd as *const AdamwCmdF32).cast::<u8>(),
core::mem::size_of::<AdamwCmdF32>(),
)
}
pub(crate) fn try_invoke_gpu_adamw_f64(args: AdamwInvokeF64<'_>) -> bool {
let AdamwInvokeF64 {
params,
grads,
m,
v,
learning_rate,
step,
beta1,
beta2,
eps,
weight_decay,
} = args;
let cmd = AdamwCmdF64 {
params_ptr: params.as_mut_ptr(),
grads_ptr: grads.as_ptr(),
m_ptr: m.as_mut_ptr(),
v_ptr: v.as_mut_ptr(),
len: params.len(),
learning_rate,
step,
beta1,
beta2,
eps,
weight_decay,
};
dispatch_contract(
GpuOpCode::AdamwF64,
(&cmd as *const AdamwCmdF64).cast::<u8>(),
core::mem::size_of::<AdamwCmdF64>(),
)
}