// tokitai-operator 0.1.0

//! Internal fp16/fp32 conversion helpers shared by the Model layer.
//!
//! The HIP kernels in `src/backend/hip_*.rs` all speak the same
//! binary16-via-u16 protocol at the fp16 <-> fp32 boundary, and the
//! conversion logic is shared with `src/model/layer.rs` and
//! `src/model/parameter.rs`. The conversions are pure-Rust and
//! round-trip safe.
//!
//! The conversion math lives in `crate::backend::f16_convert`
//! (bit-exact IEEE binary16 <-> binary32 round-to-nearest-even,
//! matching the `__half2float` / `__float2half_rn` semantics used by
//! every kernel). We re-export from there so there is exactly one
//! source of truth: duplicate hand-rolled copies in this file
//! historically had the subnormal shift bug (Task #71) and the
//! subnormal-boundary round-up bug (Task #75), both of which silently
//! rounded small training gradients to either zero or ±2.0.

pub use crate::backend::f16_convert::{f16_to_f32, f32_to_f16};

/// Convert every element of a `Tensor<f32>` to fp16 bit patterns.
pub fn tensor_to_fp16_bits(t: &crate::object::Tensor<f32>) -> Vec<u16> {
    t.data.iter().copied().map(f32_to_f16).collect()
}

/// Convert a flat `&[u16]` of fp16 bit patterns back into a
/// `Tensor<f32>` with the supplied shape.
pub fn fp16_bits_to_tensor(
    bits: &[u16],
    shape: crate::object::Shape,
    domain: crate::domain::DomainId,
) -> crate::object::Tensor<f32> {
    let data = bits.iter().copied().map(f16_to_f32).collect();
    crate::object::Tensor::dense_cpu(domain, shape, data)
}

/// Build a `Tensor<f32>` of the given `shape` with every element set
/// to `value`.
///
/// The element count is the product of the `Dim::Static` extents in
/// `shape.dims`. An empty `dims` list yields a single element. Any
/// dimension that is not `Dim::Static` contributes a factor of zero,
/// so the resulting tensor holds no data.
///
/// NOTE(review): a non-static dimension therefore produces an empty
/// data buffer paired with the original `shape` — confirm callers
/// expect that rather than an error.
pub fn tensor_full(
    shape: crate::object::Shape,
    value: f32,
    domain: crate::domain::DomainId,
) -> crate::object::Tensor<f32> {
    let mut element_count: usize = 1;
    for dim in shape.dims.iter() {
        let extent = if let crate::object::Dim::Static(size) = dim {
            *size
        } else {
            // Extent unknown at this point: treat as zero elements.
            0
        };
        element_count *= extent;
    }
    let filled = vec![value; element_count];
    crate::object::Tensor::dense_cpu(domain, shape, filled)
}

/// Elementwise add: `out[i] = a[i] + b[i]`. Shapes must match
/// exactly; broadcast is not supported.
pub fn tensor_add(
    a: &crate::object::Tensor<f32>,
    b: &crate::object::Tensor<f32>,
) -> Result<crate::object::Tensor<f32>, crate::Error> {
    if a.data.len() != b.data.len() {
        return Err(crate::Error::shape(format!(
            "tensor_add length mismatch: {} vs {}",
            a.data.len(),
            b.data.len()
        )));
    }
    let data: Vec<f32> = a
        .data
        .iter()
        .zip(b.data.iter())
        .map(|(x, y)| x + y)
        .collect();
    Ok(crate::object::Tensor::dense_cpu(
        a.meta.domain.clone(),
        a.meta.shape.clone(),
        data,
    ))
}