// ferrotorch-cubecl 0.5.7

//! Unified runtime selection for CubeCL backends.
//!
//! [`CubeDevice`] enumerates the three supported backends (CUDA, ROCm, WGPU),
//! each parameterised by a device ordinal. [`CubeRuntime`] resolves that
//! selection into a real CubeCL [`ComputeClient`] — one per backend — which
//! owns the on-device memory and the compiled-kernel cache for that device.

use std::fmt;

#[cfg(any(feature = "wgpu", feature = "cuda", feature = "rocm"))]
use cubecl::Runtime;
#[cfg(any(feature = "wgpu", feature = "cuda", feature = "rocm"))]
use cubecl::prelude::ComputeClient;
use ferrotorch_core::FerrotorchResult;

#[cfg(feature = "cuda")]
use cubecl_cuda::{CudaDevice, CudaRuntime};
#[cfg(feature = "rocm")]
use cubecl_hip::{AmdDevice, HipRuntime};
#[cfg(feature = "wgpu")]
use cubecl_wgpu::{WgpuDevice, WgpuRuntime};

// ---------------------------------------------------------------------------
// CubeDevice
// ---------------------------------------------------------------------------

/// A device selector for CubeCL backends.
///
/// Each variant names one backend and carries a `usize` device ordinal
/// (e.g., GPU index 0, 1, ...). The value is a plain `Copy` selector: it
/// identifies a device but holds no runtime state of its own.
///
/// Its `Display` form is `<backend>:<ordinal>`, for example `cuda:0`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum CubeDevice {
    /// NVIDIA GPU via CUDA PTX codegen.
    Cuda(usize),
    /// Portable GPU via WGPU — AMD (Vulkan), Intel (Vulkan), Apple (Metal).
    Wgpu(usize),
    /// AMD GPU via native HIP/ROCm runtime.
    Rocm(usize),
}

impl CubeDevice {
    /// Returns the device index carried by this selector, whichever backend
    /// variant it is.
    #[inline]
    pub fn ordinal(&self) -> usize {
        // Every variant wraps exactly one `usize`, so a single irrefutable
        // or-pattern pulls it out without a `match`.
        let (Self::Cuda(index) | Self::Wgpu(index) | Self::Rocm(index)) = *self;
        index
    }

    /// Returns the lowercase backend tag: `"cuda"`, `"wgpu"` or `"rocm"`.
    #[inline]
    pub fn backend_name(&self) -> &'static str {
        match *self {
            Self::Rocm(..) => "rocm",
            Self::Wgpu(..) => "wgpu",
            Self::Cuda(..) => "cuda",
        }
    }
}

impl fmt::Display for CubeDevice {
    /// Renders the selector as `<backend>:<ordinal>`, e.g. `cuda:0`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_fmt(format_args!(
            "{}:{}",
            self.backend_name(),
            self.ordinal()
        ))
    }
}

// ---------------------------------------------------------------------------
// CubeClient — per-backend real compute client
// ---------------------------------------------------------------------------

/// An initialised CubeCL compute client for one of the supported backends.
///
/// Each real variant only exists when its Cargo feature (`wgpu`, `cuda`,
/// `rocm`) is compiled in, and which one a [`CubeRuntime`] holds follows
/// from the [`CubeDevice`] the runtime was built for. `ops.rs` matches on
/// this enum to dispatch generic CubeCL kernels to the correct backend.
///
/// The [`CubeClient::Stub`] variant is always present (no cfg gate) and
/// is reserved for tests — it holds no client state and every kernel
/// dispatch macro has a `Stub => unreachable!()` arm. Tests that want to
/// exercise pre-dispatch paths (shape checks, signature pins) on a
/// machine without a real backend can build a [`CubeRuntime`] via
/// [`CubeRuntime::new_for_testing`], whose `client` field is `Stub`.
/// Production code paths never construct or observe `Stub` because
/// [`CubeRuntime::new`] only ever yields a real backend client. (#1083)
#[derive(Clone)]
pub enum CubeClient {
    /// A real Wgpu (Vulkan/Metal/DX12) compute client.
    /// Only compiled with the `wgpu` feature.
    #[cfg(feature = "wgpu")]
    Wgpu(ComputeClient<WgpuRuntime>),
    /// A real CUDA compute client.
    /// Only compiled with the `cuda` feature.
    #[cfg(feature = "cuda")]
    Cuda(ComputeClient<CudaRuntime>),
    /// A real HIP/ROCm compute client.
    /// Only compiled with the `rocm` feature.
    #[cfg(feature = "rocm")]
    Rocm(ComputeClient<HipRuntime>),
    /// Test stub — every kernel dispatch panics; only pre-dispatch
    /// paths (shape checks, signature pins) are reachable.
    ///
    /// This variant is always compiled in (no cfg gate) so tests in
    /// any feature configuration can construct a runtime without a
    /// real backend client. Reaching a kernel dispatch arm with
    /// `Stub` is a test-discipline bug — the shape-mismatch test
    /// must fire before dispatch. (#1083)
    Stub,
}

impl fmt::Debug for CubeClient {
    /// Prints only the variant name; the wrapped backend client is elided
    /// as `(..)`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Pick the label first so there is a single write at the end.
        let label = match self {
            #[cfg(feature = "wgpu")]
            Self::Wgpu(..) => "CubeClient::Wgpu(..)",
            #[cfg(feature = "cuda")]
            Self::Cuda(..) => "CubeClient::Cuda(..)",
            #[cfg(feature = "rocm")]
            Self::Rocm(..) => "CubeClient::Rocm(..)",
            Self::Stub => "CubeClient::Stub",
        };
        f.write_str(label)
    }
}

// ---------------------------------------------------------------------------
// CubeRuntime
// ---------------------------------------------------------------------------

/// CubeCL runtime wrapper that holds a real compute client for one device.
///
/// Pairs the [`CubeDevice`] selector the runtime was requested for with the
/// [`CubeClient`] built for it. Cloning clones both fields.
#[derive(Clone, Debug)]
pub struct CubeRuntime {
    /// Selector this runtime was constructed with; exposed via `device()`.
    device: CubeDevice,
    /// Backend client for `device`; exposed via `client()`. This is
    /// `CubeClient::Stub` when the runtime came from `new_for_testing`.
    client: CubeClient,
}

impl CubeRuntime {
    /// Builds a runtime bound to `device`, creating the matching backend
    /// client.
    ///
    /// # Errors
    ///
    /// Returns `Err(FerrotorchError::DeviceUnavailable)` when the Cargo
    /// feature for the requested backend was not compiled in.
    ///
    /// # Panics
    ///
    /// NOTE(review): the tests in this file wrap wgpu construction in
    /// `catch_unwind` because adapter selection can panic when no usable
    /// adapter exists — presumably that panic propagates through here;
    /// confirm against the backend crates.
    pub fn new(device: CubeDevice) -> FerrotorchResult<Self> {
        Self::make_client(device).map(|client| Self { device, client })
    }

    /// Builds a `CubeRuntime` around the test-only [`CubeClient::Stub`]
    /// client, without creating any backend client.
    ///
    /// Intended for conformance tests that only exercise CPU-side
    /// pre-dispatch paths — shape validation, dtype checks, signature
    /// pins — in environments with no usable wgpu/CUDA/ROCm adapter.
    /// Letting such a runtime reach kernel dispatch is a test bug: every
    /// dispatch macro arm panics on `Stub`.
    ///
    /// Production code must go through [`Self::new`] or [`Self::auto`],
    /// neither of which ever yields `Stub`. (#1083)
    #[doc(hidden)]
    pub fn new_for_testing(device: CubeDevice) -> Self {
        // `device` is recorded as-is; it is not validated against any
        // compiled-in backend.
        let client = CubeClient::Stub;
        Self { device, client }
    }

    /// The device this runtime targets.
    #[inline]
    pub fn device(&self) -> &CubeDevice {
        &self.device
    }

    /// The underlying compute client (one variant per backend).
    #[inline]
    pub fn client(&self) -> &CubeClient {
        &self.client
    }

    /// Auto-detect the best available backend.
    ///
    /// Compiled-in backends are tried in priority order CUDA > ROCm > WGPU,
    /// each at device ordinal 0. The first backend for which [`Self::new`]
    /// returns `Ok` wins; if a backend returns an error, the next
    /// compiled-in backend is tried rather than giving up immediately.
    ///
    /// Returns `None` if no GPU backend feature is enabled, or if every
    /// compiled-in backend returned an error.
    ///
    /// A panic raised inside [`Self::new`] is not caught here.
    pub fn auto() -> Option<Self> {
        // Each cfg-gated block only returns on success, so control falls
        // through to the next candidate and no `unreachable_code` allowance
        // is needed.

        // CUDA takes priority when available.
        #[cfg(feature = "cuda")]
        {
            if let Ok(runtime) = Self::new(CubeDevice::Cuda(0)) {
                return Some(runtime);
            }
        }

        // ROCm for AMD-native workloads.
        #[cfg(feature = "rocm")]
        {
            if let Ok(runtime) = Self::new(CubeDevice::Rocm(0)) {
                return Some(runtime);
            }
        }

        // WGPU is the most portable fallback.
        #[cfg(feature = "wgpu")]
        {
            if let Ok(runtime) = Self::new(CubeDevice::Wgpu(0)) {
                return Some(runtime);
            }
        }

        None
    }

    /// Reports whether at least one GPU backend feature (`cuda`, `rocm` or
    /// `wgpu`) was compiled in.
    ///
    /// This is a compile-time answer only; it does not probe for hardware.
    pub fn is_available() -> bool {
        cfg!(feature = "cuda") || cfg!(feature = "rocm") || cfg!(feature = "wgpu")
    }

    /// Read `n` `f32` values from a device-resident handle back to host memory.
    ///
    /// This is the single readback point for callers (e.g. `ferrotorch-xpu`)
    /// that receive a `(cubecl::server::Handle, Vec<usize>)` from a
    /// `portable_*` op and need CPU-resident data. Dispatches to the correct
    /// backend client. ADR #663 item 4.
    ///
    /// Only the first `n` values of the buffer are returned; any extra
    /// trailing data in the readback is ignored.
    ///
    /// # Errors
    ///
    /// Returns `FerrotorchError::InvalidArgument` when the backend
    /// `read_one` call fails, or when the buffer read back holds fewer than
    /// `n` `f32` values (previously this case panicked on the slice index).
    ///
    /// # Panics
    ///
    /// Panics if this runtime's client is [`CubeClient::Stub`]; stub
    /// runtimes must never reach readback.
    #[cfg(any(feature = "wgpu", feature = "cuda", feature = "rocm"))]
    pub fn read_f32s(
        &self,
        handle: cubecl::server::Handle,
        n: usize,
    ) -> ferrotorch_core::FerrotorchResult<Vec<f32>> {
        use cubecl::prelude::*;
        let bytes = match &self.client {
            #[cfg(feature = "wgpu")]
            CubeClient::Wgpu(c) => c.read_one(handle),
            #[cfg(feature = "cuda")]
            CubeClient::Cuda(c) => c.read_one(handle),
            #[cfg(feature = "rocm")]
            CubeClient::Rocm(c) => c.read_one(handle),
            // #1083: the Stub variant is reserved for tests that exercise
            // pre-dispatch paths only; reaching readback means a kernel
            // would have already had to dispatch, which the dispatch
            // macros refuse for Stub.
            CubeClient::Stub => unreachable!(
                "CubeClient::Stub reached read_f32s — Stub runtimes must not \
                 reach kernel dispatch or readback; shape check or signature \
                 pin should fire first (#1083)"
            ),
        }
        .map_err(|e| ferrotorch_core::FerrotorchError::InvalidArgument {
            message: format!("cubecl read_one failed: {e}"),
        })?;
        // Expected invariant: `handle` refers to a buffer of at least
        // `n * size_of::<f32>()` bytes filled with `f32` values by a
        // `#[cube]` kernel. Both `handle` and `n` are caller-supplied, so
        // the length is checked here instead of trusted.
        // NOTE(review): `f32::from_bytes` is assumed to reinterpret the byte
        // buffer as `&[f32]`; whether it can itself panic on a misaligned or
        // odd-length buffer is not visible here — confirm against cubecl.
        let values = f32::from_bytes(&bytes);
        values
            .get(..n)
            .map(|head| head.to_vec())
            .ok_or_else(|| ferrotorch_core::FerrotorchError::InvalidArgument {
                message: format!(
                    "cubecl readback holds {} f32 values but {n} were requested",
                    values.len()
                ),
            })
    }

    // ---------------------------------------------------------------------
    // Backend client construction
    // ---------------------------------------------------------------------

    /// Creates the backend client for `device`.
    ///
    /// For each backend exactly one of the two cfg-gated blocks survives
    /// compilation: the real construction when the feature is on, or an
    /// `Err(FerrotorchError::DeviceUnavailable)` when it is off.
    #[allow(clippy::unnecessary_wraps)] // reason: returns Err under #[cfg(not(feature=...))]; clippy only sees the all-features path
    fn make_client(device: CubeDevice) -> FerrotorchResult<CubeClient> {
        match device {
            CubeDevice::Cuda(index) => {
                #[cfg(not(feature = "cuda"))]
                {
                    // Mark the ordinal as used so the disabled build stays warning-free.
                    let _ = index;
                    Err(ferrotorch_core::FerrotorchError::DeviceUnavailable)
                }
                #[cfg(feature = "cuda")]
                {
                    let target = CudaDevice { index };
                    Ok(CubeClient::Cuda(CudaRuntime::client(&target)))
                }
            }
            CubeDevice::Rocm(index) => {
                #[cfg(not(feature = "rocm"))]
                {
                    let _ = index;
                    Err(ferrotorch_core::FerrotorchError::DeviceUnavailable)
                }
                #[cfg(feature = "rocm")]
                {
                    let target = AmdDevice { index };
                    Ok(CubeClient::Rocm(HipRuntime::client(&target)))
                }
            }
            CubeDevice::Wgpu(index) => {
                #[cfg(not(feature = "wgpu"))]
                {
                    let _ = index;
                    Err(ferrotorch_core::FerrotorchError::DeviceUnavailable)
                }
                #[cfg(feature = "wgpu")]
                {
                    // wgpu ordinals are translated by a helper rather than
                    // passed straight through.
                    let target = wgpu_device_for_index(index);
                    Ok(CubeClient::Wgpu(WgpuRuntime::client(&target)))
                }
            }
        }
    }
}

/// Translates a [`CubeDevice::Wgpu`] ordinal into a `WgpuDevice` selector.
#[cfg(feature = "wgpu")]
fn wgpu_device_for_index(index: usize) -> WgpuDevice {
    if index == 0 {
        // Ordinal 0 means "system default adapter": the most portable
        // choice, and it matches how ferrotorch-gpu selects a GPU.
        WgpuDevice::DefaultDevice
    } else {
        // Any other ordinal explicitly selects a discrete GPU slot.
        // NOTE(review): the ordinal is forwarded unchanged, so
        // `DiscreteGpu(0)` can never be chosen through this function —
        // confirm that is intended rather than an off-by-one.
        WgpuDevice::DiscreteGpu(index)
    }
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;

    /// `ordinal()` returns the wrapped index for every backend variant.
    #[test]
    fn cube_device_ordinal() {
        // (selector, ordinal it must report) — one row per backend.
        let cases = [
            (CubeDevice::Cuda(3), 3),
            (CubeDevice::Wgpu(1), 1),
            (CubeDevice::Rocm(0), 0),
        ];
        for (device, expected) in cases {
            assert_eq!(device.ordinal(), expected);
        }
    }

    /// `backend_name()` returns the fixed lowercase tag for each backend.
    #[test]
    fn cube_device_backend_name() {
        let cases = [
            (CubeDevice::Cuda(0), "cuda"),
            (CubeDevice::Wgpu(0), "wgpu"),
            (CubeDevice::Rocm(0), "rocm"),
        ];
        for (device, tag) in cases {
            assert_eq!(device.backend_name(), tag);
        }
    }

    /// `Display` renders a selector as `<backend>:<ordinal>`.
    #[test]
    fn cube_device_display() {
        let cases = [
            (CubeDevice::Cuda(2), "cuda:2"),
            (CubeDevice::Wgpu(0), "wgpu:0"),
            (CubeDevice::Rocm(1), "rocm:1"),
        ];
        for (device, rendered) in cases {
            assert_eq!(device.to_string(), rendered);
        }
    }

    /// Equality requires both the same backend and the same ordinal.
    #[test]
    fn cube_device_equality() {
        let cuda0 = CubeDevice::Cuda(0);
        // Same backend, same ordinal.
        assert_eq!(cuda0, CubeDevice::Cuda(0));
        // Same backend, different ordinal.
        assert_ne!(cuda0, CubeDevice::Cuda(1));
        // Different backend, same ordinal.
        assert_ne!(cuda0, CubeDevice::Wgpu(0));
    }

    /// Selectors hash distinctly per backend, and re-inserting an equal
    /// selector does not grow a `HashSet`.
    #[test]
    fn cube_device_clone_and_hash() {
        use std::collections::HashSet;
        let distinct = [
            CubeDevice::Cuda(0),
            CubeDevice::Wgpu(0),
            CubeDevice::Rocm(0),
        ];
        let mut seen: HashSet<CubeDevice> = distinct.iter().copied().collect();
        assert_eq!(seen.len(), 3);

        // An already-present selector must leave the size unchanged.
        seen.insert(CubeDevice::Cuda(0));
        assert_eq!(seen.len(), 3);
    }

    /// Tries to build a wgpu runtime for ordinal 0, returning `None` when
    /// that is not possible in the current environment.
    ///
    /// WSL2 lacks a Vulkan ICD by default, so the cubecl-wgpu worker thread
    /// panics during adapter selection and the panic surfaces on the main
    /// thread as `RecvError`. The panic is caught here so callers can skip
    /// cleanly instead of failing.
    #[cfg(feature = "wgpu")]
    fn wgpu_probe_runtime() -> Option<CubeRuntime> {
        let attempt = std::panic::AssertUnwindSafe(|| CubeRuntime::new(CubeDevice::Wgpu(0)).ok());
        // A caught panic (`Err`) and a construction error (`Ok(None)`) both
        // collapse to `None`.
        std::panic::catch_unwind(attempt).ok().flatten()
    }

    /// A wgpu runtime reports the selector it was built for and holds a
    /// `Wgpu` client; skipped when no adapter can be constructed.
    #[cfg(feature = "wgpu")]
    #[test]
    fn wgpu_runtime_new_and_device() {
        match wgpu_probe_runtime() {
            None => eprintln!("[ferrotorch-cubecl] wgpu adapter unavailable; skipping"),
            Some(runtime) => {
                assert_eq!(*runtime.device(), CubeDevice::Wgpu(0));
                // Client should match the selected backend.
                assert!(matches!(runtime.client(), CubeClient::Wgpu(_)));
            }
        }
    }

    /// With no backend feature compiled in, `new` must fail with
    /// `DeviceUnavailable` rather than succeed.
    #[cfg(not(any(feature = "wgpu", feature = "cuda", feature = "rocm")))]
    #[test]
    fn no_backend_feature_yields_device_unavailable() {
        let outcome = CubeRuntime::new(CubeDevice::Wgpu(0));
        assert!(matches!(
            outcome,
            Err(ferrotorch_core::FerrotorchError::DeviceUnavailable)
        ));
    }

    /// `auto()` either yields a runtime (which implies a backend feature is
    /// compiled in) or yields nothing; a panic counts as "nothing".
    #[test]
    fn cube_runtime_auto_returns_something_or_none() {
        // `auto()` may panic on the worker thread if a backend feature is
        // compiled in but the actual hardware/driver isn't available
        // (e.g. wgpu in WSL without Vulkan). Catch that and treat it as
        // "not available" — matching the documented contract that this
        // function returns `Option`.
        let probed = std::panic::catch_unwind(std::panic::AssertUnwindSafe(CubeRuntime::auto));
        if let Ok(Some(_)) = probed {
            assert!(CubeRuntime::is_available());
        } else {
            // Either no backend feature compiled in, or the backend
            // feature is present but no adapter exists at runtime.
            // Both are valid outcomes.
            eprintln!(
                "[ferrotorch-cubecl] auto() returned no runtime (no backend feature or no \
                 adapter); test passes"
            );
        }
    }

    /// When no backend feature is compiled in, `auto()` must not produce a
    /// runtime.
    #[test]
    fn cube_runtime_is_available_consistent() {
        // `is_available()` is a compile-time check (`cfg!(...)`). When a
        // feature is compiled in but no hardware exists at runtime, `auto()`
        // may still return `Some` (lazy init succeeds, kernel dispatch
        // would fail later). That asymmetry is accepted, so nothing is
        // asserted for builds that do have a backend feature.
        if CubeRuntime::is_available() {
            return;
        }
        // Belt-and-suspenders: when no feature is compiled, auto() must
        // be None.
        let probed = std::panic::catch_unwind(std::panic::AssertUnwindSafe(CubeRuntime::auto));
        assert!(!matches!(probed, Ok(Some(_))));
    }
}