omni-ffi 0.1.0

Zero-cost C++/CUDA FFI bridge for the OmniPulse Wavelet Scattering Transform engine — GPU-accelerated perceptual fingerprinting from Rust
Documentation
//! omni-ffi — zero-cost cxx bridge between the Rust orchestrator and the
//! OmniPulse Module-I WST math engine.
//!
//! On the default (CPU) build the bridge calls `cpu_wst_forward()` from
//! `cpu_wst_engine.h`: a real Radix-2 Cooley-Tukey FFT + analytic Morlet
//! filter bank + depth-m scattering cascade. No mocks.
//!
//! When the `cuda` feature is enabled the bridge links `cudart` and `cufft`
//! and (in the GPU build) dispatches to the templated
//! `WSTEngine<HopperTag, J, Q>` defined in `wst_kernel.cuh`.
//!
//! ## Memory ownership
//!
//! [`run_wst_pipeline`](ffi::run_wst_pipeline) — exposed outside this crate
//! through the [`execute_fingerprint_pass`] wrapper — returns a
//! [`WSTResult`] whose `fingerprint_ptr` owns either:
//!   * a `new float[]` heap allocation (CPU build), or
//!   * a `cudaMalloc` device allocation (CUDA build).
//!
//! The Rust caller MUST release it by calling
//! [`free_wst_result`](ffi::free_wst_result) — exposed outside this crate as
//! [`free_fingerprint`] — exactly once. Forgetting to do so leaks heap or
//! VRAM. Calling it twice is undefined behavior.

#[cxx::bridge]
mod ffi {
    /// Plain-old-data result struct shared with C++.
    ///
    /// Layout is locked by cxx: three `uint64_t` fields, in this order, no
    /// padding. The matching C++ definition is generated by cxx-build into
    /// `target/cxxbridge/omni-ffi/src/lib.rs.h` and re-used by
    /// `cpp/wst_bridge.h` (which `#include`s the generated header).
    ///
    /// The struct derives `Copy`, so every copy carries the same
    /// `fingerprint_ptr`. The type system therefore does NOT prevent a
    /// second release of the same allocation; upholding the exactly-once
    /// rule of [`free_wst_result`] is entirely the caller's responsibility.
    #[derive(Clone, Copy, Debug)]
    struct WSTResult {
        /// Opaque pointer to the output scattering tensor, carried as an
        /// integer so it can cross the bridge as plain data.
        ///
        /// CPU build: a `float*` from `new float[]` — release with
        /// [`free_wst_result`] (`delete[]`).
        ///
        /// CUDA build: a `CUdeviceptr` from `cudaMalloc` — release with
        /// [`free_wst_result`] (`cudaFree`).
        ///
        /// A value of `0` is the "nothing to free" sentinel accepted by
        /// [`free_wst_result`].
        fingerprint_ptr: u64,

        /// Number of `float32` coefficients in the output tensor
        /// (`signal_len * batch_size`).
        coeff_count: u64,

        /// Wall-clock execution time of the scattering cascade in
        /// microseconds. Used by the FinOps autoscaler for cost attribution.
        exec_time_us: u64,
    }

    unsafe extern "C++" {
        include!("wst_bridge.h");

        /// Run one WST/JTFS scattering pass against a Plasma-resident input
        /// buffer.
        ///
        /// # Parameters
        ///
        /// * `input_plasma_ptr` — address of the input samples, passed as
        ///   an integer (see *Safety*).
        /// * `signal_len`, `batch_size` — the input holds exactly
        ///   `signal_len * batch_size` `f32` elements.
        /// * `J`, `Q` — presumably the filter-bank parameters that match
        ///   the `J` / `Q` template arguments of the C++ engine; their
        ///   exact meaning is defined on the C++ side (TODO confirm).
        /// * `depth` — scattering cascade depth; the crate's convenience
        ///   wrapper passes `2`.
        /// * `use_jtfs` — `false` requests a plain WST pass (this is what
        ///   the convenience wrapper uses); `true` presumably selects the
        ///   JTFS variant (TODO confirm against `wst_bridge.h`).
        ///
        /// # Safety
        ///
        /// The caller MUST uphold every one of the following invariants —
        /// any violation is undefined behavior:
        ///
        /// * `input_plasma_ptr` is a valid, host-readable pointer to a
        ///   contiguous `f32` array of exactly `signal_len * batch_size`
        ///   elements. In production this is the base address of an Apache
        ///   Arrow Plasma `mmap` region. On CUDA builds the same address
        ///   must already be registered with the CUDA driver via
        ///   `cudaHostRegister` so it is reachable from device kernels via
        ///   UVA.
        /// * The Plasma object that backs `input_plasma_ptr` must remain
        ///   live (sealed and not evicted) for the entire duration of this
        ///   call.
        ///
        /// # Errors
        ///
        /// All integer parameters must be strictly positive. Unlike the
        /// pointer invariants above, a violation here is reported rather
        /// than undefined: the C++ side throws `std::runtime_error` on
        /// non-positive values, and cxx surfaces that as
        /// `Err(cxx::Exception)`.
        ///
        /// # Ownership of the result
        ///
        /// On success the returned [`WSTResult`] owns a heap (CPU) or
        /// device (CUDA) allocation that MUST be released with
        /// [`free_wst_result`] exactly once.
        unsafe fn run_wst_pipeline(
            input_plasma_ptr: u64,
            signal_len: i32,
            batch_size: i32,
            J: i32,
            Q: i32,
            depth: i32,
            use_jtfs: bool,
        ) -> Result<WSTResult>;

        /// Release the tensor allocation backing `result`.
        ///
        /// # Safety
        ///
        /// `result` must have been returned by a successful call to
        /// [`run_wst_pipeline`] in the same process and must not have been
        /// passed to this function previously. Calling this with any other
        /// value, or twice with the same value (including a copy of it —
        /// `WSTResult` is `Copy`), is undefined behavior
        /// (double-free / use-after-free of heap or VRAM).
        ///
        /// Calling with a `WSTResult` whose `fingerprint_ptr == 0` is a
        /// no-op — that's the only safe sentinel.
        unsafe fn free_wst_result(result: WSTResult);
    }
}

pub use ffi::WSTResult;

/// Run one plain-WST scattering pass over a single signal at depth 2.
///
/// This is a thin front door to [`ffi::run_wst_pipeline`]: it pins
/// `batch_size = 1`, `depth = 2` and `use_jtfs = false`, and forwards
/// `plasma_id`, `signal_len`, `j` and `q` untouched.
///
/// # Safety
///
/// `plasma_id` is handed to the bridge as its `input_plasma_ptr` argument,
/// so it must meet every invariant documented on
/// [`ffi::run_wst_pipeline`]: it is the base address of a live, contiguous
/// `f32[signal_len]` Apache Arrow Plasma mmap region. Consult that
/// function's docs for the complete contract.
///
/// # Errors
///
/// Any `cxx::Exception` produced by the bridge call is returned unchanged.
///
/// # Ownership of the result
///
/// On success the [`WSTResult`] owns an allocation; pass it to
/// [`free_fingerprint`] exactly once to release it.
pub unsafe fn execute_fingerprint_pass(
    plasma_id: u64,
    signal_len: i32,
    j: i32,
    q: i32,
) -> Result<WSTResult, cxx::Exception> {
    // Fixed configuration of this convenience entry point.
    const BATCH_SIZE: i32 = 1;
    const DEPTH: i32 = 2;
    const USE_JTFS: bool = false;

    // SAFETY: the arguments go straight to ffi::run_wst_pipeline, and our
    // own caller has already vouched for the Plasma-pointer invariants.
    unsafe {
        ffi::run_wst_pipeline(
            plasma_id, signal_len, BATCH_SIZE, j, q, DEPTH, USE_JTFS,
        )
    }
}

/// Free the tensor allocation owned by a [`WSTResult`].
///
/// Forwards `result` to [`ffi::free_wst_result`] and does nothing else.
///
/// # Safety
///
/// `result` must originate from [`execute_fingerprint_pass`] (or from
/// [`ffi::run_wst_pipeline`] called directly) in this process, and it must
/// not have been handed to this function before. The complete contract is
/// spelled out on [`ffi::free_wst_result`].
pub unsafe fn free_fingerprint(result: WSTResult) {
    // SAFETY: the release contract is the caller's obligation, as stated in
    // this function's `# Safety` section.
    unsafe {
        ffi::free_wst_result(result);
    }
}