use once_cell::sync::Lazy;
use runmat_builtins::{Tensor, Value};
use std::path::PathBuf;
use std::sync::RwLock;
pub mod backend;
pub mod fusion;
pub mod fusion_exec;
pub mod fusion_residency;
pub mod graph;
mod host_lu;
pub mod native_auto;
pub mod precision;
mod reduction_meta;
pub mod simple_provider;
mod sortrows_host;
pub mod telemetry;
#[cfg(target_arch = "wasm32")]
mod web_auto_offload_store;
#[cfg(feature = "wgpu")]
use crate::backend::wgpu::provider::WgpuProvider;
pub use fusion::*;
pub use graph::*;
pub use native_auto::{
apply_auto_offload_calibration_from_file, auto_offload_report, is_sink, prepare_builtin_args,
promote_binary, promote_reduction_args, promote_unary, reset_auto_offload_log,
AutoOffloadCalibrationOutcome, AutoOffloadCalibrationSummary, AutoOffloadDecisionEntry,
AutoOffloadDisposition, AutoOffloadReport, BinaryOp, CachedProviderInfo, DecisionReason,
ReductionOp, ThresholdBase, ThresholdDelta, ThresholdDeltaEntry, ThresholdSnapshot, UnaryOp,
};
pub use reduction_meta::{value_is_all_keyword, ReductionAxes};
#[cfg(feature = "wgpu")]
use runmat_accelerate_api::AccelProvider;
use serde::{Deserialize, Serialize};
#[cfg(feature = "wgpu")]
use wgpu::PowerPreference;
/// Which acceleration provider the runtime should try to register.
///
/// Serialized in kebab-case (e.g. `in-process`) for configuration files.
//
// Uses the derived `Default` with `#[default]`, matching the pattern already
// used by `AutoOffloadLogLevel` in this file, instead of a manual impl.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
#[serde(rename_all = "kebab-case")]
pub enum AccelerateProviderPreference {
    /// Pick the best available provider (WGPU when built in, else in-process).
    #[default]
    Auto,
    /// Require the WGPU GPU backend.
    Wgpu,
    /// Use the CPU-based in-process provider.
    InProcess,
}
/// Power-preference hint forwarded to WGPU adapter selection.
///
/// Serialized in kebab-case (e.g. `high-performance`) for configuration files.
//
// Uses the derived `Default` with `#[default]`, matching the pattern already
// used by `AutoOffloadLogLevel` in this file, instead of a manual impl.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
#[serde(rename_all = "kebab-case")]
pub enum AccelPowerPreference {
    /// Let the runtime decide (currently mapped to high performance).
    #[default]
    Auto,
    /// Prefer the fastest available adapter.
    HighPerformance,
    /// Prefer the most power-efficient adapter.
    LowPower,
}
/// Verbosity of the auto-offload decision log (kebab-case in config files).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
#[serde(rename_all = "kebab-case")]
pub enum AutoOffloadLogLevel {
    /// No decision logging.
    Off,
    /// Informational logging.
    Info,
    /// Most verbose logging; this is the default.
    #[default]
    Trace,
}
/// Runtime configuration for automatic GPU offload decisions.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AutoOffloadOptions {
    /// Master switch for automatic offload.
    pub enabled: bool,
    /// Whether calibration should run (presumably feeding the thresholds used
    /// by `native_auto` — see `apply_auto_offload_calibration_from_file`).
    pub calibrate: bool,
    /// Optional path to a calibration profile file; `None` when not supplied.
    #[serde(default)]
    pub profile_path: Option<PathBuf>,
    /// Verbosity of the auto-offload decision log.
    #[serde(default)]
    pub log_level: AutoOffloadLogLevel,
}
impl Default for AutoOffloadOptions {
fn default() -> Self {
Self {
enabled: true,
calibrate: true,
profile_path: None,
log_level: AutoOffloadLogLevel::Trace,
}
}
}
/// Process-wide auto-offload configuration, guarded for concurrent access.
static AUTO_OFFLOAD_OPTIONS: Lazy<RwLock<AutoOffloadOptions>> =
    Lazy::new(|| RwLock::new(AutoOffloadOptions::default()));
/// One-shot registration of the accelerate-api callback hooks; forcing this
/// `Lazy` runs the closure at most once per process.
static API_HOOKS: Lazy<()> = Lazy::new(|| {
    runmat_accelerate_api::register_residency_mark(fusion_residency::mark);
    runmat_accelerate_api::register_residency_clear(fusion_residency::clear);
    runmat_accelerate_api::register_sequence_threshold_provider(sequence_threshold_hint_bridge);
    runmat_accelerate_api::register_workgroup_size_hint_provider(workgroup_size_hint_bridge);
});
/// Idempotently install the accelerate-api hooks by forcing `API_HOOKS`.
pub(crate) fn ensure_residency_hooks() {
    Lazy::force(&API_HOOKS);
}
/// Bridge registered with `runmat_accelerate_api`: exposes the native
/// auto-offload sequence threshold hint.
fn sequence_threshold_hint_bridge() -> Option<usize> {
    native_auto::sequence_threshold_hint()
}
/// Bridge registered with `runmat_accelerate_api`: reports the effective WGPU
/// workgroup size, or `None` when the crate is built without the `wgpu`
/// feature.
fn workgroup_size_hint_bridge() -> Option<u32> {
    #[cfg(feature = "wgpu")]
    {
        Some(crate::backend::wgpu::config::effective_workgroup_size())
    }
    #[cfg(not(feature = "wgpu"))]
    {
        None
    }
}
/// Install the global auto-offload options, registering the accelerate-api
/// hooks first so residency/threshold callbacks are wired before any
/// offload decision reads the new configuration.
pub fn configure_auto_offload(options: AutoOffloadOptions) {
    ensure_residency_hooks();
    // Recover from lock poisoning instead of silently dropping the caller's
    // configuration (the previous `if let Ok` discarded `options` on poison).
    // The guarded value is a plain options struct, so whatever a panicking
    // writer left behind is safe to overwrite.
    let mut guard = AUTO_OFFLOAD_OPTIONS
        .write()
        .unwrap_or_else(|poisoned| poisoned.into_inner());
    *guard = options;
}
/// Snapshot the current global auto-offload options.
pub(crate) fn auto_offload_options() -> AutoOffloadOptions {
    // Recover from lock poisoning instead of silently substituting defaults
    // (the previous `map(..).unwrap_or_default()` masked the stored value on
    // poison). The stored struct is plain data, so reading it is safe.
    AUTO_OFFLOAD_OPTIONS
        .read()
        .unwrap_or_else(|poisoned| poisoned.into_inner())
        .clone()
}
/// Options controlling acceleration-provider initialization.
#[derive(Debug, Clone)]
pub struct AccelerateInitOptions {
    /// Master switch; when `false` only the in-process fallback may register.
    pub enabled: bool,
    /// Which provider backend to prefer.
    pub provider: AccelerateProviderPreference,
    /// Whether the CPU in-process provider may be used when no GPU provider
    /// registers (or when acceleration is disabled).
    pub allow_inprocess_fallback: bool,
    /// Power-preference hint forwarded to WGPU adapter selection.
    pub wgpu_power_preference: AccelPowerPreference,
    /// Force WGPU's software/fallback adapter.
    pub wgpu_force_fallback_adapter: bool,
    /// Auto-offload configuration applied during initialization.
    pub auto_offload: AutoOffloadOptions,
}
impl Default for AccelerateInitOptions {
fn default() -> Self {
Self {
enabled: true,
provider: AccelerateProviderPreference::Auto,
allow_inprocess_fallback: true,
wgpu_power_preference: AccelPowerPreference::Auto,
wgpu_force_fallback_adapter: false,
auto_offload: AutoOffloadOptions::default(),
}
}
}
/// Initialize and register the process-wide acceleration provider per `options`.
///
/// Resolution order:
/// 1. Auto-offload options are always applied first.
/// 2. If a provider is already registered, it is kept and nothing else happens.
/// 3. If acceleration is disabled, the in-process provider is registered only
///    when `allow_inprocess_fallback` is set.
/// 4. Otherwise the WGPU backend is attempted (native builds with the `wgpu`
///    feature), with the in-process provider as a last resort when permitted.
pub fn initialize_acceleration_provider_with(options: &AccelerateInitOptions) {
    configure_auto_offload(options.auto_offload.clone());
    // Never replace an already-registered provider.
    if runmat_accelerate_api::provider().is_some() {
        return;
    }
    if !options.enabled {
        if options.allow_inprocess_fallback {
            simple_provider::register_inprocess_provider();
            log::info!(
                "RunMat Accelerate: acceleration disabled; using in-process provider for compatibility"
            );
        } else {
            log::info!("RunMat Accelerate: acceleration disabled; no provider registered");
        }
        return;
    }
    // Whether a GPU-backed provider was successfully registered; exactly one
    // of the three cfg arms below is compiled in.
    let registered = {
        // Native builds with the WGPU feature: try adapter registration.
        #[cfg(all(feature = "wgpu", not(target_arch = "wasm32")))]
        {
            let mut reg = false;
            if matches!(
                options.provider,
                AccelerateProviderPreference::Auto | AccelerateProviderPreference::Wgpu
            ) {
                let wgpu_options = backend::wgpu::provider::WgpuProviderOptions {
                    // `Auto` maps to high performance.
                    power_preference: match options.wgpu_power_preference {
                        AccelPowerPreference::Auto => PowerPreference::HighPerformance,
                        AccelPowerPreference::HighPerformance => PowerPreference::HighPerformance,
                        AccelPowerPreference::LowPower => PowerPreference::LowPower,
                    },
                    force_fallback_adapter: options.wgpu_force_fallback_adapter,
                };
                match backend::wgpu::provider::register_wgpu_provider(wgpu_options) {
                    Ok(provider) => {
                        reg = true;
                        announce_wgpu_provider(provider);
                    }
                    Err(err) => {
                        // Non-fatal: fall through to the in-process fallback.
                        log::warn!(
                            "RunMat Accelerate: failed to initialize WGPU provider, falling back: {err}"
                        );
                    }
                }
            }
            reg
        }
        // wasm builds cannot register WGPU synchronously; callers must use
        // `initialize_wgpu_provider_async` instead.
        #[cfg(all(feature = "wgpu", target_arch = "wasm32"))]
        {
            if matches!(
                options.provider,
                AccelerateProviderPreference::Auto | AccelerateProviderPreference::Wgpu
            ) {
                log::info!(
                    "RunMat Accelerate: wasm builds require calling initialize_wgpu_provider_async to enable the WGPU backend"
                );
            }
            false
        }
        // Built without the `wgpu` feature: GPU registration is impossible.
        #[cfg(not(feature = "wgpu"))]
        {
            if matches!(options.provider, AccelerateProviderPreference::Wgpu) {
                log::warn!(
                    "RunMat Accelerate: WGPU provider requested but crate built without 'wgpu' feature"
                );
            }
            false
        }
    };
    if !registered {
        if options.allow_inprocess_fallback
            || matches!(options.provider, AccelerateProviderPreference::InProcess)
        {
            simple_provider::register_inprocess_provider();
            log::info!("RunMat Accelerate: using in-process acceleration provider");
        } else {
            log::warn!("RunMat Accelerate: no acceleration provider registered");
        }
    }
}
/// Log the selected WGPU adapter, run the provider's warmup, and report the
/// fused-pipeline cache hit/miss counters observed after warmup.
#[cfg(feature = "wgpu")]
fn announce_wgpu_provider(provider: &WgpuProvider) {
    let info = provider.device_info_struct();
    // Backend string may be absent depending on the adapter.
    let backend = info.backend.as_deref().unwrap_or("unknown");
    log::info!(
        "RunMat Accelerate: using WGPU provider {} (vendor: {}, backend: {})",
        info.name,
        info.vendor,
        backend
    );
    provider.warmup();
    let (hits, misses) = provider.fused_cache_counters();
    log::info!(
        "RunMat Accelerate: fused pipeline cache after warmup - hits: {}, misses: {}",
        hits,
        misses
    );
}
/// Async provider initialization for wasm builds, where WGPU adapter/device
/// acquisition must be awaited. Mirrors `initialize_acceleration_provider_with`:
/// applies auto-offload options, keeps any existing provider, honors the
/// disabled flag, tries WGPU, then falls back to the in-process provider when
/// permitted.
#[cfg(all(feature = "wgpu", target_arch = "wasm32"))]
pub async fn initialize_wgpu_provider_async(options: &AccelerateInitOptions) -> anyhow::Result<()> {
    configure_auto_offload(options.auto_offload.clone());
    // Never replace an already-registered provider.
    if runmat_accelerate_api::provider().is_some() {
        return Ok(());
    }
    if !options.enabled {
        if options.allow_inprocess_fallback
            || matches!(options.provider, AccelerateProviderPreference::InProcess)
        {
            simple_provider::register_inprocess_provider();
            log::info!(
                "RunMat Accelerate: acceleration disabled; using in-process acceleration provider"
            );
        } else {
            log::info!("RunMat Accelerate: acceleration disabled; no provider registered");
        }
        return Ok(());
    }
    let mut registered = false;
    if matches!(
        options.provider,
        AccelerateProviderPreference::Auto | AccelerateProviderPreference::Wgpu
    ) {
        let wgpu_options = backend::wgpu::provider::WgpuProviderOptions {
            // `Auto` maps to high performance.
            power_preference: match options.wgpu_power_preference {
                AccelPowerPreference::Auto => PowerPreference::HighPerformance,
                AccelPowerPreference::HighPerformance => PowerPreference::HighPerformance,
                AccelPowerPreference::LowPower => PowerPreference::LowPower,
            },
            force_fallback_adapter: options.wgpu_force_fallback_adapter,
        };
        match backend::wgpu::provider::register_wgpu_provider_async(wgpu_options).await {
            Ok(provider) => {
                registered = true;
                announce_wgpu_provider(provider);
            }
            Err(err) => {
                // Non-fatal: fall through to the in-process fallback below.
                log::warn!(
                    "RunMat Accelerate: failed to initialize WGPU provider, falling back: {err}"
                );
            }
        }
    }
    if !registered {
        if options.allow_inprocess_fallback
            || matches!(options.provider, AccelerateProviderPreference::InProcess)
        {
            simple_provider::register_inprocess_provider();
            log::info!("RunMat Accelerate: using in-process acceleration provider");
        } else {
            log::warn!("RunMat Accelerate: no acceleration provider registered");
        }
    }
    Ok(())
}
/// Initialize acceleration with the default options (auto provider selection,
/// in-process fallback allowed).
pub fn initialize_acceleration_provider() {
    let defaults = AccelerateInitOptions::default();
    initialize_acceleration_provider_with(&defaults);
}
#[cfg(test)]
mod tests {
    #[cfg(feature = "wgpu")]
    use crate::backend::wgpu::cache::key::compute_pipeline_hash_bytes;
    /// The fused-pipeline cache key must distinguish layouts with different
    /// input arity even when the shader source and workgroup size match.
    #[test]
    #[cfg(feature = "wgpu")]
    fn elementwise_hash_varies_with_arity() {
        let wg = 256u32;
        let h2 = compute_pipeline_hash_bytes(b"shader", "runmat-fusion-layout-2", Some(wg));
        let h3 = compute_pipeline_hash_bytes(b"shader", "runmat-fusion-layout-3", Some(wg));
        assert_ne!(h2, h3, "hash should differ with input arity");
    }
}
/// Fused-pipeline cache counters `(hits, misses)` from the registered
/// provider, or `None` when no provider is registered.
#[cfg(feature = "wgpu")]
pub fn provider_cache_stats() -> Option<(u64, u64)> {
    runmat_accelerate_api::provider().map(|p| p.fused_cache_counters())
}
/// Broad classification of a compute device/backend.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum DeviceKind {
    Cpu,
    Cuda,
    Rocm,
    Metal,
    Vulkan,
    OpenCl,
    Wgpu,
}
/// Descriptive metadata about a compute device.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DeviceInfo {
    /// Backend/device classification.
    pub kind: DeviceKind,
    /// Human-readable device name.
    pub name: String,
    /// Vendor string.
    pub vendor: String,
    /// Total device memory in bytes, if known.
    pub memory_bytes: Option<u64>,
    /// Backend-specific capability string (e.g. for CUDA), if known.
    pub compute_capability: Option<String>,
}
/// Opaque handle to a device-resident buffer.
pub trait BufferHandle: Send + Sync {
    /// Number of elements in the buffer.
    fn len(&self) -> usize;
    /// True when the buffer holds no elements.
    fn is_empty(&self) -> bool {
        self.len() == 0
    }
}
/// Device-resident 2-D matrix view over a buffer.
pub trait DeviceMatrix: Send + Sync {
    /// Number of rows.
    fn rows(&self) -> usize;
    /// Number of columns.
    fn cols(&self) -> usize;
    /// Underlying device buffer.
    fn as_buffer(&self) -> &dyn BufferHandle;
}
/// Minimal backend contract for matrix upload/download plus elementwise and
/// linear-algebra kernels. All operations are fallible and return boxed
/// device matrices so implementations can use their own handle types.
pub trait AccelerateBackend: Send + Sync {
    /// Describe the device this backend runs on.
    fn device_info(&self) -> DeviceInfo;
    /// Copy a host tensor to device memory.
    fn upload_matrix(&self, host: &Tensor) -> anyhow::Result<Box<dyn DeviceMatrix>>;
    /// Copy a device matrix back to a host tensor.
    fn download_matrix(&self, dev: &dyn DeviceMatrix) -> anyhow::Result<Tensor>;
    /// Elementwise `a + b`.
    fn elem_add(
        &self,
        a: &dyn DeviceMatrix,
        b: &dyn DeviceMatrix,
    ) -> anyhow::Result<Box<dyn DeviceMatrix>>;
    /// Elementwise `a - b`.
    fn elem_sub(
        &self,
        a: &dyn DeviceMatrix,
        b: &dyn DeviceMatrix,
    ) -> anyhow::Result<Box<dyn DeviceMatrix>>;
    /// Elementwise `a * b`.
    fn elem_mul(
        &self,
        a: &dyn DeviceMatrix,
        b: &dyn DeviceMatrix,
    ) -> anyhow::Result<Box<dyn DeviceMatrix>>;
    /// Elementwise `a != b`.
    fn elem_ne(
        &self,
        a: &dyn DeviceMatrix,
        b: &dyn DeviceMatrix,
    ) -> anyhow::Result<Box<dyn DeviceMatrix>>;
    /// Elementwise `a == b`.
    fn elem_eq(
        &self,
        a: &dyn DeviceMatrix,
        b: &dyn DeviceMatrix,
    ) -> anyhow::Result<Box<dyn DeviceMatrix>>;
    /// Elementwise `a / b`.
    fn elem_div(
        &self,
        a: &dyn DeviceMatrix,
        b: &dyn DeviceMatrix,
    ) -> anyhow::Result<Box<dyn DeviceMatrix>>;
    /// Elementwise `a ^ b`.
    fn elem_pow(
        &self,
        a: &dyn DeviceMatrix,
        b: &dyn DeviceMatrix,
    ) -> anyhow::Result<Box<dyn DeviceMatrix>>;
    /// Matrix product `a * b`.
    fn matmul(
        &self,
        a: &dyn DeviceMatrix,
        b: &dyn DeviceMatrix,
    ) -> anyhow::Result<Box<dyn DeviceMatrix>>;
    /// Matrix transpose.
    fn transpose(&self, a: &dyn DeviceMatrix) -> anyhow::Result<Box<dyn DeviceMatrix>>;
}
/// Chooses CPU vs. GPU execution for operations; holds an optional backend.
#[derive(Default)]
pub struct Planner {
    // `None` means all work is planned for the CPU.
    backend: Option<Box<dyn AccelerateBackend>>,
}
impl Planner {
pub fn new(backend: Option<Box<dyn AccelerateBackend>>) -> Self {
Self { backend }
}
pub fn device(&self) -> Option<&dyn AccelerateBackend> {
self.backend.as_deref()
}
pub fn choose_elem_add(&self, a: &Tensor, b: &Tensor) -> ExecutionTarget {
if let Some(bk) = &self.backend {
if a.data.len() >= 1 << 16 && a.rows() == b.rows() && a.cols() == b.cols() {
return ExecutionTarget::Gpu(bk.device_info());
}
}
ExecutionTarget::Cpu
}
}
/// Planner verdict: run on the host CPU or on the described GPU device.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ExecutionTarget {
    Cpu,
    Gpu(DeviceInfo),
}
/// High-level facade that executes `Value` operations via a `Planner`.
pub struct Accelerator {
    planner: Planner,
}
impl Accelerator {
    /// Wrap a planner into an accelerator facade.
    pub fn new(planner: Planner) -> Self {
        Self { planner }
    }
    /// Elementwise addition over `Value`s. GPU-resident operands are first
    /// gathered back to host tensors, then the resolved host values are
    /// dispatched according to the planner's decision.
    pub async fn elementwise_add(&self, a: &Value, b: &Value) -> anyhow::Result<Value> {
        match (a, b) {
            (Value::GpuTensor(ga), Value::GpuTensor(gb)) => {
                let ha = self.gather_handle(ga).await?;
                let hb = self.gather_handle(gb).await?;
                self.elementwise_add_resolved(&ha, &hb)
            }
            (Value::GpuTensor(ga), other) => {
                let ha = self.gather_handle(ga).await?;
                self.elementwise_add_resolved(&ha, other)
            }
            (other, Value::GpuTensor(gb)) => {
                let hb = self.gather_handle(gb).await?;
                self.elementwise_add_resolved(other, &hb)
            }
            _ => self.elementwise_add_resolved(a, b),
        }
    }
    /// Add two host-resident values: tensor+tensor may round-trip through the
    /// planner's backend (upload, add, download); everything else is delegated
    /// to the runtime's `plus` builtin.
    fn elementwise_add_resolved(&self, a: &Value, b: &Value) -> anyhow::Result<Value> {
        match (a, b) {
            (Value::Tensor(ma), Value::Tensor(mb)) => match self.planner.choose_elem_add(ma, mb) {
                ExecutionTarget::Cpu => {
                    runmat_runtime::call_builtin("plus", &[a.clone(), b.clone()])
                        .map_err(|e| anyhow::anyhow!(e))
                }
                ExecutionTarget::Gpu(_) => {
                    let bk = self
                        .planner
                        .device()
                        .ok_or_else(|| anyhow::anyhow!("no backend"))?;
                    let da = bk.upload_matrix(ma)?;
                    let db = bk.upload_matrix(mb)?;
                    let dc = bk.elem_add(da.as_ref(), db.as_ref())?;
                    let out = bk.download_matrix(dc.as_ref())?;
                    Ok(Value::Tensor(out))
                }
            },
            _ => runmat_runtime::call_builtin("plus", &[a.clone(), b.clone()])
                .map_err(|e| anyhow::anyhow!(e)),
        }
    }
    /// Download a GPU tensor handle back into a host `Value::Tensor` via the
    /// registered provider.
    async fn gather_handle(
        &self,
        h: &runmat_accelerate_api::GpuTensorHandle,
    ) -> anyhow::Result<Value> {
        if let Some(p) = runmat_accelerate_api::provider() {
            let ht = p.download(h).await.map_err(|e| anyhow::anyhow!(e))?;
            let t = Tensor::new(ht.data, ht.shape).map_err(|e| anyhow::anyhow!(e))?;
            Ok(Value::Tensor(t))
        } else {
            // NOTE(review): with no provider registered this fabricates an
            // all-zeros tensor of the handle's shape rather than returning an
            // error — confirm this silent fallback is intentional.
            let shape = h.shape.clone();
            let total: usize = shape.iter().product();
            let zeros = Tensor::new(vec![0.0; total], shape).map_err(|e| anyhow::anyhow!(e))?;
            Ok(Value::Tensor(zeros))
        }
    }
}