// runmat_accelerate/lib.rs

1//! RunMat Accelerate: GPU Acceleration Abstraction Layer
2//!
3//! Goals:
4//! - Provide a backend-agnostic API surface that maps RunMat operations to GPU kernels.
5//! - Support multiple backends via features (CUDA, ROCm, Metal, Vulkan, OpenCL, wgpu).
6//! - Allow zero-copy interop with `runmat-builtins::Matrix` where possible.
7//! - Defer actual kernel authoring to backend crates/modules; this crate defines traits and wiring.
8
9use once_cell::sync::Lazy;
10use runmat_builtins::{Tensor, Value};
11use std::path::PathBuf;
12use std::sync::RwLock;
13
14pub mod backend;
15pub mod fusion;
16pub mod fusion_exec;
17pub mod fusion_residency;
18pub mod graph;
19mod host_lu;
20pub mod native_auto;
21pub mod precision;
22mod reduction_meta;
23pub mod simple_provider;
24mod sortrows_host;
25pub mod telemetry;
26#[cfg(target_arch = "wasm32")]
27mod web_auto_offload_store;
28#[cfg(feature = "wgpu")]
29use crate::backend::wgpu::provider::WgpuProvider;
30pub use fusion::*;
31pub use graph::*;
32pub use native_auto::{
33    apply_auto_offload_calibration_from_file, auto_offload_report, is_sink, prepare_builtin_args,
34    promote_binary, promote_reduction_args, promote_unary, reset_auto_offload_log,
35    AutoOffloadCalibrationOutcome, AutoOffloadCalibrationSummary, AutoOffloadDecisionEntry,
36    AutoOffloadDisposition, AutoOffloadReport, BinaryOp, CachedProviderInfo, DecisionReason,
37    ReductionOp, ThresholdBase, ThresholdDelta, ThresholdDeltaEntry, ThresholdSnapshot, UnaryOp,
38};
39pub use reduction_meta::{value_is_all_keyword, ReductionAxes};
40#[cfg(feature = "wgpu")]
41use runmat_accelerate_api::AccelProvider;
42use serde::{Deserialize, Serialize};
43#[cfg(feature = "wgpu")]
44use wgpu::PowerPreference;
45
46/// Preferred acceleration provider selection
47#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
48#[serde(rename_all = "kebab-case")]
49pub enum AccelerateProviderPreference {
50    Auto,
51    Wgpu,
52    InProcess,
53}
54
55impl Default for AccelerateProviderPreference {
56    fn default() -> Self {
57        Self::Auto
58    }
59}
60
61/// Power preference used when initializing a WGPU backend
62#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
63#[serde(rename_all = "kebab-case")]
64pub enum AccelPowerPreference {
65    Auto,
66    HighPerformance,
67    LowPower,
68}
69
70impl Default for AccelPowerPreference {
71    fn default() -> Self {
72        Self::Auto
73    }
74}
75
/// Logging verbosity for auto-offload promotion decisions.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
#[serde(rename_all = "kebab-case")]
pub enum AutoOffloadLogLevel {
    /// No decision logging.
    Off,
    /// Informational logging.
    Info,
    /// Most verbose level; the default.
    #[default]
    Trace,
}
85
/// Configuration passed to the native auto-offload planner.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AutoOffloadOptions {
    /// Master switch for auto-offload planning.
    pub enabled: bool,
    /// Whether calibration is enabled (consumed by the `native_auto` planner).
    pub calibrate: bool,
    /// Optional path to a calibration profile file; `None` means no file.
    #[serde(default)]
    pub profile_path: Option<PathBuf>,
    /// Verbosity for promotion-decision logging.
    #[serde(default)]
    pub log_level: AutoOffloadLogLevel,
}
96
97impl Default for AutoOffloadOptions {
98    fn default() -> Self {
99        Self {
100            enabled: true,
101            calibrate: true,
102            profile_path: None,
103            log_level: AutoOffloadLogLevel::Trace,
104        }
105    }
106}
107
/// Process-wide auto-offload options, guarded by an `RwLock` so they can be
/// reconfigured at runtime via `configure_auto_offload`.
static AUTO_OFFLOAD_OPTIONS: Lazy<RwLock<AutoOffloadOptions>> =
    Lazy::new(|| RwLock::new(AutoOffloadOptions::default()));
110
/// One-time registration of callback hooks with `runmat_accelerate_api`.
/// `Lazy` guarantees the registrations run at most once per process; forced
/// via `ensure_residency_hooks`.
static API_HOOKS: Lazy<()> = Lazy::new(|| {
    runmat_accelerate_api::register_residency_mark(fusion_residency::mark);
    runmat_accelerate_api::register_residency_clear(fusion_residency::clear);
    runmat_accelerate_api::register_sequence_threshold_provider(sequence_threshold_hint_bridge);
    runmat_accelerate_api::register_workgroup_size_hint_provider(workgroup_size_hint_bridge);
});
117
118pub(crate) fn ensure_residency_hooks() {
119    Lazy::force(&API_HOOKS);
120}
121
122fn sequence_threshold_hint_bridge() -> Option<usize> {
123    native_auto::sequence_threshold_hint()
124}
125
/// Bridge the backend workgroup-size hint into the accelerate API hook.
/// Returns `None` when the crate is built without the `wgpu` feature.
fn workgroup_size_hint_bridge() -> Option<u32> {
    #[cfg(feature = "wgpu")]
    {
        Some(crate::backend::wgpu::config::effective_workgroup_size())
    }
    #[cfg(not(feature = "wgpu"))]
    {
        None
    }
}
136
137pub fn configure_auto_offload(options: AutoOffloadOptions) {
138    ensure_residency_hooks();
139    if let Ok(mut guard) = AUTO_OFFLOAD_OPTIONS.write() {
140        *guard = options;
141    }
142}
143
144pub(crate) fn auto_offload_options() -> AutoOffloadOptions {
145    AUTO_OFFLOAD_OPTIONS
146        .read()
147        .map(|guard| guard.clone())
148        .unwrap_or_default()
149}
150
/// Initialization options for selecting and configuring the acceleration provider.
#[derive(Debug, Clone)]
pub struct AccelerateInitOptions {
    /// Master switch; when `false`, no GPU provider is initialized.
    pub enabled: bool,
    /// Which provider to prefer (auto, WGPU, or in-process).
    pub provider: AccelerateProviderPreference,
    /// Whether to register the in-process CPU provider when no GPU provider
    /// was registered.
    pub allow_inprocess_fallback: bool,
    /// Power preference forwarded to WGPU adapter selection.
    pub wgpu_power_preference: AccelPowerPreference,
    /// Forwarded to `WgpuProviderOptions::force_fallback_adapter`.
    pub wgpu_force_fallback_adapter: bool,
    /// Options forwarded to the auto-offload planner.
    pub auto_offload: AutoOffloadOptions,
}
161
162impl Default for AccelerateInitOptions {
163    fn default() -> Self {
164        Self {
165            enabled: true,
166            provider: AccelerateProviderPreference::Auto,
167            allow_inprocess_fallback: true,
168            wgpu_power_preference: AccelPowerPreference::Auto,
169            wgpu_force_fallback_adapter: false,
170            auto_offload: AutoOffloadOptions::default(),
171        }
172    }
173}
174
175/// Initialize the global acceleration provider using the supplied options.
176pub fn initialize_acceleration_provider_with(options: &AccelerateInitOptions) {
177    configure_auto_offload(options.auto_offload.clone());
178
179    if runmat_accelerate_api::provider().is_some() {
180        return;
181    }
182
183    if !options.enabled {
184        if options.allow_inprocess_fallback {
185            simple_provider::register_inprocess_provider();
186            log::info!(
187                "RunMat Accelerate: acceleration disabled; using in-process provider for compatibility"
188            );
189        } else {
190            log::info!("RunMat Accelerate: acceleration disabled; no provider registered");
191        }
192        return;
193    }
194
195    let registered = {
196        #[cfg(all(feature = "wgpu", not(target_arch = "wasm32")))]
197        {
198            let mut reg = false;
199            if matches!(
200                options.provider,
201                AccelerateProviderPreference::Auto | AccelerateProviderPreference::Wgpu
202            ) {
203                let wgpu_options = backend::wgpu::provider::WgpuProviderOptions {
204                    power_preference: match options.wgpu_power_preference {
205                        AccelPowerPreference::Auto => PowerPreference::HighPerformance,
206                        AccelPowerPreference::HighPerformance => PowerPreference::HighPerformance,
207                        AccelPowerPreference::LowPower => PowerPreference::LowPower,
208                    },
209                    force_fallback_adapter: options.wgpu_force_fallback_adapter,
210                };
211
212                match backend::wgpu::provider::register_wgpu_provider(wgpu_options) {
213                    Ok(provider) => {
214                        reg = true;
215                        announce_wgpu_provider(provider);
216                    }
217                    Err(err) => {
218                        log::warn!(
219                            "RunMat Accelerate: failed to initialize WGPU provider, falling back: {err}"
220                        );
221                    }
222                }
223            }
224            reg
225        }
226        #[cfg(all(feature = "wgpu", target_arch = "wasm32"))]
227        {
228            if matches!(
229                options.provider,
230                AccelerateProviderPreference::Auto | AccelerateProviderPreference::Wgpu
231            ) {
232                log::info!(
233                    "RunMat Accelerate: wasm builds require calling initialize_wgpu_provider_async to enable the WGPU backend"
234                );
235            }
236            false
237        }
238        #[cfg(not(feature = "wgpu"))]
239        {
240            if matches!(options.provider, AccelerateProviderPreference::Wgpu) {
241                log::warn!(
242                    "RunMat Accelerate: WGPU provider requested but crate built without 'wgpu' feature"
243                );
244            }
245            false
246        }
247    };
248
249    if !registered {
250        if options.allow_inprocess_fallback
251            || matches!(options.provider, AccelerateProviderPreference::InProcess)
252        {
253            simple_provider::register_inprocess_provider();
254            log::info!("RunMat Accelerate: using in-process acceleration provider");
255        } else {
256            log::warn!("RunMat Accelerate: no acceleration provider registered");
257        }
258    }
259}
260
/// Log which WGPU adapter was selected and warm up its pipelines.
#[cfg(feature = "wgpu")]
fn announce_wgpu_provider(provider: &WgpuProvider) {
    // Report the selected adapter so users can confirm which device and
    // backend are actually in use.
    let info = provider.device_info_struct();
    let backend = info.backend.as_deref().unwrap_or("unknown");
    log::info!(
        "RunMat Accelerate: using WGPU provider {} (vendor: {}, backend: {})",
        info.name,
        info.vendor,
        backend
    );
    // Warm up eagerly, then report fused-pipeline cache activity.
    provider.warmup();
    let (hits, misses) = provider.fused_cache_counters();
    log::info!(
        "RunMat Accelerate: fused pipeline cache after warmup - hits: {}, misses: {}",
        hits,
        misses
    );
}
279
/// Async initialization entry point for wasm builds, where WGPU adapter and
/// device acquisition must be awaited.
///
/// Mirrors `initialize_acceleration_provider_with`: idempotent when a
/// provider is already registered, honors `enabled` and the provider
/// preference, and registers the in-process provider when fallback is
/// allowed or explicitly requested.
#[cfg(all(feature = "wgpu", target_arch = "wasm32"))]
pub async fn initialize_wgpu_provider_async(options: &AccelerateInitOptions) -> anyhow::Result<()> {
    configure_auto_offload(options.auto_offload.clone());

    // A provider is already in place; nothing to do.
    if runmat_accelerate_api::provider().is_some() {
        return Ok(());
    }

    if !options.enabled {
        if options.allow_inprocess_fallback
            || matches!(options.provider, AccelerateProviderPreference::InProcess)
        {
            simple_provider::register_inprocess_provider();
            log::info!(
                "RunMat Accelerate: acceleration disabled; using in-process acceleration provider"
            );
        } else {
            log::info!("RunMat Accelerate: acceleration disabled; no provider registered");
        }
        return Ok(());
    }

    let mut registered = false;
    if matches!(
        options.provider,
        AccelerateProviderPreference::Auto | AccelerateProviderPreference::Wgpu
    ) {
        let wgpu_options = backend::wgpu::provider::WgpuProviderOptions {
            // `Auto` currently maps to high performance.
            power_preference: match options.wgpu_power_preference {
                AccelPowerPreference::Auto => PowerPreference::HighPerformance,
                AccelPowerPreference::HighPerformance => PowerPreference::HighPerformance,
                AccelPowerPreference::LowPower => PowerPreference::LowPower,
            },
            force_fallback_adapter: options.wgpu_force_fallback_adapter,
        };

        match backend::wgpu::provider::register_wgpu_provider_async(wgpu_options).await {
            Ok(provider) => {
                registered = true;
                announce_wgpu_provider(provider);
            }
            Err(err) => {
                log::warn!(
                    "RunMat Accelerate: failed to initialize WGPU provider, falling back: {err}"
                );
            }
        }
    }

    if !registered {
        if options.allow_inprocess_fallback
            || matches!(options.provider, AccelerateProviderPreference::InProcess)
        {
            simple_provider::register_inprocess_provider();
            log::info!("RunMat Accelerate: using in-process acceleration provider");
        } else {
            log::warn!("RunMat Accelerate: no acceleration provider registered");
        }
    }

    Ok(())
}
342
343/// Initialize the acceleration provider using default options.
344pub fn initialize_acceleration_provider() {
345    initialize_acceleration_provider_with(&AccelerateInitOptions::default());
346}
347
#[cfg(test)]
mod tests {
    #[cfg(feature = "wgpu")]
    use crate::backend::wgpu::cache::key::compute_pipeline_hash_bytes;

    /// The fused-pipeline cache key must distinguish layouts of different arity.
    #[test]
    #[cfg(feature = "wgpu")]
    fn elementwise_hash_varies_with_arity() {
        let workgroup = 256u32;
        let two_input =
            compute_pipeline_hash_bytes(b"shader", "runmat-fusion-layout-2", Some(workgroup));
        let three_input =
            compute_pipeline_hash_bytes(b"shader", "runmat-fusion-layout-3", Some(workgroup));
        assert_ne!(two_input, three_input, "hash should differ with input arity");
    }
}
362
/// Return fused pipeline cache statistics if the active provider exposes them.
#[cfg(feature = "wgpu")]
pub fn provider_cache_stats() -> Option<(u64, u64)> {
    let provider = runmat_accelerate_api::provider()?;
    Some(provider.fused_cache_counters())
}
368
/// High-level device kind. Concrete selection is provided by backend.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum DeviceKind {
    /// Host CPU execution.
    Cpu,
    /// NVIDIA CUDA.
    Cuda,
    /// AMD ROCm.
    Rocm,
    /// Apple Metal.
    Metal,
    /// Vulkan compute.
    Vulkan,
    /// OpenCL.
    OpenCl,
    /// Portable WebGPU (`wgpu`) backend.
    Wgpu,
}
380
/// Device descriptor used for selection and capabilities query.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DeviceInfo {
    /// Backend family this device belongs to.
    pub kind: DeviceKind,
    /// Human-readable device name.
    pub name: String,
    /// Device vendor string.
    pub vendor: String,
    /// Device memory in bytes, when the backend can report it.
    pub memory_bytes: Option<u64>,
    /// Backend-specific capability string (e.g. CUDA compute capability), if any.
    pub compute_capability: Option<String>,
}
390
/// Abstract buffer that may reside on device or be host-pinned.
pub trait BufferHandle: Send + Sync {
    /// Length of the buffer (units are implementation-defined — elements vs
    /// bytes; TODO confirm against concrete backends).
    fn len(&self) -> usize;
    /// True when the buffer holds no data.
    fn is_empty(&self) -> bool {
        self.len() == 0
    }
}
398
/// Abstract matrix allocated on a device backend.
pub trait DeviceMatrix: Send + Sync {
    /// Number of rows.
    fn rows(&self) -> usize;
    /// Number of columns.
    fn cols(&self) -> usize;
    /// Underlying device buffer backing this matrix.
    fn as_buffer(&self) -> &dyn BufferHandle;
}
405
/// Core backend interface that concrete backends must implement.
///
/// All binary operations take two device-resident matrices and return a new
/// device-resident result; errors are surfaced via `anyhow::Result`.
pub trait AccelerateBackend: Send + Sync {
    /// Describe the device this backend drives.
    fn device_info(&self) -> DeviceInfo;

    // Memory
    /// Copy a host tensor into device memory.
    fn upload_matrix(&self, host: &Tensor) -> anyhow::Result<Box<dyn DeviceMatrix>>;
    /// Copy a device matrix back into a host tensor.
    fn download_matrix(&self, dev: &dyn DeviceMatrix) -> anyhow::Result<Tensor>;

    // Elementwise
    /// Elementwise addition.
    fn elem_add(
        &self,
        a: &dyn DeviceMatrix,
        b: &dyn DeviceMatrix,
    ) -> anyhow::Result<Box<dyn DeviceMatrix>>;
    /// Elementwise subtraction.
    fn elem_sub(
        &self,
        a: &dyn DeviceMatrix,
        b: &dyn DeviceMatrix,
    ) -> anyhow::Result<Box<dyn DeviceMatrix>>;
    /// Elementwise multiplication.
    fn elem_mul(
        &self,
        a: &dyn DeviceMatrix,
        b: &dyn DeviceMatrix,
    ) -> anyhow::Result<Box<dyn DeviceMatrix>>;
    /// Elementwise inequality comparison.
    fn elem_ne(
        &self,
        a: &dyn DeviceMatrix,
        b: &dyn DeviceMatrix,
    ) -> anyhow::Result<Box<dyn DeviceMatrix>>;
    /// Elementwise equality comparison.
    fn elem_eq(
        &self,
        a: &dyn DeviceMatrix,
        b: &dyn DeviceMatrix,
    ) -> anyhow::Result<Box<dyn DeviceMatrix>>;
    /// Elementwise division.
    fn elem_div(
        &self,
        a: &dyn DeviceMatrix,
        b: &dyn DeviceMatrix,
    ) -> anyhow::Result<Box<dyn DeviceMatrix>>;
    /// Elementwise power.
    fn elem_pow(
        &self,
        a: &dyn DeviceMatrix,
        b: &dyn DeviceMatrix,
    ) -> anyhow::Result<Box<dyn DeviceMatrix>>;

    // Linear algebra (future): matmul, transpose, BLAS/LAPACK analogs
    /// Matrix multiplication.
    fn matmul(
        &self,
        a: &dyn DeviceMatrix,
        b: &dyn DeviceMatrix,
    ) -> anyhow::Result<Box<dyn DeviceMatrix>>;
    /// Matrix transpose.
    fn transpose(&self, a: &dyn DeviceMatrix) -> anyhow::Result<Box<dyn DeviceMatrix>>;
}
459
/// Planner determines whether to execute on CPU or a selected backend.
/// This will eventually consult sizes, heuristics, and device availability.
#[derive(Default)]
pub struct Planner {
    // `None` means no accelerated backend is available; all work stays on CPU.
    backend: Option<Box<dyn AccelerateBackend>>,
}
466
467impl Planner {
468    pub fn new(backend: Option<Box<dyn AccelerateBackend>>) -> Self {
469        Self { backend }
470    }
471
472    pub fn device(&self) -> Option<&dyn AccelerateBackend> {
473        self.backend.as_deref()
474    }
475
476    /// Example decision hook: execute elementwise add on GPU if large enough.
477    pub fn choose_elem_add(&self, a: &Tensor, b: &Tensor) -> ExecutionTarget {
478        if let Some(bk) = &self.backend {
479            if a.data.len() >= 1 << 16 && a.rows() == b.rows() && a.cols() == b.cols() {
480                return ExecutionTarget::Gpu(bk.device_info());
481            }
482        }
483        ExecutionTarget::Cpu
484    }
485}
486
/// Where an operation should execute, as chosen by the `Planner`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ExecutionTarget {
    /// Run on the host CPU.
    Cpu,
    /// Run on the described GPU device.
    Gpu(DeviceInfo),
}
492
/// High-level façade for accelerated operations, falling back to `runmat-runtime`.
pub struct Accelerator {
    // Decides CPU vs GPU per operation.
    planner: Planner,
}
497
impl Accelerator {
    /// Build an accelerator around the given planner.
    pub fn new(planner: Planner) -> Self {
        Self { planner }
    }

    /// Elementwise addition of two runtime `Value`s.
    ///
    /// GPU-resident operands are first gathered (downloaded) to host tensors,
    /// then the host-side path decides between the CPU builtin and a backend
    /// round trip.
    pub async fn elementwise_add(&self, a: &Value, b: &Value) -> anyhow::Result<Value> {
        match (a, b) {
            // Both operands on GPU: download both before dispatching.
            (Value::GpuTensor(ga), Value::GpuTensor(gb)) => {
                let ha = self.gather_handle(ga).await?;
                let hb = self.gather_handle(gb).await?;
                self.elementwise_add_resolved(&ha, &hb)
            }
            // Mixed GPU/host operands: download only the GPU side.
            (Value::GpuTensor(ga), other) => {
                let ha = self.gather_handle(ga).await?;
                self.elementwise_add_resolved(&ha, other)
            }
            (other, Value::GpuTensor(gb)) => {
                let hb = self.gather_handle(gb).await?;
                self.elementwise_add_resolved(other, &hb)
            }
            _ => self.elementwise_add_resolved(a, b),
        }
    }

    /// Host-side addition: consult the planner, then either call the runtime
    /// `plus` builtin on CPU or round-trip through the backend (upload,
    /// elementwise add, download).
    fn elementwise_add_resolved(&self, a: &Value, b: &Value) -> anyhow::Result<Value> {
        match (a, b) {
            (Value::Tensor(ma), Value::Tensor(mb)) => match self.planner.choose_elem_add(ma, mb) {
                ExecutionTarget::Cpu => {
                    runmat_runtime::call_builtin("plus", &[a.clone(), b.clone()])
                        .map_err(|e| anyhow::anyhow!(e))
                }
                ExecutionTarget::Gpu(_) => {
                    let bk = self
                        .planner
                        .device()
                        .ok_or_else(|| anyhow::anyhow!("no backend"))?;
                    let da = bk.upload_matrix(ma)?;
                    let db = bk.upload_matrix(mb)?;
                    let dc = bk.elem_add(da.as_ref(), db.as_ref())?;
                    let out = bk.download_matrix(dc.as_ref())?;
                    Ok(Value::Tensor(out))
                }
            },
            // Non-tensor operands defer entirely to the runtime builtin.
            _ => runmat_runtime::call_builtin("plus", &[a.clone(), b.clone()])
                .map_err(|e| anyhow::anyhow!(e)),
        }
    }

    /// Download a GPU tensor handle into a host `Value::Tensor`.
    async fn gather_handle(
        &self,
        h: &runmat_accelerate_api::GpuTensorHandle,
    ) -> anyhow::Result<Value> {
        if let Some(p) = runmat_accelerate_api::provider() {
            let ht = p.download(h).await.map_err(|e| anyhow::anyhow!(e))?;
            let t = Tensor::new(ht.data, ht.shape).map_err(|e| anyhow::anyhow!(e))?;
            Ok(Value::Tensor(t))
        } else {
            // Fallback to zeros with same shape if no provider is registered
            // NOTE(review): this silently fabricates data; confirm callers
            // expect best-effort zeros rather than an error in this case.
            let shape = h.shape.clone();
            let total: usize = shape.iter().product();
            let zeros = Tensor::new(vec![0.0; total], shape).map_err(|e| anyhow::anyhow!(e))?;
            Ok(Value::Tensor(zeros))
        }
    }
}