runmat_accelerate/
lib.rs

//! RunMat Accelerate: GPU Acceleration Abstraction Layer
//!
//! Goals:
//! - Provide a backend-agnostic API surface that maps RunMat operations to GPU kernels.
//! - Support multiple backends via features (CUDA, ROCm, Metal, Vulkan, OpenCL, wgpu).
//! - Allow zero-copy interop with `runmat-builtins::Tensor` where possible.
//! - Defer actual kernel authoring to backend crates/modules; this crate defines traits and wiring.

use once_cell::sync::Lazy;
use runmat_builtins::{Tensor, Value};
use std::path::PathBuf;
use std::sync::RwLock;

pub mod backend;
pub mod fusion;
pub mod fusion_exec;
pub mod fusion_residency;
pub mod graph;
mod host_lu;
pub mod native_auto;
pub mod precision;
mod reduction_meta;
pub mod simple_provider;
mod sortrows_host;
pub mod telemetry;
pub use fusion::*;
pub use graph::*;
pub use native_auto::{
    apply_auto_offload_calibration_from_file, auto_offload_report, is_sink, prepare_builtin_args,
    promote_binary, promote_reduction_args, promote_unary, reset_auto_offload_log,
    AutoOffloadCalibrationOutcome, AutoOffloadCalibrationSummary, AutoOffloadDecisionEntry,
    AutoOffloadDisposition, AutoOffloadReport, BinaryOp, CachedProviderInfo, DecisionReason,
    ReductionOp, ThresholdBase, ThresholdDelta, ThresholdDeltaEntry, ThresholdSnapshot, UnaryOp,
};
pub use reduction_meta::{value_is_all_keyword, ReductionAxes};
#[cfg(feature = "wgpu")]
use runmat_accelerate_api::AccelProvider;
use serde::{Deserialize, Serialize};
#[cfg(feature = "wgpu")]
use wgpu::PowerPreference;

/// Preferred acceleration provider selection.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
#[serde(rename_all = "kebab-case")]
pub enum AccelerateProviderPreference {
    #[default]
    Auto,
    Wgpu,
    InProcess,
}

/// Power preference used when initializing a WGPU backend.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
#[serde(rename_all = "kebab-case")]
pub enum AccelPowerPreference {
    #[default]
    Auto,
    HighPerformance,
    LowPower,
}

/// Logging verbosity for auto-offload promotion decisions.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
#[serde(rename_all = "kebab-case")]
pub enum AutoOffloadLogLevel {
    Off,
    Info,
    #[default]
    Trace,
}

/// Configuration passed to the native auto-offload planner.
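///
/// Both optional fields carry `#[serde(default)]`, so partial documents
/// deserialize cleanly. A minimal sketch (marked `ignore`; it assumes
/// `serde_json` is available as a dev-dependency):
/// ```ignore
/// let opts: runmat_accelerate::AutoOffloadOptions =
///     serde_json::from_str(r#"{"enabled": true, "calibrate": false}"#).unwrap();
/// assert!(opts.profile_path.is_none());
/// assert_eq!(opts.log_level, runmat_accelerate::AutoOffloadLogLevel::Trace);
/// ```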
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AutoOffloadOptions {
    pub enabled: bool,
    pub calibrate: bool,
    #[serde(default)]
    pub profile_path: Option<PathBuf>,
    #[serde(default)]
    pub log_level: AutoOffloadLogLevel,
}

impl Default for AutoOffloadOptions {
    fn default() -> Self {
        Self {
            enabled: true,
            calibrate: true,
            profile_path: None,
            log_level: AutoOffloadLogLevel::Trace,
        }
    }
}

static AUTO_OFFLOAD_OPTIONS: Lazy<RwLock<AutoOffloadOptions>> =
    Lazy::new(|| RwLock::new(AutoOffloadOptions::default()));

// One-time registration of cross-crate hooks; forced via `ensure_residency_hooks`.
static API_HOOKS: Lazy<()> = Lazy::new(|| {
    runmat_accelerate_api::register_residency_clear(fusion_residency::clear);
    runmat_accelerate_api::register_sequence_threshold_provider(sequence_threshold_hint_bridge);
});

/// Force one-time registration of the residency-clear and sequence-threshold hooks.
pub(crate) fn ensure_residency_hooks() {
    Lazy::force(&API_HOOKS);
}

fn sequence_threshold_hint_bridge() -> Option<usize> {
    native_auto::sequence_threshold_hint()
}

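/// Replace the process-wide auto-offload options.
///
/// A minimal usage sketch (the field values are illustrative, not recommended
/// settings):
/// ```no_run
/// use runmat_accelerate::{configure_auto_offload, AutoOffloadLogLevel, AutoOffloadOptions};
///
/// configure_auto_offload(AutoOffloadOptions {
///     enabled: true,
///     calibrate: false,
///     profile_path: None,
///     log_level: AutoOffloadLogLevel::Info,
/// });
/// ```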
pub fn configure_auto_offload(options: AutoOffloadOptions) {
    if let Ok(mut guard) = AUTO_OFFLOAD_OPTIONS.write() {
        *guard = options;
    }
}

pub(crate) fn auto_offload_options() -> AutoOffloadOptions {
    AUTO_OFFLOAD_OPTIONS
        .read()
        .map(|guard| guard.clone())
        .unwrap_or_default()
}

/// Initialization options for selecting and configuring the acceleration provider.
#[derive(Debug, Clone)]
pub struct AccelerateInitOptions {
    pub enabled: bool,
    pub provider: AccelerateProviderPreference,
    pub allow_inprocess_fallback: bool,
    pub wgpu_power_preference: AccelPowerPreference,
    pub wgpu_force_fallback_adapter: bool,
    pub auto_offload: AutoOffloadOptions,
}

impl Default for AccelerateInitOptions {
    fn default() -> Self {
        Self {
            enabled: true,
            provider: AccelerateProviderPreference::Auto,
            allow_inprocess_fallback: true,
            wgpu_power_preference: AccelPowerPreference::Auto,
            wgpu_force_fallback_adapter: false,
            auto_offload: AutoOffloadOptions::default(),
        }
    }
}

/// Initialize the global acceleration provider using the supplied options.
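///
/// If a provider is already registered, the call only refreshes the
/// auto-offload options. A minimal sketch of explicit initialization (the
/// provider choice shown is illustrative):
/// ```no_run
/// use runmat_accelerate::{
///     initialize_acceleration_provider_with, AccelerateInitOptions, AccelerateProviderPreference,
/// };
///
/// let options = AccelerateInitOptions {
///     provider: AccelerateProviderPreference::Wgpu,
///     allow_inprocess_fallback: true,
///     ..Default::default()
/// };
/// initialize_acceleration_provider_with(&options);
/// ```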
pub fn initialize_acceleration_provider_with(options: &AccelerateInitOptions) {
    configure_auto_offload(options.auto_offload.clone());

    if runmat_accelerate_api::provider().is_some() {
        return;
    }

    if !options.enabled {
        if options.allow_inprocess_fallback {
            simple_provider::register_inprocess_provider();
            log::info!(
                "RunMat Accelerate: acceleration disabled; using in-process provider for compatibility"
            );
        } else {
            log::info!("RunMat Accelerate: acceleration disabled; no provider registered");
        }
        return;
    }

    let registered = {
        #[cfg(feature = "wgpu")]
        {
            let mut reg = false;
            if matches!(
                options.provider,
                AccelerateProviderPreference::Auto | AccelerateProviderPreference::Wgpu
            ) {
                let wgpu_options = backend::wgpu::provider::WgpuProviderOptions {
                    power_preference: match options.wgpu_power_preference {
                        // `Auto` currently favors the high-performance adapter.
                        AccelPowerPreference::Auto => PowerPreference::HighPerformance,
                        AccelPowerPreference::HighPerformance => PowerPreference::HighPerformance,
                        AccelPowerPreference::LowPower => PowerPreference::LowPower,
                    },
                    force_fallback_adapter: options.wgpu_force_fallback_adapter,
                };

                match backend::wgpu::provider::register_wgpu_provider(wgpu_options) {
                    Ok(provider) => {
                        reg = true;
                        let info = provider.device_info_struct();
                        let backend = info.backend.as_deref().unwrap_or("unknown");
                        log::info!(
                            "RunMat Accelerate: using WGPU provider {} (vendor: {}, backend: {})",
                            info.name,
                            info.vendor,
                            backend
                        );
                        // Warmup to amortize first-dispatch costs
                        provider.warmup();
                        let (hits, misses) = provider.fused_cache_counters();
                        log::info!(
                            "RunMat Accelerate: fused pipeline cache after warmup - hits: {}, misses: {}",
                            hits, misses
                        );
                    }
                    Err(err) => {
                        log::warn!(
                            "RunMat Accelerate: failed to initialize WGPU provider, falling back: {err}"
                        );
                    }
                }
            }
            reg
        }
        #[cfg(not(feature = "wgpu"))]
        {
            if matches!(options.provider, AccelerateProviderPreference::Wgpu) {
                log::warn!(
                    "RunMat Accelerate: WGPU provider requested but crate built without 'wgpu' feature"
                );
            }
            false
        }
    };

    if !registered {
        if options.allow_inprocess_fallback
            || matches!(options.provider, AccelerateProviderPreference::InProcess)
        {
            simple_provider::register_inprocess_provider();
            log::info!("RunMat Accelerate: using in-process acceleration provider");
        } else {
            log::warn!("RunMat Accelerate: no acceleration provider registered");
        }
    }
}

/// Initialize the acceleration provider using default options.
pub fn initialize_acceleration_provider() {
    initialize_acceleration_provider_with(&AccelerateInitOptions::default());
}

/// Return fused pipeline cache statistics if the active provider exposes them.
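///
/// A sketch of polling the counters after a workload (requires the `wgpu`
/// feature; the tuple is `(hits, misses)`):
/// ```no_run
/// if let Some((hits, misses)) = runmat_accelerate::provider_cache_stats() {
///     log::info!("fused pipeline cache: {hits} hits, {misses} misses");
/// }
/// ```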
#[cfg(feature = "wgpu")]
pub fn provider_cache_stats() -> Option<(u64, u64)> {
    runmat_accelerate_api::provider().map(|p| p.fused_cache_counters())
}

/// High-level device kind. Concrete selection is provided by the backend.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum DeviceKind {
    Cpu,
    Cuda,
    Rocm,
    Metal,
    Vulkan,
    OpenCl,
    Wgpu,
}

/// Device descriptor used for selection and capability queries.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DeviceInfo {
    pub kind: DeviceKind,
    pub name: String,
    pub vendor: String,
    pub memory_bytes: Option<u64>,
    pub compute_capability: Option<String>,
}

/// Abstract buffer that may reside on device or be host-pinned.
pub trait BufferHandle: Send + Sync {
    fn len(&self) -> usize;
    fn is_empty(&self) -> bool {
        self.len() == 0
    }
}

/// Abstract matrix allocated on a device backend.
pub trait DeviceMatrix: Send + Sync {
    fn rows(&self) -> usize;
    fn cols(&self) -> usize;
    fn as_buffer(&self) -> &dyn BufferHandle;
}

/// Core backend interface that concrete backends must implement.
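///
/// A trimmed sketch of a conforming backend (marked `ignore`; `NullBackend` is
/// a hypothetical example, and a real impl must also provide the memory,
/// elementwise, and linear-algebra methods elided below):
/// ```ignore
/// use runmat_accelerate::{AccelerateBackend, DeviceInfo, DeviceKind};
///
/// struct NullBackend;
///
/// impl AccelerateBackend for NullBackend {
///     fn device_info(&self) -> DeviceInfo {
///         DeviceInfo {
///             kind: DeviceKind::Cpu,
///             name: "null".to_string(),
///             vendor: "none".to_string(),
///             memory_bytes: None,
///             compute_capability: None,
///         }
///     }
///     // upload_matrix, download_matrix, elem_*, matmul, transpose elided...
/// }
/// ```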
pub trait AccelerateBackend: Send + Sync {
    fn device_info(&self) -> DeviceInfo;

    // Memory
    fn upload_matrix(&self, host: &Tensor) -> anyhow::Result<Box<dyn DeviceMatrix>>;
    fn download_matrix(&self, dev: &dyn DeviceMatrix) -> anyhow::Result<Tensor>;

    // Elementwise
    fn elem_add(
        &self,
        a: &dyn DeviceMatrix,
        b: &dyn DeviceMatrix,
    ) -> anyhow::Result<Box<dyn DeviceMatrix>>;
    fn elem_sub(
        &self,
        a: &dyn DeviceMatrix,
        b: &dyn DeviceMatrix,
    ) -> anyhow::Result<Box<dyn DeviceMatrix>>;
    fn elem_mul(
        &self,
        a: &dyn DeviceMatrix,
        b: &dyn DeviceMatrix,
    ) -> anyhow::Result<Box<dyn DeviceMatrix>>;
    fn elem_ne(
        &self,
        a: &dyn DeviceMatrix,
        b: &dyn DeviceMatrix,
    ) -> anyhow::Result<Box<dyn DeviceMatrix>>;
    fn elem_eq(
        &self,
        a: &dyn DeviceMatrix,
        b: &dyn DeviceMatrix,
    ) -> anyhow::Result<Box<dyn DeviceMatrix>>;
    fn elem_div(
        &self,
        a: &dyn DeviceMatrix,
        b: &dyn DeviceMatrix,
    ) -> anyhow::Result<Box<dyn DeviceMatrix>>;
    fn elem_pow(
        &self,
        a: &dyn DeviceMatrix,
        b: &dyn DeviceMatrix,
    ) -> anyhow::Result<Box<dyn DeviceMatrix>>;

    // Linear algebra (future): matmul, transpose, BLAS/LAPACK analogs
    fn matmul(
        &self,
        a: &dyn DeviceMatrix,
        b: &dyn DeviceMatrix,
    ) -> anyhow::Result<Box<dyn DeviceMatrix>>;
    fn transpose(&self, a: &dyn DeviceMatrix) -> anyhow::Result<Box<dyn DeviceMatrix>>;
}

/// Planner determines whether to execute on the CPU or a selected backend.
/// This will eventually consult sizes, heuristics, and device availability.
#[derive(Default)]
pub struct Planner {
    backend: Option<Box<dyn AccelerateBackend>>,
}

impl Planner {
    pub fn new(backend: Option<Box<dyn AccelerateBackend>>) -> Self {
        Self { backend }
    }

    pub fn device(&self) -> Option<&dyn AccelerateBackend> {
        self.backend.as_deref()
    }

    /// Example decision hook: execute elementwise add on the GPU if the
    /// operands are large enough and conformable.
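    ///
    /// Without a backend the planner always chooses the CPU; a small sketch
    /// (marked `ignore` since the tensor construction here is illustrative):
    /// ```ignore
    /// use runmat_accelerate::{ExecutionTarget, Planner};
    /// use runmat_builtins::Tensor;
    ///
    /// let planner = Planner::new(None);
    /// let a = Tensor::new(vec![0.0; 4], vec![2, 2]).unwrap();
    /// let b = Tensor::new(vec![1.0; 4], vec![2, 2]).unwrap();
    /// assert!(matches!(planner.choose_elem_add(&a, &b), ExecutionTarget::Cpu));
    /// ```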
    pub fn choose_elem_add(&self, a: &Tensor, b: &Tensor) -> ExecutionTarget {
        if let Some(bk) = &self.backend {
            // Offload only when shapes match and the element count crosses the
            // 2^16-element threshold.
            if a.data.len() >= 1 << 16 && a.rows() == b.rows() && a.cols() == b.cols() {
                return ExecutionTarget::Gpu(bk.device_info());
            }
        }
        ExecutionTarget::Cpu
    }
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ExecutionTarget {
    Cpu,
    Gpu(DeviceInfo),
}

/// High-level façade for accelerated operations, falling back to `runmat-runtime`.
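///
/// A minimal host-side sketch (marked `ignore`; it assumes the `plus` builtin
/// from `runmat-runtime` is registered, which the CPU path below relies on):
/// ```ignore
/// use runmat_accelerate::{Accelerator, Planner};
/// use runmat_builtins::{Tensor, Value};
///
/// let accel = Accelerator::new(Planner::new(None));
/// let a = Value::Tensor(Tensor::new(vec![1.0, 2.0], vec![1, 2]).unwrap());
/// let b = Value::Tensor(Tensor::new(vec![3.0, 4.0], vec![1, 2]).unwrap());
/// let sum = accel.elementwise_add(&a, &b).unwrap();
/// ```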
pub struct Accelerator {
    planner: Planner,
}

impl Accelerator {
    pub fn new(planner: Planner) -> Self {
        Self { planner }
    }

    pub fn elementwise_add(&self, a: &Value, b: &Value) -> anyhow::Result<Value> {
        match (a, b) {
            (Value::Tensor(ma), Value::Tensor(mb)) => match self.planner.choose_elem_add(ma, mb) {
                ExecutionTarget::Cpu => {
                    runmat_runtime::call_builtin("plus", &[a.clone(), b.clone()])
                        .map_err(|e| anyhow::anyhow!(e))
                }
                ExecutionTarget::Gpu(_) => {
                    let bk = self
                        .planner
                        .device()
                        .ok_or_else(|| anyhow::anyhow!("no backend"))?;
                    let da = bk.upload_matrix(ma)?;
                    let db = bk.upload_matrix(mb)?;
                    let dc = bk.elem_add(da.as_ref(), db.as_ref())?;
                    let out = bk.download_matrix(dc.as_ref())?;
                    Ok(Value::Tensor(out))
                }
            },
            (Value::GpuTensor(ga), Value::GpuTensor(gb)) => {
                // Placeholder: assumes both handles live on the same device; a
                // device registry will eventually look buffers up by id. Until
                // then, gather both operands to the host and add on the CPU.
                let ha = self.gather_handle(ga)?;
                let hb = self.gather_handle(gb)?;
                self.elementwise_add(&ha, &hb)
            }
            (Value::GpuTensor(ga), other) => {
                let ha = self.gather_handle(ga)?;
                self.elementwise_add(&ha, other)
            }
            (other, Value::GpuTensor(gb)) => {
                let hb = self.gather_handle(gb)?;
                self.elementwise_add(other, &hb)
            }
            _ => runmat_runtime::call_builtin("plus", &[a.clone(), b.clone()])
                .map_err(|e| anyhow::anyhow!(e)),
        }
    }

    /// Download a GPU tensor handle into a host `Value::Tensor`.
    fn gather_handle(&self, h: &runmat_accelerate_api::GpuTensorHandle) -> anyhow::Result<Value> {
        if let Some(p) = runmat_accelerate_api::provider() {
            let ht = p.download(h).map_err(|e| anyhow::anyhow!(e))?;
            let t = Tensor::new(ht.data, ht.shape).map_err(|e| anyhow::anyhow!(e))?;
            Ok(Value::Tensor(t))
        } else {
            // Fall back to zeros of the same shape if no provider is registered.
            let shape = h.shape.clone();
            let total: usize = shape.iter().product();
            let zeros = Tensor::new(vec![0.0; total], shape).map_err(|e| anyhow::anyhow!(e))?;
            Ok(Value::Tensor(zeros))
        }
    }
}

#[cfg(test)]
mod tests {
    #[cfg(feature = "wgpu")]
    use crate::backend::wgpu::cache::key::compute_pipeline_hash_bytes;

    #[test]
    #[cfg(feature = "wgpu")]
    fn elementwise_hash_varies_with_arity() {
        let wg = 256u32;
        let h2 = compute_pipeline_hash_bytes(b"shader", "runmat-fusion-layout-2", Some(wg));
        let h3 = compute_pipeline_hash_bytes(b"shader", "runmat-fusion-layout-3", Some(wg));
        assert_ne!(h2, h3, "hash should differ with input arity");
    }
}