1use once_cell::sync::Lazy;
10use runmat_builtins::{Tensor, Value};
11use std::path::PathBuf;
12use std::sync::RwLock;
13
14pub mod backend;
15pub mod fusion;
16pub mod fusion_exec;
17pub mod fusion_residency;
18pub mod graph;
19mod host_lu;
20pub mod native_auto;
21pub mod precision;
22mod reduction_meta;
23pub mod simple_provider;
24mod sortrows_host;
25pub mod telemetry;
26#[cfg(target_arch = "wasm32")]
27mod web_auto_offload_store;
28#[cfg(feature = "wgpu")]
29use crate::backend::wgpu::provider::WgpuProvider;
30pub use fusion::*;
31pub use graph::*;
32pub use native_auto::{
33 apply_auto_offload_calibration_from_file, auto_offload_report, is_sink, prepare_builtin_args,
34 promote_binary, promote_reduction_args, promote_unary, reset_auto_offload_log,
35 AutoOffloadCalibrationOutcome, AutoOffloadCalibrationSummary, AutoOffloadDecisionEntry,
36 AutoOffloadDisposition, AutoOffloadReport, BinaryOp, CachedProviderInfo, DecisionReason,
37 ReductionOp, ThresholdBase, ThresholdDelta, ThresholdDeltaEntry, ThresholdSnapshot, UnaryOp,
38};
39pub use reduction_meta::{value_is_all_keyword, ReductionAxes};
40#[cfg(feature = "wgpu")]
41use runmat_accelerate_api::AccelProvider;
42use serde::{Deserialize, Serialize};
43#[cfg(feature = "wgpu")]
44use wgpu::PowerPreference;
45
46#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
48#[serde(rename_all = "kebab-case")]
49pub enum AccelerateProviderPreference {
50 Auto,
51 Wgpu,
52 InProcess,
53}
54
55impl Default for AccelerateProviderPreference {
56 fn default() -> Self {
57 Self::Auto
58 }
59}
60
61#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
63#[serde(rename_all = "kebab-case")]
64pub enum AccelPowerPreference {
65 Auto,
66 HighPerformance,
67 LowPower,
68}
69
70impl Default for AccelPowerPreference {
71 fn default() -> Self {
72 Self::Auto
73 }
74}
75
/// Verbosity of the auto-offload decision log.
///
/// Serialized in kebab-case; the derived default is `Trace` (see the
/// `#[default]` variant), which agrees with `AutoOffloadOptions::default()`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
#[serde(rename_all = "kebab-case")]
pub enum AutoOffloadLogLevel {
    Off,
    Info,
    #[default]
    Trace,
}
85
/// Runtime-configurable options for automatic GPU offload.
///
/// Held in the process-wide `AUTO_OFFLOAD_OPTIONS` lock; written via
/// `configure_auto_offload` and read via `auto_offload_options`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AutoOffloadOptions {
    /// Master switch for auto-offload.
    pub enabled: bool,
    /// Whether calibration runs (exact semantics live in `native_auto` —
    /// see `apply_auto_offload_calibration_from_file`; TODO confirm).
    pub calibrate: bool,
    /// Optional calibration-profile path; missing field deserializes to `None`.
    #[serde(default)]
    pub profile_path: Option<PathBuf>,
    /// Decision-log verbosity; missing field deserializes to the default (`Trace`).
    #[serde(default)]
    pub log_level: AutoOffloadLogLevel,
}
96
97impl Default for AutoOffloadOptions {
98 fn default() -> Self {
99 Self {
100 enabled: true,
101 calibrate: true,
102 profile_path: None,
103 log_level: AutoOffloadLogLevel::Trace,
104 }
105 }
106}
107
/// Process-wide auto-offload configuration, guarded by an `RwLock` and
/// initialized lazily to `AutoOffloadOptions::default()`.
static AUTO_OFFLOAD_OPTIONS: Lazy<RwLock<AutoOffloadOptions>> =
    Lazy::new(|| RwLock::new(AutoOffloadOptions::default()));

/// One-shot hook registration into `runmat_accelerate_api`.
///
/// Forcing this `Lazy` (see `ensure_residency_hooks`) installs the
/// residency mark/clear callbacks and the threshold/workgroup hint
/// bridges exactly once for the lifetime of the process.
static API_HOOKS: Lazy<()> = Lazy::new(|| {
    runmat_accelerate_api::register_residency_mark(fusion_residency::mark);
    runmat_accelerate_api::register_residency_clear(fusion_residency::clear);
    runmat_accelerate_api::register_sequence_threshold_provider(sequence_threshold_hint_bridge);
    runmat_accelerate_api::register_workgroup_size_hint_provider(workgroup_size_hint_bridge);
});
117
/// Idempotently installs the `runmat_accelerate_api` hooks by forcing
/// the `API_HOOKS` lazy; safe to call from any entry point.
pub(crate) fn ensure_residency_hooks() {
    Lazy::force(&API_HOOKS);
}
121
/// Bridges the API's sequence-threshold query to the native auto-offload
/// heuristics (registered in `API_HOOKS`).
fn sequence_threshold_hint_bridge() -> Option<usize> {
    native_auto::sequence_threshold_hint()
}
125
/// Reports the effective GPU workgroup size when the WGPU backend is
/// compiled in; `None` otherwise (registered in `API_HOOKS`).
fn workgroup_size_hint_bridge() -> Option<u32> {
    #[cfg(feature = "wgpu")]
    {
        Some(crate::backend::wgpu::config::effective_workgroup_size())
    }
    #[cfg(not(feature = "wgpu"))]
    {
        None
    }
}
136
137pub fn configure_auto_offload(options: AutoOffloadOptions) {
138 ensure_residency_hooks();
139 if let Ok(mut guard) = AUTO_OFFLOAD_OPTIONS.write() {
140 *guard = options;
141 }
142}
143
144pub(crate) fn auto_offload_options() -> AutoOffloadOptions {
145 AUTO_OFFLOAD_OPTIONS
146 .read()
147 .map(|guard| guard.clone())
148 .unwrap_or_default()
149}
150
/// Options controlling provider selection at startup.
///
/// Consumed by `initialize_acceleration_provider_with` and (on wasm)
/// `initialize_wgpu_provider_async`.
#[derive(Debug, Clone)]
pub struct AccelerateInitOptions {
    /// When false, no GPU provider is attempted at all.
    pub enabled: bool,
    /// Which provider to prefer (`Auto`, `Wgpu`, or `InProcess`).
    pub provider: AccelerateProviderPreference,
    /// Register the in-process provider when no GPU provider is available.
    pub allow_inprocess_fallback: bool,
    /// Adapter power preference forwarded to WGPU.
    pub wgpu_power_preference: AccelPowerPreference,
    /// Forwarded to WGPU's `force_fallback_adapter` (software adapter).
    pub wgpu_force_fallback_adapter: bool,
    /// Auto-offload options applied before provider registration.
    pub auto_offload: AutoOffloadOptions,
}
161
162impl Default for AccelerateInitOptions {
163 fn default() -> Self {
164 Self {
165 enabled: true,
166 provider: AccelerateProviderPreference::Auto,
167 allow_inprocess_fallback: true,
168 wgpu_power_preference: AccelPowerPreference::Auto,
169 wgpu_force_fallback_adapter: false,
170 auto_offload: AutoOffloadOptions::default(),
171 }
172 }
173}
174
175pub fn initialize_acceleration_provider_with(options: &AccelerateInitOptions) {
177 configure_auto_offload(options.auto_offload.clone());
178
179 if runmat_accelerate_api::provider().is_some() {
180 return;
181 }
182
183 if !options.enabled {
184 if options.allow_inprocess_fallback {
185 simple_provider::register_inprocess_provider();
186 log::info!(
187 "RunMat Accelerate: acceleration disabled; using in-process provider for compatibility"
188 );
189 } else {
190 log::info!("RunMat Accelerate: acceleration disabled; no provider registered");
191 }
192 return;
193 }
194
195 let registered = {
196 #[cfg(all(feature = "wgpu", not(target_arch = "wasm32")))]
197 {
198 let mut reg = false;
199 if matches!(
200 options.provider,
201 AccelerateProviderPreference::Auto | AccelerateProviderPreference::Wgpu
202 ) {
203 let wgpu_options = backend::wgpu::provider::WgpuProviderOptions {
204 power_preference: match options.wgpu_power_preference {
205 AccelPowerPreference::Auto => PowerPreference::HighPerformance,
206 AccelPowerPreference::HighPerformance => PowerPreference::HighPerformance,
207 AccelPowerPreference::LowPower => PowerPreference::LowPower,
208 },
209 force_fallback_adapter: options.wgpu_force_fallback_adapter,
210 };
211
212 match backend::wgpu::provider::register_wgpu_provider(wgpu_options) {
213 Ok(provider) => {
214 reg = true;
215 announce_wgpu_provider(provider);
216 }
217 Err(err) => {
218 log::warn!(
219 "RunMat Accelerate: failed to initialize WGPU provider, falling back: {err}"
220 );
221 }
222 }
223 }
224 reg
225 }
226 #[cfg(all(feature = "wgpu", target_arch = "wasm32"))]
227 {
228 if matches!(
229 options.provider,
230 AccelerateProviderPreference::Auto | AccelerateProviderPreference::Wgpu
231 ) {
232 log::info!(
233 "RunMat Accelerate: wasm builds require calling initialize_wgpu_provider_async to enable the WGPU backend"
234 );
235 }
236 false
237 }
238 #[cfg(not(feature = "wgpu"))]
239 {
240 if matches!(options.provider, AccelerateProviderPreference::Wgpu) {
241 log::warn!(
242 "RunMat Accelerate: WGPU provider requested but crate built without 'wgpu' feature"
243 );
244 }
245 false
246 }
247 };
248
249 if !registered {
250 if options.allow_inprocess_fallback
251 || matches!(options.provider, AccelerateProviderPreference::InProcess)
252 {
253 simple_provider::register_inprocess_provider();
254 log::info!("RunMat Accelerate: using in-process acceleration provider");
255 } else {
256 log::warn!("RunMat Accelerate: no acceleration provider registered");
257 }
258 }
259}
260
/// Logs the newly registered WGPU provider's identity, warms it up, and
/// reports fused-pipeline cache effectiveness after warmup.
#[cfg(feature = "wgpu")]
fn announce_wgpu_provider(provider: &WgpuProvider) {
    // Log adapter identity so users can confirm which GPU/backend was picked.
    let info = provider.device_info_struct();
    let backend = info.backend.as_deref().unwrap_or("unknown");
    log::info!(
        "RunMat Accelerate: using WGPU provider {} (vendor: {}, backend: {})",
        info.name,
        info.vendor,
        backend
    );
    provider.warmup();
    // Counters taken after warmup, so they reflect warmup's pipeline builds.
    let (hits, misses) = provider.fused_cache_counters();
    log::info!(
        "RunMat Accelerate: fused pipeline cache after warmup - hits: {}, misses: {}",
        hits,
        misses
    );
}
279
/// Async initializer for the WGPU provider on wasm targets, where adapter
/// and device acquisition must await the browser.
///
/// Mirrors `initialize_acceleration_provider_with`: applies auto-offload
/// options first, is a no-op when a provider is already registered, and
/// falls back to the in-process provider according to
/// `allow_inprocess_fallback` or an explicit `InProcess` preference.
///
/// Always returns `Ok(())` in the visible code paths; WGPU failures are
/// logged and handled via fallback rather than propagated.
#[cfg(all(feature = "wgpu", target_arch = "wasm32"))]
pub async fn initialize_wgpu_provider_async(options: &AccelerateInitOptions) -> anyhow::Result<()> {
    configure_auto_offload(options.auto_offload.clone());

    if runmat_accelerate_api::provider().is_some() {
        return Ok(());
    }

    if !options.enabled {
        if options.allow_inprocess_fallback
            || matches!(options.provider, AccelerateProviderPreference::InProcess)
        {
            simple_provider::register_inprocess_provider();
            log::info!(
                "RunMat Accelerate: acceleration disabled; using in-process acceleration provider"
            );
        } else {
            log::info!("RunMat Accelerate: acceleration disabled; no provider registered");
        }
        return Ok(());
    }

    let mut registered = false;
    if matches!(
        options.provider,
        AccelerateProviderPreference::Auto | AccelerateProviderPreference::Wgpu
    ) {
        let wgpu_options = backend::wgpu::provider::WgpuProviderOptions {
            // `Auto` maps to high performance: prefer the discrete adapter.
            power_preference: match options.wgpu_power_preference {
                AccelPowerPreference::Auto => PowerPreference::HighPerformance,
                AccelPowerPreference::HighPerformance => PowerPreference::HighPerformance,
                AccelPowerPreference::LowPower => PowerPreference::LowPower,
            },
            force_fallback_adapter: options.wgpu_force_fallback_adapter,
        };

        match backend::wgpu::provider::register_wgpu_provider_async(wgpu_options).await {
            Ok(provider) => {
                registered = true;
                announce_wgpu_provider(provider);
            }
            Err(err) => {
                // Non-fatal: fall through to the in-process fallback below.
                log::warn!(
                    "RunMat Accelerate: failed to initialize WGPU provider, falling back: {err}"
                );
            }
        }
    }

    if !registered {
        if options.allow_inprocess_fallback
            || matches!(options.provider, AccelerateProviderPreference::InProcess)
        {
            simple_provider::register_inprocess_provider();
            log::info!("RunMat Accelerate: using in-process acceleration provider");
        } else {
            log::warn!("RunMat Accelerate: no acceleration provider registered");
        }
    }

    Ok(())
}
342
343pub fn initialize_acceleration_provider() {
345 initialize_acceleration_provider_with(&AccelerateInitOptions::default());
346}
347
#[cfg(test)]
mod tests {
    #[cfg(feature = "wgpu")]
    use crate::backend::wgpu::cache::key::compute_pipeline_hash_bytes;

    /// The fused-pipeline cache key must incorporate the layout label, so
    /// identical shader source with different input arity never collides.
    #[test]
    #[cfg(feature = "wgpu")]
    fn elementwise_hash_varies_with_arity() {
        let wg = 256u32;
        let h2 = compute_pipeline_hash_bytes(b"shader", "runmat-fusion-layout-2", Some(wg));
        let h3 = compute_pipeline_hash_bytes(b"shader", "runmat-fusion-layout-3", Some(wg));
        assert_ne!(h2, h3, "hash should differ with input arity");
    }
}
362
/// Returns the registered provider's fused-pipeline cache counters as
/// `(hits, misses)`, or `None` when no provider is registered.
#[cfg(feature = "wgpu")]
pub fn provider_cache_stats() -> Option<(u64, u64)> {
    runmat_accelerate_api::provider().map(|p| p.fused_cache_counters())
}
368
/// The compute API family backing a device (used in `DeviceInfo::kind`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum DeviceKind {
    Cpu,
    Cuda,
    Rocm,
    Metal,
    Vulkan,
    OpenCl,
    Wgpu,
}
380
/// Descriptive metadata for a compute device, reported by
/// `AccelerateBackend::device_info` and carried in `ExecutionTarget::Gpu`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DeviceInfo {
    /// Which API family the device belongs to.
    pub kind: DeviceKind,
    /// Human-readable device name.
    pub name: String,
    /// Human-readable vendor name.
    pub vendor: String,
    /// Device memory in bytes, when known.
    pub memory_bytes: Option<u64>,
    /// Vendor-specific capability string (e.g. CUDA compute capability), when known.
    pub compute_capability: Option<String>,
}
390
/// Opaque handle to a device-side buffer.
pub trait BufferHandle: Send + Sync {
    /// Length of the buffer.
    /// NOTE(review): unit (elements vs. bytes) is not specified here —
    /// confirm against implementations.
    fn len(&self) -> usize;
    /// True when `len()` is zero.
    fn is_empty(&self) -> bool {
        self.len() == 0
    }
}
398
/// A two-dimensional matrix resident in device memory.
pub trait DeviceMatrix: Send + Sync {
    /// Row count.
    fn rows(&self) -> usize;
    /// Column count.
    fn cols(&self) -> usize;
    /// Borrows the backing device buffer.
    fn as_buffer(&self) -> &dyn BufferHandle;
}
405
/// Low-level device backend: host/device transfer plus a fixed set of
/// matrix kernels. Implementations must be usable across threads.
pub trait AccelerateBackend: Send + Sync {
    /// Identity and capability information for the underlying device.
    fn device_info(&self) -> DeviceInfo;

    /// Copies a host tensor into device memory.
    fn upload_matrix(&self, host: &Tensor) -> anyhow::Result<Box<dyn DeviceMatrix>>;
    /// Copies a device matrix back into a host tensor.
    fn download_matrix(&self, dev: &dyn DeviceMatrix) -> anyhow::Result<Tensor>;

    // Elementwise binary kernels producing a new device matrix.
    // NOTE(review): the shape/broadcast contract is not stated here —
    // `Planner::choose_elem_add` only selects the GPU for equal shapes;
    // confirm whether implementations require that.
    fn elem_add(
        &self,
        a: &dyn DeviceMatrix,
        b: &dyn DeviceMatrix,
    ) -> anyhow::Result<Box<dyn DeviceMatrix>>;
    fn elem_sub(
        &self,
        a: &dyn DeviceMatrix,
        b: &dyn DeviceMatrix,
    ) -> anyhow::Result<Box<dyn DeviceMatrix>>;
    fn elem_mul(
        &self,
        a: &dyn DeviceMatrix,
        b: &dyn DeviceMatrix,
    ) -> anyhow::Result<Box<dyn DeviceMatrix>>;
    fn elem_ne(
        &self,
        a: &dyn DeviceMatrix,
        b: &dyn DeviceMatrix,
    ) -> anyhow::Result<Box<dyn DeviceMatrix>>;
    fn elem_eq(
        &self,
        a: &dyn DeviceMatrix,
        b: &dyn DeviceMatrix,
    ) -> anyhow::Result<Box<dyn DeviceMatrix>>;
    fn elem_div(
        &self,
        a: &dyn DeviceMatrix,
        b: &dyn DeviceMatrix,
    ) -> anyhow::Result<Box<dyn DeviceMatrix>>;
    fn elem_pow(
        &self,
        a: &dyn DeviceMatrix,
        b: &dyn DeviceMatrix,
    ) -> anyhow::Result<Box<dyn DeviceMatrix>>;

    /// Matrix multiplication `a * b`.
    fn matmul(
        &self,
        a: &dyn DeviceMatrix,
        b: &dyn DeviceMatrix,
    ) -> anyhow::Result<Box<dyn DeviceMatrix>>;
    /// Matrix transpose.
    fn transpose(&self, a: &dyn DeviceMatrix) -> anyhow::Result<Box<dyn DeviceMatrix>>;
}
459
/// Decides where individual operations execute (CPU or GPU) based on an
/// optional backend and simple size heuristics.
#[derive(Default)]
pub struct Planner {
    // `None` (the default) means no accelerator: everything runs on the CPU.
    backend: Option<Box<dyn AccelerateBackend>>,
}
466
467impl Planner {
468 pub fn new(backend: Option<Box<dyn AccelerateBackend>>) -> Self {
469 Self { backend }
470 }
471
472 pub fn device(&self) -> Option<&dyn AccelerateBackend> {
473 self.backend.as_deref()
474 }
475
476 pub fn choose_elem_add(&self, a: &Tensor, b: &Tensor) -> ExecutionTarget {
478 if let Some(bk) = &self.backend {
479 if a.data.len() >= 1 << 16 && a.rows() == b.rows() && a.cols() == b.cols() {
480 return ExecutionTarget::Gpu(bk.device_info());
481 }
482 }
483 ExecutionTarget::Cpu
484 }
485}
486
/// Where the planner decided an operation should execute.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ExecutionTarget {
    /// Run on the host (via the runtime builtins).
    Cpu,
    /// Run on the described device.
    Gpu(DeviceInfo),
}
492
/// High-level executor that runs value-level operations according to the
/// planner's placement decisions.
pub struct Accelerator {
    planner: Planner,
}
497
impl Accelerator {
    /// Wraps a planner that decides CPU-vs-GPU placement.
    pub fn new(planner: Planner) -> Self {
        Self { planner }
    }

    /// Elementwise addition over runtime `Value`s.
    ///
    /// GPU-resident operands (`Value::GpuTensor`) are gathered back to the
    /// host first; the resolved host values are then dispatched through
    /// the planner.
    pub async fn elementwise_add(&self, a: &Value, b: &Value) -> anyhow::Result<Value> {
        match (a, b) {
            (Value::GpuTensor(ga), Value::GpuTensor(gb)) => {
                let ha = self.gather_handle(ga).await?;
                let hb = self.gather_handle(gb).await?;
                self.elementwise_add_resolved(&ha, &hb)
            }
            (Value::GpuTensor(ga), other) => {
                let ha = self.gather_handle(ga).await?;
                self.elementwise_add_resolved(&ha, other)
            }
            (other, Value::GpuTensor(gb)) => {
                let hb = self.gather_handle(gb).await?;
                self.elementwise_add_resolved(other, &hb)
            }
            // Neither side is GPU-resident: dispatch directly.
            _ => self.elementwise_add_resolved(a, b),
        }
    }

    /// Adds two host-resolved values, choosing the CPU builtin or a
    /// backend kernel per the planner's decision.
    fn elementwise_add_resolved(&self, a: &Value, b: &Value) -> anyhow::Result<Value> {
        match (a, b) {
            (Value::Tensor(ma), Value::Tensor(mb)) => match self.planner.choose_elem_add(ma, mb) {
                ExecutionTarget::Cpu => {
                    runmat_runtime::call_builtin("plus", &[a.clone(), b.clone()])
                        .map_err(|e| anyhow::anyhow!(e))
                }
                ExecutionTarget::Gpu(_) => {
                    // Full round trip: upload both operands, run the
                    // kernel, download the result.
                    let bk = self
                        .planner
                        .device()
                        .ok_or_else(|| anyhow::anyhow!("no backend"))?;
                    let da = bk.upload_matrix(ma)?;
                    let db = bk.upload_matrix(mb)?;
                    let dc = bk.elem_add(da.as_ref(), db.as_ref())?;
                    let out = bk.download_matrix(dc.as_ref())?;
                    Ok(Value::Tensor(out))
                }
            },
            // Any non-tensor combination falls back to the runtime builtin.
            _ => runmat_runtime::call_builtin("plus", &[a.clone(), b.clone()])
                .map_err(|e| anyhow::anyhow!(e)),
        }
    }

    /// Downloads a GPU tensor handle into a host `Value::Tensor`.
    ///
    /// NOTE(review): when no provider is registered this returns a
    /// zero-filled tensor of the handle's shape instead of an error —
    /// confirm this best-effort behavior is intentional, as it can
    /// silently substitute zeros for real data.
    async fn gather_handle(
        &self,
        h: &runmat_accelerate_api::GpuTensorHandle,
    ) -> anyhow::Result<Value> {
        if let Some(p) = runmat_accelerate_api::provider() {
            let ht = p.download(h).await.map_err(|e| anyhow::anyhow!(e))?;
            let t = Tensor::new(ht.data, ht.shape).map_err(|e| anyhow::anyhow!(e))?;
            Ok(Value::Tensor(t))
        } else {
            let shape = h.shape.clone();
            let total: usize = shape.iter().product();
            let zeros = Tensor::new(vec![0.0; total], shape).map_err(|e| anyhow::anyhow!(e))?;
            Ok(Value::Tensor(zeros))
        }
    }
}