use once_cell::sync::Lazy;
use runmat_builtins::{Tensor, Value};
use std::path::PathBuf;
use std::sync::RwLock;

pub mod backend;
pub mod fusion;
pub mod fusion_exec;
pub mod fusion_residency;
pub mod graph;
mod host_lu;
pub mod native_auto;
pub mod precision;
mod reduction_meta;
pub mod simple_provider;
mod sortrows_host;
pub mod telemetry;
pub use fusion::*;
pub use graph::*;
pub use native_auto::{
    apply_auto_offload_calibration_from_file, auto_offload_report, is_sink, prepare_builtin_args,
    promote_binary, promote_reduction_args, promote_unary, reset_auto_offload_log,
    AutoOffloadCalibrationOutcome, AutoOffloadCalibrationSummary, AutoOffloadDecisionEntry,
    AutoOffloadDisposition, AutoOffloadReport, BinaryOp, CachedProviderInfo, DecisionReason,
    ReductionOp, ThresholdBase, ThresholdDelta, ThresholdDeltaEntry, ThresholdSnapshot, UnaryOp,
};
pub use reduction_meta::{value_is_all_keyword, ReductionAxes};
#[cfg(feature = "wgpu")]
use runmat_accelerate_api::AccelProvider;
use serde::{Deserialize, Serialize};
#[cfg(feature = "wgpu")]
use wgpu::PowerPreference;

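/// Which acceleration provider to prefer during initialization.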
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case")]
pub enum AccelerateProviderPreference {
    Auto,
    Wgpu,
    InProcess,
}

impl Default for AccelerateProviderPreference {
    fn default() -> Self {
        Self::Auto
    }
}

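/// Adapter power preference forwarded to WGPU when requesting a device.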
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case")]
pub enum AccelPowerPreference {
    Auto,
    HighPerformance,
    LowPower,
}

impl Default for AccelPowerPreference {
    fn default() -> Self {
        Self::Auto
    }
}

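/// Verbosity of auto-offload decision logging.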
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
#[serde(rename_all = "kebab-case")]
pub enum AutoOffloadLogLevel {
    Off,
    Info,
    #[default]
    Trace,
}

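/// Configuration for automatic offload of eligible operations to the GPU.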
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AutoOffloadOptions {
    pub enabled: bool,
    pub calibrate: bool,
    #[serde(default)]
    pub profile_path: Option<PathBuf>,
    #[serde(default)]
    pub log_level: AutoOffloadLogLevel,
}

impl Default for AutoOffloadOptions {
    fn default() -> Self {
        Self {
            enabled: true,
            calibrate: true,
            profile_path: None,
            log_level: AutoOffloadLogLevel::Trace,
        }
    }
}

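// Process-wide auto-offload options, guarded by an `RwLock` so they can be
// replaced at runtime via `configure_auto_offload`.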
static AUTO_OFFLOAD_OPTIONS: Lazy<RwLock<AutoOffloadOptions>> =
    Lazy::new(|| RwLock::new(AutoOffloadOptions::default()));

static API_HOOKS: Lazy<()> = Lazy::new(|| {
    runmat_accelerate_api::register_residency_clear(fusion_residency::clear);
    runmat_accelerate_api::register_sequence_threshold_provider(sequence_threshold_hint_bridge);
});

pub(crate) fn ensure_residency_hooks() {
    Lazy::force(&API_HOOKS);
}

fn sequence_threshold_hint_bridge() -> Option<usize> {
    native_auto::sequence_threshold_hint()
}

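/// Replace the process-wide auto-offload options.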
pub fn configure_auto_offload(options: AutoOffloadOptions) {
    if let Ok(mut guard) = AUTO_OFFLOAD_OPTIONS.write() {
        *guard = options;
    }
}

pub(crate) fn auto_offload_options() -> AutoOffloadOptions {
    AUTO_OFFLOAD_OPTIONS
        .read()
        .map(|guard| guard.clone())
        .unwrap_or_default()
}

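/// Options controlling how the acceleration provider is initialized.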
#[derive(Debug, Clone)]
pub struct AccelerateInitOptions {
    pub enabled: bool,
    pub provider: AccelerateProviderPreference,
    pub allow_inprocess_fallback: bool,
    pub wgpu_power_preference: AccelPowerPreference,
    pub wgpu_force_fallback_adapter: bool,
    pub auto_offload: AutoOffloadOptions,
}

impl Default for AccelerateInitOptions {
    fn default() -> Self {
        Self {
            enabled: true,
            provider: AccelerateProviderPreference::Auto,
            allow_inprocess_fallback: true,
            wgpu_power_preference: AccelPowerPreference::Auto,
            wgpu_force_fallback_adapter: false,
            auto_offload: AutoOffloadOptions::default(),
        }
    }
}

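/// Initialize the global acceleration provider according to `options`.
///
/// Applies the auto-offload configuration, then registers a provider unless
/// one is already present: the WGPU backend when enabled and available,
/// falling back to the in-process provider when allowed.
///
/// A minimal usage sketch (assuming this crate is imported as
/// `runmat_accelerate`):
///
/// ```ignore
/// use runmat_accelerate::{initialize_acceleration_provider_with, AccelerateInitOptions};
///
/// let options = AccelerateInitOptions {
///     allow_inprocess_fallback: true,
///     ..Default::default()
/// };
/// initialize_acceleration_provider_with(&options);
/// ```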
pub fn initialize_acceleration_provider_with(options: &AccelerateInitOptions) {
    configure_auto_offload(options.auto_offload.clone());

    if runmat_accelerate_api::provider().is_some() {
        return;
    }

    if !options.enabled {
        if options.allow_inprocess_fallback {
            simple_provider::register_inprocess_provider();
            log::info!(
                "RunMat Accelerate: acceleration disabled; using in-process provider for compatibility"
            );
        } else {
            log::info!("RunMat Accelerate: acceleration disabled; no provider registered");
        }
        return;
    }

    let registered = {
        #[cfg(feature = "wgpu")]
        {
            let mut reg = false;
            if matches!(
                options.provider,
                AccelerateProviderPreference::Auto | AccelerateProviderPreference::Wgpu
            ) {
                let wgpu_options = backend::wgpu::provider::WgpuProviderOptions {
                    power_preference: match options.wgpu_power_preference {
                        AccelPowerPreference::Auto => PowerPreference::HighPerformance,
                        AccelPowerPreference::HighPerformance => PowerPreference::HighPerformance,
                        AccelPowerPreference::LowPower => PowerPreference::LowPower,
                    },
                    force_fallback_adapter: options.wgpu_force_fallback_adapter,
                };

                match backend::wgpu::provider::register_wgpu_provider(wgpu_options) {
                    Ok(provider) => {
                        reg = true;
                        let info = provider.device_info_struct();
                        let backend = info.backend.as_deref().unwrap_or("unknown");
                        log::info!(
                            "RunMat Accelerate: using WGPU provider {} (vendor: {}, backend: {})",
                            info.name,
                            info.vendor,
                            backend
                        );
                        provider.warmup();
                        // Report cache effectiveness once warmup has primed the fused pipelines.
                        let (hits, misses) = provider.fused_cache_counters();
                        log::info!(
                            "RunMat Accelerate: fused pipeline cache after warmup - hits: {}, misses: {}",
                            hits, misses
                        );
                    }
                    Err(err) => {
                        log::warn!(
                            "RunMat Accelerate: failed to initialize WGPU provider, falling back: {err}"
                        );
                    }
                }
            }
            reg
        }
        #[cfg(not(feature = "wgpu"))]
        {
            if matches!(options.provider, AccelerateProviderPreference::Wgpu) {
                log::warn!(
                    "RunMat Accelerate: WGPU provider requested but crate built without 'wgpu' feature"
                );
            }
            false
        }
    };

    if !registered {
        if options.allow_inprocess_fallback
            || matches!(options.provider, AccelerateProviderPreference::InProcess)
        {
            simple_provider::register_inprocess_provider();
            log::info!("RunMat Accelerate: using in-process acceleration provider");
        } else {
            log::warn!("RunMat Accelerate: no acceleration provider registered");
        }
    }
}

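/// Initialize the acceleration provider using default [`AccelerateInitOptions`].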
pub fn initialize_acceleration_provider() {
    initialize_acceleration_provider_with(&AccelerateInitOptions::default());
}

#[cfg(test)]
mod tests {
    #[cfg(feature = "wgpu")]
    use crate::backend::wgpu::cache::key::compute_pipeline_hash_bytes;

    #[test]
    #[cfg(feature = "wgpu")]
    fn elementwise_hash_varies_with_arity() {
        let wg = 256u32;
        let h2 = compute_pipeline_hash_bytes(b"shader", "runmat-fusion-layout-2", Some(wg));
        let h3 = compute_pipeline_hash_bytes(b"shader", "runmat-fusion-layout-3", Some(wg));
        assert_ne!(h2, h3, "hash should differ with input arity");
    }
}

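/// Returns the registered provider's fused pipeline cache counters as
/// `(hits, misses)`, if a provider is registered.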
#[cfg(feature = "wgpu")]
pub fn provider_cache_stats() -> Option<(u64, u64)> {
    runmat_accelerate_api::provider().map(|p| p.fused_cache_counters())
}

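/// The kind of compute device backing a provider.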
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum DeviceKind {
    Cpu,
    Cuda,
    Rocm,
    Metal,
    Vulkan,
    OpenCl,
    Wgpu,
}

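/// Descriptive metadata about a compute device.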
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DeviceInfo {
    pub kind: DeviceKind,
    pub name: String,
    pub vendor: String,
    pub memory_bytes: Option<u64>,
    pub compute_capability: Option<String>,
}

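/// Opaque handle to a device-resident buffer.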
pub trait BufferHandle: Send + Sync {
    fn len(&self) -> usize;
    fn is_empty(&self) -> bool {
        self.len() == 0
    }
}

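/// A device-resident matrix view over a [`BufferHandle`].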
pub trait DeviceMatrix: Send + Sync {
    fn rows(&self) -> usize;
    fn cols(&self) -> usize;
    fn as_buffer(&self) -> &dyn BufferHandle;
}

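/// The operation set a device backend must provide: host/device transfer,
/// elementwise arithmetic and comparisons, matrix multiplication, and
/// transpose.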
pub trait AccelerateBackend: Send + Sync {
    fn device_info(&self) -> DeviceInfo;

    fn upload_matrix(&self, host: &Tensor) -> anyhow::Result<Box<dyn DeviceMatrix>>;
    fn download_matrix(&self, dev: &dyn DeviceMatrix) -> anyhow::Result<Tensor>;

    fn elem_add(
        &self,
        a: &dyn DeviceMatrix,
        b: &dyn DeviceMatrix,
    ) -> anyhow::Result<Box<dyn DeviceMatrix>>;
    fn elem_sub(
        &self,
        a: &dyn DeviceMatrix,
        b: &dyn DeviceMatrix,
    ) -> anyhow::Result<Box<dyn DeviceMatrix>>;
    fn elem_mul(
        &self,
        a: &dyn DeviceMatrix,
        b: &dyn DeviceMatrix,
    ) -> anyhow::Result<Box<dyn DeviceMatrix>>;
    fn elem_ne(
        &self,
        a: &dyn DeviceMatrix,
        b: &dyn DeviceMatrix,
    ) -> anyhow::Result<Box<dyn DeviceMatrix>>;
    fn elem_eq(
        &self,
        a: &dyn DeviceMatrix,
        b: &dyn DeviceMatrix,
    ) -> anyhow::Result<Box<dyn DeviceMatrix>>;
    fn elem_div(
        &self,
        a: &dyn DeviceMatrix,
        b: &dyn DeviceMatrix,
    ) -> anyhow::Result<Box<dyn DeviceMatrix>>;
    fn elem_pow(
        &self,
        a: &dyn DeviceMatrix,
        b: &dyn DeviceMatrix,
    ) -> anyhow::Result<Box<dyn DeviceMatrix>>;

    fn matmul(
        &self,
        a: &dyn DeviceMatrix,
        b: &dyn DeviceMatrix,
    ) -> anyhow::Result<Box<dyn DeviceMatrix>>;
    fn transpose(&self, a: &dyn DeviceMatrix) -> anyhow::Result<Box<dyn DeviceMatrix>>;
}

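/// Decides whether an elementwise operation should run on the CPU or on the
/// configured device backend.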
#[derive(Default)]
pub struct Planner {
    backend: Option<Box<dyn AccelerateBackend>>,
}

impl Planner {
    pub fn new(backend: Option<Box<dyn AccelerateBackend>>) -> Self {
        Self { backend }
    }

    pub fn device(&self) -> Option<&dyn AccelerateBackend> {
        self.backend.as_deref()
    }

    pub fn choose_elem_add(&self, a: &Tensor, b: &Tensor) -> ExecutionTarget {
        if let Some(bk) = &self.backend {
            // Offload only when shapes match and the operand is large enough
            // (at least 2^16 elements) for the transfer to be worthwhile.
            if a.data.len() >= 1 << 16 && a.rows() == b.rows() && a.cols() == b.cols() {
                return ExecutionTarget::Gpu(bk.device_info());
            }
        }
        ExecutionTarget::Cpu
    }
}

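/// Where a planned operation should execute.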
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ExecutionTarget {
    Cpu,
    Gpu(DeviceInfo),
}

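/// High-level entry point that routes elementwise operations through the
/// [`Planner`].
///
/// A minimal usage sketch (assuming this crate is imported as
/// `runmat_accelerate`; with no backend configured, the add falls back to
/// the CPU `plus` builtin):
///
/// ```ignore
/// use runmat_accelerate::{Accelerator, Planner};
/// use runmat_builtins::{Tensor, Value};
///
/// let accel = Accelerator::new(Planner::new(None));
/// let a = Value::Tensor(Tensor::new(vec![1.0, 2.0], vec![1, 2]).unwrap());
/// let b = Value::Tensor(Tensor::new(vec![3.0, 4.0], vec![1, 2]).unwrap());
/// let sum = accel.elementwise_add(&a, &b).unwrap();
/// ```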
pub struct Accelerator {
    planner: Planner,
}

impl Accelerator {
    pub fn new(planner: Planner) -> Self {
        Self { planner }
    }

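    /// Elementwise addition dispatched through the [`Planner`]: host tensors
    /// run on the CPU builtin or the device backend, while GPU tensor
    /// operands are first gathered back to the host.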
    pub fn elementwise_add(&self, a: &Value, b: &Value) -> anyhow::Result<Value> {
        match (a, b) {
            (Value::Tensor(ma), Value::Tensor(mb)) => match self.planner.choose_elem_add(ma, mb) {
                ExecutionTarget::Cpu => {
                    runmat_runtime::call_builtin("plus", &[a.clone(), b.clone()])
                        .map_err(|e| anyhow::anyhow!(e))
                }
                ExecutionTarget::Gpu(_) => {
                    let bk = self
                        .planner
                        .device()
                        .ok_or_else(|| anyhow::anyhow!("no backend"))?;
                    let da = bk.upload_matrix(ma)?;
                    let db = bk.upload_matrix(mb)?;
                    let dc = bk.elem_add(da.as_ref(), db.as_ref())?;
                    let out = bk.download_matrix(dc.as_ref())?;
                    Ok(Value::Tensor(out))
                }
            },
            (Value::GpuTensor(ga), Value::GpuTensor(gb)) => {
                // Gather device-resident operands to the host and retry.
                let ha = self.gather_handle(ga)?;
                let hb = self.gather_handle(gb)?;
                self.elementwise_add(&ha, &hb)
            }
            (Value::GpuTensor(ga), other) => {
                let ha = self.gather_handle(ga)?;
                self.elementwise_add(&ha, other)
            }
            (other, Value::GpuTensor(gb)) => {
                let hb = self.gather_handle(gb)?;
                self.elementwise_add(other, &hb)
            }
            _ => runmat_runtime::call_builtin("plus", &[a.clone(), b.clone()])
                .map_err(|e| anyhow::anyhow!(e)),
        }
    }

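    /// Download a GPU tensor handle into a host `Value::Tensor`. When no
    /// provider is registered the data cannot be fetched, so a zero-filled
    /// tensor with the handle's shape is returned instead.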
    fn gather_handle(&self, h: &runmat_accelerate_api::GpuTensorHandle) -> anyhow::Result<Value> {
        if let Some(p) = runmat_accelerate_api::provider() {
            let ht = p.download(h).map_err(|e| anyhow::anyhow!(e))?;
            let t = Tensor::new(ht.data, ht.shape).map_err(|e| anyhow::anyhow!(e))?;
            Ok(Value::Tensor(t))
        } else {
            // No provider to download from; fall back to a zero-filled tensor.
            let shape = h.shape.clone();
            let total: usize = shape.iter().product();
            let zeros = Tensor::new(vec![0.0; total], shape).map_err(|e| anyhow::anyhow!(e))?;
            Ok(Value::Tensor(zeros))
        }
    }
}