cubecl_runtime/tune/mod.rs
1//! # Autotuning
2//!
3//! Autotuning runs several candidate kernels on reference inputs and caches the fastest
4//! one per key.
5//!
6//! ```ignore
7//! #[derive(AutotuneKey)]
8//! struct KernelKey { size: u32 }
9//!
10//! fn run_kernel_tuned(lhs: Tensor, rhs: Tensor) -> Tensor {
11//! static TUNER: LocalTuner<String, KernelKey> = local_tuner!();
12//!
13//! let tunables = TUNER.init(|| {
14//! TunableSet::new(KernelKey::new, |_key, (lhs, rhs)| (lhs.clone(), rhs.clone()))
15//! .with(Tunable::new("k1", |(lhs, rhs)| kernel_1(lhs, rhs)))
16//! .with(Tunable::new("k2", |(lhs, rhs)| kernel_2(lhs, rhs)))
17//! });
18//!
19//! TUNER.execute(&device_id, &lhs.client, tunables, (lhs, rhs))
20//! }
21//! ```
22//!
23//! Kernels are closures returning `Result<Out, impl Into<String>>`. Multi-input kernels
24//! take a single tuple argument and destructure: `|(lhs, rhs, out)| body`.
25//!
26//! See [`TuneInputs`] for details on borrowed inputs, and [`Tunable::new`] for why its
27//! higher-ranked trait bound (HRTB) is spelled out directly (closure inference).
28
29mod base;
30mod input_generator;
31mod key_generator;
32mod local;
33mod operation;
34mod tune_benchmark;
35mod tune_cache;
36mod tune_inputs;
37mod tuner;
38mod util;
39
40pub use base::*;
41pub use input_generator::*;
42pub use key_generator::*;
43pub use local::*;
44pub use operation::*;
45pub use tune_benchmark::*;
46pub use tune_cache::*;
47pub use tune_inputs::*;
48pub use tuner::*;
49pub use util::*;