1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
//! # Autotuning
//!
//! Autotuning runs several candidate kernels on reference inputs and caches the fastest
//! one per key.
//!
//! ```ignore
//! #[derive(AutotuneKey)]
//! struct KernelKey { size: u32 }
//!
//! fn run_kernel_tuned(lhs: Tensor, rhs: Tensor) -> Tensor {
//!     static TUNER: LocalTuner<String, KernelKey> = local_tuner!();
//!
//!     let tunables = TUNER.init(|| {
//!         TunableSet::new(KernelKey::new, |_key, (lhs, rhs)| (lhs.clone(), rhs.clone()))
//!             .with(Tunable::new("k1", |(lhs, rhs)| kernel_1(lhs, rhs)))
//!             .with(Tunable::new("k2", |(lhs, rhs)| kernel_2(lhs, rhs)))
//!     });
//!
//!     TUNER.execute(&device_id, &lhs.client, tunables, (lhs, rhs));
//! }
//! ```
//!
//! Kernels are closures returning `Result<Out, impl Into<String>>`. Multi-input kernels
//! take a single tuple argument and destructure: `|(lhs, rhs, out)| body`.
//!
//! See [`TuneInputs`] for the borrowed-inputs story, and [`Tunable::new`] for why its
//! HRTB bound is spelled out directly (closure inference).
pub use *;
pub use *;
pub use *;
pub use *;
pub use *;
pub use *;
pub use *;
pub use *;
pub use *;
pub use *;