1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
//! BLAS Level 1 — vector-vector operations.
//!
//! This module provides GPU-accelerated implementations of the classic BLAS
//! Level 1 routines. Each operation launches one or more PTX kernels via the
//! [`BlasHandle`](crate::handle::BlasHandle).
//!
//! | Function | Operation |
//! |----------|------------------------------------|
//! | [`fn@axpy`] | y = alpha * x + y |
//! | [`fn@scal`] | x = alpha * x |
//! | [`fn@dot`] | result = x . y (dot product) |
//! | [`fn@nrm2`] | result = \|\|x\|\|_2 (L2 norm) |
//! | [`fn@asum`] | result = sum \|x_i\| (L1 norm) |
//! | [`fn@iamax`]| result = argmax \|x_i\| |
//! | [`fn@copy_vec`] | y = x (vector copy) |
//! | [`fn@swap`] | x <-> y (swap two vectors) |
// Re-export the primary entry-point functions.
pub use asum;
pub use axpy;
pub use copy_vec;
pub use dot;
pub use iamax;
pub use nrm2;
pub use scal;
pub use swap;
/// Computes the minimum number of elements a buffer must hold for a vector
/// of `n` logical elements with the given stride (increment).
///
/// The last accessed element is at index `(n - 1) * |inc|`, so the buffer
/// needs at least `1 + (n - 1) * |inc|` elements.
pub
/// Default block size for Level 1 element-wise kernels.
///
/// NOTE(review): presumably the number of threads per block used when the
/// Level 1 kernels are launched — confirm at the launch sites, which are
/// outside this section.
pub const L1_BLOCK_SIZE: u32 = 256;