pub enum TensorOp {
Atom {
pipeline: Arc<CachedPipeline>,
bindings: Vec<Arc<BindGroup>>,
dispatch: [u32; 3],
},
List(Vec<TensorOp>),
Sep,
}

Variants

Implementations

impl TensorOp
pub const NF4_BLOCK_SIZE: u32 = 64u32
pub const INT8_BLOCK_SIZE: u32 = 128u32
pub fn empty() -> Self
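A TensorOp describes recorded GPU work: an Atom is a single pipeline dispatch with its bind groups and dispatch size, a List sequences sub-ops, and Sep (judging by its name) separates encoded passes. A minimal composition sketch, assuming w, b and x are pre-created TensorGpu tensors with the shapes documented for layer_norm below, and with an arbitrary epsilon:

// Hypothetical sketch: `w`, `b`, `x` are assumed to be pre-created
// tensors with compatible shapes; `1.0e-5` is an arbitrary epsilon.
let op = TensorOp::List(vec![
    TensorOp::layer_norm(&w, &b, &x, 1.0e-5)?,
    TensorOp::softmax(&x)?,
]);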
pub fn softmax(
    x: &TensorGpu<impl Float, ReadWrite>,
) -> Result<Self, TensorError>
Softmax operator applied on x.
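For reference, the numerically stable softmax (conventionally taken over the channel axis; the exact axis and whether the kernel subtracts the maximum are implementation details not stated here):

$$ \mathrm{softmax}(x)_i = \frac{e^{x_i - \max_j x_j}}{\sum_j e^{x_j - \max_j x_j}} $$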
pub fn embed(
    tokens: &TensorGpu<u32, ReadWrite>,
    input: &TensorGpu<f16, ReadWrite>,
    output: &TensorGpu<impl Float, ReadWrite>,
) -> Result<Self, TensorError>
Embedding on GPU.
- tokens shape: [T, B].
- input shape: [C, V].
- output shape: [C, T, B].
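Reading the shapes, the embedding is a per-token row lookup into the [C, V] table (a sketch of the semantics, not of the kernel):

$$ \text{output}[c, t, b] = \text{input}[c,\ \text{tokens}[t, b]] $$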
pub fn layer_norm(
    w: &TensorGpu<f16, ReadWrite>,
    b: &TensorGpu<f16, ReadWrite>,
    x: &TensorGpu<impl Float, ReadWrite>,
    eps: f32,
) -> Result<Self, TensorError>
Layer normalization applied on x, with weight w and bias b.
- x shape: [C, T, B].
- w shape: [C, 1, 1].
- b shape: [C, 1, 1].
- s shape: [4, T, B]; holds the mean and inverse standard deviation of x.
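For reference, the standard layer normalization over the C channels of each token:

$$ y_c = w_c \cdot \frac{x_c - \mu}{\sqrt{\sigma^2 + \varepsilon}} + b_c, \qquad \mu = \frac{1}{C}\sum_{j=1}^{C} x_j, \quad \sigma^2 = \frac{1}{C}\sum_{j=1}^{C} (x_j - \mu)^2 $$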
pub fn group_norm(
    w: &TensorGpu<f16, ReadWrite>,
    b: &TensorGpu<f16, ReadWrite>,
    x: &TensorGpu<impl Float, ReadWrite>,
    eps: f32,
) -> Result<Self, TensorError>
Group normalization applied on x, with weight w and bias b.
- x shape: [S, H, A].
- w shape: [S, H, 1].
- b shape: [S, H, 1].
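Reading the shapes, the layer-norm statistics are presumably computed independently over the S elements of each of the H groups (heads):

$$ y_{s,h} = w_{s,h} \cdot \frac{x_{s,h} - \mu_h}{\sqrt{\sigma_h^2 + \varepsilon}} + b_{s,h} $$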
pub fn recenter(
    x: &TensorGpu<impl Float, ReadWrite>,
) -> Result<Self, TensorError>
Recenter x to be zero-mean.
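That is, assuming the mean is taken over the channel axis as in the other normalization ops here:

$$ x_c \leftarrow x_c - \frac{1}{C}\sum_{j=1}^{C} x_j $$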
pub fn rms_norm(
    w: &TensorGpu<f16, ReadWrite>,
    b: &TensorGpu<f16, ReadWrite>,
    x: &TensorGpu<impl Float, ReadWrite>,
    eps: f32,
) -> Result<Self, TensorError>
Root-mean-square normalization applied on x, with weight w and bias b.
- x shape: [C, T, B].
- w shape: [C, 1, 1].
- b shape: [C, 1, 1].
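For reference, RMS normalization over the C channels (the bias term follows this function's signature; the textbook form has none):

$$ y_c = w_c \cdot \frac{x_c}{\sqrt{\frac{1}{C}\sum_{j=1}^{C} x_j^2 + \varepsilon}} + b_c $$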
pub fn l2_norm(
    x: &TensorGpu<impl Float, ReadWrite>,
    eps: f32,
) -> Result<Self, TensorError>
L2 normalization applied on x.
- x shape: [C, T, B].
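One common formulation, with eps guarding against division by zero (the exact placement of eps is an implementation detail of the kernel):

$$ y_c = \frac{x_c}{\sqrt{\sum_{j=1}^{C} x_j^2 + \varepsilon}} $$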
pub fn matmul_vec_fp16<'a, 'b, F0: Float, F1: Float>(
    matrix: &TensorGpu<f16, ReadWrite>,
    input: impl Into<TensorGpuView<'a, F0>>,
    output: impl Into<TensorGpuView<'b, F1>>,
    act: Activation,
    sparse: bool,
) -> Result<Self, TensorError>
Fp16 matrix-vector multiplication.
- matrix shape: [C, R, B].
- input shape: [C, T, B].
- output shape: [R, T, B].
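Reading the shapes, each batch computes a matrix-vector product per token, with the optional activation act applied to the result:

$$ \text{output}[r, t, b] = \sum_{c=1}^{C} \text{matrix}[c, r, b] \cdot \text{input}[c, t, b] $$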
pub fn matmul_vec_int8<'a, 'b, F0: Float, F1: Float>(
    matrix: &TensorGpu<u8, ReadWrite>,
    minmax: &TensorGpu<f16, ReadWrite>,
    input: impl Into<TensorGpuView<'a, F0>>,
    output: impl Into<TensorGpuView<'b, F1>>,
    act: Activation,
    sparse: bool,
) -> Result<Self, TensorError>
Int8 matrix-vector multiplication.
- matrix shape: [C, R, B].
- input shape: [C, T, B].
- output shape: [R, T, B].
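A plausible reading of the quantization scheme, given the minmax parameter and INT8_BLOCK_SIZE = 128 (the authoritative definition is quantize_mat_int8 below): each block of 128 consecutive matrix elements stores its minimum and maximum, and a stored byte q dequantizes to

$$ \hat{m} = \min + \frac{q}{255}\,(\max - \min) $$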
pub fn matmul_vec_nf4<'a, 'b, F0: Float, F1: Float>(
    matrix: &TensorGpu<u8, ReadWrite>,
    quant: &TensorGpu<f32, Uniform>,
    absmax: &TensorGpu<f16, ReadWrite>,
    input: impl Into<TensorGpuView<'a, F0>>,
    output: impl Into<TensorGpuView<'b, F1>>,
    act: Activation,
    sparse: bool,
) -> Result<Self, TensorError>
NFloat4 matrix-vector multiplication.
- matrix shape: [C, R, B].
- input shape: [C, T, B].
- output shape: [R, T, B].
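A plausible reading given the parameters, following the usual NF4 scheme: each 4-bit code indexes the quant lookup table and is scaled by the absmax of its NF4_BLOCK_SIZE = 64 element block (the authoritative definition is quantize_mat_nf4 below):

$$ \hat{m}_i = \text{quant}[q_i] \cdot \text{absmax}_{\lfloor i / 64 \rfloor} $$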
pub fn matmul_mat_fp16<'a, 'b, 'c, F0: Float, F1: Float>(
    matrix: impl Into<TensorGpuView<'c, f16>>,
    input: impl Into<TensorGpuView<'a, F0>>,
    output: impl Into<TensorGpuView<'b, F1>>,
    act: Activation,
) -> Result<Self, TensorError>
Fp16 matrix-matrix multiplication.
- matrix shape: [K, M, B].
- input shape: [K, N, B].
- output shape: [M, N, B].
Note: K, M and N must each be a multiple of 4.
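Reading the shapes, matrix is contracted with input over the shared K axis, batch by batch (in conventional terms, output = matrixᵀ · input per batch):

$$ \text{output}[m, n, b] = \sum_{k=1}^{K} \text{matrix}[k, m, b] \cdot \text{input}[k, n, b] $$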
pub fn matmul_mat_int8<'a, 'b, 'c, F0: Float, F1: Float>(
    matrix: impl Into<TensorGpuView<'c, u8>>,
    minmax: &TensorGpu<f16, ReadWrite>,
    input: impl Into<TensorGpuView<'a, F0>>,
    output: impl Into<TensorGpuView<'b, F1>>,
    act: Activation,
) -> Result<Self, TensorError>
Int8 matrix-matrix multiplication.
- matrix shape: [K, M, B].
- input shape: [K, N, B].
- output shape: [M, N, B].

Notes:
- K, M and N must each be a multiple of 4.
- The total size of matrix must be a multiple of 128.
pub fn matmul_mat_nf4<'a, 'b, 'c, F0: Float, F1: Float>(
    matrix: impl Into<TensorGpuView<'c, u8>>,
    quant: &TensorGpu<f32, Uniform>,
    absmax: &TensorGpu<f16, ReadWrite>,
    input: impl Into<TensorGpuView<'a, F0>>,
    output: impl Into<TensorGpuView<'b, F1>>,
    act: Activation,
) -> Result<Self, TensorError>
NFloat4 matrix-matrix multiplication.
- matrix shape: [K, M, B].
- input shape: [K, N, B].
- output shape: [M, N, B].

Notes:
- K, M and N must each be a multiple of 8.
- The total size of matrix must be a multiple of 256.
pub fn add_activate<'a, 'b, F0: Float, F1: Float>(
    input: impl Into<TensorGpuView<'a, F0>>,
    output: impl Into<TensorGpuView<'b, F1>>,
    act_x: Activation,
    act_y: Activation,
    act_out: Activation,
) -> Result<Self, TensorError>
Add input to output.
- input shape: [C, 1, B] or [C, T, B].
- output shape: [C, T, B].
- Activations may be applied to input, output and the final result (see the sketch below).
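Reading the parameter names, act_x and act_y activate the two operands and act_out the sum (which activation maps to which operand is not stated here); input broadcasts along the token axis when its T dimension is 1:

$$ \text{output} \leftarrow f_{\text{out}}\big(f_x(\text{input}) + f_y(\text{output})\big) $$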
pub fn add<'a, 'b, F0: Float, F1: Float>(
    input: impl Into<TensorGpuView<'a, F0>>,
    output: impl Into<TensorGpuView<'b, F1>>,
) -> Result<Self, TensorError>
Add input to output.
- input shape: [C, 1, B] or [C, T, B].
- output shape: [C, T, B].
pub fn mul_activate<'a, 'b, F0: Float, F1: Float>(
    input: impl Into<TensorGpuView<'a, F0>>,
    output: impl Into<TensorGpuView<'b, F1>>,
    act_x: Activation,
    act_y: Activation,
    act_out: Activation,
) -> Result<Self, TensorError>
Multiply output by input element-wise.
- input shape: [C, 1, B] or [C, T, B].
- output shape: [C, T, B].
- Activations may be applied to input, output and the final result, as in add_activate above.
pub fn mul<'a, 'b, F0: Float, F1: Float>(
    input: impl Into<TensorGpuView<'a, F0>>,
    output: impl Into<TensorGpuView<'b, F1>>,
) -> Result<Self, TensorError>
Multiply output by input element-wise.
- input shape: [C, 1, B] or [C, T, B].
- output shape: [C, T, B].
pub fn token_shift<'a, 'b, F: Float>(
    cursors: &TensorGpu<u32, ReadWrite>,
    time_mix: impl Into<TensorGpuView<'a, F>>,
    state: impl Into<TensorGpuView<'b, f32>>,
    input: &TensorGpu<impl Float, ReadWrite>,
    output: &TensorGpu<impl Float, ReadWrite>,
    reversed: bool,
) -> Result<Self, TensorError>
pub fn time_mix_v4<'a, T: Float>(
    cursors: &TensorGpu<u32, ReadWrite>,
    time_decay: &TensorGpu<f32, ReadWrite>,
    time_first: &TensorGpu<f32, ReadWrite>,
    state: impl Into<TensorGpuView<'a, f32>>,
    k: &TensorGpu<T, ReadWrite>,
    v: &TensorGpu<T, ReadWrite>,
    r: &TensorGpu<T, ReadWrite>,
    x: &TensorGpu<T, ReadWrite>,
) -> Result<Self, TensorError>

pub fn time_mix_v5<'a, T: Float>(
    cursors: &TensorGpu<u32, ReadWrite>,
    time_decay: &TensorGpu<f32, ReadWrite>,
    time_first: &TensorGpu<f32, ReadWrite>,
    state: impl Into<TensorGpuView<'a, f32>>,
    k: &TensorGpu<T, ReadWrite>,
    v: &TensorGpu<T, ReadWrite>,
    r: &TensorGpu<T, ReadWrite>,
    x: &TensorGpu<T, ReadWrite>,
) -> Result<Self, TensorError>

pub fn time_mix_v6<'a, T: Float>(
    cursors: &TensorGpu<u32, ReadWrite>,
    time_decay: &TensorGpu<f32, ReadWrite>,
    time_first: &TensorGpu<f32, ReadWrite>,
    state: impl Into<TensorGpuView<'a, f32>>,
    k: &TensorGpu<T, ReadWrite>,
    v: &TensorGpu<T, ReadWrite>,
    r: &TensorGpu<T, ReadWrite>,
    x: &TensorGpu<T, ReadWrite>,
) -> Result<Self, TensorError>
pub fn time_mix_v7<'a, T: Float>(
    cursors: &TensorGpu<u32, ReadWrite>,
    state: impl Into<TensorGpuView<'a, f32>>,
    r: &TensorGpu<T, ReadWrite>,
    w: &TensorGpu<T, ReadWrite>,
    n: &TensorGpu<T, ReadWrite>,
    x: &TensorGpu<T, ReadWrite>,
) -> Result<Self, TensorError>
The V7 WKV kernel.
- n: stack of k, v, a and kk.
Note that the state layout is different from the official implementation.
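For orientation only, a simplified RWKV-7 style per-head state update using the components stacked in n (the exact signs, the normalization of kk and the output path are defined by the kernel, not by this sketch):

$$ S_t = S_{t-1}\big(\operatorname{diag}(w_t) - \mathit{kk}_t^{\top}(a_t \odot \mathit{kk}_t)\big) + v_t^{\top} k_t, \qquad y_t = r_t\, S_t $$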
pub fn time_first_v7<T: Float>(
    u: &TensorGpu<f16, ReadWrite>,
    r: &TensorGpu<T, ReadWrite>,
    n: &TensorGpu<T, ReadWrite>,
    x: &TensorGpu<T, ReadWrite>,
) -> Result<Self, TensorError>

pub fn control_k_v7<'a, 'b, F0: Float, F1: Float>(
    p: &TensorGpu<f16, ReadWrite>,
    a: impl Into<TensorGpuView<'a, F0>>,
    k: impl Into<TensorGpuView<'b, F1>>,
) -> Result<Self, TensorError>

pub fn channel_mix<'a, T: Float>(
    cursors: &TensorGpu<u32, ReadWrite>,
    state: impl Into<TensorGpuView<'a, f32>>,
    r: &TensorGpu<T, ReadWrite>,
    v: &TensorGpu<T, ReadWrite>,
    x: &TensorGpu<T, ReadWrite>,
) -> Result<Self, TensorError>

pub fn channel_mix_v7<'a, T: Float>(
    cursors: &TensorGpu<u32, ReadWrite>,
    state: impl Into<TensorGpuView<'a, f32>>,
    v: &TensorGpu<T, ReadWrite>,
    x: &TensorGpu<T, ReadWrite>,
) -> Result<Self, TensorError>

pub fn activate<'a, F: Float>(
    x: impl Into<TensorGpuView<'a, F>>,
    act: Activation,
) -> Result<Self, TensorError>
pub fn blit<'a, 'b, F0: Float, F1: Float>(
    input: impl Into<TensorGpuView<'a, F0>>,
    output: impl Into<TensorGpuView<'b, F1>>,
) -> Result<Self, TensorError>
Copy the content of input into output of the same shape.
pub fn broadcast<'a, 'b, F0: Float, F1: Float>(
    input: impl Into<TensorGpuView<'a, F0>>,
    output: impl Into<TensorGpuView<'b, F1>>,
) -> Result<Self, TensorError>
Repeat the content of input into output along the token and batch axes.
pub fn transpose<'a, 'b, F0: Float, F1: Float>(
    input: impl Into<TensorGpuView<'a, F0>>,
    output: impl Into<TensorGpuView<'b, F1>>,
) -> Result<Self, TensorError>
Swap the token and batch axes.
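That is, reading the token and batch axes as the second and third dimensions, an input viewed as [C, T, B] yields an output of shape [C, B, T]:

$$ \text{output}[c, b, t] = \text{input}[c, t, b] $$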
pub fn blend(
    factor: &TensorGpu<f32, Uniform>,
    input: &TensorGpu<impl Float, ReadWrite>,
    output: &TensorGpu<impl Float, ReadWrite>,
) -> Result<Self, TensorError>

pub fn blend_lora<'a, 'b, 'c>(
    factor: &TensorGpu<f32, Uniform>,
    xa: impl Into<TensorGpuView<'a, f16>>,
    xb: impl Into<TensorGpuView<'b, f16>>,
    output: impl Into<TensorGpuView<'c, f16>>,
) -> Result<Self, TensorError>

pub fn lerp<'a, 'b, 'c, F0: Float, F1: Float, F2: Float>(
    input: impl Into<TensorGpuView<'a, F0>>,
    output: impl Into<TensorGpuView<'b, F1>>,
    factor: impl Into<TensorGpuView<'c, F2>>,
    reversed: bool,
) -> Result<Self, TensorError>

pub fn affine(
    x: &TensorGpu<impl Float, ReadWrite>,
    scale: f32,
    bias: f32,
) -> Result<Self, TensorError>
pub fn quantize_mat_int8(
    input: &TensorGpu<f16, ReadWrite>,
    minmax: &TensorGpu<f16, ReadWrite>,
    output: &TensorGpu<u8, ReadWrite>,
) -> Result<Self, TensorError>
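A hedged sketch of the presumed scheme, consistent with the minmax parameter of matmul_vec_int8 above: each block of INT8_BLOCK_SIZE = 128 elements records its minimum and maximum in minmax, and each f16 element m of the block is stored as

$$ q = \operatorname{round}\!\left(255\,\frac{m - \min}{\max - \min}\right) $$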
pub fn quantize_mat_nf4(
    input: &TensorGpu<f16, ReadWrite>,
    quant: &TensorGpu<f32, Uniform>,
    absmax: &TensorGpu<f16, ReadWrite>,
    output: &TensorGpu<u8, ReadWrite>,
) -> Result<Self, TensorError>
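Presumably the NF4 counterpart: each element is divided by its NF4_BLOCK_SIZE = 64 element block's absmax and snapped to the nearest entry of the 16-value quant table, with two 4-bit codes packed per output byte:

$$ q_i = \operatorname*{arg\,min}_{j}\;\bigl|\,\text{quant}[j] - m_i / \text{absmax}_{\lfloor i/64 \rfloor}\,\bigr| $$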
Auto Trait Implementations
impl Freeze for TensorOp
impl !RefUnwindSafe for TensorOp
impl Send for TensorOp
impl Sync for TensorOp
impl Unpin for TensorOp
impl !UnwindSafe for TensorOp
Blanket Implementations

impl<T> BorrowMut<T> for T
where
    T: ?Sized,

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value.
impl<T> IntoEither for T

fn into_either(self, into_left: bool) -> Either<Self, Self>

Converts self into a Left variant of Either<Self, Self> if into_left is true. Converts self into a Right variant of Either<Self, Self> otherwise.

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>

Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true. Converts self into a Right variant of Either<Self, Self> otherwise.

impl<SS, SP> SupersetOf<SS> for SP
where
    SS: SubsetOf<SP>,
fn to_subset(&self) -> Option<SS>

The inverse inclusion map: attempts to construct self from the equivalent element of its superset.

fn is_in_subset(&self) -> bool

Checks if self is actually part of its subset T (and can be converted to it).

fn to_subset_unchecked(&self) -> SS

Use with care! Same as self.to_subset but without any property checks. Always succeeds.

fn from_subset(element: &SS) -> SP

The inclusion map: converts self to the equivalent element of its superset.