1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
//! Sparse attention mechanisms
//!
//! This module provides attention mechanisms optimized for sparse data and sparse
//! attention patterns. These implementations are designed for transformer-based
//! architectures and other attention-based neural networks where sparsity can
//! significantly reduce computational complexity.
use crate::layers::linear::SparseLinear;
use crate::{CooTensor, CsrTensor, TorshResult};
use torsh_core::{Shape, TorshError};
use torsh_tensor::{creation::zeros, Tensor};
/// Sparse multi-head attention layer.
///
/// Holds four `SparseLinear` projections (query, key, value, output) together with
/// the head layout used to split `model_dim` features into `num_heads` heads of
/// `head_dim` features each. An optional sparse mask restricts which
/// query/key pairs take part in the softmax, which is useful for long sequences
/// or structured patterns such as local or strided attention.
///
/// # Mathematical Formulation
/// For standard attention: Attention(Q,K,V) = softmax(QK^T / √d_k)V
/// For sparse attention: only positions `(i, j)` whose column `j` is listed in
/// row `i` of the mask contribute a score; all other positions are excluded
/// from the softmax.
///
/// # Benefits
/// - Intended to reduce O(n²) complexity to O(s) where s is the number of sparse connections
/// - Maintains quality for many attention patterns (local, strided, etc.)
/// - Enables processing of much longer sequences
///
/// NOTE(review): `compute_sparse_attention` in this file still visits every
/// `(i, j)` pair and only skips the dot product for masked positions, so the
/// O(s) bound above is a design goal rather than a property of the current code.
#[derive(Debug, Clone)]
pub struct SparseAttention {
/// Projection applied to the query input before it is split into heads
query_proj: SparseLinear,
/// Projection applied to the key input before it is split into heads
key_proj: SparseLinear,
/// Projection applied to the value input before it is split into heads
value_proj: SparseLinear,
/// Projection applied to the merged head outputs
output_proj: SparseLinear,
/// Number of attention heads
num_heads: usize,
/// Features per head (`model_dim / num_heads`, computed in `new`)
head_dim: usize,
/// Model dimension (feature size of the inputs and of the output)
model_dim: usize,
/// Dropout probability. It is range-checked and stored by `new`, but no visible
/// code in this file applies it, hence the `dead_code` allowance.
#[allow(dead_code)]
dropout: f32,
/// Factor multiplied into every raw dot-product score (`1 / sqrt(head_dim)`, set in `new`)
scale: f32,
}
impl SparseAttention {
/// Create a new sparse attention layer
///
/// Builds the four `SparseLinear` projections (all `model_dim -> model_dim`,
/// without bias) and derives the per-head dimension and the score scaling
/// factor `1 / sqrt(head_dim)`.
///
/// # Arguments
/// * `model_dim` - Model dimension (must be non-zero and divisible by num_heads)
/// * `num_heads` - Number of attention heads (must be non-zero)
/// * `sparsity` - Sparsity level for projection layers (0.0 = dense, 1.0 = fully sparse)
/// * `dropout` - Dropout probability (validated and stored, currently unused)
///
/// # Returns
/// * `TorshResult<Self>` - New sparse attention layer or error
///
/// # Errors
/// Returns `TorshError::InvalidArgument` when `num_heads` or `model_dim` is
/// zero, when `model_dim` is not divisible by `num_heads`, or when `sparsity`
/// or `dropout` lies outside `0.0..=1.0` (NaN included). Errors from
/// `SparseLinear::new` are propagated unchanged.
///
/// # Example
/// ```rust
/// use torsh_sparse::nn::attention::SparseAttention;
///
/// // Create 8-head attention with 512 model dimension and 0.9 sparsity
/// let attention = SparseAttention::new(512, 8, 0.9, 0.1).expect("valid attention config");
/// ```
pub fn new(
    model_dim: usize,
    num_heads: usize,
    sparsity: f32,
    dropout: f32,
) -> TorshResult<Self> {
    // `%` and `/` by zero panic, so a zero head count must be rejected before
    // it is used as a divisor below.
    if num_heads == 0 {
        return Err(TorshError::InvalidArgument(
            "Number of heads must be greater than 0".to_string(),
        ));
    }
    // A zero model dimension would give `head_dim == 0` and an infinite scale.
    if model_dim == 0 {
        return Err(TorshError::InvalidArgument(
            "Model dimension must be greater than 0".to_string(),
        ));
    }
    if model_dim % num_heads != 0 {
        return Err(TorshError::InvalidArgument(
            "Model dimension must be divisible by number of heads".to_string(),
        ));
    }
    // `contains` is false for NaN, so NaN is rejected by both range checks.
    if !(0.0..=1.0).contains(&sparsity) {
        return Err(TorshError::InvalidArgument(
            "Sparsity must be between 0.0 and 1.0".to_string(),
        ));
    }
    if !(0.0..=1.0).contains(&dropout) {
        return Err(TorshError::InvalidArgument(
            "Dropout must be between 0.0 and 1.0".to_string(),
        ));
    }

    let head_dim = model_dim / num_heads;
    let scale = 1.0 / (head_dim as f32).sqrt();

    // Create projection layers with sparse weights
    let query_proj = SparseLinear::new(model_dim, model_dim, sparsity, false)?;
    let key_proj = SparseLinear::new(model_dim, model_dim, sparsity, false)?;
    let value_proj = SparseLinear::new(model_dim, model_dim, sparsity, false)?;
    let output_proj = SparseLinear::new(model_dim, model_dim, sparsity, false)?;

    Ok(Self {
        query_proj,
        key_proj,
        value_proj,
        output_proj,
        num_heads,
        head_dim,
        model_dim,
        dropout,
        scale,
    })
}
/// Forward pass with sparse attention mask
///
/// # Arguments
/// * `query` - Query tensor (batch_size, seq_len, model_dim)
/// * `key` - Key tensor (batch_size, seq_len, model_dim)
/// * `value` - Value tensor (batch_size, seq_len, model_dim)
/// * `attention_mask` - Optional sparse attention mask (seq_len, seq_len)
///
/// # Returns
/// * `TorshResult<Tensor>` - Attention output (batch_size, seq_len, model_dim)
///
/// # Note
/// If no attention mask is provided, standard dense attention is computed.
/// The attention mask should be a sparse matrix where non-zero entries
/// indicate which attention connections to compute.
pub fn forward(
&self,
query: &Tensor,
key: &Tensor,
value: &Tensor,
attention_mask: Option<&CsrTensor>,
) -> TorshResult<Tensor> {
let batch_size = query.shape().dims()[0];
let seq_len = query.shape().dims()[1];
// Validate input shapes
self.validate_inputs(query, key, value)?;
// Reshape 3D input to 2D for linear projections (batch*seq_len, model_dim)
let query_2d = self.reshape_3d_to_2d(query, batch_size, seq_len)?;
let key_2d = self.reshape_3d_to_2d(key, batch_size, seq_len)?;
let value_2d = self.reshape_3d_to_2d(value, batch_size, seq_len)?;
// Project to Q, K, V
let q_2d = self.query_proj.forward(&query_2d)?;
let k_2d = self.key_proj.forward(&key_2d)?;
let v_2d = self.value_proj.forward(&value_2d)?;
// Reshape back to 3D
let q = self.reshape_2d_to_3d(&q_2d, batch_size, seq_len)?;
let k = self.reshape_2d_to_3d(&k_2d, batch_size, seq_len)?;
let v = self.reshape_2d_to_3d(&v_2d, batch_size, seq_len)?;
// Reshape for multi-head attention
let q_reshaped = self.reshape_for_attention(&q, batch_size, seq_len)?;
let k_reshaped = self.reshape_for_attention(&k, batch_size, seq_len)?;
let v_reshaped = self.reshape_for_attention(&v, batch_size, seq_len)?;
// Compute attention scores with sparsity
let attention_output = self.compute_sparse_attention(
&q_reshaped,
&k_reshaped,
&v_reshaped,
batch_size,
seq_len,
attention_mask,
)?;
// Reshape back and apply output projection
let output_reshaped =
self.reshape_from_attention(&attention_output, batch_size, seq_len)?;
let output_2d = self.reshape_3d_to_2d(&output_reshaped, batch_size, seq_len)?;
let projected_2d = self.output_proj.forward(&output_2d)?;
self.reshape_2d_to_3d(&projected_2d, batch_size, seq_len)
}
/// Self-attention shortcut: attends a sequence to itself.
///
/// Equivalent to calling `forward` with the same tensor as query, key and
/// value.
///
/// # Arguments
/// * `input` - Input tensor (batch_size, seq_len, model_dim)
/// * `attention_mask` - Optional sparse attention mask
///
/// # Returns
/// * `TorshResult<Tensor>` - Self-attention output
pub fn self_attention(
    &self,
    input: &Tensor,
    attention_mask: Option<&CsrTensor>,
) -> TorshResult<Tensor> {
    // Query, key and value all alias the single input tensor.
    let (query, key, value) = (input, input, input);
    self.forward(query, key, value, attention_mask)
}
/// Create a local attention mask
///
/// Creates a sparse attention mask that only allows attention within
/// a local window around each position: row `i` gets an entry of `1.0` for
/// every column `j` with `i - window_size <= j <= i + window_size`, clipped to
/// `0..seq_len`.
///
/// # Arguments
/// * `seq_len` - Sequence length
/// * `window_size` - Size of the local window (total window = 2*window_size + 1).
///   Any value `>= seq_len` (including `usize::MAX`) produces a full mask.
///
/// # Returns
/// * `TorshResult<CsrTensor>` - Local attention mask of shape (seq_len, seq_len)
///
/// # Errors
/// Propagates errors from `CooTensor::new` and `CsrTensor::from_coo`.
pub fn create_local_attention_mask(
    seq_len: usize,
    window_size: usize,
) -> TorshResult<CsrTensor> {
    let mut row_indices = Vec::new();
    let mut col_indices = Vec::new();
    let mut values = Vec::new();

    for i in 0..seq_len {
        let start = i.saturating_sub(window_size);
        // Saturating arithmetic: `i + window_size + 1` overflows for very large
        // windows (panic in debug builds, wrap-around to `end <= i` in release
        // builds, which would drop the diagonal and everything after it).
        let end = i
            .saturating_add(window_size)
            .saturating_add(1)
            .min(seq_len);
        for j in start..end {
            row_indices.push(i);
            col_indices.push(j);
            values.push(1.0);
        }
    }

    let shape = Shape::new(vec![seq_len, seq_len]);
    let coo = CooTensor::new(row_indices, col_indices, values, shape)?;
    CsrTensor::from_coo(&coo)
}
/// Create a strided attention mask
///
/// Creates a sparse attention mask that combines a local window around each
/// position with connections at regular intervals: row `i` additionally
/// attends to every column `j` with `j % stride == i % stride`.
///
/// # Arguments
/// * `seq_len` - Sequence length
/// * `stride` - Stride between attended positions (must be greater than 0)
/// * `local_window` - Size of local window around each position
///
/// # Returns
/// * `TorshResult<CsrTensor>` - Strided attention mask of shape (seq_len, seq_len)
///
/// # Errors
/// Returns `TorshError::InvalidArgument` when `stride` is zero. Propagates
/// errors from `CooTensor::new` and `CsrTensor::from_coo`.
pub fn create_strided_attention_mask(
    seq_len: usize,
    stride: usize,
    local_window: usize,
) -> TorshResult<CsrTensor> {
    if stride == 0 {
        return Err(TorshError::InvalidArgument(
            "Stride must be greater than 0".to_string(),
        ));
    }

    let mut row_indices = Vec::new();
    let mut col_indices = Vec::new();
    let mut values = Vec::new();

    // NOTE(review): each row emits its local window first and the strided
    // columns afterwards, so column indices within a row are not necessarily
    // ascending. Presumably `CsrTensor::from_coo` accepts unsorted input -
    // confirm.
    for i in 0..seq_len {
        // Local window. Saturating arithmetic keeps a huge `local_window` from
        // overflowing `i + local_window + 1`.
        let local_start = i.saturating_sub(local_window);
        let local_end = i
            .saturating_add(local_window)
            .saturating_add(1)
            .min(seq_len);
        for j in local_start..local_end {
            row_indices.push(i);
            col_indices.push(j);
            values.push(1.0);
        }

        // Strided connections: every column in the same residue class as `i`.
        // `step_by` stops cleanly at the end of the range, whereas a manual
        // `j += stride` can overflow when `stride` is close to `usize::MAX`.
        for j in ((i % stride)..seq_len).step_by(stride) {
            // Skip columns the local window already covers to avoid duplicates.
            if j < local_start || j >= local_end {
                row_indices.push(i);
                col_indices.push(j);
                values.push(1.0);
            }
        }
    }

    let shape = Shape::new(vec![seq_len, seq_len]);
    let coo = CooTensor::new(row_indices, col_indices, values, shape)?;
    CsrTensor::from_coo(&coo)
}
/// Check that query, key and value have compatible shapes.
///
/// The checks run in this order, and the first failure is returned as
/// `TorshError::InvalidArgument`:
/// 1. all three tensors are 3D,
/// 2. all three share the same batch size,
/// 3. key and value share the same sequence length,
/// 4. the query's last dimension equals `model_dim`,
/// 5. the key's and value's last dimensions equal `model_dim`.
fn validate_inputs(&self, query: &Tensor, key: &Tensor, value: &Tensor) -> TorshResult<()> {
    let invalid = |message: &str| -> TorshResult<()> {
        Err(TorshError::InvalidArgument(message.to_string()))
    };

    let (q_shape, k_shape, v_shape) = (query.shape(), key.shape(), value.shape());

    // The rank check must come first: the indexing below assumes three dims.
    let all_three_dimensional = [q_shape.ndim(), k_shape.ndim(), v_shape.ndim()]
        .iter()
        .all(|&rank| rank == 3);
    if !all_three_dimensional {
        return invalid("Input tensors must be 3D (batch, seq_len, model_dim)");
    }

    let (q_dims, k_dims, v_dims) = (q_shape.dims(), k_shape.dims(), v_shape.dims());

    let batch = q_dims[0];
    if [k_dims[0], v_dims[0]].iter().any(|&other| other != batch) {
        return invalid("Batch sizes must match across Q, K, V");
    }

    if k_dims[1] != v_dims[1] {
        return invalid("Key and Value sequence lengths must match");
    }

    if q_dims[2] != self.model_dim {
        return invalid("Query dimension doesn't match model dimension");
    }

    let key_value_match_model = k_dims[2] == self.model_dim && v_dims[2] == self.model_dim;
    if !key_value_match_model {
        return invalid("Key/Value dimensions don't match model dimension");
    }

    Ok(())
}
/// Split the feature axis into heads.
///
/// Copies a `(batch, seq_len, model_dim)` tensor element by element into a new
/// `(batch, num_heads, seq_len, head_dim)` tensor, where feature
/// `h * head_dim + d` of the input becomes element `d` of head `h`.
fn reshape_for_attention(
    &self,
    tensor: &Tensor,
    batch_size: usize,
    seq_len: usize,
) -> TorshResult<Tensor> {
    let (heads, width) = (self.num_heads, self.head_dim);
    let split = zeros::<f32>(&[batch_size, heads, seq_len, width])?;

    for batch in 0..batch_size {
        for position in 0..seq_len {
            // Walk the flat feature index and derive (head, offset) from it.
            // The range is empty when `width` is zero, so the division below
            // is never evaluated with a zero divisor.
            for feature in 0..heads * width {
                let (head, offset) = (feature / width, feature % width);
                let element = tensor.get(&[batch, position, feature])?;
                split.set(&[batch, head, position, offset], element)?;
            }
        }
    }

    Ok(split)
}
/// Merge the heads back into a single feature axis.
///
/// Copies a `(batch, num_heads, seq_len, head_dim)` tensor element by element
/// into a new `(batch, seq_len, model_dim)` tensor, where element `d` of head
/// `h` lands at feature `h * head_dim + d`.
fn reshape_from_attention(
    &self,
    tensor: &Tensor,
    batch_size: usize,
    seq_len: usize,
) -> TorshResult<Tensor> {
    let (heads, width) = (self.num_heads, self.head_dim);
    let merged = zeros::<f32>(&[batch_size, seq_len, self.model_dim])?;

    for batch in 0..batch_size {
        for position in 0..seq_len {
            for head in 0..heads {
                // First feature slot belonging to this head.
                let base = head * width;
                for offset in 0..width {
                    let element = tensor.get(&[batch, head, position, offset])?;
                    merged.set(&[batch, position, base + offset], element)?;
                }
            }
        }
    }

    Ok(merged)
}
/// Compute sparse attention with optional attention mask
#[allow(clippy::too_many_arguments)]
fn compute_sparse_attention(
&self,
query: &Tensor,
key: &Tensor,
value: &Tensor,
batch_size: usize,
seq_len: usize,
attention_mask: Option<&CsrTensor>,
) -> TorshResult<Tensor> {
let output = zeros::<f32>(&[batch_size, self.num_heads, seq_len, self.head_dim])?;
for b in 0..batch_size {
for h in 0..self.num_heads {
// Compute attention scores for this head
let scores = zeros::<f32>(&[seq_len, seq_len])?;
for i in 0..seq_len {
for j in 0..seq_len {
// Check if this position should be computed (based on sparse mask)
let should_compute = if let Some(mask) = attention_mask {
// Only compute if position is non-zero in mask
let (cols, _) = mask.get_row(i)?;
cols.contains(&j)
} else {
true // Compute all positions if no mask
};
if should_compute {
let mut score = 0.0;
for d in 0..self.head_dim {
score += query.get(&[b, h, i, d])? * key.get(&[b, h, j, d])?;
}
scores.set(&[i, j], score * self.scale)?;
} else {
scores.set(&[i, j], f32::NEG_INFINITY)?;
}
}
}
// Apply softmax to attention scores
for i in 0..seq_len {
// Find max for numerical stability
let mut max_score = f32::NEG_INFINITY;
for j in 0..seq_len {
let score = scores.get(&[i, j])?;
if score > max_score && score != f32::NEG_INFINITY {
max_score = score;
}
}
// Compute softmax
let mut sum_exp = 0.0;
let mut exp_scores = vec![0.0; seq_len];
#[allow(clippy::needless_range_loop)]
for j in 0..seq_len {
let score = scores.get(&[i, j])?;
if score != f32::NEG_INFINITY {
exp_scores[j] = (score - max_score).exp();
sum_exp += exp_scores[j];
}
}
// Normalize and apply to values
for d in 0..self.head_dim {
let mut weighted_sum = 0.0;
#[allow(clippy::needless_range_loop)]
for j in 0..seq_len {
if exp_scores[j] > 0.0 {
let attention_weight = exp_scores[j] / sum_exp;
weighted_sum += attention_weight * value.get(&[b, h, j, d])?;
}
}
output.set(&[b, h, i, d], weighted_sum)?;
}
}
}
}
Ok(output)
}
/// Total parameter count of the layer.
///
/// Sums `num_parameters()` over the query, key, value and output projections.
pub fn num_parameters(&self) -> usize {
    [
        &self.query_proj,
        &self.key_proj,
        &self.value_proj,
        &self.output_proj,
    ]
    .iter()
    .map(|projection| projection.num_parameters())
    .sum()
}
/// Returns the model dimension (`model_dim`) stored in this layer, i.e. the
/// feature size `forward` requires on the last axis of its inputs.
pub fn model_dim(&self) -> usize {
self.model_dim
}
/// Returns the number of attention heads the feature axis is split into.
pub fn num_heads(&self) -> usize {
self.num_heads
}
/// Returns the number of features per head (`model_dim / num_heads`, as
/// computed in `new`).
pub fn head_dim(&self) -> usize {
self.head_dim
}
/// Returns the factor applied to raw dot-product scores
/// (`1 / sqrt(head_dim)`, as computed in `new`).
pub fn scale(&self) -> f32 {
self.scale
}
/// Flatten the batch and sequence axes for the linear projections.
///
/// Copies a `(batch, seq_len, model_dim)` tensor element by element into a new
/// `(batch * seq_len, model_dim)` tensor; position `(b, s)` becomes row
/// `b * seq_len + s`.
fn reshape_3d_to_2d(
    &self,
    tensor: &Tensor,
    batch_size: usize,
    seq_len: usize,
) -> TorshResult<Tensor> {
    let width = self.model_dim;
    let total_rows = batch_size * seq_len;
    let flat = zeros::<f32>(&[total_rows, width])?;

    // The range is empty when `seq_len` is zero, so the division below never
    // sees a zero divisor.
    for row in 0..total_rows {
        let (batch, position) = (row / seq_len, row % seq_len);
        for feature in 0..width {
            let element = tensor.get(&[batch, position, feature])?;
            flat.set(&[row, feature], element)?;
        }
    }

    Ok(flat)
}
/// Restore the batch and sequence axes after a linear projection.
///
/// Copies a `(batch * seq_len, model_dim)` tensor element by element into a new
/// `(batch, seq_len, model_dim)` tensor; row `b * seq_len + s` becomes position
/// `(b, s)`.
fn reshape_2d_to_3d(
    &self,
    tensor: &Tensor,
    batch_size: usize,
    seq_len: usize,
) -> TorshResult<Tensor> {
    let width = self.model_dim;
    let unflattened = zeros::<f32>(&[batch_size, seq_len, width])?;

    for batch in 0..batch_size {
        // First flat row belonging to this batch entry.
        let row_base = batch * seq_len;
        for position in 0..seq_len {
            let row = row_base + position;
            for feature in 0..width {
                let element = tensor.get(&[row, feature])?;
                unflattened.set(&[batch, position, feature], element)?;
            }
        }
    }

    Ok(unflattened)
}
}
#[cfg(test)]
mod tests {
    //! Unit tests for `SparseAttention`: constructor validation, mask builders,
    //! output shapes of the forward pass, and input-shape validation.
    use super::*;
    use crate::SparseTensor;
    use torsh_tensor::creation::ones;

    /// A valid configuration exposes the derived head layout and scale.
    #[test]
    fn test_sparse_attention_creation() {
        let attention =
            SparseAttention::new(64, 8, 0.5, 0.1).expect("valid attention config should build");
        assert_eq!(attention.model_dim(), 64);
        assert_eq!(attention.num_heads(), 8);
        assert_eq!(attention.head_dim(), 8);
        // scale = 1 / sqrt(head_dim)
        assert!((attention.scale() - 1.0 / 8.0_f32.sqrt()).abs() < 1e-6);
        assert!(attention.num_parameters() > 0);
    }

    #[test]
    fn test_invalid_model_dim() {
        // Model dim not divisible by num_heads
        assert!(SparseAttention::new(65, 8, 0.5, 0.1).is_err());
    }

    #[test]
    fn test_invalid_sparsity() {
        assert!(SparseAttention::new(64, 8, 1.5, 0.1).is_err());
        assert!(SparseAttention::new(64, 8, -0.1, 0.1).is_err());
    }

    #[test]
    fn test_invalid_dropout() {
        assert!(SparseAttention::new(64, 8, 0.5, 1.5).is_err());
        assert!(SparseAttention::new(64, 8, 0.5, -0.1).is_err());
    }

    #[test]
    fn test_sparse_attention_forward() {
        let attention =
            SparseAttention::new(32, 4, 0.3, 0.1).expect("valid attention config should build");
        let query = ones::<f32>(&[2, 5, 32]).expect("query tensor creation should succeed");
        let key = ones::<f32>(&[2, 5, 32]).expect("key tensor creation should succeed");
        let value = ones::<f32>(&[2, 5, 32]).expect("value tensor creation should succeed");
        let output = attention
            .forward(&query, &key, &value, None)
            .expect("forward pass should succeed");
        assert_eq!(output.shape().dims(), &[2, 5, 32]);
    }

    #[test]
    fn test_self_attention() {
        let attention =
            SparseAttention::new(16, 2, 0.4, 0.0).expect("valid attention config should build");
        let input = ones::<f32>(&[1, 4, 16]).expect("input tensor creation should succeed");
        let output = attention
            .self_attention(&input, None)
            .expect("self-attention should succeed");
        assert_eq!(output.shape().dims(), &[1, 4, 16]);
    }

    #[test]
    fn test_local_attention_mask() {
        let mask = SparseAttention::create_local_attention_mask(5, 1)
            .expect("local mask creation should succeed");
        assert_eq!(mask.shape().dims(), &[5, 5]);
        // Rows 0 and 4 are clipped to 2 entries, rows 1..=3 have 3: 2 + 3*3 + 2.
        assert_eq!(mask.nnz(), 13);
    }

    #[test]
    fn test_strided_attention_mask() {
        let mask = SparseAttention::create_strided_attention_mask(8, 2, 1)
            .expect("strided mask creation should succeed");
        assert_eq!(mask.shape().dims(), &[8, 8]);
        // Local window: 2 + 6*3 + 2 = 22 entries. Each row has 4 columns of
        // matching parity, one of which (the diagonal) is already in the local
        // window, so 3 strided entries per row: 8*3 = 24.
        assert_eq!(mask.nnz(), 46);
    }

    #[test]
    fn test_attention_with_local_mask() {
        let attention =
            SparseAttention::new(16, 2, 0.2, 0.0).expect("valid attention config should build");
        let input = ones::<f32>(&[1, 4, 16]).expect("input tensor creation should succeed");
        let mask = SparseAttention::create_local_attention_mask(4, 1)
            .expect("local mask creation should succeed");
        let output = attention
            .self_attention(&input, Some(&mask))
            .expect("masked self-attention should succeed");
        assert_eq!(output.shape().dims(), &[1, 4, 16]);
    }

    #[test]
    fn test_dimension_validation() {
        let attention =
            SparseAttention::new(32, 4, 0.3, 0.1).expect("valid attention config should build");
        // Wrong model dim
        let wrong_query = ones::<f32>(&[2, 5, 16]).expect("query tensor creation should succeed");
        let key = ones::<f32>(&[2, 5, 32]).expect("key tensor creation should succeed");
        let value = ones::<f32>(&[2, 5, 32]).expect("value tensor creation should succeed");
        assert!(attention.forward(&wrong_query, &key, &value, None).is_err());
    }

    #[test]
    fn test_batch_size_validation() {
        let attention =
            SparseAttention::new(16, 2, 0.3, 0.1).expect("valid attention config should build");
        let query = ones::<f32>(&[2, 5, 16]).expect("query tensor creation should succeed");
        // Different batch size
        let key = ones::<f32>(&[3, 5, 16]).expect("key tensor creation should succeed");
        let value = ones::<f32>(&[2, 5, 16]).expect("value tensor creation should succeed");
        assert!(attention.forward(&query, &key, &value, None).is_err());
    }

    #[test]
    fn test_sequence_length_validation() {
        let attention =
            SparseAttention::new(16, 2, 0.3, 0.1).expect("valid attention config should build");
        let query = ones::<f32>(&[2, 5, 16]).expect("query tensor creation should succeed");
        // Key/value are shorter than the query
        let key = ones::<f32>(&[2, 4, 16]).expect("key tensor creation should succeed");
        let value = ones::<f32>(&[2, 4, 16]).expect("value tensor creation should succeed");
        assert!(attention.forward(&query, &key, &value, None).is_err());
    }

    #[test]
    fn test_invalid_stride() {
        assert!(SparseAttention::create_strided_attention_mask(8, 0, 1).is_err());
    }
}