//! Neural network layers for transformer architectures.
//!
//! This module provides the fundamental building blocks for constructing
//! transformer models. Each layer implements the [`Layer`](crate::traits::Layer)
//! trait and can be composed to create complete architectures.
//!
//! # Overview
//!
//! The layers module includes:
//!
//! - **Basic Layers**:
//!   - [`Linear`]: Fully connected linear transformation
//!   - [`Embedding`]: Token and position embeddings
//!   - [`LayerNorm`]: Layer normalization for training stability
//!   - [`FeedForward`]: Position-wise feed-forward networks
//!
//! - **Attention Mechanisms**:
//!   - [`MultiHeadAttention`]: Standard multi-head self/cross attention
//!   - [`FlashAttention`]: Memory-efficient attention with O(N) memory complexity
//!   - [`PagedAttention`]: Paged KV cache for efficient inference
//!   - [`SDPA`]: Scaled Dot-Product Attention with optimizations
//!   - [`MultiQueryAttention`]: Efficient attention with shared KV heads
//!   - [`GroupedQueryAttention`]: A balance between MHA and MQA
//!
//! # Example
//!
//! ```no_run
//! use trustformers_core::layers::{Linear, LayerNorm, MultiHeadAttention};
//! use trustformers_core::tensor::Tensor;
//! use trustformers_core::traits::Layer;
//!
//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
//! // Create layers for a transformer block
//! let attention = MultiHeadAttention::new(768, 12, 0.1, true)?;
//! let norm1 = LayerNorm::new(vec![768], 1e-5)?;
//! let ffn = Linear::new(768, 3072, true);
//! let norm2 = LayerNorm::new(vec![768], 1e-5)?;
//!
//! // Use in a forward pass
//! let input = Tensor::randn(&[2, 128, 768])?;
//! let attended = attention.forward(input)?;
//! # Ok(())
//! # }
//! ```
//!
//! # Performance Notes
//!
//! - Use `FlashAttention` for long sequences to reduce memory usage (see the sketch below)
//! - `PagedAttention` is well suited to inference with KV caching
//! - SIMD operations are used throughout for better CPU performance
//! - GPU acceleration is available with appropriate features enabled
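//!
//! As a rough sketch of the first point above, the snippet below swaps in
//! `FlashAttention` for a long sequence. It assumes `FlashAttention::new`
//! takes the same `(hidden_size, num_heads, dropout, causal)` arguments as
//! `MultiHeadAttention::new` in the example above; check the concrete
//! constructor signature before relying on it.
//!
//! ```no_run
//! use trustformers_core::layers::FlashAttention;
//! use trustformers_core::tensor::Tensor;
//! use trustformers_core::traits::Layer;
//!
//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
//! // Constructor arguments are assumed to mirror MultiHeadAttention::new.
//! let attention = FlashAttention::new(768, 12, 0.1, true)?;
//! // A long sequence (4096 tokens), where linear memory usage matters.
//! let input = Tensor::randn(&[1, 4096, 768])?;
//! let output = attention.forward(input)?;
//! # let _ = output;
//! # Ok(())
//! # }
//! ```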
// Re-exports for the layer types documented above. The original submodule
// paths were not preserved in this listing; the `attention`, `conv`,
// `dropout`, `embedding`, `feedforward`, `linear`, and `norm` module names
// below are assumptions based on the item names.
pub use attention::{
    FlashAttention, GroupedQueryAttention, MultiHeadAttention, MultiQueryAttention,
    PagedAttention, SDPA,
};
// Keep AttentionInput for backward compatibility
pub use attention::AttentionInput;
pub use conv::Conv2d;
pub use dropout::Dropout;
pub use embedding::Embedding;
pub use feedforward::FeedForward;
pub use linear::Linear;
pub use norm::LayerNorm;