1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
//! LLM inference utilities: paged KV cache, block manager, prefix caching.
//!
//! This module implements a production-grade paged KV cache for LLM inference,
//! inspired by vLLM's PagedAttention architecture. Key features:
//!
//! - **Non-contiguous memory**: Keys and values are stored in fixed-size pages, allowing
//!   flexible memory allocation without external fragmentation.
//! - **Block manager**: Tracks page chains per sequence and handles allocation and eviction.
//! - **Prefix sharing**: A shared prefix cache enables KV reuse across requests with
//!   common prefixes (e.g., system prompts).
//! - **Paged attention**: Attention computation over non-contiguous page chains.
//!
//! ## Architecture
//!
//! ```text
//! ┌─────────────────────────────────────────────────────┐
//! │                     KvPagePool                      │
//! │  ┌────────┐  ┌────────┐  ┌────────┐  ┌────────┐     │
//! │  │ Page 0 │  │ Page 1 │  │ Page 2 │  │ Page 3 │ ... │
//! │  │[bs,H,D]│  │[bs,H,D]│  │[bs,H,D]│  │[bs,H,D]│     │
//! │  └────────┘  └────────┘  └────────┘  └────────┘     │
//! └─────────────────────────────────────────────────────┘
//!                            ↑
//!     BlockManager maps SeqId → [PageId, PageId, ...]
//! ```
//!
//! ## Example
//!
//! ```rust
//! use scirs2_neural::inference::{
//!     KvPageConfig, KvPagePool, BlockManagerConfig, BlockManager,
//!     PagedAttentionConfig, PagedAttentionForward,
//! };
//!
//! // Configure pages: block_size=16 tokens, 8 heads, head_dim=64
//! let page_cfg = KvPageConfig {
//!     block_size: 16,
//!     num_heads: 8,
//!     head_dim: 64,
//!     dtype_bytes: 4,
//! };
//! let pool = KvPagePool::<f32>::new(128, page_cfg);
//! let bm_cfg = BlockManagerConfig {
//!     max_sequences: 32,
//!     max_pages_per_seq: 64,
//! };
//! let _manager = BlockManager::<f32>::new(pool, bm_cfg);
//! ```
pub use ;
pub use ;
pub use ;
use crateNeuralError;
/// Errors specific to inference / paged KV cache operations.
/// Convenience alias for `Result<T, InferenceError>`.
pub type InferenceResult<T> = ;