1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
//! Common types used across RuvLLM
//!
//! This module contains shared type definitions, enums, and data structures
//! used throughout the RuvLLM crate.
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use uuid::Uuid;
/// Model sizes that RuvLLM supports, identified by parameter count.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum ModelSize {
    /// 350M parameters: the fastest option, at lower quality.
    Tiny,
    /// 700M parameters: a balance between speed and quality.
    Small,
    /// 1.2B parameters: higher quality.
    Medium,
    /// 2.6B parameters: the highest quality and the slowest option.
    Large,
}
impl Default for ModelSize {
fn default() -> Self {
Self::Small
}
}
impl ModelSize {
/// Get the approximate parameter count
pub fn param_count(&self) -> usize {
match self {
Self::Tiny => 350_000_000,
Self::Small => 700_000_000,
Self::Medium => 1_200_000_000,
Self::Large => 2_600_000_000,
}
}
/// Get the model name string
pub fn name(&self) -> &'static str {
match self {
Self::Tiny => "350M",
Self::Small => "700M",
Self::Medium => "1.2B",
Self::Large => "2.6B",
}
}
}
/// Numeric precision levels available for quantization.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum Precision {
    /// 32-bit floating point (full precision).
    FP32,
    /// 16-bit floating point (half precision).
    FP16,
    /// Quantized to 8 bits.
    Q8,
    /// Quantized to 4 bits using K-quants.
    Q4K,
    /// Quantized to 4 bits using the standard scheme.
    Q4,
}
impl Default for Precision {
fn default() -> Self {
Self::FP16
}
}
impl Precision {
/// Get bytes per element
pub fn bytes_per_element(&self) -> f32 {
match self {
Self::FP32 => 4.0,
Self::FP16 => 2.0,
Self::Q8 => 1.0,
Self::Q4K => 0.5,
Self::Q4 => 0.5,
}
}
/// Get the compression ratio relative to FP32
pub fn compression_ratio(&self) -> f32 {
4.0 / self.bytes_per_element()
}
}
/// Kinds of allocation that share the unified memory pool.
///
/// Each variant carries the details specific to that kind of allocation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum AllocationType {
    /// Pages belonging to a KV cache.
    KvCache {
        /// ID of the session the pages are associated with.
        session_id: String,
        /// Cache tier the pages belong to.
        tier: String,
        /// How many pages were allocated.
        page_count: usize,
    },
    /// Weights of a LoRA adapter.
    LoraAdapter {
        /// Identifier of the adapter.
        adapter_id: String,
        /// Rank of the LoRA decomposition.
        rank: usize,
        /// How many layers the adapter covers.
        layer_count: usize,
    },
    /// Weights of the router.
    RouterWeights {
        /// Version number of the weights.
        version: u64,
    },
}
/// One tracked allocation: what it is, how large it is, and when it was
/// created and last used.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Allocation {
    /// Unique ID of this allocation.
    pub id: Uuid,
    /// What kind of allocation this is, with its kind-specific details.
    pub allocation_type: AllocationType,
    /// Size of the allocation, in bytes.
    pub size_bytes: usize,
    /// Eviction priority; allocations with lower values are evicted first.
    pub priority: f32,
    /// When the allocation was created (UTC).
    pub created_at: DateTime<Utc>,
    /// When the allocation was last accessed (UTC).
    pub last_accessed: DateTime<Utc>,
}
/// Usage statistics for the memory pool.
///
/// `Default` is derived, so a default value has every counter set to zero.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct MemoryStats {
    /// Total memory budget of the pool.
    pub total_budget: usize,
    /// Bytes currently allocated.
    pub allocated_bytes: usize,
    /// Count of active allocations.
    pub allocation_count: usize,
    /// Bytes allocated to KV caches.
    pub kv_cache_bytes: usize,
    /// Bytes allocated to LoRA adapters.
    pub lora_adapter_bytes: usize,
    /// Bytes allocated to router weights.
    pub router_bytes: usize,
}
/// Metadata recorded for a request so it can be tracked.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RequestMetadata {
    /// Unique ID of the request.
    pub request_id: Uuid,
    /// ID of the session the request belongs to.
    pub session_id: String,
    /// ID of the user, or `None` when it is not available.
    pub user_id: Option<String>,
    /// Timestamp of the request (UTC).
    pub timestamp: DateTime<Utc>,
    /// Number of input tokens.
    pub input_tokens: usize,
    /// Number of output tokens.
    pub output_tokens: usize,
}
/// Details of an error, as captured for witness logging.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ErrorInfo {
    /// Code identifying the error.
    pub code: String,
    /// Message describing the error.
    pub message: String,
    /// Stack trace, or `None` when one is not available.
    pub stack_trace: Option<String>,
    /// Whether recovery was attempted.
    pub recovery_attempted: bool,
}
/// Scores used to evaluate quality.
///
/// `Default` is derived, so a default value has every score set to `0.0`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct QualityMetrics {
    /// Overall quality score, in the range 0.0 - 1.0.
    pub overall_score: f32,
    /// Score for relevance.
    pub relevance: f32,
    /// Score for coherence.
    pub coherence: f32,
    /// Score for factuality.
    pub factuality: f32,
}
#[cfg(test)]
mod tests {
    use super::{ModelSize, Precision};

    /// Spot-checks the parameter count of the smallest size and the name of
    /// the largest.
    #[test]
    fn test_model_size() {
        let tiny_params = ModelSize::Tiny.param_count();
        assert_eq!(tiny_params, 350_000_000);

        let large_name = ModelSize::Large.name();
        assert_eq!(large_name, "2.6B");
    }

    /// Spot-checks the element width of FP32 and the compression ratio of Q4.
    #[test]
    fn test_precision() {
        let fp32_bytes = Precision::FP32.bytes_per_element();
        assert_eq!(fp32_bytes, 4.0);

        let q4_ratio = Precision::Q4.compression_ratio();
        assert_eq!(q4_ratio, 8.0);
    }
}