1use crate::{DataType, Device, ModelId, ModelInfo, SamplingParams, SamplingPresets};
4use serde::{Deserialize, Serialize};
5use std::{collections::HashMap, time::Duration};
6
7#[derive(Debug, Clone, Serialize, Deserialize, Default)]
9pub struct EngineConfig {
10 pub model: EngineModelConfig,
11 pub scheduler: SchedulerConfig,
12 pub sampling: SamplingConfig,
13 pub backend: BackendConfig,
14 pub kv_cache: KvCacheConfig,
15 pub memory: MemoryConfig,
16 pub batching: BatchConfig,
17 pub monitoring: MonitoringConfig,
18}
19
20#[derive(Debug, Clone, Serialize, Deserialize)]
21pub struct EngineModelConfig {
22 pub model_id: ModelId,
23 pub model_info: Option<ModelInfo>,
24 pub tokenizer: TokenizerConfig,
25}
26
27impl Default for EngineModelConfig {
28 fn default() -> Self {
29 Self {
30 model_id: ModelId::new("default"),
31 model_info: None,
32 tokenizer: TokenizerConfig::default(),
33 }
34 }
35}
36
37#[derive(Debug, Clone, Serialize, Deserialize)]
39pub struct SchedulerConfig {
40 pub policy: SchedulingPolicy,
42 pub max_waiting_requests: usize,
44 pub max_running_requests: usize,
46 pub enable_preemption: bool,
48 pub enable_load_balancing: bool,
50 pub fair_share_weights: HashMap<String, f32>,
52 pub enable_sla_enforcement: bool,
54}
55
56impl Default for SchedulerConfig {
57 fn default() -> Self {
58 Self {
59 policy: SchedulingPolicy::Priority,
60 max_waiting_requests: 1000,
61 max_running_requests: 256,
62 enable_preemption: true,
63 enable_load_balancing: false,
64 fair_share_weights: HashMap::new(),
65 enable_sla_enforcement: false,
66 }
67 }
68}
69
70#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
72pub enum SchedulingPolicy {
73 FCFS,
75 Priority,
77 FairShare,
79 SJF,
81 RoundRobin,
83 ContinuousBatch,
85}
86
87#[derive(Debug, Clone, Serialize, Deserialize)]
89pub struct KvCacheConfig {
90 pub cache_type: KvCacheType,
92 pub block_size: usize,
94 pub max_blocks: usize,
96 pub enable_compression: bool,
98 pub compression_ratio: f32,
100 pub enable_multi_level: bool,
102 pub swap_threshold: f32,
104 pub enable_prefix_caching: bool,
106 pub prefix_cache_size: usize,
108}
109
110impl Default for KvCacheConfig {
111 fn default() -> Self {
112 Self {
113 cache_type: KvCacheType::Contiguous,
114 block_size: 16,
115 max_blocks: 1024,
116 enable_compression: false,
117 compression_ratio: 0.5,
118 enable_multi_level: true,
119 swap_threshold: 0.8,
120 enable_prefix_caching: true,
121 prefix_cache_size: 100,
122 }
123 }
124}
125
126#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
128pub enum KvCacheType {
129 Contiguous,
131 Paged,
133 Tree,
135}
136
137#[derive(Debug, Clone, Serialize, Deserialize)]
139pub struct MemoryConfig {
140 pub pool_size: Option<usize>,
142 pub enable_pooling: bool,
144 pub alignment: usize,
146 pub enable_defragmentation: bool,
148 pub defragmentation_threshold: f32,
150 pub enable_memory_stats: bool,
152 pub pressure_warning_threshold: f32,
154 pub pressure_critical_threshold: f32,
156}
157
158impl Default for MemoryConfig {
159 fn default() -> Self {
160 Self {
161 pool_size: None,
162 enable_pooling: true,
163 alignment: 256,
164 enable_defragmentation: false,
165 defragmentation_threshold: 0.7,
166 enable_memory_stats: true,
167 pressure_warning_threshold: 0.8,
168 pressure_critical_threshold: 0.95,
169 }
170 }
171}
172
173#[derive(Debug, Clone, Serialize, Deserialize)]
175pub struct BackendConfig {
176 pub backend_type: BackendType,
178 pub device: Device,
180 pub dtype: DataType,
182 pub enable_optimizations: bool,
184 pub optimization_level: u8,
186 pub enable_cuda_graphs: bool,
188 pub enable_kernel_fusion: bool,
190 pub backend_options: HashMap<String, serde_json::Value>,
192}
193
194impl Default for BackendConfig {
195 fn default() -> Self {
196 Self {
197 backend_type: BackendType::Candle,
198 device: Device::CPU,
199 dtype: DataType::FP16,
200 enable_optimizations: true,
201 optimization_level: 2,
202 enable_cuda_graphs: false,
203 enable_kernel_fusion: true,
204 backend_options: HashMap::new(),
205 }
206 }
207}
208
209#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
211pub enum BackendType {
212 Candle,
214 OnnxRuntime,
216 TensorRT,
218 Custom,
220}
221
222#[derive(Debug, Clone, Serialize, Deserialize)]
224pub struct TokenizerConfig {
225 pub tokenizer_type: TokenizerType,
227 pub tokenizer_path: Option<String>,
229 pub enable_fast: bool,
231 pub add_special_tokens: bool,
233 pub truncation: Option<TruncationConfig>,
235 pub padding: Option<PaddingConfig>,
237}
238
239impl Default for TokenizerConfig {
240 fn default() -> Self {
241 Self {
242 tokenizer_type: TokenizerType::BPE,
243 tokenizer_path: None,
244 enable_fast: true,
245 add_special_tokens: true,
246 truncation: None,
247 padding: None,
248 }
249 }
250}
251
252#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
254pub enum TokenizerType {
255 BPE,
257 WordPiece,
259 SentencePiece,
261 Tiktoken,
263 Custom,
265}
266
267#[derive(Debug, Clone, Serialize, Deserialize)]
269pub struct TruncationConfig {
270 pub max_length: usize,
272 pub strategy: TruncationStrategy,
274}
275
276#[derive(Debug, Clone, Serialize, Deserialize)]
278pub enum TruncationStrategy {
279 TruncateStart,
281 TruncateEnd,
283 TruncateBoth,
285}
286
287#[derive(Debug, Clone, Serialize, Deserialize)]
289pub struct PaddingConfig {
290 pub strategy: PaddingStrategy,
292 pub token_id: u32,
294 pub target_length: Option<usize>,
296}
297
298#[derive(Debug, Clone, Serialize, Deserialize)]
300pub enum PaddingStrategy {
301 None,
303 MaxLength,
305 FixedLength,
307}
308
309#[derive(Debug, Clone, Serialize, Deserialize)]
313pub struct SecurityConfig {
314 pub enable_auth: bool,
316 pub api_keys: Vec<String>,
318 pub enable_rate_limiting: bool,
320 pub rate_limit_rpm: u32,
322 pub enable_content_filter: bool,
324 pub max_prompt_length: usize,
326 pub enable_prompt_validation: bool,
328 pub allowed_extensions: Vec<String>,
330}
331
332impl Default for SecurityConfig {
333 fn default() -> Self {
334 Self {
335 enable_auth: false,
336 api_keys: vec![],
337 enable_rate_limiting: true,
338 rate_limit_rpm: 60,
339 enable_content_filter: false,
340 max_prompt_length: 32768,
341 enable_prompt_validation: true,
342 allowed_extensions: vec!["txt".to_string(), "json".to_string()],
343 }
344 }
345}
346
347#[derive(Debug, Clone, Serialize, Deserialize, Default)]
348pub struct SamplingConfig {
349 pub default_params: SamplingParams,
350 pub presets: SamplingPresets,
351 pub enable_custom_processors: bool,
352}
353
354#[derive(Debug, Clone, Serialize, Deserialize)]
355pub struct MonitoringConfig {
356 pub enable_metrics: bool,
357 pub enable_tracing: bool,
358 pub export_interval: Duration,
359}
360
361impl Default for MonitoringConfig {
362 fn default() -> Self {
363 Self {
364 enable_metrics: true,
365 enable_tracing: true,
366 export_interval: Duration::from_secs(5),
367 }
368 }
369}
370
371#[derive(Debug, Clone, Serialize, Deserialize)]
372pub struct BatchConfig {
373 pub max_batch_size: usize,
374 pub max_wait_ms: u64,
375 pub enable_dynamic: bool,
376 pub enable_continuous: bool,
377}
378
379impl Default for BatchConfig {
380 fn default() -> Self {
381 Self {
382 max_batch_size: 16,
383 max_wait_ms: 8,
384 enable_dynamic: true,
385 enable_continuous: false,
386 }
387 }
388}