1use crate::{ids::*, Device, EngineConfig, RequestId};
4use chrono::{DateTime, Utc};
5use serde::{Deserialize, Serialize};
6use serde_json::Value;
7use std::collections::HashMap;
8
/// Point-in-time snapshot of the engine's overall runtime state,
/// suitable for status/health endpoints (serde-serializable).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EngineStatus {
    /// Whether the engine is ready to serve requests.
    pub is_ready: bool,
    /// Models currently loaded.
    pub loaded_models: Vec<ModelId>,
    /// Requests currently executing.
    pub active_requests: usize,
    /// Requests waiting in the queue.
    pub queued_requests: usize,
    /// Current memory consumption breakdown.
    pub memory_usage: MemoryUsage,
    /// Seconds since the engine started.
    pub uptime_seconds: u64,
    /// Timestamp of the most recent heartbeat.
    pub last_heartbeat: DateTime<Utc>,
    /// Engine version string.
    pub version: String,
}
29
30#[derive(Debug, Clone, Serialize, Deserialize)]
32pub struct MemoryUsage {
33 pub total_bytes: usize,
35 pub used_bytes: usize,
37 pub free_bytes: usize,
39 pub gpu_memory_bytes: Option<usize>,
41 pub cpu_memory_bytes: Option<usize>,
43 pub cache_memory_bytes: usize,
45 pub utilization_percent: f32,
47}
48
49impl MemoryUsage {
50 pub fn calculate_utilization(&mut self) {
52 if self.total_bytes > 0 {
53 self.utilization_percent = (self.used_bytes as f32 / self.total_bytes as f32) * 100.0;
54 } else {
55 self.utilization_percent = 0.0;
56 }
57 }
58}
59
/// Aggregate counters and timings reported by the request scheduler.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SchedulerStats {
    /// Requests waiting to be scheduled.
    pub waiting_requests: usize,
    /// Requests currently running.
    pub running_requests: usize,
    /// Requests that have been preempted.
    pub preempted_requests: usize,
    pub completed_requests: u64,
    pub failed_requests: u64,
    pub cancelled_requests: u64,
    /// Average time spent waiting before execution, in milliseconds.
    pub avg_wait_time_ms: f64,
    /// Average execution time, in milliseconds.
    pub avg_execution_time_ms: f64,
    /// Throughput in requests per second.
    pub throughput_rps: f64,
    // NOTE(review): presumably a 0.0–1.0 fraction of queue capacity in use — confirm with producer.
    pub queue_utilization: f32,
}
84
/// KV-cache block statistics.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CacheStats {
    pub total_blocks: usize,
    pub used_blocks: usize,
    pub free_blocks: usize,
    // NOTE(review): scale (fraction vs percent) not established here — confirm with producer.
    pub cache_hit_rate: f32,
    pub cache_hits: u64,
    pub cache_misses: u64,
    /// Number of cache evictions performed.
    pub eviction_count: u64,
    pub avg_block_utilization: f32,
    /// Prefix-cache statistics, when prefix caching is in use.
    pub prefix_cache_stats: Option<PrefixCacheStats>,
}

/// Statistics for the prefix cache.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PrefixCacheStats {
    /// Number of prefixes currently cached.
    pub cached_prefixes: usize,
    pub prefix_hit_rate: f32,
    // NOTE(review): unit of prefix length (tokens?) not established here — confirm.
    pub avg_prefix_length: f32,
    /// Memory saved by prefix sharing, in bytes.
    pub memory_saved_bytes: u64,
}
120
/// Metrics describing batch formation and execution.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BatchingMetrics {
    pub avg_batch_size: f32,
    pub batch_utilization: f32,
    pub batches_created: u64,
    pub batches_completed: u64,
    /// Average time per batch, in milliseconds.
    pub avg_batch_time_ms: f64,
    pub tokens_per_second: f64,
}

/// Request-latency distribution; all values in milliseconds.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LatencyMetrics {
    pub avg_latency_ms: f64,
    pub p50_latency_ms: f64,
    pub p90_latency_ms: f64,
    pub p95_latency_ms: f64,
    pub p99_latency_ms: f64,
    pub p999_latency_ms: f64,
    /// Average time to first token.
    pub avg_ttft_ms: f64,
    /// Average latency between consecutive generated tokens.
    pub avg_inter_token_latency_ms: f64,
}

/// Throughput rates along several axes.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ThroughputMetrics {
    pub requests_per_second: f64,
    pub tokens_per_second: f64,
    pub characters_per_second: f64,
    pub batches_per_second: f64,
    /// Highest observed tokens-per-second.
    pub peak_tokens_per_second: f64,
}
173
/// Per-model execution metrics.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelMetrics {
    /// Model these metrics belong to.
    pub model_id: ModelId,
    pub forward_passes: u64,
    /// Average forward-pass time, in milliseconds.
    pub avg_forward_time_ms: f64,
    /// Metrics for the prefill phase.
    pub prefill_metrics: PhaseMetrics,
    /// Metrics for the decode phase.
    pub decode_metrics: PhaseMetrics,
    pub tokens_generated: u64,
}

/// Counters and timings for a single execution phase (prefill or decode).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PhaseMetrics {
    /// Number of operations executed in this phase.
    pub operations: u64,
    pub avg_time_ms: f64,
    pub total_time_ms: f64,
    pub tokens_processed: u64,
}
203
/// Lifecycle metrics for a single request.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RequestMetrics {
    pub request_id: RequestId,
    /// Submitting client, when known.
    pub client_id: Option<ClientId>,
    pub model_id: ModelId,
    /// When the request was created.
    pub created_at: DateTime<Utc>,
    /// Completion time; `None` while still in flight.
    pub completed_at: Option<DateTime<Utc>>,
    pub total_time_ms: u64,
    /// Time spent queued before execution, in milliseconds.
    pub queue_time_ms: u64,
    pub prefill_time_ms: u64,
    pub decode_time_ms: u64,
    pub input_tokens: usize,
    pub output_tokens: usize,
    /// Whether the request was ever preempted.
    pub was_preempted: bool,
    /// How many times the request was preempted.
    pub preemption_count: u32,
}
234
235#[derive(Debug, Clone, Serialize, Deserialize)]
237pub struct SystemMetrics {
238 pub cpu_usage_percent: f32,
240 pub memory_usage: MemoryUsage,
242 pub gpu_utilization: Option<GpuMetrics>,
244 pub network_io: NetworkMetrics,
246 pub disk_io: DiskMetrics,
248 pub load_average: [f32; 3], }
251
/// GPU utilization and health readings.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GpuMetrics {
    pub utilization_percent: f32,
    pub memory_used_bytes: usize,
    pub memory_total_bytes: usize,
    pub temperature_celsius: f32,
    /// Current power draw, in watts.
    pub power_watts: f32,
}

/// Network I/O rates.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NetworkMetrics {
    /// Bytes received per second.
    pub rx_bytes_per_sec: u64,
    /// Bytes transmitted per second.
    pub tx_bytes_per_sec: u64,
    pub rx_packets_per_sec: u64,
    pub tx_packets_per_sec: u64,
}

/// Disk I/O rates.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DiskMetrics {
    pub read_bytes_per_sec: u64,
    pub write_bytes_per_sec: u64,
    pub read_ops_per_sec: u64,
    pub write_ops_per_sec: u64,
}
292
/// Aggregated error accounting.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct ErrorStats {
    pub total_errors: u64,
    /// Error counts keyed by error-type name.
    pub errors_by_type: HashMap<String, u64>,
    // NOTE(review): scale (errors/sec? fraction of requests?) not established here — confirm.
    pub error_rate: f32,
    /// Most recent error events; retention policy is set by the producer.
    pub recent_errors: Vec<ErrorEvent>,
}
305
306#[derive(Debug, Clone, Serialize, Deserialize)]
308pub struct ErrorEvent {
309 pub timestamp: DateTime<Utc>,
311 pub error_type: String,
313 pub message: String,
315 pub request_id: Option<RequestId>,
317 pub context: HashMap<String, serde_json::Value>,
319}
320
/// Top-level engine metrics aggregate exposed to monitoring.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct EngineMetrics {
    pub total_requests: u64,
    pub successful_requests: u64,
    pub failed_requests: u64,
    pub avg_request_latency_ms: f64,
    pub p95_request_latency_ms: f64,
    pub p99_request_latency_ms: f64,
    /// Requests per second.
    pub throughput_rps: f32,
    pub tokens_per_second: f32,
    pub queue_metrics: QueueMetrics,
    pub resource_utilization: ResourceMetrics,
    pub error_stats: ErrorStats,
    pub performance_breakdown: PerformanceBreakdown,
}

/// Request-queue metrics.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct QueueMetrics {
    pub current_queue_length: usize,
    pub avg_queue_wait_time_ms: f64,
    pub queue_throughput_rps: f32,
    // NOTE(review): scale of the rejection rate not established here — confirm with producer.
    pub queue_rejection_rate: f32,
}

/// Per-resource utilization readings.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct ResourceMetrics {
    pub cpu_utilization: f32,
    pub memory_utilization: f32,
    /// GPU utilization; `None` when no GPU is tracked.
    pub gpu_utilization: Option<f32>,
    pub network_utilization: f32,
    pub disk_utilization: f32,
}

/// Where request-processing time is spent; all values in milliseconds.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct PerformanceBreakdown {
    pub tokenization_time_ms: f64,
    pub model_execution_time_ms: f64,
    pub sampling_time_ms: f64,
    pub scheduling_time_ms: f64,
    pub memory_operations_time_ms: f64,
    /// Time not attributed to any of the above categories.
    pub other_overhead_time_ms: f64,
}
364
/// Overall engine health: a summary level plus per-component detail.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthStatus {
    /// Summary health level.
    pub status: HealthStatusType,
    /// Health of each engine subsystem.
    pub component_status: ComponentStatus,
    /// When this health check was performed.
    pub last_check: DateTime<Utc>,
}
375
376impl HealthStatus {
377 pub fn healthy() -> Self {
379 Self {
380 status: HealthStatusType::Healthy,
381 component_status: ComponentStatus::healthy(),
382 last_check: Utc::now(),
383 }
384 }
385}
386
/// Coarse health level.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
pub enum HealthStatusType {
    /// Operating normally.
    Healthy,
    /// Degraded but still operating.
    Warning,
    /// Not operating correctly.
    Unhealthy,
}
397
/// Health report for a single engine component.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ComponentHealth {
    pub status: HealthStatusType,
    /// Human-readable status message.
    pub message: String,
    /// Named numeric readings backing the status.
    pub metrics: HashMap<String, f64>,
    /// When this component was last checked.
    pub last_check: DateTime<Utc>,
}
410
411impl ComponentHealth {
412 pub fn healthy(component: &str) -> Self {
414 Self {
415 status: HealthStatusType::Healthy,
416 message: format!("{} healthy", component),
417 metrics: HashMap::new(),
418 last_check: Utc::now(),
419 }
420 }
421}
422
/// Health of each engine subsystem, one field per component.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ComponentStatus {
    pub scheduler: ComponentHealth,
    pub model_executor: ComponentHealth,
    pub tokenizer: ComponentHealth,
    pub kv_cache: ComponentHealth,
    pub memory_manager: ComponentHealth,
    pub backend: ComponentHealth,
}
433
434impl ComponentStatus {
435 pub fn healthy() -> Self {
437 Self {
438 scheduler: ComponentHealth::healthy("scheduler"),
439 model_executor: ComponentHealth::healthy("model"),
440 tokenizer: ComponentHealth::healthy("tokenizer"),
441 kv_cache: ComponentHealth::healthy("kv_cache"),
442 memory_manager: ComponentHealth::healthy("memory"),
443 backend: ComponentHealth::healthy("backend"),
444 }
445 }
446
447 pub fn entries(&self) -> [&ComponentHealth; 6] {
449 [
450 &self.scheduler,
451 &self.model_executor,
452 &self.tokenizer,
453 &self.kv_cache,
454 &self.memory_manager,
455 &self.backend,
456 ]
457 }
458}
459
/// Per-component health uses the same status levels as the overall engine.
pub type ComponentHealthStatus = HealthStatusType;
462
/// Full diagnostics report combining configuration, metrics and analysis.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DiagnosticsReport {
    /// Engine configuration at report time.
    pub config_snapshot: EngineConfig,
    pub current_metrics: EngineMetrics,
    pub resource_usage: DetailedResourceUsage,
    pub performance_analysis: PerformanceAnalysis,
    /// Free-form diagnostics keyed by component name.
    pub component_diagnostics: HashMap<String, Value>,
    pub system_info: SystemInfo,
}

/// Fine-grained resource-usage breakdown.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct DetailedResourceUsage {
    /// Memory use keyed by component name (bytes assumed from field type — confirm with producer).
    pub memory_by_component: HashMap<String, u64>,
    /// CPU use keyed by thread name.
    pub cpu_by_thread: HashMap<String, f32>,
    /// GPU memory detail; `None` when unavailable.
    pub gpu_memory_details: Option<GpuMemoryDetails>,
    pub network_io_details: NetworkIODetails,
}

/// GPU memory accounting detail.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct GpuMemoryDetails {
    pub total_memory: u64,
    pub used_memory: u64,
    /// Memory use keyed by allocation type.
    pub memory_by_type: HashMap<String, u64>,
    /// Notable large allocations.
    pub large_allocations: Vec<AllocationInfo>,
}

/// A single recorded memory allocation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AllocationInfo {
    // NOTE(review): size unit (bytes?) not established here — confirm with producer.
    pub size: u64,
    pub allocation_type: String,
    /// When the allocation was made.
    pub timestamp: DateTime<Utc>,
}

/// Detailed network I/O readings.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct NetworkIODetails {
    pub bytes_received_per_sec: u64,
    pub bytes_sent_per_sec: u64,
    /// Number of open connections.
    pub connection_count: usize,
    pub request_rate_per_sec: f32,
}
514
/// Results of automated performance analysis.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct PerformanceAnalysis {
    /// Detected bottlenecks.
    pub bottlenecks: Vec<PerformanceBottleneck>,
    /// Suggested remediations.
    pub recommendations: Vec<PerformanceRecommendation>,
    /// Direction metrics are trending.
    pub trends: PerformanceTrends,
}

/// A detected performance bottleneck.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceBottleneck {
    /// Category label for the bottleneck.
    pub bottleneck_type: String,
    // NOTE(review): severity scale not established here — confirm with producer.
    pub severity: f32,
    /// Human-readable description.
    pub description: String,
    /// Estimated impact on performance.
    pub performance_impact: f32,
}

/// A suggested tuning or configuration change.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceRecommendation {
    pub category: String,
    pub description: String,
    /// Estimated benefit of applying the recommendation.
    pub expected_impact: f32,
    /// Estimated effort of applying it.
    pub complexity: f32,
}

/// Trend direction per tracked dimension.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct PerformanceTrends {
    pub latency_trend: TrendDirection,
    pub throughput_trend: TrendDirection,
    pub error_rate_trend: TrendDirection,
    pub resource_utilization_trend: TrendDirection,
}
549
/// Direction a metric is moving over time.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
pub enum TrendDirection {
    Improving,
    Stable,
    Degrading,
    /// No consistent direction.
    Volatile,
}
558
559impl Default for TrendDirection {
560 fn default() -> Self {
561 TrendDirection::Stable
562 }
563}
564
/// Static description of the host system.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct SystemInfo {
    /// Operating system description.
    pub os: String,
    /// CPU description.
    pub cpu_info: String,
    // NOTE(review): memory unit (bytes?) not established here — confirm with producer.
    pub total_memory: u64,
    /// Compute devices available to the engine.
    pub devices: Vec<Device>,
    pub runtime_info: RuntimeInfo,
}

/// Information about the running engine binary.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct RuntimeInfo {
    /// Rust toolchain version the engine was built with.
    pub rust_version: String,
    /// Engine version string.
    pub engine_version: String,
    pub build_info: BuildInfo,
    /// Compile-time feature flags enabled in this build.
    pub feature_flags: Vec<String>,
}

/// Build provenance details.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct BuildInfo {
    pub build_timestamp: String,
    /// Git commit hash, when available at build time.
    pub git_commit: Option<String>,
    /// Build configuration label (e.g. debug/release — confirm with producer).
    pub build_config: String,
    pub compiler_flags: Vec<String>,
}
592
/// Point-in-time snapshot of engine configuration, metrics and
/// in-flight requests.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EngineState {
    pub config: EngineConfig,
    pub metrics: EngineMetrics,
    /// Free-form per-component state keyed by component name.
    pub component_states: HashMap<String, Value>,
    /// Requests in flight when the snapshot was taken.
    pub active_requests: Vec<EngineRequestState>,
    /// When the snapshot was taken.
    pub timestamp: DateTime<Utc>,
}

/// State of one in-flight request.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EngineRequestState {
    pub request_id: RequestId,
    /// Name of the processing phase the request is in.
    pub current_phase: String,
    pub progress: EngineRequestProgress,
    /// Free-form description of resources held by the request.
    pub allocated_resources: HashMap<String, Value>,
}

/// Progress counters for an in-flight request.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct EngineRequestProgress {
    pub tokens_processed: usize,
    pub tokens_remaining: usize,
    pub elapsed_time_ms: u64,
    /// Estimated remaining time, when an estimate is available.
    pub estimated_remaining_ms: Option<u64>,
}
620
/// Configuration for speculative decoding.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SpeculationConfig {
    /// Number of tokens to speculate ahead.
    pub speculation_depth: usize,
    /// Threshold for accepting speculated tokens — scale not established here; confirm.
    pub acceptance_threshold: f32,
    /// Draft model configuration, when a draft model is used.
    pub draft_model_config: Option<crate::ModelConfig>,
}
628
/// Outcome of an engine warmup run.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct WarmupResult {
    /// Warmup requests processed.
    pub requests_processed: usize,
    /// Total warmup duration, in milliseconds.
    pub total_time_ms: u64,
    pub avg_latency_ms: f64,
    pub memory_allocated_bytes: u64,
    /// Whether warmup completed successfully.
    pub success: bool,
    /// Human-readable problems encountered during warmup.
    pub issues: Vec<String>,
}
639
/// Hardware and workload constraints used for capacity planning.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HardwareConstraints {
    /// Devices the engine may use.
    pub available_devices: Vec<Device>,
    // NOTE(review): memory unit (bytes?) not established here — confirm with producer.
    pub total_memory: u64,
    /// Expected request rate, in requests per second.
    pub expected_request_rate: f32,
    pub request_characteristics: RequestCharacteristics,
}

/// Typical shape of the expected request workload.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RequestCharacteristics {
    pub avg_input_tokens: usize,
    pub avg_output_tokens: usize,
    pub typical_batch_size: usize,
    pub latency_requirements: LatencyRequirements,
}

/// Latency targets; all values in milliseconds.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LatencyRequirements {
    pub target_p95_latency_ms: u64,
    pub target_p99_latency_ms: u64,
    /// Hard upper bound on acceptable latency.
    pub max_latency_ms: u64,
}