use crate::{ids::*, Device, EngineConfig, RequestId};
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use serde_json::Value;
use std::collections::HashMap;
/// Serializable summary of an engine's status.
///
/// This type only carries values. Field descriptions paraphrase the field
/// names; how each value is measured is up to the code that fills it in.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EngineStatus {
    /// Readiness flag.
    pub is_ready: bool,
    /// Identifiers of the loaded models.
    pub loaded_models: Vec<ModelId>,
    /// Number of active requests.
    pub active_requests: usize,
    /// Number of queued requests.
    pub queued_requests: usize,
    /// Memory figures; see [`MemoryUsage`].
    pub memory_usage: MemoryUsage,
    /// Uptime, in seconds.
    pub uptime_seconds: u64,
    /// UTC timestamp of the last heartbeat.
    pub last_heartbeat: DateTime<Utc>,
    /// Version string (format is not constrained by this type).
    pub version: String,
}
/// Serializable memory accounting record.
///
/// The byte counters are plain data supplied by the caller; nothing in this
/// type enforces that `used_bytes + free_bytes == total_bytes`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MemoryUsage {
    /// Total memory, in bytes.
    pub total_bytes: usize,
    /// Memory in use, in bytes.
    pub used_bytes: usize,
    /// Free memory, in bytes.
    pub free_bytes: usize,
    /// GPU memory, in bytes, when reported.
    pub gpu_memory_bytes: Option<usize>,
    /// CPU memory, in bytes, when reported.
    pub cpu_memory_bytes: Option<usize>,
    /// Cache memory, in bytes.
    pub cache_memory_bytes: usize,
    /// Utilization as a percentage. `MemoryUsage::calculate_utilization`
    /// sets it to `used_bytes / total_bytes * 100`.
    pub utilization_percent: f32,
}
impl MemoryUsage {
    /// Recomputes `utilization_percent` from `used_bytes` and `total_bytes`.
    ///
    /// The result is `used_bytes / total_bytes * 100`, evaluated in `f32`.
    /// A `total_bytes` of zero yields `0.0` instead of dividing by zero.
    /// The value is not clamped, so it exceeds 100 when `used_bytes` is
    /// larger than `total_bytes`. No other field is modified.
    pub fn calculate_utilization(&mut self) {
        self.utilization_percent = match self.total_bytes {
            0 => 0.0,
            total => {
                let ratio = self.used_bytes as f32 / total as f32;
                ratio * 100.0
            }
        };
    }
}
/// Serializable scheduler counters and averages.
///
/// Descriptions paraphrase the field names; the measurement window and the
/// scale of ratio-like fields are not specified by this type.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SchedulerStats {
    /// Number of waiting requests.
    pub waiting_requests: usize,
    /// Number of running requests.
    pub running_requests: usize,
    /// Number of preempted requests.
    pub preempted_requests: usize,
    /// Count of completed requests.
    pub completed_requests: u64,
    /// Count of failed requests.
    pub failed_requests: u64,
    /// Count of cancelled requests.
    pub cancelled_requests: u64,
    /// Average wait time, in milliseconds.
    pub avg_wait_time_ms: f64,
    /// Average execution time, in milliseconds.
    pub avg_execution_time_ms: f64,
    /// Throughput; `rps` presumably means requests per second (confirm with the producer).
    pub throughput_rps: f64,
    /// Queue utilization (fraction vs. percent is not specified here).
    pub queue_utilization: f32,
}
/// Serializable block-cache statistics.
///
/// Descriptions paraphrase the field names; the scale of the rate and
/// utilization fields (fraction vs. percent) is not specified by this type.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CacheStats {
    /// Total number of blocks.
    pub total_blocks: usize,
    /// Number of blocks in use.
    pub used_blocks: usize,
    /// Number of free blocks.
    pub free_blocks: usize,
    /// Cache hit rate.
    pub cache_hit_rate: f32,
    /// Count of cache hits.
    pub cache_hits: u64,
    /// Count of cache misses.
    pub cache_misses: u64,
    /// Count of evictions.
    pub eviction_count: u64,
    /// Average block utilization.
    pub avg_block_utilization: f32,
    /// Prefix-cache statistics, when available; see [`PrefixCacheStats`].
    pub prefix_cache_stats: Option<PrefixCacheStats>,
}
/// Serializable prefix-cache statistics.
///
/// Descriptions paraphrase the field names; units not spelled out in a name
/// are not specified by this type.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PrefixCacheStats {
    /// Number of cached prefixes.
    pub cached_prefixes: usize,
    /// Prefix hit rate (fraction vs. percent is not specified here).
    pub prefix_hit_rate: f32,
    /// Average prefix length (unit not stated; presumably tokens, confirm).
    pub avg_prefix_length: f32,
    /// Memory saved, in bytes.
    pub memory_saved_bytes: u64,
}
/// Serializable batching metrics.
///
/// Descriptions paraphrase the field names; this type only carries the values.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BatchingMetrics {
    /// Average batch size.
    pub avg_batch_size: f32,
    /// Batch utilization (fraction vs. percent is not specified here).
    pub batch_utilization: f32,
    /// Count of batches created.
    pub batches_created: u64,
    /// Count of batches completed.
    pub batches_completed: u64,
    /// Average batch time, in milliseconds.
    pub avg_batch_time_ms: f64,
    /// Tokens per second.
    pub tokens_per_second: f64,
}
/// Serializable latency metrics; every field is in milliseconds per its name.
///
/// Percentile readings of the `pNN` names are the conventional interpretation
/// and are not enforced by this type.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LatencyMetrics {
    /// Average latency.
    pub avg_latency_ms: f64,
    /// 50th-percentile latency.
    pub p50_latency_ms: f64,
    /// 90th-percentile latency.
    pub p90_latency_ms: f64,
    /// 95th-percentile latency.
    pub p95_latency_ms: f64,
    /// 99th-percentile latency.
    pub p99_latency_ms: f64,
    /// Presumably the 99.9th-percentile latency (confirm with the producer).
    pub p999_latency_ms: f64,
    /// Average `ttft`; presumably "time to first token" (confirm).
    pub avg_ttft_ms: f64,
    /// Average inter-token latency.
    pub avg_inter_token_latency_ms: f64,
}
/// Serializable throughput metrics, each expressed per second.
///
/// Descriptions paraphrase the field names; the averaging window is not
/// specified by this type.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ThroughputMetrics {
    /// Requests per second.
    pub requests_per_second: f64,
    /// Tokens per second.
    pub tokens_per_second: f64,
    /// Characters per second.
    pub characters_per_second: f64,
    /// Batches per second.
    pub batches_per_second: f64,
    /// Peak tokens per second.
    pub peak_tokens_per_second: f64,
}
/// Serializable metrics for a single model.
///
/// Descriptions paraphrase the field names; this type only carries the values.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelMetrics {
    /// Identifier of the model these metrics belong to.
    pub model_id: ModelId,
    /// Count of forward passes.
    pub forward_passes: u64,
    /// Average forward time, in milliseconds.
    pub avg_forward_time_ms: f64,
    /// Metrics for the prefill phase; see [`PhaseMetrics`].
    pub prefill_metrics: PhaseMetrics,
    /// Metrics for the decode phase; see [`PhaseMetrics`].
    pub decode_metrics: PhaseMetrics,
    /// Count of tokens generated.
    pub tokens_generated: u64,
}
/// Serializable metrics for one execution phase.
///
/// Used for both `prefill_metrics` and `decode_metrics` in [`ModelMetrics`].
/// Nothing here enforces that `avg_time_ms == total_time_ms / operations`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PhaseMetrics {
    /// Count of operations.
    pub operations: u64,
    /// Average time, in milliseconds.
    pub avg_time_ms: f64,
    /// Total time, in milliseconds.
    pub total_time_ms: f64,
    /// Count of tokens processed.
    pub tokens_processed: u64,
}
/// Serializable metrics for a single request.
///
/// Descriptions paraphrase the field names; the relationship between the
/// individual timings (e.g. whether they sum to `total_time_ms`) is not
/// enforced by this type.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RequestMetrics {
    /// Identifier of the request.
    pub request_id: RequestId,
    /// Identifier of the client, when known.
    pub client_id: Option<ClientId>,
    /// Identifier of the model.
    pub model_id: ModelId,
    /// UTC creation timestamp.
    pub created_at: DateTime<Utc>,
    /// UTC completion timestamp; `None` when there is none recorded.
    pub completed_at: Option<DateTime<Utc>>,
    /// Total time, in milliseconds.
    pub total_time_ms: u64,
    /// Queue time, in milliseconds.
    pub queue_time_ms: u64,
    /// Prefill time, in milliseconds.
    pub prefill_time_ms: u64,
    /// Decode time, in milliseconds.
    pub decode_time_ms: u64,
    /// Number of input tokens.
    pub input_tokens: usize,
    /// Number of output tokens.
    pub output_tokens: usize,
    /// Whether the request was preempted.
    pub was_preempted: bool,
    /// Number of preemptions.
    pub preemption_count: u32,
}
/// Serializable host-level metrics.
///
/// Descriptions paraphrase the field names; this type only carries the values.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemMetrics {
    /// CPU usage, as a percentage.
    pub cpu_usage_percent: f32,
    /// Memory figures; see [`MemoryUsage`].
    pub memory_usage: MemoryUsage,
    /// GPU metrics, when available; see [`GpuMetrics`].
    pub gpu_utilization: Option<GpuMetrics>,
    /// Network I/O rates; see [`NetworkMetrics`].
    pub network_io: NetworkMetrics,
    /// Disk I/O rates; see [`DiskMetrics`].
    pub disk_io: DiskMetrics,
    /// Three load-average values (presumably the 1/5/15-minute averages; confirm).
    pub load_average: [f32; 3],
}
/// Serializable GPU metrics.
///
/// Units come from the field names; this type only carries the values.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GpuMetrics {
    /// Utilization, as a percentage.
    pub utilization_percent: f32,
    /// Memory in use, in bytes.
    pub memory_used_bytes: usize,
    /// Total memory, in bytes.
    pub memory_total_bytes: usize,
    /// Temperature, in degrees Celsius.
    pub temperature_celsius: f32,
    /// Power, in watts.
    pub power_watts: f32,
}
/// Serializable network rates, each expressed per second.
///
/// `rx`/`tx` are read as receive/transmit.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NetworkMetrics {
    /// Received bytes per second.
    pub rx_bytes_per_sec: u64,
    /// Transmitted bytes per second.
    pub tx_bytes_per_sec: u64,
    /// Received packets per second.
    pub rx_packets_per_sec: u64,
    /// Transmitted packets per second.
    pub tx_packets_per_sec: u64,
}
/// Serializable disk rates, each expressed per second.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DiskMetrics {
    /// Bytes read per second.
    pub read_bytes_per_sec: u64,
    /// Bytes written per second.
    pub write_bytes_per_sec: u64,
    /// Read operations per second.
    pub read_ops_per_sec: u64,
    /// Write operations per second.
    pub write_ops_per_sec: u64,
}
/// Serializable error statistics.
///
/// Derives `Default`, so a default value has zeroed counters and empty
/// collections.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct ErrorStats {
    /// Total count of errors.
    pub total_errors: u64,
    /// Error counts keyed by an error-type string.
    pub errors_by_type: HashMap<String, u64>,
    /// Error rate (denominator and scale are not specified here).
    pub error_rate: f32,
    /// Recent error events; see [`ErrorEvent`]. No length bound is enforced by this type.
    pub recent_errors: Vec<ErrorEvent>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ErrorEvent {
pub timestamp: DateTime<Utc>,
pub error_type: String,
pub message: String,
pub request_id: Option<RequestId>,
pub context: HashMap<String, serde_json::Value>,
}
/// Serializable aggregate engine metrics.
///
/// Derives `Default`. Descriptions paraphrase the field names; this type only
/// carries the values.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct EngineMetrics {
    /// Total count of requests.
    pub total_requests: u64,
    /// Count of successful requests.
    pub successful_requests: u64,
    /// Count of failed requests.
    pub failed_requests: u64,
    /// Average request latency, in milliseconds.
    pub avg_request_latency_ms: f64,
    /// 95th-percentile request latency, in milliseconds.
    pub p95_request_latency_ms: f64,
    /// 99th-percentile request latency, in milliseconds.
    pub p99_request_latency_ms: f64,
    /// Throughput; `rps` presumably means requests per second (confirm).
    pub throughput_rps: f32,
    /// Tokens per second.
    pub tokens_per_second: f32,
    /// Queue metrics; see [`QueueMetrics`].
    pub queue_metrics: QueueMetrics,
    /// Resource utilization; see [`ResourceMetrics`].
    pub resource_utilization: ResourceMetrics,
    /// Error statistics; see [`ErrorStats`].
    pub error_stats: ErrorStats,
    /// Time breakdown; see [`PerformanceBreakdown`].
    pub performance_breakdown: PerformanceBreakdown,
}
/// Serializable request-queue metrics.
///
/// Derives `Default`. Descriptions paraphrase the field names.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct QueueMetrics {
    /// Current queue length.
    pub current_queue_length: usize,
    /// Average queue wait time, in milliseconds.
    pub avg_queue_wait_time_ms: f64,
    /// Queue throughput; `rps` presumably means requests per second (confirm).
    pub queue_throughput_rps: f32,
    /// Queue rejection rate (fraction vs. percent is not specified here).
    pub queue_rejection_rate: f32,
}
/// Serializable resource-utilization figures.
///
/// Derives `Default`. The scale of each value (fraction vs. percent) is not
/// specified by this type.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct ResourceMetrics {
    /// CPU utilization.
    pub cpu_utilization: f32,
    /// Memory utilization.
    pub memory_utilization: f32,
    /// GPU utilization, when available.
    pub gpu_utilization: Option<f32>,
    /// Network utilization.
    pub network_utilization: f32,
    /// Disk utilization.
    pub disk_utilization: f32,
}
/// Serializable breakdown of time by activity, in milliseconds.
///
/// Derives `Default`. Whether values are totals or per-request averages is
/// not specified by this type.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct PerformanceBreakdown {
    /// Tokenization time.
    pub tokenization_time_ms: f64,
    /// Model execution time.
    pub model_execution_time_ms: f64,
    /// Sampling time.
    pub sampling_time_ms: f64,
    /// Scheduling time.
    pub scheduling_time_ms: f64,
    /// Memory-operations time.
    pub memory_operations_time_ms: f64,
    /// Other overhead time.
    pub other_overhead_time_ms: f64,
}
/// Serializable overall health report.
///
/// Nothing in this type derives `status` from `component_status`; both are
/// set by whoever constructs the value.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthStatus {
    /// Overall status.
    pub status: HealthStatusType,
    /// Per-component health; see [`ComponentStatus`].
    pub component_status: ComponentStatus,
    /// UTC timestamp of the last check.
    pub last_check: DateTime<Utc>,
}
impl HealthStatus {
    /// Builds a report whose overall status is `Healthy`, whose components
    /// all come from `ComponentStatus::healthy()`, and whose `last_check` is
    /// the current UTC time.
    pub fn healthy() -> Self {
        // Components are built before the top-level timestamp is taken, so
        // `last_check` here is read after the per-component timestamps.
        let component_status = ComponentStatus::healthy();
        Self {
            status: HealthStatusType::Healthy,
            component_status,
            last_check: Utc::now(),
        }
    }
}
/// Three-level health classification.
///
/// The thresholds that separate the levels are decided by the code that
/// assigns them, not by this type.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
pub enum HealthStatusType {
    /// Healthy.
    Healthy,
    /// Warning level.
    Warning,
    /// Unhealthy.
    Unhealthy,
}
/// Serializable health record for one component.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ComponentHealth {
    /// Health level of the component.
    pub status: HealthStatusType,
    /// Status message. `ComponentHealth::healthy` sets it to `"<component> healthy"`.
    pub message: String,
    /// Named numeric metrics (the key set is not specified by this type).
    pub metrics: HashMap<String, f64>,
    /// UTC timestamp of the last check.
    pub last_check: DateTime<Utc>,
}
impl ComponentHealth {
    /// Builds a `Healthy` record for `component`.
    ///
    /// The message is `"<component> healthy"`, the metrics map is empty, and
    /// `last_check` is the current UTC time.
    pub fn healthy(component: &str) -> Self {
        let message = format!("{} healthy", component);
        Self {
            status: HealthStatusType::Healthy,
            message,
            metrics: HashMap::new(),
            last_check: Utc::now(),
        }
    }
}
/// Serializable health records for the six tracked components.
///
/// `ComponentStatus::entries` returns them in this declaration order.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ComponentStatus {
    /// Health of the scheduler.
    pub scheduler: ComponentHealth,
    /// Health of the model executor.
    pub model_executor: ComponentHealth,
    /// Health of the tokenizer.
    pub tokenizer: ComponentHealth,
    /// Health of the KV cache.
    pub kv_cache: ComponentHealth,
    /// Health of the memory manager.
    pub memory_manager: ComponentHealth,
    /// Health of the backend.
    pub backend: ComponentHealth,
}
impl ComponentStatus {
    /// Builds a status in which every component is reported healthy.
    ///
    /// Each field comes from `ComponentHealth::healthy` with the label shown
    /// below. Note that `model_executor` and `memory_manager` use the shorter
    /// labels `"model"` and `"memory"`, which end up in their messages.
    pub fn healthy() -> Self {
        let ok = ComponentHealth::healthy;
        Self {
            scheduler: ok("scheduler"),
            model_executor: ok("model"),
            tokenizer: ok("tokenizer"),
            kv_cache: ok("kv_cache"),
            memory_manager: ok("memory"),
            backend: ok("backend"),
        }
    }

    /// Returns references to all six component records, in the order
    /// scheduler, model executor, tokenizer, KV cache, memory manager,
    /// backend.
    pub fn entries(&self) -> [&ComponentHealth; 6] {
        // Exhaustive destructuring: adding a field to the struct without
        // updating this method becomes a compile error.
        let Self {
            scheduler,
            model_executor,
            tokenizer,
            kv_cache,
            memory_manager,
            backend,
        } = self;
        [
            scheduler,
            model_executor,
            tokenizer,
            kv_cache,
            memory_manager,
            backend,
        ]
    }
}
/// Alias for [`HealthStatusType`]; both names refer to the same enum.
// NOTE(review): presumably kept so older call sites using this name keep compiling — confirm before removing.
pub type ComponentHealthStatus = HealthStatusType;
/// Serializable diagnostics bundle combining configuration, metrics,
/// resource usage, analysis and system information.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DiagnosticsReport {
    /// Engine configuration captured with the report.
    pub config_snapshot: EngineConfig,
    /// Engine metrics; see [`EngineMetrics`].
    pub current_metrics: EngineMetrics,
    /// Detailed resource usage; see [`DetailedResourceUsage`].
    pub resource_usage: DetailedResourceUsage,
    /// Performance analysis; see [`PerformanceAnalysis`].
    pub performance_analysis: PerformanceAnalysis,
    /// Free-form JSON diagnostics keyed by a component name string.
    pub component_diagnostics: HashMap<String, Value>,
    /// System information; see [`SystemInfo`].
    pub system_info: SystemInfo,
}
/// Serializable fine-grained resource usage.
///
/// Derives `Default`. Map keys are free-form strings; units not spelled out
/// in a field name are not specified by this type.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct DetailedResourceUsage {
    /// Memory per component (unit not stated; presumably bytes, confirm).
    pub memory_by_component: HashMap<String, u64>,
    /// CPU figure per thread (scale not specified here).
    pub cpu_by_thread: HashMap<String, f32>,
    /// GPU memory details, when available; see [`GpuMemoryDetails`].
    pub gpu_memory_details: Option<GpuMemoryDetails>,
    /// Network I/O details; see [`NetworkIODetails`].
    pub network_io_details: NetworkIODetails,
}
/// Serializable GPU memory details.
///
/// Derives `Default`. Units are not stated in the field names (presumably
/// bytes; confirm with the producer).
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct GpuMemoryDetails {
    /// Total memory.
    pub total_memory: u64,
    /// Memory in use.
    pub used_memory: u64,
    /// Memory keyed by a type string.
    pub memory_by_type: HashMap<String, u64>,
    /// Large allocations; see [`AllocationInfo`]. What counts as "large" is not defined here.
    pub large_allocations: Vec<AllocationInfo>,
}
/// Serializable description of one allocation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AllocationInfo {
    /// Size of the allocation (unit not stated; presumably bytes, confirm).
    pub size: u64,
    /// Allocation-type string.
    pub allocation_type: String,
    /// UTC timestamp associated with the allocation.
    pub timestamp: DateTime<Utc>,
}
/// Serializable network I/O details.
///
/// Derives `Default`. Descriptions paraphrase the field names.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct NetworkIODetails {
    /// Bytes received per second.
    pub bytes_received_per_sec: u64,
    /// Bytes sent per second.
    pub bytes_sent_per_sec: u64,
    /// Number of connections.
    pub connection_count: usize,
    /// Request rate per second.
    pub request_rate_per_sec: f32,
}
/// Serializable performance analysis: bottlenecks, recommendations and trends.
///
/// Derives `Default`, which yields empty lists and default trends.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct PerformanceAnalysis {
    /// Identified bottlenecks; see [`PerformanceBottleneck`].
    pub bottlenecks: Vec<PerformanceBottleneck>,
    /// Recommendations; see [`PerformanceRecommendation`].
    pub recommendations: Vec<PerformanceRecommendation>,
    /// Trend directions; see [`PerformanceTrends`].
    pub trends: PerformanceTrends,
}
/// Serializable description of one performance bottleneck.
///
/// The scales of `severity` and `performance_impact` are not specified by
/// this type.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceBottleneck {
    /// Bottleneck-type string.
    pub bottleneck_type: String,
    /// Severity score.
    pub severity: f32,
    /// Human-readable description.
    pub description: String,
    /// Performance-impact score.
    pub performance_impact: f32,
}
/// Serializable description of one performance recommendation.
///
/// The scales of `expected_impact` and `complexity` are not specified by
/// this type.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceRecommendation {
    /// Category string.
    pub category: String,
    /// Human-readable description.
    pub description: String,
    /// Expected-impact score.
    pub expected_impact: f32,
    /// Complexity score.
    pub complexity: f32,
}
/// Serializable trend directions for four tracked quantities.
///
/// Derives `Default`, so every field starts at `TrendDirection::default()`.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct PerformanceTrends {
    /// Trend of latency.
    pub latency_trend: TrendDirection,
    /// Trend of throughput.
    pub throughput_trend: TrendDirection,
    /// Trend of the error rate.
    pub error_rate_trend: TrendDirection,
    /// Trend of resource utilization.
    pub resource_utilization_trend: TrendDirection,
}
/// Direction in which a tracked quantity is moving.
///
/// The default is [`TrendDirection::Stable`]. It is declared with the
/// `#[default]` attribute on the derived `Default` impl instead of a
/// hand-written `impl Default`, so the default sits next to the variant list.
/// This requires Rust 1.62 or newer.
#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq)]
pub enum TrendDirection {
    /// Improving.
    Improving,
    /// Stable; the default variant.
    #[default]
    Stable,
    /// Degrading.
    Degrading,
    /// Volatile.
    Volatile,
}
/// Serializable description of the host system.
///
/// Derives `Default`. String formats are not constrained by this type.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct SystemInfo {
    /// Operating-system string.
    pub os: String,
    /// CPU information string.
    pub cpu_info: String,
    /// Total memory (unit not stated; presumably bytes, confirm).
    pub total_memory: u64,
    /// Devices present.
    pub devices: Vec<Device>,
    /// Runtime information; see [`RuntimeInfo`].
    pub runtime_info: RuntimeInfo,
}
/// Serializable description of the runtime.
///
/// Derives `Default`. String formats are not constrained by this type.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct RuntimeInfo {
    /// Rust version string.
    pub rust_version: String,
    /// Engine version string.
    pub engine_version: String,
    /// Build information; see [`BuildInfo`].
    pub build_info: BuildInfo,
    /// Feature-flag names.
    pub feature_flags: Vec<String>,
}
/// Serializable description of the build.
///
/// Derives `Default`. All values are free-form strings.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct BuildInfo {
    /// Build timestamp, as a string (format not specified here).
    pub build_timestamp: String,
    /// Git commit, when known.
    pub git_commit: Option<String>,
    /// Build configuration string.
    pub build_config: String,
    /// Compiler flags.
    pub compiler_flags: Vec<String>,
}
/// Serializable capture of engine state: configuration, metrics,
/// per-component JSON state and active requests.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EngineState {
    /// Engine configuration.
    pub config: EngineConfig,
    /// Engine metrics; see [`EngineMetrics`].
    pub metrics: EngineMetrics,
    /// Free-form JSON state keyed by a component name string.
    pub component_states: HashMap<String, Value>,
    /// State of each active request; see [`EngineRequestState`].
    pub active_requests: Vec<EngineRequestState>,
    /// UTC timestamp of this state.
    pub timestamp: DateTime<Utc>,
}
/// Serializable state of one request inside the engine.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EngineRequestState {
    /// Identifier of the request.
    pub request_id: RequestId,
    /// Current phase, as a free-form string (the set of phase names is not defined here).
    pub current_phase: String,
    /// Progress figures; see [`EngineRequestProgress`].
    pub progress: EngineRequestProgress,
    /// Free-form JSON description of allocated resources, keyed by name.
    pub allocated_resources: HashMap<String, Value>,
}
/// Serializable progress figures for one request.
///
/// Derives `Default`. Descriptions paraphrase the field names.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct EngineRequestProgress {
    /// Number of tokens processed.
    pub tokens_processed: usize,
    /// Number of tokens remaining.
    pub tokens_remaining: usize,
    /// Elapsed time, in milliseconds.
    pub elapsed_time_ms: u64,
    /// Estimated remaining time, in milliseconds, when an estimate exists.
    pub estimated_remaining_ms: Option<u64>,
}
/// Serializable speculation settings.
///
/// NOTE(review): presumably configures speculative decoding with a draft
/// model — confirm against the consumer of this type. Valid ranges are not
/// enforced here.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SpeculationConfig {
    /// Speculation depth.
    pub speculation_depth: usize,
    /// Acceptance threshold.
    pub acceptance_threshold: f32,
    /// Draft-model configuration, when one is provided.
    pub draft_model_config: Option<crate::ModelConfig>,
}
/// Serializable outcome of a warmup run.
///
/// Derives `Default`, so a default value has `success == false` and no issues.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct WarmupResult {
    /// Number of requests processed.
    pub requests_processed: usize,
    /// Total time, in milliseconds.
    pub total_time_ms: u64,
    /// Average latency, in milliseconds.
    pub avg_latency_ms: f64,
    /// Memory allocated, in bytes.
    pub memory_allocated_bytes: u64,
    /// Success flag.
    pub success: bool,
    /// Issues encountered, as free-form strings.
    pub issues: Vec<String>,
}
/// Serializable description of available hardware and expected load.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HardwareConstraints {
    /// Devices available.
    pub available_devices: Vec<Device>,
    /// Total memory (unit not stated; presumably bytes, confirm).
    pub total_memory: u64,
    /// Expected request rate (time unit not stated; presumably per second, confirm).
    pub expected_request_rate: f32,
    /// Expected request shape; see [`RequestCharacteristics`].
    pub request_characteristics: RequestCharacteristics,
}
/// Serializable description of the expected request shape.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RequestCharacteristics {
    /// Average number of input tokens.
    pub avg_input_tokens: usize,
    /// Average number of output tokens.
    pub avg_output_tokens: usize,
    /// Typical batch size.
    pub typical_batch_size: usize,
    /// Latency targets; see [`LatencyRequirements`].
    pub latency_requirements: LatencyRequirements,
}
/// Serializable latency targets, in milliseconds.
///
/// No ordering between the three values is enforced by this type.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LatencyRequirements {
    /// Target 95th-percentile latency.
    pub target_p95_latency_ms: u64,
    /// Target 99th-percentile latency.
    pub target_p99_latency_ms: u64,
    /// Maximum latency.
    pub max_latency_ms: u64,
}