ferrum-types 0.6.0

Shared type definitions for the Ferrum LLM inference engine.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
//! Metrics and observability types

use crate::{ids::*, Device, EngineConfig, RequestId};
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use serde_json::Value;
use std::collections::HashMap;

/// Engine status information
///
/// Point-in-time snapshot of engine readiness, request load, and memory
/// state, suitable for health endpoints and dashboards.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EngineStatus {
    /// Whether the engine is ready to accept requests
    pub is_ready: bool,
    /// Currently loaded models
    pub loaded_models: Vec<ModelId>,
    /// Number of active requests
    pub active_requests: usize,
    /// Number of queued requests
    pub queued_requests: usize,
    /// Current memory usage
    pub memory_usage: MemoryUsage,
    /// Engine uptime in seconds
    pub uptime_seconds: u64,
    /// Last heartbeat timestamp (UTC)
    pub last_heartbeat: DateTime<Utc>,
    /// Engine version string
    pub version: String,
}

/// Memory usage statistics
///
/// All byte counts are absolute sizes; `utilization_percent` is derived
/// from `used_bytes` / `total_bytes` by [`MemoryUsage::calculate_utilization`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MemoryUsage {
    /// Total system memory in bytes
    pub total_bytes: usize,
    /// Used memory in bytes
    pub used_bytes: usize,
    /// Free memory in bytes
    pub free_bytes: usize,
    /// GPU memory usage in bytes (if applicable)
    pub gpu_memory_bytes: Option<usize>,
    /// CPU memory usage in bytes (if tracked separately)
    pub cpu_memory_bytes: Option<usize>,
    /// Cache memory usage in bytes
    pub cache_memory_bytes: usize,
    /// Memory utilization percentage (0.0–100.0)
    pub utilization_percent: f32,
}

impl MemoryUsage {
    /// Recompute `utilization_percent` as `used_bytes / total_bytes * 100`.
    ///
    /// A zero `total_bytes` yields 0.0 rather than a NaN from dividing
    /// by zero.
    pub fn calculate_utilization(&mut self) {
        self.utilization_percent = match self.total_bytes {
            0 => 0.0,
            total => (self.used_bytes as f32 / total as f32) * 100.0,
        };
    }
}

/// Scheduler statistics
///
/// Mixes point-in-time gauges (waiting/running counts) with cumulative
/// totals (completed/failed/cancelled) and rolling averages.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SchedulerStats {
    /// Number of waiting requests
    pub waiting_requests: usize,
    /// Number of running requests
    pub running_requests: usize,
    /// Number of preempted requests
    pub preempted_requests: usize,
    /// Total completed requests
    pub completed_requests: u64,
    /// Total failed requests
    pub failed_requests: u64,
    /// Total cancelled requests
    pub cancelled_requests: u64,
    /// Average wait time in milliseconds
    pub avg_wait_time_ms: f64,
    /// Average execution time in milliseconds
    pub avg_execution_time_ms: f64,
    /// Current throughput (requests per second)
    pub throughput_rps: f64,
    /// Queue utilization percentage
    pub queue_utilization: f32,
}

/// Cache statistics
///
/// Block accounting for the KV cache plus hit/miss counters; prefix cache
/// numbers are reported separately when that feature is active.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CacheStats {
    /// Total number of cache blocks
    pub total_blocks: usize,
    /// Number of used blocks
    pub used_blocks: usize,
    /// Number of free blocks
    pub free_blocks: usize,
    /// Cache hit rate (0.0 to 1.0)
    pub cache_hit_rate: f32,
    /// Total number of cache hits
    pub cache_hits: u64,
    /// Total number of cache misses
    pub cache_misses: u64,
    /// Number of cache evictions
    pub eviction_count: u64,
    /// Average block utilization
    pub avg_block_utilization: f32,
    /// Prefix cache statistics (None when prefix caching is disabled)
    pub prefix_cache_stats: Option<PrefixCacheStats>,
}

/// Prefix cache statistics
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PrefixCacheStats {
    /// Number of cached prefixes
    pub cached_prefixes: usize,
    /// Prefix hit rate
    pub prefix_hit_rate: f32,
    /// Average prefix length (in tokens, presumably — confirm with producer)
    pub avg_prefix_length: f32,
    /// Memory saved by prefix caching (bytes)
    pub memory_saved_bytes: u64,
}

/// Batch processing metrics
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BatchingMetrics {
    /// Average batch size
    pub avg_batch_size: f32,
    /// Batch utilization rate
    pub batch_utilization: f32,
    /// Number of batches created
    pub batches_created: u64,
    /// Number of batches completed
    pub batches_completed: u64,
    /// Average batch processing time in milliseconds
    pub avg_batch_time_ms: f64,
    /// Tokens per second across all batches
    pub tokens_per_second: f64,
}

/// Request latency metrics
///
/// All values are in milliseconds.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LatencyMetrics {
    /// Average end-to-end latency
    pub avg_latency_ms: f64,
    /// P50 (median) latency
    pub p50_latency_ms: f64,
    /// P90 latency
    pub p90_latency_ms: f64,
    /// P95 latency
    pub p95_latency_ms: f64,
    /// P99 latency
    pub p99_latency_ms: f64,
    /// P99.9 latency
    pub p999_latency_ms: f64,
    /// Average time to first token (TTFT)
    pub avg_ttft_ms: f64,
    /// Average inter-token latency
    pub avg_inter_token_latency_ms: f64,
}

/// Throughput metrics
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ThroughputMetrics {
    /// Requests per second
    pub requests_per_second: f64,
    /// Tokens per second
    pub tokens_per_second: f64,
    /// Characters per second
    pub characters_per_second: f64,
    /// Batches per second
    pub batches_per_second: f64,
    /// Peak token throughput achieved
    pub peak_tokens_per_second: f64,
}

/// Model execution metrics
///
/// Per-model counters, split into prefill and decode phases.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelMetrics {
    /// Model identifier
    pub model_id: ModelId,
    /// Number of forward passes
    pub forward_passes: u64,
    /// Average forward pass time in milliseconds
    pub avg_forward_time_ms: f64,
    /// Prefill-phase metrics
    pub prefill_metrics: PhaseMetrics,
    /// Decode-phase metrics
    pub decode_metrics: PhaseMetrics,
    /// Total tokens generated
    pub tokens_generated: u64,
}

/// Metrics for different execution phases (prefill, decode)
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PhaseMetrics {
    /// Number of operations
    pub operations: u64,
    /// Average time per operation in milliseconds
    pub avg_time_ms: f64,
    /// Total time spent in milliseconds
    pub total_time_ms: f64,
    /// Tokens processed
    pub tokens_processed: u64,
}

/// Request-level metrics
///
/// One record per request; all durations are in milliseconds.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RequestMetrics {
    /// Request identifier
    pub request_id: RequestId,
    /// Client identifier
    pub client_id: Option<ClientId>,
    /// Model used
    pub model_id: ModelId,
    /// Request creation time (UTC)
    pub created_at: DateTime<Utc>,
    /// Request completion time (UTC); None while still in flight
    pub completed_at: Option<DateTime<Utc>>,
    /// Total processing time
    pub total_time_ms: u64,
    /// Time waiting in queue
    pub queue_time_ms: u64,
    /// Time spent in prefill phase
    pub prefill_time_ms: u64,
    /// Time spent in decode phase
    pub decode_time_ms: u64,
    /// Number of input tokens
    pub input_tokens: usize,
    /// Number of output tokens
    pub output_tokens: usize,
    /// Whether request was preempted at least once
    pub was_preempted: bool,
    /// Number of preemptions
    pub preemption_count: u32,
}

/// System resource metrics
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemMetrics {
    /// CPU usage percentage
    pub cpu_usage_percent: f32,
    /// Memory usage
    pub memory_usage: MemoryUsage,
    /// GPU utilization (None on CPU-only deployments)
    pub gpu_utilization: Option<GpuMetrics>,
    /// Network I/O statistics
    pub network_io: NetworkMetrics,
    /// Disk I/O statistics
    pub disk_io: DiskMetrics,
    /// System load average over 1, 5, and 15 minutes
    pub load_average: [f32; 3], // 1min, 5min, 15min
}

/// GPU-specific metrics
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GpuMetrics {
    /// GPU utilization percentage
    pub utilization_percent: f32,
    /// GPU memory usage in bytes
    pub memory_used_bytes: usize,
    /// GPU memory total in bytes
    pub memory_total_bytes: usize,
    /// GPU temperature in Celsius
    pub temperature_celsius: f32,
    /// Power consumption in watts
    pub power_watts: f32,
}

/// Network I/O metrics (rx = received, tx = transmitted)
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NetworkMetrics {
    /// Bytes received per second
    pub rx_bytes_per_sec: u64,
    /// Bytes transmitted per second
    pub tx_bytes_per_sec: u64,
    /// Packets received per second
    pub rx_packets_per_sec: u64,
    /// Packets transmitted per second
    pub tx_packets_per_sec: u64,
}

/// Disk I/O metrics
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DiskMetrics {
    /// Bytes read per second
    pub read_bytes_per_sec: u64,
    /// Bytes written per second
    pub write_bytes_per_sec: u64,
    /// Read operations per second
    pub read_ops_per_sec: u64,
    /// Write operations per second
    pub write_ops_per_sec: u64,
}

/// Error statistics
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct ErrorStats {
    /// Total number of errors
    pub total_errors: u64,
    /// Error counts keyed by error type/category string
    pub errors_by_type: HashMap<String, u64>,
    /// Error rate (errors per request)
    pub error_rate: f32,
    /// Recent errors, most useful for debugging; retention policy is up to
    /// the producer
    pub recent_errors: Vec<ErrorEvent>,
}

/// Individual error event
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ErrorEvent {
    /// When the error occurred
    pub timestamp: DateTime<Utc>,
    /// Error type/category
    pub error_type: String,
    /// Error message
    pub message: String,
    /// Request ID that caused the error (if applicable)
    pub request_id: Option<RequestId>,
    /// Additional context
    pub context: HashMap<String, serde_json::Value>,
}

/// Aggregated engine metrics
///
/// Top-level roll-up combining request counters, latency percentiles,
/// throughput, queue/resource/error summaries, and a time breakdown.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct EngineMetrics {
    /// Total requests observed
    pub total_requests: u64,
    /// Requests that completed successfully
    pub successful_requests: u64,
    /// Requests that failed
    pub failed_requests: u64,
    /// Average request latency in milliseconds
    pub avg_request_latency_ms: f64,
    /// 95th-percentile request latency in milliseconds
    pub p95_request_latency_ms: f64,
    /// 99th-percentile request latency in milliseconds
    pub p99_request_latency_ms: f64,
    /// Throughput in requests per second
    pub throughput_rps: f32,
    /// Token generation rate in tokens per second
    pub tokens_per_second: f32,
    /// Queue-level metrics
    pub queue_metrics: QueueMetrics,
    /// Resource utilization summary
    pub resource_utilization: ResourceMetrics,
    /// Error statistics
    pub error_stats: ErrorStats,
    /// Time breakdown across processing stages
    pub performance_breakdown: PerformanceBreakdown,
}

/// Queue-level metrics for the request queue
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct QueueMetrics {
    /// Number of requests currently in the queue
    pub current_queue_length: usize,
    /// Average queue wait time in milliseconds
    pub avg_queue_wait_time_ms: f64,
    /// Queue drain rate in requests per second
    pub queue_throughput_rps: f32,
    /// Queue rejection rate
    // NOTE(review): units unclear from here — presumably fraction of
    // arrivals rejected; confirm with the producer.
    pub queue_rejection_rate: f32,
}

/// Resource utilization summary
// NOTE(review): whether these are fractions (0.0–1.0) or percentages
// (0–100) is not established here — confirm with the producer.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct ResourceMetrics {
    /// CPU utilization
    pub cpu_utilization: f32,
    /// Memory utilization
    pub memory_utilization: f32,
    /// GPU utilization (None on CPU-only deployments)
    pub gpu_utilization: Option<f32>,
    /// Network utilization
    pub network_utilization: f32,
    /// Disk utilization
    pub disk_utilization: f32,
}

/// Time breakdown across request processing stages, in milliseconds
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct PerformanceBreakdown {
    /// Time spent tokenizing input
    pub tokenization_time_ms: f64,
    /// Time spent executing the model
    pub model_execution_time_ms: f64,
    /// Time spent sampling output tokens
    pub sampling_time_ms: f64,
    /// Time spent in the scheduler
    pub scheduling_time_ms: f64,
    /// Time spent on memory operations
    pub memory_operations_time_ms: f64,
    /// Remaining unattributed overhead
    pub other_overhead_time_ms: f64,
}

/// Health check status
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthStatus {
    /// Overall health status
    pub status: HealthStatusType,
    /// Individual component health
    pub component_status: ComponentStatus,
    /// Last health check time (UTC)
    pub last_check: DateTime<Utc>,
}

impl HealthStatus {
    /// Construct a healthy status with default component health
    pub fn healthy() -> Self {
        Self {
            status: HealthStatusType::Healthy,
            component_status: ComponentStatus::healthy(),
            last_check: Utc::now(),
        }
    }
}

/// Health status types, from best to worst
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
pub enum HealthStatusType {
    /// System is healthy
    Healthy,
    /// System has warnings but is functional
    Warning,
    /// System is unhealthy
    Unhealthy,
}

/// Individual component health snapshot
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ComponentHealth {
    /// Component health status
    pub status: HealthStatusType,
    /// Human-readable health message
    pub message: String,
    /// Component-specific metrics, keyed by metric name
    pub metrics: HashMap<String, f64>,
    /// Last check time (UTC)
    pub last_check: DateTime<Utc>,
}

impl ComponentHealth {
    /// Construct a healthy component entry with default state
    pub fn healthy(component: &str) -> Self {
        Self {
            status: HealthStatusType::Healthy,
            message: format!("{} healthy", component),
            metrics: HashMap::new(),
            last_check: Utc::now(),
        }
    }
}

/// Aggregated component health map
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ComponentStatus {
    /// Request scheduler health
    pub scheduler: ComponentHealth,
    /// Model executor health
    pub model_executor: ComponentHealth,
    /// Tokenizer health
    pub tokenizer: ComponentHealth,
    /// KV cache health
    pub kv_cache: ComponentHealth,
    /// Memory manager health
    pub memory_manager: ComponentHealth,
    /// Backend health
    pub backend: ComponentHealth,
}

impl ComponentStatus {
    /// Construct a fully healthy component status snapshot
    pub fn healthy() -> Self {
        Self {
            scheduler: ComponentHealth::healthy("scheduler"),
            model_executor: ComponentHealth::healthy("model"),
            tokenizer: ComponentHealth::healthy("tokenizer"),
            kv_cache: ComponentHealth::healthy("kv_cache"),
            memory_manager: ComponentHealth::healthy("memory"),
            backend: ComponentHealth::healthy("backend"),
        }
    }

    /// Iterate over component entries for aggregation helpers
    pub fn entries(&self) -> [&ComponentHealth; 6] {
        [
            &self.scheduler,
            &self.model_executor,
            &self.tokenizer,
            &self.kv_cache,
            &self.memory_manager,
            &self.backend,
        ]
    }
}

/// Component health status alias for backwards compatibility
///
/// New code should refer to [`HealthStatusType`] directly.
pub type ComponentHealthStatus = HealthStatusType;

/// Diagnostics report aggregating engine state
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DiagnosticsReport {
    /// Engine configuration snapshot
    pub config_snapshot: EngineConfig,
    /// Current metrics
    pub current_metrics: EngineMetrics,
    /// Resource usage details
    pub resource_usage: DetailedResourceUsage,
    /// Performance analysis
    pub performance_analysis: PerformanceAnalysis,
    /// Per-component diagnostics as arbitrary JSON, keyed by component name
    pub component_diagnostics: HashMap<String, Value>,
    /// System information
    pub system_info: SystemInfo,
}

/// Detailed resource usage information
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct DetailedResourceUsage {
    /// Memory usage per component, keyed by component name
    // NOTE(review): values presumably bytes — confirm with producer.
    pub memory_by_component: HashMap<String, u64>,
    /// CPU usage per thread, keyed by thread name
    pub cpu_by_thread: HashMap<String, f32>,
    /// GPU memory breakdown (None on CPU-only deployments)
    pub gpu_memory_details: Option<GpuMemoryDetails>,
    /// Network I/O breakdown
    pub network_io_details: NetworkIODetails,
}

/// GPU memory usage details
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct GpuMemoryDetails {
    /// Total GPU memory
    // NOTE(review): units presumably bytes — confirm with producer.
    pub total_memory: u64,
    /// Used GPU memory (same units as `total_memory`)
    pub used_memory: u64,
    /// Memory broken down by allocation type, keyed by type name
    pub memory_by_type: HashMap<String, u64>,
    /// Notable large allocations
    pub large_allocations: Vec<AllocationInfo>,
}

/// Allocation detail record
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AllocationInfo {
    /// Allocation size
    // NOTE(review): units presumably bytes — confirm with producer.
    pub size: u64,
    /// Allocation type/category label
    pub allocation_type: String,
    /// When the allocation was made (UTC)
    pub timestamp: DateTime<Utc>,
}

/// Network I/O details
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct NetworkIODetails {
    /// Bytes received per second
    pub bytes_received_per_sec: u64,
    /// Bytes sent per second
    pub bytes_sent_per_sec: u64,
    /// Number of open connections
    pub connection_count: usize,
    /// Incoming request rate per second
    pub request_rate_per_sec: f32,
}

/// Performance analysis report
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct PerformanceAnalysis {
    /// Detected performance bottlenecks
    pub bottlenecks: Vec<PerformanceBottleneck>,
    /// Suggested optimizations
    pub recommendations: Vec<PerformanceRecommendation>,
    /// Trend indicators across key metrics
    pub trends: PerformanceTrends,
}

/// Performance bottleneck metadata
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceBottleneck {
    /// Bottleneck type/category label
    pub bottleneck_type: String,
    /// Severity score
    // NOTE(review): scale not established here — confirm with producer.
    pub severity: f32,
    /// Human-readable description
    pub description: String,
    /// Estimated performance impact (same caveat on scale as `severity`)
    pub performance_impact: f32,
}

/// Optimization recommendation record
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceRecommendation {
    /// Recommendation category label
    pub category: String,
    /// Human-readable description of the change
    pub description: String,
    /// Expected impact score
    // NOTE(review): scale not established here — confirm with producer.
    pub expected_impact: f32,
    /// Implementation complexity score (same caveat on scale)
    pub complexity: f32,
}

/// Performance trend indicators
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct PerformanceTrends {
    /// Direction latency is moving
    pub latency_trend: TrendDirection,
    /// Direction throughput is moving
    pub throughput_trend: TrendDirection,
    /// Direction the error rate is moving
    pub error_rate_trend: TrendDirection,
    /// Direction resource utilization is moving
    pub resource_utilization_trend: TrendDirection,
}

/// Trend direction enum
///
/// Defaults to [`TrendDirection::Stable`]; the manual `Default` impl was
/// replaced by the derived form (clippy `derivable_impls`).
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)]
pub enum TrendDirection {
    /// Metric is improving
    Improving,
    /// Metric is holding steady — the default
    #[default]
    Stable,
    /// Metric is degrading
    Degrading,
    /// Metric is fluctuating with no clear direction
    Volatile,
}

/// System information snapshot
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct SystemInfo {
    /// Operating system description
    pub os: String,
    /// CPU description string
    pub cpu_info: String,
    /// Total system memory
    // NOTE(review): units presumably bytes — confirm with producer.
    pub total_memory: u64,
    /// Available compute devices
    pub devices: Vec<Device>,
    /// Runtime/build information
    pub runtime_info: RuntimeInfo,
}

/// Runtime information block
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct RuntimeInfo {
    /// Rust toolchain version used to build the engine
    pub rust_version: String,
    /// Engine version string
    pub engine_version: String,
    /// Build metadata
    pub build_info: BuildInfo,
    /// Cargo feature flags enabled at build time
    pub feature_flags: Vec<String>,
}

/// Build information metadata
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct BuildInfo {
    /// When the binary was built
    pub build_timestamp: String,
    /// Git commit hash, when available at build time
    pub git_commit: Option<String>,
    /// Build configuration label (e.g. debug/release)
    pub build_config: String,
    /// Compiler flags used for the build
    pub compiler_flags: Vec<String>,
}

/// Engine state export for diagnostics
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EngineState {
    /// Engine configuration at export time
    pub config: EngineConfig,
    /// Metrics at export time
    pub metrics: EngineMetrics,
    /// Per-component state as arbitrary JSON, keyed by component name
    pub component_states: HashMap<String, Value>,
    /// Requests in flight at export time
    pub active_requests: Vec<EngineRequestState>,
    /// When this state was captured (UTC)
    pub timestamp: DateTime<Utc>,
}

/// Request state for debugging
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EngineRequestState {
    /// Request identifier
    pub request_id: RequestId,
    /// Name of the processing phase the request is in
    pub current_phase: String,
    /// Progress summary
    pub progress: EngineRequestProgress,
    /// Resources currently allocated to this request, as arbitrary JSON
    pub allocated_resources: HashMap<String, Value>,
}

/// Request progress summary
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct EngineRequestProgress {
    /// Tokens processed so far
    pub tokens_processed: usize,
    /// Tokens still to process
    pub tokens_remaining: usize,
    /// Elapsed processing time in milliseconds
    pub elapsed_time_ms: u64,
    /// Estimated remaining time in milliseconds, when computable
    pub estimated_remaining_ms: Option<u64>,
}

/// Speculation configuration for speculative decoding
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SpeculationConfig {
    /// Number of tokens to speculate ahead
    pub speculation_depth: usize,
    /// Threshold for accepting speculated tokens
    // NOTE(review): scale not established here — confirm with consumer.
    pub acceptance_threshold: f32,
    /// Configuration of the draft model, when one is used
    pub draft_model_config: Option<crate::ModelConfig>,
}

/// Warmup result structure
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct WarmupResult {
    /// Number of warmup requests processed
    pub requests_processed: usize,
    /// Total warmup wall time in milliseconds
    pub total_time_ms: u64,
    /// Average per-request latency in milliseconds
    pub avg_latency_ms: f64,
    /// Memory allocated during warmup, in bytes
    pub memory_allocated_bytes: u64,
    /// Whether warmup succeeded overall
    pub success: bool,
    /// Human-readable issues encountered during warmup
    pub issues: Vec<String>,
}

/// Hardware constraints for recommendations
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HardwareConstraints {
    /// Devices available for inference
    pub available_devices: Vec<Device>,
    /// Total memory available
    // NOTE(review): units presumably bytes — confirm with consumer.
    pub total_memory: u64,
    /// Expected request rate (requests per second)
    pub expected_request_rate: f32,
    /// Expected workload shape
    pub request_characteristics: RequestCharacteristics,
}

/// Request characteristics summary
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RequestCharacteristics {
    /// Average number of input (prompt) tokens
    pub avg_input_tokens: usize,
    /// Average number of output (generated) tokens
    pub avg_output_tokens: usize,
    /// Typical batch size
    pub typical_batch_size: usize,
    /// Latency targets this workload must meet
    pub latency_requirements: LatencyRequirements,
}

/// Latency requirements for configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LatencyRequirements {
    /// Target 95th-percentile latency in milliseconds
    pub target_p95_latency_ms: u64,
    /// Target 99th-percentile latency in milliseconds
    pub target_p99_latency_ms: u64,
    /// Hard maximum latency in milliseconds
    pub max_latency_ms: u64,
}