kyro 0.1.0

A high-performance ML inference engine
#![allow(dead_code)]

use anyhow::Result;
use prometheus::{Counter, Gauge, Histogram, Registry};
use std::sync::Arc;

pub struct EngineMetrics {
    pub total_requests: Counter,
    pub total_tokens_generated: Counter,
    pub token_latency: Histogram,
    pub time_to_first_token: Histogram,
    pub time_between_tokens: Histogram,
    pub kv_cache_usage: Gauge,
}

impl EngineMetrics {
    pub fn new(registry: &Registry) -> Result<Arc<Self>> {
        let total_requests = prometheus::register_counter_with_registry!(
            "kyro_requests_total",
            "Total number of requests processed",
            registry
        )?;

        let total_tokens_generated = prometheus::register_counter_with_registry!(
            "kyro_tokens_total",
            "Total number of tokens generated",
            registry
        )?;

        let token_latency = prometheus::register_histogram_with_registry!(
            "kyro_token_latency_seconds",
            "Latency per token generation",
            registry
        )?;

        let time_to_first_token = prometheus::register_histogram_with_registry!(
            "kyro_ttft_ms",
            "Time to first token",
            registry
        )?;

        let time_between_tokens = prometheus::register_histogram_with_registry!(
            "kyro_tbt_ms",
            "Time between tokens",
            registry
        )?;

        let kv_cache_usage = prometheus::register_gauge_with_registry!(
            "kyro_kv_cache_usage_percent",
            "KV cache utilization percentage",
            registry
        )?;

        Ok(Arc::new(Self {
            total_requests,
            total_tokens_generated,
            token_latency,
            time_to_first_token,
            time_between_tokens,
            kv_cache_usage,
        }))
    }
}