use chrono::{DateTime, Utc};
use reqwest::Client;
use serde::{Deserialize, Serialize};
use tracing::info;
/// Base URL that `CLIProxyMetricsConfig::default()` uses when no other URL is supplied.
const DEFAULT_BASE_URL: &str = "http://localhost:8317";
/// Connection settings for `CLIProxyMetricsClient`.
///
/// `Debug` is implemented by hand rather than derived so that the API key is
/// never written to logs or panic messages.
#[derive(Clone)]
pub struct CLIProxyMetricsConfig {
    /// Base URL that endpoint paths such as `/v1/usage` are appended to.
    pub base_url: String,
    /// Optional key; when present the client sends it as an
    /// `Authorization: Bearer <key>` header.
    pub api_key: Option<String>,
    /// Cache time-to-live in seconds.
    ///
    /// NOTE(review): no code visible in this file reads this value — confirm
    /// where (or whether) caching is implemented.
    pub cache_ttl_seconds: u64,
}

impl std::fmt::Debug for CLIProxyMetricsConfig {
    /// Formats the config like the derived `Debug` would, but replaces a
    /// present API key with a `"<redacted>"` placeholder.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Keep the Some/None distinction visible without exposing the secret.
        let api_key = self.api_key.as_ref().map(|_| "<redacted>");
        f.debug_struct("CLIProxyMetricsConfig")
            .field("base_url", &self.base_url)
            .field("api_key", &api_key)
            .field("cache_ttl_seconds", &self.cache_ttl_seconds)
            .finish()
    }
}
impl Default for CLIProxyMetricsConfig {
    /// Returns a config that targets `DEFAULT_BASE_URL`, carries no API key,
    /// and uses a 60-second cache TTL.
    fn default() -> Self {
        let base_url = String::from(DEFAULT_BASE_URL);
        Self {
            base_url,
            api_key: None,
            cache_ttl_seconds: 60,
        }
    }
}
/// Metrics record for a single request.
///
/// The `//` section headers below only group fields by their names. The exact
/// meaning and units of most fields are set by whatever produces these records,
/// which is not visible in this file; only the fields written by the
/// `calculate_*` methods are documented individually.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RequestMetrics {
    // --- Identity ---
    pub request_id: String,
    pub session_id: Option<String>,
    pub model: String,
    pub provider: String,

    // --- Token counts ---
    /// Numerator of the input cost and denominator of the verbosity ratio.
    pub input_tokens: u64,
    /// Used for the output cost, the verbosity ratio and tokens-per-second.
    pub output_tokens: u64,
    /// Denominator for `cost_per_1k_tokens`; nothing in this file derives it
    /// from the other counters, so callers must set it themselves.
    pub total_tokens: u64,
    pub cache_write_tokens: u64,
    pub cache_read_tokens: u64,
    pub tool_input_tokens: u64,
    pub tool_output_tokens: u64,
    pub reasoning_tokens: Option<u64>,
    pub prompt_tokens: Option<u64>,
    pub cache_saved_tokens: Option<u64>,
    pub native_input_tokens: Option<u64>,
    pub native_output_tokens: Option<u64>,

    // --- Latency and throughput ---
    // NOTE(review): `time_to_first_token_ms` and `ttft_seconds` look like the
    // same quantity in two units — confirm which one producers populate.
    pub time_to_first_token_ms: Option<f64>,
    /// Read by `calculate_tps` (in milliseconds) to derive `tokens_per_second`.
    pub total_latency_ms: Option<f64>,
    /// Written by `calculate_tps` as `output_tokens / (total_latency_ms / 1000)`.
    pub tokens_per_second: Option<f64>,
    pub ttft_seconds: Option<f64>,
    pub e2e_500_tokens_ms: Option<f64>,
    pub time_to_first_answer_ms: Option<f64>,
    pub inter_token_latency_ms: Option<f64>,
    pub latency_p99_ms: Option<f64>,
    pub queue_time_ms: Option<f64>,
    pub context_processing_ms: Option<f64>,
    pub time_to_first_tool_ms: Option<f64>,
    pub inter_tool_call_ms: Option<f64>,

    // --- Conversation shape and content ---
    pub turn_count: u32,
    pub tool_call_count: u32,
    /// Written by `calculate_verbosity` as `output_tokens / input_tokens`.
    pub verbosity_score: Option<f64>,
    pub avg_response_length: Option<f64>,
    pub images_processed: Option<u32>,
    pub audio_segments: Option<u32>,
    pub code_blocks: Option<u32>,
    pub token_density: Option<f64>,
    pub repetition_score: Option<f64>,
    pub function_call_count: u32,
    pub parallel_tool_calls: Option<u32>,
    pub tool_success_rate: Option<f64>,
    pub token_efficiency: Option<f64>,
    pub sentiment_score: Option<f64>,

    // --- Outcome and errors ---
    pub success: bool,
    pub error_message: Option<String>,
    pub error_code: Option<String>,
    pub error_category: Option<String>,
    pub retry_count: u32,
    pub rate_limited: bool,
    pub truncated: bool,
    pub content_filtered: bool,
    pub rate_limit_wait_ms: Option<u64>,
    pub error_recovery_ms: Option<f64>,
    pub partial_failure: bool,
    pub timed_out: bool,

    // --- Cost ---
    /// Written by `calculate_cost` from `input_tokens` and the input price.
    pub cost_input_usd: Option<f64>,
    /// Written by `calculate_cost` from `output_tokens` and the output price.
    pub cost_output_usd: Option<f64>,
    /// Written by `calculate_cost` as input cost plus output cost.
    pub cost_total_usd: Option<f64>,
    /// Written by `calculate_cost` only when `total_tokens` is non-zero.
    pub cost_per_1k_tokens: Option<f64>,
    pub cache_discount_percent: Option<f64>,
    pub cost_before_optimization: Option<f64>,
    pub cost_reasoning_usd: Option<f64>,
    pub cost_tool_usd: Option<f64>,

    // --- Routing ---
    pub routing_strategy: Option<String>,
    pub used_fallback: bool,
    pub routing_latency_savings_ms: Option<f64>,
    pub routing_cost_savings_percent: Option<f64>,
    pub providers_tried: u32,
    pub primary_provider_latency_ms: Option<f64>,
    pub fallback_provider_latency_ms: Option<f64>,

    // --- Context window ---
    pub context_window: Option<u64>,
    pub context_utilization: Option<f64>,
    pub context_message_count: u32,
    pub hit_context_limit: bool,
    pub context_overflow_tokens: Option<u64>,
    pub sliding_window_position: Option<f64>,

    // --- Agentic behaviour ---
    pub is_agentic: bool,
    pub agent_loop_count: u32,
    pub tool_chain_depth: Option<u32>,
    pub used_external_tools: bool,
    pub task_completed: Option<bool>,
    pub task_steps: Option<u32>,
    pub self_correction_count: u32,
    pub reflection_tokens: Option<u64>,
    pub used_planning: bool,
    pub replan_count: u32,

    /// UTC timestamp of the record; `Default` fills it with the current time.
    pub timestamp: DateTime<Utc>,
}
impl Default for RequestMetrics {
    /// Builds an empty record: strings are empty, counters are zero, flags are
    /// `false` and optional values are `None`, with three exceptions —
    /// `turn_count` starts at 1, `success` starts as `true`, and `timestamp`
    /// is read from the clock at call time.
    ///
    /// Because of the timestamp, every `..Default::default()` struct update
    /// performs a `Utc::now()` call.
    fn default() -> Self {
        Self {
            // Identity
            request_id: String::new(),
            session_id: None,
            model: String::new(),
            provider: String::new(),

            // Token counts
            input_tokens: 0,
            output_tokens: 0,
            total_tokens: 0,
            cache_write_tokens: 0,
            cache_read_tokens: 0,
            tool_input_tokens: 0,
            tool_output_tokens: 0,
            reasoning_tokens: None,
            prompt_tokens: None,
            cache_saved_tokens: None,
            native_input_tokens: None,
            native_output_tokens: None,

            // Latency and throughput
            time_to_first_token_ms: None,
            total_latency_ms: None,
            tokens_per_second: None,
            ttft_seconds: None,
            e2e_500_tokens_ms: None,
            time_to_first_answer_ms: None,
            inter_token_latency_ms: None,
            latency_p99_ms: None,
            queue_time_ms: None,
            context_processing_ms: None,
            time_to_first_tool_ms: None,
            inter_tool_call_ms: None,

            // Conversation shape and content (note: a record starts with one turn)
            turn_count: 1,
            tool_call_count: 0,
            verbosity_score: None,
            avg_response_length: None,
            images_processed: None,
            audio_segments: None,
            code_blocks: None,
            token_density: None,
            repetition_score: None,
            function_call_count: 0,
            parallel_tool_calls: None,
            tool_success_rate: None,
            token_efficiency: None,
            sentiment_score: None,

            // Outcome and errors (note: a record is successful until marked otherwise)
            success: true,
            error_message: None,
            error_code: None,
            error_category: None,
            retry_count: 0,
            rate_limited: false,
            truncated: false,
            content_filtered: false,
            rate_limit_wait_ms: None,
            error_recovery_ms: None,
            partial_failure: false,
            timed_out: false,

            // Cost
            cost_input_usd: None,
            cost_output_usd: None,
            cost_total_usd: None,
            cost_per_1k_tokens: None,
            cache_discount_percent: None,
            cost_before_optimization: None,
            cost_reasoning_usd: None,
            cost_tool_usd: None,

            // Routing
            routing_strategy: None,
            used_fallback: false,
            routing_latency_savings_ms: None,
            routing_cost_savings_percent: None,
            providers_tried: 0,
            primary_provider_latency_ms: None,
            fallback_provider_latency_ms: None,

            // Context window
            context_window: None,
            context_utilization: None,
            context_message_count: 0,
            hit_context_limit: false,
            context_overflow_tokens: None,
            sliding_window_position: None,

            // Agentic behaviour
            is_agentic: false,
            agent_loop_count: 0,
            tool_chain_depth: None,
            used_external_tools: false,
            task_completed: None,
            task_steps: None,
            self_correction_count: 0,
            reflection_tokens: None,
            used_planning: false,
            replan_count: 0,

            // Creation time of this record
            timestamp: Utc::now(),
        }
    }
}
/// Aggregated metrics for one provider.
///
/// `CLIProxyMetricsClient::get_provider_metrics` deserializes a JSON array of
/// these from `/v1/metrics/providers`. Non-`Option` fields must be present in
/// the payload for deserialization to succeed.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProviderMetrics {
    pub provider: String,

    // --- Token totals ---
    pub total_input_tokens: u64,
    pub total_output_tokens: u64,
    pub total_tokens: u64,
    pub total_cache_write_tokens: u64,
    pub total_cache_read_tokens: u64,
    pub total_tool_input_tokens: u64,
    pub total_tool_output_tokens: u64,
    pub total_reasoning_tokens: Option<u64>,

    // --- Throughput and latency statistics ---
    pub avg_tokens_per_second: Option<f64>,
    pub median_tokens_per_second: Option<f64>,
    pub p95_tokens_per_second: Option<f64>,
    pub avg_latency_ms: Option<f64>,
    pub median_latency_ms: Option<f64>,

    // --- Request outcomes ---
    pub total_requests: u64,
    pub successful_requests: u64,
    pub failed_requests: u64,
    // NOTE(review): scale (0-1 fraction vs. 0-100 percent) is not visible here — confirm.
    pub success_rate: Option<f64>,
    pub rate_limited_requests: u64,

    // --- Cost ---
    pub total_cost_usd: Option<f64>,
    pub avg_cost_per_request: Option<f64>,
    pub cost_per_1k_tokens: Option<f64>,

    // --- Activity ---
    pub total_turns: u64,
    pub total_tool_calls: u64,

    pub updated_at: DateTime<Utc>,
}
/// Aggregated metrics for one model of one provider.
///
/// `CLIProxyMetricsClient::get_model_metrics` deserializes a JSON array of
/// these from `/v1/metrics/models`. Non-`Option` fields must be present in the
/// payload for deserialization to succeed.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelMetrics {
    pub model: String,
    pub provider: String,

    // --- Token totals ---
    pub total_input_tokens: u64,
    pub total_output_tokens: u64,
    pub total_tokens: u64,
    pub total_cache_write_tokens: u64,
    pub total_cache_read_tokens: u64,
    pub total_tool_input_tokens: u64,
    pub total_tool_output_tokens: u64,
    pub total_reasoning_tokens: Option<u64>,

    // --- Throughput and latency statistics ---
    pub avg_tokens_per_second: Option<f64>,
    pub median_tokens_per_second: Option<f64>,
    pub p95_tokens_per_second: Option<f64>,
    pub avg_latency_ms: Option<f64>,
    pub median_latency_ms: Option<f64>,

    // --- Verbosity statistics ---
    pub avg_verbosity: Option<f64>,
    pub median_verbosity: Option<f64>,

    // --- Request outcomes ---
    pub total_requests: u64,
    pub successful_requests: u64,
    pub failed_requests: u64,
    // NOTE(review): scale (0-1 fraction vs. 0-100 percent) is not visible here — confirm.
    pub success_rate: Option<f64>,

    // --- Per-request averages ---
    pub avg_turns_per_request: Option<f64>,
    pub avg_tool_calls_per_request: Option<f64>,

    // --- Cost ---
    pub total_cost_usd: Option<f64>,
    pub avg_cost_per_request: Option<f64>,
    pub cost_per_1k_tokens: Option<f64>,

    pub updated_at: DateTime<Utc>,
}
/// One entry of a model ranking list.
///
/// `CLIProxyMetricsClient::get_rankings` reads these from the `rankings` array
/// of the `/v1/rankings` response.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelRanking {
    pub rank: u32,
    // Unlike the other metric types in this file, the key is `model_id`, not `model`.
    pub model_id: String,
    pub provider: String,
    pub quality_score: Option<f64>,
    pub estimated_cost: Option<f64>,
    pub latency_ms: Option<u32>,
    pub weekly_tokens: Option<u64>,
    pub market_share_percent: Option<f64>,
    pub category: Option<String>,
}
/// HTTP client for the proxy's metrics, rankings, usage and health endpoints.
///
/// Both fields are private; build an instance with `CLIProxyMetricsClient::new`.
pub struct CLIProxyMetricsClient {
    /// Underlying `reqwest` client used for every request.
    http_client: Client,
    /// Base URL and optional API key used to build requests.
    config: CLIProxyMetricsConfig,
}
impl CLIProxyMetricsClient {
pub fn new(config: CLIProxyMetricsConfig) -> Self {
Self {
http_client: Client::new(),
config,
}
}
pub async fn get_provider_metrics(&self) -> Result<Vec<ProviderMetrics>, reqwest::Error> {
let url = format!("{}/v1/metrics/providers", self.config.base_url);
let mut request = self.http_client.get(&url);
if let Some(ref key) = self.config.api_key {
request = request.header("Authorization", format!("Bearer {}", key));
}
let response = request.send().await?;
let metrics: Vec<ProviderMetrics> = response.json().await?;
info!("Fetched provider metrics for {} providers", metrics.len());
Ok(metrics)
}
pub async fn get_model_metrics(&self) -> Result<Vec<ModelMetrics>, reqwest::Error> {
let url = format!("{}/v1/metrics/models", self.config.base_url);
let mut request = self.http_client.get(&url);
if let Some(ref key) = self.config.api_key {
request = request.header("Authorization", format!("Bearer {}", key));
}
let response = request.send().await?;
let metrics: Vec<ModelMetrics> = response.json().await?;
info!("Fetched model metrics for {} models", metrics.len());
Ok(metrics)
}
pub async fn get_rankings(
&self,
category: Option<&str>,
limit: Option<u32>,
) -> Result<Vec<ModelRanking>, reqwest::Error> {
let mut url = format!("{}/v1/rankings", self.config.base_url);
let mut params = Vec::new();
if let Some(cat) = category {
params.push(format!("category={}", cat));
}
if let Some(l) = limit {
params.push(format!("limit={}", l));
}
if !params.is_empty() {
url = format!("{}?{}", url, params.join("&"));
}
let mut request = self.http_client.get(&url);
if let Some(ref key) = self.config.api_key {
request = request.header("Authorization", format!("Bearer {}", key));
}
let response = request.send().await?;
#[derive(Deserialize)]
struct RankingsResponse {
rankings: Vec<ModelRanking>,
}
let result: RankingsResponse = response.json().await?;
info!("Fetched {} rankings", result.rankings.len());
Ok(result.rankings)
}
pub async fn get_usage(&self) -> Result<UsageAnalytics, reqwest::Error> {
let url = format!("{}/v1/usage", self.config.base_url);
let mut request = self.http_client.get(&url);
if let Some(ref key) = self.config.api_key {
request = request.header("Authorization", format!("Bearer {}", key));
}
let response = request.send().await?;
let usage: UsageAnalytics = response.json().await?;
Ok(usage)
}
pub async fn health_check(&self) -> Result<HealthStatus, reqwest::Error> {
let url = format!("{}/health", self.config.base_url);
let response = self.http_client.get(&url).send().await?;
let health: HealthStatus = response.json().await?;
Ok(health)
}
}
/// Usage summary for a period, with per-provider and per-model breakdowns.
///
/// `CLIProxyMetricsClient::get_usage` deserializes this from `/v1/usage`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct UsageAnalytics {
    // NOTE(review): the format of `period` (label vs. date range) is not visible here.
    pub period: String,
    pub total_requests: u64,
    pub total_tokens: u64,
    pub total_cost_usd: Option<f64>,
    pub by_provider: Vec<ProviderUsage>,
    pub by_model: Vec<ModelUsage>,
}
/// Usage totals for one provider; element type of `UsageAnalytics::by_provider`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProviderUsage {
    pub provider: String,
    pub requests: u64,
    pub tokens: u64,
    pub cost_usd: Option<f64>,
}
/// Usage totals for one model; element type of `UsageAnalytics::by_model`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelUsage {
    pub model: String,
    pub provider: String,
    pub requests: u64,
    pub tokens: u64,
    pub cost_usd: Option<f64>,
}
/// Health report decoded by `CLIProxyMetricsClient::health_check` from `/health`.
///
/// `status` and `providers` are required keys: a payload that omits either
/// fails to deserialize.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthStatus {
    // NOTE(review): the set of possible `status` strings is not visible here.
    pub status: String,
    pub version: Option<String>,
    pub uptime_seconds: Option<u64>,
    pub providers: Vec<ProviderHealth>,
}
/// Health of a single provider; element type of `HealthStatus::providers`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProviderHealth {
    pub name: String,
    pub healthy: bool,
    pub latency_ms: Option<u32>,
    // NOTE(review): scale (0-1 fraction vs. 0-100 percent) is not visible here — confirm.
    pub error_rate: Option<f64>,
}
impl RequestMetrics {
    /// Stores `output_tokens / input_tokens` in `verbosity_score`.
    ///
    /// Does nothing (any previous score is kept) when either token count is zero.
    pub fn calculate_verbosity(&mut self) {
        if self.output_tokens == 0 || self.input_tokens == 0 {
            return;
        }
        let ratio = self.output_tokens as f64 / self.input_tokens as f64;
        self.verbosity_score = Some(ratio);
    }

    /// Stores output tokens per second in `tokens_per_second`, treating
    /// `total_latency_ms` as milliseconds.
    ///
    /// Does nothing (any previous value is kept) when the latency is missing
    /// or not strictly positive.
    pub fn calculate_tps(&mut self) {
        match self.total_latency_ms {
            Some(latency_ms) if latency_ms > 0.0 => {
                let elapsed_seconds = latency_ms / 1000.0;
                self.tokens_per_second = Some(self.output_tokens as f64 / elapsed_seconds);
            }
            _ => {}
        }
    }

    /// Fills the cost fields from per-million-token prices.
    ///
    /// `cost_input_usd`, `cost_output_usd` and `cost_total_usd` are always
    /// overwritten. `cost_per_1k_tokens` is only written when `total_tokens`
    /// is non-zero; otherwise its previous value is kept.
    pub fn calculate_cost(&mut self, input_price_per_1m: f64, output_price_per_1m: f64) {
        let input_cost = self.input_tokens as f64 * input_price_per_1m / 1_000_000.0;
        let output_cost = self.output_tokens as f64 * output_price_per_1m / 1_000_000.0;
        let total_cost = input_cost + output_cost;

        self.cost_input_usd = Some(input_cost);
        self.cost_output_usd = Some(output_cost);
        self.cost_total_usd = Some(total_cost);

        if self.total_tokens > 0 {
            self.cost_per_1k_tokens = Some(total_cost * 1000.0 / self.total_tokens as f64);
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `actual` is present and within a small absolute tolerance
    /// of `expected`.
    ///
    /// The derived metrics are produced by floating-point arithmetic, so exact
    /// equality would make the tests depend on rounding details.
    fn assert_close(actual: Option<f64>, expected: f64) {
        let value = actual.expect("metric should have been computed");
        assert!(
            (value - expected).abs() < 1e-9,
            "expected {}, got {}",
            expected,
            value
        );
    }

    /// Happy path: all derived metrics are computed from populated inputs.
    #[test]
    fn test_request_metrics_calculations() {
        let mut metrics = RequestMetrics {
            request_id: "test-123".to_string(),
            model: "gpt-4o".to_string(),
            provider: "openai".to_string(),
            input_tokens: 1000,
            output_tokens: 500,
            total_tokens: 1500,
            ..Default::default()
        };
        metrics.total_latency_ms = Some(1000.0);

        metrics.calculate_verbosity();
        metrics.calculate_tps();
        metrics.calculate_cost(2.0, 8.0);

        assert_close(metrics.verbosity_score, 0.5);
        assert_close(metrics.tokens_per_second, 500.0);
        assert_close(metrics.cost_input_usd, 0.002);
        assert_close(metrics.cost_output_usd, 0.004);
        assert_close(metrics.cost_total_usd, 0.006);
        // 0.006 USD over 1500 tokens is 0.004 USD per 1000 tokens.
        assert_close(metrics.cost_per_1k_tokens, 0.004);
    }

    /// Guard branches: with zero tokens and no latency the ratio-based metrics
    /// stay unset, while the absolute costs are still written (as zero).
    #[test]
    fn test_request_metrics_calculations_without_inputs() {
        let mut metrics = RequestMetrics::default();

        metrics.calculate_verbosity();
        metrics.calculate_tps();
        metrics.calculate_cost(2.0, 8.0);

        assert_eq!(metrics.verbosity_score, None);
        assert_eq!(metrics.tokens_per_second, None);
        assert_close(metrics.cost_input_usd, 0.0);
        assert_close(metrics.cost_output_usd, 0.0);
        assert_close(metrics.cost_total_usd, 0.0);
        assert_eq!(metrics.cost_per_1k_tokens, None);
    }
}