swarm-engine-eval 0.1.6

Evaluation framework for SwarmEngine
Documentation
//! Performance metrics

use std::time::Duration;

use serde::{Deserialize, Serialize};

/// Performance metrics derived from per-tick latencies plus caller-supplied counters.
///
/// `PerformanceMetrics::from_latencies` computes the latency, jitter, miss-rate,
/// makespan, raw-throughput and total-duration fields. It leaves
/// `effective_throughput_per_sec` and the three `llm_*` fields at zero; the
/// caller is expected to fill those in afterwards.
///
/// The derived `Default` is the all-zero value, which is also what
/// `from_latencies` returns for an empty latency slice.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct PerformanceMetrics {
    /// Average (arithmetic mean) tick latency in milliseconds
    pub avg_tick_latency_ms: f64,

    /// 95th percentile tick latency in milliseconds
    ///
    /// `from_latencies` picks the sorted sample at index `ceil(0.95 * n) - 1`,
    /// clamped to the last sample.
    pub tick_latency_p95_ms: f64,

    /// 99th percentile tick latency in milliseconds
    ///
    /// `from_latencies` picks the sorted sample at index `ceil(0.99 * n) - 1`,
    /// clamped to the last sample.
    pub tick_latency_p99_ms: f64,

    /// Tick miss rate (ticks exceeding target duration / total ticks)
    ///
    /// A tick counts as a miss only when it is strictly longer than the target.
    /// `from_latencies` reports `0.0` when no target duration is given.
    pub tick_miss_rate: f64,

    /// Tick jitter (std_dev / mean of tick durations)
    ///
    /// `from_latencies` uses the population standard deviation (divides by `n`)
    /// and reports `0.0` when the mean is not positive.
    pub tick_jitter: f64,

    /// Makespan ratio (actual_ticks / max_ticks)
    ///
    /// `actual_ticks` is the number of latency samples; `from_latencies`
    /// reports `0.0` when `max_ticks` is `0`.
    pub makespan_ratio: f64,

    /// Raw throughput (all actions per second, including failures)
    ///
    /// `from_latencies` divides `total_actions` by the total duration in
    /// seconds and reports `0.0` when that duration is zero.
    pub raw_throughput_per_sec: f64,

    /// Effective throughput (successful actions per second)
    ///
    /// Not computed by `from_latencies` (left at `0.0`); set by the caller.
    pub effective_throughput_per_sec: f64,

    /// Total duration in milliseconds
    ///
    /// `from_latencies` sets this to the sum of all tick latencies.
    pub total_duration_ms: f64,

    /// Total LLM invocations
    ///
    /// Not computed by `from_latencies` (left at `0`); set by the caller.
    pub llm_invocations: u64,

    /// LLM invocation errors (parse error, server down, etc.)
    ///
    /// Not computed by `from_latencies` (left at `0`); set by the caller.
    pub llm_invoke_errors: u64,

    /// LLM error rate (errors / invocations)
    ///
    /// Not computed by `from_latencies` (left at `0.0`); set by the caller.
    pub llm_error_rate: f64,
}

impl PerformanceMetrics {
    /// Builds the latency-derived metrics from a series of per-tick durations.
    ///
    /// # Arguments
    ///
    /// * `latencies` - one duration per executed tick, in any order.
    /// * `target_tick_duration` - per-tick budget; ticks strictly longer than
    ///   it count as misses. With `None` the miss rate is `0.0`.
    /// * `total_actions` - number of actions used for the raw throughput.
    /// * `max_ticks` - denominator of the makespan ratio; `0` yields a ratio
    ///   of `0.0`.
    ///
    /// # Returns
    ///
    /// All-zero metrics (`Self::default()`) when `latencies` is empty.
    /// Otherwise the mean, p95/p99, miss rate, jitter, makespan ratio, raw
    /// throughput and total duration are populated, while
    /// `effective_throughput_per_sec` and the `llm_*` fields stay at zero for
    /// the caller to fill in.
    pub fn from_latencies(
        latencies: &[Duration],
        target_tick_duration: Option<Duration>,
        total_actions: u64,
        max_ticks: u64,
    ) -> Self {
        /// Converts a duration to fractional milliseconds.
        fn millis(duration: &Duration) -> f64 {
            duration.as_secs_f64() * 1000.0
        }

        // Nothing to measure: every metric keeps its zero default.
        if latencies.is_empty() {
            return Self::default();
        }

        // Ascending millisecond samples; the percentile lookups index into this.
        let mut sorted_ms: Vec<f64> = latencies.iter().map(millis).collect();
        // The samples come from `Duration`, so they are never NaN or negative
        // and a total order sorts them exactly like a partial comparison would.
        sorted_ms.sort_unstable_by(f64::total_cmp);

        let tick_count = sorted_ms.len();
        let tick_count_f = tick_count as f64;
        let last = tick_count - 1;

        // The sum of all ticks doubles as the total run duration.
        let total_ms: f64 = sorted_ms.iter().sum();
        let mean_ms = total_ms / tick_count_f;

        // Population standard deviation (divides by n, not n - 1).
        let squared_deviations: f64 = sorted_ms
            .iter()
            .map(|sample| (sample - mean_ms).powi(2))
            .sum::<f64>();
        let std_dev_ms = (squared_deviations / tick_count_f).sqrt();

        // Jitter is the coefficient of variation; undefined for a zero mean.
        let tick_jitter = if mean_ms > 0.0 {
            std_dev_ms / mean_ms
        } else {
            0.0
        };

        // Sample at rank ceil(fraction * n), converted to a zero-based index
        // and clamped to the last sample.
        let percentile = |fraction: f64| {
            let rank = (tick_count_f * fraction).ceil() as usize - 1;
            sorted_ms[rank.min(last)]
        };

        // Share of ticks that ran strictly longer than the target, if any.
        let tick_miss_rate = target_tick_duration.map_or(0.0, |target| {
            let budget_ms = millis(&target);
            let over_budget = sorted_ms.iter().filter(|&&ms| ms > budget_ms).count();
            over_budget as f64 / tick_count_f
        });

        // Actions per second over the summed tick time.
        let raw_throughput_per_sec = if total_ms > 0.0 {
            (total_actions as f64) / (total_ms / 1000.0)
        } else {
            0.0
        };

        // Executed ticks relative to the tick limit.
        let makespan_ratio = match max_ticks {
            0 => 0.0,
            limit => tick_count_f / limit as f64,
        };

        Self {
            avg_tick_latency_ms: mean_ms,
            tick_latency_p95_ms: percentile(0.95),
            tick_latency_p99_ms: percentile(0.99),
            tick_miss_rate,
            tick_jitter,
            makespan_ratio,
            raw_throughput_per_sec,
            total_duration_ms: total_ms,
            // `effective_throughput_per_sec` and the `llm_*` fields stay at
            // zero here; they are set by the caller.
            ..Self::default()
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Returns `true` when `actual` lies strictly within `tolerance` of `expected`.
    fn is_close(actual: f64, expected: f64, tolerance: f64) -> bool {
        (actual - expected).abs() < tolerance
    }

    /// An empty latency slice must produce a zero average latency.
    #[test]
    fn test_empty_latencies() {
        let no_samples: Vec<Duration> = Vec::new();
        let metrics = PerformanceMetrics::from_latencies(&no_samples, None, 0, 0);

        assert_eq!(metrics.avg_tick_latency_ms, 0.0);
    }

    /// Checks mean, percentiles, miss rate and makespan on a 1..=100 ms ramp.
    #[test]
    fn test_latency_calculations() {
        // One tick for every whole millisecond from 1 ms up to 100 ms.
        let ramp: Vec<Duration> = (1..=100).map(Duration::from_millis).collect();
        let target = Some(Duration::from_millis(80));

        let PerformanceMetrics {
            avg_tick_latency_ms,
            tick_latency_p95_ms,
            tick_latency_p99_ms,
            tick_miss_rate,
            makespan_ratio,
            ..
        } = PerformanceMetrics::from_latencies(&ramp, target, 1000, 100);

        // The mean of 1..=100 is 50.5 ms.
        assert!(is_close(avg_tick_latency_ms, 50.5, 1.0));

        // The 95th and 99th percentiles sit at roughly 95 ms and 99 ms.
        assert!(is_close(tick_latency_p95_ms, 95.0, 1.0));
        assert!(is_close(tick_latency_p99_ms, 99.0, 1.0));

        // Ticks of 81..=100 ms exceed the 80 ms target: 20 out of 100.
        assert!(is_close(tick_miss_rate, 0.2, 0.01));

        // 100 executed ticks against a 100-tick limit gives a ratio of 1.0.
        assert!(is_close(makespan_ratio, 1.0, 0.01));
    }
}