ai_tokenopt 0.5.7

Adaptive token optimization engine for LLM inference pipelines — compresses prompts, conversation history, tool schemas, and output streams to minimize token usage while preserving response quality.
Documentation
//! Prometheus-compatible optimization metrics.
//!
//! Tracks cumulative token savings, per-strategy usage counts, and
//! output token capping. Exposes metrics in Prometheus text exposition
//! format for easy integration with monitoring infrastructure.

use std::collections::HashMap;
use std::sync::RwLock;
use std::sync::atomic::{AtomicU64, Ordering};

/// Thread-safe optimization metrics using atomic counters.
///
/// Scalar counters are `AtomicU64`s updated with `Ordering::Relaxed` (each
/// counter is independent, so no cross-counter ordering is required). The
/// per-strategy table is an `RwLock`-guarded map whose values are themselves
/// atomics, which lets an already-known strategy be incremented while only
/// holding the shared (read) lock.
#[derive(Debug, Default)]
pub struct OptimizationMetrics {
    /// Cumulative input tokens saved across all optimization calls
    tokens_saved_total: AtomicU64,
    /// Total number of optimization calls
    optimizations_total: AtomicU64,
    /// Per-strategy usage counts
    strategy_uses: RwLock<HashMap<String, AtomicU64>>,
    /// Cumulative reduction ratio sum (× 1000 for fixed-point)
    reduction_ratio_sum: AtomicU64,
    /// Number of times output max_tokens was capped below model max
    output_tokens_capped_total: AtomicU64,
}

impl OptimizationMetrics {
    /// Create a new metrics instance with zeroed counters.
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Record a completed optimization operation.
    ///
    /// * `tokens_before` – token count prior to optimization.
    /// * `tokens_after` – token count after optimization. If it exceeds
    ///   `tokens_before` the saving is clamped to zero.
    /// * `strategy` – name under which the per-strategy counter is bumped.
    pub fn record_optimization(&self, tokens_before: u32, tokens_after: u32, strategy: &str) {
        let saved = tokens_before.saturating_sub(tokens_after);
        self.tokens_saved_total
            .fetch_add(u64::from(saved), Ordering::Relaxed);
        self.optimizations_total.fetch_add(1, Ordering::Relaxed);

        // Record reduction ratio as fixed-point (× 1000). Skipped for an empty
        // input to avoid dividing by zero; such a call still counts towards
        // `optimizations_total`, so it contributes a ratio of 0 to the average.
        if tokens_before > 0 {
            // `saved <= tokens_before`, so the value is within 0..=1000 and the
            // cast can neither truncate nor lose a sign.
            #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
            let ratio_fp = ((f64::from(saved) / f64::from(tokens_before)) * 1000.0).round() as u64;
            self.reduction_ratio_sum
                .fetch_add(ratio_fp, Ordering::Relaxed);
        }

        self.bump_strategy(strategy);
    }

    /// Increment the usage counter for `strategy`, creating it on first use.
    fn bump_strategy(&self, strategy: &str) {
        // Fast path: the strategy is already known, so the atomic can be bumped
        // under the shared lock without allocating a key. A poisoned lock is
        // recovered rather than ignored: the map only holds atomic counters, so
        // a panic in another thread cannot have left it half-updated.
        {
            let known = self
                .strategy_uses
                .read()
                .unwrap_or_else(std::sync::PoisonError::into_inner);
            if let Some(counter) = known.get(strategy) {
                counter.fetch_add(1, Ordering::Relaxed);
                return;
            }
        }

        // Slow path: first sighting of this strategy. `entry` re-checks under
        // the exclusive lock, so a racing insert by another thread is counted
        // on the same counter instead of being overwritten.
        self.strategy_uses
            .write()
            .unwrap_or_else(std::sync::PoisonError::into_inner)
            .entry(strategy.to_string())
            .or_default()
            .fetch_add(1, Ordering::Relaxed);
    }

    /// Record that output max_tokens was capped below the model maximum.
    pub fn record_output_cap(&self) {
        self.output_tokens_capped_total
            .fetch_add(1, Ordering::Relaxed);
    }

    /// Get cumulative tokens saved.
    #[must_use]
    pub fn tokens_saved(&self) -> u64 {
        self.tokens_saved_total.load(Ordering::Relaxed)
    }

    /// Get total optimization call count.
    #[must_use]
    pub fn total_optimizations(&self) -> u64 {
        self.optimizations_total.load(Ordering::Relaxed)
    }

    /// Get average reduction ratio (0.0–1.0).
    ///
    /// Returns `0.0` when nothing has been recorded yet. Each recorded ratio is
    /// stored rounded to three decimal places (fixed-point × 1000).
    #[must_use]
    pub fn avg_reduction_ratio(&self) -> f64 {
        let total = self.optimizations_total.load(Ordering::Relaxed);
        if total == 0 {
            return 0.0;
        }
        let sum = self.reduction_ratio_sum.load(Ordering::Relaxed);
        #[allow(clippy::cast_precision_loss)]
        let avg = (sum as f64 / total as f64) / 1000.0;
        avg
    }

    /// Render metrics in Prometheus text exposition format.
    ///
    /// Emits the scalar metric families first, then (only if at least one
    /// strategy has been recorded) one `tokenopt_strategy_uses_total` sample
    /// per strategy, sorted by strategy name so the output is deterministic.
    /// Strategy names are escaped as label values.
    #[must_use]
    pub fn as_prometheus_text(&self) -> String {
        let mut out = String::with_capacity(512);

        Self::push_scalar(
            &mut out,
            "tokenopt_tokens_saved_total",
            "Cumulative input tokens saved by optimization",
            "counter",
            self.tokens_saved_total.load(Ordering::Relaxed),
        );
        Self::push_scalar(
            &mut out,
            "tokenopt_optimizations_total",
            "Total optimization calls",
            "counter",
            self.optimizations_total.load(Ordering::Relaxed),
        );
        Self::push_scalar(
            &mut out,
            "tokenopt_output_capped_total",
            "Times output max_tokens was reduced",
            "counter",
            self.output_tokens_capped_total.load(Ordering::Relaxed),
        );
        Self::push_scalar(
            &mut out,
            "tokenopt_avg_reduction_ratio",
            "Average token reduction ratio (0-1)",
            "gauge",
            format!("{:.4}", self.avg_reduction_ratio()),
        );

        // Same poison-recovery rationale as in `bump_strategy`: the counters
        // are still meaningful, so keep exporting them.
        let map = self
            .strategy_uses
            .read()
            .unwrap_or_else(std::sync::PoisonError::into_inner);
        if !map.is_empty() {
            out.push_str("# HELP tokenopt_strategy_uses_total Per-strategy usage count\n");
            out.push_str("# TYPE tokenopt_strategy_uses_total counter\n");
            let mut entries: Vec<_> = map.iter().collect();
            // Keys are unique, so an unstable sort yields the same order.
            entries.sort_unstable_by(|(a, _), (b, _)| a.cmp(b));
            for (strategy, count) in entries {
                out.push_str("tokenopt_strategy_uses_total{strategy=\"");
                Self::push_escaped_label_value(&mut out, strategy);
                out.push_str(&format!("\"}} {}\n", count.load(Ordering::Relaxed)));
            }
        }

        out
    }

    /// Append the `# HELP`, `# TYPE` and single unlabelled sample line for one
    /// metric family.
    fn push_scalar(
        out: &mut String,
        name: &str,
        help: &str,
        kind: &str,
        value: impl std::fmt::Display,
    ) {
        out.push_str(&format!(
            "# HELP {name} {help}\n# TYPE {name} {kind}\n{name} {value}\n"
        ));
    }

    /// Append `raw` to `out` escaped as a Prometheus label value: backslash,
    /// double quote and line feed become `\\`, `\"` and `\n` respectively.
    fn push_escaped_label_value(out: &mut String, raw: &str) {
        for ch in raw.chars() {
            match ch {
                '\\' => out.push_str("\\\\"),
                '"' => out.push_str("\\\""),
                '\n' => out.push_str("\\n"),
                other => out.push(other),
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly constructed instance reports zero through every getter.
    #[test]
    fn new_metrics_are_zeroed() {
        let metrics = OptimizationMetrics::new();
        assert_eq!(metrics.tokens_saved(), 0);
        assert_eq!(metrics.total_optimizations(), 0);
        assert!(metrics.avg_reduction_ratio().abs() < f64::EPSILON);
    }

    /// One recorded call bumps both the savings and the call counter.
    #[test]
    fn record_optimization_increments_counters() {
        let metrics = OptimizationMetrics::new();
        metrics.record_optimization(1000, 700, "lossless");
        assert_eq!(metrics.total_optimizations(), 1);
        assert_eq!(metrics.tokens_saved(), 300);
    }

    /// Savings accumulate across calls regardless of the strategy used.
    #[test]
    fn multiple_strategies_tracked() {
        let metrics = OptimizationMetrics::new();
        let calls = [
            (1000, 700, "lossless"),   // saves 300
            (1000, 500, "extractive"), // saves 500
            (1000, 800, "lossless"),   // saves 200
        ];
        for (before, after, strategy) in calls {
            metrics.record_optimization(before, after, strategy);
        }
        assert_eq!(metrics.total_optimizations(), 3);
        assert_eq!(metrics.tokens_saved(), 1000);
    }

    /// The average is the mean of the per-call reduction ratios.
    #[test]
    fn avg_reduction_ratio_computed() {
        let metrics = OptimizationMetrics::new();
        metrics.record_optimization(1000, 500, "test"); // ratio 0.5
        metrics.record_optimization(1000, 800, "test"); // ratio 0.2
        // Mean of 0.5 and 0.2 is 0.35.
        let delta = metrics.avg_reduction_ratio() - 0.35;
        assert!(delta.abs() < 0.01);
    }

    /// Every `record_output_cap` call adds exactly one to the cap counter.
    #[test]
    fn output_cap_counter() {
        let metrics = OptimizationMetrics::new();
        for _ in 0..2 {
            metrics.record_output_cap();
        }
        let capped = metrics.output_tokens_capped_total.load(Ordering::Relaxed);
        assert_eq!(capped, 2);
    }

    /// Rendered text carries the sample lines, the labelled strategy sample
    /// and the TYPE metadata.
    #[test]
    fn prometheus_text_format() {
        let metrics = OptimizationMetrics::new();
        metrics.record_optimization(1000, 700, "lossless");
        let rendered = metrics.as_prometheus_text();

        let expected_fragments = [
            "tokenopt_tokens_saved_total 300",
            "tokenopt_optimizations_total 1",
            "tokenopt_strategy_uses_total{strategy=\"lossless\"} 1",
            "# TYPE tokenopt_tokens_saved_total counter",
        ];
        for fragment in expected_fragments {
            assert!(rendered.contains(fragment));
        }
    }

    /// With nothing recorded the scalar counters are still exported as zero.
    #[test]
    fn prometheus_text_empty() {
        let rendered = OptimizationMetrics::new().as_prometheus_text();
        for fragment in [
            "tokenopt_tokens_saved_total 0",
            "tokenopt_optimizations_total 0",
        ] {
            assert!(rendered.contains(fragment));
        }
    }
}