oxibonsai-runtime 0.1.2

Inference runtime, sampling, tokenizer, and server for OxiBonsai
Documentation
//! Admin API endpoints for operational management.
//!
//! Provides non-OpenAI routes used by operators to inspect and control a
//! running OxiBonsai server instance.
//!
//! | Method | Path                    | Description                        |
//! |--------|-------------------------|------------------------------------|
//! | GET    | `/admin/status`         | Server status and live metrics     |
//! | GET    | `/admin/config`         | Current configuration snapshot     |
//! | POST   | `/admin/reset-metrics`  | Reset all metric counters to zero  |
//! | GET    | `/admin/cache-stats`    | KV/inference cache statistics      |
//!
//! # Example
//!
//! ```rust,ignore
//! use std::sync::Arc;
//! use oxibonsai_runtime::admin::{AdminState, create_admin_router};
//! use oxibonsai_runtime::metrics::InferenceMetrics;
//!
//! let metrics = Arc::new(InferenceMetrics::new());
//! let state = Arc::new(AdminState::new(metrics));
//! let router = create_admin_router(Arc::clone(&state));
//! ```

use axum::{
    extract::State,
    http::StatusCode,
    response::IntoResponse,
    routing::{get, post},
    Json, Router,
};
use serde::Serialize;
use std::sync::Arc;
use std::time::Instant;

use crate::metrics::InferenceMetrics;

// ─── Server status response ─────────────────────────────────────────────────

/// Live server status snapshot.
///
/// Serialized as the JSON body of `GET /admin/status`; the field names below
/// are the keys that appear in that response.
#[derive(Debug, Serialize)]
pub struct ServerStatus {
    /// Crate version string (from `CARGO_PKG_VERSION`).
    pub version: &'static str,
    /// Whole seconds elapsed since the server started.
    pub uptime_secs: u64,
    /// Whether the inference model has been loaded.
    ///
    /// The status handler currently derives this from a placeholder
    /// heuristic (any request or generated token recorded), not a real flag.
    pub model_loaded: bool,
    /// Cumulative requests received since last reset.
    pub requests_total: u64,
    /// Cumulative tokens generated since last reset.
    pub tokens_generated: u64,
    /// Number of requests currently in flight (taken from the
    /// `active_requests` gauge and cast to an integer).
    pub active_connections: u64,
    /// Process resident-set-size in bytes, if available.
    ///
    /// `None` when the RSS probe reports `0`.
    pub memory_rss_bytes: Option<u64>,
}

// ─── Config snapshot response ────────────────────────────────────────────────

/// Snapshot of key server configuration values.
///
/// Serialized as the JSON body of `GET /admin/config`. The handler in this
/// file currently fills the three defaults with hard-coded literals rather
/// than reading them from a live configuration object.
#[derive(Debug, Serialize)]
pub struct ConfigSnapshot {
    /// Default maximum generation tokens per request.
    pub max_tokens_default: usize,
    /// Default sampling temperature.
    pub temperature_default: f32,
    /// Default nucleus sampling probability threshold.
    pub top_p_default: f32,
    /// Crate version string (from `CARGO_PKG_VERSION`).
    pub server_version: &'static str,
    /// List of compiled-in feature flags, as produced by `features_enabled()`.
    pub features: Vec<String>,
}

// ─── AdminState ──────────────────────────────────────────────────────────────

/// State shared by every admin route handler.
///
/// Handlers receive it wrapped in an `Arc` through Axum's `State` extractor.
pub struct AdminState {
    /// Instant captured when this state was constructed; uptime is measured
    /// from here.
    pub started_at: Instant,
    /// Handle to the shared inference metrics.
    pub metrics: Arc<InferenceMetrics>,
}

impl AdminState {
    /// Build an `AdminState` around `metrics`, stamping the start time with
    /// the current instant.
    pub fn new(metrics: Arc<InferenceMetrics>) -> Self {
        let started_at = Instant::now();
        Self {
            started_at,
            metrics,
        }
    }

    /// Whole seconds elapsed since `started_at` (the sub-second part is
    /// discarded).
    pub fn uptime_secs(&self) -> u64 {
        let running_for = Instant::now().duration_since(self.started_at);
        running_for.as_secs()
    }
}

// ─── Route handlers ──────────────────────────────────────────────────────────

/// `GET /admin/status` — return live server status and metrics.
///
/// Responds with `200 OK` and a JSON-serialized [`ServerStatus`].
///
/// Each counter is read exactly once, so `model_loaded` always agrees with
/// the `requests_total` / `tokens_generated` values reported in the same
/// response even while other requests are updating the metrics.
pub async fn get_status(State(state): State<Arc<AdminState>>) -> impl IntoResponse {
    let metrics = &state.metrics;

    // Take one reading per counter and reuse it below; reading twice could
    // report e.g. `model_loaded: false` next to a non-zero request count.
    let requests_total = metrics.requests_total.get();
    let tokens_generated = metrics.tokens_generated_total.get();

    // A reading of 0 from the RSS probe is reported as "not available".
    let memory_rss_bytes = Some(crate::memory::get_rss_bytes()).filter(|&bytes| bytes != 0);

    let status = ServerStatus {
        version: env!("CARGO_PKG_VERSION"),
        uptime_secs: state.uptime_secs(),
        // We treat "model loaded" as true when at least one request has been
        // handled (a placeholder heuristic; callers can extend AdminState for
        // a real flag).
        model_loaded: requests_total > 0 || tokens_generated > 0,
        requests_total,
        tokens_generated,
        // NOTE(review): the gauge is set with float literals elsewhere in this
        // file, so this is presumably a float → integer cast; such `as` casts
        // saturate (negative and NaN readings become 0) — confirm the gauge type.
        active_connections: metrics.active_requests.get() as u64,
        memory_rss_bytes,
    };

    (StatusCode::OK, Json(status))
}

/// `GET /admin/config` — report the server's default generation settings.
///
/// Responds with `200 OK` and a JSON-serialized [`ConfigSnapshot`]. The
/// shared state is accepted but not consulted: the three defaults are
/// literals in this handler, and the feature list comes from
/// `features_enabled()`.
pub async fn get_config(_state: State<Arc<AdminState>>) -> impl IntoResponse {
    let server_version = env!("CARGO_PKG_VERSION");
    let features = features_enabled();

    let payload = Json(ConfigSnapshot {
        max_tokens_default: 256,
        temperature_default: 0.7,
        top_p_default: 0.9,
        server_version,
        features,
    });

    (StatusCode::OK, payload)
}

/// `POST /admin/reset-metrics` — reset all metric counters to zero.
///
/// Zeroes the request, generated-token, error and prompt-token counters and
/// sets the `active_requests` and `kv_cache_utilization` gauges to `0.0`.
///
/// Returns `200 OK` with a JSON object `{"reset": true, "timestamp": <secs>}`,
/// where `timestamp` is an integer number of seconds since the Unix epoch
/// (`0` if the system clock reads earlier than the epoch).
pub async fn reset_metrics(State(state): State<Arc<AdminState>>) -> impl IntoResponse {
    let metrics = &state.metrics;

    // Only `get()` and `inc_by(n)` are used on the counters here, so a reset
    // is expressed as adding the two's-complement negation of the current
    // reading: `current + (2^64 - current)` wraps around to exactly 0.
    // Increments that land between the read and the add are preserved rather
    // than lost.
    // NOTE(review): this assumes `inc_by` wraps on overflow instead of
    // saturating or panicking — confirm against the counter implementation.
    metrics
        .requests_total
        .inc_by(metrics.requests_total.get().wrapping_neg());
    metrics
        .tokens_generated_total
        .inc_by(metrics.tokens_generated_total.get().wrapping_neg());
    metrics
        .errors_total
        .inc_by(metrics.errors_total.get().wrapping_neg());
    metrics
        .prompt_tokens_total
        .inc_by(metrics.prompt_tokens_total.get().wrapping_neg());

    // Also reset gauges.
    // NOTE(review): zeroing `active_requests` while requests are still in
    // flight may leave the gauge below its true value once they finish —
    // confirm this is the intended behaviour.
    metrics.active_requests.set(0.0);
    metrics.kv_cache_utilization.set(0.0);

    // Seconds since the Unix epoch; falls back to 0 if the clock is set
    // before the epoch instead of failing the request.
    let ts = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .unwrap_or_default()
        .as_secs();

    let body = serde_json::json!({
        "reset": true,
        "timestamp": ts,
    });

    (StatusCode::OK, Json(body))
}

/// `GET /admin/cache-stats` — report cache statistics.
///
/// Every figure is currently a hard-coded zero placeholder; the shared state
/// is accepted but not consulted. Responds with `200 OK` and a JSON object
/// holding `kv_cache`, `prefix_cache` and `status` entries.
pub async fn get_cache_stats(_state: State<Arc<AdminState>>) -> impl IntoResponse {
    // Placeholder KV-cache section.
    let kv_cache = serde_json::json!({
        "capacity_blocks": 0,
        "used_blocks": 0,
        "utilization": 0.0,
        "evictions_total": 0,
    });

    // Placeholder prefix-cache section.
    let prefix_cache = serde_json::json!({
        "entries": 0,
        "hit_rate": 0.0,
    });

    let payload = serde_json::json!({
        "kv_cache": kv_cache,
        "prefix_cache": prefix_cache,
        "status": "ok",
    });

    (StatusCode::OK, Json(payload))
}

// ─── Router builder ──────────────────────────────────────────────────────────

/// Build the Axum router for all admin endpoints.
///
/// Registers `GET /admin/status`, `GET /admin/config`,
/// `POST /admin/reset-metrics` and `GET /admin/cache-stats`, then attaches
/// `state` to them with `with_state`.
///
/// The routes are registered with their full `/admin/...` paths, so the
/// router can be used directly on its own in tests.
///
/// NOTE(review): because the paths already carry the `/admin` prefix,
/// nesting this router under `/admin` again would presumably expose
/// `/admin/admin/...`; merging it into the main router looks like the
/// intended use — confirm against callers.
///
/// NOTE(review): the declared return type is `Router<Arc<AdminState>>` even
/// though `state` has already been supplied here — confirm this matches how
/// the main router consumes it.
pub fn create_admin_router(state: Arc<AdminState>) -> Router<Arc<AdminState>> {
    Router::new()
        .route("/admin/status", get(get_status))
        .route("/admin/config", get(get_config))
        .route("/admin/reset-metrics", post(reset_metrics))
        .route("/admin/cache-stats", get(get_cache_stats))
        .with_state(state)
}

// ─── Feature detection ───────────────────────────────────────────────────────

/// Report which optional capabilities this build was compiled with.
///
/// The returned list holds, in this order:
///
/// 1. each of the Cargo features `server`, `rag` and `wasm` that is enabled;
/// 2. the target architecture, when it is one of `wasm32`, `x86_64` or
///    `aarch64`;
/// 3. `runtime`, which is always present.
pub fn features_enabled() -> Vec<String> {
    // Each flag is resolved at compile time; the order of this table is the
    // order of the output.
    const OPTIONAL: [(bool, &str); 6] = [
        (cfg!(feature = "server"), "server"),
        (cfg!(feature = "rag"), "rag"),
        (cfg!(feature = "wasm"), "wasm"),
        (cfg!(target_arch = "wasm32"), "wasm32"),
        (cfg!(target_arch = "x86_64"), "x86_64"),
        (cfg!(target_arch = "aarch64"), "aarch64"),
    ];

    OPTIONAL
        .iter()
        .filter(|entry| entry.0)
        .map(|entry| entry.1)
        // Always include the runtime itself.
        .chain(std::iter::once("runtime"))
        .map(str::to_owned)
        .collect()
}

// ─── Unit tests ──────────────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly built state must report an uptime close to zero.
    #[test]
    fn test_admin_state_uptime() {
        let state = AdminState::new(Arc::new(InferenceMetrics::new()));
        let uptime = state.uptime_secs();
        // The bound is deliberately loose so a slow test machine cannot make
        // this flaky.
        assert!(
            uptime < 5,
            "uptime should be nearly 0 at creation; got {uptime}"
        );
    }

    /// The feature list is never empty and always names the runtime.
    #[test]
    fn test_features_enabled_non_empty() {
        let enabled = features_enabled();
        assert!(!enabled.is_empty(), "features list should not be empty");

        let has_runtime = enabled.iter().any(|name| name == "runtime");
        assert!(has_runtime, "should always include 'runtime'");
    }

    /// The compile-time crate version must be a non-empty string.
    #[test]
    fn test_server_version_non_empty() {
        let version = env!("CARGO_PKG_VERSION");
        assert!(!version.is_empty(), "CARGO_PKG_VERSION should not be empty");
    }
}