oxibonsai-runtime 0.1.2

//! OpenAI-compatible RAG endpoints for OxiBonsai.
//!
//! Feature-gated with `#[cfg(feature = "rag")]`.
//!
//! # Endpoints
//!
//! | Method | Path | Description |
//! |--------|------|-------------|
//! | POST | `/rag/index` | Index documents, replacing any previously indexed content |
//! | POST | `/rag/query` | RAG-augmented generation |
//! | GET | `/rag/stats` | Pipeline statistics as JSON |
//! | DELETE | `/rag/index` | Clear the vector index |
//!
//! # Usage
//!
//! ```rust,no_run
//! use oxibonsai_runtime::rag_server::create_rag_router;
//! use oxibonsai_runtime::engine::InferenceEngine;
//! use oxibonsai_core::config::Qwen3Config;
//! use oxibonsai_runtime::sampling::SamplingParams;
//!
//! let config = Qwen3Config::tiny_test();
//! let engine = InferenceEngine::new(config, SamplingParams::default(), 42);
//! let router = create_rag_router(engine);
//! ```

use axum::extract::State;
use axum::http::StatusCode;
use axum::response::{IntoResponse, Json, Response};
use axum::Router;
use serde::{Deserialize, Serialize};
use std::sync::{Arc, Mutex};
use tokio::sync::Mutex as TokioMutex;

use oxibonsai_rag::embedding::TfIdfEmbedder;
use oxibonsai_rag::pipeline::{RagConfig, RagPipeline};

use crate::engine::InferenceEngine;

// ─────────────────────────────────────────────────────────────────────────────
// Default corpus used to bootstrap the TF-IDF vocabulary.
//
// TfIdfEmbedder needs a corpus to `fit()` its vocabulary.  We pre-seed it
// with a small general-purpose corpus so the embedder is immediately usable
// before any documents are indexed.  Once `index_documents` is called the
// pipeline is rebuilt with the new corpus vocabulary.
// ─────────────────────────────────────────────────────────────────────────────

/// Seed corpus handed to `TfIdfEmbedder::fit` when no user documents are
/// available: at start-up in `RagState::new` and again in `clear_index`.
const BOOTSTRAP_CORPUS: &[&str] = &[
    "The quick brown fox jumps over the lazy dog.",
    "Artificial intelligence and machine learning are transforming software.",
    "Rust is a systems programming language focused on safety performance and concurrency.",
    "Retrieval-augmented generation combines search with language model generation.",
    "Vector embeddings represent semantic meaning in high-dimensional space.",
];

/// Default vocabulary size cap for `TfIdfEmbedder`.
///
/// Passed as the second argument to every `TfIdfEmbedder::fit` call in this
/// module (bootstrap, re-index and clear).
const DEFAULT_MAX_FEATURES: usize = 512;

// ─────────────────────────────────────────────────────────────────────────────
// Request / Response types
// ─────────────────────────────────────────────────────────────────────────────

/// Request body for `POST /rag/index`.
///
/// Extracted from the JSON payload by the `index_documents` handler.
#[derive(Debug, Deserialize)]
pub struct IndexDocumentRequest {
    /// Raw text documents to index.  An empty list is rejected by the handler
    /// with `400 Bad Request`.
    pub documents: Vec<String>,
    /// Character window size for chunking.  When present it overwrites
    /// `ChunkConfig::chunk_size`; when absent the `ChunkConfig` default is kept.
    pub chunk_size: Option<usize>,
    /// Character overlap between adjacent chunks.  When present it overwrites
    /// `ChunkConfig::overlap`; when absent the `ChunkConfig` default is kept.
    pub chunk_overlap: Option<usize>,
}

/// Response body for `POST /rag/index`.
///
/// Only produced when every submitted document was indexed; the handler
/// returns an error response as soon as one document fails.
#[derive(Debug, Serialize)]
pub struct IndexDocumentResponse {
    /// Number of documents successfully indexed (the length of
    /// `document_ids`).
    pub indexed: usize,
    /// Sum of the per-document chunk counts reported by the pipeline for this
    /// request.
    pub chunks: usize,
    /// Assigned document identifiers: each document's 0-based position in the
    /// request's `documents` list.
    pub document_ids: Vec<usize>,
}

/// Request body for `POST /rag/query`.
#[derive(Debug, Deserialize)]
pub struct RagQueryRequest {
    /// The question or query string.  Rejected with `400 Bad Request` when it
    /// is empty or whitespace-only.
    pub query: String,
    /// Maximum number of tokens to generate (default: 256).
    pub max_tokens: Option<usize>,
    /// Maximum number of retrieved chunks to keep (default: 3).  The handler
    /// applies this by truncating the retriever's result list; the prompt is
    /// built by a separate `build_prompt` call that does not receive it.
    pub top_k: Option<usize>,
    /// Sampling temperature.  Accepted for forward compatibility but currently
    /// ignored: `rag_query` never reads this field, and it is not yet wired
    /// into `InferenceEngine::generate`.
    pub temperature: Option<f32>,
    /// When `true`, the retrieved chunks are included in the response
    /// (default: `false`).
    pub include_context: Option<bool>,
}

/// Response body for `POST /rag/query`.
#[derive(Debug, Serialize)]
pub struct RagQueryResponse {
    /// Generated answer.  `rag_query` currently renders the generated token
    /// ids as decimal numbers joined by single spaces, because no tokenizer is
    /// attached to `RagState`.
    pub answer: String,
    /// The context chunks that were retrieved (present when
    /// `include_context: true` was requested, `None` otherwise).
    pub retrieved_chunks: Option<Vec<String>>,
    /// The full RAG prompt that was built for the query.  Note that
    /// `rag_query` currently feeds the engine a single start token rather than
    /// this prompt, so the text is informational only.
    pub prompt_used: String,
    /// Token / retrieval usage statistics.
    pub usage: RagUsage,
}

/// Token and retrieval usage information attached to a [`RagQueryResponse`].
#[derive(Debug, Serialize)]
pub struct RagUsage {
    /// Number of documents in the index at query time (the pipeline's
    /// `documents_indexed` statistic).
    pub documents_searched: usize,
    /// Number of chunks kept after truncating the retriever output to `top_k`.
    pub chunks_retrieved: usize,
    /// Approximate prompt token count (one token ≈ one whitespace-separated word).
    pub prompt_tokens: usize,
    /// Number of tokens generated by the model (length of the engine output).
    pub completion_tokens: usize,
}

/// Response body for `GET /rag/stats`.
///
/// Every numeric field is copied from the pipeline's `stats()` snapshot.
#[derive(Debug, Serialize)]
pub struct RagStatsResponse {
    /// Number of documents currently indexed.
    pub documents_indexed: usize,
    /// Number of chunks currently in the vector store.
    pub chunks_indexed: usize,
    /// Embedding vector dimensionality.
    pub embedding_dim: usize,
    /// Approximate heap bytes used by the vector store.
    pub store_memory_bytes: usize,
    /// Human-readable representation of `store_memory_bytes`, using binary
    /// units (`B`, `KiB`, `MiB`, `GiB`).
    pub store_memory_human: String,
}

// ─────────────────────────────────────────────────────────────────────────────
// Shared error response helper
// ─────────────────────────────────────────────────────────────────────────────

/// Build a JSON error response.
fn error_response(status: StatusCode, message: impl Into<String>) -> Response {
    let body = serde_json::json!({ "error": message.into() });
    (status, Json(body)).into_response()
}

// ─────────────────────────────────────────────────────────────────────────────
// Human-readable byte formatting
// ─────────────────────────────────────────────────────────────────────────────

/// Render a byte count using binary units (`B`, `KiB`, `MiB`, `GiB`).
///
/// Counts below 1024 are printed as an exact integer (`"512 B"`); larger
/// counts are printed with two decimals in the largest unit that fits, and
/// `GiB` is the largest unit used.
///
/// A value that would *round* to `1024.00` in one unit is promoted to the next
/// unit, so `1024 * 1024 - 1` renders as `"1.00 MiB"` rather than the
/// misleading `"1024.00 KiB"`.
fn human_bytes(bytes: usize) -> String {
    const STEP: f64 = 1024.0;
    const UNITS: [&str; 3] = ["KiB", "MiB", "GiB"];

    if bytes < 1024 {
        return format!("{bytes} B");
    }

    let mut value = bytes as f64 / STEP;
    let mut unit = 0;
    // Compare the value as it will be printed (two decimals): promote while it
    // would show as 1024.00 or more and a larger unit is still available.
    while unit + 1 < UNITS.len() && (value * 100.0).round() >= STEP * 100.0 {
        value /= STEP;
        unit += 1;
    }

    format!("{value:.2} {}", UNITS[unit])
}

// ─────────────────────────────────────────────────────────────────────────────
// Simple token count heuristic (whitespace-split)
// ─────────────────────────────────────────────────────────────────────────────

/// Approximate the number of tokens in `text` by counting its non-empty,
/// whitespace-delimited words.
fn rough_token_count(text: &str) -> usize {
    text.split(char::is_whitespace)
        .filter(|word| !word.is_empty())
        .count()
}

// ─────────────────────────────────────────────────────────────────────────────
// RagState
// ─────────────────────────────────────────────────────────────────────────────

/// Shared state for the RAG server.
///
/// Holds the RAG pipeline (protected by a `std::sync::Mutex` for blocking
/// operations) and the inference engine (protected by a `tokio::sync::Mutex`
/// for async generation).  Handlers receive it as `State<Arc<RagState>>`.
pub struct RagState {
    /// The RAG pipeline.  Uses a `std::sync::Mutex` because all RAG operations
    /// are synchronous (no `.await` points inside the lock).  Handlers replace
    /// the whole pipeline under this lock when re-indexing or clearing.
    pipeline: Mutex<RagPipeline<TfIdfEmbedder>>,
    /// The inference engine wrapped in a tokio async mutex; `rag_query`
    /// acquires it with `.lock().await` for the duration of a generation call.
    engine: Arc<TokioMutex<InferenceEngine<'static>>>,
}

impl RagState {
    /// Build a [`RagState`] around an already shared inference engine.
    ///
    /// The pipeline starts out with a `TfIdfEmbedder` fitted on
    /// `BOOTSTRAP_CORPUS` (capped at `DEFAULT_MAX_FEATURES`) and the default
    /// `RagConfig`, so the embedder vocabulary is non-empty from the start.
    pub fn new(engine: Arc<TokioMutex<InferenceEngine<'static>>>) -> Self {
        let bootstrap_pipeline = RagPipeline::new(
            TfIdfEmbedder::fit(BOOTSTRAP_CORPUS, DEFAULT_MAX_FEATURES),
            RagConfig::default(),
        );

        Self {
            engine,
            pipeline: Mutex::new(bootstrap_pipeline),
        }
    }
}

// ─────────────────────────────────────────────────────────────────────────────
// Handler: POST /rag/index
// ─────────────────────────────────────────────────────────────────────────────

/// Index one or more documents into the RAG vector store.
///
/// If `chunk_size` / `chunk_overlap` are provided, the default `ChunkConfig`
/// is overridden.  After indexing the TF-IDF vocabulary is re-fitted against
/// the newly provided corpus so that future queries benefit from in-domain
/// term frequencies.
pub async fn index_documents(
    State(state): State<Arc<RagState>>,
    Json(req): Json<IndexDocumentRequest>,
) -> impl IntoResponse {
    if req.documents.is_empty() {
        return error_response(StatusCode::BAD_REQUEST, "documents list must not be empty");
    }

    // Build a fresh TF-IDF embedder fitted on the new corpus so that
    // vocabulary is always in-domain.
    let doc_refs: Vec<&str> = req.documents.iter().map(String::as_str).collect();
    let embedder = TfIdfEmbedder::fit(&doc_refs, DEFAULT_MAX_FEATURES);

    // Build chunk config, honouring optional overrides.
    let mut chunk_config = oxibonsai_rag::chunker::ChunkConfig::default();
    if let Some(size) = req.chunk_size {
        chunk_config.chunk_size = size;
    }
    if let Some(overlap) = req.chunk_overlap {
        chunk_config.overlap = overlap;
    }

    let rag_config = RagConfig::default().with_chunk_config(chunk_config);

    // Replace the pipeline with a freshly fitted one.
    let mut new_pipeline = RagPipeline::new(embedder, rag_config);

    let mut document_ids: Vec<usize> = Vec::with_capacity(req.documents.len());
    let mut total_chunks = 0usize;

    for (doc_idx, doc) in req.documents.iter().enumerate() {
        match new_pipeline.index_document(doc) {
            Ok(chunk_count) => {
                document_ids.push(doc_idx);
                total_chunks += chunk_count;
            }
            Err(e) => {
                return error_response(
                    StatusCode::BAD_REQUEST,
                    format!("failed to index document {doc_idx}: {e}"),
                );
            }
        }
    }

    let indexed = document_ids.len();

    // Swap the pipeline in under the mutex.
    match state.pipeline.lock() {
        Ok(mut guard) => {
            *guard = new_pipeline;
        }
        Err(e) => {
            return error_response(
                StatusCode::INTERNAL_SERVER_ERROR,
                format!("pipeline lock poisoned: {e}"),
            );
        }
    }

    let resp = IndexDocumentResponse {
        indexed,
        chunks: total_chunks,
        document_ids,
    };
    (StatusCode::OK, Json(resp)).into_response()
}

// ─────────────────────────────────────────────────────────────────────────────
// Handler: POST /rag/query
// ─────────────────────────────────────────────────────────────────────────────

/// RAG-augmented generation (`POST /rag/query`).
///
/// 1. Retrieves context chunks for `query` and keeps at most `top_k` of them.
/// 2. Builds a prompt for the query via the pipeline's `build_prompt`.
/// 3. Runs inference via the shared `InferenceEngine`.
/// 4. Returns the answer along with optional context and usage metadata.
///
/// # Current limitations
///
/// * The built prompt is **not** fed to the engine: generation always starts
///   from the single token `151644`.  The prompt is only echoed back in
///   `prompt_used` and used for the `prompt_tokens` estimate.
/// * `answer` is the generated token ids rendered as space-separated decimal
///   numbers, not decoded text.
/// * `temperature` is never read.
/// * `top_k` only truncates the list of retrieved chunk texts; `build_prompt`
///   is called with the query alone.
///
/// # Errors
///
/// * `400 Bad Request` — the query is empty / whitespace-only, or
///   `build_prompt` fails.
/// * `500 Internal Server Error` — the pipeline lock is poisoned, retrieval
///   fails with anything other than `NoDocumentsIndexed`, or generation fails.
pub async fn rag_query(
    State(state): State<Arc<RagState>>,
    Json(req): Json<RagQueryRequest>,
) -> impl IntoResponse {
    if req.query.trim().is_empty() {
        return error_response(StatusCode::BAD_REQUEST, "query must not be empty");
    }

    // Apply request defaults.
    let max_tokens = req.max_tokens.unwrap_or(256);
    let top_k = req.top_k.unwrap_or(3);
    let include_context = req.include_context.unwrap_or(false);

    // ── 1. Retrieve context and build the prompt via the RAG pipeline ───────
    // The std mutex guard lives only inside this block, so it is released
    // before the `.await` on the engine lock further down.
    let (prompt, retrieved_chunks, docs_searched, chunks_retrieved) = {
        let pipeline_guard = match state.pipeline.lock() {
            Ok(g) => g,
            Err(e) => {
                return error_response(
                    StatusCode::INTERNAL_SERVER_ERROR,
                    format!("pipeline lock poisoned: {e}"),
                );
            }
        };

        // Snapshot the pipeline statistics up front: `documents_indexed` is
        // reported in the usage block and `chunks_indexed` decides whether
        // retrieval is attempted at all.
        let stats = pipeline_guard.stats();
        let docs_searched = stats.documents_indexed;

        // Retrieve context chunks directly from the retriever so we can also
        // capture the raw chunk texts (needed for `include_context`).
        let retrieved_texts: Vec<String> = if stats.chunks_indexed == 0 {
            // Nothing is indexed, so retrieval is skipped.
            // NOTE(review): `build_prompt` below is still called in this case;
            // confirm it tolerates an empty index, otherwise the request ends
            // in the 400 "prompt build failed" branch.
            Vec::new()
        } else {
            match pipeline_guard.retriever().retrieve_text(&req.query) {
                // `top_k` is applied here, as a cap on the retriever's output.
                Ok(texts) => texts.into_iter().take(top_k).collect(),
                // An empty index is not an error for the caller.
                Err(oxibonsai_rag::RagError::NoDocumentsIndexed) => Vec::new(),
                Err(e) => {
                    return error_response(
                        StatusCode::INTERNAL_SERVER_ERROR,
                        format!("retrieval failed: {e}"),
                    );
                }
            }
        };

        let chunks_retrieved = retrieved_texts.len();

        // Build the full RAG prompt.  This call is independent of the
        // retrieval above and does not receive `top_k`.
        let prompt = match pipeline_guard.build_prompt(&req.query) {
            Ok(p) => p,
            Err(e) => {
                return error_response(
                    StatusCode::BAD_REQUEST,
                    format!("prompt build failed: {e}"),
                );
            }
        };

        (prompt, retrieved_texts, docs_searched, chunks_retrieved)
    };

    // ── 2. Prepare engine input ─────────────────────────────────────────────
    // The engine exposes a token-level API and no tokenizer is available
    // here, so the prompt text is NOT converted to tokens.  Only its
    // whitespace word count is recorded, as the `prompt_tokens` estimate.
    let prompt_tokens_count = rough_token_count(&prompt);

    // Use start token 151644 (Qwen3 BOS) as a single-token prompt.
    // This mirrors the fallback path in the main chat completions handler.
    let input_tokens: Vec<u32> = vec![151644];

    // ── 3. Run inference ─────────────────────────────────────────────────────
    // The async engine lock is held only for the duration of `generate`.
    let output_tokens = {
        let mut engine = state.engine.lock().await;
        match engine.generate(&input_tokens, max_tokens) {
            Ok(tokens) => tokens,
            Err(e) => {
                return error_response(
                    StatusCode::INTERNAL_SERVER_ERROR,
                    format!("generation failed: {e}"),
                );
            }
        }
    };

    let completion_tokens = output_tokens.len();

    // Render the generated token ids as space-separated numbers (numeric
    // fallback, since no tokenizer is attached to RagState for decoding).
    let answer = output_tokens
        .iter()
        .map(|t| t.to_string())
        .collect::<Vec<_>>()
        .join(" ");

    // ── 4. Build response ────────────────────────────────────────────────────
    let resp = RagQueryResponse {
        answer,
        // Chunk texts are echoed back only when the caller opted in.
        retrieved_chunks: if include_context {
            Some(retrieved_chunks)
        } else {
            None
        },
        prompt_used: prompt,
        usage: RagUsage {
            documents_searched: docs_searched,
            chunks_retrieved,
            prompt_tokens: prompt_tokens_count,
            completion_tokens,
        },
    };

    (StatusCode::OK, Json(resp)).into_response()
}

// ─────────────────────────────────────────────────────────────────────────────
// Handler: GET /rag/stats
// ─────────────────────────────────────────────────────────────────────────────

/// Handle `GET /rag/stats`: report the pipeline's current statistics as JSON.
///
/// The pipeline lock is held only long enough to take a statistics snapshot.
/// A poisoned lock is answered with `500 Internal Server Error`.
pub async fn rag_stats(State(state): State<Arc<RagState>>) -> impl IntoResponse {
    let snapshot = {
        let guard = match state.pipeline.lock() {
            Ok(guard) => guard,
            Err(e) => {
                return error_response(
                    StatusCode::INTERNAL_SERVER_ERROR,
                    format!("pipeline lock poisoned: {e}"),
                );
            }
        };
        guard.stats()
    };

    let store_memory_human = human_bytes(snapshot.store_memory_bytes);
    let body = Json(RagStatsResponse {
        documents_indexed: snapshot.documents_indexed,
        chunks_indexed: snapshot.chunks_indexed,
        embedding_dim: snapshot.embedding_dim,
        store_memory_bytes: snapshot.store_memory_bytes,
        store_memory_human,
    });

    (StatusCode::OK, body).into_response()
}

// ─────────────────────────────────────────────────────────────────────────────
// Handler: DELETE /rag/index
// ─────────────────────────────────────────────────────────────────────────────

/// Handle `DELETE /rag/index`: discard everything that has been indexed.
///
/// The live pipeline is replaced by a fresh one whose TF-IDF embedder is
/// fitted on `BOOTSTRAP_CORPUS`, so the pipeline remains usable after the
/// clear.  Replies with `{"status": "cleared"}` on success and with
/// `500 Internal Server Error` when the pipeline lock is poisoned.
pub async fn clear_index(State(state): State<Arc<RagState>>) -> impl IntoResponse {
    let fresh_pipeline = RagPipeline::new(
        TfIdfEmbedder::fit(BOOTSTRAP_CORPUS, DEFAULT_MAX_FEATURES),
        RagConfig::default(),
    );

    {
        let mut live = match state.pipeline.lock() {
            Ok(guard) => guard,
            Err(e) => {
                return error_response(
                    StatusCode::INTERNAL_SERVER_ERROR,
                    format!("pipeline lock poisoned: {e}"),
                );
            }
        };
        *live = fresh_pipeline;
    }

    (StatusCode::OK, Json(serde_json::json!({ "status": "cleared" }))).into_response()
}

// ─────────────────────────────────────────────────────────────────────────────
// Router factory
// ─────────────────────────────────────────────────────────────────────────────

/// Assemble the Axum [`Router`] that serves every RAG endpoint.
///
/// `engine` is moved into an `Arc<TokioMutex<_>>`, wrapped in a [`RagState`],
/// and that state is shared with all four handlers:
/// `POST /rag/index`, `DELETE /rag/index`, `POST /rag/query` and
/// `GET /rag/stats`.
pub fn create_rag_router(engine: InferenceEngine<'static>) -> Router {
    use axum::routing::{delete, get, post};

    let shared_state = Arc::new(RagState::new(Arc::new(TokioMutex::new(engine))));

    Router::new()
        .route("/rag/index", post(index_documents))
        .route("/rag/index", delete(clear_index))
        .route("/rag/query", post(rag_query))
        .route("/rag/stats", get(rag_stats))
        .with_state(shared_state)
}

// ─────────────────────────────────────────────────────────────────────────────
// Unit tests
// ─────────────────────────────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    use super::*;

    /// Exact unit boundaries render with the expected suffix and precision.
    #[test]
    fn human_bytes_formatting() {
        let expectations: [(usize, &str); 5] = [
            (0, "0 B"),
            (512, "512 B"),
            (1024, "1.00 KiB"),
            (1024 * 1024, "1.00 MiB"),
            (1024 * 1024 * 1024, "1.00 GiB"),
        ];

        for (bytes, rendered) in expectations.iter() {
            assert_eq!(human_bytes(*bytes), *rendered);
        }
    }

    /// Whitespace runs of any length separate words; empty input has none.
    #[test]
    fn rough_token_count_basic() {
        let expectations: [(&str, usize); 3] = [
            ("", 0),
            ("one two three", 3),
            ("  spaces  everywhere  ", 2),
        ];

        for (text, words) in expectations.iter() {
            assert_eq!(rough_token_count(text), *words);
        }
    }

    /// Constructing the shared state around a tiny test engine must not panic.
    #[test]
    fn rag_state_creates_without_panic() {
        use crate::sampling::SamplingParams;
        use oxibonsai_core::config::Qwen3Config;

        let engine = InferenceEngine::new(Qwen3Config::tiny_test(), SamplingParams::default(), 42);
        let _state = RagState::new(Arc::new(TokioMutex::new(engine)));
    }
}