reflex-server 0.2.2

OpenAI-compatible HTTP gateway for Reflex cache
Documentation
mod common;

use async_openai::types::chat::{
    ChatCompletionRequestMessage, ChatCompletionRequestUserMessageArgs,
    CreateChatCompletionRequestArgs,
};
use common::harness::{TestServerConfig, spawn_real_server};
use common::http_client::TestClient;
use std::time::Instant;

/// Builds a chat completion request for the `gpt-4o` model containing a single user
/// message whose content is `prompt`.
///
/// # Panics
///
/// Panics if the user-message builder or the request builder returns an error.
fn create_request(prompt: &str) -> async_openai::types::chat::CreateChatCompletionRequest {
    // Build the lone user message first so the request builder chain stays flat.
    let user_message = ChatCompletionRequestUserMessageArgs::default()
        .content(prompt)
        .build()
        .unwrap();
    let conversation = [ChatCompletionRequestMessage::User(user_message)];

    CreateChatCompletionRequestArgs::default()
        .model("gpt-4o")
        .messages(conversation)
        .build()
        .unwrap()
}

/// Manual latency benchmark: times two chat-completion requests against a freshly
/// spawned server and prints the comparison.
///
/// The first request is labelled a cache miss. After a fixed 2-second pause, a
/// paraphrase of the first prompt is sent and labelled as the intended cache hit.
/// Nothing is asserted about the statuses or the timings — results are only printed —
/// so the test is marked `#[ignore]` and must be requested explicitly.
///
/// If `.models/qwen2.5-0.5b-instruct-q8_0.gguf` or `.models/ms-marco-MiniLM-L-6-v2`
/// exist under the current working directory, their paths are exported through
/// `REFLEX_MODEL_PATH` / `REFLEX_RERANKER_PATH` before the server is spawned.
///
/// # Panics
///
/// Panics if the working directory cannot be read, the server fails to spawn, or
/// either request returns an error.
#[tokio::test]
#[ignore]
async fn test_benchmark_performance() {
    println!("\n=== Starting Performance Benchmark ===");

    let cwd = std::env::current_dir().expect("Failed to get CWD");

    let model_path = cwd.join(".models/qwen2.5-0.5b-instruct-q8_0.gguf");
    if model_path.exists() {
        println!("> Using Local Embedder: {:?}", model_path);
        // SAFETY: `set_var` is only sound while no other thread reads or writes the
        // process environment. NOTE(review): this relies on the benchmark being run
        // on its own (it is `#[ignore]`d); confirm nothing else in this test binary
        // touches the environment concurrently.
        unsafe {
            std::env::set_var("REFLEX_MODEL_PATH", model_path);
        }
    } else {
        println!("> Local Embedder not found. Using Stub Embedder.");
    }

    let reranker_path = cwd.join(".models/ms-marco-MiniLM-L-6-v2");
    if reranker_path.exists() {
        println!("> Using Local Reranker: {:?}", reranker_path);
        // SAFETY: same requirement as for `REFLEX_MODEL_PATH` above — no concurrent
        // access to the process environment. NOTE(review): confirm the benchmark is
        // run in isolation.
        unsafe {
            std::env::set_var("REFLEX_RERANKER_PATH", reranker_path);
        }
    } else {
        // No download is attempted here; the messages only report the fallback.
        println!("> Local Reranker not found. Downloading via hf-hub...");
        println!("> Skipping download complexity. Will run with Stub Reranker.");
    }

    // The environment variables are set before spawning; presumably the server
    // reads them at startup — verify against `spawn_real_server`.
    let config = TestServerConfig::default();
    let server = spawn_real_server(config)
        .await
        .expect("Failed to spawn server");
    let client = TestClient::new(server.url());

    let prompt = "How do I implement a binary search tree in Rust with interior mutability?";
    let req = create_request(prompt);

    println!("> Sending Initial Request (Miss)...");
    let start_miss = Instant::now();
    // `req` is not needed afterwards, so it is moved rather than cloned.
    let (_, status1) = client.chat_completions(req).await.expect("Request failed");
    let duration_miss = start_miss.elapsed();
    println!("  Status: {}", status1);
    println!("  Latency: {:.2?}", duration_miss);

    // Fixed delay before the second request; presumably long enough for the server
    // to finish indexing the first response in the background.
    println!("> Waiting for Async Indexing...");
    tokio::time::sleep(std::time::Duration::from_secs(2)).await;

    println!("> Sending Second Request (Targeting Hit)...");
    // A paraphrase of the first prompt rather than an exact repeat.
    let prompt2 = "Show me a Rust binary search tree implementation using interior mutability.";
    let req2 = create_request(prompt2);
    let start_hit = Instant::now();
    let (_, status2) = client.chat_completions(req2).await.expect("Request failed");
    let duration_hit = start_hit.elapsed();
    println!("  Status: {}", status2);
    println!("  Latency: {:.2?}", duration_hit);

    println!("\n--- Results ---");
    println!("Miss Latency: {:.2?}", duration_miss);
    println!("Hit Latency:  {:.2?}", duration_hit);
    // A speedup factor is only reported when the second request was strictly faster.
    if duration_hit < duration_miss {
        let speedup = duration_miss.as_secs_f64() / duration_hit.as_secs_f64();
        println!("Speedup: {:.2}x", speedup);
    } else {
        println!("Speedup: None (Hit was slower?)");
    }
    println!("======================================\n");
}