ai_tokenopt 0.5.10

//! Token reduction benchmark tests
//!
//! Verifies that the optimizer achieves meaningful token reduction
//! on conversations of various sizes and complexity levels.
//
// Test functions intentionally print detailed before/after metrics to stdout
// so the numbers are visible when running with `-- --nocapture`.
#![allow(clippy::print_stdout)]

use ai_tokenopt::config::TokenOptimizationConfig;
use ai_tokenopt::estimator::TokenEstimator;
use ai_tokenopt::optimizer::TokenOptimizer;
use ai_tokenopt::types::Conversation;

// ───────────────────────────────────────────────────────────────────────────
// German calendar prompt — real-world, markdown-heavy user request used as
// a stress test for the optimizer.  It contains:
//   • Markdown headings (###), bold (**…**), horizontal rules (---)
//   • Numbered + bulleted lists with 2-level indentation
//   • A fenced code block (``` … ```)
//   • German umlauts (ä, ö, ü, ß) scattered throughout
// All of this whitespace-heavy structure is a prime target for lossless
// whitespace collapse (Tier-1 compaction).
// ───────────────────────────────────────────────────────────────────────────
/// Shared fixture: a German, markdown-formatted calendar request.
///
/// Used verbatim by the three `german_prompt_*` tests in this file: once as
/// raw estimator input and twice as the final user message of a conversation.
/// The text is a raw string, so every newline and indent below is part of the
/// value — do not reformat it.
const GERMAN_CALENDAR_PROMPT: &str = r#"Erstelle mir einen **detaillierten Kalendereintrag** mit folgenden Angaben:

### **Anlass:**
**Titel:** Kochen um die Welt - Israel
**Datum:** 30. März 2026
**Uhrzeit:** 14:00 - 17:00 Uhr

### **Rezepte für 6 Personen:**
1. **Vorspeise:** Auberginen-Suppe
2. **Hauptgang:** Sabich
3. **Nachspeise:** Halva

### **Anforderungen an den Kalendereintrag:**
1. **Einkaufsliste:**
   - Erstelle eine **Liste mit dem Titel „Einkaufsliste"**.
   - Füge alle benötigten **Lebensmittel als ToDos** hinzu.
   - Gib **Mengenangaben für 6 Personen** an.
   - Struktur: Lebensmittel (Menge, Einheit, ggf. Hinweise wie „frisch" oder „getrocknet").

2. **Kochliste / Kochplan (Titel: „Liste Kochen"):**
   - Erstelle eine **schrittweise Anleitung** für die Zubereitung aller Gerichte.
   - Ordne die Schritte **logisch und effizient** an (z. B. Vorbereitung der Nachspeise zuerst, wenn diese Ruhezeit benötigt).
   - Jeder Schritt soll **einzeln und klar formuliert** sein.
   - Berücksichtige **Zeitmanagement** (z. B. „30 Minuten vor dem Servieren beginnen").
   - Falls nötig, gib **Hinweise zur Parallelisierung** von Arbeitsschritten.

3. **Integration in den Kalendereintrag:**
   - Füge die **Einkaufsliste** und die **Kochliste** als **Anhang oder Notizen** zum Kalendereintrag hinzu.
   - Formatierung: Klare Überschriften, Aufzählungen und ggf. Hervorhebungen für wichtige Hinweise.

### **Zusätzliche Hinweise:**
- Nutze **aktuelle und seriöse Quellen** für die Rezeptangaben (z. B. Kochbücher oder bewährte Online-Rezepte).
- Achte auf **kulturelle Authentizität** der Gerichte.
- Falls Zutaten schwer erhältlich sind, schlage **Alternativen** vor.

---
**Beispiel für die Struktur der Kochliste:**
```
1. **Halva vorbereiten (kann 1 Tag vorher gemacht werden):**
   - Schritt 1: Sesam rösten (10 Min.).
   - Schritt 2: Zucker und Wasser aufkochen (5 Min.).
   - Schritt 3: ...
2. **Auberginen-Suppe:**
   - Schritt 1: Auberginen schälen und würfeln (15 Min.).
   - ...
3. **Sabich:**
   - Schritt 1: Auberginen backen (45 Min. bei 180°C).
   - ...
```
"#;

/// Builds a `TokenOptimizer` from `TokenOptimizationConfig::default()`.
fn default_optimizer() -> TokenOptimizer {
    let config = TokenOptimizationConfig::default();
    TokenOptimizer::new(config)
}

/// Builds a synthetic weather-themed conversation for the size-based tests.
///
/// The conversation starts with a fixed English system prompt and then gets
/// `message_count` user/assistant pairs appended (so `2 * message_count`
/// messages in total). Each message embeds its turn index so the turns are
/// not byte-identical to one another.
fn build_conversation(message_count: usize) -> Conversation {
    const SYSTEM_PROMPT: &str = "You are a helpful AI assistant. You help users with questions about \
         weather, calendar events, email, contacts, and general knowledge. \
         Always be polite, concise, and accurate. If you don't know something, \
         say so honestly. Use tools when appropriate to fulfill user requests.";

    let mut conversation = Conversation::with_system_prompt(SYSTEM_PROMPT);

    for i in 0..message_count {
        // Render both halves of the turn first, then append them in order.
        let question = format!(
            "This is user message number {i}. I have a question about the weather \
             forecast for tomorrow and whether I should bring an umbrella to work.",
        );
        let answer = format!(
            "Based on the weather forecast for tomorrow, it looks like there's a \
             60% chance of rain in the afternoon. I'd recommend bringing an umbrella \
             just to be safe. The temperature will be around 18°C with partly cloudy \
             skies in the morning. Response number {i}.",
        );

        conversation.add_user_message(question);
        conversation.add_assistant_message(answer);
    }

    conversation
}

/// A 3-turn conversation must pass through the default optimizer without
/// reporting compaction and without its token estimate growing.
#[tokio::test]
async fn short_conversation_no_unnecessary_compaction() {
    let optimizer = default_optimizer();
    let mut conversation = build_conversation(3);
    let tokens_before = TokenEstimator::estimate_conversation(&conversation).total;

    let outcome = optimizer
        .optimize_conversation(&mut conversation, None)
        .await
        .expect("optimization should succeed");
    let tokens_after = TokenEstimator::estimate_conversation(&conversation).total;

    // Nothing this small should have been compacted.
    assert!(outcome.compaction.is_none());
    // Equality is allowed: the optimizer may leave the text untouched or only
    // tidy it up slightly, but it must never make it bigger.
    assert!(tokens_after <= tokens_before);
}

/// A 20-turn conversation must not grow under optimization, and the result
/// must carry a non-zero "before" estimate.
///
/// Whether compaction fires is deliberately left unasserted.
/// NOTE(review): the original comment assumed a default context window of
/// 8192 tokens — that value is not visible from this file.
#[tokio::test]
async fn medium_conversation_may_compact() {
    let optimizer = default_optimizer();
    let mut conversation = build_conversation(20);
    let tokens_before = TokenEstimator::estimate_conversation(&conversation).total;

    let outcome = optimizer
        .optimize_conversation(&mut conversation, None)
        .await
        .expect("optimization should succeed");

    let tokens_after = TokenEstimator::estimate_conversation(&conversation).total;

    // Direction check only: never worse than the input.
    assert!(tokens_after <= tokens_before);
    // The optimizer's own pre-optimization estimate must have been populated.
    assert!(outcome.estimate_before.total > 0);
}

/// A 60-turn (120-message) conversation must be compacted by the default
/// optimizer and lose at least 20 % of its estimated tokens.
#[tokio::test]
async fn long_conversation_reduces_tokens() {
    let optimizer = default_optimizer();
    let mut conversation = build_conversation(60);
    let before = TokenEstimator::estimate_conversation(&conversation).total;

    let outcome = optimizer
        .optimize_conversation(&mut conversation, None)
        .await
        .expect("optimization should succeed");

    let after = TokenEstimator::estimate_conversation(&conversation).total;

    // Compaction is mandatory for a conversation of this size.
    assert!(
        outcome.compaction.is_some(),
        "60-turn conversation should trigger compaction"
    );
    assert!(
        after < before,
        "Tokens should be reduced: before={before}, after={after}"
    );

    // Safe to subtract: the assertion above guarantees `after < before`.
    let saved = before - after;
    let reduction_pct = (f64::from(saved) / f64::from(before)) * 100.0;
    assert!(
        reduction_pct >= 20.0,
        "Expected ≥20% reduction, got {reduction_pct:.1}%"
    );
}

/// Tool selection must shrink a 12-tool catalogue to at most 8 entries,
/// reduce the estimated tool tokens, and keep the tool named in the query.
#[tokio::test]
async fn tool_optimization_reduces_count() {
    let optimizer = default_optimizer();

    // Twelve near-identical tools that differ only in their index.
    let mut tools: Vec<ai_tokenopt::types::ToolDefinition> = Vec::new();
    for i in 0..12 {
        let name = format!("tool_{i}");
        let description = format!(
            "This is a detailed description for tool number {i}. \
             It performs various operations related to task management \
             and data processing within the system."
        );
        let parameters = ai_tokenopt::types::ToolParameters {
            schema_type: "object".to_string(),
            properties: std::collections::HashMap::new(),
            required: Vec::new(),
        };
        tools.push(ai_tokenopt::types::ToolDefinition {
            name,
            description,
            parameters,
            icon: None,
        });
    }

    let before_tokens = TokenEstimator::estimate_tool_definitions(&tools);
    let selected = optimizer.optimize_tools("I need tool_3 to process data", &tools);
    let after_tokens = TokenEstimator::estimate_tool_definitions(&selected);

    // The tool the query names explicitly has to survive selection.
    let mentions_tool_3 = selected.iter().any(|tool| tool.name == "tool_3");

    // At most 8 of the 12 tools may remain.
    assert!(
        selected.len() <= 8,
        "Expected ≤8 tools, got {}",
        selected.len()
    );

    // Fewer tools must also mean fewer estimated tokens.
    assert!(
        after_tokens < before_tokens,
        "Tool tokens should decrease: before={before_tokens}, after={after_tokens}"
    );

    assert!(
        mentions_tool_3,
        "Explicitly mentioned tool_3 should be included"
    );
}

/// A 100-turn conversation must be compacted, yet still leave a usable
/// conversation behind: some messages remain and a summary is present.
#[tokio::test]
async fn very_large_conversation_still_produces_valid_output() {
    let optimizer = default_optimizer();
    let mut conversation = build_conversation(100);

    let outcome = optimizer
        .optimize_conversation(&mut conversation, None)
        .await
        .expect("optimization should succeed");

    // Compaction has to have run on an input this large.
    assert!(outcome.compaction.is_some());

    // The optimizer must not prune the conversation down to nothing.
    let has_messages = !conversation.messages.is_empty();
    assert!(
        has_messages,
        "Conversation should retain at least some messages"
    );

    // A summary must have been attached to the conversation.
    let has_summary = conversation.summary.is_some();
    assert!(
        has_summary,
        "Large conversation should produce a summary"
    );
}

// ───────────────────────────────────────────────────────────────────────────
// German calendar prompt tests
// ───────────────────────────────────────────────────────────────────────────

/// **Scenario A — Token estimate for the raw German prompt.**
///
/// Feeds `GERMAN_CALENDAR_PROMPT` straight into the estimator and checks
/// that the result is at least one fifth of the prompt's UTF-8 byte length.
/// The byte, character and token counts are then printed for reference.
///
/// NOTE(review): the original notes state the estimator uses roughly
/// `chars ÷ 4` for text whose non-ASCII share stays below 30 % — that
/// behaviour is not visible from this file.
#[tokio::test]
async fn german_prompt_raw_token_estimate() {
    let rule = "─────────────────────────────────────────────────────";

    // Size of the input, both in UTF-8 bytes and in Unicode scalar values.
    let byte_len = GERMAN_CALENDAR_PROMPT.len();
    let char_len = GERMAN_CALENDAR_PROMPT.chars().count();

    let raw_tokens = TokenEstimator::estimate_tokens(GERMAN_CALENDAR_PROMPT);

    // Deliberately loose floor: one token per five bytes.
    let lower_bound = byte_len / 5;

    // The estimate is widened with `as usize` for the comparison.
    // NOTE(review): presumably a u32, which cannot truncate on targets whose
    // usize is at least 32 bits — confirm against the estimator's signature.
    assert!(
        raw_tokens as usize >= lower_bound,
        "Expected at least {lower_bound} tokens for a {byte_len}-byte prompt, got {raw_tokens}"
    );

    // Report for `-- --nocapture` runs.
    println!("{rule}");
    println!("Scenario A — Raw token estimate");
    println!("  Prompt bytes  : {byte_len}");
    println!("  Prompt chars  : {}", char_len);
    println!("  Token estimate: {raw_tokens}");
    println!("{rule}");
}

/// **Scenario B — Token reduction on a single large message.**
///
/// A single-turn conversation (short system prompt + the German calendar
/// prompt) is run through an optimizer configured with a 1 000-token
/// context window.
///
/// What this test asserts:
/// • The conversation still has a first message after optimisation
/// • The total token estimate does not increase
/// • The total token estimate drops by at least 1 %
/// • The first message's byte length is not larger than the original prompt
///
/// The compaction strategy is printed but not asserted.
///
/// NOTE(review): the scenario was originally described as Tier-1 lossless
/// whitespace collapse with ~15–25 % real-world savings; this file does not
/// show which optimizer phase produces the reduction.
#[tokio::test]
async fn german_prompt_lossless_whitespace_reduction() {
    // Small window relative to the prompt, intended to put the optimizer
    // under token pressure.
    let config = TokenOptimizationConfig {
        context_window_tokens: 1_000,
        ..TokenOptimizationConfig::default()
    };
    let optimizer = TokenOptimizer::new(config);

    // A brief system prompt so the user message dominates the conversation.
    let system_prompt = "You are PiSovereign, a privacy-first AI assistant. \
                         You help users manage their calendar, contacts, and emails. \
                         Always respond in the user's language. Be concise and precise.";

    let mut conv = Conversation::with_system_prompt(system_prompt);
    conv.add_user_message(GERMAN_CALENDAR_PROMPT);

    // Token estimates before optimisation
    let estimate_before = TokenEstimator::estimate_conversation(&conv);
    let raw_msg_tokens = TokenEstimator::estimate_tokens(GERMAN_CALENDAR_PROMPT);

    let result = optimizer
        .optimize_conversation(&mut conv, None)
        .await
        .expect("optimisation should succeed");

    let estimate_after = TokenEstimator::estimate_conversation(&conv);

    // The message must still be present. Failing here, rather than falling
    // back to an empty string, keeps the size checks below from passing
    // vacuously on a missing message.
    // NOTE(review): assumes the user message is the first entry in
    // `conv.messages` after optimisation.
    let optimised_msg_content = conv
        .messages
        .first()
        .map(|m| m.content.as_str())
        .expect("User message should still be present after lossless optimisation");
    let optimised_msg_tokens = TokenEstimator::estimate_tokens(optimised_msg_content);

    // ── Assertions ────────────────────────────────────────────────────────

    // The total token count must not increase after optimisation
    assert!(
        estimate_after.total <= estimate_before.total,
        "Total tokens should not increase: before={}, after={}",
        estimate_before.total,
        estimate_after.total
    );

    // `saturating_sub` and `max(1)` keep the percentage well-defined even if
    // the estimates were equal or zero.
    let total_saved = estimate_before.total.saturating_sub(estimate_after.total);
    let total_reduction_pct =
        (f64::from(total_saved) / f64::from(estimate_before.total.max(1))) * 100.0;

    // Conservative floor: at least 1 % of the total must have been saved.
    assert!(
        total_reduction_pct >= 1.0,
        "Expected ≥1 % total token reduction from whitespace collapse, got {total_reduction_pct:.1} %"
    );

    // The message's UTF-8 byte length must be ≤ the original's
    assert!(
        optimised_msg_content.len() <= GERMAN_CALENDAR_PROMPT.len(),
        "Whitespace-normalised message should not be larger than the original"
    );

    // ── Human-readable report ─────────────────────────────────────────────
    println!("─────────────────────────────────────────────────────");
    println!("Scenario B — Lossless whitespace reduction (1 000-token window)");
    println!("  System prompt : {} tokens", estimate_before.system_prompt);
    println!("  User message  : {raw_msg_tokens} tokens  →  {optimised_msg_tokens} tokens");
    println!("  Total before  : {} tokens", estimate_before.total);
    println!("  Total after   : {} tokens", estimate_after.total);
    println!("  Tokens saved  : {total_saved}  ({total_reduction_pct:.1} % reduction)");
    println!(
        "  Msg chars saved : {}",
        GERMAN_CALENDAR_PROMPT
            .len()
            .saturating_sub(optimised_msg_content.len())
    );
    println!(
        "  Compaction strategy: {:?}",
        result.compaction.as_ref().map(|c| c.strategy)
    );
    println!("─────────────────────────────────────────────────────");
}

/// **Scenario C — History compaction with the German prompt as newest message.**
///
/// Conversation under test:
/// • A verbose system prompt listing the assistant's tools and rules
/// • 8 earlier user/assistant exchanges (16 messages)
/// • The German calendar prompt as the 9th, final user message
/// • An optimizer configured with a 1 000-token context window
///
/// Asserted outcome:
/// • `result.compaction` is `Some`
/// • The total token estimate strictly decreases, by at least 9 %
/// • The most recent user message still contains "Kalendereintrag"
///
/// Message counts, removed messages and summary generation are printed in
/// the report but are not asserted.
#[tokio::test]
async fn german_prompt_full_pipeline_with_history() {
    let rule = "─────────────────────────────────────────────────────";

    // NOTE(review): figures recorded by the original author for this window;
    // they cannot be verified from this file:
    //   headroom     : 250  (25 %)
    //   available    : 750
    //   sys-prompt   :  ~90 (capped at 15 % = 112 → actual ≈ 106)
    //   history-bgt  : 644
    //   compact-thr  : 451  (70 % of 644)
    //   history-real : ≈ 889  → compaction triggers → Phase 1 lossless
    //   after-P1     : ≈ 862  > 644 → continues to Phase 2 extractive
    //   after-P2     : 7 oldest messages pruned (≈ 149 tokens removed)
    let optimizer = TokenOptimizer::new(TokenOptimizationConfig {
        context_window_tokens: 1_000,
        ..TokenOptimizationConfig::default()
    });

    // Long system prompt describing tools and behavioural rules.
    let system_prompt = "You are PiSovereign, an intelligent, privacy-first personal AI \
                         assistant running locally on a Raspberry Pi. \
                         You have access to calendar (CalDAV), contacts (CardDAV), email \
                         (IMAP/SMTP), weather (Open-Meteo), public transit (HAFAS), and \
                         web search (Brave/DuckDuckGo) tools. \
                         Always respond in the user's language. \
                         Never share personal data with third-party services. \
                         Prefer structured, concise responses. \
                         When creating calendar entries, always ask for a confirmation before \
                         saving. When answering factual questions, cite your sources. \
                         You must never fabricate information.";

    // Eight (user, assistant) exchanges that precede the calendar request.
    let prior_turns = [
        (
            "Was ist das Wetter morgen in München?",
            "Morgen in München: 12 °C, leicht bewölkt, 20 % Regenwahrscheinlichkeit. \
             Kein Regenschirm nötig.",
        ),
        (
            "Erinnere mich an meinen Zahnarzttermin nächste Woche.",
            "Ich habe eine Erinnerung für Deinen Zahnarzttermin am Dienstag, 2. April, \
             um 10:30 Uhr gesetzt.",
        ),
        (
            "Zeig mir meine Termine für diese Woche.",
            "Diese Woche hast Du drei Termine: Mo 9:00 Team-Meeting, \
             Mi 14:00 Projektreview, Fr 16:00 Sport.",
        ),
        (
            "Schreib eine kurze E-Mail an Klaus wegen des Projektstatus.",
            "Ich habe eine Entwurfs-E-Mail an Klaus Müller erstellt. Soll ich sie absenden?",
        ),
        (
            "Ja, bitte schick sie ab.",
            "E-Mail wurde erfolgreich an Klaus Müller gesendet.",
        ),
        (
            "Suche nach einem guten Hummus-Rezept.",
            "Hier ist ein klassisches Hummus-Rezept: 400 g Kichererbsen (gekocht), \
             2 EL Tahini, 2 Knoblauchzehen, Zitronensaft, Olivenöl, Salz. \
             Alles pürieren, mit Paprika und Olivenöl garnieren.",
        ),
        (
            "Wie lange dauert die S-Bahn von München Hbf nach Starnberg?",
            "Die S6 fährt direkt und braucht etwa 36 Minuten. \
             Nächste Abfahrt: 14:22 Uhr, Gleis 5.",
        ),
        (
            "Füge meiner Kontaktliste Maria Schmidt, Tel. 089-123456 hinzu.",
            "Kontakt Maria Schmidt (089-123456) wurde erfolgreich hinzugefügt.",
        ),
    ];

    // Assemble the conversation: history first, calendar prompt last.
    let mut conversation = Conversation::with_system_prompt(system_prompt);
    for &(question, answer) in &prior_turns {
        conversation.add_user_message(question);
        conversation.add_assistant_message(answer);
    }
    conversation.add_user_message(GERMAN_CALENDAR_PROMPT);

    // Snapshot, optimise, snapshot again.
    let before = TokenEstimator::estimate_conversation(&conversation);
    let messages_before = conversation.messages.len();

    let outcome = optimizer
        .optimize_conversation(&mut conversation, None)
        .await
        .expect("optimisation should succeed");

    let after = TokenEstimator::estimate_conversation(&conversation);
    let messages_after = conversation.messages.len();

    // `saturating_sub` / `max(1)` keep the percentage well-defined even for
    // degenerate estimates.
    let tokens_saved = before.total.saturating_sub(after.total);
    let baseline = f64::from(before.total.max(1));
    let reduction_pct = f64::from(tokens_saved) / baseline * 100.0;

    // Compaction has to be reported for this history under this window.
    assert!(
        outcome.compaction.is_some(),
        "Expected compaction to trigger on a 9-turn conversation with a tight budget"
    );

    // Strict decrease of the overall estimate.
    assert!(
        after.total < before.total,
        "Total tokens should decrease: before={}, after={}",
        before.total,
        after.total
    );

    // Minimum saving of 9 %.
    // NOTE(review): the original comment attributes the modest bound to
    // budget rebalancing towards the system prompt when no RAG context is
    // present — not verifiable from this file.
    assert!(
        reduction_pct >= 9.0,
        "Expected ≥9 % token reduction, got {reduction_pct:.1} %"
    );

    // Find the newest message whose role's Debug output mentions "user".
    // Going through the Debug text avoids naming the crate's role type here.
    let last_user_msg = conversation
        .messages
        .iter()
        .rfind(|m| format!("{:?}", m.role).to_lowercase().contains("user"))
        .expect("at least one user message must remain");

    // That message must still carry the calendar prompt's key term.
    assert!(
        last_user_msg.content.contains("Kalendereintrag"),
        "The German calendar prompt must still be present in the last user message"
    );

    // Report for `-- --nocapture` runs.
    println!("{rule}");
    println!("Scenario C — Full pipeline (9-turn history + German calendar prompt)");
    println!("  Context window      : 1 000 tokens");
    println!("  Messages before     : {messages_before}");
    println!("  Messages after      : {messages_after}");
    println!("  System prompt tokens: {}", before.system_prompt);
    println!("  History tokens      : {}", before.history);
    println!(
        "  Summary tokens      : {} → {}",
        before.summary, after.summary
    );
    println!("  Total before        : {} tokens", before.total);
    println!("  Total after         : {} tokens", after.total);
    println!("  Tokens saved        : {tokens_saved}  ({reduction_pct:.1} % reduction)");
    if let Some(details) = outcome.compaction.as_ref() {
        println!("  Compaction strategy : {:?}", details.strategy);
        println!("  Messages removed    : {}", details.messages_removed);
        println!("  Tokens saved (hist) : {}", details.tokens_saved);
        println!("  Summary generated   : {}", details.summary_generated);
    }
    if outcome.system_prompt_trimmed {
        println!("  System prompt       : trimmed");
    }
    println!("{rule}");
}