// aidaemon 0.9.35

// ==================== Task Boundary on Multi-Turn Tasks ====================

/// Regression: context from an earlier interaction must not bleed into the
/// current task when one session spans several turns.
///
/// Scenario: Turn 1 asks "Why?", Turn 2 asks for a resume in Spanish.
/// Every Turn 2 LLM call that still carries the old "Why?" user message must
/// also carry a system message containing `[Current Task]`. Calls from which
/// the old message was dropped are accepted as-is.
#[tokio::test]
async fn test_task_boundary_injected_between_turns() {
    const SESSION: &str = "boundary_test";
    const OLD_REQUEST: &str = "Why?";
    const CURRENT_REQUEST: &str = "Send me the resume in Spanish now.";

    // Scripted replies, consumed in order by the mock provider.
    let scripted = vec![
        // Turn 1: routing call reports the question can be answered directly.
        MockProvider::text_response(
            "[INTENT_GATE] {\"complexity\":\"knowledge\",\"can_answer_now\":true,\"needs_tools\":false}",
        ),
        // Turn 2: routing call asks for tools, then a tool call, then the answer.
        MockProvider::text_response(
            "[INTENT_GATE] {\"complexity\":\"simple\",\"can_answer_now\":false,\"needs_tools\":true}",
        ),
        MockProvider::tool_call_response("system_info", "{}"),
        MockProvider::text_response("Found the Spanish resume."),
    ];
    let harness = setup_test_agent(MockProvider::with_responses(scripted))
        .await
        .unwrap();

    // Run both turns in the same session; only the recorded LLM calls are inspected.
    for request in [OLD_REQUEST, CURRENT_REQUEST] {
        harness
            .agent
            .handle_message(
                SESSION,
                request,
                None,
                UserRole::Owner,
                ChannelContext::private("test"),
                None,
            )
            .await
            .unwrap();
    }

    let recorded = harness.provider.call_log.lock().await;
    assert!(
        recorded.len() >= 2,
        "Expected at least 2 LLM calls (one per turn), got {}",
        recorded.len()
    );

    // A call belongs to Turn 2 when the current request shows up in a user message.
    let mut turn2_calls = Vec::new();
    for call in recorded.iter() {
        let carries_current_request = call.messages.iter().any(|msg| {
            matches!(msg.get("role").and_then(|r| r.as_str()), Some("user"))
                && msg
                    .get("content")
                    .and_then(|c| c.as_str())
                    .is_some_and(|text| text.contains(CURRENT_REQUEST))
        });
        if carries_current_request {
            turn2_calls.push(call);
        }
    }
    assert!(
        !turn2_calls.is_empty(),
        "Expected at least one Turn 2 LLM call containing the current user request"
    );

    // Look for a counter-example: old user context present but no boundary marker.
    let unmarked_call = turn2_calls.iter().find(|call| {
        let carries_old_request = call.messages.iter().any(|msg| {
            matches!(msg.get("role").and_then(|r| r.as_str()), Some("user"))
                && msg
                    .get("content")
                    .and_then(|c| c.as_str())
                    .is_some_and(|text| text.contains(OLD_REQUEST))
        });
        let carries_boundary = call.messages.iter().any(|msg| {
            matches!(msg.get("role").and_then(|r| r.as_str()), Some("system"))
                && msg
                    .get("content")
                    .and_then(|c| c.as_str())
                    .is_some_and(|text| text.contains("[Current Task]"))
        });
        carries_old_request && !carries_boundary
    });
    assert!(
        unmarked_call.is_none(),
        "All Turn 2 LLM calls must have [Current Task] when old user context is present"
    );
}

/// A file-upload request that follows an unrelated turn must reach the model
/// together with a task boundary marker.
///
/// The mock queue scripts three replies: the Turn 1 answer, one reserved for
/// a compaction call (the upload is expected to trigger compaction), and the
/// Turn 2 answer. The assertions inspect the last recorded LLM call only:
/// it must contain the uploaded-file user message, and some message in it
/// must contain `[Current Task]`.
#[tokio::test]
async fn test_uploaded_artifact_request_has_task_boundary() {
    const SESSION: &str = "artifact_bleed_test";
    const CHANNEL: &str = "telegram";

    let scripted = vec![
        // Reply to Turn 1.
        MockProvider::text_response(
            "Would you like me to get more detailed information for any specific trial(s)?",
        ),
        // Reply consumed by the compaction LLM call.
        MockProvider::text_response("Summary of prior conversation."),
        // Reply to Turn 2.
        MockProvider::text_response("I reviewed the uploaded document and identified the issue."),
    ];
    let harness = setup_test_agent(MockProvider::with_responses(scripted))
        .await
        .unwrap();

    // Turn 1: unrelated prior conversation.
    harness
        .agent
        .handle_message(
            SESSION,
            "These are the NCT trial numbers: NCT06737964 and NCT06737965.",
            None,
            UserRole::Owner,
            ChannelContext::private(CHANNEL),
            None,
        )
        .await
        .unwrap();

    // Turn 2: upload notice followed by an instruction with no reference to Turn 1.
    let artifact_request = "[File received: 68235.png (413 KB, image/png)\nSaved to: /Users/davidloor/projects/aidaemon/.aidaemon/files/inbox/694c3943_68235.png]\nCheck the doc and fix the issue.";
    harness
        .agent
        .handle_message(
            SESSION,
            artifact_request,
            None,
            UserRole::Owner,
            ChannelContext::private(CHANNEL),
            None,
        )
        .await
        .unwrap();

    let recorded = harness.provider.call_log.lock().await;
    let final_call = recorded.last().expect("turn 2 call");

    // The upload notice has to be visible as a user message.
    let upload_visible = final_call.messages.iter().any(|msg| {
        matches!(msg.get("role").and_then(|r| r.as_str()), Some("user"))
            && msg
                .get("content")
                .and_then(|c| c.as_str())
                .is_some_and(|text| text.contains("[File received: 68235.png"))
    });
    assert!(
        upload_visible,
        "Turn 2 should include the uploaded-file context"
    );

    // The boundary marker may sit in a message of any role.
    let boundary_present = final_call.messages.iter().any(|msg| {
        msg.get("content")
            .and_then(|c| c.as_str())
            .is_some_and(|text| text.contains("[Current Task]"))
    });
    assert!(
        boundary_present,
        "Turn 2 should have a task boundary marker: {:?}",
        final_call.messages
    );
}

/// Regression: once a tool has produced output for the current task, a
/// generic "what can I help with?" reply must not be returned as the answer.
///
/// The mock first issues a `system_info` tool call and then the idle prompt.
/// The test checks that the final reply omits the idle prompt and mentions
/// tool evidence, and that the second recorded LLM call contains a system
/// message with `EXECUTION CHECKPOINT` and the start of the user request.
#[tokio::test]
async fn test_idle_reengagement_reply_after_tool_progress_is_recovered() {
    let scripted = vec![
        MockProvider::tool_call_response("system_info", "{}"),
        MockProvider::text_response("I'm here. What would you like me to help you with?"),
    ];
    let harness = setup_test_agent(MockProvider::with_responses(scripted))
        .await
        .unwrap();

    let reply = harness
        .agent
        .handle_message(
            "idle_reengagement_recovery",
            "Check the system details and tell me what machine this is.",
            None,
            UserRole::Owner,
            ChannelContext::private("test"),
            None,
        )
        .await
        .unwrap();

    let echoes_idle_prompt = reply.contains("What would you like me to help you with");
    assert!(
        !echoes_idle_prompt,
        "generic idle re-engagement reply should not be returned after tool progress: {}",
        reply
    );

    // Either phrase counts as evidence that the reply was built from tool output.
    let cites_tool_evidence = ["latest tool output", "Date:"]
        .iter()
        .any(|needle| reply.contains(*needle));
    assert!(
        cites_tool_evidence,
        "final reply should recover from concrete tool evidence: {}",
        reply
    );

    let recorded = harness.provider.call_log.lock().await;
    assert!(
        recorded.len() >= 2,
        "expected at least two LLM calls, got {}",
        recorded.len()
    );

    // The call made after the tool ran must carry the checkpoint system message.
    let follow_up_call = &recorded[1];
    let checkpoint_present = follow_up_call.messages.iter().any(|msg| {
        let is_system = matches!(msg.get("role").and_then(|r| r.as_str()), Some("system"));
        let text = msg.get("content").and_then(|c| c.as_str());
        is_system
            && text.is_some_and(|body| {
                body.contains("EXECUTION CHECKPOINT") && body.contains("Check the system details")
            })
    });
    assert!(
        checkpoint_present,
        "second LLM call should include the execution checkpoint"
    );
}

// ==================== Orchestrator Tool Presence Regression Tests ====================

/// The very first LLM call made by an orchestrator-mode agent must already
/// advertise tools.
///
/// The test only inspects the first recorded call; the remaining scripted
/// replies exist so the agent can finish the turn.
#[tokio::test]
async fn test_orchestrator_first_call_has_tools() {
    // Per the routing design this test targets (default+fallback), there is no
    // text-only pre-pass: tools are expected on the first call even at depth 0.
    let scripted = vec![
        // Iteration 1 (tools available): plain text reply.
        MockProvider::text_response("I'll check that for you."),
        // Execution loop: tool call.
        MockProvider::tool_call_response("system_info", "{}"),
        // Execution loop: final reply.
        MockProvider::text_response("System is running macOS."),
    ];
    let harness = setup_test_agent_orchestrator(MockProvider::with_responses(scripted))
        .await
        .unwrap();

    harness
        .agent
        .handle_message(
            "test_session",
            "Show me the system information",
            None,
            UserRole::Owner,
            ChannelContext::private("test"),
            None,
        )
        .await
        .unwrap();

    let recorded = harness.provider.call_log.lock().await;
    let Some(first_call) = recorded.first() else {
        panic!("Expected at least 1 LLM call");
    };

    // Tools must not be stripped from the first iteration.
    let tools_offered = !first_call.tools.is_empty();
    assert!(
        tools_offered,
        "First LLM call must have tools present, got 0 tools"
    );
}

/// A tool call returned by the first orchestrator LLM call must be executed,
/// not discarded.
///
/// The mock's first reply carries both text and a `system_info` tool call;
/// the second reply is the final answer. The test asserts that the final
/// answer is what `handle_message` returns, that at least two LLM calls were
/// recorded, and that the first of them had tools attached.
#[tokio::test]
async fn test_orchestrator_executes_tool_calls_in_first_iteration() {
    // With default+fallback routing, tools are always present. If the LLM
    // returns a tool call in iteration 1, it is executed (not dropped).
    // Previously, the first routing pass had no tools and tool calls were
    // considered "hallucinated" and dropped. Now they are legitimate.
    use crate::traits::ToolCall;

    let provider = MockProvider::with_responses(vec![
        // Iteration 1 (tools present): LLM returns text + tool call
        ProviderResponse {
            content: Some("I'll look into the system details.".to_string()),
            tool_calls: vec![ToolCall {
                id: "call_system_info".to_string(),
                name: "system_info".to_string(),
                arguments: "{}".to_string(),
                extra_content: None,
            }],
            usage: Some(crate::traits::TokenUsage {
                input_tokens: 100,
                output_tokens: 50,
                model: "mock".to_string(),
            }),
            thinking: None,
            response_note: None,
        },
        // After tool execution: final text response
        MockProvider::text_response("System is running macOS."),
    ]);

    let harness = setup_test_agent_orchestrator(provider).await.unwrap();

    let response = harness
        .agent
        .handle_message(
            "test_session",
            "Check the system information now",
            None,
            UserRole::Owner,
            ChannelContext::private("test"),
            None,
        )
        .await
        .unwrap();

    // The tool call from iteration 1 is executed, and the final response
    // comes from the subsequent LLM call after tool execution.
    assert_eq!(response, "System is running macOS.");

    let calls = harness.provider.call_log.lock().await;
    // Check the call count BEFORE indexing: a short or empty log should fail
    // with this descriptive message rather than an index-out-of-bounds panic.
    // At least 2 calls: initial (with tool call) + post-execution.
    assert!(
        calls.len() >= 2,
        "Expected at least 2 LLM calls (tool call + final), got {}",
        calls.len()
    );
    // First call has tools (no tool-stripping anymore)
    assert!(
        !calls[0].tools.is_empty(),
        "First LLM call must have tools present"
    );
}

/// Knowledge-only flow in orchestrator mode: the first reply emits an
/// `[INTENT_GATE]` block with `can_answer_now=true`, the second reply is the
/// user-visible answer, and no tool call is scripted.
///
/// The test asserts the returned answer, that exactly two LLM calls were
/// made, and that tools were nevertheless attached to the first call.
#[tokio::test]
async fn test_orchestrator_knowledge_flow() {
    let scripted = vec![
        // Classifier output: answerable from memory, no tools needed.
        MockProvider::text_response(
            "I can answer this from memory.\n[INTENT_GATE]\n{\"complexity\": \"knowledge\", \"can_answer_now\": true, \"needs_tools\": false}",
        ),
        // The answer itself.
        MockProvider::text_response("The capital of France is Paris."),
    ];
    let harness = setup_test_agent_orchestrator(MockProvider::with_responses(scripted))
        .await
        .unwrap();

    let answer = harness
        .agent
        .handle_message(
            "test_session",
            "What is the capital of France?",
            None,
            UserRole::Owner,
            ChannelContext::private("test"),
            None,
        )
        .await
        .unwrap();
    assert_eq!(answer, "The capital of France is Paris.");

    // Read the counter before taking the log lock below.
    let total_calls = harness.provider.call_count().await;
    assert_eq!(total_calls, 2, "Expected intent gate classifier + executor answer");

    // The model chose not to use tools, but they must still have been offered.
    let recorded = harness.provider.call_log.lock().await;
    let first_call = &recorded[0];
    assert!(
        !first_call.tools.is_empty(),
        "First LLM call should have tools present (default+fallback routing)"
    );
}

/// Contrast case for the orchestrator tests: an agent built with
/// `setup_test_agent` must also see tools on its first LLM call.
#[tokio::test]
async fn test_executor_mode_retains_tools() {
    let scripted = vec![
        MockProvider::tool_call_response("system_info", "{}"),
        MockProvider::text_response("System info retrieved."),
    ];

    // Per the original author's note, setup_test_agent calls
    // set_test_executor_mode() (depth=1, Executor role); that helper is not
    // visible from this test, so treat the note as an assumption.
    let harness = setup_test_agent(MockProvider::with_responses(scripted))
        .await
        .unwrap();

    harness
        .agent
        .handle_message(
            "test_session",
            "Show me the system information",
            None,
            UserRole::Owner,
            ChannelContext::private("test"),
            None,
        )
        .await
        .unwrap();

    let recorded = harness.provider.call_log.lock().await;
    let first_call = &recorded[0];
    let tools_offered = !first_call.tools.is_empty();
    assert!(
        tools_offered,
        "Executor mode must have tools available in LLM calls"
    );
}

/// Context-bleed guard: verbose tool output from an earlier turn must not be
/// replayed in full when the user moves on to another topic.
///
/// Turn 1 triggers a `system_info` tool call; Turn 2 asks an unrelated
/// question. In the last recorded LLM call, every `tool` message must have
/// content shorter than 200 bytes, and at least two `user` messages must be
/// present. If no `tool` messages are present, the size check passes
/// vacuously.
#[tokio::test]
async fn test_old_tool_intermediates_collapsed_in_follow_up() {
    const SESSION: &str = "collapse_test";

    let scripted = vec![
        // Turn 1: tool call followed by the final reply.
        MockProvider::tool_call_response("system_info", "{}"),
        MockProvider::text_response("Your system has 16GB RAM and an M1 chip."),
        // Turn 2: direct reply on a different topic.
        MockProvider::text_response("Bella is your cat."),
    ];
    let harness = setup_test_agent(MockProvider::with_responses(scripted))
        .await
        .unwrap();

    // Turn 1 exercises the tool path.
    let first_reply = harness
        .agent
        .handle_message(
            SESSION,
            "What system info do I have?",
            None,
            UserRole::Owner,
            ChannelContext::private("telegram"),
            None,
        )
        .await
        .unwrap();
    assert_eq!(first_reply, "Your system has 16GB RAM and an M1 chip.");

    // Turn 2 changes topic; Turn 1's tool intermediates should not dominate its context.
    let second_reply = harness
        .agent
        .handle_message(
            SESSION,
            "Who is bella?",
            None,
            UserRole::Owner,
            ChannelContext::private("telegram"),
            None,
        )
        .await
        .unwrap();
    assert_eq!(second_reply, "Bella is your cat.");

    let recorded = harness.provider.call_log.lock().await;
    let final_messages = &recorded.last().unwrap().messages;

    // Any tool result still present is expected to be a compact one-line
    // summary (tool_name: args -> outcome) rather than the raw output.
    let tool_contents = final_messages
        .iter()
        .filter(|msg| matches!(msg.get("role").and_then(|r| r.as_str()), Some("tool")))
        .map(|msg| msg.get("content").and_then(|c| c.as_str()).unwrap_or(""));
    for content in tool_contents {
        assert!(
            content.len() < 200,
            "Prior 1 tool result should be summarized (compact), got: {}",
            content
        );
    }

    // Both turns' user messages must still be visible to the model.
    let user_message_count = final_messages
        .iter()
        .filter(|msg| matches!(msg.get("role").and_then(|r| r.as_str()), Some("user")))
        .count();
    assert!(
        user_message_count >= 2,
        "Turn 2 should include user messages from both turns, found {}",
        user_message_count
    );
}

/// Regression: when the final LLM response is empty after tool calls, a
/// synthesized completion message is returned to the user. Before the fix it
/// was NOT saved to the DB, causing the next interaction's history to merge
/// the two user messages (missing assistant in between) and bleeding context.
///
/// The test runs two turns in orchestrator mode and inspects the first LLM
/// call of Turn 2 (index 3 in the call log): it must contain at least two
/// separate user messages and an assistant message that starts with one of
/// the accepted completion prefixes.
#[tokio::test]
async fn test_synthesized_done_persisted() {
    // Prefixes accepted for Turn 1's user-visible completion message:
    // either the "Done" synthesis or one of the tool-output recovery openers.
    const COMPLETION_PREFIXES: [&str; 4] = [
        "Done",
        "Here is the latest tool output",
        "Here's the command output",
        "Here are the results",
    ];
    // Turn 1 is scripted to consume three LLM calls, so Turn 2 starts here.
    const TURN2_FIRST_CALL: usize = 3;

    // At depth=0 (orchestrator), iteration 1 is the first routing call.
    // The mock tool_call_response triggers hallucinated-tool detection which
    // forces needs_tools=true → Simple intent → tools loaded → loop continues.
    let provider = MockProvider::with_responses(vec![
        // Turn 1, iteration 1 (first routing call): tool_call forces needs_tools=true
        MockProvider::tool_call_response("system_info", "{}"),
        // Turn 1, iteration 2 (tools available): tool call is executed
        MockProvider::tool_call_response("system_info", "{}"),
        // Turn 1, iteration 3: empty response → "Done" synthesis at depth=0
        MockProvider::text_response(""),
        // Turn 2, iteration 1 (first routing call): classifier output
        MockProvider::text_response(
            "I can answer this from memory.\n[INTENT_GATE] {\"complexity\":\"knowledge\",\"can_answer_now\":true,\"needs_tools\":false}",
        ),
        // Turn 2, iteration 2 (execution): final user-visible answer
        MockProvider::text_response("Weather is sunny."),
    ]);

    let mut harness = setup_test_agent(provider).await.unwrap();
    // Reset to depth=0 so orchestrator mode + "Done" synthesis fires
    harness.agent.set_test_orchestrator_mode();

    // Turn 1: should trigger completion recovery (tool output or Done synthesis)
    let r1 = harness
        .agent
        .handle_message(
            "done_persist_test",
            "Check my system info",
            None,
            UserRole::Owner,
            ChannelContext::private("test"),
            None,
        )
        .await
        .unwrap();
    // After tool execution with an empty final response, the agent either
    // recovers from the latest tool output or synthesizes "Done".
    assert!(
        COMPLETION_PREFIXES
            .iter()
            .any(|prefix| r1.starts_with(*prefix)),
        "Expected Done synthesis or tool output recovery, got: {}",
        r1
    );

    // Turn 2: different topic
    let r2 = harness
        .agent
        .handle_message(
            "done_persist_test",
            "Tell me the weather",
            None,
            UserRole::Owner,
            ChannelContext::private("test"),
            None,
        )
        .await
        .unwrap();
    assert!(!r2.is_empty(), "Turn 2 should produce a non-empty response");

    let call_log = harness.provider.call_log.lock().await;
    // Guard the index below: if Turn 1 used fewer calls than scripted, fail
    // with a readable message instead of an index-out-of-bounds panic.
    assert!(
        call_log.len() > TURN2_FIRST_CALL,
        "Expected at least {} LLM calls so Turn 2's first call can be inspected, got {}",
        TURN2_FIRST_CALL + 1,
        call_log.len()
    );
    let turn2_call = &call_log[TURN2_FIRST_CALL];

    // Verify: Turn 2's first LLM call should have >= 2 separate user messages (not merged)
    let user_msgs: Vec<&serde_json::Value> = turn2_call
        .messages
        .iter()
        .filter(|m| m.get("role").and_then(|r| r.as_str()) == Some("user"))
        .collect();
    assert!(
        user_msgs.len() >= 2,
        "Turn 2 should have at least 2 separate user messages (not merged), found {}",
        user_msgs.len()
    );

    // Verify: there should be a completion assistant message in the history.
    // The persisted form is matched slightly more loosely than the returned
    // reply: any "Here's…" opener is accepted in addition to the shared list.
    let completion_assistant = turn2_call.messages.iter().any(|m| {
        m.get("role").and_then(|r| r.as_str()) == Some("assistant")
            && m.get("content")
                .and_then(|c| c.as_str())
                .is_some_and(|s| {
                    s.starts_with("Here's")
                        || COMPLETION_PREFIXES
                            .iter()
                            .any(|prefix| s.starts_with(*prefix))
                })
    });
    assert!(
        completion_assistant,
        "Turn 2's history should contain the persisted completion assistant message from Turn 1"
    );
}

/// Regression: long assistant replies from older turns are shortened in later
/// context, while the reply from the turn immediately before the current one
/// is kept in full.
///
/// Three turns run in one session. Turn 1 replies with 500 `B`s, Turn 2 with
/// 500 `A`s, Turn 3 with a short answer. In the last recorded LLM call:
/// - some assistant message must start with `B`, end with `…` and be shorter
///   than 500 bytes (Turn 1, truncated);
/// - some assistant message must start with `A` and be exactly 500 bytes
///   (Turn 2, preserved);
/// - every truncated `B…` message must be at most 210 bytes.
#[tokio::test]
async fn test_old_interaction_assistant_content_truncated() {
    let filler_turn1 = "B".repeat(500);
    let filler_turn2 = "A".repeat(500);

    let scripted = vec![
        // Turn 1 and Turn 2 each produce a long reply.
        MockProvider::text_response(&filler_turn1),
        MockProvider::text_response(&filler_turn2),
        // Turn 3 produces a short reply.
        MockProvider::text_response("Short answer."),
    ];
    let harness = setup_test_agent(MockProvider::with_responses(scripted))
        .await
        .unwrap();

    // (user question, reply the mock is scripted to give for that turn)
    let turns = [
        ("First question?", filler_turn1.as_str()),
        ("Second question?", filler_turn2.as_str()),
        ("Also third question?", "Short answer."),
    ];
    for (question, expected_reply) in turns {
        let reply = harness
            .agent
            .handle_message(
                "truncate_test",
                question,
                None,
                UserRole::Owner,
                ChannelContext::private("test"),
                None,
            )
            .await
            .unwrap();
        assert_eq!(reply, expected_reply);
    }

    // Gather the text of every assistant message sent with Turn 3's LLM call.
    let recorded = harness.provider.call_log.lock().await;
    let assistant_contents: Vec<&str> = recorded
        .last()
        .unwrap()
        .messages
        .iter()
        .filter(|msg| matches!(msg.get("role").and_then(|r| r.as_str()), Some("assistant")))
        .filter_map(|msg| msg.get("content").and_then(|c| c.as_str()))
        .collect();

    // Turn 1 is two turns back, so its reply should have been cut down.
    let turn1_truncated = assistant_contents
        .iter()
        .any(|text| text.starts_with('B') && text.ends_with('…') && text.len() < 500);
    assert!(
        turn1_truncated,
        "Turn 1's long assistant response should be truncated in Turn 3's context"
    );

    // Turn 2 is the immediately-prior turn, so its reply should be intact.
    let turn2_intact = assistant_contents
        .iter()
        .any(|text| text.starts_with('A') && text.len() == 500);
    assert!(
        turn2_intact,
        "Turn 2's assistant response (immediately prior) should be preserved untruncated"
    );

    // Size bound on the truncated form: 200 chars plus "…" (3 bytes) is about
    // 203 bytes; 210 leaves a little slack.
    let truncated_turn1_replies = assistant_contents
        .iter()
        .copied()
        .filter(|text| text.starts_with('B') && text.ends_with('…'));
    for text in truncated_turn1_replies {
        assert!(
            text.len() <= 210,
            "Truncated content should be ~203 chars max, got {} chars: {}...",
            text.len(),
            &text[..50.min(text.len())]
        );
    }
}

/// Short assistant responses from old turns should be passed through unmodified
/// (no marker text appended, since LLMs tend to echo markers back).
///
/// Turn 1 replies "It is 4."; Turn 2 asks something unrelated. In the last
/// recorded LLM call, the assistant messages that mention Turn 1's reply are
/// collected by substring, and then checked for the absence of `[prior turn]`
/// and for an exact, unmodified copy. Selecting by substring (rather than by
/// exact equality) is what lets the marker and equality assertions actually
/// fail when the content has been decorated.
#[tokio::test]
async fn test_old_short_assistant_response_preserved_unmodified() {
    const TURN1_REPLY: &str = "It is 4.";

    let provider = MockProvider::with_responses(vec![
        // Turn 1: short direct answer
        MockProvider::text_response(TURN1_REPLY),
        // Turn 2: different topic
        MockProvider::text_response("Rust 1.82.0"),
    ]);

    let harness = setup_test_agent(provider).await.unwrap();

    // Turn 1
    let _ = harness
        .agent
        .handle_message(
            "prior_turn_no_marker",
            "What is 2 + 2?",
            None,
            UserRole::Owner,
            ChannelContext::private("test"),
            None,
        )
        .await
        .unwrap();

    // Turn 2: completely different topic
    let _ = harness
        .agent
        .handle_message(
            "prior_turn_no_marker",
            "What version of Rust is installed?",
            None,
            UserRole::Owner,
            ChannelContext::private("test"),
            None,
        )
        .await
        .unwrap();

    // Collect every assistant message in Turn 2's context that carries Turn 1's
    // reply, whether or not something was added around it.
    let call_log = harness.provider.call_log.lock().await;
    let turn2_call = call_log.last().unwrap();
    let carried_over: Vec<&str> = turn2_call
        .messages
        .iter()
        .filter(|m| m.get("role").and_then(|r| r.as_str()) == Some("assistant"))
        .filter_map(|m| m.get("content").and_then(|c| c.as_str()))
        .filter(|content| content.contains(TURN1_REPLY))
        .collect();

    assert!(
        !carried_over.is_empty(),
        "Turn 1's assistant response should be present in Turn 2's context"
    );

    // No copy of the old reply may carry the marker.
    for content in &carried_over {
        assert!(
            !content.contains("[prior turn]"),
            "Old assistant responses should NOT have [prior turn] marker (causes LLM echoing). Got: {}",
            content
        );
    }

    // Content should be exactly what the LLM returned — nothing added or removed.
    assert!(
        carried_over.iter().any(|content| *content == TURN1_REPLY),
        "Short old assistant content should be preserved unmodified, got: {:?}",
        carried_over
    );
}

// ==================== Compaction Integration Tests ====================

/// When a session exceeds the sliding window size, compaction should fire
/// and produce a summary in the DB. Subsequent turns should see the
/// [Session Summary] in their LLM context.
///
/// The test runs seven turns, waits for background work, and checks that a
/// non-empty conversation summary is stored for the session. It then runs an
/// eighth turn and checks that a recent LLM call carries `[Session Summary]`
/// in a system message and that a call carrying Turn 8's user request also
/// carries `[Current Task]`.
#[tokio::test]
async fn test_compaction_fires_on_window_overflow() {
    const SESSION: &str = "compaction_test";
    const TURN8_REQUEST: &str = "Question 8 about topic 8";

    // The default ContextWindowConfig has enabled=true, summary_window=6.
    // We need 7+ turns to exceed the window. Each turn consumes 1 mock response.
    // Window overflow compaction runs asynchronously, so we provide extra
    // responses (compaction calls also consume from the mock queue).
    // Turns 7+ each trigger async compaction: 1 extra response per turn.
    let mut responses = Vec::new();
    // Turns 1-6: 1 response each (no compaction triggered)
    for i in 1..=6 {
        responses.push(MockProvider::text_response(&format!("Response {}", i)));
    }
    // Turn 7: compaction fires (async) + main response = 2 responses
    responses.push(MockProvider::text_response("Mock response"));
    responses.push(MockProvider::text_response("Response 7"));
    // Turn 8: compaction fires again (async, incremental) + main response = 2 responses
    responses.push(MockProvider::text_response("Mock response"));
    responses.push(MockProvider::text_response("Response 8"));
    // Extra safety margin for any additional LLM calls
    for _ in 0..4 {
        responses.push(MockProvider::text_response("Mock response"));
    }

    let provider = MockProvider::with_responses(responses);
    let harness = setup_test_agent(provider).await.unwrap();

    // Run 7 turns to trigger window overflow compaction.
    for i in 1..=7 {
        let _ = harness
            .agent
            .handle_message(
                SESSION,
                &format!("Question {} about topic {}", i, i),
                None,
                UserRole::Owner,
                ChannelContext::private("test"),
                None,
            )
            .await
            .unwrap();
    }

    // Allow the async compaction task to complete.
    tokio::time::sleep(std::time::Duration::from_millis(1000)).await;

    // Verify: summary should exist in DB after window overflow.
    let summary = harness
        .state
        .get_conversation_summary(SESSION)
        .await
        .unwrap();
    assert!(
        summary.is_some(),
        "Compaction summary should exist in DB after window overflow"
    );
    let summary = summary.unwrap();
    assert!(
        !summary.summary.is_empty(),
        "Compaction summary should not be empty"
    );

    // Turn 8: the summary should be injected into LLM context.
    let _ = harness
        .agent
        .handle_message(
            SESSION,
            TURN8_REQUEST,
            None,
            UserRole::Owner,
            ChannelContext::private("test"),
            None,
        )
        .await
        .unwrap();

    // Allow Turn 8's async compaction to settle. The previous 200ms was
    // tight enough to flake under coverage instrumentation; matching the
    // 1000ms pattern used above the prior assertion makes the test
    // resilient to slower runners.
    tokio::time::sleep(std::time::Duration::from_millis(1000)).await;

    // Verify: Turn 8's LLM call should include [Session Summary]. Turn 8
    // can generate multiple LLM calls (an async incremental compaction plus
    // the main response) and their order on the call_log is timing-
    // dependent. Scan the calls produced during Turn 8 rather than relying
    // on `last()` so the assertion checks the message-building path
    // regardless of which call landed last.
    let call_log = harness.provider.call_log.lock().await;
    assert!(
        call_log.len() >= 8,
        "expected at least one call per turn; got {}",
        call_log.len()
    );
    // Turn 8's calls are at the tail of the log. The exact count is
    // implementation-dependent (compaction may add 0 or 1 calls), so we
    // scan the last 4 calls — more than enough to cover any plausible
    // mix and still bounded so we don't match earlier turns.
    let tail_start = call_log.len().saturating_sub(4);
    let has_summary = call_log[tail_start..].iter().any(|call| {
        call.messages.iter().any(|m| {
            m.get("role").and_then(|r| r.as_str()) == Some("system")
                && m.get("content")
                    .and_then(|c| c.as_str())
                    .is_some_and(|s| s.contains("[Session Summary]"))
        })
    });
    assert!(
        has_summary,
        "Turn 8's LLM context should include [Session Summary] from compaction"
    );

    // Verify: Turn 8 should have a [Current Task] boundary marker. For the
    // same ordering reason as above, do not trust `last()`: pick out Turn 8's
    // calls by the presence of its user request (that text occurs in no
    // earlier turn) and look for the marker there.
    let turn8_calls: Vec<_> = call_log
        .iter()
        .filter(|call| {
            call.messages.iter().any(|m| {
                m.get("role").and_then(|r| r.as_str()) == Some("user")
                    && m.get("content")
                        .and_then(|c| c.as_str())
                        .is_some_and(|s| s.contains(TURN8_REQUEST))
            })
        })
        .collect();
    assert!(
        !turn8_calls.is_empty(),
        "Expected at least one LLM call carrying Turn 8's user request"
    );
    // `any` rather than `all`: a background call that happens to quote the
    // request is not required to carry the marker.
    let has_boundary = turn8_calls.iter().any(|call| {
        call.messages.iter().any(|m| {
            m.get("content")
                .and_then(|c| c.as_str())
                .is_some_and(|s| s.contains("[Current Task]"))
        })
    });
    assert!(
        has_boundary,
        "Turn 8's LLM context should include [Current Task] boundary marker"
    );
}

/// Regression: every message persisted while `handle_message` runs must be
/// tagged with a `turn_id`, so that boundary detection can group a turn's
/// messages deterministically.
///
/// Prior to turn_id, the boundary was inferred by matching `user_text` against
/// message content. That approach had a known race: if identical text was sent
/// twice in one session, `rposition` could land on the older instance and
/// treat an unrelated tool chain as the "current interaction." With turn_id
/// the boundary is a plain lookup, so duplicate text cannot confuse it.
#[tokio::test]
async fn test_turn_id_groups_messages_within_a_turn() {
    // Scripted provider output, consumed in order: each of the two turns is
    // one tool call followed by the final text response.
    let scripted = vec![
        MockProvider::tool_call_response("system_info", "{}"),
        MockProvider::text_response("Done turn 1"),
        MockProvider::tool_call_response("system_info", "{}"),
        MockProvider::text_response("Done turn 2"),
    ];
    let harness = setup_test_agent(MockProvider::with_responses(scripted))
        .await
        .unwrap();

    // Drive both turns through the same session, one after the other.
    for request in ["First request", "Second request"] {
        let _ = harness
            .agent
            .handle_message(
                "turn_id_test",
                request,
                None,
                UserRole::Owner,
                ChannelContext::private("test"),
                None,
            )
            .await
            .unwrap();
    }

    // Read back what was persisted for the session. Messages written via
    // `append_message_canonical` during a turn are expected to be stamped.
    let history = harness
        .state
        .get_history("turn_id_test", 100)
        .await
        .unwrap();

    // Sanity check first: at least something in the history is stamped.
    let any_stamped = history.iter().any(|m| m.turn_id.is_some());
    assert!(
        any_stamped,
        "expected some messages to carry a turn_id, got {} messages with none",
        history.len()
    );

    // Invariant on user messages: the turn_id is the user message's own id
    // (per the bootstrap convention, the user message anchors the turn).
    let user_turns: Vec<_> = history.iter().filter(|m| m.role == "user").collect();
    assert_eq!(
        user_turns.len(),
        2,
        "expected 2 user messages, got {}",
        user_turns.len()
    );
    for anchor in user_turns.iter() {
        assert_eq!(
            anchor.turn_id.as_deref(),
            Some(anchor.id.as_str()),
            "user message turn_id should equal its own id; got msg id={} turn_id={:?}",
            anchor.id,
            anchor.turn_id
        );
    }

    // Separate turns must never share a turn_id.
    let (first_turn, second_turn) = (user_turns[0], user_turns[1]);
    assert_ne!(
        first_turn.turn_id, second_turn.turn_id,
        "two distinct user turns must have distinct turn_ids"
    );

    // Exact grouping is deliberately not asserted (tool result placement can
    // vary by code path). The requirement is only that each turn_id shows up
    // on at least one assistant/tool message.
    let stamped_non_user = |turn: &str| {
        history
            .iter()
            .filter(|m| m.role != "user")
            .filter(|m| m.turn_id.as_deref() == Some(turn))
            .count()
    };
    let turn1_hits = stamped_non_user(first_turn.turn_id.as_deref().unwrap());
    let turn2_hits = stamped_non_user(second_turn.turn_id.as_deref().unwrap());
    assert!(
        turn1_hits > 0,
        "Turn 1 should have at least one non-user message stamped with its turn_id"
    );
    assert!(
        turn2_hits > 0,
        "Turn 2 should have at least one non-user message stamped with its turn_id"
    );
}