roboticus-api 0.11.3

//! Behavioral guards: SubagentClaim, TaskDeferral, InternalJargon, Perspective, DeclaredAction.

use std::collections::HashSet;
use std::sync::LazyLock;

use crate::api::routes::agent::guard_registry::requested_exact_bullet_count;
use crate::api::routes::agent::guard_registry::{
    Guard, GuardContext, GuardId, GuardVerdict, first_absolute_path, guard_context_is_task_like,
};
use crate::api::routes::agent::intent_registry::Intent;

/// Names of read-only (introspection) tools: they report state but perform no action.
///
/// `TaskDeferralGuard` consults this set: when every tool used in a task turn
/// is listed here and the reply then narrates a future action, the model
/// planned but did not act.
static INTROSPECTION_TOOLS: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
    HashSet::from([
        "get_memory_stats",
        "get_runtime_context",
        "get_subagent_status",
        "list-subagent-roster",
        "list-available-skills",
        "task-status",
        "list-open-tasks",
    ])
});

/// Names of the tools that hand work off to subagents.
pub(in crate::api::routes::agent) static DELEGATION_TOOLS: LazyLock<HashSet<&'static str>> =
    LazyLock::new(|| {
        HashSet::from([
            "delegate-subagent",
            "assign-subagent",
            "orchestrate-subagents",
            "compose-subagent",
        ])
    });

/// Names of the tools that create or update cron/scheduled jobs.
pub(in crate::api::routes::agent) static CRON_TOOLS: LazyLock<HashSet<&'static str>> =
    LazyLock::new(|| {
        HashSet::from([
            "schedule-cron",
            "create-cron-job",
            "update-cron-job",
            "cron-create",
        ])
    });

// ── 1. SubagentClaimGuard ────────────────────────────────────────────────

/// Guard that blocks replies which narrate delegation to a subagent when the
/// turn's delegation provenance does not back the claim.
pub(in crate::api::routes::agent) struct SubagentClaimGuard;

impl Guard for SubagentClaimGuard {
    fn id(&self) -> GuardId {
        GuardId::SubagentClaim
    }

    /// This guard is evaluated on every turn.
    fn is_relevant(&self, _ctx: &GuardContext) -> bool {
        true
    }

    /// Passes when the provenance records a started, completed subagent task
    /// with its result attached, or when `content` carries no delegation
    /// marker. Otherwise asks for a retry that answers the user directly.
    fn evaluate(&self, content: &str, ctx: &GuardContext) -> GuardVerdict {
        let provenance = ctx.delegation_provenance;
        let delegation_verified = provenance.subagent_task_started
            && provenance.subagent_task_completed
            && provenance.subagent_result_attached;
        if delegation_verified {
            return GuardVerdict::Pass;
        }
        if !claims_unverified_subagent_output(content, ctx) {
            return GuardVerdict::Pass;
        }

        tracing::warn!(
            "guard[SubagentClaim]: blocking narrated delegation — retrying in character"
        );
        let reason = format!(
            "You narrated your intent to delegate instead of responding to the user. \
             As {}, respond DIRECTLY to the user's message in your own voice. \
             Do not announce delegation, do not mention subagents or specialists. \
             If you need to delegate a mechanical lookup, do it silently via tool calls — \
             never tell the user about your internal routing.",
            ctx.agent_name
        );
        GuardVerdict::RetryRequested { reason }
    }
}

/// Returns `true` when `response` contains one of the hard-coded phrases that
/// narrate or claim subagent delegation.
///
/// Matching is a case-insensitive (ASCII) substring test against `MARKERS`.
///
/// The context parameter is currently unused. An earlier "short-turn
/// exemption" inspected delegation verbs and subagent names for responses
/// under 100 bytes, but it could only return `false` when no lexical marker
/// was present — the same answer the marker check already gives — so it never
/// affected the result and has been removed. The parameter is kept so the
/// signature and call sites stay unchanged.
fn claims_unverified_subagent_output(response: &str, _ctx: &GuardContext<'_>) -> bool {
    // Lexical markers are strong enough to flag on their own; semantic-only
    // matches (no lexical marker) are too noisy and are deliberately not flagged.
    const MARKERS: &[&str] = &[
        "let me delegate",
        "i'll delegate",
        "i will delegate",
        "delegating to",
        "delegate the task",
        "delegate this to",
        "i have a specialist",
        "i have a lore_keeper",
        "i have a combat_referee",
        "let me hand this off",
        "handing off to",
        "routing to specialist",
        "compose a specialist",
        "[delegating to subagent",
        "came directly from the running subagent",
        "came directly from a running subagent",
        "subagent status - live",
        "standing by for tasking",
        "taskable subagents operational",
        "subagent-generated",
    ];

    let lower = response.to_ascii_lowercase();
    MARKERS.iter().any(|marker| lower.contains(marker))
}

// ── 3. TaskDeferralGuard ─────────────────────────────────────────────────

/// Guard that catches task turns which only introspected state and then
/// narrated a future action instead of performing it.
pub(in crate::api::routes::agent) struct TaskDeferralGuard;

impl Guard for TaskDeferralGuard {
    fn id(&self) -> GuardId {
        GuardId::TaskDeferral
    }

    /// Relevant only for task-like turns that produced at least one tool result.
    fn is_relevant(&self, ctx: &GuardContext) -> bool {
        guard_context_is_task_like(ctx) && !ctx.tool_results.is_empty()
    }

    /// Passes unless every tool used is an introspection tool AND the
    /// `TASK_DEFERRAL` semantic score exceeds 0.7.
    ///
    /// When `get_runtime_context` was among the tools and the user prompt
    /// contains an absolute path, the reply is rewritten to a fixed "blocked"
    /// message (as a single bullet if the prompt asked for an exact bullet
    /// count). Otherwise a retry is requested.
    fn evaluate(&self, _content: &str, ctx: &GuardContext) -> GuardVerdict {
        // Any non-introspection tool means the model actually acted.
        let used_action_tool = ctx
            .tool_results
            .iter()
            .any(|(name, _)| !INTROSPECTION_TOOLS.contains(name.as_str()));
        if used_action_tool {
            return GuardVerdict::Pass;
        }

        // The TASK_DEFERRAL bank scores "narrated future action without
        // execution" independent of wording; a missing score counts as 0.0.
        let deferral_score = match ctx.semantic_guard_scores.get("TASK_DEFERRAL") {
            Some((score, _trust)) => *score,
            None => 0.0,
        };
        let looks_deferred = deferral_score > 0.7;
        if !looks_deferred {
            return GuardVerdict::Pass;
        }

        let runtime_introspected = ctx
            .tool_results
            .iter()
            .any(|(name, _)| name == "get_runtime_context");
        if runtime_introspected
            && let Some(path) = first_absolute_path(ctx.user_prompt)
        {
            let blocker = format!(
                "Blocked: {path} is outside my allowed runtime boundaries in this environment, so I cannot read it directly."
            );
            let rewritten = if requested_exact_bullet_count(ctx.user_prompt).is_some() {
                format!("- {blocker}")
            } else {
                blocker
            };
            return GuardVerdict::Rewritten(rewritten);
        }

        tracing::warn!(
            deferral_score,
            "guard[TaskDeferral]: introspection ended in narrated future action"
        );
        GuardVerdict::RetryRequested {
            reason: "task turn stopped at narrated next-step instead of taking action".into(),
        }
    }
}

// ── 8. InternalJargonGuard ───────────────────────────────────────────────

/// Guard that keeps internal infrastructure details (subagent names,
/// delegation machinery) out of user-facing replies.
pub(in crate::api::routes::agent) struct InternalJargonGuard;

impl Guard for InternalJargonGuard {
    fn id(&self) -> GuardId {
        GuardId::InternalJargon
    }

    /// Skipped for capability-summary, introspection and provider-inventory
    /// intents.
    fn is_relevant(&self, ctx: &GuardContext) -> bool {
        let exempt = ctx.has_intent(Intent::CapabilitySummary)
            || ctx.has_intent(Intent::Introspection)
            || ctx.has_intent(Intent::ProviderInventory);
        !exempt
    }

    /// Evaluates `content` in two stages:
    ///
    /// 1. Requests a retry when the `NARRATED_DELEGATION` semantic score
    ///    exceeds 0.8, or when the reply names a subagent the user did not
    ///    mention first.
    /// 2. Otherwise strips lines that start with "centralized delegation" or
    ///    "delegation gate" and returns the cleaned text (or a fixed fallback
    ///    sentence when nothing is left). Passes if no line was stripped.
    fn evaluate(&self, content: &str, ctx: &GuardContext) -> GuardVerdict {
        let lower = content.to_ascii_lowercase();

        // ── Semantic check (primary) ────────────────────────────────
        // Use pre-computed semantic scores instead of hardcoded word lists.
        // NARRATED_DELEGATION fires when the model narrates its internal
        // delegation/infrastructure process to the user.
        let narrated_delegation_score = ctx
            .semantic_guard_scores
            .get("NARRATED_DELEGATION")
            .map(|(score, _trust)| *score)
            .unwrap_or(0.0);

        // Only flag subagent name mentions as leaks if the USER didn't
        // reference that subagent first. When the user explicitly asks about
        // a subagent (e.g. "have Sentinel run a scan"), mentioning it in the
        // response is expected, not a leak.
        //
        // Both haystacks are lowercased, so the name must be lowercased too;
        // otherwise a name containing any uppercase letter can never match
        // and the leak goes undetected.
        let user_lower = ctx.user_prompt.to_ascii_lowercase();
        let is_leaked = |name: &str| {
            let needle = name.to_ascii_lowercase();
            lower.contains(&needle) && !user_lower.contains(&needle)
        };
        let subagent_leak = ctx
            .subagent_names
            .iter()
            .any(|name| is_leaked(name.as_str()));

        tracing::debug!(
            narrated_delegation_score,
            subagent_leak,
            subagent_count = ctx.subagent_names.len(),
            matched_names = ?ctx.subagent_names.iter()
                .filter(|name| is_leaked(name.as_str()))
                .collect::<Vec<_>>(),
            content_len = lower.len(),
            "guard[InternalJargon]: evaluated"
        );

        // High-confidence semantic match: the model is narrating its internal
        // delegation infrastructure to the user (not the user's topic).
        if narrated_delegation_score > 0.8 || subagent_leak {
            tracing::warn!(
                narrated_delegation_score,
                subagent_leak,
                "guard[InternalJargon]: infrastructure leakage detected"
            );
            return GuardVerdict::RetryRequested {
                reason: "Your response exposed internal infrastructure details (subagent names, \
                         tool inventories, runtime state) to the user. Respond as your persona \
                         would — address the user's actual request without mentioning internal \
                         systems, tools, or subagents."
                    .into(),
            };
        }

        // Line-by-line stripping for less severe cases
        let mut kept = Vec::new();
        let mut removed = false;
        for line in content.lines() {
            let line_lower = line.trim().to_ascii_lowercase();
            let internal = line_lower.starts_with("centralized delegation")
                || line_lower.starts_with("delegation gate");
            if internal {
                removed = true;
                continue;
            }
            kept.push(line);
        }
        if !removed {
            return GuardVerdict::Pass;
        }
        let cleaned = kept.join("\n").trim().to_string();
        if cleaned.is_empty() {
            // Every line was internal; substitute a neutral placeholder reply.
            return GuardVerdict::Rewritten(format!(
                "{} here. I'll keep internals out of the reply and focus on actionable results.",
                ctx.agent_name
            ));
        }
        GuardVerdict::Rewritten(cleaned)
    }
}

// ── 12. PerspectiveGuard ─────────────────────────────────────────────────

/// Guard that rejects replies which narrate the user's actions or thoughts
/// in the first person.
pub(in crate::api::routes::agent) struct PerspectiveGuard;

impl Guard for PerspectiveGuard {
    fn id(&self) -> GuardId {
        GuardId::Perspective
    }

    /// This guard is evaluated on every turn.
    fn is_relevant(&self, _ctx: &GuardContext) -> bool {
        true
    }

    /// Passes for acknowledgement turns and for content without first-person
    /// user narration; otherwise requests a retry.
    fn evaluate(&self, content: &str, ctx: &GuardContext) -> GuardVerdict {
        let is_acknowledgement = ctx.intents.contains(&Intent::Acknowledgement);
        if is_acknowledgement || !has_first_person_user_narration(content) {
            return GuardVerdict::Pass;
        }

        tracing::warn!("guard[Perspective]: first-person narration of user detected");
        GuardVerdict::RetryRequested {
            reason: "Response narrates the user's actions or thoughts in first person. \
                     Describe the world's response or use second person ('you') instead. \
                     Do not assert the user's internal states."
                .into(),
        }
    }
}

/// Detect narration that speaks AS the user (not as the agent).
///
/// Two independent heuristics are applied, both ASCII-case-insensitive:
///
/// 1. **First-person action/state phrases** ("i draw ", "my sword ", …) are
///    looked for in the text *outside* double quotes. Lines that begin with a
///    straight or left curly double quote are skipped entirely. Two or more
///    distinct phrases must match. Each phrase must begin at a word boundary,
///    so "enemy blade" does not count as "my blade" and "samurai draw" does
///    not count as "i draw".
/// 2. **Player-character speech attributions** ("you say,", "you whisper", …)
///    are looked for in the full content, quoted text included. A single
///    match is enough.
///
/// Returns `true` when either heuristic fires.
pub(super) fn has_first_person_user_narration(content: &str) -> bool {
    const FIRST_PERSON_PATTERNS: &[&str] = &[
        "i glance ",
        "i look ",
        "i shift ",
        "i lean ",
        "i reach ",
        "i draw ",
        "i pull ",
        "i swing ",
        "i move ",
        "i walk ",
        "i run ",
        "i stand ",
        "i sit ",
        "i nod ",
        "i shake ",
        "i gesture ",
        "i turn ",
        "i feel ",
        "i think ",
        "i realize ",
        "i notice ",
        "i sense ",
        "i know ",
        "i understand ",
        "i decide ",
        "my fingers ",
        "my hand ",
        "my eyes ",
        "my heart ",
        "my gut ",
        "my sword ",
        "my blade ",
        "my armor ",
        "my weapon ",
    ];

    const PC_SPEECH_PATTERNS: &[&str] = &[
        "you say,",
        "you say.",
        "you say ",
        "you reply,",
        "you reply.",
        "you reply ",
        "you tell ",
        "you ask,",
        "you ask.",
        "you call out",
        "you whisper",
        "you shout",
        "you mutter",
        "you growl",
        "you announce",
        "you declare",
        "you respond",
        "you answer",
    ];

    // Collect the narration that sits outside double-quoted dialogue.
    let mut outside_quotes = Vec::new();
    for line in content.lines() {
        let trimmed = line.trim();
        // A line that opens with a quote is treated as pure dialogue.
        if trimmed.starts_with('"') || trimmed.starts_with('\u{201c}') {
            continue;
        }
        let mut in_quote = false;
        let mut segment = String::new();
        for ch in trimmed.chars() {
            // Straight and curly double quotes all toggle the quoted state.
            if ch == '"' || ch == '\u{201c}' || ch == '\u{201d}' {
                in_quote = !in_quote;
                continue;
            }
            if !in_quote {
                segment.push(ch);
            }
        }
        if !segment.is_empty() {
            outside_quotes.push(segment);
        }
    }

    let joined = outside_quotes.join(" ").to_ascii_lowercase();

    // Count distinct phrases (not occurrences); one alone is too weak a
    // signal, since the agent may legitimately say e.g. "I think".
    let first_person_matches = FIRST_PERSON_PATTERNS
        .iter()
        .filter(|&&pattern| occurs_at_word_start(&joined, pattern))
        .count();
    if first_person_matches >= 2 {
        return true;
    }

    let full_lower = content.to_ascii_lowercase();
    PC_SPEECH_PATTERNS
        .iter()
        .any(|pattern| full_lower.contains(pattern))
}

/// Returns `true` if `needle` occurs in `haystack` at a position that is
/// either the start of the string or preceded by a non-alphanumeric character.
fn occurs_at_word_start(haystack: &str, needle: &str) -> bool {
    haystack.match_indices(needle).any(|(start, _)| {
        !matches!(
            haystack[..start].chars().next_back(),
            Some(prev) if prev.is_alphanumeric()
        )
    })
}

// ── 14. DeclaredActionGuard ──────────────────────────────────────────────

/// Guard that makes sure a user-declared action ("attack the goblin") is
/// acknowledged or resolved by the reply.
pub(in crate::api::routes::agent) struct DeclaredActionGuard;

impl Guard for DeclaredActionGuard {
    fn id(&self) -> GuardId {
        GuardId::DeclaredAction
    }

    /// Relevant only when no tools ran and the user prompt parses as a
    /// declared action.
    fn is_relevant(&self, ctx: &GuardContext) -> bool {
        ctx.tool_results.is_empty() && detect_declared_action(ctx.user_prompt).is_some()
    }

    /// Passes when the reply (ASCII-lowercased) contains the declared verb,
    /// the declared target, or any resolution indicator; otherwise requests a
    /// retry that quotes the declared action back to the model.
    fn evaluate(&self, content: &str, ctx: &GuardContext) -> GuardVerdict {
        let Some((verb, target)) = detect_declared_action(ctx.user_prompt) else {
            return GuardVerdict::Pass;
        };

        let reply = content.to_ascii_lowercase();

        let mentions_verb = reply.contains(&verb.to_ascii_lowercase());
        let mentions_target = reply.contains(&target.to_ascii_lowercase());
        if mentions_verb || mentions_target {
            return GuardVerdict::Pass;
        }

        let shows_resolution = RESOLUTION_INDICATORS
            .iter()
            .any(|indicator| reply.contains(indicator));
        if shows_resolution {
            return GuardVerdict::Pass;
        }

        tracing::warn!(
            verb = %verb,
            target = %target,
            "guard[DeclaredAction]: user declared action but output doesn't resolve it"
        );
        let reason = format!(
            "The user declared an action: '{} {}'. Your response does not reference \
             or resolve this action. You must acknowledge the user's declared intent \
             and either resolve it (describe the attempt and outcome) or use your \
             out-of-character voice to surface consequences and ask for confirmation \
             before proceeding.",
            verb, target
        );
        GuardVerdict::RetryRequested { reason }
    }
}

/// Parse `input` as a declared action and return `(verb, target)`, both
/// ASCII-lowercased with whitespace runs collapsed to single spaces.
///
/// The input must have at least two words. The verb must be in
/// `ACTION_VERBS` and be either the first word or the word right after one
/// leading filler ("i", "i'll", "i'm", "please", "let", "let's", "now",
/// "then", "quickly", "just"; "let me" counts as one filler). Everything
/// after the verb is the target, which must be non-empty.
///
/// Returns `None` when the input does not fit this shape.
pub(in crate::api::routes::agent) fn detect_declared_action(
    input: &str,
) -> Option<(String, String)> {
    let normalized = input.to_ascii_lowercase();
    let tokens: Vec<&str> = normalized.split_whitespace().collect();
    if tokens.len() < 2 {
        return None;
    }

    // Bare imperative: "<verb> <target…>".
    let lead = tokens[0];
    if ACTION_VERBS.contains(&lead) {
        return Some((lead.to_string(), tokens[1..].join(" ")));
    }

    // Otherwise the first word must be a recognised filler.
    let is_filler = matches!(
        lead,
        "i" | "i'll" | "i'm" | "please" | "let" | "let's" | "now" | "then" | "quickly" | "just"
    );
    if !is_filler {
        return None;
    }

    // "let me <verb>" skips two words; every other filler skips one.
    let verb_index = if lead == "let" && tokens[1] == "me" {
        2
    } else {
        1
    };
    let verb = *tokens.get(verb_index)?;
    if !ACTION_VERBS.contains(&verb) {
        return None;
    }

    let target = tokens[verb_index + 1..].join(" ");
    if target.is_empty() {
        return None;
    }
    Some((verb.to_string(), target))
}

use roboticus_agent::task_state::ACTION_VERBS;

/// Lowercase substrings whose presence in a reply counts as resolving, or
/// asking to confirm, a user-declared action.
///
/// `DeclaredActionGuard` tests each entry with a plain substring match against
/// the ASCII-lowercased reply, so entries must stay lowercase. Note that
/// short entries also match inside longer words (e.g. "roll" in "scroll").
const RESOLUTION_INDICATORS: &[&str] = &[
    // Mechanics / outcome vocabulary.
    "roll",
    "d20",
    "dc ",
    "check",
    "succeed",
    "fail",
    "miss",
    "hit",
    "attempt",
    "try",
    "manage",
    "unable",
    "result",
    // Out-of-character confirmation prompts.
    "before we resolve",
    "before proceeding",
    "are you sure",
    "consequences",
    "what would happen",
];