roboticus-api 0.11.3

//! Delegation output quality gate.
//!
//! Evaluates subagent output against the original task to catch hollow,
//! generic, or misaligned responses. Uses heuristic checks with
//! configurable policy (strict/sample/off).

/// Verdict from the output quality evaluation.
///
/// Every variant carries a `score`. The heuristic evaluator in this file
/// hands out values between 10 and 95, with higher scores going to longer,
/// more structured output.
// The dead-code lint is silenced outside test builds because, within this
// file, `Escalate` is only constructed by the test-only
// `parse_evaluation_response`.
#[cfg_attr(not(test), allow(dead_code))]
#[derive(Debug, Clone, PartialEq, Eq)]
pub(super) enum OutputVerdict {
    /// Output is substantive and aligned with the task.
    Pass { score: u8 },
    /// Output is hollow or misaligned — re-delegate with feedback.
    ///
    /// `feedback` is a human-readable explanation of what was wrong.
    Retry { feedback: String, score: u8 },
    /// Output quality is too low to salvage — surface to user.
    ///
    /// `reason` is a human-readable explanation of why it was rejected.
    Escalate { reason: String, score: u8 },
}

/// Detect template/placeholder residue in (already trimmed) subagent output.
///
/// Matching is ASCII-case-insensitive. Returns the marker that fired — one of
/// the strong patterns listed in the body, or `"todo"` — so the caller can
/// quote it in its feedback. Returns `None` when nothing looks like a
/// placeholder.
fn contains_placeholder_content(output_trimmed: &str) -> Option<&'static str> {
    // Strong placeholder markers that should almost never appear in a real
    // final answer. Checked in order; the first one found is reported.
    const STRONG_PATTERNS: [&str; 5] = [
        "[insert",
        "[your ",
        "lorem ipsum",
        "placeholder",
        "example here",
    ];

    // Decide whether a lowercased, trimmed line looks like an unfinished
    // template marker: it either opens with `todo:` or closes with a
    // standalone `todo` / `todo:`. "Standalone" means the character before the
    // marker is not alphanumeric, so ordinary words that merely end in the
    // letters "todo" (e.g. "metodo") are not mistaken for a marker.
    fn is_todo_marker_line(line: &str) -> bool {
        if line.starts_with("todo:") {
            return true;
        }
        match line
            .strip_suffix("todo:")
            .or_else(|| line.strip_suffix("todo"))
        {
            Some(before) => !before
                .chars()
                .next_back()
                .map_or(false, char::is_alphanumeric),
            None => false,
        }
    }

    let lower = output_trimmed.to_ascii_lowercase();

    if let Some(pattern) = STRONG_PATTERNS
        .iter()
        .copied()
        .find(|pattern| lower.contains(*pattern))
    {
        return Some(pattern);
    }

    // Treat TODO as a placeholder only when it looks like an unfinished
    // template marker, not when the answer is legitimately discussing TODO
    // comments/code in the middle of a sentence.
    if lower.lines().map(str::trim).any(is_todo_marker_line) {
        Some("todo")
    } else {
        None
    }
}

/// Judge subagent `output` against `task_description` using cheap heuristics.
///
/// The checks run in order and the first one that fails decides the verdict:
///
/// 1. Trimmed output shorter than 20 bytes → `Retry`, score 10.
/// 2. A placeholder/template marker is present → `Retry`, score 20.
/// 3. The first line contains a hollow lead-in phrase and the trimmed output
///    is under 200 bytes → `Retry`, score 30.
/// 4. The task has at least 15 words but the output has fewer than 30 →
///    `Retry`, score 35.
///
/// Anything that survives is a `Pass`, scored 50/60/75/85 by word count with
/// a 10-point bonus when the output shows some structure (a newline, a code
/// fence, `- ` or `1.`). This function never produces `Escalate`.
pub(super) fn evaluate_output_heuristic(task_description: &str, output: &str) -> OutputVerdict {
    // Phrases that typically open a reply which never gets to the point.
    const HOLLOW_LEADS: [&str; 5] = [
        "i'll help you with",
        "i can help with",
        "here's what i found",
        "let me explain",
        "great question",
    ];
    // Substrings (besides a newline) that hint at structured content.
    const STRUCTURE_MARKERS: [&str; 3] = ["```", "- ", "1."];

    let body = output.trim();

    // Guard 1: empty or near-empty output.
    if body.len() < 20 {
        return OutputVerdict::Retry {
            feedback: String::from("Output is too short to be substantive"),
            score: 10,
        };
    }

    // Guard 2: leftover placeholder/template text.
    if let Some(pattern) = contains_placeholder_content(body) {
        return OutputVerdict::Retry {
            feedback: format!("Output contains placeholder pattern: '{pattern}'"),
            score: 20,
        };
    }

    // Guard 3: a hollow lead-in on the first line is only a problem when
    // there is little else after it.
    let lowered = body.to_lowercase();
    let opening_line = lowered.lines().next().unwrap_or("");
    let hollow_opening = HOLLOW_LEADS
        .iter()
        .any(|lead| opening_line.contains(lead));
    if hollow_opening && body.len() < 200 {
        return OutputVerdict::Retry {
            feedback: String::from("Output appears to be a preamble without substantive content"),
            score: 30,
        };
    }

    // Guard 4: a substantial task (>= 15 words) answered in under 30 words.
    let output_words = body.split_whitespace().count();
    let task_words = task_description.split_whitespace().count();
    if task_words >= 15 && output_words < 30 {
        return OutputVerdict::Retry {
            feedback: String::from(
                "Output is disproportionately short relative to the task complexity",
            ),
            score: 35,
        };
    }

    // Passing score: longer output earns a higher tier, structure earns a bonus.
    let length_tier: u8 = match output_words {
        0..=40 => 50,
        41..=80 => 60,
        81..=200 => 75,
        _ => 85,
    };
    let structured = body.contains('\n')
        || STRUCTURE_MARKERS
            .iter()
            .any(|marker| body.contains(marker));
    let bonus: u8 = if structured { 10 } else { 0 };

    OutputVerdict::Pass {
        score: (length_tier + bonus).min(100),
    }
}

/// Parse an LLM evaluation response into a verdict (for future LLM-based eval).
///
/// `response` is expected to be a JSON object with optional `verdict`
/// (`"retry"`, `"escalate"`, anything else counts as pass), `score` and
/// `feedback` fields. Parsing is deliberately fail-open:
///
/// - input that is not valid JSON yields `Pass { score: 50 }`;
/// - a missing or unrecognised `verdict` yields `Pass`;
/// - a `score` that is missing or not readable via `as_u64` defaults to 50;
/// - a missing `feedback` becomes an empty string.
///
/// Scores above 100 are clamped to 100 rather than being narrowed with a bare
/// `as u8`, which would silently wrap (e.g. 300 would turn into 44).
#[cfg(test)]
pub(super) fn parse_evaluation_response(response: &str) -> OutputVerdict {
    const DEFAULT_SCORE: u8 = 50;
    const MAX_SCORE: u64 = 100;

    let parsed: serde_json::Value = match serde_json::from_str(response) {
        Ok(value) => value,
        // Unparseable evaluator output must not block the delegation.
        Err(_) => {
            return OutputVerdict::Pass {
                score: DEFAULT_SCORE,
            }
        }
    };

    let verdict = parsed
        .get("verdict")
        .and_then(|v| v.as_str())
        .unwrap_or("pass");
    let score = parsed
        .get("score")
        .and_then(|v| v.as_u64())
        // Clamp first so the narrowing cast below is lossless.
        .map_or(DEFAULT_SCORE, |raw| raw.min(MAX_SCORE) as u8);
    let feedback = parsed
        .get("feedback")
        .and_then(|v| v.as_str())
        .unwrap_or("")
        .to_string();

    match verdict {
        "retry" => OutputVerdict::Retry { feedback, score },
        "escalate" => OutputVerdict::Escalate {
            reason: feedback,
            score,
        },
        _ => OutputVerdict::Pass { score },
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for "the evaluator asked for a re-delegation".
    fn is_retry(verdict: &OutputVerdict) -> bool {
        matches!(verdict, OutputVerdict::Retry { .. })
    }

    // ── Heuristic evaluator ────────────────────────────────────────────

    #[test]
    fn empty_output_triggers_retry() {
        // A two-character reply is far below the minimum length.
        let outcome = evaluate_output_heuristic("Analyze the codebase", "OK");
        assert!(is_retry(&outcome));
    }

    #[test]
    fn placeholder_triggers_retry() {
        let report = "Here is the report:\n\n[insert your analysis here]\n\nConclusion: TODO";
        let outcome = evaluate_output_heuristic("Write a report", report);
        assert!(is_retry(&outcome));
    }

    #[test]
    fn discussing_real_todo_markers_does_not_trigger_placeholder_retry() {
        let task = "Review the codebase and explain unfinished work markers";
        let answer = "The audit found three real TODO markers in the auth module. Two are in test code and one is in production request validation.";
        let outcome = evaluate_output_heuristic(task, answer);
        assert!(
            matches!(outcome, OutputVerdict::Pass { .. }),
            "legitimate discussion of TODO markers should not be treated as a placeholder"
        );
    }

    #[test]
    fn hollow_preamble_without_substance_triggers_retry() {
        let task = "Analyze the authentication system for vulnerabilities";
        let answer = "I'll help you with that! Let me look into it.";
        assert!(is_retry(&evaluate_output_heuristic(task, answer)));
    }

    #[test]
    fn substantive_output_passes() {
        let output = "The authentication system uses JWT tokens with RS256 signing. \
            Key findings:\n\
            1. Token expiration is set to 24 hours, which is longer than recommended.\n\
            2. The refresh token rotation is properly implemented.\n\
            3. Session storage uses httpOnly cookies, preventing XSS token theft.\n\
            4. Rate limiting on the login endpoint prevents brute force attacks.\n\
            5. However, the password reset flow lacks email verification.\n\n\
            Recommendations:\n\
            - Reduce JWT expiration to 1 hour\n\
            - Add email verification to password reset\n\
            - Implement CSRF tokens for state-changing operations";
        let outcome = evaluate_output_heuristic("Analyze the authentication system", output);
        if let OutputVerdict::Pass { score } = &outcome {
            assert!(*score >= 70);
        } else {
            panic!("expected Pass, got {outcome:?}");
        }
    }

    #[test]
    fn short_output_for_complex_task_triggers_retry() {
        let task = "Perform a comprehensive analysis of the microservices architecture including \
             service boundaries, communication patterns, failure modes, and scaling characteristics";
        let outcome = evaluate_output_heuristic(task, "The architecture looks good overall.");
        assert!(is_retry(&outcome));
    }

    // ── LLM response parser ────────────────────────────────────────────

    #[test]
    fn parse_pass_verdict() {
        let outcome =
            parse_evaluation_response(r#"{"verdict": "pass", "score": 85, "feedback": ""}"#);
        assert_eq!(outcome, OutputVerdict::Pass { score: 85 });
    }

    #[test]
    fn parse_retry_verdict() {
        let outcome = parse_evaluation_response(
            r#"{"verdict": "retry", "score": 30, "feedback": "Output is too generic"}"#,
        );
        if let OutputVerdict::Retry { feedback, score } = outcome {
            assert_eq!(score, 30);
            assert!(feedback.contains("generic"));
        } else {
            panic!("expected Retry verdict");
        }
    }

    #[test]
    fn parse_escalate_verdict() {
        let outcome = parse_evaluation_response(
            r#"{"verdict": "escalate", "score": 10, "feedback": "Completely off-topic"}"#,
        );
        if let OutputVerdict::Escalate { reason, score } = outcome {
            assert_eq!(score, 10);
            assert!(reason.contains("off-topic"));
        } else {
            panic!("expected Escalate verdict");
        }
    }

    #[test]
    fn parse_malformed_defaults_to_pass() {
        // Non-JSON input falls back to a neutral pass.
        let outcome = parse_evaluation_response("this is not json");
        assert_eq!(outcome, OutputVerdict::Pass { score: 50 });
    }

    #[test]
    fn parse_missing_verdict_defaults_to_pass() {
        let outcome = parse_evaluation_response(r#"{"score": 70}"#);
        assert!(matches!(outcome, OutputVerdict::Pass { .. }));
    }
}