Skip to main content

agentforge_core/
redteam.rs

1use chrono::{DateTime, Utc};
2use serde::{Deserialize, Serialize};
3use uuid::Uuid;
4
/// Category of a red-team probe.
///
/// With `#[serde(rename_all = "snake_case")]`, each variant is serialized and
/// deserialized under the snake_case form of its name (for example
/// `PromptInjection` is written as `"prompt_injection"`).
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
#[serde(rename_all = "snake_case")]
pub enum RedTeamCategory {
    /// Attempts to make the agent ignore its system prompt or constraints.
    Jailbreak,
    /// Injecting malicious instructions via user-controlled content.
    PromptInjection,
    /// Probing the agent to leak sensitive information (PII, API keys, internal data).
    DataLeakage,
    /// Confusing the agent about its role or persona.
    RoleConfusion,
    /// Bypassing declared behavioral constraints.
    ConstraintBypass,
}
20
21impl std::fmt::Display for RedTeamCategory {
22    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
23        match self {
24            RedTeamCategory::Jailbreak => write!(f, "jailbreak"),
25            RedTeamCategory::PromptInjection => write!(f, "prompt_injection"),
26            RedTeamCategory::DataLeakage => write!(f, "data_leakage"),
27            RedTeamCategory::RoleConfusion => write!(f, "role_confusion"),
28            RedTeamCategory::ConstraintBypass => write!(f, "constraint_bypass"),
29        }
30    }
31}
32
/// Result of a safety dimension score for a red-team trace.
///
/// Pairs the verdict (`safe`) with the probe category it applies to, plus the
/// confidence and optional rationale behind the verdict.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SafetyScore {
    /// Whether the agent resisted the attack (true = safe, false = violated).
    pub safe: bool,
    /// Category of the probe that was tested.
    pub category: RedTeamCategory,
    /// Confidence in this assessment (0.0–1.0).
    ///
    /// The range is a documented convention only; the `f64` type does not
    /// enforce it.
    pub confidence: f64,
    /// Optional explanation from the LLM judge.
    pub rationale: Option<String>,
}
45
/// Summary of a red-team evaluation run.
///
/// None of the relationships between the counters and rates are enforced by
/// this type; whoever builds the summary is responsible for keeping them
/// consistent.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RedTeamSummary {
    /// Identifier of the run this summary describes.
    pub run_id: Uuid,
    /// Total number of probes in the run.
    pub total_probes: u32,
    /// Number of probes counted as safe.
    pub safe_count: u32,
    /// Number of probes counted as violated.
    // NOTE(review): presumably `safe_count + violated_count == total_probes` — confirm with the producer.
    pub violated_count: u32,
    /// Safety rate: fraction of probes the agent resisted.
    // NOTE(review): presumably `safe_count / total_probes`; the value for an empty run is not defined here — confirm.
    pub safety_rate: f64,
    /// Per-category breakdown.
    pub by_category: Vec<RedTeamCategoryResult>,
    /// UTC timestamp associated with the evaluation.
    // NOTE(review): start vs. completion time cannot be told from this file — confirm.
    pub evaluated_at: DateTime<Utc>,
}
59
/// Results for a single probe category; the element type of
/// `RedTeamSummary::by_category`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RedTeamCategoryResult {
    /// Category these numbers apply to.
    pub category: RedTeamCategory,
    /// Number of probes in this category.
    pub total: u32,
    /// Number of probes in this category counted as safe.
    pub safe: u32,
    /// Safety rate for this category.
    // NOTE(review): presumably `safe / total`, mirroring the run-level rate — confirm with the producer.
    pub safety_rate: f64,
}
67
#[cfg(test)]
mod tests {
    use super::*;

    /// Pins the `Display` text of every category, so a typo in any single
    /// match arm is caught rather than only the first two variants.
    #[test]
    fn category_display() {
        let expected = [
            (RedTeamCategory::Jailbreak, "jailbreak"),
            (RedTeamCategory::PromptInjection, "prompt_injection"),
            (RedTeamCategory::DataLeakage, "data_leakage"),
            (RedTeamCategory::RoleConfusion, "role_confusion"),
            (RedTeamCategory::ConstraintBypass, "constraint_bypass"),
        ];

        for (category, text) in expected.iter() {
            assert_eq!(category.to_string(), *text);
        }
    }
}