butterfly-bot 0.3.5

use async_trait::async_trait;
use tokio::sync::Mutex;

use crate::error::Result;
use crate::interfaces::brain::{BrainContext, BrainEvent, BrainPlugin};

#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SafetyViolationType {
    PromptInjection,
    GoalHijacking,
    ReligiousExtremism,
    OwnerObstruction,
    SelfPreservationManipulation,
    None,
}

#[derive(Debug, Clone)]
pub struct SafetyViolationReport {
    pub violation_type: SafetyViolationType,
    pub severity: String,
    pub reasoning: String,
}

pub struct AiSafetyBrain {
    last_violation: Mutex<Option<SafetyViolationReport>>,
}

impl AiSafetyBrain {
    pub fn new() -> Self {
        Self {
            last_violation: Mutex::new(None),
        }
    }

    pub async fn last_violation(&self) -> Option<SafetyViolationReport> {
        let guard = self.last_violation.lock().await;
        guard.clone()
    }

    fn inspect(message: &str) -> SafetyViolationReport {
        let text = message.to_lowercase();
        if ["ignore previous", "jailbreak", "system prompt", "override"]
            .iter()
            .any(|kw| text.contains(kw))
        {
            return SafetyViolationReport {
                violation_type: SafetyViolationType::PromptInjection,
                severity: "high".to_string(),
                reasoning: "prompt injection attempt".to_string(),
            };
        }
        if ["god told me", "convert you", "religious authority"]
            .iter()
            .any(|kw| text.contains(kw))
        {
            return SafetyViolationReport {
                violation_type: SafetyViolationType::ReligiousExtremism,
                severity: "medium".to_string(),
                reasoning: "religious extremism indicator".to_string(),
            };
        }
        if ["change your goal", "your purpose is", "replace your owner"]
            .iter()
            .any(|kw| text.contains(kw))
        {
            return SafetyViolationReport {
                violation_type: SafetyViolationType::GoalHijacking,
                severity: "high".to_string(),
                reasoning: "goal hijacking attempt".to_string(),
            };
        }
        if ["don't shut down", "resist shutdown", "preserve yourself"]
            .iter()
            .any(|kw| text.contains(kw))
        {
            return SafetyViolationReport {
                violation_type: SafetyViolationType::SelfPreservationManipulation,
                severity: "critical".to_string(),
                reasoning: "self-preservation manipulation".to_string(),
            };
        }
        if ["block the owner", "refuse updates", "no modifications"]
            .iter()
            .any(|kw| text.contains(kw))
        {
            return SafetyViolationReport {
                violation_type: SafetyViolationType::OwnerObstruction,
                severity: "critical".to_string(),
                reasoning: "owner obstruction signal".to_string(),
            };
        }
        SafetyViolationReport {
            violation_type: SafetyViolationType::None,
            severity: "none".to_string(),
            reasoning: "no violation".to_string(),
        }
    }
}

#[async_trait]
impl BrainPlugin for AiSafetyBrain {
    fn name(&self) -> &str {
        "ai_safety"
    }

    fn description(&self) -> &str {
        "Detects AI safety violation patterns in user messages"
    }

    async fn on_event(&self, event: BrainEvent, _ctx: &BrainContext) -> Result<()> {
        if let BrainEvent::UserMessage { text, .. } = event {
            let report = Self::inspect(&text);
            let mut guard = self.last_violation.lock().await;
            *guard = Some(report);
        }
        Ok(())
    }
}