rag-module 0.6.7

Enterprise RAG module with chat context storage, vector search, session management, and model downloading. Rust implementation with Node.js compatibility.
//! Security service for handling privacy levels and data sanitization

use anyhow::Result;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use sha2::{Digest, Sha256};
use crate::config::ConfigManager;

/// Reduced document representation produced by `SecurityService::format_for_external`.
///
/// Every field is optional; fields that are `None` are omitted from the
/// serialized output.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExternalDocument {
    /// Hashed stand-in for the document ID: `res-` followed by the first
    /// 16 hex characters of the ID's SHA-256 digest.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub anonymous_id: Option<String>,
    /// Short resource identifier extracted from the document's full ID.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub resource_id: Option<String>,
    /// Value of the `region` metadata entry, when it is a string.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub region: Option<String>,
    /// Value of the `service` metadata entry, when it is a string.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub service_type: Option<String>,
    /// Cloud provider tag; this module emits `aws`, `azure` or `gcp`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub cloud: Option<String>,
}

/// Document as held internally, before it is reduced for external use.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct InternalDocument {
    /// Full document identifier. The service treats it as a cloud resource
    /// identifier (ARN or `/`-separated path) when extracting short IDs.
    pub id: String,
    /// Free-form metadata. This module reads the `cloud`, `region` and
    /// `service` keys and uses them only when their values are strings.
    pub metadata: HashMap<String, serde_json::Value>,
}

/// Applies privacy-level rules when documents and resource data are exposed.
pub struct SecurityService {
    /// Source of the active configuration; the `privacy_level` setting is read from it.
    config_manager: ConfigManager,
}

impl SecurityService {
    /// Creates a service that reads its privacy settings from `config_manager`.
    ///
    /// The current implementation performs no I/O and always returns `Ok`.
    pub async fn new(config_manager: ConfigManager) -> Result<Self> {
        let service = Self { config_manager };
        Ok(service)
    }
    
    /// Initialization hook; currently performs no work and always returns `Ok(())`.
    pub async fn initialize(&self) -> Result<()> {
        Ok(())
    }
    
    /// Shutdown hook; currently performs no work and always returns `Ok(())`.
    pub async fn shutdown(&self) -> Result<()> {
        Ok(())
    }

    /// Converts internal documents into their reduced external representation.
    ///
    /// When the configured privacy level is `"anonymous"`, each document is
    /// reduced to an anonymous ID derived from its `id`. For every other level
    /// the document is passed through `extract_minimal_cloud_data`.
    ///
    /// The output has one entry per input document, in the same order.
    pub fn format_for_external(&self, documents: &[InternalDocument]) -> Vec<ExternalDocument> {
        let config = self.config_manager.get_config();
        // The privacy level is the same for every document, so decide once.
        let anonymous_mode = &config.privacy_level == "anonymous";

        let mut external = Vec::with_capacity(documents.len());
        for doc in documents {
            let formatted = if anonymous_mode {
                ExternalDocument {
                    anonymous_id: Some(self.generate_anonymous_id(&doc.id)),
                    resource_id: None,
                    region: None,
                    service_type: None,
                    cloud: None,
                }
            } else {
                self.extract_minimal_cloud_data(doc)
            };
            external.push(formatted);
        }
        external
    }

    /// Derives a stable pseudonymous identifier from `resource_id`.
    ///
    /// Returns `res-` followed by the first 16 lowercase hex characters of the
    /// SHA-256 digest of the ID's bytes, so equal inputs give equal outputs.
    fn generate_anonymous_id(&self, resource_id: &str) -> String {
        let digest = Sha256::digest(resource_id.as_bytes());
        let digest_hex = format!("{:x}", digest);
        // A SHA-256 digest renders as 64 ASCII hex characters, so the slice is in range.
        let prefix = &digest_hex[..16];
        format!("res-{}", prefix)
    }

    /// Reduces a document to the minimal cloud fields that may be exposed.
    ///
    /// The provider is read from the `cloud` metadata entry and defaults to
    /// `"aws"` when that entry is missing or not a string. For `aws`, `azure`
    /// and `gcp` the result carries the short resource ID, the `region` and
    /// `service` metadata strings (when present) and the provider name. Any
    /// other provider yields only an anonymous ID.
    fn extract_minimal_cloud_data(&self, document: &InternalDocument) -> ExternalDocument {
        // Fetches a metadata entry as an owned string; non-string values are ignored.
        let string_field = |key: &str| -> Option<String> {
            document
                .metadata
                .get(key)
                .and_then(|value| value.as_str())
                .map(str::to_owned)
        };

        let provider = document
            .metadata
            .get("cloud")
            .and_then(|value| value.as_str())
            .unwrap_or("aws");

        match provider {
            "aws" | "azure" | "gcp" => ExternalDocument {
                anonymous_id: None,
                resource_id: Some(self.extract_resource_id(&document.id, provider)),
                region: string_field("region"),
                service_type: string_field("service"),
                cloud: Some(provider.to_string()),
            },
            // Unrecognized provider: expose nothing but a hashed identifier.
            _ => ExternalDocument {
                anonymous_id: Some(self.generate_anonymous_id(&document.id)),
                resource_id: None,
                region: None,
                service_type: None,
                cloud: None,
            },
        }
    }

    /// Extracts the short resource identifier from a provider-specific full ID.
    ///
    /// * `aws`: `full_id` is treated as an ARN
    ///   (`arn:partition:service:region:account:resource`). Everything after
    ///   the fifth colon is the resource part, which may itself contain
    ///   colons. If it contains `/`, the text after the last `/` is returned
    ///   (`instance/i-123` -> `i-123`); otherwise a leading `type:` prefix is
    ///   dropped (`db:mydb` -> `mydb`). IDs with fewer than six segments are
    ///   returned unchanged.
    /// * `azure` / `gcp`: the text after the last `/` is returned.
    /// * any other provider: `full_id` is returned unchanged.
    fn extract_resource_id(&self, full_id: &str, cloud: &str) -> String {
        match cloud {
            "aws" => {
                // `splitn(6, ..)` keeps the whole resource part in the sixth
                // item. A plain `split(':')` would cut `db:mydb` apart and
                // leave only the resource type (`db`).
                match full_id.splitn(6, ':').nth(5) {
                    Some(resource) => {
                        let short_id = match resource.rsplit_once('/') {
                            Some((_, tail)) => tail,
                            None => resource
                                .split_once(':')
                                .map_or(resource, |(_, tail)| tail),
                        };
                        short_id.to_string()
                    }
                    None => full_id.to_string(),
                }
            }
            // `rsplit` always yields at least one item, so the fallback is
            // only a formality.
            "azure" | "gcp" => full_id.rsplit('/').next().unwrap_or(full_id).to_string(),
            _ => full_id.to_string(),
        }
    }

    /// Returns a copy of `resource_data` containing only the fields that are
    /// considered safe at the given privacy level.
    ///
    /// `privacy_level` defaults to `"minimal-aws"` when `None`. Field safety is
    /// decided per key by `is_safe_field`; values are cloned unchanged.
    pub fn sanitize_resource_data(
        &self,
        resource_data: &HashMap<String, serde_json::Value>,
        privacy_level: Option<&str>,
    ) -> HashMap<String, serde_json::Value> {
        let level = match privacy_level {
            Some(requested) => requested,
            None => "minimal-aws",
        };

        resource_data
            .iter()
            .filter(|(field, _)| self.is_safe_field(field, level))
            .map(|(field, value)| (field.clone(), value.clone()))
            .collect()
    }

    /// Decides whether `field_name` may be exposed at `privacy_level`.
    ///
    /// * `"minimal-aws"`: allow-list of technical fields only.
    /// * `"functional"`: everything except a deny-list of naming, credential
    ///   and configuration fields.
    /// * `"anonymous"` and any unknown level: nothing is allowed.
    fn is_safe_field(&self, field_name: &str, privacy_level: &str) -> bool {
        // Technical attributes that carry no names or sensitive data.
        const MINIMAL_ALLOWED: &[&str] = &[
            "region", "cloud", "availability_zone", "state", "status",
            "instance_type", "db_instance_class", "engine", "engine_version",
            "allocated_storage", "storage_type", "port", "publicly_accessible",
            "platform", "architecture", "launch_time", "creation_date",
            "versioning_status", "storage_class", "instance_id", "resource_id",
        ];
        // Fields withheld even when operational data is otherwise permitted.
        const FUNCTIONAL_DENIED: &[&str] = &[
            "name", "tags", "security_groups", "key_name", "user_data",
            "endpoint", "master_username", "master_user_password", "db_name",
            "bucket_name", "policy", "lifecycle_configuration", "cors_configuration",
        ];

        match privacy_level {
            "minimal-aws" => MINIMAL_ALLOWED.contains(&field_name),
            "functional" => !FUNCTIONAL_DENIED.contains(&field_name),
            // "anonymous" exposes no fields; unknown levels are rejected to be safe.
            _ => false,
        }
    }

    /// Returns an owned copy of the privacy level from the current configuration.
    pub fn get_privacy_level(&self) -> String {
        self.config_manager.get_config().privacy_level.clone()
    }

    /// Performs basic structural validation of a resource access request.
    ///
    /// Returns `Ok(true)` only when all of the following hold:
    /// * `resource_id` is non-empty,
    /// * `operation` is one of the recognized operation names,
    /// * `user_context` contains both a `user_id` and a `role` entry.
    ///
    /// This does not check permissions or ownership, and the current
    /// implementation never returns `Err`.
    pub fn validate_resource_access(
        &self,
        resource_id: &str,
        operation: &str,
        user_context: &HashMap<String, String>,
    ) -> Result<bool> {
        const ALLOWED_OPERATIONS: [&str; 12] = [
            "read", "list", "describe", "get",
            "start", "stop", "restart", "reboot",
            "create", "delete", "update", "modify",
        ];
        const REQUIRED_CONTEXT_KEYS: [&str; 2] = ["user_id", "role"];

        let has_resource = !resource_id.is_empty();
        let operation_allowed = ALLOWED_OPERATIONS
            .iter()
            .any(|allowed| *allowed == operation);
        let has_identity = REQUIRED_CONTEXT_KEYS
            .iter()
            .all(|key| user_context.contains_key(*key));

        Ok(has_resource && operation_allowed && has_identity)
    }

    /// Masks `data` according to the kind of value named by `field_type`.
    ///
    /// * `"password"`, `"secret"`, `"token"`: fully replaced by `***MASKED***`.
    /// * `"email"`: keeps the first two characters of the local part and the
    ///   domain (`alice@example.com` -> `al***@example.com`). Local parts of
    ///   two characters or fewer become `***@***`; input without `@` becomes
    ///   `***MASKED***`.
    /// * `"ip_address"`: keeps the first two of four dot-separated parts;
    ///   anything that does not have exactly four parts becomes `***MASKED***`.
    /// * `"arn"`: keeps the first three colon-separated segments and masks the
    ///   rest; input with fewer than six segments becomes `***MASKED***`.
    /// * any other `field_type`: `data` is returned unchanged.
    pub fn mask_sensitive_data(
        &self,
        data: &str,
        field_type: &str,
    ) -> String {
        const MASKED: &str = "***MASKED***";

        match field_type {
            "password" | "secret" | "token" => MASKED.to_string(),
            "email" => match data.find('@') {
                Some(at_pos) => {
                    // `domain` keeps its leading '@'.
                    let (username, domain) = data.split_at(at_pos);
                    // The byte offset of the third character is where the
                    // two-character prefix ends. Slicing by characters rather
                    // than bytes avoids a panic on non-ASCII local parts.
                    match username.char_indices().nth(2) {
                        Some((prefix_end, _)) => {
                            format!("{}***{}", &username[..prefix_end], domain)
                        }
                        None => "***@***".to_string(),
                    }
                }
                None => MASKED.to_string(),
            },
            "ip_address" => {
                let parts: Vec<&str> = data.split('.').collect();
                if parts.len() == 4 {
                    format!("{}.{}.*.***", parts[0], parts[1])
                } else {
                    MASKED.to_string()
                }
            }
            "arn" => {
                // Keep only the first three segments (e.g. `arn:aws:ec2`);
                // region, account and resource are all masked.
                let arn_parts: Vec<&str> = data.split(':').collect();
                if arn_parts.len() >= 6 {
                    format!("{}:{}:{}:***:***:***", arn_parts[0], arn_parts[1], arn_parts[2])
                } else {
                    MASKED.to_string()
                }
            }
            _ => data.to_string(), // No masking for unknown types
        }
    }

    /// Builds a flat audit record for an operation on a resource.
    ///
    /// The record contains the keys `timestamp` (current UTC time, RFC 3339),
    /// `operation`, `resource_id`, `user_id`, `result` and `privacy_level`
    /// (the currently configured level).
    pub fn generate_audit_log(
        &self,
        operation: &str,
        resource_id: &str,
        user_id: &str,
        result: &str,
    ) -> HashMap<String, String> {
        // NOTE(review): "resource_id" is not a field type that
        // `mask_sensitive_data` recognizes, so the ID is currently stored
        // unmasked. Confirm whether that is intended.
        let masked_resource_id = |id: &str| self.mask_sensitive_data(id, "resource_id");

        let entries = vec![
            ("timestamp", chrono::Utc::now().to_rfc3339()),
            ("operation", operation.to_string()),
            ("resource_id", masked_resource_id(resource_id)),
            ("user_id", user_id.to_string()),
            ("result", result.to_string()),
            ("privacy_level", self.get_privacy_level()),
        ];

        entries
            .into_iter()
            .map(|(key, value)| (key.to_string(), value))
            .collect()
    }
}