// rag-module 0.6.7
//
// Enterprise RAG module with chat context storage, vector search, session management, and model downloading. Rust implementation with Node.js compatibility.
//! Type definitions for the RAG module

use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use indexmap::IndexMap;
use uuid::Uuid;
use chrono::{DateTime, Utc};

/// Document structure for storage in vector database.
///
/// Serializable and deserializable through serde. Only `vector_id` is renamed
/// on the wire (to `vectorId`); every other field keeps its snake_case name.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Document {
    /// Caller-supplied document identifier.
    pub id: String,
    /// Second identifier, serialized under the key `vectorId`.
    ///
    /// `Document::new` fills this with a freshly generated v4 UUID string;
    /// `Document::new_with_vector_id` takes it from the caller.
    /// NOTE(review): presumably the point id used inside the vector store — confirm.
    #[serde(rename = "vectorId")]
    pub vector_id: String,
    /// Textual content of the document.
    pub content: String,
    /// Embedding vector, if one has been attached; `None` until set.
    pub embedding: Option<Vec<f32>>,
    /// Arbitrary JSON metadata keyed by name. `IndexMap` keeps insertion order.
    pub metadata: IndexMap<String, serde_json::Value>,
    /// Creation timestamp (UTC).
    pub created_at: DateTime<Utc>,
    /// Last-update timestamp (UTC); the constructors set it equal to `created_at`.
    pub updated_at: DateTime<Utc>,
}

impl Document {
    /// Creates a document from `id` and `content`, generating a random (v4)
    /// UUID string for `vector_id`.
    ///
    /// The embedding starts as `None`, the metadata map starts empty, and
    /// `created_at` / `updated_at` are both set to the same current UTC instant.
    pub fn new(id: String, content: String) -> Self {
        // `id` is already owned, so it is moved rather than cloned; the shared
        // field initialisation lives in `new_with_vector_id`.
        Self::new_with_vector_id(id, Uuid::new_v4().to_string(), content)
    }

    /// Creates a document from `id`, an explicit `vector_id`, and `content`.
    ///
    /// The embedding starts as `None`, the metadata map starts empty, and
    /// `created_at` / `updated_at` are both set to the same current UTC instant.
    pub fn new_with_vector_id(id: String, vector_id: String, content: String) -> Self {
        let now = Utc::now();
        Self {
            id,
            vector_id,
            content,
            embedding: None,
            metadata: IndexMap::new(),
            created_at: now,
            updated_at: now,
        }
    }

    /// Builder-style setter: replaces the whole metadata map and returns the document.
    ///
    /// Any metadata previously stored on `self` is discarded, not merged.
    pub fn with_metadata(mut self, metadata: IndexMap<String, serde_json::Value>) -> Self {
        self.metadata = metadata;
        self
    }

    /// Builder-style setter: attaches `embedding` (stored as `Some`) and returns the document.
    pub fn with_embedding(mut self, embedding: Vec<f32>) -> Self {
        self.embedding = Some(embedding);
        self
    }
}

/// Search options for vector search.
///
/// Every field is optional; the derived `Default` leaves all of them as `None`.
/// How a `None` is interpreted is decided by the consumer of this struct, which
/// is not visible in this file.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct SearchOptions {
    /// Cap on the number of results — presumably; confirm against the search code.
    pub limit: Option<usize>,
    /// Score cut-off — presumably a minimum similarity; confirm direction with the consumer.
    pub score_threshold: Option<f32>,
    /// Structured metadata filter (see `SearchFilter`).
    pub filter: Option<SearchFilter>,
    /// Name of the collection to search.
    pub collection_name: Option<String>,
    /// Privacy level to apply to the search.
    pub privacy_level: Option<PrivacyLevel>,
    /// Whether results should carry their payload — presumably; confirm.
    pub with_payload: Option<bool>,
    /// Free-form key/value parameters.
    ///
    /// String = exact match, Array = OR condition (any of values).
    pub parameters: Option<HashMap<String, serde_json::Value>>,
}

/// Search filter for metadata filtering.
///
/// Holds three optional lists of conditions.
/// NOTE(review): the field names mirror the `must` / `must_not` / `should`
/// clauses of Qdrant-style filters; how they are combined is decided by the
/// consumer and is not visible here — confirm.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchFilter {
    /// Conditions listed under `must`.
    pub must: Option<Vec<FilterCondition>>,
    /// Conditions listed under `must_not`.
    pub must_not: Option<Vec<FilterCondition>>,
    /// Conditions listed under `should`.
    pub should: Option<Vec<FilterCondition>>,
}

/// Individual filter condition: a metadata key paired with how it must match.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FilterCondition {
    /// Name of the field the condition applies to.
    pub key: String,
    /// The match rule for `key`.
    ///
    /// Written as the raw identifier `r#match` because `match` is a Rust
    /// keyword; serde uses the plain name `match` as the serialized key.
    pub r#match: MatchCondition,
}

/// Match condition for filtering.
///
/// `#[serde(untagged)]`: no variant tag is written; on deserialization serde
/// tries the variants in declaration order and keeps the first that fits.
///
/// NOTE(review): both fields of `Range` are optional, so an object containing
/// neither `value` nor `any` looks like it will deserialize as
/// `Range { gte: None, lte: None }` instead of failing — confirm this
/// catch-all behavior is intended.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum MatchCondition {
    /// A single JSON value, shaped as `{ "value": ... }`.
    Value { value: serde_json::Value },
    /// A list of JSON values, shaped as `{ "any": [...] }`.
    Any { any: Vec<serde_json::Value> },
    /// Numeric bounds `gte` / `lte`, each individually optional.
    Range { gte: Option<f64>, lte: Option<f64> },
}

/// Search result structure: one hit returned from a vector search.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchResult {
    /// Identifier of the hit.
    /// NOTE(review): whether this is the document `id` or the `vector_id` is not visible here.
    pub id: String,
    /// Score of the hit — presumably similarity to the query; scale depends on the backend.
    pub score: f32,
    /// The matched document, when available.
    pub document: Option<Document>,
    /// Raw payload as JSON key/value pairs, when available.
    pub payload: Option<HashMap<String, serde_json::Value>>,
}


/// Chat context structure: identifying and descriptive data for one chat context.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChatContext {
    /// Identifier of this context (always present).
    pub context_id: String,
    /// Associated user, if any.
    pub user_id: Option<String>,
    /// Associated session, if any.
    pub session_id: Option<String>,
    /// Optional title.
    pub title: Option<String>,
    /// Creation timestamp (UTC).
    pub created_at: DateTime<Utc>,
    /// Last-update timestamp (UTC).
    pub updated_at: DateTime<Utc>,
    /// Optional free-form JSON metadata.
    pub metadata: Option<HashMap<String, serde_json::Value>>,
}

/// Privacy level for data filtering.
///
/// Serialized in snake_case: `full`, `minimal_aws`, `anonymous`.
/// What each level actually filters is decided by the code that consumes it,
/// which is not visible in this file.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum PrivacyLevel {
    /// Serialized as `full`.
    Full,
    /// Serialized as `minimal_aws`.
    MinimalAws,
    /// Serialized as `anonymous`.
    Anonymous,
}

impl Default for PrivacyLevel {
    fn default() -> Self {
        PrivacyLevel::MinimalAws
    }
}

/// Collection type enumeration.
///
/// The three unit variants serialize in snake_case (`chat_history`,
/// `aws_estate`, `knowledge_base`), the same strings used by
/// `CollectionType::as_str` and `From<&str>`.
///
/// NOTE(review): `Custom` carries data, so under serde's default
/// externally-tagged representation it serializes as a tagged value
/// (e.g. JSON `{"custom": "<name>"}`), not as the bare name that `as_str`
/// returns — confirm callers do not expect the two to match.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum CollectionType {
    /// Canonical name `chat_history`.
    ChatHistory,
    /// Canonical name `aws_estate`.
    AwsEstate,
    /// Canonical name `knowledge_base`.
    KnowledgeBase,
    /// Any other collection, identified by the contained name.
    Custom(String),
}

impl CollectionType {
    pub fn as_str(&self) -> &str {
        match self {
            CollectionType::ChatHistory => "chat_history",
            CollectionType::AwsEstate => "aws_estate",
            CollectionType::KnowledgeBase => "knowledge_base",
            CollectionType::Custom(name) => name,
        }
    }
}

impl From<&str> for CollectionType {
    fn from(s: &str) -> Self {
        match s {
            "chat_history" => CollectionType::ChatHistory,
            "aws_estate" => CollectionType::AwsEstate,
            "knowledge_base" => CollectionType::KnowledgeBase,
            name => CollectionType::Custom(name.to_string()),
        }
    }
}

/// AWS resource types.
///
/// Variants serialize in snake_case via `rename_all` (e.g. `Ec2Instance`
/// becomes `ec2_instance`).
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum AwsResourceType {
    /// Serialized as `ec2_instance`.
    Ec2Instance,
    /// Serialized as `rds_instance`.
    RdsInstance,
    /// Serialized as `s3_bucket`.
    S3Bucket,
    /// Serialized as `lambda_function`.
    LambdaFunction,
    /// Serialized as `iam_user`.
    IamUser,
    /// Serialized as `iam_role`.
    IamRole,
    /// Serialized as `iam_policy`.
    IamPolicy,
}

/// Azure resource types.
///
/// Variants serialize in snake_case via `rename_all`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum AzureResourceType {
    /// Serialized as `virtual_machine`.
    VirtualMachine,
    /// Serialized as `sql_database`.
    SqlDatabase,
    /// Serialized as `storage_account`.
    StorageAccount,
    /// Serialized as `function_app`.
    FunctionApp,
}

/// GCP resource types.
///
/// Variants serialize in snake_case via `rename_all`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum GcpResourceType {
    /// Serialized as `compute_engine`.
    ComputeEngine,
    /// Serialized as `cloud_sql`.
    CloudSql,
    /// Serialized as `cloud_storage`.
    CloudStorage,
    /// Serialized as `cloud_function`.
    CloudFunction,
}

/// Cloud resource structure: a provider-agnostic record of one cloud resource.
///
/// `resource_type` and `cloud_provider` are plain strings here, not the
/// `AwsResourceType` / `AzureResourceType` / `GcpResourceType` enums.
/// NOTE(review): whether they are expected to hold those enums' serialized
/// names is not visible in this file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CloudResource {
    /// Identifier of the resource.
    pub resource_id: String,
    /// Resource type, as a free-form string.
    pub resource_type: String,
    /// Cloud provider, as a free-form string.
    pub cloud_provider: String,
    /// Account the resource belongs to.
    pub account_id: String,
    /// Region of the resource.
    pub region: String,
    /// Free-form JSON metadata (always present, may be empty).
    pub metadata: HashMap<String, serde_json::Value>,
    /// Creation timestamp (UTC).
    pub created_at: DateTime<Utc>,
    /// Last-update timestamp (UTC).
    pub updated_at: DateTime<Utc>,
}

/// Embedding configuration.
///
/// NOTE(review): the struct derives `Debug` and `Serialize`, so a populated
/// `api_key` will appear in debug output and in serialized form — confirm
/// that is acceptable wherever this config is logged or persisted.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EmbeddingConfig {
    /// Name of the embedding model.
    pub model: String,
    /// Dimensionality of the embedding vectors.
    pub dimensions: usize,
    /// URL of an embedding service, if one is configured.
    pub service_url: Option<String>,
    /// API key for the embedding service, if any.
    pub api_key: Option<String>,
    /// Batch size for embedding requests, if set.
    pub batch_size: Option<usize>,
}

impl Default for EmbeddingConfig {
    fn default() -> Self {
        Self {
            model: "embaas/sentence-transformers-e5-large-v2".to_string(),
            dimensions: 1024,
            service_url: None, // Disable HTTP API to force local model usage
            api_key: None,
            batch_size: Some(32),
        }
    }
}

/// Encryption configuration.
///
/// Three independent switches select which parts of a stored document are
/// encrypted. The encryption itself is implemented elsewhere.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EncryptionConfig {
    /// Name of the encryption algorithm, as a free-form string.
    pub algorithm: String,
    /// Whether document content is encrypted.
    pub enable_content_encryption: bool,
    /// Whether embeddings are encrypted.
    pub enable_embedding_encryption: bool,
    /// Whether metadata is encrypted.
    pub enable_metadata_encryption: bool,
    /// Key rotation interval in days, if set.
    pub key_rotation_days: Option<u32>,
}

impl Default for EncryptionConfig {
    fn default() -> Self {
        Self {
            algorithm: "AES-256-GCM".to_string(),
            enable_content_encryption: false,
            enable_embedding_encryption: false,
            enable_metadata_encryption: false,
            key_rotation_days: Some(90),
        }
    }
}

/// Qdrant server connection configuration.
///
/// NOTE(review): derives `Debug` and `Serialize`, so a populated `api_key`
/// will appear in debug output and serialized form — confirm acceptable.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QdrantConnectionConfig {
    /// Server URL. The `Default` impl leaves it empty, with a note that empty
    /// means embedded mode and that it is set for server mode.
    pub url: String,
    /// API key for the server, if any.
    pub api_key: Option<String>,
    /// Timeout in seconds.
    pub timeout_secs: u64,
}

impl Default for QdrantConnectionConfig {
    fn default() -> Self {
        Self {
            url: "".to_string(),  // Empty for embedded mode, set when using server mode
            api_key: None,
            timeout_secs: 30,
        }
    }
}

/// Vector store configuration.
///
/// NOTE(review): there is a top-level `url` here and another `url` inside
/// `connection`; which one the backend reads is not visible in this file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VectorStoreConfig {
    /// Backend selector, as a free-form string (default `qdrant-embedded`).
    pub backend: String,
    /// Optional URL for the vector store.
    pub url: Option<String>,
    /// Optional prefix for collection names.
    pub collection_prefix: Option<String>,
    /// Distance metric name, as a free-form string (default `Cosine`).
    pub distance_metric: Option<String>,
    /// Qdrant connection settings.
    pub connection: QdrantConnectionConfig,
    /// Optional storage path (default `./qdrant-data`).
    pub storage_path: Option<String>,
}

impl Default for VectorStoreConfig {
    fn default() -> Self {
        Self {
            backend: "qdrant-embedded".to_string(),
            url: None,
            collection_prefix: None,
            distance_metric: Some("Cosine".to_string()),
            connection: QdrantConnectionConfig::default(),
            storage_path: Some("./qdrant-data".to_string()),
        }
    }
}

/// Error types for the RAG module.
///
/// `Display` and `std::error::Error` are generated by `thiserror`; each
/// variant's message is the `#[error(...)]` string with the payload
/// substituted for `{0}`.
#[derive(Debug, thiserror::Error)]
pub enum RagError {
    /// Configuration problem, described by the contained message.
    #[error("Configuration error: {0}")]
    Configuration(String),
    
    /// Vector store failure, described by the contained message.
    #[error("Vector store error: {0}")]
    VectorStore(String),
    
    /// Embedding service failure, described by the contained message.
    #[error("Embedding service error: {0}")]
    Embedding(String),
    
    /// Encryption failure, described by the contained message.
    #[error("Encryption error: {0}")]
    Encryption(String),
    
    /// A document was not found; the payload is shown in the message.
    #[error("Document not found: {0}")]
    DocumentNotFound(String),
    
    /// A collection was not found; the payload is shown in the message.
    #[error("Collection not found: {0}")]
    CollectionNotFound(String),
    
    /// Wraps a `serde_json::Error`; `#[from]` lets `?` convert it automatically.
    #[error("Serialization error: {0}")]
    Serialization(#[from] serde_json::Error),
    
    /// Wraps a `std::io::Error`; `#[from]` lets `?` convert it automatically.
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),
    
    /// Database failure, described by the contained message.
    #[error("Database error: {0}")]
    Database(String),
}

/// Convenience alias for a `Result` whose error type is `RagError`.
pub type RagResult<T> = Result<T, RagError>;

/// Document metadata structure: timestamps plus free-form metadata, without
/// the document content or embedding.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentMetadata {
    /// Creation timestamp (UTC).
    pub created_at: DateTime<Utc>,
    /// Last-update timestamp (UTC).
    pub updated_at: DateTime<Utc>,
    /// Free-form JSON metadata. This is a `HashMap`, so key order is not
    /// preserved, unlike the `IndexMap` used by `Document::metadata`.
    pub metadata: HashMap<String, serde_json::Value>,
}

/// Result structure for collection deletion operations.
///
/// Holds both the outcome flag and any errors collected along the way, so a
/// partially successful deletion can be represented.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CollectionDeleteResult {
    /// Name of the collection the deletion targeted.
    pub collection_name: String,
    /// User the deletion was performed for.
    pub user_id: String,
    /// Whether the collection itself was deleted.
    pub collection_deleted: bool,
    /// Count of files removed.
    /// NOTE(review): nothing in this file keeps it in sync with `removed_files.len()`.
    pub files_removed: usize,
    /// The files that were removed — presumably their paths or names; confirm with the producer.
    pub removed_files: Vec<String>,
    /// Error messages collected during the operation.
    pub errors: Vec<String>,
}

impl CollectionDeleteResult {
    pub fn new(collection_name: String, user_id: String) -> Self {
        Self {
            collection_name,
            user_id,
            collection_deleted: false,
            files_removed: 0,
            removed_files: Vec::new(),
            errors: Vec::new(),
        }
    }

    pub fn is_successful(&self) -> bool {
        self.collection_deleted && self.errors.is_empty()
    }

    pub fn has_partial_success(&self) -> bool {
        self.collection_deleted || self.files_removed > 0
    }
}