rag-module 0.6.7

Enterprise RAG module with chat context storage, vector search, session management, and model downloading. Rust implementation with Node.js compatibility.
use rag_module::RagModule;
use serde_json::json;
use tempfile::tempdir;
use tokio;
use anyhow::Result;

/// Ingests one document twice into the same collection and checks that the
/// reported document count stays at 1 after each ingest.
#[tokio::test]
async fn test_content_based_deduplication() -> Result<()> {
    let owner = "test_user_dedup";
    let collection = "test_dedup";

    // The temp-dir guard is bound to a local so it lives until the test returns.
    let scratch = tempdir()?;
    let rag = RagModule::new(scratch.path().to_path_buf()).await?;
    rag.set_user_context(owner).await?;

    // Document whose content and metadata are identical on both ingests.
    let ec2_doc = json!({
        "content": "EC2 instance for web application",
        "service": "EC2",
        "region": "us-east-1"
    });

    // First ingest: the collection should now report exactly one document.
    rag.ingest_aws_estate(ec2_doc.clone(), owner, collection).await?;
    assert_eq!(rag.get_document_count(Some(collection), None).await?, 1);

    // Second ingest of the very same value: the count must not move.
    rag.ingest_aws_estate(ec2_doc, owner, collection).await?;
    assert_eq!(
        rag.get_document_count(Some(collection), None).await?,
        1,
        "Duplicate content should not increase count"
    );

    Ok(())
}

/// Ingests two documents that share the same `content` string but carry
/// different metadata fields, then checks that only one document is counted.
#[tokio::test]
async fn test_metadata_update_same_content() -> Result<()> {
    let owner = "test_user_metadata";
    let collection = "test_metadata_update";

    // The temp-dir guard is bound to a local so it lives until the test returns.
    let scratch = tempdir()?;
    let rag = RagModule::new(scratch.path().to_path_buf()).await?;
    rag.set_user_context(owner).await?;

    // Initial revision of the record.
    let original = json!({
        "content": "RDS PostgreSQL database for user data",
        "service": "RDS",
        "engine": "postgres",
        "version": "13.7"
    });

    // Later revision: identical content, bumped version, two extra fields.
    let revised = json!({
        "content": "RDS PostgreSQL database for user data",
        "service": "RDS",
        "engine": "postgres",
        "version": "14.9",
        "backup_retention": 7,
        "multi_az": true
    });

    rag.ingest_aws_estate(original, owner, collection).await?;
    rag.ingest_aws_estate(revised, owner, collection).await?;

    // The revision is expected to replace the stored entry rather than add one.
    let stored = rag.get_document_count(Some(collection), None).await?;
    assert_eq!(stored, 1, "Metadata update should not create new document");

    Ok(())
}

/// Ingests two documents with different `content` strings and checks that the
/// document count goes 1 -> 2.
#[tokio::test]
async fn test_different_content_creates_new_documents() -> Result<()> {
    let owner = "test_user_different";
    let collection = "test_different_content";

    // The temp-dir guard is bound to a local so it lives until the test returns.
    let scratch = tempdir()?;
    let rag = RagModule::new(scratch.path().to_path_buf()).await?;
    rag.set_user_context(owner).await?;

    // First record: an S3 bucket.
    let bucket_doc = json!({
        "content": "S3 bucket for storing images",
        "service": "S3",
        "bucket_name": "images-bucket"
    });
    rag.ingest_aws_estate(bucket_doc, owner, collection).await?;
    assert_eq!(rag.get_document_count(Some(collection), None).await?, 1);

    // Second record: unrelated content, so it should be stored separately.
    let lambda_doc = json!({
        "content": "Lambda function for image processing",
        "service": "Lambda",
        "runtime": "python3.9"
    });
    rag.ingest_aws_estate(lambda_doc, owner, collection).await?;
    assert_eq!(
        rag.get_document_count(Some(collection), None).await?,
        2,
        "Different content should create new document"
    );

    Ok(())
}

/// Sends a four-element batch in which three entries share one `content`
/// string, and checks that the collection ends up with two documents.
#[tokio::test]
async fn test_batch_deduplication() -> Result<()> {
    let owner = "test_user_batch";
    let collection = "test_batch_dedup";

    // The temp-dir guard is bound to a local so it lives until the test returns.
    let scratch = tempdir()?;
    let rag = RagModule::new(scratch.path().to_path_buf()).await?;
    rag.set_user_context(owner).await?;

    // Baseline VPC record.
    let vpc_base = json!({
        "content": "VPC for production environment",
        "service": "VPC",
        "cidr": "10.0.0.0/16"
    });
    // Same content as the baseline, one extra metadata field.
    let vpc_with_hostnames = json!({
        "content": "VPC for production environment",
        "service": "VPC",
        "cidr": "10.0.0.0/16",
        "dns_hostnames": true
    });
    // The only entry with different content.
    let web_security_group = json!({
        "content": "Security group for web servers",
        "service": "EC2",
        "type": "SecurityGroup"
    });
    // Same content as the baseline again, with yet more metadata.
    let vpc_with_resolution = json!({
        "content": "VPC for production environment",
        "service": "VPC",
        "cidr": "10.0.0.0/16",
        "dns_hostnames": true,
        "dns_resolution": true
    });

    // Order matters for parity with the scenario: dup, dup, unique, dup.
    let payload = vec![
        vpc_base,
        vpc_with_hostnames,
        web_security_group,
        vpc_with_resolution,
    ];
    rag.ingest_aws_estate_batch(payload, owner, collection).await?;

    // Expect one VPC entry plus one security-group entry.
    let stored = rag.get_document_count(Some(collection), None).await?;
    assert_eq!(stored, 2, "Batch should deduplicate and result in 2 unique documents");

    Ok(())
}

/// Ingests two documents that differ only by one extra top-level field and
/// checks that the count is 1 after the first ingest and still 1 after the second.
#[tokio::test]
async fn test_content_hash_consistency() -> Result<()> {
    let owner = "test_user_consistency";
    let collection = "test_consistency";

    // The temp-dir guard is bound to a local so it lives until the test returns.
    let scratch = tempdir()?;
    let rag = RagModule::new(scratch.path().to_path_buf()).await?;
    rag.set_user_context(owner).await?;

    // Record with a nested policy object as part of its metadata.
    let role_doc = json!({
        "content": "IAM role for Lambda execution",
        "service": "IAM",
        "type": "Role",
        "trust_policy": {
            "Version": "2012-10-17",
            "Statement": []
        }
    });

    // Identical record plus a trailing "description" field; built from the
    // first one so the shared part cannot drift.
    let mut described_role_doc = role_doc.clone();
    described_role_doc["description"] = json!("Role for Lambda function execution");

    // Both counts are sampled before either assertion runs.
    rag.ingest_aws_estate(role_doc, owner, collection).await?;
    let seen_after_plain = rag.get_document_count(Some(collection), None).await?;

    rag.ingest_aws_estate(described_role_doc, owner, collection).await?;
    let seen_after_described = rag.get_document_count(Some(collection), None).await?;

    assert_eq!(seen_after_plain, 1);
    assert_eq!(
        seen_after_described,
        1,
        "Same content should produce same hash regardless of metadata"
    );

    Ok(())
}

/// Builds a 100-element batch (50 entries sharing one `content` string, 50 with
/// distinct content), shuffles it, ingests it, and checks for 51 documents.
#[tokio::test]
async fn test_large_scale_deduplication() -> Result<()> {
    use rand::seq::SliceRandom;

    let owner = "test_user_large";
    let collection = "test_large_scale";

    // The temp-dir guard is bound to a local so it lives until the test returns.
    let scratch = tempdir()?;
    let rag = RagModule::new(scratch.path().to_path_buf()).await?;
    rag.set_user_context(owner).await?;

    // 50 entries with the same content but per-entry volume id and size.
    let shared = (0..50).map(|i| {
        json!({
            "content": "Shared EBS volume for data storage",
            "service": "EBS",
            "volume_id": format!("vol-{:06}", i),
            "size": 100 + i,
        })
    });

    // 50 entries whose content string embeds the index, making each distinct.
    let distinct = (0..50).map(|i| {
        json!({
            "content": format!("Unique EBS volume {} for application data", i),
            "service": "EBS",
            "volume_id": format!("vol-unique-{:06}", i),
            "size": 200 + i,
        })
    });

    // Shared entries first, then distinct ones, then mix them up so duplicates
    // are not adjacent. The shuffle is unseeded, so order varies between runs.
    let mut payload: Vec<_> = shared.chain(distinct).collect();
    payload.shuffle(&mut rand::thread_rng());

    rag.ingest_aws_estate_batch(payload, owner, collection).await?;

    // One entry for the shared content plus 50 distinct ones.
    let stored = rag.get_document_count(Some(collection), None).await?;
    assert_eq!(stored, 51, "Large scale deduplication should work correctly");

    Ok(())
}