// rag-module 0.6.7
//
// Enterprise RAG module with chat context storage, vector search, session management, and model downloading. Rust implementation with Node.js compatibility.
//! Test hash collision fixes
//! 
//! This example tests the improved canonical ID generation and SHA-256 hashing
//! to ensure different resources get unique IDs.

use rag_module::RagModule;
use serde_json::json;
use std::collections::HashMap;

/// Smoke test for canonical-ID uniqueness and batch deduplication.
///
/// The program:
/// 1. opens a `RagModule` rooted at `./hash-collision-test-data` and initializes it;
/// 2. ingests ten near-identical AWS resource documents one at a time, deriving a
///    *simulated* canonical ID (`resource_type:identifier`) for each;
/// 3. reports any simulated canonical IDs that occur more than once;
/// 4. ingests a three-document batch containing one exact duplicate and reports
///    how many documents were created.
///
/// The canonical ID computed here only approximates the library's internal logic
/// (which is not visible from this example), so a clean report in step 3 says the
/// test fixtures are distinguishable, not that the library's own IDs are unique.
///
/// # Errors
///
/// Returns the first error produced by module construction, initialization, or
/// either ingestion call.
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Metadata keys probed, in priority order, when deriving the identifier half
    // of the simulated canonical ID.
    const IDENTIFIER_KEYS: [&str; 5] = [
        "instance_id",
        "bucket_name",
        "db_instance_identifier",
        "function_name",
        "name",
    ];

    tracing_subscriber::fmt::init();

    println!("🧪 Testing Hash Collision Fixes");
    println!("================================\n");

    // Initialize RAG module
    let rag = RagModule::new("./hash-collision-test-data").await?;
    rag.initialize().await?;

    let user_id = "collision_test_user";
    let collection_name = "hash_test";

    // Create test documents that might cause collisions with old system
    let test_documents = vec![
        // Two EC2 instances with similar metadata but different IDs
        json!({
            "content": "EC2 instance i-1234567890abcdef0 running in us-east-1",
            "resource_type": "ec2_instance",
            "instance_id": "i-1234567890abcdef0",
            "instance_type": "t3.micro",
            "state": "running",
            "region": "us-east-1",
            "account_id": "123456789012"
        }),
        json!({
            "content": "EC2 instance i-1234567890abcdef1 running in us-east-1",
            "resource_type": "ec2_instance",
            "instance_id": "i-1234567890abcdef1", // Different by one character
            "instance_type": "t3.micro",
            "state": "running",
            "region": "us-east-1",
            "account_id": "123456789012"
        }),
        // S3 buckets with similar names
        json!({
            "content": "S3 bucket my-test-bucket-001 for application data storage",
            "resource_type": "s3_bucket",
            "bucket_name": "my-test-bucket-001",
            "region": "us-east-1",
            "account_id": "123456789012"
        }),
        json!({
            "content": "S3 bucket my-test-bucket-002 for application data storage",
            "resource_type": "s3_bucket",
            "bucket_name": "my-test-bucket-002", // Different by one digit
            "region": "us-east-1",
            "account_id": "123456789012"
        }),
        // RDS instances with similar identifiers
        json!({
            "content": "RDS MySQL database prod-db-main-01 running version 8.0.35",
            "resource_type": "rds_instance",
            "db_instance_identifier": "prod-db-main-01",
            "engine": "mysql",
            "engine_version": "8.0.35",
            "instance_class": "db.t3.micro",
            "region": "us-east-1",
            "account_id": "123456789012"
        }),
        json!({
            "content": "RDS MySQL database prod-db-main-02 running version 8.0.35",
            "resource_type": "rds_instance",
            "db_instance_identifier": "prod-db-main-02", // Different by one digit
            "engine": "mysql",
            "engine_version": "8.0.35",
            "instance_class": "db.t3.micro",
            "region": "us-east-1",
            "account_id": "123456789012"
        }),
        // Documents with minimal differences that might hash to same value
        json!({
            "content": "Lambda function user-service-handler processing user requests",
            "resource_type": "lambda_function",
            "function_name": "user-service-handler",
            "runtime": "python3.11",
            "memory_size": 128,
            "timeout": 30,
            "region": "us-east-1",
            "account_id": "123456789012"
        }),
        json!({
            "content": "Lambda function user-service-handler processing user responses", // Very similar content
            "resource_type": "lambda_function",
            "function_name": "user-service-handler2", // Slightly different name
            "runtime": "python3.11",
            "memory_size": 128,
            "timeout": 30,
            "region": "us-east-1",
            "account_id": "123456789012"
        }),
        // Fallback case - unknown resource type with minimal identifying info
        json!({
            "content": "Unknown service resource with minimal metadata",
            "resource_type": "unknown_service",
            "name": "resource-001",
            "region": "us-east-1",
            "account_id": "123456789012"
        }),
        json!({
            "content": "Unknown service resource with minimal metadata", // Same content
            "resource_type": "unknown_service",
            "name": "resource-002", // Different name
            "region": "us-east-1",
            "account_id": "123456789012"
        }),
    ];

    let total = test_documents.len();
    println!("📊 Testing {} documents for hash collisions...\n", total);

    // Ingest documents one by one, recording the simulated canonical ID of each.
    let mut canonical_ids: Vec<String> = Vec::with_capacity(total);

    for (idx, doc) in test_documents.iter().enumerate() {
        println!("Processing document {}/{}", idx + 1, total);

        // This is a simplified version - the actual function is internal
        let resource_type = doc
            .get("resource_type")
            .and_then(|value| value.as_str())
            .unwrap_or("unknown");

        // Take the first key that holds a *string* value; a key that is present
        // but not a string no longer short-circuits the search.
        let identifier = IDENTIFIER_KEYS
            .iter()
            .find_map(|key| doc.get(*key).and_then(|value| value.as_str()))
            .unwrap_or("unknown");

        let simulated_canonical_id = format!("{}:{}", resource_type, identifier);

        // Ingest the document
        let result = rag
            .ingest_aws_estate(doc.clone(), user_id, collection_name)
            .await?;

        if result.create_result.created > 0 {
            println!("  ✅ Document ingested successfully");
            println!("     Canonical ID: {}", simulated_canonical_id);
        } else {
            println!("  ❌ Document ingestion failed: {:?}", result.create_result.failed);
        }

        // Recorded regardless of ingestion outcome so the collision analysis
        // below covers every fixture.
        canonical_ids.push(simulated_canonical_id);
    }

    println!("\n🔍 Analysis Results:");
    println!("===================");

    // Count occurrences of each simulated canonical ID.
    let mut canonical_counts: HashMap<&str, usize> = HashMap::new();
    for id in &canonical_ids {
        *canonical_counts.entry(id.as_str()).or_insert(0) += 1;
    }

    let mut duplicates: Vec<(&str, usize)> = canonical_counts
        .into_iter()
        .filter(|&(_, count)| count > 1)
        .collect();
    // HashMap iteration order is unspecified; sort so the report is stable.
    duplicates.sort_unstable();

    if duplicates.is_empty() {
        println!("✅ No duplicate canonical IDs found - collision prevention working!");
    } else {
        println!("⚠️  Found {} duplicate canonical IDs:", duplicates.len());
        for (id, count) in duplicates {
            println!("   - '{}' appears {} times", id, count);
        }
    }

    // Test the actual batch ingestion with potential duplicates
    println!("\n🔄 Testing batch ingestion with potential duplicates...");

    // Create a batch with intentional duplicates
    let duplicate_batch = vec![
        test_documents[0].clone(), // Same EC2 instance
        test_documents[0].clone(), // Exact duplicate
        test_documents[1].clone(), // Similar but different EC2 instance
    ];

    let batch_result = rag
        .ingest_aws_estate_batch(duplicate_batch, user_id, collection_name)
        .await?;

    println!("Batch ingestion results:");
    println!("  Total resources: {}", batch_result.total_resources);
    println!("  Successfully created: {}", batch_result.create_result.created);
    println!("  Failed: {}", batch_result.failed_resources);

    // NOTE(review): both EC2 documents were already ingested individually above;
    // whether the batch call counts them as "created" again depends on library
    // behavior not visible here - confirm the expected count.
    match batch_result.create_result.created {
        2 => println!("✅ Deduplication working - 3 resources became 2 documents (duplicate removed)"),
        3 => println!("⚠️  No deduplication occurred - might indicate issue"),
        other => println!("❌ Unexpected result - {} documents created", other),
    }

    println!("\n🎉 Hash collision test completed!");
    println!("Check the generated documents to ensure uniqueness.");

    Ok(())
}