rag-module 0.6.7

Enterprise RAG module with chat context storage, vector search, session management, and model downloading. Rust implementation with Node.js compatibility.
//! Hash Collision Fix Demo
//! 
//! This demonstrates the before/after improvements to canonical ID generation
//! and hash collision prevention.

use serde_json::json;
use sha2::{Sha256, Digest};
use std::collections::{HashMap, HashSet};

/// Runs the hash-collision demonstration and prints the results to stdout.
///
/// Steps:
/// 1. Build six sample resource descriptors (three pairs whose identifiers
///    differ only in their final character).
/// 2. Derive a canonical ID and a truncated SHA-256 hash for each and print them.
/// 3. Report whether any canonical IDs or hashes are duplicated.
/// 4. For the first two resources, print the `DefaultHasher` value next to the
///    SHA-256 based one.
///
/// # Panics
///
/// Panics if a sample descriptor is not a JSON object; every descriptor below
/// is written as an object literal, so this indicates an edit error.
fn main() {
    println!("🛠️  Hash Collision Fix Demonstration");
    println!("=====================================\n");

    // Near-identical resources: each pair differs only in the last character
    // of its identifying field.
    let test_resources = vec![
        ("EC2 Instance 1", json!({
            "resource_type": "ec2_instance",
            "instance_id": "i-1234567890abcdef0",
            "region": "us-east-1",
            "account_id": "123456789012"
        })),
        ("EC2 Instance 2", json!({
            "resource_type": "ec2_instance",
            "instance_id": "i-1234567890abcdef1", // Very similar ID
            "region": "us-east-1",
            "account_id": "123456789012"
        })),
        ("S3 Bucket 1", json!({
            "resource_type": "s3_bucket",
            "bucket_name": "my-test-bucket-001",
            "account_id": "123456789012"
        })),
        ("S3 Bucket 2", json!({
            "resource_type": "s3_bucket",
            "bucket_name": "my-test-bucket-002", // Similar name
            "account_id": "123456789012"
        })),
        ("Unknown Resource 1", json!({
            "resource_type": "unknown_service",
            "name": "resource-001",
            "region": "us-east-1",
            "account_id": "123456789012"
        })),
        ("Unknown Resource 2", json!({
            "resource_type": "unknown_service",
            "name": "resource-002", // Similar but different
            "region": "us-east-1",
            "account_id": "123456789012"
        })),
    ];

    println!("🔍 Testing canonical ID generation improvements:\n");

    // Parallel vectors: index i holds the results for test_resources[i].
    let mut canonical_ids = Vec::with_capacity(test_resources.len());
    let mut sha256_hashes = Vec::with_capacity(test_resources.len());

    for (name, metadata) in &test_resources {
        let fields = metadata
            .as_object()
            .expect("every sample resource is written as a JSON object literal");
        let canonical_id = generate_improved_canonical_id(fields);
        let sha256_hash = generate_sha256_hash(&canonical_id);

        println!("📊 {}", name);
        println!("   Canonical ID: {}", canonical_id);
        println!("   SHA256 Hash:  {}", sha256_hash);
        println!();

        // Print first, then move the strings into the vectors: no clones needed.
        canonical_ids.push(canonical_id);
        sha256_hashes.push(sha256_hash);
    }

    // Check for collisions
    println!("🚫 Collision Analysis:");
    println!("======================");

    // A set smaller than the list means at least two entries were equal.
    let unique_canonical: HashSet<_> = canonical_ids.iter().collect();
    if unique_canonical.len() == canonical_ids.len() {
        println!("✅ Canonical IDs: No collisions! ({} unique IDs)", unique_canonical.len());
    } else {
        println!("❌ Canonical IDs: {} collisions detected!", canonical_ids.len() - unique_canonical.len());
    }

    let unique_hashes: HashSet<_> = sha256_hashes.iter().collect();
    if unique_hashes.len() == sha256_hashes.len() {
        println!("✅ SHA256 Hashes: No collisions! ({} unique hashes)", unique_hashes.len());
    } else {
        println!("❌ SHA256 Hashes: {} collisions detected!", sha256_hashes.len() - unique_hashes.len());
    }

    // Demonstrate old vs new approach
    println!("\n🔄 Old vs New Hash Comparison:");
    println!("===============================");

    // Pair each resource with its own results by position rather than by
    // inspecting the display name, so renaming or reordering the samples
    // cannot silently select the wrong entry. Only the first two are shown.
    let first_two = test_resources
        .iter()
        .zip(&canonical_ids)
        .zip(&sha256_hashes)
        .take(2);

    for (((name, _), canonical_id), new_hash) in first_two {
        // Old approach (DefaultHasher), shown for comparison only.
        let old_hash = old_style_hash(canonical_id);

        println!("📊 {}", name);
        println!("   Old Hash (weak):  {:x}", old_hash);
        println!("   New Hash (strong): {}", new_hash);
        println!();
    }

    println!("🎉 Improvements Summary:");
    println!("========================");
    println!("✅ Enhanced canonical ID generation with multiple field fallbacks");
    println!("✅ Replaced DefaultHasher with SHA-256 for collision resistance");
    println!("✅ Better handling of edge cases and unknown resource types");
    println!("✅ Deterministic but collision-resistant document IDs");
}

/// Builds a canonical identifier string for a resource from its metadata
/// (simplified version of the actual function).
///
/// Only string-valued entries are considered; a key holding any other JSON
/// type is treated as missing. `resource_type` is lower-cased and defaults to
/// `"unknown"`.
///
/// Output formats:
/// - `aws:ec2:<account_id>:<region>:<instance_id>` for `ec2_instance` when
///   `account_id`, `region` and `instance_id` are all present.
/// - `aws:s3:<account_id>:<bucket_name>` for `s3_bucket` when `account_id`
///   and `bucket_name` are present.
/// - Otherwise `fallback:type:<resource_type>[:<field>:<value>][:region:<r>][:account:<a>]`,
///   where `<field>` is the first of `instance_id`, `bucket_name`, `name`,
///   `id` that is present.
///
/// NOTE(review): values are joined with `:` without escaping, so a value that
/// itself contains `:` can make two different inputs produce the same ID.
fn generate_improved_canonical_id(metadata: &serde_json::Map<String, serde_json::Value>) -> String {
    // Looks up `key` and yields its value only if it is a JSON string.
    let text = |key: &str| metadata.get(key).and_then(|value| value.as_str());

    let resource_type = text("resource_type").unwrap_or("unknown").to_lowercase();
    let account_id = text("account_id");
    let region = text("region");

    // AWS resources: only attempted when an account ID is known.
    if let Some(account) = account_id {
        let aws_id = match resource_type.as_str() {
            "ec2_instance" => region
                .zip(text("instance_id"))
                .map(|(region, instance_id)| {
                    format!("aws:ec2:{}:{}:{}", account, region, instance_id)
                }),
            "s3_bucket" => text("bucket_name")
                .map(|bucket_name| format!("aws:s3:{}:{}", account, bucket_name)),
            _ => None,
        };
        if let Some(id) = aws_id {
            return id;
        }
    }

    // Fallback: the first identifying field present, in priority order.
    let identifier = ["instance_id", "bucket_name", "name", "id"]
        .iter()
        .find_map(|&field| text(field).map(|value| format!("{}:{}", field, value)));

    // Absent parts are simply skipped; `Option` chains as zero or one item.
    let id_components: Vec<String> = std::iter::once(format!("type:{}", resource_type))
        .chain(identifier)
        .chain(region.map(|region| format!("region:{}", region)))
        .chain(account_id.map(|account| format!("account:{}", account)))
        .collect();

    format!("fallback:{}", id_components.join(":"))
}

/// Hashes `input` with SHA-256 and returns the first 8 bytes of the digest,
/// hex-encoded (new approach).
///
/// Only a 64-bit prefix of the 256-bit digest is kept, to give a shorter ID.
fn generate_sha256_hash(input: &str) -> String {
    let digest = Sha256::digest(input.as_bytes());
    hex::encode(&digest[..8])
}

/// Hashes `input` with the standard library's `DefaultHasher`, simulating the
/// old approach (for comparison only).
///
/// Returns the 64-bit value produced by a freshly constructed hasher after
/// feeding it `input` through its `Hash` implementation.
fn old_style_hash(input: &str) -> u64 {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    let mut state = DefaultHasher::new();
    Hash::hash(input, &mut state);
    Hasher::finish(&state)
}