rag-module 0.6.7

Enterprise RAG module with chat context storage, vector search, session management, and model downloading. Rust implementation with Node.js compatibility.
use rag_module::RagModule;
use serde_json::json;
use std::path::PathBuf;
use anyhow::Result;

/// End-to-end check of AWS estate deduplication in [`RagModule`].
///
/// The scenario runs against the `aws_estate_dedup_test` collection for the
/// user `test_user_dedup`, with the module rooted at `./test_data`:
///
/// 1. Ingest the same document twice and expect a document count of 1.
/// 2. Ingest the same `content` with different metadata and expect the count
///    to stay at 1.
/// 3. Ingest a document with different `content` and expect a count of 2.
/// 4. Ingest a batch of four documents containing two distinct `content`
///    values and expect a count of 4.
/// 5. Run two searches and print how many results each returned (the result
///    counts are reported only, not asserted).
///
/// # Errors
///
/// Returns the first error produced by any `RagModule` call.
///
/// # Panics
///
/// Panics if any of the document-count assertions fails.
#[tokio::main]
async fn main() -> Result<()> {
    // `println!` only accepts a literal format string, so the separator is
    // built once here and passed as a formatting argument below.
    let separator = "-".repeat(50);

    println!("🧪 Testing AWS Estate Deduplication");
    println!("=====================================");

    // Initialize RAG module
    let base_path = PathBuf::from("./test_data");
    let rag = RagModule::new(base_path).await?;

    let user_id = "test_user_dedup";
    let collection_name = "aws_estate_dedup_test";

    // Set user context
    rag.set_user_context(user_id).await?;

    // NOTE(review): every count assertion below assumes the collection is
    // empty when the program starts. `./test_data` presumably persists between
    // runs -- confirm, and clear the collection first if it does.

    // Test Case 1: Insert same content with same metadata - should NOT create duplicate
    println!("\n📋 Test Case 1: Same content + Same metadata");
    println!("{}", separator);

    let same_content_same_metadata = json!({
        "content": "This is an EC2 instance with standard configuration",
        "service": "EC2",
        "region": "us-east-1",
        "instance_type": "t3.micro",
        "status": "running"
    });

    println!("🔄 Ingesting document first time...");
    rag.ingest_aws_estate(same_content_same_metadata.clone(), user_id, collection_name)
        .await?;
    println!("✅ First ingestion completed");

    println!("🔄 Ingesting SAME document again...");
    // Last use of the value, so it is moved rather than cloned.
    rag.ingest_aws_estate(same_content_same_metadata, user_id, collection_name)
        .await?;
    println!("✅ Second ingestion completed");

    // Check document count - should be 1, not 2
    let count1 = rag.get_document_count(Some(collection_name), None).await?;
    println!("📊 Document count after same content ingestion: {}", count1);
    assert_eq!(count1, 1, "Should only have 1 document, not duplicates!");

    // Test Case 2: Same content but different metadata - should UPDATE existing
    println!("\n📋 Test Case 2: Same content + Different metadata");
    println!("{}", separator);

    let same_content_diff_metadata = json!({
        "content": "This is an EC2 instance with standard configuration", // SAME content
        "service": "EC2",
        "region": "us-west-2",  // DIFFERENT region
        "instance_type": "t3.small", // DIFFERENT instance type
        "status": "stopped",    // DIFFERENT status
        "updated_at": "2024-12-24T10:00:00Z" // NEW field
    });

    println!("🔄 Ingesting document with SAME content but DIFFERENT metadata...");
    rag.ingest_aws_estate(same_content_diff_metadata, user_id, collection_name)
        .await?;
    println!("✅ Metadata update ingestion completed");

    // Check document count - should STILL be 1 (updated, not duplicated)
    let count2 = rag.get_document_count(Some(collection_name), None).await?;
    println!("📊 Document count after metadata update: {}", count2);
    assert_eq!(count2, 1, "Should still only have 1 document after metadata update!");

    // Test Case 3: Different content - should CREATE new document
    println!("\n📋 Test Case 3: Different content");
    println!("{}", separator);

    let different_content = json!({
        "content": "This is a completely different RDS database instance", // DIFFERENT content
        "service": "RDS",
        "region": "us-east-1",
        "engine": "postgres",
        "status": "available"
    });

    println!("🔄 Ingesting document with DIFFERENT content...");
    rag.ingest_aws_estate(different_content, user_id, collection_name)
        .await?;
    println!("✅ Different content ingestion completed");

    // Check document count - should NOW be 2 (one for each unique content)
    let count3 = rag.get_document_count(Some(collection_name), None).await?;
    println!("📊 Document count after different content: {}", count3);
    assert_eq!(count3, 2, "Should have 2 documents for 2 different contents!");

    // Test Case 4: Batch ingestion with mixed scenarios
    println!("\n📋 Test Case 4: Batch ingestion with duplicates");
    println!("{}", separator);

    let batch_data = vec![
        json!({
            "content": "S3 bucket for storing application logs",
            "service": "S3",
            "bucket_name": "app-logs-bucket"
        }),
        json!({
            "content": "S3 bucket for storing application logs", // DUPLICATE content
            "service": "S3",
            "bucket_name": "app-logs-bucket-updated", // Different metadata
            "encryption": "AES-256"
        }),
        json!({
            "content": "Lambda function for image processing",  // UNIQUE content
            "service": "Lambda",
            "runtime": "python3.9"
        }),
        json!({
            "content": "S3 bucket for storing application logs", // DUPLICATE again
            "service": "S3",
            "bucket_name": "final-bucket-name", // Different metadata again
            "encryption": "AES-256",
            "versioning": "enabled"
        }),
    ];

    println!(
        "🔄 Ingesting batch with {} documents (including duplicates)...",
        batch_data.len()
    );
    rag.ingest_aws_estate_batch(batch_data, user_id, collection_name)
        .await?;
    println!("✅ Batch ingestion completed");

    // Check final count - should be 4 total (2 from before + 2 unique contents from batch)
    let final_count = rag.get_document_count(Some(collection_name), None).await?;
    println!("📊 Final document count: {}", final_count);
    assert_eq!(final_count, 4, "Should have 4 documents total (2 + 2 unique contents)!");

    // Search to verify we can find the documents
    println!("\n📋 Test Case 5: Search verification");
    println!("{}", separator);

    let search_results = rag
        .search(collection_name, "EC2 instance", Default::default())
        .await?;
    println!("🔍 Search for 'EC2 instance' found {} results", search_results.len());

    let s3_results = rag
        .search(collection_name, "S3 bucket", Default::default())
        .await?;
    println!("🔍 Search for 'S3 bucket' found {} results", s3_results.len());

    println!("\n🎉 All deduplication tests PASSED!");
    println!("✅ Same content creates same document ID");
    println!("✅ Metadata updates replace existing document");
    println!("✅ Different content creates new documents");
    println!("✅ Batch processing handles mixed scenarios correctly");
    println!("✅ No duplicate documents in storage");

    Ok(())
}