rag-module 0.6.7

Enterprise RAG module with chat context storage, vector search, session management, and model downloading. Rust implementation with Node.js compatibility.
use rag_module::{RagModule, SearchOptions};
use serde_json::json;
use std::path::PathBuf;
use anyhow::Result;

/// Hashes `content` with the standard library's `DefaultHasher`.
///
/// A freshly constructed `DefaultHasher` is used on every call, so equal
/// inputs yield equal hashes within one run of this program.
fn content_hash(content: &str) -> u64 {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    let mut hasher = DefaultHasher::new();
    content.hash(&mut hasher);
    hasher.finish()
}

/// Builds the `<collection>-<hex hash>` document ID for `content`, prints it
/// together with the raw hash under the label `Content <index>`, and returns
/// the ID so the caller can compare IDs.
fn print_doc_id(index: usize, collection_name: &str, content: &str) -> String {
    let hash = content_hash(content);
    let doc_id = format!("{}-{:x}", collection_name, hash);
    // Separate the hash from the ID; printing them back to back is unreadable.
    println!("   Content {}: {:x} → ID: {}", index, hash, doc_id);
    doc_id
}

/// Prints a summary of the JSON storage file at `storage_file`, if it exists.
///
/// Reports the number of entries under the `documents` object, the `id` field
/// of each entry that has one, and the top-level `count` field. This is a
/// diagnostic only: a file that cannot be read or parsed is skipped silently
/// after the path has been printed.
fn print_storage_summary(storage_file: &std::path::Path) {
    if !storage_file.exists() {
        return;
    }

    println!("\n📁 Storage File Check:");
    println!("   Path: {}", storage_file.display());

    let parsed = std::fs::read_to_string(storage_file)
        .ok()
        .and_then(|raw| serde_json::from_str::<serde_json::Value>(&raw).ok());
    let json_data = match parsed {
        Some(value) => value,
        None => return,
    };

    if let Some(docs) = json_data.get("documents").and_then(|d| d.as_object()) {
        println!("   Document keys in storage: {}", docs.len());
        for (key, doc) in docs {
            if let Some(id) = doc.get("id") {
                println!("     - Key: {} → ID: {}", key, id);
            }
        }
    }
    if let Some(count) = json_data.get("count") {
        println!("   Storage count field: {}", count);
    }
}

/// Verifies content-based deduplication end to end.
///
/// Steps:
/// 1. Start from an empty `./verify_dedup_test` directory and create a
///    `RagModule` on it.
/// 2. Print locally computed content hashes / document IDs for three
///    documents, two of which share the same `content` string.
/// 3. Ingest the three documents one at a time, printing the collection's
///    document count after each.
/// 4. Report whether the counts were 1, 1, 2 (duplicate content not stored
///    twice) and dump a summary of the on-disk storage file.
///
/// # Errors
///
/// Returns any error produced by the `RagModule` calls. A count mismatch is
/// only reported on stdout; it does not produce an error.
#[tokio::main]
async fn main() -> Result<()> {
    println!("🔍 Verifying Deduplication with Content Hashes");
    println!("===============================================");

    // Clean start with fresh directory. Removal stays best-effort, but a
    // failure is reported because leftover data would skew the counts below.
    let base_path = PathBuf::from("./verify_dedup_test");
    if base_path.exists() {
        if let Err(err) = std::fs::remove_dir_all(&base_path) {
            eprintln!(
                "⚠️  Could not remove {}: {} (counts below may include stale data)",
                base_path.display(),
                err
            );
        }
    }

    let rag = RagModule::new(base_path.clone()).await?;
    let user_id = "verify_user";
    let collection_name = "verify_estate";
    rag.set_user_context(user_id).await?;

    println!("✅ Fresh RAG instance created");

    // Two distinct content strings: documents 1 and 2 share the first one,
    // document 3 uses the second.
    let content = "RDS instance test-db, id test-db, region us-east-1, engine mysql, class db.t3.micro";
    let other_content = "Different S3 bucket for storage";

    let doc1 = json!({
        "content": content,
        "service": "RDS",
        "region": "us-east-1",
        "instance_id": "test-db-1"
    });

    let doc2 = json!({
        "content": content, // SAME content
        "service": "RDS",
        "region": "us-east-1",
        "instance_id": "test-db-2", // DIFFERENT metadata
        "updated": true
    });

    let doc3 = json!({
        "content": other_content, // DIFFERENT content
        "service": "S3",
        "bucket_name": "test-bucket"
    });

    // NOTE(review): these IDs are computed locally for illustration; whether
    // RagModule derives its document IDs the same way is not visible here.
    println!("\n📋 Test Content Hashes:");
    let doc_id1 = print_doc_id(1, collection_name, content);
    let doc_id2 = print_doc_id(2, collection_name, content);
    let doc_id3 = print_doc_id(3, collection_name, other_content);

    if doc_id1 == doc_id2 {
        println!("✅ Same content generates same document ID");
    } else {
        println!("❌ Same content generates different document IDs");
    }

    if doc_id1 != doc_id3 {
        println!("✅ Different content generates different document ID");
    } else {
        println!("❌ Different content generates same document ID");
    }

    println!("\n📋 Ingesting Documents:");

    println!("🔄 Ingesting document 1...");
    rag.ingest_aws_estate(doc1, user_id, collection_name).await?;
    let count1 = rag.get_document_count(Some(collection_name), None).await?;
    println!("   Document count: {}", count1);

    println!("🔄 Ingesting document 2 (same content)...");
    rag.ingest_aws_estate(doc2, user_id, collection_name).await?;
    let count2 = rag.get_document_count(Some(collection_name), None).await?;
    println!("   Document count: {}", count2);

    println!("🔄 Ingesting document 3 (different content)...");
    rag.ingest_aws_estate(doc3, user_id, collection_name).await?;
    let count3 = rag.get_document_count(Some(collection_name), None).await?;
    println!("   Document count: {}", count3);

    // Expected progression: 1 after the first document, still 1 after the
    // duplicate, 2 once the different document is added.
    println!("\n📊 Final Results:");
    if count1 == 1 && count2 == 1 && count3 == 2 {
        println!("🎉 PERFECT! Deduplication working correctly:");
        println!("   - Same content: {} docs (expected 1)", count2);
        println!("   - + Different content: {} docs (expected 2)", count3);
    } else {
        println!("⚠️  Results need investigation:");
        println!("   - After doc 1: {}", count1);
        println!("   - After doc 2 (duplicate): {}", count2);
        println!("   - After doc 3 (different): {}", count3);
    }

    // Check storage files
    let storage_file = base_path
        .join("qdrant-data")
        .join(user_id)
        .join(format!("{}-documents.json", collection_name));
    print_storage_summary(&storage_file);

    Ok(())
}