// rag-module 0.6.7

//! Fresh storage and search test: stores new documents with BGE-M3 embeddings and immediately searches them.

use anyhow::Result;
use serde_json::json;
use rag_module::*;
use rag_module::services::search_service::EstateSearchOptions;

/// Runs the fresh storage + search diagnostic end to end.
///
/// The run proceeds in four steps, printing progress to stdout:
/// 1. Store a small AWS estate fixture (two RDS instances, one Lambda function).
/// 2. Read the `aws_estate` collection back and inspect each stored embedding.
/// 3. Run several estate searches with different score thresholds.
/// 4. Recompute the cosine similarity between a query embedding and the first
///    stored document by hand.
///
/// # Errors
///
/// Returns an error if the current working directory cannot be determined or
/// if any `rag_module` call (creation, initialization, storage, retrieval,
/// search, embedding generation) fails.
#[tokio::main]
async fn main() -> Result<()> {
    /// Embedding dimensionality this test expects for stored and query vectors.
    const EXPECTED_EMBEDDING_DIM: usize = 1024;
    /// Number of leading bytes of a document ID shown in the per-document summary.
    const ID_PREVIEW_LEN: usize = 8;

    println!("🆕 Fresh Storage + Search Test");
    println!("===============================");
    println!("This test will store NEW documents with BGE-M3 and immediately search them");

    // Use a completely fresh directory
    let base_path = std::env::current_dir()?.join("fresh-storage-test");

    // Clean up any existing data. Cleanup stays best-effort, but a failure is
    // reported because leftover data means the run is not actually "fresh".
    if base_path.exists() {
        if let Err(err) = std::fs::remove_dir_all(&base_path) {
            eprintln!(
                "⚠️  Could not remove existing directory {}: {}",
                base_path.display(),
                err
            );
        }
    }

    let rag = create_rag_module(base_path).await?;
    rag.initialize().await?;

    let user_id = "fresh_test_user";

    println!("\n📝 Step 1: Store RDS data with current BGE-M3 model");

    // Create RDS data that should match our query
    let rds_data = json!([{
        "account_id": "123456789012",
        "account_name": "Test Production Account",
        "services": {
            "rds": {
                "instances": [
                    {
                        "db_instance_identifier": "prod-mysql-db",
                        "db_name": "production",
                        "engine": "mysql",
                        "engine_version": "8.0.35",
                        "db_instance_class": "db.t3.medium",
                        "allocated_storage": 100,
                        "storage_type": "gp2",
                        "multi_az": true,
                        "publicly_accessible": false,
                        "description": "Production MySQL database for user authentication and core application data",
                        "tags": {
                            "Environment": "production",
                            "Application": "web-app",
                            "Owner": "backend-team"
                        }
                    },
                    {
                        "db_instance_identifier": "analytics-postgres-db",
                        "db_name": "analytics",
                        "engine": "postgres",
                        "engine_version": "15.4",
                        "db_instance_class": "db.r5.large",
                        "allocated_storage": 500,
                        "storage_type": "gp3",
                        "multi_az": false,
                        "publicly_accessible": false,
                        "description": "PostgreSQL database for analytics, reporting, and business intelligence queries",
                        "tags": {
                            "Environment": "production",
                            "Application": "analytics",
                            "Owner": "data-team"
                        }
                    }
                ]
            },
            "lambda": {
                "functions": [
                    {
                        "function_name": "db-backup-scheduler",
                        "runtime": "python3.9",
                        "description": "Lambda function that schedules and manages RDS database backups"
                    }
                ]
            }
        }
    }]);

    println!("Adding AWS estate data...");
    let doc_ids = rag.process_aws_estate(rds_data, user_id).await?;
    println!("✅ Stored {} documents with BGE-M3 embeddings", doc_ids.len());

    // Wait for storage to complete
    println!("\n⏳ Waiting for storage and indexing...");
    tokio::time::sleep(tokio::time::Duration::from_secs(2)).await;

    println!("\n🔍 Step 2: Verify documents are stored correctly");
    let docs = rag.get_collection_documents("aws_estate", user_id).await?;
    println!("Documents in collection: {}", docs.len());

    for (i, doc) in docs.iter().enumerate() {
        // Show only a short ID prefix. Fall back to the whole ID when it is
        // shorter than the preview length (or the cut would split a character)
        // instead of panicking on an out-of-range slice.
        let id_preview = doc.id.get(..ID_PREVIEW_LEN).unwrap_or(&doc.id[..]);
        println!(
            "  Doc {}: ID={}, Content={}chars, Embedding={}D",
            i + 1,
            id_preview,
            doc.content.len(),
            doc.embedding.len()
        );

        // Verify embedding quality
        if doc.embedding.len() == EXPECTED_EMBEDDING_DIM {
            // The expected dimension is small enough to be represented exactly as f32.
            let dim = EXPECTED_EMBEDDING_DIM as f32;
            let avg = doc.embedding.iter().sum::<f32>() / dim;
            let variance = doc
                .embedding
                .iter()
                .map(|x| (x - avg).powi(2))
                .sum::<f32>()
                / dim;

            println!("    Embedding: avg={:.6}, variance={:.6}", avg, variance);

            // A near-constant vector is treated as a sign of a placeholder embedding.
            if variance > 0.001 {
                println!("    ✅ Good variance - real BGE-M3 embedding");
            } else {
                println!("    ⚠️  Low variance - might be dummy");
            }
        } else {
            println!("    ❌ Wrong embedding dimensions: {}", doc.embedding.len());
        }
    }

    // Nothing to search if storage produced no documents; stop here.
    if docs.is_empty() {
        println!("❌ No documents stored! This is the problem.");
        return Ok(());
    }

    println!("\n🔍 Step 3: Test multiple search queries with different thresholds");

    // (query text, score threshold) pairs, from strictest to most permissive.
    let test_queries = [
        ("RDS database instances", 0.3),
        ("RDS database instances", 0.1),
        ("RDS database instances", 0.0),
        ("MySQL database", 0.0),
        ("database", 0.0),
        ("production", 0.0),
    ];

    for (query, threshold) in test_queries {
        println!("\n🔎 Query: '{}' (threshold: {})", query, threshold);

        // No filters: only the result limit and score threshold are set.
        let search_options = EstateSearchOptions {
            resource_types: None,
            account_ids: None,
            regions: None,
            services: None,
            states: None,
            environment: None,
            application: None,
            synced_after: None,
            limit: Some(5),
            score_threshold: Some(threshold),
            include_metadata: true,
            use_anonymous_ids: false,
        };

        let results = rag
            .search_service
            .search_estate_resources(query, search_options, None)
            .await?;

        println!("   Results: {}", results.len());

        // Only results that carry a numeric "score" field are listed.
        for (i, result) in results.iter().enumerate() {
            if let Some(score) = result.get("score").and_then(|s| s.as_f64()) {
                let service = result
                    .get("service")
                    .and_then(|s| s.as_str())
                    .unwrap_or("unknown");
                println!("     {}: {} (score: {:.4})", i + 1, service, score);
            }
        }

        // The exact comparison with 0.0 is intentional: it matches the literal
        // thresholds listed in `test_queries`.
        if results.is_empty() && threshold == 0.0 {
            println!("   ❌ Even with threshold 0.0, no results! Vector search has an issue.");
        } else if !results.is_empty() {
            println!("   ✅ Found results! Threshold {} works.", threshold);
        }
    }

    println!("\n📊 Step 4: Manual similarity check");
    if let Some(first_doc) = docs.first() {
        println!("Testing manual embedding generation and similarity...");

        let query = "RDS database";
        let query_embedding = rag.embedding_service.generate_embedding(query).await?;

        if query_embedding.len() == EXPECTED_EMBEDDING_DIM
            && first_doc.embedding.len() == EXPECTED_EMBEDDING_DIM
        {
            // Manual cosine similarity
            let dot_product: f32 = query_embedding
                .iter()
                .zip(first_doc.embedding.iter())
                .map(|(a, b)| a * b)
                .sum();

            let norm_q: f32 = query_embedding.iter().map(|x| x.powi(2)).sum::<f32>().sqrt();
            let norm_d: f32 = first_doc.embedding.iter().map(|x| x.powi(2)).sum::<f32>().sqrt();

            // Guard against division by zero for all-zero vectors.
            let similarity = if norm_q > 0.0 && norm_d > 0.0 {
                dot_product / (norm_q * norm_d)
            } else {
                0.0
            };

            println!("📊 Manual similarity calculation:");
            println!("   Query embedding norm: {:.6}", norm_q);
            println!("   Doc embedding norm: {:.6}", norm_d);
            println!("   Dot product: {:.6}", dot_product);
            println!("   Cosine similarity: {:.6}", similarity);

            if similarity > 0.1 {
                println!("   ✅ Good similarity - vector search should work");
            } else {
                println!("   ⚠️  Low similarity - but should still appear with threshold 0.0");
            }
        } else {
            // Report why the check was skipped instead of silently doing nothing.
            println!(
                "   ⚠️  Skipping manual similarity: expected {}D embeddings, got query={}D, doc={}D",
                EXPECTED_EMBEDDING_DIM,
                query_embedding.len(),
                first_doc.embedding.len()
            );
        }
    }

    // `docs` is guaranteed non-empty here: the empty case returned right after Step 2.
    println!("\n🎯 CONCLUSION:");
    println!("✅ Documents stored successfully with BGE-M3");
    println!("If search still returns 0 results, the issue is in the vector search logic");

    Ok(())
}