//! rag-module 0.6.7
//!
//! Enterprise RAG module with chat context storage, vector search, session management, and model downloading. Rust implementation with Node.js compatibility.
//!
//! Investigate vector search issue - why only some documents are found

use anyhow::Result;
use rag_module::*;
use rag_module::services::search_service::EstateSearchOptions;

#[tokio::main]
async fn main() -> Result<()> {
    println!("🔍 INVESTIGATING VECTOR SEARCH ISSUE");
    println!("===================================\n");
    
    let rag = create_rag_module("./test_data").await?;
    rag.initialize().await?;
    
    let user_id = "test_user_123";
    
    // First, let's test a completely unfiltered search with a generic query
    println!("🔍 TEST 1: Generic unfiltered search (no service filter)");
    let generic_results = rag.search_service.search_estate_resources(
        "aws cloud instances databases",
        EstateSearchOptions {
            resource_types: None,
            account_ids: None,
            regions: None,
            services: None, // NO SERVICE FILTER
            states: None,
            environment: None,
            application: None,
            synced_after: None,
            limit: Some(20), // Get lots of results
            score_threshold: None, // NO THRESHOLD
            include_metadata: true,
            use_anonymous_ids: false,
        },
        None,
        user_id,
    ).await?;
    
    println!("Found {} generic results:", generic_results.len());
    for (i, result) in generic_results.iter().take(10).enumerate() {
        let id = result.get("id").and_then(|v| v.as_str()).unwrap_or("unknown");
        let service = result.get("service").and_then(|v| v.as_str()).unwrap_or("unknown");
        let score = result.get("score").and_then(|v| v.as_f64()).unwrap_or(0.0);
        
        // Extract resource identifier
        let resource_id = if id.contains("instance:") {
            id.split("instance:").last().unwrap_or("unknown")
        } else if id.contains("db:") {
            id.split("db:").last().unwrap_or("unknown")
        } else if id.contains(":::") {
            id.split(":::").last().unwrap_or("unknown")
        } else {
            id.split("/").last().unwrap_or("unknown")
        };
        
        println!("  {}. Service: {} | Resource: {} | Score: {:.6}", i + 1, service, resource_id, score);
    }
    
    println!("\n{}", "=".repeat(60));
    
    // Test 2: EC2-specific query with no service filter to see what happens
    println!("🔍 TEST 2: EC2 query without service filter");
    let ec2_no_filter = rag.search_service.search_estate_resources(
        "ec2 instances running stopped",
        EstateSearchOptions {
            resource_types: None,
            account_ids: None,
            regions: None,
            services: None, // NO SERVICE FILTER
            states: None,
            environment: None,
            application: None,
            synced_after: None,
            limit: Some(20),
            score_threshold: None,
            include_metadata: true,
            use_anonymous_ids: false,
        },
        None,
        user_id,
    ).await?;
    
    println!("Found {} results without service filter:", ec2_no_filter.len());
    for (i, result) in ec2_no_filter.iter().take(10).enumerate() {
        let id = result.get("id").and_then(|v| v.as_str()).unwrap_or("unknown");
        let service = result.get("service").and_then(|v| v.as_str()).unwrap_or("unknown");
        let score = result.get("score").and_then(|v| v.as_f64()).unwrap_or(0.0);
        
        let resource_id = if id.contains("instance:") {
            id.split("instance:").last().unwrap_or("unknown")
        } else {
            id.split("/").last().unwrap_or("unknown")
        };
        
        println!("  {}. Service: {} | Resource: {} | Score: {:.6}", i + 1, service, resource_id, score);
    }
    
    println!("\n{}", "=".repeat(60));
    
    // Test 3: Same EC2 query WITH service filter 
    println!("🔍 TEST 3: Same EC2 query WITH service filter");
    let ec2_with_filter = rag.search_service.search_estate_resources(
        "ec2 instances running stopped",
        EstateSearchOptions {
            resource_types: None,
            account_ids: None,
            regions: None,
            services: Some(vec!["ec2".to_string()]), // WITH SERVICE FILTER
            states: None,
            environment: None,
            application: None,
            synced_after: None,
            limit: Some(20),
            score_threshold: None,
            include_metadata: true,
            use_anonymous_ids: false,
        },
        None,
        user_id,
    ).await?;
    
    println!("Found {} results WITH service filter:", ec2_with_filter.len());
    for (i, result) in ec2_with_filter.iter().take(10).enumerate() {
        let id = result.get("id").and_then(|v| v.as_str()).unwrap_or("unknown");
        let service = result.get("service").and_then(|v| v.as_str()).unwrap_or("unknown");
        let score = result.get("score").and_then(|v| v.as_f64()).unwrap_or(0.0);
        
        let resource_id = if id.contains("instance:") {
            id.split("instance:").last().unwrap_or("unknown")
        } else {
            id.split("/").last().unwrap_or("unknown")
        };
        
        println!("  {}. Service: {} | Resource: {} | Score: {:.6}", i + 1, service, resource_id, score);
    }
    
    println!("\n📊 COMPARISON:");
    println!("  Without service filter: {} results", ec2_no_filter.len());
    println!("  With service filter: {} results", ec2_with_filter.len());
    
    if ec2_no_filter.len() > ec2_with_filter.len() {
        println!("  ❌ Service filter is removing valid results!");
    } else if ec2_no_filter.len() < 4 {
        println!("  ❌ Vector search itself is not finding all EC2 instances!");
    } else {
        println!("  ✅ Issue is likely elsewhere");
    }
    
    Ok(())
}