//! rag-module 0.6.7
//!
//! Enterprise RAG module with chat context storage, vector search, session management, and model downloading. Rust implementation with Node.js compatibility.
//!
//! Debug the vector store indexing vs search mismatch

use anyhow::Result;
use rag_module::*;

/// Root data directory passed to `create_rag_module`; the on-disk vector index
/// file is looked up beneath it.
const DATA_DIR: &str = "./test_data";

/// Name of the collection whose documents are compared against search results.
const COLLECTION: &str = "aws_estate";

/// Prints the number of entries in the JSON vector index at `index_path`,
/// followed by the first five entries (document ID, vector ID, position).
///
/// A missing file, or a JSON document without a top-level `vectors` array, is
/// reported on stdout instead of being skipped silently, so the debug output
/// always says what was (or was not) found.
///
/// # Errors
///
/// Returns an error if the file exists but cannot be read, or if its contents
/// are not valid JSON.
fn print_vector_index_summary(index_path: &std::path::Path) -> Result<()> {
    if !index_path.exists() {
        println!("   ⚠️  Vector index file not found: {}", index_path.display());
        return Ok(());
    }

    let content = std::fs::read_to_string(index_path)?;
    let index_data: serde_json::Value = serde_json::from_str(&content)?;

    let vectors = match index_data.get("vectors").and_then(|v| v.as_array()) {
        Some(vectors) => vectors,
        None => {
            println!(
                "   ⚠️  Vector index has no \"vectors\" array: {}",
                index_path.display()
            );
            return Ok(());
        }
    };

    println!("   📊 Vector index contains {} vectors", vectors.len());

    // Show first 5 vector entries; absent fields fall back to "N/A" / 0.
    for (i, vector) in vectors.iter().take(5).enumerate() {
        let doc_id = vector.get("documentId").and_then(|v| v.as_str()).unwrap_or("N/A");
        let vector_id = vector.get("vectorId").and_then(|v| v.as_str()).unwrap_or("N/A");
        let position = vector.get("position").and_then(|v| v.as_u64()).unwrap_or(0);

        println!("   {}. Doc: {} → Vector: {} (pos: {})", i + 1, doc_id, vector_id, position);
    }

    Ok(())
}

/// Debug entry point: compares what is stored for the `aws_estate` collection
/// with what searches against it return, and prints a diagnosis.
///
/// Steps, in order:
/// 1. Summarise the raw vector index JSON file on disk.
/// 2. List the documents returned by `get_collection_documents`.
/// 3. Run a vector search with a dummy query vector and check each hit's ID
///    against the collection documents.
/// 4. Cross-reference the two ID sets (intersection and both differences).
/// 5. Run `search_estate_resources` and check its result IDs the same way.
///
/// # Errors
///
/// Propagates any error from module creation/initialisation, file or JSON
/// reading, or the search calls.
#[tokio::main]
async fn main() -> Result<()> {
    println!("🔍 DEBUGGING VECTOR STORE INDEXING VS SEARCH");
    println!("============================================\n");

    let rag = create_rag_module(DATA_DIR).await?;
    rag.initialize().await?;

    let user_id = "test_user_123";

    println!("📋 STEP 1: Check Vector Store Raw Contents");
    println!("==========================================");

    // Build the index path from the same data dir, user and collection used by
    // the rest of this script so the three can never drift apart.
    let vector_index_file = std::path::Path::new(DATA_DIR)
        .join("qdrant-data")
        .join(user_id)
        .join(format!("{}-vector-index.json", COLLECTION));
    print_vector_index_summary(&vector_index_file)?;

    println!("\n📋 STEP 2: Check Document Collection");
    println!("====================================");

    let collection_docs = rag.get_collection_documents(COLLECTION, user_id).await?;
    println!("   📊 Collection documents: {}", collection_docs.len());

    // Show first 5 document IDs
    for (i, doc) in collection_docs.iter().take(5).enumerate() {
        println!("   {}. Document ID: {}", i + 1, doc.id);
    }

    println!("\n📋 STEP 3: Perform Vector Search and Trace IDs");
    println!("==============================================");

    // Use the vector store directly to see what it returns
    let search_options = rag_module::types::SearchOptions {
        limit: Some(10),
        score_threshold: Some(0.001),
        filter: None,
        collection_name: None,
        privacy_level: None,
        with_payload: Some(true),
    };

    let search_results = rag.vector_store.search(
        COLLECTION,
        vec![0.1; 1024], // Dummy query vector
        search_options,
    ).await?;

    println!("   📊 Vector search returned {} results", search_results.len());

    // Show the IDs returned by vector search
    for (i, result) in search_results.iter().take(5).enumerate() {
        println!("   {}. Vector search result ID: {} (score: {:.4})",
                i + 1, result.id, result.score);

        // Check if this ID exists in our collection
        let exists_in_collection = collection_docs.iter().any(|doc| doc.id == result.id);
        println!("      Exists in collection: {}", exists_in_collection);
    }

    println!("\n📋 STEP 4: Cross-Reference Analysis");
    println!("===================================");

    // Check if document IDs from collection match vector search results
    let collection_ids: std::collections::HashSet<_> = collection_docs.iter().map(|doc| &doc.id).collect();
    let search_result_ids: std::collections::HashSet<_> = search_results.iter().map(|result| &result.id).collect();

    let intersection: Vec<_> = collection_ids.intersection(&search_result_ids).collect();
    let collection_only: Vec<_> = collection_ids.difference(&search_result_ids).collect();
    let search_only: Vec<_> = search_result_ids.difference(&collection_ids).collect();

    println!("   📊 ID Analysis:");
    println!("      Documents in both collection & search: {}", intersection.len());
    println!("      Documents only in collection: {}", collection_only.len());
    println!("      Documents only in search results: {}", search_only.len());

    if !collection_only.is_empty() {
        println!("\n   🔍 Collection-only documents (first 3):");
        for (i, id) in collection_only.iter().take(3).enumerate() {
            println!("      {}. {}", i + 1, id);
        }
    }

    if !search_only.is_empty() {
        println!("\n   🔍 Search-only documents (first 3):");
        for (i, id) in search_only.iter().take(3).enumerate() {
            println!("      {}. {}", i + 1, id);
        }
    }

    println!("\n📋 STEP 5: Debug Estate Search Function");
    println!("======================================");

    // Test the estate search function specifically
    let estate_search_options = rag_module::services::search_service::EstateSearchOptions {
        resource_types: None,
        account_ids: None,
        regions: None,
        services: None,
        states: None,
        environment: None,
        application: None,
        synced_after: None,
        limit: Some(5),
        score_threshold: Some(0.001),
        include_metadata: true,
        use_anonymous_ids: false,
    };

    let estate_results = rag.search_service.search_estate_resources(
        "aws resources",
        estate_search_options,
        None,
        user_id,
    ).await?;

    println!("   📊 Estate search returned {} results", estate_results.len());

    for (i, result) in estate_results.iter().take(3).enumerate() {
        let result_id = result.get("id").and_then(|v| v.as_str()).unwrap_or("N/A");
        let service = result.get("service").and_then(|v| v.as_str()).unwrap_or("null");
        let resource_type = result.get("resource_type").and_then(|v| v.as_str()).unwrap_or("null");

        println!("   {}. Estate result ID: {} (service: {}, type: {})",
                i + 1, result_id, service, resource_type);

        // Check if this matches our collection
        let exists_in_collection = collection_docs.iter().any(|doc| doc.id == result_id);
        println!("      Exists in collection: {}", exists_in_collection);
    }

    println!("\n🏁 DIAGNOSIS:");
    println!("=============");

    // An empty intersection only signals an ID mismatch when both sides
    // actually contain IDs; report the empty-input cases separately so the
    // "different dataset" diagnosis is not printed for them.
    if collection_ids.is_empty() {
        println!("⚠️  Collection returned no documents, so there is nothing to compare against.");
    } else if search_result_ids.is_empty() {
        println!("⚠️  Vector search returned no results, so ID overlap cannot be assessed.");
    } else if intersection.is_empty() {
        println!("❌ CRITICAL: No overlap between indexed documents and search results!");
        println!("   This indicates the vector search is accessing a different dataset");
        println!("   than what was indexed. Possible causes:");
        println!("   - User context isolation bug");
        println!("   - Collection name mismatch");
        println!("   - Vector store corruption");
    } else {
        println!("✅ Some documents found in both collection and search");
    }

    Ok(())
}