// rag-module 0.6.7
//
// Enterprise RAG module with chat context storage, vector search, session management, and model downloading. Rust implementation with Node.js compatibility.
//! Debug script to trace UUID mismatch step by step

use anyhow::Result;
use rag_module::*;

/// Traces where document IDs diverge between the on-disk collection, a direct
/// vector-store search, and the estate search service.
///
/// Steps:
/// 1. Load the stored documents for `user_id` from the JSON file on disk.
/// 2. Run a direct vector-store search with a dummy embedding.
/// 3. Run the estate search through the search service.
/// 4. Cross-reference the three ID sets.
/// 5. Inspect the document → vector mapping in the vector index file.
///
/// # Errors
///
/// Returns an error if the RAG module cannot be created or initialized, if a
/// file that was read successfully contains invalid JSON, or if either search
/// call fails. A missing documents file or vector index file is reported on
/// stdout and does not abort the run.
#[tokio::main]
async fn main() -> Result<()> {
    println!("🔍 DEBUGGING UUID MISMATCH STEP BY STEP");
    println!("======================================\n");

    let rag = create_rag_module("./test_data").await?;
    rag.initialize().await?;

    let user_id = "test_user_123";

    println!("📋 STEP 1: Check Raw Documents in Collection");
    println!("============================================");

    // NOTE(review): `crate::db` resolves against *this* binary crate; if this file is
    // built as a separate target it probably needs to be `rag_module::db` — confirm.
    if let Some(embedded_store) = rag.vector_store.as_any().downcast_ref::<crate::db::EmbeddedQdrantVectorStore>() {
        embedded_store.set_user_context(user_id).await;
    }

    // Read the documents file exactly once; every later check reuses this parsed value.
    let documents_file = format!("test_data/qdrant-data/{}/aws_estate-documents.json", user_id);
    let documents_json: Option<serde_json::Value> = match std::fs::read_to_string(&documents_file) {
        Ok(content) => Some(serde_json::from_str(&content)?),
        Err(err) => {
            println!("   ❌ Could not read documents file: {} ({})", documents_file, err);
            None
        }
    };

    // Entries of the top-level `documents` array; empty when the file or the key is missing.
    let stored_docs: &[serde_json::Value] = documents_json
        .as_ref()
        .and_then(|json| json.get("documents"))
        .and_then(|docs| docs.as_array())
        .map(Vec::as_slice)
        .unwrap_or(&[]);

    println!("   📊 Total documents in file system: {}", stored_docs.len());

    if !stored_docs.is_empty() {
        println!("   🔍 First 3 document IDs from file system:");
        for (i, doc) in stored_docs.iter().take(3).enumerate() {
            let doc_id = doc.get("id").and_then(|v| v.as_str()).unwrap_or("N/A");
            println!("      {}. {}", i + 1, doc_id);
        }
    }

    // The real set of stored document IDs. Documents without a string `id` are skipped,
    // so this can be smaller than the document count printed above.
    let collection_ids: std::collections::HashSet<&str> = stored_docs
        .iter()
        .filter_map(|doc| doc.get("id").and_then(|v| v.as_str()))
        .collect();

    println!("\n📋 STEP 2: Test Vector Store Search Directly");
    println!("===========================================");

    // Kept from the original flow: the user context is set again right before searching.
    if let Some(embedded_store) = rag.vector_store.as_any().downcast_ref::<crate::db::EmbeddedQdrantVectorStore>() {
        embedded_store.set_user_context(user_id).await;
    }

    let search_options = rag_module::types::SearchOptions {
        limit: Some(5),
        score_threshold: Some(0.001),
        filter: None,
        collection_name: None,
        privacy_level: None,
        with_payload: Some(true),
    };

    println!("   🔍 Performing direct vector search...");
    let direct_results = rag.vector_store.search(
        "aws_estate",
        vec![0.1; 1024], // Dummy embedding
        search_options,
    ).await?;

    let direct_ids: std::collections::HashSet<&str> =
        direct_results.iter().map(|r| r.id.as_str()).collect();

    println!("   📊 Direct vector search returned {} results", direct_results.len());
    println!("   🔍 Direct search result IDs:");
    for (i, result) in direct_results.iter().take(3).enumerate() {
        println!("      {}. {} (score: {:.4})", i + 1, result.id, result.score);

        // Actual membership test against the IDs loaded from disk.
        let exists = collection_ids.contains(result.id.as_str());
        println!("         Exists in file system: {}", exists);
    }

    println!("\n📋 STEP 3: Test Estate Search Function");
    println!("=====================================");

    let search_options = rag_module::services::search_service::EstateSearchOptions {
        resource_types: None,
        account_ids: None,
        regions: None,
        services: None,
        states: None,
        environment: None,
        application: None,
        synced_after: None,
        limit: Some(3),
        score_threshold: Some(0.001),
        include_metadata: true,
        use_anonymous_ids: false,
    };

    println!("   🔍 Performing estate search...");
    let estate_results = rag.search_service.search_estate_resources(
        "aws resource",
        search_options,
        None,
        user_id,
    ).await?;

    let estate_ids: std::collections::HashSet<&str> = estate_results
        .iter()
        .filter_map(|r| r.get("id").and_then(|v| v.as_str()))
        .collect();

    println!("   📊 Estate search returned {} results", estate_results.len());
    println!("   🔍 Estate search result IDs:");
    for (i, result) in estate_results.iter().take(3).enumerate() {
        let result_id = result.get("id").and_then(|v| v.as_str()).unwrap_or("N/A");
        println!("      {}. {}", i + 1, result_id);

        let exists = collection_ids.contains(result_id);
        println!("         Exists in file system: {}", exists);

        let matches_direct = direct_ids.contains(result_id);
        println!("         Matches direct search: {}", matches_direct);
    }

    println!("\n📋 STEP 4: Cross-Reference Analysis");
    println!("===================================");

    println!("   📊 ID Set Sizes:");
    println!("      Collection IDs: {}", collection_ids.len());
    println!("      Direct search IDs: {}", direct_ids.len());
    println!("      Estate search IDs: {}", estate_ids.len());

    let collection_direct_overlap: Vec<_> = collection_ids.intersection(&direct_ids).collect();
    let direct_estate_overlap: Vec<_> = direct_ids.intersection(&estate_ids).collect();
    let collection_estate_overlap: Vec<_> = collection_ids.intersection(&estate_ids).collect();

    println!("\n   📊 ID Overlaps:");
    println!("      Collection ∩ Direct: {} IDs", collection_direct_overlap.len());
    println!("      Direct ∩ Estate: {} IDs", direct_estate_overlap.len());
    println!("      Collection ∩ Estate: {} IDs", collection_estate_overlap.len());

    // An empty collection set makes every overlap trivially empty, so it proves nothing.
    let have_collection_ids = !collection_ids.is_empty();

    if !have_collection_ids {
        println!("      ⚠️  No stored document IDs were loaded - collection overlaps are not meaningful");
    } else if collection_direct_overlap.is_empty() {
        println!("      ❌ CRITICAL: No overlap between collection and direct search!");
        println!("         This means direct vector search returns different IDs than stored docs");
    }

    if direct_estate_overlap.is_empty() {
        println!("      ❌ CRITICAL: No overlap between direct and estate search!");
        println!("         This means estate search transforms/generates new IDs");
    }

    println!("\n📋 STEP 5: Examine Vector Index Mapping");
    println!("=======================================");

    // Built from `user_id` so it stays in sync with the documents path used in step 1.
    let vector_index_file = format!("test_data/qdrant-data/{}/aws_estate-vector-index.json", user_id);
    match std::fs::read_to_string(&vector_index_file) {
        Ok(vector_index_content) => {
            let vector_index: serde_json::Value = serde_json::from_str(&vector_index_content)?;

            if let Some(vectors) = vector_index.get("vectors").and_then(|v| v.as_array()) {
                println!("   📊 Vector index contains {} mappings", vectors.len());
                println!("   🔍 First 3 vector mappings:");

                for (i, vector) in vectors.iter().take(3).enumerate() {
                    let doc_id = vector.get("documentId").and_then(|v| v.as_str()).unwrap_or("N/A");
                    let vector_id = vector.get("vectorId").and_then(|v| v.as_str()).unwrap_or("N/A");

                    println!("      {}. Doc: {} → Vector: {}", i + 1, doc_id, vector_id);

                    // Which of the two IDs (vector vs. document) shows up in each result set?
                    let vector_in_direct = direct_ids.contains(&vector_id);
                    let vector_in_estate = estate_ids.contains(&vector_id);
                    let doc_in_direct = direct_ids.contains(&doc_id);
                    let doc_in_estate = estate_ids.contains(&doc_id);

                    println!("         Vector ID in direct results: {}", vector_in_direct);
                    println!("         Vector ID in estate results: {}", vector_in_estate);
                    println!("         Doc ID in direct results: {}", doc_in_direct);
                    println!("         Doc ID in estate results: {}", doc_in_estate);
                }
            }
        }
        // A missing index must not prevent the diagnosis below from being printed.
        Err(err) => {
            println!("   ❌ Could not read vector index file: {} ({})", vector_index_file, err);
        }
    }

    println!("\n🏁 DIAGNOSIS:");
    println!("=============");

    if !have_collection_ids {
        println!("⚠️  INCONCLUSIVE: no stored document IDs could be loaded from {}", documents_file);
        println!("   Cannot tell whether search results match the stored documents.");
    } else if collection_direct_overlap.is_empty() {
        println!("❌ BUG: Vector search is returning vector IDs instead of document IDs!");
        println!("   The issue is in the vector store search implementation.");
        println!("   It should return document IDs but is returning vector UUIDs.");
    } else if direct_estate_overlap.is_empty() {
        println!("❌ BUG: Estate search is transforming IDs incorrectly!");
        println!("   The vector search works but estate processing breaks the IDs.");
    } else {
        println!("✅ ID mapping appears correct - issue might be elsewhere");
    }

    Ok(())
}