use anyhow::Result;
use serde_json::json;
use rag_module::*;
use rag_module::services::search_service::EstateSearchOptions;
/// Runs a four-step diagnosis of vector search over the `aws_estate` collection
/// and prints the findings to stdout.
///
/// 1. Lists the documents stored for `demo_user` (seeding one test RDS record
///    when the collection is empty) and prints statistics for each embedding.
/// 2. Generates an embedding for a fixed query string.
/// 3. Computes the cosine similarity between the query embedding and the first
///    document's embedding by hand.
/// 4. Runs the search service with a score threshold of 0.0 and prints the
///    score of every result.
///
/// # Errors
///
/// Returns the first error propagated by `?`: resolving the current directory,
/// creating or initializing the RAG module, reading or seeding the collection,
/// generating the query embedding, or running the search.
#[tokio::main]
async fn main() -> Result<()> {
    // Embedding length this tool treats as correct; any other length is reported.
    const EXPECTED_DIM: usize = 1024;
    // Lossless cast: 1024 is exactly representable as an f32.
    let dim = EXPECTED_DIM as f32;

    println!("🐛 Vector Search Debug Analysis");
    println!("================================");

    let base_path = std::env::current_dir()?.join("working-demo");
    let rag = create_rag_module(base_path).await?;
    rag.initialize().await?;
    let user_id = "demo_user";

    println!("📊 Step 1: Check documents in aws_estate collection");
    // `mut` so the reload after seeding replaces this binding. Re-declaring it
    // with `let` inside the `if` would only shadow it for that inner scope and
    // leave the steps below looking at the empty list.
    let mut docs = rag.get_collection_documents("aws_estate", user_id).await?;
    println!("Documents found: {}", docs.len());

    if docs.is_empty() {
        println!("❌ No documents! Adding test RDS data...");
        let test_data = json!([{
            "account_id": "123456789012",
            "services": {
                "rds": {
                    "instances": [{
                        "db_instance_identifier": "test-rds-1",
                        "engine": "mysql",
                        "description": "Test RDS MySQL database instance for debugging"
                    }]
                }
            }
        }]);
        rag.process_aws_estate(test_data, user_id).await?;
        // Short pause before re-reading the collection.
        tokio::time::sleep(tokio::time::Duration::from_secs(1)).await;
        docs = rag.get_collection_documents("aws_estate", user_id).await?;
        println!("✅ Added {} documents", docs.len());
    }

    for (i, doc) in docs.iter().enumerate() {
        println!("\n📄 Document {}:", i + 1);
        println!(" ID: {}", doc.id);
        println!(" Content length: {}", doc.content.len());
        println!(" Embedding dimensions: {}", doc.embedding.len());

        if doc.embedding.len() == EXPECTED_DIM {
            let sum: f32 = doc.embedding.iter().sum();
            let avg = sum / dim;
            // The turbofish is required: without it the result type of `sum`
            // cannot be inferred through the division.
            let variance = doc
                .embedding
                .iter()
                .map(|x| (x - avg).powi(2))
                .sum::<f32>()
                / dim;
            let min = doc.embedding.iter().copied().fold(f32::INFINITY, f32::min);
            let max = doc
                .embedding
                .iter()
                .copied()
                .fold(f32::NEG_INFINITY, f32::max);

            println!(" 📊 Embedding stats:");
            println!(" Sum: {:.6}, Avg: {:.6}, Variance: {:.6}", sum, avg, variance);
            println!(" Range: {:.6} to {:.6}", min, max);

            if variance < 0.0001 {
                println!(" ⚠️ WARNING: Very low variance - might be dummy/random embedding");
            } else {
                println!(" ✅ Good embedding variance - looks like real BGE-M3");
            }
        } else {
            println!(
                " ❌ Wrong embedding size: expected {}, got {}",
                EXPECTED_DIM,
                doc.embedding.len()
            );
        }
    }

    println!("\n🔍 Step 2: Test embedding generation for query");
    let query = "RDS database instances";
    println!("Query: '{}'", query);
    let query_embedding = rag.embedding_service.generate_embedding(query).await?;
    println!("✅ Query embedding generated: {} dimensions", query_embedding.len());

    if query_embedding.len() == EXPECTED_DIM {
        let sum: f32 = query_embedding.iter().sum();
        let avg = sum / dim;
        println!("📊 Query embedding stats: Sum: {:.6}, Avg: {:.6}", sum, avg);
    }

    println!("\n🧮 Step 3: Manual similarity calculation");
    if let Some(first_doc) = docs.first() {
        if first_doc.embedding.len() == EXPECTED_DIM && query_embedding.len() == EXPECTED_DIM {
            let dot_product: f32 = query_embedding
                .iter()
                .zip(first_doc.embedding.iter())
                .map(|(a, b)| a * b)
                .sum();
            let norm_query = query_embedding.iter().map(|x| x * x).sum::<f32>().sqrt();
            let norm_doc = first_doc
                .embedding
                .iter()
                .map(|x| x * x)
                .sum::<f32>()
                .sqrt();
            // A zero-length vector has no direction; report 0.0 instead of
            // dividing by zero.
            let similarity = if norm_query > 0.0 && norm_doc > 0.0 {
                dot_product / (norm_query * norm_doc)
            } else {
                0.0
            };

            println!("🧮 Manual calculation:");
            println!(" Dot product: {:.6}", dot_product);
            println!(" Query norm: {:.6}", norm_query);
            println!(" Doc norm: {:.6}", norm_doc);
            println!(" Cosine similarity: {:.6}", similarity);

            if similarity > 0.3 {
                println!(" ✅ Good similarity - should pass 0.3 threshold");
            } else if similarity > 0.1 {
                println!(" ⚠️ Low similarity - would need threshold ≤ {:.3}", similarity);
            } else {
                println!(" ❌ Very low similarity - embeddings might be random");
            }
        } else {
            // Make the skip visible rather than silently printing nothing.
            println!(
                " ⚠️ Skipped: query or document embedding is not {} dimensions",
                EXPECTED_DIM
            );
        }
    } else {
        println!(" ⚠️ Skipped: no documents available to compare against");
    }

    println!("\n🔍 Step 4: Actual vector search test");
    let search_options = EstateSearchOptions {
        resource_types: None,
        account_ids: None,
        regions: None,
        services: None,
        states: None,
        environment: None,
        application: None,
        synced_after: None,
        limit: Some(5),
        // Threshold 0.0 so no result is rejected because of its score.
        score_threshold: Some(0.0),
        include_metadata: true,
        use_anonymous_ids: false,
    };
    println!("Testing with threshold 0.0 (accept all)...");
    let results = rag
        .search_service
        .search_estate_resources(query, search_options, None)
        .await?;
    println!("✅ Vector search results: {}", results.len());

    for (i, result) in results.iter().enumerate() {
        // Results without a numeric "score" entry are not listed.
        if let Some(score) = result.get("score").and_then(|s| s.as_f64()) {
            println!(" {}: Score {:.6}", i + 1, score);
        }
    }

    if results.is_empty() {
        println!("\n❌ Still 0 results! The issue is likely:");
        println!("1. Documents have random/dummy embeddings (not real BGE-M3)");
        println!("2. Vector search logic has a bug");
        println!("3. Documents aren't being loaded properly");
    } else {
        println!("\n✅ Vector search is working! Original issue was threshold too high.");
    }

    Ok(())
}