use anyhow::Result;
use serde_json::json;
use rag_module::*;
use rag_module::services::search_service::EstateSearchOptions;
/// End-to-end smoke test: store freshly generated AWS estate documents, then
/// immediately read them back and search them.
///
/// Steps performed, in order:
/// 1. Remove any leftover `fresh-storage-test` directory under the current
///    working directory and initialise the RAG module on it.
/// 2. Store a small hand-written RDS/Lambda estate via `process_aws_estate`.
/// 3. List the `aws_estate` collection and print basic statistics for each
///    stored embedding.
/// 4. Run several search queries with decreasing score thresholds.
/// 5. Compute a cosine similarity by hand between a query embedding and the
///    first stored document.
///
/// # Errors
///
/// Returns an error if the current directory cannot be determined or if any
/// RAG-module call (creation, initialisation, storage, listing, search,
/// embedding generation) fails. An empty collection is reported on stdout and
/// ends the run early with `Ok(())`.
#[tokio::main]
async fn main() -> Result<()> {
    // Thresholds used purely for the human-readable verdicts printed below.
    const MIN_REAL_VARIANCE: f32 = 0.001;
    const MIN_GOOD_SIMILARITY: f32 = 0.1;

    println!("🆕 Fresh Storage + Search Test");
    println!("===============================");
    println!("This test will store NEW documents with BGE-M3 and immediately search them");

    let base_path = std::env::current_dir()?.join("fresh-storage-test");
    // Remove in one step instead of `exists()` + remove: a missing directory
    // is the expected case, anything else is surfaced instead of swallowed,
    // because leftover data would defeat the "fresh" premise of this test.
    match std::fs::remove_dir_all(&base_path) {
        Ok(()) => {}
        Err(err) if err.kind() == std::io::ErrorKind::NotFound => {}
        Err(err) => eprintln!(
            "⚠️ Could not remove previous test directory {}: {}",
            base_path.display(),
            err
        ),
    }

    let rag = create_rag_module(base_path).await?;
    rag.initialize().await?;
    let user_id = "fresh_test_user";

    println!("\n📝 Step 1: Store RDS data with current BGE-M3 model");
    let rds_data = json!([{
        "account_id": "123456789012",
        "account_name": "Test Production Account",
        "services": {
            "rds": {
                "instances": [
                    {
                        "db_instance_identifier": "prod-mysql-db",
                        "db_name": "production",
                        "engine": "mysql",
                        "engine_version": "8.0.35",
                        "db_instance_class": "db.t3.medium",
                        "allocated_storage": 100,
                        "storage_type": "gp2",
                        "multi_az": true,
                        "publicly_accessible": false,
                        "description": "Production MySQL database for user authentication and core application data",
                        "tags": {
                            "Environment": "production",
                            "Application": "web-app",
                            "Owner": "backend-team"
                        }
                    },
                    {
                        "db_instance_identifier": "analytics-postgres-db",
                        "db_name": "analytics",
                        "engine": "postgres",
                        "engine_version": "15.4",
                        "db_instance_class": "db.r5.large",
                        "allocated_storage": 500,
                        "storage_type": "gp3",
                        "multi_az": false,
                        "publicly_accessible": false,
                        "description": "PostgreSQL database for analytics, reporting, and business intelligence queries",
                        "tags": {
                            "Environment": "production",
                            "Application": "analytics",
                            "Owner": "data-team"
                        }
                    }
                ]
            },
            "lambda": {
                "functions": [
                    {
                        "function_name": "db-backup-scheduler",
                        "runtime": "python3.9",
                        "description": "Lambda function that schedules and manages RDS database backups"
                    }
                ]
            }
        }
    }]);

    println!("Adding AWS estate data...");
    let doc_ids = rag.process_aws_estate(rds_data, user_id).await?;
    println!("✅ Stored {} documents with BGE-M3 embeddings", doc_ids.len());

    println!("\n⏳ Waiting for storage and indexing...");
    tokio::time::sleep(tokio::time::Duration::from_secs(2)).await;

    println!("\n🔍 Step 2: Verify documents are stored correctly");
    let docs = rag.get_collection_documents("aws_estate", user_id).await?;
    println!("Documents in collection: {}", docs.len());
    for (i, doc) in docs.iter().enumerate() {
        let dim = doc.embedding.len();
        // `short_id` never panics, unlike slicing the id at a fixed byte offset.
        println!(" Doc {}: ID={}, Content={}chars, Embedding={}D",
            i + 1, short_id(&doc.id), doc.content.len(), dim);
        if dim == EXPECTED_EMBEDDING_DIM {
            let (avg, variance) = embedding_stats(&doc.embedding);
            println!(" Embedding: avg={:.6}, variance={:.6}", avg, variance);
            if variance > MIN_REAL_VARIANCE {
                println!(" ✅ Good variance - real BGE-M3 embedding");
            } else {
                println!(" ⚠️ Low variance - might be dummy");
            }
        } else {
            println!(" ❌ Wrong embedding dimensions: {}", dim);
        }
    }
    if docs.is_empty() {
        println!("❌ No documents stored! This is the problem.");
        return Ok(());
    }

    println!("\n🔍 Step 3: Test multiple search queries with different thresholds");
    let test_queries = [
        ("RDS database instances", 0.3),
        ("RDS database instances", 0.1),
        ("RDS database instances", 0.0),
        ("MySQL database", 0.0),
        ("database", 0.0),
        ("production", 0.0),
    ];
    for (query, threshold) in test_queries {
        println!("\n🔎 Query: '{}' (threshold: {})", query, threshold);
        let search_options = EstateSearchOptions {
            resource_types: None,
            account_ids: None,
            regions: None,
            services: None,
            states: None,
            environment: None,
            application: None,
            synced_after: None,
            limit: Some(5),
            score_threshold: Some(threshold),
            include_metadata: true,
            use_anonymous_ids: false,
        };
        let results = rag.search_service
            .search_estate_resources(query, search_options, None)
            .await?;
        println!(" Results: {}", results.len());
        for (i, result) in results.iter().enumerate() {
            let service = result.get("service").and_then(|s| s.as_str()).unwrap_or("unknown");
            match result.get("score").and_then(|s| s.as_f64()) {
                Some(score) => println!(" {}: {} (score: {:.4})", i + 1, service, score),
                // Keep score-less hits visible so the listing matches the count above.
                None => println!(" {}: {} (score: n/a)", i + 1, service),
            }
        }
        if results.is_empty() && threshold == 0.0 {
            println!(" ❌ Even with threshold 0.0, no results! Vector search has an issue.");
        } else if !results.is_empty() {
            println!(" ✅ Found results! Threshold {} works.", threshold);
        }
    }

    println!("\n📊 Step 4: Manual similarity check");
    if let Some(first_doc) = docs.first() {
        println!("Testing manual embedding generation and similarity...");
        let query = "RDS database";
        let query_embedding = rag.embedding_service.generate_embedding(query).await?;
        let query_dim = query_embedding.len();
        let doc_dim = first_doc.embedding.len();
        if query_dim == EXPECTED_EMBEDDING_DIM && doc_dim == EXPECTED_EMBEDDING_DIM {
            let (dot_product, norm_q, norm_d) =
                dot_and_norms(&query_embedding, &first_doc.embedding);
            // A zero-length vector has no direction; report 0 instead of NaN.
            let similarity = if norm_q > 0.0 && norm_d > 0.0 {
                dot_product / (norm_q * norm_d)
            } else {
                0.0
            };
            println!("📊 Manual similarity calculation:");
            println!(" Query embedding norm: {:.6}", norm_q);
            println!(" Doc embedding norm: {:.6}", norm_d);
            println!(" Dot product: {:.6}", dot_product);
            println!(" Cosine similarity: {:.6}", similarity);
            if similarity > MIN_GOOD_SIMILARITY {
                println!(" ✅ Good similarity - vector search should work");
            } else {
                println!(" ⚠️ Low similarity - but should still appear with threshold 0.0");
            }
        } else {
            // Previously this case printed nothing at all.
            println!(
                " ⚠️ Skipping manual similarity: expected {}D embeddings, got query={}D, doc={}D",
                EXPECTED_EMBEDDING_DIM, query_dim, doc_dim
            );
        }
    }

    println!("\n🎯 CONCLUSION:");
    // An empty collection already returned early above, so storage succeeded here.
    println!("✅ Documents stored successfully with BGE-M3");
    println!("If search still returns 0 results, the issue is in the vector search logic");
    Ok(())
}

/// Embedding length this test treats as correct; the script's own output
/// attributes this size to the BGE-M3 model.
const EXPECTED_EMBEDDING_DIM: usize = 1024;

/// Number of leading characters of a document id shown in the listing.
const SHORT_ID_CHARS: usize = 8;

/// Returns the first [`SHORT_ID_CHARS`] characters of `id`, or all of `id`
/// when it is shorter. Cuts on a character boundary, so it cannot panic.
fn short_id(id: &str) -> &str {
    match id.char_indices().nth(SHORT_ID_CHARS) {
        Some((end, _)) => &id[..end],
        None => id,
    }
}

/// Returns `(mean, population variance)` of `embedding`, or `(0.0, 0.0)` for
/// an empty slice.
fn embedding_stats(embedding: &[f32]) -> (f32, f32) {
    if embedding.is_empty() {
        return (0.0, 0.0);
    }
    let count = embedding.len() as f32;
    let mean = embedding.iter().sum::<f32>() / count;
    let variance = embedding.iter().map(|x| (x - mean).powi(2)).sum::<f32>() / count;
    (mean, variance)
}

/// Returns `(dot product, Euclidean norm of a, Euclidean norm of b)`.
///
/// The dot product pairs elements positionally and stops at the shorter slice.
fn dot_and_norms(a: &[f32], b: &[f32]) -> (f32, f32, f32) {
    let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
    let norm_a = a.iter().map(|x| x.powi(2)).sum::<f32>().sqrt();
    let norm_b = b.iter().map(|x| x.powi(2)).sum::<f32>().sqrt();
    (dot, norm_a, norm_b)
}