use rag_module::RagModule;
use serde_json::json;
use std::collections::HashMap;
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
tracing_subscriber::fmt::init();
println!("๐งช Testing Hash Collision Fixes");
println!("================================\n");
let rag = RagModule::new("./hash-collision-test-data").await?;
rag.initialize().await?;
let user_id = "collision_test_user";
let collection_name = "hash_test";
let test_documents = vec![
json!({
"content": "EC2 instance i-1234567890abcdef0 running in us-east-1",
"resource_type": "ec2_instance",
"instance_id": "i-1234567890abcdef0",
"instance_type": "t3.micro",
"state": "running",
"region": "us-east-1",
"account_id": "123456789012"
}),
json!({
"content": "EC2 instance i-1234567890abcdef1 running in us-east-1",
"resource_type": "ec2_instance",
"instance_id": "i-1234567890abcdef1", "instance_type": "t3.micro",
"state": "running",
"region": "us-east-1",
"account_id": "123456789012"
}),
json!({
"content": "S3 bucket my-test-bucket-001 for application data storage",
"resource_type": "s3_bucket",
"bucket_name": "my-test-bucket-001",
"region": "us-east-1",
"account_id": "123456789012"
}),
json!({
"content": "S3 bucket my-test-bucket-002 for application data storage",
"resource_type": "s3_bucket",
"bucket_name": "my-test-bucket-002", "region": "us-east-1",
"account_id": "123456789012"
}),
json!({
"content": "RDS MySQL database prod-db-main-01 running version 8.0.35",
"resource_type": "rds_instance",
"db_instance_identifier": "prod-db-main-01",
"engine": "mysql",
"engine_version": "8.0.35",
"instance_class": "db.t3.micro",
"region": "us-east-1",
"account_id": "123456789012"
}),
json!({
"content": "RDS MySQL database prod-db-main-02 running version 8.0.35",
"resource_type": "rds_instance",
"db_instance_identifier": "prod-db-main-02", "engine": "mysql",
"engine_version": "8.0.35",
"instance_class": "db.t3.micro",
"region": "us-east-1",
"account_id": "123456789012"
}),
json!({
"content": "Lambda function user-service-handler processing user requests",
"resource_type": "lambda_function",
"function_name": "user-service-handler",
"runtime": "python3.11",
"memory_size": 128,
"timeout": 30,
"region": "us-east-1",
"account_id": "123456789012"
}),
json!({
"content": "Lambda function user-service-handler processing user responses", "resource_type": "lambda_function",
"function_name": "user-service-handler2", "runtime": "python3.11",
"memory_size": 128,
"timeout": 30,
"region": "us-east-1",
"account_id": "123456789012"
}),
json!({
"content": "Unknown service resource with minimal metadata",
"resource_type": "unknown_service",
"name": "resource-001",
"region": "us-east-1",
"account_id": "123456789012"
}),
json!({
"content": "Unknown service resource with minimal metadata", "resource_type": "unknown_service",
"name": "resource-002", "region": "us-east-1",
"account_id": "123456789012"
}),
];
println!("๐ Testing {} documents for hash collisions...\n", test_documents.len());
let mut generated_ids = HashMap::new();
let mut canonical_ids = Vec::new();
for (idx, doc) in test_documents.iter().enumerate() {
println!("Processing document {}/{}", idx + 1, test_documents.len());
let metadata_obj = doc.as_object().unwrap().clone();
let mut metadata_without_content = metadata_obj.clone();
metadata_without_content.remove("content");
let resource_type = doc.get("resource_type").and_then(|r| r.as_str()).unwrap_or("unknown");
let identifier = doc.get("instance_id")
.or_else(|| doc.get("bucket_name"))
.or_else(|| doc.get("db_instance_identifier"))
.or_else(|| doc.get("function_name"))
.or_else(|| doc.get("name"))
.and_then(|i| i.as_str())
.unwrap_or("unknown");
let simulated_canonical_id = format!("{}:{}", resource_type, identifier);
canonical_ids.push(simulated_canonical_id.clone());
let result = rag.ingest_aws_estate(doc.clone(), user_id, collection_name).await?;
if result.create_result.created > 0 {
println!(" โ
Document ingested successfully");
println!(" Canonical ID: {}", simulated_canonical_id);
} else {
println!(" โ Document ingestion failed: {:?}", result.create_result.failed);
}
}
println!("\n๐ Analysis Results:");
println!("===================");
let mut canonical_counts = HashMap::new();
for id in &canonical_ids {
*canonical_counts.entry(id.clone()).or_insert(0) += 1;
}
let duplicates: Vec<_> = canonical_counts.iter()
.filter(|(_, &count)| count > 1)
.collect();
if duplicates.is_empty() {
println!("โ
No duplicate canonical IDs found - collision prevention working!");
} else {
println!("โ ๏ธ Found {} duplicate canonical IDs:", duplicates.len());
for (id, count) in duplicates {
println!(" - '{}' appears {} times", id, count);
}
}
println!("\n๐ Testing batch ingestion with potential duplicates...");
let duplicate_batch = vec![
test_documents[0].clone(), test_documents[0].clone(), test_documents[1].clone(), ];
let batch_result = rag.ingest_aws_estate_batch(duplicate_batch, user_id, collection_name).await?;
println!("Batch ingestion results:");
println!(" Total resources: {}", batch_result.total_resources);
println!(" Successfully created: {}", batch_result.create_result.created);
println!(" Failed: {}", batch_result.failed_resources);
if batch_result.create_result.created == 2 {
println!("โ
Deduplication working - 3 resources became 2 documents (duplicate removed)");
} else if batch_result.create_result.created == 3 {
println!("โ ๏ธ No deduplication occurred - might indicate issue");
} else {
println!("โ Unexpected result - {} documents created", batch_result.create_result.created);
}
println!("\n๐ Hash collision test completed!");
println!("Check the generated documents to ensure uniqueness.");
Ok(())
}