use rag_module::RagModule;
use serde_json::json;
use tempfile::tempdir;
use tokio;
use anyhow::Result;
/// Ingests the same document twice into one collection and expects the
/// reported document count to stay at 1.
///
/// # Errors
///
/// Propagates any error returned by temp-dir creation or by the `RagModule`
/// calls, which fails the test.
#[tokio::test]
async fn test_content_based_deduplication() -> Result<()> {
    // Keep the guard bound for the whole test: `TempDir` removes the
    // directory when it is dropped.
    let temp_dir = tempdir()?;
    let rag = RagModule::new(temp_dir.path().to_path_buf()).await?;

    let user_id = "test_user_dedup";
    let collection_name = "test_dedup";
    rag.set_user_context(user_id).await?;

    let doc = json!({
        "content": "EC2 instance for web application",
        "service": "EC2",
        "region": "us-east-1"
    });

    // A copy is required here because the identical payload is sent again below.
    rag.ingest_aws_estate(doc.clone(), user_id, collection_name).await?;
    let count_after_first = rag.get_document_count(Some(collection_name), None).await?;
    assert_eq!(count_after_first, 1, "First ingest should store exactly one document");

    // Last use of `doc`: move it instead of cloning a second time.
    rag.ingest_aws_estate(doc, user_id, collection_name).await?;
    let count_after_duplicate = rag.get_document_count(Some(collection_name), None).await?;
    assert_eq!(count_after_duplicate, 1, "Duplicate content should not increase count");

    Ok(())
}
/// Ingests two revisions of a document that share the same `content` string
/// but differ in their other fields (`version` changed, `backup_retention`
/// and `multi_az` added), then expects the collection to report one document.
#[tokio::test]
async fn test_metadata_update_same_content() -> Result<()> {
    let (owner, collection) = ("test_user_metadata", "test_metadata_update");

    // The guard stays bound until the end of the test so the directory persists.
    let scratch = tempdir()?;
    let store = RagModule::new(scratch.path().to_path_buf()).await?;
    store.set_user_context(owner).await?;

    let revisions = [
        json!({
            "content": "RDS PostgreSQL database for user data",
            "service": "RDS",
            "engine": "postgres",
            "version": "13.7"
        }),
        json!({
            "content": "RDS PostgreSQL database for user data",
            "service": "RDS",
            "engine": "postgres",
            "version": "14.9",
            "backup_retention": 7,
            "multi_az": true
        }),
    ];

    // Ingest sequentially, oldest revision first.
    for revision in revisions {
        store.ingest_aws_estate(revision, owner, collection).await?;
    }

    let stored = store.get_document_count(Some(collection), None).await?;
    assert_eq!(stored, 1, "Metadata update should not create new document");

    Ok(())
}
/// Ingests two documents whose `content` strings differ and expects the
/// document count to go from 1 to 2.
#[tokio::test]
async fn test_different_content_creates_new_documents() -> Result<()> {
    const OWNER: &str = "test_user_different";
    const COLLECTION: &str = "test_different_content";

    // Bound for the whole test so the backing directory is not removed early.
    let workdir = tempdir()?;
    let root = workdir.path().to_path_buf();
    let module = RagModule::new(root).await?;
    module.set_user_context(OWNER).await?;

    // First document: an S3 bucket description.
    let bucket = json!({
        "content": "S3 bucket for storing images",
        "service": "S3",
        "bucket_name": "images-bucket"
    });
    module.ingest_aws_estate(bucket, OWNER, COLLECTION).await?;
    assert_eq!(module.get_document_count(Some(COLLECTION), None).await?, 1);

    // Second document: unrelated content, so a new entry is expected.
    let function = json!({
        "content": "Lambda function for image processing",
        "service": "Lambda",
        "runtime": "python3.9"
    });
    module.ingest_aws_estate(function, OWNER, COLLECTION).await?;
    assert_eq!(
        module.get_document_count(Some(COLLECTION), None).await?,
        2,
        "Different content should create new document"
    );

    Ok(())
}
/// Submits one batch of four documents — three sharing the same VPC `content`
/// string with progressively more fields, one with different content — and
/// expects the collection to report two documents afterwards.
#[tokio::test]
async fn test_batch_deduplication() -> Result<()> {
    let owner = "test_user_batch";
    let collection = "test_batch_dedup";

    // Hold the guard for the test's duration; the directory backs `module`.
    let sandbox = tempdir()?;
    let module = RagModule::new(sandbox.path().to_path_buf()).await?;
    module.set_user_context(owner).await?;

    let documents = Vec::from([
        // VPC, baseline fields.
        json!({
            "content": "VPC for production environment",
            "service": "VPC",
            "cidr": "10.0.0.0/16"
        }),
        // Same VPC content, one extra field.
        json!({
            "content": "VPC for production environment",
            "service": "VPC",
            "cidr": "10.0.0.0/16",
            "dns_hostnames": true
        }),
        // The only entry with different content.
        json!({
            "content": "Security group for web servers",
            "service": "EC2",
            "type": "SecurityGroup"
        }),
        // Same VPC content again, two extra fields.
        json!({
            "content": "VPC for production environment",
            "service": "VPC",
            "cidr": "10.0.0.0/16",
            "dns_hostnames": true,
            "dns_resolution": true
        }),
    ]);

    module.ingest_aws_estate_batch(documents, owner, collection).await?;

    let stored = module.get_document_count(Some(collection), None).await?;
    assert_eq!(stored, 2, "Batch should deduplicate and result in 2 unique documents");

    Ok(())
}
/// Ingests two documents with identical `content` (including an identical
/// nested `trust_policy` object) where the second carries an extra
/// `description` field, and expects the count to be 1 after each ingest.
///
/// Each count is asserted right after it is measured, so a failure points at
/// the ingest step that caused it.
///
/// # Errors
///
/// Propagates any error returned by temp-dir creation or by the `RagModule`
/// calls, which fails the test.
#[tokio::test]
async fn test_content_hash_consistency() -> Result<()> {
    // Keep the guard bound for the whole test: `TempDir` removes the
    // directory when it is dropped.
    let temp_dir = tempdir()?;
    let rag = RagModule::new(temp_dir.path().to_path_buf()).await?;

    let user_id = "test_user_consistency";
    let collection_name = "test_consistency";
    rag.set_user_context(user_id).await?;

    let doc1 = json!({
        "content": "IAM role for Lambda execution",
        "service": "IAM",
        "type": "Role",
        "trust_policy": {
            "Version": "2012-10-17",
            "Statement": []
        }
    });
    // Same fields as `doc1` plus a `description`.
    let doc2 = json!({
        "content": "IAM role for Lambda execution",
        "service": "IAM",
        "type": "Role",
        "trust_policy": {
            "Version": "2012-10-17",
            "Statement": []
        },
        "description": "Role for Lambda function execution"
    });

    rag.ingest_aws_estate(doc1, user_id, collection_name).await?;
    let count_after_first = rag.get_document_count(Some(collection_name), None).await?;
    // Fail fast: verify the baseline before the second ingest runs.
    assert_eq!(count_after_first, 1, "First ingest should store exactly one document");

    rag.ingest_aws_estate(doc2, user_id, collection_name).await?;
    let count_after_second = rag.get_document_count(Some(collection_name), None).await?;
    assert_eq!(
        count_after_second, 1,
        "Same content should produce same hash regardless of metadata"
    );

    Ok(())
}
#[tokio::test]
async fn test_large_scale_deduplication() -> Result<()> {
let temp_dir = tempdir()?;
let base_path = temp_dir.path().to_path_buf();
let rag = RagModule::new(base_path).await?;
let user_id = "test_user_large";
let collection_name = "test_large_scale";
rag.set_user_context(user_id).await?;
let mut batch = Vec::new();
for i in 0..50 {
batch.push(json!({
"content": "Shared EBS volume for data storage", "service": "EBS",
"volume_id": format!("vol-{:06}", i), "size": 100 + i, }));
}
for i in 0..50 {
batch.push(json!({
"content": format!("Unique EBS volume {} for application data", i), "service": "EBS",
"volume_id": format!("vol-unique-{:06}", i),
"size": 200 + i,
}));
}
use rand::seq::SliceRandom;
let mut rng = rand::thread_rng();
batch.shuffle(&mut rng);
rag.ingest_aws_estate_batch(batch, user_id, collection_name).await?;
let final_count = rag.get_document_count(Some(collection_name), None).await?;
assert_eq!(final_count, 51, "Large scale deduplication should work correctly");
Ok(())
}