use rag_module::{RagModule, SearchOptions};
use serde_json::json;
use std::path::PathBuf;
use anyhow::Result;
/// Manual verification script for content-based deduplication in `RagModule`.
///
/// The script:
/// 1. wipes `./verify_dedup_test` and creates a `RagModule` rooted there,
/// 2. prints locally computed content hashes / document IDs for three test
///    documents (two share the same `content`, one differs),
/// 3. ingests the three documents one at a time, printing the collection's
///    document count after each ingest,
/// 4. compares the counts against the expected `1, 1, 2`, and
/// 5. dumps the keys/IDs found in the on-disk storage file, if it exists.
///
/// Results are reported on stdout only: a count mismatch prints a warning but
/// the process still returns `Ok(())`.
///
/// # Errors
///
/// Returns an error if creating the `RagModule`, setting the user context,
/// ingesting a document, or querying the document count fails.
#[tokio::main]
async fn main() -> Result<()> {
    println!("🔍 Verifying Deduplication with Content Hashes");
    println!("===============================================");

    // Start from an empty directory so the counts below are not polluted by a
    // previous run. Cleanup stays best-effort, but a failure is reported
    // because stale data would invalidate the expected counts.
    let base_path = PathBuf::from("./verify_dedup_test");
    match std::fs::remove_dir_all(&base_path) {
        Ok(()) => {}
        // Nothing to clean up on a first run.
        Err(err) if err.kind() == std::io::ErrorKind::NotFound => {}
        Err(err) => eprintln!(
            "⚠️ Could not remove {}: {} (stale data may skew the results)",
            base_path.display(),
            err
        ),
    }

    let rag = RagModule::new(base_path.clone()).await?;
    let user_id = "verify_user";
    let collection_name = "verify_estate";
    rag.set_user_context(user_id).await?;
    println!("✅ Fresh RAG instance created");

    // doc1 and doc2 share `rds_content` but differ in their other fields;
    // doc3 carries different content.
    let rds_content =
        "RDS instance test-db, id test-db, region us-east-1, engine mysql, class db.t3.micro";
    let s3_content = "Different S3 bucket for storage";
    let doc1 = json!({
        "content": rds_content,
        "service": "RDS",
        "region": "us-east-1",
        "instance_id": "test-db-1"
    });
    let doc2 = json!({
        "content": rds_content,
        "service": "RDS",
        "region": "us-east-1",
        "instance_id": "test-db-2",
        "updated": true
    });
    let doc3 = json!({
        "content": s3_content,
        "service": "S3",
        "bucket_name": "test-bucket"
    });

    // NOTE(review): these IDs are computed locally with `DefaultHasher`; whether
    // `RagModule` derives its document IDs the same way is not visible here.
    println!("\n📋 Test Content Hashes:");
    let hash1 = content_hash(rds_content);
    let doc_id1 = format!("{}-{:x}", collection_name, hash1);
    println!(" Content 1: {:x} → {}", hash1, doc_id1);
    let hash2 = content_hash(rds_content);
    let doc_id2 = format!("{}-{:x}", collection_name, hash2);
    println!(" Content 2: {:x} → {}", hash2, doc_id2);
    let hash3 = content_hash(s3_content);
    let doc_id3 = format!("{}-{:x}", collection_name, hash3);
    println!(" Content 3: {:x} → {}", hash3, doc_id3);

    if doc_id1 == doc_id2 {
        println!("✅ Same content generates same document ID");
    } else {
        println!("❌ Same content generates different document IDs");
    }
    if doc_id1 != doc_id3 {
        println!("✅ Different content generates different document ID");
    } else {
        println!("❌ Different content generates same document ID");
    }

    // Ingest one document at a time and sample the count after each step so
    // the effect of every ingest can be seen individually.
    println!("\n📋 Ingesting Documents:");
    println!("🔄 Ingesting document 1...");
    rag.ingest_aws_estate(doc1, user_id, collection_name).await?;
    let count1 = rag.get_document_count(Some(collection_name), None).await?;
    println!(" Document count: {}", count1);

    println!("🔄 Ingesting document 2 (same content)...");
    rag.ingest_aws_estate(doc2, user_id, collection_name).await?;
    let count2 = rag.get_document_count(Some(collection_name), None).await?;
    println!(" Document count: {}", count2);

    println!("🔄 Ingesting document 3 (different content)...");
    rag.ingest_aws_estate(doc3, user_id, collection_name).await?;
    let count3 = rag.get_document_count(Some(collection_name), None).await?;
    println!(" Document count: {}", count3);

    // Expected progression: 1 (first doc), 1 (duplicate ignored), 2 (new content).
    println!("\n📊 Final Results:");
    if count1 == 1 && count2 == 1 && count3 == 2 {
        println!("🎉 PERFECT! Deduplication working correctly:");
        println!(" - Same content: {} docs (expected 1)", count2);
        println!(" - + Different content: {} docs (expected 2)", count3);
    } else {
        println!("⚠️ Results need investigation:");
        println!(" - After doc 1: {}", count1);
        println!(" - After doc 2 (duplicate): {}", count2);
        println!(" - After doc 3 (different): {}", count3);
    }

    let storage_file = base_path
        .join("qdrant-data")
        .join(user_id)
        .join(format!("{}-documents.json", collection_name));
    report_storage_file(&storage_file);

    Ok(())
}

/// Hashes `content` with the standard library's `DefaultHasher` and returns the 64-bit digest.
fn content_hash(content: &str) -> u64 {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    let mut hasher = DefaultHasher::new();
    content.hash(&mut hasher);
    hasher.finish()
}

/// Prints the document keys, their `id` fields, and the `count` field found in `storage_file`.
///
/// Does nothing when the file does not exist. Read or parse failures are
/// reported on stderr instead of being silently ignored.
fn report_storage_file(storage_file: &std::path::Path) {
    if !storage_file.exists() {
        return;
    }
    println!("\n📁 Storage File Check:");
    println!(" Path: {}", storage_file.display());

    let raw = match std::fs::read_to_string(storage_file) {
        Ok(raw) => raw,
        Err(err) => {
            eprintln!(" ⚠️ Could not read storage file: {}", err);
            return;
        }
    };
    let json_data: serde_json::Value = match serde_json::from_str(&raw) {
        Ok(value) => value,
        Err(err) => {
            eprintln!(" ⚠️ Storage file is not valid JSON: {}", err);
            return;
        }
    };

    // Only an object-valued "documents" field is listed; entries without an
    // "id" field are skipped.
    if let Some(docs) = json_data.get("documents").and_then(|d| d.as_object()) {
        println!(" Document keys in storage: {}", docs.len());
        for (key, doc) in docs {
            if let Some(id) = doc.get("id") {
                println!(" - Key: {} → ID: {}", key, id);
            }
        }
    }
    if let Some(count) = json_data.get("count") {
        println!(" Storage count field: {}", count);
    }
}