use ahash::AHashMap;
use serde_json::json;
use std::collections::HashMap;
use std::path::PathBuf;
use terraphim_atomic_client::{self, Store};
use terraphim_config::{ConfigBuilder, Haystack, Role, ServiceType};
use terraphim_middleware::{
haystack::AtomicHaystackIndexer,
indexer::{search_haystacks, IndexMiddleware},
RipgrepIndexer,
};
use terraphim_types::{RelevanceFunction, SearchQuery};
use uuid::Uuid;
/// Test comprehensive dual haystack validation with both Atomic and Ripgrep services
/// This test validates that roles with multiple haystacks return results from both sources
/// and demonstrates proper source differentiation and performance characteristics.
#[tokio::test]
async fn test_dual_haystack_comprehensive_validation() {
// Initialize logging for test debugging
let _ = env_logger::builder()
.filter_level(log::LevelFilter::Info)
.is_test(true)
.try_init();
// Load atomic server configuration from environment
dotenvy::dotenv().ok();
let server_url =
std::env::var("ATOMIC_SERVER_URL").unwrap_or_else(|_| "http://localhost:9883".to_string());
let atomic_secret = std::env::var("ATOMIC_SERVER_SECRET").ok();
if atomic_secret.is_none() {
log::warn!("ATOMIC_SERVER_SECRET not set, test may fail with authentication");
}
// Create atomic store for setup and cleanup
let atomic_config = terraphim_atomic_client::Config {
server_url: server_url.clone(),
agent: atomic_secret
.as_ref()
.and_then(|secret| terraphim_atomic_client::Agent::from_base64(secret).ok()),
};
let store = Store::new(atomic_config).expect("Failed to create atomic store");
// 1. Create test documents in the atomic server
let test_id = Uuid::new_v4();
let server_base = server_url.trim_end_matches('/');
// Create parent collection for test documents
let parent_subject = format!("{}/test-dual-haystack-{}", server_base, test_id);
let mut parent_properties = HashMap::new();
parent_properties.insert(
"https://atomicdata.dev/properties/name".to_string(),
json!("Dual Haystack Test Collection"),
);
parent_properties.insert(
"https://atomicdata.dev/properties/description".to_string(),
json!("Test collection for dual haystack validation testing"),
);
parent_properties.insert(
"https://atomicdata.dev/properties/isA".to_string(),
json!(["https://atomicdata.dev/classes/Collection"]),
);
parent_properties.insert(
"https://atomicdata.dev/properties/parent".to_string(),
json!(server_base),
);
match store
.create_with_commit(&parent_subject, parent_properties)
.await
{
Ok(_) => log::info!("Created parent collection: {}", parent_subject),
Err(e) => {
log::warn!("Failed to create parent collection: {}", e);
}
}
// Create comprehensive test documents that will be found in atomic server
let test_documents = [
(
"ATOMIC: Dual Haystack Architecture Guide",
"Comprehensive guide for using dual haystack architecture in Terraphim. This document explains how to configure both Atomic and Ripgrep haystacks for optimal search performance and coverage."
),
(
"ATOMIC: Graph Embeddings Performance",
"Performance analysis of graph embeddings in dual haystack configurations. Covers TerraphimGraph relevance function optimization and search result ranking algorithms."
),
(
"ATOMIC: Search Algorithm Comparison",
"Detailed comparison between TitleScorer and TerraphimGraph relevance functions in dual haystack environments. Includes performance metrics and recommendation guidelines."
),
(
"ATOMIC: Configuration Management",
"Best practices for managing dual haystack configurations. Covers role setup, haystack ordering, and fallback strategies for optimal search experience."
),
];
let mut created_documents = Vec::new();
for (i, (title, body)) in test_documents.iter().enumerate() {
let doc_subject = format!("{}/dual-haystack-doc-{}", parent_subject, i);
let mut doc_properties = HashMap::new();
doc_properties.insert(
"https://atomicdata.dev/properties/name".to_string(),
json!(title),
);
doc_properties.insert(
"https://atomicdata.dev/properties/description".to_string(),
json!(body),
);
doc_properties.insert(
"https://atomicdata.dev/properties/isA".to_string(),
json!(["https://atomicdata.dev/classes/Article"]),
);
doc_properties.insert(
"https://atomicdata.dev/properties/parent".to_string(),
json!(parent_subject.clone()),
);
match store.create_with_commit(&doc_subject, doc_properties).await {
Ok(_) => {
log::info!("Created test document: {}", title);
created_documents.push(doc_subject);
}
Err(e) => {
log::warn!("Failed to create test document '{}': {}", title, e);
}
}
}
// Allow time for indexing
tokio::time::sleep(tokio::time::Duration::from_secs(2)).await;
// 2. Create dual haystack configurations
log::info!("Creating dual haystack role configurations");
// Dual Haystack with Title Scorer
let dual_title_config = ConfigBuilder::new()
.global_shortcut("Ctrl+1")
.add_role(
"DualTitle",
Role {
shortname: Some("dual-title".to_string()),
name: "Dual Haystack Title Scorer".into(),
relevance_function: RelevanceFunction::TitleScorer,
terraphim_it: false,
theme: "cosmo".to_string(),
kg: None,
haystacks: vec![
Haystack::new(server_url.clone(), ServiceType::Atomic, true)
.with_atomic_secret(atomic_secret.clone()),
Haystack::new("../../docs/src".to_string(), ServiceType::Ripgrep, true),
],
llm_enabled: false,
llm_api_key: None,
llm_model: None,
llm_auto_summarize: false,
llm_chat_enabled: false,
llm_chat_system_prompt: None,
llm_chat_model: None,
llm_context_window: None,
extra: AHashMap::new(),
},
)
.build()
.expect("Failed to build dual title config");
// Dual Haystack with Graph Embeddings
let dual_graph_config = ConfigBuilder::new()
.global_shortcut("Ctrl+2")
.add_role(
"DualGraph",
Role {
shortname: Some("dual-graph".to_string()),
name: "Dual Haystack Graph Embeddings".into(),
relevance_function: RelevanceFunction::TerraphimGraph,
terraphim_it: true,
theme: "darkly".to_string(),
kg: Some(terraphim_config::KnowledgeGraph {
automata_path: None,
knowledge_graph_local: Some(terraphim_config::KnowledgeGraphLocal {
input_type: terraphim_types::KnowledgeGraphInputType::Markdown,
path: PathBuf::from("../../docs/src/kg"),
}),
public: true,
publish: true,
}),
haystacks: vec![
Haystack::new(server_url.clone(), ServiceType::Atomic, true)
.with_atomic_secret(atomic_secret.clone()),
Haystack::new("../../docs/src".to_string(), ServiceType::Ripgrep, true),
],
llm_enabled: false,
llm_api_key: None,
llm_model: None,
llm_auto_summarize: false,
llm_chat_enabled: false,
llm_chat_system_prompt: None,
llm_chat_model: None,
llm_context_window: None,
extra: AHashMap::new(),
},
)
.build()
.expect("Failed to build dual graph config");
// Single haystack references for comparison
let single_atomic_config = ConfigBuilder::new()
.global_shortcut("Ctrl+3")
.add_role(
"SingleAtomic",
Role {
shortname: Some("single-atomic".to_string()),
name: "Single Atomic Reference".into(),
relevance_function: RelevanceFunction::TitleScorer,
terraphim_it: false,
theme: "cerulean".to_string(),
kg: None,
haystacks: vec![Haystack::new(server_url.clone(), ServiceType::Atomic, true)
.with_atomic_secret(atomic_secret.clone())],
llm_enabled: false,
llm_api_key: None,
llm_model: None,
llm_auto_summarize: false,
llm_chat_enabled: false,
llm_chat_system_prompt: None,
llm_chat_model: None,
llm_context_window: None,
extra: AHashMap::new(),
},
)
.build()
.expect("Failed to build single atomic config");
let single_ripgrep_config = ConfigBuilder::new()
.global_shortcut("Ctrl+4")
.add_role(
"SingleRipgrep",
Role {
shortname: Some("single-ripgrep".to_string()),
name: "Single Ripgrep Reference".into(),
relevance_function: RelevanceFunction::TitleScorer,
terraphim_it: false,
theme: "journal".to_string(),
kg: None,
haystacks: vec![Haystack::new(
"../../docs/src".to_string(),
ServiceType::Ripgrep,
true,
)],
llm_enabled: false,
llm_api_key: None,
llm_model: None,
llm_auto_summarize: false,
llm_chat_enabled: false,
llm_chat_system_prompt: None,
llm_chat_model: None,
llm_context_window: None,
extra: AHashMap::new(),
},
)
.build()
.expect("Failed to build single ripgrep config");
// 3. Test each configuration and validate dual haystack functionality
let configs = vec![
("DualTitle", dual_title_config),
("DualGraph", dual_graph_config),
("SingleAtomic", single_atomic_config),
("SingleRipgrep", single_ripgrep_config),
];
let search_terms = vec![
"dual",
"haystack",
"architecture",
"terraphim",
"graph",
"search",
];
let mut all_results = HashMap::new();
for (role_name, config) in &configs {
log::info!("Testing role: {}", role_name);
// Validate configuration structure
let role = config.roles.values().next().unwrap();
match *role_name {
"DualTitle" | "DualGraph" => {
assert_eq!(
role.haystacks.len(),
2,
"Dual roles should have 2 haystacks"
);
assert!(role
.haystacks
.iter()
.any(|h| h.service == ServiceType::Atomic));
assert!(role
.haystacks
.iter()
.any(|h| h.service == ServiceType::Ripgrep));
}
"SingleAtomic" => {
assert_eq!(
role.haystacks.len(),
1,
"Single atomic role should have 1 haystack"
);
assert_eq!(role.haystacks[0].service, ServiceType::Atomic);
}
"SingleRipgrep" => {
assert_eq!(
role.haystacks.len(),
1,
"Single ripgrep role should have 1 haystack"
);
assert_eq!(role.haystacks[0].service, ServiceType::Ripgrep);
}
_ => panic!("Unknown role name: {}", role_name),
}
// Test search functionality using indexing middleware directly
let atomic_indexer = AtomicHaystackIndexer::default();
let ripgrep_indexer = RipgrepIndexer::default();
let role_results = &mut all_results
.entry(role_name.to_string())
.or_insert_with(HashMap::new);
for search_term in &search_terms {
let search_start = std::time::Instant::now();
let mut atomic_results = 0;
let mut ripgrep_results = 0;
let mut total_results = 0;
// Test search across all haystacks for this role
for haystack in &role.haystacks {
match haystack.service {
ServiceType::Atomic => {
match atomic_indexer.index(search_term, haystack).await {
Ok(results) => {
atomic_results = results.len();
total_results += atomic_results;
log::debug!(
"Role {}, Atomic haystack, term '{}': {} results",
role_name,
search_term,
atomic_results
);
}
Err(e) => {
log::warn!(
"Atomic search failed for role {}, term '{}': {}",
role_name,
search_term,
e
);
}
}
}
ServiceType::Ripgrep => {
match ripgrep_indexer.index(search_term, haystack).await {
Ok(results) => {
ripgrep_results = results.len();
total_results += ripgrep_results;
log::debug!(
"Role {}, Ripgrep haystack, term '{}': {} results",
role_name,
search_term,
ripgrep_results
);
}
Err(e) => {
log::warn!(
"Ripgrep search failed for role {}, term '{}': {}",
role_name,
search_term,
e
);
}
}
}
_ => {}
}
}
let search_duration = search_start.elapsed();
role_results.insert(
search_term.to_string(),
(
total_results,
atomic_results,
ripgrep_results,
search_duration,
),
);
log::info!(
"Role {}, term '{}': {} total ({}A+{}R) in {:?}",
role_name,
search_term,
total_results,
atomic_results,
ripgrep_results,
search_duration
);
}
}
// 4. Validate dual haystack behavior and source differentiation
log::info!("=== Dual Haystack Validation Results ===");
for (role_name, results) in &all_results {
log::info!("Role: {}", role_name);
for (term, (total, atomic, ripgrep, duration)) in results {
log::info!(
" '{}': {} total ({}A+{}R) in {:?}",
term,
total,
atomic,
ripgrep,
duration
);
// Validate performance (less than 10 seconds per search)
assert!(
duration.as_secs() < 10,
"Search should complete within 10 seconds"
);
// For dual roles, validate we get results from both sources for some terms
if role_name.starts_with("Dual") && atomic_secret.is_some() && *total > 0 {
log::info!(
" ✅ Dual role '{}' found results for '{}'",
role_name,
term
);
}
}
}
// 5. Compare dual vs single haystack performance
log::info!("=== Performance Comparison: Dual vs Single Haystacks ===");
for search_term in &search_terms {
let dual_title_results = all_results
.get("DualTitle")
.and_then(|r| r.get(*search_term))
.map(|(total, atomic, ripgrep, duration)| (*total, *atomic, *ripgrep, *duration));
let single_atomic_results = all_results
.get("SingleAtomic")
.and_then(|r| r.get(*search_term))
.map(|(total, atomic, ripgrep, duration)| (*total, *atomic, *ripgrep, *duration));
let single_ripgrep_results = all_results
.get("SingleRipgrep")
.and_then(|r| r.get(*search_term))
.map(|(total, atomic, ripgrep, duration)| (*total, *atomic, *ripgrep, *duration));
if let (
Some((dual_total, dual_atomic, dual_ripgrep, dual_duration)),
Some((single_atomic_total, _, _, single_atomic_duration)),
Some((single_ripgrep_total, _, _, single_ripgrep_duration)),
) = (
dual_title_results,
single_atomic_results,
single_ripgrep_results,
) {
log::info!("Term '{}' comparison:", search_term);
log::info!(
" Dual: {} total ({}A+{}R) in {:?}",
dual_total,
dual_atomic,
dual_ripgrep,
dual_duration
);
log::info!(
" Single Atomic: {} in {:?}",
single_atomic_total,
single_atomic_duration
);
log::info!(
" Single Ripgrep: {} in {:?}",
single_ripgrep_total,
single_ripgrep_duration
);
log::info!(
" Combined Single: {} total",
single_atomic_total + single_ripgrep_total
);
// Validate that dual haystack should find at least as many results as individual sources
if atomic_secret.is_some() && (single_atomic_total > 0 || single_ripgrep_total > 0) {
let best_single = single_atomic_total.max(single_ripgrep_total);
if dual_total < best_single {
log::warn!("Dual haystack found fewer results ({}) than best single source ({}) for '{}' - this may indicate search algorithm differences or overlapping documents", dual_total, best_single, search_term);
} else {
log::info!("✅ Dual haystack found more or equal results ({}) vs best single ({}) for '{}'", dual_total, best_single, search_term);
}
}
}
}
// 6. Test full integration with terraphim search pipeline
log::info!("=== Testing Full Integration with Search Pipeline ===");
for (role_name, config) in &configs {
if role_name.starts_with("Dual") {
log::info!(
"Testing search pipeline integration for role: {}",
role_name
);
let config_state = terraphim_config::ConfigState::new(&mut config.clone())
.await
.expect("Failed to create config state");
for search_term in &search_terms[..3] {
// Test subset for performance
let search_query = SearchQuery {
search_term: search_term.to_string().into(),
skip: Some(0),
limit: Some(10),
role: Some(config.roles.keys().next().unwrap().clone()),
operator: Some(terraphim_types::LogicalOperator::And),
search_terms: None,
};
let pipeline_start = std::time::Instant::now();
let search_results = search_haystacks(config_state.clone(), search_query).await;
let pipeline_duration = pipeline_start.elapsed();
match search_results {
Ok(results) => {
log::info!(
" Pipeline search for '{}' found {} results in {:?}",
search_term,
results.len(),
pipeline_duration
);
// Validate search pipeline results
for doc in results.values() {
assert!(!doc.title.is_empty(), "Document title should not be empty");
assert!(!doc.body.is_empty(), "Document body should not be empty");
// Log source differentiation
if doc.title.starts_with("ATOMIC:") {
log::debug!(" Source: Atomic - {}", doc.title);
} else {
log::debug!(" Source: Ripgrep - {}", doc.title);
}
}
}
Err(e) => {
log::warn!(" Pipeline search failed for '{}': {}", search_term, e);
}
}
}
}
}
// 7. Cleanup - delete test documents
log::info!("Cleaning up test documents");
for doc_subject in &created_documents {
match store.delete_with_commit(doc_subject).await {
Ok(_) => log::debug!("Deleted test document: {}", doc_subject),
Err(e) => log::warn!("Failed to delete test document {}: {}", doc_subject, e),
}
}
// Delete parent collection
match store.delete_with_commit(&parent_subject).await {
Ok(_) => log::info!("Deleted parent collection: {}", parent_subject),
Err(e) => log::warn!(
"Failed to delete parent collection {}: {}",
parent_subject,
e
),
}
log::info!("✅ Dual haystack comprehensive validation test completed successfully");
}
/// Test configuration validation for dual haystack roles
#[tokio::test]
async fn test_dual_haystack_config_validation() {
// Load dual haystack configuration from file
let config_content = std::fs::read_to_string("../../dual_haystack_roles_config.json")
.expect("Failed to read dual haystack config file");
let config: terraphim_config::Config =
serde_json::from_str(&config_content).expect("Failed to parse dual haystack config");
// Validate configuration structure
assert_eq!(
config.roles.len(),
5,
"Should have 5 roles in dual haystack config"
);
// Validate dual haystack roles
let dual_title_role = config
.roles
.get(&"Dual Haystack Title Scorer".into())
.expect("Dual Haystack Title Scorer role should exist");
assert_eq!(
dual_title_role.relevance_function,
RelevanceFunction::TitleScorer
);
assert!(
dual_title_role.kg.is_none(),
"Title scorer should not have knowledge graph"
);
assert_eq!(
dual_title_role.haystacks.len(),
2,
"Should have 2 haystacks"
);
assert!(dual_title_role
.haystacks
.iter()
.any(|h| h.service == ServiceType::Atomic));
assert!(dual_title_role
.haystacks
.iter()
.any(|h| h.service == ServiceType::Ripgrep));
let dual_graph_role = config
.roles
.get(&"Dual Haystack Graph Embeddings".into())
.expect("Dual Haystack Graph Embeddings role should exist");
assert_eq!(
dual_graph_role.relevance_function,
RelevanceFunction::TerraphimGraph
);
assert!(
dual_graph_role.kg.is_some(),
"Graph embeddings should have knowledge graph"
);
assert_eq!(
dual_graph_role.haystacks.len(),
2,
"Should have 2 haystacks"
);
assert!(dual_graph_role
.haystacks
.iter()
.any(|h| h.service == ServiceType::Atomic));
assert!(dual_graph_role
.haystacks
.iter()
.any(|h| h.service == ServiceType::Ripgrep));
// Validate hybrid role with 3 haystacks
let hybrid_role = config
.roles
.get(&"Dual Haystack Hybrid Researcher".into())
.expect("Dual Haystack Hybrid Researcher role should exist");
assert_eq!(
hybrid_role.relevance_function,
RelevanceFunction::TerraphimGraph
);
assert!(
hybrid_role.kg.is_some(),
"Hybrid researcher should have knowledge graph"
);
assert_eq!(hybrid_role.haystacks.len(), 3, "Should have 3 haystacks");
assert!(hybrid_role
.haystacks
.iter()
.any(|h| h.service == ServiceType::Atomic));
assert!(
hybrid_role
.haystacks
.iter()
.filter(|h| h.service == ServiceType::Ripgrep)
.count()
== 2
);
// Validate single haystack reference roles
let single_atomic_role = config
.roles
.get(&"Single Atomic Reference".into())
.expect("Single Atomic Reference role should exist");
assert_eq!(
single_atomic_role.haystacks.len(),
1,
"Should have 1 haystack"
);
assert_eq!(single_atomic_role.haystacks[0].service, ServiceType::Atomic);
let single_ripgrep_role = config
.roles
.get(&"Single Ripgrep Reference".into())
.expect("Single Ripgrep Reference role should exist");
assert_eq!(
single_ripgrep_role.haystacks.len(),
1,
"Should have 1 haystack"
);
assert_eq!(
single_ripgrep_role.haystacks[0].service,
ServiceType::Ripgrep
);
log::info!("✅ Dual haystack configuration validation test completed successfully");
}
/// Test source differentiation and result prefixing
#[tokio::test]
async fn test_source_differentiation_validation() {
// Initialize logging for test debugging
let _ = env_logger::builder()
.filter_level(log::LevelFilter::Info)
.is_test(true)
.try_init();
// Load atomic server configuration from environment
dotenvy::dotenv().ok();
let server_url =
std::env::var("ATOMIC_SERVER_URL").unwrap_or_else(|_| "http://localhost:9883".to_string());
let atomic_secret = std::env::var("ATOMIC_SERVER_SECRET").ok();
// Create dual haystack configuration
let dual_config = ConfigBuilder::new()
.global_shortcut("Ctrl+S")
.add_role(
"SourceDifferentiation",
Role {
shortname: Some("source-diff".to_string()),
name: "Source Differentiation Test".into(),
relevance_function: RelevanceFunction::TitleScorer,
terraphim_it: false,
theme: "flatly".to_string(),
kg: None,
haystacks: vec![
Haystack::new(server_url.clone(), ServiceType::Atomic, true)
.with_atomic_secret(atomic_secret.clone()),
Haystack::new("../../docs/src".to_string(), ServiceType::Ripgrep, true),
],
llm_enabled: false,
llm_api_key: None,
llm_model: None,
llm_auto_summarize: false,
llm_chat_enabled: false,
llm_chat_system_prompt: None,
llm_chat_model: None,
llm_context_window: None,
extra: AHashMap::new(),
},
)
.build()
.expect("Failed to build source differentiation config");
// Test search using both indexers
let atomic_indexer = AtomicHaystackIndexer::default();
let ripgrep_indexer = RipgrepIndexer::default();
let role = dual_config.roles.values().next().unwrap();
let search_term = "terraphim";
let mut atomic_docs = Vec::new();
let mut ripgrep_docs = Vec::new();
for haystack in &role.haystacks {
match haystack.service {
ServiceType::Atomic => {
if let Ok(results) = atomic_indexer.index(search_term, haystack).await {
for doc in results.values() {
atomic_docs.push(doc.clone());
log::debug!("Atomic document: {}", doc.title);
}
}
}
ServiceType::Ripgrep => {
if let Ok(results) = ripgrep_indexer.index(search_term, haystack).await {
for doc in results.values() {
ripgrep_docs.push(doc.clone());
log::debug!("Ripgrep document: {}", doc.title);
}
}
}
_ => {}
}
}
log::info!("Source differentiation results:");
log::info!(" Atomic documents: {}", atomic_docs.len());
log::info!(" Ripgrep documents: {}", ripgrep_docs.len());
// Validate source differentiation
if !atomic_docs.is_empty() {
let has_atomic_prefix = atomic_docs
.iter()
.any(|doc| doc.title.starts_with("ATOMIC:"));
if has_atomic_prefix {
log::info!("✅ Found documents with ATOMIC: prefix for source differentiation");
} else {
log::info!("ℹ️ Atomic documents do not use ATOMIC: prefix (depends on test data)");
}
}
if !ripgrep_docs.is_empty() {
let has_markdown_titles = ripgrep_docs
.iter()
.any(|doc| !doc.title.starts_with("ATOMIC:"));
if has_markdown_titles {
log::info!(
"✅ Found ripgrep documents without ATOMIC: prefix for source differentiation"
);
}
}
// Ensure documents can be distinguished by their characteristics
for doc in &atomic_docs {
assert!(!doc.url.is_empty(), "Atomic document should have URL");
assert!(!doc.title.is_empty(), "Atomic document should have title");
}
for doc in &ripgrep_docs {
assert!(!doc.url.is_empty(), "Ripgrep document should have URL");
assert!(!doc.title.is_empty(), "Ripgrep document should have title");
}
log::info!("✅ Source differentiation validation test completed successfully");
}