use crate::{
builder::ArrowSpaceBuilder,
clustering::{ClusteringHeuristic, euclidean_dist, kmeans_lloyd, nearest_centroid},
tests::{CLUSTERING_TEST_DATA, test_data::make_gaussian_blob},
};
use log::debug;
use serial_test::serial;
/// `euclidean_dist` between the origin and (1, 1, 1) must equal sqrt(3).
#[test]
fn test_euclidean_dist_basic() {
    let origin = vec![0.0, 0.0, 0.0];
    let ones = vec![1.0, 1.0, 1.0];
    let expected = 3.0_f64.sqrt();
    let error = (euclidean_dist(&origin, &ones) - expected).abs();
    assert!(error < 1e-10);
}
/// The distance from a point to itself must be (numerically) zero.
#[test]
fn test_euclidean_dist_identity() {
    let point = vec![3.5, -2.1, 4.8];
    let self_distance = euclidean_dist(&point, &point);
    assert!(self_distance.abs() < 1e-10);
}
/// In one dimension the distance reduces to the absolute difference: |5 - 2| = 3.
#[test]
fn test_euclidean_dist_one_dimensional() {
    let left = vec![5.0];
    let right = vec![2.0];
    let error = (euclidean_dist(&left, &right) - 3.0).abs();
    assert!(error < 1e-10);
}
/// A query just next to the first centroid must map to index 0 with a small
/// returned distance value.
#[test]
fn test_nearest_centroid_single() {
    let centroids = vec![vec![1.0, 2.0], vec![5.0, 6.0], vec![9.0, 10.0]];
    let query = vec![1.1, 2.1];
    // Fixed: the second argument had been mangled into `¢roids` (a collapsed
    // `&cent` HTML entity); it must be a borrow of `centroids`.
    let (idx, dist2) = nearest_centroid(&query, &centroids);
    assert_eq!(idx, 0);
    assert!(dist2 < 0.03);
}
/// A query close to (5, 5) must be assigned to the middle centroid (index 1).
#[test]
fn test_nearest_centroid_middle() {
    let centroids = vec![vec![0.0, 0.0], vec![5.0, 5.0], vec![10.0, 10.0]];
    let query = vec![4.9, 5.1];
    // Fixed: the second argument had been mangled into `¢roids` (a collapsed
    // `&cent` HTML entity); it must be a borrow of `centroids`.
    let (idx, _dist2) = nearest_centroid(&query, &centroids);
    assert_eq!(idx, 1);
}
/// Runs `kmeans_lloyd` with k = 3 on the `make_gaussian_blob(99, 0.2)` fixture
/// and checks that three labels are used, each holding between 10 and 70 points.
#[test]
#[serial]
fn test_kmeans_lloyd_gaussian_blobs() {
    let data = make_gaussian_blob(99, 0.2);
    let assignments = kmeans_lloyd(&data, 3, 50, 42);

    // Count how many points landed in each label; the key set doubles as the
    // set of distinct labels.
    let mut label_counts = std::collections::HashMap::new();
    for &label in &assignments {
        *label_counts.entry(label).or_insert(0) += 1;
    }
    assert_eq!(label_counts.len(), 3, "Should find 3 clusters");

    // The message now states the bounds that are actually enforced (it used
    // to claim 20-80). The lower bound also covers the former separate
    // "too small" check, which repeated `count >= 10`.
    for (&label, &count) in &label_counts {
        assert!(
            (10..=70).contains(&count),
            "Cluster {} has {} points (expected 10-70, initialization-dependent)",
            label,
            count
        );
    }

    debug!("✓ K-means produced valid clustering:");
    for (label, count) in &label_counts {
        debug!(" Cluster {}: {} points", label, count);
    }
}
/// With k equal to the number of rows, every row must end up with its own label.
#[test]
fn test_kmeans_lloyd_k_equals_n() {
    let rows = vec![vec![1.0], vec![2.0], vec![3.0]];
    let labels = kmeans_lloyd(&rows, 3, 10, 128);
    let distinct = labels
        .iter()
        .collect::<std::collections::HashSet<_>>()
        .len();
    assert_eq!(distinct, 3);
}
/// Estimates the intrinsic dimension of 100 points on the line (t, 2t, 3t)
/// embedded in 3D; any estimate in 1..=3 is accepted.
#[test]
fn test_intrinsic_dimension_line() {
    let rows: Vec<_> = (0..100)
        .map(|i| {
            let t = i as f64 / 10.0;
            vec![t, 2.0 * t, 3.0 * t]
        })
        .collect();
    let n = rows.len();
    let id = ArrowSpaceBuilder::new().estimate_intrinsic_dimension(&rows, n, 3, 42);
    debug!("Estimated ID for 1D line: {}", id);
    assert!((1..=3).contains(&id), "Expected ID near 1, got {}", id);
}
/// Estimates the intrinsic dimension of 100 points (sin t, cos t, 0) embedded
/// in 3D; any estimate in 1..=3 is accepted.
///
/// NOTE(review): these samples lie on the unit circle inside the z = 0 plane,
/// i.e. a curve rather than a filled plane, although the log label and the
/// assertion message speak of a 2D plane — confirm the intended fixture.
#[test]
fn test_intrinsic_dimension_plane() {
    let rows: Vec<_> = (0..100)
        .map(|i| {
            let t = i as f64 / 10.0;
            vec![t.sin(), t.cos(), 0.0]
        })
        .collect();
    let n = rows.len();
    let id = ArrowSpaceBuilder::new().estimate_intrinsic_dimension(&rows, n, 3, 42);
    debug!("Estimated ID for 2D plane: {}", id);
    assert!((1..=3).contains(&id), "Expected ID near 2, got {}", id);
}
/// Estimates the intrinsic dimension of 200 unseeded random 5D points; the
/// estimate must fall in 2..=5.
#[test]
fn test_intrinsic_dimension_full_space() {
    let rows: Vec<_> = (0..200)
        .map(|_| {
            vec![
                rand::random(),
                rand::random(),
                rand::random(),
                rand::random(),
                rand::random(),
            ]
        })
        .collect();
    let n = rows.len();
    let id = ArrowSpaceBuilder::new().estimate_intrinsic_dimension(&rows, n, 5, 42);
    debug!("Estimated ID for 5D full space: {}", id);
    assert!((2..=5).contains(&id), "Expected ID near 5, got {}", id);
}
/// With only two 2D rows the estimate must not exceed the ambient dimension 2.
#[test]
fn test_intrinsic_dimension_small_n() {
    let two_points = vec![vec![1.0, 2.0], vec![3.0, 4.0]];
    let estimate = ArrowSpaceBuilder::new().estimate_intrinsic_dimension(&two_points, 2, 2, 42);
    assert!(estimate <= 2);
}
/// `step1_bounds` on ten identical 1D rows: 2 <= k_min <= k_max <= 10.
#[test]
fn test_step1_bounds_small_dataset() {
    let identical_rows = vec![vec![1.0]; 10];
    let bounds = ArrowSpaceBuilder::new().step1_bounds(&identical_rows, 10, 1, None, 42);
    let (k_min, k_max, _id) = bounds;
    debug!("step 1 bounds (N=10, F=1): [{}, {}]", k_min, k_max);
    assert!(k_min >= 2, "k_min should be at least 2");
    assert!(k_max >= k_min, "k_max should be >= k_min");
    assert!(k_max <= 10, "k_max should not exceed N");
}
/// `step1_bounds` on 1000 all-zero 5D rows: the range must be ordered and
/// k_max must stay within N/10.
#[test]
fn test_step1_bounds_large_n_small_f() {
    let zero_rows = vec![vec![0.0; 5]; 1000];
    let bounds = ArrowSpaceBuilder::new().step1_bounds(&zero_rows, 1000, 5, None, 42);
    let (k_min, k_max, _id) = bounds;
    debug!("step 1 bounds (N=1000, F=5): [{}, {}]", k_min, k_max);
    assert!(k_min <= k_max);
    assert!(k_max <= 1000 / 10, "k_max should respect N/10 constraint");
}
/// `step1_bounds` on 50 all-zero rows with 100 features: k_min >= 2 and
/// k_max <= 25 (N/2).
#[test]
fn test_step1_bounds_high_dimensional() {
    let zero_rows = vec![vec![0.0; 100]; 50];
    let bounds = ArrowSpaceBuilder::new().step1_bounds(&zero_rows, 50, 100, None, 42);
    let (k_min, k_max, _id) = bounds;
    debug!("step 1 bounds (N=50, F=100): [{}, {}]", k_min, k_max);
    assert!(k_min >= 2);
    assert!(k_max <= 25, "k_max should not exceed N/2");
}
/// Two unseeded random blobs of 50 points each (around the origin and around
/// (10, 10)); the Calinski-Harabasz step must suggest K in 2..=4.
#[test]
fn test_calinski_harabasz_well_separated() {
    use rand::Rng;
    let mut rng = rand::rng();
    let mut rows = Vec::new();
    // Blob around the origin.
    rows.extend((0..50).map(|_| {
        vec![
            rng.random_range(-0.5..0.5),
            rng.random_range(-0.5..0.5),
        ]
    }));
    // Blob around (10, 10).
    rows.extend((0..50).map(|_| {
        vec![
            10.0 + rng.random_range(-0.5..0.5),
            10.0 + rng.random_range(-0.5..0.5),
        ]
    }));
    let k_suggested = ArrowSpaceBuilder::new().step2_calinski_harabasz(&rows, 2, 10, 42);
    debug!(
        "Calinski-Harabasz suggested K: {} (expected 2)",
        k_suggested
    );
    assert!(
        (2..=4).contains(&k_suggested),
        "Expected K around 2, got {}",
        k_suggested
    );
}
/// Three unseeded random blobs of 50 points each (around 0, 5 and 10 on the
/// diagonal); the Calinski-Harabasz step must suggest K in 2..=5.
#[test]
fn test_calinski_harabasz_three_clusters() {
    use rand::Rng;
    let mut rng = rand::rng();
    let mut rows = Vec::new();
    // Blob around the origin.
    rows.extend((0..50).map(|_| {
        vec![
            rng.random_range(-0.5..0.5),
            rng.random_range(-0.5..0.5),
        ]
    }));
    // Blob around (5, 5).
    rows.extend((0..50).map(|_| {
        vec![
            5.0 + rng.random_range(-0.5..0.5),
            5.0 + rng.random_range(-0.5..0.5),
        ]
    }));
    // Blob around (10, 10).
    rows.extend((0..50).map(|_| {
        vec![
            10.0 + rng.random_range(-0.5..0.5),
            10.0 + rng.random_range(-0.5..0.5),
        ]
    }));
    let k_suggested = ArrowSpaceBuilder::new().step2_calinski_harabasz(&rows, 2, 10, 42);
    debug!(
        "Calinski-Harabasz suggested K: {} (expected 3)",
        k_suggested
    );
    assert!(
        (2..=5).contains(&k_suggested),
        "Expected K around 3, got {}",
        k_suggested
    );
}
/// A single tight diagonal streak (spread 0.001 per step) must still yield a
/// suggestion of at least 2, the `k_min` passed in.
#[test]
fn test_calinski_harabasz_single_cluster() {
    let rows: Vec<_> = (0..100)
        .map(|i| {
            let noise = (i as f64) * 0.001;
            vec![5.0 + noise, 5.0 + noise]
        })
        .collect();
    let k_suggested = ArrowSpaceBuilder::new().step2_calinski_harabasz(&rows, 2, 10, 42);
    debug!("Calinski-Harabasz K for single cluster: {}", k_suggested);
    assert!(k_suggested >= 2, "Should return at least k_min");
}
/// Fifty copies of (0, 0) plus fifty copies of (10, 10): the pilot threshold
/// must land strictly between 1 and 80.
#[test]
fn test_threshold_from_pilot_two_clusters() {
    let mut rows = vec![vec![0.0, 0.0]; 50];
    rows.extend(vec![vec![10.0, 10.0]; 50]);
    let radius = ArrowSpaceBuilder::new().compute_threshold_from_pilot(&rows, 2, 42);
    debug!("Threshold radius for two tight clusters: {:.6}", radius);
    let within_band = radius > 1.0 && radius < 80.0;
    assert!(
        within_band,
        "Expected moderate threshold for zero-variance clusters with inter-centroid gap, got {}",
        radius
    );
}
/// Points spread along the diagonal from -25 to 24.5 must produce a pilot
/// threshold above 1.
#[test]
fn test_threshold_from_pilot_large_variance() {
    let rows: Vec<_> = (0..100)
        .map(|i| {
            let noise = (i as f64 - 50.0) * 0.5;
            vec![noise, noise]
        })
        .collect();
    let radius = ArrowSpaceBuilder::new().compute_threshold_from_pilot(&rows, 3, 42);
    debug!("Threshold radius for spread cluster: {:.6}", radius);
    assert!(
        radius > 1.0,
        "Expected larger threshold for spread data, got {}",
        radius
    );
}
/// Three 1D points with k = 3: the pilot threshold must be non-negative.
#[test]
fn test_threshold_from_pilot_single_point_per_cluster() {
    let singletons = vec![vec![0.0], vec![10.0], vec![20.0]];
    let radius = ArrowSpaceBuilder::new().compute_threshold_from_pilot(&singletons, 3, 42);
    assert!(radius >= 0.0);
}
/// Two pairs of duplicated points ((0, 0) and (10, 10)): the threshold must be
/// positive and, more strictly, above 1.
#[test]
fn test_threshold_zero_variance_clusters() {
    let mut rows = vec![vec![0.0, 0.0]; 2];
    rows.extend(vec![vec![10.0, 10.0]; 2]);
    let radius = ArrowSpaceBuilder::new().compute_threshold_from_pilot(&rows, 2, 42);
    debug!("Threshold for zero-variance clusters: {:.6}", radius);
    // Checked in this order so a non-positive radius reports the first message.
    assert!(
        radius > 0.0,
        "Should use inter-centroid fallback for zero variance"
    );
    assert!(
        radius > 1.0,
        "Inter-centroid fallback should give meaningful threshold"
    );
}
/// Ten identical points with k = 3: the threshold must be at least 1e-6.
#[test]
fn test_threshold_all_points_identical() {
    let identical = vec![vec![5.0, 5.0]; 10];
    let radius = ArrowSpaceBuilder::new().compute_threshold_from_pilot(&identical, 3, 42);
    debug!("Threshold for identical points: {:.6}", radius);
    assert!(
        radius >= 1e-6,
        "Should return minimum threshold for degenerate data"
    );
}
/// Two groups of 20 points jittered by at most 1e-4 around x = 0 and x = 100:
/// the threshold must exceed 0.01.
#[test]
fn test_threshold_very_tight_clusters() {
    let mut rows = Vec::new();
    // Jitter comes from the unseeded global RNG.
    rows.extend((0..20).map(|_| vec![0.0 + rand::random::<f64>() * 0.0001, 0.0]));
    rows.extend((0..20).map(|_| vec![100.0 + rand::random::<f64>() * 0.0001, 0.0]));
    let radius = ArrowSpaceBuilder::new().compute_threshold_from_pilot(&rows, 2, 42);
    debug!("Threshold for very tight clusters: {:.6}", radius);
    assert!(
        radius > 0.01,
        "Should use inter-centroid distance, not tiny intra-cluster variance"
    );
}
/// Three unseeded random 3D blobs of 100 points (around 0, 5 and 10 on the
/// diagonal): `compute_optimal_k` must return K in 2..=7, a positive radius
/// and an intrinsic dimension in 1..=3.
#[test]
fn test_optimal_k_heuristic_synthetic_three_clusters() {
    use rand::Rng;
    let mut rng = rand::rng();
    let mut rows = Vec::new();
    // Blob around the origin.
    rows.extend((0..100).map(|_| {
        vec![
            rng.random_range(-0.5..0.5),
            rng.random_range(-0.5..0.5),
            rng.random_range(-0.5..0.5),
        ]
    }));
    // Blob around (5, 5, 5).
    rows.extend((0..100).map(|_| {
        vec![
            5.0 + rng.random_range(-0.5..0.5),
            5.0 + rng.random_range(-0.5..0.5),
            5.0 + rng.random_range(-0.5..0.5),
        ]
    }));
    // Blob around (10, 10, 10).
    rows.extend((0..100).map(|_| {
        vec![
            10.0 + rng.random_range(-0.5..0.5),
            10.0 + rng.random_range(-0.5..0.5),
            10.0 + rng.random_range(-0.5..0.5),
        ]
    }));
    let n = rows.len();
    let (k, radius, id) = ArrowSpaceBuilder::new().compute_optimal_k(&rows, n, 3, None, 42);
    debug!(
        "Optimal K={}, radius={:.6}, ID={} for 3-cluster synthetic",
        k, radius, id
    );
    assert!(
        (2..=7).contains(&k),
        "Expected K around 3 for three clusters, got {}",
        k
    );
    assert!(radius > 0.0, "radius should be positive");
    assert!((1..=3).contains(&id), "Intrinsic dimension should be 1-3");
}
/// Four unseeded random 2D blobs of 75 points on the corners of a 10x10
/// square: `compute_optimal_k` must return K in 3..=6, a positive radius and
/// an intrinsic dimension in 1..=2.
#[test]
fn test_optimal_k_heuristic_spherical_clusters() {
    use rand::Rng;
    let mut rng = rand::rng();
    let corners = [(0.0, 0.0), (10.0, 0.0), (0.0, 10.0), (10.0, 10.0)];
    let mut rows = Vec::new();
    for (cx, cy) in corners {
        rows.extend((0..75).map(|_| {
            vec![
                cx + rng.random_range(-0.5..0.5),
                cy + rng.random_range(-0.5..0.5),
            ]
        }));
    }
    let n = rows.len();
    let (k, radius, id) = ArrowSpaceBuilder::new().compute_optimal_k(&rows, n, 2, None, 42);
    debug!(
        "Optimal K={}, radius={:.6}, ID={} for 4 spherical clusters",
        k, radius, id
    );
    assert!(
        (3..=6).contains(&k),
        "Expected K around 4 for four clusters, got {}",
        k
    );
    assert!(radius > 0.0, "radius should be positive");
    assert!((1..=2).contains(&id), "Intrinsic dimension should be 1-2");
}
/// 200 unseeded random 8D points: K must be in 2..=100, the radius positive
/// and the intrinsic dimension at most 8.
#[test]
fn test_optimal_k_heuristic_high_dimensional_random() {
    let rows: Vec<_> = (0..200)
        .map(|_| {
            vec![
                rand::random(),
                rand::random(),
                rand::random(),
                rand::random(),
                rand::random(),
                rand::random(),
                rand::random(),
                rand::random(),
            ]
        })
        .collect();
    let n = rows.len();
    let (k, radius, id) = ArrowSpaceBuilder::new().compute_optimal_k(&rows, n, 8, None, 42);
    debug!(
        "Optimal K={}, radius={:.6}, ID={} for 8D random",
        k, radius, id
    );
    assert!(k >= 2, "K should be at least 2");
    // NOTE(review): with N = 200 the bound 100 is N/2, while the message
    // mentions N/10 (which would be 20) — confirm which limit is intended.
    assert!(k <= 100, "K should respect N/10 constraint");
    assert!(radius > 0.0);
    assert!(id <= 8, "ID should not exceed F");
}
/// Four 2D points forming two tight pairs: K must be in 2..=4 and the radius
/// positive.
#[test]
fn test_optimal_k_heuristic_small_n() {
    let two_pairs = vec![
        vec![1.0, 2.0],
        vec![1.1, 2.1],
        vec![5.0, 6.0],
        vec![5.1, 6.1],
    ];
    let outcome = ArrowSpaceBuilder::new().compute_optimal_k(&two_pairs, 4, 2, None, 42);
    let (k, radius, id) = outcome;
    debug!("Optimal K={}, radius={:.6}, ID={} for N=4", k, radius, id);
    assert!(k >= 2, "K should be at least 2");
    assert!(k <= 4, "K should not exceed N");
    assert!(radius > 0.0);
}
/// One hundred identical points: K must still be at least 2 and the radius
/// non-negative.
#[test]
fn test_optimal_k_heuristic_degenerate_identical() {
    let identical = vec![vec![3.0, 4.0]; 100];
    let outcome = ArrowSpaceBuilder::new().compute_optimal_k(&identical, 100, 2, None, 42);
    let (k, radius, _id) = outcome;
    debug!("Optimal K={}, radius={:.6} for identical points", k, radius);
    assert!(k >= 2, "K should be at least 2 even for degenerate data");
    assert!(radius >= 0.0);
}
/// The integers 0..100 as 1D rows: K >= 2, intrinsic dimension exactly 1 and
/// a positive radius.
#[test]
fn test_optimal_k_heuristic_single_feature() {
    let rows: Vec<_> = (0..100).map(|i| vec![i as f64]).collect();
    let (k, radius, id) = ArrowSpaceBuilder::new().compute_optimal_k(&rows, 100, 1, None, 42);
    debug!(
        "Optimal K={}, radius={:.6}, ID={} for 1D uniform",
        k, radius, id
    );
    assert!(k >= 2, "K should be at least 2");
    assert_eq!(id, 1, "Intrinsic dimension should be 1 for 1D data");
    assert!(radius > 0.0);
}
/// The smallest dataset exercised here (two 2D points): K >= 2 and a
/// non-negative radius.
#[test]
fn test_optimal_k_minimum_viable_dataset() {
    let pair = vec![vec![0.0, 0.0], vec![1.0, 1.0]];
    let outcome = ArrowSpaceBuilder::new().compute_optimal_k(&pair, 2, 2, None, 42);
    let (k, radius, id) = outcome;
    debug!("Optimal K={}, radius={:.6}, ID={} for N=2", k, radius, id);
    assert!(k >= 2, "K should be at least 2");
    assert!(radius >= 0.0);
}
/// Twenty all-zero rows with 1000 features: K in 2..=10 (N/2) and an
/// intrinsic dimension of at most 1000.
#[test]
fn test_optimal_k_very_high_dimensional() {
    let zero_rows = vec![vec![0.0; 1000]; 20];
    let outcome = ArrowSpaceBuilder::new().compute_optimal_k(&zero_rows, 20, 1000, None, 42);
    let (k, radius, id) = outcome;
    debug!(
        "Optimal K={}, radius={:.6}, ID={} for N=20, F=1000",
        k, radius, id
    );
    assert!(k >= 2);
    assert!(k <= 10, "K should not exceed N/2");
    assert!(id <= 1000);
}
/// Two features on very different scales (step 0.001 vs step 1000): K >= 2
/// and a positive radius.
#[test]
fn test_optimal_k_mixed_scale_features() {
    let rows: Vec<_> = (0..100)
        .map(|i| vec![(i as f64) * 0.001, (i as f64) * 1000.0])
        .collect();
    let (k, radius, _id) = ArrowSpaceBuilder::new().compute_optimal_k(&rows, 100, 2, None, 42);
    debug!(
        "Optimal K={}, radius={:.6} for mixed-scale features",
        k, radius
    );
    assert!(k >= 2);
    assert!(radius > 0.0);
}
/// Requesting k = 5 on two rows must yield two assignments, each below 2.
#[test]
fn test_kmeans_k_greater_than_n() {
    let two_rows = vec![vec![1.0], vec![2.0]];
    let labels = kmeans_lloyd(&two_rows, 5, 10, 128);
    assert_eq!(labels.len(), 2);
    for &label in &labels {
        assert!(label < 2, "Assignment {} is out of bounds for k=2", label);
    }
}
/// Calls `kmeans_lloyd` with k = 0.
///
/// Because of `#[should_panic]`, this test passes only if something in the
/// body panics — either the call itself or the emptiness assertion after it.
#[test]
#[should_panic]
fn test_kmeans_k_equals_zero() {
    let two_rows = vec![vec![1.0], vec![2.0]];
    let labels = kmeans_lloyd(&two_rows, 0, 10, 128);
    assert!(
        labels.is_empty(),
        "k=0 should return empty assignments"
    );
}
/// Calls `kmeans_lloyd` with k = 3 on a single row.
///
/// Because of `#[should_panic]`, this test passes only if something in the
/// body panics — either the call itself or one of the assertions after it.
#[test]
#[should_panic]
fn test_kmeans_single_row() {
    let single_row = vec![vec![1.0, 2.0]];
    let labels = kmeans_lloyd(&single_row, 3, 10, 128);
    assert_eq!(labels.len(), 1);
    assert_eq!(labels[0], 0, "Single row should be in cluster 0");
}
/// Two near-duplicate points plus one distant point with k = 3: every row
/// must receive a label below 3.
#[test]
fn test_kmeans_empty_cluster_recovery() {
    let rows = vec![vec![0.0, 0.0], vec![0.001, 0.001], vec![100.0, 100.0]];
    let labels = kmeans_lloyd(&rows, 3, 20, 128);
    assert_eq!(labels.len(), 3);
    for &label in &labels {
        assert!(label < 3, "Assignment out of bounds");
    }
}
/// Twenty identical points with k = 3: every point must share the label of
/// the first one.
#[test]
fn test_kmeans_convergence_early_stop() {
    let identical = vec![vec![5.0, 5.0]; 20];
    let labels = kmeans_lloyd(&identical, 3, 100, 128);
    assert_eq!(labels.len(), 20);
    let reference = labels[0];
    let all_same = labels.iter().all(|&label| label == reference);
    assert!(all_same);
}
/// Smoke test of `compute_optimal_k` on two tight pairs of 2D points:
/// K >= 2, positive radius, intrinsic dimension <= 2.
#[test]
fn test_clustering_heuristic_trait_interface() {
    let two_pairs = vec![
        vec![0.0, 0.0],
        vec![0.1, 0.1],
        vec![10.0, 10.0],
        vec![10.1, 10.1],
    ];
    let outcome = ArrowSpaceBuilder::new().compute_optimal_k(&two_pairs, 4, 2, None, 42);
    let (k, radius, id) = outcome;
    debug!("Trait interface: K={}, radius={:.6}, ID={}", k, radius, id);
    assert!(k >= 2);
    assert!(radius > 0.0, "Radius should be positive, got {}", radius);
    assert!(id <= 2);
}
/// Times `compute_optimal_k` on 10 000 unseeded random 4D points and requires
/// it to finish in under 30 seconds. Ignored by default.
#[test]
#[ignore = "takes time to run, run separatly"]
#[serial]
fn test_optimal_k_performance_large_dataset() {
    use std::time::Instant;
    let rows: Vec<Vec<f64>> = (0..10000)
        .map(|_| (0..4).map(|_| rand::random::<f64>()).collect())
        .collect();
    let builder = ArrowSpaceBuilder::new();
    // Only the heuristic itself is timed, not data generation.
    let timer = Instant::now();
    let (k, radius, id) = builder.compute_optimal_k(&rows, rows.len(), 4, None, 42);
    let elapsed = timer.elapsed();
    debug!(
        "Large dataset (N=10000, F=4): K={}, radius={:.6}, ID={}, time={:?}",
        k, radius, id, elapsed
    );
    assert!(elapsed.as_secs() < 30, "Should complete within 30s");
}
/// Determinism check for `compute_optimal_k` on two diagonal clusters of ten
/// points (starting at 0 and at 5, step 0.1).
///
/// Two runs with seed 42 must agree on k, radius and intrinsic dimension; a
/// run with seed 99 must agree on k; and k must be exactly 2.
#[test]
fn test_consistent_results_with_seed() {
    // Ten points on the diagonal starting at `offset`, spaced 0.1 apart.
    let diagonal = |offset: f64| -> Vec<Vec<f64>> {
        (0..10)
            .map(|i| {
                let v = offset + i as f64 * 0.1;
                vec![v, v]
            })
            .collect()
    };
    let mut rows = diagonal(0.0);
    rows.extend(diagonal(5.0));
    let (n, f) = (rows.len(), rows[0].len());

    let builder = ArrowSpaceBuilder::new();
    let (k1, radius_1, id1) = builder.compute_optimal_k(&rows, n, f, None, 42);
    let (k2, radius_2, id2) = builder.compute_optimal_k(&rows, n, f, None, 42);

    // Same seed: all three outputs must match exactly.
    assert_eq!(k1, k2, "k must be identical across runs with the same seed");
    assert_eq!(
        radius_1, radius_2,
        "radius must be bit-identical across runs — \
         compute_threshold_from_pilot is deterministic with same seed"
    );
    assert_eq!(
        id1, id2,
        "id_est must be identical across runs — \
         Two-NN uses a seeded RNG"
    );

    // Different seed: k must not move on this clearly separated data.
    let (k3, _, _) = builder.compute_optimal_k(&rows, n, f, None, 99);
    assert_eq!(
        k1, k3,
        "k should be seed-stable on clearly separable data (got k={k1} vs k={k3})"
    );

    // Sanity checks on the values themselves.
    assert_eq!(
        k1, 2,
        "k should be 2 for a dataset with two well-separated clusters"
    );
    assert!(
        radius_1 > 0.0 && radius_1.is_finite(),
        "radius must be positive and finite (got {radius_1})"
    );
    assert!(
        (1..=f).contains(&id1),
        "id_est={id1} should be in [1, f={f}] for 2D data"
    );
    println!("✓ Determinism verified (n={n}, f={f}): k={k1}, radius={radius_1:.6}, id_est={id1}");
}
/// Two diagonal streaks of 50 points (starting at 0 and at 10, step 0.1):
/// K >= 2 and a positive radius.
#[test]
fn test_readme_example() {
    let mut rows: Vec<_> = (0..50)
        .map(|i| vec![(i as f64) * 0.1, (i as f64) * 0.1])
        .collect();
    rows.extend((0..50).map(|i| vec![10.0 + (i as f64) * 0.1, 10.0 + (i as f64) * 0.1]));
    let n = rows.len();
    let (k, radius, id) = ArrowSpaceBuilder::new().compute_optimal_k(&rows, n, 2, None, 42);
    debug!("README example: K={}, radius={:.6}, ID={}", k, radius, id);
    assert!(k >= 2, "Should detect at least 2 clusters");
    assert!(radius > 0.0);
}
/// Runs `start_clustering_dim_reduce` on 500 sparse rows with 100 000
/// features; it must finish in under 600 seconds and report a reduced
/// dimension below 10 000.
#[test]
#[serial]
fn test_fast_clustering_100k_dimensions_completes() {
    let n_items = 500;
    let n_features = 100_000;
    let mut rows: Vec<Vec<f64>> = Vec::with_capacity(n_items);
    for i in 0..n_items {
        let mut row = vec![0.0; n_features];
        // NOTE(review): the index does not depend on the loop counter, so all
        // ten iterations write the same cell and each row ends up with a
        // single non-zero entry — confirm whether ten distinct entries per
        // row were intended.
        for _ in 0..10 {
            row[(i * 7919 + i * i) % n_features] = 1.0;
        }
        rows.push(row);
    }
    let mut builder = ArrowSpaceBuilder::new()
        .with_lambda_graph(1.0, 15, 7, 2.0, Some(0.5))
        .with_dims_reduction(true, Some(0.3))
        .with_seed(999);
    let timer = std::time::Instant::now();
    let output = builder.start_clustering_dim_reduce(rows);
    let elapsed = timer.elapsed();
    assert!(
        elapsed.as_secs() < 600,
        "100k-dim clustering took {} seconds (expected <600)",
        elapsed.as_secs()
    );
    assert!(
        output.reduced_dim < 10_000,
        "Reduced dim {} should be much less than 100k",
        output.reduced_dim
    );
    debug!("✓ 100k-dim test passed in {:?}", elapsed);
}
/// With dimensionality reduction disabled, the output must carry no
/// projection matrix and keep the original 3 features.
#[test]
fn test_fast_clustering_no_reduction_fallback() {
    let rows: Vec<Vec<f64>> = vec![
        vec![1.0, 2.0, 3.0],
        vec![4.0, 5.0, 6.0],
        vec![7.0, 8.0, 9.0],
    ];
    let mut builder = ArrowSpaceBuilder::new()
        .with_seed(42)
        .with_dims_reduction(false, None);
    let result = builder.start_clustering_dim_reduce(rows);
    assert!(result.aspace.projection_matrix.is_none());
    assert_eq!(result.reduced_dim, 3);
}
use crate::search::taumode::TauMode;
/// Builds the shared fixture twice — once with automatic clustering settings
/// and once with `with_cluster_max_clusters(3)` / `with_cluster_radius(0.4)` —
/// and checks that the automatic K stays below 10, the manual K is exactly 3,
/// and the manual lambda range exceeds 0.5.
#[test]
fn test_with_cluster_max_clusters_override() {
    let rows = &*CLUSTERING_TEST_DATA;

    // Automatic configuration.
    let builder_auto = ArrowSpaceBuilder::new()
        .with_lambda_graph(0.5, 10, 5, 2.0, None)
        .with_synthesis(TauMode::Median)
        .with_dims_reduction(false, None)
        .with_inline_sampling(None);
    let (aspace_auto, _gl_auto) = builder_auto.build(rows.clone());
    let k_auto = aspace_auto.n_clusters;

    // Manual override of cluster count and radius.
    let builder_manual = ArrowSpaceBuilder::new()
        .with_lambda_graph(0.5, 10, 5, 2.0, None)
        .with_synthesis(TauMode::Median)
        .with_cluster_max_clusters(3)
        .with_cluster_radius(0.4);
    let (aspace_manual, _gl_manual) = builder_manual.build(rows.clone());
    let k_manual = aspace_manual.n_clusters;

    println!("Automatic K: {}, Manual K: {}", k_auto, k_manual);
    assert!(
        k_auto < 10,
        "Heuristic should produce modest cluster count (got {})",
        k_auto
    );
    assert_eq!(
        k_manual, 3,
        "Manual override should produce exactly 3 clusters (got {})",
        k_manual
    );

    // Lambda range = max - min, computed per build.
    let auto_max = aspace_auto
        .lambdas()
        .iter()
        .copied()
        .fold(f64::NEG_INFINITY, f64::max);
    let auto_min = aspace_auto
        .lambdas()
        .iter()
        .copied()
        .fold(f64::INFINITY, f64::min);
    let lambda_range_auto = auto_max - auto_min;

    let manual_max = aspace_manual
        .lambdas()
        .iter()
        .copied()
        .fold(f64::NEG_INFINITY, f64::max);
    let manual_min = aspace_manual
        .lambdas()
        .iter()
        .copied()
        .fold(f64::INFINITY, f64::min);
    let lambda_range_manual = manual_max - manual_min;

    println!(
        "Lambda range - Auto: {:.6}, Manual: {:.6}",
        lambda_range_auto, lambda_range_manual
    );
    assert!(
        lambda_range_manual > 0.5,
        "Manual topology should have good lambda spread"
    );
    assert_eq!(
        aspace_auto.n_clusters, k_auto,
        "Cluster metadata should match (auto)"
    );
    assert_eq!(
        aspace_manual.n_clusters, k_manual,
        "Cluster metadata should match (manual)"
    );
}
/// Builds the shared fixture with a loose radius (50.0, up to 10 clusters) and
/// a tight radius (2.0, up to 15 clusters). The tight build must yield at
/// least as many clusters as the loose one and at least 5; both builds must
/// store the configured radius.
#[test]
fn test_with_cluster_radius_tightness() {
    let rows = &*CLUSTERING_TEST_DATA;

    let loose_builder = ArrowSpaceBuilder::new()
        .with_lambda_graph(0.5, 10, 5, 2.0, None)
        .with_cluster_max_clusters(10)
        .with_cluster_radius(50.0)
        .with_synthesis(TauMode::Median);
    let (aspace_loose, _) = loose_builder.build(rows.clone());
    let k_loose = aspace_loose.n_clusters;

    let tight_builder = ArrowSpaceBuilder::new()
        .with_lambda_graph(0.5, 10, 5, 2.0, None)
        .with_cluster_max_clusters(15)
        .with_cluster_radius(2.0)
        .with_synthesis(TauMode::Median);
    let (aspace_tight, _) = tight_builder.build(rows.clone());
    let k_tight = aspace_tight.n_clusters;

    println!("Loose radius K: {}, Tight radius K: {}", k_loose, k_tight);
    assert!(
        k_tight >= k_loose,
        "Tighter radius should produce more clusters (tight={}, loose={})",
        k_tight,
        k_loose
    );
    assert!(
        k_tight >= 5,
        "Tight radius should discover at least 5 clusters (got {})",
        k_tight
    );

    // The radius stored on the result must match what the builder was given.
    let loose_error = (aspace_loose.cluster_radius - 50.0).abs();
    assert!(
        loose_error < 0.1,
        "Stored radius should match builder config (expected 50.0, got {})",
        aspace_loose.cluster_radius
    );
    let tight_error = (aspace_tight.cluster_radius - 2.0).abs();
    assert!(
        tight_error < 0.1,
        "Stored radius should match builder config (expected 2.0, got {})",
        aspace_tight.cluster_radius
    );
    println!(
        "Verified radius storage: loose={:.1}, tight={:.1}",
        aspace_loose.cluster_radius, aspace_tight.cluster_radius
    );
}
/// Builds the shared fixture with 30 clusters, radius 0.3 and dimensionality
/// reduction enabled. Checks the stored cluster count and radius, a lambda
/// spread above 0.5, and fewer than 5 lambdas below 0.01.
#[test]
fn test_dense_mesh_topology() {
    let rows = &*CLUSTERING_TEST_DATA;
    let target_k = 30;
    let tight_radius = 0.3;

    let builder = ArrowSpaceBuilder::new()
        .with_lambda_graph(0.5, 10, 5, 2.0, None)
        .with_cluster_max_clusters(target_k)
        .with_cluster_radius(tight_radius)
        .with_dims_reduction(true, Some(0.2))
        .with_synthesis(TauMode::Median);
    let (aspace, _gl) = builder.build(rows.clone());

    assert_eq!(
        aspace.n_clusters, target_k,
        "Should respect manual cluster count"
    );
    let radius_error = (aspace.cluster_radius - tight_radius).abs();
    assert!(
        radius_error < 0.01,
        "Should store configured radius (expected {}, got {})",
        tight_radius,
        aspace.cluster_radius
    );

    // Lambda spread = max - min over all lambdas.
    let lambda_max = aspace
        .lambdas()
        .iter()
        .copied()
        .fold(f64::NEG_INFINITY, f64::max);
    let lambda_min = aspace
        .lambdas()
        .iter()
        .copied()
        .fold(f64::INFINITY, f64::min);
    let lambda_spread = lambda_max - lambda_min;
    println!(
        "Dense mesh: {} clusters, lambda spread: {:.6}",
        aspace.n_clusters, lambda_spread
    );
    assert!(
        lambda_spread > 0.5,
        "Rich topology should produce good lambda spread (got {:.6})",
        lambda_spread
    );

    // Count the lambdas that sit below 0.01.
    let near_zero_count = aspace.lambdas().iter().filter(|&&l| l < 0.01).count();
    assert!(
        near_zero_count < 5,
        "Dense mesh should minimize clustered lambdas at minimum (found {})",
        near_zero_count
    );
    println!(
        "Lambdas near zero: {} ({:.1}%)",
        near_zero_count,
        (near_zero_count as f64 / aspace.nitems as f64) * 100.0
    );
}
/// `step1_bounds` on the shared fixture with `effective_dim = Some(50)`:
/// k_min must be 4, k_max must lie in [k_min + 1, min(n/10 + 1, n/2)] and the
/// intrinsic-dimension estimate must be at least 1.
#[test]
fn test_step1_bounds_with_projection_basic() {
    let builder = ArrowSpaceBuilder::default();
    let rows = &*CLUSTERING_TEST_DATA;
    let (n, f) = (rows.len(), rows[0].len());
    let bounds = builder.step1_bounds(rows, n, f, Some(50), 42);
    let (k_min, k_max, id_est) = bounds;

    assert_eq!(k_min, 4, "k_min should be 4");
    assert!(
        k_max <= n / 10 + 1,
        "k_max should be near n/10=9, the binding constraint for n=99 (got {k_max})"
    );
    assert!(
        k_max >= k_min + 1,
        "k_max must always exceed k_min to allow a valid search range (got {k_max})"
    );
    assert!(
        k_max <= n / 2,
        "k_max should never exceed n/2={} as a sanity cap (got {k_max})",
        n / 2
    );
    assert!(id_est >= 1, "id_est must be at least 1 (got {id_est})");
    println!(
        "✓ Projection basic (n={n}, f={f}, effective_dim=50): \
         k_min={k_min}, k_max={k_max}, id_est={id_est}"
    );
}
/// `step1_bounds` must return the same k_max on the shared fixture whether no
/// effective dimension is given or `Some(500)` is.
#[test]
#[serial]
fn test_step1_bounds_projection_dominates_over_ambient() {
    let builder = ArrowSpaceBuilder::default();
    let rows = &*CLUSTERING_TEST_DATA;
    let (n, f) = (rows.len(), rows[0].len());

    let k_max_no_proj = builder.step1_bounds(rows, n, f, None, 42).1;
    let k_max_with_proj = builder.step1_bounds(rows, n, f, Some(500), 42).1;

    assert_eq!(
        k_max_no_proj, k_max_with_proj,
        "Projected and capped-ambient should give same k_max"
    );
    println!(
        "✓ Projection vs capped ambient: no_proj={}, with_proj={}",
        k_max_no_proj, k_max_with_proj
    );
}
/// `step1_bounds` on the shared fixture with an effective dimension of 50:
/// k_max must lie in 9..=100 (100 being 2 * effective_dim).
#[test]
#[serial]
fn test_step1_bounds_small_effective_dim() {
    let builder = ArrowSpaceBuilder::default();
    let rows = &*CLUSTERING_TEST_DATA;
    let n = rows.len();
    let f = rows[0].len();
    let effective_dim = 50;
    let (k_min, k_max, _id_est) = builder.step1_bounds(rows, n, f, Some(effective_dim), 42);
    assert!(
        k_max >= 9,
        "k_max should be at least sqrt(n) = 9 (is {k_max})"
    );
    assert!(
        k_max <= 100,
        "k_max should be ≤ 2*effective_dim = 100 (is {k_max})"
    );
    // Fixed: the log used to hard-code `effective_dim=100`, although the test
    // runs with 50; print the value that was actually used.
    println!(
        "✓ Small effective_dim={}: k_min={}, k_max={}",
        effective_dim, k_min, k_max
    );
}
/// `step1_bounds` on the shared fixture with `effective_dim = Some(50)`:
/// k_max must not exceed 100.
///
/// NOTE(review): nothing here checks that the effective dimension really is
/// smaller than the estimated intrinsic dimension, as the test name and log
/// line suggest — confirm against the fixture.
#[test]
#[serial]
fn test_step1_bounds_effective_dim_smaller_than_id() {
    let builder = ArrowSpaceBuilder::default();
    let rows = &*CLUSTERING_TEST_DATA;
    let (n, f) = (rows.len(), rows[0].len());
    let effective_dim = Some(50);
    let bounds = builder.step1_bounds(rows, n, f, effective_dim, 42);
    let (_k_min, k_max, id_est) = bounds;
    println!(
        "✓ Effective_dim < id_est: effective={}, id_est={}, k_max={}",
        effective_dim.unwrap(),
        id_est,
        k_max
    );
    assert!(k_max <= 100, "k_max limited by min of candidates");
}
/// `step1_bounds` on the shared fixture with `effective_dim = Some(120)`:
/// k_max must not exceed 100.
///
/// NOTE(review): the message refers to an n/10 bound while the check uses the
/// constant 100 — confirm the intended limit for this fixture.
#[test]
#[serial]
fn test_step1_bounds_effective_dim_equals_intrinsic() {
    let builder = ArrowSpaceBuilder::default();
    let rows = &*CLUSTERING_TEST_DATA;
    let (n, f) = (rows.len(), rows[0].len());
    let effective_dim = Some(120);
    let bounds = builder.step1_bounds(rows, n, f, effective_dim, 42);
    let (_k_min, k_max, id_est) = bounds;
    println!(
        "✓ Effective_dim ≈ 2×id_est: effective={}, id_est={}, k_max={}",
        effective_dim.unwrap(),
        id_est,
        k_max
    );
    assert!(k_max <= 100, "k_max should be bounded by n/10");
}
/// `step1_bounds` on the shared fixture with `effective_dim = Some(100)`:
/// k_min must be 4, k_max must lie in [k_min + 1, min(n/10 + 1, n/2)] and the
/// intrinsic-dimension estimate must be at least 1.
#[test]
#[serial]
fn test_step1_bounds_higher_effective_dim() {
    let builder = ArrowSpaceBuilder::default();
    let rows = &*CLUSTERING_TEST_DATA;
    let (n, f) = (rows.len(), rows[0].len());
    let bounds = builder.step1_bounds(rows, n, f, Some(100), 42);
    let (k_min, k_max, id_est) = bounds;

    assert_eq!(k_min, 4, "k_min should be 4 for n=99");
    assert!(
        k_max <= n / 10 + 1,
        "k_max should be near n/10=9 — the binding constraint for n=99 (got {k_max})"
    );
    assert!(
        k_max >= k_min + 1,
        "k_max must exceed k_min to allow a valid search range (got {k_max})"
    );
    assert!(
        k_max <= n / 2,
        "k_max must never exceed n/2={} (got {k_max})",
        n / 2
    );
    assert!(id_est >= 1, "id_est must be at least 1 (got {id_est})");
    println!(
        "✓ Higher effective_dim (n={n}, f={f}, effective_dim=100): \
         k_min={k_min}, k_max={k_max}, id_est={id_est}"
    );
}