#![allow(clippy::unwrap_used, clippy::expect_used)]
use vicinity::lid::{
aggregate_lid, estimate_lid_batch, estimate_twonn, estimate_twonn_with_ci, LidAggregation,
LidCategory, LidConfig, LidStats,
};
fn main() {
println!("Local Intrinsic Dimensionality (LID) Demonstration");
println!("===================================================\n");
demo_synthetic_sphere();
demo_twonn_vs_mle();
demo_outlier_detection();
demo_confidence_intervals();
}
fn demo_synthetic_sphere() {
println!("1. Estimating intrinsic dimension of embedded manifolds");
println!(" -----------------------------------------------\n");
let n = 1000; let true_id = 2;
let ambient_dim = 10;
println!(
" Generating {} points on {}-dim manifold in {}-dim space...",
n, true_id, ambient_dim
);
let vectors = generate_sphere_points(n, true_id, ambient_dim, 0.001);
let flat_vectors: Vec<f32> = vectors.iter().flatten().copied().collect();
let sample_idx = 0;
let query = &flat_vectors[sample_idx * ambient_dim..(sample_idx + 1) * ambient_dim];
let mut dists: Vec<f32> = (1..n.min(50))
.map(|j| {
let other = &flat_vectors[j * ambient_dim..(j + 1) * ambient_dim];
query
.iter()
.zip(other.iter())
.map(|(a, b)| (a - b).powi(2))
.sum::<f32>()
.sqrt()
})
.collect();
dists.sort_by(|a, b| a.partial_cmp(b).unwrap());
println!("\n Sample distances from point 0 (first 10):");
for (i, d) in dists.iter().take(10).enumerate() {
println!(" d[{}] = {:.4}", i + 1, d);
}
let config = LidConfig {
k: 30, epsilon: 1e-10,
};
let estimates = estimate_lid_batch(&flat_vectors, ambient_dim, &config);
let mean_lid = aggregate_lid(&estimates, LidAggregation::Mean);
let median_lid = aggregate_lid(&estimates, LidAggregation::Median);
let harmonic_lid = aggregate_lid(&estimates, LidAggregation::HarmonicMean);
println!("\n MLE estimates (k={}):", config.k);
println!(" Mean LID: {:.2} (true: {})", mean_lid, true_id);
println!(" Median LID: {:.2} (true: {})", median_lid, true_id);
println!(" Harmonic LID: {:.2} (true: {})", harmonic_lid, true_id);
let mu_ratios = compute_twonn_ratios(&flat_vectors, ambient_dim);
let twonn_lid = estimate_twonn(&mu_ratios, 0.1);
println!("\n TwoNN estimate: {:.2} (true: {})", twonn_lid, true_id);
let error = ((median_lid - true_id as f32) / true_id as f32).abs();
if error < 0.5 {
println!("\n [PASS] Median MLE estimate within 50% of true dimension");
} else {
println!(
"\n [NOTE] MLE estimate differs from true dimension by {:.0}%",
error * 100.0
);
}
let twonn_error = ((twonn_lid - true_id as f32) / true_id as f32).abs();
if twonn_error < 0.5 {
println!(" [PASS] TwoNN estimate within 50% of true dimension");
} else {
println!(
" [NOTE] TwoNN estimate differs from true dimension by {:.0}%",
twonn_error * 100.0
);
}
println!();
}
/// Demo 2: prints a table comparing the median MLE estimate with the TwoNN
/// estimate on three synthetic datasets of increasing intrinsic and ambient
/// dimension.
fn demo_twonn_vs_mle() {
    println!("2. Comparing TwoNN vs MLE estimators");
    println!(" ----------------------------------\n");

    // (label, intrinsic dimension, ambient dimension, number of points)
    let test_cases = [
        ("2D data in 10D", 2, 10, 300),
        ("5D data in 50D", 5, 50, 500),
        ("10D data in 100D", 10, 100, 1000),
    ];

    println!(
        " {:25} {:>8} {:>10} {:>10}",
        "Dataset", "True ID", "MLE", "TwoNN"
    );
    println!(" {}", "-".repeat(55));

    // The estimator settings are the same for every dataset.
    let config = LidConfig {
        k: 20,
        epsilon: 1e-10,
    };

    for &(label, intrinsic, ambient, count) in &test_cases {
        let flat_vectors: Vec<f32> = generate_sphere_points(count, intrinsic, ambient, 0.01)
            .into_iter()
            .flatten()
            .collect();

        let per_point = estimate_lid_batch(&flat_vectors, ambient, &config);
        let mle_lid = aggregate_lid(&per_point, LidAggregation::Median);

        let mu_ratios = compute_twonn_ratios(&flat_vectors, ambient);
        let twonn_lid = estimate_twonn(&mu_ratios, 0.1);

        println!(
            " {:25} {:>8} {:>10.2} {:>10.2}",
            label, intrinsic, mle_lid, twonn_lid
        );
    }
    println!();
}
/// Demo 3: appends a few far-away points to a low-dimensional point set and
/// counts how many of them the LID statistics categorise as `High`.
fn demo_outlier_detection() {
    println!("3. Outlier detection using LID");
    println!(" ---------------------------\n");

    let n_normal: usize = 100;
    let n_outliers: usize = 5;
    let dim: usize = 10;
    println!(
        " Generating {} normal points + {} outliers in {}D...",
        n_normal, n_outliers, dim
    );

    // Regular points first, then the outliers, so every index >= n_normal is
    // a known outlier. Outlier coordinates are a sine wave scaled by 5.
    let mut vectors = generate_sphere_points(n_normal, 2, dim, 0.01);
    vectors.extend((0..n_outliers).map(|i| {
        (0..dim)
            .map(|j| ((i * dim + j) as f32 * 0.7).sin() * 5.0)
            .collect::<Vec<f32>>()
    }));
    let flat_vectors: Vec<f32> = vectors.concat();

    let config = LidConfig {
        k: 15,
        epsilon: 1e-10,
    };
    let estimates = estimate_lid_batch(&flat_vectors, dim, &config);
    let stats = LidStats::from_estimates(&estimates);

    println!("\n LID statistics:");
    println!(" Mean: {:.2}", stats.mean);
    println!(" Median: {:.2}", stats.median);
    println!(" Std: {:.2}", stats.std_dev);
    println!(" Min: {:.2}", stats.min);
    println!(" Max: {:.2}", stats.max);
    println!(" High threshold: {:.2}", stats.high_lid_threshold());

    // A hit is a known outlier (by index) that is also categorised as High.
    let detected_outliers = estimates
        .iter()
        .enumerate()
        .filter(|(idx, estimate)| {
            let flagged_high = stats.categorize(estimate.lid) == LidCategory::High;
            let known_outlier = *idx >= n_normal;
            known_outlier && flagged_high
        })
        .count();

    println!(
        "\n Outliers detected via high LID: {}/{}",
        detected_outliers, n_outliers
    );
    if detected_outliers > 0 {
        println!(" [PASS] LID successfully identified outliers");
    }
    println!();
}
/// Demo 4: runs TwoNN with confidence intervals on three sample sizes and
/// notes, per row, whether the reported interval contains the true dimension.
fn demo_confidence_intervals() {
    println!("4. TwoNN with confidence intervals");
    println!(" --------------------------------\n");

    // (label, number of points)
    let test_cases = [
        ("Small sample (n=50)", 50),
        ("Medium sample (n=200)", 200),
        ("Large sample (n=1000)", 1000),
    ];
    let true_id: usize = 3;
    let ambient_dim: usize = 20;
    let target = true_id as f32;

    println!(" True ID = {}, ambient dim = {}\n", true_id, ambient_dim);
    println!(
        " {:25} {:>10} {:>8} {:>18}",
        "Sample size", "Estimate", "SE", "95% CI"
    );
    println!(" {}", "-".repeat(65));

    for &(label, sample_size) in &test_cases {
        let flat_vectors: Vec<f32> =
            generate_sphere_points(sample_size, true_id, ambient_dim, 0.01).concat();
        let mu_ratios = compute_twonn_ratios(&flat_vectors, ambient_dim);
        let result = estimate_twonn_with_ci(&mu_ratios, 0.1);

        println!(
            " {:25} {:>10.2} {:>8.3} [{:>6.2}, {:>6.2}]",
            label, result.dimension, result.std_error, result.ci_lower, result.ci_upper
        );

        // Both interval bounds are inclusive.
        let above_lower = result.ci_lower <= target;
        let below_upper = target <= result.ci_upper;
        if above_lower && below_upper {
            println!(" {:25} (true value {} is within CI)", "", true_id);
        }
    }
    println!();
}
/// Generates `n` deterministic synthetic points in `ambient_dim` dimensions.
///
/// For each point, the first `min(intrinsic_dim, ambient_dim)` coordinates are
/// set to a value in `[-1, 1)` derived from the fractional part of
/// `(i + 0.5) * alpha_j`, where `alpha_j = 1 / (j + 1 + phi)`; all remaining
/// coordinates start at zero. Every coordinate then receives a deterministic
/// sine/cosine offset whose magnitude is at most `|noise|`.
///
/// Note that, despite the name, the informative coordinates fill a cube rather
/// than lying on a sphere. No random number generator is involved, so repeated
/// calls with the same arguments return identical data.
fn generate_sphere_points(
    n: usize,
    intrinsic_dim: usize,
    ambient_dim: usize,
    noise: f32,
) -> Vec<Vec<f32>> {
    let phi = 1.618_034_f32;

    (0..n)
        .map(|row| {
            (0..ambient_dim)
                .map(|col| {
                    // Informative coordinate, or zero outside the intrinsic subspace.
                    let base = if col < intrinsic_dim {
                        let alpha = (1.0 / (col as f32 + 1.0 + phi)).fract();
                        let unit = ((row as f32 + 0.5) * alpha).fract();
                        unit * 2.0 - 1.0
                    } else {
                        0.0
                    };

                    // Deterministic perturbation keyed on the flat element index.
                    let t = (row * ambient_dim + col) as f32;
                    let jitter = noise * ((t * 0.414213).sin() + (t * 0.732051).cos()) * 0.5;

                    base + jitter
                })
                .collect()
        })
        .collect()
}
/// Computes the TwoNN ratio `r2 / r1` for every point in a row-major matrix,
/// where `r1` and `r2` are the Euclidean distances from that point to its
/// nearest and second-nearest other point.
///
/// `flat_vectors` holds `flat_vectors.len() / dim` points of `dim` coordinates
/// each; trailing elements that do not form a complete point are ignored.
///
/// A point contributes no ratio when:
/// - it has fewer than two usable neighbours, or
/// - its nearest-neighbour distance is not greater than `1e-10` (duplicate
///   points), which would make the ratio unbounded.
///
/// Distances that evaluate to NaN are skipped rather than causing a panic.
/// Returns an empty vector when `dim == 0`.
fn compute_twonn_ratios(flat_vectors: &[f32], dim: usize) -> Vec<f32> {
    // A zero-width point has no geometry, and `len / dim` would divide by zero.
    if dim == 0 {
        return Vec::new();
    }

    let n = flat_vectors.len() / dim;
    let mut ratios = Vec::with_capacity(n);

    for (i, query) in flat_vectors.chunks_exact(dim).enumerate() {
        // Only the two smallest distances matter, so track them in one pass
        // instead of collecting and sorting every distance. Squared distances
        // order identically to distances, so the square root is deferred.
        let mut nearest_sq = f32::INFINITY;
        let mut second_sq = f32::INFINITY;
        let mut neighbours = 0usize;

        for (j, other) in flat_vectors.chunks_exact(dim).enumerate() {
            if j == i {
                continue;
            }
            let dist_sq: f32 = query
                .iter()
                .zip(other)
                .map(|(a, b)| (a - b).powi(2))
                .sum();
            if dist_sq.is_nan() {
                continue;
            }
            neighbours += 1;
            if dist_sq < nearest_sq {
                second_sq = nearest_sq;
                nearest_sq = dist_sq;
            } else if dist_sq < second_sq {
                second_sq = dist_sq;
            }
        }

        if neighbours >= 2 {
            let r1 = nearest_sq.sqrt();
            let r2 = second_sq.sqrt();
            // Skip (near-)duplicate points: dividing by ~0 is meaningless.
            if r1 > 1e-10 {
                ratios.push(r2 / r1);
            }
        }
    }
    ratios
}