use ndarray::s;
use hnsw_rs::prelude::*;
use annembed::prelude::*;
use annembed::utils::mnistio::*;
use csv::*;
use cpu_time::ProcessTime;
use std::time::{Duration, SystemTime};
use annembed::fromhnsw::hubness;
use annembed::fromhnsw::kgproj::KGraphProjection;
use annembed::fromhnsw::kgraph::{KGraph, kgraph_from_hnsw_all};
/// Directory handed to `load_mnist_train_data` / `load_mnist_test_data` to locate the MNIST files.
///
/// NOTE(review): this is an absolute path on one developer's machine; it must be
/// edited before the example or its test can run anywhere else.
const MNIST_DIGITS_DIR: &str = "/home/jpboth/Data/ANN/MNIST/";
#[allow(clippy::range_zip_with_len)]
pub fn main() {
let _ = env_logger::builder().is_test(true).try_init();
let digits_data = load_mnist_train_data(MNIST_DIGITS_DIR).unwrap();
let mut labels = digits_data.get_labels().to_vec();
let images = digits_data.get_images();
let (_, _, nbimages) = images.dim();
let mut images_as_v = Vec::<Vec<f32>>::with_capacity(nbimages);
for k in 0..nbimages {
let v: Vec<f32> = images
.slice(s![.., .., k])
.iter()
.map(|v| *v as f32)
.collect();
images_as_v.push(v);
}
let digits_test_data = load_mnist_test_data(MNIST_DIGITS_DIR).unwrap();
labels.append(&mut digits_test_data.get_labels().to_vec());
let images = digits_test_data.get_images();
let (_, _, nbimages) = images.dim();
for k in 0..nbimages {
let v: Vec<f32> = images
.slice(s![.., .., k])
.iter()
.map(|v| *v as f32)
.collect();
images_as_v.push(v);
}
let ef_c = 50;
let max_nb_connection = 70;
let nbimages = images_as_v.len();
let nb_layer = 16.min((nbimages as f32).ln().trunc() as usize);
let cpu_start = ProcessTime::now();
let sys_now = SystemTime::now();
let hnsw = Hnsw::<f32, DistL2>::new(max_nb_connection, nbimages, nb_layer, ef_c, DistL2 {});
let data_with_id: Vec<(&Vec<f32>, usize)> =
images_as_v.iter().zip(0..images_as_v.len()).collect();
hnsw.parallel_insert(&data_with_id);
let cpu_time: Duration = cpu_start.elapsed();
println!(
" ann construction sys time(s) {:?} cpu time {:?}",
sys_now.elapsed().unwrap().as_secs(),
cpu_time.as_secs()
);
hnsw.dump_layer_info();
let kgraph: KGraph<f32>;
let graphprojection: KGraphProjection<f32>;
let mut embed_params = EmbedderParams::default();
embed_params.nb_grad_batch = 30;
embed_params.scale_rho = 1.;
embed_params.beta = 1.;
embed_params.b = 1.;
embed_params.grad_step = 1.;
embed_params.nb_sampling_by_edge = 10;
embed_params.dmap_init = true;
embed_params.hubness_weighting = false;
let mut embedder;
let hierarchical = false;
if !hierarchical {
let knbn = 6;
log::info!("calling kgraph.init_from_hnsw_all");
kgraph = kgraph_from_hnsw_all(&hnsw, knbn).unwrap();
log::info!("minimum number of neighbours {}", kgraph.get_max_nbng());
embedder = Embedder::new(&kgraph, embed_params);
let embed_res = embedder.embed();
assert!(embed_res.is_ok());
assert!(embedder.get_embedded().is_some());
} else {
log::info!("graph projection");
embed_params.nb_grad_batch = 20;
let knbn = 6;
embed_params.grad_factor = 4; graphprojection = KGraphProjection::<f32>::new(&hnsw, knbn, 1);
embedder = Embedder::from_hkgraph(&graphprojection, embed_params);
let embed_res = embedder.embed();
assert!(embed_res.is_ok());
assert!(embedder.get_embedded().is_some());
}
println!(
" ann embed time time {:.2e} s, cpu time : {}",
sys_now.elapsed().unwrap().as_secs(),
cpu_start.elapsed().as_secs()
);
log::info!("dumping initial embedding in csv file");
let mut csv_w = Writer::from_path("mnist_init_digits.csv").unwrap();
let _res = write_csv_labeled_array2(
&mut csv_w,
labels.as_slice(),
&embedder.get_initial_embedding_reindexed(),
);
csv_w.flush().unwrap();
log::info!("dumping in csv file");
let mut csv_w = Writer::from_path("mnist_digits.csv").unwrap();
let _res = write_csv_labeled_array2(
&mut csv_w,
labels.as_slice(),
&embedder.get_embedded_reindexed(),
);
csv_w.flush().unwrap();
let _quality = embedder.get_quality_estimate_from_edge_length(50);
let knbn = 25;
let kgraph: KGraph<f32> = kgraph_from_hnsw_all(&hnsw, knbn).unwrap();
log::info!("minimum number of neighbours {}", kgraph.get_max_nbng());
log::info!("dimension estimation...");
let cpu_start = ProcessTime::now();
let sys_now = SystemTime::now();
let sampling_size = 10000;
let dim_stat = kgraph.estimate_intrinsic_dim(sampling_size);
let cpu_time: Duration = cpu_start.elapsed();
println!(
" dimension estimation sys time(ms) : {:.3e}, cpu time(ms) {:?}",
sys_now.elapsed().unwrap().as_millis(),
cpu_time.as_millis()
);
if dim_stat.is_ok() {
let dim_stat = dim_stat.unwrap();
log::info!(
"\n dimension estimation with nbpoints : {}, dim : {:.3e}, sigma = {:.3e} \n",
sampling_size,
dim_stat.0,
dim_stat.1
);
println!(
" dimension estimation with nbpoints : {}, dim : {:.3e}, sigma = {:.3e}",
sampling_size, dim_stat.0, dim_stat.1
);
}
let sampling_size = 10000;
let cpu_start = ProcessTime::now();
let sys_now = SystemTime::now();
let dim_stat = kgraph.estimate_intrinsic_dim_2nn(sampling_size);
let cpu_time: Duration = cpu_start.elapsed();
println!(
"\n estimate_intrinsic_dim_2nn estimation sys time(ms) : {:.3e}, cpu time(ms) {:.5e}\n",
sys_now.elapsed().unwrap().as_millis(),
cpu_time.as_millis()
);
if dim_stat.is_ok() {
let dim_stat = dim_stat.unwrap();
log::info!(
"\n estimate_intrinsic_dim_2nn dimension estimation with nbpoints : {}, dim : {:.5e} \n",
sampling_size,
dim_stat,
);
println!(
" estimate_intrinsic_dim_2nn dimension estimation with nbpoints : {}, dim : {:.5e}",
sampling_size, dim_stat
);
}
let hubness = hubness::Hubness::new(&kgraph);
let s3_hubness = hubness.get_standard3m();
log::info!("\n graph hubness asymetry estimation : {:.3e}", s3_hubness);
println!(
"\n graph hubness asymetry estimation : {:.3e} \n",
s3_hubness
);
let _histo = hubness.get_hubness_histogram();
let largest = hubness.get_largest_hubs_by_dataid(20);
println!(" largest hubs, id, count and labels");
println!(" id count label");
for (id, count) in largest {
println!(" {:>5} {:>5} {:>5} ", id, count, labels[id]);
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Spot-checks two pixels and the first and last labels of the MNIST training set.
    ///
    /// Requires the MNIST files to be present under `MNIST_DIGITS_DIR`.
    #[test]
    fn test_load_mnist_digits() {
        // Use the same loader `main` uses for the training split.
        let digits_data = load_mnist_train_data(MNIST_DIGITS_DIR)
            .expect("MNIST train data should be loadable from MNIST_DIGITS_DIR");

        let images = digits_data.get_images();
        assert_eq!(0x3c, *images.get([9, 14, 9]).unwrap());
        assert_eq!(0xfd, *images.get([14, 9, 9]).unwrap());

        let labels = digits_data.get_labels();
        assert_eq!(5, labels[0]);
        assert_eq!(8, labels[labels.len() - 1]);
    }
}