use rayon::iter::{IntoParallelIterator, ParallelIterator};
use std::sync::atomic::{AtomicU32, Ordering};
use num_traits::Float;
use num_traits::cast::FromPrimitive;
use hdrhistogram::Histogram;
use indxvec::{Indices, Vecops};
use cpu_time::ProcessTime;
use std::time::SystemTime;
use hnsw_rs::hnsw::DataId;
use super::kgraph::*;
pub struct Hubness<'a, F> {
kgraph: &'a KGraph<F>,
counts: Vec<u32>,
}
impl<'a, F> Hubness<'a, F>
where
F: FromPrimitive + Float + std::fmt::UpperExp + Sync + Send + std::iter::Sum,
{
pub fn new(kgraph: &'a KGraph<F>) -> Self {
log::debug!("doing hubness estimation ...");
let cpu_start = ProcessTime::now();
let sys_start = SystemTime::now();
let nb_nodes = kgraph.get_nb_nodes();
let mut counts_atom = Vec::<AtomicU32>::with_capacity(nb_nodes);
for _ in 0..nb_nodes {
counts_atom.push(AtomicU32::new(0));
}
let scan_node = |node: usize, counts_atom: &Vec<AtomicU32>| {
let neighbours = kgraph.get_out_edges_by_idx(node);
for edge in neighbours {
let n = edge.node;
counts_atom[n].fetch_add(1, Ordering::SeqCst);
}
};
(0..nb_nodes)
.into_par_iter()
.for_each(|n| scan_node(n, &counts_atom));
let mut counts = Vec::<u32>::with_capacity(nb_nodes);
for atom in &counts_atom {
counts.push(atom.load(Ordering::Relaxed));
}
log::debug!(
" hubness estimation, sys time(ms) {:?} cpu time {:?}",
sys_start.elapsed().unwrap().as_millis(),
cpu_start.elapsed().as_millis()
);
Hubness { kgraph, counts }
}
pub fn get_counts(&self) -> &Vec<u32> {
&self.counts
}
pub fn get_standard3m(&self) -> f64 {
if self.counts.len() <= 1 {
return 0.;
}
let mu = self.counts.iter().sum::<u32>() as f64 / self.counts.len() as f64;
let mut sum2 = 0f64;
let mut sum3 = 0.;
let mut incr;
for x in &self.counts {
incr = (f64::from(*x) - mu) * (f64::from(*x) - mu);
sum2 += incr;
sum3 += incr * (f64::from(*x) - mu);
}
sum3 /= self.counts.len() as f64;
let sigma = (sum2 / (self.counts.len() - 1) as f64).sqrt();
sum3 / sigma.powi(3)
}
pub fn get_hubness_histogram(&self) -> Result<Histogram<u32>, anyhow::Error> {
let max_value = 2 * (self.counts.len() as f64).sqrt() as u64;
let prec = 1u32;
assert!(
prec >= 1,
"precision for histogram construction should range >= 1"
);
let histo = Histogram::<u32>::new_with_bounds(1, max_value, 1);
if histo.is_err() {
log::error!(
"hubness::get_hubness_histogram, could not create histogram , error : {:?}",
histo.as_ref().err()
);
return Err(anyhow::anyhow!("histogram construction failed"));
}
let mut histo = histo.unwrap();
let mut nb_out_histo = 0u32;
for v in &self.counts {
let res = histo.record(*v as u64);
if res.is_err() {
nb_out_histo += 1;
}
}
if nb_out_histo > 0 {
log::warn!(
"number of too large values : {}, maximum value : {}",
nb_out_histo,
max_value
);
}
let quantiles = vec![0.1, 0.25, 0.5, 0.75, 0.9, 0.99, 0.999, 0.9999];
let thresholds = quantiles
.iter()
.map(|f| histo.value_at_quantile(*f))
.collect::<Vec<u64>>();
log::info!("\n hubness quantiles : ");
log::info!("======================");
log::info!("quantiles : {:?}", quantiles);
log::info!("thresholds : {:?}", thresholds);
log::info!("\n");
Ok(histo)
}
pub fn get_largest_hubs_by_dataid(&self, first_asked: usize) -> Vec<(DataId, usize)> {
let first = first_asked.min(self.counts.len());
let mut hubs_dataid = Vec::<(DataId, usize)>::with_capacity(first);
let ranks = self.counts.rank(false);
let index = ranks.invindex();
log::info!("get_largest_hubs_by_dataid");
for (i, index) in index.iter().enumerate().take(first) {
let data_id = self.kgraph.get_data_id_from_idx(*index).unwrap();
if i <= 10 {
log::info!(
"index : {} , rank : {}, count : {} , data_id : {:?}",
i,
index,
self.counts[*index],
data_id
);
}
hubs_dataid.push((*data_id, self.counts[*index] as usize));
}
hubs_dataid
}
pub fn get_dataid_hubness(&self, data_id: &DataId) -> usize {
let index = self.kgraph.get_idx_from_dataid(data_id).unwrap();
self.counts[index] as usize
} }