use std::collections::HashMap;
use ndarray::{Array1, Array2, Axis};
use num_traits::Float;
use quantiles::ckms::CKMS;
use sprs::{CsMat, TriMatBase};
use rayon::prelude::*;
use rand::Rng;
use crate::fromhnsw::kgraph::KGraph;
use crate::graphlaplace::*;
use crate::prelude::PROBA_MIN;
use crate::tools::{nodeparam::*, svdapprox::*};
pub(crate) fn to_proba_edges<F>(kgraph: &KGraph<F>, scale_rho: f32, beta: f32) -> NodeParams
where
F: Float
+ num_traits::cast::FromPrimitive
+ std::marker::Sync
+ std::marker::Send
+ std::fmt::UpperExp
+ std::iter::Sum,
{
let mut perplexity_q: CKMS<f32> = CKMS::<f32>::new(0.001);
let mut scale_q: CKMS<f32> = CKMS::<f32>::new(0.001);
let mut weight_q: CKMS<f32> = CKMS::<f32>::new(0.001);
let neighbour_hood = kgraph.get_neighbours();
let scale_perplexity = |i: usize| -> (usize, Option<(f32, NodeParam)>) {
if !neighbour_hood[i].is_empty() {
let node_param =
get_scale_from_proba_normalisation(kgraph, scale_rho, beta, &neighbour_hood[i]);
let perplexity = node_param.get_perplexity();
(i, Some((perplexity, node_param)))
} else {
(i, None)
}
};
let mut opt_node_params: Vec<(usize, Option<(f32, NodeParam)>)> =
Vec::<(usize, Option<(f32, NodeParam)>)>::new();
let mut node_params: Vec<NodeParam> = (0..neighbour_hood.len())
.map(|_| NodeParam::default())
.collect();
(0..neighbour_hood.len())
.into_par_iter()
.map(scale_perplexity)
.collect_into_vec(&mut opt_node_params);
let mut max_nbng = 0;
for opt_param in &opt_node_params {
match opt_param {
(i, Some(param)) => {
perplexity_q.insert(param.0);
scale_q.insert(param.1.scale);
let j = rand::rng().random_range(0..param.1.edges.len());
weight_q.insert(param.1.edges[j].weight);
max_nbng = param.1.edges.len().max(max_nbng);
assert_eq!(param.1.edges.len(), neighbour_hood[*i].len());
node_params[*i] = param.1.clone();
}
(i, None) => {
log::error!("to_proba_edges , node rank {}, has no neighbour, use hnsw.set_keeping_pruned(true)", i);
log::error!("to_proba_edges , node rank {}, has no neighbour, use hnsw.set_keeping_pruned(true)", i);
std::process::exit(1);
}
};
}
log::debug!("\n constructed initial space");
log::debug!(
"\n scales quantile at 0.05 : {:.2e} , 0.5 : {:.2e}, 0.95 : {:.2e}, 0.99 : {:.2e}",
scale_q.query(0.05).unwrap().1,
scale_q.query(0.5).unwrap().1,
scale_q.query(0.95).unwrap().1,
scale_q.query(0.99).unwrap().1
);
log::debug!(
"\n edge weight quantile at 0.05 : {:.2e} , 0.5 : {:.2e}, 0.95 : {:.2e}, 0.99 : {:.2e}",
weight_q.query(0.05).unwrap().1,
weight_q.query(0.5).unwrap().1,
weight_q.query(0.95).unwrap().1,
weight_q.query(0.99).unwrap().1
);
log::debug!(
"\n perplexity quantile at 0.05 : {:.2e} , 0.5 : {:.2e}, 0.95 : {:.2e}, 0.99 : {:.2e}",
perplexity_q.query(0.05).unwrap().1,
perplexity_q.query(0.5).unwrap().1,
perplexity_q.query(0.95).unwrap().1,
perplexity_q.query(0.99).unwrap().1
);
log::debug!("");
NodeParams::new(node_params, max_nbng)
}
fn get_scale_from_proba_normalisation<F>(
kgraph: &KGraph<F>,
scale_rho: f32,
beta: f32,
neighbours: &[OutEdge<F>],
) -> NodeParam
where
F: Float + num_traits::cast::FromPrimitive + Sync + Send + std::fmt::UpperExp + std::iter::Sum,
{
let nbgh = neighbours.len();
assert!(nbgh > 0);
let rho_x = neighbours[0].weight.to_f32().unwrap();
let mut rho_y_s = Vec::<f32>::with_capacity(neighbours.len() + 1);
for neighbour in neighbours {
let y_i = neighbour.node; rho_y_s.push(kgraph.get_neighbours()[y_i][0].weight.to_f32().unwrap());
} rho_y_s.push(rho_x);
let mean_rho = rho_y_s.iter().sum::<f32>() / (rho_y_s.len() as f32);
let scale = scale_rho * mean_rho;
let mut all_equal = false;
let first_dist = neighbours[0].weight.to_f32().unwrap();
let last_n = neighbours
.iter()
.rfind(|&n| n.weight.to_f32().unwrap() > 0.);
if last_n.is_none() {
all_equal = true;
}
let remap_weight = |w: F, shift: f32, scale: f32, beta: f32| {
(-((w.to_f32().unwrap() - shift).max(0.) / scale).powf(beta)).exp()
};
if !all_equal {
let last_dist = last_n.unwrap().weight.to_f32().unwrap();
if last_dist > first_dist {
let mut probas_edge = neighbours
.iter()
.map(|n| {
OutEdge::<f32>::new(
n.node,
remap_weight(n.weight, first_dist, scale, beta).max(PROBA_MIN),
)
})
.collect::<Vec<OutEdge<f32>>>();
let proba_range = probas_edge[probas_edge.len() - 1].weight / probas_edge[0].weight;
log::trace!(" first dist {:2e} last dist {:2e}", first_dist, last_dist);
log::trace!("scale : {:.2e} , first neighbour proba {:2e}, last neighbour proba {:2e} proba gap {:.2e}", scale, probas_edge[0].weight,
probas_edge[probas_edge.len() - 1].weight,
proba_range);
if proba_range < PROBA_MIN {
log::error!(" first dist {:2e} last dist {:2e}", first_dist, last_dist);
log::error!("scale : {:.2e} , first neighbour proba {:2e}, last neighbour proba {:2e} proba gap {:.2e}", scale, probas_edge[0].weight,
probas_edge[probas_edge.len() - 1].weight,
proba_range);
}
assert!(
proba_range >= PROBA_MIN,
"proba range {:.2e} too low edge proba, increase scale_rho or reduce beta",
proba_range
);
let sum = probas_edge.iter().map(|e| e.weight).sum::<f32>();
for p in probas_edge.iter_mut().take(nbgh) {
p.weight /= sum;
}
return NodeParam::new(scale, probas_edge);
} else {
all_equal = true;
}
}
if all_equal {
let probas_edge = neighbours
.iter()
.map(|n| OutEdge::<f32>::new(n.node, 1.0 / nbgh as f32))
.collect::<Vec<OutEdge<f32>>>();
NodeParam::new(scale, probas_edge)
} else {
log::error!("fatal error in get_scale_from_proba_normalisation, should not happen!");
std::panic!("incoherence error");
}
}
pub(crate) fn get_laplacian(initial_space: &NodeParams) -> GraphLaplacian {
log::info!("in fn get_laplacian using alfa = 0.5",);
let nbnodes = initial_space.get_nb_nodes();
let max_nbng = initial_space.get_max_nbng();
let node_params = initial_space;
if nbnodes <= FULL_MAT_REPR {
log::debug!("get_laplacian using full matrix");
let mut transition_proba = Array2::<f32>::zeros((nbnodes, nbnodes));
for i in 0..node_params.params.len() {
let node_param = node_params.get_node_param(i);
for j in 0..node_param.edges.len() {
let edge = node_param.edges[j];
transition_proba[[i, edge.node]] = edge.weight;
} } log::trace!("full matrix initialized");
let mut symgraph = (&transition_proba + &transition_proba.view().t()) * 0.5;
let diag = symgraph.sum_axis(Axis(1));
for i in 0..nbnodes {
let mut row = symgraph.row_mut(i);
for j in 0..nbnodes {
row[[j]] /= (diag[[i]] * diag[[j]]).sqrt();
}
}
log::trace!("\n allocating full matrix laplacian");
GraphLaplacian::new(MatRepr::from_array2(symgraph), diag)
} else {
log::debug!("Embedder using csr matrix");
let mut edge_list = HashMap::<(usize, usize), f32>::with_capacity(nbnodes * max_nbng);
for i in 0..node_params.params.len() {
let node_param = node_params.get_node_param(i);
for j in 0..node_param.edges.len() {
let edge = node_param.edges[j];
edge_list.insert((i, edge.node), node_param.edges[j].weight);
} }
let mut diagonal = Array1::<f32>::zeros(nbnodes);
let mut rows = Vec::<usize>::with_capacity(nbnodes * 2 * max_nbng);
let mut cols = Vec::<usize>::with_capacity(nbnodes * 2 * max_nbng);
let mut values = Vec::<f32>::with_capacity(nbnodes * 2 * max_nbng);
for ((i, j), val) in edge_list.iter() {
assert!(i != j);
let sym_val;
if let Some(t_val) = edge_list.get(&(*j, *i)) {
sym_val = (val + t_val) * 0.5;
} else {
sym_val = *val;
}
rows.push(*i);
cols.push(*j);
values.push(sym_val);
diagonal[*i] += sym_val;
rows.push(*j);
cols.push(*i);
values.push(sym_val);
diagonal[*j] += sym_val;
}
for i in 0..rows.len() {
let row = rows[i];
let col = cols[i];
if row != col {
values[i] /= (diagonal[row] * diagonal[col]).sqrt();
}
}
log::trace!("allocating csr laplacian");
let laplacian = TriMatBase::<Vec<usize>, Vec<f32>>::from_triplets(
(nbnodes, nbnodes),
rows,
cols,
values,
);
let csr_mat: CsMat<f32> = laplacian.to_csr();
GraphLaplacian::new(MatRepr::from_csrmat(csr_mat), diagonal)
} }