#![allow(clippy::doc_overindented_list_items)]
use num_traits::Float;
use num_traits::cast::FromPrimitive;
use indexmap::IndexSet;
use quantiles::ckms::CKMS;
use rayon::prelude::*;
use std::collections::HashMap;
use ndarray::{Array1, Array2, Axis};
use sprs::{CsMat, TriMatBase};
use crate::embedder::*;
use crate::fromhnsw::{kgraph::KGraph, kgraph_from_hnsw_all};
use crate::graphlaplace::*;
#[allow(unused)]
use crate::tools::{clip, kdumap::*, nodeparam::*, svdapprox::*};
use anyhow::Result;
use hnsw_rs::prelude::*;
#[derive(Copy, Clone)]
pub struct DiffusionParams {
asked_dim: usize,
alfa: f32,
beta: f32,
t: Option<f32>,
gnbn_opt: Option<usize>,
h_layer: Option<usize>,
}
impl DiffusionParams {
pub fn new(asked_dim: usize, t_opt: Option<f32>, g_opt: Option<usize>) -> Self {
DiffusionParams {
asked_dim,
alfa: 1.,
beta: 0.,
t: t_opt,
gnbn_opt: g_opt,
h_layer: None,
}
}
pub fn get_time(&self) -> Option<f32> {
self.t
}
pub fn get_data_dim(&self) -> usize {
self.asked_dim
}
pub fn get_gnbn(&self) -> Option<usize> {
self.gnbn_opt
}
pub fn set_alfa(&mut self, alfa: f32) {
if !(-1.01..=1.).contains(&alfa) {
log::warn!("not changing alfa, alfa should be in [-1. , 1.] ");
return;
}
self.alfa = alfa;
}
pub fn set_beta(&mut self, beta: f32) {
if (-1.01..=0.).contains(&beta) {
self.beta = beta;
} else {
log::warn!(
"not changing beta, beta should be in -1,0 Usual values are 0. -0.5 see doc "
);
}
}
pub fn set_hlayer(&mut self, layer: usize) {
self.h_layer = Some(layer);
}
pub fn get_alfa(&self) -> f32 {
self.alfa
}
pub fn get_beta(&self) -> f32 {
self.beta
}
pub fn get_hlayer(&self) -> usize {
self.h_layer.unwrap_or_default()
}
pub fn get_embedding_dimension(&self) -> usize {
self.asked_dim
}
pub fn set_embedding_dimension(&mut self, asked_dim: usize) {
self.asked_dim = asked_dim;
}
}
impl Default for DiffusionParams {
fn default() -> Self {
DiffusionParams {
asked_dim: 2,
alfa: 1.,
beta: 0.,
t: Some(5.),
gnbn_opt: Some(8),
h_layer: None,
}
}
}
pub struct DiffusionMaps {
params: DiffusionParams,
_node_params: Option<NodeParams>,
normed_scales: Option<Array1<f32>>,
mean_scale: f32,
q_density: Option<Vec<f32>>,
laplacian: Option<GraphLaplacian>,
index: Option<IndexSet<DataId>>,
}
impl DiffusionMaps {
pub fn new(params: DiffusionParams) -> Self {
DiffusionMaps {
params,
_node_params: None,
normed_scales: None,
mean_scale: 0.,
q_density: None,
laplacian: None,
index: None,
}
}
#[allow(unused)]
pub(crate) fn get_laplacian(&mut self) -> Option<&mut GraphLaplacian> {
self.laplacian.as_mut()
}
pub fn get_svd_res(&self) -> Option<&SvdResult<f32>> {
match &self.laplacian {
Some(laplacian) => laplacian.svd_res.as_ref(),
_ => None,
}
}
pub fn get_index(&self) -> Option<&IndexSet<DataId>> {
self.index.as_ref()
}
#[deprecated = "use embed_from_hnsw"]
pub fn embed_hnsw<T, D, F>(&mut self, hnsw: &Hnsw<T, D>) -> Array2<F>
where
D: Distance<T> + Send + Sync,
T: Clone + Send + Sync,
F: Float + FromPrimitive + std::marker::Sync + Send + std::fmt::UpperExp + std::iter::Sum,
{
let knbn = hnsw.get_max_nb_connection();
let kgraph_res: anyhow::Result<KGraph<F>> =
kgraph_from_hnsw_all::<T, D, F>(hnsw, knbn as usize);
if kgraph_res.is_err() {
panic!(
"kgraph_from_hnsw_all failed {:?}",
kgraph_res.err().unwrap()
);
};
let kgraph = kgraph_res.unwrap();
let nodeparams = to_proba_edges::<F>(&kgraph, 1., 2.);
get_dmap_embedding::<F>(&nodeparams, self.params.asked_dim, self.params.get_time())
}
pub(crate) fn laplacian_from_hnsw<T, D, F>(
&mut self,
hnsw: &Hnsw<T, D>,
dparams: &DiffusionParams,
) -> GraphLaplacian
where
T: Clone + Send + Sync,
D: Distance<T> + Send + Sync,
F: Float
+ FromPrimitive
+ std::marker::Sync
+ Send
+ std::fmt::UpperExp
+ std::iter::Sum
+ std::ops::AddAssign
+ std::ops::DivAssign
+ Into<f64>,
{
let gnbn = dparams.get_gnbn().unwrap_or(16);
let knbn = hnsw.get_max_nb_connection().min(gnbn as u8);
let kgraph_res = kgraph_from_hnsw_all::<T, D, F>(hnsw, knbn as usize);
if kgraph_res.is_err() {
panic!(
"kgraph_from_hnsw_all failed {:?}",
kgraph_res.err().unwrap()
);
}
let kgraph = kgraph_res.unwrap();
self.index = Some(kgraph.get_indexset().clone());
self.laplacian_from_kgraph(&kgraph)
}
pub(crate) fn laplacian_from_kgraph<F>(&mut self, kgraph: &KGraph<F>) -> GraphLaplacian
where
F: Float
+ FromPrimitive
+ std::marker::Sync
+ Send
+ std::fmt::UpperExp
+ std::iter::Sum
+ std::ops::AddAssign
+ std::ops::DivAssign
+ Into<f64>,
{
if self.index.is_none() {
self.index = Some(kgraph.get_indexset().clone());
}
let (nodeparams, mut q_density) =
self.compute_dmap_nodeparams::<F>(kgraph, kgraph.get_max_nbng());
let mut local_scale: Array1<f32> = nodeparams.params.iter().map(|n| n.scale).collect();
let mut mean_scale: f32 = local_scale.iter().sum();
mean_scale /= local_scale.len() as f32;
for l in &mut local_scale {
*l /= mean_scale;
}
for i in 0..q_density.len() {
q_density[i] /= local_scale[i]
}
self.get_quantiles("final q_density/scale", &q_density);
self.q_density = Some(q_density);
self.normed_scales = Some(local_scale);
self.mean_scale = mean_scale;
self.compute_laplacian(&nodeparams, self.params.get_alfa())
}
pub(crate) fn compute_laplacian(
&self,
initial_space: &NodeParams,
alfa: f32,
) -> GraphLaplacian {
let scale_renormalization = true;
log::info!(
"in GraphLaplacian::compute_laplacian, using alfa : {:.2e}",
alfa
);
let nbnodes = initial_space.get_nb_nodes();
let max_nbng = initial_space.get_max_nbng();
let node_params = initial_space;
let local_scale = self.normed_scales.as_ref().unwrap();
if nbnodes <= FULL_MAT_REPR {
log::debug!("get_laplacian using full matrix");
let mut transition_proba = Array2::<f32>::zeros((nbnodes, nbnodes));
for i in 0..node_params.params.len() {
let node_param = node_params.get_node_param(i);
for j in 0..node_param.edges.len() {
let edge = node_param.edges[j];
transition_proba[[i, edge.node]] = edge.weight;
} } log::trace!("full matrix initialized");
let mut symgraph = (&transition_proba + &transition_proba.view().t()) * 0.5;
let mut q = symgraph.sum_axis(Axis(1));
q /= local_scale;
let mut degrees = Array1::<f32>::zeros(q.len());
for i in 0..nbnodes {
let mut row = symgraph.row_mut(i);
for j in 0..nbnodes {
row[[j]] /= (q[[i]] * q[[j]]).powf(alfa);
}
degrees[[i]] = row.sum();
}
for i in 0..nbnodes {
let mut row = symgraph.row_mut(i);
for j in 0..nbnodes {
row[[j]] /= (degrees[[i]] * degrees[[j]]).sqrt();
if scale_renormalization {
row[[j]] /= local_scale[i] * local_scale[j];
}
}
}
log::trace!("\n allocating full matrix laplacian");
GraphLaplacian::new(MatRepr::from_array2(symgraph), degrees)
} else {
log::debug!("Embedder using csr matrix");
let mut edge_list = HashMap::<(usize, usize), f32>::with_capacity(nbnodes * max_nbng);
for i in 0..node_params.params.len() {
let node_param = node_params.get_node_param(i);
for j in 0..node_param.edges.len() {
let edge = node_param.edges[j];
edge_list.insert((i, edge.node), node_param.edges[j].weight);
} }
let mut q = Array1::<f32>::zeros(nbnodes);
let mut rows = Vec::<usize>::with_capacity(nbnodes * 2 * max_nbng);
let mut cols = Vec::<usize>::with_capacity(nbnodes * 2 * max_nbng);
let mut values = Vec::<f32>::with_capacity(nbnodes * 2 * max_nbng);
for ((i, j), val) in edge_list.iter() {
let sym_val;
if let Some(t_val) = edge_list.get(&(*j, *i)) {
sym_val = val.max(*t_val);
} else {
sym_val = *val;
}
rows.push(*i);
cols.push(*j);
values.push(sym_val);
q[*i] += sym_val;
rows.push(*j);
cols.push(*i);
values.push(sym_val);
q[*j] += sym_val;
}
q /= local_scale;
let mut degrees = Array1::<f32>::zeros(q.len());
for i in 0..rows.len() {
let row = rows[i];
let col = cols[i];
values[i] /= (q[row] * q[col]).powf(alfa);
}
for (i, v) in &mut values.iter().enumerate() {
let row = rows[i];
degrees[row] += v;
}
for i in 0..values.len() {
let row = rows[i];
let col = cols[i];
values[i] /= (degrees[row] * degrees[col]).sqrt();
if scale_renormalization {
values[i] /= local_scale[row] * local_scale[col];
}
}
log::trace!("allocating csr laplacian");
let laplacian = TriMatBase::<Vec<usize>, Vec<f32>>::from_triplets(
(nbnodes, nbnodes),
rows,
cols,
values,
);
let csr_mat: CsMat<f32> = laplacian.to_csr();
GraphLaplacian::new(MatRepr::from_csrmat(csr_mat), degrees)
} }
fn scales_to_kernel<F>(
&self,
kgraph: &KGraph<F>,
local_scales: &[F],
scale_ratio: f32,
scale_to_kernel: &dyn Fn(F, f32, f32) -> f32,
) -> (NodeParams, Vec<f32>)
where
F: Float
+ FromPrimitive
+ std::marker::Sync
+ Send
+ std::fmt::UpperExp
+ std::iter::Sum
+ std::ops::AddAssign
+ std::ops::DivAssign
+ Into<f64>,
{
log::debug!("in DiffusionMaps::scales_to_kernel");
let nb_nodes = kgraph.get_nb_nodes();
let neighbour_hood = kgraph.get_neighbours();
let mut nodeparams = Vec::<NodeParam>::with_capacity(nb_nodes);
let mut q_density: Vec<f32> = Vec::<f32>::with_capacity(nb_nodes);
let mut nb_weight_to_low = 0usize;
for i in 0..nb_nodes {
let neighbours = &neighbour_hood[i];
let mut all_equal = false;
let last_n = neighbours
.iter()
.rfind(|&n| n.weight.to_f32().unwrap() > 0.);
if let Some(edge) = last_n {
let last_e_w = edge.weight;
if last_e_w <= neighbours[0].weight {
all_equal = true;
}
} else {
all_equal = true;
}
let nb_edges = 1 + neighbours.len();
let mut edges = Vec::<OutEdge<f32>>::with_capacity(nb_edges);
if all_equal {
log::warn!("all equal for node {}", i);
let proba: f32 = 1. / (nb_edges as f32);
let self_edge = OutEdge::new(i, proba);
edges.push(self_edge);
for n in neighbours {
edges.push(OutEdge::new(n.node, proba));
}
} else {
let self_edge = OutEdge::<f32>::new(i, 1.);
edges.push(self_edge);
let mut sum: f32 = 0.;
let _shift = neighbours[0].weight.to_f32().unwrap();
let from_scale = local_scales[i];
for n in neighbours {
let to_scale = local_scales[n.node];
let local_scale = (to_scale * from_scale).sqrt();
let mut weight: f32 =
scale_to_kernel(n.weight, 0., scale_ratio * local_scale.to_f32().unwrap());
if weight < PROBA_MIN {
weight = weight.max(PROBA_MIN);
nb_weight_to_low += 1;
}
let edge = OutEdge::<f32>::new(n.node, weight);
edges.push(edge);
sum += weight;
}
edges[0].weight = 1. / 2.;
sum += edges[0].weight;
q_density.push(sum / edges.len() as f32);
}
let nodep = NodeParam::new(local_scales[i].to_f32().unwrap(), edges);
nodeparams.push(nodep);
}
let low_weight = nb_weight_to_low as f32 / nodeparams.len() as f32;
if low_weight > 0. {
log::info!(
"to_dmap_nodeparams: proba of weight < {:.2e} = {:.2e}",
PROBA_MIN,
low_weight
);
}
(
NodeParams::new(nodeparams, kgraph.get_max_nbng()),
q_density,
)
}
pub(crate) fn compute_dmap_nodeparams<F>(
&self,
kgraph: &KGraph<F>,
nbng: usize,
) -> (NodeParams, Vec<f32>)
where
F: Float
+ FromPrimitive
+ std::marker::Sync
+ Send
+ std::fmt::UpperExp
+ std::iter::Sum
+ std::ops::AddAssign
+ std::ops::DivAssign
+ std::iter::Sum
+ Into<f64>,
{
log::info!(
"Diffusion computing kernels with using alfa : {:.2e} , beta : {:.2e}",
self.params.get_alfa(),
self.params.get_beta()
);
let neighbour_hood = kgraph.get_neighbours();
let nbgh_size = kgraph.get_max_nbng().min(nbng);
log::info!(
"compute_dmap_nodeparams kgraph nbng : {}, using size {}",
kgraph.get_max_nbng(),
nbgh_size
);
let local_scales: Vec<F> = neighbour_hood
.par_iter()
.map(|edges| self.get_dist_l2_from_node(edges, nbgh_size))
.collect();
let scales_q = self.get_quantiles("scales quantiles first pass", &local_scales);
log::debug!("");
let exponent = 2.0f32;
let scale_width =
(scales_q.query(0.99).unwrap().1 / scales_q.query(0.01).unwrap().1) as f32;
log::info!(
"compute_dmap_nodeparams knbn : {}, epsil : {:.2e}",
nbgh_size,
scale_width
);
let epsil = scale_width / 2.;
let remap_weight = |w: F, shift: f32, scale: f32| {
let arg = ((w.to_f32().unwrap() - shift) / (epsil * scale)).powf(exponent);
(-arg).exp()
};
let (_, q_density) = self.scales_to_kernel(kgraph, &local_scales, epsil, &remap_weight);
self.get_quantiles("density quantiles first pass", &q_density);
let (nodeparams, q_density) = self.density_to_kernel(
kgraph,
nbgh_size,
&local_scales,
&q_density,
self.params.get_beta(),
&remap_weight,
);
self.get_quantiles("density quantiles second pass", &q_density);
(nodeparams, q_density)
}
fn density_to_kernel<F>(
&self,
kgraph: &KGraph<F>,
nbng: usize,
local_scales_step0: &[F],
q_densities: &Vec<f32>,
beta: f32,
remap_weight: &dyn Fn(F, f32, f32) -> f32,
) -> (NodeParams, Vec<f32>)
where
F: Float
+ FromPrimitive
+ std::marker::Sync
+ Send
+ std::fmt::UpperExp
+ std::iter::Sum
+ std::ops::AddAssign
+ std::ops::DivAssign
+ std::iter::Sum
+ Into<f64>,
{
log::info!("in DiffusionMaps::density_to_kernel, beta : {:.2e}", beta);
let nbgh_size = kgraph.get_max_nbng().min(nbng);
log::info!(
"compute_dmap_nodeparams kgraph nbng : {}, using size {}",
kgraph.get_max_nbng(),
nbgh_size
);
let sum_scale_step0: f32 = local_scales_step0.iter().copied().sum::<F>().into() as f32;
let mean_scale_step0 = sum_scale_step0 / local_scales_step0.len() as f32;
log::info!("mean scale step 0: {:.2e}", mean_scale_step0);
let local_scales: Vec<F> = q_densities
.par_iter()
.map(|d| F::from_f32(mean_scale_step0 * d.powf(beta)).unwrap())
.collect();
let scale_q = self.get_quantiles::<F>("scales quantiles second pass", &local_scales);
let epsil = (scale_q.query(0.99).unwrap().1 / scale_q.query(0.01).unwrap().1) as f32;
log::info!(
"density_to_kernel :nbgh_size {}, epsil : {:.2e}",
nbgh_size,
epsil
);
let sum_scale: f32 = local_scales.iter().copied().sum::<F>().into() as f32;
let mean_scale = sum_scale / local_scales.len() as f32;
log::info!("mean scale step 1 : {:.2e}", mean_scale);
let (nodeparams, q_density) =
self.scales_to_kernel(kgraph, &local_scales, epsil, &remap_weight);
(nodeparams, q_density)
}
pub(crate) fn get_quantiles<F>(&self, message: &str, values: &Vec<F>) -> CKMS<f64>
where
F: Float + Into<f64>,
{
log::info!("\n\n {}", message);
let mut quant_densities: CKMS<f64> = CKMS::<f64>::new(0.001);
for q in values {
quant_densities.insert((*q).into());
}
log::debug!(
"quantiles at 0.01 : {:.2e}, 0.05 : {:.2e} , 0.5 : {:.2e}, 0.95 : {:.2e}, 0.99 : {:.2e}",
quant_densities.query(0.01).unwrap().1,
quant_densities.query(0.05).unwrap().1,
quant_densities.query(0.5).unwrap().1,
quant_densities.query(0.95).unwrap().1,
quant_densities.query(0.99).unwrap().1
);
log::debug!("");
quant_densities
}
#[allow(unused)]
pub(crate) fn get_dist_around_node<F>(
&self,
kgraph: &KGraph<F>,
out_edges: &[OutEdge<F>],
nbng_used: usize,
) -> F
where
F: Float
+ FromPrimitive
+ std::marker::Sync
+ Send
+ std::fmt::UpperExp
+ std::iter::Sum
+ std::ops::AddAssign
+ std::iter::Sum,
{
let rho_x = out_edges[0].weight;
let mut rho_y_s = Vec::<F>::with_capacity(out_edges.len() + 1);
for neighbour in out_edges.iter().take(nbng_used) {
let y_i = neighbour.node; rho_y_s.push(kgraph.get_neighbours()[y_i][0].weight);
} rho_y_s.push(rho_x);
let size = rho_y_s.len();
rho_y_s.into_iter().sum::<F>() / F::from(size).unwrap()
}
#[allow(unused)]
pub(crate) fn get_dist_l2_from_node<F>(&self, out_edges: &[OutEdge<F>], nbng: usize) -> F
where
F: Float
+ FromPrimitive
+ std::marker::Sync
+ Send
+ std::fmt::UpperExp
+ std::iter::Sum
+ std::ops::AddAssign
+ std::iter::Sum,
{
let dist2: F = out_edges
.iter()
.take(nbng)
.map(|e| e.weight * e.weight)
.sum();
(dist2 / F::from(out_edges.len()).unwrap()).sqrt()
}
pub fn embed_from_kgraph<F>(
&mut self,
kgraph: &KGraph<F>,
dparams: &DiffusionParams,
) -> Result<Array2<F>>
where
F: Float
+ FromPrimitive
+ std::marker::Sync
+ Send
+ std::fmt::UpperExp
+ std::iter::Sum
+ std::ops::AddAssign
+ std::ops::DivAssign
+ Into<f64>,
{
log::info!("in DiffusionMaps::embed_from_kgraph");
let asked_dim = dparams.get_data_dim();
let t_opt = dparams.get_time();
let mut laplacian = self.laplacian_from_kgraph::<F>(kgraph);
let embedded = self
.embed_from_laplacian::<F>(&mut laplacian, asked_dim, t_opt)
.unwrap();
self.laplacian = Some(laplacian);
Ok(embedded)
}
pub(crate) fn embed_from_hnsw_intern<T, D, F>(
&mut self,
hnsw: &Hnsw<T, D>,
dparams: &DiffusionParams,
) -> Result<Array2<F>>
where
D: Distance<T> + Send + Sync,
T: Clone + Send + Sync,
F: Float
+ FromPrimitive
+ std::marker::Sync
+ Send
+ std::fmt::UpperExp
+ std::iter::Sum
+ std::ops::AddAssign
+ std::ops::DivAssign
+ Into<f64>,
{
let asked_dim = dparams.get_data_dim();
let t_opt = dparams.get_time();
let mut laplacian = self.laplacian_from_hnsw::<T, D, F>(hnsw, dparams);
let embedded = self
.embed_from_laplacian::<F>(&mut laplacian, asked_dim, t_opt)
.unwrap();
self.laplacian = Some(laplacian);
Ok(embedded)
}
pub fn embed_from_hnsw<T, D, F>(
&mut self,
hnsw: &Hnsw<T, D>,
dparams: &DiffusionParams,
) -> Result<Array2<F>>
where
D: Distance<T> + Send + Sync,
T: Clone + Send + Sync,
F: Float
+ FromPrimitive
+ std::marker::Sync
+ Send
+ std::fmt::UpperExp
+ std::iter::Sum
+ std::ops::AddAssign
+ std::ops::DivAssign
+ Into<f64>,
{
let embedded = self.embed_from_hnsw_intern(hnsw, dparams).unwrap();
let embedded_reindexed = self.reindex_embedding(&embedded);
Ok(embedded_reindexed)
}
fn embed_from_laplacian<F>(
&self,
laplacian: &mut GraphLaplacian,
asked_dim: usize,
t_opt: Option<f32>,
) -> Result<Array2<F>>
where
F: Float
+ FromPrimitive
+ std::marker::Sync
+ Send
+ std::fmt::UpperExp
+ std::iter::Sum
+ std::ops::AddAssign
+ std::ops::DivAssign
+ Into<f64>,
{
log::debug!("got laplacian, going to svd ... asked_dim : {}", asked_dim);
let svd_res = laplacian.do_svd(asked_dim + 15);
if svd_res.is_err() {
log::error!("embed_from_laplacian call of laplacian.do_svd failed");
panic!("embed_from_laplacian call of laplacian.do_svd failed");
} else {
log::debug!("do_svd returns OK");
}
let svd_res = svd_res.unwrap();
let lambdas = svd_res.get_sigma().as_ref().unwrap();
if lambdas.len() > 2 && lambdas[1] > lambdas[0] {
panic!("svd spectrum not decreasing");
}
log::info!(
" first 5 eigen values {:.2e} {:.2e} {:.2e} {:.2e} {:.2e} ",
lambdas[0],
lambdas[1],
lambdas[2],
lambdas[3],
lambdas[4],
);
log::info!(
" last eigenvalue computed rank {} value {:.2e}",
lambdas.len() - 1,
lambdas[lambdas.len() - 1]
);
log::debug!("keeping columns from 1 to : {}", asked_dim);
let u = svd_res.get_u().as_ref().unwrap();
log::debug!("u shape : nrows: {} , ncols : {} ", u.nrows(), u.ncols());
if u.ncols() < asked_dim {
log::warn!(
"asked dimension : {} svd obtained less than asked for : {}",
asked_dim,
u.ncols()
);
}
let real_dim = asked_dim.min(u.ncols() - 1);
let mut embedded = Array2::<F>::zeros((u.nrows(), real_dim));
let normalized_lambdas = lambdas / (*lambdas)[0];
let time = match t_opt {
Some(t) => t,
_ => 5.0f32.min(0.9f32.ln() / (normalized_lambdas[2] / normalized_lambdas[1]).ln()),
};
log::info!(
"DiffusionMaps::embed_from_hnsw applying dmap time {:.2e}",
time
);
let sum_diag = laplacian.degrees.iter().sum::<f32>();
let scales = self.normed_scales.as_ref().unwrap();
for i in 0..u.nrows() {
let row_i = u.row(i);
let weight_i = scales[i] * (laplacian.degrees[i] / sum_diag).sqrt();
for j in 0..real_dim {
embedded[[i, j]] =
F::from_f32(normalized_lambdas[j + 1].powf(time) * row_i[j + 1] / weight_i)
.unwrap();
}
}
log::debug!("DiffusionMaps::embed_from_hnsw ended");
laplacian.svd_res = Some(svd_res);
Ok(embedded)
}
pub fn reindex_embedding<F>(&self, embedded: &Array2<F>) -> Array2<F>
where
F: Float,
{
let (nbrow, dim) = embedded.dim();
let mut reindexed = Array2::<F>::zeros((nbrow, dim));
let index = self.get_index().unwrap();
for i in 0..nbrow {
let row = embedded.row(i);
let origin_id = index.get_index(i).unwrap();
for j in 0..dim {
reindexed[[*origin_id, j]] = row[j];
}
}
reindexed
} }
pub(crate) fn get_dmap_embedding<F>(
initial_space: &NodeParams,
asked_dim: usize,
t_opt: Option<f32>,
) -> Array2<F>
where
F: Float + FromPrimitive,
{
assert!(asked_dim >= 2);
let mut laplacian = get_laplacian(initial_space);
log::debug!("got laplacian, going to svd ... asked_dim : {}", asked_dim);
let svd_res = laplacian.do_svd(asked_dim + 25).unwrap();
let lambdas = svd_res.get_sigma().as_ref().unwrap();
if lambdas.len() > 2 && lambdas[1] > lambdas[0] {
panic!("svd spectrum not decreasing");
}
log::info!(
" first 3 eigen values {:.2e} {:.2e} {:2e}",
lambdas[0],
lambdas[1],
lambdas[2]
);
log::info!(
" last eigenvalue computed rank {} value {:.2e}",
lambdas.len() - 1,
lambdas[lambdas.len() - 1]
);
log::debug!("keeping columns from 1 to : {}", asked_dim);
let u = svd_res.get_u().as_ref().unwrap();
log::debug!("u shape : nrows: {} , ncols : {} ", u.nrows(), u.ncols());
if u.ncols() < asked_dim {
log::warn!(
"asked dimension : {} svd obtained less than asked for : {}",
asked_dim,
u.ncols()
);
}
let real_dim = asked_dim.min(u.ncols());
let mut embedded = Array2::<F>::zeros((u.nrows(), real_dim));
let normalized_lambdas = lambdas / (*lambdas)[0];
let time = match t_opt {
Some(t) => t,
_ => 5.0f32.min(0.9f32.ln() / (normalized_lambdas[2] / normalized_lambdas[1]).ln()),
};
log::info!("get_dmap_initial_embedding applying dmap time {:.2e}", time);
let sum_diag = laplacian.degrees.iter().sum::<f32>();
for i in 0..u.nrows() {
let row_i = u.row(i);
let weight_i = (laplacian.degrees[i] / sum_diag).sqrt();
for j in 0..real_dim {
embedded[[i, j]] =
F::from_f32(normalized_lambdas[j + 1].powf(time) * row_i[j + 1] / weight_i)
.unwrap();
}
}
log::debug!("ended get_dmap_embedding");
embedded
}
pub fn array2_insert_hnsw<T, D>(data: &Array2<T>, hnsw: &mut Hnsw<T, D>) -> Result<usize, usize>
where
T: Clone + Send + Sync,
D: Distance<T> + Send + Sync,
{
if hnsw.get_nb_point() > 0 {
log::error!(
"array2_insert_hnsw , insertion on non empty hnsw structure, nb point : {}",
hnsw.get_nb_point()
);
return Err(1);
}
let blocksize = 10000;
let (nb_row, _) = data.dim();
let nb_block = nb_row / blocksize;
for i in 0..nb_block {
let start = i * blocksize;
let end = i * blocksize + blocksize - 1;
let to_insert = (start..=end)
.map(|n| (data.row(n).to_slice().unwrap(), n))
.collect();
hnsw.parallel_insert_slice(&to_insert);
}
let start = nb_block * blocksize;
let to_insert = (start..nb_row)
.map(|n| (data.row(n).to_slice().unwrap(), n))
.collect();
hnsw.parallel_insert_slice(&to_insert);
Ok(hnsw.get_nb_point())
}
#[cfg(test)]
#[allow(unused)]
mod tests {
use super::*;
use crate::tools::io::write_csv_labeled_array2;
use crate::utils::mnistio::*;
use anyhow::anyhow;
use cpu_time::ProcessTime;
use ndarray::s;
use statrs::function::erf::*;
use std::fs::OpenOptions;
use std::path::PathBuf;
use std::time::{Duration, SystemTime};
const MNIST_FASHION_DIR: &str = "/home/jpboth/Data/ANN/Fashion-MNIST/";
const MNIST_DIGITS_DIR: &str = "/home/jpboth/Data/ANN/MNIST/";
fn log_init_test() {
let _ = env_logger::builder().is_test(true).try_init();
}
fn generate_1d_gaussian(nbdata: usize) -> Vec<f32> {
let delta = 1. / (nbdata + 1) as f64;
let mut v = Vec::<f32>::with_capacity(nbdata);
for i in 1..nbdata {
let arg = 2. * delta * i as f64 - 1.;
let d = (2.0_f64.sqrt() * erf_inv(arg)) as f32;
if !d.is_normal() {
log::error!("float problem arg = {}, d = {:?}", arg, d);
panic!();
} else {
v.push(d);
}
}
v
}
#[test]
fn dmap_digits() {
log_init_test();
log::info!("running mnist_digits");
let mnist_data = load_mnist_train_data(MNIST_DIGITS_DIR).unwrap();
let labels = mnist_data.get_labels().to_vec();
let images = mnist_data.get_images();
let (_, _, nbimages) = images.dim();
let mut images_as_v = Vec::<Vec<f32>>::with_capacity(nbimages);
for k in 0..nbimages {
let v: Vec<f32> = images
.slice(s![.., .., k])
.iter()
.map(|v| *v as f32)
.collect();
images_as_v.push(v);
}
let dtime = 1.;
let gnbn: usize = 16;
let mut dparams: DiffusionParams = DiffusionParams::new(4, Some(dtime), Some(gnbn));
dparams.set_alfa(1.);
dparams.set_beta(-1.);
let cpu_start = ProcessTime::now();
let sys_now = SystemTime::now();
let mut hnsw = Hnsw::<f32, DistL2>::new(32, images_as_v.len(), 16, 200, DistL2::default());
hnsw.set_keeping_pruned(true);
let data_with_id: Vec<(&Vec<f32>, usize)> =
images_as_v.iter().zip(0..images_as_v.len()).collect();
hnsw.parallel_insert(&data_with_id);
let mut diffusion_map = DiffusionMaps::new(dparams);
let emmbedded_res = diffusion_map.embed_from_hnsw::<f32, DistL2, f32>(&mut hnsw, &dparams);
if emmbedded_res.is_err() {
log::error!("embedding failed");
panic!("dmap_fashion failed");
};
log::info!(
" dmap embed time {:.2e} s, cpu time : {}",
sys_now.elapsed().unwrap().as_secs(),
cpu_start.elapsed().as_secs()
);
log::info!("dumping initial embedding in csv file");
let mut csv_w = csv::Writer::from_path("mnist_digits_dmap.csv").unwrap();
let _res = write_csv_labeled_array2(&mut csv_w, labels.as_slice(), &emmbedded_res.unwrap());
csv_w.flush().unwrap();
}
#[test]
fn dmap_fashion() {
log_init_test();
log::info!("running mnist_fashion");
let fashion_train_data = load_mnist_train_data(MNIST_FASHION_DIR).unwrap();
let mut labels = fashion_train_data.get_labels().to_vec();
let images = fashion_train_data.get_images();
let (_, _, nbimages) = images.dim();
let mut images_as_v = Vec::<Vec<f32>>::with_capacity(nbimages);
for k in 0..nbimages {
let v: Vec<f32> = images
.slice(s![.., .., k])
.iter()
.map(|v| *v as f32)
.collect();
images_as_v.push(v);
}
let fashion_test_data = load_mnist_test_data(MNIST_FASHION_DIR).unwrap();
labels.append(&mut fashion_test_data.get_labels().to_vec());
let images = fashion_test_data.get_images();
let (_, _, nbimages) = images.dim();
for k in 0..nbimages {
let v: Vec<f32> = images
.slice(s![.., .., k])
.iter()
.map(|v| *v as f32)
.collect();
images_as_v.push(v);
}
let dtime = 1.;
let gnbn = 16;
let mut dparams: DiffusionParams = DiffusionParams::new(20, Some(dtime), Some(gnbn));
dparams.set_alfa(1.);
dparams.set_beta(0.);
let cpu_start = ProcessTime::now();
let sys_now = SystemTime::now();
let mut hnsw = Hnsw::<f32, DistL2>::new(64, images_as_v.len(), 16, 200, DistL2::default());
hnsw.set_keeping_pruned(true);
let data_with_id: Vec<(&Vec<f32>, usize)> =
images_as_v.iter().zip(0..images_as_v.len()).collect();
hnsw.parallel_insert(&data_with_id);
let mut diffusion_map = DiffusionMaps::new(dparams);
let emmbedded_res = diffusion_map.embed_from_hnsw::<f32, DistL2, f32>(&mut hnsw, &dparams);
if emmbedded_res.is_err() {
log::error!("embedding failed");
panic!("dmap_fashion failed");
};
log::info!(
" dmap embed time {:.2e} s, cpu time : {}",
sys_now.elapsed().unwrap().as_secs(),
cpu_start.elapsed().as_secs()
);
log::info!("dumping initial embedding in csv file");
let mut csv_w = csv::Writer::from_path("mnist_fashion_dmap.csv").unwrap();
let _res = write_csv_labeled_array2(&mut csv_w, labels.as_slice(), &emmbedded_res.unwrap());
csv_w.flush().unwrap();
} }