1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213
//! This module provides conversion of a Point structure to a FlatPoint containing just the Id of a point
//! and those of its neighbours.
//! The whole Hnsw structure is then flattened into a Hashtable associating the data ID of a point to
//! its corresponding FlatPoint.
//! It can be used, for example, when reloading only the graph part of the data to have knowledge
//! of relative proximity of points as described just by their DataId
//!
use hashbrown::HashMap;
use std::cmp::Ordering;
use crate::hnsw;
use anndists::dist::distances::Distance;
use hnsw::*;
// an ordering of Neighbour of a Point
impl PartialEq for Neighbour {
fn eq(&self, other: &Neighbour) -> bool {
return self.distance == other.distance;
} // end eq
}
impl Eq for Neighbour {}
// order points by distance to self.
impl PartialOrd for Neighbour {
fn partial_cmp(&self, other: &Neighbour) -> Option<Ordering> {
self.distance.partial_cmp(&other.distance)
} // end cmp
} // end impl PartialOrd
impl Ord for Neighbour {
fn cmp(&self, other: &Neighbour) -> Ordering {
if !self.distance.is_nan() && !other.distance.is_nan() {
self.distance.partial_cmp(&other.distance).unwrap()
} else {
panic!("got a NaN in a distance");
}
} // end cmp
}
/// a reduced version of point inserted in the Hnsw structure.
/// It contains original id of point as submitted to the struct Hnsw
/// an ordered (by distance) list of neighbours to the point
/// and it position in layers.
#[derive(Clone)]
pub struct FlatPoint {
/// an id coming from client using hnsw, should identify point uniquely
origin_id: DataId,
/// a point id identifying point as stored in our structure
p_id: PointId,
/// neighbours info
neighbours: Vec<Neighbour>,
}
impl FlatPoint {
/// returns the neighbours orderded by distance.
pub fn get_neighbours(&self) -> &Vec<Neighbour> {
return &self.neighbours;
}
/// returns the origin id of the point
pub fn get_id(&self) -> DataId {
return self.origin_id;
}
///
pub fn get_p_id(&self) -> PointId {
return self.p_id;
}
} // end impl block for FlatPoint
fn flatten_point<T: Clone + Send + Sync>(point: &Point<T>) -> FlatPoint {
let neighbours = point.get_neighborhood_id();
// now we flatten neighbours
let mut flat_neighbours = Vec::<Neighbour>::new();
for layer in neighbours {
for neighbour in layer {
flat_neighbours.push(neighbour);
}
}
flat_neighbours.sort_unstable();
let fpoint = FlatPoint {
origin_id: point.get_origin_id(),
p_id: point.get_point_id(),
neighbours: flat_neighbours,
};
fpoint
} // end of flatten_point
/// A structure providing neighbourhood information of a point stored in the Hnsw structure given its DataId.
/// The structure uses the [FlatPoint] structure.
/// This structure can be obtained by FlatNeighborhood::from<&Hnsw<T,D>>
pub struct FlatNeighborhood {
hash_t: HashMap<DataId, FlatPoint>,
}
impl FlatNeighborhood {
/// get neighbour of a point given its id.
/// The neighbours are sorted in increasing distance from data_id.
pub fn get_neighbours(&self, p_id: DataId) -> Option<Vec<Neighbour>> {
let res = match self.hash_t.get(&p_id) {
Some(point) => Some(point.get_neighbours().clone()),
_ => None,
};
return res;
}
} // end impl block for FlatNeighborhood
impl<'b, T: Clone + Send + Sync, D: Distance<T> + Send + Sync> From<&Hnsw<'b, T, D>>
for FlatNeighborhood
{
/// extract from the Hnsw strucure a hashtable mapping original DataId into a FlatPoint structure gathering its neighbourhood information.
/// Useful after reloading from a dump with T=NoData and D = NoDist as points are then reloaded with neighbourhood information only.
fn from(hnsw: &Hnsw<T, D>) -> Self {
let mut hash_t = HashMap::new();
let mut ptiter = hnsw.get_point_indexation().into_iter();
//
loop {
if let Some(point) = ptiter.next() {
// println!("point : {:?}", _point.p_id);
let res_insert = hash_t.insert(point.get_origin_id(), flatten_point(&point));
match res_insert {
Some(old_point) => {
println!("2 points with same origin id {:?}", old_point.origin_id);
log::error!("2 points with same origin id {:?}", old_point.origin_id);
}
_ => (),
} // end match
} else {
break;
}
} // end while
return FlatNeighborhood { hash_t };
}
} // e,d of Fom implementation
#[cfg(test)]
mod tests {
use super::*;
use anndists::dist::distances::*;
use std::path::PathBuf;
use crate::api::AnnT;
use crate::hnswio::*;
use rand::distributions::{Distribution, Uniform};
fn log_init_test() {
let _ = env_logger::builder().is_test(true).try_init();
}
#[test]
fn test_dump_reload_graph_flatten() {
println!("\n\n test_dump_reload_graph_flatten");
log_init_test();
// generate a random test
let mut rng = rand::thread_rng();
let unif = Uniform::<f32>::new(0., 1.);
// 1000 vectors of size 10 f32
let nbcolumn = 1000;
let nbrow = 10;
let mut xsi;
let mut data = Vec::with_capacity(nbcolumn);
for j in 0..nbcolumn {
data.push(Vec::with_capacity(nbrow));
for _ in 0..nbrow {
xsi = unif.sample(&mut rng);
data[j].push(xsi);
}
}
// define hnsw
let ef_construct = 25;
let nb_connection = 10;
let hnsw = Hnsw::<f32, DistL1>::new(nb_connection, nbcolumn, 16, ef_construct, DistL1 {});
for i in 0..data.len() {
hnsw.insert((&data[i], i));
}
// some loggin info
hnsw.dump_layer_info();
// get flat neighbours of point 3
let neighborhood_before_dump = FlatNeighborhood::from(&hnsw);
let nbg_2_before = neighborhood_before_dump.get_neighbours(2).unwrap();
println!("voisins du point 2 {:?}", nbg_2_before);
// dump in a file. Must take care of name as tests runs in // !!!
let fname = String::from("dumpreloadtestflat");
let _res = hnsw.file_dump(&fname);
// This will dump in 2 files named dumpreloadtest.hnsw.graph and dumpreloadtest.hnsw.data
//
// reload
log::debug!("\n\n hnsw reload");
// we will need a procedural macro to get from distance name to its instanciation.
// from now on we test with DistL1
let directory = PathBuf::from(".");
let mut reloader = HnswIo::new(directory, String::from("dumpreloadtestflat"));
let hnsw_loaded: Hnsw<NoData, NoDist> = reloader.load_hnsw().unwrap();
let neighborhood_after_dump = FlatNeighborhood::from(&hnsw_loaded);
let nbg_2_after = neighborhood_after_dump.get_neighbours(2).unwrap();
println!("voisins du point 2 {:?}", nbg_2_after);
// test equality of neighborhood
assert_eq!(nbg_2_after.len(), nbg_2_before.len());
for i in 0..nbg_2_before.len() {
assert_eq!(nbg_2_before[i].p_id, nbg_2_after[i].p_id);
assert_eq!(nbg_2_before[i].distance, nbg_2_after[i].distance);
}
check_graph_equality(&hnsw_loaded, &hnsw);
//
let _ = std::fs::remove_file("dumpreloadtestflat.hnsw.data");
let _ = std::fs::remove_file("dumpreloadtestflat.hnsw.graph");
} // end of test_dump_reload
} // end module test