1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
use std::{
cell::RefCell,
cmp::Reverse,
collections::BinaryHeap,
marker::PhantomData,
sync::{Mutex, MutexGuard},
};
use crate::Dataset;
use crate::{
quantizer::{IdentityQuantizer, QueryEvaluator},
Float,
};
use rand::prelude::*;
use rayon::{prelude::*, ThreadPoolBuilder};
use super::{
add_neighbor_to_heaps, compute_closest_from_neighbors, config_hnsw::ConfigHnsw,
from_max_heap_to_min_heap, level::Level, visited_table::VisitedTable, Node,
};
/// A builder for constructing an HNSW (Hierarchical Navigable Small World) graph.
///
/// The `HnswBuilder` struct is responsible for creating and managing the HNSW graph. It handles
/// the assignment of levels to nodes, computation of neighbors, and construction of `Level` instances,
/// which store the neighbors of nodes at each level.
///
/// # Fields
///
/// - `probabs_levels`: A vector of probabilities associated with each level of the graph. These probabilities
/// determine the likelihood that a node is assigned to a particular level during graph construction.
///
/// - `neighbors`: A vector of `Mutex<Option<usize>>` representing the neighbor connections between nodes in the graph.
/// Each `Mutex` guards an optional ID, allowing for thread-safe updates during graph construction.
///
/// - `offsets`: A vector of offsets that helps locate the neighbors for each node. The offsets indicate
/// where the neighbors of a node start in the `neighbors` vector, enabling efficient access to neighbor lists.
///
/// - `levels_assigned`: A vector that tracks the level assigned to each node in the graph. The level of a node
/// determines its position in the hierarchical structure and influences the connections it forms with other nodes.
///
/// - `max_level`: A field representing the highest level present in the graph. This value is used
/// during the construction process to determine the maximum level assigned to any node.
///
/// - `entry_vector`: An optional index indicating the entry point node for the graph. This node is used as
/// the starting point for searching potential neighbors when a new vector is inserted into the graph.
///
/// - `cum_num_neigh_per_level`: A vector that holds the cumulative number of neighbors for each level.
/// This helps in determining the total number of neighbors assigned to nodes based on their respective levels.
///
/// - `dataset`: A reference to the dataset containing the actual data vectors for the HNSW graph.
///
/// - `_phantom`: A `PhantomData` marker that indicates the type `Q` is used in the context of the struct,
/// ensuring proper type safety without actually storing a value of type `Q`.
/// This helps the Rust compiler manage safety and type constraints related to `Q`.
pub struct HnswBuilder<'a, D, Q> {
probabs_levels: Vec<f32>,
neighbors: Vec<Mutex<Option<usize>>>,
offsets: Vec<usize>,
levels_assigned: Vec<u8>,
max_level: u8,
entry_vector: Option<usize>,
cum_num_neigh_per_level: Vec<usize>,
dataset: &'a D,
_phantom: PhantomData<&'a Q>,
}
impl<'a, D, Q> HnswBuilder<'a, D, Q>
where
D: Dataset<'a, Q> + Sync,
// The IdentityQuantizer enforces that the InputItem and OutputItem of the quantizer are of the same type.
// This is essential because:
// - The `query_evaluator` function in the `Dataset` trait requires a vector of type `QueryType`,
// where the `InputItem` must implement the `Float` trait.
// - The `get` function of the dataset returns a vector of type `DataType`, where the elements are
// of type `OutputItem` from the quantizer.
// By implementing the IdentityQuantizer trait, we ensure that `InputItem` and `OutputItem` are the same type,
// allowing the dataset's `get` function to directly provide data that can be used by `query_evaluator`
Q: IdentityQuantizer<DatasetType<'a> = D, T: Float> + Sync,
// This constraint is necessary because the vector returned by the dataset's "get" function is of type Datatype.
// The query evaluator, however, requires a vector of type Querytype.
Q::Evaluator<'a>: QueryEvaluator<'a, QueryType = <D as Dataset<'a, Q>>::DataType>,
{
/// Constructs a new `HnswBuilder` instance.
///
/// This function initializes a new `HnswBuilder` that is ready to build an HNSW (Hierarchical Navigable Small World)
/// graph for the given dataset. The builder is set up with default values for its internal parameters, and the probability
/// for level assignment is configured based on the number of neighbors per vector.
///
/// # Parameters
///
/// - `num_neighbors_per_vec`: The number of neighbors each vector should have. This value is used to determine the default
/// probabilities for level assignment in the graph.
/// - `dataset`: A reference to the dataset containing the data vectors that will be used to construct the HNSW graph.
///
/// # Returns
///
/// Returns a new `HnswBuilder` instance with initialized internal structures and default settings.
pub fn new(num_neighbors_per_vec: usize, dataset: &'a D) -> Self {
let mut hnsw_builder = Self {
probabs_levels: vec![],
neighbors: vec![],
offsets: vec![0],
levels_assigned: vec![],
max_level: 0,
entry_vector: None,
cum_num_neigh_per_level: vec![],
dataset,
_phantom: PhantomData,
};
let m = num_neighbors_per_vec as f32;
hnsw_builder.set_default_probas(1.0 / m.ln(), m as usize);
hnsw_builder
}
/// Computes the neighbors for each vector in the dataset and organizes them into levels.
///
/// This function calculates the neighbors for each vector in the dataset according to the HNSW graph
/// structure. It assigns levels to vectors, shuffles and processes them in parallel to compute their
/// neighbors, and then organizes the resulting data into `Level` instances.
///
/// The process involves:
/// 1. Assigning levels to each vector.
/// 2. Counting the number of vectors at each level.
/// 3. Ordering vector IDs by their assigned levels.
/// 4. Utilizing multiple threads to find the neighbors for each vector in parallel,
/// ensuring thread safety with mutexes and locks.
/// 5. Returning the computed levels and a mapping of new IDs to original IDs.
///
/// # Parameters
///
/// - `config`: A reference to a `ConfigHnsw` structure that contains configuration settings for
/// computing the HNSW graph.
///
/// # Returns
///
/// This function returns a triple:
///
/// - `Vec<Level>`: A vector of `Level` instances representing the neighbors for nodes at each level
/// of the HNSW graph.
/// - `Vec<usize>`: A vector where each element maps a new ID assigned to a vector to its original ID.
/// - `usize`: The entry vector, which is the ID of a node assigned to the maximum level, from which the
/// search will start.
///
/// # Example
///
/// Consider a dataset with 100 vectors and an HNSW graph with 3 levels.
///
/// - `num_ids_per_level` might be `[86, 20, 4]`, indicating the number of vectors at each level (0, 1, 2).
///
/// - `ids_per_level` might be organized as follows:
/// ```rust
/// vec![
/// // IDs for level 0 (indices 0 to 85)
/// 0, 1, 3, 6, 7, ..., 98,
/// // IDs for level 1 (indices 86 to 95)
/// 2, 4, 11, ..., 99,
/// // IDs for level 2 (indices 96 to 99)
/// 5, 10, 23, 60]
/// ```
/// During processing:
/// - For each level, the `begin` and `end` indices are calculated to determine the segment of `ids_per_level`
/// relevant for that level.
///
/// For example:
/// - At level 2, `end` might be `100`, and `begin` would be `96` (computed as `end - num_ids_per_level[level as usize]`),
/// so the IDs in the range from indices `96` to `99` are processed. This results in processing the IDs `[5, 10, 23, 60]`.
/// - At level 1, `end` is updated to `96`, and `begin` would be `86`, so the IDs from indices `86` to `95` are processed.
/// - At level 0, `end` is updated to `86`, and `begin` would be `0`, so the IDs from indices `0` to `85` are processed.
pub fn compute_graph(
&mut self,
config: &ConfigHnsw,
num_threads: usize,
) -> (Vec<Level>, Vec<usize>, usize) {
let num_vectors = self.dataset.len();
self.assign_level(num_vectors);
let num_ids_per_level = self.count_num_ids_per_level(num_vectors);
let ids_per_level = self.order_ids_by_level(&num_ids_per_level, num_vectors);
thread_local! {
static VISITED_TABLE: RefCell<Option<VisitedTable>> = RefCell::new(None);
}
let locks: Vec<Mutex<()>> = (0..num_vectors).map(|_| Mutex::new(())).collect();
let mut rng = StdRng::seed_from_u64(537);
let mut end = num_vectors;
let pool = ThreadPoolBuilder::new()
.num_threads(num_threads)
.build()
.unwrap();
pool.scope(|_| {
for level in (0..=self.max_level).rev() {
let begin = end - num_ids_per_level[level as usize];
let mut ids_curr_level: Vec<&usize> =
ids_per_level.iter().take(end).skip(begin).collect();
ids_curr_level.shuffle(&mut rng);
if self.entry_vector.is_none() && level as u8 == self.max_level {
// it assign as entry_vector the first id that got assigned the highest level
self.entry_vector = Some(*ids_curr_level[0]);
}
ids_curr_level.par_iter().for_each(|&&id| {
VISITED_TABLE.with(|visited_table| {
let mut visited_table = visited_table.borrow_mut();
if visited_table.is_none() {
*visited_table = Some(VisitedTable::new(self.dataset.len()));
}
self.compute_neighbors_for_vector(
id,
level,
&mut visited_table.as_mut().unwrap(),
&locks,
config,
);
});
});
end = begin;
}
});
self.compute_levels()
}
/// Computes and returns the levels of the HNSW graph along with mappings from new IDs to original IDs
/// and the entry vector.
///
/// This function constructs the levels of the HNSW graph by iterating over the levels from the
/// highest (max level) to the lowest.
/// For each level, it creates a `Level` instance that stores the neighbors of the nodes present
/// at that level. During this process, it also maintains two mappings:
///
/// 1. **`new_to_original`:** Maps the newly assigned IDs back to the original IDs,
/// enabling access to the original dataset vectors.
/// 2. **`original_to_new`:** Maps original IDs to their new IDs, allowing quick lookup
/// of new IDs based on the original IDs.
///
/// The levels are processed in reverse order (from the highest to the lowest) to ensure proper
/// level construction. Once all levels are computed, they are reversed to return them in the
/// correct order, from the lowest (level 0) to the highest level.
///
/// # Returns
///
/// This function returns a triple:
///
/// - `Vec<Level>`: A vector of `Level` instances representing the neighbors at each level in the HNSW graph.
/// - `Vec<usize>`: A vector mapping new IDs to original IDs, enabling access to the original dataset using new IDs.
/// - `usize`: The entry vector, which is the ID of a node assigned to the maximum level, from which the search will start.
#[inline]
fn compute_levels(&self) -> (Vec<Level>, Vec<usize>, usize) {
let mut processed_ids: Vec<usize> = Vec::with_capacity(self.dataset.len());
let mut new_to_original = Vec::with_capacity(self.dataset.len());
let mut original_to_new: Vec<usize> = vec![0; self.dataset.len()];
let mut levels: Vec<Level> = Vec::with_capacity((self.max_level + 1) as usize);
for curr_level in (0..=self.max_level).rev() {
levels.push(Level::new(
self,
curr_level,
&mut processed_ids,
&mut new_to_original,
&mut original_to_new,
));
}
levels.reverse();
(
levels,
new_to_original,
original_to_new[self.entry_vector.unwrap()],
)
}
/// Counts the number of vectors assigned to each level in the HNSW graph and returns a vector
/// representing this distribution.
///
/// This function iterates over all vectors in the dataset and counts how many vectors are assigned
/// to each level. It creates a vector `num_ids_per_level` where each index corresponds to a level,
/// and the value at that index represents the number of vectors assigned to that level.
///
/// # Parameters
///
/// - `num_vectors`: The total number of vectors in the dataset.
///
/// # Returns
///
/// - `Vec<usize>`: A vector where each element represents the number of vectors assigned to each
/// level in the HNSW graph. The index of the vector corresponds to the level number, and the
/// value at that index is the count of vectors at that level.
///
/// # Panics
///
/// This function will panic if it encounters a vector that does not have a level assigned,
/// which should not happen under normal circumstances as each vector should be assigned a level
/// before this function is called.
///
/// /// # Example
/// ```rust
/// let num_ids_per_level = hnsw_builder.count_num_ids_per_level(100);
/// // `num_ids_per_level[0]` might be 86, indicating 86 vectors are at level 0.
/// // `num_ids_per_level[1]` might be 10, indicating 10 vectors are at level 1.
/// // `num_ids_per_level[2]` might be 4, indicating 4 vectors are at level 2.
/// ```
fn count_num_ids_per_level(&self, num_vectors: usize) -> Vec<usize> {
// The num_ids_per_level vector stores the number of vectors to be added at each level
let mut num_ids_per_level: Vec<usize> = Vec::with_capacity(self.max_level as usize);
for id_vec_to_add in 0..num_vectors {
let level_vec = *self
.levels_assigned
.get(id_vec_to_add)
.expect("Level not assigned");
while level_vec >= num_ids_per_level.len() as u8 {
num_ids_per_level.push(0);
}
num_ids_per_level[level_vec as usize] += 1;
}
num_ids_per_level
}
/// Orders the IDs of vectors based on their assigned levels and returns a vector where IDs
/// are arranged in the order they should be inserted into the HNSW graph.
///
/// This function organizes the vector IDs based on the levels they are assigned to.
/// It creates a vector `ids_per_level` where the IDs of vectors are ordered by their respective levels.
/// Vectors from lower levels appear first, followed by those from higher levels. To compute this vector
/// it uses an offset vector which track the position where the ids for each level should be inserted
/// in the ids_per_level vector. For each node, using the id it retreives the level assigned to that node and uses
/// the offsets vector to get the position to be inserted in the ids_per_level
///
/// # Parameters
///
/// - `num_ids_per_level`: A reference to a vector containing the number of vectors assigned to each level. The index
/// corresponds to the level, and the value at that index represents the count of vectors at that level.
/// - `num_vectors`: The total number of vectors in the dataset.
///
/// # Returns
///
/// - `Vec<usize>`: A vector containing the IDs of vectors arranged according to their assigned levels. Vectors at lower
/// levels are placed at the beginning of the vector, followed by vectors at higher levels.
///
/// # Example
///
/// If there are 10 vectors distributed across 3 levels with counts `[5, 3, 2]`:
/// - The `offsets` vector would be `[0, 5, 8]`, indicating the starting positions for each level in the
/// `ids_per_level` vector.
/// - For vectors at level 1, IDs are inserted starting at position 5. After inserting each ID, the offset
/// is updated to [0,6,8], so the next ID for level 1 will be placed at position 6.
///
fn order_ids_by_level(&self, num_ids_per_level: &Vec<usize>, num_vectors: usize) -> Vec<usize> {
// The ids_per_level vector holds the IDs of elements in the order they should be inserted
// according to their respective levels.
let mut ids_per_level: Vec<usize> = vec![0; num_vectors];
// the offsets vectors contains the position in which the next vector for a specific level should be inserted
// in the ids_per_level vector
let mut offsets: Vec<usize> = vec![0; num_ids_per_level.len() - 1];
offsets.push(0);
for i in 0..num_ids_per_level.len() - 1 {
offsets[i + 1] = offsets[i] + num_ids_per_level[i];
}
for id_vec_to_add in 0..num_vectors {
let level_vec = *self
.levels_assigned
.get(id_vec_to_add)
.expect("Level not assigned");
ids_per_level[offsets[level_vec as usize]] = id_vec_to_add;
offsets[level_vec as usize] += 1;
}
ids_per_level
}
/// Computes the neighbors for a given vector based on its ID and level in the HNSW graph.
///
/// This function calculates the nearest neighbors of a vector specified by `id_vec` at a particular level `level_vec`.
/// It performs the following tasks:
///
/// - **Phase 1: Greedy Search in higher levels**
///
/// The function starts by performing a greedy search from the highest level (`max_level`) down to `curr_level`.
/// It iterates through these higher levels to find the closest existing vector to the new vector being added.
/// This closest vector is then used as the starting point for the subsequent phase.
///
/// - **Phase 2: Neighbor Computation at insertion levels**
/// After identifying the closest vector from the higher levels, the function performs neighbor computation
/// starting from `curr_level` and working down to level 0. It does this by calling the `add_links_starting_from`
/// function for each of these levels. This function identifies the neighbors for the vector being added at each level,
/// and updates the corresponding segment in the `neighbors` field to store the IDs of these neighbors.
fn compute_neighbors_for_vector(
&self,
id_vec: usize,
level_vec: u8,
visited_table: &mut VisitedTable,
locks: &Vec<Mutex<()>>,
config: &ConfigHnsw,
) {
// It returns since it's the first node inserted, so there are no neighbors to search
if id_vec == self.entry_vector.unwrap() {
return;
}
let mut nearest_vec = self.entry_vector.unwrap();
let vector = self.dataset.get(id_vec);
let query_evaluator = self.dataset.query_evaluator(vector);
let mut curr_level = self.max_level;
let mut dis_nearest_vec = query_evaluator.compute_distance(nearest_vec);
{
let _lock = &locks[id_vec].lock().unwrap();
while curr_level > level_vec {
self.greedy_update_nearest(
curr_level,
&query_evaluator,
&mut nearest_vec,
&mut dis_nearest_vec,
);
curr_level -= 1;
}
}
loop {
let guard = locks[id_vec].lock().unwrap();
self.add_links_starting_from(
&query_evaluator,
id_vec,
nearest_vec,
dis_nearest_vec,
curr_level,
visited_table,
locks,
guard,
config,
);
if curr_level == 0 {
break;
}
curr_level -= 1;
}
}
/// Updates the nearest vector to the vector being added using a greedy approach.
///
/// # Arguments
/// - `curr_level`: The level in the HNSW graph where the current search for the nearest neighbor is being conducted.
/// - `query_evaluator`: A reference to an object implementing the `QueryEvaluator` trait.
/// This object provides the method to compute the distance between the query vector and each neighbor.
/// - `nearest_vec`: A mutable reference to a `usize` variable that will be updated to the index of
/// the closest neighbor found.
/// - `dis_nearest_vec`: A mutable reference to a `f32` variable that will be updated to the distance
/// of the closest neighbor found.
///
/// # Description
///
/// The function begins by retrieving the neighbors of the current nearest vector at the specified level. It then uses
/// the `compute_closest_from_neighbors` function to evaluate these neighbors and determine if any are closer to the
/// vector being added. The nearest vector and its distance are updated if a closer neighbor is found. This process
/// continues iteratively until no closer neighbors are found, at which point the function exits.
fn greedy_update_nearest(
&self,
curr_level: u8,
query_evaluator: &impl QueryEvaluator<'a>,
nearest_vec: &mut usize,
dis_nearest_vec: &mut f32,
) {
loop {
let prec_nearest = *nearest_vec;
let neighbors = self.get_unlocked_neighbors(*nearest_vec, curr_level);
compute_closest_from_neighbors(
query_evaluator,
neighbors.as_slice(),
nearest_vec,
dis_nearest_vec,
);
if prec_nearest == *nearest_vec {
return;
}
}
}
/// Retrieves the neighbors for a vector at a specified level, unlocking the neighbors for access.
///
/// This method collects and returns the neighbors of a vector identified by `id` at the given `level`. The function
/// first obtains the list of neighbor `Mutex` objects from the specified level, then locks each `Mutex` to access
/// the neighbor IDs. It filters out any `None` values and collects the remaining neighbor IDs into a vector.
///
/// # Arguments
/// - `id`: The index of the vector whose neighbors are to be retrieved.
/// - `level`: The level in the HNSW graph from which to retrieve the neighbors.
///
/// # Returns
/// A `Vec<usize>` containing the IDs of the neighbors for the specified vector at the given level.
///
/// # Example
///
/// ```rust
/// // Consider we have an HNSW graph and the neighbors of the vector with id 5 at level 2 are stored as follows:
/// // vec![Mutex::new(Some(423)), Mutex::new(Some(12)), Mutex::new(Some(42)), Mutex::new(None), Mutex::new(None)]
///
/// let neighbors = hnsw_builder.get_unlocked_neighbors(5, 2);
///
/// // The function will lock each `Mutex` and retrieve the neighbor IDs, filtering out `None` values.
/// // The returned neighbors vector will be:
/// // vec![423, 12, 42] // Only valid neighbor IDs are returned.
/// ```
#[inline]
pub fn get_unlocked_neighbors(&self, id: usize, level: u8) -> Vec<usize> {
self.get_neighbors_from_level(id, level)
.iter()
.map(|mutex| mutex.lock().unwrap())
.filter_map(|neighbor| *neighbor)
.collect()
}
/// This function handles the process of finding the neighbors for a vector being inserted
/// at a specified level in the HNSW graph and establishing connections with them.
///
/// # Arguments
///
/// - `query_evaluator`: An object that implements the `QueryEvaluator` trait. This object is used to
/// evaluate distances between the vector being added and the vectors already present in the graph.
/// - `id_vec`: The index of the vector being inserted into the HNSW graph.
/// - `nearest_vec`: The index of the nearest vetor found in the highest levels, used as the starting point
/// for neighbor search.
/// - `dis_nearest_vec`: The distance to the nearest vector found in the highest levels.
/// - `curr_level`: The current level in the HNSW graph where links are being added.
/// - `visited_table`: A table tracking the vectors that have already been visited during the search.
/// - `locks`: A slice of `Mutex` objects used to lock the neighbors of each vector to ensure
/// thread safety during updates.
/// - `guard`: A `MutexGuard` object used to manage the current lock.
/// - `config`: The configuration settings for the HNSW algorithm.
///
/// # Description
///
/// The process includes the following steps:
///
/// 1. **Find Candidate Neighbors:** The function first identifies potential neighbors for the vector
/// being inserted by calling the `search_neighbors_to_add` method. This method populates a max-heap
/// with the closest vectors, where the farthest candidate is at the top.
/// 2. **Shrink Neighbor List:** After identifying the candidate neighbors, the function checks if the
/// number of candidates exceeds the allowed limit for that level. If necessary, it shrinks the list
/// to retain only the most relevant neighbors using the `shrink_neighbor_list` method.
/// 3. **Attempt to Set Neighbors:** The function then calls `add_link` to attempt to establish
/// connections between the vector being added and its candidate neighbors. If the vector's neighbor
/// list is already full, the function uses heuristics to select which of the candidates to keep and
/// which to discard. Reciprocal links are also managed, aiming to set the vector being added as a
/// neighbor for the selected candidates. To ensure thread safety during this process, the function
/// first acquires the lock for each vector whose neighbor list is being updated. This prevents concurrent
/// modifications by other threads, maintaining the integrity of the neighbor lists.
///
fn add_links_starting_from(
&self,
query_evaluator: &impl QueryEvaluator<'a>,
id_vec: usize,
nearest_vec: usize,
dis_nearest_vec: f32,
curr_level: u8,
visited_table: &mut VisitedTable,
locks: &[Mutex<()>],
guard: MutexGuard<()>,
config: &ConfigHnsw,
) {
//max-heap, on top is the farthest vector
let mut closest_vectors: BinaryHeap<Node> = BinaryHeap::new();
self.search_neighbors_to_add(
&mut closest_vectors,
query_evaluator,
nearest_vec,
dis_nearest_vec,
curr_level,
visited_table,
config,
);
let m = self.num_neighbors_per_level(curr_level);
self.shrink_neighbor_list(&mut closest_vectors, m);
closest_vectors.iter().for_each(|&neighbor| {
self.add_link(id_vec, neighbor.id_vec(), curr_level);
});
std::mem::drop(guard);
closest_vectors.iter().for_each(|&neighbor| {
let _lock = locks[neighbor.id_vec()].lock().unwrap();
self.add_link(neighbor.id_vec(), id_vec, curr_level);
});
}
/// This function updates the neighbor list for the source vector (`id_src`) by including the
/// destination vector (`id_dest`) at the specified level (`curr_level`). If the neighbor list
/// is full, the function uses a heuristic to select the most relevant neighbors to retain.
///
/// # Arguments
///
/// - `id_src`: The index of the source vector in the HNSW graph, which is being updated to include
/// the new neighbor.
/// - `id_dest`: The index of the destination vector in the HNSW graph, which is being considered
/// for inclusion in the neighbor list of `id_src`.
/// - `curr_level`: The level in the HNSW graph where the neighbor list of `id_src` is being updated.
///
/// # Description
///
/// 1. **Get Neighbor Offsets:** Retrieves the start (`begin`) and end (`end`) offsets for the neighbor
/// list of `id_src` at `curr_level`. The range `[begin, end)` defines where the neighbors are stored
/// for `id_src` at this level, with `end` being exclusive.
///
/// 2. **Allocate in Empty Position:** Attempts to insert `id_dest` into an available slot within the
/// neighbor list of `id_src`. If there is space available, the function returns immediately,
/// having successfully added the new neighbor.
///
/// 3. **Handle Full Neighbor List:** If the list is full, the function creates a max-heap containing
/// `id_dest` along with the existing neighbors of `id_src`.
///
/// 4. **Shrink Neighbor List:** After computing the candidate neighbors, the list is reduced to the
/// maximum allowed size for the current level (`max_size`) using `shrink_neighbor_list`.
/// This step ensures that only the most pertinent neighbors are kept in the list.
///
/// 5. **Update Neighbor List:** Updates the neighbor list of `id_src` for the specified level
/// with the selected neighbors from the `shrink_neighbor_list` function.
///
/// # Note
///
/// The function does not guarantee that `id_dest` will be included as a neighbor of `id_src`. If the neighbor
/// list of `id_src` is full, the heuristic applied in the `shrink_neighbor_list` function may not choose `id_dest`
/// as a neighbor.
fn add_link(&self, id_src: usize, id_dest: usize, curr_level: u8) {
let (begin, end) = self.get_offsets_neighbors_from_level(id_src, curr_level);
if self.allocate_in_empty_position(begin, end, id_dest) {
return;
}
let mut candidate_neighbors = self.compute_candidate_neighbors(id_src, id_dest, begin, end);
let max_size = end - begin;
self.shrink_neighbor_list(&mut candidate_neighbors, max_size);
self.fill_neighbors_list(&mut candidate_neighbors, begin, end);
}
/// Attempts to insert a new neighbor into an empty slot in the neighbor list of a vector.
///
/// # Arguments
///
/// - `begin`: The start index (inclusive) of the range in the neighbor list to check for available positions.
/// - `end`: The end index (exclusive) of the range in the neighbor list to check for available positions.
/// - `id_dest`: The index of the vector to be added as a neighbor.
///
/// # Description
///
/// This function attempts to insert `id_dest` into an empty slot in the neighbor list of a vector.
/// It operates within the range defined by `begin` and `end` indices. If an empty slot is found,
/// it places `id_dest` in that slot and returns `true`.
/// If no empty slots are available, it returns `false`.
///
/// # Example
///
/// ```rust
/// // Initial state of the neighbor list, where `None` represents an empty slot
/// let neighbors: Vec<Mutex<Option<usize>>> = vec![
/// Mutex::new(Some(3)), // suppose this is index 10
/// Mutex::new(Some(5)),
/// Mutex::new(None),
/// Mutex::new(None),
/// Mutex::new(None), // suppose this is index 14
/// ];
///
/// // Call to `allocate_in_empty_position` with `begin` = 10 and `end` = 15
/// let result = allocate_in_empty_position(10, 15, 7);
///
/// // After the call, the neighbor list is updated
/// // The state of the neighbor list will now be:
/// // [Some(3), Some(5), Some(7), None, None]
/// ```
#[inline]
fn allocate_in_empty_position(&self, begin: usize, end: usize, id_dest: usize) -> bool {
if self.neighbors[end - 1].lock().unwrap().is_none() {
let first_empty_spot = self.find_first_empty_spot(begin, end);
*self.neighbors[first_empty_spot].lock().unwrap() = Some(id_dest);
return true;
}
false
}
/// Finds the index of the first empty slot (i.e., `None`) within a specified range in the neighbor list.
///
/// # Arguments
///
/// - `begin`: The start index (inclusive) of the range within the `neighbors` list to search for empty slots.
/// - `end`: The end index (exclusive) of the range within the `neighbors` list to search for empty slots.
///
/// # Returns
///
/// - `usize`: The index of the first empty slot within the specified range. If no empty slots are found,
/// returns the `begin` index.
///
/// **Note**: This function assumes there is at least one empty slot in the range.
///
/// # Example
///
/// ```rust
/// // Example neighbor list with some slots filled and others empty
/// let neighbors: Vec<Mutex<Option<usize>>> = vec![
/// Mutex::new(Some(1)), \\ Suppose index is 10
/// Mutex::new(Some(2)),
/// Mutex::new(None),
/// Mutex::new(None),
/// Mutex::new(None), \\ Suppose index is 14
/// ];
///
/// // Finding the first empty slot in the range from index 10 to 15
/// let index = find_first_empty_spot(10, 15);
///
/// // The result will be the index of the first empty slot, which is 12
/// assert_eq!(index, 12);
/// ```
#[inline]
fn find_first_empty_spot(&self, begin: usize, end: usize) -> usize {
let mut i = end - 1;
while i > begin {
if self.neighbors[i - 1].lock().unwrap().is_some() {
break;
}
i -= 1;
}
i
}
/// Computes a max-heap of candidate neighbors for the vector with index `id_src`, including both
/// existing neighbors and a new vector.
///
/// # Arguments
///
/// - `id_src`: The index of the source vector for which potential neighbors are being computed.
/// - `id_dest`: The index of the new vector to be considered as a potential neighbor for `id_src`.
/// - `begin`: The start index (inclusive) of the segment in the neighbor list where neighbors of `id_src` are stored.
/// - `end`: The end index (exclusive) of the segment in the neighbor list where neighbors of `id_src` are stored.
///
/// # Description
///
/// This function constructs a max-heap of candidate neighbors for the vector with index `id_src`. The heap includes:
/// 1. The new vector with index `id_dest`, along with its distance to `id_src`.
/// 2. The existing neighbors within the range `[begin, end)` from the neighbor list of `id_src`, each
/// with their distance to `id_src`.
///
#[inline]
fn compute_candidate_neighbors(
&self,
id_src: usize,
id_dest: usize,
begin: usize,
end: usize,
) -> BinaryHeap<Node> {
let distance_src_dest = self.dataset.compute_distance_by_id(id_src, id_dest);
let mut candidate_neighbors: BinaryHeap<Node> = BinaryHeap::new();
candidate_neighbors.push(Node(distance_src_dest, id_dest));
for i in begin..end {
let neigh = self.neighbors[i].lock().unwrap().unwrap();
let distance_src_neigh = self.dataset.compute_distance_by_id(id_src, neigh);
candidate_neighbors.push(Node(distance_src_neigh, neigh));
}
candidate_neighbors
}
/// Updates the neighbor list of a vector with a selected set of candidate neighbors.
///
/// # Arguments
///
/// - `candidate_neighbors`: A mutable reference to a max-heap (`BinaryHeap`) of `Node` objects representing
/// the selected neighbors.
/// - `begin`: The start index (inclusive) of the range in the neighbor list where neighbors will be updated.
/// - `end`: The end index (exclusive) of the range in the neighbor list where neighbors will be updated.
///
/// # Description
///
/// This function fills the neighbor list of a vector with the selected candidate neighbors from the
/// provided `candidate_neighbors` heap.
/// The neighbors are stored in the range defined by the `begin` and `end` indices. The function processes
/// the heap until it is empty, populating the neighbor list starting from `begin`. If any slots remain after
/// all neighbors are added, they are filled with `None` to indicate empty slots.
#[inline]
fn fill_neighbors_list(
&self,
candidate_neighbors: &mut BinaryHeap<Node>,
begin: usize,
end: usize,
) {
let mut i = begin;
while !candidate_neighbors.is_empty() {
*self.neighbors[i].lock().unwrap() = Some(candidate_neighbors.pop().unwrap().id_vec());
i += 1;
}
while i < end {
*self.neighbors[i].lock().unwrap() = None;
i += 1;
}
}
/// Performs a search to find the nearest neighbors for a given vector being added to the HNSW graph.
///
/// /// # Arguments
///
/// - `closest_vectors`: A mutable reference to a max-heap (`BinaryHeap`) of `Node` objects.
/// This heap is initially empty and will be populated with the closest vectors found during the search.
/// - `query_evaluator`: An implementation of the `QueryEvaluator` trait that provides the method to compute
/// the distance between the query vector and the vectors already added to the graph.
/// - `nearest_vec`: The index of the nearest vector found in the highest levels that acts as the starting
/// node from which the search begins.
/// - `dis_nearest_vec`: The distance of the nearest vector found so far to the query vector.
/// - `curr_level`: The current level in the HNSW graph where the search is being conducted.
/// - `visited_table`: A mutable reference to the `VisitedTable`, which keeps track of the vectors that have
/// already been visited during the search.
/// - `config`: A reference to the `ConfigHnsw` structure, which contains configuration parameters for the search
/// process, such as the `ef_construction` value.
///
/// # Description
///
/// The search process begins by initializing a min-heap (`candidates`) with the nearest vector found so far.
/// The function then iteratively evaluates each candidate's neighbors. For each neighbor that hasn't been
/// visited yet, the function calculates the distance between the neighbor and the query vector using the
/// `query_evaluator`. If the neighbor is closer than the farthest vector in the `closest_vectors` heap,
/// it is added to both heaps and marked as visited.
///
/// The search continues until all candidates have been explored or until a candidate's distance exceeds
/// the maximum distance in the `closest_vectors` heap. If a candidate's distance is greater than this
/// maximum distance, the search stops as further candidates will have even larger distances and are thus
/// less likely to be relevant.
///
/// Once the search completes, `closest_vectors` will contain the closest neighbors to the query vector,
/// and the visited table is advanced to prepare for subsequent searches.
fn search_neighbors_to_add(
&self,
closest_vectors: &mut BinaryHeap<Node>,
query_evaluator: &impl QueryEvaluator<'a>,
nearest_vec: usize,
dis_nearest_vec: f32,
curr_level: u8,
visited_table: &mut VisitedTable,
config: &ConfigHnsw,
) {
//min-heap based on distance
let mut candidates: BinaryHeap<Reverse<Node>> = BinaryHeap::new();
let node = Node(dis_nearest_vec, nearest_vec);
candidates.push(Reverse(node));
closest_vectors.push(node);
visited_table.set(nearest_vec);
while let Some(node) = candidates.pop() {
let curr_node = node.0;
let farthest_in_heap = *closest_vectors.peek().unwrap();
if curr_node.distance() > farthest_in_heap.distance() {
break;
}
let curr_node = curr_node.id_vec();
let neighbors = self.get_unlocked_neighbors(curr_node, curr_level);
for &neighbor in neighbors.iter() {
if !visited_table.get(neighbor) {
visited_table.set(neighbor);
let distance_to_neighbor = query_evaluator.compute_distance(neighbor);
let neighbor_node = Node(distance_to_neighbor, neighbor);
add_neighbor_to_heaps(
&mut candidates,
closest_vectors,
neighbor_node,
config.get_ef_construction(),
);
}
}
}
visited_table.advance();
}
/// Shrinks the neighbor list to ensure it contains only the most relevant neighbors, up to a
/// specified maximum size. This function applies a heuristic from the HNSW paper (https://arxiv.org/abs/1603.09320)
/// by Malkov and Yashunin to determine which neighbors to keep.
///
/// # Arguments
///
/// - `closest_vectors`: A mutable reference to a `BinaryHeap<Node>` containing the current neighbors.
/// - `max_size`: The maximum number of neighbors to include in the list.
///
/// # Description
///
/// This function manages the size of the `closest_vectors` heap to ensure it does not exceed `max_size`.
/// If the heap already contains more neighbors than allowed, the function performs the following steps:
///
/// 1. **Convert Heap:** Converts the max-heap of `closest_vectors` into a min-heap for easier management
/// of the smallest elements.
///
/// 2. **Evaluate Nodes:** Iterates over the nodes in the min-heap and determines which nodes should remain.
/// A node will stay in the heap if it is closer to the query vector than any other node in `closest_vectors`.
///
/// 3. **Rebuild Heap:** Pushes the nodes that should stay back into the `closest_vectors` heap.
/// The process stops when the heap reaches the `max_size` or when all relevant nodes have been processed.
fn shrink_neighbor_list(&self, closest_vectors: &mut BinaryHeap<Node>, max_size: usize) {
if closest_vectors.len() < max_size {
return;
}
let mut min_heap = from_max_heap_to_min_heap(closest_vectors);
while let Some(node) = min_heap.pop() {
let node1 = node.0;
let dist_node1_vec = node1.distance();
let mut keep_node_1 = true;
for node2 in closest_vectors.iter() {
let dist_node_1_node2 = self
.dataset
.compute_distance_by_id(node1.id_vec(), node2.id_vec());
if dist_node_1_node2 < dist_node1_vec {
keep_node_1 = false;
break;
}
}
if keep_node_1 {
closest_vectors.push(node1);
if closest_vectors.len() >= max_size {
return;
}
}
}
}
/// Assigns levels to each vector in the graph and updates the internal `offsets` and `neighbors` vectors.
///
/// # Arguments
///
/// - `num_vectors`: The number of vectors to which levels will be assigned.
///
/// # Description
///
/// This function assigns a level to each vector in the graph. It updates the `offsets` vector based on the assigned
/// levels and initializes the `neighbors` vector with `None` to represent empty neighbor slots.
#[inline]
fn assign_level(&mut self, num_vectors: usize) {
let mut rng = StdRng::seed_from_u64(523);
for _ in 0..num_vectors {
let level = self.random_level(&mut rng);
self.levels_assigned.push(level);
if level > self.max_level {
self.max_level = level;
}
self.update_offset_vector(level);
}
// padd neighbors vector with None, which means that there are not neighbors yet
for _ in 0..*self.offsets.last().unwrap() {
self.neighbors.push(Mutex::new(None));
}
}
/// Updates the `offset` vector by pushing the position where the current node's neighbors end and the
/// next node's neighbors begin.
///
/// # Parameters
///
/// - `level`: The level assigned to the node for which the offset is being updated.
///
/// # Description
///
/// This function retrieves the last position in the `offset` vector, which indicates where the
/// previous node's neighbors ended and current node's neighbors begin. Then, based on the
/// level assigned to the current node, it calculates the number of neighbors the node has using
/// the `cum_num_neigh_per_level` vector.
/// Finally, it adds this number to the last position and pushes the resulting value to the `offset` vector.
/// This new value represents the position where the current node's neighbors end and the next node's
/// neighbors start in the neighbors vector.
///
/// # Example
///
/// Assume that the number of neighbors per vector is set to 16, and the `offset` vector initially
/// looks like `[0, 48, 80, 112]`. The `cum_num_neigh_per_level` vector is `[0, 32, 48, 64, 80]`,
/// where the neighbors for level 0 are `2 * num_neighbors_per_vec`.
///
/// If the node being updated is assigned to level 1, it will have a total of 48 neighbors:
/// 32 for level 0 and 16 for level 1.
/// After updating, the `offset` vector becomes `[0, 48, 80, 112, 160]`, meaning that the neighbors
/// of the current node are stored in the range `[112, 160)`.
fn update_offset_vector(&mut self, level: u8) {
let last_pos_offset = self.offsets.last().unwrap_or(&0);
let cum_num_neighbors = self
.cum_num_neigh_per_level
.get((level + 1) as usize)
.expect("Level not present");
self.offsets.push(last_pos_offset + cum_num_neighbors);
}
/// This function generates a random level for a node in the HNSW graph.
///
/// # Description
///
/// The function begins by generating a random floating-point number `f` between 0.0 and 1.0.
/// The function then iterates over the `probabs_levels` vector, comparing `f` with the probability thresholds for
/// each level. If `f` is less than the current level's probability, that level is selected and returned as a `u8`.
/// If `f` is larger, the function reduces `f` by the threshold value and continues to the next level. If no level
/// is selected, the maximum level, which corresponds to the last index of `probabs_levels`, is returned.
///
/// # Parameters
///
/// - `rng`: A mutable reference to a random number generator of type `StdRng`.
///
/// # Returns
///
/// - `u8`: The level selected for the node, ranging from 0 to the maximum level.
///
/// /// # Example
///
/// Assume `probabs_levels` contains `[0.6, 0.3, 0.1]` and the random value `f` is `0.65`.
/// After checking level 0 (0.6),`f` is decreased by 0.6 to become `0.05`. The function would then
/// return level 1, as `0.05` is less than the probability for level 1 (0.3).
fn random_level(&self, rng: &mut StdRng) -> u8 {
let mut f: f32 = rng.gen_range(0.0..1.0);
for (level, &prob) in self.probabs_levels.iter().enumerate() {
if f < prob {
return level as u8;
}
f -= prob;
}
// it returns the maximum level which is the size of the vector probabs_levels
(self.probabs_levels.len() - 1) as u8
}
/// This function retrieves a slice of the `neighbors` vector that corresponds to the neighbors
/// of a vector at a specific level.
/// # Parameters
///
/// - `id_vec`: The ID of the vector for which neighbors are being accessed.
/// - `level`: The level in the HNSW graph for which neighbors are to be retrieved.
///
/// # Returns
///
/// - `&[Mutex<Option<usize>>]`: A slice of the `neighbors` vector, containing the neighbors of
/// the specified vector at the specified level.
#[inline]
pub fn get_neighbors_from_level(&self, id_vec: usize, level: u8) -> &[Mutex<Option<usize>>] {
let (begin, end) = self.get_offsets_neighbors_from_level(id_vec, level);
&self.neighbors[begin..end]
}
/// Computes the range of indices for accessing neighbors of a vector at a specific level.
///
/// # Parameters
///
/// - `id_vec`: The ID of the vector for which the neighbor offsets are being computed.
/// - `level`: The level in the HNSW graph for which the neighbor offsets are to be determined.
///
/// # Returns
///
/// - `(usize, usize)`: A tuple containing:
/// - The starting index (`begin`) in the `neighbors` vector where the neighbors of the vector at
/// the specified level begin.
/// - The ending index (`end`) in the `neighbors` vector where the neighbors of the vector at the
/// specified level end (exclusive).
///
/// # Description
///
/// To compute these ranges, the function first retrieves the offset for the vector's ID,
/// which indicates the base position where the neighbors of that vector start.
/// To determine the exact range for the neighbors at the requested level, the function:
///
/// 1. Adds the cumulative number of neighbors up to the current level to the offset to
/// compute the `begin` index. This gives the starting position for the neighbors at the specified level.
/// 2. Adds the cumulative number of neighbors up to the next level to the offset to compute the `end` index.
/// This provides the ending position for the neighbors of the current level.
///
/// # Example
///
/// For an `offset` vector of `[0, 48, 96, 112]` and a `cum_num_neigh_per_level` vector of `[0, 32, 48, 64, 80]`,
/// to determine the range of neighbors for the vector with ID `1` at level `1`, follow these steps:
///
/// 1. Retrieve the offset for the vector with ID `1`, which is `48`.
/// 2. Compute the `begin` index by adding the cumulative number of neighbors for level `1`,
/// which is `32`, to the offset: `48 + 32 = 80`.
/// 3. Compute the `end` index by adding the cumulative number of neighbors for level `2`,
/// which is `48`, to the offset: `48 + 48 = 96`.
///
/// Therefore, the range of indices for the neighbors of the vector with ID `1` at level `1` is `[80, 96)`.
#[inline]
fn get_offsets_neighbors_from_level(&self, id_vec: usize, level: u8) -> (usize, usize) {
assert!(
level <= self.levels_assigned[id_vec],
"The required level for the vector with ID {} is higher than the one assigned.",
id_vec
);
let o = self.offsets.get(id_vec).expect("Id vector not present");
let begin = o + self
.cum_num_neigh_per_level
.get(level as usize)
.expect("Level not present");
let end = o + self
.cum_num_neigh_per_level
.get((level + 1) as usize)
.expect("Level not present");
(begin, end)
}
/// Initializes the default probability levels and cumulative neighbor counts for each level in the HNSW graph.
///
/// # Description
///
/// This function sets up two key components for the HNSW graph structure:
///
/// 1. **Probability Levels (`probabs_levels`)**: The probability of a vector being assigned
/// to each level of the graph. The probability decreases exponentially with increasing levels,
/// ensuring fewer vectors are assigned to higher levels.
/// The probabilities are computed based on the level multiplier (`level_mult`).
///
/// 2. **Cumulative Neighbors per Level (`cum_num_neigh_per_level`)**: The cumulative count
/// of neighbors up to each level. The function calculates the number of neighbors for each level,
/// with level 0 having twice the number of neighbors as higher levels.
/// This count is accumulated and stored in the `cum_num_neigh_per_level` vector,
/// where each entry represents the total number of neighbors up to that level.
///
/// The function continues to compute these values for increasing levels until the calculated
/// probability for a level falls below a small threshold (`1e-9`).
///
/// # Parameters
///
/// - `level_mult`: A factor that controls how quickly the probability decreases as the level increases.
/// A larger `level_mult` causes probabilities to drop more slowly, spreading vectors across more levels.
/// A smaller `level_mult` leads to a faster drop-off, concentrating most vectors in the lower levels.
/// - `num_neighbors_per_vec`: The base number of neighbors assigned to each vector at levels above 0.
/// Level 0 is assigned twice this number of neighbors.
/// # Example
///
/// After calling this function with a `level_mult` of `1.0` and `num_neighbors_per_vec` of `16`,
/// the `probabs_levels` and `cum_num_neigh_per_level` vectors might look like this:
///
/// ```rust
/// probabs_levels = [0.6321, 0.3679, 0.1353, ...];
/// cum_num_neigh_per_level = [0, 32, 48, 64, ...];
/// ```
///
fn set_default_probas(&mut self, level_mult: f32, num_neighbors_per_vec: usize) {
let mut nn = 0;
self.cum_num_neigh_per_level.push(0);
for level in 0.. {
let proba = (-level as f32 / level_mult).exp() * (1.0 - (-1.0 / level_mult).exp());
if proba < 1e-9 {
break;
}
self.probabs_levels.push(proba);
nn += if level == 0 {
num_neighbors_per_vec * 2
} else {
num_neighbors_per_vec
};
self.cum_num_neigh_per_level.push(nn);
}
}
/// Returns the number of neighbors for a given level.
///
/// This function calculates the number of neighbors assigned to a specific level by subtracting the cumulative
/// number of neighbors for the current level from the cumulative number of neighbors for the next level.
///
/// # Parameters
/// - `level`: The level for which the number of neighbors is to be retrieved.
///
/// # Returns
/// - `usize`: The number of neighbors assigned to the specified level.
///
/// # Example
///
///
/// Assume the `cum_num_neigh_per_level` vector is `[0, 32, 48, 64]`.
/// If we want to know the number of neighbors for level 2, we calculate it as `64 - 48`, which equals `16`.
///
#[inline]
fn num_neighbors_per_level(&self, level: u8) -> usize {
self.cum_num_neigh_per_level[(level + 1) as usize]
- self.cum_num_neigh_per_level[level as usize]
}
/// Returns a reference to the vector that stores the level assigned to each vector.
pub fn get_level_assigned(&self) -> &Vec<u8> {
&self.levels_assigned
}
}