kannolo 0.1.6

kANNolo is designed for easy prototyping of ANN Search algorithms while ensuring high effectiveness and efficiency over both dense and sparse vectors.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
use crate::quantizer::{IdentityQuantizer, Quantizer, QueryEvaluator};
use crate::topk_selectors::topk_heap::TopkHeap;
use crate::topk_selectors::OnlineTopKSelector;
use crate::{hnsw_utils::*, DistanceType};
use crate::{Dataset, Float, GrowableDataset};
use crate::{DotProduct, EuclideanDistance};
use config_hnsw::ConfigHnsw;
use hnsw_builder::HnswBuilder;
use level::Level;
use serde::{Deserialize, Serialize};
use std::cmp::Reverse;
use std::collections::{BinaryHeap, HashSet};
use std::marker::PhantomData;

/// A `GraphIndex` is a Hierarchical Navigable Small World (HNSW) graph used for approximate
/// nearest neighbor (ANN) search. It is built from a dense or sparse dataset plus configuration
/// settings (see `from_dataset`) and answers queries by returning the `k` closest stored vectors
/// for a given query vector (see `search`).
///
/// # Fields
///
/// - `levels`: A boxed slice containing the hierarchical levels of the HNSW graph. Each level stores
///   the neighbors of the vectors at that level. Index `0` is the ground level, where the final
///   candidate search runs; the remaining levels are only used for the greedy descent.
/// - `dataset`: The dataset, either dense or sparse, that the graph index searches over. It holds
///   the vectors in the order assigned during graph construction and provides the query evaluator
///   used to compute distances.
/// - `num_neighbors_per_vec`: The number of neighbors per vector that the graph was configured with.
///   This parameter determines the connectivity of the graph.
/// - `id_permutation`: A boxed slice mapping internal IDs back to source IDs: the vector stored at
///   position `i` of `dataset` is the vector at position `id_permutation[i]` of the dataset the graph
///   was built from. Search results are translated through this mapping before being returned.
/// - `entry_vec`: The internal ID of the vector from which every search begins.
///   NOTE(review): the original documentation states that this vector is assigned to the highest
///   level of the hierarchy; that is decided by the graph builder and is not visible here — confirm.
/// - `_phantom`: A `PhantomData` marker that ties the type parameter `Q` and the lifetime `'a` to the
///   struct without storing a value of type `Q`.
#[derive(Serialize, Deserialize)]
pub struct GraphIndex<'a, D, Q>
where
    D: Dataset<'a, Q>,
    Q: Quantizer<DatasetType<'a> = D> + 'a,
{
    levels: Box<[Level]>,
    dataset: D,
    num_neighbors_per_vec: usize,
    id_permutation: Box<[usize]>,
    entry_vec: usize,
    _phantom: PhantomData<&'a Q>,
}

impl<'a, D, Q> GraphIndex<'a, D, Q>
where
    D: Dataset<'a, Q> + GrowableDataset<'a, Q>,
    Q: Quantizer<DatasetType<'a> = D>,
{
    /// Builds a `GraphIndex` from a source dataset, an HNSW configuration, and a quantizer.
    ///
    /// The HNSW graph is computed over `source_dataset`, whose quantizer must be an
    /// `IdentityQuantizer` (i.e. the source dataset exposes raw, un-encoded vectors). The graph
    /// builder returns the levels, a permutation of the vector IDs, and the entry vector. The
    /// source vectors are then pushed, in permuted order, into a fresh dataset of type `D` created
    /// with the supplied `quantizer`; that dataset is the one stored in the index and used at
    /// search time.
    ///
    /// # Arguments
    ///
    /// - `source_dataset`: The dataset containing the vectors to be indexed. It implements the
    ///   `Dataset` trait and its quantizer implements `IdentityQuantizer`.
    /// - `config`: The `ConfigHnsw` holding the graph-construction parameters, such as the number
    ///   of neighbors per vector.
    /// - `quantizer`: The quantizer handed to the new dataset `D`, which encodes the vectors as
    ///   they are pushed.
    /// - `num_threads`: Forwarded to the graph builder (the number of threads used to build the
    ///   graph, judging by the name and the example below).
    ///
    /// # Returns
    ///
    /// A new `GraphIndex` constructed from the provided dataset, configuration settings, and quantizer.
    ///
    /// # Panics
    ///
    /// Panics if the permutation returned by the builder has fewer entries than `source_dataset`
    /// has vectors.
    ///
    /// # Examples
    ///
    /// NOTE(review): the crate path used below (`struttura_kANNolo`) may not match the published
    /// crate name — confirm before relying on this example.
    ///
    /// ```rust
    /// use struttura_kANNolo::{
    /// hnsw::graph_index::GraphIndex,
    /// hnsw_utils::config_hnsw::ConfigHnsw};
    /// use rand::prelude::*;
    /// use std::iter;
    /// use struttura_kANNolo::plain_quantizer::PlainQuantizer;
    /// use struttura_kANNolo::{DenseDataset,GrowableDataset,DistanceType};
    ///
    /// let mut rng = rand::thread_rng();
    ///
    /// let n_vecs=1000;
    /// let dim_vecs = 10;
    ///
    /// // Set the number of threads to use to build the GraphIndex
    /// let num_threads=32;
    ///
    /// // Generate a vector of random floating-point numbers for the dataset.
    /// let vectors :Vec<f32> = iter::repeat_with(|| rng.gen::<f32>()).take(n_vecs*dim_vecs).collect();
    ///
    /// // Create a DenseDataset from the generated vectors.
    /// let dataset = DenseDataset::from_vec(vectors, dim_vecs, PlainQuantizer::<f32>::new(dim_vecs, DistanceType::Euclidean));
    ///
    /// // Create a quantizer for encoding vectors during graph construction.
    /// let quantizer = PlainQuantizer::<f32>::new(dim_vecs, DistanceType::Euclidean);
    ///
    /// // Build the HNSW graph configuration with default settings.
    /// let config = ConfigHnsw::new().build();
    ///
    /// // Create the GraphIndex using the dataset, configuration, and quantizer.
    /// let hnsw_index = GraphIndex::from_dataset(&dataset, &config, quantizer, num_threads);
    ///
    /// // At this point, the `hnsw_index` is ready for performing nearest neighbor searches.
    /// ```
    pub fn from_dataset<SD, IQ>(
        source_dataset: &'a SD,
        config: &ConfigHnsw,
        quantizer: Q,
        num_threads: usize,
    ) -> Self
    where
        SD: Dataset<'a, IQ> + Sync,

        IQ: IdentityQuantizer<DatasetType<'a> = SD, T: Float> + Sync + 'a,
        // The source dataset's `get` yields values of its `DataType`, while the query evaluator
        // consumes values of its `QueryType`; the two must therefore coincide.
        <IQ as Quantizer>::Evaluator<'a>:
            QueryEvaluator<'a, QueryType = <SD as Dataset<'a, IQ>>::DataType>,

        // The new dataset's `push` consumes `InputDataType`, while the vectors we feed it come
        // from the source dataset as `DataType`; the two must therefore coincide.
        D: GrowableDataset<'a, Q, InputDataType = <SD as Dataset<'a, IQ>>::DataType>,
    {
        let neighbors_per_vec = config.get_num_neighbors_per_vec();

        // Build the graph over the raw source vectors.
        let mut builder = HnswBuilder::new(neighbors_per_vec, source_dataset);
        let (levels, id_permutation, entry_vector) = builder.compute_graph(config, num_threads);

        // Re-encode the source vectors in permuted order, so that position `i` of the new dataset
        // holds source vector `id_permutation[i]`.
        let mut permuted_dataset = D::new(quantizer, source_dataset.dim());
        let source_ids = (0..source_dataset.len()).map(|position| id_permutation[position]);
        for source_id in source_ids {
            let vector = source_dataset.get(source_id);
            permuted_dataset.push(&vector);
        }

        Self::new(
            levels,
            permuted_dataset,
            neighbors_per_vec,
            id_permutation,
            entry_vector,
        )
    }

    /// Assembles a `GraphIndex` from parts that have already been built.
    ///
    /// # Arguments
    ///
    /// - `levels`: A `Vec<Level>` with the levels of the HNSW graph; it is stored as a boxed slice.
    /// - `dataset`: The dataset of vectors the index searches over. It can be dense or sparse
    ///   and implements the `Dataset` trait.
    /// - `num_neighbors_per_vec`: The number of neighbors per vector the graph was configured with.
    /// - `id_permutation`: A `Vec<usize>` with the permutation of vector IDs assigned during graph
    ///   construction; it is stored as a boxed slice. During search it is used to map the IDs of
    ///   the closest vectors found back to their positions in the dataset the graph was built from.
    /// - `entry_vec`: The ID of the vector from which every search starts.
    ///
    /// # Returns
    ///
    /// A new instance of `GraphIndex` holding the provided levels, dataset, number of neighbors per
    /// vector, ID permutation, and entry vector.
    fn new(
        levels: Vec<Level>,
        dataset: D,
        num_neighbors_per_vec: usize,
        id_permutation: Vec<usize>,
        entry_vec: usize,
    ) -> Self {
        // Neither collection grows after construction, so drop the spare capacity.
        let levels = levels.into_boxed_slice();
        let id_permutation = id_permutation.into_boxed_slice();

        Self {
            levels,
            dataset,
            num_neighbors_per_vec,
            id_permutation,
            entry_vec,
            _phantom: PhantomData,
        }
    }
}

impl<'a, D, Q> GraphIndex<'a, D, Q>
where
    D: Dataset<'a, Q> + Sync,
    Q: Quantizer<InputItem: Float, DatasetType<'a> = D> + Sync,
{
    /// Performs a nearest neighbor search for a single query vector on the HNSW graph.
    ///
    /// The search itself is delegated to `find_k_nearest_neighbors`; this method then translates
    /// the internal IDs of the results back to positions in the dataset the index was built from
    /// and, for the dot-product distance type, flips the sign of the reported values.
    ///
    /// # Arguments
    ///
    /// - `query`: The query vector, of the `DataType` of the query dataset type `QD`. It must be
    ///   the same type as the `QueryType` of the index quantizer's evaluator.
    /// - `k`: The number of nearest neighbors to return.
    /// - `config`: A reference to `ConfigHnsw`, which holds configuration parameters for the search process.
    ///
    /// # Returns
    ///
    /// A `Vec<(f32, usize)>` of `(distance, id)` pairs for the query, in the order produced by
    /// `find_k_nearest_neighbors`. Each ID is the position of the neighbor in the dataset used to
    /// build the `GraphIndex`. When the index dataset's quantizer uses `DistanceType::DotProduct`,
    /// every returned value is the negation of the internally computed one (presumably because
    /// the search minimizes a negated dot product — confirm against the quantizer).
    ///
    /// # Example
    ///
    /// NOTE(review): this example appears to predate the current signature. It passes a whole
    /// query dataset (`&query_dataset`) where a single `QD::DataType` query vector is expected,
    /// and the crate path `struttura_kANNolo` may not match the published crate name — confirm
    /// and update.
    ///
    /// ```rust
    /// use struttura_kANNolo::{
    ///     hnsw::graph_index::GraphIndex,
    ///     hnsw_utils::config_hnsw::ConfigHnsw,
    ///     DenseDataset,
    ///     plain_quantizer::PlainQuantizer,
    ///     DistanceType,
    /// };
    /// use rand::prelude::*;
    /// use std::iter;
    ///
    ///
    /// let mut rng = rand::thread_rng();
    ///
    /// let n_vecs = 1000;
    /// let dim_vecs = 10;
    ///
    /// // Initialize the dataset.
    /// let vectors: Vec<f32> = iter::repeat_with(|| rng.gen::<f32>()).take(n_vecs * dim_vecs).collect();
    /// let dataset = DenseDataset::from_vec(vectors, dim_vecs, PlainQuantizer::new(dim_vecs, DistanceType::Euclidean));
    ///
    /// let config = ConfigHnsw::new().build();
    /// let k = 10; // Number of nearest neighbors to retrieve.
    ///
    /// // Create the GraphIndex.
    /// let hnsw_index = GraphIndex::from_dataset(&dataset, &config, PlainQuantizer::new(dim_vecs, DistanceType::Euclidean), 1);
    ///
    /// // Prepare the query dataset with random vectors.
    /// let query_vectors: Vec<f32> = iter::repeat_with(|| rng.gen::<f32>()).take(20 * dim_vecs).collect(); // 20 queries, each with `dim_vecs` dimensions
    /// let query_dataset = DenseDataset::from_vec(query_vectors, dim_vecs, PlainQuantizer::new(dim_vecs, DistanceType::Euclidean));
    ///
    /// // Perform the search.
    /// let results = hnsw_index.search(&query_dataset, k, &config);
    ///
    /// // `results` now contains the nearest neighbors for each query vector.
    /// ```
    pub fn search<QD, QQ>(
        &'a self,
        query: QD::DataType,
        k: usize,
        config: &ConfigHnsw,
    ) -> Vec<(f32, usize)>
    where
        // The query dataset type (QD) could be directly of type D, but that would not work when D
        // is a dataset with a ProductQuantizer, because queries come from a dataset with a
        // PlainQuantizer.
        QD: Dataset<'a, QQ> + Sync,
        QQ: Quantizer<DatasetType<'a> = QD> + 'a + Sync,
        // `find_k_nearest_neighbors` takes a value of the evaluator's `QueryType`, while the query
        // handed to this method is of the query dataset's `DataType`; the two must coincide.
        <Q as Quantizer>::Evaluator<'a>:
            QueryEvaluator<'a, QueryType = <QD as Dataset<'a, QQ>>::DataType>,
        <Q as Quantizer>::InputItem: EuclideanDistance<<Q as Quantizer>::InputItem>
            + DotProduct<<Q as Quantizer>::InputItem>,
    {
        let neighbors = self.find_k_nearest_neighbors(query, k, config);

        // Dot-product results are reported with the opposite sign of the internal value.
        let flip_sign = self.dataset.quantizer().distance() == DistanceType::DotProduct;

        // Single pass over the owned results: fix the sign if needed and map each internal ID
        // back to its position in the dataset the index was built from.
        neighbors
            .into_iter()
            .map(|(distance, internal_id)| {
                let reported = if flip_sign { -distance } else { distance };
                (reported, self.id_permutation[internal_id])
            })
            .collect()
    }

    /// Finds the `k` nearest neighbors of a query vector inside the HNSW graph.
    ///
    /// # Description
    ///
    /// The search starts from the entry vector. Every level above the ground level is visited
    /// from the last one down to index `1`, and at each of them
    /// `Level::greedy_update_nearest` refines the current nearest vector and its distance.
    /// The resulting vector seeds a candidate search on level `0` with a beam of
    /// `max(ef_search, k)` elements. Of the candidates found, only the `k` closest are kept and
    /// handed to a `TopkHeap`, which produces the final result.
    ///
    /// # Parameters
    ///
    /// - `query_vec`: The query vector for which the nearest neighbors are being searched.
    /// - `k`: The number of nearest neighbors to retrieve.
    /// - `config`: A reference to `ConfigHnsw`, which holds configuration parameters for the search
    ///   (here, the `ef_search` value).
    ///
    /// # Returns
    ///
    /// A `Vec<(f32, usize)>` of `(distance, id)` pairs as returned by `TopkHeap::topk`. The IDs
    /// are internal IDs, i.e. positions in the index's own (permuted) dataset.
    /// NOTE(review): the original documentation states the results are sorted by ascending
    /// distance; that ordering comes from `TopkHeap` and is not visible here — confirm.
    ///
    /// # Panics
    ///
    /// Panics if the index has no levels.
    pub fn find_k_nearest_neighbors(
        &'a self,
        query_vec: <Q::Evaluator<'a> as QueryEvaluator<'a>>::QueryType,
        k: usize,
        config: &ConfigHnsw,
    ) -> Vec<(f32, usize)>
    where
        <Q as Quantizer>::InputItem: EuclideanDistance<<Q as Quantizer>::InputItem>
            + DotProduct<<Q as Quantizer>::InputItem>,
    {
        let evaluator = self.dataset.query_evaluator(query_vec);

        // Seed the descent with the entry point of the graph.
        let mut closest_id = self.entry_vec;
        let mut closest_distance = evaluator.compute_distance(closest_id);

        // Walk the upper levels top-down; level 0 is handled separately below.
        for upper_level in self.levels.iter().skip(1).rev() {
            upper_level.greedy_update_nearest(&evaluator, &mut closest_id, &mut closest_distance);
        }

        // The beam must be at least as wide as the number of requested results.
        let beam_width = config.get_ef_search().max(k);

        let ground_level = &self.levels[0];
        let mut found = self.search_from_candidates_unbounded(
            Node(closest_distance, closest_id),
            &evaluator,
            beam_width,
            ground_level,
        );

        // `found` is a max-heap: popping discards the farthest candidates first.
        let surplus = found.len().saturating_sub(k);
        for _ in 0..surplus {
            found.pop();
        }

        let mut topk_heap = TopkHeap::new(k);
        while let Some(candidate) = found.pop() {
            topk_heap.push_with_id(candidate.distance(), candidate.id_vec());
        }

        topk_heap.topk()
    }

    /// Runs the candidate search on one level of the graph, starting from a single node.
    ///
    /// # Parameters
    ///
    /// - `starting_node`: The initial candidate node from which the search starts.
    /// - `query_evaluator`: Evaluates the distance between the query vector and nodes in the graph.
    /// - `ef`: The beam size forwarded to `add_neighbor_to_heaps` when a neighbor is offered to
    ///   the heaps.
    /// - `level`: The graph level whose neighbor lists are explored.
    ///
    /// # Description
    ///
    /// Two heaps are maintained:
    /// - **`results`**: a max-heap with the best nodes found so far, so the farthest of them is
    ///   always on top.
    /// - **`frontier`**: a min-heap with the nodes still to be expanded, so the closest of them is
    ///   always on top.
    ///
    /// A set of visited node IDs, local to this call, prevents evaluating a node twice.
    ///
    /// The function proceeds as follows:
    /// 1. Both heaps are seeded with the starting node, which is marked as visited.
    /// 2. The closest node of the frontier is inspected.
    /// 3. If it is farther from the query than the farthest node in `results`, the search stops.
    /// 4. Otherwise it is removed from the frontier and its not-yet-visited neighbors are evaluated
    ///    and offered to both heaps through `add_neighbor_to_heaps`.
    ///
    /// The search ends when the frontier is empty or the stop condition of step 3 is met.
    ///
    /// # Returns
    ///
    /// The `results` max-heap.
    fn search_from_candidates_unbounded(
        &self,
        starting_node: Node,
        query_evaluator: &impl QueryEvaluator<'a>,
        ef: usize,
        level: &'a Level,
    ) -> BinaryHeap<Node> {
        let mut visited: HashSet<usize> = HashSet::new();
        visited.insert(starting_node.id_vec());

        // Max-heap of the best nodes found so far.
        let mut results: BinaryHeap<Node> = BinaryHeap::new();
        results.push(starting_node);

        // Min-heap of the nodes still to be expanded.
        let mut frontier: BinaryHeap<Reverse<Node>> = BinaryHeap::new();
        frontier.push(Reverse(starting_node));

        while let Some(&Reverse(closest)) = frontier.peek() {
            // Nothing left in the frontier can improve on the current results.
            let worst_kept_distance = results.peek().unwrap().distance();
            if closest.distance() > worst_kept_distance {
                break;
            }
            frontier.pop();

            self.process_neighbors(
                level.get_neighbors_from_id(closest.id_vec()),
                &mut visited,
                query_evaluator,
                |dis_neigh, neighbor| {
                    add_neighbor_to_heaps(
                        &mut frontier,
                        &mut results,
                        Node(dis_neigh, neighbor),
                        ef,
                    );
                },
            );
        }

        results
    }

    /// Computes the distance from the query to every not-yet-visited neighbor in `neighbors` and
    /// reports each `(distance, id)` pair to a callback.
    ///
    /// # Parameters
    ///
    /// - `neighbors`: A slice of node IDs representing the neighboring nodes to be processed.
    /// - `visited_table`: The set of node IDs seen so far. Every ID in `neighbors` is added to it;
    ///   IDs that were already present are skipped.
    /// - `query_evaluator`: Computes distances between the query vector and nodes in the graph.
    /// - `add_distances_fn`: A callback invoked once per unvisited neighbor with its distance and
    ///   node ID. What it does with them (e.g. updating heaps) is up to the caller.
    ///
    /// # Description
    ///
    /// 1. **Visit tracking**: each neighbor is inserted into `visited_table`; the result of the
    ///    insertion tells whether it is new.
    /// 2. **Batching**: new neighbors are accumulated in a 4-slot buffer.
    /// 3. **Distance computation**: whenever the buffer is full, the four distances are computed
    ///    with a single call to `compute_four_distances` and reported through `add_distances_fn`.
    /// 4. **Final handling**: the neighbors left in a partially filled buffer are evaluated one by
    ///    one with `compute_distance` and reported as well.
    ///
    /// Callback invocations follow the order in which the unvisited neighbors appear in
    /// `neighbors`.
    fn process_neighbors<F>(
        &self,
        neighbors: &'a [usize],
        visited_table: &mut HashSet<usize>,
        query_evaluator: &impl QueryEvaluator<'a>,
        mut add_distances_fn: F,
    ) where
        F: FnMut(f32, usize),
    {
        // Size of the batch handled by `compute_four_distances`.
        const BATCH_SIZE: usize = 4;

        // Stack-allocated scratch buffer with the IDs whose distances will be computed next.
        let mut batch = [0usize; BATCH_SIZE];
        let mut filled = 0;

        for &neighbor in neighbors {
            // `insert` returns `true` only when the ID was not in the set yet, so one hash lookup
            // both marks the neighbor as visited and tells whether it is new.
            let newly_visited = visited_table.insert(neighbor);

            // Always write the slot, but only advance past it for new neighbors: an already
            // visited ID is simply overwritten by the next one.
            batch[filled] = neighbor;
            filled += usize::from(newly_visited);

            if filled == BATCH_SIZE {
                let distances = query_evaluator.compute_four_distances(batch.iter().copied());
                for (dis_neigh, &id) in distances.zip(batch.iter()) {
                    add_distances_fn(dis_neigh, id);
                }
                filled = 0;
            }
        }

        // Handle the neighbors that did not fill a complete batch, if any.
        for &id in &batch[..filled] {
            let distance_neighbor: f32 = query_evaluator.compute_distance(id);
            add_distances_fn(distance_neighbor, id);
        }
    }

    /// Prints a breakdown of the space used by the index to standard output and returns the
    /// total number of bytes.
    ///
    /// The breakdown has two parts: the "Forward Index" (the dataset) and the "Links structure"
    /// (the graph levels, the ID permutation, and two `usize` values — presumably accounting for
    /// the `num_neighbors_per_vec` and `entry_vec` fields).
    pub fn print_space_usage_byte(&self) -> usize {
        println!("Space Usage:");

        let dataset_bytes: usize = self.dataset.get_space_usage_bytes();
        println!("\tForward Index: {:} Bytes", dataset_bytes);

        let mut graph_bytes: usize = 0;
        for level in self.levels.iter() {
            graph_bytes += level.get_space_usage_bytes();
        }

        let id_size = std::mem::size_of::<usize>();
        let permutation_bytes: usize = id_size * self.id_permutation.len();
        let scalar_bytes: usize = id_size * 2;

        let links_bytes = graph_bytes + permutation_bytes + scalar_bytes;
        println!("\tLinks structure: {:} Bytes", links_bytes);

        let total_bytes = dataset_bytes + links_bytes;
        println!("\tTotal: {:} Bytes", total_bytes);

        total_bytes
    }
}