// lsh_rs/table/mem.rs — in-memory hash table backend for LSH.

1use crate::data::Integer;
2use crate::{
3    constants::DESCRIBE_MAX,
4    data::Numeric,
5    prelude::*,
6    table::general::{Bucket, HashTables},
7    utils::{all_eq, increase_capacity},
8};
9use fnv::{FnvHashMap as HashMap, FnvHashSet};
10use serde::{Deserialize, Serialize};
11use std::iter::FromIterator;
12
/// Indexable vector storage.
/// Indexes are stored in the hash tables; the original vectors can be looked
/// up in this data structure by index.
#[derive(Debug, Deserialize, Serialize)]
pub struct VecStore<N> {
    // Backing storage; a vector's id (u32) is its position in this Vec.
    pub map: Vec<Vec<N>>,
}
19
20impl<N: Numeric> VecStore<N> {
21    fn push(&mut self, d: Vec<N>) -> u32 {
22        self.map.push(d);
23        (self.map.len() - 1) as u32
24    }
25
26    fn position(&self, d: &[N]) -> Option<u32> {
27        self.map.iter().position(|x| all_eq(x, d)).map(|x| x as u32)
28    }
29
30    fn get(&self, idx: u32) -> &Vec<N> {
31        &self.map[idx as usize]
32    }
33
34    fn increase_storage(&mut self, size: usize) {
35        increase_capacity(size, &mut self.map);
36    }
37}
38
/// In memory backend for [LSH](struct.LSH.html).
#[derive(Deserialize, Serialize)]
pub struct MemoryTable<N, K>
where
    N: Numeric,
    K: Integer,
{
    // One HashMap per hash table: hash -> bucket of vector ids.
    hash_tables: Vec<HashMap<Vec<K>, Bucket>>,
    // Number of hash tables (L); mirrors `hash_tables.len()`.
    n_hash_tables: usize,
    // Original vectors, addressable by the u32 ids stored in the buckets.
    pub vec_store: VecStore<N>,
    // When true only ids are kept; the vectors themselves are not stored.
    only_index_storage: bool,
    // Monotonic id assigned to the next inserted vector.
    counter: u32,
}
52
53impl<N, K> MemoryTable<N, K>
54where
55    N: Numeric,
56    K: Integer,
57{
58    fn remove_idx(&mut self, idx: u32, hash: &[K], hash_table: usize) -> Result<()> {
59        let tbl = &mut self.hash_tables[hash_table];
60        let bucket = tbl.get_mut(hash);
61        match bucket {
62            None => return Err(Error::NotFound),
63            Some(bucket) => {
64                bucket.remove(&idx);
65                Ok(())
66            }
67        }
68    }
69    fn insert_idx(&mut self, idx: u32, hash: Vec<K>, hash_table: usize) {
70        debug_assert!(hash_table < self.n_hash_tables);
71        let tbl = unsafe { self.hash_tables.get_unchecked_mut(hash_table) };
72        let bucket = tbl.entry(hash).or_insert_with(|| FnvHashSet::default());
73        bucket.insert(idx);
74    }
75}
76
77impl<N, K> HashTables<N, K> for MemoryTable<N, K>
78where
79    N: Numeric,
80    K: Integer,
81{
82    fn new(n_hash_tables: usize, only_index_storage: bool, _: &str) -> Result<Box<Self>> {
83        // TODO: Check the average number of vectors in the buckets.
84        // this way the capacity can be approximated by the number of DataPoints that will
85        // be stored.
86        let hash_tables = vec![HashMap::default(); n_hash_tables];
87        let vector_store = VecStore { map: vec![] };
88        let m = MemoryTable {
89            hash_tables,
90            n_hash_tables,
91            vec_store: vector_store,
92            only_index_storage,
93            counter: 0,
94        };
95        Ok(Box::new(m))
96    }
97
98    fn put(&mut self, hash: Vec<K>, d: &[N], hash_table: usize) -> Result<u32> {
99        // Store hash and id/idx
100        let idx = self.counter;
101        self.insert_idx(idx, hash, hash_table);
102
103        // There are N hash_tables per unique vector. So we only store
104        // the unique v hash_table 0 and increment the counter (the id)
105        // after we've update the last (N) hash_table.
106        if (hash_table == 0) && (!self.only_index_storage) {
107            self.vec_store.push(d.to_vec());
108        } else if hash_table == self.n_hash_tables - 1 {
109            self.counter += 1
110        }
111        Ok(idx)
112    }
113
114    /// Expensive operation we need to do a linear search over all datapoints
115    fn delete(&mut self, hash: &[K], d: &[N], hash_table: usize) -> Result<()> {
116        // First find the data point in the VecStore
117        let idx = match self.vec_store.position(d) {
118            None => return Ok(()),
119            Some(idx) => idx,
120        };
121        // Note: data point remains in VecStore as shrinking the vector would mean we need to
122        // re-hash all datapoints.
123        self.remove_idx(idx, &hash, hash_table)
124    }
125
126    fn update_by_idx(
127        &mut self,
128        old_hash: &[K],
129        new_hash: Vec<K>,
130        idx: u32,
131        hash_table: usize,
132    ) -> Result<()> {
133        self.remove_idx(idx, old_hash, hash_table)?;
134        self.insert_idx(idx, new_hash, hash_table);
135        Ok(())
136    }
137
138    /// Query the whole bucket
139    fn query_bucket(&self, hash: &[K], hash_table: usize) -> Result<Bucket> {
140        let tbl = &self.hash_tables[hash_table];
141        match tbl.get(hash) {
142            None => Err(Error::NotFound),
143            Some(bucket) => Ok(bucket.clone()),
144        }
145    }
146
147    fn idx_to_datapoint(&self, idx: u32) -> Result<&Vec<N>> {
148        Ok(self.vec_store.get(idx))
149    }
150
151    fn increase_storage(&mut self, size: usize) {
152        increase_capacity(size, &mut self.hash_tables);
153        self.vec_store.increase_storage(size);
154    }
155
156    fn describe(&self) -> Result<String> {
157        let mut lengths = vec![];
158        let mut max_len = 0;
159        let mut min_len = 1000000;
160        let mut set: FnvHashSet<i32> = FnvHashSet::default();
161        // iterator over hash tables 0..L
162        for map in self.hash_tables.iter() {
163            // iterator over all hashes
164            // zip to truncate at the describe maximum
165            for ((k, v), _) in map.iter().zip(0..DESCRIBE_MAX) {
166                let len = v.len();
167                let hash_values: FnvHashSet<i32> =
168                    FnvHashSet::from_iter(k.iter().map(|&k| k.to_i32().unwrap()));
169                set = set.union(&hash_values).copied().collect();
170                lengths.push(len);
171                if len > max_len {
172                    max_len = len
173                }
174                if len < min_len {
175                    min_len = len
176                }
177            }
178        }
179
180        let avg = lengths.iter().sum::<usize>() as f32 / lengths.len() as f32;
181        let var = lengths
182            .iter()
183            .map(|&v| (avg - v as f32).powf(2.))
184            .sum::<f32>()
185            / lengths.len() as f32;
186        let std_dev = var.powf(0.5);
187
188        let mut out = String::from(&format!("No. of tables: {}\n", self.n_hash_tables));
189        out.push_str(&format!("Unique hash values:\n{:?}\n", set));
190        out.push_str("\nHash collisions:\n");
191        out.push_str(&format!("avg:\t{:?}\n", avg));
192        out.push_str(&format!("std-dev:\t{:?}\n", std_dev));
193        out.push_str(&format!("min:\t{:?}\n", min_len));
194        out.push_str(&format!("max:\t{:?}\n", max_len));
195
196        Ok(out)
197    }
198
199    fn get_unique_hash_int(&self) -> FnvHashSet<i32> {
200        let mut hash_numbers = FnvHashSet::default();
201
202        for ht in &self.hash_tables {
203            for ((hash, _), _i) in ht.iter().zip(0..100) {
204                for &v in hash {
205                    hash_numbers.insert(v.to_i32().unwrap());
206                }
207            }
208        }
209        hash_numbers
210    }
211}
212
213impl<N, K> std::fmt::Debug for MemoryTable<N, K>
214where
215    N: Numeric,
216    K: Integer,
217{
218    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
219        write!(f, "hash_tables:\nhash, \t buckets\n")?;
220        for ht in self.hash_tables.iter() {
221            write!(f, "{:?}\n", ht)?;
222        }
223        Ok(())
224    }
225}