Skip to main content

ph_temp/phast/seed_chooser/
mod.rs

1mod k;
2pub use k::SeedOnlyK;
3
4mod shift;
5pub use shift::{ShiftOnly};
6
7mod shift_wrap;
8pub use shift_wrap::{ShiftOnlyWrapped, ShiftSeedWrapped};
9
10use crate::phast::{cyclic::{GenericUsedValue, UsedValueSet}, Params, Weights};
11
12use super::conf::Conf;
13
14/// Returns slice length for regular PHast.
15pub(crate) fn slice_len(output_without_shift_range: usize, bits_per_seed: u8, preferred_slice_len: u16) -> u16 {
16    match output_without_shift_range {
17        n @ 0..64 => (n/2+1).next_power_of_two() as u16,
18        64..1300 => 64,
19        1300..9500 => 128,
20        9500..12000 => 256,
21        12000..140000 => 512,
22        _ if bits_per_seed < 6 => if preferred_slice_len == 0 { 512 } else { preferred_slice_len },
23        _ if bits_per_seed < 12 => if preferred_slice_len == 0 { 1024 } else { preferred_slice_len },   // for 11 2048 gives ~0.002 bit/key smaller size at cost of ~5% longer construction
24        _ => if preferred_slice_len == 0 { 2048 } else { preferred_slice_len }
25    }
26}
27
28/// Choose best seed in bucket. It affects the trade-off between size and evaluation and construction time.
29pub trait SeedChooser: Copy {
30    /// Specifies whether bumping is allowed.
31    const BUMPING: bool = true;
32
33    /// The lowest seed that does not indicate bumping.
34    const FIRST_SEED: u16 = if Self::BUMPING { 1 } else { 0 };
35
36    /// Size of last level of Function2. Important when `extra_shift()>0` (i.e. for `ShiftOnly`).
37    const FUNCTION2_THRESHOLD: usize = 4096;
38
39    type UsedValues: GenericUsedValue;
40
41    /// Returns maximum number of keys mapped to each output value; `k` of `k`-perfect function.
42    #[inline(always)] fn k(self) -> u8 { 1 }
43
44    /// Returns output range of minimal (perfect or k-perfect) function for given number of keys.
45    #[inline(always)] fn minimal_output_range(self, num_of_keys: usize) -> usize { num_of_keys }
46
47    #[inline] fn bucket_evaluator(&self, bits_per_seed: u8, slice_len: u16) -> Weights {
48        Weights::new(bits_per_seed, slice_len)
49    }
50
51    fn conf(self, output_range: usize, input_size: usize, bits_per_seed: u8, bucket_size_100: u16, preferred_slice_len: u16) -> Conf {
52        let max_shift = self.extra_shift(bits_per_seed);
53        let slice_len = slice_len(output_range.saturating_sub(max_shift as usize), bits_per_seed.into(), preferred_slice_len);
54        Conf::new(output_range, input_size, bucket_size_100, slice_len, max_shift)
55    }
56
57    #[inline(always)] fn conf_for_minimal(self, num_of_keys: usize, bits_per_seed: u8, bucket_size_100: u16, preferred_slice_len: u16) -> Conf {
58        self.conf(self.minimal_output_range(num_of_keys), num_of_keys, bits_per_seed, bucket_size_100, preferred_slice_len)
59    }
60
61    #[inline(always)] fn conf_for_minimal_p<SS: Copy+Into<u8>>(self, num_of_keys: usize, params: &Params<SS>) -> Conf {
62        self.conf_for_minimal(num_of_keys, params.seed_size.into(), params.bucket_size100, params.preferred_slice_len)
63    }
64
65    /// How much the chooser can add to value over slice length.
66    #[inline(always)] fn extra_shift(self, _bits_per_seed: u8) -> u16 { 0 }
67
68    /// Returns function value for given primary code and seed.
69    fn f(self, primary_code: u64, seed: u16, conf: &Conf) -> usize;
70    
71    /// Returns best seed to store in seeds array or `u16::MAX` if `NO_BUMPING` is `true` and there is no feasible seed.
72    fn best_seed(self, used_values: &mut Self::UsedValues, keys: &[u64], conf: &Conf, bits_per_seed: u8) -> u16;
73}
74
75#[inline(always)]
76fn best_seed_big<SC: SeedChooser>(seed_chooser: SC, best_value: &mut usize, best_seed: &mut u16, used_values: &mut UsedValueSet, keys: &[u64], conf: &Conf, seeds_num: u16) {
77    let mut values_used_by_seed = Vec::with_capacity(keys.len());
78    let simd_keys = keys.len() / 4 * 4;
79    //assert!(simd_keys <= keys.len());
80    'outer: for seed in SC::FIRST_SEED..seeds_num {    // seed=0 is special = no seed,
81        values_used_by_seed.clear();
82        for i in (0..simd_keys).step_by(4) {
83            let values = [
84                seed_chooser.f(keys[i], seed, conf),
85                seed_chooser.f(keys[i+1], seed, conf),
86                seed_chooser.f(keys[i+2], seed, conf),
87                seed_chooser.f(keys[i+3], seed, conf),
88            ];
89            let contains = [
90                used_values.contain(values[0]),
91                used_values.contain(values[1]),
92                used_values.contain(values[2]),
93                used_values.contain(values[3]),
94            ];
95            if contains.iter().any(|b| *b) { continue 'outer; }
96            //if contains[0] || contains[1] || contains[2] || contains[3] { continue 'outer; }
97            values_used_by_seed.push(values[0]);
98            values_used_by_seed.push(values[1]);
99            values_used_by_seed.push(values[2]);
100            values_used_by_seed.push(values[3]);
101        }
102        //assert!(keys.len() - simd_keys < 4);
103        for i in simd_keys..keys.len() {
104            let value = seed_chooser.f(keys[i], seed, conf);
105            if used_values.contain(value) { continue 'outer; }
106            values_used_by_seed.push(value);
107        }
108        let seed_value = values_used_by_seed.iter().sum();
109        if seed_value < *best_value {
110            values_used_by_seed.sort();
111            if values_used_by_seed.windows(2).any(|v| v[0]==v[1]) {
112                //SELF_COLLISION_KEYS.fetch_add(keys.len() as u64, std::sync::atomic::Ordering::Relaxed);
113                //SELF_COLLISION_BUCKETS.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
114                //if SC::BUMPING { return; }
115                continue;
116            }
117            *best_value = seed_value;
118            *best_seed = seed;
119        }
120    }
121}
122
123#[inline(always)]
124fn best_seed_small<SC: SeedChooser>(seed_chooser: SC, best_value: &mut usize, best_seed: &mut u16, used_values: &mut UsedValueSet, keys: &[u64], conf: &Conf, seeds_num: u16) {
125    assert!(keys.len() <= SMALL_BUCKET_LIMIT);  // seems to speeds up a bit
126    let mut values_used_by_seed = arrayvec::ArrayVec::<_, SMALL_BUCKET_LIMIT>::new(); // Vec::with_capacity(keys.len());
127    'outer: for seed in SC::FIRST_SEED..seeds_num {    // seed=0 is special = no seed,
128        values_used_by_seed.clear();
129        for key in keys.iter().copied() {
130            let value = seed_chooser.f(key, seed, conf);
131            if used_values.contain(value) { continue 'outer; }
132            values_used_by_seed.push(value);
133        }
134        let seed_value = values_used_by_seed.iter().sum();
135        if seed_value < *best_value {
136            values_used_by_seed.sort_unstable();
137            for i in 1..values_used_by_seed.len() {
138                if values_used_by_seed[i-1] == values_used_by_seed[i] {
139                    //SELF_COLLISION_KEYS.fetch_add(keys.len() as u64, std::sync::atomic::Ordering::Relaxed);
140                    //SELF_COLLISION_BUCKETS.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
141                    //if SC::BUMPING { return; }
142                    continue 'outer;
143                }
144            }
145            *best_value = seed_value;
146            *best_seed = seed;
147        }
148    }
149}
150
151const SMALL_BUCKET_LIMIT: usize = 8;
152
153/// [`SeedChooser`] to build (1-)perfect functions.
154/// 
155/// Can be used with any function type: [`Function`], [`Function2`], [`Perfect`].
156/// 
157/// It chooses best seed with quite strong hasher, without shift component,
158/// which should lead to small size, but long construction time.
159#[derive(Clone, Copy)]
160pub struct SeedOnly;
161
162impl SeedChooser for SeedOnly {
163    type UsedValues = UsedValueSet;
164    
165    #[inline(always)] fn f(self, primary_code: u64, seed: u16, conf: &Conf) -> usize {
166        conf.f(primary_code, seed)
167    }
168
169    /*#[inline(always)] fn f_slice(primary_code: u64, slice_begin: usize, seed: u16, conf: &Conf) -> usize {
170        slice_begin + conf.in_slice(primary_code, seed)
171    }*/
172
173    #[inline(always)]
174    fn best_seed(self, used_values: &mut Self::UsedValues, keys: &[u64], conf: &Conf, bits_per_seed: u8) -> u16 {
175        let mut best_seed = 0;
176        let mut best_value = usize::MAX;
177        if keys.len() <= SMALL_BUCKET_LIMIT {
178            best_seed_small(self, &mut best_value, &mut best_seed, used_values, keys, conf, 1<<bits_per_seed)
179        } else {
180            best_seed_big(self, &mut best_value, &mut best_seed, used_values, keys, conf, 1<<bits_per_seed)
181        };
182        if best_seed != 0 { // can assign seed to the bucket
183            for key in keys {
184                used_values.add(conf.f(*key, best_seed));
185            }
186        };
187        best_seed
188    }
189}
190
191/// Choose best seed without shift component.
192#[derive(Clone, Copy)]
193pub struct SeedOnlyNoBump;
194
195impl SeedChooser for SeedOnlyNoBump {
196    const BUMPING: bool = false;
197    const FIRST_SEED: u16 = 0;
198
199    type UsedValues = UsedValueSet;
200
201    #[inline(always)] fn f(self, primary_code: u64, seed: u16, conf: &Conf) -> usize {
202        conf.f_nobump(primary_code, seed)
203    }
204
205    #[inline]
206    fn best_seed(self, used_values: &mut Self::UsedValues, keys: &[u64], conf: &Conf, bits_per_seed: u8) -> u16 {
207        //let _: [(); Self::FIRST_SEED as usize] = [];
208        let mut best_seed = u16::MAX;
209        let mut best_value = usize::MAX;
210        if keys.len() <= SMALL_BUCKET_LIMIT {
211            best_seed_small(self, &mut best_value, &mut best_seed, used_values, keys, conf, 1<<bits_per_seed)
212        } else {
213            best_seed_big(self, &mut best_value, &mut best_seed, used_values, keys, conf, 1<<bits_per_seed)
214        };
215        if best_seed != u16::MAX { // can assign seed to the bucket
216            for key in keys {
217                used_values.add(conf.f_nobump(*key, best_seed));
218            }
219        };
220        best_seed
221    }
222}
223
224