// ph_temp/fmph/goindexing.rs

1//! Utils for indexing with group optimization.
2
3use std::convert::{TryFrom, TryInto};
4use std::ops::Mul;
5use binout::{AsIs, Serializer};
6use bitm::{BitAccess, ceiling_div};
7use crate::seeds::{to_io_error, Bits, TwoToPowerBitsStatic};
8use crate::utils::{map32_to_32, map64_to_64};
9
/// Returns the group number, at a level consisting of `level_size_groups` groups,
/// for a key with the given 64-bit `hash`.
///
/// Only the highest bits of `hash` influence the result (see note below),
/// so the lowest bits can safely be reused by `in_group_index`.
#[inline(always)]
pub fn group_nr(hash: u64, level_size_groups: usize) -> usize {
    // Earlier alternatives considered:
    //map64_to_32(hash, level_size_groups)
    //map32_to_32((hash >> 32) as u32, level_size_groups as u32) as u64
    //map48_to_64(hash >> 16, level_size_groups)
    map64_to_64(hash, level_size_groups as u64) as usize    // note that the lowest x bits of hash are ignored by map64_to_64 if level_size_groups < 2^(64-x) and it is safe to use the lowest bits by in_group_index
}
19
20/*#[inline]
21fn mix64(mut x: u64) -> u64 {
22    x = (x ^ (x >> 30)).wrapping_mul(0xbf58476d1ce4e5b9u64);
23    x = (x ^ (x >> 27)).wrapping_mul(0x94d049bb133111ebu64);
24    x ^ (x >> 31)
25}*/
26
/// Mixes the bits of `x` so that every input bit influences many output bits:
/// two xorshift-multiply rounds followed by a final xorshift.
#[inline(always)]
fn mix32(x: u32) -> u32 {
    let a = x ^ (x >> 16);
    let b = a.wrapping_mul(0x21f0aaad);
    let c = b ^ (b >> 15);
    let d = c.wrapping_mul(0xd35a2d97);
    d ^ (d >> 15)
}
33
34/*#[inline(always)]
35fn mix16(mut x: u16) -> u16 {
36    x = (x ^ (x >> 8)).wrapping_mul(0xa3d3);
37    x = (x ^ (x >> 7)).wrapping_mul(0x4b2d);
38    x ^ (x >> 9)
39}
40
41#[inline(always)]
42fn mix16fast(mut x: u16) -> u16 {
43    x += x << 7; x ^= x >> 8;
44    x += x << 3; x ^= x >> 2;
45    x += x << 4; x ^= x >> 8;
46    x
47}*/
48
49/// Implementations of `GroupSize` represent group size in fingerprinting-based minimal perfect hashing with group optimization.
50pub trait GroupSize: Sized + Mul<usize, Output=usize> + Copy + Into<u8> + TryFrom<u8, Error=&'static str> {
51
52    fn validate(&self) -> Result<Self, &'static str> { Ok(*self) }
53
54    /// Returns `hash` modulo `self`.
55    #[inline(always)]
56    fn hash_to_group(&self, hash: u32) -> u8 {
57        map32_to_32(hash as u32, Into::<u8>::into(*self) as u32) as u8
58        //map16_to_16(hash as u16, Into::<u8>::into(*self) as u16) as u8
59    }
60
61    /// Returns index in the group with given seed `group_seed` using `hasher`
62    /// which must be modified earlier by `group_nr` function.
63    #[inline(always)]
64    fn in_group_index(&self, hash: u64, group_seed: u16) -> u8 {
65        //self.hash_to_group(mix16((hash as u16) ^ group_seed as u16))
66        self.hash_to_group(mix32((hash as u32) ^ (group_seed as u32)))
67
68        //self.hash_to_group(mix32((hash as u32).wrapping_add(group_seed as u32)))
69        //self.hash_to_group(mix32(((hash as u32) & 0xFFFF) | ((group_seed as u32) << 16)))
70        //self.hash_to_group(mix64(hash ^ group_seed as u64))
71    }
72
73    /// Returns bit index inside the group with number `group` and seed `group_seed`,
74    /// assigned to the key hashed by the `hasher`.
75    #[inline]
76    fn bit_index_for_seed(&self, hash: u64, group_seed: u16, group: usize) -> usize {
77        (*self * group as usize) + self.in_group_index(hash, group_seed) as usize
78    }
79
80    /// Returns number of groups and 64-bit segments for given `desired_total_size`.
81    fn level_size_groups_segments(&self, mut desired_total_size: usize) -> (usize, usize) {
82        let remainder = desired_total_size % 64;
83        if remainder != 0 { desired_total_size += 64 - remainder; } // round up to multiple of 64
84        let group_size = Into::<u8>::into(*self) as usize;
85        while desired_total_size % group_size != 0 { desired_total_size += 64; } // and also to multiple of group_size
86        return (desired_total_size / group_size, desired_total_size / 64);
87    }
88
89    /// Returns number of ones in the group.
90    /*#[inline] fn ones_in_group(&self, arr: &[u64], group_index: usize) -> u8 {
91        arr.get_fragment(group_index, (*self).into()).count_ones() as u8
92    }*/
93
94    // If `predicate` is `true`, copy group with given index, from `src` to `dst`.
95    // Predicate is called with the content of the groups of, successively, `dst` and `src`.
96    //#[inline] fn conditionally_copy_group<Pred>(&self, dst: &mut [u64], src: &[u64], group_index: usize, predicate: Pred)
97    //    where Pred: FnOnce(u64, u64) -> bool {
98    //    dst.conditionally_copy_fragment(src, predicate, group_index, (*self).into())
99    //}
100
101    /// If group with given index has more ones in `dst` than in `src`, then copy it from `src` to `dst` and call `callback`.
102    #[inline(always)] fn copy_group_if_better<CB>(&self, dst: &mut [u64], src: &[u64], group_index: usize, callback: CB)
103        where CB: FnOnce()
104    {
105        dst.conditionally_copy_fragment(src, |best, new|
106            if best.count_ones() < new.count_ones() {
107                callback();
108                true
109            } else { false }, group_index, (*self).into())
110    }
111
112    /// Returns number of bytes that `self.write` writes to the output.
113    #[inline] fn write_size_bytes(&self) -> usize {
114        std::mem::size_of::<u8>()
115    }
116
117    /// Writes `self` to `output`.
118    fn write(&self, output: &mut dyn std::io::Write) -> std::io::Result<()> {
119        AsIs::write(output, (*self).into())
120    }
121
122    /// Reads `Self` from `input`.
123    fn read(input: &mut dyn std::io::Read) -> std::io::Result<Self> {
124        let group_size: u8 = AsIs::read(input)?;
125        let result = TryInto::<Self>::try_into(group_size).map_err(to_io_error)?;
126        result.validate().map_err(to_io_error)
127    }
128}
129
/// Group size being a power of two (at most 2^7 = 128 bits).
#[derive(Copy, Clone)]
pub struct TwoToPowerBits {
    log2size: u8,   // base-2 logarithm of the group size, in 0..=7
    mask: u8        // 2^log2size - 1, i.e. the lowest log2size bits set
}
136
impl TwoToPowerBits {
    /// Returns [`TwoToPowerBits`] that represent two to power of `log2size`.
    ///
    /// # Panics
    /// Panics if `log2size` exceeds 7, since group sizes above 128 bits
    /// cannot be represented by the 8-bit `mask`.
    pub fn new(log2size: u8) -> Self {
        assert!(log2size <= 7);
        Self { log2size, mask: (1u8<<log2size)-1 }
    }

    // Dead code kept for reference (a stray non-English fragment was removed):
    /*pub fn get_fragment(&self, arr: &[u64], index: usize) -> u64 {
        let bit_index = index << self.log2size;
        (arr[bit_index / 64] >> (bit_index%64)) & self.mask
    }*/
}
149
impl Mul<usize> for TwoToPowerBits {
    type Output = usize;

    /// Multiplies `rhs` by the group size; since the size is a power of two,
    /// this is just a left shift by `log2size`.
    #[inline(always)] fn mul(self, rhs: usize) -> Self::Output {
        rhs << self.log2size
    }
}
157
158impl Into<u8> for TwoToPowerBits {
159    #[inline(always)] fn into(self) -> u8 {
160        1<<self.log2size
161    }
162}
163
164impl TryFrom<u8> for TwoToPowerBits {
165    type Error = &'static str;
166
167    fn try_from(value: u8) -> Result<Self, Self::Error> {
168        if value.is_power_of_two() && value <= 128 {
169            Ok(Self::new(value.trailing_zeros() as u8))
170        } else {
171            Err("group size must be the power of two, not greater than 128")
172        }
173    }
174}
175
176impl GroupSize for TwoToPowerBits {
177    #[inline(always)] fn hash_to_group(&self, hash: u32) -> u8 {
178        hash as u8 & self.mask
179    }
180
181    fn level_size_groups_segments(&self, desired_total_size: usize) -> (usize, usize) {
182        let level_size_segments;
183        let level_size_groups;
184        if self.log2size > 6 {
185            //level_size_segments = div_up(input_size << segments_per_group_log2, 64) << segments_per_group_log2;
186            //level_size_groups = (level_size_segments >> segments_per_group_log2) as u32;
187            level_size_groups = ceiling_div(desired_total_size, 1usize<<self.log2size);
188            level_size_segments = (level_size_groups as usize) << (self.log2size - 6);
189        } else {
190            level_size_segments = ceiling_div(desired_total_size, 64);
191            level_size_groups = level_size_segments << (6 - self.log2size);
192        }
193        (level_size_groups, level_size_segments)
194    }
195
196    /*fn ones_in_group(&self, arr: &[u64], group_index: usize) -> u8 {
197        if self.log2size >= 6 {
198            let log2_segments_per_group = self.log2size - 6;
199            let segments_per_group = 1 << log2_segments_per_group;
200            let begin_segment = group_index << log2_segments_per_group; // * segments_per_group
201            arr[begin_segment..begin_segment+segments_per_group].iter().map(|v| v.count_ones() as u8).sum()
202        } else {
203            arr.get_fragment(group_index, (*self).into()).count_ones() as u8
204        }
205    }*/
206}
207
208impl GroupSize for Bits {
209    fn validate(&self) -> Result<Self, &'static str> {
210        if self.0 <= 63 { Ok(*self) } else { Err("group sizes grater than 63 are not supported by Bits") }
211    }
212}
213
/// `GroupSize` whose value (2 to the power of `LOG2_BITS`) is known at compile time.
impl<const LOG2_BITS: u8> GroupSize for TwoToPowerBitsStatic<LOG2_BITS> {
    /// Reduces `hash` modulo the group size with a constant bit-mask.
    // NOTE(review): masks with `Self::LOG2_MASK`; presumably that constant equals
    // 2^LOG2_BITS - 1 despite its name — confirm against its definition in `crate::seeds`.
    #[inline(always)] fn hash_to_group(&self, hash: u32) -> u8 {
        hash as u8 & Self::LOG2_MASK
    }

    /// Returns the numbers of groups and 64-bit segments covering (at least)
    /// `desired_total_size` bits, rounded up so whole groups and segments coincide.
    fn level_size_groups_segments(&self, desired_total_size: usize) -> (usize, usize) {
        let level_size_segments;
        let level_size_groups;
        if LOG2_BITS > 6 {
            // A group spans 2^(LOG2_BITS-6) segments: round up to whole groups first.
            level_size_groups = ceiling_div(desired_total_size, 1usize<<LOG2_BITS);
            level_size_segments = (level_size_groups as usize) << (LOG2_BITS - 6);
        } else {
            // A 64-bit segment holds 2^(6-LOG2_BITS) groups: round up to whole segments first.
            level_size_segments = ceiling_div(desired_total_size, 64);
            level_size_groups = level_size_segments << (6 - LOG2_BITS);
        }
        (level_size_groups, level_size_segments)
    }

    /// If the group with given index has fewer ones in `dst` than in `src`,
    /// copies it from `src` to `dst` and calls `callback`
    /// (specialization using compile-time shift/mask constants).
    #[inline(always)] fn copy_group_if_better<CB>(&self, dst: &mut [u64], src: &[u64], group_index: usize, callback: CB)
        where CB: FnOnce()
    {
        // Locate the 64-bit word holding the group and the group's bit offset within it.
        let vec_index = group_index / Self::VALUES_PER_64 as usize;
        let shift = Self::shift_for(group_index);
        let dst_v = &mut dst[vec_index];
        // Extract the current (`best`) and candidate (`new`) group contents.
        let best = (*dst_v >> shift) & Self::MASK64;
        let new = (src[vec_index] >> shift) & Self::MASK64;
        if best.count_ones() < new.count_ones() {
            callback();
            // Clear the old group bits, then splice in the better group from `src`.
            *dst_v &= !(Self::MASK64 << shift);
            *dst_v |= new << shift;
        }
    }
}
248}