fid_rs/fid.rs
1mod block;
2mod blocks;
3mod chunk;
4mod chunks;
5mod fid_impl;
6mod fid_iter;
7
8use super::internal_data_structure::popcount_table::PopcountTable;
9
10#[cfg(feature = "serde")]
11use serde::{Deserialize, Serialize};
12
13#[cfg(feature = "mem_dbg")]
14use mem_dbg::{MemDbg, MemSize};
15
16/// FID (Fully Indexable Dictionary).
17///
18/// This class can handle bit sequence of virtually **arbitrary length.**
19///
20/// In fact, _N_ (FID's length) is designed to be limited to: _N <= 2^64_.<br>
21/// It should be enough for almost all usecases since a binary data of length of _2^64_ consumes _2^21 = 2,097,152_ TB (terabyte), which is hard to handle by state-of-the-art computer architecture.
22///
23/// # Implementation detail
24/// [Index<u64>](#impl-Index<u64>)'s implementation is trivial.
25///
26/// [select()](#method.select) just uses binary search of `rank()` results.
27///
28/// [rank()](#method.rank)'s implementation is standard but non-trivial.
29/// So here explains implementation of _rank()_.
30///
31/// ## [rank()](#method.rank)'s implementation
32/// Say you have the following bit vector.
33///
34/// ```text
35/// 00001000 01000001 00000100 11000000 00100000 00000101 10100000 00010000 001 ; (N=67)
36/// ```
37///
38/// Answer _rank(48)_ in _O(1)_ time-complexity and _o(N)_ space-complexity.
39///
40/// Naively, you can count the number of '1' from left to right.
41/// You will find _rank(48) == 10_ but it took _O(N)_ time-complexity.
42///
43/// To reduce time-complexity to _O(1)_, you can use _memonization_ technique.<br>
44/// Of course, you can memonize results of _rank(i)_ for every _i ([0, N-1])_.
45///
46/// ```text
47/// Bit vector; 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 [1] 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 ; (N=67)
48/// Memo rank(i); 0 0 0 0 1 1 1 1 1 2 2 2 2 2 2 3 3 3 3 3 3 4 4 4 5 6 6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7 7 7 7 8 8 9 10 10 11 11 11 11 11 11 11 11 11 12 12 12 12 12 12 12 13
49/// ```
50///
51/// From this memo, you can answer _rank(48) == 10_ in constant time, although space-complexity for this memo is _O(N) > o(N)_.
52///
53/// To reduce space-complexity using memonization, we divide the bit vector into **Chunk** and **Block**.
54///
55/// ```text
56/// Bit vector; 00001000 01000001 00000100 11000000 00100000 00000101 [1]0100000 00010000 001 ; (N=67)
57/// Chunk; | 7 | 13 | ; (size = (log N)^2 = 36)
58/// Block; |0 |1 |1 |2 |2 |3 |3 |4 |6 |6 |6 |7 |0 |0 |0 |2 |4 |4 |4 |5 |5 |5 |6| ; (size = (log N) / 2 = 3)
59/// ```
60///
61/// - A **Chunk** has size of _(log N)^2_. Its value is _rank(<u>index of the last bit of the chunk</u>)_.
62/// - A **Block** has size of _(log N) / 2_. A chunk has many blocks. Block's value is the number of '1's in _[<u>index of the first bit of the chunk the block belongs to</u>, <u>index of the last bit of the block</u>]_ (note that the value is reset to 0 at the first bit of a chunk).
63///
64/// Now you want to answer _rank(48)_. 48-th bit is in the 2nd chunk, and in the 5th block in the chunk.<br>
65/// So the _rank(48)_ is at least:
66///
67/// _<u>7 (value of 1st chunk)</u> + <u>2 (value of 4th block in the 2nd chunk)</u>_
68///
69/// Then, focus on 3 bits in 5th block in the 2nd chunk; `[1]01`.<br>
70/// As you can see, only 1 '1' is included up to 48-th bit (`101` has 2 '1's but 2nd '1' is 50-th bit, irrelevant to _rank(48)_).
71///
72/// Therefore, the _rank(48)_ is calculated as:
73///
74/// _<u>7 (value of 1st chunk)</u> + <u>2 (value of 4th block in the 2nd chunk)</u> + <u>1 ('1's in 5th block up to 48-th bit)</u>_
75///
76/// OK. That's all... Wait!<br>
77/// _rank()_ must be in _O(1)_ time-complexity.
78///
79/// - _<u>7 (value of 1st chunk)</u>_: _O(1)_ if you store chunk value in array structure.
80/// - _<u>2 (value of 4th block in the 2nd chunk)</u>_: Same as above.
81/// - _<u>1 ('1's in 5th block up to 48-th bit)</u>_: **_O(<u>length of block</u>) = O(log N)_** !
82///
83/// Counting '1's in a block must also be _O(1)_, while using _o(N)_ space.<br>
84/// We use **Table** for this purpose.
85///
86/// | Block content | Number of '1's in block |
87/// |---------------|-------------------------|
88/// | `000` | 0 |
89/// | `001` | 1 |
90/// | `010` | 1 |
91/// | `011` | 2 |
92/// | `100` | 1 |
93/// | `101` | 2 |
94/// | `110` | 2 |
95/// | `111` | 3 |
96///
97/// This table is constructed in `build()`. So we can find the number of '1's in block in _O(1)_ time.<br>
98/// Note that this table has _O(log N) = o(N)_ length.
99///
100/// In summary:
101///
102/// _rank() = (value of left chunk) + (value of left block) + (value of table keyed by inner block bits)_.
103#[derive(Clone, Debug)]
104#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
105#[cfg_attr(feature = "mem_dbg", derive(MemDbg, MemSize))]
106pub struct Fid {
107 /// Raw data.
108 byte_vec: Vec<u8>,
109
110 /// Bit length
111 bit_len: u64,
112
113 /// Total popcount of _[0, <u>last bit of the chunk</u>]_.
114 ///
115 /// Each chunk takes _2^64_ at max (when every bit is '1' for bit vector of length of _2^64_).
116 /// A chunk has blocks.
117 chunks: Chunks,
118
119 /// Table to calculate inner-block `rank()` in _O(1)_.
120 table: PopcountTable,
121}
122
123pub struct FidIter<'iter> {
124 fid: &'iter Fid,
125 i: u64,
126}
127
128/// Collection of Chunk.
129#[derive(Clone, Debug)]
130#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
131#[cfg_attr(feature = "mem_dbg", derive(MemDbg, MemSize))]
132struct Chunks {
133 chunks: Vec<Chunk>,
134 chunks_cnt: u64,
135}
136
137/// Total popcount of _[0, <u>last bit of the chunk</u>]_ of a bit vector.
138///
139/// Each chunk takes _2^64_ at max (when every bit is '1' for Fid of length of _2^64_).
140#[derive(Clone, Debug)]
141#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
142#[cfg_attr(feature = "mem_dbg", derive(MemDbg, MemSize))]
143struct Chunk {
144 value: u64, // popcount
145 blocks: Blocks,
146}
147
148/// Collection of Block in a Chunk.
149#[derive(Clone, Debug)]
150#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
151#[cfg_attr(feature = "mem_dbg", derive(MemDbg, MemSize))]
152struct Blocks {
153 blocks: Vec<Block>,
154 blocks_cnt: u16,
155}
156
157/// Total popcount of _[_first bit of the chunk which the block belongs to_, _last bit of the block_]_ of a bit vector.
158///
159/// Each block takes (log 2^64)^2 = 64^2 = 2^16 at max (when every bit in a chunk is 1 for Fid of length of 2^64)
160#[derive(Clone, Debug)]
161#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
162#[cfg_attr(feature = "mem_dbg", derive(MemDbg, MemSize))]
163struct Block {
164 value: u16, // popcount
165 length: u8,
166}