1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#ifndef _HLL_HPP_
#define _HLL_HPP_
#include "common_defs.hpp"
#include "HllUtil.hpp"
#include <memory>
#include <iostream>
#include <vector>
namespace datasketches {
/**
* This is a high performance implementation of Phillipe Flajolet’s HLL sketch but with
* significantly improved error behavior. If the ONLY use case for sketching is counting
* uniques and merging, the HLL sketch is a reasonable choice, although the highest performing in terms of accuracy for
* storage space consumed is CPC (Compressed Probabilistic Counting). For large enough counts, this HLL version (with HLL_4) can be 2 to
* 16 times smaller than the Theta sketch family for the same accuracy.
*
* <p>This implementation offers three different types of HLL sketch, each with different
* trade-offs with accuracy, space and performance. These types are specified with the
* {@link TgtHllType} parameter.
*
* <p>In terms of accuracy, all three types, for the same <i>lg_config_k</i>, have the same error
* distribution as a function of <i>n</i>, the number of unique values fed to the sketch.
* The configuration parameter <i>lg_config_k</i> is the log-base-2 of <i>K</i>,
* where <i>K</i> is the number of buckets or slots for the sketch.
*
* <p>During warmup, when the sketch has only received a small number of unique items
* (up to about 10% of <i>K</i>), this implementation leverages a new class of estimator
* algorithms with significantly better accuracy.
*
* <p>This sketch also offers the capability of operating off-heap. Given a WritableMemory object
* created by the user, the sketch will perform all of its updates and internal phase transitions
* in that object, which can actually reside either on-heap or off-heap based on how it is
* configured. In large systems that must update and merge many millions of sketches, having the
* sketch operate off-heap avoids the serialization and deserialization costs of moving sketches
* to and from off-heap memory-mapped files, for example, and eliminates big garbage collection
* delays.
*
* author Jon Malkin
* author Lee Rhodes
* author Kevin Lang
*/
/**
* Specifies the target type of HLL sketch to be created. It is a target in that the actual
* allocation of the HLL array is deferred until sufficient number of items have been received by
* the warm-up phases.
*
* <p>These three target types are isomorphic representations of the same underlying HLL algorithm.
* Thus, given the same value of <i>lg_config_k</i> and the same input, all three HLL target types
* will produce identical estimates and have identical error distributions.</p>
*
* <p>The memory (and also the serialization) of the sketch during this early warmup phase starts
* out very small (8 bytes, when empty) and then grows in increments of 4 bytes as required
* until the full HLL array is allocated. This transition point occurs at about 10% of K for
* sketches where lg_config_k is > 8.</p>
*
* <ul>
* <li><b>HLL_8</b> This uses an 8-bit byte per HLL bucket. It is generally the
* fastest in terms of update time, but has the largest storage footprint of about
* <i>K</i> bytes.</li>
*
* <li><b>HLL_6</b> This uses a 6-bit field per HLL bucket. It is the generally the next fastest
* in terms of update time with a storage footprint of about <i>3/4 * K</i> bytes.</li>
*
* <li><b>HLL_4</b> This uses a 4-bit field per HLL bucket and for large counts may require
* the use of a small internal auxiliary array for storing statistical exceptions, which are rare.
* For the values of <i>lg_config_k > 13</i> (<i>K</i> = 8192),
* this additional array adds about 3% to the overall storage. It is generally the slowest in
* terms of update time, but has the smallest storage footprint of about
* <i>K/2 * 1.03</i> bytes.</li>
* </ul>
*/
enum target_hll_type {
HLL_4, ///< 4 bits per entry (most compact, size may vary)
HLL_6, ///< 6 bits per entry (fixed size)
HLL_8 ///< 8 bits per entry (fastest, fixed size)
};
template<typename A>
class HllSketchImpl;
template<typename A>
class hll_union_alloc;
template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
template<typename A> using vector_u8 = std::vector<uint8_t, AllocU8<A>>;
template<typename A = std::allocator<uint8_t> >
class hll_sketch_alloc final {
public:
/**
* Constructs a new HLL sketch.
* @param lg_config_k Sketch can hold 2^lg_config_k rows
* @param tgt_type The HLL mode to use, if/when the sketch reaches that state
* @param start_full_size Indicates whether to start in HLL mode,
* keeping memory use constant (if HLL_6 or HLL_8) at the cost of
* starting out using much more memory
*/
explicit hll_sketch_alloc(uint8_t lg_config_k, target_hll_type tgt_type = HLL_4, bool start_full_size = false, const A& allocator = A());
/**
* Copy constructor
*/
hll_sketch_alloc(const hll_sketch_alloc<A>& that);
/**
* Copy constructor to a new target type
*/
hll_sketch_alloc(const hll_sketch_alloc<A>& that, target_hll_type tgt_type);
/**
* Move constructor
*/
hll_sketch_alloc(hll_sketch_alloc<A>&& that) noexcept;
/**
* Reconstructs a sketch from a serialized image on a stream.
* @param is An input stream with a binary image of a sketch
*/
static hll_sketch_alloc deserialize(std::istream& is, const A& allocator = A());
/**
* Reconstructs a sketch from a serialized image in a byte array.
* @param is bytes An input array with a binary image of a sketch
* @param len Length of the input array, in bytes
*/
static hll_sketch_alloc deserialize(const void* bytes, size_t len, const A& allocator = A());
//! Class destructor
virtual ~hll_sketch_alloc();
//! Copy assignment operator
hll_sketch_alloc operator=(const hll_sketch_alloc<A>& other);
//! Move assignment operator
hll_sketch_alloc operator=(hll_sketch_alloc<A>&& other);
/**
* Resets the sketch to an empty state in coupon collection mode.
* Does not re-use existing internal objects.
*/
void reset();
typedef vector_u8<A> vector_bytes; // alias for users
/**
* Serializes the sketch to a byte array, compacting data structures
* where feasible to eliminate unused storage in the serialized image.
* @param header_size_bytes Allows for PostgreSQL integration
*/
vector_bytes serialize_compact(unsigned header_size_bytes = 0) const;
/**
* Serializes the sketch to a byte array, retaining all internal
* data structures in their current form.
*/
vector_bytes serialize_updatable() const;
/**
* Serializes the sketch to an ostream, compacting data structures
* where feasible to eliminate unused storage in the serialized image.
* @param os std::ostream to use for output.
*/
void serialize_compact(std::ostream& os) const;
/**
* Serializes the sketch to an ostream, retaining all internal data
* structures in their current form.
* @param os std::ostream to use for output.
*/
void serialize_updatable(std::ostream& os) const;
/**
* Human readable summary with optional detail
* @param summary if true, output the sketch summary
* @param detail if true, output the internal data array
* @param auxDetail if true, output the internal Aux array, if it exists.
* @param all if true, outputs all entries including empty ones
* @return human readable string with optional detail.
*/
string<A> to_string(bool summary = true,
bool detail = false,
bool aux_detail = false,
bool all = false) const;
/**
* Present the given std::string as a potential unique item.
* The string is converted to a byte array using UTF8 encoding.
* If the string is null or empty no update attempt is made and the method returns.
* @param datum The given string.
*/
void update(const std::string& datum);
/**
* Present the given unsigned 64-bit integer as a potential unique item.
* @param datum The given integer.
*/
void update(uint64_t datum);
/**
* Present the given unsigned 32-bit integer as a potential unique item.
* @param datum The given integer.
*/
void update(uint32_t datum);
/**
* Present the given unsigned 16-bit integer as a potential unique item.
* @param datum The given integer.
*/
void update(uint16_t datum);
/**
* Present the given unsigned 8-bit integer as a potential unique item.
* @param datum The given integer.
*/
void update(uint8_t datum);
/**
* Present the given signed 64-bit integer as a potential unique item.
* @param datum The given integer.
*/
void update(int64_t datum);
/**
* Present the given signed 32-bit integer as a potential unique item.
* @param datum The given integer.
*/
void update(int32_t datum);
/**
* Present the given signed 16-bit integer as a potential unique item.
* @param datum The given integer.
*/
void update(int16_t datum);
/**
* Present the given signed 8-bit integer as a potential unique item.
* @param datum The given integer.
*/
void update(int8_t datum);
/**
* Present the given 64-bit floating point value as a potential unique item.
* @param datum The given double.
*/
void update(double datum);
/**
* Present the given 32-bit floating point value as a potential unique item.
* @param datum The given float.
*/
void update(float datum);
/**
* Present the given data array as a potential unique item.
* @param data The given array.
* @param length_bytes The array length in bytes.
*/
void update(const void* data, size_t length_bytes);
/**
* Returns the current cardinality estimate
* @return the cardinality estimate
*/
double get_estimate() const;
/**
* This is less accurate than the getEstimate() method
* and is automatically used when the sketch has gone through
* union operations where the more accurate HIP estimator cannot
* be used.
*
* This is made public only for error characterization software
* that exists in separate packages and is not intended for normal
* use.
* @return the composite cardinality estimate
*/
double get_composite_estimate() const;
/**
* Returns the approximate lower error bound given the specified
* number of standard deviations.
* @param num_std_dev Number of standard deviations, an integer from the set {1, 2, 3}.
* @return The approximate lower bound.
*/
double get_lower_bound(uint8_t num_std_dev) const;
/**
* Returns the approximate upper error bound given the specified
* number of standard deviations.
* @param num_std_dev Number of standard deviations, an integer from the set {1, 2, 3}.
* @return The approximate upper bound.
*/
double get_upper_bound(uint8_t num_std_dev) const;
/**
* Returns sketch's configured lg_k value.
* @return Configured lg_k value.
*/
uint8_t get_lg_config_k() const;
/**
* Returns the sketch's target HLL mode (from #target_hll_type).
* @return The sketch's target HLL mode.
*/
target_hll_type get_target_type() const;
/**
* Indicates if the sketch is currently stored compacted.
* @return True if the sketch is stored in compact form.
*/
bool is_compact() const;
/**
* Indicates if the sketch is currently empty.
* @return True if the sketch is empty.
*/
bool is_empty() const;
/**
* Returns the size of the sketch serialized in compact form.
* @return Size of the sketch serialized in compact form, in bytes.
*/
uint32_t get_compact_serialization_bytes() const;
/**
* Returns the size of the sketch serialized without compaction.
* @return Size of the sketch serialized without compaction, in bytes.
*/
uint32_t get_updatable_serialization_bytes() const;
/**
* Returns the maximum size in bytes that this sketch can grow to
* given lg_config_k. However, for the HLL_4 sketch type, this
* value can be exceeded in extremely rare cases. If exceeded, it
* will be larger by only a few percent.
*
* @param lg_config_k The Log2 of K for the target HLL sketch. This value must be
* between 4 and 21 inclusively.
* @param tgt_type the desired Hll type
* @return the maximum size in bytes that this sketch can grow to.
*/
static uint32_t get_max_updatable_serialization_bytes(uint8_t lg_k, target_hll_type tgt_type);
/**
* Gets the current (approximate) Relative Error (RE) asymptotic values given several
* parameters. This is used primarily for testing.
* @param upper_bound return the RE for the Upper Bound, otherwise for the Lower Bound.
* @param unioned set true if the sketch is the result of a union operation.
* @param lg_config_k the configured value for the sketch.
* @param num_std_dev the given number of Standard Deviations. This must be an integer between
* 1 and 3, inclusive.
* @return the current (approximate) RelativeError
*/
static double get_rel_err(bool upper_bound, bool unioned,
uint8_t lg_config_k, uint8_t num_std_dev);
private:
explicit hll_sketch_alloc(HllSketchImpl<A>* that);
void coupon_update(uint32_t coupon);
std::string type_as_string() const;
std::string mode_as_string() const;
hll_mode get_current_mode() const;
uint8_t get_serialization_version() const;
bool is_out_of_order_flag() const;
bool is_estimation_mode() const;
typedef typename std::allocator_traits<A>::template rebind_alloc<hll_sketch_alloc> AllocHllSketch;
HllSketchImpl<A>* sketch_impl;
friend hll_union_alloc<A>;
};
/**
* This performs union operations for HLL sketches. This union operator is configured with a
* <i>lgMaxK</i> instead of the normal <i>lg_config_k</i>.
*
* <p>This union operator does permit the unioning of sketches with different values of
* <i>lg_config_k</i>. The user should be aware that the resulting accuracy of a sketch returned
* at the end of the unioning process will be a function of the smallest of <i>lg_max_k</i> and
* <i>lg_config_k</i> that the union operator has seen.
*
* <p>This union operator also permits unioning of any of the three different target hll_sketch
* types.
*
* <p>Although the API for this union operator parallels many of the methods of the
* <i>HllSketch</i>, the behavior of the union operator has some fundamental differences.
*
* <p>First, the user cannot specify the #tgt_hll_type as an input parameter.
* Instead, it is specified for the sketch returned with #get_result(tgt_hll_tyope).
*
* <p>Second, the internal effective value of log-base-2 of <i>k</i> for the union operation can
* change dynamically based on the smallest <i>lg_config_k</i> that the union operation has seen.
*
* author Jon Malkin
* author Lee Rhodes
* author Kevin Lang
*/
template<typename A = std::allocator<uint8_t> >
class hll_union_alloc {
public:
/**
* Construct an hll_union operator with the given maximum log2 of k.
* @param lg_max_k The maximum size, in log2, of k. The value must
* be between 7 and 21, inclusive.
*/
explicit hll_union_alloc(uint8_t lg_max_k, const A& allocator = A());
/**
* Returns the current cardinality estimate
* @return the cardinality estimate
*/
double get_estimate() const;
/**
* This is less accurate than the get_estimate() method
* and is automatically used when the union has gone through
* union operations where the more accurate HIP estimator cannot
* be used.
*
* This is made public only for error characterization software
* that exists in separate packages and is not intended for normal
* use.
* @return the composite cardinality estimate
*/
double get_composite_estimate() const;
/**
* Returns the approximate lower error bound given the specified
* number of standard deviations.
* @param num_std_dev Number of standard deviations, an integer from the set {1, 2, 3}.
* @return The approximate lower bound.
*/
double get_lower_bound(uint8_t num_std_dev) const;
/**
* Returns the approximate upper error bound given the specified
* number of standard deviations.
* @param num_std_dev Number of standard deviations, an integer from the set {1, 2, 3}.
* @return The approximate upper bound.
*/
double get_upper_bound(uint8_t num_std_dev) const;
/**
* Returns union's configured lg_k value.
* @return Configured lg_k value.
*/
uint8_t get_lg_config_k() const;
/**
* Returns the union's target HLL mode (from #target_hll_type).
* @return The union's target HLL mode.
*/
target_hll_type get_target_type() const;
/**
* Indicates if the union is currently empty.
* @return True if the union is empty.
*/
bool is_empty() const;
/**
* Resets the union to an empty state in coupon collection mode.
* Does not re-use existing internal objects.
*/
void reset();
/**
* Returns the result of this union operator with the specified
* #tgt_hll_type.
* @param The tgt_hll_type enum value of the desired result (Default: HLL_4)
* @return The result of this union with the specified tgt_hll_type
*/
hll_sketch_alloc<A> get_result(target_hll_type tgt_type = HLL_4) const;
/**
* Update this union operator with the given sketch.
* @param The given sketch.
*/
void update(const hll_sketch_alloc<A>& sketch);
/**
* Update this union operator with the given temporary sketch.
* @param The given sketch.
*/
void update(hll_sketch_alloc<A>&& sketch);
/**
* Present the given std::string as a potential unique item.
* The string is converted to a byte array using UTF8 encoding.
* If the string is null or empty no update attempt is made and the method returns.
* @param datum The given string.
*/
void update(const std::string& datum);
/**
* Present the given unsigned 64-bit integer as a potential unique item.
* @param datum The given integer.
*/
void update(uint64_t datum);
/**
* Present the given unsigned 32-bit integer as a potential unique item.
* @param datum The given integer.
*/
void update(uint32_t datum);
/**
* Present the given unsigned 16-bit integer as a potential unique item.
* @param datum The given integer.
*/
void update(uint16_t datum);
/**
* Present the given unsigned 8-bit integer as a potential unique item.
* @param datum The given integer.
*/
void update(uint8_t datum);
/**
* Present the given signed 64-bit integer as a potential unique item.
* @param datum The given integer.
*/
void update(int64_t datum);
/**
* Present the given signed 32-bit integer as a potential unique item.
* @param datum The given integer.
*/
void update(int32_t datum);
/**
* Present the given signed 16-bit integer as a potential unique item.
* @param datum The given integer.
*/
void update(int16_t datum);
/**
* Present the given signed 8-bit integer as a potential unique item.
* @param datum The given integer.
*/
void update(int8_t datum);
/**
* Present the given 64-bit floating point value as a potential unique item.
* @param datum The given double.
*/
void update(double datum);
/**
* Present the given 32-bit floating point value as a potential unique item.
* @param datum The given float.
*/
void update(float datum);
/**
* Present the given data array as a potential unique item.
* @param data The given array.
* @param length_bytes The array length in bytes.
*/
void update(const void* data, size_t length_bytes);
/**
* Gets the current (approximate) Relative Error (RE) asymptotic values given several
* parameters. This is used primarily for testing.
* @param upper_bound return the RE for the Upper Bound, otherwise for the Lower Bound.
* @param unioned set true if the sketch is the result of a union operation.
* @param lg_config_k the configured value for the sketch.
* @param num_std_dev the given number of Standard Deviations. This must be an integer between
* 1 and 3, inclusive.
* @return the current (approximate) RelativeError
*/
static double get_rel_err(bool upper_bound, bool unioned,
uint8_t lg_config_k, uint8_t num_std_dev);
private:
/**
* Union the given source and destination sketches. This method examines the state of
* the current internal gadget and the incoming sketch and determines the optimal way to
* perform the union. This may involve swapping, down-sampling, transforming, and / or
* copying one of the arguments and may completely replace the internals of the union.
*
* @param incoming_impl the given incoming sketch, which may not be modified.
* @param lg_max_k the maximum value of log2 K for this union.
*/
inline void union_impl(const hll_sketch_alloc<A>& sketch, uint8_t lg_max_k);
static HllSketchImpl<A>* copy_or_downsample(const HllSketchImpl<A>* src_impl, uint8_t tgt_lg_k);
void coupon_update(uint32_t coupon);
hll_mode get_current_mode() const;
bool is_out_of_order_flag() const;
bool is_estimation_mode() const;
// calls couponUpdate on sketch, freeing the old sketch upon changes in hll_mode
static HllSketchImpl<A>* leak_free_coupon_update(HllSketchImpl<A>* impl, uint32_t coupon);
uint8_t lg_max_k_;
hll_sketch_alloc<A> gadget_;
};
/// convenience alias for hll_sketch with default allocator
typedef hll_sketch_alloc<> hll_sketch;
/// convenience alias for hll_union with default allocator
typedef hll_union_alloc<> hll_union;
} // namespace datasketches
#include "hll.private.hpp"
#endif // _HLL_HPP_