streaming_algorithms/
lib.rs

1//! SIMD-accelerated implementations of various [streaming algorithms](https://en.wikipedia.org/wiki/Streaming_algorithm).
2//!
3//! <p style="font-family: 'Fira Sans',sans-serif;padding:0.3em 0"><strong>
4//! <a href="https://crates.io/crates/streaming_algorithms">đŸ“¦&nbsp;&nbsp;Crates.io</a>&nbsp;&nbsp;│&nbsp;&nbsp;<a href="https://github.com/alecmocatta/streaming_algorithms">đŸ“‘&nbsp;&nbsp;GitHub</a>&nbsp;&nbsp;│&nbsp;&nbsp;<a href="https://constellation.zulipchat.com/#narrow/stream/213236-subprojects">đŸ’¬&nbsp;&nbsp;Chat</a>
5//! </strong></p>
6//!
7//! This library is a work in progress. PRs are very welcome! Currently implemented algorithms include:
8//!
9//!  * Count–min sketch
10//!  * Top k (Count–min sketch plus a doubly linked hashmap to track heavy hitters / top k keys when ordered by aggregated value)
11//!  * HyperLogLog
12//!  * Reservoir sampling
13//!
14//! A goal of this library is to enable composition of these algorithms; for example Top k + HyperLogLog to enable an approximate version of something akin to `SELECT key FROM table GROUP BY key ORDER BY COUNT(DISTINCT value) DESC LIMIT k`.
15//!
16//! Run your application with `RUSTFLAGS="-C target-cpu=native"` and the `nightly` feature to benefit from the SIMD-acceleration like so:
17//!
18//! ```bash
19//! RUSTFLAGS="-C target-cpu=native" cargo run --features "streaming_algorithms/nightly" --release
20//! ```
21//!
22//! See [this gist](https://gist.github.com/debasishg/8172796) for a good list of further algorithms to be implemented. Other resources are [Probabilistic data structures – Wikipedia](https://en.wikipedia.org/wiki/Category:Probabilistic_data_structures), [DataSketches – A similar Java library originating at Yahoo](https://datasketches.github.io/), and [Algebird  – A similar Java library originating at Twitter](https://github.com/twitter/algebird).
23//!
24//! As these implementations are often in hot code paths, unsafe is used, albeit only when necessary to a) achieve the asymptotically optimal algorithm or b) mitigate an observed bottleneck.
25
26#![doc(html_root_url = "https://docs.rs/streaming_algorithms/0.3.0")]
27#![warn(
28	missing_copy_implementations,
29	missing_debug_implementations,
30	missing_docs,
31	trivial_casts,
32	trivial_numeric_casts,
33	unused_import_braces,
34	unused_qualifications,
35	unused_results,
36	clippy::pedantic
37)]
38// from https://github.com/rust-unofficial/patterns/blob/master/anti_patterns/deny-warnings.md
39#![allow(
40	dead_code,
41	clippy::doc_markdown,
42	clippy::inline_always,
43	clippy::module_name_repetitions,
44	clippy::if_not_else,
45	clippy::needless_pass_by_value,
46	clippy::suspicious_op_assign_impl,
47	clippy::float_cmp,
48	clippy::unsafe_derive_deserialize,
49	clippy::must_use_candidate,
50	clippy::unused_self
51)]
52
53mod count_min;
54mod distinct;
55mod linked_list;
56mod ordered_linked_list;
57mod sample;
58mod top;
59mod traits;
60
61pub use count_min::*;
62pub use distinct::*;
63pub use sample::*;
64pub use top::*;
65pub use traits::*;
66
67// TODO: replace all instances of the following with a.try_into().unwrap() if/when that exists https://github.com/rust-lang/rust/pull/47857
68#[allow(
69	clippy::cast_possible_truncation,
70	clippy::cast_sign_loss,
71	clippy::cast_precision_loss,
72	clippy::cast_lossless
73)]
74fn f64_to_usize(a: f64) -> usize {
75	assert!(a.is_sign_positive() && a <= usize::max_value() as f64 && a.fract() == 0.0);
76	a as usize
77}
78
79#[allow(
80	clippy::cast_possible_truncation,
81	clippy::cast_sign_loss,
82	clippy::cast_precision_loss
83)]
84fn f64_to_u8(a: f64) -> u8 {
85	assert!(a.is_sign_positive() && a <= f64::from(u8::max_value()) && a.fract() == 0.0);
86	a as u8
87}
88
89#[allow(clippy::cast_precision_loss, clippy::cast_lossless)]
90fn usize_to_f64(a: usize) -> f64 {
91	assert!(a as u64 <= 1_u64 << 53);
92	a as f64
93}
94#[allow(clippy::cast_precision_loss)]
95fn u64_to_f64(a: u64) -> f64 {
96	assert!(a <= 1_u64 << 53);
97	a as f64
98}