1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#![allow(clippy::type_complexity)]

use derive_new::new;
use rand::thread_rng;
use serde::{Deserialize, Serialize};
use std::hash::Hash;
use streaming_algorithms::{HyperLogLogMagnitude, SampleUnstable as SASampleUnstable, Top};

use super::{
	folder_par_sink, FolderSync, FolderSyncReducer, FolderSyncReducerFactory, ParallelPipe, ParallelSink, SumFolder, SumZeroFolder
};

/// Sink adaptor pairing an upstream pipe with a sample count.
///
/// The `ParallelSink` implementation for this type forwards `samples` to
/// `SampleUnstableFolder::new`. The constructor is generated by
/// `#[derive(new)]` and takes the fields in declaration order.
#[derive(new)]
#[must_use]
pub struct SampleUnstable<I> {
	/// Upstream pipe whose items are fed to the sampler.
	i: I,
	/// Value passed on to the sampler's constructor; presumably the number of
	/// items to retain — confirm against `streaming_algorithms`.
	samples: usize,
}

// `ParallelSink` implementation for `SampleUnstable`, generated through the
// project macros `impl_par_dist!` and `folder_par_sink!` (defined elsewhere).
// The first folder is a `SampleUnstableFolder` built from `self.samples`; the
// second is a `SumFolder` over the resulting samplers — presumably it merges
// the per-partition results, verify against the macro definition.
// Items must be `Send + 'static`.
impl_par_dist! {
	impl<I: ParallelPipe<Source>, Source> ParallelSink<Source>
		for SampleUnstable<I>
	where
		I::Item: Send + 'static,
	{
		folder_par_sink!(SampleUnstableFolder, SumFolder<SASampleUnstable<I::Item>>, self, SampleUnstableFolder::new(self.samples), SumFolder::new());
	}
}

/// Folder configuration used by the `SampleUnstable` sink.
///
/// Cloneable and serde-serializable; the constructor is generated by
/// `#[derive(new)]`.
#[derive(Clone, Serialize, Deserialize, new)]
pub struct SampleUnstableFolder {
	/// Argument given to `SASampleUnstable::new` whenever a fresh accumulator
	/// is created.
	samples: usize,
}

/// Folds a stream of `A` values into a `streaming_algorithms` sampler.
impl<A> FolderSync<A> for SampleUnstableFolder {
	/// Accumulator type: the sampler imported as `SASampleUnstable`.
	type Output = SASampleUnstable<A>;

	/// Builds a fresh sampler configured with `self.samples`.
	fn zero(&mut self) -> Self::Output {
		let sample_count = self.samples;
		SASampleUnstable::new(sample_count)
	}

	/// Offers `item` to the sampler, drawing randomness from `thread_rng()`.
	fn push(&mut self, state: &mut Self::Output, item: A) {
		let mut rng = thread_rng();
		state.push(item, &mut rng);
	}
}

/// Sink adaptor pairing an upstream pipe with the parameters of a
/// `streaming_algorithms::Top` counter.
///
/// The `ParallelSink` implementation forwards `n`, `probability` and
/// `tolerance` unchanged to `MostFrequentFolder::new` and `Top::new`.
/// The constructor is generated by `#[derive(new)]` and takes the fields in
/// declaration order.
#[derive(new)]
#[must_use]
pub struct MostFrequent<I> {
	/// Upstream pipe whose items are counted.
	i: I,
	/// First argument to `Top::new`; presumably how many top entries are
	/// kept — confirm against `streaming_algorithms`.
	n: usize,
	/// Second argument to `Top::new`.
	/// NOTE(review): looks like a confidence parameter of the underlying sketch — confirm.
	probability: f64,
	/// Third argument to `Top::new`.
	/// NOTE(review): looks like an error-tolerance parameter of the underlying sketch — confirm.
	tolerance: f64,
}

// `ParallelSink` implementation for `MostFrequent`, generated through the
// project macros `impl_par_dist!` and `folder_par_sink!` (defined elsewhere).
// The first folder is a `MostFrequentFolder`; the second is a `SumZeroFolder`
// seeded with a `Top` built from the same `n`, `probability` and `tolerance`,
// so both stages use identical parameters.
// Items must be `Clone + Hash + Eq + Send + 'static`.
impl_par_dist! {
	impl<I: ParallelPipe<Source>, Source> ParallelSink<Source>
		for MostFrequent<I>
	where
		I::Item: Clone + Hash + Eq + Send + 'static,
	{
		folder_par_sink!(MostFrequentFolder, SumZeroFolder<Top<I::Item, usize>>, self, MostFrequentFolder::new(self.n, self.probability, self.tolerance), SumZeroFolder::new(Top::new(self.n, self.probability, self.tolerance, ())));
	}
}

/// Folder configuration used by the `MostFrequent` sink.
///
/// Stores the three values passed to `Top::new` each time a fresh accumulator
/// is created. Cloneable and serde-serializable; the constructor is generated
/// by `#[derive(new)]`.
#[derive(Clone, Serialize, Deserialize, new)]
pub struct MostFrequentFolder {
	/// First argument to `Top::new`.
	n: usize,
	/// Second argument to `Top::new`.
	probability: f64,
	/// Third argument to `Top::new`.
	tolerance: f64,
}

impl<A> FolderSync<A> for MostFrequentFolder
where
	A: Clone + Hash + Eq + Send + 'static,
{
	type Output = Top<A, usize>;

	fn zero(&mut self) -> Self::Output {
		Top::new(self.n, self.probability, self.tolerance, ())
	}
	fn push(&mut self, state: &mut Self::Output, item: A) {
		state.push(item, &1)
	}
}

/// Sink adaptor pairing an upstream pipe of `(A, B)` pairs with the
/// parameters of a `Top<A, HyperLogLogMagnitude<B>>`.
///
/// The `ParallelSink` implementation forwards all four numeric fields
/// unchanged to `MostDistinctFolder::new` and `Top::new`. The constructor is
/// generated by `#[derive(new)]` and takes the fields in declaration order.
#[derive(new)]
#[must_use]
pub struct MostDistinct<I> {
	/// Upstream pipe yielding the pairs to aggregate.
	i: I,
	/// First argument to `Top::new`; presumably how many top entries are
	/// kept — confirm against `streaming_algorithms`.
	n: usize,
	/// Second argument to `Top::new`.
	/// NOTE(review): looks like a confidence parameter of the underlying sketch — confirm.
	probability: f64,
	/// Third argument to `Top::new`.
	/// NOTE(review): looks like an error-tolerance parameter of the underlying sketch — confirm.
	tolerance: f64,
	/// Fourth argument to `Top::new`; presumably the error rate used for each
	/// `HyperLogLogMagnitude` — confirm against `streaming_algorithms`.
	error_rate: f64,
}

// `ParallelSink` implementation for `MostDistinct`, generated through the
// project macros `impl_par_dist!` and `folder_par_sink!` (defined elsewhere).
// The pipe must yield `(A, B)` pairs. The first folder is a
// `MostDistinctFolder`; the second is a `SumZeroFolder` seeded with a `Top`
// built from the same four parameters, so both stages use identical settings.
// `A` must be `Clone + Hash + Eq + Send + 'static`; `B` must be `Hash + 'static`.
impl_par_dist! {
	impl<I: ParallelPipe<Source, Item = (A, B)>, Source, A, B> ParallelSink<Source> for MostDistinct<I>
	where
		A: Clone + Hash + Eq + Send + 'static,
		B: Hash + 'static,
	{
		folder_par_sink!(MostDistinctFolder, SumZeroFolder<Top<A, HyperLogLogMagnitude<B>>>, self, MostDistinctFolder::new(self.n, self.probability, self.tolerance, self.error_rate), SumZeroFolder::new(Top::new(self.n, self.probability, self.tolerance, self.error_rate)));
	}
}

/// Folder configuration used by the `MostDistinct` sink.
///
/// Stores the four values passed to `Top::new` each time a fresh accumulator
/// is created. Cloneable and serde-serializable; the constructor is generated
/// by `#[derive(new)]`.
#[derive(Clone, Serialize, Deserialize, new)]
pub struct MostDistinctFolder {
	/// First argument to `Top::new`.
	n: usize,
	/// Second argument to `Top::new`.
	probability: f64,
	/// Third argument to `Top::new`.
	tolerance: f64,
	/// Fourth argument to `Top::new`.
	error_rate: f64,
}

impl<A, B> FolderSync<(A, B)> for MostDistinctFolder
where
	A: Clone + Hash + Eq + Send + 'static,
	B: Hash + 'static,
{
	type Output = Top<A, HyperLogLogMagnitude<B>>;

	fn zero(&mut self) -> Self::Output {
		Top::new(self.n, self.probability, self.tolerance, self.error_rate)
	}
	fn push(&mut self, state: &mut Self::Output, item: (A, B)) {
		state.push(item.0, &item.1)
	}
}