basiccms/
lib.rs

1#![crate_name = "basiccms"]
2#![crate_type = "lib"]
3
4extern crate rand;
5
6use std::collections::hash_map::DefaultHasher;
7use std::hash::{ Hasher, Hash };
8
9pub struct Sketch {
10    width  : usize,
11    depth  : usize,
12    buckets: Vec<Vec<u64>>,
13    hash_offsets: Vec<u64>
14}
15
16impl Sketch {
17
18    pub fn add<T: IntoSketch>(&mut self, val: T) -> &Sketch {
19        for row in 0..self.depth {
20            let key = self.index(&val, row);
21            let value = self.buckets[row][key];
22
23            self.buckets[row][key] = value + 1;
24        }
25
26        self
27    }
28
29    pub fn point<T: IntoSketch>(&mut self, val: T) -> u64 {
30        let mut counts = Vec::with_capacity(self.depth);
31
32        for row in 0..self.depth {
33            let key   = self.index(&val, row);
34            let value = self.buckets[row][key];
35            counts.push(value);
36        }
37
38        *counts.iter().min().unwrap()
39    }
40
41    pub fn new(epsilon: f64, delta: f64) -> Sketch {
42        if epsilon < 0.0 {
43            panic!("CMS: epsilon must be positive");
44        }
45
46        if delta < 0.0  || delta > 1.0 {
47            panic!("CMS: delta must be in (0.0, 1.0)");
48        }
49
50        let width        = (std::f64::consts::E / epsilon).ceil() as usize;
51        let depth        = (1.0 / delta).ln().ceil() as usize;
52        let hash_offsets = Sketch::make_hash_offsets(depth);
53
54        Sketch {
55            width: width,
56            depth: depth,
57            buckets: vec![vec![0; width]; depth],
58            hash_offsets: hash_offsets
59        }
60    }
61
62    fn index<T: IntoSketch>(&mut self, val: &T, row: usize) -> usize {
63        let sketched = val.asu64();
64        let offset   = self.hash_offsets[row];
65        let hashed   = sketched.wrapping_mul(offset);
66
67        (hashed % (self.width as u64)) as usize
68    }
69
70    fn make_hash_offsets(depth: usize) -> Vec<u64> {
71        let mut hashers: Vec<u64> = Vec::with_capacity(depth);
72
73        for _ in 0..depth {
74            hashers.push(rand::random::<u64>());
75        }
76
77        hashers
78    }
79}
80
81fn elementwise_sum(left: &Vec<Vec<u64>>, right: &Vec<Vec<u64>>) -> Vec<Vec<u64>> {
82    left.iter()
83        .zip(right.iter())
84        .map(|(l, r)| l.iter().zip(r.iter()).map(|(x, y)| x + y).collect())
85        .collect()
86}
87
88impl<'a> std::ops::Add<&'a Sketch> for &'a Sketch {
89    type Output = Sketch;
90
91    fn add(self, other: &'a Sketch) -> Sketch {
92        assert_eq!(self.width, other.width);
93        assert_eq!(self.depth, other.depth);
94        assert_eq!(self.hash_offsets, other.hash_offsets);
95
96        Sketch {
97            width: self.width,
98            depth: self.depth,
99            hash_offsets: self.hash_offsets.clone(),
100            buckets: elementwise_sum(&self.buckets, &other.buckets)
101        }
102    }
103}
104
105impl std::clone::Clone for Sketch {
106    fn clone(&self) -> Sketch {
107        Sketch {
108            width: self.width,
109            depth: self.depth,
110            hash_offsets: self.hash_offsets.clone(),
111            buckets: self.buckets.clone()
112        }
113    }
114}
115
116pub trait IntoSketch {
117    fn asu64(&self) -> u64;
118}
119
120impl IntoSketch for u64 {
121    fn asu64(&self) -> u64 {
122        *self
123    }
124}
125
126impl IntoSketch for String {
127    fn asu64(&self) -> u64 {
128        let mut hasher = DefaultHasher::new();
129        self.hash(&mut hasher);
130
131        hasher.finish()
132    }
133}
134
135impl<'a> IntoSketch for &'a str {
136    fn asu64(&self) -> u64 {
137        let mut hasher = DefaultHasher::new();
138        self.hash(&mut hasher);
139
140        hasher.finish()
141    }
142}