1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#![crate_name = "basiccms"]
#![crate_type = "lib"]

extern crate rand;

use std::hash::{ SipHasher, Hasher, Hash };

pub struct Sketch {
    width  : usize,
    depth  : usize,
    buckets: Vec<Vec<u64>>,
    hash_offsets: Vec<u64>
}

impl Sketch {

    pub fn add<T: IntoSketch>(&mut self, val: T) -> &Sketch {
        for row in 0..self.depth {
            let key = self.index(&val, row);
            let value = self.buckets[row][key];

            self.buckets[row][key] = value + 1;
        }

        self
    }

    pub fn point<T: IntoSketch>(&mut self, val: T) -> u64 {
        let mut counts = Vec::with_capacity(self.depth);

        for row in 0..self.depth {
            let key   = self.index(&val, row);
            let value = self.buckets[row][key];
            counts.push(value);
        }

        *counts.iter().min().unwrap()
    }

    pub fn new(epsilon: f64, delta: f64) -> Sketch {
        if epsilon < 0.0 {
            panic!("CMS: epsilon must be positive");
        }

        if delta < 0.0  || delta > 1.0 {
            panic!("CMS: delta must be in (0.0, 1.0)");
        }

        let width        = (std::f64::consts::E / epsilon).ceil() as usize;
        let depth        = (1.0 / delta).ln().ceil() as usize;
        let hash_offsets = Sketch::make_hash_offsets(depth);

        Sketch {
            width: width,
            depth: depth,
            buckets: vec![vec![0; width]; depth],
            hash_offsets: hash_offsets
        }
    }

    fn index<T: IntoSketch>(&mut self, val: &T, row: usize) -> usize {
        let sketched = val.asu64();
        let offset   = self.hash_offsets[row];
        let hashed   = sketched.wrapping_mul(offset);

        (hashed % (self.width as u64)) as usize
    }

    fn make_hash_offsets(depth: usize) -> Vec<u64> {
        let mut hashers: Vec<u64> = Vec::with_capacity(depth);

        for _ in 0..depth {
            hashers.push(rand::random::<u64>());
        }

        hashers
    }
}

fn elementwise_sum(left: &Vec<Vec<u64>>, right: &Vec<Vec<u64>>) -> Vec<Vec<u64>> {
    left.iter()
        .zip(right.iter())
        .map(|(l, r)| l.iter().zip(r.iter()).map(|(x, y)| x + y).collect())
        .collect()
}

impl<'a> std::ops::Add<&'a Sketch> for &'a Sketch {
    type Output = Sketch;

    fn add(self, other: &'a Sketch) -> Sketch {
        assert_eq!(self.width, other.width);
        assert_eq!(self.depth, other.depth);
        assert_eq!(self.hash_offsets, other.hash_offsets);

        Sketch {
            width: self.width,
            depth: self.depth,
            hash_offsets: self.hash_offsets.clone(),
            buckets: elementwise_sum(&self.buckets, &other.buckets)
        }
    }
}

impl std::clone::Clone for Sketch {
    fn clone(&self) -> Sketch {
        Sketch {
            width: self.width,
            depth: self.depth,
            hash_offsets: self.hash_offsets.clone(),
            buckets: self.buckets.clone()
        }
    }
}

pub trait IntoSketch {
    fn asu64(&self) -> u64;
}

impl IntoSketch for u64 {
    fn asu64(&self) -> u64 {
        *self
    }
}

impl IntoSketch for String {
    fn asu64(&self) -> u64 {
        let mut hasher = SipHasher::new();
        self.hash(&mut hasher);

        hasher.finish()
    }
}

impl<'a> IntoSketch for &'a str {
    fn asu64(&self) -> u64 {
        let mut hasher = SipHasher::new();
        self.hash(&mut hasher);

        hasher.finish()
    }
}