cvm/
lib.rs

1use rand::Rng;
2use std::collections::HashSet;
3use std::fs::File;
4use std::io::{self, BufRead};
5use std::path::PathBuf;
6
7pub fn estimate_from_many(paths: &[PathBuf], delta: f64, epsilon: f64) -> io::Result<usize> {
8    let m = count_rows(paths)?;
9    
10    let mut rng = rand::thread_rng();
11
12    let mut x: HashSet<String> = HashSet::new();
13    let mut p: f64 = 1.0;
14
15    let thresh = ((12.0 / epsilon.powi(2)) * (8.0 * m as f64 / delta).log2()).ceil() as usize;
16
17    for path in paths {
18        let file = File::open(path)?;
19        let reader = io::BufReader::new(file);
20
21        for line in reader.lines() {
22            let el = line?;
23            x.remove(&el);
24
25            if rng.gen_bool(p) {
26                x.insert(el.clone());            
27            }
28
29            if x.len() == thresh {
30                x.retain(|_| rng.gen_bool(0.5));
31                p /= 2.0;
32            }
33        }
34    }
35
36    Ok((x.len() as f64 / p) as usize)
37}
38
/// Counts the total number of lines across all files in `paths`.
///
/// # Errors
///
/// Returns the underlying [`io::Error`] if a file cannot be opened or if reading any
/// line fails (including lines that are not valid UTF-8).
fn count_rows(paths: &[PathBuf]) -> io::Result<usize> {
    let mut total_lines = 0;
    for path in paths {
        let reader = io::BufReader::new(File::open(path)?);
        for line in reader.lines() {
            // Propagate read errors instead of counting them as lines: `lines()`
            // yields `Err` items, and on a persistent failure (e.g. the path is a
            // directory) it can keep yielding them forever, so a plain `.count()`
            // would either miscount or never terminate.
            line?;
            total_lines += 1;
        }
    }
    Ok(total_lines)
}
48
49pub fn estimate(source: Vec<String>, delta: f64, epsilon: f64) -> usize {
50    let mut rng = rand::thread_rng();
51
52    let mut x: HashSet<String> = HashSet::new();
53    let mut p: f64 = 1 as f64;
54    let m = source.len();
55
56    let thresh = ((12.0 / epsilon.powi(2)) * (8.0 * m as f64 / delta).log2()).ceil() as usize;
57
58
59    for el in source.iter() {
60        x.remove(el);
61
62        if rng.gen_bool(p) {
63            x.insert(el.to_owned());
64        }
65
66        if x.len() == thresh {
67            x.retain(|_| rng.gen_bool(0.5));
68            p = p/2.0;
69        }
70    }
71
72    (x.len() as f64 / p) as usize
73}