// simstring_rust/measures/cosine.rs

use super::SimilarityMeasure;
use crate::SimStringDB;
use std::collections::HashSet;

/// Field-less marker type naming the cosine measure.
///
/// The struct carries no data, so every value is interchangeable and
/// zero-sized. The usual value-type traits are derived so it can be printed,
/// copied, compared and used as a map key without extra boilerplate.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)]
pub struct Cosine;

impl Cosine {
    /// Creates a `Cosine` value; equivalent to [`Cosine::default`].
    #[must_use]
    pub const fn new() -> Self {
        Cosine
    }
}

/// Cosine-coefficient implementation of `SimilarityMeasure`.
///
/// For feature sets `X` and `Y` the score is `|X ∩ Y| / sqrt(|X| * |Y|)`.
/// The size and overlap bounds below are the algebraic rearrangements of
/// `score >= alpha` for that formula.
impl SimilarityMeasure for Cosine {
    /// Returns `ceil(alpha² * query_size)`.
    ///
    /// The float-to-integer `as` cast saturates instead of panicking when the
    /// value is out of `i64` range.
    fn minimum_feature_size(&self, query_size: i64, alpha: f64) -> i64 {
        (alpha * alpha * query_size as f64).ceil() as i64
    }

    /// Returns `floor(query_size / alpha²)`.
    ///
    /// `_db` is not consulted by this measure. When `alpha` is zero the
    /// quotient is non-finite; the `as` cast then saturates to the `i64`
    /// limits (or yields 0 for NaN) rather than panicking.
    fn maximum_feature_size(&self, _db: &impl SimStringDB, query_size: i64, alpha: f64) -> i64 {
        (query_size as f64 / (alpha * alpha)).floor() as i64
    }

    /// Computes the cosine coefficient of two feature-id slices.
    ///
    /// Duplicate ids within a slice are collapsed before counting, so each
    /// slice is treated as a set. Returns `0.0` when either slice is empty;
    /// otherwise the result lies in `[0.0, 1.0]`.
    fn similarity_score(&self, x: &[i64], y: &[i64]) -> f64 {
        // With an empty side both the intersection and the denominator are
        // zero, and `0.0 / 0.0` is NaN. NaN silently fails every threshold
        // comparison and poisons sorting, so report "no similarity" instead.
        if x.is_empty() || y.is_empty() {
            return 0.0;
        }

        let set_x: HashSet<i64> = x.iter().copied().collect();
        let set_y: HashSet<i64> = y.iter().copied().collect();
        let intersection_count = set_x.intersection(&set_y).count() as f64;
        // Multiply in f64 (as `minimum_overlap` does) so the product of the
        // two sizes cannot overflow `usize`.
        let denominator = (set_x.len() as f64 * set_y.len() as f64).sqrt();

        intersection_count / denominator
    }

    /// Returns `ceil(alpha * sqrt(query_size * candidate_size))`, the number
    /// of shared features needed for the score to reach `alpha`.
    fn minimum_overlap(&self, query_size: i64, candidate_size: i64, alpha: f64) -> i64 {
        (alpha * (query_size as f64 * candidate_size as f64).sqrt()).ceil() as i64
    }
}