yake_rust/
result_item.rs

1use std::cmp::max;
2
3use levenshtein::levenshtein;
4
5use crate::{Candidate, LTerm};
6
7/// Represents a key phrase.
8#[derive(PartialEq, Clone, Debug)]
9pub struct ResultItem {
10    /// The first occurrence in the text. Not exact, as words are joined by a single space.
11    pub raw: String,
12    /// A lowercased key phrase consisting of 1…N words, where N is configured through [`Config::ngrams`].
13    pub keyword: LTerm,
14    /// Key importance, where 0 is the most important.
15    pub score: f64,
16}
17
18impl PartialEq<(&str, &str, f64)> for ResultItem {
19    fn eq(&self, (raw, key_phrase, score): &(&str, &str, f64)) -> bool {
20        self.raw.eq(raw) && self.keyword.eq(key_phrase) && self.score.eq(score)
21    }
22}
23
24impl From<Candidate<'_>> for ResultItem {
25    fn from(candidate: Candidate) -> Self {
26        ResultItem { raw: candidate.raw.join(" "), keyword: candidate.lc_terms.join(" "), score: candidate.score }
27    }
28}
29
30pub(crate) fn remove_duplicates(threshold: f64, results: Vec<ResultItem>, n: usize) -> Vec<ResultItem> {
31    let mut unique: Vec<ResultItem> = Vec::new();
32
33    for res in results {
34        if unique.len() >= n {
35            break;
36        }
37
38        let is_duplicate = unique.iter().any(|it| levenshtein_ratio(&it.keyword, &res.keyword) > threshold);
39
40        if !is_duplicate {
41            unique.push(res);
42        }
43    }
44
45    unique
46}
47
48/// Returns a number in 0..1 range, where 0 is distant and 1 is close.
49fn levenshtein_ratio(seq1: &str, seq2: &str) -> f64 {
50    let distance = if seq1.len() <= seq2.len() { levenshtein(seq1, seq2) } else { levenshtein(seq2, seq1) };
51    let length = max(seq1.len(), seq2.len());
52    1.0 - (distance as f64 / length as f64)
53}