1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
use infisearch_common::dictionary::Dictionary;

mod edit_distance;

const CACHE_SIZE: usize = 8;

pub struct BestTermCorrector {
    cache: [(String, Option<String>); CACHE_SIZE],
    cache_idx: usize,
}

impl BestTermCorrector {
    #[inline(always)]
    pub fn new() -> Self {
        BestTermCorrector {
            cache: [
                ("".to_owned(), None),
                ("".to_owned(), None),
                ("".to_owned(), None),
                ("".to_owned(), None),
                ("".to_owned(), None),
                ("".to_owned(), None),
                ("".to_owned(), None),
                ("".to_owned(), None),
            ],
            cache_idx: 0,
        }
    }

    #[inline(always)]
    pub fn get_best_corrected_term(&mut self, dict: &Dictionary, misspelled_term: &str) -> Option<String> {
        if let Some((_, corrected)) = self.cache.iter().find(|(base_term, _)| base_term == misspelled_term) {
            return corrected.clone();
        }

        let mut best_term = None;
        let mut max_doc_freq = 0;
    
        let base_term_char_count = misspelled_term.chars().count();
        let mut min_edit_distance: usize = match base_term_char_count {
            0..=4 => 1,
            5..=8 => 2,
            _ => 3,
        };
    
        let mut cache = [255_usize; 255];
    
        for (term, term_info) in dict.term_infos.iter() {
            if term.chars().count().abs_diff(base_term_char_count) > min_edit_distance {
                continue;
            }
    
            if min_edit_distance == 1 && term_info.doc_freq < max_doc_freq {
                continue;
            }
    
            let edit_distance = edit_distance::levenshtein(
                term,
                misspelled_term,
                base_term_char_count,
                &mut cache,
            );
            if edit_distance < min_edit_distance {
                min_edit_distance = edit_distance;
                max_doc_freq = term_info.doc_freq;
                best_term = Some(term);
            } else if edit_distance == min_edit_distance && term_info.doc_freq > max_doc_freq {
                max_doc_freq = term_info.doc_freq;
                best_term = Some(term);
            }
        }
    
        let result = if let Some(best_term) = best_term {
            let normal_string = std::string::String::from(best_term.as_str());
            Some(normal_string)
        } else {
            None
        };

        unsafe {
            *self.cache.get_unchecked_mut(self.cache_idx) = (
                misspelled_term.to_owned(), result.clone(),
            );
        }
        self.cache_idx = (self.cache_idx + 1) % CACHE_SIZE;

        result
    }
}