tf_idf_vectorizer/vectorizer/
mod.rs

1
2use token::TokenFrequency;
3
4pub mod index;
5pub mod token;
6pub mod analyzer;
7
8pub struct TFIDFVectorizer {
9    pub corpus: TokenFrequency,
10    doc_num: u64,
11}
12
13impl TFIDFVectorizer {
14    pub fn new() -> Self {
15        Self {
16            corpus: TokenFrequency::new(),
17            doc_num: 0,
18        }
19    }
20
21    pub fn doc_num(&self) -> u64 {
22        self.doc_num
23    }
24
25    pub fn add_corpus(&mut self, tokens: &[&str]) {
26        // TFの計算
27        let mut doc_tf = TokenFrequency::new();
28        doc_tf.add_tokens(tokens);
29
30        // corpus_token_freqに追加
31        self.corpus.add_tokens(tokens);
32 
33        self.doc_num += 1;
34    }
35
36    pub fn tf_idf_vector(&self, tokens: &[&str]) -> Vec<(&str, f64)> {
37        // TFの計算
38        let mut doc_tf = TokenFrequency::new();
39        doc_tf.add_tokens(tokens);
40
41        let mut result: Vec<(&str, f64)> = Vec::new();
42        // corpus_token_freqに追加
43        let idf_vec: Vec<(&str, f64)> = self.corpus.idf_vector_ref_str(self.doc_num as u64);
44        for (added_token, idf) in idf_vec.iter() {
45            let tf: f64 = doc_tf.tf_token(added_token);
46            if tf != 0.0 {
47                let tf_idf = tf * idf;
48                result.push((*added_token, tf_idf));
49            }
50        }
51        result.sort_by(|a, b| b.1.total_cmp(&a.1));
52        result
53    }
54}