// tf_idf_vectorizer/vectorizer/mod.rs
2use token::TokenFrequency;
3
4pub mod index;
5pub mod token;
6pub mod analyzer;
7
8pub struct TFIDFVectorizer {
9 pub corpus: TokenFrequency,
10 doc_num: u64,
11}
12
13impl TFIDFVectorizer {
14 pub fn new() -> Self {
15 Self {
16 corpus: TokenFrequency::new(),
17 doc_num: 0,
18 }
19 }
20
21 pub fn doc_num(&self) -> u64 {
22 self.doc_num
23 }
24
25 pub fn add_corpus(&mut self, tokens: &[&str]) {
26 let mut doc_tf = TokenFrequency::new();
28 doc_tf.add_tokens(tokens);
29
30 self.corpus.add_tokens(tokens);
32
33 self.doc_num += 1;
34 }
35
36 pub fn tf_idf_vector(&self, tokens: &[&str]) -> Vec<(&str, f64)> {
37 let mut doc_tf = TokenFrequency::new();
39 doc_tf.add_tokens(tokens);
40
41 let mut result: Vec<(&str, f64)> = Vec::new();
42 let idf_vec: Vec<(&str, f64)> = self.corpus.idf_vector_ref_str(self.doc_num as u64);
44 for (added_token, idf) in idf_vec.iter() {
45 let tf: f64 = doc_tf.tf_token(added_token);
46 if tf != 0.0 {
47 let tf_idf = tf * idf;
48 result.push((*added_token, tf_idf));
49 }
50 }
51 result.sort_by(|a, b| b.1.total_cmp(&a.1));
52 result
53 }
54}