tf_idf_vectorizer/vectorizer/mod.rs
use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
use self::token::TokenFrequency;

pub mod index;
pub mod token;
pub mod analyzer;

/// TF-IDF vectorizer built incrementally from a corpus of tokenized documents.
pub struct TFIDFVectorizer {
    /// Corpus-wide token frequencies, used to derive IDF weights.
    pub corpus: TokenFrequency,
    /// Number of documents added to the corpus.
    doc_num: u64,
}

impl TFIDFVectorizer {
    pub fn new() -> Self {
        Self {
            corpus: TokenFrequency::new(),
            doc_num: 0,
        }
    }

    /// Returns how many documents have been added to the corpus.
    pub fn doc_num(&self) -> u64 {
        self.doc_num
    }

    pub fn add_corpus(&mut self, tokens: &[&str]) {
        // Per-document term frequencies (not used further in this method).
        let mut doc_tf = TokenFrequency::new();
        doc_tf.add_tokens(tokens);

        // Fold the document's tokens into the corpus-wide frequency table
        // and count the document towards the IDF denominator.
        self.corpus.add_tokens(tokens);
        self.doc_num += 1;
    }

    pub fn tf_idf_vector(&self, tokens: &[&str]) -> Vec<(&str, f64)> {
        // Term frequencies for the query document.
        let mut doc_tf = TokenFrequency::new();
        doc_tf.add_tokens(tokens);

        // IDF weights for every token seen in the corpus so far.
        let idf_vec: Vec<(&str, f64)> = self.corpus.idf_vector_ref_str(self.doc_num);

        // Keep only tokens that actually occur in the document (tf > 0).
        let mut result: Vec<(&str, f64)> = Vec::new();
        for (added_token, idf) in idf_vec.iter() {
            let tf: f64 = doc_tf.tf_token(added_token);
            if tf != 0.0 {
                result.push((*added_token, tf * idf));
            }
        }

        // Sort by TF-IDF weight, highest first.
        result.sort_by(|a, b| b.1.total_cmp(&a.1));
        result
    }

    pub fn tf_idf_vector_parallel(&self, tokens: &[&str], thread_count: usize) -> Vec<(&str, f64)> {
        // Term frequencies for the query document.
        let mut doc_tf = TokenFrequency::new();
        doc_tf.add_tokens(tokens);

        let idf_vec: Vec<(&str, f64)> = self.corpus.idf_vector_ref_str(self.doc_num);

        // A dedicated pool bounds the parallelism to `thread_count`; note that
        // building a pool per call has some overhead, and the `unwrap()`
        // panics if the pool cannot be created.
        let pool = rayon::ThreadPoolBuilder::new()
            .num_threads(thread_count)
            .build()
            .unwrap();

        let mut result: Vec<(&str, f64)> = pool.install(|| {
            idf_vec
                .par_iter()
                .filter_map(|(added_token, idf)| {
                    let tf: f64 = doc_tf.tf_token(added_token);
                    if tf != 0.0 {
                        Some((*added_token, tf * idf))
                    } else {
                        None
                    }
                })
                .collect()
        });

        // Sort by TF-IDF weight, highest first.
        result.sort_by(|a, b| b.1.total_cmp(&a.1));
        result
    }
}
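A minimal usage sketch of the API above, assuming the crate is consumed as `tf_idf_vectorizer` and the module path mirrors the directory layout; the exact TF and IDF formulas live in `token::TokenFrequency`, which is not shown here.

// Hypothetical example (not part of mod.rs); the import path below is an
// assumption based on the directory layout tf_idf_vectorizer/vectorizer/.
use tf_idf_vectorizer::vectorizer::TFIDFVectorizer;

fn main() {
    let mut vectorizer = TFIDFVectorizer::new();

    // Feed already-tokenized documents into the corpus.
    vectorizer.add_corpus(&["the", "cat", "sat", "on", "the", "mat"]);
    vectorizer.add_corpus(&["the", "dog", "chased", "the", "cat"]);
    assert_eq!(vectorizer.doc_num(), 2);

    // Score a query document; only tokens present in both the query and the
    // corpus appear, sorted by descending TF-IDF weight.
    let query = ["the", "cat", "purred"];
    for (token, weight) in vectorizer.tf_idf_vector(&query) {
        println!("{token}: {weight:.4}");
    }

    // Same computation, restricted to a 4-thread rayon pool.
    let _parallel = vectorizer.tf_idf_vector_parallel(&query, 4);
}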