tf_idf_vectorizer/vectorizer/mod.rs

pub mod corpus;
pub mod tfidf;
pub mod token;
pub mod serde;
pub mod compute;
pub mod evaluate;

use num::Num;
use ::serde::{Deserialize, Serialize};

use crate::{utils::math::vector::{ZeroSpVec, ZeroSpVecTrait}, vectorizer::{compute::compare::{Compare, DefaultCompare}, corpus::Corpus, tfidf::{DefaultTFIDFEngine, TFIDFEngine}, token::TokenFrequency}};
use std::collections::HashSet;

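/// TF-IDF vectorizer over a shared [`Corpus`].
///
/// The sketch below shows the intended call sequence: construct against a
/// corpus, add documents, then refresh the IDF cache. How `Corpus` and
/// `TokenFrequency` are actually built is defined in the sibling `corpus` and
/// `token` modules; the constructors used here (`Corpus::new`,
/// `TokenFrequency::new`, `add_tokens`) are assumptions for illustration only.
///
/// ```ignore
/// let corpus = Corpus::new();                        // assumed constructor
/// let mut vectorizer: TFIDFVectorizer = TFIDFVectorizer::new(&corpus);
///
/// let mut tf = TokenFrequency::new();                // assumed constructor
/// tf.add_tokens(&["tf", "idf", "vectorizer"]);       // assumed API
/// vectorizer.add_doc("doc-1".to_string(), &tf);
///
/// // Recompute the IDF only if the corpus generation changed since last time.
/// vectorizer.update_idf();
/// ```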
#[derive(Debug)]
pub struct TFIDFVectorizer<'a, N = f32, K = String, E = DefaultTFIDFEngine, C = DefaultCompare>
where
    N: Num + Copy,
    E: TFIDFEngine<N>,
    C: Compare<N>,
{
    /// TF vector for every document added so far.
    pub documents: Vec<TFVector<N, K>>,
    /// Ordered token list; its index order defines the vector dimensions.
    pub token_dim_sample: Vec<String>,
    /// Same tokens as `token_dim_sample`, kept in a set for O(1) membership checks.
    pub token_dim_set: HashSet<String>,
    /// Shared corpus used to derive document counts and IDF values.
    pub corpus_ref: &'a Corpus,
    /// Cached IDF vector, recomputed when the corpus generation changes.
    pub idf: IDFVector<N>,
    _marker: std::marker::PhantomData<E>,
    _compare_marker: std::marker::PhantomData<C>,
}

#[derive(Debug, Serialize, Deserialize)]
pub struct TFVector<N, K>
where
    N: Num + Copy,
{
    /// Sparse term-frequency vector aligned with the vectorizer's token dimensions.
    pub tf_vec: ZeroSpVec<N>,
    /// Total number of tokens in the source document.
    pub token_sum: u64,
    /// Denormalization factor returned by the TF-IDF engine alongside `tf_vec`.
    pub denormalize_num: f64,
    /// Caller-supplied document key.
    pub key: K,
}

impl<N, K> TFVector<N, K>
where
    N: Num + Copy,
{
    /// Releases any excess capacity held by the underlying sparse vector.
    pub fn shrink_to_fit(&mut self) {
        self.tf_vec.shrink_to_fit();
    }
}

#[derive(Debug, Serialize, Deserialize)]
pub struct IDFVector<N>
where
    N: Num,
{
    /// IDF value for each token dimension.
    pub idf_vec: Vec<N>,
    /// Denormalization factor returned by the TF-IDF engine alongside `idf_vec`.
    pub denormalize_num: f64,
    /// Corpus generation number observed when the IDF was last recalculated.
    pub latest_entropy: u64,
    /// Number of documents in the corpus at the last recalculation.
    pub doc_num: u64,
}

impl<N> IDFVector<N>
where
    N: Num,
{
    pub fn new() -> Self {
        Self {
            idf_vec: Vec::new(),
            denormalize_num: 1.0,
            latest_entropy: 0,
            doc_num: 0,
        }
    }
}

impl<'a, N, K, E, C> TFIDFVectorizer<'a, N, K, E, C>
where
    N: Num + Copy,
    E: TFIDFEngine<N>,
    C: Compare<N>,
{
    /// Creates an empty vectorizer bound to `corpus_ref` and computes the initial IDF.
    pub fn new(corpus_ref: &'a Corpus) -> Self {
        let mut instance = Self {
            documents: Vec::new(),
            token_dim_sample: Vec::new(),
            token_dim_set: HashSet::new(),
            corpus_ref,
            idf: IDFVector::new(),
            _marker: std::marker::PhantomData,
            _compare_marker: std::marker::PhantomData,
        };
        instance.re_calc_idf();
        instance
    }

    /// Replaces the corpus reference and recalculates the IDF against it.
    pub fn set_corpus_ref(&mut self, corpus_ref: &'a Corpus) {
        self.corpus_ref = corpus_ref;
        self.re_calc_idf();
    }

    /// Recalculates the IDF only if the corpus generation has advanced since
    /// the last calculation.
    pub fn update_idf(&mut self) {
        if self.corpus_ref.get_gen_num() != self.idf.latest_entropy {
            self.re_calc_idf();
        }
    }

    fn re_calc_idf(&mut self) {
        self.idf.latest_entropy = self.corpus_ref.get_gen_num();
        self.idf.doc_num = self.corpus_ref.get_doc_num();
        (self.idf.idf_vec, self.idf.denormalize_num) = E::idf_vec(&self.corpus_ref, &self.token_dim_sample);
    }
}

impl<'a, N, K, E, C> TFIDFVectorizer<'a, N, K, E, C>
where
    N: Num + Copy,
    E: TFIDFEngine<N>,
    C: Compare<N>,
{
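    /// Registers `doc` in the shared corpus, appends any previously unseen
    /// tokens as new dimensions, and stores the TF vector produced by the
    /// engine `E` under `doc_id`. The cached IDF is not refreshed here; call
    /// `update_idf` afterwards.
    ///
    /// A minimal sketch, assuming a `TokenFrequency` built elsewhere
    /// (`TokenFrequency::new` / `add_tokens` are illustrative, not the
    /// confirmed API of the `token` module):
    ///
    /// ```ignore
    /// let mut tf = TokenFrequency::new();
    /// tf.add_tokens(&["sparse", "vector"]);
    /// vectorizer.add_doc("doc-2".to_string(), &tf);
    /// ```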
    pub fn add_doc(&mut self, doc_id: K, doc: &TokenFrequency) {
        let token_sum = doc.token_sum();
        self.add_corpus(doc);
        // Register any tokens we have not seen before as new dimensions.
        for tok in doc.token_set_ref_str() {
            if !self.token_dim_set.contains(tok) {
                self.token_dim_sample.push(tok.to_string());
                self.token_dim_set.insert(tok.to_string());
            }
        }

        let (tf_vec, denormalize_num) = E::tf_vec(doc, &self.token_dim_sample);
        let mut doc = TFVector {
            tf_vec,
            token_sum,
            denormalize_num,
            key: doc_id,
        };
        doc.shrink_to_fit();
        self.documents.push(doc);
    }

    /// Adds the document's token set to the shared corpus.
    fn add_corpus(&mut self, doc: &TokenFrequency) {
        self.corpus_ref.add_set(&doc.token_set_ref_str());
    }
}