tf_idf_vectorizer/vectorizer/
tfidf.rs1use num_traits::Num;
2
3use crate::{utils::datastruct::vector::{ZeroSpVec, ZeroSpVecTrait}, vectorizer::{corpus::Corpus, token::TokenFrequency}};
4
5pub trait TFIDFEngine<N, K>: Send + Sync
6where
7 N: Num + Copy,
8{
9 fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<N>, f64);
17 fn tf_vec(freq: &TokenFrequency, token_dim_sample: &Vec<Box<str>>) -> (ZeroSpVec<N>, f64);
24}
25
/// Stateless marker type that the `TFIDFEngine` implementations in this file
/// are attached to.
///
/// It has no fields, so `new()` and `default()` produce the same value.
#[derive(Debug, Default)]
pub struct DefaultTFIDFEngine;

impl DefaultTFIDFEngine {
    /// Creates the engine. Equivalent to `DefaultTFIDFEngine::default()`.
    pub fn new() -> Self {
        DefaultTFIDFEngine
    }
}
35
36impl<K> TFIDFEngine<f32, K> for DefaultTFIDFEngine
37{
38 fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<f32>, f64) {
39 let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
40 let doc_num = corpus.get_doc_num() as f64;
41 for token in token_dim_sample.iter() {
42 let doc_freq = corpus.get_token_count(token);
43 idf_vec.push((doc_num / (doc_freq as f64 + 1.0)) as f32);
44 }
45 (idf_vec, 1.0)
46 }
47
48 fn tf_vec(freq: &TokenFrequency, token_dim_sample: &Vec<Box<str>>) -> (ZeroSpVec<f32>, f64) {
49 let total_count = freq.token_sum() as f32;
51 if total_count == 0.0 { return (ZeroSpVec::new(), total_count.into()); }
52 let mut raw: Vec<(usize, f32)> = Vec::with_capacity(freq.token_num());
53 let len = token_dim_sample.len();
54 let inv_total = 1.0f32 / total_count;
55 for (idx, token) in token_dim_sample.iter().enumerate() {
56 let count = freq.token_count(token) as f32;
57 if count == 0.0 { continue; }
58 raw.push((idx, count * inv_total));
59 }
60 (unsafe { ZeroSpVec::from_raw_iter(raw.into_iter(), len) }, total_count.into())
61 }
62}
63
64impl<K> TFIDFEngine<f64, K> for DefaultTFIDFEngine
65{
66 fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<f64>, f64) {
67 let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
68 let doc_num = corpus.get_doc_num() as f64;
69 for token in token_dim_sample.iter() {
70 let doc_freq = corpus.get_token_count(token);
71 idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
72 }
73 (idf_vec, 1.0)
74 }
75
76 fn tf_vec(freq: &TokenFrequency, token_dim_sample: &Vec<Box<str>>) -> (ZeroSpVec<f64>, f64) {
77 let total_count = freq.token_sum() as f64;
79 if total_count == 0.0 { return (ZeroSpVec::new(), total_count.into()); }
80 let mut raw: Vec<(usize, f64)> = Vec::with_capacity(freq.token_num());
81 let len = token_dim_sample.len();
82 let inv_total = 1.0f64 / total_count;
83 for (idx, token) in token_dim_sample.iter().enumerate() {
84 let count = freq.token_count(token) as f64;
85 if count == 0.0 { continue; }
86 raw.push((idx, count * inv_total));
87 }
88 (unsafe { ZeroSpVec::from_raw_iter(raw.into_iter(), len) }, total_count.into())
89 }
90}
91
92impl<K> TFIDFEngine<u32, K> for DefaultTFIDFEngine
93{
94 fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<u32>, f64) {
95 let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
96 let doc_num = corpus.get_doc_num() as f64;
97 for token in token_dim_sample.iter() {
98 let doc_freq = corpus.get_token_count(token);
99 idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
100 }
101 let max = idf_vec
102 .iter()
103 .max_by(|a, b| a.total_cmp(b))
104 .copied()
105 .unwrap_or(1.0);
106 (
107 idf_vec
108 .into_iter()
109 .map(|idf| (idf / max * u32::MAX as f64).ceil() as u32)
110 .collect(),
111 max
112 )
113 }
114
115 fn tf_vec(freq: &TokenFrequency, token_dim_sample: &Vec<Box<str>>) -> (ZeroSpVec<u32>, f64) {
116 let total_count = freq.token_sum() as f64;
118 if total_count == 0.0 { return (ZeroSpVec::new(), total_count); }
119 let mut raw: Vec<(usize, f64)> = Vec::with_capacity(freq.token_num());
120 let mut max_val = 0.0f64;
121 let inv_total = 1.0f64 / total_count;
122 for (idx, token) in token_dim_sample.iter().enumerate() {
123 let count = freq.token_count(token) as f64;
124 if count == 0.0 { continue; }
125 let v = count * inv_total;
126 if v > max_val { max_val = v; }
127 raw.push((idx, v));
128 }
129 let len = token_dim_sample.len();
130 if max_val == 0.0 { return (ZeroSpVec::new(), total_count); }
131 let mut vec_u32: Vec<(usize, u32)> = Vec::with_capacity(raw.len());
132 let mul_norm = (u32::MAX as f64) / max_val; for (idx, v) in raw.into_iter() {
134 let q = (v * mul_norm).ceil() as u32;
135 vec_u32.push((idx, q));
136 }
137 (unsafe { ZeroSpVec::from_raw_iter(vec_u32.into_iter(), len) }, total_count)
138 }
139}
140
141impl<K> TFIDFEngine<u16, K> for DefaultTFIDFEngine
142{
143 fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<u16>, f64) {
144 let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
145 let doc_num = corpus.get_doc_num() as f64;
146 for token in token_dim_sample.iter() {
147 let doc_freq = corpus.get_token_count(token);
148 idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
149 }
150 let max = idf_vec
151 .iter()
152 .max_by(|a, b| a.total_cmp(b))
153 .copied()
154 .unwrap_or(1.0);
155 (
156 idf_vec
157 .into_iter()
158 .map(|idf| (idf / max * u16::MAX as f64).ceil() as u16)
159 .collect(),
160 max
161 )
162 }
163
164 fn tf_vec(freq: &TokenFrequency, token_dim_sample: &Vec<Box<str>>) -> (ZeroSpVec<u16>, f64) {
165 let total_count = freq.token_sum() as f64;
167 let mut raw: Vec<(usize, f32)> = Vec::with_capacity(freq.token_num());
169 let mut max_val = 0.0f32;
170 let div_total = (1.0 / total_count) as f32;
171 for (idx, token) in token_dim_sample.iter().enumerate() {
172 let count = freq.token_count(token);
173 if count == 0 { continue; }
174 let v = count as f32 * div_total;
175 if v > max_val { max_val = v; }
176 raw.push((idx, v));
177 }
178 let len = token_dim_sample.len();
179 if max_val == 0.0 { return (ZeroSpVec::new(), total_count); }
180 let mut vec_u16: Vec<(usize, u16)> = Vec::with_capacity(raw.len());
182 let norm_div_max = (u16::MAX as f32) / max_val; for (idx, v) in raw.into_iter() {
184 let q = (v * norm_div_max).ceil() as u16;
185 vec_u16.push((idx, q));
186 }
187 (unsafe { ZeroSpVec::from_raw_iter(vec_u16.into_iter(), len) }, total_count)
188 }
189}
190
191impl<K> TFIDFEngine<u8, K> for DefaultTFIDFEngine
192{
193 fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<u8>, f64) {
194 let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
195 let doc_num = corpus.get_doc_num() as f64;
196 for token in token_dim_sample.iter() {
197 let doc_freq = corpus.get_token_count(token);
198 idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
199 }
200 let max = idf_vec
201 .iter()
202 .max_by(|a, b| a.total_cmp(b))
203 .copied()
204 .unwrap_or(1.0);
205 (
206 idf_vec
207 .into_iter()
208 .map(|idf| (idf / max * u8::MAX as f64).ceil() as u8)
209 .collect(),
210 max
211 )
212 }
213
214 fn tf_vec(freq: &TokenFrequency, token_dim_sample: &Vec<Box<str>>) -> (ZeroSpVec<u8>, f64) {
215 let total_count_f64 = freq.token_sum() as f64;
217 if total_count_f64 == 0.0 { return (ZeroSpVec::new(), total_count_f64); }
218 let total_count = total_count_f64 as f32;
220 let mut raw: Vec<(usize, f32)> = Vec::with_capacity(freq.token_num());
221 let mut max_val = 0.0f32;
222 let inv_total = 1.0f32 / total_count;
223 for (idx, token) in token_dim_sample.iter().enumerate() {
224 let count = freq.token_count(token) as f32;
225 if count == 0.0 { continue; }
226 let v = count * inv_total;
227 if v > max_val { max_val = v; }
228 raw.push((idx, v));
229 }
230 let len = token_dim_sample.len();
231 if max_val == 0.0 { return (ZeroSpVec::new(), total_count_f64); }
232 let mut vec_u8: Vec<(usize, u8)> = Vec::with_capacity(raw.len());
233 let mul_norm = (u8::MAX as f32) / max_val; for (idx, v) in raw.into_iter() {
235 let q = (v * mul_norm).ceil() as u8;
236 vec_u8.push((idx, q));
237 }
238 (unsafe { ZeroSpVec::from_raw_iter(vec_u8.into_iter(), len) }, total_count_f64)
239 }
240}