tf_idf_vectorizer/vectorizer/
tfidf.rs1use crate::{utils::math::vector::{ZeroSpVec, ZeroSpVecTrait}, vectorizer::{corpus::Corpus, token::TokenFrequency}};
2
3pub trait TFIDFEngine<N>
4where
5 N: num::Num + Copy,
6{
7 fn idf_vec(corpus: &Corpus, token_dim_sample: &[String]) -> (Vec<N>, f64);
15 fn tf_vec(freq: &TokenFrequency, token_dim_sample: &[String]) -> (ZeroSpVec<N>, f64);
22 fn tfidf_iter_calc(tf: impl Iterator<Item = N>, tf_denorm: f64, idf: impl Iterator<Item = N>, idf_denorm: f64) -> (impl Iterator<Item = N>, f64);
33 fn tfidf_iter_calc_sparse(tf: impl Iterator<Item = (usize, N)>, tf_denorm: f64, idf: &Vec<N>, idf_denorm: f64) -> (impl Iterator<Item = (usize, N)>, f64);
34}
35
/// Stateless default engine.
///
/// The type carries no data; it only serves as the implementor of the
/// `TFIDFEngine` trait for the numeric types supported in this file.
#[derive(Debug, Default)]
pub struct DefaultTFIDFEngine;

impl DefaultTFIDFEngine {
    /// Creates the engine. Equivalent to `DefaultTFIDFEngine::default()`.
    pub fn new() -> Self {
        Self
    }
}
45
46impl TFIDFEngine<f32> for DefaultTFIDFEngine
47{
48 fn idf_vec(corpus: &Corpus, token_dim_sample: &[String]) -> (Vec<f32>, f64) {
49 let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
50 let doc_num = corpus.get_doc_num() as f64;
51 for token in token_dim_sample {
52 let doc_freq = corpus.get_token_count(token);
53 idf_vec.push((doc_num / (doc_freq as f64 + 1.0)) as f32);
54 }
55 (idf_vec, 1.0)
56 }
57
58 fn tf_vec(freq: &TokenFrequency, token_dim_sample: &[String]) -> (ZeroSpVec<f32>, f64) {
59 let total_count = freq.token_sum() as f32;
61 if total_count == 0.0 { return (ZeroSpVec::new(), total_count.into()); }
62 let mut raw: Vec<(usize, f32)> = Vec::with_capacity(freq.token_num());
63 for (idx, token) in token_dim_sample.iter().enumerate() {
64 let count = freq.token_count(token) as f32;
65 if count == 0.0 { continue; }
66 raw.push((idx, count / total_count));
67 }
68 (ZeroSpVec::from_raw_iter(raw.into_iter()), total_count.into())
69 }
70
71 fn tfidf_iter_calc(tf: impl Iterator<Item = f32>, tf_denorm: f64, idf: impl Iterator<Item = f32>, idf_denorm: f64) -> (impl Iterator<Item = f32>, f64) {
72 let tfidf = tf.zip(idf).map(move |(tf_val, idf_val)| {
73 let tfidf = tf_val * idf_val;
74 tfidf
75 });
76 (tfidf, tf_denorm * idf_denorm)
77 }
78
79 fn tfidf_iter_calc_sparse(tf: impl Iterator<Item = (usize, f32)>, tf_denorm: f64, idf: &Vec<f32>, idf_denorm: f64) -> (impl Iterator<Item = (usize, f32)>, f64) {
80 let tfidf = tf.map(move |(idx, tf_val)| {
81 let idf_val = idf.get(idx).copied().unwrap_or(0.0);
82 (idx, tf_val * idf_val)
83 });
84 (tfidf, tf_denorm * idf_denorm)
85 }
86}
87
88impl TFIDFEngine<f64> for DefaultTFIDFEngine
89{
90 fn idf_vec(corpus: &Corpus, token_dim_sample: &[String]) -> (Vec<f64>, f64) {
91 let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
92 let doc_num = corpus.get_doc_num() as f64;
93 for token in token_dim_sample {
94 let doc_freq = corpus.get_token_count(token);
95 idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
96 }
97 (idf_vec, 1.0)
98 }
99
100 fn tf_vec(freq: &TokenFrequency, token_dim_sample: &[String]) -> (ZeroSpVec<f64>, f64) {
101 let total_count = freq.token_sum() as f64;
103 if total_count == 0.0 { return (ZeroSpVec::new(), total_count.into()); }
104 let mut raw: Vec<(usize, f64)> = Vec::with_capacity(freq.token_num());
105 for (idx, token) in token_dim_sample.iter().enumerate() {
106 let count = freq.token_count(token) as f64;
107 if count == 0.0 { continue; }
108 raw.push((idx, count / total_count));
109 }
110 (ZeroSpVec::from_raw_iter(raw.into_iter()), total_count.into())
111 }
112
113 fn tfidf_iter_calc(tf: impl Iterator<Item = f64>, tf_denorm: f64, idf: impl Iterator<Item = f64>, idf_denorm: f64) -> (impl Iterator<Item = f64>, f64) {
114 let tfidf = tf.zip(idf).map(move |(tf_val, idf_val)| {
115 let tfidf = tf_val * idf_val;
116 tfidf
117 });
118 (tfidf, tf_denorm * idf_denorm)
119 }
120
121 fn tfidf_iter_calc_sparse(tf: impl Iterator<Item = (usize, f64)>, tf_denorm: f64, idf: &Vec<f64>, idf_denorm: f64) -> (impl Iterator<Item = (usize, f64)>, f64) {
122 let tfidf = tf.map(move |(idx, tf_val)| {
123 let idf_val = idf.get(idx).copied().unwrap_or(0.0);
124 (idx, tf_val * idf_val)
125 });
126 (tfidf, tf_denorm * idf_denorm)
127 }
128}
129
130impl TFIDFEngine<u32> for DefaultTFIDFEngine
131{
132 fn idf_vec(corpus: &Corpus, token_dim_sample: &[String]) -> (Vec<u32>, f64) {
133 let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
134 let doc_num = corpus.get_doc_num() as f64;
135 for token in token_dim_sample {
136 let doc_freq = corpus.get_token_count(token);
137 idf_vec.push((doc_num / (doc_freq as f64 + 1.0)) as u32);
138 }
139 (idf_vec, 1.0)
140 }
141
142 fn tf_vec(freq: &TokenFrequency, token_dim_sample: &[String]) -> (ZeroSpVec<u32>, f64) {
143 let total_count = freq.token_sum() as f64;
145 if total_count == 0.0 { return (ZeroSpVec::new(), total_count); }
146 let mut raw: Vec<(usize, f64)> = Vec::with_capacity(freq.token_num());
147 let mut max_val = 0.0f64;
148 for (idx, token) in token_dim_sample.iter().enumerate() {
149 let count = freq.token_count(token) as f64;
150 if count == 0.0 { continue; }
151 let v = count / total_count;
152 if v > max_val { max_val = v; }
153 raw.push((idx, v));
154 }
155 if max_val == 0.0 { return (ZeroSpVec::new(), total_count); }
156 let mut vec_u32: Vec<(usize, u32)> = Vec::with_capacity(raw.len());
157 for (idx, v) in raw.into_iter() {
158 let q = (v / max_val * u32::MAX as f64).ceil() as u32;
159 vec_u32.push((idx, q));
160 }
161 (ZeroSpVec::from_raw_iter(vec_u32.into_iter()), total_count)
162 }
163
164 fn tfidf_iter_calc(tf: impl Iterator<Item = u32>, tf_denorm: f64, idf: impl Iterator<Item = u32>, idf_denorm: f64) -> (impl Iterator<Item = u32>, f64) {
165 let tfidf = tf.zip(idf).map(move |(tf_val, idf_val)| {
180 let tfidf = (tf_val as u64 * idf_val as u64) / u32::MAX as u64;
181 tfidf as u32
182 });
183 (tfidf, tf_denorm * idf_denorm)
184 }
185
186 fn tfidf_iter_calc_sparse(tf: impl Iterator<Item = (usize, u32)>, tf_denorm: f64, idf: &Vec<u32>, idf_denorm: f64) -> (impl Iterator<Item = (usize, u32)>, f64) {
187 let tfidf = tf.map(move |(idx, tf_val)| {
188 let idf_val = *idf.get(idx).unwrap_or(&0);
189 let v = (tf_val as u64 * idf_val as u64 / u32::MAX as u64) as u32;
190 (idx, v)
191 });
192 (tfidf, tf_denorm * idf_denorm)
193 }
194}
195
196impl TFIDFEngine<u16> for DefaultTFIDFEngine
197{
198 fn idf_vec(corpus: &Corpus, token_dim_sample: &[String]) -> (Vec<u16>, f64) {
199 let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
200 let doc_num = corpus.get_doc_num() as f64;
201 for token in token_dim_sample {
202 let doc_freq = corpus.get_token_count(token);
203 idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
204 }
205 let max = idf_vec
206 .iter()
207 .max_by(|a, b| a.total_cmp(b))
208 .copied()
209 .unwrap_or(1.0);
210 (
211 idf_vec
212 .into_iter()
213 .map(|idf| (idf / max * u16::MAX as f64).ceil() as u16)
214 .collect(),
215 max
216 )
217 }
218
219 fn tf_vec(freq: &TokenFrequency, token_dim_sample: &[String]) -> (ZeroSpVec<u16>, f64) {
220 let total_count = freq.token_sum() as f64;
222 let mut raw: Vec<(usize, f64)> = Vec::new();
224 raw.reserve(freq.token_num());
225 let mut max_val = 0.0f64;
226 for (idx, token) in token_dim_sample.iter().enumerate() {
227 let count = freq.token_count(token) as f64;
228 if count == 0.0 { continue; }
229 let v = count / total_count;
230 if v > max_val { max_val = v; }
231 raw.push((idx, v));
232 }
233 if max_val == 0.0 { return (ZeroSpVec::new(), total_count);
235 }
236 let mut vec_u16: Vec<(usize, u16)> = Vec::with_capacity(raw.len());
238 for (idx, v) in raw.into_iter() {
239 let q = (v / max_val * u16::MAX as f64).ceil() as u16;
240 vec_u16.push((idx, q));
241 }
242 (ZeroSpVec::from_raw_iter(vec_u16.into_iter()), total_count)
243 }
244
245 fn tfidf_iter_calc(tf: impl Iterator<Item = u16>, tf_denorm: f64, idf: impl Iterator<Item = u16>, idf_denorm: f64) -> (impl Iterator<Item = u16>, f64) {
246 let tfidf = tf.zip(idf).map(move |(tf_val, idf_val)| {
247 let tfidf = (tf_val as u32 * idf_val as u32) / u16::MAX as u32;
248 tfidf as u16
249 });
250 (tfidf, tf_denorm * idf_denorm)
251 }
252
253 fn tfidf_iter_calc_sparse(tf: impl Iterator<Item = (usize, u16)>, tf_denorm: f64, idf: &Vec<u16>, idf_denorm: f64) -> (impl Iterator<Item = (usize, u16)>, f64) {
254 let tfidf = tf.map(move |(idx, tf_val)| {
255 let idf_val = *idf.get(idx).unwrap_or(&0);
256 let v = (tf_val as u32 * idf_val as u32 / u16::MAX as u32) as u16;
257 (idx, v)
258 });
259 (tfidf, tf_denorm * idf_denorm)
260 }
261}
262
263impl TFIDFEngine<u8> for DefaultTFIDFEngine
264{
265 fn idf_vec(corpus: &Corpus, token_dim_sample: &[String]) -> (Vec<u8>, f64) {
266 let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
267 let doc_num = corpus.get_doc_num() as f64;
268 for token in token_dim_sample {
269 let doc_freq = corpus.get_token_count(token);
270 idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
271 }
272 let max = idf_vec
273 .iter()
274 .max_by(|a, b| a.total_cmp(b))
275 .copied()
276 .unwrap_or(1.0);
277 (
278 idf_vec
279 .into_iter()
280 .map(|idf| (idf / max * u8::MAX as f64).ceil() as u8)
281 .collect(),
282 max
283 )
284 }
285
286 fn tf_vec(freq: &TokenFrequency, token_dim_sample: &[String]) -> (ZeroSpVec<u8>, f64) {
287 let total_count = freq.token_sum() as f64;
289 if total_count == 0.0 { return (ZeroSpVec::new(), total_count); }
290 let mut raw: Vec<(usize, f64)> = Vec::with_capacity(freq.token_num());
291 let mut max_val = 0.0f64;
292 for (idx, token) in token_dim_sample.iter().enumerate() {
293 let count = freq.token_count(token) as f64;
294 if count == 0.0 { continue; }
295 let v = count / total_count;
296 if v > max_val { max_val = v; }
297 raw.push((idx, v));
298 }
299 if max_val == 0.0 { return (ZeroSpVec::new(), total_count); }
300 let mut vec_u8: Vec<(usize, u8)> = Vec::with_capacity(raw.len());
301 for (idx, v) in raw.into_iter() {
302 let q = (v / max_val * u8::MAX as f64).ceil() as u8;
303 vec_u8.push((idx, q));
304 }
305 (ZeroSpVec::from_raw_iter(vec_u8.into_iter()), total_count)
306 }
307
308 fn tfidf_iter_calc(tf: impl Iterator<Item = u8>, tf_denorm: f64, idf: impl Iterator<Item = u8>, idf_denorm: f64) -> (impl Iterator<Item = u8>, f64) {
309 let tfidf = tf.zip(idf).map(move |(tf_val, idf_val)| {
310 let tfidf = (tf_val as u32 * idf_val as u32) / u8::MAX as u32;
311 tfidf as u8
312 });
313 (tfidf, tf_denorm * idf_denorm)
314 }
315
316 fn tfidf_iter_calc_sparse(tf: impl Iterator<Item = (usize, u8)>, tf_denorm: f64, idf: &Vec<u8>, idf_denorm: f64) -> (impl Iterator<Item = (usize, u8)>, f64) {
317 let tfidf = tf.map(move |(idx, tf_val)| {
318 let idf_val = *idf.get(idx).unwrap_or(&0);
319 let v = (tf_val as u32 * idf_val as u32 / u8::MAX as u32) as u8;
320 (idx, v)
321 });
322 (tfidf, tf_denorm * idf_denorm)
323 }
324}
325
326
327
328
329
330
331