// tf_idf_vectorizer/vectorizer/tfidf.rs
2use half::f16;
3use num_traits::Num;
4
5use crate::{utils::datastruct::{map::IndexSet, vector::{ZeroSpVec, ZeroSpVecTrait}}, vectorizer::{corpus::Corpus, token::TokenFrequency}};
6
7pub trait TFIDFEngine<N, K>: Send + Sync
8where
9 N: Num + Copy,
10{
11 fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<N>, f64);
19 fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>>) -> (ZeroSpVec<N>, f64);
26}
27
/// Stateless marker type that selects the `TFIDFEngine` implementations
/// defined for it.
#[derive(Debug)]
pub struct DefaultTFIDFEngine;

impl DefaultTFIDFEngine {
    /// Creates the engine. The type carries no data, so this is free.
    pub fn new() -> Self {
        Self
    }
}

impl Default for DefaultTFIDFEngine {
    /// Equivalent to [`DefaultTFIDFEngine::new`].
    fn default() -> Self {
        Self::new()
    }
}
37
38impl<K> TFIDFEngine<f16, K> for DefaultTFIDFEngine {
39 fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<f16>, f64) {
40 let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
41 let doc_num = corpus.get_doc_num() as f64;
42 for token in token_dim_sample.iter() {
43 let doc_freq = corpus.get_token_count(token);
44 idf_vec.push(f16::from_f64(doc_num / (doc_freq as f64 + 1.0)));
45 }
46 (idf_vec, 1.0)
47 }
48
49 fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>>) -> (ZeroSpVec<f16>, f64) {
50 let total_count = freq.token_sum() as f32;
52 if total_count == 0.0 { return (ZeroSpVec::new(), total_count.into()); }
53 let len = token_dim_sample.len();
54 let inv_total = 1.0f32 / total_count;
55 let mut raw = freq.iter().filter_map(|(token, count)| {
56 let idx = token_dim_sample.get_index(token)?;
57 Some((idx, f16::from_f32((count as f32) * inv_total)))
58 }).collect::<Vec<_>>();
59 raw.sort_unstable_by_key(|(idx, _)| *idx);
60 (unsafe { ZeroSpVec::from_sparse_iter(raw.into_iter(), len) }, total_count.into())
61 }
62}
63
64impl<K> TFIDFEngine<f32, K> for DefaultTFIDFEngine
65{
66 fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<f32>, f64) {
67 let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
68 let doc_num = corpus.get_doc_num() as f64;
69 for token in token_dim_sample.iter() {
70 let doc_freq = corpus.get_token_count(token);
71 idf_vec.push((doc_num / (doc_freq as f64 + 1.0)) as f32);
72 }
73 (idf_vec, 1.0)
74 }
75
76 fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>>) -> (ZeroSpVec<f32>, f64) {
77 let total_count = freq.token_sum() as f32;
79 if total_count == 0.0 { return (ZeroSpVec::new(), total_count.into()); }
80 let len = token_dim_sample.len();
81 let inv_total = 1.0f32 / total_count;
82 let mut raw = freq.iter().filter_map(|(token, count)| {
83 let idx = token_dim_sample.get_index(token)?;
84 Some((idx, (count as f32) * inv_total))
85 }).collect::<Vec<_>>();
86 raw.sort_unstable_by_key(|(idx, _)| *idx);
87 (unsafe { ZeroSpVec::from_sparse_iter(raw.into_iter(), len) }, total_count.into())
88 }
89}
90
91impl<K> TFIDFEngine<f64, K> for DefaultTFIDFEngine
92{
93 fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<f64>, f64) {
94 let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
95 let doc_num = corpus.get_doc_num() as f64;
96 for token in token_dim_sample.iter() {
97 let doc_freq = corpus.get_token_count(token);
98 idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
99 }
100 (idf_vec, 1.0)
101 }
102
103 fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>>) -> (ZeroSpVec<f64>, f64) {
104 let total_count = freq.token_sum() as f64;
106 if total_count == 0.0 { return (ZeroSpVec::new(), total_count.into()); }
107 let len = token_dim_sample.len();
108 let inv_total = 1.0f64 / total_count;
109 let mut raw = freq.iter().filter_map(|(token, count)| {
110 let idx = token_dim_sample.get_index(token)?;
111 Some((idx, (count as f64) * inv_total))
112 }).collect::<Vec<_>>();
113 raw.sort_unstable_by_key(|(idx, _)| *idx);
114 (unsafe { ZeroSpVec::from_sparse_iter(raw.into_iter(), len) }, total_count.into())
115 }
116}
117
118impl<K> TFIDFEngine<u32, K> for DefaultTFIDFEngine
119{
120 fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<u32>, f64) {
121 let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
122 let doc_num = corpus.get_doc_num() as f64;
123 for token in token_dim_sample.iter() {
124 let doc_freq = corpus.get_token_count(token);
125 idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
126 }
127 let max = idf_vec
128 .iter()
129 .max_by(|a, b| a.total_cmp(b))
130 .copied()
131 .unwrap_or(1.0);
132 (
133 idf_vec
134 .into_iter()
135 .map(|idf| (idf / max * u32::MAX as f64).ceil() as u32)
136 .collect(),
137 max
138 )
139 }
140
141 fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>>) -> (ZeroSpVec<u32>, f64) {
142 let total_count = freq.token_sum() as f64;
144 if total_count == 0.0 { return (ZeroSpVec::new(), total_count); }
145 let mut max_val = 0.0f64;
146 let inv_total = 1.0f64 / total_count;
147 let mut raw: Vec<(usize, f64)> = freq.iter().filter_map(|(token, count)| {
148 let idx = token_dim_sample.get_index(token)?;
149 let v = (count as f64) * inv_total;
150 max_val = max_val.max(v);
151 Some((idx, v))
152 }).collect::<Vec<_>>();
153 let len = token_dim_sample.len();
154 let mul_norm = (u32::MAX as f64) / max_val; let vec_u32 = raw.drain(..)
156 .map(|(idx, v)| {
157 let q = (v * mul_norm).ceil() as u32;
158 (idx, q)
159 })
160 .collect::<Vec<_>>();
161 (unsafe { ZeroSpVec::from_sparse_iter(vec_u32.into_iter(), len) }, total_count)
162 }
163}
164
165impl<K> TFIDFEngine<u16, K> for DefaultTFIDFEngine
166{
167 fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<u16>, f64) {
168 let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
169 let doc_num = corpus.get_doc_num() as f64;
170 for token in token_dim_sample.iter() {
171 let doc_freq = corpus.get_token_count(token);
172 idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
173 }
174 let max = idf_vec
175 .iter()
176 .max_by(|a, b| a.total_cmp(b))
177 .copied()
178 .unwrap_or(1.0);
179 (
180 idf_vec
181 .into_iter()
182 .map(|idf| (idf / max * u16::MAX as f64).ceil() as u16)
183 .collect(),
184 max
185 )
186 }
187
188 fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>>) -> (ZeroSpVec<u16>, f64) {
189 let total_count = freq.token_sum() as f64;
191 let mut max_val = 0.0f32;
193 let div_total = (1.0 / total_count) as f32;
194 let raw = freq.iter().filter_map(|(token, count)| {
195 let idx = token_dim_sample.get_index(token)?;
196 let v = (count as f32) * div_total;
197 max_val = max_val.max(v);
198 Some((idx, v))
199 }).collect::<Vec<_>>();
200 let len = token_dim_sample.len();
201 let norm_div_max = (u16::MAX as f32) / max_val; let vec_u16 = raw.into_iter()
204 .map(|(idx, v)| {
205 let q = (v * norm_div_max).ceil() as u16;
206 (idx, q)
207 })
208 .collect::<Vec<_>>();
209 (unsafe { ZeroSpVec::from_sparse_iter(vec_u16.into_iter(), len) }, total_count)
210 }
211}
212
213impl<K> TFIDFEngine<u8, K> for DefaultTFIDFEngine
214{
215 fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<u8>, f64) {
216 let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
217 let doc_num = corpus.get_doc_num() as f64;
218 for token in token_dim_sample.iter() {
219 let doc_freq = corpus.get_token_count(token);
220 idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
221 }
222 let max = idf_vec
223 .iter()
224 .max_by(|a, b| a.total_cmp(b))
225 .copied()
226 .unwrap_or(1.0);
227 (
228 idf_vec
229 .into_iter()
230 .map(|idf| (idf / max * u8::MAX as f64).ceil() as u8)
231 .collect(),
232 max
233 )
234 }
235
236 fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>>) -> (ZeroSpVec<u8>, f64) {
237 let total_count_f64 = freq.token_sum() as f64;
239 if total_count_f64 == 0.0 { return (ZeroSpVec::new(), total_count_f64); }
240 let total_count = total_count_f64 as f32;
242 let mut max_val = 0.0f32;
243 let inv_total = 1.0f32 / total_count;
244 let raw = freq.iter().filter_map(|(token, count)| {
245 let idx = token_dim_sample.get_index(token)?;
246 let v = (count as f32) * inv_total;
247 max_val = max_val.max(v);
248 Some((idx, v))
249 }).collect::<Vec<_>>();
250 let len = token_dim_sample.len();
251 if max_val == 0.0 { return (ZeroSpVec::new(), total_count_f64); }
252 let mul_norm = (u8::MAX as f32) / max_val; let vec_u8 = raw.into_iter()
254 .map(|(idx, v)| {
255 let q = (v * mul_norm).ceil() as u8;
256 (idx, q)
257 })
258 .collect::<Vec<_>>();
259 (unsafe { ZeroSpVec::from_sparse_iter(vec_u8.into_iter(), len) }, total_count_f64)
260 }
261}