tf_idf_vectorizer/vectorizer/
tfidf.rs1
2use num_traits::Num;
3
4use crate::{utils::datastruct::{map::IndexSet, vector::{ZeroSpVec, ZeroSpVecTrait}}, vectorizer::{corpus::Corpus, token::TokenFrequency}};
5
6pub trait TFIDFEngine<N, K>: Send + Sync
7where
8 N: Num + Copy,
9{
10 fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<N>, f64);
18 fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>>) -> (ZeroSpVec<N>, f64);
25}
26
/// Stateless marker type that carries the `TFIDFEngine` implementations
/// defined in this file (`f32`, `f64`, `u32`, `u16` and `u8` element types).
#[derive(Debug, Default)]
pub struct DefaultTFIDFEngine;

impl DefaultTFIDFEngine {
    /// Creates the engine. Equivalent to `DefaultTFIDFEngine::default()`.
    pub fn new() -> Self {
        Self
    }
}
36
37impl<K> TFIDFEngine<f32, K> for DefaultTFIDFEngine
38{
39 fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<f32>, f64) {
40 let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
41 let doc_num = corpus.get_doc_num() as f64;
42 for token in token_dim_sample.iter() {
43 let doc_freq = corpus.get_token_count(token);
44 idf_vec.push((doc_num / (doc_freq as f64 + 1.0)) as f32);
45 }
46 (idf_vec, 1.0)
47 }
48
49 fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>>) -> (ZeroSpVec<f32>, f64) {
50 let total_count = freq.token_sum() as f32;
52 if total_count == 0.0 { return (ZeroSpVec::new(), total_count.into()); }
53 let len = token_dim_sample.len();
54 let inv_total = 1.0f32 / total_count;
55 let mut raw = freq.iter().map(|(token, count)| {
56 let idx = token_dim_sample.get_index(token).unwrap();
57 (idx, (count as f32) * inv_total)
58 }).collect::<Vec<_>>();
59 raw.sort_unstable_by_key(|(idx, _)| *idx);
60 (unsafe { ZeroSpVec::from_sparse_iter(raw.into_iter(), len) }, total_count.into())
61 }
62}
63
64impl<K> TFIDFEngine<f64, K> for DefaultTFIDFEngine
65{
66 fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<f64>, f64) {
67 let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
68 let doc_num = corpus.get_doc_num() as f64;
69 for token in token_dim_sample.iter() {
70 let doc_freq = corpus.get_token_count(token);
71 idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
72 }
73 (idf_vec, 1.0)
74 }
75
76 fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>>) -> (ZeroSpVec<f64>, f64) {
77 let total_count = freq.token_sum() as f64;
79 if total_count == 0.0 { return (ZeroSpVec::new(), total_count.into()); }
80 let len = token_dim_sample.len();
81 let inv_total = 1.0f64 / total_count;
82 let mut raw = freq.iter().map(|(token, count)| {
83 let idx = token_dim_sample.get_index(token).unwrap();
84 (idx, (count as f64) * inv_total)
85 }).collect::<Vec<_>>();
86 raw.sort_unstable_by_key(|(idx, _)| *idx);
87 (unsafe { ZeroSpVec::from_sparse_iter(raw.into_iter(), len) }, total_count.into())
88 }
89}
90
91impl<K> TFIDFEngine<u32, K> for DefaultTFIDFEngine
92{
93 fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<u32>, f64) {
94 let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
95 let doc_num = corpus.get_doc_num() as f64;
96 for token in token_dim_sample.iter() {
97 let doc_freq = corpus.get_token_count(token);
98 idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
99 }
100 let max = idf_vec
101 .iter()
102 .max_by(|a, b| a.total_cmp(b))
103 .copied()
104 .unwrap_or(1.0);
105 (
106 idf_vec
107 .into_iter()
108 .map(|idf| (idf / max * u32::MAX as f64).ceil() as u32)
109 .collect(),
110 max
111 )
112 }
113
114 fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>>) -> (ZeroSpVec<u32>, f64) {
115 let total_count = freq.token_sum() as f64;
117 if total_count == 0.0 { return (ZeroSpVec::new(), total_count); }
118 let mut max_val = 0.0f64;
119 let inv_total = 1.0f64 / total_count;
120 let mut raw: Vec<(usize, f64)> = freq.iter().map(|(token, count)| {
121 let idx = token_dim_sample.get_index(token).unwrap();
122 let v = (count as f64) * inv_total;
123 max_val = max_val.max(v);
124 (idx, v)
125 }).collect::<Vec<_>>();
126 let len = token_dim_sample.len();
127 let mul_norm = (u32::MAX as f64) / max_val; let vec_u32 = raw.drain(..)
129 .map(|(idx, v)| {
130 let q = (v * mul_norm).ceil() as u32;
131 (idx, q)
132 })
133 .collect::<Vec<_>>();
134 (unsafe { ZeroSpVec::from_sparse_iter(vec_u32.into_iter(), len) }, total_count)
135 }
136}
137
138impl<K> TFIDFEngine<u16, K> for DefaultTFIDFEngine
139{
140 fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<u16>, f64) {
141 let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
142 let doc_num = corpus.get_doc_num() as f64;
143 for token in token_dim_sample.iter() {
144 let doc_freq = corpus.get_token_count(token);
145 idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
146 }
147 let max = idf_vec
148 .iter()
149 .max_by(|a, b| a.total_cmp(b))
150 .copied()
151 .unwrap_or(1.0);
152 (
153 idf_vec
154 .into_iter()
155 .map(|idf| (idf / max * u16::MAX as f64).ceil() as u16)
156 .collect(),
157 max
158 )
159 }
160
161 fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>>) -> (ZeroSpVec<u16>, f64) {
162 let total_count = freq.token_sum() as f64;
164 let mut max_val = 0.0f32;
166 let div_total = (1.0 / total_count) as f32;
167 let raw = freq.iter().map(|(token, count)| {
168 let idx = token_dim_sample.get_index(token).unwrap();
169 let v = (count as f32) * div_total;
170 max_val = max_val.max(v);
171 (idx, v)
172 }).collect::<Vec<_>>();
173 let len = token_dim_sample.len();
174 let norm_div_max = (u16::MAX as f32) / max_val; let vec_u16 = raw.into_iter()
177 .map(|(idx, v)| {
178 let q = (v * norm_div_max).ceil() as u16;
179 (idx, q)
180 })
181 .collect::<Vec<_>>();
182 (unsafe { ZeroSpVec::from_sparse_iter(vec_u16.into_iter(), len) }, total_count)
183 }
184}
185
186impl<K> TFIDFEngine<u8, K> for DefaultTFIDFEngine
187{
188 fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<u8>, f64) {
189 let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
190 let doc_num = corpus.get_doc_num() as f64;
191 for token in token_dim_sample.iter() {
192 let doc_freq = corpus.get_token_count(token);
193 idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
194 }
195 let max = idf_vec
196 .iter()
197 .max_by(|a, b| a.total_cmp(b))
198 .copied()
199 .unwrap_or(1.0);
200 (
201 idf_vec
202 .into_iter()
203 .map(|idf| (idf / max * u8::MAX as f64).ceil() as u8)
204 .collect(),
205 max
206 )
207 }
208
209 fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>>) -> (ZeroSpVec<u8>, f64) {
210 let total_count_f64 = freq.token_sum() as f64;
212 if total_count_f64 == 0.0 { return (ZeroSpVec::new(), total_count_f64); }
213 let total_count = total_count_f64 as f32;
215 let mut max_val = 0.0f32;
216 let inv_total = 1.0f32 / total_count;
217 let raw = freq.iter().map(|(token, count)| {
218 let idx = token_dim_sample.get_index(token).unwrap();
219 let v = (count as f32) * inv_total;
220 max_val = max_val.max(v);
221 (idx, v)
222 }).collect::<Vec<_>>();
223 let len = token_dim_sample.len();
224 if max_val == 0.0 { return (ZeroSpVec::new(), total_count_f64); }
225 let mul_norm = (u8::MAX as f32) / max_val; let vec_u8 = raw.into_iter()
227 .map(|(idx, v)| {
228 let q = (v * mul_norm).ceil() as u8;
229 (idx, q)
230 })
231 .collect::<Vec<_>>();
232 (unsafe { ZeroSpVec::from_sparse_iter(vec_u8.into_iter(), len) }, total_count_f64)
233 }
234}