tf_idf_vectorizer/vectorizer/
tfidf.rs1use ahash::RandomState;
2use indexmap::IndexSet;
3use num_traits::Num;
4
5use crate::{utils::math::vector::{ZeroSpVec, ZeroSpVecTrait}, vectorizer::{corpus::Corpus, token::TokenFrequency}};
6
7pub trait TFIDFEngine<N>: Send + Sync
8where
9 N: Num + Copy,
10{
11 fn idf_vec(corpus: &Corpus, token_dim_sample: &IndexSet<Box<str>, RandomState>) -> (Vec<N>, f64);
19 fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>, RandomState>) -> (ZeroSpVec<N>, f64);
26}
27
/// Stateless marker type that carries the built-in `TFIDFEngine`
/// implementations.
///
/// The type holds no data, so it is `Copy` and can be created with either
/// [`DefaultTFIDFEngine::new`] or `Default::default()`.
#[derive(Debug, Clone, Copy, Default)]
pub struct DefaultTFIDFEngine;

impl DefaultTFIDFEngine {
    /// Creates the engine. Equivalent to `DefaultTFIDFEngine::default()`.
    pub fn new() -> Self {
        DefaultTFIDFEngine
    }
}
37
38impl TFIDFEngine<f32> for DefaultTFIDFEngine
39{
40 fn idf_vec(corpus: &Corpus, token_dim_sample: &IndexSet<Box<str>, RandomState>) -> (Vec<f32>, f64) {
41 let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
42 let doc_num = corpus.get_doc_num() as f64;
43 for token in token_dim_sample {
44 let doc_freq = corpus.get_token_count(token);
45 idf_vec.push((doc_num / (doc_freq as f64 + 1.0)) as f32);
46 }
47 (idf_vec, 1.0)
48 }
49
50 fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>, RandomState>) -> (ZeroSpVec<f32>, f64) {
51 let total_count = freq.token_sum() as f32;
53 if total_count == 0.0 { return (ZeroSpVec::new(), total_count.into()); }
54 let mut raw: Vec<(usize, f32)> = Vec::with_capacity(freq.token_num());
55 let len = token_dim_sample.len();
56 let inv_total = 1.0f32 / total_count;
57 for (idx, token) in token_dim_sample.iter().enumerate() {
58 let count = freq.token_count(token) as f32;
59 if count == 0.0 { continue; }
60 raw.push((idx, count * inv_total));
61 }
62 (unsafe { ZeroSpVec::from_raw_iter(raw.into_iter(), len) }, total_count.into())
63 }
64}
65
66impl TFIDFEngine<f64> for DefaultTFIDFEngine
67{
68 fn idf_vec(corpus: &Corpus, token_dim_sample: &IndexSet<Box<str>, RandomState>) -> (Vec<f64>, f64) {
69 let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
70 let doc_num = corpus.get_doc_num() as f64;
71 for token in token_dim_sample {
72 let doc_freq = corpus.get_token_count(token);
73 idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
74 }
75 (idf_vec, 1.0)
76 }
77
78 fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>, RandomState>) -> (ZeroSpVec<f64>, f64) {
79 let total_count = freq.token_sum() as f64;
81 if total_count == 0.0 { return (ZeroSpVec::new(), total_count.into()); }
82 let mut raw: Vec<(usize, f64)> = Vec::with_capacity(freq.token_num());
83 let len = token_dim_sample.len();
84 let inv_total = 1.0f64 / total_count;
85 for (idx, token) in token_dim_sample.iter().enumerate() {
86 let count = freq.token_count(token) as f64;
87 if count == 0.0 { continue; }
88 raw.push((idx, count * inv_total));
89 }
90 (unsafe { ZeroSpVec::from_raw_iter(raw.into_iter(), len) }, total_count.into())
91 }
92}
93
94impl TFIDFEngine<u32> for DefaultTFIDFEngine
95{
96 fn idf_vec(corpus: &Corpus, token_dim_sample: &IndexSet<Box<str>, RandomState>) -> (Vec<u32>, f64) {
97 let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
98 let doc_num = corpus.get_doc_num() as f64;
99 for token in token_dim_sample {
100 let doc_freq = corpus.get_token_count(token);
101 idf_vec.push((doc_num / (doc_freq as f64 + 1.0)) as u32);
102 }
103 (idf_vec, 1.0)
104 }
105
106 fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>, RandomState>) -> (ZeroSpVec<u32>, f64) {
107 let total_count = freq.token_sum() as f64;
109 if total_count == 0.0 { return (ZeroSpVec::new(), total_count); }
110 let mut raw: Vec<(usize, f64)> = Vec::with_capacity(freq.token_num());
111 let mut max_val = 0.0f64;
112 let inv_total = 1.0f64 / total_count;
113 for (idx, token) in token_dim_sample.iter().enumerate() {
114 let count = freq.token_count(token) as f64;
115 if count == 0.0 { continue; }
116 let v = count * inv_total;
117 if v > max_val { max_val = v; }
118 raw.push((idx, v));
119 }
120 let len = token_dim_sample.len();
121 if max_val == 0.0 { return (ZeroSpVec::new(), total_count); }
122 let mut vec_u32: Vec<(usize, u32)> = Vec::with_capacity(raw.len());
123 let mul_norm = (u32::MAX as f64) / max_val; for (idx, v) in raw.into_iter() {
125 let q = (v * mul_norm).ceil() as u32;
126 vec_u32.push((idx, q));
127 }
128 (unsafe { ZeroSpVec::from_raw_iter(vec_u32.into_iter(), len) }, total_count)
129 }
130}
131
132impl TFIDFEngine<u16> for DefaultTFIDFEngine
133{
134 fn idf_vec(corpus: &Corpus, token_dim_sample: &IndexSet<Box<str>, RandomState>) -> (Vec<u16>, f64) {
135 let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
136 let doc_num = corpus.get_doc_num() as f64;
137 for token in token_dim_sample {
138 let doc_freq = corpus.get_token_count(token);
139 idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
140 }
141 let max = idf_vec
142 .iter()
143 .max_by(|a, b| a.total_cmp(b))
144 .copied()
145 .unwrap_or(1.0);
146 (
147 idf_vec
148 .into_iter()
149 .map(|idf| (idf / max * u16::MAX as f64).ceil() as u16)
150 .collect(),
151 max
152 )
153 }
154
155 fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>, RandomState>) -> (ZeroSpVec<u16>, f64) {
156 let total_count = freq.token_sum() as f64;
158 let mut raw: Vec<(usize, f32)> = Vec::with_capacity(freq.token_num());
160 let mut max_val = 0.0f32;
161 let div_total = (1.0 / total_count) as f32;
162 for (idx, token) in token_dim_sample.iter().enumerate() {
163 let count = freq.token_count(token);
164 if count == 0 { continue; }
165 let v = count as f32 * div_total;
166 if v > max_val { max_val = v; }
167 raw.push((idx, v));
168 }
169 let len = token_dim_sample.len();
170 if max_val == 0.0 { return (ZeroSpVec::new(), total_count); }
171 let mut vec_u16: Vec<(usize, u16)> = Vec::with_capacity(raw.len());
173 let norm_div_max = (u16::MAX as f32) / max_val; for (idx, v) in raw.into_iter() {
175 let q = (v * norm_div_max).ceil() as u16;
176 vec_u16.push((idx, q));
177 }
178 (unsafe { ZeroSpVec::from_raw_iter(vec_u16.into_iter(), len) }, total_count)
179 }
180}
181
182impl TFIDFEngine<u8> for DefaultTFIDFEngine
183{
184 fn idf_vec(corpus: &Corpus, token_dim_sample: &IndexSet<Box<str>, RandomState>) -> (Vec<u8>, f64) {
185 let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
186 let doc_num = corpus.get_doc_num() as f64;
187 for token in token_dim_sample {
188 let doc_freq = corpus.get_token_count(token);
189 idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
190 }
191 let max = idf_vec
192 .iter()
193 .max_by(|a, b| a.total_cmp(b))
194 .copied()
195 .unwrap_or(1.0);
196 (
197 idf_vec
198 .into_iter()
199 .map(|idf| (idf / max * u8::MAX as f64).ceil() as u8)
200 .collect(),
201 max
202 )
203 }
204
205 fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>, RandomState>) -> (ZeroSpVec<u8>, f64) {
206 let total_count_f64 = freq.token_sum() as f64;
208 if total_count_f64 == 0.0 { return (ZeroSpVec::new(), total_count_f64); }
209 let total_count = total_count_f64 as f32;
211 let mut raw: Vec<(usize, f32)> = Vec::with_capacity(freq.token_num());
212 let mut max_val = 0.0f32;
213 let inv_total = 1.0f32 / total_count;
214 for (idx, token) in token_dim_sample.iter().enumerate() {
215 let count = freq.token_count(token) as f32;
216 if count == 0.0 { continue; }
217 let v = count * inv_total;
218 if v > max_val { max_val = v; }
219 raw.push((idx, v));
220 }
221 let len = token_dim_sample.len();
222 if max_val == 0.0 { return (ZeroSpVec::new(), total_count_f64); }
223 let mut vec_u8: Vec<(usize, u8)> = Vec::with_capacity(raw.len());
224 let mul_norm = (u8::MAX as f32) / max_val; for (idx, v) in raw.into_iter() {
226 let q = (v * mul_norm).ceil() as u8;
227 vec_u8.push((idx, q));
228 }
229 (unsafe { ZeroSpVec::from_raw_iter(vec_u8.into_iter(), len) }, total_count_f64)
230 }
231}