tf_idf_vectorizer/vectorizer/
tfidf.rs1use num_traits::Num;
2
3use crate::{utils::datastruct::vector::{ZeroSpVec, ZeroSpVecTrait}, vectorizer::{corpus::Corpus, token::TokenFrequency}};
4
5pub trait TFIDFEngine<N, K>: Send + Sync
6where
7 N: Num + Copy,
8{
9 fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<N>, f64);
17 fn tf_vec(freq: &TokenFrequency, token_dim_sample: &Vec<Box<str>>) -> (ZeroSpVec<N>, f64);
24}
25
/// Field-less marker type that selects the default TF/IDF computations.
///
/// `Default` is derived so the type also works wherever a `Default` bound is
/// required; `DefaultTFIDFEngine::default()` and [`DefaultTFIDFEngine::new`]
/// produce the same value.
#[derive(Debug, Default)]
pub struct DefaultTFIDFEngine;

impl DefaultTFIDFEngine {
    /// Creates the engine. The type has no fields, so this is a plain value
    /// construction.
    pub fn new() -> Self {
        Self
    }
}
35
36impl<K> TFIDFEngine<f32, K> for DefaultTFIDFEngine
37{
38 fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<f32>, f64) {
39 let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
40 let doc_num = corpus.get_doc_num() as f64;
41 for token in token_dim_sample.iter() {
42 let doc_freq = corpus.get_token_count(token);
43 idf_vec.push((doc_num / (doc_freq as f64 + 1.0)) as f32);
44 }
45 (idf_vec, 1.0)
46 }
47
48 fn tf_vec(freq: &TokenFrequency, token_dim_sample: &Vec<Box<str>>) -> (ZeroSpVec<f32>, f64) {
49 let total_count = freq.token_sum() as f32;
51 if total_count == 0.0 { return (ZeroSpVec::new(), total_count.into()); }
52 let mut raw: Vec<(usize, f32)> = Vec::with_capacity(freq.token_num());
53 let len = token_dim_sample.len();
54 let inv_total = 1.0f32 / total_count;
55 for (idx, token) in token_dim_sample.iter().enumerate() {
56 let count = freq.token_count(token) as f32;
57 if count == 0.0 { continue; }
58 raw.push((idx, count * inv_total));
59 }
60 (unsafe { ZeroSpVec::from_raw_iter(raw.into_iter(), len) }, total_count.into())
61 }
62}
63
64impl<K> TFIDFEngine<f64, K> for DefaultTFIDFEngine
65{
66 fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<f64>, f64) {
67 let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
68 let doc_num = corpus.get_doc_num() as f64;
69 for token in token_dim_sample.iter() {
70 let doc_freq = corpus.get_token_count(token);
71 idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
72 }
73 (idf_vec, 1.0)
74 }
75
76 fn tf_vec(freq: &TokenFrequency, token_dim_sample: &Vec<Box<str>>) -> (ZeroSpVec<f64>, f64) {
77 let total_count = freq.token_sum() as f64;
79 if total_count == 0.0 { return (ZeroSpVec::new(), total_count.into()); }
80 let mut raw: Vec<(usize, f64)> = Vec::with_capacity(freq.token_num());
81 let len = token_dim_sample.len();
82 let inv_total = 1.0f64 / total_count;
83 for (idx, token) in token_dim_sample.iter().enumerate() {
84 let count = freq.token_count(token) as f64;
85 if count == 0.0 { continue; }
86 raw.push((idx, count * inv_total));
87 }
88 (unsafe { ZeroSpVec::from_raw_iter(raw.into_iter(), len) }, total_count.into())
89 }
90}
91
92impl<K> TFIDFEngine<u32, K> for DefaultTFIDFEngine
93{
94 fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<u32>, f64) {
95 let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
96 let doc_num = corpus.get_doc_num() as f64;
97 for token in token_dim_sample.iter() {
98 let doc_freq = corpus.get_token_count(token);
99 idf_vec.push((doc_num / (doc_freq as f64 + 1.0)) as u32);
100 }
101 (idf_vec, 1.0)
102 }
103
104 fn tf_vec(freq: &TokenFrequency, token_dim_sample: &Vec<Box<str>>) -> (ZeroSpVec<u32>, f64) {
105 let total_count = freq.token_sum() as f64;
107 if total_count == 0.0 { return (ZeroSpVec::new(), total_count); }
108 let mut raw: Vec<(usize, f64)> = Vec::with_capacity(freq.token_num());
109 let mut max_val = 0.0f64;
110 let inv_total = 1.0f64 / total_count;
111 for (idx, token) in token_dim_sample.iter().enumerate() {
112 let count = freq.token_count(token) as f64;
113 if count == 0.0 { continue; }
114 let v = count * inv_total;
115 if v > max_val { max_val = v; }
116 raw.push((idx, v));
117 }
118 let len = token_dim_sample.len();
119 if max_val == 0.0 { return (ZeroSpVec::new(), total_count); }
120 let mut vec_u32: Vec<(usize, u32)> = Vec::with_capacity(raw.len());
121 let mul_norm = (u32::MAX as f64) / max_val; for (idx, v) in raw.into_iter() {
123 let q = (v * mul_norm).ceil() as u32;
124 vec_u32.push((idx, q));
125 }
126 (unsafe { ZeroSpVec::from_raw_iter(vec_u32.into_iter(), len) }, total_count)
127 }
128}
129
130impl<K> TFIDFEngine<u16, K> for DefaultTFIDFEngine
131{
132 fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<u16>, f64) {
133 let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
134 let doc_num = corpus.get_doc_num() as f64;
135 for token in token_dim_sample.iter() {
136 let doc_freq = corpus.get_token_count(token);
137 idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
138 }
139 let max = idf_vec
140 .iter()
141 .max_by(|a, b| a.total_cmp(b))
142 .copied()
143 .unwrap_or(1.0);
144 (
145 idf_vec
146 .into_iter()
147 .map(|idf| (idf / max * u16::MAX as f64).ceil() as u16)
148 .collect(),
149 max
150 )
151 }
152
153 fn tf_vec(freq: &TokenFrequency, token_dim_sample: &Vec<Box<str>>) -> (ZeroSpVec<u16>, f64) {
154 let total_count = freq.token_sum() as f64;
156 let mut raw: Vec<(usize, f32)> = Vec::with_capacity(freq.token_num());
158 let mut max_val = 0.0f32;
159 let div_total = (1.0 / total_count) as f32;
160 for (idx, token) in token_dim_sample.iter().enumerate() {
161 let count = freq.token_count(token);
162 if count == 0 { continue; }
163 let v = count as f32 * div_total;
164 if v > max_val { max_val = v; }
165 raw.push((idx, v));
166 }
167 let len = token_dim_sample.len();
168 if max_val == 0.0 { return (ZeroSpVec::new(), total_count); }
169 let mut vec_u16: Vec<(usize, u16)> = Vec::with_capacity(raw.len());
171 let norm_div_max = (u16::MAX as f32) / max_val; for (idx, v) in raw.into_iter() {
173 let q = (v * norm_div_max).ceil() as u16;
174 vec_u16.push((idx, q));
175 }
176 (unsafe { ZeroSpVec::from_raw_iter(vec_u16.into_iter(), len) }, total_count)
177 }
178}
179
180impl<K> TFIDFEngine<u8, K> for DefaultTFIDFEngine
181{
182 fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<u8>, f64) {
183 let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
184 let doc_num = corpus.get_doc_num() as f64;
185 for token in token_dim_sample.iter() {
186 let doc_freq = corpus.get_token_count(token);
187 idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
188 }
189 let max = idf_vec
190 .iter()
191 .max_by(|a, b| a.total_cmp(b))
192 .copied()
193 .unwrap_or(1.0);
194 (
195 idf_vec
196 .into_iter()
197 .map(|idf| (idf / max * u8::MAX as f64).ceil() as u8)
198 .collect(),
199 max
200 )
201 }
202
203 fn tf_vec(freq: &TokenFrequency, token_dim_sample: &Vec<Box<str>>) -> (ZeroSpVec<u8>, f64) {
204 let total_count_f64 = freq.token_sum() as f64;
206 if total_count_f64 == 0.0 { return (ZeroSpVec::new(), total_count_f64); }
207 let total_count = total_count_f64 as f32;
209 let mut raw: Vec<(usize, f32)> = Vec::with_capacity(freq.token_num());
210 let mut max_val = 0.0f32;
211 let inv_total = 1.0f32 / total_count;
212 for (idx, token) in token_dim_sample.iter().enumerate() {
213 let count = freq.token_count(token) as f32;
214 if count == 0.0 { continue; }
215 let v = count * inv_total;
216 if v > max_val { max_val = v; }
217 raw.push((idx, v));
218 }
219 let len = token_dim_sample.len();
220 if max_val == 0.0 { return (ZeroSpVec::new(), total_count_f64); }
221 let mut vec_u8: Vec<(usize, u8)> = Vec::with_capacity(raw.len());
222 let mul_norm = (u8::MAX as f32) / max_val; for (idx, v) in raw.into_iter() {
224 let q = (v * mul_norm).ceil() as u8;
225 vec_u8.push((idx, q));
226 }
227 (unsafe { ZeroSpVec::from_raw_iter(vec_u8.into_iter(), len) }, total_count_f64)
228 }
229}