tf_idf_vectorizer/vectorizer/tfidf.rs
use half::f16;
use num_traits::Num;

use crate::{utils::datastruct::{map::IndexSet, vector::{ZeroSpVec, ZeroSpVecTrait}}, vectorizer::{corpus::Corpus, token::TokenFrequency}};

pub trait TFIDFEngine<N, K>: Send + Sync
where
    N: Num + Copy,
{
    /// Generate the IDF vector.
    /// # Arguments
    /// * `corpus` - The corpus
    /// * `token_dim_sample` - Token dimension sample
    /// # Returns
    /// * `Vec<f32>` - The IDF vector
    /// * `f64` - Value used for denormalization
    fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<f32>, f64) {
        let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
        let doc_num = corpus.get_doc_num() as f64;
        for token in token_dim_sample.iter() {
            let doc_freq = corpus.get_token_count(token);
            idf_vec.push((doc_num / (doc_freq as f64 + 1.0)) as f32);
        }
        (idf_vec, 1.0)
    }

    /// Generate the TF vector.
    /// # Arguments
    /// * `freq` - Token frequency
    /// * `token_dim_sample` - Token dimension sample
    /// # Returns
    /// * `(ZeroSpVec<N>, f64)` - TF vector and value used for denormalization
    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>>) -> (ZeroSpVec<N>, f64);
}
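
// Illustrative sketch (not part of the crate): a custom engine only has to provide
// `tf_vec` and may override the default `idf_vec`, e.g. with a log-scaled IDF. The
// name `LogIdfEngine` is hypothetical and chosen just for this example; it reuses
// only calls that already appear in this file (`get_doc_num`, `get_token_count`).
//
// pub struct LogIdfEngine;
//
// impl<K> TFIDFEngine<f32, K> for LogIdfEngine {
//     fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<f32>, f64) {
//         let doc_num = corpus.get_doc_num() as f64;
//         let idf_vec = token_dim_sample
//             .iter()
//             .map(|token| {
//                 let doc_freq = corpus.get_token_count(token) as f64;
//                 // Log-scaled variant of the default raw ratio.
//                 (doc_num / (doc_freq + 1.0)).ln() as f32
//             })
//             .collect();
//         (idf_vec, 1.0)
//     }
//
//     fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>>) -> (ZeroSpVec<f32>, f64) {
//         // Same sparse construction as `DefaultTFIDFEngine` below; delegate to it.
//         <DefaultTFIDFEngine as TFIDFEngine<f32, K>>::tf_vec(freq, token_dim_sample)
//     }
// }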

/// Default TF-IDF engine.
/// Supports the `f16`, `f32`, `f64`, `u32`, `u16`, and `u8` element types.
#[derive(Debug)]
pub struct DefaultTFIDFEngine;

impl DefaultTFIDFEngine {
    pub fn new() -> Self {
        DefaultTFIDFEngine
    }
}

impl<K> TFIDFEngine<f16, K> for DefaultTFIDFEngine {
    // fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<f16>, f64) {
    //     let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
    //     let doc_num = corpus.get_doc_num() as f64;
    //     for token in token_dim_sample.iter() {
    //         let doc_freq = corpus.get_token_count(token);
    //         idf_vec.push(f16::from_f64(doc_num / (doc_freq as f64 + 1.0)));
    //     }
    //     (idf_vec, 1.0)
    // }

    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>>) -> (ZeroSpVec<f16>, f64) {
        // Build a sparse TF vector: only non-zero entries are stored.
        let total_count = freq.token_sum() as f32;
        if total_count == 0.0 { return (ZeroSpVec::new(), total_count.into()); }
        let len = token_dim_sample.len();
        let inv_total = 1.0f32 / total_count;
        let mut raw = freq.iter().filter_map(|(token, count)| {
            let idx = token_dim_sample.get_index(token)?;
            Some((idx, f16::from_f32((count as f32) * inv_total)))
        }).collect::<Vec<_>>();
        raw.sort_unstable_by_key(|(idx, _)| *idx);
        (unsafe { ZeroSpVec::from_sparse_iter(raw.into_iter(), len) }, total_count.into())
    }
}

impl<K> TFIDFEngine<f32, K> for DefaultTFIDFEngine {
    // fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<f32>, f64) {
    //     let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
    //     let doc_num = corpus.get_doc_num() as f64;
    //     for token in token_dim_sample.iter() {
    //         let doc_freq = corpus.get_token_count(token);
    //         idf_vec.push((doc_num / (doc_freq as f64 + 1.0)) as f32);
    //     }
    //     (idf_vec, 1.0)
    // }

    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>>) -> (ZeroSpVec<f32>, f64) {
        // Build a sparse TF vector: only non-zero entries are stored.
        let total_count = freq.token_sum() as f32;
        if total_count == 0.0 { return (ZeroSpVec::new(), total_count.into()); }
        let len = token_dim_sample.len();
        let inv_total = 1.0f32 / total_count;
        let mut raw = freq.iter().filter_map(|(token, count)| {
            let idx = token_dim_sample.get_index(token)?;
            Some((idx, (count as f32) * inv_total))
        }).collect::<Vec<_>>();
        raw.sort_unstable_by_key(|(idx, _)| *idx);
        (unsafe { ZeroSpVec::from_sparse_iter(raw.into_iter(), len) }, total_count.into())
    }
}

impl<K> TFIDFEngine<f64, K> for DefaultTFIDFEngine {
    // fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<f64>, f64) {
    //     let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
    //     let doc_num = corpus.get_doc_num() as f64;
    //     for token in token_dim_sample.iter() {
    //         let doc_freq = corpus.get_token_count(token);
    //         idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
    //     }
    //     (idf_vec, 1.0)
    // }

    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>>) -> (ZeroSpVec<f64>, f64) {
        // Build a sparse TF vector: only non-zero entries are stored.
        let total_count = freq.token_sum() as f64;
        if total_count == 0.0 { return (ZeroSpVec::new(), total_count); }
        let len = token_dim_sample.len();
        let inv_total = 1.0f64 / total_count;
        let mut raw = freq.iter().filter_map(|(token, count)| {
            let idx = token_dim_sample.get_index(token)?;
            Some((idx, (count as f64) * inv_total))
        }).collect::<Vec<_>>();
        raw.sort_unstable_by_key(|(idx, _)| *idx);
        (unsafe { ZeroSpVec::from_sparse_iter(raw.into_iter(), len) }, total_count)
    }
}
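
// Worked sketch (illustration only, not crate code): the float `tf_vec` implementations
// above keep only the tokens present in `token_dim_sample` and normalize each count by
// the document's token total. With hypothetical counts and dimension indices:
//
//   counts         = [("b", 2), ("a", 1), ("c", 1)]    // token_sum() == 4
//   get_index      = { "a" -> 0, "b" -> 1, "c" -> 2 }
//   tf (unsorted)  = [(1, 0.50), (0, 0.25), (2, 0.25)]
//   tf (sorted)    = [(0, 0.25), (1, 0.50), (2, 0.25)]
//
// The sorted (index, value) pairs are handed to `ZeroSpVec::from_sparse_iter` together
// with the dimension count, and `token_sum()` (4.0 here) is returned as the
// denormalization value.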

impl<K> TFIDFEngine<u32, K> for DefaultTFIDFEngine {
    // fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<u32>, f64) {
    //     let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
    //     let doc_num = corpus.get_doc_num() as f64;
    //     for token in token_dim_sample.iter() {
    //         let doc_freq = corpus.get_token_count(token);
    //         idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
    //     }
    //     let max = idf_vec
    //         .iter()
    //         .max_by(|a, b| a.total_cmp(b))
    //         .copied()
    //         .unwrap_or(1.0);
    //     (
    //         idf_vec
    //             .into_iter()
    //             .map(|idf| (idf / max * u32::MAX as f64).ceil() as u32)
    //             .collect(),
    //         max
    //     )
    // }

    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>>) -> (ZeroSpVec<u32>, f64) {
        // Build a sparse TF vector without allocating a dense Vec.
        let total_count = freq.token_sum() as f64;
        if total_count == 0.0 { return (ZeroSpVec::new(), total_count); }
        // First pass: compute raw tf values and track the maximum.
        let mut max_val = 0.0f64;
        let inv_total = 1.0f64 / total_count;
        let mut raw: Vec<(usize, f64)> = freq.iter().filter_map(|(token, count)| {
            let idx = token_dim_sample.get_index(token)?;
            let v = (count as f64) * inv_total;
            max_val = max_val.max(v);
            Some((idx, v))
        }).collect();
        let len = token_dim_sample.len();
        // Keep indices sorted, as the float implementations do, before `from_sparse_iter`.
        raw.sort_unstable_by_key(|(idx, _)| *idx);
        // Second pass: quantize into the full u32 range.
        let mul_norm = (u32::MAX as f64) / max_val; // == (1/max_val) * u32::MAX
        let vec_u32 = raw.into_iter()
            .map(|(idx, v)| {
                let q = (v * mul_norm).ceil() as u32;
                (idx, q)
            })
            .collect::<Vec<_>>();
        (unsafe { ZeroSpVec::from_sparse_iter(vec_u32.into_iter(), len) }, total_count)
    }
}

impl<K> TFIDFEngine<u16, K> for DefaultTFIDFEngine {
    // fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<u16>, f64) {
    //     let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
    //     let doc_num = corpus.get_doc_num() as f64;
    //     for token in token_dim_sample.iter() {
    //         let doc_freq = corpus.get_token_count(token);
    //         idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
    //     }
    //     let max = idf_vec
    //         .iter()
    //         .max_by(|a, b| a.total_cmp(b))
    //         .copied()
    //         .unwrap_or(1.0);
    //     (
    //         idf_vec
    //             .into_iter()
    //             .map(|idf| (idf / max * u16::MAX as f64).ceil() as u16)
    //             .collect(),
    //         max
    //     )
    // }

    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>>) -> (ZeroSpVec<u16>, f64) {
        // Build a sparse TF vector without allocating a dense Vec<f64>.
        let total_count = freq.token_sum() as f64;
        if total_count == 0.0 { return (ZeroSpVec::new(), total_count); }
        // First pass: compute raw tf values and track the maximum.
        let mut max_val = 0.0f32;
        let inv_total = (1.0 / total_count) as f32;
        let mut raw = freq.iter().filter_map(|(token, count)| {
            let idx = token_dim_sample.get_index(token)?;
            let v = (count as f32) * inv_total;
            max_val = max_val.max(v);
            Some((idx, v))
        }).collect::<Vec<_>>();
        let len = token_dim_sample.len();
        // Keep indices sorted, as the float implementations do, before `from_sparse_iter`.
        raw.sort_unstable_by_key(|(idx, _)| *idx);
        // Second pass: quantize into the full u16 range.
        let mul_norm = (u16::MAX as f32) / max_val; // == (1/max_val) * u16::MAX
        let vec_u16 = raw.into_iter()
            .map(|(idx, v)| {
                let q = (v * mul_norm).ceil() as u16;
                (idx, q)
            })
            .collect::<Vec<_>>();
        (unsafe { ZeroSpVec::from_sparse_iter(vec_u16.into_iter(), len) }, total_count)
    }
}

impl<K> TFIDFEngine<u8, K> for DefaultTFIDFEngine {
    // fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<u8>, f64) {
    //     let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
    //     let doc_num = corpus.get_doc_num() as f64;
    //     for token in token_dim_sample.iter() {
    //         let doc_freq = corpus.get_token_count(token);
    //         idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
    //     }
    //     let max = idf_vec
    //         .iter()
    //         .max_by(|a, b| a.total_cmp(b))
    //         .copied()
    //         .unwrap_or(1.0);
    //     (
    //         idf_vec
    //             .into_iter()
    //             .map(|idf| (idf / max * u8::MAX as f64).ceil() as u8)
    //             .collect(),
    //         max
    //     )
    // }

    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>>) -> (ZeroSpVec<u8>, f64) {
        // Build a sparse TF vector without allocating a dense Vec.
        let total_count_f64 = freq.token_sum() as f64;
        if total_count_f64 == 0.0 { return (ZeroSpVec::new(), total_count_f64); }
        // Use f32 intermediates for u8 to reduce cost and memory.
        let total_count = total_count_f64 as f32;
        let mut max_val = 0.0f32;
        let inv_total = 1.0f32 / total_count;
        let mut raw = freq.iter().filter_map(|(token, count)| {
            let idx = token_dim_sample.get_index(token)?;
            let v = (count as f32) * inv_total;
            max_val = max_val.max(v);
            Some((idx, v))
        }).collect::<Vec<_>>();
        let len = token_dim_sample.len();
        if max_val == 0.0 { return (ZeroSpVec::new(), total_count_f64); }
        // Keep indices sorted, as the float implementations do, before `from_sparse_iter`.
        raw.sort_unstable_by_key(|(idx, _)| *idx);
        // Quantize into the full u8 range.
        let mul_norm = (u8::MAX as f32) / max_val; // == (1/max_val) * u8::MAX
        let vec_u8 = raw.into_iter()
            .map(|(idx, v)| {
                let q = (v * mul_norm).ceil() as u8;
                (idx, q)
            })
            .collect::<Vec<_>>();
        (unsafe { ZeroSpVec::from_sparse_iter(vec_u8.into_iter(), len) }, total_count_f64)
    }
}
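
// Worked sketch of the quantized paths (u32/u16/u8), illustration only: tf values are
// max-normalized so the most frequent token maps to the top of the integer range.
// Assuming the u8 path with tf values [0.25, 0.50, 0.25], so max_val = 0.50:
//
//   mul_norm = 255.0 / 0.50 = 510.0
//   q        = ceil(tf * mul_norm)  ->  [128, 255, 128]
//
// As in the float implementations, `total_count` (not `max_val`) is what is returned
// as the denormalization value.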