// tf_idf_vectorizer/vectorizer/tfidf.rs

1use crate::{utils::math::vector::{ZeroSpVec, ZeroSpVecTrait}, vectorizer::{corpus::Corpus, token::TokenFrequency}};
2
3pub trait TFIDFEngine<N>
4where
5    N: num::Num + Copy,
6{
7    /// IDFベクトルを生成するメソッド
8    /// # Arguments
9    /// * `corpus` - コーパス
10    /// * `token_dim_sample` - トークンの次元サンプル
11    /// # Returns
12    /// * `Vec<N>` - IDFベクトル
13    /// * `denormalize_num` - 正規化解除のための数値
14    fn idf_vec(corpus: &Corpus, token_dim_sample: &[String]) -> (Vec<N>, f64);
15    /// TFベクトルを生成するメソッド
16    /// # Arguments
17    /// * `freq` - トークン頻度
18    /// * `token_dim_sample` - トークンの次元サンプル
19    /// # Returns
20    /// * `(ZeroSpVec<N>, f64)` - TFベクトルと正規化解除のための数値
21    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &[String]) -> (ZeroSpVec<N>, f64);
22    /// TF-IDFを計算するイテレータ
23    /// # Arguments
24    /// * `tf` - TFベクトルのイテレータ
25    /// * `tf_denorm` - TFの正規化解除のための数値
26    /// * `idf` - IDFベクトルのイテレータ
27    /// * `idf_denorm` - IDFの正規化解除のための数値
28    /// # Returns
29    /// * `(impl Iterator<Item = N>, f64)` - TF-IDFのイテレータと正規化解除のための数値
30    /// 
31    /// tfidfのdenormは tf idf ともにmaxが 1.0 のはずなので tf_denorm * idf_denorm で計算できる(intでの計算くそめんどいやつ)
32    fn tfidf_iter_calc(tf: impl Iterator<Item = N>, tf_denorm: f64, idf: impl Iterator<Item = N>, idf_denorm: f64) -> (impl Iterator<Item = N>, f64);
33    fn tfidf_iter_calc_sparse(tf: impl Iterator<Item = (usize, N)>, tf_denorm: f64, idf: &Vec<N>, idf_denorm: f64) -> (impl Iterator<Item = (usize, N)>, f64);
34}
35
/// Default TF-IDF engine.
///
/// Supports the `f32`, `f64`, `u32`, `u16` and `u8` value types.
///
/// This is a stateless unit struct, so `DefaultTFIDFEngine::new()` and
/// `DefaultTFIDFEngine::default()` are interchangeable.
#[derive(Debug, Default)]
pub struct DefaultTFIDFEngine;

impl DefaultTFIDFEngine {
    /// Creates the engine.
    pub fn new() -> Self {
        Self
    }
}
45
46impl TFIDFEngine<f32> for DefaultTFIDFEngine
47{
48    fn idf_vec(corpus: &Corpus, token_dim_sample: &[String]) -> (Vec<f32>, f64) {
49        let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
50        let doc_num = corpus.get_doc_num() as f64;
51        for token in token_dim_sample {
52            let doc_freq = corpus.get_token_count(token);
53            idf_vec.push((doc_num / (doc_freq as f64 + 1.0)) as f32);
54        }
55        (idf_vec, 1.0)
56    }
57
58    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &[String]) -> (ZeroSpVec<f32>, f64) {
59        // Build sparse TF vector: only non-zero entries are stored
60        let total_count = freq.token_sum() as f32;
61        if total_count == 0.0 { return (ZeroSpVec::new(), total_count.into()); }
62        let mut raw: Vec<(usize, f32)> = Vec::with_capacity(freq.token_num());
63        for (idx, token) in token_dim_sample.iter().enumerate() {
64            let count = freq.token_count(token) as f32;
65            if count == 0.0 { continue; }
66            raw.push((idx, count / total_count));
67        }
68        (ZeroSpVec::from_raw_iter(raw.into_iter()), total_count.into())
69    }
70
71    fn tfidf_iter_calc(tf: impl Iterator<Item = f32>, tf_denorm: f64, idf: impl Iterator<Item = f32>, idf_denorm: f64) -> (impl Iterator<Item = f32>, f64) {
72        let tfidf = tf.zip(idf).map(move |(tf_val, idf_val)| {
73            let tfidf = tf_val * idf_val;
74            tfidf
75        });
76        (tfidf, tf_denorm * idf_denorm)
77    }
78
79    fn tfidf_iter_calc_sparse(tf: impl Iterator<Item = (usize, f32)>, tf_denorm: f64, idf: &Vec<f32>, idf_denorm: f64) -> (impl Iterator<Item = (usize, f32)>, f64) {
80        let tfidf = tf.map(move |(idx, tf_val)| {
81            let idf_val = idf.get(idx).copied().unwrap_or(0.0);
82            (idx, tf_val * idf_val)
83        });
84        (tfidf, tf_denorm * idf_denorm)
85    }
86}
87
88impl TFIDFEngine<f64> for DefaultTFIDFEngine
89{
90    fn idf_vec(corpus: &Corpus, token_dim_sample: &[String]) -> (Vec<f64>, f64) {
91        let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
92        let doc_num = corpus.get_doc_num() as f64;
93        for token in token_dim_sample {
94            let doc_freq = corpus.get_token_count(token);
95            idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
96        }
97        (idf_vec, 1.0)
98    }
99
100    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &[String]) -> (ZeroSpVec<f64>, f64) {
101        // Build sparse TF vector: only non-zero entries are stored
102        let total_count = freq.token_sum() as f64;
103        if total_count == 0.0 { return (ZeroSpVec::new(), total_count.into()); }
104        let mut raw: Vec<(usize, f64)> = Vec::with_capacity(freq.token_num());
105        for (idx, token) in token_dim_sample.iter().enumerate() {
106            let count = freq.token_count(token) as f64;
107            if count == 0.0 { continue; }
108            raw.push((idx, count / total_count));
109        }
110        (ZeroSpVec::from_raw_iter(raw.into_iter()), total_count.into())
111    }
112
113    fn tfidf_iter_calc(tf: impl Iterator<Item = f64>, tf_denorm: f64, idf: impl Iterator<Item = f64>, idf_denorm: f64) -> (impl Iterator<Item = f64>, f64) {
114        let tfidf = tf.zip(idf).map(move |(tf_val, idf_val)| {
115            let tfidf = tf_val * idf_val;
116            tfidf
117        });
118        (tfidf, tf_denorm * idf_denorm)
119    }
120
121    fn tfidf_iter_calc_sparse(tf: impl Iterator<Item = (usize, f64)>, tf_denorm: f64, idf: &Vec<f64>, idf_denorm: f64) -> (impl Iterator<Item = (usize, f64)>, f64) {
122        let tfidf = tf.map(move |(idx, tf_val)| {
123            let idf_val = idf.get(idx).copied().unwrap_or(0.0);
124            (idx, tf_val * idf_val)
125        });
126        (tfidf, tf_denorm * idf_denorm)
127    }
128}
129
130impl TFIDFEngine<u32> for DefaultTFIDFEngine
131{
132    fn idf_vec(corpus: &Corpus, token_dim_sample: &[String]) -> (Vec<u32>, f64) {
133        let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
134        let doc_num = corpus.get_doc_num() as f64;
135        for token in token_dim_sample {
136            let doc_freq = corpus.get_token_count(token);
137            idf_vec.push((doc_num / (doc_freq as f64 + 1.0)) as u32);
138        }
139        (idf_vec, 1.0)
140    }
141
142    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &[String]) -> (ZeroSpVec<u32>, f64) {
143        // Build sparse TF vector without allocating dense Vec
144        let total_count = freq.token_sum() as f64;
145        if total_count == 0.0 { return (ZeroSpVec::new(), total_count); }
146        let mut raw: Vec<(usize, f64)> = Vec::with_capacity(freq.token_num());
147        let mut max_val = 0.0f64;
148        for (idx, token) in token_dim_sample.iter().enumerate() {
149            let count = freq.token_count(token) as f64;
150            if count == 0.0 { continue; }
151            let v = count / total_count;
152            if v > max_val { max_val = v; }
153            raw.push((idx, v));
154        }
155        if max_val == 0.0 { return (ZeroSpVec::new(), total_count); }
156        let mut vec_u32: Vec<(usize, u32)> = Vec::with_capacity(raw.len());
157        for (idx, v) in raw.into_iter() {
158            let q = (v / max_val * u32::MAX as f64).ceil() as u32;
159            vec_u32.push((idx, q));
160        }
161        (ZeroSpVec::from_raw_iter(vec_u32.into_iter()), total_count)
162    }
163
164    fn tfidf_iter_calc(tf: impl Iterator<Item = u32>, tf_denorm: f64, idf: impl Iterator<Item = u32>, idf_denorm: f64) -> (impl Iterator<Item = u32>, f64) {
165        // denormのコストを考える
166        // (tf_val / u32::MAX) * tf_denorm 
167        // 除算遅いから
168        // const base = 1 / u32::MAX as f64;
169        // (tf_val * base) * tf_denorm
170        // で計算する
171        // 合計5回の乗算
172        // を const val = base * tf_denorm * base * idf_denorm
173        // で (tf * idf * val)
174        // でf64 生の値が出る
175        // を0-1に正規化 楽観的なmaxとしてtf_denorm * idf_denorm
176        // tf * idf * (1 / u32::MAX) * (1 / u32::MAX) * tf_denorm * idf_denorm / (tf_denorm * idf_denorm) * u32::MAX 
177        // tf * idf * (1 / u32::MAX)
178        // done
179        let tfidf = tf.zip(idf).map(move |(tf_val, idf_val)| {
180            let tfidf = (tf_val as u64 * idf_val as u64) / u32::MAX as u64;
181            tfidf as u32
182        });
183        (tfidf, tf_denorm * idf_denorm)
184    }
185
186    fn tfidf_iter_calc_sparse(tf: impl Iterator<Item = (usize, u32)>, tf_denorm: f64, idf: &Vec<u32>, idf_denorm: f64) -> (impl Iterator<Item = (usize, u32)>, f64) {
187        let tfidf = tf.map(move |(idx, tf_val)| {
188            let idf_val = *idf.get(idx).unwrap_or(&0);
189            let v = (tf_val as u64 * idf_val as u64 / u32::MAX as u64) as u32;
190            (idx, v)
191        });
192        (tfidf, tf_denorm * idf_denorm)
193    }
194}
195
196impl TFIDFEngine<u16> for DefaultTFIDFEngine
197{
198    fn idf_vec(corpus: &Corpus, token_dim_sample: &[String]) -> (Vec<u16>, f64) {
199        let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
200        let doc_num = corpus.get_doc_num() as f64;
201        for token in token_dim_sample {
202            let doc_freq = corpus.get_token_count(token);
203            idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
204        }
205        let max = idf_vec
206            .iter()
207            .max_by(|a, b| a.total_cmp(b))
208            .copied()
209            .unwrap_or(1.0);
210        (
211        idf_vec
212            .into_iter()
213            .map(|idf| (idf / max * u16::MAX as f64).ceil() as u16)
214            .collect(),
215        max
216        )
217    }
218
219    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &[String]) -> (ZeroSpVec<u16>, f64) {
220        // Build sparse TF vector without allocating a dense Vec<f64>
221        let total_count = freq.token_sum() as f64;
222        // First pass: compute raw tf values and track max
223        let mut raw: Vec<(usize, f64)> = Vec::new();
224        raw.reserve(freq.token_num());
225        let mut max_val = 0.0f64;
226        for (idx, token) in token_dim_sample.iter().enumerate() {
227            let count = freq.token_count(token) as f64;
228            if count == 0.0 { continue; }
229            let v = count / total_count;
230            if v > max_val { max_val = v; }
231            raw.push((idx, v));
232        }
233        if max_val == 0.0 { // avoid division by zero
234            return (ZeroSpVec::new(), total_count);
235        }
236        // Second pass: normalize into quantized u16 and build sparse vector
237        let mut vec_u16: Vec<(usize, u16)> = Vec::with_capacity(raw.len());
238        for (idx, v) in raw.into_iter() {
239            let q = (v / max_val * u16::MAX as f64).ceil() as u16;
240            vec_u16.push((idx, q));
241        }
242        (ZeroSpVec::from_raw_iter(vec_u16.into_iter()), total_count)
243    }
244
245    fn tfidf_iter_calc(tf: impl Iterator<Item = u16>, tf_denorm: f64, idf: impl Iterator<Item = u16>, idf_denorm: f64) -> (impl Iterator<Item = u16>, f64) {
246        let tfidf = tf.zip(idf).map(move |(tf_val, idf_val)| {
247            let tfidf = (tf_val as u32 * idf_val as u32) / u16::MAX as u32;
248            tfidf as u16
249        });
250        (tfidf, tf_denorm * idf_denorm)
251    }
252
253    fn tfidf_iter_calc_sparse(tf: impl Iterator<Item = (usize, u16)>, tf_denorm: f64, idf: &Vec<u16>, idf_denorm: f64) -> (impl Iterator<Item = (usize, u16)>, f64) {
254        let tfidf = tf.map(move |(idx, tf_val)| {
255            let idf_val = *idf.get(idx).unwrap_or(&0);
256            let v = (tf_val as u32 * idf_val as u32 / u16::MAX as u32) as u16;
257            (idx, v)
258        });
259        (tfidf, tf_denorm * idf_denorm)
260    }
261}
262
263impl TFIDFEngine<u8> for DefaultTFIDFEngine
264{
265    fn idf_vec(corpus: &Corpus, token_dim_sample: &[String]) -> (Vec<u8>, f64) {
266        let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
267        let doc_num = corpus.get_doc_num() as f64;
268        for token in token_dim_sample {
269            let doc_freq = corpus.get_token_count(token);
270            idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
271        }
272        let max = idf_vec
273            .iter()
274            .max_by(|a, b| a.total_cmp(b))
275            .copied()
276            .unwrap_or(1.0);
277        (
278        idf_vec
279            .into_iter()
280            .map(|idf| (idf / max * u8::MAX as f64).ceil() as u8)
281            .collect(),
282        max
283        )
284    }
285
286    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &[String]) -> (ZeroSpVec<u8>, f64) {
287        // Build sparse TF vector without allocating dense Vec
288        let total_count = freq.token_sum() as f64;
289        if total_count == 0.0 { return (ZeroSpVec::new(), total_count); }
290        let mut raw: Vec<(usize, f64)> = Vec::with_capacity(freq.token_num());
291        let mut max_val = 0.0f64;
292        for (idx, token) in token_dim_sample.iter().enumerate() {
293            let count = freq.token_count(token) as f64;
294            if count == 0.0 { continue; }
295            let v = count / total_count;
296            if v > max_val { max_val = v; }
297            raw.push((idx, v));
298        }
299        if max_val == 0.0 { return (ZeroSpVec::new(), total_count); }
300        let mut vec_u8: Vec<(usize, u8)> = Vec::with_capacity(raw.len());
301        for (idx, v) in raw.into_iter() {
302            let q = (v / max_val * u8::MAX as f64).ceil() as u8;
303            vec_u8.push((idx, q));
304        }
305        (ZeroSpVec::from_raw_iter(vec_u8.into_iter()), total_count)
306    }
307
308    fn tfidf_iter_calc(tf: impl Iterator<Item = u8>, tf_denorm: f64, idf: impl Iterator<Item = u8>, idf_denorm: f64) -> (impl Iterator<Item = u8>, f64) {
309        let tfidf = tf.zip(idf).map(move |(tf_val, idf_val)| {
310            let tfidf = (tf_val as u32 * idf_val as u32) / u8::MAX as u32;
311            tfidf as u8
312        });
313        (tfidf, tf_denorm * idf_denorm)
314    }
315
316    fn tfidf_iter_calc_sparse(tf: impl Iterator<Item = (usize, u8)>, tf_denorm: f64, idf: &Vec<u8>, idf_denorm: f64) -> (impl Iterator<Item = (usize, u8)>, f64) {
317        let tfidf = tf.map(move |(idx, tf_val)| {
318            let idf_val = *idf.get(idx).unwrap_or(&0);
319            let v = (tf_val as u32 * idf_val as u32 / u8::MAX as u32) as u8;
320            (idx, v)
321        });
322        (tfidf, tf_denorm * idf_denorm)
323    }
324}
325
326
327
328
329
330
331