tf_idf_vectorizer/vectorizer/tfidf.rs

use half::f16;
use num_traits::Num;

use crate::{utils::datastruct::{map::IndexSet, vector::{ZeroSpVec, ZeroSpVecTrait}}, vectorizer::{corpus::Corpus, token::TokenFrequency}};

pub trait TFIDFEngine<N, K>: Send + Sync
where
    N: Num + Copy,
{
    /// Generates the IDF vector for the given corpus.
    /// # Arguments
    /// * `corpus` - The corpus to take document frequencies from
    /// * `token_dim_sample` - Ordered token list defining the vector dimensions
    /// # Returns
    /// * `(Vec<N>, f64)` - The IDF vector and the value used for denormalization
    fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<N>, f64);
    /// Generates the TF vector for a single document.
    /// # Arguments
    /// * `freq` - Token frequencies of the document
    /// * `token_dim_sample` - Ordered token set defining the vector dimensions
    /// # Returns
    /// * `(ZeroSpVec<N>, f64)` - Sparse TF vector and the value used for denormalization
    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>>) -> (ZeroSpVec<N>, f64);
}
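
// Note on the second return value: every engine below pairs its vector with a
// `denormalize_num`. As implemented in this file (the trait itself does not fix
// this contract): the float engines return 1.0 for IDF and the document's total
// token count for TF; the integer engines quantize each IDF entry roughly as
// q = ceil(idf / max * T::MAX) and return `max`, so idf ≈ (q as f64 / T::MAX as f64)
// * denormalize_num, while their TF vectors are scaled by the per-document maximum
// but still return the total token count. See the sketch test at the end of this file.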

/// Default TF-IDF engine.
/// Supports the `f16`, `f32`, `f64`, `u32`, `u16`, and `u8` element types.
/// The IDF used here is the raw ratio `doc_num / (doc_freq + 1)`; no logarithm is applied.
#[derive(Debug)]
pub struct DefaultTFIDFEngine;
impl DefaultTFIDFEngine {
    pub fn new() -> Self {
        DefaultTFIDFEngine
    }
}
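
// Usage sketch (illustration only; `corpus`, `tokens`, `freq`, and `index` are
// hypothetical values built elsewhere in the crate, and `()` is an arbitrary
// choice for the otherwise-unused `K` parameter):
//
//     let (idf, idf_denorm) =
//         <DefaultTFIDFEngine as TFIDFEngine<f32, ()>>::idf_vec(&corpus, &tokens);
//     let (tf, tf_denorm) =
//         <DefaultTFIDFEngine as TFIDFEngine<f32, ()>>::tf_vec(&freq, &index);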

impl<K> TFIDFEngine<f16, K> for DefaultTFIDFEngine {
    fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<f16>, f64) {
        let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
        let doc_num = corpus.get_doc_num() as f64;
        for token in token_dim_sample.iter() {
            let doc_freq = corpus.get_token_count(token);
            // Smoothed IDF: doc_num / (doc_freq + 1), stored directly as f16
            idf_vec.push(f16::from_f64(doc_num / (doc_freq as f64 + 1.0)));
        }
        (idf_vec, 1.0)
    }

    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>>) -> (ZeroSpVec<f16>, f64) {
        // Build sparse TF vector: only non-zero entries are stored
        let total_count = freq.token_sum() as f32;
        if total_count == 0.0 { return (ZeroSpVec::new(), total_count.into()); }
        let len = token_dim_sample.len();
        let inv_total = 1.0f32 / total_count;
        let mut raw = freq.iter().filter_map(|(token, count)| {
            let idx = token_dim_sample.get_index(token)?;
            Some((idx, f16::from_f32((count as f32) * inv_total)))
        }).collect::<Vec<_>>();
        raw.sort_unstable_by_key(|(idx, _)| *idx);
        (unsafe { ZeroSpVec::from_sparse_iter(raw.into_iter(), len) }, total_count.into())
    }
}

impl<K> TFIDFEngine<f32, K> for DefaultTFIDFEngine {
    fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<f32>, f64) {
        let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
        let doc_num = corpus.get_doc_num() as f64;
        for token in token_dim_sample.iter() {
            let doc_freq = corpus.get_token_count(token);
            idf_vec.push((doc_num / (doc_freq as f64 + 1.0)) as f32);
        }
        (idf_vec, 1.0)
    }

    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>>) -> (ZeroSpVec<f32>, f64) {
        // Build sparse TF vector: only non-zero entries are stored
        let total_count = freq.token_sum() as f32;
        if total_count == 0.0 { return (ZeroSpVec::new(), total_count.into()); }
        let len = token_dim_sample.len();
        let inv_total = 1.0f32 / total_count;
        let mut raw = freq.iter().filter_map(|(token, count)| {
            let idx = token_dim_sample.get_index(token)?;
            Some((idx, (count as f32) * inv_total))
        }).collect::<Vec<_>>();
        raw.sort_unstable_by_key(|(idx, _)| *idx);
        (unsafe { ZeroSpVec::from_sparse_iter(raw.into_iter(), len) }, total_count.into())
    }
}

impl<K> TFIDFEngine<f64, K> for DefaultTFIDFEngine {
    fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<f64>, f64) {
        let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
        let doc_num = corpus.get_doc_num() as f64;
        for token in token_dim_sample.iter() {
            let doc_freq = corpus.get_token_count(token);
            idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
        }
        (idf_vec, 1.0)
    }

    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>>) -> (ZeroSpVec<f64>, f64) {
        // Build sparse TF vector: only non-zero entries are stored
        let total_count = freq.token_sum() as f64;
        if total_count == 0.0 { return (ZeroSpVec::new(), total_count); }
        let len = token_dim_sample.len();
        let inv_total = 1.0f64 / total_count;
        let mut raw = freq.iter().filter_map(|(token, count)| {
            let idx = token_dim_sample.get_index(token)?;
            Some((idx, (count as f64) * inv_total))
        }).collect::<Vec<_>>();
        raw.sort_unstable_by_key(|(idx, _)| *idx);
        (unsafe { ZeroSpVec::from_sparse_iter(raw.into_iter(), len) }, total_count)
    }
}

impl<K> TFIDFEngine<u32, K> for DefaultTFIDFEngine {
    fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<u32>, f64) {
        let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
        let doc_num = corpus.get_doc_num() as f64;
        for token in token_dim_sample.iter() {
            let doc_freq = corpus.get_token_count(token);
            idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
        }
        // Quantize: scale every IDF by its maximum so the largest entry maps to
        // u32::MAX, and return `max` so the caller can denormalize.
        let max = idf_vec
            .iter()
            .max_by(|a, b| a.total_cmp(b))
            .copied()
            .unwrap_or(1.0);
        (
            idf_vec
                .into_iter()
                .map(|idf| (idf / max * u32::MAX as f64).ceil() as u32)
                .collect(),
            max,
        )
    }

    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>>) -> (ZeroSpVec<u32>, f64) {
        // Build sparse TF vector without allocating a dense Vec
        let total_count = freq.token_sum() as f64;
        if total_count == 0.0 { return (ZeroSpVec::new(), total_count); }
        let mut max_val = 0.0f64;
        let inv_total = 1.0f64 / total_count;
        let mut raw: Vec<(usize, f64)> = freq.iter().filter_map(|(token, count)| {
            let idx = token_dim_sample.get_index(token)?;
            let v = (count as f64) * inv_total;
            max_val = max_val.max(v);
            Some((idx, v))
        }).collect::<Vec<_>>();
        // Keep indices sorted before the unsafe constructor, matching the float engines
        raw.sort_unstable_by_key(|(idx, _)| *idx);
        let len = token_dim_sample.len();
        let mul_norm = (u32::MAX as f64) / max_val; // == (1/max_val) * u32::MAX
        let vec_u32 = raw.into_iter()
            .map(|(idx, v)| {
                let q = (v * mul_norm).ceil() as u32;
                (idx, q)
            })
            .collect::<Vec<_>>();
        (unsafe { ZeroSpVec::from_sparse_iter(vec_u32.into_iter(), len) }, total_count)
    }
}

impl<K> TFIDFEngine<u16, K> for DefaultTFIDFEngine {
    fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<u16>, f64) {
        let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
        let doc_num = corpus.get_doc_num() as f64;
        for token in token_dim_sample.iter() {
            let doc_freq = corpus.get_token_count(token);
            idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
        }
        let max = idf_vec
            .iter()
            .max_by(|a, b| a.total_cmp(b))
            .copied()
            .unwrap_or(1.0);
        (
            idf_vec
                .into_iter()
                .map(|idf| (idf / max * u16::MAX as f64).ceil() as u16)
                .collect(),
            max,
        )
    }

    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>>) -> (ZeroSpVec<u16>, f64) {
        // Build sparse TF vector without allocating a dense Vec<f64>
        let total_count = freq.token_sum() as f64;
        if total_count == 0.0 { return (ZeroSpVec::new(), total_count); }
        // First pass: compute raw tf values and track max
        let mut max_val = 0.0f32;
        let inv_total = (1.0 / total_count) as f32;
        let mut raw = freq.iter().filter_map(|(token, count)| {
            let idx = token_dim_sample.get_index(token)?;
            let v = (count as f32) * inv_total;
            max_val = max_val.max(v);
            Some((idx, v))
        }).collect::<Vec<_>>();
        // Keep indices sorted before the unsafe constructor, matching the float engines
        raw.sort_unstable_by_key(|(idx, _)| *idx);
        let len = token_dim_sample.len();
        // Second pass: normalize into quantized u16 and build sparse vector
        let mul_norm = (u16::MAX as f32) / max_val; // == (1/max_val) * u16::MAX
        let vec_u16 = raw.into_iter()
            .map(|(idx, v)| {
                let q = (v * mul_norm).ceil() as u16;
                (idx, q)
            })
            .collect::<Vec<_>>();
        (unsafe { ZeroSpVec::from_sparse_iter(vec_u16.into_iter(), len) }, total_count)
    }
}

impl<K> TFIDFEngine<u8, K> for DefaultTFIDFEngine {
    fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<u8>, f64) {
        let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
        let doc_num = corpus.get_doc_num() as f64;
        for token in token_dim_sample.iter() {
            let doc_freq = corpus.get_token_count(token);
            idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
        }
        let max = idf_vec
            .iter()
            .max_by(|a, b| a.total_cmp(b))
            .copied()
            .unwrap_or(1.0);
        (
            idf_vec
                .into_iter()
                .map(|idf| (idf / max * u8::MAX as f64).ceil() as u8)
                .collect(),
            max,
        )
    }

    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>>) -> (ZeroSpVec<u8>, f64) {
        // Build sparse TF vector without allocating a dense Vec
        let total_count_f64 = freq.token_sum() as f64;
        if total_count_f64 == 0.0 { return (ZeroSpVec::new(), total_count_f64); }
        // Use f32 intermediates for u8 to reduce cost and memory
        let total_count = total_count_f64 as f32;
        let mut max_val = 0.0f32;
        let inv_total = 1.0f32 / total_count;
        let mut raw = freq.iter().filter_map(|(token, count)| {
            let idx = token_dim_sample.get_index(token)?;
            let v = (count as f32) * inv_total;
            max_val = max_val.max(v);
            Some((idx, v))
        }).collect::<Vec<_>>();
        let len = token_dim_sample.len();
        if max_val == 0.0 { return (ZeroSpVec::new(), total_count_f64); }
        // Keep indices sorted before the unsafe constructor, matching the float engines
        raw.sort_unstable_by_key(|(idx, _)| *idx);
        let mul_norm = (u8::MAX as f32) / max_val; // == (1/max_val) * u8::MAX
        let vec_u8 = raw.into_iter()
            .map(|(idx, v)| {
                let q = (v * mul_norm).ceil() as u8;
                (idx, q)
            })
            .collect::<Vec<_>>();
        (unsafe { ZeroSpVec::from_sparse_iter(vec_u8.into_iter(), len) }, total_count_f64)
    }
}
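
#[cfg(test)]
mod quantization_sketch {
    //! Minimal sketch, not an engine test: it only checks the round-trip relation
    //! the integer engines rely on, i.e. q = ceil(v / max * T::MAX) and
    //! v ≈ (q as f64 / T::MAX as f64) * max. The values below are arbitrary.

    #[test]
    fn u16_quantization_round_trips_approximately() {
        let max = 0.42_f64; // hypothetical per-document maximum TF
        let v = 0.1_f64; // hypothetical raw TF value
        let q = (v / max * u16::MAX as f64).ceil() as u16;
        let back = q as f64 / u16::MAX as f64 * max;
        assert!((back - v).abs() < 1e-4);
    }
}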