tf_idf_vectorizer/vectorizer/
tfidf.rs

1
2use half::f16;
3use num_traits::Num;
4
5use crate::{utils::datastruct::{map::IndexSet, vector::{ZeroSpVec, ZeroSpVecTrait}}, vectorizer::{corpus::Corpus, token::TokenFrequency}};
6
7pub trait TFIDFEngine<N, K>: Send + Sync
8where
9    N: Num + Copy,
10{
11    /// Method to generate the IDF vector
12    /// # Arguments
13    /// * `corpus` - The corpus
14    /// * `token_dim_sample` - Token dimension sample
15    /// # Returns
16    /// * `Vec<N>` - The IDF vector
17    /// * `denormalize_num` - Value for denormalization
18    fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<f32>, f64) {
19        let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
20        let doc_num = corpus.get_doc_num() as f64;
21        for token in token_dim_sample.iter() {
22            let doc_freq = corpus.get_token_count(token);
23            idf_vec.push((doc_num / (doc_freq as f64 + 1.0)) as f32);
24        }
25        (idf_vec, 1.0)
26    }
27    /// Method to generate the TF vector
28    /// # Arguments
29    /// * `freq` - Token frequency
30    /// * `token_dim_sample` - Token dimension sample
31    /// # Returns
32    /// * `(ZeroSpVec<N>, f64)` - TF vector and value for denormalization
33    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>>) -> (ZeroSpVec<N>, f64);
34}
35
/// Default TF-IDF engine.
///
/// Supports the element types `f16`, `f32`, `f64`, `u32`, `u16` and `u8`.
#[derive(Debug, Default)]
pub struct DefaultTFIDFEngine;

impl DefaultTFIDFEngine {
    /// Creates the engine.
    ///
    /// The type carries no state, so this is equivalent to
    /// `DefaultTFIDFEngine::default()`.
    pub fn new() -> Self {
        Self
    }
}
45
46impl<K> TFIDFEngine<f16, K> for DefaultTFIDFEngine {
47    // fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<f16>, f64) {
48    //     let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
49    //     let doc_num = corpus.get_doc_num() as f64;
50    //     for token in token_dim_sample.iter() {
51    //         let doc_freq = corpus.get_token_count(token);
52    //         idf_vec.push(f16::from_f64(doc_num / (doc_freq as f64 + 1.0)));
53    //     }
54    //     (idf_vec, 1.0)
55    // }
56    
57    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>>) -> (ZeroSpVec<f16>, f64) {
58        // Build sparse TF vector: only non-zero entries are stored
59        let total_count = freq.token_sum() as f32;
60        if total_count == 0.0 { return (ZeroSpVec::new(), total_count.into()); }
61        let len = token_dim_sample.len();
62        let inv_total = 1.0f32 / total_count;
63        let mut raw = freq.iter().filter_map(|(token, count)| {
64            let idx = token_dim_sample.get_index(token)?;
65            Some((idx, f16::from_f32((count as f32) * inv_total)))
66        }).collect::<Vec<_>>();
67        raw.sort_unstable_by_key(|(idx, _)| *idx);
68        (unsafe { ZeroSpVec::from_sparse_iter(raw.into_iter(), len) }, total_count.into())
69    }
70}
71
72impl<K> TFIDFEngine<f32, K> for DefaultTFIDFEngine
73{
74    // fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<f32>, f64) {
75    //     let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
76    //     let doc_num = corpus.get_doc_num() as f64;
77    //     for token in token_dim_sample.iter() {
78    //         let doc_freq = corpus.get_token_count(token);
79    //         idf_vec.push((doc_num / (doc_freq as f64 + 1.0)) as f32);
80    //     }
81    //     (idf_vec, 1.0)
82    // }
83
84    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>>) -> (ZeroSpVec<f32>, f64) {
85        // Build sparse TF vector: only non-zero entries are stored
86        let total_count = freq.token_sum() as f32;
87        if total_count == 0.0 { return (ZeroSpVec::new(), total_count.into()); }
88        let len = token_dim_sample.len();
89        let inv_total = 1.0f32 / total_count;
90        let mut raw = freq.iter().filter_map(|(token, count)| {
91            let idx = token_dim_sample.get_index(token)?;
92            Some((idx, (count as f32) * inv_total))
93        }).collect::<Vec<_>>();
94        raw.sort_unstable_by_key(|(idx, _)| *idx);
95        (unsafe { ZeroSpVec::from_sparse_iter(raw.into_iter(), len) }, total_count.into())
96    }
97}
98
99impl<K> TFIDFEngine<f64, K> for DefaultTFIDFEngine
100{
101    // fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<f64>, f64) {
102    //     let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
103    //     let doc_num = corpus.get_doc_num() as f64;
104    //     for token in token_dim_sample.iter() {
105    //         let doc_freq = corpus.get_token_count(token);
106    //         idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
107    //     }
108    //     (idf_vec, 1.0)
109    // }
110
111    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>>) -> (ZeroSpVec<f64>, f64) {
112        // Build sparse TF vector: only non-zero entries are stored
113        let total_count = freq.token_sum() as f64;
114        if total_count == 0.0 { return (ZeroSpVec::new(), total_count.into()); }
115        let len = token_dim_sample.len();
116        let inv_total = 1.0f64 / total_count;
117        let mut raw = freq.iter().filter_map(|(token, count)| {
118            let idx = token_dim_sample.get_index(token)?;
119            Some((idx, (count as f64) * inv_total))
120        }).collect::<Vec<_>>();
121        raw.sort_unstable_by_key(|(idx, _)| *idx);
122        (unsafe { ZeroSpVec::from_sparse_iter(raw.into_iter(), len) }, total_count.into())
123    }
124}
125
126impl<K> TFIDFEngine<u32, K> for DefaultTFIDFEngine
127{
128    // fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<u32>, f64) {
129    //     let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
130    //     let doc_num = corpus.get_doc_num() as f64;
131    //     for token in token_dim_sample.iter() {
132    //         let doc_freq = corpus.get_token_count(token);
133    //         idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
134    //     }
135    //     let max = idf_vec
136    //         .iter()
137    //         .max_by(|a, b| a.total_cmp(b))
138    //         .copied()
139    //         .unwrap_or(1.0);
140    //     (
141    //     idf_vec
142    //         .into_iter()
143    //         .map(|idf| (idf / max * u32::MAX as f64).ceil() as u32)
144    //         .collect(),
145    //     max
146    //     )
147    // }
148
149    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>>) -> (ZeroSpVec<u32>, f64) {
150        // Build sparse TF vector without allocating dense Vec
151        let total_count = freq.token_sum() as f64;
152        if total_count == 0.0 { return (ZeroSpVec::new(), total_count); }
153        let mut max_val = 0.0f64;
154        let inv_total = 1.0f64 / total_count;
155        let mut raw: Vec<(usize, f64)> = freq.iter().filter_map(|(token, count)| {
156            let idx = token_dim_sample.get_index(token)?;
157            let v = (count as f64) * inv_total;
158            max_val = max_val.max(v);
159            Some((idx, v))
160        }).collect::<Vec<_>>();
161        let len = token_dim_sample.len();
162        let mul_norm = (u32::MAX as f64) / max_val; // == (1/max_val) * u32::MAX
163        let vec_u32 = raw.drain(..)
164            .map(|(idx, v)| {
165                let q = (v * mul_norm).ceil() as u32;
166                (idx, q)
167            })
168            .collect::<Vec<_>>();
169        (unsafe { ZeroSpVec::from_sparse_iter(vec_u32.into_iter(), len) }, total_count)
170    }
171}
172
173impl<K> TFIDFEngine<u16, K> for DefaultTFIDFEngine
174{
175    // fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<u16>, f64) {
176    //     let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
177    //     let doc_num = corpus.get_doc_num() as f64;
178    //     for token in token_dim_sample.iter() {
179    //         let doc_freq = corpus.get_token_count(token);
180    //         idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
181    //     }
182    //     let max = idf_vec
183    //         .iter()
184    //         .max_by(|a, b| a.total_cmp(b))
185    //         .copied()
186    //         .unwrap_or(1.0);
187    //     (
188    //     idf_vec
189    //         .into_iter()
190    //         .map(|idf| (idf / max * u16::MAX as f64).ceil() as u16)
191    //         .collect(),
192    //     max
193    //     )
194    // }
195
196    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>>) -> (ZeroSpVec<u16>, f64) {
197        // Build sparse TF vector without allocating a dense Vec<f64>
198        let total_count = freq.token_sum() as f64;
199        // First pass: compute raw tf values and track max
200        let mut max_val = 0.0f32;
201        let div_total = (1.0 / total_count) as f32;
202        let raw = freq.iter().filter_map(|(token, count)| {
203            let idx = token_dim_sample.get_index(token)?;
204            let v = (count as f32) * div_total;
205            max_val = max_val.max(v);
206            Some((idx, v))
207        }).collect::<Vec<_>>();
208        let len = token_dim_sample.len();
209        // Second pass: normalize into quantized u16 and build sparse vector
210        let norm_div_max = (u16::MAX as f32) / max_val; // == (1/max_val) * u16::MAX
211        let vec_u16 = raw.into_iter()
212            .map(|(idx, v)| {
213                let q = (v * norm_div_max).ceil() as u16;
214                (idx, q)
215            })
216            .collect::<Vec<_>>();
217        (unsafe { ZeroSpVec::from_sparse_iter(vec_u16.into_iter(), len) }, total_count)
218    }
219}
220
221impl<K> TFIDFEngine<u8, K> for DefaultTFIDFEngine
222{
223    // fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<u8>, f64) {
224    //     let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
225    //     let doc_num = corpus.get_doc_num() as f64;
226    //     for token in token_dim_sample.iter() {
227    //         let doc_freq = corpus.get_token_count(token);
228    //         idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
229    //     }
230    //     let max = idf_vec
231    //         .iter()
232    //         .max_by(|a, b| a.total_cmp(b))
233    //         .copied()
234    //         .unwrap_or(1.0);
235    //     (
236    //     idf_vec
237    //         .into_iter()
238    //         .map(|idf| (idf / max * u8::MAX as f64).ceil() as u8)
239    //         .collect(),
240    //     max
241    //     )
242    // }
243
244    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>>) -> (ZeroSpVec<u8>, f64) {
245        // Build sparse TF vector without allocating dense Vec
246        let total_count_f64 = freq.token_sum() as f64;
247        if total_count_f64 == 0.0 { return (ZeroSpVec::new(), total_count_f64); }
248        // Use f32 intermediates for u8 to reduce cost and memory
249        let total_count = total_count_f64 as f32;
250        let mut max_val = 0.0f32;
251        let inv_total = 1.0f32 / total_count;
252        let raw = freq.iter().filter_map(|(token, count)| {
253            let idx = token_dim_sample.get_index(token)?;
254            let v = (count as f32) * inv_total;
255            max_val = max_val.max(v);
256            Some((idx, v))
257        }).collect::<Vec<_>>();
258        let len = token_dim_sample.len();
259        if max_val == 0.0 { return (ZeroSpVec::new(), total_count_f64); }
260        let mul_norm = (u8::MAX as f32) / max_val; // == (1/max_val) * u8::MAX
261        let vec_u8 = raw.into_iter()
262            .map(|(idx, v)| {
263                let q = (v * mul_norm).ceil() as u8;
264                (idx, q)
265            })
266            .collect::<Vec<_>>();
267        (unsafe { ZeroSpVec::from_sparse_iter(vec_u8.into_iter(), len) }, total_count_f64)
268    }
269}