tf_idf_vectorizer/vectorizer/
tfidf.rs

1
2use num_traits::Num;
3
4use crate::{utils::datastruct::{map::IndexSet, vector::{ZeroSpVec, ZeroSpVecTrait}}, vectorizer::{corpus::Corpus, token::TokenFrequency}};
5
6pub trait TFIDFEngine<N, K>: Send + Sync
7where
8    N: Num + Copy,
9{
10    /// Method to generate the IDF vector
11    /// # Arguments
12    /// * `corpus` - The corpus
13    /// * `token_dim_sample` - Token dimension sample
14    /// # Returns
15    /// * `Vec<N>` - The IDF vector
16    /// * `denormalize_num` - Value for denormalization
17    fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<N>, f64);
18    /// Method to generate the TF vector
19    /// # Arguments
20    /// * `freq` - Token frequency
21    /// * `token_dim_sample` - Token dimension sample
22    /// # Returns
23    /// * `(ZeroSpVec<N>, f64)` - TF vector and value for denormalization
24    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>>) -> (ZeroSpVec<N>, f64);
25}
26
/// Default TF-IDF engine.
///
/// Supports the element types `f32`, `f64`, `u32`, `u16` and `u8`.
#[derive(Debug, Default)]
pub struct DefaultTFIDFEngine;

impl DefaultTFIDFEngine {
    /// Creates the engine. Equivalent to `DefaultTFIDFEngine::default()`.
    pub fn new() -> Self {
        Self
    }
}
36
37impl<K> TFIDFEngine<f32, K> for DefaultTFIDFEngine
38{
39    fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<f32>, f64) {
40        let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
41        let doc_num = corpus.get_doc_num() as f64;
42        for token in token_dim_sample.iter() {
43            let doc_freq = corpus.get_token_count(token);
44            idf_vec.push((doc_num / (doc_freq as f64 + 1.0)) as f32);
45        }
46        (idf_vec, 1.0)
47    }
48
49    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>>) -> (ZeroSpVec<f32>, f64) {
50        // Build sparse TF vector: only non-zero entries are stored
51        let total_count = freq.token_sum() as f32;
52        if total_count == 0.0 { return (ZeroSpVec::new(), total_count.into()); }
53        let len = token_dim_sample.len();
54        let inv_total = 1.0f32 / total_count;
55        let mut raw = freq.iter().map(|(token, count)| {
56            let idx = token_dim_sample.get_index(token).unwrap();
57            (idx, (count as f32) * inv_total)
58        }).collect::<Vec<_>>();
59        raw.sort_unstable_by_key(|(idx, _)| *idx);
60        (unsafe { ZeroSpVec::from_sparse_iter(raw.into_iter(), len) }, total_count.into())
61    }
62}
63
64impl<K> TFIDFEngine<f64, K> for DefaultTFIDFEngine
65{
66    fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<f64>, f64) {
67        let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
68        let doc_num = corpus.get_doc_num() as f64;
69        for token in token_dim_sample.iter() {
70            let doc_freq = corpus.get_token_count(token);
71            idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
72        }
73        (idf_vec, 1.0)
74    }
75
76    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>>) -> (ZeroSpVec<f64>, f64) {
77        // Build sparse TF vector: only non-zero entries are stored
78        let total_count = freq.token_sum() as f64;
79        if total_count == 0.0 { return (ZeroSpVec::new(), total_count.into()); }
80        let len = token_dim_sample.len();
81        let inv_total = 1.0f64 / total_count;
82        let mut raw = freq.iter().map(|(token, count)| {
83            let idx = token_dim_sample.get_index(token).unwrap();
84            (idx, (count as f64) * inv_total)
85        }).collect::<Vec<_>>();
86        raw.sort_unstable_by_key(|(idx, _)| *idx);
87        (unsafe { ZeroSpVec::from_sparse_iter(raw.into_iter(), len) }, total_count.into())
88    }
89}
90
91impl<K> TFIDFEngine<u32, K> for DefaultTFIDFEngine
92{
93    fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<u32>, f64) {
94        let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
95        let doc_num = corpus.get_doc_num() as f64;
96        for token in token_dim_sample.iter() {
97            let doc_freq = corpus.get_token_count(token);
98            idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
99        }
100        let max = idf_vec
101            .iter()
102            .max_by(|a, b| a.total_cmp(b))
103            .copied()
104            .unwrap_or(1.0);
105        (
106        idf_vec
107            .into_iter()
108            .map(|idf| (idf / max * u32::MAX as f64).ceil() as u32)
109            .collect(),
110        max
111        )
112    }
113
114    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>>) -> (ZeroSpVec<u32>, f64) {
115        // Build sparse TF vector without allocating dense Vec
116        let total_count = freq.token_sum() as f64;
117        if total_count == 0.0 { return (ZeroSpVec::new(), total_count); }
118        let mut max_val = 0.0f64;
119        let inv_total = 1.0f64 / total_count;
120        let mut raw: Vec<(usize, f64)> = freq.iter().map(|(token, count)| {
121            let idx = token_dim_sample.get_index(token).unwrap();
122            let v = (count as f64) * inv_total;
123            max_val = max_val.max(v);
124            (idx, v)
125        }).collect::<Vec<_>>();
126        let len = token_dim_sample.len();
127        let mul_norm = (u32::MAX as f64) / max_val; // == (1/max_val) * u32::MAX
128        let vec_u32 = raw.drain(..)
129            .map(|(idx, v)| {
130                let q = (v * mul_norm).ceil() as u32;
131                (idx, q)
132            })
133            .collect::<Vec<_>>();
134        (unsafe { ZeroSpVec::from_sparse_iter(vec_u32.into_iter(), len) }, total_count)
135    }
136}
137
138impl<K> TFIDFEngine<u16, K> for DefaultTFIDFEngine
139{
140    fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<u16>, f64) {
141        let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
142        let doc_num = corpus.get_doc_num() as f64;
143        for token in token_dim_sample.iter() {
144            let doc_freq = corpus.get_token_count(token);
145            idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
146        }
147        let max = idf_vec
148            .iter()
149            .max_by(|a, b| a.total_cmp(b))
150            .copied()
151            .unwrap_or(1.0);
152        (
153        idf_vec
154            .into_iter()
155            .map(|idf| (idf / max * u16::MAX as f64).ceil() as u16)
156            .collect(),
157        max
158        )
159    }
160
161    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>>) -> (ZeroSpVec<u16>, f64) {
162        // Build sparse TF vector without allocating a dense Vec<f64>
163        let total_count = freq.token_sum() as f64;
164        // First pass: compute raw tf values and track max
165        let mut max_val = 0.0f32;
166        let div_total = (1.0 / total_count) as f32;
167        let raw = freq.iter().map(|(token, count)| {
168            let idx = token_dim_sample.get_index(token).unwrap();
169            let v = (count as f32) * div_total;
170            max_val = max_val.max(v);
171            (idx, v)
172        }).collect::<Vec<_>>();
173        let len = token_dim_sample.len();
174        // Second pass: normalize into quantized u16 and build sparse vector
175    let norm_div_max = (u16::MAX as f32) / max_val; // == (1/max_val) * u16::MAX
176        let vec_u16 = raw.into_iter()
177            .map(|(idx, v)| {
178                let q = (v * norm_div_max).ceil() as u16;
179                (idx, q)
180            })
181            .collect::<Vec<_>>();
182        (unsafe { ZeroSpVec::from_sparse_iter(vec_u16.into_iter(), len) }, total_count)
183    }
184}
185
186impl<K> TFIDFEngine<u8, K> for DefaultTFIDFEngine
187{
188    fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<u8>, f64) {
189        let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
190        let doc_num = corpus.get_doc_num() as f64;
191        for token in token_dim_sample.iter() {
192            let doc_freq = corpus.get_token_count(token);
193            idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
194        }
195        let max = idf_vec
196            .iter()
197            .max_by(|a, b| a.total_cmp(b))
198            .copied()
199            .unwrap_or(1.0);
200        (
201        idf_vec
202            .into_iter()
203            .map(|idf| (idf / max * u8::MAX as f64).ceil() as u8)
204            .collect(),
205        max
206        )
207    }
208
209    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>>) -> (ZeroSpVec<u8>, f64) {
210        // Build sparse TF vector without allocating dense Vec
211        let total_count_f64 = freq.token_sum() as f64;
212        if total_count_f64 == 0.0 { return (ZeroSpVec::new(), total_count_f64); }
213        // Use f32 intermediates for u8 to reduce cost and memory
214        let total_count = total_count_f64 as f32;
215        let mut max_val = 0.0f32;
216        let inv_total = 1.0f32 / total_count;
217        let raw = freq.iter().map(|(token, count)| {
218            let idx = token_dim_sample.get_index(token).unwrap();
219            let v = (count as f32) * inv_total;
220            max_val = max_val.max(v);
221            (idx, v)
222        }).collect::<Vec<_>>();
223        let len = token_dim_sample.len();
224        if max_val == 0.0 { return (ZeroSpVec::new(), total_count_f64); }
225        let mul_norm = (u8::MAX as f32) / max_val; // == (1/max_val) * u8::MAX
226        let vec_u8 = raw.into_iter()
227            .map(|(idx, v)| {
228                let q = (v * mul_norm).ceil() as u8;
229                (idx, q)
230            })
231            .collect::<Vec<_>>();
232        (unsafe { ZeroSpVec::from_sparse_iter(vec_u8.into_iter(), len) }, total_count_f64)
233    }
234}