tf_idf_vectorizer/vectorizer/
tfidf.rs

1use num_traits::Num;
2
3use crate::{utils::datastruct::vector::{ZeroSpVec, ZeroSpVecTrait}, vectorizer::{corpus::Corpus, token::TokenFrequency}};
4
5pub trait TFIDFEngine<N, K>: Send + Sync
6where
7    N: Num + Copy,
8{
9    /// Method to generate the IDF vector
10    /// # Arguments
11    /// * `corpus` - The corpus
12    /// * `token_dim_sample` - Token dimension sample
13    /// # Returns
14    /// * `Vec<N>` - The IDF vector
15    /// * `denormalize_num` - Value for denormalization
16    fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<N>, f64);
17    /// Method to generate the TF vector
18    /// # Arguments
19    /// * `freq` - Token frequency
20    /// * `token_dim_sample` - Token dimension sample
21    /// # Returns
22    /// * `(ZeroSpVec<N>, f64)` - TF vector and value for denormalization
23    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &Vec<Box<str>>) -> (ZeroSpVec<N>, f64);
24}
25
/// Default TF-IDF engine.
///
/// Supports the element types `f32`, `f64`, `u32`, `u16` and `u8`.
///
/// The type is a stateless unit struct, so it is `Copy` and has a `Default`
/// value identical to the one returned by `DefaultTFIDFEngine::new`.
#[derive(Debug, Clone, Copy, Default)]
pub struct DefaultTFIDFEngine;

impl DefaultTFIDFEngine {
    /// Creates the engine; equivalent to `DefaultTFIDFEngine::default()`.
    pub fn new() -> Self {
        DefaultTFIDFEngine
    }
}
35
36impl<K> TFIDFEngine<f32, K> for DefaultTFIDFEngine
37{
38    fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<f32>, f64) {
39        let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
40        let doc_num = corpus.get_doc_num() as f64;
41        for token in token_dim_sample.iter() {
42            let doc_freq = corpus.get_token_count(token);
43            idf_vec.push((doc_num / (doc_freq as f64 + 1.0)) as f32);
44        }
45        (idf_vec, 1.0)
46    }
47
48    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &Vec<Box<str>>) -> (ZeroSpVec<f32>, f64) {
49        // Build sparse TF vector: only non-zero entries are stored
50        let total_count = freq.token_sum() as f32;
51        if total_count == 0.0 { return (ZeroSpVec::new(), total_count.into()); }
52        let mut raw: Vec<(usize, f32)> = Vec::with_capacity(freq.token_num());
53        let len = token_dim_sample.len();
54        let inv_total = 1.0f32 / total_count;
55        for (idx, token) in token_dim_sample.iter().enumerate() {
56            let count = freq.token_count(token) as f32;
57            if count == 0.0 { continue; }
58            raw.push((idx, count * inv_total));
59        }
60        (unsafe { ZeroSpVec::from_raw_iter(raw.into_iter(), len) }, total_count.into())
61    }
62}
63
64impl<K> TFIDFEngine<f64, K> for DefaultTFIDFEngine
65{
66    fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<f64>, f64) {
67        let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
68        let doc_num = corpus.get_doc_num() as f64;
69        for token in token_dim_sample.iter() {
70            let doc_freq = corpus.get_token_count(token);
71            idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
72        }
73        (idf_vec, 1.0)
74    }
75
76    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &Vec<Box<str>>) -> (ZeroSpVec<f64>, f64) {
77        // Build sparse TF vector: only non-zero entries are stored
78        let total_count = freq.token_sum() as f64;
79        if total_count == 0.0 { return (ZeroSpVec::new(), total_count.into()); }
80        let mut raw: Vec<(usize, f64)> = Vec::with_capacity(freq.token_num());
81        let len = token_dim_sample.len();
82        let inv_total = 1.0f64 / total_count;
83        for (idx, token) in token_dim_sample.iter().enumerate() {
84            let count = freq.token_count(token) as f64;
85            if count == 0.0 { continue; }
86            raw.push((idx, count * inv_total));
87        }
88        (unsafe { ZeroSpVec::from_raw_iter(raw.into_iter(), len) }, total_count.into())
89    }
90}
91
92impl<K> TFIDFEngine<u32, K> for DefaultTFIDFEngine
93{
94    fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<u32>, f64) {
95        let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
96        let doc_num = corpus.get_doc_num() as f64;
97        for token in token_dim_sample.iter() {
98            let doc_freq = corpus.get_token_count(token);
99            idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
100        }
101        let max = idf_vec
102            .iter()
103            .max_by(|a, b| a.total_cmp(b))
104            .copied()
105            .unwrap_or(1.0);
106        (
107        idf_vec
108            .into_iter()
109            .map(|idf| (idf / max * u32::MAX as f64).ceil() as u32)
110            .collect(),
111        max
112        )
113    }
114
115    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &Vec<Box<str>>) -> (ZeroSpVec<u32>, f64) {
116        // Build sparse TF vector without allocating dense Vec
117        let total_count = freq.token_sum() as f64;
118        if total_count == 0.0 { return (ZeroSpVec::new(), total_count); }
119        let mut raw: Vec<(usize, f64)> = Vec::with_capacity(freq.token_num());
120        let mut max_val = 0.0f64;
121        let inv_total = 1.0f64 / total_count;
122        for (idx, token) in token_dim_sample.iter().enumerate() {
123            let count = freq.token_count(token) as f64;
124            if count == 0.0 { continue; }
125            let v = count * inv_total;
126            if v > max_val { max_val = v; }
127            raw.push((idx, v));
128        }
129        let len = token_dim_sample.len();
130        if max_val == 0.0 { return (ZeroSpVec::new(), total_count); }
131        let mut vec_u32: Vec<(usize, u32)> = Vec::with_capacity(raw.len());
132        let mul_norm = (u32::MAX as f64) / max_val; // == (1/max_val) * u32::MAX
133        for (idx, v) in raw.into_iter() {
134            let q = (v * mul_norm).ceil() as u32;
135            vec_u32.push((idx, q));
136        }
137        (unsafe { ZeroSpVec::from_raw_iter(vec_u32.into_iter(), len) }, total_count)
138    }
139}
140
141impl<K> TFIDFEngine<u16, K> for DefaultTFIDFEngine
142{
143    fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<u16>, f64) {
144        let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
145        let doc_num = corpus.get_doc_num() as f64;
146        for token in token_dim_sample.iter() {
147            let doc_freq = corpus.get_token_count(token);
148            idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
149        }
150        let max = idf_vec
151            .iter()
152            .max_by(|a, b| a.total_cmp(b))
153            .copied()
154            .unwrap_or(1.0);
155        (
156        idf_vec
157            .into_iter()
158            .map(|idf| (idf / max * u16::MAX as f64).ceil() as u16)
159            .collect(),
160        max
161        )
162    }
163
164    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &Vec<Box<str>>) -> (ZeroSpVec<u16>, f64) {
165        // Build sparse TF vector without allocating a dense Vec<f64>
166        let total_count = freq.token_sum() as f64;
167        // First pass: compute raw tf values and track max
168        let mut raw: Vec<(usize, f32)> = Vec::with_capacity(freq.token_num());
169        let mut max_val = 0.0f32;
170    let div_total = (1.0 / total_count) as f32;
171        for (idx, token) in token_dim_sample.iter().enumerate() {
172            let count = freq.token_count(token);
173            if count == 0 { continue; }
174            let v = count as f32 * div_total;
175            if v > max_val { max_val = v; }
176            raw.push((idx, v));
177        }
178        let len = token_dim_sample.len();
179        if max_val == 0.0 { return (ZeroSpVec::new(), total_count); }
180        // Second pass: normalize into quantized u16 and build sparse vector
181        let mut vec_u16: Vec<(usize, u16)> = Vec::with_capacity(raw.len());
182    let norm_div_max = (u16::MAX as f32) / max_val; // == (1/max_val) * u16::MAX
183        for (idx, v) in raw.into_iter() {
184            let q = (v * norm_div_max).ceil() as u16;
185            vec_u16.push((idx, q));
186        }
187        (unsafe { ZeroSpVec::from_raw_iter(vec_u16.into_iter(), len) }, total_count)
188    }
189}
190
191impl<K> TFIDFEngine<u8, K> for DefaultTFIDFEngine
192{
193    fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<u8>, f64) {
194        let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
195        let doc_num = corpus.get_doc_num() as f64;
196        for token in token_dim_sample.iter() {
197            let doc_freq = corpus.get_token_count(token);
198            idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
199        }
200        let max = idf_vec
201            .iter()
202            .max_by(|a, b| a.total_cmp(b))
203            .copied()
204            .unwrap_or(1.0);
205        (
206        idf_vec
207            .into_iter()
208            .map(|idf| (idf / max * u8::MAX as f64).ceil() as u8)
209            .collect(),
210        max
211        )
212    }
213
214    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &Vec<Box<str>>) -> (ZeroSpVec<u8>, f64) {
215        // Build sparse TF vector without allocating dense Vec
216        let total_count_f64 = freq.token_sum() as f64;
217        if total_count_f64 == 0.0 { return (ZeroSpVec::new(), total_count_f64); }
218        // Use f32 intermediates for u8 to reduce cost and memory
219        let total_count = total_count_f64 as f32;
220        let mut raw: Vec<(usize, f32)> = Vec::with_capacity(freq.token_num());
221        let mut max_val = 0.0f32;
222        let inv_total = 1.0f32 / total_count;
223        for (idx, token) in token_dim_sample.iter().enumerate() {
224            let count = freq.token_count(token) as f32;
225            if count == 0.0 { continue; }
226            let v = count * inv_total;
227            if v > max_val { max_val = v; }
228            raw.push((idx, v));
229        }
230        let len = token_dim_sample.len();
231        if max_val == 0.0 { return (ZeroSpVec::new(), total_count_f64); }
232        let mut vec_u8: Vec<(usize, u8)> = Vec::with_capacity(raw.len());
233        let mul_norm = (u8::MAX as f32) / max_val; // == (1/max_val) * u8::MAX
234        for (idx, v) in raw.into_iter() {
235            let q = (v * mul_norm).ceil() as u8;
236            vec_u8.push((idx, q));
237        }
238        (unsafe { ZeroSpVec::from_raw_iter(vec_u8.into_iter(), len) }, total_count_f64)
239    }
240}