tf_idf_vectorizer/vectorizer/
tfidf.rs

1use num_traits::Num;
2
3use crate::{utils::datastruct::vector::{ZeroSpVec, ZeroSpVecTrait}, vectorizer::{corpus::Corpus, token::TokenFrequency}};
4
5pub trait TFIDFEngine<N, K>: Send + Sync
6where
7    N: Num + Copy,
8{
9    /// Method to generate the IDF vector
10    /// # Arguments
11    /// * `corpus` - The corpus
12    /// * `token_dim_sample` - Token dimension sample
13    /// # Returns
14    /// * `Vec<N>` - The IDF vector
15    /// * `denormalize_num` - Value for denormalization
16    fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<N>, f64);
17    /// Method to generate the TF vector
18    /// # Arguments
19    /// * `freq` - Token frequency
20    /// * `token_dim_sample` - Token dimension sample
21    /// # Returns
22    /// * `(ZeroSpVec<N>, f64)` - TF vector and value for denormalization
23    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &Vec<Box<str>>) -> (ZeroSpVec<N>, f64);
24}
25
/// The default TF-IDF engine.
///
/// Supports the element types `f32`, `f64`, `u32`, `u16` and `u8`.
///
/// The type is a stateless unit struct, so it is `Copy` and its `Default`
/// value is the same as the one returned by [`DefaultTFIDFEngine::new`].
#[derive(Debug, Default, Clone, Copy)]
pub struct DefaultTFIDFEngine;

impl DefaultTFIDFEngine {
    /// Creates the engine. Equivalent to `DefaultTFIDFEngine::default()`.
    pub const fn new() -> Self {
        Self
    }
}
35
36impl<K> TFIDFEngine<f32, K> for DefaultTFIDFEngine
37{
38    fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<f32>, f64) {
39        let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
40        let doc_num = corpus.get_doc_num() as f64;
41        for token in token_dim_sample.iter() {
42            let doc_freq = corpus.get_token_count(token);
43            idf_vec.push((doc_num / (doc_freq as f64 + 1.0)) as f32);
44        }
45        (idf_vec, 1.0)
46    }
47
48    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &Vec<Box<str>>) -> (ZeroSpVec<f32>, f64) {
49        // Build sparse TF vector: only non-zero entries are stored
50        let total_count = freq.token_sum() as f32;
51        if total_count == 0.0 { return (ZeroSpVec::new(), total_count.into()); }
52        let mut raw: Vec<(usize, f32)> = Vec::with_capacity(freq.token_num());
53        let len = token_dim_sample.len();
54        let inv_total = 1.0f32 / total_count;
55        for (idx, token) in token_dim_sample.iter().enumerate() {
56            let count = freq.token_count(token) as f32;
57            if count == 0.0 { continue; }
58            raw.push((idx, count * inv_total));
59        }
60        (unsafe { ZeroSpVec::from_raw_iter(raw.into_iter(), len) }, total_count.into())
61    }
62}
63
64impl<K> TFIDFEngine<f64, K> for DefaultTFIDFEngine
65{
66    fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<f64>, f64) {
67        let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
68        let doc_num = corpus.get_doc_num() as f64;
69        for token in token_dim_sample.iter() {
70            let doc_freq = corpus.get_token_count(token);
71            idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
72        }
73        (idf_vec, 1.0)
74    }
75
76    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &Vec<Box<str>>) -> (ZeroSpVec<f64>, f64) {
77        // Build sparse TF vector: only non-zero entries are stored
78        let total_count = freq.token_sum() as f64;
79        if total_count == 0.0 { return (ZeroSpVec::new(), total_count.into()); }
80        let mut raw: Vec<(usize, f64)> = Vec::with_capacity(freq.token_num());
81        let len = token_dim_sample.len();
82        let inv_total = 1.0f64 / total_count;
83        for (idx, token) in token_dim_sample.iter().enumerate() {
84            let count = freq.token_count(token) as f64;
85            if count == 0.0 { continue; }
86            raw.push((idx, count * inv_total));
87        }
88        (unsafe { ZeroSpVec::from_raw_iter(raw.into_iter(), len) }, total_count.into())
89    }
90}
91
92impl<K> TFIDFEngine<u32, K> for DefaultTFIDFEngine
93{
94    fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<u32>, f64) {
95        let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
96        let doc_num = corpus.get_doc_num() as f64;
97        for token in token_dim_sample.iter() {
98            let doc_freq = corpus.get_token_count(token);
99            idf_vec.push((doc_num / (doc_freq as f64 + 1.0)) as u32);
100        }
101        (idf_vec, 1.0)
102    }
103
104    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &Vec<Box<str>>) -> (ZeroSpVec<u32>, f64) {
105        // Build sparse TF vector without allocating dense Vec
106        let total_count = freq.token_sum() as f64;
107        if total_count == 0.0 { return (ZeroSpVec::new(), total_count); }
108        let mut raw: Vec<(usize, f64)> = Vec::with_capacity(freq.token_num());
109        let mut max_val = 0.0f64;
110        let inv_total = 1.0f64 / total_count;
111        for (idx, token) in token_dim_sample.iter().enumerate() {
112            let count = freq.token_count(token) as f64;
113            if count == 0.0 { continue; }
114            let v = count * inv_total;
115            if v > max_val { max_val = v; }
116            raw.push((idx, v));
117        }
118        let len = token_dim_sample.len();
119        if max_val == 0.0 { return (ZeroSpVec::new(), total_count); }
120        let mut vec_u32: Vec<(usize, u32)> = Vec::with_capacity(raw.len());
121        let mul_norm = (u32::MAX as f64) / max_val; // == (1/max_val) * u32::MAX
122        for (idx, v) in raw.into_iter() {
123            let q = (v * mul_norm).ceil() as u32;
124            vec_u32.push((idx, q));
125        }
126        (unsafe { ZeroSpVec::from_raw_iter(vec_u32.into_iter(), len) }, total_count)
127    }
128}
129
130impl<K> TFIDFEngine<u16, K> for DefaultTFIDFEngine
131{
132    fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<u16>, f64) {
133        let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
134        let doc_num = corpus.get_doc_num() as f64;
135        for token in token_dim_sample.iter() {
136            let doc_freq = corpus.get_token_count(token);
137            idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
138        }
139        let max = idf_vec
140            .iter()
141            .max_by(|a, b| a.total_cmp(b))
142            .copied()
143            .unwrap_or(1.0);
144        (
145        idf_vec
146            .into_iter()
147            .map(|idf| (idf / max * u16::MAX as f64).ceil() as u16)
148            .collect(),
149        max
150        )
151    }
152
153    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &Vec<Box<str>>) -> (ZeroSpVec<u16>, f64) {
154        // Build sparse TF vector without allocating a dense Vec<f64>
155        let total_count = freq.token_sum() as f64;
156        // First pass: compute raw tf values and track max
157        let mut raw: Vec<(usize, f32)> = Vec::with_capacity(freq.token_num());
158        let mut max_val = 0.0f32;
159    let div_total = (1.0 / total_count) as f32;
160        for (idx, token) in token_dim_sample.iter().enumerate() {
161            let count = freq.token_count(token);
162            if count == 0 { continue; }
163            let v = count as f32 * div_total;
164            if v > max_val { max_val = v; }
165            raw.push((idx, v));
166        }
167        let len = token_dim_sample.len();
168        if max_val == 0.0 { return (ZeroSpVec::new(), total_count); }
169        // Second pass: normalize into quantized u16 and build sparse vector
170        let mut vec_u16: Vec<(usize, u16)> = Vec::with_capacity(raw.len());
171    let norm_div_max = (u16::MAX as f32) / max_val; // == (1/max_val) * u16::MAX
172        for (idx, v) in raw.into_iter() {
173            let q = (v * norm_div_max).ceil() as u16;
174            vec_u16.push((idx, q));
175        }
176        (unsafe { ZeroSpVec::from_raw_iter(vec_u16.into_iter(), len) }, total_count)
177    }
178}
179
180impl<K> TFIDFEngine<u8, K> for DefaultTFIDFEngine
181{
182    fn idf_vec(corpus: &Corpus, token_dim_sample: &Vec<Box<str>>) -> (Vec<u8>, f64) {
183        let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
184        let doc_num = corpus.get_doc_num() as f64;
185        for token in token_dim_sample.iter() {
186            let doc_freq = corpus.get_token_count(token);
187            idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
188        }
189        let max = idf_vec
190            .iter()
191            .max_by(|a, b| a.total_cmp(b))
192            .copied()
193            .unwrap_or(1.0);
194        (
195        idf_vec
196            .into_iter()
197            .map(|idf| (idf / max * u8::MAX as f64).ceil() as u8)
198            .collect(),
199        max
200        )
201    }
202
203    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &Vec<Box<str>>) -> (ZeroSpVec<u8>, f64) {
204        // Build sparse TF vector without allocating dense Vec
205        let total_count_f64 = freq.token_sum() as f64;
206        if total_count_f64 == 0.0 { return (ZeroSpVec::new(), total_count_f64); }
207        // Use f32 intermediates for u8 to reduce cost and memory
208        let total_count = total_count_f64 as f32;
209        let mut raw: Vec<(usize, f32)> = Vec::with_capacity(freq.token_num());
210        let mut max_val = 0.0f32;
211        let inv_total = 1.0f32 / total_count;
212        for (idx, token) in token_dim_sample.iter().enumerate() {
213            let count = freq.token_count(token) as f32;
214            if count == 0.0 { continue; }
215            let v = count * inv_total;
216            if v > max_val { max_val = v; }
217            raw.push((idx, v));
218        }
219        let len = token_dim_sample.len();
220        if max_val == 0.0 { return (ZeroSpVec::new(), total_count_f64); }
221        let mut vec_u8: Vec<(usize, u8)> = Vec::with_capacity(raw.len());
222        let mul_norm = (u8::MAX as f32) / max_val; // == (1/max_val) * u8::MAX
223        for (idx, v) in raw.into_iter() {
224            let q = (v * mul_norm).ceil() as u8;
225            vec_u8.push((idx, q));
226        }
227        (unsafe { ZeroSpVec::from_raw_iter(vec_u8.into_iter(), len) }, total_count_f64)
228    }
229}