tf_idf_vectorizer/vectorizer/
tfidf.rs

1use ahash::RandomState;
2use indexmap::IndexSet;
3use num_traits::Num;
4
5use crate::{utils::math::vector::{ZeroSpVec, ZeroSpVecTrait}, vectorizer::{corpus::Corpus, token::TokenFrequency}};
6
7pub trait TFIDFEngine<N>: Send + Sync
8where
9    N: Num + Copy,
10{
11    /// Method to generate the IDF vector
12    /// # Arguments
13    /// * `corpus` - The corpus
14    /// * `token_dim_sample` - Token dimension sample
15    /// # Returns
16    /// * `Vec<N>` - The IDF vector
17    /// * `denormalize_num` - Value for denormalization
18    fn idf_vec(corpus: &Corpus, token_dim_sample: &IndexSet<Box<str>, RandomState>) -> (Vec<N>, f64);
19    /// Method to generate the TF vector
20    /// # Arguments
21    /// * `freq` - Token frequency
22    /// * `token_dim_sample` - Token dimension sample
23    /// # Returns
24    /// * `(ZeroSpVec<N>, f64)` - TF vector and value for denormalization
25    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>, RandomState>) -> (ZeroSpVec<N>, f64);
26}
27
/// Default TF-IDF engine.
///
/// Supports the element types `f32`, `f64`, `u32`, `u16` and `u8`.
///
/// The type is a stateless unit struct, so it is `Copy` and `Default`;
/// `DefaultTFIDFEngine::default()` and `DefaultTFIDFEngine::new()` are
/// interchangeable.
#[derive(Debug, Clone, Copy, Default)]
pub struct DefaultTFIDFEngine;

impl DefaultTFIDFEngine {
    /// Creates the engine value.
    pub fn new() -> Self {
        DefaultTFIDFEngine
    }
}
37
38impl TFIDFEngine<f32> for DefaultTFIDFEngine
39{
40    fn idf_vec(corpus: &Corpus, token_dim_sample: &IndexSet<Box<str>, RandomState>) -> (Vec<f32>, f64) {
41        let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
42        let doc_num = corpus.get_doc_num() as f64;
43        for token in token_dim_sample {
44            let doc_freq = corpus.get_token_count(token);
45            idf_vec.push((doc_num / (doc_freq as f64 + 1.0)) as f32);
46        }
47        (idf_vec, 1.0)
48    }
49
50    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>, RandomState>) -> (ZeroSpVec<f32>, f64) {
51        // Build sparse TF vector: only non-zero entries are stored
52        let total_count = freq.token_sum() as f32;
53        if total_count == 0.0 { return (ZeroSpVec::new(), total_count.into()); }
54        let mut raw: Vec<(usize, f32)> = Vec::with_capacity(freq.token_num());
55        let len = token_dim_sample.len();
56        let inv_total = 1.0f32 / total_count;
57        for (idx, token) in token_dim_sample.iter().enumerate() {
58            let count = freq.token_count(token) as f32;
59            if count == 0.0 { continue; }
60            raw.push((idx, count * inv_total));
61        }
62        (unsafe { ZeroSpVec::from_raw_iter(raw.into_iter(), len) }, total_count.into())
63    }
64}
65
66impl TFIDFEngine<f64> for DefaultTFIDFEngine
67{
68    fn idf_vec(corpus: &Corpus, token_dim_sample: &IndexSet<Box<str>, RandomState>) -> (Vec<f64>, f64) {
69        let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
70        let doc_num = corpus.get_doc_num() as f64;
71        for token in token_dim_sample {
72            let doc_freq = corpus.get_token_count(token);
73            idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
74        }
75        (idf_vec, 1.0)
76    }
77
78    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>, RandomState>) -> (ZeroSpVec<f64>, f64) {
79        // Build sparse TF vector: only non-zero entries are stored
80        let total_count = freq.token_sum() as f64;
81        if total_count == 0.0 { return (ZeroSpVec::new(), total_count.into()); }
82        let mut raw: Vec<(usize, f64)> = Vec::with_capacity(freq.token_num());
83        let len = token_dim_sample.len();
84        let inv_total = 1.0f64 / total_count;
85        for (idx, token) in token_dim_sample.iter().enumerate() {
86            let count = freq.token_count(token) as f64;
87            if count == 0.0 { continue; }
88            raw.push((idx, count * inv_total));
89        }
90        (unsafe { ZeroSpVec::from_raw_iter(raw.into_iter(), len) }, total_count.into())
91    }
92}
93
94impl TFIDFEngine<u32> for DefaultTFIDFEngine
95{
96    fn idf_vec(corpus: &Corpus, token_dim_sample: &IndexSet<Box<str>, RandomState>) -> (Vec<u32>, f64) {
97        let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
98        let doc_num = corpus.get_doc_num() as f64;
99        for token in token_dim_sample {
100            let doc_freq = corpus.get_token_count(token);
101            idf_vec.push((doc_num / (doc_freq as f64 + 1.0)) as u32);
102        }
103        (idf_vec, 1.0)
104    }
105
106    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>, RandomState>) -> (ZeroSpVec<u32>, f64) {
107        // Build sparse TF vector without allocating dense Vec
108        let total_count = freq.token_sum() as f64;
109        if total_count == 0.0 { return (ZeroSpVec::new(), total_count); }
110        let mut raw: Vec<(usize, f64)> = Vec::with_capacity(freq.token_num());
111        let mut max_val = 0.0f64;
112        let inv_total = 1.0f64 / total_count;
113        for (idx, token) in token_dim_sample.iter().enumerate() {
114            let count = freq.token_count(token) as f64;
115            if count == 0.0 { continue; }
116            let v = count * inv_total;
117            if v > max_val { max_val = v; }
118            raw.push((idx, v));
119        }
120        let len = token_dim_sample.len();
121        if max_val == 0.0 { return (ZeroSpVec::new(), total_count); }
122        let mut vec_u32: Vec<(usize, u32)> = Vec::with_capacity(raw.len());
123        let mul_norm = (u32::MAX as f64) / max_val; // == (1/max_val) * u32::MAX
124        for (idx, v) in raw.into_iter() {
125            let q = (v * mul_norm).ceil() as u32;
126            vec_u32.push((idx, q));
127        }
128        (unsafe { ZeroSpVec::from_raw_iter(vec_u32.into_iter(), len) }, total_count)
129    }
130}
131
132impl TFIDFEngine<u16> for DefaultTFIDFEngine
133{
134    fn idf_vec(corpus: &Corpus, token_dim_sample: &IndexSet<Box<str>, RandomState>) -> (Vec<u16>, f64) {
135        let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
136        let doc_num = corpus.get_doc_num() as f64;
137        for token in token_dim_sample {
138            let doc_freq = corpus.get_token_count(token);
139            idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
140        }
141        let max = idf_vec
142            .iter()
143            .max_by(|a, b| a.total_cmp(b))
144            .copied()
145            .unwrap_or(1.0);
146        (
147        idf_vec
148            .into_iter()
149            .map(|idf| (idf / max * u16::MAX as f64).ceil() as u16)
150            .collect(),
151        max
152        )
153    }
154
155    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>, RandomState>) -> (ZeroSpVec<u16>, f64) {
156        // Build sparse TF vector without allocating a dense Vec<f64>
157        let total_count = freq.token_sum() as f64;
158        // First pass: compute raw tf values and track max
159        let mut raw: Vec<(usize, f32)> = Vec::with_capacity(freq.token_num());
160        let mut max_val = 0.0f32;
161    let div_total = (1.0 / total_count) as f32;
162        for (idx, token) in token_dim_sample.iter().enumerate() {
163            let count = freq.token_count(token);
164            if count == 0 { continue; }
165            let v = count as f32 * div_total;
166            if v > max_val { max_val = v; }
167            raw.push((idx, v));
168        }
169        let len = token_dim_sample.len();
170        if max_val == 0.0 { return (ZeroSpVec::new(), total_count); }
171        // Second pass: normalize into quantized u16 and build sparse vector
172        let mut vec_u16: Vec<(usize, u16)> = Vec::with_capacity(raw.len());
173    let norm_div_max = (u16::MAX as f32) / max_val; // == (1/max_val) * u16::MAX
174        for (idx, v) in raw.into_iter() {
175            let q = (v * norm_div_max).ceil() as u16;
176            vec_u16.push((idx, q));
177        }
178        (unsafe { ZeroSpVec::from_raw_iter(vec_u16.into_iter(), len) }, total_count)
179    }
180}
181
182impl TFIDFEngine<u8> for DefaultTFIDFEngine
183{
184    fn idf_vec(corpus: &Corpus, token_dim_sample: &IndexSet<Box<str>, RandomState>) -> (Vec<u8>, f64) {
185        let mut idf_vec = Vec::with_capacity(token_dim_sample.len());
186        let doc_num = corpus.get_doc_num() as f64;
187        for token in token_dim_sample {
188            let doc_freq = corpus.get_token_count(token);
189            idf_vec.push(doc_num / (doc_freq as f64 + 1.0));
190        }
191        let max = idf_vec
192            .iter()
193            .max_by(|a, b| a.total_cmp(b))
194            .copied()
195            .unwrap_or(1.0);
196        (
197        idf_vec
198            .into_iter()
199            .map(|idf| (idf / max * u8::MAX as f64).ceil() as u8)
200            .collect(),
201        max
202        )
203    }
204
205    fn tf_vec(freq: &TokenFrequency, token_dim_sample: &IndexSet<Box<str>, RandomState>) -> (ZeroSpVec<u8>, f64) {
206        // Build sparse TF vector without allocating dense Vec
207        let total_count_f64 = freq.token_sum() as f64;
208        if total_count_f64 == 0.0 { return (ZeroSpVec::new(), total_count_f64); }
209        // Use f32 intermediates for u8 to reduce cost and memory
210        let total_count = total_count_f64 as f32;
211        let mut raw: Vec<(usize, f32)> = Vec::with_capacity(freq.token_num());
212        let mut max_val = 0.0f32;
213        let inv_total = 1.0f32 / total_count;
214        for (idx, token) in token_dim_sample.iter().enumerate() {
215            let count = freq.token_count(token) as f32;
216            if count == 0.0 { continue; }
217            let v = count * inv_total;
218            if v > max_val { max_val = v; }
219            raw.push((idx, v));
220        }
221        let len = token_dim_sample.len();
222        if max_val == 0.0 { return (ZeroSpVec::new(), total_count_f64); }
223        let mut vec_u8: Vec<(usize, u8)> = Vec::with_capacity(raw.len());
224        let mul_norm = (u8::MAX as f32) / max_val; // == (1/max_val) * u8::MAX
225        for (idx, v) in raw.into_iter() {
226            let q = (v * mul_norm).ceil() as u8;
227            vec_u8.push((idx, q));
228        }
229        (unsafe { ZeroSpVec::from_raw_iter(vec_u8.into_iter(), len) }, total_count_f64)
230    }
231}