tf_idf_vectorizer/vectorizer/token.rs

use core::str;
use std::{collections::{hash_map::Keys, HashMap, HashSet}, fmt::Debug};

use fst::Map;
use serde::{Deserialize, Serialize};
use rayon::prelude::*;

/*
Note: parallel (rayon) operations do not guarantee element order, so take
care when ordering matters.
*/

/// TokenFrequency: per-token occurrence counts plus a running total token count.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct TokenFrequency {
    pub token_count: HashMap<String, u32>,
    pub total_token_count: u64,
}

impl TokenFrequency {
    pub fn new() -> Self {
        TokenFrequency {
            token_count: HashMap::new(),
            total_token_count: 0,
        }
    }

    #[inline(always)]
    pub fn add_token(&mut self, token: &str) -> &mut Self {
        let count = self.token_count.entry(token.to_string()).or_insert(0);
        *count += 1;
        self.total_token_count += 1;
        self
    }

    #[inline(always)]
    pub fn add_token_n(&mut self, token: &str, n: u32) -> &mut Self {
        let count = self.token_count.entry(token.to_string()).or_insert(0);
        *count += n;
        self.total_token_count += n as u64;
        self
    }

    #[inline(always)]
    pub fn add_tokens(&mut self, tokens: &[&str]) -> &mut Self {
        for &token in tokens {
            let count = self.token_count.entry(token.to_string()).or_insert(0);
            *count += 1;
            self.total_token_count += 1;
        }
        self
    }

    #[inline(always)]
    pub fn add_tokens_string(&mut self, tokens: &[String]) -> &mut Self {
        for token in tokens {
            let count = self.token_count.entry(token.clone()).or_insert(0);
            *count += 1;
            self.total_token_count += 1;
        }
        self
    }

    #[inline(always)]
    pub fn sub_token(&mut self, token: &str) -> &mut Self {
        if let Some(count) = self.token_count.get_mut(token) {
            if *count > 0 {
                *count -= 1;
                self.total_token_count -= 1;
            }
        }
        self
    }

    #[inline(always)]
    pub fn sub_token_n(&mut self, token: &str, n: u32) -> &mut Self {
        if let Some(count) = self.token_count.get_mut(token) {
            if *count >= n {
                *count -= n;
                self.total_token_count -= n as u64;
            }
        }
        self
    }

    #[inline(always)]
    pub fn sub_tokens(&mut self, tokens: &[&str]) -> &mut Self {
        for &token in tokens {
            if let Some(count) = self.token_count.get_mut(token) {
                if *count > 0 {
                    *count -= 1;
                    self.total_token_count -= 1;
                }
            }
        }
        self
    }

    #[inline(always)]
    pub fn sub_tokens_string(&mut self, tokens: &[String]) -> &mut Self {
        for token in tokens {
            if let Some(count) = self.token_count.get_mut(token.as_str()) {
                if *count > 0 {
                    *count -= 1;
                    self.total_token_count -= 1;
                }
            }
        }
        self
    }

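    /// Log-normalized term frequency: ln(count + 1) / ln(max_count + 1).
    /// The most frequent token always maps to 1.0; for example, with
    /// max_count = 2 a token seen once yields ln(2)/ln(3) ≈ 0.63.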
    #[inline(always)]
    pub fn tf_calc(max_count: u32, count: u32) -> f64 {
        (count as f64 + 1.0).ln() / (max_count as f64 + 1.0).ln()
    }

    #[inline(always)]
    pub fn tf_calc_as_u16(max_count: u32, count: u32) -> u16 {
        let normalized_value = (count as f64 + 1.0).ln() / (max_count as f64 + 1.0).ln();
        // scale to 0..=65535
        (normalized_value * 65535.0).round() as u16
    }

    #[inline(always)]
    pub fn tf_calc_as_u32(max_count: u32, count: u32) -> u32 {
        let normalized_value = (count as f64 + 1.0).ln() / (max_count as f64 + 1.0).ln();
        // scale to 0..=4294967295
        (normalized_value * 4294967295.0).round() as u32
    }

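    // The getters below expose the same scaled TF table in several shapes:
    // owned vs. borrowed keys, Vec vs. HashMap, sequential vs. rayon-parallel.
    // Illustrative call (assuming a populated `freq: TokenFrequency`):
    //     let tf: Vec<(String, u16)> = freq.get_tf_vector();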
    // Get a Vec<(String, u16)> of scaled TF values
    #[inline(always)]
    pub fn get_tf_vector(&self) -> Vec<(String, u16)> {
        let max_count = self.get_most_frequent_token_count();
        self.token_count
            .iter()
            .map(|(token, &count)| {
                (token.clone(), Self::tf_calc_as_u16(max_count, count))
            })
            .collect()
    }

    // Get a Vec<(String, u16)> using parallel processing
    #[inline(always)]
    pub fn get_tf_vector_parallel(&self) -> Vec<(String, u16)> {
        let max_count = self.get_most_frequent_token_count();
        self.token_count
            .par_iter()
            .map(|(token, &count)| {
                (token.clone(), Self::tf_calc_as_u16(max_count, count))
            })
            .collect()
    }

    // Get a Vec<(&str, u16)>
    #[inline(always)]
    pub fn get_tf_vector_ref(&self) -> Vec<(&str, u16)> {
        let max_count = self.get_most_frequent_token_count();
        self.token_count
            .iter()
            .map(|(token, &count)| {
                (token.as_str(), Self::tf_calc_as_u16(max_count, count))
            })
            .collect()
    }

    // Get a Vec<(&str, u16)> using parallel processing
    #[inline(always)]
    pub fn get_tf_vector_ref_parallel(&self) -> Vec<(&str, u16)> {
        let max_count = self.get_most_frequent_token_count();
        self.token_count
            .par_iter()
            .map(|(token, &count)| {
                (token.as_str(), Self::tf_calc_as_u16(max_count, count))
            })
            .collect()
    }

    // Get a HashMap<String, u16>
    #[inline(always)]
    pub fn get_tf_hashmap(&self) -> HashMap<String, u16> {
        let max_count = self.get_most_frequent_token_count();
        self.token_count
            .iter()
            .map(|(token, &count)| {
                (token.clone(), Self::tf_calc_as_u16(max_count, count))
            })
            .collect()
    }

    // Get a HashMap<String, u16> using parallel processing
    #[inline(always)]
    pub fn get_tf_hashmap_parallel(&self) -> HashMap<String, u16> {
        let max_count = self.get_most_frequent_token_count();
        self.token_count
            .par_iter()
            .map(|(token, &count)| {
                (token.clone(), Self::tf_calc_as_u16(max_count, count))
            })
            .collect()
    }

    // Get a HashMap<&str, u16>
    #[inline(always)]
    pub fn get_tf_hashmap_ref(&self) -> HashMap<&str, u16> {
        let max_count = self.get_most_frequent_token_count();
        self.token_count
            .iter()
            .map(|(token, &count)| {
                (token.as_str(), Self::tf_calc_as_u16(max_count, count))
            })
            .collect()
    }

    // Get a HashMap<&str, u16> using parallel processing
    #[inline(always)]
    pub fn get_tf_hashmap_ref_parallel(&self) -> HashMap<&str, u16> {
        let max_count = self.get_most_frequent_token_count();
        self.token_count
            .par_iter()
            .map(|(token, &count)| {
                (token.as_str(), Self::tf_calc_as_u16(max_count, count))
            })
            .collect()
    }

    // Get the TF of a single token
    #[inline(always)]
    pub fn get_token_tf(&self, token: &str) -> u16 {
        let max_count = self.get_most_frequent_token_count();
        let count = self.token_count.get(token).copied().unwrap_or(0);
        Self::tf_calc_as_u16(max_count, count)
    }

    /// Maximum possible IDF, assuming every indexed token occurs in at least
    /// one document (doc_count >= 1): ln(1 + total_doc_count / 2).
    #[inline(always)]
    pub fn idf_max(&self, total_doc_count: u64) -> f64 {
        (1.0 + total_doc_count as f64 / 2.0).ln()
    }

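    /// Smoothed IDF normalized by `max_idf`: ln(1 + N / (1 + doc_count)) / max_idf,
    /// where N is the total document count. Worked example: a token occurring
    /// in 1 of 10 documents yields ln(1 + 10/2) / idf_max(10) = 1.0.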
    #[inline(always)]
    pub fn idf_calc(total_doc_count: u64, max_idf: f64, doc_count: u32) -> f64 {
        (1.0 + total_doc_count as f64 / (1.0 + doc_count as f64)).ln() / max_idf
    }

    #[inline(always)]
    pub fn idf_calc_as_u16(total_doc_count: u64, max_idf: f64, doc_count: u32) -> u16 {
        let normalized_value = (1.0 + total_doc_count as f64 / (1.0 + doc_count as f64)).ln() / max_idf;
        // scale to 0..=65535
        (normalized_value * 65535.0).round() as u16
    }

    #[inline(always)]
    pub fn idf_calc_as_u32(total_doc_count: u64, max_idf: f64, doc_count: u32) -> u32 {
        let normalized_value = (1.0 + total_doc_count as f64 / (1.0 + doc_count as f64)).ln() / max_idf;
        // scale to 0..=4294967295
        (normalized_value * 4294967295.0).round() as u32
    }

    #[inline(always)]
    pub fn get_idf_vector(&self, total_doc_count: u64) -> Vec<(String, u16)> {
        let max_idf = self.idf_max(total_doc_count);
        self.token_count
            .iter()
            .map(|(token, &doc_count)| {
                let idf = Self::idf_calc_as_u16(total_doc_count, max_idf, doc_count);
                (token.clone(), idf)
            })
            .collect()
    }

    #[inline(always)]
    pub fn get_idf_vector_ref(&self, total_doc_count: u64) -> Vec<(&str, u16)> {
        let max_idf = self.idf_max(total_doc_count);
        self.token_count.iter().map(|(token, &doc_count)| {
            let idf = Self::idf_calc_as_u16(total_doc_count, max_idf, doc_count);
            (token.as_str(), idf)
        }).collect()
    }

    #[inline(always)]
    pub fn get_idf_vector_parallel(&self, total_doc_count: u64) -> Vec<(String, u16)> {
        let max_idf = self.idf_max(total_doc_count);
        self.token_count
            .par_iter()
            .map(|(token, &doc_count)| {
                let idf = Self::idf_calc_as_u16(total_doc_count, max_idf, doc_count);
                (token.clone(), idf)
            })
            .collect()
    }

    #[inline(always)]
    pub fn get_idf_vector_ref_parallel(&self, total_doc_count: u64) -> Vec<(&str, u16)> {
        let max_idf = self.idf_max(total_doc_count);
        self.token_count.par_iter().map(|(token, &doc_count)| {
            let idf = Self::idf_calc_as_u16(total_doc_count, max_idf, doc_count);
            (token.as_str(), idf)
        }).collect()
    }

    #[inline(always)]
    pub fn get_idf_hashmap(&self, total_doc_count: u64) -> HashMap<String, u16> {
        let max_idf = self.idf_max(total_doc_count);
        self.token_count
            .iter()
            .map(|(token, &doc_count)| {
                let idf = Self::idf_calc_as_u16(total_doc_count, max_idf, doc_count);
                (token.clone(), idf)
            })
            .collect()
    }

    #[inline(always)]
    pub fn get_idf_hashmap_ref(&self, total_doc_count: u64) -> HashMap<&str, u16> {
        let max_idf = self.idf_max(total_doc_count);
        self.token_count.iter().map(|(token, &doc_count)| {
            let idf = Self::idf_calc_as_u16(total_doc_count, max_idf, doc_count);
            (token.as_str(), idf)
        }).collect()
    }

    #[inline(always)]
    pub fn get_idf_hashmap_parallel(&self, total_doc_count: u64) -> HashMap<String, u16> {
        let max_idf = self.idf_max(total_doc_count);
        self.token_count
            .par_iter()
            .map(|(token, &doc_count)| {
                let idf = Self::idf_calc_as_u16(total_doc_count, max_idf, doc_count);
                (token.clone(), idf)
            })
            .collect()
    }

    #[inline(always)]
    pub fn get_idf_hashmap_ref_parallel(&self, total_doc_count: u64) -> HashMap<&str, u16> {
        let max_idf = self.idf_max(total_doc_count);
        self.token_count.par_iter().map(|(token, &doc_count)| {
            let idf = Self::idf_calc_as_u16(total_doc_count, max_idf, doc_count);
            (token.as_str(), idf)
        }).collect()
    }

    #[inline(always)]
    pub fn get_token_count_vector(&self) -> Vec<(String, u32)> {
        self.token_count.iter().map(|(token, &count)| {
            (token.clone(), count)
        }).collect()
    }

    #[inline(always)]
    pub fn get_token_count_hashmap(&self) -> HashMap<String, u32> {
        self.token_count.clone()
    }

    #[inline(always)]
    pub fn get_token_count_hashmap_ref(&self) -> HashMap<&str, u32> {
        self.token_count.iter().map(|(token, &count)| {
            (token.as_str(), count)
        }).collect()
    }

    #[inline(always)]
    pub fn get_total_token_count(&self) -> u64 {
        self.total_token_count
    }

    #[inline(always)]
    pub fn get_total_token_count_ref(&self) -> &u64 {
        &self.total_token_count
    }

    #[inline(always)]
    pub fn get_token_count(&self, token: &str) -> u32 {
        *self.token_count.get(token).unwrap_or(&0)
    }

    #[inline(always)]
    pub fn get_token_count_ref(&self, token: &str) -> &u32 {
        self.token_count.get(token).unwrap_or(&0)
    }

    #[inline(always)]
    pub fn get_most_frequent_tokens(&self) -> Vec<(String, u32)> {
        if let Some(&max_count) = self.token_count.values().max() {
            self.token_count.iter()
                .filter(|&(_, &count)| count == max_count)
                .map(|(token, &count)| (token.clone(), count))
                .collect()
        } else {
            Vec::new()
        }
    }

    #[inline(always)]
    pub fn get_most_frequent_token_count(&self) -> u32 {
        self.token_count.values().max().copied().unwrap_or(0)
    }

    #[inline(always)]
    pub fn get_most_frequent_tokens_parallel(&self) -> Vec<(String, u32)> {
        if self.token_count.is_empty() {
            return Vec::new();
        }
        let max_frequency = self
            .token_count
            .par_iter()
            .map(|(_, &count)| count)
            .max()
            .unwrap();
        self.token_count
            .par_iter()
            .filter(|&(_, &count)| count == max_frequency)
            .map(|(token, &count)| (token.clone(), count))
            .collect()
    }

    #[inline(always)]
    pub fn tfidf_calc(tf: f64, idf: f64) -> f64 {
        tf * idf
    }

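    /// Approximate fixed-point multiply on the 0..=65535 scale:
    /// ceil(tf * idf / 65536), so 65535 * 65535 maps back to 65535
    /// and a zero factor maps to 0.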
    #[inline(always)]
    pub fn tfidf_calc_as_u16(tf: u16, idf: u16) -> u16 {
        let product = tf as u32 * idf as u32;
        ((product + 65_535) / 65_536) as u16
    }

    #[inline(always)]
    pub fn tfidf_calc_as_u32(tf: u32, idf: u32) -> u32 {
        let product = tf as u64 * idf as u64;
        ((product + 4_294_967_295) / 4_294_967_296) as u32
    }

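    // Typical flow (illustrative; `corpus_freq`, `doc_freq`, and `doc_count` are
    // assumed caller-side values): build one TokenFrequency per document, derive
    // an IDF map from a corpus-level TokenFrequency whose counts are document
    // frequencies, then combine the two with the getters below:
    //     let idf_map: HashMap<String, u16> = corpus_freq.get_idf_hashmap(doc_count);
    //     let tfidf = doc_freq.get_tfidf_vector(&idf_map);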
    #[inline(always)]
    pub fn get_tfidf_vector(&self, idf_map: &HashMap<String, u16>) -> Vec<(String, u16)> {
        let max_count = self.get_most_frequent_token_count();
        self.token_count.iter().map(|(token, &count)| {
            let tf = Self::tf_calc_as_u16(max_count, count);
            let idf = idf_map.get(token).copied().unwrap_or(0);
            (token.clone(), Self::tfidf_calc_as_u16(tf, idf))
        }).collect()
    }

    #[inline(always)]
    pub fn get_tfidf_vector_fst(&self, idf_map: &Map<Vec<u8>>) -> Vec<(String, u16)> {
        let max_count = self.get_most_frequent_token_count();
        self.token_count.iter().map(|(token, &count)| {
            let tf = Self::tf_calc_as_u16(max_count, count);
            let idf = match idf_map.get(token.as_bytes()) {
                Some(idf) => idf as u16,
                None => 0,
            };
            (token.clone(), Self::tfidf_calc_as_u16(tf, idf))
        }).collect()
    }

    #[inline(always)]
    pub fn get_tfidf_hashmap(&self, idf_map: &HashMap<String, u16>) -> HashMap<String, u16> {
        let max_count = self.get_most_frequent_token_count();
        self.token_count.iter().map(|(token, &count)| {
            let tf = Self::tf_calc_as_u16(max_count, count);
            let idf = idf_map.get(token).copied().unwrap_or(0);
            (token.clone(), Self::tfidf_calc_as_u16(tf, idf))
        }).collect()
    }

    #[inline(always)]
    pub fn get_tfidf_hashmap_fst(&self, idf_map: &Map<Vec<u8>>) -> HashMap<String, u16> {
        let max_count = self.get_most_frequent_token_count();
        self.token_count.iter().map(|(token, &count)| {
            let tf = Self::tf_calc_as_u16(max_count, count);
            let idf = match idf_map.get(token.as_bytes()) {
                Some(idf) => idf as u16,
                None => 0,
            };
            (token.clone(), Self::tfidf_calc_as_u16(tf, idf))
        }).collect()
    }

    #[inline(always)]
    pub fn get_tfidf_vector_parallel(&self, idf_map: &HashMap<String, u16>) -> Vec<(String, u16)> {
        let max_count = self.get_most_frequent_token_count();
        self.token_count
            .par_iter()
            .map(|(token, &count)| {
                let tf = Self::tf_calc_as_u16(max_count, count);
                let idf = idf_map.get(token).copied().unwrap_or(0);
                (token.clone(), Self::tfidf_calc_as_u16(tf, idf))
            })
            .collect()
    }

    #[inline(always)]
    pub fn get_tfidf_vector_fst_parallel(&self, idf_map: &Map<Vec<u8>>) -> Vec<(String, u16)> {
        let max_count = self.get_most_frequent_token_count();
        self.token_count
            .par_iter()
            .map(|(token, &count)| {
                let tf = Self::tf_calc_as_u16(max_count, count);
                let idf = match idf_map.get(token.as_bytes()) {
                    Some(idf) => idf as u16,
                    None => 0,
                };
                (token.clone(), Self::tfidf_calc_as_u16(tf, idf))
            })
            .collect()
    }

    #[inline(always)]
    pub fn get_tfidf_hashmap_parallel(&self, idf_map: &HashMap<String, u16>) -> HashMap<String, u16> {
        let max_count = self.get_most_frequent_token_count();
        self.token_count
            .par_iter()
            .map(|(token, &count)| {
                let tf = Self::tf_calc_as_u16(max_count, count);
                let idf = idf_map.get(token).copied().unwrap_or(0);
                (token.clone(), Self::tfidf_calc_as_u16(tf, idf))
            })
            .collect()
    }

    #[inline(always)]
    pub fn get_tfidf_hashmap_fst_parallel(&self, idf_map: &Map<Vec<u8>>) -> HashMap<String, u16> {
        let max_count = self.get_most_frequent_token_count();
        self.token_count
            .par_iter()
            .map(|(token, &count)| {
                let tf = Self::tf_calc_as_u16(max_count, count);
                let idf = match idf_map.get(token.as_bytes()) {
                    Some(idf) => idf as u16,
                    None => 0,
                };
                (token.clone(), Self::tfidf_calc_as_u16(tf, idf))
            })
            .collect()
    }

    #[inline(always)]
    pub fn contains_token(&self, token: &str) -> bool {
        self.token_count.contains_key(token)
    }

    #[inline(always)]
    pub fn get_token_set(&self) -> Vec<String> {
        self.token_count.keys().cloned().collect()
    }

    #[inline(always)]
    pub fn get_token_set_ref(&self) -> Vec<&str> {
        self.token_count.keys().map(|s| s.as_str()).collect()
    }

    #[inline(always)]
    pub fn get_token_hashset(&self) -> HashSet<String> {
        self.token_count.keys().cloned().collect()
    }

    #[inline(always)]
    pub fn get_token_hashset_ref(&self) -> HashSet<&str> {
        self.token_count.keys().map(|s| s.as_str()).collect()
    }

    #[inline(always)]
    pub fn get_token_set_len(&self) -> usize {
        self.token_count.len()
    }

    #[inline(always)]
    pub fn get_token_set_iter(&self) -> Keys<String, u32> {
        self.token_count.keys()
    }

    #[inline(always)]
    pub fn get_token_set_iter_ref(&self) -> impl Iterator<Item = &str> {
        self.token_count.keys().map(|s| s.as_str())
    }

    // NOTE: lengths are in bytes (String::len), not characters.
    #[inline(always)]
    pub fn get_token_length_stats(&self) -> Option<(usize, usize, f64)> {
        if self.token_count.is_empty() {
            return None;
        }

        let lengths: Vec<usize> = self.token_count.keys().map(|token| token.len()).collect();
        let min_len = *lengths.iter().min().unwrap();
        let max_len = *lengths.iter().max().unwrap();
        let avg_len = lengths.iter().sum::<usize>() as f64 / lengths.len() as f64;

        Some((min_len, max_len, avg_len))
    }

    // Identical to get_token_length_stats; kept for API symmetry.
    #[inline(always)]
    pub fn get_token_length_stats_ref(&self) -> Option<(usize, usize, f64)> {
        if self.token_count.is_empty() {
            return None;
        }

        let lengths: Vec<usize> = self.token_count.keys().map(|token| token.len()).collect();
        let min_len = *lengths.iter().min().unwrap();
        let max_len = *lengths.iter().max().unwrap();
        let avg_len = lengths.iter().sum::<usize>() as f64 / lengths.len() as f64;

        Some((min_len, max_len, avg_len))
    }

    #[inline(always)]
    pub fn get_token_length_stats_parallel(&self) -> Option<(usize, usize, f64)> {
        if self.token_count.is_empty() {
            return None;
        }

        let (min_len, max_len, total_len, count) = self.token_count
            .par_iter()
            .map(|(token, _)| (token.len(), token.len(), token.len(), 1))
            .reduce(
                || (usize::MAX, 0, 0, 0),
                |acc, len| {
                    let min_len = acc.0.min(len.0);
                    let max_len = acc.1.max(len.1);
                    let total_len = acc.2 + len.2;
                    let count = acc.3 + len.3;
                    (min_len, max_len, total_len, count)
                },
            );

        Some((min_len, max_len, total_len as f64 / count as f64))
    }

    #[inline(always)]
    pub fn remove_stop_tokens(&mut self, stop_tokens: &[&str]) {
        for &stop_token in stop_tokens {
            if let Some(count) = self.token_count.remove(stop_token) {
                self.total_token_count -= count as u64;
            }
        }
    }

    #[inline(always)]
    pub fn remove_stop_tokens_parallel(&mut self, stop_tokens: &[&str]) {
        let to_remove: Vec<String> = stop_tokens
            .par_iter()
            .filter_map(|&stop_token| {
                self.token_count.get(stop_token).map(|_| stop_token.to_string())
            })
            .collect();

        for token in to_remove {
            if let Some(count) = self.token_count.remove(&token) {
                self.total_token_count -= count as u64;
            }
        }
    }

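    // Removes every token for which `condition` returns true and returns the
    // total number of occurrences removed. Illustrative call (`freq` assumed):
    //     let removed = freq.remove_tokens_by_condition(|token, &count| count < 2 || token.len() > 32);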
    #[inline(always)]
    pub fn remove_tokens_by_condition<F>(&mut self, condition: F) -> u64
    where
        F: Fn(&str, &u32) -> bool,
    {
        let mut removed_total_count: u64 = 0;
        self.token_count.retain(|token, count| {
            if condition(token, count) {
                removed_total_count += *count as u64;
                false
            } else {
                true
            }
        });
        self.total_token_count -= removed_total_count;

        removed_total_count
    }

    #[inline(always)]
    pub fn get_sorted_by_frequency_desc(&self) -> Vec<(String, u32)> {
        let mut token_list: Vec<(String, u32)> = self.token_count
            .iter()
            .map(|(token, &count)| (token.clone(), count))
            .collect();

        token_list.sort_by(|a, b| b.1.cmp(&a.1));
        token_list
    }

    #[inline(always)]
    pub fn get_sorted_by_frequency_desc_parallel(&self) -> Vec<(String, u32)> {
        let mut token_list: Vec<(String, u32)> = self.token_count
            .par_iter()
            .map(|(token, &count)| (token.clone(), count))
            .collect();

        token_list.par_sort_by(|a, b| b.1.cmp(&a.1));
        token_list
    }

    #[inline(always)]
    pub fn get_sorted_by_frequency_asc(&self) -> Vec<(String, u32)> {
        let mut token_list: Vec<(String, u32)> = self.token_count
            .iter()
            .map(|(token, &count)| (token.clone(), count))
            .collect();

        token_list.sort_by(|a, b| a.1.cmp(&b.1));
        token_list
    }

    #[inline(always)]
    pub fn get_sorted_by_frequency_asc_parallel(&self) -> Vec<(String, u32)> {
        let mut token_list: Vec<(String, u32)> = self.token_count
            .par_iter()
            .map(|(token, &count)| (token.clone(), count))
            .collect();

        token_list.par_sort_by(|a, b| a.1.cmp(&b.1));
        token_list
    }

    #[inline(always)]
    pub fn get_sorted_by_alphabetical_asc(&self) -> Vec<(String, u32)> {
        let mut token_list: Vec<(String, u32)> = self.token_count
            .iter()
            .map(|(token, &count)| (token.clone(), count))
            .collect();

        token_list.sort_by(|a, b| a.0.cmp(&b.0));
        token_list
    }

    #[inline(always)]
    pub fn get_sorted_by_alphabetical_asc_parallel(&self) -> Vec<(String, u32)> {
        let mut token_list: Vec<(String, u32)> = self.token_count
            .par_iter()
            .map(|(token, &count)| (token.clone(), count))
            .collect();

        token_list.par_sort_by(|a, b| a.0.cmp(&b.0));
        token_list
    }

    #[inline(always)]
    pub fn get_sorted_by_alphabetical_desc(&self) -> Vec<(String, u32)> {
        let mut token_list: Vec<(String, u32)> = self.token_count
            .iter()
            .map(|(token, &count)| (token.clone(), count))
            .collect();

        token_list.sort_by(|a, b| b.0.cmp(&a.0));
        token_list
    }

    #[inline(always)]
    pub fn get_sorted_by_alphabetical_desc_parallel(&self) -> Vec<(String, u32)> {
        let mut token_list: Vec<(String, u32)> = self.token_count
            .par_iter()
            .map(|(token, &count)| (token.clone(), count))
            .collect();

        token_list.par_sort_by(|a, b| b.0.cmp(&a.0));
        token_list
    }

    #[inline(always)]
    pub fn get_sorted_by_length_desc(&self) -> Vec<(String, u32)> {
        let mut token_list: Vec<(String, u32)> = self.token_count
            .iter()
            .map(|(token, &count)| (token.clone(), count))
            .collect();

        token_list.sort_by(|a, b| b.0.len().cmp(&a.0.len()));
        token_list
    }

    #[inline(always)]
    pub fn get_sorted_by_length_desc_parallel(&self) -> Vec<(String, u32)> {
        let mut token_list: Vec<(String, u32)> = self.token_count
            .par_iter()
            .map(|(token, &count)| (token.clone(), count))
            .collect();

        token_list.par_sort_by(|a, b| b.0.len().cmp(&a.0.len()));
        token_list
    }

    #[inline(always)]
    pub fn get_sorted_by_length_asc(&self) -> Vec<(String, u32)> {
        let mut token_list: Vec<(String, u32)> = self.token_count
            .iter()
            .map(|(token, &count)| (token.clone(), count))
            .collect();

        token_list.sort_by(|a, b| a.0.len().cmp(&b.0.len()));
        token_list
    }

    #[inline(always)]
    pub fn get_sorted_by_length_asc_parallel(&self) -> Vec<(String, u32)> {
        let mut token_list: Vec<(String, u32)> = self.token_count
            .par_iter()
            .map(|(token, &count)| (token.clone(), count))
            .collect();

        token_list.par_sort_by(|a, b| a.0.len().cmp(&b.0.len()));
        token_list
    }

    #[inline(always)]
    pub fn get_unique_token_ratio(&self) -> f64 {
        if self.total_token_count == 0 {
            return 0.0;
        }
        self.token_count.len() as f64 / self.total_token_count as f64
    }

    #[inline(always)]
    pub fn reset(&mut self) {
        self.token_count.clear();
        self.total_token_count = 0;
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_add_token() {
        let mut tf = TokenFrequency::new();
        tf.add_token("rust");
        assert_eq!(tf.token_count.get("rust"), Some(&1));
        assert_eq!(tf.total_token_count, 1);
    }

    #[test]
    fn test_add_tokens() {
        let mut tf = TokenFrequency::new();
        tf.add_tokens(&["rust", "rust", "programming"]);
        assert_eq!(tf.token_count.get("rust"), Some(&2));
        assert_eq!(tf.token_count.get("programming"), Some(&1));
        assert_eq!(tf.total_token_count, 3);
    }

    #[test]
    fn test_sub_token() {
        let mut tf = TokenFrequency::new();
        tf.add_tokens(&["rust", "rust", "programming"]);
        tf.sub_token("rust");
        assert_eq!(tf.token_count.get("rust"), Some(&1));
        assert_eq!(tf.total_token_count, 2);
    }

    #[test]
    fn test_tfidf_calc() {
        let tfidf = TokenFrequency::tfidf_calc(2.0, 1.5);
        assert_eq!(tfidf, 3.0);
    }

    #[test]
    fn test_reset() {
        let mut tf = TokenFrequency::new();
        tf.add_tokens(&["rust", "programming"]);
        tf.reset();
        assert!(tf.token_count.is_empty());
        assert_eq!(tf.total_token_count, 0);
    }

    #[test]
    fn test_get_token_length_stats() {
        let mut tf = TokenFrequency::new();
        tf.add_tokens(&["rust", "go", "java"]);
        let stats = tf.get_token_length_stats();
        assert_eq!(stats, Some((2, 4, 3.3333333333333335)));
    }

    #[test]
    fn test_unique_token_ratio() {
        let mut tf = TokenFrequency::new();
        tf.add_tokens(&["rust", "rust", "go"]);
        assert_eq!(tf.get_unique_token_ratio(), 2.0 / 3.0);
    }
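
    // Additional illustrative tests (added): exercise the scaled TF/IDF helpers
    // at boundary values, which are exact by construction.
    #[test]
    fn test_tf_scaling_bounds() {
        let mut tf = TokenFrequency::new();
        tf.add_tokens(&["rust", "rust", "go"]);
        // The most frequent token always scales to the full u16 range.
        assert_eq!(tf.get_token_tf("rust"), u16::MAX);
        // A rarer token scales strictly lower.
        assert!(tf.get_token_tf("go") < u16::MAX);
    }

    #[test]
    fn test_idf_scaling_bounds() {
        // A token present in exactly one of N documents reaches the maximum
        // normalized IDF: ln(1 + N/2) / idf_max(N) == 1.0.
        let tf = TokenFrequency::new();
        let max_idf = tf.idf_max(10);
        assert_eq!(TokenFrequency::idf_calc_as_u16(10, max_idf, 1), u16::MAX);
    }

    #[test]
    fn test_tfidf_fixed_point_bounds() {
        // The fixed-point multiply preserves the extremes of the u16 scale.
        assert_eq!(TokenFrequency::tfidf_calc_as_u16(u16::MAX, u16::MAX), u16::MAX);
        assert_eq!(TokenFrequency::tfidf_calc_as_u16(0, u16::MAX), 0);
    }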
}