use std::collections::{hash_map::Keys, HashMap, HashSet};

use fst::Map;
use serde::{Deserialize, Serialize};
use rayon::prelude::*;

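/// Accumulates per-token occurrence counts and derives TF, IDF, and TF-IDF
/// statistics from them. `token_count` maps each token to its count and
/// `total_token_count` holds the running sum of all counts.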
#[derive(Serialize, Deserialize, Debug, Clone, Default)]
pub struct TokenFrequency {
    pub token_count: HashMap<String, u32>,
    pub total_token_count: u64,
}

impl TokenFrequency {
    pub fn new() -> Self {
        TokenFrequency {
            token_count: HashMap::new(),
            total_token_count: 0,
        }
    }

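    /// Records one occurrence of `token`, inserting it on first sight.
    /// Returns `&mut Self` so calls can be chained; the `add_*` variants
    /// below follow the same pattern.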
    #[inline(always)]
    pub fn add_token(&mut self, token: &str) -> &mut Self {
        let count = self.token_count.entry(token.to_string()).or_insert(0);
        *count += 1;
        self.total_token_count += 1;
        self
    }

    #[inline(always)]
    pub fn add_token_n(&mut self, token: &str, n: u32) -> &mut Self {
        let count = self.token_count.entry(token.to_string()).or_insert(0);
        *count += n;
        self.total_token_count += n as u64;
        self
    }

    #[inline(always)]
    pub fn add_tokens(&mut self, tokens: &[&str]) -> &mut Self {
        for &token in tokens {
            let count = self.token_count.entry(token.to_string()).or_insert(0);
            *count += 1;
            self.total_token_count += 1;
        }
        self
    }

    #[inline(always)]
    pub fn add_tokens_string(&mut self, tokens: &[String]) -> &mut Self {
        for token in tokens {
            let count = self.token_count.entry(token.clone()).or_insert(0);
            *count += 1;
            self.total_token_count += 1;
        }
        self
    }

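    /// Decrements the count for `token` by one. Counts saturate at zero and
    /// the entry itself stays in the map, so the token still appears in
    /// `get_token_set` and friends with a count of 0. The other `sub_*`
    /// variants below follow the same rule.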
    #[inline(always)]
    pub fn sub_token(&mut self, token: &str) -> &mut Self {
        if let Some(count) = self.token_count.get_mut(token) {
            if *count > 0 {
                *count -= 1;
                self.total_token_count -= 1;
            }
        }
        self
    }

    #[inline(always)]
    pub fn sub_token_n(&mut self, token: &str, n: u32) -> &mut Self {
        if let Some(count) = self.token_count.get_mut(token) {
            if *count >= n {
                *count -= n;
                self.total_token_count -= n as u64;
            }
        }
        self
    }

    #[inline(always)]
    pub fn sub_tokens(&mut self, tokens: &[&str]) -> &mut Self {
        for &token in tokens {
            if let Some(count) = self.token_count.get_mut(token) {
                if *count > 0 {
                    *count -= 1;
                    self.total_token_count -= 1;
                }
            }
        }
        self
    }

    #[inline(always)]
    pub fn sub_tokens_string(&mut self, tokens: &[String]) -> &mut Self {
        for token in tokens {
            if let Some(count) = self.token_count.get_mut(token.as_str()) {
                if *count > 0 {
                    *count -= 1;
                    self.total_token_count -= 1;
                }
            }
        }
        self
    }

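    /// Log-scaled term frequency normalized against the most frequent token:
    /// `ln(count + 1) / ln(max_count + 1)`, which lands in `[0, 1]`.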
    #[inline(always)]
    pub fn tf_calc(max_count: u32, count: u32) -> f64 {
        // Guard the empty case: without it, max_count == 0 makes the
        // denominator ln(1) == 0 and the result NaN or infinite.
        if max_count == 0 {
            return 0.0;
        }
        (count as f64 + 1.0).ln() / (max_count as f64 + 1.0).ln()
    }

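    /// Same as `tf_calc`, but quantized onto the full `u16` range
    /// (`0..=65535`). `tf_calc_as_u32` below does the same for `u32`.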
    #[inline(always)]
    pub fn tf_calc_as_u16(max_count: u32, count: u32) -> u16 {
        (Self::tf_calc(max_count, count) * u16::MAX as f64).round() as u16
    }

    #[inline(always)]
    pub fn tf_calc_as_u32(max_count: u32, count: u32) -> u32 {
        (Self::tf_calc(max_count, count) * u32::MAX as f64).round() as u32
    }

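    /// The `get_tf_*` family below materializes the quantized TF of every
    /// token as vectors or hash maps, with owned-`String`, borrowed-`&str`
    /// (`_ref`), and rayon-parallel (`_parallel`) variants.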
    #[inline(always)]
    pub fn get_tf_vector(&self) -> Vec<(String, u16)> {
        let max_count = self.get_most_frequent_token_count();
        self.token_count
            .iter()
            .map(|(token, &count)| (token.clone(), Self::tf_calc_as_u16(max_count, count)))
            .collect()
    }

    #[inline(always)]
    pub fn get_tf_vector_parallel(&self) -> Vec<(String, u16)> {
        let max_count = self.get_most_frequent_token_count();
        self.token_count
            .par_iter()
            .map(|(token, &count)| (token.clone(), Self::tf_calc_as_u16(max_count, count)))
            .collect()
    }

    #[inline(always)]
    pub fn get_tf_vector_ref(&self) -> Vec<(&str, u16)> {
        let max_count = self.get_most_frequent_token_count();
        self.token_count
            .iter()
            .map(|(token, &count)| (token.as_str(), Self::tf_calc_as_u16(max_count, count)))
            .collect()
    }

    #[inline(always)]
    pub fn get_tf_vector_ref_parallel(&self) -> Vec<(&str, u16)> {
        let max_count = self.get_most_frequent_token_count();
        self.token_count
            .par_iter()
            .map(|(token, &count)| (token.as_str(), Self::tf_calc_as_u16(max_count, count)))
            .collect()
    }

    #[inline(always)]
    pub fn get_tf_hashmap(&self) -> HashMap<String, u16> {
        let max_count = self.get_most_frequent_token_count();
        self.token_count
            .iter()
            .map(|(token, &count)| (token.clone(), Self::tf_calc_as_u16(max_count, count)))
            .collect()
    }

    #[inline(always)]
    pub fn get_tf_hashmap_parallel(&self) -> HashMap<String, u16> {
        let max_count = self.get_most_frequent_token_count();
        self.token_count
            .par_iter()
            .map(|(token, &count)| (token.clone(), Self::tf_calc_as_u16(max_count, count)))
            .collect()
    }

    #[inline(always)]
    pub fn get_tf_hashmap_ref(&self) -> HashMap<&str, u16> {
        let max_count = self.get_most_frequent_token_count();
        self.token_count
            .iter()
            .map(|(token, &count)| (token.as_str(), Self::tf_calc_as_u16(max_count, count)))
            .collect()
    }

    #[inline(always)]
    pub fn get_tf_hashmap_ref_parallel(&self) -> HashMap<&str, u16> {
        let max_count = self.get_most_frequent_token_count();
        self.token_count
            .par_iter()
            .map(|(token, &count)| (token.as_str(), Self::tf_calc_as_u16(max_count, count)))
            .collect()
    }

    #[inline(always)]
    pub fn get_token_tf(&self, token: &str) -> u16 {
        let max_count = self.get_most_frequent_token_count();
        let count = self.token_count.get(token).copied().unwrap_or(0);
        Self::tf_calc_as_u16(max_count, count)
    }

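    /// Normalization denominator for the IDF formulas: the raw IDF of a
    /// token that occurs in exactly one document (the smallest document
    /// frequency an observed token can have), i.e. `ln(1 + N / 2)`.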
    #[inline(always)]
    pub fn idf_max(&self, total_doc_count: u64) -> f64 {
        (1.0 + total_doc_count as f64 / 2.0).ln()
    }

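    /// Smoothed IDF normalized into `[0, 1]`:
    /// `ln(1 + N / (1 + doc_count)) / max_idf`, with `max_idf` taken from
    /// `idf_max`. The `_as_u16`/`_as_u32` variants quantize the result onto
    /// the full integer range.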
    #[inline(always)]
    pub fn idf_calc(total_doc_count: u64, max_idf: f64, doc_count: u32) -> f64 {
        // Guard against a zero denominator (e.g. total_doc_count == 0),
        // which would otherwise yield NaN or infinity.
        if max_idf <= 0.0 {
            return 0.0;
        }
        (1.0 + total_doc_count as f64 / (1.0 + doc_count as f64)).ln() / max_idf
    }

    #[inline(always)]
    pub fn idf_calc_as_u16(total_doc_count: u64, max_idf: f64, doc_count: u32) -> u16 {
        (Self::idf_calc(total_doc_count, max_idf, doc_count) * u16::MAX as f64).round() as u16
    }

    #[inline(always)]
    pub fn idf_calc_as_u32(total_doc_count: u64, max_idf: f64, doc_count: u32) -> u32 {
        (Self::idf_calc(total_doc_count, max_idf, doc_count) * u32::MAX as f64).round() as u32
    }

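    /// For the `get_idf_*` family, this instance is interpreted as a
    /// document-frequency table: each value in `token_count` is the number
    /// of documents containing the token, not an in-document count.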
    #[inline(always)]
    pub fn get_idf_vector(&self, total_doc_count: u64) -> Vec<(String, u16)> {
        let max_idf = self.idf_max(total_doc_count);
        self.token_count
            .iter()
            .map(|(token, &doc_count)| {
                (token.clone(), Self::idf_calc_as_u16(total_doc_count, max_idf, doc_count))
            })
            .collect()
    }

    #[inline(always)]
    pub fn get_idf_vector_ref(&self, total_doc_count: u64) -> Vec<(&str, u16)> {
        let max_idf = self.idf_max(total_doc_count);
        self.token_count
            .iter()
            .map(|(token, &doc_count)| {
                (token.as_str(), Self::idf_calc_as_u16(total_doc_count, max_idf, doc_count))
            })
            .collect()
    }

    #[inline(always)]
    pub fn get_idf_vector_parallel(&self, total_doc_count: u64) -> Vec<(String, u16)> {
        let max_idf = self.idf_max(total_doc_count);
        self.token_count
            .par_iter()
            .map(|(token, &doc_count)| {
                (token.clone(), Self::idf_calc_as_u16(total_doc_count, max_idf, doc_count))
            })
            .collect()
    }

    #[inline(always)]
    pub fn get_idf_vector_ref_parallel(&self, total_doc_count: u64) -> Vec<(&str, u16)> {
        let max_idf = self.idf_max(total_doc_count);
        self.token_count
            .par_iter()
            .map(|(token, &doc_count)| {
                (token.as_str(), Self::idf_calc_as_u16(total_doc_count, max_idf, doc_count))
            })
            .collect()
    }

    #[inline(always)]
    pub fn get_idf_hashmap(&self, total_doc_count: u64) -> HashMap<String, u16> {
        let max_idf = self.idf_max(total_doc_count);
        self.token_count
            .iter()
            .map(|(token, &doc_count)| {
                (token.clone(), Self::idf_calc_as_u16(total_doc_count, max_idf, doc_count))
            })
            .collect()
    }

    #[inline(always)]
    pub fn get_idf_hashmap_ref(&self, total_doc_count: u64) -> HashMap<&str, u16> {
        let max_idf = self.idf_max(total_doc_count);
        self.token_count
            .iter()
            .map(|(token, &doc_count)| {
                (token.as_str(), Self::idf_calc_as_u16(total_doc_count, max_idf, doc_count))
            })
            .collect()
    }

    #[inline(always)]
    pub fn get_idf_hashmap_parallel(&self, total_doc_count: u64) -> HashMap<String, u16> {
        let max_idf = self.idf_max(total_doc_count);
        self.token_count
            .par_iter()
            .map(|(token, &doc_count)| {
                (token.clone(), Self::idf_calc_as_u16(total_doc_count, max_idf, doc_count))
            })
            .collect()
    }

    #[inline(always)]
    pub fn get_idf_hashmap_ref_parallel(&self, total_doc_count: u64) -> HashMap<&str, u16> {
        let max_idf = self.idf_max(total_doc_count);
        self.token_count
            .par_iter()
            .map(|(token, &doc_count)| {
                (token.as_str(), Self::idf_calc_as_u16(total_doc_count, max_idf, doc_count))
            })
            .collect()
    }

    #[inline(always)]
    pub fn get_token_count_vector(&self) -> Vec<(String, u32)> {
        self.token_count
            .iter()
            .map(|(token, &count)| (token.clone(), count))
            .collect()
    }

    #[inline(always)]
    pub fn get_token_count_hashmap(&self) -> HashMap<String, u32> {
        self.token_count.clone()
    }

    #[inline(always)]
    pub fn get_token_count_hashmap_ref(&self) -> HashMap<&str, u32> {
        self.token_count
            .iter()
            .map(|(token, &count)| (token.as_str(), count))
            .collect()
    }

    #[inline(always)]
    pub fn get_total_token_count(&self) -> u64 {
        self.total_token_count
    }

    #[inline(always)]
    pub fn get_total_token_count_ref(&self) -> &u64 {
        &self.total_token_count
    }

    #[inline(always)]
    pub fn get_token_count(&self, token: &str) -> u32 {
        *self.token_count.get(token).unwrap_or(&0)
    }

    #[inline(always)]
    pub fn get_token_count_ref(&self, token: &str) -> &u32 {
        self.token_count.get(token).unwrap_or(&0)
    }

    #[inline(always)]
    pub fn get_most_frequent_tokens(&self) -> Vec<(String, u32)> {
        if let Some(&max_count) = self.token_count.values().max() {
            self.token_count
                .iter()
                .filter(|&(_, &count)| count == max_count)
                .map(|(token, &count)| (token.clone(), count))
                .collect()
        } else {
            Vec::new()
        }
    }

    #[inline(always)]
    pub fn get_most_frequent_token_count(&self) -> u32 {
        self.token_count.values().max().copied().unwrap_or(0)
    }

    #[inline(always)]
    pub fn get_most_frequent_tokens_parallel(&self) -> Vec<(String, u32)> {
        if self.token_count.is_empty() {
            return Vec::new();
        }
        let max_frequency = self
            .token_count
            .par_iter()
            .map(|(_, &count)| count)
            .max()
            .unwrap();
        self.token_count
            .par_iter()
            .filter(|&(_, &count)| count == max_frequency)
            .map(|(token, &count)| (token.clone(), count))
            .collect()
    }

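    /// Plain TF-IDF product for floating-point inputs.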
    #[inline(always)]
    pub fn tfidf_calc(tf: f64, idf: f64) -> f64 {
        tf * idf
    }

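    /// Fixed-point TF-IDF for quantized inputs: the product of two `u16`
    /// values lies in `0..=65535^2`, and the ceiling division by 2^16 maps
    /// it back onto `0..=65535` (the maximum product lands exactly on
    /// `u16::MAX`). `tfidf_calc_as_u32` applies the same scheme with a 2^32
    /// divisor.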
    #[inline(always)]
    pub fn tfidf_calc_as_u16(tf: u16, idf: u16) -> u16 {
        // The product fits in u32 (at most 65535 * 65535).
        let product = tf as u32 * idf as u32;
        ((product + 65_535) / 65_536) as u16
    }

    #[inline(always)]
    pub fn tfidf_calc_as_u32(tf: u32, idf: u32) -> u32 {
        // Same scheme one size up: the product fits in u64.
        let product = tf as u64 * idf as u64;
        ((product + 4_294_967_295) / 4_294_967_296) as u32
    }

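    /// The `get_tfidf_*` family combines this instance's quantized TF values
    /// with a precomputed IDF table (`HashMap` or `fst::Map`); tokens
    /// missing from the table get an IDF of 0.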
    #[inline(always)]
    pub fn get_tfidf_vector(&self, idf_map: &HashMap<String, u16>) -> Vec<(String, u16)> {
        // Hoisted out of the closure: scanning for the most frequent token
        // is O(n) and must not be repeated for every token.
        let max_count = self.get_most_frequent_token_count();
        self.token_count
            .iter()
            .map(|(token, &count)| {
                let tf = Self::tf_calc_as_u16(max_count, count);
                let idf = idf_map.get(token).copied().unwrap_or(0);
                (token.clone(), Self::tfidf_calc_as_u16(tf, idf))
            })
            .collect()
    }

    #[inline(always)]
    pub fn get_tfidf_vector_fst(&self, idf_map: &Map<Vec<u8>>) -> Vec<(String, u16)> {
        let max_count = self.get_most_frequent_token_count();
        self.token_count
            .iter()
            .map(|(token, &count)| {
                let tf = Self::tf_calc_as_u16(max_count, count);
                let idf = idf_map.get(token.as_bytes()).map(|v| v as u16).unwrap_or(0);
                (token.clone(), Self::tfidf_calc_as_u16(tf, idf))
            })
            .collect()
    }

    #[inline(always)]
    pub fn get_tfidf_hashmap(&self, idf_map: &HashMap<String, u16>) -> HashMap<String, u16> {
        let max_count = self.get_most_frequent_token_count();
        self.token_count
            .iter()
            .map(|(token, &count)| {
                let tf = Self::tf_calc_as_u16(max_count, count);
                let idf = idf_map.get(token).copied().unwrap_or(0);
                (token.clone(), Self::tfidf_calc_as_u16(tf, idf))
            })
            .collect()
    }

    #[inline(always)]
    pub fn get_tfidf_hashmap_fst(&self, idf_map: &Map<Vec<u8>>) -> HashMap<String, u16> {
        let max_count = self.get_most_frequent_token_count();
        self.token_count
            .iter()
            .map(|(token, &count)| {
                let tf = Self::tf_calc_as_u16(max_count, count);
                let idf = idf_map.get(token.as_bytes()).map(|v| v as u16).unwrap_or(0);
                (token.clone(), Self::tfidf_calc_as_u16(tf, idf))
            })
            .collect()
    }

    #[inline(always)]
    pub fn get_tfidf_vector_parallel(&self, idf_map: &HashMap<String, u16>) -> Vec<(String, u16)> {
        let max_count = self.get_most_frequent_token_count();
        self.token_count
            .par_iter()
            .map(|(token, &count)| {
                let tf = Self::tf_calc_as_u16(max_count, count);
                let idf = idf_map.get(token).copied().unwrap_or(0);
                (token.clone(), Self::tfidf_calc_as_u16(tf, idf))
            })
            .collect()
    }

    #[inline(always)]
    pub fn get_tfidf_vector_fst_parallel(&self, idf_map: &Map<Vec<u8>>) -> Vec<(String, u16)> {
        let max_count = self.get_most_frequent_token_count();
        self.token_count
            .par_iter()
            .map(|(token, &count)| {
                let tf = Self::tf_calc_as_u16(max_count, count);
                let idf = idf_map.get(token.as_bytes()).map(|v| v as u16).unwrap_or(0);
                (token.clone(), Self::tfidf_calc_as_u16(tf, idf))
            })
            .collect()
    }

    #[inline(always)]
    pub fn get_tfidf_hashmap_parallel(&self, idf_map: &HashMap<String, u16>) -> HashMap<String, u16> {
        let max_count = self.get_most_frequent_token_count();
        self.token_count
            .par_iter()
            .map(|(token, &count)| {
                let tf = Self::tf_calc_as_u16(max_count, count);
                let idf = idf_map.get(token).copied().unwrap_or(0);
                (token.clone(), Self::tfidf_calc_as_u16(tf, idf))
            })
            .collect()
    }

    #[inline(always)]
    pub fn get_tfidf_hashmap_fst_parallel(&self, idf_map: &Map<Vec<u8>>) -> HashMap<String, u16> {
        let max_count = self.get_most_frequent_token_count();
        self.token_count
            .par_iter()
            .map(|(token, &count)| {
                let tf = Self::tf_calc_as_u16(max_count, count);
                let idf = idf_map.get(token.as_bytes()).map(|v| v as u16).unwrap_or(0);
                (token.clone(), Self::tfidf_calc_as_u16(tf, idf))
            })
            .collect()
    }

    #[inline(always)]
    pub fn contains_token(&self, token: &str) -> bool {
        self.token_count.contains_key(token)
    }

    #[inline(always)]
    pub fn get_token_set(&self) -> Vec<String> {
        self.token_count.keys().cloned().collect()
    }

    #[inline(always)]
    pub fn get_token_set_ref(&self) -> Vec<&str> {
        self.token_count.keys().map(|s| s.as_str()).collect()
    }

    #[inline(always)]
    pub fn get_token_hashset(&self) -> HashSet<String> {
        self.token_count.keys().cloned().collect()
    }

    #[inline(always)]
    pub fn get_token_hashset_ref(&self) -> HashSet<&str> {
        self.token_count.keys().map(|s| s.as_str()).collect()
    }

    #[inline(always)]
    pub fn get_token_set_len(&self) -> usize {
        self.token_count.len()
    }

    #[inline(always)]
    pub fn get_token_set_iter(&self) -> Keys<'_, String, u32> {
        self.token_count.keys()
    }

    #[inline(always)]
    pub fn get_token_set_iter_ref(&self) -> impl Iterator<Item = &str> {
        self.token_count.keys().map(|s| s.as_str())
    }

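    /// Minimum, maximum, and average token length, or `None` when empty.
    /// Lengths come from `str::len`, so they are byte lengths, not
    /// character counts.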
    #[inline(always)]
    pub fn get_token_length_stats(&self) -> Option<(usize, usize, f64)> {
        if self.token_count.is_empty() {
            return None;
        }

        let lengths: Vec<usize> = self.token_count.keys().map(|token| token.len()).collect();
        let min_len = *lengths.iter().min().unwrap();
        let max_len = *lengths.iter().max().unwrap();
        let avg_len = lengths.iter().sum::<usize>() as f64 / lengths.len() as f64;

        Some((min_len, max_len, avg_len))
    }

    #[inline(always)]
    pub fn get_token_length_stats_ref(&self) -> Option<(usize, usize, f64)> {
        if self.token_count.is_empty() {
            return None;
        }

        // Same statistics as `get_token_length_stats`, but folded in one
        // pass over the keys instead of collecting an intermediate Vec.
        let (min_len, max_len, total_len) = self.token_count.keys().fold(
            (usize::MAX, 0usize, 0usize),
            |(min_len, max_len, total_len), token| {
                let len = token.len();
                (min_len.min(len), max_len.max(len), total_len + len)
            },
        );

        Some((min_len, max_len, total_len as f64 / self.token_count.len() as f64))
    }

    #[inline(always)]
    pub fn get_token_length_stats_parallel(&self) -> Option<(usize, usize, f64)> {
        if self.token_count.is_empty() {
            return None;
        }

        let (min_len, max_len, total_len, count) = self.token_count
            .par_iter()
            .map(|(token, _)| (token.len(), token.len(), token.len(), 1))
            .reduce(
                || (usize::MAX, 0, 0, 0),
                |acc, len| {
                    let min_len = acc.0.min(len.0);
                    let max_len = acc.1.max(len.1);
                    let total_len = acc.2 + len.2;
                    let count = acc.3 + len.3;
                    (min_len, max_len, total_len, count)
                },
            );

        Some((min_len, max_len, total_len as f64 / count as f64))
    }

    #[inline(always)]
    pub fn remove_stop_tokens(&mut self, stop_tokens: &[&str]) {
        for &stop_token in stop_tokens {
            if let Some(count) = self.token_count.remove(stop_token) {
                self.total_token_count -= count as u64;
            }
        }
    }

    #[inline(always)]
    pub fn remove_stop_tokens_parallel(&mut self, stop_tokens: &[&str]) {
        // The parallel pass only filters the stop-token list; the map
        // itself is mutated sequentially afterwards.
        let to_remove: Vec<&str> = stop_tokens
            .par_iter()
            .copied()
            .filter(|&stop_token| self.token_count.contains_key(stop_token))
            .collect();

        for token in to_remove {
            if let Some(count) = self.token_count.remove(token) {
                self.total_token_count -= count as u64;
            }
        }
    }

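    /// Removes every token for which `condition` returns `true`, updates
    /// `total_token_count`, and returns how many occurrences were removed.
    ///
    /// ```ignore
    /// // e.g. drop every token seen fewer than 2 times
    /// let removed = tf.remove_tokens_by_condition(|_token, &count| count < 2);
    /// ```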
    #[inline(always)]
    pub fn remove_tokens_by_condition<F>(&mut self, condition: F) -> u64
    where
        F: Fn(&str, &u32) -> bool,
    {
        let mut removed_total_count: u64 = 0;
        self.token_count.retain(|token, count| {
            if condition(token, count) {
                removed_total_count += *count as u64;
                false
            } else {
                true
            }
        });
        self.total_token_count -= removed_total_count;

        removed_total_count
    }

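    /// The `get_sorted_by_*` family snapshots the map into a Vec of
    /// `(token, count)` pairs and sorts it by frequency, alphabetical
    /// order, or token length; `_parallel` variants build and sort the
    /// Vec with rayon.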
    #[inline(always)]
    pub fn get_sorted_by_frequency_desc(&self) -> Vec<(String, u32)> {
        let mut token_list: Vec<(String, u32)> = self.token_count
            .iter()
            .map(|(token, &count)| (token.clone(), count))
            .collect();

        token_list.sort_by(|a, b| b.1.cmp(&a.1));
        token_list
    }

    #[inline(always)]
    pub fn get_sorted_by_frequency_desc_parallel(&self) -> Vec<(String, u32)> {
        let mut token_list: Vec<(String, u32)> = self.token_count
            .par_iter()
            .map(|(token, &count)| (token.clone(), count))
            .collect();

        token_list.par_sort_by(|a, b| b.1.cmp(&a.1));
        token_list
    }

    #[inline(always)]
    pub fn get_sorted_by_frequency_asc(&self) -> Vec<(String, u32)> {
        let mut token_list: Vec<(String, u32)> = self.token_count
            .iter()
            .map(|(token, &count)| (token.clone(), count))
            .collect();

        token_list.sort_by(|a, b| a.1.cmp(&b.1));
        token_list
    }

    #[inline(always)]
    pub fn get_sorted_by_frequency_asc_parallel(&self) -> Vec<(String, u32)> {
        let mut token_list: Vec<(String, u32)> = self.token_count
            .par_iter()
            .map(|(token, &count)| (token.clone(), count))
            .collect();

        token_list.par_sort_by(|a, b| a.1.cmp(&b.1));
        token_list
    }

    #[inline(always)]
    pub fn get_sorted_by_alphabetical_asc(&self) -> Vec<(String, u32)> {
        let mut token_list: Vec<(String, u32)> = self.token_count
            .iter()
            .map(|(token, &count)| (token.clone(), count))
            .collect();

        token_list.sort_by(|a, b| a.0.cmp(&b.0));
        token_list
    }

    #[inline(always)]
    pub fn get_sorted_by_alphabetical_asc_parallel(&self) -> Vec<(String, u32)> {
        let mut token_list: Vec<(String, u32)> = self.token_count
            .par_iter()
            .map(|(token, &count)| (token.clone(), count))
            .collect();

        token_list.par_sort_by(|a, b| a.0.cmp(&b.0));
        token_list
    }

    #[inline(always)]
    pub fn get_sorted_by_alphabetical_desc(&self) -> Vec<(String, u32)> {
        let mut token_list: Vec<(String, u32)> = self.token_count
            .iter()
            .map(|(token, &count)| (token.clone(), count))
            .collect();

        token_list.sort_by(|a, b| b.0.cmp(&a.0));
        token_list
    }

    #[inline(always)]
    pub fn get_sorted_by_alphabetical_desc_parallel(&self) -> Vec<(String, u32)> {
        let mut token_list: Vec<(String, u32)> = self.token_count
            .par_iter()
            .map(|(token, &count)| (token.clone(), count))
            .collect();

        token_list.par_sort_by(|a, b| b.0.cmp(&a.0));
        token_list
    }

    #[inline(always)]
    pub fn get_sorted_by_length_desc(&self) -> Vec<(String, u32)> {
        let mut token_list: Vec<(String, u32)> = self.token_count
            .iter()
            .map(|(token, &count)| (token.clone(), count))
            .collect();

        token_list.sort_by(|a, b| b.0.len().cmp(&a.0.len()));
        token_list
    }

    #[inline(always)]
    pub fn get_sorted_by_length_desc_parallel(&self) -> Vec<(String, u32)> {
        let mut token_list: Vec<(String, u32)> = self.token_count
            .par_iter()
            .map(|(token, &count)| (token.clone(), count))
            .collect();

        token_list.par_sort_by(|a, b| b.0.len().cmp(&a.0.len()));
        token_list
    }

    #[inline(always)]
    pub fn get_sorted_by_length_asc(&self) -> Vec<(String, u32)> {
        let mut token_list: Vec<(String, u32)> = self.token_count
            .iter()
            .map(|(token, &count)| (token.clone(), count))
            .collect();

        token_list.sort_by(|a, b| a.0.len().cmp(&b.0.len()));
        token_list
    }

    #[inline(always)]
    pub fn get_sorted_by_length_asc_parallel(&self) -> Vec<(String, u32)> {
        let mut token_list: Vec<(String, u32)> = self.token_count
            .par_iter()
            .map(|(token, &count)| (token.clone(), count))
            .collect();

        token_list.par_sort_by(|a, b| a.0.len().cmp(&b.0.len()));
        token_list
    }

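    /// Unique tokens divided by total occurrences (the type-token ratio);
    /// 0.0 for an empty instance.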
    #[inline(always)]
    pub fn get_unique_token_ratio(&self) -> f64 {
        if self.total_token_count == 0 {
            return 0.0;
        }
        self.token_count.len() as f64 / self.total_token_count as f64
    }

    #[inline(always)]
    pub fn reset(&mut self) {
        self.token_count.clear();
        self.total_token_count = 0;
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_add_token() {
        let mut tf = TokenFrequency::new();
        tf.add_token("rust");
        assert_eq!(tf.token_count.get("rust"), Some(&1));
        assert_eq!(tf.total_token_count, 1);
    }

    #[test]
    fn test_add_tokens() {
        let mut tf = TokenFrequency::new();
        tf.add_tokens(&["rust", "rust", "programming"]);
        assert_eq!(tf.token_count.get("rust"), Some(&2));
        assert_eq!(tf.token_count.get("programming"), Some(&1));
        assert_eq!(tf.total_token_count, 3);
    }

    #[test]
    fn test_sub_token() {
        let mut tf = TokenFrequency::new();
        tf.add_tokens(&["rust", "rust", "programming"]);
        tf.sub_token("rust");
        assert_eq!(tf.token_count.get("rust"), Some(&1));
        assert_eq!(tf.total_token_count, 2);
    }

    #[test]
    fn test_tfidf_calc() {
        let tfidf = TokenFrequency::tfidf_calc(2.0, 1.5);
        assert_eq!(tfidf, 3.0);
    }

    #[test]
    fn test_reset() {
        let mut tf = TokenFrequency::new();
        tf.add_tokens(&["rust", "programming"]);
        tf.reset();
        assert!(tf.token_count.is_empty());
        assert_eq!(tf.total_token_count, 0);
    }

    #[test]
    fn test_get_token_length_stats() {
        let mut tf = TokenFrequency::new();
        tf.add_tokens(&["rust", "go", "java"]);
        let stats = tf.get_token_length_stats();
        assert_eq!(stats, Some((2, 4, 3.3333333333333335)));
    }

    #[test]
    fn test_unique_token_ratio() {
        let mut tf = TokenFrequency::new();
        tf.add_tokens(&["rust", "rust", "go"]);
        assert_eq!(tf.get_unique_token_ratio(), 2.0 / 3.0);
    }
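
    // Additional checks for the quantized TF/IDF/TF-IDF edge cases
    // documented above.
    #[test]
    fn test_tf_quantization_bounds() {
        // The most frequent token maps to the top of the u16 range; a zero
        // count maps to zero.
        assert_eq!(TokenFrequency::tf_calc_as_u16(10, 10), u16::MAX);
        assert_eq!(TokenFrequency::tf_calc_as_u16(10, 0), 0);
    }

    #[test]
    fn test_idf_normalization_bounds() {
        let tf = TokenFrequency::new();
        let total_docs = 100u64;
        let max_idf = tf.idf_max(total_docs);
        // A token present in exactly one document has the maximum
        // normalized IDF.
        assert_eq!(
            TokenFrequency::idf_calc_as_u16(total_docs, max_idf, 1),
            u16::MAX
        );
    }

    #[test]
    fn test_tfidf_fixed_point_bounds() {
        // The fixed-point product keeps the extremes of the u16 range.
        assert_eq!(
            TokenFrequency::tfidf_calc_as_u16(u16::MAX, u16::MAX),
            u16::MAX
        );
        assert_eq!(TokenFrequency::tfidf_calc_as_u16(0, u16::MAX), 0);
    }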
}