pub struct TokenFrequency { /* private fields */ }
The TokenFrequency struct manages the frequency of token occurrences, counting how many times each token appears.
§Examples
use tf_idf_vectorizer::vectorizer::token::TokenFrequency;
let mut token_freq = TokenFrequency::new();
token_freq.add_token("token1");
token_freq.add_token("token2");
token_freq.add_token("token1");
assert_eq!(token_freq.token_count("token1"), 2);
Implementations§
impl TokenFrequency
Implementation for adding and removing tokens
pub fn new() -> Self
Create a new TokenFrequency
Examples found in repository:
fn main() {
    // build corpus
    let corpus = Arc::new(Corpus::new());

    // add documents
    let mut freq1 = TokenFrequency::new();
    freq1.add_tokens(&["rust", "高速", "並列", "rust"]);
    let mut freq2 = TokenFrequency::new();
    freq2.add_tokens(&["rust", "柔軟", "安全", "rust"]);

    // build the vectorizer and register documents
    let mut vectorizer: TFIDFVectorizer<u16> = TFIDFVectorizer::new(corpus);
    vectorizer.add_doc("doc1".to_string(), &freq1);
    vectorizer.add_doc("doc2".to_string(), &freq2);
    vectorizer.del_doc(&"doc1".to_string());
    vectorizer.add_doc("doc3".to_string(), &freq1);

    // similarity search
    let mut query_tokens = TokenFrequency::new();
    query_tokens.add_tokens(&["rust", "高速"]);
    let algorithm = SimilarityAlgorithm::CosineSimilarity;
    let mut result = vectorizer.similarity(&query_tokens, &algorithm);
    result.sort_by_score();

    // print result
    result.list.iter().for_each(|(k, s, l)| {
        println!("doc: {}, score: {}, length: {}", k, s, l);
    });
    // debug
    println!("result count: {}", result.list.len());
    println!("{:?}", vectorizer);
}
pub fn add_tokens<T>(&mut self, tokens: &[T]) -> &mut Self
pub fn sub_tokens<T>(&mut self, tokens: &[T]) -> &mut Self
pub fn set_token_count(&mut self, token: &str, count: u64) -> &mut Self
pub fn add_tokens_from_freq(&mut self, other: &TokenFrequency) -> &mut Self
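These mutation methods return &mut Self, so they can be chained. Below is a minimal sketch of how they might be combined, assuming the import path from the struct-level example; the exact semantics of sub_tokens (for example, whether counts saturate at zero) are not documented on this page, so the sketch only asserts the well-defined cases.
use tf_idf_vectorizer::vectorizer::token::TokenFrequency; // import path assumed from the struct-level example

fn main() {
    let mut base = TokenFrequency::new();
    // add_tokens counts every occurrence, so "rust" ends up at 2 here
    base.add_tokens(&["rust", "tokio", "rust"]);

    // sub_tokens subtracts occurrences; one "tokio" is removed again
    base.sub_tokens(&["tokio"]);

    // set_token_count overwrites the stored count for a single token
    base.set_token_count("serde", 5);
    assert_eq!(base.token_count("serde"), 5);

    // add_tokens_from_freq merges the counts of another TokenFrequency into this one
    let mut other = TokenFrequency::new();
    other.add_tokens(&["rust"]);
    base.add_tokens_from_freq(&other);
    // expected 3 (2 added above + 1 merged) if the merge is additive
    println!("rust count after merge: {}", base.token_count("rust"));
}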
impl TokenFrequency
Implementation for retrieving information from TokenFrequency
pub fn token_count_vector(&self) -> Vec<(String, u64)>
Get a vector of all tokens and their counts
§Returns
Vec<(String, u64)> - Vector of tokens and their counts
pub fn token_count_vector_ref_str(&self) -> Vec<(&str, u64)>
Get a vector of all tokens and their counts (as &str)
§Returns
Vec<(&str, u64)> - Vector of tokens and their counts
pub fn token_count_hashmap_ref_str(&self) -> HashMap<&str, u64, RandomState>
Get a hashmap of all tokens and their counts (as &str)
§Returns
HashMap<&str, u64> - HashMap of tokens and their counts
pub fn token_count(&self, token: &str) -> u64
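A short sketch of the count accessors above, assuming the import path from the struct-level example. The ordering of the returned vectors is not specified on this page, so the sketch only checks lengths and individual counts.
use tf_idf_vectorizer::vectorizer::token::TokenFrequency; // import path assumed from the struct-level example

fn main() {
    let mut freq = TokenFrequency::new();
    freq.add_tokens(&["rust", "rust", "tokio"]);

    // owned and borrowed (token, count) pairs; ordering is not specified here
    let owned = freq.token_count_vector();
    let borrowed = freq.token_count_vector_ref_str();
    assert_eq!(owned.len(), 2);
    assert_eq!(borrowed.len(), 2);

    // HashMap view keyed by &str
    let map = freq.token_count_hashmap_ref_str();
    assert_eq!(map.get("rust"), Some(&2));

    // direct lookup of a single token; an unknown token is assumed to report 0
    assert_eq!(freq.token_count("tokio"), 1);
    assert_eq!(freq.token_count("missing"), 0);
}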
pub fn most_frequent_tokens_vector(&self) -> Vec<(String, u64)>
Get the most frequent tokens. If multiple tokens have the same count, all are returned.
§Returns
Vec<(String, u64)> - Vector of most frequent tokens and their counts
pub fn most_frequent_token_count(&self) -> u64
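A sketch of the two "most frequent" accessors, assuming the import path from the struct-level example and that most_frequent_token_count returns the maximum count itself. Because ties return every token at the maximum, the example uses a tie.
use tf_idf_vectorizer::vectorizer::token::TokenFrequency; // import path assumed from the struct-level example

fn main() {
    let mut freq = TokenFrequency::new();
    freq.add_tokens(&["rust", "rust", "tokio", "tokio", "serde"]);

    // "rust" and "tokio" are tied at 2, so both are returned
    let top = freq.most_frequent_tokens_vector();
    assert_eq!(top.len(), 2);
    assert!(top.iter().all(|(_, count)| *count == 2));

    // the maximum count itself (assumed meaning of most_frequent_token_count)
    assert_eq!(freq.most_frequent_token_count(), 2);
}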
pub fn contains_token(&self, token: &str) -> bool
pub fn token_set_ref_str(&self) -> Vec<&str>
pub fn token_hashset(&self) -> HashSet<String, RandomState>
pub fn token_hashset_ref_str(&self) -> HashSet<&str, RandomState>
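A sketch of the membership and token-set accessors, assuming the import path from the struct-level example and that the set-style methods return each distinct token exactly once.
use tf_idf_vectorizer::vectorizer::token::TokenFrequency; // import path assumed from the struct-level example

fn main() {
    let mut freq = TokenFrequency::new();
    freq.add_tokens(&["rust", "rust", "tokio"]);

    // membership test
    assert!(freq.contains_token("rust"));
    assert!(!freq.contains_token("missing"));

    // distinct tokens as borrowed strings (duplicates collapse to one entry)
    let tokens = freq.token_set_ref_str();
    assert_eq!(tokens.len(), 2);

    // owned and borrowed HashSet views
    let owned_set = freq.token_hashset();
    let borrowed_set = freq.token_hashset_ref_str();
    assert!(owned_set.contains("rust"));
    assert!(borrowed_set.contains("tokio"));
}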
pub fn remove_stop_tokens(&mut self, stop_tokens: &[&str]) -> u64
pub fn remove_tokens_by<F>(&mut self, condition: F) -> u64
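A sketch of stop-token removal, assuming the import path from the struct-level example. The u64 return value is taken to be a count of what was removed, and the trait bound on the remove_tokens_by predicate is not shown on this page, so the sketch sticks to remove_stop_tokens.
use tf_idf_vectorizer::vectorizer::token::TokenFrequency; // import path assumed from the struct-level example

fn main() {
    let mut freq = TokenFrequency::new();
    freq.add_tokens(&["the", "rust", "book", "the"]);

    // drop common stop words; the returned u64 is assumed to report how much was removed
    let removed = freq.remove_stop_tokens(&["the", "a", "of"]);
    println!("removed: {}", removed);

    assert!(!freq.contains_token("the"));
    assert!(freq.contains_token("rust"));
}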
pub fn sorted_frequency_vector(&self) -> Vec<(String, u64)>
Get a vector of tokens sorted by frequency (descending)
§Returns
Vec<(String, u64)> - Vector of tokens sorted by frequency
pub fn sorted_dict_order_vector(&self) -> Vec<(String, u64)>
Get a vector of tokens sorted by dictionary order (ascending)
§Returns
Vec<(String, u64)> - Vector of tokens sorted by dictionary order
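A sketch contrasting the two sort orders, assuming the import path from the struct-level example.
use tf_idf_vectorizer::vectorizer::token::TokenFrequency; // import path assumed from the struct-level example

fn main() {
    let mut freq = TokenFrequency::new();
    freq.add_tokens(&["b", "a", "a", "c", "c", "c"]);

    // descending by count: "c" (count 3) comes first
    let by_freq = freq.sorted_frequency_vector();
    assert_eq!(by_freq.first().map(|(t, c)| (t.as_str(), *c)), Some(("c", 3)));

    // ascending dictionary order: "a" comes first
    let by_dict = freq.sorted_dict_order_vector();
    assert_eq!(by_dict.first().map(|(t, _)| t.as_str()), Some("a"));
}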
pub fn unique_token_ratio(&self) -> f64
Calculate the diversity of tokens: 1.0 indicates complete diversity, 0.0 indicates no diversity.
§Returns
f64 - Diversity of tokens
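A worked sketch of the diversity value, assuming it is the number of distinct tokens divided by the total number of occurrences; that formula matches the 1.0 = complete diversity description, but it is not spelled out on this page.
use tf_idf_vectorizer::vectorizer::token::TokenFrequency; // import path assumed from the struct-level example

fn main() {
    let mut all_unique = TokenFrequency::new();
    all_unique.add_tokens(&["a", "b", "c"]);
    // every token appears exactly once, so diversity is at its maximum of 1.0
    assert!((all_unique.unique_token_ratio() - 1.0).abs() < 1e-9);

    let mut repeated = TokenFrequency::new();
    repeated.add_tokens(&["a", "a", "a", "b"]);
    // assumed formula: 2 distinct tokens / 4 occurrences = 0.5
    println!("diversity: {}", repeated.unique_token_ratio());
}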
pub fn probability_vector(&self) -> Vec<(String, f64)>
Get the probability distribution P(token) (owned String version). Returns an empty vector if the total is 0.
pub fn probability_vector_ref_str(&self) -> Vec<(&str, f64)>
Get the probability distribution P(token) (as &str). Returns an empty vector if the total is 0.
pub fn probability(&self, token: &str) -> f64
Get the probability P(token) for a specific token. Returns 0.0 if the total is 0.
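A sketch of the probability accessors, assuming the import path from the struct-level example and that P(token) is the token's count divided by the total count (consistent with the "total is 0" wording above).
use tf_idf_vectorizer::vectorizer::token::TokenFrequency; // import path assumed from the struct-level example

fn main() {
    let mut freq = TokenFrequency::new();
    freq.add_tokens(&["rust", "rust", "tokio", "serde"]);

    // assumed: P("rust") = 2 / 4 = 0.5
    assert!((freq.probability("rust") - 0.5).abs() < 1e-9);

    // under that assumption the full distribution sums to 1.0 when the total is non-zero
    let total: f64 = freq.probability_vector().iter().map(|(_, p)| p).sum();
    assert!((total - 1.0).abs() < 1e-9);

    // an empty frequency table yields 0.0 / an empty vector, as documented above
    let empty = TokenFrequency::new();
    assert_eq!(empty.probability("rust"), 0.0);
    assert!(empty.probability_vector_ref_str().is_empty());
}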
pub fn shrink_to_fit(&mut self)
Shrink internal storage to fit current size
Trait Implementations§
impl Clone for TokenFrequency
fn clone(&self) -> TokenFrequency
fn clone_from(&mut self, source: &Self)
Performs copy-assignment from source.