pub struct TokenFrequency { /* private fields */ }
Expand description
TokenFrequency struct. Manages the frequency of token occurrences by counting the number of times each token appears.
§Examples
use crate::tf_idf_vectorizer::vectorizer::token::TokenFrequency;
let mut token_freq = TokenFrequency::new();
token_freq.add_token("token1");
token_freq.add_token("token2");
token_freq.add_token("token1");
assert_eq!(token_freq.token_count("token1"), 2);
Implementations§
Source§impl TokenFrequency
Implementation for adding and removing tokens
impl TokenFrequency
Implementation for adding and removing tokens
Sourcepub fn new() -> Self
pub fn new() -> Self
Create a new TokenFrequency
Examples found in repository?
5fn main() {
6 // build corpus
7 let corpus = Arc::new(Corpus::new());
8
9 // add documents
10 let mut freq1 = TokenFrequency::new();
11 freq1.add_tokens(&["rust", "高速", "並列", "rust"]);
12 let mut freq2 = TokenFrequency::new();
13 freq2.add_tokens(&["rust", "柔軟", "安全", "rust"]);
14
15 // build query
16 let mut vectorizer: TFIDFVectorizer<u16> = TFIDFVectorizer::new(corpus);
17 vectorizer.add_doc("doc1".to_string(), &freq1);
18 vectorizer.add_doc("doc2".to_string(), &freq2);
19
20 // similarity search
21 let mut query_tokens = TokenFrequency::new();
22 query_tokens.add_tokens(&["rust", "高速"]);
23 let algorithm = SimilarityAlgorithm::CosineSimilarity;
24 let mut result = vectorizer.similarity(&query_tokens, &algorithm);
25 result.sort_by_score();
26
27 // print result
28 result.list.iter().for_each(|(k, s, l)| {
29 println!("doc: {}, score: {}, length: {}", k, s, l);
30 });
31 // debug
32 println!("result count: {}", result.list.len());
33 println!("{:?}", vectorizer);
34}
Sourcepub fn add_tokens<T>(&mut self, tokens: &[T]) -> &mut Self
pub fn add_tokens<T>(&mut self, tokens: &[T]) -> &mut Self
Examples found in repository?
5fn main() {
6 // build corpus
7 let corpus = Arc::new(Corpus::new());
8
9 // add documents
10 let mut freq1 = TokenFrequency::new();
11 freq1.add_tokens(&["rust", "高速", "並列", "rust"]);
12 let mut freq2 = TokenFrequency::new();
13 freq2.add_tokens(&["rust", "柔軟", "安全", "rust"]);
14
15 // build query
16 let mut vectorizer: TFIDFVectorizer<u16> = TFIDFVectorizer::new(corpus);
17 vectorizer.add_doc("doc1".to_string(), &freq1);
18 vectorizer.add_doc("doc2".to_string(), &freq2);
19
20 // similarity search
21 let mut query_tokens = TokenFrequency::new();
22 query_tokens.add_tokens(&["rust", "高速"]);
23 let algorithm = SimilarityAlgorithm::CosineSimilarity;
24 let mut result = vectorizer.similarity(&query_tokens, &algorithm);
25 result.sort_by_score();
26
27 // print result
28 result.list.iter().for_each(|(k, s, l)| {
29 println!("doc: {}, score: {}, length: {}", k, s, l);
30 });
31 // debug
32 println!("result count: {}", result.list.len());
33 println!("{:?}", vectorizer);
34}
Sourcepub fn sub_tokens<T>(&mut self, tokens: &[T]) -> &mut Self
pub fn sub_tokens<T>(&mut self, tokens: &[T]) -> &mut Self
Sourcepub fn set_token_count(&mut self, token: &str, count: u64) -> &mut Self
pub fn set_token_count(&mut self, token: &str, count: u64) -> &mut Self
Sourcepub fn add_tokens_from_freq(&mut self, other: &TokenFrequency) -> &mut Self
pub fn add_tokens_from_freq(&mut self, other: &TokenFrequency) -> &mut Self
Source§impl TokenFrequency
Implementation for retrieving information from TokenFrequency
impl TokenFrequency
Implementation for retrieving information from TokenFrequency
Sourcepub fn token_count_vector(&self) -> Vec<(String, u64)>
pub fn token_count_vector(&self) -> Vec<(String, u64)>
Get a vector of all tokens and their counts
§Returns
Vec<(String, u64)>- Vector of tokens and their counts
Sourcepub fn token_count_vector_ref_str(&self) -> Vec<(&str, u64)>
pub fn token_count_vector_ref_str(&self) -> Vec<(&str, u64)>
Get a vector of all tokens and their counts (as &str)
§Returns
Vec<(&str, u64)>- Vector of tokens and their counts
Sourcepub fn token_count_hashmap_ref_str(&self) -> HashMap<&str, u64, RandomState>
pub fn token_count_hashmap_ref_str(&self) -> HashMap<&str, u64, RandomState>
Get a hashmap of all tokens and their counts (as &str)
§Returns
HashMap<&str, u64>- HashMap of tokens and their counts
Sourcepub fn token_count(&self, token: &str) -> u64
pub fn token_count(&self, token: &str) -> u64
Sourcepub fn most_frequent_tokens_vector(&self) -> Vec<(String, u64)>
pub fn most_frequent_tokens_vector(&self) -> Vec<(String, u64)>
Get the most frequent tokens. If multiple tokens share the highest count, all of them are returned.
§Returns
Vec<(String, u64)>- Vector of most frequent tokens and their counts
Sourcepub fn most_frequent_token_count(&self) -> u64
pub fn most_frequent_token_count(&self) -> u64
Sourcepub fn contains_token(&self, token: &str) -> bool
pub fn contains_token(&self, token: &str) -> bool
Sourcepub fn token_set_ref_str(&self) -> Vec<&str>
pub fn token_set_ref_str(&self) -> Vec<&str>
Sourcepub fn token_hashset(&self) -> HashSet<String, RandomState>
pub fn token_hashset(&self) -> HashSet<String, RandomState>
Sourcepub fn token_hashset_ref_str(&self) -> HashSet<&str, RandomState>
pub fn token_hashset_ref_str(&self) -> HashSet<&str, RandomState>
Sourcepub fn remove_stop_tokens(&mut self, stop_tokens: &[&str]) -> u64
pub fn remove_stop_tokens(&mut self, stop_tokens: &[&str]) -> u64
Sourcepub fn remove_tokens_by<F>(&mut self, condition: F) -> u64
pub fn remove_tokens_by<F>(&mut self, condition: F) -> u64
Sourcepub fn sorted_frequency_vector(&self) -> Vec<(String, u64)>
pub fn sorted_frequency_vector(&self) -> Vec<(String, u64)>
Get a vector of tokens sorted by frequency (descending)
§Returns
Vec<(String, u64)>- Vector of tokens sorted by frequency
Sourcepub fn sorted_dict_order_vector(&self) -> Vec<(String, u64)>
pub fn sorted_dict_order_vector(&self) -> Vec<(String, u64)>
Get a vector of tokens sorted by dictionary order (ascending)
§Returns
Vec<(String, u64)>- Vector of tokens sorted by dictionary order
Sourcepub fn unique_token_ratio(&self) -> f64
pub fn unique_token_ratio(&self) -> f64
Calculate the diversity of tokens. 1.0 indicates complete diversity, 0.0 indicates no diversity.
§Returns
f64- Diversity of tokens
Sourcepub fn probability_vector(&self) -> Vec<(String, f64)>
pub fn probability_vector(&self) -> Vec<(String, f64)>
Get the probability distribution P(token) (owned String version). Returns an empty vector if total is 0.
Sourcepub fn probability_vector_ref_str(&self) -> Vec<(&str, f64)>
pub fn probability_vector_ref_str(&self) -> Vec<(&str, f64)>
Get the probability distribution P(token) (as &str). Returns an empty vector if total is 0.
Sourcepub fn probability(&self, token: &str) -> f64
pub fn probability(&self, token: &str) -> f64
Get the probability P(token) for a specific token. Returns 0.0 if total is 0.
Sourcepub fn shrink_to_fit(&mut self)
pub fn shrink_to_fit(&mut self)
Shrink internal storage to fit current size
Trait Implementations§
Source§impl Clone for TokenFrequency
impl Clone for TokenFrequency
Source§fn clone(&self) -> TokenFrequency
fn clone(&self) -> TokenFrequency
1.0.0 · Source§fn clone_from(&mut self, source: &Self)
fn clone_from(&mut self, source: &Self)
Performs copy-assignment from source. Read more
Source§impl Debug for TokenFrequency
impl Debug for TokenFrequency
Source§impl<'de> Deserialize<'de> for TokenFrequency
impl<'de> Deserialize<'de> for TokenFrequency
Source§fn deserialize<__D>(__deserializer: __D) -> Result<Self, __D::Error>where
__D: Deserializer<'de>,
fn deserialize<__D>(__deserializer: __D) -> Result<Self, __D::Error>where
__D: Deserializer<'de>,
Source§impl<T> From<&[T]> for TokenFrequency
impl<T> From<&[T]> for TokenFrequency
Auto Trait Implementations§
impl Freeze for TokenFrequency
impl RefUnwindSafe for TokenFrequency
impl Send for TokenFrequency
impl Sync for TokenFrequency
impl Unpin for TokenFrequency
impl UnwindSafe for TokenFrequency
Blanket Implementations§
Source§impl<T> BorrowMut<T> for Twhere
T: ?Sized,
impl<T> BorrowMut<T> for Twhere
T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Source§impl<T> CloneToUninit for Twhere
T: Clone,
impl<T> CloneToUninit for Twhere
T: Clone,
Source§impl<T> IntoEither for T
impl<T> IntoEither for T
Source§fn into_either(self, into_left: bool) -> Either<Self, Self>
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self>
if into_left is true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read more
Source§fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self>
if into_left(&self) returns true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read more