pub struct TFIDFVectorizer<N = f32, K = String, E = DefaultTFIDFEngine>
where
    N: Num + Copy + Into<f64> + Send + Sync,
    E: TFIDFEngine<N> + Send + Sync,
    K: Clone + Send + Sync,
{
    pub documents: Vec<TFVector<N, K>>,
    pub token_dim_sample: IndexSet<Box<str>, RandomState>,
    pub corpus_ref: Arc<Corpus>,
    pub idf: IDFVector<N>,
    /* private fields */
}

Fields

documents: Vec<TFVector<N, K>>
The TF vector of each document.

token_dim_sample: IndexSet<Box<str>, RandomState>
The TF vectors' token dimension sample.

corpus_ref: Arc<Corpus>
Corpus reference.

idf: IDFVector<N>
IDF vector.
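All three type parameters have defaults: N is the numeric type the TF/IDF weights are stored as, K is the document key type, and E is the scoring engine. As a minimal sketch, assuming only the defaults shown above and the constructor documented below, the vectorizer can be built with the defaults or with a smaller quantized weight type:

// f32 weights, String keys, DefaultTFIDFEngine (all defaults).
let full: TFIDFVectorizer = TFIDFVectorizer::new(Arc::new(Corpus::new()));

// Quantized u16 weights for a smaller memory footprint, still keyed by String.
let compact: TFIDFVectorizer<u16> = TFIDFVectorizer::new(Arc::new(Corpus::new()));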
Implementations

impl<N, K, E> TFIDFVectorizer<N, K, E>
pub fn similarity(
    &mut self,
    freq: &TokenFrequency,
    algorithm: &SimilarityAlgorithm,
) -> Hits<K>

Calculate similarity scores for the query token frequency using the specified similarity algorithm. Calls update_idf() internally, so the latest IDF vector is always used.
Examples found in repository
fn main() {
    // build corpus
    let corpus = Arc::new(Corpus::new());

    // add documents
    let mut freq1 = TokenFrequency::new();
    freq1.add_tokens(&["rust", "高速", "並列", "rust"]);
    let mut freq2 = TokenFrequency::new();
    freq2.add_tokens(&["rust", "柔軟", "安全", "rust"]);

    // build the vectorizer and register documents
    let mut vectorizer: TFIDFVectorizer<u16> = TFIDFVectorizer::new(corpus);
    vectorizer.add_doc("doc1".to_string(), &freq1);
    vectorizer.add_doc("doc2".to_string(), &freq2);
    vectorizer.del_doc(&"doc1".to_string());
    vectorizer.add_doc("doc3".to_string(), &freq1);

    // similarity search
    let mut query_tokens = TokenFrequency::new();
    query_tokens.add_tokens(&["rust", "高速"]);
    let algorithm = SimilarityAlgorithm::CosineSimilarity;
    let mut result = vectorizer.similarity(&query_tokens, &algorithm);
    result.sort_by_score();

    // print result
    result.list.iter().for_each(|(k, s, l)| {
        println!("doc: {}, score: {}, length: {}", k, s, l);
    });
    // debug
    println!("result count: {}", result.list.len());
    println!("{:?}", vectorizer);
}

pub fn similarity_uncheck_idf(
    &self,
    freq: &TokenFrequency,
    algorithm: &SimilarityAlgorithm,
) -> Hits<K>
Calculate similarity scores for the query token frequency using the specified similarity algorithm. Does not check whether the IDF vector is up to date, so it can be called through an immutable reference. Call update_idf() manually beforehand if needed.
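A minimal sketch of the read-only search path, reusing vectorizer and query_tokens from the repository example above (those names are assumptions carried over from it):

// Refresh the IDF once while a mutable reference is still available.
vectorizer.update_idf();

// Afterwards the search itself only needs &self.
let algorithm = SimilarityAlgorithm::CosineSimilarity;
let mut result = vectorizer.similarity_uncheck_idf(&query_tokens, &algorithm);
result.sort_by_score();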
impl<N, K, E> TFIDFVectorizer<N, K, E>
pub fn new(corpus_ref: Arc<Corpus>) -> Self

Create a new TFIDFVectorizer instance.
Examples found in repository: see the example under similarity() above.
pub fn set_corpus_ref(&mut self, corpus_ref: Arc<Corpus>)
Set the corpus reference and recalculate the IDF vector.
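A minimal sketch, assuming other_corpus stands in for some separately built Arc<Corpus>:

// Point the vectorizer at a different shared corpus;
// the IDF vector is recalculated against it right away.
let other_corpus: Arc<Corpus> = Arc::new(Corpus::new()); // stand-in for an existing corpus
vectorizer.set_corpus_ref(other_corpus);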
pub fn update_idf(&mut self)

Recalculate the IDF vector if the corpus has changed.
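Because the corpus sits behind an Arc, several vectorizers can share it, and documents added through one of them change the corpus the others see. A minimal sketch of that situation, with the vectorizer names and the u16 weight type chosen purely for illustration:

let shared = Arc::new(Corpus::new());
let mut a: TFIDFVectorizer<u16> = TFIDFVectorizer::new(Arc::clone(&shared));
let mut b: TFIDFVectorizer<u16> = TFIDFVectorizer::new(Arc::clone(&shared));

let mut freq = TokenFrequency::new();
freq.add_tokens(&["rust", "tfidf"]);
b.add_doc("doc-from-b".to_string(), &freq); // the shared corpus is updated through `b`

a.update_idf(); // `a` picks up the corpus change and recalculates its IDF vector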
impl<N, K, E> TFIDFVectorizer<N, K, E>
pub fn add_doc(&mut self, doc_id: K, doc: &TokenFrequency)

Add a document. The referenced Corpus is also updated immediately.
Examples found in repository: see the example under similarity() above.
pub fn del_doc(&mut self, doc_id: &K)
where
    K: PartialEq,

Delete the document with the given ID.
Examples found in repository: see the example under similarity() above.
pub fn get_tf(&self, key: &K) -> Option<&TFVector<N, K>>
where
    K: PartialEq,

Get the TFVector for the given document ID.
pub fn get_tf_into_token_freq(&self, key: &K) -> Option<TokenFrequency>

Get a TokenFrequency for the given document ID. If the weights are quantized there may be some error, and tokens not included in the corpus are ignored.
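A minimal sketch of reading a stored document back out, reusing vectorizer from the repository example above:

// Borrow the stored TF vector for "doc2", if that document exists.
let tf = vectorizer.get_tf(&"doc2".to_string());
println!("doc2 has a TF vector: {}", tf.is_some());

// Reconstruct an approximate TokenFrequency (quantization may introduce
// small errors, and tokens unknown to the corpus are dropped).
let restored = vectorizer.get_tf_into_token_freq(&"doc2".to_string());
println!("doc2 could be restored: {}", restored.is_some());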
pub fn contains_doc(&self, key: &K) -> bool
where
    K: PartialEq,

Check if a document with the given ID exists.
pub fn contains_token(&self, token: &str) -> bool

Check if the token exists in the token dimension sample.
pub fn contains_tokens_from_freq(&self, freq: &TokenFrequency) -> bool

Check if all tokens in the given TokenFrequency exist in the token dimension sample.

pub fn doc_num(&self) -> usize

Return the number of stored documents.
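A minimal sketch of the lookup helpers, again reusing vectorizer and query_tokens from the repository example above:

// Membership checks against stored documents and the token dimension sample,
// plus the document count.
println!("has doc3: {}", vectorizer.contains_doc(&"doc3".to_string()));
println!("knows 'rust': {}", vectorizer.contains_token("rust"));
println!("query fully covered: {}", vectorizer.contains_tokens_from_freq(&query_tokens));
println!("documents stored: {}", vectorizer.doc_num());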
Trait Implementations

impl<N, K, E> Clone for TFIDFVectorizer<N, K, E>

fn clone(&self) -> TFIDFVectorizer<N, K, E>
Returns a copy of the value.

fn clone_from(&mut self, source: &Self)
Performs copy-assignment from source.