pub struct TFIDFVectorizer<N = f32, K = String, E = DefaultTFIDFEngine>
where
    N: Num + Copy + Into<f64> + Send + Sync,
    E: TFIDFEngine<N> + Send + Sync,
    K: Clone + Send + Sync,
{
    pub documents: Vec<TFVector<N, K>>,
    pub token_dim_sample: IndexSet<Box<str>, RandomState>,
    pub corpus_ref: Arc<Corpus>,
    pub idf: IDFVector<N>,
    /* private fields */
}

Fields

documents: Vec<TFVector<N, K>>
The TF vector of each document.

token_dim_sample: IndexSet<Box<str>, RandomState>
The TF vectors' token dimension sample.

corpus_ref: Arc<Corpus>
Corpus reference.

idf: IDFVector<N>
IDF vector.
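All three type parameters have defaults: N is the numeric type the TF/IDF weights are stored as, K is the document key type, and E is the scoring engine. As a minimal sketch, assuming only the defaults shown above and the constructor documented below, the vectorizer can be built with the defaults or with a smaller quantized weight type:

// f32 weights, String keys, DefaultTFIDFEngine (all defaults).
let full: TFIDFVectorizer = TFIDFVectorizer::new(Arc::new(Corpus::new()));

// Quantized u16 weights for a smaller memory footprint, still keyed by String.
let compact: TFIDFVectorizer<u16> = TFIDFVectorizer::new(Arc::new(Corpus::new()));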
Implementations

impl<N, K, E> TFIDFVectorizer<N, K, E>
pub fn similarity(
    &mut self,
    freq: &TokenFrequency,
    algorithm: &SimilarityAlgorithm,
) -> Hits<K>

Calculate similarity scores for the query token frequency using the specified similarity algorithm. Calls update_idf() internally, so the latest IDF vector is always used.
Examples found in repository
fn main() {
    // build corpus
    let corpus = Arc::new(Corpus::new());

    // add documents
    let mut freq1 = TokenFrequency::new();
    freq1.add_tokens(&["rust", "高速", "並列", "rust"]);
    let mut freq2 = TokenFrequency::new();
    freq2.add_tokens(&["rust", "柔軟", "安全", "rust"]);

    // build the vectorizer and register documents
    let mut vectorizer: TFIDFVectorizer<u16> = TFIDFVectorizer::new(corpus);
    vectorizer.add_doc("doc1".to_string(), &freq1);
    vectorizer.add_doc("doc2".to_string(), &freq2);
    vectorizer.del_doc(&"doc1".to_string());
    vectorizer.add_doc("doc3".to_string(), &freq1);

    // similarity search
    let mut query_tokens = TokenFrequency::new();
    query_tokens.add_tokens(&["rust", "高速"]);
    let algorithm = SimilarityAlgorithm::CosineSimilarity;
    let mut result = vectorizer.similarity(&query_tokens, &algorithm);
    result.sort_by_score();

    // print result
    result.list.iter().for_each(|(k, s, l)| {
        println!("doc: {}, score: {}, length: {}", k, s, l);
    });
    // debug
    println!("result count: {}", result.list.len());
    println!("{:?}", vectorizer);
}

pub fn similarity_uncheck_idf(
    &self,
    freq: &TokenFrequency,
    algorithm: &SimilarityAlgorithm,
) -> Hits<K>
Calculate similarity scores for the query token frequency using the specified similarity algorithm. Does not check whether the IDF vector is up to date, so it can be called through an immutable reference. Call update_idf() manually beforehand if needed.
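A minimal sketch of the read-only search path, reusing vectorizer and query_tokens from the repository example above (those names are assumptions carried over from it):

// Refresh the IDF once while a mutable reference is still available.
vectorizer.update_idf();

// Afterwards the search itself only needs &self.
let algorithm = SimilarityAlgorithm::CosineSimilarity;
let mut result = vectorizer.similarity_uncheck_idf(&query_tokens, &algorithm);
result.sort_by_score();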
impl<N, K, E> TFIDFVectorizer<N, K, E>
pub fn new(corpus_ref: Arc<Corpus>) -> Self

Create a new TFIDFVectorizer instance.
Examples found in repository: see the example under similarity() above.
pub fn set_corpus_ref(&mut self, corpus_ref: Arc<Corpus>)
Set the corpus reference and recalculate the IDF vector.
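A minimal sketch, assuming other_corpus stands in for some separately built Arc<Corpus>:

// Point the vectorizer at a different shared corpus;
// the IDF vector is recalculated against it right away.
let other_corpus: Arc<Corpus> = Arc::new(Corpus::new()); // stand-in for an existing corpus
vectorizer.set_corpus_ref(other_corpus);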
pub fn update_idf(&mut self)

Recalculate the IDF vector if the corpus has changed.
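Because the corpus sits behind an Arc, several vectorizers can share it, and documents added through one of them change the corpus the others see. A minimal sketch of that situation, with the vectorizer names and the u16 weight type chosen purely for illustration:

let shared = Arc::new(Corpus::new());
let mut a: TFIDFVectorizer<u16> = TFIDFVectorizer::new(Arc::clone(&shared));
let mut b: TFIDFVectorizer<u16> = TFIDFVectorizer::new(Arc::clone(&shared));

let mut freq = TokenFrequency::new();
freq.add_tokens(&["rust", "tfidf"]);
b.add_doc("doc-from-b".to_string(), &freq); // the shared corpus is updated through `b`

a.update_idf(); // `a` picks up the corpus change and recalculates its IDF vector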
impl<N, K, E> TFIDFVectorizer<N, K, E>
pub fn add_doc(&mut self, doc_id: K, doc: &TokenFrequency)

Add a document. The referenced Corpus is also updated immediately.
Examples found in repository: see the example under similarity() above.
pub fn del_doc(&mut self, doc_id: &K)
where
    K: PartialEq,

Delete the document with the given ID.
Examples found in repository: see the example under similarity() above.
pub fn get_tf(&self, key: &K) -> Option<&TFVector<N, K>>
where
    K: PartialEq,

Get the TFVector for the given document ID.
pub fn get_tf_into_token_freq(&self, key: &K) -> Option<TokenFrequency>

Get a TokenFrequency for the given document ID. If the weights are quantized there may be some error, and tokens not included in the corpus are ignored.
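A minimal sketch of reading a stored document back out, reusing vectorizer from the repository example above:

// Borrow the stored TF vector for "doc2", if that document exists.
let tf = vectorizer.get_tf(&"doc2".to_string());
println!("doc2 has a TF vector: {}", tf.is_some());

// Reconstruct an approximate TokenFrequency (quantization may introduce
// small errors, and tokens unknown to the corpus are dropped).
let restored = vectorizer.get_tf_into_token_freq(&"doc2".to_string());
println!("doc2 could be restored: {}", restored.is_some());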
pub fn contains_doc(&self, key: &K) -> bool
where
    K: PartialEq,

Check if a document with the given ID exists.
pub fn contains_token(&self, token: &str) -> bool

Check if the token exists in the token dimension sample.
pub fn contains_tokens_from_freq(&self, freq: &TokenFrequency) -> bool

Check if all tokens in the given TokenFrequency exist in the token dimension sample.

pub fn doc_num(&self) -> usize

Return the number of stored documents.
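A minimal sketch of the lookup helpers, again reusing vectorizer and query_tokens from the repository example above:

// Membership checks against stored documents and the token dimension sample,
// plus the document count.
println!("has doc3: {}", vectorizer.contains_doc(&"doc3".to_string()));
println!("knows 'rust': {}", vectorizer.contains_token("rust"));
println!("query fully covered: {}", vectorizer.contains_tokens_from_freq(&query_tokens));
println!("documents stored: {}", vectorizer.doc_num());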
Trait Implementations

impl<N, K, E> Clone for TFIDFVectorizer<N, K, E>

fn clone(&self) -> TFIDFVectorizer<N, K, E>
Returns a copy of the value.

fn clone_from(&mut self, source: &Self)
Performs copy-assignment from source.