TFIDFVectorizer

Struct TFIDFVectorizer 

Source
pub struct TFIDFVectorizer<N = f32, K = String, E = DefaultTFIDFEngine>
where N: Num + Copy, E: TFIDFEngine<N>,
{ pub documents: Vec<TFVector<N, K>>, pub token_dim_sample: IndexSet<String, RandomState>, pub corpus_ref: Arc<Corpus>, pub idf: IDFVector<N>, /* private fields */ }

Fields§

§documents: Vec<TFVector<N, K>>

The documents’ TF vectors

§token_dim_sample: IndexSet<String, RandomState>

TF Vector’s token dimension sample

§corpus_ref: Arc<Corpus>

Corpus reference

§idf: IDFVector<N>

IDF Vector

Implementations§

Source§

impl<N, K, E> TFIDFVectorizer<N, K, E>
where K: Clone + Sync + Send + PartialEq, N: Num + Copy + Into<f64> + DeNormalizer + Send + Sync, E: TFIDFEngine<N>,

Source

pub fn similarity( &mut self, freq: &TokenFrequency, algorithm: &SimilarityAlgorithm, ) -> Hits<K>

Calculate similarity scores based on the query token frequency. Uses the specified similarity algorithm. Calls update_idf() so that the latest IDF vector is used.

Examples found in repository?
examples/basic.rs (line 26)
5fn main() {
6    // build corpus
7    let corpus = Arc::new(Corpus::new());
8
9    // add documents
10    let mut freq1 = TokenFrequency::new();
11    freq1.add_tokens(&["rust", "高速", "並列", "rust"]);
12    let mut freq2 = TokenFrequency::new();
13    freq2.add_tokens(&["rust", "柔軟", "安全", "rust"]);
14
15    // build query
16    let mut vectorizer: TFIDFVectorizer<u16> = TFIDFVectorizer::new(corpus);    
17    vectorizer.add_doc("doc1".to_string(), &freq1);
18    vectorizer.add_doc("doc2".to_string(), &freq2);
19    vectorizer.del_doc(&"doc1".to_string());
20    vectorizer.add_doc("doc3".to_string(), &freq1);
21
22    // similarity search
23    let mut query_tokens = TokenFrequency::new();
24    query_tokens.add_tokens(&["rust", "高速"]);
25    let algorithm = SimilarityAlgorithm::CosineSimilarity;
26    let mut result = vectorizer.similarity(&query_tokens, &algorithm);
27    result.sort_by_score();
28
29    // print result
30    result.list.iter().for_each(|(k, s, l)| {
31        println!("doc: {}, score: {}, length: {}", k, s, l);
32    });
33    // debug
34    println!("result count: {}", result.list.len());
35    println!("{:?}", vectorizer);
36}
Source

pub fn similarity_uncheck_idf( &self, freq: &TokenFrequency, algorithm: &SimilarityAlgorithm, ) -> Hits<K>

Calculate similarity scores based on the query token frequency. Uses the specified similarity algorithm. Does not check the IDF vector (so it can be called with an immutable reference). Call update_idf() manually if needed.

Source§

impl<N, K, E> TFIDFVectorizer<N, K, E>
where N: Num + Copy, E: TFIDFEngine<N>,

Source

pub fn new(corpus_ref: Arc<Corpus>) -> Self

Create a new TFIDFVectorizer instance

Examples found in repository?
examples/basic.rs (line 16)
5fn main() {
6    // build corpus
7    let corpus = Arc::new(Corpus::new());
8
9    // add documents
10    let mut freq1 = TokenFrequency::new();
11    freq1.add_tokens(&["rust", "高速", "並列", "rust"]);
12    let mut freq2 = TokenFrequency::new();
13    freq2.add_tokens(&["rust", "柔軟", "安全", "rust"]);
14
15    // build query
16    let mut vectorizer: TFIDFVectorizer<u16> = TFIDFVectorizer::new(corpus);    
17    vectorizer.add_doc("doc1".to_string(), &freq1);
18    vectorizer.add_doc("doc2".to_string(), &freq2);
19    vectorizer.del_doc(&"doc1".to_string());
20    vectorizer.add_doc("doc3".to_string(), &freq1);
21
22    // similarity search
23    let mut query_tokens = TokenFrequency::new();
24    query_tokens.add_tokens(&["rust", "高速"]);
25    let algorithm = SimilarityAlgorithm::CosineSimilarity;
26    let mut result = vectorizer.similarity(&query_tokens, &algorithm);
27    result.sort_by_score();
28
29    // print result
30    result.list.iter().for_each(|(k, s, l)| {
31        println!("doc: {}, score: {}, length: {}", k, s, l);
32    });
33    // debug
34    println!("result count: {}", result.list.len());
35    println!("{:?}", vectorizer);
36}
Source

pub fn set_corpus_ref(&mut self, corpus_ref: Arc<Corpus>)

Set the corpus reference and recalculate the IDF.

Source

pub fn update_idf(&mut self)

Recalculate the IDF if the Corpus has changed.

Source§

impl<N, K, E> TFIDFVectorizer<N, K, E>
where N: Num + Copy + Into<f64>, E: TFIDFEngine<N>, K: PartialEq,

Source

pub fn add_doc(&mut self, doc_id: K, doc: &TokenFrequency)

Add a document. The immediately referenced Corpus is also updated.

Examples found in repository?
examples/basic.rs (line 17)
5fn main() {
6    // build corpus
7    let corpus = Arc::new(Corpus::new());
8
9    // add documents
10    let mut freq1 = TokenFrequency::new();
11    freq1.add_tokens(&["rust", "高速", "並列", "rust"]);
12    let mut freq2 = TokenFrequency::new();
13    freq2.add_tokens(&["rust", "柔軟", "安全", "rust"]);
14
15    // build query
16    let mut vectorizer: TFIDFVectorizer<u16> = TFIDFVectorizer::new(corpus);    
17    vectorizer.add_doc("doc1".to_string(), &freq1);
18    vectorizer.add_doc("doc2".to_string(), &freq2);
19    vectorizer.del_doc(&"doc1".to_string());
20    vectorizer.add_doc("doc3".to_string(), &freq1);
21
22    // similarity search
23    let mut query_tokens = TokenFrequency::new();
24    query_tokens.add_tokens(&["rust", "高速"]);
25    let algorithm = SimilarityAlgorithm::CosineSimilarity;
26    let mut result = vectorizer.similarity(&query_tokens, &algorithm);
27    result.sort_by_score();
28
29    // print result
30    result.list.iter().for_each(|(k, s, l)| {
31        println!("doc: {}, score: {}, length: {}", k, s, l);
32    });
33    // debug
34    println!("result count: {}", result.list.len());
35    println!("{:?}", vectorizer);
36}
Source

pub fn del_doc(&mut self, doc_id: &K)
where K: PartialEq,

Examples found in repository?
examples/basic.rs (line 19)
5fn main() {
6    // build corpus
7    let corpus = Arc::new(Corpus::new());
8
9    // add documents
10    let mut freq1 = TokenFrequency::new();
11    freq1.add_tokens(&["rust", "高速", "並列", "rust"]);
12    let mut freq2 = TokenFrequency::new();
13    freq2.add_tokens(&["rust", "柔軟", "安全", "rust"]);
14
15    // build query
16    let mut vectorizer: TFIDFVectorizer<u16> = TFIDFVectorizer::new(corpus);    
17    vectorizer.add_doc("doc1".to_string(), &freq1);
18    vectorizer.add_doc("doc2".to_string(), &freq2);
19    vectorizer.del_doc(&"doc1".to_string());
20    vectorizer.add_doc("doc3".to_string(), &freq1);
21
22    // similarity search
23    let mut query_tokens = TokenFrequency::new();
24    query_tokens.add_tokens(&["rust", "高速"]);
25    let algorithm = SimilarityAlgorithm::CosineSimilarity;
26    let mut result = vectorizer.similarity(&query_tokens, &algorithm);
27    result.sort_by_score();
28
29    // print result
30    result.list.iter().for_each(|(k, s, l)| {
31        println!("doc: {}, score: {}, length: {}", k, s, l);
32    });
33    // debug
34    println!("result count: {}", result.list.len());
35    println!("{:?}", vectorizer);
36}
Source

pub fn get_tf(&self, key: &K) -> Option<&TFVector<N, K>>
where K: PartialEq,

Get TFVector by document ID

Source

pub fn get_tf_into_token_freq(&self, key: &K) -> Option<TokenFrequency>

Get the TokenFrequency by document ID. If quantized, there may be some error. Words not included in the corpus are ignored.

Source

pub fn contains_doc(&self, key: &K) -> bool
where K: PartialEq,

Check if a document with the given ID exists

Source

pub fn contains_token(&self, token: &str) -> bool

Check if the token exists in the token dimension sample

Source

pub fn contains_tokens_from_freq(&self, freq: &TokenFrequency) -> bool

Check if all tokens in the given TokenFrequency exist in the token dimension sample

Source

pub fn doc_num(&self) -> usize

Trait Implementations§

Source§

impl<N, K: Clone, E> Clone for TFIDFVectorizer<N, K, E>
where N: Num + Copy + Clone, E: TFIDFEngine<N> + Clone,

Source§

fn clone(&self) -> TFIDFVectorizer<N, K, E>

Returns a duplicate of the value. Read more
1.0.0 · Source§

fn clone_from(&mut self, source: &Self)

Performs copy-assignment from source. Read more
Source§

impl<N, K: Debug, E> Debug for TFIDFVectorizer<N, K, E>
where N: Num + Copy + Debug, E: TFIDFEngine<N> + Debug,

Source§

fn fmt(&self, f: &mut Formatter<'_>) -> Result

Formats the value using the given formatter. Read more
Source§

impl<N, K, E> Serialize for TFIDFVectorizer<N, K, E>
where N: Num + Copy + Serialize, K: Serialize, E: TFIDFEngine<N>,

Source§

fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where S: Serializer,

Serialize TFIDFVectorizer. This struct contains references, so they are excluded from serialization. Use TFIDFData for deserialization.

Auto Trait Implementations§

§

impl<N, K, E> Freeze for TFIDFVectorizer<N, K, E>

§

impl<N = f32, K = String, E = DefaultTFIDFEngine> !RefUnwindSafe for TFIDFVectorizer<N, K, E>

§

impl<N, K, E> Send for TFIDFVectorizer<N, K, E>
where K: Send, N: Send,

§

impl<N, K, E> Sync for TFIDFVectorizer<N, K, E>
where K: Sync, N: Sync,

§

impl<N, K, E> Unpin for TFIDFVectorizer<N, K, E>
where E: Unpin, K: Unpin, N: Unpin,

§

impl<N = f32, K = String, E = DefaultTFIDFEngine> !UnwindSafe for TFIDFVectorizer<N, K, E>

Blanket Implementations§

Source§

impl<T> Any for T
where T: 'static + ?Sized,

Source§

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more
Source§

impl<T> Borrow<T> for T
where T: ?Sized,

Source§

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more
Source§

impl<T> BorrowMut<T> for T
where T: ?Sized,

Source§

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more
Source§

impl<T> CloneToUninit for T
where T: Clone,

Source§

unsafe fn clone_to_uninit(&self, dest: *mut u8)

🔬This is a nightly-only experimental API. (clone_to_uninit)
Performs copy-assignment from self to dest. Read more
Source§

impl<T> From<T> for T

Source§

fn from(t: T) -> T

Returns the argument unchanged.

Source§

impl<T, U> Into<U> for T
where U: From<T>,

Source§

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

Source§

impl<T> IntoEither for T

Source§

fn into_either(self, into_left: bool) -> Either<Self, Self>

Converts self into a Left variant of Either<Self, Self> if into_left is true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more
Source§

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more
Source§

impl<T> Pointable for T

Source§

const ALIGN: usize

The alignment of pointer.
Source§

type Init = T

The type for initializers.
Source§

unsafe fn init(init: <T as Pointable>::Init) -> usize

Initializes a with the given initializer. Read more
Source§

unsafe fn deref<'a>(ptr: usize) -> &'a T

Dereferences the given pointer. Read more
Source§

unsafe fn deref_mut<'a>(ptr: usize) -> &'a mut T

Mutably dereferences the given pointer. Read more
Source§

unsafe fn drop(ptr: usize)

Drops the object pointed to by the given pointer. Read more
Source§

impl<T> ToOwned for T
where T: Clone,

Source§

type Owned = T

The resulting type after obtaining ownership.
Source§

fn to_owned(&self) -> T

Creates owned data from borrowed data, usually by cloning. Read more
Source§

fn clone_into(&self, target: &mut T)

Uses borrowed data to replace owned data, usually by cloning. Read more
Source§

impl<T, U> TryFrom<U> for T
where U: Into<T>,

Source§

type Error = Infallible

The type returned in the event of a conversion error.
Source§

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.
Source§

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

Source§

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.
Source§

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.