TFIDFVectorizer

Struct TFIDFVectorizer 

Source
pub struct TFIDFVectorizer<N = f32, K = String, E = DefaultTFIDFEngine>
where N: Num + Copy, E: TFIDFEngine<N>,
{ pub documents: Vec<TFVector<N, K>>, pub token_dim_sample: IndexSet<String, RandomState>, pub corpus_ref: Arc<Corpus>, pub idf: IDFVector<N>, /* private fields */ }

Fields§

§documents: Vec<TFVector<N, K>>

TF vectors of the documents (one per document)

§token_dim_sample: IndexSet<String, RandomState>

TF Vector’s token dimension sample

§corpus_ref: Arc<Corpus>

Corpus reference

§idf: IDFVector<N>

IDF Vector

Implementations§

Source§

impl<N, K, E> TFIDFVectorizer<N, K, E>
where K: Clone, N: Num + Copy + Into<f64> + DeNormalizer, E: TFIDFEngine<N>,

Source

pub fn similarity( &mut self, freq: &TokenFrequency, algorithm: &SimilarityAlgorithm, ) -> Hits<K>

Calculate similarity scores based on the query token frequency. Uses the specified similarity algorithm. Calls update_idf() to use the latest IDF vector.

Examples found in repository?
examples/basic.rs (line 24)
5fn main() {
6    // build corpus
7    let corpus = Arc::new(Corpus::new());
8
9    // add documents
10    let mut freq1 = TokenFrequency::new();
11    freq1.add_tokens(&["rust", "高速", "並列", "rust"]);
12    let mut freq2 = TokenFrequency::new();
13    freq2.add_tokens(&["rust", "柔軟", "安全", "rust"]);
14
15    // build query
16    let mut vectorizer: TFIDFVectorizer<u16> = TFIDFVectorizer::new(corpus);    
17    vectorizer.add_doc("doc1".to_string(), &freq1);
18    vectorizer.add_doc("doc2".to_string(), &freq2);
19
20    // similarity search
21    let mut query_tokens = TokenFrequency::new();
22    query_tokens.add_tokens(&["rust", "高速"]);
23    let algorithm = SimilarityAlgorithm::CosineSimilarity;
24    let mut result = vectorizer.similarity(&query_tokens, &algorithm);
25    result.sort_by_score();
26
27    // print result
28    result.list.iter().for_each(|(k, s, l)| {
29        println!("doc: {}, score: {}, length: {}", k, s, l);
30    });
31    // debug
32    println!("result count: {}", result.list.len());
33    println!("{:?}", vectorizer);
34}
Source

pub fn similarity_uncheck_idf( &self, freq: &TokenFrequency, algorithm: &SimilarityAlgorithm, ) -> Hits<K>

Calculate similarity scores based on the query token frequency. Uses the specified similarity algorithm. Does not check the IDF vector (can be called with an immutable reference). Call update_idf() manually if needed.

Source§

impl<N, K, E> TFIDFVectorizer<N, K, E>
where N: Num + Copy, E: TFIDFEngine<N>,

Source

pub fn new(corpus_ref: Arc<Corpus>) -> Self

Create a new TFIDFVectorizer instance

Examples found in repository?
examples/basic.rs (line 16)
5fn main() {
6    // build corpus
7    let corpus = Arc::new(Corpus::new());
8
9    // add documents
10    let mut freq1 = TokenFrequency::new();
11    freq1.add_tokens(&["rust", "高速", "並列", "rust"]);
12    let mut freq2 = TokenFrequency::new();
13    freq2.add_tokens(&["rust", "柔軟", "安全", "rust"]);
14
15    // build query
16    let mut vectorizer: TFIDFVectorizer<u16> = TFIDFVectorizer::new(corpus);    
17    vectorizer.add_doc("doc1".to_string(), &freq1);
18    vectorizer.add_doc("doc2".to_string(), &freq2);
19
20    // similarity search
21    let mut query_tokens = TokenFrequency::new();
22    query_tokens.add_tokens(&["rust", "高速"]);
23    let algorithm = SimilarityAlgorithm::CosineSimilarity;
24    let mut result = vectorizer.similarity(&query_tokens, &algorithm);
25    result.sort_by_score();
26
27    // print result
28    result.list.iter().for_each(|(k, s, l)| {
29        println!("doc: {}, score: {}, length: {}", k, s, l);
30    });
31    // debug
32    println!("result count: {}", result.list.len());
33    println!("{:?}", vectorizer);
34}
Source

pub fn set_corpus_ref(&mut self, corpus_ref: Arc<Corpus>)

Set the corpus reference and recalculate the IDF.

Source

pub fn update_idf(&mut self)

Recalculate the IDF if the Corpus has changed.

Source§

impl<N, K, E> TFIDFVectorizer<N, K, E>
where N: Num + Copy, E: TFIDFEngine<N>,

Source

pub fn add_doc(&mut self, doc_id: K, doc: &TokenFrequency)

Add a document. The immediately referenced Corpus is also updated.

Examples found in repository?
examples/basic.rs (line 17)
5fn main() {
6    // build corpus
7    let corpus = Arc::new(Corpus::new());
8
9    // add documents
10    let mut freq1 = TokenFrequency::new();
11    freq1.add_tokens(&["rust", "高速", "並列", "rust"]);
12    let mut freq2 = TokenFrequency::new();
13    freq2.add_tokens(&["rust", "柔軟", "安全", "rust"]);
14
15    // build query
16    let mut vectorizer: TFIDFVectorizer<u16> = TFIDFVectorizer::new(corpus);    
17    vectorizer.add_doc("doc1".to_string(), &freq1);
18    vectorizer.add_doc("doc2".to_string(), &freq2);
19
20    // similarity search
21    let mut query_tokens = TokenFrequency::new();
22    query_tokens.add_tokens(&["rust", "高速"]);
23    let algorithm = SimilarityAlgorithm::CosineSimilarity;
24    let mut result = vectorizer.similarity(&query_tokens, &algorithm);
25    result.sort_by_score();
26
27    // print result
28    result.list.iter().for_each(|(k, s, l)| {
29        println!("doc: {}, score: {}, length: {}", k, s, l);
30    });
31    // debug
32    println!("result count: {}", result.list.len());
33    println!("{:?}", vectorizer);
34}
Source

pub fn get_tf(&self, key: &K) -> Option<&TFVector<N, K>>
where K: PartialEq,

Get TFVector by document ID

Source

pub fn get_tf_into_token_freq(&self, key: &K) -> Option<TokenFrequency>
where K: PartialEq, N: Into<f64>,

Get a TokenFrequency by document ID. If the vector is quantized, there may be some error. Words not included in the corpus are ignored.

Source

pub fn contains_doc(&self, key: &K) -> bool
where K: PartialEq,

Check if a document with the given ID exists

Source

pub fn contains_token(&self, token: &str) -> bool

Check if the token exists in the token dimension sample

Source

pub fn contains_tokens_from_freq(&self, freq: &TokenFrequency) -> bool

Check if all tokens in the given TokenFrequency exist in the token dimension sample

Trait Implementations§

Source§

impl<N, K: Debug, E> Debug for TFIDFVectorizer<N, K, E>
where N: Num + Copy + Debug, E: TFIDFEngine<N> + Debug,

Source§

fn fmt(&self, f: &mut Formatter<'_>) -> Result

Formats the value using the given formatter. Read more
Source§

impl<N, K, E> Serialize for TFIDFVectorizer<N, K, E>
where N: Num + Copy + Serialize, K: Serialize, E: TFIDFEngine<N>,

Source§

fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where S: Serializer,

Serialize TFIDFVectorizer. This struct contains references, so they are excluded from serialization. Use TFIDFData for deserialization.

Auto Trait Implementations§

§

impl<N, K, E> Freeze for TFIDFVectorizer<N, K, E>

§

impl<N, K, E> !RefUnwindSafe for TFIDFVectorizer<N, K, E>

§

impl<N, K, E> Send for TFIDFVectorizer<N, K, E>
where E: Send, K: Send, N: Send,

§

impl<N, K, E> Sync for TFIDFVectorizer<N, K, E>
where E: Sync, K: Sync, N: Sync,

§

impl<N, K, E> Unpin for TFIDFVectorizer<N, K, E>
where E: Unpin, K: Unpin, N: Unpin,

§

impl<N, K, E> !UnwindSafe for TFIDFVectorizer<N, K, E>

Blanket Implementations§

Source§

impl<T> Any for T
where T: 'static + ?Sized,

Source§

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more
Source§

impl<T> Borrow<T> for T
where T: ?Sized,

Source§

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more
Source§

impl<T> BorrowMut<T> for T
where T: ?Sized,

Source§

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more
Source§

impl<T> From<T> for T

Source§

fn from(t: T) -> T

Returns the argument unchanged.

Source§

impl<T, U> Into<U> for T
where U: From<T>,

Source§

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

Source§

impl<T, U> TryFrom<U> for T
where U: Into<T>,

Source§

type Error = Infallible

The type returned in the event of a conversion error.
Source§

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.
Source§

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

Source§

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.
Source§

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.