Struct rnltk::document::TfidfMatrix

source ·
pub struct TfidfMatrix { /* private fields */ }
Expand description

Struct for holding the resulting tfidf_matrix from DocumentTermFrequencies::get_tfidf_from_term_frequencies

Implementations§

source§

impl TfidfMatrix

source

pub fn get_tfidf_matrix(&self) -> &GenericMatrix

Gets the TF-IDF matrix that was created from DocumentTermFrequencies::get_tfidf_from_term_frequencies.

This ensures the user can’t instantiate their own instance of TfidfMatrix and must use the formatted, normalized matrix.

source

pub fn get_cosine_similarity_from_tfidf(&self) -> CosineSimilarityMatrix

Gets the cosine similarity matrix from the TfidfMatrix’s tfidf_matrix.

Normally, calculating the cosine similarity of two document vectors would look like \(\cos \theta = \frac{D_i \cdot D_j}{|D_i| |D_j|}\). Since the TF-IDF matrix returned from DocumentTermFrequencies::get_tfidf_from_term_frequencies is already normalized, this simplifies to \(\cos \theta = D_i \cdot D_j\).

The resulting matrix has 1’s along the diagonal since the similarity of a document with itself is 1. Each intersection of a row and a column, \(M_{i,j}\), is the cosine similarity value between \(D_i\) and \(D_j\).

Examples
use rnltk::document::DocumentTermFrequencies;
use rnltk::sample_data;
 
// Wrap the crate's bundled sample term frequencies in a DocumentTermFrequencies.
let document_term_frequencies: DocumentTermFrequencies = DocumentTermFrequencies::new(sample_data::get_term_frequencies());
// Convert the term frequencies into a TfidfMatrix.
let tfidf_matrix = document_term_frequencies.get_tfidf_from_term_frequencies();
// Compute the document-by-document cosine similarity matrix from the TF-IDF matrix.
let cosine_similarity_matrix = tfidf_matrix.get_cosine_similarity_from_tfidf();
Examples found in repository?
examples/document_similarity.rs (line 39)
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
/// Prints the plain and the LSA cosine similarity matrices for four sample sentences.
fn main() {
    // Each sentence is treated as its own document.
    let documents = vec![
        "It is a far, far better thing I do, than I have ever done",
        "Call me Ishmael",
        "Is this a dagger I see before me?",
        "O happy dagger",
    ];

    let token_config = token::TokenConfig {
        remove_stop_words: true,
        stem: true,
        stop_words: token::get_stop_words(),
    };

    let per_document_frequencies =
        token::get_term_frequencies_from_sentences_configurable(&documents, token_config);

    // Append every document's term-frequency values, in document order, to one flat buffer.
    let mut flattened_frequencies: Vec<f64> = vec![];
    for frequencies in per_document_frequencies.iter() {
        flattened_frequencies.extend(frequencies.values());
    }

    // One column per document; the row count is taken from the first document's values.
    let ncols = documents.len();
    let nrows = per_document_frequencies[0].values().len();

    let frequency_matrix = DMatrix::from_vec(nrows, ncols, flattened_frequencies);

    let term_frequencies = document::DocumentTermFrequencies::new(frequency_matrix);
    let tfidf_matrix = term_frequencies.get_tfidf_from_term_frequencies();

    // Plain cosine similarity between every pair of documents.
    let cosine = tfidf_matrix.get_cosine_similarity_from_tfidf();
    let cosine_matrix = cosine.get_cosine_similarity_matrix();

    println!("COSINE SIMILARITY MATRIX");
    for row in 0..ncols {
        println!(
            "Document {}          {:.2}          {:.2}          {:.2}          {:.2}",
            row + 1,
            &cosine_matrix[(row, 0)],
            &cosine_matrix[(row, 1)],
            &cosine_matrix[(row, 2)],
            &cosine_matrix[(row, 3)]
        );
    }
    println!("              Document 1    Document 2    Document 3    Document 4");

    println!("\n-----------------------------\n");

    // LSA cosine similarity, requested with k = 2.
    let lsa_cosine = tfidf_matrix.get_lsa_cosine_similarity_from_tfidf(2).unwrap();
    let lsa_cosine_matrix = lsa_cosine.get_lsa_cosine_similarity_matrix();

    println!("LSA COSINE SIMILARITY MATRIX");
    for row in 0..ncols {
        println!(
            "Document {}          {:.2}          {:.2}          {:.2}          {:.2}",
            row + 1,
            &lsa_cosine_matrix[(row, 0)],
            &lsa_cosine_matrix[(row, 1)],
            &lsa_cosine_matrix[(row, 2)],
            &lsa_cosine_matrix[(row, 3)]
        );
    }
    println!("              Document 1    Document 2    Document 3    Document 4");
}
source

pub fn get_lsa_cosine_similarity_from_tfidf( &self, k: usize ) -> Result<LsaCosineSimilarityMatrix, RnltkError>

Gets the Latent Semantic Analysis (LSA) cosine similarity matrix from the TfidfMatrix’s tfidf_matrix.

Singular Value Decomposition (SVD) is applied to the \(m \times n\) tfidf_matrix to reduce dimensionality. The k largest singular values are chosen to produce a reduced \({V_k}^T\) matrix, with \(1 \le k \le n\). Each document column in the \({V_k}^T\) matrix is normalized, and then the dot product of each pair of columns is computed. To shift the resulting dot product from a range of [-1…1] to [0…1], we add 1 to the dot product and then divide by 2 (\(\frac{1 + \cos(\theta)}{2}\)).

The resulting matrix has 1’s along the diagonal since the similarity of a document with itself is 1. Each intersection of a row and a column, \(M_{i,j}\), is the cosine similarity value between \(D_i\) and \(D_j\).

Examples
use rnltk::document::DocumentTermFrequencies;
use rnltk::sample_data;
 
// Wrap the crate's bundled sample term frequencies in a DocumentTermFrequencies.
let document_term_frequencies: DocumentTermFrequencies = DocumentTermFrequencies::new(sample_data::get_term_frequencies());
// Convert the term frequencies into a TfidfMatrix.
let tfidf_matrix = document_term_frequencies.get_tfidf_from_term_frequencies();
// Request the LSA cosine similarity matrix with k = 2; unwrap panics if an Err is returned.
let lsa_cosine_similarity_matrix = tfidf_matrix.get_lsa_cosine_similarity_from_tfidf(2).unwrap();
Examples found in repository?
examples/document_similarity.rs (line 57)
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
/// Prints the plain and the LSA cosine similarity matrices for four sample sentences.
fn main() {
    // Each sentence is treated as its own document.
    let documents = vec![
        "It is a far, far better thing I do, than I have ever done",
        "Call me Ishmael",
        "Is this a dagger I see before me?",
        "O happy dagger",
    ];

    let token_config = token::TokenConfig {
        remove_stop_words: true,
        stem: true,
        stop_words: token::get_stop_words(),
    };

    let per_document_frequencies =
        token::get_term_frequencies_from_sentences_configurable(&documents, token_config);

    // Append every document's term-frequency values, in document order, to one flat buffer.
    let mut flattened_frequencies: Vec<f64> = vec![];
    for frequencies in per_document_frequencies.iter() {
        flattened_frequencies.extend(frequencies.values());
    }

    // One column per document; the row count is taken from the first document's values.
    let ncols = documents.len();
    let nrows = per_document_frequencies[0].values().len();

    let frequency_matrix = DMatrix::from_vec(nrows, ncols, flattened_frequencies);

    let term_frequencies = document::DocumentTermFrequencies::new(frequency_matrix);
    let tfidf_matrix = term_frequencies.get_tfidf_from_term_frequencies();

    // Plain cosine similarity between every pair of documents.
    let cosine = tfidf_matrix.get_cosine_similarity_from_tfidf();
    let cosine_matrix = cosine.get_cosine_similarity_matrix();

    println!("COSINE SIMILARITY MATRIX");
    for row in 0..ncols {
        println!(
            "Document {}          {:.2}          {:.2}          {:.2}          {:.2}",
            row + 1,
            &cosine_matrix[(row, 0)],
            &cosine_matrix[(row, 1)],
            &cosine_matrix[(row, 2)],
            &cosine_matrix[(row, 3)]
        );
    }
    println!("              Document 1    Document 2    Document 3    Document 4");

    println!("\n-----------------------------\n");

    // LSA cosine similarity, requested with k = 2.
    let lsa_cosine = tfidf_matrix.get_lsa_cosine_similarity_from_tfidf(2).unwrap();
    let lsa_cosine_matrix = lsa_cosine.get_lsa_cosine_similarity_matrix();

    println!("LSA COSINE SIMILARITY MATRIX");
    for row in 0..ncols {
        println!(
            "Document {}          {:.2}          {:.2}          {:.2}          {:.2}",
            row + 1,
            &lsa_cosine_matrix[(row, 0)],
            &lsa_cosine_matrix[(row, 1)],
            &lsa_cosine_matrix[(row, 2)],
            &lsa_cosine_matrix[(row, 3)]
        );
    }
    println!("              Document 1    Document 2    Document 3    Document 4");
}

Trait Implementations§

source§

impl Clone for TfidfMatrix

source§

fn clone(&self) -> TfidfMatrix

Returns a copy of the value. Read more
1.0.0 · source§

fn clone_from(&mut self, source: &Self)

Performs copy-assignment from source. Read more
source§

impl Debug for TfidfMatrix

source§

fn fmt(&self, f: &mut Formatter<'_>) -> Result

Formats the value using the given formatter. Read more

Auto Trait Implementations§

Blanket Implementations§

source§

impl<T> Any for T where T: 'static + ?Sized,

source§

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more
source§

impl<T> Borrow<T> for T where T: ?Sized,

source§

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more
source§

impl<T> BorrowMut<T> for T where T: ?Sized,

source§

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more
source§

impl<T> From<T> for T

source§

fn from(t: T) -> T

Returns the argument unchanged.

source§

impl<T, U> Into<U> for T where U: From<T>,

source§

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

source§

impl<T> Same<T> for T

§

type Output = T

Should always be Self
§

impl<SS, SP> SupersetOf<SS> for SP where SS: SubsetOf<SP>,

§

fn to_subset(&self) -> Option<SS>

The inverse inclusion map: attempts to construct self from the equivalent element of its superset. Read more
§

fn is_in_subset(&self) -> bool

Checks if self is actually part of its subset T (and can be converted to it).
§

fn to_subset_unchecked(&self) -> SS

Use with care! Same as self.to_subset but without any property checks. Always succeeds.
§

fn from_subset(element: &SS) -> SP

The inclusion map: converts self to the equivalent element of its superset.
source§

impl<T> ToOwned for T where T: Clone,

§

type Owned = T

The resulting type after obtaining ownership.
source§

fn to_owned(&self) -> T

Creates owned data from borrowed data, usually by cloning. Read more
source§

fn clone_into(&self, target: &mut T)

Uses borrowed data to replace owned data, usually by cloning. Read more
source§

impl<T, U> TryFrom<U> for T where U: Into<T>,

§

type Error = Infallible

The type returned in the event of a conversion error.
source§

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.
source§

impl<T, U> TryInto<U> for T where U: TryFrom<T>,

§

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.
source§

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.