pub fn get_term_frequencies_from_sentences_configurable(
sentences: &[&str],
config: TokenConfig
) -> Vec<BTreeMap<String, f64>>
Expand description
Gets a per-sentence count of all words from a slice of sentences,
tokenized and counted according to the given configuration.
This function will be deprecated in the future once rnltk
hits version 1.0, and its functionality will be moved to
get_term_frequencies_from_word_vector.
Examples
use std::collections::BTreeMap;
use rnltk::token;
// Default config — per the expected maps below, it stems tokens
// ("leads" -> "lead", "hatred" -> "hatr", "suffering." -> "suffer").
let token_config = token::TokenConfig::default();
let sentences = vec!["fear leads to anger", "anger leads to hatred", "hatred leads to conflict", "conflict leads to suffering."];
// One map per sentence; every map covers the full corpus vocabulary,
// with 0.0 for stems absent from that particular sentence.
let word_counts1 = BTreeMap::from([
("fear".to_string(), 1.), ("lead".to_string(), 1.), ("anger".to_string(), 1.), ("hatr".to_string(), 0.), ("conflict".to_string(), 0.), ("suffer".to_string(), 0.)
]);
let word_counts2 = BTreeMap::from([
("fear".to_string(), 0.), ("lead".to_string(), 1.), ("anger".to_string(), 1.), ("hatr".to_string(), 1.), ("conflict".to_string(), 0.), ("suffer".to_string(), 0.)
]);
let word_counts3 = BTreeMap::from([
("fear".to_string(), 0.), ("lead".to_string(), 1.), ("anger".to_string(), 0.), ("hatr".to_string(), 1.), ("conflict".to_string(),1.), ("suffer".to_string(), 0.)
]);
let word_counts4 = BTreeMap::from([
("fear".to_string(), 0.), ("lead".to_string(), 1.), ("anger".to_string(), 0.), ("hatr".to_string(), 0.), ("conflict".to_string(), 1.), ("suffer".to_string(), 1.)
]);
// Result order matches the input sentence order.
let term_frequencies = token::get_term_frequencies_from_sentences_configurable(&sentences, token_config);
assert_eq!(vec![word_counts1, word_counts2, word_counts3, word_counts4], term_frequencies);
Examples found in repository?
examples/document_similarity.rs (line 23)
(example body spans lines 7–72 of examples/document_similarity.rs)
/// Demonstrates document-similarity scoring: builds a term-frequency
/// matrix for four short documents, converts it to TF-IDF, then prints
/// both the plain cosine-similarity matrix and the 2-dimensional
/// LSA cosine-similarity matrix.
fn main() {
    // Corpus of four short documents to compare.
    let documents = vec![
        "It is a far, far better thing I do, than I have ever done",
        "Call me Ishmael",
        "Is this a dagger I see before me?",
        "O happy dagger",
    ];

    // Tokenize with stop-word removal and stemming enabled.
    let token_config = token::TokenConfig {
        remove_stop_words: true,
        stem: true,
        stop_words: token::get_stop_words(),
    };
    let documents_term_frequencies =
        token::get_term_frequencies_from_sentences_configurable(&documents, token_config);

    // Flatten each document's term-frequency values into a single buffer.
    // Values are appended document-by-document, which matches the
    // column-major layout DMatrix::from_vec expects below.
    let all_term_frequencies: Vec<f64> = documents_term_frequencies
        .iter()
        .flat_map(|term_frequencies| term_frequencies.values().copied())
        .collect();

    // One row per vocabulary term, one column per document.
    let nrows = documents_term_frequencies[0].len();
    let ncols = documents.len();
    let document_term_frequencies = DMatrix::from_vec(nrows, ncols, all_term_frequencies);
    let document_term_frequency_matrix =
        document::DocumentTermFrequencies::new(document_term_frequencies);
    let tfidf_matrix = document_term_frequency_matrix.get_tfidf_from_term_frequencies();

    // Plain TF-IDF cosine similarity.
    let cosine_similarity = tfidf_matrix.get_cosine_similarity_from_tfidf();
    let cosine_similarity_matrix = cosine_similarity.get_cosine_similarity_matrix();
    println!("COSINE SIMILARITY MATRIX");
    for doc_index in 0..ncols {
        println!(
            "Document {} {:.2} {:.2} {:.2} {:.2}",
            doc_index + 1,
            &cosine_similarity_matrix[(doc_index, 0)],
            &cosine_similarity_matrix[(doc_index, 1)],
            &cosine_similarity_matrix[(doc_index, 2)],
            &cosine_similarity_matrix[(doc_index, 3)]
        )
    }
    // Column header is printed after the rows, as in the original example.
    println!(" Document 1 Document 2 Document 3 Document 4");
    println!("\n-----------------------------\n");

    // LSA (rank-2 SVD truncation) cosine similarity.
    let lsa_cosine_similarity = tfidf_matrix.get_lsa_cosine_similarity_from_tfidf(2).unwrap();
    let lsa_cosine_similarity_matrix = lsa_cosine_similarity.get_lsa_cosine_similarity_matrix();
    println!("LSA COSINE SIMILARITY MATRIX");
    for doc_index in 0..ncols {
        println!(
            "Document {} {:.2} {:.2} {:.2} {:.2}",
            doc_index + 1,
            &lsa_cosine_similarity_matrix[(doc_index, 0)],
            &lsa_cosine_similarity_matrix[(doc_index, 1)],
            &lsa_cosine_similarity_matrix[(doc_index, 2)],
            &lsa_cosine_similarity_matrix[(doc_index, 3)]
        )
    }
    println!(" Document 1 Document 2 Document 3 Document 4");
}