impl TfidfVectorizer {
    /// Creates a TF-IDF vectorizer with default settings
    /// (plain term frequencies, default `CountVectorizer` configuration).
    #[must_use]
    pub fn new() -> Self {
        Self {
            count_vectorizer: CountVectorizer::new(),
            idf_values: Vec::new(),
            sublinear_tf: false,
        }
    }

    /// Enables sublinear term-frequency scaling: `tf -> 1 + ln(tf)` for `tf > 0`.
    #[must_use]
    pub fn with_sublinear_tf(mut self, enable: bool) -> Self {
        self.sublinear_tf = enable;
        self
    }

    /// Forwards the n-gram range `(min_n, max_n)` to the underlying count vectorizer.
    #[must_use]
    pub fn with_ngram_range(mut self, min_n: usize, max_n: usize) -> Self {
        self.count_vectorizer = self.count_vectorizer.with_ngram_range(min_n, max_n);
        self
    }

    /// Forwards the minimum document frequency to the underlying count vectorizer.
    #[must_use]
    pub fn with_min_df(mut self, min_df: usize) -> Self {
        self.count_vectorizer = self.count_vectorizer.with_min_df(min_df);
        self
    }

    /// Forwards the maximum document frequency (a fraction) to the underlying
    /// count vectorizer.
    #[must_use]
    pub fn with_max_df(mut self, max_df: f32) -> Self {
        self.count_vectorizer = self.count_vectorizer.with_max_df(max_df);
        self
    }

    /// Enables the built-in English stop-word list of the underlying count vectorizer.
    #[must_use]
    pub fn with_stop_words_english(mut self) -> Self {
        self.count_vectorizer = self.count_vectorizer.with_stop_words_english();
        self
    }

    /// Installs a custom stop-word list on the underlying count vectorizer.
    #[must_use]
    pub fn with_custom_stop_words(mut self, words: &[&str]) -> Self {
        self.count_vectorizer = self.count_vectorizer.with_stop_words(words);
        self
    }

    /// Enables or disables accent stripping in the underlying count vectorizer.
    #[must_use]
    pub fn with_strip_accents(mut self, enable: bool) -> Self {
        self.count_vectorizer = self.count_vectorizer.with_strip_accents(enable);
        self
    }

    /// Replaces the tokenizer used by the underlying count vectorizer.
    #[must_use]
    pub fn with_tokenizer(mut self, tokenizer: Box<dyn Tokenizer>) -> Self {
        self.count_vectorizer = self.count_vectorizer.with_tokenizer(tokenizer);
        self
    }

    /// Enables or disables lowercasing in the underlying count vectorizer.
    #[must_use]
    pub fn with_lowercase(mut self, lowercase: bool) -> Self {
        self.count_vectorizer = self.count_vectorizer.with_lowercase(lowercase);
        self
    }

    /// Limits the vocabulary of the underlying count vectorizer to `max_features` terms.
    #[must_use]
    pub fn with_max_features(mut self, max_features: usize) -> Self {
        self.count_vectorizer = self.count_vectorizer.with_max_features(max_features);
        self
    }

    /// Fits on `documents` and returns their TF-IDF matrix.
    ///
    /// Unlike calling [`Self::fit`] followed by [`Self::transform`], the
    /// document-term count matrix is computed only once.
    ///
    /// # Errors
    ///
    /// Returns an error if the underlying count vectorizer fails, or if no
    /// IDF values could be learned (empty vocabulary) — matching the error
    /// `fit` + `transform` would produce.
    pub fn fit_transform<S: AsRef<str>>(
        &mut self,
        documents: &[S],
    ) -> Result<Matrix<f64>, AprenderError> {
        self.count_vectorizer.fit(documents)?;
        let counts = self.count_vectorizer.transform(documents)?;
        self.compute_idf(&counts, documents.len());
        // Preserve the fit()+transform() contract: an empty vocabulary yields
        // the same "IDF not computed" error that transform() reports.
        if self.idf_values.is_empty() {
            return Err(AprenderError::Other(
                "IDF not computed. Call fit() first".to_string(),
            ));
        }
        self.apply_tfidf(&counts)
    }

    /// Learns the vocabulary and smoothed IDF weights from `documents`.
    ///
    /// IDF uses the smoothed formula `ln((n + 1) / (df + 1)) + 1`, which
    /// avoids division by zero and keeps every weight strictly positive.
    ///
    /// # Errors
    ///
    /// Returns an error if the underlying count vectorizer fails.
    pub fn fit<S: AsRef<str>>(&mut self, documents: &[S]) -> Result<(), AprenderError> {
        self.count_vectorizer.fit(documents)?;
        let counts = self.count_vectorizer.transform(documents)?;
        self.compute_idf(&counts, documents.len());
        Ok(())
    }

    /// Recomputes `self.idf_values` from a document-term count matrix
    /// (`counts` has one row per document, one column per vocabulary term).
    fn compute_idf(&mut self, counts: &Matrix<f64>, n_documents: usize) {
        let n_docs = n_documents as f64;
        let vocab_size = self.count_vectorizer.vocabulary_size();
        self.idf_values = (0..vocab_size)
            .map(|col| {
                // Document frequency: how many documents contain this term.
                let df = (0..counts.n_rows())
                    .filter(|&row| counts.get(row, col) > 0.0)
                    .count() as f64;
                ((n_docs + 1.0) / (df + 1.0)).ln() + 1.0
            })
            .collect();
    }

    /// Transforms `documents` into a TF-IDF matrix using the fitted IDF weights.
    ///
    /// # Errors
    ///
    /// Returns an error if [`Self::fit`] has not been called (no IDF values),
    /// or if the underlying count vectorizer fails.
    pub fn transform<S: AsRef<str>>(&self, documents: &[S]) -> Result<Matrix<f64>, AprenderError> {
        if self.idf_values.is_empty() {
            return Err(AprenderError::Other(
                "IDF not computed. Call fit() first".to_string(),
            ));
        }
        let tf_matrix = self.count_vectorizer.transform(documents)?;
        self.apply_tfidf(&tf_matrix)
    }

    /// Multiplies (optionally sublinear-scaled) term frequencies by the
    /// learned IDF weights, emitting values in row-major order.
    fn apply_tfidf(&self, tf_matrix: &Matrix<f64>) -> Result<Matrix<f64>, AprenderError> {
        let n_docs = tf_matrix.n_rows();
        let vocab_size = tf_matrix.n_cols();
        let mut tfidf_data = Vec::with_capacity(n_docs * vocab_size);
        for row in 0..n_docs {
            for col in 0..vocab_size {
                let raw_tf = tf_matrix.get(row, col);
                let tf = if self.sublinear_tf && raw_tf > 0.0 {
                    1.0 + raw_tf.ln()
                } else {
                    raw_tf
                };
                tfidf_data.push(tf * self.idf_values[col]);
            }
        }
        Matrix::from_vec(n_docs, vocab_size, tfidf_data)
            .map_err(|e: &str| AprenderError::Other(e.to_string()))
    }

    /// Returns the learned term → column-index vocabulary.
    #[must_use]
    pub fn vocabulary(&self) -> &HashMap<String, usize> {
        self.count_vectorizer.vocabulary()
    }

    /// Returns the number of terms in the learned vocabulary.
    #[must_use]
    pub fn vocabulary_size(&self) -> usize {
        self.count_vectorizer.vocabulary_size()
    }

    /// Returns the learned IDF weights, one per vocabulary column
    /// (empty until [`Self::fit`] has been called).
    #[must_use]
    pub fn idf_values(&self) -> &[f64] {
        &self.idf_values
    }
}
impl Default for TfidfVectorizer {
fn default() -> Self {
Self::new()
}
}
/// Replaces common precomposed Latin accented characters with their
/// unaccented ASCII counterparts (e.g. `"café"` → `"cafe"`).
///
/// Only the precomposed Latin-1-style characters listed below are handled;
/// everything else (combining diacritical marks, 'ø', 'ł', non-Latin
/// scripts, …) passes through unchanged.
fn strip_accents_unicode(text: &str) -> String {
    text.chars()
        .map(|c| {
            match c {
                'á' | 'à' | 'â' | 'ä' | 'ã' | 'å' => 'a',
                'é' | 'è' | 'ê' | 'ë' => 'e',
                'í' | 'ì' | 'î' | 'ï' => 'i',
                'ó' | 'ò' | 'ô' | 'ö' | 'õ' => 'o',
                'ú' | 'ù' | 'û' | 'ü' => 'u',
                'ý' | 'ÿ' => 'y',
                'ñ' => 'n',
                'ç' => 'c',
                'Á' | 'À' | 'Â' | 'Ä' | 'Ã' | 'Å' => 'A',
                'É' | 'È' | 'Ê' | 'Ë' => 'E',
                'Í' | 'Ì' | 'Î' | 'Ï' => 'I',
                'Ó' | 'Ò' | 'Ô' | 'Ö' | 'Õ' => 'O',
                'Ú' | 'Ù' | 'Û' | 'Ü' => 'U',
                // 'Ÿ' (U+0178) added: the uppercase counterpart of 'ÿ' was
                // missing while every other mapped letter covered both cases.
                'Ý' | 'Ÿ' => 'Y',
                'Ñ' => 'N',
                'Ç' => 'C',
                _ => c,
            }
        })
        .collect()
}
/// Text vectorizer that maps tokens to a fixed number of feature columns
/// via hashing rather than a learned vocabulary.
///
/// NOTE(review): field semantics are inferred from names and from the
/// parallel configuration on `CountVectorizer`; confirm against the
/// `impl HashingVectorizer` block (not visible in this chunk).
#[allow(missing_debug_implementations)]
pub struct HashingVectorizer {
    /// Custom tokenizer; `None` presumably selects a default tokenizer.
    tokenizer: Option<Box<dyn Tokenizer>>,
    /// Number of output feature columns (hash buckets).
    n_features: usize,
    /// Whether to lowercase text before tokenization — assumed; verify in impl.
    lowercase: bool,
    /// `(min_n, max_n)` n-gram sizes, mirroring `with_ngram_range` elsewhere.
    ngram_range: (usize, usize),
    /// Optional stop-word filter applied to tokens.
    stop_words: Option<StopWordsFilter>,
}