pub mod corpus;
pub mod tfidf;
pub mod term;
pub mod serde;
pub mod evaluate;
use std::cmp::Ordering;
use std::sync::Arc;
use std::hash::Hash;
use half::f16;
use num_traits::Num;
use crate::utils::datastruct::map::index_map::{EntryMut, InsertResult, RemoveResult};
use crate::utils::datastruct::vector::{TFVector, TFVectorTrait, IDFVector};
use crate::{DefaultTFIDFEngine, TFIDFEngine, TermFrequency};
use crate::utils::datastruct::map::IndexMap;
use crate::Corpus;
/// A TF-IDF vectorizer that stores one sparse term-frequency vector per
/// document, a term -> document reverse index, and a cached IDF vector
/// derived from a shared [`Corpus`].
///
/// Type parameters:
/// * `N` — numeric type of the stored TF weights (defaults to `f16`).
/// * `K` — document key type (defaults to `String`).
/// * `E` — engine providing the TF/IDF math (defaults to [`DefaultTFIDFEngine`]).
#[derive(Debug, Clone)]
pub struct TFIDFVectorizer<N = f16, K = String, E = DefaultTFIDFEngine>
where
N: Num + Copy + Into<f64> + Send + Sync,
E: TFIDFEngine<N> + Send + Sync,
K: Clone + Send + Sync + Eq + std::hash::Hash,
{
/// Document key -> sparse TF vector; each entry's position in the map is the
/// document's dense index used throughout the reverse index.
pub documents: IndexMap<K, TFVector<N>>,
/// Term -> indices (into `documents`) of the documents containing that term.
/// A term's position in this map is its dimension index in every `TFVector`.
pub term_dim_rev_index: IndexMap<Box<str>, Vec<u32>>,
/// Shared corpus holding the document-frequency statistics used for IDF.
pub corpus_ref: Arc<Corpus>,
/// Cached IDF values; refreshed when the corpus generation number changes
/// (see `update_idf` / `re_calc_idf`).
pub idf_cache: IDFVector,
/// Zero-sized marker binding the engine type `E` to this instance.
_marker: std::marker::PhantomData<E>,
}
impl<N, K, E> TFIDFVectorizer<N, K, E>
where
    N: Num + Copy + Into<f64> + Send + Sync,
    E: TFIDFEngine<N> + Send + Sync,
    K: Clone + Send + Sync + Eq + Hash,
{
    /// Builds an empty vectorizer bound to `corpus_ref` and primes the IDF cache.
    pub fn new(corpus_ref: Arc<Corpus>) -> Self {
        let mut vectorizer = Self {
            documents: IndexMap::new(),
            term_dim_rev_index: IndexMap::new(),
            corpus_ref,
            idf_cache: IDFVector::new(),
            _marker: std::marker::PhantomData,
        };
        vectorizer.re_calc_idf();
        vectorizer
    }
    /// Swaps in a different corpus and immediately rebuilds the IDF cache.
    pub fn set_corpus_ref(&mut self, corpus_ref: Arc<Corpus>) {
        self.corpus_ref = corpus_ref;
        self.re_calc_idf();
    }
    /// Recomputes the IDF cache, but only when the corpus generation number
    /// has moved since the last recalculation.
    pub fn update_idf(&mut self) {
        let current_gen = self.corpus_ref.get_gen_num();
        if current_gen == self.idf_cache.latest_entropy {
            return;
        }
        self.re_calc_idf();
    }
    /// Unconditionally refreshes the cached IDF state from the corpus.
    fn re_calc_idf(&mut self) {
        self.idf_cache.latest_entropy = self.corpus_ref.get_gen_num();
        self.idf_cache.doc_num = self.corpus_ref.get_doc_num();
        self.idf_cache.idf_vec = E::idf_vec(&self.corpus_ref, self.term_dim_rev_index.keys());
    }
}
impl<N, K, E> TFIDFVectorizer<N, K, E>
where
    N: Num + Copy + Into<f64> + Send + Sync,
    E: TFIDFEngine<N> + Send + Sync,
    K: PartialEq + Clone + Send + Sync + Eq + Hash,
{
    /// Inserts (or replaces) the document stored under `key`.
    ///
    /// Registers previously unseen terms as new dimensions, converts `doc`
    /// into a sparse TF vector, updates the reverse index, and adjusts the
    /// corpus document-frequency counters with the terms this document
    /// gained/lost relative to any previous version.
    pub fn add_doc(&mut self, key: K, doc: &TermFrequency) {
        // Ensure every term of the incoming document owns a dimension slot.
        for tok in doc.term_set() {
            self.term_dim_rev_index
                .entry_mut(tok.into_boxed_str())
                .or_insert_with(Vec::new);
        }
        let tf_vec = E::tf_vec(doc, self.term_dim_rev_index.as_index_set());
        let (new_terms, old_terms) = self.add_tf_vec(key, tf_vec);
        if old_terms.is_empty() && new_terms.is_empty() {
            return;
        }
        if old_terms.is_empty() {
            // Fresh document: every one of its terms is an addition.
            let add_terms: Vec<&Box<str>> = new_terms
                .iter()
                .filter_map(|&idx| self.term_dim_rev_index.get_key_with_index(idx as usize))
                .collect();
            self.corpus_ref.add_set(&add_terms);
            return;
        }
        // Replacement: walk both sorted index lists to diff the term sets.
        let mut new_terms_iter = new_terms.into_iter().fuse();
        let mut old_terms_iter = old_terms.into_iter().fuse();
        let mut new_term_next = new_terms_iter.next();
        let mut old_term_next = old_terms_iter.next();
        let mut add_terms = Vec::new();
        let mut del_terms = Vec::new();
        while let (Some(new_idx), Some(old_idx)) = (new_term_next, old_term_next) {
            match new_idx.cmp(&old_idx) {
                Ordering::Less => {
                    // Term only in the new version -> corpus gains it.
                    let term = self.term_dim_rev_index.get_key_with_index(new_idx as usize).expect("unreachable");
                    add_terms.push(term);
                    new_term_next = new_terms_iter.next();
                }
                Ordering::Greater => {
                    // Term only in the old version -> corpus loses it.
                    let term = self.term_dim_rev_index.get_key_with_index(old_idx as usize).expect("unreachable");
                    del_terms.push(term);
                    old_term_next = old_terms_iter.next();
                }
                Ordering::Equal => {
                    // Present in both versions: corpus count unchanged.
                    new_term_next = new_terms_iter.next();
                    old_term_next = old_terms_iter.next();
                }
            }
        }
        // Drain whichever list still has a tail.
        while let Some(new_idx) = new_term_next {
            let term = self.term_dim_rev_index.get_key_with_index(new_idx as usize).expect("unreachable");
            add_terms.push(term);
            new_term_next = new_terms_iter.next();
        }
        while let Some(old_idx) = old_term_next {
            let term = self.term_dim_rev_index.get_key_with_index(old_idx as usize).expect("unreachable");
            del_terms.push(term);
            old_term_next = old_terms_iter.next();
        }
        self.corpus_ref.add_set(&add_terms);
        self.corpus_ref.sub_set(&del_terms);
    }
    /// Stores `tf_vec` under `key` and synchronizes the term -> document
    /// reverse index.
    ///
    /// Returns `(new_term_indices, old_term_indices)`: the sorted dimension
    /// indices of the inserted vector and of the vector it replaced (the
    /// latter empty when `key` was not present before). Corpus counters are
    /// NOT touched here — that is the caller's responsibility.
    fn add_tf_vec(&mut self, key: K, tf_vec: TFVector<N>) -> (Vec<u32>, Vec<u32>) {
        let new_tf_terms_ind: Vec<u32> = tf_vec.as_ind_slice().to_vec();
        match self.documents.insert(key, tf_vec) {
            InsertResult::New { index: id } => {
                // Brand-new document: link it under every term it contains.
                self.documents.get_with_index(id).expect("unreachable").as_ind_slice().iter().for_each(|&idx| {
                    self.term_dim_rev_index.get_with_index_mut(idx as usize).expect("unreachable").push(id as u32);
                });
                (new_tf_terms_ind, Vec::new())
            }
            InsertResult::Override { old_value: old_tf, old_key: _, index: id } => {
                // Merge-walk the two sorted index lists so the reverse index
                // is only touched where term membership actually changed.
                let mut old_it = old_tf.as_ind_slice().iter().fuse();
                let mut new_it = self.documents.get_with_index(id).expect("unreachable").as_ind_slice().iter().fuse();
                let mut old_next = old_it.next();
                let mut new_next = new_it.next();
                while let (Some(old_idx), Some(new_idx)) = (old_next, new_next) {
                    match old_idx.cmp(new_idx) {
                        Ordering::Equal => {
                            old_next = old_it.next();
                            new_next = new_it.next();
                        }
                        Ordering::Less => {
                            // Term dropped by the new version: unlink this doc.
                            let doc_keys = self.term_dim_rev_index.get_with_index_mut(*old_idx as usize).expect("unreachable");
                            if let Some(pos) = doc_keys.iter().position(|k| *k == id as u32) {
                                doc_keys.swap_remove(pos);
                            }
                            old_next = old_it.next();
                        }
                        Ordering::Greater => {
                            // Term introduced by the new version: link this doc.
                            self.term_dim_rev_index.get_with_index_mut(*new_idx as usize).expect("unreachable").push(id as u32);
                            new_next = new_it.next();
                        }
                    }
                }
                // BUG FIX: the original loop stopped as soon as either list
                // ran out, leaving stale links for trailing old-only terms and
                // missing links for trailing new-only terms (add_doc drains
                // its tails; this branch did not). Drain both tails here.
                while let Some(old_idx) = old_next {
                    let doc_keys = self.term_dim_rev_index.get_with_index_mut(*old_idx as usize).expect("unreachable");
                    if let Some(pos) = doc_keys.iter().position(|k| *k == id as u32) {
                        doc_keys.swap_remove(pos);
                    }
                    old_next = old_it.next();
                }
                while let Some(new_idx) = new_next {
                    self.term_dim_rev_index.get_with_index_mut(*new_idx as usize).expect("unreachable").push(id as u32);
                    new_next = new_it.next();
                }
                (new_tf_terms_ind, old_tf.as_ind_slice().to_vec())
            }
        }
    }
    /// Removes the document stored under `key`, if present.
    ///
    /// `documents` uses swap-removal, so the document that occupied the last
    /// slot moves into the freed slot; its reverse-index entries are rewritten
    /// accordingly, and corpus counters are decremented for every removed term.
    pub fn del_doc(&mut self, key: &K)
    where
        K: PartialEq,
    {
        match self.documents.swap_remove(key) {
            RemoveResult::Removed { old_value: tf_vec, old_key: _, index: id } => {
                let terms_idx = tf_vec.as_ind_slice();
                // Unlink the removed document from every term it contained.
                terms_idx.iter().for_each(|&idx| {
                    let doc_keys = self.term_dim_rev_index.get_with_index_mut(idx as usize).expect("unreachable");
                    if let Some(pos) = doc_keys.iter().position(|k| *k == id as u32) {
                        doc_keys.swap_remove(pos);
                    }
                });
                // After swap_remove, the doc that used to sit at the old last
                // index (== current len) now occupies slot `id`; repoint its
                // reverse-index entries (skipped when the last doc was removed).
                let swap_doc_id = self.documents.len() as u32;
                if swap_doc_id != id as u32 {
                    self.documents.get_with_index(id).expect("unreachable").as_ind_slice().iter().for_each(|&idx| {
                        let doc_keys = self.term_dim_rev_index.get_with_index_mut(idx as usize).expect("unreachable");
                        if let Some(pos) = doc_keys.iter().position(|k| *k == swap_doc_id) {
                            doc_keys[pos] = id as u32;
                        }
                    });
                }
                let terms = terms_idx.iter()
                    .filter_map(|&idx| self.term_dim_rev_index.get_key_with_index(idx as usize))
                    .collect::<Vec<&Box<str>>>();
                self.corpus_ref.sub_set(&terms);
            }
            RemoveResult::None => {}
        }
    }
    /// Returns the stored sparse TF vector for `key`, if any.
    pub fn get_tf(&self, key: &K) -> Option<&TFVector<N>>
    where
        K: Eq + Hash,
    {
        self.documents.get(key)
    }
    /// Reconstructs a [`TermFrequency`] from the TF vector stored for `key`.
    ///
    /// Counts go through `E::tf_denorm`, so they are only as exact as the
    /// engine's normalize/denormalize round trip allows.
    pub fn get_tf_into_term_freq(&self, key: &K) -> Option<TermFrequency> {
        let tf_vec = self.get_tf(key)?;
        let mut term_freq = TermFrequency::new();
        tf_vec.raw_iter().for_each(|(idx, val)| {
            if let Some(term) = self.term_dim_rev_index.get_key_with_index(idx as usize) {
                let term_num = E::tf_denorm(val);
                term_freq.set_term_count(term, term_num as u64);
            }
        });
        Some(term_freq)
    }
    /// True if a document is stored under `key`.
    pub fn contains_doc(&self, key: &K) -> bool
    where
        K: PartialEq,
    {
        self.documents.contains_key(key)
    }
    /// True if `term` has been registered as a dimension.
    pub fn contains_term(&self, term: &str) -> bool {
        self.term_dim_rev_index.contains_key(&Box::<str>::from(term))
    }
    /// True if every term of `freq` is a known dimension.
    pub fn contains_terms_from_freq(&self, freq: &TermFrequency) -> bool {
        freq.term_set_ref_str().iter().all(|tok| self.contains_term(tok))
    }
    /// Number of stored documents.
    pub fn doc_num(&self) -> usize {
        self.documents.len()
    }
    /// Absorbs every document of `other` into `self`.
    ///
    /// First maps `other`'s term dimensions onto this vectorizer's dimension
    /// space (registering missing terms), then permutes and inserts each
    /// document. When a key collides, the replaced document's terms are
    /// subtracted from the corpus; the incoming document's terms are never
    /// added here — NOTE(review): this looks like it assumes both vectorizers
    /// share the same `Arc<Corpus>` (so incoming docs were already counted);
    /// confirm against callers.
    pub fn merge(&mut self, other: Self)
    where
        K: Eq + Hash,
    {
        // For each of `other`'s dimensions, find (or create) the matching
        // dimension index in `self`; `perm_idxs[old_dim] == new_dim`.
        let perm_idxs: Vec<u32> = other.term_dim_rev_index.into_iter().map(|(term, _)| {
            match self.term_dim_rev_index.entry_mut(term) {
                EntryMut::Occupied { index, .. } => index as u32,
                EntryMut::Vacant { key, map } => {
                    match map.insert(key, Vec::new()) {
                        InsertResult::New { index } => index as u32,
                        // A vacant entry cannot override an existing one.
                        InsertResult::Override { .. } => unreachable!(),
                    }
                }
            }
        }).collect();
        other.documents.into_iter().for_each(|(key, mut tf_vec)| {
            // Remap the vector's dimension indices into self's space.
            tf_vec.perm(&perm_idxs);
            let (_, old_tf_terms_ind) = self.add_tf_vec(key, tf_vec);
            let del_terms = old_tf_terms_ind.into_iter().map(|old_idx| {
                self.term_dim_rev_index.get_key_with_index(old_idx as usize).expect("unreachable")
            }).collect::<Vec<&Box<str>>>();
            self.corpus_ref.sub_set(&del_terms);
        });
    }
}