use half::f16;
use num_traits::{Num, Pow};
use crate::{Corpus, TermFrequency, utils::datastruct::{map::IndexSet, vector::{TFVector, TFVectorTrait}}};
/// Strategy for producing the TF and IDF vectors used by the vectorizer.
///
/// `N` is the numeric type in which each term-frequency value is stored.
pub trait TFIDFEngine<N>: Send + Sync
where
    N: Num + Copy,
{
    /// Builds one IDF weight per entry of `term_dim_sample`, in the same order.
    ///
    /// Each weight is `doc_num / (term_count + 1)`, where `doc_num` comes from
    /// `corpus.get_doc_num()` and `term_count` from `corpus.get_term_count(term)`.
    /// The division is done in `f64` and the result narrowed to `f32`; no logarithm
    /// is applied here. The `+ 1` keeps the divisor non-zero when the count is zero.
    fn idf_vec(corpus: &Corpus, term_dim_sample: &Vec<Box<str>>) -> Vec<f32> {
        let total_docs = corpus.get_doc_num() as f64;
        term_dim_sample
            .iter()
            .map(|term| {
                let containing = corpus.get_term_count(term) as f64;
                (total_docs / (containing + 1.0)) as f32
            })
            .collect()
    }

    /// Builds the sparse term-frequency vector for `freq`, using `term_dim_sample`
    /// to translate each term into its index.
    fn tf_vec(freq: &TermFrequency, term_dim_sample: &IndexSet<Box<str>>) -> TFVector<N>;

    /// Converts a stored term-frequency value of type `N` into a `u32`.
    ///
    /// NOTE(review): judging by the name and the implementations in this file, this
    /// is meant to undo whatever encoding `tf_vec` applied and recover the raw count.
    fn tf_denorm(val: N) -> u32;
}
/// Stateless engine type that carries the `TFIDFEngine` implementations of this file.
#[derive(Debug, Clone)]
pub struct DefaultTFIDFEngine;

impl DefaultTFIDFEngine {
    /// Creates the engine. The type has no fields, so there is nothing to configure.
    pub fn new() -> Self {
        Self
    }
}
/// Half-precision engine: each stored value is the square root of the raw term
/// count, narrowed to `f16`.
impl TFIDFEngine<f16> for DefaultTFIDFEngine {
    /// Builds the sparse TF vector for `freq`.
    ///
    /// Terms that `term_dim_sample` does not contain are skipped. For every other
    /// term the index reported by `term_dim_sample` is stored together with
    /// `sqrt(count)` converted to `f16`.
    #[inline]
    fn tf_vec(freq: &TermFrequency, term_dim_sample: &IndexSet<Box<str>>) -> TFVector<f16> {
        let term_sum = freq.term_sum() as u32;
        // Used both as the capacity hint and as the `len` argument below; fewer
        // entries may actually be stored when some terms are skipped.
        let len = freq.term_num();
        let mut ind_vec: Vec<u32> = Vec::with_capacity(len);
        let mut val_vec: Vec<f16> = Vec::with_capacity(len);
        for (term, count) in freq.iter() {
            if let Some(idx) = term_dim_sample.get_index(term) {
                ind_vec.push(idx as u32);
                // The square root is only computed for terms that are kept.
                val_vec.push(f16::from_f32((count as f32).sqrt()));
            }
        }
        // SAFETY: NOTE(review) the contract of `TFVector::from_vec` is not visible from
        // this file. What holds here: both vectors are pushed in lockstep, so they
        // always have the same length. Confirm that this is what `from_vec` requires.
        unsafe { TFVector::from_vec(ind_vec, val_vec, len as u32, term_sum) }
    }

    /// Recovers the integer count from a value stored by `tf_vec` by squaring it.
    ///
    /// The result is rounded to the nearest integer rather than truncated: the `f16`
    /// square root is inexact, so e.g. a count of 2 is stored as 1.4140625, whose
    /// square (about 1.9996) would truncate to 1. Very large counts remain
    /// approximate because `f16` only carries about 11 significant bits.
    #[inline(always)]
    fn tf_denorm(val: f16) -> u32 {
        val.to_f32().pow(2).round() as u32
    }
}
/// Single-precision engine: each stored value is the raw term count as an `f32`.
impl TFIDFEngine<f32> for DefaultTFIDFEngine {
    /// Builds the sparse TF vector for `freq`.
    ///
    /// Terms missing from `term_dim_sample` are skipped; every other term contributes
    /// its index and its count converted to `f32`.
    #[inline]
    fn tf_vec(freq: &TermFrequency, term_dim_sample: &IndexSet<Box<str>>) -> TFVector<f32> {
        let total = freq.term_sum() as u32;
        let distinct = freq.term_num();
        let mut indices: Vec<u32> = Vec::with_capacity(distinct);
        let mut values: Vec<f32> = Vec::with_capacity(distinct);

        // Keep only the terms that have an index, pairing each index with its count.
        let known_terms = freq
            .iter()
            .filter_map(|(term, count)| term_dim_sample.get_index(term).map(|idx| (idx, count)));
        for (idx, count) in known_terms {
            indices.push(idx as u32);
            values.push(count as f32);
        }

        // SAFETY: NOTE(review) the contract of `TFVector::from_vec` is not visible from
        // this file. What holds here: both vectors are filled in lockstep, so they
        // always have the same length. Confirm that this is what `from_vec` requires.
        unsafe { TFVector::from_vec(indices, values, distinct as u32, total) }
    }

    /// Converts a stored value back to a count; the fractional part, if any, is dropped.
    #[inline(always)]
    fn tf_denorm(val: f32) -> u32 {
        val as u32
    }
}
/// Integer engine: each stored value is the raw term count as a `u32`.
impl TFIDFEngine<u32> for DefaultTFIDFEngine {
    /// Builds the sparse TF vector for `freq`.
    ///
    /// Terms missing from `term_dim_sample` are skipped; every other term contributes
    /// its index and its count cast to `u32`.
    #[inline]
    fn tf_vec(freq: &TermFrequency, term_dim_sample: &IndexSet<Box<str>>) -> TFVector<u32> {
        let total = freq.term_sum() as u32;
        let distinct = freq.term_num();
        let mut indices: Vec<u32> = Vec::with_capacity(distinct);
        let mut values: Vec<u32> = Vec::with_capacity(distinct);

        freq.iter().for_each(|(term, count)| {
            if let Some(idx) = term_dim_sample.get_index(term) {
                indices.push(idx as u32);
                values.push(count as u32);
            }
        });

        // SAFETY: NOTE(review) the contract of `TFVector::from_vec` is not visible from
        // this file. What holds here: both vectors are filled in lockstep, so they
        // always have the same length. Confirm that this is what `from_vec` requires.
        unsafe { TFVector::from_vec(indices, values, distinct as u32, total) }
    }

    /// Stored values already are counts, so this returns `val` unchanged.
    #[inline(always)]
    fn tf_denorm(val: u32) -> u32 {
        val
    }
}
/// Compact integer engine: each stored value is the raw term count as a `u16`,
/// saturated at `u16::MAX`.
impl TFIDFEngine<u16> for DefaultTFIDFEngine {
    /// Builds the sparse TF vector for `freq`.
    ///
    /// Terms missing from `term_dim_sample` are skipped; every other term contributes
    /// its index and its count. Counts above `u16::MAX` are stored as `u16::MAX`.
    #[inline]
    fn tf_vec(freq: &TermFrequency, term_dim_sample: &IndexSet<Box<str>>) -> TFVector<u16> {
        let term_sum = freq.term_sum() as u32;
        // Used both as the capacity hint and as the `len` argument below; fewer
        // entries may actually be stored when some terms are skipped.
        let len = freq.term_num();
        let mut ind_vec: Vec<u32> = Vec::with_capacity(len);
        let mut val_vec: Vec<u16> = Vec::with_capacity(len);
        for (term, count) in freq.iter() {
            if let Some(idx) = term_dim_sample.get_index(term) {
                ind_vec.push(idx as u32);
                // Saturate instead of wrapping: a bare `as u16` would silently turn a
                // count of 65_536 into 0.
                val_vec.push((count as u64).min(u64::from(u16::MAX)) as u16);
            }
        }
        // SAFETY: NOTE(review) the contract of `TFVector::from_vec` is not visible from
        // this file. What holds here: both vectors are pushed in lockstep, so they
        // always have the same length. Confirm that this is what `from_vec` requires.
        unsafe { TFVector::from_vec(ind_vec, val_vec, len as u32, term_sum) }
    }

    /// Widens a stored count to `u32`; the conversion is lossless.
    #[inline(always)]
    fn tf_denorm(val: u16) -> u32 {
        u32::from(val)
    }
}