use std::collections::HashMap;
use std::sync::Arc;
use std::cmp::Ordering;
use tokenize::tokenize;
use index::Index;
/// A named collection of indexed corpora together with collection-wide
/// token statistics used when ranking search results.
pub struct Global {
    /// Label supplied by the caller when the collection is created.
    pub name: String,
    /// For every token seen so far, the number of inserted indices whose
    /// token table contains it (incremented once per index on insert).
    pub dictionary: HashMap<String, usize>,
    /// Shared handles to every inserted index, in insertion order.
    pub indices: Vec<Arc<Index>>,
}
impl Global {
    /// Creates an empty collection labelled `name`, with no indices and an
    /// empty token dictionary.
    pub fn new(name: &str) -> Global {
        Global {
            name: name.to_owned(),
            dictionary: HashMap::new(),
            indices: Vec::new(),
        }
    }

    /// Builds an `Index` from `corpus`, registers it with this collection and
    /// returns a shared handle to it.
    ///
    /// Every distinct token of the new index bumps its entry in
    /// `dictionary` by one, so the dictionary tracks how many inserted
    /// indices contain each token.
    pub fn insert(&mut self, corpus: &str) -> Arc<Index> {
        let index = Arc::new(Index::new(corpus));
        // Borrow the token table in place; only the key strings are cloned.
        for token in index.tokens.keys() {
            *self.dictionary.entry(token.clone()).or_insert(0) += 1;
        }
        self.indices.push(Arc::clone(&index));
        index
    }

    /// Scores every inserted index against `text` and returns the
    /// `(index, score)` pairs sorted by descending score.
    ///
    /// For each distinct query token the contribution is
    /// `(occurrences in index * occurrences in query) / dictionary count`;
    /// the sum is then divided by the index's `word_count`.
    ///
    /// Tokens absent from `dictionary` contribute nothing, and an index with
    /// a zero `word_count` scores `0.0`, so the returned scores are never NaN.
    pub fn search(&self, text: &str) -> Vec<(Arc<Index>, f32)> {
        // Collapse the query into token -> number of occurrences.
        let query_tokens: Vec<String> = tokenize(text);
        let mut query: HashMap<String, usize> = HashMap::with_capacity(query_tokens.len());
        for token in query_tokens {
            *query.entry(token).or_insert(0) += 1;
        }

        let mut scores: Vec<(Arc<Index>, f32)> = Vec::with_capacity(self.indices.len());
        for index in &self.indices {
            let mut score = 0.0f32;
            for (token, &count) in &query {
                let global_occurrence = self.dictionary.get(token).cloned().unwrap_or(0);
                if global_occurrence == 0 {
                    // No inserted index contains this token; dividing would
                    // yield 0/0 = NaN and poison the whole score.
                    continue;
                }
                let index_count = index.tokens.get(token).cloned().unwrap_or(0);
                score += (index_count * count) as f32 / global_occurrence as f32;
            }
            // Guard the length normalisation against an index with no words.
            let words = index.word_count as f32;
            let normalized = if words > 0.0 { score / words } else { 0.0 };
            scores.push((Arc::clone(index), normalized));
        }
        self.finalize(scores)
    }

    /// Orders `scores` from highest to lowest score and returns them.
    ///
    /// The sort is stable, so entries with equal scores keep their incoming
    /// relative order.
    fn finalize(&self, mut scores: Vec<(Arc<Index>, f32)>) -> Vec<(Arc<Index>, f32)> {
        scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(Ordering::Equal));
        scores
    }
}