use std::fs::File;
use std::path::Path;
use std::io::{Write, Read};
use std::collections::BTreeMap;
use std::cmp::Ordering;
use serde::{Deserialize, Serialize};
use rmp_serde::{Deserializer, Serializer};
use tokenizer;
use document::{self, Document};
/// Nested list of `usize` values; the same shape as the `term_doc_idx` field of `Index`.
type TermIndex = Vec<Vec<usize>>;
/// A `(term id, term text)` pair, as built by `Index::get_terms` and ordered by
/// `cmp_term_pair`.
type TermIdPair = (usize, String);
/// Error value returned by the indexer functions in this file.
///
/// It carries nothing but a borrowed, human-readable description of the failure.
#[derive(Debug)]
pub struct IndexerError<'a> {
    /// Description of what went wrong.
    pub message: &'a str,
}

impl<'a> IndexerError<'a> {
    /// Wraps `msg` in an `IndexerError` that borrows it for `'a`.
    ///
    /// The return type is spelled `Self` so the lifetime carried by the error is not
    /// hidden behind a bare `IndexerError`.
    fn new(msg: &'a str) -> Self {
        IndexerError { message: msg }
    }
}
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct Index {
pub n_terms: usize,
pub n_docs: usize, terms: BTreeMap<String, usize>,
documents: Vec<Document>,
term_doc_idx: Vec<Vec<usize>>, }
impl Index {
pub fn new() -> Index {
Index {
n_terms: 0,
n_docs: 0,
terms: BTreeMap::new(), documents: Vec::new(),
term_doc_idx: vec![],
}
}
pub fn add_term(&mut self, term: String) -> Option<usize> {
if self.terms.contains_key(&term) {
let term_id:usize = *self.terms.get(&term).unwrap();
Some(term_id)
} else {
let term_id = self.n_terms;
self.terms.insert(term, term_id);
self.n_terms += 1;
Some(term_id)
}
}
pub fn add(&mut self, doc: Document ) -> Result<usize, IndexerError> {
let current_doc_id = self.n_docs;
self.documents.push( doc );
self.n_docs += 1;
Ok(current_doc_id)
}
pub fn index(&mut self) -> Result<usize, IndexerError> {
let mut doc_id:usize = 0;
for doc in self.documents.clone().iter() {
let tokens = tokenizer::tokenize_whitespace(doc.text.clone());
for term in tokens.iter() {
match Index::add_term(&mut *self, term.clone()) {
None => return Err(IndexerError::new("Failed to add term into index")),
Some(term_id) => {
self.add_doc_into_term_idx(term_id, doc_id);
}
}
}
doc_id += 1; }
self.n_docs = doc_id; Ok(doc_id)
}
pub fn get_docs_by_term(&self, term: String) -> Option<Vec<Document>> {
if !self.terms.contains_key(&term) {
return None;
}
let term_id = self.terms[&term];
let docs = self.term_doc_idx[term_id]
.iter()
.fold(vec![], |mut acc, &id|{
acc.push(self.documents[id].clone());
acc
});
Some(docs)
}
pub fn get_terms(&self) -> Vec<String> {
let mut vocabulary = Vec::with_capacity(self.n_terms);
for (term, &term_id) in self.terms.iter() {
vocabulary.push( (term_id, term.to_string()) )
}
vocabulary.sort_by(|a, b| cmp_term_pair(a, b) ); vocabulary.iter().map(|x| x.1.clone() ).collect()
}
fn add_doc_into_term_idx(&mut self, term_id: usize, doc_id: usize) -> Option<usize> {
if term_id >= self.term_doc_idx.len() {
self.term_doc_idx.push(vec![])
};
let doc_pos = self.term_doc_idx[term_id].len();
self.term_doc_idx[term_id].push(doc_id);
Some(doc_pos)
}
pub fn get_term_index(&self) -> Vec<(usize, Vec<usize>)> {
let mut term_idx = Vec::with_capacity(self.n_terms);
let mut term_id = 0 as usize;
for doc_ids in self.term_doc_idx.iter() {
term_idx.push( (term_id, doc_ids.clone() ) );
term_id += 1;
}
term_idx
}
pub fn get_document_label(&self, doc_id: usize) -> Option<String> {
match self.documents.get(doc_id) {
Some(doc) => Some(doc.clone().label),
_ => None
}
}
pub fn get_documents(&self) -> Vec<Document> {
self.documents.clone()
}
}
/// Builds an index from the entries found directly inside the directory `target_path`.
///
/// Each directory entry is passed to `document::from_json_file`. Entries that cannot be
/// read, or for which that call fails, are skipped. Every document that loads is added
/// to a fresh `Index`, which is then indexed and returned.
///
/// # Errors
///
/// Returns an error when `target_path` does not exist, when its entries cannot be
/// listed, or when adding or indexing the documents fails.
pub fn build_from_path<'a>(target_path: &'a str) -> Result<Index, IndexerError<'a>> {
    let path = Path::new(target_path);
    if !path.exists() {
        return Err(IndexerError::new("target path doesnt exists or is not accessible"));
    }

    // Report a listing failure (e.g. the path is not a directory) to the caller instead
    // of panicking.
    let entries = match path.read_dir() {
        Ok(entries) => entries,
        Err(_) => return Err(IndexerError::new("read_dir failed")),
    };

    let mut idx = Index::new();
    for entry in entries {
        if let Ok(dir_entry) = entry {
            if let Ok(doc) = document::from_json_file(dir_entry.path()) {
                if idx.add(doc).is_err() {
                    return Err(IndexerError::new("Failed to add document"));
                }
            }
        }
    }

    // An indexing failure must not be silently dropped.
    if idx.index().is_err() {
        return Err(IndexerError::new("Failed to index documents"));
    }
    Ok(idx)
}
/// Serializes `idx` and writes it to the file at `target_path`.
///
/// The file is created via `File::create`, the index is serialized into an in-memory
/// buffer, and the buffer is then written and synced to disk. Returns `Ok(true)` on
/// success.
///
/// # Errors
///
/// Returns an error when the file cannot be created, when serialization fails, or when
/// writing or syncing the file fails. None of these conditions panic.
pub fn save<'a>(idx: &Index, target_path: &'a str) -> Result<bool, IndexerError<'a>> {
    let mut fp = match File::create(target_path) {
        Ok(fp) => fp,
        Err(_) => return Err(IndexerError::new("Failed to open targetfile")),
    };

    let mut buf: Vec<u8> = Vec::new();
    idx.serialize(&mut Serializer::new(&mut buf))
        .map_err(|_| IndexerError::new("Failed to serialize index"))?;
    fp.write_all(&buf)
        .map_err(|_| IndexerError::new("Failed to write into file"))?;
    fp.sync_all()
        .map_err(|_| IndexerError::new("Failed to save file on the disk"))?;
    Ok(true)
}
pub fn load<'a>(source_path: &'a str) -> Result<Index, IndexerError> {
let mut fp = match File::open(source_path) {
Ok(fp) => fp,
Err(_) => return Err(IndexerError::new("Failed to open sourcefile"))
};
let mut buf = Vec::new();
fp.read_to_end(&mut buf).expect("Failed to read a content of the sourcefile");
let mut de = Deserializer::new(&buf[..]);
match Deserialize::deserialize(&mut de) {
Ok(idx) => Ok(idx),
Err(_) => Err(IndexerError::new("Failed to deserialize file buffer"))
}
}
/// Orders two `(term id, term)` pairs by term id only; the term text is ignored.
///
/// Pairs with the same id compare as `Ordering::Equal`, so the function is a consistent
/// total order on the ids, as `sort_by` requires of its comparator.
fn cmp_term_pair(a: &TermIdPair, b: &TermIdPair) -> Ordering {
    a.0.cmp(&b.0)
}