use std::collections::HashMap;
use std::fs::File;
use std::io::{BufReader, BufRead};
use regex::Regex;
use std::fmt::Formatter;
use log::{info, trace};
use crate::Error;
#[derive(Debug)]
pub struct IndexEntry {
entry : u32,
count : u32
}
impl IndexEntry {
pub fn new(entry : u32) -> Self {
IndexEntry {
entry,
count : 1
}
}
pub fn with_count(self, count : u32) -> Self {
IndexEntry {
entry : self.entry,
count
}
}
pub fn entry(&self) -> &u32 {
&self.entry
}
pub fn increment(&mut self) {
self.count += 1
}
}
#[derive(Debug)]
pub struct InvertedIndex {
lists : HashMap<String, Vec<IndexEntry>>,
file : String
}
impl InvertedIndex {
pub fn from_file(file_name : &str) -> Result<Self, Error> {
let i = std::time::Instant::now();
trace!("Loading file into BufReader ...");
let file = File::open(file_name)?;
let buf_reader = BufReader::new(file);
trace!("BufReader initialized");
let mut lists : HashMap<String, Vec<IndexEntry>> = HashMap::new();
let mut current_line: u32 = 1;
trace!("Reading lines ...");
for line in buf_reader.lines() {
let l = line?;
Self::insert_indices(&l, current_line, &mut lists)?;
print!("Lines read: {}\r", current_line);
current_line += 1;
}
println!("Lines read: {}", current_line - 1);
println!("Time elapsed: {}s", i.elapsed().as_millis() as f32 / 1000.0);
Ok(InvertedIndex { lists, file : file_name.to_string()})
}
pub fn get_lists(&self) -> &HashMap<String, Vec<IndexEntry>> {
&self.lists
}
pub fn get_mut_lists(&mut self) -> &mut HashMap<String, Vec<IndexEntry>> {
&mut self.lists
}
pub fn take(self) -> HashMap<String, Vec<IndexEntry>> {
self.lists
}
fn insert_indices(line : &str,
current_line : u32,
lists : &mut HashMap<String, Vec<IndexEntry>>)
-> Result<(), Error> {
let re = Regex::new("[^a-zA-Z]+")?;
let splits = re.split(line);
for word in splits {
let word = &word.to_lowercase();
if !lists.contains_key(word) {
let v = vec![IndexEntry::new(current_line)];
lists.insert(word.to_string(), v);
} else {
let v =
lists.get_mut(word).ok_or(Error::NoneError)?;
let last = v.last_mut().ok_or(Error::NoneError)?;
if last.entry() != ¤t_line {
v.push(IndexEntry::new(current_line));
} else {
last.increment();
}
}
}
Ok(())
}
pub fn process_query_output(&self, queries : Vec<String>)
-> Result<String, Error>{
let indices = self.process_query(queries)?;
let file = File::open(&self.file)?;
let buf_reader = BufReader::new(file);
let mut lines = buf_reader.lines();
let mut curr_line = 0;
let mut result = String::new();
for i in indices {
let i = i as usize;
result.push_str(
lines.nth(i - curr_line - 1).ok_or(Error::OutOfBounds)??.as_str()
);
curr_line = i - 1;
}
Ok(result)
}
pub fn process_query(&self, queries : Vec<String>) -> Result<Vec<u32>, Error> {
trace!("Processing queries ...");
if queries.is_empty() {
return Ok(Vec::new());
}
let mut lists = Vec::new();
for q in queries {
let q = q.trim().to_string().to_lowercase();
if let Some(l) = self.lists.get(&q) {
lists.push(l);
} else {
info!("Not all queries found");
return Ok(Vec::new())
}
}
Self::intersect(lists)
}
fn intersect(lists: Vec<&Vec<IndexEntry>>) -> Result<Vec<u32>, Error>{
trace!("Intersecting lists...");
let lists = &lists;
let mut iterators = Vec::with_capacity(lists.len());
for l in lists {
iterators.push(l.iter())
}
let mut accept = lists.len() - 1;
let mut current_it : usize = 0;
let mut current_max : u32 = 0;
let mut results : Vec<u32> = Vec::new();
loop {
let it = iterators.get_mut(current_it)
.ok_or(Error::OutOfBounds)?;
if let Some(i) = it.next() {
if *i.entry() > current_max {
current_max = *i.entry();
accept = match current_it {
0 => {lists.len() - 1}
_ => {current_it - 1}
};
}
if i.entry() == ¤t_max {
if accept == current_it {
results.push(current_max);
}
}
else {
continue;
}
current_it = (current_it + 1) % lists.len();
}
else {
break;
}
}
Ok(results)
}
}
impl std::fmt::Display for InvertedIndex {
fn fmt(&self, f : &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
let mut result = String::new();
for (word, list) in &self.lists {
result.push_str(word);
result.push_str(": ");
result.push_str(&format!("{:?}", list));
result.push_str("\n");
}
write!(f, "{}", result)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_gen_ii() {
let ii =
InvertedIndex::from_file("examples/example.txt")
.unwrap().take();
let mut ii: Vec<(&String, &Vec<IndexEntry>)>
= ii.iter().collect();
ii.sort_by_key(|(s, _)| {
*s
});
assert_eq!(format!("{:?}", ii), "[(\"a\", [IndexEntry { entry: 1, count\
: 1 }, IndexEntry { entry: 2, count: 1 }]), (\"doc\", [IndexEntry { entry: 1, c\
ount: 1 }, IndexEntry { entry: 2, count: 1 }, IndexEntry { entry: 3, count: 1 }\
]), (\"film\", [IndexEntry { entry: 2, count: 1 }]), (\"movie\", [IndexEntry { \
entry: 1, count: 2 }, IndexEntry { entry: 3, count: 1 }])]");
}
#[test]
fn test_intersect() {
let ii = InvertedIndex::from_file("examples/example.txt");
let lists = ii.unwrap().take();
let result = InvertedIndex::intersect(
vec![&lists["a"], &lists["movie"]]).unwrap();
assert_eq!(result, vec![1])
}
#[test]
fn test_process_query() {
let ii = InvertedIndex::from_file("examples/example.txt").unwrap();
assert_eq!(ii.process_query(vec!["a".to_string(), "movie".to_string()]).unwrap(), vec![1]);
}
#[test]
fn test_process_query_output() {
let ii = InvertedIndex::from_file("examples/example.txt").unwrap();
let s = ii.process_query_output(vec!["a".to_string(), "movie".to_string()]).unwrap();
assert_eq!(s, "Doc 1 A movie movie.".to_string())
}
}