claw 0.1.0-alpha2

Low-level information retrieval library.
Documentation
use std::collections::HashMap;
use std::fs::File;
use std::io::{BufReader, BufRead};
use regex::Regex;
use std::fmt::Formatter;
use log::{info, trace};
use crate::Error;

#[derive(Debug)]
pub struct IndexEntry {
    entry : u32,
    count : u32
}

impl IndexEntry {
    pub fn new(entry : u32) -> Self {
        IndexEntry {
            entry,
            count : 1
        }
    }

    pub fn with_count(self, count : u32) -> Self {
        IndexEntry {
            entry : self.entry,
            count
        }
    }

    pub fn entry(&self) -> &u32 {
        &self.entry
    }

    pub fn increment(&mut self) {
        self.count += 1
    }
}

/// Contains inverted index lists.
#[derive(Debug)]
pub struct InvertedIndex {
    lists : HashMap<String, Vec<IndexEntry>>,
    file : String
}

impl InvertedIndex {
    /// Create a new InvertedIndex from a file.
    pub fn from_file(file_name : &str) -> Result<Self, Error> {
        let i = std::time::Instant::now();
        trace!("Loading file into BufReader ...");
        let file = File::open(file_name)?;
        let buf_reader = BufReader::new(file);

        trace!("BufReader initialized");


        let mut lists : HashMap<String, Vec<IndexEntry>> = HashMap::new();

        let mut current_line: u32 = 1;

        trace!("Reading lines ...");

        for line in buf_reader.lines() {
            let l = line?;
            Self::insert_indices(&l, current_line, &mut lists)?;
            print!("Lines read: {}\r", current_line);
            current_line += 1;
        }

        println!("Lines read: {}", current_line - 1);
        println!("Time elapsed: {}s", i.elapsed().as_millis() as f32 / 1000.0);

        Ok(InvertedIndex { lists, file : file_name.to_string()})
    }

    /// Get a reference to the HashMap containing the inverted lists.
    pub fn get_lists(&self) -> &HashMap<String, Vec<IndexEntry>> {
        &self.lists
    }

    /// Get a mutable reference to the HashMap containing the inverted lists.
    pub fn get_mut_lists(&mut self) -> &mut HashMap<String, Vec<IndexEntry>> {
        &mut self.lists
    }

    pub fn take(self) -> HashMap<String, Vec<IndexEntry>> {
        self.lists
    }

    fn insert_indices(line : &str,
                      current_line : u32,
                      lists : &mut HashMap<String, Vec<IndexEntry>>)
        -> Result<(), Error> {
        let re = Regex::new("[^a-zA-Z]+")?;
        let splits = re.split(line);
        for word in splits {
            let word = &word.to_lowercase();
            if !lists.contains_key(word) {
                let v = vec![IndexEntry::new(current_line)];
                lists.insert(word.to_string(), v);
            } else {
                let v =
                    lists.get_mut(word).ok_or(Error::NoneError)?;
                let last = v.last_mut().ok_or(Error::NoneError)?;
                if last.entry() != &current_line {
                    v.push(IndexEntry::new(current_line));
                } else {
                    last.increment();
                }
            }
        }

        Ok(())
    }

    /// Finds the intersection of the lists identified by the queries.
    /// Returns it as formatted output in a single String.
    pub fn process_query_output(&self, queries : Vec<String>)
        -> Result<String, Error>{
        let indices = self.process_query(queries)?;
        let file = File::open(&self.file)?;
        let buf_reader = BufReader::new(file);
        let mut lines = buf_reader.lines();
        let mut curr_line = 0;
        let mut result = String::new();


        for i in indices {
            let i = i as usize;
            result.push_str(
                lines.nth(i - curr_line - 1).ok_or(Error::OutOfBounds)??.as_str()
            );
            curr_line = i - 1;
        }
        Ok(result)
    }

    /// Finds the intersection of the lists identified by the queries.
    /// Return as a vec of indices.
    pub fn process_query(&self, queries : Vec<String>) -> Result<Vec<u32>, Error> {
        trace!("Processing queries ...");
        if queries.is_empty() {
            return Ok(Vec::new());
        }
        let mut lists = Vec::new();
        for q in queries {
            let q = q.trim().to_string().to_lowercase();
            if let Some(l) = self.lists.get(&q) {
                lists.push(l);
            } else {
                info!("Not all queries found");
                return Ok(Vec::new())
            }
        }
        Self::intersect(lists)
    }

    fn intersect(lists: Vec<&Vec<IndexEntry>>) -> Result<Vec<u32>, Error>{
        trace!("Intersecting lists...");
        let lists = &lists;
        let mut iterators = Vec::with_capacity(lists.len());
        for l in lists {
            iterators.push(l.iter())
        }
        let mut accept = lists.len() - 1;
        let mut current_it : usize = 0;
        let mut current_max : u32 = 0;

        let mut results : Vec<u32> = Vec::new();
        loop {
            let it = iterators.get_mut(current_it)
                .ok_or(Error::OutOfBounds)?;
            if let Some(i) = it.next() {
                if *i.entry() > current_max {
                    current_max = *i.entry();
                    accept = match current_it {
                        0 => {lists.len() - 1}
                        _ => {current_it - 1}
                    };
                }
                if i.entry() == &current_max {
                    if accept == current_it {
                        results.push(current_max);
                    }
                }
                else {
                    continue;
                }
                current_it = (current_it + 1) % lists.len();
            }
            else {
                break;
            }
        }
        Ok(results)
    }
}

impl std::fmt::Display for InvertedIndex {
    fn fmt(&self, f : &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
        let mut result = String::new();
        for (word, list) in &self.lists {
            result.push_str(word);
            result.push_str(": ");
            result.push_str(&format!("{:?}", list));
            result.push_str("\n");
        }
        write!(f, "{}", result)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_gen_ii() {
        let ii =
            InvertedIndex::from_file("examples/example.txt")
                .unwrap().take();
        let mut ii: Vec<(&String, &Vec<IndexEntry>)>
            = ii.iter().collect();
        ii.sort_by_key(|(s, _)| {
            *s
        });
        assert_eq!(format!("{:?}", ii), "[(\"a\", [IndexEntry { entry: 1, count\
: 1 }, IndexEntry { entry: 2, count: 1 }]), (\"doc\", [IndexEntry { entry: 1, c\
ount: 1 }, IndexEntry { entry: 2, count: 1 }, IndexEntry { entry: 3, count: 1 }\
]), (\"film\", [IndexEntry { entry: 2, count: 1 }]), (\"movie\", [IndexEntry { \
entry: 1, count: 2 }, IndexEntry { entry: 3, count: 1 }])]");
    }

    #[test]
    fn test_intersect() {
        let ii = InvertedIndex::from_file("examples/example.txt");
        let lists = ii.unwrap().take();
        let result = InvertedIndex::intersect(
            vec![&lists["a"], &lists["movie"]]).unwrap();
        assert_eq!(result, vec![1])
    }

    #[test]
    fn test_process_query() {
        let ii = InvertedIndex::from_file("examples/example.txt").unwrap();
        assert_eq!(ii.process_query(vec!["a".to_string(), "movie".to_string()]).unwrap(), vec![1]);
    }

    #[test]
    fn test_process_query_output() {
        let ii = InvertedIndex::from_file("examples/example.txt").unwrap();
        let s = ii.process_query_output(vec!["a".to_string(), "movie".to_string()]).unwrap();
        assert_eq!(s, "Doc 1   A movie movie.".to_string())
    }
}