use crossbeam_channel::unbounded;
use rayon::prelude::*;
use std::{
cmp::Ordering,
collections::HashMap,
fmt::{Display, Formatter},
fs::File,
io::{prelude::*, BufReader},
ops::{Deref, DerefMut},
path::PathBuf,
};
use crate::errors::*;
/// Text of a single q-gram.
pub(crate) type Token = String;
/// Identifier type; in this file it carries the zero-based line index of a posting.
pub(crate) type ID = usize;
/// Position type; in this file it carries the index of a q-gram window within its source string.
pub(crate) type Loc = usize;
/// A q-gram together with the position at which it was extracted.
#[derive(Clone, Debug, Default)]
pub struct PosQGram {
/// Text of the q-gram.
pub token: Token,
/// Position of the q-gram within the string it was extracted from.
pub loc: Loc,
}
impl PosQGram {
fn from(t: Token, l: Loc) -> Self {
Self { token: t, loc: l }
}
pub fn cmp(&self, other: &Self, inverted: InvertedIndex) -> Ordering {
let len_a: usize = inverted.get(&self.token).unwrap().1;
let len_b: usize = inverted.get(&other.token).unwrap().1;
match len_a.cmp(&len_b) {
Ordering::Greater => Ordering::Greater,
Ordering::Less => Ordering::Less,
Ordering::Equal => self.token.as_bytes().cmp(&other.token.as_bytes()),
}
}
}
impl Display for PosQGram {
    /// Renders the q-gram as `(token, loc)`.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let Self { token, loc } = self;
        write!(f, "({}, {})", token, loc)
    }
}
/// A list of positional q-grams, wrapping a `Vec<PosQGram>`.
#[derive(Debug)]
pub struct PosQGramArray {
/// The wrapped q-grams, in whatever order they were last built or sorted.
pub inner: Vec<PosQGram>,
}
impl PosQGramArray {
pub fn new() -> Self {
Self {
inner: Vec::<PosQGram>::new(),
}
}
pub fn from_vec(inner: Vec<PosQGram>) -> Self {
Self { inner }
}
pub fn from(s: &str, q: usize) -> Self {
let slice: Vec<String> = Vec::from(s)
.par_windows(q)
.map(|ngrams| {
std::str::from_utf8(ngrams)
.expect("Error when parsing ngrams")
.to_string()
})
.collect();
let mut inner: Vec<PosQGram> = Vec::new();
slice.into_iter().enumerate().for_each(|(pos, key)| {
inner.push(PosQGram::from(key.to_string(), pos));
});
inner.par_sort_unstable_by_key(|qgram| qgram.loc);
Self { inner }
}
pub fn sort_by_frequency(&mut self, inverted: &InvertedIndex) {
self.par_sort_unstable_by(|a, b| {
let len_a: usize = inverted.get(&a.token).unwrap().1;
let len_b: usize = inverted.get(&b.token).unwrap().1;
match len_a.cmp(&len_b) {
Ordering::Greater => Ordering::Greater,
Ordering::Less => Ordering::Less,
Ordering::Equal => a.token.as_bytes().cmp(&b.token.as_bytes()),
}
});
}
pub fn sort_by_location(&mut self) {
self.par_sort_unstable_by(|a, b| a.loc.cmp(&b.loc))
}
}
/// Gives read access to the wrapped vector's methods directly on the array.
impl Deref for PosQGramArray {
    type Target = Vec<PosQGram>;

    /// Borrows the wrapped vector.
    fn deref(&self) -> &Vec<PosQGram> {
        let Self { inner } = self;
        inner
    }
}
impl DerefMut for PosQGramArray {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.inner
}
}
impl Display for PosQGramArray {
    /// Renders the array as a bracketed list of q-grams separated by `, `;
    /// an empty array renders as `[]`.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let rendered: Vec<String> = self.inner.iter().map(|qgram| qgram.to_string()).collect();
        write!(f, "[{}]", rendered.join(", "))
    }
}
/// Postings of one token: `(ID, Loc)` pairs. `generate_inverted_index` fills these with
/// (line index, window index) pairs.
pub type InvertedList = Vec<(ID, Loc)>;
/// Maps each token to its postings list and an occurrence count; `sort_by_frequency` and
/// `PosQGram::cmp` read the count as the token's frequency.
pub type InvertedIndex = HashMap<Token, (InvertedList, usize)>;
pub fn generate_inverted_index(
doc_x: &PathBuf,
doc_y: &PathBuf,
q: usize,
) -> Result<InvertedIndex> {
let reader_y: BufReader<File> = BufReader::new(File::open(doc_y)?);
let mut ngram_map: InvertedIndex = HashMap::new();
let (map_y_s, map_y_r) = unbounded::<(Token, (ID, Loc))>();
reader_y
.lines()
.enumerate()
.for_each(|(line_id, line_result)| {
let map_y_s_clone = map_y_s.clone();
let slice: Vec<_> = Vec::from(line_result.unwrap())
.par_windows(q)
.map(|qgrams| {
std::str::from_utf8(qgrams)
.expect("Error when parsing ngrams")
.to_string()
})
.collect();
slice.into_par_iter().enumerate().for_each(|(pos, key)| {
map_y_s_clone.send((key, (line_id, pos))).unwrap();
});
});
drop(map_y_s);
while let Ok((key, (line_id, pos))) = map_y_r.recv() {
ngram_map
.entry(key)
.or_insert((Vec::new(), 0))
.0
.push((line_id, pos));
}
drop(map_y_r);
ngram_map
.values_mut()
.par_bridge()
.for_each(|value| value.1 = value.0.len());
if doc_x != doc_y {
let reader_x: BufReader<File> = BufReader::new(File::open(doc_x)?);
let (map_x_s, map_x_r) = unbounded::<Token>();
reader_x.lines().for_each(|line_result| {
let map_x_s_clone = map_x_s.clone();
let slice: Vec<_> = Vec::from(line_result.unwrap())
.par_windows(q)
.map(|qgrams| {
std::str::from_utf8(qgrams)
.expect("Error when parsing ngrams")
.to_string()
})
.collect();
slice.into_par_iter().for_each(|key| {
map_x_s_clone.send(key).unwrap();
});
});
drop(map_x_s);
while let Ok(key) = map_x_r.recv() {
let (_list_y, count) = ngram_map.entry(key).or_insert((Vec::new(), 0));
*count += 1;
}
drop(map_x_r);
}
ngram_map.par_iter_mut().for_each(|(_, (list_y, _count))| {
list_y.par_sort_unstable_by_key(|(id_y, _loc_y)| {
*id_y });
});
Ok(ngram_map)
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    /// Splitting "hello" into 2-grams renders them in positional order.
    #[test]
    fn pos_qgram_array() {
        let rendered = PosQGramArray::from("hello", 2).to_string();
        assert_eq!(rendered, "[(he, 0), (el, 1), (ll, 2), (lo, 3)]");
    }

    /// Indexing the sample file against itself records "he" at position 0 of
    /// lines 0, 1 and 2, with a count of 3.
    #[test]
    fn qgram_counter() {
        let testfile = PathBuf::from("./testset/sample_test1.txt");
        let index = generate_inverted_index(&testfile, &testfile, 2).unwrap();
        let rendered = format!("{:?}", index.get("he"));
        let expected = format!("{:?}", Some(([(0, 0), (1, 0), (2, 0)], 3)));
        assert_eq!(rendered, expected);
    }
}