use crate::containers::{AdjList, Bitmap, Sequence};
use crate::object_iter::ObjectIter;
use crate::predicate_iter::PredicateIter;
use crate::{ControlInfo, IdKind};
use bytesize::ByteSize;
use log::{debug, error};
use rsdict::RsDict;
use std::cmp::Ordering;
use std::convert::TryFrom;
use std::fmt;
use std::io;
use std::io::BufRead;
use sucds::{CompactVector, Searial, WaveletMatrix, WaveletMatrixBuilder};
#[allow(missing_docs)]
#[repr(u8)]
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub enum Order {
Unknown = 0,
SPO = 1,
SOP = 2,
PSO = 3,
POS = 4,
OSP = 5,
OPS = 6,
}
impl TryFrom<u32> for Order {
type Error = std::io::Error;
fn try_from(original: u32) -> Result<Self, Self::Error> {
match original {
0 => Ok(Order::Unknown),
1 => Ok(Order::SPO),
2 => Ok(Order::SOP),
3 => Ok(Order::PSO),
4 => Ok(Order::POS),
5 => Ok(Order::OSP),
6 => Ok(Order::OPS),
_ => Err(Self::Error::new(io::ErrorKind::InvalidData, "Unrecognized order")),
}
}
}
pub struct OpIndex {
pub sequence: CompactVector,
pub bitmap: Bitmap,
}
impl fmt::Debug for OpIndex {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
writeln!(f, "total size {} {{", ByteSize(self.size_in_bytes() as u64))?;
writeln!(
f,
" sequence: {} with {} bits,",
ByteSize(self.sequence.len() as u64 * self.sequence.width() as u64 / 8),
self.sequence.width()
)?;
write!(f, " bitmap: {:#?}\n}}", self.bitmap)
}
}
impl OpIndex {
pub fn size_in_bytes(&self) -> usize {
self.sequence.len() * self.sequence.width() / 8 + self.bitmap.size_in_bytes()
}
pub fn find(&self, o: Id) -> usize {
self.bitmap.dict.select1(o as u64 - 1).unwrap() as usize
}
pub fn last(&self, o: Id) -> usize {
match self.bitmap.dict.select1(o as u64) {
Some(index) => index as usize - 1,
None => self.bitmap.dict.len() - 1,
}
}
}
pub struct TriplesBitmap {
order: Order,
pub bitmap_y: Bitmap,
pub adjlist_z: AdjList,
pub op_index: OpIndex,
pub wavelet_y: WaveletMatrix,
}
impl fmt::Debug for TriplesBitmap {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
writeln!(f, "total size {}", ByteSize(self.size_in_bytes() as u64))?;
writeln!(f, "adjlist_z {:#?}", self.adjlist_z)?;
writeln!(f, "op_index {:#?}", self.op_index)?;
write!(f, "wavelet_y {}", ByteSize(self.wavelet_y.size_in_bytes() as u64))
}
}
impl TriplesBitmap {
pub fn read_sect<R: BufRead>(reader: &mut R) -> io::Result<Self> {
use io::Error;
use io::ErrorKind::InvalidData;
let triples_ci = ControlInfo::read(reader)?;
match &triples_ci.format[..] {
"<http://purl.org/HDT/hdt#triplesBitmap>" => Ok(TriplesBitmap::read(reader, &triples_ci)?),
"<http://purl.org/HDT/hdt#triplesList>" => {
Err(Error::new(InvalidData, "Triples Lists are not supported yet."))
}
_ => Err(Error::new(InvalidData, "Unknown triples listing format.")),
}
}
pub fn triples_with_id(&self, id: usize, id_kind: &IdKind) -> Box<dyn Iterator<Item = TripleId> + '_> {
match id_kind {
IdKind::Subject => Box::new(BitmapIter::with_s(self, id)),
IdKind::Predicate => Box::new(PredicateIter::new(self, id)),
IdKind::Object => Box::new(ObjectIter::new(self, id)),
}
}
pub fn size_in_bytes(&self) -> usize {
self.adjlist_z.size_in_bytes() + self.op_index.size_in_bytes() + self.wavelet_y.size_in_bytes()
}
pub fn find_y(&self, subject_id: Id) -> usize {
if subject_id == 0 {
return 0;
}
self.bitmap_y.dict.select1(subject_id as u64 - 1).unwrap() as usize + 1
}
pub fn last_y(&self, subject_id: usize) -> usize {
self.find_y(subject_id + 1) - 1
}
fn bin_search_y(&self, element: usize, begin: usize, end: usize) -> Option<usize> {
let mut low = begin;
let mut high = end;
while low < high {
let mid = (low + high) / 2;
match self.wavelet_y.get(mid).cmp(&element) {
Ordering::Less => low = mid + 1,
Ordering::Greater => high = mid,
Ordering::Equal => return Some(mid),
};
}
None
}
pub fn search_y(&self, subject_id: usize, property_id: usize) -> Option<usize> {
self.bin_search_y(property_id, self.find_y(subject_id), self.last_y(subject_id) + 1)
}
fn build_wavelet(mut sequence: Sequence) -> WaveletMatrix {
debug!("Building wavelet matrix...");
let mut wavelet_builder = WaveletMatrixBuilder::with_width(sequence.bits_per_entry);
for x in &sequence {
wavelet_builder.push(x);
}
assert!(sequence.crc_handle.take().unwrap().join().unwrap(), "wavelet source CRC check failed.");
drop(sequence);
let wavelet = wavelet_builder.build().expect("Error building the wavelet matrix. Aborting.");
debug!("built wavelet matrix with length {}", wavelet.len());
wavelet
}
fn read<R: BufRead>(reader: &mut R, triples_ci: &ControlInfo) -> io::Result<Self> {
use std::io::Error;
use std::io::ErrorKind::InvalidData;
let order: Order;
if let Some(n) = triples_ci.get("order").and_then(|v| v.parse::<u32>().ok()) {
order = Order::try_from(n)?;
} else {
return Err(Error::new(InvalidData, "Unrecognized order"));
}
let bitmap_y = Bitmap::read(reader)?;
let bitmap_z = Bitmap::read(reader)?;
let sequence_y = Sequence::read(reader)?;
let wavelet_thread = std::thread::spawn(|| Self::build_wavelet(sequence_y));
let mut sequence_z = Sequence::read(reader)?;
debug!("Building OPS index...");
let entries = sequence_z.entries;
let max_object = sequence_z.into_iter().max().unwrap().to_owned();
let mut indicess = vec![Vec::<u32>::with_capacity(4); max_object];
for i in 0..entries {
let object = sequence_z.get(i);
if object == 0 {
error!("ERROR: There is a zero value in the Z level.");
continue;
}
indicess[object - 1].push(i as u32); }
let mut bitmap_index_dict = RsDict::new();
let mut cv = CompactVector::with_capacity(entries, sucds::util::needed_bits(entries));
let wavelet_y = wavelet_thread.join().unwrap();
let get_p = |pos_z: u32| {
let pos_y = bitmap_z.dict.rank(pos_z.to_owned() as u64, true);
wavelet_y.get(pos_y as usize) as Id
};
for mut indices in indicess {
let mut first = true;
indices.sort_by_cached_key(|a| get_p(*a));
for index in indices {
bitmap_index_dict.push(first);
first = false;
cv.push(index as usize);
}
}
let bitmap_index = Bitmap { dict: bitmap_index_dict };
let op_index = OpIndex { sequence: cv, bitmap: bitmap_index };
debug!("built OPS index");
assert!(sequence_z.crc_handle.take().unwrap().join().unwrap(), "sequence_z CRC check failed.");
let adjlist_z = AdjList::new(sequence_z, bitmap_z);
Ok(TriplesBitmap { order, bitmap_y, adjlist_z, op_index, wavelet_y })
}
pub fn coord_to_triple(&self, x: Id, y: Id, z: Id) -> io::Result<TripleId> {
use io::Error;
use io::ErrorKind::InvalidData;
if x == 0 || y == 0 || z == 0 {
return Err(Error::new(
InvalidData,
format!("({x},{y},{z}) none of the components of a triple may be 0."),
));
}
match self.order {
Order::SPO => Ok(TripleId::new(x, y, z)),
Order::SOP => Ok(TripleId::new(x, z, y)),
Order::PSO => Ok(TripleId::new(y, x, z)),
Order::POS => Ok(TripleId::new(y, z, x)),
Order::OSP => Ok(TripleId::new(z, x, y)),
Order::OPS => Ok(TripleId::new(z, y, x)),
Order::Unknown => Err(Error::new(InvalidData, "unknown triples order")),
}
}
}
impl<'a> IntoIterator for &'a TriplesBitmap {
type Item = TripleId;
type IntoIter = BitmapIter<'a>;
fn into_iter(self) -> Self::IntoIter {
BitmapIter::new(self)
}
}
pub struct BitmapIter<'a> {
triples: &'a TriplesBitmap,
x: Id,
pos_y: usize,
pos_z: usize,
max_y: usize,
max_z: usize,
}
impl<'a> BitmapIter<'a> {
pub const fn new(triples: &'a TriplesBitmap) -> Self {
BitmapIter {
triples,
x: 1, pos_y: 0,
pos_z: 0,
max_y: triples.wavelet_y.len(), max_z: triples.adjlist_z.len(), }
}
pub const fn empty(triples: &'a TriplesBitmap) -> Self {
BitmapIter { triples, x: 1, pos_y: 0, pos_z: 0, max_y: 0, max_z: 0 }
}
pub fn with_s(triples: &'a TriplesBitmap, subject_id: Id) -> Self {
let min_y = triples.find_y(subject_id - 1);
let min_z = triples.adjlist_z.find(min_y as Id);
let max_y = triples.find_y(subject_id);
let max_z = triples.adjlist_z.find(max_y as Id);
BitmapIter { triples, x: subject_id, pos_y: min_y, pos_z: min_z, max_y, max_z }
}
pub fn with_pattern(triples: &'a TriplesBitmap, pat: &TripleId) -> Self {
let (pat_x, pat_y, pat_z) = (pat.subject_id, pat.predicate_id, pat.object_id);
let (min_y, max_y, min_z, max_z);
let mut x = 1;
if pat_x != 0 {
if pat_y != 0 {
match triples.search_y(pat_x - 1, pat_y) {
Some(y) => min_y = y,
None => return BitmapIter::empty(triples),
};
max_y = min_y + 1;
if pat_z != 0 {
match triples.adjlist_z.search(min_y, pat_z) {
Some(z) => min_z = z,
None => return BitmapIter::empty(triples),
};
max_z = min_z + 1;
} else {
min_z = triples.adjlist_z.find(min_y);
max_z = triples.adjlist_z.last(min_y) + 1;
}
} else {
min_y = triples.find_y(pat_x - 1);
min_z = triples.adjlist_z.find(min_y);
max_y = triples.last_y(pat_x - 1) + 1;
max_z = triples.adjlist_z.find(max_y);
}
x = pat_x;
} else {
min_y = 0;
min_z = 0;
max_y = triples.wavelet_y.len();
max_z = triples.adjlist_z.len();
}
BitmapIter { triples, x, pos_y: min_y, pos_z: min_z, max_y, max_z }
}
}
impl<'a> Iterator for BitmapIter<'a> {
type Item = TripleId;
fn next(&mut self) -> Option<Self::Item> {
if self.pos_y >= self.max_y {
return None;
}
if self.pos_z >= self.max_z {
return None;
}
let y = self.triples.wavelet_y.get(self.pos_y) as Id;
let z = self.triples.adjlist_z.get_id(self.pos_z);
let triple_id = self.triples.coord_to_triple(self.x, y, z).unwrap();
if self.triples.adjlist_z.at_last_sibling(self.pos_z) {
if self.triples.bitmap_y.at_last_sibling(self.pos_y) {
self.x += 1;
}
self.pos_y += 1;
}
self.pos_z += 1;
Some(triple_id)
}
}
pub type Id = usize;
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub struct TripleId {
pub subject_id: Id,
pub predicate_id: Id,
pub object_id: Id,
}
impl TripleId {
pub const fn new(subject_id: Id, predicate_id: Id, object_id: Id) -> Self {
TripleId { subject_id, predicate_id, object_id }
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::header::Header;
use crate::tests::init;
use crate::{ControlInfo, FourSectDict, IdKind};
use pretty_assertions::assert_eq;
use std::fs::File;
use std::io::BufReader;
#[test]
fn read_triples() {
init();
let file = File::open("tests/resources/snikmeta.hdt").expect("error opening file");
let mut reader = BufReader::new(file);
ControlInfo::read(&mut reader).unwrap();
Header::read(&mut reader).unwrap();
let _dict = FourSectDict::read(&mut reader).unwrap();
let triples = TriplesBitmap::read_sect(&mut reader).unwrap();
let v: Vec<TripleId> = triples.into_iter().collect::<Vec<TripleId>>();
assert_eq!(v.len(), 327);
assert_eq!(v[0].subject_id, 1);
assert_eq!(v[2].subject_id, 1);
assert_eq!(v[3].subject_id, 2);
let num_subjects = 48;
let num_predicates = 23;
let num_objects = 175;
let mut filtered: Vec<TripleId>;
let kinds = [IdKind::Subject, IdKind::Predicate, IdKind::Object];
let lens = [num_subjects, num_predicates, num_objects];
let funs = [|t: TripleId| t.subject_id, |t: TripleId| t.predicate_id, |t: TripleId| t.object_id];
for j in 0..kinds.len() {
for i in 1..=lens[j] {
filtered = v.iter().filter(|tid| funs[j](**tid) == i).copied().collect();
filtered.sort_unstable();
let mut triples_with_id = triples.triples_with_id(i, &kinds[j]).collect::<Vec<TripleId>>();
triples_with_id.sort_unstable();
assert_eq!(filtered, triples_with_id, "triples_with({},{:?})", i, kinds[j]);
}
}
assert_eq!(0, BitmapIter::empty(&triples).count());
assert_eq!(
vec![TripleId::new(14, 14, 154)],
BitmapIter::with_pattern(&triples, &TripleId::new(14, 14, 154)).collect::<Vec<_>>()
);
assert_eq!(
vec![TripleId::new(14, 14, 154)],
BitmapIter::with_pattern(&triples, &TripleId::new(14, 14, 0)).collect::<Vec<_>>()
);
for i in 1..num_subjects {
assert_eq!(
BitmapIter::with_s(&triples, i).collect::<Vec<_>>(),
BitmapIter::with_pattern(&triples, &TripleId::new(i, 0, 0)).collect::<Vec<_>>()
);
}
assert_eq!(v, BitmapIter::with_pattern(&triples, &TripleId::new(0, 0, 0)).collect::<Vec<_>>());
assert_eq!(0, BitmapIter::with_pattern(&triples, &TripleId::new(12, 14, 154)).count());
}
}