use std::{collections::HashMap, io::Write, path::PathBuf};

use positioned_io::{ReadAt, WriteAt};

use crate::{DocumentId, PageId, Result, Version, ZeboError};
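
/// On-disk index mapping each data page to the first document id it stores.
///
/// File layout (all integers big-endian):
/// - byte 0:     format version (`Version::V1`)
/// - bytes 1..9: `u64` write offset, i.e. where the next entry is appended
/// - bytes 9..:  fixed-size 16-byte entries of `(page_id, start_doc_id)`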
pub struct ZeboIndex<DocId> {
    offset: u64,
    index_file: std::fs::File,
    #[cfg(test)]
    #[allow(dead_code)]
    index_file_path: PathBuf,
    p: std::marker::PhantomData<DocId>,
}

impl<DocId: DocumentId> ZeboIndex<DocId> {
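    /// Creates a new, empty index file at `index_dir/index.index`, truncating
    /// any existing file, and writes the 9-byte header.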
    pub fn try_new(index_dir: PathBuf) -> Result<Self> {
        std::fs::create_dir_all(&index_dir).map_err(ZeboError::OperationError)?;

        let index_file_path = index_dir.join("index.index");
        let mut index_file = std::fs::OpenOptions::new()
            .read(true)
            .write(true)
            .create(true)
            .truncate(true)
            .open(&index_file_path)
            .map_err(ZeboError::OperationError)?;

        // Byte 0: format version.
        index_file
            .write_all_at(0, &[Version::V1.into()])
            .map_err(ZeboError::OperationError)?;

        // Bytes 1..9: the write offset. A fresh index holds only the 9-byte
        // header, so the first entry starts at offset 9.
        let initial_offset = 9_u64.to_be_bytes();
        index_file
            .write_all_at(1, &initial_offset)
            .map_err(ZeboError::OperationError)?;

        Ok(Self {
            // 1 version byte + 8 offset bytes.
            offset: 8 + 1,
            index_file,
            p: std::marker::PhantomData,
            #[cfg(test)]
            index_file_path,
        })
    }
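
    /// Opens an existing index file, validating the format version and
    /// restoring the persisted write offset from the header.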
    pub fn try_load(index_dir: PathBuf) -> Result<Self> {
        let index_file_path = index_dir.join("index.index");
        let index_file = std::fs::OpenOptions::new()
            .read(true)
            .write(true)
            .create(true)
            .truncate(false)
            .open(&index_file_path)
            .map_err(ZeboError::OperationError)?;

        // Byte 0: format version. Refuse anything other than V1.
        let mut buf = [0; 1];
        index_file
            .read_exact_at(0, &mut buf)
            .map_err(ZeboError::OperationError)?;
        let version = buf[0];
        if version != Version::V1.into() {
            return Err(ZeboError::UnsupportedVersion {
                version,
                wanted: Version::V1.into(),
            });
        }

        // Bytes 1..9: the persisted write offset.
        let mut offset = [0; 8];
        index_file
            .read_exact_at(1, &mut offset)
            .map_err(ZeboError::OperationError)?;
        let offset = u64::from_be_bytes(offset);

        Ok(Self {
            offset,
            index_file,
            p: std::marker::PhantomData,
            #[cfg(test)]
            index_file_path,
        })
    }
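
    /// Records that a new page starts at `start_doc_id`. Entries must be
    /// appended in ascending `start_doc_id` order so that `get_pages` can
    /// binary-search them. The entry is flushed and fsynced before returning.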
    pub fn new_page(&mut self, start_doc_id: DocId, page_id: PageId) -> Result<()> {
        // Append one 16-byte entry: page id followed by starting doc id.
        let mut buf = [0; 8 + 8];
        buf[0..8].copy_from_slice(&page_id.0.to_be_bytes());
        buf[8..16].copy_from_slice(&start_doc_id.as_u64().to_be_bytes());
        self.index_file
            .write_all_at(self.offset, &buf)
            .map_err(ZeboError::OperationError)?;

        // Advance the write offset and persist it back into the header.
        self.offset += 8 + 8;
        self.index_file
            .write_all_at(1, &self.offset.to_be_bytes())
            .map_err(ZeboError::OperationError)?;

        Write::flush(&mut self.index_file).map_err(ZeboError::OperationError)?;
        self.index_file
            .sync_all()
            .map_err(ZeboError::OperationError)?;

        Ok(())
    }
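
    /// Returns all page ids recorded in the index, sorted by page id.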
    pub fn get_page_ids(&self) -> Result<Vec<PageId>> {
        // Read every entry after the 9-byte header in one shot.
        let mut buf = vec![0; (self.offset - 8 - 1) as usize];
        self.index_file
            .read_exact_at(9, &mut buf)
            .map_err(ZeboError::OperationError)?;

        let expected_page_count = (self.offset - 8 - 1) / (8 + 8);
        let mut pages = Vec::with_capacity(expected_page_count as usize);
        for chunk in buf.chunks_exact(16) {
            let page_id = u64::from_be_bytes(chunk[0..8].try_into().unwrap());
            pages.push(PageId(page_id));
        }
        pages.sort_by_key(|page| page.0);

        Ok(pages)
    }
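
    /// Resolves each doc id to the page that should contain it, grouping the
    /// results by page. For every doc id this also computes a `ProbableIndex`,
    /// the document's expected position within its page (the doc id minus the
    /// page's starting doc id). Doc ids smaller than the first page's starting
    /// doc id are silently skipped.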
    pub fn get_pages<I: IntoIterator<Item = DocId>>(
        &self,
        doc_ids: I,
        result: &mut HashMap<PageId, Vec<(u64, ProbableIndex)>>,
    ) -> Result<()> {
        // Read every entry after the 9-byte header in one shot.
        let mut buf = vec![0; (self.offset - 8 - 1) as usize];
        self.index_file
            .read_exact_at(9, &mut buf)
            .map_err(ZeboError::OperationError)?;

        let expected_page_count = (self.offset - 8 - 1) / (8 + 8);
        let mut pages = Vec::with_capacity(expected_page_count as usize);
        for chunk in buf.chunks_exact(16) {
            let page_id = u64::from_be_bytes(chunk[0..8].try_into().unwrap());
            let starting_doc_id = u64::from_be_bytes(chunk[8..16].try_into().unwrap());
            pages.push((starting_doc_id, PageId(page_id)));
        }

        for doc_id in doc_ids {
            // A document lives in the page with the greatest starting doc id
            // that is <= the document id. Entries are stored in ascending
            // `starting_doc_id` order, so a binary search finds it directly.
            let (starting_doc_id, page_id) =
                match pages.binary_search_by_key(&doc_id.as_u64(), |(s, _)| *s) {
                    Ok(index) => pages[index],
                    Err(index) => {
                        if index == 0 {
                            // The doc id precedes the first page: not indexed.
                            continue;
                        } else {
                            pages[index - 1]
                        }
                    }
                };
            let d = doc_id.as_u64();
            debug_assert!(starting_doc_id <= d);
            result
                .entry(page_id)
                .or_default()
                .push((d, ProbableIndex(d - starting_doc_id)));
        }

        Ok(())
    }
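
    /// Returns every `(start_doc_id, page_id)` entry in insertion order.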
    pub fn get_all_pages(&self) -> Result<Vec<(u64, PageId)>> {
        // Read every entry after the 9-byte header in one shot.
        let mut buf = vec![0; (self.offset - 8 - 1) as usize];
        self.index_file
            .read_exact_at(9, &mut buf)
            .map_err(ZeboError::OperationError)?;

        let expected_page_count = (self.offset - 8 - 1) / (8 + 8);
        let mut pages = Vec::with_capacity(expected_page_count as usize);
        for chunk in buf.chunks_exact(16) {
            let page_id = u64::from_be_bytes(chunk[0..8].try_into().unwrap());
            let starting_doc_id = u64::from_be_bytes(chunk[8..16].try_into().unwrap());
            pages.push((starting_doc_id, PageId(page_id)));
        }

        Ok(pages)
    }
}
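
/// A document's expected position within its page, computed as the document
/// id minus the page's starting doc id. "Probable" because it is only an
/// estimate; the page itself remains the source of truth for where the
/// document actually lives.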
#[derive(Debug, Clone, Copy)]
pub struct ProbableIndex(pub u64);
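
// A minimal usage sketch (comment only, not compiled). It assumes a doc id
// type implementing `DocumentId`; `u64` is used here purely for illustration
// and may not have such an impl in this crate.
//
//     let dir = std::env::temp_dir().join("zebo-index-example");
//     let mut index = ZeboIndex::<u64>::try_new(dir.clone())?;
//     index.new_page(0, PageId(1))?;   // documents 0.. live in page 1
//     index.new_page(100, PageId(2))?; // documents 100.. live in page 2
//
//     let mut by_page = HashMap::new();
//     index.get_pages([5_u64, 150], &mut by_page)?;
//     // doc 5 -> page 1, probable index 5; doc 150 -> page 2, probable index 50
//
//     drop(index);
//     let reloaded = ZeboIndex::<u64>::try_load(dir)?; // offset survives reload
//     assert_eq!(reloaded.get_page_ids()?, vec![PageId(1), PageId(2)]);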