use xxhash_rust::xxh64::xxh64;
use crate::index::segment::{FOOTER_SIZE, HEADER_SIZE, MAGIC};
use crate::IndexError;
/// Byte offsets and entry counts extracted from a segment file's footer by
/// `parse_segment_mmap`. Offsets are absolute positions within the mapped file.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(super) struct SegmentLayout {
    /// Start of the document table; validated to lie in
    /// `HEADER_SIZE..=content_end`.
    pub doc_table_offset: usize,
    /// Start of the dictionary; validated to lie in `HEADER_SIZE..=content_end`
    /// and to never precede `doc_table_offset`.
    pub dict_offset: usize,
    /// Number of documents recorded in the footer. The doc table is treated as
    /// 8 bytes per document when computing `postings_start`.
    pub doc_count: u32,
    /// Number of grams recorded in the footer (not otherwise interpreted here).
    pub gram_count: u32,
    /// First byte past the fixed-width doc table:
    /// `doc_table_offset + doc_count * 8` (saturating).
    pub postings_start: usize,
}
/// Validates a memory-mapped segment file and extracts its layout.
///
/// Footer fields as read here (byte offsets within the `FOOTER_SIZE`-byte
/// footer, all little-endian):
/// `[0..8)` doc_table_offset (u64), `[16..24)` dict_offset (u64),
/// `[24..28)` doc_count (u32), `[28..32)` gram_count (u32),
/// `[32..40)` xxh64 checksum of everything before the footer (u64),
/// `[40..44)` format version (u32), `[44..48)` `MAGIC`.
/// Footer bytes `[8..16)` are not read here — presumably another offset
/// consumed elsewhere; TODO confirm against the segment writer.
///
/// # Errors
/// Returns `IndexError::CorruptIndex` when the file is too small, either magic
/// is wrong, the version is not in `accepted_versions`, the checksum does not
/// match, or an offset lies outside the content region.
pub(super) fn parse_segment_mmap(
    mmap: &[u8],
    accepted_versions: &[u32],
) -> Result<SegmentLayout, IndexError> {
    let len = mmap.len();
    let corrupt = |msg: &str| IndexError::CorruptIndex(msg.into());
    if len < HEADER_SIZE + FOOTER_SIZE {
        return Err(corrupt("file too small"));
    }
    let footer = mmap
        .get(len - FOOTER_SIZE..)
        .ok_or_else(|| corrupt("truncated: cannot read footer"))?;
    // This `44..48` read succeeding also proves the footer holds at least 48
    // bytes, which makes the direct `footer[a..b]` indexing below panic-free.
    if footer.get(44..48) != Some(MAGIC.as_slice()) {
        return Err(corrupt("bad footer magic"));
    }
    let version = u32::from_le_bytes(
        footer
            .get(40..44)
            .ok_or_else(|| corrupt("truncated footer"))?
            .try_into()
            .map_err(|_| corrupt("footer slice"))?,
    );
    if !accepted_versions.contains(&version) {
        return Err(IndexError::CorruptIndex(format!(
            "unsupported segment version {version}"
        )));
    }
    let stored_checksum = u64::from_le_bytes(
        footer
            .get(32..40)
            .ok_or_else(|| corrupt("truncated footer"))?
            .try_into()
            .map_err(|_| corrupt("footer slice"))?,
    );
    // The checksum covers header + body — everything except the footer itself.
    let content = mmap
        .get(..len - FOOTER_SIZE)
        .ok_or_else(|| corrupt("truncated: cannot read content"))?;
    if xxh64(content, 0) != stored_checksum {
        return Err(corrupt("checksum mismatch"));
    }
    if mmap.get(0..4) != Some(MAGIC.as_slice()) {
        return Err(corrupt("bad header magic"));
    }
    // NOTE(review): `as usize` truncates on 32-bit targets for offsets above
    // usize::MAX before the range checks run — consider `usize::try_from`.
    let doc_table_offset = u64::from_le_bytes(
        footer[0..8]
            .try_into()
            .map_err(|_| corrupt("footer doc_table_offset slice"))?,
    ) as usize;
    let dict_offset = u64::from_le_bytes(
        footer[16..24]
            .try_into()
            .map_err(|_| corrupt("footer dict_offset slice"))?,
    ) as usize;
    let doc_count = u32::from_le_bytes(
        footer[24..28]
            .try_into()
            .map_err(|_| corrupt("footer doc_count slice"))?,
    );
    let gram_count = u32::from_le_bytes(
        footer[28..32]
            .try_into()
            .map_err(|_| corrupt("footer gram_count slice"))?,
    );
    // Both offsets must land inside the checksummed content region.
    let content_end = len - FOOTER_SIZE;
    if doc_table_offset < HEADER_SIZE || doc_table_offset > content_end {
        return Err(corrupt("doc_table_offset out of range"));
    }
    if dict_offset < HEADER_SIZE || dict_offset > content_end {
        return Err(corrupt("dict_offset out of range"));
    }
    if dict_offset < doc_table_offset {
        return Err(corrupt("dict_offset precedes doc_table_offset"));
    }
    // FIX: the previous `doc_count as usize * 8` could overflow on 32-bit
    // targets (panic in debug builds, wrap in release) for a hostile
    // doc_count; saturate the multiply to match the saturating add. On 64-bit
    // targets the result is identical to before.
    // NOTE(review): postings_start is not range-checked against dict_offset /
    // content_end here — confirm callers bound their reads.
    let postings_start = doc_table_offset.saturating_add((doc_count as usize).saturating_mul(8));
    Ok(SegmentLayout {
        doc_table_offset,
        dict_offset,
        doc_count,
        gram_count,
        postings_start,
    })
}
#[cfg(feature = "memmap2")]
/// Reads exactly `buf.len()` bytes from `file` starting at absolute `offset`,
/// selecting a positional-read primitive per platform.
///
/// - unix: `FileExt::read_exact_at` (pread-style; does not move the cursor).
/// - windows: loops over `FileExt::seek_read`, retrying on `Interrupted` and
///   failing with `UnexpectedEof` on a short file.
///   NOTE(review): `seek_read` can update the Windows file cursor — confirm no
///   caller relies on the cursor being preserved.
/// - other platforms: `seek` + `read_exact` on a `try_clone`'d handle.
///   NOTE(review): `try_clone` shares the underlying cursor with `file`, so
///   this fallback does move the original handle's position.
pub(super) fn read_exact_at(
    file: &std::fs::File,
    buf: &mut [u8],
    offset: u64,
) -> std::io::Result<()> {
    #[cfg(unix)]
    {
        use std::os::unix::fs::FileExt;
        // Positional read, independent of the fd's seek position.
        file.read_exact_at(buf, offset)
    }
    #[cfg(windows)]
    {
        use std::io::{Error, ErrorKind};
        use std::os::windows::fs::FileExt;
        // `seek_read` may return short reads: loop until `buf` is full,
        // advancing the remaining window and the offset by each read's length.
        let mut remaining = buf;
        let mut off = offset;
        while !remaining.is_empty() {
            match file.seek_read(remaining, off) {
                Ok(0) => {
                    // EOF before the buffer was filled.
                    return Err(Error::new(
                        ErrorKind::UnexpectedEof,
                        "failed to fill buffer",
                    ));
                }
                Ok(n) => {
                    remaining = &mut remaining[n..];
                    off += n as u64;
                }
                // Transient interruption: retry the same read.
                Err(ref e) if e.kind() == ErrorKind::Interrupted => {}
                Err(e) => return Err(e),
            }
        }
        Ok(())
    }
    #[cfg(not(any(unix, windows)))]
    {
        use std::io::{Read, Seek, SeekFrom};
        // No positional-read API available: clone the handle and seek it.
        let mut owned = file.try_clone()?;
        owned.seek(SeekFrom::Start(offset))?;
        owned.read_exact(buf)
    }
}
#[cfg(feature = "memmap2")]
/// Size in bytes of the magic prefix at the start of each posting-list record
/// in the postings file. `read_posting_list_pread` skips past it (it does not
/// verify the bytes themselves).
pub(super) const POST_MAGIC_SIZE: u64 = 8;
#[cfg(feature = "memmap2")]
/// Reads and decodes one posting list from `post_file` at absolute offset
/// `abs_off` using positional reads (`read_exact_at`).
///
/// On-disk record shape as read here: `POST_MAGIC_SIZE` magic bytes (skipped),
/// then a 9-byte header — 1-byte encoding tag, 4 uninterpreted bytes, 4-byte
/// little-endian payload length — then the payload itself.
/// NOTE(review): the magic bytes at `abs_off` are skipped, not verified —
/// confirm the caller (or an earlier pass) validates them.
///
/// # Errors
/// `InvalidData` on offset overflow, an oversized length field, an unknown
/// encoding tag, or a payload that fails roaring deserialization; plus any
/// I/O error from the underlying reads.
pub(super) fn read_posting_list_pread(
    post_file: &std::fs::File,
    abs_off: u64,
) -> std::io::Result<crate::posting::PostingList> {
    use std::io::{Error, ErrorKind};
    use crate::posting::{roaring_util, PostingList};

    /// Fixed header length: encoding tag (1) + uninterpreted bytes (4) +
    /// payload length (4).
    const HEADER_LEN: usize = 9;
    /// Cap on a single payload; guards a corrupt length field from forcing a
    /// huge allocation.
    const MAX_POSTING_BYTES: usize = 8 * 1024 * 1024;

    let header_off = abs_off
        .checked_add(POST_MAGIC_SIZE)
        .ok_or_else(|| Error::new(ErrorKind::InvalidData, "posting header offset overflow"))?;
    let mut header = [0u8; HEADER_LEN];
    read_exact_at(post_file, &mut header, header_off)?;

    let encoding = header[0];
    // header[1..5] is not interpreted here — presumably a count or flags
    // field; TODO confirm against the writer.
    let byte_len = u32::from_le_bytes(header[5..9].try_into().unwrap()) as usize;
    if byte_len > MAX_POSTING_BYTES {
        return Err(Error::new(
            ErrorKind::InvalidData,
            format!("posting list too large: {byte_len} bytes (max {MAX_POSTING_BYTES})"),
        ));
    }

    // Payload follows immediately after the header.
    let data_off = header_off
        .checked_add(HEADER_LEN as u64)
        .ok_or_else(|| Error::new(ErrorKind::InvalidData, "posting data offset overflow"))?;
    let mut data = vec![0u8; byte_len];
    read_exact_at(post_file, &mut data, data_off)?;

    match encoding {
        // Tag 0: payload stored verbatim as the "small" representation.
        0 => Ok(PostingList::Small(data)),
        // Tag 1: roaring-serialized payload.
        1 => roaring_util::deserialize(&data)
            .map(PostingList::Large)
            .map_err(|e| Error::new(ErrorKind::InvalidData, e.to_string())),
        _ => Err(Error::new(
            ErrorKind::InvalidData,
            format!("unknown posting list encoding {encoding}"),
        )),
    }
}