#[cfg(feature = "memmap2")]
use std::path::Path;
#[cfg(feature = "memmap2")]
use memmap2::MmapOptions;
use xxhash_rust::xxh64::xxh64;
#[cfg(feature = "memmap2")]
use crate::index::segment::MAX_SEGMENT_SIZE;
use crate::index::segment::{
MmapSegment, PostingsBacking, SegmentData, FOOTER_SIZE, FORMAT_VERSION_V2, FORMAT_VERSION_V3,
HEADER_SIZE, MAGIC,
};
use crate::IndexError;
/// Offsets and counts recovered from a segment file's footer by
/// [`parse_segment_mmap`]. All offsets are absolute byte positions
/// within the dict/segment buffer.
pub(super) struct SegmentLayout {
// Absolute offset of the document table (entries are 8 bytes each,
// as implied by the postings_start computation below).
pub doc_table_offset: usize,
// Absolute offset of the n-gram dictionary; validated to never
// precede the document table.
pub dict_offset: usize,
// Number of documents, read from footer bytes 24..28.
pub doc_count: u32,
// Number of distinct grams, read from footer bytes 28..32.
pub gram_count: u32,
// Derived: doc_table_offset + doc_count * 8 (first byte past the
// doc table). Not read from the footer.
pub postings_start: usize,
}
/// Parse and validate a segment image, returning its [`SegmentLayout`].
///
/// Footer layout (last `FOOTER_SIZE` bytes, little-endian):
/// - `[0..8]`   doc_table_offset (u64)
/// - `[8..16]`  not read here — presumably another offset; confirm against the writer
/// - `[16..24]` dict_offset (u64)
/// - `[24..28]` doc_count (u32)
/// - `[28..32]` gram_count (u32)
/// - `[32..40]` xxh64 checksum of everything before the footer (u64)
/// - `[40..44]` format version (u32)
/// - `[44..48]` footer magic
///
/// # Errors
/// Returns `IndexError::CorruptIndex` when the buffer is truncated, either
/// magic is wrong, the version is not in `accepted_versions`, the checksum
/// mismatches, or the offsets fall outside the content region.
pub(super) fn parse_segment_mmap(
    mmap: &[u8],
    accepted_versions: &[u32],
) -> Result<SegmentLayout, IndexError> {
    let len = mmap.len();
    let corrupt = |msg: &str| IndexError::CorruptIndex(msg.into());
    if len < HEADER_SIZE + FOOTER_SIZE {
        return Err(corrupt("file too small"));
    }
    let footer = mmap
        .get(len - FOOTER_SIZE..)
        .ok_or_else(|| corrupt("truncated: cannot read footer"))?;
    if footer.get(44..48) != Some(MAGIC.as_slice()) {
        return Err(corrupt("bad footer magic"));
    }
    let version = u32::from_le_bytes(
        footer
            .get(40..44)
            .ok_or_else(|| corrupt("truncated footer"))?
            .try_into()
            .map_err(|_| corrupt("footer slice"))?,
    );
    if !accepted_versions.contains(&version) {
        return Err(IndexError::CorruptIndex(format!(
            "unsupported segment version {version}"
        )));
    }
    let stored_checksum = u64::from_le_bytes(
        footer
            .get(32..40)
            .ok_or_else(|| corrupt("truncated footer"))?
            .try_into()
            .map_err(|_| corrupt("footer slice"))?,
    );
    // The checksum covers everything before the footer (header + body).
    let content = mmap
        .get(..len - FOOTER_SIZE)
        .ok_or_else(|| corrupt("truncated: cannot read content"))?;
    if xxh64(content, 0) != stored_checksum {
        return Err(corrupt("checksum mismatch"));
    }
    if mmap.get(0..4) != Some(MAGIC.as_slice()) {
        return Err(corrupt("bad header magic"));
    }
    // usize::try_from (rather than `as usize`) prevents huge u64 offsets
    // from silently wrapping on 32-bit targets and then sneaking past the
    // range checks below.
    let doc_table_offset = usize::try_from(u64::from_le_bytes(
        footer[0..8]
            .try_into()
            .map_err(|_| corrupt("footer doc_table_offset slice"))?,
    ))
    .map_err(|_| corrupt("doc_table_offset exceeds addressable range"))?;
    let dict_offset = usize::try_from(u64::from_le_bytes(
        footer[16..24]
            .try_into()
            .map_err(|_| corrupt("footer dict_offset slice"))?,
    ))
    .map_err(|_| corrupt("dict_offset exceeds addressable range"))?;
    let doc_count = u32::from_le_bytes(
        footer[24..28]
            .try_into()
            .map_err(|_| corrupt("footer doc_count slice"))?,
    );
    let gram_count = u32::from_le_bytes(
        footer[28..32]
            .try_into()
            .map_err(|_| corrupt("footer gram_count slice"))?,
    );
    let content_end = len - FOOTER_SIZE;
    if doc_table_offset < HEADER_SIZE || doc_table_offset > content_end {
        return Err(corrupt("doc_table_offset out of range"));
    }
    if dict_offset < HEADER_SIZE || dict_offset > content_end {
        return Err(corrupt("dict_offset out of range"));
    }
    if dict_offset < doc_table_offset {
        return Err(corrupt("dict_offset precedes doc_table_offset"));
    }
    // saturating_mul: `doc_count as usize * 8` can wrap on 32-bit targets
    // (u32::MAX * 8 overflows a 32-bit usize); saturating keeps the value
    // monotone so downstream bounds checks fail safely instead of passing
    // with a wrapped-around offset.
    let postings_start = doc_table_offset.saturating_add((doc_count as usize).saturating_mul(8));
    Ok(SegmentLayout {
        doc_table_offset,
        dict_offset,
        doc_count,
        gram_count,
        postings_start,
    })
}
/// Fill `buf` completely with bytes read from `file` starting at
/// absolute byte `offset`, without disturbing the caller's file cursor.
#[cfg(feature = "memmap2")]
pub(super) fn read_exact_at(
    file: &std::fs::File,
    buf: &mut [u8],
    offset: u64,
) -> std::io::Result<()> {
    #[cfg(unix)]
    {
        // Positional read via the Unix FileExt extension trait; this does
        // not move the file's seek position.
        use std::os::unix::fs::FileExt;
        file.read_exact_at(buf, offset)
    }
    #[cfg(not(unix))]
    {
        // No positional-read primitive available: seek on a cloned handle
        // so the original handle's cursor is left untouched.
        use std::io::{Read, Seek, SeekFrom};
        let mut handle = file.try_clone()?;
        handle.seek(SeekFrom::Start(offset))?;
        handle.read_exact(buf)
    }
}
#[cfg(feature = "memmap2")]
/// Byte length of the "SNTXPOST" magic at the start of a v3 postings
/// file; added to caller-supplied offsets to skip past the magic.
pub(super) const POST_MAGIC_SIZE: u64 = 8;
/// Read a single posting list from `post_file` via positional reads.
///
/// `abs_off` is the list's offset not counting the leading file magic;
/// the 9-byte list header (encoding tag + length) sits at
/// `abs_off + POST_MAGIC_SIZE` and the payload immediately after it.
#[cfg(feature = "memmap2")]
pub(super) fn read_posting_list_pread(
    post_file: &std::fs::File,
    abs_off: u64,
) -> std::io::Result<crate::posting::PostingList> {
    use std::io::{Error, ErrorKind};
    use crate::posting::{roaring_util, PostingList};
    // Cap on a single decoded posting list, guarding against a corrupt
    // length field triggering a huge allocation.
    const MAX_POSTING_BYTES: usize = 8 * 1024 * 1024;
    let invalid = |msg: String| Error::new(ErrorKind::InvalidData, msg);
    let header_off = abs_off
        .checked_add(POST_MAGIC_SIZE)
        .ok_or_else(|| invalid("posting header offset overflow".into()))?;
    let mut header = [0u8; 9];
    read_exact_at(post_file, &mut header, header_off)?;
    // Byte 0 is the encoding tag; bytes 5..9 hold the payload length
    // (LE u32). Bytes 1..5 are not consumed here — presumably other
    // metadata; confirm against the writer.
    let encoding = header[0];
    let byte_len = u32::from_le_bytes(header[5..9].try_into().unwrap()) as usize;
    if byte_len > MAX_POSTING_BYTES {
        return Err(invalid(format!(
            "posting list too large: {byte_len} bytes (max {MAX_POSTING_BYTES})"
        )));
    }
    // Payload starts right after the 9-byte header.
    let payload_off = header_off
        .checked_add(9)
        .ok_or_else(|| invalid("posting data offset overflow".into()))?;
    let mut payload = vec![0u8; byte_len];
    read_exact_at(post_file, &mut payload, payload_off)?;
    match encoding {
        // Tag 0: raw small-list bytes, stored as-is.
        0 => Ok(PostingList::Small(payload)),
        // Tag 1: roaring-bitmap serialization.
        1 => roaring_util::deserialize(&payload)
            .map(PostingList::Large)
            .map_err(|e| invalid(e.to_string())),
        other => Err(invalid(format!("unknown posting list encoding {other}"))),
    }
}
impl MmapSegment {
    /// Build a segment backed entirely by heap buffers.
    ///
    /// `dict_bytes` must be a complete v2 or v3 dictionary image;
    /// `post_bytes` carries the postings and is stored untouched.
    ///
    /// # Errors
    /// Propagates `IndexError::CorruptIndex` from [`parse_segment_mmap`]
    /// when `dict_bytes` fails validation.
    pub fn from_bytes(dict_bytes: Vec<u8>, post_bytes: Vec<u8>) -> Result<Self, IndexError> {
        let layout = parse_segment_mmap(&dict_bytes, &[FORMAT_VERSION_V2, FORMAT_VERSION_V3])?;
        let len = dict_bytes.len();
        Ok(MmapSegment {
            _file: None,
            expected_len: len,
            doc_count: layout.doc_count,
            gram_count: layout.gram_count,
            doc_table_offset: layout.doc_table_offset,
            dict_offset: layout.dict_offset,
            postings_start: layout.postings_start,
            mmap: SegmentData::Heap(dict_bytes),
            postings: PostingsBacking::InMemory(post_bytes),
        })
    }

    /// Open a single-file segment through a read-only memory map.
    ///
    /// The file is size-capped at `MAX_SEGMENT_SIZE`, shared-locked for
    /// the segment's lifetime, mapped, and validated by
    /// [`parse_segment_mmap`] (v2 and v3 both accepted).
    ///
    /// # Errors
    /// I/O errors from open/metadata/lock/map, or
    /// `IndexError::CorruptIndex` from size/format validation.
    #[cfg(feature = "memmap2")]
    pub fn open(path: &Path) -> Result<Self, IndexError> {
        let file = std::fs::File::open(path)?;
        let file_meta = file.metadata()?;
        if file_meta.len() > MAX_SEGMENT_SIZE {
            return Err(IndexError::CorruptIndex(format!(
                "segment too large ({} bytes, max {})",
                file_meta.len(),
                MAX_SEGMENT_SIZE
            )));
        }
        // Shared advisory lock held via the stored handle for the
        // segment's lifetime.
        file.try_lock_shared()
            .map_err(|e| std::io::Error::other(e.to_string()))?;
        // SAFETY: read-only copy-on-write mapping; the shared lock taken
        // above is relied on to keep cooperating writers from mutating
        // the file while it is mapped.
        let mmap = unsafe { MmapOptions::new().map_copy_read_only(&file)? };
        let len = mmap.len();
        let layout = parse_segment_mmap(&mmap, &[FORMAT_VERSION_V2, FORMAT_VERSION_V3])?;
        Ok(MmapSegment {
            _file: Some(file),
            mmap: SegmentData::Mmap(mmap),
            expected_len: len,
            doc_count: layout.doc_count,
            gram_count: layout.gram_count,
            doc_table_offset: layout.doc_table_offset,
            dict_offset: layout.dict_offset,
            postings_start: layout.postings_start,
            postings: PostingsBacking::V2Mmap,
        })
    }

    /// Open a v3 split segment: a memory-mapped dictionary file plus a
    /// separate postings file (validated, then served by positional reads).
    ///
    /// # Errors
    /// I/O errors from open/metadata/lock/map, or
    /// `IndexError::CorruptIndex` for size/format/checksum failures in
    /// either file.
    #[cfg(feature = "memmap2")]
    pub fn open_split(dict_path: &Path, post_path: &Path) -> Result<Self, IndexError> {
        let file = std::fs::File::open(dict_path)?;
        let file_meta = file.metadata()?;
        if file_meta.len() > MAX_SEGMENT_SIZE {
            return Err(IndexError::CorruptIndex(format!(
                "dict file too large ({} bytes, max {})",
                file_meta.len(),
                MAX_SEGMENT_SIZE
            )));
        }
        file.try_lock_shared()
            .map_err(|e| std::io::Error::other(e.to_string()))?;
        // SAFETY: read-only copy-on-write mapping; the shared lock taken
        // above is relied on to keep cooperating writers from mutating
        // the file while it is mapped.
        let mmap = unsafe { MmapOptions::new().map_copy_read_only(&file)? };
        let len = mmap.len();
        // The split (dict + post) layout exists only in v3.
        let layout = parse_segment_mmap(&mmap, &[FORMAT_VERSION_V3])?;
        let post_file = std::fs::File::open(post_path)?;
        post_file
            .try_lock_shared()
            .map_err(|e| std::io::Error::other(e.to_string()))?;
        Self::verify_post_file(&post_file)?;
        Ok(MmapSegment {
            _file: Some(file),
            mmap: SegmentData::Mmap(mmap),
            expected_len: len,
            doc_count: layout.doc_count,
            gram_count: layout.gram_count,
            doc_table_offset: layout.doc_table_offset,
            dict_offset: layout.dict_offset,
            // Posting offsets in a split segment address the post file,
            // so there is no postings region inside the dict mmap.
            postings_start: 0,
            postings: PostingsBacking::V3File(post_file),
        })
    }

    /// Validate a v3 postings file: leading "SNTXPOST" magic and a
    /// trailing xxh64 checksum over everything between magic and checksum.
    ///
    /// # Errors
    /// I/O errors from metadata/reads, or `IndexError::CorruptIndex` when
    /// the file is too small/large, has the wrong magic, or fails the
    /// checksum.
    #[cfg(feature = "memmap2")]
    fn verify_post_file(post_file: &std::fs::File) -> Result<(), IndexError> {
        const POST_MAGIC: &[u8; 8] = b"SNTXPOST";
        // 8-byte magic + 8-byte trailing checksum.
        const POST_MIN_SIZE: usize = 8 + 8;
        let post_meta = post_file.metadata()?;
        // try_from instead of `as usize`: a >4 GiB file on a 32-bit
        // target would otherwise silently truncate and corrupt every
        // offset derived below.
        let post_len = usize::try_from(post_meta.len()).map_err(|_| {
            IndexError::CorruptIndex("post file too large for this platform".into())
        })?;
        if post_len < POST_MIN_SIZE {
            return Err(IndexError::CorruptIndex(format!(
                "post file too small: {post_len} bytes"
            )));
        }
        let mut post_magic = [0u8; 8];
        read_exact_at(post_file, &mut post_magic, 0)?;
        if &post_magic != POST_MAGIC {
            return Err(IndexError::CorruptIndex(
                "post file has wrong magic (expected SNTXPOST)".into(),
            ));
        }
        let checksum_offset = (post_len - 8) as u64;
        let mut stored_cksum_bytes = [0u8; 8];
        read_exact_at(post_file, &mut stored_cksum_bytes, checksum_offset)?;
        let stored_post_checksum = u64::from_le_bytes(stored_cksum_bytes);
        // NOTE(review): the whole payload is loaded into memory just to
        // hash it; a streaming xxh64 would avoid the allocation for very
        // large post files.
        let postings_data_len = post_len - POST_MIN_SIZE;
        let mut postings_data = vec![0u8; postings_data_len];
        if postings_data_len > 0 {
            read_exact_at(post_file, &mut postings_data, 8)?;
        }
        if xxh64(&postings_data, 0) != stored_post_checksum {
            return Err(IndexError::CorruptIndex(
                "post file checksum mismatch".into(),
            ));
        }
        Ok(())
    }
}