use std::path::Path;
use super::*;
use tempfile::TempDir;
/// Writing a segment with no documents or postings must still produce both
/// files on disk, and the dictionary file must reopen with zero counts.
#[test]
fn round_trip_empty_segment() {
    let tmp = TempDir::new().unwrap();
    let root = tmp.path();

    let empty_writer = SegmentWriter::new();
    let written = empty_writer.write_to_dir(root).unwrap();

    // Resolve both output paths once; they are reused for the existence
    // checks and for reopening the segment.
    let dict_path = root.join(&written.dict_filename);
    let post_path = root.join(&written.post_filename);

    // The metadata returned by the writer reports an empty segment.
    assert_eq!(written.doc_count, 0);
    assert_eq!(written.gram_count, 0);
    assert!(dict_path.exists());
    assert!(post_path.exists());

    // The reopened segment reports the same empty counts.
    let reopened = MmapSegment::open(&dict_path).unwrap();
    assert_eq!(reopened.doc_count, 0);
    assert_eq!(reopened.gram_count, 0);
}
/// Writes two documents and two grams (one gram shared by both documents),
/// reopens the pair of files with `open_split`, and checks that document
/// metadata and the posting list for the shared gram are read back intact.
#[test]
fn round_trip_with_docs_and_grams() {
    let tmp = TempDir::new().unwrap();
    let root = tmp.path();

    let mut builder = SegmentWriter::new();
    builder.add_document(0, Path::new("src/main.rs"), 0xDEAD, 100);
    builder.add_document(1, Path::new("src/lib.rs"), 0xBEEF, 200);
    // Gram 0xAAAA appears in both documents, 0xBBBB only in document 0.
    builder.add_gram_posting(0xAAAA, 0);
    builder.add_gram_posting(0xAAAA, 1);
    builder.add_gram_posting(0xBBBB, 0);

    let written = builder.write_to_dir(root).unwrap();
    let dict_path = root.join(&written.dict_filename);
    let post_path = root.join(&written.post_filename);

    assert_eq!(written.doc_count, 2);
    assert_eq!(written.gram_count, 2);
    assert!(dict_path.exists());
    assert!(post_path.exists());

    let segment = MmapSegment::open_split(&dict_path, &post_path).unwrap();
    assert_eq!(segment.doc_count, 2);

    let first_doc = segment.get_doc(0).unwrap();
    assert_eq!(first_doc.path, Path::new("src/main.rs"));
    assert_eq!(first_doc.content_hash, 0xDEAD);

    let shared_gram = segment.lookup_gram(0xAAAA).unwrap();
    let ids = shared_gram.to_vec().unwrap();
    assert_eq!(ids, vec![0, 1]);
}
/// Adding the same (gram, doc) posting twice must not inflate the gram's
/// cardinality: three `add_gram_posting` calls covering two distinct
/// documents are expected to yield one gram with cardinality 2.
#[test]
fn duplicate_postings_are_deduplicated() {
    let tmp = TempDir::new().unwrap();
    let root = tmp.path();

    let mut builder = SegmentWriter::new();
    builder.add_document(0, Path::new("src/main.rs"), 0xDEAD, 100);
    builder.add_document(1, Path::new("src/lib.rs"), 0xBEEF, 200);
    // The first two postings are identical on purpose.
    builder.add_gram_posting(0xAAAA, 0);
    builder.add_gram_posting(0xAAAA, 0);
    builder.add_gram_posting(0xAAAA, 1);

    let written = builder.write_to_dir(root).unwrap();
    let dict_path = root.join(&written.dict_filename);
    let post_path = root.join(&written.post_filename);

    assert_eq!(written.gram_count, 1);
    assert!(dict_path.exists());
    assert!(post_path.exists());

    let segment = MmapSegment::open_split(&dict_path, &post_path).unwrap();
    assert_eq!(segment.gram_cardinality(0xAAAA), Some(2));
}
/// A file whose contents are arbitrary text must be refused by
/// `MmapSegment::open` with an error.
#[test]
fn corrupt_file_rejected() {
    let tmp = TempDir::new().unwrap();
    let garbage_path = tmp.path().join("bad.dict");
    std::fs::write(&garbage_path, b"not a valid segment").unwrap();

    let outcome = MmapSegment::open(&garbage_path);
    assert!(outcome.is_err());
}
/// A freshly written, untouched segment must pass `verify_integrity`.
#[test]
fn verify_integrity_passes_on_clean_segment() {
    let tmp = TempDir::new().unwrap();

    let mut builder = SegmentWriter::new();
    builder.add_document(0, Path::new("a.rs"), 1, 10);
    builder.add_gram_posting(0xAA, 0);
    let written = builder.write_to_dir(tmp.path()).unwrap();

    let dict_path = tmp.path().join(&written.dict_filename);
    let segment = MmapSegment::open(&dict_path).unwrap();

    let verdict = segment.verify_integrity();
    assert!(verdict.is_ok());
}
/// Checks that a small, valid single-document segment opens successfully.
///
/// NOTE(review): the test name describes a rejection path, but no oversized
/// segment is constructed here; only the "under the limit" case is asserted.
/// Confirm whether a negative case is covered elsewhere.
#[test]
fn open_rejects_segment_exceeding_size_limit() {
    let tmp = TempDir::new().unwrap();

    let mut builder = SegmentWriter::new();
    builder.add_document(0, Path::new("a.rs"), 1, 10);
    let written = builder.write_to_dir(tmp.path()).unwrap();

    let dict_path = tmp.path().join(&written.dict_filename);
    let open_result = MmapSegment::open(&dict_path);
    assert!(
        open_result.is_ok(),
        "valid segment under size limit must open successfully"
    );
}
/// Opens a segment, then replaces the dictionary file at the same path with
/// unrelated bytes (written to a sibling file and moved into place with
/// `rename`). The already-open segment must still verify and serve reads.
#[test]
fn map_private_copy_unaffected_by_post_open_file_mutation() {
    let tmp = TempDir::new().unwrap();
    let root = tmp.path();

    let mut builder = SegmentWriter::new();
    builder.add_document(0, Path::new("a.rs"), 1, 10);
    builder.add_gram_posting(0xAA, 0);
    let written = builder.write_to_dir(root).unwrap();

    let dict_path = root.join(&written.dict_filename);
    let segment = MmapSegment::open(&dict_path).unwrap();

    // Swap the on-disk file out from under the open segment.
    let stand_in = root.join("replacement.dict");
    std::fs::write(&stand_in, b"SNTX_corrupted_on_disk").unwrap();
    std::fs::rename(&stand_in, &dict_path).unwrap();

    let integrity = segment.verify_integrity();
    assert!(
        integrity.is_ok(),
        "mmap must survive atomic file replacement via rename"
    );

    let doc = segment.get_doc(0);
    assert!(
        doc.is_some(),
        "mmap must serve doc reads after file replacement"
    );
}
/// The arguments to `SegmentWriter::with_capacity` are only a hint: adding
/// 100 distinct grams to a writer created with `with_capacity(1, 2)` must
/// neither panic nor prevent the segment from being written.
#[test]
fn with_capacity_hint_does_not_panic_when_exceeded() {
    let mut builder = SegmentWriter::with_capacity(1, 2);
    builder.add_document(0, Path::new("a.rs"), 1, 10);

    // Far more grams than the capacity arguments above.
    (0u64..100).for_each(|gram| {
        builder.add_gram_posting(gram, 0);
    });

    let tmp = TempDir::new().unwrap();
    let outcome = builder.write_to_dir(tmp.path());
    assert!(outcome.is_ok());
}
/// Registering two documents under the same id must make the write fail.
/// The error is observed on `write_to_dir`, not on the `add_document` call.
#[test]
fn add_document_rejects_duplicate_doc_ids() {
    let tmp = TempDir::new().unwrap();

    let mut builder = SegmentWriter::new();
    builder.add_document(0, Path::new("a.rs"), 1, 10);
    builder.add_document(1, Path::new("b.rs"), 2, 20);
    // Same id as the previous document, different path and hash.
    builder.add_document(1, Path::new("c.rs"), 3, 30);

    let result = builder.write_to_dir(tmp.path());
    assert!(result.is_err(), "duplicate doc_id must be rejected");
}
/// Guards the format-version constants: the V2 and V3 identifiers must
/// differ, and the current `FORMAT_VERSION` must be the V3 value.
#[test]
fn format_version_constants_are_distinct() {
// Distinct values are needed to tell the two versions apart.
assert_ne!(FORMAT_VERSION_V2, FORMAT_VERSION_V3);
// The current format version is pinned to V3.
assert_eq!(FORMAT_VERSION, FORMAT_VERSION_V3);
}
/// Pins `DICT_ENTRY_SIZE` to 20.
///
/// NOTE(review): the expected value is a hard-coded literal rather than a
/// sum of field sizes; presumably 20 is the total of the entry's fields —
/// confirm against the entry layout definition.
#[test]
fn dict_entry_size_matches_components() {
assert_eq!(DICT_ENTRY_SIZE, 20);
}
/// The writer must emit the dictionary and postings files named in the
/// returned metadata, and must not leave any `.seg` file in the output
/// directory.
#[test]
fn v3_writer_produces_two_files() {
    let tmp = TempDir::new().unwrap();
    let root = tmp.path();

    let mut builder = SegmentWriter::new();
    builder.add_document(0, Path::new("src/lib.rs"), 0xABCD, 100);
    builder.add_gram_posting(0x1234, 0);
    let written = builder.write_to_dir(root).unwrap();

    let dict_path = root.join(&written.dict_filename);
    assert!(dict_path.exists(), "missing .dict");

    let post_path = root.join(&written.post_filename);
    assert!(post_path.exists(), "missing .post");

    // Scan the output directory for any file name ending in ".seg".
    let any_seg = std::fs::read_dir(root)
        .unwrap()
        .map(|entry| entry.unwrap().file_name())
        .any(|name| name.to_string_lossy().ends_with(".seg"));
    assert!(!any_seg, "v3 writer must not produce a .seg file");
}
/// Round-trips two documents and two grams through `open_split` and checks
/// `lookup_gram` for a gram present in both documents, a gram present in
/// one, and a gram that was never added.
#[test]
fn v3_round_trip_lookup_gram() {
    let tmp = TempDir::new().unwrap();
    let root = tmp.path();

    let mut builder = SegmentWriter::new();
    builder.add_document(0, Path::new("src/main.rs"), 0xDEAD, 100);
    builder.add_document(1, Path::new("src/lib.rs"), 0xBEEF, 200);
    builder.add_gram_posting(0xAAAA, 0);
    builder.add_gram_posting(0xAAAA, 1);
    builder.add_gram_posting(0xBBBB, 0);
    let written = builder.write_to_dir(root).unwrap();

    let dict_path = root.join(&written.dict_filename);
    let post_path = root.join(&written.post_filename);
    let segment = MmapSegment::open_split(&dict_path, &post_path).unwrap();

    assert_eq!(segment.doc_count, 2);
    let first_doc = segment.get_doc(0).unwrap();
    assert_eq!(first_doc.path, Path::new("src/main.rs"));

    // Gram shared by both documents.
    let in_both = segment.lookup_gram(0xAAAA).unwrap();
    assert_eq!(in_both.to_vec().unwrap(), vec![0, 1]);

    // Gram present only in document 0.
    let in_first = segment.lookup_gram(0xBBBB).unwrap();
    assert_eq!(in_first.to_vec().unwrap(), vec![0]);

    // Gram that was never added.
    assert!(segment.lookup_gram(0xCCCC).is_none());
}
/// Round-trips a single document through `open_split`: `get_doc(0)` must
/// return the stored path and content hash, and `get_doc(1)` (an id that
/// was never added) must return `None`.
#[test]
fn v3_round_trip_get_doc() {
    let tmp = TempDir::new().unwrap();
    let root = tmp.path();

    let mut builder = SegmentWriter::new();
    builder.add_document(0, Path::new("a.rs"), 0xAA, 10);
    builder.add_gram_posting(0x11, 0);
    let written = builder.write_to_dir(root).unwrap();

    let dict_path = root.join(&written.dict_filename);
    let post_path = root.join(&written.post_filename);
    let segment = MmapSegment::open_split(&dict_path, &post_path).unwrap();

    let stored = segment.get_doc(0).unwrap();
    assert_eq!(stored.path, Path::new("a.rs"));
    assert_eq!(stored.content_hash, 0xAA);

    assert!(segment.get_doc(1).is_none());
}
/// Unix only: a document path containing a byte that is not valid UTF-8
/// (`0xff`) must come back from the segment equal to the path that was
/// written.
#[cfg(unix)]
#[test]
fn round_trip_preserves_non_utf8_path_bytes() {
    use std::ffi::OsString;
    use std::os::unix::ffi::OsStringExt;

    // Build the path from raw bytes so the invalid byte survives as-is.
    let raw_name = OsString::from_vec(b"src/odd\xff.rs".to_vec());
    let odd_path = std::path::PathBuf::from(raw_name);

    let tmp = TempDir::new().unwrap();
    let mut builder = SegmentWriter::new();
    builder.add_document(0, &odd_path, 0xDEAD, 100);
    let written = builder.write_to_dir(tmp.path()).unwrap();

    let dict_path = tmp.path().join(&written.dict_filename);
    let segment = MmapSegment::open(&dict_path).unwrap();

    let stored = segment.get_doc(0).unwrap();
    assert_eq!(stored.path, odd_path);
}
/// Inspects the raw bytes of the postings file produced by the writer.
///
/// The test expects the file to start with the 8-byte magic `SNTXPOST` and
/// to end with an 8-byte little-endian xxh64 checksum (seed 0) computed over
/// every byte between the magic and the checksum.
#[test]
fn v3_post_file_has_magic_and_checksum() {
    let dir = TempDir::new().unwrap();
    let mut writer = SegmentWriter::new();
    writer.add_document(0, Path::new("src/a.rs"), 0x1234, 100);
    writer.add_gram_posting(0xAAAA, 0);
    let meta = writer.write_to_dir(dir.path()).unwrap();
    let post_bytes = std::fs::read(dir.path().join(&meta.post_filename)).unwrap();

    // Check the length before slicing, so a truncated file fails with this
    // message instead of an out-of-range slice panic.
    // 17 = 8 (magic) + at least 1 payload byte + 8 (checksum).
    assert!(post_bytes.len() >= 17, "post file too short");

    let (magic, rest) = post_bytes.split_at(8);
    assert_eq!(magic, b"SNTXPOST", "missing .post magic header");

    // Everything after the magic except the last 8 bytes is checksummed.
    let (postings_data, trailer) = rest.split_at(rest.len() - 8);
    let expected_checksum = xxhash_rust::xxh64::xxh64(postings_data, 0);
    let stored_checksum = u64::from_le_bytes(trailer.try_into().unwrap());
    assert_eq!(stored_checksum, expected_checksum, "checksum mismatch");
}
/// Overwrites the first byte of the postings file and expects
/// `MmapSegment::open_split` to return an error for the damaged file.
#[test]
fn open_split_rejects_corrupt_post_file() {
    let tmp = TempDir::new().unwrap();
    let root = tmp.path();

    let mut builder = SegmentWriter::new();
    builder.add_document(0, Path::new("src/a.rs"), 0xABCD, 100);
    builder.add_gram_posting(0x1111, 0);
    let written = builder.write_to_dir(root).unwrap();

    let dict_path = root.join(&written.dict_filename);
    let post_path = root.join(&written.post_filename);

    // Clobber the first byte of the postings file and write it back.
    let mut damaged = std::fs::read(&post_path).unwrap();
    damaged[0] = b'X';
    std::fs::write(&post_path, &damaged).unwrap();

    let result = MmapSegment::open_split(&dict_path, &post_path);
    assert!(
        result.is_err(),
        "open_split must reject corrupt .post magic"
    );
}
/// Crafts a dictionary file whose first 8 bytes after the header hold an
/// offset pointing just past the start of the dict section, re-computes the
/// footer checksum so the file still opens, and expects `get_doc(0)` to
/// return `None` rather than decode a document from that location.
#[test]
fn get_doc_rejects_abs_off_pointing_into_dict_section() {
    use xxhash_rust::xxh64::xxh64;

    let tmp = TempDir::new().unwrap();
    let mut builder = SegmentWriter::new();
    builder.add_document(0, Path::new("a.rs"), 0xABCD, 10);
    builder.add_gram_posting(0x1111, 0);
    let written = builder.write_to_dir(tmp.path()).unwrap();

    let source_path = tmp.path().join(&written.dict_filename);
    let mut image = std::fs::read(&source_path).unwrap();
    let footer_at = image.len() - FOOTER_SIZE;

    // Footer bytes 16..24 are read as the little-endian dict offset.
    let mut dict_offset_raw = [0u8; 8];
    dict_offset_raw.copy_from_slice(&image[footer_at + 16..footer_at + 24]);
    let dict_section_start = u64::from_le_bytes(dict_offset_raw) as usize;

    // NOTE(review): assumes the first doc-table entry sits directly at
    // HEADER_SIZE — confirm against the writer's layout.
    let first_entry_at = HEADER_SIZE;
    let bogus_offset = (dict_section_start + 4) as u64;
    image[first_entry_at..first_entry_at + 8].copy_from_slice(&bogus_offset.to_le_bytes());

    // Re-hash everything before the footer and store the result in footer
    // bytes 32..40, so only the offset check can reject the crafted value.
    let refreshed_checksum = xxh64(&image[..footer_at], 0);
    image[footer_at + 32..footer_at + 40].copy_from_slice(&refreshed_checksum.to_le_bytes());

    let crafted_path = tmp.path().join("crafted.dict");
    std::fs::write(&crafted_path, &image).unwrap();

    let segment = MmapSegment::open(&crafted_path).unwrap();
    assert!(
        segment.get_doc(0).is_none(),
        "get_doc must return None when abs_off points into dict section"
    );
}
/// Opens a single-document segment, replaces the dictionary file at the same
/// path with unrelated bytes via `rename`, and checks that the already-open
/// segment can still return document 0.
#[test]
fn mmap_isolation_from_disk_overwrite() {
    let tmp = TempDir::new().unwrap();
    let root = tmp.path();

    let mut builder = SegmentWriter::new();
    builder.add_document(0, Path::new("a.rs"), 0xABCD, 10);
    let written = builder.write_to_dir(root).unwrap();

    let dict_path = root.join(&written.dict_filename);
    let segment = MmapSegment::open(&dict_path).unwrap();

    // Write the junk to a sibling file, then move it over the original path.
    let stand_in = root.join("replacement.dict");
    std::fs::write(&stand_in, b"CORRUPTED").unwrap();
    std::fs::rename(&stand_in, &dict_path).unwrap();

    let fetched = segment.get_doc(0);
    assert!(
        fetched.is_some(),
        "mmap must survive atomic file replacement via rename"
    );
}
/// Crafts a dictionary file in which the 8-byte field at `dict_offset + 8`
/// is overwritten with the doc-table offset, a value below the
/// `postings_start` this test computes. After the footer checksum is
/// re-computed so the file still opens, `lookup_gram` must return `None`
/// for the affected gram.
#[test]
fn v2_posting_offset_below_postings_start_returns_none() {
    use xxhash_rust::xxh64::xxh64;

    let tmp = TempDir::new().unwrap();
    let mut builder = SegmentWriter::new();
    builder.add_document(0, Path::new("a.rs"), 0xABCD, 10);
    builder.add_gram_posting(0x1111_2222_3333_4444u64, 0);
    let written = builder.write_to_dir(tmp.path()).unwrap();

    let source_path = tmp.path().join(&written.dict_filename);
    let mut image = std::fs::read(&source_path).unwrap();
    let footer_at = image.len() - FOOTER_SIZE;

    // Reads a little-endian u64 at a byte offset relative to the footer.
    let footer_u64 = |buf: &[u8], rel: usize| -> u64 {
        u64::from_le_bytes(buf[footer_at + rel..footer_at + rel + 8].try_into().unwrap())
    };

    // Footer fields as this test reads them: bytes 0..8 doc-table offset,
    // 16..24 dict offset, 24..28 document count (u32).
    let doc_table_offset = footer_u64(&image[..], 0) as usize;
    let dict_offset = footer_u64(&image[..], 16) as usize;
    let doc_count = u32::from_le_bytes(
        image[footer_at + 24..footer_at + 28]
            .try_into()
            .unwrap(),
    );

    // NOTE(review): presumes 8 bytes per doc-table entry with postings
    // following the table — confirm against the writer's layout.
    let postings_start = doc_table_offset + doc_count as usize * 8;
    assert!(
        postings_start > HEADER_SIZE,
        "postings_start({postings_start}) must exceed HEADER_SIZE({HEADER_SIZE}) \
         for this test to distinguish old vs new check"
    );

    // Overwrite the 8-byte field that starts 8 bytes into the dict section.
    let offset_field_at = dict_offset + 8;
    let crafted_abs_off = doc_table_offset as u64;
    image[offset_field_at..offset_field_at + 8].copy_from_slice(&crafted_abs_off.to_le_bytes());

    // Re-hash everything before the footer and store it in footer bytes
    // 32..40 so the crafted file is not rejected on checksum grounds.
    let refreshed_checksum = xxh64(&image[..footer_at], 0);
    image[footer_at + 32..footer_at + 40].copy_from_slice(&refreshed_checksum.to_le_bytes());

    let crafted_path = tmp.path().join("crafted_b03.dict");
    std::fs::write(&crafted_path, &image).unwrap();

    let segment = MmapSegment::open(&crafted_path).unwrap();
    let result = segment.lookup_gram(0x1111_2222_3333_4444u64);
    assert!(
        result.is_none(),
        "lookup_gram must return None when abs_off({crafted_abs_off}) < \
         postings_start({postings_start}): {result:?}"
    );
}