use std::fs;
use std::io::{BufRead, BufReader, Read, Seek, SeekFrom, Write};
use std::path::{Path, PathBuf};
use std::time::SystemTime;
use crate::error::{Error, Result};
/// Magic bytes written at the very start of a serialized index; `IndexFile::read_from`
/// rejects any input that does not begin with them.
pub const MAGIC: &[u8; 8] = b"SHUFLIDX";
/// Format version stored by `IndexFile::build`; `IndexFile::read_from` rejects any
/// other value.
pub const CURRENT_VERSION: u8 = 1;
/// Domain-separation prefix fed to the hasher before any other fingerprint input.
const FINGERPRINT_DOMAIN: &[u8] = b"shuflr-v2-fingerprint\0";
/// Size in bytes of each content sample (head, middle, tail) hashed into a fingerprint.
const SAMPLE_BYTES: usize = 4096;
/// Returns the sidecar index path for `input`.
///
/// The `.shuflr-idx` suffix is appended to the complete path (existing extension
/// included), so `/tmp/foo.jsonl` maps to `/tmp/foo.jsonl.shuflr-idx`.
pub fn sidecar_path(input: &Path) -> PathBuf {
    let mut raw = input.to_path_buf().into_os_string();
    raw.push(".shuflr-idx");
    raw.into()
}
/// 32-byte digest identifying one on-disk state of an input file.
///
/// Produced by `Fingerprint::from_metadata` and compared for equality by
/// `IndexFile::verify_fingerprint`.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub struct Fingerprint(pub [u8; 32]);
impl Fingerprint {
pub fn from_metadata(path: &Path) -> Result<Self> {
let meta = fs::metadata(path).map_err(Error::Io)?;
let size = meta.len();
let mtime_ns = meta
.modified()
.ok()
.and_then(|t| t.duration_since(SystemTime::UNIX_EPOCH).ok())
.map(|d| d.as_nanos())
.unwrap_or(0);
#[cfg(unix)]
let (ino, dev) = {
use std::os::unix::fs::MetadataExt as _;
(meta.ino(), meta.dev())
};
#[cfg(not(unix))]
let (ino, dev): (u64, u64) = (0, 0);
let basename = path
.file_name()
.map(|s| s.as_encoded_bytes().to_vec())
.unwrap_or_default();
let mut head = vec![];
let mut mid = vec![];
let mut tail = vec![];
if size > 0 {
let mut f = fs::File::open(path).map_err(Error::Io)?;
let head_len = std::cmp::min(size as usize, SAMPLE_BYTES);
head = vec![0u8; head_len];
f.read_exact(&mut head).map_err(Error::Io)?;
if size as usize > 3 * SAMPLE_BYTES {
let mid_start = size / 2 - (SAMPLE_BYTES as u64) / 2;
f.seek(SeekFrom::Start(mid_start)).map_err(Error::Io)?;
mid = vec![0u8; SAMPLE_BYTES];
f.read_exact(&mut mid).map_err(Error::Io)?;
let tail_start = size - SAMPLE_BYTES as u64;
f.seek(SeekFrom::Start(tail_start)).map_err(Error::Io)?;
tail = vec![0u8; SAMPLE_BYTES];
f.read_exact(&mut tail).map_err(Error::Io)?;
}
}
let mut h = blake3::Hasher::new();
h.update(FINGERPRINT_DOMAIN);
h.update(&basename);
h.update(b"\0");
h.update(&size.to_le_bytes());
h.update(&mtime_ns.to_le_bytes());
h.update(&ino.to_le_bytes());
h.update(&dev.to_le_bytes());
for sample in [&head, &mid, &tail] {
h.update(&(sample.len() as u64).to_le_bytes());
h.update(sample);
}
Ok(Self(*h.finalize().as_bytes()))
}
}
/// In-memory form of a shuflr index: a fingerprint plus record boundary offsets.
#[derive(Debug)]
pub struct IndexFile {
// Format version: `CURRENT_VERSION` for built indexes, or the value read from disk.
pub version: u8,
// Fingerprint supplied at build time or read from disk; checked by `verify_fingerprint`.
pub fingerprint: Fingerprint,
// Record boundaries as byte offsets: record `i` spans `offsets[i]..offsets[i + 1]`,
// so an index over N records holds N + 1 entries (`build` always starts with 0).
pub offsets: Vec<u64>,
}
impl IndexFile {
pub fn count(&self) -> u64 {
(self.offsets.len().saturating_sub(1)) as u64
}
pub fn record_range(&self, i: usize) -> (u64, u64) {
(self.offsets[i], self.offsets[i + 1])
}
pub fn record_len(&self, i: usize) -> u64 {
self.offsets[i + 1] - self.offsets[i]
}
pub fn build<R: Read>(reader: R, fingerprint: Fingerprint) -> Result<Self> {
let mut buf: Vec<u8> = Vec::with_capacity(8 * 1024);
let mut offsets: Vec<u64> = Vec::new();
let mut cursor: u64 = 0;
let mut reader = BufReader::with_capacity(2 * 1024 * 1024, reader);
offsets.push(0);
loop {
buf.clear();
let n = reader.read_until(b'\n', &mut buf).map_err(Error::Io)?;
if n == 0 {
break;
}
cursor += n as u64;
offsets.push(cursor);
}
Ok(Self {
version: CURRENT_VERSION,
fingerprint,
offsets,
})
}
pub fn write_to(&self, mut out: impl Write) -> Result<()> {
out.write_all(MAGIC).map_err(Error::Io)?;
out.write_all(&[self.version]).map_err(Error::Io)?;
out.write_all(&[0u8; 7]).map_err(Error::Io)?; out.write_all(&self.fingerprint.0).map_err(Error::Io)?;
out.write_all(&self.count().to_le_bytes())
.map_err(Error::Io)?;
for off in &self.offsets {
out.write_all(&off.to_le_bytes()).map_err(Error::Io)?;
}
Ok(())
}
pub fn read_from(mut r: impl Read) -> Result<Self> {
let mut magic = [0u8; 8];
r.read_exact(&mut magic).map_err(Error::Io)?;
if &magic != MAGIC {
return Err(Error::Input(format!(
"not a shuflr index (magic {magic:?} != {MAGIC:?})"
)));
}
let mut version_buf = [0u8; 1];
r.read_exact(&mut version_buf).map_err(Error::Io)?;
if version_buf[0] != CURRENT_VERSION {
return Err(Error::Input(format!(
"unsupported shuflr-idx version {} (this build expects {CURRENT_VERSION})",
version_buf[0]
)));
}
let mut reserved = [0u8; 7];
r.read_exact(&mut reserved).map_err(Error::Io)?;
let mut fp = [0u8; 32];
r.read_exact(&mut fp).map_err(Error::Io)?;
let mut count_buf = [0u8; 8];
r.read_exact(&mut count_buf).map_err(Error::Io)?;
let count = u64::from_le_bytes(count_buf);
let entries = (count as usize) + 1;
let mut offsets = Vec::with_capacity(entries);
let mut off_buf = [0u8; 8];
for _ in 0..entries {
r.read_exact(&mut off_buf).map_err(Error::Io)?;
offsets.push(u64::from_le_bytes(off_buf));
}
Ok(Self {
version: version_buf[0],
fingerprint: Fingerprint(fp),
offsets,
})
}
pub fn save(&self, path: &Path) -> Result<()> {
let tmp = {
let mut s = path.as_os_str().to_os_string();
s.push(".tmp");
PathBuf::from(s)
};
{
let mut f = fs::File::create(&tmp).map_err(Error::Io)?;
self.write_to(&mut f)?;
f.sync_all().map_err(Error::Io)?;
}
fs::rename(&tmp, path).map_err(Error::Io)?;
Ok(())
}
pub fn load(path: &Path) -> Result<Self> {
let f = fs::File::open(path).map_err(Error::Io)?;
Self::read_from(BufReader::with_capacity(2 * 1024 * 1024, f))
}
pub fn verify_fingerprint(&self, expected: Fingerprint) -> Result<()> {
if self.fingerprint == expected {
Ok(())
} else {
Err(Error::InputChanged(
"index fingerprint mismatches current file; rebuild with `shuflr index`".into(),
))
}
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Puts the timestamps of `path` back to the ones captured in `before`.
    ///
    /// Uses the portable `File::set_times` API and then verifies that the
    /// modification time really was restored, so a test relying on an unchanged
    /// mtime cannot pass by accident when the restore did not take effect.
    fn restore_times(path: &Path, before: &std::fs::Metadata) {
        let modified = before.modified().unwrap();
        let mut times = std::fs::FileTimes::new().set_modified(modified);
        // Access time is not available everywhere; restore it only when it is.
        if let Ok(accessed) = before.accessed() {
            times = times.set_accessed(accessed);
        }
        let file = std::fs::OpenOptions::new().write(true).open(path).unwrap();
        file.set_times(times).unwrap();
        drop(file);
        let after = std::fs::metadata(path).unwrap();
        assert_eq!(
            after.modified().unwrap(),
            modified,
            "test setup failed: modification time was not restored"
        );
    }

    #[test]
    fn build_tracks_record_boundaries() {
        let text = b"alpha\nbravo\ncharlie\ndelta\n";
        let idx = IndexFile::build(&text[..], Fingerprint([0; 32])).unwrap();
        assert_eq!(idx.count(), 4);
        assert_eq!(idx.record_range(0), (0, 6));
        assert_eq!(idx.record_range(1), (6, 12));
        assert_eq!(idx.record_range(2), (12, 20));
        assert_eq!(idx.record_range(3), (20, 26));
    }

    #[test]
    fn build_handles_missing_trailing_newline() {
        let text = b"first\nsecond";
        let idx = IndexFile::build(&text[..], Fingerprint([0; 32])).unwrap();
        assert_eq!(idx.count(), 2);
        assert_eq!(idx.record_range(1), (6, 12));
    }

    #[test]
    fn empty_input_zero_records() {
        let idx = IndexFile::build(&b""[..], Fingerprint([0; 32])).unwrap();
        assert_eq!(idx.count(), 0);
        assert_eq!(idx.offsets, vec![0]);
    }

    #[test]
    fn write_read_roundtrip() {
        let idx = IndexFile::build(&b"a\nb\nc\nd\n"[..], Fingerprint([0xab; 32])).unwrap();
        let mut buf = Vec::new();
        idx.write_to(&mut buf).unwrap();
        let parsed = IndexFile::read_from(&buf[..]).unwrap();
        assert_eq!(parsed.count(), idx.count());
        assert_eq!(parsed.offsets, idx.offsets);
        assert_eq!(parsed.fingerprint, idx.fingerprint);
        assert_eq!(parsed.version, CURRENT_VERSION);
    }

    #[test]
    fn wrong_magic_rejected() {
        let bad = b"NOT_SHUF\x01";
        let err = IndexFile::read_from(&bad[..]).unwrap_err();
        assert!(matches!(err, Error::Input(_)));
    }

    #[test]
    fn wrong_version_rejected() {
        let mut bad = Vec::new();
        bad.extend_from_slice(MAGIC);
        bad.push(99);
        bad.extend_from_slice(&[0u8; 7]);
        bad.extend_from_slice(&[0u8; 32]);
        bad.extend_from_slice(&0u64.to_le_bytes());
        bad.extend_from_slice(&0u64.to_le_bytes());
        let err = IndexFile::read_from(&bad[..]).unwrap_err();
        assert!(matches!(err, Error::Input(_)));
    }

    #[test]
    fn truncated_offset_table_rejected() {
        // Header announces 5 records (6 offsets) but only one offset follows.
        let mut bad = Vec::new();
        bad.extend_from_slice(MAGIC);
        bad.push(CURRENT_VERSION);
        bad.extend_from_slice(&[0u8; 7]);
        bad.extend_from_slice(&[0u8; 32]);
        bad.extend_from_slice(&5u64.to_le_bytes());
        bad.extend_from_slice(&0u64.to_le_bytes());
        let err = IndexFile::read_from(&bad[..]).unwrap_err();
        assert!(matches!(err, Error::Io(_)));
    }

    #[test]
    fn save_and_load_atomic() {
        let tmp = tempfile::tempdir().unwrap();
        let idx_path = tmp.path().join("data.shuflr-idx");
        let idx = IndexFile::build(&b"x\ny\nz\n"[..], Fingerprint([7; 32])).unwrap();
        idx.save(&idx_path).unwrap();
        let loaded = IndexFile::load(&idx_path).unwrap();
        assert_eq!(loaded.offsets, idx.offsets);
        assert_eq!(loaded.fingerprint, idx.fingerprint);
    }

    #[test]
    fn fingerprint_metadata_is_stable_on_unchanged_file() {
        let tmp = tempfile::NamedTempFile::new().unwrap();
        std::fs::write(tmp.path(), b"hello\n").unwrap();
        let a = Fingerprint::from_metadata(tmp.path()).unwrap();
        let b = Fingerprint::from_metadata(tmp.path()).unwrap();
        assert_eq!(a, b);
    }

    #[test]
    fn sidecar_path_is_input_plus_suffix() {
        let p = Path::new("/tmp/foo.jsonl");
        assert_eq!(sidecar_path(p), PathBuf::from("/tmp/foo.jsonl.shuflr-idx"));
    }

    #[test]
    fn fingerprint_detects_middle_only_patch() {
        let tmp = tempfile::NamedTempFile::new().unwrap();
        let path = tmp.path();
        let size = 64 * 1024;
        let mut body = vec![b'A'; size];
        std::fs::write(path, &body).unwrap();
        let meta_before = std::fs::metadata(path).unwrap();
        let fp_a = Fingerprint::from_metadata(path).unwrap();
        // Flip one byte in the middle; size stays the same and the mtime is restored.
        body[size / 2] = b'B';
        std::fs::write(path, &body).unwrap();
        restore_times(path, &meta_before);
        let fp_b = Fingerprint::from_metadata(path).unwrap();
        assert_ne!(
            fp_a, fp_b,
            "middle-byte patch with preserved mtime must still flip the fingerprint"
        );
    }

    #[test]
    fn fingerprint_detects_in_place_rewrite_of_same_size() {
        let tmp = tempfile::NamedTempFile::new().unwrap();
        let path = tmp.path();
        let body_a = vec![b'A'; 16 * 1024];
        let body_b = {
            let mut v = body_a.clone();
            v[0] = b'B';
            v
        };
        std::fs::write(path, &body_a).unwrap();
        let meta_before = std::fs::metadata(path).unwrap();
        let fp_a = Fingerprint::from_metadata(path).unwrap();
        std::fs::write(path, &body_b).unwrap();
        restore_times(path, &meta_before);
        let fp_b = Fingerprint::from_metadata(path).unwrap();
        assert_ne!(
            fp_a, fp_b,
            "fingerprint must distinguish same-size, same-mtime rewrites"
        );
    }
}