use std::collections::{HashMap, HashSet};
use std::fs::File;
use std::io::{BufReader, BufWriter, Read, Seek, SeekFrom, Write};
use std::path::Path;
/// Fixed per-record header size in bytes: a 4-byte little-endian `u32` qnum
/// followed by a 2-byte little-endian `u16` payload length (4 + 2 = 6).
const HEADER_BYTES: usize = 6;
/// Append-only writer for a label spill journal.
///
/// Every record is laid out as
/// `qnum: u32 (LE) | len: u16 (LE) | len bytes of UTF-8 label`
/// and is written through a 64 KiB buffer. Call [`LabelSpillWriter::finish`]
/// to flush and sync the journal; dropping the writer without calling it
/// swallows any flush error.
#[derive(Debug)]
pub struct LabelSpillWriter {
    writer: BufWriter<File>,
}

impl LabelSpillWriter {
    /// Largest payload the 2-byte length prefix can describe.
    const MAX_LABEL_BYTES: usize = u16::MAX as usize;

    /// Creates (or truncates) the journal file at `path`.
    ///
    /// # Errors
    /// Returns the I/O error from creating the file.
    pub fn new(path: &Path) -> std::io::Result<Self> {
        let file = File::create(path)?;
        Ok(Self {
            writer: BufWriter::with_capacity(64 * 1024, file),
        })
    }

    /// Appends one `(qnum, label)` record to the journal.
    ///
    /// Labels longer than `u16::MAX` bytes are truncated. The cut is moved
    /// back to the nearest UTF-8 character boundary so the stored payload is
    /// always valid UTF-8 (a raw byte cut could split a multi-byte character,
    /// which a lossy decoder would turn into a replacement character).
    ///
    /// # Errors
    /// Returns any I/O error raised while writing to the buffered file.
    pub fn append(&mut self, qnum: u32, label: &str) -> std::io::Result<()> {
        let payload = Self::clamp_to_char_boundary(label, Self::MAX_LABEL_BYTES).as_bytes();
        let len = u16::try_from(payload.len()).expect("payload is clamped to u16::MAX bytes");
        self.writer.write_all(&qnum.to_le_bytes())?;
        self.writer.write_all(&len.to_le_bytes())?;
        self.writer.write_all(payload)?;
        Ok(())
    }

    /// Flushes buffered records, syncs the file to disk and returns the
    /// journal's size in bytes.
    ///
    /// # Errors
    /// Returns the original I/O error (kind and OS code preserved) if the
    /// flush, the sync, or the metadata lookup fails.
    pub fn finish(self) -> std::io::Result<u64> {
        // `into_inner` flushes the buffer itself; `into_error` hands back the
        // underlying `io::Error` instead of a stringified copy.
        let file = self.writer.into_inner().map_err(|e| e.into_error())?;
        file.sync_all()?;
        Ok(file.metadata()?.len())
    }

    /// Returns the longest prefix of `label` that is at most `max_bytes` long
    /// and ends on a character boundary.
    fn clamp_to_char_boundary(label: &str, max_bytes: usize) -> &str {
        if label.len() <= max_bytes {
            return label;
        }
        let mut end = max_bytes;
        // Index 0 is always a boundary, so this loop terminates.
        while !label.is_char_boundary(end) {
            end -= 1;
        }
        &label[..end]
    }
}
/// Scans the spill journal at `path` and returns the labels recorded for the
/// qnums in `wanted`.
///
/// The journal is a sequence of records laid out as
/// `qnum: u32 (LE) | len: u16 (LE) | len payload bytes`. Behaviour per record:
///
/// * records whose qnum is not in `wanted` are skipped without reading the payload;
/// * zero-length records are ignored, so they never insert or overwrite an entry;
/// * when a qnum appears several times, the last non-empty record wins;
/// * payloads that are not valid UTF-8 are decoded lossily (invalid sequences
///   become U+FFFD).
///
/// # Errors
/// Returns an I/O error if the file cannot be opened, if a record ends inside
/// its length field, or if the payload of a wanted record is truncated.
/// Running out of bytes while reading a record's qnum (including a partial
/// trailing qnum) is treated as the end of the journal, not as an error.
pub fn read_labels_for(
    path: &Path,
    wanted: &HashSet<u32>,
) -> std::io::Result<HashMap<u32, String>> {
    let file = File::open(path)?;
    let mut reader = BufReader::with_capacity(64 * 1024, file);
    let mut result: HashMap<u32, String> = HashMap::with_capacity(wanted.len());
    let mut qbuf = [0u8; 4];
    let mut lenbuf = [0u8; 2];
    loop {
        match reader.read_exact(&mut qbuf) {
            Ok(()) => {}
            Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => break,
            Err(e) => return Err(e),
        }
        reader.read_exact(&mut lenbuf)?;
        let qnum = u32::from_le_bytes(qbuf);
        let len = u16::from_le_bytes(lenbuf);
        if len == 0 {
            // Nothing to read or skip; empty labels are never reported.
            continue;
        }
        if wanted.contains(&qnum) {
            let mut bytes = vec![0u8; usize::from(len)];
            reader.read_exact(&mut bytes)?;
            // Reuse the buffer when it is already valid UTF-8; only fall back
            // to the copying lossy conversion for malformed payloads.
            let label = String::from_utf8(bytes)
                .unwrap_or_else(|e| String::from_utf8_lossy(e.as_bytes()).into_owned());
            result.insert(qnum, label);
        } else {
            // `seek_relative` keeps the read buffer when the target lies inside
            // it; `Seek::seek` would discard the buffer on every skipped record.
            reader.seek_relative(i64::from(len))?;
        }
    }
    Ok(result)
}
/// Returns `HEADER_BYTES`, the fixed number of header bytes counted per
/// record on top of the label payload.
// NOTE(review): nothing in this file calls this function; the `allow`
// presumably keeps it available for callers elsewhere — confirm it is still needed.
#[allow(dead_code)]
pub const fn record_overhead_bytes() -> usize {
    HEADER_BYTES
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::{Path, PathBuf};

    /// Uniquely named journal path in the system temp directory.
    ///
    /// The file is removed when the guard is dropped, so journals are cleaned
    /// up even when an assertion in the test panics.
    struct TempJournal(PathBuf);

    impl TempJournal {
        /// Picks a fresh path from the current time and a process-wide counter.
        fn new() -> Self {
            use std::sync::atomic::{AtomicU64, Ordering};
            static COUNTER: AtomicU64 = AtomicU64::new(0);
            let seq = COUNTER.fetch_add(1, Ordering::Relaxed);
            let nanos = std::time::SystemTime::now()
                .duration_since(std::time::UNIX_EPOCH)
                .expect("system clock is set before the Unix epoch")
                .as_nanos();
            Self(std::env::temp_dir().join(format!("kglite_label_spill_{}_{}.bin", nanos, seq)))
        }

        fn path(&self) -> &Path {
            &self.0
        }
    }

    impl Drop for TempJournal {
        fn drop(&mut self) {
            // Best-effort cleanup; the file may never have been created.
            let _ = std::fs::remove_file(&self.0);
        }
    }

    #[test]
    fn write_then_read_wanted_subset() {
        let journal = TempJournal::new();
        let mut w = LabelSpillWriter::new(journal.path()).unwrap();
        w.append(5, "human").unwrap();
        w.append(76, "Barack Obama").unwrap();
        w.append(20, "Norway").unwrap();
        w.append(42, "Douglas Adams").unwrap();
        let size = w.finish().unwrap();
        assert!(size > 0);
        let wanted: HashSet<u32> = [5, 20].into_iter().collect();
        let got = read_labels_for(journal.path(), &wanted).unwrap();
        assert_eq!(got.len(), 2);
        assert_eq!(got.get(&5).unwrap(), "human");
        assert_eq!(got.get(&20).unwrap(), "Norway");
        assert!(!got.contains_key(&76));
        assert!(!got.contains_key(&42));
    }

    #[test]
    fn last_write_wins_per_qnum() {
        let journal = TempJournal::new();
        let mut w = LabelSpillWriter::new(journal.path()).unwrap();
        w.append(5, "first").unwrap();
        w.append(5, "second").unwrap();
        w.finish().unwrap();
        let wanted: HashSet<u32> = [5].into_iter().collect();
        let got = read_labels_for(journal.path(), &wanted).unwrap();
        assert_eq!(got.get(&5).unwrap(), "second");
    }

    #[test]
    fn empty_wanted_set_skips_all() {
        let journal = TempJournal::new();
        let mut w = LabelSpillWriter::new(journal.path()).unwrap();
        for i in 0..1000 {
            w.append(i, "label").unwrap();
        }
        w.finish().unwrap();
        let wanted: HashSet<u32> = HashSet::new();
        let got = read_labels_for(journal.path(), &wanted).unwrap();
        assert!(got.is_empty());
    }

    #[test]
    fn empty_journal_reads_empty() {
        let journal = TempJournal::new();
        LabelSpillWriter::new(journal.path())
            .unwrap()
            .finish()
            .unwrap();
        let wanted: HashSet<u32> = [1, 2, 3].into_iter().collect();
        let got = read_labels_for(journal.path(), &wanted).unwrap();
        assert!(got.is_empty());
    }

    #[test]
    fn zero_length_labels_handled() {
        let journal = TempJournal::new();
        let mut w = LabelSpillWriter::new(journal.path()).unwrap();
        w.append(1, "").unwrap();
        w.append(2, "real").unwrap();
        w.finish().unwrap();
        let wanted: HashSet<u32> = [1, 2].into_iter().collect();
        let got = read_labels_for(journal.path(), &wanted).unwrap();
        assert!(!got.contains_key(&1));
        assert_eq!(got.get(&2).unwrap(), "real");
    }

    #[test]
    fn oversized_label_is_truncated_to_length_prefix_limit() {
        let journal = TempJournal::new();
        let mut w = LabelSpillWriter::new(journal.path()).unwrap();
        w.append(1, &"x".repeat(70_000)).unwrap();
        w.append(2, "after").unwrap();
        w.finish().unwrap();
        let wanted: HashSet<u32> = [1, 2].into_iter().collect();
        let got = read_labels_for(journal.path(), &wanted).unwrap();
        assert_eq!(got.get(&1).unwrap().len(), u16::MAX as usize);
        // The record following the truncated one must still be framed correctly.
        assert_eq!(got.get(&2).unwrap(), "after");
    }
}