ruve-db 0.1.1

A hybrid vector and full-text search database with HNSW approximate nearest-neighbour indexing and BM25 ranking.
Documentation
use std::fs::{OpenOptions};
use std::io::{Read, Seek, SeekFrom, Write};
use crate::types::Record;

/// Appends `record` to the file at `path` and returns the byte offset of the new entry.
///
/// Each entry is laid out as an 8-byte little-endian `u64` holding the payload length,
/// followed by the `bincode`-serialized record. The returned offset points at the length
/// prefix. The file is created if it does not exist yet.
///
/// # Panics
///
/// Panics if serialization fails or if the file cannot be opened, seeked or written.
pub fn append_record(record: &Record, path: &String) -> u64 {
    // Serialize first: if this fails, the file is neither created nor modified.
    let serialized = bincode::serialize(record).unwrap();
    let prefix = (serialized.len() as u64).to_le_bytes();

    // Put prefix and payload into one buffer so they are handed to the OS together instead
    // of through two separate writes. This narrows the window in which a failure (or
    // another appender) could leave a length prefix on disk without its payload.
    let mut entry = Vec::with_capacity(prefix.len() + serialized.len());
    entry.extend_from_slice(&prefix);
    entry.extend_from_slice(&serialized);

    let mut file = OpenOptions::new()
        .create(true)
        .append(true)
        .read(true)
        .open(path)
        .unwrap();

    // The current end of the file is where this entry will start.
    // NOTE(review): the offset is sampled before the write; if several writers append to
    // the same file at once it may not match where the entry actually lands — confirm
    // that callers serialize appends.
    let offset = file.seek(SeekFrom::End(0)).unwrap();

    file.write_all(&entry).unwrap();

    offset
}

/// Reads back and deserializes the record stored at byte `offset` in the file at `path`.
///
/// `offset` must point at the 8-byte little-endian length prefix of an entry, i.e. a value
/// previously returned by [`append_record`] for the same file.
///
/// # Panics
///
/// Panics if the file cannot be opened or read, if fewer payload bytes are available than
/// the length prefix announces, or if the payload does not deserialize into a [`Record`].
pub fn retrieve_record(offset: u64, path: &String) -> Record {
    // Cap on the up-front buffer reservation (1 MiB). Larger payloads still load; the
    // buffer simply grows while reading.
    const MAX_PREALLOC: u64 = 1 << 20;

    let mut file = OpenOptions::new()
        .read(true)
        .open(path)
        .unwrap();

    file.seek(SeekFrom::Start(offset)).unwrap();

    let mut record_size_buffer = [0u8; 8];
    file.read_exact(&mut record_size_buffer).unwrap();
    let record_size = u64::from_le_bytes(record_size_buffer);

    // The length prefix comes straight from disk, so a wrong offset or a damaged file can
    // yield an arbitrary value. Instead of allocating `record_size` bytes blindly, read at
    // most that many bytes and verify afterwards that all of them were really there.
    let mut record_buffer = Vec::with_capacity(record_size.min(MAX_PREALLOC) as usize);
    let bytes_read = file
        .take(record_size)
        .read_to_end(&mut record_buffer)
        .unwrap();
    assert_eq!(
        bytes_read as u64, record_size,
        "record at offset {} is truncated: length prefix says {} bytes, file holds {}",
        offset, record_size, bytes_read
    );

    bincode::deserialize(&record_buffer).unwrap()
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::types::Record;

    /// Builds a file path inside the system temp directory for the test called `name`.
    ///
    /// The sub-second nanosecond part of the current time is appended to the file name so
    /// that repeated runs are unlikely to reuse the same file.
    fn tmp_path(name: &str) -> String {
        let since_epoch = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap();
        let file_name = format!("ruve_storage_{}_{}.bin", name, since_epoch.subsec_nanos());

        let full_path = std::env::temp_dir().join(file_name);
        full_path.to_str().unwrap().to_string()
    }

    /// A single record written to a fresh file comes back with all fields intact.
    #[test]
    fn append_then_retrieve_roundtrip() {
        let path = tmp_path("roundtrip");
        let written = Record::new(
            vec![1.0, 2.0, 3.0],
            Some("hello".to_string()),
            Some("id-1".to_string()),
        );

        let at = append_record(&written, &path);
        let read_back = retrieve_record(at, &path);

        assert_eq!(read_back.id, "id-1");
        assert_eq!(read_back.vector, vec![1.0, 2.0, 3.0]);
        assert_eq!(read_back.metadata.as_deref(), Some("hello"));

        // Best-effort cleanup; a failure to delete must not fail the test.
        let _ = std::fs::remove_file(&path);
    }

    /// Several records appended to one file can each be fetched through their own offset.
    #[test]
    fn multiple_records_are_each_retrievable_by_offset() {
        let path = tmp_path("multi");

        let first = Record::new(vec![1.0], Some("first".to_string()), Some("id-1".to_string()));
        let second = Record::new(vec![2.0], Some("second".to_string()), Some("id-2".to_string()));
        let third = Record::new(vec![3.0], Some("third".to_string()), Some("id-3".to_string()));

        let first_at = append_record(&first, &path);
        let second_at = append_record(&second, &path);
        let third_at = append_record(&third, &path);

        assert_eq!(retrieve_record(first_at, &path).id, "id-1");
        assert_eq!(retrieve_record(second_at, &path).id, "id-2");
        assert_eq!(retrieve_record(third_at, &path).id, "id-3");

        // Lookups do not have to follow insertion order.
        assert_eq!(retrieve_record(third_at, &path).metadata.as_deref(), Some("third"));
        assert_eq!(retrieve_record(first_at, &path).metadata.as_deref(), Some("first"));

        // Best-effort cleanup; a failure to delete must not fail the test.
        let _ = std::fs::remove_file(&path);
    }

    /// A record created without metadata still has none after a write/read cycle.
    #[test]
    fn record_with_no_metadata_roundtrips() {
        let path = tmp_path("no_meta");
        let written = Record::new(vec![0.5], None, Some("bare".to_string()));

        let at = append_record(&written, &path);
        let read_back = retrieve_record(at, &path);

        assert_eq!(read_back.id, "bare");
        assert!(read_back.metadata.is_none());

        // Best-effort cleanup; a failure to delete must not fail the test.
        let _ = std::fs::remove_file(&path);
    }
}