use serde::{Deserialize, Serialize};
/// Magic bytes (`VCNT`) that `SegmentFooter::new` stamps into a segment footer.
///
/// NOTE(review): this constant is a reference to a 4-byte array, whereas the
/// other magic constants in this group are arrays held by value.
pub const SEGMENT_MAGIC: &[u8; 4] = b"VCNT";
/// Magic bytes (`VCKP`); presumably tags checkpoint files — no use is visible
/// in this file, confirm against the checkpoint code.
pub const CHECKPOINT_MAGIC: [u8; 4] = *b"VCKP";
/// Format version recorded by `SegmentFooter::new`.
pub const FORMAT_VERSION: u32 = 1;
/// Magic bytes (`VWAL`); presumably tags write-ahead-log files — no use is
/// visible in this file, confirm against the WAL code.
pub const WAL_MAGIC: [u8; 4] = *b"VWAL";
/// Kind of vector index, recorded in both `SegmentHeader` and `IndexManifest`.
///
/// No serde renaming is applied, so each variant is serialized under its Rust
/// name (a JSON manifest stores e.g. `"DiskAnn"`). Renaming a variant therefore
/// changes the persisted representation.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum IndexType {
Hnsw,
DiskAnn,
IvfPq,
ScaNN,
Sng,
Flat,
}
/// Vector compression scheme recorded in `SegmentHeader::compression`.
///
/// No serde renaming is applied, so variants are serialized under their Rust
/// names; renaming a variant changes the persisted representation.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum CompressionType {
/// Presumably: vectors are stored without compression — confirm with the writer.
None,
ProductQuantization,
ScalarQuantization,
BinaryQuantization,
RaBitQ,
}
/// Serializable description of a single segment.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SegmentHeader {
/// Identifier of the segment.
pub segment_id: u64,
/// Number of vectors held by the segment.
pub vector_count: u64,
/// Vector dimensionality.
pub dimension: u32,
/// Index structure used for this segment.
pub index_type: IndexType,
/// Compression scheme applied to this segment.
pub compression: CompressionType,
/// Creation timestamp. NOTE(review): the unit is not established in this
/// file (presumably Unix seconds) — confirm against the writer.
pub created_at: u64,
/// Free-form string key/value pairs attached to the segment.
pub metadata: std::collections::HashMap<String, String>,
}
/// Per-segment WAL watermark stored in `IndexManifest::segment_info`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SegmentInfo {
/// Identifier of the segment this entry describes.
pub segment_id: u64,
/// WAL sequence number associated with the segment;
/// `IndexManifest::replay_start_sequence` takes the minimum of these values.
pub wal_sequence: u64,
}
/// A single graph mutation in serializable form.
///
/// The enum is `#[non_exhaustive]`, so code in other crates that matches on it
/// must include a wildcard arm; new variants may be added later.
#[non_exhaustive]
#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
pub enum GraphWalEntry {
/// Adds a node together with its vector and neighbor lists.
InsertNode {
/// Identifier of the inserted document.
doc_id: u32,
/// Level assigned to the node. NOTE(review): presumably its top graph
/// level — confirm against the index implementation.
level: u8,
/// Vector payload of the node.
vector: Vec<f32>,
/// Neighbor id lists; presumably one inner list per level — confirm.
neighbors_per_level: Vec<Vec<u32>>,
},
/// Removes the node identified by `doc_id`.
DeleteNode {
doc_id: u32,
},
/// Carries a neighbor list for one node at one level.
UpdateNeighbors {
/// Node whose neighbor list is affected.
node_id: u32,
/// Level the neighbor list belongs to.
level: u8,
/// The neighbor ids. NOTE(review): presumably a full replacement of the
/// existing list rather than a delta — confirm against the replay code.
neighbors: Vec<u32>,
},
}
/// Top-level, serde-serializable description of an index.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IndexManifest {
/// Format version of the manifest.
pub version: u32,
/// Index structure used by this index.
pub index_type: IndexType,
/// Vector dimensionality.
pub dimension: u32,
/// Total number of vectors in the index.
pub total_vectors: u64,
/// Identifiers of the segments belonging to the index.
pub segments: Vec<u64>,
/// Optional per-segment WAL watermarks. The field defaults to empty when it
/// is missing from the input and is omitted from the output when empty, so
/// manifests written without it still deserialize.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub segment_info: Vec<SegmentInfo>,
/// Index-wide WAL sequence number; used by `replay_start_sequence` when
/// `segment_info` is empty.
pub wal_sequence: u64,
/// Identifier of a checkpoint, if any.
pub checkpoint_id: Option<u64>,
/// Arbitrary JSON configuration blob.
pub config: serde_json::Value,
/// Creation timestamp. NOTE(review): unit not established here — confirm.
pub created_at: u64,
/// Last-modification timestamp. NOTE(review): unit not established here — confirm.
pub modified_at: u64,
}
impl IndexManifest {
    /// Returns the WAL sequence number from which replay should begin.
    ///
    /// This is the smallest `wal_sequence` among the per-segment entries in
    /// `segment_info`. When no per-segment entries are recorded, the
    /// manifest-wide `wal_sequence` is returned instead.
    pub fn replay_start_sequence(&self) -> u64 {
        // `min()` yields `None` for an empty list, which selects the
        // manifest-wide value.
        self.segment_info
            .iter()
            .map(|info| info.wal_sequence)
            .min()
            .unwrap_or(self.wal_sequence)
    }
}
/// Fixed-size footer of a segment.
///
/// Fields are declared in the same order in which `SegmentFooter::write` emits
/// them. Note that the derived `Default` yields an all-zero `magic`, unlike
/// `SegmentFooter::new`, which fills in `SEGMENT_MAGIC` and `FORMAT_VERSION`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct SegmentFooter {
/// Magic bytes identifying the footer.
pub magic: [u8; 4],
/// Format version of the segment.
pub format_version: u32,
/// Offset of the header section. NOTE(review): presumably a byte offset
/// from the start of the segment file — confirm against the segment writer.
pub header_offset: u64,
/// Offset of the vectors section (same caveat as `header_offset`).
pub vectors_offset: u64,
/// Offset of the graph section (same caveat as `header_offset`).
pub graph_offset: u64,
/// Offset of the ids section (same caveat as `header_offset`).
pub ids_offset: u64,
/// Checksum value. NOTE(review): the algorithm and covered range are not
/// established in this file — confirm against the segment writer.
pub checksum: u32,
}
impl SegmentFooter {
    /// Creates a footer carrying `SEGMENT_MAGIC` and `FORMAT_VERSION`, with
    /// every offset and the checksum set to zero.
    pub fn new() -> Self {
        Self {
            checksum: 0,
            ids_offset: 0,
            graph_offset: 0,
            vectors_offset: 0,
            header_offset: 0,
            format_version: FORMAT_VERSION,
            magic: *SEGMENT_MAGIC,
        }
    }

    /// Encoded size in bytes: magic (4) + format version (4) + four 64-bit
    /// offsets (4 * 8) + checksum (4).
    const SERIALIZED_SIZE: usize = 4 + 4 + 4 * 8 + 4;

    /// Reads exactly `SERIALIZED_SIZE` bytes from `reader` and decodes them
    /// into a footer. All integers are little-endian.
    ///
    /// No validation of the magic bytes or format version is performed here.
    ///
    /// # Errors
    ///
    /// Propagates the I/O error if `reader` cannot supply the full footer.
    pub fn read<R: std::io::Read>(reader: &mut R) -> super::error::PersistenceResult<Self> {
        let mut raw = [0u8; Self::SERIALIZED_SIZE];
        reader.read_exact(&mut raw)?;

        // Decoders for a little-endian integer starting at a byte offset.
        let u32_at = |offset: usize| {
            let mut word = [0u8; 4];
            word.copy_from_slice(&raw[offset..offset + 4]);
            u32::from_le_bytes(word)
        };
        let u64_at = |offset: usize| {
            let mut word = [0u8; 8];
            word.copy_from_slice(&raw[offset..offset + 8]);
            u64::from_le_bytes(word)
        };

        let mut magic = [0u8; 4];
        magic.copy_from_slice(&raw[..4]);

        Ok(Self {
            magic,
            format_version: u32_at(4),
            header_offset: u64_at(8),
            vectors_offset: u64_at(16),
            graph_offset: u64_at(24),
            ids_offset: u64_at(32),
            checksum: u32_at(40),
        })
    }

    /// Writes the footer to `writer`, field by field in declaration order,
    /// encoding every integer as little-endian.
    ///
    /// # Errors
    ///
    /// Propagates the first I/O error reported by `writer`.
    pub fn write<W: std::io::Write>(&self, writer: &mut W) -> super::error::PersistenceResult<()> {
        let version = self.format_version.to_le_bytes();
        let header = self.header_offset.to_le_bytes();
        let vectors = self.vectors_offset.to_le_bytes();
        let graph = self.graph_offset.to_le_bytes();
        let ids = self.ids_offset.to_le_bytes();
        let checksum = self.checksum.to_le_bytes();

        // One `write_all` per field, in on-disk order.
        let fields: [&[u8]; 7] = [
            &self.magic[..],
            &version[..],
            &header[..],
            &vectors[..],
            &graph[..],
            &ids[..],
            &checksum[..],
        ];
        for field in fields.iter() {
            writer.write_all(field)?;
        }
        Ok(())
    }
}
/// Conversion of a value to and from a byte buffer.
pub trait Persistable: Sized {
/// Encodes `self` into a newly allocated byte vector.
fn to_bytes(&self) -> crate::Result<Vec<u8>>;
/// Reconstructs a value from `bytes`.
fn from_bytes(bytes: &[u8]) -> crate::Result<Self>;
/// Returns a size estimate for this value. NOTE(review): presumably the
/// expected length of `to_bytes()` output, for pre-allocation — the exact
/// contract is not stated in this file.
fn size_hint(&self) -> usize;
}
/// Saving and loading of an index at a filesystem path.
pub trait IndexPersistence: Sized {
    /// Persists the index at `path`.
    fn save(&self, path: &std::path::Path) -> crate::Result<()>;

    /// Loads an index previously stored at `path`.
    fn load(path: &std::path::Path) -> crate::Result<Self>;

    /// Reports whether `path` contains an entry named `manifest.json`.
    ///
    /// Implementors may override this default.
    fn exists(path: &std::path::Path) -> bool {
        let manifest_path = path.join("manifest.json");
        manifest_path.exists()
    }
}
#[cfg(test)]
// Tests unwrap freely: a failed unwrap is exactly the failure we want reported.
#[allow(clippy::unwrap_used, clippy::expect_used)]
mod tests {
    use super::*;

    /// A `SegmentHeader` survives a JSON round trip.
    #[test]
    fn test_segment_header_serde() {
        let header = SegmentHeader {
            segment_id: 1,
            vector_count: 1000,
            dimension: 128,
            index_type: IndexType::Hnsw,
            compression: CompressionType::None,
            created_at: 1234567890,
            metadata: std::collections::HashMap::new(),
        };
        let json = serde_json::to_string(&header).unwrap();
        let parsed: SegmentHeader = serde_json::from_str(&json).unwrap();
        assert_eq!(parsed.segment_id, 1);
        assert_eq!(parsed.index_type, IndexType::Hnsw);
    }

    /// An `IndexManifest` survives a JSON round trip, and an empty
    /// `segment_info` list is left out of the serialized form.
    #[test]
    fn test_manifest_serde() {
        let manifest = IndexManifest {
            version: FORMAT_VERSION,
            index_type: IndexType::DiskAnn,
            dimension: 384,
            total_vectors: 10000,
            segments: vec![1, 2, 3],
            segment_info: vec![],
            wal_sequence: 42,
            checkpoint_id: Some(5),
            config: serde_json::json!({"M": 16, "ef_construction": 200}),
            created_at: 1234567890,
            modified_at: 1234567899,
        };
        let json = serde_json::to_string_pretty(&manifest).unwrap();
        let parsed: IndexManifest = serde_json::from_str(&json).unwrap();
        assert_eq!(parsed.version, FORMAT_VERSION);
        assert_eq!(parsed.segments.len(), 3);
        assert!(!json.contains("segment_info"));
    }

    /// A manifest written without the `segment_info` field still parses, and
    /// replay then starts at the manifest-wide WAL sequence.
    #[test]
    fn test_manifest_backward_compat_no_segment_info() {
        let legacy_json = r#"{
"version": 1,
"index_type": "DiskAnn",
"dimension": 384,
"total_vectors": 10000,
"segments": [1, 2, 3],
"wal_sequence": 42,
"checkpoint_id": 5,
"config": {},
"created_at": 1234567890,
"modified_at": 1234567899
}"#;
        let parsed: IndexManifest = serde_json::from_str(legacy_json).unwrap();
        assert!(parsed.segment_info.is_empty());
        assert_eq!(parsed.replay_start_sequence(), 42);
    }

    /// With per-segment watermarks present, replay starts at the smallest
    /// one, both before and after a JSON round trip.
    #[test]
    fn test_manifest_per_segment_watermarks() {
        let manifest = IndexManifest {
            version: FORMAT_VERSION,
            index_type: IndexType::Hnsw,
            dimension: 128,
            total_vectors: 5000,
            segments: vec![1, 2, 3],
            segment_info: vec![
                SegmentInfo {
                    segment_id: 1,
                    wal_sequence: 100,
                },
                SegmentInfo {
                    segment_id: 2,
                    wal_sequence: 50,
                },
                SegmentInfo {
                    segment_id: 3,
                    wal_sequence: 200,
                },
            ],
            wal_sequence: 200,
            checkpoint_id: None,
            config: serde_json::json!({}),
            created_at: 0,
            modified_at: 0,
        };
        assert_eq!(manifest.replay_start_sequence(), 50);
        let json = serde_json::to_string(&manifest).unwrap();
        let parsed: IndexManifest = serde_json::from_str(&json).unwrap();
        assert_eq!(parsed.segment_info.len(), 3);
        assert_eq!(parsed.replay_start_sequence(), 50);
    }

    /// A `SegmentInfo` survives a JSON round trip.
    #[test]
    fn test_segment_info_serde() {
        let info = SegmentInfo {
            segment_id: 42,
            wal_sequence: 100,
        };
        let json = serde_json::to_string(&info).unwrap();
        let parsed: SegmentInfo = serde_json::from_str(&json).unwrap();
        assert_eq!(parsed.segment_id, 42);
        assert_eq!(parsed.wal_sequence, 100);
    }

    /// Every `GraphWalEntry` variant survives a postcard round trip.
    #[test]
    fn test_graph_wal_entry_postcard_roundtrip() {
        let entries = vec![
            GraphWalEntry::InsertNode {
                doc_id: 42,
                level: 2,
                vector: vec![1.0, 2.0, 3.0],
                neighbors_per_level: vec![vec![1, 2, 3], vec![4, 5], vec![6]],
            },
            GraphWalEntry::DeleteNode { doc_id: 99 },
            GraphWalEntry::UpdateNeighbors {
                node_id: 10,
                level: 0,
                neighbors: vec![20, 30, 40],
            },
        ];
        for entry in &entries {
            let bytes = postcard::to_allocvec(entry).unwrap();
            let decoded: GraphWalEntry = postcard::from_bytes(&bytes).unwrap();
            assert_eq!(&decoded, entry);
        }
    }
}