Skip to main content

sochdb_vector/segment/
format.rs

//! Segment binary format definitions.
//!
//! The segment file format is designed for mmap and sequential SIMD scans:
//! - Little-endian, with a fixed-size header
//! - An offset table pointing to the SoA blocks
//! - BPS stored as block-major SoA
//! - RDF posting lists stored in VID-striped chunks
8
9use crate::types::*;
10use bytemuck::{Pod, Zeroable};
11
12/// Segment header - fixed size at file start
13#[repr(C)]
14#[derive(Debug, Clone, Copy, Pod, Zeroable)]
15pub struct SegmentHeader {
16    /// Magic bytes: b"SVSEGM\0\0"
17    pub magic: [u8; 8],
18    /// Format version
19    pub version: u32,
20    /// Feature flags
21    pub flags: SegmentFlags,
22    /// Number of vectors in segment
23    pub n_vec: u32,
24    /// Vector dimension
25    pub dim: u32,
26    /// BPS block size (e.g., 16)
27    pub bps_block: u16,
28    /// BPS projections per block (1 or 2)
29    pub bps_proj: u16,
30    /// RDF top-t dimensions per vector
31    pub rdf_t: u16,
32    /// RDF stripe shift (log2 of stripe size)
33    pub rdf_stripe_shift: u8,
34    /// Number of outliers per vector
35    pub num_outliers: u8,
36
37    // Offset table (bytes from file start)
38    /// BPS SoA array
39    pub off_bps: u64,
40    /// int8 embeddings (blocked SoA)
41    pub off_i8: u64,
42    /// Quantization scales per block
43    pub off_scales: u64,
44    /// Outlier entries
45    pub off_outliers: u64,
46    /// Tombstone bitset
47    pub off_tombstone: u64,
48    /// RDF posting list directory
49    pub off_rdf_dir: u64,
50    /// RDF posting list data
51    pub off_rdf_data: u64,
52    /// Dimension weights for RDF
53    pub off_dim_weights: u64,
54    /// Original fp32 vectors (optional, for verification)
55    pub off_fp32: u64,
56    /// BPS quantization parameters (min, inv_range per slot)
57    pub off_bps_qparams: u64,
58    /// Total file length
59    pub file_len: u64,
60
61    /// Padding for alignment (to 256 bytes total)
62    /// 8 + 4 + 4 + 4 + 4 + 2 + 2 + 2 + 1 + 1 + (10 * 8) + 8 = 120, so we need 136 reserved
63    pub _reserved1: [u8; 128],
64    pub _reserved2: [u8; 8],
65}
66
67impl SegmentHeader {
68    pub const SIZE: usize = std::mem::size_of::<Self>();
69
70    /// Create a new header with magic and version
71    pub fn new(n_vec: u32, dim: u32) -> Self {
72        Self {
73            magic: MAGIC,
74            version: SEGMENT_VERSION,
75            flags: SegmentFlags::empty(),
76            n_vec,
77            dim,
78            bps_block: DEFAULT_BPS_BLOCK_SIZE,
79            bps_proj: DEFAULT_BPS_PROJECTIONS,
80            rdf_t: DEFAULT_RDF_TOP_T,
81            rdf_stripe_shift: DEFAULT_STRIPE_SHIFT,
82            num_outliers: DEFAULT_NUM_OUTLIERS,
83            off_bps: 0,
84            off_i8: 0,
85            off_scales: 0,
86            off_outliers: 0,
87            off_tombstone: 0,
88            off_rdf_dir: 0,
89            off_rdf_data: 0,
90            off_dim_weights: 0,
91            off_fp32: 0,
92            off_bps_qparams: 0,
93            file_len: 0,
94            _reserved1: [0; 128],
95            _reserved2: [0; 8],
96        }
97    }
98
99    /// Validate header
100    pub fn validate(&self) -> crate::Result<()> {
101        if self.magic != MAGIC {
102            return Err(crate::Error::InvalidMagic);
103        }
104        if self.version != SEGMENT_VERSION {
105            return Err(crate::Error::UnsupportedVersion(self.version));
106        }
107        Ok(())
108    }
109
110    /// Number of BPS blocks
111    pub fn num_bps_blocks(&self) -> u32 {
112        (self.dim + self.bps_block as u32 - 1) / self.bps_block as u32
113    }
114
115    /// Size of BPS data in bytes
116    pub fn bps_size(&self) -> usize {
117        self.num_bps_blocks() as usize * self.n_vec as usize * self.bps_proj as usize
118    }
119
120    /// Size of int8 embedding data in bytes
121    pub fn i8_size(&self) -> usize {
122        self.n_vec as usize * self.dim as usize
123    }
124
125    /// Stripe size (number of vids per stripe)
126    pub fn stripe_size(&self) -> usize {
127        1usize << self.rdf_stripe_shift
128    }
129}
130
131/// Segment feature flags
132#[repr(transparent)]
133#[derive(Debug, Clone, Copy, Pod, Zeroable, PartialEq, Eq)]
134pub struct SegmentFlags(pub u32);
135
136impl SegmentFlags {
137    pub const NONE: u32 = 0;
138    pub const HAS_FP32: u32 = 1 << 0;
139    pub const HAS_OUTLIERS: u32 = 1 << 1;
140    pub const HAS_RDF: u32 = 1 << 2;
141    pub const HAS_BPS: u32 = 1 << 3;
142    pub const NORMALIZED: u32 = 1 << 4;
143    pub const ROTATED: u32 = 1 << 5;
144
145    pub fn empty() -> Self {
146        Self(Self::NONE)
147    }
148
149    pub fn has(&self, flag: u32) -> bool {
150        (self.0 & flag) != 0
151    }
152
153    pub fn set(&mut self, flag: u32) {
154        self.0 |= flag;
155    }
156}
157
158/// RDF posting list directory entry
159#[repr(C)]
160#[derive(Debug, Clone, Copy, Pod, Zeroable)]
161pub struct PostingListEntry {
162    /// Offset to list data
163    pub offset: u64,
164    /// Number of postings (total across all stripes)
165    pub length: u32,
166    /// Number of stripe chunks
167    pub num_stripes: u16,
168    /// Flags (is_stopword, etc.)
169    pub flags: u16,
170}
171
172impl PostingListEntry {
173    pub const FLAG_STOPWORD: u16 = 1 << 0;
174
175    pub fn is_stopword(&self) -> bool {
176        (self.flags & Self::FLAG_STOPWORD) != 0
177    }
178}
179
180/// Block quantization scale
181#[repr(C)]
182#[derive(Debug, Clone, Copy, Pod, Zeroable)]
183pub struct BlockScale {
184    /// Scale factor for the block
185    pub scale: f32,
186}
187
/// Round `value` up to the nearest multiple of `alignment`.
///
/// A value that is already a multiple of `alignment` is returned unchanged. This works for
/// any non-zero alignment: the `(value + a - 1) & !(a - 1)` bit trick is only correct when
/// `a` is a power of two and silently returns non-multiples otherwise.
///
/// # Panics
///
/// Panics if `alignment` is zero. If the rounded value does not fit in `usize`, the addition
/// overflows, which panics when overflow checks are enabled.
#[inline]
pub const fn align_to(value: usize, alignment: usize) -> usize {
    let rem = value % alignment;
    if rem == 0 {
        value
    } else {
        // `rem < alignment`, so the distance to the next multiple cannot underflow.
        value + (alignment - rem)
    }
}
193
/// Compute the offset, in BPS entries from the start of the BPS array, of the first
/// projection of vector `vec_id` within block `block`.
///
/// Layout: blocks are stored back to back, each holding `n_vec * proj` entries, and the
/// `proj` entries of one vector are adjacent inside a block:
/// `bps[(block * n_vec + vec_id) * proj + p]` for `p` in `0..proj`.
///
/// For `proj == 1` this is `block * n_vec + vec_id`; for `proj == 2` it is
/// `block * 2 * n_vec + vec_id * 2`. Other values of `proj` follow the same rule instead of
/// being treated as 2, so the result stays within `num_blocks * n_vec * proj` entries.
/// `proj == 0` yields 0.
///
/// NOTE(review): the earlier comment here described a projection-major layout,
/// `(block * proj + p) * n_vec + vec`, which is not what the code computed for `proj == 2`.
/// The computed (vector-interleaved) layout is kept -- confirm it matches the segment writer.
#[inline]
pub const fn bps_offset(block: usize, vec_id: usize, n_vec: usize, proj: usize) -> usize {
    (block * n_vec + vec_id) * proj
}
205
#[cfg(test)]
mod tests {
    use super::*;

    /// The header must occupy exactly 256 bytes.
    #[test]
    fn test_header_size() {
        let expected_bytes: usize = 256;
        assert_eq!(SegmentHeader::SIZE, expected_bytes);
    }

    /// A freshly built header validates; one with zeroed magic bytes does not.
    #[test]
    fn test_header_validation() {
        let fresh = SegmentHeader::new(1000, 768);
        assert!(fresh.validate().is_ok());

        let corrupted = SegmentHeader {
            magic: [0; 8],
            ..fresh
        };
        assert!(corrupted.validate().is_err());
    }

    /// Setting one flag makes it visible without affecting an unrelated flag.
    #[test]
    fn test_flags() {
        let mut flags = SegmentFlags::empty();
        let bps_before = flags.has(SegmentFlags::HAS_BPS);
        assert!(!bps_before);

        flags.set(SegmentFlags::HAS_BPS);
        let bps_after = flags.has(SegmentFlags::HAS_BPS);
        let rdf_after = flags.has(SegmentFlags::HAS_RDF);
        assert!(bps_after);
        assert!(!rdf_after);
    }
}