Skip to main content

nvs_core/
bundle.rs

1use std::fs::{self, File};
2use std::io::Read;
3use std::path::{Path, PathBuf};
4
5use crate::errors::*;
6use crate::manifest::Manifest;
7use memmap2::Mmap;
8use std::collections::HashMap;
9
/// One fixed-size record of `meta.idx`, locating a document inside `meta.blocks`.
///
/// `Bundle::open` decodes each record as four consecutive little-endian `u32`
/// values, in the field order declared here.
#[repr(C)]
#[derive(Debug, Clone, Copy)]
struct MetaIdxEntry {
    /// Index of the block that holds the document.
    block_id: u32,
    /// Byte offset of the document record from the start of that block.
    offset_in_block: u32,
    /// Length of the document record in bytes (used for bounds checks in `get_document`).
    doc_size: u32,
    /// Fourth word of the record; decoded and stored but not otherwise read in this file.
    padding: u32,
}

/// Size in bytes of one `meta.idx` record.
const META_IDX_ENTRY_SIZE: usize = size_of::<MetaIdxEntry>();
20
/// An opened bundle: the parsed manifest plus the loaded or memory-mapped data files.
///
/// Instances are created by `Bundle::open`.
#[derive(Debug)]
pub struct Bundle {
    /// Directory the bundle was opened from (stored but not read in this file).
    #[allow(dead_code)]
    root: PathBuf,
    /// Manifest parsed from `manifest.json`.
    pub manifest: Manifest,
    /// Size in bytes of each block of `meta.blocks`, derived from the file size in `open`.
    pub meta_block_size: u32,
    /// Number of blocks, read from the first four bytes of `meta.blocks`.
    pub meta_block_count: u32,
    /// Copy of the manifest's `files.meta.compression` value.
    pub meta_codec: Option<String>,
    /// Per-block headers of `meta.blocks`, one tuple per block in file order.
    pub meta_block_headers: Vec<(u32, u32, u32, u32)>, // (comp_size, decomp_size, doc_count, codec)
    // Vectors
    /// Read-only memory map of the vectors file.
    vectors: Mmap,
    // Metadata
    /// Read-only memory map of the whole `meta.blocks` file (headers included).
    meta_blocks: Mmap,
    /// Decoded `meta.idx` records, indexed by document id.
    meta_idx: Vec<MetaIdxEntry>,
    // BM25 (internal)
    /// Per-document values decoded from the doclen file (one `u32` per document).
    pub(crate) doclen: Vec<u32>,
    /// Term string -> position of the term in the terms file.
    pub(crate) terms: HashMap<String, usize>,
    /// Records decoded from the lexicon file.
    pub(crate) lexicon: Vec<LexiconEntry>,
    /// Raw bytes of the postings file.
    pub(crate) postings: Vec<u8>,
}
41
42impl Bundle {
43    pub fn open<P: AsRef<Path>>(root: P) -> Result<Self> {
44        let root = root.as_ref().to_path_buf();
45        // load manifest
46        let manifest_path = root.join("manifest.json");
47        let mut s = String::new();
48        File::open(&manifest_path)?.read_to_string(&mut s)?;
49        let manifest: Manifest = serde_json::from_str(&s)?;
50
51        if manifest.format != "nvs.v1" {
52            return Err(NvsError::InvalidManifest("unsupported format"));
53        }
54        if manifest.num_docs == 0 {
55            return Err(NvsError::InvalidManifest("num_docs must be > 0"));
56        }
57        if manifest.dim == 0 {
58            return Err(NvsError::InvalidManifest("dim must be > 0"));
59        }
60
61        // Validate meta.idx count == num_docs and load entries
62        let meta_idx_path = root.join(&manifest.files.meta_idx.path);
63        let meta_idx_md = fs::metadata(&meta_idx_path)?;
64        let sz = meta_idx_md.len() as usize;
65        if sz % META_IDX_ENTRY_SIZE != 0 {
66            return Err(NvsError::InvalidBundle(
67                "meta.idx not aligned to entry size",
68            ));
69        }
70        let count = sz / META_IDX_ENTRY_SIZE;
71        if count as u64 != manifest.num_docs {
72            return Err(NvsError::InvalidBundle("meta.idx entry count mismatch"));
73        }
74        let mut meta_idx_entries = Vec::with_capacity(count);
75        {
76            let mut f = File::open(&meta_idx_path)?;
77            let mut buf = Vec::with_capacity(sz);
78            f.read_to_end(&mut buf)?;
79            let mut i = 0usize;
80            while i + 16 <= buf.len() {
81                let block_id = u32::from_le_bytes(buf[i..i + 4].try_into().unwrap());
82                i += 4;
83                let offset_in_block = u32::from_le_bytes(buf[i..i + 4].try_into().unwrap());
84                i += 4;
85                let doc_size = u32::from_le_bytes(buf[i..i + 4].try_into().unwrap());
86                i += 4;
87                let padding = u32::from_le_bytes(buf[i..i + 4].try_into().unwrap());
88                i += 4;
89                meta_idx_entries.push(MetaIdxEntry {
90                    block_id,
91                    offset_in_block,
92                    doc_size,
93                    padding,
94                });
95            }
96        }
97
98        // Validate meta.blocks header and derive block_size
99        let meta_blocks_path = root.join(&manifest.files.meta.path);
100        let meta_blocks_file = File::open(&meta_blocks_path)?;
101        let mut f = meta_blocks_file.try_clone()?;
102        let mut u32buf = [0u8; 4];
103        // read block_count
104        f.read_exact(&mut u32buf)?;
105        let block_count = u32::from_le_bytes(u32buf);
106        if block_count == 0 {
107            return Err(NvsError::InvalidBundle("block_count must be > 0"));
108        }
109        // header size = 4 + block_count * 16
110        let header_size = 4u64 + (block_count as u64) * 16u64;
111        let total_size = fs::metadata(&meta_blocks_path)?.len();
112        if total_size <= header_size {
113            return Err(NvsError::InvalidBundle("meta.blocks too small for headers"));
114        }
115        let remaining = total_size - header_size;
116        if remaining % (block_count as u64) != 0 {
117            return Err(NvsError::InvalidBundle(
118                "meta.blocks data not divisible by block_count",
119            ));
120        }
121        let derived_block = (remaining / (block_count as u64)) as u32;
122
123        if let Some(bsz) = manifest.files.meta.block_size {
124            if bsz != derived_block {
125                return Err(NvsError::InvalidBundle("manifest block_size mismatch"));
126            }
127        }
128        // Read per-block headers
129        let mut headers: Vec<(u32, u32, u32, u32)> = Vec::with_capacity(block_count as usize);
130        for _ in 0..block_count {
131            let mut b = [0u8; 16];
132            f.read_exact(&mut b)?;
133            let csz = u32::from_le_bytes(b[0..4].try_into().unwrap());
134            let dsz = u32::from_le_bytes(b[4..8].try_into().unwrap());
135            let dct = u32::from_le_bytes(b[8..12].try_into().unwrap());
136            let cod = u32::from_le_bytes(b[12..16].try_into().unwrap());
137            headers.push((csz, dsz, dct, cod));
138        }
139        let meta_blocks = unsafe { Mmap::map(&meta_blocks_file)? };
140
141        // Map vectors (f32 or f16)
142        let vectors_path = root.join(&manifest.files.vectors.path);
143        let vec_file = File::open(&vectors_path)?;
144        let vectors = unsafe { Mmap::map(&vec_file)? };
145        let elem_size = if manifest.embedding.dtype.to_lowercase() == "f16" {
146            2
147        } else {
148            4
149        };
150        let row_bytes = (manifest.dim as usize) * elem_size;
151        let aligned_row_bytes = row_bytes.div_ceil(64) * 64;
152        let expected = (manifest.num_docs as usize) * aligned_row_bytes;
153        if vectors.len() != expected {
154            return Err(NvsError::InvalidBundle("vectors size mismatch"));
155        }
156
157        // Load BM25-related files
158        let doclen_path = root.join(&manifest.files.doclen.path);
159        let mut doclen = Vec::<u32>::new();
160        {
161            let mut f = File::open(&doclen_path)?;
162            let mut buf = Vec::new();
163            f.read_to_end(&mut buf)?;
164            if buf.len() % 4 != 0 {
165                return Err(NvsError::InvalidBundle("doclen size not multiple of 4"));
166            }
167            let n = buf.len() / 4;
168            doclen.resize(n, 0);
169            for i in 0..n {
170                let b = [buf[4 * i], buf[4 * i + 1], buf[4 * i + 2], buf[4 * i + 3]];
171                doclen[i] = u32::from_le_bytes(b);
172            }
173            if n as u64 != manifest.num_docs {
174                return Err(NvsError::InvalidBundle("doclen rows mismatch"));
175            }
176        }
177        // terms.dict
178        let terms_path = root.join(&manifest.files.terms.path);
179        let terms = load_terms(&terms_path)?;
180        // lexicon.bin
181        let lexicon_path = root.join(&manifest.files.lexicon.path);
182        let lexicon = load_lexicon(&lexicon_path)?;
183        // postings.bin
184        let postings_path = root.join(&manifest.files.postings.path);
185        let postings = {
186            let mut f = File::open(&postings_path)?;
187            let mut buf = Vec::new();
188            f.read_to_end(&mut buf)?;
189            buf
190        };
191
192        let meta_codec = manifest.files.meta.compression.clone();
193        Ok(Self {
194            root,
195            manifest,
196            meta_block_size: derived_block,
197            meta_block_count: block_count,
198            meta_codec,
199            meta_block_headers: headers,
200            vectors,
201            meta_blocks,
202            meta_idx: meta_idx_entries,
203            doclen,
204            terms,
205            lexicon,
206            postings,
207        })
208    }
209
210    pub fn get_document(&self, doc_id: u32) -> Option<(String, String, String)> {
211        let idx = *self.meta_idx.get(doc_id as usize)?;
212        let header_size = 4usize + (self.meta_block_count as usize) * 16usize;
213        let block_size = self.meta_block_size as usize;
214        let base = &self.meta_blocks;
215        let blocks_start = header_size;
216        let block0 = blocks_start;
217        let block_begin = block0 + (idx.block_id as usize) * block_size;
218        let header = self
219            .meta_block_headers
220            .get(idx.block_id as usize)
221            .copied()
222            .unwrap_or((0, 0, 0, 0));
223        let codec = header.3; // 0=none, 1=zstd
224        if codec == 0 {
225            // Uncompressed, read directly from mmap
226            if (idx.offset_in_block as usize) > block_size {
227                return None;
228            }
229            if (idx.offset_in_block as usize) + (idx.doc_size as usize) > block_size {
230                return None;
231            }
232            let mut p = block_begin + idx.offset_in_block as usize;
233            let end = block_begin + block_size;
234            if p + 4 > end {
235                return None;
236            }
237            let id_len = u32::from_le_bytes(base[p..p + 4].try_into().ok()?) as usize;
238            p += 4;
239            if p + id_len > end {
240                return None;
241            }
242            let id = String::from_utf8(base[p..p + id_len].to_vec()).ok()?;
243            p += id_len;
244            if p + 4 > end {
245                return None;
246            }
247            let text_len = u32::from_le_bytes(base[p..p + 4].try_into().ok()?) as usize;
248            p += 4;
249            if p + text_len > end {
250                return None;
251            }
252            let text = String::from_utf8(base[p..p + text_len].to_vec()).ok()?;
253            p += text_len;
254            if p + 4 > end {
255                return None;
256            }
257            let meta_len = u32::from_le_bytes(base[p..p + 4].try_into().ok()?) as usize;
258            p += 4;
259            if p + meta_len > end {
260                return None;
261            }
262            let meta = String::from_utf8(base[p..p + meta_len].to_vec()).ok()?;
263            Some((id, text, meta))
264        } else {
265            // Zstd compressed block: decompress and then parse at offset
266            let comp_size = header.0 as usize;
267            let decomp_size = header.1 as usize;
268            let comp_start = block_begin;
269            let comp_end = comp_start + comp_size.min(block_size);
270            if comp_end > base.len() {
271                return None;
272            }
273            let comp_slice = &base[comp_start..comp_end];
274            // Decompress
275            let mut buf = vec![0u8; decomp_size];
276            match zstd::bulk::decompress_to_buffer(comp_slice, &mut buf) {
277                Ok(_) => {
278                    if (idx.offset_in_block as usize) + (idx.doc_size as usize) > buf.len() {
279                        return None;
280                    }
281                    let mut p = idx.offset_in_block as usize;
282                    let end = buf.len();
283                    if p + 4 > end {
284                        return None;
285                    }
286                    let id_len = u32::from_le_bytes(buf[p..p + 4].try_into().ok()?) as usize;
287                    p += 4;
288                    if p + id_len > end {
289                        return None;
290                    }
291                    let id = String::from_utf8(buf[p..p + id_len].to_vec()).ok()?;
292                    p += id_len;
293                    if p + 4 > end {
294                        return None;
295                    }
296                    let text_len = u32::from_le_bytes(buf[p..p + 4].try_into().ok()?) as usize;
297                    p += 4;
298                    if p + text_len > end {
299                        return None;
300                    }
301                    let text = String::from_utf8(buf[p..p + text_len].to_vec()).ok()?;
302                    p += text_len;
303                    if p + 4 > end {
304                        return None;
305                    }
306                    let meta_len = u32::from_le_bytes(buf[p..p + 4].try_into().ok()?) as usize;
307                    p += 4;
308                    if p + meta_len > end {
309                        return None;
310                    }
311                    let meta = String::from_utf8(buf[p..p + meta_len].to_vec()).ok()?;
312                    Some((id, text, meta))
313                }
314                Err(_) => None,
315            }
316        }
317    }
318
319    // Hybrid search moved to vector_store + hybrid
320
321    #[inline]
322    pub(crate) fn row_stride_f32(&self) -> usize {
323        let row_bytes = (self.manifest.dim as usize) * 4;
324        let aligned_row_bytes = row_bytes.div_ceil(64) * 64;
325        aligned_row_bytes / 4
326    }
327
328    // Vector search moved to vector_store
329
330    // Internal accessors for VectorStore
331    pub(crate) fn vectors_as_f32(&self) -> &[f32] {
332        bytemuck::cast_slice(&self.vectors)
333    }
334    pub(crate) fn vectors_raw(&self) -> &[u8] {
335        &self.vectors
336    }
337    #[allow(dead_code)]
338    pub(crate) fn num_docs_usize(&self) -> usize {
339        self.manifest.num_docs as usize
340    }
341    #[allow(dead_code)]
342    pub(crate) fn dim_usize(&self) -> usize {
343        self.manifest.dim as usize
344    }
345    pub(crate) fn row_stride_bytes(&self) -> usize {
346        let elem = if self.manifest.embedding.dtype.to_lowercase() == "f16" {
347            2
348        } else {
349            4
350        };
351        let row = (self.manifest.dim as usize) * elem;
352        ((row + 63) / 64) * 64
353    }
354
355    // BM25 search has moved to crate::bm25
356}
357
/// One 16-byte record of the lexicon file, decoded by `load_lexicon` as
/// little-endian `u64`, `u32`, `u32`.
// NOTE(review): field meanings below are inferred from the names; the code that
// consumes them lives outside this file (see crate::bm25) - confirm there.
#[derive(Debug, Clone, Copy)]
pub(crate) struct LexiconEntry {
    /// First 8 bytes of the record; presumably a byte offset into the postings data.
    pub(crate) offset: u64,
    /// Next 4 bytes of the record; presumably the size of the term's postings list.
    pub(crate) length: u32,
    /// Last 4 bytes of the record; presumably the term's document frequency.
    pub(crate) df: u32,
}
364
365fn load_lexicon(path: &Path) -> Result<Vec<LexiconEntry>> {
366    let mut f = File::open(path)?;
367    let mut buf = Vec::new();
368    f.read_to_end(&mut buf)?;
369    if buf.len() % 16 != 0 {
370        return Err(NvsError::InvalidBundle("lexicon size not multiple of 16"));
371    }
372    let mut v = Vec::with_capacity(buf.len() / 16);
373    let mut i = 0usize;
374    while i + 16 <= buf.len() {
375        let off = u64::from_le_bytes(buf[i..i + 8].try_into().unwrap());
376        i += 8;
377        let length = u32::from_le_bytes(buf[i..i + 4].try_into().unwrap());
378        i += 4;
379        let df = u32::from_le_bytes(buf[i..i + 4].try_into().unwrap());
380        i += 4;
381        v.push(LexiconEntry {
382            offset: off,
383            length,
384            df,
385        });
386    }
387    Ok(v)
388}
389
390fn load_terms(path: &Path) -> Result<HashMap<String, usize>> {
391    let mut f = File::open(path)?;
392    let mut buf = Vec::new();
393    f.read_to_end(&mut buf)?;
394    let mut m = HashMap::new();
395    let mut i = 0usize;
396    let mut id = 0usize;
397    while i + 4 <= buf.len() {
398        let len = u32::from_le_bytes(buf[i..i + 4].try_into().unwrap()) as usize;
399        i += 4;
400        if i + len > buf.len() {
401            break;
402        }
403        let s = String::from_utf8_lossy(&buf[i..i + len]).to_string();
404        i += len;
405        m.insert(s, id);
406        id += 1;
407    }
408    Ok(m)
409}
410
411#[cfg(test)]
412mod tests {
413    use super::*;
414    use crate::tokenizer::SimpleTokenizer;
415    use std::io::Write;
416    use std::time::{SystemTime, UNIX_EPOCH};
417
418    fn temp_dir(prefix: &str) -> PathBuf {
419        let ts = SystemTime::now()
420            .duration_since(UNIX_EPOCH)
421            .unwrap()
422            .as_millis();
423        let p = std::env::temp_dir().join(format!("{}_{}", prefix, ts));
424        let _ = fs::create_dir_all(&p);
425        p
426    }
427
428    fn write_manifest(root: &Path, num_docs: u64, dim: u64, block_size: u32) {
429        let manifest = format!(
430            r#"{{
431  "format": "nvs.v1",
432  "num_docs": {},
433  "dim": {},
434  "embedding": {{"model": "test", "dtype": "f32"}},
435  "bm25": {{"avgdl": 1.0, "k1": 1.2, "b": 0.75}},
436  "files": {{
437    "vectors": {{"path": "vectors.f32", "dtype": "f32", "rows": {}, "cols": {}}},
438    "doclen": {{"path": "doclen.u32", "dtype": "u32", "rows": {}}},
439    "lexicon": {{"path": "lexicon.bin"}},
440    "postings": {{"path": "postings.bin"}},
441    "terms": {{"path": "terms.dict"}},
442    "meta_idx": {{"path": "meta.idx", "schema": "u32 block_id, u32 offset, u32 doc_size"}},
443    "meta": {{"path": "meta.blocks", "block_size": {}, "doc_aligned": true}}
444  }}
445}}"#,
446            num_docs, dim, num_docs, dim, num_docs, block_size
447        );
448        let mut f = File::create(root.join("manifest.json")).unwrap();
449        f.write_all(manifest.as_bytes()).unwrap();
450    }
451
452    fn write_meta_blocks(root: &Path, block_count: u32, block_size: u32) {
453        let mut f = File::create(root.join("meta.blocks")).unwrap();
454        f.write_all(&block_count.to_le_bytes()).unwrap();
455        let hdr = [0u8; 16];
456        for _ in 0..block_count {
457            f.write_all(&hdr).unwrap();
458        }
459        let block = vec![0u8; block_size as usize];
460        for _ in 0..block_count {
461            f.write_all(&block).unwrap();
462        }
463    }
464
465    fn write_meta_idx(root: &Path, entries: usize) {
466        let mut f = File::create(root.join("meta.idx")).unwrap();
467        for _ in 0..entries {
468            let entry = MetaIdxEntry {
469                block_id: 0,
470                offset_in_block: 0,
471                doc_size: 0,
472                padding: 0,
473            };
474            let bytes: [u8; META_IDX_ENTRY_SIZE] = unsafe { std::mem::transmute(entry) };
475            f.write_all(&bytes).unwrap();
476        }
477    }
478
479    fn touch(root: &Path, name: &str) {
480        let _ = File::create(root.join(name)).unwrap();
481    }
482
483    #[test]
484    fn open_ok_with_valid_headers() {
485        let dir = temp_dir("nvs_rust_ok");
486        write_manifest(&dir, 3, 4, 256);
487        // write vectors file with correct padded size (zeros)
488        {
489            let row_bytes = (4usize) * 4;
490            let aligned_row_bytes = row_bytes.div_ceil(64) * 64;
491            let data = vec![0u8; (3usize) * aligned_row_bytes];
492            let mut f = File::create(dir.join("vectors.f32")).unwrap();
493            f.write_all(&data).unwrap();
494        }
495        // write doclen for 3 docs
496        {
497            let mut f = File::create(dir.join("doclen.u32")).unwrap();
498            for v in [0u32, 0u32, 0u32] {
499                f.write_all(&v.to_le_bytes()).unwrap();
500            }
501        }
502        touch(&dir, "lexicon.bin");
503        touch(&dir, "postings.bin");
504        touch(&dir, "terms.dict");
505        write_meta_idx(&dir, 3);
506        write_meta_blocks(&dir, 2, 256);
507
508        let b = Bundle::open(&dir).expect("bundle open");
509        assert_eq!(b.meta_block_size, 256);
510        assert_eq!(b.meta_block_count, 2);
511    }
512
513    #[test]
514    fn open_fails_on_meta_idx_count_mismatch() {
515        let dir = temp_dir("nvs_rust_bad_idx");
516        write_manifest(&dir, 2, 1, 128);
517        // vectors file with correct padded size (zeros)
518        {
519            let row_bytes = (1usize) * 4;
520            let aligned_row_bytes = row_bytes.div_ceil(64) * 64;
521            let data = vec![0u8; (1usize) * aligned_row_bytes];
522            let mut f = File::create(dir.join("vectors.f32")).unwrap();
523            f.write_all(&data).unwrap();
524        }
525        touch(&dir, "doclen.u32");
526        touch(&dir, "lexicon.bin");
527        touch(&dir, "postings.bin");
528        touch(&dir, "terms.dict");
529        write_meta_idx(&dir, 1); // should be 2
530        write_meta_blocks(&dir, 1, 128);
531
532        let err = Bundle::open(&dir).unwrap_err();
533        match err {
534            NvsError::InvalidBundle(_) => {}
535            _ => panic!("unexpected err"),
536        }
537    }
538
539    #[test]
540    fn open_fails_on_manifest_block_size_mismatch() {
541        let dir = temp_dir("nvs_rust_bad_bsz");
542        write_manifest(&dir, 1, 1, 128);
543        touch(&dir, "vectors.f32");
544        touch(&dir, "doclen.u32");
545        touch(&dir, "lexicon.bin");
546        touch(&dir, "postings.bin");
547        touch(&dir, "terms.dict");
548        write_meta_idx(&dir, 1);
549        // write meta.blocks with derived block size 256 (header: 1 block, then 256 bytes)
550        write_meta_blocks(&dir, 1, 256);
551
552        let err = Bundle::open(&dir).unwrap_err();
553        match err {
554            NvsError::InvalidBundle(_) => {}
555            _ => panic!("unexpected err"),
556        }
557    }
558
559    #[test]
560    fn bm25_small_corpus_ordering() {
561        let dir = temp_dir("nvs_rust_bm25");
562        // 3 docs, dim 1, block_size 128
563        write_manifest(&dir, 3, 1, 128);
564        // vectors file with correct padded size (zeros)
565        {
566            let dim = 1usize;
567            let num_docs = 3usize;
568            let row_bytes = dim * 4;
569            let aligned_row_bytes = row_bytes.div_ceil(64) * 64;
570            let data = vec![0u8; num_docs * aligned_row_bytes];
571            let mut f = File::create(dir.join("vectors.f32")).unwrap();
572            f.write_all(&data).unwrap();
573        }
574        // doclen: token counts per doc
575        {
576            let mut f = File::create(dir.join("doclen.u32")).unwrap();
577            // len(a)=3, len(b)=1, len(c)=3
578            for v in [3u32, 1u32, 3u32] {
579                f.write_all(&v.to_le_bytes()).unwrap();
580            }
581        }
582        // terms: apple, banana, cherry
583        {
584            let mut f = File::create(dir.join("terms.dict")).unwrap();
585            for s in ["apple", "banana", "cherry"] {
586                let len = s.len() as u32;
587                f.write_all(&len.to_le_bytes()).unwrap();
588                f.write_all(s.as_bytes()).unwrap();
589            }
590        }
591        // postings: each entry [delta,u32][tf,u32]
592        // apple in doc0(tf=3) and doc2(tf=1)
593        // banana in doc1(tf=3)
594        // cherry in doc2(tf=2)
595        let mut postings = Vec::<u8>::new();
596        let mut lex = Vec::<u8>::new();
597        let mut offset: u64 = 0;
598        let add_entry = |delta: u32, tf: u32, buf: &mut Vec<u8>| {
599            buf.extend_from_slice(&delta.to_le_bytes());
600            buf.extend_from_slice(&tf.to_le_bytes());
601        };
602        // apple: 2 entries
603        add_entry(0, 3, &mut postings); // doc0
604        add_entry(2, 1, &mut postings); // doc2 (prev=0 -> +2)
605        lex.extend_from_slice(&offset.to_le_bytes());
606        lex.extend_from_slice(&2u32.to_le_bytes());
607        lex.extend_from_slice(&2u32.to_le_bytes());
608        offset += 2 * 8;
609        // banana: 1 entry (doc1)
610        add_entry(1, 3, &mut postings);
611        lex.extend_from_slice(&offset.to_le_bytes());
612        lex.extend_from_slice(&1u32.to_le_bytes());
613        lex.extend_from_slice(&1u32.to_le_bytes());
614        offset += 1 * 8;
615        // cherry: 1 entry (doc2)
616        add_entry(1, 2, &mut postings); // from prev doc1 -> doc2 delta=1
617        lex.extend_from_slice(&offset.to_le_bytes());
618        lex.extend_from_slice(&1u32.to_le_bytes());
619        lex.extend_from_slice(&1u32.to_le_bytes());
620
621        {
622            let mut f = File::create(dir.join("postings.bin")).unwrap();
623            f.write_all(&postings).unwrap();
624            let mut lf = File::create(dir.join("lexicon.bin")).unwrap();
625            lf.write_all(&lex).unwrap();
626        }
627        // minimal meta files
628        write_meta_idx(&dir, 3);
629        write_meta_blocks(&dir, 1, 128);
630
631        let store = crate::VectorStore::from_bundle(Bundle::open(&dir).unwrap());
632        // Query apple should bring doc0 before doc2
633        let res = store.search_bm25("apple", 3);
634        assert!(!res.is_empty());
635        assert_eq!(res[0].0, 0);
636        // Scores should be non-increasing
637        for i in 1..res.len() {
638            assert!(res[i - 1].1 >= res[i].1, "bm25 scores must be sorted desc");
639        }
640        // Multi-term apple+banana likely keeps doc1 and doc0 in top 2
641        let res2 = store.search_bm25("apple banana", 3);
642        assert!(res2.iter().any(|&(id, _)| id == 0));
643        assert!(res2.iter().any(|&(id, _)| id == 1));
644        for i in 1..res2.len() {
645            assert!(
646                res2[i - 1].1 >= res2[i].1,
647                "bm25 scores must be sorted desc"
648            );
649        }
650    }
651
652    #[test]
653    fn bm25_sort_order_and_ties() {
654        use std::io::Write;
655        let dir = temp_dir("nvs_rust_bm25_ties");
656        // 3 docs, dim 1
657        write_manifest(&dir, 3, 1, 128);
658        // vectors: zeros with proper padding
659        {
660            let row_bytes = 4usize;
661            let aligned = row_bytes.div_ceil(64) * 64;
662            let data = vec![0u8; 3 * aligned];
663            let mut f = File::create(dir.join("vectors.f32")).unwrap();
664            f.write_all(&data).unwrap();
665        }
666        // doc lengths: all 1
667        {
668            let mut f = File::create(dir.join("doclen.u32")).unwrap();
669            for _ in 0..3 {
670                f.write_all(&1u32.to_le_bytes()).unwrap();
671            }
672        }
673        // terms: one term 'foo'
674        {
675            let mut f = File::create(dir.join("terms.dict")).unwrap();
676            let s = "foo";
677            f.write_all(&(s.len() as u32).to_le_bytes()).unwrap();
678            f.write_all(s.as_bytes()).unwrap();
679        }
680        // postings: foo appears once in doc0, doc1, doc2 (equal tf -> tie)
681        {
682            let mut postings = Vec::<u8>::new();
683            let mut lexicon = Vec::<u8>::new();
684            let offset: u64 = 0;
685            let add = |delta: u32, tf: u32, buf: &mut Vec<u8>| {
686                buf.extend_from_slice(&delta.to_le_bytes());
687                buf.extend_from_slice(&tf.to_le_bytes());
688            };
689            add(0, 1, &mut postings); // doc0
690            add(1, 1, &mut postings); // doc1
691            add(1, 1, &mut postings); // doc2
692            lexicon.extend_from_slice(&offset.to_le_bytes());
693            lexicon.extend_from_slice(&3u32.to_le_bytes());
694            lexicon.extend_from_slice(&3u32.to_le_bytes());
695            let mut pf = File::create(dir.join("postings.bin")).unwrap();
696            pf.write_all(&postings).unwrap();
697            let mut lf = File::create(dir.join("lexicon.bin")).unwrap();
698            lf.write_all(&lexicon).unwrap();
699        }
700        // meta files
701        write_meta_idx(&dir, 3);
702        write_meta_blocks(&dir, 1, 128);
703
704        let store = crate::VectorStore::from_bundle(Bundle::open(&dir).unwrap());
705        let res = store.search_bm25("foo", 3);
706        assert_eq!(res.len(), 3);
707        // Scores must be non-increasing
708        for i in 1..res.len() {
709            assert!(res[i - 1].1 >= res[i].1, "bm25 scores must be sorted desc");
710        }
711        // Ties must be broken by id asc (0,1,2)
712        assert_eq!(res[0].0, 0);
713        assert_eq!(res[1].0, 1);
714        assert_eq!(res[2].0, 2);
715    }
716
717    #[test]
718    fn vector_search_small() {
719        use std::io::Write;
720        let dir = temp_dir("nvs_rust_vec");
721        let num_docs = 4u64;
722        let dim = 4u64;
723        let block = 128u32;
724        write_manifest(&dir, num_docs, dim, block);
725        // Write vectors: identity rows padded to 64B
726        {
727            let row_bytes = (dim as usize) * 4;
728            let aligned_row_bytes = row_bytes.div_ceil(64) * 64;
729            let mut data = vec![0u8; (num_docs as usize) * aligned_row_bytes];
730            for i in 0..(num_docs as usize) {
731                for j in 0..(dim as usize) {
732                    let v = if i == j { 1f32 } else { 0f32 };
733                    let off = i * aligned_row_bytes + j * 4;
734                    data[off..off + 4].copy_from_slice(&v.to_le_bytes());
735                }
736            }
737            let mut f = File::create(dir.join("vectors.f32")).unwrap();
738            f.write_all(&data).unwrap();
739        }
740        // minimal bm25 files
741        {
742            let mut f = File::create(dir.join("doclen.u32")).unwrap();
743            for _ in 0..num_docs {
744                f.write_all(&0u32.to_le_bytes()).unwrap();
745            }
746        }
747        File::create(dir.join("lexicon.bin")).unwrap();
748        File::create(dir.join("postings.bin")).unwrap();
749        File::create(dir.join("terms.dict")).unwrap();
750        write_meta_idx(&dir, num_docs as usize);
751        write_meta_blocks(&dir, 1, 128);
752
753        let store = crate::VectorStore::from_bundle(Bundle::open(&dir).unwrap());
754        let q = [1f32, 0f32, 0f32, 0f32];
755        let res = store.search_vector(&q, 3);
756        assert!(!res.is_empty());
757        // Top-1 should be doc 0
758        assert_eq!(res[0].0, 0);
759        // Scores should be non-increasing
760        for i in 1..res.len() {
761            assert!(res[i - 1].1 >= res[i].1);
762        }
763        // Determinism
764        let res2 = store.search_vector(&q, 3);
765        assert_eq!(res, res2);
766    }
767
768    #[test]
769    fn get_document_basic() {
770        use std::io::Write;
771        let dir = temp_dir("nvs_rust_getdoc");
772        write_manifest(&dir, 2, 1, 128);
773        // vectors
774        {
775            let row_bytes = 4usize;
776            let aligned = row_bytes.div_ceil(64) * 64;
777            let data = vec![0u8; 2 * aligned];
778            let mut f = File::create(dir.join("vectors.f32")).unwrap();
779            f.write_all(&data).unwrap();
780        }
781        // doclen
782        {
783            let mut f = File::create(dir.join("doclen.u32")).unwrap();
784            for _ in 0..2 {
785                f.write_all(&0u32.to_le_bytes()).unwrap();
786            }
787        }
788        // empty bm25 index files
789        File::create(dir.join("lexicon.bin")).unwrap();
790        File::create(dir.join("postings.bin")).unwrap();
791        File::create(dir.join("terms.dict")).unwrap();
792        // meta.blocks with 1 block and 2 docs
793        let (id0, text0, meta0) = ("a", "text a", "{\"k\":1}");
794        let (id1, text1, meta1) = ("b", "text b", "{\"k\":2}");
795        let rec_size = |id: &str, tx: &str, mj: &str| 4 + id.len() + 4 + tx.len() + 4 + mj.len();
796        let s0 = rec_size(id0, text0, meta0);
797        let s1 = rec_size(id1, text1, meta1);
798        let mut mb = Vec::<u8>::new();
799        // block_count = 1
800        mb.extend_from_slice(&1u32.to_le_bytes());
801        // header for block 0: [id, usize, dcount, pad]
802        mb.extend_from_slice(&0u32.to_le_bytes());
803        mb.extend_from_slice(&(s0 as u32 + s1 as u32).to_le_bytes());
804        mb.extend_from_slice(&2u32.to_le_bytes());
805        mb.extend_from_slice(&0u32.to_le_bytes());
806        // block data
807        let write_rec = |id: &str, tx: &str, mj: &str, buf: &mut Vec<u8>| {
808            buf.extend_from_slice(&(id.len() as u32).to_le_bytes());
809            buf.extend_from_slice(id.as_bytes());
810            buf.extend_from_slice(&(tx.len() as u32).to_le_bytes());
811            buf.extend_from_slice(tx.as_bytes());
812            buf.extend_from_slice(&(mj.len() as u32).to_le_bytes());
813            buf.extend_from_slice(mj.as_bytes());
814        };
815        write_rec(id0, text0, meta0, &mut mb);
816        write_rec(id1, text1, meta1, &mut mb);
817        // pad to block_size 128
818        let block_size = 128usize;
819        let _header_size = 4 + 1 * 16;
820        let data_len = s0 + s1;
821        let pad_len = block_size - data_len;
822        mb.extend(std::iter::repeat(0u8).take(pad_len));
823        // write file
824        let mut fmb = File::create(dir.join("meta.blocks")).unwrap();
825        fmb.write_all(&mb).unwrap();
826        // meta.idx entries
827        {
828            let mut idx = Vec::<u8>::new();
829            idx.extend_from_slice(&0u32.to_le_bytes());
830            idx.extend_from_slice(&0u32.to_le_bytes());
831            idx.extend_from_slice(&(s0 as u32).to_le_bytes());
832            idx.extend_from_slice(&0u32.to_le_bytes());
833            idx.extend_from_slice(&0u32.to_le_bytes());
834            idx.extend_from_slice(&(s0 as u32).to_le_bytes());
835            idx.extend_from_slice(&(s1 as u32).to_le_bytes());
836            idx.extend_from_slice(&0u32.to_le_bytes());
837            let mut fi = File::create(dir.join("meta.idx")).unwrap();
838            fi.write_all(&idx).unwrap();
839        }
840
841        let b = Bundle::open(&dir).unwrap();
842        let d0 = b.get_document(0).unwrap();
843        assert_eq!(d0.0, "a");
844        assert!(d0.1.contains("text a"));
845        assert!(d0.2.contains("\"k\":1"));
846        let d1 = b.get_document(1).unwrap();
847        assert_eq!(d1.0, "b");
848        assert!(d1.1.contains("text b"));
849        assert!(d1.2.contains("\"k\":2"));
850    }
851
852    #[test]
853    fn hybrid_extremes_vector_and_bm25() {
854        use std::io::Write;
855        // Build a bundle where BM25 has signal and vectors are zero; weight 0.0 follows BM25
856        let dir = temp_dir("nvs_rust_hybrid_bm25");
857        write_manifest(&dir, 3, 1, 128);
858        // vectors: zeros
859        {
860            let row_bytes = 4usize;
861            let aligned = row_bytes.div_ceil(64) * 64;
862            let data = vec![0u8; 3 * aligned];
863            let mut f = File::create(dir.join("vectors.f32")).unwrap();
864            f.write_all(&data).unwrap();
865        }
866        // doclen
867        {
868            let mut f = File::create(dir.join("doclen.u32")).unwrap();
869            for _ in 0..3 {
870                f.write_all(&1u32.to_le_bytes()).unwrap();
871            }
872        }
873        // terms: one term 'apple'
874        {
875            let mut f = File::create(dir.join("terms.dict")).unwrap();
876            let s = "apple";
877            f.write_all(&(s.len() as u32).to_le_bytes()).unwrap();
878            f.write_all(s.as_bytes()).unwrap();
879        }
880        // postings: apple in doc1 only
881        {
882            let mut lf = File::create(dir.join("lexicon.bin")).unwrap();
883            let mut pf = File::create(dir.join("postings.bin")).unwrap();
884            // offset=0, length=1, df=1
885            lf.write_all(&0u64.to_le_bytes()).unwrap();
886            lf.write_all(&1u32.to_le_bytes()).unwrap();
887            lf.write_all(&1u32.to_le_bytes()).unwrap();
888            // posting [delta=1, tf=1]
889            pf.write_all(&1u32.to_le_bytes()).unwrap();
890            pf.write_all(&1u32.to_le_bytes()).unwrap();
891        }
892        write_meta_idx(&dir, 3);
893        write_meta_blocks(&dir, 1, 128);
894        let store = crate::VectorStore::from_bundle(Bundle::open(&dir).unwrap());
895        let v = [1f32];
896        let hv = store.search_hybrid(&v, "apple", 2, 0.0);
897        assert_eq!(hv[0].0, 1, "bm25 extreme should rank doc1 first");
898
899        // Build a bundle where BM25 is empty and vectors are identity; weight 1.0 follows vectors
900        let dir2 = temp_dir("nvs_rust_hybrid_vec");
901        write_manifest(&dir2, 3, 3, 128);
902        {
903            let dim = 3usize;
904            let n = 3usize;
905            let row_bytes = dim * 4;
906            let aligned = row_bytes.div_ceil(64) * 64;
907            let mut data = vec![0u8; n * aligned];
908            for i in 0..n {
909                for j in 0..dim {
910                    let v = if i == j { 1f32 } else { 0f32 };
911                    let off = i * aligned + j * 4;
912                    data[off..off + 4].copy_from_slice(&v.to_le_bytes());
913                }
914            }
915            let mut f = File::create(dir2.join("vectors.f32")).unwrap();
916            f.write_all(&data).unwrap();
917        }
918        {
919            let mut f = File::create(dir2.join("doclen.u32")).unwrap();
920            for _ in 0..3 {
921                f.write_all(&0u32.to_le_bytes()).unwrap();
922            }
923        }
924        File::create(dir2.join("lexicon.bin")).unwrap();
925        File::create(dir2.join("postings.bin")).unwrap();
926        File::create(dir2.join("terms.dict")).unwrap();
927        write_meta_idx(&dir2, 3);
928        write_meta_blocks(&dir2, 1, 128);
929        let store2 = crate::VectorStore::from_bundle(Bundle::open(&dir2).unwrap());
930        let q = [1f32, 0f32, 0f32];
931        let hv2 = store2.search_hybrid(&q, "unused", 2, 1.0);
932        assert_eq!(hv2[0].0, 0, "vector extreme should rank doc0 first");
933    }
934
935    // --- E2E-like pack-then-open helpers and tests ---
    // Input document for the `pack_bundle` test helper.
    #[derive(Clone)]
    struct TDoc {
        // Document id; stored as the first field of the document's metadata record.
        id: String,
        // Document text; tokenized for the BM25 files and stored in the metadata record.
        text: String,
        // Embedding row written to `vectors.f32`; `pack_bundle` asserts its length equals `dim`.
        embedding: Vec<f32>,
    }
942
943    fn pack_bundle(dir: &Path, docs: &[TDoc], dim: usize, block_size: usize) {
944        // Vectors (64B aligned rows)
945        {
946            let row_bytes = dim * 4;
947            let aligned = row_bytes.div_ceil(64) * 64;
948            let mut data = vec![0u8; docs.len() * aligned];
949            for (i, d) in docs.iter().enumerate() {
950                assert_eq!(d.embedding.len(), dim);
951                for j in 0..dim {
952                    let off = i * aligned + j * 4;
953                    data[off..off + 4].copy_from_slice(&d.embedding[j].to_le_bytes());
954                }
955            }
956            let mut f = File::create(dir.join("vectors.f32")).unwrap();
957            f.write_all(&data).unwrap();
958        }
959        // Tokenize and collect BM25 stats
960        let tok = SimpleTokenizer::new();
961        let mut doc_tokens: Vec<Vec<String>> = Vec::with_capacity(docs.len());
962        let mut df_map: HashMap<String, usize> = HashMap::new();
963        let mut postings_map: HashMap<String, Vec<(usize, u32)>> = HashMap::new();
964        for (i, d) in docs.iter().enumerate() {
965            let tokens = tok.split(&d.text);
966            let mut tf: HashMap<&str, u32> = HashMap::new();
967            for t in &tokens {
968                *tf.entry(t.as_str()).or_insert(0) += 1;
969            }
970            for (term, &count) in tf.iter() {
971                postings_map
972                    .entry(term.to_string())
973                    .or_default()
974                    .push((i, count));
975            }
976            for term in tf.keys() {
977                *df_map.entry((*term).to_string()).or_insert(0) += 1;
978            }
979            doc_tokens.push(tokens);
980        }
981        // doclen
982        {
983            let mut f = File::create(dir.join("doclen.u32")).unwrap();
984            for tokens in &doc_tokens {
985                let len = tokens.len() as u32;
986                f.write_all(&len.to_le_bytes()).unwrap();
987            }
988        }
989        // Terms sorted for consistent IDs
990        let mut terms: Vec<String> = postings_map.keys().cloned().collect();
991        terms.sort();
992        {
993            let mut f = File::create(dir.join("terms.dict")).unwrap();
994            for t in &terms {
995                let len = t.len() as u32;
996                f.write_all(&len.to_le_bytes()).unwrap();
997                f.write_all(t.as_bytes()).unwrap();
998            }
999        }
1000        // Build postings.bin and lexicon.bin
1001        {
1002            let mut postings = Vec::<u8>::new();
1003            let mut lexicon = Vec::<u8>::new();
1004            let mut offset: u64 = 0;
1005            for t in &terms {
1006                let mut list = postings_map.get(t).cloned().unwrap_or_default();
1007                list.sort_by_key(|&(doc, _)| doc);
1008                let mut prev = 0usize;
1009                let mut length = 0u32;
1010                for (doc, tf) in list.into_iter() {
1011                    let delta = (doc - prev) as u32;
1012                    prev = doc;
1013                    length += 1;
1014                    postings.extend_from_slice(&delta.to_le_bytes());
1015                    postings.extend_from_slice(&tf.to_le_bytes());
1016                }
1017                let df = *df_map.get(t).unwrap_or(&0) as u32;
1018                lexicon.extend_from_slice(&offset.to_le_bytes());
1019                lexicon.extend_from_slice(&length.to_le_bytes());
1020                lexicon.extend_from_slice(&df.to_le_bytes());
1021                offset += (length as u64) * 8;
1022            }
1023            let mut pf = File::create(dir.join("postings.bin")).unwrap();
1024            pf.write_all(&postings).unwrap();
1025            let mut lf = File::create(dir.join("lexicon.bin")).unwrap();
1026            lf.write_all(&lexicon).unwrap();
1027        }
1028        // Build meta.blocks and meta.idx
1029        let mut blocks: Vec<Vec<u8>> = Vec::new();
1030        let mut headers: Vec<(u32, u32, u32, u32)> = Vec::new();
1031        let mut idx: Vec<u8> = Vec::new();
1032        let mut cur = Vec::<u8>::with_capacity(block_size);
1033        let mut cur_usize = 0u32;
1034        let mut cur_docs = 0u32;
1035        let mut block_id = 0u32;
1036        for d in docs {
1037            let meta_json = format!(
1038                "{{\"embedding\":[{}]}}",
1039                d.embedding
1040                    .iter()
1041                    .map(|v| v.to_string())
1042                    .collect::<Vec<_>>()
1043                    .join(",")
1044            );
1045            let rec_size = 4 + d.id.len() + 4 + d.text.len() + 4 + meta_json.len();
1046            if cur_docs > 0 && cur_usize as usize + rec_size > block_size {
1047                headers.push((block_id, cur_usize, cur_docs, 0));
1048                blocks.push(std::mem::take(&mut cur));
1049                cur = Vec::with_capacity(block_size);
1050                cur_usize = 0;
1051                cur_docs = 0;
1052                block_id += 1;
1053            }
1054            // idx entry
1055            idx.extend_from_slice(&block_id.to_le_bytes());
1056            idx.extend_from_slice(&cur_usize.to_le_bytes());
1057            idx.extend_from_slice(&(rec_size as u32).to_le_bytes());
1058            idx.extend_from_slice(&0u32.to_le_bytes());
1059            // write record
1060            cur.extend_from_slice(&(d.id.len() as u32).to_le_bytes());
1061            cur.extend_from_slice(d.id.as_bytes());
1062            cur.extend_from_slice(&(d.text.len() as u32).to_le_bytes());
1063            cur.extend_from_slice(d.text.as_bytes());
1064            cur.extend_from_slice(&(meta_json.len() as u32).to_le_bytes());
1065            cur.extend_from_slice(meta_json.as_bytes());
1066            cur_usize += rec_size as u32;
1067            cur_docs += 1;
1068        }
1069        if cur_docs > 0 {
1070            headers.push((block_id, cur_usize, cur_docs, 0));
1071            blocks.push(cur);
1072        }
1073        // meta.blocks: write header and padded blocks
1074        {
1075            let mut f = File::create(dir.join("meta.blocks")).unwrap();
1076            f.write_all(&(headers.len() as u32).to_le_bytes()).unwrap();
1077            for (id, usizeb, dcount, pad) in &headers {
1078                f.write_all(&id.to_le_bytes()).unwrap();
1079                f.write_all(&usizeb.to_le_bytes()).unwrap();
1080                f.write_all(&dcount.to_le_bytes()).unwrap();
1081                f.write_all(&pad.to_le_bytes()).unwrap();
1082            }
1083            for b in &blocks {
1084                f.write_all(&b).unwrap();
1085                if b.len() < block_size {
1086                    f.write_all(&vec![0u8; block_size - b.len()]).unwrap();
1087                }
1088            }
1089        }
1090        // meta.idx
1091        {
1092            let mut f = File::create(dir.join("meta.idx")).unwrap();
1093            f.write_all(&idx).unwrap();
1094        }
1095        // manifest
1096        {
1097            let manifest = format!(
1098                r#"{{
1099  "format": "nvs.v1",
1100  "num_docs": {},
1101  "dim": {},
1102  "embedding": {{"model": "test", "dtype": "f32"}},
1103  "bm25": {{"avgdl": 1.0, "k1": 1.2, "b": 0.75}},
1104  "files": {{
1105    "vectors": {{"path": "vectors.f32", "dtype": "f32", "rows": {}, "cols": {}}},
1106    "doclen": {{"path": "doclen.u32", "dtype": "u32", "rows": {}}},
1107    "lexicon": {{"path": "lexicon.bin"}},
1108    "postings": {{"path": "postings.bin"}},
1109    "terms": {{"path": "terms.dict"}},
1110    "meta_idx": {{"path": "meta.idx", "schema": "u32 block_id, u32 offset, u32 doc_size"}},
1111    "meta": {{"path": "meta.blocks", "block_size": {}, "doc_aligned": true}}
1112  }}
1113}}"#,
1114                docs.len(),
1115                dim,
1116                docs.len(),
1117                dim,
1118                docs.len(),
1119                block_size
1120            );
1121            let mut f = File::create(dir.join("manifest.json")).unwrap();
1122            f.write_all(manifest.as_bytes()).unwrap();
1123        }
1124        // checksums.xxhash64
1125        {
1126            use xxhash_rust::xxh64::xxh64;
1127            let files = [
1128                "manifest.json",
1129                "vectors.f32",
1130                "doclen.u32",
1131                "lexicon.bin",
1132                "postings.bin",
1133                "terms.dict",
1134                "meta.idx",
1135                "meta.blocks",
1136            ];
1137            let mut out = String::new();
1138            for name in files {
1139                let path = dir.join(name);
1140                let mut buf = Vec::new();
1141                File::open(&path).unwrap().read_to_end(&mut buf).unwrap();
1142                let h = xxh64(&buf, 0);
1143                out.push_str(&format!("{h:016x}  {name}\n"));
1144            }
1145            let mut f = File::create(dir.join("checksums.xxhash64")).unwrap();
1146            f.write_all(out.as_bytes()).unwrap();
1147        }
1148    }
1149
1150    #[test]
1151    fn corpus_semantic_sanity() {
1152        use rand::rngs::StdRng;
1153        use rand::seq::SliceRandom;
1154        use rand::{Rng, SeedableRng};
1155
1156        // Build a medium corpus across three topical clusters with synthetic embeddings.
1157        // Text contains topical keywords so BM25 should agree with vector similarity.
1158        #[derive(Clone)]
1159        struct Topic {
1160            name: &'static str,
1161            keywords: &'static [&'static str],
1162        }
1163        let topics = [
1164            Topic {
1165                name: "physics",
1166                keywords: &[
1167                    "quantum", "particle", "wave", "electron", "photon", "field", "spin", "energy",
1168                ],
1169            },
1170            Topic {
1171                name: "cooking",
1172                keywords: &[
1173                    "recipe",
1174                    "cook",
1175                    "bake",
1176                    "ingredients",
1177                    "oven",
1178                    "simmer",
1179                    "spice",
1180                    "kitchen",
1181                ],
1182            },
1183            Topic {
1184                name: "finance",
1185                keywords: &[
1186                    "market",
1187                    "stock",
1188                    "investment",
1189                    "portfolio",
1190                    "risk",
1191                    "returns",
1192                    "capital",
1193                    "trading",
1194                ],
1195            },
1196        ];
1197
1198        let dim = 64usize;
1199        let per_topic = 30usize; // total 90 docs
1200        let block_size = 8192usize;
1201        let dir = temp_dir("nvs_rust_corpus_semantic");
1202
1203        // Create per-topic centroids
1204        let mut rng = StdRng::seed_from_u64(42);
1205        let mut centroids: Vec<Vec<f32>> = Vec::new();
1206        for _ in 0..topics.len() {
1207            let mut v: Vec<f32> = (0..dim).map(|_| rng.gen_range(-0.5f32..0.5f32)).collect();
1208            // normalize
1209            let n = v.iter().map(|x| x * x).sum::<f32>().sqrt().max(1e-6);
1210            for x in &mut v {
1211                *x /= n;
1212            }
1213            centroids.push(v);
1214        }
1215
1216        // Generate documents
1217        let mut docs: Vec<TDoc> = Vec::with_capacity(topics.len() * per_topic);
1218        for (ti, topic) in topics.iter().enumerate() {
1219            for j in 0..per_topic {
1220                // Build simple text with 4 keywords shuffled
1221                let mut idxs: Vec<usize> = (0..topic.keywords.len()).collect();
1222                idxs.shuffle(&mut rng);
1223                let kw = [
1224                    topic.keywords[idxs[0]],
1225                    topic.keywords[idxs[1]],
1226                    topic.keywords[idxs[2]],
1227                    topic.keywords[idxs[3]],
1228                ];
1229                let text = format!(
1230                    "{} {} discussed here. We also mention {} and {} in this paragraph about {}.",
1231                    kw[0], kw[1], kw[2], kw[3], topic.name
1232                );
1233
1234                // Embedding = centroid + small noise
1235                let base = &centroids[ti];
1236                let mut e = vec![0f32; dim];
1237                for d in 0..dim {
1238                    let noise: f32 = rng.gen_range(-0.03..0.03);
1239                    e[d] = base[d] + noise;
1240                }
1241                // renormalize
1242                let n = e.iter().map(|x| x * x).sum::<f32>().sqrt().max(1e-6);
1243                for x in &mut e {
1244                    *x /= n;
1245                }
1246
1247                let id = format!("{}-{:02}", topic.name, j);
1248                docs.push(TDoc {
1249                    id,
1250                    text,
1251                    embedding: e,
1252                });
1253            }
1254        }
1255
1256        // Pack bundle
1257        pack_bundle(&dir, &docs, dim, block_size);
1258        let store = crate::VectorStore::open(&dir).expect("open bundle");
1259        assert_eq!(store.size(), topics.len() * per_topic);
1260        assert_eq!(store.dimensions(), dim);
1261
1262        // Helper: extract topic from id
1263        let topic_of = |doc_id: u32| -> String {
1264            store
1265                .get_document(doc_id)
1266                .unwrap()
1267                .0
1268                .split('-')
1269                .next()
1270                .unwrap()
1271                .to_string()
1272        };
1273
1274        // For each topic, build a vector query from the centroid and a BM25 query from 2-3 keywords
1275        for (ti, topic) in topics.iter().enumerate() {
1276            // Vector query
1277            let qv = centroids[ti].clone();
1278            let vres = store.search_vector(&qv, 10);
1279            assert!(!vres.is_empty());
1280            let top_topic = topic_of(vres[0].0);
1281            assert_eq!(top_topic, topic.name, "vector top-1 should match topic");
1282            let same_count = vres
1283                .iter()
1284                .filter(|(id, _)| topic_of(*id) == topic.name)
1285                .count();
1286            assert!(
1287                same_count >= 7,
1288                "expected >=7/10 same-topic in vector search, got {}",
1289                same_count
1290            );
1291
1292            // BM25 query text from two keywords
1293            let qtext = format!("{} {}", topic.keywords[0], topic.keywords[1]);
1294            let bres = store.search_bm25(&qtext, 10);
1295            assert!(!bres.is_empty());
1296            let top_topic_b = topic_of(bres[0].0);
1297            assert_eq!(top_topic_b, topic.name, "bm25 top-1 should match topic");
1298            let same_count_b = bres
1299                .iter()
1300                .filter(|(id, _)| topic_of(*id) == topic.name)
1301                .count();
1302            assert!(
1303                same_count_b >= 6,
1304                "expected >=6/10 same-topic in BM25, got {}",
1305                same_count_b
1306            );
1307
1308            // Hybrid query
1309            let hres = store.search_hybrid(&qv, &qtext, 10, 0.5);
1310            assert!(!hres.is_empty());
1311            let top_topic_h = topic_of(hres[0].0);
1312            assert_eq!(top_topic_h, topic.name, "hybrid top-1 should match topic");
1313            let same_count_h = hres
1314                .iter()
1315                .filter(|(id, _)| topic_of(*id) == topic.name)
1316                .count();
1317            assert!(
1318                same_count_h >= 7,
1319                "expected >=7/10 same-topic in hybrid, got {}",
1320                same_count_h
1321            );
1322        }
1323    }
1324
1325    #[test]
1326    fn e2e_pack_then_open_single_block() {
1327        let dir_in = temp_dir("nvs_rust_e2e_in_single");
1328        let docs = vec![
1329            TDoc {
1330                id: "doc0".into(),
1331                text: "doc text number 0".into(),
1332                embedding: vec![1.0, 0.0, 0.0, 0.0],
1333            },
1334            TDoc {
1335                id: "doc1".into(),
1336                text: "doc text number 1".into(),
1337                embedding: vec![1.0, 0.0, 0.0, 0.0],
1338            },
1339            TDoc {
1340                id: "doc2".into(),
1341                text: "doc text number 2".into(),
1342                embedding: vec![1.0, 0.0, 0.0, 0.0],
1343            },
1344        ];
1345        pack_bundle(&dir_in, &docs, 4, 131072);
1346        let store = crate::VectorStore::from_bundle(Bundle::open(&dir_in).unwrap());
1347        assert_eq!(store.size(), 3);
1348        assert_eq!(store.dimensions(), 4);
1349        let d0 = store.get_document(0).unwrap();
1350        assert_eq!(d0.0, "doc0");
1351        assert!(d0.1.contains("doc text number 0"));
1352        assert!(d0.2.contains("\"embedding\""));
1353        let d2 = store.get_document(2).unwrap();
1354        assert_eq!(d2.0, "doc2");
1355        assert!(d2.1.contains("doc text number 2"));
1356        let q = [1f32, 0f32, 0f32, 0f32];
1357        let res = store.search_vector(&q, 2);
1358        assert!(!res.is_empty());
1359    }
1360
1361    #[test]
1362    fn e2e_pack_then_open_multiple_blocks() {
1363        let dir_in = temp_dir("nvs_rust_e2e_in_multi");
1364        let mut docs = Vec::new();
1365        for i in 0..10 {
1366            docs.push(TDoc {
1367                id: format!("m{i}"),
1368                text: format!("m text number {i}"),
1369                embedding: vec![1.0, 0.0, 0.0, 0.0],
1370            });
1371        }
1372        pack_bundle(&dir_in, &docs, 4, 256);
1373        let store = crate::VectorStore::from_bundle(Bundle::open(&dir_in).unwrap());
1374        assert_eq!(store.size(), 10);
1375        let d0 = store.get_document(0).unwrap();
1376        assert_eq!(d0.0, "m0");
1377        let d9 = store.get_document(9).unwrap();
1378        assert_eq!(d9.0, "m9");
1379        for i in 0..10 {
1380            let d = store.get_document(i).unwrap();
1381            assert_eq!(d.0, format!("m{i}"));
1382        }
1383    }
1384
1385    #[test]
1386    fn e2e_block_headers_and_checksums() {
1387        let dir_in = temp_dir("nvs_rust_e2e_hdr");
1388        let mut docs = Vec::new();
1389        for i in 0..10 {
1390            docs.push(TDoc {
1391                id: format!("h{i}"),
1392                text: format!("h text {i}"),
1393                embedding: vec![1.0, 0.0, 0.0, 0.0],
1394            });
1395        }
1396        pack_bundle(&dir_in, &docs, 4, 256);
1397        // Parse meta.blocks
1398        {
1399            let mut f = File::open(dir_in.join("meta.blocks")).unwrap();
1400            let mut buf = Vec::new();
1401            f.read_to_end(&mut buf).unwrap();
1402            let mut p = 0usize;
1403            let block_count = u32::from_le_bytes(buf[p..p + 4].try_into().unwrap()) as usize;
1404            p += 4;
1405            let mut hdrs = Vec::new();
1406            for _ in 0..block_count {
1407                let id = u32::from_le_bytes(buf[p..p + 4].try_into().unwrap());
1408                let usizeb = u32::from_le_bytes(buf[p + 4..p + 8].try_into().unwrap());
1409                let dcount = u32::from_le_bytes(buf[p + 8..p + 12].try_into().unwrap());
1410                let pad = u32::from_le_bytes(buf[p + 12..p + 16].try_into().unwrap());
1411                p += 16;
1412                hdrs.push((id, usizeb, dcount, pad));
1413            }
1414            let total_size = buf.len();
1415            let header_size = 4 + block_count * 16;
1416            let block_size = (total_size - header_size) / block_count;
1417            assert!(block_size > 0);
1418            let mut total_docs = 0usize;
1419            for i in 0..block_count {
1420                let (_id, usizeb, _dcount, _) = hdrs[i];
1421                let start = header_size + i * block_size;
1422                let mut consumed = 0usize;
1423                let mut pos = start;
1424                while consumed < usizeb as usize {
1425                    let idl = u32::from_le_bytes(buf[pos..pos + 4].try_into().unwrap()) as usize;
1426                    pos += 4;
1427                    consumed += 4;
1428                    pos += idl;
1429                    consumed += idl;
1430                    let tl = u32::from_le_bytes(buf[pos..pos + 4].try_into().unwrap()) as usize;
1431                    pos += 4;
1432                    consumed += 4;
1433                    pos += tl;
1434                    consumed += tl;
1435                    let ml = u32::from_le_bytes(buf[pos..pos + 4].try_into().unwrap()) as usize;
1436                    pos += 4;
1437                    consumed += 4;
1438                    pos += ml;
1439                    consumed += ml;
1440                    total_docs += 1;
1441                }
1442                assert_eq!(consumed, usizeb as usize);
1443                assert_eq!(
1444                    total_docs as u32,
1445                    hdrs.iter().map(|h| h.2).take(i + 1).sum::<u32>()
1446                );
1447            }
1448            assert_eq!(total_docs, 10);
1449        }
1450        // Checksums format sanity
1451        {
1452            let mut s = String::new();
1453            File::open(dir_in.join("checksums.xxhash64"))
1454                .unwrap()
1455                .read_to_string(&mut s)
1456                .unwrap();
1457            let mut seen = 0;
1458            for line in s.lines() {
1459                if line.is_empty() {
1460                    continue;
1461                }
1462                let mut parts = line.split("  ");
1463                let hex = parts.next().unwrap();
1464                let fname = parts.next().unwrap_or("");
1465                assert_eq!(hex.len(), 16);
1466                assert!(hex
1467                    .chars()
1468                    .all(|c| c.is_ascii_hexdigit() && c.is_lowercase() || c.is_ascii_digit()));
1469                assert!(Path::new(&dir_in).join(fname).exists());
1470                seen += 1;
1471            }
1472            assert!(seen >= 5);
1473        }
1474    }
1475
1476    #[test]
1477    fn e2e_vector_search_f16() {
1478        use half::f16;
1479        // Build a minimal f16 bundle and ensure vector search works
1480        let dir = temp_dir("nvs_rust_e2e_f16");
1481        let num_docs = 3usize;
1482        let dim = 4usize;
1483        let block_size = 128u32;
1484
1485        // Write vectors.f16 with identity-like rows, 64B aligned
1486        {
1487            let row_bytes = dim * 2; // f16
1488            let aligned = row_bytes.div_ceil(64) * 64;
1489            let mut data = vec![0u8; num_docs * aligned];
1490            for i in 0..num_docs {
1491                for j in 0..dim {
1492                    let v = if i == j { 1.0f32 } else { 0.0f32 };
1493                    let h = f16::from_f32(v);
1494                    let off = i * aligned + j * 2;
1495                    data[off..off + 2].copy_from_slice(&h.to_le_bytes());
1496                }
1497            }
1498            let mut f = File::create(dir.join("vectors.f16")).unwrap();
1499            f.write_all(&data).unwrap();
1500        }
1501
1502        // doclen for num_docs (zeros)
1503        {
1504            let mut f = File::create(dir.join("doclen.u32")).unwrap();
1505            for _ in 0..num_docs {
1506                f.write_all(&0u32.to_le_bytes()).unwrap();
1507            }
1508        }
1509        // Empty bm25 files
1510        File::create(dir.join("lexicon.bin")).unwrap();
1511        File::create(dir.join("postings.bin")).unwrap();
1512        File::create(dir.join("terms.dict")).unwrap();
1513
1514        // Minimal meta files (no actual doc content needed for this test)
1515        write_meta_idx(&dir, num_docs);
1516        write_meta_blocks(&dir, 1, block_size);
1517
1518        // Write manifest pointing to f16 vectors
1519        {
1520            let manifest = format!(
1521                r#"{{
1522  "format": "nvs.v1",
1523  "num_docs": {},
1524  "dim": {},
1525  "embedding": {{"model": "test", "dtype": "f16"}},
1526  "bm25": {{"avgdl": 0.0, "k1": 1.2, "b": 0.75}},
1527  "files": {{
1528    "vectors": {{"path": "vectors.f16", "dtype": "f16", "rows": {}, "cols": {}}},
1529    "doclen": {{"path": "doclen.u32", "dtype": "u32", "rows": {}}},
1530    "lexicon": {{"path": "lexicon.bin"}},
1531    "postings": {{"path": "postings.bin"}},
1532    "terms": {{"path": "terms.dict"}},
1533    "meta_idx": {{"path": "meta.idx", "schema": "u32 block_id, u32 offset, u32 doc_size"}},
1534    "meta": {{"path": "meta.blocks", "block_size": {}, "doc_aligned": true}}
1535  }}
1536}}"#,
1537                num_docs, dim, num_docs, dim, num_docs, block_size
1538            );
1539            let mut f = File::create(dir.join("manifest.json")).unwrap();
1540            f.write_all(manifest.as_bytes()).unwrap();
1541        }
1542
1543        // Open and run vector search
1544        let store = crate::VectorStore::from_bundle(Bundle::open(&dir).unwrap());
1545        assert_eq!(store.size(), num_docs);
1546        assert_eq!(store.dimensions(), dim);
1547        let q = [1f32, 0f32, 0f32, 0f32];
1548        let res = store.search_vector(&q, 3);
1549        assert!(!res.is_empty());
1550        assert_eq!(res[0].0, 0);
1551    }
1552}