1use std::fs::{self, File};
2use std::io::Read;
3use std::path::{Path, PathBuf};
4
5use crate::errors::*;
6use crate::manifest::Manifest;
7use memmap2::Mmap;
8use std::collections::HashMap;
9
/// One fixed-size record of the meta index file, locating a document inside
/// the meta blocks file.
///
/// `Bundle::open` decodes each record as four consecutive little-endian `u32`
/// values, in the order the fields are declared here.
#[repr(C)]
#[derive(Debug, Clone, Copy)]
struct MetaIdxEntry {
    /// Index of the block that holds the document.
    block_id: u32,
    /// Byte offset of the document record from the start of that block's data
    /// (the decompressed data when the block is compressed).
    offset_in_block: u32,
    /// Size in bytes of the document record.
    doc_size: u32,
    /// Fourth word of the record; stored but not interpreted by this file.
    padding: u32,
}

/// Size in bytes of one `MetaIdxEntry` record.
const META_IDX_ENTRY_SIZE: usize = size_of::<MetaIdxEntry>();
20
/// An opened on-disk bundle: the parsed manifest plus the loaded or
/// memory-mapped contents of the files it names.
///
/// Instances are created by `Bundle::open`.
#[derive(Debug)]
pub struct Bundle {
    /// Directory the bundle was opened from.
    #[allow(dead_code)]
    root: PathBuf,
    /// Parsed contents of `manifest.json`.
    pub manifest: Manifest,
    /// Size in bytes of one block of the meta blocks file, derived from the
    /// file size and the block count.
    pub meta_block_size: u32,
    /// Number of blocks, read from the first four bytes of the meta blocks file.
    pub meta_block_count: u32,
    /// Copy of `files.meta.compression` from the manifest.
    pub meta_codec: Option<String>,
    /// One tuple per block holding the four little-endian `u32` words of its
    /// 16-byte header. `get_document` reads word 0 as the compressed size,
    /// word 1 as the decompressed size and word 3 as the codec (`0` means the
    /// block is stored raw).
    pub meta_block_headers: Vec<(u32, u32, u32, u32)>,
    /// Read-only mapping of the vectors file.
    vectors: Mmap,
    /// Read-only mapping of the whole meta blocks file (count, headers, blocks).
    meta_blocks: Mmap,
    /// Decoded meta index, one entry per document.
    meta_idx: Vec<MetaIdxEntry>,
    /// Contents of the doclen file: one `u32` per document.
    pub(crate) doclen: Vec<u32>,
    /// Term string -> zero-based position of the term in the terms file.
    pub(crate) terms: HashMap<String, usize>,
    /// Decoded lexicon records, in file order.
    pub(crate) lexicon: Vec<LexiconEntry>,
    /// Raw bytes of the postings file.
    pub(crate) postings: Vec<u8>,
}
41
42impl Bundle {
43 pub fn open<P: AsRef<Path>>(root: P) -> Result<Self> {
44 let root = root.as_ref().to_path_buf();
45 let manifest_path = root.join("manifest.json");
47 let mut s = String::new();
48 File::open(&manifest_path)?.read_to_string(&mut s)?;
49 let manifest: Manifest = serde_json::from_str(&s)?;
50
51 if manifest.format != "nvs.v1" {
52 return Err(NvsError::InvalidManifest("unsupported format"));
53 }
54 if manifest.num_docs == 0 {
55 return Err(NvsError::InvalidManifest("num_docs must be > 0"));
56 }
57 if manifest.dim == 0 {
58 return Err(NvsError::InvalidManifest("dim must be > 0"));
59 }
60
61 let meta_idx_path = root.join(&manifest.files.meta_idx.path);
63 let meta_idx_md = fs::metadata(&meta_idx_path)?;
64 let sz = meta_idx_md.len() as usize;
65 if sz % META_IDX_ENTRY_SIZE != 0 {
66 return Err(NvsError::InvalidBundle(
67 "meta.idx not aligned to entry size",
68 ));
69 }
70 let count = sz / META_IDX_ENTRY_SIZE;
71 if count as u64 != manifest.num_docs {
72 return Err(NvsError::InvalidBundle("meta.idx entry count mismatch"));
73 }
74 let mut meta_idx_entries = Vec::with_capacity(count);
75 {
76 let mut f = File::open(&meta_idx_path)?;
77 let mut buf = Vec::with_capacity(sz);
78 f.read_to_end(&mut buf)?;
79 let mut i = 0usize;
80 while i + 16 <= buf.len() {
81 let block_id = u32::from_le_bytes(buf[i..i + 4].try_into().unwrap());
82 i += 4;
83 let offset_in_block = u32::from_le_bytes(buf[i..i + 4].try_into().unwrap());
84 i += 4;
85 let doc_size = u32::from_le_bytes(buf[i..i + 4].try_into().unwrap());
86 i += 4;
87 let padding = u32::from_le_bytes(buf[i..i + 4].try_into().unwrap());
88 i += 4;
89 meta_idx_entries.push(MetaIdxEntry {
90 block_id,
91 offset_in_block,
92 doc_size,
93 padding,
94 });
95 }
96 }
97
98 let meta_blocks_path = root.join(&manifest.files.meta.path);
100 let meta_blocks_file = File::open(&meta_blocks_path)?;
101 let mut f = meta_blocks_file.try_clone()?;
102 let mut u32buf = [0u8; 4];
103 f.read_exact(&mut u32buf)?;
105 let block_count = u32::from_le_bytes(u32buf);
106 if block_count == 0 {
107 return Err(NvsError::InvalidBundle("block_count must be > 0"));
108 }
109 let header_size = 4u64 + (block_count as u64) * 16u64;
111 let total_size = fs::metadata(&meta_blocks_path)?.len();
112 if total_size <= header_size {
113 return Err(NvsError::InvalidBundle("meta.blocks too small for headers"));
114 }
115 let remaining = total_size - header_size;
116 if remaining % (block_count as u64) != 0 {
117 return Err(NvsError::InvalidBundle(
118 "meta.blocks data not divisible by block_count",
119 ));
120 }
121 let derived_block = (remaining / (block_count as u64)) as u32;
122
123 if let Some(bsz) = manifest.files.meta.block_size {
124 if bsz != derived_block {
125 return Err(NvsError::InvalidBundle("manifest block_size mismatch"));
126 }
127 }
128 let mut headers: Vec<(u32, u32, u32, u32)> = Vec::with_capacity(block_count as usize);
130 for _ in 0..block_count {
131 let mut b = [0u8; 16];
132 f.read_exact(&mut b)?;
133 let csz = u32::from_le_bytes(b[0..4].try_into().unwrap());
134 let dsz = u32::from_le_bytes(b[4..8].try_into().unwrap());
135 let dct = u32::from_le_bytes(b[8..12].try_into().unwrap());
136 let cod = u32::from_le_bytes(b[12..16].try_into().unwrap());
137 headers.push((csz, dsz, dct, cod));
138 }
139 let meta_blocks = unsafe { Mmap::map(&meta_blocks_file)? };
140
141 let vectors_path = root.join(&manifest.files.vectors.path);
143 let vec_file = File::open(&vectors_path)?;
144 let vectors = unsafe { Mmap::map(&vec_file)? };
145 let elem_size = if manifest.embedding.dtype.to_lowercase() == "f16" {
146 2
147 } else {
148 4
149 };
150 let row_bytes = (manifest.dim as usize) * elem_size;
151 let aligned_row_bytes = row_bytes.div_ceil(64) * 64;
152 let expected = (manifest.num_docs as usize) * aligned_row_bytes;
153 if vectors.len() != expected {
154 return Err(NvsError::InvalidBundle("vectors size mismatch"));
155 }
156
157 let doclen_path = root.join(&manifest.files.doclen.path);
159 let mut doclen = Vec::<u32>::new();
160 {
161 let mut f = File::open(&doclen_path)?;
162 let mut buf = Vec::new();
163 f.read_to_end(&mut buf)?;
164 if buf.len() % 4 != 0 {
165 return Err(NvsError::InvalidBundle("doclen size not multiple of 4"));
166 }
167 let n = buf.len() / 4;
168 doclen.resize(n, 0);
169 for i in 0..n {
170 let b = [buf[4 * i], buf[4 * i + 1], buf[4 * i + 2], buf[4 * i + 3]];
171 doclen[i] = u32::from_le_bytes(b);
172 }
173 if n as u64 != manifest.num_docs {
174 return Err(NvsError::InvalidBundle("doclen rows mismatch"));
175 }
176 }
177 let terms_path = root.join(&manifest.files.terms.path);
179 let terms = load_terms(&terms_path)?;
180 let lexicon_path = root.join(&manifest.files.lexicon.path);
182 let lexicon = load_lexicon(&lexicon_path)?;
183 let postings_path = root.join(&manifest.files.postings.path);
185 let postings = {
186 let mut f = File::open(&postings_path)?;
187 let mut buf = Vec::new();
188 f.read_to_end(&mut buf)?;
189 buf
190 };
191
192 let meta_codec = manifest.files.meta.compression.clone();
193 Ok(Self {
194 root,
195 manifest,
196 meta_block_size: derived_block,
197 meta_block_count: block_count,
198 meta_codec,
199 meta_block_headers: headers,
200 vectors,
201 meta_blocks,
202 meta_idx: meta_idx_entries,
203 doclen,
204 terms,
205 lexicon,
206 postings,
207 })
208 }
209
210 pub fn get_document(&self, doc_id: u32) -> Option<(String, String, String)> {
211 let idx = *self.meta_idx.get(doc_id as usize)?;
212 let header_size = 4usize + (self.meta_block_count as usize) * 16usize;
213 let block_size = self.meta_block_size as usize;
214 let base = &self.meta_blocks;
215 let blocks_start = header_size;
216 let block0 = blocks_start;
217 let block_begin = block0 + (idx.block_id as usize) * block_size;
218 let header = self
219 .meta_block_headers
220 .get(idx.block_id as usize)
221 .copied()
222 .unwrap_or((0, 0, 0, 0));
223 let codec = header.3; if codec == 0 {
225 if (idx.offset_in_block as usize) > block_size {
227 return None;
228 }
229 if (idx.offset_in_block as usize) + (idx.doc_size as usize) > block_size {
230 return None;
231 }
232 let mut p = block_begin + idx.offset_in_block as usize;
233 let end = block_begin + block_size;
234 if p + 4 > end {
235 return None;
236 }
237 let id_len = u32::from_le_bytes(base[p..p + 4].try_into().ok()?) as usize;
238 p += 4;
239 if p + id_len > end {
240 return None;
241 }
242 let id = String::from_utf8(base[p..p + id_len].to_vec()).ok()?;
243 p += id_len;
244 if p + 4 > end {
245 return None;
246 }
247 let text_len = u32::from_le_bytes(base[p..p + 4].try_into().ok()?) as usize;
248 p += 4;
249 if p + text_len > end {
250 return None;
251 }
252 let text = String::from_utf8(base[p..p + text_len].to_vec()).ok()?;
253 p += text_len;
254 if p + 4 > end {
255 return None;
256 }
257 let meta_len = u32::from_le_bytes(base[p..p + 4].try_into().ok()?) as usize;
258 p += 4;
259 if p + meta_len > end {
260 return None;
261 }
262 let meta = String::from_utf8(base[p..p + meta_len].to_vec()).ok()?;
263 Some((id, text, meta))
264 } else {
265 let comp_size = header.0 as usize;
267 let decomp_size = header.1 as usize;
268 let comp_start = block_begin;
269 let comp_end = comp_start + comp_size.min(block_size);
270 if comp_end > base.len() {
271 return None;
272 }
273 let comp_slice = &base[comp_start..comp_end];
274 let mut buf = vec![0u8; decomp_size];
276 match zstd::bulk::decompress_to_buffer(comp_slice, &mut buf) {
277 Ok(_) => {
278 if (idx.offset_in_block as usize) + (idx.doc_size as usize) > buf.len() {
279 return None;
280 }
281 let mut p = idx.offset_in_block as usize;
282 let end = buf.len();
283 if p + 4 > end {
284 return None;
285 }
286 let id_len = u32::from_le_bytes(buf[p..p + 4].try_into().ok()?) as usize;
287 p += 4;
288 if p + id_len > end {
289 return None;
290 }
291 let id = String::from_utf8(buf[p..p + id_len].to_vec()).ok()?;
292 p += id_len;
293 if p + 4 > end {
294 return None;
295 }
296 let text_len = u32::from_le_bytes(buf[p..p + 4].try_into().ok()?) as usize;
297 p += 4;
298 if p + text_len > end {
299 return None;
300 }
301 let text = String::from_utf8(buf[p..p + text_len].to_vec()).ok()?;
302 p += text_len;
303 if p + 4 > end {
304 return None;
305 }
306 let meta_len = u32::from_le_bytes(buf[p..p + 4].try_into().ok()?) as usize;
307 p += 4;
308 if p + meta_len > end {
309 return None;
310 }
311 let meta = String::from_utf8(buf[p..p + meta_len].to_vec()).ok()?;
312 Some((id, text, meta))
313 }
314 Err(_) => None,
315 }
316 }
317 }
318
319 #[inline]
322 pub(crate) fn row_stride_f32(&self) -> usize {
323 let row_bytes = (self.manifest.dim as usize) * 4;
324 let aligned_row_bytes = row_bytes.div_ceil(64) * 64;
325 aligned_row_bytes / 4
326 }
327
328 pub(crate) fn vectors_as_f32(&self) -> &[f32] {
332 bytemuck::cast_slice(&self.vectors)
333 }
334 pub(crate) fn vectors_raw(&self) -> &[u8] {
335 &self.vectors
336 }
337 #[allow(dead_code)]
338 pub(crate) fn num_docs_usize(&self) -> usize {
339 self.manifest.num_docs as usize
340 }
341 #[allow(dead_code)]
342 pub(crate) fn dim_usize(&self) -> usize {
343 self.manifest.dim as usize
344 }
345 pub(crate) fn row_stride_bytes(&self) -> usize {
346 let elem = if self.manifest.embedding.dtype.to_lowercase() == "f16" {
347 2
348 } else {
349 4
350 };
351 let row = (self.manifest.dim as usize) * elem;
352 ((row + 63) / 64) * 64
353 }
354
355 }
357
/// One 16-byte record of the lexicon file, as decoded by `load_lexicon`.
///
/// The test fixtures in this file write `offset` as the byte position of a
/// term's first posting in the postings file, `length` as its number of
/// 8-byte postings, and `df` as the number of documents containing the term.
#[derive(Debug, Clone, Copy)]
pub(crate) struct LexiconEntry {
    /// First field of the record: little-endian `u64`.
    pub(crate) offset: u64,
    /// Second field of the record: little-endian `u32`.
    pub(crate) length: u32,
    /// Third field of the record: little-endian `u32`.
    pub(crate) df: u32,
}
364
365fn load_lexicon(path: &Path) -> Result<Vec<LexiconEntry>> {
366 let mut f = File::open(path)?;
367 let mut buf = Vec::new();
368 f.read_to_end(&mut buf)?;
369 if buf.len() % 16 != 0 {
370 return Err(NvsError::InvalidBundle("lexicon size not multiple of 16"));
371 }
372 let mut v = Vec::with_capacity(buf.len() / 16);
373 let mut i = 0usize;
374 while i + 16 <= buf.len() {
375 let off = u64::from_le_bytes(buf[i..i + 8].try_into().unwrap());
376 i += 8;
377 let length = u32::from_le_bytes(buf[i..i + 4].try_into().unwrap());
378 i += 4;
379 let df = u32::from_le_bytes(buf[i..i + 4].try_into().unwrap());
380 i += 4;
381 v.push(LexiconEntry {
382 offset: off,
383 length,
384 df,
385 });
386 }
387 Ok(v)
388}
389
390fn load_terms(path: &Path) -> Result<HashMap<String, usize>> {
391 let mut f = File::open(path)?;
392 let mut buf = Vec::new();
393 f.read_to_end(&mut buf)?;
394 let mut m = HashMap::new();
395 let mut i = 0usize;
396 let mut id = 0usize;
397 while i + 4 <= buf.len() {
398 let len = u32::from_le_bytes(buf[i..i + 4].try_into().unwrap()) as usize;
399 i += 4;
400 if i + len > buf.len() {
401 break;
402 }
403 let s = String::from_utf8_lossy(&buf[i..i + len]).to_string();
404 i += len;
405 m.insert(s, id);
406 id += 1;
407 }
408 Ok(m)
409}
410
411#[cfg(test)]
412mod tests {
413 use super::*;
414 use crate::tokenizer::SimpleTokenizer;
415 use std::io::Write;
416 use std::time::{SystemTime, UNIX_EPOCH};
417
418 fn temp_dir(prefix: &str) -> PathBuf {
419 let ts = SystemTime::now()
420 .duration_since(UNIX_EPOCH)
421 .unwrap()
422 .as_millis();
423 let p = std::env::temp_dir().join(format!("{}_{}", prefix, ts));
424 let _ = fs::create_dir_all(&p);
425 p
426 }
427
    /// Writes a `manifest.json` for an f32 test bundle into `root`.
    ///
    /// `num_docs` fills the top-level `num_docs` and both `rows` fields, `dim`
    /// fills the top-level `dim` and `cols`, and `block_size` fills
    /// `files.meta.block_size`. All file paths are fixed names relative to
    /// `root`. Panics if the file cannot be written.
    fn write_manifest(root: &Path, num_docs: u64, dim: u64, block_size: u32) {
        let manifest = format!(
            r#"{{
 "format": "nvs.v1",
 "num_docs": {},
 "dim": {},
 "embedding": {{"model": "test", "dtype": "f32"}},
 "bm25": {{"avgdl": 1.0, "k1": 1.2, "b": 0.75}},
 "files": {{
 "vectors": {{"path": "vectors.f32", "dtype": "f32", "rows": {}, "cols": {}}},
 "doclen": {{"path": "doclen.u32", "dtype": "u32", "rows": {}}},
 "lexicon": {{"path": "lexicon.bin"}},
 "postings": {{"path": "postings.bin"}},
 "terms": {{"path": "terms.dict"}},
 "meta_idx": {{"path": "meta.idx", "schema": "u32 block_id, u32 offset, u32 doc_size"}},
 "meta": {{"path": "meta.blocks", "block_size": {}, "doc_aligned": true}}
 }}
}}"#,
            // Positional arguments, in the order the placeholders appear above.
            num_docs, dim, num_docs, dim, num_docs, block_size
        );
        let mut f = File::create(root.join("manifest.json")).unwrap();
        f.write_all(manifest.as_bytes()).unwrap();
    }
451
452 fn write_meta_blocks(root: &Path, block_count: u32, block_size: u32) {
453 let mut f = File::create(root.join("meta.blocks")).unwrap();
454 f.write_all(&block_count.to_le_bytes()).unwrap();
455 let hdr = [0u8; 16];
456 for _ in 0..block_count {
457 f.write_all(&hdr).unwrap();
458 }
459 let block = vec![0u8; block_size as usize];
460 for _ in 0..block_count {
461 f.write_all(&block).unwrap();
462 }
463 }
464
465 fn write_meta_idx(root: &Path, entries: usize) {
466 let mut f = File::create(root.join("meta.idx")).unwrap();
467 for _ in 0..entries {
468 let entry = MetaIdxEntry {
469 block_id: 0,
470 offset_in_block: 0,
471 doc_size: 0,
472 padding: 0,
473 };
474 let bytes: [u8; META_IDX_ENTRY_SIZE] = unsafe { std::mem::transmute(entry) };
475 f.write_all(&bytes).unwrap();
476 }
477 }
478
479 fn touch(root: &Path, name: &str) {
480 let _ = File::create(root.join(name)).unwrap();
481 }
482
483 #[test]
484 fn open_ok_with_valid_headers() {
485 let dir = temp_dir("nvs_rust_ok");
486 write_manifest(&dir, 3, 4, 256);
487 {
489 let row_bytes = (4usize) * 4;
490 let aligned_row_bytes = row_bytes.div_ceil(64) * 64;
491 let data = vec![0u8; (3usize) * aligned_row_bytes];
492 let mut f = File::create(dir.join("vectors.f32")).unwrap();
493 f.write_all(&data).unwrap();
494 }
495 {
497 let mut f = File::create(dir.join("doclen.u32")).unwrap();
498 for v in [0u32, 0u32, 0u32] {
499 f.write_all(&v.to_le_bytes()).unwrap();
500 }
501 }
502 touch(&dir, "lexicon.bin");
503 touch(&dir, "postings.bin");
504 touch(&dir, "terms.dict");
505 write_meta_idx(&dir, 3);
506 write_meta_blocks(&dir, 2, 256);
507
508 let b = Bundle::open(&dir).expect("bundle open");
509 assert_eq!(b.meta_block_size, 256);
510 assert_eq!(b.meta_block_count, 2);
511 }
512
513 #[test]
514 fn open_fails_on_meta_idx_count_mismatch() {
515 let dir = temp_dir("nvs_rust_bad_idx");
516 write_manifest(&dir, 2, 1, 128);
517 {
519 let row_bytes = (1usize) * 4;
520 let aligned_row_bytes = row_bytes.div_ceil(64) * 64;
521 let data = vec![0u8; (1usize) * aligned_row_bytes];
522 let mut f = File::create(dir.join("vectors.f32")).unwrap();
523 f.write_all(&data).unwrap();
524 }
525 touch(&dir, "doclen.u32");
526 touch(&dir, "lexicon.bin");
527 touch(&dir, "postings.bin");
528 touch(&dir, "terms.dict");
529 write_meta_idx(&dir, 1); write_meta_blocks(&dir, 1, 128);
531
532 let err = Bundle::open(&dir).unwrap_err();
533 match err {
534 NvsError::InvalidBundle(_) => {}
535 _ => panic!("unexpected err"),
536 }
537 }
538
539 #[test]
540 fn open_fails_on_manifest_block_size_mismatch() {
541 let dir = temp_dir("nvs_rust_bad_bsz");
542 write_manifest(&dir, 1, 1, 128);
543 touch(&dir, "vectors.f32");
544 touch(&dir, "doclen.u32");
545 touch(&dir, "lexicon.bin");
546 touch(&dir, "postings.bin");
547 touch(&dir, "terms.dict");
548 write_meta_idx(&dir, 1);
549 write_meta_blocks(&dir, 1, 256);
551
552 let err = Bundle::open(&dir).unwrap_err();
553 match err {
554 NvsError::InvalidBundle(_) => {}
555 _ => panic!("unexpected err"),
556 }
557 }
558
    /// Builds a 3-document, 3-term bundle by hand and checks that BM25 results
    /// include the expected documents and come back sorted by descending score.
    #[test]
    fn bm25_small_corpus_ordering() {
        let dir = temp_dir("nvs_rust_bm25");
        write_manifest(&dir, 3, 1, 128);
        {
            // Zeroed vectors: one 1-dimensional row per document, padded to 64 bytes.
            let dim = 1usize;
            let num_docs = 3usize;
            let row_bytes = dim * 4;
            let aligned_row_bytes = row_bytes.div_ceil(64) * 64;
            let data = vec![0u8; num_docs * aligned_row_bytes];
            let mut f = File::create(dir.join("vectors.f32")).unwrap();
            f.write_all(&data).unwrap();
        }
        {
            // Document lengths for docs 0, 1 and 2.
            let mut f = File::create(dir.join("doclen.u32")).unwrap();
            for v in [3u32, 1u32, 3u32] {
                f.write_all(&v.to_le_bytes()).unwrap();
            }
        }
        {
            // Term dictionary: length-prefixed strings; position is the term id.
            let mut f = File::create(dir.join("terms.dict")).unwrap();
            for s in ["apple", "banana", "cherry"] {
                let len = s.len() as u32;
                f.write_all(&len.to_le_bytes()).unwrap();
                f.write_all(s.as_bytes()).unwrap();
            }
        }
        // Postings are 8-byte (delta, tf) pairs; each lexicon record is
        // (u64 offset, u32 length, u32 df). The order of the statements below
        // defines the byte layout of both files.
        let mut postings = Vec::<u8>::new();
        let mut lex = Vec::<u8>::new();
        let mut offset: u64 = 0;
        let add_entry = |delta: u32, tf: u32, buf: &mut Vec<u8>| {
            buf.extend_from_slice(&delta.to_le_bytes());
            buf.extend_from_slice(&tf.to_le_bytes());
        };
        // "apple": two postings with deltas 0 and 2 and tf 3 and 1.
        add_entry(0, 3, &mut postings);
        add_entry(2, 1, &mut postings);
        lex.extend_from_slice(&offset.to_le_bytes());
        lex.extend_from_slice(&2u32.to_le_bytes());
        lex.extend_from_slice(&2u32.to_le_bytes());
        offset += 2 * 8;
        // "banana": one posting, delta 1, tf 3.
        add_entry(1, 3, &mut postings);
        lex.extend_from_slice(&offset.to_le_bytes());
        lex.extend_from_slice(&1u32.to_le_bytes());
        lex.extend_from_slice(&1u32.to_le_bytes());
        offset += 1 * 8;
        // "cherry": one posting, delta 1, tf 2.
        add_entry(1, 2, &mut postings);
        lex.extend_from_slice(&offset.to_le_bytes());
        lex.extend_from_slice(&1u32.to_le_bytes());
        lex.extend_from_slice(&1u32.to_le_bytes());

        {
            let mut f = File::create(dir.join("postings.bin")).unwrap();
            f.write_all(&postings).unwrap();
            let mut lf = File::create(dir.join("lexicon.bin")).unwrap();
            lf.write_all(&lex).unwrap();
        }
        write_meta_idx(&dir, 3);
        write_meta_blocks(&dir, 1, 128);

        let store = crate::VectorStore::from_bundle(Bundle::open(&dir).unwrap());
        // Single-term query: document 0 must rank first.
        let res = store.search_bm25("apple", 3);
        assert!(!res.is_empty());
        assert_eq!(res[0].0, 0);
        for i in 1..res.len() {
            assert!(res[i - 1].1 >= res[i].1, "bm25 scores must be sorted desc");
        }
        // Two-term query: documents 0 and 1 must both appear.
        let res2 = store.search_bm25("apple banana", 3);
        assert!(res2.iter().any(|&(id, _)| id == 0));
        assert!(res2.iter().any(|&(id, _)| id == 1));
        for i in 1..res2.len() {
            assert!(
                res2[i - 1].1 >= res2[i].1,
                "bm25 scores must be sorted desc"
            );
        }
    }
651
    /// Three documents with identical length and term frequency for the only
    /// term: results must be sorted by descending score and returned as
    /// documents 0, 1, 2 in that order.
    #[test]
    fn bm25_sort_order_and_ties() {
        use std::io::Write;
        let dir = temp_dir("nvs_rust_bm25_ties");
        write_manifest(&dir, 3, 1, 128);
        {
            // Zeroed vectors: three 1-dimensional rows padded to 64 bytes each.
            let row_bytes = 4usize;
            let aligned = row_bytes.div_ceil(64) * 64;
            let data = vec![0u8; 3 * aligned];
            let mut f = File::create(dir.join("vectors.f32")).unwrap();
            f.write_all(&data).unwrap();
        }
        {
            // Every document has length 1.
            let mut f = File::create(dir.join("doclen.u32")).unwrap();
            for _ in 0..3 {
                f.write_all(&1u32.to_le_bytes()).unwrap();
            }
        }
        {
            // Dictionary with the single term "foo".
            let mut f = File::create(dir.join("terms.dict")).unwrap();
            let s = "foo";
            f.write_all(&(s.len() as u32).to_le_bytes()).unwrap();
            f.write_all(s.as_bytes()).unwrap();
        }
        {
            // Three (delta, tf) postings with deltas 0, 1, 1 and tf 1 each,
            // followed by one lexicon record (offset 0, length 3, df 3).
            let mut postings = Vec::<u8>::new();
            let mut lexicon = Vec::<u8>::new();
            let offset: u64 = 0;
            let add = |delta: u32, tf: u32, buf: &mut Vec<u8>| {
                buf.extend_from_slice(&delta.to_le_bytes());
                buf.extend_from_slice(&tf.to_le_bytes());
            };
            add(0, 1, &mut postings);
            add(1, 1, &mut postings);
            add(1, 1, &mut postings);
            lexicon.extend_from_slice(&offset.to_le_bytes());
            lexicon.extend_from_slice(&3u32.to_le_bytes());
            lexicon.extend_from_slice(&3u32.to_le_bytes());
            let mut pf = File::create(dir.join("postings.bin")).unwrap();
            pf.write_all(&postings).unwrap();
            let mut lf = File::create(dir.join("lexicon.bin")).unwrap();
            lf.write_all(&lexicon).unwrap();
        }
        write_meta_idx(&dir, 3);
        write_meta_blocks(&dir, 1, 128);

        let store = crate::VectorStore::from_bundle(Bundle::open(&dir).unwrap());
        let res = store.search_bm25("foo", 3);
        assert_eq!(res.len(), 3);
        for i in 1..res.len() {
            assert!(res[i - 1].1 >= res[i].1, "bm25 scores must be sorted desc");
        }
        // The expected order of the returned document ids.
        assert_eq!(res[0].0, 0);
        assert_eq!(res[1].0, 1);
        assert_eq!(res[2].0, 2);
    }
716
717 #[test]
718 fn vector_search_small() {
719 use std::io::Write;
720 let dir = temp_dir("nvs_rust_vec");
721 let num_docs = 4u64;
722 let dim = 4u64;
723 let block = 128u32;
724 write_manifest(&dir, num_docs, dim, block);
725 {
727 let row_bytes = (dim as usize) * 4;
728 let aligned_row_bytes = row_bytes.div_ceil(64) * 64;
729 let mut data = vec![0u8; (num_docs as usize) * aligned_row_bytes];
730 for i in 0..(num_docs as usize) {
731 for j in 0..(dim as usize) {
732 let v = if i == j { 1f32 } else { 0f32 };
733 let off = i * aligned_row_bytes + j * 4;
734 data[off..off + 4].copy_from_slice(&v.to_le_bytes());
735 }
736 }
737 let mut f = File::create(dir.join("vectors.f32")).unwrap();
738 f.write_all(&data).unwrap();
739 }
740 {
742 let mut f = File::create(dir.join("doclen.u32")).unwrap();
743 for _ in 0..num_docs {
744 f.write_all(&0u32.to_le_bytes()).unwrap();
745 }
746 }
747 File::create(dir.join("lexicon.bin")).unwrap();
748 File::create(dir.join("postings.bin")).unwrap();
749 File::create(dir.join("terms.dict")).unwrap();
750 write_meta_idx(&dir, num_docs as usize);
751 write_meta_blocks(&dir, 1, 128);
752
753 let store = crate::VectorStore::from_bundle(Bundle::open(&dir).unwrap());
754 let q = [1f32, 0f32, 0f32, 0f32];
755 let res = store.search_vector(&q, 3);
756 assert!(!res.is_empty());
757 assert_eq!(res[0].0, 0);
759 for i in 1..res.len() {
761 assert!(res[i - 1].1 >= res[i].1);
762 }
763 let res2 = store.search_vector(&q, 3);
765 assert_eq!(res, res2);
766 }
767
768 #[test]
769 fn get_document_basic() {
770 use std::io::Write;
771 let dir = temp_dir("nvs_rust_getdoc");
772 write_manifest(&dir, 2, 1, 128);
773 {
775 let row_bytes = 4usize;
776 let aligned = row_bytes.div_ceil(64) * 64;
777 let data = vec![0u8; 2 * aligned];
778 let mut f = File::create(dir.join("vectors.f32")).unwrap();
779 f.write_all(&data).unwrap();
780 }
781 {
783 let mut f = File::create(dir.join("doclen.u32")).unwrap();
784 for _ in 0..2 {
785 f.write_all(&0u32.to_le_bytes()).unwrap();
786 }
787 }
788 File::create(dir.join("lexicon.bin")).unwrap();
790 File::create(dir.join("postings.bin")).unwrap();
791 File::create(dir.join("terms.dict")).unwrap();
792 let (id0, text0, meta0) = ("a", "text a", "{\"k\":1}");
794 let (id1, text1, meta1) = ("b", "text b", "{\"k\":2}");
795 let rec_size = |id: &str, tx: &str, mj: &str| 4 + id.len() + 4 + tx.len() + 4 + mj.len();
796 let s0 = rec_size(id0, text0, meta0);
797 let s1 = rec_size(id1, text1, meta1);
798 let mut mb = Vec::<u8>::new();
799 mb.extend_from_slice(&1u32.to_le_bytes());
801 mb.extend_from_slice(&0u32.to_le_bytes());
803 mb.extend_from_slice(&(s0 as u32 + s1 as u32).to_le_bytes());
804 mb.extend_from_slice(&2u32.to_le_bytes());
805 mb.extend_from_slice(&0u32.to_le_bytes());
806 let write_rec = |id: &str, tx: &str, mj: &str, buf: &mut Vec<u8>| {
808 buf.extend_from_slice(&(id.len() as u32).to_le_bytes());
809 buf.extend_from_slice(id.as_bytes());
810 buf.extend_from_slice(&(tx.len() as u32).to_le_bytes());
811 buf.extend_from_slice(tx.as_bytes());
812 buf.extend_from_slice(&(mj.len() as u32).to_le_bytes());
813 buf.extend_from_slice(mj.as_bytes());
814 };
815 write_rec(id0, text0, meta0, &mut mb);
816 write_rec(id1, text1, meta1, &mut mb);
817 let block_size = 128usize;
819 let _header_size = 4 + 1 * 16;
820 let data_len = s0 + s1;
821 let pad_len = block_size - data_len;
822 mb.extend(std::iter::repeat(0u8).take(pad_len));
823 let mut fmb = File::create(dir.join("meta.blocks")).unwrap();
825 fmb.write_all(&mb).unwrap();
826 {
828 let mut idx = Vec::<u8>::new();
829 idx.extend_from_slice(&0u32.to_le_bytes());
830 idx.extend_from_slice(&0u32.to_le_bytes());
831 idx.extend_from_slice(&(s0 as u32).to_le_bytes());
832 idx.extend_from_slice(&0u32.to_le_bytes());
833 idx.extend_from_slice(&0u32.to_le_bytes());
834 idx.extend_from_slice(&(s0 as u32).to_le_bytes());
835 idx.extend_from_slice(&(s1 as u32).to_le_bytes());
836 idx.extend_from_slice(&0u32.to_le_bytes());
837 let mut fi = File::create(dir.join("meta.idx")).unwrap();
838 fi.write_all(&idx).unwrap();
839 }
840
841 let b = Bundle::open(&dir).unwrap();
842 let d0 = b.get_document(0).unwrap();
843 assert_eq!(d0.0, "a");
844 assert!(d0.1.contains("text a"));
845 assert!(d0.2.contains("\"k\":1"));
846 let d1 = b.get_document(1).unwrap();
847 assert_eq!(d1.0, "b");
848 assert!(d1.1.contains("text b"));
849 assert!(d1.2.contains("\"k\":2"));
850 }
851
    /// Exercises `search_hybrid` at both ends of its last argument, using one
    /// bundle per case. With `0.0` the top hit must be the document found by
    /// the text query; with `1.0` it must be the document found by the vector
    /// query. The assertion messages name these the "bm25 extreme" and the
    /// "vector extreme".
    #[test]
    fn hybrid_extremes_vector_and_bm25() {
        use std::io::Write;
        // --- Case 1: text-only ranking --------------------------------------
        let dir = temp_dir("nvs_rust_hybrid_bm25");
        write_manifest(&dir, 3, 1, 128);
        {
            // Zeroed vectors, so the vector side cannot separate the documents.
            let row_bytes = 4usize;
            let aligned = row_bytes.div_ceil(64) * 64;
            let data = vec![0u8; 3 * aligned];
            let mut f = File::create(dir.join("vectors.f32")).unwrap();
            f.write_all(&data).unwrap();
        }
        {
            let mut f = File::create(dir.join("doclen.u32")).unwrap();
            for _ in 0..3 {
                f.write_all(&1u32.to_le_bytes()).unwrap();
            }
        }
        {
            // Dictionary with the single term "apple".
            let mut f = File::create(dir.join("terms.dict")).unwrap();
            let s = "apple";
            f.write_all(&(s.len() as u32).to_le_bytes()).unwrap();
            f.write_all(s.as_bytes()).unwrap();
        }
        {
            let mut lf = File::create(dir.join("lexicon.bin")).unwrap();
            let mut pf = File::create(dir.join("postings.bin")).unwrap();
            // Lexicon record: offset 0, length 1, df 1.
            lf.write_all(&0u64.to_le_bytes()).unwrap();
            lf.write_all(&1u32.to_le_bytes()).unwrap();
            lf.write_all(&1u32.to_le_bytes()).unwrap();
            // Single posting: delta 1, tf 1.
            pf.write_all(&1u32.to_le_bytes()).unwrap();
            pf.write_all(&1u32.to_le_bytes()).unwrap();
        }
        write_meta_idx(&dir, 3);
        write_meta_blocks(&dir, 1, 128);
        let store = crate::VectorStore::from_bundle(Bundle::open(&dir).unwrap());
        let v = [1f32];
        let hv = store.search_hybrid(&v, "apple", 2, 0.0);
        assert_eq!(hv[0].0, 1, "bm25 extreme should rank doc1 first");

        // --- Case 2: vector-only ranking ------------------------------------
        let dir2 = temp_dir("nvs_rust_hybrid_vec");
        write_manifest(&dir2, 3, 3, 128);
        {
            // 3x3 identity matrix, each row padded to 64 bytes.
            let dim = 3usize;
            let n = 3usize;
            let row_bytes = dim * 4;
            let aligned = row_bytes.div_ceil(64) * 64;
            let mut data = vec![0u8; n * aligned];
            for i in 0..n {
                for j in 0..dim {
                    let v = if i == j { 1f32 } else { 0f32 };
                    let off = i * aligned + j * 4;
                    data[off..off + 4].copy_from_slice(&v.to_le_bytes());
                }
            }
            let mut f = File::create(dir2.join("vectors.f32")).unwrap();
            f.write_all(&data).unwrap();
        }
        {
            let mut f = File::create(dir2.join("doclen.u32")).unwrap();
            for _ in 0..3 {
                f.write_all(&0u32.to_le_bytes()).unwrap();
            }
        }
        // Empty inverted index: the text query cannot match anything.
        File::create(dir2.join("lexicon.bin")).unwrap();
        File::create(dir2.join("postings.bin")).unwrap();
        File::create(dir2.join("terms.dict")).unwrap();
        write_meta_idx(&dir2, 3);
        write_meta_blocks(&dir2, 1, 128);
        let store2 = crate::VectorStore::from_bundle(Bundle::open(&dir2).unwrap());
        let q = [1f32, 0f32, 0f32];
        let hv2 = store2.search_hybrid(&q, "unused", 2, 1.0);
        assert_eq!(hv2[0].0, 0, "vector extreme should rank doc0 first");
    }
934
    /// A test document fed to `pack_bundle` to build a complete bundle on disk.
    #[derive(Clone)]
    struct TDoc {
        /// Identifier written as the first field of the document's record.
        id: String,
        /// Text that is tokenized for the inverted index and written as the
        /// second field of the record.
        text: String,
        /// Vector written to the vectors file; `pack_bundle` asserts that its
        /// length equals the bundle dimension.
        embedding: Vec<f32>,
    }
942
943 fn pack_bundle(dir: &Path, docs: &[TDoc], dim: usize, block_size: usize) {
944 {
946 let row_bytes = dim * 4;
947 let aligned = row_bytes.div_ceil(64) * 64;
948 let mut data = vec![0u8; docs.len() * aligned];
949 for (i, d) in docs.iter().enumerate() {
950 assert_eq!(d.embedding.len(), dim);
951 for j in 0..dim {
952 let off = i * aligned + j * 4;
953 data[off..off + 4].copy_from_slice(&d.embedding[j].to_le_bytes());
954 }
955 }
956 let mut f = File::create(dir.join("vectors.f32")).unwrap();
957 f.write_all(&data).unwrap();
958 }
959 let tok = SimpleTokenizer::new();
961 let mut doc_tokens: Vec<Vec<String>> = Vec::with_capacity(docs.len());
962 let mut df_map: HashMap<String, usize> = HashMap::new();
963 let mut postings_map: HashMap<String, Vec<(usize, u32)>> = HashMap::new();
964 for (i, d) in docs.iter().enumerate() {
965 let tokens = tok.split(&d.text);
966 let mut tf: HashMap<&str, u32> = HashMap::new();
967 for t in &tokens {
968 *tf.entry(t.as_str()).or_insert(0) += 1;
969 }
970 for (term, &count) in tf.iter() {
971 postings_map
972 .entry(term.to_string())
973 .or_default()
974 .push((i, count));
975 }
976 for term in tf.keys() {
977 *df_map.entry((*term).to_string()).or_insert(0) += 1;
978 }
979 doc_tokens.push(tokens);
980 }
981 {
983 let mut f = File::create(dir.join("doclen.u32")).unwrap();
984 for tokens in &doc_tokens {
985 let len = tokens.len() as u32;
986 f.write_all(&len.to_le_bytes()).unwrap();
987 }
988 }
989 let mut terms: Vec<String> = postings_map.keys().cloned().collect();
991 terms.sort();
992 {
993 let mut f = File::create(dir.join("terms.dict")).unwrap();
994 for t in &terms {
995 let len = t.len() as u32;
996 f.write_all(&len.to_le_bytes()).unwrap();
997 f.write_all(t.as_bytes()).unwrap();
998 }
999 }
1000 {
1002 let mut postings = Vec::<u8>::new();
1003 let mut lexicon = Vec::<u8>::new();
1004 let mut offset: u64 = 0;
1005 for t in &terms {
1006 let mut list = postings_map.get(t).cloned().unwrap_or_default();
1007 list.sort_by_key(|&(doc, _)| doc);
1008 let mut prev = 0usize;
1009 let mut length = 0u32;
1010 for (doc, tf) in list.into_iter() {
1011 let delta = (doc - prev) as u32;
1012 prev = doc;
1013 length += 1;
1014 postings.extend_from_slice(&delta.to_le_bytes());
1015 postings.extend_from_slice(&tf.to_le_bytes());
1016 }
1017 let df = *df_map.get(t).unwrap_or(&0) as u32;
1018 lexicon.extend_from_slice(&offset.to_le_bytes());
1019 lexicon.extend_from_slice(&length.to_le_bytes());
1020 lexicon.extend_from_slice(&df.to_le_bytes());
1021 offset += (length as u64) * 8;
1022 }
1023 let mut pf = File::create(dir.join("postings.bin")).unwrap();
1024 pf.write_all(&postings).unwrap();
1025 let mut lf = File::create(dir.join("lexicon.bin")).unwrap();
1026 lf.write_all(&lexicon).unwrap();
1027 }
1028 let mut blocks: Vec<Vec<u8>> = Vec::new();
1030 let mut headers: Vec<(u32, u32, u32, u32)> = Vec::new();
1031 let mut idx: Vec<u8> = Vec::new();
1032 let mut cur = Vec::<u8>::with_capacity(block_size);
1033 let mut cur_usize = 0u32;
1034 let mut cur_docs = 0u32;
1035 let mut block_id = 0u32;
1036 for d in docs {
1037 let meta_json = format!(
1038 "{{\"embedding\":[{}]}}",
1039 d.embedding
1040 .iter()
1041 .map(|v| v.to_string())
1042 .collect::<Vec<_>>()
1043 .join(",")
1044 );
1045 let rec_size = 4 + d.id.len() + 4 + d.text.len() + 4 + meta_json.len();
1046 if cur_docs > 0 && cur_usize as usize + rec_size > block_size {
1047 headers.push((block_id, cur_usize, cur_docs, 0));
1048 blocks.push(std::mem::take(&mut cur));
1049 cur = Vec::with_capacity(block_size);
1050 cur_usize = 0;
1051 cur_docs = 0;
1052 block_id += 1;
1053 }
1054 idx.extend_from_slice(&block_id.to_le_bytes());
1056 idx.extend_from_slice(&cur_usize.to_le_bytes());
1057 idx.extend_from_slice(&(rec_size as u32).to_le_bytes());
1058 idx.extend_from_slice(&0u32.to_le_bytes());
1059 cur.extend_from_slice(&(d.id.len() as u32).to_le_bytes());
1061 cur.extend_from_slice(d.id.as_bytes());
1062 cur.extend_from_slice(&(d.text.len() as u32).to_le_bytes());
1063 cur.extend_from_slice(d.text.as_bytes());
1064 cur.extend_from_slice(&(meta_json.len() as u32).to_le_bytes());
1065 cur.extend_from_slice(meta_json.as_bytes());
1066 cur_usize += rec_size as u32;
1067 cur_docs += 1;
1068 }
1069 if cur_docs > 0 {
1070 headers.push((block_id, cur_usize, cur_docs, 0));
1071 blocks.push(cur);
1072 }
1073 {
1075 let mut f = File::create(dir.join("meta.blocks")).unwrap();
1076 f.write_all(&(headers.len() as u32).to_le_bytes()).unwrap();
1077 for (id, usizeb, dcount, pad) in &headers {
1078 f.write_all(&id.to_le_bytes()).unwrap();
1079 f.write_all(&usizeb.to_le_bytes()).unwrap();
1080 f.write_all(&dcount.to_le_bytes()).unwrap();
1081 f.write_all(&pad.to_le_bytes()).unwrap();
1082 }
1083 for b in &blocks {
1084 f.write_all(&b).unwrap();
1085 if b.len() < block_size {
1086 f.write_all(&vec![0u8; block_size - b.len()]).unwrap();
1087 }
1088 }
1089 }
1090 {
1092 let mut f = File::create(dir.join("meta.idx")).unwrap();
1093 f.write_all(&idx).unwrap();
1094 }
1095 {
1097 let manifest = format!(
1098 r#"{{
1099 "format": "nvs.v1",
1100 "num_docs": {},
1101 "dim": {},
1102 "embedding": {{"model": "test", "dtype": "f32"}},
1103 "bm25": {{"avgdl": 1.0, "k1": 1.2, "b": 0.75}},
1104 "files": {{
1105 "vectors": {{"path": "vectors.f32", "dtype": "f32", "rows": {}, "cols": {}}},
1106 "doclen": {{"path": "doclen.u32", "dtype": "u32", "rows": {}}},
1107 "lexicon": {{"path": "lexicon.bin"}},
1108 "postings": {{"path": "postings.bin"}},
1109 "terms": {{"path": "terms.dict"}},
1110 "meta_idx": {{"path": "meta.idx", "schema": "u32 block_id, u32 offset, u32 doc_size"}},
1111 "meta": {{"path": "meta.blocks", "block_size": {}, "doc_aligned": true}}
1112 }}
1113}}"#,
1114 docs.len(),
1115 dim,
1116 docs.len(),
1117 dim,
1118 docs.len(),
1119 block_size
1120 );
1121 let mut f = File::create(dir.join("manifest.json")).unwrap();
1122 f.write_all(manifest.as_bytes()).unwrap();
1123 }
1124 {
1126 use xxhash_rust::xxh64::xxh64;
1127 let files = [
1128 "manifest.json",
1129 "vectors.f32",
1130 "doclen.u32",
1131 "lexicon.bin",
1132 "postings.bin",
1133 "terms.dict",
1134 "meta.idx",
1135 "meta.blocks",
1136 ];
1137 let mut out = String::new();
1138 for name in files {
1139 let path = dir.join(name);
1140 let mut buf = Vec::new();
1141 File::open(&path).unwrap().read_to_end(&mut buf).unwrap();
1142 let h = xxh64(&buf, 0);
1143 out.push_str(&format!("{h:016x} {name}\n"));
1144 }
1145 let mut f = File::create(dir.join("checksums.xxhash64")).unwrap();
1146 f.write_all(out.as_bytes()).unwrap();
1147 }
1148 }
1149
1150 #[test]
1151 fn corpus_semantic_sanity() {
1152 use rand::rngs::StdRng;
1153 use rand::seq::SliceRandom;
1154 use rand::{Rng, SeedableRng};
1155
1156 #[derive(Clone)]
1159 struct Topic {
1160 name: &'static str,
1161 keywords: &'static [&'static str],
1162 }
1163 let topics = [
1164 Topic {
1165 name: "physics",
1166 keywords: &[
1167 "quantum", "particle", "wave", "electron", "photon", "field", "spin", "energy",
1168 ],
1169 },
1170 Topic {
1171 name: "cooking",
1172 keywords: &[
1173 "recipe",
1174 "cook",
1175 "bake",
1176 "ingredients",
1177 "oven",
1178 "simmer",
1179 "spice",
1180 "kitchen",
1181 ],
1182 },
1183 Topic {
1184 name: "finance",
1185 keywords: &[
1186 "market",
1187 "stock",
1188 "investment",
1189 "portfolio",
1190 "risk",
1191 "returns",
1192 "capital",
1193 "trading",
1194 ],
1195 },
1196 ];
1197
1198 let dim = 64usize;
1199 let per_topic = 30usize; let block_size = 8192usize;
1201 let dir = temp_dir("nvs_rust_corpus_semantic");
1202
1203 let mut rng = StdRng::seed_from_u64(42);
1205 let mut centroids: Vec<Vec<f32>> = Vec::new();
1206 for _ in 0..topics.len() {
1207 let mut v: Vec<f32> = (0..dim).map(|_| rng.gen_range(-0.5f32..0.5f32)).collect();
1208 let n = v.iter().map(|x| x * x).sum::<f32>().sqrt().max(1e-6);
1210 for x in &mut v {
1211 *x /= n;
1212 }
1213 centroids.push(v);
1214 }
1215
1216 let mut docs: Vec<TDoc> = Vec::with_capacity(topics.len() * per_topic);
1218 for (ti, topic) in topics.iter().enumerate() {
1219 for j in 0..per_topic {
1220 let mut idxs: Vec<usize> = (0..topic.keywords.len()).collect();
1222 idxs.shuffle(&mut rng);
1223 let kw = [
1224 topic.keywords[idxs[0]],
1225 topic.keywords[idxs[1]],
1226 topic.keywords[idxs[2]],
1227 topic.keywords[idxs[3]],
1228 ];
1229 let text = format!(
1230 "{} {} discussed here. We also mention {} and {} in this paragraph about {}.",
1231 kw[0], kw[1], kw[2], kw[3], topic.name
1232 );
1233
1234 let base = ¢roids[ti];
1236 let mut e = vec![0f32; dim];
1237 for d in 0..dim {
1238 let noise: f32 = rng.gen_range(-0.03..0.03);
1239 e[d] = base[d] + noise;
1240 }
1241 let n = e.iter().map(|x| x * x).sum::<f32>().sqrt().max(1e-6);
1243 for x in &mut e {
1244 *x /= n;
1245 }
1246
1247 let id = format!("{}-{:02}", topic.name, j);
1248 docs.push(TDoc {
1249 id,
1250 text,
1251 embedding: e,
1252 });
1253 }
1254 }
1255
1256 pack_bundle(&dir, &docs, dim, block_size);
1258 let store = crate::VectorStore::open(&dir).expect("open bundle");
1259 assert_eq!(store.size(), topics.len() * per_topic);
1260 assert_eq!(store.dimensions(), dim);
1261
1262 let topic_of = |doc_id: u32| -> String {
1264 store
1265 .get_document(doc_id)
1266 .unwrap()
1267 .0
1268 .split('-')
1269 .next()
1270 .unwrap()
1271 .to_string()
1272 };
1273
1274 for (ti, topic) in topics.iter().enumerate() {
1276 let qv = centroids[ti].clone();
1278 let vres = store.search_vector(&qv, 10);
1279 assert!(!vres.is_empty());
1280 let top_topic = topic_of(vres[0].0);
1281 assert_eq!(top_topic, topic.name, "vector top-1 should match topic");
1282 let same_count = vres
1283 .iter()
1284 .filter(|(id, _)| topic_of(*id) == topic.name)
1285 .count();
1286 assert!(
1287 same_count >= 7,
1288 "expected >=7/10 same-topic in vector search, got {}",
1289 same_count
1290 );
1291
1292 let qtext = format!("{} {}", topic.keywords[0], topic.keywords[1]);
1294 let bres = store.search_bm25(&qtext, 10);
1295 assert!(!bres.is_empty());
1296 let top_topic_b = topic_of(bres[0].0);
1297 assert_eq!(top_topic_b, topic.name, "bm25 top-1 should match topic");
1298 let same_count_b = bres
1299 .iter()
1300 .filter(|(id, _)| topic_of(*id) == topic.name)
1301 .count();
1302 assert!(
1303 same_count_b >= 6,
1304 "expected >=6/10 same-topic in BM25, got {}",
1305 same_count_b
1306 );
1307
1308 let hres = store.search_hybrid(&qv, &qtext, 10, 0.5);
1310 assert!(!hres.is_empty());
1311 let top_topic_h = topic_of(hres[0].0);
1312 assert_eq!(top_topic_h, topic.name, "hybrid top-1 should match topic");
1313 let same_count_h = hres
1314 .iter()
1315 .filter(|(id, _)| topic_of(*id) == topic.name)
1316 .count();
1317 assert!(
1318 same_count_h >= 7,
1319 "expected >=7/10 same-topic in hybrid, got {}",
1320 same_count_h
1321 );
1322 }
1323 }
1324
1325 #[test]
1326 fn e2e_pack_then_open_single_block() {
1327 let dir_in = temp_dir("nvs_rust_e2e_in_single");
1328 let docs = vec![
1329 TDoc {
1330 id: "doc0".into(),
1331 text: "doc text number 0".into(),
1332 embedding: vec![1.0, 0.0, 0.0, 0.0],
1333 },
1334 TDoc {
1335 id: "doc1".into(),
1336 text: "doc text number 1".into(),
1337 embedding: vec![1.0, 0.0, 0.0, 0.0],
1338 },
1339 TDoc {
1340 id: "doc2".into(),
1341 text: "doc text number 2".into(),
1342 embedding: vec![1.0, 0.0, 0.0, 0.0],
1343 },
1344 ];
1345 pack_bundle(&dir_in, &docs, 4, 131072);
1346 let store = crate::VectorStore::from_bundle(Bundle::open(&dir_in).unwrap());
1347 assert_eq!(store.size(), 3);
1348 assert_eq!(store.dimensions(), 4);
1349 let d0 = store.get_document(0).unwrap();
1350 assert_eq!(d0.0, "doc0");
1351 assert!(d0.1.contains("doc text number 0"));
1352 assert!(d0.2.contains("\"embedding\""));
1353 let d2 = store.get_document(2).unwrap();
1354 assert_eq!(d2.0, "doc2");
1355 assert!(d2.1.contains("doc text number 2"));
1356 let q = [1f32, 0f32, 0f32, 0f32];
1357 let res = store.search_vector(&q, 2);
1358 assert!(!res.is_empty());
1359 }
1360
1361 #[test]
1362 fn e2e_pack_then_open_multiple_blocks() {
1363 let dir_in = temp_dir("nvs_rust_e2e_in_multi");
1364 let mut docs = Vec::new();
1365 for i in 0..10 {
1366 docs.push(TDoc {
1367 id: format!("m{i}"),
1368 text: format!("m text number {i}"),
1369 embedding: vec![1.0, 0.0, 0.0, 0.0],
1370 });
1371 }
1372 pack_bundle(&dir_in, &docs, 4, 256);
1373 let store = crate::VectorStore::from_bundle(Bundle::open(&dir_in).unwrap());
1374 assert_eq!(store.size(), 10);
1375 let d0 = store.get_document(0).unwrap();
1376 assert_eq!(d0.0, "m0");
1377 let d9 = store.get_document(9).unwrap();
1378 assert_eq!(d9.0, "m9");
1379 for i in 0..10 {
1380 let d = store.get_document(i).unwrap();
1381 assert_eq!(d.0, format!("m{i}"));
1382 }
1383 }
1384
1385 #[test]
1386 fn e2e_block_headers_and_checksums() {
1387 let dir_in = temp_dir("nvs_rust_e2e_hdr");
1388 let mut docs = Vec::new();
1389 for i in 0..10 {
1390 docs.push(TDoc {
1391 id: format!("h{i}"),
1392 text: format!("h text {i}"),
1393 embedding: vec![1.0, 0.0, 0.0, 0.0],
1394 });
1395 }
1396 pack_bundle(&dir_in, &docs, 4, 256);
1397 {
1399 let mut f = File::open(dir_in.join("meta.blocks")).unwrap();
1400 let mut buf = Vec::new();
1401 f.read_to_end(&mut buf).unwrap();
1402 let mut p = 0usize;
1403 let block_count = u32::from_le_bytes(buf[p..p + 4].try_into().unwrap()) as usize;
1404 p += 4;
1405 let mut hdrs = Vec::new();
1406 for _ in 0..block_count {
1407 let id = u32::from_le_bytes(buf[p..p + 4].try_into().unwrap());
1408 let usizeb = u32::from_le_bytes(buf[p + 4..p + 8].try_into().unwrap());
1409 let dcount = u32::from_le_bytes(buf[p + 8..p + 12].try_into().unwrap());
1410 let pad = u32::from_le_bytes(buf[p + 12..p + 16].try_into().unwrap());
1411 p += 16;
1412 hdrs.push((id, usizeb, dcount, pad));
1413 }
1414 let total_size = buf.len();
1415 let header_size = 4 + block_count * 16;
1416 let block_size = (total_size - header_size) / block_count;
1417 assert!(block_size > 0);
1418 let mut total_docs = 0usize;
1419 for i in 0..block_count {
1420 let (_id, usizeb, _dcount, _) = hdrs[i];
1421 let start = header_size + i * block_size;
1422 let mut consumed = 0usize;
1423 let mut pos = start;
1424 while consumed < usizeb as usize {
1425 let idl = u32::from_le_bytes(buf[pos..pos + 4].try_into().unwrap()) as usize;
1426 pos += 4;
1427 consumed += 4;
1428 pos += idl;
1429 consumed += idl;
1430 let tl = u32::from_le_bytes(buf[pos..pos + 4].try_into().unwrap()) as usize;
1431 pos += 4;
1432 consumed += 4;
1433 pos += tl;
1434 consumed += tl;
1435 let ml = u32::from_le_bytes(buf[pos..pos + 4].try_into().unwrap()) as usize;
1436 pos += 4;
1437 consumed += 4;
1438 pos += ml;
1439 consumed += ml;
1440 total_docs += 1;
1441 }
1442 assert_eq!(consumed, usizeb as usize);
1443 assert_eq!(
1444 total_docs as u32,
1445 hdrs.iter().map(|h| h.2).take(i + 1).sum::<u32>()
1446 );
1447 }
1448 assert_eq!(total_docs, 10);
1449 }
1450 {
1452 let mut s = String::new();
1453 File::open(dir_in.join("checksums.xxhash64"))
1454 .unwrap()
1455 .read_to_string(&mut s)
1456 .unwrap();
1457 let mut seen = 0;
1458 for line in s.lines() {
1459 if line.is_empty() {
1460 continue;
1461 }
1462 let mut parts = line.split(" ");
1463 let hex = parts.next().unwrap();
1464 let fname = parts.next().unwrap_or("");
1465 assert_eq!(hex.len(), 16);
1466 assert!(hex
1467 .chars()
1468 .all(|c| c.is_ascii_hexdigit() && c.is_lowercase() || c.is_ascii_digit()));
1469 assert!(Path::new(&dir_in).join(fname).exists());
1470 seen += 1;
1471 }
1472 assert!(seen >= 5);
1473 }
1474 }
1475
1476 #[test]
1477 fn e2e_vector_search_f16() {
1478 use half::f16;
1479 let dir = temp_dir("nvs_rust_e2e_f16");
1481 let num_docs = 3usize;
1482 let dim = 4usize;
1483 let block_size = 128u32;
1484
1485 {
1487 let row_bytes = dim * 2; let aligned = row_bytes.div_ceil(64) * 64;
1489 let mut data = vec![0u8; num_docs * aligned];
1490 for i in 0..num_docs {
1491 for j in 0..dim {
1492 let v = if i == j { 1.0f32 } else { 0.0f32 };
1493 let h = f16::from_f32(v);
1494 let off = i * aligned + j * 2;
1495 data[off..off + 2].copy_from_slice(&h.to_le_bytes());
1496 }
1497 }
1498 let mut f = File::create(dir.join("vectors.f16")).unwrap();
1499 f.write_all(&data).unwrap();
1500 }
1501
1502 {
1504 let mut f = File::create(dir.join("doclen.u32")).unwrap();
1505 for _ in 0..num_docs {
1506 f.write_all(&0u32.to_le_bytes()).unwrap();
1507 }
1508 }
1509 File::create(dir.join("lexicon.bin")).unwrap();
1511 File::create(dir.join("postings.bin")).unwrap();
1512 File::create(dir.join("terms.dict")).unwrap();
1513
1514 write_meta_idx(&dir, num_docs);
1516 write_meta_blocks(&dir, 1, block_size);
1517
1518 {
1520 let manifest = format!(
1521 r#"{{
1522 "format": "nvs.v1",
1523 "num_docs": {},
1524 "dim": {},
1525 "embedding": {{"model": "test", "dtype": "f16"}},
1526 "bm25": {{"avgdl": 0.0, "k1": 1.2, "b": 0.75}},
1527 "files": {{
1528 "vectors": {{"path": "vectors.f16", "dtype": "f16", "rows": {}, "cols": {}}},
1529 "doclen": {{"path": "doclen.u32", "dtype": "u32", "rows": {}}},
1530 "lexicon": {{"path": "lexicon.bin"}},
1531 "postings": {{"path": "postings.bin"}},
1532 "terms": {{"path": "terms.dict"}},
1533 "meta_idx": {{"path": "meta.idx", "schema": "u32 block_id, u32 offset, u32 doc_size"}},
1534 "meta": {{"path": "meta.blocks", "block_size": {}, "doc_aligned": true}}
1535 }}
1536}}"#,
1537 num_docs, dim, num_docs, dim, num_docs, block_size
1538 );
1539 let mut f = File::create(dir.join("manifest.json")).unwrap();
1540 f.write_all(manifest.as_bytes()).unwrap();
1541 }
1542
1543 let store = crate::VectorStore::from_bundle(Bundle::open(&dir).unwrap());
1545 assert_eq!(store.size(), num_docs);
1546 assert_eq!(store.dimensions(), dim);
1547 let q = [1f32, 0f32, 0f32, 0f32];
1548 let res = store.search_vector(&q, 3);
1549 assert!(!res.is_empty());
1550 assert_eq!(res[0].0, 0);
1551 }
1552}