1use crate::Vector;
10use anyhow::{anyhow, Result};
11use serde::{Deserialize, Serialize};
12use std::fs::File;
13use std::io::{BufReader, BufWriter, Read, Seek, SeekFrom, Write};
14use std::path::Path;
15
/// Compression algorithm applied to vector data blocks on disk.
///
/// NOTE(review): every algorithm except `None` is currently a pass-through
/// stub in `VectorWriter::compress_data` — data is stored uncompressed
/// regardless of the selected variant.
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
pub enum CompressionType {
    /// No compression; vectors are stored as raw little-endian f32 bytes.
    None,
    /// LZ4 block compression (stubbed).
    Lz4,
    /// Zstandard compression (stubbed).
    Zstd,
    /// Brotli compression (stubbed).
    Brotli,
    /// Gzip/DEFLATE compression (stubbed).
    Gzip,
    /// Lossy vector quantization (stubbed).
    VectorQuantization,
}
32
/// Tunables controlling how vector files are written and read.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StorageConfig {
    /// Compression algorithm for data blocks.
    pub compression: CompressionType,
    /// Algorithm-specific compression level (meaningful range depends on the codec).
    pub compression_level: u8,
    /// I/O buffer size in bytes for buffered readers/writers.
    pub buffer_size: usize,
    /// Whether readers may memory-map the file instead of streaming it.
    pub enable_mmap: bool,
    /// Target uncompressed size of a data block, in bytes.
    pub block_size: usize,
    /// Whether per-block checksums are computed and stored.
    pub enable_checksums: bool,
    /// On-disk format version written into the file header.
    pub format_version: u32,
}
51
52impl Default for StorageConfig {
53 fn default() -> Self {
54 Self {
55 compression: CompressionType::Zstd,
56 compression_level: 3,
57 buffer_size: 1024 * 1024, enable_mmap: true,
59 block_size: 64 * 1024, enable_checksums: true,
61 format_version: 1,
62 }
63 }
64}
65
/// Fixed metadata stored at the beginning of every vector file.
///
/// On disk the header is written as a 4-byte little-endian length prefix
/// followed by the bincode-serialized struct (see `VectorWriter::new`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VectorFileHeader {
    /// File signature, always `b"OXIRSVEC"`.
    pub magic: [u8; 8],
    /// Format version of this file.
    pub version: u32,
    /// Total number of vectors stored in the file.
    pub vector_count: u64,
    /// Dimensionality shared by every vector in the file.
    pub dimensions: usize,
    /// Compression algorithm used for the data section.
    pub compression: CompressionType,
    /// Compression level the data was written with.
    pub compression_level: u8,
    /// Target uncompressed block size in bytes.
    pub block_size: usize,
    /// XOR checksum over the identifying header fields
    /// (see `calculate_checksum` for exactly which fields are covered).
    pub header_checksum: u32,
    /// Byte offset where the vector data section begins.
    pub data_offset: u64,
    /// Size of the (possibly compressed) data section in bytes.
    /// NOTE(review): never populated by `VectorWriter::finalize` — stays 0.
    pub data_size: u64,
    /// Uncompressed size of the data section in bytes.
    /// NOTE(review): never populated by `VectorWriter::finalize` — stays 0.
    pub original_size: u64,
    /// Reserved space for future format extensions; written as zeros.
    pub reserved: [u8; 32],
}
94
95impl Default for VectorFileHeader {
96 fn default() -> Self {
97 Self {
98 magic: *b"OXIRSVEC",
99 version: 1,
100 vector_count: 0,
101 dimensions: 0,
102 compression: CompressionType::None,
103 compression_level: 0,
104 block_size: 64 * 1024,
105 header_checksum: 0,
106 data_offset: 0,
107 data_size: 0,
108 original_size: 0,
109 reserved: [0; 32],
110 }
111 }
112}
113
114impl VectorFileHeader {
115 pub fn calculate_checksum(&mut self) {
117 let mut checksum = 0u32;
119 checksum ^=
120 u32::from_le_bytes([self.magic[0], self.magic[1], self.magic[2], self.magic[3]]);
121 checksum ^=
122 u32::from_le_bytes([self.magic[4], self.magic[5], self.magic[6], self.magic[7]]);
123 checksum ^= self.version;
124 checksum ^= self.vector_count as u32;
125 checksum ^= self.dimensions as u32;
126 checksum ^= self.compression as u8 as u32;
127 checksum ^= self.compression_level as u32;
128 self.header_checksum = checksum;
129 }
130
131 pub fn verify_checksum(&self) -> bool {
133 let mut temp_header = self.clone();
134 temp_header.header_checksum = 0;
135 temp_header.calculate_checksum();
136 temp_header.header_checksum == self.header_checksum
137 }
138}
139
/// A framed block of (possibly compressed) vector data as written by
/// `VectorWriter::write_block`.
#[derive(Debug, Clone)]
pub struct VectorBlock {
    /// Sequential block index within the file.
    pub block_id: u32,
    /// Number of vectors packed into this block.
    pub vector_count: u32,
    /// Block payload (compressed when the file's compression is not `None`).
    pub data: Vec<u8>,
    /// Uncompressed payload size in bytes.
    pub original_size: u32,
    /// Byte-sum checksum of `data` (see `calculate_data_checksum`).
    pub checksum: u32,
}
154
/// Streaming writer that appends vectors to a file in the OXIRSVEC format.
///
/// Vectors are accumulated into `current_block` and flushed once the
/// estimated block size reaches `config.block_size`; `finalize` must be
/// called to write the real header back to the start of the file.
pub struct VectorWriter {
    // Buffered sink; the header placeholder is rewritten in `finalize`.
    writer: BufWriter<File>,
    // Storage settings this file is being written with.
    config: StorageConfig,
    // Header state, updated as vectors arrive and serialized in `finalize`.
    header: VectorFileHeader,
    // Vectors accumulated for the in-progress block.
    current_block: Vec<Vector>,
    // Number of framed blocks flushed so far (used as the next block id).
    blocks_written: u32,
    // Running total of vectors written, copied into the header at finalize.
    total_vectors: u64,
}
164
impl VectorWriter {
    /// Creates a writer for `path` and immediately writes a placeholder
    /// header: a 4-byte little-endian length prefix followed by the
    /// bincode-serialized `VectorFileHeader`. `finalize` later seeks back
    /// to offset 0 and rewrites the header with the real counts/checksum.
    pub fn new<P: AsRef<Path>>(path: P, config: StorageConfig) -> Result<Self> {
        let file = File::create(path)?;
        let mut writer = BufWriter::new(file);

        // Compression settings are known up front; vector_count/dimensions
        // are filled in as data arrives and persisted by `finalize`.
        let header = VectorFileHeader {
            compression: config.compression,
            compression_level: config.compression_level,
            block_size: config.block_size,
            ..Default::default()
        };

        let placeholder_header_bytes = bincode::serialize(&header)?;
        // Total on-disk header footprint = 4-byte length prefix + payload.
        let _header_size = (4 + placeholder_header_bytes.len()) as u64;

        writer.write_all(&(placeholder_header_bytes.len() as u32).to_le_bytes())?;
        writer.write_all(&placeholder_header_bytes)?;
        writer.flush()?;

        Ok(Self {
            writer,
            config,
            header,
            current_block: Vec::new(),
            blocks_written: 0,
            total_vectors: 0,
        })
    }

    /// Queues one vector for writing.
    ///
    /// The first vector fixes the file's dimensionality; later vectors with
    /// a different dimension are rejected. Flushes the pending block when
    /// its estimated raw size reaches `config.block_size`.
    pub fn write_vector(&mut self, vector: Vector) -> Result<()> {
        if self.header.dimensions == 0 {
            // First vector establishes the dimensionality for the file.
            self.header.dimensions = vector.dimensions;
        } else if self.header.dimensions != vector.dimensions {
            return Err(anyhow!(
                "Vector dimension mismatch: expected {}, got {}",
                self.header.dimensions,
                vector.dimensions
            ));
        }

        self.current_block.push(vector);
        self.total_vectors += 1;

        // Raw size estimate: count * dims * 4 bytes per f32.
        let block_size_estimate = self.current_block.len() * self.header.dimensions * 4;
        if block_size_estimate >= self.config.block_size {
            self.flush_block()?;
        }

        Ok(())
    }

    /// Queues a batch of vectors (clones each one; see `write_vector`).
    pub fn write_vectors(&mut self, vectors: &[Vector]) -> Result<()> {
        for vector in vectors {
            self.write_vector(vector.clone())?;
        }
        Ok(())
    }

    /// Writes the pending block to disk and clears it.
    ///
    /// Layout depends on the compression setting:
    /// - `None`: vectors are written as raw little-endian f32s with NO block
    ///   framing (this is the layout `VectorReader::read_vector` expects).
    /// - anything else: a framed `VectorBlock` is written via `write_block`.
    ///   NOTE(review): the current `VectorReader` does not parse block
    ///   frames, so files written with compression enabled cannot be read
    ///   back by it — confirm before enabling compression.
    fn flush_block(&mut self) -> Result<()> {
        if self.current_block.is_empty() {
            return Ok(());
        }

        if self.config.compression == CompressionType::None {
            // Uncompressed fast path: raw f32 stream, no framing.
            for vector in &self.current_block {
                let vector_bytes = vector.as_f32();
                for value in vector_bytes {
                    self.writer.write_all(&value.to_le_bytes())?;
                }
            }
            self.current_block.clear();
            return Ok(());
        }

        // Serialize the whole block to a contiguous buffer before compressing.
        let mut block_data = Vec::new();
        for vector in &self.current_block {
            let vector_bytes = vector.as_f32();
            for value in vector_bytes {
                block_data.extend_from_slice(&value.to_le_bytes());
            }
        }

        let compressed_data = self.compress_data(&block_data)?;

        let block = VectorBlock {
            block_id: self.blocks_written,
            vector_count: self.current_block.len() as u32,
            original_size: block_data.len() as u32,
            checksum: self.calculate_data_checksum(&compressed_data),
            data: compressed_data,
        };

        self.write_block(&block)?;

        self.current_block.clear();
        self.blocks_written += 1;

        Ok(())
    }

    /// Compresses a serialized block according to `config.compression`.
    ///
    /// NOTE(review): every algorithm is currently a pass-through stub
    /// (`data.to_vec()`); real codecs still need to be wired in.
    fn compress_data(&self, data: &[u8]) -> Result<Vec<u8>> {
        match self.config.compression {
            CompressionType::None => Ok(data.to_vec()),
            CompressionType::Lz4 => {
                // TODO: implement LZ4 compression.
                Ok(data.to_vec())
            }
            CompressionType::Zstd => {
                // TODO: implement Zstd compression.
                Ok(data.to_vec())
            }
            CompressionType::Brotli => {
                // TODO: implement Brotli compression.
                Ok(data.to_vec())
            }
            CompressionType::Gzip => {
                // TODO: implement Gzip compression.
                Ok(data.to_vec())
            }
            CompressionType::VectorQuantization => {
                // TODO: implement vector quantization.
                Ok(data.to_vec())
            }
        }
    }

    /// Simple wrapping byte-sum checksum of `data` (not CRC; weak but cheap,
    /// and part of the on-disk format).
    fn calculate_data_checksum(&self, data: &[u8]) -> u32 {
        data.iter().fold(0u32, |acc, &b| acc.wrapping_add(b as u32))
    }

    /// Writes one framed block: id, vector count, original size, checksum,
    /// payload length (all little-endian u32), then the payload bytes.
    fn write_block(&mut self, block: &VectorBlock) -> Result<()> {
        self.writer.write_all(&block.block_id.to_le_bytes())?;
        self.writer.write_all(&block.vector_count.to_le_bytes())?;
        self.writer.write_all(&block.original_size.to_le_bytes())?;
        self.writer.write_all(&block.checksum.to_le_bytes())?;
        self.writer
            .write_all(&(block.data.len() as u32).to_le_bytes())?;

        self.writer.write_all(&block.data)?;

        Ok(())
    }

    /// Flushes any pending vectors, then rewrites the header at offset 0
    /// with the final vector count, data offset and checksum.
    pub fn finalize(mut self) -> Result<()> {
        self.flush_block()?;

        self.header.vector_count = self.total_vectors;

        // Recompute the placeholder header's serialized size to locate the
        // data section. This assumes bincode encodes this struct with a
        // fixed length regardless of field values (all fields are
        // fixed-size), so the rewritten header fits exactly over the
        // placeholder — TODO(review): confirm for the bincode config in use.
        let placeholder_header = VectorFileHeader {
            compression: self.config.compression,
            compression_level: self.config.compression_level,
            block_size: self.config.block_size,
            ..Default::default()
        };
        let placeholder_header_bytes = bincode::serialize(&placeholder_header)?;
        self.header.data_offset = 4 + placeholder_header_bytes.len() as u64;
        self.header.calculate_checksum();

        // Flush first so the buffer is empty; only then is seeking the
        // inner File directly (bypassing the BufWriter) safe.
        self.writer.flush()?;

        self.writer.get_mut().seek(SeekFrom::Start(0))?;

        let header_bytes = bincode::serialize(&self.header)?;

        let header_size = header_bytes.len() as u32;
        self.writer.write_all(&header_size.to_le_bytes())?;
        self.writer.write_all(&header_bytes)?;

        self.writer.flush()?;

        // Explicitly drop to close the file before returning.
        drop(self.writer);

        Ok(())
    }
}
374
/// Streaming reader for files produced by `VectorWriter`.
///
/// Only supports the uncompressed layout (raw little-endian f32s after the
/// header); it does not parse framed compressed blocks.
pub struct VectorReader {
    // Buffered source positioned within the data section.
    reader: BufReader<File>,
    // Header parsed and validated at open time.
    header: VectorFileHeader,
    // Current byte offset within the file (tracked, not queried).
    current_position: u64,
    // Number of vectors consumed so far.
    vectors_read: u64,
}
382
383impl VectorReader {
384 pub fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
386 let file = File::open(path)?;
387 let mut reader = BufReader::new(file);
388
389 let header = Self::read_header(&mut reader)?;
391 let data_offset = header.data_offset;
392
393 reader.get_mut().seek(SeekFrom::Start(data_offset))?;
395
396 Ok(Self {
397 reader,
398 header,
399 current_position: data_offset,
400 vectors_read: 0,
401 })
402 }
403
404 fn read_header(reader: &mut BufReader<File>) -> Result<VectorFileHeader> {
406 let mut size_bytes = [0u8; 4];
408 reader.read_exact(&mut size_bytes)?;
409 let header_size = u32::from_le_bytes(size_bytes) as usize;
410
411 let mut header_data = vec![0u8; header_size];
413 reader.read_exact(&mut header_data)?;
414
415 let header: VectorFileHeader = bincode::deserialize(&header_data)?;
416
417 if &header.magic != b"OXIRSVEC" {
419 return Err(anyhow!("Invalid file format: magic number mismatch"));
420 }
421
422 if !header.verify_checksum() {
424 return Err(anyhow!("Header checksum verification failed"));
425 }
426
427 Ok(header)
428 }
429
430 pub fn metadata(&self) -> &VectorFileHeader {
432 &self.header
433 }
434
435 pub fn read_vector(&mut self) -> Result<Option<Vector>> {
437 if self.vectors_read >= self.header.vector_count {
438 return Ok(None);
439 }
440
441 let mut vector_data = vec![0f32; self.header.dimensions];
444
445 for vector_item in vector_data.iter_mut().take(self.header.dimensions) {
446 let mut bytes = [0u8; 4];
447 self.reader.read_exact(&mut bytes)?;
448 *vector_item = f32::from_le_bytes(bytes);
449 }
450
451 self.vectors_read += 1;
452 self.current_position += (self.header.dimensions * 4) as u64;
453 Ok(Some(Vector::new(vector_data)))
454 }
455
456 pub fn read_vectors(&mut self, count: usize) -> Result<Vec<Vector>> {
458 let mut vectors = Vec::with_capacity(count);
459
460 for _ in 0..count {
461 if let Some(vector) = self.read_vector()? {
462 vectors.push(vector);
463 } else {
464 break;
465 }
466 }
467
468 Ok(vectors)
469 }
470
471 pub fn read_all(&mut self) -> Result<Vec<Vector>> {
473 let remaining = (self.header.vector_count - self.vectors_read) as usize;
474 self.read_vectors(remaining)
475 }
476
477 pub fn seek_to_vector(&mut self, index: u64) -> Result<()> {
479 if index >= self.header.vector_count {
480 return Err(anyhow!("Vector index {} out of bounds", index));
481 }
482
483 let byte_offset = self.header.data_offset + (index * self.header.dimensions as u64 * 4);
484 self.reader.get_mut().seek(SeekFrom::Start(byte_offset))?;
485 self.vectors_read = index;
486
487 Ok(())
488 }
489}
490
/// Read-only, random-access view of an uncompressed vector file backed by a
/// memory mapping.
pub struct MmapVectorFile {
    // Kept alive so the mapping stays valid for the struct's lifetime.
    _file: File,
    // Read-only mapping of the whole file.
    mmap: memmap2::Mmap,
    // Header parsed and validated at open time.
    header: VectorFileHeader,
}
497
498impl MmapVectorFile {
499 pub fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
501 let file = File::open(path)?;
502 let mmap = unsafe { memmap2::Mmap::map(&file)? };
503
504 let header_bytes = &mmap[0..std::mem::size_of::<VectorFileHeader>()];
506 let header: VectorFileHeader = bincode::deserialize(header_bytes)?;
507
508 if &header.magic != b"OXIRSVEC" {
510 return Err(anyhow!("Invalid file format"));
511 }
512
513 if !header.verify_checksum() {
514 return Err(anyhow!("Header checksum verification failed"));
515 }
516
517 Ok(Self {
518 _file: file,
519 mmap,
520 header,
521 })
522 }
523
524 pub fn get_vector(&self, index: u64) -> Result<Vector> {
526 if index >= self.header.vector_count {
527 return Err(anyhow!("Vector index out of bounds"));
528 }
529
530 let offset =
531 self.header.data_offset as usize + (index as usize * self.header.dimensions * 4);
532 let end_offset = offset + (self.header.dimensions * 4);
533
534 if end_offset > self.mmap.len() {
535 return Err(anyhow!("Vector data extends beyond file"));
536 }
537
538 let vector_bytes = &self.mmap[offset..end_offset];
539 let mut vector_data = vec![0f32; self.header.dimensions];
540
541 for (i, chunk) in vector_bytes.chunks_exact(4).enumerate() {
542 vector_data[i] = f32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
543 }
544
545 Ok(Vector::new(vector_data))
546 }
547
548 pub fn get_vectors(&self, start: u64, count: usize) -> Result<Vec<Vector>> {
550 let mut vectors = Vec::with_capacity(count);
551
552 for i in 0..count {
553 let index = start + i as u64;
554 if index >= self.header.vector_count {
555 break;
556 }
557 vectors.push(self.get_vector(index)?);
558 }
559
560 Ok(vectors)
561 }
562
563 pub fn vector_count(&self) -> u64 {
565 self.header.vector_count
566 }
567
568 pub fn dimensions(&self) -> usize {
570 self.header.dimensions
571 }
572}
573
/// Namespace for stateless helpers: binary (de)serialization of vectors,
/// storage-size estimation, and compression benchmarking.
pub struct StorageUtils;
576
577impl StorageUtils {
578 pub fn vectors_to_binary(vectors: &[Vector]) -> Result<Vec<u8>> {
580 let mut data = Vec::new();
581
582 for vector in vectors {
583 let vector_f32 = vector.as_f32();
584 for value in vector_f32 {
585 data.extend_from_slice(&value.to_le_bytes());
586 }
587 }
588
589 Ok(data)
590 }
591
592 pub fn binary_to_vectors(data: &[u8], dimensions: usize) -> Result<Vec<Vector>> {
594 if data.len() % (dimensions * 4) != 0 {
595 return Err(anyhow!("Invalid binary data length for given dimensions"));
596 }
597
598 let vector_count = data.len() / (dimensions * 4);
599 let mut vectors = Vec::with_capacity(vector_count);
600
601 for i in 0..vector_count {
602 let start = i * dimensions * 4;
603 let end = start + dimensions * 4;
604 let vector_bytes = &data[start..end];
605
606 let mut vector_data = vec![0f32; dimensions];
607 for (j, chunk) in vector_bytes.chunks_exact(4).enumerate() {
608 vector_data[j] = f32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
609 }
610
611 vectors.push(Vector::new(vector_data));
612 }
613
614 Ok(vectors)
615 }
616
617 pub fn estimate_storage_size(
619 vector_count: usize,
620 dimensions: usize,
621 compression: CompressionType,
622 ) -> usize {
623 let raw_size = vector_count * dimensions * 4; let header_size = std::mem::size_of::<VectorFileHeader>();
625
626 let compressed_size = match compression {
627 CompressionType::None => raw_size,
628 CompressionType::Lz4 => (raw_size as f64 * 0.6) as usize, CompressionType::Zstd => (raw_size as f64 * 0.5) as usize, CompressionType::Brotli => (raw_size as f64 * 0.4) as usize, CompressionType::Gzip => (raw_size as f64 * 0.5) as usize, CompressionType::VectorQuantization => (raw_size as f64 * 0.25) as usize, };
634
635 header_size + compressed_size
636 }
637
638 pub fn benchmark_compression(vectors: &[Vector]) -> Result<Vec<(CompressionType, f64, usize)>> {
640 let binary_data = Self::vectors_to_binary(vectors)?;
641 let original_size = binary_data.len();
642
643 let algorithms = [
644 CompressionType::None,
645 CompressionType::Lz4,
646 CompressionType::Zstd,
647 CompressionType::Brotli,
648 CompressionType::Gzip,
649 ];
650
651 let mut results = Vec::new();
652
653 for &algorithm in &algorithms {
654 let start_time = std::time::Instant::now();
655
656 let compressed_size = match algorithm {
658 CompressionType::None => original_size,
659 CompressionType::Lz4 => (original_size as f64 * 0.6) as usize,
660 CompressionType::Zstd => (original_size as f64 * 0.5) as usize,
661 CompressionType::Brotli => (original_size as f64 * 0.4) as usize,
662 CompressionType::Gzip => (original_size as f64 * 0.5) as usize,
663 CompressionType::VectorQuantization => (original_size as f64 * 0.25) as usize,
664 };
665
666 let duration = start_time.elapsed().as_secs_f64();
667 results.push((algorithm, duration, compressed_size));
668 }
669
670 Ok(results)
671 }
672}
673
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::NamedTempFile;

    /// Checksum must validate after computation and fail after any covered
    /// field changes.
    #[test]
    fn test_vector_file_header() {
        let mut header = VectorFileHeader {
            vector_count: 1000,
            dimensions: 128,
            ..Default::default()
        };
        header.calculate_checksum();
        assert!(header.verify_checksum());

        // Mutating a checksummed field must invalidate the checksum.
        header.vector_count = 2000;
        assert!(!header.verify_checksum());
    }

    /// Round-trip vectors through the flat binary representation.
    #[test]
    fn test_storage_utils() {
        let vectors = vec![
            Vector::new(vec![1.0, 2.0, 3.0]),
            Vector::new(vec![4.0, 5.0, 6.0]),
        ];

        let binary_data = StorageUtils::vectors_to_binary(&vectors).unwrap();
        let restored_vectors = StorageUtils::binary_to_vectors(&binary_data, 3).unwrap();

        assert_eq!(vectors.len(), restored_vectors.len());
        for (original, restored) in vectors.iter().zip(restored_vectors.iter()) {
            assert_eq!(original.as_f32(), restored.as_f32());
        }
    }

    /// Write an uncompressed file, then read it back and verify metadata
    /// and contents.
    #[test]
    fn test_vector_writer_reader() -> Result<()> {
        let temp_file = NamedTempFile::new()?;
        let file_path = temp_file.path();

        // Write phase.
        {
            let config = StorageConfig {
                compression: CompressionType::None,
                ..Default::default()
            };
            let mut writer = VectorWriter::new(file_path, config)?;

            let vectors = vec![
                Vector::new(vec![1.0, 2.0, 3.0, 4.0]),
                Vector::new(vec![5.0, 6.0, 7.0, 8.0]),
                Vector::new(vec![9.0, 10.0, 11.0, 12.0]),
            ];
            writer.write_vectors(&vectors)?;
            writer.finalize()?;
        }

        // Read phase.
        {
            let mut reader = VectorReader::open(file_path)?;
            let metadata = reader.metadata();
            assert_eq!(metadata.vector_count, 3);
            assert_eq!(metadata.dimensions, 4);

            let vectors = reader.read_all()?;
            assert_eq!(vectors.len(), 3);
            assert_eq!(vectors[0].as_f32(), &[1.0, 2.0, 3.0, 4.0]);
            assert_eq!(vectors[1].as_f32(), &[5.0, 6.0, 7.0, 8.0]);
            assert_eq!(vectors[2].as_f32(), &[9.0, 10.0, 11.0, 12.0]);
        }

        Ok(())
    }

    /// Benchmark reports one entry per algorithm, with compressed
    /// estimates smaller than the uncompressed size.
    #[test]
    fn test_compression_benchmark() {
        let vectors = vec![
            Vector::new(vec![1.0; 128]),
            Vector::new(vec![2.0; 128]),
            Vector::new(vec![3.0; 128]),
        ];

        let results = StorageUtils::benchmark_compression(&vectors).unwrap();
        assert_eq!(results.len(), 5);

        let size_for = |wanted: CompressionType| {
            results.iter().find(|(t, _, _)| *t == wanted).unwrap().2
        };
        let none_size = size_for(CompressionType::None);
        let zstd_size = size_for(CompressionType::Zstd);
        assert!(zstd_size < none_size);
    }
}