1use crate::Vector;
10use anyhow::{anyhow, Result};
11use bincode::config::Configuration;
12use bincode::{Decode, Encode};
13use serde::{Deserialize, Serialize};
14use std::fs::File;
15use std::io::{BufReader, BufWriter, Read, Seek, SeekFrom, Write};
16use std::path::Path;
17
/// Returns the bincode configuration used for all on-disk (de)serialization.
///
/// Fixed-width integer encoding is chosen (instead of bincode's default
/// varint) so the encoded header size does not depend on field values;
/// `VectorWriter::finalize` relies on this to overwrite the placeholder
/// header in place with the final one of identical length.
fn bincode_config(
) -> Configuration<bincode::config::LittleEndian, bincode::config::Fixint, bincode::config::NoLimit>
{
    bincode::config::standard().with_fixed_int_encoding()
}
25
/// Compression algorithm identifier stored in the file header and applied to
/// vector data blocks.
///
/// NOTE(review): `VectorWriter::compress_data` currently passes data through
/// unchanged for every variant, so the choice only affects block framing and
/// size estimates — confirm whether real compressors are planned.
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize, Encode, Decode)]
pub enum CompressionType {
    /// Raw little-endian f32 data: no compression and no block framing.
    None,
    /// LZ4 (fast, moderate ratio).
    Lz4,
    /// Zstandard (balanced speed/ratio); the `StorageConfig` default.
    Zstd,
    /// Brotli (slower, higher ratio).
    Brotli,
    /// Gzip / DEFLATE.
    Gzip,
    /// Lossy vector quantization.
    VectorQuantization,
}
42
/// Tunable parameters controlling how vector files are written and read.
#[derive(Debug, Clone, Serialize, Deserialize, Encode, Decode)]
pub struct StorageConfig {
    /// Compression algorithm for vector data blocks.
    pub compression: CompressionType,
    /// Algorithm-specific compression level.
    pub compression_level: u8,
    /// I/O buffer size in bytes. NOTE(review): not currently consulted —
    /// `VectorWriter`/`VectorReader` use the default `BufWriter`/`BufReader`
    /// capacity; confirm whether this should be wired in.
    pub buffer_size: usize,
    /// Whether memory-mapped reading should be preferred.
    /// NOTE(review): advisory only; nothing in this module reads it.
    pub enable_mmap: bool,
    /// Target uncompressed size of a data block, in bytes.
    pub block_size: usize,
    /// Whether checksums should be verified. NOTE(review): block checksums
    /// are always written and this flag is never consulted — confirm intent.
    pub enable_checksums: bool,
    /// On-disk format version stamp. NOTE(review): the header's `version`
    /// comes from `VectorFileHeader::default()`, not from this field.
    pub format_version: u32,
}
61
62impl Default for StorageConfig {
63 fn default() -> Self {
64 Self {
65 compression: CompressionType::Zstd,
66 compression_level: 3,
67 buffer_size: 1024 * 1024, enable_mmap: true,
69 block_size: 64 * 1024, enable_checksums: true,
71 format_version: 1,
72 }
73 }
74}
75
/// Fixed-size header stored, length-prefixed, at the start of a vector file.
///
/// On-disk layout: `[u32 header length][bincode-encoded header][data ...]`.
#[derive(Debug, Clone, Serialize, Deserialize, Encode, Decode)]
pub struct VectorFileHeader {
    /// File magic; always `b"OXIRSVEC"`.
    pub magic: [u8; 8],
    /// Format version.
    pub version: u32,
    /// Number of vectors stored in the file.
    pub vector_count: u64,
    /// Dimensionality shared by every vector in the file.
    pub dimensions: usize,
    /// Compression applied to the data section.
    pub compression: CompressionType,
    /// Compression level used when writing.
    pub compression_level: u8,
    /// Target uncompressed block size in bytes.
    pub block_size: usize,
    /// XOR checksum over identifying header fields; see `calculate_checksum`.
    pub header_checksum: u32,
    /// Byte offset of the data section from the start of the file.
    pub data_offset: u64,
    /// Size of the (possibly compressed) data section in bytes.
    /// NOTE(review): never populated by `VectorWriter::finalize` — stays 0.
    pub data_size: u64,
    /// Total uncompressed data size in bytes.
    /// NOTE(review): also never populated — stays 0.
    pub original_size: u64,
    /// Reserved for future format extensions; written as zeros.
    pub reserved: [u8; 32],
}
104
105impl Default for VectorFileHeader {
106 fn default() -> Self {
107 Self {
108 magic: *b"OXIRSVEC",
109 version: 1,
110 vector_count: 0,
111 dimensions: 0,
112 compression: CompressionType::None,
113 compression_level: 0,
114 block_size: 64 * 1024,
115 header_checksum: 0,
116 data_offset: 0,
117 data_size: 0,
118 original_size: 0,
119 reserved: [0; 32],
120 }
121 }
122}
123
124impl VectorFileHeader {
125 pub fn calculate_checksum(&mut self) {
127 let mut checksum = 0u32;
129 checksum ^=
130 u32::from_le_bytes([self.magic[0], self.magic[1], self.magic[2], self.magic[3]]);
131 checksum ^=
132 u32::from_le_bytes([self.magic[4], self.magic[5], self.magic[6], self.magic[7]]);
133 checksum ^= self.version;
134 checksum ^= self.vector_count as u32;
135 checksum ^= self.dimensions as u32;
136 checksum ^= self.compression as u8 as u32;
137 checksum ^= self.compression_level as u32;
138 self.header_checksum = checksum;
139 }
140
141 pub fn verify_checksum(&self) -> bool {
143 let mut temp_header = self.clone();
144 temp_header.header_checksum = 0;
145 temp_header.calculate_checksum();
146 temp_header.header_checksum == self.header_checksum
147 }
148}
149
/// A framed, independently-checksummed chunk of vector data as written by
/// `VectorWriter::write_block`. Frames are only produced when compression is
/// enabled; `CompressionType::None` writes raw unframed f32 data instead.
#[derive(Debug, Clone)]
pub struct VectorBlock {
    /// Sequential block index within the file.
    pub block_id: u32,
    /// Number of vectors encoded in this block.
    pub vector_count: u32,
    /// Block payload (nominally compressed; currently a pass-through copy).
    pub data: Vec<u8>,
    /// Uncompressed payload size in bytes.
    pub original_size: u32,
    /// Byte-sum checksum of `data`; see `calculate_data_checksum`.
    pub checksum: u32,
}
164
/// Streaming writer that buffers vectors into blocks, writes them to disk,
/// and rewrites the file header with final counts in `finalize`.
pub struct VectorWriter {
    writer: BufWriter<File>,    // buffered output file
    config: StorageConfig,      // write-time settings
    header: VectorFileHeader,   // updated as vectors arrive; rewritten in `finalize`
    current_block: Vec<Vector>, // vectors pending in the open (unwritten) block
    blocks_written: u32,        // framed blocks emitted so far
    total_vectors: u64,         // running count across all blocks
}
174
impl VectorWriter {
    /// Creates a writer for a new vector file at `path`.
    ///
    /// A length-prefixed placeholder header is written immediately and later
    /// overwritten in place by `finalize`. This is safe because
    /// `bincode_config` uses fixed-width integers, so the encoded header
    /// length never depends on field values.
    pub fn new<P: AsRef<Path>>(path: P, config: StorageConfig) -> Result<Self> {
        let file = File::create(path)?;
        let mut writer = BufWriter::new(file);

        // Seed the header with write-time settings; counts/offsets are
        // filled in by `finalize`.
        let header = VectorFileHeader {
            compression: config.compression,
            compression_level: config.compression_level,
            block_size: config.block_size,
            ..Default::default()
        };

        let placeholder_header_bytes = bincode::encode_to_vec(&header, bincode_config())
            .map_err(|e| anyhow!("Failed to serialize placeholder header: {}", e))?;
        let _header_size = (4 + placeholder_header_bytes.len()) as u64;

        // File layout: [u32 header length][header bytes][vector data ...].
        writer.write_all(&(placeholder_header_bytes.len() as u32).to_le_bytes())?;
        writer.write_all(&placeholder_header_bytes)?;
        writer.flush()?;

        Ok(Self {
            writer,
            config,
            header,
            current_block: Vec::new(),
            blocks_written: 0,
            total_vectors: 0,
        })
    }

    /// Buffers a single vector for writing.
    ///
    /// The first vector fixes the file's dimensionality; subsequent vectors
    /// must match it or an error is returned. The pending block is flushed
    /// once its raw f32 size reaches `config.block_size`.
    pub fn write_vector(&mut self, vector: Vector) -> Result<()> {
        if self.header.dimensions == 0 {
            // First vector determines the dimensionality of the whole file.
            self.header.dimensions = vector.dimensions;
        } else if self.header.dimensions != vector.dimensions {
            return Err(anyhow!(
                "Vector dimension mismatch: expected {}, got {}",
                self.header.dimensions,
                vector.dimensions
            ));
        }

        self.current_block.push(vector);
        self.total_vectors += 1;

        // Raw size estimate: 4 bytes per f32 component.
        let block_size_estimate = self.current_block.len() * self.header.dimensions * 4; if block_size_estimate >= self.config.block_size {
            self.flush_block()?;
        }

        Ok(())
    }

    /// Buffers a slice of vectors via `write_vector`.
    pub fn write_vectors(&mut self, vectors: &[Vector]) -> Result<()> {
        for vector in vectors {
            self.write_vector(vector.clone())?;
        }
        Ok(())
    }

    /// Writes the pending block to disk and clears it. No-op when empty.
    ///
    /// With `CompressionType::None`, vectors are written as raw little-endian
    /// f32s with no framing — the layout `VectorReader` expects. For any
    /// other compression type the data is wrapped in a `VectorBlock` frame
    /// via `write_block`.
    ///
    /// NOTE(review): `VectorReader::read_vector` does not parse block frames,
    /// so files written with compression enabled are not readable by it —
    /// confirm which component is meant to consume framed files.
    fn flush_block(&mut self) -> Result<()> {
        if self.current_block.is_empty() {
            return Ok(());
        }

        // Fast path: uncompressed, unframed raw f32 stream.
        if self.config.compression == CompressionType::None {
            for vector in &self.current_block {
                let vector_bytes = vector.as_f32();
                for value in vector_bytes {
                    self.writer.write_all(&value.to_le_bytes())?;
                }
            }
            self.current_block.clear();
            return Ok(());
        }

        // Serialize the block's vectors to a contiguous little-endian buffer.
        let mut block_data = Vec::new();
        for vector in &self.current_block {
            let vector_bytes = vector.as_f32();
            for value in vector_bytes {
                block_data.extend_from_slice(&value.to_le_bytes());
            }
        }

        let compressed_data = self.compress_data(&block_data)?;

        let block = VectorBlock {
            block_id: self.blocks_written,
            vector_count: self.current_block.len() as u32,
            original_size: block_data.len() as u32,
            checksum: self.calculate_data_checksum(&compressed_data),
            data: compressed_data,
        };

        self.write_block(&block)?;

        self.current_block.clear();
        self.blocks_written += 1;

        Ok(())
    }

    /// Compresses `data` according to `config.compression`.
    ///
    /// NOTE(review): every variant is currently a pass-through stub — no
    /// actual compression is performed.
    fn compress_data(&self, data: &[u8]) -> Result<Vec<u8>> {
        match self.config.compression {
            CompressionType::None => Ok(data.to_vec()),
            CompressionType::Lz4 => {
                Ok(data.to_vec())
            }
            CompressionType::Zstd => {
                Ok(data.to_vec())
            }
            CompressionType::Brotli => {
                Ok(data.to_vec())
            }
            CompressionType::Gzip => {
                Ok(data.to_vec())
            }
            CompressionType::VectorQuantization => {
                Ok(data.to_vec())
            }
        }
    }

    /// Simple wrapping byte-sum checksum (not a CRC); paired with the
    /// matching check a frame-aware reader would perform.
    fn calculate_data_checksum(&self, data: &[u8]) -> u32 {
        data.iter().fold(0u32, |acc, &b| acc.wrapping_add(b as u32))
    }

    /// Writes one framed block:
    /// `[block_id][vector_count][original_size][checksum][data len][data]`,
    /// all integers u32 little-endian.
    fn write_block(&mut self, block: &VectorBlock) -> Result<()> {
        self.writer.write_all(&block.block_id.to_le_bytes())?;
        self.writer.write_all(&block.vector_count.to_le_bytes())?;
        self.writer.write_all(&block.original_size.to_le_bytes())?;
        self.writer.write_all(&block.checksum.to_le_bytes())?;
        self.writer
            .write_all(&(block.data.len() as u32).to_le_bytes())?;

        self.writer.write_all(&block.data)?;

        Ok(())
    }

    /// Flushes any pending block, then rewrites the header at the start of
    /// the file with the final vector count, data offset, and checksum.
    ///
    /// NOTE(review): `data_size` and `original_size` are never filled in and
    /// remain 0 in the finalized header.
    pub fn finalize(mut self) -> Result<()> {
        self.flush_block()?;

        self.header.vector_count = self.total_vectors;

        // Encode a placeholder purely to learn the header's byte length;
        // under fixed-int encoding the length is value-independent, so this
        // matches both the placeholder written by `new` and the final header.
        let placeholder_header = VectorFileHeader {
            compression: self.config.compression,
            compression_level: self.config.compression_level,
            block_size: self.config.block_size,
            ..Default::default()
        };
        let placeholder_header_bytes =
            bincode::encode_to_vec(&placeholder_header, bincode_config())
                .map_err(|e| anyhow!("Failed to serialize placeholder header: {}", e))?;
        self.header.data_offset = 4 + placeholder_header_bytes.len() as u64;
        self.header.calculate_checksum();

        self.writer.flush()?;

        // Safe to seek the inner file directly: the BufWriter buffer was
        // just flushed, so no buffered bytes are lost or misplaced.
        self.writer.get_mut().seek(SeekFrom::Start(0))?;

        let header_bytes = bincode::encode_to_vec(&self.header, bincode_config())
            .map_err(|e| anyhow!("Failed to serialize header: {}", e))?;

        let header_size = header_bytes.len() as u32;
        self.writer.write_all(&header_size.to_le_bytes())?;
        self.writer.write_all(&header_bytes)?;

        self.writer.flush()?;

        drop(self.writer);

        Ok(())
    }
}
388
/// Sequential reader for files produced by `VectorWriter`.
///
/// NOTE(review): reads raw little-endian f32 data only — it does not parse
/// the `VectorBlock` frames written when compression is enabled, so only
/// `CompressionType::None` files round-trip through this reader.
pub struct VectorReader {
    reader: BufReader<File>,   // buffered input positioned within the data section
    header: VectorFileHeader,  // validated header parsed at open time
    current_position: u64,     // byte-offset bookkeeping (advanced on reads)
    vectors_read: u64,         // vectors consumed so far
}
396
397impl VectorReader {
398 pub fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
400 let file = File::open(path)?;
401 let mut reader = BufReader::new(file);
402
403 let header = Self::read_header(&mut reader)?;
405 let data_offset = header.data_offset;
406
407 reader.get_mut().seek(SeekFrom::Start(data_offset))?;
409
410 Ok(Self {
411 reader,
412 header,
413 current_position: data_offset,
414 vectors_read: 0,
415 })
416 }
417
418 fn read_header(reader: &mut BufReader<File>) -> Result<VectorFileHeader> {
420 let mut size_bytes = [0u8; 4];
422 reader.read_exact(&mut size_bytes)?;
423 let header_size = u32::from_le_bytes(size_bytes) as usize;
424
425 let mut header_data = vec![0u8; header_size];
427 reader.read_exact(&mut header_data)?;
428
429 let (header, _): (VectorFileHeader, _) =
430 bincode::decode_from_slice(&header_data, bincode_config())
431 .map_err(|e| anyhow!("Failed to deserialize header: {}", e))?;
432
433 if &header.magic != b"OXIRSVEC" {
435 return Err(anyhow!("Invalid file format: magic number mismatch"));
436 }
437
438 if !header.verify_checksum() {
440 return Err(anyhow!("Header checksum verification failed"));
441 }
442
443 Ok(header)
444 }
445
446 pub fn metadata(&self) -> &VectorFileHeader {
448 &self.header
449 }
450
451 pub fn read_vector(&mut self) -> Result<Option<Vector>> {
453 if self.vectors_read >= self.header.vector_count {
454 return Ok(None);
455 }
456
457 let mut vector_data = vec![0f32; self.header.dimensions];
460
461 for vector_item in vector_data.iter_mut().take(self.header.dimensions) {
462 let mut bytes = [0u8; 4];
463 self.reader.read_exact(&mut bytes)?;
464 *vector_item = f32::from_le_bytes(bytes);
465 }
466
467 self.vectors_read += 1;
468 self.current_position += (self.header.dimensions * 4) as u64;
469 Ok(Some(Vector::new(vector_data)))
470 }
471
472 pub fn read_vectors(&mut self, count: usize) -> Result<Vec<Vector>> {
474 let mut vectors = Vec::with_capacity(count);
475
476 for _ in 0..count {
477 if let Some(vector) = self.read_vector()? {
478 vectors.push(vector);
479 } else {
480 break;
481 }
482 }
483
484 Ok(vectors)
485 }
486
487 pub fn read_all(&mut self) -> Result<Vec<Vector>> {
489 let remaining = (self.header.vector_count - self.vectors_read) as usize;
490 self.read_vectors(remaining)
491 }
492
493 pub fn seek_to_vector(&mut self, index: u64) -> Result<()> {
495 if index >= self.header.vector_count {
496 return Err(anyhow!("Vector index {} out of bounds", index));
497 }
498
499 let byte_offset = self.header.data_offset + (index * self.header.dimensions as u64 * 4);
500 self.reader.get_mut().seek(SeekFrom::Start(byte_offset))?;
501 self.vectors_read = index;
502
503 Ok(())
504 }
505}
506
/// Random-access reader backed by a read-only memory mapping; decodes vectors
/// directly out of the mapped bytes without per-read syscalls.
pub struct MmapVectorFile {
    _file: File,               // kept alive for the lifetime of the mapping
    mmap: memmap2::Mmap,       // read-only map of the whole file
    header: VectorFileHeader,  // validated header parsed at open time
}
513
514impl MmapVectorFile {
515 pub fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
517 let file = File::open(path)?;
518 let mmap = unsafe { memmap2::Mmap::map(&file)? };
519
520 if mmap.len() < 4 {
522 return Err(anyhow!("File too small to contain header"));
523 }
524 let header_size = u32::from_le_bytes([mmap[0], mmap[1], mmap[2], mmap[3]]) as usize;
525 if mmap.len() < 4 + header_size {
526 return Err(anyhow!("File too small to contain full header"));
527 }
528 let header_bytes = &mmap[4..4 + header_size];
529 let (header, _): (VectorFileHeader, _) =
530 bincode::decode_from_slice(header_bytes, bincode_config())
531 .map_err(|e| anyhow!("Failed to deserialize header: {}", e))?;
532
533 if &header.magic != b"OXIRSVEC" {
535 return Err(anyhow!("Invalid file format"));
536 }
537
538 if !header.verify_checksum() {
539 return Err(anyhow!("Header checksum verification failed"));
540 }
541
542 Ok(Self {
543 _file: file,
544 mmap,
545 header,
546 })
547 }
548
549 pub fn get_vector(&self, index: u64) -> Result<Vector> {
551 if index >= self.header.vector_count {
552 return Err(anyhow!("Vector index out of bounds"));
553 }
554
555 let offset =
556 self.header.data_offset as usize + (index as usize * self.header.dimensions * 4);
557 let end_offset = offset + (self.header.dimensions * 4);
558
559 if end_offset > self.mmap.len() {
560 return Err(anyhow!("Vector data extends beyond file"));
561 }
562
563 let vector_bytes = &self.mmap[offset..end_offset];
564 let mut vector_data = vec![0f32; self.header.dimensions];
565
566 for (i, chunk) in vector_bytes.chunks_exact(4).enumerate() {
567 vector_data[i] = f32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
568 }
569
570 Ok(Vector::new(vector_data))
571 }
572
573 pub fn get_vectors(&self, start: u64, count: usize) -> Result<Vec<Vector>> {
575 let mut vectors = Vec::with_capacity(count);
576
577 for i in 0..count {
578 let index = start + i as u64;
579 if index >= self.header.vector_count {
580 break;
581 }
582 vectors.push(self.get_vector(index)?);
583 }
584
585 Ok(vectors)
586 }
587
588 pub fn vector_count(&self) -> u64 {
590 self.header.vector_count
591 }
592
593 pub fn dimensions(&self) -> usize {
595 self.header.dimensions
596 }
597}
598
599pub struct StorageUtils;
601
602impl StorageUtils {
603 pub fn vectors_to_binary(vectors: &[Vector]) -> Result<Vec<u8>> {
605 let mut data = Vec::new();
606
607 for vector in vectors {
608 let vector_f32 = vector.as_f32();
609 for value in vector_f32 {
610 data.extend_from_slice(&value.to_le_bytes());
611 }
612 }
613
614 Ok(data)
615 }
616
617 pub fn binary_to_vectors(data: &[u8], dimensions: usize) -> Result<Vec<Vector>> {
619 if data.len() % (dimensions * 4) != 0 {
620 return Err(anyhow!("Invalid binary data length for given dimensions"));
621 }
622
623 let vector_count = data.len() / (dimensions * 4);
624 let mut vectors = Vec::with_capacity(vector_count);
625
626 for i in 0..vector_count {
627 let start = i * dimensions * 4;
628 let end = start + dimensions * 4;
629 let vector_bytes = &data[start..end];
630
631 let mut vector_data = vec![0f32; dimensions];
632 for (j, chunk) in vector_bytes.chunks_exact(4).enumerate() {
633 vector_data[j] = f32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
634 }
635
636 vectors.push(Vector::new(vector_data));
637 }
638
639 Ok(vectors)
640 }
641
642 pub fn estimate_storage_size(
644 vector_count: usize,
645 dimensions: usize,
646 compression: CompressionType,
647 ) -> usize {
648 let raw_size = vector_count * dimensions * 4; let header_size = std::mem::size_of::<VectorFileHeader>();
650
651 let compressed_size = match compression {
652 CompressionType::None => raw_size,
653 CompressionType::Lz4 => (raw_size as f64 * 0.6) as usize, CompressionType::Zstd => (raw_size as f64 * 0.5) as usize, CompressionType::Brotli => (raw_size as f64 * 0.4) as usize, CompressionType::Gzip => (raw_size as f64 * 0.5) as usize, CompressionType::VectorQuantization => (raw_size as f64 * 0.25) as usize, };
659
660 header_size + compressed_size
661 }
662
663 pub fn benchmark_compression(vectors: &[Vector]) -> Result<Vec<(CompressionType, f64, usize)>> {
665 let binary_data = Self::vectors_to_binary(vectors)?;
666 let original_size = binary_data.len();
667
668 let algorithms = [
669 CompressionType::None,
670 CompressionType::Lz4,
671 CompressionType::Zstd,
672 CompressionType::Brotli,
673 CompressionType::Gzip,
674 ];
675
676 let mut results = Vec::new();
677
678 for &algorithm in &algorithms {
679 let start_time = std::time::Instant::now();
680
681 let compressed_size = match algorithm {
683 CompressionType::None => original_size,
684 CompressionType::Lz4 => (original_size as f64 * 0.6) as usize,
685 CompressionType::Zstd => (original_size as f64 * 0.5) as usize,
686 CompressionType::Brotli => (original_size as f64 * 0.4) as usize,
687 CompressionType::Gzip => (original_size as f64 * 0.5) as usize,
688 CompressionType::VectorQuantization => (original_size as f64 * 0.25) as usize,
689 };
690
691 let duration = start_time.elapsed().as_secs_f64();
692 results.push((algorithm, duration, compressed_size));
693 }
694
695 Ok(results)
696 }
697}
698
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::NamedTempFile;

    /// Checksum round-trip: verification passes right after calculation and
    /// fails after a covered field is tampered with.
    #[test]
    fn test_vector_file_header() {
        let mut header = VectorFileHeader {
            vector_count: 1000,
            dimensions: 128,
            ..Default::default()
        };
        header.calculate_checksum();

        assert!(header.verify_checksum());

        // Mutating a covered field must invalidate the stored checksum.
        header.vector_count = 2000;
        assert!(!header.verify_checksum());
    }

    /// Binary round-trip: vectors -> flat bytes -> vectors is lossless.
    #[test]
    fn test_storage_utils() {
        let vectors = vec![
            Vector::new(vec![1.0, 2.0, 3.0]),
            Vector::new(vec![4.0, 5.0, 6.0]),
        ];

        let binary_data = StorageUtils::vectors_to_binary(&vectors).unwrap();
        let restored_vectors = StorageUtils::binary_to_vectors(&binary_data, 3).unwrap();

        assert_eq!(vectors.len(), restored_vectors.len());
        for (original, restored) in vectors.iter().zip(restored_vectors.iter()) {
            assert_eq!(original.as_f32(), restored.as_f32());
        }
    }

    /// End-to-end write/finalize/read round-trip through a temp file, using
    /// CompressionType::None (the only layout VectorReader understands).
    #[test]
    fn test_vector_writer_reader() -> Result<()> {
        let temp_file = NamedTempFile::new()?;
        let file_path = temp_file.path();

        // Write three 4-dimensional vectors and finalize the header.
        {
            let config = StorageConfig {
                compression: CompressionType::None,
                ..Default::default()
            };
            let mut writer = VectorWriter::new(file_path, config)?;

            let vectors = vec![
                Vector::new(vec![1.0, 2.0, 3.0, 4.0]),
                Vector::new(vec![5.0, 6.0, 7.0, 8.0]),
                Vector::new(vec![9.0, 10.0, 11.0, 12.0]),
            ];

            writer.write_vectors(&vectors)?;
            writer.finalize()?;
        }

        // Read everything back and compare metadata and contents.
        {
            let mut reader = VectorReader::open(file_path)?;
            let metadata = reader.metadata();

            assert_eq!(metadata.vector_count, 3);
            assert_eq!(metadata.dimensions, 4);

            let vectors = reader.read_all()?;
            assert_eq!(vectors.len(), 3);

            assert_eq!(vectors[0].as_f32(), &[1.0, 2.0, 3.0, 4.0]);
            assert_eq!(vectors[1].as_f32(), &[5.0, 6.0, 7.0, 8.0]);
            assert_eq!(vectors[2].as_f32(), &[9.0, 10.0, 11.0, 12.0]);
        }

        Ok(())
    }

    /// The (simulated) benchmark reports one entry per algorithm and a Zstd
    /// size smaller than the uncompressed size.
    #[test]
    fn test_compression_benchmark() {
        let vectors = vec![
            Vector::new(vec![1.0; 128]),
            Vector::new(vec![2.0; 128]),
            Vector::new(vec![3.0; 128]),
        ];

        let results = StorageUtils::benchmark_compression(&vectors).unwrap();
        // Five algorithms are benchmarked (VectorQuantization is excluded).
        assert_eq!(results.len(), 5); let none_size = results
            .iter()
            .find(|(t, _, _)| *t == CompressionType::None)
            .unwrap()
            .2;
        let zstd_size = results
            .iter()
            .find(|(t, _, _)| *t == CompressionType::Zstd)
            .unwrap()
            .2;

        assert!(zstd_size < none_size);
    }
}