1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
use std::collections::HashMap;
use std::io::{Read, Seek, SeekFrom};
use std::sync::Arc;
use crate::error::{LaurusError, Result};
use crate::storage::Storage;
use crate::vector::core::vector::Vector;
/// Storage for vectors, either fully resident in memory or read from a
/// storage backend on demand.
///
/// Both variants are keyed by a `(doc_id, field_name)` tuple.
///
/// # Thread Safety
///
/// - The `Owned` variant keeps its map behind an `Arc`, so cloning a
/// `VectorStorage` shares the same map; the methods defined on this type
/// only read from it.
/// - The `OnDemand` variant keeps a handle to the underlying [`Storage`]
/// together with the file name, and each call to [`get`](Self::get) opens
/// its own input through [`Storage::open_input`]. This type itself holds no
/// lock around reads.
///
/// NOTE(review): whether concurrent use is actually sound depends on the
/// `Storage` implementation and on `Vector` being `Send + Sync` — confirm
/// against those definitions.
#[derive(Debug, Clone)]
pub enum VectorStorage {
/// All vectors are loaded into memory.
Owned(Arc<HashMap<(u64, String), Vector>>),
/// Vectors are read from the storage backend on demand.
///
/// Each [`get`](Self::get) call opens a fresh input via
/// [`Storage::open_input`], seeks to the offset recorded for the key, and
/// reads that one record. The input is a local of the call and is dropped
/// when the call returns.
///
/// NOTE(review): the cost of opening an input per lookup depends on the
/// backend (e.g. mmap vs. plain file) — measure before relying on it in a
/// hot path.
OnDemand {
/// Shared handle to the storage backend that holds the vector file.
storage: Arc<dyn Storage>,
/// Name of the vector index file within the storage.
file_name: String,
/// Pre-built mapping from `(doc_id, field_name)` to the byte offset at
/// which that vector's record starts in the file (the position of the
/// record's `doc_id` header, which `get` seeks to before reading).
offsets: Arc<HashMap<(u64, String), u64>>,
},
}
impl VectorStorage {
    /// Returns all keys stored in this vector storage.
    ///
    /// The keys are cloned into a new `Vec`. Their order follows the
    /// iteration order of the underlying `HashMap` and is unspecified.
    pub fn keys(&self) -> Vec<(u64, String)> {
        match self {
            VectorStorage::Owned(map) => map.keys().cloned().collect(),
            VectorStorage::OnDemand { offsets, .. } => offsets.keys().cloned().collect(),
        }
    }

    /// Returns the number of vectors stored.
    ///
    /// For the `OnDemand` variant this is the number of recorded offsets; the
    /// file itself is not consulted.
    pub fn len(&self) -> usize {
        match self {
            VectorStorage::Owned(map) => map.len(),
            VectorStorage::OnDemand { offsets, .. } => offsets.len(),
        }
    }

    /// Returns `true` if no vectors are stored.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Returns `true` if a vector with the given key exists.
    ///
    /// For the `OnDemand` variant only the offset table is checked; no I/O is
    /// performed.
    ///
    /// # Arguments
    ///
    /// * `key` - A `(doc_id, field_name)` tuple identifying the vector.
    pub fn contains_key(&self, key: &(u64, String)) -> bool {
        match self {
            VectorStorage::Owned(map) => map.contains_key(key),
            VectorStorage::OnDemand { offsets, .. } => offsets.contains_key(key),
        }
    }

    /// Retrieves a vector by its key.
    ///
    /// For the `Owned` variant the stored [`Vector`] is cloned out of the map
    /// (the cost is whatever `Vector::clone` costs) and `dimension` is not
    /// used. For the `OnDemand` variant a fresh input is opened, positioned at
    /// the recorded offset, and one record is decoded.
    ///
    /// The on-disk record layout read here is, in order:
    ///
    /// 1. `doc_id` — 8 bytes (skipped, not validated),
    /// 2. field-name length — `u32`, little-endian,
    /// 3. field name — that many bytes (skipped, not validated),
    /// 4. `dimension` values — `f32`, little-endian, 4 bytes each.
    ///
    /// # Arguments
    ///
    /// * `key` - A `(doc_id, field_name)` tuple identifying the vector.
    /// * `dimension` - The number of `f32` components to read for the vector.
    ///   The record does not carry its own dimension, so the caller must
    ///   supply the value the file was written with.
    ///
    /// # Returns
    ///
    /// `Ok(Some(vector))` if the key exists, `Ok(None)` otherwise.
    ///
    /// # Errors
    ///
    /// Returns [`LaurusError`] if the vector file cannot be opened, if seeking
    /// or reading fails (including a record truncated before `dimension`
    /// values are available), or if `dimension` is so large that its byte
    /// size does not fit in `usize`.
    pub fn get(&self, key: &(u64, String), dimension: usize) -> Result<Option<Vector>> {
        // Fixed-size record header: doc_id (8 bytes) + field-name length (4 bytes).
        const HEADER_BYTES: usize = 8 + 4;
        // Encoded width of one vector component.
        const F32_BYTES: usize = 4;

        match self {
            VectorStorage::Owned(map) => Ok(map.get(key).cloned()),
            VectorStorage::OnDemand {
                storage,
                file_name,
                offsets,
            } => {
                let offset = match offsets.get(key) {
                    Some(&offset) => offset,
                    None => return Ok(None),
                };

                let mut input = storage.open_input(file_name).map_err(|e| {
                    LaurusError::internal(format!("Failed to open vector file: {e}"))
                })?;
                input
                    .seek(SeekFrom::Start(offset))
                    .map_err(LaurusError::Io)?;

                // Read the whole fixed-size header in one call. Only the
                // trailing length field is needed; the doc_id is skipped.
                let mut header = [0u8; HEADER_BYTES];
                input.read_exact(&mut header)?;
                let field_name_len =
                    u64::from(u32::from_le_bytes([header[8], header[9], header[10], header[11]]));

                // Skip the field name by streaming it into a sink. The length
                // comes from the file, so it must not size an allocation: a
                // corrupt record could otherwise request up to 4 GiB.
                let skipped = std::io::copy(
                    &mut Read::take(&mut input, field_name_len),
                    &mut std::io::sink(),
                )?;
                if skipped != field_name_len {
                    // `copy` stops quietly at end-of-file; report the short
                    // read the same way `read_exact` would.
                    return Err(std::io::Error::new(
                        std::io::ErrorKind::UnexpectedEof,
                        "vector record truncated inside field name",
                    )
                    .into());
                }

                // Fetch the whole payload with a single read instead of one
                // read call per component, then decode it in memory.
                let payload_len = dimension.checked_mul(F32_BYTES).ok_or_else(|| {
                    LaurusError::internal(format!(
                        "Vector dimension {dimension} is too large to read"
                    ))
                })?;
                let mut payload = vec![0u8; payload_len];
                input.read_exact(&mut payload)?;

                let values: Vec<f32> = payload
                    .chunks_exact(F32_BYTES)
                    .map(|chunk| f32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]))
                    .collect();

                Ok(Some(Vector::new(values)))
            }
        }
    }
}