hermes_core/structures/postings/sparse/config.rs
//! Configuration types for sparse vector posting lists

use serde::{Deserialize, Serialize};

/// Size of the index (term/dimension ID) in sparse vectors
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
#[repr(u8)]
pub enum IndexSize {
    /// 16-bit index (0-65535), ideal for SPLADE vocabularies
    U16 = 0,
    /// 32-bit index (0 to ~4.3 billion), for large vocabularies
    #[default]
    U32 = 1,
}

impl IndexSize {
    /// Bytes per index
    pub fn bytes(&self) -> usize {
        match self {
            IndexSize::U16 => 2,
            IndexSize::U32 => 4,
        }
    }

    /// Maximum value representable
    pub fn max_value(&self) -> u32 {
        match self {
            IndexSize::U16 => u16::MAX as u32,
            IndexSize::U32 => u32::MAX,
        }
    }

    pub(crate) fn from_u8(v: u8) -> Option<Self> {
        match v {
            0 => Some(IndexSize::U16),
            1 => Some(IndexSize::U32),
            _ => None,
        }
    }
}

/// Quantization format for sparse vector weights
///
/// Research-validated compression/effectiveness trade-offs (Pati, 2025):
/// - **UInt8**: 4x compression, ~1-2% nDCG@10 loss (RECOMMENDED for production)
/// - **Float16**: 2x compression, <1% nDCG@10 loss
/// - **Float32**: no compression, baseline effectiveness
/// - **UInt4**: 8x compression, ~3-5% nDCG@10 loss (experimental)
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
#[repr(u8)]
pub enum WeightQuantization {
    /// Full 32-bit float precision
    #[default]
    Float32 = 0,
    /// 16-bit float (half precision): 2x compression, <1% effectiveness loss
    Float16 = 1,
    /// 8-bit unsigned integer with scale factor: 4x compression, ~1-2% effectiveness loss (RECOMMENDED)
    UInt8 = 2,
    /// 4-bit unsigned integer with scale factor (packed, 2 per byte): 8x compression, ~3-5% effectiveness loss
    UInt4 = 3,
}

impl WeightQuantization {
    /// Bytes per weight (fractional for UInt4, which packs two weights per byte)
    pub fn bytes_per_weight(&self) -> f32 {
        match self {
            WeightQuantization::Float32 => 4.0,
            WeightQuantization::Float16 => 2.0,
            WeightQuantization::UInt8 => 1.0,
            WeightQuantization::UInt4 => 0.5,
        }
    }

    pub(crate) fn from_u8(v: u8) -> Option<Self> {
        match v {
            0 => Some(WeightQuantization::Float32),
            1 => Some(WeightQuantization::Float16),
            2 => Some(WeightQuantization::UInt8),
            3 => Some(WeightQuantization::UInt4),
            _ => None,
        }
    }
}
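
// A minimal sanity check (editor's sketch, not part of the original API
// surface) that the compression ratios quoted in the docs above follow
// directly from bytes_per_weight(), relative to the 4-byte Float32 baseline.
#[cfg(test)]
mod weight_quantization_sketch {
    use super::WeightQuantization;

    #[test]
    fn compression_ratios_match_docs() {
        let base = WeightQuantization::Float32.bytes_per_weight(); // 4.0 bytes
        assert_eq!(base / WeightQuantization::Float16.bytes_per_weight(), 2.0); // 2x
        assert_eq!(base / WeightQuantization::UInt8.bytes_per_weight(), 4.0); // 4x
        assert_eq!(base / WeightQuantization::UInt4.bytes_per_weight(), 8.0); // 8x
    }
}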

/// Query-time weighting strategy for sparse vector queries
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum QueryWeighting {
    /// All terms get weight 1.0
    #[default]
    One,
    /// Terms weighted by IDF (inverse document frequency) from global index statistics.
    /// Uses ln(N/df), where N = total docs and df = docs containing the dimension.
    Idf,
    /// Terms weighted by pre-computed IDF from the model's idf.json file,
    /// loaded from the HuggingFace model repo. No fallback to global stats.
    IdfFile,
}
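
// A worked example (editor's sketch) of the ln(N/df) weighting described
// on QueryWeighting::Idf: a dimension appearing in 1,000 of 1,000,000
// documents gets weight ln(1000) ≈ 6.908.
#[cfg(test)]
mod idf_weighting_sketch {
    #[test]
    fn idf_example() {
        let n = 1_000_000.0_f32; // total documents
        let df = 1_000.0_f32; // documents containing the dimension
        let idf = (n / df).ln();
        assert!((idf - 6.9078).abs() < 1e-3);
    }
}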

/// Query-time configuration for sparse vectors
///
/// Research-validated query optimization strategies:
/// - **max_query_dims (10-20)**: process only the top-k dimensions by weight
///   - 30-50% latency reduction with <2% nDCG loss (Qiao et al., 2023)
/// - **heap_factor (0.8)**: skip blocks with a low max score contribution
///   - ~20% speedup with minor recall loss (SEISMIC-style)
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct SparseQueryConfig {
    /// HuggingFace tokenizer path/name for query-time tokenization.
    /// Example: "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub tokenizer: Option<String>,
    /// Weighting strategy for tokenized query terms
    #[serde(default)]
    pub weighting: QueryWeighting,
    /// Heap factor for approximate search (SEISMIC-style optimization).
    /// A block is skipped if its max possible score < heap_factor * threshold.
    ///
    /// Research recommendation:
    /// - 1.0 = exact search (default)
    /// - 0.8 = approximate, ~20% faster with minor recall loss (RECOMMENDED for production)
    /// - 0.5 = very approximate, much faster but with higher recall loss
    #[serde(default = "default_heap_factor")]
    pub heap_factor: f32,
    /// Maximum number of query dimensions to process (query pruning).
    /// Processes only the top-k dimensions by weight.
    ///
    /// Research recommendation (multiple papers, 2022-2024):
    /// - None = process all dimensions (default, exact)
    /// - Some(10-20) = process only the top 10-20 dimensions (RECOMMENDED for SPLADE)
    ///   - 30-50% latency reduction
    ///   - <2% nDCG@10 loss
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub max_query_dims: Option<usize>,
}

fn default_heap_factor() -> f32 {
    1.0
}

impl Default for SparseQueryConfig {
    fn default() -> Self {
        Self {
            tokenizer: None,
            weighting: QueryWeighting::One,
            heap_factor: 1.0,
            max_query_dims: None,
        }
    }
}

/// Configuration for sparse vector storage
///
/// Research-validated optimizations for learned sparse retrieval (SPLADE, uniCOIL, etc.):
/// - **Weight threshold (0.01-0.05)**: removes ~30-50% of postings with minimal nDCG impact
/// - **Posting list pruning (0.1)**: keeps top 10% per dimension; 50-70% index reduction, <1% nDCG loss
/// - **Query pruning (top 10-20 dims)**: 30-50% latency reduction, <2% nDCG loss
/// - **UInt8 quantization**: 4x compression, 1-2% nDCG loss (optimal trade-off)
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct SparseVectorConfig {
    /// Size of dimension/term indices
    pub index_size: IndexSize,
    /// Quantization for weights (see WeightQuantization docs for trade-offs)
    pub weight_quantization: WeightQuantization,
    /// Minimum weight threshold: weights below this value are not indexed.
    ///
    /// Research recommendation (Guo et al., 2022; SPLADE v2):
    /// - 0.01-0.05 for SPLADE models removes ~30-50% of postings
    /// - Minimal impact on nDCG@10 (<1% loss)
    /// - Major reduction in index size and query latency
    #[serde(default)]
    pub weight_threshold: f32,
    /// Block size for posting lists (must be a power of 2; default 128 for SIMD).
    /// Larger blocks = better compression; smaller blocks = faster seeks.
    #[serde(default = "default_block_size")]
    pub block_size: usize,
    /// Static pruning: fraction of postings to keep per inverted list (SEISMIC-style).
    /// Lists are sorted by weight descending and truncated to the top fraction.
    ///
    /// Research recommendation (SPLADE v2, Formal et al., 2021):
    /// - None = keep all postings (default, exact)
    /// - Some(0.1) = keep top 10% of postings per dimension
    ///   - 50-70% index size reduction
    ///   - <1% nDCG@10 loss
    ///   - Exploits the "concentration of importance" in learned representations
    ///
    /// Applied only during the initial segment build, not during merges.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub posting_list_pruning: Option<f32>,
    /// Query-time configuration (tokenizer, weighting)
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub query_config: Option<SparseQueryConfig>,
}

fn default_block_size() -> usize {
    128
}

impl Default for SparseVectorConfig {
    fn default() -> Self {
        Self {
            index_size: IndexSize::U32,
            weight_quantization: WeightQuantization::Float32,
            weight_threshold: 0.0,
            block_size: 128,
            posting_list_pruning: None,
            query_config: None,
        }
    }
}

impl SparseVectorConfig {
    /// SPLADE-optimized config with research-validated defaults
    ///
    /// Optimized for SPLADE, uniCOIL, and similar learned sparse retrieval models.
    /// Based on research findings from:
    /// - Pati (2025): UInt8 quantization = 4x compression, 1-2% nDCG loss
    /// - Formal et al. (2021): SPLADE v2 posting list pruning
    /// - Qiao et al. (2023): query dimension pruning and approximate search
    /// - Guo et al. (2022): weight thresholding for efficiency
    ///
    /// Expected performance vs. the full-precision baseline:
    /// - Index size: ~15-25% of original (combined effect of all optimizations)
    /// - Query latency: 40-60% faster
    /// - Effectiveness: 2-4% nDCG@10 loss (typically acceptable for production)
    ///
    /// Vocabulary: ~30K dimensions (fits in u16)
    pub fn splade() -> Self {
        Self {
            index_size: IndexSize::U16,
            weight_quantization: WeightQuantization::UInt8,
            weight_threshold: 0.01, // Remove ~30-50% of low-weight postings
            block_size: 128,
            posting_list_pruning: Some(0.1), // Keep top 10% per dimension
            query_config: Some(SparseQueryConfig {
                tokenizer: None,
                weighting: QueryWeighting::One,
                heap_factor: 0.8,         // ~20% faster approximate search
                max_query_dims: Some(20), // Process top 20 query dimensions
            }),
        }
    }

    /// Compact config: maximum compression (experimental)
    ///
    /// Uses aggressive UInt4 quantization for the smallest possible index size.
    /// Expected trade-offs:
    /// - Index size: ~10-15% of the Float32 baseline
    /// - Effectiveness: ~3-5% nDCG@10 loss
    ///
    /// Recommended for: memory-constrained environments, cache-heavy workloads
    pub fn compact() -> Self {
        Self {
            index_size: IndexSize::U16,
            weight_quantization: WeightQuantization::UInt4,
            weight_threshold: 0.02, // Slightly higher threshold for UInt4
            block_size: 128,
            posting_list_pruning: Some(0.15), // Keep top 15% per dimension
            query_config: Some(SparseQueryConfig {
                tokenizer: None,
                weighting: QueryWeighting::One,
                heap_factor: 0.7,         // More aggressive approximate search
                max_query_dims: Some(15), // Fewer query dimensions
            }),
        }
    }

    /// Full-precision config: no compression, baseline effectiveness
    ///
    /// Use for: research baselines, or when effectiveness is critical
    pub fn full_precision() -> Self {
        Self {
            index_size: IndexSize::U32,
            weight_quantization: WeightQuantization::Float32,
            weight_threshold: 0.0,
            block_size: 128,
            posting_list_pruning: None,
            query_config: None,
        }
    }

    /// Conservative config: mild optimizations, minimal effectiveness loss
    ///
    /// Balances compression and effectiveness with conservative defaults.
    /// Expected trade-offs:
    /// - Index size: ~40-50% of the Float32 baseline
    /// - Query latency: ~20-30% faster
    /// - Effectiveness: <1% nDCG@10 loss
    ///
    /// Recommended for: production deployments prioritizing effectiveness
    pub fn conservative() -> Self {
        Self {
            index_size: IndexSize::U32,
            weight_quantization: WeightQuantization::Float16,
            weight_threshold: 0.005, // Minimal pruning
            block_size: 128,
            posting_list_pruning: None, // No posting list pruning
            query_config: Some(SparseQueryConfig {
                tokenizer: None,
                weighting: QueryWeighting::One,
                heap_factor: 0.9,         // Nearly exact search
                max_query_dims: Some(50), // Process more dimensions
            }),
        }
    }

    /// Set the weight threshold (builder pattern)
    pub fn with_weight_threshold(mut self, threshold: f32) -> Self {
        self.weight_threshold = threshold;
        self
    }

    /// Set the posting list pruning fraction (builder pattern),
    /// e.g. 0.1 = keep top 10% of postings per dimension
    pub fn with_pruning(mut self, fraction: f32) -> Self {
        self.posting_list_pruning = Some(fraction.clamp(0.0, 1.0));
        self
    }

    /// Bytes per entry (index + weight)
    pub fn bytes_per_entry(&self) -> f32 {
        self.index_size.bytes() as f32 + self.weight_quantization.bytes_per_weight()
    }

    /// Serialize the config to a single byte: index_size in the high nibble,
    /// weight_quantization in the low nibble
    pub fn to_byte(&self) -> u8 {
        ((self.index_size as u8) << 4) | (self.weight_quantization as u8)
    }

    /// Deserialize the config from a single byte.
    /// Note: weight_threshold, block_size, posting_list_pruning, and query_config
    /// are not encoded in the byte; they revert to their defaults.
    pub fn from_byte(b: u8) -> Option<Self> {
        let index_size = IndexSize::from_u8(b >> 4)?;
        let weight_quantization = WeightQuantization::from_u8(b & 0x0F)?;
        Some(Self {
            index_size,
            weight_quantization,
            weight_threshold: 0.0,
            block_size: 128,
            posting_list_pruning: None,
            query_config: None,
        })
    }

    /// Set the block size (builder pattern).
    /// Must be a power of 2 (other values are rounded up); recommended: 64, 128, 256
    pub fn with_block_size(mut self, size: usize) -> Self {
        self.block_size = size.next_power_of_two();
        self
    }

    /// Set the query configuration (builder pattern)
    pub fn with_query_config(mut self, config: SparseQueryConfig) -> Self {
        self.query_config = Some(config);
        self
    }
}
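
// Illustrative round-trip checks (editor's sketch) for the presets and the
// single-byte encoding above; the expected values follow directly from the
// definitions in this file.
#[cfg(test)]
mod config_sketch {
    use super::*;

    #[test]
    fn splade_entry_costs_three_bytes() {
        // U16 index (2 bytes) + UInt8 weight (1 byte).
        assert_eq!(SparseVectorConfig::splade().bytes_per_entry(), 3.0);
        // Full precision: U32 index (4 bytes) + Float32 weight (4 bytes).
        assert_eq!(SparseVectorConfig::full_precision().bytes_per_entry(), 8.0);
    }

    #[test]
    fn byte_encoding_round_trips_size_and_quantization() {
        let cfg = SparseVectorConfig::splade();
        let restored = SparseVectorConfig::from_byte(cfg.to_byte()).unwrap();
        assert_eq!(restored.index_size, IndexSize::U16);
        assert_eq!(restored.weight_quantization, WeightQuantization::UInt8);
        // Everything outside the packed byte reverts to defaults.
        assert_eq!(restored.block_size, 128);
        assert!(restored.posting_list_pruning.is_none());
    }
}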

/// A sparse vector entry: (dimension_id, weight)
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct SparseEntry {
    pub dim_id: u32,
    pub weight: f32,
}

/// Sparse vector representation
#[derive(Debug, Clone, Default)]
pub struct SparseVector {
    pub(super) entries: Vec<SparseEntry>,
}

impl SparseVector {
    /// Create a new sparse vector
    pub fn new() -> Self {
        Self {
            entries: Vec::new(),
        }
    }

    /// Create with pre-allocated capacity
    pub fn with_capacity(capacity: usize) -> Self {
        Self {
            entries: Vec::with_capacity(capacity),
        }
    }

    /// Create from parallel slices of dimension IDs and weights.
    /// Panics if the slices differ in length.
    pub fn from_entries(dim_ids: &[u32], weights: &[f32]) -> Self {
        assert_eq!(dim_ids.len(), weights.len());
        let mut entries: Vec<SparseEntry> = dim_ids
            .iter()
            .zip(weights.iter())
            .map(|(&dim_id, &weight)| SparseEntry { dim_id, weight })
            .collect();
        // Sort by dimension ID for efficient intersection
        entries.sort_by_key(|e| e.dim_id);
        Self { entries }
    }

    /// Add an entry (must maintain sorted order by dim_id)
    pub fn push(&mut self, dim_id: u32, weight: f32) {
        debug_assert!(
            self.entries.is_empty() || self.entries.last().unwrap().dim_id < dim_id,
            "Entries must be added in sorted order by dim_id"
        );
        self.entries.push(SparseEntry { dim_id, weight });
    }

    /// Number of non-zero entries
    pub fn len(&self) -> usize {
        self.entries.len()
    }

    /// Check if empty
    pub fn is_empty(&self) -> bool {
        self.entries.is_empty()
    }

    /// Iterate over entries
    pub fn iter(&self) -> impl Iterator<Item = &SparseEntry> {
        self.entries.iter()
    }

    /// Sort by dimension ID (required for posting list encoding)
    pub fn sort_by_dim(&mut self) {
        self.entries.sort_by_key(|e| e.dim_id);
    }

    /// Sort by weight descending
    pub fn sort_by_weight_desc(&mut self) {
        self.entries.sort_by(|a, b| {
            b.weight
                .partial_cmp(&a.weight)
                .unwrap_or(std::cmp::Ordering::Equal)
        });
    }

    /// Get the top-k entries by weight
    pub fn top_k(&self, k: usize) -> Vec<SparseEntry> {
        let mut sorted = self.entries.clone();
        sorted.sort_by(|a, b| {
            b.weight
                .partial_cmp(&a.weight)
                .unwrap_or(std::cmp::Ordering::Equal)
        });
        sorted.truncate(k);
        sorted
    }

    /// Compute the dot product with another sparse vector via a linear
    /// merge-style intersection. Both vectors must be sorted by dim_id.
    pub fn dot(&self, other: &SparseVector) -> f32 {
        let mut result = 0.0f32;
        let mut i = 0;
        let mut j = 0;

        while i < self.entries.len() && j < other.entries.len() {
            let a = &self.entries[i];
            let b = &other.entries[j];

            match a.dim_id.cmp(&b.dim_id) {
                std::cmp::Ordering::Less => i += 1,
                std::cmp::Ordering::Greater => j += 1,
                std::cmp::Ordering::Equal => {
                    result += a.weight * b.weight;
                    i += 1;
                    j += 1;
                }
            }
        }

        result
    }

    /// L2 norm squared
    pub fn norm_squared(&self) -> f32 {
        self.entries.iter().map(|e| e.weight * e.weight).sum()
    }

    /// L2 norm
    pub fn norm(&self) -> f32 {
        self.norm_squared().sqrt()
    }

    /// Prune dimensions whose absolute weight is below the threshold
    pub fn filter_by_weight(&self, min_weight: f32) -> Self {
        let entries: Vec<SparseEntry> = self
            .entries
            .iter()
            .filter(|e| e.weight.abs() >= min_weight)
            .copied()
            .collect();
        Self { entries }
    }
}

impl From<Vec<(u32, f32)>> for SparseVector {
    fn from(pairs: Vec<(u32, f32)>) -> Self {
        let mut entries: Vec<SparseEntry> = pairs
            .into_iter()
            .map(|(dim_id, weight)| SparseEntry { dim_id, weight })
            .collect();
        // Sort by dimension ID, matching the invariant established by
        // from_entries() (dot() and posting list encoding rely on it).
        entries.sort_by_key(|e| e.dim_id);
        Self { entries }
    }
}

impl From<SparseVector> for Vec<(u32, f32)> {
    fn from(vec: SparseVector) -> Self {
        vec.entries
            .into_iter()
            .map(|e| (e.dim_id, e.weight))
            .collect()
    }
}
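
// A small usage sketch (editor's addition) exercising the SparseVector
// operations defined above: construction from unsorted input, the sorted
// merge in dot(), and threshold-based filtering.
#[cfg(test)]
mod sparse_vector_sketch {
    use super::*;

    #[test]
    fn dot_product_over_shared_dimensions() {
        // from_entries sorts by dim_id, so unsorted input is fine.
        let a = SparseVector::from_entries(&[7, 2, 5], &[0.5, 1.0, 2.0]);
        let b = SparseVector::from_entries(&[5, 9, 2], &[3.0, 4.0, 0.5]);
        // Shared dims: 2 (1.0 * 0.5) and 5 (2.0 * 3.0) => 0.5 + 6.0.
        assert_eq!(a.dot(&b), 6.5);
    }

    #[test]
    fn filter_by_weight_drops_small_magnitudes() {
        let v = SparseVector::from_entries(&[1, 2, 3], &[0.005, -0.5, 0.02]);
        let pruned = v.filter_by_weight(0.01);
        // 0.005 falls below the threshold; -0.5 and 0.02 survive (abs compare).
        assert_eq!(pruned.len(), 2);
    }
}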