// hermes_core/structures/postings/sparse/config.rs
1//! Configuration types for sparse vector posting lists
2
3use serde::{Deserialize, Serialize};
4
/// Sparse vector index format
///
/// Determines the on-disk layout and query execution strategy:
/// - **MaxScore**: Per-dimension variable-size blocks (DAAT — document-at-a-time).
///   Default, optimal for general sparse retrieval with block-max pruning.
/// - **Bmp**: Fixed doc_id range blocks (BAAT — block-at-a-time).
///   Based on Mallia, Suel & Tonellotto (SIGIR 2024). Divides the document
///   space into fixed-size blocks and processes them in decreasing upper-bound
///   order, enabling aggressive early termination.
///
/// Persisted in the config byte (bit 3) — see `SparseVectorConfig::to_byte`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
pub enum SparseFormat {
    /// Per-dimension variable-size blocks (existing format, DAAT MaxScore)
    #[default]
    MaxScore,
    /// Fixed doc_id range blocks (BMP, BAAT block-at-a-time)
    Bmp,
}
22
23impl SparseFormat {
24 fn is_default(&self) -> bool {
25 *self == Self::MaxScore
26 }
27}
28
/// Size of the index (term/dimension ID) in sparse vectors
///
/// Stored on disk as the `#[repr(u8)]` discriminant (see `from_u8`).
/// Choose `U16` when the model vocabulary fits in 65536 dimensions to
/// halve the per-entry index storage.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
#[repr(u8)]
pub enum IndexSize {
    /// 16-bit index (0-65535), ideal for SPLADE vocabularies
    U16 = 0,
    /// 32-bit index (0-4B), for large vocabularies
    #[default]
    U32 = 1,
}
39
40impl IndexSize {
41 /// Bytes per index
42 pub fn bytes(&self) -> usize {
43 match self {
44 IndexSize::U16 => 2,
45 IndexSize::U32 => 4,
46 }
47 }
48
49 /// Maximum value representable
50 pub fn max_value(&self) -> u32 {
51 match self {
52 IndexSize::U16 => u16::MAX as u32,
53 IndexSize::U32 => u32::MAX,
54 }
55 }
56
57 pub(crate) fn from_u8(v: u8) -> Option<Self> {
58 match v {
59 0 => Some(IndexSize::U16),
60 1 => Some(IndexSize::U32),
61 _ => None,
62 }
63 }
64}
65
/// Quantization format for sparse vector weights
///
/// Research-validated compression/effectiveness trade-offs (Pati, 2025):
/// - **UInt8**: 4x compression, ~1-2% nDCG@10 loss (RECOMMENDED for production)
/// - **Float16**: 2x compression, <1% nDCG@10 loss
/// - **Float32**: No compression, baseline effectiveness
/// - **UInt4**: 8x compression, ~3-5% nDCG@10 loss (experimental)
///
/// Stored on disk as the `#[repr(u8)]` discriminant (see `from_u8`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
#[repr(u8)]
pub enum WeightQuantization {
    /// Full 32-bit float precision
    #[default]
    Float32 = 0,
    /// 16-bit float (half precision) - 2x compression, <1% effectiveness loss
    Float16 = 1,
    /// 8-bit unsigned integer with scale factor - 4x compression, ~1-2% effectiveness loss (RECOMMENDED)
    UInt8 = 2,
    /// 4-bit unsigned integer with scale factor (packed, 2 per byte) - 8x compression, ~3-5% effectiveness loss
    UInt4 = 3,
}
86
87impl WeightQuantization {
88 /// Bytes per weight (approximate for UInt4)
89 pub fn bytes_per_weight(&self) -> f32 {
90 match self {
91 WeightQuantization::Float32 => 4.0,
92 WeightQuantization::Float16 => 2.0,
93 WeightQuantization::UInt8 => 1.0,
94 WeightQuantization::UInt4 => 0.5,
95 }
96 }
97
98 pub(crate) fn from_u8(v: u8) -> Option<Self> {
99 match v {
100 0 => Some(WeightQuantization::Float32),
101 1 => Some(WeightQuantization::Float16),
102 2 => Some(WeightQuantization::UInt8),
103 3 => Some(WeightQuantization::UInt4),
104 _ => None,
105 }
106 }
107}
108
/// Query-time weighting strategy for sparse vector queries
///
/// Serialized in snake_case (`one`, `idf`, `idf_file`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum QueryWeighting {
    /// All terms get weight 1.0
    #[default]
    One,
    /// Terms weighted by IDF (inverse document frequency) from global index statistics
    /// Uses ln(N/df) where N = total docs, df = docs containing dimension
    Idf,
    /// Terms weighted by pre-computed IDF from model's idf.json file
    /// Loaded from HuggingFace model repo. No fallback to global stats.
    IdfFile,
}
123
/// Query-time configuration for sparse vectors
///
/// Research-validated query optimization strategies:
/// - **weight_threshold (0.01-0.05)**: Drop query dimensions with weight below threshold
///   - Filters low-IDF tokens that add latency without improving relevance
/// - **max_query_dims (10-20)**: Process only top-k dimensions by weight
///   - 30-50% latency reduction with <2% nDCG loss (Qiao et al., 2023)
/// - **heap_factor (0.8)**: Skip blocks with low max score contribution
///   - ~20% speedup with minor recall loss (SEISMIC-style)
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct SparseQueryConfig {
    /// HuggingFace tokenizer path/name for query-time tokenization
    /// Example: "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub tokenizer: Option<String>,
    /// Weighting strategy for tokenized query terms
    #[serde(default)]
    pub weighting: QueryWeighting,
    /// Heap factor for approximate search (SEISMIC-style optimization)
    /// A block is skipped if its max possible score < heap_factor * threshold
    ///
    /// Research recommendation:
    /// - 1.0 = exact search (default)
    /// - 0.8 = approximate, ~20% faster with minor recall loss (RECOMMENDED for production)
    /// - 0.5 = very approximate, much faster but higher recall loss
    #[serde(default = "default_heap_factor")]
    pub heap_factor: f32,
    /// Minimum weight for query dimensions (query-time pruning)
    /// Dimensions with abs(weight) below this threshold are dropped before search.
    /// Useful for filtering low-IDF tokens that add latency without improving relevance.
    ///
    /// - 0.0 = no filtering (default)
    /// - 0.01-0.05 = recommended for SPLADE/learned sparse models
    #[serde(default)]
    pub weight_threshold: f32,
    /// Maximum number of query dimensions to process (query pruning)
    /// Processes only the top-k dimensions by weight
    ///
    /// Research recommendation (Multiple papers 2022-2024):
    /// - None = process all dimensions (default, exact)
    /// - Some(10-20) = process top 10-20 dimensions only (RECOMMENDED for SPLADE)
    ///   - 30-50% latency reduction
    ///   - <2% nDCG@10 loss
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub max_query_dims: Option<usize>,
    /// Fraction of query dimensions to keep (0.0-1.0), same semantics as
    /// indexing-time `pruning`: sort by abs(weight) descending,
    /// keep top fraction. None or 1.0 = no pruning.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub pruning: Option<f32>,
    /// Minimum number of query dimensions before pruning and weight_threshold
    /// filtering are applied. Protects short queries from losing most signal.
    ///
    /// Default: 4 (shared with indexing-time `min_terms`).
    /// Set to 0 to always apply pruning/filtering.
    #[serde(default = "default_min_terms")]
    pub min_query_dims: usize,
    /// Maximum number of superblocks to visit (LSP/0 gamma cap).
    /// 0 = unlimited (default). Only applies to BMP format.
    #[serde(default)]
    pub max_superblocks: usize,
}
185
/// Serde default for `SparseQueryConfig::heap_factor`: 1.0 = exact search.
fn default_heap_factor() -> f32 {
    1.0
}
189
190impl Default for SparseQueryConfig {
191 fn default() -> Self {
192 Self {
193 tokenizer: None,
194 weighting: QueryWeighting::One,
195 heap_factor: 1.0,
196 weight_threshold: 0.0,
197 max_query_dims: None,
198 pruning: None,
199 min_query_dims: 4,
200 max_superblocks: 0,
201 }
202 }
203}
204
/// Configuration for sparse vector storage
///
/// Research-validated optimizations for learned sparse retrieval (SPLADE, uniCOIL, etc.):
/// - **Weight threshold (0.01-0.05)**: Removes ~30-50% of postings with minimal nDCG impact
/// - **Posting list pruning (0.1)**: Keeps top 10% per dimension, 50-70% index reduction, <1% nDCG loss
/// - **Query pruning (top 10-20 dims)**: 30-50% latency reduction, <2% nDCG loss
/// - **UInt8 quantization**: 4x compression, 1-2% nDCG loss (optimal trade-off)
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct SparseVectorConfig {
    /// Index format: MaxScore (DAAT) or BMP (BAAT)
    #[serde(default, skip_serializing_if = "SparseFormat::is_default")]
    pub format: SparseFormat,
    /// Size of dimension/term indices
    pub index_size: IndexSize,
    /// Quantization for weights (see WeightQuantization docs for trade-offs)
    pub weight_quantization: WeightQuantization,
    /// Minimum weight threshold - weights below this value are not indexed
    ///
    /// Research recommendation (Guo et al., 2022; SPLADE v2):
    /// - 0.01-0.05 for SPLADE models removes ~30-50% of postings
    /// - Minimal impact on nDCG@10 (<1% loss)
    /// - Major reduction in index size and query latency
    #[serde(default)]
    pub weight_threshold: f32,
    /// Block size for posting lists (must be power of 2, default 128 for SIMD)
    /// Larger blocks = better compression, smaller blocks = faster seeks.
    /// Used by MaxScore format only.
    #[serde(default = "default_block_size")]
    pub block_size: usize,
    /// BMP block size: number of consecutive doc_ids per block (must be power of 2).
    /// Default 64. Only used when format = Bmp.
    /// Smaller = better pruning granularity, larger = less overhead.
    #[serde(default = "default_bmp_block_size")]
    pub bmp_block_size: u32,
    /// Maximum BMP grid memory in bytes. If the grid (num_dims × num_blocks)
    /// would exceed this, bmp_block_size is automatically increased to cap memory.
    /// Default: 0 = cap disabled (see `default_max_bmp_grid_bytes`).
    #[serde(default = "default_max_bmp_grid_bytes")]
    pub max_bmp_grid_bytes: u64,
    /// BMP superblock size: number of consecutive blocks grouped for hierarchical
    /// pruning (Carlson et al., SIGIR 2025). Must be power of 2.
    /// Default 64. Set to 0 to disable superblock pruning (flat BMP scoring).
    /// Only used when format = Bmp.
    #[serde(default = "default_bmp_superblock_size")]
    pub bmp_superblock_size: u32,
    /// Static pruning: fraction of postings to keep per inverted list (SEISMIC-style)
    /// Lists are sorted by weight descending and truncated to top fraction.
    ///
    /// Research recommendation (SPLADE v2, Formal et al., 2021):
    /// - None = keep all postings (default, exact)
    /// - Some(0.1) = keep top 10% of postings per dimension
    ///   - 50-70% index size reduction
    ///   - <1% nDCG@10 loss
    ///   - Exploits "concentration of importance" in learned representations
    ///
    /// Applied only during initial segment build, not during merge.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub pruning: Option<f32>,
    /// Query-time configuration (tokenizer, weighting)
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub query_config: Option<SparseQueryConfig>,
    /// Fixed vocabulary size (number of dimensions) for BMP format.
    ///
    /// When set, all BMP segments use the same grid dimensions (rows = dims),
    /// enabling zero-copy block-copy merge. The grid is indexed by dim_id directly
    /// (no dim_ids Section C needed).
    ///
    /// Required for BMP format. Typical values:
    /// - SPLADE/BERT: 30522 or 105879 (WordPiece / Unigram vocabulary)
    /// - uniCOIL: 30522
    /// - Custom models: set to vocabulary size
    ///
    /// If None, BMP builder derives dims from observed data (V10 behavior).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub dims: Option<u32>,
    /// Fixed max weight scale for BMP format.
    ///
    /// When set, all BMP segments use the same quantization scale
    /// (`max_weight_scale = max_weight`), eliminating rescaling during merge.
    ///
    /// For SPLADE models: 5.0 (covers typical weight range 0-5).
    /// If None, BMP builder derives scale from data (V10 behavior).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub max_weight: Option<f32>,
    /// Minimum number of postings in a dimension before pruning and
    /// weight_threshold filtering are applied. Protects dimensions with
    /// very few postings from losing most of their signal.
    ///
    /// Default: 4. Set to 0 to always apply pruning/filtering.
    #[serde(default = "default_min_terms")]
    pub min_terms: usize,
}
297
/// Serde default for `SparseVectorConfig::block_size` (SIMD-friendly).
fn default_block_size() -> usize {
    128
}
301
/// Serde default for `SparseVectorConfig::bmp_block_size`.
fn default_bmp_block_size() -> u32 {
    64
}
305
/// Serde default for `SparseVectorConfig::max_bmp_grid_bytes`: 0 = no cap.
fn default_max_bmp_grid_bytes() -> u64 {
    0 // disabled by default — masks eliminate DRAM stalls during scoring
}
309
/// Serde default for `SparseVectorConfig::bmp_superblock_size`.
fn default_bmp_superblock_size() -> u32 {
    64
}
313
/// Serde default shared by `SparseVectorConfig::min_terms` and
/// `SparseQueryConfig::min_query_dims`.
fn default_min_terms() -> usize {
    4
}
317
318impl Default for SparseVectorConfig {
319 fn default() -> Self {
320 Self {
321 format: SparseFormat::MaxScore,
322 index_size: IndexSize::U32,
323 weight_quantization: WeightQuantization::Float32,
324 weight_threshold: 0.0,
325 block_size: 128,
326 bmp_block_size: 64,
327 max_bmp_grid_bytes: 0,
328 bmp_superblock_size: 64,
329 pruning: None,
330 query_config: None,
331
332 dims: None,
333 max_weight: None,
334 min_terms: 4,
335 }
336 }
337}
338
339impl SparseVectorConfig {
340 /// SPLADE-optimized config with research-validated defaults
341 ///
342 /// Optimized for SPLADE, uniCOIL, and similar learned sparse retrieval models.
343 /// Based on research findings from:
344 /// - Pati (2025): UInt8 quantization = 4x compression, 1-2% nDCG loss
345 /// - Formal et al. (2021): SPLADE v2 posting list pruning
346 /// - Qiao et al. (2023): Query dimension pruning and approximate search
347 /// - Guo et al. (2022): Weight thresholding for efficiency
348 ///
349 /// Expected performance vs. full precision baseline:
350 /// - Index size: ~15-25% of original (combined effect of all optimizations)
351 /// - Query latency: 40-60% faster
352 /// - Effectiveness: 2-4% nDCG@10 loss (typically acceptable for production)
353 ///
354 /// Vocabulary: ~30K dimensions (fits in u16)
355 pub fn splade() -> Self {
356 Self {
357 format: SparseFormat::MaxScore,
358 index_size: IndexSize::U16,
359 weight_quantization: WeightQuantization::UInt8,
360 weight_threshold: 0.01, // Remove ~30-50% of low-weight postings
361 block_size: 128,
362 bmp_block_size: 64,
363 max_bmp_grid_bytes: 0,
364 bmp_superblock_size: 64,
365 pruning: Some(0.1), // Keep top 10% per dimension
366 query_config: Some(SparseQueryConfig {
367 tokenizer: None,
368 weighting: QueryWeighting::One,
369 heap_factor: 0.8, // 20% faster approximate search
370 weight_threshold: 0.01, // Drop low-IDF query tokens
371 max_query_dims: Some(20), // Process top 20 query dimensions
372 pruning: Some(0.1), // Keep top 10% of query dims
373 min_query_dims: 4,
374 max_superblocks: 0,
375 }),
376
377 dims: None,
378 max_weight: None,
379 min_terms: 4,
380 }
381 }
382
383 /// SPLADE-optimized config with BMP (Block-Max Pruning) format
384 ///
385 /// Same optimization settings as `splade()` but uses the BMP block-at-a-time
386 /// format (Mallia, Suel & Tonellotto, SIGIR 2024) instead of MaxScore.
387 /// BMP divides the document space into fixed-size blocks and processes them
388 /// in decreasing upper-bound order, enabling aggressive early termination.
389 pub fn splade_bmp() -> Self {
390 Self {
391 format: SparseFormat::Bmp,
392 index_size: IndexSize::U16,
393 weight_quantization: WeightQuantization::UInt8,
394 weight_threshold: 0.01,
395 block_size: 128,
396 bmp_block_size: 64,
397 max_bmp_grid_bytes: 0,
398 bmp_superblock_size: 64,
399 pruning: Some(0.1),
400 query_config: Some(SparseQueryConfig {
401 tokenizer: None,
402 weighting: QueryWeighting::One,
403 heap_factor: 0.8,
404 weight_threshold: 0.01,
405 max_query_dims: Some(20),
406 pruning: Some(0.1),
407 min_query_dims: 4,
408 max_superblocks: 0,
409 }),
410
411 dims: Some(105879),
412 max_weight: Some(5.0),
413 min_terms: 4,
414 }
415 }
416
417 /// Compact config: Maximum compression (experimental)
418 ///
419 /// Uses aggressive UInt4 quantization for smallest possible index size.
420 /// Expected trade-offs:
421 /// - Index size: ~10-15% of Float32 baseline
422 /// - Effectiveness: ~3-5% nDCG@10 loss
423 ///
424 /// Recommended for: Memory-constrained environments, cache-heavy workloads
425 pub fn compact() -> Self {
426 Self {
427 format: SparseFormat::MaxScore,
428 index_size: IndexSize::U16,
429 weight_quantization: WeightQuantization::UInt4,
430 weight_threshold: 0.02, // Slightly higher threshold for UInt4
431 block_size: 128,
432 bmp_block_size: 64,
433 max_bmp_grid_bytes: 0,
434 bmp_superblock_size: 64,
435 pruning: Some(0.15), // Keep top 15% per dimension
436 query_config: Some(SparseQueryConfig {
437 tokenizer: None,
438 weighting: QueryWeighting::One,
439 heap_factor: 0.7, // More aggressive approximate search
440 weight_threshold: 0.02, // Drop low-IDF query tokens
441 max_query_dims: Some(15), // Fewer query dimensions
442 pruning: Some(0.15), // Keep top 15% of query dims
443 min_query_dims: 4,
444 max_superblocks: 0,
445 }),
446
447 dims: None,
448 max_weight: None,
449 min_terms: 4,
450 }
451 }
452
453 /// Full precision config: No compression, baseline effectiveness
454 ///
455 /// Use for: Research baselines, when effectiveness is critical
456 pub fn full_precision() -> Self {
457 Self {
458 format: SparseFormat::MaxScore,
459 index_size: IndexSize::U32,
460 weight_quantization: WeightQuantization::Float32,
461 weight_threshold: 0.0,
462 block_size: 128,
463 bmp_block_size: 64,
464 max_bmp_grid_bytes: 0,
465 bmp_superblock_size: 64,
466 pruning: None,
467 query_config: None,
468
469 dims: None,
470 max_weight: None,
471 min_terms: 4,
472 }
473 }
474
475 /// Conservative config: Mild optimizations, minimal effectiveness loss
476 ///
477 /// Balances compression and effectiveness with conservative defaults.
478 /// Expected trade-offs:
479 /// - Index size: ~40-50% of Float32 baseline
480 /// - Query latency: ~20-30% faster
481 /// - Effectiveness: <1% nDCG@10 loss
482 ///
483 /// Recommended for: Production deployments prioritizing effectiveness
484 pub fn conservative() -> Self {
485 Self {
486 format: SparseFormat::MaxScore,
487 index_size: IndexSize::U32,
488 weight_quantization: WeightQuantization::Float16,
489 weight_threshold: 0.005, // Minimal pruning
490 block_size: 128,
491 bmp_block_size: 64,
492 max_bmp_grid_bytes: 0,
493 bmp_superblock_size: 64,
494 pruning: None, // No posting list pruning
495 query_config: Some(SparseQueryConfig {
496 tokenizer: None,
497 weighting: QueryWeighting::One,
498 heap_factor: 0.9, // Nearly exact search
499 weight_threshold: 0.005, // Minimal query pruning
500 max_query_dims: Some(50), // Process more dimensions
501 pruning: None, // No fraction-based pruning
502 min_query_dims: 4,
503 max_superblocks: 0,
504 }),
505
506 dims: None,
507 max_weight: None,
508 min_terms: 4,
509 }
510 }
511
512 /// Set weight threshold (builder pattern)
513 pub fn with_weight_threshold(mut self, threshold: f32) -> Self {
514 self.weight_threshold = threshold;
515 self
516 }
517
518 /// Set posting list pruning fraction (builder pattern)
519 /// e.g., 0.1 = keep top 10% of postings per dimension
520 pub fn with_pruning(mut self, fraction: f32) -> Self {
521 self.pruning = Some(fraction.clamp(0.0, 1.0));
522 self
523 }
524
525 /// Bytes per entry (index + weight)
526 pub fn bytes_per_entry(&self) -> f32 {
527 self.index_size.bytes() as f32 + self.weight_quantization.bytes_per_weight()
528 }
529
530 /// Serialize config to a single byte.
531 ///
532 /// Layout: bits 7-4 = IndexSize, bit 3 = format (0=MaxScore, 1=BMP), bits 2-0 = WeightQuantization
533 pub fn to_byte(&self) -> u8 {
534 let format_bit = if self.format == SparseFormat::Bmp {
535 0x08
536 } else {
537 0
538 };
539 ((self.index_size as u8) << 4) | format_bit | (self.weight_quantization as u8)
540 }
541
542 /// Deserialize config from a single byte.
543 ///
544 /// Note: weight_threshold, block_size, bmp_block_size, and query_config are not
545 /// serialized in the byte — they come from the schema.
546 pub fn from_byte(b: u8) -> Option<Self> {
547 let index_size = IndexSize::from_u8((b >> 4) & 0x03)?;
548 let format = if b & 0x08 != 0 {
549 SparseFormat::Bmp
550 } else {
551 SparseFormat::MaxScore
552 };
553 let weight_quantization = WeightQuantization::from_u8(b & 0x07)?;
554 Some(Self {
555 format,
556 index_size,
557 weight_quantization,
558 weight_threshold: 0.0,
559 block_size: 128,
560 bmp_block_size: 64,
561 max_bmp_grid_bytes: 0,
562 bmp_superblock_size: 64,
563 pruning: None,
564 query_config: None,
565
566 dims: None,
567 max_weight: None,
568 min_terms: 4,
569 })
570 }
571
572 /// Set block size (builder pattern)
573 /// Must be power of 2, recommended: 64, 128, 256
574 pub fn with_block_size(mut self, size: usize) -> Self {
575 self.block_size = size.next_power_of_two();
576 self
577 }
578
579 /// Set query configuration (builder pattern)
580 pub fn with_query_config(mut self, config: SparseQueryConfig) -> Self {
581 self.query_config = Some(config);
582 self
583 }
584}
585
/// A sparse vector entry: (dimension_id, weight)
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct SparseEntry {
    /// Dimension (term) identifier
    pub dim_id: u32,
    /// Weight for this dimension
    pub weight: f32,
}
592
/// Sparse vector representation
///
/// `dot` relies on entries being sorted ascending by `dim_id`; `from_entries`
/// and `sort_by_dim` establish that order, and `push` debug-asserts it.
/// NOTE(review): `From<Vec<(u32, f32)>>` does NOT sort — confirm callers
/// supply ordered pairs.
#[derive(Debug, Clone, Default)]
pub struct SparseVector {
    // Entry list; ordering invariants are maintained by callers (see above).
    pub(super) entries: Vec<SparseEntry>,
}
598
599impl SparseVector {
600 /// Create a new sparse vector
601 pub fn new() -> Self {
602 Self {
603 entries: Vec::new(),
604 }
605 }
606
607 /// Create with pre-allocated capacity
608 pub fn with_capacity(capacity: usize) -> Self {
609 Self {
610 entries: Vec::with_capacity(capacity),
611 }
612 }
613
614 /// Create from dimension IDs and weights
615 pub fn from_entries(dim_ids: &[u32], weights: &[f32]) -> Self {
616 assert_eq!(dim_ids.len(), weights.len());
617 let mut entries: Vec<SparseEntry> = dim_ids
618 .iter()
619 .zip(weights.iter())
620 .map(|(&dim_id, &weight)| SparseEntry { dim_id, weight })
621 .collect();
622 // Sort by dimension ID for efficient intersection
623 entries.sort_by_key(|e| e.dim_id);
624 Self { entries }
625 }
626
627 /// Add an entry (must maintain sorted order by dim_id)
628 pub fn push(&mut self, dim_id: u32, weight: f32) {
629 debug_assert!(
630 self.entries.is_empty() || self.entries.last().unwrap().dim_id < dim_id,
631 "Entries must be added in sorted order by dim_id"
632 );
633 self.entries.push(SparseEntry { dim_id, weight });
634 }
635
636 /// Number of non-zero entries
637 pub fn len(&self) -> usize {
638 self.entries.len()
639 }
640
641 /// Check if empty
642 pub fn is_empty(&self) -> bool {
643 self.entries.is_empty()
644 }
645
646 /// Iterate over entries
647 pub fn iter(&self) -> impl Iterator<Item = &SparseEntry> {
648 self.entries.iter()
649 }
650
651 /// Sort by dimension ID (required for posting list encoding)
652 pub fn sort_by_dim(&mut self) {
653 self.entries.sort_by_key(|e| e.dim_id);
654 }
655
656 /// Sort by weight descending
657 pub fn sort_by_weight_desc(&mut self) {
658 self.entries.sort_by(|a, b| {
659 b.weight
660 .partial_cmp(&a.weight)
661 .unwrap_or(std::cmp::Ordering::Equal)
662 });
663 }
664
665 /// Get top-k entries by weight
666 pub fn top_k(&self, k: usize) -> Vec<SparseEntry> {
667 let mut sorted = self.entries.clone();
668 sorted.sort_by(|a, b| {
669 b.weight
670 .partial_cmp(&a.weight)
671 .unwrap_or(std::cmp::Ordering::Equal)
672 });
673 sorted.truncate(k);
674 sorted
675 }
676
677 /// Compute dot product with another sparse vector
678 pub fn dot(&self, other: &SparseVector) -> f32 {
679 let mut result = 0.0f32;
680 let mut i = 0;
681 let mut j = 0;
682
683 while i < self.entries.len() && j < other.entries.len() {
684 let a = &self.entries[i];
685 let b = &other.entries[j];
686
687 match a.dim_id.cmp(&b.dim_id) {
688 std::cmp::Ordering::Less => i += 1,
689 std::cmp::Ordering::Greater => j += 1,
690 std::cmp::Ordering::Equal => {
691 result += a.weight * b.weight;
692 i += 1;
693 j += 1;
694 }
695 }
696 }
697
698 result
699 }
700
701 /// L2 norm squared
702 pub fn norm_squared(&self) -> f32 {
703 self.entries.iter().map(|e| e.weight * e.weight).sum()
704 }
705
706 /// L2 norm
707 pub fn norm(&self) -> f32 {
708 self.norm_squared().sqrt()
709 }
710
711 /// Prune dimensions below a weight threshold
712 pub fn filter_by_weight(&self, min_weight: f32) -> Self {
713 let entries: Vec<SparseEntry> = self
714 .entries
715 .iter()
716 .filter(|e| e.weight.abs() >= min_weight)
717 .cloned()
718 .collect();
719 Self { entries }
720 }
721}
722
723impl From<Vec<(u32, f32)>> for SparseVector {
724 fn from(pairs: Vec<(u32, f32)>) -> Self {
725 Self {
726 entries: pairs
727 .into_iter()
728 .map(|(dim_id, weight)| SparseEntry { dim_id, weight })
729 .collect(),
730 }
731 }
732}
733
734impl From<SparseVector> for Vec<(u32, f32)> {
735 fn from(vec: SparseVector) -> Self {
736 vec.entries
737 .into_iter()
738 .map(|e| (e.dim_id, e.weight))
739 .collect()
740 }
741}