1pub const BITVEC_THRESHOLD: f32 = 0.0625; pub const MAX_DIMENSION: usize = 65536;
57
58#[derive(Debug, Clone)]
60pub enum OutlierSet {
61 Empty,
63 Sparse(SparseOutliers),
65 Dense(DenseOutliers),
67}
68
69impl OutlierSet {
70 pub fn empty() -> Self {
72 OutlierSet::Empty
73 }
74
75 pub fn from_dims(dims: &[u16], dimension: usize) -> Self {
77 if dims.is_empty() {
78 return OutlierSet::Empty;
79 }
80
81 let density = dims.len() as f32 / dimension as f32;
82
83 if density > BITVEC_THRESHOLD {
84 OutlierSet::Dense(DenseOutliers::from_dims(dims, dimension))
85 } else {
86 OutlierSet::Sparse(SparseOutliers::from_dims(dims))
87 }
88 }
89
90 pub fn from_iter<I: IntoIterator<Item = u16>>(iter: I, dimension: usize) -> Self {
92 let dims: Vec<u16> = iter.into_iter().collect();
93 Self::from_dims(&dims, dimension)
94 }
95
96 #[inline]
98 pub fn contains(&self, dim: u16) -> bool {
99 match self {
100 OutlierSet::Empty => false,
101 OutlierSet::Sparse(s) => s.contains(dim),
102 OutlierSet::Dense(d) => d.contains(dim),
103 }
104 }
105
106 pub fn len(&self) -> usize {
108 match self {
109 OutlierSet::Empty => 0,
110 OutlierSet::Sparse(s) => s.len(),
111 OutlierSet::Dense(d) => d.len(),
112 }
113 }
114
115 pub fn is_empty(&self) -> bool {
117 matches!(self, OutlierSet::Empty)
118 }
119
120 pub fn memory_bytes(&self) -> usize {
122 match self {
123 OutlierSet::Empty => 0,
124 OutlierSet::Sparse(s) => s.memory_bytes(),
125 OutlierSet::Dense(d) => d.memory_bytes(),
126 }
127 }
128
129 pub fn iter(&self) -> OutlierIterator<'_> {
131 match self {
132 OutlierSet::Empty => OutlierIterator::Empty,
133 OutlierSet::Sparse(s) => OutlierIterator::Sparse(s.dims.iter()),
134 OutlierSet::Dense(d) => OutlierIterator::Dense(DenseIterator::new(d)),
135 }
136 }
137
138 pub fn density(&self, dimension: usize) -> f32 {
140 self.len() as f32 / dimension as f32
141 }
142
143 pub fn is_dense(&self) -> bool {
145 matches!(self, OutlierSet::Dense(_))
146 }
147}
148
149#[derive(Debug, Clone)]
151pub struct SparseOutliers {
152 dims: Vec<u16>,
154}
155
156impl SparseOutliers {
157 pub fn from_dims(dims: &[u16]) -> Self {
159 let mut sorted = dims.to_vec();
160 sorted.sort_unstable();
161 sorted.dedup();
162 Self { dims: sorted }
163 }
164
165 #[inline]
167 pub fn contains(&self, dim: u16) -> bool {
168 self.dims.binary_search(&dim).is_ok()
169 }
170
171 #[inline]
173 pub fn position(&self, dim: u16) -> Option<usize> {
174 self.dims.binary_search(&dim).ok()
175 }
176
177 pub fn len(&self) -> usize {
179 self.dims.len()
180 }
181
182 pub fn is_empty(&self) -> bool {
184 self.dims.is_empty()
185 }
186
187 pub fn memory_bytes(&self) -> usize {
189 self.dims.len() * std::mem::size_of::<u16>()
190 }
191
192 pub fn dims(&self) -> &[u16] {
194 &self.dims
195 }
196}
197
198#[derive(Debug, Clone)]
200pub struct DenseOutliers {
201 bits: Vec<u64>,
203 count: usize,
205}
206
207impl DenseOutliers {
208 pub fn from_dims(dims: &[u16], dimension: usize) -> Self {
210 let num_words = (dimension + 63) / 64;
211 let mut bits = vec![0u64; num_words];
212
213 for &dim in dims {
214 let word_idx = dim as usize / 64;
215 let bit_idx = dim as usize % 64;
216 if word_idx < bits.len() {
217 bits[word_idx] |= 1u64 << bit_idx;
218 }
219 }
220
221 let count = bits.iter().map(|w| w.count_ones() as usize).sum();
223
224 Self { bits, count }
225 }
226
227 #[inline]
229 pub fn contains(&self, dim: u16) -> bool {
230 let word_idx = dim as usize / 64;
231 let bit_idx = dim as usize % 64;
232
233 if word_idx >= self.bits.len() {
234 return false;
235 }
236
237 (self.bits[word_idx] >> bit_idx) & 1 == 1
238 }
239
240 #[inline]
243 pub fn rank(&self, dim: u16) -> usize {
244 let word_idx = dim as usize / 64;
245 let bit_idx = dim as usize % 64;
246
247 let mut count = 0usize;
248
249 for i in 0..word_idx.min(self.bits.len()) {
251 count += self.bits[i].count_ones() as usize;
252 }
253
254 if word_idx < self.bits.len() {
256 let mask = (1u64 << bit_idx) - 1;
257 count += (self.bits[word_idx] & mask).count_ones() as usize;
258 }
259
260 count
261 }
262
263 pub fn len(&self) -> usize {
265 self.count
266 }
267
268 pub fn is_empty(&self) -> bool {
270 self.count == 0
271 }
272
273 pub fn memory_bytes(&self) -> usize {
275 self.bits.len() * std::mem::size_of::<u64>()
276 }
277
278 pub fn capacity(&self) -> usize {
280 self.bits.len() * 64
281 }
282}
283
284pub enum OutlierIterator<'a> {
286 Empty,
287 Sparse(std::slice::Iter<'a, u16>),
288 Dense(DenseIterator<'a>),
289}
290
291impl<'a> Iterator for OutlierIterator<'a> {
292 type Item = u16;
293
294 fn next(&mut self) -> Option<Self::Item> {
295 match self {
296 OutlierIterator::Empty => None,
297 OutlierIterator::Sparse(iter) => iter.next().copied(),
298 OutlierIterator::Dense(iter) => iter.next(),
299 }
300 }
301}
302
303pub struct DenseIterator<'a> {
305 bits: &'a [u64],
306 word_idx: usize,
307 #[allow(dead_code)]
308 bit_idx: u32,
309 remaining: u64,
310}
311
312impl<'a> DenseIterator<'a> {
313 fn new(dense: &'a DenseOutliers) -> Self {
314 let remaining = dense.bits.first().copied().unwrap_or(0);
315 Self {
316 bits: &dense.bits,
317 word_idx: 0,
318 bit_idx: 0,
319 remaining,
320 }
321 }
322}
323
324impl Iterator for DenseIterator<'_> {
325 type Item = u16;
326
327 fn next(&mut self) -> Option<Self::Item> {
328 loop {
329 if self.remaining != 0 {
330 let tz = self.remaining.trailing_zeros();
331 self.remaining &= self.remaining - 1; return Some((self.word_idx * 64 + tz as usize) as u16);
333 }
334
335 self.word_idx += 1;
336 if self.word_idx >= self.bits.len() {
337 return None;
338 }
339 self.remaining = self.bits[self.word_idx];
340 }
341 }
342}
343
344#[derive(Debug, Clone, Copy)]
346pub struct OutlierValue {
347 pub dim: u16,
349 pub value: f32,
351}
352
353#[derive(Debug, Clone)]
355pub struct OutlierStorage {
356 pub set: OutlierSet,
358 pub values: Vec<f32>,
360}
361
362impl OutlierStorage {
363 pub fn empty() -> Self {
365 Self {
366 set: OutlierSet::Empty,
367 values: Vec::new(),
368 }
369 }
370
371 pub fn from_entries(entries: &[OutlierValue], dimension: usize) -> Self {
373 if entries.is_empty() {
374 return Self::empty();
375 }
376
377 let mut sorted: Vec<_> = entries.to_vec();
379 sorted.sort_by_key(|e| e.dim);
380
381 let dims: Vec<u16> = sorted.iter().map(|e| e.dim).collect();
382 let values: Vec<f32> = sorted.iter().map(|e| e.value).collect();
383
384 Self {
385 set: OutlierSet::from_dims(&dims, dimension),
386 values,
387 }
388 }
389
390 #[inline]
392 pub fn get(&self, dim: u16) -> Option<f32> {
393 match &self.set {
394 OutlierSet::Empty => None,
395 OutlierSet::Sparse(s) => s.position(dim).map(|pos| self.values[pos]),
396 OutlierSet::Dense(d) => {
397 if d.contains(dim) {
398 Some(self.values[d.rank(dim)])
399 } else {
400 None
401 }
402 }
403 }
404 }
405
406 #[inline]
408 pub fn contains(&self, dim: u16) -> bool {
409 self.set.contains(dim)
410 }
411
412 pub fn len(&self) -> usize {
414 self.values.len()
415 }
416
417 pub fn is_empty(&self) -> bool {
419 self.values.is_empty()
420 }
421
422 pub fn memory_bytes(&self) -> usize {
424 self.set.memory_bytes() + self.values.len() * std::mem::size_of::<f32>()
425 }
426
427 pub fn iter(&self) -> impl Iterator<Item = (u16, f32)> + '_ {
429 self.set.iter().zip(self.values.iter().copied())
430 }
431}
432
433pub struct BatchOutlierLookup<'a> {
435 storage: &'a OutlierStorage,
436}
437
438impl<'a> BatchOutlierLookup<'a> {
439 pub fn new(storage: &'a OutlierStorage) -> Self {
441 Self { storage }
442 }
443
444 pub fn lookup_batch(&self, dims: &[u16]) -> (Vec<bool>, Vec<f32>) {
447 let mut is_outlier = Vec::with_capacity(dims.len());
448 let mut values = Vec::with_capacity(dims.len());
449
450 for &dim in dims {
451 if let Some(v) = self.storage.get(dim) {
452 is_outlier.push(true);
453 values.push(v);
454 } else {
455 is_outlier.push(false);
456 values.push(0.0);
457 }
458 }
459
460 (is_outlier, values)
461 }
462
463 #[inline]
465 pub fn lookup_4(&self, dims: [u16; 4]) -> ([bool; 4], [f32; 4]) {
466 let mut is_outlier = [false; 4];
467 let mut values = [0.0f32; 4];
468
469 for i in 0..4 {
470 if let Some(v) = self.storage.get(dims[i]) {
471 is_outlier[i] = true;
472 values[i] = v;
473 }
474 }
475
476 (is_outlier, values)
477 }
478}
479
480#[cfg(test)]
481mod tests {
482 use super::*;
483
484 #[test]
485 fn test_empty_outlier_set() {
486 let set = OutlierSet::empty();
487
488 assert!(set.is_empty());
489 assert_eq!(set.len(), 0);
490 assert!(!set.contains(0));
491 assert!(!set.contains(100));
492 }
493
494 #[test]
495 fn test_sparse_outliers() {
496 let dims = vec![5, 10, 100, 200];
497 let set = OutlierSet::from_dims(&dims, 768);
498
499 assert!(!set.is_dense());
500 assert_eq!(set.len(), 4);
501
502 assert!(set.contains(5));
503 assert!(set.contains(10));
504 assert!(set.contains(100));
505 assert!(set.contains(200));
506
507 assert!(!set.contains(0));
508 assert!(!set.contains(6));
509 assert!(!set.contains(99));
510 }
511
512 #[test]
513 fn test_dense_outliers() {
514 let dims: Vec<u16> = (0..100).collect(); let set = OutlierSet::from_dims(&dims, 768);
517
518 assert!(set.is_dense());
519 assert_eq!(set.len(), 100);
520
521 for d in 0..100 {
522 assert!(set.contains(d));
523 }
524 assert!(!set.contains(100));
525 assert!(!set.contains(500));
526 }
527
528 #[test]
529 fn test_sparse_binary_search() {
530 let sparse = SparseOutliers::from_dims(&[10, 20, 30, 40, 50]);
531
532 assert_eq!(sparse.position(10), Some(0));
533 assert_eq!(sparse.position(30), Some(2));
534 assert_eq!(sparse.position(50), Some(4));
535 assert_eq!(sparse.position(15), None);
536 }
537
538 #[test]
539 fn test_dense_rank() {
540 let dense = DenseOutliers::from_dims(&[0, 1, 2, 10, 20], 768);
541
542 assert_eq!(dense.rank(0), 0); assert_eq!(dense.rank(1), 1); assert_eq!(dense.rank(2), 2); assert_eq!(dense.rank(10), 3); assert_eq!(dense.rank(20), 4); }
548
549 #[test]
550 fn test_outlier_storage() {
551 let entries = vec![
552 OutlierValue {
553 dim: 10,
554 value: 1.5,
555 },
556 OutlierValue {
557 dim: 20,
558 value: -2.0,
559 },
560 OutlierValue { dim: 5, value: 0.5 },
561 ];
562
563 let storage = OutlierStorage::from_entries(&entries, 768);
564
565 assert_eq!(storage.len(), 3);
566 assert!(storage.contains(5));
567 assert!(storage.contains(10));
568 assert!(storage.contains(20));
569 assert!(!storage.contains(15));
570
571 assert_eq!(storage.get(5), Some(0.5));
572 assert_eq!(storage.get(10), Some(1.5));
573 assert_eq!(storage.get(20), Some(-2.0));
574 assert_eq!(storage.get(15), None);
575 }
576
577 #[test]
578 fn test_batch_lookup() {
579 let entries = vec![
580 OutlierValue { dim: 0, value: 1.0 },
581 OutlierValue { dim: 2, value: 2.0 },
582 ];
583 let storage = OutlierStorage::from_entries(&entries, 768);
584 let lookup = BatchOutlierLookup::new(&storage);
585
586 let (is_outlier, values) = lookup.lookup_4([0, 1, 2, 3]);
587
588 assert_eq!(is_outlier, [true, false, true, false]);
589 assert_eq!(values[0], 1.0);
590 assert_eq!(values[1], 0.0);
591 assert_eq!(values[2], 2.0);
592 assert_eq!(values[3], 0.0);
593 }
594
595 #[test]
596 fn test_outlier_iteration() {
597 let dims = vec![5, 10, 100];
598 let set = OutlierSet::from_dims(&dims, 768);
599
600 let collected: Vec<u16> = set.iter().collect();
601 assert_eq!(collected, vec![5, 10, 100]);
602 }
603
604 #[test]
605 fn test_dense_iteration() {
606 let dims: Vec<u16> = (0..100).collect();
607 let set = OutlierSet::from_dims(&dims, 768);
608
609 let collected: Vec<u16> = set.iter().collect();
610 assert_eq!(collected.len(), 100);
611 assert_eq!(collected[0], 0);
612 assert_eq!(collected[99], 99);
613 }
614
615 #[test]
616 fn test_crossover_threshold() {
617 let sparse_dims: Vec<u16> = (0..40).collect(); let sparse_set = OutlierSet::from_dims(&sparse_dims, 768);
620 assert!(!sparse_set.is_dense());
621
622 let dense_dims: Vec<u16> = (0..60).collect(); let dense_set = OutlierSet::from_dims(&dense_dims, 768);
625 assert!(dense_set.is_dense());
626 }
627
628 #[test]
629 fn test_memory_efficiency() {
630 let dimension = 768;
631
632 let sparse_dims: Vec<u16> = (0..10).collect();
634 let sparse_set = OutlierSet::from_dims(&sparse_dims, dimension);
635 assert!(sparse_set.memory_bytes() < 100); let dense_dims: Vec<u16> = (0..100).collect();
639 let dense_set = OutlierSet::from_dims(&dense_dims, dimension);
640 assert!(dense_set.memory_bytes() <= 100);
642 }
643
644 #[test]
645 fn test_unsorted_input() {
646 let dims = vec![100, 5, 50, 10, 200];
647 let set = OutlierSet::from_dims(&dims, 768);
648
649 for &d in &dims {
651 assert!(set.contains(d));
652 }
653 }
654
655 #[test]
656 fn test_duplicate_dims() {
657 let dims = vec![5, 5, 10, 10, 10];
658 let set = OutlierSet::from_dims(&dims, 768);
659
660 assert_eq!(set.len(), 2);
662 assert!(set.contains(5));
663 assert!(set.contains(10));
664 }
665}