1use roaring::RoaringBitmap;
7use std::collections::HashMap;
8use std::io::{self, Read, Write};
9
10#[allow(dead_code)]
12#[derive(Debug, Clone, Copy, PartialEq, Eq)]
13#[repr(u8)]
14pub enum FieldType {
15 Keyword = 1,
17 Integer = 2,
19 Float = 3,
21 Boolean = 4,
23}
24
25impl From<u8> for FieldType {
26 fn from(v: u8) -> Self {
27 match v {
28 2 => Self::Integer,
29 3 => Self::Float,
30 4 => Self::Boolean,
31 _ => Self::Keyword, }
33 }
34}
35
36#[derive(Debug, Clone, Default)]
38pub struct KeywordIndex {
39 terms: HashMap<String, RoaringBitmap>,
41}
42
43impl KeywordIndex {
44 pub fn new() -> Self {
45 Self::default()
46 }
47
48 pub fn insert(&mut self, doc_id: u32, term: &str) {
50 self.terms
51 .entry(term.to_string())
52 .or_default()
53 .insert(doc_id);
54 }
55
56 pub fn remove(&mut self, doc_id: u32) {
58 for bitmap in self.terms.values_mut() {
59 bitmap.remove(doc_id);
60 }
61 self.terms.retain(|_, bitmap| !bitmap.is_empty());
63 }
64
65 pub fn get(&self, term: &str) -> Option<&RoaringBitmap> {
67 self.terms.get(term)
68 }
69
70 #[inline]
72 pub fn contains(&self, doc_id: u32, term: &str) -> bool {
73 self.terms
74 .get(term)
75 .is_some_and(|bitmap| bitmap.contains(doc_id))
76 }
77
78 pub fn terms(&self) -> impl Iterator<Item = &str> {
80 self.terms.keys().map(std::string::String::as_str)
81 }
82
83 pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
85 writer.write_all(&(self.terms.len() as u32).to_le_bytes())?;
87
88 for (term, bitmap) in &self.terms {
89 writer.write_all(&(term.len() as u32).to_le_bytes())?;
91 writer.write_all(term.as_bytes())?;
92
93 let mut bitmap_bytes = Vec::new();
95 bitmap.serialize_into(&mut bitmap_bytes)?;
96 writer.write_all(&(bitmap_bytes.len() as u32).to_le_bytes())?;
97 writer.write_all(&bitmap_bytes)?;
98 }
99
100 Ok(())
101 }
102
103 pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
105 let mut len_buf = [0u8; 4];
106 reader.read_exact(&mut len_buf)?;
107 let term_count = u32::from_le_bytes(len_buf) as usize;
108
109 let mut terms = HashMap::with_capacity(term_count);
110
111 for _ in 0..term_count {
112 reader.read_exact(&mut len_buf)?;
114 let term_len = u32::from_le_bytes(len_buf) as usize;
115 let mut term_buf = vec![0u8; term_len];
116 reader.read_exact(&mut term_buf)?;
117 let term = String::from_utf8(term_buf)
118 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
119
120 reader.read_exact(&mut len_buf)?;
122 let bitmap_len = u32::from_le_bytes(len_buf) as usize;
123 let mut bitmap_buf = vec![0u8; bitmap_len];
124 reader.read_exact(&mut bitmap_buf)?;
125 let bitmap = RoaringBitmap::deserialize_from(&bitmap_buf[..])
126 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
127
128 terms.insert(term, bitmap);
129 }
130
131 Ok(Self { terms })
132 }
133}
134
135#[derive(Debug, Clone, Default)]
137pub struct BooleanIndex {
138 true_docs: RoaringBitmap,
139 false_docs: RoaringBitmap,
140}
141
142impl BooleanIndex {
143 pub fn new() -> Self {
144 Self::default()
145 }
146
147 pub fn insert(&mut self, doc_id: u32, value: bool) {
148 if value {
149 self.true_docs.insert(doc_id);
150 self.false_docs.remove(doc_id);
151 } else {
152 self.false_docs.insert(doc_id);
153 self.true_docs.remove(doc_id);
154 }
155 }
156
157 pub fn remove(&mut self, doc_id: u32) {
158 self.true_docs.remove(doc_id);
159 self.false_docs.remove(doc_id);
160 }
161
162 #[inline]
163 pub fn matches(&self, doc_id: u32, value: bool) -> bool {
164 if value {
165 self.true_docs.contains(doc_id)
166 } else {
167 self.false_docs.contains(doc_id)
168 }
169 }
170
171 pub fn get_true(&self) -> &RoaringBitmap {
172 &self.true_docs
173 }
174
175 pub fn get_false(&self) -> &RoaringBitmap {
176 &self.false_docs
177 }
178
179 pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
181 let mut true_bytes = Vec::new();
183 self.true_docs.serialize_into(&mut true_bytes)?;
184 writer.write_all(&(true_bytes.len() as u32).to_le_bytes())?;
185 writer.write_all(&true_bytes)?;
186
187 let mut false_bytes = Vec::new();
189 self.false_docs.serialize_into(&mut false_bytes)?;
190 writer.write_all(&(false_bytes.len() as u32).to_le_bytes())?;
191 writer.write_all(&false_bytes)?;
192
193 Ok(())
194 }
195
196 pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
198 let mut len_buf = [0u8; 4];
199
200 reader.read_exact(&mut len_buf)?;
202 let true_len = u32::from_le_bytes(len_buf) as usize;
203 let mut true_buf = vec![0u8; true_len];
204 reader.read_exact(&mut true_buf)?;
205 let true_docs = RoaringBitmap::deserialize_from(&true_buf[..])
206 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
207
208 reader.read_exact(&mut len_buf)?;
210 let false_len = u32::from_le_bytes(len_buf) as usize;
211 let mut false_buf = vec![0u8; false_len];
212 reader.read_exact(&mut false_buf)?;
213 let false_docs = RoaringBitmap::deserialize_from(&false_buf[..])
214 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
215
216 Ok(Self {
217 true_docs,
218 false_docs,
219 })
220 }
221}
222
223#[derive(Debug, Clone, Default)]
225pub struct NumericIndex {
226 entries: Vec<(f64, u32)>,
228 common_values: HashMap<i64, RoaringBitmap>,
230}
231
232impl NumericIndex {
233 pub fn new() -> Self {
234 Self::default()
235 }
236
237 pub fn insert(&mut self, doc_id: u32, value: f64) {
238 if value.is_nan() {
240 return;
241 }
242
243 let pos = self
245 .entries
246 .binary_search_by(|(v, _)| v.total_cmp(&value))
247 .unwrap_or_else(|p| p);
248 self.entries.insert(pos, (value, doc_id));
249
250 if value.fract() == 0.0 && value >= i64::MIN as f64 && value <= i64::MAX as f64 {
252 let int_val = value as i64;
253 self.common_values
254 .entry(int_val)
255 .or_default()
256 .insert(doc_id);
257 }
258 }
259
260 pub fn remove(&mut self, doc_id: u32) {
261 self.entries.retain(|(_, id)| *id != doc_id);
262 for bitmap in self.common_values.values_mut() {
263 bitmap.remove(doc_id);
264 }
265 self.common_values.retain(|_, bitmap| !bitmap.is_empty());
267 }
268
269 pub fn get_eq(&self, value: f64) -> Option<&RoaringBitmap> {
271 if value.fract() == 0.0 {
272 self.common_values.get(&(value as i64))
273 } else {
274 None
275 }
276 }
277
278 pub fn get_range(&self, min: f64, max: f64) -> RoaringBitmap {
280 let start = self.entries.partition_point(|(v, _)| *v < min);
283
284 let mut result = RoaringBitmap::new();
285 for (val, doc_id) in &self.entries[start..] {
286 if *val > max {
287 break;
288 }
289 result.insert(*doc_id);
290 }
291 result
292 }
293
294 #[inline]
296 pub fn matches_eq(&self, doc_id: u32, value: f64) -> bool {
297 if let Some(bitmap) = self.get_eq(value) {
298 bitmap.contains(doc_id)
299 } else {
300 self.entries
302 .iter()
303 .any(|(v, id)| *id == doc_id && (*v - value).abs() < f64::EPSILON)
304 }
305 }
306
307 #[inline]
309 pub fn matches_range(&self, doc_id: u32, min: f64, max: f64) -> bool {
310 self.entries
311 .iter()
312 .any(|(v, id)| *id == doc_id && *v >= min && *v <= max)
313 }
314
315 #[inline]
317 pub fn matches_gt(&self, doc_id: u32, threshold: f64) -> bool {
318 self.entries
319 .iter()
320 .any(|(v, id)| *id == doc_id && *v > threshold)
321 }
322
323 #[inline]
325 pub fn matches_lt(&self, doc_id: u32, threshold: f64) -> bool {
326 self.entries
327 .iter()
328 .any(|(v, id)| *id == doc_id && *v < threshold)
329 }
330
331 pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
333 writer.write_all(&(self.entries.len() as u32).to_le_bytes())?;
335 for (value, doc_id) in &self.entries {
336 writer.write_all(&value.to_le_bytes())?;
337 writer.write_all(&doc_id.to_le_bytes())?;
338 }
339
340 writer.write_all(&(self.common_values.len() as u32).to_le_bytes())?;
342 for (int_val, bitmap) in &self.common_values {
343 writer.write_all(&int_val.to_le_bytes())?;
344 let mut bitmap_bytes = Vec::new();
345 bitmap.serialize_into(&mut bitmap_bytes)?;
346 writer.write_all(&(bitmap_bytes.len() as u32).to_le_bytes())?;
347 writer.write_all(&bitmap_bytes)?;
348 }
349
350 Ok(())
351 }
352
353 pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
355 let mut buf4 = [0u8; 4];
356 let mut buf8 = [0u8; 8];
357
358 reader.read_exact(&mut buf4)?;
360 let entries_len = u32::from_le_bytes(buf4) as usize;
361 let mut entries = Vec::with_capacity(entries_len);
362 for _ in 0..entries_len {
363 reader.read_exact(&mut buf8)?;
364 let value = f64::from_le_bytes(buf8);
365 reader.read_exact(&mut buf4)?;
366 let doc_id = u32::from_le_bytes(buf4);
367 entries.push((value, doc_id));
368 }
369
370 reader.read_exact(&mut buf4)?;
372 let common_len = u32::from_le_bytes(buf4) as usize;
373 let mut common_values = HashMap::with_capacity(common_len);
374 for _ in 0..common_len {
375 reader.read_exact(&mut buf8)?;
376 let int_val = i64::from_le_bytes(buf8);
377 reader.read_exact(&mut buf4)?;
378 let bitmap_len = u32::from_le_bytes(buf4) as usize;
379 let mut bitmap_buf = vec![0u8; bitmap_len];
380 reader.read_exact(&mut bitmap_buf)?;
381 let bitmap = RoaringBitmap::deserialize_from(&bitmap_buf[..])
382 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
383 common_values.insert(int_val, bitmap);
384 }
385
386 Ok(Self {
387 entries,
388 common_values,
389 })
390 }
391}
392
393#[derive(Debug, Clone)]
395pub enum FieldIndex {
396 Keyword(KeywordIndex),
397 Boolean(BooleanIndex),
398 Numeric(NumericIndex),
399}
400
401impl FieldIndex {
402 #[must_use]
403 pub fn keyword() -> Self {
404 Self::Keyword(KeywordIndex::new())
405 }
406
407 #[must_use]
408 pub fn boolean() -> Self {
409 Self::Boolean(BooleanIndex::new())
410 }
411
412 #[must_use]
413 pub fn numeric() -> Self {
414 Self::Numeric(NumericIndex::new())
415 }
416}
417
418#[derive(Debug, Clone, Default)]
420pub struct MetadataIndex {
421 fields: HashMap<String, FieldIndex>,
422}
423
424impl MetadataIndex {
425 #[must_use]
426 pub fn new() -> Self {
427 Self::default()
428 }
429
430 pub fn keyword_index(&mut self, field: &str) -> Option<&mut KeywordIndex> {
433 let index = self
434 .fields
435 .entry(field.to_string())
436 .or_insert_with(FieldIndex::keyword);
437 match index {
438 FieldIndex::Keyword(idx) => Some(idx),
439 _ => None, }
441 }
442
443 pub fn boolean_index(&mut self, field: &str) -> Option<&mut BooleanIndex> {
446 let index = self
447 .fields
448 .entry(field.to_string())
449 .or_insert_with(FieldIndex::boolean);
450 match index {
451 FieldIndex::Boolean(idx) => Some(idx),
452 _ => None, }
454 }
455
456 pub fn numeric_index(&mut self, field: &str) -> Option<&mut NumericIndex> {
459 let index = self
460 .fields
461 .entry(field.to_string())
462 .or_insert_with(FieldIndex::numeric);
463 match index {
464 FieldIndex::Numeric(idx) => Some(idx),
465 _ => None, }
467 }
468
469 #[must_use]
471 pub fn get(&self, field: &str) -> Option<&FieldIndex> {
472 self.fields.get(field)
473 }
474
475 pub fn index_json(&mut self, doc_id: u32, metadata: &serde_json::Value) {
478 if let serde_json::Value::Object(map) = metadata {
479 for (key, value) in map {
480 match value {
481 serde_json::Value::String(s) => {
482 if let Some(idx) = self.keyword_index(key) {
483 idx.insert(doc_id, s);
484 }
485 }
486 serde_json::Value::Bool(b) => {
487 if let Some(idx) = self.boolean_index(key) {
488 idx.insert(doc_id, *b);
489 }
490 }
491 serde_json::Value::Number(n) => {
492 if let Some(f) = n.as_f64() {
493 if let Some(idx) = self.numeric_index(key) {
494 idx.insert(doc_id, f);
495 }
496 }
497 }
498 _ => {} }
500 }
501 }
502 }
503
504 pub fn remove(&mut self, doc_id: u32) {
506 for index in self.fields.values_mut() {
507 match index {
508 FieldIndex::Keyword(idx) => idx.remove(doc_id),
509 FieldIndex::Boolean(idx) => idx.remove(doc_id),
510 FieldIndex::Numeric(idx) => idx.remove(doc_id),
511 }
512 }
513 }
514
515 #[inline]
517 #[must_use]
518 pub fn matches(&self, doc_id: u32, filter: &Filter) -> bool {
519 match filter {
520 Filter::Eq(field, value) => self.matches_eq(doc_id, field, value),
521 Filter::Ne(field, value) => !self.matches_eq(doc_id, field, value),
522 Filter::Gt(field, value) => self.matches_gt(doc_id, field, *value),
523 Filter::Gte(field, value) => self.matches_gte(doc_id, field, *value),
524 Filter::Lt(field, value) => self.matches_lt(doc_id, field, *value),
525 Filter::Lte(field, value) => self.matches_lte(doc_id, field, *value),
526 Filter::In(field, values) => values.iter().any(|v| self.matches_eq(doc_id, field, v)),
527 Filter::And(filters) => filters.iter().all(|f| self.matches(doc_id, f)),
528 Filter::Or(filters) => filters.iter().any(|f| self.matches(doc_id, f)),
529 Filter::Not(inner) => !self.matches(doc_id, inner),
530 }
531 }
532
533 fn matches_eq(&self, doc_id: u32, field: &str, value: &FilterValue) -> bool {
534 match (self.get(field), value) {
535 (Some(FieldIndex::Keyword(idx)), FilterValue::String(s)) => idx.contains(doc_id, s),
536 (Some(FieldIndex::Boolean(idx)), FilterValue::Bool(b)) => idx.matches(doc_id, *b),
537 (Some(FieldIndex::Numeric(idx)), FilterValue::Number(n)) => idx.matches_eq(doc_id, *n),
538 _ => false,
539 }
540 }
541
542 fn matches_gt(&self, doc_id: u32, field: &str, value: f64) -> bool {
543 match self.get(field) {
544 Some(FieldIndex::Numeric(idx)) => idx.matches_gt(doc_id, value),
545 _ => false,
546 }
547 }
548
549 fn matches_gte(&self, doc_id: u32, field: &str, value: f64) -> bool {
550 match self.get(field) {
551 Some(FieldIndex::Numeric(idx)) => idx.matches_range(doc_id, value, f64::INFINITY),
552 _ => false,
553 }
554 }
555
556 fn matches_lt(&self, doc_id: u32, field: &str, value: f64) -> bool {
557 match self.get(field) {
558 Some(FieldIndex::Numeric(idx)) => idx.matches_lt(doc_id, value),
559 _ => false,
560 }
561 }
562
563 fn matches_lte(&self, doc_id: u32, field: &str, value: f64) -> bool {
564 match self.get(field) {
565 Some(FieldIndex::Numeric(idx)) => idx.matches_range(doc_id, f64::NEG_INFINITY, value),
566 _ => false,
567 }
568 }
569}
570
571#[derive(Debug, Clone)]
573pub enum FilterValue {
574 String(String),
575 Number(f64),
576 Bool(bool),
577}
578
579#[derive(Debug, Clone)]
581pub enum Filter {
582 Eq(String, FilterValue),
583 Ne(String, FilterValue),
584 Gt(String, f64),
585 Gte(String, f64),
586 Lt(String, f64),
587 Lte(String, f64),
588 In(String, Vec<FilterValue>),
589 And(Vec<Filter>),
590 Or(Vec<Filter>),
591 Not(Box<Filter>),
592}
593
594#[cfg(test)]
595mod tests {
596 use super::*;
597 use serde_json::json;
598
599 #[test]
600 fn test_keyword_index() {
601 let mut idx = KeywordIndex::new();
602 idx.insert(1, "apple");
603 idx.insert(2, "apple");
604 idx.insert(3, "banana");
605
606 assert!(idx.contains(1, "apple"));
607 assert!(idx.contains(2, "apple"));
608 assert!(!idx.contains(3, "apple"));
609 assert!(idx.contains(3, "banana"));
610
611 let bitmap = idx.get("apple").unwrap();
612 assert_eq!(bitmap.len(), 2);
613 }
614
615 #[test]
616 fn test_boolean_index() {
617 let mut idx = BooleanIndex::new();
618 idx.insert(1, true);
619 idx.insert(2, false);
620 idx.insert(3, true);
621
622 assert!(idx.matches(1, true));
623 assert!(!idx.matches(1, false));
624 assert!(idx.matches(2, false));
625 assert!(idx.matches(3, true));
626 }
627
628 #[test]
629 fn test_numeric_index() {
630 let mut idx = NumericIndex::new();
631 idx.insert(1, 10.0);
632 idx.insert(2, 20.0);
633 idx.insert(3, 30.0);
634
635 assert!(idx.matches_eq(1, 10.0));
637 assert!(!idx.matches_eq(1, 20.0));
638
639 let range = idx.get_range(15.0, 25.0);
641 assert!(range.contains(2));
642 assert!(!range.contains(1));
643 assert!(!range.contains(3));
644 }
645
646 #[test]
647 fn test_metadata_index_json() {
648 let mut idx = MetadataIndex::new();
649
650 idx.index_json(1, &json!({"category": "tech", "score": 85, "active": true}));
651 idx.index_json(
652 2,
653 &json!({"category": "tech", "score": 92, "active": false}),
654 );
655 idx.index_json(
656 3,
657 &json!({"category": "science", "score": 78, "active": true}),
658 );
659
660 let filter = Filter::Eq("category".into(), FilterValue::String("tech".into()));
662 assert!(idx.matches(1, &filter));
663 assert!(idx.matches(2, &filter));
664 assert!(!idx.matches(3, &filter));
665
666 let filter = Filter::Gte("score".into(), 80.0);
668 assert!(idx.matches(1, &filter));
669 assert!(idx.matches(2, &filter));
670 assert!(!idx.matches(3, &filter));
671
672 let filter = Filter::Eq("active".into(), FilterValue::Bool(true));
674 assert!(idx.matches(1, &filter));
675 assert!(!idx.matches(2, &filter));
676 assert!(idx.matches(3, &filter));
677
678 let filter = Filter::And(vec![
680 Filter::Eq("category".into(), FilterValue::String("tech".into())),
681 Filter::Eq("active".into(), FilterValue::Bool(true)),
682 ]);
683 assert!(idx.matches(1, &filter));
684 assert!(!idx.matches(2, &filter));
685 assert!(!idx.matches(3, &filter));
686 }
687
688 #[test]
689 fn test_keyword_serialize_roundtrip() {
690 let mut idx = KeywordIndex::new();
691 idx.insert(1, "apple");
692 idx.insert(2, "apple");
693 idx.insert(3, "banana");
694
695 let mut buf = Vec::new();
696 idx.serialize(&mut buf).unwrap();
697
698 let mut cursor = std::io::Cursor::new(&buf);
699 let idx2 = KeywordIndex::deserialize(&mut cursor).unwrap();
700
701 assert!(idx2.contains(1, "apple"));
702 assert!(idx2.contains(2, "apple"));
703 assert!(idx2.contains(3, "banana"));
704 }
705}