1use roaring::RoaringBitmap;
7use std::collections::HashMap;
8use std::io::{self, Read, Write};
9
10#[allow(dead_code)]
12#[derive(Debug, Clone, Copy, PartialEq, Eq)]
13#[repr(u8)]
14pub enum FieldType {
15 Keyword = 1,
17 Integer = 2,
19 Float = 3,
21 Boolean = 4,
23}
24
25impl From<u8> for FieldType {
26 fn from(v: u8) -> Self {
27 match v {
28 2 => Self::Integer,
29 3 => Self::Float,
30 4 => Self::Boolean,
31 _ => Self::Keyword, }
33 }
34}
35
36#[derive(Debug, Clone, Default)]
38pub struct KeywordIndex {
39 terms: HashMap<String, RoaringBitmap>,
41}
42
43impl KeywordIndex {
44 pub fn new() -> Self {
45 Self::default()
46 }
47
48 pub fn insert(&mut self, doc_id: u32, term: &str) {
50 self.terms
51 .entry(term.to_string())
52 .or_default()
53 .insert(doc_id);
54 }
55
56 pub fn remove(&mut self, doc_id: u32) {
58 for bitmap in self.terms.values_mut() {
59 bitmap.remove(doc_id);
60 }
61 }
62
63 pub fn get(&self, term: &str) -> Option<&RoaringBitmap> {
65 self.terms.get(term)
66 }
67
68 #[inline]
70 pub fn contains(&self, doc_id: u32, term: &str) -> bool {
71 self.terms
72 .get(term)
73 .is_some_and(|bitmap| bitmap.contains(doc_id))
74 }
75
76 pub fn terms(&self) -> impl Iterator<Item = &str> {
78 self.terms.keys().map(std::string::String::as_str)
79 }
80
81 pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
83 writer.write_all(&(self.terms.len() as u32).to_le_bytes())?;
85
86 for (term, bitmap) in &self.terms {
87 writer.write_all(&(term.len() as u32).to_le_bytes())?;
89 writer.write_all(term.as_bytes())?;
90
91 let mut bitmap_bytes = Vec::new();
93 bitmap.serialize_into(&mut bitmap_bytes)?;
94 writer.write_all(&(bitmap_bytes.len() as u32).to_le_bytes())?;
95 writer.write_all(&bitmap_bytes)?;
96 }
97
98 Ok(())
99 }
100
101 pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
103 let mut len_buf = [0u8; 4];
104 reader.read_exact(&mut len_buf)?;
105 let term_count = u32::from_le_bytes(len_buf) as usize;
106
107 let mut terms = HashMap::with_capacity(term_count);
108
109 for _ in 0..term_count {
110 reader.read_exact(&mut len_buf)?;
112 let term_len = u32::from_le_bytes(len_buf) as usize;
113 let mut term_buf = vec![0u8; term_len];
114 reader.read_exact(&mut term_buf)?;
115 let term = String::from_utf8_lossy(&term_buf).to_string();
116
117 reader.read_exact(&mut len_buf)?;
119 let bitmap_len = u32::from_le_bytes(len_buf) as usize;
120 let mut bitmap_buf = vec![0u8; bitmap_len];
121 reader.read_exact(&mut bitmap_buf)?;
122 let bitmap = RoaringBitmap::deserialize_from(&bitmap_buf[..])
123 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
124
125 terms.insert(term, bitmap);
126 }
127
128 Ok(Self { terms })
129 }
130}
131
132#[derive(Debug, Clone, Default)]
134pub struct BooleanIndex {
135 true_docs: RoaringBitmap,
136 false_docs: RoaringBitmap,
137}
138
139impl BooleanIndex {
140 pub fn new() -> Self {
141 Self::default()
142 }
143
144 pub fn insert(&mut self, doc_id: u32, value: bool) {
145 if value {
146 self.true_docs.insert(doc_id);
147 self.false_docs.remove(doc_id);
148 } else {
149 self.false_docs.insert(doc_id);
150 self.true_docs.remove(doc_id);
151 }
152 }
153
154 pub fn remove(&mut self, doc_id: u32) {
155 self.true_docs.remove(doc_id);
156 self.false_docs.remove(doc_id);
157 }
158
159 #[inline]
160 pub fn matches(&self, doc_id: u32, value: bool) -> bool {
161 if value {
162 self.true_docs.contains(doc_id)
163 } else {
164 self.false_docs.contains(doc_id)
165 }
166 }
167
168 pub fn get_true(&self) -> &RoaringBitmap {
169 &self.true_docs
170 }
171
172 pub fn get_false(&self) -> &RoaringBitmap {
173 &self.false_docs
174 }
175}
176
177#[derive(Debug, Clone, Default)]
179pub struct NumericIndex {
180 entries: Vec<(f64, u32)>,
182 common_values: HashMap<i64, RoaringBitmap>,
184}
185
186impl NumericIndex {
187 pub fn new() -> Self {
188 Self::default()
189 }
190
191 pub fn insert(&mut self, doc_id: u32, value: f64) {
192 if value.is_nan() {
194 return;
195 }
196
197 let pos = self
199 .entries
200 .binary_search_by(|(v, _)| v.total_cmp(&value))
201 .unwrap_or_else(|p| p);
202 self.entries.insert(pos, (value, doc_id));
203
204 if value.fract() == 0.0 && value >= i64::MIN as f64 && value <= i64::MAX as f64 {
206 let int_val = value as i64;
207 self.common_values
208 .entry(int_val)
209 .or_default()
210 .insert(doc_id);
211 }
212 }
213
214 pub fn remove(&mut self, doc_id: u32) {
215 self.entries.retain(|(_, id)| *id != doc_id);
216 for bitmap in self.common_values.values_mut() {
217 bitmap.remove(doc_id);
218 }
219 }
220
221 pub fn get_eq(&self, value: f64) -> Option<&RoaringBitmap> {
223 if value.fract() == 0.0 {
224 self.common_values.get(&(value as i64))
225 } else {
226 None
227 }
228 }
229
230 pub fn get_range(&self, min: f64, max: f64) -> RoaringBitmap {
232 let start = self.entries.partition_point(|(v, _)| *v < min);
235
236 let mut result = RoaringBitmap::new();
237 for (val, doc_id) in &self.entries[start..] {
238 if *val > max {
239 break;
240 }
241 result.insert(*doc_id);
242 }
243 result
244 }
245
246 #[inline]
248 pub fn matches_eq(&self, doc_id: u32, value: f64) -> bool {
249 if let Some(bitmap) = self.get_eq(value) {
250 bitmap.contains(doc_id)
251 } else {
252 self.entries
254 .iter()
255 .any(|(v, id)| *id == doc_id && (*v - value).abs() < f64::EPSILON)
256 }
257 }
258
259 #[inline]
261 pub fn matches_range(&self, doc_id: u32, min: f64, max: f64) -> bool {
262 self.entries
263 .iter()
264 .any(|(v, id)| *id == doc_id && *v >= min && *v <= max)
265 }
266}
267
268#[derive(Debug, Clone)]
270pub enum FieldIndex {
271 Keyword(KeywordIndex),
272 Boolean(BooleanIndex),
273 Numeric(NumericIndex),
274}
275
276impl FieldIndex {
277 #[must_use]
278 pub fn keyword() -> Self {
279 Self::Keyword(KeywordIndex::new())
280 }
281
282 #[must_use]
283 pub fn boolean() -> Self {
284 Self::Boolean(BooleanIndex::new())
285 }
286
287 #[must_use]
288 pub fn numeric() -> Self {
289 Self::Numeric(NumericIndex::new())
290 }
291}
292
293#[derive(Debug, Clone, Default)]
295pub struct MetadataIndex {
296 fields: HashMap<String, FieldIndex>,
297}
298
299impl MetadataIndex {
300 #[must_use]
301 pub fn new() -> Self {
302 Self::default()
303 }
304
305 pub fn keyword_index(&mut self, field: &str) -> Option<&mut KeywordIndex> {
308 let index = self
309 .fields
310 .entry(field.to_string())
311 .or_insert_with(FieldIndex::keyword);
312 match index {
313 FieldIndex::Keyword(idx) => Some(idx),
314 _ => None, }
316 }
317
318 pub fn boolean_index(&mut self, field: &str) -> Option<&mut BooleanIndex> {
321 let index = self
322 .fields
323 .entry(field.to_string())
324 .or_insert_with(FieldIndex::boolean);
325 match index {
326 FieldIndex::Boolean(idx) => Some(idx),
327 _ => None, }
329 }
330
331 pub fn numeric_index(&mut self, field: &str) -> Option<&mut NumericIndex> {
334 let index = self
335 .fields
336 .entry(field.to_string())
337 .or_insert_with(FieldIndex::numeric);
338 match index {
339 FieldIndex::Numeric(idx) => Some(idx),
340 _ => None, }
342 }
343
344 #[must_use]
346 pub fn get(&self, field: &str) -> Option<&FieldIndex> {
347 self.fields.get(field)
348 }
349
350 pub fn index_json(&mut self, doc_id: u32, metadata: &serde_json::Value) {
353 if let serde_json::Value::Object(map) = metadata {
354 for (key, value) in map {
355 match value {
356 serde_json::Value::String(s) => {
357 if let Some(idx) = self.keyword_index(key) {
358 idx.insert(doc_id, s);
359 }
360 }
361 serde_json::Value::Bool(b) => {
362 if let Some(idx) = self.boolean_index(key) {
363 idx.insert(doc_id, *b);
364 }
365 }
366 serde_json::Value::Number(n) => {
367 if let Some(f) = n.as_f64() {
368 if let Some(idx) = self.numeric_index(key) {
369 idx.insert(doc_id, f);
370 }
371 }
372 }
373 _ => {} }
375 }
376 }
377 }
378
379 pub fn remove(&mut self, doc_id: u32) {
381 for index in self.fields.values_mut() {
382 match index {
383 FieldIndex::Keyword(idx) => idx.remove(doc_id),
384 FieldIndex::Boolean(idx) => idx.remove(doc_id),
385 FieldIndex::Numeric(idx) => idx.remove(doc_id),
386 }
387 }
388 }
389
390 #[inline]
392 #[must_use]
393 pub fn matches(&self, doc_id: u32, filter: &Filter) -> bool {
394 match filter {
395 Filter::Eq(field, value) => self.matches_eq(doc_id, field, value),
396 Filter::Ne(field, value) => !self.matches_eq(doc_id, field, value),
397 Filter::Gt(field, value) => self.matches_gt(doc_id, field, *value),
398 Filter::Gte(field, value) => self.matches_gte(doc_id, field, *value),
399 Filter::Lt(field, value) => self.matches_lt(doc_id, field, *value),
400 Filter::Lte(field, value) => self.matches_lte(doc_id, field, *value),
401 Filter::In(field, values) => values.iter().any(|v| self.matches_eq(doc_id, field, v)),
402 Filter::And(filters) => filters.iter().all(|f| self.matches(doc_id, f)),
403 Filter::Or(filters) => filters.iter().any(|f| self.matches(doc_id, f)),
404 Filter::Not(inner) => !self.matches(doc_id, inner),
405 }
406 }
407
408 fn matches_eq(&self, doc_id: u32, field: &str, value: &FilterValue) -> bool {
409 match (self.get(field), value) {
410 (Some(FieldIndex::Keyword(idx)), FilterValue::String(s)) => idx.contains(doc_id, s),
411 (Some(FieldIndex::Boolean(idx)), FilterValue::Bool(b)) => idx.matches(doc_id, *b),
412 (Some(FieldIndex::Numeric(idx)), FilterValue::Number(n)) => idx.matches_eq(doc_id, *n),
413 _ => false,
414 }
415 }
416
417 fn matches_gt(&self, doc_id: u32, field: &str, value: f64) -> bool {
418 match self.get(field) {
419 Some(FieldIndex::Numeric(idx)) => {
420 idx.matches_range(doc_id, value + f64::EPSILON, f64::MAX)
421 }
422 _ => false,
423 }
424 }
425
426 fn matches_gte(&self, doc_id: u32, field: &str, value: f64) -> bool {
427 match self.get(field) {
428 Some(FieldIndex::Numeric(idx)) => idx.matches_range(doc_id, value, f64::MAX),
429 _ => false,
430 }
431 }
432
433 fn matches_lt(&self, doc_id: u32, field: &str, value: f64) -> bool {
434 match self.get(field) {
435 Some(FieldIndex::Numeric(idx)) => {
436 idx.matches_range(doc_id, f64::MIN, value - f64::EPSILON)
437 }
438 _ => false,
439 }
440 }
441
442 fn matches_lte(&self, doc_id: u32, field: &str, value: f64) -> bool {
443 match self.get(field) {
444 Some(FieldIndex::Numeric(idx)) => idx.matches_range(doc_id, f64::MIN, value),
445 _ => false,
446 }
447 }
448}
449
450#[derive(Debug, Clone)]
452pub enum FilterValue {
453 String(String),
454 Number(f64),
455 Bool(bool),
456}
457
458#[derive(Debug, Clone)]
460pub enum Filter {
461 Eq(String, FilterValue),
462 Ne(String, FilterValue),
463 Gt(String, f64),
464 Gte(String, f64),
465 Lt(String, f64),
466 Lte(String, f64),
467 In(String, Vec<FilterValue>),
468 And(Vec<Filter>),
469 Or(Vec<Filter>),
470 Not(Box<Filter>),
471}
472
473#[cfg(test)]
474mod tests {
475 use super::*;
476 use serde_json::json;
477
478 #[test]
479 fn test_keyword_index() {
480 let mut idx = KeywordIndex::new();
481 idx.insert(1, "apple");
482 idx.insert(2, "apple");
483 idx.insert(3, "banana");
484
485 assert!(idx.contains(1, "apple"));
486 assert!(idx.contains(2, "apple"));
487 assert!(!idx.contains(3, "apple"));
488 assert!(idx.contains(3, "banana"));
489
490 let bitmap = idx.get("apple").unwrap();
491 assert_eq!(bitmap.len(), 2);
492 }
493
494 #[test]
495 fn test_boolean_index() {
496 let mut idx = BooleanIndex::new();
497 idx.insert(1, true);
498 idx.insert(2, false);
499 idx.insert(3, true);
500
501 assert!(idx.matches(1, true));
502 assert!(!idx.matches(1, false));
503 assert!(idx.matches(2, false));
504 assert!(idx.matches(3, true));
505 }
506
507 #[test]
508 fn test_numeric_index() {
509 let mut idx = NumericIndex::new();
510 idx.insert(1, 10.0);
511 idx.insert(2, 20.0);
512 idx.insert(3, 30.0);
513
514 assert!(idx.matches_eq(1, 10.0));
516 assert!(!idx.matches_eq(1, 20.0));
517
518 let range = idx.get_range(15.0, 25.0);
520 assert!(range.contains(2));
521 assert!(!range.contains(1));
522 assert!(!range.contains(3));
523 }
524
525 #[test]
526 fn test_metadata_index_json() {
527 let mut idx = MetadataIndex::new();
528
529 idx.index_json(1, &json!({"category": "tech", "score": 85, "active": true}));
530 idx.index_json(
531 2,
532 &json!({"category": "tech", "score": 92, "active": false}),
533 );
534 idx.index_json(
535 3,
536 &json!({"category": "science", "score": 78, "active": true}),
537 );
538
539 let filter = Filter::Eq("category".into(), FilterValue::String("tech".into()));
541 assert!(idx.matches(1, &filter));
542 assert!(idx.matches(2, &filter));
543 assert!(!idx.matches(3, &filter));
544
545 let filter = Filter::Gte("score".into(), 80.0);
547 assert!(idx.matches(1, &filter));
548 assert!(idx.matches(2, &filter));
549 assert!(!idx.matches(3, &filter));
550
551 let filter = Filter::Eq("active".into(), FilterValue::Bool(true));
553 assert!(idx.matches(1, &filter));
554 assert!(!idx.matches(2, &filter));
555 assert!(idx.matches(3, &filter));
556
557 let filter = Filter::And(vec![
559 Filter::Eq("category".into(), FilterValue::String("tech".into())),
560 Filter::Eq("active".into(), FilterValue::Bool(true)),
561 ]);
562 assert!(idx.matches(1, &filter));
563 assert!(!idx.matches(2, &filter));
564 assert!(!idx.matches(3, &filter));
565 }
566
567 #[test]
568 fn test_keyword_serialize_roundtrip() {
569 let mut idx = KeywordIndex::new();
570 idx.insert(1, "apple");
571 idx.insert(2, "apple");
572 idx.insert(3, "banana");
573
574 let mut buf = Vec::new();
575 idx.serialize(&mut buf).unwrap();
576
577 let mut cursor = std::io::Cursor::new(&buf);
578 let idx2 = KeywordIndex::deserialize(&mut cursor).unwrap();
579
580 assert!(idx2.contains(1, "apple"));
581 assert!(idx2.contains(2, "apple"));
582 assert!(idx2.contains(3, "banana"));
583 }
584}