use crate::types::{Document, DocumentId, Value};
use std::collections::{HashMap, HashSet};
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum IndexType {
Hash,
BTree,
FullText,
Unique,
}
pub struct DocumentIndex {
field: String,
index_type: IndexType,
hash_index: HashMap<IndexKey, HashSet<DocumentId>>,
text_index: Option<InvertedIndex>,
}
impl DocumentIndex {
pub fn new(field: impl Into<String>, index_type: IndexType) -> Self {
let text_index = if index_type == IndexType::FullText {
Some(InvertedIndex::new())
} else {
None
};
Self {
field: field.into(),
index_type,
hash_index: HashMap::new(),
text_index,
}
}
pub fn field(&self) -> &str {
&self.field
}
pub fn index_type(&self) -> IndexType {
self.index_type
}
pub fn index_document(&mut self, doc: &Document) {
let value = doc.get(&self.field);
if let Some(value) = value {
match self.index_type {
IndexType::FullText => {
if let Some(text) = value.as_str() {
if let Some(ref mut idx) = self.text_index {
idx.index_document(&doc.id, text);
}
}
}
_ => {
let key = IndexKey::from_value(value);
self.hash_index
.entry(key)
.or_default()
.insert(doc.id.clone());
}
}
}
}
pub fn unindex_document(&mut self, doc: &Document) {
let value = doc.get(&self.field);
if let Some(value) = value {
match self.index_type {
IndexType::FullText => {
if let Some(text) = value.as_str() {
if let Some(ref mut idx) = self.text_index {
idx.unindex_document(&doc.id, text);
}
}
}
_ => {
let key = IndexKey::from_value(value);
if let Some(ids) = self.hash_index.get_mut(&key) {
ids.remove(&doc.id);
if ids.is_empty() {
self.hash_index.remove(&key);
}
}
}
}
}
}
pub fn find_eq(&self, value: &Value) -> Vec<DocumentId> {
let key = IndexKey::from_value(value);
self.hash_index
.get(&key)
.map(|ids| ids.iter().cloned().collect())
.unwrap_or_default()
}
pub fn search(&self, query: &str) -> Vec<DocumentId> {
self.text_index
.as_ref()
.map(|idx| idx.search(query))
.unwrap_or_default()
}
pub fn clear(&mut self) {
self.hash_index.clear();
if let Some(ref mut idx) = self.text_index {
idx.clear();
}
}
pub fn key_count(&self) -> usize {
self.hash_index.len()
}
}
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
enum IndexKey {
Null,
Bool(bool),
Int(i64),
String(String),
Float(OrderedFloat),
}
impl IndexKey {
fn from_value(value: &Value) -> Self {
match value {
Value::Null => Self::Null,
Value::Bool(b) => Self::Bool(*b),
Value::Int(n) => Self::Int(*n),
Value::Float(f) => Self::Float(OrderedFloat(*f)),
Value::String(s) => Self::String(s.clone()),
Value::Array(_) | Value::Object(_) => Self::Null,
}
}
}
#[derive(Debug, Clone, Copy)]
struct OrderedFloat(f64);
impl PartialEq for OrderedFloat {
fn eq(&self, other: &Self) -> bool {
self.0.to_bits() == other.0.to_bits()
}
}
impl Eq for OrderedFloat {}
impl std::hash::Hash for OrderedFloat {
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
self.0.to_bits().hash(state);
}
}
pub struct InvertedIndex {
terms: HashMap<String, HashSet<DocumentId>>,
doc_terms: HashMap<DocumentId, HashSet<String>>,
}
impl InvertedIndex {
pub fn new() -> Self {
Self {
terms: HashMap::new(),
doc_terms: HashMap::new(),
}
}
pub fn index_document(&mut self, doc_id: &DocumentId, text: &str) {
let tokens = self.tokenize(text);
let mut doc_term_set = HashSet::new();
for token in tokens {
self.terms
.entry(token.clone())
.or_default()
.insert(doc_id.clone());
doc_term_set.insert(token);
}
self.doc_terms.insert(doc_id.clone(), doc_term_set);
}
pub fn unindex_document(&mut self, doc_id: &DocumentId, text: &str) {
let tokens = self.tokenize(text);
for token in &tokens {
if let Some(docs) = self.terms.get_mut(token) {
docs.remove(doc_id);
if docs.is_empty() {
self.terms.remove(token);
}
}
}
self.doc_terms.remove(doc_id);
}
pub fn search(&self, query: &str) -> Vec<DocumentId> {
let tokens = self.tokenize(query);
if tokens.is_empty() {
return Vec::new();
}
let mut result: Option<HashSet<DocumentId>> = None;
for token in tokens {
let matching = self.terms.get(&token).cloned().unwrap_or_default();
result = Some(match result {
Some(current) => current.intersection(&matching).cloned().collect(),
None => matching,
});
}
result
.map(|set| set.into_iter().collect())
.unwrap_or_default()
}
pub fn search_any(&self, query: &str) -> Vec<DocumentId> {
let tokens = self.tokenize(query);
let mut result = HashSet::new();
for token in tokens {
if let Some(docs) = self.terms.get(&token) {
result.extend(docs.iter().cloned());
}
}
result.into_iter().collect()
}
pub fn clear(&mut self) {
self.terms.clear();
self.doc_terms.clear();
}
pub fn term_count(&self) -> usize {
self.terms.len()
}
fn tokenize(&self, text: &str) -> Vec<String> {
text.to_lowercase()
.split(|c: char| !c.is_alphanumeric())
.filter(|s| !s.is_empty() && s.len() >= 2)
.map(String::from)
.collect()
}
}
impl Default for InvertedIndex {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_hash_index() {
let mut index = DocumentIndex::new("status", IndexType::Hash);
let mut doc1 = Document::with_id("doc1");
doc1.set("status", "active");
let mut doc2 = Document::with_id("doc2");
doc2.set("status", "active");
let mut doc3 = Document::with_id("doc3");
doc3.set("status", "inactive");
index.index_document(&doc1);
index.index_document(&doc2);
index.index_document(&doc3);
let result = index.find_eq(&Value::String("active".to_string()));
assert_eq!(result.len(), 2);
let result = index.find_eq(&Value::String("inactive".to_string()));
assert_eq!(result.len(), 1);
}
#[test]
fn test_inverted_index() {
let mut index = InvertedIndex::new();
let doc1 = DocumentId::new("doc1");
let doc2 = DocumentId::new("doc2");
index.index_document(&doc1, "The quick brown fox");
index.index_document(&doc2, "The lazy brown dog");
let result = index.search("brown");
assert_eq!(result.len(), 2);
let result = index.search("quick");
assert_eq!(result.len(), 1);
let result = index.search("quick brown");
assert_eq!(result.len(), 1);
}
#[test]
fn test_full_text_index() {
let mut index = DocumentIndex::new("content", IndexType::FullText);
let mut doc1 = Document::with_id("doc1");
doc1.set("content", "Hello world");
let mut doc2 = Document::with_id("doc2");
doc2.set("content", "Goodbye world");
index.index_document(&doc1);
index.index_document(&doc2);
let result = index.search("world");
assert_eq!(result.len(), 2);
let result = index.search("hello");
assert_eq!(result.len(), 1);
}
#[test]
fn test_unindex() {
let mut index = DocumentIndex::new("name", IndexType::Hash);
let mut doc = Document::with_id("doc1");
doc.set("name", "Test");
index.index_document(&doc);
assert_eq!(index.find_eq(&Value::String("Test".to_string())).len(), 1);
index.unindex_document(&doc);
assert_eq!(index.find_eq(&Value::String("Test".to_string())).len(), 0);
}
}