//! mindb 0.1.2
//!
//! Lightweight embedded key–value store with write-ahead log and zstd compression.
//!
//! Full-text search sidecar implemented as an inverted index.
#![allow(dead_code)]

use std::collections::{HashMap, HashSet};
use std::sync::Arc;

use parking_lot::RwLock;

use crate::index::{SequenceNumber, VersionPointer};

/// In-memory inverted index mapping each lowercase token to the set of
/// document keys containing it, along with the latest `VersionPointer`
/// recorded for that (token, key) pair.
///
/// Cloning is cheap: the index state lives behind an `Arc`, so clones share
/// the same underlying map. Concurrent access is mediated by a
/// `parking_lot::RwLock` (many readers or one writer).
#[derive(Clone, Default)]
pub struct FullTextSidecar {
    // token -> (document key -> newest version pointer seen for that key)
    inner: Arc<RwLock<HashMap<String, HashMap<Vec<u8>, VersionPointer>>>>,
}

impl FullTextSidecar {
    /// Creates a new inverted index.
    pub fn new() -> Self {
        Self {
            inner: Arc::new(RwLock::new(HashMap::new())),
        }
    }

    /// Tokenises the provided text and indexes each token to the key.
    pub fn index_document(&self, text: &str, key: Vec<u8>, pointer: VersionPointer) {
        let tokens = FullTextSidecar::tokenise(text);
        let mut guard = self.inner.write();
        for token in tokens {
            let entry = guard.entry(token).or_default();
            match entry.get_mut(&key) {
                Some(existing) if existing.sequence >= pointer.sequence => continue,
                Some(existing) => *existing = pointer.clone(),
                None => {
                    entry.insert(key.clone(), pointer.clone());
                }
            }
        }
    }

    /// Returns the keys matching all the provided terms and visible under the snapshot.
    pub fn search(&self, terms: &[String], snapshot: SequenceNumber) -> Vec<Vec<u8>> {
        if terms.is_empty() {
            return Vec::new();
        }

        let guard = self.inner.read();
        let mut iter = terms.iter();
        let first = match iter.next() {
            Some(term) => guard.get(term.as_str()).cloned().unwrap_or_default(),
            None => return Vec::new(),
        };

        let mut keys: HashSet<Vec<u8>> = first
            .into_iter()
            .filter(|(_, pointer)| pointer.is_visible_at(snapshot))
            .map(|(key, _)| key)
            .collect();

        for term in iter {
            if let Some(mapping) = guard.get(term.as_str()) {
                let term_keys: HashSet<Vec<u8>> = mapping
                    .iter()
                    .filter(|(_, pointer)| pointer.is_visible_at(snapshot))
                    .map(|(key, _)| key.clone())
                    .collect();
                keys = keys
                    .intersection(&term_keys)
                    .cloned()
                    .collect::<HashSet<_>>();
            } else {
                return Vec::new();
            }
        }

        keys.into_iter().collect()
    }

    fn tokenise(text: &str) -> Vec<String> {
        text.split(|c: char| !c.is_alphanumeric())
            .filter(|token| !token.is_empty())
            .map(|token| token.to_lowercase())
            .collect()
    }
}