scepter 0.1.0

Composable primitives for planet-scale time-series routing, indexing, and aggregation.
Documentation
use std::collections::{HashMap, HashSet};
use std::hash::Hash;

#[derive(Debug, Clone, PartialEq, Eq)]
pub enum HintError {
    EmptyExcerpt,
}

#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ExcerptStrategy {
    Trigram,
    NGram(usize),
    Full,
}

impl ExcerptStrategy {
    pub fn excerpts(&self, value: &str) -> Vec<String> {
        match self {
            Self::Trigram => trigrams(value),
            Self::NGram(size) => ngrams(value, *size),
            Self::Full => vec![value.to_owned()],
        }
    }
}

#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct FieldHint {
    pub schema: String,
    pub field: String,
    pub excerpt: String,
}

#[derive(Debug, Clone, PartialEq, Eq)]
pub enum FieldPredicate {
    Equals(String),
    Contains(String),
    HasAllExcerpts(Vec<String>),
}

impl FieldPredicate {
    pub fn excerpts(&self, strategy: &ExcerptStrategy) -> Vec<String> {
        match self {
            Self::Equals(value) | Self::Contains(value) => strategy.excerpts(value),
            Self::HasAllExcerpts(excerpts) => excerpts.clone(),
        }
    }
}

/// Approximate field-hint index mapping searchable excerpts to child shards.
#[derive(Debug, Clone)]
pub struct FieldHintIndex<ChildId> {
    strategy: ExcerptStrategy,
    hints: HashMap<FieldHint, HashSet<ChildId>>,
}

impl<ChildId> Default for FieldHintIndex<ChildId> {
    fn default() -> Self {
        Self {
            strategy: ExcerptStrategy::Trigram,
            hints: HashMap::new(),
        }
    }
}

impl<ChildId> FieldHintIndex<ChildId>
where
    ChildId: Clone + Eq + Hash,
{
    pub fn new() -> Self {
        Self::default()
    }

    pub fn with_strategy(strategy: ExcerptStrategy) -> Self {
        Self {
            strategy,
            hints: HashMap::new(),
        }
    }

    pub fn insert_value(&mut self, schema: &str, field: &str, value: &str, child: ChildId) {
        for excerpt in self.strategy.excerpts(value) {
            self.insert_excerpt(schema, field, excerpt, child.clone());
        }
    }

    pub fn insert_excerpt(
        &mut self,
        schema: &str,
        field: &str,
        excerpt: impl Into<String>,
        child: ChildId,
    ) {
        self.hints
            .entry(FieldHint {
                schema: schema.to_owned(),
                field: field.to_owned(),
                excerpt: excerpt.into(),
            })
            .or_default()
            .insert(child);
    }

    pub fn candidates(
        &self,
        schema: &str,
        field: &str,
        predicate: &FieldPredicate,
    ) -> HashSet<ChildId> {
        let mut excerpts = predicate.excerpts(&self.strategy).into_iter();
        let Some(first) = excerpts.next() else {
            return HashSet::new();
        };

        let mut result = self.lookup(schema, field, &first);
        for excerpt in excerpts {
            let next = self.lookup(schema, field, &excerpt);
            result.retain(|child| next.contains(child));
        }
        result
    }

    pub fn candidate_union(
        &self,
        schema: &str,
        field: &str,
        predicates: &[FieldPredicate],
    ) -> HashSet<ChildId> {
        let mut result = HashSet::new();
        for predicate in predicates {
            result.extend(self.candidates(schema, field, predicate));
        }
        result
    }

    fn lookup(&self, schema: &str, field: &str, excerpt: &str) -> HashSet<ChildId> {
        self.hints
            .get(&FieldHint {
                schema: schema.to_owned(),
                field: field.to_owned(),
                excerpt: excerpt.to_owned(),
            })
            .cloned()
            .unwrap_or_default()
    }
}

pub fn trigrams(value: &str) -> Vec<String> {
    ngrams_with_padding(value, 3)
}

pub fn ngrams(value: &str, size: usize) -> Vec<String> {
    try_ngrams(value, size).unwrap_or_default()
}

pub fn try_ngrams(value: &str, size: usize) -> Result<Vec<String>, HintError> {
    if size == 0 {
        return Err(HintError::EmptyExcerpt);
    }
    Ok(ngrams_with_padding(value, size))
}

fn ngrams_with_padding(value: &str, size: usize) -> Vec<String> {
    let padding = "^".repeat(size.saturating_sub(1));
    let suffix = "$".repeat(size.saturating_sub(1));
    let padded = format!("{padding}{value}{suffix}");
    let chars: Vec<char> = padded.chars().collect();
    chars
        .windows(size)
        .map(|window| window.iter().collect())
        .collect()
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn field_hint_index_intersects_trigram_candidates() {
        let mut index = FieldHintIndex::new();
        index.insert_value("ComputeTask", "job", "monarch", "leaf-1");
        index.insert_value("ComputeTask", "job", "monitor", "leaf-2");

        let candidates = index.candidates(
            "ComputeTask",
            "job",
            &FieldPredicate::Equals("monarch".to_owned()),
        );

        assert!(candidates.contains("leaf-1"));
        assert!(!candidates.contains("leaf-2"));
    }

    #[test]
    fn full_string_strategy_indexes_metric_names() {
        let mut index = FieldHintIndex::with_strategy(ExcerptStrategy::Full);
        index.insert_value("Metric", "name", "/rpc/server/latency", "zone-1");

        let candidates = index.candidates(
            "Metric",
            "name",
            &FieldPredicate::Equals("/rpc/server/latency".to_owned()),
        );

        assert_eq!(candidates.len(), 1);
    }
}