use std::collections::{HashMap, HashSet};
use std::hash::Hash;
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum HintError {
EmptyExcerpt,
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ExcerptStrategy {
Trigram,
NGram(usize),
Full,
}
impl ExcerptStrategy {
pub fn excerpts(&self, value: &str) -> Vec<String> {
match self {
Self::Trigram => trigrams(value),
Self::NGram(size) => ngrams(value, *size),
Self::Full => vec![value.to_owned()],
}
}
}
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct FieldHint {
pub schema: String,
pub field: String,
pub excerpt: String,
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum FieldPredicate {
Equals(String),
Contains(String),
HasAllExcerpts(Vec<String>),
}
impl FieldPredicate {
pub fn excerpts(&self, strategy: &ExcerptStrategy) -> Vec<String> {
match self {
Self::Equals(value) | Self::Contains(value) => strategy.excerpts(value),
Self::HasAllExcerpts(excerpts) => excerpts.clone(),
}
}
}
#[derive(Debug, Clone)]
pub struct FieldHintIndex<ChildId> {
strategy: ExcerptStrategy,
hints: HashMap<FieldHint, HashSet<ChildId>>,
}
impl<ChildId> Default for FieldHintIndex<ChildId> {
fn default() -> Self {
Self {
strategy: ExcerptStrategy::Trigram,
hints: HashMap::new(),
}
}
}
impl<ChildId> FieldHintIndex<ChildId>
where
ChildId: Clone + Eq + Hash,
{
pub fn new() -> Self {
Self::default()
}
pub fn with_strategy(strategy: ExcerptStrategy) -> Self {
Self {
strategy,
hints: HashMap::new(),
}
}
pub fn insert_value(&mut self, schema: &str, field: &str, value: &str, child: ChildId) {
for excerpt in self.strategy.excerpts(value) {
self.insert_excerpt(schema, field, excerpt, child.clone());
}
}
pub fn insert_excerpt(
&mut self,
schema: &str,
field: &str,
excerpt: impl Into<String>,
child: ChildId,
) {
self.hints
.entry(FieldHint {
schema: schema.to_owned(),
field: field.to_owned(),
excerpt: excerpt.into(),
})
.or_default()
.insert(child);
}
pub fn candidates(
&self,
schema: &str,
field: &str,
predicate: &FieldPredicate,
) -> HashSet<ChildId> {
let mut excerpts = predicate.excerpts(&self.strategy).into_iter();
let Some(first) = excerpts.next() else {
return HashSet::new();
};
let mut result = self.lookup(schema, field, &first);
for excerpt in excerpts {
let next = self.lookup(schema, field, &excerpt);
result.retain(|child| next.contains(child));
}
result
}
pub fn candidate_union(
&self,
schema: &str,
field: &str,
predicates: &[FieldPredicate],
) -> HashSet<ChildId> {
let mut result = HashSet::new();
for predicate in predicates {
result.extend(self.candidates(schema, field, predicate));
}
result
}
fn lookup(&self, schema: &str, field: &str, excerpt: &str) -> HashSet<ChildId> {
self.hints
.get(&FieldHint {
schema: schema.to_owned(),
field: field.to_owned(),
excerpt: excerpt.to_owned(),
})
.cloned()
.unwrap_or_default()
}
}
pub fn trigrams(value: &str) -> Vec<String> {
ngrams_with_padding(value, 3)
}
pub fn ngrams(value: &str, size: usize) -> Vec<String> {
try_ngrams(value, size).unwrap_or_default()
}
pub fn try_ngrams(value: &str, size: usize) -> Result<Vec<String>, HintError> {
if size == 0 {
return Err(HintError::EmptyExcerpt);
}
Ok(ngrams_with_padding(value, size))
}
fn ngrams_with_padding(value: &str, size: usize) -> Vec<String> {
let padding = "^".repeat(size.saturating_sub(1));
let suffix = "$".repeat(size.saturating_sub(1));
let padded = format!("{padding}{value}{suffix}");
let chars: Vec<char> = padded.chars().collect();
chars
.windows(size)
.map(|window| window.iter().collect())
.collect()
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn field_hint_index_intersects_trigram_candidates() {
let mut index = FieldHintIndex::new();
index.insert_value("ComputeTask", "job", "monarch", "leaf-1");
index.insert_value("ComputeTask", "job", "monitor", "leaf-2");
let candidates = index.candidates(
"ComputeTask",
"job",
&FieldPredicate::Equals("monarch".to_owned()),
);
assert!(candidates.contains("leaf-1"));
assert!(!candidates.contains("leaf-2"));
}
#[test]
fn full_string_strategy_indexes_metric_names() {
let mut index = FieldHintIndex::with_strategy(ExcerptStrategy::Full);
index.insert_value("Metric", "name", "/rpc/server/latency", "zone-1");
let candidates = index.candidates(
"Metric",
"name",
&FieldPredicate::Equals("/rpc/server/latency".to_owned()),
);
assert_eq!(candidates.len(), 1);
}
}