use std::{iter, marker::PhantomData};
use rustc_hash::{FxHashMap, FxHashSet};
mod config;
pub use config::*;
pub struct QuickMatch<'a> {
config: QuickMatchConfig,
max_word_count: usize,
max_word_len: usize,
max_query_len: usize,
word_index: FxHashMap<String, FxHashSet<*const str>>,
trigram_index: FxHashMap<[char; 3], FxHashSet<*const str>>,
_phantom: PhantomData<&'a str>,
}
unsafe impl Send for QuickMatch<'_> {}
unsafe impl Sync for QuickMatch<'_> {}
impl<'a> QuickMatch<'a> {
pub fn new(items: &[&'a str]) -> Self {
Self::new_with(items, QuickMatchConfig::default())
}
pub fn new_with(items: &[&'a str], config: QuickMatchConfig) -> Self {
let mut word_index: FxHashMap<String, FxHashSet<*const str>> = FxHashMap::default();
let mut trigram_index: FxHashMap<[char; 3], FxHashSet<*const str>> = FxHashMap::default();
let mut max_word_len = 0;
let mut max_query_len = 0;
let mut max_words = 0;
let sep = sep_table(config.separators());
for &item in items {
max_query_len = max_query_len.max(item.len());
let item_words: Vec<&str> = words(item, &sep).collect();
max_words = max_words.max(item_words.len());
for word in &item_words {
max_word_len = max_word_len.max(word.len());
for len in 1..=word.len() {
word_index
.entry(word[..len].to_string())
.or_default()
.insert(item);
}
let mut chars = word.chars();
if let (Some(mut a), Some(mut b)) = (chars.next(), chars.next()) {
for c in chars {
trigram_index.entry([a, b, c]).or_default().insert(item);
a = b;
b = c;
}
}
}
for pair in item_words.windows(2) {
let compound = format!("{}{}", pair[0], pair[1]);
max_word_len = max_word_len.max(compound.len());
let from = pair[0].len() + 1;
for len in from..=compound.len() {
word_index
.entry(compound[..len].to_string())
.or_default()
.insert(item);
}
}
}
Self {
max_query_len: max_query_len + 6,
max_word_len: max_word_len + 4,
max_word_count: max_words + 2,
word_index,
trigram_index,
config,
_phantom: PhantomData,
}
}
pub fn matches(&self, query: &str) -> Vec<&'a str> {
self.matches_with(query, &self.config)
}
pub fn matches_with(&self, query: &str, config: &QuickMatchConfig) -> Vec<&'a str> {
let limit = config.limit();
let trigram_budget = config.trigram_budget();
let query: String = query
.trim()
.chars()
.filter(|c| c.is_ascii())
.map(|c| c.to_ascii_lowercase())
.collect();
if query.is_empty() || query.len() > self.max_query_len {
return vec![];
}
let sep = sep_table(config.separators());
let mut query_words: Vec<&str> = vec![];
for w in words(&query, &sep) {
if w.len() <= self.max_word_len && !query_words.contains(&w) {
query_words.push(w);
}
}
if query_words.is_empty() || query_words.len() > self.max_word_count {
return vec![];
}
let mut unknown_words: Vec<&str> = vec![];
let mut known_sets: Vec<&FxHashSet<*const str>> = vec![];
for &word in &query_words {
if let Some(items) = self.word_index.get(word) {
known_sets.push(items)
} else if word.len() >= 3 && unknown_words.len() < trigram_budget {
unknown_words.push(word)
}
}
let pool = Self::intersect_sets(&known_sets);
if !unknown_words.is_empty() && trigram_budget > 0 {
let min_len = query.len().saturating_sub(3);
let (scores, hit_count) =
self.score_trigrams(&unknown_words, trigram_budget, pool.as_ref(), min_len);
let min_score = hit_count.div_ceil(2).max(config.min_score());
let results = Self::rank(
scores.into_iter().filter(|(_, s)| *s >= min_score),
&query_words,
&sep,
limit,
);
if !results.is_empty() {
return results;
}
}
let candidates = pool.unwrap_or_else(|| Self::union_sets(&known_sets));
Self::rank(
candidates.into_iter().map(|p| (p, 0)),
&query_words,
&sep,
limit,
)
}
fn intersect_sets(sets: &[&FxHashSet<*const str>]) -> Option<FxHashSet<*const str>> {
let (smallest_idx, smallest) = sets
.iter()
.copied()
.enumerate()
.min_by_key(|(_, s)| s.len())?;
let mut result = smallest.clone();
for (i, set) in sets.iter().enumerate() {
if i == smallest_idx {
continue;
}
result.retain(|ptr| set.contains(ptr));
if result.is_empty() {
return None;
}
}
Some(result)
}
fn union_sets(sets: &[&FxHashSet<*const str>]) -> FxHashSet<*const str> {
sets.iter().flat_map(|s| s.iter().copied()).collect()
}
fn rank(
candidates: impl IntoIterator<Item = (*const str, usize)>,
query_words: &[&str],
sep: &[bool; 256],
limit: usize,
) -> Vec<&'a str> {
let mut buckets: Vec<Vec<(&str, usize, usize)>> = vec![vec![]; query_words.len() + 1];
for (item, fuzzy) in candidates {
let s = unsafe { &*item as &'a str };
let (matched, position) = word_match(s, query_words, sep);
buckets[matched].push((s, fuzzy, position));
}
let mut results = Vec::with_capacity(limit);
for bucket in buckets.iter_mut().rev() {
if bucket.is_empty() {
continue;
}
bucket.sort_unstable_by(|a, b| {
b.1.cmp(&a.1) .then(a.2.cmp(&b.2)) .then(a.0.len().cmp(&b.0.len())) .then(a.0.cmp(b.0)) });
results.extend(bucket.iter().take(limit - results.len()).map(|&(s, ..)| s));
if results.len() >= limit {
break;
}
}
results
}
fn score_trigrams(
&self,
unknown_words: &[&str],
trigram_budget: usize,
pool: Option<&FxHashSet<*const str>>,
min_len: usize,
) -> (FxHashMap<*const str, usize>, usize) {
let mut scores: FxHashMap<*const str, usize> = FxHashMap::default();
scores.reserve(256);
if let Some(pool) = pool {
for &item in pool {
scores.insert(item, 1);
}
}
let has_pool = pool.is_some();
let mut budget = trigram_budget;
let mut hit_count = 0;
let mut visited: FxHashSet<[char; 3]> = FxHashSet::default();
'outer: for round in 0..trigram_budget {
for word in unknown_words {
if budget == 0 {
break 'outer;
}
let bytes = word.as_bytes();
let Some(pos) = trigram_position(bytes.len(), round) else {
continue;
};
let trigram = [
bytes[pos] as char,
bytes[pos + 1] as char,
bytes[pos + 2] as char,
];
if !visited.insert(trigram) {
continue;
}
budget -= 1;
let Some(items) = self.trigram_index.get(&trigram) else {
continue;
};
hit_count += 1;
if has_pool {
for &item in items {
if let Some(score) = scores.get_mut(&item) {
*score += 1;
}
}
} else {
for &item in items {
if unsafe { &*item }.len() >= min_len {
*scores.entry(item).or_default() += 1;
}
}
}
}
}
(scores, hit_count)
}
}
fn sep_table(separators: &[char]) -> [bool; 256] {
let mut table = [false; 256];
for &c in separators {
if (c as usize) < 256 {
table[c as usize] = true;
}
}
table
}
fn words<'s>(text: &'s str, sep: &'s [bool; 256]) -> impl Iterator<Item = &'s str> {
let bytes = text.as_bytes();
let mut i = 0;
iter::from_fn(move || {
while i < bytes.len() && sep[bytes[i] as usize] {
i += 1;
}
let start = i;
while i < bytes.len() && !sep[bytes[i] as usize] {
i += 1;
}
(i > start).then(|| &text[start..i])
})
}
fn word_match(item: &str, query_words: &[&str], sep: &[bool; 256]) -> (usize, usize) {
let mut matched = 0;
let mut position = 0;
for iw in words(item, sep) {
if query_words
.get(matched)
.is_some_and(|qw| iw.starts_with(*qw))
{
matched += 1;
} else if matched == 0 {
position += 1;
}
}
(matched, position)
}
fn trigram_position(len: usize, round: usize) -> Option<usize> {
let max = len - 3;
if round == 0 {
return Some(0);
}
if round == 1 && max > 0 {
return Some(max);
}
if round == 2 && max > 1 {
return Some(max / 2);
}
if max <= 2 {
return None;
}
let mid = max / 2;
let offset = (round - 2) >> 1;
let pos = if round & 1 == 1 {
mid.saturating_sub(offset)
} else {
mid + offset
};
if pos == 0 || pos >= max || pos == mid {
None
} else {
Some(pos)
}
}