use bit_set::BitSet;
use java_asm::StrRef;
use nucleo_matcher::pattern::{CaseMatching, Normalization, Pattern};
use nucleo_matcher::{Config, Matcher, Utf32Str};
use std::cmp::{min, Reverse};
use std::ops::Deref;
use std::sync::Arc;
pub struct FuzzyMatchModel {
input: StrRef,
items: Vec<StrRef>,
top_n: usize,
matcher: Matcher,
pattern: Pattern,
inc_infos: IncrementalInfos,
}
type IncrementalInfos = Vec<Option<SearchResult>>;
impl FuzzyMatchModel {
pub fn new(
input: StrRef, items: &[StrRef], top_n: usize,
) -> Self {
let config = Config::DEFAULT.match_paths();
let matcher = Matcher::new(config);
let pattern = Pattern::parse(&input, CaseMatching::Ignore, Normalization::Never);
let items = items.to_vec();
let inc_info = vec![None; items.len() + 1];
FuzzyMatchModel {
input,
items,
top_n,
matcher,
pattern,
inc_infos: inc_info,
}
}
pub fn search_with_new_input(&mut self, new_input: StrRef) -> SearchResult {
let old_input = self.input.clone();
let old_len = old_input.len();
let new_len = new_input.len();
let mut previous_info: Option<SearchResult> = None;
if let Some(Some(previous)) = self.inc_infos.get(old_len) {
previous_info = Some(previous.clone());
}
if old_len == new_len && new_input == old_input {
if let Some(result) = previous_info {
return result.clone();
}
} else {
self.input = new_input.clone();
self.pattern.reparse(&new_input, CaseMatching::Ignore, Normalization::Never);
}
let result: SearchResult = previous_info
.map(|inc_info| {
self.inc_case_1(&old_input, &new_input, &inc_info)
.or_else(|| self.inc_case_2(&old_input, &new_input))
.unwrap_or_else(|| self.full_search())
})
.unwrap_or_else(|| self.full_search());
result
}
fn inc_case_1(
&mut self, old_input: &str, new_input: &str,
old_inc_info: &SearchResult,
) -> Option<SearchResult> {
let old_len = old_input.len();
let new_len = new_input.len();
if new_len < old_len || !new_input.starts_with(old_input) { return None; };
let SearchResult { stop_idx, items: inc_items } = old_inc_info;
let inc_items: Vec<StrRef> = inc_items.iter().map(|item| Arc::clone(&item.item)).collect();
let search_result = self.search_in_ranges(&inc_items, *stop_idx);
self.inc_infos.resize(new_len + 1, None);
self.inc_infos[new_len] = Some(search_result.clone());
Some(search_result)
}
fn inc_case_2(
&mut self, old_input: &str, new_input: &str,
) -> Option<SearchResult> {
let mut common_prefix_len = 0usize;
let old_bytes = old_input.as_bytes();
let new_bytes = new_input.as_bytes();
let min_len = min(old_bytes.len(), new_bytes.len());
for i in 0..min_len {
if old_bytes[i] == new_bytes[i] {
common_prefix_len = i + 1;
} else {
break;
}
}
let Some(existed_result) = self.inc_infos.get(common_prefix_len)? else {
return None;
};
let new_result = existed_result.clone();
self.inc_infos.resize(new_input.len() + 1, None);
Some(new_result)
}
pub fn full_search(&mut self) -> SearchResult {
let SearchResult { stop_idx, items } = self.search_in_ranges(&[], 0);
let new_inc_info = SearchResult { stop_idx, items };
self.inc_infos.clear();
self.inc_infos.resize(self.input.len() + 1, None);
self.inc_infos[self.input.len()] = Some(new_inc_info.clone());
new_inc_info
}
fn search_in_ranges(
&mut self, items_1: &[StrRef], items_2_start_idx: usize,
) -> SearchResult {
let Self { top_n, items: all_items, matcher, pattern, .. } = self;
let top_n = *top_n;
let mut buf = Vec::new();
let mut items: Vec<(u32, SearchResultItem)> = Vec::with_capacity(top_n);
for item in items_1 {
if items.len() >= top_n {
break;
}
let haystack = Utf32Str::new(item.as_ref(), &mut buf);
let mut indices_record: Vec<u32> = vec![];
let score = pattern
.indices(haystack, matcher, &mut indices_record);
let Some(score) = score else {
continue;
};
let mut indices = BitSet::new();
for idx in indices_record {
indices.insert(idx as usize);
}
let result_item = SearchResultItem {
item: Arc::clone(item),
indices,
};
items.push((score, result_item));
}
let mut stop_idx = items_2_start_idx;
for (idx, item) in all_items.iter().enumerate().skip(stop_idx) {
if items.len() >= top_n {
break;
}
stop_idx = idx;
let haystack = Utf32Str::new(item.as_ref(), &mut buf);
let mut indices_record: Vec<u32> = vec![];
let score = pattern
.indices(haystack, matcher, &mut indices_record);
let Some(score) = score else {
continue;
};
let mut indices = BitSet::new();
for idx in indices_record {
indices.insert(idx as usize);
}
let result_item = SearchResultItem {
item: Arc::clone(item),
indices,
};
items.push((score, result_item));
}
items.sort_by_key(|(score, _)| Reverse(*score));
let items: Vec<_> = items.into_iter().map(|(_, item)| item).collect();
SearchResult { stop_idx, items }
}
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SearchResultItem {
pub item: StrRef,
pub indices: BitSet,
}
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct SearchResult {
pub stop_idx: usize,
pub items: Vec<SearchResultItem>,
}
#[cfg(test)]
mod tests {
use crate::impls::fuzzy::{FuzzyMatchModel, SearchResult, SearchResultItem};
use bit_set::BitSet;
use java_asm::{vec_str_ref, StrRef};
use rand::prelude::SliceRandom;
use rand::rng;
#[test]
fn test_fuzzy_match() {
let input: StrRef = "abc".into();
let items: Vec<StrRef> = vec_str_ref![
"apple/banana",
"apple/banana/cake",
];
let mut model = FuzzyMatchModel::new(input, &items, 10);
let mut expected_bits = BitSet::new();
expected_bits.insert(0);
expected_bits.insert(6);
expected_bits.insert(13);
let expected_result_1 = SearchResult {
stop_idx: 1,
items: vec![
SearchResultItem {
item: "apple/banana/cake".into(),
indices: expected_bits,
}
],
};
assert_eq!(model.full_search(), expected_result_1);
let real_result = model.search_with_new_input("abn".into());
let mut expected_bits = BitSet::new();
expected_bits.insert(0);
expected_bits.insert(6);
expected_bits.insert(8);
let expected_result_2 = SearchResult {
stop_idx: 1,
items: vec![
SearchResultItem {
item: "apple/banana".into(),
indices: expected_bits.clone(),
},
SearchResultItem {
item: "apple/banana/cake".into(),
indices: expected_bits,
}
],
};
assert_eq!(real_result, expected_result_2);
let real_result = model.search_with_new_input("abcd".into());
let expected_result_3 = SearchResult {
stop_idx: 1,
items: vec![],
};
assert_eq!(real_result, expected_result_3);
}
#[test]
fn test_huge_input() {
let sample_size = 100_000;
let input: StrRef = "im2z".into();
let items: Vec<StrRef> = (0..sample_size).map(|i|
format!("item/{}/i12k/pc1i1/z", i).into()
).collect();
let start = std::time::Instant::now();
let mut model = FuzzyMatchModel::new(input, &items, 100_000);
let result = model.full_search();
println!("Cost time: {:?}ms for 100K items", start.elapsed().as_millis());
assert_eq!(result.items.len(), sample_size);
}
#[test]
fn test_huge_input_take_1000() {
let sample_size = 50_000;
let input: StrRef = "im21".into();
let items_1 = (0..sample_size).map(|i|
format!("item/{}/i12k/pc1i1/z", i).into()
);
let items_2 = (0..sample_size).map(|i|
format!("item/{}", i).into()
);
let mut items: Vec<StrRef> = items_1.chain(items_2).collect();
items.shuffle(&mut rng());
let start = std::time::Instant::now();
let mut model = FuzzyMatchModel::new(input, &items, 10000);
let result = model.full_search();
println!("Cost time: {:?}ms for take 10000 items", start.elapsed().as_millis());
assert_eq!(result.items.len(), 10000);
let start = std::time::Instant::now();
let result = model.search_with_new_input("im21z".into());
println!("Cost time: {:?}ms for take 10000 items next time", start.elapsed().as_millis());
assert_eq!(result.items.len(), 10000);
}
}