#![crate_name = "fiveo"]
#![crate_type = "lib"]
#![feature(test)]
#![cfg_attr(feature = "webassembly", no_std)]
#![cfg_attr(feature = "webassembly", feature(alloc, core_intrinsics, core_float))]
#[cfg(feature = "webassembly")]
extern crate alloc;
#[cfg(feature = "webassembly")]
use {alloc::{BinaryHeap, Vec}, core::{cmp, iter, str, f32, num::Float}};
#[cfg(not(feature = "webassembly"))]
use std::{cmp, iter, str, collections::BinaryHeap, f32};
#[derive(Clone, Debug, PartialEq, Eq)]
struct CandidateBitmask(u32);
impl CandidateBitmask {
pub fn from(values: &mut Iterator<Item = char>) -> Self {
let mut bitmask = 0 as u32;
loop {
match values.next() {
Some(val) if val.is_ascii() && val.is_alphabetic() => {
bitmask |= 1 << (val as u8 - 'a' as u8);
}
Some(_) => continue,
None => break,
}
}
CandidateBitmask(bitmask)
}
fn matches(&self, other: &Self) -> bool {
(self.0 & other.0) == self.0
}
}
#[derive(Clone, Debug, PartialEq)]
struct Candidate<'a> {
value: &'a str,
mask: CandidateBitmask,
}
impl<'a> Candidate<'a> {
fn from(value: &'a str) -> Candidate<'a> {
let mask = CandidateBitmask::from(&mut value.to_ascii_lowercase().chars());
Candidate { value, mask }
}
}
#[derive(Clone, Debug, PartialEq, PartialOrd)]
pub struct SearchResult {
index: usize,
score: f32,
}
impl Eq for SearchResult {}
impl Ord for SearchResult {
fn cmp(&self, other: &Self) -> cmp::Ordering {
self.score.partial_cmp(&other.score).unwrap()
}
}
impl SearchResult {
pub fn index(&self) -> usize {
self.index
}
pub fn score(&self) -> f32 {
self.score
}
}
pub struct MatcherParameters {
pub slash_bonus: f32,
pub separator_bonus: f32,
pub camelcase_bonus: f32,
pub period_bonus: f32,
pub max_gap: usize,
pub cache_size: usize,
pub distance_penalty: f32,
pub min_distance_penalty: f32,
pub cumulative_distance_penalty: f32,
}
impl Default for MatcherParameters {
fn default() -> Self {
MatcherParameters {
camelcase_bonus: 0.8f32,
separator_bonus: 0.8f32,
slash_bonus: 0.9f32,
period_bonus: 0.7f32,
max_gap: 10,
cache_size: 2000,
distance_penalty: 0.6f32,
min_distance_penalty: 0.2f32 + f32::EPSILON,
cumulative_distance_penalty: 0.05f32,
}
}
}
pub struct Matcher<'a> {
candidates: Vec<Candidate<'a>>,
parameters: MatcherParameters,
}
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum MatcherError {
TextEncoding(str::Utf8Error),
}
impl<'a> Matcher<'a> {
pub fn new(dictionary: &str, parameters: MatcherParameters) -> Result<Matcher, MatcherError> {
let candidates = dictionary
.lines()
.map(|line| Candidate::from(line))
.collect();
Ok(Matcher {
candidates,
parameters,
})
}
pub fn search(&self, query: &str, max_results: usize) -> Vec<SearchResult> {
let query_lowercase = query.to_lowercase();
let query_mask = CandidateBitmask::from(&mut query_lowercase.chars());
let mut result_heap: BinaryHeap<SearchResult> = BinaryHeap::with_capacity(max_results);
for (candidate_index, candidate) in self.candidates.iter().enumerate() {
if !query_mask.matches(&candidate.mask) {
continue;
}
let cache_len = query.len() * candidate.value.len();
let mut match_idx_cache = Vec::with_capacity(cache_len);
let mut match_score_cache = Vec::with_capacity(cache_len);
match_idx_cache.resize(cache_len, None);
match_score_cache.resize(cache_len, None);
let score = self.score_candidate(
query,
candidate,
&mut match_idx_cache,
&mut match_score_cache,
);
if score > 0.0f32 {
let is_higher_scoring = result_heap
.peek()
.map(|rs| score > rs.score())
.unwrap_or(false);
if is_higher_scoring || result_heap.len() < max_results {
result_heap.push(SearchResult {
index: candidate_index,
score,
})
}
if result_heap.capacity() > max_results {
result_heap.pop();
}
}
}
result_heap.into_sorted_vec()
}
fn score_candidate(
&self,
query: &str,
candidate: &Candidate,
match_idx_cache: &mut Vec<Option<usize>>,
match_score_cache: &mut Vec<Option<f32>>,
) -> f32 {
let query_len = query.len();
let candidate_len = candidate.value.len();
let score = query_len as f32
* self.score_candidate_recursive(
&mut query.char_indices().peekable(),
query_len,
&mut candidate.value.char_indices().peekable(),
candidate_len,
match_idx_cache,
match_score_cache,
);
score.max(0.)
}
fn score_candidate_recursive(
&self,
query_chars: &mut iter::Peekable<str::CharIndices>,
query_len: usize,
candidate_chars: &mut iter::Peekable<str::CharIndices>,
candidate_len: usize,
match_idx_cache: &mut Vec<Option<usize>>,
match_score_cache: &mut Vec<Option<f32>>,
) -> f32 {
let mut score = 0.0f32;
let (query_char_index, query_char) = match query_chars.next() {
Some((idx, ch)) => (idx, ch),
None => return 1.0f32,
};
let candidate_start_index = match candidate_chars.peek() {
Some(&(idx, _)) => idx,
None => return 1.0f32,
};
let cache_offset = query_char_index * candidate_len + candidate_start_index;
if let Some(cached_score) = match_score_cache[cache_offset] {
return cached_score;
}
let mut best_match_index: Option<usize> = None;
let mut remaining_candidate_chars = self.parameters.max_gap;
let mut last_candidate_char: Option<char> = None;
let mut last_slash: Option<usize> = None;
let mut distance_penalty = self.parameters.distance_penalty;
while let Some((candidate_char_index, candidate_char)) = candidate_chars.next() {
if remaining_candidate_chars == 0 {
break;
}
if query_char_index == 0 && (query_char == '\\' || query_char == '/') {
last_slash = Some(candidate_char_index);
}
if query_char == candidate_char
|| query_char.to_lowercase().eq(candidate_char.to_lowercase())
{
let query_char_score = if candidate_char_index == candidate_start_index {
1.0f32
} else {
match last_candidate_char {
Some(val) => match val {
'/' => self.parameters.slash_bonus,
'-' | '_' | ' ' | '0'...'9' => self.parameters.separator_bonus,
'a'...'z' if candidate_char.is_uppercase() => {
self.parameters.camelcase_bonus
}
'.' => self.parameters.period_bonus,
_ => distance_penalty,
},
_ => distance_penalty,
}
};
if query_char_index > 0 && distance_penalty > self.parameters.min_distance_penalty {
distance_penalty -= self.parameters.cumulative_distance_penalty;
}
let mut new_score = query_char_score
* self.score_candidate_recursive(
query_chars,
query_len,
candidate_chars,
candidate_len,
match_idx_cache,
match_score_cache,
);
if query_char_index == 0 {
new_score /= (candidate_len - last_slash.unwrap_or(0)) as f32;
}
if new_score > score {
score = new_score;
best_match_index = Some(candidate_char_index);
if f32::abs(score - 1.0f32) < f32::EPSILON {
break;
}
}
}
last_candidate_char = Some(candidate_char);
remaining_candidate_chars -= 1;
}
match_score_cache[cache_offset] = Some(score);
match_idx_cache[cache_offset] = best_match_index;
score
}
}
#[cfg(test)]
mod tests {
extern crate test;
use super::*;
use self::test::{black_box, Bencher};
#[test]
pub fn exact_match_has_perfect_score() {
let searcher = Matcher::new("my_word1", MatcherParameters::default()).unwrap();
let results = searcher.search("my_word1", 1);
assert_eq!(1.0f32, results[0].score());
}
#[bench]
fn unreal_engine_search(b: &mut Bencher) {
let searcher = Matcher::new(
include_str!("../benchmark_data/ue4_file_list.txt"),
MatcherParameters::default(),
).unwrap();
b.iter(|| black_box(searcher.search("file.cpp", 100)))
}
}