//! Fuzzy matching magic
//!
//! The algorithm is a custom port of the [`fuzzaldrin-plus`][fuzzaldrinplus] algorithm.
//! This is a package used in Atom editor.
//!
//! NOTE: The only part missing (I think) from the original algorithm is the path score bonus
//!
//! ### References
//! * [jeancroy/fuzz-aldrin-plus/src/scorer.coffee#L83](https://github.com/jeancroy/fuzz-aldrin-plus/blob/84eac1d73bacbbd11978e6960f4aa89f8396c540/src/scorer.coffee#L83)
//! * [jeancroy/fuzz-aldrin-plus/src/matcher.coffee#L172](https://github.com/jeancroy/fuzz-aldrin-plus/blob/84eac1d73bacbbd11978e6960f4aa89f8396c540/src/matcher.coffee#L172)
//!
//! [fuzzaldrinplus]: https://github.com/jeancroy/fuzz-aldrin-plus
mod predicates;
mod scoring;
mod types;
use predicates::*;
use scoring::*;
use types::*;
pub use types::{Candidate, Query};
use crate::common::Text;
use rayon::prelude::*;
// Max number missed consecutive hit = ceil(MISS_COEFF * query.len()) + 5
const MISS_COEFF: f32 = 0.75;
/// Search for candidates that fuzzy-match a query
///
/// * If the query is empty it just returns the same pool of candidates
/// * Otherwise it will try to compute the best match for each candidate.
/// * If `preserve_order` is not set, the candidates will be sorted from higher
/// score to lower.
pub fn search<'pool>(
q: &str,
pool: &'pool impl IntoParallelRefIterator<'pool, Item = &'pool Text>,
preserve_order: bool,
) -> Vec<Candidate> {
let mut matches: Vec<Candidate>;
if q.is_empty() {
matches = pool.par_iter().map(|txt| txt.into()).collect();
} else {
let query: Query = q.into();
matches = pool
.par_iter()
.map(|c| compute_match(&query, c))
.filter(|c| c.is_some())
.map(|c| c.unwrap())
.collect();
if !preserve_order {
matches.par_sort_unstable_by(|a, b| b.cmp(a));
}
}
matches
}
/// This function will return a Candidate with the computed score and matches.
fn compute_match(query: &Query, subject: &Text) -> Option<Candidate> {
if query.is_empty() {
return None;
}
// -----------------------------------------------------------------
// Check if the query is inside the subject
if !is_match(query, subject) {
return None;
}
// -----------------------------------------------------------------
// Acronym sequence
let acronym_score = match score_acronyms(query, subject) {
None => 0.0,
Some(acronym) => {
if acronym.count == query.len() {
let score =
score_quality(query.len(), subject.len(), acronym.score, acronym.position);
let matches = acronym.matches;
return Some(Candidate::new(subject, score, matches));
}
acronym.score
}
};
// -----------------------------------------------------------------
// Exact Match
if let Some(result) = score_exact_match(query, subject) {
let score = result.score;
let matches = result.matches;
return Some(Candidate::new(subject, score, matches));
}
// -----------------------------------------------------------------
// Individual characters
// (Smith Waterman algorithm)
// init
let scored_size = score_size(query.len(), subject.len());
// keep track of the calculated scores of query letters in the row
let mut score_row = vec![0.0_f32; query.len()];
// keep track of the consecutive scores of query letters in the row
let mut consecutive_row = vec![0.0_f32; query.len()];
let miss_budget = (MISS_COEFF * query.len() as f32).ceil() + 5.0;
let mut miss_left = miss_budget;
let mut should_rebuild = true;
// trace matrix, this is used to recover best matches positions
let mut trace = TraceMatrix::new(subject.len(), query.len());
// the algorithm works over a matrix where columns represent query letters
// and rows are subject letters
//
// For example, for query "fft" and subject "fxfxtx", the matrix would be
//
// | f | f | t | Where the symbols mean:
// ---------------- * (^) Look for the score/match in the upper row
// f | d | < | < | * (<) Look for the score/match in the left column
// x | ^ | ^ | ^ | * (d) This is a good match, use this score and move
// f | ^ | d | < | in diagonal (up/left)
// x | ^ | ^ | ^ |
// t | ^ | ^ | d |
// x | ^ | ^ | ^ |
// ----------------
let subject_iter = subject.lowercase_iter().enumerate();
'subject_loop: for (subject_index, subject_grapheme) in subject_iter {
// for every letter in the subject we move one row in the matrix
if !query.contains(subject_grapheme) {
if should_rebuild {
// by resetting the consecutive_row to 0 we force the next
// query letter match to recalculate its consecutive score
consecutive_row = vec![0.0_f32; query.len()];
should_rebuild = false;
}
continue 'subject_loop;
}
// current row best score
let mut score = 0.0;
// score from the up/left position of the matrix
let mut score_diag = 0.0;
// consecutive score from the up/left position of the matrix
let mut consecutive_diag = 0.0;
let mut record_miss = true;
should_rebuild = true;
let query_iter = query.lowercase_iter().enumerate();
for (query_index, query_grapheme) in query_iter {
// for every letter in the query we move one column in the matrix
let mut consecutive_score = 0.0;
let score_up = score_row[query_index];
if score_up >= score {
// the upper score is better than the one that has been accumulated
// so far
score = score_up;
trace.up_at(query_index, subject_index);
} else {
trace.left_at(query_index, subject_index);
}
if query_grapheme == subject_grapheme {
let is_start = is_start_of_word(subject, subject_index);
if consecutive_diag > 0.0 {
// if the upper/left (diagonal) position has a consecutive_score that means
// this current column in this row is a consecutive letter from that score
// so it can be reused
consecutive_score = consecutive_diag;
} else {
// this current letter doesn't follow consecutive letters in the subject,
// so it needs to recalculate the consecutive score
consecutive_score =
score_consecutives(query, subject, query_index, subject_index, is_start);
}
let score_char =
score_character(subject_index, is_start, acronym_score, consecutive_score);
let aligned_score = score_diag + score_char;
if aligned_score > score {
// the current calculated score (aligned_score) is better than the accumulated,
// so it can be set as the accumulated
score = aligned_score;
// this position is marked as diagonal because the score is the highest so far
trace.diagonal_at(query_index, subject_index);
miss_left = miss_budget;
} else {
if record_miss {
miss_left -= 1.0;
if miss_left <= 0.0 {
let final_score =
score.max(score_row[query.last_index()]) * scored_size;
if final_score <= 0.0 {
return None;
} else {
let matches = trace.traceback(query_index, subject_index);
return Some(Candidate::new(subject, final_score, matches));
}
}
}
record_miss = false;
}
}
// before moving to the next column, put the upper score as the
// diagonal score (the position is moved one to the right in the
// next iteration)
score_diag = score_up;
// save the score in the row it will be used as the score_up
// in the next subject's iteration
score_row[query_index] = score;
// the same for consecutive scores
consecutive_diag = consecutive_row[query_index];
consecutive_row[query_index] = consecutive_score;
if score <= 0.0 {
trace.stop_at(query_index, subject_index);
}
}
}
// after all iterations the highest score will be in the last position
let final_score = score_row[query.last_index()] * scored_size;
if final_score == 0.0 {
None
} else {
let matches = trace.traceback(query.last_index(), subject.last_index());
Some(Candidate::new(subject, final_score, matches))
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::common::TextBuilder;
#[test]
fn matches_test() {
let cases: Vec<(Query, &str, Vec<usize>)> = vec![
// Exact acronym
("fft".into(), "FirstFactoryTest", vec![0, 5, 12]),
// Extra acronym letters
("fft".into(), "FirstFactoryTest.ts", vec![0, 5, 12]),
// Exact match
("core".into(), "0core0app.rb", vec![1, 2, 3, 4]),
// Exact match, second position is better
("core".into(), "0core0app_core.rb", vec![10, 11, 12, 13]),
// Consecutive letters
("core".into(), "controller", vec![0, 1, 4, 8]),
];
for (query, string, expected) in cases {
let subject = TextBuilder::build(string);
let result = compute_match(&query, &subject);
assert!(result.is_some());
let result = result.unwrap();
assert_eq!(
result.matches, expected,
"Expected {} to have matches {:?} inside {}",
query, expected, subject,
);
}
}
#[test]
fn compute_match_on_different_queries_test() {
let cases: Vec<(Query, Query, &str)> = vec![
// Acronym wins
("psh".into(), "push".into(), "Plus: Stage Hunk"),
// Exact world wins
("Hello".into(), "he".into(), "Hello World"),
// More consecutive chars wins
("ello".into(), "hllo".into(), "Hello World"),
];
for (a, b, string) in cases {
let subject = TextBuilder::build(string);
let result_a = compute_match(&a, &subject);
let result_b = compute_match(&b, &subject);
assert!(result_a.is_some());
assert!(result_b.is_some());
let result_a = result_a.unwrap();
let result_b = result_b.unwrap();
assert!(
result_a.score() > result_b.score(),
"Expected {} to have a higher score than {} inside {}",
a,
b,
subject,
);
}
}
}