use ahash::{AHashMap, AHashSet};
use itertools::Itertools;
use std::cmp;
use std::cmp::Ordering;
use std::fs::File;
use std::io::{BufRead, BufReader, BufWriter, Write};
use std::path::Path;
use unicode_normalization::UnicodeNormalization;
#[cfg(not(all(target_feature = "aes", target_feature = "sse2")))]
use ahash::RandomState;
#[cfg(not(all(target_feature = "aes", target_feature = "sse2")))]
use std::sync::LazyLock;
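/// Fallback hasher state with fixed seeds, so delete hashes stay identical
/// across runs and processes (e.g. when an index is serialized with the
/// `serde` feature).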
#[cfg(not(all(target_feature = "aes", target_feature = "sse2")))]
pub static HASHER_32: LazyLock<RandomState> =
LazyLock::new(|| RandomState::with_seeds(805272099, 242851902, 646123436, 591410655));
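/// 32-bit hash of a term's bytes via gxhash, used to key the `deletes` map
/// (requires the `aes` and `sse2` target features).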
#[inline]
#[cfg(all(target_feature = "aes", target_feature = "sse2"))]
pub(crate) fn hash32(term_bytes: &[u8]) -> u32 {
use gxhash::gxhash32;
gxhash32(term_bytes, 1234)
}
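/// Portable fallback: the same 32-bit term hash computed with seeded ahash
/// when gxhash's required CPU features are unavailable.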
#[inline]
#[cfg(not(all(target_feature = "aes", target_feature = "sse2")))]
pub(crate) fn hash32(term_bytes: &[u8]) -> u32 {
HASHER_32.hash_one(term_bytes) as u32
}
use std::{cmp::min, mem};
use smallvec::{smallvec, SmallVec};
const VEC_SIZE: usize = 16;
pub type FastVec<T> = SmallVec<[T; VEC_SIZE]>;
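/// Optimal String Alignment (restricted Damerau-Levenshtein) distance
/// between `a` and `b`, computed over chars with three rolling rows of
/// length `|b| + 1`. Returns `None` if the distance exceeds `max_distance`.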
pub fn damerau_levenshtein_osa(a: &str, b: &str, max_distance: usize) -> Option<usize> {
let b_len = b.chars().count();
let mut prev_two_distances: FastVec<usize> = (0..b_len + 1).collect();
let mut prev_distances: FastVec<usize> = (0..b_len + 1).collect();
let mut curr_distances: FastVec<usize> = smallvec![0; b_len + 1];
let mut prev_a_char = char::MAX;
let mut prev_b_char = char::MAX;
for (i, a_char) in a.chars().enumerate() {
curr_distances[0] = i + 1;
for (j, b_char) in b.chars().enumerate() {
let cost = usize::from(a_char != b_char);
curr_distances[j + 1] = min(
curr_distances[j] + 1,
min(prev_distances[j + 1] + 1, prev_distances[j] + cost),
);
if i > 0 && j > 0 && a_char != b_char && a_char == prev_b_char && b_char == prev_a_char
{
curr_distances[j + 1] = min(curr_distances[j + 1], prev_two_distances[j - 1] + 1);
}
prev_b_char = b_char;
}
mem::swap(&mut prev_two_distances, &mut prev_distances);
mem::swap(&mut prev_distances, &mut curr_distances);
prev_a_char = a_char;
}
if prev_distances[b_len] <= max_distance {
Some(prev_distances[b_len])
} else {
None
}
}
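/// Applies Unicode Normalization Form KC (compatibility decomposition
/// followed by canonical composition).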
pub fn unicode_normalization_form_kc(input: &str) -> String {
    input.nfkc().collect::<String>()
}
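/// Copies the casing pattern of `source` onto `target` so corrections can
/// be returned in the caller's original case, e.g. ("Wrold", "world") ->
/// "World"; extra characters in `target` inherit the preceding case.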
pub fn transfer_case(source: &str, target: &str) -> String {
if source == target || source.eq_ignore_ascii_case(target) {
return source.to_string();
}
let mut result = String::new();
    use itertools::EitherOrBoth;
let mut last_upper = false;
for pair in source.chars().zip_longest(target.chars()) {
match pair {
EitherOrBoth::Both(s, t) => {
if s.is_uppercase() {
if !result.is_empty() {
last_upper = true;
}
                result.extend(t.to_uppercase());
}
else if s.is_whitespace() && !t.is_whitespace() && last_upper {
                result.extend(t.to_uppercase());
} else {
last_upper = false;
result.push(t);
}
}
EitherOrBoth::Left(_) => (),
EitherOrBoth::Right(t) => {
if last_upper {
                    result.extend(t.to_uppercase());
} else {
result.push(t);
}
}
}
}
result
}
/// Lowercases `text` and splits it into word tokens: alphanumerics plus
/// `_` and apostrophes continue a token, everything else separates.
pub fn parse_words(text: &str) -> Vec<String> {
    // Rough capacity guess: one word per ~8 bytes of input.
    let mut non_unique_terms_line: Vec<String> = Vec::with_capacity(text.len() >> 3);
    let text_normalized = text.to_lowercase();
    let mut start = false;
    let mut start_pos = 0;
    for (pos, ch) in text_normalized.char_indices() {
        start = match ch {
            token if token.is_alphanumeric() => {
                if !start {
                    start_pos = pos;
                }
                true
            }
            // Apostrophes and underscores join words ("don't"); they can
            // also open a token, so record the start position here too.
            '_' | '\'' | '’' => {
                if !start {
                    start_pos = pos;
                }
                true
            }
            _ => {
                if start {
                    non_unique_terms_line.push(text_normalized[start_pos..pos].to_string());
                }
                false
            }
        };
    }
    if start {
        non_unique_terms_line.push(text_normalized[start_pos..].to_string());
    }
    non_unique_terms_line
}
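/// Number of chars in `s` (not its byte length).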
fn len(s: &str) -> usize {
s.chars().count()
}
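/// Returns `s` with the char at char index `index` removed.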
fn remove(s: &str, index: usize) -> String {
s.chars()
.enumerate()
.filter(|(ii, _)| ii != &index)
.map(|(_, ch)| ch)
.collect()
}
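/// Substring of `s` over the char-index range `[start, end)`.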
fn slice(s: &str, start: usize, end: usize) -> String {
s.chars().skip(start).take(end - start).collect()
}
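/// The chars of `s` from char index `start` to the end.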
fn suffix(s: &str, start: usize) -> String {
s.chars().skip(start).collect::<String>()
}
/// Character at char index `i`, or `None` when out of range.
fn at(s: &str, i: isize) -> Option<char> {
    // Byte length bounds char count from above, so this is just a cheap
    // early-out; `nth` still returns `None` for any remaining overshoot.
    if i < 0 || i >= s.len() as isize {
        return None;
    }
    s.chars().nth(i as usize)
}
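/// One segmentation candidate built by `word_segmentation`: the spaced
/// string plus its accumulated edit distance and log word-probability.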
#[derive(Debug, Clone)]
pub struct Composition {
pub segmented_string: String,
pub distance_sum: usize,
pub prob_log_sum: f64,
}
impl Composition {
fn empty() -> Self {
Self {
segmented_string: "".to_string(),
distance_sum: 0,
prob_log_sum: 0.0,
}
}
}
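/// A spelling correction: the suggested term, its edit distance from the
/// input, and the term's frequency count in the dictionary.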
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct Suggestion {
pub term: String,
pub distance: usize,
pub count: usize,
}
impl Suggestion {
fn empty() -> Suggestion {
Suggestion {
term: "".to_string(),
distance: 0,
count: 0,
}
}
fn new(term: impl Into<String>, distance: usize, count: usize) -> Suggestion {
Suggestion {
term: term.into(),
distance,
count,
}
}
}
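// Suggestions order by ascending distance, ties broken by descending count,
// so the best correction sorts first.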
impl Ord for Suggestion {
fn cmp(&self, other: &Suggestion) -> Ordering {
let distance_cmp = self.distance.cmp(&other.distance);
if distance_cmp == Ordering::Equal {
return other.count.cmp(&self.count);
}
distance_cmp
}
}
impl PartialOrd for Suggestion {
fn partial_cmp(&self, other: &Suggestion) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl PartialEq for Suggestion {
fn eq(&self, other: &Suggestion) -> bool {
self.distance == other.distance && self.count == other.count
}
}
impl Eq for Suggestion {}
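/// How many results `lookup` keeps: the single best (`Top`), every match at
/// the smallest distance found (`Closest`), or all matches within the
/// maximum edit distance (`All`).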
#[derive(Eq, PartialEq, Debug)]
pub enum Verbosity {
Top,
Closest,
All,
}
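/// Symmetric Delete spelling-correction index. `deletes` maps the 32-bit
/// hash of each prefix delete to the dictionary terms that produce it,
/// `words` holds unigram counts, and `bigrams` optionally improves
/// `lookup_compound` ranking.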
#[derive(PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct SymSpell {
max_dictionary_edit_distance: usize,
term_length_threshold: Option<Vec<usize>>,
prefix_length: usize,
count_threshold: usize,
corpus_word_count: usize,
max_dictionary_term_length: usize,
deletes: AHashMap<u32, Vec<Box<str>>>,
words: AHashMap<Box<str>, usize>,
bigrams: AHashMap<Box<str>, usize>,
bigram_min_count: usize,
}
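/// Recursively collects every variant of `word` reachable by deleting
/// single chars, up to `max_term_edit_distance` deletions, into
/// `delete_words`.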
fn edits(
word: &str,
edit_distance: usize,
max_term_edit_distance: usize,
delete_words: &mut AHashSet<String>,
) {
let edit_distance = edit_distance + 1;
let word_len = len(word);
if word_len > 1 {
for i in 0..word_len {
let delete = remove(word, i);
if !delete_words.contains(&delete) {
delete_words.insert(delete.clone());
if edit_distance < max_term_edit_distance {
edits(&delete, edit_distance, max_term_edit_distance, delete_words);
}
}
}
}
}
impl SymSpell {
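    /// Creates an empty index. `corpus_word_count` starts at the token
    /// count of the Google Books Ngram corpus, the default N used by
    /// SymSpell's probability estimates.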
pub fn new(
max_dictionary_edit_distance: usize,
term_length_threshold: Option<Vec<usize>>,
prefix_length: usize,
count_threshold: usize,
) -> Self {
Self {
            max_dictionary_edit_distance,
            term_length_threshold,
            prefix_length,
            count_threshold,
            // Token count of the Google Books Ngram corpus.
            corpus_word_count: 1_024_908_267_229,
            max_dictionary_term_length: 0,
deletes: AHashMap::new(),
words: AHashMap::new(),
bigrams: AHashMap::new(),
bigram_min_count: usize::MAX,
}
}
pub fn get_dictionary_size(&self) -> usize {
self.words.len()
}
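    /// Writes the unigram dictionary to `path` as `term<separator>count`
    /// lines, most frequent terms first.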
pub fn save_dictionary(
&self,
path: &Path,
separator: &str,
) -> Result<(), Box<dyn std::error::Error>> {
let file = File::create(path)?;
let mut writer = BufWriter::new(file);
for (entry, count) in self
.words
.iter()
.sorted_unstable_by(|a, b| Ord::cmp(&b.1, &a.1))
{
writeln!(writer, "{}{}{}", entry, separator, count)?;
}
writer.flush()?;
Ok(())
}
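    /// Loads a unigram frequency dictionary, one term/count pair per line;
    /// `term_index` and `count_index` pick the columns after splitting each
    /// line on `separator`.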
pub fn load_dictionary(
&mut self,
path: &Path,
term_index: usize,
count_index: usize,
separator: &str,
) -> Result<(), Box<dyn std::error::Error>> {
let file = File::open(path)?;
let sr = BufReader::new(file);
for line in sr.lines() {
let line_str = line?;
self.load_dictionary_line(&line_str, term_index, count_index, separator);
}
Ok(())
}
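    /// Parses a single dictionary line and inserts it; lines with too few
    /// columns are ignored.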
pub fn load_dictionary_line(
&mut self,
line: &str,
term_index: usize,
count_index: usize,
separator: &str,
) -> bool {
        let line_parts: Vec<&str> = line.split(separator).collect();
        if line_parts.len() >= 2 {
            let key = line_parts[term_index].to_string();
            // Skip lines with a malformed count instead of panicking.
            if let Ok(count) = line_parts[count_index].parse::<usize>() {
                self.create_dictionary_entry(key, count);
            }
        }
true
}
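    /// Loads a bigram frequency dictionary, used by `lookup_compound` to
    /// rank merge and split candidates.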
pub fn load_bigram_dictionary(
&mut self,
path: &Path,
term_index: usize,
count_index: usize,
separator: &str,
) -> Result<(), Box<dyn std::error::Error>> {
let file = File::open(path)?;
let sr = BufReader::new(file);
for line in sr.lines() {
let line_str = line?;
self.load_bigram_dictionary_line(&line_str, term_index, count_index, separator);
}
Ok(())
}
pub fn load_bigram_dictionary_line(
&mut self,
line: &str,
term_index: usize,
count_index: usize,
separator: &str,
) -> bool {
let line_parts: Vec<&str> = line.split(separator).collect();
        // With a space separator the bigram itself spans two whitespace-
        // separated tokens, so a valid line needs at least three fields.
        let line_parts_len = if separator == " " { 3 } else { 2 };
        if line_parts.len() >= line_parts_len {
            let key = if separator == " " {
                [line_parts[term_index], line_parts[term_index + 1]].join(" ")
            } else {
                line_parts[term_index].to_string()
            };
            // Skip lines with a malformed count instead of panicking.
            if let Ok(count) = line_parts[count_index].parse::<usize>() {
                self.bigrams.insert(key.into_boxed_str(), count);
                if count < self.bigram_min_count {
                    self.bigram_min_count = count;
                }
            }
}
true
}
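    /// Finds dictionary terms within `max_edit_distance` of `input` using
    /// symmetric deletes: deletes of the input's prefix are matched against
    /// the precomputed deletes of dictionary terms, and surviving candidates
    /// are verified with the OSA distance.
    ///
    /// Illustrative usage; the dictionary file name is an assumption (any
    /// file in the standard SymSpell `term count` format works):
    ///
    /// ```ignore
    /// let mut sym = SymSpell::new(2, None, 7, 1);
    /// sym.load_dictionary(Path::new("frequency_dictionary_en_82_765.txt"), 0, 1, " ")?;
    /// let hits = sym.lookup("wrold", Verbosity::Top, 2, &None, Some(1), false);
    /// assert_eq!(hits[0].term, "world");
    /// ```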
pub fn lookup(
&self,
input: &str,
verbosity: Verbosity,
max_edit_distance: usize,
term_length_threshold: &Option<Vec<usize>>,
max_results: Option<usize>,
preserve_case: bool,
) -> Vec<Suggestion> {
if max_edit_distance + term_length_threshold.as_ref().map_or(0, |x| x.len())
> self.max_dictionary_edit_distance
+ self.term_length_threshold.as_ref().map_or(0, |x| x.len())
{
println!("max_edit_distance is bigger than max_dictionary_edit_distance");
}
let max_term_edit_distance =
term_length_threshold
.as_ref()
.map_or(max_edit_distance, |term_length_threshold| {
let term_len = len(input);
let mut max_term_edit_distance = 0;
for (i, threshold) in term_length_threshold.iter().enumerate() {
if term_len >= *threshold {
max_term_edit_distance = max_edit_distance + i
} else {
break;
}
}
max_term_edit_distance
});
let mut suggestions: Vec<Suggestion> = Vec::new();
let input_lower_case = input.to_lowercase();
let input_original_case = if preserve_case {
input
} else {
&input_lower_case
};
let input = input_lower_case.as_str();
let input_len = len(input);
if input_len as isize - max_term_edit_distance as isize
> self.max_dictionary_term_length as isize
{
return suggestions;
}
let mut hashset1: AHashSet<String> = AHashSet::new();
let mut hashset2: AHashSet<String> = AHashSet::new();
if self.words.contains_key(input) {
let suggestion_count = self.words[input];
suggestions.push(Suggestion::new(input_original_case, 0, suggestion_count));
if verbosity != Verbosity::All {
return suggestions;
}
}
if max_term_edit_distance == 0 {
return suggestions;
}
hashset2.insert(input.to_string());
let mut max_edit_distance2 = max_term_edit_distance;
let mut candidate_pointer = 0;
let mut candidates = Vec::new();
let mut input_prefix_len = input_len;
if input_prefix_len > self.prefix_length {
input_prefix_len = self.prefix_length;
candidates.push(slice(input, 0, input_prefix_len));
} else {
candidates.push(input.to_string());
}
while candidate_pointer < candidates.len() {
            let candidate = &candidates[candidate_pointer].clone();
candidate_pointer += 1;
let candidate_len = len(candidate);
let length_diff = input_prefix_len as isize - candidate_len as isize;
if length_diff > max_edit_distance2 as isize {
if verbosity == Verbosity::All {
continue;
}
break;
}
let hash = hash32(candidate.as_bytes());
if self.deletes.contains_key(&hash) {
let dict_suggestions = &self.deletes[&hash];
for suggestion in dict_suggestions {
let suggestion_len = len(suggestion);
if suggestion.as_ref() == input {
continue;
}
if suggestion_len.abs_diff(input_len) > max_edit_distance2
|| suggestion_len < candidate_len
|| (suggestion_len == candidate_len && suggestion.as_ref() != candidate)
{
continue;
}
let sugg_prefix_len = min(suggestion_len, self.prefix_length);
if sugg_prefix_len > input_prefix_len
&& sugg_prefix_len - candidate_len > max_edit_distance2
{
continue;
}
let distance;
if candidate_len == 0 {
distance = cmp::max(input_len, suggestion_len);
if distance > max_edit_distance2 || hashset2.contains(suggestion.as_ref()) {
continue;
}
hashset2.insert(suggestion.to_string());
} else if suggestion_len == 1 {
distance = if !input.contains(&slice(suggestion, 0, 1)) {
input_len
} else {
input_len - 1
};
if distance > max_edit_distance2 || hashset2.contains(suggestion.as_ref()) {
continue;
}
hashset2.insert(suggestion.to_string());
} else if self.has_different_suffix(
max_term_edit_distance,
input,
input_len,
candidate_len,
suggestion,
suggestion_len,
) {
continue;
} else {
if verbosity != Verbosity::All
&& !self.delete_in_suggestion_prefix(
candidate,
candidate_len,
suggestion,
suggestion_len,
)
{
continue;
}
if hashset2.contains(suggestion.as_ref()) {
continue;
}
hashset2.insert(suggestion.to_string());
                        distance = match damerau_levenshtein_osa(input, suggestion, max_edit_distance2)
                        {
                            Some(d) => d,
                            None => continue,
                        };
}
if distance <= max_edit_distance2 {
let suggestion_count = self.words[suggestion];
let si = Suggestion::new(suggestion.as_ref(), distance, suggestion_count);
if !suggestions.is_empty() {
match verbosity {
Verbosity::Closest => {
if distance < max_edit_distance2 {
suggestions.clear();
}
}
Verbosity::Top => {
if distance < max_edit_distance2
|| suggestion_count > suggestions[0].count
{
max_edit_distance2 = distance;
suggestions[0] = si;
}
continue;
}
_ => (),
}
}
if verbosity != Verbosity::All {
max_edit_distance2 = distance;
}
suggestions.push(si);
}
}
}
if length_diff < max_term_edit_distance as isize && candidate_len <= self.prefix_length
{
if verbosity != Verbosity::All && length_diff >= max_edit_distance2 as isize {
continue;
}
for i in 0..candidate_len {
let delete = remove(candidate, i);
if !hashset1.contains(&delete) {
hashset1.insert(delete.clone());
candidates.push(delete);
}
}
}
}
if suggestions.len() > 1 {
suggestions.sort_unstable();
}
if preserve_case {
for suggestion in suggestions.iter_mut() {
suggestion.term = transfer_case(input_original_case, &suggestion.term);
}
}
if let Some(max_results) = max_results {
suggestions.truncate(max_results);
}
suggestions
}
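    /// Corrects multi-word input that may contain run-on words, wrongly
    /// split words, and misspellings: each token is looked up on its own,
    /// then compared against merging with its predecessor and against every
    /// two-way split, with bigram counts (when loaded) arbitrating. Always
    /// returns exactly one combined suggestion.
    ///
    /// Illustrative usage, assuming unigram and bigram dictionaries are
    /// already loaded:
    ///
    /// ```ignore
    /// let fixed = sym.lookup_compound("whereis th elove", 2, &None, false);
    /// assert_eq!(fixed[0].term, "where is the love");
    /// ```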
pub fn lookup_compound(
&self,
input: &str,
edit_distance_max: usize,
term_length_threshold: &Option<Vec<usize>>,
preserve_case: bool,
) -> Vec<Suggestion> {
let term_list1 = parse_words(input);
        let mut suggestions: Vec<Suggestion>;
        let mut suggestion_parts: Vec<Suggestion> = Vec::new();
let mut last_combi = false;
for (i, term) in term_list1.iter().enumerate() {
suggestions = self.lookup(
term,
Verbosity::Top,
edit_distance_max,
term_length_threshold,
None,
false,
);
if i > 0 && !last_combi {
let mut suggestions_combi: Vec<Suggestion> = self.lookup(
&[term_list1[i - 1].as_str(), term_list1[i].as_str()].join(""),
Verbosity::Top,
edit_distance_max,
term_length_threshold,
None,
false,
);
if !suggestions_combi.is_empty() {
                    // Safe: every earlier loop iteration pushed exactly one part.
                    let best1 = suggestion_parts.last().unwrap().clone();
let best2 = if !suggestions.is_empty() {
suggestions[0].clone()
} else {
                        Suggestion::new(
                            term_list1[i].as_str(),
                            edit_distance_max + 1,
                            // Estimated count for an unknown word:
                            // 10 / 10^len, as in upstream SymSpell.
                            (10f64 / 10usize.saturating_pow(len(&term_list1[i]) as u32) as f64)
                                as usize,
                        )
};
let distance1 = best1.distance + best2.distance;
if suggestions_combi[0].distance + 1 < distance1
|| (suggestions_combi[0].distance + 1 == distance1
&& (suggestions_combi[0].count
> (best1.count as f64 / self.corpus_word_count as f64 * best2.count as f64) as usize))
{
suggestions_combi[0].distance += 1;
                        *suggestion_parts.last_mut().unwrap() = suggestions_combi[0].clone();
last_combi = true;
continue;
}
}
}
last_combi = false;
if !suggestions.is_empty()
&& ((suggestions[0].distance == 0) || (len(&term_list1[i]) == 1))
{
suggestion_parts.push(suggestions[0].clone());
} else {
let mut suggestion_split_best = if !suggestions.is_empty() {
suggestions[0].clone()
} else {
Suggestion::empty()
};
let term_length = len(&term_list1[i]);
if term_length > 1 {
for j in 1..term_length {
let part1 = slice(&term_list1[i], 0, j);
let part2 = slice(&term_list1[i], j, term_length);
let mut suggestion_split = Suggestion::empty();
let suggestions1 = self.lookup(
&part1,
Verbosity::Top,
edit_distance_max,
term_length_threshold,
None,
false,
);
if !suggestions1.is_empty() {
let suggestions2 = self.lookup(
&part2,
Verbosity::Top,
edit_distance_max,
term_length_threshold,
None,
false,
);
if !suggestions2.is_empty() {
suggestion_split.term =
[suggestions1[0].term.as_str(), suggestions2[0].term.as_str()]
.join(" ");
                            let distance2 = damerau_levenshtein_osa(
                                &term_list1[i],
                                &suggestion_split.term,
                                edit_distance_max,
                            )
                            .unwrap_or(edit_distance_max + 1);
if !suggestion_split_best.term.is_empty() {
if distance2 > suggestion_split_best.distance {
continue;
}
if distance2 < suggestion_split_best.distance {
suggestion_split_best = Suggestion::empty();
}
}
let bigram_count: usize =
match self.bigrams.get(&*suggestion_split.term) {
Some(&bigram_frequency) => {
if !suggestions.is_empty() {
let best_si = &suggestions[0];
if suggestion_split.term == term_list1[i] {
cmp::max(bigram_frequency, best_si.count + 2)
} else if suggestions1[0].term == best_si.term
|| suggestions2[0].term == best_si.term
{
cmp::max(bigram_frequency, best_si.count + 1)
} else {
bigram_frequency
}
} else if suggestion_split.term == term_list1[i] {
cmp::max(
bigram_frequency,
cmp::max(
suggestions1[0].count,
suggestions2[0].count,
) + 2,
)
} else {
bigram_frequency
}
}
None => {
min(
self.bigram_min_count,
(suggestions1[0].count as f64
/ self.corpus_word_count as f64
* suggestions2[0].count as f64)
as usize,
)
}
};
suggestion_split.distance = distance2;
suggestion_split.count = bigram_count;
if suggestion_split_best.term.is_empty()
|| suggestion_split.count > suggestion_split_best.count
{
suggestion_split_best = suggestion_split.clone();
}
}
}
}
if !suggestion_split_best.term.is_empty() {
suggestion_parts.push(suggestion_split_best.clone());
} else {
let mut si = Suggestion::empty();
si.term = term_list1[i].clone();
si.count =
(10f64 / 10usize.saturating_pow(term_length as u32) as f64) as usize;
si.distance = edit_distance_max + 1;
suggestion_parts.push(si);
}
} else {
let mut si = Suggestion::empty();
si.term = term_list1[i].clone();
si.count = (10f64 / 10usize.saturating_pow(term_length as u32) as f64) as usize;
si.distance = edit_distance_max + 1;
suggestion_parts.push(si);
}
}
}
let mut suggestion = Suggestion::empty();
        // Combined count: N * Π(count_i / N), the parts' unigram
        // probabilities multiplied together and scaled back to a count.
        let mut tmp_count: f64 = self.corpus_word_count as f64;
let mut s = "".to_string();
for si in suggestion_parts {
s.push_str(&si.term);
s.push(' ');
tmp_count *= si.count as f64 / self.corpus_word_count as f64;
}
let output = s.trim();
suggestion.count = tmp_count as usize;
suggestion.distance =
damerau_levenshtein_osa(&input.to_lowercase(), output, usize::MAX).unwrap();
suggestion.term = if preserve_case {
transfer_case(input, output)
} else {
output.to_lowercase()
};
vec![suggestion]
}
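    /// Inserts missing spaces into `input` while also correcting spelling,
    /// using dynamic programming over a circular buffer of `len(input)`
    /// compositions: the best segmentation is kept per end position, ranked
    /// first by edit distance and then by unigram log-probability.
    ///
    /// Illustrative usage, assuming a loaded unigram dictionary:
    ///
    /// ```ignore
    /// let seg = sym.word_segmentation("thequickbrownfox", 0);
    /// assert_eq!(seg.segmented_string, "the quick brown fox");
    /// ```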
pub fn word_segmentation(&self, input: &str, max_edit_distance: usize) -> Composition {
let input = &unicode_normalization_form_kc(input);
let asize = len(input);
let mut ci: usize = 0;
let mut compositions: Vec<Composition> = vec![Composition::empty(); asize];
for j in 0..asize {
let imax = min(asize - j, self.max_dictionary_term_length);
for i in 1..=imax {
let mut part = slice(input, j, j + i);
let mut sep_len = 0;
let mut top_ed = 0;
let first_char = at(&part, 0).unwrap();
if first_char.is_whitespace() {
part = remove(&part, 0);
} else {
sep_len = 1;
}
                // ASCII spaces are one byte each, so the byte-length delta
                // counts the spaces stripped from this part.
                top_ed += part.len();
                part = part.replace(' ', "");
                top_ed -= part.len();
let results =
self.lookup(&part, Verbosity::Top, max_edit_distance, &None, None, true);
let top_prob_log = if !results.is_empty() {
if results[0].distance > 0 {
part = results[0].term.clone();
top_ed += results[0].distance;
}
(results[0].count as f64 / self.corpus_word_count as f64).log10()
} else {
let part_len = len(&part);
top_ed += part_len;
(10.0 / (self.corpus_word_count as f64 * 10.0f64.powf(part_len as f64))).log10()
};
            // Destination slot in the circular composition buffer.
            let di = (i + ci) % asize;
if j == 0 {
compositions[i - 1] = Composition {
segmented_string: part.to_owned(),
distance_sum: top_ed,
prob_log_sum: top_prob_log,
};
} else if i == self.max_dictionary_term_length
|| (((compositions[ci].distance_sum + top_ed == compositions[di].distance_sum)
|| (compositions[ci].distance_sum + sep_len + top_ed
== compositions[di].distance_sum))
&& (compositions[di].prob_log_sum
< compositions[ci].prob_log_sum + top_prob_log))
|| (compositions[ci].distance_sum + sep_len + top_ed
< compositions[di].distance_sum)
{
                    // Attach bare punctuation (or a lone ’, which is three
                    // bytes in UTF-8) to the previous token without a space.
                    if ((part.len() == 1) && part.chars().next().unwrap().is_ascii_punctuation())
                        || ((part.len() == 3) && part.starts_with('’'))
{
compositions[di] = Composition {
segmented_string: [
compositions[ci].segmented_string.as_str(),
part.as_str(),
]
.join(""),
distance_sum: compositions[ci].distance_sum + top_ed,
prob_log_sum: compositions[ci].prob_log_sum + top_prob_log,
};
} else {
compositions[di] = Composition {
segmented_string: [
compositions[ci].segmented_string.as_str(),
part.as_str(),
]
.join(" "),
distance_sum: compositions[ci].distance_sum + sep_len + top_ed,
prob_log_sum: compositions[ci].prob_log_sum + top_prob_log,
};
}
}
}
if j != 0 {
ci += 1;
}
ci = if ci == asize { 0 } else { ci };
}
if compositions.is_empty() {
Composition {
segmented_string: input.to_string(),
distance_sum: 0,
prob_log_sum: 0.0,
}
} else {
compositions[ci].to_owned()
}
}
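    /// True when the chars of `delete` appear in order within the first
    /// `prefix_length` chars of `suggestion`, i.e. the delete really is a
    /// subsequence of the suggestion's prefix.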
fn delete_in_suggestion_prefix(
&self,
delete: &str,
delete_len: usize,
suggestion: &str,
suggestion_len: usize,
) -> bool {
if delete_len == 0 {
return true;
}
let suggestion_len = if self.prefix_length < suggestion_len {
self.prefix_length
} else {
suggestion_len
};
let mut j = 0;
for i in 0..delete_len {
let del_char = at(delete, i as isize).unwrap();
while j < suggestion_len && del_char != at(suggestion, j as isize).unwrap() {
j += 1;
}
if j == suggestion_len {
return false;
}
}
true
}
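    /// Inserts `term` with `count`, or adds `count` to an existing entry.
    /// Once a term's total reaches `count_threshold`, every prefix delete
    /// of the term is indexed. Returns `true` only when new deletes were
    /// created for the term.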
pub fn create_dictionary_entry<T>(&mut self, term: T, count: usize) -> bool
where
T: Clone + AsRef<str> + Into<String>,
{
let entry = self
.words
.entry(term.clone().into().into_boxed_str())
.or_insert(0);
if *entry == 0 {
*entry = count;
if count < self.count_threshold {
return false;
}
} else {
let old_count = *entry;
            // Saturate rather than overflow when accumulating counts.
            let updated_count = entry.saturating_add(count);
*entry = updated_count;
if updated_count < self.count_threshold || old_count >= self.count_threshold {
return false;
}
}
let term_len = len(term.as_ref());
if self
.term_length_threshold
.as_ref()
.is_some_and(|term_length_threshold| {
!term_length_threshold.is_empty() && term_len < term_length_threshold[0]
})
{
return false;
}
let max_term_edit_distance = self.term_length_threshold.as_ref().map_or(
self.max_dictionary_edit_distance,
|term_length_threshold| {
let mut max_term_edit_distance = 0;
for (i, threshold) in term_length_threshold.iter().enumerate() {
if term_len >= *threshold {
max_term_edit_distance = self.max_dictionary_edit_distance + i
} else {
break;
}
}
max_term_edit_distance
},
);
if term_len > self.max_dictionary_term_length {
self.max_dictionary_term_length = term_len;
}
let edits = self.edits_prefix(term.as_ref(), max_term_edit_distance);
for delete in edits {
let delete_hash = hash32(delete.as_bytes());
self.deletes
.entry(delete_hash)
.and_modify(|e| e.push(term.clone().into().into_boxed_str()))
.or_insert_with(|| vec![term.clone().into().into_boxed_str()]);
}
true
}
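    /// Deletes are generated from at most the first `prefix_length` chars
    /// of `key`; this bounds the index size at a small recall cost, the
    /// core prefix-indexing idea of SymSpell.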
fn edits_prefix(&self, key: &str, max_term_edit_distance: usize) -> AHashSet<String> {
let mut hash_set = AHashSet::new();
let key_len = len(key);
if key_len <= self.max_dictionary_edit_distance {
hash_set.insert("".to_string());
}
if key_len > self.prefix_length {
let shortened_key = slice(key, 0, self.prefix_length);
hash_set.insert(shortened_key.clone());
edits(&shortened_key, 0, max_term_edit_distance, &mut hash_set);
} else {
hash_set.insert(key.to_string());
edits(key, 0, max_term_edit_distance, &mut hash_set);
};
hash_set
}
    /// Cheap pre-filter used by `lookup`: when the candidate delete is as
    /// short as the prefix allows, compare the characters beyond the shared
    /// prefix and reject suggestions whose suffix provably differs before
    /// paying for a full OSA distance computation.
    fn has_different_suffix(
        &self,
        max_edit_distance: usize,
        input: &str,
        input_len: usize,
        candidate_len: usize,
        suggestion: &str,
        suggestion_len: usize,
    ) -> bool {
        // Signed arithmetic: prefix_length may be smaller than
        // max_edit_distance, and usize subtraction would underflow.
        let prefix_boundary =
            self.prefix_length as isize - max_edit_distance as isize == candidate_len as isize;
        // `min` already has prefix_length subtracted; off the boundary case
        // it stays 0 and both disjuncts below are false.
        let min = if prefix_boundary {
            min(input_len, suggestion_len) as isize - self.prefix_length as isize
        } else {
            0
        };
        (prefix_boundary
            && (min > 1)
            && (suffix(input, input_len + 1 - min as usize)
                != suffix(suggestion, suggestion_len + 1 - min as usize)))
            || ((min > 0)
                && (at(input, (input_len - min as usize) as isize)
                    != at(suggestion, (suggestion_len - min as usize) as isize))
                && ((at(input, (input_len - min as usize - 1) as isize)
                    != at(suggestion, (suggestion_len - min as usize) as isize))
                    || (at(input, (input_len - min as usize) as isize)
                        != at(suggestion, (suggestion_len - min as usize - 1) as isize))))
    }
}