extern crate alloc;
use alloc::collections::BTreeSet;
use alloc::string::{String, ToString};
use alloc::vec::Vec;
pub fn extract_trigrams(input: &str) -> BTreeSet<String> {
let mut out = BTreeSet::new();
for word in split_into_words(input) {
let padded = pad_word(&word);
let bytes = padded.as_bytes();
if bytes.len() < 3 {
continue;
}
for i in 0..=bytes.len() - 3 {
if let Ok(s) = core::str::from_utf8(&bytes[i..i + 3]) {
out.insert(s.to_string());
}
}
}
out
}
pub fn trigram_count(input: &str) -> usize {
extract_trigrams(input).len()
}
pub fn similarity(a: &str, b: &str) -> f64 {
let sa = extract_trigrams(a);
let sb = extract_trigrams(b);
if sa.is_empty() && sb.is_empty() {
return 0.0;
}
let inter = sa.intersection(&sb).count();
let union = sa.len() + sb.len() - inter;
if union == 0 {
0.0
} else {
inter as f64 / union as f64
}
}
pub fn trigrams_from_like_pattern(pattern: &str) -> Option<BTreeSet<String>> {
let mut runs: Vec<(String, bool /* leading-anchored */, bool /* trailing-anchored */)> =
Vec::new();
let mut cur = String::new();
let mut iter = pattern.chars().peekable();
let mut leading = true;
while let Some(c) = iter.next() {
match c {
'\\' => {
if let Some(next) = iter.next() {
cur.push(next);
}
}
'%' | '_' => {
if !cur.is_empty() {
runs.push((core::mem::take(&mut cur), leading, false));
}
leading = false;
}
_ => cur.push(c),
}
}
if !cur.is_empty() {
runs.push((cur, leading, true));
}
let mut out = BTreeSet::new();
for (run, anchored_left, anchored_right) in &runs {
let mut padded = String::new();
if *anchored_left {
padded.push_str(" ");
}
padded.push_str(&run.to_lowercase());
if *anchored_right {
padded.push(' ');
}
let bytes = padded.as_bytes();
if bytes.len() < 3 {
continue;
}
for i in 0..=bytes.len() - 3 {
if let Ok(s) = core::str::from_utf8(&bytes[i..i + 3]) {
out.insert(s.to_string());
}
}
}
if out.is_empty() {
None
} else {
Some(out)
}
}
fn split_into_words(input: &str) -> Vec<String> {
let mut out: Vec<String> = Vec::new();
let mut cur = String::new();
for c in input.chars() {
if is_word_char(c) {
for lc in c.to_lowercase() {
cur.push(lc);
}
} else if !cur.is_empty() {
out.push(core::mem::take(&mut cur));
}
}
if !cur.is_empty() {
out.push(cur);
}
out
}
fn is_word_char(c: char) -> bool {
c.is_ascii_alphanumeric() || c == '\''
}
fn pad_word(word: &str) -> String {
let mut s = String::with_capacity(word.len() + 3);
s.push_str(" ");
s.push_str(word);
s.push(' ');
s
}
#[cfg(test)]
mod tests {
use super::*;
use alloc::vec;
fn collect(input: &str) -> Vec<String> {
extract_trigrams(input).into_iter().collect()
}
#[test]
fn padding_and_lowercase() {
let trs = collect("Hi");
assert_eq!(trs, vec![" h", " hi", "hi "]);
}
#[test]
fn multi_word_splits_on_space() {
let trs = collect("foo bar");
assert!(trs.contains(&"foo".to_string()));
assert!(trs.contains(&"bar".to_string()));
assert!(!trs.contains(&"o b".to_string()));
}
#[test]
fn similarity_identical() {
assert!((similarity("hello", "hello") - 1.0).abs() < 1e-9);
}
#[test]
fn similarity_disjoint() {
assert!(similarity("foo", "xyz") < 0.1);
}
#[test]
fn like_pattern_anchored() {
let trs = trigrams_from_like_pattern("foo%").unwrap();
assert!(trs.contains(" f"));
assert!(trs.contains(" fo"));
assert!(trs.contains("foo"));
}
#[test]
fn like_pattern_unanchored_short_run() {
let trs = trigrams_from_like_pattern("%foo%").unwrap();
assert!(trs.contains("foo"));
assert!(!trs.contains(" f"));
}
#[test]
fn like_pattern_no_usable_constraint() {
assert!(trigrams_from_like_pattern("%ab%").is_none());
assert!(trigrams_from_like_pattern("%%").is_none());
}
#[test]
fn like_pattern_backslash_escapes() {
let trs = trigrams_from_like_pattern("\\%foo").unwrap();
assert!(trs.contains(" %"));
}
}