use crate::schema::FieldIndex;
use std::collections::BTreeSet;
/// Returns the canonical form of `s` used for tokenization: its lowercase
/// equivalent as produced by [`str::to_lowercase`].
///
/// Both [`value_tokens`] and [`ngram_probe_tokens`] run their input through
/// this function first, so indexed values and probes are compared
/// case-insensitively.
pub(crate) fn normalize(s: &str) -> String {
    str::to_lowercase(s)
}
/// Produces the sorted, de-duplicated index tokens for `text` under the index
/// mode `idx`.
///
/// The text is passed through [`normalize`] first. Token lengths are measured
/// in `char`s rather than bytes, so multi-byte text is always split on
/// character boundaries.
///
/// * `FieldIndex::None` — no tokens.
/// * `FieldIndex::Exact` — the whole normalized text as a single token.
/// * `FieldIndex::Prefix(n)` — every leading substring of 1 up to `n` chars
///   (capped at the text length).
/// * `FieldIndex::Ngram(n)` — every window of `n` consecutive chars. A
///   non-empty text shorter than `n` contributes itself as its only token.
///   `n == 0` yields no tokens.
pub(crate) fn value_tokens(text: &str, idx: FieldIndex) -> Vec<String> {
    let norm = normalize(text);
    // A BTreeSet both removes duplicate tokens and fixes the output order.
    let mut out = BTreeSet::new();
    match idx {
        // A zero-width n-gram has no meaningful windows, and
        // `slice::windows(0)` would panic, so treat it as "index nothing".
        FieldIndex::None | FieldIndex::Ngram(0) => {}
        FieldIndex::Exact => {
            out.insert(norm);
        }
        FieldIndex::Prefix(n) => {
            // Grow the prefix one char at a time; `take(n)` caps it at `n`
            // chars or the end of the text, whichever comes first.
            let mut prefix = String::new();
            for c in norm.chars().take(n) {
                prefix.push(c);
                out.insert(prefix.clone());
            }
        }
        FieldIndex::Ngram(n) => {
            let chars: Vec<char> = norm.chars().collect();
            if chars.len() >= n {
                out.extend(chars.windows(n).map(|w| w.iter().collect::<String>()));
            } else if !chars.is_empty() {
                // Too short for even one window: keep the whole value as a token.
                out.insert(norm);
            }
        }
    }
    out.into_iter().collect()
}
/// Splits a search probe into the sorted, de-duplicated `n`-char windows of
/// its normalized form.
///
/// The probe is passed through [`normalize`] and measured in `char`s, not
/// bytes.
///
/// Returns `None` when the probe cannot be expressed as n-grams: it has fewer
/// than `n` chars, or `n` is zero. Otherwise returns `Some` with at least one
/// token.
pub(crate) fn ngram_probe_tokens(probe: &str, n: usize) -> Option<Vec<String>> {
    let chars: Vec<char> = normalize(probe).chars().collect();
    // `slice::windows` panics on a zero window size, so reject `n == 0` here
    // together with probes too short to fill a single window.
    if n == 0 || chars.len() < n {
        return None;
    }
    // Collect through a BTreeSet to drop repeated windows and sort the result.
    let unique: BTreeSet<String> = chars.windows(n).map(|w| w.iter().collect()).collect();
    Some(unique.into_iter().collect())
}
#[cfg(test)]
mod tests {
    use super::*;

    /// N-gram tokens are lowercase windows measured in chars (not bytes), and
    /// a value shorter than the window is kept whole.
    #[test]
    fn ngram_tokens_are_char_windows() {
        let ascii = value_tokens("AbCd", FieldIndex::Ngram(3));
        assert_eq!(ascii, vec!["abc".to_string(), "bcd".to_string()]);

        let hangul = value_tokens("김철수", FieldIndex::Ngram(2));
        assert_eq!(hangul, vec!["김철", "철수"]);

        let shorter_than_window = value_tokens("ab", FieldIndex::Ngram(3));
        assert_eq!(shorter_than_window, vec!["ab"]);
    }

    /// Prefix tokens cover every leading length from 1 up to `n`, capped at
    /// the length of the value.
    #[test]
    fn prefix_tokens_cover_lengths_up_to_n() {
        let capped_by_n = value_tokens("Carol", FieldIndex::Prefix(3));
        assert_eq!(
            capped_by_n,
            vec!["c".to_string(), "ca".to_string(), "car".to_string()]
        );

        let capped_by_len = value_tokens("ab", FieldIndex::Prefix(5));
        assert_eq!(capped_by_len, vec!["a", "ab"]);
    }

    /// Every token of a probe that occurs inside a value is also a token of
    /// that value; a probe shorter than the window produces `None`.
    #[test]
    fn probe_tokens_match_value_tokens() {
        let indexed = value_tokens("Carol Danvers", FieldIndex::Ngram(3));
        let wanted = ngram_probe_tokens("DANVERS", 3).unwrap();
        let all_present = wanted.iter().all(|token| indexed.contains(token));
        assert!(all_present);

        let short = ngram_probe_tokens("da", 3);
        assert!(short.is_none(), "short probe -> fallback");
    }
}