use convert_case::{Case, Casing};
use emojis;
use lazy_static::lazy_static;
use pagefind_stem::Stemmer;
use regex::Regex;
use unicode_normalization::UnicodeNormalization;
use unicode_segmentation::UnicodeSegmentation;
// Lazily-initialized regex that matches any single character carrying the
// Unicode `Emoji` property. It is used as a cheap pre-check before words are
// walked grapheme by grapheme.
// NOTE: in the Unicode data the `Emoji` property also covers ASCII digits,
// `#` and `*`, so a match does not guarantee that a pictographic emoji is present.
lazy_static! {
static ref EMOJI: Regex = Regex::new("\\p{Emoji}").unwrap();
}
/// A single indexable form of a word, as produced by `get_indexable_words`.
#[derive(Debug, Clone, PartialEq)]
pub struct IndexableWord {
/// The text to index: the stemmer's output when a stemmer was supplied,
/// otherwise the diacritic-normalized (or plain normalized) word.
/// Emoji entries store the emoji itself.
pub stemmed: String,
/// `true` when this entry was generated from one part of a compound word
/// (e.g. one side of a hyphenated or camelCase word) rather than the whole word.
pub is_compound_part: bool,
/// The word as it was before diacritic normalization, present only when that
/// normalization changed the word. `None` otherwise, and for emoji entries.
pub original: Option<String>,
}
/// Expands one raw token into every form of it that should be indexed.
///
/// The returned entries are, in order:
///
/// 1. The whole word, lowercased and reduced to alphanumeric characters plus
///    anything listed in `include_characters`. Omitted if nothing is left.
/// 2. For tokens that look compound (they contain a non-alphanumeric character,
///    or an uppercase letter after the first character): each part reported by
///    `get_discrete_words` whose byte length is greater than one, flagged with
///    `is_compound_part: true`. Parts are only emitted when the whole word was
///    non-empty and the parts actually differ from it.
/// 3. Every emoji reported by `get_discrete_words`, stored verbatim.
///
/// Whole words and parts are passed through `normalize_diacritics` and then
/// through `stemmer` when one is given. If diacritic normalization changed the
/// text, the pre-normalization form is kept in `IndexableWord::original`.
pub fn get_indexable_words(
    raw_word: &str,
    stemmer: Option<&Stemmer>,
    include_characters: &[char],
) -> Vec<IndexableWord> {
    // Shared recipe for the whole word and for each compound part:
    // fold diacritics, stem if possible, remember the unfolded form if it differed.
    let build_entry = |word: &str, is_compound_part: bool| -> IndexableWord {
        let folded = normalize_diacritics(word);
        let stem_input = folded.as_deref().unwrap_or(word);
        let stemmed = match stemmer {
            Some(stemmer) => stemmer.stem(stem_input).into_owned(),
            None => stem_input.to_string(),
        };
        IndexableWord {
            stemmed,
            is_compound_part,
            original: folded.map(|_| word.to_string()),
        }
    };

    // Keep only alphanumerics and explicitly allowed characters, lowercased.
    let mut normalized_word = String::with_capacity(raw_word.len());
    for ch in raw_word.chars() {
        if !(ch.is_alphanumeric() || include_characters.contains(&ch)) {
            continue;
        }
        // ASCII is lowercased in place; anything still uppercase goes through the
        // full Unicode mapping, which may expand to more than one character.
        let ch = ch.to_ascii_lowercase();
        if ch.is_uppercase() {
            normalized_word.extend(ch.to_lowercase());
        } else {
            normalized_word.push(ch);
        }
    }

    let mut results = Vec::new();
    if !normalized_word.is_empty() {
        results.push(build_entry(&normalized_word, false));
    }

    // A purely alphanumeric token with no capital past its first character
    // cannot be a compound, so there is nothing more to extract.
    if raw_word.chars().all(|c| c.is_alphanumeric())
        && !raw_word.chars().skip(1).any(|c| c.is_uppercase())
    {
        return results;
    }

    let (word_parts, extras) = get_discrete_words(raw_word);

    // Only emit parts when splitting produced something other than the whole word.
    let parts_differ_from_whole = word_parts.contains(char::is_whitespace)
        || !normalized_word.starts_with(word_parts.as_str());
    if !normalized_word.is_empty() && parts_differ_from_whole {
        results.extend(
            word_parts
                .split_whitespace()
                // `len` is a byte length: single ASCII letters are dropped,
                // while a single multi-byte character is kept.
                .filter(|part| part.len() > 1)
                .map(|part| build_entry(part, true)),
        );
    }

    // Emoji are indexed exactly as written, with no stemming or folding.
    results.extend(extras.into_iter().flatten().map(|emoji| IndexableWord {
        stemmed: emoji,
        original: None,
        is_compound_part: false,
    }));

    results
}
/// Strips combining marks (accents and similar) from `word`.
///
/// The word is decomposed with NFD and every combining mark is discarded.
/// Returns `Some(result)` only when the result differs from the input;
/// returns `None` for pure-ASCII input and for input left unchanged.
fn normalize_diacritics(word: &str) -> Option<String> {
    // Pure ASCII is returned early without running the decomposition.
    if word.is_ascii() {
        return None;
    }

    let folded = word
        .nfd()
        .filter(|ch| !unicode_normalization::char::is_combining_mark(*ch))
        .collect::<String>();

    // Report a value only if the fold actually changed something.
    Some(folded).filter(|candidate| candidate.as_str() != word)
}
/// Splits a raw token into its discrete lowercase words and extracts any emoji.
///
/// Returns a tuple of:
///
/// * the words joined by single spaces. ASCII punctuation is treated as a
///   separator and the text is converted with `Case::Lower`, so for example
///   `page.Find` becomes `"page find"`;
/// * `Some(emoji)` listing each grapheme of the input that the `emojis` crate
///   recognizes, in order of appearance, or `None` when there are none.
///
/// The `EMOJI` regex is only a cheap pre-filter. The Unicode `Emoji` property
/// also covers ASCII digits, `#` and `*`, so a regex match alone does not mean
/// a real emoji is present. The second element is therefore `None`, never an
/// empty list, when no grapheme turns out to be an emoji.
fn get_discrete_words<S: AsRef<str>>(s: S) -> (String, Option<Vec<String>>) {
    let source = s.as_ref();

    let words = source
        .replace(|c: char| c.is_ascii_punctuation(), " ")
        .to_case(Case::Lower)
        .split_whitespace()
        .collect::<Vec<_>>()
        .join(" ");

    let extras = if EMOJI.is_match(source) {
        // Walk extended grapheme clusters so multi-codepoint emoji stay intact.
        let found: Vec<String> = source
            .graphemes(true)
            .filter_map(|grapheme| emojis::get(grapheme).map(|_| grapheme.to_string()))
            .collect();
        if found.is_empty() {
            None
        } else {
            Some(found)
        }
    } else {
        None
    };

    (words, extras)
}
// Unit tests for word normalization, compound-word splitting and emoji extraction.
#[cfg(test)]
mod tests {
use crate::fossick::get_stemmer;
use super::*;
// A plain lowercase word yields exactly one entry: not a compound part and
// with no `original`, since nothing was changed by diacritic normalization.
#[test]
fn indexable_words_simple() {
let stemmer = get_stemmer("en");
let results = get_indexable_words("hello", stemmer.as_ref(), &[]);
assert_eq!(
results,
vec![IndexableWord {
stemmed: "hello".to_string(),
is_compound_part: false,
original: None
}]
);
}
// camelCase, kebab-case and snake_case spellings of the same compound must all
// produce identical output: the joined word first, then each stemmed part.
#[test]
fn indexable_words_compound() {
let stemmer = get_stemmer("en");
let expected = vec![
IndexableWord {
stemmed: "threeantelop".to_string(),
is_compound_part: false,
original: None,
},
IndexableWord {
stemmed: "three".to_string(),
is_compound_part: true,
original: None,
},
IndexableWord {
stemmed: "antelop".to_string(),
is_compound_part: true,
original: None,
},
];
let camel_results = get_indexable_words("ThreeAntelopes", stemmer.as_ref(), &[]);
let kebab_results = get_indexable_words("three-antelopes", stemmer.as_ref(), &[]);
let snake_results = get_indexable_words("three_antelopes", stemmer.as_ref(), &[]);
assert_eq!(camel_results, expected);
assert_eq!(kebab_results, expected);
assert_eq!(snake_results, expected);
}
// Accented input: `stemmed` holds the diacritic-free, stemmed text while
// `original` keeps the accented form, for the whole word and for each part.
#[test]
fn indexable_words_compound_diacritics() {
let stemmer = get_stemmer("fr");
let results = get_indexable_words("café-crème", stemmer.as_ref(), &[]);
assert_eq!(
results,
vec![
IndexableWord {
stemmed: "cafecrem".to_string(),
is_compound_part: false,
original: Some("cafécrème".to_string()),
},
IndexableWord {
stemmed: "caf".to_string(),
is_compound_part: true,
original: Some("café".to_string()),
},
IndexableWord {
stemmed: "crem".to_string(),
is_compound_part: true,
original: Some("crème".to_string()),
},
]
);
}
// Hyphens act as word separators.
#[test]
fn hyphenated_words() {
let input = "these-words-are-hyphenated";
assert_eq!(
get_discrete_words(input),
("these words are hyphenated".into(), None)
);
}
// Underscores act as separators; leading underscores leave no empty words behind.
#[test]
fn underscored_words() {
let input = "__array_structures";
assert_eq!(get_discrete_words(input), ("array structures".into(), None));
}
// Case boundaries split words, including runs of capitals.
#[test]
fn camel_words() {
let input = "WKWebVIEWComponent";
assert_eq!(
get_discrete_words(input),
("wk web view component".into(), None)
);
}
// A dot separates words and the result is lowercased.
#[test]
fn dotted_words() {
let input = "page.Find";
assert_eq!(get_discrete_words(input), ("page find".into(), None));
}
// Different ASCII punctuation characters all behave as separators.
#[test]
fn misc_punctuation() {
let input = "cloud/cannon,page.find";
assert_eq!(
get_discrete_words(input),
("cloud cannon page find".into(), None)
);
}
// An apostrophe splits an elided article from its noun.
#[test]
fn french() {
let input = "l'alphabet";
assert_eq!(get_discrete_words(input), ("l alphabet".into(), None));
}
// Markup-like input: every punctuation character is dropped and the
// remaining identifiers are split into words.
#[test]
fn html() {
let input = "<FormComponent data-pagefind-meta='[key:(value)]'>";
assert_eq!(
get_discrete_words(input),
("form component data pagefind meta key value".into(), None)
);
}
// Emoji stay inside the word string and are also returned as extras,
// one entry per recognized grapheme, in order of appearance.
#[test]
fn emoji() {
let input = "cloud🌦️cannon";
assert_eq!(
get_discrete_words(input),
("cloud🌦️cannon".into(), Some(vec!["🌦️".into()]))
);
let input = "👋👨👩👧👦🌾";
assert_eq!(
get_discrete_words(input),
(
"👋👨👩👧👦🌾".into(),
Some(vec!["👋".into(), "👨👩👧👦".into(), "🌾".into()])
)
);
}
}