use once_cell::sync::Lazy;
use regex::Regex;
use std::collections::{HashMap, HashSet};
use std::sync::Mutex;
/// Entry count at which the memoization cache used by `singularize` is
/// emptied before a new result is stored.
const CACHE_MAX: usize = 10_000;
/// Irregular plurals mapped to their singular form.
///
/// Keys and values are lowercase; lookups are done with the lowercased input
/// word, and the caller's casing is re-applied to the result afterwards.
static IRREGULARS: Lazy<HashMap<&'static str, &'static str>> = Lazy::new(|| {
    // (plural, singular)
    const PAIRS: &[(&str, &str)] = &[
        ("people", "person"),
        ("children", "child"),
        ("men", "man"),
        ("women", "woman"),
        ("mice", "mouse"),
        ("geese", "goose"),
        ("oxen", "ox"),
        ("feet", "foot"),
        ("teeth", "tooth"),
        ("lives", "life"),
        ("wives", "wife"),
        ("moves", "move"),
        ("zombies", "zombie"),
        ("indices", "index"),
        ("vertices", "vertex"),
        ("leaves", "leaf"),
        ("calves", "calf"),
        ("halves", "half"),
        ("loaves", "loaf"),
        ("hooves", "hoof"),
    ];
    PAIRS.iter().copied().collect()
});
/// Lowercase words that are returned unchanged because they have no distinct
/// singular form.
static UNCOUNTABLE: Lazy<HashSet<&'static str>> = Lazy::new(|| {
    const WORDS: &[&str] = &[
        "news",
        "fish",
        "sheep",
        "deer",
        "series",
        "species",
        "equipment",
        "information",
        "money",
        "rice",
        "jeans",
        "police",
        "data",
        "media",
    ];
    let mut set = HashSet::with_capacity(WORDS.len());
    set.extend(WORDS.iter().copied());
    set
});
/// One suffix-rewrite rule: a compiled pattern together with the replacement
/// text applied when the pattern matches.
struct Rule {
// Pattern tested against the word being singularized.
re: Regex,
// Replacement passed to `Regex::replace`; may reference capture groups
// using the `${1}` syntax.
repl: &'static str,
}
static RULES: Lazy<Vec<Rule>> = Lazy::new(|| {
vec![
Rule { re: Regex::new(r"(?i)(quiz)zes$").unwrap(), repl: "${1}" },
Rule { re: Regex::new(r"(?i)(matri|appendi)ces$").unwrap(), repl: "${1}x" },
Rule { re: Regex::new(r"(?i)(ox)en$").unwrap(), repl: "${1}" },
Rule { re: Regex::new(r"(?i)(alias|status)(es)?$").unwrap(), repl: "${1}" },
Rule { re: Regex::new(r"(?i)(octop|vir)(us|i)$").unwrap(), repl: "${1}us" },
Rule { re: Regex::new(r"(?i)(cris|ax|test)es$").unwrap(), repl: "${1}is" },
Rule { re: Regex::new(r"(?i)(shoe)s$").unwrap(), repl: "${1}" },
Rule { re: Regex::new(r"(?i)(bus)(es)?$").unwrap(), repl: "${1}" },
Rule { re: Regex::new(r"(?i)([ml])ice$").unwrap(), repl: "${1}ouse" },
Rule { re: Regex::new(r"(?i)(x|ch|ss|sh)es$").unwrap(), repl: "${1}" },
Rule { re: Regex::new(r"(?i)(m)ovies$").unwrap(), repl: "${1}ovie" },
Rule { re: Regex::new(r"(?i)(s)eries$").unwrap(), repl: "${1}eries" },
Rule { re: Regex::new(r"(?i)([^aeiouy]|qu)ies$").unwrap(), repl: "${1}y" },
Rule { re: Regex::new(r"(?i)([lr])ves$").unwrap(), repl: "${1}f" },
Rule { re: Regex::new(r"(?i)(tive)s$").unwrap(), repl: "${1}" },
Rule { re: Regex::new(r"(?i)(hive)s$").unwrap(), repl: "${1}" },
Rule { re: Regex::new(r"(?i)([^f])ves$").unwrap(), repl: "${1}fe" },
Rule {
re: Regex::new(
r"(?i)((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$",
)
.unwrap(),
repl: "${1}sis",
},
Rule { re: Regex::new(r"(?i)([ti])a$").unwrap(), repl: "${1}um" },
Rule { re: Regex::new(r"(?i)(n)ews$").unwrap(), repl: "${1}ews" },
Rule { re: Regex::new(r"(?i)(o)es$").unwrap(), repl: "${1}" },
Rule { re: Regex::new(r"(?i)(ss)$").unwrap(), repl: "${1}" },
Rule { re: Regex::new(r"(?i)s$").unwrap(), repl: "" },
]
});
/// Process-wide memoization table for `singularize`, keyed by the exact input
/// word and guarded by a mutex.
static CACHE: Lazy<Mutex<HashMap<String, String>>> = Lazy::new(|| Mutex::new(HashMap::new()));
/// Returns the singular form of `word`.
///
/// Results are memoized in a process-wide cache keyed by the exact input
/// string (casing included). The cache is emptied once it holds `CACHE_MAX`
/// entries, so its size never exceeds that bound.
///
/// Uncountable words and words no rule applies to are returned unchanged.
pub fn singularize(word: &str) -> String {
    // Fast path: look the word up under a short-lived lock. A poisoned mutex
    // is recovered rather than propagated: the map only holds memoized
    // results, so whatever it contains is still valid to read or overwrite.
    {
        let cache = CACHE.lock().unwrap_or_else(|poisoned| poisoned.into_inner());
        if let Some(hit) = cache.get(word) {
            return hit.clone();
        }
    }

    // Run the regex rules without holding the lock so other threads are not
    // blocked behind the slow path.
    let singular = singularize_inner(word);

    let mut cache = CACHE.lock().unwrap_or_else(|poisoned| poisoned.into_inner());
    // The capacity check lives in the same critical section as the insert so
    // concurrent misses cannot push the map past `CACHE_MAX`. If another
    // thread stored this word in the meantime, the insert below only
    // overwrites it and no eviction is needed.
    if cache.len() >= CACHE_MAX && !cache.contains_key(word) {
        cache.clear();
    }
    cache.insert(word.to_owned(), singular.clone());
    singular
}
/// Uncached singularization of a single word.
///
/// Resolution order: empty and uncountable words are returned as-is, then the
/// irregular table is consulted (with the input's casing re-applied to the
/// result), and finally the first matching entry of `RULES` rewrites the
/// word. If nothing matches, the word is returned unchanged.
fn singularize_inner(word: &str) -> String {
    // Both lookup tables store lowercase keys.
    let key = word.to_lowercase();

    if word.is_empty() || UNCOUNTABLE.contains(key.as_str()) {
        return word.to_owned();
    }

    if let Some(singular) = IRREGULARS.get(key.as_str()) {
        return preserve_case(word, singular);
    }

    // Only the first matching rule is applied.
    RULES
        .iter()
        .find(|rule| rule.re.is_match(word))
        .map_or_else(
            || word.to_owned(),
            |rule| rule.re.replace(word, rule.repl).into_owned(),
        )
}
/// Re-applies the casing style of `original` to the lowercase word `lowered`.
///
/// * `original` equal to its own uppercase form: the result is fully
///   uppercased.
/// * `original` starting with an uppercase character: only the first
///   character of the result is uppercased.
/// * Anything else: `lowered` is returned unchanged.
fn preserve_case(original: &str, lowered: &str) -> String {
    if original.to_uppercase() == original {
        return lowered.to_uppercase();
    }

    let starts_upper = original.chars().next().map_or(false, char::is_uppercase);
    if !starts_upper {
        return lowered.to_owned();
    }

    let mut tail = lowered.chars();
    match tail.next() {
        // Uppercasing one char may yield several, hence the iterator chain.
        Some(head) => head.to_uppercase().chain(tail).collect(),
        None => lowered.to_owned(),
    }
}