use regex::Regex;
/// Settings that tune how the article is chosen.
pub struct Options {
    /// When `true`, four-character numbers beginning with `11` or `18`
    /// (for example `1100` or `1892`) take "an"; when `false` they take "a".
    pub are_numbers_colloquial: bool,
}

impl Options {
    /// Returns the default settings: colloquial number handling is off.
    pub fn default() -> Options {
        Self {
            are_numbers_colloquial: false,
        }
    }

    /// Returns settings with colloquial number handling switched on.
    pub fn with_colloquial() -> Options {
        Self {
            are_numbers_colloquial: true,
        }
    }
}
/// Returns the indefinite article for `word` using the default [`Options`].
///
/// The result is one of `"a"`, `"an"`, `"A"`, `"An"`, or `""` when `word`
/// is empty.
pub fn get_a_or_an(word: &str) -> &str {
    let default_options = Options::default();
    get_a_or_an_options(word, &default_options)
}
/// Returns the indefinite article for the first word of `word`, capitalised
/// to match that word.
///
/// The result is `"A"`/`"An"` when the first word is title case and
/// `"a"`/`"an"` otherwise. An empty string is returned when there is no first
/// word to inspect: empty input, whitespace-only input, or input that begins
/// with one of the separator characters.
pub fn get_a_or_an_options<'s>(word: &'s str, options: &Options) -> &'s str {
    let first_word = get_first_word(word);
    // Checking the extracted word (not the raw input) keeps inputs such as
    // "   " or "-x" away from the helpers that read the first character and
    // would panic on an empty string.
    if first_word.is_empty() {
        return "";
    }
    let is_an = is_an_options(word, options);
    a_or_an_capitalized_to_match(is_an, first_word)
}
/// Picks the article spelling from the `is_an` decision and the casing of
/// `first_word`.
///
/// A title-case word gets a capitalised article; any other casing gets the
/// lowercase one. `first_word` must not be empty, because the casing check
/// reads its first character.
fn a_or_an_capitalized_to_match(is_an: bool, first_word: &str) -> &str {
    match (is_an, is_title_case(first_word)) {
        (true, true) => "An",
        (true, false) => "an",
        (false, true) => "A",
        (false, false) => "a",
    }
}
/// Returns `true` when `first_word` starts with an ASCII capital and contains
/// no further ASCII capitals.
fn is_title_case(first_word: &str) -> bool {
    if !starts_with_capital(first_word) {
        return false;
    }
    !remainder_has_capitals(first_word)
}
/// Returns `true` when the first character of `first_word` is an ASCII
/// capital letter.
fn starts_with_capital(first_word: &str) -> bool {
    let initial = get_first_letter(first_word);
    is_capital_char(initial)
}
fn remainder_has_capitals(first_word: &str) -> bool {
let mut chars_iter = first_word.chars();
chars_iter.next();
chars_iter.any(|c| is_capital_char(c))
}
/// Returns `true` for the ASCII capital letters `A` through `Z` only.
fn is_capital_char(c: char) -> bool {
    c.is_ascii_uppercase()
}
/// Returns `true` when `word` takes "an" rather than "a", using the default
/// [`Options`].
pub fn is_an(word: &str) -> bool {
    let default_options = Options::default();
    is_an_options(word, &default_options)
}
/// Returns `true` when the first word of `word` takes "an" rather than "a".
///
/// The decision is made in this order:
/// 1. no first word (empty, whitespace-only, or separator-led input) gives `false`;
/// 2. words starting with a digit go through the number rules, which consult
///    `options`;
/// 3. all-capital words go through the acronym rules;
/// 4. everything else uses the first letter of the lowercased word, inverted
///    when the word (or the word minus a common suffix) is a listed exception.
pub fn is_an_options(word: &str, options: &Options) -> bool {
    let word = get_first_word(word);
    // An empty first word has no first letter to inspect; bail out before the
    // helpers below try to read it.
    if word.is_empty() {
        return false;
    }
    if is_number(word) {
        return is_an_for_number(word, options);
    }
    if is_acronym(word) {
        return is_an_for_acronym(word);
    }
    // Lowercasing allocates, so it is only done once the cheap early returns
    // above have been ruled out.
    let word_lower = word.to_lowercase();
    let naive_guess = is_naively_an(&word_lower);
    if is_exception(&word_lower) || is_exception_after_strip(&word_lower) {
        !naive_guess
    } else {
        naive_guess
    }
}
/// Returns `true` when `word_lower`, with one of the endings `s`, `es`, `ed`
/// or `ly` removed, is a listed exception.
///
/// Each ending is tried independently against the original word; endings are
/// not stacked.
fn is_exception_after_strip(word_lower: &str) -> bool {
    const ENDINGS: [&str; 4] = ["s", "es", "ed", "ly"];
    for ending in ENDINGS.iter() {
        let stem = strip_end(word_lower, ending);
        if is_exception(stem) {
            return true;
        }
    }
    false
}
/// Returns `word` without the trailing `ending`, or `word` unchanged when it
/// does not end with `ending`.
fn strip_end<'s>(word: &'s str, ending: &str) -> &'s str {
    word.strip_suffix(ending).unwrap_or(word)
}
/// Returns the leading token of `word`: surrounding whitespace is trimmed and
/// the text is cut at the first space, comma, full stop, hyphen, semicolon,
/// colon or apostrophe.
///
/// The result is empty when the trimmed input is empty or begins with one of
/// those separators.
fn get_first_word(word: &str) -> &str {
    // Only the first token is needed, so take it straight from the lazy split
    // iterator rather than collecting every token into a Vec first.
    word.trim()
        .split(|c: char| " ,.-;:'".contains(c))
        .next()
        .unwrap_or("")
}
/// Returns the first character of `word`.
///
/// # Panics
///
/// Panics when `word` is empty.
fn get_first_letter(word: &str) -> char {
    let mut letters = word.chars();
    letters.next().unwrap()
}
/// First-letter guess: returns `true` when `word` starts with a lowercase
/// `a`, `e`, `i`, `o` or `u`.
///
/// Capital vowels do not match, so callers pass an already lowercased word.
fn is_naively_an(word: &str) -> bool {
    let initial = get_first_letter(word);
    "aeiou".chars().any(|vowel| vowel == initial)
}
/// Returns `true` when `word` exactly matches an entry in the exception table.
///
/// The comparison is case-sensitive and every entry is lowercase, so callers
/// pass a lowercased word. For a listed word the caller inverts the
/// first-letter guess.
fn is_exception(word: &str) -> bool {
    const EXCEPTIONS: &[&str] = &[
        // e…
        "eunuch",
        "eucalyptus",
        "eugenics",
        "eulogy",
        "euphemism",
        "euphony",
        "euphoria",
        "eureka",
        "euro",
        "european",
        "euphemistic",
        "euphonic",
        "euphoric",
        "euphemistically",
        "euphonically",
        "euphorically",
        // h…
        "heir",
        "heiress",
        "herb",
        "homage",
        "honesty",
        "honor",
        "honour",
        "hour",
        "honest",
        "honorous",
        "honestly",
        "hourly",
        // o…
        "one",
        "ouija",
        "once",
        // u…
        "ubiquity",
        "udometer",
        "ufo",
        "uke",
        "ukelele",
        "ululate",
        "unicorn",
        "unicycle",
        "uniform",
        "unify",
        "union",
        "unison",
        "unit",
        "unity",
        "universe",
        "university",
        "upas",
        "ural",
        "uranium",
        "urea",
        "ureter",
        "urethra",
        "urine",
        "urologist",
        "urology",
        "urus",
        "usage",
        "use",
        "user",
        "usual",
        "usurp",
        "usury",
        "utensil",
        "uterus",
        "utility",
        "utopia",
        "utricle",
        "uvarovite",
        "uvea",
        "uvula",
        "ubiquitous",
        "ugandan",
        "ukrainian",
        "unanimous",
        "unicameral",
        "unified",
        "unique",
        "unisex",
        "universal",
        "urinal",
        "urological",
        "useful",
        "useless",
        "usurious",
        "usurped",
        "utilitarian",
        "utopic",
        // y…
        "yttria",
        "yggdrasil",
        "ylem",
        "yperite",
        "ytterbia",
        "ytterbium",
        "yttrium",
        "ytterbous",
        "ytterbic",
        "yttric",
    ];
    EXCEPTIONS.iter().any(|&listed| listed == word)
}
/// Returns `true` when `word` is non-empty and consists solely of the ASCII
/// capital letters `A`–`Z`.
///
/// This is the same test as the pattern `^[A-Z]+$`, written as a plain
/// character scan so that no regular expression has to be compiled on every
/// call.
fn is_acronym(word: &str) -> bool {
    !word.is_empty() && word.chars().all(|c| c.is_ascii_uppercase())
}
/// Decides the article for an all-capitals acronym.
///
/// Returns `true` ("an") when exactly one of these holds: the acronym starts
/// with a vowel letter, or it starts with one of the irregular letters.
/// When both or neither hold the answer is `false` ("a").
fn is_an_for_acronym(word: &str) -> bool {
    let irregular = is_irregular_acronym(word);
    let vowel_initial = starts_with_vowel(word);
    !both_or_neither(vowel_initial, irregular)
}
/// Returns `true` when `a` and `b` have the same truth value.
fn both_or_neither(a: bool, b: bool) -> bool {
    a == b
}
/// Returns `true` when `word` starts with one of the capital letters
/// `U`, `F`, `H`, `L`, `M`, `N`, `R`, `S` or `X`.
///
/// This is the same test as the pattern `^[UFHLMNRSX]`, written with
/// `str::starts_with` so that no regular expression has to be compiled on
/// every call.
fn is_irregular_acronym(word: &str) -> bool {
    word.starts_with(|c: char| "UFHLMNRSX".contains(c))
}
/// Returns `true` when `word` starts with an ASCII vowel letter of either
/// case (`a`, `e`, `i`, `o`, `u`).
///
/// This is the same test as the pattern `^[aeiouAEIOU]`, written with
/// `str::starts_with` so that no regular expression has to be compiled on
/// every call.
fn starts_with_vowel(word: &str) -> bool {
    word.starts_with(|c: char| "aeiouAEIOU".contains(c))
}
/// Returns `true` when the regular expression `regex` matches `word`.
///
/// The pattern is compiled anew on every call.
///
/// # Panics
///
/// Panics when `regex` is not a valid pattern.
fn is_match(word: &str, regex: &str) -> bool {
    Regex::new(regex).unwrap().is_match(word)
}
/// Returns `true` when `word` starts with an ASCII digit or a comma.
///
/// This is the same test as the pattern `^([0-9,]+)`, which only needs one
/// leading character to succeed. It is written with `str::starts_with` so that
/// no regular expression has to be compiled on every call.
fn is_number(word: &str) -> bool {
    word.starts_with(|c: char| c.is_ascii_digit() || c == ',')
}
fn is_an_for_number(word: &str, options: &Options) -> bool {
let mut is_an = false;
if is_match(word, r"^(11|8|18)") {
let starts_with_11_or_18 = is_match(word, r"^(11|18)");
if starts_with_11_or_18 && word.len() == 4 {
is_an = options.are_numbers_colloquial;
} else if starts_with_11_or_18 && (word.len() - 2) % 3 == 0 {
is_an = true;
} else {
is_an = word.starts_with('8');
}
}
is_an
}
// Unit tests for the article helpers and the public entry points.
#[cfg(test)]
mod tests {
use super::*;
// The article is capitalised only for title-case words; mixed-case and
// all-capital words get the lowercase article.
#[test]
fn a_or_an_capitalized_to_match_test() {
assert_eq!("An", a_or_an_capitalized_to_match(true, "Ugly"));
assert_eq!("A", a_or_an_capitalized_to_match(false, "Leopard"));
assert_eq!("an", a_or_an_capitalized_to_match(true, "ugly"));
assert_eq!("a", a_or_an_capitalized_to_match(false, "leopard"));
assert_eq!("an", a_or_an_capitalized_to_match(true, "UgLy"));
assert_eq!("a", a_or_an_capitalized_to_match(false, "lEoparD"));
assert_eq!("an", a_or_an_capitalized_to_match(true, "FIFA"));
assert_eq!("a", a_or_an_capitalized_to_match(false, "UN"));
}
// The first word ends at a space, a hyphen or an apostrophe.
#[test]
fn get_first_word_test() {
assert_eq!("one", get_first_word("one two"));
assert_eq!("one", get_first_word("one two three"));
assert_eq!("one", get_first_word("one-two three"));
assert_eq!("heir", get_first_word("heir's"));
}
// A matching ending is removed from the end of the word.
#[test]
fn strip_end_test() {
assert_eq!("one", strip_end("ones", "s"));
assert_eq!("heir", strip_end("heir's", "'s"));
assert_eq!("hour", strip_end("houred", "ed"));
assert_eq!("hour", strip_end("hourly", "ly"));
assert_eq!("hour", strip_end("hour's", "'s"));
}
// Smoke test of both public entry points on ordinary words.
#[test]
fn common_words() {
assert_eq!("an", get_a_or_an("antelope"));
assert_eq!("an", get_a_or_an("apple"));
assert_eq!("a", get_a_or_an("pear"));
assert_eq!(true, is_an("antelope"));
assert_eq!(true, is_an("apple"));
assert_eq!(false, is_an("pear"));
}
// Empty input yields an empty article and `false`.
#[test]
fn zero_length() {
assert_eq!("", get_a_or_an(""));
assert_eq!(false, is_an(""));
}
// Generates one #[test] per `name: (input, expected_article)` entry, checking
// both `get_a_or_an` and `is_an` with the default options.
macro_rules! tests {
($($name:ident: $value:expr,)*) => {
$(
#[test]
fn $name() {
let (input, expected) = $value;
assert_eq!(expected, get_a_or_an(input));
assert_eq!(expected.to_lowercase() == "an", is_an(input));
}
)*
}
}
// Same as `tests!`, but runs the `_options` entry points with
// `Options::with_colloquial()`.
macro_rules! tests_options_with_colloquial {
($($name:ident: $value:expr,)*) => {
$(
#[test]
fn $name() {
let options = &(Options::with_colloquial());
let (input, expected) = $value;
assert_eq!(expected, get_a_or_an_options(input, options));
assert_eq!(expected.to_lowercase() == "an", is_an_options(input, options));
}
)*
}
}
tests! {
// Acronyms (all capitals).
test_ac0: ("CEO", "a"),
test_ac1: ("EU", "an"),
test_ac2a: ("FFA", "an"),
test_ac2b: ("FIFA", "an"),
test_ac2c: ("IOU", "an"),
test_ac3: ("MIA", "an"),
test_ac4: ("MNM", "an"),
test_ac5: ("UFO", "a"),
test_ac6: ("UN", "a"),
test_ac7: ("US", "a"),
test_ac8: ("USA", "a"),
// Ordinary words and listed exceptions, grouped by first letters.
test_a1: ("alien", "an"),
test_a2a: ("american", "an"),
test_a2b: ("antelope", "an"),
test_a2c: ("apple", "an"),
test_a2d: ("banana", "a"),
test_e1: ("economic", "an"),
test_e2: ("economy", "an"),
test_eu1: ("euro", "a"),
test_eu2: ("european", "a"),
test_eu3: ("European", "A"),
test_eu4: ("eucalyptus", "a"),
test_eu5: ("eulogy", "a"),
test_ur1: ("uranium", "a"),
test_ur2: ("urinal", "a"),
test_ur3: ("urologist", "a"),
test_uni1: ("unicorn", "a"),
test_uni2: ("uniform", "a"),
test_uni3: ("unit", "a"),
test_uni4: ("universal", "a"),
test_uni5: ("university", "a"),
test_uni_adj1: ("unidentified", "an"),
test_uni_adj2: ("unimportant", "an"),
test_uni_adj3: ("unintended", "an"),
test_uni_adj4: ("unintelligent", "an"),
test_h1: ("hair", "a"),
test_h2: ("heir", "an"),
test_h3: ("herb", "an"),
test_h4: ("hotel", "a"),
test_o0: ("ordinary", "an"),
test_o1: ("ouija", "a"),
test_u0: ("ukelele", "a"),
test_u1: ("umbrella", "an"),
test_u2: ("user", "a"),
// Numbers 0 through 20: only 8, 11 and 18 take "an".
test_n0: ("0", "a"),
test_n1: ("1", "a"),
test_n2: ("2", "a"),
test_n3: ("3", "a"),
test_n4: ("4", "a"),
test_n5: ("5", "a"),
test_n6: ("6", "a"),
test_n7: ("7", "a"),
test_n8: ("8", "an"),
test_n9: ("9", "a"),
test_n10: ("10", "a"),
test_n11: ("11", "an"),
test_n12: ("12", "a"),
test_n13: ("13", "a"),
test_n14: ("14", "a"),
test_n15: ("15", "a"),
test_n16: ("16", "a"),
test_n17: ("17", "a"),
test_n18: ("18", "an"),
test_n19: ("19", "a"),
test_n20: ("20", "a"),
// Four-digit numbers take "a" with the default options.
test_ny1000: ("1000", "a"),
test_ny1800: ("1800", "a"),
test_ny1892: ("1892", "a"),
// Numbers starting with 11: "an" at 2, 5, 8, ... digits, "a" otherwise.
test_n11_01: ("11", "an"),
test_n11_02: ("110", "a"),
test_n11_03: ("1100", "a"),
test_n11_04: ("11000", "an"),
test_n11_05: ("110000", "a"),
test_n11_06: ("1100000", "a"),
test_n11_07: ("11000000", "an"),
test_n11_08: ("110000000", "a"),
test_n11_09: ("1100000000", "a"),
test_n11_10: ("11000000000", "an"),
test_n11_11: ("110000000000", "a"),
test_n11_12: ("1100000000000", "a"),
test_n11_13: ("11000000000000", "an"),
test_n11_14: ("110000000000000", "a"),
test_n11_15: ("1100000000000000", "a"),
test_n11_16: ("11000000000000000", "an"),
// Numbers starting with 18 follow the same digit-count pattern.
test_ns18_01: ("18", "an"),
test_ns18_02: ("180", "a"),
test_ns18_03: ("18000", "an"),
test_ns18_04: ("180000", "a"),
test_ns18_05: ("1800000", "a"),
test_ns18_06: ("18000000", "an"),
test_ns18_07: ("180000000", "a"),
test_ns18_08: ("1800000000", "a"),
test_ns18_09: ("18000000000", "an"),
test_ns18_10: ("180000000000", "a"),
test_ns18_11: ("1800000000000", "a"),
test_ns18_12: ("18000000000000", "an"),
test_ns18_13: ("180000000000000", "a"),
test_ns18_14: ("1800000000000000", "a"),
test_ns18_15: ("18000000000000000", "an"),
// Numbers starting with 8 take "an" at every length.
test_n8_01: ("8", "an"),
test_n8_02: ("80", "an"),
test_n8_03: ("800", "an"),
test_n8_04: ("8000", "an"),
test_n8_05: ("80000", "an"),
test_n8_06: ("800000", "an"),
test_n8_07: ("8000000", "an"),
test_n8_08: ("80000000", "an"),
test_n8_09: ("800000000", "an"),
// Mixed case: only title-case input gets a capitalised article.
test_mc1: ("Alien", "An"),
test_mc2: ("anteLoPe", "an"),
test_mc3: ("haiR", "a"),
test_mc4: ("HEIR", "an"),
test_mc5: ("Heir", "An"),
test_mc6: ("Ugly", "An"),
// Multi-word and hyphenated input: only the first word counts.
test_other1: ("ouija board", "a"),
test_other2: ("apple-board", "an"),
test_other3: ("honor-bound", "an"),
test_other4: ("horror-bound", "a"),
// Suffixed forms (-ly, -ed, -s, -es, 's) of listed exceptions.
test_other_s1: ("heavenly", "a"),
test_other_s2: ("honored", "an"),
test_other_s3: ("hourly", "an"),
test_other_s4: ("heirly", "an"),
test_other6: ("heiresses", "an"),
test_other6b: ("heirs", "an"),
test_other7: ("honors", "an"),
test_other8: ("heir's", "an"),
test_other9: ("horror's", "a"),
// Adverb forms of listed exceptions.
test_other_adv1: ("ubiquitously", "a"),
test_other_adv2: ("ukelele", "a"),
test_other_adv3: ("unanimously", "a"),
test_other_adv4: ("unicamerally", "a"),
test_other_adv5: ("uniquely", "a"),
test_other_adv6: ("universally", "a"),
test_other_adv7: ("urologically", "a"),
test_other_adv8: ("usefully", "a"),
test_other_adv9: ("uselessly", "a"),
test_other_adv10: ("usuriously", "a"),
}
tests_options_with_colloquial! {
// Words and acronyms give the same results as with the default options.
test_colloquial_ac1: ("EU", "an"),
test_colloquial_ac2: ("FIFA", "an"),
test_colloquial_ac3: ("MIA", "an"),
test_colloquial_ac4: ("MNM", "an"),
test_colloquial_ac5: ("UFO", "a"),
test_colloquial_ac6: ("UN", "a"),
test_colloquial_a1: ("alien", "an"),
test_colloquial_a2: ("antelope", "an"),
test_colloquial_h1: ("hair", "a"),
test_colloquial_h2: ("heir", "an"),
test_colloquial_h3: ("herb", "an"),
test_colloquial_h4: ("hotel", "a"),
test_colloquial_u1: ("umbrella", "an"),
test_colloquial_u2: ("user", "a"),
test_colloquial_n0: ("0", "a"),
test_colloquial_n1: ("1", "a"),
test_colloquial_n2: ("2", "a"),
test_colloquial_n3: ("3", "a"),
test_colloquial_n4: ("4", "a"),
test_colloquial_n5: ("5", "a"),
test_colloquial_n6: ("6", "a"),
test_colloquial_n7: ("7", "a"),
test_colloquial_n8: ("8", "an"),
test_colloquial_n9: ("9", "a"),
test_colloquial_n10: ("10", "a"),
// In colloquial mode, four-digit numbers starting with 11 or 18 take "an".
test_colloquial_ny1000: ("1000", "a"),
test_colloquial_ny1100: ("1100", "an"),
test_colloquial_ny1800: ("1800", "an"),
test_colloquial_ny1892: ("1892", "an"),
}
}