use crate::CharStringExt;
use crate::char_ext::CharExt;
use serde::{Deserialize, Serialize};
/// Orthographic properties of a word, one bit per property.
///
/// Every discriminant is a distinct power of two so the variants can be
/// combined in [`OrthFlags`], whose constants are defined by casting these
/// values. The per-variant notes describe when [`OrthFlags::from_letters`]
/// sets the matching flag.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum Orthography {
    /// There is at least one letter and none of the letters is uppercase.
    Lowercase = 1 << 0,
    /// There is more than one letter, the first letter is uppercase and no
    /// later letter is uppercase.
    Titlecase = 1 << 1,
    /// There is at least one letter and none of the letters is lowercase.
    AllCaps = 1 << 2,
    /// The first letter is lowercase and some later letter is uppercase.
    LowerCamel = 1 << 3,
    /// The first letter is uppercase and the later letters include both an
    /// uppercase and a lowercase one.
    UpperCamel = 1 << 4,
    /// The text contains at least one space.
    Multiword = 1 << 5,
    /// The text contains at least one `-`.
    Hyphenated = 1 << 6,
    /// The text contains at least one apostrophe.
    Apostrophe = 1 << 7,
    /// The text is a well-formed Roman numeral written in a single case.
    RomanNumerals = 1 << 8,
}
/// Integer type that backs `OrthFlags`; every `Orthography` discriminant is
/// cast to it. The highest discriminant is `1 << 8`, which does not fit in a `u8`.
type OrthographyFlagsUnderlyingType = u16;
bitflags::bitflags! {
/// A set of `Orthography` properties stored as bits of an
/// `OrthographyFlagsUnderlyingType`.
///
/// Each constant has the bit value of the `Orthography` variant it is cast from,
/// so several properties can be recorded for the same word at once.
#[derive(Clone, Copy, Debug, Deserialize, Eq, Hash, PartialEq, PartialOrd, Serialize)]
pub struct OrthFlags: OrthographyFlagsUnderlyingType {
// One constant per `Orthography` variant, in the enum's declaration order.
const LOWERCASE = Orthography::Lowercase as OrthographyFlagsUnderlyingType;
const TITLECASE = Orthography::Titlecase as OrthographyFlagsUnderlyingType;
const ALLCAPS = Orthography::AllCaps as OrthographyFlagsUnderlyingType;
const LOWER_CAMEL = Orthography::LowerCamel as OrthographyFlagsUnderlyingType;
const UPPER_CAMEL = Orthography::UpperCamel as OrthographyFlagsUnderlyingType;
const MULTIWORD = Orthography::Multiword as OrthographyFlagsUnderlyingType;
const HYPHENATED = Orthography::Hyphenated as OrthographyFlagsUnderlyingType;
const APOSTROPHE = Orthography::Apostrophe as OrthographyFlagsUnderlyingType;
const ROMAN_NUMERALS = Orthography::RomanNumerals as OrthographyFlagsUnderlyingType;
}
}
impl Default for OrthFlags {
fn default() -> Self {
Self::empty()
}
}
impl OrthFlags {
    /// Derives the orthography flags of a word from its characters.
    ///
    /// Punctuation flags are set as soon as the character occurs anywhere:
    /// `MULTIWORD` for a space, `HYPHENATED` for `-`, and `APOSTROPHE` for any
    /// character whose `normalized()` form is `'` (presumably this also covers
    /// typographic apostrophes; confirm against `CharExt::normalized`).
    ///
    /// Case flags only look at characters for which `is_english_lingual()` is
    /// true ("letters" below), and are only set when there is at least one:
    ///
    /// * `LOWERCASE`: no letter is uppercase.
    /// * `ALLCAPS`: no letter is lowercase.
    /// * `TITLECASE`: more than one letter, the first is uppercase and no later
    ///   letter is uppercase.
    /// * `LOWER_CAMEL`: the first letter is lowercase and a later one is uppercase.
    /// * `UPPER_CAMEL`: the first letter is uppercase and the later letters
    ///   include both a lowercase and an uppercase one.
    ///
    /// `ROMAN_NUMERALS` is set when the whole input is a single-case, well-formed
    /// Roman numeral.
    ///
    /// Several flags can be set at once; an empty input yields the empty set.
    pub fn from_letters(letters: &[char]) -> Self {
        let mut ortho_flags = Self::default();
        let mut all_lower = true;
        let mut all_upper = true;
        let mut first_is_upper = false;
        let mut first_is_lower = false;
        let mut saw_upper_after_first = false;
        let mut saw_lower_after_first = false;
        let mut is_first_char = true;
        let letter_count = letters.iter().filter(|c| c.is_english_lingual()).count();

        for &c in letters {
            // Punctuation only contributes its own flag and never takes part in
            // the case analysis (it does not count as the "first" character).
            if c == ' ' {
                ortho_flags |= Self::MULTIWORD;
                continue;
            }
            if c == '-' {
                ortho_flags |= Self::HYPHENATED;
                continue;
            }
            if c.normalized() == '\'' {
                ortho_flags |= Self::APOSTROPHE;
                continue;
            }
            if !c.is_english_lingual() {
                continue;
            }

            if c.is_lowercase() {
                all_upper = false;
                if is_first_char {
                    first_is_lower = true;
                } else {
                    saw_lower_after_first = true;
                }
            } else if c.is_uppercase() {
                all_lower = false;
                if is_first_char {
                    first_is_upper = true;
                } else {
                    saw_upper_after_first = true;
                }
            } else {
                // A letter without case: forget what the first letter was, which
                // rules out the title-case and camel-case flags from here on.
                first_is_upper = false;
                first_is_lower = false;
            }
            is_first_char = false;
        }

        if letter_count > 0 {
            if all_lower {
                ortho_flags |= Self::LOWERCASE;
            }
            if all_upper {
                ortho_flags |= Self::ALLCAPS;
            }
            // A single uppercase letter is ALLCAPS only, never TITLECASE.
            if letter_count > 1 && first_is_upper && !saw_upper_after_first {
                ortho_flags |= Self::TITLECASE;
            }
            if first_is_lower && saw_upper_after_first {
                ortho_flags |= Self::LOWER_CAMEL;
            }
            if first_is_upper && saw_lower_after_first && saw_upper_after_first {
                ortho_flags |= Self::UPPER_CAMEL;
            }
        }

        // The cheap shape check runs first so the lowercased copy is only built
        // for inputs made solely of Roman numeral letters.
        if looks_like_roman_numerals(letters) && is_really_roman_numerals(&letters.to_lower()) {
            ortho_flags |= Self::ROMAN_NUMERALS;
        }

        ortho_flags
    }
}
/// Cheap shape check: is `word` non-empty, made only of the letters
/// `m d c l x v i` (in either case), and written entirely in the case of its
/// first character?
///
/// This does not check that the letters form a valid numeral.
fn looks_like_roman_numerals(word: &[char]) -> bool {
    let is_numeral_letter = |c: char| "mdclxvi".contains(c.to_ascii_lowercase());

    match word.split_first() {
        Some((&first, rest)) if is_numeral_letter(first) => {
            let first_char_upper = first.is_uppercase();
            rest.iter()
                .all(|&c| is_numeral_letter(c) && c.is_uppercase() == first_char_upper)
        }
        // Empty input, or a first character that is not a numeral letter.
        _ => false,
    }
}
fn is_really_roman_numerals(word: &[char]) -> bool {
let s: String = word.iter().collect();
let mut chars = s.chars().peekable();
let mut m_count = 0;
while m_count < 4 && chars.peek() == Some(&'m') {
chars.next();
m_count += 1;
}
if !check_roman_group(&mut chars, 'c', 'd', 'm') {
return false;
}
if !check_roman_group(&mut chars, 'x', 'l', 'c') {
return false;
}
if !check_roman_group(&mut chars, 'i', 'v', 'x') {
return false;
}
if chars.next().is_some() {
return false;
}
true
}
/// Consumes at most one decimal-digit group of a Roman numeral from `chars`.
///
/// With `one`/`five`/`ten` being the symbols of the group (for example
/// `i`/`v`/`x`), the forms consumed are:
///
/// * `one` followed by `ten` or `five` (the subtractive pairs),
/// * `one` repeated up to three times,
/// * `five` followed by up to three `one`,
/// * nothing, when the next character starts none of the above.
///
/// Always returns `true`; a malformed numeral shows up as characters that are
/// left unconsumed, which the caller checks for.
fn check_roman_group<I: Iterator<Item = char>>(
    chars: &mut std::iter::Peekable<I>,
    one: char,
    five: char,
    ten: char,
) -> bool {
    if chars.next_if_eq(&one).is_some() {
        let took_subtractive_pair = chars
            .next_if(|&following| following == ten || following == five)
            .is_some();
        if !took_subtractive_pair {
            // One `one` is already consumed; allow two more.
            for _ in 0..2 {
                if chars.next_if_eq(&one).is_none() {
                    break;
                }
            }
        }
    } else if chars.next_if_eq(&five).is_some() {
        for _ in 0..3 {
            if chars.next_if_eq(&one).is_none() {
                break;
            }
        }
    }
    true
}
#[cfg(test)]
mod tests {
    use crate::CharString;
    use crate::dict_word_metadata::tests::md;
    use crate::dict_word_metadata_orthography::OrthFlags;

    /// The letter-case flags that the per-case tests check one by one.
    const CASE_FLAGS: [OrthFlags; 5] = [
        OrthFlags::LOWERCASE,
        OrthFlags::TITLECASE,
        OrthFlags::ALLCAPS,
        OrthFlags::LOWER_CAMEL,
        OrthFlags::UPPER_CAMEL,
    ];

    /// Flags computed directly from the characters of `s` by `from_letters`.
    fn orth_flags(s: &str) -> OrthFlags {
        let letters: CharString = s.chars().collect();
        OrthFlags::from_letters(&letters)
    }

    /// Asserts that, among `CASE_FLAGS`, exactly `expected` is set for `word`.
    fn assert_only_case_flag(word: &str, expected: OrthFlags) {
        let flags = orth_flags(word);
        for flag in CASE_FLAGS {
            assert_eq!(
                flags.contains(flag),
                flag == expected,
                "{word:?}: unexpected state of {flag:?}"
            );
        }
    }

    /// Asserts that every word in `words` has `flag` set.
    fn assert_all_have(words: &[&str], flag: OrthFlags) {
        for word in words {
            assert!(orth_flags(word).contains(flag), "{word:?} lacks {flag:?}");
        }
    }

    /// Asserts that no word in `words` has `flag` set.
    fn assert_none_have(words: &[&str], flag: OrthFlags) {
        for word in words {
            assert!(!orth_flags(word).contains(flag), "{word:?} has {flag:?}");
        }
    }

    #[test]
    fn test_lowercase_flags() {
        assert_only_case_flag("hello", OrthFlags::LOWERCASE);
        // Digits do not stop a word from being lowercase.
        assert!(orth_flags("hello123").contains(OrthFlags::LOWERCASE));
    }

    #[test]
    fn test_titlecase_flags() {
        assert_only_case_flag("Hello", OrthFlags::TITLECASE);
        assert_all_have(&["World", "Something"], OrthFlags::TITLECASE);
        // Inner capitals and single letters are not title case.
        assert_none_have(&["McDonald", "O'Reilly", "A"], OrthFlags::TITLECASE);
    }

    #[test]
    fn test_allcaps_flags() {
        assert_only_case_flag("HELLO", OrthFlags::ALLCAPS);
        assert_all_have(&["NASA", "I"], OrthFlags::ALLCAPS);
    }

    #[test]
    fn test_lower_camel_flags() {
        assert_only_case_flag("helloWorld", OrthFlags::LOWER_CAMEL);
        assert_all_have(&["getHTTPResponse", "eBay"], OrthFlags::LOWER_CAMEL);
        assert_none_have(&["hello", "HelloWorld"], OrthFlags::LOWER_CAMEL);
    }

    #[test]
    fn test_upper_camel_flags() {
        assert_only_case_flag("HelloWorld", OrthFlags::UPPER_CAMEL);
        assert_all_have(
            &["HttpRequest", "McDonald", "O'Reilly", "XMLHttpRequest"],
            OrthFlags::UPPER_CAMEL,
        );
        assert_none_have(&["Hello", "NASA", "Hi"], OrthFlags::UPPER_CAMEL);
    }

    #[test]
    fn test_roman_numeral_flags() {
        assert_all_have(
            &[
                "MCMXCIV",
                "mdccclxxi",
                "MMXXI",
                "mcmxciv",
                "MCMXCIV",
                "MMI",
                "MMXXV",
            ],
            OrthFlags::ROMAN_NUMERALS,
        );
    }

    #[test]
    fn test_single_roman_numeral_flags() {
        assert_all_have(&["i"], OrthFlags::ROMAN_NUMERALS);
    }

    #[test]
    fn empty_string_is_not_roman_numeral() {
        assert_none_have(&[""], OrthFlags::ROMAN_NUMERALS);
    }

    #[test]
    fn dont_allow_mixed_case_roman_numerals() {
        assert_none_have(&["MCMlxxxVIII"], OrthFlags::ROMAN_NUMERALS);
    }

    #[test]
    fn dont_allow_looks_like_but_isnt_roman_numeral() {
        assert_none_have(&["mdxlivx", "XIXIVV"], OrthFlags::ROMAN_NUMERALS);
    }

    // The tests below read `orth_info` from the metadata returned by `md`
    // (presumably a dictionary lookup; see `dict_word_metadata::tests`), so the
    // flags describe the lexeme rather than the casing of the queried string.

    #[test]
    fn australia_lexeme_is_titlecase_even_when_word_is_lowercase() {
        let metadata = md("australia");
        assert!(metadata.orth_info.contains(OrthFlags::TITLECASE));
    }

    #[test]
    fn australia_lexeme_is_titlecase_even_when_word_is_all_caps() {
        let metadata = md("AUSTRALIA");
        assert!(metadata.orth_info.contains(OrthFlags::TITLECASE));
    }

    #[test]
    fn australia_lexeme_is_titlecase_even_when_word_is_mixed_case() {
        let metadata = md("AuStrAlIA");
        assert!(metadata.orth_info.contains(OrthFlags::TITLECASE));
    }

    // NOTE(review): the name mentions `kw`, but only `db` is looked up here.
    #[test]
    fn db_and_kw_symbols_are_lower_camel_case() {
        let metadata = md("db");
        assert!(metadata.orth_info.contains(OrthFlags::LOWER_CAMEL));
    }

    #[test]
    fn am_is_lowercase_and_titlecase_and_all_caps() {
        let metadata = md("am");
        for flag in [
            OrthFlags::LOWERCASE,
            OrthFlags::TITLECASE,
            OrthFlags::ALLCAPS,
        ] {
            assert!(metadata.orth_info.contains(flag));
        }
    }

    #[test]
    fn reading_is_both_lowercase_and_titlecase() {
        let metadata = md("reading");
        for flag in [OrthFlags::LOWERCASE, OrthFlags::TITLECASE] {
            assert!(metadata.orth_info.contains(flag));
        }
    }

    #[test]
    fn ebay_and_esim_are_lower_camel() {
        for word in ["ebay", "esim"] {
            let metadata = md(word);
            assert!(metadata.orth_info.contains(OrthFlags::LOWER_CAMEL));
        }
    }
}