#[cfg(test)]
use strum::EnumIter;
use unicode_segmentation::UnicodeSegmentation;
/// A condition that marks where one word ends and the next begins.
///
/// Each variant is recognised by one of the private `detect_one`,
/// `detect_two` or `detect_three` methods, depending on how many consecutive
/// graphemes it inspects. "Uppercase" and "lowercase" are decided by
/// `grapheme_is_uppercase` / `grapheme_is_lowercase`; "digit" means every
/// `char` of the grapheme is an ASCII digit.
#[cfg_attr(test, derive(EnumIter))]
#[derive(Clone, Copy, Eq, PartialEq, Debug)]
pub enum Boundary {
/// A single `-` grapheme. `split` drops the grapheme itself.
Hyphen,
/// A single `_` grapheme. `split` drops the grapheme itself.
Underscore,
/// A single `" "` (space) grapheme. `split` drops the grapheme itself.
Space,
/// An uppercase grapheme followed by a lowercase one; the break falls
/// between them.
UpperLower,
/// A lowercase grapheme followed by an uppercase one; the break falls
/// between them.
LowerUpper,
/// A digit followed by an uppercase grapheme; the break falls between them.
DigitUpper,
/// An uppercase grapheme followed by a digit; the break falls between them.
UpperDigit,
/// A digit followed by a lowercase grapheme; the break falls between them.
DigitLower,
/// A lowercase grapheme followed by a digit; the break falls between them.
LowerDigit,
/// Two uppercase graphemes followed by a lowercase one; `split` places the
/// break between the two uppercase graphemes.
Acronym,
}
impl Boundary {
pub fn list_from(s: &str) -> Vec<Self> {
Boundary::all()
.iter()
.filter(|boundary| {
let left_iter = s.graphemes(true);
let mid_iter = s.graphemes(true).skip(1);
let right_iter = s.graphemes(true).skip(2);
let mut one_iter = left_iter.clone();
let two_iter = left_iter.clone().zip(mid_iter.clone());
let mut two_iter_and_upper = two_iter.clone().zip(std::iter::once(false).chain(
two_iter.map(|(a, b)| grapheme_is_uppercase(a) && grapheme_is_uppercase(b)),
));
let mut three_iter = left_iter.zip(mid_iter).zip(right_iter);
one_iter.any(|a| boundary.detect_one(a))
|| two_iter_and_upper
.any(|((a, b), is_acro)| boundary.detect_two(a, b) && !is_acro)
|| three_iter.any(|((a, b), c)| boundary.detect_three(a, b, c))
})
.copied()
.collect()
}
pub fn defaults() -> Vec<Self> {
use Boundary::*;
vec![
Underscore, Hyphen, Space, LowerUpper, UpperDigit, DigitUpper, DigitLower, LowerDigit,
Acronym,
]
}
pub fn delims() -> Vec<Self> {
use Boundary::*;
vec![Hyphen, Underscore, Space]
}
pub fn digits() -> Vec<Self> {
use Boundary::*;
vec![DigitUpper, UpperDigit, DigitLower, LowerDigit]
}
pub fn letter_digit() -> Vec<Self> {
use Boundary::*;
vec![UpperDigit, LowerDigit]
}
pub fn digit_letter() -> Vec<Self> {
use Boundary::*;
vec![DigitUpper, DigitLower]
}
pub fn all() -> Vec<Self> {
use Boundary::*;
vec![
Hyphen, Underscore, Space, LowerUpper, UpperLower, DigitUpper, UpperDigit, DigitLower,
LowerDigit, Acronym,
]
}
fn detect_one(&self, c: &str) -> bool {
use Boundary::*;
match self {
Hyphen => c == "-",
Underscore => c == "_",
Space => c == " ",
_ => false,
}
}
fn detect_two(&self, c: &str, d: &str) -> bool {
use Boundary::*;
match self {
UpperLower => grapheme_is_uppercase(c) && grapheme_is_lowercase(d),
LowerUpper => grapheme_is_lowercase(c) && grapheme_is_uppercase(d),
DigitUpper => grapheme_is_digit(c) && grapheme_is_uppercase(d),
UpperDigit => grapheme_is_uppercase(c) && grapheme_is_digit(d),
DigitLower => grapheme_is_digit(c) && grapheme_is_lowercase(d),
LowerDigit => grapheme_is_lowercase(c) && grapheme_is_digit(d),
_ => false,
}
}
fn detect_three(&self, c: &str, d: &str, e: &str) -> bool {
use Boundary::*;
if let Acronym = self {
grapheme_is_uppercase(c) && grapheme_is_uppercase(d) && grapheme_is_lowercase(e)
} else {
false
}
}
}
/// Returns `true` when every `char` in `c` is an ASCII digit (`0`-`9`).
///
/// An empty string yields `true`, since there is no non-digit in it.
fn grapheme_is_digit(c: &str) -> bool {
    // Any non-ASCII char encodes to bytes >= 0x80, none of which is an ASCII
    // digit, so checking bytes gives the same answer as checking chars.
    c.bytes().all(|byte| byte.is_ascii_digit())
}
/// Returns `true` when `c` is cased text that is already in its uppercase
/// form.
///
/// Text with no case distinction (digits, punctuation, the empty string)
/// yields `false`.
fn grapheme_is_uppercase(c: &str) -> bool {
    let upper = c.to_uppercase();
    if upper != c {
        return false;
    }
    // `c` equals its own uppercase form; it only counts as uppercase if
    // lowercasing would actually change it.
    c.to_lowercase() != upper
}
/// Returns `true` when `c` is cased text that is already in its lowercase
/// form.
///
/// Text with no case distinction (digits, punctuation, the empty string)
/// yields `false`.
fn grapheme_is_lowercase(c: &str) -> bool {
    let lower = c.to_lowercase();
    if lower != c {
        return false;
    }
    // `c` equals its own lowercase form; it only counts as lowercase if
    // uppercasing would actually change it.
    c.to_uppercase() != lower
}
/// Splits `s` into words at every position where one of `boundaries` matches.
///
/// * A grapheme matched as a single-grapheme delimiter ends the current word
///   and is itself discarded.
/// * A two-grapheme boundary puts the break between the two graphemes.
/// * A three-grapheme boundary puts the break between its first and second
///   graphemes.
///
/// Empty words (for example between two adjacent delimiters) are removed from
/// the result.
pub fn split<T>(s: T, boundaries: &[Boundary]) -> Vec<String>
where
    T: AsRef<str>,
{
    let graphemes: Vec<&str> = s.as_ref().graphemes(true).collect();

    let mut words: Vec<String> = Vec::new();
    let mut current = String::new();

    for (idx, &grapheme) in graphemes.iter().enumerate() {
        // Delimiters take precedence: close the word and drop the grapheme.
        if boundaries.iter().any(|b| b.detect_one(grapheme)) {
            words.push(std::mem::take(&mut current));
            continue;
        }

        let previous = if idx > 0 {
            Some(graphemes[idx - 1])
        } else {
            None
        };
        let following = graphemes.get(idx + 1).copied();

        // Does a boundary sit between the previous grapheme and this one?
        let breaks_before = match (previous, following) {
            (None, _) => false,
            (Some(prev), None) => boundaries.iter().any(|b| b.detect_two(prev, grapheme)),
            (Some(prev), Some(next)) => boundaries.iter().any(|b| {
                b.detect_two(prev, grapheme) || b.detect_three(prev, grapheme, next)
            }),
        };

        if breaks_before {
            words.push(std::mem::take(&mut current));
        }
        current.push_str(grapheme);
    }

    words.push(current);
    words.retain(|w| !w.is_empty());
    words
}
// Unit tests for boundary detection and splitting.
#[cfg(test)]
mod test {
use super::*;
use strum::IntoEnumIterator;
// Every enum variant (enumerated through the strum-derived iterator) must be
// present in the hand-written list returned by `Boundary::all()`.
#[test]
fn all_boundaries_in_iter() {
let all = Boundary::all();
for boundary in Boundary::iter() {
assert!(all.contains(&boundary));
}
}
// Hyphens, underscores and spaces each separate words and are dropped from
// the output.
#[test]
fn split_on_delims() {
assert_eq!(
vec!["my", "word", "list", "separated", "by", "delims"],
split("my_word-list separated-by_delims", &Boundary::delims())
)
}
// `list_from` reports the boundaries present in a string, in the order of
// `Boundary::all()`.
#[test]
fn boundaries_found_in_string() {
use Boundary::*;
// "." is neither upper nor lower, so only the "Aa" pair registers.
assert_eq!(vec![UpperLower], Boundary::list_from(".Aaaa"));
assert_eq!(
vec![LowerUpper, UpperLower, LowerDigit],
Boundary::list_from("a8.Aa.aA")
);
// "b1", "1B", "B1" and "1b" together cover all four digit boundaries.
assert_eq!(Boundary::digits(), Boundary::list_from("b1B1b"));
// "Aa" directly follows an uppercase pair, so UpperLower is not reported;
// the "AAa" triple is reported as Acronym instead.
assert_eq!(
vec![Hyphen, Underscore, Space, Acronym],
Boundary::list_from("AAa -_")
);
}
}