use unicode_segmentation::UnicodeSegmentation;
/// Returns `true` when every byte of the grapheme is an ASCII digit (`0`-`9`).
///
/// Non-ASCII text is encoded with bytes outside the ASCII range, so checking
/// bytes is equivalent to checking characters here.
fn grapheme_is_digit(c: &&str) -> bool {
    c.bytes().all(|byte| byte.is_ascii_digit())
}
/// Returns `true` when the grapheme is cased and already in its uppercase form.
///
/// A grapheme whose uppercase and lowercase forms are identical (digits,
/// punctuation, ...) has no case and is never reported as uppercase.
fn grapheme_is_uppercase(c: &&str) -> bool {
    let upper = c.to_uppercase();
    upper == **c && upper != c.to_lowercase()
}
/// Returns `true` when the grapheme is cased and already in its lowercase form.
///
/// A grapheme whose uppercase and lowercase forms are identical (digits,
/// punctuation, ...) has no case and is never reported as lowercase.
fn grapheme_is_lowercase(c: &&str) -> bool {
    let lower = c.to_lowercase();
    lower == **c && lower != c.to_uppercase()
}
/// A rule describing where a string should be split into words.
///
/// `split` evaluates `condition` at every grapheme position of its input.
/// When the condition holds at grapheme `i`, the word boundary begins at
/// grapheme `i + start` and swallows `len` graphemes, which are dropped from
/// the output.
///
/// Two boundaries are considered equal when their `name`s are equal; the
/// remaining fields do not take part in equality or hashing.
#[derive(Debug, Eq, Clone, Copy)]
pub struct Boundary {
    /// Identifier of the boundary; the sole basis for equality and hashing.
    pub name: &'static str,
    /// Predicate called with the graphemes from the current position to the
    /// end of the input, plus `arg`. Returns `true` when the boundary matches.
    pub condition: fn(&[&str], Option<&'static str>) -> bool,
    /// Optional extra value handed to `condition` on every call.
    pub arg: Option<&'static str>,
    /// Offset, in graphemes, from the matching position to where the boundary begins.
    pub start: usize,
    /// Number of graphemes the boundary consumes (0 for a pure split point).
    pub len: usize,
}

impl PartialEq for Boundary {
    fn eq(&self, other: &Self) -> bool {
        self.name == other.name
    }
}

// Hashing must agree with `PartialEq`: values that compare equal have to
// produce the same hash, so only `name` is fed to the hasher. Deriving `Hash`
// would also hash the function pointer and offsets, letting equal boundaries
// land in different buckets of a `HashMap`/`HashSet`.
impl std::hash::Hash for Boundary {
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        std::hash::Hash::hash(self.name, state);
    }
}
impl Boundary {
    /// Matches a single space, which is removed from the output.
    pub const SPACE: Boundary = Boundary {
        name: "Space",
        condition: |s, _| s.first() == Some(&" "),
        arg: None,
        start: 0,
        len: 1,
    };

    /// Matches a single hyphen `-`, which is removed from the output.
    pub const HYPHEN: Boundary = Boundary {
        name: "Hyphen",
        condition: |s, _| s.first() == Some(&"-"),
        arg: None,
        start: 0,
        len: 1,
    };

    /// Matches a single underscore `_`, which is removed from the output.
    pub const UNDERSCORE: Boundary = Boundary {
        name: "Underscore",
        condition: |s, _| s.first() == Some(&"_"),
        arg: None,
        start: 0,
        len: 1,
    };

    /// Splits between a lowercase grapheme and the uppercase grapheme that
    /// follows it, e.g. `lowerUpper` -> `lower`, `Upper`. Nothing is removed.
    pub const LOWER_UPPER: Boundary = Boundary {
        name: "LowerUpper",
        condition: |s, _| {
            s.first().map_or(false, grapheme_is_lowercase)
                && s.get(1).map_or(false, grapheme_is_uppercase)
        },
        arg: None,
        start: 1,
        len: 0,
    };

    /// Splits between an uppercase grapheme and the lowercase grapheme that
    /// follows it. Nothing is removed. Not part of [`Boundary::defaults`].
    pub const UPPER_LOWER: Boundary = Boundary {
        name: "UpperLower",
        condition: |s, _| {
            s.first().map_or(false, grapheme_is_uppercase)
                && s.get(1).map_or(false, grapheme_is_lowercase)
        },
        arg: None,
        start: 1,
        len: 0,
    };

    /// Matches two uppercase graphemes followed by a lowercase one and splits
    /// after the first, e.g. `XMLRequest` -> `XML`, `Request`.
    pub const ACRONYM: Boundary = Boundary {
        name: "Acronym",
        condition: |s, _| {
            s.first().map_or(false, grapheme_is_uppercase)
                && s.get(1).map_or(false, grapheme_is_uppercase)
                && s.get(2).map_or(false, grapheme_is_lowercase)
        },
        arg: None,
        start: 1,
        len: 0,
    };

    /// Splits between a lowercase grapheme and the ASCII digit that follows it.
    pub const LOWER_DIGIT: Boundary = Boundary {
        name: "LowerDigit",
        condition: |s, _| {
            s.first().map_or(false, grapheme_is_lowercase)
                && s.get(1).map_or(false, grapheme_is_digit)
        },
        arg: None,
        start: 1,
        len: 0,
    };

    /// Splits between an uppercase grapheme and the ASCII digit that follows it.
    pub const UPPER_DIGIT: Boundary = Boundary {
        name: "UpperDigit",
        condition: |s, _| {
            s.first().map_or(false, grapheme_is_uppercase)
                && s.get(1).map_or(false, grapheme_is_digit)
        },
        arg: None,
        start: 1,
        len: 0,
    };

    /// Splits between an ASCII digit and the lowercase grapheme that follows it.
    pub const DIGIT_LOWER: Boundary = Boundary {
        name: "DigitLower",
        condition: |s, _| {
            s.first().map_or(false, grapheme_is_digit)
                && s.get(1).map_or(false, grapheme_is_lowercase)
        },
        arg: None,
        start: 1,
        len: 0,
    };

    /// Splits between an ASCII digit and the uppercase grapheme that follows it.
    pub const DIGIT_UPPER: Boundary = Boundary {
        name: "DigitUpper",
        condition: |s, _| {
            s.first().map_or(false, grapheme_is_digit)
                && s.get(1).map_or(false, grapheme_is_uppercase)
        },
        arg: None,
        start: 1,
        len: 0,
    };

    /// Builds a boundary that matches the literal delimiter `delim` and
    /// removes it from the output. The boundary is named after the delimiter.
    ///
    /// `len` is measured in graphemes by `split`, not in bytes, so the
    /// delimiter's Unicode scalar values are counted here. Grapheme
    /// segmentation is not available in a `const fn`; the count is therefore
    /// exact for delimiters whose graphemes are single code points (all ASCII
    /// delimiters, and symbols such as `→`), and is an upper bound for
    /// delimiters containing multi-code-point grapheme clusters.
    pub const fn from_delim(delim: &'static str) -> Boundary {
        // Reports whether the concatenation of `graphemes` begins with
        // `prefix`, walking the pieces instead of allocating a joined string
        // for every position that `split` probes.
        fn starts_with_delim(graphemes: &[&str], prefix: &str) -> bool {
            let mut remaining = prefix;
            for grapheme in graphemes {
                if grapheme.len() >= remaining.len() {
                    // The rest of the prefix fits inside this grapheme.
                    return grapheme.starts_with(remaining);
                }
                match remaining.strip_prefix(*grapheme) {
                    Some(rest) => remaining = rest,
                    None => return false,
                }
            }
            remaining.is_empty()
        }

        // Count every byte that is not a UTF-8 continuation byte
        // (0b10xx_xxxx), i.e. one per Unicode scalar value.
        let bytes = delim.as_bytes();
        let mut scalar_count = 0;
        let mut i = 0;
        while i < bytes.len() {
            if (bytes[i] & 0b1100_0000) != 0b1000_0000 {
                scalar_count += 1;
            }
            i += 1;
        }

        Boundary {
            name: delim,
            arg: Some(delim),
            // `arg` is always `Some` for boundaries built here; a missing
            // argument is treated as "no match" rather than a panic.
            condition: |s, arg| arg.map_or(false, |delim| starts_with_delim(s, delim)),
            start: 0,
            len: scalar_count,
        }
    }

    /// The boundaries applied by default: the three delimiter boundaries,
    /// `LOWER_UPPER`, `ACRONYM`, and the four letter/digit boundaries.
    /// `UPPER_LOWER` is not included.
    pub const fn defaults() -> [Boundary; 9] {
        [
            Boundary::SPACE,
            Boundary::HYPHEN,
            Boundary::UNDERSCORE,
            Boundary::LOWER_UPPER,
            Boundary::ACRONYM,
            Boundary::LOWER_DIGIT,
            Boundary::UPPER_DIGIT,
            Boundary::DIGIT_LOWER,
            Boundary::DIGIT_UPPER,
        ]
    }

    /// All four boundaries between a letter and a digit, in either order.
    pub const fn digits() -> [Boundary; 4] {
        [
            Boundary::LOWER_DIGIT,
            Boundary::UPPER_DIGIT,
            Boundary::DIGIT_LOWER,
            Boundary::DIGIT_UPPER,
        ]
    }

    /// The boundaries where a letter is followed by a digit.
    pub const fn letter_digit() -> [Boundary; 2] {
        [Boundary::LOWER_DIGIT, Boundary::UPPER_DIGIT]
    }

    /// The boundaries where a digit is followed by a letter.
    pub const fn digit_letter() -> [Boundary; 2] {
        [Boundary::DIGIT_LOWER, Boundary::DIGIT_UPPER]
    }

    /// Returns the default boundaries that actually occur in `pattern`, in the
    /// order given by [`Boundary::defaults`].
    ///
    /// A boundary counts as present when splitting `pattern` on it alone
    /// yields anything other than the single, unchanged input string.
    pub fn defaults_from(pattern: &str) -> Vec<Boundary> {
        Boundary::defaults()
            .iter()
            .copied()
            .filter(|&boundary| {
                let parts = split(&pattern, &[boundary]);
                parts.len() != 1 || parts[0] != pattern
            })
            .collect()
    }
}
/// Splits `s` into words wherever one of `boundaries` matches.
///
/// The input is walked grapheme by grapheme. At each position the boundaries
/// are tried in order and the first usable match wins: the text since the
/// previous boundary becomes a word, and the graphemes covered by the
/// boundary (`len` graphemes beginning `start` graphemes after the position)
/// are dropped. Empty words are removed from the result, so an empty input
/// yields an empty vector.
///
/// A match that would begin inside text already consumed by an earlier
/// boundary is ignored. This happens with delimiters longer than one
/// grapheme when they overlap, e.g. `"::"` against `"a:::b"`, which yields
/// `["a", ":b"]`.
pub fn split<'s, T>(s: &'s T, boundaries: &[Boundary]) -> Vec<&'s str>
where
    T: AsRef<str>,
{
    let s = s.as_ref();
    if s.is_empty() {
        return Vec::new();
    }

    let (indices, graphemes): (Vec<_>, Vec<_>) = s.grapheme_indices(true).unzip();
    // Byte offset standing in for any grapheme index past the last grapheme.
    let end_of_input = s.len();

    let mut words = Vec::new();
    let mut last_boundary_end = 0;

    for i in 0..graphemes.len() {
        for boundary in boundaries {
            if !(boundary.condition)(&graphemes[i..], boundary.arg) {
                continue;
            }

            let first_grapheme = i + boundary.start;
            let boundary_byte_start = indices
                .get(first_grapheme)
                .copied()
                .unwrap_or(end_of_input);

            // Slicing `last_boundary_end..boundary_byte_start` would panic if
            // the match starts inside an already-consumed delimiter, so such
            // overlapping matches are not treated as boundaries.
            if boundary_byte_start < last_boundary_end {
                continue;
            }

            let boundary_byte_end = indices
                .get(first_grapheme + boundary.len)
                .copied()
                .unwrap_or(end_of_input);

            words.push(&s[last_boundary_end..boundary_byte_start]);
            last_boundary_end = boundary_byte_end;
            break;
        }
    }

    // Whatever follows the final boundary is the last word.
    words.push(&s[last_boundary_end..]);
    words.retain(|word| !word.is_empty());
    words
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Splits `input` on `boundaries` and asserts the resulting words.
    fn assert_words(input: &str, boundaries: &[Boundary], expected: &[&str]) {
        assert_eq!(split(&input, boundaries), expected);
    }

    // Single-character delimiters are removed from the output.

    #[test]
    fn hyphen() {
        assert_words("a-b-c", &[Boundary::HYPHEN], &["a", "b", "c"]);
    }

    #[test]
    fn underscore() {
        assert_words("a_b_c", &[Boundary::UNDERSCORE], &["a", "b", "c"]);
    }

    #[test]
    fn space() {
        assert_words("a b c", &[Boundary::SPACE], &["a", "b", "c"]);
    }

    #[test]
    fn delimiters() {
        assert_words(
            "aaa-bbb_ccc ddd ddd-eee",
            &[Boundary::SPACE, Boundary::UNDERSCORE, Boundary::HYPHEN],
            &["aaa", "bbb", "ccc", "ddd", "ddd", "eee"],
        );
    }

    // Case-based boundaries split without removing any text.

    #[test]
    fn lower_upper() {
        assert_words(
            "lowerUpperUpper",
            &[Boundary::LOWER_UPPER],
            &["lower", "Upper", "Upper"],
        );
    }

    #[test]
    fn acronym() {
        assert_words("XMLRequest", &[Boundary::ACRONYM], &["XML", "Request"]);
    }

    #[test]
    fn boundaries_found_in_string() {
        // No default boundary occurs in this input.
        let none: Vec<Boundary> = Vec::new();
        assert_eq!(none, Boundary::defaults_from(".Aaaa"));

        let lower_then_other = vec![Boundary::LOWER_UPPER, Boundary::LOWER_DIGIT];
        assert_eq!(lower_then_other, Boundary::defaults_from("a8.Aa.aA"));

        let all_digit_boundaries = Boundary::digits().to_vec();
        assert_eq!(all_digit_boundaries, Boundary::defaults_from("b1B1b"));

        let delimiters_and_acronym = vec![
            Boundary::SPACE,
            Boundary::HYPHEN,
            Boundary::UNDERSCORE,
            Boundary::ACRONYM,
        ];
        assert_eq!(delimiters_and_acronym, Boundary::defaults_from("AAa -_"));
    }

    #[test]
    fn boundary_consts_same() {
        assert_eq!(Boundary::SPACE, Boundary::SPACE);
    }

    // Custom delimiters built with `from_delim`.

    #[test]
    fn from_delim_dot() {
        let dot = Boundary::from_delim(".");
        assert_words("lower.Upper.Upper", &[dot], &["lower", "Upper", "Upper"]);
    }

    #[test]
    fn from_delim_double_colon() {
        let double_colon = Boundary::from_delim("::");
        assert_words(
            "lower::lowerUpper::Upper",
            &[double_colon],
            &["lower", "lowerUpper", "Upper"],
        );
    }
}