use unicode_segmentation::UnicodeSegmentation;
/// Coarse classification of a character, used by `split` to detect word
/// boundaries.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum CharType {
    /// Catch-all class: anything that is not uppercase, an ASCII digit or one
    /// of the separator characters below.
    Lower,
    /// A character for which `char::is_uppercase` returns `true`.
    Upper,
    /// An ASCII digit (`0`-`9`).
    Digit,
    /// The separator `_`.
    Underscore,
    /// The separator `.`.
    Period,
    /// The separator `:`.
    Colon,
}
/// One word produced by `split`, together with where it begins in the input.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Split {
    /// The text of the word; separator characters are never included.
    pub word: String,
    /// Zero-based position in the original input at which `word` begins,
    /// counted in characters (not bytes).
    pub start_char: u32,
}
pub fn split(s: &str) -> Vec<Split> {
let mut result = Vec::new();
let mut current_word = String::new();
let mut current_word_start: usize = 0;
let mut prev_char_type = None;
for (i, c) in s.graphemes(true).enumerate() {
assert!(
!c.chars().any(|ch| ch.is_whitespace()),
"There should be no white space in the input: '{}'",
s
);
let char_type = c
.chars()
.map(|ch| match ch {
ch if ch.is_uppercase() => CharType::Upper,
ch if ch.is_ascii_digit() => CharType::Digit,
'_' => CharType::Underscore,
'.' => CharType::Period,
':' => CharType::Colon,
_ => CharType::Lower,
})
.next()
.unwrap();
let should_split = match prev_char_type {
Some(CharType::Lower) if char_type == CharType::Upper => true,
Some(CharType::Upper)
if char_type == CharType::Upper && {
match s.chars().nth(i + 1) {
Some(t) => t.is_ascii_lowercase(),
None => false,
}
} =>
{
true
}
Some(prev)
if (prev != CharType::Digit && char_type == CharType::Digit)
|| (prev == CharType::Digit && char_type != CharType::Digit) =>
{
true
}
_ => {
char_type == CharType::Underscore
|| char_type == CharType::Period
|| char_type == CharType::Colon
}
};
if should_split && !current_word.is_empty() {
result.push(Split {
word: current_word.clone(),
start_char: current_word_start as u32,
});
current_word.clear();
current_word_start = i;
}
if char_type == CharType::Underscore
|| char_type == CharType::Period
|| char_type == CharType::Colon
{
current_word_start = i + 1;
} else {
current_word += c;
}
prev_char_type = Some(char_type);
}
if !current_word.is_empty() {
let start = s.chars().count() - current_word.chars().count();
result.push(Split {
word: current_word,
start_char: start as u32,
});
}
result
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `split` and keeps only the word text, discarding the offsets.
    fn words_of(input: &str) -> Vec<String> {
        split(input).into_iter().map(|part| part.word).collect()
    }

    /// Shorthand for building an expected `Split`.
    fn part(word: &str, start_char: u32) -> Split {
        Split {
            word: word.to_string(),
            start_char,
        }
    }

    /// Expected output shared by the three separator tests: the inputs differ
    /// only in which separator characters sit between the words.
    fn calculate_user_age_word() -> Vec<Split> {
        vec![
            part("calculate", 0),
            part("User", 9),
            part("Age", 14),
            part("word", 21),
        ]
    }

    #[test]
    fn test_camel_case_splitting() {
        assert_eq!(
            words_of("calculateUserAge"),
            vec!["calculate", "User", "Age"]
        );
    }

    #[test]
    fn test_camel_case_splitting_underscore() {
        let actual = split("calculateUser_Age____word__");
        assert_eq!(actual, calculate_user_age_word());
    }

    #[test]
    fn test_camel_case_splitting_period() {
        let actual = split("calculateUser.Age.._.word._");
        assert_eq!(actual, calculate_user_age_word());
    }

    #[test]
    fn test_camel_case_splitting_colon() {
        let actual = split("calculateUser:Age..:.word.:");
        assert_eq!(actual, calculate_user_age_word());
    }

    #[test]
    fn test_complex_camel_case() {
        let actual = split("XMLHttpRequest");
        let expected = vec![part("XML", 0), part("Http", 3), part("Request", 7)];
        assert_eq!(actual, expected);
    }

    #[test]
    fn test_number() {
        assert_eq!(words_of("userAge10"), vec!["user", "Age", "10"]);
    }

    #[test]
    fn test_uppercase() {
        assert_eq!(words_of("EXAMPLE"), vec!["EXAMPLE"]);
    }

    #[test]
    fn test_uppercase_first() {
        assert_eq!(words_of("Example"), vec!["Example"]);
    }

    #[test]
    fn test_unicode() {
        assert_eq!(words_of("こんにちは"), vec!["こんにちは"]);
    }
}