use lazy_static::lazy_static;
use punkt_n::params::Standard;
use punkt_n::{SentenceTokenizer, TrainingData};
use regex::Regex;
use std::borrow::Cow;
// Process-wide values that are built once, on first access, by `lazy_static!`.
lazy_static! {
/// Case-insensitive pattern anchored at the start of the input: optional
/// leading whitespace followed by one of the listed short function words
/// (`and`, `as`, `at`, `in`, `on`, `of`, `for`, `to`, `with`, `by`, `from`,
/// `over`, `under`, `or`, `like`) ending at a word boundary.
/// Note that `a[st]` and `[io]n` already cover `at`, `in` and `on`, so the
/// later explicit alternatives for those words are redundant but harmless.
pub static ref PREPOSITIONS: Regex = Regex::new(r"(?i)^\s*(and|a[st]|[io]n|of|for|to|with|by|at|from|in|on|over|under|or|like)\b").unwrap();
/// Matches an input made up entirely of seven or more lowercase letters
/// (Unicode general category `Ll`), with nothing else before or after.
pub static ref LONG_LOWER_CASE: Regex = Regex::new(r"^\p{Ll}{7,}$").unwrap();
/// Training data obtained from `TrainingData::english()`; handed to the
/// sentence tokenizer in `tokenize_and_clean`.
static ref TOKENIZATION_DATA: TrainingData = TrainingData::english();
/// Matches a run of one or more whitespace characters; used to collapse such
/// runs into a single space.
static ref WHITESPACE_REGEX: Regex = Regex::new(r"\s+").unwrap();
}
/// Returns `true` when `candidate` passes every one of the following filters:
///
/// * it is between 2 and 12 characters long (inclusive), measured in Unicode
///   scalar values rather than UTF-8 bytes;
/// * it consists of at most two whitespace-separated words;
/// * it is not matched by [`LONG_LOWER_CASE`] (a single run of seven or more
///   lowercase letters);
/// * its first character is alphanumeric;
/// * when it is not exactly one word, it is not matched by [`PREPOSITIONS`];
/// * it contains at least one alphabetic character.
///
/// Any failed filter yields `false`.
pub fn conditions(candidate: &str) -> bool {
    // `str::len` reports bytes, which would let a lone non-ASCII letter such
    // as "é" (2 bytes) slip past the lower bound and would reject short
    // non-Latin candidates whose letters take 2+ bytes each. Count characters
    // so the bound means the same thing for every script.
    let length = candidate.chars().count();
    if !(2..=12).contains(&length) {
        return false;
    }

    let word_count = candidate.split_whitespace().count();
    if word_count > 2 {
        return false;
    }

    if LONG_LOWER_CASE.is_match(candidate) {
        return false;
    }

    // Leading punctuation or whitespace disqualifies the candidate outright.
    if !candidate.starts_with(char::is_alphanumeric) {
        return false;
    }

    // Single words are exempt from the function-word check.
    if word_count != 1 && PREPOSITIONS.is_match(candidate) {
        return false;
    }

    candidate.chars().any(char::is_alphabetic)
}
/// Splits `text` into sentences and yields each one with surrounding
/// whitespace removed.
///
/// A sentence that contains no newline is returned as a borrowed, trimmed
/// slice of `text`. A sentence that does contain a newline is trimmed and has
/// every whitespace run replaced by a single space, which requires an owned
/// `String`.
pub fn tokenize_and_clean<'a>(text: &'a str) -> impl Iterator<Item = Cow<'a, str>> + 'a {
    let sentences = SentenceTokenizer::<Standard>::new(text, &TOKENIZATION_DATA);
    sentences.map(|sentence| {
        let trimmed = sentence.trim();
        // The newline test deliberately looks at the untrimmed sentence.
        if !sentence.contains('\n') {
            return Cow::Borrowed(trimmed);
        }
        let collapsed = WHITESPACE_REGEX.replace_all(trimmed, " ");
        Cow::Owned(collapsed.into_owned())
    })
}