use indicatif::ProgressBar;
use itertools::Itertools;
use lazy_static::lazy_static;
use rayon::iter::IntoParallelRefIterator;
use rayon::iter::ParallelIterator;
use regex::Regex;
use rustrict::{Censor, Type};
use std::collections::HashSet;
use std::fs;
use std::sync::Mutex;
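
// Generates src/false_positives.txt by scanning the word lists below for
// legitimate words and two-word phrases that the censor flags.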
lazy_static! {
    // All legitimate words to scan. dictionary.txt is CRLF-delimited and
    // dictionary_extra.txt is LF-delimited; blacklisted entries are dropped.
    static ref DICTIONARY: HashSet<&'static str> = include_str!("dictionary.txt")
        .split("\r\n")
        .chain(include_str!("dictionary_extra.txt").split('\n'))
        .filter(|&word| !word.is_empty() && !is_blacklisted(word))
        .collect();
    // Short common words that are still allowed into CONCAT_DICTIONARY despite
    // the length cutoff below.
    static ref VALID_SHORT: HashSet<&'static str> =
        include_str!("dictionary_common_valid_short.txt")
            .split('\n')
            .filter(|l| !l.is_empty())
            .collect();
    // Common words used to build two-word test phrases; kept only if they are
    // neither blacklisted nor flagged by the censor on their own.
    static ref CONCAT_DICTIONARY: HashSet<&'static str> = include_str!("dictionary_common.txt")
        .split('\n')
        .filter(|&w| {
            (w.len() > 3 || VALID_SHORT.contains(w))
                && !is_blacklisted(w)
                && !is_ignore_fp(w.chars())
        })
        .collect();
    // Regexes for genuinely bad phrases: the first CSV column of profanity.csv
    // plus every non-empty line of dictionary_blacklist.txt.
    static ref BLACKLIST: Vec<Regex> = include_str!("profanity.csv")
        .split('\n')
        .filter(|l| !l.is_empty())
        .map(|l| &l[..l.find(',').unwrap()])
        .chain(
            include_str!("dictionary_blacklist.txt")
                .split('\n')
                .filter(|l| !l.is_empty())
        )
        .map(|l| Regex::new(l).unwrap())
        .collect();
}
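
/// Runs the censor with `with_ignore_false_positives(true)` and reports whether
/// `text` is analyzed as profane, offensive, sexual, or mean.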
pub fn is_ignore_fp<C: Iterator<Item = char>>(text: C) -> bool {
    Censor::new(text)
        .with_ignore_false_positives(true)
        .analyze()
        .is(Type::PROFANE | Type::OFFENSIVE | Type::SEXUAL | Type::MEAN)
}
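
/// If `word` trips the censor but is not blacklisted, returns the shortest flagged
/// sub-slice made of dictionary words (falling back to the whole input);
/// otherwise returns `None`.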
fn maybe_false_positive<C: Iterator<Item = char> + Clone>(word: C) -> Option<String> {
    if is_ignore_fp(word.clone()) {
        let word: String = word.collect();
        let word = word.as_str();
        if is_blacklisted(word) {
            // Genuinely profane, not a false positive.
            return None;
        }
        // Search for the shortest sub-slice that still trips the censor, consists of
        // dictionary words, and is not blacklisted. Indices are byte offsets; this
        // assumes the dictionary entries are ASCII.
        let index_range = 0..=word.len();
        let mut shortest_subslice = word;
        for perm in index_range.permutations(2) {
            let start = perm[0];
            let end = perm[1];
            if start >= end {
                continue;
            }
            let sub_slice = &word[start..end];
            if sub_slice.len() >= shortest_subslice.len() {
                continue;
            }
            let valid = if sub_slice.contains(' ') {
                // Two-word phrase: every space-separated part must be a dictionary word.
                sub_slice.split(' ').all(|w| DICTIONARY.contains(w))
            } else {
                DICTIONARY.contains(sub_slice)
            };
            if valid && is_ignore_fp(sub_slice.chars()) && !is_blacklisted(sub_slice) {
                shortest_subslice = sub_slice;
            }
        }
        return Some(String::from(shortest_subslice));
    }
    None
}
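
/// Collects false positives from single dictionary words and from every space-joined
/// pair of common words, then writes them, sorted, to src/false_positives.txt.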
fn main() {
    // Phase 1: scan every dictionary word on its own, in parallel.
    let progress = ProgressBar::new(DICTIONARY.len() as u64);
    let false_positives: HashSet<String> = DICTIONARY
        .par_iter()
        .filter_map(|&word| {
            progress.inc(1);
            maybe_false_positive(word.chars())
        })
        .collect();

    // Phase 2: scan every ordered pair of common words joined by a single space.
    let progress = ProgressBar::new((CONCAT_DICTIONARY.len() as u64).pow(2));
    let false_positives = Mutex::new(false_positives);
    CONCAT_DICTIONARY.par_iter().for_each(|word1| {
        for word2 in CONCAT_DICTIONARY.iter() {
            progress.inc(1);
            if let Some(false_positive) =
                maybe_false_positive(word1.chars().chain(" ".chars()).chain(word2.chars()))
            {
                false_positives.lock().unwrap().insert(false_positive);
            }
        }
    });

    // Sort so the output file is deterministic across runs.
    let mut sorted: Vec<_> = false_positives.into_inner().unwrap().into_iter().collect();
    sorted.sort();
    fs::write("src/false_positives.txt", sorted.join("\n")).unwrap();
}
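
/// Returns true if the leftmost match of any blacklist regex spans `phrase` exactly.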
fn is_blacklisted(phrase: &str) -> bool {
    BLACKLIST.iter().any(|p| {
        p.find(phrase)
            .map(|m| m.start() == 0 && m.end() == phrase.len())
            .unwrap_or(false)
    })
}
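
/// Like `is_blacklisted`, but matches against `phrase` with its trailing 's'
/// characters stripped, so simple plurals are also caught.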
#[allow(dead_code)]
fn is_sus(phrase: &str) -> bool {
    let singular = phrase.trim_end_matches('s');
    BLACKLIST.iter().any(|p| {
        p.find(singular)
            .map(|m| m.start() == 0 && m.end() == singular.len())
            .unwrap_or(false)
    })
}