rustrict 0.1.18

rustrict is a profanity filter for Rust
Documentation
use indicatif::ProgressBar;
use itertools::Itertools;
use lazy_static::lazy_static;
use rayon::iter::IntoParallelRefIterator;
use rayon::iter::ParallelIterator;
use regex::Regex;
use rustrict::{Censor, Type};
use std::collections::HashSet;
use std::fs;
use std::sync::Mutex;

lazy_static! {
    /// Word list used to validate candidate false positives: every entry of
    /// `dictionary.txt` (split on `\r\n`) and `dictionary_extra.txt` (split on `\n`)
    /// that is neither empty nor blacklisted.
    static ref DICTIONARY: HashSet<&'static str> = {
        let primary = include_str!("dictionary.txt").split("\r\n");
        let extra = include_str!("dictionary_extra.txt").split('\n');
        primary
            .chain(extra)
            .filter(|&entry| !(entry.is_empty() || is_blacklisted(entry)))
            .collect()
    };

    /// Non-empty lines of `dictionary_common_valid_short.txt`; these words are allowed
    /// into `CONCAT_DICTIONARY` even though they are three bytes or shorter.
    static ref VALID_SHORT: HashSet<&'static str> =
        include_str!("dictionary_common_valid_short.txt")
            .split('\n')
            .filter(|line| !line.is_empty())
            .collect();

    /// Words from `dictionary_common.txt` that get paired with each other in `main`.
    /// A word is kept when it is longer than three bytes (or listed in `VALID_SHORT`),
    /// is not blacklisted, and is not flagged by `is_ignore_fp` on its own.
    static ref CONCAT_DICTIONARY: HashSet<&'static str> = include_str!("dictionary_common.txt")
        .split('\n')
        .filter(|&entry| {
            let long_enough = entry.len() > 3 || VALID_SHORT.contains(entry);
            long_enough && !is_blacklisted(entry) && !is_ignore_fp(entry.chars())
        })
        .collect();

    /// Compiled blacklist patterns: the text before the first comma of every non-empty
    /// `profanity.csv` line, followed by every non-empty `dictionary_blacklist.txt` line.
    /// Panics on first use if a CSV line has no comma or a pattern is not a valid regex.
    static ref BLACKLIST: Vec<Regex> = {
        let csv_patterns = include_str!("profanity.csv")
            .split('\n')
            .filter(|row| !row.is_empty())
            .map(|row| {
                let comma = row.find(',').unwrap();
                &row[..comma]
            });
        let extra_patterns = include_str!("dictionary_blacklist.txt")
            .split('\n')
            .filter(|line| !line.is_empty());
        csv_patterns
            .chain(extra_patterns)
            .map(|pattern| Regex::new(pattern).unwrap())
            .collect()
    };
}

/// Analyzes `text` with a [`Censor`] that has `with_ignore_false_positives(true)` set and
/// returns whether the resulting analysis is flagged as profane, offensive, sexual, or mean.
pub fn is_ignore_fp<C: Iterator<Item = char>>(text: C) -> bool {
    let flagged = Type::PROFANE | Type::OFFENSIVE | Type::SEXUAL | Type::MEAN;
    let mut censor = Censor::new(text);
    let analysis = censor.with_ignore_false_positives(true).analyze();
    analysis.is(flagged)
}

fn maybe_false_positive<C: Iterator<Item = char> + Clone>(word: C) -> Option<String> {
    if is_ignore_fp(word.clone()) {
        let word: String = word.collect();
        let word = &word[..];
        if is_blacklisted(word) {
            return None;
        }
        let index_range = 0..=word.len();
        let mut shortest_subslice = word;
        for perm in index_range.permutations(2) {
            let start = perm[0];
            let end = perm[1];

            if start >= end {
                continue;
            }

            let sub_slice = &word[start..end];

            if sub_slice.len() >= shortest_subslice.len() {
                continue;
            }

            let valid = if sub_slice.contains(' ') {
                let mut split = sub_slice.split(' ');
                split.all(|w| DICTIONARY.contains(w))
            } else {
                DICTIONARY.contains(sub_slice)
            };

            if valid && is_ignore_fp(sub_slice.chars()) && !is_blacklisted(sub_slice) {
                shortest_subslice = sub_slice;
            }
        }
        return Some(String::from(shortest_subslice));
    }
    None
}

/// Generates `src/false_positives.txt`.
///
/// Pass 1 checks every `DICTIONARY` word on its own. Pass 2 checks every ordered pair of
/// `CONCAT_DICTIONARY` words joined by a single space. All results from
/// [`maybe_false_positive`] are de-duplicated, sorted, and written one per line.
///
/// # Panics
///
/// Panics if the output file cannot be written or if a worker thread panicked while
/// holding the result lock.
fn main() {
    let progress = ProgressBar::new(DICTIONARY.len() as u64);

    let false_positives: HashSet<String> = DICTIONARY
        .par_iter()
        .filter_map(|&word| {
            progress.inc(1);
            maybe_false_positive(word.chars())
        })
        .collect();

    // The second pass visits every ordered pair, hence the squared length.
    let progress = ProgressBar::new((CONCAT_DICTIONARY.len() as u64).pow(2));

    // Shared between rayon workers; hits are rare, so the lock is seldom contended.
    let false_positives = Mutex::new(false_positives);

    CONCAT_DICTIONARY.par_iter().for_each(|word1| {
        for word2 in CONCAT_DICTIONARY.iter() {
            progress.inc(1);
            let phrase = word1.chars().chain(" ".chars()).chain(word2.chars());
            if let Some(false_positive) = maybe_false_positive(phrase) {
                false_positives.lock().unwrap().insert(false_positive);
            }
        }
    });

    let mut sorted: Vec<String> = false_positives.into_inner().unwrap().into_iter().collect();
    // Elements come from a set and are therefore unique, so stability is irrelevant.
    sorted.sort_unstable();

    fs::write("src/false_positives.txt", sorted.join("\n"))
        .expect("failed to write src/false_positives.txt");
}

/// Returns `true` when, for at least one `BLACKLIST` pattern, the match reported by
/// `Regex::find` starts at byte 0 of `phrase` and ends at its last byte.
fn is_blacklisted(phrase: &str) -> bool {
    let whole = phrase.len();
    for pattern in BLACKLIST.iter() {
        if let Some(found) = pattern.find(phrase) {
            if found.start() == 0 && found.end() == whole {
                return true;
            }
        }
    }
    false
}

#[allow(dead_code)]
fn is_sus(phrase: &str) -> bool {
    BLACKLIST.iter().any(|p| {
        p.find(phrase.trim_end_matches('s'))
            .map(|m| m.start() == 0 && m.end() == phrase.len())
            .unwrap_or(false)
    })
}