// censor/lib.rs

#![warn(missing_docs)]

/*!
This crate implements a simple but powerful profanity filter.

While this filter can still be technically subverted, the goal is that by
the time a profanity gets past the filter, it barely resembles the original word.
This is done by subverting common profanity filter workarounds, such as inserting
spaces or special characters in between letters (`F_U_C_K`) or using similar-looking
characters in the place of others (`SH!T`).

Keep in mind though, that this filter is far from perfect. If people *really* want
to swear, they can get through this filter.

# Usage
The [`Censor`] enum is the main object used for censoring strings.
It is essentially a set of words to be filtered out. The [`Standard`]
variant contains words that most people consider to be swear words, and is meant to be a good
baseline for a filter. More sets and individual words can be added with the `+` and `+=`
operators, and sets and words can be removed with the `-` and `-=` operators.

```
use censor::*;

let censor = Censor::Standard;

// Use `Censor::check` to check if a string contains a profanity
assert!(censor.check("fuck"));
assert!(censor.check("FUCK"));
assert!(censor.check("FuCk"));
assert!(censor.check("fμ¢κ"));
assert!(censor.check("f!u!c!k"));
assert!(censor.check("F_u c_K"));
assert!(censor.check("fuuuuuuuck"));

assert!(!censor.check("fluff truck"));
assert!(!censor.check("fukushima"));

// Use `Censor::censor` to censor a string with asterisks
assert_eq!("*_*_*_*_*", censor.censor("₱_û_$_$_¥"));
assert_eq!("**** that ****, dude", censor.censor("fuck that shit, dude"));
assert_eq!("******* yoouuu", censor.censor("fuuuuck yoouuu"));

// Use `Censor::replace` to replace censored words with any grawlix string
assert_eq!("What the !@#$?", censor.replace("What the fuck?", "!@#$%"));

// You can combine `Censor`s and add your own words
let censor = Standard + Zealous + Sex + "dong";

assert_eq!(
    "Woops, I dropped my monster ******, that I use for my magnum ****",
    censor.censor("Woops, I dropped my monster condom, that I use for my magnum dong")
);

// You can remove words from `Censor`s too
let censor = Standard - "ass";
assert!(!censor.check("I don't care if people say 'ass'"));

// Overlapping censored words are fully censored
let censor = Standard + Sex;
assert_eq!("**********", censor.censor("shititties"));
assert_eq!("*************", censor.censor("blowjoboobies"))
```
*/

use std::{
    collections::{BTreeSet, HashMap, HashSet},
    ops::{Add, AddAssign, Sub, SubAssign},
};

use once_cell::sync::Lazy;

// Lookup table mapping "look-alike" characters to the canonical lowercase
// ASCII letter they resemble (e.g. '$' -> 's', 'μ' -> 'u'), plus ASCII
// uppercase -> lowercase. Used by `alias` to normalize text before matching.
static CHAR_ALIASES: Lazy<HashMap<char, char>> = Lazy::new(|| {
    let mut map = HashMap::new();
    // Distance between an ASCII uppercase letter and its lowercase form
    const CASE_DIFF: u8 = b'a' - b'A';
    // Alias every ASCII uppercase letter to its lowercase equivalent
    for c in b'A'..=b'Z' {
        map.insert(c as char, (c + CASE_DIFF) as char);
    }
    // alias!('x' => 'a', 'b', ...) registers each listed character as an
    // alias that normalizes to 'x'
    macro_rules! alias {
        ($reduced:literal => $($alias:literal),*) => {
            $(map.insert($alias, $reduced);)*
        };
    }
    alias!('a' => '4', '@', 'À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'à', 'á', 'â', 'ã', 'ä', 'å', 'α', 'Α');
    alias!('b' => 'ß', 'Β', '฿');
    alias!('c' => '¢', 'ç', 'Ç', '©');
    alias!('d' => 'Ð', '₫');
    alias!('e' => '3', '£', '€', 'È', 'É', 'Ê', 'Ë', 'è', 'é', 'ê', 'ë', 'ε', 'Ε', 'Ξ', 'Σ');
    alias!('g' => '6');
    alias!('h' => 'Η');
    alias!('k' => 'κ', 'Κ');
    alias!('i' => '1', '|', '!', 'Ì', 'Í', 'Î', 'Ï', 'ì', 'í', 'î', 'ï', 'Ι');
    alias!('m' => 'Μ');
    alias!('n' => 'ñ', 'Ñ', 'η', 'Ν', 'Π');
    alias!('o' => '0', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', 'ò', 'ó', 'ô', 'õ', 'ö', 'Ø', 'ø', 'θ', 'ο', 'σ', 'Θ', 'Ο', 'Φ');
    alias!('p' => 'ρ', 'Ρ', '₱', '℗', 'Þ', 'þ');
    alias!('r' => '®');
    alias!('s' => '5', '$');
    alias!('t' => 'τ', 'Τ');
    alias!('u' => 'Ù', 'Ú', 'Û', 'Ü', 'ù', 'ú', 'û', 'ü', 'μ', 'υ');
    alias!('v' => 'ν');
    alias!('w' => 'ω', '₩');
    alias!('x' => '×', 'χ', 'Χ');
    alias!('y' => '¥', 'Ý', 'ý', 'ÿ', 'γ', 'Υ');
    alias!('z' => '2', 'Ζ');
    map
});

/// Defines a documented, lazily-initialized word set.
///
/// Expands to a `pub static $name: Lazy<HashSet<String>>` whose rustdoc
/// includes `$doc` followed by the full word list.
macro_rules! word_set {
    ($doc:literal, $name:ident, $($word:literal),*) => {
        #[doc = $doc]
        #[doc = ""]
        #[doc = "#### List"]
        $(
            #[doc = $word]
            #[doc = ""]
        )*
        pub static $name: Lazy<HashSet<String>> = Lazy::new(|| {
            // Collect the literal word list straight into the set instead of
            // inserting by index.
            [$($word),*]
                .iter()
                .map(|&word| String::from(word))
                .collect()
        });
    };
}

// The baseline list backing `Censor::Standard`.
word_set!(
    "Words that are profanities by most people's definition",
    STANDARD_WORDS,
    "ass",
    "asshole",
    "bitch",
    "cock",
    "cunt",
    "fag",
    "fagot",
    "faggot",
    "fuck",
    "nigger",
    "piss",
    "pussy",
    "shit",
    "twat",
    "whore"
);
// Mild words, backing `Censor::Zealous`; meant to be added to another set.
word_set!(
    "Words that are profanities only to the zealous",
    ZEALOUS_WORDS,
    "crap",
    "damn",
    "goddamn",
    "hell",
    "suck"
);
// Sexual vocabulary, backing `Censor::Sex`; meant to be added to another set.
word_set!(
    "Words related to sex",
    SEX_WORDS,
    "ass",
    "asshole",
    "blowjob",
    "boob",
    "boobie",
    "boobies",
    "boobjob",
    "breast",
    "clitoris",
    "cock",
    "condom",
    "cunnilingus",
    "cunt",
    "dick",
    "doggystyle",
    "ejaculate",
    "felate",
    "felatio",
    "fetish",
    "foreskin",
    "handjob",
    "labia",
    "masturbate",
    "masturbation",
    "masterbate",
    "masterbation",
    "penis",
    "pussy",
    "rimjob",
    "semen",
    "sex",
    "tits",
    "tittie",
    "titties",
    "titty",
    "twat",
    "vagina",
    "vulva"
);

/**
A collection of words to censor
*/
// `Eq` is derived; `PartialEq` is implemented manually below by comparing
// the underlying word sets, so e.g. `Standard == Custom(<same words>)`.
#[derive(Debug, Clone, Eq)]
pub enum Censor {
    /**
    Standard swear words

    For more information, see [`STANDARD_WORDS`]
    */
    Standard,
    /**
    Words related to sex

    Not usually used by itself

    For more information, see [`SEX_WORDS`]
    */
    Sex,
    /**
    Words that are profanities only to the zealous

    Not usually used by itself

    For more information, see [`ZEALOUS_WORDS`]
    */
    Zealous,
    /// A custom set of words
    Custom(HashSet<String>),
}

// Re-export the variants so callers can write `Standard` instead of
// `Censor::Standard` (as the crate-level examples do).
pub use Censor::*;

233impl Default for Censor {
234    fn default() -> Self {
235        Standard
236    }
237}
238
impl Censor {
    /// Create an empty `Censor`
    pub fn empty() -> Self {
        Custom(HashSet::new())
    }
    /// Create a `Censor::Custom`
    pub fn custom<I, W>(words: I) -> Self
    where
        I: IntoIterator<Item = W>,
        W: Into<String>,
    {
        Custom(words.into_iter().map(Into::into).collect())
    }
    /// Check if a string contains censored words
    pub fn check(&self, text: &str) -> bool {
        !self.bad_chars(text, 0, 0).is_empty()
    }
    /// Count the number of censored words in a string
    ///
    /// # Example
    /// ```
    /// use censor::*;
    ///
    /// let censor = Censor::Standard;
    ///
    /// assert_eq!(0, censor.count("dog"));
    /// assert_eq!(1, censor.count("motherfucker"));
    /// assert_eq!(2, censor.count("bitch ass guy"));
    /// ```
    pub fn count(&self, text: &str) -> usize {
        let bad_chars = self.bad_chars(text, 0, 0);
        let mut count = 0;
        // Walk the character indices; each maximal run of censored indices
        // counts as one word.
        let mut in_censored = false;
        for i in 0..text.chars().count() {
            if bad_chars.contains(&i) {
                if !in_censored {
                    // Entering a new censored run
                    in_censored = true;
                    count += 1;
                }
            } else {
                in_censored = false;
            }
        }
        count
    }
    /// Replace censored words in the string with asterisks (`*`s)
    pub fn censor(&self, text: &str) -> String {
        self.replace(text, "*")
    }
    /**
    Replace censored words in the string with characters from a 'grawlix' string (#?!@$)

    # Panics
    Panics if the grawlix string is empty
    */
    #[track_caller]
    pub fn replace(&self, text: &str, grawlix: &str) -> String {
        self.replace_with_offsets(text, grawlix, 0, 0)
    }
    /**
    Replace censored words in the string with characters from a 'grawlix' string (#?!@$)

    Characters at indices within the given offsets from the start and end of words will not be censored

    # Panics
    Panics if the grawlix string is empty
    */
    #[track_caller]
    pub fn replace_with_offsets(
        &self,
        text: &str,
        grawlix: &str,
        start_offset: usize,
        end_offset: usize,
    ) -> String {
        if grawlix.is_empty() {
            panic!("grawlix is empty");
        }
        let graw_chars: Vec<char> = grawlix.chars().collect();
        // Index of the next grawlix character to emit; cycles through the
        // grawlix string across censored characters.
        let mut graw_offset: usize = 0;

        let bad_chars = self.bad_chars(text, start_offset, end_offset);
        text.chars()
            .enumerate()
            .map(|(i, c)| {
                if bad_chars.contains(&i) {
                    // Censored position: emit the next grawlix char, wrapping
                    let graw = graw_chars[graw_offset];
                    graw_offset = (graw_offset + 1) % graw_chars.len();
                    graw
                } else {
                    c
                }
            })
            .collect()
    }
    /// Get a set of the indices of characters in the given string that
    /// are part of censored words
    ///
    /// The text is matched in three normalized forms and the results are
    /// unioned: alphabetic-only, aliased + whitespace-removed, and
    /// aliased + alphabetic-only. Each normalization carries a map back to
    /// the original character indices.
    pub fn bad_chars(&self, text: &str, start_offset: usize, end_offset: usize) -> HashSet<usize> {
        let lowercase = text.to_lowercase();
        // Distinct censored-word lengths, iterated longest-first in `_bad_chars`
        let sizes: BTreeSet<usize> = self.list().map(|s| s.len()).collect();
        // Check just alpha
        let (alphanum_only, alphanum_map) = remove_non_alpha(&lowercase);
        let bad_alphanum_chars = self._bad_chars(
            &alphanum_only,
            &alphanum_map,
            &sizes,
            start_offset,
            end_offset,
        );
        // Check aliased then without whitespace
        let (alias_ws, alias_ws_map) = remove_whitespace(&alias(&lowercase));
        let bad_alias_ws_chars =
            self._bad_chars(&alias_ws, &alias_ws_map, &sizes, start_offset, end_offset);
        // Check aliased then just alpha
        let (alias_alphanum, alias_alphanum_map) = remove_non_alpha(&alias(&lowercase));
        let bad_alias_alphanum_chars = self._bad_chars(
            &alias_alphanum,
            &alias_alphanum_map,
            &sizes,
            start_offset,
            end_offset,
        );
        // Union sets
        bad_alphanum_chars
            .into_iter()
            .chain(bad_alias_ws_chars)
            .chain(bad_alias_alphanum_chars)
            .collect()
    }
    /// Core matching over one normalized form of the input.
    ///
    /// `text` is the normalized text, `map` translates indices in `text`
    /// back to character indices in the original string, and `sizes` is the
    /// set of distinct censored-word lengths. Words are matched both against
    /// `text` itself and against a run-length-deduplicated copy of it (so
    /// repeated letters like "fuuuck" still match).
    ///
    /// NOTE(review): `match_indices` yields *byte* offsets while `map` and
    /// `dd_map` are keyed by *character* indices; these coincide only while
    /// the normalized text is ASCII — presumably ensured by aliasing for the
    /// built-in lists, but worth confirming for custom non-ASCII words.
    fn _bad_chars(
        &self,
        text: &str,
        map: &HashMap<usize, usize>,
        sizes: &BTreeSet<usize>,
        start_offset: usize,
        end_offset: usize,
    ) -> HashSet<usize> {
        let (deduped, dd_map) = dedup_string(text);
        let mut set = HashSet::new();
        // Longest words first, so larger overlapping matches are recorded
        for &size in sizes.iter().rev() {
            for word in self.list().filter(|s| s.len() == size) {
                // Direct matches against the normalized text; offsets trim
                // censoring at the word's edges
                for (i, _) in text.match_indices(word.as_str()) {
                    for j in start_offset..word.len().saturating_sub(end_offset) {
                        let k = i + j;
                        if let Some(k) = map.get(&k) {
                            set.insert(*k);
                        }
                    }
                }
                // Matches against the deduplicated text; each deduped index
                // expands via `dd_map` to all original indices it covers
                for (i, _) in deduped.match_indices(word.as_str()) {
                    for j in start_offset..word.len().saturating_sub(end_offset) {
                        let k = i + j;
                        if let Some(ls) = dd_map.get(&k) {
                            for l in ls {
                                if let Some(k) = map.get(l) {
                                    set.insert(*k);
                                }
                            }
                        }
                    }
                }
            }
        }
        set
    }
    /// Get a reference to the set used by the `Censor`
    pub fn set(&self) -> &HashSet<String> {
        match self {
            Standard => &STANDARD_WORDS,
            Zealous => &ZEALOUS_WORDS,
            Sex => &SEX_WORDS,
            Custom(words) => words,
        }
    }
    /// Get an iterator over all censored words
    pub fn list(&self) -> std::collections::hash_set::Iter<String> {
        self.set().iter()
    }
    /// Find a censored word in the `Censor`. Applies character aliases
    pub fn find(&self, word: &str) -> Option<&str> {
        let word = alias(word);
        self.set().get(&word).map(|w| w.as_str())
    }
    /// Check if the `Censor` contains a word. Applies character aliases
    pub fn contains(&self, word: &str) -> bool {
        self.find(word).is_some()
    }
}

428impl AddAssign for Censor {
429    fn add_assign(&mut self, other: Self) {
430        *self = Censor::Custom(self.set().union(other.set()).cloned().collect());
431    }
432}
433
434impl PartialEq for Censor {
435    fn eq(&self, other: &Self) -> bool {
436        self.set() == other.set()
437    }
438}
439
440impl<S> AddAssign<S> for Censor
441where
442    S: Into<String>,
443{
444    fn add_assign(&mut self, other: S) {
445        *self = Censor::Custom(self.list().cloned().chain(Some(other.into())).collect());
446    }
447}
448
449impl SubAssign for Censor {
450    fn sub_assign(&mut self, other: Self) {
451        *self = Censor::Custom(self.set().difference(other.set()).cloned().collect());
452    }
453}
454
455impl<S> SubAssign<S> for Censor
456where
457    S: Into<String>,
458{
459    fn sub_assign(&mut self, other: S) {
460        let other = other.into();
461        *self = Censor::Custom(self.list().filter(|&s| s != &other).cloned().collect());
462    }
463}
464
465impl Add for Censor {
466    type Output = Censor;
467    fn add(mut self, other: Self) -> Self::Output {
468        self += other;
469        self
470    }
471}
472
473impl<S> Add<S> for Censor
474where
475    S: Into<String>,
476{
477    type Output = Censor;
478    fn add(mut self, other: S) -> Self::Output {
479        self += other;
480        self
481    }
482}
483
484impl Sub for Censor {
485    type Output = Censor;
486    fn sub(mut self, other: Self) -> Self::Output {
487        self -= other;
488        self
489    }
490}
491
492impl<S> Sub<S> for Censor
493where
494    S: Into<String>,
495{
496    type Output = Censor;
497    fn sub(mut self, other: S) -> Self::Output {
498        self -= other;
499        self
500    }
501}
502
503fn alias(text: &str) -> String {
504    text.chars()
505        .map(|c| CHAR_ALIASES.get(&c).copied().unwrap_or(c))
506        .collect()
507}
508
/// Strip all whitespace characters from `text`.
///
/// Returns the stripped string together with a map from each character
/// index in the stripped string to the corresponding character index in
/// the original, so matches can be translated back.
fn remove_whitespace(text: &str) -> (String, HashMap<usize, usize>) {
    let mut stripped = String::new();
    let mut index_map = HashMap::new();
    let mut stripped_index = 0;
    for (original_index, c) in text.chars().enumerate() {
        if c.is_whitespace() {
            continue;
        }
        stripped.push(c);
        index_map.insert(stripped_index, original_index);
        stripped_index += 1;
    }
    (stripped, index_map)
}

/// Strip every non-alphabetic character from `text`.
///
/// Returns the stripped string together with a map from each character
/// index in the stripped string to the corresponding character index in
/// the original, so matches can be translated back.
fn remove_non_alpha(text: &str) -> (String, HashMap<usize, usize>) {
    let mut stripped = String::new();
    let mut index_map = HashMap::new();
    let mut stripped_index = 0;
    for (original_index, c) in text.chars().enumerate() {
        if !c.is_alphabetic() {
            continue;
        }
        stripped.push(c);
        index_map.insert(stripped_index, original_index);
        stripped_index += 1;
    }
    (stripped, index_map)
}

/// Collapse runs of repeated characters into one (`"fuuuck"` -> `"fuck"`).
///
/// Returns the deduplicated string together with a map from each character
/// index in the deduplicated string to all original indices that collapsed
/// into it, so matches against the deduplicated text can be mapped back.
fn dedup_string(s: &str) -> (String, HashMap<usize, Vec<usize>>) {
    let mut last = None;
    let mut res = String::new();
    let mut map = HashMap::new();
    let mut j = 0;
    for (i, c) in s.chars().enumerate() {
        if last != Some(c) {
            // New character: push it and map its deduped index to `i`
            res.push(c);
            map.entry(j).or_insert_with(Vec::new).push(i);
            j += 1;
        } else {
            // Repeated character: attribute it to the deduped character it
            // collapsed into, at index `j - 1` (always valid — a repeat can
            // never be the first character). The previous code used `j`,
            // i.e. one *past* the collapsed character, which shifted
            // offset-based censoring and silently dropped trailing repeats
            // (e.g. the final repeated letters of "fuckkk" were mapped to a
            // deduped index that no match could ever reach).
            map.entry(j - 1).or_insert_with(Vec::new).push(i);
        }
        last = Some(c);
    }
    (res, map)
}