1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#![cfg_attr(test, feature(test))]
#![cfg_attr(doc, feature(doc_cfg))]

#[cfg(feature = "censor")]
pub(crate) mod banned;
#[cfg(feature = "censor")]
pub(crate) mod buffer_proxy_iterator;
#[cfg(feature = "censor")]
pub(crate) mod censor;
#[cfg(feature = "censor")]
pub(crate) mod feature_cell;
#[cfg(feature = "censor")]
pub(crate) mod mtch;
#[cfg(feature = "censor")]
pub(crate) mod replacements;
#[cfg(feature = "censor")]
pub(crate) mod trie;
#[cfg(feature = "censor")]
pub(crate) mod typ;

#[cfg(feature = "context")]
pub(crate) mod context;

#[cfg(feature = "pii")]
mod pii;
#[cfg(feature = "width")]
pub(crate) mod width;

#[cfg(feature = "censor")]
pub use banned::Banned;
#[cfg(feature = "censor")]
pub use replacements::Replacements;
#[cfg(feature = "censor")]
pub use trie::Trie;

#[cfg(feature = "width")]
pub use width::{trim_to_width, width, width_str, width_str_max_unbroken, WordBreak};

#[cfg(feature = "censor")]
pub use typ::Type;

#[cfg(feature = "censor")]
pub use censor::{Censor, CensorIter, CensorStr};

// Facilitate experimentation with different hash collections: crate code can refer to the
// `Map` and `Set` aliases below, so the concrete hash collection is named in exactly one place.

/// Crate-internal hash map alias; currently `rustc_hash::FxHashMap`.
#[cfg(feature = "censor")]
pub(crate) type Map<K, V> = rustc_hash::FxHashMap<K, V>;

/// Crate-internal hash set alias; currently `rustc_hash::FxHashSet`.
#[cfg(feature = "censor")]
pub(crate) type Set<V> = rustc_hash::FxHashSet<V>;

#[cfg(feature = "customize")]
#[allow(deprecated)]
pub use censor::add_word;

#[cfg(feature = "context")]
pub use context::{
    BlockReason, Context, ContextProcessingOptions, ContextRateLimitOptions,
    ContextRepetitionLimitOptions,
};
#[cfg(all(feature = "context", feature = "width"))]
pub use context::ContextWordBreakOptions;

#[cfg(feature = "pii")]
pub use pii::censor_and_analyze_pii;

/// Strips blank characters from the front and back of `s` and returns the remaining slice.
///
/// "Blank" is decided by [`is_whitespace`], which is broader than the Unicode definition.
/// At the start of the string only, U+0488 and U+0489 are stripped as well.
pub fn trim_whitespace(s: &str) -> &str {
    // U+0488 and U+0489 are effectively whitespace when they begin a string, so they are
    // removed from the front; at the end of the string they are left in place.
    // https://www.compart.com/en/unicode/U+0488
    // https://www.compart.com/en/unicode/U+0489
    fn is_leading_blank(c: char) -> bool {
        matches!(c, '\u{0488}' | '\u{0489}') || is_whitespace(c)
    }

    let without_prefix = s.trim_start_matches(is_leading_blank);
    without_prefix.trim_end_matches(is_whitespace)
}

/// Reports whether `c` should be treated as whitespace.
///
/// The definition is intentionally broader than Unicode's own whitespace property. A character
/// qualifies if any of the following holds:
///
/// * `char::is_whitespace` accepts it;
/// * `finl_unicode` reports it as `is_other` (this is what brings in control characters);
/// * it is one of a few additional code points that show up as blank.
pub fn is_whitespace(c: char) -> bool {
    use finl_unicode::categories::CharacterCategories;

    match c {
        // NOTE: These characters are not detected by standard means but show up as blank.
        // https://www.compart.com/en/unicode/U+115F
        // https://www.compart.com/en/unicode/U+1160
        // https://www.compart.com/en/unicode/U+2800
        // https://www.compart.com/en/unicode/U+3164
        // https://www.compart.com/en/unicode/U+FFA0
        '\u{115F}' | '\u{1160}' | '\u{2800}' | '\u{3164}' | '\u{FFA0}' => true,
        _ => c.is_whitespace() || c.is_other(),
    }
}

#[cfg(test)]
mod tests {
    #![allow(unused_imports)]
    extern crate test;

    /// Runs `crate::trim_whitespace` over a table of `(input, expected)` pairs.
    #[test]
    fn trim_whitespace() {
        let cases = [
            // General: a string made up solely of blank characters trims down to nothing.
            (
                "\u{0020}\u{00A0}\u{2000}\u{2001}\u{2002}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{200B}\u{200C}\u{200D}\u{2028}\u{205F}\u{3000}",
                "",
            ),
            // Extra cases: the additional blank code points are removed on both sides.
            (
                " \u{1160} \u{2800} abc \u{3164} \u{FFA0} \t \u{115F} \n ",
                "abc",
            ),
            // Special cases: U+0488 / U+0489 are removed at the start but kept at the end.
            (
                "\u{0488}\u{1160}\u{0489}\u{1160}\u{0488}\u{1160}\u{0489}abc\u{0488}\u{0489}",
                "abc\u{0488}\u{0489}",
            ),
        ];

        for &(input, expected) in cases.iter() {
            assert_eq!(crate::trim_whitespace(input), expected);
        }
    }

    /// Spot-checks `crate::is_whitespace` on two blank characters and one ordinary letter.
    #[test]
    fn is_whitespace() {
        for &blank in [' ', '\u{2800}'].iter() {
            assert!(crate::is_whitespace(blank));
        }
        assert!(!crate::is_whitespace('a'));
    }
}

// Hands the README to `doc_comment::doctest!` so that its code examples are exercised as
// doctests alongside the rest of the crate's tests.
doc_comment::doctest!("../README.md");