rustrict/
lib.rs

1#![cfg_attr(test, feature(test))]
2#![cfg_attr(doc, feature(doc_cfg))]
3
4#[cfg(feature = "censor")]
5pub(crate) mod banned;
6#[cfg(feature = "censor")]
7pub(crate) mod buffer_proxy_iterator;
8#[cfg(feature = "censor")]
9pub(crate) mod censor;
10#[cfg(feature = "censor")]
11pub(crate) mod feature_cell;
12#[cfg(feature = "censor")]
13pub(crate) mod mtch;
14#[cfg(feature = "censor")]
15pub(crate) mod replacements;
16#[cfg(feature = "censor")]
17pub(crate) mod trie;
18#[cfg(feature = "censor")]
19pub(crate) mod typ;
20
21#[cfg(feature = "context")]
22pub(crate) mod context;
23
24#[cfg(feature = "pii")]
25mod pii;
26#[cfg(feature = "width")]
27pub(crate) mod width;
28
29#[cfg(feature = "censor")]
30pub use banned::Banned;
31#[cfg(feature = "censor")]
32pub use replacements::Replacements;
33#[cfg(feature = "censor")]
34pub use trie::Trie;
35
36#[cfg(feature = "width")]
37pub use width::{trim_to_width, width, width_str, width_str_max_unbroken, WordBreak};
38
39#[cfg(feature = "censor")]
40pub use typ::Type;
41
42#[cfg(feature = "censor")]
43pub use censor::{Censor, CensorIter, CensorStr};
44
45// Facilitate experimentation with different hash collections.
46#[cfg(feature = "censor")]
47pub(crate) type Map<K, V> = rustc_hash::FxHashMap<K, V>;
48
49#[cfg(feature = "censor")]
50pub(crate) type Set<V> = rustc_hash::FxHashSet<V>;
51
52#[cfg(feature = "customize")]
53#[allow(deprecated)]
54pub use censor::add_word;
55
56#[cfg(all(feature = "context", feature = "width"))]
57pub use context::ContextWordBreakOptions;
58#[cfg(feature = "context")]
59pub use context::{
60    BlockReason, Context, ContextProcessingOptions, ContextRateLimitOptions,
61    ContextRepetitionLimitOptions,
62};
63
64#[cfg(feature = "pii")]
65pub use pii::censor_and_analyze_pii;
66
67/// Trims whitespace characters from both ends of a string, according to the definition of
68/// `crate::is_whitespace`.
69pub fn trim_whitespace(s: &str) -> &str {
70    // Some characters are effectively whitespace if they are at the beginning of a string.
71    // https://www.compart.com/en/unicode/U+0488
72    // https://www.compart.com/en/unicode/U+0489
73    s.trim_start_matches(|c| is_whitespace(c) || matches!(c, '\u{0488}' | '\u{0489}'))
74        .trim_end_matches(is_whitespace)
75}
76
77/// Returns true iff the character is effectively whitespace. The definition of whitespace is broader
78/// than that of Unicode, because it includes control characters and a few additional blank characters.
79pub fn is_whitespace(c: char) -> bool {
80    use finl_unicode::categories::CharacterCategories;
81
82    // NOTE: The following characters are not detected by standard means but show up as blank.
83
84    // https://www.compart.com/en/unicode/U+115F
85    // https://www.compart.com/en/unicode/U+1160
86    // https://www.compart.com/en/unicode/U+2800
87    // https://www.compart.com/en/unicode/U+3164
88    // https://www.compart.com/en/unicode/U+FFA0
89    c.is_whitespace()
90        || c.is_other()
91        || c.is_format()
92        || matches!(
93            c,
94            '\u{115F}'
95                | '\u{1160}'
96                | '\u{20DD}'
97                | '\u{2800}'
98                | '\u{3164}'
99                | '\u{FFA0}'
100                | '\u{FFFC}'
101        )
102}
103
#[cfg(test)]
mod tests {
    // Nothing in this module refers to the `test` crate by name, so the unused-import lint is
    // silenced for the `extern crate` line below.
    #![allow(unused_imports)]
    extern crate test;

    /// Exercises `crate::trim_whitespace` on standard whitespace, the extra blank characters,
    /// the start-only U+0488/U+0489 rule, and a few boundary inputs.
    #[test]
    fn trim_whitespace() {
        // Boundary inputs: nothing to trim, and interior whitespace is preserved.
        assert_eq!(crate::trim_whitespace(""), "");
        assert_eq!(crate::trim_whitespace("abc"), "abc");
        assert_eq!(crate::trim_whitespace(" a b "), "a b");

        // General. Covers U+1680, U+2003, U+2029 and U+202F in addition to the other spaces,
        // separators and zero-width characters.
        assert_eq!(crate::trim_whitespace("\u{0020}\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{200B}\u{200C}\u{200D}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}"), "");

        // Extra cases.
        assert_eq!(
            crate::trim_whitespace(" \u{1160} \u{2800} abc \u{3164} \u{FFA0} \t \u{115F} \n "),
            "abc"
        );

        // Special cases: U+0488 and U+0489 are stripped from the start but kept at the end.
        assert_eq!(
            crate::trim_whitespace(
                "\u{FFF9}\u{FFFA}\u{FFFB}\u{FFFC}\u{0488}\u{1160}\u{0489}\u{1160}\u{0488}\u{1160}\u{0489}abc\u{0488}\u{0489}"
            ),
            "abc\u{0488}\u{0489}"
        );

        // A string made only of the start-only characters is consumed entirely by the
        // leading trim.
        assert_eq!(crate::trim_whitespace("\u{0488}\u{0489}"), "");
    }

    /// Exercises `crate::is_whitespace` on ordinary whitespace, every explicitly listed blank
    /// character, and characters that must not be classified as whitespace.
    #[test]
    fn is_whitespace() {
        // Ordinary whitespace.
        for c in [' ', '\t', '\n', '\u{00A0}', '\u{2003}', '\u{3000}'] {
            assert!(crate::is_whitespace(c), "expected U+{:04X} to be whitespace", c as u32);
        }

        // Zero-width / annotation characters that the trimming test relies on.
        for c in ['\u{200B}', '\u{FFF9}'] {
            assert!(crate::is_whitespace(c), "expected U+{:04X} to be whitespace", c as u32);
        }

        // Characters that are listed explicitly in `crate::is_whitespace`.
        for c in [
            '\u{115F}', '\u{1160}', '\u{20DD}', '\u{2800}', '\u{3164}', '\u{FFA0}', '\u{FFFC}',
        ] {
            assert!(crate::is_whitespace(c), "expected U+{:04X} to be whitespace", c as u32);
        }

        // Visible text is not whitespace, and neither are U+0488/U+0489: those are only
        // stripped by `crate::trim_whitespace`, and only at the start of a string.
        for c in ['a', '\u{0488}', '\u{0489}'] {
            assert!(!crate::is_whitespace(c), "expected U+{:04X} not to be whitespace", c as u32);
        }
    }
}
136
137doc_comment::doctest!("../README.md");