1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
use std::collections::{HashMap, HashSet};
use rand::RngExt;
use regex::Regex;
use super::AnoError;
/// Replaces a fixed set of words in a text with randomly generated tokens.
///
/// Matching is case-insensitive: the words are stored lowercased and every
/// word found in the text is lowercased before being looked up.
pub struct WordTokenizer {
/// A mapping of lowercased words to their random hexadecimal tokens.
word_token_mapping: HashMap<String, String>,
/// Pre-compiled pattern used to extract word tokens from text.
word_pattern: Regex,
}
impl WordTokenizer {
/// Creates a new instance of `WordTokenizer` that can be used to replace
/// the given words with randomly generated 16-bytes tokens.
///
/// # Arguments
///
/// * `target_words`: words to be replaced by tokens.
pub fn new(target_words: &[&str]) -> Result<Self, AnoError> {
let mut mapping = HashMap::with_capacity(target_words.len());
let mut rng = rand::rng();
for word in target_words {
let mut uuid = [0; 16];
rng.fill(&mut uuid);
mapping.insert(word.to_lowercase(), hex::encode_upper(uuid));
}
Ok(Self {
word_token_mapping: mapping,
word_pattern: Regex::new(r"\b\w+\b")
.map_err(|e| AnoError::AnonymizationError(format!("regex error: {e}")))?,
})
}
/// Remove sensitive words from a text by replacing them with tokens.
///
/// # Arguments
///
/// * `data`: a string representing the input text.
///
/// # Returns
///
/// Texts containing tokens in place of sensitive words.
#[must_use]
pub fn apply(&self, data: &str) -> String {
let result = self
.word_pattern
.replace_all(data, |caps: ®ex::Captures| {
self.word_token_mapping
.get(&caps[0].to_lowercase())
.map_or_else(|| caps[0].to_owned(), Clone::clone)
});
result.into_owned()
}
}
/// Masks a fixed set of words in a text.
///
/// Matching is case-insensitive: the words are stored lowercased and every
/// word found in the text is lowercased before being looked up.
pub struct WordMasker {
/// A set of lowercased words to be masked in the text.
word_list: HashSet<String>,
/// Pre-compiled pattern used to extract word tokens from text.
word_pattern: Regex,
}
/// Replacement text substituted for every masked word.
const MASK: &str = "XXXX";
impl WordMasker {
/// Creates a new `WordMasker` instance.
///
/// # Arguments
///
/// * `words_to_block`: A slice of string references containing the words to
/// be masked in the text.
#[must_use]
// The hardcoded literal `r"\b\w+\b"` is always a valid regex; expect() cannot panic here.
#[allow(clippy::expect_used, clippy::missing_panics_doc)]
pub fn new(words_to_block: &[&str]) -> Self {
Self {
word_list: words_to_block.iter().map(|s| s.to_lowercase()).collect(),
word_pattern: Regex::new(r"\b\w+\b").expect("hardcoded regex is valid"),
}
}
/// Masks the specified words in the given text.
///
/// # Arguments
///
/// * `data`: A string slice containing the text to be masked.
///
/// # Returns
///
/// Text without the sensitive words.
#[must_use]
pub fn apply(&self, data: &str) -> String {
let result = self
.word_pattern
.replace_all(data, |caps: ®ex::Captures| {
if self.word_list.contains(&caps[0].to_lowercase()) {
MASK.to_owned()
} else {
caps[0].to_owned()
}
});
result.into_owned()
}
}
/// Replaces every match of a regular expression in a text with a replacement
/// string.
pub struct WordPatternMasker {
/// Compiled regular expression whose matches are replaced.
pattern: Regex,
/// Replacement handed to `Regex::replace_all` for each match.
replacer: String,
}
impl WordPatternMasker {
    /// Builds a `WordPatternMasker` from a regular expression and the text
    /// that should take the place of each match.
    ///
    /// # Arguments
    ///
    /// * `pattern_regex` - Regular expression describing what to mask.
    /// * `replace_str` - Replacement inserted wherever the pattern matches.
    ///
    /// # Errors
    ///
    /// Returns an `AnoError` when `pattern_regex` cannot be compiled into a
    /// regular expression.
    pub fn new(pattern_regex: &str, replace_str: &str) -> Result<Self, AnoError> {
        let pattern = Regex::new(pattern_regex)?;
        let replacer = String::from(replace_str);
        Ok(Self { pattern, replacer })
    }

    /// Replaces every occurrence of the pattern in `data`.
    ///
    /// The replacement is given to `Regex::replace_all` as a string, so the
    /// `regex` crate's capture-group expansion syntax (`$1`, `$name`) applies
    /// to it.
    ///
    /// # Arguments
    ///
    /// * `data` - Text in which the pattern is searched.
    ///
    /// # Returns
    ///
    /// A new string where each match has been replaced.
    #[must_use]
    pub fn apply(&self, data: &str) -> String {
        let masked = self.pattern.replace_all(data, self.replacer.as_str());
        masked.into_owned()
    }
}