1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
use std::collections::BTreeSet;
use std::collections::HashMap;
use std::fs::File;
use std::io::prelude::*;
use std::io::BufReader;

use std::str::Chars;

#[macro_use]
extern crate lazy_static;

// Global sensitive-word trie (DFA), built once on first access from the
// built-in word list. Lookups go from a word's first char to nested nodes.
// NOTE(review): on modern Rust, std::sync::LazyLock could replace lazy_static.
lazy_static! {
    static ref SENSITIVE_WORD_MAP: HashMap<char, SensitiveWordMap> = {
        // Alternative: load words from a file instead of the built-in list.
        // let set = read_sensitive_word_file("sensitive-words.txt");
        let set = read_sensitive_word_map();
        build_sensitive_word_map(set)
    };
}

/// Matching strategy used when scanning text against the trie.
pub enum MatchType {
    /// Minimum match: stop at the shortest sensitive word found.
    MinMatchType,
    /// Maximum match: keep extending to the longest sensitive word found.
    MaxMatchType,
}

/// One node of the sensitive-word trie (one DFA state).
#[derive(Debug)]
struct SensitiveWordMap {
    // The character this node represents.
    word: char,
    // '1' if some sensitive word ends at this node, '0' otherwise.
    is_end: char,
    // Children keyed by the next character; always Some in built tries
    // (an empty map marks a leaf).
    word_map: Option<HashMap<char, Box<SensitiveWordMap>>>,
}

/// Reads a sensitive-word dictionary from `path`, one word per line,
/// and collects the words into a sorted, deduplicated set.
///
/// # Panics
/// Panics if the file cannot be opened or a line cannot be read.
fn read_sensitive_word_file(path: &str) -> BTreeSet<String> {
    let mut set = BTreeSet::<String>::new();
    match File::open(path) {
        Ok(f) => {
            let reader = BufReader::new(f);
            for line in reader.lines() {
                // Surface read errors with the file path for context instead
                // of a bare unwrap; also drop the per-line debug println.
                let word =
                    line.unwrap_or_else(|e| panic!("can't read line from {}: {}", path, e));
                set.insert(word);
            }
        }
        Err(e) => panic!("can't open this file :{}", e),
    }

    set
}

/// Returns the built-in sensitive-word dictionary used to seed the DFA.
fn read_sensitive_word_map() -> BTreeSet<String> {
    [
        "Fuck",
        "Bitch",
        "套",
        "套现",
        "套现王",
        "套利",
        "信用",
        "信用卡",
        "信用卡套现",
        "信用卡代还",
        "信用卡代付",
        "花呗代还",
        "T+1",
        "T1",
        "D1",
        "D+1",
        "结算",
        "结算费",
        "免结算费",
    ]
    .iter()
    .map(|word| word.to_string())
    .collect()
}

/// Recursively inserts the remaining characters of one word into the trie.
///
/// `count` holds the number of characters still to consume; when it reaches
/// zero the current node terminates the word and must carry `is_end == '1'`.
///
/// BUG FIX: the original never promoted `is_end` on a node that already
/// existed, so a short word inserted *after* a longer word sharing its
/// prefix (e.g. "套" after "套现" from an unsorted source) was silently
/// lost. Correctness used to depend on BTreeSet feeding prefixes first.
/// Also uses the entry API instead of contains_key + get_mut + insert.
fn recursive_build_map(map: &mut SensitiveWordMap, chars: &mut Chars, count: &mut usize) {
    if let Some(ch) = chars.next() {
        *count -= 1;
        if let Some(now_map) = map.word_map.as_mut() {
            // Single lookup: fetch the child node, creating it if absent.
            let node = now_map.entry(ch).or_insert_with(|| {
                Box::new(SensitiveWordMap {
                    word: ch,
                    is_end: '0',
                    word_map: Some(HashMap::<char, Box<SensitiveWordMap>>::new()),
                })
            });
            if *count == 0 {
                // The word ends here — mark terminal regardless of whether
                // the node pre-existed.
                node.is_end = '1';
            }
            recursive_build_map(node, chars, count);
        }
    }
}

/// Builds the DFA model: a trie of `SensitiveWordMap` nodes keyed by each
/// word's first character. For example, "信用卡套现" / "信用卡代还" /
/// "信用卡代付" share the path 信 → 用 → 卡 and then branch:
///
///  '信' { is_end: '0', word_map: {
///     '用' { is_end: '0', word_map: {
///        '卡' { is_end: '0', word_map: {
///           '套' { is_end: '0', word_map: { '现' { is_end: '1', word_map: {} } } },
///           '代' { is_end: '0', word_map: {
///              '付' { is_end: '1', word_map: {} },
///              '还' { is_end: '1', word_map: {} } } } } } } } } }
fn build_sensitive_word_map(set: BTreeSet<String>) -> HashMap<char, SensitiveWordMap> {
    let mut sensitive_word_map = HashMap::<char, SensitiveWordMap>::new();

    for key in set.iter() {
        let len = key.chars().count();
        let mut count = len;
        let mut key_chars = key.chars();
        // Consume the word's first character; it keys the top-level map.
        if let Some(first_char) = key_chars.next() {
            count -= 1;
            // Entry API: one lookup instead of get_mut + insert + get_mut.
            let root = sensitive_word_map
                .entry(first_char)
                .or_insert_with(|| SensitiveWordMap {
                    word: first_char,
                    is_end: if len == 1 { '1' } else { '0' },
                    word_map: Some(HashMap::<char, Box<SensitiveWordMap>>::new()),
                });
            if len == 1 {
                // A single-char word must be terminal even if the root node
                // was created earlier by a longer word sharing this prefix.
                root.is_end = '1';
            }
            // Insert the remaining characters below the root.
            recursive_build_map(root, &mut key_chars, &mut count);
        }
    }

    sensitive_word_map
}

/// Walks the trie from `swm` along `txt_vec[*i..]`, incrementing
/// `match_flag` for every matched character and recording the length of the
/// most recently completed word in `last_match_length`. With
/// `MinMatchType` it returns at the first completed word; with
/// `MaxMatchType` it keeps extending as far as the trie allows.
///
/// BUG FIX: the original also re-checked the *parent* node's `is_end` here
/// and recorded `*match_flag` for it — but `match_flag` already counted the
/// child character just matched, so the stored length overcounted by one
/// (e.g. words {"套", "套现王"} on "套现x" reported length 2 under
/// MaxMatchType). The parent's terminal state is already recorded by the
/// caller before recursing, so that check is simply removed. Under
/// MinMatchType the removed branch was unreachable, so Min behavior is
/// unchanged.
fn recursive_find_map(
    swm: &SensitiveWordMap,
    txt_vec: &[char],
    i: &mut usize,
    match_flag: &mut usize,
    last_match_length: &mut usize,
    match_type: &MatchType,
) {
    if let Some(word) = txt_vec.get(*i) {
        if let Some(wm) = &swm.word_map {
            if let Some(next_swm) = wm.get(word) {
                *match_flag += 1;

                if next_swm.is_end == '1' {
                    // A complete word ends on the character just matched.
                    *last_match_length = *match_flag;
                    if let MatchType::MinMatchType = match_type {
                        return;
                    }
                }

                if let Some(nwm) = &next_swm.word_map {
                    if nwm.is_empty() {
                        // Leaf node: nothing deeper can match, stop here.
                        *last_match_length = *match_flag;
                        return;
                    }
                }

                *i += 1;
                recursive_find_map(
                    next_swm,
                    txt_vec,
                    i,
                    match_flag,
                    last_match_length,
                    match_type,
                );
            }
        }
    }
}

/// Checks whether a sensitive word starts at char index `begin_index` of
/// `txt`. Returns the matched word's length in characters, or 0 when no
/// sensitive word begins there.
fn check_sensitive_word(txt: &str, begin_index: usize, match_type: &MatchType) -> usize {
    let mut match_flag = 0;
    let mut last_match_length = 0;
    let txt_vec: Vec<char> = txt.chars().collect();

    // Guard: index past the end of the text means no match.
    let first = match txt_vec.get(begin_index) {
        Some(c) => c,
        None => return 0,
    };

    if let Some(swm) = SENSITIVE_WORD_MAP.get(first) {
        match_flag += 1;
        if swm.is_end == '1' {
            // A single-character word matches immediately.
            last_match_length = match_flag;
            if let MatchType::MinMatchType = match_type {
                return last_match_length;
            }
        }

        // Descend into the trie for the following characters.
        let mut next_index = begin_index + 1;
        recursive_find_map(
            swm,
            &txt_vec,
            &mut next_index,
            &mut match_flag,
            &mut last_match_length,
            match_type,
        );
    }

    last_match_length
}

/// Scans `txt` and returns the set of sensitive words it contains,
/// resolved according to `match_type`.
pub fn find_sensitive_word(txt: &str, match_type: &MatchType) -> BTreeSet<String> {
    let mut found = BTreeSet::<String>::new();
    let chars: Vec<char> = txt.chars().collect();
    let total = chars.len();

    let mut idx = 0;
    while idx < total {
        let matched = check_sensitive_word(txt, idx, match_type);
        if matched > 0 {
            // Record the word and resume scanning right after it.
            found.insert(chars[idx..idx + matched].iter().collect());
            idx += matched;
        } else {
            idx += 1;
        }
    }

    found
}

/// Replaces every sensitive word found in `txt` (under `match_type`) with
/// `replace_char` repeated once per character of the word.
///
/// # Examples
/// ```ignore
/// let result = replace_sensitive_word("信用卡之家", &MatchType::MinMatchType, '*');
/// assert_eq!(result, "**卡之家");
/// ```
pub fn replace_sensitive_word(txt: &str, match_type: &MatchType, replace_char: char) -> String {
    let words = find_sensitive_word(txt, match_type);
    let mut result = String::from(txt);
    for word in words {
        // Mask length is counted in chars, not bytes, for multi-byte text.
        let mask: String = std::iter::repeat(replace_char)
            .take(word.chars().count())
            .collect();
        result = result.replace(word.as_str(), &mask);
    }

    result
}

#[test]
fn filter_sensitive_words() {
    // The original test only printed results and asserted nothing; pin the
    // expected MinMatchType masking for each input so regressions fail.
    let cases = vec![
        ("花呗信用卡代还OK套现", "花呗**卡代还OK*现"),
        ("套花呗分期代付", "*花呗分期代付"),
        ("马上套现信用卡", "马上*现**卡"),
        ("期货套利", "期货*利"),
        ("空手套白狼", "空手*白狼"),
        ("守信用卡脖子", "守**卡脖子"),
        // No dictionary word occurs here — the text must pass unchanged.
        (
            "坚定信心,同舟共济,科学防治,精准施策",
            "坚定信心,同舟共济,科学防治,精准施策",
        ),
        ("Fuck you!", "**** you!"),
        ("Son of Bitch", "Son of *****"),
    ];

    println!("replace_sensitive_word......");
    for (input, expected) in cases {
        let replaced = replace_sensitive_word(input, &MatchType::MinMatchType, '*');
        println!("{} --> {}", input, replaced);
        assert_eq!(replaced, expected);
    }
}