1use hashbrown::{HashMap, HashSet};
2
3use rayon::prelude::*;
4
5use crate::{
6 tokenizer::{Token, Tokenizer},
7 traits::Tokenize,
8};
9
10pub fn shared_slices<'a, Iter: Iterator<Item = &'a str> + Send>(
11 iter: Iter,
12 tokenizer: &Tokenizer,
13 filter_alphabetic: bool,
14 filter_numeric: bool,
15 filter_impure: bool,
16) -> HashSet<&'a str> {
17 iter.par_bridge()
18 .map(|msg| tokenizer.tokenize(msg))
19 .map(|toks_vec| {
20 toks_vec
21 .into_iter()
22 .filter_map(|tok| match tok {
23 Token::SpecialWhite(slice) => Some(slice),
24 Token::Whitespace(slice) => Some(slice),
25 Token::Symbolic(slice) => Some(slice),
26 Token::Alphabetic(slice) if filter_alphabetic => Some(slice),
27 Token::Numeric(slice) if filter_numeric => Some(slice),
28 Token::Impure(slice) if filter_impure => Some(slice),
29 _ => None,
30 })
31 .collect::<HashSet<_>>()
32 })
33 .map(Some)
34 .reduce(
35 || None,
36 |s1, s2| match (s1, s2) {
37 (None, None) => None,
38 (None, Some(s)) => Some(s),
39 (Some(s), None) => Some(s),
40 (Some(s1), Some(s2)) => Some(s1.intersection(&s2).copied().collect()),
41 },
42 )
43 .unwrap_or_default()
44}
45
46pub fn templates<'a, Iter: Iterator<Item = &'a str> + Send>(
47 iter: Iter,
48 tokenizer: &Tokenizer,
49 common_slices: &HashSet<&'a str>,
50) -> HashSet<String> {
51 iter.par_bridge()
52 .map(|msg| {
53 tokenizer
54 .tokenize(msg)
55 .into_iter()
56 .map(|tok| tok.as_str())
57 .fold(Vec::new(), |mut temp, slice| {
58 temp.push(common_slices.contains(slice).then_some(slice));
59 temp
60 })
61 })
62 .fold_with(HashSet::new(), |mut temp_set, temp| {
63 temp_set.insert(temp);
64 temp_set
65 })
66 .reduce(Default::default, |s1, s2| {
67 let (mut larger, smaller) = if s1.len() > s2.len() {
68 (s1, s2)
69 } else {
70 (s2, s1)
71 };
72 larger.extend(smaller);
73 larger
74 })
75 .into_iter()
76 .map(|temp| temp.into_iter().map(|tok| tok.unwrap_or("<*>")).collect())
77 .collect()
78}
79
80pub fn parameter_masks<'a, Iter: Iterator<Item = &'a str> + Send>(
81 iter: Iter,
82 tokenizer: &Tokenizer,
83 common_slices: &HashSet<&'a str>,
84) -> HashMap<String, String> {
85 iter.par_bridge()
86 .fold_with(HashMap::new(), |mut map, msg| {
87 let toks = tokenizer.tokenize(msg);
88 let mut msk_vec = String::with_capacity(msg.len());
89 let mut should_parameterize = false;
90 toks.iter()
91 .copied()
92 .enumerate()
93 .for_each(|(idx, tok)| match tok {
94 Token::Symbolic(slice) => {
95 if common_slices.contains(slice) {
96 if matches!(
97 toks.get(idx + 1),
98 Some(Token::Whitespace(_)) | Some(Token::Symbolic(_)) | None
99 ) || matches!(
100 toks.get(idx + 1),
101 Some(Token::Whitespace(_)) | Some(Token::Symbolic(_)) | None
102 ) {
103 msk_vec.push('0');
104 } else if should_parameterize {
105 msk_vec.push('1');
106 } else {
107 msk_vec.push('0');
108 }
109 } else {
110 msk_vec.push('1');
111 }
112 }
113 Token::Whitespace(_) => {
114 msk_vec.push('0');
115 should_parameterize = false;
116 }
117 Token::SpecialWhite(slice) => {
118 (0..slice.len()).for_each(|_| msk_vec.push('0'));
119 }
120 Token::SpecialBlack(slice) => {
121 (0..slice.len()).for_each(|_| msk_vec.push('1'));
122 }
123 _ => {
124 let slice = tok.as_str();
125 if !common_slices.contains(slice) || should_parameterize {
126 (0..slice.len()).for_each(|_| msk_vec.push('1'));
127 should_parameterize = true;
128 } else {
129 (0..slice.len()).for_each(|_| msk_vec.push('0'));
130 }
131 }
132 });
133 map.insert(msg, msk_vec);
134 map
135 })
136 .reduce(HashMap::new, |mut m1, m2| {
137 for (k, v) in m2 {
138 if !m1.contains_key(k) {
139 m1.insert(k, v);
140 }
141 }
142 m1
143 })
144 .into_iter()
145 .map(|(k, v)| (k.to_owned(), v))
146 .collect()
147}
148
#[cfg(test)]
mod tests {
    use super::*;

    /// `shared_slices` returns exactly the tokens common to all messages
    /// (alphabetic tokens enabled; numeric/impure filtering disabled).
    #[test]
    fn test_common_words() {
        let msgs = [
            "The value is a",
            "The value is b",
            "The value is c",
            "The value is d",
        ];
        let tokenizer = Tokenizer::new(Vec::new(), Vec::new(), "".chars().collect());
        let expected = HashSet::from(["The", "value", "is", " "]);
        let computed = shared_slices(msgs.into_iter(), &tokenizer, true, false, false);
        assert_eq!(computed, expected);
    }

    /// `parameter_masks` keys the result by the original message and
    /// produces a mask made only of '0'/'1' flags. (The original test
    /// printed the mask and asserted nothing.)
    #[test]
    fn test_parameter_mask() {
        let msg = "The value is (val_123) ->";
        let tokenizer = Tokenizer::new(
            Default::default(),
            Default::default(),
            "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~".chars().collect(),
        );
        let common_slices = HashSet::from(["The", "value", "is", "val", "-", ">", "(", ")", "_"]);
        let pm = parameter_masks([msg].into_iter(), &tokenizer, &common_slices);
        assert_eq!(pm.len(), 1);
        let mask = pm.get(msg).expect("mask keyed by original message");
        assert!(mask.chars().all(|c| c == '0' || c == '1'));
    }
}