//! tipping_rs — template.rs: shared-slice extraction, template rendering,
//! and per-byte parameter masks for log messages.

use hashbrown::{HashMap, HashSet};
use rayon::prelude::*;

use crate::{
    tokenizer::{Token, Tokenizer},
    traits::Tokenize,
};

10pub fn shared_slices<'a, Iter: Iterator<Item = &'a str> + Send>(
11    iter: Iter,
12    tokenizer: &Tokenizer,
13    filter_alphabetic: bool,
14    filter_numeric: bool,
15    filter_impure: bool,
16) -> HashSet<&'a str> {
17    iter.par_bridge()
18        .map(|msg| tokenizer.tokenize(msg))
19        .map(|toks_vec| {
20            toks_vec
21                .into_iter()
22                .filter_map(|tok| match tok {
23                    Token::SpecialWhite(slice) => Some(slice),
24                    Token::Whitespace(slice) => Some(slice),
25                    Token::Symbolic(slice) => Some(slice),
26                    Token::Alphabetic(slice) if filter_alphabetic => Some(slice),
27                    Token::Numeric(slice) if filter_numeric => Some(slice),
28                    Token::Impure(slice) if filter_impure => Some(slice),
29                    _ => None,
30                })
31                .collect::<HashSet<_>>()
32        })
33        .map(Some)
34        .reduce(
35            || None,
36            |s1, s2| match (s1, s2) {
37                (None, None) => None,
38                (None, Some(s)) => Some(s),
39                (Some(s), None) => Some(s),
40                (Some(s1), Some(s2)) => Some(s1.intersection(&s2).copied().collect()),
41            },
42        )
43        .unwrap_or_default()
44}
45
46pub fn templates<'a, Iter: Iterator<Item = &'a str> + Send>(
47    iter: Iter,
48    tokenizer: &Tokenizer,
49    common_slices: &HashSet<&'a str>,
50) -> HashSet<String> {
51    iter.par_bridge()
52        .map(|msg| {
53            tokenizer
54                .tokenize(msg)
55                .into_iter()
56                .map(|tok| tok.as_str())
57                .fold(Vec::new(), |mut temp, slice| {
58                    temp.push(common_slices.contains(slice).then_some(slice));
59                    temp
60                })
61        })
62        .fold_with(HashSet::new(), |mut temp_set, temp| {
63            temp_set.insert(temp);
64            temp_set
65        })
66        .reduce(Default::default, |s1, s2| {
67            let (mut larger, smaller) = if s1.len() > s2.len() {
68                (s1, s2)
69            } else {
70                (s2, s1)
71            };
72            larger.extend(smaller);
73            larger
74        })
75        .into_iter()
76        .map(|temp| temp.into_iter().map(|tok| tok.unwrap_or("<*>")).collect())
77        .collect()
78}
79
80pub fn parameter_masks<'a, Iter: Iterator<Item = &'a str> + Send>(
81    iter: Iter,
82    tokenizer: &Tokenizer,
83    common_slices: &HashSet<&'a str>,
84) -> HashMap<String, String> {
85    iter.par_bridge()
86        .fold_with(HashMap::new(), |mut map, msg| {
87            let toks = tokenizer.tokenize(msg);
88            let mut msk_vec = String::with_capacity(msg.len());
89            let mut should_parameterize = false;
90            toks.iter()
91                .copied()
92                .enumerate()
93                .for_each(|(idx, tok)| match tok {
94                    Token::Symbolic(slice) => {
95                        if common_slices.contains(slice) {
96                            if matches!(
97                                toks.get(idx + 1),
98                                Some(Token::Whitespace(_)) | Some(Token::Symbolic(_)) | None
99                            ) || matches!(
100                                toks.get(idx + 1),
101                                Some(Token::Whitespace(_)) | Some(Token::Symbolic(_)) | None
102                            ) {
103                                msk_vec.push('0');
104                            } else if should_parameterize {
105                                msk_vec.push('1');
106                            } else {
107                                msk_vec.push('0');
108                            }
109                        } else {
110                            msk_vec.push('1');
111                        }
112                    }
113                    Token::Whitespace(_) => {
114                        msk_vec.push('0');
115                        should_parameterize = false;
116                    }
117                    Token::SpecialWhite(slice) => {
118                        (0..slice.len()).for_each(|_| msk_vec.push('0'));
119                    }
120                    Token::SpecialBlack(slice) => {
121                        (0..slice.len()).for_each(|_| msk_vec.push('1'));
122                    }
123                    _ => {
124                        let slice = tok.as_str();
125                        if !common_slices.contains(slice) || should_parameterize {
126                            (0..slice.len()).for_each(|_| msk_vec.push('1'));
127                            should_parameterize = true;
128                        } else {
129                            (0..slice.len()).for_each(|_| msk_vec.push('0'));
130                        }
131                    }
132                });
133            map.insert(msg, msk_vec);
134            map
135        })
136        .reduce(HashMap::new, |mut m1, m2| {
137            for (k, v) in m2 {
138                if !m1.contains_key(k) {
139                    m1.insert(k, v);
140                }
141            }
142            m1
143        })
144        .into_iter()
145        .map(|(k, v)| (k.to_owned(), v))
146        .collect()
147}
148
#[cfg(test)]
mod tests {
    use super::*;

    /// Slices shared by all four messages: the three common words plus the
    /// single-space whitespace token.
    #[test]
    fn test_common_words() {
        let msgs = [
            "The value is a",
            "The value is b",
            "The value is c",
            "The value is d",
        ];
        let tokenizer = Tokenizer::new(Vec::new(), Vec::new(), "".chars().collect());
        let expected = HashSet::from(["The", "value", "is", " "]);
        let computed = shared_slices(msgs.into_iter(), &tokenizer, true, false, false);
        assert_eq!(computed, expected);
    }

    /// The original test only printed the masks and could never fail; it now
    /// asserts the structural invariants guaranteed by `parameter_masks`.
    #[test]
    fn test_parameter_mask() {
        let msgs = ["The value is (val_123) ->"];
        let tokenizer = Tokenizer::new(
            Default::default(),
            Default::default(),
            "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~".chars().collect(),
        );
        let common_slices = HashSet::from(["The", "value", "is", "val", "-", ">", "(", ")", "_"]);
        let pm = parameter_masks(msgs.into_iter(), &tokenizer, &common_slices);
        // Exactly one mask per distinct input message.
        assert_eq!(pm.len(), msgs.len());
        for (k, v) in pm {
            // Keys are the original messages and masks are strictly binary.
            assert!(msgs.contains(&k.as_str()));
            assert!(v.chars().all(|c| c == '0' || c == '1'));
        }
    }
}