use std::{collections::BTreeSet, marker::PhantomData};

use hashbrown::{HashMap, HashSet};
use rayon::prelude::*;

use fancy_regex::Regex;

use crate::{
    graph::{anchor_nodes, build_graph},
    template::{parameter_masks, shared_slices, templates},
    token_filter::StaticFilter,
    token_record::TokenRecord,
    tokenizer::{Token, Tokenizer},
    traits::Tokenize,
};

type Clusters = Vec<Option<usize>>;
type Templates = Vec<std::collections::HashSet<String>>;
type Masks = std::collections::HashMap<String, String>;

pub struct NoCompute;
pub struct Compute;

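/// Clusters messages by their anchor tokens and, depending on the typestate
/// markers, also derives per-cluster templates and parameter masks.
///
/// The two type parameters are compile-time markers ([`NoCompute`] /
/// [`Compute`]) toggled by `compute_templates` and `compute_masks`; they
/// select which `parse` impl is available and therefore what the call
/// returns. Note that the parameter names shadow the module-level
/// `Templates` and `Masks` type aliases used as `parse` return types.
///
/// A usage sketch (marked `ignore`; the exact import path depends on how the
/// crate re-exports `Parser`, and the messages are arbitrary placeholders):
///
/// ```ignore
/// let (clusters, templates, masks) = Parser::new()
///     .with_threshold(0.6)
///     .compute_templates()
///     .compute_masks()
///     .parse(&["user 1 logged in", "user 2 logged in"]);
/// ```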
#[derive(Debug, Clone)]
pub struct Parser<Templates = NoCompute, Masks = NoCompute> {
    threshold: f32,
    special_whites: Vec<Regex>,
    special_blacks: Vec<Regex>,
    symbols: HashSet<char>,
    filter_alphabetic: bool,
    filter_numeric: bool,
    filter_impure: bool,
    compute_templates: PhantomData<Templates>,
    compute_mask: PhantomData<Masks>,
}

impl Default for Parser {
    fn default() -> Self {
        Self::new()
    }
}

impl Parser {
    pub fn new() -> Self {
        Parser {
            threshold: 0.5,
            special_whites: Default::default(),
            special_blacks: Default::default(),
            symbols: Default::default(),
            filter_alphabetic: true,
            filter_numeric: false,
            filter_impure: false,
            compute_templates: Default::default(),
            compute_mask: Default::default(),
        }
    }

    #[must_use]
    pub fn with_threshold(mut self, value: f32) -> Self {
        assert!(0.0 <= value);
        assert!(value <= 1.0);
        self.threshold = value;
        self
    }

    #[must_use]
    pub fn with_special_whites(mut self, value: Vec<Regex>) -> Self {
        self.special_whites = value;
        self
    }

    #[must_use]
    pub fn with_special_blacks(mut self, value: Vec<Regex>) -> Self {
        self.special_blacks = value;
        self
    }

    #[must_use]
    pub fn with_symbols(mut self, value: HashSet<char>) -> Self {
        self.symbols = value;
        self
    }

    #[must_use]
    pub fn with_filter_alphabetic(mut self, value: bool) -> Self {
        self.filter_alphabetic = value;
        self
    }

    #[must_use]
    pub fn with_filter_numeric(mut self, value: bool) -> Self {
        self.filter_numeric = value;
        self
    }

    #[must_use]
    pub fn with_filter_impure(mut self, value: bool) -> Self {
        self.filter_impure = value;
        self
    }
}

impl<T> Parser<NoCompute, T> {
    #[must_use]
    pub fn compute_templates(self) -> Parser<Compute, T> {
        Parser::<Compute, T> {
            threshold: self.threshold,
            special_whites: self.special_whites,
            special_blacks: self.special_blacks,
            symbols: self.symbols,
            filter_alphabetic: self.filter_alphabetic,
            filter_numeric: self.filter_numeric,
            filter_impure: self.filter_impure,
            compute_templates: Default::default(),
            compute_mask: Default::default(),
        }
    }
}

impl<T> Parser<T, NoCompute> {
    #[must_use]
    pub fn compute_masks(self) -> Parser<T, Compute> {
        Parser::<T, Compute> {
            threshold: self.threshold,
            special_whites: self.special_whites,
            special_blacks: self.special_blacks,
            symbols: self.symbols,
            filter_alphabetic: self.filter_alphabetic,
            filter_numeric: self.filter_numeric,
            filter_impure: self.filter_impure,
            compute_templates: Default::default(),
            compute_mask: Default::default(),
        }
    }
}

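// The four `parse` impls below share the same clustering pipeline (tokenize,
// record token statistics, group messages by anchor-token sets) and differ
// only in which artifacts they additionally compute: templates, parameter
// masks, both, or neither.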
impl Parser<NoCompute, NoCompute> {
    pub fn parse<Message: AsRef<str> + Sync>(self, messages: &[Message]) -> Clusters {
        let tokenizer = Tokenizer::new(self.special_whites, self.special_blacks, self.symbols);
        let filter = StaticFilter::with(
            self.filter_alphabetic,
            self.filter_numeric,
            self.filter_impure,
        );
        // Token occurrence and dependency statistics over the whole corpus.
        let idep = TokenRecord::new(messages, &tokenizer, &filter);
        // Group message indices by their anchor-token sets.
        let cmap = group_by_anchor_tokens(messages, &tokenizer, &idep, self.threshold);
        let mut clus = vec![None; messages.len()];
        // Each non-empty anchor set becomes a cluster; messages whose anchor
        // set is empty keep `None`.
        cmap.into_iter()
            .filter(|(anchor_toks, _)| !anchor_toks.is_empty())
            .enumerate()
            .for_each(|(cid, (_, indices))| {
                for idx in indices {
                    clus[idx] = Some(cid);
                }
            });
        clus
    }
}

impl Parser<Compute, NoCompute> {
    pub fn parse<Message: AsRef<str> + Sync>(self, messages: &[Message]) -> (Clusters, Templates) {
        let tokenizer = Tokenizer::new(self.special_whites, self.special_blacks, self.symbols);
        let filter = StaticFilter::with(
            self.filter_alphabetic,
            self.filter_numeric,
            self.filter_impure,
        );
        let idep = TokenRecord::new(messages, &tokenizer, &filter);
        let cmap = group_by_anchor_tokens(messages, &tokenizer, &idep, self.threshold);
        let mut clus = vec![None; messages.len()];
        let mut temps = vec![HashSet::default(); cmap.len()];
        // Template extraction re-tokenizes with ASCII punctuation as symbols.
        let tokenizer =
            tokenizer.new_with_symbols("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~".chars().collect());
        cmap.into_iter()
            .filter(|(anchor_toks, _)| !anchor_toks.is_empty())
            .enumerate()
            .for_each(|(cid, (_, indices))| {
                // Token slices shared across the cluster, from which its
                // templates are derived.
                let stok = shared_slices(
                    indices.iter().cloned().map(|idx| messages[idx].as_ref()),
                    &tokenizer,
                    self.filter_alphabetic,
                    self.filter_numeric,
                    self.filter_impure,
                );
                temps[cid] = templates(
                    indices.iter().cloned().map(|idx| messages[idx].as_ref()),
                    &tokenizer,
                    &stok,
                );
                for idx in indices {
                    clus[idx] = Some(cid);
                }
            });

        (
            clus,
            temps
                .into_iter()
                .map(|map| map.into_iter().collect())
                .collect(),
        )
    }
}

impl Parser<NoCompute, Compute> {
    pub fn parse<Message: AsRef<str> + Sync>(self, messages: &[Message]) -> (Clusters, Masks) {
        let tokenizer = Tokenizer::new(self.special_whites, self.special_blacks, self.symbols);
        let filter = StaticFilter::with(
            self.filter_alphabetic,
            self.filter_numeric,
            self.filter_impure,
        );
        let idep = TokenRecord::new(messages, &tokenizer, &filter);
        let cmap = group_by_anchor_tokens(messages, &tokenizer, &idep, self.threshold);
        let mut clus = vec![None; messages.len()];
        let mut masks = HashMap::new();
        let tokenizer =
            tokenizer.new_with_symbols("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~".chars().collect());
        cmap.into_iter()
            .filter(|(anchor_toks, _)| !anchor_toks.is_empty())
            .enumerate()
            .for_each(|(cid, (_, indices))| {
                let stok = shared_slices(
                    indices.iter().cloned().map(|idx| messages[idx].as_ref()),
                    &tokenizer,
                    self.filter_alphabetic,
                    self.filter_numeric,
                    self.filter_impure,
                );
                // Parameter masks are derived per cluster and merged into one map.
                masks.extend(parameter_masks(
                    indices.iter().cloned().map(|idx| messages[idx].as_ref()),
                    &tokenizer,
                    &stok,
                ));
                for idx in indices {
                    clus[idx] = Some(cid);
                }
            });

        (clus, masks.into_iter().collect())
    }
}

impl Parser<Compute, Compute> {
    pub fn parse<Message: AsRef<str> + Sync>(
        self,
        messages: &[Message],
    ) -> (Clusters, Templates, Masks) {
        let tokenizer = Tokenizer::new(self.special_whites, self.special_blacks, self.symbols);
        let filter = StaticFilter::with(
            self.filter_alphabetic,
            self.filter_numeric,
            self.filter_impure,
        );
        let idep = TokenRecord::new(messages, &tokenizer, &filter);
        let groups = group_by_anchor_tokens(messages, &tokenizer, &idep, self.threshold);
        let mut clus = vec![None; messages.len()];
        let mut temps = vec![HashSet::default(); groups.len()];
        let mut masks = HashMap::new();
        let tokenizer =
            tokenizer.new_with_symbols("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~".chars().collect());
        // Same pipeline as above, computing both templates and parameter masks.
        groups
            .into_iter()
            .filter(|(anchor_toks, _)| !anchor_toks.is_empty())
            .enumerate()
            .for_each(|(cid, (_, indices))| {
                let stok = shared_slices(
                    indices.iter().cloned().map(|idx| messages[idx].as_ref()),
                    &tokenizer,
                    self.filter_alphabetic,
                    self.filter_numeric,
                    self.filter_impure,
                );
                temps[cid] = templates(
                    indices.iter().cloned().map(|idx| messages[idx].as_ref()),
                    &tokenizer,
                    &stok,
                );
                masks.extend(parameter_masks(
                    indices.iter().cloned().map(|idx| messages[idx].as_ref()),
                    &tokenizer,
                    &stok,
                ));
                for idx in indices {
                    clus[idx] = Some(cid);
                }
            });

        (
            clus,
            temps
                .into_iter()
                .map(|map| map.into_iter().collect())
                .collect(),
            masks.into_iter().collect(),
        )
    }
}

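// For every message (processed in parallel), build a graph over its tokens
// known to `idep`, connecting tokens whose dependency score exceeds
// `threshold`; the graph's anchor nodes form the message's anchor-token set.
// Special-white tokens are always added to the set and special-black tokens
// always removed. Messages are then grouped by identical anchor-token sets,
// and the per-thread maps are merged by folding the smaller map into the
// larger one.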
fn group_by_anchor_tokens<'a, T: AsRef<str> + Sync>(
    messages: &'a [T],
    tokenizer: &Tokenizer,
    idep: &'a TokenRecord<'a>,
    threshold: f32,
) -> HashMap<BTreeSet<Token<'a>>, BTreeSet<usize>> {
    messages
        .iter()
        .enumerate()
        .par_bridge()
        .map(|(idx, msg)| {
            (idx, {
                let tokens = tokenizer.tokenize(msg.as_ref());
                let graph = build_graph(
                    tokens
                        .iter()
                        .copied()
                        .filter(|tok| idep.occurence(tok.as_str()).is_some()),
                    |tok1, tok2| {
                        idep.dependency(tok1.as_str(), tok2.as_str()).unwrap_or(0.0) > threshold
                    },
                );
                let mut anchor_toks = anchor_nodes(graph);
                for tok in tokens {
                    match tok {
                        Token::SpecialWhite(_) => {
                            anchor_toks.insert(tok);
                        }
                        Token::SpecialBlack(_) => {
                            anchor_toks.remove(&tok);
                        }
                        _ => (),
                    }
                }
                anchor_toks
            })
        })
        .fold_with(
            HashMap::<BTreeSet<Token<'a>>, BTreeSet<usize>>::new(),
            |mut map, (idx, anchor_tokens)| {
                map.entry(anchor_tokens)
                    .and_modify(|indices| {
                        indices.insert(idx);
                    })
                    .or_insert([idx].into());
                map
            },
        )
        .reduce(Default::default, |mut m1, mut m2| {
            if m1.len() > m2.len() {
                m1.reserve(m2.len());
                for (k, v) in m2 {
                    if let Some(set) = m1.get_mut(&k) {
                        set.extend(v);
                    } else {
                        m1.insert(k, v);
                    }
                }
                m1
            } else {
                m2.reserve(m1.len());
                for (k, v) in m1 {
                    if let Some(set) = m2.get_mut(&k) {
                        set.extend(v);
                    } else {
                        m2.insert(k, v);
                    }
                }
                m2
            }
        })
}
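
#[cfg(test)]
mod tests {
    // Smoke-test sketches for the typestate `parse` variants. The messages are
    // arbitrary placeholders, and the assertions only cover properties that
    // follow directly from the code above: the output has one slot per input
    // message, and identical messages always receive the same cluster id (or
    // both stay `None` when their anchor-token set is empty).
    use super::*;

    #[test]
    fn one_cluster_slot_per_message() {
        let messages = ["connection opened", "connection opened", "disk full"];
        let clusters = Parser::new().parse(&messages);
        assert_eq!(clusters.len(), messages.len());
        // Identical messages yield identical anchor-token sets, so they land
        // in the same group.
        assert_eq!(clusters[0], clusters[1]);
    }

    #[test]
    fn templates_and_masks_on_request() {
        let messages = ["job 42 finished", "job 43 finished"];
        let (clusters, _templates, _masks) = Parser::new()
            .with_threshold(0.5)
            .compute_templates()
            .compute_masks()
            .parse(&messages);
        assert_eq!(clusters.len(), messages.len());
    }
}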