tipping_rs/
parser.rs

use std::{collections::BTreeSet, marker::PhantomData};

use hashbrown::{HashMap, HashSet};
use rayon::prelude::*;

use fancy_regex::Regex;

use crate::{
    graph::{anchor_nodes, build_graph},
    template::{parameter_masks, shared_slices, templates},
    token_filter::StaticFilter,
    token_record::TokenRecord,
    tokenizer::{Token, Tokenizer},
    traits::Tokenize,
};

type Clusters = Vec<Option<usize>>;
type Templates = Vec<std::collections::HashSet<String>>;
type Masks = std::collections::HashMap<String, String>;

pub struct NoCompute;
pub struct Compute;

/// Tipping (Token Interdependency Parsing) log parser
/// ```
/// use fancy_regex::Regex;
///
/// let msgs = vec![
///     "User 'admin' logged in from IP address 192.168.1.10",
///     "Attempt to access unauthorized resource by user 'guest'",
///     "Database connection failed due to timeout",
///     "Processing request for data retrieval with queryId: 34521",
/// ];
///
/// let special_whites = vec![];
/// let special_blacks = vec![Regex::new(r"(?:[0-9]{1,3}\.){3}[0-9]{1,3}").unwrap()];
/// let symbols = "'{}|,".chars().collect();
/// let (event_ids, templates, masks) = tipping_rs::Parser::default()
///     .with_threshold(0.5)
///     .with_special_whites(special_whites)
///     .with_special_blacks(special_blacks)
///     .with_symbols(symbols)
///     .with_filter_alphabetic(true)
///     .with_filter_numeric(false)
///     .with_filter_impure(false)
///     .compute_templates()
///     .compute_masks()
///     .parse(&msgs);
/// ```
#[derive(Debug, Clone)]
pub struct Parser<Templates = NoCompute, Masks = NoCompute> {
    threshold: f32,
    special_whites: Vec<Regex>,
    special_blacks: Vec<Regex>,
    symbols: HashSet<char>,
    filter_alphabetic: bool,
    filter_numeric: bool,
    filter_impure: bool,
    compute_templates: PhantomData<Templates>,
    compute_mask: PhantomData<Masks>,
}

impl Default for Parser {
    fn default() -> Self {
        Self::new()
    }
}
impl Parser {
    /// Creates a new `Parser` with default parameters.
    pub fn new() -> Self {
        Parser {
            threshold: 0.5,
            special_whites: Default::default(),
            special_blacks: Default::default(),
            symbols: Default::default(),
            filter_alphabetic: true,
            filter_numeric: false,
            filter_impure: false,
            compute_templates: Default::default(),
            compute_mask: Default::default(),
        }
    }

    /// Sets `value` as the threshold. The threshold determines whether an
    /// interdependency link should be considered during key node computation.
    /// The threshold must satisfy `0 <= threshold <= 1`.
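    ///
    /// A minimal sketch (the value is illustrative):
    /// ```
    /// // Only interdependency links stronger than 0.75 are kept.
    /// let _parser = tipping_rs::Parser::new().with_threshold(0.75);
    /// ```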
    #[must_use]
    pub fn with_threshold(mut self, value: f32) -> Self {
        assert!(0.0 <= value);
        assert!(value <= 1.0);
        self.threshold = value;
        self
    }

    /// Sets `value` as special white regexes. White regexes are never parameterized.
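    ///
    /// A minimal sketch; the severity pattern is illustrative:
    /// ```
    /// use fancy_regex::Regex;
    ///
    /// // Tokens matching a white regex are always kept verbatim.
    /// let whites = vec![Regex::new(r"ERROR|WARN|INFO").unwrap()];
    /// let _parser = tipping_rs::Parser::new().with_special_whites(whites);
    /// ```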
    #[must_use]
    pub fn with_special_whites(mut self, value: Vec<Regex>) -> Self {
        self.special_whites = value;
        self
    }

    /// Sets `value` as special black regexes. Black regexes are always parameterized.
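    ///
    /// A minimal sketch, reusing the IPv4 pattern from the type-level example:
    /// ```
    /// use fancy_regex::Regex;
    ///
    /// // Tokens matching a black regex are always treated as parameters.
    /// let blacks = vec![Regex::new(r"(?:[0-9]{1,3}\.){3}[0-9]{1,3}").unwrap()];
    /// let _parser = tipping_rs::Parser::new().with_special_blacks(blacks);
    /// ```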
    #[must_use]
    pub fn with_special_blacks(mut self, value: Vec<Regex>) -> Self {
        self.special_blacks = value;
        self
    }

    /// Sets `value` as symbols. Symbols are characters that are used alongside
    /// whitespace to split strings during tokenization.
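    ///
    /// A minimal sketch, using the symbol set from the type-level example:
    /// ```
    /// let symbols = "'{}|,".chars().collect();
    /// let _parser = tipping_rs::Parser::new().with_symbols(symbols);
    /// ```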
    #[must_use]
    pub fn with_symbols(mut self, value: HashSet<char>) -> Self {
        self.symbols = value;
        self
    }

    /// The `value` determines whether alphabetic tokens should be used during key node computation.
    #[must_use]
    pub fn with_filter_alphabetic(mut self, value: bool) -> Self {
        self.filter_alphabetic = value;
        self
    }

    /// The `value` determines whether numeric tokens should be used during key node computation.
    #[must_use]
    pub fn with_filter_numeric(mut self, value: bool) -> Self {
        self.filter_numeric = value;
        self
    }

    /// The `value` determines whether impure tokens should be used during key node computation.
    #[must_use]
    pub fn with_filter_impure(mut self, value: bool) -> Self {
        self.filter_impure = value;
        self
    }
}

impl<T> Parser<NoCompute, T> {
    /// Adds template computation to the output.
    #[must_use]
    pub fn compute_templates(self) -> Parser<Compute, T> {
        Parser::<Compute, T> {
            threshold: self.threshold,
            special_whites: self.special_whites,
            special_blacks: self.special_blacks,
            symbols: self.symbols,
            filter_alphabetic: self.filter_alphabetic,
            filter_numeric: self.filter_numeric,
            filter_impure: self.filter_impure,
            compute_templates: Default::default(),
            compute_mask: Default::default(),
        }
    }
}

impl<T> Parser<T, NoCompute> {
    /// Adds parameter mask computation to the output.
    #[must_use]
    pub fn compute_masks(self) -> Parser<T, Compute> {
        Parser::<T, Compute> {
            threshold: self.threshold,
            special_whites: self.special_whites,
            special_blacks: self.special_blacks,
            symbols: self.symbols,
            filter_alphabetic: self.filter_alphabetic,
            filter_numeric: self.filter_numeric,
            filter_impure: self.filter_impure,
            compute_templates: Default::default(),
            compute_mask: Default::default(),
        }
    }
}

impl Parser<NoCompute, NoCompute> {
    /// Parses the input `messages` and returns `Clusters`.
    ///
    /// - `Clusters`: A `Vec<Option<usize>>` representing potential cluster IDs. Each `Option<usize>`
    ///   corresponds to the cluster ID of the message at the same index, or `None` if the message
    ///   couldn't be parsed.
    ///
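    /// A minimal sketch (the messages are illustrative); the output always has
    /// one entry per input message:
    /// ```
    /// let msgs = vec!["connected to host alpha", "connected to host beta"];
    /// let clusters = tipping_rs::Parser::default().parse(&msgs);
    /// assert_eq!(clusters.len(), msgs.len());
    /// ```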
    pub fn parse<Message: AsRef<str> + Sync>(self, messages: &[Message]) -> Clusters {
        let tokenizer = Tokenizer::new(self.special_whites, self.special_blacks, self.symbols);
        let filter = StaticFilter::with(
            self.filter_alphabetic,
            self.filter_numeric,
            self.filter_impure,
        );
        let idep = TokenRecord::new(messages, &tokenizer, &filter);
        let cmap = group_by_anchor_tokens(messages, &tokenizer, &idep, self.threshold);
        let mut clus = vec![None; messages.len()];
        cmap.into_iter()
            .filter(|(anchor_toks, _)| !anchor_toks.is_empty())
            .enumerate()
            .for_each(|(cid, (_, indices))| {
                for idx in indices {
                    clus[idx] = Some(cid);
                }
            });
        clus
    }
}

impl Parser<Compute, NoCompute> {
    /// Parses the input `messages` and returns `Clusters` and `Templates`.
    ///
    /// - `Clusters`: A `Vec<Option<usize>>` representing potential cluster IDs. Each `Option<usize>`
    ///   corresponds to the cluster ID of the message at the same index, or `None` if the message
    ///   couldn't be parsed.
    ///
    /// - `Templates`: A `Vec<HashSet<String>>` where each set of templates is aligned with the
    ///   corresponding cluster ID in the `Clusters` vector.
    ///
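    /// A minimal sketch (illustrative messages); the second element holds one
    /// template set per cluster ID:
    /// ```
    /// let msgs = vec!["job 17 finished", "job 42 finished"];
    /// let (clusters, _templates) = tipping_rs::Parser::default()
    ///     .compute_templates()
    ///     .parse(&msgs);
    /// assert_eq!(clusters.len(), msgs.len());
    /// ```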
    pub fn parse<Message: AsRef<str> + Sync>(self, messages: &[Message]) -> (Clusters, Templates) {
        let tokenizer = Tokenizer::new(self.special_whites, self.special_blacks, self.symbols);
        let filter = StaticFilter::with(
            self.filter_alphabetic,
            self.filter_numeric,
            self.filter_impure,
        );
        let idep = TokenRecord::new(messages, &tokenizer, &filter);
        let cmap = group_by_anchor_tokens(messages, &tokenizer, &idep, self.threshold);
        let mut clus = vec![None; messages.len()];
        let mut temps = vec![HashSet::default(); cmap.len()];
        let tokenizer =
            tokenizer.new_with_symbols("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~".chars().collect());
        cmap.into_iter()
            .filter(|(anchor_toks, _)| !anchor_toks.is_empty())
            .enumerate()
            .for_each(|(cid, (_, indices))| {
                let stok = shared_slices(
                    indices.iter().cloned().map(|idx| messages[idx].as_ref()),
                    &tokenizer,
                    self.filter_alphabetic,
                    self.filter_numeric,
                    self.filter_impure,
                );
                temps[cid] = templates(
                    indices.iter().cloned().map(|idx| messages[idx].as_ref()),
                    &tokenizer,
                    &stok,
                );
                for idx in indices {
                    clus[idx] = Some(cid);
                }
            });

        (
            clus,
            temps
                .into_iter()
                .map(|map| map.into_iter().collect())
                .collect(),
        )
    }
}

impl Parser<NoCompute, Compute> {
    /// Parses the input `messages` and returns `Clusters` and `Masks`.
    ///
    /// - `Clusters`: A `Vec<Option<usize>>` representing potential cluster IDs. Each `Option<usize>`
    ///   corresponds to the cluster ID of the message at the same index, or `None` if the message
    ///   couldn't be parsed.
    ///
    /// - `Masks`: A table mapping each message to its parameter masks.
    ///
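    /// A minimal sketch (illustrative messages); `masks` maps each message to
    /// its parameter mask string:
    /// ```
    /// let msgs = vec!["user alice logged in", "user bob logged in"];
    /// let (clusters, _masks) = tipping_rs::Parser::default()
    ///     .compute_masks()
    ///     .parse(&msgs);
    /// assert_eq!(clusters.len(), msgs.len());
    /// ```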
    pub fn parse<Message: AsRef<str> + Sync>(self, messages: &[Message]) -> (Clusters, Masks) {
        let tokenizer = Tokenizer::new(self.special_whites, self.special_blacks, self.symbols);
        let filter = StaticFilter::with(
            self.filter_alphabetic,
            self.filter_numeric,
            self.filter_impure,
        );
        let idep = TokenRecord::new(messages, &tokenizer, &filter);
        let cmap = group_by_anchor_tokens(messages, &tokenizer, &idep, self.threshold);
        let mut clus = vec![None; messages.len()];
        let mut masks = HashMap::new();
        let tokenizer =
            tokenizer.new_with_symbols("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~".chars().collect());
        cmap.into_iter()
            .filter(|(anchor_toks, _)| !anchor_toks.is_empty())
            .enumerate()
            .for_each(|(cid, (_, indices))| {
                let stok = shared_slices(
                    indices.iter().cloned().map(|idx| messages[idx].as_ref()),
                    &tokenizer,
                    self.filter_alphabetic,
                    self.filter_numeric,
                    self.filter_impure,
                );
                masks.extend(parameter_masks(
                    indices.iter().cloned().map(|idx| messages[idx].as_ref()),
                    &tokenizer,
                    &stok,
                ));
                for idx in indices {
                    clus[idx] = Some(cid);
                }
            });

        (clus, masks.into_iter().collect())
    }
}

impl Parser<Compute, Compute> {
    /// Parses the input `messages` and returns `Clusters`, `Templates`, and `Masks`.
    ///
    /// - `Clusters`: A `Vec<Option<usize>>` representing potential cluster IDs. Each `Option<usize>`
    ///   corresponds to the cluster ID of the message at the same index, or `None` if the message
    ///   couldn't be parsed.
    ///
    /// - `Templates`: A `Vec<HashSet<String>>` where each set of templates is aligned with the
    ///   corresponding cluster ID in the `Clusters` vector.
    ///
    /// - `Masks`: A table mapping each message to its parameter masks.
    ///
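    /// See the type-level example on [`Parser`] for a full builder chain; a
    /// shorter sketch (illustrative messages):
    /// ```
    /// let msgs = vec!["open file a.txt", "open file b.txt"];
    /// let (clusters, _templates, _masks) = tipping_rs::Parser::default()
    ///     .compute_templates()
    ///     .compute_masks()
    ///     .parse(&msgs);
    /// assert_eq!(clusters.len(), msgs.len());
    /// ```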
    pub fn parse<Message: AsRef<str> + Sync>(
        self,
        messages: &[Message],
    ) -> (Clusters, Templates, Masks) {
        let tokenizer = Tokenizer::new(self.special_whites, self.special_blacks, self.symbols);
        let filter = StaticFilter::with(
            self.filter_alphabetic,
            self.filter_numeric,
            self.filter_impure,
        );
        let idep = TokenRecord::new(messages, &tokenizer, &filter);
        let groups = group_by_anchor_tokens(messages, &tokenizer, &idep, self.threshold);
        let mut clus = vec![None; messages.len()];
        let mut temps = vec![HashSet::default(); groups.len()];
        let mut masks = HashMap::new();
        let tokenizer =
            tokenizer.new_with_symbols("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~".chars().collect());
        groups
            .into_iter()
            .filter(|(anchor_toks, _)| !anchor_toks.is_empty())
            .enumerate()
            .for_each(|(cid, (_, indices))| {
                let stok = shared_slices(
                    indices.iter().cloned().map(|idx| messages[idx].as_ref()),
                    &tokenizer,
                    self.filter_alphabetic,
                    self.filter_numeric,
                    self.filter_impure,
                );
                temps[cid] = templates(
                    indices.iter().cloned().map(|idx| messages[idx].as_ref()),
                    &tokenizer,
                    &stok,
                );
                masks.extend(parameter_masks(
                    indices.iter().cloned().map(|idx| messages[idx].as_ref()),
                    &tokenizer,
                    &stok,
                ));
                for idx in indices {
                    clus[idx] = Some(cid);
                }
            });

        (
            clus,
            temps
                .into_iter()
                .map(|map| map.into_iter().collect())
                .collect(),
            masks.into_iter().collect(),
        )
    }
}

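/// Groups message indices by their anchor-token set.
///
/// For each message (in parallel, via rayon): tokenize it, build a graph over
/// the tokens whose pairwise dependency exceeds `threshold`, and take that
/// graph's anchor nodes. Special-white tokens are always added to the anchor
/// set and special-black tokens are always removed from it. Messages with
/// identical anchor sets end up in the same group.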
fn group_by_anchor_tokens<'a, T: AsRef<str> + Sync>(
    messages: &'a [T],
    tokenizer: &Tokenizer,
    idep: &'a TokenRecord<'a>,
    threshold: f32,
) -> HashMap<BTreeSet<Token<'a>>, BTreeSet<usize>> {
    messages
        .iter()
        .enumerate()
        .par_bridge()
        .map(|(idx, msg)| {
            (idx, {
                let tokens = tokenizer.tokenize(msg.as_ref());
                let graph = build_graph(
                    tokens
                        .iter()
                        .copied()
                        .filter(|tok| idep.occurence(tok.as_str()).is_some()),
                    |tok1, tok2| {
                        idep.dependency(tok1.as_str(), tok2.as_str()).unwrap_or(0.0) > threshold
                    },
                );
                let mut anchor_toks = anchor_nodes(graph);
                for tok in tokens {
                    match tok {
                        Token::SpecialWhite(_) => {
                            anchor_toks.insert(tok);
                        }
                        Token::SpecialBlack(_) => {
                            anchor_toks.remove(&tok);
                        }
                        _ => (),
                    }
                }
                anchor_toks
            })
        })
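        // Build per-worker maps from anchor-token set to the indices of the
        // messages that share it.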
        .fold_with(
            HashMap::<BTreeSet<Token<'a>>, BTreeSet<usize>>::new(),
            |mut map, (idx, anchor_tokens)| {
                map.entry(anchor_tokens)
                    .and_modify(|indices| {
                        indices.insert(idx);
                    })
                    .or_insert([idx].into());
                map
            },
        )
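        // Merge the per-worker maps pairwise, always folding the smaller map
        // into the larger one to minimize rehashing.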
        .reduce(Default::default, |mut m1, mut m2| {
            if m1.len() > m2.len() {
                m1.reserve(m2.len());
                for (k, v) in m2 {
                    if let Some(set) = m1.get_mut(&k) {
                        set.extend(v);
                    } else {
                        m1.insert(k, v);
                    }
                }
                m1
            } else {
                m2.reserve(m1.len());
                for (k, v) in m1 {
                    if let Some(set) = m2.get_mut(&k) {
                        set.extend(v);
                    } else {
                        m2.insert(k, v);
                    }
                }
                m2
            }
        })
}