tau_engine/
rule.rs

1use std::collections::HashMap;
2use std::fmt;
3use std::fs;
4use std::path::Path;
5
6use serde::de::{self, Deserializer, MapAccess, Visitor};
7use serde::{Deserialize, Serialize};
8use serde_yaml::Value as Yaml;
9
10use crate::document::Document;
11use crate::optimiser::{self, Optimisations};
12use crate::parser::{self, Expression};
13use crate::solver;
14use crate::tokeniser::{ModSym, Token, Tokeniser};
15
/// The detection block. It contains the logic that is run through the solver to evaluate a
/// `Document`.
#[derive(Clone, Serialize)]
pub struct Detection {
    /// The core expression.
    #[serde(skip_serializing)]
    pub expression: Expression,
    /// Additional expressions, defined using key/value pairs.
    #[serde(skip_serializing)]
    pub identifiers: HashMap<String, Expression>,

    /// The raw condition string; this is what gets serialised, under the `condition` key.
    #[serde(rename = "condition")]
    expression_raw: String,
    /// The raw YAML value of every identifier; flattened into the detection block on
    /// serialisation so that each identifier appears as a sibling key of `condition`.
    #[serde(flatten)]
    identifiers_raw: HashMap<String, Yaml>,
}
32
33impl fmt::Debug for Detection {
34    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
35        f.debug_struct("Detection")
36            .field("expression", &self.expression_raw)
37            .field("identifiers", &self.identifiers_raw)
38            .finish()
39    }
40}
41
42impl<'de> Deserialize<'de> for Detection {
43    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
44    where
45        D: Deserializer<'de>,
46    {
47        struct DetectionVisitor;
48        impl<'de> Visitor<'de> for DetectionVisitor {
49            type Value = Detection;
50            fn expecting(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
51                formatter.write_str("struct Detection")
52            }
53            fn visit_map<V>(self, mut map: V) -> Result<Detection, V::Error>
54            where
55                V: MapAccess<'de>,
56            {
57                let mut identifiers: HashMap<String, Expression> = HashMap::new();
58                let mut identifiers_raw: HashMap<String, Yaml> = HashMap::new();
59                let mut expression = None;
60                while let Some(key) = map.next_key::<String>()? {
61                    match key.as_ref() {
62                        "condition" => {
63                            if expression.is_some() {
64                                return Err(de::Error::duplicate_field("condition"));
65                            }
66                            expression = Some(map.next_value::<String>()?);
67                        }
68                        _ => {
69                            if identifiers.contains_key(&key) {
70                                return Err(de::Error::custom(format_args!(
71                                    "duplicate field `{}`",
72                                    key
73                                )));
74                            }
75                            let v: Yaml = map.next_value()?;
76                            identifiers.insert(
77                                key.to_string(),
78                                parser::parse_identifier(&v).map_err(|e| {
79                                    de::Error::custom(format!(
80                                        "failed to parse identifier - {:?}",
81                                        e
82                                    ))
83                                })?,
84                            );
85                            identifiers_raw.insert(key.to_string(), v.clone());
86                        }
87                    }
88                }
89                let expression_raw =
90                    expression.ok_or_else(|| de::Error::missing_field("condition"))?;
91                let tokens = match expression_raw.tokenise() {
92                    Ok(tokens) => tokens,
93                    Err(err) => {
94                        return Err(de::Error::custom(format_args!(
95                            "invalid value: condition, failed to tokenise - {}",
96                            err
97                        )));
98                    }
99                };
100
101                // Loop through the tokens making sure that all identifiers are present, this is a
102                // pain because we need to ignore fields... For now we can just check for misc
103                // symbol prefix and skip those if present
104                let mut i = 0;
105                for token in &tokens {
106                    if i > 1 {
107                        if let Token::Modifier(m) = &tokens[i - 2] {
108                            match m {
109                                ModSym::Flt | ModSym::Int | ModSym::Not | ModSym::Str => {
110                                    i += 1;
111                                    continue;
112                                }
113                            }
114                        }
115                    }
116                    if let Token::Identifier(id) = token {
117                        if !identifiers.contains_key(id) {
118                            return Err(de::Error::custom(format_args!(
119                                "invalid condition: identifier not found - {}",
120                                id
121                            )));
122                        }
123                    }
124                    i += 1;
125                }
126
127                let expression = match parser::parse(&tokens) {
128                    Ok(expression) => expression,
129                    Err(err) => {
130                        return Err(de::Error::custom(format_args!(
131                            "invalid value: condition, failed to parse - {}",
132                            err
133                        )));
134                    }
135                };
136                if !expression.is_solvable() {
137                    return Err(de::Error::custom(format_args!(
138                        "invalid value: condition, not solveable - {}",
139                        expression
140                    )));
141                }
142                Ok(Detection {
143                    expression,
144                    identifiers,
145                    expression_raw,
146                    identifiers_raw,
147                })
148            }
149        }
150        const FIELDS: &[&str] = &["identifiers", "condition"];
151        deserializer.deserialize_struct("Detection", FIELDS, DetectionVisitor)
152    }
153}
154
155/// A rule used by the solver to evaluate a `Document`.
156///
157/// A rule contains the detection logic, along with the true positive and negative tests. The
/// inclusion of these basic tests allows for a basic level of verification to be ensured.
159///
160/// Rules are written in YAML and have a simple but powerful syntax.
161///
162/// # Syntax
163///
164/// There are two parts to a rule's logic: the condition & the identifiers.
165///
166/// ## Condition
167///
168/// The condition is the main expression and describes the top level logic for the rule. It can be
169/// comprised of the following:
170///
171/// <table>
172///     <thead>
173///         <tr>
174///             <th>Expression</th>
175///             <th>Description</th>
176///         </tr>
177///     </thead>
178///     <tbody>
179///         <tr>
180///             <td>_ <code>and</code> _</td>
181///             <td>
182///                 <span>The logical conjunction of two operands, where the operands are any of the following:</span>
183///                 <ul>
184///                     <li>
185///                         <code>expression</code><span>: a nested expression.</span>
186///                     </li>
187///                     <li>
188///                         <code>identifier</code><span>: a key that matches an identifier in the detection block.</span>
189///                     </li>
190///                 </ul>
191///             </td>
192///         </tr>
193///         <tr>
194///             <td>_ <code>or</code> _</td>
195///             <td>
196///                 <span>The logical disjunction of two operands, where the operands are any of the following:</span>
197///                 <ul>
198///                     <li>
199///                         <code>expression</code><span>: a nested expression.</span>
200///                     </li>
201///                     <li>
202///                         <code>identifier</code><span>: a key that matches an identifier in the detection block.</span>
203///                     </li>
204///                 </ul>
205///             </td>
206///         </tr>
207///         <tr>
208///             <td>_ <code>==</code> _</td>
209///             <td>
210///                 <span>The equality comparison of two operands, where the operands are any of the following:</span>
211///                 <ul>
212///                     <li>
213///                         <code>integer</code><span>: an integer.</span>
214///                     </li>
215///                     <li>
216///                         <code>string</code><span>: a string.</span>
217///                     </li>
218///                     <li>
219///                         <code>int(field)</code><span>: a field that should be cast as an
220///                         integer.</span>
221///                     </li>
222///                     <li>
223///                         <code>str(field)</code><span>: a field that should be cast as a
224///                         string.</span>
225///                     </li>
226///                 </ul>
227///             </td>
228///         </tr>
229///         <tr>
///             <td>_ <code>&gt;</code> _</td>
231///             <td>
232///                 <span>The greater than comparison of two operands, where the operands are any of the following:</span>
233///                 <ul>
234///                     <li>
235///                         <code>integer</code><span>: an integer.</span>
236///                     </li>
237///                     <li>
238///                         <code>int(field)</code><span>: a field that should be cast as an
239///                         integer.</span>
240///                     </li>
241///                 </ul>
242///             </td>
243///         </tr>
244///         <tr>
///             <td>_ <code>&gt;=</code> _</td>
246///             <td>
247///                 <span>The greater than or equal comparison of two operands, where the operands are any of the following:</span>
248///                 <ul>
249///                     <li>
250///                         <code>integer</code><span>: an integer.</span>
251///                     </li>
252///                     <li>
253///                         <code>int(field)</code><span>: a field that should be cast as an
254///                         integer.</span>
255///                     </li>
256///                 </ul>
257///             </td>
258///         </tr>
259///         <tr>
///             <td>_ <code>&lt;</code> _</td>
261///             <td>
262///                 <span>The less than comparison of two operands, where the operands are any of the following:</span>
263///                 <ul>
264///                     <li>
265///                         <code>integer</code><span>: an integer.</span>
266///                     </li>
267///                     <li>
268///                         <code>int(field)</code><span>: a field that should be cast as an
269///                         integer.</span>
270///                     </li>
271///                 </ul>
272///             </td>
273///         </tr>
274///         <tr>
///             <td>_ <code>&lt;=</code> _</td>
276///             <td>
277///                 <span>The less than or equal comparison of two operands, where the operands are any of the following:</span>
278///                 <ul>
279///                     <li>
280///                         <code>integer</code><span>: an integer.</span>
281///                     </li>
282///                     <li>
283///                         <code>int(field)</code><span>: a field that should be cast as an
284///                         integer.</span>
285///                     </li>
286///                 </ul>
287///             </td>
288///         </tr>
289///         <tr>
290///             <td><code>all(i)</code></td>
291///             <td>
292///                 <span>An identifier mutator that evaluates to true only if all conditions for identifier <code>i</code> match.</span>
293///             </td>
294///         </tr>
295///         <tr>
296///             <td><code>not</code> _</td>
297///             <td>
298///                 <span>Negate the result of an expression.</span>
299///                 <span>NOTE: This will only negate a result that is true or false, it will
300///                 noop if the result is missing.</span>
301///             </td>
302///         </tr>
303///         <tr>
304///             <td><code style="white-space:nowrap">of(i, x)</code></td>
305///             <td>
306///                 <span>An identifier mutator that evaluates to true only if a minimum of <code>x</code> conditions for identifier <code>i</code> match.</span>
307///             </td>
308///         </tr>
309///     </tbody>
310/// </table>
311///
/// ## Identifiers
313///
314/// Identifiers are used to describe the matching logic for the values contained within documents.
315/// These are then collected by the condition in order to create a rule that can be used to tag a
316/// document.
317///
318/// Due to the nature of an identifier, they are essentially just variations on key/value
319/// pairs. The following variations are supported, where mappings are treated as conjunctions and
320/// sequences are treated as disjunctions:
321///
322/// ```text
323/// # K/V Pairs
324/// IDENTIFIER:
325///     KEY: MATCH
326///
327/// # K/V Pairs with multiple matches
328/// IDENTIFIER:
329///     KEY:
330///     - MATCH_0
331///     - MATCH_1
332///
333/// # K/V Pairs (Grouped)
334/// IDENTIFIER:
335///     - KEY: MATCH
336///
337/// # K/V Pairs (Nested)
338/// IDENTIFIER:
339///     KEY:
340///         KEY: MATCH
341/// ```
342///
343/// Identifiers are unique keys that can be referenced in the `condition`.
344///
345/// Keys are used to get the values from documents. Keys can be wrapped in the following modifiers:
346///
347/// <table>
348///     <thead>
349///         <tr>
350///             <th>Expression</th>
351///             <th>Description</th>
352///         </tr>
353///     </thead>
354///     <tbody>
355///         <tr>
356///             <td><code>all(k)</code></td>
357///             <td>
///                 <span>A key mutator that evaluates to true only if all matches for key <code>k</code> match.</span>
359///             </td>
360///         </tr>
361///         <tr>
362///             <td><code style="white-space:nowrap">of(k, x)</code></td>
363///             <td>
364///                 <span>A key mutator that evaluates to true only if a minimum of <code>x</code> matches for key <code>k</code> match.</span>
365///             </td>
366///         </tr>
367///     </tbody>
368/// </table>
369///
370/// Matches are the expressions which are evaluated against values returned by keys. They support the
371/// following syntax:
372///
373/// <table>
374///     <thead>
375///         <tr>
376///             <th>Expression</th>
377///             <th>Description</th>
378///         </tr>
379///     </thead>
380///     <tbody>
381///         <tr>
382///             <td><code>foo</code></td>
383///             <td><span>An exact match</span></td>
384///         </tr>
385///         <tr>
386///             <td><code>foo*</code></td>
387///             <td><span>Starts with</span></td>
388///         </tr>
389///         <tr>
390///             <td><code>*foo</code></td>
391///             <td><span>Ends with</span></td>
392///         </tr>
393///         <tr>
394///             <td><code>*foo*</code></td>
395///             <td><span>Contains</span></td>
396///         </tr>
397///         <tr>
398///             <td><code>?foo</code></td>
399///             <td><span>Regex</span></td>
400///         </tr>
401///         <tr>
402///             <td><code>i</code>_</td>
///             <td><span>A prefix to convert the match into a case-insensitive match.</span></td>
404///         </tr>
405///     </tbody>
406/// </table>
407///
408/// To escape any of the above in order to achieve literal string matching, combinations of `'` and `"` can be used.
409///
410/// # Examples
411///
412/// Here is a very simple rule example:
413///
414/// ```text
415/// detection:
416///   A:
417///     foo: "foo*"
418///     bar: "*bar"
419///   B:
420///     foobar:
421///     - foobar
422///     - foobaz
423///
424///   condition: A and B
425///
426/// true_positives:
427/// - foo: foobar
428///   bar: foobar
429///   foobar: foobar
430///
431/// true_negatives:
432/// - foo: bar
433///   bar: foo
434///   foobar: barfoo
435/// ```
436///
437/// Here is a slightly more complex rule example:
438///
439/// ```text
440/// detection:
441///   A:
442///     all(phrase):
443///     - "*quick*"
444///     - "*brown*"
445///   B:
446///     phrase: ibear
447///
448///   condition: A and not B
449///
450/// true_positives:
451/// - phrase: the quick brown fox
452///
453/// true_negatives:
454/// - foo: the quick brown BEAR
455/// ```
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct Rule {
    /// Set by `optimise` once it has run so that further calls return the rule untouched;
    /// defaults to `false` when absent from the deserialised input.
    #[serde(default)]
    optimised: bool,

    /// The detection logic that is evaluated against documents.
    pub detection: Detection,
    /// Test documents that the detection logic is expected to match; checked by `validate`.
    pub true_positives: Vec<Yaml>,
    /// Test documents that the detection logic is expected not to match; checked by `validate`.
    pub true_negatives: Vec<Yaml>,
}
465
466impl Rule {
467    /// Load a rule from a YAML file.
468    pub fn load(path: &Path) -> crate::Result<Self> {
469        let contents = fs::read_to_string(path).map_err(crate::error::rule_invalid)?;
470        Self::from_str(&contents)
471    }
472
473    /// Load a rule from a YAML string.
474    // NOTE: We allow this because if we change it now it will be a breaking change and this is not
475    // worth creating a version 2.0 over...
476    #[allow(clippy::should_implement_trait)]
477    pub fn from_str(s: &str) -> crate::Result<Self> {
478        serde_yaml::from_str(s).map_err(crate::error::rule_invalid)
479    }
480
481    /// Load a rule from a YAML Value.
482    pub fn from_value(value: serde_yaml::Value) -> crate::Result<Self> {
483        serde_yaml::from_value(value).map_err(crate::error::rule_invalid)
484    }
485
486    /// Optimise the rule with the optimisations provided.
487    pub fn optimise(mut self, options: Optimisations) -> Self {
488        if self.optimised {
489            return self;
490        }
491        if options.coalesce {
492            self.detection.expression =
493                optimiser::coalesce(self.detection.expression, &self.detection.identifiers);
494            self.detection.identifiers.clear();
495        }
496        if options.shake {
497            self.detection.expression = optimiser::shake(self.detection.expression);
498            self.detection.identifiers = self
499                .detection
500                .identifiers
501                .into_iter()
502                .map(|(k, v)| (k, optimiser::shake(v)))
503                .collect();
504        }
505        if options.rewrite {
506            self.detection.expression = optimiser::rewrite(self.detection.expression);
507            self.detection.identifiers = self
508                .detection
509                .identifiers
510                .into_iter()
511                .map(|(k, v)| (k, optimiser::rewrite(v)))
512                .collect();
513        }
514        if options.matrix {
515            self.detection.expression = optimiser::matrix(self.detection.expression);
516            self.detection.identifiers = self
517                .detection
518                .identifiers
519                .into_iter()
520                .map(|(k, v)| (k, optimiser::matrix(v)))
521                .collect();
522        }
523        self.optimised = true;
524        self
525    }
526
527    /// Evaluates the rule against the provided `Document`, returning true if it has matched.
528    #[inline]
529    pub fn matches(&self, document: &dyn Document) -> bool {
530        solver::solve(&self.detection, document)
531    }
532
533    /// Validates the rule's detection logic against the provided true positives and negatives.
534    pub fn validate(&self) -> crate::Result<bool> {
535        let mut errors = vec![];
536        for test in &self.true_positives {
537            if !(solver::solve(&self.detection, test.as_mapping().unwrap())) {
538                errors.push(format!(
539                    "failed to validate true positive check '{:?}'",
540                    test
541                ));
542            }
543        }
544        for test in &self.true_negatives {
545            if solver::solve(&self.detection, test.as_mapping().unwrap()) {
546                errors.push(format!(
547                    "failed to validate true negative check '{:?}'",
548                    test
549                ));
550            }
551        }
552        if !errors.is_empty() {
553            return Err(crate::Error::new(crate::error::Kind::Validation).with(errors.join(";")));
554        }
555        Ok(true)
556    }
557}
558
#[cfg(test)]
mod tests {
    use super::*;

    /// A rule whose true positives and true negatives agree with its detection logic validates.
    #[test]
    fn rule() {
        let yaml = r#"
        detection:
          A:
            foo: 'foo*'
            bar: '*bar'
          B:
            foobar:
            - foobar
            - foobaz

          condition: A and B

        true_positives:
        - foo: foobar
          bar: foobar
          foobar: foobar

        true_negatives:
        - foo: bar
          bar: foo
          foobar: barfoo
        "#;
        let parsed = Rule::from_str(yaml).unwrap();
        assert!(parsed.validate().unwrap());
    }
}
590}