tau_engine/rule.rs
1use std::collections::HashMap;
2use std::fmt;
3use std::fs;
4use std::path::Path;
5
6use serde::de::{self, Deserializer, MapAccess, Visitor};
7use serde::{Deserialize, Serialize};
8use serde_yaml::Value as Yaml;
9
10use crate::document::Document;
11use crate::optimiser::{self, Optimisations};
12use crate::parser::{self, Expression};
13use crate::solver;
14use crate::tokeniser::{ModSym, Token, Tokeniser};
15
/// The detection block: the logic that is run through the solver to evaluate a `Document`.
///
/// Only the raw condition and raw identifiers are serialised; the parsed forms are skipped.
#[derive(Clone, Serialize)]
pub struct Detection {
    /// The core expression.
    #[serde(skip_serializing)]
    pub expression: Expression,
    /// Additional expressions, defined using key/value pairs.
    #[serde(skip_serializing)]
    pub identifiers: HashMap<String, Expression>,

    /// The unparsed condition string, serialised under the `condition` key.
    #[serde(rename = "condition")]
    expression_raw: String,
    /// The unparsed identifier values, flattened into the serialised detection block.
    #[serde(flatten)]
    identifiers_raw: HashMap<String, Yaml>,
}
32
33impl fmt::Debug for Detection {
34 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
35 f.debug_struct("Detection")
36 .field("expression", &self.expression_raw)
37 .field("identifiers", &self.identifiers_raw)
38 .finish()
39 }
40}
41
42impl<'de> Deserialize<'de> for Detection {
43 fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
44 where
45 D: Deserializer<'de>,
46 {
47 struct DetectionVisitor;
48 impl<'de> Visitor<'de> for DetectionVisitor {
49 type Value = Detection;
50 fn expecting(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
51 formatter.write_str("struct Detection")
52 }
53 fn visit_map<V>(self, mut map: V) -> Result<Detection, V::Error>
54 where
55 V: MapAccess<'de>,
56 {
57 let mut identifiers: HashMap<String, Expression> = HashMap::new();
58 let mut identifiers_raw: HashMap<String, Yaml> = HashMap::new();
59 let mut expression = None;
60 while let Some(key) = map.next_key::<String>()? {
61 match key.as_ref() {
62 "condition" => {
63 if expression.is_some() {
64 return Err(de::Error::duplicate_field("condition"));
65 }
66 expression = Some(map.next_value::<String>()?);
67 }
68 _ => {
69 if identifiers.contains_key(&key) {
70 return Err(de::Error::custom(format_args!(
71 "duplicate field `{}`",
72 key
73 )));
74 }
75 let v: Yaml = map.next_value()?;
76 identifiers.insert(
77 key.to_string(),
78 parser::parse_identifier(&v).map_err(|e| {
79 de::Error::custom(format!(
80 "failed to parse identifier - {:?}",
81 e
82 ))
83 })?,
84 );
85 identifiers_raw.insert(key.to_string(), v.clone());
86 }
87 }
88 }
89 let expression_raw =
90 expression.ok_or_else(|| de::Error::missing_field("condition"))?;
91 let tokens = match expression_raw.tokenise() {
92 Ok(tokens) => tokens,
93 Err(err) => {
94 return Err(de::Error::custom(format_args!(
95 "invalid value: condition, failed to tokenise - {}",
96 err
97 )));
98 }
99 };
100
101 // Loop through the tokens making sure that all identifiers are present, this is a
102 // pain because we need to ignore fields... For now we can just check for misc
103 // symbol prefix and skip those if present
104 let mut i = 0;
105 for token in &tokens {
106 if i > 1 {
107 if let Token::Modifier(m) = &tokens[i - 2] {
108 match m {
109 ModSym::Flt | ModSym::Int | ModSym::Not | ModSym::Str => {
110 i += 1;
111 continue;
112 }
113 }
114 }
115 }
116 if let Token::Identifier(id) = token {
117 if !identifiers.contains_key(id) {
118 return Err(de::Error::custom(format_args!(
119 "invalid condition: identifier not found - {}",
120 id
121 )));
122 }
123 }
124 i += 1;
125 }
126
127 let expression = match parser::parse(&tokens) {
128 Ok(expression) => expression,
129 Err(err) => {
130 return Err(de::Error::custom(format_args!(
131 "invalid value: condition, failed to parse - {}",
132 err
133 )));
134 }
135 };
136 if !expression.is_solvable() {
137 return Err(de::Error::custom(format_args!(
138 "invalid value: condition, not solveable - {}",
139 expression
140 )));
141 }
142 Ok(Detection {
143 expression,
144 identifiers,
145 expression_raw,
146 identifiers_raw,
147 })
148 }
149 }
150 const FIELDS: &[&str] = &["identifiers", "condition"];
151 deserializer.deserialize_struct("Detection", FIELDS, DetectionVisitor)
152 }
153}
154
155/// A rule used by the solver to evaluate a `Document`.
156///
157/// A rule contains the detection logic, along with the true positive and negative tests. The
/// inclusion of these basic tests allows for a basic level of verification to be ensured.
159///
160/// Rules are written in YAML and have a simple but powerful syntax.
161///
162/// # Syntax
163///
164/// There are two parts to a rule's logic: the condition & the identifiers.
165///
166/// ## Condition
167///
168/// The condition is the main expression and describes the top level logic for the rule. It can be
169/// comprised of the following:
170///
171/// <table>
172/// <thead>
173/// <tr>
174/// <th>Expression</th>
175/// <th>Description</th>
176/// </tr>
177/// </thead>
178/// <tbody>
179/// <tr>
180/// <td>_ <code>and</code> _</td>
181/// <td>
182/// <span>The logical conjunction of two operands, where the operands are any of the following:</span>
183/// <ul>
184/// <li>
185/// <code>expression</code><span>: a nested expression.</span>
186/// </li>
187/// <li>
188/// <code>identifier</code><span>: a key that matches an identifier in the detection block.</span>
189/// </li>
190/// </ul>
191/// </td>
192/// </tr>
193/// <tr>
194/// <td>_ <code>or</code> _</td>
195/// <td>
196/// <span>The logical disjunction of two operands, where the operands are any of the following:</span>
197/// <ul>
198/// <li>
199/// <code>expression</code><span>: a nested expression.</span>
200/// </li>
201/// <li>
202/// <code>identifier</code><span>: a key that matches an identifier in the detection block.</span>
203/// </li>
204/// </ul>
205/// </td>
206/// </tr>
207/// <tr>
208/// <td>_ <code>==</code> _</td>
209/// <td>
210/// <span>The equality comparison of two operands, where the operands are any of the following:</span>
211/// <ul>
212/// <li>
213/// <code>integer</code><span>: an integer.</span>
214/// </li>
215/// <li>
216/// <code>string</code><span>: a string.</span>
217/// </li>
218/// <li>
219/// <code>int(field)</code><span>: a field that should be cast as an
220/// integer.</span>
221/// </li>
222/// <li>
223/// <code>str(field)</code><span>: a field that should be cast as a
224/// string.</span>
225/// </li>
226/// </ul>
227/// </td>
228/// </tr>
229/// <tr>
230/// <td>_ <code>></code> _</td>
231/// <td>
232/// <span>The greater than comparison of two operands, where the operands are any of the following:</span>
233/// <ul>
234/// <li>
235/// <code>integer</code><span>: an integer.</span>
236/// </li>
237/// <li>
238/// <code>int(field)</code><span>: a field that should be cast as an
239/// integer.</span>
240/// </li>
241/// </ul>
242/// </td>
243/// </tr>
244/// <tr>
245/// <td>_ <code>>=</code> _</td>
246/// <td>
247/// <span>The greater than or equal comparison of two operands, where the operands are any of the following:</span>
248/// <ul>
249/// <li>
250/// <code>integer</code><span>: an integer.</span>
251/// </li>
252/// <li>
253/// <code>int(field)</code><span>: a field that should be cast as an
254/// integer.</span>
255/// </li>
256/// </ul>
257/// </td>
258/// </tr>
259/// <tr>
260/// <td>_ <code><</code> _</td>
261/// <td>
262/// <span>The less than comparison of two operands, where the operands are any of the following:</span>
263/// <ul>
264/// <li>
265/// <code>integer</code><span>: an integer.</span>
266/// </li>
267/// <li>
268/// <code>int(field)</code><span>: a field that should be cast as an
269/// integer.</span>
270/// </li>
271/// </ul>
272/// </td>
273/// </tr>
274/// <tr>
275/// <td>_ <code><=</code> _</td>
276/// <td>
277/// <span>The less than or equal comparison of two operands, where the operands are any of the following:</span>
278/// <ul>
279/// <li>
280/// <code>integer</code><span>: an integer.</span>
281/// </li>
282/// <li>
283/// <code>int(field)</code><span>: a field that should be cast as an
284/// integer.</span>
285/// </li>
286/// </ul>
287/// </td>
288/// </tr>
289/// <tr>
290/// <td><code>all(i)</code></td>
291/// <td>
292/// <span>An identifier mutator that evaluates to true only if all conditions for identifier <code>i</code> match.</span>
293/// </td>
294/// </tr>
295/// <tr>
296/// <td><code>not</code> _</td>
297/// <td>
298/// <span>Negate the result of an expression.</span>
299/// <span>NOTE: This will only negate a result that is true or false, it will
300/// noop if the result is missing.</span>
301/// </td>
302/// </tr>
303/// <tr>
304/// <td><code style="white-space:nowrap">of(i, x)</code></td>
305/// <td>
306/// <span>An identifier mutator that evaluates to true only if a minimum of <code>x</code> conditions for identifier <code>i</code> match.</span>
307/// </td>
308/// </tr>
309/// </tbody>
310/// </table>
311///
/// ## Identifiers
313///
314/// Identifiers are used to describe the matching logic for the values contained within documents.
315/// These are then collected by the condition in order to create a rule that can be used to tag a
316/// document.
317///
318/// Due to the nature of an identifier, they are essentially just variations on key/value
319/// pairs. The following variations are supported, where mappings are treated as conjunctions and
320/// sequences are treated as disjunctions:
321///
322/// ```text
323/// # K/V Pairs
324/// IDENTIFIER:
325/// KEY: MATCH
326///
327/// # K/V Pairs with multiple matches
328/// IDENTIFIER:
329/// KEY:
330/// - MATCH_0
331/// - MATCH_1
332///
333/// # K/V Pairs (Grouped)
334/// IDENTIFIER:
335/// - KEY: MATCH
336///
337/// # K/V Pairs (Nested)
338/// IDENTIFIER:
339/// KEY:
340/// KEY: MATCH
341/// ```
342///
343/// Identifiers are unique keys that can be referenced in the `condition`.
344///
345/// Keys are used to get the values from documents. Keys can be wrapped in the following modifiers:
346///
347/// <table>
348/// <thead>
349/// <tr>
350/// <th>Expression</th>
351/// <th>Description</th>
352/// </tr>
353/// </thead>
354/// <tbody>
355/// <tr>
356/// <td><code>all(k)</code></td>
357/// <td>
/// <span>A key mutator that evaluates to true only if all matches for key <code>k</code> match.</span>
359/// </td>
360/// </tr>
361/// <tr>
362/// <td><code style="white-space:nowrap">of(k, x)</code></td>
363/// <td>
364/// <span>A key mutator that evaluates to true only if a minimum of <code>x</code> matches for key <code>k</code> match.</span>
365/// </td>
366/// </tr>
367/// </tbody>
368/// </table>
369///
370/// Matches are the expressions which are evaluated against values returned by keys. They support the
371/// following syntax:
372///
373/// <table>
374/// <thead>
375/// <tr>
376/// <th>Expression</th>
377/// <th>Description</th>
378/// </tr>
379/// </thead>
380/// <tbody>
381/// <tr>
382/// <td><code>foo</code></td>
383/// <td><span>An exact match</span></td>
384/// </tr>
385/// <tr>
386/// <td><code>foo*</code></td>
387/// <td><span>Starts with</span></td>
388/// </tr>
389/// <tr>
390/// <td><code>*foo</code></td>
391/// <td><span>Ends with</span></td>
392/// </tr>
393/// <tr>
394/// <td><code>*foo*</code></td>
395/// <td><span>Contains</span></td>
396/// </tr>
397/// <tr>
398/// <td><code>?foo</code></td>
399/// <td><span>Regex</span></td>
400/// </tr>
401/// <tr>
402/// <td><code>i</code>_</td>
403/// <td><span>A prefix to convert the match into a case insensitive match.</span></td>
404/// </tr>
405/// </tbody>
406/// </table>
407///
408/// To escape any of the above in order to achieve literal string matching, combinations of `'` and `"` can be used.
409///
410/// # Examples
411///
412/// Here is a very simple rule example:
413///
414/// ```text
415/// detection:
416/// A:
417/// foo: "foo*"
418/// bar: "*bar"
419/// B:
420/// foobar:
421/// - foobar
422/// - foobaz
423///
424/// condition: A and B
425///
426/// true_positives:
427/// - foo: foobar
428/// bar: foobar
429/// foobar: foobar
430///
431/// true_negatives:
432/// - foo: bar
433/// bar: foo
434/// foobar: barfoo
435/// ```
436///
437/// Here is a slightly more complex rule example:
438///
439/// ```text
440/// detection:
441/// A:
442/// all(phrase):
443/// - "*quick*"
444/// - "*brown*"
445/// B:
446/// phrase: ibear
447///
448/// condition: A and not B
449///
450/// true_positives:
451/// - phrase: the quick brown fox
452///
453/// true_negatives:
454/// - foo: the quick brown BEAR
455/// ```
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct Rule {
    // Set once `optimise` has run so that later calls return the rule unchanged; defaults to
    // `false` when absent from the deserialised input.
    #[serde(default)]
    optimised: bool,

    /// The detection logic that is evaluated against documents.
    pub detection: Detection,
    /// Documents the detection logic is expected to match; checked by `validate`.
    pub true_positives: Vec<Yaml>,
    /// Documents the detection logic is expected not to match; checked by `validate`.
    pub true_negatives: Vec<Yaml>,
}
465
466impl Rule {
467 /// Load a rule from a YAML file.
468 pub fn load(path: &Path) -> crate::Result<Self> {
469 let contents = fs::read_to_string(path).map_err(crate::error::rule_invalid)?;
470 Self::from_str(&contents)
471 }
472
473 /// Load a rule from a YAML string.
474 // NOTE: We allow this because if we change it now it will be a breaking change and this is not
475 // worth creating a version 2.0 over...
476 #[allow(clippy::should_implement_trait)]
477 pub fn from_str(s: &str) -> crate::Result<Self> {
478 serde_yaml::from_str(s).map_err(crate::error::rule_invalid)
479 }
480
481 /// Load a rule from a YAML Value.
482 pub fn from_value(value: serde_yaml::Value) -> crate::Result<Self> {
483 serde_yaml::from_value(value).map_err(crate::error::rule_invalid)
484 }
485
486 /// Optimise the rule with the optimisations provided.
487 pub fn optimise(mut self, options: Optimisations) -> Self {
488 if self.optimised {
489 return self;
490 }
491 if options.coalesce {
492 self.detection.expression =
493 optimiser::coalesce(self.detection.expression, &self.detection.identifiers);
494 self.detection.identifiers.clear();
495 }
496 if options.shake {
497 self.detection.expression = optimiser::shake(self.detection.expression);
498 self.detection.identifiers = self
499 .detection
500 .identifiers
501 .into_iter()
502 .map(|(k, v)| (k, optimiser::shake(v)))
503 .collect();
504 }
505 if options.rewrite {
506 self.detection.expression = optimiser::rewrite(self.detection.expression);
507 self.detection.identifiers = self
508 .detection
509 .identifiers
510 .into_iter()
511 .map(|(k, v)| (k, optimiser::rewrite(v)))
512 .collect();
513 }
514 if options.matrix {
515 self.detection.expression = optimiser::matrix(self.detection.expression);
516 self.detection.identifiers = self
517 .detection
518 .identifiers
519 .into_iter()
520 .map(|(k, v)| (k, optimiser::matrix(v)))
521 .collect();
522 }
523 self.optimised = true;
524 self
525 }
526
527 /// Evaluates the rule against the provided `Document`, returning true if it has matched.
528 #[inline]
529 pub fn matches(&self, document: &dyn Document) -> bool {
530 solver::solve(&self.detection, document)
531 }
532
533 /// Validates the rule's detection logic against the provided true positives and negatives.
534 pub fn validate(&self) -> crate::Result<bool> {
535 let mut errors = vec![];
536 for test in &self.true_positives {
537 if !(solver::solve(&self.detection, test.as_mapping().unwrap())) {
538 errors.push(format!(
539 "failed to validate true positive check '{:?}'",
540 test
541 ));
542 }
543 }
544 for test in &self.true_negatives {
545 if solver::solve(&self.detection, test.as_mapping().unwrap()) {
546 errors.push(format!(
547 "failed to validate true negative check '{:?}'",
548 test
549 ));
550 }
551 }
552 if !errors.is_empty() {
553 return Err(crate::Error::new(crate::error::Kind::Validation).with(errors.join(";")));
554 }
555 Ok(true)
556 }
557}
558
#[cfg(test)]
mod tests {
    use super::*;

    /// A well-formed rule deserialises and passes its own true positive and negative checks.
    #[test]
    fn rule() {
        let yaml = r#"
        detection:
          A:
            foo: 'foo*'
            bar: '*bar'
          B:
            foobar:
            - foobar
            - foobaz

          condition: A and B

        true_positives:
        - foo: foobar
          bar: foobar
          foobar: foobar

        true_negatives:
        - foo: bar
          bar: foo
          foobar: barfoo
        "#;
        let parsed = Rule::from_str(yaml).unwrap();
        assert!(parsed.validate().unwrap());
    }
}
590}