rsigma 0.17.0

CLI for parsing, validating, linting and evaluating Sigma detection rules
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
//! The visibility join and scoring heuristic.
//!
//! [`analyze`] joins three sets in one pass: the rule logsource inventory and
//! rule field set (from `--rules`), the observed field signal (from
//! `--observed`), and the mapping table. It produces a [`VisibilityAnalysis`]
//! that the DeTT&CT emitters, the Navigator layer, and the human report all
//! render from, so those surfaces cannot drift on the scoring.

use std::collections::{BTreeMap, BTreeSet};

use rsigma_eval::RuleFieldSet;
use rsigma_parser::{LogSource, SigmaCollection};

use super::mapping::{MappingTable, entry_products};

/// The observed field signal, parsed from the `--observed` JSON (or empty when
/// the flag is omitted).
///
/// The derived `Default` is the "no signal" value: `present` is `false`, both
/// collections are empty, and both counters are zero.
#[derive(Debug, Default)]
pub(crate) struct Observed {
    /// Whether `--observed` (or `--addr`) supplied a signal at all.
    pub present: bool,
    /// Rule field names that were referenced but never observed.
    ///
    /// Consulted by `is_observed`: a field listed here never counts as observed.
    pub missing: BTreeSet<String>,
    /// Observed field names not referenced by any rule, with their counts.
    pub unknown: Vec<ObservedField>,
    /// Total events the observer saw.
    pub events_observed: u64,
    /// Distinct field keys observed.
    pub unique_keys: usize,
}

impl Observed {
    /// Reports whether the rule field `field` counts as observed.
    ///
    /// Without a signal (`present` is `false`) no field is observed, which is
    /// the all-unobserved baseline. With a signal, a field is observed unless
    /// it is listed in the broken-coverage `missing` set.
    fn is_observed(&self, field: &str) -> bool {
        if !self.present {
            // No signal supplied: nothing can be confirmed as observed.
            return false;
        }
        !self.missing.contains(field)
    }
}

/// A single observed field name and its event count.
#[derive(Debug, Clone)]
pub(crate) struct ObservedField {
    /// The observed field name.
    pub field: String,
    /// The event count recorded for this field.
    pub count: u64,
}

/// Per-data-source visibility, the spine of every output.
///
/// As built by `analyze`, every list below comes from an ordered set, so its
/// entries are sorted and free of duplicates.
#[derive(Debug, Clone)]
pub(crate) struct DataSourceVisibility {
    /// Name of the data source (the key the join aggregates on).
    pub data_source: String,
    /// DeTT&CT 0-to-4 visibility score (none/minimal/medium/good/excellent).
    pub score: u8,
    /// Data components attributed to this source, via matched logsource
    /// mapping entries and via mapped rule fields.
    pub data_components: Vec<String>,
    /// Products gathered from the mapping entries that rule logsources matched.
    pub products: Vec<String>,
    /// Display forms of the rule logsources that resolved to this source.
    pub logsources: Vec<String>,
    /// Rule fields that map to a data component of this source.
    pub mapped_fields: Vec<String>,
    /// The subset of `mapped_fields` that counted as observed.
    pub observed_fields: Vec<String>,
    /// True when the source has mapped rule fields but none were observed.
    pub blind_spot: bool,
}

impl DataSourceVisibility {
    /// Whether this data source is available, meaning `observed_fields` holds
    /// at least one entry.
    pub(crate) fn available(&self) -> bool {
        let nothing_observed = self.observed_fields.is_empty();
        !nothing_observed
    }
}

/// Per-technique rolled-up visibility (the max over contributing data sources).
#[derive(Debug, Clone)]
pub(crate) struct TechniqueVisibility {
    /// Technique identifier as supplied by the mapping table.
    pub technique_id: String,
    /// Highest 0-to-4 score among the data sources listed in `data_sources`.
    pub score: u8,
    /// Names of the data sources whose components inform this technique.
    pub data_sources: Vec<String>,
}

/// An observed data source that no rule consumes (the inverse of a blind spot).
#[derive(Debug, Clone)]
pub(crate) struct UntappedSource {
    /// Name of the data source the unreferenced fields map to.
    pub data_source: String,
    /// The observed, unreferenced fields attributed to this source. `analyze`
    /// orders them by descending count, then by field name.
    pub observed_fields: Vec<ObservedField>,
}

/// The complete visibility picture rendered by every output surface.
#[derive(Debug, Clone)]
pub(crate) struct VisibilityAnalysis {
    /// One entry per data source reached by a rule logsource or a mapped rule
    /// field, ordered by data source name.
    pub data_sources: Vec<DataSourceVisibility>,
    /// Per-technique rollup, ordered by technique id.
    pub techniques: Vec<TechniqueVisibility>,
    /// Data sources seen only through observed, unreferenced fields.
    pub untapped: Vec<UntappedSource>,
    /// Display forms of rule logsources with no mapping match, sorted.
    pub unmapped_logsources: Vec<String>,
    /// Number of rules in the analyzed collection.
    pub rules_total: usize,
    /// Number of distinct non-empty logsource display forms across the rules.
    pub logsources_total: usize,
    /// Copied from `Observed::events_observed`.
    pub events_observed: u64,
    /// Copied from `Observed::unique_keys`.
    pub observed_unique_keys: usize,
    /// Copied from `Observed::present`.
    pub has_observed: bool,
}

impl VisibilityAnalysis {
    /// Borrows every data source flagged as a blind spot, i.e. one that rules
    /// expect but whose mapped fields were all unobserved. The order of
    /// `data_sources` is preserved.
    pub(crate) fn blind_spots(&self) -> Vec<&DataSourceVisibility> {
        let mut flagged = Vec::new();
        for source in &self.data_sources {
            if source.blind_spot {
                flagged.push(source);
            }
        }
        flagged
    }
}

/// Per-data-source accumulator used during the join.
///
/// Ordered sets keep every list deduplicated and sorted, so the finalized
/// output is deterministic.
#[derive(Default)]
struct Agg {
    /// Data components attributed to the source.
    components: BTreeSet<String>,
    /// Products contributed by matched logsource mapping entries.
    products: BTreeSet<String>,
    /// Display forms of the rule logsources that resolved to the source.
    logsources: BTreeSet<String>,
    /// Rule fields that map to one of the source's components.
    mapped_fields: BTreeSet<String>,
    /// The subset of `mapped_fields` that counted as observed.
    observed_fields: BTreeSet<String>,
}

/// Map an `(observed, mapped)` field tally to the DeTT&CT 0-to-4 scale.
///
/// The heuristic is deliberately conservative (DeTT&CT files are analyst
/// tuned). With `r = observed / mapped`:
///
/// | condition            | score |
/// |----------------------|-------|
/// | `mapped == 0`        | 0     |
/// | `observed == 0`      | 0     |
/// | `observed >= mapped` | 4     |
/// | `r <= 1/4`           | 1     |
/// | `r <= 1/2`           | 2     |
/// | otherwise            | 3     |
///
/// A data source with no mapped fields has no field signal and scores 0.
///
/// The bands are decided with exact integer cross-multiplication rather than
/// floating-point division, so tallies too large for an `f64` to represent
/// exactly can never be rounded into a neighbouring band.
pub(crate) fn score_fraction(observed: usize, mapped: usize) -> u8 {
    if mapped == 0 || observed == 0 {
        return 0;
    }
    if observed >= mapped {
        return 4;
    }
    // `usize` -> `u128` is a lossless widening on every supported target and
    // leaves ample headroom for the multiplications below.
    let observed = observed as u128;
    let mapped = mapped as u128;
    if observed * 4 <= mapped {
        1
    } else if observed * 2 <= mapped {
        2
    } else {
        3
    }
}

/// Translate a 0-to-4 visibility score into its DeTT&CT level name.
///
/// Scores 0 through 3 map to `none`, `minimal`, `medium`, and `good`. Every
/// other value (4 and above) is reported as `excellent`.
pub(crate) fn level_name(score: u8) -> &'static str {
    const NAMED_LEVELS: [&str; 4] = ["none", "minimal", "medium", "good"];
    NAMED_LEVELS
        .get(usize::from(score))
        .copied()
        .unwrap_or("excellent")
}

/// Render a logsource compactly: whichever of `category`, `product`, and
/// `service` are set, in that order, joined with `/`.
///
/// Returns `None` when none of the three parts is set.
fn logsource_display(ls: &LogSource) -> Option<String> {
    let mut present: Vec<&str> = Vec::with_capacity(3);
    for part in [&ls.category, &ls.product, &ls.service] {
        if let Some(text) = part.as_deref() {
            present.push(text);
        }
    }
    (!present.is_empty()).then(|| present.join("/"))
}

/// Join the rule inventory, the observed signal, and the mapping table.
///
/// The join proceeds in four steps:
///
/// 1. **Logsource inventory.** Each rule logsource with a non-empty display
///    form is resolved through `mapping`. Matching entries contribute their
///    data component, products, and the logsource itself to the entry's data
///    source; logsources with no match are reported as unmapped.
/// 2. **Field attribution.** Each rule field that maps to a data component is
///    recorded as a mapped field of that component's data source, and as an
///    observed field when `observed` says so.
/// 3. **Scoring and rollup.** Each data source is scored with
///    [`score_fraction`]; each technique takes the maximum score among the
///    data sources whose components inform it.
/// 4. **Untapped sources.** Observed fields that no rule references are
///    grouped under their data source when no rule expects that source.
///
/// Every list in the result is deterministically ordered because the
/// aggregation uses ordered maps and sets throughout.
pub(crate) fn analyze(
    collection: &SigmaCollection,
    rule_field_set: &RuleFieldSet,
    observed: &Observed,
    mapping: &MappingTable,
) -> VisibilityAnalysis {
    let mut sources: BTreeMap<String, Agg> = BTreeMap::new();
    let mut logsources_seen: BTreeSet<String> = BTreeSet::new();
    let mut unmapped: BTreeSet<String> = BTreeSet::new();

    // Logsource inventory: each rule logsource resolves to the data sources it
    // expects to receive. Rules whose logsource has no parts are skipped here
    // (they still count toward `rules_total`).
    for rule in &collection.rules {
        let Some(display) = logsource_display(&rule.logsource) else {
            continue;
        };
        logsources_seen.insert(display.clone());
        let matches = mapping.logsource_matches(&rule.logsource);
        if matches.is_empty() {
            // Last use of `display` on this path, so move it instead of cloning.
            unmapped.insert(display);
            continue;
        }
        for entry in matches {
            let agg = sources.entry(entry.data_source.clone()).or_default();
            agg.components.insert(entry.data_component.clone());
            agg.products.extend(entry_products(&rule.logsource, entry));
            agg.logsources.insert(display.clone());
        }
    }

    // Field attribution: every rule field that maps to a data component scores
    // its data source, whether or not a logsource reached that source.
    for (field, _origin) in rule_field_set.iter() {
        let Some(component) = mapping.field_component(field) else {
            continue;
        };
        let Some(ds) = mapping.component_source(component) else {
            continue;
        };
        let agg = sources.entry(ds.to_string()).or_default();
        agg.components.insert(component.to_string());
        agg.mapped_fields.insert(field.to_string());
        if observed.is_observed(field) {
            agg.observed_fields.insert(field.to_string());
        }
    }

    // Finalize per-source scores. `BTreeMap::into_iter` yields entries in
    // ascending key order, so the result is already sorted by data source name
    // and needs no further sorting.
    let data_sources: Vec<DataSourceVisibility> = sources
        .into_iter()
        .map(|(name, agg)| {
            let score = score_fraction(agg.observed_fields.len(), agg.mapped_fields.len());
            let blind_spot = !agg.mapped_fields.is_empty() && agg.observed_fields.is_empty();
            DataSourceVisibility {
                data_source: name,
                score,
                data_components: agg.components.into_iter().collect(),
                products: agg.products.into_iter().collect(),
                logsources: agg.logsources.into_iter().collect(),
                mapped_fields: agg.mapped_fields.into_iter().collect(),
                observed_fields: agg.observed_fields.into_iter().collect(),
                blind_spot,
            }
        })
        .collect();

    // Names of every data source the rules expect; used below to tell untapped
    // sources apart from ones that are already covered.
    let expected: BTreeSet<&str> = data_sources
        .iter()
        .map(|d| d.data_source.as_str())
        .collect();

    // Technique rollup: a technique's visibility is the best (max) score among
    // the data sources whose components inform it.
    let mut tech: BTreeMap<String, (u8, BTreeSet<String>)> = BTreeMap::new();
    for ds in &data_sources {
        for component in &ds.data_components {
            for technique in mapping.component_techniques(component) {
                // The default `(0, empty set)` is the identity for the max/insert below.
                let entry = tech.entry(technique.clone()).or_default();
                entry.0 = entry.0.max(ds.score);
                entry.1.insert(ds.data_source.clone());
            }
        }
    }
    let techniques: Vec<TechniqueVisibility> = tech
        .into_iter()
        .map(|(technique_id, (score, ds))| TechniqueVisibility {
            technique_id,
            score,
            data_sources: ds.into_iter().collect(),
        })
        .collect();

    // Untapped: observed, unreferenced fields whose data source no rule
    // consumes -- telemetry you receive but write no rule against.
    let mut untapped_map: BTreeMap<String, Vec<ObservedField>> = BTreeMap::new();
    for uf in &observed.unknown {
        let Some(component) = mapping.field_component(&uf.field) else {
            continue;
        };
        let Some(ds) = mapping.component_source(component) else {
            continue;
        };
        if expected.contains(ds) {
            continue;
        }
        untapped_map
            .entry(ds.to_string())
            .or_default()
            .push(uf.clone());
    }
    let untapped: Vec<UntappedSource> = untapped_map
        .into_iter()
        .map(|(data_source, mut fields)| {
            // Busiest fields first; ties broken by name for a stable order.
            fields.sort_by(|a, b| b.count.cmp(&a.count).then_with(|| a.field.cmp(&b.field)));
            UntappedSource {
                data_source,
                observed_fields: fields,
            }
        })
        .collect();

    VisibilityAnalysis {
        data_sources,
        techniques,
        untapped,
        unmapped_logsources: unmapped.into_iter().collect(),
        rules_total: collection.rules.len(),
        logsources_total: logsources_seen.len(),
        events_observed: observed.events_observed,
        observed_unique_keys: observed.unique_keys,
        has_observed: observed.present,
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `yaml`, collect its rule fields, and run [`analyze`] against the
    /// bundled mapping table with the given observed signal.
    fn analysis(yaml: &str, observed: Observed) -> VisibilityAnalysis {
        let collection = rsigma_parser::parse_sigma_yaml(yaml).expect("rules parse");
        let fields = RuleFieldSet::collect(&collection, &[], true);
        let table = MappingTable::bundled();
        analyze(&collection, &fields, &observed, &table)
    }

    /// A single process-creation rule that references `Image` and `CommandLine`.
    const RULE: &str = r#"
title: Suspicious Process
id: 00000000-0000-0000-0000-0000000000a1
logsource: {category: process_creation, product: windows}
detection:
    sel:
        Image|endswith: '\evil.exe'
        CommandLine|contains: '--steal'
    condition: sel
"#;

    #[test]
    fn score_fraction_boundaries() {
        // No mapped fields, or none observed: no visibility.
        assert_eq!(score_fraction(0, 0), 0);
        assert_eq!(score_fraction(0, 4), 0);
        // Quarter steps walk through minimal, medium, good, excellent.
        assert_eq!(score_fraction(1, 4), 1);
        assert_eq!(score_fraction(2, 4), 2);
        assert_eq!(score_fraction(3, 4), 3);
        assert_eq!(score_fraction(4, 4), 4);
        // A lone mapped field that is observed is full visibility.
        assert_eq!(score_fraction(1, 1), 4);
    }

    #[test]
    fn no_observed_is_baseline_all_zero() {
        let a = analysis(RULE, Observed::default());
        let process = a
            .data_sources
            .iter()
            .find(|d| d.data_source == "Process")
            .expect("Process data source expected");
        assert!(!a.has_observed);
        assert_eq!(process.score, 0);
        assert!(process.blind_spot, "no observation => blind spot");
        assert!(!process.available());
    }

    #[test]
    fn all_fields_observed_scores_excellent() {
        // A signal with an empty `missing` set means every mapped field was seen.
        let observed = Observed {
            present: true,
            events_observed: 100,
            unique_keys: 2,
            ..Observed::default()
        };
        let a = analysis(RULE, observed);
        let process = a
            .data_sources
            .iter()
            .find(|d| d.data_source == "Process")
            .unwrap();
        assert_eq!(process.score, 4);
        assert!(process.available());
        assert!(!process.blind_spot);
    }

    #[test]
    fn partial_observation_is_a_blind_spot_only_when_all_missing() {
        // `Image` is missing but `CommandLine` is seen: 1 of 2 mapped fields,
        // which lands on score 2 and is not a blind spot.
        let observed = Observed {
            present: true,
            missing: BTreeSet::from(["Image".to_owned()]),
            events_observed: 10,
            unique_keys: 1,
            ..Observed::default()
        };
        let a = analysis(RULE, observed);
        let process = a
            .data_sources
            .iter()
            .find(|d| d.data_source == "Process")
            .unwrap();
        assert_eq!(process.score, 2);
        assert!(!process.blind_spot);
    }

    #[test]
    fn unmapped_logsource_is_surfaced() {
        let yaml = r#"
title: Odd
id: 00000000-0000-0000-0000-0000000000b1
logsource: {category: totally_unknown_thing}
detection: {sel: {Foo: bar}, condition: sel}
"#;
        let a = analysis(yaml, Observed::default());
        assert!(
            a.unmapped_logsources
                .iter()
                .any(|ls| ls == "totally_unknown_thing")
        );
    }

    #[test]
    fn techniques_roll_up_from_data_sources() {
        let observed = Observed {
            present: true,
            events_observed: 5,
            unique_keys: 2,
            ..Observed::default()
        };
        let a = analysis(RULE, observed);
        // Process Creation informs T1059, so full observation of the Process
        // source must roll up to a technique score of 4.
        let t = a
            .techniques
            .iter()
            .find(|t| t.technique_id == "T1059")
            .expect("T1059 reachable via Process Creation");
        assert_eq!(t.score, 4);
        assert!(t.data_sources.iter().any(|name| name == "Process"));
    }

    #[test]
    fn untapped_source_when_observed_but_unreferenced() {
        // The only rule targets Process. An observed registry field that no
        // rule references must surface Windows Registry as untapped.
        let registry_field = ObservedField {
            field: "TargetObject".to_string(),
            count: 42,
        };
        let observed = Observed {
            present: true,
            unknown: vec![registry_field],
            events_observed: 50,
            unique_keys: 3,
            ..Observed::default()
        };
        let a = analysis(RULE, observed);
        let untapped = a
            .untapped
            .iter()
            .find(|u| u.data_source == "Windows Registry")
            .expect("Windows Registry is untapped");
        assert_eq!(untapped.observed_fields[0].field, "TargetObject");
    }
}