Skip to main content

taudit_core/
custom_rules.rs

1use crate::finding::{
2    Finding, FindingCategory, FindingExtras, FindingSource, Recommendation, Severity,
3};
4use crate::graph::{AuthorityGraph, NodeKind, TrustZone};
5use crate::propagation::PropagationPath;
6use serde::de::{self, MapAccess, Visitor};
7use serde::{Deserialize, Deserializer};
8use std::collections::{HashMap, HashSet};
9use std::fmt;
10use std::fs;
11use std::io;
12use std::path::{Path, PathBuf};
13
/// Upper bound, in bytes (2 MiB), on a single custom-rule YAML file. Enforced
/// by `read_to_string_capped`, which reports `CustomRuleError::FileTooLarge`
/// for anything bigger.
const MAX_INPUT_BYTES: u64 = 2 * 1024 * 1024;
/// Maximum number of entries `deserialize_capped_vec` accepts in one list.
const MAX_CUSTOM_RULE_VEC_ITEMS: usize = 1024;
16
/// Human-readable form of the pattern every `CustomRule.id` must satisfy.
/// Quoted verbatim in each validation error so operators always see the same
/// contract.
const RULE_ID_REGEX: &str = "^[A-Za-z_][A-Za-z0-9_-]{0,63}$";

/// Checks `id` against the charset + length contract for `CustomRule.id`.
///
/// An id is accepted when it is 1..=64 bytes long, starts with an ASCII letter
/// or underscore, and continues with ASCII letters, digits, `_` or `-` only.
/// That rules out brackets and whitespace, which would otherwise corrupt the
/// `[id] name: …` finding message contract or the SARIF/JSON
/// `extract_custom_rule_id` heuristic.
///
/// # Errors
///
/// Returns a message naming the first violated constraint. Checks run in the
/// order: empty, too long, bad first character, bad later character.
fn validate_rule_id(id: &str) -> Result<(), String> {
    let mut rest = id.chars();
    let head = match rest.next() {
        Some(c) => c,
        None => {
            return Err(format!(
                "rule id must be non-empty (allowed: {RULE_ID_REGEX})"
            ));
        }
    };

    // Byte length; every accepted character is ASCII, so for valid ids this
    // equals the character count.
    if id.len() > 64 {
        return Err(format!(
            "rule id '{id}' exceeds 64 characters (allowed: {RULE_ID_REGEX})"
        ));
    }

    if !head.is_ascii_alphabetic() && head != '_' {
        return Err(format!(
            "rule id '{id}' must start with an ASCII letter or underscore (allowed: {RULE_ID_REGEX})"
        ));
    }

    // `rest` now yields everything after the first character.
    let offender = rest.find(|&c| !(c.is_ascii_alphanumeric() || c == '_' || c == '-'));
    match offender {
        Some(bad) => Err(format!(
            "rule id '{id}' contains invalid character '{bad}' (allowed: ASCII letters, digits, underscore, hyphen — {RULE_ID_REGEX})"
        )),
        None => Ok(()),
    }
}
52
53/// Serde shim: deserialise `CustomRule.id` as a `String`, then enforce the
54/// validation contract above. The error is surfaced via
55/// `serde::de::Error::custom` so it appears in `serde_yaml` parse errors with
56/// a path/line annotation pointing at the offending document — operators see
57/// exactly which YAML file's `id:` field is wrong.
58fn deserialize_validated_id<'de, D>(deserializer: D) -> Result<String, D::Error>
59where
60    D: Deserializer<'de>,
61{
62    let raw = String::deserialize(deserializer)?;
63    validate_rule_id(&raw).map_err(de::Error::custom)?;
64    Ok(raw)
65}
66
67fn deserialize_capped_vec<'de, D, T>(deserializer: D) -> Result<Vec<T>, D::Error>
68where
69    D: Deserializer<'de>,
70    T: Deserialize<'de>,
71{
72    let values = Vec::<T>::deserialize(deserializer)?;
73    if values.len() > MAX_CUSTOM_RULE_VEC_ITEMS {
74        return Err(de::Error::custom(format!(
75            "custom-rule list exceeds {MAX_CUSTOM_RULE_VEC_ITEMS} entries"
76        )));
77    }
78    Ok(values)
79}
80
/// A user-defined rule loaded from YAML. Fires when source, sink, and path
/// predicates all match a propagation path produced by the engine.
#[derive(Debug, Clone, Deserialize)]
pub struct CustomRule {
    /// Stable identifier, embedded into every emitted finding's message and
    /// extracted by SARIF / JSON sinks via `extract_custom_rule_id`. Validated
    /// at deserialise time against `^[A-Za-z_][A-Za-z0-9_-]{0,63}$` so a
    /// malicious or sloppy YAML cannot inject `]`/`[`/whitespace and corrupt
    /// the message-encoding contract or empty out the rule attribution.
    #[serde(deserialize_with = "deserialize_validated_id")]
    pub id: String,
    /// Rule name; required in the YAML. Part of the `[id] name: …` finding
    /// message contract referenced by the id validator.
    pub name: String,
    /// Optional free-form description; an omitted key yields an empty string.
    #[serde(default)]
    pub description: String,
    /// Required severity. NOTE(review): presumably copied onto each finding
    /// this rule emits — the emission code is not in this part of the file.
    pub severity: Severity,
    /// Required finding category. NOTE(review): presumably copied onto each
    /// emitted finding — confirm against the emission code.
    pub category: FindingCategory,
    /// Match predicates, written under the YAML key `match`. When the key is
    /// omitted the default (empty) `MatchSpec` is used.
    #[serde(rename = "match", default)]
    pub match_spec: MatchSpec,
    /// Path of the YAML file this rule was loaded from. Set by
    /// `load_rules_dir` / `parse_rules_multi_doc_with_source`. Threaded into
    /// every `Finding` this rule emits (`FindingSource::Custom`) so an
    /// operator inspecting JSON / SARIF output can distinguish authentic
    /// built-in findings from any rule that may have been planted in a
    /// shared `--invariants-dir`. Defaults to `None` for rules constructed
    /// in tests or in code paths that didn't go through the loader.
    ///
    /// Never read from the YAML itself (`skip`), so a rule file cannot forge
    /// its own provenance.
    #[serde(default, skip)]
    pub source_file: Option<PathBuf>,
}
109
/// The `match:` section of a custom rule. Every field is optional in the YAML
/// and falls back to its `Default` (an empty matcher / `None`) when omitted.
#[derive(Debug, Clone, Default, Deserialize)]
pub struct MatchSpec {
    /// Predicate for the source node of a propagation path.
    #[serde(default)]
    pub source: NodeMatcher,
    /// Predicate for the sink node of a propagation path.
    #[serde(default)]
    pub sink: NodeMatcher,
    /// Predicate on the path itself (see `PathMatcher`).
    #[serde(default)]
    pub path: PathMatcher,
    /// Graph-level metadata predicate. Applied to `AuthorityGraph::metadata`
    /// (e.g. `META_TRIGGER`, `META_REPOSITORIES`). When present, ALL conditions
    /// must hold *in addition to* source/sink/path. Reuses the same typed
    /// predicate language as node-level metadata (`equals`, `not_equals`,
    /// `contains`, `in`, plus `not:` negation).
    #[serde(default)]
    pub graph_metadata: MetadataMatcher,
    /// Standalone node predicate. When present, the matcher iterates every
    /// node in the graph and emits one finding per matching node — the
    /// source/sink/path fields are ignored, but `graph_metadata:` still
    /// applies as a graph-wide gate. This is the node-shape-only mode used
    /// for invariants like "any floating Image" where there is no
    /// propagation chain to walk.
    #[serde(default)]
    pub standalone: Option<NodeMatcher>,
}
134
135/// Maximum number of elements allowed in a single `Vec<T>` deserialised
136/// from a custom-rule YAML field. A hostile YAML with
137/// `node_type: [secret, secret, … 10M times …]` would otherwise allocate
138/// hundreds of MiB before any rule logic runs (multiple such fields per
139/// rule, multiple rules per file = linear amplification). 1024 is well
140/// above any realistic rule (the largest rule in the existing tree
141/// names ~6 node kinds) and an obvious operator-actionable cap.
142pub const MAX_RULE_VEC_LEN: usize = 1024;
143
/// A scalar-or-list helper. Lets YAML write `node_type: secret` (single value)
/// or `node_type: [secret, identity]` (any-of). Single form preserved for
/// backward compatibility with v0.4.x rule files.
///
/// The `Many` variant is capped at [`MAX_RULE_VEC_LEN`] elements at
/// deserialisation time — see the custom `Deserialize` impl below.
#[derive(Debug, Clone)]
pub enum OneOrMany<T> {
    /// A single value, produced from a YAML scalar.
    One(T),
    /// An any-of list, produced from a YAML sequence. May be empty, in which
    /// case `contains` is false for every value.
    Many(Vec<T>),
}
155
156// Custom Deserialize so we can enforce `MAX_RULE_VEC_LEN` *during* the
157// sequence visit, before serde allocates a full Vec for an attacker.
158//
159// Behaviour matches the previous `#[serde(untagged)]` derive:
160//   * a YAML scalar deserialises to `OneOrMany::One(t)`
161//   * a YAML sequence deserialises to `OneOrMany::Many(vec)`
162//   * any other shape errors via the standard serde error path
163//
164// A cap violation surfaces as a serde error referencing the field, so
165// the resulting `serde_yaml::Error` already carries a path/line annotation
166// pointing operators at the offending document.
167impl<'de, T> Deserialize<'de> for OneOrMany<T>
168where
169    T: Deserialize<'de>,
170{
171    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
172    where
173        D: Deserializer<'de>,
174    {
175        struct OneOrManyVisitor<T> {
176            _phantom: std::marker::PhantomData<T>,
177        }
178
179        impl<'de, T> Visitor<'de> for OneOrManyVisitor<T>
180        where
181            T: Deserialize<'de>,
182        {
183            type Value = OneOrMany<T>;
184
185            fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result {
186                f.write_str("a single value or a sequence (max 1024 elements)")
187            }
188
189            // Forward every scalar shape to T's Deserialize via
190            // `deserialize_any` on a tiny wrapper. We can't call T's
191            // Deserialize directly here without a deserializer; instead
192            // route through `IntoDeserializer` for the supported scalar
193            // types serde_yaml emits.
194            fn visit_bool<E: de::Error>(self, v: bool) -> Result<Self::Value, E> {
195                T::deserialize(serde::de::value::BoolDeserializer::new(v)).map(OneOrMany::One)
196            }
197            fn visit_i64<E: de::Error>(self, v: i64) -> Result<Self::Value, E> {
198                T::deserialize(serde::de::value::I64Deserializer::new(v)).map(OneOrMany::One)
199            }
200            fn visit_u64<E: de::Error>(self, v: u64) -> Result<Self::Value, E> {
201                T::deserialize(serde::de::value::U64Deserializer::new(v)).map(OneOrMany::One)
202            }
203            fn visit_f64<E: de::Error>(self, v: f64) -> Result<Self::Value, E> {
204                T::deserialize(serde::de::value::F64Deserializer::new(v)).map(OneOrMany::One)
205            }
206            fn visit_str<E: de::Error>(self, v: &str) -> Result<Self::Value, E> {
207                T::deserialize(serde::de::value::StrDeserializer::new(v)).map(OneOrMany::One)
208            }
209            fn visit_string<E: de::Error>(self, v: String) -> Result<Self::Value, E> {
210                T::deserialize(serde::de::value::StringDeserializer::new(v)).map(OneOrMany::One)
211            }
212            fn visit_unit<E: de::Error>(self) -> Result<Self::Value, E> {
213                T::deserialize(serde::de::value::UnitDeserializer::new()).map(OneOrMany::One)
214            }
215
216            fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
217            where
218                A: de::SeqAccess<'de>,
219            {
220                // Pre-size with a hint capped at MAX_RULE_VEC_LEN so a
221                // hostile size_hint (e.g. usize::MAX) cannot trick us into
222                // a giant up-front allocation either.
223                let cap_hint = seq
224                    .size_hint()
225                    .map(|h| h.min(MAX_RULE_VEC_LEN))
226                    .unwrap_or(0);
227                let mut out: Vec<T> = Vec::with_capacity(cap_hint);
228                while let Some(item) = seq.next_element::<T>()? {
229                    if out.len() >= MAX_RULE_VEC_LEN {
230                        return Err(de::Error::custom(format!(
231                            "list field exceeds maximum of {MAX_RULE_VEC_LEN} elements; \
232                             split into multiple rules instead"
233                        )));
234                    }
235                    out.push(item);
236                }
237                Ok(OneOrMany::Many(out))
238            }
239        }
240
241        deserializer.deserialize_any(OneOrManyVisitor::<T> {
242            _phantom: std::marker::PhantomData,
243        })
244    }
245}
246
247impl<T: PartialEq> OneOrMany<T> {
248    fn contains(&self, value: &T) -> bool {
249        match self {
250            OneOrMany::One(v) => v == value,
251            OneOrMany::Many(vs) => vs.iter().any(|v| v == value),
252        }
253    }
254}
255
/// Per-field metadata predicate. Bare string is `equals` (back-compat with
/// v0.4.x). Operator object supports `equals`, `not_equals`, `contains` (substring
/// match on string values), and `in` (any-of allowed values).
///
/// Being `untagged`, serde tries the variants in declaration order: a plain
/// string becomes `Equals`, anything else is attempted as `Op`.
#[derive(Debug, Clone, Deserialize)]
#[serde(untagged)]
pub enum MetadataPredicate {
    /// `key: "value"` — equality (back-compat).
    Equals(String),
    /// `key: { equals/not_equals/contains/in: ... }`
    Op(MetadataOp),
}
267
/// Operator form of a metadata predicate. Every operator is optional; the
/// ones that are present must ALL hold (see `MetadataOp::matches`). Unknown
/// keys are rejected at deserialise time (`deny_unknown_fields`), so a
/// misspelled operator is an error rather than a silently ignored condition.
#[derive(Debug, Clone, Default, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct MetadataOp {
    /// The metadata value must be present and equal to this string.
    #[serde(default)]
    pub equals: Option<String>,
    /// The metadata value must not equal this string. An absent value
    /// satisfies this operator.
    #[serde(default)]
    pub not_equals: Option<String>,
    /// Substring match on the string-valued metadata field.
    #[serde(default)]
    pub contains: Option<String>,
    /// Any-of allowed values. Capped at [`MAX_RULE_VEC_LEN`] elements at
    /// deserialise time so a hostile YAML cannot allocate an unbounded
    /// `Vec<String>` before any rule logic runs.
    #[serde(
        default,
        rename = "in",
        deserialize_with = "deserialize_capped_opt_vec_string"
    )]
    pub in_: Option<Vec<String>>,
}
288
289/// Deserialize `Option<Vec<String>>` while enforcing
290/// [`MAX_RULE_VEC_LEN`] inside the sequence visitor. The error message
291/// names the cap and recommends splitting the rule, matching the
292/// `OneOrMany` cap diagnostic so operators see consistent guidance.
293fn deserialize_capped_opt_vec_string<'de, D>(
294    deserializer: D,
295) -> Result<Option<Vec<String>>, D::Error>
296where
297    D: Deserializer<'de>,
298{
299    struct CappedVecStringVisitor;
300
301    impl<'de> Visitor<'de> for CappedVecStringVisitor {
302        type Value = Option<Vec<String>>;
303
304        fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result {
305            f.write_str("a sequence of strings (max 1024 elements) or null")
306        }
307
308        fn visit_unit<E: de::Error>(self) -> Result<Self::Value, E> {
309            Ok(None)
310        }
311
312        fn visit_none<E: de::Error>(self) -> Result<Self::Value, E> {
313            Ok(None)
314        }
315
316        fn visit_some<D>(self, deserializer: D) -> Result<Self::Value, D::Error>
317        where
318            D: Deserializer<'de>,
319        {
320            deserializer.deserialize_seq(self)
321        }
322
323        fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
324        where
325            A: de::SeqAccess<'de>,
326        {
327            let cap_hint = seq
328                .size_hint()
329                .map(|h| h.min(MAX_RULE_VEC_LEN))
330                .unwrap_or(0);
331            let mut out: Vec<String> = Vec::with_capacity(cap_hint);
332            while let Some(item) = seq.next_element::<String>()? {
333                if out.len() >= MAX_RULE_VEC_LEN {
334                    return Err(de::Error::custom(format!(
335                        "metadata `in:` list exceeds maximum of {MAX_RULE_VEC_LEN} \
336                         elements; split into multiple rules instead"
337                    )));
338                }
339                out.push(item);
340            }
341            Ok(Some(out))
342        }
343    }
344
345    deserializer.deserialize_option(CappedVecStringVisitor)
346}
347
348impl MetadataOp {
349    fn matches(&self, actual: Option<&String>) -> bool {
350        // If the metadata key is absent, only `not_equals` can succeed (against
351        // anything-not-this-value), all positive operators fail.
352        if let Some(want) = &self.equals {
353            if actual.map(|s| s.as_str()) != Some(want.as_str()) {
354                return false;
355            }
356        }
357        if let Some(want) = &self.not_equals {
358            if actual.map(|s| s.as_str()) == Some(want.as_str()) {
359                return false;
360            }
361        }
362        if let Some(needle) = &self.contains {
363            match actual {
364                Some(s) if s.contains(needle.as_str()) => {}
365                _ => return false,
366            }
367        }
368        if let Some(allowed) = &self.in_ {
369            match actual {
370                Some(s) if allowed.iter().any(|a| a == s) => {}
371                _ => return false,
372            }
373        }
374        true
375    }
376}
377
378impl MetadataPredicate {
379    fn matches(&self, actual: Option<&String>) -> bool {
380        match self {
381            MetadataPredicate::Equals(want) => actual.map(|s| s.as_str()) == Some(want.as_str()),
382            MetadataPredicate::Op(op) => op.matches(actual),
383        }
384    }
385}
386
/// Metadata matcher: map of field -> predicate, with an optional `not`
/// sub-matcher (negation). The `not:` key is reserved and parsed specially —
/// it cannot be used as a metadata field name.
#[derive(Debug, Clone, Default)]
pub struct MetadataMatcher {
    /// Field name -> predicate. Every entry must hold for the matcher to
    /// match; an empty map imposes no constraint.
    pub fields: HashMap<String, MetadataPredicate>,
    /// Negated sub-matcher: when present, the overall match fails if this
    /// inner matcher matches the same metadata.
    pub not: Option<Box<MetadataMatcher>>,
}
395
396impl MetadataMatcher {
397    fn matches(&self, metadata: &HashMap<String, String>) -> bool {
398        for (key, pred) in &self.fields {
399            if !pred.matches(metadata.get(key)) {
400                return false;
401            }
402        }
403        if let Some(inner) = &self.not {
404            if inner.matches(metadata) {
405                return false;
406            }
407        }
408        true
409    }
410
411    fn is_empty(&self) -> bool {
412        self.fields.is_empty() && self.not.is_none()
413    }
414}
415
416// Custom Deserialize: pull out reserved `not` key, rest become field predicates.
417impl<'de> Deserialize<'de> for MetadataMatcher {
418    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
419    where
420        D: Deserializer<'de>,
421    {
422        struct MetadataMatcherVisitor;
423
424        impl<'de> Visitor<'de> for MetadataMatcherVisitor {
425            type Value = MetadataMatcher;
426
427            fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result {
428                f.write_str("a metadata predicate map (field -> string|operator) with optional `not:` sub-map")
429            }
430
431            fn visit_map<M>(self, mut map: M) -> Result<MetadataMatcher, M::Error>
432            where
433                M: MapAccess<'de>,
434            {
435                let mut fields: HashMap<String, MetadataPredicate> = HashMap::new();
436                let mut not: Option<Box<MetadataMatcher>> = None;
437
438                while let Some(key) = map.next_key::<String>()? {
439                    if key == "not" {
440                        if not.is_some() {
441                            return Err(de::Error::duplicate_field("not"));
442                        }
443                        let inner: MetadataMatcher = map.next_value()?;
444                        not = Some(Box::new(inner));
445                    } else {
446                        let value: MetadataPredicate = map.next_value()?;
447                        if fields.insert(key.clone(), value).is_some() {
448                            return Err(de::Error::custom(format!(
449                                "duplicate metadata field `{key}`"
450                            )));
451                        }
452                    }
453                }
454
455                Ok(MetadataMatcher { fields, not })
456            }
457        }
458
459        deserializer.deserialize_map(MetadataMatcherVisitor)
460    }
461}
462
/// Predicate over a single graph node. Every criterion is optional; the ones
/// that are set must all hold for the node to match (see
/// `NodeMatcher::matches`).
#[derive(Debug, Clone, Default, Deserialize)]
pub struct NodeMatcher {
    /// Single value (`node_type: secret`) or any-of list (`[secret, identity]`).
    #[serde(default)]
    pub node_type: Option<OneOrMany<NodeKind>>,
    /// Single value or any-of list.
    #[serde(default)]
    pub trust_zone: Option<OneOrMany<TrustZone>>,
    /// Predicate on the node's metadata map; the default (empty) matcher
    /// imposes no constraint.
    #[serde(default)]
    pub metadata: MetadataMatcher,
    /// Negation: matches when the inner sub-matcher does NOT match.
    /// Nested `not` is allowed and double-negation collapses naturally.
    #[serde(default)]
    pub not: Option<Box<NodeMatcher>>,
}
478
/// Predicate on a propagation path as a whole.
#[derive(Debug, Clone, Default, Deserialize)]
pub struct PathMatcher {
    /// Trust zones named by the rule; empty when the key is omitted. The list
    /// is length-limited by `deserialize_capped_vec`.
    /// NOTE(review): the evaluation code is not in this part of the file —
    /// presumably the path must cross into one of these zones; confirm.
    #[serde(default, deserialize_with = "deserialize_capped_vec")]
    pub crosses_to: Vec<TrustZone>,
}
484
/// Everything that can go wrong while discovering, reading, or parsing
/// custom-rule files. Each variant records the path it concerns.
#[derive(Debug)]
pub enum CustomRuleError {
    /// A filesystem operation on the given path failed. The loader uses this
    /// for directory listing, canonicalisation, metadata lookup, and file
    /// reads alike.
    FileRead(PathBuf, io::Error),
    /// The file was read but its contents did not parse as custom rules.
    YamlParse(PathBuf, serde_yaml::Error),
    /// The file is larger than the loader's size cap and was not parsed.
    FileTooLarge {
        /// File that was rejected.
        path: PathBuf,
        /// The cap that was exceeded, in bytes.
        max_bytes: u64,
        /// Size observed for the file, in bytes.
        actual_bytes: u64,
    },
    /// A symlink in the rules directory resolved to a path outside the
    /// declared `--invariants-dir` tree. Refused unless the caller opts in
    /// via `allow_external_symlinks: true` (CLI flag
    /// `--invariants-allow-external-symlinks`). See red-team R2 #4.
    SymlinkOutsideDir {
        /// The symlink as found in the directory walk.
        link: PathBuf,
        /// Canonicalised path the symlink resolves to.
        target: PathBuf,
    },
}
503
504impl fmt::Display for CustomRuleError {
505    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
506        match self {
507            CustomRuleError::FileRead(path, err) => {
508                write!(
509                    f,
510                    "failed to read custom rule file {}: {err}",
511                    path.display()
512                )
513            }
514            CustomRuleError::YamlParse(path, err) => {
515                write!(
516                    f,
517                    "failed to parse custom rule file {}: {err}",
518                    path.display()
519                )
520            }
521            CustomRuleError::FileTooLarge {
522                path,
523                max_bytes,
524                actual_bytes,
525            } => {
526                write!(
527                    f,
528                    "custom rule file {} exceeds {max_bytes} byte limit ({actual_bytes} bytes)",
529                    path.display()
530                )
531            }
532            CustomRuleError::SymlinkOutsideDir { link, target } => {
533                write!(
534                    f,
535                    "refusing to follow symlink {} → {} (target outside --invariants-dir; potential symlink traversal). Use --invariants-allow-external-symlinks to override.",
536                    link.display(),
537                    target.display()
538                )
539            }
540        }
541    }
542}
543
544impl std::error::Error for CustomRuleError {
545    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
546        match self {
547            CustomRuleError::FileRead(_, err) => Some(err),
548            CustomRuleError::YamlParse(_, err) => Some(err),
549            CustomRuleError::FileTooLarge { .. } => None,
550            CustomRuleError::SymlinkOutsideDir { .. } => None,
551        }
552    }
553}
554
/// Load all `*.yml` and `*.yaml` files from `dir`. Files are read in sorted
/// order for deterministic output.
///
/// The result is all-or-nothing: `Ok(rules)` only when every file loaded
/// cleanly. If anything failed, `Err` carries ALL collected errors and the
/// rules that did parse are discarded, so a caller cannot continue with a
/// partial rule set from this API.
///
/// Symlinks pointing OUTSIDE `dir` are refused by default (red-team R2 #4).
/// Use [`load_rules_dir_with_opts`] to opt into the legacy follow-everything
/// behavior.
pub fn load_rules_dir(dir: &Path) -> Result<Vec<CustomRule>, Vec<CustomRuleError>> {
    load_rules_dir_with_opts(dir, false)
}
565
566/// Like [`load_rules_dir`] but lets the caller decide what to do with
567/// symlinks that escape the declared directory.
568///
569/// - The loader walks `dir` **recursively** via a hand-rolled DFS, so
570///   operators can organise rules into subdirectories like
571///   `invariants/gha/`, `invariants/ado/` and have all of them load.
572/// - In-tree symlinks (canonicalized target lives under canonicalized `dir`)
573///   are always followed; a stderr warning is emitted naming the link and
574///   target so the user is never surprised.
575/// - Out-of-tree symlinks are:
576///   - REFUSED with a `CustomRuleError::SymlinkOutsideDir` when
577///     `allow_external_symlinks` is `false` (default — safe).
578///   - Followed, with a louder stderr warning, when
579///     `allow_external_symlinks` is `true` (caller opted in via
580///     `--invariants-allow-external-symlinks`).
581/// - Files reached via multiple paths (e.g. `real.yml` and an `alias.yml ->
582///   ./real.yml` symlink in the same tree) are deduplicated by canonical
583///   path so the same rule never fires twice. A stderr warning is emitted
584///   when a duplicate is dropped.
585///
586/// Why: the loader walks `--invariants-dir` recursively and previously
587/// followed every symlink without checking. A symlink under the directory
588/// pointing OUT (e.g. to `/etc/passwd` or an attacker-controlled file)
589/// was silently read in. This function makes that escape opt-in.
590pub fn load_rules_dir_with_opts(
591    dir: &Path,
592    allow_external_symlinks: bool,
593) -> Result<Vec<CustomRule>, Vec<CustomRuleError>> {
594    // Canonicalize the directory once so we can compare every symlink target
595    // against the same normalized prefix. If canonicalization fails (e.g. a
596    // broken symlink in the path), fall back to the literal path — better to
597    // be conservative and treat *every* symlink as out-of-tree than to crash.
598    let canonical_dir = fs::canonicalize(dir).unwrap_or_else(|_| dir.to_path_buf());
599
600    let mut errors: Vec<CustomRuleError> = Vec::new();
601    // Pairs of (literal_path, canonical_path) for YAML files to read.
602    // Literal path is stamped into `FindingSource::Custom.source_file` so
603    // operator-facing output retains the path as written; canonical path
604    // is the dedup key so symlink aliases collapse to a single rule load.
605    let mut files: Vec<(PathBuf, PathBuf)> = Vec::new();
606    let mut seen: HashSet<PathBuf> = HashSet::new();
607    // Visited directory canonical paths — guards against cycles introduced
608    // by directory symlinks looping back into an ancestor.
609    let mut visited_dirs: HashSet<PathBuf> = HashSet::new();
610    visited_dirs.insert(canonical_dir.clone());
611
612    let mut stack: Vec<PathBuf> = vec![dir.to_path_buf()];
613
614    while let Some(current) = stack.pop() {
615        let read_dir = match fs::read_dir(&current) {
616            Ok(rd) => rd,
617            Err(err) => {
618                errors.push(CustomRuleError::FileRead(current, err));
619                continue;
620            }
621        };
622
623        // The collected `files` vec is sorted before reading, so any
624        // intra-directory order is fine here — iterate the read_dir lazily.
625        for entry in read_dir.flatten() {
626            let path = entry.path();
627
628            // is_symlink uses symlink_metadata under the hood — does not
629            // follow. We need this BEFORE deciding whether to descend.
630            let is_symlink = entry
631                .file_type()
632                .map(|ft| ft.is_symlink())
633                .unwrap_or_else(|_| path.is_symlink());
634
635            // Resolve to a canonical target for symlinks; for regular paths
636            // the canonical path is the same as canonicalize on the literal.
637            // Either way we need the canonical for in-tree check + dedup.
638            let canonical_target = match fs::canonicalize(&path) {
639                Ok(t) => t,
640                Err(err) => {
641                    errors.push(CustomRuleError::FileRead(path.clone(), err));
642                    continue;
643                }
644            };
645
646            // Apply the in-tree-symlink protection at every step of the
647            // recursion (lifted from the original shallow loader).
648            if is_symlink {
649                let in_tree = canonical_target.starts_with(&canonical_dir);
650                if !in_tree {
651                    if allow_external_symlinks {
652                        eprintln!(
653                            "WARNING: following external symlink {} → {} (allowed by --invariants-allow-external-symlinks)",
654                            path.display(),
655                            canonical_target.display()
656                        );
657                    } else {
658                        errors.push(CustomRuleError::SymlinkOutsideDir {
659                            link: path,
660                            target: canonical_target,
661                        });
662                        continue;
663                    }
664                } else {
665                    eprintln!(
666                        "WARNING: following symlink {} → {}",
667                        path.display(),
668                        canonical_target.display()
669                    );
670                }
671            }
672
673            // Classify by the resolved target's metadata. `metadata()`
674            // follows symlinks, which is what we want here.
675            let meta = match fs::metadata(&path) {
676                Ok(m) => m,
677                Err(err) => {
678                    errors.push(CustomRuleError::FileRead(path.clone(), err));
679                    continue;
680                }
681            };
682
683            if meta.is_dir() {
684                // Cycle protection: only descend into a directory whose
685                // canonical path we have not seen yet.
686                if visited_dirs.insert(canonical_target.clone()) {
687                    stack.push(path);
688                }
689                continue;
690            }
691
692            if !meta.is_file() {
693                continue;
694            }
695            match path.extension().and_then(|e| e.to_str()) {
696                Some("yml") | Some("yaml") => {}
697                _ => continue,
698            }
699
700            // Dedup: first sighting wins, so the literal path stamped into
701            // `source_file` is the one DFS reached first.
702            if !seen.insert(canonical_target.clone()) {
703                eprintln!(
704                    "WARNING: symlink {} resolved to the same file already loaded; skipping",
705                    path.display()
706                );
707                continue;
708            }
709            files.push((path, canonical_target));
710        }
711    }
712
713    // Sort by literal path so rule order is deterministic regardless of
714    // filesystem readdir order or DFS traversal order.
715    files.sort_by(|a, b| a.0.cmp(&b.0));
716
717    let mut rules = Vec::new();
718    for (path, _canonical) in files {
719        match read_to_string_capped(&path) {
720            Ok(content) => match parse_rules_multi_doc_with_source(&content, Some(&path)) {
721                Ok(mut parsed) => rules.append(&mut parsed),
722                Err(err) => errors.push(CustomRuleError::YamlParse(path, err)),
723            },
724            Err(err) => errors.push(err),
725        }
726    }
727
728    if errors.is_empty() {
729        Ok(rules)
730    } else {
731        Err(errors)
732    }
733}
734
735fn read_to_string_capped(path: &Path) -> Result<String, CustomRuleError> {
736    let metadata = fs::metadata(path).map_err(|err| CustomRuleError::FileRead(path.into(), err))?;
737    if metadata.len() > MAX_INPUT_BYTES {
738        return Err(CustomRuleError::FileTooLarge {
739            path: path.into(),
740            max_bytes: MAX_INPUT_BYTES,
741            actual_bytes: metadata.len(),
742        });
743    }
744    let content =
745        fs::read_to_string(path).map_err(|err| CustomRuleError::FileRead(path.into(), err))?;
746    if content.len() as u64 > MAX_INPUT_BYTES {
747        return Err(CustomRuleError::FileTooLarge {
748            path: path.into(),
749            max_bytes: MAX_INPUT_BYTES,
750            actual_bytes: content.len() as u64,
751        });
752    }
753    Ok(content)
754}
755
/// Parse a YAML string containing one or more `CustomRule` documents (separated
/// by `---`). Single-doc files behave identically to the legacy
/// `serde_yaml::from_str::<CustomRule>` path. Empty/whitespace-only documents
/// (e.g. a leading `---` followed by a real doc) are skipped.
///
/// Equivalent to `parse_rules_multi_doc_with_source(content, None)` — provenance
/// stamping is opt-in via the `_with_source` variant so callers that don't
/// know the originating path (tests, stdin) keep working unchanged.
///
/// # Errors
///
/// Returns the first `serde_yaml::Error` encountered. Parsing stops at that
/// document and rules parsed from earlier documents are not returned.
pub fn parse_rules_multi_doc(content: &str) -> Result<Vec<CustomRule>, serde_yaml::Error> {
    parse_rules_multi_doc_with_source(content, None)
}
767
768/// Parse one or more `CustomRule` documents from `content` and stamp every
769/// produced rule with `source_file = source` so downstream finding emission
770/// can attribute authority back to the originating YAML file. Used by
771/// `load_rules_dir` to thread file paths through into `FindingSource::Custom`.
772pub fn parse_rules_multi_doc_with_source(
773    content: &str,
774    source: Option<&Path>,
775) -> Result<Vec<CustomRule>, serde_yaml::Error> {
776    let mut rules = Vec::new();
777    for doc in serde_yaml::Deserializer::from_str(content) {
778        // An empty document deserializes as `Value::Null`; skip those so a
779        // leading `---` or trailing separator doesn't break the load.
780        let value = serde_yaml::Value::deserialize(doc)?;
781        if value.is_null() {
782            continue;
783        }
784        let mut rule: CustomRule = serde_yaml::from_value(value)?;
785        rule.source_file = source.map(|p| p.to_path_buf());
786        rules.push(rule);
787    }
788    Ok(rules)
789}
790
791impl NodeMatcher {
792    fn matches(&self, node: &crate::graph::Node) -> bool {
793        if let Some(kinds) = &self.node_type {
794            if !kinds.contains(&node.kind) {
795                return false;
796            }
797        }
798        if let Some(zones) = &self.trust_zone {
799            if !zones.contains(&node.trust_zone) {
800                return false;
801            }
802        }
803        if !self.metadata.matches(&node.metadata) {
804            return false;
805        }
806        if let Some(inner) = &self.not {
807            if inner.matches(node) {
808                return false;
809            }
810        }
811        true
812    }
813
814    /// True when the matcher has no constraints — used by tests/tooling.
815    #[allow(dead_code)]
816    fn is_wildcard(&self) -> bool {
817        self.node_type.is_none()
818            && self.trust_zone.is_none()
819            && self.metadata.is_empty()
820            && self.not.is_none()
821    }
822}
823
824impl PathMatcher {
825    fn matches(&self, path: &PropagationPath) -> bool {
826        if self.crosses_to.is_empty() {
827            return true;
828        }
829        match path.boundary_crossing {
830            Some((_, to_zone)) => self.crosses_to.contains(&to_zone),
831            None => false,
832        }
833    }
834}
835
836/// Evaluate every (rule, path) pair. A finding is produced when the rule's
837/// source, sink, and path predicates all match. Findings carry the rule id in
838/// the message so operators can trace back to the originating YAML.
839pub fn evaluate_custom_rules(
840    graph: &AuthorityGraph,
841    paths: &[PropagationPath],
842    rules: &[CustomRule],
843) -> Vec<Finding> {
844    let mut findings = Vec::new();
845
846    for rule in rules {
847        // Standalone (node-shape-only) mode: when `standalone:` is present,
848        // walk every node in the graph and emit one finding per match. The
849        // source/sink/path fields are ignored, but `graph_metadata:` still
850        // gates whether the rule runs at all — that's how PR-context
851        // assertions on node shape work.
852        if let Some(matcher) = &rule.match_spec.standalone {
853            if !rule.match_spec.graph_metadata.matches(&graph.metadata) {
854                continue;
855            }
856            for node in &graph.nodes {
857                if !matcher.matches(node) {
858                    continue;
859                }
860                findings.push(Finding {
861                    severity: rule.severity,
862                    category: rule.category,
863                    nodes_involved: vec![node.id],
864                    message: format!("[{}] {}: {}", rule.id, rule.name, node.name),
865                    recommendation: Recommendation::Manual {
866                        action: if rule.description.is_empty() {
867                            format!("Review custom rule '{}'", rule.id)
868                        } else {
869                            rule.description.clone()
870                        },
871                    },
872                    path: None,
873                    source: custom_source(rule),
874                    extras: FindingExtras::default(),
875                });
876            }
877            continue;
878        }
879
880        // Graph-level metadata gate: if the predicate doesn't hold against
881        // `graph.metadata`, no path in this graph can match this rule. Skip
882        // the path loop entirely. An empty `graph_metadata:` (the common case
883        // for rules that don't care about graph-level state) trivially matches.
884        if !rule.match_spec.graph_metadata.matches(&graph.metadata) {
885            continue;
886        }
887
888        for path in paths {
889            let source_node = match graph.node(path.source) {
890                Some(n) => n,
891                None => continue,
892            };
893            let sink_node = match graph.node(path.sink) {
894                Some(n) => n,
895                None => continue,
896            };
897
898            if !rule.match_spec.source.matches(source_node) {
899                continue;
900            }
901            if !rule.match_spec.sink.matches(sink_node) {
902                continue;
903            }
904            if !rule.match_spec.path.matches(path) {
905                continue;
906            }
907
908            findings.push(Finding {
909                severity: rule.severity,
910                category: rule.category,
911                nodes_involved: vec![path.source, path.sink],
912                message: format!(
913                    "[{}] {}: {} -> {}",
914                    rule.id, rule.name, source_node.name, sink_node.name
915                ),
916                recommendation: Recommendation::Manual {
917                    action: if rule.description.is_empty() {
918                        format!("Review custom rule '{}'", rule.id)
919                    } else {
920                        rule.description.clone()
921                    },
922                },
923                path: Some(path.clone()),
924                source: custom_source(rule),
925                extras: FindingExtras::default(),
926            });
927        }
928    }
929
930    findings
931}
932
933/// Build a `FindingSource::Custom` from the rule's tracked YAML path. Falls
934/// back to an empty path when the rule was constructed in-memory (test,
935/// stdin) and never carried provenance — those callers already know the
936/// finding is custom; the empty path just makes that obvious.
937fn custom_source(rule: &CustomRule) -> FindingSource {
938    FindingSource::Custom {
939        source_file: rule.source_file.clone().unwrap_or_default(),
940    }
941}
942
943#[cfg(test)]
944mod tests {
945    use super::*;
946    use crate::graph::{AuthorityGraph, EdgeKind, PipelineSource};
947    use crate::propagation::{propagation_analysis, DEFAULT_MAX_HOPS};
948
949    fn source() -> PipelineSource {
950        PipelineSource {
951            file: "test.yml".into(),
952            repo: None,
953            git_ref: None,
954            commit_sha: None,
955        }
956    }
957
958    fn build_graph_with_paths() -> (AuthorityGraph, Vec<PropagationPath>) {
959        let mut g = AuthorityGraph::new(source());
960        let secret = g.add_node(NodeKind::Secret, "API_KEY", TrustZone::FirstParty);
961        let trusted = g.add_node(NodeKind::Step, "build", TrustZone::FirstParty);
962        let untrusted = g.add_node(NodeKind::Step, "third-party", TrustZone::Untrusted);
963
964        g.add_edge(trusted, secret, EdgeKind::HasAccessTo);
965        g.add_edge(trusted, untrusted, EdgeKind::DelegatesTo);
966
967        let paths = propagation_analysis(&g, DEFAULT_MAX_HOPS);
968        (g, paths)
969    }
970
971    fn one<T>(v: T) -> Option<OneOrMany<T>> {
972        Some(OneOrMany::One(v))
973    }
974
#[test]
fn custom_rule_fires_on_matching_path() {
    // First-party source, untrusted sink, no constraint on the path itself.
    let first_party_source = NodeMatcher {
        node_type: None,
        trust_zone: one(TrustZone::FirstParty),
        metadata: MetadataMatcher::default(),
        not: None,
    };
    let untrusted_sink = NodeMatcher {
        node_type: None,
        trust_zone: one(TrustZone::Untrusted),
        metadata: MetadataMatcher::default(),
        not: None,
    };
    let rule = CustomRule {
        id: "secret_to_untrusted".into(),
        name: "Secret reaching untrusted step".into(),
        description: "Custom policy".into(),
        severity: Severity::Critical,
        category: FindingCategory::AuthorityPropagation,
        match_spec: MatchSpec {
            source: first_party_source,
            sink: untrusted_sink,
            path: PathMatcher::default(),
            graph_metadata: MetadataMatcher::default(),
            standalone: None,
        },
        source_file: None,
    };

    let (graph, paths) = build_graph_with_paths();
    let findings = evaluate_custom_rules(&graph, &paths, &[rule]);

    assert_eq!(findings.len(), 1);
    let finding = &findings[0];
    assert_eq!(finding.severity, Severity::Critical);
    assert!(finding.message.contains("secret_to_untrusted"));
}
1010
#[test]
fn custom_rule_does_not_fire_when_predicates_miss() {
    // The rule demands an untrusted *source*; the fixture is expected to
    // yield no finding for it.
    let untrusted_source = NodeMatcher {
        node_type: None,
        trust_zone: one(TrustZone::Untrusted),
        metadata: MetadataMatcher::default(),
        not: None,
    };
    let rule = CustomRule {
        id: "miss".into(),
        name: "Untrusted source".into(),
        description: String::new(),
        severity: Severity::Critical,
        category: FindingCategory::AuthorityPropagation,
        match_spec: MatchSpec {
            source: untrusted_source,
            sink: NodeMatcher::default(),
            path: PathMatcher::default(),
            graph_metadata: MetadataMatcher::default(),
            standalone: None,
        },
        source_file: None,
    };

    let (graph, paths) = build_graph_with_paths();
    assert!(evaluate_custom_rules(&graph, &paths, &[rule]).is_empty());
}
1039
#[test]
fn yaml_round_trip_loads_full_rule() {
    // A rule using every top-level field plus source, sink and path blocks.
    let document = r#"
id: my_secret_to_untrusted
name: Secret reaching untrusted step
description: "Custom policy: secrets must not reach untrusted steps"
severity: critical
category: authority_propagation
match:
  source:
    node_type: secret
    trust_zone: first_party
  sink:
    node_type: step
    trust_zone: untrusted
  path:
    crosses_to: [untrusted]
"#;
    let parsed: CustomRule = serde_yaml::from_str(document).expect("yaml must parse");

    assert_eq!(parsed.id, "my_secret_to_untrusted");
    assert_eq!(parsed.severity, Severity::Critical);

    let spec = &parsed.match_spec;
    assert!(matches!(
        spec.source.node_type,
        Some(OneOrMany::One(NodeKind::Secret))
    ));
    assert!(matches!(
        spec.sink.trust_zone,
        Some(OneOrMany::One(TrustZone::Untrusted))
    ));
    assert_eq!(spec.path.crosses_to, vec![TrustZone::Untrusted]);
}
1071
#[test]
fn metadata_predicate_must_match_all_keys() {
    // Secret tagged `kind=deploy`, reachable from a first-party step that
    // also delegates to an untrusted step.
    let mut graph = AuthorityGraph::new(source());
    let token_meta = HashMap::from([("kind".to_string(), "deploy".to_string())]);
    let token =
        graph.add_node_with_metadata(NodeKind::Secret, "TOKEN", TrustZone::FirstParty, token_meta);
    let remote = graph.add_node(NodeKind::Step, "remote", TrustZone::Untrusted);
    let user = graph.add_node(NodeKind::Step, "use", TrustZone::FirstParty);
    graph.add_edge(user, token, EdgeKind::HasAccessTo);
    graph.add_edge(user, remote, EdgeKind::DelegatesTo);

    let paths = propagation_analysis(&graph, DEFAULT_MAX_HOPS);

    // Builds a rule whose source must be a secret with `kind == <kind>`.
    let rule_requiring_kind = |id: &str, kind: &str| CustomRule {
        id: id.into(),
        name: "n".into(),
        description: String::new(),
        severity: Severity::High,
        category: FindingCategory::AuthorityPropagation,
        match_spec: MatchSpec {
            source: NodeMatcher {
                node_type: one(NodeKind::Secret),
                trust_zone: None,
                metadata: MetadataMatcher {
                    fields: HashMap::from([(
                        "kind".to_string(),
                        MetadataPredicate::Equals(kind.to_string()),
                    )]),
                    not: None,
                },
                not: None,
            },
            sink: NodeMatcher::default(),
            path: PathMatcher::default(),
            graph_metadata: MetadataMatcher::default(),
            standalone: None,
        },
        source_file: None,
    };

    // Matching value: exactly one finding.
    let hit = rule_requiring_kind("hit", "deploy");
    assert_eq!(evaluate_custom_rules(&graph, &paths, &[hit]).len(), 1);

    // Different value for the same key: nothing fires.
    let miss = rule_requiring_kind("miss", "build");
    assert!(evaluate_custom_rules(&graph, &paths, &[miss]).is_empty());
}
1146
#[test]
fn load_rules_dir_reads_yml_and_yaml() {
    let dir = std::env::temp_dir().join(format!("taudit-custom-rules-{}", std::process::id()));
    fs::create_dir_all(&dir).unwrap();

    // Two rule files (one per accepted extension) plus a file the test
    // expects the loader to ignore.
    let fixtures = [
        (
            "a.yml",
            "id: a\nname: a\nseverity: high\ncategory: authority_propagation\n",
        ),
        (
            "b.yaml",
            "id: b\nname: b\nseverity: medium\ncategory: unpinned_action\n",
        ),
        ("c.txt", "ignored"),
    ];
    for &(file_name, body) in &fixtures {
        fs::write(dir.join(file_name), body).unwrap();
    }

    let loaded = load_rules_dir(&dir).expect("load must succeed");
    assert_eq!(loaded.len(), 2);
    assert_eq!(loaded[0].id, "a");
    assert_eq!(loaded[1].id, "b");

    // Best-effort cleanup of the scratch directory.
    let _ = fs::remove_dir_all(&dir);
}
1175
#[test]
fn load_rules_dir_reports_yaml_errors_with_path() {
    let dir_name = format!("taudit-custom-rules-bad-{}", std::process::id());
    let dir = std::env::temp_dir().join(dir_name);
    fs::create_dir_all(&dir).unwrap();

    // A rule file with an invalid severity value.
    fs::write(dir.join("bad.yml"), "id: x\nseverity: not-a-real-severity\n").unwrap();

    let failures = load_rules_dir(&dir).expect_err("should fail");
    assert_eq!(failures.len(), 1);
    let msg = failures[0].to_string();
    assert!(msg.contains("bad.yml"), "error must mention path: {msg}");

    // Best-effort cleanup of the scratch directory.
    let _ = fs::remove_dir_all(&dir);
}
1191
/// `EgressBlindspot` and `MissingAuditTrail` are marked
/// `#[serde(skip_deserializing)]`: they cannot be detected from pipeline
/// YAML alone and need runtime telemetry / external audit configuration.
/// A custom-rule YAML naming either category must therefore fail to load
/// with serde's `unknown variant` error, even though both variants still
/// serialise normally and stay valid in OUTPUT schemas.
#[test]
fn reserved_categories_rejected_by_custom_rule_loader() {
    let dir = std::env::temp_dir().join(format!(
        "taudit-custom-rules-reserved-{}",
        std::process::id()
    ));
    fs::create_dir_all(&dir).unwrap();

    // True when an error message is serde's rejection of `category`.
    let names_variant =
        |text: &str, category: &str| text.contains("unknown variant") && text.contains(category);

    // First reserved variant: the only file in the directory, so exactly one
    // error is expected.
    fs::write(
        dir.join("reserved.yml"),
        "id: r\nname: r\nseverity: high\ncategory: egress_blindspot\n",
    )
    .unwrap();
    let first = load_rules_dir(&dir).expect_err("reserved category must be rejected");
    assert_eq!(first.len(), 1);
    let msg = first[0].to_string();
    assert!(
        names_variant(&msg, "egress_blindspot"),
        "expected an `unknown variant `egress_blindspot`` error, got: {msg}"
    );

    // Second reserved variant. The first file is still present, so both
    // files are bad now and each surfaces its own error.
    fs::write(
        dir.join("reserved2.yml"),
        "id: r2\nname: r2\nseverity: high\ncategory: missing_audit_trail\n",
    )
    .unwrap();
    let second = load_rules_dir(&dir).expect_err("second reserved category must be rejected");
    assert!(second
        .iter()
        .any(|err| names_variant(&err.to_string(), "missing_audit_trail")));

    // Best-effort cleanup of the scratch directory.
    let _ = fs::remove_dir_all(&dir);
}
1237
/// Constructive seal contract: serde refuses to deserialise the reserved
/// variants, yet Rust code can still construct them (e.g. on
/// runtime-enrichment paths), and when it does they MUST serialise verbatim
/// to their snake_case names. That is what makes it correct for the OUTPUT
/// schemas to advertise them.
#[test]
fn reserved_categories_still_serialize_when_constructed_in_rust() {
    // Identical finding apart from the category under test.
    let finding_with = |category: FindingCategory| Finding {
        severity: Severity::Medium,
        category,
        path: None,
        nodes_involved: vec![],
        message: "runtime-enriched".into(),
        recommendation: Recommendation::Manual {
            action: "investigate".into(),
        },
        source: FindingSource::BuiltIn,
        extras: FindingExtras::default(),
    };

    let egress = finding_with(FindingCategory::EgressBlindspot);
    let egress_json = serde_json::to_value(&egress).expect("serialises fine");
    assert_eq!(egress_json["category"], "egress_blindspot");

    let audit = finding_with(FindingCategory::MissingAuditTrail);
    let audit_json = serde_json::to_value(&audit).expect("serialises fine");
    assert_eq!(audit_json["category"], "missing_audit_trail");
}
1267
1268    // ── v0.6 grammar additions: negation + typed metadata predicates ─────
1269
1270    /// Build a graph with one secret in first_party reaching one untrusted
1271    /// step. Used by the new grammar tests.
1272    fn simple_first_to_untrusted_graph() -> (AuthorityGraph, Vec<PropagationPath>) {
1273        let mut g = AuthorityGraph::new(source());
1274        let mut meta = HashMap::new();
1275        meta.insert("oidc".to_string(), "true".to_string());
1276        meta.insert("permissions".to_string(), "contents: write".to_string());
1277        meta.insert("role".to_string(), "admin".to_string());
1278        let secret =
1279            g.add_node_with_metadata(NodeKind::Identity, "GH_TOKEN", TrustZone::FirstParty, meta);
1280        let step = g.add_node(NodeKind::Step, "use-it", TrustZone::FirstParty);
1281        let untrusted = g.add_node(NodeKind::Step, "third-party", TrustZone::Untrusted);
1282        g.add_edge(step, secret, EdgeKind::HasAccessTo);
1283        g.add_edge(step, untrusted, EdgeKind::DelegatesTo);
1284        let paths = propagation_analysis(&g, DEFAULT_MAX_HOPS);
1285        (g, paths)
1286    }
1287
#[test]
fn negation_on_trust_zone_inverts_match() {
    // The fixture's sink is untrusted, so a sink constrained to
    // "not untrusted" must reject it and leave no findings.
    let document = r#"
id: r
name: r
severity: high
category: authority_propagation
match:
  sink:
    not:
      trust_zone: untrusted
"#;
    let rule: CustomRule = serde_yaml::from_str(document).expect("yaml parses");

    let (graph, paths) = simple_first_to_untrusted_graph();
    let findings = evaluate_custom_rules(&graph, &paths, &[rule]);
    assert!(findings.is_empty());
}
1305
#[test]
fn negation_on_node_type_list_matches_other_kinds() {
    let (graph, paths) = simple_first_to_untrusted_graph();
    let findings_for = |document: &str| {
        let rule: CustomRule = serde_yaml::from_str(document).expect("yaml parses");
        evaluate_custom_rules(&graph, &paths, &[rule])
    };

    // The fixture's source is an identity. "not [secret, identity]" excludes
    // it, so the source predicate fails and nothing is reported.
    let excludes_identity = r#"
id: r
name: r
severity: high
category: authority_propagation
match:
  source:
    not:
      node_type: [secret, identity]
"#;
    assert!(findings_for(excludes_identity).is_empty());

    // Inverse: "not [step]". The inner matcher does NOT match an identity, so
    // the negation holds and at least one finding fires.
    let excludes_step = r#"
id: r2
name: r2
severity: high
category: authority_propagation
match:
  source:
    not:
      node_type: [step]
"#;
    assert!(!findings_for(excludes_step).is_empty());
}
1339
#[test]
fn metadata_negation_matches_absent_or_other_value() {
    // The fixture identity has oidc=true, so a source constrained by
    // `not: { oidc: "true" }` excludes it and no finding is produced.
    let document = r#"
id: r
name: r
severity: high
category: authority_propagation
match:
  source:
    metadata:
      not:
        oidc: "true"
"#;
    let rule: CustomRule = serde_yaml::from_str(document).expect("yaml parses");

    let (graph, paths) = simple_first_to_untrusted_graph();
    let findings = evaluate_custom_rules(&graph, &paths, &[rule]);
    assert!(findings.is_empty());
}
1359
#[test]
fn metadata_contains_does_substring_match() {
    let (graph, paths) = simple_first_to_untrusted_graph();
    let findings_for = |document: &str| {
        let rule: CustomRule = serde_yaml::from_str(document).expect("yaml parses");
        evaluate_custom_rules(&graph, &paths, &[rule])
    };

    // Positive case: the fixture's `permissions` value is "contents: write".
    let present = r#"
id: r
name: r
severity: high
category: authority_propagation
match:
  source:
    metadata:
      permissions:
        contains: "contents: write"
"#;
    assert_eq!(findings_for(present).len(), 1);

    // Negative case: the substring does not occur in the value.
    let absent = r#"
id: r
name: r
severity: high
category: authority_propagation
match:
  source:
    metadata:
      permissions:
        contains: "actions: write"
"#;
    assert!(findings_for(absent).is_empty());
}
1392
#[test]
fn metadata_in_matches_any_of_allowed_values() {
    let (graph, paths) = simple_first_to_untrusted_graph();
    let findings_for = |document: &str| {
        let rule: CustomRule = serde_yaml::from_str(document).expect("yaml parses");
        evaluate_custom_rules(&graph, &paths, &[rule])
    };

    // The fixture's role is "admin", which is one of the listed values.
    let listed = r#"
id: r
name: r
severity: high
category: authority_propagation
match:
  source:
    metadata:
      role:
        in: [admin, owner, write]
"#;
    assert_eq!(findings_for(listed).len(), 1);

    // "admin" is not in this list, so nothing fires.
    let unlisted = r#"
id: r
name: r
severity: high
category: authority_propagation
match:
  source:
    metadata:
      role:
        in: [reader, none]
"#;
    assert!(findings_for(unlisted).is_empty());
}
1424
#[test]
fn metadata_not_equals_excludes_specific_value() {
    let (graph, paths) = simple_first_to_untrusted_graph();
    let findings_for = |document: &str| {
        let rule: CustomRule = serde_yaml::from_str(document).expect("yaml parses");
        evaluate_custom_rules(&graph, &paths, &[rule])
    };

    // The fixture's role is "admin": `not_equals: admin` fails, no findings.
    let excludes_admin = r#"
id: r
name: r
severity: high
category: authority_propagation
match:
  source:
    metadata:
      role:
        not_equals: admin
"#;
    assert!(findings_for(excludes_admin).is_empty());

    // `not_equals: reader` holds for role=admin, so one finding fires.
    let excludes_reader = r#"
id: r
name: r
severity: high
category: authority_propagation
match:
  source:
    metadata:
      role:
        not_equals: reader
"#;
    assert_eq!(findings_for(excludes_reader).len(), 1);
}
1457
#[test]
fn nested_not_collapses_to_inner_condition() {
    // not(not(trust_zone=first_party)) is equivalent to
    // trust_zone=first_party. The fixture's source is first_party, so the
    // rule is expected to fire.
    let document = r#"
id: r
name: r
severity: high
category: authority_propagation
match:
  source:
    not:
      not:
        trust_zone: first_party
"#;
    let rule: CustomRule = serde_yaml::from_str(document).expect("yaml parses");

    let (graph, paths) = simple_first_to_untrusted_graph();
    let findings = evaluate_custom_rules(&graph, &paths, &[rule]);
    assert!(!findings.is_empty());
}
1477
#[test]
fn node_type_accepts_single_value_back_compat() {
    // The original v0.4 scalar form must keep parsing and behaving the same.
    let document = r#"
id: r
name: r
severity: high
category: authority_propagation
match:
  source:
    node_type: identity
    trust_zone: first_party
    metadata:
      oidc: "true"
"#;
    let rule: CustomRule = serde_yaml::from_str(document).expect("v0.4 form must still parse");

    // Scalars land in the single-value variant; a bare metadata string
    // becomes an equality predicate.
    let source_matcher = &rule.match_spec.source;
    assert!(matches!(
        source_matcher.node_type,
        Some(OneOrMany::One(NodeKind::Identity))
    ));
    assert!(matches!(
        source_matcher.trust_zone,
        Some(OneOrMany::One(TrustZone::FirstParty))
    ));
    let oidc = source_matcher
        .metadata
        .fields
        .get("oidc")
        .expect("oidc predicate");
    assert!(matches!(oidc, MetadataPredicate::Equals(v) if v == "true"));

    let (graph, paths) = simple_first_to_untrusted_graph();
    let findings = evaluate_custom_rules(&graph, &paths, &[rule]);
    assert_eq!(findings.len(), 1);
}
1514
#[test]
fn node_type_accepts_list_form() {
    let document = r#"
id: r
name: r
severity: high
category: authority_propagation
match:
  source:
    node_type: [secret, identity]
    trust_zone: [first_party, third_party]
"#;
    let rule: CustomRule = serde_yaml::from_str(document).expect("list form must parse");

    // A YAML sequence must land in the multi-value variant, in order.
    let kinds = match &rule.match_spec.source.node_type {
        Some(OneOrMany::Many(kinds)) => kinds,
        other => panic!("expected list form, got {other:?}"),
    };
    assert_eq!(kinds, &vec![NodeKind::Secret, NodeKind::Identity]);

    let (graph, paths) = simple_first_to_untrusted_graph();
    let findings = evaluate_custom_rules(&graph, &paths, &[rule]);
    assert_eq!(findings.len(), 1);
}
1537
1538    // ── Gap B: graph-level metadata predicates ──────────────
1539
1540    /// Builds a graph with one PR-context source/sink path and lets tests set
1541    /// graph-level metadata to pressure-test the new predicate.
1542    fn pr_context_graph_with_meta(meta: &[(&str, &str)]) -> (AuthorityGraph, Vec<PropagationPath>) {
1543        let mut g = AuthorityGraph::new(source());
1544        let mut secret_meta = HashMap::new();
1545        secret_meta.insert("variable_group".to_string(), "true".to_string());
1546        let secret = g.add_node_with_metadata(
1547            NodeKind::Secret,
1548            "VG_SECRET",
1549            TrustZone::FirstParty,
1550            secret_meta,
1551        );
1552        let step = g.add_node(NodeKind::Step, "use", TrustZone::FirstParty);
1553        let untrusted = g.add_node(NodeKind::Step, "third-party", TrustZone::Untrusted);
1554        g.add_edge(step, secret, crate::graph::EdgeKind::HasAccessTo);
1555        g.add_edge(step, untrusted, crate::graph::EdgeKind::DelegatesTo);
1556        for (k, v) in meta {
1557            g.metadata.insert((*k).to_string(), (*v).to_string());
1558        }
1559        let paths = propagation_analysis(&g, DEFAULT_MAX_HOPS);
1560        (g, paths)
1561    }
1562
#[test]
fn graph_metadata_equals_matches_when_value_present() {
    // Graph-level `trigger == pr` combined with a source metadata constraint.
    let document = r#"
id: r
name: r
severity: high
category: authority_propagation
match:
  graph_metadata:
    trigger:
      equals: pr
  source:
    metadata:
      variable_group: "true"
"#;
    let rule: CustomRule = serde_yaml::from_str(document).expect("yaml parses");

    let (graph, paths) = pr_context_graph_with_meta(&[("trigger", "pr")]);
    let findings = evaluate_custom_rules(&graph, &paths, &[rule]);
    assert_eq!(findings.len(), 1);
}
1582
#[test]
fn graph_metadata_in_matches_any_of_listed_values() {
    // The graph's trigger is one of the three values the rule lists.
    let document = r#"
id: r
name: r
severity: high
category: authority_propagation
match:
  graph_metadata:
    trigger:
      in: [pull_request_target, pr, merge_request_event]
"#;
    let rule: CustomRule = serde_yaml::from_str(document).expect("yaml parses");

    let (graph, paths) = pr_context_graph_with_meta(&[("trigger", "merge_request_event")]);
    let findings = evaluate_custom_rules(&graph, &paths, &[rule]);
    assert!(!findings.is_empty());
}
1599
1600    #[test]
1601    fn graph_metadata_negation_excludes_unwanted_trigger() {
1602        // graph trigger=push, rule wants "not push" → must NOT fire.
1603        let (graph, paths) = pr_context_graph_with_meta(&[("trigger", "push")]);
1604        let yaml = r#"
1605id: r
1606name: r
1607severity: high
1608category: authority_propagation
1609match:
1610  graph_metadata:
1611    not:
1612      trigger:
1613        equals: push
1614"#;
1615        let rule: CustomRule = serde_yaml::from_str(yaml).expect("yaml parses");
1616        assert!(evaluate_custom_rules(&graph, &paths, &[rule]).is_empty());
1617
1618        // Inverse: trigger=pr, rule wants "not push" → fires.
1619        let (graph2, paths2) = pr_context_graph_with_meta(&[("trigger", "pr")]);
1620        let rule2: CustomRule = serde_yaml::from_str(yaml).expect("yaml parses");
1621        assert!(!evaluate_custom_rules(&graph2, &paths2, &[rule2]).is_empty());
1622    }
1623
1624    #[test]
1625    fn graph_metadata_missing_key_does_not_match_no_crash() {
1626        // Graph has no `trigger` metadata at all. `equals: pr` requires the key
1627        // to be present with that value → no findings, no panic.
1628        let (graph, paths) = pr_context_graph_with_meta(&[]);
1629        assert!(!graph.metadata.contains_key("trigger"));
1630        let yaml = r#"
1631id: r
1632name: r
1633severity: high
1634category: authority_propagation
1635match:
1636  graph_metadata:
1637    trigger:
1638      equals: pr
1639"#;
1640        let rule: CustomRule = serde_yaml::from_str(yaml).expect("yaml parses");
1641        let findings = evaluate_custom_rules(&graph, &paths, &[rule]);
1642        assert!(findings.is_empty(), "missing key must yield no findings");
1643    }
1644
1645    #[test]
1646    fn rules_without_graph_metadata_remain_backward_compatible() {
1647        // No `graph_metadata:` block → trivially matches regardless of graph
1648        // state. This is the v0.4-v0.9 behaviour and must keep working.
1649        let (graph, paths) = pr_context_graph_with_meta(&[("trigger", "anything")]);
1650        let yaml = r#"
1651id: r
1652name: r
1653severity: high
1654category: authority_propagation
1655match:
1656  source:
1657    metadata:
1658      variable_group: "true"
1659"#;
1660        let rule: CustomRule = serde_yaml::from_str(yaml).expect("yaml parses");
1661        assert_eq!(evaluate_custom_rules(&graph, &paths, &[rule]).len(), 1);
1662    }
1663
1664    // ── Gap C: image sinks + standalone node predicates ─────
1665
1666    /// Builds a graph with one Identity → Step → Image (Untrusted) chain.
1667    /// The Image node is reached via `UsesImage` so propagation_analysis
1668    /// produces a path whose sink is the Image — this is what lets custom
1669    /// rules use `sink: { node_type: image }`.
1670    fn graph_with_image_sink() -> (AuthorityGraph, Vec<PropagationPath>) {
1671        let mut g = AuthorityGraph::new(source());
1672        let identity = g.add_node(NodeKind::Identity, "GH_TOKEN", TrustZone::FirstParty);
1673        let step = g.add_node(NodeKind::Step, "publish", TrustZone::FirstParty);
1674        let image = g.add_node(
1675            NodeKind::Image,
1676            "third-party/deploy@v1",
1677            TrustZone::Untrusted,
1678        );
1679        g.add_edge(step, identity, crate::graph::EdgeKind::HasAccessTo);
1680        g.add_edge(step, image, crate::graph::EdgeKind::UsesImage);
1681        let paths = propagation_analysis(&g, DEFAULT_MAX_HOPS);
1682        (g, paths)
1683    }
1684
1685    #[test]
1686    fn sink_node_type_image_matches_image_path_endpoint() {
1687        let (graph, paths) = graph_with_image_sink();
1688        let yaml = r#"
1689id: r
1690name: r
1691severity: high
1692category: untrusted_with_authority
1693match:
1694  sink:
1695    node_type: image
1696    trust_zone: untrusted
1697"#;
1698        let rule: CustomRule = serde_yaml::from_str(yaml).expect("yaml parses");
1699        let findings = evaluate_custom_rules(&graph, &paths, &[rule]);
1700        assert!(
1701            !findings.is_empty(),
1702            "Image-as-sink must produce at least one finding"
1703        );
1704    }
1705
1706    #[test]
1707    fn standalone_matches_every_floating_image_in_graph() {
1708        // Two Image nodes: one floating (no `digest` metadata), one digest-pinned.
1709        let mut g = AuthorityGraph::new(source());
1710        let _step = g.add_node(NodeKind::Step, "build", TrustZone::FirstParty);
1711        let _floating1 = g.add_node(NodeKind::Image, "alpine:latest", TrustZone::ThirdParty);
1712        let _floating2 = g.add_node(NodeKind::Image, "ubuntu:22.04", TrustZone::ThirdParty);
1713        let mut pinned_meta = HashMap::new();
1714        pinned_meta.insert("digest".to_string(), "sha256:abc".to_string());
1715        let _pinned = g.add_node_with_metadata(
1716            NodeKind::Image,
1717            "alpine@sha256:abc",
1718            TrustZone::ThirdParty,
1719            pinned_meta,
1720        );
1721        // Propagation paths irrelevant for standalone mode.
1722        let paths: Vec<PropagationPath> = Vec::new();
1723
1724        let yaml = r#"
1725id: floating_image_standalone
1726name: Floating image
1727severity: medium
1728category: unpinned_action
1729match:
1730  standalone:
1731    node_type: image
1732    not:
1733      metadata:
1734        digest:
1735          contains: "sha256:"
1736"#;
1737        let rule: CustomRule = serde_yaml::from_str(yaml).expect("yaml parses");
1738        let findings = evaluate_custom_rules(&g, &paths, &[rule]);
1739        assert_eq!(
1740            findings.len(),
1741            2,
1742            "standalone must fire once per floating Image node"
1743        );
1744    }
1745
1746    #[test]
1747    fn standalone_supports_in_operator() {
1748        let mut g = AuthorityGraph::new(source());
1749        let mut self_hosted_meta = HashMap::new();
1750        self_hosted_meta.insert("self_hosted".to_string(), "true".to_string());
1751        let _pool = g.add_node_with_metadata(
1752            NodeKind::Image,
1753            "self-pool",
1754            TrustZone::FirstParty,
1755            self_hosted_meta,
1756        );
1757        let _hosted = g.add_node(NodeKind::Image, "ubuntu-latest", TrustZone::ThirdParty);
1758        let paths: Vec<PropagationPath> = Vec::new();
1759
1760        let yaml = r#"
1761id: r
1762name: r
1763severity: high
1764category: authority_propagation
1765match:
1766  standalone:
1767    node_type: image
1768    metadata:
1769      self_hosted:
1770        in: ["true", "yes"]
1771"#;
1772        let rule: CustomRule = serde_yaml::from_str(yaml).expect("yaml parses");
1773        let findings = evaluate_custom_rules(&g, &paths, &[rule]);
1774        assert_eq!(findings.len(), 1, "in:[\"true\",\"yes\"] matches one node");
1775    }
1776
1777    #[test]
1778    fn standalone_still_honors_graph_metadata_gate() {
1779        // Standalone bypasses source/sink/path but `graph_metadata:` remains
1780        // a precondition — that's how PR-context node-shape rules work.
1781        let mut g_pr = AuthorityGraph::new(source());
1782        g_pr.metadata.insert("trigger".into(), "pr".into());
1783        g_pr.add_node(NodeKind::Image, "alpine:latest", TrustZone::ThirdParty);
1784
1785        let mut g_push = AuthorityGraph::new(source());
1786        g_push.metadata.insert("trigger".into(), "push".into());
1787        g_push.add_node(NodeKind::Image, "alpine:latest", TrustZone::ThirdParty);
1788
1789        let yaml = r#"
1790id: r
1791name: r
1792severity: low
1793category: unpinned_action
1794match:
1795  graph_metadata:
1796    trigger:
1797      equals: pr
1798  standalone:
1799    node_type: image
1800"#;
1801        let rule: CustomRule = serde_yaml::from_str(yaml).expect("yaml parses");
1802        assert_eq!(
1803            evaluate_custom_rules(&g_pr, &[], std::slice::from_ref(&rule)).len(),
1804            1,
1805            "fires on PR graph"
1806        );
1807        assert!(
1808            evaluate_custom_rules(&g_push, &[], std::slice::from_ref(&rule)).is_empty(),
1809            "graph_metadata gate must suppress on push graph"
1810        );
1811    }
1812
1813    #[test]
1814    fn standalone_ignores_source_sink_path_fields() {
1815        // Even when source/sink would never match (no propagation paths exist),
1816        // standalone fires per node-shape match. Documents the precedence rule.
1817        let mut g = AuthorityGraph::new(source());
1818        let _img = g.add_node(NodeKind::Image, "alpine:latest", TrustZone::ThirdParty);
1819        let paths: Vec<PropagationPath> = Vec::new();
1820
1821        let yaml = r#"
1822id: r
1823name: r
1824severity: low
1825category: unpinned_action
1826match:
1827  source:
1828    node_type: secret    # would never match anything in this graph
1829  standalone:
1830    node_type: image
1831"#;
1832        let rule: CustomRule = serde_yaml::from_str(yaml).expect("yaml parses");
1833        let findings = evaluate_custom_rules(&g, &paths, &[rule]);
1834        assert_eq!(findings.len(), 1);
1835    }
1836
1837    // ── Gap A: multi-doc YAML loading ───────────────────────
1838
1839    #[test]
1840    fn multi_doc_yaml_loads_each_document_as_separate_rule() {
1841        let yaml = r#"
1842id: rule_a
1843name: First rule
1844severity: high
1845category: authority_propagation
1846match:
1847  source:
1848    node_type: secret
1849---
1850id: rule_b
1851name: Second rule
1852severity: critical
1853category: untrusted_with_authority
1854match:
1855  sink:
1856    trust_zone: untrusted
1857---
1858id: rule_c
1859name: Third rule
1860severity: medium
1861category: unpinned_action
1862"#;
1863        let rules = parse_rules_multi_doc(yaml).expect("multi-doc must parse");
1864        assert_eq!(rules.len(), 3, "expected 3 rules from 3-doc YAML");
1865        assert_eq!(rules[0].id, "rule_a");
1866        assert_eq!(rules[1].id, "rule_b");
1867        assert_eq!(rules[2].id, "rule_c");
1868        assert_eq!(rules[1].severity, Severity::Critical);
1869    }
1870
1871    #[test]
1872    fn single_doc_yaml_still_loads_identically() {
1873        let yaml = r#"
1874id: solo
1875name: Solo rule
1876severity: high
1877category: authority_propagation
1878"#;
1879        let rules = parse_rules_multi_doc(yaml).expect("single-doc must parse");
1880        assert_eq!(rules.len(), 1);
1881        assert_eq!(rules[0].id, "solo");
1882    }
1883
1884    #[test]
1885    fn multi_doc_with_empty_leading_document_is_skipped() {
1886        let yaml = r#"---
1887---
1888id: only
1889name: only
1890severity: low
1891category: authority_propagation
1892"#;
1893        let rules = parse_rules_multi_doc(yaml).expect("must parse");
1894        assert_eq!(rules.len(), 1);
1895        assert_eq!(rules[0].id, "only");
1896    }
1897
1898    #[test]
1899    fn load_rules_dir_loads_multi_doc_files() {
1900        let tmp =
1901            std::env::temp_dir().join(format!("taudit-custom-rules-multi-{}", std::process::id()));
1902        fs::create_dir_all(&tmp).unwrap();
1903        let path = tmp.join("bundle.yml");
1904        fs::write(
1905            &path,
1906            r#"
1907id: a
1908name: a
1909severity: high
1910category: authority_propagation
1911---
1912id: b
1913name: b
1914severity: medium
1915category: unpinned_action
1916---
1917id: c
1918name: c
1919severity: low
1920category: authority_propagation
1921"#,
1922        )
1923        .unwrap();
1924
1925        let rules = load_rules_dir(&tmp).expect("multi-doc file must load");
1926        assert_eq!(rules.len(), 3, "expected 3 rules from one bundled file");
1927
1928        let _ = fs::remove_dir_all(&tmp);
1929    }
1930
1931    // ── Provenance: every custom-rule finding carries source path ────────
1932
1933    #[test]
1934    fn loaded_rule_threads_source_file_into_findings() {
1935        let tmp = std::env::temp_dir().join(format!("taudit-custom-prov-{}", std::process::id()));
1936        fs::create_dir_all(&tmp).unwrap();
1937        let path = tmp.join("provenance.yml");
1938        fs::write(
1939            &path,
1940            r#"
1941id: from_disk
1942name: From disk
1943description: planted invariant
1944severity: critical
1945category: authority_propagation
1946match:
1947  source:
1948    trust_zone: first_party
1949  sink:
1950    trust_zone: untrusted
1951"#,
1952        )
1953        .unwrap();
1954
1955        let rules = load_rules_dir(&tmp).expect("rules load");
1956        assert_eq!(rules.len(), 1);
1957        // The loader stamps source_file on the rule itself.
1958        assert_eq!(rules[0].source_file.as_deref(), Some(path.as_path()));
1959
1960        let (graph, paths) = build_graph_with_paths();
1961        let findings = evaluate_custom_rules(&graph, &paths, &rules);
1962        assert_eq!(findings.len(), 1);
1963        match &findings[0].source {
1964            FindingSource::Custom { source_file } => {
1965                assert_eq!(
1966                    source_file, &path,
1967                    "custom finding must carry the YAML path it was loaded from"
1968                );
1969            }
1970            other => panic!("expected FindingSource::Custom, got {other:?}"),
1971        }
1972
1973        let _ = fs::remove_dir_all(&tmp);
1974    }
1975
1976    #[test]
1977    fn in_memory_custom_rule_emits_custom_source_with_empty_path() {
1978        // Rules constructed in-memory (tests, stdin pipelines) never go
1979        // through the loader and therefore have no source path — the finding
1980        // must still be tagged as Custom (not silently mistakable for built-in)
1981        // so any operator inspecting a SIEM alert immediately sees provenance.
1982        let (graph, paths) = build_graph_with_paths();
1983        let rule = CustomRule {
1984            id: "in_mem".into(),
1985            name: "in-memory".into(),
1986            description: String::new(),
1987            severity: Severity::High,
1988            category: FindingCategory::AuthorityPropagation,
1989            match_spec: MatchSpec::default(),
1990            source_file: None,
1991        };
1992        let findings = evaluate_custom_rules(&graph, &paths, &[rule]);
1993        assert!(!findings.is_empty(), "in-mem rule must still match");
1994        for f in &findings {
1995            match &f.source {
1996                FindingSource::Custom { source_file } => {
1997                    assert!(
1998                        source_file.as_os_str().is_empty(),
1999                        "in-mem custom rule emits Custom with empty path, not BuiltIn"
2000                    );
2001                }
2002                other => {
2003                    panic!("in-memory custom rule must still produce Custom source, got {other:?}")
2004                }
2005            }
2006        }
2007    }
2008
2009    #[test]
2010    fn unknown_metadata_operator_is_rejected() {
2011        let yaml = r#"
2012id: r
2013name: r
2014severity: high
2015category: authority_propagation
2016match:
2017  source:
2018    metadata:
2019      role:
2020        starts_with: adm
2021"#;
2022        let err = serde_yaml::from_str::<CustomRule>(yaml)
2023            .expect_err("unknown operator must be rejected");
2024        let msg = err.to_string();
2025        // serde_yaml's untagged-enum error doesn't always echo the unknown
2026        // field name; the important guarantee is that the parse fails (so
2027        // typos in operator names don't silently match nothing).
2028        assert!(
2029            msg.contains("metadata") || msg.contains("variant"),
2030            "parse should fail with a meaningful location: {msg}"
2031        );
2032    }
2033
2034    // ── Symlink protection (red-team R2 #4) ─────────────────
2035    //
2036    // These tests use Unix symlinks. Skipped on Windows where the test
2037    // harness usually lacks SeCreateSymbolicLinkPrivilege.
2038
2039    #[cfg(unix)]
2040    fn unique_tmp(prefix: &str) -> PathBuf {
2041        use std::sync::atomic::{AtomicU64, Ordering};
2042        static COUNTER: AtomicU64 = AtomicU64::new(0);
2043        let n = COUNTER.fetch_add(1, Ordering::SeqCst);
2044        std::env::temp_dir().join(format!(
2045            "taudit-symlink-{prefix}-{}-{n}",
2046            std::process::id()
2047        ))
2048    }
2049
2050    #[cfg(unix)]
2051    fn write_minimal_rule(path: &Path, id: &str) {
2052        fs::write(
2053            path,
2054            format!("id: {id}\nname: {id}\nseverity: high\ncategory: authority_propagation\n"),
2055        )
2056        .unwrap();
2057    }
2058
2059    #[test]
2060    #[cfg(unix)]
2061    fn load_rules_dir_follows_in_tree_symlink_with_warning() {
2062        use std::os::unix::fs::symlink;
2063
2064        let tmp = unique_tmp("intree");
2065        fs::create_dir_all(&tmp).unwrap();
2066
2067        let real = tmp.join("real.yml");
2068        write_minimal_rule(&real, "in_tree");
2069        let link = tmp.join("alias.yml");
2070        symlink(&real, &link).unwrap();
2071
2072        // Default opts: in-tree symlinks are followed BUT deduplicated by
2073        // canonical path so an alias and its target collapse to a single
2074        // rule load. This is the contract documented on
2075        // `load_rules_dir_with_opts`: "Files reached via multiple paths …
2076        // are deduplicated by canonical path so the same rule never fires
2077        // twice." Pre-v1.1 behaviour double-loaded.
2078        let rules = load_rules_dir(&tmp).expect("in-tree symlink must be loaded");
2079        assert_eq!(
2080            rules.len(),
2081            1,
2082            "expected 1 rule (alias deduped against real target), got {rules:?}"
2083        );
2084        assert_eq!(rules[0].id, "in_tree");
2085
2086        let _ = fs::remove_dir_all(&tmp);
2087    }
2088
2089    #[test]
2090    #[cfg(unix)]
2091    fn load_rules_dir_refuses_out_of_tree_symlink_by_default() {
2092        use std::os::unix::fs::symlink;
2093
2094        let tmp = unique_tmp("outoftree-refuse");
2095        fs::create_dir_all(&tmp).unwrap();
2096
2097        let outside_dir = unique_tmp("outoftree-target");
2098        fs::create_dir_all(&outside_dir).unwrap();
2099        let outside_file = outside_dir.join("evil.yml");
2100        write_minimal_rule(&outside_file, "evil");
2101
2102        let link = tmp.join("legit.yml");
2103        symlink(&outside_file, &link).unwrap();
2104
2105        let errs = load_rules_dir(&tmp).expect_err("out-of-tree symlink must be refused");
2106        assert_eq!(errs.len(), 1);
2107        assert!(
2108            matches!(errs[0], CustomRuleError::SymlinkOutsideDir { .. }),
2109            "expected SymlinkOutsideDir, got {:?}",
2110            errs[0]
2111        );
2112        let msg = errs[0].to_string();
2113        assert!(
2114            msg.contains("legit.yml") && msg.contains("evil.yml"),
2115            "error should name both link and target: {msg}"
2116        );
2117
2118        let _ = fs::remove_dir_all(&tmp);
2119        let _ = fs::remove_dir_all(&outside_dir);
2120    }
2121
2122    #[test]
2123    #[cfg(unix)]
2124    fn load_rules_dir_follows_out_of_tree_symlink_with_override() {
2125        use std::os::unix::fs::symlink;
2126
2127        let tmp = unique_tmp("outoftree-override");
2128        fs::create_dir_all(&tmp).unwrap();
2129
2130        let outside_dir = unique_tmp("outoftree-target-override");
2131        fs::create_dir_all(&outside_dir).unwrap();
2132        let outside_file = outside_dir.join("external.yml");
2133        write_minimal_rule(&outside_file, "external");
2134
2135        let link = tmp.join("aliased.yml");
2136        symlink(&outside_file, &link).unwrap();
2137
2138        let rules = load_rules_dir_with_opts(&tmp, true)
2139            .expect("override flag must allow external symlinks");
2140        assert_eq!(rules.len(), 1);
2141        assert_eq!(rules[0].id, "external");
2142
2143        let _ = fs::remove_dir_all(&tmp);
2144        let _ = fs::remove_dir_all(&outside_dir);
2145    }
2146
2147    // ── F4: recursive directory walk ────────────────────────
2148
2149    #[test]
2150    fn load_rules_dir_walks_subdirectories() {
2151        // Operators organise rules into platform-specific subtrees like
2152        // `invariants/gha/`, `invariants/ado/`. Pre-v1.1 the loader used a
2153        // single `read_dir` and silently skipped every subdir. The recursive
2154        // DFS must pick rules out of `<root>/sub/rule.yml`.
2155        let tmp = std::env::temp_dir().join(format!(
2156            "taudit-custom-rules-recursive-{}",
2157            std::process::id()
2158        ));
2159        let sub = tmp.join("sub");
2160        fs::create_dir_all(&sub).unwrap();
2161
2162        let nested = sub.join("rule.yml");
2163        fs::write(
2164            &nested,
2165            "id: nested\nname: nested\nseverity: high\ncategory: authority_propagation\n",
2166        )
2167        .unwrap();
2168
2169        let rules = load_rules_dir(&tmp).expect("recursive walk must load nested rule");
2170        assert_eq!(
2171            rules.len(),
2172            1,
2173            "expected 1 rule from nested dir, got {rules:?}"
2174        );
2175        assert_eq!(rules[0].id, "nested");
2176
2177        let _ = fs::remove_dir_all(&tmp);
2178    }
2179
2180    // ── F5: in-tree symlink dedup via canonical path ────────
2181
2182    #[test]
2183    #[cfg(unix)]
2184    fn load_rules_dir_dedupes_in_tree_symlink() {
2185        use std::os::unix::fs::symlink;
2186
2187        let tmp = unique_tmp("dedup");
2188        fs::create_dir_all(&tmp).unwrap();
2189
2190        let real = tmp.join("real.yml");
2191        write_minimal_rule(&real, "dedup_target");
2192        let alias = tmp.join("alias.yml");
2193        symlink(&real, &alias).unwrap();
2194
2195        // The alias canonicalises to the same path as `real.yml`. Loader
2196        // must collapse to ONE rule (and emit a warning to stderr — we
2197        // don't capture stderr in unit tests, but we assert the visible
2198        // contract of single-load).
2199        let rules = load_rules_dir(&tmp).expect("alias dedup must succeed");
2200        assert_eq!(rules.len(), 1, "expected 1 rule after dedup, got {rules:?}");
2201        assert_eq!(rules[0].id, "dedup_target");
2202
2203        let _ = fs::remove_dir_all(&tmp);
2204    }
2205
2206    // ── F6: id validation contract ──────────────────────────
2207
2208    #[test]
2209    fn custom_rule_id_validation_rejects_empty() {
2210        let yaml = r#"
2211id: ""
2212name: x
2213severity: high
2214category: authority_propagation
2215"#;
2216        let err = serde_yaml::from_str::<CustomRule>(yaml).expect_err("empty id must be rejected");
2217        let msg = err.to_string();
2218        assert!(
2219            msg.contains("non-empty"),
2220            "error must explain why empty fails: {msg}"
2221        );
2222    }
2223
2224    #[test]
2225    fn custom_rule_id_validation_rejects_brackets() {
2226        let yaml = r#"
2227id: "foo] [bar"
2228name: x
2229severity: high
2230category: authority_propagation
2231"#;
2232        let err =
2233            serde_yaml::from_str::<CustomRule>(yaml).expect_err("bracket in id must be rejected");
2234        let msg = err.to_string();
2235        assert!(
2236            msg.contains("foo] [bar") && msg.contains("invalid character"),
2237            "error must name the offending id and the invalid character: {msg}"
2238        );
2239        // Specifically calls out a bracket character (could be ']' or '[' —
2240        // the loop hits ']' first since it's at index 3 of `foo] [bar`).
2241        assert!(
2242            msg.contains("']'") || msg.contains("'['") || msg.contains("' '"),
2243            "error should quote the first offending character: {msg}"
2244        );
2245    }
2246
2247    #[test]
2248    fn custom_rule_id_validation_rejects_whitespace() {
2249        let yaml = r#"
2250id: "foo bar"
2251name: x
2252severity: high
2253category: authority_propagation
2254"#;
2255        let err = serde_yaml::from_str::<CustomRule>(yaml)
2256            .expect_err("whitespace in id must be rejected");
2257        let msg = err.to_string();
2258        assert!(
2259            msg.contains("foo bar") && msg.contains("invalid character"),
2260            "error must name the offending id and explain why: {msg}"
2261        );
2262    }
2263
2264    #[test]
2265    fn custom_rule_id_validation_accepts_snake_case() {
2266        let yaml = r#"
2267id: my_rule
2268name: snake-case rule
2269severity: high
2270category: authority_propagation
2271"#;
2272        let rule: CustomRule = serde_yaml::from_str(yaml).expect("snake_case id must be accepted");
2273        assert_eq!(rule.id, "my_rule");
2274    }
2275
2276    #[test]
2277    fn custom_rule_id_validation_accepts_kebab_case() {
2278        let yaml = r#"
2279id: my-rule
2280name: kebab-case rule
2281severity: high
2282category: authority_propagation
2283"#;
2284        let rule: CustomRule = serde_yaml::from_str(yaml).expect("kebab-case id must be accepted");
2285        assert_eq!(rule.id, "my-rule");
2286    }
2287
2288    #[test]
2289    fn custom_rule_id_validation_rejects_64_chars_plus_one() {
2290        let id = "a".repeat(65);
2291        let yaml = format!("id: {id}\nname: x\nseverity: high\ncategory: authority_propagation\n");
2292        let err =
2293            serde_yaml::from_str::<CustomRule>(&yaml).expect_err("65-char id must be rejected");
2294        let msg = err.to_string();
2295        assert!(
2296            msg.contains("64 characters"),
2297            "error must cite the 64-char cap: {msg}"
2298        );
2299
2300        // 64 chars exactly is fine.
2301        let id_ok = "a".repeat(64);
2302        let yaml_ok =
2303            format!("id: {id_ok}\nname: x\nseverity: high\ncategory: authority_propagation\n");
2304        let rule: CustomRule =
2305            serde_yaml::from_str(&yaml_ok).expect("64-char id must be accepted (boundary case)");
2306        assert_eq!(rule.id.len(), 64);
2307    }
2308
2309    #[test]
2310    fn custom_rule_id_validation_rejects_leading_digit() {
2311        // Defensive — not in the explicit spec, but documents the
2312        // first-character rule. Snake_case-friendly + matches the regex.
2313        let yaml = r#"
2314id: 1bad
2315name: x
2316severity: high
2317category: authority_propagation
2318"#;
2319        let err = serde_yaml::from_str::<CustomRule>(yaml)
2320            .expect_err("digit-leading id must be rejected");
2321        let msg = err.to_string();
2322        assert!(
2323            msg.contains("must start"),
2324            "error must explain the first-char rule: {msg}"
2325        );
2326    }
2327}