Skip to main content

nyx_scanner/labels/
mod.rs

1//! Per-language source, sanitizer, and sink rule registries.
2//!
3//! The central type is [`DataLabel`], which pairs a [`Cap`] bitflag set with
4//! a role (Source, Sanitizer, Sink). [`LabelRule`] maps AST text patterns to
5//! labels. [`classify`] and [`classify_all`] look up a callee name against
6//! the active language's rule table; [`classify_gated_sink`] handles
7//! argument-role-aware sinks where one argument controls whether the call is
8//! dangerous at all.
9//!
10//! Rules for each language live in per-language submodules (`rust`, `java`,
11//! `go`, `python`, `php`, `ruby`, `javascript`, `typescript`, `c`, `cpp`).
12//! The [`Cap`] bitflag type is defined here and shared with the taint engine.
13
14mod c;
15mod cpp;
16mod go;
17mod java;
18mod javascript;
19mod php;
20mod python;
21pub(crate) mod ruby;
22mod rust;
23mod typescript;
24
25use bitflags::bitflags;
26use once_cell::sync::Lazy;
27use phf::Map;
28use serde::{Deserialize, Serialize};
29use smallvec::SmallVec;
30use std::collections::HashMap;
31
32/// A single rule: if the AST text equals (or ends with) one of the `matchers`,
33/// the node gets `label`.
34#[derive(Debug, Clone, Copy)]
35pub struct LabelRule {
36    pub matchers: &'static [&'static str],
37    pub label: DataLabel,
38    pub case_sensitive: bool,
39}
40
/// "Every argument is a payload" marker.
///
/// [`classify_gated_sink`] returns this slice on its dynamic/unknown-activation
/// branch, where the gate fires conservatively and each positional argument
/// (not only the explicit `payload_args`) has to be treated as a potentially
/// tainted payload. Downstream node construction in `cfg.rs` recognises the
/// sentinel and expands it to `(0..arity)` with the real call arity.
///
/// `usize::MAX` was chosen on purpose: `args.get(usize::MAX)` can never hit
/// for a real argument list, so an accidental direct lookup does nothing
/// instead of silently aliasing position 0.
pub const ALL_ARGS_PAYLOAD: &[usize] = &[usize::MAX];
51
/// Strategy a gate uses to decide whether it activates.
///
/// Activation determines whether the callee counts as a sink at a particular
/// call site. [`GateActivation::ValueMatch`] looks at a literal/kwarg for
/// dangerous values, while [`GateActivation::Destination`] fires
/// unconditionally once taint reaches a declared destination-bearing position
/// or field.
#[derive(Clone, Copy, Debug)]
pub enum GateActivation {
    /// Legacy literal-value activation.
    ///
    /// Fires when the constant at `arg_index` (or at the keyword argument, if
    /// `keyword_name` / `dangerous_kwargs` is set) matches `dangerous_values`
    /// or `dangerous_prefixes`, and also, conservatively, when that value is
    /// dynamic or unknown.
    ///
    /// Intended for argument-role-aware sinks such as `setAttribute` (the
    /// activation argument picks the attribute being set) and
    /// `parseFromString` (the activation argument picks the MIME type).
    ValueMatch,
    /// Destination-bearing flow activation.
    ///
    /// Fires when taint reaches a declared destination location of the call.
    /// No literal inspection and no prefix heuristic are involved.
    ///
    /// * Positional destination (e.g. the first argument of `fetch` or
    ///   `axios.post`): leave `object_destination_fields` as `&[]`. The whole
    ///   positional argument at each index in the gate's `payload_args` is
    ///   then the destination.
    /// * Config/options-object destination (`axios({url,baseURL,...})`,
    ///   `http.request({host,path,port})`, `got({url,prefixUrl,...})`,
    ///   `undici.request({origin,path,...})`): list the destination-bearing
    ///   field names. If the positional argument is an object literal at the
    ///   call, sink taint checks only consider identifiers under those
    ///   fields; other fields (`body`, `data`, `json`, `headers`, ...) are
    ///   silenced.
    /// * Positional argument that is not an object literal (plain string,
    ///   identifier, expression): the whole argument is the destination, the
    ///   same as the empty-field case. That lets one gate cover both
    ///   `http.request(urlString, cb)` and `http.request({host,path}, cb)`.
    Destination {
        /// Names of the object-literal fields that designate the destination.
        object_destination_fields: &'static [&'static str],
    },
}
94
95/// Argument-sensitive sink activation.  Whether a call becomes a sink is
96/// determined by the gate's [`GateActivation`] mode, literal-value matching
97/// for traditional role-selector APIs, or destination-flow activation for
98/// outbound HTTP clients and other APIs where a specific location in the
99/// call carries the attacker-controlled destination.
100///
101/// `payload_args` specifies which argument positions carry the tainted payload.
102/// When non-empty, only variables from those argument positions are checked for
103/// taint at the sink.  When empty, all arguments are considered payloads
104/// (backward-compatible default for `ValueMatch`).
105#[derive(Debug, Clone, Copy)]
106pub struct SinkGate {
107    pub callee_matcher: &'static str,
108    pub arg_index: usize,
109    pub dangerous_values: &'static [&'static str],
110    pub dangerous_prefixes: &'static [&'static str],
111    pub label: DataLabel,
112    pub case_sensitive: bool,
113    pub payload_args: &'static [usize],
114    /// Optional keyword argument name for languages that support keyword args
115    /// (e.g. Python `shell=True` in `subprocess.Popen`).  When set, the
116    /// activation value is extracted from the named keyword argument instead
117    /// of the positional argument at `arg_index`.
118    pub keyword_name: Option<&'static str>,
119    /// Multi-keyword activation rules.  Each entry is `(kwarg_name, values)`
120    /// where any listed value makes the call dangerous.  Gate semantics when
121    /// non-empty:
122    ///   * A listed kwarg with a matching literal value → activate.
123    ///   * A listed kwarg present with a non-literal (dynamic) value →
124    ///     activate conservatively.
125    ///   * A listed kwarg present but with an explicitly safe literal → does
126    ///     not by itself activate.
127    ///   * No listed kwarg present → does not activate (matches the language
128    ///     default, e.g. Python `shell=False` implicit for `subprocess.run`).
129    ///
130    /// When both `keyword_name` and `dangerous_kwargs` are set, `keyword_name`
131    /// wins (back-compat for existing single-kwarg gates).  `&[]` is the
132    /// default and disables this branch.
133    pub dangerous_kwargs: &'static [(&'static str, &'static [&'static str])],
134    /// Activation mode.  [`GateActivation::ValueMatch`] is the legacy default;
135    /// [`GateActivation::Destination`] is used for destination-flow modeling
136    /// (outbound HTTP clients etc.).
137    pub activation: GateActivation,
138}
139
140bitflags! {
141    /// Security capability bits for sources, sanitizers, and sinks.
142    ///
143    /// Each bit represents a security-relevant property. The meaning depends on
144    /// which role the [`Cap`] value is attached to:
145    ///
146    /// - **Source**: which attack classes this tainted value can potentially
147    ///   trigger. Sources usually carry [`Cap::all()`] so they match any sink.
148    ///   [`ENV_VAR`](Cap::ENV_VAR) is an exception — it marks origin rather
149    ///   than reach.
150    /// - **Sanitizer**: which attack classes this function strips. A sanitizer
151    ///   labelled with [`HTML_ESCAPE`](Cap::HTML_ESCAPE) clears the XSS-relevant
152    ///   bits from tainted values that flow through it.
153    /// - **Sink**: which capability bits must be present on the incoming tainted
154    ///   value for a finding to fire. A SQL sink requires [`SQL_QUERY`](Cap::SQL_QUERY).
155    ///
156    /// In practice: a finding fires when a tainted value reaches a sink and
157    /// `(value_caps & sink_caps) != 0`.
158    #[derive(Debug, Clone, Copy, PartialEq, Eq)]
159    pub struct Cap: u16 {
160        /// Taint that originated from an environment variable read.
161        /// Used as a source-origin marker for env-injection rules.
162        const ENV_VAR         = 0b0000_0000_0000_0001;  // bit 0
163        /// Sanitizer: the value has passed through HTML entity escaping.
164        /// Strips XSS risk from values that reach HTML output sinks.
165        const HTML_ESCAPE     = 0b0000_0000_0000_0010;  // bit 1
166        /// Sanitizer: the value has been shell-argument escaped.
167        /// Strips command-injection risk before shell sinks.
168        const SHELL_ESCAPE    = 0b0000_0000_0000_0100;  // bit 2
169        /// Sanitizer: the value has been percent-encoded for use in a URL.
170        const URL_ENCODE      = 0b0000_0000_0000_1000;  // bit 3
171        /// Sanitizer: the value was parsed through a structured JSON decoder
172        /// (as opposed to `eval`-based or regex parsing).
173        const JSON_PARSE      = 0b0000_0000_0001_0000;  // bit 4
174        /// Sink: file system read or write operation (path traversal, arbitrary
175        /// file read/write).
176        const FILE_IO         = 0b0000_0000_0010_0000;  // bit 5
177        /// Sink: format string injection (e.g. `printf`-family, `String.format`).
178        const FMT_STRING      = 0b0000_0000_0100_0000;  // bit 6
179        /// Sink: SQL query construction. Fires for string-concatenated queries
180        /// and parameterized-query builders where the query text itself is tainted.
181        const SQL_QUERY       = 0b0000_0000_1000_0000;  // bit 7
182        /// Sink: unsafe object deserialization (Java `ObjectInputStream`,
183        /// Python `pickle`, Ruby `Marshal`, PHP `unserialize`, etc.).
184        const DESERIALIZE     = 0b0000_0001_0000_0000;  // bit 8
185        /// Sink: server-side request forgery. Fires when attacker-controlled
186        /// data reaches the destination URL of an outbound HTTP request.
187        const SSRF            = 0b0000_0010_0000_0000;  // bit 9
188        /// Sink: code or command execution (shell injection, `eval`, `exec`,
189        /// dynamic `require`/`import`, template injection).
190        const CODE_EXEC       = 0b0000_0100_0000_0000;  // bit 10
191        /// Sink: cryptographic operation with a tainted algorithm name or seed
192        /// (weak-crypto / predictable-randomness patterns).
193        const CRYPTO          = 0b0000_1000_0000_0000;  // bit 11
194        /// Request-bound, caller-supplied identifier that has not yet been
195        /// validated against an ownership/membership check.  Used as the
196        /// carrier cap for folding `auth_analysis` into the SSA/taint
197        /// engine.
198        const UNAUTHORIZED_ID = 0b0001_0000_0000_0000;  // bit 12
199        /// Cross-boundary data-exfiltration: tainted sensitive data flowing
200        /// into outbound request bodies, headers, or other payload-bearing
201        /// fields of network egress APIs.  Distinct from `SSRF` (attacker
202        /// control over the destination URL), `DATA_EXFIL` fires when the
203        /// destination is fixed but attacker-influenced data leaves the
204        /// process via the request payload.
205        const DATA_EXFIL      = 0b0010_0000_0000_0000;  // bit 13
206    }
207}
208
209impl Default for Cap {
210    fn default() -> Self {
211        Cap::empty()
212    }
213}
214
215impl serde::Serialize for Cap {
216    fn serialize<S: serde::Serializer>(&self, s: S) -> Result<S::Ok, S::Error> {
217        s.serialize_u16(self.bits())
218    }
219}
220
221impl<'de> serde::Deserialize<'de> for Cap {
222    fn deserialize<D: serde::Deserializer<'de>>(d: D) -> Result<Self, D::Error> {
223        let bits = u16::deserialize(d)?;
224        Ok(Cap::from_bits_truncate(bits))
225    }
226}
227
/// Coarse structural category of a syntax node.
///
/// Per-language tables map raw node-kind strings to a `Kind`; [`lookup`]
/// falls back to [`Kind::Other`] when a string has no entry.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Kind {
    If,
    InfiniteLoop,
    While,
    For,
    CallFn,
    CallMethod,
    CallMacro,
    Break,
    Continue,
    Return,
    Block,
    SourceFile,
    Function,
    Assignment,
    CallWrapper,
    Try,
    Throw,
    /// Multi-way dispatch (switch/match).  A discriminant is evaluated and
    /// control is routed to one of several case bodies; a case without a
    /// terminating jump falls through to the next one where the surface
    /// language permits it.  The CFG builder makes the dispatch header a
    /// predecessor of every case body, so reachability never depends on the
    /// execution order of sibling cases.
    Switch,
    Trivia,
    /// Simple sequential expression (e.g. a cast or type assertion).  The
    /// CFG handles it like any other sequential statement; the dedicated
    /// variant exists so code inspecting `Kind` can tell it apart.
    Seq,
    /// Fallback for anything without a more specific category.
    Other,
}
260
261#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
262pub enum DataLabel {
263    Source(Cap),
264    Sanitizer(Cap),
265    Sink(Cap),
266}
267
/// Configuration for extracting parameter names from function AST nodes.
///
/// Every field is a `'static` reference, so the type is trivially `Copy`;
/// `Debug` is derived so configs can be printed in diagnostics and test
/// failures.
#[derive(Debug, Clone, Copy)]
pub struct ParamConfig {
    /// Field name on the function node that holds the parameter list
    /// (e.g. "parameters", "formal_parameters").
    pub params_field: &'static str,
    /// Tree-sitter node kinds that represent individual parameters.
    pub param_node_kinds: &'static [&'static str],
    /// Node kinds representing self/this parameters (e.g. "self_parameter" in Rust).
    pub self_param_kinds: &'static [&'static str],
    /// Field names tried in order to extract the identifier from a parameter node.
    pub ident_fields: &'static [&'static str],
}
280
281static DEFAULT_PARAM_CONFIG: ParamConfig = ParamConfig {
282    params_field: "parameters",
283    param_node_kinds: &["parameter", "identifier"],
284    self_param_kinds: &[],
285    ident_fields: &["name", "pattern"],
286};
287
/// Describes taint propagation from input arguments to output arguments
/// for known C/C++ functions (e.g., inet_pton copies network address from arg 1 to arg 2).
///
/// All fields are `'static` references, so the type is `Copy`; `Debug` is
/// derived so rules can be printed in diagnostics and test failures.
#[derive(Debug, Clone, Copy)]
pub struct ArgPropagation {
    /// Function name the rule applies to.  [`arg_propagation`] compares it
    /// ASCII-case-insensitively against the last `::` / `.` segment of the
    /// callee text.
    pub callee: &'static str,
    /// Argument positions taint is read from.
    /// NOTE(review): index base is not visible in this block; confirm against the rule tables.
    pub from_args: &'static [usize],
    /// Argument positions that receive the taint.
    pub to_args: &'static [usize],
}
295
296/// Look up output-parameter positions for Source-labeled C/C++ functions.
297/// Returns argument indices that receive taint alongside the return value.
298pub fn output_param_source_positions(lang: &str, callee: &str) -> Option<&'static [usize]> {
299    let registry: &[(&str, &[usize])] = match lang {
300        "c" => c::OUTPUT_PARAM_SOURCES,
301        "cpp" => cpp::OUTPUT_PARAM_SOURCES,
302        _ => return None,
303    };
304    let normalized = callee
305        .rsplit("::")
306        .next()
307        .unwrap_or(callee)
308        .rsplit('.')
309        .next()
310        .unwrap_or(callee);
311    registry
312        .iter()
313        .find(|(name, _)| name.eq_ignore_ascii_case(normalized))
314        .map(|(_, positions)| *positions)
315}
316
317/// Look up arg-to-arg propagation rules for known C/C++ functions.
318pub fn arg_propagation(lang: &str, callee: &str) -> Option<&'static ArgPropagation> {
319    let registry: &[ArgPropagation] = match lang {
320        "c" => c::ARG_PROPAGATIONS,
321        "cpp" => cpp::ARG_PROPAGATIONS,
322        _ => return None,
323    };
324    let normalized = callee
325        .rsplit("::")
326        .next()
327        .unwrap_or(callee)
328        .rsplit('.')
329        .next()
330        .unwrap_or(callee);
331    registry
332        .iter()
333        .find(|p| p.callee.eq_ignore_ascii_case(normalized))
334}
335
336static REGISTRY: Lazy<HashMap<&'static str, &'static [LabelRule]>> = Lazy::new(|| {
337    let mut m = HashMap::new();
338    m.insert("rust", rust::RULES);
339    m.insert("rs", rust::RULES);
340
341    m.insert("javascript", javascript::RULES);
342    m.insert("js", javascript::RULES);
343
344    m.insert("typescript", typescript::RULES);
345    m.insert("ts", typescript::RULES);
346
347    m.insert("python", python::RULES);
348    m.insert("py", python::RULES);
349
350    m.insert("go", go::RULES);
351
352    m.insert("java", java::RULES);
353
354    m.insert("c", c::RULES);
355
356    m.insert("cpp", cpp::RULES);
357    m.insert("c++", cpp::RULES);
358
359    m.insert("php", php::RULES);
360
361    m.insert("ruby", ruby::RULES);
362    m.insert("rb", ruby::RULES);
363
364    m
365});
366
367static GATED_REGISTRY: Lazy<HashMap<&'static str, &'static [SinkGate]>> = Lazy::new(|| {
368    let mut m = HashMap::new();
369    m.insert("javascript", javascript::GATED_SINKS);
370    m.insert("js", javascript::GATED_SINKS);
371    m.insert("typescript", typescript::GATED_SINKS);
372    m.insert("ts", typescript::GATED_SINKS);
373    m.insert("python", python::GATED_SINKS);
374    m.insert("py", python::GATED_SINKS);
375    m.insert("go", go::GATED_SINKS);
376    m.insert("php", php::GATED_SINKS);
377    m.insert("c", c::GATED_SINKS);
378    m.insert("cpp", cpp::GATED_SINKS);
379    m.insert("c++", cpp::GATED_SINKS);
380    m
381});
382
383/// Per-language exclusion patterns: callee text that must never be classified.
384static EXCLUDES: Lazy<HashMap<&'static str, &'static [&'static str]>> = Lazy::new(|| {
385    let mut m = HashMap::new();
386    m.insert("javascript", javascript::EXCLUDES);
387    m.insert("js", javascript::EXCLUDES);
388    m.insert("typescript", typescript::EXCLUDES);
389    m.insert("ts", typescript::EXCLUDES);
390    m
391});
392
393/// Check whether `text` matches a per-language exclusion pattern.
394pub(crate) fn is_excluded(lang: &str, trimmed: &[u8]) -> bool {
395    let excludes = match EXCLUDES.get(lang).or_else(|| {
396        let key = lang.to_ascii_lowercase();
397        EXCLUDES.get(key.as_str())
398    }) {
399        Some(e) => *e,
400        None => return false,
401    };
402    for &pat in excludes {
403        if match_suffix_cs(trimmed, pat.as_bytes(), false) {
404            return true;
405        }
406    }
407    false
408}
409
410type FastMap = &'static Map<&'static str, Kind>;
411
412pub(crate) static CLASSIFIERS: Lazy<HashMap<&'static str, FastMap>> = Lazy::new(|| {
413    let mut m = HashMap::new();
414    m.insert("rust", &rust::KINDS);
415    m.insert("rs", &rust::KINDS);
416
417    m.insert("javascript", &javascript::KINDS);
418    m.insert("js", &javascript::KINDS);
419
420    m.insert("typescript", &typescript::KINDS);
421    m.insert("ts", &typescript::KINDS);
422
423    m.insert("python", &python::KINDS);
424    m.insert("py", &python::KINDS);
425
426    m.insert("go", &go::KINDS);
427
428    m.insert("java", &java::KINDS);
429
430    m.insert("c", &c::KINDS);
431
432    m.insert("cpp", &cpp::KINDS);
433    m.insert("c++", &cpp::KINDS);
434
435    m.insert("php", &php::KINDS);
436
437    m.insert("ruby", &ruby::KINDS);
438    m.insert("rb", &ruby::KINDS);
439
440    m
441});
442
443static PARAM_CONFIGS: Lazy<HashMap<&'static str, &'static ParamConfig>> = Lazy::new(|| {
444    let mut m = HashMap::new();
445    m.insert("rust", &rust::PARAM_CONFIG);
446    m.insert("rs", &rust::PARAM_CONFIG);
447
448    m.insert("javascript", &javascript::PARAM_CONFIG);
449    m.insert("js", &javascript::PARAM_CONFIG);
450
451    m.insert("typescript", &typescript::PARAM_CONFIG);
452    m.insert("ts", &typescript::PARAM_CONFIG);
453
454    m.insert("python", &python::PARAM_CONFIG);
455    m.insert("py", &python::PARAM_CONFIG);
456
457    m.insert("go", &go::PARAM_CONFIG);
458
459    m.insert("java", &java::PARAM_CONFIG);
460
461    m.insert("c", &c::PARAM_CONFIG);
462
463    m.insert("cpp", &cpp::PARAM_CONFIG);
464    m.insert("c++", &cpp::PARAM_CONFIG);
465
466    m.insert("php", &php::PARAM_CONFIG);
467
468    m.insert("ruby", &ruby::PARAM_CONFIG);
469    m.insert("rb", &ruby::PARAM_CONFIG);
470
471    m
472});
473
474/// Return the parameter extraction config for the given language, with a sensible default.
475pub fn param_config(lang: &str) -> &'static ParamConfig {
476    PARAM_CONFIGS
477        .get(lang)
478        .copied()
479        .unwrap_or(&DEFAULT_PARAM_CONFIG)
480}
481
/// Curated lowercase parameter names that, on a JS/TS function, strongly
/// hint that the binding holds attacker-controlled input (handler dispatch
/// functions, controller methods, command wrappers).  When the taint engine
/// enters a function with such a formal parameter and no caller taint was
/// supplied, it auto-seeds the parameter as a `UserInput` source so that
/// sinks downstream of it still fire.
const JS_TS_HANDLER_PARAM_NAMES: &[&str] = &["userinput", "userid", "payload", "cmd", "input"];

/// Decide whether a JS/TS formal parameter name strongly implies user input.
///
/// Two rules apply, both ASCII-case-insensitive on the letters involved:
///
/// 1. the name equals an entry of the curated list, or
/// 2. the name starts with `user` and the very next character is an
///    uppercase letter (camelCase) or an underscore (snake_case).
///
/// Rule 2 picks up names like `userCmd`, `userPath`, `userData` and
/// `user_input` without matching generic words that merely contain "user".
/// Empty and non-ASCII names never match.
pub fn is_js_ts_handler_param_name(name: &str) -> bool {
    if name.is_empty() || !name.is_ascii() {
        return false;
    }

    let on_curated_list = JS_TS_HANDLER_PARAM_NAMES
        .iter()
        .any(|known| name.eq_ignore_ascii_case(known));
    if on_curated_list {
        return true;
    }

    // `user` + one distinguishing character: a bare `user` has no fifth byte
    // and therefore falls through to `false`.
    let bytes = name.as_bytes();
    match (bytes.get(..4), bytes.get(4)) {
        (Some(prefix), Some(&next)) => {
            prefix.eq_ignore_ascii_case(b"user") && (next == b'_' || next.is_ascii_uppercase())
        }
        _ => false,
    }
}
518
519#[inline(always)]
520pub fn lookup(lang: &str, raw: &str) -> Kind {
521    CLASSIFIERS
522        .get(lang)
523        .and_then(|m| m.get(raw).copied())
524        .unwrap_or(Kind::Other)
525}
526
527/// The kind of taint source, used to refine finding severity.
528#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
529#[serde(rename_all = "snake_case")]
530pub enum SourceKind {
531    /// Direct user input (request params, argv, stdin, form data)
532    UserInput,
533    /// HTTP cookie value (carries session / auth material)
534    Cookie,
535    /// HTTP request header (may carry auth tokens, user-agent fingerprints)
536    Header,
537    /// Environment variables and configuration
538    EnvironmentConfig,
539    /// File system reads
540    FileSystem,
541    /// Database query results
542    Database,
543    /// Caught exception, may carry user-controlled data
544    CaughtException,
545    /// Could not determine, treat conservatively
546    Unknown,
547}
548
/// How sensitive the data produced by a taint source is.
///
/// Detector classes such as `DATA_EXFIL` only fire when the source carries
/// information the operator did not intend to leak.  Plain user input that
/// is echoed into an outbound request is not exfiltration: the user already
/// controls that data, so reporting it as a leak would be noise.
///
/// `DATA_EXFIL` uses the threshold `>= Sensitive`, which suppresses plain
/// user input.  A project that legitimately regards a request body as
/// sensitive (e.g. an API gateway forwarding pre-authenticated user tokens
/// taken from a request body) can override this with custom rules in
/// `nyx.conf`, either by re-classifying the source or by adding a Sanitizer
/// rule for `Cap::DATA_EXFIL` on the legitimate forwarding path.
///
/// Variants are declared from least to most sensitive; the derived ordering
/// relies on that declaration order.
#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub enum Sensitivity {
    /// Attacker-controlled but not secret in itself: request bodies, query
    /// strings, form fields, argv.  Echoing it to an outbound request is not
    /// data exfiltration.
    Plain,
    /// Operator state that must not leak to the user: cookies, auth headers,
    /// env, file system reads, database rows.
    Sensitive,
    /// Reserved for future explicit secret classifications (API keys,
    /// credential stores, key material).  Nothing produces it yet, but the
    /// threshold check in `effective_sink_caps` already treats it
    /// monotonically.
    Secret,
}
576
577impl SourceKind {
578    /// Return the sensitivity tier this source kind belongs to.  Drives the
579    /// `Cap::DATA_EXFIL` cap-suppression decision in `ast.rs`.
580    pub fn sensitivity(self) -> Sensitivity {
581        match self {
582            // Plain user-controlled input, the user already has the data,
583            // surfacing it back to them via an outbound request is not a
584            // disclosure.
585            SourceKind::UserInput => Sensitivity::Plain,
586            // Operator-bound state, leaking these via an outbound request
587            // is a real cross-boundary disclosure.
588            SourceKind::Cookie
589            | SourceKind::Header
590            | SourceKind::EnvironmentConfig
591            | SourceKind::FileSystem
592            | SourceKind::Database => Sensitivity::Sensitive,
593            // Caught exceptions can carry stack traces, db errors, internal
594            // paths, treat them as sensitive by default.
595            SourceKind::CaughtException => Sensitivity::Sensitive,
596            // Conservative default for unclassified sources, surface
597            // findings rather than silently drop them.
598            SourceKind::Unknown => Sensitivity::Sensitive,
599        }
600    }
601}
602
603/// Infer the source kind from capabilities and callee name.
604pub fn infer_source_kind(caps: Cap, callee: &str) -> SourceKind {
605    let cl = callee.to_ascii_lowercase();
606
607    // Cookie / Header are checked *before* the generic user-input bucket
608    // because they imply higher sensitivity (auth material, session ids).
609    // The generic UserInput substrings (`request`, `header`, `cookie`)
610    // would otherwise swallow these.
611    //
612    // Session stores carry auth material (CSRF tokens, signed user ids) of
613    // the same sensitivity tier as raw cookies, so route them through the
614    // `Cookie` arm.  The substring is checked AFTER excluding the
615    // capitalised `Session` constructor (covered by the `request` /
616    // `requests` checks below not firing for `Session` builders).
617    if cl.contains("cookie") || cl.contains("session") {
618        return SourceKind::Cookie;
619    }
620    if cl.contains("header") {
621        return SourceKind::Header;
622    }
623
624    // User input patterns
625    if cl.contains("argv")
626        || cl.contains("stdin")
627        || cl.contains("request")
628        || cl.contains("form")
629        || cl.contains("query")
630        || cl.contains("params")
631        || cl.contains("param")
632        || cl.contains("input")
633        || cl.contains("body")
634        || cl.contains("location")
635        || cl.contains("document.url")
636        || cl.contains("document.referrer")
637        // PHP superglobals: the AST text preserves the `$` (member-text
638        // extraction reads the `variable_name` node verbatim) so we match
639        // both `$_POST` and the `_POST` form some collectors emit.
640        // `$_REQUEST` already matches via the `request` substring above;
641        // `$_COOKIE` / `$_SESSION` route through the Cookie tier earlier in
642        // the function.  `$_SERVER` is operator-state-bearing (auth headers
643        // etc.) so it stays Sensitive by falling through to the Unknown
644        // bucket.
645        || cl == "$_get"
646        || cl == "$_post"
647        || cl == "$_files"
648        || cl == "_get"
649        || cl == "_post"
650        || cl == "_files"
651    {
652        return SourceKind::UserInput;
653    }
654
655    // Environment / config patterns
656    if cl.contains("env")
657        || cl.contains("getenv")
658        || cl.contains("environ")
659        || cl.contains("config")
660    {
661        return SourceKind::EnvironmentConfig;
662    }
663
664    // File system patterns
665    if cl.contains("read") || cl.contains("fopen") || cl.contains("open") {
666        // Distinguish from db reads, file reads typically have FILE_IO cap
667        if caps.contains(Cap::FILE_IO) {
668            return SourceKind::FileSystem;
669        }
670    }
671
672    // Database patterns
673    if cl.contains("fetchone")
674        || cl.contains("fetchall")
675        || cl.contains("fetch_row")
676        || cl.contains("query")
677        || cl.contains("execute")
678    {
679        // Queries that read back from db
680        return SourceKind::Database;
681    }
682
683    SourceKind::Unknown
684}
685
686/// Map a source kind to its appropriate severity level.
687pub fn severity_for_source_kind(kind: SourceKind) -> crate::patterns::Severity {
688    match kind {
689        SourceKind::UserInput => crate::patterns::Severity::High,
690        SourceKind::Cookie => crate::patterns::Severity::High,
691        SourceKind::Header => crate::patterns::Severity::High,
692        SourceKind::EnvironmentConfig => crate::patterns::Severity::High,
693        SourceKind::FileSystem => crate::patterns::Severity::Medium,
694        SourceKind::Database => crate::patterns::Severity::Medium,
695        SourceKind::CaughtException => crate::patterns::Severity::Medium,
696        SourceKind::Unknown => crate::patterns::Severity::High,
697    }
698}
699
700/// A runtime (config-derived) label rule with owned matchers.
701#[derive(Debug, Clone)]
702pub struct RuntimeLabelRule {
703    pub matchers: Vec<String>,
704    pub label: DataLabel,
705    pub case_sensitive: bool,
706}
707
708/// Parse a capability name string into a `Cap` bitflag.
709///
710/// Prefer `CapName` enum for config values; this remains for ad-hoc string parsing.
711#[allow(dead_code)]
712pub fn parse_cap(s: &str) -> Option<Cap> {
713    match s.to_ascii_lowercase().as_str() {
714        "env_var" => Some(Cap::ENV_VAR),
715        "html_escape" => Some(Cap::HTML_ESCAPE),
716        "shell_escape" => Some(Cap::SHELL_ESCAPE),
717        "url_encode" => Some(Cap::URL_ENCODE),
718        "json_parse" => Some(Cap::JSON_PARSE),
719        "file_io" => Some(Cap::FILE_IO),
720        "fmt_string" => Some(Cap::FMT_STRING),
721        "sql_query" => Some(Cap::SQL_QUERY),
722        "deserialize" => Some(Cap::DESERIALIZE),
723        "ssrf" => Some(Cap::SSRF),
724        "code_exec" => Some(Cap::CODE_EXEC),
725        "crypto" => Some(Cap::CRYPTO),
726        "unauthorized_id" => Some(Cap::UNAUTHORIZED_ID),
727        "data_exfil" | "data_exfiltration" => Some(Cap::DATA_EXFIL),
728        "all" => Some(Cap::all()),
729        _ => None,
730    }
731}
732
733/// Pre-built analysis rules for a specific language, derived from config.
734/// Built once per file and threaded through the pipeline.
735#[derive(Debug, Clone, Default)]
736pub struct LangAnalysisRules {
737    pub extra_labels: Vec<RuntimeLabelRule>,
738    pub terminators: Vec<String>,
739    pub event_handlers: Vec<String>,
740    pub frameworks: Vec<crate::utils::project::DetectedFramework>,
741}
742
743/// Build `LangAnalysisRules` from a `Config` for a given language slug.
744pub fn build_lang_rules(
745    config: &crate::utils::config::Config,
746    lang_slug: &str,
747) -> LangAnalysisRules {
748    let mut extra_labels: Vec<RuntimeLabelRule> = Vec::new();
749    let mut terminators = Vec::new();
750    let mut event_handlers = Vec::new();
751
752    if let Some(lang_cfg) = config.analysis.languages.get(lang_slug) {
753        extra_labels.extend(lang_cfg.rules.iter().map(|r| {
754            use crate::utils::config::RuleKind;
755            let cap = r.cap.to_cap();
756            let label = match r.kind {
757                RuleKind::Source => DataLabel::Source(cap),
758                RuleKind::Sanitizer => DataLabel::Sanitizer(cap),
759                RuleKind::Sink => DataLabel::Sink(cap),
760            };
761            RuntimeLabelRule {
762                matchers: r.matchers.clone(),
763                label,
764                case_sensitive: r.case_sensitive,
765            }
766        }));
767        terminators = lang_cfg.terminators.clone();
768        event_handlers = lang_cfg.event_handlers.clone();
769    }
770
771    // Append framework-conditional rules when frameworks are detected.
772    let frameworks = if let Some(ref fw_ctx) = config.framework_ctx {
773        extra_labels.extend(framework_rules_for_lang(lang_slug, fw_ctx));
774        fw_ctx.frameworks.clone()
775    } else {
776        Vec::new()
777    };
778
779    // fold `auth_analysis` into the taint engine by injecting
780    // `Cap::UNAUTHORIZED_ID` sink/sanitizer rules.  Gated by config; default
781    // OFF so the standalone `auth_analysis` subsystem remains authoritative.
782    if config.scanner.enable_auth_as_taint {
783        extra_labels.extend(phase_c_auth_rules_for_lang(lang_slug));
784    }
785
786    LangAnalysisRules {
787        extra_labels,
788        terminators,
789        event_handlers,
790        frameworks,
791    }
792}
793
794/// Return the auth-as-taint rules for a given language (Rust-only).
795fn phase_c_auth_rules_for_lang(lang_slug: &str) -> Vec<RuntimeLabelRule> {
796    match lang_slug {
797        "rust" | "rs" => rust::phase_c_auth_rules(),
798        _ => Vec::new(),
799    }
800}
801
/// Public wrapper around [`framework_rules_for_lang`]; forwards both
/// arguments unchanged and returns its result.
///
/// Used by `ParsedFile::from_source` to augment per-file rule sets when
/// imports reveal frameworks that the manifest-level detector missed.
pub fn framework_rules_for_lang_pub(
    lang_slug: &str,
    ctx: &crate::utils::project::FrameworkContext,
) -> Vec<RuntimeLabelRule> {
    framework_rules_for_lang(lang_slug, ctx)
}
811
812/// Return framework-conditional label rules for a given language.
813fn framework_rules_for_lang(
814    lang_slug: &str,
815    ctx: &crate::utils::project::FrameworkContext,
816) -> Vec<RuntimeLabelRule> {
817    match lang_slug {
818        "go" => go::framework_rules(ctx),
819        "ruby" | "rb" => ruby::framework_rules(ctx),
820        "java" => java::framework_rules(ctx),
821        "php" => php::framework_rules(ctx),
822        "python" | "py" => python::framework_rules(ctx),
823        "rust" | "rs" => rust::framework_rules(ctx),
824        "javascript" | "js" => javascript::framework_rules(ctx),
825        "typescript" | "ts" => typescript::framework_rules(ctx),
826        _ => Vec::new(),
827    }
828}
829
/// Report whether `haystack` ends with `needle`.
///
/// With `case_sensitive == false` the comparison ignores ASCII case only;
/// non-ASCII bytes must match exactly.  An empty `needle` always matches.
#[inline]
fn ends_with_cs(haystack: &[u8], needle: &[u8], case_sensitive: bool) -> bool {
    match haystack.len().checked_sub(needle.len()) {
        // `needle` is longer than `haystack`.
        None => false,
        Some(start) => {
            let tail = &haystack[start..];
            if case_sensitive {
                tail == needle
            } else {
                tail.eq_ignore_ascii_case(needle)
            }
        }
    }
}
846
847/// Prefix check with configurable case sensitivity.  The `=` exact-match
848/// sigil is meaningless for prefix matchers (which by definition match many
849/// suffixes); it is stripped if present so a malformed matcher like
850/// `=foo_` still behaves predictably.
851#[inline]
852fn starts_with_cs(haystack: &[u8], needle: &[u8], case_sensitive: bool) -> bool {
853    let (needle, _) = unpack_matcher(needle);
854    if needle.len() > haystack.len() {
855        return false;
856    }
857    if case_sensitive {
858        haystack[..needle.len()] == *needle
859    } else {
860        haystack[..needle.len()]
861            .iter()
862            .zip(needle)
863            .all(|(h, n)| h.eq_ignore_ascii_case(n))
864    }
865}
866
867/// Word-boundary suffix match with configurable case sensitivity.
868#[inline]
869fn match_suffix_cs(text: &[u8], matcher: &[u8], case_sensitive: bool) -> bool {
870    let (m, exact_only) = unpack_matcher(matcher);
871    if ends_with_cs(text, m, case_sensitive) {
872        let start = text.len() - m.len();
873        if exact_only {
874            // `=foo` matchers fire only when `text` IS `foo` (no `Mod.foo`,
875            // `Class::foo`, or any preceding namespace).  Lets a label rule
876            // distinguish bare `Kernel#open` from `File.open`, the former
877            // shells out on `|cmd`, the latter never does (CVE-2020-8130).
878            start == 0
879        } else {
880            start == 0 || matches!(text[start - 1], b'.' | b':')
881        }
882    } else {
883        false
884    }
885}
886
/// Split a matcher into its pattern bytes and an "exact-only" flag.
///
/// A matcher starting with `=` (e.g. `"=open"`) is returned without the
/// sigil and with the flag set: such matchers fire only when the candidate
/// text equals the pattern exactly, suppressing the usual `.`/`:` boundary
/// allowance.  Used to distinguish bare-callee Ruby/Python builtins from
/// methods of the same name on a typed receiver.  Any other matcher is
/// returned unchanged with the flag cleared.
#[inline]
fn unpack_matcher(matcher: &[u8]) -> (&[u8], bool) {
    match matcher.split_first() {
        Some((&b'=', pattern)) => (pattern, true),
        _ => (matcher, false),
    }
}
900
901/// Try to classify a piece of syntax text.
902/// `lang` is the canonicalised language key ("rust", "javascript", ...).
903///
904/// If `extra` runtime rules are provided, they are checked **first** (config
905/// takes priority over built-in rules).
906///
907/// **Two-pass matching** -- exact / suffix matches are checked across *all*
908/// rules before any prefix (`foo_`) match is attempted.  This prevents a
909/// greedy prefix like `sanitize_` from shadowing a more specific exact
910/// match like `sanitize_shell`.
911pub fn classify(lang: &str, text: &str, extra: Option<&[RuntimeLabelRule]>) -> Option<DataLabel> {
912    let head = text.split(['(', '<']).next().unwrap_or("");
913    let trimmed = head.trim().as_bytes();
914
915    // Early out: exclude known-benign framework patterns.
916    if is_excluded(lang, trimmed) {
917        return None;
918    }
919
920    // For chained calls like `r.URL.Query().Get`, also strip internal
921    // `().` segments to produce a normalized form like `r.URL.Query.Get`.
922    let full_normalized = normalize_chained_call(text);
923    let full_norm_bytes = full_normalized.as_bytes();
924
925    // ── Check runtime (config) rules first, they take priority ──────
926    if let Some(extras) = extra {
927        // Pass 1: exact / suffix
928        for rule in extras {
929            for raw in &rule.matchers {
930                let m = raw.as_bytes();
931                if m.last() == Some(&b'_') {
932                    continue;
933                }
934                if match_suffix_cs(trimmed, m, rule.case_sensitive)
935                    || match_suffix_cs(full_norm_bytes, m, rule.case_sensitive)
936                {
937                    return Some(rule.label);
938                }
939            }
940        }
941        // Pass 2: prefix
942        for rule in extras {
943            for raw in &rule.matchers {
944                let m = raw.as_bytes();
945                if m.last() == Some(&b'_')
946                    && (starts_with_cs(trimmed, m, rule.case_sensitive)
947                        || starts_with_cs(full_norm_bytes, m, rule.case_sensitive))
948                {
949                    return Some(rule.label);
950                }
951            }
952        }
953    }
954
955    // ── Built-in static rules ────────────────────────────────────────
956    let rules = REGISTRY.get(lang).or_else(|| {
957        let key = lang.to_ascii_lowercase();
958        REGISTRY.get(key.as_str())
959    })?;
960
961    // Pass 1: exact / suffix matches (high confidence)
962    for rule in *rules {
963        for raw in rule.matchers {
964            let m = raw.as_bytes();
965            if m.last() == Some(&b'_') {
966                continue;
967            }
968            if match_suffix_cs(trimmed, m, rule.case_sensitive)
969                || match_suffix_cs(full_norm_bytes, m, rule.case_sensitive)
970            {
971                return Some(rule.label);
972            }
973        }
974    }
975
976    // Pass 2: prefix matches (catch-all, lower priority)
977    for rule in *rules {
978        for raw in rule.matchers {
979            let m = raw.as_bytes();
980            if m.last() == Some(&b'_')
981                && (starts_with_cs(trimmed, m, rule.case_sensitive)
982                    || starts_with_cs(full_norm_bytes, m, rule.case_sensitive))
983            {
984                return Some(rule.label);
985            }
986        }
987    }
988
989    None
990}
991
992/// Classify a piece of syntax text, returning **all** matching labels.
993///
994/// Same two-pass (exact/suffix then prefix) structure as [`classify()`], but
995/// collects every match instead of returning on first hit.  Deduplicates
996/// exact `(variant, caps)` pairs.
997pub fn classify_all(
998    lang: &str,
999    text: &str,
1000    extra: Option<&[RuntimeLabelRule]>,
1001) -> SmallVec<[DataLabel; 2]> {
1002    let head = text.split(['(', '<']).next().unwrap_or("");
1003    let trimmed = head.trim().as_bytes();
1004
1005    // Early out: exclude known-benign framework patterns.
1006    if is_excluded(lang, trimmed) {
1007        return SmallVec::new();
1008    }
1009
1010    let full_normalized = normalize_chained_call(text);
1011    let full_norm_bytes = full_normalized.as_bytes();
1012
1013    let mut out: SmallVec<[DataLabel; 2]> = SmallVec::new();
1014
1015    // Helper: push if not already present (dedup by variant+caps equality).
1016    #[inline]
1017    fn push_dedup(out: &mut SmallVec<[DataLabel; 2]>, label: DataLabel) {
1018        if !out.contains(&label) {
1019            out.push(label);
1020        }
1021    }
1022
1023    // ── Check runtime (config) rules first, they take priority ──────
1024    if let Some(extras) = extra {
1025        // Pass 1: exact / suffix
1026        for rule in extras {
1027            for raw in &rule.matchers {
1028                let m = raw.as_bytes();
1029                if m.last() == Some(&b'_') {
1030                    continue;
1031                }
1032                if match_suffix_cs(trimmed, m, rule.case_sensitive)
1033                    || match_suffix_cs(full_norm_bytes, m, rule.case_sensitive)
1034                {
1035                    push_dedup(&mut out, rule.label);
1036                }
1037            }
1038        }
1039        // Pass 2: prefix
1040        for rule in extras {
1041            for raw in &rule.matchers {
1042                let m = raw.as_bytes();
1043                if m.last() == Some(&b'_')
1044                    && (starts_with_cs(trimmed, m, rule.case_sensitive)
1045                        || starts_with_cs(full_norm_bytes, m, rule.case_sensitive))
1046                {
1047                    push_dedup(&mut out, rule.label);
1048                }
1049            }
1050        }
1051    }
1052
1053    // ── Built-in static rules ────────────────────────────────────────
1054    let rules = REGISTRY.get(lang).or_else(|| {
1055        let key = lang.to_ascii_lowercase();
1056        REGISTRY.get(key.as_str())
1057    });
1058
1059    if let Some(rules) = rules {
1060        // Pass 1: exact / suffix matches (high confidence)
1061        for rule in *rules {
1062            for raw in rule.matchers {
1063                let m = raw.as_bytes();
1064                if m.last() == Some(&b'_') {
1065                    continue;
1066                }
1067                if match_suffix_cs(trimmed, m, rule.case_sensitive)
1068                    || match_suffix_cs(full_norm_bytes, m, rule.case_sensitive)
1069                {
1070                    push_dedup(&mut out, rule.label);
1071                }
1072            }
1073        }
1074
1075        // Pass 2: prefix matches (catch-all, lower priority)
1076        for rule in *rules {
1077            for raw in rule.matchers {
1078                let m = raw.as_bytes();
1079                if m.last() == Some(&b'_')
1080                    && (starts_with_cs(trimmed, m, rule.case_sensitive)
1081                        || starts_with_cs(full_norm_bytes, m, rule.case_sensitive))
1082                {
1083                    push_dedup(&mut out, rule.label);
1084                }
1085            }
1086        }
1087    }
1088
1089    out
1090}
1091
/// Result of a gated-sink classification.
///
/// `label` is the sink capability the callee contributes at this site.
/// `payload_args` identifies positional args that carry the tainted payload
/// (or [`ALL_ARGS_PAYLOAD`] for dynamic-activation conservative fallback).
/// `object_destination_fields`, when non-empty, restricts sink-taint checks
/// to identifiers found under those field names within an object-literal
/// positional argument, used by destination-aware outbound-HTTP gates so
/// `fetch({url, body})` fires only when taint reaches `url`, not `body`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct GateMatch {
    /// Label copied from the gate that fired.
    pub label: DataLabel,
    /// Positional indices of payload-carrying arguments, or the
    /// [`ALL_ARGS_PAYLOAD`] sentinel meaning "every positional argument".
    pub payload_args: &'static [usize],
    /// Object-literal field names that narrow the taint check; empty for
    /// gates that are not destination-activated.
    pub object_destination_fields: &'static [&'static str],
}
1107
1108/// Classify a call against gated sink rules.
1109///
1110/// Returns every gate whose callee matches AND whose activation conditions
1111/// fire.  An empty result means the callee did not match any gated rule, or
1112/// every match was provably safe.  Multiple matches are possible when the
1113/// same callee carries gates for different sink classes, e.g. `fetch` is
1114/// both an SSRF gate (URL flow) and a `DATA_EXFIL` gate (body / headers /
1115/// json flow); each gate carries its own [`GateMatch`] so downstream code
1116/// can attribute findings per-cap.
1117///
1118/// `const_arg_at` extracts positional argument values.
1119/// `const_keyword_arg` extracts keyword argument values (for languages like Python).
1120pub fn classify_gated_sink(
1121    lang: &str,
1122    callee_text: &str,
1123    const_arg_at: impl Fn(usize) -> Option<String>,
1124    const_keyword_arg: impl Fn(&str) -> Option<String>,
1125    kwarg_present: impl Fn(&str) -> bool,
1126) -> SmallVec<[GateMatch; 2]> {
1127    let mut out: SmallVec<[GateMatch; 2]> = SmallVec::new();
1128    let gates = match GATED_REGISTRY.get(lang).or_else(|| {
1129        let key = lang.to_ascii_lowercase();
1130        GATED_REGISTRY.get(key.as_str())
1131    }) {
1132        Some(g) => g,
1133        None => return out,
1134    };
1135
1136    // Match against the original callee text AND a chain-normalised form
1137    // that strips `()` between dots so a chained construction like
1138    // `httpx.AsyncClient().post` matches a gate matcher of
1139    // `httpx.AsyncClient.post`.  Mirrors the normalisation applied by
1140    // `classify` for flat label rules.
1141    let callee_bytes = callee_text.as_bytes();
1142    let normalized = normalize_chained_call(callee_text);
1143    let normalized_bytes = normalized.as_bytes();
1144
1145    for gate in *gates {
1146        let matcher = gate.callee_matcher.as_bytes();
1147        if !match_suffix_cs(callee_bytes, matcher, gate.case_sensitive)
1148            && !match_suffix_cs(normalized_bytes, matcher, gate.case_sensitive)
1149        {
1150            continue;
1151        }
1152
1153        // Destination-flow activation: always fires.  Downstream filters sink
1154        // taint checks to `payload_args` (and, for object-literal args, further
1155        // to `object_destination_fields`).
1156        if let GateActivation::Destination {
1157            object_destination_fields,
1158        } = gate.activation
1159        {
1160            out.push(GateMatch {
1161                label: gate.label,
1162                payload_args: gate.payload_args,
1163                object_destination_fields,
1164            });
1165            continue;
1166        }
1167
1168        // ── ValueMatch activation (legacy) ───────────────────────────────
1169
1170        // Multi-kwarg gate path.  Takes precedence over positional / single-kwarg
1171        // inspection when populated.  Semantics are presence-aware: an absent
1172        // kwarg is treated as the language default (safe) and does not alone
1173        // activate the gate.
1174        if !gate.dangerous_kwargs.is_empty() && gate.keyword_name.is_none() {
1175            let mut any_dangerous = false;
1176            let mut any_dynamic_present = false;
1177            for (name, values) in gate.dangerous_kwargs {
1178                if !kwarg_present(name) {
1179                    continue; // absent → takes language default (safe)
1180                }
1181                match const_keyword_arg(name) {
1182                    Some(v) => {
1183                        let lower = v.to_ascii_lowercase();
1184                        if values.iter().any(|dv| lower == dv.to_ascii_lowercase()) {
1185                            any_dangerous = true;
1186                            break;
1187                        }
1188                        // Present with a safe literal, continue checking other kwargs.
1189                    }
1190                    None => {
1191                        any_dynamic_present = true;
1192                    }
1193                }
1194            }
1195            if any_dangerous {
1196                out.push(GateMatch {
1197                    label: gate.label,
1198                    payload_args: gate.payload_args,
1199                    object_destination_fields: &[],
1200                });
1201                continue;
1202            }
1203            if any_dynamic_present {
1204                // Dynamic kwarg value, we can't prove safe. Conservatively
1205                // flag every positional arg so the activation pathway isn't
1206                // silently narrowed to the gate's declared `payload_args`.
1207                out.push(GateMatch {
1208                    label: gate.label,
1209                    payload_args: ALL_ARGS_PAYLOAD,
1210                    object_destination_fields: &[],
1211                });
1212                continue;
1213            }
1214            continue; // all listed kwargs absent or safe-literal → suppress
1215        }
1216
1217        // Single-kwarg / positional gate path (original semantics).
1218        let activation_value = if let Some(kw) = gate.keyword_name {
1219            const_keyword_arg(kw)
1220        } else {
1221            const_arg_at(gate.arg_index)
1222        };
1223
1224        match activation_value {
1225            Some(value) => {
1226                let lower = value.to_ascii_lowercase();
1227                let is_dangerous = gate
1228                    .dangerous_values
1229                    .iter()
1230                    .any(|v| lower == v.to_ascii_lowercase())
1231                    || gate
1232                        .dangerous_prefixes
1233                        .iter()
1234                        .any(|p| lower.starts_with(&p.to_ascii_lowercase()));
1235                if is_dangerous {
1236                    out.push(GateMatch {
1237                        label: gate.label,
1238                        payload_args: gate.payload_args,
1239                        object_destination_fields: &[],
1240                    });
1241                }
1242                // safe constant → suppress (no push)
1243            }
1244            // Unknown / dynamic activation arg: the gate fires conservatively,
1245            // but we can't prove that only the declared `payload_args` carry
1246            // risk, a tainted activation arg (e.g. `setAttribute(userAttr, …)`
1247            // where `userAttr` is user-controlled) is itself a vulnerability
1248            // path. Return ALL_ARGS_PAYLOAD so downstream sink scanning
1249            // considers every positional argument.
1250            None => {
1251                out.push(GateMatch {
1252                    label: gate.label,
1253                    payload_args: ALL_ARGS_PAYLOAD,
1254                    object_destination_fields: &[],
1255                });
1256            }
1257        }
1258    }
1259    out
1260}
1261
/// Public wrapper for `normalize_chained_call` so callers outside the module
/// can share the same normalization used by the label classifier.
///
/// Forwards `text` unchanged and returns the normalized string, e.g.
/// `r.URL.Query().Get` becomes `r.URL.Query.Get`.
pub fn normalize_chained_call_for_classify(text: &str) -> String {
    normalize_chained_call(text)
}
1267
/// Return the bare method-name segment of a callee text: everything after
/// the last `.`.  A callee without a `.` is returned unchanged, and a
/// trailing `.` yields the empty string.
///
/// When you have an `SsaOp::Call`, prefer reading `callee` directly and
/// walking `receiver` through `FieldProj` ops; this helper is the textual
/// fallback for callsites that only see a `&str`.
pub fn bare_method_name(callee: &str) -> &str {
    match callee.rfind('.') {
        // `.` is one byte wide, so `dot + 1` is always a char boundary.
        Some(dot) => &callee[dot + 1..],
        None => callee,
    }
}
1276
/// Normalize a chained method call: strip `(...)` between `.` segments.
/// e.g. `r.URL.Query().Get` → `r.URL.Query.Get`
/// e.g. `r.URL.Query().Get("host")` → `r.URL.Query.Get`
///
/// Rules, applied left to right:
/// * A parenthesised group (nesting respected) is dropped when it is
///   followed by `.` or runs to the end of the text.  An unbalanced `(`
///   therefore drops everything after it.
/// * A parenthesised group followed by anything else is kept verbatim.
/// * Scanning stops at the first `<` outside a dropped group (generic
///   arguments); it and everything after it are discarded.
///
/// Text is copied as whole `&str` slices, so non-ASCII identifiers survive
/// intact.  Copying byte-by-byte via `u8 as char` would re-encode each
/// UTF-8 byte as a separate Latin-1 code point.
fn normalize_chained_call(text: &str) -> String {
    let bytes = text.as_bytes();
    let mut result = String::with_capacity(text.len());
    // Start of the pending run of text not yet copied into `result`.
    // Every slice boundary below sits next to an ASCII byte (`(`, `)`, `<`)
    // or at either end of `text`, so it is always a valid char boundary.
    let mut run_start = 0;
    let mut i = 0;
    while i < bytes.len() {
        match bytes[i] {
            b'(' => {
                // Find the index just past the matching `)`, or the end of
                // the text when the group is unbalanced.
                let mut depth = 1u32;
                let mut j = i + 1;
                while j < bytes.len() && depth > 0 {
                    match bytes[j] {
                        b'(' => depth += 1,
                        b')' => depth -= 1,
                        _ => {}
                    }
                    j += 1;
                }
                if j >= bytes.len() || bytes[j] == b'.' {
                    // Call parens in a chain (or a trailing call): flush the
                    // text before them and resume after the group.
                    result.push_str(&text[run_start..i]);
                    run_start = j;
                    i = j;
                } else {
                    // Unusual case: keep the `(`; its contents are then
                    // scanned like ordinary text.
                    i += 1;
                }
            }
            b'<' => break, // Stop at generic args
            _ => i += 1,
        }
    }
    result.push_str(&text[run_start..i]);
    result
}
1317
1318// ── Rule enumeration ─────────────────────────────────────────────────────────
1319
/// All canonical language slugs (no aliases).
///
/// `enumerate_builtin_rules` walks this list, so rules are reported in
/// this order.
const CANONICAL_LANGS: &[&str] = &[
    "javascript",
    "typescript",
    "python",
    "go",
    "java",
    "c",
    "cpp",
    "php",
    "ruby",
    "rust",
];
1333
/// Map an alias slug to its canonical language name.
///
/// Recognised aliases are `js`, `ts`, `py`, `c++`, `rb` and `rs`.  Any
/// other slug, including the canonical names and unknown languages, is
/// returned as-is (the caller's borrow keeps it alive).  Matching is exact
/// and case-sensitive, and never allocates.
pub fn canonical_lang(slug: &str) -> &str {
    const ALIASES: &[(&str, &str)] = &[
        ("js", "javascript"),
        ("ts", "typescript"),
        ("py", "python"),
        ("c++", "cpp"),
        ("rb", "ruby"),
        ("rs", "rust"),
    ];

    ALIASES
        .iter()
        .find(|(alias, _)| *alias == slug)
        .map_or(slug, |&(_, canonical)| canonical)
}
1352
1353/// Human-readable name for a Cap bitflag value.
1354pub fn cap_to_name(cap: Cap) -> &'static str {
1355    if cap == Cap::all() {
1356        return "all";
1357    }
1358    match cap {
1359        Cap::ENV_VAR => "env_var",
1360        Cap::HTML_ESCAPE => "html_escape",
1361        Cap::SHELL_ESCAPE => "shell_escape",
1362        Cap::URL_ENCODE => "url_encode",
1363        Cap::JSON_PARSE => "json_parse",
1364        Cap::FILE_IO => "file_io",
1365        Cap::FMT_STRING => "fmt_string",
1366        Cap::SQL_QUERY => "sql_query",
1367        Cap::DESERIALIZE => "deserialize",
1368        Cap::SSRF => "ssrf",
1369        Cap::CODE_EXEC => "code_exec",
1370        Cap::CRYPTO => "crypto",
1371        Cap::UNAUTHORIZED_ID => "unauthorized_id",
1372        _ => "unknown",
1373    }
1374}
1375
1376/// Generate a stable rule ID from language, kind, and matchers.
1377pub fn rule_id(lang: &str, kind: &str, matchers: &[&str]) -> String {
1378    let mut sorted: Vec<&str> = matchers.to_vec();
1379    sorted.sort_unstable();
1380    let joined = sorted.join("\0");
1381    let hash = blake3::hash(joined.as_bytes());
1382    let hex = hash.to_hex();
1383    format!("{}.{}.{}", lang, kind, &hex[..8])
1384}
1385
/// Metadata-enriched view of a label rule (built-in or custom).
///
/// The field notes below describe the values `enumerate_builtin_rules`
/// produces for built-in rules.
#[derive(Debug, Clone, Serialize)]
pub struct RuleInfo {
    /// Stable identifier; built-ins use `rule_id` (`<lang>.<kind>.<hash8>`).
    pub id: String,
    /// Display title: the first matcher followed by the kind in parentheses.
    pub title: String,
    /// Canonical language slug the rule belongs to.
    pub language: String,
    /// `"source"`, `"sanitizer"`, or `"sink"`.
    pub kind: String,
    /// Capability name as returned by `cap_to_name`.
    pub cap: String,
    /// Raw bit pattern of the rule's `Cap`.
    pub cap_bits: u16,
    /// Matcher patterns that trigger the rule.
    pub matchers: Vec<String>,
    /// Whether matchers are compared case-sensitively.
    pub case_sensitive: bool,
    /// `false` for every built-in rule.
    pub is_custom: bool,
    /// `true` for entries that come from the gated-sink registry.
    pub is_gated: bool,
    /// `true` for every built-in rule as enumerated.
    pub enabled: bool,
}
1401
1402/// Enumerate all built-in rules across all languages.
1403pub fn enumerate_builtin_rules() -> Vec<RuleInfo> {
1404    let mut out = Vec::new();
1405
1406    for &lang in CANONICAL_LANGS {
1407        if let Some(rules) = REGISTRY.get(lang) {
1408            for rule in *rules {
1409                let (kind_str, cap) = match rule.label {
1410                    DataLabel::Source(c) => ("source", c),
1411                    DataLabel::Sanitizer(c) => ("sanitizer", c),
1412                    DataLabel::Sink(c) => ("sink", c),
1413                };
1414                let matchers_strs: Vec<&str> = rule.matchers.to_vec();
1415                let id = rule_id(lang, kind_str, &matchers_strs);
1416                let first = rule.matchers.first().copied().unwrap_or("?");
1417                let title = format!("{} ({})", first, kind_str);
1418                out.push(RuleInfo {
1419                    id,
1420                    title,
1421                    language: lang.to_string(),
1422                    kind: kind_str.to_string(),
1423                    cap: cap_to_name(cap).to_string(),
1424                    cap_bits: cap.bits(),
1425                    matchers: rule.matchers.iter().map(|s| s.to_string()).collect(),
1426                    case_sensitive: rule.case_sensitive,
1427                    is_custom: false,
1428                    is_gated: false,
1429                    enabled: true,
1430                });
1431            }
1432        }
1433
1434        // Include gated sink entries
1435        if let Some(gates) = GATED_REGISTRY.get(lang) {
1436            for gate in *gates {
1437                let cap = match gate.label {
1438                    DataLabel::Source(c) | DataLabel::Sanitizer(c) | DataLabel::Sink(c) => c,
1439                };
1440                let kind_str = "sink";
1441                let matchers_strs = &[gate.callee_matcher];
1442                let id = rule_id(lang, &format!("gated_{}", kind_str), matchers_strs);
1443                let title = format!("{} (gated {})", gate.callee_matcher, kind_str);
1444                out.push(RuleInfo {
1445                    id,
1446                    title,
1447                    language: lang.to_string(),
1448                    kind: kind_str.to_string(),
1449                    cap: cap_to_name(cap).to_string(),
1450                    cap_bits: cap.bits(),
1451                    matchers: vec![gate.callee_matcher.to_string()],
1452                    case_sensitive: gate.case_sensitive,
1453                    is_custom: false,
1454                    is_gated: true,
1455                    enabled: true,
1456                });
1457            }
1458        }
1459    }
1460
1461    out
1462}
1463
1464/// Generate a custom rule ID with `custom.` prefix.
1465pub fn custom_rule_id(lang: &str, kind: &str, matchers: &[String]) -> String {
1466    let refs: Vec<&str> = matchers.iter().map(|s| s.as_str()).collect();
1467    format!("custom.{}", rule_id(lang, kind, &refs))
1468}
1469
1470#[cfg(test)]
1471mod tests {
1472    use super::*;
1473
1474    #[test]
1475    fn bare_method_name_strips_chain() {
1476        // No-dot input → returned as-is.
1477        assert_eq!(bare_method_name("foo"), "foo");
1478        // 1-dot → trailing segment.
1479        assert_eq!(bare_method_name("obj.method"), "method");
1480        // Multi-dot → trailing segment.
1481        assert_eq!(bare_method_name("a.b.c.method"), "method");
1482        // Trailing dot → empty trailing segment.
1483        assert_eq!(bare_method_name("foo."), "");
1484        // Empty input.
1485        assert_eq!(bare_method_name(""), "");
1486        // SSA-decomposed chains pass through untouched.
1487        assert_eq!(bare_method_name("Lock"), "Lock");
1488    }
1489
1490    #[test]
1491    fn handler_param_names_exact_and_prefix() {
1492        // Exact names still match.
1493        assert!(is_js_ts_handler_param_name("cmd"));
1494        assert!(is_js_ts_handler_param_name("input"));
1495        assert!(is_js_ts_handler_param_name("userId"));
1496        assert!(is_js_ts_handler_param_name("USERID"));
1497        // camelCase `user*` prefix.
1498        assert!(is_js_ts_handler_param_name("userCmd"));
1499        assert!(is_js_ts_handler_param_name("userData"));
1500        assert!(is_js_ts_handler_param_name("userPath"));
1501        // snake_case prefix.
1502        assert!(is_js_ts_handler_param_name("user_cmd"));
1503        // Bare `user` does not match (no distinguishing suffix).
1504        assert!(!is_js_ts_handler_param_name("user"));
1505        assert!(!is_js_ts_handler_param_name("userx"));
1506        // Other names unaffected.
1507        assert!(!is_js_ts_handler_param_name("url"));
1508        assert!(!is_js_ts_handler_param_name("value"));
1509    }
1510
1511    #[test]
1512    fn classify_none_extra_unchanged() {
1513        // Built-in rule: innerHTML → Sink(HTML_ESCAPE)
1514        let result = classify("javascript", "innerHTML", None);
1515        assert_eq!(result, Some(DataLabel::Sink(Cap::HTML_ESCAPE)));
1516
1517        // Non-existent should still be None
1518        let result = classify("javascript", "myCustomFunc", None);
1519        assert_eq!(result, None);
1520    }
1521
1522    #[test]
1523    fn classify_extra_rules_take_priority() {
1524        let extras = vec![RuntimeLabelRule {
1525            matchers: vec!["escapeHtml".into()],
1526            label: DataLabel::Sanitizer(Cap::HTML_ESCAPE),
1527            case_sensitive: false,
1528        }];
1529
1530        let result = classify("javascript", "escapeHtml", Some(&extras));
1531        assert_eq!(result, Some(DataLabel::Sanitizer(Cap::HTML_ESCAPE)));
1532
1533        // Built-in rules still work
1534        let result = classify("javascript", "innerHTML", Some(&extras));
1535        assert_eq!(result, Some(DataLabel::Sink(Cap::HTML_ESCAPE)));
1536    }
1537
1538    #[test]
1539    fn classify_extra_overrides_builtin() {
1540        // Override innerHTML to be a sanitizer (contrived but tests priority)
1541        let extras = vec![RuntimeLabelRule {
1542            matchers: vec!["innerHTML".into()],
1543            label: DataLabel::Sanitizer(Cap::HTML_ESCAPE),
1544            case_sensitive: false,
1545        }];
1546
1547        let result = classify("javascript", "innerHTML", Some(&extras));
1548        assert_eq!(result, Some(DataLabel::Sanitizer(Cap::HTML_ESCAPE)));
1549    }
1550
1551    #[test]
1552    fn classify_location_href_is_sink() {
1553        let result = classify("javascript", "location.href", None);
1554        assert_eq!(result, Some(DataLabel::Sink(Cap::URL_ENCODE)));
1555    }
1556
1557    #[test]
1558    fn classify_bare_href_is_none() {
1559        // Bare "href" should NOT be a sink, only "location.href" and variants
1560        let result = classify("javascript", "href", None);
1561        assert_eq!(result, None);
1562    }
1563
1564    #[test]
1565    fn classify_case_insensitive_is_default() {
1566        let extras = vec![RuntimeLabelRule {
1567            matchers: vec!["myCustomSink".into()],
1568            label: DataLabel::Sink(Cap::HTML_ESCAPE),
1569            case_sensitive: false,
1570        }];
1571        // Default case_sensitive=false: case-insensitive match
1572        let result = classify("javascript", "MYCUSTOMSINK", Some(&extras));
1573        assert_eq!(result, Some(DataLabel::Sink(Cap::HTML_ESCAPE)));
1574    }
1575
1576    #[test]
1577    fn classify_case_sensitive_exact_match() {
1578        let extras = vec![RuntimeLabelRule {
1579            matchers: vec!["MyExactSink".into()],
1580            label: DataLabel::Sink(Cap::HTML_ESCAPE),
1581            case_sensitive: true,
1582        }];
1583        // Exact case matches
1584        let result = classify("javascript", "MyExactSink", Some(&extras));
1585        assert_eq!(result, Some(DataLabel::Sink(Cap::HTML_ESCAPE)));
1586        // Wrong case does NOT match
1587        let result = classify("javascript", "myexactsink", Some(&extras));
1588        assert_eq!(result, None);
1589    }
1590
1591    #[test]
1592    fn classify_case_sensitive_prefix() {
1593        let extras = vec![RuntimeLabelRule {
1594            matchers: vec!["Sanitize_".into()],
1595            label: DataLabel::Sanitizer(Cap::HTML_ESCAPE),
1596            case_sensitive: true,
1597        }];
1598        // Correct case prefix matches
1599        let result = classify("javascript", "Sanitize_input", Some(&extras));
1600        assert_eq!(result, Some(DataLabel::Sanitizer(Cap::HTML_ESCAPE)));
1601        // Wrong case does NOT match
1602        let result = classify("javascript", "sanitize_input", Some(&extras));
1603        assert_eq!(result, None);
1604    }
1605
1606    // CVE Hunt Session 2 (Go CVE-2024-31450 Owncast path traversal):
1607    // mutating filesystem helpers (`os.Remove`, `os.WriteFile`,
1608    // `os.RemoveAll`, `ioutil.WriteFile`) sink path-traversal flows that
1609    // the prior Go ruleset only saw on the read side (`os.Open`,
1610    // `os.ReadFile`).
1611    #[test]
1612    fn classify_go_os_remove_is_file_io_sink() {
1613        let result = classify("go", "os.Remove", None);
1614        assert_eq!(result, Some(DataLabel::Sink(Cap::FILE_IO)));
1615    }
1616
1617    #[test]
1618    fn classify_go_os_write_file_is_file_io_sink() {
1619        let result = classify("go", "os.WriteFile", None);
1620        assert_eq!(result, Some(DataLabel::Sink(Cap::FILE_IO)));
1621    }
1622
1623    #[test]
1624    fn classify_go_os_remove_all_is_file_io_sink() {
1625        let result = classify("go", "os.RemoveAll", None);
1626        assert_eq!(result, Some(DataLabel::Sink(Cap::FILE_IO)));
1627    }
1628
1629    // CVE Hunt Session 6 (Go CVE-2026-41422 daptin SQL injection): goqu's
1630    // raw SQL literal builders `goqu.L(s)` / `goqu.Lit(s)` insert `s`
1631    // verbatim into the generated query.  Modeled by name as SQL_QUERY
1632    // sinks; the safe siblings `goqu.I` (identifier), `goqu.C`, `goqu.T`,
1633    // `goqu.V`, `goqu.SUM`, `goqu.COUNT`, etc. are typed and stay
1634    // unlabeled.
1635    #[test]
1636    fn classify_go_goqu_l_is_sql_query_sink() {
1637        let result = classify("go", "goqu.L", None);
1638        assert_eq!(result, Some(DataLabel::Sink(Cap::SQL_QUERY)));
1639    }
1640
1641    #[test]
1642    fn classify_go_goqu_lit_is_sql_query_sink() {
1643        let result = classify("go", "goqu.Lit", None);
1644        assert_eq!(result, Some(DataLabel::Sink(Cap::SQL_QUERY)));
1645    }
1646
1647    #[test]
1648    fn classify_go_goqu_i_is_not_sink() {
1649        let result = classify("go", "goqu.I", None);
1650        assert_eq!(result, None);
1651    }
1652
1653    // CVE Hunt Session 2 (Go CVE-2023-3188 Owncast SSRF):
1654    // `http.DefaultClient.Get/Post/Head/Do/PostForm` is the idiomatic Go
1655    // SSRF sink shape (`http.DefaultClient` is the package-level shared
1656    // `*http.Client`).  These callees migrated from a flat `Sink(SSRF)`
1657    // rule to destination-aware gated sinks so that DATA_EXFIL gates can
1658    // coexist on the same callee (e.g. `http.DefaultClient.Post(url, _,
1659    // body)` carries SSRF on arg 0 and DATA_EXFIL on arg 2).  The
1660    // assertions below check the gate registration rather than the flat
1661    // classifier output.
1662    #[test]
1663    fn classify_go_http_default_client_get_is_ssrf_gate() {
1664        let no_kw = |_: &str| None;
1665        let no_kw_present = |_: &str| false;
1666        let result = classify_gated_sink(
1667            "go",
1668            "http.DefaultClient.Get",
1669            |_| None,
1670            no_kw,
1671            no_kw_present,
1672        );
1673        assert!(
1674            result.iter().any(|m| m.label == DataLabel::Sink(Cap::SSRF)),
1675            "expected SSRF gate match, got {result:?}"
1676        );
1677    }
1678
1679    #[test]
1680    fn classify_go_http_default_client_post_is_ssrf_and_data_exfil_gate() {
1681        let no_kw = |_: &str| None;
1682        let no_kw_present = |_: &str| false;
1683        let result = classify_gated_sink(
1684            "go",
1685            "http.DefaultClient.Post",
1686            |_| None,
1687            no_kw,
1688            no_kw_present,
1689        );
1690        assert!(
1691            result.iter().any(|m| m.label == DataLabel::Sink(Cap::SSRF)),
1692            "expected SSRF gate match, got {result:?}"
1693        );
1694        assert!(
1695            result
1696                .iter()
1697                .any(|m| m.label == DataLabel::Sink(Cap::DATA_EXFIL)),
1698            "expected DATA_EXFIL gate match, got {result:?}"
1699        );
1700    }
1701
1702    #[test]
1703    fn classify_go_http_default_client_do_is_data_exfil_gate() {
1704        let no_kw = |_: &str| None;
1705        let no_kw_present = |_: &str| false;
1706        let result = classify_gated_sink(
1707            "go",
1708            "http.DefaultClient.Do",
1709            |_| None,
1710            no_kw,
1711            no_kw_present,
1712        );
1713        assert!(
1714            result
1715                .iter()
1716                .any(|m| m.label == DataLabel::Sink(Cap::DATA_EXFIL)),
1717            "expected DATA_EXFIL gate match, got {result:?}"
1718        );
1719    }
1720
1721    #[test]
1722    fn classify_go_user_client_get_is_not_ssrf_sink() {
1723        // `client.Get` on a user-named *http.Client variable should NOT
1724        // match, the Go SSRF set is restricted to the stdlib package
1725        // helper `http.DefaultClient`. Type-aware resolution would be the
1726        // path to a broader rule, not a bare-name match.
1727        let result = classify("go", "client.Get", None);
1728        assert_eq!(result, None);
1729    }
1730
1731    // CVE Hunt Session 3 (Ruby CVE-2020-8130 rake `Kernel#open` CMDI):
1732    // bare `open(path)` interprets a leading `|` as a shell pipe.  The
1733    // `=` exact-match sigil distinguishes the dangerous bare-callee form
1734    // from `File.open` / `IO.open` / `URI.open`, each of which has its
1735    // own non-piping semantics.  Without the sigil, the suffix-with-
1736    // boundary matcher would over-fire on every `X.open` call.
1737    #[test]
1738    fn classify_ruby_bare_open_is_shell_escape_sink() {
1739        let result = classify("ruby", "open", None);
1740        assert_eq!(result, Some(DataLabel::Sink(Cap::SHELL_ESCAPE)));
1741    }
1742
1743    #[test]
1744    fn classify_ruby_file_open_is_not_shell_escape_sink() {
1745        // The exact-match sigil on `=open` must NOT fire on `File.open`.
1746        // `File.open` is a separate FILE_IO sink (existing rule); the
1747        // CMDI rule must not double-classify it.
1748        let result = classify_all("ruby", "File.open", None);
1749        // FILE_IO from the existing `File.open` matcher is allowed.
1750        assert!(result.contains(&DataLabel::Sink(Cap::FILE_IO)));
1751        // SHELL_ESCAPE from the new bare-`open` matcher must NOT appear.
1752        assert!(!result.contains(&DataLabel::Sink(Cap::SHELL_ESCAPE)));
1753    }
1754
1755    #[test]
1756    fn classify_ruby_io_open_is_not_shell_escape_sink() {
1757        // `IO.open` takes a file descriptor, never pipes.  The bare-
1758        // open CMDI rule must leave it alone.
1759        let result = classify("ruby", "IO.open", None);
1760        assert_ne!(result, Some(DataLabel::Sink(Cap::SHELL_ESCAPE)));
1761    }
1762
1763    #[test]
1764    fn classify_ruby_uri_open_remains_ssrf_sink() {
1765        // `URI.open` is the existing SSRF sink.  Adding `=open` as a
1766        // CMDI rule must not break or shadow it.
1767        let result = classify("ruby", "URI.open", None);
1768        assert_eq!(result, Some(DataLabel::Sink(Cap::SSRF)));
1769    }
1770
1771    #[test]
1772    fn classify_ruby_openuri_open_uri_is_ssrf_sink() {
1773        // OpenURI.open_uri is the canonical low-level URI fetcher that
1774        // URI.open delegates to. CarrierWave / Paperclip / similar gems
1775        // route SSRF-vulnerable downloads through it directly.
1776        // CVE-2021-21288 (CarrierWave) regression guard.
1777        let result = classify("ruby", "OpenURI.open_uri", None);
1778        assert_eq!(result, Some(DataLabel::Sink(Cap::SSRF)));
1779    }
1780
1781    #[test]
1782    fn unpack_matcher_strips_exact_sigil() {
1783        let (m, exact) = unpack_matcher(b"=open");
1784        assert_eq!(m, b"open");
1785        assert!(exact);
1786
1787        let (m, exact) = unpack_matcher(b"open");
1788        assert_eq!(m, b"open");
1789        assert!(!exact);
1790    }
1791
1792    #[test]
1793    fn classify_case_sensitive_suffix_boundary() {
1794        let extras = vec![RuntimeLabelRule {
1795            matchers: vec!["RunQuery".into()],
1796            label: DataLabel::Sink(Cap::SQL_QUERY),
1797            case_sensitive: true,
1798        }];
1799        // Correct case with dot boundary
1800        let result = classify("javascript", "db.RunQuery", Some(&extras));
1801        assert_eq!(result, Some(DataLabel::Sink(Cap::SQL_QUERY)));
1802        // Wrong case does NOT match
1803        let result = classify("javascript", "db.runquery", Some(&extras));
1804        assert_eq!(result, None);
1805    }
1806
1807    #[test]
1808    fn classify_cpp_sto_family_is_sanitizer() {
1809        // full `std::sto*` family (including 64-bit and `long
1810        // double` variants) clears every taint cap that flows through it,
1811        // matching the existing `std::stoi`/`std::stol` rule.
1812        for callee in [
1813            "std::stoi",
1814            "std::stol",
1815            "std::stoll",
1816            "std::stoul",
1817            "std::stoull",
1818            "std::stof",
1819            "std::stod",
1820            "std::stold",
1821        ] {
1822            assert_eq!(
1823                classify("cpp", callee, None),
1824                Some(DataLabel::Sanitizer(Cap::all())),
1825                "{callee} should be a Cap::all() sanitizer",
1826            );
1827        }
1828    }
1829
1830    #[test]
1831    fn parse_cap_works() {
1832        assert_eq!(parse_cap("html_escape"), Some(Cap::HTML_ESCAPE));
1833        assert_eq!(parse_cap("shell_escape"), Some(Cap::SHELL_ESCAPE));
1834        assert_eq!(parse_cap("url_encode"), Some(Cap::URL_ENCODE));
1835        assert_eq!(parse_cap("json_parse"), Some(Cap::JSON_PARSE));
1836        assert_eq!(parse_cap("env_var"), Some(Cap::ENV_VAR));
1837        assert_eq!(parse_cap("file_io"), Some(Cap::FILE_IO));
1838        assert_eq!(parse_cap("all"), Some(Cap::all()));
1839        assert_eq!(parse_cap("ALL"), Some(Cap::all()));
1840        assert_eq!(parse_cap("sql_query"), Some(Cap::SQL_QUERY));
1841        assert_eq!(parse_cap("deserialize"), Some(Cap::DESERIALIZE));
1842        assert_eq!(parse_cap("ssrf"), Some(Cap::SSRF));
1843        assert_eq!(parse_cap("code_exec"), Some(Cap::CODE_EXEC));
1844        assert_eq!(parse_cap("crypto"), Some(Cap::CRYPTO));
1845        assert_eq!(parse_cap("invalid"), None);
1846    }
1847
1848    /// No-op keyword arg extractor for tests (JS/TS have no keyword gates).
1849    fn no_kw(_: &str) -> Option<String> {
1850        None
1851    }
1852
1853    /// No-op kwarg presence check for tests that don't exercise the multi-kwarg path.
1854    fn no_kw_present(_: &str) -> bool {
1855        false
1856    }
1857
1858    /// Find the first matching gate whose label sink-caps overlap `caps`.
1859    /// Lets tests target a specific gate when a callee carries multiple
1860    /// (e.g. `fetch` is both an SSRF and a `DATA_EXFIL` gate).
1861    fn find_match_with_caps(matches: &[GateMatch], caps: Cap) -> Option<GateMatch> {
1862        matches
1863            .iter()
1864            .find(|m| matches!(m.label, DataLabel::Sink(c) if c.intersects(caps)))
1865            .copied()
1866    }
1867
1868    #[test]
1869    fn gated_sink_dangerous_exact() {
1870        let result = classify_gated_sink(
1871            "javascript",
1872            "setAttribute",
1873            |_| Some("href".to_string()),
1874            no_kw,
1875            no_kw_present,
1876        );
1877        assert_eq!(
1878            result.as_slice(),
1879            &[GateMatch {
1880                label: DataLabel::Sink(Cap::HTML_ESCAPE),
1881                payload_args: [1usize].as_slice(),
1882                object_destination_fields: &[],
1883            }]
1884        );
1885    }
1886
1887    #[test]
1888    fn gated_sink_dangerous_prefix() {
1889        let result = classify_gated_sink(
1890            "javascript",
1891            "setAttribute",
1892            |_| Some("onclick".to_string()),
1893            no_kw,
1894            no_kw_present,
1895        );
1896        assert_eq!(
1897            result.as_slice(),
1898            &[GateMatch {
1899                label: DataLabel::Sink(Cap::HTML_ESCAPE),
1900                payload_args: [1usize].as_slice(),
1901                object_destination_fields: &[],
1902            }]
1903        );
1904    }
1905
1906    #[test]
1907    fn gated_sink_safe_suppressed() {
1908        let result = classify_gated_sink(
1909            "javascript",
1910            "setAttribute",
1911            |_| Some("class".to_string()),
1912            no_kw,
1913            no_kw_present,
1914        );
1915        assert!(result.is_empty());
1916    }
1917
1918    #[test]
1919    fn gated_sink_dynamic_conservative() {
1920        // Dynamic activation (e.g. `setAttribute(attrVar, val)`) returns the
1921        // ALL_ARGS_PAYLOAD sentinel so callers expand payload tracking to
1922        // every positional arg, the activation arg itself is a vulnerability
1923        // path when attacker-controlled.
1924        let result =
1925            classify_gated_sink("javascript", "setAttribute", |_| None, no_kw, no_kw_present);
1926        assert_eq!(
1927            result.as_slice(),
1928            &[GateMatch {
1929                label: DataLabel::Sink(Cap::HTML_ESCAPE),
1930                payload_args: ALL_ARGS_PAYLOAD,
1931                object_destination_fields: &[],
1932            }]
1933        );
1934    }
1935
1936    #[test]
1937    fn gated_sink_no_match() {
1938        let result = classify_gated_sink(
1939            "rust",
1940            "setAttribute",
1941            |_| Some("href".to_string()),
1942            no_kw,
1943            no_kw_present,
1944        );
1945        assert!(result.is_empty());
1946    }
1947
1948    #[test]
1949    fn gated_sink_returns_payload_args() {
1950        // setAttribute: payload is arg 1
1951        let result = classify_gated_sink(
1952            "javascript",
1953            "setAttribute",
1954            |_| Some("href".to_string()),
1955            no_kw,
1956            no_kw_present,
1957        );
1958        assert_eq!(result[0].payload_args, &[1]);
1959
1960        // parseFromString: payload is arg 0
1961        let result = classify_gated_sink(
1962            "javascript",
1963            "parseFromString",
1964            |idx| {
1965                if idx == 1 {
1966                    Some("text/html".to_string())
1967                } else {
1968                    None
1969                }
1970            },
1971            no_kw,
1972            no_kw_present,
1973        );
1974        assert_eq!(result[0].payload_args, &[0]);
1975    }
1976
1977    #[test]
1978    fn gated_sink_parse_from_string_safe_mime() {
1979        let result = classify_gated_sink(
1980            "javascript",
1981            "parseFromString",
1982            |idx| {
1983                if idx == 1 {
1984                    Some("text/xml".to_string())
1985                } else {
1986                    None
1987                }
1988            },
1989            no_kw,
1990            no_kw_present,
1991        );
1992        assert!(result.is_empty());
1993    }
1994
1995    #[test]
1996    fn gated_sink_python_popen_shell_true() {
1997        let result = classify_gated_sink(
1998            "python",
1999            "Popen",
2000            |_| None,
2001            |kw| {
2002                if kw == "shell" {
2003                    Some("True".to_string())
2004                } else {
2005                    None
2006                }
2007            },
2008            |kw| kw == "shell",
2009        );
2010        assert_eq!(
2011            result.as_slice(),
2012            &[GateMatch {
2013                label: DataLabel::Sink(Cap::SHELL_ESCAPE),
2014                payload_args: [0usize].as_slice(),
2015                object_destination_fields: &[],
2016            }]
2017        );
2018    }
2019
2020    #[test]
2021    fn gated_sink_python_popen_shell_false() {
2022        let result = classify_gated_sink(
2023            "python",
2024            "Popen",
2025            |_| None,
2026            |kw| {
2027                if kw == "shell" {
2028                    Some("False".to_string())
2029                } else {
2030                    None
2031                }
2032            },
2033            |kw| kw == "shell",
2034        );
2035        assert!(result.is_empty());
2036    }
2037
2038    #[test]
2039    fn gated_sink_python_popen_no_shell_conservative() {
2040        // `Popen(cmd)` uses the single-kwarg / positional gate path: no `shell`
2041        // literal available → unknown activation → ALL_ARGS_PAYLOAD sentinel.
2042        let result = classify_gated_sink("python", "Popen", |_| None, |_| None, no_kw_present);
2043        assert_eq!(
2044            result.as_slice(),
2045            &[GateMatch {
2046                label: DataLabel::Sink(Cap::SHELL_ESCAPE),
2047                payload_args: ALL_ARGS_PAYLOAD,
2048                object_destination_fields: &[],
2049            }]
2050        );
2051    }
2052
2053    // ── New multi-kwarg gate path (dangerous_kwargs) tests ─────────────────
2054
2055    /// `subprocess.run(cmd, shell=True)` → activates via multi-kwarg gate.
2056    #[test]
2057    fn gated_sink_subprocess_run_shell_true() {
2058        let result = classify_gated_sink(
2059            "python",
2060            "subprocess.run",
2061            |_| None,
2062            |kw| {
2063                if kw == "shell" {
2064                    Some("True".to_string())
2065                } else {
2066                    None
2067                }
2068            },
2069            |kw| kw == "shell",
2070        );
2071        assert_eq!(
2072            result.as_slice(),
2073            &[GateMatch {
2074                label: DataLabel::Sink(Cap::SHELL_ESCAPE),
2075                payload_args: [0usize].as_slice(),
2076                object_destination_fields: &[],
2077            }]
2078        );
2079    }
2080
2081    /// `subprocess.run(cmd, shell=False)` → explicit safe literal suppresses the gate.
2082    #[test]
2083    fn gated_sink_subprocess_run_shell_false() {
2084        let result = classify_gated_sink(
2085            "python",
2086            "subprocess.run",
2087            |_| None,
2088            |kw| {
2089                if kw == "shell" {
2090                    Some("False".to_string())
2091                } else {
2092                    None
2093                }
2094            },
2095            |kw| kw == "shell",
2096        );
2097        assert!(result.is_empty());
2098    }
2099
2100    /// `subprocess.run(cmd)` → no shell kwarg → presence-aware gate suppresses.
2101    /// This is the behavioural difference from the legacy `Popen` gate path.
2102    #[test]
2103    fn gated_sink_subprocess_run_shell_absent_suppresses() {
2104        let result = classify_gated_sink(
2105            "python",
2106            "subprocess.run",
2107            |_| None,
2108            |_| None,
2109            no_kw_present,
2110        );
2111        assert!(result.is_empty());
2112    }
2113
2114    /// `subprocess.run(cmd, shell=flag)` → shell kwarg present but dynamic →
2115    /// conservative activate. Multi-kwarg dynamic-present branch also returns
2116    /// ALL_ARGS_PAYLOAD so the activation pathway is not narrowed.
2117    #[test]
2118    fn gated_sink_subprocess_run_shell_dynamic_conservative() {
2119        let result = classify_gated_sink(
2120            "python",
2121            "subprocess.run",
2122            |_| None,
2123            |_| None, // dynamic: no literal available
2124            |kw| kw == "shell",
2125        );
2126        assert_eq!(
2127            result.as_slice(),
2128            &[GateMatch {
2129                label: DataLabel::Sink(Cap::SHELL_ESCAPE),
2130                payload_args: ALL_ARGS_PAYLOAD,
2131                object_destination_fields: &[],
2132            }]
2133        );
2134    }
2135
2136    /// Destination-flow gate always fires; returns `object_destination_fields`
2137    /// verbatim for the caller to apply object-literal field filtering.
2138    #[test]
2139    fn gated_sink_destination_positional_always_fires() {
2140        // `fetch(url)`, arg 0 is the URL (positional destination) OR an
2141        // object with a `url` field. The gate fires unconditionally, with
2142        // `url` declared as the object-literal destination-field for the
2143        // `fetch({url, body})` shape.
2144        let result = classify_gated_sink(
2145            "javascript",
2146            "fetch",
2147            |_| None, // no literal, Destination mode doesn't inspect it
2148            no_kw,
2149            no_kw_present,
2150        );
2151        let m = find_match_with_caps(&result, Cap::SSRF).expect("fetch SSRF gate should fire");
2152        assert_eq!(m.label, DataLabel::Sink(Cap::SSRF));
2153        assert_eq!(m.payload_args, &[0]);
2154        assert_eq!(m.object_destination_fields, &["url"]);
2155    }
2156
2157    /// Destination gate with `object_destination_fields` surfaces them for
2158    /// the CFG caller to drive object-literal field filtering.
2159    #[test]
2160    fn gated_sink_destination_object_fields_surfaced() {
2161        // `http.request(opts, cb)`, opts is an object with destination fields.
2162        let result =
2163            classify_gated_sink("javascript", "http.request", |_| None, no_kw, no_kw_present);
2164        let m = result
2165            .first()
2166            .copied()
2167            .expect("http.request gate should fire");
2168        assert_eq!(m.label, DataLabel::Sink(Cap::SSRF));
2169        assert_eq!(m.payload_args, &[0]);
2170        assert!(
2171            m.object_destination_fields
2172                .iter()
2173                .any(|&f| f == "host" || f == "hostname"),
2174            "expected host/hostname in destination fields, got {:?}",
2175            m.object_destination_fields,
2176        );
2177    }
2178
2179    /// `fetch` carries both SSRF (URL flow) and `DATA_EXFIL` (body / headers /
2180    /// json flow) gates. Both must fire from a single classify call so the
2181    /// downstream CFG can build per-cap filters.
2182    #[test]
2183    fn gated_sink_fetch_emits_ssrf_and_data_exfil() {
2184        let result = classify_gated_sink("javascript", "fetch", |_| None, no_kw, no_kw_present);
2185        let ssrf = find_match_with_caps(&result, Cap::SSRF).expect("SSRF gate fires");
2186        assert_eq!(ssrf.label, DataLabel::Sink(Cap::SSRF));
2187        assert_eq!(ssrf.payload_args, &[0]);
2188        assert_eq!(ssrf.object_destination_fields, &["url"]);
2189
2190        let exfil = find_match_with_caps(&result, Cap::DATA_EXFIL).expect("DATA_EXFIL gate fires");
2191        assert_eq!(exfil.label, DataLabel::Sink(Cap::DATA_EXFIL));
2192        assert_eq!(exfil.payload_args, &[1]);
2193        assert!(
2194            exfil.object_destination_fields.contains(&"body"),
2195            "expected body in DATA_EXFIL destination fields, got {:?}",
2196            exfil.object_destination_fields,
2197        );
2198    }
2199
2200    #[test]
2201    fn classify_all_single_label() {
2202        let result = classify_all("javascript", "innerHTML", None);
2203        assert_eq!(result.len(), 1);
2204        assert_eq!(result[0], DataLabel::Sink(Cap::HTML_ESCAPE));
2205    }
2206
2207    #[test]
2208    fn classify_all_dual_label_php() {
2209        let result = classify_all("php", "file_get_contents", None);
2210        assert!(result.len() >= 2, "expected dual label, got {:?}", result);
2211        assert!(
2212            result.contains(&DataLabel::Source(Cap::all())),
2213            "expected Source(all), got {:?}",
2214            result
2215        );
2216        assert!(
2217            result.contains(&DataLabel::Sink(Cap::SSRF)),
2218            "expected Sink(SSRF), got {:?}",
2219            result
2220        );
2221    }
2222
2223    #[test]
2224    fn classify_all_dual_label_java() {
2225        let result = classify_all("java", "readObject", None);
2226        assert!(result.len() >= 2, "expected dual label, got {:?}", result);
2227        assert!(
2228            result.contains(&DataLabel::Source(Cap::all())),
2229            "expected Source(all), got {:?}",
2230            result
2231        );
2232        assert!(
2233            result.contains(&DataLabel::Sink(Cap::DESERIALIZE)),
2234            "expected Sink(DESERIALIZE), got {:?}",
2235            result
2236        );
2237    }
2238
2239    #[test]
2240    fn classify_go_echo_sinks_with_runtime_rules() {
2241        use crate::utils::project::{DetectedFramework, FrameworkContext};
2242
2243        let ctx = FrameworkContext {
2244            frameworks: vec![DetectedFramework::Echo],
2245            inspected_langs: std::collections::HashSet::new(),
2246        };
2247        let rules = go::framework_rules(&ctx);
2248        let extras = rules.to_vec();
2249
2250        assert_eq!(
2251            classify("go", "c.String", Some(&extras)),
2252            Some(DataLabel::Sink(Cap::HTML_ESCAPE)),
2253        );
2254        assert_eq!(
2255            classify("go", "c.HTML", Some(&extras)),
2256            Some(DataLabel::Sink(Cap::HTML_ESCAPE)),
2257        );
2258        assert_eq!(
2259            classify("go", "c.JSON", Some(&extras)),
2260            Some(DataLabel::Sink(Cap::HTML_ESCAPE)),
2261        );
2262
2263        // Without Echo framework, these should not match
2264        let empty = go::framework_rules(&FrameworkContext::default());
2265        assert_eq!(classify("go", "c.String", Some(&empty)), None);
2266    }
2267
2268    #[test]
2269    fn classify_javascript_koa_runtime_rules() {
2270        use crate::utils::project::{DetectedFramework, FrameworkContext};
2271
2272        let ctx = FrameworkContext {
2273            frameworks: vec![DetectedFramework::Koa],
2274            inspected_langs: std::collections::HashSet::new(),
2275        };
2276        let extras = javascript::framework_rules(&ctx);
2277
2278        assert_eq!(
2279            classify("javascript", "ctx.query", Some(&extras)),
2280            Some(DataLabel::Source(Cap::all())),
2281        );
2282        assert_eq!(
2283            classify("javascript", "ctx.cookies.get", Some(&extras)),
2284            Some(DataLabel::Source(Cap::all())),
2285        );
2286        assert_eq!(
2287            classify("javascript", "ctx.body", Some(&extras)),
2288            Some(DataLabel::Sink(Cap::HTML_ESCAPE)),
2289        );
2290        assert_eq!(
2291            classify("javascript", "ctx.redirect", Some(&extras)),
2292            Some(DataLabel::Sink(Cap::SSRF)),
2293        );
2294
2295        let empty = javascript::framework_rules(&FrameworkContext::default());
2296        assert_eq!(classify("javascript", "ctx.query", Some(&empty)), None);
2297    }
2298
2299    #[test]
2300    fn classify_typescript_fastify_runtime_rules() {
2301        use crate::utils::project::{DetectedFramework, FrameworkContext};
2302
2303        let ctx = FrameworkContext {
2304            frameworks: vec![DetectedFramework::Fastify],
2305            inspected_langs: std::collections::HashSet::new(),
2306        };
2307        let extras = typescript::framework_rules(&ctx);
2308
2309        assert_eq!(
2310            classify("typescript", "request.query", Some(&extras)),
2311            Some(DataLabel::Source(Cap::all())),
2312        );
2313        assert_eq!(
2314            classify("typescript", "reply.send", Some(&extras)),
2315            Some(DataLabel::Sink(Cap::HTML_ESCAPE)),
2316        );
2317        assert_eq!(
2318            classify("typescript", "reply.redirect", Some(&extras)),
2319            Some(DataLabel::Sink(Cap::SSRF)),
2320        );
2321
2322        let empty = typescript::framework_rules(&FrameworkContext::default());
2323        assert_eq!(classify("typescript", "request.query", Some(&empty)), None);
2324    }
2325
2326    #[test]
2327    fn classify_ruby_sinatra_template_sinks() {
2328        use crate::utils::project::{DetectedFramework, FrameworkContext};
2329
2330        let ctx = FrameworkContext {
2331            frameworks: vec![DetectedFramework::Sinatra],
2332            inspected_langs: std::collections::HashSet::new(),
2333        };
2334        let rules = ruby::framework_rules(&ctx);
2335        let extras = rules.to_vec();
2336
2337        assert_eq!(
2338            classify("ruby", "erb", Some(&extras)),
2339            Some(DataLabel::Sink(Cap::HTML_ESCAPE)),
2340        );
2341        assert_eq!(
2342            classify("ruby", "haml", Some(&extras)),
2343            Some(DataLabel::Sink(Cap::HTML_ESCAPE)),
2344        );
2345
2346        // Without Sinatra, erb should not match
2347        let empty = ruby::framework_rules(&FrameworkContext::default());
2348        assert_eq!(classify("ruby", "erb", Some(&empty)), None);
2349    }
2350
2351    #[test]
2352    fn classify_rust_axum_runtime_rules() {
2353        use crate::utils::project::{DetectedFramework, FrameworkContext};
2354
2355        let ctx = FrameworkContext {
2356            frameworks: vec![DetectedFramework::Axum],
2357            inspected_langs: std::collections::HashSet::new(),
2358        };
2359        let extras = rust::framework_rules(&ctx);
2360
2361        assert_eq!(
2362            classify("rust", "Path<String>", Some(&extras)),
2363            Some(DataLabel::Source(Cap::all())),
2364        );
2365        assert_eq!(
2366            classify("rust", "HeaderMap.get(\"x-user\")", Some(&extras)),
2367            Some(DataLabel::Source(Cap::all())),
2368        );
2369        assert_eq!(
2370            classify("rust", "Html(name)", Some(&extras)),
2371            Some(DataLabel::Sink(Cap::HTML_ESCAPE)),
2372        );
2373        assert_eq!(
2374            classify("rust", "Redirect::to(next)", Some(&extras)),
2375            Some(DataLabel::Sink(Cap::SSRF)),
2376        );
2377
2378        let empty = rust::framework_rules(&FrameworkContext::default());
2379        assert_eq!(classify("rust", "Html(name)", Some(&empty)), None);
2380    }
2381
2382    #[test]
2383    fn classify_rust_actix_runtime_rules() {
2384        use crate::utils::project::{DetectedFramework, FrameworkContext};
2385
2386        let ctx = FrameworkContext {
2387            frameworks: vec![DetectedFramework::ActixWeb],
2388            inspected_langs: std::collections::HashSet::new(),
2389        };
2390        let extras = rust::framework_rules(&ctx);
2391
2392        assert_eq!(
2393            classify("rust", "web::Json<String>", Some(&extras)),
2394            Some(DataLabel::Source(Cap::all())),
2395        );
2396        assert_eq!(
2397            classify("rust", "HttpRequest.match_info()", Some(&extras)),
2398            Some(DataLabel::Source(Cap::all())),
2399        );
2400        assert_eq!(
2401            classify("rust", "HttpResponse.body(payload)", Some(&extras)),
2402            Some(DataLabel::Sink(Cap::HTML_ESCAPE)),
2403        );
2404    }
2405
2406    #[test]
2407    fn classify_rust_rocket_runtime_rules() {
2408        use crate::utils::project::{DetectedFramework, FrameworkContext};
2409
2410        let ctx = FrameworkContext {
2411            frameworks: vec![DetectedFramework::Rocket],
2412            inspected_langs: std::collections::HashSet::new(),
2413        };
2414        let extras = rust::framework_rules(&ctx);
2415
2416        assert_eq!(
2417            classify("rust", "CookieJar.get_private(\"sid\")", Some(&extras)),
2418            Some(DataLabel::Source(Cap::all())),
2419        );
2420        assert_eq!(
2421            classify("rust", "content::RawHtml(name)", Some(&extras)),
2422            Some(DataLabel::Sink(Cap::HTML_ESCAPE)),
2423        );
2424        assert_eq!(
2425            classify("rust", "Redirect::to(next)", Some(&extras)),
2426            Some(DataLabel::Sink(Cap::SSRF)),
2427        );
2428    }
2429}