candor_classify/lib.rs
1//! candor-classify — the curated effect classifier (crate+path -> effect), extracted to a STABLE
2//! crate so both the nightly `rustc_private` lint AND a stable backend share ONE source of truth
3//! (no drift). Pure string logic; no rustc internals. The effect vocabulary lives in candor-report.
4
5use candor_report::EFFECTS;
6
7/// The canonical CANDOR_POLICY DSL parser (SPEC §6.2), shared by the nightly gate and candor-query.
8pub mod policy;
9
/// Look a callee up in the project-supplied rule table.
///
/// Each entry of `extra` is `(effect, is_crate, prefix)`: when `is_crate` is true the prefix is
/// tested against `crate_name`, otherwise against `path`. Rules are tried in slice order and the
/// effect of the FIRST rule whose prefix matches is returned; `None` means no rule matched.
///
/// This is the fallback meant to be consulted only after the built-in `classify` has returned
/// `None` — this function does not call `classify` itself, so that sequencing is the caller's job.
pub fn classify_extra(
    crate_name: &str,
    path: &str,
    extra: &[(&'static str, bool, String)],
) -> Option<&'static str> {
    extra
        .iter()
        .find(|rule| {
            // The flag selects which string this rule's prefix is anchored to.
            let subject = if rule.1 { crate_name } else { path };
            subject.starts_with(rule.2.as_str())
        })
        .map(|rule| rule.0)
}
24
/// Third-party crates that `classify` has effect rules for, matched by exact crate name. Together
/// with `CALIBRATED_PREFIXES` (the crate-name PREFIXES it recognizes) this is the single source
/// of truth for "what candor knows": it is emitted beside the JSON report
/// (`<prefix>.calibrated.json`) so the Claude Code receipt's coverage check reads candor's real
/// coverage instead of a hand-copied list.
///
/// Keep in lockstep with `classify` — the `db_crates_are_calibrated` and
/// `calibrated_crates_are_live` tests (in this crate's `tests` module) enforce both directions.
/// The length is part of the array type, so adding or removing an entry also means updating `59`.
pub const CALIBRATED_CRATES: [&str; 59] = [
    // Network. Less obvious members: aws_config resolves credentials over the network on
    // `.load()`; git2's remote ops (fetch/push/connect) contact the network; async_net is smol's
    // net layer; pnet is raw L2/L3 packet capture.
    "reqwest",
    "isahc",
    "ureq",
    "curl",
    "aws_config",
    "git2",
    "tokio_tcp",
    "tokio_udp",
    "async_net",
    "async_nats",
    "lapin",
    "lettre",
    "tungstenite",
    "elasticsearch",
    "tonic",
    "rdkafka",
    "pnet",
    // Directory traversal: ignore is the gitignore-aware walker (powers ripgrep/fd); its walk
    // executors are Fs.
    "ignore",
    // Filesystem watching: notify wraps inotify/FSEvents/kqueue (powers watchexec/cargo-watch).
    "notify",
    // Database clients — mirrors `DB_CRATES`.
    "sqlx",
    "rusqlite",
    "postgres",
    "tokio_postgres",
    "diesel",
    "redis",
    "mongodb",
    "mysql",
    "mysql_async",
    "sea_orm",
    "deadpool_postgres",
    // Filesystem group (fs_err = std::fs wrapper; async_fs = smol; tempfile; glob).
    "memmap2",
    "fs_err",
    "async_fs",
    "tempfile",
    "glob",
    // Entropy.
    "rand",
    "getrandom",
    "fastrand",
    // Entropy, password-hashing tier (salt mints + bcrypt's internal salt) + the OsRng source.
    "argon2",
    "bcrypt",
    "scrypt",
    "pbkdf2",
    "password_hash",
    "rand_core",
    // Subprocess group (async_process = smol; duct).
    "portable_pty",
    "async_process",
    "duct",
    // Env.
    "dotenvy",
    "dotenv",
    // Clock / log / clipboard.
    "chrono",
    "time",
    "tracing",
    "log",
    "arboard",
    // Compiler diagnostic emission (a dylint lint's output) — see the Log rules in `classify`.
    "rustc_lint",
    "rustc_errors",
    // Raw syscalls via FFI: the syscall-name table that lights up the FFI-thin tier. nix is
    // routed through the same table by leaf name, so a consumer of nix is covered without nix's
    // own source.
    "libc",
    "nix",
    "rustix",
];
58
/// Crate-name PREFIXES that `classify` recognizes with `starts_with` instead of an exact
/// crate-name comparison: the `aws_sdk_*` / `aws_smithy*` crates and the `cap_*` (cap-std) family.
pub const CALIBRATED_PREFIXES: [&str; 3] = [
    "aws_sdk_",
    "aws_smithy",
    "cap_",
];
60
/// Crates that `classify` covers through PATH prefixes instead of crate-name equality: it
/// recognises their effectful modules (e.g. `tokio::net::`, `async_std::fs::`, `mio::net::`).
/// For that reason they are absent from `CALIBRATED_CRATES`, which the liveness test probes by
/// crate name. The coverage check must nevertheless count them as *covered*; otherwise the most
/// common async crates would be mislabelled as blind spots.
pub const PATH_CALIBRATED_CRATES: [&str; 3] = [
    "tokio",
    "async_std",
    "mio",
];
66
/// Representative path tails probed by the `calibrated_crates_are_live` liveness test. Each tail
/// is appended to a crate name, and at least one must match for every `CALIBRATED_CRATES` entry —
/// an entry that no tail lights up is dead.
///
/// Exported as ONE source of truth because the nightly lint crate (`src/lib.rs`) runs the SAME
/// liveness test. When the two probe lists were duplicated they drifted: a rule keyed on a
/// distinctive tail (pnet `::datalink::channel`, ignore `::WalkBuilder::build_parallel`, notify
/// `::RecommendedWatcher::new`) added to only one list silently broke the other crate's
/// `cargo test`.
pub const CALIBRATION_PROBE_TAILS: &[&str] = &[
    "::X::send",
    "::X::execute",
    "::X::call",
    "::X::query",
    "::X::fetch_one",
    "::Remote::fetch",
    "::datalink::channel",
    "::WalkBuilder::build_parallel",
    "::RecommendedWatcher::new",
    "::X::connect",
    "::Utc::now",
    "::X::load",
    "::__private_api::log",
    "::tempfile",
    "::glob",
    "::X::run",
    "::dotenv",
    "::random",
    "::emit",
    "::X::emit_span_lint",
    "::X::anything",
    "::SaltString::generate",
    "::hash",
    "::OsRng::fill_bytes",
    // Verb-precise crates: their whole-crate rules were narrowed to the effectful surface (pure
    // accessors/ctors/data-types now return None), so the probe has to name an EFFECTFUL path.
    "::Mmap::map",
    "::event",
    "::u32",
    "::Clipboard::get_text",
    "::spawn_command",
];
83
/// Database client crates whose execution verbs count as I/O (handled by the DB branch of
/// `classify`). Kept at module level so the `db_crates_are_calibrated` test can enforce
/// `DB_CRATES ⊆ CALIBRATED_CRATES`.
pub const DB_CRATES: [&str; 11] = [
    "sqlx",
    "rusqlite",
    "postgres",
    "tokio_postgres",
    "diesel",
    "redis",
    "mongodb",
    "mysql",
    "mysql_async",
    "sea_orm",
    "deadpool_postgres",
];
90
/// Leaf names of pure file-descriptor *ownership-transfer* calls, which `classify` exempts before
/// any prefix rule runs. They either
///
/// * ADOPT an already-open descriptor (`from_raw_fd` / `from_raw_socket` / `from_raw_handle`),
/// * EXTRACT or BORROW one (`into_raw_*`, `as_raw_*`), or
/// * UNWRAP an async wrapper back to its std type (`into_std`)
///
/// — none of them issues a syscall or performs I/O. Calling a PURE function effectful is candor's
/// cardinal sin, and these leaves collide with the coarse std-type PREFIX rules
/// (`std::net::TcpStream` / `std::fs::File` / `std::os::unix::net` → Net/Fs/Ipc) even though the
/// descriptor was opened ELSEWHERE. The portable_pty/async_process Exec rule already exempts
/// `from_raw_fd`; this list generalises that carve-out across the net/fs/ipc prefix rules.
/// (Found by a real-world sweep of tokio: `TcpStream::into_std`, `*::from_raw_fd` and
/// `*::as_raw_fd` all fabricated Net/Fs/Ipc.)
const PURE_FD_TRANSFER: &[&str] = &[
    // adopt
    "from_raw_fd",
    "from_raw_socket",
    "from_raw_handle",
    // extract
    "into_raw_fd",
    "into_raw_socket",
    "into_raw_handle",
    // borrow
    "as_raw_fd",
    "as_raw_socket",
    "as_raw_handle",
    // unwrap an async wrapper
    "into_std",
    // `SocketAddr::from_pathname` (std/async-std unix net) only builds an address STRUCT from a
    // path; no socket is opened. Without this entry the `std::os::unix::net` prefix rule would
    // fabricate Ipc on it. (Found sweeping socket2: `SockAddr::as_unix` → `from_pathname`
    // reported Ipc.)
    "from_pathname",
];
111
112/// Classify a resolved callee by the crate it belongs to and its full path.
113pub fn classify(crate_name: &str, path: &str) -> Option<&'static str> {
114 // Pure fd ownership-transfer/extraction leaves are never an effect, regardless of which std I/O
115 // type they hang off — exempt them BEFORE the coarse prefix rules can fabricate Net/Fs/Ipc.
116 if PURE_FD_TRANSFER.contains(&path.rsplit("::").next().unwrap_or(path)) {
117 return None;
118 }
119 if crate_name.starts_with("aws_sdk_") || crate_name.starts_with("aws_smithy") {
120 // Only request dispatch is network I/O; builder setters/accessors are pure.
121 if path.ends_with("::send") || path.ends_with("::send_with") {
122 return Some("Net");
123 }
124 return None;
125 }
126 // aws-config resolves credentials/region on `.load()` — it reaches the IMDS metadata
127 // endpoint / STS over the network (and reads ~/.aws + env). Builders (`defaults()`,
128 // `SdkConfig::builder()`, `BehaviorVersion::latest()`) are pure; the `load` is the I/O.
129 // (Found hardening on a real app, ebman: `builder.load().await` was classified pure.)
130 if crate_name == "aws_config" {
131 if path.ends_with("::load") || path.ends_with("::load_defaults") {
132 return Some("Net");
133 }
134 return None;
135 }
136 // git2 (libgit2 FFI): remote operations contact the network; everything else is local
137 // to the .git directory. Match the remote verbs precisely — NOT bare `::clone`, which is
138 // the `Clone`-trait dup of a `Remote` handle (pure), not `Repository::clone`. (Found
139 // hardening on gitui: `remote.fetch`/`remote.push` were classified network-free — a git
140 // client reporting it makes no network calls.)
141 if crate_name == "git2" {
142 if path.ends_with("::fetch")
143 || path.ends_with("::push")
144 || path.ends_with("::download")
145 || path.ends_with("::connect")
146 || path.ends_with("::connect_auth")
147 || path.ends_with("::ls")
148 || path.ends_with("::upload")
149 {
150 return Some("Net");
151 }
152 return None;
153 }
154 // libc — raw syscalls via FFI. The FFI-thin tier (nix, and the syscall layer beneath rusqlite/git2)
155 // is invisible to a name classifier unless we model libc directly: a 35-crate calibration
156 // (eval/calibration) showed nix reporting ZERO library effects because every wrapper bottoms out in
157 // an unrecognised `libc::*` call. Classify by syscall name, but ONLY the UNAMBIGUOUS ones — the
158 // socket family is Net, path/dir syscalls are Fs, spawn/exec/wait is Exec, SysV/pipe IPC is Ipc,
159 // env/clock/entropy each their own. We deliberately SKIP the generic file-descriptor ops
160 // (read/write/close/lseek/dup/fcntl/ioctl/poll/select/epoll*/mmap): they operate on ANY fd — file,
161 // socket, or pipe — so a fixed label would mis-categorise as often as it helps. An honest
162 // no-classify (under-report) beats emitting the WRONG effect. Pure conversions (htons/inet_pton/
163 // gmtime) are also skipped.
164 //
165 // `nix` (the idiomatic SAFE libc wrapper, in ~every Rust systems/CLI crate) is routed through the
166 // SAME table: its functions keep the syscall leaf name (`nix::fcntl::open`, `nix::sys::socket::connect`,
167 // `nix::unistd::execvp`). Without this, a CONSUMER of nix analysed without nix's own source (the
168 // stable scanner, single-crate) sees `nix::*` cross-crate and under-reports — serialport-rs opens its
169 // device via `nix::fcntl::open` and reported ZERO Fs. The nightly lint reaches `libc::*` THROUGH nix's
170 // body; this gives the scanner the same coverage directly. (Found sweeping serialport-rs.)
171 // `rustix` is the same shape as nix but does RAW syscalls (no libc underneath), so its functions MUST
172 // be classified directly. Its leaf names are the syscall names too (`rustix::time::clock_settime`,
173 // `rustix::fs::mkfifoat`/`symlink`/`stat`, `rustix::net::connect`) — route it through the same table.
174 // The rustix-specific `*at`/variant leaves it doesn't share with libc just under-report (the safe
175 // direction). VALIDATED, not speculative: coreutils' `date` reads/sets the clock via
176 // `rustix::time::clock_getres`/`clock_settime` and reported Clock=0; the file I/O that goes through
177 // std::fs was already correct, which is why only the rustix-only effects (Clock/Ipc) were missing.
178 if crate_name == "libc" || crate_name == "nix" || crate_name == "rustix" {
179 let f = path.rsplit("::").next().unwrap_or(path);
180 // path / directory / metadata syscalls (incl. *64 and *at variants)
181 const FS: &[&str] = &[
182 "open", "open64", "openat", "openat2", "creat", "creat64", "stat", "stat64", "lstat",
183 "lstat64", "fstatat", "fstatat64", "newfstatat", "statx", "access", "faccessat",
184 "faccessat2", "mkdir", "mkdirat", "rmdir", "unlink", "unlinkat", "rename", "renameat",
185 "renameat2", "link", "linkat", "symlink", "symlinkat", "readlink", "readlinkat", "chmod",
186 "fchmodat", "chown", "lchown", "fchownat", "truncate", "truncate64", "ftruncate",
187 "ftruncate64", "opendir", "fdopendir", "readdir", "readdir64", "readdir_r", "closedir",
188 "rewinddir", "seekdir", "telldir", "scandir", "mkstemp", "mkstemps", "mkostemp", "mkdtemp",
189 "mknod", "mknodat", "chdir", "fchdir", "getcwd", "get_current_dir_name", "chroot",
190 "pivot_root", "statfs", "statfs64", "fstatfs", "fstatfs64", "statvfs", "fstatvfs", "mount",
191 "umount", "umount2", "fsync", "fdatasync", "sync", "syncfs", "sync_file_range", "fallocate",
192 "posix_fallocate", "posix_fadvise", "sendfile", "sendfile64", "copy_file_range", "flock",
193 "getdents", "getdents64", "utime", "utimes", "lutimes", "futimens", "utimensat", "futimesat",
194 "realpath",
195 ];
196 // socket family — these operate only on sockets, so Net is unambiguous (AF_UNIX domain isn't
197 // visible at the call, so a Unix socket reads as Net rather than Ipc; acceptable over-general).
198 const NET: &[&str] = &[
199 "socket", "setsockopt", "getsockopt", "bind", "listen", "accept", "accept4", "connect",
200 "shutdown", "send", "sendto", "sendmsg", "sendmmsg", "recv", "recvfrom", "recvmsg",
201 "recvmmsg", "getpeername", "getsockname", "getaddrinfo", "freeaddrinfo", "getnameinfo",
202 ];
203 // process creation / replacement / reaping
204 const EXEC: &[&str] = &[
205 "fork", "vfork", "clone", "clone3", "execl", "execlp", "execle", "execv", "execvp",
206 "execvpe", "execve", "execveat", "fexecve", "posix_spawn", "posix_spawnp", "system",
207 "popen", "pclose", "wait", "waitpid", "wait3", "wait4", "waitid",
208 ];
209 // pipes / FIFOs / SysV + POSIX message queues, semaphores, shared memory; socketpair (AF_UNIX)
210 const IPC: &[&str] = &[
211 "pipe", "pipe2", "mkfifo", "mkfifoat", "socketpair", "msgget", "msgsnd", "msgrcv", "msgctl",
212 "semget", "semop", "semtimedop", "semctl", "shmget", "shmat", "shmdt", "shmctl", "mq_open",
213 "mq_send", "mq_receive", "mq_timedsend", "mq_timedreceive", "mq_close", "mq_unlink",
214 ];
215 const ENV: &[&str] = &["getenv", "secure_getenv", "setenv", "putenv", "unsetenv", "clearenv"];
216 const CLOCK: &[&str] = &[
217 "time", "gettimeofday", "clock_gettime", "clock_getres", "nanosleep", "clock_nanosleep",
218 // SETTING the system clock is a clock effect too (was unclassified — found on coreutils `date`,
219 // which sets it via `clock_settime`).
220 "clock_settime", "settimeofday", "stime", "adjtime", "adjtimex", "clock_adjtime",
221 ];
222 const RAND: &[&str] = &["getrandom", "getentropy", "arc4random", "arc4random_buf", "arc4random_uniform"];
223 if FS.contains(&f) {
224 return Some("Fs");
225 }
226 if NET.contains(&f) {
227 return Some("Net");
228 }
229 if EXEC.contains(&f) {
230 return Some("Exec");
231 }
232 if IPC.contains(&f) {
233 return Some("Ipc");
234 }
235 if ENV.contains(&f) {
236 return Some("Env");
237 }
238 if CLOCK.contains(&f) {
239 return Some("Clock");
240 }
241 if RAND.contains(&f) {
242 return Some("Rand");
243 }
244 return None;
245 }
246 // C-library FFI bindings: libsqlite3 (under rusqlite) and libgit2 (under git2). Like the libc tier,
247 // these crates are thin Rust over a C library, so their real I/O is invisible until the C entry
248 // points are named. Match by the DISTINCTIVE C function name (`sqlite3_*` / `git_*`) via the call's
249 // LEAF — independent of the binding crate's alias: rusqlite calls `ffi::sqlite3_step`, git2 calls
250 // `raw::git_remote_fetch`, and the nightly lint resolves the same to `libsqlite3_sys`/`libgit2_sys`;
251 // all spellings share the leaf. Only the I/O-performing entry points are listed — the in-memory
252 // accessors (`sqlite3_bind_*`/`sqlite3_column_*`, `git_*_oid`/strarray/options builders) stay pure,
253 // so a non-listed `sqlite3_`/`git_` leaf returns None (under-report, never a wrong effect). Calibrated
254 // + validated against rusqlite 0.39 / git2 0.20 source (eval/calibration).
255 {
256 let leaf = path.rsplit("::").next().unwrap_or(path);
257 if let Some(rest) = leaf.strip_prefix("sqlite3_") {
258 let _ = rest;
259 // SQLite C API operations that touch the database (open/exec/step/prepare/backup/blob/wal).
260 const DB: &[&str] = &[
261 "sqlite3_open", "sqlite3_open_v2", "sqlite3_open16", "sqlite3_close", "sqlite3_close_v2",
262 "sqlite3_exec", "sqlite3_step", "sqlite3_prepare", "sqlite3_prepare_v2",
263 "sqlite3_prepare_v3", "sqlite3_prepare16", "sqlite3_prepare16_v2", "sqlite3_prepare16_v3",
264 "sqlite3_get_table", "sqlite3_backup_init", "sqlite3_backup_step", "sqlite3_backup_finish",
265 "sqlite3_blob_open", "sqlite3_blob_read", "sqlite3_blob_write", "sqlite3_blob_reopen",
266 "sqlite3_load_extension", "sqlite3_wal_checkpoint", "sqlite3_wal_checkpoint_v2",
267 ];
268 return DB.contains(&leaf).then_some("Db");
269 }
270 if leaf.starts_with("git_") {
271 // libgit2: remote/transport operations contact the network … (incl. submodule clone/update,
272 // which `git_clone`/fetch the subrepo over its remote — `allow_fetch` defaults on; an A/B on
273 // git2 0.20 caught `Submodule::update`/`clone` reporting no `Net`).
274 const NET: &[&str] = &[
275 "git_clone", "git_remote_connect", "git_remote_connect_ext", "git_remote_fetch",
276 "git_remote_download", "git_remote_upload", "git_remote_push", "git_remote_ls",
277 "git_submodule_clone", "git_submodule_update",
278 ];
279 // … and repository/index/odb/checkout/ref/config operations touch the on-disk .git store.
280 const FS: &[&str] = &[
281 "git_repository_open", "git_repository_open_ext", "git_repository_open_bare",
282 "git_repository_init", "git_repository_init_ext", "git_repository_discover",
283 "git_checkout_tree", "git_checkout_head", "git_checkout_index", "git_index_read",
284 "git_index_write", "git_index_write_tree", "git_index_write_tree_to",
285 "git_index_add_bypath", "git_index_add_all", "git_odb_open", "git_odb_read",
286 "git_odb_write", "git_odb_open_wstream", "git_odb_open_rstream",
287 "git_blob_create_fromdisk", "git_blob_create_fromworkdir", "git_blob_create_from_disk",
288 "git_blob_create_from_workdir", "git_blob_create_from_stream", "git_commit_create",
289 "git_commit_create_v", "git_reference_create", "git_reference_set_target",
290 "git_reference_delete", "git_config_open_default", "git_config_open_ondisk",
291 "git_config_add_file_ondisk", "git_tag_create", "git_treebuilder_write",
292 "git_packbuilder_write",
293 ];
294 if NET.contains(&leaf) {
295 return Some("Net");
296 }
297 if FS.contains(&leaf) {
298 return Some("Fs");
299 }
300 return None;
301 }
302 if leaf.starts_with("curl_") {
303 // libcurl (under the `curl` crate, called `curl_sys::curl_*`). Only the entry points that
304 // PERFORM network I/O: the blocking transfer (`curl_easy_perform`), raw socket send/recv,
305 // the HTTP/2 keepalive PING (`upkeep`), and the multi-interface transfer pumps. The large
306 // pure surface (setopt/init/cleanup/reset/getinfo/escape/multi_add_handle/fdset/info_read)
307 // stays unclassified, as do `curl_multi_wait`/`poll` (readiness WAIT on sockets, no payload —
308 // the loop's `perform` is the tagged boundary, per the I/O-boundary principle). An A/B on
309 // curl 0.4 caught the whole crate reporting ZERO Net (`Easy::perform` read as pure).
310 const NET: &[&str] = &[
311 "curl_easy_perform", "curl_easy_send", "curl_easy_recv", "curl_easy_upkeep",
312 "curl_multi_perform", "curl_multi_socket_action",
313 ];
314 return NET.contains(&leaf).then_some("Net");
315 }
316 if let Some(op) = leaf.strip_prefix("SSL_") {
317 // OpenSSL (libssl, under the `openssl`/`native-tls` crates, called `ffi::SSL_*`). The TLS
318 // handshake and record I/O run over the peer socket -> Net. Unlike libc read/write, an SSL_*
319 // op is ~always over a network BIO (the rare memory-BIO/sans-IO case is the honest exception
320 // we accept). The crypto surface (EVP_*/SHA*/AES*) and pure setup (SSL_CTX_new/SSL_set_fd) are
321 // NOT here; `BIO_*` is skipped (a BIO may be memory or socket). Validated vs openssl 0.9 source.
322 const SSL_NET: &[&str] = &[
323 "connect", "accept", "do_handshake", "read", "read_ex", "write", "write_ex", "peek",
324 "peek_ex", "shutdown",
325 ];
326 return SSL_NET.contains(&op).then_some("Net");
327 }
328 }
329 // HTTP clients use the same builder pattern as the AWS SDK: only the dispatch is
330 // I/O. (Found by the eval: ebman's reqwest calls to the Anthropic API + webhooks
331 // were silently classified network-free because reqwest wasn't recognized.)
332 if crate_name == "reqwest" || crate_name == "isahc" {
333 // The builder chain is pure; the dispatch (`::send`/`::execute`) is the I/O. PLUS the one-shot
334 // CONVENIENCE functions `reqwest::get` / `reqwest::blocking::get` / `isahc::get`, which send
335 // immediately — they're not the `Client::get` builder (a different path, `reqwest::Client::get`),
336 // so an exact match avoids false-positiving the builder. (Found running on `xh`: a one-shot
337 // `reqwest::get(url)` was classified network-free.)
338 if path.ends_with("::send")
339 || path.ends_with("::execute")
340 || path == "reqwest::get"
341 || path == "reqwest::blocking::get"
342 || path == "isahc::get"
343 {
344 return Some("Net");
345 }
346 return None;
347 }
348 if crate_name == "ureq" && path.ends_with("::call") {
349 return Some("Net");
350 }
351 // The `curl` crate (libcurl's safe binding — cargo's own HTTP client): the dispatch verbs are
352 // `perform` (Easy/Easy2/Transfer/Multi), raw-socket `send`/`recv`, the keepalive `upkeep`, and the
353 // multi-interface `action` (socket_action). The big setopt-style builder surface stays pure.
354 // `Multi::timeout` is deliberately NOT matched: `Easy::timeout` is a pure CURLOPT_TIMEOUT setter
355 // sharing the leaf — an under-report on the rare event-loop kick beats mis-tagging every consumer
356 // that sets a timeout. (Consumer-side companion to the curl_* FFI tier, same A/B finding.)
357 if crate_name == "curl"
358 && (path.ends_with("::perform")
359 || path.ends_with("::send")
360 || path.ends_with("::recv")
361 || path.ends_with("::upkeep")
362 || path.ends_with("::action"))
363 {
364 return Some("Net");
365 }
366 // Message-queue clients fully encapsulate the socket (the underlying tokio::net lives
367 // inside the crate, unseen), so a user's connect/publish/consume calls ARE the I/O
368 // boundary — to a remote broker, hence Net. Match the broker round-trip verbs (snake_case
369 // methods); the CamelCase option/property builders stay pure. (Found hardening on consumer
370 // apps: lapin `basic_publish`/`queue_declare` and async-nats `publish`/`subscribe` were
371 // classified pure — a message-queue client reporting no I/O.)
372 if crate_name == "async_nats" {
373 if path.ends_with("::connect")
374 || path.contains("::publish")
375 || path.ends_with("::subscribe")
376 || path.ends_with("::queue_subscribe")
377 || path.contains("::request")
378 || path.ends_with("::flush")
379 {
380 return Some("Net");
381 }
382 return None;
383 }
384 if crate_name == "lapin" {
385 if path.ends_with("::connect")
386 || path.ends_with("::create_channel")
387 || path.contains("::basic_")
388 || path.contains("::queue_")
389 || path.contains("::exchange_")
390 || path.contains("::tx_")
391 || path.ends_with("::confirm_select")
392 || path.ends_with("::close")
393 {
394 return Some("Net");
395 }
396 return None;
397 }
398 // SMTP email — lettre's `Transport::send` is the network dispatch; Message building is
399 // pure. (Found hardening on a lettre consumer: `mailer.send(&email)` classified pure.)
400 if crate_name == "lettre" {
401 if path.ends_with("::send") || path.ends_with("::send_raw") {
402 return Some("Net");
403 }
404 return None;
405 }
406 // WebSockets — tungstenite (the modern successor to the old `websocket` crate). connect
407 // and the socket read/write/send are network; Message constructors are pure. (Found on a
408 // tungstenite consumer: connect + send + read classified pure.)
409 if crate_name == "tungstenite" {
410 if path.ends_with("::connect")
411 || path.ends_with("::read")
412 || path.ends_with("::write")
413 || path.ends_with("::send")
414 || path.ends_with("::close")
415 || path.ends_with("::flush")
416 || path.ends_with("::read_message")
417 || path.ends_with("::write_message")
418 {
419 return Some("Net");
420 }
421 return None;
422 }
423 // elasticsearch: request builders are pure; only the `.send()` dispatch is HTTP I/O
424 // (same shape as reqwest / the AWS SDK). (Found on an elasticsearch consumer.)
425 if crate_name == "elasticsearch" && path.ends_with("::send") {
426 return Some("Net");
427 }
428 // gRPC — tonic. The transport connect and the Grpc client RPC dispatch are network;
429 // codecs and request/response wrappers are pure. (connect repro-confirmed on a consumer;
430 // the unary/streaming RPC verbs are from the tonic::client::Grpc API.)
431 if crate_name == "tonic" {
432 if path.ends_with("::connect")
433 || path.ends_with("::unary")
434 || path.ends_with("::server_streaming")
435 || path.ends_with("::client_streaming")
436 || path.ends_with("::streaming")
437 {
438 return Some("Net");
439 }
440 return None;
441 }
442 // Kafka — rdkafka (FFI to librdkafka). Producer send + consumer poll/recv/subscribe/
443 // commit are network round-trips to the brokers. (API-calibrated + unit-tested; a real
444 // repro needs librdkafka/cmake, deferred.)
445 if crate_name == "rdkafka" {
446 if path.ends_with("::send")
447 || path.ends_with("::send_result")
448 || path.ends_with("::recv")
449 || path.ends_with("::poll")
450 || path.ends_with("::subscribe")
451 || path.ends_with("::commit")
452 || path.ends_with("::commit_message")
453 || path.ends_with("::commit_consumer_state")
454 || path.ends_with("::store_offset")
455 || path.ends_with("::seek")
456 || path.ends_with("::fetch_metadata")
457 || path.ends_with("::fetch_watermarks")
458 || path.ends_with("::flush")
459 {
460 return Some("Net");
461 }
462 return None;
463 }
464 // cap-std: capability-oriented std. I/O goes *through* a held capability handle
465 // (Dir/Pool/Clock/...), so these calls ARE the effect. Recognising them means a
466 // cap-std project's real I/O is detected and matches the capability it declared
467 // (via `declared_caps`/`capstd_cap`) — conformance against unforgeable capabilities.
468 if crate_name.starts_with("cap_") {
469 if path.contains("::net::Unix") || path.contains("::os::") {
470 return Some("Ipc");
471 }
472 if path.contains("::net") {
473 return Some("Net");
474 }
475 if path.contains("::time") {
476 return Some("Clock");
477 }
478 if path.contains("::fs") || crate_name == "cap_tempfile" || crate_name == "cap_directories" {
479 return Some("Fs");
480 }
481 return None;
482 }
483 // Local IPC (Unix-domain sockets) is I/O but not *network* — keep it distinct so
484 // CANDOR_NO_AMBIENT and audits don't conflate it with internet access. async-std puts its
485 // Unix sockets under `os::unix::net` (mirroring std); async-net (smol's net layer) under
486 // `unix`.
487 if path.starts_with("tokio::net::Unix")
488 || path.starts_with("std::os::unix::net")
489 || path.starts_with("async_std::os::unix::net")
490 || path.starts_with("async_net::unix")
491 {
492 return Some("Ipc");
493 }
494 // Raw packet capture / raw sockets — libpnet (the dominant low-level networking crate; powers
495 // bandwhich, sniffers, custom-protocol tools). `datalink::channel` opens an L2 socket and
496 // `transport::transport_channel` an L3/L4 raw socket — both ARE network I/O. Packet construction
497 // (pnet_packet / pnet_base, MacAddr, Ethernet frames…) is pure and stays unclassified. The actual
498 // frame read/write happens via methods on the returned Sender/Receiver (trait-object dispatch the
499 // syntactic backend can't resolve), so the channel-open call is the precise Net boundary. (Found
500 // scanning bandwhich — a packet sniffer — which reported Net 0.)
501 if crate_name == "pnet" || crate_name == "pnet_datalink" || crate_name == "pnet_transport" {
502 if path.ends_with("::channel") || path.ends_with("::transport_channel") {
503 return Some("Net");
504 }
505 return None;
506 }
507 // Directory traversal — `ignore` (BurntSushi's gitignore-aware walker; powers ripgrep, fd). The walk
508 // EXECUTORS read the directory tree from disk = Fs. Type-precise on purpose: the configuration builders
509 // (`OverrideBuilder::build`, `GitignoreBuilder::build`, the `WalkBuilder` setters) and `DirEntry`
510 // accessors are PURE — only `WalkBuilder::build`/`build_parallel` (which kick off the walk) and
511 // `WalkParallel::run` (which drives it) touch the filesystem. A bare `build` would wrongly flag the
512 // config builders. (Found scanning fd — a file finder — which reported Fs 2: its own `fs::read_dir`
513 // was caught, but the `ignore`-based traversal that IS fd was invisible cross-crate.)
514 if crate_name == "ignore" {
515 if path == "ignore::WalkBuilder::build"
516 || path == "ignore::WalkBuilder::build_parallel"
517 || path.ends_with("::WalkParallel::run")
518 {
519 return Some("Fs");
520 }
521 return None;
522 }
523 // Filesystem watching — `notify` (the de-facto fs-watch crate: watchexec, cargo-watch, mdbook). A
524 // watcher opens an OS notification handle (inotify / FSEvents / kqueue / ReadDirectoryChanges) and
525 // registers paths — observing filesystem state changes = Fs. The lifecycle boundary: any
526 // `*Watcher::new` constructor (RecommendedWatcher/PollWatcher/INotifyWatcher/FsEventWatcher/…), the
527 // `recommended_watcher` convenience fn, and the `watch`/`unwatch` registration verbs. `Config`/`Event`/
528 // `EventKind` data types stay pure. (Found scanning watchexec: its watcher-`create` read Fs 0.)
529 if crate_name == "notify" {
530 if path.ends_with("Watcher::new")
531 || path.ends_with("::recommended_watcher")
532 || path.ends_with("::watch")
533 || path.ends_with("::unwatch")
534 {
535 return Some("Fs");
536 }
537 return None;
538 }
539 // Raw sockets. Match the I/O *types* only — `std::net` also holds pure data types
540 // (SocketAddr, IpAddr, …) whose construction must NOT be flagged.
541 if path.starts_with("std::net::TcpStream")
542 || path.starts_with("std::net::TcpListener")
543 || path.starts_with("std::net::UdpSocket")
544 || path.starts_with("tokio::net::")
545 {
546 return Some("Net");
547 }
548 // Legacy tokio 0.1 socket crates — `tokio_tcp`/`tokio_udp` are *entirely* networking
549 // (no pure types to over-flag), so the whole crate is Net. (Found hardening on websocat,
550 // which is still on tokio 0.1: its `tokio_tcp::TcpStream::connect` was classified
551 // network-free — a network tool confidently reporting 0 Net.)
552 if matches!(crate_name, "tokio_tcp" | "tokio_udp") {
553 return Some("Net");
554 }
555 // The other async runtimes mirror tokio's module layout, and their `net` modules hold only
556 // socket I/O types (the pure `SocketAddr`/`IpAddr` are re-exports that resolve to `std::net`,
557 // so they're excluded by def-path). `mio` is the low-level non-blocking-socket layer under
558 // tokio/others; `async_net` is smol's net crate. Closes the async-std/smol/mio gap the
559 // tokio_tcp note flagged. (Calibrated by module structure — these crates ARE networking — not
560 // a live repro; the TCP/UDP types are defined in-crate so the def-path prefix is exact.)
561 if path.starts_with("async_std::net::")
562 || path.starts_with("mio::net::")
563 || crate_name == "async_net"
564 {
565 return Some("Net");
566 }
567 // Database clients. Like the AWS/HTTP builders, only the execution verbs are I/O;
568 // query *construction* is pure. Best-effort across crates (tune via CANDOR_CONFIG).
569 // Note: bare `::query` is deliberately omitted — it executes in postgres/rusqlite but
570 // only *builds* in sqlx, so including it would false-positive sqlx's `query()` builder.
571 if DB_CRATES.contains(&crate_name) {
572 // Postgres / SQLite-family clients: `query`/`batch_execute`/`prepare`/etc. ARE the
573 // execution (round-trips to the server). sqlx is the outlier where bare `query()`
574 // only BUILDS — it keeps the narrow set below. (Found by running on a real
575 // tokio-postgres app, pgman: candor had reported only 4 of ~20 DB call sites.)
576 if matches!(crate_name, "postgres" | "tokio_postgres" | "deadpool_postgres" | "rusqlite") {
577 const PG: [&str; 19] = [
578 "::query", "::query_one", "::query_opt", "::query_raw", "::execute",
579 "::batch_execute", "::simple_query", "::prepare", "::prepare_typed",
580 "::copy_in", "::copy_out", "::transaction", "::connect",
581 // rusqlite's dialect of the same verbs (a verb-probe found the CANONICAL rusqlite
582 // consumer API classifying pure): `query_row` is the one-row read, `query_map`/
583 // `query_and_then` the many-row reads, `execute_batch` is rusqlite's name for
584 // batch_execute, `prepare_cached` round-trips like prepare. `query_typed` is
585 // tokio_postgres 0.7.10+.
586 "::query_row", "::query_map", "::query_and_then", "::execute_batch",
587 "::prepare_cached", "::query_typed",
588 ];
589 if PG.iter().any(|v| path.ends_with(v)) {
590 return Some("Db");
591 }
592 // rusqlite only: opening the database IS the connection establishment (`Connection::
593 // open`/`open_in_memory`/`open_with_flags` — the embedded analog of `::connect`).
594 if crate_name == "rusqlite"
595 && (path.ends_with("::open")
596 || path.ends_with("::open_in_memory")
597 || path.ends_with("::open_with_flags"))
598 {
599 return Some("Db");
600 }
601 return None;
602 }
603 // redis: the way redis is ACTUALLY used is the high-level `Commands`/`AsyncCommands`
604 // traits (`con.get`/`set`/`hset`/`lpush`/…) — every method is a round-trip — plus
605 // connection establishment. The shared VERBS below only catch the low-level
606 // `cmd("GET").query(con)`, so without this a normal redis user's calls classify as
607 // PURE. (Found hardening on redis-rs: a fn doing `con.get`/`set` reported no effects.)
608 if crate_name == "redis"
609 && (path.contains("Commands::")
610 || path.contains("::get_connection")
611 || path.contains("::get_async_connection")
612 || path.contains("::get_multiplexed_async_connection")
613 // a live `ConnectionManager` round-trips (Db), but `ConnectionManagerConfig` is a pure
614 // in-memory builder (set_number_of_retries/set_max_delay) — exclude it (adversarial review).
615 || (path.contains("ConnectionManager") && !path.contains("ConnectionManagerConfig"))
616 || path.ends_with("::query")
617 || path.ends_with("::query_async")
618 || path.ends_with("::req_command")
619 || path.ends_with("::req_packed_command")
620 || path.ends_with("::req_packed_commands"))
621 {
622 return Some("Db");
623 }
624 // mongodb: a document-store API with none of the SQL verbs — the user calls
625 // `coll.find_one`/`insert_one`/`aggregate`/… and `Client::with_uri_str`. Without
626 // these a mongodb user's calls classify PURE. (Found hardening: a fn doing
627 // `find_one`+`insert_one` reported no effects.) Handle accessors (name/namespace)
628 // and option/doc builders don't match these verbs, so they stay pure.
629 if crate_name == "mongodb" {
630 const MONGO: [&str; 27] = [
631 "::with_uri_str", "::connect", "::find", "::find_one", "::insert_one",
632 "::insert_many", "::update_one", "::update_many", "::delete_one",
633 "::delete_many", "::replace_one", "::aggregate", "::count_documents",
634 "::estimated_document_count", "::count", "::distinct", "::run_command",
635 "::find_one_and_update", "::find_one_and_delete", "::find_one_and_replace",
636 "::list_collections", "::list_collection_names", "::list_databases",
637 "::list_database_names", "::create_collection", "::create_index", "::watch",
638 ];
639 if MONGO.iter().any(|v| path.ends_with(v)) {
640 return Some("Db");
641 }
642 return None;
643 }
644 // mysql / mysql_async: the `query`/`exec` families + `get_conn`/`ping` execute
645 // immediately — no build-then-execute split like sqlx, so matching `::query` is safe
646 // here. Same DB-verb-dialect gap class as redis/mongodb; calibrated from the Queryable
647 // API (unit-tested; a real-app repro is the remaining confirmation).
648 if matches!(crate_name, "mysql" | "mysql_async") {
649 const MY: [&str; 16] = [
650 "::query", "::query_first", "::query_iter", "::query_map", "::query_fold",
651 "::query_drop", "::exec", "::exec_first", "::exec_iter", "::exec_map",
652 "::exec_fold", "::exec_drop", "::exec_batch", "::prep", "::ping", "::get_conn",
653 ];
654 if MY.iter().any(|v| path.ends_with(v)) {
655 return Some("Db");
656 }
657 return None;
658 }
659 // sea_orm: an ORM whose execution is split from building (like sqlx). The query
660 // BUILDERS (`Entity::find`, `Entity::insert`) are pure; execution happens at `.all`/
661 // `.one`/`.count`/`.stream` and `Insert/Update/Delete::exec`. The write path via an
662 // ActiveModel (`model.insert(db)`) executes too — distinguished from the `EntityTrait`
663 // builder by the trait in the path (`ActiveModelTrait::`). (Found hardening on a
664 // sea_orm consumer app: `.all(db)` reads and `ActiveModel::insert` writes were pure.)
665 if crate_name == "sea_orm" {
666 if path.ends_with("::all")
667 || path.ends_with("::one")
668 || path.ends_with("::count")
669 || path.ends_with("::stream")
670 || path.ends_with("::exec")
671 || path.ends_with("::exec_with_returning")
672 || path.ends_with("::exec_without_returning")
673 || path.ends_with("::connect")
674 || path.ends_with("::execute")
675 || path.ends_with("::execute_unprepared")
676 || path.ends_with("::query_one")
677 || path.ends_with("::query_all")
678 || path.ends_with("::fetch_page")
679 || path.ends_with("::num_items")
680 || path.contains("ActiveModelTrait::")
681 {
682 return Some("Db");
683 }
684 return None;
685 }
686 // (Reached by sqlx + diesel — the build-vs-execute-split crates.) `first` is diesel's
687 // LIMIT-1 round trip and `load_iter` its 2.x streaming execution; `fetch_many` is sqlx's
688 // multi-result stream. All crate-gated, so a std `Vec::first` never resolves here.
689 const VERBS: [&str; 19] = [
690 "::execute", "::query_row", "::query_map", "::query_one", "::fetch_one",
691 "::fetch_all", "::fetch_optional", "::fetch", "::fetch_many", "::connect",
692 "::acquire", "::begin", "::commit", "::rollback", "::load", "::load_iter",
693 "::first", "::get_result", "::get_results",
694 ];
695 if VERBS.iter().any(|v| path.ends_with(v)) {
696 return Some("Db");
697 }
698 return None;
699 }
700 // std::path::Path / PathBuf STAT-family methods hit the filesystem (each is a stat/readlink/
701 // readdir syscall) — unlike the rest of the std::path surface, which is pure string manipulation
702 // (join/file_name/extension/parent/…). Verb-precise so the scanner's receiver inference can safely
703 // route a `path.symlink_metadata()` method call here. (A blackout screen caught gix-dir — an entire
704 // directory WALKER — reporting ZERO Fs because all its I/O is Path-method calls; same class as
705 // fd's residual `Path::symlink_metadata` under-report.)
706 if let Some(m) = path
707 .strip_prefix("std::path::Path::")
708 .or_else(|| path.strip_prefix("std::path::PathBuf::"))
709 {
710 const STAT: &[&str] = &[
711 "metadata", "symlink_metadata", "canonicalize", "read_link", "read_dir", "exists",
712 "try_exists", "is_file", "is_dir", "is_symlink",
713 ];
714 return STAT.contains(&m).then_some("Fs");
715 }
716 // Filesystem. `tokio::fs`/`async_std::fs` are the async mirrors of `std::fs`; `async_fs` is
717 // smol's fs crate; `fs_err` is a drop-in `std::fs` wrapper (its whole surface is fs I/O).
718 if path.starts_with("std::fs::")
719 || path.starts_with("tokio::fs::")
720 || path.starts_with("async_std::fs::")
721 || crate_name == "async_fs"
722 || crate_name == "fs_err"
723 {
724 return Some("Fs");
725 }
726 // memmap2: only `MmapOptions::map*` (and the in-place `Mmap::flush`/`make_*` protection
727 // changes / `remap`) actually issue the mmap/msync/mprotect/mremap syscall = Fs. The rest of the
728 // crate is PURE: `MmapOptions::new`/setters BUILD the request, and once a region is mapped, reads
729 // over it (`Mmap::len`/`is_empty`/`as_ptr`/`as_mut_ptr`/`deref` into the byte slice) are plain
730 // memory access with no syscall. Whole-crate Fs fabricated Fs on those reads (a `m.len()` the
731 // scanner's receiver inference routes to `memmap2::Mmap::len`). Match the syscall-issuing verbs;
732 // everything else returns None (pure). `map*` covers `map`/`map_mut`/`map_exec`/`map_copy`/
733 // `map_copy_read_only`/`map_raw`/`map_raw_read_only`/`map_anon`.
734 if crate_name == "memmap2" {
735 let m = path.rsplit("::").next().unwrap_or(path);
736 if m.starts_with("map")
737 || m == "flush"
738 || m == "flush_async"
739 || m == "flush_range"
740 || m == "flush_async_range"
741 || m == "remap"
742 || m.starts_with("make_")
743 || m == "advise"
744 || m == "advise_range"
745 || m == "lock"
746 || m == "unlock"
747 {
748 return Some("Fs");
749 }
750 return None;
751 }
752 // tempfile: creating a temp file/dir touches the disk. Match the create/persist verbs (the
753 // `Builder` setters — prefix/suffix/rand_bytes — stay pure). `persist`/`keep` rename/retain
754 // the file on disk; `close` removes it.
755 if crate_name == "tempfile"
756 && (path.ends_with("::tempfile")
757 || path.ends_with("::tempfile_in")
758 || path.ends_with("::tempdir")
759 || path.ends_with("::tempdir_in")
760 || path.ends_with("NamedTempFile::new")
761 || path.ends_with("NamedTempFile::new_in")
762 || path.ends_with("TempDir::new")
763 || path.ends_with("TempDir::new_in")
764 || path.ends_with("::persist")
765 || path.ends_with("::persist_noclobber")
766 || path.ends_with("::keep"))
767 {
768 return Some("Fs");
769 }
770 // glob: walks the filesystem to expand a pattern (the returned iterator reads directories).
771 // `Pattern::matches` is pure string matching — match only the directory-walking entry points.
772 if crate_name == "glob" && (path.ends_with("::glob") || path.ends_with("::glob_with")) {
773 return Some("Fs");
774 }
775 // Password-hashing / KDF crates — the entropy tier (the TS engine's CTA lesson: an invisible
776 // argon2 landed on exactly the call a security review cares about). In this engine's
777 // verb-precise style the ENTROPY is the salt mint: `SaltString::generate(OsRng)` in the
778 // password-hash API family, and bcrypt's `hash`/`hash_with_result` (salt minted internally).
779 // Verification and explicit-salt hashing are deterministic recomputation — pure. `rand_core`
780 // carries the OsRng source itself (otherwise the most common salt mint is invisible).
781 if matches!(crate_name, "argon2" | "scrypt" | "pbkdf2" | "password_hash") {
782 if path.contains("SaltString::generate") {
783 return Some("Rand");
784 }
785 return None;
786 }
787 if crate_name == "bcrypt" {
788 if path.ends_with("::hash") || path.ends_with("::hash_with_result") {
789 return Some("Rand");
790 }
791 return None;
792 }
793 if crate_name == "rand_core" {
794 if path.contains("OsRng")
795 || path.ends_with("::next_u32")
796 || path.ends_with("::next_u64")
797 || path.ends_with("::fill_bytes")
798 {
799 return Some("Rand");
800 }
801 return None;
802 }
803 // Randomness / entropy. `getrandom`/`fastrand` are effectful end-to-end. `rand` is NOT — it
804 // mixes entropy/generation (effectful) with *pure* distribution constructors (`Uniform::new`,
805 // `Normal::new`) and deterministic-seed constructors (`seed_from_u64`). Flagging the whole crate
806 // over-reported those as `Rand`; match only the calls that actually consume randomness — the
807 // entropy sources (`OsRng`, `thread_rng`/`rng`, `from_entropy`/`from_os_rng`) and the generation
808 // verbs (`gen*`/`random*`/`fill*`/`sample*`/`next_u*`). A `Uniform::new` is now correctly pure.
809 if crate_name == "getrandom" {
810 return Some("Rand");
811 }
812 // fastrand: like `rand`, it mixes entropy-consuming generation (effectful) with PURE deterministic
813 // pieces. `Rng::with_seed(42)` is a DETERMINISTIC seeded constructor (consumes no entropy — the same
814 // seed gives the same stream), and `Rng::fork`/`Rng::clone` just split/copy existing state. Those are
815 // PURE; whole-crate Rand fabricated Rand on them. The effect is the value-drawing methods (`u32`/
816 // `usize`/`bool`/`f64`/`char`/`alphanumeric`/`choice`/`choose_multiple`/`shuffle`/`fill`/the range
817 // forms) AND the entropy-seeded entry points: bare `Rng::new()` (seeds from the global entropy-backed
818 // generator), `fastrand::seed`, and the top-level `fastrand::u32(..)` free functions (which draw from
819 // the thread-local generator). `with_seed` is exempted explicitly; any other method on an `Rng`
820 // (i.e. a value draw) is Rand.
821 if crate_name == "fastrand" {
822 let m = path.rsplit("::").next().unwrap_or(path);
823 // Provably pure: deterministic seeded ctor + state split/copy.
824 if m == "with_seed" || m == "fork" || m == "clone" {
825 return None;
826 }
827 // Everything else fastrand exposes either draws a value or seeds from entropy → Rand. (The crate
828 // has no pure data types beyond the `Rng` handle itself, so a non-draw stray would have to be a
829 // method we don't recognise — keep the effect, the safe direction.)
830 return Some("Rand");
831 }
832 if crate_name == "rand" {
833 let rng_verb = path.ends_with("::gen")
834 || path.ends_with("::gen_range")
835 || path.ends_with("::gen_bool")
836 || path.ends_with("::gen_ratio")
837 || path.ends_with("::random")
838 || path.ends_with("::random_range")
839 || path.ends_with("::random_bool")
840 || path.ends_with("::random_ratio")
841 || path.ends_with("::random_iter") // rand 0.9 iterator generator
842 || path.ends_with("::gen_iter")
843 || path.ends_with("::fill")
844 || path.ends_with("::fill_bytes")
845 || path.ends_with("::try_fill")
846 || path.ends_with("::try_fill_bytes")
847 || path.ends_with("::sample")
848 || path.ends_with("::sample_iter")
849 || path.ends_with("::next_u32")
850 || path.ends_with("::next_u64")
851 || path.ends_with("::thread_rng")
852 || path.ends_with("::rng")
853 || path.ends_with("::from_entropy")
854 || path.ends_with("::from_os_rng");
855 // `OsRng` is the OS entropy SOURCE, but `clone`/`fork`/`default` just copy or construct the
856 // (zero-sized) handle and draw no entropy — pure, exactly like the `fastrand` arm's clone/fork
857 // exemption above. The actual draws (`fill_bytes`/`next_u*`/…) are caught by `rng_verb`. Without
858 // this exemption the blanket `contains("OsRng")` fabricated `Rand` on `OsRng::clone` (adversarial
859 // review: OsRng is a unit struct, cloning consumes nothing).
860 let m = path.rsplit("::").next().unwrap_or(path);
861 let os_rng = path.contains("OsRng") && !matches!(m, "clone" | "fork" | "default");
862 if rng_verb || os_rng {
863 return Some("Rand");
864 }
865 return None;
866 }
867 // Subprocess spawning. `tokio::process` is the async mirror of `std::process` — it exists
868 // only to spawn/control subprocesses (`Command`/`Child`, no pure data types like std's
869 // `Stdio`/`ExitStatus`/`exit`), so spawning through it is Exec just the same. Without this an
870 // async app's `tokio::process::Command::new(..).spawn()` classified pure — a silent under-report
871 // of subprocess execution, the dangerous direction (mirrors the tokio::fs/tokio::net coverage).
872 if path.starts_with("std::process::Command")
873 || path.starts_with("std::process::Child")
874 || path.starts_with("tokio::process::Command")
875 || path.starts_with("tokio::process::Child")
876 || path.starts_with("async_std::process::Command")
877 || path.starts_with("async_std::process::Child")
878 {
879 return Some("Exec");
880 }
881 // portable_pty / async_process are whole-crate Exec EXCEPT for the proven-pure surface they expose:
882 // the `CommandBuilder` GETTERS (`get_argv`/`get_cwd`/`get_env`/`as_unix_command_line`…) read back
883 // configuration, and the PURE DATA types (`PtySize::default`, `ExitStatus`/`Stdio`/`CommandBuilder`
884 // construction/setters). The earlier `is_cmd_naming_method` fix stopped the head-refinement LEAK, but
885 // the BASE Exec still fabricated on these accessors (a `cmd.get_cwd()` the scanner routes to
886 // `portable_pty::CommandBuilder::get_cwd`). Subtract the read-back getters and the obvious pure
887 // ctors/setters; the spawn/wait/exec surface (`spawn_command`/`openpty`/`wait`/`kill`/`exec`…) keeps
888 // Exec. SUBTRACT only what is provably pure — when unrecognised, KEEP Exec (the safe direction).
889 if crate_name == "async_process" || crate_name == "portable_pty" {
890 let m = path.rsplit("::").next().unwrap_or(path);
891 // configuration read-back getters — pure (no spawn).
892 if m.starts_with("get_") || m == "as_unix_command_line" {
893 return None;
894 }
895 // pure data-type ctors/setters/derives that NAME no program and spawn nothing.
896 if matches!(
897 m,
898 "default" | "new" | "piped" | "null" | "inherit" | "from_raw_fd"
899 | "arg" | "args" | "arg0" | "env" | "envs" | "env_clear" | "env_remove"
900 | "cwd" | "current_dir" | "rows" | "cols"
901 | "clone" | "fmt" | "eq" | "ne" | "hash"
902 ) {
903 return None;
904 }
905 return Some("Exec");
906 }
907 // duct: a subprocess-orchestration crate. `cmd()`/`cmd!` only *build* an Expression; the
908 // spawn/wait happens at `run`/`read`/`start`. Match the execution verbs, not the builder.
909 if crate_name == "duct"
910 && (path.ends_with("::run")
911 || path.ends_with("::read")
912 || path.ends_with("::start")
913 || path.ends_with("::read_chars"))
914 {
915 return Some("Exec");
916 }
917 if path.starts_with("std::env::") {
918 return Some("Env");
919 }
920 // dotenvy / dotenv: load environment variables (reading a `.env` file and mutating the process
921 // environment). Match the load/read entry points; `Error`/builder types stay pure.
922 if matches!(crate_name, "dotenvy" | "dotenv")
923 && (path.ends_with("::dotenv")
924 || path.ends_with("::dotenv_override")
925 || path.ends_with("::from_path")
926 || path.ends_with("::from_path_override")
927 || path.ends_with("::from_filename")
928 || path.ends_with("::from_filename_override")
929 || path.ends_with("::from_read")
930 || path.ends_with("::from_read_override")
931 || path.ends_with("::load")
932 || path.ends_with("::var")
933 || path.ends_with("::vars"))
934 {
935 return Some("Env");
936 }
937 // Wall-clock reads. Match the `now` accessor precisely (ends_with), not any path
938 // containing the substring "now". The `time` crate (distinct from `std::time`/`chrono`)
939 // reads the clock via `now_utc`/`now_local` (and the deprecated `Instant::now`).
940 if (crate_name == "chrono" || path.starts_with("std::time::")) && path.ends_with("::now") {
941 return Some("Clock");
942 }
943 if crate_name == "time"
944 && (path.ends_with("::now_utc") || path.ends_with("::now_local") || path.ends_with("::now"))
945 {
946 return Some("Clock");
947 }
948 // `tracing`: same principle as the `log` facade below — the crate's TYPES are pure data, so match
949 // the emit, not the whole crate. The actual program output is the macro-expanded
950 // `Subscriber::event`/`event!`/`Span::*enter*` dispatch and the `Span::new*`/`Span::record`
951 // recording path that drives the subscriber. The data-type accessors — `Level::as_str`,
952 // `Span::is_disabled`/`metadata`/`id`, and constructing/reading `Level`/`LevelFilter`/`Span`/
953 // `Event`/`Metadata`/`Field`/`FieldSet`/`Id` — are PURE (no output is produced), so whole-crate Log
954 // fabricated Log on them. Match the emit verbs; everything else returns None.
955 if crate_name == "tracing" {
956 let m = path.rsplit("::").next().unwrap_or(path);
957 if m == "event"
958 || m == "new_span"
959 || m == "record"
960 || m == "record_follows_from"
961 || m == "enter"
962 || m == "exit"
963 || m == "in_scope"
964 || m == "entered"
965 || path.contains("::__macro_support")
966 || path.contains("::__tracing")
967 || path.contains("Subscriber::event")
968 || path.contains("Subscriber::new_span")
969 || path.contains("Subscriber::enter")
970 || path.contains("Subscriber::exit")
971 {
972 return Some("Log");
973 }
974 return None;
975 }
976 // The `log` facade: its macros route through `log::__private_api`; the crate's types
977 // (`Level`, `LevelFilter`) are pure, so match the logging entry, not the whole crate.
978 if crate_name == "log" && path.contains("::__private_api") {
979 return Some("Log");
980 }
981 // Compiler diagnostic emission — the ONE genuinely effectful operation in the otherwise-pure
982 // rustc_* surface (a dylint lint's actual OUTPUT: it writes warnings/errors to the compiler's
983 // diagnostic sink). Classified `Log` (same family as `tracing`/`log` — program output). Match the
984 // emission verbs precisely; rustc_lint/rustc_errors are mostly pure types (Lint, LintId, the Diag
985 // BUILDERS), and only the terminal `emit`/`emit_span_lint` actually produces output.
986 if crate_name == "rustc_lint"
987 && (path.ends_with("::emit_span_lint")
988 || path.ends_with("::span_lint")
989 || path.ends_with("::span_lint_hir"))
990 {
991 return Some("Log");
992 }
993 if crate_name == "rustc_errors"
994 && (path.ends_with("::emit")
995 || path.ends_with("::emit_diagnostic")
996 || path.ends_with("::emit_now"))
997 {
998 return Some("Log");
999 }
1000 // arboard: the effectful surface is the `Clipboard` handle's read/write verbs (each talks to the
1001 // OS clipboard / X11/Wayland/Win32/NSPasteboard server). The data types — chiefly `arboard::Error`
1002 // (whose `Display`/`to_string` formatting is pure) and the `ImageData`/`GetExtLinux`/`SetExtLinux`
1003 // option types — are PURE, so whole-crate Clipboard fabricated Clipboard on e.g. an error
1004 // `to_string()`. Match the handle verbs; everything else returns None. `Clipboard::new` opens the
1005 // connection to the clipboard server, so it's an effect too; `get`/`set` return the
1006 // builder-then-read `Get`/`Set` cursors whose `text`/`image`/`html` terminals do the I/O.
1007 if crate_name == "arboard" {
1008 let m = path.rsplit("::").next().unwrap_or(path);
1009 if m == "new"
1010 || m == "get"
1011 || m == "set"
1012 || m == "clear"
1013 || m == "get_text"
1014 || m == "set_text"
1015 || m == "set_html"
1016 || m == "get_image"
1017 || m == "set_image"
1018 || m == "text"
1019 || m == "image"
1020 || m == "html"
1021 {
1022 return Some("Clipboard");
1023 }
1024 return None;
1025 }
1026 None
1027}
1028
1029pub fn cap_from_name(name: &str) -> Option<&'static str> {
1030 EFFECTS.iter().copied().find(|e| *e == name)
1031}
1032
/// Refinement of the `Exec` cliff (spec §4 ⟨0.5⟩): given a *literal, statically-known* subprocess
/// head, return the extra effects that head implies.
///
/// Matching is by basename — everything after the last `/` or `\` — so `/usr/bin/curl` is looked
/// up as `curl`. The returned effects are meant to be ADDED on top of the caller's `Exec` (the
/// subprocess is still spawned, so `Exec` itself is never dropped). A head that is not in the
/// table — including any dynamically-built one — yields the empty slice and keeps the bare cliff:
/// never guess.
///
/// Table policy:
/// * Only UNAMBIGUOUS single-effect tools are listed. A multi-modal head (`git status` is local
///   while `git push` is Net; `rsync` can be local or remote) would FABRICATE an effect for its
///   common case, which the under-report rule forbids, so such heads are left out.
/// * INVARIANT: every listed head is an external tool that does NOT run the analysed project's
///   own code — which is why `make`/`npm`/`cargo` are deliberately absent.
/// * The candor engines map to `Env` + `Fs` only; spec §7 item 12 (the analyzer self-boundary)
///   guarantees that, so those entries are spec-supplied rather than curated.
///
/// The reference engines share this table so the `Exec` boundary refines identically everywhere
/// (the §4-consistency argument).
pub fn classify_command_head(cmd: &str) -> &'static [&'static str] {
    const NET_HEADS: &[&str] = &["curl", "wget", "http", "ssh", "scp", "sftp", "ftp", "telnet"];
    const DB_HEADS: &[&str] =
        &["psql", "mysql", "sqlite3", "mongosh", "mongo", "redis-cli", "cqlsh", "influx"];
    // candor engines — Fs/Env only, guaranteed by spec §7 item 12 (the analyzer self-boundary)
    const CANDOR_HEADS: &[&str] = &[
        "candor", "candor-run.sh", "candor-scan", "candor-query", "candor-java",
        "candor-classify", "candor-report", "cargo-candor",
    ];

    const NET: &[&str] = &["Net"];
    const DB: &[&str] = &["Db"];
    const ENV_FS: &[&str] = &["Env", "Fs"];
    const NOTHING: &[&str] = &[];

    // Both separators are single-byte ASCII, so `sep + 1` always lands on a char boundary.
    let head = match cmd.rfind(|c: char| c == '/' || c == '\\') {
        Some(sep) => &cmd[sep + 1..],
        None => cmd,
    };

    if NET_HEADS.contains(&head) {
        NET
    } else if DB_HEADS.contains(&head) {
        DB
    } else if CANDOR_HEADS.contains(&head) {
        ENV_FS
    } else {
        NOTHING
    }
}
1056
/// Reports whether `method` (the last segment of a call path) is a subprocess-builder call that
/// merely MODIFIES an already-named command — `.arg`, `.env`, `.current_dir`, stdio wiring, and
/// similar — as opposed to one that NAMES the program (`Command::new`, `duct::cmd`).
///
/// Head refinement has to skip these modifiers: for crates whose methods classify as `Exec`
/// broadly, an argument or env-var-name literal that happens to equal a known head
/// (`.env("psql", …)`, `.arg("curl")`) would otherwise FABRICATE that head's effect — a
/// violation of the §1 under-report rule.
pub fn is_cmd_builder_method(method: &str) -> bool {
    const MODIFIERS: &[&str] = &[
        // arguments
        "arg", "args", "arg0",
        // environment
        "env", "envs", "env_clear", "env_remove",
        // working directory
        "current_dir", "cwd",
        // stdio wiring
        "stdin", "stdout", "stderr",
        // process attributes
        "pre_exec", "creation_flags", "uid", "gid", "groups", "process_group",
    ];
    MODIFIERS.contains(&method)
}
1071
/// Reports whether `method` NAMES the program being spawned, i.e. whether the call's first
/// string literal IS the command head to refine: `Command::new("curl")`, `duct::cmd("curl", …)`.
///
/// This is deliberately an ALLOWLIST — only `new` and `cmd` qualify — rather than "anything
/// that is not a known modifier". A denylist leaked non-naming methods that are not modifiers
/// either: a getter such as `CommandBuilder::get_env("psql")` reads back an env-var KEY, not a
/// program, yet fed `"psql"` to the head classifier and FABRICATED `Db` (review find). Every
/// other method (modifiers, `get_*` getters, custom builder methods) therefore keeps the bare
/// `Exec` cliff: under-refining is safe, fabricating is not.
pub fn is_cmd_naming_method(method: &str) -> bool {
    method == "new" || method == "cmd"
}
1084
/// Translate a cap-std capability *type* into the effect it authorises.
///
/// Holding one of these types (e.g. `&Dir`) is the real, unforgeable right to perform the
/// effect, so candor treats it as a declared capability — exactly like its own `&Fs` token.
///
/// Only crates whose name starts with `cap_` are considered; for any other crate, or for a
/// type name outside the table below, the result is `None`.
pub fn capstd_cap(crate_name: &str, type_name: &str) -> Option<&'static str> {
    match type_name {
        // Gate first: a same-named type from a non-`cap_*` crate carries no capability.
        _ if !crate_name.starts_with("cap_") => None,
        "Dir" => Some("Fs"),
        "TcpListener" | "TcpStream" | "UdpSocket" | "Pool" => Some("Net"),
        "UnixListener" | "UnixStream" | "UnixDatagram" => Some("Ipc"),
        "SystemClock" | "MonotonicClock" => Some("Clock"),
        _ => None,
    }
}
1100
/// Extract the table names a SQL string literal STATICALLY reaches — the `Db` counterpart of the
/// `Net` host / `Exec` command / `Fs` path literal surface (feeds `allow Db in <scope> <table>…`,
/// AS-EFF-008).
///
/// The extraction is conservative by construction, because a wrong capture would FABRICATE:
///
/// 1. The input is lower-cased; `(`, `)` and `;` become spaces, and every `,` is isolated as its
///    own token. The result is split on whitespace.
/// 2. Unless the first token is a SQL statement keyword, the string is not treated as SQL and the
///    result is empty.
/// 3. A table is looked for after `from` / `join` / `into` / `table` anywhere, and after
///    `update` / `truncate` only when they are the first token (so `FOR UPDATE SKIP LOCKED` never
///    yields a table named "skip"). Filler tokens (`only`, `if`, `not`, `exists`, `table`) are
///    stepped over first.
/// 4. The candidate must look like an identifier and must not be a grammar keyword; otherwise
///    that position contributes nothing (e.g. a subquery).
/// 5. A directly following `, name` chain is taken too (`FROM t1, t2, t3`); anything between the
///    name and the comma — such as an alias — ends the chain.
///
/// A dynamically-built query therefore yields nothing — the gate's opaque case — never a guess.
/// Names are returned lower-cased, with `"`/`` ` ``/`'` quoting removed, `schema.table` kept
/// qualified, de-duplicated, in order of first appearance.
///
/// SPEC §2 pins this algorithm token-for-token across engines; the cross-impl vector battery
/// (candor-spec conformance/tables/vectors.json, run.sh Part 4b) enforces the JVM/TS mirrors.
pub fn tables_in_sql(sql: &str) -> Vec<String> {
    // Leading keywords that qualify a string as SQL at all.
    const STMT: &[&str] = &[
        "select", "insert", "update", "delete", "create", "drop", "alter", "truncate", "merge",
        "replace", "with",
    ];
    // Fillers allowed between a table-introducing keyword and the table itself.
    const SKIP: &[&str] = &["only", "if", "not", "exists", "table"];
    // Grammar words that can sit in identifier position without being a table
    // (subqueries, locking clauses, …).
    const STOP: &[&str] = &[
        "select", "set", "where", "values", "on", "using", "group", "order", "by", "limit",
        "returning", "as", "inner", "outer", "left", "right", "cross", "lateral", "natural",
        "union", "all", "distinct", "case", "when", "null", "default", "skip", "nowait", "of",
        "from", "join", "into", "update", "delete", "insert",
    ];

    /// Append `name` unless it was already collected (keeps first-appearance order).
    fn remember(seen: &mut Vec<String>, name: String) {
        if !seen.contains(&name) {
            seen.push(name);
        }
    }

    // Normalise punctuation. The comma stays visible as a standalone token: that is what lets
    // `FROM t1, t2` continue the table list without picking names out of other comma-separated
    // positions (column lists, ON clauses).
    let lowered = sql.to_lowercase();
    let mut spaced = String::with_capacity(lowered.len());
    for ch in lowered.chars() {
        match ch {
            '(' | ')' | ';' => spaced.push(' '),
            ',' => spaced.push_str(" , "),
            other => spaced.push(other),
        }
    }
    let words: Vec<&str> = spaced.split_whitespace().collect();

    // Not SQL (or empty) — nothing to certify, nothing fabricated.
    if !words.first().is_some_and(|lead| STMT.contains(lead)) {
        return Vec::new();
    }

    // Turn a raw token into a table name, or reject it.
    let table_name = |raw: &str| -> Option<String> {
        let bare = raw.trim_matches(|c: char| c == '"' || c == '`' || c == '\'');
        let starts_ok =
            matches!(bare.chars().next(), Some(c) if c.is_ascii_alphabetic() || c == '_');
        // Interior quotes are tolerated so `"schema"."table"` survives; they are stripped below.
        let rest_ok = bare
            .chars()
            .all(|c| c.is_ascii_alphanumeric() || matches!(c, '_' | '.' | '$' | '"' | '`'));
        if starts_ok && rest_ok && !STOP.contains(&bare) {
            Some(bare.replace(['"', '`'], ""))
        } else {
            None
        }
    };

    let mut tables: Vec<String> = Vec::new();
    for (at, &word) in words.iter().enumerate() {
        // `update`/`truncate` count only as the statement's first token.
        let introduces_table = matches!(word, "from" | "join" | "into" | "table")
            || (at == 0 && matches!(word, "update" | "truncate"));
        if !introduces_table {
            continue;
        }

        let fillers = words[at + 1..].iter().take_while(|w| SKIP.contains(*w)).count();
        let mut cursor = at + 1 + fillers;
        let Some(head) = words.get(cursor).and_then(|w| table_name(*w)) else { continue };
        remember(&mut tables, head);

        // Comma-ADJACENT continuation only. Skipping over an alias to chase a later comma would
        // fabricate tables out of `INSERT INTO t (a, b)`'s column list, whose parentheses are
        // already spaces at this point — so `FROM t1 a, t2` keeps just `t1` (an under-report).
        while cursor + 2 < words.len() && words[cursor + 1] == "," {
            let Some(more) = table_name(words[cursor + 2]) else { break };
            remember(&mut tables, more);
            cursor += 2;
        }
    }
    tables
}
1184
1185#[cfg(test)]
1186mod tests {
#[test]
fn sql_table_extraction_is_conservative() {
    use super::tables_in_sql;

    // Each row is (input text, tables expected in extraction order).
    // An empty expectation means the extractor must report nothing at all.
    let cases: &[(&str, &[&str])] = &[
        // The plain statement shapes: select / join / insert / update / delete / create / truncate.
        ("SELECT id FROM users WHERE x = 1", &["users"]),
        (
            "select * from ledger.entries e join customers c on c.id = e.cid",
            &["ledger.entries", "customers"],
        ),
        ("INSERT INTO audit_log (a) VALUES (?1)", &["audit_log"]),
        ("UPDATE accounts SET v = ?", &["accounts"]),
        ("DELETE FROM sessions WHERE id = ?", &["sessions"]),
        ("CREATE TABLE IF NOT EXISTS cache (k TEXT)", &["cache"]),
        ("TRUNCATE TABLE staging", &["staging"]),
        // A `FOR UPDATE` locking clause is a mid-statement `update`: it must not add a phantom table.
        ("SELECT * FROM jobs FOR UPDATE SKIP LOCKED", &["jobs"]),
        // A subquery sitting in FROM position contributes nothing for that position.
        ("SELECT * FROM (SELECT 1) q", &[]),
        // Text that is not SQL yields nothing (never fabricate).
        ("/tmp/some/path", &[]),
        ("hello world from nowhere", &[]),
        // Comma-adjacent continuation: every table of a FROM list is taken…
        ("SELECT a FROM t1, t2, s.t3 WHERE x = 1", &["t1", "t2", "s.t3"]),
        // …an alias ends the chain (an under-report, never a guess)…
        ("SELECT a FROM t1 a1, t2 WHERE x = 1", &["t1"]),
        // …and that same rule keeps a column list from being read as tables.
        ("INSERT INTO t (a, b) VALUES (1, 2)", &["t"]),
        // A subquery after the comma ends the chain as well.
        ("SELECT a FROM t1, (SELECT 1) q", &["t1"]),
    ];

    for &(sql, expected) in cases {
        assert_eq!(tables_in_sql(sql), expected.to_vec(), "tables_in_sql({sql:?})");
    }
}
1214
1215 use super::*;
1216
#[test]
fn db_crates_are_calibrated() {
    // Direction one of the lockstep check: every DB client crate listed in DB_CRATES has to appear
    // in CALIBRATED_CRATES as well, otherwise the receipt's coverage check would flag a crate the
    // classifier recognizes as a blind spot.
    for c in DB_CRATES {
        let calibrated = CALIBRATED_CRATES.iter().any(|known| *known == c);
        assert!(
            calibrated,
            "DB crate `{c}` is matched by classify() but missing from CALIBRATED_CRATES"
        );
    }
}
1228
#[test]
fn calibrated_crates_are_live() {
    // Direction two of the lockstep check: a crate advertised as calibrated must be classified for
    // at least one probe path (crate name followed by one of CALIBRATION_PROBE_TAILS). A dead
    // entry would silently suppress a real coverage warning.
    for c in CALIBRATED_CRATES {
        // Probe paths are built lazily; the scan stops at the first one that classifies.
        let live = CALIBRATION_PROBE_TAILS
            .iter()
            .map(|t| format!("{c}{t}"))
            .any(|probe| classify(c, &probe).is_some());
        assert!(
            live,
            "calibrated crate `{c}` is matched by no path in classify() — dead list entry"
        );
    }
}
1240
#[test]
fn classify_core_effects() {
    // Smoke test over the classifier's main families, so the published crate does not ship
    // untested. Each row is (crate name, call path, expected effect); `None` means the call must
    // stay unclassified. Rows are checked in order and the first mismatch fails the test.
    let cases: &[(&str, &str, Option<&str>)] = &[
        ("std", "std::fs::read_to_string", Some("Fs")),
        // std::path: the stat-family methods are Fs (each is a stat/readdir syscall); the pure
        // string-manipulation surface stays unclassified.
        ("std", "std::path::Path::symlink_metadata", Some("Fs")),
        ("std", "std::path::PathBuf::read_dir", Some("Fs")),
        ("std", "std::path::Path::exists", Some("Fs")),
        ("std", "std::path::Path::join", None), // pure string manipulation
        ("std", "std::path::PathBuf::file_name", None),
        ("std", "std::path::Path::parent", None),
        ("std", "std::process::Command::new", Some("Exec")),
        ("std", "std::env::var", Some("Env")),
        ("reqwest", "reqwest::Client::execute", Some("Net")),
        // reqwest: the one-shot convenience fns send immediately, so they are Net; the
        // `Client::get` builder stays pure.
        ("reqwest", "reqwest::get", Some("Net")),
        ("reqwest", "reqwest::blocking::get", Some("Net")),
        ("reqwest", "reqwest::Client::get", None),
        ("reqwest", "reqwest::RequestBuilder::header", None),
        // nix goes through the libc syscall table (same leaves): I/O is classified, generic fd
        // operations are skipped.
        ("nix", "nix::fcntl::open", Some("Fs")),
        ("nix", "nix::sys::socket::connect", Some("Net")),
        ("nix", "nix::unistd::execvp", Some("Exec")),
        ("nix", "nix::unistd::write", None), // generic fd op — deliberately unclassified
        ("nix", "nix::unistd::getpid", None), // not I/O
        // rustix issues raw syscalls (no libc underneath): classified directly by leaf, same table.
        ("rustix", "rustix::time::clock_settime", Some("Clock")),
        ("rustix", "rustix::fs::symlink", Some("Fs")),
        ("rustix", "rustix::net::connect", Some("Net")),
        ("rustix", "rustix::io::read", None), // generic fd op
        // pnet raw packet capture: channel openers are Net, packet construction stays pure.
        ("pnet", "pnet::datalink::channel", Some("Net")),
        ("pnet", "pnet::transport::transport_channel", Some("Net")),
        ("pnet_datalink", "pnet_datalink::channel", Some("Net")),
        ("pnet", "pnet::packet::ethernet::EthernetPacket::new", None),
        ("pnet_base", "pnet_base::MacAddr::new", None),
        // ignore (gitignore-aware walker): walk executors are Fs, config builders stay pure.
        ("ignore", "ignore::WalkBuilder::build_parallel", Some("Fs")),
        ("ignore", "ignore::WalkBuilder::build", Some("Fs")),
        ("ignore", "ignore::WalkParallel::run", Some("Fs")),
        ("ignore", "ignore::overrides::OverrideBuilder::build", None), // pure config
        ("ignore", "ignore::gitignore::GitignoreBuilder::build", None), // pure config
        ("ignore", "ignore::DirEntry::path", None), // pure accessor
        // notify fs-watching: watcher constructors and watch/unwatch are Fs, data types stay pure.
        ("notify", "notify::RecommendedWatcher::new", Some("Fs")),
        ("notify", "notify::PollWatcher::new", Some("Fs")),
        ("notify", "notify::recommended_watcher", Some("Fs")),
        ("notify", "notify::INotifyWatcher::watch", Some("Fs")),
        ("notify", "notify::Config::default", None), // pure config
        ("notify", "notify::Event::new", None), // pure data type
        ("rusqlite", "rusqlite::Connection::execute", Some("Db")),
        // The rusqlite verb dialect (a verb probe found the canonical consumer API classifying
        // as pure):
        ("rusqlite", "rusqlite::Connection::query_row", Some("Db")),
        ("rusqlite", "rusqlite::Statement::query_map", Some("Db")),
        ("rusqlite", "rusqlite::Connection::execute_batch", Some("Db")),
        ("rusqlite", "rusqlite::Connection::prepare_cached", Some("Db")),
        ("rusqlite", "rusqlite::Connection::open", Some("Db")),
        ("rusqlite", "rusqlite::Connection::open_in_memory", Some("Db")),
        // …while `open` stays rusqlite-only (postgres has no open; nothing else may borrow it):
        ("postgres", "postgres::Client::open", None),
        ("tokio_postgres", "tokio_postgres::Client::query_typed", Some("Db")),
        // diesel's LIMIT-1 and streaming executions; sqlx's multi-result stream:
        ("diesel", "diesel::RunQueryDsl::first", Some("Db")),
        ("diesel", "diesel::RunQueryDsl::load_iter", Some("Db")),
        ("sqlx", "sqlx::query::Query::fetch_many", Some("Db")),
        // sqlx's bare `query()` builder must stay pure (the original sqlx lesson):
        ("sqlx", "sqlx::query", None),
        // tracing: the emit / span-lifecycle dispatch is Log; the pure data-type accessors are not
        // (whole-crate Log fabricated Log on `Level::as_str` / `Span::is_disabled` — the data
        // types are pure, same principle as the `log` facade).
        ("tracing", "tracing::event", Some("Log")),
        ("tracing", "tracing::Span::new_span", Some("Log")),
        ("tracing", "tracing::Span::record", Some("Log")),
        ("tracing", "tracing::Span::enter", Some("Log")),
        ("tracing", "tracing::Level::as_str", None), // pure accessor
        ("tracing", "tracing::Span::is_disabled", None), // pure state read
        ("tracing", "tracing::Span::metadata", None), // pure accessor
        ("tracing", "tracing::metadata::Level::TRACE", None), // pure data type
        ("tracing", "tracing::field::Field::name", None), // pure data type
        // memmap2: only the syscall-issuing map/flush/protect verbs are Fs; reads over an
        // already-mapped region (len/as_ptr/is_empty) and the request builder are pure
        // (whole-crate Fs fabricated Fs).
        ("memmap2", "memmap2::MmapOptions::map", Some("Fs")),
        ("memmap2", "memmap2::MmapOptions::map_mut", Some("Fs")),
        ("memmap2", "memmap2::Mmap::flush", Some("Fs")),
        ("memmap2", "memmap2::MmapMut::make_read_only", Some("Fs")),
        ("memmap2", "memmap2::Mmap::len", None), // length read — pure
        ("memmap2", "memmap2::Mmap::is_empty", None), // pure
        ("memmap2", "memmap2::Mmap::as_ptr", None), // pointer — pure
        ("memmap2", "memmap2::MmapOptions::new", None), // request builder — pure
        // arboard: the Clipboard handle's read/write verbs are Clipboard; `arboard::Error`
        // formatting and option data types are pure (whole-crate Clipboard fabricated Clipboard
        // on `Error::to_string`).
        ("arboard", "arboard::Clipboard::new", Some("Clipboard")),
        ("arboard", "arboard::Clipboard::get_text", Some("Clipboard")),
        ("arboard", "arboard::Clipboard::set_text", Some("Clipboard")),
        ("arboard", "arboard::Clipboard::clear", Some("Clipboard")),
        ("arboard", "arboard::Error::to_string", None), // error formatting — pure
        ("arboard", "arboard::Error::fmt", None), // Display impl — pure
        ("arboard", "arboard::ImageData::to_owned_img", None), // pure data type
        // fastrand: value draws and entropy-seeded entry points are Rand; the deterministic seeded
        // ctor `with_seed` and state split/copy (`fork`/`clone`) are pure (whole-crate Rand
        // fabricated Rand).
        ("fastrand", "fastrand::u32", Some("Rand")), // top-level draw
        ("fastrand", "fastrand::Rng::usize", Some("Rand")),
        ("fastrand", "fastrand::Rng::shuffle", Some("Rand")),
        ("fastrand", "fastrand::Rng::new", Some("Rand")), // entropy-seeded
        ("fastrand", "fastrand::Rng::with_seed", None), // deterministic ctor — pure
        ("fastrand", "fastrand::Rng::fork", None), // state split — pure
        ("fastrand", "fastrand::Rng::clone", None), // state copy — pure
        // portable_pty / async_process: spawn/wait keep Exec; config getters and pure data
        // ctors/setters do not (base Exec fabricated on `CommandBuilder::get_cwd` /
        // `PtySize::default` / `Stdio::piped`).
        ("portable_pty", "portable_pty::PtySystem::openpty", Some("Exec")),
        ("portable_pty", "portable_pty::SlavePty::spawn_command", Some("Exec")),
        ("portable_pty", "portable_pty::CommandBuilder::get_argv", None), // getter
        ("portable_pty", "portable_pty::CommandBuilder::get_cwd", None), // getter
        ("portable_pty", "portable_pty::PtySize::default", None), // pure data type
        ("portable_pty", "portable_pty::CommandBuilder::new", None), // builder ctor
        ("async_process", "async_process::Command::spawn", Some("Exec")),
        ("async_process", "async_process::Command::output", Some("Exec")),
        ("async_process", "async_process::Stdio::piped", None), // pure data type
        ("async_process", "async_process::Stdio::null", None), // pure data type
        // FFI tiers (matched by distinctive leaf, alias-independent).
        ("libc", "libc::open", Some("Fs")),
        ("libc", "libc::connect", Some("Net")),
        ("libc", "libc::read", None), // generic fd op — deliberately unclassified
        ("ffi", "ffi::sqlite3_step", Some("Db")),
        ("raw", "raw::git_remote_fetch", Some("Net")),
        // libgit2 clone and submodule clone/update fetch over the network (an A/B on git2 0.20
        // caught `Submodule::update`/`clone` and `Repository::clone` reporting no Net — the latter
        // because the `src/build.rs` module was being dropped as if it were the Cargo build
        // script).
        ("raw", "raw::git_clone", Some("Net")),
        ("raw", "raw::git_submodule_clone", Some("Net")),
        ("raw", "raw::git_submodule_update", Some("Net")),
        ("raw", "raw::git_submodule_open", None), // local subrepo open — not Net
        // libcurl: the transfer / raw-socket entry points are Net (an A/B on curl 0.4 caught the
        // whole crate reporting zero Net); the big setopt/init/getinfo surface — and the
        // readiness-wait multi_wait/poll — stay unclassified (the loop's perform is the boundary).
        ("curl_sys", "curl_sys::curl_easy_perform", Some("Net")),
        ("curl_sys", "curl_sys::curl_easy_send", Some("Net")),
        ("curl_sys", "curl_sys::curl_multi_perform", Some("Net")),
        ("curl_sys", "curl_sys::curl_multi_socket_action", Some("Net")),
        ("curl_sys", "curl_sys::curl_easy_setopt", None), // in-memory option write
        ("curl_sys", "curl_sys::curl_easy_init", None), // handle alloc
        ("curl_sys", "curl_sys::curl_multi_wait", None), // readiness wait, no payload
        // Consumer-side `curl` crate rule: the dispatch verbs are Net, the setopt builders pure.
        ("curl", "curl::easy::Easy::perform", Some("Net")),
        ("curl", "curl::multi::Multi::perform", Some("Net")),
        ("curl", "curl::easy::Easy::send", Some("Net")),
        ("curl", "curl::easy::Easy::url", None), // CURLOPT setter — pure
        // pure setter; Multi::timeout under-reported by design
        ("curl", "curl::easy::Easy::timeout", None),
        ("ffi", "ffi::SSL_connect", Some("Net")),
        // Pure crates stay pure.
        ("serde", "serde::Serialize::serialize", None),
        ("std", "std::vec::Vec::push", None),
    ];

    for &(krate, path, want) in cases {
        assert_eq!(classify(krate, path), want, "classify({krate:?}, {path:?})");
    }
}
1396
#[test]
fn rand_osrng_handle_ops_are_pure_but_draws_are_rand() {
    // Regression for an adversarial-review fabrication: a blanket `contains("OsRng")` tagged
    // `OsRng::clone` as Rand, yet OsRng is a unit struct — clone/fork/default draw no entropy.
    // The genuine draws must keep firing. Rows are (path inside the `rand` crate, expected effect).
    let cases: &[(&str, Option<&str>)] = &[
        ("rand::rngs::OsRng::clone", None),
        ("rand::rngs::OsRng::default", None),
        ("rand::rngs::OsRng::fill_bytes", Some("Rand")), // a real draw
        ("rand::rngs::OsRng::next_u32", Some("Rand")),
        ("rand::Rng::gen", Some("Rand")), // verb path unaffected
        ("rand::distributions::Uniform::new", None), // pure ctor still pure
    ];

    for &(path, want) in cases {
        assert_eq!(classify("rand", path), want, "classify(\"rand\", {path:?})");
    }
}
1408
#[test]
fn redis_connection_manager_config_builder_is_pure() {
    // Regression for an adversarial-review fabrication: `contains("ConnectionManager")` also hit
    // the pure *Config* builder. The builder must stay unclassified…
    let pure_builder_paths = [
        "redis::aio::ConnectionManagerConfig::new",
        "redis::aio::ConnectionManagerConfig::set_max_delay",
    ];
    for path in pure_builder_paths {
        assert_eq!(classify("redis", path), None, "classify(\"redis\", {path:?})");
    }

    // …while the live manager (and a plain command) still round-trips as Db.
    let db_paths = ["redis::aio::ConnectionManager::new", "redis::Commands::get"];
    for path in db_paths {
        assert_eq!(classify("redis", path), Some("Db"), "classify(\"redis\", {path:?})");
    }
}
1418
#[test]
fn pure_fd_transfer_is_not_an_effect() {
    // Adopting, extracting or borrowing an already-open descriptor (or unwrapping an async type
    // back to its std type) issues no syscall, so it must classify as pure even though it hangs
    // off a std I/O type whose prefix rule would otherwise fire Net/Fs/Ipc. (A real tokio sweep
    // found `into_std`, `from_raw_fd` and `as_raw_fd` all fabricating effects.)
    let pure: &[(&str, &str)] = &[
        ("std", "std::net::TcpStream::from_raw_fd"),
        ("std", "std::net::TcpStream::into_raw_fd"),
        ("std", "std::net::TcpStream::as_raw_fd"),
        ("std", "std::net::TcpListener::from_raw_fd"),
        ("std", "std::net::UdpSocket::from_raw_socket"),
        ("std", "std::fs::File::from_raw_fd"),
        ("std", "std::fs::File::into_raw_fd"),
        ("std", "std::fs::File::as_raw_handle"),
        ("std", "std::os::unix::net::UnixStream::from_raw_fd"),
        // `SocketAddr::from_pathname` builds an address struct and opens no socket. (socket2 sweep.)
        ("std", "std::os::unix::net::SocketAddr::from_pathname"),
        ("tokio", "tokio::net::TcpStream::from_raw_fd"),
        ("tokio", "tokio::net::TcpStream::into_std"), // unwrap → std type, pure
        ("tokio", "tokio::fs::File::into_std"),
    ];
    for &(krate, path) in pure {
        assert_eq!(classify(krate, path), None, "classify({krate:?}, {path:?})");
    }

    // A real open/connect on the same types still fires its effect: the carve-out is leaf-precise.
    let effectful: &[(&str, &str, &str)] = &[
        ("std", "std::net::TcpStream::connect", "Net"),
        ("std", "std::fs::File::open", "Fs"),
        ("std", "std::fs::read", "Fs"),
        ("std", "std::os::unix::net::UnixStream::connect", "Ipc"),
        ("tokio", "tokio::net::TcpStream::connect", "Net"),
    ];
    for &(krate, path, effect) in effectful {
        assert_eq!(classify(krate, path), Some(effect), "classify({krate:?}, {path:?})");
    }
}
1446
#[test]
fn command_head_refines_the_exec_cliff() {
    use super::classify_command_head as h;

    // Rows are (command head, effects expected from it); an empty slice means the head refines
    // nothing and the bare Exec cliff stands.
    let heads: &[(&str, &[&str])] = &[
        // Unambiguous external tools classify by basename (spec §4 ⟨0.5⟩).
        ("curl", &["Net"]),
        ("telnet", &["Net"]),
        ("sftp", &["Net"]),
        ("/usr/local/bin/psql", &["Db"]), // basename match strips the path
        ("mongo", &["Db"]),
        ("cqlsh", &["Db"]),
        // A candor engine is Fs/Env — spec-supplied by §7 item 12, not curation.
        ("candor-scan", &["Env", "Fs"]),
        ("candor-run.sh", &["Env", "Fs"]),
        // An unrecognised head adds nothing (never guess). `make`/`npm` run the project's own
        // code; `git`/`rsync` are multi-modal (local vs remote) — all keep the cliff rather than
        // fabricate an effect for the common case.
        ("some-unknown-tool", &[]),
        ("make", &[]),
        ("npm", &[]),
        ("git", &[]),
        ("rsync", &[]),
    ];
    for &(head, expected) in heads {
        assert_eq!(h(head), expected, "classify_command_head({head:?})");
    }

    // A builder modifier (`.arg`/`.env`) names no program, so its literal must not refine (a
    // whole-crate-Exec crate classifies every method; `.env("psql",..)` must not fabricate Db).
    for method in ["env", "arg", "current_dir"] {
        assert!(is_cmd_builder_method(method));
    }
    // `Command::new` and `duct::cmd` name the program, so they are not builder modifiers.
    for method in ["new", "cmd"] {
        assert!(!is_cmd_builder_method(method));
    }

    // The gate that admits a literal to classify_command_head is an allowlist of program-naming
    // methods, not the builder denylist. The inversion matters: a whole-crate-Exec crate
    // (portable_pty) classifies every method as Exec, so a getter like `cmd.get_env("psql")` —
    // absent from the builder denylist — would have leaked "psql" to the head and fabricated Db.
    // Only `new`/`cmd` name a program, so only they may refine.
    for method in ["new", "cmd"] {
        assert!(is_cmd_naming_method(method));
    }
    // `get_env` is a getter, not a namer — the leak this gate closes; the modifiers follow.
    for method in ["get_env", "arg", "env", "current_dir"] {
        assert!(!is_cmd_naming_method(method));
    }
}
1482}