Skip to main content

candor_classify/
lib.rs

1//! candor-classify — the curated effect classifier (crate+path -> effect), extracted to a STABLE
2//! crate so both the nightly `rustc_private` lint AND a stable backend share ONE source of truth
3//! (no drift). Pure string logic; no rustc internals. The effect vocabulary lives in candor-report.
4
5use candor_report::EFFECTS;
6
7/// The canonical CANDOR_POLICY DSL parser (SPEC §6.2), shared by the nightly gate and candor-query.
8pub mod policy;
9
/// Look a callee up in the project-supplied rule list (meant to be consulted only after the
/// built-in `classify` has returned `None`).
///
/// Each rule is an `(effect, is_crate, prefix)` triple: when `is_crate` is `true` the prefix is
/// tested against `crate_name`, otherwise against `path`. Rules are tried in slice order and the
/// effect of the FIRST rule whose prefix matches is returned; `None` means no rule matched.
pub fn classify_extra(
    crate_name: &str,
    path: &str,
    extra: &[(&'static str, bool, String)],
) -> Option<&'static str> {
    extra
        .iter()
        .find(|(_, is_crate, prefix)| {
            // A crate rule is keyed on the crate name; a path rule on the callee's full path.
            let subject = if *is_crate { crate_name } else { path };
            subject.starts_with(prefix.as_str())
        })
        .map(|(effect, _, _)| *effect)
}
24
/// Every third-party crate that `classify` carries effect rules for, matched by exact crate
/// name (the crate-name PREFIXES it also recognizes are listed in `CALIBRATED_PREFIXES`).
///
/// This is the single source of truth for "what candor knows": it is emitted beside the JSON
/// report (`<prefix>.calibrated.json`) so the Claude Code receipt's coverage check reads candor's
/// real coverage instead of a hand-copied list. It must stay in lockstep with `classify` — the
/// `db_crates_are_calibrated` and `calibrated_crates_are_live` tests (in this crate's `tests`
/// module) enforce both directions.
pub const CALIBRATED_CRATES: [&str; 59] = [
    // Network. aws_config resolves credentials over the network on `.load()`; git2's remote ops
    // (fetch/push/connect) contact the network; async_net is smol's net layer; pnet is raw L2/L3
    // packet capture.
    "reqwest", "isahc", "ureq", "curl", "aws_config", "git2",
    "tokio_tcp", "tokio_udp", "async_net",
    "async_nats", "lapin", "lettre", "tungstenite",
    "elasticsearch", "tonic", "rdkafka", "pnet",
    // Directory traversal: ignore is the gitignore-aware walker behind ripgrep/fd (its walk
    // executors are Fs). Filesystem watching: notify wraps inotify/FSEvents/kqueue and powers
    // watchexec/cargo-watch.
    "ignore", "notify",
    // Database clients — the same entries, in the same order, as the module-level `DB_CRATES`.
    "sqlx", "rusqlite", "postgres", "tokio_postgres",
    "diesel", "redis", "mongodb",
    "mysql", "mysql_async", "sea_orm", "deadpool_postgres",
    // Filesystem: async_fs = smol; fs_err = std::fs wrapper; tempfile; glob.
    "memmap2", "fs_err", "async_fs", "tempfile", "glob",
    // Entropy.
    "rand", "getrandom", "fastrand",
    // Entropy, password-hashing tier (salt mints + bcrypt's internal salt) + the OsRng source.
    "argon2", "bcrypt", "scrypt", "pbkdf2", "password_hash", "rand_core",
    // Subprocess tier: async_process = smol; duct.
    "portable_pty", "async_process", "duct",
    // Environment (.env loaders).
    "dotenvy", "dotenv",
    // Clock / log / clipboard.
    "chrono", "time", "tracing", "log", "arboard",
    // Compiler diagnostic emission (a dylint lint's output) — see the Log rules in `classify`.
    "rustc_lint", "rustc_errors",
    // Raw syscalls via FFI — the syscall-name table that lights up the FFI-thin tier. nix is
    // routed through the same table by leaf name, so a consumer of nix is covered without nix's
    // own source.
    "libc", "nix", "rustix",
];
58
/// Crate-name PREFIXES `classify` recognizes via `starts_with` (the AWS SDK / smithy families and
/// the cap-std family), as opposed to the exact names in `CALIBRATED_CRATES`.
pub const CALIBRATED_PREFIXES: [&str; 3] = [
    "aws_sdk_",
    "aws_smithy",
    "cap_",
];
60
/// Crates that `classify` recognizes by PATH prefix instead of by crate-name equality: only their
/// effectful modules are matched (e.g. `tokio::net::`, `async_std::fs::`, `mio::net::`).
///
/// They are deliberately absent from `CALIBRATED_CRATES`, which the liveness test probes by crate
/// name. The coverage check must nevertheless count them as *covered*; otherwise the most common
/// async crates would be mislabelled as blind spots.
pub const PATH_CALIBRATED_CRATES: [&str; 3] = [
    "tokio",
    "async_std",
    "mio",
];
66
/// Representative path tails probed by the `calibrated_crates_are_live` liveness test. Each tail
/// is appended to a crate name, and every `CALIBRATED_CRATES` entry must be matched by at least
/// one of them — an entry that none of them hits is dead.
///
/// The list is exported as ONE source of truth because the nightly lint crate (`src/lib.rs`) runs
/// the SAME liveness test. When the two probe lists were duplicated they drifted: a rule keyed on
/// a distinctive tail (pnet `::datalink::channel`, ignore `::WalkBuilder::build_parallel`, notify
/// `::RecommendedWatcher::new`) added to only one list silently broke the other crate's
/// `cargo test`.
pub const CALIBRATION_PROBE_TAILS: &[&str] = &[
    "::X::send", "::X::execute", "::X::call", "::X::query",
    "::X::fetch_one", "::Remote::fetch",
    // Distinctive tails for pnet / ignore / notify (see the drift note in the doc comment).
    "::datalink::channel", "::WalkBuilder::build_parallel", "::RecommendedWatcher::new",
    "::X::connect", "::Utc::now", "::X::load", "::__private_api::log",
    "::tempfile", "::glob",
    "::X::run", "::dotenv", "::random", "::emit",
    "::X::emit_span_lint", "::X::anything",
    "::SaltString::generate", "::hash", "::OsRng::fill_bytes",
];
80
/// Database client crates whose execution verbs count as I/O (handled by the DB branch of
/// `classify`). Kept at module level, not inside `classify`, so the `db_crates_are_calibrated`
/// test can enforce `DB_CRATES ⊆ CALIBRATED_CRATES`.
pub const DB_CRATES: [&str; 11] = [
    "sqlx",
    "rusqlite",
    "postgres",
    "tokio_postgres",
    "diesel",
    "redis",
    "mongodb",
    "mysql",
    "mysql_async",
    "sea_orm",
    "deadpool_postgres",
];
87
88/// Classify a resolved callee by the crate it belongs to and its full path.
89pub fn classify(crate_name: &str, path: &str) -> Option<&'static str> {
90    if crate_name.starts_with("aws_sdk_") || crate_name.starts_with("aws_smithy") {
91        // Only request dispatch is network I/O; builder setters/accessors are pure.
92        if path.ends_with("::send") || path.ends_with("::send_with") {
93            return Some("Net");
94        }
95        return None;
96    }
97    // aws-config resolves credentials/region on `.load()` — it reaches the IMDS metadata
98    // endpoint / STS over the network (and reads ~/.aws + env). Builders (`defaults()`,
99    // `SdkConfig::builder()`, `BehaviorVersion::latest()`) are pure; the `load` is the I/O.
100    // (Found hardening on a real app, ebman: `builder.load().await` was classified pure.)
101    if crate_name == "aws_config" {
102        if path.ends_with("::load") || path.ends_with("::load_defaults") {
103            return Some("Net");
104        }
105        return None;
106    }
107    // git2 (libgit2 FFI): remote operations contact the network; everything else is local
108    // to the .git directory. Match the remote verbs precisely — NOT bare `::clone`, which is
109    // the `Clone`-trait dup of a `Remote` handle (pure), not `Repository::clone`. (Found
110    // hardening on gitui: `remote.fetch`/`remote.push` were classified network-free — a git
111    // client reporting it makes no network calls.)
112    if crate_name == "git2" {
113        if path.ends_with("::fetch")
114            || path.ends_with("::push")
115            || path.ends_with("::download")
116            || path.ends_with("::connect")
117            || path.ends_with("::connect_auth")
118            || path.ends_with("::ls")
119            || path.ends_with("::upload")
120        {
121            return Some("Net");
122        }
123        return None;
124    }
125    // libc — raw syscalls via FFI. The FFI-thin tier (nix, and the syscall layer beneath rusqlite/git2)
126    // is invisible to a name classifier unless we model libc directly: a 35-crate calibration
127    // (eval/calibration) showed nix reporting ZERO library effects because every wrapper bottoms out in
128    // an unrecognised `libc::*` call. Classify by syscall name, but ONLY the UNAMBIGUOUS ones — the
129    // socket family is Net, path/dir syscalls are Fs, spawn/exec/wait is Exec, SysV/pipe IPC is Ipc,
130    // env/clock/entropy each their own. We deliberately SKIP the generic file-descriptor ops
131    // (read/write/close/lseek/dup/fcntl/ioctl/poll/select/epoll*/mmap): they operate on ANY fd — file,
132    // socket, or pipe — so a fixed label would mis-categorise as often as it helps. An honest
133    // no-classify (under-report) beats emitting the WRONG effect. Pure conversions (htons/inet_pton/
134    // gmtime) are also skipped.
135    //
136    // `nix` (the idiomatic SAFE libc wrapper, in ~every Rust systems/CLI crate) is routed through the
137    // SAME table: its functions keep the syscall leaf name (`nix::fcntl::open`, `nix::sys::socket::connect`,
138    // `nix::unistd::execvp`). Without this, a CONSUMER of nix analysed without nix's own source (the
139    // stable scanner, single-crate) sees `nix::*` cross-crate and under-reports — serialport-rs opens its
140    // device via `nix::fcntl::open` and reported ZERO Fs. The nightly lint reaches `libc::*` THROUGH nix's
141    // body; this gives the scanner the same coverage directly. (Found sweeping serialport-rs.)
142    // `rustix` is the same shape as nix but does RAW syscalls (no libc underneath), so its functions MUST
143    // be classified directly. Its leaf names are the syscall names too (`rustix::time::clock_settime`,
144    // `rustix::fs::mkfifoat`/`symlink`/`stat`, `rustix::net::connect`) — route it through the same table.
145    // The rustix-specific `*at`/variant leaves it doesn't share with libc just under-report (the safe
146    // direction). VALIDATED, not speculative: coreutils' `date` reads/sets the clock via
147    // `rustix::time::clock_getres`/`clock_settime` and reported Clock=0; the file I/O that goes through
148    // std::fs was already correct, which is why only the rustix-only effects (Clock/Ipc) were missing.
149    if crate_name == "libc" || crate_name == "nix" || crate_name == "rustix" {
150        let f = path.rsplit("::").next().unwrap_or(path);
151        // path / directory / metadata syscalls (incl. *64 and *at variants)
152        const FS: &[&str] = &[
153            "open", "open64", "openat", "openat2", "creat", "creat64", "stat", "stat64", "lstat",
154            "lstat64", "fstatat", "fstatat64", "newfstatat", "statx", "access", "faccessat",
155            "faccessat2", "mkdir", "mkdirat", "rmdir", "unlink", "unlinkat", "rename", "renameat",
156            "renameat2", "link", "linkat", "symlink", "symlinkat", "readlink", "readlinkat", "chmod",
157            "fchmodat", "chown", "lchown", "fchownat", "truncate", "truncate64", "ftruncate",
158            "ftruncate64", "opendir", "fdopendir", "readdir", "readdir64", "readdir_r", "closedir",
159            "rewinddir", "seekdir", "telldir", "scandir", "mkstemp", "mkstemps", "mkostemp", "mkdtemp",
160            "mknod", "mknodat", "chdir", "fchdir", "getcwd", "get_current_dir_name", "chroot",
161            "pivot_root", "statfs", "statfs64", "fstatfs", "fstatfs64", "statvfs", "fstatvfs", "mount",
162            "umount", "umount2", "fsync", "fdatasync", "sync", "syncfs", "sync_file_range", "fallocate",
163            "posix_fallocate", "posix_fadvise", "sendfile", "sendfile64", "copy_file_range", "flock",
164            "getdents", "getdents64", "utime", "utimes", "lutimes", "futimens", "utimensat", "futimesat",
165            "realpath",
166        ];
167        // socket family — these operate only on sockets, so Net is unambiguous (AF_UNIX domain isn't
168        // visible at the call, so a Unix socket reads as Net rather than Ipc; acceptable over-general).
169        const NET: &[&str] = &[
170            "socket", "setsockopt", "getsockopt", "bind", "listen", "accept", "accept4", "connect",
171            "shutdown", "send", "sendto", "sendmsg", "sendmmsg", "recv", "recvfrom", "recvmsg",
172            "recvmmsg", "getpeername", "getsockname", "getaddrinfo", "freeaddrinfo", "getnameinfo",
173        ];
174        // process creation / replacement / reaping
175        const EXEC: &[&str] = &[
176            "fork", "vfork", "clone", "clone3", "execl", "execlp", "execle", "execv", "execvp",
177            "execvpe", "execve", "execveat", "fexecve", "posix_spawn", "posix_spawnp", "system",
178            "popen", "pclose", "wait", "waitpid", "wait3", "wait4", "waitid",
179        ];
180        // pipes / FIFOs / SysV + POSIX message queues, semaphores, shared memory; socketpair (AF_UNIX)
181        const IPC: &[&str] = &[
182            "pipe", "pipe2", "mkfifo", "mkfifoat", "socketpair", "msgget", "msgsnd", "msgrcv", "msgctl",
183            "semget", "semop", "semtimedop", "semctl", "shmget", "shmat", "shmdt", "shmctl", "mq_open",
184            "mq_send", "mq_receive", "mq_timedsend", "mq_timedreceive", "mq_close", "mq_unlink",
185        ];
186        const ENV: &[&str] = &["getenv", "secure_getenv", "setenv", "putenv", "unsetenv", "clearenv"];
187        const CLOCK: &[&str] = &[
188            "time", "gettimeofday", "clock_gettime", "clock_getres", "nanosleep", "clock_nanosleep",
189            // SETTING the system clock is a clock effect too (was unclassified — found on coreutils `date`,
190            // which sets it via `clock_settime`).
191            "clock_settime", "settimeofday", "stime", "adjtime", "adjtimex", "clock_adjtime",
192        ];
193        const RAND: &[&str] = &["getrandom", "getentropy", "arc4random", "arc4random_buf", "arc4random_uniform"];
194        if FS.contains(&f) {
195            return Some("Fs");
196        }
197        if NET.contains(&f) {
198            return Some("Net");
199        }
200        if EXEC.contains(&f) {
201            return Some("Exec");
202        }
203        if IPC.contains(&f) {
204            return Some("Ipc");
205        }
206        if ENV.contains(&f) {
207            return Some("Env");
208        }
209        if CLOCK.contains(&f) {
210            return Some("Clock");
211        }
212        if RAND.contains(&f) {
213            return Some("Rand");
214        }
215        return None;
216    }
217    // C-library FFI bindings: libsqlite3 (under rusqlite) and libgit2 (under git2). Like the libc tier,
218    // these crates are thin Rust over a C library, so their real I/O is invisible until the C entry
219    // points are named. Match by the DISTINCTIVE C function name (`sqlite3_*` / `git_*`) via the call's
220    // LEAF — independent of the binding crate's alias: rusqlite calls `ffi::sqlite3_step`, git2 calls
221    // `raw::git_remote_fetch`, and the nightly lint resolves the same to `libsqlite3_sys`/`libgit2_sys`;
222    // all spellings share the leaf. Only the I/O-performing entry points are listed — the in-memory
223    // accessors (`sqlite3_bind_*`/`sqlite3_column_*`, `git_*_oid`/strarray/options builders) stay pure,
224    // so a non-listed `sqlite3_`/`git_` leaf returns None (under-report, never a wrong effect). Calibrated
225    // + validated against rusqlite 0.39 / git2 0.20 source (eval/calibration).
226    {
227        let leaf = path.rsplit("::").next().unwrap_or(path);
228        if let Some(rest) = leaf.strip_prefix("sqlite3_") {
229            let _ = rest;
230            // SQLite C API operations that touch the database (open/exec/step/prepare/backup/blob/wal).
231            const DB: &[&str] = &[
232                "sqlite3_open", "sqlite3_open_v2", "sqlite3_open16", "sqlite3_close", "sqlite3_close_v2",
233                "sqlite3_exec", "sqlite3_step", "sqlite3_prepare", "sqlite3_prepare_v2",
234                "sqlite3_prepare_v3", "sqlite3_prepare16", "sqlite3_prepare16_v2", "sqlite3_prepare16_v3",
235                "sqlite3_get_table", "sqlite3_backup_init", "sqlite3_backup_step", "sqlite3_backup_finish",
236                "sqlite3_blob_open", "sqlite3_blob_read", "sqlite3_blob_write", "sqlite3_blob_reopen",
237                "sqlite3_load_extension", "sqlite3_wal_checkpoint", "sqlite3_wal_checkpoint_v2",
238            ];
239            return DB.contains(&leaf).then_some("Db");
240        }
241        if leaf.starts_with("git_") {
242            // libgit2: remote/transport operations contact the network … (incl. submodule clone/update,
243            // which `git_clone`/fetch the subrepo over its remote — `allow_fetch` defaults on; an A/B on
244            // git2 0.20 caught `Submodule::update`/`clone` reporting no `Net`).
245            const NET: &[&str] = &[
246                "git_clone", "git_remote_connect", "git_remote_connect_ext", "git_remote_fetch",
247                "git_remote_download", "git_remote_upload", "git_remote_push", "git_remote_ls",
248                "git_submodule_clone", "git_submodule_update",
249            ];
250            // … and repository/index/odb/checkout/ref/config operations touch the on-disk .git store.
251            const FS: &[&str] = &[
252                "git_repository_open", "git_repository_open_ext", "git_repository_open_bare",
253                "git_repository_init", "git_repository_init_ext", "git_repository_discover",
254                "git_checkout_tree", "git_checkout_head", "git_checkout_index", "git_index_read",
255                "git_index_write", "git_index_write_tree", "git_index_write_tree_to",
256                "git_index_add_bypath", "git_index_add_all", "git_odb_open", "git_odb_read",
257                "git_odb_write", "git_odb_open_wstream", "git_odb_open_rstream",
258                "git_blob_create_fromdisk", "git_blob_create_fromworkdir", "git_blob_create_from_disk",
259                "git_blob_create_from_workdir", "git_blob_create_from_stream", "git_commit_create",
260                "git_commit_create_v", "git_reference_create", "git_reference_set_target",
261                "git_reference_delete", "git_config_open_default", "git_config_open_ondisk",
262                "git_config_add_file_ondisk", "git_tag_create", "git_treebuilder_write",
263                "git_packbuilder_write",
264            ];
265            if NET.contains(&leaf) {
266                return Some("Net");
267            }
268            if FS.contains(&leaf) {
269                return Some("Fs");
270            }
271            return None;
272        }
273        if leaf.starts_with("curl_") {
274            // libcurl (under the `curl` crate, called `curl_sys::curl_*`). Only the entry points that
275            // PERFORM network I/O: the blocking transfer (`curl_easy_perform`), raw socket send/recv,
276            // the HTTP/2 keepalive PING (`upkeep`), and the multi-interface transfer pumps. The large
277            // pure surface (setopt/init/cleanup/reset/getinfo/escape/multi_add_handle/fdset/info_read)
278            // stays unclassified, as do `curl_multi_wait`/`poll` (readiness WAIT on sockets, no payload —
279            // the loop's `perform` is the tagged boundary, per the I/O-boundary principle). An A/B on
280            // curl 0.4 caught the whole crate reporting ZERO Net (`Easy::perform` read as pure).
281            const NET: &[&str] = &[
282                "curl_easy_perform", "curl_easy_send", "curl_easy_recv", "curl_easy_upkeep",
283                "curl_multi_perform", "curl_multi_socket_action",
284            ];
285            return NET.contains(&leaf).then_some("Net");
286        }
287        if let Some(op) = leaf.strip_prefix("SSL_") {
288            // OpenSSL (libssl, under the `openssl`/`native-tls` crates, called `ffi::SSL_*`). The TLS
289            // handshake and record I/O run over the peer socket -> Net. Unlike libc read/write, an SSL_*
290            // op is ~always over a network BIO (the rare memory-BIO/sans-IO case is the honest exception
291            // we accept). The crypto surface (EVP_*/SHA*/AES*) and pure setup (SSL_CTX_new/SSL_set_fd) are
292            // NOT here; `BIO_*` is skipped (a BIO may be memory or socket). Validated vs openssl 0.9 source.
293            const SSL_NET: &[&str] = &[
294                "connect", "accept", "do_handshake", "read", "read_ex", "write", "write_ex", "peek",
295                "peek_ex", "shutdown",
296            ];
297            return SSL_NET.contains(&op).then_some("Net");
298        }
299    }
300    // HTTP clients use the same builder pattern as the AWS SDK: only the dispatch is
301    // I/O. (Found by the eval: ebman's reqwest calls to the Anthropic API + webhooks
302    // were silently classified network-free because reqwest wasn't recognized.)
303    if crate_name == "reqwest" || crate_name == "isahc" {
304        // The builder chain is pure; the dispatch (`::send`/`::execute`) is the I/O. PLUS the one-shot
305        // CONVENIENCE functions `reqwest::get` / `reqwest::blocking::get` / `isahc::get`, which send
306        // immediately — they're not the `Client::get` builder (a different path, `reqwest::Client::get`),
307        // so an exact match avoids false-positiving the builder. (Found running on `xh`: a one-shot
308        // `reqwest::get(url)` was classified network-free.)
309        if path.ends_with("::send")
310            || path.ends_with("::execute")
311            || path == "reqwest::get"
312            || path == "reqwest::blocking::get"
313            || path == "isahc::get"
314        {
315            return Some("Net");
316        }
317        return None;
318    }
319    if crate_name == "ureq" && path.ends_with("::call") {
320        return Some("Net");
321    }
322    // The `curl` crate (libcurl's safe binding — cargo's own HTTP client): the dispatch verbs are
323    // `perform` (Easy/Easy2/Transfer/Multi), raw-socket `send`/`recv`, the keepalive `upkeep`, and the
324    // multi-interface `action` (socket_action). The big setopt-style builder surface stays pure.
325    // `Multi::timeout` is deliberately NOT matched: `Easy::timeout` is a pure CURLOPT_TIMEOUT setter
326    // sharing the leaf — an under-report on the rare event-loop kick beats mis-tagging every consumer
327    // that sets a timeout. (Consumer-side companion to the curl_* FFI tier, same A/B finding.)
328    if crate_name == "curl"
329        && (path.ends_with("::perform")
330            || path.ends_with("::send")
331            || path.ends_with("::recv")
332            || path.ends_with("::upkeep")
333            || path.ends_with("::action"))
334    {
335        return Some("Net");
336    }
337    // Message-queue clients fully encapsulate the socket (the underlying tokio::net lives
338    // inside the crate, unseen), so a user's connect/publish/consume calls ARE the I/O
339    // boundary — to a remote broker, hence Net. Match the broker round-trip verbs (snake_case
340    // methods); the CamelCase option/property builders stay pure. (Found hardening on consumer
341    // apps: lapin `basic_publish`/`queue_declare` and async-nats `publish`/`subscribe` were
342    // classified pure — a message-queue client reporting no I/O.)
343    if crate_name == "async_nats" {
344        if path.ends_with("::connect")
345            || path.contains("::publish")
346            || path.ends_with("::subscribe")
347            || path.ends_with("::queue_subscribe")
348            || path.contains("::request")
349            || path.ends_with("::flush")
350        {
351            return Some("Net");
352        }
353        return None;
354    }
355    if crate_name == "lapin" {
356        if path.ends_with("::connect")
357            || path.ends_with("::create_channel")
358            || path.contains("::basic_")
359            || path.contains("::queue_")
360            || path.contains("::exchange_")
361            || path.contains("::tx_")
362            || path.ends_with("::confirm_select")
363            || path.ends_with("::close")
364        {
365            return Some("Net");
366        }
367        return None;
368    }
369    // SMTP email — lettre's `Transport::send` is the network dispatch; Message building is
370    // pure. (Found hardening on a lettre consumer: `mailer.send(&email)` classified pure.)
371    if crate_name == "lettre" {
372        if path.ends_with("::send") || path.ends_with("::send_raw") {
373            return Some("Net");
374        }
375        return None;
376    }
377    // WebSockets — tungstenite (the modern successor to the old `websocket` crate). connect
378    // and the socket read/write/send are network; Message constructors are pure. (Found on a
379    // tungstenite consumer: connect + send + read classified pure.)
380    if crate_name == "tungstenite" {
381        if path.ends_with("::connect")
382            || path.ends_with("::read")
383            || path.ends_with("::write")
384            || path.ends_with("::send")
385            || path.ends_with("::close")
386            || path.ends_with("::flush")
387            || path.ends_with("::read_message")
388            || path.ends_with("::write_message")
389        {
390            return Some("Net");
391        }
392        return None;
393    }
394    // elasticsearch: request builders are pure; only the `.send()` dispatch is HTTP I/O
395    // (same shape as reqwest / the AWS SDK). (Found on an elasticsearch consumer.)
396    if crate_name == "elasticsearch" && path.ends_with("::send") {
397        return Some("Net");
398    }
399    // gRPC — tonic. The transport connect and the Grpc client RPC dispatch are network;
400    // codecs and request/response wrappers are pure. (connect repro-confirmed on a consumer;
401    // the unary/streaming RPC verbs are from the tonic::client::Grpc API.)
402    if crate_name == "tonic" {
403        if path.ends_with("::connect")
404            || path.ends_with("::unary")
405            || path.ends_with("::server_streaming")
406            || path.ends_with("::client_streaming")
407            || path.ends_with("::streaming")
408        {
409            return Some("Net");
410        }
411        return None;
412    }
413    // Kafka — rdkafka (FFI to librdkafka). Producer send + consumer poll/recv/subscribe/
414    // commit are network round-trips to the brokers. (API-calibrated + unit-tested; a real
415    // repro needs librdkafka/cmake, deferred.)
416    if crate_name == "rdkafka" {
417        if path.ends_with("::send")
418            || path.ends_with("::send_result")
419            || path.ends_with("::recv")
420            || path.ends_with("::poll")
421            || path.ends_with("::subscribe")
422            || path.ends_with("::commit")
423            || path.ends_with("::commit_message")
424            || path.ends_with("::commit_consumer_state")
425            || path.ends_with("::store_offset")
426            || path.ends_with("::seek")
427            || path.ends_with("::fetch_metadata")
428            || path.ends_with("::fetch_watermarks")
429            || path.ends_with("::flush")
430        {
431            return Some("Net");
432        }
433        return None;
434    }
435    // cap-std: capability-oriented std. I/O goes *through* a held capability handle
436    // (Dir/Pool/Clock/...), so these calls ARE the effect. Recognising them means a
437    // cap-std project's real I/O is detected and matches the capability it declared
438    // (via `declared_caps`/`capstd_cap`) — conformance against unforgeable capabilities.
439    if crate_name.starts_with("cap_") {
440        if path.contains("::net::Unix") || path.contains("::os::") {
441            return Some("Ipc");
442        }
443        if path.contains("::net") {
444            return Some("Net");
445        }
446        if path.contains("::time") {
447            return Some("Clock");
448        }
449        if path.contains("::fs") || crate_name == "cap_tempfile" || crate_name == "cap_directories" {
450            return Some("Fs");
451        }
452        return None;
453    }
454    // Local IPC (Unix-domain sockets) is I/O but not *network* — keep it distinct so
455    // CANDOR_NO_AMBIENT and audits don't conflate it with internet access. async-std puts its
456    // Unix sockets under `os::unix::net` (mirroring std); async-net (smol's net layer) under
457    // `unix`.
458    if path.starts_with("tokio::net::Unix")
459        || path.starts_with("std::os::unix::net")
460        || path.starts_with("async_std::os::unix::net")
461        || path.starts_with("async_net::unix")
462    {
463        return Some("Ipc");
464    }
465    // Raw packet capture / raw sockets — libpnet (the dominant low-level networking crate; powers
466    // bandwhich, sniffers, custom-protocol tools). `datalink::channel` opens an L2 socket and
467    // `transport::transport_channel` an L3/L4 raw socket — both ARE network I/O. Packet construction
468    // (pnet_packet / pnet_base, MacAddr, Ethernet frames…) is pure and stays unclassified. The actual
469    // frame read/write happens via methods on the returned Sender/Receiver (trait-object dispatch the
470    // syntactic backend can't resolve), so the channel-open call is the precise Net boundary. (Found
471    // scanning bandwhich — a packet sniffer — which reported Net 0.)
472    if crate_name == "pnet" || crate_name == "pnet_datalink" || crate_name == "pnet_transport" {
473        if path.ends_with("::channel") || path.ends_with("::transport_channel") {
474            return Some("Net");
475        }
476        return None;
477    }
478    // Directory traversal — `ignore` (BurntSushi's gitignore-aware walker; powers ripgrep, fd). The walk
479    // EXECUTORS read the directory tree from disk = Fs. Type-precise on purpose: the configuration builders
480    // (`OverrideBuilder::build`, `GitignoreBuilder::build`, the `WalkBuilder` setters) and `DirEntry`
481    // accessors are PURE — only `WalkBuilder::build`/`build_parallel` (which kick off the walk) and
482    // `WalkParallel::run` (which drives it) touch the filesystem. A bare `build` would wrongly flag the
483    // config builders. (Found scanning fd — a file finder — which reported Fs 2: its own `fs::read_dir`
484    // was caught, but the `ignore`-based traversal that IS fd was invisible cross-crate.)
485    if crate_name == "ignore" {
486        if path == "ignore::WalkBuilder::build"
487            || path == "ignore::WalkBuilder::build_parallel"
488            || path.ends_with("::WalkParallel::run")
489        {
490            return Some("Fs");
491        }
492        return None;
493    }
494    // Filesystem watching — `notify` (the de-facto fs-watch crate: watchexec, cargo-watch, mdbook). A
495    // watcher opens an OS notification handle (inotify / FSEvents / kqueue / ReadDirectoryChanges) and
496    // registers paths — observing filesystem state changes = Fs. The lifecycle boundary: any
497    // `*Watcher::new` constructor (RecommendedWatcher/PollWatcher/INotifyWatcher/FsEventWatcher/…), the
498    // `recommended_watcher` convenience fn, and the `watch`/`unwatch` registration verbs. `Config`/`Event`/
499    // `EventKind` data types stay pure. (Found scanning watchexec: its watcher-`create` read Fs 0.)
500    if crate_name == "notify" {
501        if path.ends_with("Watcher::new")
502            || path.ends_with("::recommended_watcher")
503            || path.ends_with("::watch")
504            || path.ends_with("::unwatch")
505        {
506            return Some("Fs");
507        }
508        return None;
509    }
510    // Raw sockets. Match the I/O *types* only — `std::net` also holds pure data types
511    // (SocketAddr, IpAddr, …) whose construction must NOT be flagged.
512    if path.starts_with("std::net::TcpStream")
513        || path.starts_with("std::net::TcpListener")
514        || path.starts_with("std::net::UdpSocket")
515        || path.starts_with("tokio::net::")
516    {
517        return Some("Net");
518    }
519    // Legacy tokio 0.1 socket crates — `tokio_tcp`/`tokio_udp` are *entirely* networking
520    // (no pure types to over-flag), so the whole crate is Net. (Found hardening on websocat,
521    // which is still on tokio 0.1: its `tokio_tcp::TcpStream::connect` was classified
522    // network-free — a network tool confidently reporting 0 Net.)
523    if matches!(crate_name, "tokio_tcp" | "tokio_udp") {
524        return Some("Net");
525    }
526    // The other async runtimes mirror tokio's module layout, and their `net` modules hold only
527    // socket I/O types (the pure `SocketAddr`/`IpAddr` are re-exports that resolve to `std::net`,
528    // so they're excluded by def-path). `mio` is the low-level non-blocking-socket layer under
529    // tokio/others; `async_net` is smol's net crate. Closes the async-std/smol/mio gap the
530    // tokio_tcp note flagged. (Calibrated by module structure — these crates ARE networking — not
531    // a live repro; the TCP/UDP types are defined in-crate so the def-path prefix is exact.)
532    if path.starts_with("async_std::net::")
533        || path.starts_with("mio::net::")
534        || crate_name == "async_net"
535    {
536        return Some("Net");
537    }
538    // Database clients. Like the AWS/HTTP builders, only the execution verbs are I/O;
539    // query *construction* is pure. Best-effort across crates (tune via CANDOR_CONFIG).
540    // Note: bare `::query` is deliberately omitted — it executes in postgres/rusqlite but
541    // only *builds* in sqlx, so including it would false-positive sqlx's `query()` builder.
542    if DB_CRATES.contains(&crate_name) {
543        // Postgres / SQLite-family clients: `query`/`batch_execute`/`prepare`/etc. ARE the
544        // execution (round-trips to the server). sqlx is the outlier where bare `query()`
545        // only BUILDS — it keeps the narrow set below. (Found by running on a real
546        // tokio-postgres app, pgman: candor had reported only 4 of ~20 DB call sites.)
547        if matches!(crate_name, "postgres" | "tokio_postgres" | "deadpool_postgres" | "rusqlite") {
548            const PG: [&str; 19] = [
549                "::query", "::query_one", "::query_opt", "::query_raw", "::execute",
550                "::batch_execute", "::simple_query", "::prepare", "::prepare_typed",
551                "::copy_in", "::copy_out", "::transaction", "::connect",
552                // rusqlite's dialect of the same verbs (a verb-probe found the CANONICAL rusqlite
553                // consumer API classifying pure): `query_row` is the one-row read, `query_map`/
554                // `query_and_then` the many-row reads, `execute_batch` is rusqlite's name for
555                // batch_execute, `prepare_cached` round-trips like prepare. `query_typed` is
556                // tokio_postgres 0.7.10+.
557                "::query_row", "::query_map", "::query_and_then", "::execute_batch",
558                "::prepare_cached", "::query_typed",
559            ];
560            if PG.iter().any(|v| path.ends_with(v)) {
561                return Some("Db");
562            }
563            // rusqlite only: opening the database IS the connection establishment (`Connection::
564            // open`/`open_in_memory`/`open_with_flags` — the embedded analog of `::connect`).
565            if crate_name == "rusqlite"
566                && (path.ends_with("::open")
567                    || path.ends_with("::open_in_memory")
568                    || path.ends_with("::open_with_flags"))
569            {
570                return Some("Db");
571            }
572            return None;
573        }
574        // redis: the way redis is ACTUALLY used is the high-level `Commands`/`AsyncCommands`
575        // traits (`con.get`/`set`/`hset`/`lpush`/…) — every method is a round-trip — plus
576        // connection establishment. The shared VERBS below only catch the low-level
577        // `cmd("GET").query(con)`, so without this a normal redis user's calls classify as
578        // PURE. (Found hardening on redis-rs: a fn doing `con.get`/`set` reported no effects.)
579        if crate_name == "redis"
580            && (path.contains("Commands::")
581                || path.contains("::get_connection")
582                || path.contains("::get_async_connection")
583                || path.contains("::get_multiplexed_async_connection")
584                || path.contains("ConnectionManager")
585                || path.ends_with("::query")
586                || path.ends_with("::query_async")
587                || path.ends_with("::req_command")
588                || path.ends_with("::req_packed_command")
589                || path.ends_with("::req_packed_commands"))
590        {
591            return Some("Db");
592        }
593        // mongodb: a document-store API with none of the SQL verbs — the user calls
594        // `coll.find_one`/`insert_one`/`aggregate`/… and `Client::with_uri_str`. Without
595        // these a mongodb user's calls classify PURE. (Found hardening: a fn doing
596        // `find_one`+`insert_one` reported no effects.) Handle accessors (name/namespace)
597        // and option/doc builders don't match these verbs, so they stay pure.
598        if crate_name == "mongodb" {
599            const MONGO: [&str; 27] = [
600                "::with_uri_str", "::connect", "::find", "::find_one", "::insert_one",
601                "::insert_many", "::update_one", "::update_many", "::delete_one",
602                "::delete_many", "::replace_one", "::aggregate", "::count_documents",
603                "::estimated_document_count", "::count", "::distinct", "::run_command",
604                "::find_one_and_update", "::find_one_and_delete", "::find_one_and_replace",
605                "::list_collections", "::list_collection_names", "::list_databases",
606                "::list_database_names", "::create_collection", "::create_index", "::watch",
607            ];
608            if MONGO.iter().any(|v| path.ends_with(v)) {
609                return Some("Db");
610            }
611            return None;
612        }
613        // mysql / mysql_async: the `query`/`exec` families + `get_conn`/`ping` execute
614        // immediately — no build-then-execute split like sqlx, so matching `::query` is safe
615        // here. Same DB-verb-dialect gap class as redis/mongodb; calibrated from the Queryable
616        // API (unit-tested; a real-app repro is the remaining confirmation).
617        if matches!(crate_name, "mysql" | "mysql_async") {
618            const MY: [&str; 16] = [
619                "::query", "::query_first", "::query_iter", "::query_map", "::query_fold",
620                "::query_drop", "::exec", "::exec_first", "::exec_iter", "::exec_map",
621                "::exec_fold", "::exec_drop", "::exec_batch", "::prep", "::ping", "::get_conn",
622            ];
623            if MY.iter().any(|v| path.ends_with(v)) {
624                return Some("Db");
625            }
626            return None;
627        }
628        // sea_orm: an ORM whose execution is split from building (like sqlx). The query
629        // BUILDERS (`Entity::find`, `Entity::insert`) are pure; execution happens at `.all`/
630        // `.one`/`.count`/`.stream` and `Insert/Update/Delete::exec`. The write path via an
631        // ActiveModel (`model.insert(db)`) executes too — distinguished from the `EntityTrait`
632        // builder by the trait in the path (`ActiveModelTrait::`). (Found hardening on a
633        // sea_orm consumer app: `.all(db)` reads and `ActiveModel::insert` writes were pure.)
634        if crate_name == "sea_orm" {
635            if path.ends_with("::all")
636                || path.ends_with("::one")
637                || path.ends_with("::count")
638                || path.ends_with("::stream")
639                || path.ends_with("::exec")
640                || path.ends_with("::exec_with_returning")
641                || path.ends_with("::exec_without_returning")
642                || path.ends_with("::connect")
643                || path.ends_with("::execute")
644                || path.ends_with("::execute_unprepared")
645                || path.ends_with("::query_one")
646                || path.ends_with("::query_all")
647                || path.ends_with("::fetch_page")
648                || path.ends_with("::num_items")
649                || path.contains("ActiveModelTrait::")
650            {
651                return Some("Db");
652            }
653            return None;
654        }
655        // (Reached by sqlx + diesel — the build-vs-execute-split crates.) `first` is diesel's
656        // LIMIT-1 round trip and `load_iter` its 2.x streaming execution; `fetch_many` is sqlx's
657        // multi-result stream. All crate-gated, so a std `Vec::first` never resolves here.
658        const VERBS: [&str; 19] = [
659            "::execute", "::query_row", "::query_map", "::query_one", "::fetch_one",
660            "::fetch_all", "::fetch_optional", "::fetch", "::fetch_many", "::connect",
661            "::acquire", "::begin", "::commit", "::rollback", "::load", "::load_iter",
662            "::first", "::get_result", "::get_results",
663        ];
664        if VERBS.iter().any(|v| path.ends_with(v)) {
665            return Some("Db");
666        }
667        return None;
668    }
669    // std::path::Path / PathBuf STAT-family methods hit the filesystem (each is a stat/readlink/
670    // readdir syscall) — unlike the rest of the std::path surface, which is pure string manipulation
671    // (join/file_name/extension/parent/…). Verb-precise so the scanner's receiver inference can safely
672    // route a `path.symlink_metadata()` method call here. (A blackout screen caught gix-dir — an entire
673    // directory WALKER — reporting ZERO Fs because all its I/O is Path-method calls; same class as
674    // fd's residual `Path::symlink_metadata` under-report.)
675    if let Some(m) = path
676        .strip_prefix("std::path::Path::")
677        .or_else(|| path.strip_prefix("std::path::PathBuf::"))
678    {
679        const STAT: &[&str] = &[
680            "metadata", "symlink_metadata", "canonicalize", "read_link", "read_dir", "exists",
681            "try_exists", "is_file", "is_dir", "is_symlink",
682        ];
683        return STAT.contains(&m).then_some("Fs");
684    }
685    // Filesystem. `tokio::fs`/`async_std::fs` are the async mirrors of `std::fs`; `async_fs` is
686    // smol's fs crate; `fs_err` is a drop-in `std::fs` wrapper (its whole surface is fs I/O).
687    if path.starts_with("std::fs::")
688        || path.starts_with("tokio::fs::")
689        || path.starts_with("async_std::fs::")
690        || crate_name == "async_fs"
691        || crate_name == "fs_err"
692        || crate_name == "memmap2"
693    {
694        return Some("Fs");
695    }
696    // tempfile: creating a temp file/dir touches the disk. Match the create/persist verbs (the
697    // `Builder` setters — prefix/suffix/rand_bytes — stay pure). `persist`/`keep` rename/retain
698    // the file on disk; `close` removes it.
699    if crate_name == "tempfile"
700        && (path.ends_with("::tempfile")
701            || path.ends_with("::tempfile_in")
702            || path.ends_with("::tempdir")
703            || path.ends_with("::tempdir_in")
704            || path.ends_with("NamedTempFile::new")
705            || path.ends_with("NamedTempFile::new_in")
706            || path.ends_with("TempDir::new")
707            || path.ends_with("TempDir::new_in")
708            || path.ends_with("::persist")
709            || path.ends_with("::persist_noclobber")
710            || path.ends_with("::keep"))
711    {
712        return Some("Fs");
713    }
714    // glob: walks the filesystem to expand a pattern (the returned iterator reads directories).
715    // `Pattern::matches` is pure string matching — match only the directory-walking entry points.
716    if crate_name == "glob" && (path.ends_with("::glob") || path.ends_with("::glob_with")) {
717        return Some("Fs");
718    }
719    // Password-hashing / KDF crates — the entropy tier (the TS engine's CTA lesson: an invisible
720    // argon2 landed on exactly the call a security review cares about). In this engine's
721    // verb-precise style the ENTROPY is the salt mint: `SaltString::generate(OsRng)` in the
722    // password-hash API family, and bcrypt's `hash`/`hash_with_result` (salt minted internally).
723    // Verification and explicit-salt hashing are deterministic recomputation — pure. `rand_core`
724    // carries the OsRng source itself (otherwise the most common salt mint is invisible).
725    if matches!(crate_name, "argon2" | "scrypt" | "pbkdf2" | "password_hash") {
726        if path.contains("SaltString::generate") {
727            return Some("Rand");
728        }
729        return None;
730    }
731    if crate_name == "bcrypt" {
732        if path.ends_with("::hash") || path.ends_with("::hash_with_result") {
733            return Some("Rand");
734        }
735        return None;
736    }
737    if crate_name == "rand_core" {
738        if path.contains("OsRng")
739            || path.ends_with("::next_u32")
740            || path.ends_with("::next_u64")
741            || path.ends_with("::fill_bytes")
742        {
743            return Some("Rand");
744        }
745        return None;
746    }
747    // Randomness / entropy. `getrandom`/`fastrand` are effectful end-to-end. `rand` is NOT — it
748    // mixes entropy/generation (effectful) with *pure* distribution constructors (`Uniform::new`,
749    // `Normal::new`) and deterministic-seed constructors (`seed_from_u64`). Flagging the whole crate
750    // over-reported those as `Rand`; match only the calls that actually consume randomness — the
751    // entropy sources (`OsRng`, `thread_rng`/`rng`, `from_entropy`/`from_os_rng`) and the generation
752    // verbs (`gen*`/`random*`/`fill*`/`sample*`/`next_u*`). A `Uniform::new` is now correctly pure.
753    if crate_name == "getrandom" || crate_name == "fastrand" {
754        return Some("Rand");
755    }
756    if crate_name == "rand" {
757        let rng_verb = path.ends_with("::gen")
758            || path.ends_with("::gen_range")
759            || path.ends_with("::gen_bool")
760            || path.ends_with("::gen_ratio")
761            || path.ends_with("::random")
762            || path.ends_with("::random_range")
763            || path.ends_with("::random_bool")
764            || path.ends_with("::random_ratio")
765            || path.ends_with("::random_iter") // rand 0.9 iterator generator
766            || path.ends_with("::gen_iter")
767            || path.ends_with("::fill")
768            || path.ends_with("::fill_bytes")
769            || path.ends_with("::try_fill")
770            || path.ends_with("::try_fill_bytes")
771            || path.ends_with("::sample")
772            || path.ends_with("::sample_iter")
773            || path.ends_with("::next_u32")
774            || path.ends_with("::next_u64")
775            || path.ends_with("::thread_rng")
776            || path.ends_with("::rng")
777            || path.ends_with("::from_entropy")
778            || path.ends_with("::from_os_rng");
779        if rng_verb || path.contains("OsRng") {
780            return Some("Rand");
781        }
782        return None;
783    }
784    // Subprocess spawning. `tokio::process` is the async mirror of `std::process` — it exists
785    // only to spawn/control subprocesses (`Command`/`Child`, no pure data types like std's
786    // `Stdio`/`ExitStatus`/`exit`), so spawning through it is Exec just the same. Without this an
787    // async app's `tokio::process::Command::new(..).spawn()` classified pure — a silent under-report
788    // of subprocess execution, the dangerous direction (mirrors the tokio::fs/tokio::net coverage).
789    if path.starts_with("std::process::Command")
790        || path.starts_with("std::process::Child")
791        || path.starts_with("tokio::process::Command")
792        || path.starts_with("tokio::process::Child")
793        || path.starts_with("async_std::process::Command")
794        || path.starts_with("async_std::process::Child")
795        || crate_name == "async_process"
796        || crate_name == "portable_pty"
797    {
798        return Some("Exec");
799    }
800    // duct: a subprocess-orchestration crate. `cmd()`/`cmd!` only *build* an Expression; the
801    // spawn/wait happens at `run`/`read`/`start`. Match the execution verbs, not the builder.
802    if crate_name == "duct"
803        && (path.ends_with("::run")
804            || path.ends_with("::read")
805            || path.ends_with("::start")
806            || path.ends_with("::read_chars"))
807    {
808        return Some("Exec");
809    }
810    if path.starts_with("std::env::") {
811        return Some("Env");
812    }
813    // dotenvy / dotenv: load environment variables (reading a `.env` file and mutating the process
814    // environment). Match the load/read entry points; `Error`/builder types stay pure.
815    if matches!(crate_name, "dotenvy" | "dotenv")
816        && (path.ends_with("::dotenv")
817            || path.ends_with("::dotenv_override")
818            || path.ends_with("::from_path")
819            || path.ends_with("::from_path_override")
820            || path.ends_with("::from_filename")
821            || path.ends_with("::from_filename_override")
822            || path.ends_with("::from_read")
823            || path.ends_with("::from_read_override")
824            || path.ends_with("::load")
825            || path.ends_with("::var")
826            || path.ends_with("::vars"))
827    {
828        return Some("Env");
829    }
830    // Wall-clock reads. Match the `now` accessor precisely (ends_with), not any path
831    // containing the substring "now". The `time` crate (distinct from `std::time`/`chrono`)
832    // reads the clock via `now_utc`/`now_local` (and the deprecated `Instant::now`).
833    if (crate_name == "chrono" || path.starts_with("std::time::")) && path.ends_with("::now") {
834        return Some("Clock");
835    }
836    if crate_name == "time"
837        && (path.ends_with("::now_utc") || path.ends_with("::now_local") || path.ends_with("::now"))
838    {
839        return Some("Clock");
840    }
841    if crate_name == "tracing" {
842        return Some("Log");
843    }
844    // The `log` facade: its macros route through `log::__private_api`; the crate's types
845    // (`Level`, `LevelFilter`) are pure, so match the logging entry, not the whole crate.
846    if crate_name == "log" && path.contains("::__private_api") {
847        return Some("Log");
848    }
849    // Compiler diagnostic emission — the ONE genuinely effectful operation in the otherwise-pure
850    // rustc_* surface (a dylint lint's actual OUTPUT: it writes warnings/errors to the compiler's
851    // diagnostic sink). Classified `Log` (same family as `tracing`/`log` — program output). Match the
852    // emission verbs precisely; rustc_lint/rustc_errors are mostly pure types (Lint, LintId, the Diag
853    // BUILDERS), and only the terminal `emit`/`emit_span_lint` actually produces output.
854    if crate_name == "rustc_lint"
855        && (path.ends_with("::emit_span_lint")
856            || path.ends_with("::span_lint")
857            || path.ends_with("::span_lint_hir"))
858    {
859        return Some("Log");
860    }
861    if crate_name == "rustc_errors"
862        && (path.ends_with("::emit")
863            || path.ends_with("::emit_diagnostic")
864            || path.ends_with("::emit_now"))
865    {
866        return Some("Log");
867    }
868    if crate_name == "arboard" {
869        return Some("Clipboard");
870    }
871    None
872}
873
874pub fn cap_from_name(name: &str) -> Option<&'static str> {
875    EFFECTS.iter().copied().find(|e| *e == name)
876}
877
/// Refine the `Exec` cliff (spec §4 ⟨0.5⟩): report the effects implied by a *literal,
/// statically-known* subprocess head.
///
/// The head is matched by basename — everything after the last `/` or `\` — so `/usr/bin/curl`
/// resolves to `curl`. The returned effects are meant to be ADDED to a caller that already carries
/// `Exec` (a subprocess is still spawned, so `Exec` is never dropped). An unrecognised or
/// dynamically-built head returns `&[]` and keeps the bare cliff — never a guess.
///
/// Table rules:
/// * Only UNAMBIGUOUS single-effect tools are listed. A multi-modal head (`git status` is local,
///   `git push` is Net; `rsync` local-vs-remote) would FABRICATE the effect for its common case,
///   which the under-report rule forbids — such heads keep the bare cliff.
/// * INVARIANT: every head is an external tool that does NOT run the analysed project's own code,
///   so `make`/`npm`/`cargo` are deliberately absent.
/// * A **candor engine** reads `Fs`/`Env` only — spec §7 item 12 (the analyzer self-boundary)
///   guarantees that, so that row is spec-supplied rather than curated.
///
/// The reference engines share this table so the `Exec` boundary — the one boundary every engine
/// hits — refines identically (the §4-consistency argument).
pub fn classify_command_head(cmd: &str) -> &'static [&'static str] {
    // Network clients.
    const NET_HEADS: &[&str] = &["curl", "wget", "http", "ssh", "scp"];
    // Database command-line clients.
    const DB_HEADS: &[&str] = &["psql", "mysql", "sqlite3", "mongosh", "redis-cli"];
    // candor engines — Fs/Env only, guaranteed by spec §7 item 12 (the analyzer self-boundary).
    const ENGINE_HEADS: &[&str] = &[
        "candor",
        "candor-run.sh",
        "candor-scan",
        "candor-query",
        "candor-java",
        "candor-classify",
        "candor-report",
        "cargo-candor",
    ];

    // Basename: the text after the last path separator (both separators are one byte wide, so
    // `sep + 1` is always a valid char boundary). No separator means the whole string is the head.
    let head = match cmd.rfind(['/', '\\']) {
        Some(sep) => &cmd[sep + 1..],
        None => cmd,
    };

    if NET_HEADS.contains(&head) {
        &["Net"]
    } else if DB_HEADS.contains(&head) {
        &["Db"]
    } else if ENGINE_HEADS.contains(&head) {
        &["Env", "Fs"]
    } else {
        &[]
    }
}
901
/// Translate a cap-std capability *type* into the effect it authorises.
///
/// Holding one of these values (e.g. `&Dir`) is the real, unforgeable right to perform that
/// effect, so candor treats it as a declared capability — exactly like its own `&Fs` token.
///
/// Only crates whose name starts with `cap_` are considered; any other crate, and any type name
/// not in the table below, yields `None`.
pub fn capstd_cap(crate_name: &str, type_name: &str) -> Option<&'static str> {
    match (crate_name.starts_with("cap_"), type_name) {
        // Not a cap-std family crate: the type name is irrelevant.
        (false, _) => None,
        (true, "Dir") => Some("Fs"),
        (true, "TcpListener" | "TcpStream" | "UdpSocket" | "Pool") => Some("Net"),
        (true, "UnixListener" | "UnixStream" | "UnixDatagram") => Some("Ipc"),
        (true, "SystemClock" | "MonotonicClock") => Some("Clock"),
        (true, _) => None,
    }
}
917
/// Table names a SQL string literal STATICALLY reaches — the `Db` analog of the `Net` host /
/// `Exec` command / `Fs` path literal surface (feeds `allow Db in <scope> <table>…`, AS-EFF-008).
///
/// Deliberately conservative, because a wrong capture would FABRICATE a table:
/// * the first token must be a SQL statement keyword, otherwise nothing is returned;
/// * only identifiers in table position are taken — after `FROM`/`JOIN`/`INTO`/`TABLE` anywhere,
///   and after `UPDATE`/`TRUNCATE` only when they lead the statement (so `FOR UPDATE SKIP LOCKED`
///   never yields a table "skip");
/// * `ONLY` / `IF NOT EXISTS` / `TABLE` are skipped between the keyword and the identifier;
/// * a comma-adjacent list (`FROM t1, t2`) is followed; an alias or subquery ends the list.
///
/// A dynamically-built query yields nothing — the gate's opaque case — never a guess.
///
/// Output is lower-cased, stripped of double quotes and backticks, keeps `schema.table` qualified,
/// and is deduplicated in first-seen order.
///
/// SPEC §2 pins this algorithm token-for-token across engines; the cross-impl vector battery
/// (candor-spec conformance/tables/vectors.json, run.sh Part 4b) enforces the JVM/TS mirrors.
pub fn tables_in_sql(sql: &str) -> Vec<String> {
    // Keywords a string must open with to be treated as SQL at all.
    const STMT: &[&str] = &[
        "select", "insert", "update", "delete", "create", "drop", "alter", "truncate", "merge",
        "replace", "with",
    ];
    // Tokens that can FOLLOW a table-introducing keyword without being a table.
    const SKIP: &[&str] = &["only", "if", "not", "exists", "table"];
    // Identifier-position tokens that are grammar, not a table (subqueries, locking clauses…).
    const STOP: &[&str] = &[
        "select", "set", "where", "values", "on", "using", "group", "order", "by", "limit",
        "returning", "as", "inner", "outer", "left", "right", "cross", "lateral", "natural",
        "union", "all", "distinct", "case", "when", "null", "default", "skip", "nowait", "of",
        "from", "join", "into", "update", "delete", "insert",
    ];

    // Normalise: lower-case, turn parens/semicolons into spaces, and isolate every `,` as its OWN
    // token — that is what lets `FROM t1, t2` continue the table list without fabricating from
    // other comma-ridden positions (column lists, ON clauses).
    let lowered = sql.to_lowercase();
    let mut spaced = String::with_capacity(lowered.len());
    for ch in lowered.chars() {
        match ch {
            '(' | ')' | ';' => spaced.push(' '),
            ',' => spaced.push_str(" , "),
            other => spaced.push(other),
        }
    }
    let words: Vec<&str> = spaced.split_whitespace().collect();

    // Not SQL (or empty) — nothing to certify, nothing fabricated.
    match words.first() {
        Some(lead) if STMT.contains(lead) => {}
        _ => return Vec::new(),
    }

    // Accept a token as a table identifier: outer quotes trimmed, first char a letter or `_`,
    // remaining chars limited to identifier characters (plus `.` and inner quotes for qualified
    // names), and not a grammar word. Inner quotes/backticks are removed from the result.
    let as_table = |raw: &str| -> Option<String> {
        let bare = raw.trim_matches(|c: char| c == '"' || c == '`' || c == '\'');
        let leads_ok = matches!(bare.chars().next(), Some(c) if c.is_ascii_alphabetic() || c == '_');
        if !leads_ok {
            return None;
        }
        let body_ok = bare
            .chars()
            .all(|c| c.is_ascii_alphanumeric() || matches!(c, '_' | '.' | '$' | '"' | '`'));
        if !body_ok || STOP.contains(&bare) {
            return None;
        }
        Some(bare.replace(['"', '`'], ""))
    };

    let mut found: Vec<String> = Vec::new();
    for (pos, word) in words.iter().enumerate() {
        // `update`/`truncate` count only when statement-leading: `update t set …`,
        // `truncate [table] t`.
        let introduces_table = matches!(*word, "from" | "join" | "into" | "table")
            || (pos == 0 && matches!(*word, "update" | "truncate"));
        if !introduces_table {
            continue;
        }

        // Step over filler words between the keyword and the identifier.
        let mut at = pos + 1;
        while matches!(words.get(at), Some(w) if SKIP.contains(w)) {
            at += 1;
        }
        let Some(&candidate) = words.get(at) else { continue };
        let Some(head) = as_table(candidate) else { continue };
        if !found.contains(&head) {
            found.push(head);
        }

        // Comma-ADJACENT continuation only: `FROM t1, t2, t3` takes all three, while an alias
        // breaks the chain (`FROM t1 a, t2` keeps just t1 — an under-report, never a guess:
        // skipping an alias to chase the comma would fabricate tables out of
        // `INSERT INTO t (a, b)`'s column list, whose parens are spaces by the time we tokenize).
        while words.get(at + 1).copied() == Some(",") {
            let Some(&following) = words.get(at + 2) else { break };
            let Some(extra) = as_table(following) else { break };
            if !found.contains(&extra) {
                found.push(extra);
            }
            at += 2;
        }
    }
    found
}
1001
1002#[cfg(test)]
1003mod tests {
1004    #[test]
1005    fn sql_table_extraction_is_conservative() {
1006        use super::tables_in_sql as t;
1007        assert_eq!(t("SELECT id FROM users WHERE x = 1"), vec!["users"]);
1008        assert_eq!(t("select * from ledger.entries e join customers c on c.id = e.cid"),
1009                   vec!["ledger.entries", "customers"]);
1010        assert_eq!(t("INSERT INTO audit_log (a) VALUES (?1)"), vec!["audit_log"]);
1011        assert_eq!(t("UPDATE accounts SET v = ?"), vec!["accounts"]);
1012        assert_eq!(t("DELETE FROM sessions WHERE id = ?"), vec!["sessions"]);
1013        assert_eq!(t("CREATE TABLE IF NOT EXISTS cache (k TEXT)"), vec!["cache"]);
1014        assert_eq!(t("TRUNCATE TABLE staging"), vec!["staging"]);
1015        // FOR UPDATE locking clause must not yield a phantom table (mid-statement update ignored)
1016        assert_eq!(t("SELECT * FROM jobs FOR UPDATE SKIP LOCKED"), vec!["jobs"]);
1017        // a subquery in FROM position yields nothing for that position
1018        assert_eq!(t("SELECT * FROM (SELECT 1) q"), Vec::<String>::new());
1019        // not SQL -> nothing (never fabricate)
1020        assert_eq!(t("/tmp/some/path"), Vec::<String>::new());
1021        assert_eq!(t("hello world from nowhere"), Vec::<String>::new());
1022        // comma-ADJACENT continuation: a FROM list takes every table in the chain…
1023        assert_eq!(t("SELECT a FROM t1, t2, s.t3 WHERE x = 1"), vec!["t1", "t2", "s.t3"]);
1024        // …but an alias breaks it (under-report, never a guess)…
1025        assert_eq!(t("SELECT a FROM t1 a1, t2 WHERE x = 1"), vec!["t1"]);
1026        // …which is exactly what keeps a column list from fabricating (parens are spaces by now).
1027        assert_eq!(t("INSERT INTO t (a, b) VALUES (1, 2)"), vec!["t"]);
1028        // a subquery after the comma stops the chain too
1029        assert_eq!(t("SELECT a FROM t1, (SELECT 1) q"), vec!["t1"]);
1030    }
1031
1032    use super::*;
1033
1034    #[test]
1035    fn db_crates_are_calibrated() {
1036        // The calibrated set must cover every DB client the classifier knows, or the receipt's coverage
1037        // check would flag a recognized crate as a blind spot. (Was nightly-lint-only; now runs on stable.)
1038        for c in DB_CRATES {
1039            assert!(
1040                CALIBRATED_CRATES.contains(&c),
1041                "DB crate `{c}` is matched by classify() but missing from CALIBRATED_CRATES"
1042            );
1043        }
1044    }
1045
1046    #[test]
1047    fn calibrated_crates_are_live() {
1048        // Conversely, every crate advertised as calibrated must actually be matched by classify() for
1049        // some representative path — a dead entry would silently suppress a real coverage warning.
1050        for c in CALIBRATED_CRATES {
1051            assert!(
1052                CALIBRATION_PROBE_TAILS.iter().any(|t| classify(c, &format!("{c}{t}")).is_some()),
1053                "calibrated crate `{c}` is matched by no path in classify() — dead list entry"
1054            );
1055        }
1056    }
1057
1058    #[test]
1059    fn classify_core_effects() {
1060        // A representative smoke test of the classifier's main families, so the published crate is not
1061        // shipped untested (these used to live only in the nightly-only src/lib.rs).
1062        assert_eq!(classify("std", "std::fs::read_to_string"), Some("Fs"));
1063        // std::path stat-family methods are Fs (each is a stat/readdir syscall); the pure
1064        // string-manipulation surface stays unclassified (the blackout screen's gix-dir find).
1065        assert_eq!(classify("std", "std::path::Path::symlink_metadata"), Some("Fs"));
1066        assert_eq!(classify("std", "std::path::PathBuf::read_dir"), Some("Fs"));
1067        assert_eq!(classify("std", "std::path::Path::exists"), Some("Fs"));
1068        assert_eq!(classify("std", "std::path::Path::join"), None); // pure string manipulation
1069        assert_eq!(classify("std", "std::path::PathBuf::file_name"), None);
1070        assert_eq!(classify("std", "std::path::Path::parent"), None);
1071        assert_eq!(classify("std", "std::process::Command::new"), Some("Exec"));
1072        assert_eq!(classify("std", "std::env::var"), Some("Env"));
1073        assert_eq!(classify("reqwest", "reqwest::Client::execute"), Some("Net"));
1074        // one-shot convenience fns send immediately → Net; the `Client::get` builder stays pure.
1075        assert_eq!(classify("reqwest", "reqwest::get"), Some("Net"));
1076        assert_eq!(classify("reqwest", "reqwest::blocking::get"), Some("Net"));
1077        assert_eq!(classify("reqwest", "reqwest::Client::get"), None);
1078        assert_eq!(classify("reqwest", "reqwest::RequestBuilder::header"), None);
1079        // nix routes through the libc syscall table (same leaves): I/O classified, generic fd ops skipped.
1080        assert_eq!(classify("nix", "nix::fcntl::open"), Some("Fs"));
1081        assert_eq!(classify("nix", "nix::sys::socket::connect"), Some("Net"));
1082        assert_eq!(classify("nix", "nix::unistd::execvp"), Some("Exec"));
1083        assert_eq!(classify("nix", "nix::unistd::write"), None); // generic fd op — deliberately unclassified
1084        assert_eq!(classify("nix", "nix::unistd::getpid"), None); // not I/O
1085        // rustix does raw syscalls (no libc underneath) → classified directly by leaf, same table.
1086        assert_eq!(classify("rustix", "rustix::time::clock_settime"), Some("Clock"));
1087        assert_eq!(classify("rustix", "rustix::fs::symlink"), Some("Fs"));
1088        assert_eq!(classify("rustix", "rustix::net::connect"), Some("Net"));
1089        assert_eq!(classify("rustix", "rustix::io::read"), None); // generic fd op
1090        // pnet raw packet capture: channel openers are Net, packet construction stays pure.
1091        assert_eq!(classify("pnet", "pnet::datalink::channel"), Some("Net"));
1092        assert_eq!(classify("pnet", "pnet::transport::transport_channel"), Some("Net"));
1093        assert_eq!(classify("pnet_datalink", "pnet_datalink::channel"), Some("Net"));
1094        assert_eq!(classify("pnet", "pnet::packet::ethernet::EthernetPacket::new"), None);
1095        assert_eq!(classify("pnet_base", "pnet_base::MacAddr::new"), None);
1096        // ignore (gitignore-aware walker): walk executors are Fs, config builders stay pure.
1097        assert_eq!(classify("ignore", "ignore::WalkBuilder::build_parallel"), Some("Fs"));
1098        assert_eq!(classify("ignore", "ignore::WalkBuilder::build"), Some("Fs"));
1099        assert_eq!(classify("ignore", "ignore::WalkParallel::run"), Some("Fs"));
1100        assert_eq!(classify("ignore", "ignore::overrides::OverrideBuilder::build"), None); // pure config
1101        assert_eq!(classify("ignore", "ignore::gitignore::GitignoreBuilder::build"), None); // pure config
1102        assert_eq!(classify("ignore", "ignore::DirEntry::path"), None); // pure accessor
1103        // notify fs-watching: watcher constructors + watch/unwatch are Fs, data types stay pure.
1104        assert_eq!(classify("notify", "notify::RecommendedWatcher::new"), Some("Fs"));
1105        assert_eq!(classify("notify", "notify::PollWatcher::new"), Some("Fs"));
1106        assert_eq!(classify("notify", "notify::recommended_watcher"), Some("Fs"));
1107        assert_eq!(classify("notify", "notify::INotifyWatcher::watch"), Some("Fs"));
1108        assert_eq!(classify("notify", "notify::Config::default"), None); // pure config
1109        assert_eq!(classify("notify", "notify::Event::new"), None); // pure data type
1110        assert_eq!(classify("rusqlite", "rusqlite::Connection::execute"), Some("Db"));
1111        // the rusqlite verb DIALECT (a verb probe found the canonical consumer API classifying pure):
1112        assert_eq!(classify("rusqlite", "rusqlite::Connection::query_row"), Some("Db"));
1113        assert_eq!(classify("rusqlite", "rusqlite::Statement::query_map"), Some("Db"));
1114        assert_eq!(classify("rusqlite", "rusqlite::Connection::execute_batch"), Some("Db"));
1115        assert_eq!(classify("rusqlite", "rusqlite::Connection::prepare_cached"), Some("Db"));
1116        assert_eq!(classify("rusqlite", "rusqlite::Connection::open"), Some("Db"));
1117        assert_eq!(classify("rusqlite", "rusqlite::Connection::open_in_memory"), Some("Db"));
1118        // …but `open` stays rusqlite-only (postgres has no open; nothing else may borrow it):
1119        assert_eq!(classify("postgres", "postgres::Client::open"), None);
1120        assert_eq!(classify("tokio_postgres", "tokio_postgres::Client::query_typed"), Some("Db"));
1121        // diesel's LIMIT-1 + streaming executions; sqlx's multi-result stream:
1122        assert_eq!(classify("diesel", "diesel::RunQueryDsl::first"), Some("Db"));
1123        assert_eq!(classify("diesel", "diesel::RunQueryDsl::load_iter"), Some("Db"));
1124        assert_eq!(classify("sqlx", "sqlx::query::Query::fetch_many"), Some("Db"));
1125        // sqlx's bare `query()` builder must STAY pure (the original sqlx lesson):
1126        assert_eq!(classify("sqlx", "sqlx::query"), None);
1127        assert_eq!(classify("tracing", "tracing::event"), Some("Log"));
1128        // FFI tiers (matched by distinctive leaf, alias-independent)
1129        assert_eq!(classify("libc", "libc::open"), Some("Fs"));
1130        assert_eq!(classify("libc", "libc::connect"), Some("Net"));
1131        assert_eq!(classify("libc", "libc::read"), None); // generic fd op — deliberately unclassified
1132        assert_eq!(classify("ffi", "ffi::sqlite3_step"), Some("Db"));
1133        assert_eq!(classify("raw", "raw::git_remote_fetch"), Some("Net"));
1134        // libgit2 clone + submodule clone/update fetch over the network (an A/B on git2 0.20 caught
1135        // `Submodule::update`/`clone` and `Repository::clone` reporting no Net — the latter because the
1136        // `src/build.rs` module was being dropped as if it were the Cargo build script).
1137        assert_eq!(classify("raw", "raw::git_clone"), Some("Net"));
1138        assert_eq!(classify("raw", "raw::git_submodule_clone"), Some("Net"));
1139        assert_eq!(classify("raw", "raw::git_submodule_update"), Some("Net"));
1140        assert_eq!(classify("raw", "raw::git_submodule_open"), None); // local subrepo open — not Net
1141        // libcurl: the transfer/raw-socket entry points are Net (an A/B on curl 0.4 caught the whole
1142        // crate reporting ZERO Net); the big setopt/init/getinfo surface — and the readiness-wait
1143        // multi_wait/poll — stay unclassified (the loop's perform is the boundary).
1144        assert_eq!(classify("curl_sys", "curl_sys::curl_easy_perform"), Some("Net"));
1145        assert_eq!(classify("curl_sys", "curl_sys::curl_easy_send"), Some("Net"));
1146        assert_eq!(classify("curl_sys", "curl_sys::curl_multi_perform"), Some("Net"));
1147        assert_eq!(classify("curl_sys", "curl_sys::curl_multi_socket_action"), Some("Net"));
1148        assert_eq!(classify("curl_sys", "curl_sys::curl_easy_setopt"), None); // in-memory option write
1149        assert_eq!(classify("curl_sys", "curl_sys::curl_easy_init"), None); // handle alloc
1150        assert_eq!(classify("curl_sys", "curl_sys::curl_multi_wait"), None); // readiness wait, no payload
1151        // consumer-side `curl` crate rule: the dispatch verbs are Net, the setopt builders pure.
1152        assert_eq!(classify("curl", "curl::easy::Easy::perform"), Some("Net"));
1153        assert_eq!(classify("curl", "curl::multi::Multi::perform"), Some("Net"));
1154        assert_eq!(classify("curl", "curl::easy::Easy::send"), Some("Net"));
1155        assert_eq!(classify("curl", "curl::easy::Easy::url"), None); // CURLOPT setter — pure
1156        assert_eq!(classify("curl", "curl::easy::Easy::timeout"), None); // pure setter; Multi::timeout under-reported by design
1157        assert_eq!(classify("ffi", "ffi::SSL_connect"), Some("Net"));
1158        // pure crates stay pure
1159        assert_eq!(classify("serde", "serde::Serialize::serialize"), None);
1160        assert_eq!(classify("std", "std::vec::Vec::push"), None);
1161    }
1162
1163    #[test]
1164    fn command_head_refines_the_exec_cliff() {
1165        use super::classify_command_head as h;
1166        // unambiguous external tools classify by basename (spec §4 ⟨0.5⟩)
1167        assert_eq!(h("curl"), &["Net"]);
1168        assert_eq!(h("/usr/local/bin/psql"), &["Db"]); // basename match strips the path
1169        // a candor engine is Fs/Env — spec-SUPPLIED by §7 item 12, not curation
1170        assert_eq!(h("candor-scan"), &["Env", "Fs"]);
1171        assert_eq!(h("candor-run.sh"), &["Env", "Fs"]);
1172        // an unrecognised head adds nothing — the bare Exec cliff stands (never guess). `make`/`npm`
1173        // run the project's own code; `git`/`rsync` are multi-modal (local vs remote) — all keep the
1174        // cliff rather than fabricate an effect for the common case.
1175        assert_eq!(h("some-unknown-tool"), &[] as &[&str]);
1176        assert_eq!(h("make"), &[] as &[&str]);
1177        assert_eq!(h("npm"), &[] as &[&str]);
1178        assert_eq!(h("git"), &[] as &[&str]);
1179        assert_eq!(h("rsync"), &[] as &[&str]);
1180    }
1181}