Skip to main content

keyhog_core/
aws.rs

//! Offline AWS account-ID recovery and canary-token classification.
//!
//! This is the **single source of truth** for two credential-string-only facts
//! about an AWS access-key ID, shared by every keyhog crate (the scanner
//! attaches them as finding metadata without verifying; the verifier consults
//! the canary check to refuse tripping a canary on `--verify`). It lives in
//! `keyhog-core` — the one crate both `keyhog-scanner` and `keyhog-verifier`
//! depend on — so there is exactly one decode and one canary list, never a fork.
//!
//! 1. **Account decode.** Every modern AWS access-key ID (`AKIA…` long-term,
//!    `ASIA…` temporary STS) has the 12-digit account number mathematically
//!    embedded in it, recoverable with a pure base32-decode + bit-shift — NO
//!    network call, NO STS `GetCallerIdentity`, and it works on LIVE *and*
//!    revoked keys. Algorithm matches the trufflesecurity write-up
//!    <https://trufflesecurity.com/blog/research-uncovers-aws-account-numbers-hidden-in-access-keys>:
//!    drop the 4-char prefix; base32-decode the body; the first 6 decoded bytes
//!    are a big-endian u48; `account = (u48 & 0x7fff_ffff_ff80) >> 7`, rendered
//!    as a 12-digit zero-padded decimal string.
//!
//! 2. **Canary classification.** An access key whose decoded account belongs to
//!    a known canary issuer (canarytokens.org / Thinkst and off-brand clones) is
//!    a tripwire: any live verification alerts whoever planted it. The baseline
//!    issuer list is Tier-B data embedded from `data/aws-canary-accounts.toml`
//!    and unioned at first use with a runtime-extension file pointed to by
//!    `KEYHOG_AWS_CANARY_ACCOUNTS`. Baseline source:
//!    <https://trufflesecurity.com/blog/canaries>.
27
28use std::collections::{HashMap, HashSet};
29
/// Prefixes of the access-key IDs that carry an embedded 12-digit account
/// number: `AKIA` marks a long-term IAM key and `ASIA` a temporary STS session
/// key. The embedding is the same for both, so one decode routine serves both.
const AWS_KEY_ID_PREFIXES: [&str; 2] = ["AKIA", "ASIA"];

/// Byte length of a canonical AWS access-key ID: the 4-char prefix followed by
/// 16 base32 chars.
const AWS_KEY_ID_LEN: usize = 4 + 16;

/// Mask applied to the leading 6 decoded bytes (read as a big-endian u48) to
/// isolate the account number: it keeps bits 7..=46 and clears bit 47 plus the
/// low 7 bits. Per the trufflesecurity write-up, the low 7 bits are a
/// non-account discriminator and bit 47 is never part of the account.
const ACCOUNT_MASK: u64 = 0x7fff_ffff_ff80;

/// Right shift that discards the 7 masked-off low bits, leaving the account
/// number in the low bits of the result.
const ACCOUNT_SHIFT: u64 = 7;
43
/// Map one byte of the RFC-4648 standard base32 alphabet to its 5-bit value:
/// `A`-`Z` become 0-25 and `2`-`7` become 26-31.
///
/// Any other byte — lowercase letters, `=` padding, the digits 0/1/8/9,
/// non-ASCII — yields `None`, so a caller propagating it with `?` rejects a
/// malformed id as a whole.
#[inline]
fn base32_value(c: u8) -> Option<u8> {
    if c.is_ascii_uppercase() {
        // Letters occupy the first 26 slots of the alphabet.
        return Some(c - b'A');
    }
    if (b'2'..=b'7').contains(&c) {
        // The six digits follow the letters, starting at slot 26.
        return Some(26 + (c - b'2'));
    }
    None
}
55
56/// Recover the 12-digit AWS account ID embedded in an access-key ID, fully
57/// offline. Returns `None` when `key_id` is not a well-formed `AKIA…`/`ASIA…`
58/// access-key ID (wrong length, wrong prefix, or a non-base32 body), so a
59/// caller can blindly try every credential and only act on `Some`.
60///
61/// The returned string is always exactly 12 ASCII digits, zero-padded — AWS
62/// account numbers are 12-digit identifiers and the leading-zero form (e.g.
63/// `052310077262`) is the canonical rendering, matching the STS `Account`
64/// field and trufflehog's output.
65#[must_use]
66pub fn aws_account_from_key_id(key_id: &str) -> Option<String> {
67    let key_id = key_id.trim();
68    if key_id.len() != AWS_KEY_ID_LEN {
69        return None;
70    }
71    if !AWS_KEY_ID_PREFIXES
72        .iter()
73        .any(|p| key_id.as_bytes().starts_with(p.as_bytes()))
74    {
75        return None;
76    }
77
78    // The 16 base32 chars after the prefix encode 80 bits; we only need the
79    // leading 48 bits (first 6 bytes), which come from the first 10 base32
80    // chars (10 * 5 = 50 bits). Accumulate those 50 bits, then keep the top 48.
81    let body = &key_id.as_bytes()[4..];
82    let mut acc: u64 = 0;
83    for &c in &body[..10] {
84        let v = base32_value(c)?;
85        acc = (acc << 5) | u64::from(v);
86    }
87    // `acc` now holds 50 bits (the first 10 chars). The leading 48 bits are the
88    // u48 we want, so drop the low 2 bits.
89    let u48 = acc >> 2;
90    let account = (u48 & ACCOUNT_MASK) >> ACCOUNT_SHIFT;
91    Some(format!("{account:012}"))
92}
93
94/// The Tier-B baseline canary account list, compiled into the binary from
95/// `data/aws-canary-accounts.toml`, unioned at first use with any runtime
96/// extension file pointed to by `KEYHOG_AWS_CANARY_ACCOUNTS`.
97///
98/// Soft-fails to an empty set so a corrupted data file degrades canary
99/// awareness rather than crashing.
100static CANARY_ACCOUNTS: std::sync::LazyLock<HashSet<String>> = std::sync::LazyLock::new(|| {
101    let mut set = HashSet::new();
102    merge_canary_accounts(&mut set, include_str!("../data/aws-canary-accounts.toml"));
103    if let Ok(path) = std::env::var("KEYHOG_AWS_CANARY_ACCOUNTS") {
104        match std::fs::read_to_string(&path) {
105            Ok(raw) => merge_canary_accounts(&mut set, &raw),
106            Err(e) => tracing::warn!(
107                path = %path,
108                error = %e,
109                "KEYHOG_AWS_CANARY_ACCOUNTS points at an unreadable file; \
110                 using the compiled-in canary baseline only"
111            ),
112        }
113    }
114    set
115});
116
/// `[canary]`/`[knockoff]` TOML shape shared by the baseline and any runtime
/// extension file. Both tables are merged into the same account set — keyhog
/// treats off-brand knockoffs identically to first-party canaries.
///
/// Each table is optional: a document that omits one deserializes it to its
/// default (an empty account list).
#[derive(serde::Deserialize, Default)]
struct CanaryFile {
    // Accounts listed under the `[canary]` table.
    #[serde(default)]
    canary: CanaryTable,
    // Accounts listed under the `[knockoff]` table.
    #[serde(default)]
    knockoff: CanaryTable,
}
127
/// One table of a canary TOML document: a list of account-ID strings under the
/// `accounts` key. A table without that key deserializes to an empty list.
#[derive(serde::Deserialize, Default)]
struct CanaryTable {
    // Account IDs exactly as written in the file; trimming happens at merge time.
    #[serde(default)]
    accounts: Vec<String>,
}
133
134/// Parse one canary TOML document and union its accounts into `set`. Trims each
135/// account so whitespace in a hand-edited extension file never silently misses.
136fn merge_canary_accounts(set: &mut HashSet<String>, raw: &str) {
137    match toml::from_str::<CanaryFile>(raw) {
138        Ok(parsed) => {
139            for acct in parsed
140                .canary
141                .accounts
142                .into_iter()
143                .chain(parsed.knockoff.accounts)
144            {
145                let acct = acct.trim();
146                if !acct.is_empty() {
147                    set.insert(acct.to_string());
148                }
149            }
150        }
151        Err(e) => tracing::warn!(
152            error = %e,
153            "aws-canary-accounts.toml failed to parse; canary awareness disabled this run"
154        ),
155    }
156}
157
158/// True when `account_id` (a 12-digit AWS account string) belongs to a known
159/// canary-token issuer.
160#[must_use]
161pub fn account_is_canary(account_id: &str) -> bool {
162    CANARY_ACCOUNTS.contains(account_id)
163}
164
165/// True when `key_id` is a decodable AWS access-key ID whose offline-decoded
166/// account belongs to a known canary issuer. The verifier uses this to refuse
167/// sending a live probe (which would trip the canary) without re-implementing
168/// the decode.
169#[must_use]
170pub fn key_id_is_canary(key_id: &str) -> bool {
171    aws_account_from_key_id(key_id).is_some_and(|acct| account_is_canary(&acct))
172}
173
/// Note for operators, attached to a canary finding so the report explains why
/// verification was skipped. Mirrors trufflehog's responder message.
pub const CANARY_MESSAGE: &str = concat!(
    "AWS canary token (canarytokens.org / Thinkst-style). ",
    "Do NOT verify: a verification request alerts whoever planted it. ",
    "See https://trufflesecurity.com/canaries",
);
180
181/// Build the offline metadata for an AWS-access-key finding: always
182/// `{ "account_id": "<12 digits>" }` for a decodable `AKIA…`/`ASIA…` key, plus
183/// `{ "is_canary": "true", "canary_message": <note> }` when the decoded account
184/// belongs to a known canary issuer. `None` when `credential` is not a
185/// well-formed AWS access-key ID.
186///
187/// The `HashMap<String, String>` shape lets a [`crate::VerifiedFinding`]'s
188/// `metadata` absorb it directly, with no verify and no network.
189#[must_use]
190pub fn finding_metadata(credential: &str) -> Option<HashMap<String, String>> {
191    let account_id = aws_account_from_key_id(credential)?;
192    let is_canary = account_is_canary(&account_id);
193    let mut meta = HashMap::new();
194    meta.insert("account_id".to_string(), account_id);
195    if is_canary {
196        meta.insert("is_canary".to_string(), "true".to_string());
197        meta.insert("canary_message".to_string(), CANARY_MESSAGE.to_string());
198    }
199    Some(meta)
200}