ppoppo-token 0.2.0

//! Domain claim attack-surface checks — M39-M45 (ppoppo-specific).
//!
//! Mirror of `check_claims` but for ppoppo's domain extensions to the
//! RFC-registered claim set. The split is by SOURCE OF AUTHORITY:
//!
//! - `check_claims` enforces RFC 8725 / 9068 / 7519 (industry-standard
//!   registered claims — exp/iat/nbf/aud/iss/jti/sub/client_id/cat).
//! - `check_domain` (this module) enforces ppoppo's own contract — claim
//!   shapes the standards never mention (`account_type`, `caps`,
//!   `delegator`, `dlg_depth`, `admin`, `scopes`, `cid`, `sid`, `sv`,
//!   `active_ppnum`) plus a strict allowlist that locks out PII (M45).
//!
//! ── Deep module shape ────────────────────────────────────────────────────
//!
//! Single `pub(crate) fn run` entry point — every M-row check fires from
//! this one call site. Internal helpers stay private. Callers (`engine/
//! mod.rs::verify`) see one function; the implementation hides the seven
//! row-by-row enforcements. Future M-rows append helper calls inside
//! `run`; the outward shape never changes.
//!
//! ── Order of operations ─────────────────────────────────────────────────
//!
//! Cheaper structural rejects fire before semantic rules so an attacker
//! probing the surface sees the same audit signal regardless of which
//! later rule would also fire. Specific variant first, generic
//! `UnknownClaim` (M45) last — the allowlist is the catch-net.

use crate::access_token::{AuthError, Claims, VerifyConfig};
use crate::engine::raw::parse_payload_json;

/// Enforce ppoppo's domain-claim rules (M39-M45) and return `claims`
/// with the surfaced domain fields filled in.
///
/// # Arguments
///
/// - `token` — the raw compact token. Its payload is re-parsed through
///   `parse_payload_json` because several domain claims are not carried
///   on the incoming `Claims` value.
/// - `claims` — the partially-built claims (registered fields already
///   populated by the caller). Taken and returned by value so all
///   mutation stays inside this function.
/// - `_cfg` — currently unused; kept so the call shape does not change
///   when later rows need configuration.
///
/// # Errors
///
/// Rejections fire in this fixed order, and the first failure wins:
///
/// 1. `AuthError::SubFormatInvalid` — `sub` is not a ULID (M39).
/// 2. Whatever `parse_payload_json` returns for an unreadable payload.
/// 3. `AuthError::AccountTypeInvalid` — `account_type` present but not
///    the string `"human"` or `"ai_agent"` (M40).
/// 4. `AuthError::CapsShapeInvalid` — `caps` present but not an array of
///    strings (M41).
/// 5. `AuthError::AdminBandRejected` — `admin` is not a bool, or it is
///    `true` without a string `active_ppnum` inside the admin band (M44).
/// 6. `AuthError::DlgDepthInvalid` — `dlg_depth` present but not a
///    non-negative integer ≤ `MAX_DLG_DEPTH` (M43).
/// 7. `AuthError::ScopesShapeInvalid` / `AuthError::ScopesTooLong` —
///    `scopes` malformed, or well-formed but longer than `MAX_SCOPES`
///    (M42).
/// 8. `AuthError::UnknownClaim(name)` — a payload key outside
///    `ALLOWED_CLAIMS` (M45). Deliberately last so the typed rejections
///    above produce the more precise audit signal.
pub(crate) fn run(
    token: &str,
    mut claims: Claims,
    _cfg: &VerifyConfig,
) -> Result<Claims, AuthError> {
    // M39 — the subject has to parse as a ULID. Delegating to the `ulid`
    // crate's own parser covers both the length and the alphabet rule in
    // a single call and avoids a hand-maintained pattern. This check
    // needs no payload, so it runs before the re-parse.
    let sub_is_ulid = ulid::Ulid::from_string(&claims.sub).is_ok();
    if !sub_is_ulid {
        return Err(AuthError::SubFormatInvalid);
    }

    let payload = parse_payload_json(token)?;

    // M40 — `account_type` is optional (absence is admitted for legacy
    // tokens), but when it is on the wire it must be one of exactly two
    // strings. Any other type or spelling is rejected.
    match payload.get("account_type").map(|raw| raw.as_str()) {
        None => {}
        Some(Some(kind @ ("human" | "ai_agent"))) => {
            claims.account_type = Some(kind.to_owned());
        }
        Some(_) => return Err(AuthError::AccountTypeInvalid),
    }

    // M41 — `caps` is optional; when present it must be an array whose
    // elements are all strings. Only the wire shape is validated here.
    // A missing claim leaves `claims.caps` as the caller supplied it.
    if let Some(caps) = payload
        .get("caps")
        .map(|raw| parse_string_array(raw, AuthError::CapsShapeInvalid))
        .transpose()?
    {
        claims.caps = caps;
    }

    // M44 — admin band gate. A missing `admin` claim means "not admin";
    // a present one must be a JSON bool. When it is `true`, a string
    // `active_ppnum` inside the admin band is mandatory.
    let is_admin = payload
        .get("admin")
        .map(|flag| flag.as_bool().ok_or(AuthError::AdminBandRejected))
        .transpose()?
        .unwrap_or(false);
    // A non-string `active_ppnum` reads as absent here: fatal for admin
    // tokens (below), silently dropped for everyone else.
    let active_ppnum = payload.get("active_ppnum").and_then(|raw| raw.as_str());
    if is_admin {
        match active_ppnum {
            Some(ppnum) if is_in_admin_band(ppnum) => {}
            _ => return Err(AuthError::AdminBandRejected),
        }
    }
    claims.admin = is_admin;
    claims.active_ppnum = active_ppnum.map(str::to_owned);

    // M43 — `dlg_depth` is optional; when present it must be a
    // non-negative JSON integer no larger than `MAX_DLG_DEPTH`.
    // `as_u64` yields `None` for strings, floats and negatives, so one
    // pattern covers every malformed shape. The value is validated but
    // intentionally not copied onto `Claims`.
    if let Some(raw_depth) = payload.get("dlg_depth") {
        let within_bound = matches!(raw_depth.as_u64(), Some(depth) if depth <= MAX_DLG_DEPTH);
        if !within_bound {
            return Err(AuthError::DlgDepthInvalid);
        }
    }

    // M42 — `scopes` follows the same array-of-strings contract as
    // `caps`, plus a length ceiling. Shape is checked before length so
    // the two failure modes map to distinct error variants.
    if let Some(raw_scopes) = payload.get("scopes") {
        let parsed = parse_string_array(raw_scopes, AuthError::ScopesShapeInvalid)?;
        if parsed.len() > MAX_SCOPES {
            return Err(AuthError::ScopesTooLong);
        }
        claims.scopes = parsed;
    }

    // Pass-through claims: `delegator`, `cid` and `sid` are surfaced for
    // post-verify consumers. No format validation is applied — a value
    // that is missing OR not a JSON string simply becomes `None`; it is
    // not rejected.
    let optional_text = |name: &str| -> Option<String> {
        payload
            .get(name)
            .and_then(|raw| raw.as_str())
            .map(str::to_owned)
    };
    claims.delegator = optional_text("delegator");
    claims.cid = optional_text("cid");
    claims.sid = optional_text("sid");

    // M45 — allowlist catch-net. Runs last, after every typed check has
    // passed. The first payload key (in the map's iteration order) that
    // is not in `ALLOWED_CLAIMS` is reported by name so audit logs show
    // which claim tripped the rejection. If the payload is not a JSON
    // object there are no keys to inspect and this step is a no-op.
    let unknown = payload.as_object().and_then(|fields| {
        fields
            .keys()
            .find(|name| !ALLOWED_CLAIMS.contains(&name.as_str()))
    });
    if let Some(name) = unknown {
        return Err(AuthError::UnknownClaim(name.clone()));
    }

    Ok(claims)
}

/// Maximum number of `scopes` entries (M42). The 256 bound comes from
/// RFC §6.5; raising it requires a coordinated update with PAS issuance
/// (which already hard-caps at the same value via API limits) so the
/// constant is named here rather than inlined.
///
/// The bound is inclusive: `run` rejects with
/// `AuthError::ScopesTooLong` only when `scopes.len() > MAX_SCOPES`, so
/// a token carrying exactly 256 scopes is admitted. Typed `usize` so it
/// compares directly against `Vec::len`.
const MAX_SCOPES: usize = 256;

/// M45 PII allowlist — every claim name PAS issuance is permitted to
/// emit. Anything outside this set is a forgery / smuggling /
/// stale-PII signal and is rejected with `AuthError::UnknownClaim`.
///
/// Matching is an exact, case-sensitive comparison of the payload's
/// top-level key against these strings (a linear `contains` scan in
/// `run`); there is no prefix or pattern matching.
///
/// Adding a new claim is a 4-step change (in this order):
/// 1. Append the wire name here.
/// 2. Add the field to `IssueRequest` + `with_*` builder.
/// 3. Add the field to `IssuePayload` with the right
///    `skip_serializing_if`.
/// 4. Surface (or hide) on `Claims` per the Phase 2 Decision 1 rule
///    ("only surface what callers legitimately need post-verify").
///
/// Step 1 is the verify-side half: without it, `run` rejects any token
/// that contains the new claim. Steps 2-3 are the issue-side half (those
/// types live outside this module): without them `issue` cannot emit
/// the claim in the first place.
const ALLOWED_CLAIMS: &[&str] = &[
    // Registered (RFC 7519 + 9068)
    "iss",
    "sub",
    "aud",
    "exp",
    "iat",
    "nbf",
    "jti",
    "client_id",
    "cat",
    // Domain (Phase 4 — M40+). `sid` is the M36 session id that `run`
    // surfaces for session revocation; `sv` is admitted here but not
    // otherwise inspected by `run`.
    "account_type",
    "admin",
    "caps",
    "delegator",
    "dlg_depth",
    "cid",
    "sv",
    "sid",
    "active_ppnum",
    "scopes",
];

/// Maximum delegation chain depth (M43). The inclusive `dlg_depth = 4`
/// bound matches RFC §6.5 — past four hops the audit trail explodes
/// faster than the legitimate use cases can justify.
///
/// `run` rejects with `AuthError::DlgDepthInvalid` only when the claim
/// is strictly greater than this value, so depths 0 through 4 pass.
/// Typed `u64` to compare directly against the result of
/// `serde_json::Value::as_u64`.
const MAX_DLG_DEPTH: u64 = 4;

/// Admin allocation band — first 3 digits of an admin-eligible
/// `active_ppnum` fall in `[ADMIN_BAND_START, ADMIN_BAND_END]`. Phase 4
/// hardcodes `[100, 109]` (matching RFC §6.5 / STANDARDS_AUTH_PPOPPO
/// §11.x); Phase 5+ migration to `VerifyConfig::admin_bands` is tracked
/// in the Phase 5 NEXT_PROMPT. **STANDARDS line 73** ("the code does not
/// know the prefix values") favours the cfg-driven shape long-term — the
/// constant lives here only because the band is policy-stable today and
/// the cfg surface adds a public field that Phase 5 can land surgically.
///
/// Both ends are inclusive. Typed `u16` to match the 3-digit prefix
/// that `is_in_admin_band` parses with `parse::<u16>()`.
const ADMIN_BAND_START: u16 = 100;
const ADMIN_BAND_END: u16 = 109;

/// Report whether the leading three digits of `active_ppnum` land in
/// the inclusive admin band `[ADMIN_BAND_START, ADMIN_BAND_END]`.
///
/// The whole string must consist of ASCII digits — the token is
/// expected to carry the digit-only storage form, so a display form
/// such as `123-1234-5678` (or any non-ASCII digit look-alike) returns
/// `false`. Strings shorter than three characters also return `false`.
///
/// NOTE(review): the storage form is described elsewhere as
/// `^[0-9]{11,}$`, but only "all digits, at least three of them" is
/// enforced here — a bare `"100"` is reported as in-band. Confirm
/// whether the minimum length should be checked at this layer.
fn is_in_admin_band(active_ppnum: &str) -> bool {
    // Reject anything that is not purely ASCII digits up front. This
    // also guarantees the 3-byte prefix below sits on a char boundary.
    if !active_ppnum.bytes().all(|b| b.is_ascii_digit()) {
        return false;
    }

    // `get(..3)` is `None` for inputs shorter than three bytes; three
    // ASCII digits always fit in a `u16`, so the parse cannot fail for
    // any input that reaches it.
    let band = active_ppnum
        .get(..3)
        .and_then(|prefix| prefix.parse::<u16>().ok());

    matches!(band, Some(b) if (ADMIN_BAND_START..=ADMIN_BAND_END).contains(&b))
}

/// Parse a JSON value as an array of strings, mapping any wire-shape
/// failure to the supplied variant. Used by both `caps` (M41) and
/// `scopes` (M42) — they share the array-of-strings contract but get
/// distinct audit variants because their threat models differ
/// (capability confusion vs scope confusion).
///
/// # Arguments
///
/// - `value` — the raw JSON value found under the claim name.
/// - `on_invalid` — the error to return when `value` is not an array or
///   any element is not a string.
///
/// # Returns
///
/// The elements as owned `String`s, in wire order. An empty JSON array
/// yields an empty `Vec`.
///
/// # Errors
///
/// Returns `on_invalid` on the first shape violation. The error value
/// is moved into that single return, so the success path performs no
/// error construction or cloning at all.
fn parse_string_array(
    value: &serde_json::Value,
    on_invalid: AuthError,
) -> Result<Vec<String>, AuthError> {
    let items = match value.as_array() {
        Some(items) => items,
        None => return Err(on_invalid),
    };

    // Length is known up front, so allocate once.
    let mut out = Vec::with_capacity(items.len());
    for item in items {
        match item.as_str() {
            Some(text) => out.push(text.to_owned()),
            // Returning here ends the loop, so moving `on_invalid` is fine.
            None => return Err(on_invalid),
        }
    }
    Ok(out)
}

/// Unit tests for the domain checker. They call `run` directly with a
/// hand-built `Claims` value and an unsigned, forged token, so they
/// exercise only this module's rules (no signature or registered-claim
/// verification is involved).
#[cfg(test)]
// Test fixtures are built from literals; an `unwrap` failure here is a
// bug in the test itself, not a recoverable condition.
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// Verification config shared by every test; `run` currently
    /// ignores it.
    fn cfg() -> VerifyConfig {
        VerifyConfig::access_token("https://accounts.ppoppo.com", "ppoppo")
    }

    /// Build a `Claims` value with plausible registered fields, empty
    /// domain fields, and the supplied `sub`.
    fn claims_with_sub(sub: &str) -> Claims {
        Claims {
            iss: "https://accounts.ppoppo.com".to_string(),
            sub: sub.to_string(),
            exp: 9_999_999_999,
            iat: 1_700_000_000,
            nbf: None,
            jti: "01HABC00000000000000000000".to_string(),
            client_id: "ppoppo-internal".to_string(),
            account_type: None,
            caps: Vec::new(),
            scopes: Vec::new(),
            admin: false,
            active_ppnum: None,
            delegator: None,
            cid: None,
            sid: None,
        }
    }

    /// Forge a JWS Compact payload with the supplied JSON and a
    /// throwaway header + sig. The M39 ULID check operates on
    /// `Claims.sub` directly, so the wire bytes don't matter for the
    /// sub-only tests below — but M40+ tests need a real payload to
    /// re-parse. Sharing the helper keeps the unit tests independent
    /// of the integration-level signed-token forger.
    fn forge_payload(payload: serde_json::Value) -> String {
        use base64::Engine;
        let header = base64::engine::general_purpose::URL_SAFE_NO_PAD.encode(
            serde_json::to_vec(&serde_json::json!({"alg":"EdDSA","typ":"at+jwt","kid":"k"}))
                .unwrap(),
        );
        let body = base64::engine::general_purpose::URL_SAFE_NO_PAD
            .encode(serde_json::to_vec(&payload).unwrap());
        format!("{header}.{body}.<sig>")
    }

    /// Minimal payload containing only allowlisted registered claims,
    /// with the supplied `sub`.
    fn payload_with_sub(sub: &str) -> serde_json::Value {
        serde_json::json!({
            "iss": "https://accounts.ppoppo.com",
            "sub": sub,
            "aud": "ppoppo",
            "exp": 9_999_999_999i64,
            "iat": 1_700_000_000i64,
            "jti": "01HABC00000000000000000000",
            "client_id": "ppoppo-internal",
            "cat": "access",
        })
    }

    #[test]
    fn accepts_valid_ulid_sub() {
        // 01HSAB... uses only Crockford-base32 chars (S A B all valid).
        // The earlier scaffold sometimes typed `01HSUB...` because "SUB"
        // is the spelled-out concept — but `U` is *excluded* from
        // Crockford to avoid look-alike confusion with `V`. Using a real
        // ULID stops that footgun.
        let claims = claims_with_sub("01HSAB00000000000000000000");
        let token = forge_payload(payload_with_sub("01HSAB00000000000000000000"));
        assert!(run(&token, claims, &cfg()).is_ok());
    }

    #[test]
    fn rejects_too_short_sub() {
        let claims = claims_with_sub("00000000000"); // 11 digits — old ppnum
        let token = forge_payload(payload_with_sub("00000000000"));
        assert_eq!(
            run(&token, claims, &cfg()),
            Err(AuthError::SubFormatInvalid),
        );
    }

    #[test]
    fn rejects_non_crockford_alphabet() {
        // 'I' is excluded from Crockford base32 (collides with '1').
        // Every other character in this vector is a valid Crockford
        // symbol, so the rejection can only be attributed to the 'I'.
        // (The previous vector spelled `...SUB...`, whose 'U' is itself
        // excluded and would have masked an 'I'-accepting parser.)
        let claims = claims_with_sub("I1HSAB00000000000000000000");
        let token = forge_payload(payload_with_sub("I1HSAB00000000000000000000"));
        assert_eq!(
            run(&token, claims, &cfg()),
            Err(AuthError::SubFormatInvalid),
        );
    }

    #[test]
    fn rejects_crockford_excluded_u() {
        // Pins the `01HSUB...` footgun described in
        // `accepts_valid_ulid_sub`: correct length, and 'U' is the only
        // character outside the Crockford alphabet.
        let claims = claims_with_sub("01HSUB00000000000000000000");
        let token = forge_payload(payload_with_sub("01HSUB00000000000000000000"));
        assert_eq!(
            run(&token, claims, &cfg()),
            Err(AuthError::SubFormatInvalid),
        );
    }

    #[test]
    fn account_type_populated_when_valid() {
        let claims = claims_with_sub("01HSAB00000000000000000000");
        let mut payload = payload_with_sub("01HSAB00000000000000000000");
        payload["account_type"] = serde_json::json!("human");
        let token = forge_payload(payload);
        let claims = run(&token, claims, &cfg()).expect("M40 valid");
        assert_eq!(claims.account_type.as_deref(), Some("human"));
    }
}