axess-core 0.2.0

Core implementation for the axess library. Session state machine, multi-factor authentication engine, Cedar Policy evaluation, and pluggable storage backends. Use the `axess` facade crate unless you need direct access to internals.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
//! Shared building blocks for the factor verify/prepare pipelines.
//!
//! [`AuthnService::verify_factor`] and [`AuthnService::prepare_factor`] both:
//! - extract the `Authenticating` triple from the session (or fail with
//!   [`AuthnError::NoFlow`]),
//! - re-check account status against the store on every step (lockout
//!   enforcement bypass-proofing: the store is authoritative, never
//!   the session),
//! - emit audit rows on attempts against locked accounts.
//!
//! `verify_factor` additionally needs:
//! - a `FailWithUpdate` CAS-retry loop for atomic counter increments
//!   (Email OTP / HOTP),
//! - the generic failure-handling block (audit + counter + lockout-policy
//!   verdict),
//! - user-scope CAS-or-insert persistence on `PassWithUpdate` (TOTP
//!   replay-prevention step / HOTP counter advance).
//!
//! Each concern lives here as a `pub(super)` method so:
//! - they can be tested independently (rather than only through the
//!   full factor-flow surface),
//! - the calling pipelines read top-to-bottom, with orthogonal
//!   concerns delegated rather than nested in `if let` ladders.
//!
//! Before this split, `verify_factor` was 345 lines in one
//! method; afterwards it is a ~50-line pipeline. `prepare_factor`'s
//! account-status block was a copy-paste of `verify_factor`'s; both
//! now share [`AuthnService::enforce_account_status`].
//!
//! [`AuthnError::NoFlow`]: crate::authn::error::AuthnError::NoFlow

use super::AuthnService;
use super::outcomes::FactorOutcome;
use super::verification::{apply_email_otp_failure, apply_hotp_failure};
use crate::authn::{
    error::AuthnError,
    event::{AuthEventBuilder, AuthEventType},
    factor::{FactorConfig, FactorKind},
    ids::{TenantId, UserId},
    store::{FactorStore, IdentityStore},
    types::{AuthnScope, EntityState},
};
use crate::session::extractor::AuthSession;

/// Outcome of [`AuthnService::enforce_account_status`].
///
/// Caller maps each variant to its own response shape:
/// - `verify_factor` → `Locked { until }` becomes [`FactorOutcome::Locked`]
///   carrying the same `until`,
/// - `prepare_factor` → `Locked { until }` becomes `Err(AuthnError::Locked { until })`,
///   `NotActive(status)` becomes `Err(AuthnError::NotActive(status))`.
///
/// Call sites never need to audit a `Locked` result themselves: the
/// `Failure` row is emitted by `enforce_account_status` before it
/// returns that variant. `NotActive` and `Ok` are returned without
/// any audit row.
///
/// The `Locked` variant carries only `until` (not the full
/// `EntityState`): `Locked` and `NotActive` map to distinct
/// `AuthnError` variants, so the suspended-state carrier is not
/// needed downstream.
#[derive(Debug)]
pub(super) enum AccountStatusEnforcement {
    /// Account is in good standing: proceed.
    Ok,
    /// Account is locked (`is_locked()` returned `true`; in practice
    /// the `Suspended` state). Audit row already emitted.
    Locked {
        /// Suspension expiry, when set; `None` means indefinite.
        /// Also `None` if the locked state is not `Suspended`, since
        /// only the `Suspended` detail supplies an expiry.
        until: Option<chrono::DateTime<chrono::Utc>>,
    },
    /// Account is in some other non-login-allowing state
    /// (`Pending`, `Terminated`, `Archived`, `Candidate`, `Guest`).
    /// No audit row was emitted; only a locked account triggers the audit.
    NotActive(EntityState),
}

impl<I, F> AuthnService<I, F>
where
    I: IdentityStore,
    F: FactorStore<Error = I::Error>,
{
    /// Look up the account's current status in the identity store and
    /// classify it as proceed / locked / not-active.
    ///
    /// The status is fetched from the store on every call instead of
    /// being read from the session: session-held state can be reset
    /// by the client opening a new session, whereas the store-side
    /// record cannot.
    ///
    /// A locked account always produces a `Failure` audit row before
    /// this method returns. Skipping it would hide every attempt made
    /// against a locked account, which is precisely the phase of an
    /// attack worth seeing (the attacker confirming a username by
    /// tripping the lockout).
    ///
    /// # Errors
    ///
    /// Returns [`AuthnError::Store`] when the status lookup fails.
    pub(super) async fn enforce_account_status(
        &self,
        user_id: &UserId,
        tenant_id: &TenantId,
        next_kind: Option<FactorKind>,
        session: &AuthSession,
    ) -> Result<AccountStatusEnforcement, AuthnError<I::Error>> {
        let lookup = self.identity.account_status(user_id).await;
        let status = lookup.map_err(AuthnError::Store)?;

        // Not locked: the only remaining question is whether the
        // state permits login at all.
        if !status.is_locked() {
            return Ok(if status.allows_login() {
                AccountStatusEnforcement::Ok
            } else {
                AccountStatusEnforcement::NotActive(status)
            });
        }

        // Only the `Suspended` detail carries an expiry; any other
        // locked state is reported without one.
        let until = match &status {
            EntityState::Suspended(detail) => detail.until,
            _ => None,
        };
        self.record_locked_attempt_audit(user_id, tenant_id, next_kind, session)
            .await;
        Ok(AccountStatusEnforcement::Locked { until })
    }

    /// Emit a `FactorVerified` failure audit event, tagged with the
    /// error string `"locked"`, for an attempt made against a locked
    /// account.
    ///
    /// The event is attributed to `user_id` / `tenant_id` and the
    /// current session; the factor kind is attached only when the
    /// caller knows which factor was about to be exercised.
    async fn record_locked_attempt_audit(
        &self,
        user_id: &UserId,
        tenant_id: &TenantId,
        next_kind: Option<FactorKind>,
        session: &AuthSession,
    ) {
        let base = AuthEventBuilder::failure(AuthEventType::FactorVerified)
            .attributed_to(user_id, tenant_id)
            .with_session(session.session_id().await)
            .with_error("locked");
        let event = match next_kind {
            Some(kind) => base.with_factor(kind),
            None => base,
        };
        self.emit_audit(event).await;
    }

    /// Atomically apply a `FailWithUpdate` factor-config increment
    /// against the user-scope row.
    ///
    /// Why this exists: when `verify_credential` returns
    /// `FailWithUpdate(updated_config)`, the failure counter must
    /// advance by exactly one per wrong attempt regardless of how
    /// many concurrent wrong attempts are in flight. Naïve
    /// "load → mutate → save" loses updates: N parallel attempts all
    /// read the counter at k, all save k+1, and the counter ends at
    /// k+1 instead of k+N, so the brute-force limit is effectively
    /// bypassed.
    ///
    /// Strategy:
    /// 1. **Probe user-scope** before entering the CAS loop.
    ///    `updated_config` was computed against whatever scope the
    ///    initial verification loaded, which may have been tenant or
    ///    global. Using it as the CAS baseline against a user-scope
    ///    row would compare against the wrong bytes. So:
    ///    - when no user-scope row exists, `updated_config` is
    ///      plain-inserted and the method returns;
    ///    - otherwise the increment is recomputed from the live
    ///      user-scope value and applied via CAS.
    /// 2. **Bounded CAS loop**: at most `MAX_FAIL_UPDATE_RETRIES` CAS
    ///    attempts. On swap failure the live user-scope value is
    ///    reloaded and the increment recomputed from it via the
    ///    per-kind `apply_*_failure` helpers. When the budget is
    ///    exhausted the method logs and gives up: better to
    ///    under-count than to spin forever under pathological
    ///    contention.
    /// 3. **Missing-row fallback**: if a reload returns `None`
    ///    mid-loop, fall back to a best-effort plain save of
    ///    `updated_config`. A failure of that save is logged, not
    ///    propagated.
    /// 4. **Wrong-kind bail-out**: if the reloaded row is neither
    ///    `EmailOtp` nor `Hotp`, stop without retrying and log a
    ///    dedicated warning: the failure update no longer applies.
    ///
    /// # Errors
    ///
    /// Returns [`AuthnError::Store`] when a load, the initial insert,
    /// or a CAS call fails. Giving up after the retry budget, the
    /// wrong-kind bail-out, and a failed fallback save all return
    /// `Ok(())` after logging.
    pub(super) async fn persist_fail_with_update(
        &self,
        user_scope: &AuthnScope,
        current_kind: FactorKind,
        updated_config: &FactorConfig,
    ) -> Result<(), AuthnError<I::Error>> {
        const MAX_FAIL_UPDATE_RETRIES: usize = 8;

        let Some(mut prior_config) = self
            .factors
            .load_factor(user_scope, current_kind.clone())
            .await
            .map_err(AuthnError::Store)?
        else {
            // No user-scope row yet: there is no prior value to CAS
            // against, so insert the caller-computed config directly.
            // NOTE(review): two concurrent first failures can both
            // observe "no row" and both insert, recording one
            // increment instead of two. Whether the store offers an
            // insert-if-absent primitive is not visible here; confirm.
            self.factors
                .save_factor(user_scope, updated_config.clone())
                .await
                .map_err(AuthnError::Store)?;
            return Ok(());
        };

        // Recompute the failure update against the live user-scope
        // value rather than the (possibly inherited) template that
        // initial verification used.
        let mut next_config = match &prior_config {
            FactorConfig::EmailOtp(otp) => FactorConfig::EmailOtp(apply_email_otp_failure(otp)),
            FactorConfig::Hotp(otp) => FactorConfig::Hotp(apply_hotp_failure(otp)),
            _ => updated_config.clone(),
        };

        for _ in 0..MAX_FAIL_UPDATE_RETRIES {
            // CAS-loss here is operationally normal: a concurrent
            // verification incremented the same counter between our
            // load and our save. Not a security signal; only the
            // post-success CAS in `persist_pass_with_update` treats
            // `Ok(false)` as replay.
            //
            // `next_config` is moved into the call: every path that
            // reaches another iteration reassigns it first.
            let swapped = self
                .factors
                .compare_and_save_factor(user_scope, &prior_config, next_config)
                .await
                .map_err(AuthnError::Store)?;
            if swapped {
                return Ok(());
            }

            // CAS lost the race; reload and recompute the increment
            // from whatever the concurrent writer left behind.
            let reloaded = self
                .factors
                .load_factor(user_scope, current_kind.clone())
                .await
                .map_err(AuthnError::Store)?;
            let Some(reloaded_config) = reloaded else {
                // The user-scope row disappeared; fall back to a plain
                // save so the failure is at least recorded somewhere.
                // Deliberately best-effort: log, do not propagate.
                if let Err(e) = self
                    .factors
                    .save_factor(user_scope, updated_config.clone())
                    .await
                {
                    tracing::warn!(
                        error = %e,
                        "fallback save_factor on missing user-scope row \
                         also failed; failure counter not persisted"
                    );
                }
                return Ok(());
            };

            next_config = match &reloaded_config {
                FactorConfig::EmailOtp(otp) => FactorConfig::EmailOtp(apply_email_otp_failure(otp)),
                FactorConfig::Hotp(otp) => FactorConfig::Hotp(apply_hotp_failure(otp)),
                _ => {
                    // A config without a failure counter appeared
                    // under user_scope (race with reconfiguration).
                    // Retrying cannot help; say so explicitly rather
                    // than reporting exhausted retries.
                    tracing::warn!(
                        "factor: user-scope factor kind changed during failure-counter \
                         update; failure counter not incremented"
                    );
                    return Ok(());
                }
            };
            prior_config = reloaded_config;
        }

        tracing::warn!("factor: failed to atomically increment failure counter after retries");
        Ok(())
    }

    /// Record a factor-verification failure: audit emit, counter
    /// increment, lockout-policy verdict.
    ///
    /// Returns the [`FactorOutcome`] to bubble up:
    /// - `Locked { until }` once the store-side failure count reaches
    ///   the tenant policy's `max_attempts`. `until` is `now +
    ///   policy.duration`; it is `None` when the policy has no
    ///   duration or the duration cannot be converted to a
    ///   `chrono::Duration`.
    /// - `InvalidCredential` otherwise, including when the counter
    ///   store itself errors.
    ///
    /// The audit event is written BEFORE the counter increment. An
    /// audit-store error is logged at `error` level with the user id
    /// and does not stop the counter update, so the failed write is
    /// visible to operators even though the request proceeds.
    ///
    /// Store errors on `record_failed_attempt` are NOT propagated as
    /// `Err(AuthnError::Store)`. Doing so would give an attacker a
    /// response that is distinguishable from a normal
    /// `InvalidCredential`. Instead the outage is counted on a
    /// dedicated metric, logged, and the caller sees
    /// `InvalidCredential`. Lockout is effectively disabled for the
    /// duration of such an outage, so it should be alerted on
    /// externally.
    pub(super) async fn record_factor_failure(
        &self,
        user_id: &UserId,
        tenant_id: &TenantId,
        current_kind: &FactorKind,
        session: &AuthSession,
    ) -> Result<FactorOutcome, AuthnError<I::Error>> {
        self.metrics.factor_failure();

        // Intentional bypass of the generic `emit_audit` helpers: the
        // error log below needs `user_id` context they do not
        // provide, so the event goes straight to
        // `self.identity.record_event(...)`.
        let failure_event = AuthEventBuilder::failure(AuthEventType::FactorVerified)
            .attributed_to(user_id, tenant_id)
            .with_factor(current_kind.clone())
            .build_at(self.clock.now());
        if let Err(err) = self.identity.record_event(failure_event).await {
            tracing::error!(
                user_id = %user_id,
                error = %err,
                "failed to record FactorVerified Failure audit event; \
                 SOC dashboards will be missing this attempt; proceeding with counter update"
            );
        }

        let failed_attempts = match self.identity.record_failed_attempt(user_id).await {
            Ok(total) => Some(total),
            Err(err) => {
                // A counter-store outage is operationally distinct
                // from a wrong credential: the response stays
                // InvalidCredential, but lockout is silently off
                // while the outage lasts. It gets its own metric so
                // alerts do not fire on every wrong password.
                self.metrics.factor_counter_store_outage();
                tracing::warn!(
                    user_id = %user_id,
                    error = %err,
                    outage = "factor_counter_store",
                    "record_failed_attempt errored; returning InvalidCredential \
                     without lockout-counter update; monitor counter-store health"
                );
                None
            }
        };

        // Session state is updated on both paths, for UI feedback
        // only; lockout decisions are store-authoritative.
        session.record_attempt_at(self.clock.now()).await;

        let Some(count) = failed_attempts else {
            return Ok(FactorOutcome::InvalidCredential);
        };

        let policy = self.identity.lockout_policy_for_tenant(tenant_id);
        let threshold_reached = count >= policy.max_attempts;
        if !threshold_reached {
            return Ok(FactorOutcome::InvalidCredential);
        }

        self.metrics.account_locked();
        // Report the lockout window as `now + policy.duration`,
        // computed at the verdict that creates the lockout, so the
        // caller does not need a follow-up status lookup to learn the
        // expiry.
        let until = policy
            .duration
            .and_then(|window| chrono::Duration::from_std(window).ok())
            .map(|window| self.clock.now() + window);
        Ok(FactorOutcome::Locked { until })
    }

    /// Write a `PassWithUpdate` factor-config change to the user
    /// scope. When a user-scope row already exists the write is a
    /// compare-and-swap against it, so a concurrent verification
    /// cannot slip past the replay-prevention state.
    ///
    /// For TOTP the update records the accepted step (replay
    /// prevention); for HOTP it advances the counter past the matched
    /// value.
    ///
    /// The update always lands in the user scope, even when the
    /// config that was verified came from a tenant or global scope
    /// via fallback. This is intentional: per-user mutable state
    /// (TOTP `last_step`, HOTP counter) belongs to the user, while
    /// the inherited template (secret, digits, period) stays at the
    /// higher scope.
    ///
    /// Returns `Ok(true)` when the config was saved and `Ok(false)`
    /// when the CAS lost the race (a concurrent verification spent
    /// the same step/counter first; the caller treats that as a
    /// replay and rejects).
    ///
    /// # Errors
    ///
    /// Returns [`AuthnError::Store`] when the load, the CAS, or the
    /// plain save fails.
    pub(super) async fn persist_pass_with_update(
        &self,
        user_scope: &AuthnScope,
        current_kind: FactorKind,
        updated_config: FactorConfig,
    ) -> Result<bool, AuthnError<I::Error>> {
        let Some(prior) = self
            .factors
            .load_factor(user_scope, current_kind)
            .await
            .map_err(AuthnError::Store)?
        else {
            // First user-scope write for this factor: nothing to CAS
            // against, so a plain save is used.
            // NOTE(review): two concurrent first verifications could
            // both take this branch and both report success. Whether
            // the store guards against that is not visible here;
            // confirm.
            self.factors
                .save_factor(user_scope, updated_config)
                .await
                .map_err(AuthnError::Store)?;
            return Ok(true);
        };

        // CAS-loss here is a **security signal**: a concurrent
        // verification spent the same TOTP step / HOTP counter first.
        // `Ok(false)` propagates up as replay-detected and the caller
        // rejects the credential. Distinct from the failure-counter
        // CAS in `persist_fail_with_update`, where a lost swap just
        // means "reload and retry".
        self.factors
            .compare_and_save_factor(user_scope, &prior, updated_config)
            .await
            .map_err(AuthnError::Store)
    }
}

#[cfg(test)]
mod tests;