rustls_native_roots_cache/lib.rs
1//! Process-wide cache for the system trust store.
2//!
3//! [`rustls_native_certs::load_native_certs`] reaches into the OS
4//! keychain (Security framework on macOS, NSS / OpenSSL stores on
5//! Linux). On macOS the underlying `Sec*` APIs are not concurrency-safe
6//! under load — multiple threads calling them in parallel can return
//! `errSecIO` (-36) on calls that would otherwise succeed. Production
//! daemons that build many distinct rustls `ClientConfig`s (e.g. one
//! per upstream-TLS fingerprint) hit this whenever a reload
//! introduces a handful of new fingerprints concurrently.
11//!
//! The fix is a process-wide cache: read the trust store **once per
//! process** and share the resulting [`rustls::RootCertStore`] behind
//! an `Arc`. The [`std::sync::OnceLock`] initializer barrier ensures
//! that only one thread ever runs the load (including its bounded
//! retries, described below); every subsequent caller gets a cheap
//! `Arc::clone`.
17//!
18//! In-process the `OnceLock` is sufficient. Across processes (e.g. a
19//! test runner that boots multiple binaries in parallel) each binary
20//! still makes its own first call, and those simultaneous calls can
21//! lose to keychain contention. The init path therefore retries on
22//! transient failure with a small backoff before giving up —
23//! `errSecIO` is documented by Apple as recoverable, and the happy
24//! path skips the backoff entirely.
25//!
26//! Failure semantics: after the bounded retries are exhausted the
27//! outcome is sticky. The cached error re-yields on every subsequent
28//! call so the operator sees consistent behaviour and can restart
29//! the process to attempt a fresh load — there is no per-request
30//! retry storm against an OS API that is already telling us it is
31//! unhappy.
32
33use std::sync::{Arc, OnceLock};
34
/// Error returned when the system trust store could not be loaded.
///
/// The failure is kept as a plain, operator-readable string. The
/// underlying `rustls_native_certs::Error` is not `Clone`, so the text
/// is captured once, when the failure is first observed, and the same
/// text is handed back on every later call.
#[derive(Debug, Clone)]
pub struct NativeRootsError {
    /// Human-readable description of what went wrong.
    pub message: String,
}

// No `source()` override: the original error was flattened into
// `message`, so there is no underlying cause left to expose.
impl std::error::Error for NativeRootsError {}

impl std::fmt::Display for NativeRootsError {
    /// Writes the stored message verbatim.
    fn fmt(&self, out: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self { message } = self;
        out.write_str(message)
    }
}
51
52static NATIVE_ROOTS: OnceLock<Result<Arc<rustls::RootCertStore>, NativeRootsError>> =
53 OnceLock::new();
54
55/// Return the cached system trust store, loading it on first call.
56///
57/// Concurrent first calls are serialised by the [`OnceLock`] barrier,
58/// so the OS keychain sees exactly one load attempt per process even
59/// under reload pressure that builds many fingerprints in parallel.
60///
61/// # Errors
62///
63/// Surfaces the load attempt's error (sticky for the lifetime of the
64/// process). Restart the process to retry.
65pub fn native_roots() -> Result<Arc<rustls::RootCertStore>, NativeRootsError> {
66 NATIVE_ROOTS.get_or_init(load_native_roots).as_ref().map(Arc::clone).map_err(Clone::clone)
67}
68
69/// Eagerly trigger the first load. Useful when a daemon's boot path
70/// wants to know the trust-store status before any TLS code runs —
71/// idempotent; subsequent calls return the cached result without
72/// re-touching the OS keychain.
73///
74/// # Errors
75///
76/// Same shape as [`native_roots`]: returns the cached error if the
77/// load failed.
78pub fn warm_native_roots() -> Result<Arc<rustls::RootCertStore>, NativeRootsError> {
79 native_roots()
80}
81
/// Upper bound on the number of load attempts (the first try included).
///
/// On macOS the Security framework can answer with `errSecIO` (-36), a
/// transient I/O failure, when several callers hit the keychain APIs at
/// once, for instance when a test runner launches dozens of binaries
/// that all call `load_native_certs` at the same moment. Apple documents
/// the error as recoverable, so a few retries are worthwhile. A load
/// that succeeds on attempt 1 never sleeps; only an observed failure
/// pays the backoff, and thanks to the [`OnceLock`] cache that cost is
/// paid at most once per process lifetime.
const LOAD_RETRIES: usize = 3;

/// Fixed pause between two consecutive load attempts. No pause follows
/// the final attempt.
const LOAD_RETRY_BACKOFF: std::time::Duration = std::time::Duration::from_millis(50);
94
95fn load_native_roots() -> Result<Arc<rustls::RootCertStore>, NativeRootsError> {
96 let started = std::time::Instant::now();
97 let mut last_err: Option<NativeRootsError> = None;
98 for attempt in 0..LOAD_RETRIES {
99 match try_load_native_roots() {
100 Ok(store) => {
101 tracing::info!(
102 anchors = store.len(),
103 elapsed_ms = u64::try_from(started.elapsed().as_millis()).unwrap_or(u64::MAX),
104 attempts = attempt + 1,
105 "native trust store loaded",
106 );
107 return Ok(store);
108 }
109 Err(e) => {
110 last_err = Some(e);
111 if attempt + 1 < LOAD_RETRIES {
112 std::thread::sleep(LOAD_RETRY_BACKOFF);
113 }
114 }
115 }
116 }
117 let err = last_err.expect("at least one attempt always populates last_err on the failure path");
118 tracing::error!(
119 error = %err,
120 elapsed_ms = u64::try_from(started.elapsed().as_millis()).unwrap_or(u64::MAX),
121 attempts = LOAD_RETRIES,
122 "native trust store load failed",
123 );
124 Err(err)
125}
126
127fn try_load_native_roots() -> Result<Arc<rustls::RootCertStore>, NativeRootsError> {
128 let native = rustls_native_certs::load_native_certs();
129 if !native.errors.is_empty() {
130 return Err(NativeRootsError { message: format!("load native certs: {:?}", native.errors) });
131 }
132 let mut store = rustls::RootCertStore::empty();
133 for cert in native.certs {
134 store.add(cert).map_err(|e| NativeRootsError { message: format!("add native cert: {e}") })?;
135 }
136 Ok(Arc::new(store))
137}
138
#[cfg(test)]
mod tests {
    use super::*;

    /// Two calls within one process must yield the very same `Arc`,
    /// because the `OnceLock` caches the first outcome.
    #[test]
    fn native_roots_returns_same_arc_across_calls() {
        // Only the first call in this test binary can reach the OS trust
        // store; the second one is served from the cache.
        let first = native_roots().expect("trust store loads in test env");
        let second = native_roots().expect("cached call");
        assert!(
            Arc::ptr_eq(&first, &second),
            "subsequent calls must hand out the same Arc"
        );
        assert!(
            !first.is_empty(),
            "system trust store should have at least one anchor"
        );
    }

    /// The eager entry point and the lazy one share a single cache slot.
    #[test]
    fn warm_native_roots_returns_same_result_as_lazy_call() {
        let eager = warm_native_roots().expect("warm");
        let on_demand = native_roots().expect("lazy");
        assert!(Arc::ptr_eq(&eager, &on_demand));
    }
}
161}