rustls_native_roots_cache/lib.rs
1//! Process-wide cache for the system trust store.
2//!
3//! [`rustls_native_certs::load_native_certs`] reaches into the OS
4//! keychain (Security framework on macOS, NSS / OpenSSL stores on
5//! Linux). On macOS the underlying `Sec*` APIs are not concurrency-safe
6//! under load — multiple threads calling them in parallel can return
7//! `errSecIO` (-36) on what would otherwise succeed. Production
8//! daemons that build many distinct rustls `ClientConfig`s (one per
9//! upstream-TLS fingerprint, e.g.) hit this whenever a reload
10//! introduces a handful of new fingerprints concurrently.
11//!
//! The fix is a process-wide cache: read the trust store **once per
//! process** (plus any explicit [`refresh_native_roots`] calls) and share
//! the resulting [`rustls::RootCertStore`] behind an `Arc`. The first
//! call's init barrier serialises the initial load; every subsequent
//! caller gets a cheap `Arc::clone`.
16//!
17//! In-process the cache is sufficient. Across processes (e.g. a test
18//! runner that boots multiple binaries in parallel) each binary still
19//! makes its own first call, and those simultaneous calls can lose to
20//! keychain contention. The init path therefore retries on transient
21//! failure with a small backoff before giving up — `errSecIO` is
22//! documented by Apple as recoverable, and the happy path skips the
23//! backoff entirely.
24//!
25//! Long-running daemons need to pick up CA-cert updates (an OS
26//! security update revoking a root, an operator dropping a corporate
27//! CA into the keychain) without restarting. [`refresh_native_roots`]
28//! re-runs the load and atomically swaps the cached store on success;
29//! on failure the previous value is preserved and a warning is
30//! logged. The swap is lock-free on the read side so the upstream-TLS
31//! hot path is unaffected.
32
33use std::sync::{Arc, OnceLock};
34
35use arc_swap::ArcSwap;
36
/// Failure to build the trust store, reduced to a printable message.
///
/// The underlying `rustls_native_certs::Error` is not `Clone`, so the
/// failure is rendered into a `String` at the point where it is first
/// observed. Owning only that string is what lets this type derive
/// `Clone`, which in turn lets a cached failure be handed to every caller
/// that asks for it.
#[derive(Debug, Clone)]
pub struct NativeRootsError {
    /// Operator-readable description of what went wrong.
    pub message: String,
}

impl std::fmt::Display for NativeRootsError {
    /// Writes [`NativeRootsError::message`] verbatim. Width, fill and
    /// precision flags on the formatter are not applied.
    fn fmt(&self, out: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self { message } = self;
        out.write_str(message)
    }
}

// No `source()` override: the originating error was flattened into
// `message`, so there is no inner error left to expose.
impl std::error::Error for NativeRootsError {}
53
54/// Cached load outcome. Cloned cheaply behind `Arc` so `load_full`
55/// hands callers an owned snapshot without a per-call deep copy.
56type Cached = Arc<Result<Arc<rustls::RootCertStore>, NativeRootsError>>;
57
58/// Lazy-initialised current trust-store snapshot. The first call
59/// through [`native_roots`] populates this; subsequent calls,
60/// including [`refresh_native_roots`], swap the inner value through
61/// `ArcSwap` without invalidating the [`OnceLock`].
62static NATIVE_ROOTS: OnceLock<ArcSwap<Result<Arc<rustls::RootCertStore>, NativeRootsError>>> =
63 OnceLock::new();
64
65fn snapshot() -> &'static ArcSwap<Result<Arc<rustls::RootCertStore>, NativeRootsError>> {
66 NATIVE_ROOTS.get_or_init(|| ArcSwap::from(Arc::new(load_native_roots())))
67}
68
69/// Return the cached system trust store, loading it on first call.
70///
71/// Concurrent first calls are serialised by the [`OnceLock`] barrier,
72/// so the OS keychain sees exactly one load attempt per process even
73/// under reload pressure that builds many fingerprints in parallel.
74/// Subsequent calls are lock-free: they read the current snapshot
75/// through `ArcSwap` and clone the inner `Arc<RootCertStore>`.
76///
77/// # Errors
78///
79/// Surfaces the most recently observed load outcome. A failed first
80/// load remains sticky until [`refresh_native_roots`] succeeds.
81pub fn native_roots() -> Result<Arc<rustls::RootCertStore>, NativeRootsError> {
82 let cached: Cached = snapshot().load_full();
83 cached.as_ref().as_ref().map(Arc::clone).map_err(Clone::clone)
84}
85
86/// Eagerly trigger the first load. Useful when a daemon's boot path
87/// wants to know the trust-store status before any TLS code runs —
88/// idempotent; subsequent calls return the cached result without
89/// re-touching the OS keychain.
90///
91/// # Errors
92///
93/// Same shape as [`native_roots`]: returns the cached error if the
94/// load failed.
95pub fn warm_native_roots() -> Result<Arc<rustls::RootCertStore>, NativeRootsError> {
96 native_roots()
97}
98
99/// Re-read the OS trust store and atomically swap the cached
100/// snapshot when the load succeeds.
101///
102/// Long-lived daemons call this on a periodic timer or in response
103/// to an operator-triggered mgmt verb so OS-side CA updates land
104/// without a process restart. On failure the previous snapshot is
105/// preserved and a warning is logged — operators still see a working
106/// trust store while the load error surfaces in the warn record.
107///
108/// # Errors
109///
110/// Returns the new load attempt's error verbatim. The cached value
111/// is **not** replaced with the error in that case; subsequent
112/// [`native_roots`] callers continue to see whichever outcome was
113/// last cached (typically the prior successful store).
114pub fn refresh_native_roots() -> Result<Arc<rustls::RootCertStore>, NativeRootsError> {
115 let outcome = load_native_roots();
116 match &outcome {
117 Ok(store) => {
118 snapshot().store(Arc::new(Ok(Arc::clone(store))));
119 Ok(Arc::clone(store))
120 }
121 Err(e) => {
122 tracing::warn!(
123 error = %e,
124 "native trust store refresh failed; keeping previous snapshot",
125 );
126 Err(e.clone())
127 }
128 }
129}
130
131/// Maximum number of attempts at loading the OS trust store.
132///
133/// macOS Security framework returns `errSecIO` (-36) on transient
134/// I/O failure when the keychain APIs see concurrent callers (e.g. a
135/// test runner spawns dozens of test binaries that each boot their
136/// own process and hit `load_native_certs` simultaneously). Apple's
137/// own framework documents the error as recoverable. The happy path
138/// completes in attempt 1 with zero sleeps; only an observed failure
139/// pays the backoff. The [`OnceLock`] cache means we pay this cost
140/// at most once per process lifetime.
141const LOAD_RETRIES: usize = 3;
142const LOAD_RETRY_BACKOFF: std::time::Duration = std::time::Duration::from_millis(50);
143
144fn load_native_roots() -> Result<Arc<rustls::RootCertStore>, NativeRootsError> {
145 let started = std::time::Instant::now();
146 let mut last_err: Option<NativeRootsError> = None;
147 for attempt in 0..LOAD_RETRIES {
148 match try_load_native_roots() {
149 Ok(store) => {
150 tracing::info!(
151 anchors = store.len(),
152 elapsed_ms = u64::try_from(started.elapsed().as_millis()).unwrap_or(u64::MAX),
153 attempts = attempt + 1,
154 "native trust store loaded",
155 );
156 return Ok(store);
157 }
158 Err(e) => {
159 last_err = Some(e);
160 if attempt + 1 < LOAD_RETRIES {
161 std::thread::sleep(LOAD_RETRY_BACKOFF);
162 }
163 }
164 }
165 }
166 let err = last_err.expect("at least one attempt always populates last_err on the failure path");
167 tracing::error!(
168 error = %err,
169 elapsed_ms = u64::try_from(started.elapsed().as_millis()).unwrap_or(u64::MAX),
170 attempts = LOAD_RETRIES,
171 "native trust store load failed",
172 );
173 Err(err)
174}
175
176fn try_load_native_roots() -> Result<Arc<rustls::RootCertStore>, NativeRootsError> {
177 let native = rustls_native_certs::load_native_certs();
178 if !native.errors.is_empty() {
179 return Err(NativeRootsError { message: format!("load native certs: {:?}", native.errors) });
180 }
181 let mut store = rustls::RootCertStore::empty();
182 for cert in native.certs {
183 store.add(cert).map_err(|e| NativeRootsError { message: format!("add native cert: {e}") })?;
184 }
185 Ok(Arc::new(store))
186}
187
#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::{Mutex, MutexGuard, PoisonError};

    /// Serialises every test in this module.
    ///
    /// All tests observe the same process-wide cache and the harness runs
    /// them on parallel threads. Without this lock the refresh test can
    /// swap the snapshot between the two reads of a pointer-equality
    /// test and make it fail spuriously.
    static CACHE_TESTS: Mutex<()> = Mutex::new(());

    /// Take the module-wide test lock. A poisoned lock (an earlier test
    /// panicked) is still usable: the guarded value is `()`.
    fn exclusive() -> MutexGuard<'static, ()> {
        CACHE_TESTS.lock().unwrap_or_else(PoisonError::into_inner)
    }

    #[test]
    fn native_roots_returns_same_arc_across_calls() {
        let _exclusive = exclusive();
        // With no refresh in between, consecutive calls must serve the
        // same underlying RootCertStore: neither call below starts a new
        // load once a snapshot is cached.
        let a = native_roots().expect("trust store loads in test env");
        let b = native_roots().expect("cached call");
        assert!(Arc::ptr_eq(&a, &b), "subsequent calls must hand out the same Arc");
        assert!(!a.is_empty(), "system trust store should have at least one anchor");
    }

    #[test]
    fn warm_native_roots_returns_same_result_as_lazy_call() {
        let _exclusive = exclusive();
        let warmed = warm_native_roots().expect("warm");
        let lazy = native_roots().expect("lazy");
        assert!(Arc::ptr_eq(&warmed, &lazy));
    }

    #[test]
    fn refresh_native_roots_swaps_to_a_fresh_arc() {
        let _exclusive = exclusive();
        // A successful refresh must publish a *new* Arc so callers
        // re-reading `native_roots` see the new value (even when the
        // keychain contents happen to be identical). Pointer
        // inequality is the proxy.
        let before = native_roots().expect("first load");
        let refreshed = refresh_native_roots().expect("refresh");
        assert!(!Arc::ptr_eq(&before, &refreshed), "refresh swaps Arc identity");
        let after = native_roots().expect("post-refresh");
        assert!(Arc::ptr_eq(&refreshed, &after), "subsequent reads see refreshed snapshot");
    }
}
223}