Skip to main content

rustls_native_roots_cache/
lib.rs

1//! Process-wide cache for the system trust store.
2//!
3//! [`rustls_native_certs::load_native_certs`] reaches into the OS
4//! keychain (Security framework on macOS, NSS / OpenSSL stores on
5//! Linux). On macOS the underlying `Sec*` APIs are not concurrency-safe
6//! under load — multiple threads calling them in parallel can return
7//! `errSecIO` (-36) on what would otherwise succeed. Production
//! daemons that build many distinct rustls `ClientConfig`s (e.g. one
//! per upstream-TLS fingerprint) hit this whenever a reload
//! introduces a handful of new fingerprints concurrently.
11//!
12//! The fix is a process-wide cache: read the trust store **once per
13//! process**, share the resulting [`rustls::RootCertStore`] behind
//! `Arc`. The [`std::sync::OnceLock`] initializer barrier serialises
//! the one-time load (which may retry internally, see below); every
//! subsequent caller gets a cheap `Arc::clone`.
17//!
18//! In-process the `OnceLock` is sufficient. Across processes (e.g. a
19//! test runner that boots multiple binaries in parallel) each binary
20//! still makes its own first call, and those simultaneous calls can
21//! lose to keychain contention. The init path therefore retries on
22//! transient failure with a small backoff before giving up —
23//! `errSecIO` is documented by Apple as recoverable, and the happy
24//! path skips the backoff entirely.
25//!
26//! Failure semantics: after the bounded retries are exhausted the
27//! outcome is sticky. The cached error re-yields on every subsequent
28//! call so the operator sees consistent behaviour and can restart
29//! the process to attempt a fresh load — there is no per-request
30//! retry storm against an OS API that is already telling us it is
31//! unhappy.
32
33use std::sync::{Arc, OnceLock};
34
/// Error reported when the system trust store could not be loaded.
///
/// The failure is captured as text at the moment it first happens:
/// the underlying `rustls_native_certs::Error` is not `Clone`, while
/// this type has to be, because the cached failure is handed out
/// again on every later call.
#[derive(Clone, Debug)]
pub struct NativeRootsError {
	/// Operator-readable description of what went wrong.
	pub message: String,
}

impl std::error::Error for NativeRootsError {}

impl std::fmt::Display for NativeRootsError {
	/// Writes the stored message verbatim. Formatter flags such as
	/// width or alignment are not applied.
	fn fmt(&self, out: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
		let Self { message } = self;
		out.write_str(message)
	}
}
51
52static NATIVE_ROOTS: OnceLock<Result<Arc<rustls::RootCertStore>, NativeRootsError>> =
53	OnceLock::new();
54
55/// Return the cached system trust store, loading it on first call.
56///
57/// Concurrent first calls are serialised by the [`OnceLock`] barrier,
58/// so the OS keychain sees exactly one load attempt per process even
59/// under reload pressure that builds many fingerprints in parallel.
60///
61/// # Errors
62///
63/// Surfaces the load attempt's error (sticky for the lifetime of the
64/// process). Restart the process to retry.
65pub fn native_roots() -> Result<Arc<rustls::RootCertStore>, NativeRootsError> {
66	NATIVE_ROOTS.get_or_init(load_native_roots).as_ref().map(Arc::clone).map_err(Clone::clone)
67}
68
69/// Eagerly trigger the first load. Useful when a daemon's boot path
70/// wants to know the trust-store status before any TLS code runs —
71/// idempotent; subsequent calls return the cached result without
72/// re-touching the OS keychain.
73///
74/// # Errors
75///
76/// Same shape as [`native_roots`]: returns the cached error if the
77/// load failed.
78pub fn warm_native_roots() -> Result<Arc<rustls::RootCertStore>, NativeRootsError> {
79	native_roots()
80}
81
82/// Maximum number of attempts at loading the OS trust store.
83///
84/// macOS Security framework returns `errSecIO` (-36) on transient
85/// I/O failure when the keychain APIs see concurrent callers (e.g. a
86/// test runner spawns dozens of test binaries that each boot their
87/// own process and hit `load_native_certs` simultaneously). Apple's
88/// own framework documents the error as recoverable. The happy path
89/// completes in attempt 1 with zero sleeps; only an observed failure
90/// pays the backoff. The [`OnceLock`] cache means we pay this cost
91/// at most once per process lifetime.
92const LOAD_RETRIES: usize = 3;
93const LOAD_RETRY_BACKOFF: std::time::Duration = std::time::Duration::from_millis(50);
94
95fn load_native_roots() -> Result<Arc<rustls::RootCertStore>, NativeRootsError> {
96	let started = std::time::Instant::now();
97	let mut last_err: Option<NativeRootsError> = None;
98	for attempt in 0..LOAD_RETRIES {
99		match try_load_native_roots() {
100			Ok(store) => {
101				tracing::info!(
102					anchors = store.len(),
103					elapsed_ms = u64::try_from(started.elapsed().as_millis()).unwrap_or(u64::MAX),
104					attempts = attempt + 1,
105					"native trust store loaded",
106				);
107				return Ok(store);
108			}
109			Err(e) => {
110				last_err = Some(e);
111				if attempt + 1 < LOAD_RETRIES {
112					std::thread::sleep(LOAD_RETRY_BACKOFF);
113				}
114			}
115		}
116	}
117	let err = last_err.expect("at least one attempt always populates last_err on the failure path");
118	tracing::error!(
119		error = %err,
120		elapsed_ms = u64::try_from(started.elapsed().as_millis()).unwrap_or(u64::MAX),
121		attempts = LOAD_RETRIES,
122		"native trust store load failed",
123	);
124	Err(err)
125}
126
127fn try_load_native_roots() -> Result<Arc<rustls::RootCertStore>, NativeRootsError> {
128	let native = rustls_native_certs::load_native_certs();
129	if !native.errors.is_empty() {
130		return Err(NativeRootsError { message: format!("load native certs: {:?}", native.errors) });
131	}
132	let mut store = rustls::RootCertStore::empty();
133	for cert in native.certs {
134		store.add(cert).map_err(|e| NativeRootsError { message: format!("add native cert: {e}") })?;
135	}
136	Ok(Arc::new(store))
137}
138
#[cfg(test)]
mod tests {
	use super::*;

	#[test]
	fn native_roots_returns_same_arc_across_calls() {
		// Within one process the cache must keep handing back the
		// same allocation: whichever call in this test binary runs
		// first performs the load, and every later call is answered
		// from the OnceLock without touching the OS trust store.
		let first = native_roots().expect("trust store loads in test env");
		let second = native_roots().expect("cached call");

		let shared = Arc::ptr_eq(&first, &second);
		assert!(shared, "subsequent calls must hand out the same Arc");

		let has_anchors = !first.is_empty();
		assert!(has_anchors, "system trust store should have at least one anchor");
	}

	#[test]
	fn warm_native_roots_returns_same_result_as_lazy_call() {
		// The eager and the on-demand entry points must resolve to
		// the same cached store.
		let eager = warm_native_roots().expect("warm");
		let on_demand = native_roots().expect("lazy");
		assert!(Arc::ptr_eq(&eager, &on_demand));
	}
}
161}