Skip to main content

bestool_canopy/
client.rs

1use std::{
2	fmt,
3	io::Write,
4	net::{IpAddr, Ipv4Addr, Ipv6Addr, SocketAddr},
5	sync::{Arc, OnceLock},
6	time::Duration,
7};
8
9use flate2::{Compression, write::GzEncoder};
10use hickory_resolver::{
11	ConnectionProvider, Resolver,
12	config::{ConnectionConfig, NameServerConfig, ResolverConfig},
13	net::runtime::TokioRuntimeProvider,
14};
15use jiff::Timestamp;
16use miette::{IntoDiagnostic, Result, WrapErr};
17use rcgen::{CertificateParams, DistinguishedName, DnType, KeyPair};
18use reqwest::Url;
19use serde::{Deserialize, Serialize};
20use time::{Duration as TimeDuration, OffsetDateTime};
21use tokio::sync::RwLock;
22use tracing::debug;
23
24use crate::{
25	Redacted,
26	backup::{
27		BackupCredentials, BackupCredentialsRequest, BackupReport, BackupTarget,
28		CapabilitiesRequest, Purpose, TargetOutcome,
29	},
30};
31
/// Default public canopy base URL.
///
/// Not referenced elsewhere in this file; presumably what callers pass as
/// `base_url` for the mTLS path when nothing else is configured (confirm
/// against callers).
pub const DEFAULT_CANOPY_URL: &str = "https://meta.tamanu.app";

/// Base URL for the tailscale-internal canopy endpoint.
///
/// On hosts that share the canopy tailnet, posting to this URL works without
/// mTLS — the tailscale identity is the auth.
pub const TAILSCALE_URL: &str = "https://canopy.tail53aef.ts.net";

/// Bare hostname used for `resolve_to_addrs` overrides.
///
/// Must stay in sync with the host part of [`TAILSCALE_URL`], otherwise the
/// override would not apply to requests made against that URL.
const TAILSCALE_HOST: &str = "canopy.tail53aef.ts.net";

/// Hardcoded tailscale IPs for canopy, used when tailscale DNS
/// (100.100.100.100) is unreachable but the tailnet otherwise is.
const CANOPY_HARDCODED_V4: Ipv4Addr = Ipv4Addr::new(100, 99, 98, 97);
/// IPv6 counterpart of [`CANOPY_HARDCODED_V4`].
const CANOPY_HARDCODED_V6: Ipv6Addr =
	Ipv6Addr::new(0xfd7a, 0x115c, 0xa1e0, 0, 0, 0, 0x9337, 0xfb52);

/// How long renewed canopy certs are valid for, in days.
///
/// Kept longer than [`CERT_RENEW_AFTER`] (currently 6 days of validity against
/// a 5-day renewal interval) so a renewal failure doesn't immediately strand
/// the client. Keep this margin when changing either value.
const CERT_VALIDITY_DAYS: i64 = 6;

/// How long to wait between scheduled cert renewals (5 days).
///
/// Renewal runs in a background task in the daemon; the legacy single-shot
/// alerts command builds the client once and exits well within this window.
pub const CERT_RENEW_AFTER: Duration = Duration::from_secs(5 * 24 * 60 * 60);

/// Timeout for the tailscale availability probe.
const TAILSCALE_PROBE_TIMEOUT: Duration = Duration::from_secs(5);
63
/// Factory producing the base [`reqwest::ClientBuilder`] for canopy's clients.
///
/// The caller supplies this so it owns cross-cutting client config (user-agent,
/// `SSLKEYLOGFILE`, proxies, …). Canopy invokes it whenever it needs to build or
/// rebuild a client — at probe time, on mTLS cert renewal, and on reload — then
/// layers its own concerns (mTLS identity, DNS overrides, timeouts) on top.
///
/// The factory is called once per client build and must return a fresh
/// builder each time; it is reference-counted and `Send + Sync` so one
/// instance can be shared by everything that needs to build a client.
pub type ClientBuilderFactory = Arc<dyn Fn() -> reqwest::ClientBuilder + Send + Sync>;
71
72/// Browser-style user-agent string, e.g. `bestool/1.2.3 (Linux 7.0.9 Arch Linux; x86_64)`.
73///
74/// `product` and `version` identify the calling binary; the OS comment is
75/// detected at runtime and cached.
76pub fn user_agent(product: &str, version: &str) -> String {
77	static OS_COMMENT: OnceLock<String> = OnceLock::new();
78	let os_comment = OS_COMMENT.get_or_init(|| {
79		let os = sysinfo::System::long_os_version()
80			.or_else(sysinfo::System::name)
81			.unwrap_or_else(|| std::env::consts::OS.to_owned());
82		format!("{os}; {}", sysinfo::System::cpu_arch())
83	});
84	format!("{product}/{version} ({os_comment})")
85}
86
87/// A [`reqwest::ClientBuilder`] carrying the `bestool` [`user_agent`] for `version`.
88///
89/// Convenience for callers that don't need any extra client config; suitable as
90/// the base of a [`ClientBuilderFactory`].
91pub fn client_builder(version: &str) -> reqwest::ClientBuilder {
92	reqwest::Client::builder().user_agent(user_agent("bestool", version))
93}
94
/// Probe the canopy tailnet endpoint, returning a client routed to it if
/// reachable.
///
/// This is the public entry point to the same probe [`CanopyClient`] uses
/// internally. The returned client carries the same DNS / hardcoded-IP
/// resolution override the reporting client uses, and this module adds **no**
/// client certificate to it — callers reaching canopy this way authenticate by
/// tailnet identity. Returns `None` when the tailnet endpoint isn't reachable,
/// so callers can fall back to public mTLS.
///
/// `make_builder` supplies the base builder for the probe client; see
/// [`ClientBuilderFactory`].
pub async fn tailscale_client(make_builder: &ClientBuilderFactory) -> Option<reqwest::Client> {
	probe_tailscale(make_builder).await
}
106
/// Severities accepted by the canopy `/events` API.
///
/// Canopy narrowed its vocabulary from RFC 5424 to this five-level set; the
/// retired syslog severities (`emergency`, `alert`, `notice`) are rejected by
/// the strict enum validation on `POST /events`.
///
/// Each variant is (de)serialised as its lowercase name, e.g.
/// `Severity::Warning` becomes `"warning"`.
#[derive(Copy, Clone, Debug, Eq, PartialEq, Deserialize, Serialize)]
#[serde(rename_all = "lowercase")]
pub enum Severity {
	Critical,
	Error,
	Warning,
	Info,
	Debug,
}
121
/// Payload for posting to `POST /events` on a canopy server.
///
/// All string fields borrow from the caller for `'a`. Optional fields that are
/// `None` are left out of the serialised JSON entirely rather than sent as
/// `null`.
#[derive(Debug, Clone, Serialize)]
pub struct NewEvent<'a> {
	pub source: &'a str,
	// `ref` is a Rust keyword, hence the raw identifier; the wire name is `ref`.
	#[serde(rename = "ref")]
	pub r#ref: &'a str,
	pub message: &'a str,
	#[serde(skip_serializing_if = "Option::is_none")]
	pub description: Option<&'a str>,
	#[serde(skip_serializing_if = "Option::is_none")]
	pub severity: Option<Severity>,
	// Serialised under the camelCase key `occurredAt`.
	#[serde(rename = "occurredAt", skip_serializing_if = "Option::is_none")]
	pub occurred_at: Option<Timestamp>,
	#[serde(skip_serializing_if = "Option::is_none")]
	pub active: Option<bool>,
}
138
/// HTTP client with auth configured for talking to a canopy server.
///
/// Tries two auth paths in order of preference:
/// 1. **Tailscale**: if the canopy tailnet endpoint is reachable, plain HTTPS
///    works (auth is implicit via tailscale identity).
/// 2. **mTLS**: a fresh self-signed cert from the device key, short-lived
///    ([`CERT_VALIDITY_DAYS`]); for long-running daemons, [`Self::renew`]
///    should tick on [`CERT_RENEW_AFTER`] to swap in a fresh cert before expiry.
///
/// [`Self::refresh`] re-probes tailscale and swaps modes on reload.
pub struct CanopyClient {
	/// Device key PEM, retained so [`Self::refresh`] and [`Self::renew`] can
	/// rebuild the mTLS client. `None` when no key was supplied, in which case
	/// only the tailscale path is available. Wrapped in `Redacted`, presumably
	/// to keep the key out of debug/log output (confirm against its definition).
	device_key: Option<Redacted<String>>,
	/// Tamanu version of the install this client speaks for. Sent verbatim in
	/// the `X-Version` request header — canopy rejects events / status pushes
	/// that don't carry one. Sourced from the running Tamanu install's
	/// `package.json` (via `find_tamanu`); not the bestool / alertd version.
	tamanu_version: String,
	/// Produces the base client builder; see [`ClientBuilderFactory`].
	make_builder: ClientBuilderFactory,
	/// Currently active auth path together with its HTTP client. Request
	/// methods take the read lock just long enough to clone the client and
	/// pick a URL; `refresh` / `renew` take the write lock to swap it.
	state: RwLock<State>,
}
160
/// Which auth path a [`CanopyClient`] is currently using, with the HTTP client
/// built for that path.
enum State {
	/// Talking to canopy over the tailnet; requests go to [`TAILSCALE_URL`].
	Tailscale(reqwest::Client),
	/// Talking to the caller-supplied base URL with a client-certificate
	/// identity derived from the device key.
	Mtls(reqwest::Client),
}
165
166impl State {
167	fn is_tailscale(&self) -> bool {
168		matches!(self, State::Tailscale(_))
169	}
170
171	fn http(&self) -> reqwest::Client {
172		match self {
173			State::Tailscale(http) | State::Mtls(http) => http.clone(),
174		}
175	}
176}
177
178impl fmt::Debug for CanopyClient {
179	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
180		f.debug_struct("CanopyClient").finish_non_exhaustive()
181	}
182}
183
184impl CanopyClient {
185	/// Build a canopy client, preferring tailscale and falling back to mTLS.
186	///
187	/// Probes the tailscale canopy endpoint first; if reachable, uses it.
188	/// Otherwise, if a device key PEM is provided, builds an mTLS client.
189	/// Returns `Ok(None)` if neither path is available.
190	///
191	/// `tamanu_version` is the version of the Tamanu install this client
192	/// speaks for; sent on every request via the `X-Version` header.
193	///
194	/// `make_builder` supplies the base [`reqwest::ClientBuilder`] — see
195	/// [`ClientBuilderFactory`]. Use [`client_builder`] for a sensible default.
196	pub async fn new(
197		tamanu_version: impl Into<String>,
198		device_key_pem: Option<&str>,
199		make_builder: impl Fn() -> reqwest::ClientBuilder + Send + Sync + 'static,
200	) -> Result<Option<Self>> {
201		let tamanu_version = tamanu_version.into();
202		let device_key = device_key_pem.map(|s| Redacted(s.to_owned()));
203		let make_builder: ClientBuilderFactory = Arc::new(make_builder);
204
205		if let Some(http) = probe_tailscale(&make_builder).await {
206			debug!("canopy: tailscale endpoint reachable, preferring it");
207			return Ok(Some(Self {
208				device_key,
209				tamanu_version,
210				make_builder,
211				state: RwLock::new(State::Tailscale(http)),
212			}));
213		}
214
215		if let Some(pem) = device_key_pem {
216			debug!("canopy: tailscale unreachable, falling back to mTLS");
217			let http = build_mtls_http(&make_builder, pem)?;
218			return Ok(Some(Self {
219				device_key,
220				tamanu_version,
221				make_builder,
222				state: RwLock::new(State::Mtls(http)),
223			}));
224		}
225
226		Ok(None)
227	}
228
229	/// Returns true if the client is currently using the tailscale path.
230	pub async fn is_tailscale(&self) -> bool {
231		self.state.read().await.is_tailscale()
232	}
233
234	/// Re-probe tailscale and swap modes if the picture has changed.
235	///
236	/// Intended to be called when the daemon receives a reload signal.
237	pub async fn refresh(&self) -> Result<()> {
238		if let Some(http) = probe_tailscale(&self.make_builder).await {
239			let mut state = self.state.write().await;
240			if !state.is_tailscale() {
241				debug!("canopy refresh: switching to tailscale path");
242			}
243			*state = State::Tailscale(http);
244			return Ok(());
245		}
246
247		if let Some(pem) = &self.device_key {
248			let http = build_mtls_http(&self.make_builder, &pem.0)?;
249			let mut state = self.state.write().await;
250			if state.is_tailscale() {
251				debug!("canopy refresh: tailscale dropped, falling back to mTLS");
252			}
253			*state = State::Mtls(http);
254			return Ok(());
255		}
256
257		debug!("canopy refresh: no auth path available, keeping current state");
258		Ok(())
259	}
260
261	/// Rebuild the underlying HTTP client with a fresh certificate.
262	///
263	/// No-op in tailscale mode (no cert to rotate). In mTLS mode, atomically
264	/// replaces the live client; in-flight requests continue with the old
265	/// client until they complete.
266	pub async fn renew(&self) -> Result<()> {
267		let Some(pem) = &self.device_key else {
268			return Ok(());
269		};
270		let mut state = self.state.write().await;
271		if state.is_tailscale() {
272			return Ok(());
273		}
274		*state = State::Mtls(build_mtls_http(&self.make_builder, &pem.0)?);
275		Ok(())
276	}
277
278	/// POST a status snapshot to the canopy server.
279	///
280	/// In tailscale mode, `base_url` is ignored and a `{TAILSCALE_URL}/public/status/{server_id}`
281	/// URL is used. In mTLS mode, posts to `{base_url}/status/{server_id}`.
282	///
283	/// The payload is free-form JSON; the canopy `/status` contract reserves the
284	/// top-level `health: []` key, whose entries each carry a `result` of
285	/// `passed | warning | failed | broken | skipped`. The body is gzip-encoded
286	/// with `Content-Encoding: gzip`.
287	///
288	/// Returns `backup_now`: the backup-type names canopy says this server should
289	/// back up right now (operator one-offs + schedule-due). Empty means nothing
290	/// to do. A response that predates the field (no `backup_now`) yields an empty
291	/// list, so older canopy deployments keep working.
292	pub async fn post_status(
293		&self,
294		base_url: &Url,
295		server_id: &str,
296		payload: &serde_json::Value,
297	) -> Result<Vec<String>> {
298		let (http, url) = {
299			let state = self.state.read().await;
300			let url = match &*state {
301				State::Tailscale(_) => format!("{TAILSCALE_URL}/public/status/{server_id}")
302					.parse::<Url>()
303					.into_diagnostic()
304					.wrap_err("building tailscale /public/status URL")?,
305				State::Mtls(_) => base_url
306					.join(&format!("/status/{server_id}"))
307					.into_diagnostic()
308					.wrap_err("building /status URL")?,
309			};
310			(state.http(), url)
311		};
312
313		let raw = serde_json::to_vec(payload)
314			.into_diagnostic()
315			.wrap_err("serialising canopy /status payload")?;
316		let compressed = gzip_bytes(&raw)
317			.into_diagnostic()
318			.wrap_err("gzipping canopy /status payload")?;
319
320		debug!(
321			%url,
322			raw_bytes = raw.len(),
323			gzip_bytes = compressed.len(),
324			"posting status snapshot to canopy",
325		);
326
327		let response = http
328			.post(url)
329			.header("X-Version", &self.tamanu_version)
330			.header(reqwest::header::CONTENT_TYPE, "application/json")
331			.header(reqwest::header::CONTENT_ENCODING, "gzip")
332			.body(compressed)
333			.send()
334			.await
335			.into_diagnostic()
336			.wrap_err("posting status to canopy")?;
337
338		let status = response.status();
339		if !status.is_success() {
340			let body = response.text().await.unwrap_or_default();
341			return Err(miette::miette!("canopy /status returned {status}: {body}"));
342		}
343
344		#[derive(Deserialize, Default)]
345		struct StatusResponseTail {
346			#[serde(default)]
347			backup_now: Vec<String>,
348		}
349
350		// The response flattens the persisted Status plus `backup_now`; we read
351		// only the latter and ignore the rest. A body that fails to parse (or
352		// predates the field) is treated as "nothing to do" rather than failing
353		// the status push.
354		let tail = response
355			.json::<StatusResponseTail>()
356			.await
357			.unwrap_or_default();
358		Ok(tail.backup_now)
359	}
360
361	/// GET a path on the canopy server, routed via tailscale when available.
362	///
363	/// In tailscale mode, the request goes to `{TAILSCALE_URL}{tailscale_path}`
364	/// (typically `/public/...`, the only mount that accepts tagged-device
365	/// tailscale callers). In mTLS mode, the request goes to `{base_url}{mtls_path}`.
366	///
367	/// Returns the raw response — the caller is responsible for status checks
368	/// and body parsing so they can choose how to fall back if the response
369	/// isn't usable.
370	pub async fn get(
371		&self,
372		base_url: &Url,
373		tailscale_path: &str,
374		mtls_path: &str,
375	) -> Result<reqwest::Response> {
376		let (http, url) = {
377			let state = self.state.read().await;
378			let url = match &*state {
379				State::Tailscale(_) => format!("{TAILSCALE_URL}{tailscale_path}")
380					.parse::<Url>()
381					.into_diagnostic()
382					.wrap_err("building tailscale GET URL")?,
383				State::Mtls(_) => base_url
384					.join(mtls_path)
385					.into_diagnostic()
386					.wrap_err("building mTLS GET URL")?,
387			};
388			(state.http(), url)
389		};
390
391		debug!(%url, "GET via canopy");
392		http.get(url)
393			.header("X-Version", &self.tamanu_version)
394			.send()
395			.await
396			.into_diagnostic()
397			.wrap_err("GET via canopy")
398	}
399
400	/// POST an event to the canopy server.
401	///
402	/// In tailscale mode, `base_url` is ignored and [`TAILSCALE_URL`] is used.
403	/// In mTLS mode, posts to `{base_url}/events`.
404	pub async fn post_event(&self, base_url: &Url, event: NewEvent<'_>) -> Result<()> {
405		let (http, url) = {
406			let state = self.state.read().await;
407			let url = match &*state {
408				State::Tailscale(_) => format!("{TAILSCALE_URL}/public/events")
409					.parse::<Url>()
410					.into_diagnostic()
411					.wrap_err("building tailscale /public/events URL")?,
412				State::Mtls(_) => base_url
413					.join("/events")
414					.into_diagnostic()
415					.wrap_err("building /events URL")?,
416			};
417			(state.http(), url)
418		};
419
420		debug!(
421			%url,
422			source = event.source,
423			r#ref = event.r#ref,
424			active = ?event.active,
425			"posting event to canopy"
426		);
427
428		let response = http
429			.post(url)
430			.header("X-Version", &self.tamanu_version)
431			.json(&event)
432			.send()
433			.await
434			.into_diagnostic()
435			.wrap_err("posting event to canopy")?;
436
437		let status = response.status();
438		if !status.is_success() {
439			let body = response.text().await.unwrap_or_default();
440			return Err(miette::miette!("canopy /events returned {status}: {body}"));
441		}
442
443		Ok(())
444	}
445
446	/// Resolve an endpoint URL for the current auth path.
447	///
448	/// `path` is the mTLS-mode path (e.g. `/backup-target`); over tailscale the
449	/// same endpoint is mounted under `/public`, so this prepends it.
450	async fn endpoint_url(&self, base_url: &Url, path: &str) -> Result<(reqwest::Client, Url)> {
451		let state = self.state.read().await;
452		let url = match &*state {
453			State::Tailscale(_) => format!("{TAILSCALE_URL}/public{path}")
454				.parse::<Url>()
455				.into_diagnostic()
456				.wrap_err_with(|| format!("building tailscale /public{path} URL"))?,
457			State::Mtls(_) => base_url
458				.join(path)
459				.into_diagnostic()
460				.wrap_err_with(|| format!("building {path} URL"))?,
461		};
462		Ok((state.http(), url))
463	}
464
465	/// Register the backup types this server can run (`POST /backup-capabilities`).
466	pub async fn backup_capabilities(&self, base_url: &Url, types: &[String]) -> Result<()> {
467		let (http, url) = self.endpoint_url(base_url, "/backup-capabilities").await?;
468		debug!(%url, ?types, "registering backup capabilities with canopy");
469		let response = http
470			.post(url)
471			.header("X-Version", &self.tamanu_version)
472			.json(&CapabilitiesRequest { types })
473			.send()
474			.await
475			.into_diagnostic()
476			.wrap_err("posting backup capabilities to canopy")?;
477
478		let status = response.status();
479		if !status.is_success() {
480			let body = response.text().await.unwrap_or_default();
481			return Err(miette::miette!(
482				"canopy /backup-capabilities returned {status}: {body}"
483			));
484		}
485		Ok(())
486	}
487
488	/// Obtain short-lived S3 credentials for a backup type (`POST /backup-credentials`).
489	///
490	/// Returns the `credential_process`-shaped creds; the caller translates them
491	/// to the container-creds shape for kopia. `412`/`409`/`502` surface as errors.
492	pub async fn backup_credentials(
493		&self,
494		base_url: &Url,
495		backup_type: &str,
496		purpose: Purpose,
497	) -> Result<BackupCredentials> {
498		let (http, url) = self.endpoint_url(base_url, "/backup-credentials").await?;
499		debug!(%url, backup_type, ?purpose, "requesting backup credentials from canopy");
500		let response = http
501			.post(url)
502			.header("X-Version", &self.tamanu_version)
503			.json(&BackupCredentialsRequest {
504				r#type: backup_type,
505				purpose,
506			})
507			.send()
508			.await
509			.into_diagnostic()
510			.wrap_err("posting backup credentials request to canopy")?;
511
512		let status = response.status();
513		if !status.is_success() {
514			let body = response.text().await.unwrap_or_default();
515			return Err(miette::miette!(
516				"canopy /backup-credentials returned {status}: {body}"
517			));
518		}
519		response
520			.json::<BackupCredentials>()
521			.await
522			.into_diagnostic()
523			.wrap_err("parsing backup credentials from canopy")
524	}
525
526	/// Fetch the S3 repo target (`GET /backup-target`).
527	///
528	/// `412`/`409` mean the device isn't yet authorised for backups; these map to
529	/// [`TargetOutcome::Dormant`] (a benign idle state) rather than an error.
530	pub async fn backup_target(&self, base_url: &Url) -> Result<TargetOutcome> {
531		let (http, url) = self.endpoint_url(base_url, "/backup-target").await?;
532		debug!(%url, "fetching backup target from canopy");
533		let response = http
534			.get(url)
535			.header("X-Version", &self.tamanu_version)
536			.send()
537			.await
538			.into_diagnostic()
539			.wrap_err("fetching backup target from canopy")?;
540
541		let status = response.status();
542		if status == reqwest::StatusCode::PRECONDITION_FAILED
543			|| status == reqwest::StatusCode::CONFLICT
544		{
545			return Ok(TargetOutcome::Dormant);
546		}
547		if !status.is_success() {
548			let body = response.text().await.unwrap_or_default();
549			return Err(miette::miette!(
550				"canopy /backup-target returned {status}: {body}"
551			));
552		}
553		let target = response
554			.json::<BackupTarget>()
555			.await
556			.into_diagnostic()
557			.wrap_err("parsing backup target from canopy")?;
558		Ok(TargetOutcome::Ready(target))
559	}
560
561	/// Report a completed backup/restore run (`POST /backup-report`).
562	pub async fn backup_report(&self, base_url: &Url, report: &BackupReport<'_>) -> Result<()> {
563		let (http, url) = self.endpoint_url(base_url, "/backup-report").await?;
564		debug!(%url, run_id = report.run_id, "reporting backup outcome to canopy");
565		let response = http
566			.post(url)
567			.header("X-Version", &self.tamanu_version)
568			.json(report)
569			.send()
570			.await
571			.into_diagnostic()
572			.wrap_err("posting backup report to canopy")?;
573
574		let status = response.status();
575		if !status.is_success() {
576			let body = response.text().await.unwrap_or_default();
577			return Err(miette::miette!(
578				"canopy /backup-report returned {status}: {body}"
579			));
580		}
581		Ok(())
582	}
583}
584
585/// Probe the tailscale canopy endpoint.
586///
587/// Returns a configured `reqwest::Client` if `GET /public/servers` responds
588/// 2xx — anything else (timeout, non-2xx, transport error) returns `None` so
589/// the caller can fall back to mTLS.
590///
591/// Tries two paths in order:
592/// 1. Resolve `canopy` via the tailscale DNS server (100.100.100.100) and
593///    probe with those addresses.
594/// 2. Use hardcoded tailscale IPs for canopy and probe with those.
595///
596/// `/public/servers` is used because:
597/// - it lives under `/public/...`, the only mount that accepts tagged-device
598///   tailscale callers (everything else 403s with `tagged-device-not-allowed`);
599/// - it's a `GET` with no body, no `VersionHeader` requirement, and no auth;
600/// - it's read-only, so probing it has no side effects.
601async fn probe_tailscale(make_builder: &ClientBuilderFactory) -> Option<reqwest::Client> {
602	let dns_addrs: Vec<SocketAddr> = tailscale_resolver()
603		.lookup_ip("canopy")
604		.await
605		.ok()
606		.map(|addrs| addrs.iter().map(|ip| SocketAddr::new(ip, 443)).collect())
607		.unwrap_or_default();
608	if !dns_addrs.is_empty()
609		&& let Some(client) = try_probe(&dns_addrs, make_builder).await
610	{
611		return Some(client);
612	}
613
614	let hardcoded = [
615		SocketAddr::new(IpAddr::V4(CANOPY_HARDCODED_V4), 443),
616		SocketAddr::new(IpAddr::V6(CANOPY_HARDCODED_V6), 443),
617	];
618	debug!(
619		?hardcoded,
620		"canopy tailscale DNS lookup empty or probe failed, trying hardcoded IPs"
621	);
622	try_probe(&hardcoded, make_builder).await
623}
624
625async fn try_probe(
626	addrs: &[SocketAddr],
627	make_builder: &ClientBuilderFactory,
628) -> Option<reqwest::Client> {
629	let client = make_builder()
630		.timeout(TAILSCALE_PROBE_TIMEOUT)
631		.resolve_to_addrs(TAILSCALE_HOST, addrs)
632		.build()
633		.ok()?;
634
635	let url = format!("{TAILSCALE_URL}/public/servers");
636	match client.get(&url).send().await {
637		Ok(resp) if resp.status().is_success() => Some(client),
638		Ok(resp) => {
639			debug!(status = %resp.status(), ?addrs, "canopy tailscale probe: unexpected status");
640			None
641		}
642		Err(err) => {
643			debug!(?addrs, "canopy tailscale probe failed: {err}");
644			None
645		}
646	}
647}
648
649fn tailscale_resolver() -> Resolver<impl ConnectionProvider> {
650	Resolver::builder_with_config(
651		ResolverConfig::from_parts(
652			None,
653			vec!["tail53aef.ts.net.".parse().unwrap()],
654			vec![NameServerConfig::new(
655				"100.100.100.100".parse().unwrap(),
656				true,
657				vec![ConnectionConfig::udp()],
658			)],
659		),
660		TokioRuntimeProvider::default(),
661	)
662	.build()
663	.expect("tailscale resolver config is hardcoded and cannot fail to build")
664}
665
666fn gzip_bytes(bytes: &[u8]) -> std::io::Result<Vec<u8>> {
667	let mut encoder = GzEncoder::new(Vec::with_capacity(bytes.len() / 2), Compression::default());
668	encoder.write_all(bytes)?;
669	encoder.finish()
670}
671
672/// Build a short-lived self-signed client certificate from a P-256 device key
673/// PEM and wrap it as a reqwest mTLS [`Identity`].
674///
675/// Canopy identifies a device by its certificate's public key (SPKI), not by a
676/// CA chain, so a fresh self-signed cert from the device key is all that's
677/// needed. The same device key drives both the long-running canopy client here
678/// and the one-shot `canopy register` enrollment handshake, so they present the
679/// same identity to canopy.
680pub fn device_identity(device_key_pem: &str) -> Result<reqwest::Identity> {
681	let key_pair = KeyPair::from_pem(device_key_pem)
682		.into_diagnostic()
683		.wrap_err("parsing device key PEM")?;
684
685	let mut params = CertificateParams::new(vec!["device.local".into()])
686		.into_diagnostic()
687		.wrap_err("building certificate params")?;
688	params.distinguished_name = DistinguishedName::new();
689	params
690		.distinguished_name
691		.push(DnType::CommonName, "device.local");
692
693	let now = OffsetDateTime::now_utc();
694	params.not_before = now - TimeDuration::minutes(1);
695	params.not_after = now + TimeDuration::days(CERT_VALIDITY_DAYS);
696
697	let cert = params
698		.self_signed(&key_pair)
699		.into_diagnostic()
700		.wrap_err("self-signing certificate")?;
701
702	let mut combined = cert.pem();
703	combined.push('\n');
704	combined.push_str(&key_pair.serialize_pem());
705
706	reqwest::Identity::from_pem(combined.as_bytes())
707		.into_diagnostic()
708		.wrap_err("building reqwest TLS identity")
709}
710
711fn build_mtls_http(
712	make_builder: &ClientBuilderFactory,
713	device_key_pem: &str,
714) -> Result<reqwest::Client> {
715	let identity = device_identity(device_key_pem)?;
716
717	make_builder()
718		.identity(identity)
719		.use_rustls_tls()
720		.timeout(Duration::from_secs(30))
721		.build()
722		.into_diagnostic()
723		.wrap_err("building canopy HTTP client")
724}
725
/// Unit tests for the canopy client.
///
/// None of these touch the network: clients are constructed directly in a
/// known state instead of going through `CanopyClient::new`, which would run
/// the tailscale probe.
#[cfg(test)]
mod tests {
	use super::*;

	// Private-key fixture in PEM form, used only by these tests. The mTLS
	// build tests below rely on it parsing as a P-256 key.
	const TEST_DEVICE_KEY: &str = "\
-----BEGIN PRIVATE KEY-----
MIGHAgEAMBMGByqGSM49AgEGCCqGSM49AwEHBG0wawIBAQQgVvhzsYiidp38GYn1
KxD5Wipc/h8lglVsy1UFZq/SZbGhRANCAAT2EsEq7xjeWVnim9XwdYXga/LBbppm
fXLgamTYOa/w9n/Ta64fiYWmN54kEd0DgnflJDLtID321Zz6xswvK/VN
-----END PRIVATE KEY-----";

	/// Minimal builder factory: a plain `reqwest::Client::builder` with no
	/// extra configuration.
	fn test_factory() -> ClientBuilderFactory {
		Arc::new(reqwest::Client::builder)
	}

	/// A valid device key yields a usable mTLS client.
	#[test]
	fn build_mtls_http_from_p256_key() {
		// Direct mTLS-path build, bypassing the async constructor / tailscale probe.
		let result = build_mtls_http(&test_factory(), TEST_DEVICE_KEY);
		assert!(result.is_ok(), "{:?}", result.err());
	}

	/// An unparseable key is reported as an error rather than panicking.
	#[test]
	fn build_mtls_http_fails_on_garbage_key() {
		assert!(build_mtls_http(&test_factory(), "not a real PEM").is_err());
	}

	/// Renewing in mTLS mode succeeds and leaves the client in mTLS mode.
	#[tokio::test]
	async fn renew_with_mtls_state_swaps_in_fresh_client() {
		// Construct an mTLS-state client directly (no network probe) and renew it.
		let http = build_mtls_http(&test_factory(), TEST_DEVICE_KEY).unwrap();
		let client = CanopyClient {
			device_key: Some(Redacted(TEST_DEVICE_KEY.to_owned())),
			tamanu_version: "2.54.2".into(),
			make_builder: test_factory(),
			state: RwLock::new(State::Mtls(http)),
		};
		client.renew().await.expect("renew should succeed");
		assert!(!client.is_tailscale().await);
	}

	/// Renewing in tailscale mode does nothing and keeps tailscale mode.
	#[tokio::test]
	async fn renew_is_noop_in_tailscale_mode() {
		// Tailscale-state client with no device key — renew is a no-op.
		let http = reqwest::Client::new();
		let client = CanopyClient {
			device_key: None,
			tamanu_version: "2.54.2".into(),
			make_builder: test_factory(),
			state: RwLock::new(State::Tailscale(http)),
		};
		client.renew().await.expect("renew should be a no-op");
		assert!(client.is_tailscale().await);
	}

	/// The user-agent starts with `product/version` and ends with a
	/// parenthesised comment that includes the CPU architecture.
	#[test]
	fn user_agent_has_product_and_os_comment() {
		let ua = user_agent("bestool", "1.2.3");
		assert!(
			ua.starts_with("bestool/1.2.3 "),
			"unexpected user-agent: {ua}"
		);
		assert!(ua.contains('('), "expected OS comment in: {ua}");
		assert!(ua.ends_with(')'), "expected OS comment in: {ua}");
		assert!(
			ua.contains(sysinfo::System::cpu_arch().as_str()),
			"expected arch in: {ua}"
		);
	}

	/// `gzip_bytes` output carries the gzip magic bytes and decompresses back
	/// to the input.
	#[test]
	fn gzip_bytes_roundtrips() {
		use flate2::read::GzDecoder;
		use std::io::Read;

		let original = br#"{"health":[{"check":"x","result":"passed"}]}"#;
		let compressed = gzip_bytes(original).expect("gzip should succeed");
		assert!(
			compressed.starts_with(&[0x1f, 0x8b]),
			"expected gzip magic bytes"
		);
		let mut decoder = GzDecoder::new(&compressed[..]);
		let mut decompressed = Vec::new();
		decoder.read_to_end(&mut decompressed).unwrap();
		assert_eq!(decompressed, original);
	}

	/// Severities go over the wire as lowercase strings.
	#[test]
	fn severity_serialises_lowercase() {
		assert_eq!(
			serde_json::to_string(&Severity::Warning).unwrap(),
			"\"warning\""
		);
		assert_eq!(
			serde_json::to_string(&Severity::Critical).unwrap(),
			"\"critical\""
		);
	}

	/// `None` optional fields are dropped from the JSON, not sent as `null`.
	#[test]
	fn new_event_omits_optional_fields() {
		let evt = NewEvent {
			source: "src",
			r#ref: "host/alert:tgt",
			message: "msg",
			description: None,
			severity: None,
			occurred_at: None,
			active: None,
		};
		let json = serde_json::to_string(&evt).unwrap();
		assert!(json.contains("\"source\":\"src\""));
		assert!(json.contains("\"ref\":\"host/alert:tgt\""));
		assert!(json.contains("\"message\":\"msg\""));
		assert!(!json.contains("description"));
		assert!(!json.contains("severity"));
		assert!(!json.contains("occurredAt"));
		assert!(!json.contains("active"));
	}

	/// Populated optional fields are serialised, with `occurred_at` under the
	/// camelCase key `occurredAt`.
	#[test]
	fn new_event_serialises_occurred_at_as_camel_case() {
		let evt = NewEvent {
			source: "src",
			r#ref: "ref",
			message: "msg",
			description: Some("desc"),
			severity: Some(Severity::Warning),
			occurred_at: Some("2025-01-01T00:00:00Z".parse().unwrap()),
			active: Some(true),
		};
		let json = serde_json::to_string(&evt).unwrap();
		assert!(json.contains("\"occurredAt\":"));
		assert!(json.contains("\"description\":\"desc\""));
		assert!(json.contains("\"severity\":\"warning\""));
		assert!(json.contains("\"active\":true"));
	}
}