// bestool_canopy/client.rs

1use std::{
2	fmt,
3	io::Write,
4	net::{IpAddr, Ipv4Addr, Ipv6Addr, SocketAddr},
5	sync::{Arc, OnceLock},
6	time::Duration,
7};
8
9use flate2::{Compression, write::GzEncoder};
10use hickory_resolver::{
11	ConnectionProvider, Resolver,
12	config::{ConnectionConfig, NameServerConfig, ResolverConfig},
13	net::runtime::TokioRuntimeProvider,
14};
15use jiff::Timestamp;
16use miette::{IntoDiagnostic, Result, WrapErr};
17use rcgen::{CertificateParams, DistinguishedName, DnType, KeyPair};
18use reqwest::Url;
19use serde::{Deserialize, Serialize};
20use time::{Duration as TimeDuration, OffsetDateTime};
21use tokio::sync::RwLock;
22use tracing::debug;
23use uuid::Uuid;
24
25use crate::{
26	Redacted,
27	backup::{
28		BackupCredentials, BackupCredentialsRequest, BackupReport, BackupTarget,
29		CapabilitiesRequest, Purpose, TargetOutcome,
30	},
31	restore::{
32		RestoreCapabilitiesRequest, RestoreCredentials, RestoreCredentialsRequest,
33		RestoreVerification, WorklistEntry,
34	},
35};
36
/// Default base URL of the canopy server.
pub const DEFAULT_CANOPY_URL: &str = "https://meta.tamanu.app";

/// Base URL of canopy's tailscale-internal endpoint.
///
/// Hosts that share the canopy tailnet can post here without mTLS: the
/// tailscale identity is what authenticates the caller.
pub const TAILSCALE_URL: &str = "https://canopy.tail53aef.ts.net";

/// Host part of [`TAILSCALE_URL`], as handed to `resolve_to_addrs` overrides.
const TAILSCALE_HOST: &str = "canopy.tail53aef.ts.net";

/// Fixed tailscale addresses for canopy, for when tailscale DNS
/// (100.100.100.100) can't be reached but the rest of the tailnet can.
const CANOPY_HARDCODED_V4: Ipv4Addr = Ipv4Addr::new(100, 99, 98, 97);
const CANOPY_HARDCODED_V6: Ipv6Addr =
	Ipv6Addr::new(0xfd7a, 0x115c, 0xa1e0, 0x0, 0x0, 0x0, 0x9337, 0xfb52);

/// Validity period, in days, of each freshly issued canopy certificate.
///
/// Longer than [`CERT_RENEW_AFTER`], so that a failed renewal doesn't strand
/// the client straight away.
const CERT_VALIDITY_DAYS: i64 = 6;

/// Interval between scheduled certificate renewals.
///
/// The daemon renews from a background task; the legacy single-shot alerts
/// command builds its client once and exits long before this elapses.
pub const CERT_RENEW_AFTER: Duration = Duration::from_secs(5 * 24 * 3600);

/// Upper bound on how long the tailscale availability probe may take.
const TAILSCALE_PROBE_TIMEOUT: Duration = Duration::from_secs(5);
68
69/// Factory producing the base [`reqwest::ClientBuilder`] for canopy's clients.
70///
71/// The caller supplies this so it owns cross-cutting client config (user-agent,
72/// `SSLKEYLOGFILE`, proxies, …). Canopy invokes it whenever it needs to build or
73/// rebuild a client — at probe time, on mTLS cert renewal, and on reload — then
74/// layers its own concerns (mTLS identity, DNS overrides, timeouts) on top.
75pub type ClientBuilderFactory = Arc<dyn Fn() -> reqwest::ClientBuilder + Send + Sync>;
76
77/// Browser-style user-agent string, e.g. `bestool/1.2.3 (Linux 7.0.9 Arch Linux; x86_64)`.
78///
79/// `product` and `version` identify the calling binary; the OS comment is
80/// detected at runtime and cached.
81pub fn user_agent(product: &str, version: &str) -> String {
82	static OS_COMMENT: OnceLock<String> = OnceLock::new();
83	let os_comment = OS_COMMENT.get_or_init(|| {
84		let os = sysinfo::System::long_os_version()
85			.or_else(sysinfo::System::name)
86			.unwrap_or_else(|| std::env::consts::OS.to_owned());
87		format!("{os}; {}", sysinfo::System::cpu_arch())
88	});
89	format!("{product}/{version} ({os_comment})")
90}
91
92/// A [`reqwest::ClientBuilder`] carrying the `bestool` [`user_agent`] for `version`.
93///
94/// Convenience for callers that don't need any extra client config; suitable as
95/// the base of a [`ClientBuilderFactory`].
96pub fn client_builder(version: &str) -> reqwest::ClientBuilder {
97	reqwest::Client::builder().user_agent(user_agent("bestool", version))
98}
99
100/// Probe the canopy tailnet endpoint, returning a client routed to it if
101/// reachable.
102///
103/// The returned client carries the same DNS / hardcoded-IP resolution override
104/// the reporting client uses and presents **no** client certificate — callers
105/// reaching canopy this way authenticate by tailnet identity. Returns `None`
106/// when the tailnet endpoint isn't reachable, so callers can fall back to
107/// public mTLS.
108pub async fn tailscale_client(make_builder: &ClientBuilderFactory) -> Option<reqwest::Client> {
109	probe_tailscale(make_builder).await
110}
111
112/// Severities accepted by the canopy `/events` API.
113///
114/// Canopy narrowed its vocabulary from RFC 5424 to this five-level set; the
115/// retired syslog severities (`emergency`, `alert`, `notice`) are rejected by
116/// the strict enum validation on `POST /events`.
117#[derive(Copy, Clone, Debug, Eq, PartialEq, Deserialize, Serialize)]
118#[serde(rename_all = "lowercase")]
119pub enum Severity {
120	Critical,
121	Error,
122	Warning,
123	Info,
124	Debug,
125}
126
127/// Payload for posting to `POST /events` on a canopy server.
128#[derive(Debug, Clone, Serialize)]
129pub struct NewEvent<'a> {
130	pub source: &'a str,
131	#[serde(rename = "ref")]
132	pub r#ref: &'a str,
133	pub message: &'a str,
134	#[serde(skip_serializing_if = "Option::is_none")]
135	pub description: Option<&'a str>,
136	#[serde(skip_serializing_if = "Option::is_none")]
137	pub severity: Option<Severity>,
138	#[serde(rename = "occurredAt", skip_serializing_if = "Option::is_none")]
139	pub occurred_at: Option<Timestamp>,
140	#[serde(skip_serializing_if = "Option::is_none")]
141	pub active: Option<bool>,
142}
143
144/// HTTP client with auth configured for talking to a canopy server.
145///
146/// Tries two auth paths in order of preference:
147/// 1. **Tailscale**: if the canopy tailnet endpoint is reachable, plain HTTPS
148///    works (auth is implicit via tailscale identity).
149/// 2. **mTLS**: a fresh self-signed cert from the device key, short-lived
150///    ([`CERT_VALIDITY_DAYS`]); for long-running daemons, [`Self::renew`]
151///    should tick on [`CERT_RENEW_AFTER`] to swap in a fresh cert before expiry.
152///
153/// [`Self::refresh`] re-probes tailscale and swaps modes on reload.
154pub struct CanopyClient {
155	device_key: Option<Redacted<String>>,
156	/// Tamanu version of the install this client speaks for. Sent verbatim in
157	/// the `X-Version` request header — canopy rejects events / status pushes
158	/// that don't carry one. Sourced from the running Tamanu install's
159	/// `package.json` (via `find_tamanu`); not the bestool / alertd version.
160	tamanu_version: String,
161	/// Produces the base client builder; see [`ClientBuilderFactory`].
162	make_builder: ClientBuilderFactory,
163	state: RwLock<State>,
164}
165
166enum State {
167	Tailscale(reqwest::Client),
168	Mtls(reqwest::Client),
169}
170
171impl State {
172	fn is_tailscale(&self) -> bool {
173		matches!(self, State::Tailscale(_))
174	}
175
176	fn http(&self) -> reqwest::Client {
177		match self {
178			State::Tailscale(http) | State::Mtls(http) => http.clone(),
179		}
180	}
181}
182
183impl fmt::Debug for CanopyClient {
184	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
185		f.debug_struct("CanopyClient").finish_non_exhaustive()
186	}
187}
188
189impl CanopyClient {
190	/// Build a canopy client, preferring tailscale and falling back to mTLS.
191	///
192	/// Probes the tailscale canopy endpoint first; if reachable, uses it.
193	/// Otherwise, if a device key PEM is provided, builds an mTLS client.
194	/// Returns `Ok(None)` if neither path is available.
195	///
196	/// `tamanu_version` is the version of the Tamanu install this client
197	/// speaks for; sent on every request via the `X-Version` header.
198	///
199	/// `make_builder` supplies the base [`reqwest::ClientBuilder`] — see
200	/// [`ClientBuilderFactory`]. Use [`client_builder`] for a sensible default.
201	pub async fn new(
202		tamanu_version: impl Into<String>,
203		device_key_pem: Option<&str>,
204		make_builder: impl Fn() -> reqwest::ClientBuilder + Send + Sync + 'static,
205	) -> Result<Option<Self>> {
206		let tamanu_version = tamanu_version.into();
207		let device_key = device_key_pem.map(|s| Redacted(s.to_owned()));
208		let make_builder: ClientBuilderFactory = Arc::new(make_builder);
209
210		if let Some(http) = probe_tailscale(&make_builder).await {
211			debug!("canopy: tailscale endpoint reachable, preferring it");
212			return Ok(Some(Self {
213				device_key,
214				tamanu_version,
215				make_builder,
216				state: RwLock::new(State::Tailscale(http)),
217			}));
218		}
219
220		if let Some(pem) = device_key_pem {
221			debug!("canopy: tailscale unreachable, falling back to mTLS");
222			let http = build_mtls_http(&make_builder, pem)?;
223			return Ok(Some(Self {
224				device_key,
225				tamanu_version,
226				make_builder,
227				state: RwLock::new(State::Mtls(http)),
228			}));
229		}
230
231		Ok(None)
232	}
233
234	/// Returns true if the client is currently using the tailscale path.
235	pub async fn is_tailscale(&self) -> bool {
236		self.state.read().await.is_tailscale()
237	}
238
239	/// Re-probe tailscale and swap modes if the picture has changed.
240	///
241	/// Intended to be called when the daemon receives a reload signal.
242	pub async fn refresh(&self) -> Result<()> {
243		if let Some(http) = probe_tailscale(&self.make_builder).await {
244			let mut state = self.state.write().await;
245			if !state.is_tailscale() {
246				debug!("canopy refresh: switching to tailscale path");
247			}
248			*state = State::Tailscale(http);
249			return Ok(());
250		}
251
252		if let Some(pem) = &self.device_key {
253			let http = build_mtls_http(&self.make_builder, &pem.0)?;
254			let mut state = self.state.write().await;
255			if state.is_tailscale() {
256				debug!("canopy refresh: tailscale dropped, falling back to mTLS");
257			}
258			*state = State::Mtls(http);
259			return Ok(());
260		}
261
262		debug!("canopy refresh: no auth path available, keeping current state");
263		Ok(())
264	}
265
266	/// Rebuild the underlying HTTP client with a fresh certificate.
267	///
268	/// No-op in tailscale mode (no cert to rotate). In mTLS mode, atomically
269	/// replaces the live client; in-flight requests continue with the old
270	/// client until they complete.
271	pub async fn renew(&self) -> Result<()> {
272		let Some(pem) = &self.device_key else {
273			return Ok(());
274		};
275		let mut state = self.state.write().await;
276		if state.is_tailscale() {
277			return Ok(());
278		}
279		*state = State::Mtls(build_mtls_http(&self.make_builder, &pem.0)?);
280		Ok(())
281	}
282
283	/// POST a status snapshot to the canopy server.
284	///
285	/// In tailscale mode, `base_url` is ignored and a `{TAILSCALE_URL}/public/status/{server_id}`
286	/// URL is used. In mTLS mode, posts to `{base_url}/status/{server_id}`.
287	///
288	/// The payload is free-form JSON; the canopy `/status` contract reserves the
289	/// top-level `health: []` key, whose entries each carry a `result` of
290	/// `passed | warning | failed | broken | skipped`. The body is gzip-encoded
291	/// with `Content-Encoding: gzip`.
292	///
293	/// Returns `backup_now`: the backup-type names canopy says this server should
294	/// back up right now (operator one-offs + schedule-due). Empty means nothing
295	/// to do. A response that predates the field (no `backup_now`) yields an empty
296	/// list, so older canopy deployments keep working.
297	pub async fn post_status(
298		&self,
299		base_url: &Url,
300		server_id: &str,
301		payload: &serde_json::Value,
302	) -> Result<Vec<String>> {
303		let (http, url) = {
304			let state = self.state.read().await;
305			let url = match &*state {
306				State::Tailscale(_) => format!("{TAILSCALE_URL}/public/status/{server_id}")
307					.parse::<Url>()
308					.into_diagnostic()
309					.wrap_err("building tailscale /public/status URL")?,
310				State::Mtls(_) => base_url
311					.join(&format!("/status/{server_id}"))
312					.into_diagnostic()
313					.wrap_err("building /status URL")?,
314			};
315			(state.http(), url)
316		};
317
318		let raw = serde_json::to_vec(payload)
319			.into_diagnostic()
320			.wrap_err("serialising canopy /status payload")?;
321		let compressed = gzip_bytes(&raw)
322			.into_diagnostic()
323			.wrap_err("gzipping canopy /status payload")?;
324
325		debug!(
326			%url,
327			raw_bytes = raw.len(),
328			gzip_bytes = compressed.len(),
329			"posting status snapshot to canopy",
330		);
331
332		let response = http
333			.post(url)
334			.header("X-Version", &self.tamanu_version)
335			.header(reqwest::header::CONTENT_TYPE, "application/json")
336			.header(reqwest::header::CONTENT_ENCODING, "gzip")
337			.body(compressed)
338			.send()
339			.await
340			.into_diagnostic()
341			.wrap_err("posting status to canopy")?;
342
343		let status = response.status();
344		if !status.is_success() {
345			let body = response.text().await.unwrap_or_default();
346			return Err(miette::miette!("canopy /status returned {status}: {body}"));
347		}
348
349		#[derive(Deserialize, Default)]
350		struct StatusResponseTail {
351			#[serde(default)]
352			backup_now: Vec<String>,
353		}
354
355		// The response flattens the persisted Status plus `backup_now`; we read
356		// only the latter and ignore the rest. A body that fails to parse (or
357		// predates the field) is treated as "nothing to do" rather than failing
358		// the status push.
359		let tail = response
360			.json::<StatusResponseTail>()
361			.await
362			.unwrap_or_default();
363		Ok(tail.backup_now)
364	}
365
366	/// GET a path on the canopy server, routed via tailscale when available.
367	///
368	/// In tailscale mode, the request goes to `{TAILSCALE_URL}{tailscale_path}`
369	/// (typically `/public/...`, the only mount that accepts tagged-device
370	/// tailscale callers). In mTLS mode, the request goes to `{base_url}{mtls_path}`.
371	///
372	/// Returns the raw response — the caller is responsible for status checks
373	/// and body parsing so they can choose how to fall back if the response
374	/// isn't usable.
375	pub async fn get(
376		&self,
377		base_url: &Url,
378		tailscale_path: &str,
379		mtls_path: &str,
380	) -> Result<reqwest::Response> {
381		let (http, url) = {
382			let state = self.state.read().await;
383			let url = match &*state {
384				State::Tailscale(_) => format!("{TAILSCALE_URL}{tailscale_path}")
385					.parse::<Url>()
386					.into_diagnostic()
387					.wrap_err("building tailscale GET URL")?,
388				State::Mtls(_) => base_url
389					.join(mtls_path)
390					.into_diagnostic()
391					.wrap_err("building mTLS GET URL")?,
392			};
393			(state.http(), url)
394		};
395
396		debug!(%url, "GET via canopy");
397		http.get(url)
398			.header("X-Version", &self.tamanu_version)
399			.send()
400			.await
401			.into_diagnostic()
402			.wrap_err("GET via canopy")
403	}
404
405	/// POST an event to the canopy server.
406	///
407	/// In tailscale mode, `base_url` is ignored and [`TAILSCALE_URL`] is used.
408	/// In mTLS mode, posts to `{base_url}/events`.
409	pub async fn post_event(&self, base_url: &Url, event: NewEvent<'_>) -> Result<()> {
410		let (http, url) = {
411			let state = self.state.read().await;
412			let url = match &*state {
413				State::Tailscale(_) => format!("{TAILSCALE_URL}/public/events")
414					.parse::<Url>()
415					.into_diagnostic()
416					.wrap_err("building tailscale /public/events URL")?,
417				State::Mtls(_) => base_url
418					.join("/events")
419					.into_diagnostic()
420					.wrap_err("building /events URL")?,
421			};
422			(state.http(), url)
423		};
424
425		debug!(
426			%url,
427			source = event.source,
428			r#ref = event.r#ref,
429			active = ?event.active,
430			"posting event to canopy"
431		);
432
433		let response = http
434			.post(url)
435			.header("X-Version", &self.tamanu_version)
436			.json(&event)
437			.send()
438			.await
439			.into_diagnostic()
440			.wrap_err("posting event to canopy")?;
441
442		let status = response.status();
443		if !status.is_success() {
444			let body = response.text().await.unwrap_or_default();
445			return Err(miette::miette!("canopy /events returned {status}: {body}"));
446		}
447
448		Ok(())
449	}
450
451	/// Resolve an endpoint URL for the current auth path.
452	///
453	/// `path` is the mTLS-mode path (e.g. `/backup-target`); over tailscale the
454	/// same endpoint is mounted under `/public`, so this prepends it.
455	async fn endpoint_url(&self, base_url: &Url, path: &str) -> Result<(reqwest::Client, Url)> {
456		let state = self.state.read().await;
457		let url = match &*state {
458			State::Tailscale(_) => format!("{TAILSCALE_URL}/public{path}")
459				.parse::<Url>()
460				.into_diagnostic()
461				.wrap_err_with(|| format!("building tailscale /public{path} URL"))?,
462			State::Mtls(_) => base_url
463				.join(path)
464				.into_diagnostic()
465				.wrap_err_with(|| format!("building {path} URL"))?,
466		};
467		Ok((state.http(), url))
468	}
469
470	/// Register the backup types this server can run (`POST /backup-capabilities`).
471	pub async fn backup_capabilities(&self, base_url: &Url, types: &[String]) -> Result<()> {
472		let (http, url) = self.endpoint_url(base_url, "/backup-capabilities").await?;
473		debug!(%url, ?types, "registering backup capabilities with canopy");
474		let response = http
475			.post(url)
476			.header("X-Version", &self.tamanu_version)
477			.json(&CapabilitiesRequest { types })
478			.send()
479			.await
480			.into_diagnostic()
481			.wrap_err("posting backup capabilities to canopy")?;
482
483		let status = response.status();
484		if !status.is_success() {
485			let body = response.text().await.unwrap_or_default();
486			return Err(miette::miette!(
487				"canopy /backup-capabilities returned {status}: {body}"
488			));
489		}
490		Ok(())
491	}
492
493	/// Obtain short-lived S3 credentials for a backup type (`POST /backup-credentials`).
494	///
495	/// Returns the `credential_process`-shaped creds; the caller translates them
496	/// to the container-creds shape for kopia. `412`/`409`/`502` surface as errors.
497	pub async fn backup_credentials(
498		&self,
499		base_url: &Url,
500		backup_type: &str,
501		purpose: Purpose,
502	) -> Result<BackupCredentials> {
503		let (http, url) = self.endpoint_url(base_url, "/backup-credentials").await?;
504		debug!(%url, backup_type, ?purpose, "requesting backup credentials from canopy");
505		let response = http
506			.post(url)
507			.header("X-Version", &self.tamanu_version)
508			.json(&BackupCredentialsRequest {
509				r#type: backup_type,
510				purpose,
511			})
512			.send()
513			.await
514			.into_diagnostic()
515			.wrap_err("posting backup credentials request to canopy")?;
516
517		let status = response.status();
518		if !status.is_success() {
519			let body = response.text().await.unwrap_or_default();
520			return Err(miette::miette!(
521				"canopy /backup-credentials returned {status}: {body}"
522			));
523		}
524		response
525			.json::<BackupCredentials>()
526			.await
527			.into_diagnostic()
528			.wrap_err("parsing backup credentials from canopy")
529	}
530
531	/// Fetch the S3 repo target (`GET /backup-target`).
532	///
533	/// `412`/`409` mean the device isn't yet authorised for backups; these map to
534	/// [`TargetOutcome::Dormant`] (a benign idle state) rather than an error.
535	pub async fn backup_target(&self, base_url: &Url) -> Result<TargetOutcome> {
536		let (http, url) = self.endpoint_url(base_url, "/backup-target").await?;
537		debug!(%url, "fetching backup target from canopy");
538		let response = http
539			.get(url)
540			.header("X-Version", &self.tamanu_version)
541			.send()
542			.await
543			.into_diagnostic()
544			.wrap_err("fetching backup target from canopy")?;
545
546		let status = response.status();
547		if status == reqwest::StatusCode::PRECONDITION_FAILED
548			|| status == reqwest::StatusCode::CONFLICT
549		{
550			return Ok(TargetOutcome::Dormant);
551		}
552		if !status.is_success() {
553			let body = response.text().await.unwrap_or_default();
554			return Err(miette::miette!(
555				"canopy /backup-target returned {status}: {body}"
556			));
557		}
558		let target = response
559			.json::<BackupTarget>()
560			.await
561			.into_diagnostic()
562			.wrap_err("parsing backup target from canopy")?;
563		Ok(TargetOutcome::Ready(target))
564	}
565
566	/// Report a completed backup/restore run (`POST /backup-report`).
567	pub async fn backup_report(&self, base_url: &Url, report: &BackupReport<'_>) -> Result<()> {
568		let (http, url) = self.endpoint_url(base_url, "/backup-report").await?;
569		debug!(%url, run_id = report.run_id, "reporting backup outcome to canopy");
570		let response = http
571			.post(url)
572			.header("X-Version", &self.tamanu_version)
573			.json(report)
574			.send()
575			.await
576			.into_diagnostic()
577			.wrap_err("posting backup report to canopy")?;
578
579		let status = response.status();
580		if !status.is_success() {
581			let body = response.text().await.unwrap_or_default();
582			return Err(miette::miette!(
583				"canopy /backup-report returned {status}: {body}"
584			));
585		}
586		Ok(())
587	}
588
589	/// Register the restore intents this consumer supports (`POST /restore-capabilities`).
590	///
591	/// Replaces the registered intent set wholesale. Canopy dispatches only
592	/// matching worklist entries.
593	pub async fn restore_capabilities(&self, base_url: &Url, intents: &[&str]) -> Result<()> {
594		let (http, url) = self.endpoint_url(base_url, "/restore-capabilities").await?;
595		debug!(%url, ?intents, "registering restore capabilities with canopy");
596		let response = http
597			.post(url)
598			.header("X-Version", &self.tamanu_version)
599			.json(&RestoreCapabilitiesRequest { intents })
600			.send()
601			.await
602			.into_diagnostic()
603			.wrap_err("posting restore capabilities to canopy")?;
604
605		let status = response.status();
606		if !status.is_success() {
607			let body = response.text().await.unwrap_or_default();
608			return Err(miette::miette!(
609				"canopy /restore-capabilities returned {status}: {body}"
610			));
611		}
612		Ok(())
613	}
614
615	/// Fetch the desired-state worklist of replicas to restore (`GET /restore-worklist`).
616	pub async fn restore_worklist(&self, base_url: &Url) -> Result<Vec<WorklistEntry>> {
617		let (http, url) = self.endpoint_url(base_url, "/restore-worklist").await?;
618		debug!(%url, "fetching restore worklist from canopy");
619		let response = http
620			.get(url)
621			.header("X-Version", &self.tamanu_version)
622			.send()
623			.await
624			.into_diagnostic()
625			.wrap_err("fetching restore worklist from canopy")?;
626
627		let status = response.status();
628		if !status.is_success() {
629			let body = response.text().await.unwrap_or_default();
630			return Err(miette::miette!(
631				"canopy /restore-worklist returned {status}: {body}"
632			));
633		}
634		response
635			.json::<Vec<WorklistEntry>>()
636			.await
637			.into_diagnostic()
638			.wrap_err("parsing restore worklist from canopy")
639	}
640
641	/// Obtain read-only S3 credentials and the repo password for a group
642	/// (`POST /restore-credentials`).
643	///
644	/// Creds are 1-hour chained STS; refresh by re-calling. `403`/`409`/`502`
645	/// surface as errors.
646	pub async fn restore_credentials(
647		&self,
648		base_url: &Url,
649		backup_type: &str,
650		group: Uuid,
651	) -> Result<RestoreCredentials> {
652		let (http, url) = self.endpoint_url(base_url, "/restore-credentials").await?;
653		debug!(%url, backup_type, %group, "requesting restore credentials from canopy");
654		let response = http
655			.post(url)
656			.header("X-Version", &self.tamanu_version)
657			.json(&RestoreCredentialsRequest {
658				group,
659				r#type: backup_type,
660			})
661			.send()
662			.await
663			.into_diagnostic()
664			.wrap_err("posting restore credentials request to canopy")?;
665
666		let status = response.status();
667		if !status.is_success() {
668			let body = response.text().await.unwrap_or_default();
669			return Err(miette::miette!(
670				"canopy /restore-credentials returned {status}: {body}"
671			));
672		}
673		response
674			.json::<RestoreCredentials>()
675			.await
676			.into_diagnostic()
677			.wrap_err("parsing restore credentials from canopy")
678	}
679
680	/// Report a restore's health (`POST /restore-verification`).
681	pub async fn restore_verification(
682		&self,
683		base_url: &Url,
684		report: &RestoreVerification<'_>,
685	) -> Result<()> {
686		let (http, url) = self.endpoint_url(base_url, "/restore-verification").await?;
687		debug!(%url, group = %report.group, "reporting restore verification to canopy");
688		let response = http
689			.post(url)
690			.header("X-Version", &self.tamanu_version)
691			.json(report)
692			.send()
693			.await
694			.into_diagnostic()
695			.wrap_err("posting restore verification to canopy")?;
696
697		let status = response.status();
698		if !status.is_success() {
699			let body = response.text().await.unwrap_or_default();
700			return Err(miette::miette!(
701				"canopy /restore-verification returned {status}: {body}"
702			));
703		}
704		Ok(())
705	}
706}
707
708/// Probe the tailscale canopy endpoint.
709///
710/// Returns a configured `reqwest::Client` if `GET /public/servers` responds
711/// 2xx — anything else (timeout, non-2xx, transport error) returns `None` so
712/// the caller can fall back to mTLS.
713///
714/// Tries two paths in order:
715/// 1. Resolve `canopy` via the tailscale DNS server (100.100.100.100) and
716///    probe with those addresses.
717/// 2. Use hardcoded tailscale IPs for canopy and probe with those.
718///
719/// `/public/servers` is used because:
720/// - it lives under `/public/...`, the only mount that accepts tagged-device
721///   tailscale callers (everything else 403s with `tagged-device-not-allowed`);
722/// - it's a `GET` with no body, no `VersionHeader` requirement, and no auth;
723/// - it's read-only, so probing it has no side effects.
724async fn probe_tailscale(make_builder: &ClientBuilderFactory) -> Option<reqwest::Client> {
725	let dns_addrs: Vec<SocketAddr> = tailscale_resolver()
726		.lookup_ip("canopy")
727		.await
728		.ok()
729		.map(|addrs| addrs.iter().map(|ip| SocketAddr::new(ip, 443)).collect())
730		.unwrap_or_default();
731	if !dns_addrs.is_empty()
732		&& let Some(client) = try_probe(&dns_addrs, make_builder).await
733	{
734		return Some(client);
735	}
736
737	let hardcoded = [
738		SocketAddr::new(IpAddr::V4(CANOPY_HARDCODED_V4), 443),
739		SocketAddr::new(IpAddr::V6(CANOPY_HARDCODED_V6), 443),
740	];
741	debug!(
742		?hardcoded,
743		"canopy tailscale DNS lookup empty or probe failed, trying hardcoded IPs"
744	);
745	try_probe(&hardcoded, make_builder).await
746}
747
748async fn try_probe(
749	addrs: &[SocketAddr],
750	make_builder: &ClientBuilderFactory,
751) -> Option<reqwest::Client> {
752	let client = make_builder()
753		.timeout(TAILSCALE_PROBE_TIMEOUT)
754		.resolve_to_addrs(TAILSCALE_HOST, addrs)
755		.build()
756		.ok()?;
757
758	let url = format!("{TAILSCALE_URL}/public/servers");
759	match client.get(&url).send().await {
760		Ok(resp) if resp.status().is_success() => Some(client),
761		Ok(resp) => {
762			debug!(status = %resp.status(), ?addrs, "canopy tailscale probe: unexpected status");
763			None
764		}
765		Err(err) => {
766			debug!(?addrs, "canopy tailscale probe failed: {err}");
767			None
768		}
769	}
770}
771
772fn tailscale_resolver() -> Resolver<impl ConnectionProvider> {
773	Resolver::builder_with_config(
774		ResolverConfig::from_parts(
775			None,
776			vec!["tail53aef.ts.net.".parse().unwrap()],
777			vec![NameServerConfig::new(
778				"100.100.100.100".parse().unwrap(),
779				true,
780				vec![ConnectionConfig::udp()],
781			)],
782		),
783		TokioRuntimeProvider::default(),
784	)
785	.build()
786	.expect("tailscale resolver config is hardcoded and cannot fail to build")
787}
788
789fn gzip_bytes(bytes: &[u8]) -> std::io::Result<Vec<u8>> {
790	let mut encoder = GzEncoder::new(Vec::with_capacity(bytes.len() / 2), Compression::default());
791	encoder.write_all(bytes)?;
792	encoder.finish()
793}
794
795/// Build a short-lived self-signed client certificate from a P-256 device key
796/// PEM and wrap it as a reqwest mTLS [`Identity`].
797///
798/// Canopy identifies a device by its certificate's public key (SPKI), not by a
799/// CA chain, so a fresh self-signed cert from the device key is all that's
800/// needed. The same device key drives both the long-running canopy client here
801/// and the one-shot `canopy register` enrollment handshake, so they present the
802/// same identity to canopy.
803pub fn device_identity(device_key_pem: &str) -> Result<reqwest::Identity> {
804	let key_pair = KeyPair::from_pem(device_key_pem)
805		.into_diagnostic()
806		.wrap_err("parsing device key PEM")?;
807
808	let mut params = CertificateParams::new(vec!["device.local".into()])
809		.into_diagnostic()
810		.wrap_err("building certificate params")?;
811	params.distinguished_name = DistinguishedName::new();
812	params
813		.distinguished_name
814		.push(DnType::CommonName, "device.local");
815
816	let now = OffsetDateTime::now_utc();
817	params.not_before = now - TimeDuration::minutes(1);
818	params.not_after = now + TimeDuration::days(CERT_VALIDITY_DAYS);
819
820	let cert = params
821		.self_signed(&key_pair)
822		.into_diagnostic()
823		.wrap_err("self-signing certificate")?;
824
825	let mut combined = cert.pem();
826	combined.push('\n');
827	combined.push_str(&key_pair.serialize_pem());
828
829	reqwest::Identity::from_pem(combined.as_bytes())
830		.into_diagnostic()
831		.wrap_err("building reqwest TLS identity")
832}
833
834fn build_mtls_http(
835	make_builder: &ClientBuilderFactory,
836	device_key_pem: &str,
837) -> Result<reqwest::Client> {
838	let identity = device_identity(device_key_pem)?;
839
840	make_builder()
841		.identity(identity)
842		.use_rustls_tls()
843		.timeout(Duration::from_secs(30))
844		.build()
845		.into_diagnostic()
846		.wrap_err("building canopy HTTP client")
847}
848
#[cfg(test)]
mod tests {
	use super::*;

	/// Throwaway P-256 private key PEM used only by these tests.
	const TEST_DEVICE_KEY: &str = "\
-----BEGIN PRIVATE KEY-----
MIGHAgEAMBMGByqGSM49AgEGCCqGSM49AwEHBG0wawIBAQQgVvhzsYiidp38GYn1
KxD5Wipc/h8lglVsy1UFZq/SZbGhRANCAAT2EsEq7xjeWVnim9XwdYXga/LBbppm
fXLgamTYOa/w9n/Ta64fiYWmN54kEd0DgnflJDLtID321Zz6xswvK/VN
-----END PRIVATE KEY-----";

	/// Factory handing out an unconfigured `reqwest` builder.
	fn test_factory() -> ClientBuilderFactory {
		Arc::new(reqwest::Client::builder)
	}

	/// Assemble a client in the given state directly, with no network probe.
	fn client_in(state: State, device_key: Option<&str>) -> CanopyClient {
		CanopyClient {
			device_key: device_key.map(|pem| Redacted(pem.to_owned())),
			tamanu_version: "2.54.2".into(),
			make_builder: test_factory(),
			state: RwLock::new(state),
		}
	}

	#[test]
	fn build_mtls_http_from_p256_key() {
		// Exercises the mTLS build directly, skipping the async constructor
		// and its tailscale probe.
		let built = build_mtls_http(&test_factory(), TEST_DEVICE_KEY);
		assert!(built.is_ok(), "{:?}", built.err());
	}

	#[test]
	fn build_mtls_http_fails_on_garbage_key() {
		let built = build_mtls_http(&test_factory(), "not a real PEM");
		assert!(built.is_err());
	}

	#[tokio::test]
	async fn renew_with_mtls_state_swaps_in_fresh_client() {
		let http = build_mtls_http(&test_factory(), TEST_DEVICE_KEY).unwrap();
		let client = client_in(State::Mtls(http), Some(TEST_DEVICE_KEY));

		client.renew().await.expect("renew should succeed");
		assert!(!client.is_tailscale().await);
	}

	#[tokio::test]
	async fn renew_is_noop_in_tailscale_mode() {
		// Tailscale state and no device key: renew has nothing to do.
		let client = client_in(State::Tailscale(reqwest::Client::new()), None);

		client.renew().await.expect("renew should be a no-op");
		assert!(client.is_tailscale().await);
	}

	#[test]
	fn user_agent_has_product_and_os_comment() {
		let ua = user_agent("bestool", "1.2.3");
		let arch = sysinfo::System::cpu_arch();

		assert!(
			ua.starts_with("bestool/1.2.3 "),
			"unexpected user-agent: {ua}"
		);
		assert!(ua.contains('('), "expected OS comment in: {ua}");
		assert!(ua.ends_with(')'), "expected OS comment in: {ua}");
		assert!(ua.contains(arch.as_str()), "expected arch in: {ua}");
	}

	#[test]
	fn gzip_bytes_roundtrips() {
		use flate2::read::GzDecoder;
		use std::io::Read;

		let original = br#"{"health":[{"check":"x","result":"passed"}]}"#;
		let compressed = gzip_bytes(original).expect("gzip should succeed");
		assert!(
			compressed.starts_with(&[0x1f, 0x8b]),
			"expected gzip magic bytes"
		);

		let mut decompressed = Vec::new();
		GzDecoder::new(&compressed[..])
			.read_to_end(&mut decompressed)
			.unwrap();
		assert_eq!(decompressed, original);
	}

	#[test]
	fn severity_serialises_lowercase() {
		let cases = [
			(Severity::Warning, "\"warning\""),
			(Severity::Critical, "\"critical\""),
		];
		for (severity, expected) in cases {
			assert_eq!(serde_json::to_string(&severity).unwrap(), expected);
		}
	}

	#[test]
	fn new_event_omits_optional_fields() {
		let evt = NewEvent {
			source: "src",
			r#ref: "host/alert:tgt",
			message: "msg",
			description: None,
			severity: None,
			occurred_at: None,
			active: None,
		};
		let json = serde_json::to_string(&evt).unwrap();

		for present in [
			"\"source\":\"src\"",
			"\"ref\":\"host/alert:tgt\"",
			"\"message\":\"msg\"",
		] {
			assert!(json.contains(present), "missing {present} in: {json}");
		}
		for absent in ["description", "severity", "occurredAt", "active"] {
			assert!(!json.contains(absent), "unexpected {absent} in: {json}");
		}
	}

	#[test]
	fn new_event_serialises_occurred_at_as_camel_case() {
		let evt = NewEvent {
			source: "src",
			r#ref: "ref",
			message: "msg",
			description: Some("desc"),
			severity: Some(Severity::Warning),
			occurred_at: Some("2025-01-01T00:00:00Z".parse().unwrap()),
			active: Some(true),
		};
		let json = serde_json::to_string(&evt).unwrap();

		for present in [
			"\"occurredAt\":",
			"\"description\":\"desc\"",
			"\"severity\":\"warning\"",
			"\"active\":true",
		] {
			assert!(json.contains(present), "missing {present} in: {json}");
		}
	}
}
986}