treeship_core/keys/mod.rs
1use std::{
2 collections::HashMap,
3 fs,
4 io::{self, Read, Write},
5 path::{Path, PathBuf},
6 sync::{Arc, RwLock},
7};
8
9use aes_gcm::{
10 aead::{Aead, KeyInit, OsRng as AeadOsRng, Payload},
11 AeadCore, Aes256Gcm, Key as AesKey, Nonce,
12};
13use rand::RngCore;
14use serde::{Deserialize, Serialize};
15use sha2::{Digest as Sha2Digest, Sha256};
16use zeroize::Zeroizing;
17
18use crate::attestation::{Ed25519Signer, Signer};
19
20// --- Public types ---
21
22pub type KeyId = String;
23
24/// Public information about a stored key. Never contains private material.
25#[derive(Debug, Clone, Serialize, Deserialize)]
26pub struct KeyInfo {
27 pub id: KeyId,
28 pub algorithm: String, // "ed25519"
29 pub is_default: bool,
30 pub created_at: String, // RFC 3339
31 /// First 8 bytes of sha256(public_key), hex-encoded.
32 pub fingerprint: String,
33 pub public_key: Vec<u8>, // raw 32-byte Ed25519 public key
34 /// RFC 3339 timestamp after which signatures by this key should be
35 /// considered stale. `None` means the key has not been rotated and is
36 /// indefinitely valid. Set automatically by `Store::rotate` to
37 /// `now + grace_period` on the predecessor key.
38 #[serde(default, skip_serializing_if = "Option::is_none")]
39 pub valid_until: Option<String>,
40 /// If this key was rotated to a successor, the successor's key id.
41 /// Lets verifiers walk a rotation chain forward when validating an old
42 /// receipt against the current keystore. `None` means this is the head
43 /// of its chain.
44 #[serde(default, skip_serializing_if = "Option::is_none")]
45 pub successor_key_id: Option<KeyId>,
46}
47
48/// Outcome of a `Store::rotate` call.
49#[derive(Debug, Clone)]
50pub struct RotationResult {
51 /// The key that was rotated. Its `valid_until` is now set.
52 pub predecessor: KeyInfo,
53 /// The freshly minted successor key.
54 pub successor: KeyInfo,
55 /// RFC 3339 timestamp until which the predecessor remains valid for
56 /// signature verification under the grace period. Equal to
57 /// `predecessor.valid_until.unwrap()`.
58 pub grace_period_until: String,
59}
60
61/// Errors from keystore operations.
62#[derive(Debug)]
63pub enum KeyError {
64 Io(io::Error),
65 Json(serde_json::Error),
66 Crypto(String),
67 NotFound(KeyId),
68 EmptyKeyId,
69 NoDefaultKey,
70 /// Private key file has insecure permissions (group- or world-readable).
71 /// Carries the path and the observed octal mode so the caller can show
72 /// an actionable error. Set `TREESHIP_ALLOW_INSECURE_KEY_PERMS=1` to
73 /// bypass during testing or controlled environments.
74 InsecureKeyPerms { path: PathBuf, mode: u32 },
75}
76
77impl std::fmt::Display for KeyError {
78 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
79 match self {
80 Self::Io(e) => write!(f, "keys io: {}", e),
81 Self::Json(e) => write!(f, "keys json: {}", e),
82 Self::Crypto(e) => write!(f, "keys crypto: {}", e),
83 Self::NotFound(k) => write!(f, "key not found: {}", k),
84 Self::EmptyKeyId => write!(f, "key id must not be empty"),
85 Self::NoDefaultKey => write!(f, "no default key — run treeship init"),
86 Self::InsecureKeyPerms { path, mode } => write!(
87 f,
88 "private key {} has insecure permissions (mode {:o}); \
89 run `treeship doctor --fix` or chmod 600 the file. \
90 Set TREESHIP_ALLOW_INSECURE_KEY_PERMS=1 to bypass.",
91 path.display(),
92 mode & 0o777,
93 ),
94 }
95 }
96}
97
98impl std::error::Error for KeyError {}
99impl From<io::Error> for KeyError { fn from(e: io::Error) -> Self { Self::Io(e) } }
100impl From<serde_json::Error> for KeyError { fn from(e: serde_json::Error) -> Self { Self::Json(e) } }
101
102// --- On-disk formats ---
103
104/// The encrypted representation of one keypair on disk.
105#[derive(Serialize, Deserialize, Clone)]
106struct EncryptedEntry {
107 id: KeyId,
108 algorithm: String,
109 created_at: String,
110 public_key: Vec<u8>,
111 /// AES-256-GCM ciphertext of the 32-byte Ed25519 secret scalar.
112 enc_priv_key: Vec<u8>,
113 /// 12-byte GCM nonce used when encrypting.
114 nonce: Vec<u8>,
115 /// RFC 3339 timestamp after which signatures by this key should be
116 /// considered stale. `None` means the key is indefinitely valid.
117 /// Defaulted on deserialization so pre-0.9.5 entry files still load.
118 #[serde(default, skip_serializing_if = "Option::is_none")]
119 valid_until: Option<String>,
120 /// Successor key id if this key was rotated. Defaulted on
121 /// deserialization for pre-0.9.5 entry files.
122 #[serde(default, skip_serializing_if = "Option::is_none")]
123 successor_key_id: Option<KeyId>,
124}
125
126/// The manifest file: which keys exist and which is the default.
127#[derive(Serialize, Deserialize, Default)]
128struct Manifest {
129 default_key_id: Option<KeyId>,
130 key_ids: Vec<KeyId>,
131}
132
133// --- Store ---
134
135/// Local encrypted keystore.
136///
137/// Private keys are encrypted with AES-256-GCM (RustCrypto `aes-gcm`
138/// 0.10) before writing to disk. The encryption key is derived from a
139/// machine-specific secret so key files are useless if copied to
140/// another machine.
141///
142/// Pre-v0.10.3 keystores used a homemade SHA-256-CTR + HMAC-SHA-256
143/// construction (TS-2026-001) and are transparently migrated to the
144/// new AEAD format on first decrypt; see `encrypt_for_disk_v2` /
145/// `decrypt_from_disk` for the format dispatcher.
146///
147/// A future version will delegate to OS credential stores (Secure
148/// Enclave / TPM 2.0).
149pub struct Store {
150 dir: PathBuf,
151 machine_key: [u8; 32],
152 /// In-memory cache — avoids disk reads on hot paths.
153 cache: Arc<RwLock<HashMap<KeyId, EncryptedEntry>>>,
154}
155
156impl Store {
157 /// Opens or creates a keystore at `dir`.
158 pub fn open(dir: impl AsRef<Path>) -> Result<Self, KeyError> {
159 let dir = dir.as_ref().to_path_buf();
160 fs::create_dir_all(&dir)?;
161
162 let machine_key = derive_machine_key(&dir)?;
163
164 Ok(Self {
165 dir,
166 machine_key,
167 cache: Arc::new(RwLock::new(HashMap::new())),
168 })
169 }
170
171 /// Generates a new Ed25519 keypair, encrypts and stores it.
172 /// If `set_default` is true (or there is no current default), makes
173 /// this key the default signing key.
174 pub fn generate(&self, set_default: bool) -> Result<KeyInfo, KeyError> {
175 let key_id = new_key_id();
176
177 let signer = Ed25519Signer::generate(&key_id)
178 .map_err(|e| KeyError::Crypto(e.to_string()))?;
179
180 let secret = signer.secret_bytes();
181 let pub_key = signer.public_key_bytes();
182
183 let enc = encrypt_for_disk_v2(&self.machine_key, key_id.as_str(), &pub_key, &secret)
184 .map_err(KeyError::Crypto)?;
185
186 let entry = EncryptedEntry {
187 id: key_id.clone(),
188 algorithm: "ed25519".into(),
189 created_at: crate::statements::unix_to_rfc3339(unix_now()),
190 public_key: pub_key.clone(),
191 enc_priv_key: enc,
192 // v2 ciphertexts carry their nonce inline (bytes [2..14]).
193 // The separate `nonce` field is retained for v1 legacy
194 // compatibility; for fresh v2 entries we serialize an empty
195 // vec so the JSON stays well-formed.
196 nonce: Vec::new(),
197 valid_until: None,
198 successor_key_id: None,
199 };
200
201 self.write_entry(&entry)?;
202
203 // Update manifest.
204 let mut manifest = self.read_manifest()?;
205 manifest.key_ids.push(key_id.clone());
206 if set_default || manifest.default_key_id.is_none() {
207 manifest.default_key_id = Some(key_id.clone());
208 }
209 self.write_manifest(&manifest)?;
210
211 // Populate cache.
212 self.cache.write().unwrap().insert(key_id.clone(), entry);
213
214 Ok(KeyInfo {
215 id: key_id.clone(),
216 algorithm: "ed25519".into(),
217 is_default: manifest.default_key_id.as_deref() == Some(key_id.as_str()),
218 created_at: crate::statements::unix_to_rfc3339(unix_now()),
219 fingerprint: fingerprint(&pub_key),
220 public_key: pub_key,
221 valid_until: None,
222 successor_key_id: None,
223 })
224 }
225
226 /// Rotate the current default key (or a specific key) to a freshly
227 /// generated successor.
228 ///
229 /// Mints a new Ed25519 keypair, links the predecessor to it via
230 /// `successor_key_id`, and stamps the predecessor with a `valid_until`
231 /// of `now + grace_period`. The grace window lets verifiers continue to
232 /// accept signatures from the predecessor while clients catch up to
233 /// the new public key.
234 ///
235 /// If `set_default` is true (the typical case -- you rotate because you
236 /// want to start signing with the new key immediately), the successor
237 /// becomes the default. Pass `false` to stage a rotation for review
238 /// without flipping the active signer.
239 ///
240 /// `predecessor_id` may be `None` to rotate the current default. Pass
241 /// an explicit id to rotate a non-default key (e.g. a per-environment
242 /// secondary).
243 ///
244 /// Note on threat model: this is a graceful rotation primitive, not a
245 /// revocation primitive. If the predecessor key is suspected compromised
246 /// the grace_period should be `Duration::ZERO` (or use a future
247 /// `revoke()` call once that lands) so the predecessor's `valid_until`
248 /// is in the past and any verifier honoring the metadata refuses
249 /// further signatures from it.
250 pub fn rotate(
251 &self,
252 predecessor_id: Option<&str>,
253 grace_period: std::time::Duration,
254 set_default: bool,
255 ) -> Result<RotationResult, KeyError> {
256 // Resolve predecessor: explicit id, else the current default.
257 let pred_id = match predecessor_id {
258 Some(id) => id.to_string(),
259 None => self.default_key_id()?,
260 };
261
262 // Refuse to rotate a key that has already been rotated -- the
263 // chain head is the only valid rotation source. This makes the
264 // operation idempotent in the face of accidental re-runs.
265 let pred_entry_existing = self.load_entry(&pred_id)?;
266 if let Some(existing) = &pred_entry_existing.successor_key_id {
267 return Err(KeyError::Crypto(format!(
268 "key {pred_id} has already been rotated to {existing}; \
269 rotate the chain head instead"
270 )));
271 }
272
273 // Mint the successor. We deliberately do NOT call `self.generate()`
274 // because that path also updates the manifest's default. We need a
275 // single transactional update that sets both predecessor metadata
276 // AND (optionally) the new default in one manifest write.
277 let succ_id = new_key_id();
278 let signer = Ed25519Signer::generate(&succ_id)
279 .map_err(|e| KeyError::Crypto(e.to_string()))?;
280 let succ_secret = signer.secret_bytes();
281 let succ_pub_key = signer.public_key_bytes();
282 let succ_enc =
283 encrypt_for_disk_v2(&self.machine_key, succ_id.as_str(), &succ_pub_key, &succ_secret)
284 .map_err(KeyError::Crypto)?;
285
286 let succ_created = crate::statements::unix_to_rfc3339(unix_now());
287 let succ_entry = EncryptedEntry {
288 id: succ_id.clone(),
289 algorithm: "ed25519".into(),
290 created_at: succ_created.clone(),
291 public_key: succ_pub_key.clone(),
292 enc_priv_key: succ_enc,
293 // v2 ciphertexts carry their nonce inline; the legacy
294 // `nonce` field is left empty for fresh writes.
295 nonce: Vec::new(),
296 valid_until: None,
297 successor_key_id: None,
298 };
299
300 // Stamp the predecessor with the grace deadline and link forward.
301 let valid_until = crate::statements::unix_to_rfc3339(
302 unix_now() + grace_period.as_secs(),
303 );
304 let mut pred_entry = pred_entry_existing;
305 pred_entry.valid_until = Some(valid_until.clone());
306 pred_entry.successor_key_id = Some(succ_id.clone());
307
308 // Write order matters for partial-failure recovery. Persist the
309 // successor entry FIRST, then stamp the predecessor pointing at
310 // it. If we wrote the predecessor first and then the successor
311 // write failed, the predecessor's successor_key_id would dangle
312 // at a key that doesn't exist on disk -- and the
313 // already-been-rotated guard would refuse to retry. With this
314 // order:
315 // - successor write fails: nothing observable changed; retry clean.
316 // - predecessor write fails: orphan successor key file on disk
317 // (not yet referenced by manifest or by any other key); retry
318 // generates a new successor and the orphan is harmless.
319 // - manifest write fails: predecessor + successor both on disk,
320 // manifest stale; retry's already-rotated guard catches the
321 // half-finished state and surfaces a clear error.
322 self.write_entry(&succ_entry)?;
323 self.write_entry(&pred_entry)?;
324
325 // Refresh the cache to mirror the on-disk state we just wrote --
326 // BEFORE the manifest update. If the manifest write fails, the
327 // cache must still match disk so a same-process retry sees the
328 // half-rotated state and the already-rotated guard fires
329 // correctly. Doing this AFTER write_manifest would leave a
330 // window where disk reflects the rotation but the in-memory
331 // cache still serves the unstamped predecessor, and a retry
332 // from the same Store instance would generate a duplicate
333 // successor -- defeating the whole point of the guard.
334 {
335 let mut cache = self.cache.write().unwrap();
336 cache.insert(pred_entry.id.clone(), pred_entry.clone());
337 cache.insert(succ_id.clone(), succ_entry.clone());
338 }
339
340 // Update the manifest: register the new key, optionally promote it.
341 let mut manifest = self.read_manifest()?;
342 manifest.key_ids.push(succ_id.clone());
343 if set_default {
344 manifest.default_key_id = Some(succ_id.clone());
345 }
346 self.write_manifest(&manifest)?;
347
348 let default_id = manifest.default_key_id.clone();
349 let predecessor = KeyInfo {
350 id: pred_entry.id.clone(),
351 algorithm: pred_entry.algorithm.clone(),
352 is_default: default_id.as_deref() == Some(pred_entry.id.as_str()),
353 created_at: pred_entry.created_at.clone(),
354 fingerprint: fingerprint(&pred_entry.public_key),
355 public_key: pred_entry.public_key.clone(),
356 valid_until: pred_entry.valid_until.clone(),
357 successor_key_id: pred_entry.successor_key_id.clone(),
358 };
359 let successor = KeyInfo {
360 id: succ_id.clone(),
361 algorithm: "ed25519".into(),
362 is_default: default_id.as_deref() == Some(succ_id.as_str()),
363 created_at: succ_created,
364 fingerprint: fingerprint(&succ_pub_key),
365 public_key: succ_pub_key,
366 valid_until: None,
367 successor_key_id: None,
368 };
369
370 Ok(RotationResult {
371 predecessor,
372 successor,
373 grace_period_until: valid_until,
374 })
375 }
376
377 /// Walk the rotation chain forward from `id`, returning the ordered
378 /// list of key ids: `[id, successor_of_id, ...]`. The first element is
379 /// always `id` itself. Stops at a key with no `successor_key_id`.
380 pub fn successor_chain(&self, id: &str) -> Result<Vec<KeyId>, KeyError> {
381 let mut chain = Vec::new();
382 let mut cursor = id.to_string();
383 // Cap iterations at the manifest size to defend against a corrupt
384 // chain that loops back on itself. A well-formed chain is bounded
385 // by the number of keys in the keystore.
386 let max_steps = self.read_manifest()?.key_ids.len() + 1;
387 for _ in 0..max_steps {
388 chain.push(cursor.clone());
389 let entry = self.load_entry(&cursor)?;
390 match entry.successor_key_id {
391 Some(next) => cursor = next,
392 None => return Ok(chain),
393 }
394 }
395 Err(KeyError::Crypto(format!(
396 "rotation chain starting at {id} exceeds keystore size; suspected loop"
397 )))
398 }
399
400 /// Returns the `KeyInfo` for every key whose `valid_until` is either
401 /// unset or strictly after `at_unix_secs`. The result includes both
402 /// rotated-but-still-in-grace predecessors and never-rotated keys.
403 /// Useful for building a verifier's accept-set as of a given time.
404 pub fn valid_keys_at(&self, at_unix_secs: u64) -> Result<Vec<KeyInfo>, KeyError> {
405 let cutoff_rfc = crate::statements::unix_to_rfc3339(at_unix_secs);
406 Ok(self.list()?
407 .into_iter()
408 .filter(|k| match &k.valid_until {
409 None => true,
410 Some(until) => until.as_str() > cutoff_rfc.as_str(),
411 })
412 .collect())
413 }
414
415 /// Returns a boxed `Signer` for the current default key.
416 pub fn default_signer(&self) -> Result<Box<dyn Signer>, KeyError> {
417 let manifest = self.read_manifest()?;
418 let id = manifest.default_key_id.ok_or(KeyError::NoDefaultKey)?;
419 self.signer(&id)
420 }
421
422 /// Returns a boxed `Signer` for a specific key ID.
423 ///
424 /// Refuses to load if the on-disk key file has insecure permissions
425 /// (any group or world bits). This is the choke point for *all*
426 /// signing — public-key reads and successor lookups go through
427 /// `read_entry` / `public_key` and are not affected.
428 ///
429 /// Bypass with `TREESHIP_ALLOW_INSECURE_KEY_PERMS=1` for controlled
430 /// environments (CI sandboxes, recovery flows). The bypass should
431 /// not be set in normal operation.
432 pub fn signer(&self, id: &str) -> Result<Box<dyn Signer>, KeyError> {
433 check_key_file_perms(&self.entry_path(id))?;
434
435 let entry = self.load_entry(id)?;
436
437 // Dispatcher: v2 ciphertexts start with magic 0x54, version 0x02
438 // and use real AES-256-GCM. Older entries fall through to the
439 // legacy SHA-256-CTR+HMAC path (`decrypt_legacy_v1`) and are
440 // transparently re-encrypted in the new format below.
441 let was_legacy = is_legacy_v1(&entry.enc_priv_key);
442 let secret = decrypt_from_disk(
443 &self.machine_key,
444 &entry.id,
445 &entry.public_key,
446 &entry.enc_priv_key,
447 &entry.nonce,
448 )
449 .map_err(|e| self.enrich_crypto_error(e))?;
450
451 // L3: wrap the on-stack copy of the decrypted secret in a
452 // `Zeroizing` so the byte buffer is wiped on drop. `secret`
453 // itself is already a `Zeroizing<Vec<u8>>` returned by
454 // `decrypt_from_disk`, but `try_into::<[u8; 32]>` produces an
455 // independent stack-allocated array that the Vec's Drop will
456 // not cover. Without this wrapper, returning from `signer()`
457 // would leave the secret scalar in stale stack memory until
458 // a future stack frame happens to overwrite it.
459 let secret_arr: Zeroizing<[u8; 32]> = Zeroizing::new(
460 secret.as_slice().try_into()
461 .map_err(|_| KeyError::Crypto("decrypted key is wrong length".into()))?
462 );
463
464 // Transparent migration: if this entry was still in the legacy
465 // v1 format (the broken SHA-256-CTR construction from
466 // TS-2026-001), re-encrypt it with v2 AES-256-GCM and rewrite
467 // the file. We do this best-effort -- a migration failure here
468 // must NOT block signing for the current call, since the
469 // in-memory secret is already valid. The next decrypt on a
470 // fresh process will retry.
471 if was_legacy {
472 if let Err(e) = self.migrate_entry_to_v2(&entry, &secret_arr) {
473 // Surface the failure as a tracing-style stderr note
474 // rather than an error -- the user's signing flow is
475 // unaffected, and we'd rather them know about it than
476 // wedge the call.
477 eprintln!(
478 "treeship: keystore entry {} could not be migrated \
479 from legacy v1 format to v2 ({}); will retry next \
480 load",
481 entry.id, e
482 );
483 }
484 }
485
486 let signer = Ed25519Signer::from_bytes(&entry.id, &secret_arr)
487 .map_err(|e| KeyError::Crypto(e.to_string()))?;
488
489 Ok(Box::new(signer))
490 }
491
492 /// Re-encrypt a legacy v1 entry with the new v2 AEAD and persist
493 /// it. Updates the in-memory cache so subsequent loads in the same
494 /// process see the migrated entry. Idempotent; safe to invoke
495 /// concurrently because the migration is serialized by a per-entry
496 /// advisory lock on `<entry>.migrate.lock` (TS-2026-001 H3).
497 ///
498 /// We lock a *sentinel* file rather than the entry file itself,
499 /// because the entry file is renamed-into-place during the atomic
500 /// write inside `write_entry`. Holding a flock on the entry's inode
501 /// while a sibling process renames a new inode into its path is
502 /// nonsensical (the lock would survive on the now-orphaned inode);
503 /// the sentinel sidecar has a stable identity for the whole
504 /// migration window.
505 ///
506 /// Same blocking-flock pattern as `packages/core/src/session/event_log.rs`
507 /// (Lane F): exclusive lock, then a same-thread re-read to settle
508 /// "did a peer already migrate while I was waiting?" cleanly.
509 fn migrate_entry_to_v2(
510 &self,
511 old_entry: &EncryptedEntry,
512 secret: &[u8; 32],
513 ) -> Result<(), KeyError> {
514 let entry_path = self.entry_path(&old_entry.id);
515 let lock_path = entry_path.with_extension("migrate.lock");
516
517 // Open (or create) the sentinel lock file with restrictive perms
518 // and take an exclusive flock. We intentionally use the blocking
519 // `lock_exclusive` -- not `try_lock_exclusive` -- because the
520 // migration window is short (a single AEAD encrypt + atomic
521 // rename) and the worst case under contention is one writer
522 // serialized behind another. Pulling the
523 // try-with-bounded-retry pattern in here would buy us nothing:
524 // the second writer's re-read after the lock releases would
525 // observe the now-v2 entry and short-circuit.
526 let lock_file = open_migration_lock_file(&lock_path)
527 .map_err(KeyError::Io)?;
528
529 #[cfg(not(target_family = "wasm"))]
530 {
531 use fs2::FileExt;
532 lock_file.lock_exclusive().map_err(KeyError::Io)?;
533 }
534
535 // Under the lock: did a peer already complete the migration
536 // while we were waiting? If so, our work is done -- we must
537 // NOT rewrite, because we'd overwrite a peer's freshly-rotated
538 // v2 ciphertext with our own (semantically equivalent, but
539 // unnecessary I/O and an unnecessary cache update).
540 if let Ok(current) = self.read_entry(&old_entry.id) {
541 if !is_legacy_v1(¤t.enc_priv_key) {
542 // Peer already migrated. Refresh the cache so subsequent
543 // loads in this process see the v2 entry rather than
544 // the stale legacy copy our caller passed in.
545 if let Ok(mut cache) = self.cache.write() {
546 cache.insert(current.id.clone(), current);
547 }
548 // Lock drops at function exit; sentinel file remains on
549 // disk as a harmless inode (no migration data, idempotent
550 // for future invocations).
551 return Ok(());
552 }
553 }
554
555 let new_ciphertext = encrypt_for_disk_v2(
556 &self.machine_key,
557 &old_entry.id,
558 &old_entry.public_key,
559 secret,
560 )
561 .map_err(KeyError::Crypto)?;
562
563 let migrated = EncryptedEntry {
564 id: old_entry.id.clone(),
565 algorithm: old_entry.algorithm.clone(),
566 created_at: old_entry.created_at.clone(),
567 public_key: old_entry.public_key.clone(),
568 enc_priv_key: new_ciphertext,
569 // v2 carries the nonce inline; clear the legacy field.
570 nonce: Vec::new(),
571 valid_until: old_entry.valid_until.clone(),
572 successor_key_id: old_entry.successor_key_id.clone(),
573 };
574
575 self.write_entry(&migrated)?;
576 if let Ok(mut cache) = self.cache.write() {
577 cache.insert(migrated.id.clone(), migrated);
578 }
579
580 // Best-effort cleanup of the sentinel lock file. We hold the
581 // lock until function exit (drop), so by the time we reach
582 // here it is safe to unlink the inode -- future migrations
583 // for this entry will succeed via the early-return path
584 // because the entry is now v2. Leaving the sentinel behind is
585 // also harmless; on Unix removing a flocked file is allowed
586 // and the lock is released on fd drop regardless.
587 let _ = std::fs::remove_file(&lock_path);
588
589 // Keep the lock_file binding alive to function exit so the
590 // flock is held across write_entry + remove_file. Explicit
591 // drop makes the intent obvious to readers.
592 drop(lock_file);
593 Ok(())
594 }
595
596 /// Wrap a bare crypto error (typically "MAC verification failed ..." from
597 /// the AES-GCM decrypt path) with a diagnostic and an actionable recovery
598 /// path.
599 ///
600 /// The common failure mode in the wild is a pre-0.9.x keystore whose
601 /// machine-key derivation was seed-file-based. Later versions derive
602 /// the machine key from hostname+username (macOS) or /etc/machine-id
603 /// (Linux), so old ciphertexts can't be MAC-verified with the new key.
604 /// Detecting that case is best-effort: the presence of a legacy seed
605 /// file (`.machineseed` or `machine_seed` inside the keys dir) is a
606 /// strong hint. If we see one, call it out explicitly.
607 fn enrich_crypto_error(&self, raw: String) -> KeyError {
608 // Only enrich on MAC failures -- other errors (I/O, wrong length) are
609 // surfaced as-is because their remediation differs.
610 if !raw.contains("MAC verification failed") {
611 return KeyError::Crypto(raw);
612 }
613
614 let legacy_seed_dot = self.dir.join(".machineseed");
615 let legacy_seed = self.dir.join("machine_seed");
616 let has_legacy_seed = legacy_seed_dot.exists() || legacy_seed.exists();
617
618 let diagnosis = if has_legacy_seed {
619 "your keystore was created by an older Treeship version whose \
620 machine-key derivation has since changed. The ciphertext is \
621 intact but cannot be decrypted under the current derivation."
622 } else {
623 "the keystore cannot be decrypted. Usual causes: the key file \
624 was copied from a different machine, the hostname or username \
625 changed, or the file was corrupted."
626 };
627
628 // Resolve the user's ~/.treeship path for the recovery command, so
629 // we give a copy-pasteable command rather than a generic instruction.
630 let ts_dir = std::env::var("HOME")
631 .map(|h| format!("{h}/.treeship"))
632 .unwrap_or_else(|_| "~/.treeship".into());
633
634 // The outer KeyError::Crypto Display impl already prepends
635 // "keys crypto: "; don't double it. Start with the raw MAC error
636 // so the user still sees the underlying cryptographic reason,
637 // then follow with the human-readable diagnosis and recovery.
638 let msg = format!(
639 "{raw}\n\n \
640 Diagnosis: {diagnosis}\n\n \
641 Recovery (nondestructive -- the old keystore is moved aside, \
642 not deleted; any sealed .treeship packages you produced remain \
643 verifiable since their receipts embed the old public key):\n\n \
644 mv {ts_dir} {ts_dir}.bak.$(date +%s)\n \
645 treeship init\n"
646 );
647
648 KeyError::Crypto(msg)
649 }
650
651 /// Returns the default key ID.
652 pub fn default_key_id(&self) -> Result<KeyId, KeyError> {
653 self.read_manifest()?
654 .default_key_id
655 .ok_or(KeyError::NoDefaultKey)
656 }
657
658 /// Lists all keys.
659 pub fn list(&self) -> Result<Vec<KeyInfo>, KeyError> {
660 let manifest = self.read_manifest()?;
661 let default = manifest.default_key_id.as_deref().unwrap_or("");
662
663 manifest.key_ids.iter().map(|id| {
664 let entry = self.load_entry(id)?;
665 Ok(KeyInfo {
666 id: entry.id.clone(),
667 algorithm: entry.algorithm.clone(),
668 is_default: entry.id == default,
669 created_at: entry.created_at.clone(),
670 fingerprint: fingerprint(&entry.public_key),
671 public_key: entry.public_key.clone(),
672 valid_until: entry.valid_until.clone(),
673 successor_key_id: entry.successor_key_id.clone(),
674 })
675 }).collect()
676 }
677
678 /// Sets the default signing key.
679 pub fn set_default(&self, id: &str) -> Result<(), KeyError> {
680 // Verify the key exists before updating the manifest.
681 self.load_entry(id)?;
682 let mut manifest = self.read_manifest()?;
683 manifest.default_key_id = Some(id.to_string());
684 self.write_manifest(&manifest)
685 }
686
687 /// Returns the public key bytes for a key ID.
688 pub fn public_key(&self, id: &str) -> Result<Vec<u8>, KeyError> {
689 Ok(self.load_entry(id)?.public_key)
690 }
691
692 // --- private ---
693
694 fn load_entry(&self, id: &str) -> Result<EncryptedEntry, KeyError> {
695 // Check cache first.
696 if let Ok(cache) = self.cache.read() {
697 if let Some(entry) = cache.get(id) {
698 return Ok(entry.clone());
699 }
700 }
701 self.read_entry(id)
702 }
703
704 fn entry_path(&self, id: &str) -> PathBuf {
705 self.dir.join(format!("{}.json", id))
706 }
707
708 fn write_entry(&self, entry: &EncryptedEntry) -> Result<(), KeyError> {
709 let path = self.entry_path(&entry.id);
710 let json = serde_json::to_vec_pretty(entry)?;
711 write_file_600(&path, &json)?;
712 Ok(())
713 }
714
715 fn read_entry(&self, id: &str) -> Result<EncryptedEntry, KeyError> {
716 let path = self.entry_path(id);
717 if !path.exists() {
718 return Err(KeyError::NotFound(id.to_string()));
719 }
720 let bytes = fs::read(&path)?;
721 let entry: EncryptedEntry = serde_json::from_slice(&bytes)?;
722 Ok(entry)
723 }
724
725 fn manifest_path(&self) -> PathBuf {
726 self.dir.join("manifest.json")
727 }
728
729 fn read_manifest(&self) -> Result<Manifest, KeyError> {
730 let path = self.manifest_path();
731 if !path.exists() {
732 return Ok(Manifest::default());
733 }
734 let bytes = fs::read(&path)?;
735 Ok(serde_json::from_slice(&bytes)?)
736 }
737
738 fn write_manifest(&self, m: &Manifest) -> Result<(), KeyError> {
739 let json = serde_json::to_vec_pretty(m)?;
740 write_file_600(&self.manifest_path(), &json)?;
741 Ok(())
742 }
743}
744
745// --- Crypto helpers ---
746//
747// AEAD choice: AES-256-GCM via the RustCrypto `aes-gcm` 0.10 crate.
748// Reasons:
749// - Matches the original (documented but never implemented) intent of
750// the keystore, so audit reports and SECURITY.md don't need to be
751// re-anchored on a different primitive.
752// - Well-audited, widely deployed, no platform gotchas.
753// - `chacha20poly1305` would have been a defensible alternative
754// (slightly better software performance), but the migration cost of
755// changing the documented primitive while we already have to ship a
756// migration for the broken construction is not worth it.
757//
758// On-disk v2 format (`encrypt_for_disk_v2`):
759// [ magic = 0x54 ('T') ] 1 byte
760// [ version = 0x02 ] 1 byte
761// [ nonce ] 12 bytes (random per encryption)
762// [ ciphertext || tag ] N + 16 bytes (tag appended by aead crate)
763//
764// The first byte (0x54) is a structural sentinel so we can dispatch on
765// the format without relying on length heuristics. v1 ciphertexts start
766// with the first byte of their random nonce, so the chance of an
767// accidental v1 entry that looks like v2 is ~1/2^16 (matching both magic
768// AND version byte) and we still re-validate by AEAD-decrypting; if the
769// AEAD fails on something that looks like v2, we fall back to v1.
770
/// First byte of every v2 keystore blob: ASCII `'T'`.
const KEYSTORE_MAGIC: u8 = 0x54;
/// Format version byte that follows the magic in a v2 blob.
const KEYSTORE_VERSION_V2: u8 = 0x02;

/// Builds the Associated Authenticated Data for the v2 keystore AEAD.
///
/// Layout of the returned bytes:
///
/// ```text
/// [ KEYSTORE_MAGIC, KEYSTORE_VERSION_V2 ]    2 bytes
/// [ entry_id length, big-endian u32 ]        4 bytes
/// [ entry_id bytes ]
/// [ public_key length, big-endian u32 ]      4 bytes
/// [ public_key bytes ]
/// ```
///
/// The AAD binds two things into the GCM tag beyond ciphertext and nonce:
///
/// 1. **Framing prefix** — flipping the magic or version byte on disk then
///    surfaces as a MAC failure instead of dispatcher confusion (the M2
///    audit finding).
/// 2. **Entry identity** (`entry_id` and `public_key`) — an attacker with
///    write access to the keys directory cannot copy entry A's
///    `enc_priv_key` into entry B's JSON envelope and have it decrypt
///    cleanly, which would make the signer advertised under one key id
///    sign with another key's secret. This closes the "intra-keystore
///    swap" class flagged in the post-merge audit of TS-2026-001.
///
/// Every variable-length field is preceded by its length. Without the
/// prefixes, bytes could be shifted between adjacent fields to obtain a
/// different `(entry_id, public_key)` pair with identical AAD bytes.
///
/// The AAD must be byte-identical on encrypt and decrypt. Later format
/// versions are expected to get their own builder.
///
/// # Panics
/// Panics if either field is longer than `u32::MAX` bytes. A silently
/// truncated length prefix would break the unambiguous encoding described
/// above, so this is treated as a broken invariant.
fn build_aad_v2(entry_id: &str, public_key: &[u8]) -> Vec<u8> {
    let id_bytes = entry_id.as_bytes();

    let mut aad = Vec::with_capacity(2 + 4 + id_bytes.len() + 4 + public_key.len());
    aad.extend_from_slice(&[KEYSTORE_MAGIC, KEYSTORE_VERSION_V2]);

    for field in [id_bytes, public_key] {
        // Checked before the cast so the prefix always states the true length.
        assert!(
            field.len() as u64 <= u64::from(u32::MAX),
            "keystore AAD field exceeds u32::MAX bytes"
        );
        aad.extend_from_slice(&(field.len() as u32).to_be_bytes());
        aad.extend_from_slice(field);
    }

    aad
}
812
813/// AES-256-GCM (the real one) encrypt for at-rest keystore storage.
814/// Returns the framed v2 blob ready to drop into `EncryptedEntry::enc_priv_key`.
815///
816/// Output: `[magic, version, nonce(12), ciphertext || tag(16)]`.
817///
818/// The AEAD's Associated Authenticated Data binds:
819/// - the framing prefix (M2 — flipping magic/version surfaces as MAC failure)
820/// - the entry id and public key (post-merge audit fix-up — closes the
821/// intra-keystore swap class where a local attacker copies entry A's
822/// `enc_priv_key` into entry B's JSON envelope).
823///
824/// See `build_aad_v2` for the exact layout. `entry_id` and `public_key`
825/// must match what gets serialized into the `EncryptedEntry` JSON;
826/// `decrypt_for_disk_v2` reads them back from the deserialized entry
827/// to recompute the AAD.
828fn encrypt_for_disk_v2(
829 key: &[u8; 32],
830 entry_id: &str,
831 public_key: &[u8],
832 plaintext: &[u8],
833) -> Result<Vec<u8>, String> {
834 // Wrap the in-memory AEAD key in Zeroizing so the local stack copy
835 // is wiped on drop. The aes-gcm cipher object owns its own internal
836 // expanded key schedule; that's outside our control, but the raw
837 // 32-byte buffer at this scope is ours to clear.
838 let key_buf: Zeroizing<[u8; 32]> = Zeroizing::new(*key);
839 let aead_key: &AesKey<Aes256Gcm> = AesKey::<Aes256Gcm>::from_slice(key_buf.as_slice());
840 let cipher = Aes256Gcm::new(aead_key);
841
842 // 96-bit random nonce from the OS CSPRNG.
843 let nonce = Aes256Gcm::generate_nonce(&mut AeadOsRng);
844
845 let aad = build_aad_v2(entry_id, public_key);
846 let ciphertext = cipher
847 .encrypt(
848 &nonce,
849 Payload {
850 msg: plaintext,
851 aad: aad.as_slice(),
852 },
853 )
854 .map_err(|e| format!("aead encrypt failed: {e}"))?;
855
856 let mut out = Vec::with_capacity(2 + 12 + ciphertext.len());
857 out.push(KEYSTORE_MAGIC);
858 out.push(KEYSTORE_VERSION_V2);
859 out.extend_from_slice(nonce.as_slice());
860 out.extend_from_slice(&ciphertext);
861 Ok(out)
862}
863
864/// AES-256-GCM decrypt of a v2 framed blob. Uses the same AAD binding
865/// as `encrypt_for_disk_v2`:
866/// - framing prefix (so a tampered magic/version surfaces as MAC failure)
867/// - entry id + public key (so swapping `enc_priv_key` between entries
868/// in the same keystore surfaces as MAC failure).
869///
870/// `entry_id` and `public_key` come from the `EncryptedEntry` JSON
871/// envelope that holds `blob`. The caller is responsible for passing the
872/// *envelope's* id and pubkey, not values from some other source — that
873/// is precisely what binds the ciphertext to its envelope.
874fn decrypt_v2(
875 key: &[u8; 32],
876 entry_id: &str,
877 public_key: &[u8],
878 blob: &[u8],
879) -> Result<Vec<u8>, String> {
880 // Minimum: magic(1) + version(1) + nonce(12) + tag(16) = 30 bytes.
881 if blob.len() < 30 {
882 return Err("v2 ciphertext too short".into());
883 }
884 if blob[0] != KEYSTORE_MAGIC || blob[1] != KEYSTORE_VERSION_V2 {
885 return Err("v2 ciphertext has wrong magic/version".into());
886 }
887 let nonce_bytes = &blob[2..14];
888 let ct = &blob[14..];
889
890 let key_buf: Zeroizing<[u8; 32]> = Zeroizing::new(*key);
891 let aead_key: &AesKey<Aes256Gcm> = AesKey::<Aes256Gcm>::from_slice(key_buf.as_slice());
892 let cipher = Aes256Gcm::new(aead_key);
893 let nonce = Nonce::from_slice(nonce_bytes);
894
895 let aad = build_aad_v2(entry_id, public_key);
896 cipher
897 .decrypt(
898 nonce,
899 Payload {
900 msg: ct,
901 aad: aad.as_slice(),
902 },
903 )
904 .map_err(|_| "MAC verification failed — key file may be corrupt or wrong machine".into())
905}
906
907/// Returns true iff `blob` is shaped like a v1 (legacy) ciphertext.
908/// Used by the dispatcher to decide whether a successful decrypt should
909/// trigger a transparent re-encrypt to v2.
910fn is_legacy_v1(blob: &[u8]) -> bool {
911 // A v2 blob always starts with [magic, version]. Anything else
912 // (including the empty enc_priv_key case during partial writes) is
913 // treated as legacy and routed through the v1 path, which will fail
914 // cleanly on garbage.
915 !(blob.len() >= 2 && blob[0] == KEYSTORE_MAGIC && blob[1] == KEYSTORE_VERSION_V2)
916}
917
918/// Top-level decrypt dispatcher used by the keystore. Tries v2 if the
919/// blob carries the magic+version prefix, otherwise falls through to the
920/// legacy v1 path. If a blob looks like v2 but AEAD verification fails,
921/// we also try v1 — this defends against the (negligible) probability
922/// that a legacy ciphertext's random first two bytes happen to collide
923/// with our magic+version.
924///
925/// M1 (TS-2026-001 audit): when the blob is v2-shaped and BOTH the v2
926/// AEAD and the v1 fallback fail, surface the v2 error rather than the
927/// v1 error. v1's failure on a v2-shaped blob is mechanical (wrong
928/// MAC computed under the wrong construction) and tells the user
929/// nothing useful; v2's failure is the actually-relevant signal
930/// (MAC verification under the documented AEAD). The previous code
931/// would mask the meaningful error with a confused legacy error
932/// message that pointed at the wrong remediation.
933fn decrypt_from_disk(
934 key: &[u8; 32],
935 entry_id: &str,
936 public_key: &[u8],
937 enc_data: &[u8],
938 legacy_nonce_field: &[u8],
939) -> Result<Zeroizing<Vec<u8>>, String> {
940 if !is_legacy_v1(enc_data) {
941 match decrypt_v2(key, entry_id, public_key, enc_data) {
942 Ok(pt) => return Ok(Zeroizing::new(pt)),
943 Err(v2_err) => {
944 // Collision fallback. v1 entries had random first bytes;
945 // there's a vanishing chance one looks like v2 framing.
946 // Try v1 first; if it succeeds we have a legitimate
947 // legacy entry whose framing happens to look v2-shaped.
948 // If v1 also fails, surface the v2 error (the
949 // semantically meaningful one) rather than v1's
950 // mechanical-junk failure.
951 return match decrypt_legacy_v1(key, enc_data, legacy_nonce_field) {
952 Ok(pt) => Ok(Zeroizing::new(pt)),
953 Err(_) => Err(v2_err),
954 };
955 }
956 }
957 }
958 decrypt_legacy_v1(key, enc_data, legacy_nonce_field).map(Zeroizing::new)
959}
960
961/// DEPRECATED: legacy at-rest decryption for keystores written before
962/// v0.10.3. This is the SHA-256-CTR + HMAC-SHA-256 construction that
963/// was mis-labelled as AES-256-GCM (TS-2026-001). The CTR keystream is
964/// also degenerate (the same `enc_key` byte is reused once per
965/// plaintext byte, since `block[i % 32]` indexes the same SHA-256 output
966/// modulo 32), so the construction is NOT a real stream cipher even
967/// ignoring the AEAD mislabelling.
968///
969/// Kept ONLY to migrate existing on-disk keystores forward to the v2
970/// AEAD format. Never call this for new writes. The encrypt counterpart
971/// has been removed from the v2 codepath — the only place v1
972/// ciphertexts come from is files written by older Treeship versions.
973pub fn aes_gcm_decrypt(
974 key: &[u8; 32],
975 enc_data: &[u8],
976 _nonce_unused: &[u8],
977) -> Result<Vec<u8>, String> {
978 // Preserved as a public symbol because the `treeship-vi` sibling
979 // crate calls it directly. vi only ever produces v1 ciphertexts
980 // (its `aes_gcm_encrypt` shim calls `legacy_v1_encrypt`) and has
981 // no concept of the `EncryptedEntry` envelope that carries the
982 // entry id + public key the v2 AAD now requires. Route this shim
983 // directly through the legacy v1 path so vi's call site keeps
984 // working byte-for-byte; vi's eventual migration release will
985 // adopt its own AEAD path with its own envelope binding.
986 decrypt_legacy_v1(key, enc_data, _nonce_unused)
987}
988
989/// DEPRECATED: legacy at-rest encryption. Same caveats as
990/// `aes_gcm_decrypt`. Kept ONLY as a public symbol for compatibility
991/// with the `treeship-vi` sibling crate; the core keystore no longer
992/// produces v1 ciphertexts.
993///
994/// New code MUST use `encrypt_for_disk_v2`. This function still
995/// produces v1-format output so the vi crate's on-disk format remains
996/// byte-stable until it migrates on its own cadence.
997pub fn aes_gcm_encrypt(key: &[u8; 32], plaintext: &[u8]) -> Result<(Vec<u8>, Vec<u8>), String> {
998 legacy_v1_encrypt(key, plaintext)
999}
1000
1001/// Legacy v1 encrypt. SHA-256-CTR + HMAC-SHA-256. DO NOT USE for new
1002/// writes — present only so vi-keystore callers keep working until
1003/// they migrate. See `aes_gcm_encrypt` doc-comment for the security
1004/// caveats.
1005fn legacy_v1_encrypt(key: &[u8; 32], plaintext: &[u8]) -> Result<(Vec<u8>, Vec<u8>), String> {
1006 use sha2::Sha256;
1007
1008 let mut nonce = [0u8; 12];
1009 rand::thread_rng().fill_bytes(&mut nonce);
1010
1011 let mut enc_key_input = key.to_vec();
1012 enc_key_input.extend_from_slice(&nonce);
1013 enc_key_input.extend_from_slice(b"enc");
1014 let enc_key = Sha256::digest(&enc_key_input);
1015
1016 let mut mac_key_input = key.to_vec();
1017 mac_key_input.extend_from_slice(&nonce);
1018 mac_key_input.extend_from_slice(b"mac");
1019 let mac_key = Sha256::digest(&mac_key_input);
1020
1021 let ciphertext: Vec<u8> = plaintext.iter().enumerate().map(|(i, &b)| {
1022 let mut block_input = enc_key.to_vec();
1023 block_input.extend_from_slice(&(i as u64).to_le_bytes());
1024 let block = Sha256::digest(&block_input);
1025 b ^ block[i % 32]
1026 }).collect();
1027
1028 let mut mac_input = mac_key.to_vec();
1029 mac_input.extend_from_slice(&nonce);
1030 mac_input.extend_from_slice(&ciphertext);
1031 let mac = Sha256::digest(&mac_input);
1032
1033 let mut out = Vec::with_capacity(12 + 32 + ciphertext.len());
1034 out.extend_from_slice(&nonce);
1035 out.extend_from_slice(&mac);
1036 out.extend_from_slice(&ciphertext);
1037
1038 Ok((out, nonce.to_vec()))
1039}
1040
1041/// Legacy v1 decrypt. SHA-256-CTR + HMAC-SHA-256. See the module-level
1042/// notes on TS-2026-001 for why this is broken; kept only to migrate
1043/// existing keystores forward.
1044fn decrypt_legacy_v1(
1045 key: &[u8; 32],
1046 enc_data: &[u8],
1047 _nonce_unused: &[u8],
1048) -> Result<Vec<u8>, String> {
1049 if enc_data.len() < 44 {
1050 return Err("ciphertext too short".into());
1051 }
1052 use sha2::Sha256;
1053
1054 let nonce = &enc_data[..12];
1055 let stored_mac = &enc_data[12..44];
1056 let ciphertext = &enc_data[44..];
1057
1058 let nonce_arr: [u8; 12] = nonce.try_into().unwrap();
1059
1060 let mut enc_key_input = key.to_vec();
1061 enc_key_input.extend_from_slice(&nonce_arr);
1062 enc_key_input.extend_from_slice(b"enc");
1063 let enc_key = Sha256::digest(&enc_key_input);
1064
1065 let mut mac_key_input = key.to_vec();
1066 mac_key_input.extend_from_slice(&nonce_arr);
1067 mac_key_input.extend_from_slice(b"mac");
1068 let mac_key = Sha256::digest(&mac_key_input);
1069
1070 let mut mac_input = mac_key.to_vec();
1071 mac_input.extend_from_slice(&nonce_arr);
1072 mac_input.extend_from_slice(ciphertext);
1073 let computed_mac = Sha256::digest(&mac_input);
1074
1075 let mac_ok = stored_mac.iter().zip(computed_mac.iter())
1076 .fold(0u8, |acc, (a, b)| acc | (a ^ b)) == 0;
1077
1078 if !mac_ok {
1079 return Err("MAC verification failed — key file may be corrupt or wrong machine".into());
1080 }
1081
1082 let plaintext: Vec<u8> = ciphertext.iter().enumerate().map(|(i, &b)| {
1083 let mut block_input = enc_key.to_vec();
1084 block_input.extend_from_slice(&(i as u64).to_le_bytes());
1085 let block = Sha256::digest(&block_input);
1086 b ^ block[i % 32]
1087 }).collect();
1088
1089 Ok(plaintext)
1090}
1091
1092// --- Machine key derivation ---
1093
1094pub fn derive_machine_key(store_dir: &Path) -> Result<[u8; 32], KeyError> {
1095 // 1. Linux: /etc/machine-id (stable across reboots)
1096 if let Ok(id) = fs::read_to_string("/etc/machine-id") {
1097 let trimmed = id.trim();
1098 if !trimmed.is_empty() {
1099 let mut h = Sha256::new();
1100 h.update(trimmed.as_bytes());
1101 h.update(store_dir.to_string_lossy().as_bytes());
1102 return Ok(h.finalize().into());
1103 }
1104 }
1105
1106 // 2. macOS: hostname + username derivation (v1, backward compatible).
1107 //
1108 // TODO(v0.7.0): Migrate to IOPlatformSerialNumber-based derivation.
1109 // The serial number is more stable (survives hostname and username
1110 // changes), but switching now would silently invalidate all existing
1111 // keys on macOS. A proper migration needs to:
1112 // 1. Try the new derivation first.
1113 // 2. On decryption failure, fall back to hostname+username.
1114 // 3. If legacy succeeds, re-encrypt with the new key and save.
1115 // Until that migration tooling is in place, keep hostname+username
1116 // as the primary derivation so existing users are not locked out.
1117 #[cfg(target_os = "macos")]
1118 {
1119 let hostname = std::process::Command::new("hostname")
1120 .output()
1121 .map(|o| String::from_utf8_lossy(&o.stdout).trim().to_string())
1122 .unwrap_or_default();
1123 let username = std::env::var("USER").unwrap_or_default();
1124 if !hostname.is_empty() && !username.is_empty() {
1125 let mut h = Sha256::new();
1126 h.update(b"treeship-machine-key:");
1127 h.update(hostname.as_bytes());
1128 h.update(b":");
1129 h.update(username.as_bytes());
1130 h.update(b":");
1131 h.update(store_dir.to_string_lossy().as_bytes());
1132 return Ok(h.finalize().into());
1133 }
1134 }
1135
1136 // 3. Fallback: random seed file. Co-located with the keystore so a
1137 // project-local keystore (/proj/.treeship/keys/) keeps its seed at
1138 // /proj/.treeship/machine_seed -- never reaching for ~/.treeship.
1139 // A global keystore (~/.treeship/keys/) co-locates to
1140 // ~/.treeship/machine_seed, which is byte-identical to the
1141 // pre-v0.9.6 location, so existing global keystores keep working.
1142 //
1143 // Backward-compat read order:
1144 // 1. <store_dir>/../machine_seed (the new co-located path)
1145 // 2. ~/.treeship/machine_seed (the old hardcoded path)
1146 // Write order on first creation:
1147 // 1. <store_dir>/../machine_seed if the parent exists/is writable
1148 // 2. ~/.treeship/machine_seed as a last resort
1149 //
1150 // This makes project-local config truly self-contained: an
1151 // isolated /proj keystore can decrypt its own keys even when
1152 // the user's ~/.treeship is corrupt or on a different machine,
1153 // closing the trust-fabric isolation gap that blocked
1154 // project-local smoke tests.
1155 let local_seed_path = store_dir.parent().map(|p| p.join("machine_seed"));
1156 let home = std::env::var("HOME")
1157 .map(std::path::PathBuf::from)
1158 .map_err(|_| KeyError::Crypto("HOME not set".to_string()))?;
1159 let global_seed_path = home.join(".treeship").join("machine_seed");
1160
1161 let seed = if let Some(local) = local_seed_path.as_ref().filter(|p| p.exists()) {
1162 fs::read_to_string(local).map_err(KeyError::Io)?
1163 } else if global_seed_path.exists() {
1164 // Backward-compat: an existing global seed keeps decrypting any
1165 // keystore that was encrypted under it (in particular the
1166 // standard ~/.treeship/keys/ case where local == global).
1167 fs::read_to_string(&global_seed_path).map_err(KeyError::Io)?
1168 } else {
1169 let mut bytes = [0u8; 32];
1170 rand::thread_rng().fill_bytes(&mut bytes);
1171 let seed_hex = hex_encode(&bytes);
1172
1173 // Prefer creating the seed locally. Falls back to the global
1174 // path only when the keystore has no usable parent (rare;
1175 // happens when store_dir is "/" or similar pathological input).
1176 let target = match local_seed_path.as_ref() {
1177 Some(p) => {
1178 let _ = fs::create_dir_all(p.parent().unwrap_or(Path::new(".")));
1179 p.clone()
1180 }
1181 None => {
1182 let _ = fs::create_dir_all(global_seed_path.parent().unwrap_or(Path::new(".")));
1183 global_seed_path.clone()
1184 }
1185 };
1186 fs::write(&target, &seed_hex).map_err(KeyError::Io)?;
1187 #[cfg(unix)]
1188 {
1189 use std::os::unix::fs::PermissionsExt;
1190 let _ = fs::set_permissions(&target, fs::Permissions::from_mode(0o600));
1191 }
1192 seed_hex
1193 };
1194
1195 let mut h = Sha256::new();
1196 h.update(b"treeship-machine-key-fallback:");
1197 h.update(seed.trim().as_bytes());
1198 h.update(b":");
1199 h.update(store_dir.to_string_lossy().as_bytes());
1200 Ok(h.finalize().into())
1201}
1202
1203/// Stable machine key derivation for NEW keys (VI P-256, etc).
1204/// Uses hardware identifiers that survive hostname/user changes.
1205/// For legacy ship Ed25519 keys, use `derive_machine_key()` instead.
1206pub fn derive_machine_key_stable(store_dir: &Path) -> Result<[u8; 32], KeyError> {
1207 // 1. Linux: /etc/machine-id
1208 if let Ok(id) = fs::read_to_string("/etc/machine-id") {
1209 let trimmed = id.trim();
1210 if !trimmed.is_empty() {
1211 let mut h = Sha256::new();
1212 h.update(b"treeship-machine-key-v2:");
1213 h.update(trimmed.as_bytes());
1214 h.update(b":");
1215 h.update(store_dir.to_string_lossy().as_bytes());
1216 return Ok(h.finalize().into());
1217 }
1218 }
1219
1220 // 2. macOS: IOPlatformSerialNumber (hardware serial, stable across
1221 // hostname changes, user renames, non-interactive shells)
1222 #[cfg(target_os = "macos")]
1223 {
1224 if let Ok(output) = std::process::Command::new("ioreg")
1225 .args(["-rd1", "-c", "IOPlatformExpertDevice"])
1226 .output()
1227 {
1228 let stdout = String::from_utf8_lossy(&output.stdout);
1229 for line in stdout.lines() {
1230 if line.contains("IOPlatformSerialNumber") {
1231 if let Some(serial) = line.split('"').nth(3) {
1232 if !serial.is_empty() {
1233 let mut h = Sha256::new();
1234 h.update(b"treeship-machine-key-v2:");
1235 h.update(serial.as_bytes());
1236 h.update(b":");
1237 h.update(store_dir.to_string_lossy().as_bytes());
1238 return Ok(h.finalize().into());
1239 }
1240 }
1241 }
1242 }
1243 }
1244 }
1245
1246 // 3. Fallback: persistent random seed in ~/.treeship/.internal/
1247 // Separate from key material. Mode 0600.
1248 let home = std::env::var("HOME")
1249 .map(std::path::PathBuf::from)
1250 .map_err(|_| KeyError::Crypto("HOME not set".to_string()))?;
1251 let seed_dir = home.join(".treeship").join(".internal");
1252 let _ = fs::create_dir_all(&seed_dir);
1253 #[cfg(unix)]
1254 {
1255 use std::os::unix::fs::PermissionsExt;
1256 let _ = fs::set_permissions(&seed_dir, fs::Permissions::from_mode(0o700));
1257 }
1258
1259 let seed_path = seed_dir.join("machine_seed_v2");
1260 let seed = if seed_path.exists() {
1261 fs::read_to_string(&seed_path).map_err(KeyError::Io)?
1262 } else {
1263 let mut bytes = [0u8; 32];
1264 rand::thread_rng().fill_bytes(&mut bytes);
1265 let seed_hex = hex_encode(&bytes);
1266 fs::write(&seed_path, &seed_hex).map_err(KeyError::Io)?;
1267 #[cfg(unix)]
1268 {
1269 use std::os::unix::fs::PermissionsExt;
1270 let _ = fs::set_permissions(&seed_path, fs::Permissions::from_mode(0o600));
1271 }
1272 seed_hex
1273 };
1274
1275 let mut h = Sha256::new();
1276 h.update(b"treeship-machine-key-v2-fallback:");
1277 h.update(seed.trim().as_bytes());
1278 h.update(b":");
1279 h.update(store_dir.to_string_lossy().as_bytes());
1280 Ok(h.finalize().into())
1281}
1282
1283// --- Utility ---
1284
1285fn new_key_id() -> KeyId {
1286 let mut b = [0u8; 8];
1287 rand::thread_rng().fill_bytes(&mut b);
1288 format!("key_{}", hex_encode(&b))
1289}
1290
1291fn fingerprint(pub_key: &[u8]) -> String {
1292 let h = Sha256::digest(pub_key);
1293 hex_encode(&h[..8])
1294}
1295
/// Render `b` as lowercase hexadecimal, two characters per byte.
///
/// The output is built in a single preallocated `String` via a digit
/// lookup table, rather than formatting (and heap-allocating) a
/// temporary string for every input byte.
fn hex_encode(b: &[u8]) -> String {
    const HEX_DIGITS: &[u8; 16] = b"0123456789abcdef";

    let mut out = String::with_capacity(b.len() * 2);
    for &byte in b {
        out.push(HEX_DIGITS[usize::from(byte >> 4)] as char);
        out.push(HEX_DIGITS[usize::from(byte & 0x0f)] as char);
    }
    out
}
1302
1303/// Verify a private-key file has restrictive permissions before loading
1304/// it for signing. Returns `Ok(())` on non-Unix platforms, when the
1305/// `TREESHIP_ALLOW_INSECURE_KEY_PERMS=1` escape hatch is set, or when
1306/// the file is not group/world accessible. Otherwise returns
1307/// `KeyError::InsecureKeyPerms` with the offending path and mode.
1308fn check_key_file_perms(path: &Path) -> Result<(), KeyError> {
1309 #[cfg(unix)]
1310 {
1311 use std::os::unix::fs::PermissionsExt;
1312 if std::env::var_os("TREESHIP_ALLOW_INSECURE_KEY_PERMS")
1313 .map(|v| v == "1")
1314 .unwrap_or(false)
1315 {
1316 return Ok(());
1317 }
1318 // Missing files are reported by the caller as NotFound -- don't
1319 // mask that with a perm error.
1320 let meta = match fs::metadata(path) {
1321 Ok(m) => m,
1322 Err(_) => return Ok(()),
1323 };
1324 let mode = meta.permissions().mode();
1325 if mode & 0o077 != 0 {
1326 return Err(KeyError::InsecureKeyPerms {
1327 path: path.to_path_buf(),
1328 mode,
1329 });
1330 }
1331 }
1332 let _ = path;
1333 Ok(())
1334}
1335
1336impl Store {
1337 /// Repair file permissions on the keystore directory and every file
1338 /// inside it: dir to 0700, key entry files and manifest to 0600.
1339 /// Used by `treeship doctor --fix`. No-op on non-Unix.
1340 ///
1341 /// Returns the list of (path, old_mode, new_mode) tuples for paths
1342 /// that were actually changed, so the caller can report what it did.
1343 pub fn fix_perms(&self) -> Result<Vec<(PathBuf, u32, u32)>, KeyError> {
1344 let mut changed: Vec<(PathBuf, u32, u32)> = Vec::new();
1345 #[cfg(unix)]
1346 {
1347 use std::os::unix::fs::PermissionsExt;
1348
1349 let dir_meta = fs::metadata(&self.dir)?;
1350 let dir_mode = dir_meta.permissions().mode() & 0o777;
1351 if dir_mode != 0o700 {
1352 fs::set_permissions(&self.dir, fs::Permissions::from_mode(0o700))?;
1353 changed.push((self.dir.clone(), dir_mode, 0o700));
1354 }
1355
1356 for entry in fs::read_dir(&self.dir)? {
1357 let entry = entry?;
1358 let path = entry.path();
1359 if !entry.file_type()?.is_file() {
1360 continue;
1361 }
1362 let mode = entry.metadata()?.permissions().mode() & 0o777;
1363 if mode != 0o600 {
1364 fs::set_permissions(&path, fs::Permissions::from_mode(0o600))?;
1365 changed.push((path, mode, 0o600));
1366 }
1367 }
1368 }
1369 Ok(changed)
1370 }
1371}
1372
/// Open (or create) the per-entry migration sentinel lock file. The
/// returned handle can be passed to `fs2::FileExt::lock_exclusive` to
/// serialize concurrent v1->v2 migrations of the same entry across
/// processes/threads (TS-2026-001 H3).
///
/// The file is opened read+write and is never truncated. On Unix the
/// owner-only mode (0o600) is requested at creation via
/// `OpenOptionsExt::mode`, so the sentinel never has a moment of looser
/// perms. On non-Unix platforms the file inherits parent ACLs (the
/// keystore dir is owner-scoped already).
fn open_migration_lock_file(path: &Path) -> Result<fs::File, io::Error> {
    let mut opts = fs::OpenOptions::new();
    opts.create(true).read(true).write(true).truncate(false);
    #[cfg(unix)]
    {
        use std::os::unix::fs::OpenOptionsExt;
        opts.mode(0o600);
    }
    opts.open(path)
}
1403
1404/// Atomically write `data` to `path` with owner-only (0o600) permissions on
1405/// Unix.
1406///
1407/// TS-2026-001 H1 + H2: the prior implementation was truncate-then-write,
1408/// which destroys the original file if the process crashes mid-write. For
1409/// the keystore that's catastrophic -- a crash during transparent v1->v2
1410/// migration would leave a zero-byte (or partial) key entry on disk and
1411/// the private key would be unrecoverable. This implementation writes to
1412/// a sibling tmp file in the same directory, fsyncs the bytes through to
1413/// the platter, then performs a POSIX-atomic same-filesystem `rename(2)`.
1414/// A crash before the rename leaves the original file intact; the tmp
1415/// file is harmless garbage that the next successful write will overwrite.
1416///
1417/// The 0o600 mode is set at file *creation* via `OpenOptionsExt::mode`
1418/// so there is no window in which the file exists with looser perms.
1419/// The prior `set_permissions` post-write call is dropped because it was
1420/// redundant and gave the appearance (but not the substance) of safety.
1421fn write_file_600(path: &Path, data: &[u8]) -> Result<(), KeyError> {
1422 // Place the tmp file in the same directory as the final path so the
1423 // rename stays on the same filesystem (cross-FS renames are not atomic
1424 // and degrade to copy+unlink, defeating the whole point).
1425 let tmp_path = path.with_extension("tmp");
1426
1427 // Best-effort cleanup of any stale tmp from a prior crash before we
1428 // start writing. Ignored on error -- if it doesn't exist that's fine,
1429 // and if it can't be removed the OpenOptions call below will surface
1430 // the underlying error.
1431 let _ = fs::remove_file(&tmp_path);
1432
1433 let write_result: Result<(), KeyError> = (|| {
1434 #[cfg(unix)]
1435 let open = {
1436 use std::os::unix::fs::OpenOptionsExt;
1437 fs::OpenOptions::new()
1438 .write(true)
1439 .create(true)
1440 .truncate(true)
1441 .mode(0o600)
1442 .open(&tmp_path)
1443 };
1444 #[cfg(not(unix))]
1445 let open = fs::OpenOptions::new()
1446 .write(true)
1447 .create(true)
1448 .truncate(true)
1449 .open(&tmp_path);
1450
1451 let mut f = open?;
1452 f.write_all(data)?;
1453 // sync_all flushes both data AND metadata, so on a crash after
1454 // the rename, fsck/journal recovery sees the new bytes -- not a
1455 // ghost inode with stale content.
1456 f.sync_all()?;
1457 Ok(())
1458 })();
1459
1460 if let Err(e) = write_result {
1461 // Best-effort cleanup so the next write isn't surprised by a
1462 // half-written tmp. Errors here are not surfaced: the original
1463 // write error is what the caller needs to see.
1464 let _ = fs::remove_file(&tmp_path);
1465 return Err(e);
1466 }
1467
1468 // Atomic same-filesystem rename. On Unix this is a single
1469 // rename(2) syscall guaranteed by POSIX to be atomic with respect
1470 // to other observers. On Windows std::fs::rename is implemented
1471 // via MoveFileEx with MOVEFILE_REPLACE_EXISTING (atomic on NTFS,
1472 // best-effort elsewhere). After this returns Ok, the new bytes are
1473 // visible at `path` and the tmp file no longer exists.
1474 if let Err(e) = fs::rename(&tmp_path, path) {
1475 let _ = fs::remove_file(&tmp_path);
1476 return Err(KeyError::Io(e));
1477 }
1478
1479 // fsync the parent directory so the rename's directory-entry update
1480 // is itself persisted. The previous code only fsynced the tmp
1481 // file's contents (via sync_all on the file handle) -- on ext4/xfs
1482 // with default mount options, the rename can return to userspace
1483 // before the dirent metadata has been written to the journal. A
1484 // power loss in that window leaves the directory entry pointing at
1485 // the OLD inode (or, worse, missing entirely if both old and new
1486 // were unlinked from the parent), even though both the data bytes
1487 // and the rename syscall ostensibly completed. The H1 doc-comment
1488 // above promised stronger durability than the code delivered;
1489 // fsyncing the parent dir closes that gap.
1490 //
1491 // Best-effort on Unix: a directory open + sync_all is the standard
1492 // pattern (see e.g. SQLite's atomic-commit, leveldb, lmdb). On
1493 // platforms where opening a directory for sync isn't supported, we
1494 // silently skip -- the rename is still atomic-with-respect-to-
1495 // observers, we just don't guarantee crash-durability of the
1496 // dirent update.
1497 #[cfg(unix)]
1498 {
1499 if let Some(parent) = path.parent() {
1500 // Errors here are non-fatal: the rename succeeded and the
1501 // common case (no power loss before the next fs flush) is
1502 // correct. We surface a failure to open/sync the dir only
1503 // if the rename itself succeeded, since otherwise the
1504 // caller would mistake a durability hint for a write
1505 // failure. swallow silently rather than return.
1506 if let Ok(dir) = fs::File::open(parent) {
1507 let _ = dir.sync_all();
1508 }
1509 }
1510 }
1511
1512 Ok(())
1513}
1514
/// Whole seconds elapsed since the Unix epoch according to the system
/// clock. Yields 0 if the clock reports a time before the epoch.
fn unix_now() -> u64 {
    use std::time::{SystemTime, UNIX_EPOCH};
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_secs(),
        Err(_) => 0,
    }
}
1522
1523#[cfg(test)]
1524mod tests {
1525 use super::*;
1526
1527 fn temp_dir_path() -> PathBuf {
1528 let mut p = std::env::temp_dir();
1529 p.push(format!("treeship-test-{}", {
1530 let mut b = [0u8; 4];
1531 rand::thread_rng().fill_bytes(&mut b);
1532 hex_encode(&b)
1533 }));
1534 p
1535 }
1536
1537 fn make_store() -> (Store, PathBuf) {
1538 let dir = temp_dir_path();
1539 let store = Store::open(&dir).unwrap();
1540 (store, dir)
1541 }
1542
/// Best-effort removal of a test's scratch directory; failures are
/// ignored.
fn cleanup(dir: PathBuf) {
    fs::remove_dir_all(dir).ok();
}
1546
// A freshly generated default key carries a `key_` id, the ed25519
// algorithm tag, a non-empty fingerprint and a 32-byte public key.
#[test]
fn generate_key() {
    let (store, dir) = make_store();
    let key_info = store.generate(true).unwrap();

    assert!(key_info.id.starts_with("key_"));
    assert_eq!(key_info.algorithm, "ed25519");
    assert!(!key_info.fingerprint.is_empty());
    assert_eq!(key_info.public_key.len(), 32);

    cleanup(dir);
}
1557
// After generating a default key, the store's default signer reports a
// key id and produces 64-byte signatures.
#[test]
fn default_signer_works() {
    let (store, dir) = make_store();
    store.generate(true).unwrap();

    let signer = store.default_signer().unwrap();
    assert!(!signer.key_id().is_empty());

    let message = crate::attestation::pae("text/plain", b"test");
    let signature = signer.sign(&message).unwrap();
    assert_eq!(signature.len(), 64);

    cleanup(dir);
}
1569
// Round-trips the deprecated public shims. Both route straight to the
// legacy v1 construction, so this pins that v1 ciphertexts still
// decrypt correctly.
#[test]
fn encrypt_decrypt_roundtrip() {
    let key = [42u8; 32];
    let plaintext = b"super secret private key material here!";

    let (blob, nonce) = aes_gcm_encrypt(&key, plaintext).unwrap();
    let recovered = aes_gcm_decrypt(&key, &blob, &nonce).unwrap();

    assert_eq!(recovered, plaintext);
}
1580
// A v1 blob must not open under a different key.
#[test]
fn decrypt_wrong_key_fails() {
    let right_key = [42u8; 32];
    let wrong_key = [99u8; 32];
    let (blob, nonce) = aes_gcm_encrypt(&right_key, b"secret").unwrap();

    let outcome = aes_gcm_decrypt(&wrong_key, &blob, &nonce);
    assert!(outcome.is_err());
}
1588
1589 // --- v2 AEAD tests (TS-2026-001 fix) -----------------------------------
1590
// Shared entry id + public key for the unit-level v2 tests below. The
// AAD builder binds both into the GCM tag, so encrypt and decrypt must
// be given identical values. Constants keep each test focused on its
// own bit-flip / tamper assertion without dragging Store setup into
// the picture.
const TEST_ENTRY_ID: &str = "key_unit_test_entry_0001";
const TEST_PUBLIC_KEY: &[u8; 32] = &[0xAA_u8; 32];
1598
// A v2 blob has the documented framing and length, and the dispatcher
// recovers the original plaintext from it.
#[test]
fn v2_encrypt_decrypt_roundtrip() {
    let key = [7u8; 32];
    let plaintext = b"super secret private key material here!";

    let blob = encrypt_for_disk_v2(&key, TEST_ENTRY_ID, TEST_PUBLIC_KEY, plaintext).unwrap();

    // Structural check on the framing.
    assert_eq!(blob[0], KEYSTORE_MAGIC, "magic byte");
    assert_eq!(blob[1], KEYSTORE_VERSION_V2, "version byte");
    let expected_len = 2 + 12 + plaintext.len() + 16;
    assert_eq!(blob.len(), expected_len, "magic+version+nonce+ct+tag length");

    let recovered =
        decrypt_from_disk(&key, TEST_ENTRY_ID, TEST_PUBLIC_KEY, &blob, &[]).unwrap();
    assert_eq!(&*recovered, plaintext);
}
1615
// With v2 framing and the wrong key the AEAD must reject. The
// dispatcher then tries the v1 fallback, which also fails on this
// input, so the caller gets an error rather than wrong plaintext.
#[test]
fn v2_decrypt_wrong_key_fails() {
    let right_key = [7u8; 32];
    let wrong_key = [99u8; 32];
    let blob =
        encrypt_for_disk_v2(&right_key, TEST_ENTRY_ID, TEST_PUBLIC_KEY, b"secret").unwrap();

    let outcome = decrypt_from_disk(&wrong_key, TEST_ENTRY_ID, TEST_PUBLIC_KEY, &blob, &[]);
    assert!(outcome.is_err(), "wrong key must fail");
}
1627
/// Flipping a bit in the ciphertext body of a v2 blob must make decryption fail.
///
/// The v2 blob layout asserted by `v2_encrypt_decrypt_roundtrip` is
/// `magic (1) | version (1) | nonce (12) | ciphertext | tag (16)`, so the
/// ciphertext body starts at offset 14 and ends 16 bytes before the end.
#[test]
fn v2_tamper_ciphertext_fails() {
    // magic + version + nonce.
    const FRAMING_LEN: usize = 2 + 12;
    // Trailing GCM tag.
    const TAG_LEN: usize = 16;

    let key = [7u8; 32];
    let mut blob = encrypt_for_disk_v2(
        &key, TEST_ENTRY_ID, TEST_PUBLIC_KEY, b"super secret private key"
    ).unwrap();

    // Guard the index below: there must be at least one ciphertext byte
    // between the framing and the tag.
    assert!(
        blob.len() > FRAMING_LEN + TAG_LEN,
        "v2 blob must contain a non-empty ciphertext body"
    );

    // Flip one bit in the first ciphertext byte. The earlier index,
    // `blob.len() - 5`, landed inside the trailing tag, so the test only
    // repeated the tag-tamper check and never touched the body.
    blob[FRAMING_LEN] ^= 0x01;

    let result = decrypt_from_disk(&key, TEST_ENTRY_ID, TEST_PUBLIC_KEY, &blob, &[]);
    assert!(result.is_err(), "tampered ciphertext must fail to decrypt");
}
1642
/// Flipping a bit inside the nonce of a v2 blob must make decryption fail.
#[test]
fn v2_tamper_nonce_fails() {
    let machine_key = [7u8; 32];
    let mut sealed = encrypt_for_disk_v2(
        &machine_key,
        TEST_ENTRY_ID,
        TEST_PUBLIC_KEY,
        b"super secret private key",
    )
    .unwrap();

    // The nonce occupies bytes [2..14]; offset 5 is well inside it.
    sealed[5] ^= 0x01;

    let outcome = decrypt_from_disk(&machine_key, TEST_ENTRY_ID, TEST_PUBLIC_KEY, &sealed, &[]);
    assert!(outcome.is_err(), "tampered nonce must fail to decrypt");
}
1654
/// Flipping a bit in the trailing GCM tag of a v2 blob must make decryption fail.
#[test]
fn v2_tamper_tag_fails() {
    let machine_key = [7u8; 32];
    let mut sealed = encrypt_for_disk_v2(
        &machine_key,
        TEST_ENTRY_ID,
        TEST_PUBLIC_KEY,
        b"super secret private key",
    )
    .unwrap();

    // The tag is the last 16 bytes; corrupt the high bit of the final byte.
    *sealed.last_mut().unwrap() ^= 0x80;

    let outcome = decrypt_from_disk(&machine_key, TEST_ENTRY_ID, TEST_PUBLIC_KEY, &sealed, &[]);
    assert!(outcome.is_err(), "tampered GCM tag must fail to decrypt");
}
1667
/// Every v2 encryption must use a fresh nonce.
///
/// Nonce reuse under one key breaks AES-GCM, so this checks both a
/// two-sample comparison and a 10k-sample uniqueness sweep.
#[test]
fn v2_nonces_are_unique_across_writes() {
    let machine_key = [7u8; 32];

    // Same key and same plaintext must still give different blobs, because
    // the nonce (bytes [2..14]) is drawn per write.
    let first =
        encrypt_for_disk_v2(&machine_key, TEST_ENTRY_ID, TEST_PUBLIC_KEY, b"identical").unwrap();
    let second =
        encrypt_for_disk_v2(&machine_key, TEST_ENTRY_ID, TEST_PUBLIC_KEY, b"identical").unwrap();
    assert_ne!(first, second,
        "two v2 encryptions of the same plaintext must differ");
    assert_ne!(&first[2..14], &second[2..14], "nonces must differ");

    // L1 (TS-2026-001 audit): collect 10k nonces and require all of them
    // to be distinct. For uniformly random 96-bit nonces the birthday
    // bound on a collision at this volume is roughly 2^-70, so a
    // duplicate points at a nonce-generation defect (mis-seeded or
    // accidentally deterministic RNG), which the two-sample check above
    // cannot catch.
    const N: usize = 10_000;
    let seen: std::collections::HashSet<Vec<u8>> = (0..N)
        .map(|_| {
            let sealed =
                encrypt_for_disk_v2(&machine_key, TEST_ENTRY_ID, TEST_PUBLIC_KEY, b"x").unwrap();
            sealed[2..14].to_vec()
        })
        .collect();
    assert_eq!(
        seen.len(),
        N,
        "all {} v2 nonces must be unique; collision => RNG defect",
        N
    );
}
1705
/// M2: a v2 blob whose version byte has been overwritten must be rejected by `decrypt_v2`.
#[test]
fn v2_tamper_version_byte_fails() {
    let machine_key = [7u8; 32];
    let mut sealed = encrypt_for_disk_v2(
        &machine_key,
        TEST_ENTRY_ID,
        TEST_PUBLIC_KEY,
        b"super secret private key",
    )
    .unwrap();

    // Confirm the starting point, then overwrite the version byte with a
    // value that is not the v2 marker.
    assert_eq!(sealed[1], KEYSTORE_VERSION_V2);
    sealed[1] = 0xff;

    let outcome = decrypt_v2(&machine_key, TEST_ENTRY_ID, TEST_PUBLIC_KEY, &sealed);
    assert!(outcome.is_err(), "altered version byte must be rejected");
}
1723
/// M2: a blob with valid v2 framing but a tag computed without AAD must be
/// rejected by `decrypt_v2`.
#[test]
fn v2_aad_binding_detects_framing_substitution() {
    use aes_gcm::aead::Aead;

    let key = [7u8; 32];
    let plaintext = b"M2 AAD bound material";

    // Encrypt directly with AES-256-GCM and *no* associated data. This
    // models a blob produced before AAD was bound into the tag.
    let key_buf: Zeroizing<[u8; 32]> = Zeroizing::new(key);
    let cipher = Aes256Gcm::new(AesKey::<Aes256Gcm>::from_slice(key_buf.as_slice()));
    let nonce = Aes256Gcm::generate_nonce(&mut AeadOsRng);
    let sealed_without_aad = cipher.encrypt(&nonce, plaintext.as_slice()).unwrap();

    // Hand-assemble the v2 framing around it: magic, version, nonce, ct+tag.
    let forged: Vec<u8> = [
        &[KEYSTORE_MAGIC, KEYSTORE_VERSION_V2][..],
        nonce.as_slice(),
        sealed_without_aad.as_slice(),
    ]
    .concat();

    // The framing looks right, so only the AAD mismatch can cause the rejection.
    assert_eq!(forged[0], KEYSTORE_MAGIC);
    assert_eq!(forged[1], KEYSTORE_VERSION_V2);
    let outcome = decrypt_v2(&key, TEST_ENTRY_ID, TEST_PUBLIC_KEY, &forged);
    assert!(outcome.is_err(),
        "ciphertext computed without AAD must fail to decrypt now that AAD is bound");
}
1757
/// M1: when a v2-framed blob fails authentication, the dispatcher's error
/// must carry the "MAC verification failed" text so the user is pointed
/// at the right problem.
#[test]
fn dispatcher_surfaces_v2_error_on_corrupted_v2_blob() {
    let machine_key = [7u8; 32];
    let mut sealed =
        encrypt_for_disk_v2(&machine_key, TEST_ENTRY_ID, TEST_PUBLIC_KEY, b"hello").unwrap();

    // Corrupt the final byte, which sits in the trailing 16-byte tag.
    // The magic and version bytes stay intact, so the blob still looks
    // like v2.
    *sealed.last_mut().unwrap() ^= 0x01;

    let err = decrypt_from_disk(&machine_key, TEST_ENTRY_ID, TEST_PUBLIC_KEY, &sealed, &[])
        .unwrap_err();

    // Match on the substring rather than the whole message.
    assert!(
        err.contains("MAC verification failed"),
        "dispatcher must surface the v2 MAC error on corrupted v2 blob, got: {err}"
    );
}
1785
/// A ciphertext in the legacy v1 format (as written by Treeship <= v0.10.2)
/// must still decrypt through `decrypt_from_disk`, so existing users are
/// not locked out.
#[test]
fn legacy_v1_ciphertext_still_decrypts_via_dispatcher() {
    let machine_key = [13u8; 32];
    let secret = b"pre-v0.10.3 keystore entry";

    let (v1_blob, v1_nonce) = legacy_v1_encrypt(&machine_key, secret).unwrap();

    // The helper's output must be recognised as legacy, otherwise the
    // test would not exercise the v1 route at all.
    assert!(is_legacy_v1(&v1_blob),
        "legacy_v1_encrypt output must classify as legacy");

    // The entry id and public key are required by the signature; the
    // same placeholder constants as the other tests are passed here.
    let recovered = decrypt_from_disk(
        &machine_key,
        TEST_ENTRY_ID,
        TEST_PUBLIC_KEY,
        &v1_blob,
        &v1_nonce,
    )
    .unwrap();
    assert_eq!(&*recovered, secret);
}
1809
/// End-to-end migration check: an entry stored in the legacy v1 format
/// must be rewritten on disk in v2 format after `Store::signer` loads it,
/// and the rewritten entry must itself be loadable.
#[test]
fn store_signer_migrates_legacy_entry_to_v2() {
    let (store, dir) = make_store();

    // `generate` writes a v2 entry. Downgrade it by hand to simulate a
    // keystore carried over from v0.10.2.
    let info = store.generate(true).unwrap();
    let entry_path = store.entry_path(&info.id);

    let mut entry: EncryptedEntry =
        serde_json::from_slice(&fs::read(&entry_path).unwrap()).unwrap();
    let secret = decrypt_from_disk(
        &store.machine_key,
        &entry.id,
        &entry.public_key,
        &entry.enc_priv_key,
        &entry.nonce,
    )
    .unwrap();
    let (v1_blob, v1_nonce) = legacy_v1_encrypt(&store.machine_key, &secret).unwrap();

    // Only the ciphertext and nonce change; every other field is carried over.
    entry.enc_priv_key = v1_blob;
    entry.nonce = v1_nonce;
    fs::write(&entry_path, serde_json::to_vec_pretty(&entry).unwrap()).unwrap();

    // Use a fresh Store so nothing cached in the first instance can mask
    // the on-disk downgrade. Loading the signer must succeed and rewrite
    // the entry.
    let reopened = Store::open(&dir).unwrap();
    let _signer = reopened.signer(&info.id).unwrap();

    let migrated: EncryptedEntry =
        serde_json::from_slice(&fs::read(&entry_path).unwrap()).unwrap();
    assert!(!is_legacy_v1(&migrated.enc_priv_key),
        "post-migration entry must be in v2 format");
    assert_eq!(migrated.enc_priv_key[0], KEYSTORE_MAGIC);
    assert_eq!(migrated.enc_priv_key[1], KEYSTORE_VERSION_V2);
    assert!(migrated.nonce.is_empty(),
        "v2 entries serialize an empty legacy nonce field");

    // L2 (TS-2026-001 audit): the framing checks only show the blob *looks*
    // like v2. Loading the signer through a third Store proves the
    // migrated ciphertext actually decrypts.
    let reopened_again = Store::open(&dir).unwrap();
    let _signer = reopened_again
        .signer(&info.id)
        .expect("post-migration v2 decrypt works");

    cleanup(dir);
}
1879
/// A key generated by one `Store` must be loadable by a second `Store`
/// opened on the same directory, and signatures from the reloaded signer
/// must verify against the originally reported public key.
#[test]
fn persist_and_reload() {
    use crate::attestation::{sign, Verifier};
    use crate::statements::ActionStatement;
    use ed25519_dalek::VerifyingKey;

    let (store, dir) = make_store();
    let info = store.generate(true).unwrap();

    // Second Store instance over the same directory.
    let reopened = Store::open(&dir).unwrap();
    let signer = reopened.signer(&info.id).unwrap();
    assert_eq!(signer.key_id(), info.id);

    // Build a verifier that knows only the public key reported at
    // generation time.
    let key_id = info.id.clone();
    let raw_public: [u8; 32] = info.public_key.try_into().unwrap();
    let verifying_key = VerifyingKey::from_bytes(&raw_public).unwrap();
    let mut verifier = Verifier::new(std::collections::HashMap::new());
    verifier.add_key(key_id, verifying_key);

    // Sign a statement with the reloaded signer and verify the envelope.
    let statement = ActionStatement::new("agent://test", "tool.call");
    let payload_type = crate::statements::payload_type("action");
    let signed = sign(&payload_type, &statement, signer.as_ref()).unwrap();
    verifier.verify(&signed.envelope).unwrap();

    cleanup(dir);
}
1911
/// `list()` must report every generated key, with exactly one flagged as default.
#[test]
fn list_keys() {
    let (store, dir) = make_store();
    // One default key and one non-default key.
    store.generate(true).unwrap();
    store.generate(false).unwrap();

    let listed = store.list().unwrap();
    assert_eq!(listed.len(), 2);

    let default_count = listed.iter().filter(|entry| entry.is_default).count();
    assert_eq!(default_count, 1);

    cleanup(dir);
}
1923
/// Asking an empty store for its default signer must return an error.
#[test]
fn no_default_key_errors() {
    let (store, dir) = make_store();

    let outcome = store.default_signer();
    assert!(outcome.is_err());

    cleanup(dir);
}
1930
/// Rotating the default key must stamp the predecessor (expiry plus forward
/// link), mint a fresh successor, move the default flag, and expose the
/// same metadata through `list()`.
#[test]
fn rotate_mints_successor_and_links_predecessor() {
    let (store, dir) = make_store();
    let original = store.generate(true).unwrap();
    assert!(original.valid_until.is_none(), "fresh key has no expiry");
    assert!(original.successor_key_id.is_none(), "fresh key has no successor");

    let grace = std::time::Duration::from_secs(3600);
    let RotationResult { predecessor, successor, .. } =
        store.rotate(None, grace, true).unwrap();

    // Predecessor: same key, now with an expiry and a forward link.
    assert_eq!(predecessor.id, original.id);
    assert!(predecessor.valid_until.is_some(),
        "predecessor must get valid_until after rotation");
    assert_eq!(predecessor.successor_key_id.as_deref(),
        Some(successor.id.as_str()),
        "predecessor must link forward to successor");
    assert!(!predecessor.is_default,
        "after rotation with set_default=true, predecessor is no longer default");

    // Successor: a new key at the head of the chain.
    assert_ne!(successor.id, original.id);
    assert!(successor.valid_until.is_none(), "successor has no expiry yet");
    assert!(successor.successor_key_id.is_none(), "successor is chain head");
    assert!(successor.is_default, "successor is the new default");

    // The listing must agree with the rotation result.
    let listed = store.list().unwrap();
    assert_eq!(listed.len(), 2);
    let listed_predecessor = listed.iter().find(|k| k.id == original.id).unwrap();
    assert!(listed_predecessor.valid_until.is_some());
    assert_eq!(listed_predecessor.successor_key_id.as_deref(),
        Some(successor.id.as_str()));

    cleanup(dir);
}
1968
/// With `set_default = false`, rotation must mint a successor but leave
/// the default flag on the predecessor.
#[test]
fn rotate_with_set_default_false_keeps_predecessor_active() {
    let (store, dir) = make_store();
    let original = store.generate(true).unwrap();

    let grace = std::time::Duration::from_secs(3600);
    let rotation = store.rotate(None, grace, false).unwrap();

    // The default flag stays where it was, both in the result and in the store.
    assert!(rotation.predecessor.is_default);
    assert!(!rotation.successor.is_default);
    assert_eq!(store.default_key_id().unwrap(), original.id);

    cleanup(dir);
}
1985
/// After rotation, the predecessor's private key must still be loadable
/// and able to sign. Lifecycle enforcement is left to verifiers; the
/// keystore must not discard key material early.
#[test]
fn rotate_predecessor_signing_still_works_during_grace_window() {
    let (store, dir) = make_store();
    let original = store.generate(true).unwrap();

    let grace = std::time::Duration::from_secs(3600);
    let _ = store.rotate(None, grace, true).unwrap();

    // Load the rotated-away key and produce a signature with it.
    let signer = store.signer(&original.id).unwrap();
    let message = crate::attestation::pae("text/plain", b"grace-window-payload");
    let signature = signer.sign(&message).unwrap();
    assert_eq!(signature.len(), 64);

    cleanup(dir);
}
2004
/// A key that already has a successor must not be rotated again; the
/// caller is expected to rotate the chain head instead.
#[test]
fn rotate_refuses_to_rotate_already_rotated_key() {
    let (store, dir) = make_store();
    store.generate(true).unwrap();

    let grace = std::time::Duration::from_secs(60);
    let first_rotation = store.rotate(None, grace, true).unwrap();

    // Target the predecessor explicitly: it already links to a successor.
    let already_rotated = &first_rotation.predecessor.id;
    let err = store
        .rotate(Some(already_rotated), grace, true)
        .unwrap_err();

    match err {
        KeyError::Crypto(msg) => assert!(
            msg.contains("already been rotated"),
            "error must explain why: {msg}"
        ),
        other => panic!("expected Crypto error, got {other:?}"),
    }
    cleanup(dir);
}
2029
/// `successor_chain` must return the rotation chain starting at the given
/// key and ending at the newest key, whichever link it is asked about.
#[test]
fn successor_chain_walks_forward() {
    let (store, dir) = make_store();
    let grace = std::time::Duration::from_secs(60);

    // Three generations: the original key and two successive rotations of
    // the default key.
    let oldest = store.generate(true).unwrap().id;
    let middle = store.rotate(None, grace, true).unwrap().successor.id;
    let newest = store.rotate(None, grace, true).unwrap().successor.id;

    // From the oldest key: the whole chain.
    let full = store.successor_chain(&oldest).unwrap();
    assert_eq!(full, vec![oldest.clone(), middle.clone(), newest.clone()],
        "chain must be ordered head -> tail");

    // From the middle key: the oldest key is not included.
    let from_middle = store.successor_chain(&middle).unwrap();
    assert_eq!(from_middle, vec![middle.clone(), newest.clone()]);

    // From the newest key: only itself.
    let from_newest = store.successor_chain(&newest).unwrap();
    assert_eq!(from_newest, vec![newest.clone()]);

    cleanup(dir);
}
2055
/// `valid_keys_at` must include a rotated predecessor while its grace
/// window is open and drop it once the window has passed.
#[test]
fn valid_keys_at_filters_by_grace_window() {
    let (store, dir) = make_store();
    let _ = store.generate(true).unwrap();

    // One-hour grace window.
    let grace = std::time::Duration::from_secs(3600);
    let rotation = store.rotate(None, grace, true).unwrap();

    // Right after rotation both the predecessor (inside its grace
    // window) and the successor count as valid.
    let at_rotation = store.valid_keys_at(unix_now()).unwrap();
    assert_eq!(at_rotation.len(), 2, "both predecessor (in grace) and successor should be valid");

    // Two hours later the one-hour window has closed; only the successor remains.
    let past_grace = store.valid_keys_at(unix_now() + 7200).unwrap();
    assert_eq!(past_grace.len(), 1,
        "after grace window only successor remains valid");
    assert_eq!(past_grace[0].id, rotation.successor.id);

    cleanup(dir);
}
2079
/// Regression: after a rotation, the same `Store` instance must already
/// see the predecessor as rotated, so a same-process retry is refused
/// instead of minting a duplicate successor.
///
/// A manifest-write failure cannot easily be injected here, so this test
/// checks the observable precondition only: once `rotate()` has returned
/// successfully, a second `rotate()` aimed at the same predecessor is
/// rejected with the "already been rotated" error.
// NOTE(review): the original comment also relied on rotate() updating its
// cache before writing the manifest; rotate() is not visible from this
// test, so confirm that ordering against the implementation.
#[test]
fn rotate_cache_reflects_stamped_predecessor_for_retry_safety() {
    let (store, dir) = make_store();
    let original = store.generate(true).unwrap();

    let grace = std::time::Duration::from_secs(60);
    let _ = store.rotate(None, grace, true).unwrap();

    // Retry against the same predecessor on the same Store. If the store
    // still saw the unstamped predecessor, this would succeed and create
    // a second successor.
    let err = store
        .rotate(Some(&original.id), grace, true)
        .unwrap_err();

    match err {
        KeyError::Crypto(msg) => assert!(
            msg.contains("already been rotated"),
            "cache should reflect stamped predecessor; got: {msg}"
        ),
        other => panic!("expected Crypto error, got {other:?}"),
    }

    cleanup(dir);
}
2128
/// Regression: if a successor's key file is missing on disk (for example
/// because an operator deleted it), walking the chain from its predecessor
/// must fail with `KeyError::NotFound` naming the missing key. It must
/// neither panic nor loop forever.
#[test]
fn rotated_predecessor_pointing_at_missing_successor_surfaces_clear_error() {
    let (store, dir) = make_store();
    store.generate(true).unwrap();

    let grace = std::time::Duration::from_secs(60);
    let rotation = store.rotate(None, grace, true).unwrap();

    // Delete the successor's entry file; the predecessor still points at it.
    let successor_file = store.entry_path(&rotation.successor.id);
    fs::remove_file(&successor_file).unwrap();

    // Walk the chain from a fresh Store so nothing cached by the first
    // instance can hide the missing file.
    let reopened = Store::open(&dir).unwrap();
    let err = reopened
        .successor_chain(&rotation.predecessor.id)
        .unwrap_err();

    match err {
        KeyError::NotFound(id) => assert_eq!(id, rotation.successor.id),
        other => panic!("expected NotFound error, got {other:?}"),
    }

    cleanup(dir);
}
2156
/// Entry files written before 0.9.5 have no `valid_until` or
/// `successor_key_id` keys. Such files must still deserialize, show up
/// in `list()` with both fields as `None`, and yield a working default
/// signer.
#[test]
fn legacy_entry_without_lifecycle_fields_loads() {
    let (store, dir) = make_store();
    let info = store.generate(true).unwrap();

    // Rewrite the on-disk JSON with the lifecycle keys stripped, as a
    // 0.9.4-or-earlier CLI would have written it.
    let path = store.entry_path(&info.id);
    let mut document: serde_json::Value =
        serde_json::from_slice(&fs::read(&path).unwrap()).unwrap();
    let fields = document.as_object_mut().unwrap();
    for lifecycle_key in ["valid_until", "successor_key_id"] {
        fields.remove(lifecycle_key);
    }
    fs::write(&path, serde_json::to_vec_pretty(&document).unwrap()).unwrap();

    // Read it back through a fresh Store (nothing cached).
    let reopened = Store::open(&dir).unwrap();
    let listed = reopened.list().unwrap();
    assert_eq!(listed.len(), 1);

    let only = &listed[0];
    assert!(only.valid_until.is_none(),
        "missing valid_until must default to None on legacy entry");
    assert!(only.successor_key_id.is_none(),
        "missing successor_key_id must default to None on legacy entry");

    let signer = reopened.default_signer().unwrap();
    assert_eq!(signer.key_id(), info.id);

    cleanup(dir);
}
2189
2190 // --- keystore permission hardening (PR 1) -------------------------------
2191
// The perm tests below mutate the process-global env var
// TREESHIP_ALLOW_INSECURE_KEY_PERMS. cargo test runs cases in
// parallel by default, so without serialization one test can set
// the bypass while another expects it unset, and the second test
// then fails intermittently depending on thread timing.
// Tests that touch the variable hold this mutex for their whole
// body; tests that never touch it do not need the lock.
static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());
2199
/// A freshly generated key entry must be owner read/write only (0600).
#[test]
#[cfg(unix)]
fn write_entry_creates_file_with_0600() {
    use std::os::unix::fs::PermissionsExt;

    let (store, dir) = make_store();
    let info = store.generate(true).unwrap();

    // Mask down to the permission bits; `mode()` also carries file-type bits.
    let entry_file = store.entry_path(&info.id);
    let metadata = fs::metadata(entry_file).unwrap();
    let mode = metadata.permissions().mode() & 0o777;
    assert_eq!(mode, 0o600, "freshly written key file must be 0600, got {:o}", mode);

    cleanup(dir);
}
2214
/// Loading a signer from a key file whose permissions have been loosened
/// to 0644 must fail with `KeyError::InsecureKeyPerms`, reporting the
/// offending path and mode.
#[test]
#[cfg(unix)]
fn signer_refuses_world_readable_key() {
    use std::os::unix::fs::PermissionsExt;

    // Hold the env lock so a sibling test cannot set the bypass variable
    // while this one runs. A poisoned lock is still usable.
    let _env_guard = ENV_LOCK.lock().unwrap_or_else(|poisoned| poisoned.into_inner());
    // Clear the bypass in case it leaks in from the host environment.
    std::env::remove_var("TREESHIP_ALLOW_INSECURE_KEY_PERMS");

    let (store, dir) = make_store();
    let info = store.generate(true).unwrap();

    // Simulate a checkout / scp / shared-volume mishap by making the key
    // file readable by group and other.
    let path = store.entry_path(&info.id);
    let loosened = fs::Permissions::from_mode(0o644);
    fs::set_permissions(&path, loosened).unwrap();

    let outcome = store.signer(&info.id);
    match outcome {
        Err(KeyError::InsecureKeyPerms { path: p, mode }) => {
            assert_eq!(p, path);
            assert_eq!(mode & 0o777, 0o644);
        }
        other => panic!("expected InsecureKeyPerms, got {:?}", other.map(|_| "ok")),
    }
    cleanup(dir);
}
2242
/// With `TREESHIP_ALLOW_INSECURE_KEY_PERMS` set to "1", loading a signer
/// from a 0644 key file must succeed.
#[test]
#[cfg(unix)]
fn signer_bypass_via_env_var() {
    use std::os::unix::fs::PermissionsExt;

    // Serialize with the other tests that touch the bypass variable.
    let _env_guard = ENV_LOCK.lock().unwrap_or_else(|poisoned| poisoned.into_inner());

    let (store, dir) = make_store();
    let info = store.generate(true).unwrap();
    let key_file = store.entry_path(&info.id);
    fs::set_permissions(&key_file, fs::Permissions::from_mode(0o644)).unwrap();

    // Keep the variable set only for the single call under test, and
    // clear it before asserting so a failure cannot leak it to later tests.
    std::env::set_var("TREESHIP_ALLOW_INSECURE_KEY_PERMS", "1");
    let outcome = store.signer(&info.id);
    std::env::remove_var("TREESHIP_ALLOW_INSECURE_KEY_PERMS");

    assert!(
        outcome.is_ok(),
        "bypass env var must allow signing: {:?}",
        outcome.err()
    );
    cleanup(dir);
}
2264
2265 // --- TS-2026-001 H3 migration-lock concurrency test -----------------
2266
/// H3: two threads, each with its own `Store`, call `signer` on the same
/// legacy v1 entry at the same time. Both loads must succeed, the entry
/// on disk must end up as a v2 entry that decrypts through `decrypt_v2`,
/// and no `.tmp` file may remain in the keystore directory.
///
/// Separate `Store` instances are used on purpose, so the two loads cannot
/// be serialized by state inside a single `Store` and have to coordinate
/// through the filesystem.
// NOTE(review): the original comment attributes the serialization to an
// advisory file lock around `migrate_entry_to_v2`; that code is not
// visible from this test, so confirm against the implementation.
#[test]
fn concurrent_migration_serializes_correctly() {
    use std::sync::Arc;
    use std::thread;

    // Downgrade a freshly generated v2 entry to legacy v1 on disk, the
    // same way `store_signer_migrates_legacy_entry_to_v2` does.
    let (store, dir) = make_store();
    let info = store.generate(true).unwrap();
    let entry_path = store.entry_path(&info.id);

    let mut entry: EncryptedEntry =
        serde_json::from_slice(&fs::read(&entry_path).unwrap()).unwrap();
    let secret = decrypt_from_disk(
        &store.machine_key,
        &entry.id,
        &entry.public_key,
        &entry.enc_priv_key,
        &entry.nonce,
    )
    .unwrap();
    let (v1_blob, v1_nonce) = legacy_v1_encrypt(&store.machine_key, &secret).unwrap();
    // Only the ciphertext and nonce change; every other field is carried over.
    entry.enc_priv_key = v1_blob;
    entry.nonce = v1_nonce;
    fs::write(&entry_path, serde_json::to_vec_pretty(&entry).unwrap()).unwrap();

    // Each racer gets its own copy of the directory handle and key id.
    let (first_root, first_id) = (Arc::new(dir.clone()), info.id.clone());
    let (second_root, second_id) = (Arc::new(dir.clone()), info.id.clone());

    let first = thread::spawn(move || -> Result<(), String> {
        let opened = Store::open(&*first_root).map_err(|e| e.to_string())?;
        opened
            .signer(&first_id)
            .map(|_| ())
            .map_err(|e| e.to_string())
    });
    let second = thread::spawn(move || -> Result<(), String> {
        let opened = Store::open(&*second_root).map_err(|e| e.to_string())?;
        opened
            .signer(&second_id)
            .map(|_| ())
            .map_err(|e| e.to_string())
    });

    first.join().unwrap().expect("thread 1 signer load must succeed");
    second.join().unwrap().expect("thread 2 signer load must succeed");

    // The entry on disk must now carry v2 framing.
    let migrated: EncryptedEntry =
        serde_json::from_slice(&fs::read(&entry_path).unwrap()).unwrap();
    assert!(
        !is_legacy_v1(&migrated.enc_priv_key),
        "post-concurrent-migration entry must be in v2 format"
    );
    assert_eq!(migrated.enc_priv_key[0], KEYSTORE_MAGIC);
    assert_eq!(migrated.enc_priv_key[1], KEYSTORE_VERSION_V2);

    // It must also decrypt on the v2 path using the entry's own id and
    // public key, the values the migration is expected to have bound in.
    let recovered = decrypt_v2(
        &store.machine_key,
        &migrated.id,
        &migrated.public_key,
        &migrated.enc_priv_key,
    )
    .expect("v2 entry must decrypt cleanly after concurrent migration");
    assert_eq!(recovered.len(), 32, "decrypted secret must be a 32-byte ed25519 scalar");

    // No leftover temporary files in the keystore directory.
    let remaining = fs::read_dir(&dir).unwrap().map(|item| item.unwrap().path());
    for p in remaining {
        assert!(
            !p.extension().is_some_and(|ext| ext == "tmp"),
            "no .tmp fragment must remain after migration, found: {}",
            p.display()
        );
    }

    cleanup(dir);
}
2376
2377 // --- TS-2026-001 H1 + H2 atomic write tests ------------------------
2378
/// H1: a failed `write_file_600` must leave the existing file at the
/// target path untouched.
///
/// The failure is provoked by making the keystore directory read-only
/// (0o500) after the original key entry is in place and then calling
/// `write_file_600` on that same entry path. The test then checks that:
///
/// * the call returned an error,
/// * the entry file is byte-identical to its pre-failure contents,
/// * a fresh `Store` can still load a working signer from it, and
/// * no sibling `.tmp` file is left next to the entry.
///
/// The directory permissions are restored *before* any assertion runs, so
/// a failing assertion cannot leave a read-only directory behind that the
/// harness is unable to clean up.
// NOTE(review): which step inside write_file_600 fails (creating the tmp
// file or the final rename) depends on its implementation, which is not
// visible here. Permission bits typically do not restrict root, so this
// test is expected to fail at the `is_err` assertion when run as root.
#[test]
#[cfg(unix)]
fn atomic_write_leaves_original_intact_on_partial_failure() {
    use std::os::unix::fs::PermissionsExt;
    let (store, dir) = make_store();
    let info = store.generate(true).unwrap();
    let entry_path = store.entry_path(&info.id);

    // Capture the original bytes for byte-identity comparison.
    let original = fs::read(&entry_path).expect("entry file must exist");
    assert!(!original.is_empty(), "freshly generated entry must be non-empty");

    // Lock the directory: read+execute only, no write.
    let orig_dir_mode = fs::metadata(&dir).unwrap().permissions().mode() & 0o777;
    fs::set_permissions(&dir, fs::Permissions::from_mode(0o500)).unwrap();

    // Attempt a fresh write to the SAME path while the directory is
    // read-only. Hold the result; do not assert on it yet.
    let res = write_file_600(&entry_path, b"new junk that must not land");

    // Restore perms first, so every assertion below runs against (and on
    // failure leaves behind) a writable directory.
    fs::set_permissions(&dir, fs::Permissions::from_mode(orig_dir_mode)).unwrap();

    assert!(res.is_err(), "write_file_600 must fail when dir is read-only");

    // The original key file must be byte-identical to what we captured
    // before the failed write.
    let after = fs::read(&entry_path).expect("entry file must still exist after failed write");
    assert_eq!(
        after, original,
        "failed atomic write must not corrupt the original file",
    );

    // And the keystore must still produce a working signer from it.
    let store2 = Store::open(&dir).unwrap();
    let signer = store2
        .signer(&info.id)
        .expect("original key must still decrypt after a failed write");
    let pae = crate::attestation::pae("text/plain", b"survive");
    assert_eq!(signer.sign(&pae).unwrap().len(), 64);

    // No stale tmp file left behind.
    let tmp = entry_path.with_extension("tmp");
    assert!(!tmp.exists(), "tmp file must be cleaned up after rename failure");

    cleanup(dir);
}
2439
/// H2: a freshly generated entry file must carry mode 0o600, and the
/// atomic write must not leave a `.tmp` sibling behind.
///
/// This test observes the file only after `generate` has returned, so
/// it pins the resulting mode, not the creation-time window itself.
/// NOTE(review): the intent is that the mode is applied at creation
/// (via `OpenOptionsExt::mode`) rather than by a later
/// `set_permissions` -- confirm against `write_file_600`.
#[test]
#[cfg(unix)]
fn mode_is_600_at_creation() {
    use std::os::unix::fs::PermissionsExt;

    let (store, dir) = make_store();
    let generated = store.generate(true).unwrap();
    let key_file = store.entry_path(&generated.id);

    // Keep only the nine permission bits.
    let meta = fs::metadata(&key_file).unwrap();
    let mode = meta.permissions().mode() & 0o777;
    assert_eq!(mode, 0o600, "entry file must be 0600 at creation, got {:o}", mode);

    // The staging file shares the entry's stem with a `.tmp` extension.
    let leftover = key_file.with_extension("tmp");
    assert!(
        !leftover.exists(),
        "no .tmp file must be left behind after a successful atomic write"
    );

    cleanup(dir);
}
2463
/// `Store::fix_perms` must tighten a key directory and key file whose
/// modes were loosened, report both paths among its changes, and leave
/// the key usable for signing afterwards.
#[test]
#[cfg(unix)]
fn fix_perms_repairs_loose_modes() {
    use std::os::unix::fs::PermissionsExt;

    let (store, dir) = make_store();
    let generated = store.generate(true).unwrap();
    let key_path = store.entry_path(&generated.id);

    // Deliberately loosen both the directory and the key file.
    fs::set_permissions(&dir, fs::Permissions::from_mode(0o755)).unwrap();
    fs::set_permissions(&key_path, fs::Permissions::from_mode(0o644)).unwrap();

    // Only the two paths loosened above are asserted on; whether the
    // manifest also shows up depends on how it was written.
    let changes = store.fix_perms().unwrap();
    let dir_repaired = changes.iter().any(|(p, ..)| p == &dir);
    assert!(dir_repaired, "dir should be repaired");
    let key_repaired = changes.iter().any(|(p, ..)| p == &key_path);
    assert!(key_repaired, "key file should be repaired");

    // Re-read both modes from disk rather than trusting the report.
    let dir_meta = fs::metadata(&dir).unwrap();
    let dir_mode = dir_meta.permissions().mode() & 0o777;
    let key_meta = fs::metadata(&key_path).unwrap();
    let key_mode = key_meta.permissions().mode() & 0o777;
    assert_eq!(dir_mode, 0o700);
    assert_eq!(key_mode, 0o600);

    // After repair, signing must work again.
    store
        .signer(&generated.id)
        .expect("signing must work after fix_perms");

    cleanup(dir);
}
2497
2498 // --- TS-2026-001 post-merge fix-up: entry-binding AAD ------------------
2499
/// Regression guard for the entry-binding AAD (post-merge audit fix).
///
/// Threat: a local attacker with write access to the keys directory
/// copies entry B's `enc_priv_key` blob into entry A's JSON envelope.
/// If the AAD covered only the framing, that blob would still decrypt
/// under the shared machine key, and the signer advertised as key A
/// would silently sign with key B's secret.
///
/// The test performs exactly that swap on disk and requires loading
/// the signer for A to fail with a MAC-verification `KeyError::Crypto`.
#[test]
fn cross_entry_swap_fails_decryption() {
    let (store, dir) = make_store();

    // Two independent keys living in the same store.
    let a = store.generate(true).unwrap();
    let b = store.generate(false).unwrap();

    // Load both envelopes exactly as they sit on disk.
    let path_a = store.entry_path(&a.id);
    let path_b = store.entry_path(&b.id);
    let raw_a = fs::read(&path_a).unwrap();
    let entry_a: EncryptedEntry = serde_json::from_slice(&raw_a).unwrap();
    let raw_b = fs::read(&path_b).unwrap();
    let entry_b: EncryptedEntry = serde_json::from_slice(&raw_b).unwrap();

    // Sanity: both blobs carry the v2 framing header...
    for entry in [&entry_a, &entry_b] {
        assert_eq!(entry.enc_priv_key[0], KEYSTORE_MAGIC);
        assert_eq!(entry.enc_priv_key[1], KEYSTORE_VERSION_V2);
    }
    // ...and the two ciphertexts really are different.
    assert_ne!(
        entry_a.enc_priv_key, entry_b.enc_priv_key,
        "two freshly-generated entries must have distinct ciphertexts"
    );

    // The attack: A's envelope (id, public_key, algorithm untouched)
    // now carries B's encrypted blob. The blob is swapped wholesale, so
    // whatever travels inline with the v2 ciphertext is swapped too.
    let mut forged = entry_a;
    forged.enc_priv_key = entry_b.enc_priv_key;
    let forged_json = serde_json::to_vec_pretty(&forged).unwrap();
    fs::write(&path_a, forged_json).unwrap();

    // Open a fresh Store so nothing held in memory by `store` can mask
    // the on-disk tamper, then demand a crypto/MAC failure rather
    // than some unrelated error that could hide this class of bug.
    let reopened = Store::open(&dir).unwrap();
    match reopened.signer(&a.id) {
        Ok(_) => panic!(
            "swapping B's ciphertext into A's envelope must fail decrypt; \
             got Ok which means the signer would silently sign with key B"
        ),
        Err(KeyError::Crypto(msg)) => assert!(
            msg.contains("MAC verification failed"),
            "swap must surface MAC failure; got: {msg}"
        ),
        Err(other) => panic!("expected Crypto MAC error, got: {other:?}"),
    }

    cleanup(dir);
}
2573
/// Companion to `cross_entry_swap_fails_decryption`: the entry id is
/// part of the AAD as well, so changing the JSON `id` while keeping
/// the ciphertext untouched must make decryption fail. This models an
/// attacker relabelling a stolen entry with a victim's id without
/// re-encrypting it.
#[test]
fn aad_tampered_entry_id_fails_decryption() {
    let (store, dir) = make_store();
    let info = store.generate(true).unwrap();
    let path = store.entry_path(&info.id);

    let on_disk = fs::read(&path).unwrap();
    let mut entry: EncryptedEntry = serde_json::from_slice(&on_disk).unwrap();
    assert_eq!(entry.id, info.id, "sanity: id matches what generate returned");

    // Forge the id inside the envelope and write it back to the SAME
    // path, so the file name still corresponds to the original id.
    entry.id = String::from("key_attacker_substituted_id");
    let forged_json = serde_json::to_vec_pretty(&entry).unwrap();
    fs::write(&path, forged_json).unwrap();

    // No file exists at the entry path for the forged id, so loading
    // through the Store by that id would only exercise a lookup miss.
    // Instead, call decrypt_from_disk directly with the forged id and
    // the otherwise-untouched envelope fields, using the machine key
    // of a freshly opened Store.
    let store2 = Store::open(&dir).unwrap();
    let outcome = decrypt_from_disk(
        &store2.machine_key,
        &entry.id,         // tampered id (bound into AAD)
        &entry.public_key, // original pubkey
        &entry.enc_priv_key,
        &entry.nonce,
    );
    assert!(
        outcome.is_err(),
        "AAD-bound entry id mismatch must fail decrypt; got Ok"
    );

    cleanup(dir);
}
2621}