treeship_core/keys/
mod.rs

1use std::{
2    collections::HashMap,
3    fs,
4    io::{self, Read, Write},
5    path::{Path, PathBuf},
6    sync::{Arc, RwLock},
7};
8
9use aes_gcm::{
10    aead::{Aead, KeyInit, OsRng as AeadOsRng, Payload},
11    AeadCore, Aes256Gcm, Key as AesKey, Nonce,
12};
13use rand::RngCore;
14use serde::{Deserialize, Serialize};
15use sha2::{Digest as Sha2Digest, Sha256};
16use zeroize::Zeroizing;
17
18use crate::attestation::{Ed25519Signer, Signer};
19
20// --- Public types ---
21
/// Identifier of a key in the keystore. Also used as the stem of the
/// key's on-disk entry file (`<id>.json`).
pub type KeyId = String;
23
/// Public information about a stored key. Never contains private material.
///
/// Returned by `Store::generate`, `Store::rotate` (inside `RotationResult`)
/// and `Store::list`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KeyInfo {
    /// Key identifier; also names the key's entry file in the keystore.
    pub id:          KeyId,
    /// Signature algorithm name. Keys minted by this module use "ed25519".
    pub algorithm:   String,   // "ed25519"
    /// True when this key is the manifest's default signing key.
    pub is_default:  bool,
    /// Creation time of the key.
    pub created_at:  String,   // RFC 3339
    /// First 8 bytes of sha256(public_key), hex-encoded.
    pub fingerprint: String,
    /// Public half of the keypair.
    pub public_key:  Vec<u8>,  // raw 32-byte Ed25519 public key
    /// RFC 3339 timestamp after which signatures by this key should be
    /// considered stale. `None` means the key has not been rotated and is
    /// indefinitely valid. Set automatically by `Store::rotate` to
    /// `now + grace_period` on the predecessor key.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub valid_until: Option<String>,
    /// If this key was rotated to a successor, the successor's key id.
    /// Lets verifiers walk a rotation chain forward when validating an old
    /// receipt against the current keystore. `None` means this is the head
    /// of its chain.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub successor_key_id: Option<KeyId>,
}
47
/// Outcome of a `Store::rotate` call.
///
/// Both `KeyInfo` values describe the state after the rotation has been
/// persisted: the predecessor already carries its deadline and forward
/// link, and `is_default` on each reflects the manifest written by the call.
#[derive(Debug, Clone)]
pub struct RotationResult {
    /// The key that was rotated. Its `valid_until` is now set.
    pub predecessor: KeyInfo,
    /// The freshly minted successor key.
    pub successor: KeyInfo,
    /// RFC 3339 timestamp until which the predecessor remains valid for
    /// signature verification under the grace period. Equal to
    /// `predecessor.valid_until.unwrap()`.
    pub grace_period_until: String,
}
60
/// Errors from keystore operations.
#[derive(Debug)]
pub enum KeyError {
    /// Filesystem failure while reading or writing keystore files.
    Io(io::Error),
    /// A keystore file could not be serialized to or parsed from JSON.
    Json(serde_json::Error),
    /// Cryptographic or keystore-consistency failure; the string is the
    /// human-readable reason (key generation, encryption, decryption,
    /// refused rotation, suspected chain loop).
    Crypto(String),
    /// No entry exists for the given key id.
    NotFound(KeyId),
    /// A key id was empty where a non-empty id is required.
    EmptyKeyId,
    /// The manifest has no default key set.
    NoDefaultKey,
    /// Private key file has insecure permissions (group- or world-readable).
    /// Carries the path and the observed octal mode so the caller can show
    /// an actionable error. Set `TREESHIP_ALLOW_INSECURE_KEY_PERMS=1` to
    /// bypass during testing or controlled environments.
    InsecureKeyPerms { path: PathBuf, mode: u32 },
}
76
77impl std::fmt::Display for KeyError {
78    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
79        match self {
80            Self::Io(e)       => write!(f, "keys io: {}", e),
81            Self::Json(e)     => write!(f, "keys json: {}", e),
82            Self::Crypto(e)   => write!(f, "keys crypto: {}", e),
83            Self::NotFound(k) => write!(f, "key not found: {}", k),
84            Self::EmptyKeyId  => write!(f, "key id must not be empty"),
85            Self::NoDefaultKey => write!(f, "no default key — run treeship init"),
86            Self::InsecureKeyPerms { path, mode } => write!(
87                f,
88                "private key {} has insecure permissions (mode {:o}); \
89                 run `treeship doctor --fix` or chmod 600 the file. \
90                 Set TREESHIP_ALLOW_INSECURE_KEY_PERMS=1 to bypass.",
91                path.display(),
92                mode & 0o777,
93            ),
94        }
95    }
96}
97
98impl std::error::Error for KeyError {}
99impl From<io::Error>          for KeyError { fn from(e: io::Error)          -> Self { Self::Io(e) } }
100impl From<serde_json::Error>  for KeyError { fn from(e: serde_json::Error)  -> Self { Self::Json(e) } }
101
102// --- On-disk formats ---
103
/// The encrypted representation of one keypair on disk.
///
/// Serialized as pretty-printed JSON into `<id>.json` inside the keystore
/// directory and mirrored in the `Store`'s in-memory cache.
#[derive(Serialize, Deserialize, Clone)]
struct EncryptedEntry {
    /// Key identifier; matches the entry file's stem.
    id:           KeyId,
    /// Signature algorithm name ("ed25519" for keys minted by this module).
    algorithm:    String,
    /// RFC 3339 creation timestamp.
    created_at:   String,
    /// Public key bytes, stored in the clear.
    public_key:   Vec<u8>,
    /// Encrypted 32-byte Ed25519 secret. Entries written by this version
    /// ("v2") hold AES-256-GCM output with the nonce carried inline;
    /// entries written by older versions hold the legacy v1 format and
    /// are re-encrypted to v2 by `Store::signer` on first use.
    enc_priv_key: Vec<u8>,
    /// Separate nonce used by legacy v1 entries. Entries written in the
    /// v2 format leave this empty because their nonce lives inside
    /// `enc_priv_key`.
    nonce:        Vec<u8>,
    /// RFC 3339 timestamp after which signatures by this key should be
    /// considered stale. `None` means the key is indefinitely valid.
    /// Defaulted on deserialization so pre-0.9.5 entry files still load.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    valid_until: Option<String>,
    /// Successor key id if this key was rotated. Defaulted on
    /// deserialization for pre-0.9.5 entry files.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    successor_key_id: Option<KeyId>,
}
125
/// The manifest file: which keys exist and which is the default.
///
/// Stored as `manifest.json` in the keystore directory. A missing file is
/// treated as the `Default` value (no keys, no default).
#[derive(Serialize, Deserialize, Default)]
struct Manifest {
    /// Id of the key used by `Store::default_signer`; `None` until a key
    /// has been generated or promoted.
    default_key_id: Option<KeyId>,
    /// Ids of all registered keys, in the order they were added.
    key_ids:        Vec<KeyId>,
}
132
133// --- Store ---
134
/// Local encrypted keystore.
///
/// Private keys are encrypted with AES-256-GCM (RustCrypto `aes-gcm`
/// 0.10) before writing to disk. The encryption key is derived from a
/// machine-specific secret so key files are useless if copied to
/// another machine.
///
/// Pre-v0.10.3 keystores used a homemade SHA-256-CTR + HMAC-SHA-256
/// construction (TS-2026-001) and are transparently migrated to the
/// new AEAD format on first decrypt; see `encrypt_for_disk_v2` /
/// `decrypt_from_disk` for the format dispatcher.
///
/// A future version will delegate to OS credential stores (Secure
/// Enclave / TPM 2.0).
pub struct Store {
    /// Keystore directory holding `manifest.json` and one `<id>.json`
    /// file per key.
    dir:         PathBuf,
    /// Key-encryption key, derived once in `Store::open`.
    machine_key: [u8; 32],
    /// In-memory cache — avoids disk reads on hot paths.
    /// Filled by the write paths (`generate`, `rotate`, migration);
    /// plain reads that miss the cache do not populate it.
    cache:       Arc<RwLock<HashMap<KeyId, EncryptedEntry>>>,
}
155
156impl Store {
157    /// Opens or creates a keystore at `dir`.
158    pub fn open(dir: impl AsRef<Path>) -> Result<Self, KeyError> {
159        let dir = dir.as_ref().to_path_buf();
160        fs::create_dir_all(&dir)?;
161
162        let machine_key = derive_machine_key(&dir)?;
163
164        Ok(Self {
165            dir,
166            machine_key,
167            cache: Arc::new(RwLock::new(HashMap::new())),
168        })
169    }
170
171    /// Generates a new Ed25519 keypair, encrypts and stores it.
172    /// If `set_default` is true (or there is no current default), makes
173    /// this key the default signing key.
174    pub fn generate(&self, set_default: bool) -> Result<KeyInfo, KeyError> {
175        let key_id = new_key_id();
176
177        let signer = Ed25519Signer::generate(&key_id)
178            .map_err(|e| KeyError::Crypto(e.to_string()))?;
179
180        let secret  = signer.secret_bytes();
181        let pub_key = signer.public_key_bytes();
182
183        let enc = encrypt_for_disk_v2(&self.machine_key, key_id.as_str(), &pub_key, &secret)
184            .map_err(KeyError::Crypto)?;
185
186        let entry = EncryptedEntry {
187            id:               key_id.clone(),
188            algorithm:        "ed25519".into(),
189            created_at:       crate::statements::unix_to_rfc3339(unix_now()),
190            public_key:       pub_key.clone(),
191            enc_priv_key:     enc,
192            // v2 ciphertexts carry their nonce inline (bytes [2..14]).
193            // The separate `nonce` field is retained for v1 legacy
194            // compatibility; for fresh v2 entries we serialize an empty
195            // vec so the JSON stays well-formed.
196            nonce:            Vec::new(),
197            valid_until:      None,
198            successor_key_id: None,
199        };
200
201        self.write_entry(&entry)?;
202
203        // Update manifest.
204        let mut manifest = self.read_manifest()?;
205        manifest.key_ids.push(key_id.clone());
206        if set_default || manifest.default_key_id.is_none() {
207            manifest.default_key_id = Some(key_id.clone());
208        }
209        self.write_manifest(&manifest)?;
210
211        // Populate cache.
212        self.cache.write().unwrap().insert(key_id.clone(), entry);
213
214        Ok(KeyInfo {
215            id:               key_id.clone(),
216            algorithm:        "ed25519".into(),
217            is_default:       manifest.default_key_id.as_deref() == Some(key_id.as_str()),
218            created_at:       crate::statements::unix_to_rfc3339(unix_now()),
219            fingerprint:      fingerprint(&pub_key),
220            public_key:       pub_key,
221            valid_until:      None,
222            successor_key_id: None,
223        })
224    }
225
226    /// Rotate the current default key (or a specific key) to a freshly
227    /// generated successor.
228    ///
229    /// Mints a new Ed25519 keypair, links the predecessor to it via
230    /// `successor_key_id`, and stamps the predecessor with a `valid_until`
231    /// of `now + grace_period`. The grace window lets verifiers continue to
232    /// accept signatures from the predecessor while clients catch up to
233    /// the new public key.
234    ///
235    /// If `set_default` is true (the typical case -- you rotate because you
236    /// want to start signing with the new key immediately), the successor
237    /// becomes the default. Pass `false` to stage a rotation for review
238    /// without flipping the active signer.
239    ///
240    /// `predecessor_id` may be `None` to rotate the current default. Pass
241    /// an explicit id to rotate a non-default key (e.g. a per-environment
242    /// secondary).
243    ///
244    /// Note on threat model: this is a graceful rotation primitive, not a
245    /// revocation primitive. If the predecessor key is suspected compromised
246    /// the grace_period should be `Duration::ZERO` (or use a future
247    /// `revoke()` call once that lands) so the predecessor's `valid_until`
248    /// is in the past and any verifier honoring the metadata refuses
249    /// further signatures from it.
250    pub fn rotate(
251        &self,
252        predecessor_id: Option<&str>,
253        grace_period: std::time::Duration,
254        set_default: bool,
255    ) -> Result<RotationResult, KeyError> {
256        // Resolve predecessor: explicit id, else the current default.
257        let pred_id = match predecessor_id {
258            Some(id) => id.to_string(),
259            None => self.default_key_id()?,
260        };
261
262        // Refuse to rotate a key that has already been rotated -- the
263        // chain head is the only valid rotation source. This makes the
264        // operation idempotent in the face of accidental re-runs.
265        let pred_entry_existing = self.load_entry(&pred_id)?;
266        if let Some(existing) = &pred_entry_existing.successor_key_id {
267            return Err(KeyError::Crypto(format!(
268                "key {pred_id} has already been rotated to {existing}; \
269                 rotate the chain head instead"
270            )));
271        }
272
273        // Mint the successor. We deliberately do NOT call `self.generate()`
274        // because that path also updates the manifest's default. We need a
275        // single transactional update that sets both predecessor metadata
276        // AND (optionally) the new default in one manifest write.
277        let succ_id = new_key_id();
278        let signer = Ed25519Signer::generate(&succ_id)
279            .map_err(|e| KeyError::Crypto(e.to_string()))?;
280        let succ_secret  = signer.secret_bytes();
281        let succ_pub_key = signer.public_key_bytes();
282        let succ_enc =
283            encrypt_for_disk_v2(&self.machine_key, succ_id.as_str(), &succ_pub_key, &succ_secret)
284                .map_err(KeyError::Crypto)?;
285
286        let succ_created = crate::statements::unix_to_rfc3339(unix_now());
287        let succ_entry = EncryptedEntry {
288            id:               succ_id.clone(),
289            algorithm:        "ed25519".into(),
290            created_at:       succ_created.clone(),
291            public_key:       succ_pub_key.clone(),
292            enc_priv_key:     succ_enc,
293            // v2 ciphertexts carry their nonce inline; the legacy
294            // `nonce` field is left empty for fresh writes.
295            nonce:            Vec::new(),
296            valid_until:      None,
297            successor_key_id: None,
298        };
299
300        // Stamp the predecessor with the grace deadline and link forward.
301        let valid_until = crate::statements::unix_to_rfc3339(
302            unix_now() + grace_period.as_secs(),
303        );
304        let mut pred_entry = pred_entry_existing;
305        pred_entry.valid_until      = Some(valid_until.clone());
306        pred_entry.successor_key_id = Some(succ_id.clone());
307
308        // Write order matters for partial-failure recovery. Persist the
309        // successor entry FIRST, then stamp the predecessor pointing at
310        // it. If we wrote the predecessor first and then the successor
311        // write failed, the predecessor's successor_key_id would dangle
312        // at a key that doesn't exist on disk -- and the
313        // already-been-rotated guard would refuse to retry. With this
314        // order:
315        //   - successor write fails: nothing observable changed; retry clean.
316        //   - predecessor write fails: orphan successor key file on disk
317        //     (not yet referenced by manifest or by any other key); retry
318        //     generates a new successor and the orphan is harmless.
319        //   - manifest write fails: predecessor + successor both on disk,
320        //     manifest stale; retry's already-rotated guard catches the
321        //     half-finished state and surfaces a clear error.
322        self.write_entry(&succ_entry)?;
323        self.write_entry(&pred_entry)?;
324
325        // Refresh the cache to mirror the on-disk state we just wrote --
326        // BEFORE the manifest update. If the manifest write fails, the
327        // cache must still match disk so a same-process retry sees the
328        // half-rotated state and the already-rotated guard fires
329        // correctly. Doing this AFTER write_manifest would leave a
330        // window where disk reflects the rotation but the in-memory
331        // cache still serves the unstamped predecessor, and a retry
332        // from the same Store instance would generate a duplicate
333        // successor -- defeating the whole point of the guard.
334        {
335            let mut cache = self.cache.write().unwrap();
336            cache.insert(pred_entry.id.clone(), pred_entry.clone());
337            cache.insert(succ_id.clone(),       succ_entry.clone());
338        }
339
340        // Update the manifest: register the new key, optionally promote it.
341        let mut manifest = self.read_manifest()?;
342        manifest.key_ids.push(succ_id.clone());
343        if set_default {
344            manifest.default_key_id = Some(succ_id.clone());
345        }
346        self.write_manifest(&manifest)?;
347
348        let default_id = manifest.default_key_id.clone();
349        let predecessor = KeyInfo {
350            id:               pred_entry.id.clone(),
351            algorithm:        pred_entry.algorithm.clone(),
352            is_default:       default_id.as_deref() == Some(pred_entry.id.as_str()),
353            created_at:       pred_entry.created_at.clone(),
354            fingerprint:      fingerprint(&pred_entry.public_key),
355            public_key:       pred_entry.public_key.clone(),
356            valid_until:      pred_entry.valid_until.clone(),
357            successor_key_id: pred_entry.successor_key_id.clone(),
358        };
359        let successor = KeyInfo {
360            id:               succ_id.clone(),
361            algorithm:        "ed25519".into(),
362            is_default:       default_id.as_deref() == Some(succ_id.as_str()),
363            created_at:       succ_created,
364            fingerprint:      fingerprint(&succ_pub_key),
365            public_key:       succ_pub_key,
366            valid_until:      None,
367            successor_key_id: None,
368        };
369
370        Ok(RotationResult {
371            predecessor,
372            successor,
373            grace_period_until: valid_until,
374        })
375    }
376
377    /// Walk the rotation chain forward from `id`, returning the ordered
378    /// list of key ids: `[id, successor_of_id, ...]`. The first element is
379    /// always `id` itself. Stops at a key with no `successor_key_id`.
380    pub fn successor_chain(&self, id: &str) -> Result<Vec<KeyId>, KeyError> {
381        let mut chain = Vec::new();
382        let mut cursor = id.to_string();
383        // Cap iterations at the manifest size to defend against a corrupt
384        // chain that loops back on itself. A well-formed chain is bounded
385        // by the number of keys in the keystore.
386        let max_steps = self.read_manifest()?.key_ids.len() + 1;
387        for _ in 0..max_steps {
388            chain.push(cursor.clone());
389            let entry = self.load_entry(&cursor)?;
390            match entry.successor_key_id {
391                Some(next) => cursor = next,
392                None => return Ok(chain),
393            }
394        }
395        Err(KeyError::Crypto(format!(
396            "rotation chain starting at {id} exceeds keystore size; suspected loop"
397        )))
398    }
399
400    /// Returns the `KeyInfo` for every key whose `valid_until` is either
401    /// unset or strictly after `at_unix_secs`. The result includes both
402    /// rotated-but-still-in-grace predecessors and never-rotated keys.
403    /// Useful for building a verifier's accept-set as of a given time.
404    pub fn valid_keys_at(&self, at_unix_secs: u64) -> Result<Vec<KeyInfo>, KeyError> {
405        let cutoff_rfc = crate::statements::unix_to_rfc3339(at_unix_secs);
406        Ok(self.list()?
407            .into_iter()
408            .filter(|k| match &k.valid_until {
409                None => true,
410                Some(until) => until.as_str() > cutoff_rfc.as_str(),
411            })
412            .collect())
413    }
414
415    /// Returns a boxed `Signer` for the current default key.
416    pub fn default_signer(&self) -> Result<Box<dyn Signer>, KeyError> {
417        let manifest = self.read_manifest()?;
418        let id = manifest.default_key_id.ok_or(KeyError::NoDefaultKey)?;
419        self.signer(&id)
420    }
421
422    /// Returns a boxed `Signer` for a specific key ID.
423    ///
424    /// Refuses to load if the on-disk key file has insecure permissions
425    /// (any group or world bits). This is the choke point for *all*
426    /// signing — public-key reads and successor lookups go through
427    /// `read_entry` / `public_key` and are not affected.
428    ///
429    /// Bypass with `TREESHIP_ALLOW_INSECURE_KEY_PERMS=1` for controlled
430    /// environments (CI sandboxes, recovery flows). The bypass should
431    /// not be set in normal operation.
432    pub fn signer(&self, id: &str) -> Result<Box<dyn Signer>, KeyError> {
433        check_key_file_perms(&self.entry_path(id))?;
434
435        let entry = self.load_entry(id)?;
436
437        // Dispatcher: v2 ciphertexts start with magic 0x54, version 0x02
438        // and use real AES-256-GCM. Older entries fall through to the
439        // legacy SHA-256-CTR+HMAC path (`decrypt_legacy_v1`) and are
440        // transparently re-encrypted in the new format below.
441        let was_legacy = is_legacy_v1(&entry.enc_priv_key);
442        let secret = decrypt_from_disk(
443            &self.machine_key,
444            &entry.id,
445            &entry.public_key,
446            &entry.enc_priv_key,
447            &entry.nonce,
448        )
449            .map_err(|e| self.enrich_crypto_error(e))?;
450
451        // L3: wrap the on-stack copy of the decrypted secret in a
452        // `Zeroizing` so the byte buffer is wiped on drop. `secret`
453        // itself is already a `Zeroizing<Vec<u8>>` returned by
454        // `decrypt_from_disk`, but `try_into::<[u8; 32]>` produces an
455        // independent stack-allocated array that the Vec's Drop will
456        // not cover. Without this wrapper, returning from `signer()`
457        // would leave the secret scalar in stale stack memory until
458        // a future stack frame happens to overwrite it.
459        let secret_arr: Zeroizing<[u8; 32]> = Zeroizing::new(
460            secret.as_slice().try_into()
461                .map_err(|_| KeyError::Crypto("decrypted key is wrong length".into()))?
462        );
463
464        // Transparent migration: if this entry was still in the legacy
465        // v1 format (the broken SHA-256-CTR construction from
466        // TS-2026-001), re-encrypt it with v2 AES-256-GCM and rewrite
467        // the file. We do this best-effort -- a migration failure here
468        // must NOT block signing for the current call, since the
469        // in-memory secret is already valid. The next decrypt on a
470        // fresh process will retry.
471        if was_legacy {
472            if let Err(e) = self.migrate_entry_to_v2(&entry, &secret_arr) {
473                // Surface the failure as a tracing-style stderr note
474                // rather than an error -- the user's signing flow is
475                // unaffected, and we'd rather them know about it than
476                // wedge the call.
477                eprintln!(
478                    "treeship: keystore entry {} could not be migrated \
479                     from legacy v1 format to v2 ({}); will retry next \
480                     load",
481                    entry.id, e
482                );
483            }
484        }
485
486        let signer = Ed25519Signer::from_bytes(&entry.id, &secret_arr)
487            .map_err(|e| KeyError::Crypto(e.to_string()))?;
488
489        Ok(Box::new(signer))
490    }
491
492    /// Re-encrypt a legacy v1 entry with the new v2 AEAD and persist
493    /// it. Updates the in-memory cache so subsequent loads in the same
494    /// process see the migrated entry. Idempotent; safe to invoke
495    /// concurrently because the migration is serialized by a per-entry
496    /// advisory lock on `<entry>.migrate.lock` (TS-2026-001 H3).
497    ///
498    /// We lock a *sentinel* file rather than the entry file itself,
499    /// because the entry file is renamed-into-place during the atomic
500    /// write inside `write_entry`. Holding a flock on the entry's inode
501    /// while a sibling process renames a new inode into its path is
502    /// nonsensical (the lock would survive on the now-orphaned inode);
503    /// the sentinel sidecar has a stable identity for the whole
504    /// migration window.
505    ///
506    /// Same blocking-flock pattern as `packages/core/src/session/event_log.rs`
507    /// (Lane F): exclusive lock, then a same-thread re-read to settle
508    /// "did a peer already migrate while I was waiting?" cleanly.
509    fn migrate_entry_to_v2(
510        &self,
511        old_entry: &EncryptedEntry,
512        secret: &[u8; 32],
513    ) -> Result<(), KeyError> {
514        let entry_path = self.entry_path(&old_entry.id);
515        let lock_path = entry_path.with_extension("migrate.lock");
516
517        // Open (or create) the sentinel lock file with restrictive perms
518        // and take an exclusive flock. We intentionally use the blocking
519        // `lock_exclusive` -- not `try_lock_exclusive` -- because the
520        // migration window is short (a single AEAD encrypt + atomic
521        // rename) and the worst case under contention is one writer
522        // serialized behind another. Pulling the
523        // try-with-bounded-retry pattern in here would buy us nothing:
524        // the second writer's re-read after the lock releases would
525        // observe the now-v2 entry and short-circuit.
526        let lock_file = open_migration_lock_file(&lock_path)
527            .map_err(KeyError::Io)?;
528
529        #[cfg(not(target_family = "wasm"))]
530        {
531            use fs2::FileExt;
532            lock_file.lock_exclusive().map_err(KeyError::Io)?;
533        }
534
535        // Under the lock: did a peer already complete the migration
536        // while we were waiting? If so, our work is done -- we must
537        // NOT rewrite, because we'd overwrite a peer's freshly-rotated
538        // v2 ciphertext with our own (semantically equivalent, but
539        // unnecessary I/O and an unnecessary cache update).
540        if let Ok(current) = self.read_entry(&old_entry.id) {
541            if !is_legacy_v1(&current.enc_priv_key) {
542                // Peer already migrated. Refresh the cache so subsequent
543                // loads in this process see the v2 entry rather than
544                // the stale legacy copy our caller passed in.
545                if let Ok(mut cache) = self.cache.write() {
546                    cache.insert(current.id.clone(), current);
547                }
548                // Lock drops at function exit; sentinel file remains on
549                // disk as a harmless inode (no migration data, idempotent
550                // for future invocations).
551                return Ok(());
552            }
553        }
554
555        let new_ciphertext = encrypt_for_disk_v2(
556            &self.machine_key,
557            &old_entry.id,
558            &old_entry.public_key,
559            secret,
560        )
561        .map_err(KeyError::Crypto)?;
562
563        let migrated = EncryptedEntry {
564            id:               old_entry.id.clone(),
565            algorithm:        old_entry.algorithm.clone(),
566            created_at:       old_entry.created_at.clone(),
567            public_key:       old_entry.public_key.clone(),
568            enc_priv_key:     new_ciphertext,
569            // v2 carries the nonce inline; clear the legacy field.
570            nonce:            Vec::new(),
571            valid_until:      old_entry.valid_until.clone(),
572            successor_key_id: old_entry.successor_key_id.clone(),
573        };
574
575        self.write_entry(&migrated)?;
576        if let Ok(mut cache) = self.cache.write() {
577            cache.insert(migrated.id.clone(), migrated);
578        }
579
580        // Best-effort cleanup of the sentinel lock file. We hold the
581        // lock until function exit (drop), so by the time we reach
582        // here it is safe to unlink the inode -- future migrations
583        // for this entry will succeed via the early-return path
584        // because the entry is now v2. Leaving the sentinel behind is
585        // also harmless; on Unix removing a flocked file is allowed
586        // and the lock is released on fd drop regardless.
587        let _ = std::fs::remove_file(&lock_path);
588
589        // Keep the lock_file binding alive to function exit so the
590        // flock is held across write_entry + remove_file. Explicit
591        // drop makes the intent obvious to readers.
592        drop(lock_file);
593        Ok(())
594    }
595
596    /// Wrap a bare crypto error (typically "MAC verification failed ..." from
597    /// the AES-GCM decrypt path) with a diagnostic and an actionable recovery
598    /// path.
599    ///
600    /// The common failure mode in the wild is a pre-0.9.x keystore whose
601    /// machine-key derivation was seed-file-based. Later versions derive
602    /// the machine key from hostname+username (macOS) or /etc/machine-id
603    /// (Linux), so old ciphertexts can't be MAC-verified with the new key.
604    /// Detecting that case is best-effort: the presence of a legacy seed
605    /// file (`.machineseed` or `machine_seed` inside the keys dir) is a
606    /// strong hint. If we see one, call it out explicitly.
607    fn enrich_crypto_error(&self, raw: String) -> KeyError {
608        // Only enrich on MAC failures -- other errors (I/O, wrong length) are
609        // surfaced as-is because their remediation differs.
610        if !raw.contains("MAC verification failed") {
611            return KeyError::Crypto(raw);
612        }
613
614        let legacy_seed_dot = self.dir.join(".machineseed");
615        let legacy_seed     = self.dir.join("machine_seed");
616        let has_legacy_seed = legacy_seed_dot.exists() || legacy_seed.exists();
617
618        let diagnosis = if has_legacy_seed {
619            "your keystore was created by an older Treeship version whose \
620             machine-key derivation has since changed. The ciphertext is \
621             intact but cannot be decrypted under the current derivation."
622        } else {
623            "the keystore cannot be decrypted. Usual causes: the key file \
624             was copied from a different machine, the hostname or username \
625             changed, or the file was corrupted."
626        };
627
628        // Resolve the user's ~/.treeship path for the recovery command, so
629        // we give a copy-pasteable command rather than a generic instruction.
630        let ts_dir = std::env::var("HOME")
631            .map(|h| format!("{h}/.treeship"))
632            .unwrap_or_else(|_| "~/.treeship".into());
633
634        // The outer KeyError::Crypto Display impl already prepends
635        // "keys crypto: "; don't double it. Start with the raw MAC error
636        // so the user still sees the underlying cryptographic reason,
637        // then follow with the human-readable diagnosis and recovery.
638        let msg = format!(
639            "{raw}\n\n  \
640             Diagnosis: {diagnosis}\n\n  \
641             Recovery (nondestructive -- the old keystore is moved aside, \
642             not deleted; any sealed .treeship packages you produced remain \
643             verifiable since their receipts embed the old public key):\n\n    \
644             mv {ts_dir} {ts_dir}.bak.$(date +%s)\n    \
645             treeship init\n"
646        );
647
648        KeyError::Crypto(msg)
649    }
650
651    /// Returns the default key ID.
652    pub fn default_key_id(&self) -> Result<KeyId, KeyError> {
653        self.read_manifest()?
654            .default_key_id
655            .ok_or(KeyError::NoDefaultKey)
656    }
657
658    /// Lists all keys.
659    pub fn list(&self) -> Result<Vec<KeyInfo>, KeyError> {
660        let manifest = self.read_manifest()?;
661        let default  = manifest.default_key_id.as_deref().unwrap_or("");
662
663        manifest.key_ids.iter().map(|id| {
664            let entry = self.load_entry(id)?;
665            Ok(KeyInfo {
666                id:               entry.id.clone(),
667                algorithm:        entry.algorithm.clone(),
668                is_default:       entry.id == default,
669                created_at:       entry.created_at.clone(),
670                fingerprint:      fingerprint(&entry.public_key),
671                public_key:       entry.public_key.clone(),
672                valid_until:      entry.valid_until.clone(),
673                successor_key_id: entry.successor_key_id.clone(),
674            })
675        }).collect()
676    }
677
678    /// Sets the default signing key.
679    pub fn set_default(&self, id: &str) -> Result<(), KeyError> {
680        // Verify the key exists before updating the manifest.
681        self.load_entry(id)?;
682        let mut manifest = self.read_manifest()?;
683        manifest.default_key_id = Some(id.to_string());
684        self.write_manifest(&manifest)
685    }
686
687    /// Returns the public key bytes for a key ID.
688    pub fn public_key(&self, id: &str) -> Result<Vec<u8>, KeyError> {
689        Ok(self.load_entry(id)?.public_key)
690    }
691
692    // --- private ---
693
694    fn load_entry(&self, id: &str) -> Result<EncryptedEntry, KeyError> {
695        // Check cache first.
696        if let Ok(cache) = self.cache.read() {
697            if let Some(entry) = cache.get(id) {
698                return Ok(entry.clone());
699            }
700        }
701        self.read_entry(id)
702    }
703
704    fn entry_path(&self, id: &str) -> PathBuf {
705        self.dir.join(format!("{}.json", id))
706    }
707
708    fn write_entry(&self, entry: &EncryptedEntry) -> Result<(), KeyError> {
709        let path = self.entry_path(&entry.id);
710        let json = serde_json::to_vec_pretty(entry)?;
711        write_file_600(&path, &json)?;
712        Ok(())
713    }
714
715    fn read_entry(&self, id: &str) -> Result<EncryptedEntry, KeyError> {
716        let path = self.entry_path(id);
717        if !path.exists() {
718            return Err(KeyError::NotFound(id.to_string()));
719        }
720        let bytes = fs::read(&path)?;
721        let entry: EncryptedEntry = serde_json::from_slice(&bytes)?;
722        Ok(entry)
723    }
724
725    fn manifest_path(&self) -> PathBuf {
726        self.dir.join("manifest.json")
727    }
728
729    fn read_manifest(&self) -> Result<Manifest, KeyError> {
730        let path = self.manifest_path();
731        if !path.exists() {
732            return Ok(Manifest::default());
733        }
734        let bytes = fs::read(&path)?;
735        Ok(serde_json::from_slice(&bytes)?)
736    }
737
738    fn write_manifest(&self, m: &Manifest) -> Result<(), KeyError> {
739        let json = serde_json::to_vec_pretty(m)?;
740        write_file_600(&self.manifest_path(), &json)?;
741        Ok(())
742    }
743}
744
745// --- Crypto helpers ---
746//
747// AEAD choice: AES-256-GCM via the RustCrypto `aes-gcm` 0.10 crate.
748// Reasons:
749//   - Matches the original (documented but never implemented) intent of
750//     the keystore, so audit reports and SECURITY.md don't need to be
751//     re-anchored on a different primitive.
752//   - Well-audited, widely deployed, no platform gotchas.
753//   - `chacha20poly1305` would have been a defensible alternative
754//     (slightly better software performance), but the migration cost of
755//     changing the documented primitive while we already have to ship a
756//     migration for the broken construction is not worth it.
757//
758// On-disk v2 format (`encrypt_for_disk_v2`):
759//   [ magic = 0x54 ('T') ]   1 byte
760//   [ version = 0x02     ]   1 byte
761//   [ nonce              ]  12 bytes (random per encryption)
762//   [ ciphertext || tag  ]  N + 16 bytes (tag appended by aead crate)
763//
764// The first byte (0x54) is a structural sentinel so we can dispatch on
765// the format without relying on length heuristics. v1 ciphertexts start
766// with the first byte of their random nonce, so the chance of an
767// accidental v1 entry that looks like v2 is ~1/2^16 (matching both magic
768// AND version byte) and we still re-validate by AEAD-decrypting; if the
769// AEAD fails on something that looks like v2, we fall back to v1.
770
/// First byte of every v2 keystore blob: ASCII `'T'` (0x54).
const KEYSTORE_MAGIC: u8 = b'T';
/// Format version byte that follows the magic in a v2 blob.
const KEYSTORE_VERSION_V2: u8 = 2;
773
774/// Build the v2 keystore AEAD AAD.
775///
776/// The AAD binds two things into the GCM tag beyond ciphertext+nonce:
777///
778/// 1. **Framing prefix** (`[KEYSTORE_MAGIC, KEYSTORE_VERSION_V2]`) so
779///    flipping the magic or version byte on disk surfaces as a MAC
780///    failure rather than dispatcher confusion (the M2 audit finding).
781/// 2. **Entry identity** (`entry_id` and `public_key`) so an attacker
782///    with write access to `~/.treeship/keys/` cannot copy entry A's
783///    `enc_priv_key` ciphertext into entry B's JSON envelope. Without
784///    this binding, the swap would decrypt cleanly (same machine key,
785///    same framing-only AAD) and the signer for advertised key id A
786///    would silently sign with key B's secret scalar — un-binding
787///    `KeyInfo.public_key` from the actual scalar in use. This closes
788///    the "intra-keystore swap" class flagged in the post-merge audit
789///    of TS-2026-001.
790///
791/// Every variable-length field is length-prefixed with a big-endian
792/// u32 before its bytes. Concatenating variable-length fields without
793/// length prefixes is a forgery class (an attacker who controls field
794/// boundaries can shift bytes between fields and present a different
795/// `(entry_id, public_key)` pair whose AAD-bytes serialize identically).
796/// `entry_id` is a fixed-prefix `key_<hex>` string in practice, but we
797/// length-prefix it anyway to defend against future id schemes.
798///
799/// The AAD must be byte-identical on encrypt and decrypt. Future
800/// versions (V3+) get their own builder; the dispatcher picks which
801/// to use based on the framing prefix.
802fn build_aad_v2(entry_id: &str, public_key: &[u8]) -> Vec<u8> {
803    let mut aad = Vec::with_capacity(2 + 4 + entry_id.len() + 4 + public_key.len());
804    aad.push(KEYSTORE_MAGIC);
805    aad.push(KEYSTORE_VERSION_V2);
806    aad.extend_from_slice(&(entry_id.len() as u32).to_be_bytes());
807    aad.extend_from_slice(entry_id.as_bytes());
808    aad.extend_from_slice(&(public_key.len() as u32).to_be_bytes());
809    aad.extend_from_slice(public_key);
810    aad
811}
812
813/// AES-256-GCM (the real one) encrypt for at-rest keystore storage.
814/// Returns the framed v2 blob ready to drop into `EncryptedEntry::enc_priv_key`.
815///
816/// Output: `[magic, version, nonce(12), ciphertext || tag(16)]`.
817///
818/// The AEAD's Associated Authenticated Data binds:
819/// - the framing prefix (M2 — flipping magic/version surfaces as MAC failure)
820/// - the entry id and public key (post-merge audit fix-up — closes the
821///   intra-keystore swap class where a local attacker copies entry A's
822///   `enc_priv_key` into entry B's JSON envelope).
823///
824/// See `build_aad_v2` for the exact layout. `entry_id` and `public_key`
825/// must match what gets serialized into the `EncryptedEntry` JSON;
826/// `decrypt_for_disk_v2` reads them back from the deserialized entry
827/// to recompute the AAD.
828fn encrypt_for_disk_v2(
829    key: &[u8; 32],
830    entry_id: &str,
831    public_key: &[u8],
832    plaintext: &[u8],
833) -> Result<Vec<u8>, String> {
834    // Wrap the in-memory AEAD key in Zeroizing so the local stack copy
835    // is wiped on drop. The aes-gcm cipher object owns its own internal
836    // expanded key schedule; that's outside our control, but the raw
837    // 32-byte buffer at this scope is ours to clear.
838    let key_buf: Zeroizing<[u8; 32]> = Zeroizing::new(*key);
839    let aead_key: &AesKey<Aes256Gcm> = AesKey::<Aes256Gcm>::from_slice(key_buf.as_slice());
840    let cipher = Aes256Gcm::new(aead_key);
841
842    // 96-bit random nonce from the OS CSPRNG.
843    let nonce = Aes256Gcm::generate_nonce(&mut AeadOsRng);
844
845    let aad = build_aad_v2(entry_id, public_key);
846    let ciphertext = cipher
847        .encrypt(
848            &nonce,
849            Payload {
850                msg: plaintext,
851                aad: aad.as_slice(),
852            },
853        )
854        .map_err(|e| format!("aead encrypt failed: {e}"))?;
855
856    let mut out = Vec::with_capacity(2 + 12 + ciphertext.len());
857    out.push(KEYSTORE_MAGIC);
858    out.push(KEYSTORE_VERSION_V2);
859    out.extend_from_slice(nonce.as_slice());
860    out.extend_from_slice(&ciphertext);
861    Ok(out)
862}
863
864/// AES-256-GCM decrypt of a v2 framed blob. Uses the same AAD binding
865/// as `encrypt_for_disk_v2`:
866///   - framing prefix (so a tampered magic/version surfaces as MAC failure)
867///   - entry id + public key (so swapping `enc_priv_key` between entries
868///     in the same keystore surfaces as MAC failure).
869///
870/// `entry_id` and `public_key` come from the `EncryptedEntry` JSON
871/// envelope that holds `blob`. The caller is responsible for passing the
872/// *envelope's* id and pubkey, not values from some other source — that
873/// is precisely what binds the ciphertext to its envelope.
874fn decrypt_v2(
875    key: &[u8; 32],
876    entry_id: &str,
877    public_key: &[u8],
878    blob: &[u8],
879) -> Result<Vec<u8>, String> {
880    // Minimum: magic(1) + version(1) + nonce(12) + tag(16) = 30 bytes.
881    if blob.len() < 30 {
882        return Err("v2 ciphertext too short".into());
883    }
884    if blob[0] != KEYSTORE_MAGIC || blob[1] != KEYSTORE_VERSION_V2 {
885        return Err("v2 ciphertext has wrong magic/version".into());
886    }
887    let nonce_bytes = &blob[2..14];
888    let ct = &blob[14..];
889
890    let key_buf: Zeroizing<[u8; 32]> = Zeroizing::new(*key);
891    let aead_key: &AesKey<Aes256Gcm> = AesKey::<Aes256Gcm>::from_slice(key_buf.as_slice());
892    let cipher = Aes256Gcm::new(aead_key);
893    let nonce = Nonce::from_slice(nonce_bytes);
894
895    let aad = build_aad_v2(entry_id, public_key);
896    cipher
897        .decrypt(
898            nonce,
899            Payload {
900                msg: ct,
901                aad: aad.as_slice(),
902            },
903        )
904        .map_err(|_| "MAC verification failed — key file may be corrupt or wrong machine".into())
905}
906
907/// Returns true iff `blob` is shaped like a v1 (legacy) ciphertext.
908/// Used by the dispatcher to decide whether a successful decrypt should
909/// trigger a transparent re-encrypt to v2.
910fn is_legacy_v1(blob: &[u8]) -> bool {
911    // A v2 blob always starts with [magic, version]. Anything else
912    // (including the empty enc_priv_key case during partial writes) is
913    // treated as legacy and routed through the v1 path, which will fail
914    // cleanly on garbage.
915    !(blob.len() >= 2 && blob[0] == KEYSTORE_MAGIC && blob[1] == KEYSTORE_VERSION_V2)
916}
917
918/// Top-level decrypt dispatcher used by the keystore. Tries v2 if the
919/// blob carries the magic+version prefix, otherwise falls through to the
920/// legacy v1 path. If a blob looks like v2 but AEAD verification fails,
921/// we also try v1 — this defends against the (negligible) probability
922/// that a legacy ciphertext's random first two bytes happen to collide
923/// with our magic+version.
924///
925/// M1 (TS-2026-001 audit): when the blob is v2-shaped and BOTH the v2
926/// AEAD and the v1 fallback fail, surface the v2 error rather than the
927/// v1 error. v1's failure on a v2-shaped blob is mechanical (wrong
928/// MAC computed under the wrong construction) and tells the user
929/// nothing useful; v2's failure is the actually-relevant signal
930/// (MAC verification under the documented AEAD). The previous code
931/// would mask the meaningful error with a confused legacy error
932/// message that pointed at the wrong remediation.
933fn decrypt_from_disk(
934    key: &[u8; 32],
935    entry_id: &str,
936    public_key: &[u8],
937    enc_data: &[u8],
938    legacy_nonce_field: &[u8],
939) -> Result<Zeroizing<Vec<u8>>, String> {
940    if !is_legacy_v1(enc_data) {
941        match decrypt_v2(key, entry_id, public_key, enc_data) {
942            Ok(pt) => return Ok(Zeroizing::new(pt)),
943            Err(v2_err) => {
944                // Collision fallback. v1 entries had random first bytes;
945                // there's a vanishing chance one looks like v2 framing.
946                // Try v1 first; if it succeeds we have a legitimate
947                // legacy entry whose framing happens to look v2-shaped.
948                // If v1 also fails, surface the v2 error (the
949                // semantically meaningful one) rather than v1's
950                // mechanical-junk failure.
951                return match decrypt_legacy_v1(key, enc_data, legacy_nonce_field) {
952                    Ok(pt) => Ok(Zeroizing::new(pt)),
953                    Err(_) => Err(v2_err),
954                };
955            }
956        }
957    }
958    decrypt_legacy_v1(key, enc_data, legacy_nonce_field).map(Zeroizing::new)
959}
960
961/// DEPRECATED: legacy at-rest decryption for keystores written before
962/// v0.10.3. This is the SHA-256-CTR + HMAC-SHA-256 construction that
963/// was mis-labelled as AES-256-GCM (TS-2026-001). The CTR keystream is
964/// also degenerate (the same `enc_key` byte is reused once per
965/// plaintext byte, since `block[i % 32]` indexes the same SHA-256 output
966/// modulo 32), so the construction is NOT a real stream cipher even
967/// ignoring the AEAD mislabelling.
968///
969/// Kept ONLY to migrate existing on-disk keystores forward to the v2
970/// AEAD format. Never call this for new writes. The encrypt counterpart
971/// has been removed from the v2 codepath — the only place v1
972/// ciphertexts come from is files written by older Treeship versions.
973pub fn aes_gcm_decrypt(
974    key: &[u8; 32],
975    enc_data: &[u8],
976    _nonce_unused: &[u8],
977) -> Result<Vec<u8>, String> {
978    // Preserved as a public symbol because the `treeship-vi` sibling
979    // crate calls it directly. vi only ever produces v1 ciphertexts
980    // (its `aes_gcm_encrypt` shim calls `legacy_v1_encrypt`) and has
981    // no concept of the `EncryptedEntry` envelope that carries the
982    // entry id + public key the v2 AAD now requires. Route this shim
983    // directly through the legacy v1 path so vi's call site keeps
984    // working byte-for-byte; vi's eventual migration release will
985    // adopt its own AEAD path with its own envelope binding.
986    decrypt_legacy_v1(key, enc_data, _nonce_unused)
987}
988
989/// DEPRECATED: legacy at-rest encryption. Same caveats as
990/// `aes_gcm_decrypt`. Kept ONLY as a public symbol for compatibility
991/// with the `treeship-vi` sibling crate; the core keystore no longer
992/// produces v1 ciphertexts.
993///
994/// New code MUST use `encrypt_for_disk_v2`. This function still
995/// produces v1-format output so the vi crate's on-disk format remains
996/// byte-stable until it migrates on its own cadence.
997pub fn aes_gcm_encrypt(key: &[u8; 32], plaintext: &[u8]) -> Result<(Vec<u8>, Vec<u8>), String> {
998    legacy_v1_encrypt(key, plaintext)
999}
1000
1001/// Legacy v1 encrypt. SHA-256-CTR + HMAC-SHA-256. DO NOT USE for new
1002/// writes — present only so vi-keystore callers keep working until
1003/// they migrate. See `aes_gcm_encrypt` doc-comment for the security
1004/// caveats.
1005fn legacy_v1_encrypt(key: &[u8; 32], plaintext: &[u8]) -> Result<(Vec<u8>, Vec<u8>), String> {
1006    use sha2::Sha256;
1007
1008    let mut nonce = [0u8; 12];
1009    rand::thread_rng().fill_bytes(&mut nonce);
1010
1011    let mut enc_key_input = key.to_vec();
1012    enc_key_input.extend_from_slice(&nonce);
1013    enc_key_input.extend_from_slice(b"enc");
1014    let enc_key = Sha256::digest(&enc_key_input);
1015
1016    let mut mac_key_input = key.to_vec();
1017    mac_key_input.extend_from_slice(&nonce);
1018    mac_key_input.extend_from_slice(b"mac");
1019    let mac_key = Sha256::digest(&mac_key_input);
1020
1021    let ciphertext: Vec<u8> = plaintext.iter().enumerate().map(|(i, &b)| {
1022        let mut block_input = enc_key.to_vec();
1023        block_input.extend_from_slice(&(i as u64).to_le_bytes());
1024        let block = Sha256::digest(&block_input);
1025        b ^ block[i % 32]
1026    }).collect();
1027
1028    let mut mac_input = mac_key.to_vec();
1029    mac_input.extend_from_slice(&nonce);
1030    mac_input.extend_from_slice(&ciphertext);
1031    let mac = Sha256::digest(&mac_input);
1032
1033    let mut out = Vec::with_capacity(12 + 32 + ciphertext.len());
1034    out.extend_from_slice(&nonce);
1035    out.extend_from_slice(&mac);
1036    out.extend_from_slice(&ciphertext);
1037
1038    Ok((out, nonce.to_vec()))
1039}
1040
1041/// Legacy v1 decrypt. SHA-256-CTR + HMAC-SHA-256. See the module-level
1042/// notes on TS-2026-001 for why this is broken; kept only to migrate
1043/// existing keystores forward.
1044fn decrypt_legacy_v1(
1045    key: &[u8; 32],
1046    enc_data: &[u8],
1047    _nonce_unused: &[u8],
1048) -> Result<Vec<u8>, String> {
1049    if enc_data.len() < 44 {
1050        return Err("ciphertext too short".into());
1051    }
1052    use sha2::Sha256;
1053
1054    let nonce      = &enc_data[..12];
1055    let stored_mac = &enc_data[12..44];
1056    let ciphertext = &enc_data[44..];
1057
1058    let nonce_arr: [u8; 12] = nonce.try_into().unwrap();
1059
1060    let mut enc_key_input = key.to_vec();
1061    enc_key_input.extend_from_slice(&nonce_arr);
1062    enc_key_input.extend_from_slice(b"enc");
1063    let enc_key = Sha256::digest(&enc_key_input);
1064
1065    let mut mac_key_input = key.to_vec();
1066    mac_key_input.extend_from_slice(&nonce_arr);
1067    mac_key_input.extend_from_slice(b"mac");
1068    let mac_key = Sha256::digest(&mac_key_input);
1069
1070    let mut mac_input = mac_key.to_vec();
1071    mac_input.extend_from_slice(&nonce_arr);
1072    mac_input.extend_from_slice(ciphertext);
1073    let computed_mac = Sha256::digest(&mac_input);
1074
1075    let mac_ok = stored_mac.iter().zip(computed_mac.iter())
1076        .fold(0u8, |acc, (a, b)| acc | (a ^ b)) == 0;
1077
1078    if !mac_ok {
1079        return Err("MAC verification failed — key file may be corrupt or wrong machine".into());
1080    }
1081
1082    let plaintext: Vec<u8> = ciphertext.iter().enumerate().map(|(i, &b)| {
1083        let mut block_input = enc_key.to_vec();
1084        block_input.extend_from_slice(&(i as u64).to_le_bytes());
1085        let block = Sha256::digest(&block_input);
1086        b ^ block[i % 32]
1087    }).collect();
1088
1089    Ok(plaintext)
1090}
1091
1092// --- Machine key derivation ---
1093
1094pub fn derive_machine_key(store_dir: &Path) -> Result<[u8; 32], KeyError> {
1095    // 1. Linux: /etc/machine-id (stable across reboots)
1096    if let Ok(id) = fs::read_to_string("/etc/machine-id") {
1097        let trimmed = id.trim();
1098        if !trimmed.is_empty() {
1099            let mut h = Sha256::new();
1100            h.update(trimmed.as_bytes());
1101            h.update(store_dir.to_string_lossy().as_bytes());
1102            return Ok(h.finalize().into());
1103        }
1104    }
1105
1106    // 2. macOS: hostname + username derivation (v1, backward compatible).
1107    //
1108    // TODO(v0.7.0): Migrate to IOPlatformSerialNumber-based derivation.
1109    // The serial number is more stable (survives hostname and username
1110    // changes), but switching now would silently invalidate all existing
1111    // keys on macOS. A proper migration needs to:
1112    //   1. Try the new derivation first.
1113    //   2. On decryption failure, fall back to hostname+username.
1114    //   3. If legacy succeeds, re-encrypt with the new key and save.
1115    // Until that migration tooling is in place, keep hostname+username
1116    // as the primary derivation so existing users are not locked out.
1117    #[cfg(target_os = "macos")]
1118    {
1119        let hostname = std::process::Command::new("hostname")
1120            .output()
1121            .map(|o| String::from_utf8_lossy(&o.stdout).trim().to_string())
1122            .unwrap_or_default();
1123        let username = std::env::var("USER").unwrap_or_default();
1124        if !hostname.is_empty() && !username.is_empty() {
1125            let mut h = Sha256::new();
1126            h.update(b"treeship-machine-key:");
1127            h.update(hostname.as_bytes());
1128            h.update(b":");
1129            h.update(username.as_bytes());
1130            h.update(b":");
1131            h.update(store_dir.to_string_lossy().as_bytes());
1132            return Ok(h.finalize().into());
1133        }
1134    }
1135
1136    // 3. Fallback: random seed file. Co-located with the keystore so a
1137    //    project-local keystore (/proj/.treeship/keys/) keeps its seed at
1138    //    /proj/.treeship/machine_seed -- never reaching for ~/.treeship.
1139    //    A global keystore (~/.treeship/keys/) co-locates to
1140    //    ~/.treeship/machine_seed, which is byte-identical to the
1141    //    pre-v0.9.6 location, so existing global keystores keep working.
1142    //
1143    //    Backward-compat read order:
1144    //      1. <store_dir>/../machine_seed  (the new co-located path)
1145    //      2. ~/.treeship/machine_seed     (the old hardcoded path)
1146    //    Write order on first creation:
1147    //      1. <store_dir>/../machine_seed  if the parent exists/is writable
1148    //      2. ~/.treeship/machine_seed     as a last resort
1149    //
1150    //    This makes project-local config truly self-contained: an
1151    //    isolated /proj keystore can decrypt its own keys even when
1152    //    the user's ~/.treeship is corrupt or on a different machine,
1153    //    closing the trust-fabric isolation gap that blocked
1154    //    project-local smoke tests.
1155    let local_seed_path = store_dir.parent().map(|p| p.join("machine_seed"));
1156    let home = std::env::var("HOME")
1157        .map(std::path::PathBuf::from)
1158        .map_err(|_| KeyError::Crypto("HOME not set".to_string()))?;
1159    let global_seed_path = home.join(".treeship").join("machine_seed");
1160
1161    let seed = if let Some(local) = local_seed_path.as_ref().filter(|p| p.exists()) {
1162        fs::read_to_string(local).map_err(KeyError::Io)?
1163    } else if global_seed_path.exists() {
1164        // Backward-compat: an existing global seed keeps decrypting any
1165        // keystore that was encrypted under it (in particular the
1166        // standard ~/.treeship/keys/ case where local == global).
1167        fs::read_to_string(&global_seed_path).map_err(KeyError::Io)?
1168    } else {
1169        let mut bytes = [0u8; 32];
1170        rand::thread_rng().fill_bytes(&mut bytes);
1171        let seed_hex = hex_encode(&bytes);
1172
1173        // Prefer creating the seed locally. Falls back to the global
1174        // path only when the keystore has no usable parent (rare;
1175        // happens when store_dir is "/" or similar pathological input).
1176        let target = match local_seed_path.as_ref() {
1177            Some(p) => {
1178                let _ = fs::create_dir_all(p.parent().unwrap_or(Path::new(".")));
1179                p.clone()
1180            }
1181            None => {
1182                let _ = fs::create_dir_all(global_seed_path.parent().unwrap_or(Path::new(".")));
1183                global_seed_path.clone()
1184            }
1185        };
1186        fs::write(&target, &seed_hex).map_err(KeyError::Io)?;
1187        #[cfg(unix)]
1188        {
1189            use std::os::unix::fs::PermissionsExt;
1190            let _ = fs::set_permissions(&target, fs::Permissions::from_mode(0o600));
1191        }
1192        seed_hex
1193    };
1194
1195    let mut h = Sha256::new();
1196    h.update(b"treeship-machine-key-fallback:");
1197    h.update(seed.trim().as_bytes());
1198    h.update(b":");
1199    h.update(store_dir.to_string_lossy().as_bytes());
1200    Ok(h.finalize().into())
1201}
1202
1203/// Stable machine key derivation for NEW keys (VI P-256, etc).
1204/// Uses hardware identifiers that survive hostname/user changes.
1205/// For legacy ship Ed25519 keys, use `derive_machine_key()` instead.
1206pub fn derive_machine_key_stable(store_dir: &Path) -> Result<[u8; 32], KeyError> {
1207    // 1. Linux: /etc/machine-id
1208    if let Ok(id) = fs::read_to_string("/etc/machine-id") {
1209        let trimmed = id.trim();
1210        if !trimmed.is_empty() {
1211            let mut h = Sha256::new();
1212            h.update(b"treeship-machine-key-v2:");
1213            h.update(trimmed.as_bytes());
1214            h.update(b":");
1215            h.update(store_dir.to_string_lossy().as_bytes());
1216            return Ok(h.finalize().into());
1217        }
1218    }
1219
1220    // 2. macOS: IOPlatformSerialNumber (hardware serial, stable across
1221    //    hostname changes, user renames, non-interactive shells)
1222    #[cfg(target_os = "macos")]
1223    {
1224        if let Ok(output) = std::process::Command::new("ioreg")
1225            .args(["-rd1", "-c", "IOPlatformExpertDevice"])
1226            .output()
1227        {
1228            let stdout = String::from_utf8_lossy(&output.stdout);
1229            for line in stdout.lines() {
1230                if line.contains("IOPlatformSerialNumber") {
1231                    if let Some(serial) = line.split('"').nth(3) {
1232                        if !serial.is_empty() {
1233                            let mut h = Sha256::new();
1234                            h.update(b"treeship-machine-key-v2:");
1235                            h.update(serial.as_bytes());
1236                            h.update(b":");
1237                            h.update(store_dir.to_string_lossy().as_bytes());
1238                            return Ok(h.finalize().into());
1239                        }
1240                    }
1241                }
1242            }
1243        }
1244    }
1245
1246    // 3. Fallback: persistent random seed in ~/.treeship/.internal/
1247    //    Separate from key material. Mode 0600.
1248    let home = std::env::var("HOME")
1249        .map(std::path::PathBuf::from)
1250        .map_err(|_| KeyError::Crypto("HOME not set".to_string()))?;
1251    let seed_dir = home.join(".treeship").join(".internal");
1252    let _ = fs::create_dir_all(&seed_dir);
1253    #[cfg(unix)]
1254    {
1255        use std::os::unix::fs::PermissionsExt;
1256        let _ = fs::set_permissions(&seed_dir, fs::Permissions::from_mode(0o700));
1257    }
1258
1259    let seed_path = seed_dir.join("machine_seed_v2");
1260    let seed = if seed_path.exists() {
1261        fs::read_to_string(&seed_path).map_err(KeyError::Io)?
1262    } else {
1263        let mut bytes = [0u8; 32];
1264        rand::thread_rng().fill_bytes(&mut bytes);
1265        let seed_hex = hex_encode(&bytes);
1266        fs::write(&seed_path, &seed_hex).map_err(KeyError::Io)?;
1267        #[cfg(unix)]
1268        {
1269            use std::os::unix::fs::PermissionsExt;
1270            let _ = fs::set_permissions(&seed_path, fs::Permissions::from_mode(0o600));
1271        }
1272        seed_hex
1273    };
1274
1275    let mut h = Sha256::new();
1276    h.update(b"treeship-machine-key-v2-fallback:");
1277    h.update(seed.trim().as_bytes());
1278    h.update(b":");
1279    h.update(store_dir.to_string_lossy().as_bytes());
1280    Ok(h.finalize().into())
1281}
1282
1283// --- Utility ---
1284
1285fn new_key_id() -> KeyId {
1286    let mut b = [0u8; 8];
1287    rand::thread_rng().fill_bytes(&mut b);
1288    format!("key_{}", hex_encode(&b))
1289}
1290
1291fn fingerprint(pub_key: &[u8]) -> String {
1292    let h = Sha256::digest(pub_key);
1293    hex_encode(&h[..8])
1294}
1295
/// Encodes `b` as lowercase hexadecimal, two digits per byte, with no
/// separators or prefix. An empty slice yields an empty string.
fn hex_encode(b: &[u8]) -> String {
    const HEX_DIGITS: &[u8; 16] = b"0123456789abcdef";

    // Exactly two output characters per input byte, so size the buffer
    // once and avoid a temporary `String` per byte.
    let mut out = String::with_capacity(b.len() * 2);
    for &byte in b {
        out.push(char::from(HEX_DIGITS[usize::from(byte >> 4)]));
        out.push(char::from(HEX_DIGITS[usize::from(byte & 0x0f)]));
    }
    out
}
1302
1303/// Verify a private-key file has restrictive permissions before loading
1304/// it for signing. Returns `Ok(())` on non-Unix platforms, when the
1305/// `TREESHIP_ALLOW_INSECURE_KEY_PERMS=1` escape hatch is set, or when
1306/// the file is not group/world accessible. Otherwise returns
1307/// `KeyError::InsecureKeyPerms` with the offending path and mode.
1308fn check_key_file_perms(path: &Path) -> Result<(), KeyError> {
1309    #[cfg(unix)]
1310    {
1311        use std::os::unix::fs::PermissionsExt;
1312        if std::env::var_os("TREESHIP_ALLOW_INSECURE_KEY_PERMS")
1313            .map(|v| v == "1")
1314            .unwrap_or(false)
1315        {
1316            return Ok(());
1317        }
1318        // Missing files are reported by the caller as NotFound -- don't
1319        // mask that with a perm error.
1320        let meta = match fs::metadata(path) {
1321            Ok(m) => m,
1322            Err(_) => return Ok(()),
1323        };
1324        let mode = meta.permissions().mode();
1325        if mode & 0o077 != 0 {
1326            return Err(KeyError::InsecureKeyPerms {
1327                path: path.to_path_buf(),
1328                mode,
1329            });
1330        }
1331    }
1332    let _ = path;
1333    Ok(())
1334}
1335
1336impl Store {
1337    /// Repair file permissions on the keystore directory and every file
1338    /// inside it: dir to 0700, key entry files and manifest to 0600.
1339    /// Used by `treeship doctor --fix`. No-op on non-Unix.
1340    ///
1341    /// Returns the list of (path, old_mode, new_mode) tuples for paths
1342    /// that were actually changed, so the caller can report what it did.
1343    pub fn fix_perms(&self) -> Result<Vec<(PathBuf, u32, u32)>, KeyError> {
1344        let mut changed: Vec<(PathBuf, u32, u32)> = Vec::new();
1345        #[cfg(unix)]
1346        {
1347            use std::os::unix::fs::PermissionsExt;
1348
1349            let dir_meta = fs::metadata(&self.dir)?;
1350            let dir_mode = dir_meta.permissions().mode() & 0o777;
1351            if dir_mode != 0o700 {
1352                fs::set_permissions(&self.dir, fs::Permissions::from_mode(0o700))?;
1353                changed.push((self.dir.clone(), dir_mode, 0o700));
1354            }
1355
1356            for entry in fs::read_dir(&self.dir)? {
1357                let entry = entry?;
1358                let path = entry.path();
1359                if !entry.file_type()?.is_file() {
1360                    continue;
1361                }
1362                let mode = entry.metadata()?.permissions().mode() & 0o777;
1363                if mode != 0o600 {
1364                    fs::set_permissions(&path, fs::Permissions::from_mode(0o600))?;
1365                    changed.push((path, mode, 0o600));
1366                }
1367            }
1368        }
1369        Ok(changed)
1370    }
1371}
1372
/// Opens the per-entry migration sentinel lock file, creating it if it
/// does not exist, with owner-only permissions (0o600) requested at
/// creation. Existing contents are never truncated.
///
/// The returned handle is meant to be handed to
/// `fs2::FileExt::lock_exclusive` so that concurrent v1->v2 migrations of
/// the same entry are serialized across processes/threads
/// (TS-2026-001 H3).
///
/// The mode is supplied through `OpenOptionsExt::mode` at open time
/// rather than applied afterwards, so the sentinel never exists with
/// looser permissions.
#[cfg(unix)]
fn open_migration_lock_file(path: &Path) -> Result<fs::File, io::Error> {
    use std::os::unix::fs::OpenOptionsExt;

    let mut options = fs::OpenOptions::new();
    options
        .read(true)
        .write(true)
        .create(true)
        .truncate(false)
        .mode(0o600);
    options.open(path)
}
1393
/// Non-Unix variant: opens the per-entry migration sentinel lock file for
/// reading and writing, creating it if needed and never truncating
/// existing contents. No explicit permission mode is set on this path.
#[cfg(not(unix))]
fn open_migration_lock_file(path: &Path) -> Result<fs::File, io::Error> {
    let mut options = fs::OpenOptions::new();
    options
        .read(true)
        .write(true)
        .create(true)
        .truncate(false);
    options.open(path)
}
1403
1404/// Atomically write `data` to `path` with owner-only (0o600) permissions on
1405/// Unix.
1406///
1407/// TS-2026-001 H1 + H2: the prior implementation was truncate-then-write,
1408/// which destroys the original file if the process crashes mid-write. For
1409/// the keystore that's catastrophic -- a crash during transparent v1->v2
1410/// migration would leave a zero-byte (or partial) key entry on disk and
1411/// the private key would be unrecoverable. This implementation writes to
1412/// a sibling tmp file in the same directory, fsyncs the bytes through to
1413/// the platter, then performs a POSIX-atomic same-filesystem `rename(2)`.
1414/// A crash before the rename leaves the original file intact; the tmp
1415/// file is harmless garbage that the next successful write will overwrite.
1416///
1417/// The 0o600 mode is set at file *creation* via `OpenOptionsExt::mode`
1418/// so there is no window in which the file exists with looser perms.
1419/// The prior `set_permissions` post-write call is dropped because it was
1420/// redundant and gave the appearance (but not the substance) of safety.
1421fn write_file_600(path: &Path, data: &[u8]) -> Result<(), KeyError> {
1422    // Place the tmp file in the same directory as the final path so the
1423    // rename stays on the same filesystem (cross-FS renames are not atomic
1424    // and degrade to copy+unlink, defeating the whole point).
1425    let tmp_path = path.with_extension("tmp");
1426
1427    // Best-effort cleanup of any stale tmp from a prior crash before we
1428    // start writing. Ignored on error -- if it doesn't exist that's fine,
1429    // and if it can't be removed the OpenOptions call below will surface
1430    // the underlying error.
1431    let _ = fs::remove_file(&tmp_path);
1432
1433    let write_result: Result<(), KeyError> = (|| {
1434        #[cfg(unix)]
1435        let open = {
1436            use std::os::unix::fs::OpenOptionsExt;
1437            fs::OpenOptions::new()
1438                .write(true)
1439                .create(true)
1440                .truncate(true)
1441                .mode(0o600)
1442                .open(&tmp_path)
1443        };
1444        #[cfg(not(unix))]
1445        let open = fs::OpenOptions::new()
1446            .write(true)
1447            .create(true)
1448            .truncate(true)
1449            .open(&tmp_path);
1450
1451        let mut f = open?;
1452        f.write_all(data)?;
1453        // sync_all flushes both data AND metadata, so on a crash after
1454        // the rename, fsck/journal recovery sees the new bytes -- not a
1455        // ghost inode with stale content.
1456        f.sync_all()?;
1457        Ok(())
1458    })();
1459
1460    if let Err(e) = write_result {
1461        // Best-effort cleanup so the next write isn't surprised by a
1462        // half-written tmp. Errors here are not surfaced: the original
1463        // write error is what the caller needs to see.
1464        let _ = fs::remove_file(&tmp_path);
1465        return Err(e);
1466    }
1467
1468    // Atomic same-filesystem rename. On Unix this is a single
1469    // rename(2) syscall guaranteed by POSIX to be atomic with respect
1470    // to other observers. On Windows std::fs::rename is implemented
1471    // via MoveFileEx with MOVEFILE_REPLACE_EXISTING (atomic on NTFS,
1472    // best-effort elsewhere). After this returns Ok, the new bytes are
1473    // visible at `path` and the tmp file no longer exists.
1474    if let Err(e) = fs::rename(&tmp_path, path) {
1475        let _ = fs::remove_file(&tmp_path);
1476        return Err(KeyError::Io(e));
1477    }
1478
1479    // fsync the parent directory so the rename's directory-entry update
1480    // is itself persisted. The previous code only fsynced the tmp
1481    // file's contents (via sync_all on the file handle) -- on ext4/xfs
1482    // with default mount options, the rename can return to userspace
1483    // before the dirent metadata has been written to the journal. A
1484    // power loss in that window leaves the directory entry pointing at
1485    // the OLD inode (or, worse, missing entirely if both old and new
1486    // were unlinked from the parent), even though both the data bytes
1487    // and the rename syscall ostensibly completed. The H1 doc-comment
1488    // above promised stronger durability than the code delivered;
1489    // fsyncing the parent dir closes that gap.
1490    //
1491    // Best-effort on Unix: a directory open + sync_all is the standard
1492    // pattern (see e.g. SQLite's atomic-commit, leveldb, lmdb). On
1493    // platforms where opening a directory for sync isn't supported, we
1494    // silently skip -- the rename is still atomic-with-respect-to-
1495    // observers, we just don't guarantee crash-durability of the
1496    // dirent update.
1497    #[cfg(unix)]
1498    {
1499        if let Some(parent) = path.parent() {
1500            // Errors here are non-fatal: the rename succeeded and the
1501            // common case (no power loss before the next fs flush) is
1502            // correct. We surface a failure to open/sync the dir only
1503            // if the rename itself succeeded, since otherwise the
1504            // caller would mistake a durability hint for a write
1505            // failure. swallow silently rather than return.
1506            if let Ok(dir) = fs::File::open(parent) {
1507                let _ = dir.sync_all();
1508            }
1509        }
1510    }
1511
1512    Ok(())
1513}
1514
/// Current wall-clock time as whole seconds since the Unix epoch.
///
/// If the system clock reads earlier than the epoch, `duration_since`
/// fails and this returns 0 instead of panicking.
fn unix_now() -> u64 {
    use std::time::{SystemTime, UNIX_EPOCH};

    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_secs(),
        Err(_) => 0,
    }
}
1522
1523#[cfg(test)]
1524mod tests {
1525    use super::*;
1526
1527    fn temp_dir_path() -> PathBuf {
1528        let mut p = std::env::temp_dir();
1529        p.push(format!("treeship-test-{}", {
1530            let mut b = [0u8; 4];
1531            rand::thread_rng().fill_bytes(&mut b);
1532            hex_encode(&b)
1533        }));
1534        p
1535    }
1536
1537    fn make_store() -> (Store, PathBuf) {
1538        let dir = temp_dir_path();
1539        let store = Store::open(&dir).unwrap();
1540        (store, dir)
1541    }
1542
1543    fn cleanup(dir: PathBuf) {
1544        let _ = fs::remove_dir_all(dir);
1545    }
1546
1547    #[test]
1548    fn generate_key() {
1549        let (store, dir) = make_store();
1550        let info = store.generate(true).unwrap();
1551        assert!(info.id.starts_with("key_"));
1552        assert_eq!(info.algorithm, "ed25519");
1553        assert!(!info.fingerprint.is_empty());
1554        assert_eq!(info.public_key.len(), 32);
1555        cleanup(dir);
1556    }
1557
1558    #[test]
1559    fn default_signer_works() {
1560        let (store, dir) = make_store();
1561        store.generate(true).unwrap();
1562        let signer = store.default_signer().unwrap();
1563        assert!(!signer.key_id().is_empty());
1564        let pae = crate::attestation::pae("text/plain", b"test");
1565        let sig = signer.sign(&pae).unwrap();
1566        assert_eq!(sig.len(), 64);
1567        cleanup(dir);
1568    }
1569
1570    #[test]
1571    fn encrypt_decrypt_roundtrip() {
1572        // Routes the legacy public API through the dispatcher; v1
1573        // ciphertexts must still decrypt correctly.
1574        let key = [42u8; 32];
1575        let plaintext = b"super secret private key material here!";
1576        let (enc, nonce) = aes_gcm_encrypt(&key, plaintext).unwrap();
1577        let dec = aes_gcm_decrypt(&key, &enc, &nonce).unwrap();
1578        assert_eq!(dec, plaintext);
1579    }
1580
1581    #[test]
1582    fn decrypt_wrong_key_fails() {
1583        let key   = [42u8; 32];
1584        let wrong = [99u8; 32];
1585        let (enc, nonce) = aes_gcm_encrypt(&key, b"secret").unwrap();
1586        assert!(aes_gcm_decrypt(&wrong, &enc, &nonce).is_err());
1587    }
1588
1589    // --- v2 AEAD tests (TS-2026-001 fix) -----------------------------------
1590
    // Fixed entry id + public key shared by the unit-level v2 tests below.
    // Every encrypt/decrypt call in those tests passes both values; the AAD
    // builder binds them into the GCM tag, so encrypt and decrypt must see
    // identical values. Using constants keeps each test focused on its own
    // bit-flip / tamper assertion without dragging Store setup into the
    // picture.
    const TEST_ENTRY_ID: &str = "key_unit_test_entry_0001";
    const TEST_PUBLIC_KEY: &[u8; 32] = &[0xAA; 32];
1598
1599    #[test]
1600    fn v2_encrypt_decrypt_roundtrip() {
1601        let key = [7u8; 32];
1602        let plaintext = b"super secret private key material here!";
1603        let blob =
1604            encrypt_for_disk_v2(&key, TEST_ENTRY_ID, TEST_PUBLIC_KEY, plaintext).unwrap();
1605        // Structural check on the framing.
1606        assert_eq!(blob[0], KEYSTORE_MAGIC, "magic byte");
1607        assert_eq!(blob[1], KEYSTORE_VERSION_V2, "version byte");
1608        assert_eq!(blob.len(), 2 + 12 + plaintext.len() + 16,
1609                   "magic+version+nonce+ct+tag length");
1610
1611        let dec =
1612            decrypt_from_disk(&key, TEST_ENTRY_ID, TEST_PUBLIC_KEY, &blob, &[]).unwrap();
1613        assert_eq!(&*dec, plaintext);
1614    }
1615
1616    #[test]
1617    fn v2_decrypt_wrong_key_fails() {
1618        let key   = [7u8; 32];
1619        let wrong = [99u8; 32];
1620        let blob = encrypt_for_disk_v2(&key, TEST_ENTRY_ID, TEST_PUBLIC_KEY, b"secret").unwrap();
1621        // Wrong key with v2 framing: AEAD must reject. Dispatcher will
1622        // try v1 fallback (which also fails on garbage), so the final
1623        // error surfaces as a MAC failure rather than wrong plaintext.
1624        let result = decrypt_from_disk(&wrong, TEST_ENTRY_ID, TEST_PUBLIC_KEY, &blob, &[]);
1625        assert!(result.is_err(), "wrong key must fail");
1626    }
1627
1628    #[test]
1629    fn v2_tamper_ciphertext_fails() {
1630        let key = [7u8; 32];
1631        let mut blob = encrypt_for_disk_v2(
1632            &key, TEST_ENTRY_ID, TEST_PUBLIC_KEY, b"super secret private key"
1633        ).unwrap();
1634        // Flip one bit inside the ciphertext body (after the 14-byte
1635        // framing). GCM authenticates ciphertext + nonce; any flip must
1636        // fail.
1637        let last = blob.len() - 5;
1638        blob[last] ^= 0x01;
1639        let result = decrypt_from_disk(&key, TEST_ENTRY_ID, TEST_PUBLIC_KEY, &blob, &[]);
1640        assert!(result.is_err(), "tampered ciphertext must fail to decrypt");
1641    }
1642
1643    #[test]
1644    fn v2_tamper_nonce_fails() {
1645        let key = [7u8; 32];
1646        let mut blob = encrypt_for_disk_v2(
1647            &key, TEST_ENTRY_ID, TEST_PUBLIC_KEY, b"super secret private key"
1648        ).unwrap();
1649        // Flip a bit in the nonce (bytes [2..14]).
1650        blob[5] ^= 0x01;
1651        let result = decrypt_from_disk(&key, TEST_ENTRY_ID, TEST_PUBLIC_KEY, &blob, &[]);
1652        assert!(result.is_err(), "tampered nonce must fail to decrypt");
1653    }
1654
1655    #[test]
1656    fn v2_tamper_tag_fails() {
1657        let key = [7u8; 32];
1658        let mut blob = encrypt_for_disk_v2(
1659            &key, TEST_ENTRY_ID, TEST_PUBLIC_KEY, b"super secret private key"
1660        ).unwrap();
1661        // Flip a bit in the trailing GCM tag (last 16 bytes).
1662        let len = blob.len();
1663        blob[len - 1] ^= 0x80;
1664        let result = decrypt_from_disk(&key, TEST_ENTRY_ID, TEST_PUBLIC_KEY, &blob, &[]);
1665        assert!(result.is_err(), "tampered GCM tag must fail to decrypt");
1666    }
1667
1668    #[test]
1669    fn v2_nonces_are_unique_across_writes() {
1670        // Sanity check: two encryptions of identical plaintext under the
1671        // same key must produce different blobs (random per-write nonce).
1672        // Without this property, AES-GCM is catastrophically broken.
1673        let key = [7u8; 32];
1674        let blob_a =
1675            encrypt_for_disk_v2(&key, TEST_ENTRY_ID, TEST_PUBLIC_KEY, b"identical").unwrap();
1676        let blob_b =
1677            encrypt_for_disk_v2(&key, TEST_ENTRY_ID, TEST_PUBLIC_KEY, b"identical").unwrap();
1678        assert_ne!(blob_a, blob_b,
1679                   "two v2 encryptions of the same plaintext must differ");
1680        assert_ne!(&blob_a[2..14], &blob_b[2..14], "nonces must differ");
1681
1682        // L1 (TS-2026-001 audit): draw 10k nonces in a row and assert
1683        // every one is distinct. A duplicate at this volume would be a
1684        // strong (10k^2 / 2^96 ~ 2^-65 floor) signal that the OS CSPRNG
1685        // backing aead::OsRng is misbehaving on this build. Cheap, fast,
1686        // and catches a regression class (PRNG mis-seeding,
1687        // accidentally-deterministic nonce, RNG getting forked across
1688        // threads without re-seed) that the 2-sample check above can't.
1689        const N: usize = 10_000;
1690        let mut nonces: std::collections::HashSet<Vec<u8>> =
1691            std::collections::HashSet::with_capacity(N);
1692        for _ in 0..N {
1693            let blob =
1694                encrypt_for_disk_v2(&key, TEST_ENTRY_ID, TEST_PUBLIC_KEY, b"x").unwrap();
1695            // bytes [2..14] are the 12-byte GCM nonce.
1696            nonces.insert(blob[2..14].to_vec());
1697        }
1698        assert_eq!(
1699            nonces.len(),
1700            N,
1701            "all {} v2 nonces must be unique; collision => RNG defect",
1702            N
1703        );
1704    }
1705
1706    #[test]
1707    fn v2_tamper_version_byte_fails() {
1708        // M2: flipping the version byte must cause decryption to fail.
1709        // The framing sanity check catches obvious flips immediately;
1710        // the AAD-binding test below covers the case where the framing
1711        // sanity check would otherwise pass.
1712        let key = [7u8; 32];
1713        let mut blob = encrypt_for_disk_v2(
1714            &key, TEST_ENTRY_ID, TEST_PUBLIC_KEY, b"super secret private key"
1715        ).unwrap();
1716        assert_eq!(blob[1], KEYSTORE_VERSION_V2);
1717        blob[1] = 0xff;
1718        assert!(
1719            decrypt_v2(&key, TEST_ENTRY_ID, TEST_PUBLIC_KEY, &blob).is_err(),
1720            "altered version byte must be rejected"
1721        );
1722    }
1723
1724    #[test]
1725    fn v2_aad_binding_detects_framing_substitution() {
1726        // M2 direct check: encrypt a payload with v2 AAD, then construct
1727        // a blob whose framing claims to be v2 but whose ciphertext was
1728        // computed under a different AAD (empty). decrypt_v2 must
1729        // reject with MAC failure rather than returning the plaintext.
1730        let key = [7u8; 32];
1731        let plaintext = b"M2 AAD bound material";
1732
1733        // Compute a v2-framed blob without supplying AAD -- mimics what
1734        // the *pre-M2* code would have produced. This is the exact
1735        // attack surface AAD closes: an old blob whose framing is v2
1736        // but whose tag was computed empty.
1737        use aes_gcm::aead::Aead;
1738        let key_buf: Zeroizing<[u8; 32]> = Zeroizing::new(key);
1739        let aead_key: &AesKey<Aes256Gcm> = AesKey::<Aes256Gcm>::from_slice(key_buf.as_slice());
1740        let cipher = Aes256Gcm::new(aead_key);
1741        let nonce = Aes256Gcm::generate_nonce(&mut AeadOsRng);
1742        let ct_no_aad = cipher.encrypt(&nonce, plaintext.as_slice()).unwrap();
1743
1744        let mut forged = Vec::with_capacity(2 + 12 + ct_no_aad.len());
1745        forged.push(KEYSTORE_MAGIC);
1746        forged.push(KEYSTORE_VERSION_V2);
1747        forged.extend_from_slice(nonce.as_slice());
1748        forged.extend_from_slice(&ct_no_aad);
1749
1750        // Framing sanity passes. AAD does not. decrypt_v2 must reject.
1751        assert_eq!(forged[0], KEYSTORE_MAGIC);
1752        assert_eq!(forged[1], KEYSTORE_VERSION_V2);
1753        let result = decrypt_v2(&key, TEST_ENTRY_ID, TEST_PUBLIC_KEY, &forged);
1754        assert!(result.is_err(),
1755                "ciphertext computed without AAD must fail to decrypt now that AAD is bound");
1756    }
1757
1758    #[test]
1759    fn dispatcher_surfaces_v2_error_on_corrupted_v2_blob() {
1760        // M1: a v2-shaped blob whose AEAD verification fails (and
1761        // whose v1 fallback also fails, since the bytes are garbage
1762        // under both constructions) must surface the v2 MAC error, not
1763        // the v1 "ciphertext too short" / random-junk error. The user
1764        // sees a meaningful message that points at the right
1765        // remediation.
1766        let key = [7u8; 32];
1767        let mut blob =
1768            encrypt_for_disk_v2(&key, TEST_ENTRY_ID, TEST_PUBLIC_KEY, b"hello").unwrap();
1769        // Flip a byte in the GCM tag (last 16 bytes) so the v2 AEAD
1770        // rejects but the framing still classifies as v2.
1771        let last = blob.len() - 1;
1772        blob[last] ^= 0x01;
1773
1774        let err =
1775            decrypt_from_disk(&key, TEST_ENTRY_ID, TEST_PUBLIC_KEY, &blob, &[]).unwrap_err();
1776        // The dispatcher should bubble the v2 error string up. v2's
1777        // error message contains "MAC verification failed"; v1's
1778        // shape on garbage data is either "ciphertext too short" or
1779        // a different MAC error. Match on the v2-specific tail.
1780        assert!(
1781            err.contains("MAC verification failed"),
1782            "dispatcher must surface the v2 MAC error on corrupted v2 blob, got: {err}"
1783        );
1784    }
1785
1786    #[test]
1787    fn legacy_v1_ciphertext_still_decrypts_via_dispatcher() {
1788        // Simulates an on-disk keystore written by Treeship <= v0.10.2:
1789        // the dispatcher must successfully route legacy ciphertexts
1790        // through the v1 path so existing users are not locked out.
1791        let key = [13u8; 32];
1792        let plaintext = b"pre-v0.10.3 keystore entry";
1793        let (legacy_blob, legacy_nonce) =
1794            legacy_v1_encrypt(&key, plaintext).unwrap();
1795
1796        // Sanity: legacy blob does NOT start with v2 framing.
1797        assert!(is_legacy_v1(&legacy_blob),
1798                "legacy_v1_encrypt output must classify as legacy");
1799
1800        // Dispatcher must accept it. AAD inputs are irrelevant for the
1801        // v1 path (it doesn't use them), but the signature requires them
1802        // — pass the same placeholder constants used elsewhere.
1803        let dec = decrypt_from_disk(
1804            &key, TEST_ENTRY_ID, TEST_PUBLIC_KEY, &legacy_blob, &legacy_nonce,
1805        )
1806        .unwrap();
1807        assert_eq!(&*dec, plaintext);
1808    }
1809
1810    #[test]
1811    fn store_signer_migrates_legacy_entry_to_v2() {
1812        // End-to-end: write a key entry with the legacy v1 ciphertext
1813        // (as if upgrading from v0.10.2), call `signer()`, then verify
1814        // the on-disk entry has been rewritten in v2 format.
1815        let (store, dir) = make_store();
1816
1817        // Generate normally (this writes v2). Then re-encrypt the
1818        // secret in v1 format and overwrite the entry on disk to
1819        // simulate the upgrade scenario.
1820        let info = store.generate(true).unwrap();
1821        let entry_path = store.entry_path(&info.id);
1822
1823        // Pull the v2 entry off disk, decrypt to recover the secret,
1824        // then re-encode in legacy v1 format and write it back.
1825        let v2_entry: EncryptedEntry =
1826            serde_json::from_slice(&fs::read(&entry_path).unwrap()).unwrap();
1827        let secret = decrypt_from_disk(
1828            &store.machine_key,
1829            &v2_entry.id,
1830            &v2_entry.public_key,
1831            &v2_entry.enc_priv_key,
1832            &v2_entry.nonce,
1833        )
1834            .unwrap();
1835        let (legacy_blob, legacy_nonce) =
1836            legacy_v1_encrypt(&store.machine_key, &secret).unwrap();
1837        let legacy_entry = EncryptedEntry {
1838            id:               v2_entry.id.clone(),
1839            algorithm:        v2_entry.algorithm.clone(),
1840            created_at:       v2_entry.created_at.clone(),
1841            public_key:       v2_entry.public_key.clone(),
1842            enc_priv_key:     legacy_blob,
1843            nonce:            legacy_nonce,
1844            valid_until:      v2_entry.valid_until.clone(),
1845            successor_key_id: v2_entry.successor_key_id.clone(),
1846        };
1847        fs::write(&entry_path, serde_json::to_vec_pretty(&legacy_entry).unwrap()).unwrap();
1848
1849        // Reload with a fresh Store so the cache doesn't paper over the
1850        // on-disk change.
1851        let store2 = Store::open(&dir).unwrap();
1852        // Loading the signer must succeed (legacy path works) AND
1853        // trigger the transparent migration to v2.
1854        let _signer = store2.signer(&info.id).unwrap();
1855
1856        let after: EncryptedEntry =
1857            serde_json::from_slice(&fs::read(&entry_path).unwrap()).unwrap();
1858        assert!(!is_legacy_v1(&after.enc_priv_key),
1859                "post-migration entry must be in v2 format");
1860        assert_eq!(after.enc_priv_key[0], KEYSTORE_MAGIC);
1861        assert_eq!(after.enc_priv_key[1], KEYSTORE_VERSION_V2);
1862        assert!(after.nonce.is_empty(),
1863                "v2 entries serialize an empty legacy nonce field");
1864
1865        // L2 (TS-2026-001 audit): the framing check above proves the
1866        // migrator *wrote* a v2-shaped blob, but a downstream
1867        // assert_eq! on framing alone doesn't prove the v2 ciphertext
1868        // is actually a working AEAD encryption of the right secret.
1869        // Load the signer one more time through a fresh Store; this
1870        // routes through the dispatcher's v2-first branch and would
1871        // fail loudly if the migration had produced garbage.
1872        let store3 = Store::open(&dir).unwrap();
1873        let _signer = store3
1874            .signer(&info.id)
1875            .expect("post-migration v2 decrypt works");
1876
1877        cleanup(dir);
1878    }
1879
1880    #[test]
1881    fn persist_and_reload() {
1882        let (store, dir) = make_store();
1883        let info = store.generate(true).unwrap();
1884
1885        // Open a new Store instance pointing to the same directory.
1886        let store2 = Store::open(&dir).unwrap();
1887        let signer = store2.signer(&info.id).unwrap();
1888        assert_eq!(signer.key_id(), info.id);
1889
1890        // The reloaded signer must produce signatures verifiable with
1891        // the same public key.
1892        let verifier = {
1893            use crate::attestation::Verifier;
1894            use ed25519_dalek::VerifyingKey;
1895            let pk_bytes: [u8; 32] = info.public_key.try_into().unwrap();
1896            let vk = VerifyingKey::from_bytes(&pk_bytes).unwrap();
1897            let mut v = Verifier::new(std::collections::HashMap::new());
1898            v.add_key(info.id.clone(), vk);
1899            v
1900        };
1901
1902        use crate::attestation::sign;
1903        use crate::statements::ActionStatement;
1904        let stmt   = ActionStatement::new("agent://test", "tool.call");
1905        let pt     = crate::statements::payload_type("action");
1906        let signed = sign(&pt, &stmt, signer.as_ref()).unwrap();
1907        verifier.verify(&signed.envelope).unwrap();
1908
1909        cleanup(dir);
1910    }
1911
1912    #[test]
1913    fn list_keys() {
1914        let (store, dir) = make_store();
1915        store.generate(true).unwrap();
1916        store.generate(false).unwrap();
1917
1918        let keys = store.list().unwrap();
1919        assert_eq!(keys.len(), 2);
1920        assert_eq!(keys.iter().filter(|k| k.is_default).count(), 1);
1921        cleanup(dir);
1922    }
1923
1924    #[test]
1925    fn no_default_key_errors() {
1926        let (store, dir) = make_store();
1927        assert!(store.default_signer().is_err());
1928        cleanup(dir);
1929    }
1930
1931    #[test]
1932    fn rotate_mints_successor_and_links_predecessor() {
1933        let (store, dir) = make_store();
1934        let pred = store.generate(true).unwrap();
1935        assert!(pred.valid_until.is_none(), "fresh key has no expiry");
1936        assert!(pred.successor_key_id.is_none(), "fresh key has no successor");
1937
1938        let result = store
1939            .rotate(None, std::time::Duration::from_secs(3600), true)
1940            .unwrap();
1941
1942        // Predecessor metadata is updated.
1943        assert_eq!(result.predecessor.id, pred.id);
1944        assert!(result.predecessor.valid_until.is_some(),
1945                "predecessor must get valid_until after rotation");
1946        assert_eq!(result.predecessor.successor_key_id.as_deref(),
1947                   Some(result.successor.id.as_str()),
1948                   "predecessor must link forward to successor");
1949        assert!(!result.predecessor.is_default,
1950                "after rotation with set_default=true, predecessor is no longer default");
1951
1952        // Successor is fresh.
1953        assert_ne!(result.successor.id, pred.id);
1954        assert!(result.successor.valid_until.is_none(), "successor has no expiry yet");
1955        assert!(result.successor.successor_key_id.is_none(), "successor is chain head");
1956        assert!(result.successor.is_default, "successor is the new default");
1957
1958        // Same metadata visible via list().
1959        let listed = store.list().unwrap();
1960        assert_eq!(listed.len(), 2);
1961        let pred_listed = listed.iter().find(|k| k.id == pred.id).unwrap();
1962        assert!(pred_listed.valid_until.is_some());
1963        assert_eq!(pred_listed.successor_key_id.as_deref(),
1964                   Some(result.successor.id.as_str()));
1965
1966        cleanup(dir);
1967    }
1968
1969    #[test]
1970    fn rotate_with_set_default_false_keeps_predecessor_active() {
1971        let (store, dir) = make_store();
1972        let pred = store.generate(true).unwrap();
1973
1974        let result = store
1975            .rotate(None, std::time::Duration::from_secs(3600), false)
1976            .unwrap();
1977
1978        // Predecessor is still default. Successor exists but is not default.
1979        assert!(result.predecessor.is_default);
1980        assert!(!result.successor.is_default);
1981        assert_eq!(store.default_key_id().unwrap(), pred.id);
1982
1983        cleanup(dir);
1984    }
1985
1986    #[test]
1987    fn rotate_predecessor_signing_still_works_during_grace_window() {
1988        let (store, dir) = make_store();
1989        let pred = store.generate(true).unwrap();
1990        let _ = store
1991            .rotate(None, std::time::Duration::from_secs(3600), true)
1992            .unwrap();
1993
1994        // Predecessor key must still be loadable and capable of signing
1995        // during its grace window. Verifiers can refuse on lifecycle, but
1996        // the keystore must not preemptively destroy material.
1997        let signer = store.signer(&pred.id).unwrap();
1998        let pae = crate::attestation::pae("text/plain", b"grace-window-payload");
1999        let sig = signer.sign(&pae).unwrap();
2000        assert_eq!(sig.len(), 64);
2001
2002        cleanup(dir);
2003    }
2004
2005    #[test]
2006    fn rotate_refuses_to_rotate_already_rotated_key() {
2007        let (store, dir) = make_store();
2008        store.generate(true).unwrap();
2009        let r1 = store
2010            .rotate(None, std::time::Duration::from_secs(60), true)
2011            .unwrap();
2012
2013        // Rotating the predecessor again must be refused -- it already
2014        // points at r1.successor. Caller should rotate the chain head.
2015        let err = store
2016            .rotate(Some(&r1.predecessor.id),
2017                    std::time::Duration::from_secs(60),
2018                    true)
2019            .unwrap_err();
2020        match err {
2021            KeyError::Crypto(msg) => assert!(
2022                msg.contains("already been rotated"),
2023                "error must explain why: {msg}"
2024            ),
2025            other => panic!("expected Crypto error, got {other:?}"),
2026        }
2027        cleanup(dir);
2028    }
2029
2030    #[test]
2031    fn successor_chain_walks_forward() {
2032        let (store, dir) = make_store();
2033        let k0 = store.generate(true).unwrap();
2034        let r1 = store
2035            .rotate(None, std::time::Duration::from_secs(60), true)
2036            .unwrap();
2037        let r2 = store
2038            .rotate(None, std::time::Duration::from_secs(60), true)
2039            .unwrap();
2040
2041        let chain = store.successor_chain(&k0.id).unwrap();
2042        assert_eq!(chain, vec![k0.id.clone(), r1.successor.id.clone(), r2.successor.id.clone()],
2043                   "chain must be ordered head -> tail");
2044
2045        // Mid-chain start: chain from r1.successor should drop k0.
2046        let mid = store.successor_chain(&r1.successor.id).unwrap();
2047        assert_eq!(mid, vec![r1.successor.id.clone(), r2.successor.id.clone()]);
2048
2049        // Tail: just itself.
2050        let tail = store.successor_chain(&r2.successor.id).unwrap();
2051        assert_eq!(tail, vec![r2.successor.id.clone()]);
2052
2053        cleanup(dir);
2054    }
2055
2056    #[test]
2057    fn valid_keys_at_filters_by_grace_window() {
2058        let (store, dir) = make_store();
2059        let _ = store.generate(true).unwrap();
2060        let result = store
2061            .rotate(None, std::time::Duration::from_secs(3600), true)
2062            .unwrap();
2063
2064        // At time-of-rotation, both keys must be valid -- predecessor is
2065        // mid-grace, successor is freshly minted.
2066        let now = unix_now();
2067        let valid_now = store.valid_keys_at(now).unwrap();
2068        assert_eq!(valid_now.len(), 2, "both predecessor (in grace) and successor should be valid");
2069
2070        // After the grace window expires, only the successor remains.
2071        let after_grace = unix_now() + 7200;
2072        let valid_after = store.valid_keys_at(after_grace).unwrap();
2073        assert_eq!(valid_after.len(), 1,
2074                   "after grace window only successor remains valid");
2075        assert_eq!(valid_after[0].id, result.successor.id);
2076
2077        cleanup(dir);
2078    }
2079
2080    /// Regression: if the successor key file is missing on disk (because a
2081    /// prior rotate() crashed AFTER stamping the predecessor but BEFORE
2082    /// writing the successor), retrying must NOT be wedged. With the
2083    /// successor-first write order this scenario can't be reached by a
2084    /// single-process crash, but we still need to defend against an operator
2085    /// who manually deletes a successor file mid-life. The recovery path
2086    /// is: clear the predecessor's successor pointer (or restore the file
2087    /// from backup) and try again.
2088    /// Regression: even if the manifest write FAILED (say, disk full at
2089    /// the worst possible moment), the in-memory cache must reflect the
2090    /// stamped predecessor that already landed on disk -- otherwise a
2091    /// same-process retry would skip the already-rotated guard and mint
2092    /// a duplicate successor.
2093    ///
2094    /// We can't easily inject a manifest-write failure mid-test, but we
2095    /// can verify the precondition that makes the recovery work: after a
2096    /// successful rotate(), the cache holds the stamped predecessor (so
2097    /// any subsequent rotate would correctly refuse). Combined with the
2098    /// write order (cache update BEFORE manifest write in rotate()),
2099    /// this proves a manifest-write crash leaves the cache aligned with
2100    /// disk, not behind it.
2101    #[test]
2102    fn rotate_cache_reflects_stamped_predecessor_for_retry_safety() {
2103        let (store, dir) = make_store();
2104        let pred = store.generate(true).unwrap();
2105        let _ = store
2106            .rotate(None, std::time::Duration::from_secs(60), true)
2107            .unwrap();
2108
2109        // The cache must have the stamped predecessor; a same-process
2110        // retry of rotate(predecessor) MUST be refused. If the cache
2111        // were stale (still showing the unstamped predecessor), this
2112        // call would proceed and mint a duplicate successor.
2113        let err = store
2114            .rotate(Some(&pred.id),
2115                    std::time::Duration::from_secs(60),
2116                    true)
2117            .unwrap_err();
2118        match err {
2119            KeyError::Crypto(msg) => assert!(
2120                msg.contains("already been rotated"),
2121                "cache should reflect stamped predecessor; got: {msg}"
2122            ),
2123            other => panic!("expected Crypto error, got {other:?}"),
2124        }
2125
2126        cleanup(dir);
2127    }
2128
2129    #[test]
2130    fn rotated_predecessor_pointing_at_missing_successor_surfaces_clear_error() {
2131        let (store, dir) = make_store();
2132        store.generate(true).unwrap();
2133        let result = store
2134            .rotate(None, std::time::Duration::from_secs(60), true)
2135            .unwrap();
2136
2137        // Simulate operator-deleted successor file. The manifest still
2138        // references it, so a cold-cache reader trying to walk the chain
2139        // hits a clear NotFound for the missing key.
2140        let succ_path = store.entry_path(&result.successor.id);
2141        fs::remove_file(&succ_path).unwrap();
2142
2143        // Open a fresh Store instance so the cache doesn't paper over the
2144        // missing on-disk entry. successor_chain() walks via load_entry;
2145        // the missing file must produce KeyError::NotFound, not a panic
2146        // and not an infinite loop.
2147        let store2 = Store::open(&dir).unwrap();
2148        let err = store2.successor_chain(&result.predecessor.id).unwrap_err();
2149        match err {
2150            KeyError::NotFound(id) => assert_eq!(id, result.successor.id),
2151            other => panic!("expected NotFound error, got {other:?}"),
2152        }
2153
2154        cleanup(dir);
2155    }
2156
2157    /// Pre-0.9.5 entry files lack `valid_until` and `successor_key_id`.
2158    /// They must still deserialize cleanly and be visible via `list()` /
2159    /// `default_signer()` etc.
2160    #[test]
2161    fn legacy_entry_without_lifecycle_fields_loads() {
2162        let (store, dir) = make_store();
2163        let info = store.generate(true).unwrap();
2164
2165        // Re-serialize the on-disk entry without the new fields, simulating
2166        // a file created by a 0.9.4 or earlier CLI.
2167        let path = store.entry_path(&info.id);
2168        let raw  = fs::read(&path).unwrap();
2169        let mut json: serde_json::Value = serde_json::from_slice(&raw).unwrap();
2170        let obj = json.as_object_mut().unwrap();
2171        obj.remove("valid_until");
2172        obj.remove("successor_key_id");
2173        fs::write(&path, serde_json::to_vec_pretty(&json).unwrap()).unwrap();
2174
2175        // A fresh Store (cold cache) must still load the entry and treat
2176        // the missing fields as None.
2177        let store2 = Store::open(&dir).unwrap();
2178        let listed = store2.list().unwrap();
2179        assert_eq!(listed.len(), 1);
2180        assert!(listed[0].valid_until.is_none(),
2181                "missing valid_until must default to None on legacy entry");
2182        assert!(listed[0].successor_key_id.is_none(),
2183                "missing successor_key_id must default to None on legacy entry");
2184        let signer = store2.default_signer().unwrap();
2185        assert_eq!(signer.key_id(), info.id);
2186
2187        cleanup(dir);
2188    }
2189
2190    // --- keystore permission hardening (PR 1) -------------------------------
2191
    // The perm tests below mutate the process-global env var
    // TREESHIP_ALLOW_INSECURE_KEY_PERMS. cargo test runs cases in
    // parallel by default, so without serialization one test can set
    // the bypass while another expects it unset, making the latter
    // fail intermittently. This mutex serializes them; everything
    // else in the file remains parallel-safe.
    //
    // The tests that take this lock recover from poisoning with
    // `unwrap_or_else(|e| e.into_inner())`, so a panic in one
    // env-touching test does not cascade into the others.
    static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());
2199
2200    #[test]
2201    #[cfg(unix)]
2202    fn write_entry_creates_file_with_0600() {
2203        use std::os::unix::fs::PermissionsExt;
2204        let (store, dir) = make_store();
2205        let info = store.generate(true).unwrap();
2206        let mode = fs::metadata(store.entry_path(&info.id))
2207            .unwrap()
2208            .permissions()
2209            .mode()
2210            & 0o777;
2211        assert_eq!(mode, 0o600, "freshly written key file must be 0600, got {:o}", mode);
2212        cleanup(dir);
2213    }
2214
2215    #[test]
2216    #[cfg(unix)]
2217    fn signer_refuses_world_readable_key() {
2218        use std::os::unix::fs::PermissionsExt;
2219        // Mutex prevents the bypass var from being toggled by a
2220        // sibling test mid-flight (cargo test parallel runner).
2221        let _g = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
2222        // Make sure the bypass var is not leaking from the host env.
2223        std::env::remove_var("TREESHIP_ALLOW_INSECURE_KEY_PERMS");
2224
2225        let (store, dir) = make_store();
2226        let info = store.generate(true).unwrap();
2227
2228        // Loosen perms on the key file -- simulates a checkout, scp, or
2229        // shared-volume mishap.
2230        let path = store.entry_path(&info.id);
2231        fs::set_permissions(&path, fs::Permissions::from_mode(0o644)).unwrap();
2232
2233        match store.signer(&info.id) {
2234            Err(KeyError::InsecureKeyPerms { path: p, mode }) => {
2235                assert_eq!(p, path);
2236                assert_eq!(mode & 0o777, 0o644);
2237            }
2238            other => panic!("expected InsecureKeyPerms, got {:?}", other.map(|_| "ok")),
2239        }
2240        cleanup(dir);
2241    }
2242
2243    #[test]
2244    #[cfg(unix)]
2245    fn signer_bypass_via_env_var() {
2246        use std::os::unix::fs::PermissionsExt;
2247        let _g = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
2248        let (store, dir) = make_store();
2249        let info = store.generate(true).unwrap();
2250        let path = store.entry_path(&info.id);
2251        fs::set_permissions(&path, fs::Permissions::from_mode(0o644)).unwrap();
2252
2253        std::env::set_var("TREESHIP_ALLOW_INSECURE_KEY_PERMS", "1");
2254        let result = store.signer(&info.id);
2255        std::env::remove_var("TREESHIP_ALLOW_INSECURE_KEY_PERMS");
2256
2257        assert!(
2258            result.is_ok(),
2259            "bypass env var must allow signing: {:?}",
2260            result.err()
2261        );
2262        cleanup(dir);
2263    }
2264
2265    // --- TS-2026-001 H3 migration-lock concurrency test -----------------
2266
2267    /// H3: two threads calling `Store::signer` on the same legacy v1
2268    /// entry must both succeed, the on-disk entry must end up as a
2269    /// valid v2 entry (decryptable via the v2 path), and no `.tmp`
2270    /// fragment must be left in the keystore directory.
2271    ///
2272    /// Without the advisory lock around `migrate_entry_to_v2`, two
2273    /// concurrent migrators would race the read-modify-rename cycle:
2274    /// the loser's rename would clobber the winner's v2 entry with
2275    /// its own (also-valid) v2 entry, but in between the two
2276    /// renames a third reader could observe a v2 entry, decrypt
2277    /// successfully, then have its in-memory state invalidated by
2278    /// the second writer. The flock turns the race into a queue --
2279    /// both writers produce identical v2 plaintext, only one rename
2280    /// per entry is actually needed, and the second writer's
2281    /// post-lock recheck observes the v2 state and exits cleanly.
2282    #[test]
2283    fn concurrent_migration_serializes_correctly() {
2284        use std::sync::Arc;
2285        use std::thread;
2286
2287        // Set up a legacy v1 entry on disk -- same shape as the
2288        // store_signer_migrates_legacy_entry_to_v2 test, just shared
2289        // with two threads.
2290        let (store, dir) = make_store();
2291        let info = store.generate(true).unwrap();
2292        let entry_path = store.entry_path(&info.id);
2293
2294        let v2_entry: EncryptedEntry =
2295            serde_json::from_slice(&fs::read(&entry_path).unwrap()).unwrap();
2296        let secret = decrypt_from_disk(
2297            &store.machine_key,
2298            &v2_entry.id,
2299            &v2_entry.public_key,
2300            &v2_entry.enc_priv_key,
2301            &v2_entry.nonce,
2302        )
2303            .unwrap();
2304        let (legacy_blob, legacy_nonce) =
2305            legacy_v1_encrypt(&store.machine_key, &secret).unwrap();
2306        let legacy_entry = EncryptedEntry {
2307            id:               v2_entry.id.clone(),
2308            algorithm:        v2_entry.algorithm.clone(),
2309            created_at:       v2_entry.created_at.clone(),
2310            public_key:       v2_entry.public_key.clone(),
2311            enc_priv_key:     legacy_blob,
2312            nonce:            legacy_nonce,
2313            valid_until:      v2_entry.valid_until.clone(),
2314            successor_key_id: v2_entry.successor_key_id.clone(),
2315        };
2316        fs::write(&entry_path, serde_json::to_vec_pretty(&legacy_entry).unwrap()).unwrap();
2317
2318        // Two independent Store instances racing on the same on-disk
2319        // legacy entry. Using independent Store instances forces the
2320        // lock-on-disk path to engage (a shared Store would serialize
2321        // through the internal RwLock cache and we'd be testing the
2322        // wrong thing).
2323        let dir_a = Arc::new(dir.clone());
2324        let dir_b = Arc::new(dir.clone());
2325        let id_a = info.id.clone();
2326        let id_b = info.id.clone();
2327
2328        let h1 = thread::spawn(move || -> Result<(), String> {
2329            let s = Store::open(&*dir_a).map_err(|e| e.to_string())?;
2330            let _signer = s.signer(&id_a).map_err(|e| e.to_string())?;
2331            Ok(())
2332        });
2333        let h2 = thread::spawn(move || -> Result<(), String> {
2334            let s = Store::open(&*dir_b).map_err(|e| e.to_string())?;
2335            let _signer = s.signer(&id_b).map_err(|e| e.to_string())?;
2336            Ok(())
2337        });
2338
2339        h1.join().unwrap().expect("thread 1 signer load must succeed");
2340        h2.join().unwrap().expect("thread 2 signer load must succeed");
2341
2342        // Post-condition: on-disk entry is v2 framed.
2343        let after: EncryptedEntry =
2344            serde_json::from_slice(&fs::read(&entry_path).unwrap()).unwrap();
2345        assert!(
2346            !is_legacy_v1(&after.enc_priv_key),
2347            "post-concurrent-migration entry must be in v2 format"
2348        );
2349        assert_eq!(after.enc_priv_key[0], KEYSTORE_MAGIC);
2350        assert_eq!(after.enc_priv_key[1], KEYSTORE_VERSION_V2);
2351
2352        // v2 decrypts cleanly. Use the post-migration entry's own id +
2353        // pubkey — the migration must have re-encrypted with those bound
2354        // into the AAD, or this assertion would surface a MAC failure.
2355        let dec = decrypt_v2(
2356            &store.machine_key,
2357            &after.id,
2358            &after.public_key,
2359            &after.enc_priv_key,
2360        )
2361            .expect("v2 entry must decrypt cleanly after concurrent migration");
2362        assert_eq!(dec.len(), 32, "decrypted secret must be a 32-byte ed25519 scalar");
2363
2364        // No stale .tmp file left behind.
2365        for entry in fs::read_dir(&dir).unwrap() {
2366            let p = entry.unwrap().path();
2367            assert!(
2368                p.extension().is_none_or(|e| e != "tmp"),
2369                "no .tmp fragment must remain after migration, found: {}",
2370                p.display()
2371            );
2372        }
2373
2374        cleanup(dir);
2375    }
2376
2377    // --- TS-2026-001 H1 + H2 atomic write tests ------------------------
2378
2379    /// H1: a partial failure between writing the tmp file and renaming
2380    /// it into place MUST leave the original on-disk file intact. We
2381    /// simulate the failure by pre-creating a tmp file (so the next
2382    /// write_file_600 would clobber it) and then independently verifying
2383    /// that an already-written key entry remains decryptable even after
2384    /// a fresh write_file_600 fails partway.
2385    ///
2386    /// We exercise the failure path by pointing the rename at an
2387    /// unwritable target. On Unix we make the *parent directory*
2388    /// read-only after the original key is in place, which causes the
2389    /// final fs::rename to fail with EACCES. The original key file is
2390    /// unaffected because rename(2) returns before touching the target.
2391    #[test]
2392    #[cfg(unix)]
2393    fn atomic_write_leaves_original_intact_on_partial_failure() {
2394        use std::os::unix::fs::PermissionsExt;
2395        let (store, dir) = make_store();
2396        let info = store.generate(true).unwrap();
2397        let entry_path = store.entry_path(&info.id);
2398
2399        // Capture the original bytes for byte-identity comparison.
2400        let original = fs::read(&entry_path).expect("entry file must exist");
2401        assert!(!original.is_empty(), "freshly generated entry must be non-empty");
2402
2403        // Lock the directory: read+execute only, no write. fs::rename
2404        // into this directory will fail.
2405        let orig_dir_mode = fs::metadata(&dir).unwrap().permissions().mode() & 0o777;
2406        fs::set_permissions(&dir, fs::Permissions::from_mode(0o500)).unwrap();
2407
2408        // Attempt a fresh write to the SAME path -- must fail because
2409        // the directory is read-only, exercising the rename-failure
2410        // branch.
2411        let res = write_file_600(&entry_path, b"new junk that must not land");
2412        assert!(res.is_err(), "write_file_600 must fail when dir is read-only");
2413
2414        // Restore perms so we can read back the entry.
2415        fs::set_permissions(&dir, fs::Permissions::from_mode(orig_dir_mode)).unwrap();
2416
2417        // The original key file must be byte-identical to what we
2418        // captured before the failed write.
2419        let after = fs::read(&entry_path).expect("entry file must still exist after failed write");
2420        assert_eq!(
2421            after, original,
2422            "failed atomic write must not corrupt the original file",
2423        );
2424
2425        // And the keystore must still produce a working signer from it.
2426        let store2 = Store::open(&dir).unwrap();
2427        let signer = store2
2428            .signer(&info.id)
2429            .expect("original key must still decrypt after a failed write");
2430        let pae = crate::attestation::pae("text/plain", b"survive");
2431        assert_eq!(signer.sign(&pae).unwrap().len(), 64);
2432
2433        // No stale tmp file left behind.
2434        let tmp = entry_path.with_extension("tmp");
2435        assert!(!tmp.exists(), "tmp file must be cleaned up after rename failure");
2436
2437        cleanup(dir);
2438    }
2439
2440    /// H2: the entry file's mode is 0o600 at the moment of creation, set
2441    /// via OpenOptionsExt::mode rather than a post-write set_permissions
2442    /// (which had a tiny window of looser perms). Also confirms the tmp
2443    /// file is removed by the rename.
2444    #[test]
2445    #[cfg(unix)]
2446    fn mode_is_600_at_creation() {
2447        use std::os::unix::fs::PermissionsExt;
2448        let (store, dir) = make_store();
2449        let info = store.generate(true).unwrap();
2450        let entry_path = store.entry_path(&info.id);
2451
2452        let mode = fs::metadata(&entry_path).unwrap().permissions().mode() & 0o777;
2453        assert_eq!(mode, 0o600, "entry file must be 0600 at creation, got {:o}", mode);
2454
2455        let tmp = entry_path.with_extension("tmp");
2456        assert!(
2457            !tmp.exists(),
2458            "no .tmp file must be left behind after a successful atomic write"
2459        );
2460
2461        cleanup(dir);
2462    }
2463
2464    #[test]
2465    #[cfg(unix)]
2466    fn fix_perms_repairs_loose_modes() {
2467        use std::os::unix::fs::PermissionsExt;
2468        let (store, dir) = make_store();
2469        let info = store.generate(true).unwrap();
2470        let key_path = store.entry_path(&info.id);
2471
2472        fs::set_permissions(&dir, fs::Permissions::from_mode(0o755)).unwrap();
2473        fs::set_permissions(&key_path, fs::Permissions::from_mode(0o644)).unwrap();
2474
2475        let changes = store.fix_perms().unwrap();
2476        // dir + key file + manifest = 3 paths to fix (manifest may already be 0600
2477        // depending on Manifest write path; we only assert the loose ones moved).
2478        assert!(
2479            changes.iter().any(|(p, _, _)| p == &dir),
2480            "dir should be repaired"
2481        );
2482        assert!(
2483            changes.iter().any(|(p, _, _)| p == &key_path),
2484            "key file should be repaired"
2485        );
2486
2487        let dir_mode = fs::metadata(&dir).unwrap().permissions().mode() & 0o777;
2488        let key_mode = fs::metadata(&key_path).unwrap().permissions().mode() & 0o777;
2489        assert_eq!(dir_mode, 0o700);
2490        assert_eq!(key_mode, 0o600);
2491
2492        // After repair, signing must work again.
2493        store.signer(&info.id).expect("signing must work after fix_perms");
2494
2495        cleanup(dir);
2496    }
2497
2498    // --- TS-2026-001 post-merge fix-up: entry-binding AAD ------------------
2499
2500    /// Post-merge audit fix: the v2 AAD now binds entry id + public key
2501    /// into the GCM tag. Without that binding, a local attacker with
2502    /// write access to ~/.treeship/keys/ could copy entry A's
2503    /// `enc_priv_key` ciphertext into entry B's JSON envelope; the
2504    /// decrypt would succeed (same machine key, same framing-only AAD)
2505    /// and the signer for advertised key id A would silently sign with
2506    /// key B's secret scalar.
2507    ///
2508    /// This test performs exactly that swap and asserts decryption now
2509    /// fails. Before the fix this test would silently pass with the
2510    /// wrong scalar -- a true regression guard.
2511    #[test]
2512    fn cross_entry_swap_fails_decryption() {
2513        let (store, dir) = make_store();
2514
2515        // Two independent keys in the same store, same machine key.
2516        let a = store.generate(true).unwrap();
2517        let b = store.generate(false).unwrap();
2518
2519        // Snapshot both on-disk envelopes.
2520        let path_a = store.entry_path(&a.id);
2521        let path_b = store.entry_path(&b.id);
2522        let entry_a: EncryptedEntry =
2523            serde_json::from_slice(&fs::read(&path_a).unwrap()).unwrap();
2524        let entry_b: EncryptedEntry =
2525            serde_json::from_slice(&fs::read(&path_b).unwrap()).unwrap();
2526
2527        // Sanity: both are v2 framed, and the ciphertexts differ.
2528        assert_eq!(entry_a.enc_priv_key[0], KEYSTORE_MAGIC);
2529        assert_eq!(entry_a.enc_priv_key[1], KEYSTORE_VERSION_V2);
2530        assert_eq!(entry_b.enc_priv_key[0], KEYSTORE_MAGIC);
2531        assert_eq!(entry_b.enc_priv_key[1], KEYSTORE_VERSION_V2);
2532        assert_ne!(
2533            entry_a.enc_priv_key, entry_b.enc_priv_key,
2534            "two freshly-generated entries must have distinct ciphertexts"
2535        );
2536
2537        // The attack: copy B's enc_priv_key into A's envelope. Leave
2538        // everything else (id, public_key, algorithm) as it was in A.
2539        // This is the file an attacker with write access to the keys
2540        // directory would produce.
2541        let mut tampered_a = entry_a.clone();
2542        tampered_a.enc_priv_key = entry_b.enc_priv_key.clone();
2543        // The v2 nonce travels inline with the ciphertext (bytes
2544        // [2..14] of enc_priv_key), so swapping the blob also swaps
2545        // the nonce; the separate JSON `nonce` field is empty for v2
2546        // entries either way.
2547        fs::write(&path_a, serde_json::to_vec_pretty(&tampered_a).unwrap()).unwrap();
2548
2549        // Fresh Store so the in-memory cache doesn't paper over the
2550        // on-disk tamper.
2551        let store2 = Store::open(&dir).unwrap();
2552        let err = match store2.signer(&a.id) {
2553            Ok(_) => panic!(
2554                "swapping B's ciphertext into A's envelope must fail decrypt; \
2555                 got Ok which means the signer would silently sign with key B"
2556            ),
2557            Err(e) => e,
2558        };
2559
2560        // The specific error must be a crypto/MAC failure, not (e.g.)
2561        // a NotFound or InsecureKeyPerms surface that could mask the
2562        // class of bug.
2563        match err {
2564            KeyError::Crypto(msg) => assert!(
2565                msg.contains("MAC verification failed"),
2566                "swap must surface MAC failure; got: {msg}"
2567            ),
2568            other => panic!("expected Crypto MAC error, got: {other:?}"),
2569        }
2570
2571        cleanup(dir);
2572    }
2573
2574    /// Companion to `cross_entry_swap_fails_decryption`: the id field
2575    /// is also bound into the AAD, so editing the JSON `id` while
2576    /// leaving the ciphertext alone must also fail. (An attacker who
2577    /// renames a stolen entry file onto a victim's id without
2578    /// re-encrypting would land here.)
2579    #[test]
2580    fn aad_tampered_entry_id_fails_decryption() {
2581        let (store, dir) = make_store();
2582        let info = store.generate(true).unwrap();
2583        let path = store.entry_path(&info.id);
2584
2585        let mut entry: EncryptedEntry =
2586            serde_json::from_slice(&fs::read(&path).unwrap()).unwrap();
2587        assert_eq!(entry.id, info.id, "sanity: id matches what generate returned");
2588
2589        // Pretend the attacker forged an id. Note we write this back to
2590        // the SAME file path so Store::load_entry by the original id
2591        // finds it; if we changed the path too we'd just be testing
2592        // NotFound, which isn't the point.
2593        entry.id = "key_attacker_substituted_id".to_string();
2594        fs::write(&path, serde_json::to_vec_pretty(&entry).unwrap()).unwrap();
2595
2596        // Fresh Store so cache doesn't paper this over. Load via the
2597        // tampered id (matching what's in the JSON) so we exercise the
2598        // decrypt path rather than a path-vs-id mismatch.
2599        let store2 = Store::open(&dir).unwrap();
2600        // Drop the cache by opening fresh; load by the on-disk id.
2601        // The entry_path for "key_attacker_substituted_id" doesn't
2602        // exist, so we deliberately call the lower-level read by
2603        // path-of-original and assert decrypt fails via the dispatcher.
2604        // Easiest: bypass entry_path and invoke decrypt_from_disk with
2605        // the tampered id directly.
2606        let key_buf = store2.machine_key;
2607        let result = decrypt_from_disk(
2608            &key_buf,
2609            &entry.id,          // tampered id (bound into AAD)
2610            &entry.public_key,  // original pubkey
2611            &entry.enc_priv_key,
2612            &entry.nonce,
2613        );
2614        assert!(
2615            result.is_err(),
2616            "AAD-bound entry id mismatch must fail decrypt; got Ok"
2617        );
2618
2619        cleanup(dir);
2620    }
2621}
treeship_core/keys/mod.rs

treeship_core/keys/
mod.rs