Skip to main content

trusty_memory/
lib.rs

1//! MCP server (HTTP/SSE + stdio) for trusty-memory.
2//!
3//! Why: Claude Code and other MCP-aware clients integrate with trusty-memory
4//! through the standardized Model Context Protocol; we expose memory + KG
5//! tools so they can be called by name. The canonical stdio integration is
6//! `trusty-memory serve --stdio` (PR1 #919 of the #914 cutover epic), a
7//! self-contained direct MCP server that binds no HTTP port or UDS socket.
8//! The former `trusty-memory-mcp-bridge` binary and Unix-domain-socket
9//! transport were removed in PR3 (#914) once `serve --stdio` made them dead
10//! code.
11//! What: Provides `run_http` / `run_http_dynamic` / `run_http_on` (axum
12//! HTTP/SSE + REST + UI) plus an `AppState` that carries the shared
13//! `PalaceRegistry`, on-disk data root, and a lazily-initialized embedder.
14//! Test: `cargo test -p trusty-memory` validates handshake + dispatch via
15//! the in-process `handle_message` unit tests and the
16//! `tests/serve_stdio_e2e.rs` end-to-end harness.
17
18use anyhow::Result;
19use serde_json::{json, Value};
20use std::net::SocketAddr;
21use std::path::{Path, PathBuf};
22use std::sync::atomic::{AtomicU8, AtomicUsize, Ordering};
23use std::sync::{Arc, OnceLock};
24use tokio::sync::{broadcast, OnceCell, RwLock};
25use trusty_common::bm25_client::Bm25Client;
26use trusty_common::mcp::initialize_response;
27use trusty_common::memory_core::embed::FastEmbedder;
28use trusty_common::memory_core::store::ChatSessionStore;
29use trusty_common::memory_core::PalaceRegistry;
30use trusty_common::ChatProvider;
31
// Why: `tracing::info` is only used by the axum HTTP-serving helpers
//      (e.g. `run_http_on`). Pulling it in unconditionally
//      would trigger `unused_imports` warnings when the `axum-server`
//      feature is disabled. `SocketAddr` is still used by `bound_addr` on
//      `AppState` so it stays unconditional.
37#[cfg(feature = "axum-server")]
38use tracing::info;
39
40/// Two-phase daemon readiness state (issues #910/#911).
41///
42/// Why: The embedder cold-init (CoreML compile, 30-120 s) blocks the first
43/// real `memory_remember`/`memory_recall` call if it arrives before warm-up
44/// completes.  Advertising the state lets handlers return an explicit, fast
45/// error ("daemon is warming up, retry shortly") instead of blocking for
46/// minutes.
47/// What: Two stable values stored atomically.  `Warming` (0) is the initial
48/// state; `Ready` (1) is set once the embedder has been successfully
49/// initialised by `spawn_startup_tasks`.  The transition is one-way and
50/// lock-free: a single `AtomicU8` compare-and-swap.
51/// Test: `daemon_readiness_transitions_warming_to_ready` in this module;
52///       end-to-end warming-error path covered by
53///       `tools::tests::remember_returns_warming_error_while_state_is_warming`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DaemonReadiness {
    /// Embedder cold-init (and/or pin scan) still in progress.
    Warming = 0,
    /// Embedder initialised; all handlers may proceed normally.
    Ready = 1,
}

impl DaemonReadiness {
    /// Map a raw readiness byte back onto the enum.
    ///
    /// Why: keeping the decode rule in one place means callers work with a
    /// meaningful enum value instead of comparing raw integers themselves.
    /// What: `0` decodes to `Warming`; every other value decodes to `Ready`
    /// (the only non-zero discriminant declared on the enum is `1`).
    /// Test: `daemon_readiness_from_u8` in this module.
    pub fn from_u8(v: u8) -> Self {
        match v {
            0 => Self::Warming,
            _ => Self::Ready,
        }
    }
}
78
79pub mod activity;
80pub mod attribution;
81pub mod bm25_supervisor;
82pub mod bootstrap;
83/// File-descriptor usage and limit reporting for `/health`.
84///
85/// Why: expose `open_fds` / `fd_soft_limit` so operators can see the fd
86/// ceiling and current consumption without needing lsof or shell access.
87/// Test: `fd_metrics::tests::fd_metrics_returns_sane_values`.
88pub mod fd_metrics;
89// Why (issue #226): `chat` and `web` are pure axum HTTP/SSE handler
90//      surfaces. Gating them behind the `axum-server` feature is what lets
91//      library consumers (e.g. `open-mpm` linking only `MemoryMcpService`)
92//      drop axum + tower-http entirely from their build graph.
93#[cfg(feature = "axum-server")]
94pub mod chat;
95pub mod commands;
96pub mod console_metrics;
97pub mod discovery;
98/// Supervised `serve --foreground` entry point (issue #787).
99///
100/// Why: launchd supervisors need loud failure on port collision, not silent
101/// port-walking to 7071+. Extracted to stay under the 500-line ratchet cap.
102/// What: exports `bind_foreground_port` (Fix C — abort on EADDRINUSE) and
103/// `run_http_foreground` (Fix A lock + Fix B http_addr + Fix C combined).
104/// Test: `foreground::tests::bind_foreground_port_refuses_collision`;
105/// `daemon_lock` module tests cover the lock-file logic.
106pub mod foreground;
107pub mod hook_emit;
108pub mod kg_extract;
109pub mod mcp_service;
110pub mod messaging;
111pub mod openrpc;
112/// Issue #88: project-root detection and palace-slug enforcement.
113///
114/// Why: prevents unbounded palace creation by anchoring palace names to the
115/// canonical slug of the project directory that contains the CWD, or to the
116/// `personal` sentinel for non-project contexts.
117/// What: exports `find_project_root`, `project_slug_at`, `project_slug`,
118/// `validate_palace_name`, `PERSONAL_PALACE`, and `PROJECT_MARKERS`.
119/// Test: see unit tests inside this module.
120pub mod project_root;
121pub mod prompt_facts;
122pub mod prompt_log;
123pub mod service;
124pub mod startup_scan;
125pub mod tools;
126pub mod transport;
127#[cfg(feature = "axum-server")]
128pub mod web;
129
130pub use activity::{ActivityEntry, ActivityFilter, ActivityLog, ActivitySource};
131pub use attribution::{CreatorInfo, CreatorSource};
132
/// Maximum number of characters retained in the trigger-prompt excerpt
/// embedded on a `HookFired` event.
///
/// Why: the full triggering prompt is sensitive and already lives in the
/// JSONL prompt log; the activity feed only needs enough text to give an
/// operator a glance — a single-line ~80 char preview matches the existing
/// `drawer_content_preview` convention so dashboard rows render uniformly.
/// What: 80 characters (the limit is applied by `hook_prompt_excerpt` in
/// `char`s, not bytes); longer prompts are truncated with a trailing `…`.
/// Test: `hook_excerpt_truncates_long_prompts`.
pub const HOOK_PROMPT_EXCERPT_CHARS: usize = 80;
143
144/// Reduce a triggering prompt to the short excerpt embedded on a
145/// `HookFired` activity event.
146///
147/// Why: see [`HOOK_PROMPT_EXCERPT_CHARS`]. Centralising the truncation rule
148/// keeps every emitter (HTTP, hook CLI handlers, future tests) producing
149/// the same preview shape so UI rendering is uniform.
150/// What: whitespace-collapses `prompt` and trims to
151/// [`HOOK_PROMPT_EXCERPT_CHARS`] chars with `…` when cut. Empty input
152/// returns an empty string.
153/// Test: `hook_excerpt_truncates_long_prompts`,
154/// `hook_excerpt_collapses_whitespace`.
155pub fn hook_prompt_excerpt(prompt: &str) -> String {
156    let normalised: String = prompt.split_whitespace().collect::<Vec<_>>().join(" ");
157    if normalised.chars().count() <= HOOK_PROMPT_EXCERPT_CHARS {
158        normalised
159    } else {
160        let kept: String = normalised
161            .chars()
162            .take(HOOK_PROMPT_EXCERPT_CHARS.saturating_sub(1))
163            .collect();
164        format!("{kept}…")
165    }
166}
167
168pub use mcp_service::MemoryMcpService;
169pub use tools::MemoryMcpServer;
170
/// Pick the directory whose children are the per-palace subdirectories.
///
/// Why: two on-disk layouts exist. The current monorepo code treats the
/// registry directory *itself* as the parent of per-palace dirs
/// (`<dir>/<id>/palace.json`), while the legacy standalone `trusty-memory`
/// repo nested everything one level deeper under a `palaces/` subdirectory
/// (`<data_dir>/palaces/<id>/palace.json`) — which is where existing
/// installs keep their data (e.g. 88 palaces under
/// `~/Library/Application Support/trusty-memory/palaces/`). A daemon that
/// uses the bare data dir as its registry root then finds zero palaces,
/// because every `palace.json` sits one level below where it looked — the
/// "palaces lost on restart" bug.
/// What: takes the standard data dir and returns `<data_dir>/palaces` when
/// that path is an existing directory; in every other case `data_dir` is
/// handed back unchanged. Resolving this once in `main.rs` and using the
/// result as `AppState::data_root` keeps every call site (`status`,
/// `palace_list`, `open_palace`, `palace_create`, `load_palaces_from_disk`)
/// consistent without forcing a data migration.
/// Test: `tests::resolve_palace_registry_dir_prefers_palaces_subdir` and
/// `resolve_palace_registry_dir_falls_back_to_data_dir`.
pub fn resolve_palace_registry_dir(data_dir: PathBuf) -> PathBuf {
    let legacy_layout = data_dir.join("palaces");
    if !legacy_layout.is_dir() {
        // No nested `palaces/` directory: the data dir is the registry root.
        return data_dir;
    }
    legacy_layout
}
198
/// Hook type — labels the Claude Code hook that triggered a submission.
///
/// Why: every hook firing produces an activity-feed entry tagged with the
/// originating hook so operators can tell whether activity came from a user
/// prompt (`UserPromptSubmit`), a new session (`SessionStart`), or a future
/// hook variant. Threading this through `DaemonEvent::HookFired` lets the
/// dashboard badge each row with the hook label.
/// What: serde-serialised in PascalCase so the wire format matches Claude
/// Code's own hook-name strings exactly (e.g. `"UserPromptSubmit"`). No
/// `rename_all` attribute is applied, so the variant identifiers themselves
/// are the serialised names — renaming a variant changes the wire format.
/// Test: `hook_type_serde_round_trips`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum HookType {
    /// Claude Code's `UserPromptSubmit` hook — fires on every user prompt.
    UserPromptSubmit,
    /// Claude Code's `SessionStart` hook — fires once at session open.
    SessionStart,
}
216
217impl HookType {
218    /// Stable string label used for the wire format.
219    pub fn as_str(&self) -> &'static str {
220        match self {
221            Self::UserPromptSubmit => "UserPromptSubmit",
222            Self::SessionStart => "SessionStart",
223        }
224    }
225}
226
/// Injection kind — labels what the hook actually injected (or attempted).
///
/// Why: distinct from `HookType` because one hook could in principle render
/// more than one kind of injection (e.g. SessionStart can deliver both an
/// inbox check and bootstrap context). Tagging the rendered kind explicitly
/// keeps the activity log searchable when that fan-out lands.
/// What: serde-serialised as kebab-case so it matches the labels already
/// used in the JSONL prompt log (`prompt-context-facts`,
/// `inbox-check-messages`).
/// NOTE(review): with `rename_all = "kebab-case"` the variants serialise as
/// `prompt-context` and `inbox-check` (the same strings `as_str` returns),
/// which are not literally the two log labels quoted above — confirm
/// whether the log labels or this wording is the intended reference.
/// Test: `injection_kind_serde_round_trips`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "kebab-case")]
pub enum InjectionKind {
    /// `prompt-context` hook rendered the prompt-facts block.
    PromptContext,
    /// `inbox-check` hook delivered unread messages.
    InboxCheck,
}
245
246impl InjectionKind {
247    /// Stable string label used for the wire format.
248    pub fn as_str(&self) -> &'static str {
249        match self {
250            Self::PromptContext => "prompt-context",
251            Self::InboxCheck => "inbox-check",
252        }
253    }
254}
255
/// Live daemon events broadcast to connected SSE subscribers.
///
/// Why: The dashboard needs push-driven updates so palace creation, drawer
/// add/delete, dream cycles, and aggregate status changes are visible without
/// polling. A single broadcast channel fans out to every connected browser.
/// What: Tagged enum serialized as `{"type": "...", ...fields}` over SSE.
/// The `type` tag is the snake_case variant name (`rename_all`), i.e. the
/// same strings `DaemonEvent::type_str` returns.
/// NOTE(review): this enum derives only `serde::Serialize`, while several
/// fields carry `#[serde(default)]`, which is a deserialisation-side
/// attribute — on this type it appears to have no effect. Confirm whether
/// a mirrored `Deserialize` type on the consumer side is what actually
/// provides the "missing field tolerated" behaviour the field docs mention.
/// Test: `web::tests::sse_stream_emits_events` subscribes, triggers a
/// mutation, and asserts the frame arrives.
#[derive(Clone, Debug, serde::Serialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum DaemonEvent {
    /// A palace was created (`type = "palace_created"`).
    PalaceCreated {
        /// Palace id; this is what `DaemonEvent::palace_id` returns for
        /// this variant.
        id: String,
        name: String,
        /// Originating subsystem (HTTP, MCP, Hook). Why (issue #96): the
        /// UI badges each row with its source so operators can tell at a
        /// glance whether a write came from the dashboard form, an MCP
        /// tool call, or a hook-driven path. The wire-format key is
        /// `source` (lower-case strings via serde rename_all on
        /// `ActivitySource`).
        source: ActivitySource,
    },
    /// A drawer was added to a palace (`type = "drawer_added"`).
    DrawerAdded {
        palace_id: String,
        /// Friendly palace name (Palace.name) at write time. Why: lets SSE
        /// consumers (the dashboard activity feed) render the human-readable
        /// label without a separate id→name lookup. Empty string if the
        /// emitter could not resolve the name.
        #[serde(default)]
        palace_name: String,
        drawer_count: usize,
        /// Wall-clock timestamp when the drawer was added. Why: SSE
        /// receivers want to render "just now / 2m ago" relative to the
        /// daemon's clock, not the time the SSE frame happens to arrive.
        timestamp: chrono::DateTime<chrono::Utc>,
        /// Short preview of the drawer's content (whitespace-collapsed,
        /// truncated to ~80 chars with an ellipsis when cut). Why: the TUI
        /// activity feed and dashboard ticker want to show *what* was
        /// stored, not just the running drawer count. Empty when the
        /// emitter could not resolve the content (legacy clients tolerate
        /// the missing field via `#[serde(default)]`).
        #[serde(default)]
        content_preview: String,
        /// Originating subsystem (issue #96).
        source: ActivitySource,
    },
    /// A drawer was deleted from a palace (`type = "drawer_deleted"`).
    DrawerDeleted {
        palace_id: String,
        drawer_count: usize,
        /// Originating subsystem (issue #96).
        source: ActivitySource,
    },
    /// A dream cycle finished (`type = "dream_completed"`).
    DreamCompleted {
        /// Palace the cycle was scoped to; `None` is treated by
        /// `DaemonEvent::palace_id` as "not scoped to a single palace".
        palace_id: Option<String>,
        merged: usize,
        pruned: usize,
        compacted: usize,
        closets_updated: usize,
        duration_ms: u64,
        /// Originating subsystem (issue #96).
        source: ActivitySource,
    },
    /// Aggregate totals changed (`type = "status_changed"`). This is the
    /// only variant without a `source` or a palace id.
    StatusChanged {
        total_drawers: usize,
        total_vectors: usize,
        total_kg_triples: usize,
    },
    /// A Claude Code hook completed and rendered (or attempted to render) an
    /// injection block.
    ///
    /// Why: before this variant existed the activity feed only fired on
    /// drawer / palace / dream writes, which meant a normal Claude Code
    /// session — whose only daemon traffic is hook invocations — left the
    /// feed empty. Surfacing every hook firing answers the user complaint
    /// "no activity in the TUI" and gives operators a way to see how often
    /// each project palace is actually picking up prompt-context /
    /// inbox-check work.
    /// What: carries the resolved palace (or `None` if cwd resolution
    /// failed), the [`HookType`] label, the [`InjectionKind`] label, the
    /// rendered injection byte length, a short excerpt of the triggering
    /// prompt (capped at ~80 chars; the full content stays in the JSONL
    /// prompt log only), the timestamp, the hook's wall-clock duration,
    /// and the [`ActivitySource`] tag (always `Hook` for this variant).
    /// Backwards-compatible: SSE clients that do not recognise the
    /// `hook_fired` `type` tag can safely ignore the frame.
    HookFired {
        /// Resolved palace id (slug) — `None` if cwd resolution failed.
        #[serde(default)]
        palace_id: Option<String>,
        /// Friendly palace name at hook time — `None` if the registry
        /// could not be consulted (HTTP path uses `palace_id` here when
        /// no separate name is known).
        #[serde(default)]
        palace_name: Option<String>,
        hook_type: HookType,
        injection_kind: InjectionKind,
        /// Rendered injection size in bytes (`0` when no injection was
        /// emitted, e.g. SessionStart with an empty inbox).
        injection_length: u64,
        /// Short excerpt of the triggering prompt for the activity feed
        /// display. Capped at ~80 chars with a trailing `…` when cut.
        /// Why: the activity feed renders this directly; full prompt
        /// content (which may be sensitive) stays in the JSONL log.
        #[serde(default)]
        trigger_prompt_excerpt: String,
        timestamp: chrono::DateTime<chrono::Utc>,
        /// Hook wall-clock duration in milliseconds.
        duration_ms: u64,
        /// Always `ActivitySource::Hook` for this variant; encoded explicitly
        /// so the same dispatch path (`emit`) can persist + broadcast it.
        source: ActivitySource,
    },
}
368
369/// Open the activity log under `data_root`, falling back to a per-process
370/// tempdir and finally to a no-op `Discard` variant when no writable
371/// directory is available.
372///
373/// Why (issues #96, #225): the activity log is a best-effort feature — if
374/// the data root is on a read-only mount, missing, or locked by another
375/// process, the daemon should still come up and serve every other endpoint.
376/// The first fallback is a `std::env::temp_dir()`-anchored subdirectory
377/// keyed by the daemon's process id. Issue #225: a previous version called
378/// `expect()` on the tempdir fallback, which crashed the daemon on hosts
379/// where neither `data_root` nor `std::env::temp_dir()` is writable
380/// (read-only containers, locked-down sandboxes). The contract is
381/// "best-effort", so the final fallback is now `ActivityLog::discard()` —
382/// a no-op variant that drops every append and returns empty reads. The
383/// dashboard's activity feed simply shows up empty in that degraded state.
384/// What: tries `ActivityLog::open(data_root)`; on error logs a warning and
385/// retries against `<temp>/trusty-memory-activity-<pid>/`. If both fail,
386/// emits a final warning and returns `ActivityLog::discard()`.
387/// Test: `open_activity_log_with_fallback_returns_discard_when_unwritable`
388/// covers the discard branch; existing `AppState` construction tests cover
389/// the happy and tempdir-fallback paths.
390fn open_activity_log_with_fallback(data_root: &Path) -> Arc<ActivityLog> {
391    match ActivityLog::open(data_root) {
392        Ok(log) => Arc::new(log),
393        Err(primary_err) => {
394            tracing::warn!(
395                "could not open activity log at {}: {primary_err:#}; falling back to per-process tempdir",
396                data_root.display()
397            );
398            let fallback =
399                std::env::temp_dir().join(format!("trusty-memory-activity-{}", std::process::id()));
400            match ActivityLog::open(&fallback) {
401                Ok(log) => Arc::new(log),
402                Err(fallback_err) => {
403                    tracing::warn!(
404                        "activity log tempdir fallback at {} also failed: {fallback_err:#}; \
405                         activity feed disabled for this process (no-op log)",
406                        fallback.display()
407                    );
408                    Arc::new(ActivityLog::discard())
409                }
410            }
411        }
412    }
413}
414
415impl DaemonEvent {
416    /// Short discriminant label matching the SSE `type` field.
417    ///
418    /// Why: the persisted activity log stores `event_type` as a string so
419    /// the UI can render the row without re-parsing the payload. Sharing
420    /// the same labels the SSE serializer uses keeps the wire and the
421    /// stored history consistent.
422    /// What: returns one of `palace_created`, `drawer_added`,
423    /// `drawer_deleted`, `dream_completed`, `status_changed`.
424    /// Test: `daemon_event_type_str_matches_sse_tag` in the lib tests.
425    pub fn type_str(&self) -> &'static str {
426        match self {
427            Self::PalaceCreated { .. } => "palace_created",
428            Self::DrawerAdded { .. } => "drawer_added",
429            Self::DrawerDeleted { .. } => "drawer_deleted",
430            Self::DreamCompleted { .. } => "dream_completed",
431            Self::StatusChanged { .. } => "status_changed",
432            Self::HookFired { .. } => "hook_fired",
433        }
434    }
435
436    /// `palace_id` if the event is scoped to a single palace.
437    ///
438    /// Why: the activity log indexes entries by palace id so the UI can
439    /// filter by palace; daemon-wide events (`status_changed`,
440    /// dream-across-all-palaces) return `None`.
441    /// What: returns a borrowed string when the variant carries a palace
442    /// id, otherwise `None`.
443    /// Test: `daemon_event_palace_id_extraction`.
444    pub fn palace_id(&self) -> Option<&str> {
445        match self {
446            Self::PalaceCreated { id, .. } => Some(id),
447            Self::DrawerAdded { palace_id, .. } | Self::DrawerDeleted { palace_id, .. } => {
448                Some(palace_id)
449            }
450            Self::DreamCompleted { palace_id, .. } => palace_id.as_deref(),
451            Self::HookFired { palace_id, .. } => palace_id.as_deref(),
452            Self::StatusChanged { .. } => None,
453        }
454    }
455
456    /// Originating subsystem if the event carries one.
457    ///
458    /// Why: only mutation events carry a `source`; the aggregate
459    /// `StatusChanged` is recomputed by the daemon and has no caller, so
460    /// it returns `None`.
461    /// What: returns the variant's `source` field where present.
462    /// Test: `daemon_event_source_extraction`.
463    pub fn source(&self) -> Option<ActivitySource> {
464        match self {
465            Self::PalaceCreated { source, .. }
466            | Self::DrawerAdded { source, .. }
467            | Self::DrawerDeleted { source, .. }
468            | Self::DreamCompleted { source, .. }
469            | Self::HookFired { source, .. } => Some(*source),
470            Self::StatusChanged { .. } => None,
471        }
472    }
473}
474
475/// Shared application state passed to every request handler.
476///
477/// Why: The stdio loop and HTTP server need the same handles to the registry,
478/// data root, and embedder so MCP tools can perform real reads/writes against
479/// the live trusty-memory core. The embedder is heavy (loads ONNX weights) so
480/// we hold it behind a `OnceCell` and initialize lazily on first use.
481/// What: `Clone`-able via `Arc` fields. The registry / data root are eager;
482/// `embedder` is `Arc<OnceCell<Arc<FastEmbedder>>>` so concurrent first-use
483/// races resolve to a single shared instance.
484/// Test: `app_state_default_constructs` confirms construction without panic.
485#[derive(Clone)]
486pub struct AppState {
487    pub version: String,
488    pub registry: Arc<PalaceRegistry>,
489    pub data_root: PathBuf,
490    pub embedder: Arc<OnceCell<Arc<FastEmbedder>>>,
491    /// Optional default palace applied to MCP tool calls when the caller
492    /// omits the `palace` argument. Set via `trusty-memory serve --palace`.
493    pub default_palace: Option<String>,
494    /// Active chat provider selected at startup. `None` means no upstream is
495    /// configured (no Ollama detected and no OpenRouter key) — callers must
496    /// degrade gracefully (chat endpoint returns 412).
497    pub chat_provider: Arc<OnceCell<Option<Arc<dyn ChatProvider>>>>,
498    /// Per-palace chat-session stores, opened lazily so cold-start cost is
499    /// paid only when chat-history endpoints are hit.
500    pub session_stores: Arc<dashmap::DashMap<String, Arc<ChatSessionStore>>>,
501    /// Broadcast sender for live `DaemonEvent` pushes to SSE subscribers.
502    ///
503    /// Why: Lets mutating handlers emit events that any connected dashboard
504    /// receives instantly. Cap of 128 buffers transient slow readers; if a
505    /// receiver lags it gets `RecvError::Lagged` and we emit a `lag` frame.
506    pub events: Arc<broadcast::Sender<DaemonEvent>>,
507    /// Instant the daemon started, used to compute `uptime_secs` on `/health`.
508    ///
509    /// Why (issue #35): `GET /health` reports how long the daemon has been
510    /// up. Capturing a monotonic `Instant` at `AppState` construction lets the
511    /// handler compute the elapsed seconds cheaply and without a clock-skew
512    /// hazard.
513    /// What: a wall-monotonic `Instant`; `AppState::new` stamps it at startup.
514    /// Test: `health_endpoint_includes_resource_fields`.
515    pub started_at: std::time::Instant,
516    /// In-memory ring buffer of recent tracing log lines (issue #35).
517    ///
518    /// Why: the `GET /api/v1/logs/tail` endpoint serves the last N log lines
519    /// so operators can inspect a running daemon without tailing a file. The
520    /// buffer is shared between the tracing `LogBufferLayer` (writer) and the
521    /// HTTP handler (reader).
522    /// What: a cheap `Arc`-backed clone of the buffer the subscriber writes
523    /// to. Defaults to an empty buffer for states that never install the
524    /// layer (tests, the stdio path).
525    /// Test: `logs_tail_returns_recent_lines`.
526    pub log_buffer: trusty_common::log_buffer::LogBuffer,
527    /// Bug-capture ERROR store (bug-reporting #478, Phase 1).
528    ///
529    /// Why: Phase 2 MCP / HTTP endpoints need to query captured errors; stashing
530    ///      the `ErrorStore` handle here lets any handler reach it cheaply without
531    ///      a second global or per-request construction.
532    /// What: populated by `run_serve` from the `init_tracing_with_buffer_and_capture`
533    ///      result; the layer writes to this store automatically so every
534    ///      `tracing::error!` call site contributes without any changes to call
535    ///      sites. `None` in states that do not install the layer (tests, the
536    ///      stdio path).
537    /// Test: compile-presence is verified by the `trusty-memory` build; Phase 2
538    ///      will add query tests in `web.rs`.
539    pub error_store: Option<trusty_common::error_capture::ErrorStore>,
540    /// Most recent on-disk footprint of `data_root`, in bytes (issue #35).
541    ///
542    /// Why: `GET /health` reports `disk_bytes`. Walking the data directory on
543    /// every health request would make a frequent health poll do unbounded
544    /// I/O; a background task recomputes it every 10 s and stores it here so
545    /// the handler reads it lock-free.
546    /// What: an `AtomicU64` updated by the ticker spawned in `run_http_on`.
547    /// `0` until the first walk completes.
548    /// Test: `health_endpoint_includes_resource_fields`.
549    pub disk_bytes: Arc<std::sync::atomic::AtomicU64>,
550    /// Per-process RSS + CPU sampler, refreshed on each `/health` request
551    /// (issue #35).
552    ///
553    /// Why: CPU usage is a delta between two `sysinfo` refreshes, so the
554    /// sampler must persist between requests — hence the shared `Mutex`.
555    /// What: a `tokio::sync::Mutex<SysMetrics>` so the async health handler
556    /// can sample without blocking the runtime.
557    /// Test: `health_endpoint_includes_resource_fields`.
558    pub sys_metrics: Arc<tokio::sync::Mutex<trusty_common::sys_metrics::SysMetrics>>,
559    /// HTTP listener address the daemon bound to, once `run_http_on` is running.
560    ///
561    /// Why: clients (and `/health` responses) need to advertise the live
562    /// `host:port` even though port selection happens dynamically (7070–7079
563    /// walk + OS fallback). Stashing it on `AppState` lets request handlers
564    /// surface the discovery value without re-querying the listener.
565    /// What: a `OnceLock<SocketAddr>` so `run_http_on` writes it exactly once
566    /// at bind time and every handler reads it lock-free thereafter. Empty
567    /// (`None` from `get()`) on the stdio path where no listener exists.
568    /// Test: `health_endpoint_reports_bound_addr` (added below).
569    pub bound_addr: Arc<OnceLock<SocketAddr>>,
570    /// Cached prompt-facts surface served by the MCP `get_prompt_context`
571    /// tool (issue #42).
572    ///
573    /// Why: The original session-init `prompts/get` design loaded context
574    /// once per connection; switching to a per-message tool lets the model
575    /// pull fresh, query-filtered context on demand. The cache holds both
576    /// the raw triples (for filtered lookups) and a pre-formatted Markdown
577    /// block (for the unfiltered hot path) so neither code path re-walks
578    /// the KG. The cache is rebuilt by
579    /// `prompt_facts::rebuild_prompt_cache` after any write that touches a
580    /// hot predicate (`kg_assert`, `add_alias`, `remove_prompt_fact`).
581    /// What: An `Arc<tokio::sync::RwLock<PromptFactsCache>>` so the hot
582    /// read path takes a brief read lock and clones the cache; rebuilds
583    /// take a write lock for the assignment only. The async-aware lock
584    /// (issue #229) yields to the tokio runtime instead of blocking a
585    /// runtime thread for the rebuild duration. An empty `triples` vec ↔
586    /// "no context stored yet" (the tool handler renders a hint).
587    /// Test: `get_prompt_context_returns_cached_or_hint`,
588    /// `get_prompt_context_filters_by_query`.
589    pub prompt_context_cache: Arc<RwLock<prompt_facts::PromptFactsCache>>,
590    /// Persistent activity log (issue #96).
591    ///
592    /// Why: the dashboard activity feed used to be a pure live-stream over
593    /// `/sse` — opening the UI showed an empty feed and any mutation from
594    /// the MCP path was invisible. Holding an `ActivityLog` on `AppState`
595    /// lets `emit` record an entry on every push so the
596    /// `GET /api/v1/activity` handler can return historical rows on mount
597    /// and the live SSE stream can continue prepending events on top of
    /// the loaded history. The field is not an `Option`, so a log handle is
    /// always present; tests that use `AppState::new` get a real log under
    /// their tempdir so behaviour matches production.
601    /// What: an `Arc<ActivityLog>` shared with every emitter.
602    /// Test: `web::tests::activity_endpoint_lists_recent_emits`.
603    pub activity_log: Arc<ActivityLog>,
604    /// Optional per-palace BM25 lexical search lane (issue #156).
605    ///
606    /// Why: in-process BM25 would serialise the recall hot path on disk
607    /// I/O during writes and contend with the redb/usearch locks. Delegating
608    /// to the `trusty-bm25-daemon` subprocess (one socket per palace) keeps
609    /// BM25 ingestion and search off the critical path while still feeding
610    /// hits into the recall RRF fusion.
611    /// What: `Some(client)` only when `TRUSTY_BM25_DAEMON=1` at startup —
612    /// every code path that uses this field is gated on `is_some()` and
613    /// falls back to vector-only behaviour otherwise so existing deployments
614    /// see zero behavioural change.
615    /// Test: `bm25_client_disabled_by_default`,
616    /// `bm25_client_enabled_when_env_set`.
617    pub bm25_client: Option<Arc<Bm25Client>>,
618    /// Optional per-palace BM25 daemon spawn supervisor (issue #193).
619    ///
620    /// Why: without an in-process supervisor the BM25 daemon must be
621    /// launched out-of-band (launchd, manual `trusty-bm25-daemon`), which
622    /// is the same UX trap PR #190 fixed for trusty-embedderd. Holding a
623    /// supervisor here lets us spawn the daemon on first BM25 use for a
624    /// palace, restart it if it dies, and reap it on clean shutdown.
625    /// `Some` only when `TRUSTY_BM25_DAEMON=1` at startup — the same gate
626    /// that enables `bm25_client`. When set but `TRUSTY_BM25_EXTERNAL=1`,
627    /// the supervisor's `ensure_running` becomes a no-op that just returns
628    /// the canonical socket path so operators can keep using their own
629    /// process manager.
630    /// Test: covered by `bm25_supervisor_present_when_env_set` and the
631    /// `bm25_supervisor::tests` unit tests.
632    pub bm25_supervisor: Option<Arc<bm25_supervisor::Bm25Supervisor>>,
633    /// Per-palace write serialisation locks (issue #230).
634    ///
635    /// Why: the dedup gate in `tools.rs` previously read a snapshot of
636    /// existing drawers, checked for near-duplicates via Jaro-Winkler, and
637    /// then issued the write — a classic time-of-check/time-of-use race.
638    /// Two concurrent `memory_remember` calls with the same content could
639    /// both see the pre-write snapshot, both pass the gate, and both land
640    /// duplicate drawers. Serialising the gate-then-write sequence per
641    /// palace closes the window: while one task holds the mutex, any
642    /// concurrent writer for the same palace blocks until the first write
643    /// finishes and is visible to `list_drawers`. The lock is **per
644    /// palace** (not global) so writes to different palaces continue to
645    /// run in parallel.
646    /// What: a `DashMap` keyed by palace id, where each entry is an
647    /// `Arc<tokio::sync::Mutex<()>>`. The mutex is constructed lazily by
648    /// `palace_write_lock` on first access. `Arc` lets callers hold a
649    /// clone of the lock past the lifetime of the `DashMap` entry so the
650    /// map never needs to be held across an `.await`.
651    /// Test: `tools::tests::dedup_gate_blocks_concurrent_duplicate_writes`.
652    pub palace_write_locks: Arc<dashmap::DashMap<String, Arc<tokio::sync::Mutex<()>>>>,
653    /// Counter of in-flight activity-log writes spawned by `emit`
654    /// (issue #232).
655    ///
656    /// Why: `emit` offloads the synchronous redb append to the tokio blocking
657    /// pool via `spawn_blocking` so the async runtime is never parked waiting
658    /// on fsync. The write is fire-and-forget — `emit` returns immediately
659    /// after spawning. Tests that observe the activity log right after a
660    /// burst of `emit` calls need a deterministic synchronization point;
661    /// holding an in-flight counter lets `flush_activity_writes` poll until
662    /// every spawned append has settled, which keeps the assertions
663    /// race-free without forcing every caller to `.await`.
664    /// What: an `Arc<AtomicUsize>` incremented before each `spawn_blocking`
665    /// and decremented inside the closure (after the append completes, even
666    /// if it errored). The counter is cheap (one atomic add per emit) and
667    /// stays at zero in steady-state production traffic.
668    /// Test: `web::tests::activity_endpoint_lists_recent_emits` and
669    /// `tests::emit_persists_mutations_but_skips_status_changed` call
670    /// `flush_activity_writes` to drain the counter before reading the log.
671    pub pending_activity_writes: Arc<AtomicUsize>,
672    /// In-memory cache mapping palace id → `Palace.name` (issue #228).
673    ///
674    /// Why: every `memory_remember` / `memory_note` write used to call
675    /// `PalaceRegistry::list_palaces` (a synchronous filesystem walk of the
676    /// data root) just to resolve a friendly palace name for the SSE
677    /// `DrawerAdded` event. With N palaces on disk the cost was O(N) opendirs
678    /// plus `palace.json` reads on every write, blocking the async runtime.
679    /// Caching the name in-memory turns the lookup into a `DashMap::get`.
680    /// What: `DashMap<String, String>` populated by `create_palace` and
681    /// `load_palaces_from_disk`, kept in sync by rename / delete paths.
682    /// Missing entries are treated as "name unknown" so callers fall back to
683    /// the palace id and the emit path never fails.
684    /// Test: `palace_name_cache_populated_after_hydration` and
685    /// `palace_name_cache_updates_on_create`.
686    pub palace_names: Arc<dashmap::DashMap<String, String>>,
687    /// Single-pass startup pin-file map: palace id → project root path (issue #470).
688    ///
689    /// Why: after daemon startup we have no record of which on-disk project
690    /// directories correspond to which palace ids — that information only
691    /// existed inside the pin files on disk. Eager-opening every palace on
692    /// startup is too expensive. This field captures the scan-only result of
693    /// `startup_scan::scan_pin_map` so handlers that want to locate a project
694    /// by its palace id (e.g. future cwd-inference, project-health checks)
695    /// can do a single `DashMap::get` instead of a filesystem walk.
696    /// Populated once, shortly after `load_palaces_from_disk` returns, by
697    /// `spawn_startup_tasks`. Never mutated after population — it is a
698    /// snapshot of what the filesystem looked like at startup.
699    /// What: `DashMap<String (palace_id), PathBuf (project root)>`.
700    /// The outer `Arc` lets `spawn_startup_tasks` (which holds only a clone
701    /// of `AppState`) write to the same backing map that request handlers
702    /// read. Population is asynchronous so callers must treat an absent entry
703    /// as "not yet scanned" (or "no pin found"), never as "palace unknown".
704    /// Test: `startup_scan::tests::scan_pin_map_*` validate the underlying
705    /// scanner function; the wiring in `spawn_startup_tasks` is covered by
706    /// the integration-test daemon start path.
707    pub pin_project_map: Arc<dashmap::DashMap<String, PathBuf>>,
708    /// Bounded sender for the BM25 index worker (issue #231).
709    ///
710    /// Why: the previous fire-and-forget design `tokio::spawn`ed one task per
711    /// `memory_remember` / `memory_note` call, so a write burst against a slow
712    /// or unreachable BM25 daemon grew an unbounded in-flight task queue. A
713    /// single long-lived worker draining a bounded mpsc channel caps that
714    /// back-pressure: writers `try_send` (never block), full-queue requests
715    /// are dropped with a `warn!`, and the worker exits cleanly when the last
716    /// sender is dropped on shutdown.
717    /// What: an `mpsc::Sender` cloned to every `AppState` clone (cheap). The
718    /// matching receiver is consumed by the worker spawned in
719    /// [`AppState::new`] via [`tools::spawn_bm25_index_worker`]. Capacity is
720    /// [`tools::BM25_INDEX_QUEUE_CAPACITY`] (256).
721    /// Test: `bm25_index_queue_drops_when_full` exercises the full-queue
722    /// branch via `bm25_index_enqueue`.
723    pub bm25_index_tx: tokio::sync::mpsc::Sender<tools::Bm25IndexRequest>,
724    /// Cached result of the startup update check (issue #537).
725    ///
726    /// Why: `/health` should report `update_available` without hitting crates.io
727    /// on every probe. A single background check at daemon startup stores the
    /// result here; the health handler reads it under a brief mutex lock,
    /// without making a network call.
730    /// What: `None` = up-to-date or check not yet done; `Some("x.y.z")` = newer
731    /// version available. The field is populated by a `tokio::spawn` in
732    /// `spawn_startup_tasks` (main.rs) after the daemon binds.
733    /// Test: indirectly by the `/health` endpoint tests in `web.rs`.
734    pub update_available: Arc<std::sync::Mutex<Option<String>>>,
735    /// Two-phase readiness state — `Warming` until the embedder is initialised,
736    /// then `Ready` (issues #910 / #911).
737    ///
738    /// Why: `AppState::embedder()` used to call `FastEmbedder::new()` without
739    /// any timeout, so the first `memory_recall`/`memory_remember` that arrived
740    /// before CoreML finished compiling would block for 5–11 hours until the
741    /// OnceCell resolved (issue #910). Exposing this state lets the preflight
742    /// guards in `tools.rs` return an explicit fast error immediately —
743    /// `"trusty-memory is warming up, retry shortly"` — instead of queueing
744    /// behind an open-ended init.
745    /// What: An `AtomicU8` starting at `DaemonReadiness::Warming` (0) and flipped
746    /// to `DaemonReadiness::Ready` (1) by `spawn_startup_tasks` after the embedder
747    /// warm-up succeeds.  The transition is one-way and lock-free.
748    /// Test: `daemon_readiness_transitions_warming_to_ready`.
749    pub daemon_readiness: Arc<AtomicU8>,
750}
751
752impl AppState {
753    /// Construct an `AppState` rooted at the given on-disk data directory.
754    ///
755    /// Why: The CLI (`serve`) and integration tests need to point the MCP
756    /// server at different roots — production at `dirs::data_dir`, tests at a
757    /// `tempfile::tempdir()`.
758    /// What: Builds an empty `PalaceRegistry`, captures the version, and
759    /// allocates an empty `OnceCell` for the embedder. `default_palace` is
760    /// `None`; use `with_default_palace` to set it.
761    /// Test: `tools::tests::dispatch_palace_create_persists` constructs an
762    /// AppState pointed at a tempdir and round-trips a palace through it.
763    pub fn new(data_root: PathBuf) -> Self {
764        let (events_tx, _) = broadcast::channel::<DaemonEvent>(128);
765        // Issue #96: open (or create) the persistent activity log under the
766        // daemon data root. Open failure is logged but never crashes the
767        // daemon — we fall back to a per-process tempdir so emits remain
768        // best-effort and the rest of the daemon keeps working.
769        let activity_log = open_activity_log_with_fallback(&data_root);
770        // Issue #231: bounded mpsc channel + single long-lived worker
771        // replaces the per-write `tokio::spawn` fire-and-forget pattern so
772        // BM25 indexing back-pressure is capped. The worker is spawned here
773        // unconditionally so the channel always has a drain — even when
774        // `bm25_client` is `None`, the worker just consumes and discards
775        // each request so senders never block on a full queue.
776        let (bm25_index_tx, bm25_index_rx) =
777            tokio::sync::mpsc::channel::<tools::Bm25IndexRequest>(tools::BM25_INDEX_QUEUE_CAPACITY);
778        // `bm25_client` / `bm25_supervisor` start as `None`; the builder
779        // `with_bm25_client_from_env` rebuilds the worker with the real
780        // client + supervisor once env-gated opt-in is resolved.
781        tools::spawn_bm25_index_worker(bm25_index_rx, None, None);
782        Self {
783            version: env!("CARGO_PKG_VERSION").to_string(),
784            registry: Arc::new(PalaceRegistry::new()),
785            data_root,
786            embedder: Arc::new(OnceCell::new()),
787            default_palace: None,
788            chat_provider: Arc::new(OnceCell::new()),
789            session_stores: Arc::new(dashmap::DashMap::new()),
790            events: Arc::new(events_tx),
791            started_at: std::time::Instant::now(),
792            // Default to an empty buffer — `with_log_buffer` overrides this
793            // when the daemon installs the `LogBufferLayer` (HTTP mode).
794            log_buffer: trusty_common::log_buffer::LogBuffer::new(
795                trusty_common::log_buffer::DEFAULT_LOG_CAPACITY,
796            ),
797            // Bug-reporting #478: `None` until `with_error_store` is called
798            // during daemon startup (HTTP mode). Tests keep `None` so no
799            // unexpected files are written to the OS data dir.
800            error_store: None,
801            disk_bytes: Arc::new(std::sync::atomic::AtomicU64::new(0)),
802            sys_metrics: Arc::new(tokio::sync::Mutex::new(
803                trusty_common::sys_metrics::SysMetrics::new(),
804            )),
805            bound_addr: Arc::new(OnceLock::new()),
806            prompt_context_cache: Arc::new(RwLock::new(prompt_facts::PromptFactsCache::default())),
807            activity_log,
808            bm25_client: None,
809            bm25_supervisor: None,
810            palace_write_locks: Arc::new(dashmap::DashMap::new()),
811            pending_activity_writes: Arc::new(AtomicUsize::new(0)),
812            palace_names: Arc::new(dashmap::DashMap::new()),
813            pin_project_map: Arc::new(dashmap::DashMap::new()),
814            bm25_index_tx,
815            update_available: Arc::new(std::sync::Mutex::new(None)),
816            // Start in Warming state; flipped to Ready by spawn_startup_tasks
817            // once the embedder warm-up succeeds (issues #910/#911).
818            daemon_readiness: Arc::new(AtomicU8::new(DaemonReadiness::Warming as u8)),
819        }
820    }
821
822    /// Acquire (lazily, then clone) the per-palace write mutex.
823    ///
824    /// Why (issue #230): the dedup-check + `remember_with_options` write
825    /// sequence in `tools.rs` must be atomic per palace to prevent two
826    /// concurrent identical writes from both passing the dedup gate.
827    /// Callers hold the returned `Arc<Mutex<()>>`'s guard across the gate
828    /// check and the write so the second writer blocks until the first
829    /// write is visible to `list_drawers`. Returning a clone of the `Arc`
830    /// rather than a borrow into the `DashMap` lets the caller `.await`
831    /// while holding the lock without risking a deadlock against any
832    /// future map mutation (DashMap shards are sync mutexes).
833    /// What: looks up the palace id in `palace_write_locks` and returns
834    /// a clone of the existing mutex; on the first call for a palace,
835    /// inserts a freshly-constructed `tokio::sync::Mutex<()>` first. The
836    /// `DashMap::entry().or_insert_with` API guarantees the lazy
837    /// construction is racy-safe — only one mutex is ever inserted per
838    /// palace id.
839    /// Test: `tools::tests::dedup_gate_blocks_concurrent_duplicate_writes`.
840    pub fn palace_write_lock(&self, palace_id: &str) -> Arc<tokio::sync::Mutex<()>> {
841        if let Some(existing) = self.palace_write_locks.get(palace_id) {
842            return existing.clone();
843        }
844        self.palace_write_locks
845            .entry(palace_id.to_string())
846            .or_insert_with(|| Arc::new(tokio::sync::Mutex::new(())))
847            .clone()
848    }
849
850    /// Look up a project root path by palace id in the startup pin-scan map.
851    ///
852    /// Why: provides a stable, cheap accessor so handlers do not reach directly
853    /// into the `DashMap` field and so the accessor can be mocked in future
854    /// tests without touching `AppState` internals. The map is populated
855    /// asynchronously by `spawn_startup_tasks` — an absent entry means either
856    /// the scan has not completed yet or no pin file claimed that id.
857    /// What: returns `Some(project_path)` when the palace id was found during
858    /// startup scan; `None` otherwise.
859    /// Test: covered indirectly via the startup-scan integration path; the
860    /// underlying map data is validated by `startup_scan::tests`.
861    pub fn pinned_project_path(&self, palace_id: &str) -> Option<PathBuf> {
862        self.pin_project_map.get(palace_id).map(|e| e.clone())
863    }
864
865    /// Builder-style: opt-in to the BM25 lexical lane (issue #156).
866    ///
867    /// Why: the BM25 subprocess is gated behind `TRUSTY_BM25_DAEMON=1` so
868    /// the default `cargo install trusty-memory` / launchd plist deployment
869    /// stays vector-only and existing test fixtures keep passing without
870    /// having to provision a daemon. Reading the env var here keeps the
871    /// gating logic in one place (the helper in `main.rs` just plumbs the
872    /// result through).
873    /// What: when `TRUSTY_BM25_DAEMON=1`, constructs one `Bm25Client` per
874    /// palace by lazy-resolving the socket path the first time the palace
875    /// id is observed. Currently we install a shared `default` client up
876    /// front and re-key on the palace id at the call site — palaces with no
877    /// daemon socket simply see search/index errors which we log + ignore.
878    /// Returns `self` unchanged when the env var is unset or set to anything
879    /// other than `1`.
880    /// Test: `bm25_client_disabled_by_default`,
881    /// `bm25_client_enabled_when_env_set`.
882    #[must_use]
883    pub fn with_bm25_client_from_env(mut self) -> Self {
884        if std::env::var("TRUSTY_BM25_DAEMON").as_deref() == Ok("1") {
885            // Install the default-palace client; per-palace clients are
886            // constructed on demand via `Bm25Client::for_palace`.
887            let default_palace = self.default_palace.as_deref().unwrap_or("default");
888            self.bm25_client = Some(Arc::new(Bm25Client::for_palace(default_palace)));
889            // Issue #193: hand-in-hand with the client, attach a spawn
890            // supervisor so the BM25 daemon is auto-started on first use
891            // for any palace. Operators who want to manage daemons
892            // out-of-band (launchd, systemd, manual) set
893            // TRUSTY_BM25_EXTERNAL=1 which makes the supervisor a no-op.
894            self.bm25_supervisor = Some(Arc::new(bm25_supervisor::Bm25Supervisor::new()));
895            // Issue #231: rebuild the bounded indexer channel + worker so
896            // the worker holds the now-populated client + supervisor. The
897            // placeholder worker installed by `AppState::new` (with `None`
898            // / `None`) drained the channel into the void — replacing the
899            // sender here closes the placeholder receiver and the
900            // placeholder worker exits cleanly. The new worker takes over
901            // as the sole drain for the indexer queue.
902            let (tx, rx) = tokio::sync::mpsc::channel::<tools::Bm25IndexRequest>(
903                tools::BM25_INDEX_QUEUE_CAPACITY,
904            );
905            tools::spawn_bm25_index_worker(
906                rx,
907                self.bm25_client.clone(),
908                self.bm25_supervisor.clone(),
909            );
910            self.bm25_index_tx = tx;
911            tracing::info!(
912                palace = default_palace,
913                "BM25 daemon client + spawn supervisor enabled (TRUSTY_BM25_DAEMON=1)"
914            );
915        }
916        self
917    }
918
919    /// Scan the palace registry directory and re-register every persisted
920    /// palace into the in-memory [`PalaceRegistry`].
921    ///
922    /// Why: `AppState::new` builds an *empty* registry, so after a daemon
923    /// restart `palace_list` / the dashboard reported zero palaces even though
924    /// dozens existed on disk — palace metadata was persisted by
925    /// `palace_create` but never re-hydrated on startup. This method closes
926    /// that gap by walking the on-disk layout (each subdirectory holding a
927    /// `palace.json` is one palace) and rebuilding a live `PalaceHandle` for
928    /// each, so recall paths see the full set immediately after a restart.
929    /// What: runs the blocking filesystem walk + per-palace `PalaceHandle::open`
930    /// on a `spawn_blocking` thread (so it never stalls the async runtime),
931    /// registers each successfully opened palace via `register_arc`, logs every
932    /// load at `debug!`, and returns the count loaded. A palace that fails to
933    /// open (corrupt index, unreadable `kg.db`, etc.) is logged at `warn!` and
934    /// skipped — one bad palace must not abort startup or crash the daemon.
935    /// `data_root` is expected to already be the palace registry directory —
936    /// `main.rs` resolves it via [`resolve_palace_registry_dir`] before
937    /// constructing the `AppState`, so the flat / legacy-`palaces/` layout
938    /// difference is handled exactly once.
939    /// Test: `tests::load_palaces_from_disk_rehydrates_registry` writes two
940    /// palaces into a tempdir, constructs an `AppState`, calls this method, and
941    /// asserts the returned count and registry contents.
942    pub async fn load_palaces_from_disk(&self) -> Result<usize> {
943        let registry_dir = self.data_root.clone();
944        let registry = self.registry.clone();
945        let palace_names = self.palace_names.clone();
946        // The directory walk and each `PalaceHandle::open` perform blocking
947        // filesystem + redb/usearch I/O — run the whole hydration on the
948        // blocking pool so it never parks an async worker thread.
949        let count = tokio::task::spawn_blocking(move || -> Result<usize> {
950            let palaces = PalaceRegistry::list_palaces(&registry_dir)?;
951            let total = palaces.len();
952            let mut loaded = 0usize;
953            let mut skipped = 0usize;
954            for palace in palaces {
955                match trusty_common::memory_core::PalaceHandle::open(&palace) {
956                    Ok(handle) => {
957                        tracing::debug!(
958                            palace = %palace.id,
959                            data_dir = %palace.data_dir.display(),
960                            "loaded palace from disk"
961                        );
962                        // Issue #228: seed the in-memory name cache so write
963                        // hot paths (memory_remember / memory_note) can resolve
964                        // the friendly palace name without re-walking the data
965                        // root. Insert here (during hydration) is the single
966                        // point of truth for restart-time population.
967                        palace_names.insert(palace.id.0.clone(), palace.name.clone());
968                        registry.register_arc(handle);
969                        loaded += 1;
970                    }
971                    Err(e) => {
972                        // Why (issue #467): a single bad palace (corrupt kg.db,
973                        // stale WAL, EMFILE — "Too many open files", permissions)
974                        // must never abort startup or block the HTTP server from
975                        // binding. Log per-palace and keep going; the summary
976                        // below tells operators how many were skipped without
977                        // trawling the log.
978                        // The palace is NOT registered in the in-memory registry,
979                        // so the next `open_palace` call for this id will attempt
980                        // a fresh open from disk — the lazy-reopen path. If the
981                        // root cause was EMFILE and the fd-limit fix (#462) raised
982                        // the soft limit to 8192, that first request will succeed.
983                        tracing::warn!(
984                            palace = %palace.id,
985                            data_dir = %palace.data_dir.display(),
986                            "skipping palace during startup hydration: {e:#}; \
987                             will retry lazily on first access"
988                        );
989                        skipped += 1;
990                    }
991                }
992            }
993            tracing::info!(
994                "palace hydration summary: loaded {loaded}/{total} ({skipped} skipped due to errors)"
995            );
996            Ok(loaded)
997        })
998        .await
999        .map_err(|e| anyhow::anyhow!("join load_palaces_from_disk: {e}"))??;
1000        Ok(count)
1001    }
1002
1003    /// Builder-style: attach the daemon's shared [`LogBuffer`] so the
1004    /// `GET /api/v1/logs/tail` endpoint serves the same lines the tracing
1005    /// subscriber captures (issue #35).
1006    ///
1007    /// Why: `main` builds the buffer (via `init_tracing_with_buffer`) before
1008    /// constructing the `AppState`, then hands a clone here so the HTTP
1009    /// handler and the tracing layer observe the same ring.
1010    /// What: replaces the empty default buffer with the supplied one.
1011    /// Test: `logs_tail_returns_recent_lines`.
1012    #[must_use]
1013    pub fn with_log_buffer(mut self, buffer: trusty_common::log_buffer::LogBuffer) -> Self {
1014        self.log_buffer = buffer;
1015        self
1016    }
1017
1018    /// Builder-style: attach the bug-capture `ErrorStore` handle (bug-reporting #478).
1019    ///
1020    /// Why: Phase 2 MCP / HTTP endpoints need a handle to the in-memory error
1021    ///      ring so they can serve `recent_errors` / `errors_by_fingerprint`
1022    ///      without disk I/O on the hot path. Installing it here — rather than
1023    ///      adding it as a separate global — keeps the state graph explicit and
1024    ///      lets tests skip it by never calling this method.
1025    /// What: stores `Some(store)` in `AppState::error_store`; the `BugCaptureLayer`
1026    ///      that writes to this store is already installed in the tracing
1027    ///      subscriber by `init_tracing_with_buffer_and_capture`. The store is
1028    ///      `Clone` (cheap `Arc` clone internally) so both the layer and this
1029    ///      field share the same underlying ring.
1030    /// Test: Phase 2 will add `error_store_captures_and_queries` in `web.rs`.
1031    #[must_use]
1032    pub fn with_error_store(mut self, store: trusty_common::error_capture::ErrorStore) -> Self {
1033        self.error_store = Some(store);
1034        self
1035    }
1036
1037    /// Send a `DaemonEvent` to all connected SSE subscribers and persist
1038    /// it to the activity log when the variant carries a source.
1039    ///
1040    /// Why: Mutating handlers call this after a successful write so the
1041    /// dashboard can update without polling. The send is best-effort —
1042    /// `broadcast::Sender::send` returns `Err` only when there are no live
1043    /// receivers, which is fine (no listeners == no work to do). Issue
1044    /// #96 additionally writes the entry to the persistent activity log
1045    /// so the feed can serve historical rows on page load and so MCP /
1046    /// HTTP / Hook origins are visible to the operator. Persistence is
1047    /// also best-effort — a write failure is logged but never blocks the
1048    /// SSE broadcast.
1049    ///
1050    /// Issue #232: the activity-log append is a synchronous redb write +
1051    /// fsync. Calling it directly on the async caller's task parked a tokio
1052    /// worker thread on disk I/O for every SSE event. We now offload the
1053    /// append to the blocking thread pool via `spawn_blocking` and return
1054    /// immediately — `emit` stays synchronous so every existing caller
1055    /// (including the sync `dispatch_hook_fired` JSON-RPC handler) keeps
1056    /// compiling unchanged. The fire-and-forget pattern matches the
1057    /// pre-fix semantics (best-effort, never blocks the SSE broadcast)
1058    /// while freeing the async runtime to do real work during the write.
1059    /// What: serialises the event for the log (skipping `StatusChanged`
1060    /// which is a recomputed aggregate, not a mutation), spawns the redb
1061    /// append on `tokio::task::spawn_blocking` keyed by a clone of the
1062    /// `Arc<ActivityLog>` and the cloned event, then sends the event over
1063    /// the broadcast channel. A `pending_activity_writes` counter is bumped
1064    /// before the spawn and decremented inside the closure so
1065    /// [`Self::flush_activity_writes`] can drain in tests.
1066    /// Test: `web::tests::sse_stream_receives_palace_created` confirms a
1067    /// subscriber observes the emitted event;
1068    /// `activity_endpoint_lists_recent_emits` confirms persistence via
1069    /// `flush_activity_writes`.
1070    pub fn emit(&self, event: DaemonEvent) {
1071        if let Some(source) = event.source() {
1072            let event_type = event.type_str();
1073            let palace_id = event.palace_id().map(|s| s.to_string());
1074            let log = Arc::clone(&self.activity_log);
1075            let event_for_log = event.clone();
1076            let pending = Arc::clone(&self.pending_activity_writes);
1077            // Pre-allocate the sequence id in the emitting thread so the
1078            // persisted order matches the emission order even when blocking-pool
1079            // workers execute the writes concurrently (issue #247). Without
1080            // this, four rapid emits would assign IDs inside their respective
1081            // `spawn_blocking` closures in a non-deterministic order.
1082            let id = log.alloc_id();
1083            pending.fetch_add(1, Ordering::SeqCst);
1084            // Why: the synchronous redb append + fsync must not park an
1085            // async worker thread (issue #232). Spawn the write on the
1086            // blocking pool; the JoinHandle is intentionally dropped —
1087            // the write is best-effort and any failure is logged below.
1088            tokio::task::spawn_blocking(move || {
1089                let result = log.append_with_id(id, source, palace_id, event_type, &event_for_log);
1090                if let Err(e) = result {
1091                    tracing::warn!("activity_log.append failed for {event_type}: {e:#}");
1092                }
1093                pending.fetch_sub(1, Ordering::SeqCst);
1094            });
1095        }
1096        let _ = self.events.send(event);
1097    }
1098
1099    /// Block (asynchronously) until every in-flight activity-log write
1100    /// spawned by [`Self::emit`] has settled.
1101    ///
1102    /// Why: `emit` offloads its redb append to `tokio::task::spawn_blocking`
1103    /// and returns immediately (issue #232). Tests that observe the
1104    /// activity log right after a burst of emits would otherwise race the
1105    /// blocking-pool worker; this helper gives them a deterministic
1106    /// synchronization point. Production code never needs to call this —
1107    /// the dashboard reads through `GET /api/v1/activity`, which already
1108    /// tolerates writes settling asynchronously.
1109    /// What: spins on `pending_activity_writes` with a 1 ms yield until the
1110    /// counter is zero. Cheap: tests typically emit a handful of events
1111    /// and the loop exits within a single scheduler tick.
1112    /// Test: covered indirectly by `emit_persists_mutations_but_skips_status_changed`
1113    /// and `web::tests::activity_endpoint_lists_recent_emits`.
1114    pub async fn flush_activity_writes(&self) {
1115        while self.pending_activity_writes.load(Ordering::SeqCst) > 0 {
1116            tokio::time::sleep(std::time::Duration::from_millis(1)).await;
1117        }
1118    }
1119
1120    /// Open (or return cached) the chat-session store for a palace.
1121    ///
1122    /// Why: Chat session persistence lives in a dedicated SQLite file under
1123    /// the palace's data dir (`chat_sessions.db`) so it doesn't intermingle
1124    /// with the KG's transactional load. The store is cheap to clone via
1125    /// `Arc` but the underlying r2d2 pool should be reused, so cache by id.
1126    /// What: Creates the palace data dir if missing, opens (or reuses) a
1127    /// `ChatSessionStore` and stashes an `Arc` in the DashMap.
1128    /// Test: Indirectly via the session HTTP handlers in `web::tests`.
1129    pub fn session_store(&self, palace_id: &str) -> Result<Arc<ChatSessionStore>> {
1130        if let Some(entry) = self.session_stores.get(palace_id) {
1131            return Ok(entry.clone());
1132        }
1133        let dir = self.data_root.join(palace_id);
1134        std::fs::create_dir_all(&dir)
1135            .map_err(|e| anyhow::anyhow!("create palace dir {}: {e}", dir.display()))?;
1136        let store = Arc::new(ChatSessionStore::open(&dir.join("chat_sessions.db"))?);
1137        self.session_stores
1138            .insert(palace_id.to_string(), store.clone());
1139        Ok(store)
1140    }
1141
1142    /// Builder-style setter for the default palace name.
1143    ///
1144    /// Why: `serve --palace <name>` wants to bind every tool call to a
1145    /// project-scoped namespace without forcing every MCP request to repeat
1146    /// the palace argument.
1147    /// What: Returns `self` with `default_palace = Some(name)`.
1148    /// Test: `default_palace_used_when_arg_omitted` covers the resolution
1149    /// path; this setter is exercised there.
1150    pub fn with_default_palace(mut self, name: Option<String>) -> Self {
1151        self.default_palace = name;
1152        self
1153    }
1154
1155    /// Resolve (or initialize) the shared embedder.
1156    ///
1157    /// Why: FastEmbedder load is expensive — we share one instance across all
1158    /// tool calls; the `OnceCell` ensures concurrent first-use races collapse
1159    /// to a single load.
1160    /// What: Returns `Arc<FastEmbedder>` on success. Errors propagate from the
1161    /// underlying ONNX load.
1162    /// Test: Indirectly via `dispatch_remember_then_recall`.
1163    /// Resolve the active chat provider, auto-detecting on first call.
1164    ///
1165    /// Why: Provider selection depends on filesystem-loaded config plus a
1166    /// network probe (Ollama liveness), so it must be lazily initialised at
1167    /// runtime. Caching the choice in a `OnceCell` keeps it stable across
1168    /// concurrent requests without re-probing on every chat call.
1169    /// What: On first use loads `~/.trusty-memory/config.toml`, prefers an
1170    /// auto-detected Ollama instance (when `local_model.enabled`), and falls
1171    /// back to OpenRouter when an API key is set. Returns `Ok(None)` when
1172    /// neither is available so the caller can emit a 412.
1173    /// Test: `web::tests::providers_endpoint_returns_payload` covers the
1174    /// detection path indirectly through `/api/v1/chat/providers`.
1175    pub async fn chat_provider(&self) -> Option<Arc<dyn ChatProvider>> {
1176        self.chat_provider
1177            .get_or_init(|| async {
1178                // Why (issue #226): `service::load_user_config` is the
1179                //      axum-free home of the loader; the `web::load_user_config`
1180                //      re-export only exists for the HTTP handlers. Going
1181                //      direct to `service` keeps this method usable when
1182                //      the `axum-server` feature is disabled.
1183                let cfg = crate::service::load_user_config().unwrap_or_default();
1184                if cfg.local_model.enabled {
1185                    if let Some(mut p) =
1186                        trusty_common::auto_detect_local_provider(&cfg.local_model.base_url).await
1187                    {
1188                        // auto_detect returns an empty model id; callers must
1189                        // set the configured model name themselves.
1190                        p.model = cfg.local_model.model.clone();
1191                        return Some(Arc::new(p) as Arc<dyn ChatProvider>);
1192                    }
1193                }
1194                if !cfg.openrouter_api_key.is_empty() {
1195                    return Some(Arc::new(trusty_common::OpenRouterProvider::new(
1196                        cfg.openrouter_api_key,
1197                        cfg.openrouter_model,
1198                    )) as Arc<dyn ChatProvider>);
1199                }
1200                None
1201            })
1202            .await
1203            .clone()
1204    }
1205
1206    /// Spawn a fire-and-forget background task that auto-discovers project
1207    /// aliases under `project_root` and asserts new ones into `palace`.
1208    ///
1209    /// Why (issue #42): Projects carry implicit shorthand — cargo package
1210    /// names that differ from their directory, binary names that differ
1211    /// from packages, first-letter abbreviations — that should be surfaced
1212    /// without a user ever calling `add_alias`. Running discovery as a
1213    /// detached task on palace-open keeps startup latency unchanged: the
1214    /// daemon binds and starts serving immediately while the discovery scan
1215    /// completes in the background, and any newly-asserted aliases land in
1216    /// the prompt cache before the model's next `get_prompt_context` call.
1217    /// What: clones `self` (cheap; `Arc`-backed), spawns a tokio task that
1218    /// invokes the `discover_aliases` tool handler directly so the
1219    /// dedup + cache-rebuild logic runs exactly the same path as the MCP
1220    /// tool call. Errors are logged at `warn!`; one failed discovery never
1221    /// destabilises the daemon.
1222    /// Test: not unit-tested (timing-dependent fire-and-forget); the
1223    /// underlying `discover_aliases` dispatch is covered by
1224    /// `dispatch_discover_aliases_inserts_new_and_dedupes` in `tools::tests`.
1225    pub fn spawn_alias_discovery(&self, palace: String, project_root: PathBuf) {
1226        let state = self.clone();
1227        tokio::spawn(async move {
1228            let args = serde_json::json!({
1229                "palace": palace,
1230                "project_root": project_root.to_string_lossy(),
1231            });
1232            match tools::dispatch_tool(&state, "discover_aliases", args).await {
1233                Ok(result) => tracing::info!(
1234                    new = ?result.get("new"),
1235                    already_known = ?result.get("already_known"),
1236                    "alias discovery complete"
1237                ),
1238                Err(e) => tracing::warn!("alias discovery failed: {e:#}"),
1239            }
1240        });
1241    }
1242
1243    /// Return the current readiness state.
1244    ///
1245    /// Why: tool handlers and the `/health` endpoint need a cheap, lock-free
1246    /// way to check whether the embedder has been initialised yet.
1247    /// What: loads `daemon_readiness` with `Acquire` ordering so the caller
1248    /// sees all writes the startup task made before setting the state.
1249    /// Test: `daemon_readiness_transitions_warming_to_ready`.
1250    pub fn readiness(&self) -> DaemonReadiness {
1251        DaemonReadiness::from_u8(self.daemon_readiness.load(Ordering::Acquire))
1252    }
1253
1254    /// Flip the readiness state from `Warming` to `Ready`.
1255    ///
1256    /// Why: called by `spawn_startup_tasks` in `main.rs` once the embedder
1257    /// warm-up succeeds — this is the single state-transition site.
1258    /// What: `store(Ready, Release)` so subsequent `Acquire` loads in handlers
1259    /// observe a consistent state.  Idempotent: calling it multiple times is
1260    /// harmless.
1261    /// Test: `daemon_readiness_transitions_warming_to_ready`.
1262    pub fn set_ready(&self) {
1263        self.daemon_readiness
1264            .store(DaemonReadiness::Ready as u8, Ordering::Release);
1265    }
1266
1267    /// Return `Ok(())` when `Ready`, or an explicit `Err` with the warming
1268    /// message when still `Warming`.
1269    ///
1270    /// Why: the preflight in every bounded handler calls this and returns the
1271    /// error immediately so no embedding / redb I/O is attempted while the
1272    /// daemon is still initialising (tracks #911 internally).
1273    /// What: cheaply reads `daemon_readiness`; returns the fast error string
1274    /// on `Warming`.  Zero allocation on the happy path.
1275    /// Test: covered by `tools::tests::remember_returns_warming_error_while_state_is_warming`.
1276    pub fn readiness_check(&self) -> Result<()> {
1277        if self.readiness() == DaemonReadiness::Warming {
1278            return Err(anyhow::anyhow!(
1279                "trusty-memory is warming up (embedder initialising); \
1280                 please retry in a few seconds"
1281            ));
1282        }
1283        Ok(())
1284    }
1285
1286    /// Obtain the shared `FastEmbedder` instance, initialising it on first call.
1287    ///
1288    /// Why: centralises lazy embedder access so every tool handler goes through
1289    /// one bounded init path (tracks #910 internally).
1290    /// What: wraps `OnceCell::get_or_try_init` with a timeout so a slow
1291    /// CoreML/CUDA first-compile cannot block a handler indefinitely.  On
1292    /// timeout the `OnceCell` is left unresolved and the next caller retries.
1293    ///
1294    /// **Callers on the request path MUST call `readiness_check()` before
1295    /// this method.**  The four guarded handlers (`memory_remember`,
1296    /// `memory_recall`, `memory_recall_deep`, `memory_note`) do so; any new
1297    /// handler that calls `embedder()` must follow the same pattern.
1298    /// Reaching this method while still `Warming` is not a bug — the warm-up
1299    /// task itself calls `embedder()` while in `Warming` state — but request
1300    /// handlers should have short-circuited before here via `readiness_check()`.
1301    ///
1302    /// The `readiness_check()` preflight is the PRIMARY guard (fast rejection
1303    /// with no I/O).  This timeout is the last-resort backstop in case a
1304    /// handler bypasses the preflight or the warm-up task itself hits a
1305    /// pathological init delay.  If this timeout fires the `OnceCell` is left
1306    /// in the unresolved state and the next call retries from scratch.
1307    pub async fn embedder(&self) -> Result<Arc<FastEmbedder>> {
1308        use trusty_common::memory_core::timeouts;
1309        let cell = self.embedder.clone();
1310        let timeout = timeouts::embedder_init_timeout();
1311        // `readiness_check()` is the PRIMARY guard — handlers return a fast
1312        // warming error before reaching here.  This timeout is the last-resort
1313        // backstop: if the embedder init races past the preflight (e.g. in the
1314        // warm-up task itself, which calls embedder() while still Warming) or
1315        // the CoreML/CUDA compile stalls, we fail fast rather than blocking
1316        // indefinitely.  On timeout the OnceCell stays unresolved; the next
1317        // caller will retry the init from scratch.
1318        let embedder = tokio::time::timeout(
1319            timeout,
1320            cell.get_or_try_init(|| async {
1321                let e = FastEmbedder::new().await?;
1322                Ok::<Arc<FastEmbedder>, anyhow::Error>(Arc::new(e))
1323            }),
1324        )
1325        .await
1326        .map_err(|_| {
1327            anyhow::anyhow!(
1328                "AppState::embedder() timed out after {:?}; \
1329                 the CoreML/CUDA model is taking unusually long to compile — \
1330                 increase TRUSTY_EMBEDDER_INIT_TIMEOUT_SECS if needed",
1331                timeout
1332            )
1333        })??
1334        .clone();
1335        Ok(embedder)
1336    }
1337}
1338
1339impl std::fmt::Debug for AppState {
1340    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1341        f.debug_struct("AppState")
1342            .field("version", &self.version)
1343            .field("data_root", &self.data_root)
1344            .field("registry_len", &self.registry.len())
1345            .finish()
1346    }
1347}
1348
1349/// Handle a single MCP JSON-RPC message and produce its response.
1350///
1351/// Why: Pulled out of the stdio loop so unit tests can drive every method
1352/// without touching real stdin/stdout.
1353/// What: Routes `initialize`, `tools/list`, `tools/call`, `ping`, and the
1354/// `notifications/initialized` notification (which returns `Value::Null`).
1355/// Test: See unit tests below — initialize/list/call all return expected
1356/// JSON-RPC envelopes; notifications return `Null` (no response written).
1357pub async fn handle_message(state: &AppState, msg: Value) -> Value {
1358    let id = msg.get("id").cloned().unwrap_or(Value::Null);
1359    let method = msg.get("method").and_then(|m| m.as_str()).unwrap_or("");
1360
1361    match method {
1362        "initialize" => {
1363            let extra = state
1364                .default_palace
1365                .as_ref()
1366                .map(|dp| json!({ "default_palace": dp }));
1367            let result = initialize_response("trusty-memory", &state.version, extra);
1368            // Why (issue #42): prompt-facts now flow through the
1369            // per-message `get_prompt_context` tool rather than MCP
1370            // prompts, so we no longer advertise the `prompts` capability.
1371            json!({
1372                "jsonrpc": "2.0",
1373                "id": id,
1374                "result": result,
1375            })
1376        }
1377        // Notifications must NOT receive a response.
1378        "notifications/initialized" | "notifications/cancelled" => Value::Null,
1379        "tools/list" => json!({
1380            "jsonrpc": "2.0",
1381            "id": id,
1382            "result": tools::tool_definitions_with(state.default_palace.is_some())
1383        }),
1384        // OpenRPC 1.3.2 discovery — see `openrpc.rs`. Returns the full
1385        // service description so orchestrators (open-mpm, etc.) can
1386        // introspect every tool and its required `memory.read`/`memory.write`
1387        // scope without bespoke per-server adapters.
1388        "rpc.discover" => json!({
1389            "jsonrpc": "2.0",
1390            "id": id,
1391            "result": openrpc::build_discover_response(
1392                &state.version,
1393                state.default_palace.is_some(),
1394            ),
1395        }),
1396        "tools/call" => {
1397            let params = msg.get("params").cloned().unwrap_or_default();
1398            let tool_name = params
1399                .get("name")
1400                .and_then(|n| n.as_str())
1401                .unwrap_or("")
1402                .to_string();
1403            let args = params.get("arguments").cloned().unwrap_or_default();
1404            match tools::dispatch_tool(state, &tool_name, args).await {
1405                Ok(content) => {
1406                    // Why: tools that return a bare JSON string (e.g.
1407                    // `get_prompt_context` returning the formatted
1408                    // Markdown block) should surface as plain text in the
1409                    // MCP `content[0].text` field — wrapping in
1410                    // `Value::to_string()` would re-quote the payload and
1411                    // force every caller to strip outer quotes.
1412                    let text = match &content {
1413                        Value::String(s) => s.clone(),
1414                        other => other.to_string(),
1415                    };
1416                    json!({
1417                        "jsonrpc": "2.0",
1418                        "id": id,
1419                        "result": {
1420                            "content": [{"type": "text", "text": text}]
1421                        }
1422                    })
1423                }
1424                Err(e) => json!({
1425                    "jsonrpc": "2.0",
1426                    "id": id,
1427                    // Why: anyhow's `{:#}` alternate format walks the full
1428                    // `Caused by:` chain so MCP clients see actionable
1429                    // detail (e.g. "PalaceHandle::remember_with_options:
1430                    // filter rejected: too short") instead of just the
1431                    // outermost context label.
1432                    "error": {"code": -32603, "message": format!("{e:#}")}
1433                }),
1434            }
1435        }
1436        "ping" => json!({"jsonrpc": "2.0", "id": id, "result": {}}),
1437        _ => json!({
1438            "jsonrpc": "2.0",
1439            "id": id,
1440            "error": {
1441                "code": -32601,
1442                "message": format!("Method not found: {method}")
1443            }
1444        }),
1445    }
1446}
1447
/// First port the trusty-memory HTTP daemon tries to bind.
///
/// Why: clients that hard-coded `127.0.0.1:7070` keep working in the common
/// case, while `bind_dynamic_port` is still free to walk upward
/// (`DYNAMIC_PORT_RANGE` ports starting here) when this one is taken.
/// What: `7070` — the historic default, matching the launchd plist's prior
/// value.
/// Test: covered indirectly by `bind_dynamic_port_returns_listener`.
pub const DEFAULT_HTTP_PORT: u16 = 7070;

/// How many consecutive ports, starting at `DEFAULT_HTTP_PORT`, make up the
/// preferred range that `bind_dynamic_port` walks before it falls back to an
/// OS-assigned port. Matches the trusty-search convention.
const DYNAMIC_PORT_RANGE: u16 = 10;
1460
1461/// Path to the canonical address-discovery file for the trusty-memory daemon.
1462///
1463/// Why: clients (CLI, MCP tools, dashboards) need to find the running daemon
1464/// without configuration when the port was selected dynamically. Using
1465/// `trusty_common::resolve_data_dir` aligns this path with the location
1466/// that `trusty_common::read_daemon_addr("trusty-memory")` reads from, so
1467/// `prompt-context`, `doctor`, and `start`'s probe all find the running daemon.
1468/// The old `~/.trusty-memory/http_addr` path and the new
1469/// `~/Library/Application Support/trusty-memory/http_addr` (macOS) path were
1470/// divergent — the daemon wrote one; readers expected the other.
1471/// What: returns `{resolve_data_dir("trusty-memory")}/http_addr`, or `None` if
1472/// the data dir cannot be resolved (locked-down container, no passwd entry).
1473/// Test: `http_addr_path_uses_resolve_data_dir`.
1474pub fn http_addr_path() -> Option<PathBuf> {
1475    trusty_common::resolve_data_dir("trusty-memory")
1476        .ok()
1477        .map(|d| d.join("http_addr"))
1478}
1479
1480/// Bind a `TcpListener` to `127.0.0.1`, dynamically selecting a port.
1481///
1482/// Why: the historic default `7070` is convenient for clients but a stale
1483/// process or a second daemon must not produce a noisy failure. Walking
1484/// `DEFAULT_HTTP_PORT..DEFAULT_HTTP_PORT+DYNAMIC_PORT_RANGE` first preserves
1485/// backwards compatibility for the common case; OS-assigned fallback (`:0`)
1486/// guarantees the daemon always comes up even when every preferred port is
1487/// busy.
1488/// What: returns the first successful `TcpListener` (7070..=7079, then
1489/// OS-assigned); caller inspects `local_addr()` to learn the chosen port.
1490/// Test: `bind_dynamic_port_returns_listener` confirms it always binds *some*
1491/// port even after another listener occupies the preferred one.
1492pub async fn bind_dynamic_port() -> Result<tokio::net::TcpListener> {
1493    let preferred: SocketAddr = SocketAddr::from(([127, 0, 0, 1], DEFAULT_HTTP_PORT));
1494    // First: walk the preferred range (7070..=7079).
1495    if let Ok(listener) =
1496        trusty_common::bind_with_auto_port(preferred, DYNAMIC_PORT_RANGE - 1).await
1497    {
1498        return Ok(listener);
1499    }
1500    // Last resort: ask the kernel for any free port. `bind_with_auto_port`
1501    // with `:0` resolves immediately to the OS-assigned port.
1502    tracing::warn!(
1503        "all ports {DEFAULT_HTTP_PORT}..{} in use; requesting OS-assigned port",
1504        DEFAULT_HTTP_PORT + DYNAMIC_PORT_RANGE - 1
1505    );
1506    let any: SocketAddr = SocketAddr::from(([127, 0, 0, 1], 0));
1507    trusty_common::bind_with_auto_port(any, 0).await
1508}
1509
/// Write the bound `host:port` to the discovery file at `path` atomically.
1511///
1512/// Why: clients must read the file mid-write without observing a partial
1513/// value. Writing to a `.tmp` sibling and renaming over the target gives
1514/// POSIX atomicity, matching the trusty-search implementation.
1515/// What: creates the parent directory if missing; writes `addr` followed by a
1516/// trailing newline (avoids the "no newline at end of file" warnings from
1517/// `cat`); renames `.tmp` → `http_addr`. Best-effort: I/O errors are
1518/// returned to the caller so `run_http_on` can log without panicking.
1519/// Test: `http_addr_file_round_trip_via_helpers`.
1520#[cfg(feature = "axum-server")]
fn write_http_addr_file(path: &Path, addr: &SocketAddr) -> std::io::Result<()> {
    use std::io::Write;

    // Make sure the destination directory exists before staging anything.
    if let Some(parent) = path.parent() {
        std::fs::create_dir_all(parent)?;
    }

    // Stage the content in a sibling file, flush it to disk, and only then
    // rename it over the target so readers never observe a partial value.
    // The file handle is dropped (closed) when the closure returns, i.e.
    // before the rename runs.
    let tmp = path.with_extension("addr.tmp");
    let staged = std::fs::File::create(&tmp)
        .and_then(|mut f| {
            writeln!(f, "{addr}")?;
            f.sync_all()
        })
        .and_then(|()| std::fs::rename(&tmp, path));

    // If any step failed, remove the staging file so a failed write does not
    // leave a stray `.addr.tmp` next to the discovery file. Best-effort: the
    // original error is what the caller needs to see.
    if staged.is_err() {
        let _ = std::fs::remove_file(&tmp);
    }
    staged
}
1535
1536/// Return `true` when a non-default data directory is in effect.
1537///
1538/// Why (issue #880): two startup side-effects must be suppressed when the
1539/// daemon runs with an isolated/overridden data root:
1540/// 1. The legacy `~/.trusty-memory/http_addr` dotfile write — it would
1541///    overwrite the real production daemon's discovery file with the isolated
1542///    instance's throwaway address.
1543/// 2. The startup pin-scan — it reads project pin files from the **real**
1544///    user environment (~/Projects, ~/Developer, …) and imports palaces from
1545///    the real environment into the isolated data root, defeating isolation.
1546///
1547/// A "non-default data dir" means `TRUSTY_DATA_DIR_OVERRIDE` is set to a
1548/// non-empty, non-whitespace value. Empty or whitespace-only values are
1549/// treated as unset (same rule as `resolve_data_dir`), so an accidental blank
1550/// env var does not suppress the dotfile write on real production instances.
1551/// What: reads `TRUSTY_DATA_DIR_OVERRIDE`; returns `true` when it contains a
1552/// non-empty, non-whitespace string. Returns `false` otherwise.
1553/// Test: `is_data_dir_override_active_when_set`,
1554///       `is_data_dir_override_inactive_when_unset`,
1555///       `is_data_dir_override_inactive_when_blank`.
1556#[inline]
1557pub fn is_data_dir_override_active() -> bool {
1558    matches!(
1559        std::env::var(trusty_common::DATA_DIR_OVERRIDE_ENV),
1560        Ok(v) if !v.trim().is_empty()
1561    )
1562}
1563
/// Resolve the legacy dotfile discovery path `~/.trusty-memory/http_addr`.
///
/// Why (issue #498): external tooling such as claude-mpm's
/// `migrate_trusty_autodetect` reads `~/.trusty-memory/http_addr` to find the
/// running daemon's port. On macOS `resolve_data_dir("trusty-memory")` points
/// at `~/Library/Application Support/trusty-memory/` instead, so the daemon
/// writes both locations to satisfy readers of either convention.
///
/// Fix #880: an isolated instance (test rig, CI, parallel run) must never
/// overwrite the real production daemon's dotfile, so no path is returned
/// while a data-dir override is active.
///
/// What: returns `None` when `is_data_dir_override_active()` is true or when
/// `dirs::home_dir()` is unavailable; otherwise
/// `$HOME/.trusty-memory/http_addr`.
/// Test: `dotfile_http_addr_path_uses_home_dir`,
///       `dotfile_suppressed_when_override_active`.
#[cfg(feature = "axum-server")]
fn dotfile_http_addr_path() -> Option<PathBuf> {
    if is_data_dir_override_active() {
        // Fix #880: never touch the shared dotfile from an isolated instance.
        None
    } else {
        let home = dirs::home_dir()?;
        Some(home.join(".trusty-memory").join("http_addr"))
    }
}
1591
/// Serve the HTTP/SSE + web admin surface on a pre-bound listener.
///
/// Why: A long-running daemon mode lets non-stdio clients (browsers, curl,
/// future remote agents) hit `/health`, the `/api/v1/*` REST surface, and the
/// embedded admin SPA. The Unix-domain-socket transport and the
/// `trusty-memory-mcp-bridge` binary were removed in PR3 of the #914
/// stdio-cutover epic; the canonical MCP integration is now
/// `trusty-memory serve --stdio` (PR1 #919).
/// What: starts the disk-size and status-event tickers, then — before
/// accepting connections — stamps the bound `host:port` onto
/// `AppState.bound_addr` and writes it to the discovery files (the path from
/// `http_addr_path()` and, when applicable, the `~/.trusty-memory/http_addr`
/// dotfile). It then serves `web::router()` plus a `/sse` route with graceful
/// shutdown. Once serving ends it removes the discovery files it wrote,
/// shuts down the BM25 supervisor if one is present, and finally propagates
/// any serve error. Discovery-file failures are logged, never fatal. The
/// caller supplies the listener so port selection lives at the call site.
/// Test: `cargo test -p trusty-memory web::tests` exercises the router shape;
/// manual: `curl http://127.0.0.1:<port>/health` returns `ok` with `addr`.
#[cfg(feature = "axum-server")]
pub async fn run_http_on(state: AppState, listener: tokio::net::TcpListener) -> Result<()> {
    use axum::routing::get;

    // Issue #35: recompute the `data_root` disk footprint every 10 s on a
    // background task so `GET /health` reports `disk_bytes` without doing a
    // recursive directory walk on the request path.
    spawn_disk_size_ticker(state.clone());

    // Issue #228: emit the aggregate `StatusChanged` event on a fixed cadence
    // instead of from every drawer write, which used to drag an O(N palaces)
    // recompute onto the write hot path.
    spawn_status_event_ticker(state.clone());

    // Capture and advertise the bound address BEFORE serving so the first
    // request handler — and the http_addr discovery files — see the real
    // port. Each slot remembers a file we actually wrote, for cleanup later.
    let mut written_path: Option<PathBuf> = None;
    let mut written_dotfile_path: Option<PathBuf> = None;
    if let Ok(a) = listener.local_addr() {
        // Stash on state for handlers (e.g. /health) to surface.
        let _ = state.bound_addr.set(a);
        info!("HTTP server listening on http://{a}");
        eprintln!("HTTP server listening on http://{a}");

        // Primary: the OS-standard data dir, which is what
        // `trusty_common::read_daemon_addr` reads. Best-effort: a missing
        // $HOME or a read-only filesystem is non-fatal.
        if let Some(p) = http_addr_path() {
            match write_http_addr_file(&p, &a) {
                Ok(()) => {
                    info!("wrote daemon address to {}", p.display());
                    written_path = Some(p);
                }
                Err(e) => tracing::warn!("could not write {}: {e}", p.display()),
            }
        } else {
            tracing::warn!("no $HOME — skipping http_addr discovery file");
        }

        // Issue #498: also write `~/.trusty-memory/http_addr` so external
        // tools that read the dotfile path (e.g. claude-mpm's
        // `migrate_trusty_autodetect`) can discover the port. Best-effort:
        // failures are logged but do not block startup.
        if let Some(p) = dotfile_http_addr_path() {
            match write_http_addr_file(&p, &a) {
                Ok(()) => {
                    info!("wrote daemon address to dotfile {}", p.display());
                    written_dotfile_path = Some(p);
                }
                Err(e) => tracing::warn!("could not write dotfile {}: {e}", p.display()),
            }
        }
    }

    // Keep a handle to the BM25 supervisor (if any) so `shutdown()` can still
    // be called on the exit path after `state` has moved into the router.
    let bm25_supervisor = state.bm25_supervisor.clone();

    let app = web::router()
        .route("/sse", get(sse_handler))
        .with_state(state);

    // Why (issue #534): bare axum::serve exits only on an internal error, so
    // SIGTERM (launchctl bootout) would kill the process before the cleanup
    // below could run and would drop in-flight requests. With graceful
    // shutdown, axum stops accepting on SIGTERM/SIGINT, drains active
    // requests, and then returns here.
    let serve_result = axum::serve(listener, app)
        .with_graceful_shutdown(trusty_common::shutdown_signal())
        .await;

    // Best-effort cleanup: remove the discovery files we wrote (OS-standard
    // path first, then the #498 dotfile) so stale clients fail fast instead
    // of timing out against a dead port.
    for stale in written_path.iter().chain(written_dotfile_path.iter()) {
        let _ = std::fs::remove_file(stale);
    }

    // Issue #193: gracefully reap every spawned BM25 daemon before the
    // process exits so each one can flush its snapshot and unlink its socket.
    // `kill_on_drop=true` would SIGKILL them on Drop anyway, but that skips
    // the daemon's own shutdown sequence and leaves stale sockets behind.
    if let Some(supervisor) = bm25_supervisor {
        supervisor.shutdown().await;
    }

    // Report a serve failure only after all cleanup has run.
    serve_result?;
    Ok(())
}
1722
/// Convenience wrapper: bind `addr`, then serve via [`run_http_on`].
///
/// What: binds a `tokio::net::TcpListener` to exactly `addr` (no port
/// walking) and hands it, together with `state`, to [`run_http_on`].
/// Errors: a bind failure is returned before anything is served; otherwise
/// the result of [`run_http_on`] is returned.
#[cfg(feature = "axum-server")]
pub async fn run_http(state: AppState, addr: std::net::SocketAddr) -> Result<()> {
    let bound = tokio::net::TcpListener::bind(addr).await?;
    let served = run_http_on(state, bound).await;
    served
}
1729
/// Convenience wrapper: pick a port dynamically (7070..=7079, then OS
/// fallback) and serve.
///
/// Why: `trusty-memory serve` with no `--http` flag is the canonical
/// launchd-managed daemon entry point. Dynamic binding lets a stale daemon or
/// a hand-spawned `serve --http 127.0.0.1:7070` coexist with the
/// launchd-managed instance instead of breaking it.
/// What: obtains a listener from [`bind_dynamic_port`] and passes it, with
/// `state`, to [`run_http_on`]; errors from either step are returned.
/// Test: integration via `trusty-memory serve` + `cat ~/.trusty-memory/http_addr`.
#[cfg(feature = "axum-server")]
pub async fn run_http_dynamic(state: AppState) -> Result<()> {
    let bound = bind_dynamic_port().await?;
    let served = run_http_on(state, bound).await;
    served
}
1743
/// Spawn a background ticker that recomputes the `data_root` disk footprint
/// every 10 seconds and stores it in `state.disk_bytes` (issue #35).
///
/// Why: `GET /health` reports `disk_bytes`. Walking the data directory on
/// every health request would turn a frequent health poll into unbounded
/// recursive I/O. Computing it off the request path on a fixed cadence keeps
/// `/health` cheap and bounds the staleness to roughly the tick interval —
/// fine for an at-a-glance footprint figure.
/// What: spawns a detached tokio task that holds a clone of `AppState` and
/// never exits on its own. Each tick runs the blocking directory walk on
/// `spawn_blocking` so it never stalls the async runtime, then stores the
/// byte total with `Relaxed` ordering. If the walk task fails (panics or is
/// cancelled) the previously stored figure is kept and a warning is logged,
/// rather than reporting a misleading `0`. Missed ticks are delayed, not
/// burst: when a walk takes longer than the interval, the next walk starts
/// one full interval after the slow one finishes instead of immediately.
/// Test: `health_endpoint_includes_resource_fields` asserts the field shape;
/// the ticker cadence is not unit-tested (timing-dependent).
#[cfg(feature = "axum-server")]
fn spawn_disk_size_ticker(state: AppState) {
    tokio::spawn(async move {
        let mut interval = tokio::time::interval(std::time::Duration::from_secs(10));
        // The default `Burst` behaviour would fire every missed tick
        // back-to-back after a slow walk, making a large data root walk
        // continuously. `Delay` re-arms the timer from the late tick instead.
        interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay);
        loop {
            interval.tick().await;
            let dir = state.data_root.clone();
            // The directory walk is blocking filesystem I/O — run it on the
            // blocking pool so it never parks an async worker thread.
            let walk = tokio::task::spawn_blocking(move || {
                trusty_common::sys_metrics::dir_size_bytes(&dir)
            })
            .await;
            match walk {
                Ok(bytes) => state
                    .disk_bytes
                    .store(bytes, std::sync::atomic::Ordering::Relaxed),
                // Keep the last good figure; the next tick will try again.
                Err(e) => tracing::warn!("disk size walk failed; keeping previous value: {e}"),
            }
        }
    });
}
1779
/// Interval, in seconds, between aggregate-status snapshot emits on the SSE
/// bus.
///
/// Why (issue #228): mutations used to fire `StatusChanged` synchronously on
/// the write path, which forced an O(N palaces) sum of drawer / vector / KG
/// counts on every `memory_remember`. Coalescing into a fixed-cadence ticker
/// lets dashboards stay current (a 30 s lag is invisible at human scale)
/// while keeping the write path free of aggregate work.
/// What: 30 seconds — short enough that the operator UI doesn't feel stale
/// between manual writes, long enough that the recompute cost (in-memory
/// registry walk plus the redb `count_active_triples` per palace) is a
/// rounding error on the daemon's CPU budget.
/// Test: covered indirectly — the math has not changed, only the cadence.
// NOTE(review): `dead_code` is presumably allowed because the only reader,
// `spawn_status_event_ticker`, is itself unused in some feature
// configurations — confirm, and prefer a matching `cfg` gate if so.
#[allow(dead_code)]
const STATUS_EVENT_TICK_SECS: u64 = 30;
1794
1795/// Spawn a background ticker that emits `DaemonEvent::StatusChanged` every
1796/// [`STATUS_EVENT_TICK_SECS`] seconds (issue #228).
1797///
1798/// Why: replaces the per-write `state.emit(self.aggregate_status_event())`
1799/// call sites that used to recompute the aggregate every time a drawer was
1800/// created or deleted. Walking N palaces on every write blocks the async
1801/// runtime; coalescing the emit onto a ticker keeps dashboards up-to-date
1802/// without that cost.
1803/// What: spawns a detached tokio task that holds a full `AppState` clone
1804/// (cheap — every field is `Arc`-backed) and ticks every
1805/// [`STATUS_EVENT_TICK_SECS`] seconds. Each tick computes
1806/// `MemoryService::aggregate_status_event` (which now iterates the
1807/// in-memory registry, not disk) and broadcasts it via `state.emit`. If
1808/// no SSE subscribers are connected the broadcast `send` is a cheap no-op,
1809/// so the ticker imposes no cost when nobody is listening.
1810/// Test: not unit-tested (timing-dependent fire-and-forget); the underlying
1811/// `aggregate_status_event` math is exercised by the existing
1812/// `status_endpoint_returns_payload` path.
1813#[allow(dead_code)]
1814fn spawn_status_event_ticker(state: AppState) {
1815    tokio::spawn(async move {
1816        let mut interval =
1817            tokio::time::interval(std::time::Duration::from_secs(STATUS_EVENT_TICK_SECS));
1818        // The first tick fires immediately, which is fine: it gives SSE
1819        // subscribers a baseline `StatusChanged` shortly after they connect.
1820        loop {
1821            interval.tick().await;
1822            let event = service::MemoryService::new(state.clone()).aggregate_status_event();
1823            state.emit(event);
1824        }
1825    });
1826}
1827
/// Live SSE event stream — pushes `DaemonEvent` frames to dashboard clients.
///
/// Why: The dashboard subscribes once and reacts to live pushes (palace
/// created, drawer added/deleted, dream completed, status changed) instead of
/// polling `/api/v1/*` endpoints.
/// What: Subscribes to `state.events`, emits an initial `connected` frame,
/// then forwards every `DaemonEvent` as `data: <json>\n\n`. An event that
/// fails to serialize is reported as an `error` frame whose `message` is
/// JSON-encoded, so quotes, backslashes or newlines in the error text cannot
/// corrupt the JSON payload or split the SSE frame. Lagged subscribers
/// receive a `lag` frame indicating skipped events; channel closure ends the
/// stream.
/// Test: `web::tests::sse_stream_emits_palace_created` (covers subscribe +
/// emit + receive); manual: `curl -N http://.../sse`.
#[cfg(feature = "axum-server")]
pub(crate) async fn sse_handler(
    axum::extract::State(state): axum::extract::State<AppState>,
) -> impl axum::response::IntoResponse {
    use futures::StreamExt;
    use tokio_stream::wrappers::BroadcastStream;

    let rx = state.events.subscribe();
    let initial = futures::stream::once(async {
        Ok::<axum::body::Bytes, std::io::Error>(axum::body::Bytes::from(
            "data: {\"type\":\"connected\"}\n\n",
        ))
    });
    let events = BroadcastStream::new(rx).map(|res| {
        let frame = match res {
            Ok(event) => match serde_json::to_string(&event) {
                Ok(json) => format!("data: {json}\n\n"),
                Err(e) => {
                    // `Value`'s `Display` renders compact JSON, i.e. a quoted
                    // and fully escaped string literal for `Value::String`.
                    let message = serde_json::Value::String(e.to_string());
                    format!("data: {{\"type\":\"error\",\"message\":{message}}}\n\n")
                }
            },
            Err(tokio_stream::wrappers::errors::BroadcastStreamRecvError::Lagged(n)) => {
                format!("data: {{\"type\":\"lag\",\"skipped\":{n}}}\n\n")
            }
        };
        Ok::<axum::body::Bytes, std::io::Error>(axum::body::Bytes::from(frame))
    });
    let stream = initial.chain(events);

    axum::response::Response::builder()
        .header("Content-Type", "text/event-stream")
        .header("Cache-Control", "no-cache")
        .header("X-Accel-Buffering", "no")
        .body(axum::body::Body::from_stream(stream))
        .expect("valid SSE response") // Why: invariant — SSE headers are compile-time constants; builder cannot fail
}
1873
1874#[cfg(test)]
1875mod tests {
1876    use super::*;
1877
1878    /// Why: Issue #234 — previously we `mem::forget`ed the `TempDir` so tests
1879    /// could keep using `AppState` without juggling the directory handle, but
1880    /// that leaked one temp directory per test (262+ accumulated each run).
1881    /// What: Returns the `TempDir` alongside the `AppState` so the caller can
1882    /// bind it (`let (state, _tmp) = ...;`) and let drop semantics clean up
1883    /// when the test scope ends.
1884    /// Test: Every test in this module that constructs state.
1885    fn test_state() -> (AppState, tempfile::TempDir) {
1886        let tmp = tempfile::tempdir().expect("tempdir");
1887        let root = tmp.path().to_path_buf();
1888        // Issue #88: bypass palace-slug enforcement so lib tests that call
1889        // `palace_create` with arbitrary names keep passing.
1890        // SAFETY: constant idempotent write; safe across test threads.
1891        unsafe {
1892            std::env::set_var("TRUSTY_SKIP_PALACE_ENFORCEMENT", "1");
1893        }
1894        let state = AppState::new(root);
1895        // Pre-existing tests exercise functional paths — flip to Ready so the
1896        // issue #911 warming preflight does not reject them.
1897        state.set_ready();
1898        (state, tmp)
1899    }
1900
1901    /// Why: DaemonReadiness tests need a state that starts in Warming; this
1902    /// variant skips `set_ready()` so the transition can be tested explicitly.
1903    /// Test: `daemon_readiness_transitions_warming_to_ready`,
1904    ///       `readiness_check_ok_when_ready_err_when_warming`.
1905    fn test_state_warming() -> (AppState, tempfile::TempDir) {
1906        // Use OnceLock so the env var is written exactly once across all
1907        // parallel test threads — avoids the unsynchronised set_var race while
1908        // remaining consistent with the idempotent-write approach used in
1909        // `test_state()`.
1910        static SKIP_ENFORCEMENT_SET: std::sync::OnceLock<()> = std::sync::OnceLock::new();
1911        SKIP_ENFORCEMENT_SET.get_or_init(|| unsafe {
1912            std::env::set_var("TRUSTY_SKIP_PALACE_ENFORCEMENT", "1");
1913        });
1914        let tmp = tempfile::tempdir().expect("tempdir");
1915        let root = tmp.path().to_path_buf();
1916        // Deliberately do NOT call set_ready() — stays in Warming state.
1917        (AppState::new(root), tmp)
1918    }
1919
1920    #[tokio::test]
1921    async fn initialize_returns_protocol_version_and_capabilities() {
1922        let (state, _tmp) = test_state();
1923        let req = json!({
1924            "jsonrpc": "2.0",
1925            "id": 1,
1926            "method": "initialize",
1927            "params": {
1928                "protocolVersion": "2024-11-05",
1929                "capabilities": {},
1930                "clientInfo": {"name": "test", "version": "0"}
1931            }
1932        });
1933        let resp = handle_message(&state, req).await;
1934        assert_eq!(resp["jsonrpc"], "2.0");
1935        assert_eq!(resp["id"], 1);
1936        assert_eq!(resp["result"]["protocolVersion"], "2024-11-05");
1937        assert!(resp["result"]["capabilities"]["tools"].is_object());
1938        assert_eq!(resp["result"]["serverInfo"]["name"], "trusty-memory");
1939    }
1940
1941    #[tokio::test]
1942    async fn initialized_notification_returns_null() {
1943        let (state, _tmp) = test_state();
1944        let req = json!({
1945            "jsonrpc": "2.0",
1946            "method": "notifications/initialized",
1947            "params": {}
1948        });
1949        let resp = handle_message(&state, req).await;
1950        assert!(resp.is_null());
1951    }
1952
1953    #[tokio::test]
1954    async fn tools_list_returns_all_tools() {
1955        let (state, _tmp) = test_state();
1956        let req = json!({"jsonrpc": "2.0", "id": 2, "method": "tools/list"});
1957        let resp = handle_message(&state, req).await;
1958        let tools = resp["result"]["tools"].as_array().expect("tools array");
1959        // Issue #99 added `memory_send_message`; issue #180 added
1960        // `palace_delete`; the #180 follow-up adds `palace_update` on top
1961        // of the 22-tool baseline; issue #537 adds `upgrade`;
1962        // issue #1104 adds `console_metrics`.
1963        assert_eq!(tools.len(), 25);
1964    }
1965
1966    #[tokio::test]
1967    async fn unknown_method_returns_error() {
1968        let (state, _tmp) = test_state();
1969        let req = json!({"jsonrpc": "2.0", "id": 4, "method": "wat"});
1970        let resp = handle_message(&state, req).await;
1971        assert_eq!(resp["error"]["code"], -32601);
1972    }
1973
1974    #[tokio::test]
1975    async fn ping_returns_empty_result() {
1976        let (state, _tmp) = test_state();
1977        let req = json!({"jsonrpc": "2.0", "id": 5, "method": "ping"});
1978        let resp = handle_message(&state, req).await;
1979        assert!(resp["result"].is_object());
1980    }
1981
1982    #[tokio::test]
1983    async fn app_state_default_constructs() {
1984        let (s, _tmp) = test_state();
1985        assert!(!s.version.is_empty());
1986        assert!(s.registry.is_empty());
1987        assert!(s.default_palace.is_none());
1988    }
1989
1990    /// Why (issue #225): the previous implementation called `.expect()` on the
1991    /// tempdir fallback, which panicked the daemon at startup on hosts where
1992    /// neither the data root nor `std::env::temp_dir()` is writable
1993    /// (read-only Docker overlays, locked-down sandboxes). The activity log
1994    /// is documented as best-effort, so the fix returns a no-op `Discard`
1995    /// variant instead. This test forces both paths to fail and asserts the
1996    /// helper returns the discard variant rather than panicking.
1997    ///
1998    /// Skipped when running as root because `chmod 000` is a no-op for the
1999    /// root user — the kernel grants root access regardless of mode bits.
2000    /// CI typically runs as non-root, so coverage is preserved in the
2001    /// common case; local root invocations simply skip with a warning.
2002    #[test]
2003    #[cfg(unix)]
2004    fn open_activity_log_with_fallback_returns_discard_when_unwritable() {
2005        // Skip when running as root — chmod is ignored.
2006        // SAFETY: libc::geteuid is a thread-safe syscall with no preconditions.
2007        if unsafe { libc::geteuid() } == 0 {
2008            eprintln!(
2009                "skipping open_activity_log_with_fallback_returns_discard_when_unwritable: running as root"
2010            );
2011            return;
2012        }
2013
2014        use std::os::unix::fs::PermissionsExt;
2015
2016        // Build two unwritable directories: the primary "data root" and a
2017        // shadow "TMPDIR" so the tempdir fallback also fails.
2018        let outer = tempfile::tempdir().expect("outer tempdir");
2019        let primary = outer.path().join("primary");
2020        let tmpdir = outer.path().join("fake-tmp");
2021        std::fs::create_dir(&primary).expect("create primary");
2022        std::fs::create_dir(&tmpdir).expect("create tmpdir");
2023
2024        // chmod 000 on both — neither can be opened for write.
2025        std::fs::set_permissions(&primary, std::fs::Permissions::from_mode(0o000))
2026            .expect("chmod primary");
2027        std::fs::set_permissions(&tmpdir, std::fs::Permissions::from_mode(0o000))
2028            .expect("chmod tmpdir");
2029
2030        // Override the tempdir lookup so `open_activity_log_with_fallback`
2031        // hits our unwritable fake-tmp instead of the real system temp.
2032        // Note: env var mutation is process-global; this test is the only
2033        // accessor for `TMPDIR` in this test binary, and we restore the
2034        // previous value before returning.
2035        let prev_tmpdir = std::env::var_os("TMPDIR");
2036        std::env::set_var("TMPDIR", &tmpdir);
2037
2038        let log = open_activity_log_with_fallback(&primary);
2039
2040        // Restore TMPDIR ASAP so a panic later in the test doesn't leak it.
2041        match prev_tmpdir {
2042            Some(v) => std::env::set_var("TMPDIR", v),
2043            None => std::env::remove_var("TMPDIR"),
2044        }
2045
2046        // Restore permissions so the outer tempdir can clean up.
2047        let _ = std::fs::set_permissions(&primary, std::fs::Permissions::from_mode(0o700));
2048        let _ = std::fs::set_permissions(&tmpdir, std::fs::Permissions::from_mode(0o700));
2049
2050        assert!(
2051            log.is_discard(),
2052            "expected ActivityLog::Discard when both data root and tempdir are unwritable"
2053        );
2054
2055        // The Discard variant must still satisfy the public contract: no
2056        // panic on append/count/list.
2057        let id = log
2058            .append(
2059                ActivitySource::Http,
2060                None,
2061                "drawer_added",
2062                json!({"smoke": true}),
2063            )
2064            .expect("discard append must succeed");
2065        assert_eq!(id, 0);
2066        assert_eq!(log.count().expect("discard count"), 0);
2067        assert!(log
2068            .list(&ActivityFilter::default(), 10, 0)
2069            .expect("discard list")
2070            .is_empty());
2071    }
2072
2073    /// Why: Issue #26 — when `serve --palace <name>` is set, the MCP server
2074    /// must (a) report the default in the `initialize` `serverInfo`, (b)
2075    /// drop `palace` from the required schema in `tools/list`, and (c) let
2076    /// `tools/call` use the default when the caller omits `palace`.
2077    /// Test: Construct an AppState with a default palace, create that palace
2078    /// on disk via the registry, then call `memory_remember` without a
2079    /// `palace` argument and confirm it resolves to the default.
2080    #[tokio::test]
2081    async fn default_palace_used_when_arg_omitted() {
2082        let tmp = tempfile::tempdir().expect("tempdir");
2083        let root = tmp.path().to_path_buf();
2084
2085        // Pre-create the default palace so remember has somewhere to land.
2086        let registry = trusty_common::memory_core::PalaceRegistry::new();
2087        let palace = trusty_common::memory_core::Palace {
2088            id: trusty_common::memory_core::PalaceId::new("default-pal"),
2089            name: "default-pal".to_string(),
2090            description: None,
2091            created_at: chrono::Utc::now(),
2092            data_dir: root.join("default-pal"),
2093        };
2094        registry
2095            .create_palace(&root, palace)
2096            .expect("create_palace");
2097
2098        let state = AppState::new(root).with_default_palace(Some("default-pal".to_string()));
2099        // Flip to Ready so the readiness preflight (#911) does not reject the
2100        // `memory_remember` call below.
2101        state.set_ready();
2102
2103        // (a) initialize advertises the default.
2104        let init = handle_message(
2105            &state,
2106            json!({"jsonrpc": "2.0", "id": 1, "method": "initialize"}),
2107        )
2108        .await;
2109        assert_eq!(
2110            init["result"]["serverInfo"]["default_palace"], "default-pal",
2111            "initialize must echo default_palace in serverInfo"
2112        );
2113
2114        // (b) tools/list drops `palace` from required when default is set.
2115        let list = handle_message(
2116            &state,
2117            json!({"jsonrpc": "2.0", "id": 2, "method": "tools/list"}),
2118        )
2119        .await;
2120        let tools = list["result"]["tools"].as_array().expect("tools array");
2121        let remember = tools
2122            .iter()
2123            .find(|t| t["name"] == "memory_remember")
2124            .expect("memory_remember tool");
2125        let required: Vec<&str> = remember["inputSchema"]["required"]
2126            .as_array()
2127            .expect("required array")
2128            .iter()
2129            .filter_map(|v| v.as_str())
2130            .collect();
2131        assert!(
2132            !required.contains(&"palace"),
2133            "palace must not be required when default is configured; got {required:?}"
2134        );
2135        assert!(required.contains(&"text"));
2136
2137        // (c) tools/call resolves the default when arg is omitted.
2138        let call = handle_message(
2139            &state,
2140            json!({
2141                "jsonrpc": "2.0",
2142                "id": 3,
2143                "method": "tools/call",
2144                "params": {
2145                    "name": "memory_remember",
2146                    "arguments": {"text": "default palace test memory content with several tokens"},
2147                },
2148            }),
2149        )
2150        .await;
2151        // Successful dispatch returns `result.content[0].text` JSON.
2152        let text = call["result"]["content"][0]["text"]
2153            .as_str()
2154            .unwrap_or_else(|| panic!("expected success result, got {call}"));
2155        let parsed: Value = serde_json::from_str(text).expect("parse content json");
2156        assert_eq!(parsed["palace"], "default-pal");
2157        assert_eq!(parsed["status"], "stored");
2158        assert!(parsed["drawer_id"].as_str().is_some());
2159    }
2160
2161    /// Why: When no default is set, `tools/call` for a palace-bound tool
2162    /// without a `palace` argument should error helpfully rather than panic.
2163    #[tokio::test]
2164    async fn missing_palace_without_default_errors() {
2165        let (state, _tmp) = test_state();
2166        let resp = handle_message(
2167            &state,
2168            json!({
2169                "jsonrpc": "2.0",
2170                "id": 7,
2171                "method": "tools/call",
2172                "params": {
2173                    "name": "memory_recall",
2174                    "arguments": {"query": "anything"},
2175                },
2176            }),
2177        )
2178        .await;
2179        assert_eq!(resp["error"]["code"], -32603);
2180        let msg = resp["error"]["message"].as_str().unwrap_or("");
2181        assert!(
2182            msg.contains("missing 'palace'"),
2183            "expected helpful error, got: {msg}"
2184        );
2185    }
2186
2187    /// Why: regression for the "palaces lost on restart" bug — `AppState::new`
2188    /// builds an empty registry, so the daemon must call
2189    /// `load_palaces_from_disk` on startup to re-register palaces persisted by
2190    /// a previous run. Without that call the registry stays empty even though
2191    /// `palace.json` files exist on disk.
2192    /// What: persists two palaces under a tempdir (via the same
2193    /// `create_palace` path the `palace_create` tool uses), constructs a fresh
2194    /// `AppState` rooted there, calls `load_palaces_from_disk`, and asserts the
2195    /// returned count and registry contents.
2196    /// Test: this test itself.
2197    #[tokio::test]
2198    async fn load_palaces_from_disk_rehydrates_registry() {
2199        use trusty_common::memory_core::{Palace, PalaceId, PalaceRegistry};
2200
2201        let tmp = tempfile::tempdir().expect("tempdir");
2202        let root = tmp.path().to_path_buf();
2203
2204        // Phase 1: persist two palaces to disk, then drop the writer registry
2205        // so nothing is held in memory — simulating a prior daemon run.
2206        {
2207            let writer = PalaceRegistry::new();
2208            for id in ["alpha", "beta"] {
2209                let palace = Palace {
2210                    id: PalaceId::new(id),
2211                    name: id.to_string(),
2212                    description: None,
2213                    created_at: chrono::Utc::now(),
2214                    data_dir: root.join(id),
2215                };
2216                writer
2217                    .create_palace(&root, palace)
2218                    .expect("persist palace to disk");
2219            }
2220        }
2221
2222        // Add a stray non-palace subdirectory; the walker must ignore it.
2223        std::fs::create_dir_all(root.join("not-a-palace")).expect("mkdir");
2224
2225        // Phase 2: fresh AppState starts with an empty registry (the bug).
2226        let state = AppState::new(root);
2227        assert!(
2228            state.registry.is_empty(),
2229            "AppState::new must start with an empty registry"
2230        );
2231
2232        // The fix: hydrate from disk.
2233        let count = state
2234            .load_palaces_from_disk()
2235            .await
2236            .expect("load_palaces_from_disk");
2237
2238        assert_eq!(count, 2, "both persisted palaces should be loaded");
2239        assert_eq!(state.registry.len(), 2, "registry should hold both palaces");
2240        let ids: Vec<String> = state.registry.list().into_iter().map(|p| p.0).collect();
2241        assert!(ids.contains(&"alpha".to_string()));
2242        assert!(ids.contains(&"beta".to_string()));
2243    }
2244
2245    /// Why: existing installs (and the legacy standalone `trusty-memory` repo)
2246    /// nest palaces one level deeper under a `palaces/` subdirectory. When that
2247    /// subdirectory exists, `resolve_palace_registry_dir` must descend into it
2248    /// so the daemon scans the level that actually holds the `palace.json`
2249    /// files — otherwise it finds zero palaces, which is the restart bug.
2250    /// What: creates `<dir>/palaces/`, resolves, and asserts the nested path is
2251    /// returned.
2252    /// Test: this test itself.
2253    #[test]
2254    fn resolve_palace_registry_dir_prefers_palaces_subdir() {
2255        let tmp = tempfile::tempdir().expect("tempdir");
2256        let data_dir = tmp.path().to_path_buf();
2257        std::fs::create_dir_all(data_dir.join("palaces")).expect("mkdir palaces");
2258
2259        let resolved = resolve_palace_registry_dir(data_dir.clone());
2260        assert_eq!(resolved, data_dir.join("palaces"));
2261    }
2262
2263    /// Why: a fresh install with no `palaces/` subdirectory must fall back to
2264    /// the data dir itself (the current flat monorepo layout).
2265    #[test]
2266    fn resolve_palace_registry_dir_falls_back_to_data_dir() {
2267        let tmp = tempfile::tempdir().expect("tempdir");
2268        let data_dir = tmp.path().to_path_buf();
2269
2270        let resolved = resolve_palace_registry_dir(data_dir.clone());
2271        assert_eq!(resolved, data_dir);
2272    }
2273
2274    /// Why: defense-in-depth assertion (#503) — a non-absolute data_dir must be
2275    /// caught and rejected before reaching `AppState::new`, as it would create
2276    /// palace dirs relative to the daemon CWD (/ under launchd).
2277    /// What: passing a relative path to `resolve_palace_registry_dir` produces
2278    /// a relative result; the daemon startup guard in `main.rs` must refuse it.
2279    /// This test validates the outcome of that guard path by confirming that a
2280    /// relative input would NOT produce an absolute registry dir.
2281    /// Test: this test itself.
2282    #[test]
2283    fn resolve_palace_registry_dir_relative_input_is_not_absolute() {
2284        // A relative dir is not a valid input, but we want to confirm that
2285        // if one somehow slipped through, the result would also be relative —
2286        // so the `main.rs` guard (is_absolute check) correctly rejects it.
2287        let relative = std::path::PathBuf::from("relative/path");
2288        let result = resolve_palace_registry_dir(relative.clone());
2289        assert!(
2290            !result.is_absolute(),
2291            "a relative input should produce a relative registry dir (caught by main.rs guard)"
2292        );
2293    }
2294
2295    /// Why: defense-in-depth assertion (#503) — a data_dir equal to "/" must be
2296    /// caught before reaching `AppState::new` to prevent palace dirs at `/`.
2297    /// What: confirms that "/" produces a result that equals "/", which the
2298    /// `main.rs` startup guard correctly rejects.
2299    /// Test: this test itself.
2300    #[test]
2301    fn resolve_palace_registry_dir_root_input_stays_root() {
2302        let root = std::path::PathBuf::from("/");
2303        let result = resolve_palace_registry_dir(root);
2304        // The guard in main.rs checks data_dir (before calling this fn), so
2305        // "/" would be caught before reaching this fn. But if it did reach here,
2306        // we confirm it yields "/" (no palaces/ subdir exists there), which would
2307        // then be caught by the main.rs post-resolve guard.
2308        assert_eq!(result, std::path::PathBuf::from("/"));
2309    }
2310
2311    /// Why: end-to-end check that the nested-`palaces/` layout hydrates — the
2312    /// daemon resolves the registry dir via `resolve_palace_registry_dir`, so
2313    /// an `AppState` rooted there must load palaces persisted one level below
2314    /// the bare data dir.
2315    /// What: persists two palaces under `<root>/palaces/<id>/`, constructs an
2316    /// `AppState` rooted at the resolved registry dir, and asserts hydration
2317    /// finds both.
2318    /// Test: this test itself.
2319    #[tokio::test]
2320    async fn load_palaces_from_disk_handles_palaces_subdir() {
2321        use trusty_common::memory_core::{Palace, PalaceId, PalaceRegistry};
2322
2323        let tmp = tempfile::tempdir().expect("tempdir");
2324        let root = tmp.path().to_path_buf();
2325        let nested = root.join("palaces");
2326
2327        {
2328            let writer = PalaceRegistry::new();
2329            for id in ["cto", "engineering"] {
2330                let palace = Palace {
2331                    id: PalaceId::new(id),
2332                    name: id.to_string(),
2333                    description: None,
2334                    created_at: chrono::Utc::now(),
2335                    data_dir: nested.join(id),
2336                };
2337                // create_palace anchors data_dir under the passed root, so
2338                // pass `nested` here to land palaces under `<root>/palaces/`.
2339                writer
2340                    .create_palace(&nested, palace)
2341                    .expect("persist palace under palaces/ subdir");
2342            }
2343        }
2344
2345        // Mirror main.rs: resolve the registry dir, then root AppState there.
2346        let registry_dir = resolve_palace_registry_dir(root);
2347        assert_eq!(registry_dir, nested, "must resolve into palaces/ subdir");
2348        let state = AppState::new(registry_dir);
2349        let count = state
2350            .load_palaces_from_disk()
2351            .await
2352            .expect("load_palaces_from_disk");
2353
2354        assert_eq!(count, 2, "both nested palaces should be loaded");
2355        assert_eq!(state.registry.len(), 2);
2356        let ids: Vec<String> = state.registry.list().into_iter().map(|p| p.0).collect();
2357        assert!(ids.contains(&"cto".to_string()));
2358        assert!(ids.contains(&"engineering".to_string()));
2359    }
2360
2361    /// Why: an empty (or missing) palace registry directory must not error — a
2362    /// brand-new install has nothing to hydrate and should report zero.
2363    #[tokio::test]
2364    async fn load_palaces_from_disk_empty_root_returns_zero() {
2365        let (state, _tmp) = test_state();
2366        let count = state
2367            .load_palaces_from_disk()
2368            .await
2369            .expect("load_palaces_from_disk on empty root");
2370        assert_eq!(count, 0);
2371        assert!(state.registry.is_empty());
2372    }
2373
2374    /// Why (issue #228): hydration must seed `state.palace_names` so the
2375    /// MCP write hot path (`memory_remember` / `memory_note`) can resolve a
2376    /// friendly palace name without re-walking the data root on every call.
2377    /// Regression risk: a future refactor that forgets to populate the cache
2378    /// would silently degrade write latency.
2379    /// What: persists two palaces with distinct `name` values, constructs a
2380    /// fresh `AppState`, hydrates from disk, and asserts the cache holds the
2381    /// expected mappings.
2382    /// Test: this test itself.
2383    #[tokio::test]
2384    async fn palace_name_cache_populated_after_hydration() {
2385        use trusty_common::memory_core::{Palace, PalaceId, PalaceRegistry};
2386
2387        let tmp = tempfile::tempdir().expect("tempdir");
2388        let root = tmp.path().to_path_buf();
2389        {
2390            let writer = PalaceRegistry::new();
2391            for (id, name) in [("alpha", "Alpha Project"), ("beta", "Beta Project")] {
2392                let palace = Palace {
2393                    id: PalaceId::new(id),
2394                    name: name.to_string(),
2395                    description: None,
2396                    created_at: chrono::Utc::now(),
2397                    data_dir: root.join(id),
2398                };
2399                writer.create_palace(&root, palace).expect("persist palace");
2400            }
2401        }
2402
2403        let state = AppState::new(root);
2404        assert!(
2405            state.palace_names.is_empty(),
2406            "fresh AppState must start with an empty name cache"
2407        );
2408        state
2409            .load_palaces_from_disk()
2410            .await
2411            .expect("load_palaces_from_disk");
2412
2413        assert_eq!(state.palace_names.len(), 2, "cache must hold both palaces");
2414        assert_eq!(
2415            state.palace_names.get("alpha").map(|e| e.value().clone()),
2416            Some("Alpha Project".to_string()),
2417        );
2418        assert_eq!(
2419            state.palace_names.get("beta").map(|e| e.value().clone()),
2420            Some("Beta Project".to_string()),
2421        );
2422    }
2423
2424    /// Why (issue #228): `palace_create` (MCP tool) and `MemoryService::create_palace`
2425    /// (HTTP path) both insert into the name cache so a freshly-created palace
2426    /// is resolvable on the very next write — without waiting for the next
2427    /// hydration cycle.
2428    /// What: dispatches the `palace_create` MCP tool against a tempdir and
2429    /// asserts the cache row was written.
2430    /// Test: this test itself.
2431    #[tokio::test]
2432    async fn palace_name_cache_updates_on_create() {
2433        use serde_json::json;
2434
2435        let (state, _tmp) = test_state();
2436        let _ = tools::dispatch_tool(&state, "palace_create", json!({"name": "gamma"}))
2437            .await
2438            .expect("palace_create");
2439        assert_eq!(
2440            state.palace_names.get("gamma").map(|e| e.value().clone()),
2441            Some("gamma".to_string()),
2442            "palace_create must populate the in-memory name cache so writes \
2443             can resolve the friendly name without a disk walk"
2444        );
2445    }
2446
2447    /// Why: initialize without a default palace must omit `default_palace`
2448    /// from `serverInfo` so clients can detect the unbound mode.
2449    #[tokio::test]
2450    async fn initialize_without_default_palace_omits_field() {
2451        let (state, _tmp) = test_state();
2452        let init = handle_message(
2453            &state,
2454            json!({"jsonrpc": "2.0", "id": 1, "method": "initialize"}),
2455        )
2456        .await;
2457        assert!(init["result"]["serverInfo"]["default_palace"].is_null());
2458    }
2459
2460    /// Why: every `~/.trusty-memory/http_addr` consumer (CLI, dashboard,
2461    /// future trusty-mpm wiring) must agree on the path. A regression that
2462    /// moves this file breaks every client relying on `read_daemon_addr`.
2463    /// What: under a stubbed data dir, the path ends in
2464    /// `trusty-memory/http_addr` — matching `trusty_common::read_daemon_addr`'s
2465    /// expected location.
2466    #[tokio::test]
2467    async fn http_addr_path_uses_resolve_data_dir() {
2468        // Hold the env_test_lock so this test does not race with
2469        // `prompt_context::tests::*` which spin a real daemon under
2470        // the same env override and would otherwise observe a
2471        // half-mutated $TRUSTY_DATA_DIR_OVERRIDE.
2472        let _guard = crate::commands::env_test_lock().lock().await;
2473        let tmp = tempfile::tempdir().unwrap();
2474        // SAFETY: test-only env mutation serialised by env_test_lock.
2475        unsafe {
2476            std::env::set_var(trusty_common::DATA_DIR_OVERRIDE_ENV, tmp.path());
2477        }
2478        let result = http_addr_path();
2479        unsafe {
2480            std::env::remove_var(trusty_common::DATA_DIR_OVERRIDE_ENV);
2481        }
2482        let p = result.expect("http_addr_path must return Some when data dir is resolvable");
2483        assert!(
2484            p.ends_with("trusty-memory/http_addr"),
2485            "unexpected http_addr path: {}",
2486            p.display()
2487        );
2488    }
2489
2490    /// Why: write+read round-trip pins the disk format: a single line of
2491    /// `host:port\n`. Clients (cat, sh `$(cat ...)`) trim whitespace, so the
2492    /// trailing newline is invisible — but anything else (extra whitespace,
2493    /// multi-line) would break callers.
2494    /// Note (issue #226): `write_http_addr_file` is part of the HTTP-serving
2495    /// surface gated behind `axum-server`; the test follows the same gate.
2496    #[cfg(feature = "axum-server")]
2497    #[test]
2498    fn http_addr_file_round_trip_via_helpers() {
2499        let dir = tempfile::tempdir().unwrap();
2500        let path = dir.path().join("http_addr");
2501        let addr: SocketAddr = "127.0.0.1:7073".parse().unwrap();
2502        write_http_addr_file(&path, &addr).unwrap();
2503        let raw = std::fs::read_to_string(&path).unwrap();
2504        assert_eq!(raw.trim(), "127.0.0.1:7073");
2505        // The trailing newline keeps `cat` and editors happy.
2506        assert!(raw.ends_with('\n'));
2507    }
2508
2509    /// Why: dynamic binding must succeed even when the preferred port is
2510    /// already in use. Walking 7070..=7079 + OS fallback guarantees the
2511    /// daemon never fails to come up just because another process holds 7070.
2512    /// What: pre-bind 7070 (best-effort — skip the test if it's already
2513    /// busy on the host), then call `bind_dynamic_port` and assert we got
2514    /// *some* listener back.
2515    #[tokio::test]
2516    async fn bind_dynamic_port_returns_listener() {
2517        let listener = bind_dynamic_port().await.expect("bind_dynamic_port");
2518        let addr = listener.local_addr().expect("local_addr");
2519        assert_eq!(addr.ip().to_string(), "127.0.0.1");
2520        assert!(addr.port() > 0, "port must be non-zero after bind");
2521    }
2522
2523    /// Why: Issue #42 — prompt-facts are now served by the per-message
2524    /// `get_prompt_context` tool rather than the MCP prompts surface, so the
2525    /// `initialize` handshake must NOT advertise a `prompts` capability and
2526    /// `prompts/list` / `prompts/get` must fall through to the "method not
2527    /// found" path.
2528    #[tokio::test]
2529    async fn initialize_does_not_advertise_prompts_capability() {
2530        let (state, _tmp) = test_state();
2531        let init = handle_message(
2532            &state,
2533            json!({"jsonrpc": "2.0", "id": 1, "method": "initialize"}),
2534        )
2535        .await;
2536        assert!(
2537            init["result"]["capabilities"]["prompts"].is_null(),
2538            "initialize must NOT advertise the prompts capability; got {init}"
2539        );
2540
2541        // Both prompts/* dispatchers should now report method-not-found.
2542        for method in ["prompts/list", "prompts/get"] {
2543            let resp =
2544                handle_message(&state, json!({"jsonrpc": "2.0", "id": 2, "method": method})).await;
2545            assert_eq!(
2546                resp["error"]["code"], -32601,
2547                "{method} should return method-not-found; got {resp}"
2548            );
2549        }
2550    }
2551
2552    /// Why: `AppState::new` must initialise `bound_addr` to an empty
2553    /// `OnceLock` so `/health` reports `addr: None` on the stdio path. A
2554    /// regression that pre-populates this field would advertise a bogus
2555    /// address from a stale clone.
2556    ///
2557    /// Note (issue #231): now async so it runs inside a Tokio runtime —
2558    /// `AppState::new` spawns the bounded BM25 index worker via
2559    /// `tokio::spawn`, which requires an active runtime.
2560    #[tokio::test]
2561    async fn app_state_starts_with_empty_bound_addr() {
2562        let (state, _tmp) = test_state();
2563        assert!(state.bound_addr.get().is_none());
2564    }
2565
2566    /// Why (issue #96): `DaemonEvent::type_str` underpins the persisted
2567    /// activity log's `event_type` column — every variant must map to the
2568    /// exact SSE `type` tag the UI already handles. A drift between the
2569    /// SSE wire format and the stored type would break the feed's icon /
2570    /// label rendering for historical rows.
2571    /// What: constructs one of each variant, serialises via serde, and
2572    /// confirms `type_str()` matches the JSON `type` field.
2573    /// Test: this test.
2574    #[test]
2575    fn daemon_event_type_str_matches_sse_tag() {
2576        let cases = [
2577            DaemonEvent::PalaceCreated {
2578                id: "p".into(),
2579                name: "p".into(),
2580                source: ActivitySource::Http,
2581            },
2582            DaemonEvent::DrawerAdded {
2583                palace_id: "p".into(),
2584                palace_name: "p".into(),
2585                drawer_count: 1,
2586                timestamp: chrono::Utc::now(),
2587                content_preview: String::new(),
2588                source: ActivitySource::Mcp,
2589            },
2590            DaemonEvent::DrawerDeleted {
2591                palace_id: "p".into(),
2592                drawer_count: 0,
2593                source: ActivitySource::Http,
2594            },
2595            DaemonEvent::DreamCompleted {
2596                palace_id: None,
2597                merged: 0,
2598                pruned: 0,
2599                compacted: 0,
2600                closets_updated: 0,
2601                duration_ms: 0,
2602                source: ActivitySource::Http,
2603            },
2604            DaemonEvent::StatusChanged {
2605                total_drawers: 0,
2606                total_vectors: 0,
2607                total_kg_triples: 0,
2608            },
2609            DaemonEvent::HookFired {
2610                palace_id: Some("p".into()),
2611                palace_name: Some("p".into()),
2612                hook_type: HookType::UserPromptSubmit,
2613                injection_kind: InjectionKind::PromptContext,
2614                injection_length: 12,
2615                trigger_prompt_excerpt: "hello".into(),
2616                timestamp: chrono::Utc::now(),
2617                duration_ms: 5,
2618                source: ActivitySource::Hook,
2619            },
2620        ];
2621        for ev in &cases {
2622            let json = serde_json::to_value(ev).unwrap();
2623            assert_eq!(json["type"].as_str(), Some(ev.type_str()));
2624        }
2625    }
2626
2627    /// Why: `HookType` is serialised on every `HookFired` activity row; its
2628    /// wire format must round-trip cleanly so dashboard / TUI consumers can
2629    /// safely parse historic entries written by an older daemon build.
2630    /// What: serde-encodes each variant, asserts the JSON matches the
2631    /// expected PascalCase label, then decodes back.
2632    /// Test: itself.
2633    #[test]
2634    fn hook_type_serde_round_trips() {
2635        let cases = [
2636            (HookType::UserPromptSubmit, "\"UserPromptSubmit\""),
2637            (HookType::SessionStart, "\"SessionStart\""),
2638        ];
2639        for (ht, expected) in cases {
2640            let s = serde_json::to_string(&ht).unwrap();
2641            assert_eq!(s, expected, "{ht:?} should serialise to {expected}");
2642            let back: HookType = serde_json::from_str(&s).unwrap();
2643            assert_eq!(back, ht);
2644            assert_eq!(ht.as_str(), expected.trim_matches('"'));
2645        }
2646    }
2647
2648    /// Why: same as `hook_type_serde_round_trips` but for `InjectionKind`.
2649    /// What: kebab-case round trip on every variant.
2650    /// Test: itself.
2651    #[test]
2652    fn injection_kind_serde_round_trips() {
2653        let cases = [
2654            (InjectionKind::PromptContext, "\"prompt-context\""),
2655            (InjectionKind::InboxCheck, "\"inbox-check\""),
2656        ];
2657        for (ik, expected) in cases {
2658            let s = serde_json::to_string(&ik).unwrap();
2659            assert_eq!(s, expected);
2660            let back: InjectionKind = serde_json::from_str(&s).unwrap();
2661            assert_eq!(back, ik);
2662            assert_eq!(ik.as_str(), expected.trim_matches('"'));
2663        }
2664    }
2665
2666    /// Why: the activity feed renders the trigger prompt excerpt directly;
2667    /// runaway prompts must be capped at [`HOOK_PROMPT_EXCERPT_CHARS`] with
2668    /// a `…` marker so the row stays readable.
2669    /// What: feeds a 200-character prompt and asserts the excerpt is
2670    /// bounded.
2671    /// Test: itself.
2672    #[test]
2673    fn hook_excerpt_truncates_long_prompts() {
2674        let long = "x".repeat(200);
2675        let excerpt = hook_prompt_excerpt(&long);
2676        assert!(excerpt.chars().count() <= HOOK_PROMPT_EXCERPT_CHARS);
2677        assert!(excerpt.ends_with('…'));
2678        assert_eq!(hook_prompt_excerpt(""), "");
2679    }
2680
2681    /// Why: multi-line prompts must collapse to a single line so the
2682    /// activity feed row doesn't blow out vertically.
2683    /// What: feeds a multi-line whitespace-heavy prompt and asserts the
2684    /// output is a single-spaced single line.
2685    /// Test: itself.
2686    #[test]
2687    fn hook_excerpt_collapses_whitespace() {
2688        let input = "hello\n\nworld\t\tfoo";
2689        let excerpt = hook_prompt_excerpt(input);
2690        assert_eq!(excerpt, "hello world foo");
2691    }
2692
2693    /// Why (issue #96): `palace_id()` and `source()` feed the persisted
2694    /// activity log's columns; they must extract the right field per
2695    /// variant. Sloppy refactors could swap two fields and the log would
2696    /// silently mis-attribute writes.
2697    /// What: builds each variant with known field values and asserts the
2698    /// extractor returns them.
2699    /// Test: this test.
2700    #[test]
2701    fn daemon_event_palace_id_and_source_extraction() {
2702        let ev = DaemonEvent::DrawerAdded {
2703            palace_id: "alpha".into(),
2704            palace_name: "alpha".into(),
2705            drawer_count: 1,
2706            timestamp: chrono::Utc::now(),
2707            content_preview: String::new(),
2708            source: ActivitySource::Mcp,
2709        };
2710        assert_eq!(ev.palace_id(), Some("alpha"));
2711        assert_eq!(ev.source(), Some(ActivitySource::Mcp));
2712
2713        let status = DaemonEvent::StatusChanged {
2714            total_drawers: 1,
2715            total_vectors: 2,
2716            total_kg_triples: 3,
2717        };
2718        assert_eq!(status.palace_id(), None);
2719        assert_eq!(status.source(), None);
2720
2721        let dream = DaemonEvent::DreamCompleted {
2722            palace_id: Some("p1".into()),
2723            merged: 0,
2724            pruned: 0,
2725            compacted: 0,
2726            closets_updated: 0,
2727            duration_ms: 10,
2728            source: ActivitySource::Http,
2729        };
2730        assert_eq!(dream.palace_id(), Some("p1"));
2731        assert_eq!(dream.source(), Some(ActivitySource::Http));
2732    }
2733
2734    /// Why (issue #96): `AppState::emit` must persist mutation events to
2735    /// the activity log while keeping `StatusChanged` (a recomputed
2736    /// aggregate, not a mutation) out of the persisted history.
2737    /// What: emits one of each variant under a fresh state and asserts
2738    /// the persisted count matches the number of mutation events.
2739    /// Test: this test.
2740    #[tokio::test]
2741    async fn emit_persists_mutations_but_skips_status_changed() {
2742        let (state, _tmp) = test_state();
2743        state.emit(DaemonEvent::PalaceCreated {
2744            id: "p".into(),
2745            name: "p".into(),
2746            source: ActivitySource::Http,
2747        });
2748        state.emit(DaemonEvent::StatusChanged {
2749            total_drawers: 1,
2750            total_vectors: 0,
2751            total_kg_triples: 0,
2752        });
2753        state.emit(DaemonEvent::DrawerAdded {
2754            palace_id: "p".into(),
2755            palace_name: "p".into(),
2756            drawer_count: 1,
2757            timestamp: chrono::Utc::now(),
2758            content_preview: "x".into(),
2759            source: ActivitySource::Mcp,
2760        });
2761        // Issue #232: `emit` now offloads the redb write to `spawn_blocking`,
2762        // so the test must wait for the background pool to drain before
2763        // asserting on the persisted count.
2764        state.flush_activity_writes().await;
2765        let count = state.activity_log.count().unwrap();
2766        assert_eq!(count, 2, "only PalaceCreated + DrawerAdded must persist");
2767    }
2768
2769    /// Why (issue #156): the BM25 lane must be opt-in — existing deployments
2770    /// that don't set `TRUSTY_BM25_DAEMON=1` must see `bm25_client = None`
2771    /// and the recall hot path must continue to behave exactly as before.
2772    /// What: builds an `AppState` with `with_bm25_client_from_env()` while
2773    /// the env var is unset; asserts the field stays `None`.
2774    /// Test: this test.
2775    #[tokio::test]
2776    async fn bm25_client_disabled_by_default() {
2777        // Serialise with the sibling `bm25_client_enabled_when_env_set` test
2778        // so they don't race on the shared `TRUSTY_BM25_DAEMON` env var.
2779        let _guard = crate::commands::env_test_lock().lock().await;
2780        // SAFETY: this test exercises std::env::remove_var which is unsafe
2781        // in 2024 edition because the global env is shared. We restore the
2782        // pre-test value at the end so neighbours are unaffected.
2783        let prev = std::env::var("TRUSTY_BM25_DAEMON").ok();
2784        unsafe {
2785            std::env::remove_var("TRUSTY_BM25_DAEMON");
2786        }
2787        let (state, _tmp) = test_state();
2788        let state = state.with_bm25_client_from_env();
2789        assert!(
2790            state.bm25_client.is_none(),
2791            "bm25_client must be None when TRUSTY_BM25_DAEMON is unset"
2792        );
2793        // Issue #193: the spawn supervisor is bound to the same env gate as
2794        // the client — opt-out parity matters so we never accidentally
2795        // spawn daemons in deployments that explicitly didn't opt in.
2796        assert!(
2797            state.bm25_supervisor.is_none(),
2798            "bm25_supervisor must be None when TRUSTY_BM25_DAEMON is unset"
2799        );
2800        if let Some(v) = prev {
2801            unsafe {
2802                std::env::set_var("TRUSTY_BM25_DAEMON", v);
2803            }
2804        }
2805    }
2806
2807    /// Why (issue #156): when the operator opts in via `TRUSTY_BM25_DAEMON=1`,
2808    /// the builder must construct a real `Bm25Client` pointed at the canonical
2809    /// per-palace socket path. We don't connect — no daemon need be running —
2810    /// we only assert the client field is populated.
2811    /// What: sets the env var, runs the builder, asserts `Some(_)`.
2812    /// Test: this test.
2813    #[tokio::test]
2814    async fn bm25_client_enabled_when_env_set() {
2815        let _guard = crate::commands::env_test_lock().lock().await;
2816        let prev = std::env::var("TRUSTY_BM25_DAEMON").ok();
2817        unsafe {
2818            std::env::set_var("TRUSTY_BM25_DAEMON", "1");
2819        }
2820        let (state, _tmp) = test_state();
2821        let state = state.with_bm25_client_from_env();
2822        assert!(
2823            state.bm25_client.is_some(),
2824            "bm25_client must be Some when TRUSTY_BM25_DAEMON=1"
2825        );
2826        // Issue #193: opting in to the client must also install the spawn
2827        // supervisor so the daemon is auto-started on first use.
2828        assert!(
2829            state.bm25_supervisor.is_some(),
2830            "bm25_supervisor must be Some when TRUSTY_BM25_DAEMON=1"
2831        );
2832        match prev {
2833            Some(v) => unsafe { std::env::set_var("TRUSTY_BM25_DAEMON", v) },
2834            None => unsafe { std::env::remove_var("TRUSTY_BM25_DAEMON") },
2835        }
2836    }
2837
2838    // -------------------------------------------------------------------------
2839    // Issues #910 / #911 — DaemonReadiness
2840    // -------------------------------------------------------------------------
2841
2842    /// Why (issue #910/#911): `AppState` starts in `Warming` state; `set_ready`
2843    /// must flip it to `Ready` atomically; subsequent `readiness()` calls must
2844    /// observe `Ready`.
2845    /// What: construct a state, assert `Warming`, call `set_ready`, assert
2846    /// `Ready`.
2847    /// Test: this test.
2848    #[tokio::test]
2849    async fn daemon_readiness_transitions_warming_to_ready() {
2850        let (state, _tmp) = test_state_warming();
2851        assert_eq!(
2852            state.readiness(),
2853            DaemonReadiness::Warming,
2854            "daemon must start in Warming state"
2855        );
2856        state.set_ready();
2857        assert_eq!(
2858            state.readiness(),
2859            DaemonReadiness::Ready,
2860            "daemon must be Ready after set_ready()"
2861        );
2862    }
2863
2864    /// Why (issue #911): `readiness_check` must return `Ok(())` when Ready and
2865    /// an explicit `Err` when Warming, so tool handlers can use `?` to short-
2866    /// circuit without blocking.
2867    /// What: verify both states.
2868    /// Test: this test.
2869    #[tokio::test]
2870    async fn readiness_check_ok_when_ready_err_when_warming() {
2871        let (state, _tmp) = test_state_warming();
2872        // Warming → should error.
2873        let err = state
2874            .readiness_check()
2875            .expect_err("readiness_check must fail when Warming");
2876        let msg = err.to_string();
2877        assert!(
2878            msg.contains("warming up"),
2879            "error must mention 'warming up'; got: {msg}"
2880        );
2881        // Ready → should succeed.
2882        state.set_ready();
2883        state
2884            .readiness_check()
2885            .expect("readiness_check must succeed when Ready");
2886    }
2887
2888    /// Why (issue #911): `DaemonReadiness::from_u8` must map 0 → Warming and
2889    /// any non-zero → Ready.
2890    /// Test: this test.
2891    #[test]
2892    fn daemon_readiness_from_u8() {
2893        assert_eq!(DaemonReadiness::from_u8(0), DaemonReadiness::Warming);
2894        assert_eq!(DaemonReadiness::from_u8(1), DaemonReadiness::Ready);
2895        assert_eq!(DaemonReadiness::from_u8(255), DaemonReadiness::Ready);
2896    }
2897
2898    // -------------------------------------------------------------------------
2899    // Issue #467 — palaces skipped at startup hydration are lazily re-opened
2900    // -------------------------------------------------------------------------
2901
2902    /// Why (issue #467): when `load_palaces_from_disk` fails to open a palace
2903    /// (e.g. EMFILE — "Too many open files"), it logs a warning and does NOT
2904    /// register the palace in the in-memory registry. A subsequent call to
2905    /// `open_palace` for that id must attempt a fresh open from disk, not
2906    /// permanently return "not found". This test verifies the lazy-reopen
2907    /// path: create a palace on disk, remove it from the registry (simulating a
2908    /// startup-hydration skip), then open it via `open_palace` and assert success.
2909    /// What: builds an `AppState` with an on-disk palace that is subsequently
2910    /// removed from the in-memory registry (simulating what happens when
2911    /// `PalaceHandle::open` fails during `load_palaces_from_disk`), calls
2912    /// `registry.open_palace`, and asserts the palace handle is returned.
2913    /// Test: this test.
2914    #[tokio::test]
2915    async fn open_palace_lazy_reopens_hydration_skipped_palace() {
2916        let (state, _tmp) = test_state();
2917        // Create a palace on disk.
2918        let pid = trusty_common::memory_core::palace::PalaceId::new("hydration-skip");
2919        let palace = trusty_common::memory_core::Palace {
2920            id: pid.clone(),
2921            name: "hydration-skip".to_string(),
2922            description: None,
2923            created_at: chrono::Utc::now(),
2924            data_dir: state.data_root.join("hydration-skip"),
2925        };
2926        state
2927            .registry
2928            .create_palace(&state.data_root, palace)
2929            .expect("create_palace");
2930
2931        // Simulate a startup-hydration skip by removing the just-registered handle.
2932        // In production, the palace is simply never registered because
2933        // PalaceHandle::open failed during load_palaces_from_disk (EMFILE etc.).
2934        state.registry.remove(&pid);
2935
2936        // The registry must now report no in-memory handle for this palace.
2937        assert!(
2938            state.registry.get(&pid).is_none(),
2939            "palace must appear absent (simulating hydration skip) before the lazy-reopen"
2940        );
2941
2942        // Calling open_palace should attempt a fresh open from disk and succeed.
2943        let handle = state
2944            .registry
2945            .open_palace(&state.data_root, &pid)
2946            .expect("open_palace must lazily reopen a hydration-skipped palace");
2947        assert_eq!(handle.id.as_str(), "hydration-skip");
2948    }
2949
2950    // -------------------------------------------------------------------------
2951    // Issue #498 — dotfile http_addr path uses $HOME/.trusty-memory/http_addr
2952    // -------------------------------------------------------------------------
2953
2954    /// Why (issue #498): claude-mpm's `migrate_trusty_autodetect` reads
2955    /// `~/.trusty-memory/http_addr` to discover the daemon's port. On macOS
2956    /// the OS-standard data dir differs from the dotfile path, so the daemon
2957    /// was writing to the wrong location and claude-mpm always fell back to the
2958    /// hardcoded port `7070`. This test confirms `dotfile_http_addr_path()`
2959    /// returns a path rooted at `$HOME/.trusty-memory/http_addr`.
2960    /// What: under a known HOME, calls `dotfile_http_addr_path` and asserts the
2961    /// returned path ends in `.trusty-memory/http_addr`.
2962    /// Test: this test.
2963    #[cfg(feature = "axum-server")]
2964    #[test]
2965    fn dotfile_http_addr_path_uses_home_dir() {
2966        // `dirs::home_dir()` is not redirectable via env on macOS, but we can
2967        // at least assert that when it returns Some, the suffix is correct.
2968        if let Some(p) = dotfile_http_addr_path() {
2969            assert!(
2970                p.ends_with(".trusty-memory/http_addr"),
2971                "dotfile path must end in .trusty-memory/http_addr; got {}",
2972                p.display()
2973            );
2974        }
2975        // If home_dir() returns None (locked-down env), the function returns None —
2976        // that's acceptable; we just skip the assertion.
2977    }
2978
2979    /// Why (issue #498): the daemon must write to the dotfile path so that
2980    /// claude-mpm's `_resolve_base_url` finds the running port. This round-trip
2981    /// test exercises `write_http_addr_file` at a dotfile-shaped path and
2982    /// confirms the content is readable after the atomic rename.
2983    /// What: picks a tempdir as a stand-in for $HOME, writes an addr to
2984    /// `.trusty-memory/http_addr`, and reads it back.
2985    /// Test: this test.
2986    #[cfg(feature = "axum-server")]
2987    #[test]
2988    fn dotfile_http_addr_write_read_round_trip() {
2989        let home = tempfile::tempdir().unwrap();
2990        let dotfile_dir = home.path().join(".trusty-memory");
2991        let path = dotfile_dir.join("http_addr");
2992        let addr: SocketAddr = "127.0.0.1:7099".parse().unwrap();
2993        write_http_addr_file(&path, &addr).expect("write_http_addr_file to dotfile path");
2994        let raw = std::fs::read_to_string(&path).unwrap();
2995        assert_eq!(
2996            raw.trim(),
2997            "127.0.0.1:7099",
2998            "dotfile round-trip content mismatch"
2999        );
3000        assert!(raw.ends_with('\n'), "dotfile must end with a newline");
3001    }
3002}