Skip to main content

trusty_memory/
lib.rs

1//! MCP server (stdio + HTTP/SSE) for trusty-memory.
2//!
3//! Why: Claude Code and other MCP-aware clients integrate with trusty-memory
4//! through the standardized Model Context Protocol; we expose memory + KG
5//! tools so they can be called by name.
6//! What: Provides `run_stdio` (JSON-RPC 2.0 over stdin/stdout) and `run_http`
7//! (axum HTTP/SSE stub), plus an `AppState` that carries the shared
8//! `PalaceRegistry`, on-disk data root, and a lazily-initialized embedder.
9//! Test: `cargo test -p trusty-memory-mcp` validates handshake + dispatch.
10
11use anyhow::Result;
12use serde_json::{json, Value};
13use std::net::SocketAddr;
14use std::path::{Path, PathBuf};
15use std::sync::{Arc, OnceLock, RwLock};
16use tokio::sync::{broadcast, OnceCell};
17use tracing::info;
18use trusty_common::mcp::{error_codes, initialize_response, Request, Response};
19use trusty_common::memory_core::embed::FastEmbedder;
20use trusty_common::memory_core::store::ChatSessionStore;
21use trusty_common::memory_core::PalaceRegistry;
22use trusty_common::ChatProvider;
23
24pub mod bootstrap;
25pub mod commands;
26pub mod discovery;
27pub mod openrpc;
28pub mod prompt_facts;
29pub mod service;
30pub mod tools;
31pub mod web;
32
33pub use service::MemoryMcpService;
34pub use tools::MemoryMcpServer;
35
/// Pick the directory whose immediate children are the per-palace directories.
///
/// Why: two on-disk layouts exist. The current monorepo code treats the
/// registry directory *itself* as the parent of the per-palace dirs
/// (`<dir>/<id>/palace.json`), while the legacy standalone `trusty-memory`
/// repo nested them one level deeper (`<data_dir>/palaces/<id>/palace.json`)
/// — which is where existing installs keep their data (e.g. 88 palaces under
/// `~/Library/Application Support/trusty-memory/palaces/`). A daemon that used
/// the bare data dir as its registry root found zero palaces because every
/// `palace.json` sat one level below where it looked: the "palaces lost on
/// restart" bug.
/// What: returns `<data_dir>/palaces` when that path is an existing
/// directory; otherwise hands `data_dir` back unchanged. `main.rs` resolves
/// this once and uses the result as `AppState::data_root`, so every call site
/// (`status`, `palace_list`, `open_palace`, `palace_create`,
/// `load_palaces_from_disk`) agrees on the root without a data migration.
/// Test: `tests::resolve_palace_registry_dir_prefers_palaces_subdir` and
/// `resolve_palace_registry_dir_falls_back_to_data_dir`.
pub fn resolve_palace_registry_dir(data_dir: PathBuf) -> PathBuf {
    let legacy_root = data_dir.join("palaces");
    // Guard clause: no legacy `palaces/` directory means the flat layout.
    if !legacy_root.is_dir() {
        return data_dir;
    }
    legacy_root
}
63
/// Live daemon events broadcast to connected SSE subscribers.
///
/// Why: The dashboard needs push-driven updates so palace creation, drawer
/// add/delete, dream cycles, and aggregate status changes are visible without
/// polling. A single broadcast channel fans out to every connected browser.
/// What: Tagged enum serialized as `{"type": "...", ...fields}` over SSE.
/// The `type` value is the variant name in snake_case (`rename_all`), e.g.
/// `DrawerAdded` → `"drawer_added"`; the variant's fields sit next to the tag
/// in the same JSON object.
/// Test: `web::tests::sse_stream_emits_events` subscribes, triggers a
/// mutation, and asserts the frame arrives.
#[derive(Clone, Debug, serde::Serialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum DaemonEvent {
    /// A palace was created (serialized with `"type": "palace_created"`).
    PalaceCreated {
        id: String,
        name: String,
    },
    /// A drawer was added to a palace (`"type": "drawer_added"`).
    DrawerAdded {
        palace_id: String,
        /// Friendly palace name (Palace.name) at write time. Why: lets SSE
        /// consumers (the dashboard activity feed) render the human-readable
        /// label without a separate id→name lookup. Empty string if the
        /// emitter could not resolve the name.
        ///
        /// NOTE(review): this enum only derives `Serialize`, so
        /// `#[serde(default)]` appears to have no effect here — it is a
        /// deserialization attribute. Confirm whether a `Deserialize` derive
        /// was intended or the attribute can be dropped.
        #[serde(default)]
        palace_name: String,
        // Presumably the palace's drawer count after the add — confirm
        // against the emitter.
        drawer_count: usize,
        /// Wall-clock timestamp when the drawer was added. Why: SSE
        /// receivers want to render "just now / 2m ago" relative to the
        /// daemon's clock, not the time the SSE frame happens to arrive.
        timestamp: chrono::DateTime<chrono::Utc>,
        /// Short preview of the drawer's content (whitespace-collapsed,
        /// truncated to ~80 chars with an ellipsis when cut). Why: the TUI
        /// activity feed and dashboard ticker want to show *what* was
        /// stored, not just the running drawer count. Empty when the
        /// emitter could not resolve the content (legacy clients tolerate
        /// the missing field via `#[serde(default)]` on their own
        /// deserializing type; see the review note on `palace_name`).
        #[serde(default)]
        content_preview: String,
    },
    /// A drawer was removed from a palace (`"type": "drawer_deleted"`).
    DrawerDeleted {
        palace_id: String,
        // Presumably the palace's drawer count after the delete — confirm
        // against the emitter.
        drawer_count: usize,
    },
    /// A dream cycle finished (`"type": "dream_completed"`).
    ///
    /// NOTE(review): `palace_id` is optional; `None` presumably means the
    /// cycle was not scoped to a single palace — confirm against the emitter.
    /// The counters look like per-cycle totals of merged / pruned / compacted
    /// items and updated closets, with `duration_ms` the cycle's duration in
    /// milliseconds — verify before relying on exact semantics.
    DreamCompleted {
        palace_id: Option<String>,
        merged: usize,
        pruned: usize,
        compacted: usize,
        closets_updated: usize,
        duration_ms: u64,
    },
    /// Aggregate totals changed (`"type": "status_changed"`).
    StatusChanged {
        total_drawers: usize,
        total_vectors: usize,
        total_kg_triples: usize,
    },
}
119
/// Shared application state passed to every request handler.
///
/// Why: The stdio loop and HTTP server need the same handles to the registry,
/// data root, and embedder so MCP tools can perform real reads/writes against
/// the live trusty-memory core. The embedder is heavy (loads ONNX weights) so
/// we hold it behind a `OnceCell` and initialize lazily on first use.
/// What: `Clone`-able via `Arc` fields. The registry / data root are eager;
/// `embedder` is `Arc<OnceCell<Arc<FastEmbedder>>>` so concurrent first-use
/// races resolve to a single shared instance.
/// Test: `app_state_default_constructs` confirms construction without panic.
#[derive(Clone)]
pub struct AppState {
    /// Crate version string; `AppState::new` fills it from
    /// `CARGO_PKG_VERSION` and the `initialize` / `rpc.discover` responses
    /// report it.
    pub version: String,
    /// Shared in-memory palace registry. Starts empty in `AppState::new`;
    /// `load_palaces_from_disk` re-registers persisted palaces into it.
    pub registry: Arc<PalaceRegistry>,
    /// Directory that directly contains the per-palace subdirectories.
    /// Expected to already be resolved via `resolve_palace_registry_dir`
    /// before the state is constructed.
    pub data_root: PathBuf,
    /// Lazily-initialized shared embedder; populated on first call to
    /// `AppState::embedder` and reused by every clone of the state.
    pub embedder: Arc<OnceCell<Arc<FastEmbedder>>>,
    /// Optional default palace applied to MCP tool calls when the caller
    /// omits the `palace` argument. Set via `trusty-memory serve --palace`.
    pub default_palace: Option<String>,
    /// Active chat provider selected at startup. `None` means no upstream is
    /// configured (no Ollama detected and no OpenRouter key) — callers must
    /// degrade gracefully (chat endpoint returns 412).
    pub chat_provider: Arc<OnceCell<Option<Arc<dyn ChatProvider>>>>,
    /// Per-palace chat-session stores, opened lazily so cold-start cost is
    /// paid only when chat-history endpoints are hit.
    pub session_stores: Arc<dashmap::DashMap<String, Arc<ChatSessionStore>>>,
    /// Broadcast sender for live `DaemonEvent` pushes to SSE subscribers.
    ///
    /// Why: Lets mutating handlers emit events that any connected dashboard
    /// receives instantly. Cap of 128 buffers transient slow readers; if a
    /// receiver lags it gets `RecvError::Lagged` and we emit a `lag` frame.
    pub events: Arc<broadcast::Sender<DaemonEvent>>,
    /// Instant the daemon started, used to compute `uptime_secs` on `/health`.
    ///
    /// Why (issue #35): `GET /health` reports how long the daemon has been
    /// up. Capturing a monotonic `Instant` at `AppState` construction lets the
    /// handler compute the elapsed seconds cheaply and without a clock-skew
    /// hazard.
    /// What: a monotonic `Instant`; `AppState::new` stamps it at startup.
    /// Test: `health_endpoint_includes_resource_fields`.
    pub started_at: std::time::Instant,
    /// In-memory ring buffer of recent tracing log lines (issue #35).
    ///
    /// Why: the `GET /api/v1/logs/tail` endpoint serves the last N log lines
    /// so operators can inspect a running daemon without tailing a file. The
    /// buffer is shared between the tracing `LogBufferLayer` (writer) and the
    /// HTTP handler (reader).
    /// What: a cheap `Arc`-backed clone of the buffer the subscriber writes
    /// to. Defaults to an empty buffer for states that never install the
    /// layer (tests, the stdio path).
    /// Test: `logs_tail_returns_recent_lines`.
    pub log_buffer: trusty_common::log_buffer::LogBuffer,
    /// Most recent on-disk footprint of `data_root`, in bytes (issue #35).
    ///
    /// Why: `GET /health` reports `disk_bytes`. Walking the data directory on
    /// every health request would make a frequent health poll do unbounded
    /// I/O; a background task recomputes it every 10 s and stores it here so
    /// the handler reads it lock-free.
    /// What: an `AtomicU64` updated by the ticker spawned in `run_http_on`.
    /// `0` until the first walk completes.
    /// Test: `health_endpoint_includes_resource_fields`.
    pub disk_bytes: Arc<std::sync::atomic::AtomicU64>,
    /// Per-process RSS + CPU sampler, refreshed on each `/health` request
    /// (issue #35).
    ///
    /// Why: CPU usage is a delta between two `sysinfo` refreshes, so the
    /// sampler must persist between requests — hence the shared `Mutex`.
    /// What: a `tokio::sync::Mutex<SysMetrics>` so the async health handler
    /// can sample without blocking the runtime.
    /// Test: `health_endpoint_includes_resource_fields`.
    pub sys_metrics: Arc<tokio::sync::Mutex<trusty_common::sys_metrics::SysMetrics>>,
    /// HTTP listener address the daemon bound to, once `run_http_on` is running.
    ///
    /// Why: clients (and `/health` responses) need to advertise the live
    /// `host:port` even though port selection happens dynamically (7070–7079
    /// walk + OS fallback). Stashing it on `AppState` lets request handlers
    /// surface the discovery value without re-querying the listener.
    /// What: a `OnceLock<SocketAddr>` so `run_http_on` writes it exactly once
    /// at bind time and every handler reads it lock-free thereafter. Empty
    /// (`None` from `get()`) on the stdio path where no listener exists.
    /// Test: `health_endpoint_reports_bound_addr` (added below).
    pub bound_addr: Arc<OnceLock<SocketAddr>>,
    /// Cached prompt-facts surface served by the MCP `get_prompt_context`
    /// tool (issue #42).
    ///
    /// Why: The original session-init `prompts/get` design loaded context
    /// once per connection; switching to a per-message tool lets the model
    /// pull fresh, query-filtered context on demand. The cache holds both
    /// the raw triples (for filtered lookups) and a pre-formatted Markdown
    /// block (for the unfiltered hot path) so neither code path re-walks
    /// the KG. The cache is rebuilt by
    /// `prompt_facts::rebuild_prompt_cache` after any write that touches a
    /// hot predicate (`kg_assert`, `add_alias`, `remove_prompt_fact`).
    /// What: An `Arc<RwLock<PromptFactsCache>>` so the hot read path takes
    /// a brief read lock and clones the cache; rebuilds take a write lock
    /// for the assignment only. An empty `triples` vec ↔ "no context
    /// stored yet" (the tool handler renders a hint).
    /// Test: `get_prompt_context_returns_cached_or_hint`,
    /// `get_prompt_context_filters_by_query`.
    pub prompt_context_cache: Arc<RwLock<prompt_facts::PromptFactsCache>>,
}
221
222impl AppState {
223    /// Construct an `AppState` rooted at the given on-disk data directory.
224    ///
225    /// Why: The CLI (`serve`) and integration tests need to point the MCP
226    /// server at different roots — production at `dirs::data_dir`, tests at a
227    /// `tempfile::tempdir()`.
228    /// What: Builds an empty `PalaceRegistry`, captures the version, and
229    /// allocates an empty `OnceCell` for the embedder. `default_palace` is
230    /// `None`; use `with_default_palace` to set it.
231    /// Test: `tools::tests::dispatch_palace_create_persists` constructs an
232    /// AppState pointed at a tempdir and round-trips a palace through it.
233    pub fn new(data_root: PathBuf) -> Self {
234        let (events_tx, _) = broadcast::channel::<DaemonEvent>(128);
235        Self {
236            version: env!("CARGO_PKG_VERSION").to_string(),
237            registry: Arc::new(PalaceRegistry::new()),
238            data_root,
239            embedder: Arc::new(OnceCell::new()),
240            default_palace: None,
241            chat_provider: Arc::new(OnceCell::new()),
242            session_stores: Arc::new(dashmap::DashMap::new()),
243            events: Arc::new(events_tx),
244            started_at: std::time::Instant::now(),
245            // Default to an empty buffer — `with_log_buffer` overrides this
246            // when the daemon installs the `LogBufferLayer` (HTTP mode).
247            log_buffer: trusty_common::log_buffer::LogBuffer::new(
248                trusty_common::log_buffer::DEFAULT_LOG_CAPACITY,
249            ),
250            disk_bytes: Arc::new(std::sync::atomic::AtomicU64::new(0)),
251            sys_metrics: Arc::new(tokio::sync::Mutex::new(
252                trusty_common::sys_metrics::SysMetrics::new(),
253            )),
254            bound_addr: Arc::new(OnceLock::new()),
255            prompt_context_cache: Arc::new(RwLock::new(prompt_facts::PromptFactsCache::default())),
256        }
257    }
258
259    /// Scan the palace registry directory and re-register every persisted
260    /// palace into the in-memory [`PalaceRegistry`].
261    ///
262    /// Why: `AppState::new` builds an *empty* registry, so after a daemon
263    /// restart `palace_list` / the dashboard reported zero palaces even though
264    /// dozens existed on disk — palace metadata was persisted by
265    /// `palace_create` but never re-hydrated on startup. This method closes
266    /// that gap by walking the on-disk layout (each subdirectory holding a
267    /// `palace.json` is one palace) and rebuilding a live `PalaceHandle` for
268    /// each, so recall paths see the full set immediately after a restart.
269    /// What: runs the blocking filesystem walk + per-palace `PalaceHandle::open`
270    /// on a `spawn_blocking` thread (so it never stalls the async runtime),
271    /// registers each successfully opened palace via `register_arc`, logs every
272    /// load at `debug!`, and returns the count loaded. A palace that fails to
273    /// open (corrupt index, unreadable `kg.db`, etc.) is logged at `warn!` and
274    /// skipped — one bad palace must not abort startup or crash the daemon.
275    /// `data_root` is expected to already be the palace registry directory —
276    /// `main.rs` resolves it via [`resolve_palace_registry_dir`] before
277    /// constructing the `AppState`, so the flat / legacy-`palaces/` layout
278    /// difference is handled exactly once.
279    /// Test: `tests::load_palaces_from_disk_rehydrates_registry` writes two
280    /// palaces into a tempdir, constructs an `AppState`, calls this method, and
281    /// asserts the returned count and registry contents.
282    pub async fn load_palaces_from_disk(&self) -> Result<usize> {
283        let registry_dir = self.data_root.clone();
284        let registry = self.registry.clone();
285        // The directory walk and each `PalaceHandle::open` perform blocking
286        // filesystem + redb/usearch I/O — run the whole hydration on the
287        // blocking pool so it never parks an async worker thread.
288        let count = tokio::task::spawn_blocking(move || -> Result<usize> {
289            let palaces = PalaceRegistry::list_palaces(&registry_dir)?;
290            let total = palaces.len();
291            let mut loaded = 0usize;
292            let mut skipped = 0usize;
293            for palace in palaces {
294                match trusty_common::memory_core::PalaceHandle::open(&palace) {
295                    Ok(handle) => {
296                        tracing::debug!(
297                            palace = %palace.id,
298                            data_dir = %palace.data_dir.display(),
299                            "loaded palace from disk"
300                        );
301                        registry.register_arc(handle);
302                        loaded += 1;
303                    }
304                    Err(e) => {
305                        // Why: a single bad palace (corrupt kg.db, stale WAL,
306                        // permissions) must never abort startup or block the
307                        // HTTP server from binding. Log per-palace and keep
308                        // going; the summary below tells operators how many
309                        // were skipped without trawling the log.
310                        tracing::warn!(
311                            palace = %palace.id,
312                            data_dir = %palace.data_dir.display(),
313                            "skipping palace during startup hydration: {e:#}"
314                        );
315                        skipped += 1;
316                    }
317                }
318            }
319            tracing::info!(
320                "palace hydration summary: loaded {loaded}/{total} ({skipped} skipped due to errors)"
321            );
322            Ok(loaded)
323        })
324        .await
325        .map_err(|e| anyhow::anyhow!("join load_palaces_from_disk: {e}"))??;
326        Ok(count)
327    }
328
329    /// Builder-style: attach the daemon's shared [`LogBuffer`] so the
330    /// `GET /api/v1/logs/tail` endpoint serves the same lines the tracing
331    /// subscriber captures (issue #35).
332    ///
333    /// Why: `main` builds the buffer (via `init_tracing_with_buffer`) before
334    /// constructing the `AppState`, then hands a clone here so the HTTP
335    /// handler and the tracing layer observe the same ring.
336    /// What: replaces the empty default buffer with the supplied one.
337    /// Test: `logs_tail_returns_recent_lines`.
338    #[must_use]
339    pub fn with_log_buffer(mut self, buffer: trusty_common::log_buffer::LogBuffer) -> Self {
340        self.log_buffer = buffer;
341        self
342    }
343
344    /// Send a `DaemonEvent` to all connected SSE subscribers.
345    ///
346    /// Why: Mutating handlers call this after a successful write so the
347    /// dashboard can update without polling. The send is best-effort —
348    /// `broadcast::Sender::send` returns `Err` only when there are no live
349    /// receivers, which is fine (no listeners == no work to do).
350    /// What: Drops the result, so callers don't need to care whether anyone
351    /// is listening.
352    /// Test: `web::tests::sse_stream_receives_palace_created` confirms a
353    /// subscriber observes the emitted event.
354    pub fn emit(&self, event: DaemonEvent) {
355        let _ = self.events.send(event);
356    }
357
358    /// Open (or return cached) the chat-session store for a palace.
359    ///
360    /// Why: Chat session persistence lives in a dedicated SQLite file under
361    /// the palace's data dir (`chat_sessions.db`) so it doesn't intermingle
362    /// with the KG's transactional load. The store is cheap to clone via
363    /// `Arc` but the underlying r2d2 pool should be reused, so cache by id.
364    /// What: Creates the palace data dir if missing, opens (or reuses) a
365    /// `ChatSessionStore` and stashes an `Arc` in the DashMap.
366    /// Test: Indirectly via the session HTTP handlers in `web::tests`.
367    pub fn session_store(&self, palace_id: &str) -> Result<Arc<ChatSessionStore>> {
368        if let Some(entry) = self.session_stores.get(palace_id) {
369            return Ok(entry.clone());
370        }
371        let dir = self.data_root.join(palace_id);
372        std::fs::create_dir_all(&dir)
373            .map_err(|e| anyhow::anyhow!("create palace dir {}: {e}", dir.display()))?;
374        let store = Arc::new(ChatSessionStore::open(&dir.join("chat_sessions.db"))?);
375        self.session_stores
376            .insert(palace_id.to_string(), store.clone());
377        Ok(store)
378    }
379
380    /// Builder-style setter for the default palace name.
381    ///
382    /// Why: `serve --palace <name>` wants to bind every tool call to a
383    /// project-scoped namespace without forcing every MCP request to repeat
384    /// the palace argument.
385    /// What: Returns `self` with `default_palace = Some(name)`.
386    /// Test: `default_palace_used_when_arg_omitted` covers the resolution
387    /// path; this setter is exercised there.
388    pub fn with_default_palace(mut self, name: Option<String>) -> Self {
389        self.default_palace = name;
390        self
391    }
392
393    /// Resolve (or initialize) the shared embedder.
394    ///
395    /// Why: FastEmbedder load is expensive — we share one instance across all
396    /// tool calls; the `OnceCell` ensures concurrent first-use races collapse
397    /// to a single load.
398    /// What: Returns `Arc<FastEmbedder>` on success. Errors propagate from the
399    /// underlying ONNX load.
400    /// Test: Indirectly via `dispatch_remember_then_recall`.
401    /// Resolve the active chat provider, auto-detecting on first call.
402    ///
403    /// Why: Provider selection depends on filesystem-loaded config plus a
404    /// network probe (Ollama liveness), so it must be lazily initialised at
405    /// runtime. Caching the choice in a `OnceCell` keeps it stable across
406    /// concurrent requests without re-probing on every chat call.
407    /// What: On first use loads `~/.trusty-memory/config.toml`, prefers an
408    /// auto-detected Ollama instance (when `local_model.enabled`), and falls
409    /// back to OpenRouter when an API key is set. Returns `Ok(None)` when
410    /// neither is available so the caller can emit a 412.
411    /// Test: `web::tests::providers_endpoint_returns_payload` covers the
412    /// detection path indirectly through `/api/v1/chat/providers`.
413    pub async fn chat_provider(&self) -> Option<Arc<dyn ChatProvider>> {
414        self.chat_provider
415            .get_or_init(|| async {
416                let cfg = crate::web::load_user_config().unwrap_or_default();
417                if cfg.local_model.enabled {
418                    if let Some(mut p) =
419                        trusty_common::auto_detect_local_provider(&cfg.local_model.base_url).await
420                    {
421                        // auto_detect returns an empty model id; callers must
422                        // set the configured model name themselves.
423                        p.model = cfg.local_model.model.clone();
424                        return Some(Arc::new(p) as Arc<dyn ChatProvider>);
425                    }
426                }
427                if !cfg.openrouter_api_key.is_empty() {
428                    return Some(Arc::new(trusty_common::OpenRouterProvider::new(
429                        cfg.openrouter_api_key,
430                        cfg.openrouter_model,
431                    )) as Arc<dyn ChatProvider>);
432                }
433                None
434            })
435            .await
436            .clone()
437    }
438
439    /// Spawn a fire-and-forget background task that auto-discovers project
440    /// aliases under `project_root` and asserts new ones into `palace`.
441    ///
442    /// Why (issue #42): Projects carry implicit shorthand — cargo package
443    /// names that differ from their directory, binary names that differ
444    /// from packages, first-letter abbreviations — that should be surfaced
445    /// without a user ever calling `add_alias`. Running discovery as a
446    /// detached task on palace-open keeps startup latency unchanged: the
447    /// daemon binds and starts serving immediately while the discovery scan
448    /// completes in the background, and any newly-asserted aliases land in
449    /// the prompt cache before the model's next `get_prompt_context` call.
450    /// What: clones `self` (cheap; `Arc`-backed), spawns a tokio task that
451    /// invokes the `discover_aliases` tool handler directly so the
452    /// dedup + cache-rebuild logic runs exactly the same path as the MCP
453    /// tool call. Errors are logged at `warn!`; one failed discovery never
454    /// destabilises the daemon.
455    /// Test: not unit-tested (timing-dependent fire-and-forget); the
456    /// underlying `discover_aliases` dispatch is covered by
457    /// `dispatch_discover_aliases_inserts_new_and_dedupes` in `tools::tests`.
458    pub fn spawn_alias_discovery(&self, palace: String, project_root: PathBuf) {
459        let state = self.clone();
460        tokio::spawn(async move {
461            let args = serde_json::json!({
462                "palace": palace,
463                "project_root": project_root.to_string_lossy(),
464            });
465            match tools::dispatch_tool(&state, "discover_aliases", args).await {
466                Ok(result) => tracing::info!(
467                    new = ?result.get("new"),
468                    already_known = ?result.get("already_known"),
469                    "alias discovery complete"
470                ),
471                Err(e) => tracing::warn!("alias discovery failed: {e:#}"),
472            }
473        });
474    }
475
476    pub async fn embedder(&self) -> Result<Arc<FastEmbedder>> {
477        let cell = self.embedder.clone();
478        let embedder = cell
479            .get_or_try_init(|| async {
480                let e = FastEmbedder::new().await?;
481                Ok::<Arc<FastEmbedder>, anyhow::Error>(Arc::new(e))
482            })
483            .await?
484            .clone();
485        Ok(embedder)
486    }
487}
488
489impl std::fmt::Debug for AppState {
490    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
491        f.debug_struct("AppState")
492            .field("version", &self.version)
493            .field("data_root", &self.data_root)
494            .field("registry_len", &self.registry.len())
495            .finish()
496    }
497}
498
499/// Handle a single MCP JSON-RPC message and produce its response.
500///
501/// Why: Pulled out of the stdio loop so unit tests can drive every method
502/// without touching real stdin/stdout.
503/// What: Routes `initialize`, `tools/list`, `tools/call`, `ping`, and the
504/// `notifications/initialized` notification (which returns `Value::Null`).
505/// Test: See unit tests below — initialize/list/call all return expected
506/// JSON-RPC envelopes; notifications return `Null` (no response written).
507pub async fn handle_message(state: &AppState, msg: Value) -> Value {
508    let id = msg.get("id").cloned().unwrap_or(Value::Null);
509    let method = msg.get("method").and_then(|m| m.as_str()).unwrap_or("");
510
511    match method {
512        "initialize" => {
513            let extra = state
514                .default_palace
515                .as_ref()
516                .map(|dp| json!({ "default_palace": dp }));
517            let result = initialize_response("trusty-memory", &state.version, extra);
518            // Why (issue #42): prompt-facts now flow through the
519            // per-message `get_prompt_context` tool rather than MCP
520            // prompts, so we no longer advertise the `prompts` capability.
521            json!({
522                "jsonrpc": "2.0",
523                "id": id,
524                "result": result,
525            })
526        }
527        // Notifications must NOT receive a response.
528        "notifications/initialized" | "notifications/cancelled" => Value::Null,
529        "tools/list" => json!({
530            "jsonrpc": "2.0",
531            "id": id,
532            "result": tools::tool_definitions_with(state.default_palace.is_some())
533        }),
534        // OpenRPC 1.3.2 discovery — see `openrpc.rs`. Returns the full
535        // service description so orchestrators (open-mpm, etc.) can
536        // introspect every tool and its required `memory.read`/`memory.write`
537        // scope without bespoke per-server adapters.
538        "rpc.discover" => json!({
539            "jsonrpc": "2.0",
540            "id": id,
541            "result": openrpc::build_discover_response(
542                &state.version,
543                state.default_palace.is_some(),
544            ),
545        }),
546        "tools/call" => {
547            let params = msg.get("params").cloned().unwrap_or_default();
548            let tool_name = params
549                .get("name")
550                .and_then(|n| n.as_str())
551                .unwrap_or("")
552                .to_string();
553            let args = params.get("arguments").cloned().unwrap_or_default();
554            match tools::dispatch_tool(state, &tool_name, args).await {
555                Ok(content) => {
556                    // Why: tools that return a bare JSON string (e.g.
557                    // `get_prompt_context` returning the formatted
558                    // Markdown block) should surface as plain text in the
559                    // MCP `content[0].text` field — wrapping in
560                    // `Value::to_string()` would re-quote the payload and
561                    // force every caller to strip outer quotes.
562                    let text = match &content {
563                        Value::String(s) => s.clone(),
564                        other => other.to_string(),
565                    };
566                    json!({
567                        "jsonrpc": "2.0",
568                        "id": id,
569                        "result": {
570                            "content": [{"type": "text", "text": text}]
571                        }
572                    })
573                }
574                Err(e) => json!({
575                    "jsonrpc": "2.0",
576                    "id": id,
577                    // Why: anyhow's `{:#}` alternate format walks the full
578                    // `Caused by:` chain so MCP clients see actionable
579                    // detail (e.g. "PalaceHandle::remember_with_options:
580                    // filter rejected: too short") instead of just the
581                    // outermost context label.
582                    "error": {"code": -32603, "message": format!("{e:#}")}
583                }),
584            }
585        }
586        "ping" => json!({"jsonrpc": "2.0", "id": id, "result": {}}),
587        _ => json!({
588            "jsonrpc": "2.0",
589            "id": id,
590            "error": {
591                "code": -32601,
592                "message": format!("Method not found: {method}")
593            }
594        }),
595    }
596}
597
598/// Run the MCP stdio JSON-RPC 2.0 server loop.
599///
600/// Why: Claude Code launches MCP servers as child processes and speaks
601/// JSON-RPC over stdin/stdout — this is the primary integration path.
602/// What: Delegates to `trusty_mcp_core::run_stdio_loop`, adapting each
603/// shared `Request` back into the JSON `Value` shape `handle_message`
604/// expects, and translating the returned `Value` into a `Response`.
605/// Notifications (where `handle_message` returns `Value::Null`) become
606/// suppressed responses so the loop emits nothing on the wire.
607/// Test: `handle_message` covers protocol behaviour in unit tests.
608pub async fn run_stdio(state: AppState) -> Result<()> {
609    info!("trusty-memory MCP stdio server starting");
610    let state = Arc::new(state);
611    trusty_common::mcp::run_stdio_loop(move |req: Request| {
612        let state = state.clone();
613        async move {
614            // Re-serialise the Request into the JSON shape handle_message expects.
615            // (handle_message predates the shared types and reads loose Values.)
616            let msg = json!({
617                "jsonrpc": req.jsonrpc.unwrap_or_else(|| "2.0".to_string()),
618                "id": req.id.clone().unwrap_or(Value::Null),
619                "method": req.method,
620                "params": req.params.unwrap_or(Value::Null),
621            });
622            let resp_value = handle_message(&state, msg).await;
623            // handle_message returns Value::Null for notifications.
624            if resp_value.is_null() {
625                return Response::suppressed();
626            }
627            // Otherwise it returns the full JSON-RPC envelope as a Value;
628            // re-encode into the shared Response struct so the loop can serialise.
629            let id = resp_value.get("id").cloned();
630            if let Some(result) = resp_value.get("result").cloned() {
631                Response::ok(id, result)
632            } else if let Some(err) = resp_value.get("error") {
633                let code =
634                    err.get("code")
635                        .and_then(|c| c.as_i64())
636                        .unwrap_or(error_codes::INTERNAL_ERROR as i64) as i32;
637                let message = err
638                    .get("message")
639                    .and_then(|m| m.as_str())
640                    .unwrap_or("internal error")
641                    .to_string();
642                Response::err(id, code, message)
643            } else {
644                Response::err(
645                    id,
646                    error_codes::INTERNAL_ERROR,
647                    "malformed handler response",
648                )
649            }
650        }
651    })
652    .await
653}
654
/// First port the trusty-memory HTTP daemon tries to bind.
///
/// Why: clients with `127.0.0.1:7070` hard-coded in their configuration keep
/// working, while `bind_dynamic_port` is still free to walk upward from this
/// value (`DYNAMIC_PORT_RANGE` ports in total) when it is taken.
/// What: `7070` — the historic default, matching the launchd plist's prior
/// value.
/// Test: covered indirectly by `bind_dynamic_port_returns_listener`.
pub const DEFAULT_HTTP_PORT: u16 = 7070;
663
/// Size of the preferred port window walked by `bind_dynamic_port`.
///
/// Why: a small, fixed window keeps the daemon on a predictable port in the
/// common case; only once the whole window is exhausted does it fall back to
/// an OS-assigned port. Matches the trusty-search convention.
/// What: `10` consecutive ports, starting at `DEFAULT_HTTP_PORT`.
const DYNAMIC_PORT_RANGE: u16 = 10;
667
668/// Path to `~/.trusty-memory/http_addr` — the canonical address-discovery file.
669///
670/// Why: clients (CLI, MCP tools, dashboards) need to find the running daemon
671/// without configuration when the port was selected dynamically. Mirrors
672/// `trusty-search`'s `~/.trusty-search/http_addr` contract so the two tools
673/// share a single discovery convention.
674/// What: returns `$HOME/.trusty-memory/http_addr`, or `None` if `$HOME` is
675/// unresolvable (locked-down container, no passwd entry).
676/// Test: `http_addr_path_uses_dot_trusty_memory`.
677pub fn http_addr_path() -> Option<PathBuf> {
678    dirs::home_dir().map(|h| h.join(".trusty-memory").join("http_addr"))
679}
680
681/// Bind a `TcpListener` to `127.0.0.1`, dynamically selecting a port.
682///
683/// Why: the historic default `7070` is convenient for clients but a stale
684/// process or a second daemon must not produce a noisy failure. Walking
685/// `DEFAULT_HTTP_PORT..DEFAULT_HTTP_PORT+DYNAMIC_PORT_RANGE` first preserves
686/// backwards compatibility for the common case; OS-assigned fallback (`:0`)
687/// guarantees the daemon always comes up even when every preferred port is
688/// busy.
689/// What: returns the first successful `TcpListener`. Tries 7070..=7079
690/// in order, then falls back to OS-assigned. Caller inspects
691/// `local_addr()` to learn the chosen port.
692/// Test: `bind_dynamic_port_returns_listener` confirms it always binds *some*
693/// port even after another listener occupies the preferred one.
694pub async fn bind_dynamic_port() -> Result<tokio::net::TcpListener> {
695    let preferred: SocketAddr = SocketAddr::from(([127, 0, 0, 1], DEFAULT_HTTP_PORT));
696    // First: walk the preferred range (7070..=7079).
697    if let Ok(listener) =
698        trusty_common::bind_with_auto_port(preferred, DYNAMIC_PORT_RANGE - 1).await
699    {
700        return Ok(listener);
701    }
702    // Last resort: ask the kernel for any free port. `bind_with_auto_port`
703    // with `:0` resolves immediately to the OS-assigned port.
704    tracing::warn!(
705        "all ports {DEFAULT_HTTP_PORT}..{} in use; requesting OS-assigned port",
706        DEFAULT_HTTP_PORT + DYNAMIC_PORT_RANGE - 1
707    );
708    let any: SocketAddr = SocketAddr::from(([127, 0, 0, 1], 0));
709    trusty_common::bind_with_auto_port(any, 0).await
710}
711
/// Write the bound `host:port` to `~/.trusty-memory/http_addr` atomically.
///
/// Why: clients must read the file mid-write without observing a partial
/// value. Writing to a `.addr.tmp` sibling and renaming over the target
/// gives POSIX atomicity, matching the trusty-search implementation.
/// What: creates the parent directory if missing; writes `addr` followed by
/// a trailing newline (avoids the "no newline at end of file" warnings from
/// `cat`); syncs; renames the temp file onto `path`. If any step after the
/// temp file is created fails, the temp file is removed (best-effort) so a
/// failed publish never leaves a stray sibling behind.
/// Errors: the first I/O error encountered is returned to the caller so
/// `run_http_on` can log without panicking.
/// Test: `http_addr_file_round_trip_via_helpers`.
fn write_http_addr_file(path: &Path, addr: &SocketAddr) -> std::io::Result<()> {
    use std::io::Write;
    if let Some(parent) = path.parent() {
        std::fs::create_dir_all(parent)?;
    }
    let tmp = path.with_extension("addr.tmp");
    let publish = || -> std::io::Result<()> {
        {
            // Scope the handle so it is closed before the rename.
            let mut f = std::fs::File::create(&tmp)?;
            writeln!(f, "{addr}")?;
            f.sync_all()?;
        }
        std::fs::rename(&tmp, path)
    };
    let outcome = publish();
    if outcome.is_err() {
        // Best-effort cleanup; the original error is what the caller needs.
        let _ = std::fs::remove_file(&tmp);
    }
    outcome
}
736
737/// Run the optional HTTP/SSE + web admin server.
738///
739/// Why: A long-running daemon mode lets non-stdio clients (browsers, curl,
740/// future remote agents) hit `/health`, the `/api/v1/*` REST surface, and the
741/// embedded admin SPA.
742/// What: axum router built from `web::router()` plus a `/sse` stub for the
743/// existing MCP-over-SSE clients. Caller provides a pre-bound listener so
744/// port auto-detection lives at the call site. Before accepting connections
745/// the daemon stamps the bound `host:port` onto `AppState.bound_addr` and
746/// writes `~/.trusty-memory/http_addr` so clients can discover the live port.
747/// On shutdown the file is removed best-effort (a stale file with the wrong
748/// port is worse than a missing one).
749/// Test: `cargo test -p trusty-memory web::tests` exercises the router shape;
750/// manual: `curl http://127.0.0.1:<port>/health` returns `ok` with `addr`.
751pub async fn run_http_on(state: AppState, listener: tokio::net::TcpListener) -> Result<()> {
752    use axum::routing::get;
753
754    // Issue #35: recompute the `data_root` disk footprint every 10 s on a
755    // background task so `GET /health` reports `disk_bytes` without doing a
756    // recursive directory walk on the request path.
757    spawn_disk_size_ticker(state.clone());
758
759    // Capture and advertise the bound address BEFORE serving so the first
760    // request handler — and the http_addr discovery file — see the real port
761    // even if `local_addr()` would otherwise be racy.
762    let local = listener.local_addr().ok();
763    let written_path = if let Some(a) = local {
764        // Stash on state for handlers (e.g. /health) to surface.
765        let _ = state.bound_addr.set(a);
766        info!("HTTP server listening on http://{a}");
767        eprintln!("HTTP server listening on http://{a}");
768        // Best-effort: a missing $HOME or read-only fs is non-fatal — the
769        // /health endpoint still advertises `addr`. Logging the failure
770        // helps operators diagnose discovery problems.
771        match http_addr_path() {
772            Some(p) => match write_http_addr_file(&p, &a) {
773                Ok(()) => {
774                    info!("wrote daemon address to {}", p.display());
775                    Some(p)
776                }
777                Err(e) => {
778                    tracing::warn!("could not write {}: {e}", p.display());
779                    None
780                }
781            },
782            None => {
783                tracing::warn!("no $HOME — skipping http_addr discovery file");
784                None
785            }
786        }
787    } else {
788        None
789    };
790
791    let app = web::router()
792        .route("/sse", get(sse_handler))
793        .with_state(state);
794
795    let serve_result = axum::serve(listener, app).await;
796
797    // Best-effort cleanup: remove `http_addr` so stale clients fail fast
798    // instead of timing out against a dead port.
799    if let Some(p) = written_path.as_ref() {
800        let _ = std::fs::remove_file(p);
801    }
802
803    serve_result?;
804    Ok(())
805}
806
807/// Convenience: bind `addr` and serve via [`run_http_on`].
808pub async fn run_http(state: AppState, addr: std::net::SocketAddr) -> Result<()> {
809    let listener = tokio::net::TcpListener::bind(addr).await?;
810    run_http_on(state, listener).await
811}
812
813/// Convenience: bind dynamically (7070..=7079, OS fallback) and serve.
814///
815/// Why: `trusty-memory serve` with no `--http` flag is the canonical
816/// launchd-managed daemon entry point. Dynamic binding lets a stale daemon
817/// or a hand-spawned `serve --http 127.0.0.1:7070` coexist without breaking
818/// the launchd-managed instance.
819/// What: calls [`bind_dynamic_port`] then [`run_http_on`].
820/// Test: integration via `trusty-memory serve` + `cat ~/.trusty-memory/http_addr`.
821pub async fn run_http_dynamic(state: AppState) -> Result<()> {
822    let listener = bind_dynamic_port().await?;
823    run_http_on(state, listener).await
824}
825
826/// Spawn a background ticker that recomputes the `data_root` disk footprint
827/// every 10 seconds and stores it in `state.disk_bytes` (issue #35).
828///
829/// Why: `GET /health` reports `disk_bytes`. Walking the data directory on
830/// every health request would turn a frequent health poll into unbounded
831/// recursive I/O. Computing it off the request path on a fixed cadence keeps
832/// `/health` cheap and bounds the staleness to ~10 s — fine for an
833/// at-a-glance footprint figure.
834/// What: spawns a detached tokio task. `AppState` is cheap to `Clone` (all
835/// `Arc` fields), so the task holds a full clone; the daemon process lives
836/// for the lifetime of the server anyway, so no `Weak` downgrade is needed.
837/// Each tick runs the blocking directory walk on `spawn_blocking` so it never
838/// stalls the async runtime, then stores the byte total atomically.
839/// Test: `health_endpoint_includes_resource_fields` asserts the field shape;
840/// the ticker cadence is not unit-tested (timing-dependent).
841fn spawn_disk_size_ticker(state: AppState) {
842    tokio::spawn(async move {
843        let mut interval = tokio::time::interval(std::time::Duration::from_secs(10));
844        loop {
845            interval.tick().await;
846            let dir = state.data_root.clone();
847            // The directory walk is blocking filesystem I/O — run it on the
848            // blocking pool so it never parks an async worker thread.
849            let bytes = tokio::task::spawn_blocking(move || {
850                trusty_common::sys_metrics::dir_size_bytes(&dir)
851            })
852            .await
853            .unwrap_or(0);
854            state
855                .disk_bytes
856                .store(bytes, std::sync::atomic::Ordering::Relaxed);
857        }
858    });
859}
860
861/// Live SSE event stream — pushes `DaemonEvent` frames to dashboard clients.
862///
863/// Why: The dashboard subscribes once and reacts to live pushes (palace
864/// created, drawer added/deleted, dream completed, status changed) instead of
865/// polling `/api/v1/*` endpoints.
866/// What: Subscribes to `state.events`, emits an initial `connected` frame,
867/// then forwards every `DaemonEvent` as `data: <json>\n\n`. Lagged
868/// subscribers receive a `lag` frame indicating skipped events; channel
869/// closure ends the stream.
870/// Test: `web::tests::sse_stream_emits_palace_created` (covers subscribe +
871/// emit + receive); manual: `curl -N http://.../sse`.
872pub(crate) async fn sse_handler(
873    axum::extract::State(state): axum::extract::State<AppState>,
874) -> impl axum::response::IntoResponse {
875    use futures::StreamExt;
876    use tokio_stream::wrappers::BroadcastStream;
877
878    let rx = state.events.subscribe();
879    let initial = futures::stream::once(async {
880        Ok::<axum::body::Bytes, std::io::Error>(axum::body::Bytes::from(
881            "data: {\"type\":\"connected\"}\n\n",
882        ))
883    });
884    let events = BroadcastStream::new(rx).map(|res| {
885        let frame = match res {
886            Ok(event) => match serde_json::to_string(&event) {
887                Ok(json) => format!("data: {json}\n\n"),
888                Err(e) => format!("data: {{\"type\":\"error\",\"message\":\"{e}\"}}\n\n"),
889            },
890            Err(tokio_stream::wrappers::errors::BroadcastStreamRecvError::Lagged(n)) => {
891                format!("data: {{\"type\":\"lag\",\"skipped\":{n}}}\n\n")
892            }
893        };
894        Ok::<axum::body::Bytes, std::io::Error>(axum::body::Bytes::from(frame))
895    });
896    let stream = initial.chain(events);
897
898    axum::response::Response::builder()
899        .header("Content-Type", "text/event-stream")
900        .header("Cache-Control", "no-cache")
901        .header("X-Accel-Buffering", "no")
902        .body(axum::body::Body::from_stream(stream))
903        .expect("valid SSE response")
904}
905
#[cfg(test)]
mod tests {
    use super::*;

    /// Build an `AppState` rooted at a fresh temporary directory.
    ///
    /// The `TempDir` guard is deliberately forgotten so the directory is not
    /// deleted while the returned state still points at it.
    fn test_state() -> AppState {
        let tmp = tempfile::tempdir().expect("tempdir");
        let root = tmp.path().to_path_buf();
        // Leak the tempdir so it lives for the test process; tests are short.
        std::mem::forget(tmp);
        AppState::new(root)
    }

    /// Why: the `initialize` handshake is the first thing every MCP client
    /// sends; the protocol version, tools capability and server name must be
    /// present in the reply.
    #[tokio::test]
    async fn initialize_returns_protocol_version_and_capabilities() {
        let state = test_state();
        let req = json!({
            "jsonrpc": "2.0",
            "id": 1,
            "method": "initialize",
            "params": {
                "protocolVersion": "2024-11-05",
                "capabilities": {},
                "clientInfo": {"name": "test", "version": "0"}
            }
        });
        let resp = handle_message(&state, req).await;
        assert_eq!(resp["jsonrpc"], "2.0");
        assert_eq!(resp["id"], 1);
        assert_eq!(resp["result"]["protocolVersion"], "2024-11-05");
        assert!(resp["result"]["capabilities"]["tools"].is_object());
        assert_eq!(resp["result"]["serverInfo"]["name"], "trusty-memory");
    }

    /// Why: notifications carry no `id` and must not be answered;
    /// `handle_message` signals that with `Value::Null`.
    #[tokio::test]
    async fn initialized_notification_returns_null() {
        let state = test_state();
        let req = json!({
            "jsonrpc": "2.0",
            "method": "notifications/initialized",
            "params": {}
        });
        let resp = handle_message(&state, req).await;
        assert!(resp.is_null());
    }

    /// Why: pins the number of advertised tools so an accidental addition or
    /// removal is noticed; update the count when the tool set changes.
    #[tokio::test]
    async fn tools_list_returns_all_tools() {
        let state = test_state();
        let req = json!({"jsonrpc": "2.0", "id": 2, "method": "tools/list"});
        let resp = handle_message(&state, req).await;
        let tools = resp["result"]["tools"].as_array().expect("tools array");
        assert_eq!(tools.len(), 20);
    }

    /// Why: unknown methods must map to JSON-RPC "method not found" (-32601).
    #[tokio::test]
    async fn unknown_method_returns_error() {
        let state = test_state();
        let req = json!({"jsonrpc": "2.0", "id": 4, "method": "wat"});
        let resp = handle_message(&state, req).await;
        assert_eq!(resp["error"]["code"], -32601);
    }

    /// Why: `ping` must succeed with an (empty) object result.
    #[tokio::test]
    async fn ping_returns_empty_result() {
        let state = test_state();
        let req = json!({"jsonrpc": "2.0", "id": 5, "method": "ping"});
        let resp = handle_message(&state, req).await;
        assert!(resp["result"].is_object());
    }

    /// Why: a freshly constructed `AppState` has a version string, an empty
    /// registry and no default palace.
    #[tokio::test]
    async fn app_state_default_constructs() {
        let s = test_state();
        assert!(!s.version.is_empty());
        assert!(s.registry.is_empty());
        assert!(s.default_palace.is_none());
    }

    /// Why: Issue #26 — when `serve --palace <name>` is set, the MCP server
    /// must (a) report the default in the `initialize` `serverInfo`, (b)
    /// drop `palace` from the required schema in `tools/list`, and (c) let
    /// `tools/call` use the default when the caller omits `palace`.
    /// Test: Construct an AppState with a default palace, create that palace
    /// on disk via the registry, then call `memory_remember` without a
    /// `palace` argument and confirm it resolves to the default.
    #[tokio::test]
    async fn default_palace_used_when_arg_omitted() {
        let tmp = tempfile::tempdir().expect("tempdir");
        let root = tmp.path().to_path_buf();

        // Pre-create the default palace so remember has somewhere to land.
        let registry = trusty_common::memory_core::PalaceRegistry::new();
        let palace = trusty_common::memory_core::Palace {
            id: trusty_common::memory_core::PalaceId::new("default-pal"),
            name: "default-pal".to_string(),
            description: None,
            created_at: chrono::Utc::now(),
            data_dir: root.join("default-pal"),
        };
        registry
            .create_palace(&root, palace)
            .expect("create_palace");

        let state = AppState::new(root).with_default_palace(Some("default-pal".to_string()));

        // (a) initialize advertises the default.
        let init = handle_message(
            &state,
            json!({"jsonrpc": "2.0", "id": 1, "method": "initialize"}),
        )
        .await;
        assert_eq!(
            init["result"]["serverInfo"]["default_palace"], "default-pal",
            "initialize must echo default_palace in serverInfo"
        );

        // (b) tools/list drops `palace` from required when default is set.
        let list = handle_message(
            &state,
            json!({"jsonrpc": "2.0", "id": 2, "method": "tools/list"}),
        )
        .await;
        let tools = list["result"]["tools"].as_array().expect("tools array");
        let remember = tools
            .iter()
            .find(|t| t["name"] == "memory_remember")
            .expect("memory_remember tool");
        let required: Vec<&str> = remember["inputSchema"]["required"]
            .as_array()
            .expect("required array")
            .iter()
            .filter_map(|v| v.as_str())
            .collect();
        assert!(
            !required.contains(&"palace"),
            "palace must not be required when default is configured; got {required:?}"
        );
        assert!(required.contains(&"text"));

        // (c) tools/call resolves the default when arg is omitted.
        let call = handle_message(
            &state,
            json!({
                "jsonrpc": "2.0",
                "id": 3,
                "method": "tools/call",
                "params": {
                    "name": "memory_remember",
                    "arguments": {"text": "default palace test memory content with several tokens"},
                },
            }),
        )
        .await;
        // Successful dispatch returns `result.content[0].text` JSON.
        let text = call["result"]["content"][0]["text"]
            .as_str()
            .unwrap_or_else(|| panic!("expected success result, got {call}"));
        let parsed: Value = serde_json::from_str(text).expect("parse content json");
        assert_eq!(parsed["palace"], "default-pal");
        assert_eq!(parsed["status"], "stored");
        assert!(parsed["drawer_id"].as_str().is_some());
    }

    /// Why: When no default is set, `tools/call` for a palace-bound tool
    /// without a `palace` argument should error helpfully rather than panic.
    #[tokio::test]
    async fn missing_palace_without_default_errors() {
        let state = test_state();
        let resp = handle_message(
            &state,
            json!({
                "jsonrpc": "2.0",
                "id": 7,
                "method": "tools/call",
                "params": {
                    "name": "memory_recall",
                    "arguments": {"query": "anything"},
                },
            }),
        )
        .await;
        assert_eq!(resp["error"]["code"], -32603);
        let msg = resp["error"]["message"].as_str().unwrap_or("");
        assert!(
            msg.contains("missing 'palace'"),
            "expected helpful error, got: {msg}"
        );
    }

    /// Why: regression for the "palaces lost on restart" bug — `AppState::new`
    /// builds an empty registry, so the daemon must call
    /// `load_palaces_from_disk` on startup to re-register palaces persisted by
    /// a previous run. Without that call the registry stays empty even though
    /// `palace.json` files exist on disk.
    /// What: persists two palaces under a tempdir (via the same
    /// `create_palace` path the `palace_create` tool uses), constructs a fresh
    /// `AppState` rooted there, calls `load_palaces_from_disk`, and asserts the
    /// returned count and registry contents.
    /// Test: this test itself.
    #[tokio::test]
    async fn load_palaces_from_disk_rehydrates_registry() {
        use trusty_common::memory_core::{Palace, PalaceId, PalaceRegistry};

        let tmp = tempfile::tempdir().expect("tempdir");
        let root = tmp.path().to_path_buf();

        // Phase 1: persist two palaces to disk, then drop the writer registry
        // so nothing is held in memory — simulating a prior daemon run.
        {
            let writer = PalaceRegistry::new();
            for id in ["alpha", "beta"] {
                let palace = Palace {
                    id: PalaceId::new(id),
                    name: id.to_string(),
                    description: None,
                    created_at: chrono::Utc::now(),
                    data_dir: root.join(id),
                };
                writer
                    .create_palace(&root, palace)
                    .expect("persist palace to disk");
            }
        }

        // Add a stray non-palace subdirectory; the walker must ignore it.
        std::fs::create_dir_all(root.join("not-a-palace")).expect("mkdir");

        // Phase 2: fresh AppState starts with an empty registry (the bug).
        let state = AppState::new(root);
        assert!(
            state.registry.is_empty(),
            "AppState::new must start with an empty registry"
        );

        // The fix: hydrate from disk.
        let count = state
            .load_palaces_from_disk()
            .await
            .expect("load_palaces_from_disk");

        assert_eq!(count, 2, "both persisted palaces should be loaded");
        assert_eq!(state.registry.len(), 2, "registry should hold both palaces");
        let ids: Vec<String> = state.registry.list().into_iter().map(|p| p.0).collect();
        assert!(ids.contains(&"alpha".to_string()));
        assert!(ids.contains(&"beta".to_string()));
    }

    /// Why: existing installs (and the legacy standalone `trusty-memory` repo)
    /// nest palaces one level deeper under a `palaces/` subdirectory. When that
    /// subdirectory exists, `resolve_palace_registry_dir` must descend into it
    /// so the daemon scans the level that actually holds the `palace.json`
    /// files — otherwise it finds zero palaces, which is the restart bug.
    /// What: creates `<dir>/palaces/`, resolves, and asserts the nested path is
    /// returned.
    /// Test: this test itself.
    #[test]
    fn resolve_palace_registry_dir_prefers_palaces_subdir() {
        let tmp = tempfile::tempdir().expect("tempdir");
        let data_dir = tmp.path().to_path_buf();
        std::fs::create_dir_all(data_dir.join("palaces")).expect("mkdir palaces");

        let resolved = resolve_palace_registry_dir(data_dir.clone());
        assert_eq!(resolved, data_dir.join("palaces"));
    }

    /// Why: a fresh install with no `palaces/` subdirectory must fall back to
    /// the data dir itself (the current flat monorepo layout).
    #[test]
    fn resolve_palace_registry_dir_falls_back_to_data_dir() {
        let tmp = tempfile::tempdir().expect("tempdir");
        let data_dir = tmp.path().to_path_buf();

        let resolved = resolve_palace_registry_dir(data_dir.clone());
        assert_eq!(resolved, data_dir);
    }

    /// Why: end-to-end check that the nested-`palaces/` layout hydrates — the
    /// daemon resolves the registry dir via `resolve_palace_registry_dir`, so
    /// an `AppState` rooted there must load palaces persisted one level below
    /// the bare data dir.
    /// What: persists two palaces under `<root>/palaces/<id>/`, constructs an
    /// `AppState` rooted at the resolved registry dir, and asserts hydration
    /// finds both.
    /// Test: this test itself.
    #[tokio::test]
    async fn load_palaces_from_disk_handles_palaces_subdir() {
        use trusty_common::memory_core::{Palace, PalaceId, PalaceRegistry};

        let tmp = tempfile::tempdir().expect("tempdir");
        let root = tmp.path().to_path_buf();
        let nested = root.join("palaces");

        {
            let writer = PalaceRegistry::new();
            for id in ["cto", "engineering"] {
                let palace = Palace {
                    id: PalaceId::new(id),
                    name: id.to_string(),
                    description: None,
                    created_at: chrono::Utc::now(),
                    data_dir: nested.join(id),
                };
                // create_palace anchors data_dir under the passed root, so
                // pass `nested` here to land palaces under `<root>/palaces/`.
                writer
                    .create_palace(&nested, palace)
                    .expect("persist palace under palaces/ subdir");
            }
        }

        // Mirror main.rs: resolve the registry dir, then root AppState there.
        let registry_dir = resolve_palace_registry_dir(root);
        assert_eq!(registry_dir, nested, "must resolve into palaces/ subdir");
        let state = AppState::new(registry_dir);
        let count = state
            .load_palaces_from_disk()
            .await
            .expect("load_palaces_from_disk");

        assert_eq!(count, 2, "both nested palaces should be loaded");
        assert_eq!(state.registry.len(), 2);
        let ids: Vec<String> = state.registry.list().into_iter().map(|p| p.0).collect();
        assert!(ids.contains(&"cto".to_string()));
        assert!(ids.contains(&"engineering".to_string()));
    }

    /// Why: an empty (or missing) palace registry directory must not error — a
    /// brand-new install has nothing to hydrate and should report zero.
    #[tokio::test]
    async fn load_palaces_from_disk_empty_root_returns_zero() {
        let state = test_state();
        let count = state
            .load_palaces_from_disk()
            .await
            .expect("load_palaces_from_disk on empty root");
        assert_eq!(count, 0);
        assert!(state.registry.is_empty());
    }

    /// Why: initialize without a default palace must omit `default_palace`
    /// from `serverInfo` so clients can detect the unbound mode.
    #[tokio::test]
    async fn initialize_without_default_palace_omits_field() {
        let state = test_state();
        let init = handle_message(
            &state,
            json!({"jsonrpc": "2.0", "id": 1, "method": "initialize"}),
        )
        .await;
        assert!(init["result"]["serverInfo"]["default_palace"].is_null());
    }

    /// Why: every `~/.trusty-memory/http_addr` consumer (CLI, dashboard,
    /// future trusty-mpm wiring) must agree on the path. A regression that
    /// moves this file to e.g. `$XDG_DATA_HOME/trusty-memory/http_addr` would
    /// silently break every client.
    /// What: under a real `$HOME`, the path ends in `.trusty-memory/http_addr`.
    #[test]
    fn http_addr_path_uses_dot_trusty_memory() {
        if let Some(p) = http_addr_path() {
            assert!(
                p.ends_with(".trusty-memory/http_addr"),
                "unexpected http_addr path: {}",
                p.display()
            );
        }
        // CI containers with no $HOME return None — that's fine; the writer
        // logs and falls back gracefully.
    }

    /// Why: write+read round-trip pins the disk format: a single line of
    /// `host:port\n`. Clients (cat, sh `$(cat ...)`) trim whitespace, so the
    /// trailing newline is invisible — but anything else (extra whitespace,
    /// multi-line) would break callers.
    /// What: asserts the exact bytes, not just the trimmed text, so stray
    /// whitespace or extra lines cannot slip through.
    #[test]
    fn http_addr_file_round_trip_via_helpers() {
        let dir = tempfile::tempdir().expect("tempdir");
        let path = dir.path().join("http_addr");
        let addr: SocketAddr = "127.0.0.1:7073".parse().expect("parse socket addr");
        write_http_addr_file(&path, &addr).expect("write_http_addr_file");
        let raw = std::fs::read_to_string(&path).expect("read http_addr");
        // Exactly one line, with the trailing newline that keeps `cat` and
        // editors happy.
        assert_eq!(raw, "127.0.0.1:7073\n");
    }

    /// Why: dynamic binding must succeed even when the preferred port is
    /// already in use. Walking 7070..=7079 + OS fallback guarantees the
    /// daemon never fails to come up just because another process holds 7070.
    /// What: pre-bind 7070 (best-effort — if it is already busy on the host
    /// the port is occupied anyway, just not by us), then call
    /// `bind_dynamic_port` and assert we got *some* loopback listener back,
    /// on a different port whenever this test is the one holding 7070.
    #[tokio::test]
    async fn bind_dynamic_port_returns_listener() {
        // Held until the end of the test so the preferred port stays busy.
        let occupier = std::net::TcpListener::bind(("127.0.0.1", DEFAULT_HTTP_PORT)).ok();

        let listener = bind_dynamic_port().await.expect("bind_dynamic_port");
        let addr = listener.local_addr().expect("local_addr");
        assert_eq!(addr.ip().to_string(), "127.0.0.1");
        assert!(addr.port() > 0, "port must be non-zero after bind");
        if occupier.is_some() {
            assert_ne!(
                addr.port(),
                DEFAULT_HTTP_PORT,
                "must pick another port while the preferred one is held"
            );
        }
    }

    /// Why: Issue #42 — prompt-facts are now served by the per-message
    /// `get_prompt_context` tool rather than the MCP prompts surface, so the
    /// `initialize` handshake must NOT advertise a `prompts` capability and
    /// `prompts/list` / `prompts/get` must fall through to the "method not
    /// found" path.
    #[tokio::test]
    async fn initialize_does_not_advertise_prompts_capability() {
        let state = test_state();
        let init = handle_message(
            &state,
            json!({"jsonrpc": "2.0", "id": 1, "method": "initialize"}),
        )
        .await;
        assert!(
            init["result"]["capabilities"]["prompts"].is_null(),
            "initialize must NOT advertise the prompts capability; got {init}"
        );

        // Both prompts/* dispatchers should now report method-not-found.
        for method in ["prompts/list", "prompts/get"] {
            let resp =
                handle_message(&state, json!({"jsonrpc": "2.0", "id": 2, "method": method})).await;
            assert_eq!(
                resp["error"]["code"], -32601,
                "{method} should return method-not-found; got {resp}"
            );
        }
    }

    /// Why: `AppState::new` must initialise `bound_addr` to an empty
    /// `OnceLock` so `/health` reports `addr: None` on the stdio path. A
    /// regression that pre-populates this field would advertise a bogus
    /// address from a stale clone.
    #[test]
    fn app_state_starts_with_empty_bound_addr() {
        let state = test_state();
        assert!(state.bound_addr.get().is_none());
    }
}