trusty_memory/lib.rs
1//! MCP server (HTTP/SSE + stdio) for trusty-memory.
2//!
3//! Why: Claude Code and other MCP-aware clients integrate with trusty-memory
4//! through the standardized Model Context Protocol; we expose memory + KG
5//! tools so they can be called by name. The canonical stdio integration is
6//! `trusty-memory serve --stdio` (PR1 #919 of the #914 cutover epic), a
7//! self-contained direct MCP server that binds no HTTP port or UDS socket.
8//! The former `trusty-memory-mcp-bridge` binary and Unix-domain-socket
9//! transport were removed in PR3 (#914) once `serve --stdio` made them dead
10//! code.
11//! What: Provides `run_http` / `run_http_dynamic` / `run_http_on` (axum
12//! HTTP/SSE + REST + UI) plus an `AppState` that carries the shared
13//! `PalaceRegistry`, on-disk data root, and a lazily-initialized embedder.
14//! Test: `cargo test -p trusty-memory` validates handshake + dispatch via
15//! the in-process `handle_message` unit tests and the
16//! `tests/serve_stdio_e2e.rs` end-to-end harness.
17
18use anyhow::Result;
19use serde_json::{json, Value};
20use std::net::SocketAddr;
21use std::path::{Path, PathBuf};
22use std::sync::atomic::{AtomicU8, AtomicUsize, Ordering};
23use std::sync::{Arc, OnceLock};
24use tokio::sync::{broadcast, OnceCell, RwLock};
25use trusty_common::bm25_client::Bm25Client;
26use trusty_common::mcp::initialize_response;
27use trusty_common::memory_core::embed::FastEmbedder;
28use trusty_common::memory_core::store::ChatSessionStore;
29use trusty_common::memory_core::PalaceRegistry;
30use trusty_common::ChatProvider;
31
32// Why: `tracing::info` is only used by the axum HTTP-serving helpers
33// (`run_http_on`, `spawn_uds_listener`). Pulling it in unconditionally
34// would trigger `unused_imports` warnings when the `axum-server`
35// feature is disabled. `SocketAddr` is still used by `bound_addr` on
36// `AppState` so it stays unconditional.
37#[cfg(feature = "axum-server")]
38use tracing::info;
39
/// Readiness phase advertised by the daemon (issues #910/#911).
///
/// Why: embedder cold-init (CoreML compile, 30-120 s) would otherwise block
/// the first real `memory_remember`/`memory_recall` call that arrives before
/// warm-up finishes. Exposing the phase lets handlers answer immediately with
/// an explicit "daemon is warming up, retry shortly" error rather than
/// stalling for minutes.
/// What: two stable discriminants intended to be stored in an atomic byte.
/// `Warming` (0) is the initial phase; `Ready` (1) is set once
/// `spawn_startup_tasks` has initialised the embedder. The transition is
/// one-way and lock-free: a single `AtomicU8` compare-and-swap.
/// Test: `daemon_readiness_transitions_warming_to_ready` in this module;
/// the end-to-end warming-error path is covered by
/// `tools::tests::remember_returns_warming_error_while_state_is_warming`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DaemonReadiness {
    /// Embedder cold-init (and/or pin scan) has not finished yet.
    Warming = 0,
    /// Embedder is initialised; every handler may proceed normally.
    Ready = 1,
}

impl DaemonReadiness {
    /// Turn the raw byte loaded from the atomic back into the enum.
    ///
    /// Why: keeping the `0 → Warming, else Ready` rule in one place means
    /// callers work with a meaningful enum instead of comparing integers.
    /// What: `0` decodes to `Warming`; every other value decodes to `Ready`
    /// (only `1` is ever written).
    /// Test: `daemon_readiness_from_u8` in this module.
    pub fn from_u8(v: u8) -> Self {
        match v {
            0 => Self::Warming,
            _ => Self::Ready,
        }
    }
}
78
pub mod activity;
pub mod attribution;
pub mod bm25_supervisor;
pub mod bootstrap;
/// File-descriptor usage and limit reporting for `/health`.
///
/// Why: expose `open_fds` / `fd_soft_limit` so operators can see the fd
/// ceiling and current consumption without needing lsof or shell access.
/// Test: `fd_metrics::tests::fd_metrics_returns_sane_values`.
pub mod fd_metrics;
// Why (issue #226): `chat` and `web` are pure axum HTTP/SSE handler
// surfaces. Gating them behind the `axum-server` feature is what lets
// library consumers (e.g. `open-mpm` linking only `MemoryMcpService`)
// drop axum + tower-http entirely from their build graph.
#[cfg(feature = "axum-server")]
pub mod chat;
pub mod commands;
pub mod discovery;
pub mod foreground;
pub mod hook_emit;
pub mod kg_extract;
pub mod mcp_service;
pub mod messaging;
pub mod openrpc;
/// Issue #88: project-root detection and palace-slug enforcement.
///
/// Why: prevents unbounded palace creation by anchoring palace names to the
/// canonical slug of the project directory that contains the CWD, or to the
/// `personal` sentinel for non-project contexts.
/// What: exports `find_project_root`, `project_slug_at`, `project_slug`,
/// `validate_palace_name`, `PERSONAL_PALACE`, and `PROJECT_MARKERS`.
/// Test: see unit tests inside this module.
pub mod project_root;
pub mod prompt_facts;
pub mod prompt_log;
pub mod service;
/// Single-pass startup pin-file scanner (issue #470).
///
/// Why: builds the `palace_id → project_path` map at daemon startup without
/// eager palace opens. One readdir sweep over the standard search roots is
/// cheaper than the O(#palaces) per-id loop used by the doctor path and runs
/// before the first HTTP request arrives.
/// What: exports `scan_pin_map` and `default_search_dirs`.
/// Test: see `startup_scan::tests`.
pub mod startup_scan;
pub mod tools;
pub mod transport;
#[cfg(feature = "axum-server")]
pub mod web;
128
129pub use activity::{ActivityEntry, ActivityFilter, ActivityLog, ActivitySource};
130pub use attribution::{CreatorInfo, CreatorSource};
131
/// Maximum number of characters in the trigger-prompt excerpt embedded on a
/// `HookFired` event.
///
/// Why: the full triggering prompt is sensitive and already lives in the
/// JSONL prompt log; the activity feed only needs a glanceable single-line
/// preview. ~80 chars matches the existing `drawer_content_preview`
/// convention so dashboard rows render uniformly.
/// What: 80 characters (counted as `char`s, not bytes); longer prompts are
/// cut and finished with a trailing `…`, which counts towards the limit.
/// Test: `hook_excerpt_truncates_long_prompts`.
pub const HOOK_PROMPT_EXCERPT_CHARS: usize = 80;

/// Build the short prompt preview carried by a `HookFired` activity event.
///
/// Why: see [`HOOK_PROMPT_EXCERPT_CHARS`]. A single truncation rule keeps
/// every emitter (HTTP, hook CLI handlers, future tests) producing the same
/// preview shape so the UI renders uniformly.
/// What: collapses every whitespace run in `prompt` to one space (leading
/// and trailing whitespace is dropped). If the result has at most
/// [`HOOK_PROMPT_EXCERPT_CHARS`] chars it is returned as-is; otherwise the
/// first `HOOK_PROMPT_EXCERPT_CHARS - 1` chars are kept and `…` is appended.
/// Empty or whitespace-only input yields an empty string.
/// Test: `hook_excerpt_truncates_long_prompts`,
/// `hook_excerpt_collapses_whitespace`.
pub fn hook_prompt_excerpt(prompt: &str) -> String {
    // Re-join the whitespace-separated words with single spaces.
    let mut excerpt = String::with_capacity(prompt.len());
    for word in prompt.split_whitespace() {
        if !excerpt.is_empty() {
            excerpt.push(' ');
        }
        excerpt.push_str(word);
    }

    // No char at index LIMIT means the text has at most LIMIT chars: it fits.
    if excerpt.chars().nth(HOOK_PROMPT_EXCERPT_CHARS).is_none() {
        return excerpt;
    }

    // Reserve one char of the budget for the ellipsis and cut on a char
    // boundary so multi-byte text is never split mid-codepoint.
    let kept_chars = HOOK_PROMPT_EXCERPT_CHARS.saturating_sub(1);
    let cut_at = excerpt
        .char_indices()
        .nth(kept_chars)
        .map_or(excerpt.len(), |(byte_idx, _)| byte_idx);
    excerpt.truncate(cut_at);
    excerpt.push('…');
    excerpt
}
166
167pub use mcp_service::MemoryMcpService;
168pub use tools::MemoryMcpServer;
169
/// Pick the directory that actually contains the per-palace subdirectories.
///
/// Why: two on-disk layouts exist in the wild. The current monorepo code
/// treats the registry directory *itself* as the parent of per-palace dirs
/// (`<dir>/<id>/palace.json`). The legacy standalone `trusty-memory` repo
/// nested everything one level deeper under a `palaces/` subdirectory
/// (`<data_dir>/palaces/<id>/palace.json`) — and that is where existing
/// installs keep their data (e.g. 88 palaces under
/// `~/Library/Application Support/trusty-memory/palaces/`). A daemon that
/// uses the bare data dir as its registry root finds zero palaces because
/// every `palace.json` sits one level below where it looked — the "palaces
/// lost on restart" bug.
/// What: returns `<data_dir>/palaces` when that path exists and is a
/// directory, otherwise returns `data_dir` unchanged. Resolving this once in
/// `main.rs` and using the result as `AppState::data_root` keeps every call
/// site (`status`, `palace_list`, `open_palace`, `palace_create`,
/// `load_palaces_from_disk`) consistent without forcing a data migration.
/// Test: `tests::resolve_palace_registry_dir_prefers_palaces_subdir` and
/// `resolve_palace_registry_dir_falls_back_to_data_dir`.
pub fn resolve_palace_registry_dir(data_dir: PathBuf) -> PathBuf {
    // Prefer the nested legacy layout when it is present on disk; fall back
    // to the data dir itself otherwise.
    Some(data_dir.join("palaces"))
        .filter(|candidate| candidate.is_dir())
        .unwrap_or(data_dir)
}
197
/// Hook type — labels the Claude Code hook that triggered a submission.
///
/// Why: every hook firing produces an activity-feed entry tagged with the
/// originating hook so operators can tell whether activity came from a user
/// prompt (`UserPromptSubmit`), a new session (`SessionStart`), or a future
/// hook variant. Threading this through `DaemonEvent::HookFired` lets the
/// dashboard badge each row with the hook label.
/// What: serde-serialised in PascalCase so the wire format matches Claude
/// Code's own hook-name strings exactly (e.g. `"UserPromptSubmit"`). No
/// `rename_all` attribute is applied, so the variant identifiers themselves
/// are the wire strings — renaming a variant changes the wire format.
/// Test: `hook_type_serde_round_trips`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum HookType {
    /// Claude Code's `UserPromptSubmit` hook — fires on every user prompt.
    UserPromptSubmit,
    /// Claude Code's `SessionStart` hook — fires once at session open.
    SessionStart,
}
215
216impl HookType {
217 /// Stable string label used for the wire format.
218 pub fn as_str(&self) -> &'static str {
219 match self {
220 Self::UserPromptSubmit => "UserPromptSubmit",
221 Self::SessionStart => "SessionStart",
222 }
223 }
224}
225
/// Injection kind — labels what the hook actually injected (or attempted).
///
/// Why: distinct from `HookType` because one hook could in principle render
/// more than one kind of injection (e.g. SessionStart can deliver both an
/// inbox check and bootstrap context). Tagging the rendered kind explicitly
/// keeps the activity log searchable when that fan-out lands.
/// What: serde-serialised as kebab-case so it matches the labels already
/// used in the JSONL prompt log (`prompt-context-facts`,
/// `inbox-check-messages`).
/// NOTE(review): with `rename_all = "kebab-case"` the variants serialise as
/// `prompt-context` / `inbox-check`, which are prefixes of — not equal to —
/// the prompt-log labels quoted above; confirm which spelling consumers
/// expect.
/// Test: `injection_kind_serde_round_trips`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "kebab-case")]
pub enum InjectionKind {
    /// `prompt-context` hook rendered the prompt-facts block.
    PromptContext,
    /// `inbox-check` hook delivered unread messages.
    InboxCheck,
}
244
245impl InjectionKind {
246 /// Stable string label used for the wire format.
247 pub fn as_str(&self) -> &'static str {
248 match self {
249 Self::PromptContext => "prompt-context",
250 Self::InboxCheck => "inbox-check",
251 }
252 }
253}
254
/// Live daemon events broadcast to connected SSE subscribers.
///
/// Why: The dashboard needs push-driven updates so palace creation, drawer
/// add/delete, dream cycles, and aggregate status changes are visible without
/// polling. A single broadcast channel fans out to every connected browser.
/// What: Tagged enum serialized as `{"type": "...", ...fields}` over SSE.
/// The `type` value is the snake_case variant name (`palace_created`,
/// `drawer_added`, `drawer_deleted`, `dream_completed`, `status_changed`,
/// `hook_fired`).
/// NOTE(review): this enum derives only `Serialize`. `#[serde(default)]` is a
/// deserialization attribute, so on the fields below it presumably has no
/// effect on the frames emitted here — confirm whether a mirrored
/// client-side type is what actually relies on it.
/// Test: `web::tests::sse_stream_emits_events` subscribes, triggers a
/// mutation, and asserts the frame arrives.
#[derive(Clone, Debug, serde::Serialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum DaemonEvent {
    /// A palace was created (`type: "palace_created"`). `id` is the value
    /// `DaemonEvent::palace_id` reports for this variant.
    PalaceCreated {
        id: String,
        name: String,
        /// Originating subsystem (HTTP, MCP, Hook). Why (issue #96): the
        /// UI badges each row with its source so operators can tell at a
        /// glance whether a write came from the dashboard form, an MCP
        /// tool call, or a hook-driven path. The wire-format key is
        /// `source` (lower-case strings via serde rename_all on
        /// `ActivitySource`).
        source: ActivitySource,
    },
    /// A drawer was added to a palace (`type: "drawer_added"`).
    DrawerAdded {
        palace_id: String,
        /// Friendly palace name (Palace.name) at write time. Why: lets SSE
        /// consumers (the dashboard activity feed) render the human-readable
        /// label without a separate id→name lookup. Empty string if the
        /// emitter could not resolve the name.
        #[serde(default)]
        palace_name: String,
        drawer_count: usize,
        /// Wall-clock timestamp when the drawer was added. Why: SSE
        /// receivers want to render "just now / 2m ago" relative to the
        /// daemon's clock, not the time the SSE frame happens to arrive.
        timestamp: chrono::DateTime<chrono::Utc>,
        /// Short preview of the drawer's content (whitespace-collapsed,
        /// truncated to ~80 chars with an ellipsis when cut). Why: the TUI
        /// activity feed and dashboard ticker want to show *what* was
        /// stored, not just the running drawer count. Empty when the
        /// emitter could not resolve the content (legacy clients tolerate
        /// the missing field via `#[serde(default)]`).
        #[serde(default)]
        content_preview: String,
        /// Originating subsystem (issue #96).
        source: ActivitySource,
    },
    /// A drawer was deleted from a palace (`type: "drawer_deleted"`).
    DrawerDeleted {
        palace_id: String,
        drawer_count: usize,
        /// Originating subsystem (issue #96).
        source: ActivitySource,
    },
    /// A dream cycle finished (`type: "dream_completed"`). `palace_id` is
    /// optional; a dream that spans all palaces is not scoped to one palace
    /// (see `DaemonEvent::palace_id`).
    DreamCompleted {
        palace_id: Option<String>,
        merged: usize,
        pruned: usize,
        compacted: usize,
        closets_updated: usize,
        duration_ms: u64,
        /// Originating subsystem (issue #96).
        source: ActivitySource,
    },
    /// Aggregate daemon-wide totals changed (`type: "status_changed"`).
    /// The only variant without a palace id or a `source` field.
    StatusChanged {
        total_drawers: usize,
        total_vectors: usize,
        total_kg_triples: usize,
    },
    /// A Claude Code hook completed and rendered (or attempted to render) an
    /// injection block.
    ///
    /// Why: pre-#XXX the activity feed only fired on drawer / palace / dream
    /// writes, which meant a normal Claude Code session — whose only daemon
    /// traffic is hook invocations — left the feed empty. Surfacing every
    /// hook firing answers the user complaint "no activity in the TUI" and
    /// gives operators a way to see how often each project palace is
    /// actually picking up prompt-context / inbox-check work.
    /// NOTE(review): `#XXX` above is a placeholder — fill in the real issue
    /// number.
    /// What: carries the resolved palace (or `None` if cwd resolution
    /// failed), the [`HookType`] label, the [`InjectionKind`] label, the
    /// rendered injection byte length, a short excerpt of the triggering
    /// prompt (capped at ~80 chars; the full content stays in the JSONL
    /// prompt log only), the timestamp, the hook's wall-clock duration,
    /// and the [`ActivitySource`] tag (always `Hook` for this variant).
    /// Backwards-compatible: SSE clients that do not recognise the
    /// `hook_fired` `type` tag can safely ignore the frame.
    HookFired {
        /// Resolved palace id (slug) — `None` if cwd resolution failed.
        #[serde(default)]
        palace_id: Option<String>,
        /// Friendly palace name at hook time — `None` if the registry
        /// could not be consulted (HTTP path uses `palace_id` here when
        /// no separate name is known).
        #[serde(default)]
        palace_name: Option<String>,
        hook_type: HookType,
        injection_kind: InjectionKind,
        /// Rendered injection size in bytes (`0` when no injection was
        /// emitted, e.g. SessionStart with an empty inbox).
        injection_length: u64,
        /// Short excerpt of the triggering prompt for the activity feed
        /// display. Capped at ~80 chars with a trailing `…` when cut.
        /// Why: the activity feed renders this directly; full prompt
        /// content (which may be sensitive) stays in the JSONL log.
        #[serde(default)]
        trigger_prompt_excerpt: String,
        timestamp: chrono::DateTime<chrono::Utc>,
        /// Hook wall-clock duration in milliseconds.
        duration_ms: u64,
        /// Always `ActivitySource::Hook` for this variant; encoded explicitly
        /// so the same dispatch path (`emit`) can persist + broadcast it.
        source: ActivitySource,
    },
}
367
368/// Open the activity log under `data_root`, falling back to a per-process
369/// tempdir and finally to a no-op `Discard` variant when no writable
370/// directory is available.
371///
372/// Why (issues #96, #225): the activity log is a best-effort feature — if
373/// the data root is on a read-only mount, missing, or locked by another
374/// process, the daemon should still come up and serve every other endpoint.
375/// The first fallback is a `std::env::temp_dir()`-anchored subdirectory
376/// keyed by the daemon's process id. Issue #225: a previous version called
377/// `expect()` on the tempdir fallback, which crashed the daemon on hosts
378/// where neither `data_root` nor `std::env::temp_dir()` is writable
379/// (read-only containers, locked-down sandboxes). The contract is
380/// "best-effort", so the final fallback is now `ActivityLog::discard()` —
381/// a no-op variant that drops every append and returns empty reads. The
382/// dashboard's activity feed simply shows up empty in that degraded state.
383/// What: tries `ActivityLog::open(data_root)`; on error logs a warning and
384/// retries against `<temp>/trusty-memory-activity-<pid>/`. If both fail,
385/// emits a final warning and returns `ActivityLog::discard()`.
386/// Test: `open_activity_log_with_fallback_returns_discard_when_unwritable`
387/// covers the discard branch; existing `AppState` construction tests cover
388/// the happy and tempdir-fallback paths.
389fn open_activity_log_with_fallback(data_root: &Path) -> Arc<ActivityLog> {
390 match ActivityLog::open(data_root) {
391 Ok(log) => Arc::new(log),
392 Err(primary_err) => {
393 tracing::warn!(
394 "could not open activity log at {}: {primary_err:#}; falling back to per-process tempdir",
395 data_root.display()
396 );
397 let fallback =
398 std::env::temp_dir().join(format!("trusty-memory-activity-{}", std::process::id()));
399 match ActivityLog::open(&fallback) {
400 Ok(log) => Arc::new(log),
401 Err(fallback_err) => {
402 tracing::warn!(
403 "activity log tempdir fallback at {} also failed: {fallback_err:#}; \
404 activity feed disabled for this process (no-op log)",
405 fallback.display()
406 );
407 Arc::new(ActivityLog::discard())
408 }
409 }
410 }
411 }
412}
413
414impl DaemonEvent {
415 /// Short discriminant label matching the SSE `type` field.
416 ///
417 /// Why: the persisted activity log stores `event_type` as a string so
418 /// the UI can render the row without re-parsing the payload. Sharing
419 /// the same labels the SSE serializer uses keeps the wire and the
420 /// stored history consistent.
421 /// What: returns one of `palace_created`, `drawer_added`,
422 /// `drawer_deleted`, `dream_completed`, `status_changed`.
423 /// Test: `daemon_event_type_str_matches_sse_tag` in the lib tests.
424 pub fn type_str(&self) -> &'static str {
425 match self {
426 Self::PalaceCreated { .. } => "palace_created",
427 Self::DrawerAdded { .. } => "drawer_added",
428 Self::DrawerDeleted { .. } => "drawer_deleted",
429 Self::DreamCompleted { .. } => "dream_completed",
430 Self::StatusChanged { .. } => "status_changed",
431 Self::HookFired { .. } => "hook_fired",
432 }
433 }
434
435 /// `palace_id` if the event is scoped to a single palace.
436 ///
437 /// Why: the activity log indexes entries by palace id so the UI can
438 /// filter by palace; daemon-wide events (`status_changed`,
439 /// dream-across-all-palaces) return `None`.
440 /// What: returns a borrowed string when the variant carries a palace
441 /// id, otherwise `None`.
442 /// Test: `daemon_event_palace_id_extraction`.
443 pub fn palace_id(&self) -> Option<&str> {
444 match self {
445 Self::PalaceCreated { id, .. } => Some(id),
446 Self::DrawerAdded { palace_id, .. } | Self::DrawerDeleted { palace_id, .. } => {
447 Some(palace_id)
448 }
449 Self::DreamCompleted { palace_id, .. } => palace_id.as_deref(),
450 Self::HookFired { palace_id, .. } => palace_id.as_deref(),
451 Self::StatusChanged { .. } => None,
452 }
453 }
454
455 /// Originating subsystem if the event carries one.
456 ///
457 /// Why: only mutation events carry a `source`; the aggregate
458 /// `StatusChanged` is recomputed by the daemon and has no caller, so
459 /// it returns `None`.
460 /// What: returns the variant's `source` field where present.
461 /// Test: `daemon_event_source_extraction`.
462 pub fn source(&self) -> Option<ActivitySource> {
463 match self {
464 Self::PalaceCreated { source, .. }
465 | Self::DrawerAdded { source, .. }
466 | Self::DrawerDeleted { source, .. }
467 | Self::DreamCompleted { source, .. }
468 | Self::HookFired { source, .. } => Some(*source),
469 Self::StatusChanged { .. } => None,
470 }
471 }
472}
473
474/// Shared application state passed to every request handler.
475///
476/// Why: The stdio loop and HTTP server need the same handles to the registry,
477/// data root, and embedder so MCP tools can perform real reads/writes against
478/// the live trusty-memory core. The embedder is heavy (loads ONNX weights) so
479/// we hold it behind a `OnceCell` and initialize lazily on first use.
480/// What: `Clone`-able via `Arc` fields. The registry / data root are eager;
481/// `embedder` is `Arc<OnceCell<Arc<FastEmbedder>>>` so concurrent first-use
482/// races resolve to a single shared instance.
483/// Test: `app_state_default_constructs` confirms construction without panic.
484#[derive(Clone)]
485pub struct AppState {
486 pub version: String,
487 pub registry: Arc<PalaceRegistry>,
488 pub data_root: PathBuf,
489 pub embedder: Arc<OnceCell<Arc<FastEmbedder>>>,
490 /// Optional default palace applied to MCP tool calls when the caller
491 /// omits the `palace` argument. Set via `trusty-memory serve --palace`.
492 pub default_palace: Option<String>,
493 /// Active chat provider selected at startup. `None` means no upstream is
494 /// configured (no Ollama detected and no OpenRouter key) — callers must
495 /// degrade gracefully (chat endpoint returns 412).
496 pub chat_provider: Arc<OnceCell<Option<Arc<dyn ChatProvider>>>>,
497 /// Per-palace chat-session stores, opened lazily so cold-start cost is
498 /// paid only when chat-history endpoints are hit.
499 pub session_stores: Arc<dashmap::DashMap<String, Arc<ChatSessionStore>>>,
500 /// Broadcast sender for live `DaemonEvent` pushes to SSE subscribers.
501 ///
502 /// Why: Lets mutating handlers emit events that any connected dashboard
503 /// receives instantly. Cap of 128 buffers transient slow readers; if a
504 /// receiver lags it gets `RecvError::Lagged` and we emit a `lag` frame.
505 pub events: Arc<broadcast::Sender<DaemonEvent>>,
506 /// Instant the daemon started, used to compute `uptime_secs` on `/health`.
507 ///
508 /// Why (issue #35): `GET /health` reports how long the daemon has been
509 /// up. Capturing a monotonic `Instant` at `AppState` construction lets the
510 /// handler compute the elapsed seconds cheaply and without a clock-skew
511 /// hazard.
    /// What: a monotonic `Instant`; `AppState::new` stamps it at startup.
513 /// Test: `health_endpoint_includes_resource_fields`.
514 pub started_at: std::time::Instant,
515 /// In-memory ring buffer of recent tracing log lines (issue #35).
516 ///
517 /// Why: the `GET /api/v1/logs/tail` endpoint serves the last N log lines
518 /// so operators can inspect a running daemon without tailing a file. The
519 /// buffer is shared between the tracing `LogBufferLayer` (writer) and the
520 /// HTTP handler (reader).
521 /// What: a cheap `Arc`-backed clone of the buffer the subscriber writes
522 /// to. Defaults to an empty buffer for states that never install the
523 /// layer (tests, the stdio path).
524 /// Test: `logs_tail_returns_recent_lines`.
525 pub log_buffer: trusty_common::log_buffer::LogBuffer,
526 /// Bug-capture ERROR store (bug-reporting #478, Phase 1).
527 ///
528 /// Why: Phase 2 MCP / HTTP endpoints need to query captured errors; stashing
529 /// the `ErrorStore` handle here lets any handler reach it cheaply without
530 /// a second global or per-request construction.
531 /// What: populated by `run_serve` from the `init_tracing_with_buffer_and_capture`
532 /// result; the layer writes to this store automatically so every
533 /// `tracing::error!` call site contributes without any changes to call
534 /// sites. `None` in states that do not install the layer (tests, the
535 /// stdio path).
536 /// Test: compile-presence is verified by the `trusty-memory` build; Phase 2
537 /// will add query tests in `web.rs`.
538 pub error_store: Option<trusty_common::error_capture::ErrorStore>,
539 /// Most recent on-disk footprint of `data_root`, in bytes (issue #35).
540 ///
541 /// Why: `GET /health` reports `disk_bytes`. Walking the data directory on
542 /// every health request would make a frequent health poll do unbounded
543 /// I/O; a background task recomputes it every 10 s and stores it here so
544 /// the handler reads it lock-free.
545 /// What: an `AtomicU64` updated by the ticker spawned in `run_http_on`.
546 /// `0` until the first walk completes.
547 /// Test: `health_endpoint_includes_resource_fields`.
548 pub disk_bytes: Arc<std::sync::atomic::AtomicU64>,
549 /// Per-process RSS + CPU sampler, refreshed on each `/health` request
550 /// (issue #35).
551 ///
552 /// Why: CPU usage is a delta between two `sysinfo` refreshes, so the
553 /// sampler must persist between requests — hence the shared `Mutex`.
554 /// What: a `tokio::sync::Mutex<SysMetrics>` so the async health handler
555 /// can sample without blocking the runtime.
556 /// Test: `health_endpoint_includes_resource_fields`.
557 pub sys_metrics: Arc<tokio::sync::Mutex<trusty_common::sys_metrics::SysMetrics>>,
558 /// HTTP listener address the daemon bound to, once `run_http_on` is running.
559 ///
560 /// Why: clients (and `/health` responses) need to advertise the live
561 /// `host:port` even though port selection happens dynamically (7070–7079
562 /// walk + OS fallback). Stashing it on `AppState` lets request handlers
563 /// surface the discovery value without re-querying the listener.
564 /// What: a `OnceLock<SocketAddr>` so `run_http_on` writes it exactly once
565 /// at bind time and every handler reads it lock-free thereafter. Empty
566 /// (`None` from `get()`) on the stdio path where no listener exists.
567 /// Test: `health_endpoint_reports_bound_addr` (added below).
568 pub bound_addr: Arc<OnceLock<SocketAddr>>,
569 /// Cached prompt-facts surface served by the MCP `get_prompt_context`
570 /// tool (issue #42).
571 ///
572 /// Why: The original session-init `prompts/get` design loaded context
573 /// once per connection; switching to a per-message tool lets the model
574 /// pull fresh, query-filtered context on demand. The cache holds both
575 /// the raw triples (for filtered lookups) and a pre-formatted Markdown
576 /// block (for the unfiltered hot path) so neither code path re-walks
577 /// the KG. The cache is rebuilt by
578 /// `prompt_facts::rebuild_prompt_cache` after any write that touches a
579 /// hot predicate (`kg_assert`, `add_alias`, `remove_prompt_fact`).
580 /// What: An `Arc<tokio::sync::RwLock<PromptFactsCache>>` so the hot
581 /// read path takes a brief read lock and clones the cache; rebuilds
582 /// take a write lock for the assignment only. The async-aware lock
583 /// (issue #229) yields to the tokio runtime instead of blocking a
584 /// runtime thread for the rebuild duration. An empty `triples` vec ↔
585 /// "no context stored yet" (the tool handler renders a hint).
586 /// Test: `get_prompt_context_returns_cached_or_hint`,
587 /// `get_prompt_context_filters_by_query`.
588 pub prompt_context_cache: Arc<RwLock<prompt_facts::PromptFactsCache>>,
589 /// Persistent activity log (issue #96).
590 ///
591 /// Why: the dashboard activity feed used to be a pure live-stream over
592 /// `/sse` — opening the UI showed an empty feed and any mutation from
593 /// the MCP path was invisible. Holding an `ActivityLog` on `AppState`
594 /// lets `emit` record an entry on every push so the
595 /// `GET /api/v1/activity` handler can return historical rows on mount
596 /// and the live SSE stream can continue prepending events on top of
    /// the loaded history. The field is a plain `Arc<ActivityLog>`, not an
    /// `Option`, so a log handle is always present (tests that use
    /// `AppState::new` get a real log under their tempdir so behaviour
    /// matches production).
600 /// What: an `Arc<ActivityLog>` shared with every emitter.
601 /// Test: `web::tests::activity_endpoint_lists_recent_emits`.
602 pub activity_log: Arc<ActivityLog>,
603 /// Optional per-palace BM25 lexical search lane (issue #156).
604 ///
605 /// Why: in-process BM25 would serialise the recall hot path on disk
606 /// I/O during writes and contend with the redb/usearch locks. Delegating
607 /// to the `trusty-bm25-daemon` subprocess (one socket per palace) keeps
608 /// BM25 ingestion and search off the critical path while still feeding
609 /// hits into the recall RRF fusion.
610 /// What: `Some(client)` only when `TRUSTY_BM25_DAEMON=1` at startup —
611 /// every code path that uses this field is gated on `is_some()` and
612 /// falls back to vector-only behaviour otherwise so existing deployments
613 /// see zero behavioural change.
614 /// Test: `bm25_client_disabled_by_default`,
615 /// `bm25_client_enabled_when_env_set`.
616 pub bm25_client: Option<Arc<Bm25Client>>,
617 /// Optional per-palace BM25 daemon spawn supervisor (issue #193).
618 ///
619 /// Why: without an in-process supervisor the BM25 daemon must be
620 /// launched out-of-band (launchd, manual `trusty-bm25-daemon`), which
621 /// is the same UX trap PR #190 fixed for trusty-embedderd. Holding a
622 /// supervisor here lets us spawn the daemon on first BM25 use for a
623 /// palace, restart it if it dies, and reap it on clean shutdown.
624 /// `Some` only when `TRUSTY_BM25_DAEMON=1` at startup — the same gate
625 /// that enables `bm25_client`. When set but `TRUSTY_BM25_EXTERNAL=1`,
626 /// the supervisor's `ensure_running` becomes a no-op that just returns
627 /// the canonical socket path so operators can keep using their own
628 /// process manager.
629 /// Test: covered by `bm25_supervisor_present_when_env_set` and the
630 /// `bm25_supervisor::tests` unit tests.
631 pub bm25_supervisor: Option<Arc<bm25_supervisor::Bm25Supervisor>>,
632 /// Per-palace write serialisation locks (issue #230).
633 ///
634 /// Why: the dedup gate in `tools.rs` previously read a snapshot of
635 /// existing drawers, checked for near-duplicates via Jaro-Winkler, and
636 /// then issued the write — a classic time-of-check/time-of-use race.
637 /// Two concurrent `memory_remember` calls with the same content could
638 /// both see the pre-write snapshot, both pass the gate, and both land
639 /// duplicate drawers. Serialising the gate-then-write sequence per
640 /// palace closes the window: while one task holds the mutex, any
641 /// concurrent writer for the same palace blocks until the first write
642 /// finishes and is visible to `list_drawers`. The lock is **per
643 /// palace** (not global) so writes to different palaces continue to
644 /// run in parallel.
645 /// What: a `DashMap` keyed by palace id, where each entry is an
646 /// `Arc<tokio::sync::Mutex<()>>`. The mutex is constructed lazily by
647 /// `palace_write_lock` on first access. `Arc` lets callers hold a
648 /// clone of the lock past the lifetime of the `DashMap` entry so the
649 /// map never needs to be held across an `.await`.
650 /// Test: `tools::tests::dedup_gate_blocks_concurrent_duplicate_writes`.
651 pub palace_write_locks: Arc<dashmap::DashMap<String, Arc<tokio::sync::Mutex<()>>>>,
652 /// Counter of in-flight activity-log writes spawned by `emit`
653 /// (issue #232).
654 ///
655 /// Why: `emit` offloads the synchronous redb append to the tokio blocking
656 /// pool via `spawn_blocking` so the async runtime is never parked waiting
657 /// on fsync. The write is fire-and-forget — `emit` returns immediately
658 /// after spawning. Tests that observe the activity log right after a
659 /// burst of `emit` calls need a deterministic synchronization point;
660 /// holding an in-flight counter lets `flush_activity_writes` poll until
661 /// every spawned append has settled, which keeps the assertions
662 /// race-free without forcing every caller to `.await`.
663 /// What: an `Arc<AtomicUsize>` incremented before each `spawn_blocking`
664 /// and decremented inside the closure (after the append completes, even
665 /// if it errored). The counter is cheap (one atomic add per emit) and
666 /// stays at zero in steady-state production traffic.
667 /// Test: `web::tests::activity_endpoint_lists_recent_emits` and
668 /// `tests::emit_persists_mutations_but_skips_status_changed` call
669 /// `flush_activity_writes` to drain the counter before reading the log.
670 pub pending_activity_writes: Arc<AtomicUsize>,
671 /// In-memory cache mapping palace id → `Palace.name` (issue #228).
672 ///
673 /// Why: every `memory_remember` / `memory_note` write used to call
674 /// `PalaceRegistry::list_palaces` (a synchronous filesystem walk of the
675 /// data root) just to resolve a friendly palace name for the SSE
676 /// `DrawerAdded` event. With N palaces on disk the cost was O(N) opendirs
677 /// plus `palace.json` reads on every write, blocking the async runtime.
678 /// Caching the name in-memory turns the lookup into a `DashMap::get`.
679 /// What: `DashMap<String, String>` populated by `create_palace` and
680 /// `load_palaces_from_disk`, kept in sync by rename / delete paths.
681 /// Missing entries are treated as "name unknown" so callers fall back to
682 /// the palace id and the emit path never fails.
683 /// Test: `palace_name_cache_populated_after_hydration` and
684 /// `palace_name_cache_updates_on_create`.
685 pub palace_names: Arc<dashmap::DashMap<String, String>>,
686 /// Single-pass startup pin-file map: palace id → project root path (issue #470).
687 ///
688 /// Why: after daemon startup we have no record of which on-disk project
689 /// directories correspond to which palace ids — that information only
690 /// existed inside the pin files on disk. Eager-opening every palace on
691 /// startup is too expensive. This field captures the scan-only result of
692 /// `startup_scan::scan_pin_map` so handlers that want to locate a project
693 /// by its palace id (e.g. future cwd-inference, project-health checks)
694 /// can do a single `DashMap::get` instead of a filesystem walk.
695 /// Populated once, shortly after `load_palaces_from_disk` returns, by
696 /// `spawn_startup_tasks`. Never mutated after population — it is a
697 /// snapshot of what the filesystem looked like at startup.
698 /// What: `DashMap<String (palace_id), PathBuf (project root)>`.
699 /// The outer `Arc` lets `spawn_startup_tasks` (which holds only a clone
700 /// of `AppState`) write to the same backing map that request handlers
701 /// read. Population is asynchronous so callers must treat an absent entry
702 /// as "not yet scanned" (or "no pin found"), never as "palace unknown".
703 /// Test: `startup_scan::tests::scan_pin_map_*` validate the underlying
704 /// scanner function; the wiring in `spawn_startup_tasks` is covered by
705 /// the integration-test daemon start path.
706 pub pin_project_map: Arc<dashmap::DashMap<String, PathBuf>>,
707 /// Bounded sender for the BM25 index worker (issue #231).
708 ///
709 /// Why: the previous fire-and-forget design `tokio::spawn`ed one task per
710 /// `memory_remember` / `memory_note` call, so a write burst against a slow
711 /// or unreachable BM25 daemon grew an unbounded in-flight task queue. A
712 /// single long-lived worker draining a bounded mpsc channel caps that
713 /// back-pressure: writers `try_send` (never block), full-queue requests
714 /// are dropped with a `warn!`, and the worker exits cleanly when the last
715 /// sender is dropped on shutdown.
716 /// What: an `mpsc::Sender` cloned to every `AppState` clone (cheap). The
717 /// matching receiver is consumed by the worker spawned in
718 /// [`AppState::new`] via [`tools::spawn_bm25_index_worker`]. Capacity is
719 /// [`tools::BM25_INDEX_QUEUE_CAPACITY`] (256).
720 /// Test: `bm25_index_queue_drops_when_full` exercises the full-queue
721 /// branch via `bm25_index_enqueue`.
722 pub bm25_index_tx: tokio::sync::mpsc::Sender<tools::Bm25IndexRequest>,
723 /// Cached result of the startup update check (issue #537).
724 ///
725 /// Why: `/health` should report `update_available` without hitting crates.io
726 /// on every probe. A single background check at daemon startup stores the
727 /// result here; the health handler reads it lock-free (well, a brief mutex
728 /// lock) without a network call.
729 /// What: `None` = up-to-date or check not yet done; `Some("x.y.z")` = newer
730 /// version available. The field is populated by a `tokio::spawn` in
731 /// `spawn_startup_tasks` (main.rs) after the daemon binds.
732 /// Test: indirectly by the `/health` endpoint tests in `web.rs`.
733 pub update_available: Arc<std::sync::Mutex<Option<String>>>,
734 /// Two-phase readiness state — `Warming` until the embedder is initialised,
735 /// then `Ready` (issues #910 / #911).
736 ///
737 /// Why: `AppState::embedder()` used to call `FastEmbedder::new()` without
738 /// any timeout, so the first `memory_recall`/`memory_remember` that arrived
739 /// before CoreML finished compiling would block for 5–11 hours until the
740 /// OnceCell resolved (issue #910). Exposing this state lets the preflight
741 /// guards in `tools.rs` return an explicit fast error immediately —
742 /// `"trusty-memory is warming up, retry shortly"` — instead of queueing
743 /// behind an open-ended init.
744 /// What: An `AtomicU8` starting at `DaemonReadiness::Warming` (0) and flipped
745 /// to `DaemonReadiness::Ready` (1) by `spawn_startup_tasks` after the embedder
746 /// warm-up succeeds. The transition is one-way and lock-free.
747 /// Test: `daemon_readiness_transitions_warming_to_ready`.
748 pub daemon_readiness: Arc<AtomicU8>,
749}
750
751impl AppState {
752 /// Construct an `AppState` rooted at the given on-disk data directory.
753 ///
754 /// Why: The CLI (`serve`) and integration tests need to point the MCP
755 /// server at different roots — production at `dirs::data_dir`, tests at a
756 /// `tempfile::tempdir()`.
757 /// What: Builds an empty `PalaceRegistry`, captures the version, and
758 /// allocates an empty `OnceCell` for the embedder. `default_palace` is
759 /// `None`; use `with_default_palace` to set it.
760 /// Test: `tools::tests::dispatch_palace_create_persists` constructs an
761 /// AppState pointed at a tempdir and round-trips a palace through it.
762 pub fn new(data_root: PathBuf) -> Self {
763 let (events_tx, _) = broadcast::channel::<DaemonEvent>(128);
764 // Issue #96: open (or create) the persistent activity log under the
765 // daemon data root. Open failure is logged but never crashes the
766 // daemon — we fall back to a per-process tempdir so emits remain
767 // best-effort and the rest of the daemon keeps working.
768 let activity_log = open_activity_log_with_fallback(&data_root);
769 // Issue #231: bounded mpsc channel + single long-lived worker
770 // replaces the per-write `tokio::spawn` fire-and-forget pattern so
771 // BM25 indexing back-pressure is capped. The worker is spawned here
772 // unconditionally so the channel always has a drain — even when
773 // `bm25_client` is `None`, the worker just consumes and discards
774 // each request so senders never block on a full queue.
775 let (bm25_index_tx, bm25_index_rx) =
776 tokio::sync::mpsc::channel::<tools::Bm25IndexRequest>(tools::BM25_INDEX_QUEUE_CAPACITY);
777 // `bm25_client` / `bm25_supervisor` start as `None`; the builder
778 // `with_bm25_client_from_env` rebuilds the worker with the real
779 // client + supervisor once env-gated opt-in is resolved.
780 tools::spawn_bm25_index_worker(bm25_index_rx, None, None);
781 Self {
782 version: env!("CARGO_PKG_VERSION").to_string(),
783 registry: Arc::new(PalaceRegistry::new()),
784 data_root,
785 embedder: Arc::new(OnceCell::new()),
786 default_palace: None,
787 chat_provider: Arc::new(OnceCell::new()),
788 session_stores: Arc::new(dashmap::DashMap::new()),
789 events: Arc::new(events_tx),
790 started_at: std::time::Instant::now(),
791 // Default to an empty buffer — `with_log_buffer` overrides this
792 // when the daemon installs the `LogBufferLayer` (HTTP mode).
793 log_buffer: trusty_common::log_buffer::LogBuffer::new(
794 trusty_common::log_buffer::DEFAULT_LOG_CAPACITY,
795 ),
796 // Bug-reporting #478: `None` until `with_error_store` is called
797 // during daemon startup (HTTP mode). Tests keep `None` so no
798 // unexpected files are written to the OS data dir.
799 error_store: None,
800 disk_bytes: Arc::new(std::sync::atomic::AtomicU64::new(0)),
801 sys_metrics: Arc::new(tokio::sync::Mutex::new(
802 trusty_common::sys_metrics::SysMetrics::new(),
803 )),
804 bound_addr: Arc::new(OnceLock::new()),
805 prompt_context_cache: Arc::new(RwLock::new(prompt_facts::PromptFactsCache::default())),
806 activity_log,
807 bm25_client: None,
808 bm25_supervisor: None,
809 palace_write_locks: Arc::new(dashmap::DashMap::new()),
810 pending_activity_writes: Arc::new(AtomicUsize::new(0)),
811 palace_names: Arc::new(dashmap::DashMap::new()),
812 pin_project_map: Arc::new(dashmap::DashMap::new()),
813 bm25_index_tx,
814 update_available: Arc::new(std::sync::Mutex::new(None)),
815 // Start in Warming state; flipped to Ready by spawn_startup_tasks
816 // once the embedder warm-up succeeds (issues #910/#911).
817 daemon_readiness: Arc::new(AtomicU8::new(DaemonReadiness::Warming as u8)),
818 }
819 }
820
821 /// Acquire (lazily, then clone) the per-palace write mutex.
822 ///
823 /// Why (issue #230): the dedup-check + `remember_with_options` write
824 /// sequence in `tools.rs` must be atomic per palace to prevent two
825 /// concurrent identical writes from both passing the dedup gate.
826 /// Callers hold the returned `Arc<Mutex<()>>`'s guard across the gate
827 /// check and the write so the second writer blocks until the first
828 /// write is visible to `list_drawers`. Returning a clone of the `Arc`
829 /// rather than a borrow into the `DashMap` lets the caller `.await`
830 /// while holding the lock without risking a deadlock against any
831 /// future map mutation (DashMap shards are sync mutexes).
832 /// What: looks up the palace id in `palace_write_locks` and returns
833 /// a clone of the existing mutex; on the first call for a palace,
834 /// inserts a freshly-constructed `tokio::sync::Mutex<()>` first. The
835 /// `DashMap::entry().or_insert_with` API guarantees the lazy
836 /// construction is racy-safe — only one mutex is ever inserted per
837 /// palace id.
838 /// Test: `tools::tests::dedup_gate_blocks_concurrent_duplicate_writes`.
839 pub fn palace_write_lock(&self, palace_id: &str) -> Arc<tokio::sync::Mutex<()>> {
840 if let Some(existing) = self.palace_write_locks.get(palace_id) {
841 return existing.clone();
842 }
843 self.palace_write_locks
844 .entry(palace_id.to_string())
845 .or_insert_with(|| Arc::new(tokio::sync::Mutex::new(())))
846 .clone()
847 }
848
849 /// Look up a project root path by palace id in the startup pin-scan map.
850 ///
851 /// Why: provides a stable, cheap accessor so handlers do not reach directly
852 /// into the `DashMap` field and so the accessor can be mocked in future
853 /// tests without touching `AppState` internals. The map is populated
854 /// asynchronously by `spawn_startup_tasks` — an absent entry means either
855 /// the scan has not completed yet or no pin file claimed that id.
856 /// What: returns `Some(project_path)` when the palace id was found during
857 /// startup scan; `None` otherwise.
858 /// Test: covered indirectly via the startup-scan integration path; the
859 /// underlying map data is validated by `startup_scan::tests`.
860 pub fn pinned_project_path(&self, palace_id: &str) -> Option<PathBuf> {
861 self.pin_project_map.get(palace_id).map(|e| e.clone())
862 }
863
864 /// Builder-style: opt-in to the BM25 lexical lane (issue #156).
865 ///
866 /// Why: the BM25 subprocess is gated behind `TRUSTY_BM25_DAEMON=1` so
867 /// the default `cargo install trusty-memory` / launchd plist deployment
868 /// stays vector-only and existing test fixtures keep passing without
869 /// having to provision a daemon. Reading the env var here keeps the
870 /// gating logic in one place (the helper in `main.rs` just plumbs the
871 /// result through).
872 /// What: when `TRUSTY_BM25_DAEMON=1`, constructs one `Bm25Client` per
873 /// palace by lazy-resolving the socket path the first time the palace
874 /// id is observed. Currently we install a shared `default` client up
875 /// front and re-key on the palace id at the call site — palaces with no
876 /// daemon socket simply see search/index errors which we log + ignore.
877 /// Returns `self` unchanged when the env var is unset or set to anything
878 /// other than `1`.
879 /// Test: `bm25_client_disabled_by_default`,
880 /// `bm25_client_enabled_when_env_set`.
881 #[must_use]
882 pub fn with_bm25_client_from_env(mut self) -> Self {
883 if std::env::var("TRUSTY_BM25_DAEMON").as_deref() == Ok("1") {
884 // Install the default-palace client; per-palace clients are
885 // constructed on demand via `Bm25Client::for_palace`.
886 let default_palace = self.default_palace.as_deref().unwrap_or("default");
887 self.bm25_client = Some(Arc::new(Bm25Client::for_palace(default_palace)));
888 // Issue #193: hand-in-hand with the client, attach a spawn
889 // supervisor so the BM25 daemon is auto-started on first use
890 // for any palace. Operators who want to manage daemons
891 // out-of-band (launchd, systemd, manual) set
892 // TRUSTY_BM25_EXTERNAL=1 which makes the supervisor a no-op.
893 self.bm25_supervisor = Some(Arc::new(bm25_supervisor::Bm25Supervisor::new()));
894 // Issue #231: rebuild the bounded indexer channel + worker so
895 // the worker holds the now-populated client + supervisor. The
896 // placeholder worker installed by `AppState::new` (with `None`
897 // / `None`) drained the channel into the void — replacing the
898 // sender here closes the placeholder receiver and the
899 // placeholder worker exits cleanly. The new worker takes over
900 // as the sole drain for the indexer queue.
901 let (tx, rx) = tokio::sync::mpsc::channel::<tools::Bm25IndexRequest>(
902 tools::BM25_INDEX_QUEUE_CAPACITY,
903 );
904 tools::spawn_bm25_index_worker(
905 rx,
906 self.bm25_client.clone(),
907 self.bm25_supervisor.clone(),
908 );
909 self.bm25_index_tx = tx;
910 tracing::info!(
911 palace = default_palace,
912 "BM25 daemon client + spawn supervisor enabled (TRUSTY_BM25_DAEMON=1)"
913 );
914 }
915 self
916 }
917
918 /// Scan the palace registry directory and re-register every persisted
919 /// palace into the in-memory [`PalaceRegistry`].
920 ///
921 /// Why: `AppState::new` builds an *empty* registry, so after a daemon
922 /// restart `palace_list` / the dashboard reported zero palaces even though
923 /// dozens existed on disk — palace metadata was persisted by
924 /// `palace_create` but never re-hydrated on startup. This method closes
925 /// that gap by walking the on-disk layout (each subdirectory holding a
926 /// `palace.json` is one palace) and rebuilding a live `PalaceHandle` for
927 /// each, so recall paths see the full set immediately after a restart.
928 /// What: runs the blocking filesystem walk + per-palace `PalaceHandle::open`
929 /// on a `spawn_blocking` thread (so it never stalls the async runtime),
930 /// registers each successfully opened palace via `register_arc`, logs every
931 /// load at `debug!`, and returns the count loaded. A palace that fails to
932 /// open (corrupt index, unreadable `kg.db`, etc.) is logged at `warn!` and
933 /// skipped — one bad palace must not abort startup or crash the daemon.
934 /// `data_root` is expected to already be the palace registry directory —
935 /// `main.rs` resolves it via [`resolve_palace_registry_dir`] before
936 /// constructing the `AppState`, so the flat / legacy-`palaces/` layout
937 /// difference is handled exactly once.
938 /// Test: `tests::load_palaces_from_disk_rehydrates_registry` writes two
939 /// palaces into a tempdir, constructs an `AppState`, calls this method, and
940 /// asserts the returned count and registry contents.
941 pub async fn load_palaces_from_disk(&self) -> Result<usize> {
942 let registry_dir = self.data_root.clone();
943 let registry = self.registry.clone();
944 let palace_names = self.palace_names.clone();
945 // The directory walk and each `PalaceHandle::open` perform blocking
946 // filesystem + redb/usearch I/O — run the whole hydration on the
947 // blocking pool so it never parks an async worker thread.
948 let count = tokio::task::spawn_blocking(move || -> Result<usize> {
949 let palaces = PalaceRegistry::list_palaces(®istry_dir)?;
950 let total = palaces.len();
951 let mut loaded = 0usize;
952 let mut skipped = 0usize;
953 for palace in palaces {
954 match trusty_common::memory_core::PalaceHandle::open(&palace) {
955 Ok(handle) => {
956 tracing::debug!(
957 palace = %palace.id,
958 data_dir = %palace.data_dir.display(),
959 "loaded palace from disk"
960 );
961 // Issue #228: seed the in-memory name cache so write
962 // hot paths (memory_remember / memory_note) can resolve
963 // the friendly palace name without re-walking the data
964 // root. Insert here (during hydration) is the single
965 // point of truth for restart-time population.
966 palace_names.insert(palace.id.0.clone(), palace.name.clone());
967 registry.register_arc(handle);
968 loaded += 1;
969 }
970 Err(e) => {
971 // Why (issue #467): a single bad palace (corrupt kg.db,
972 // stale WAL, EMFILE — "Too many open files", permissions)
973 // must never abort startup or block the HTTP server from
974 // binding. Log per-palace and keep going; the summary
975 // below tells operators how many were skipped without
976 // trawling the log.
977 // The palace is NOT registered in the in-memory registry,
978 // so the next `open_palace` call for this id will attempt
979 // a fresh open from disk — the lazy-reopen path. If the
980 // root cause was EMFILE and the fd-limit fix (#462) raised
981 // the soft limit to 8192, that first request will succeed.
982 tracing::warn!(
983 palace = %palace.id,
984 data_dir = %palace.data_dir.display(),
985 "skipping palace during startup hydration: {e:#}; \
986 will retry lazily on first access"
987 );
988 skipped += 1;
989 }
990 }
991 }
992 tracing::info!(
993 "palace hydration summary: loaded {loaded}/{total} ({skipped} skipped due to errors)"
994 );
995 Ok(loaded)
996 })
997 .await
998 .map_err(|e| anyhow::anyhow!("join load_palaces_from_disk: {e}"))??;
999 Ok(count)
1000 }
1001
1002 /// Builder-style: attach the daemon's shared [`LogBuffer`] so the
1003 /// `GET /api/v1/logs/tail` endpoint serves the same lines the tracing
1004 /// subscriber captures (issue #35).
1005 ///
1006 /// Why: `main` builds the buffer (via `init_tracing_with_buffer`) before
1007 /// constructing the `AppState`, then hands a clone here so the HTTP
1008 /// handler and the tracing layer observe the same ring.
1009 /// What: replaces the empty default buffer with the supplied one.
1010 /// Test: `logs_tail_returns_recent_lines`.
1011 #[must_use]
1012 pub fn with_log_buffer(mut self, buffer: trusty_common::log_buffer::LogBuffer) -> Self {
1013 self.log_buffer = buffer;
1014 self
1015 }
1016
1017 /// Builder-style: attach the bug-capture `ErrorStore` handle (bug-reporting #478).
1018 ///
1019 /// Why: Phase 2 MCP / HTTP endpoints need a handle to the in-memory error
1020 /// ring so they can serve `recent_errors` / `errors_by_fingerprint`
1021 /// without disk I/O on the hot path. Installing it here — rather than
1022 /// adding it as a separate global — keeps the state graph explicit and
1023 /// lets tests skip it by never calling this method.
1024 /// What: stores `Some(store)` in `AppState::error_store`; the `BugCaptureLayer`
1025 /// that writes to this store is already installed in the tracing
1026 /// subscriber by `init_tracing_with_buffer_and_capture`. The store is
1027 /// `Clone` (cheap `Arc` clone internally) so both the layer and this
1028 /// field share the same underlying ring.
1029 /// Test: Phase 2 will add `error_store_captures_and_queries` in `web.rs`.
1030 #[must_use]
1031 pub fn with_error_store(mut self, store: trusty_common::error_capture::ErrorStore) -> Self {
1032 self.error_store = Some(store);
1033 self
1034 }
1035
1036 /// Send a `DaemonEvent` to all connected SSE subscribers and persist
1037 /// it to the activity log when the variant carries a source.
1038 ///
1039 /// Why: Mutating handlers call this after a successful write so the
1040 /// dashboard can update without polling. The send is best-effort —
1041 /// `broadcast::Sender::send` returns `Err` only when there are no live
1042 /// receivers, which is fine (no listeners == no work to do). Issue
1043 /// #96 additionally writes the entry to the persistent activity log
1044 /// so the feed can serve historical rows on page load and so MCP /
1045 /// HTTP / Hook origins are visible to the operator. Persistence is
1046 /// also best-effort — a write failure is logged but never blocks the
1047 /// SSE broadcast.
1048 ///
1049 /// Issue #232: the activity-log append is a synchronous redb write +
1050 /// fsync. Calling it directly on the async caller's task parked a tokio
1051 /// worker thread on disk I/O for every SSE event. We now offload the
1052 /// append to the blocking thread pool via `spawn_blocking` and return
1053 /// immediately — `emit` stays synchronous so every existing caller
1054 /// (including the sync `dispatch_hook_fired` JSON-RPC handler) keeps
1055 /// compiling unchanged. The fire-and-forget pattern matches the
1056 /// pre-fix semantics (best-effort, never blocks the SSE broadcast)
1057 /// while freeing the async runtime to do real work during the write.
1058 /// What: serialises the event for the log (skipping `StatusChanged`
1059 /// which is a recomputed aggregate, not a mutation), spawns the redb
1060 /// append on `tokio::task::spawn_blocking` keyed by a clone of the
1061 /// `Arc<ActivityLog>` and the cloned event, then sends the event over
1062 /// the broadcast channel. A `pending_activity_writes` counter is bumped
1063 /// before the spawn and decremented inside the closure so
1064 /// [`Self::flush_activity_writes`] can drain in tests.
1065 /// Test: `web::tests::sse_stream_receives_palace_created` confirms a
1066 /// subscriber observes the emitted event;
1067 /// `activity_endpoint_lists_recent_emits` confirms persistence via
1068 /// `flush_activity_writes`.
1069 pub fn emit(&self, event: DaemonEvent) {
1070 if let Some(source) = event.source() {
1071 let event_type = event.type_str();
1072 let palace_id = event.palace_id().map(|s| s.to_string());
1073 let log = Arc::clone(&self.activity_log);
1074 let event_for_log = event.clone();
1075 let pending = Arc::clone(&self.pending_activity_writes);
1076 // Pre-allocate the sequence id in the emitting thread so the
1077 // persisted order matches the emission order even when blocking-pool
1078 // workers execute the writes concurrently (issue #247). Without
1079 // this, four rapid emits would assign IDs inside their respective
1080 // `spawn_blocking` closures in a non-deterministic order.
1081 let id = log.alloc_id();
1082 pending.fetch_add(1, Ordering::SeqCst);
1083 // Why: the synchronous redb append + fsync must not park an
1084 // async worker thread (issue #232). Spawn the write on the
1085 // blocking pool; the JoinHandle is intentionally dropped —
1086 // the write is best-effort and any failure is logged below.
1087 tokio::task::spawn_blocking(move || {
1088 let result = log.append_with_id(id, source, palace_id, event_type, &event_for_log);
1089 if let Err(e) = result {
1090 tracing::warn!("activity_log.append failed for {event_type}: {e:#}");
1091 }
1092 pending.fetch_sub(1, Ordering::SeqCst);
1093 });
1094 }
1095 let _ = self.events.send(event);
1096 }
1097
1098 /// Block (asynchronously) until every in-flight activity-log write
1099 /// spawned by [`Self::emit`] has settled.
1100 ///
1101 /// Why: `emit` offloads its redb append to `tokio::task::spawn_blocking`
1102 /// and returns immediately (issue #232). Tests that observe the
1103 /// activity log right after a burst of emits would otherwise race the
1104 /// blocking-pool worker; this helper gives them a deterministic
1105 /// synchronization point. Production code never needs to call this —
1106 /// the dashboard reads through `GET /api/v1/activity`, which already
1107 /// tolerates writes settling asynchronously.
1108 /// What: spins on `pending_activity_writes` with a 1 ms yield until the
1109 /// counter is zero. Cheap: tests typically emit a handful of events
1110 /// and the loop exits within a single scheduler tick.
1111 /// Test: covered indirectly by `emit_persists_mutations_but_skips_status_changed`
1112 /// and `web::tests::activity_endpoint_lists_recent_emits`.
1113 pub async fn flush_activity_writes(&self) {
1114 while self.pending_activity_writes.load(Ordering::SeqCst) > 0 {
1115 tokio::time::sleep(std::time::Duration::from_millis(1)).await;
1116 }
1117 }
1118
1119 /// Open (or return cached) the chat-session store for a palace.
1120 ///
1121 /// Why: Chat session persistence lives in a dedicated SQLite file under
1122 /// the palace's data dir (`chat_sessions.db`) so it doesn't intermingle
1123 /// with the KG's transactional load. The store is cheap to clone via
1124 /// `Arc` but the underlying r2d2 pool should be reused, so cache by id.
1125 /// What: Creates the palace data dir if missing, opens (or reuses) a
1126 /// `ChatSessionStore` and stashes an `Arc` in the DashMap.
1127 /// Test: Indirectly via the session HTTP handlers in `web::tests`.
1128 pub fn session_store(&self, palace_id: &str) -> Result<Arc<ChatSessionStore>> {
1129 if let Some(entry) = self.session_stores.get(palace_id) {
1130 return Ok(entry.clone());
1131 }
1132 let dir = self.data_root.join(palace_id);
1133 std::fs::create_dir_all(&dir)
1134 .map_err(|e| anyhow::anyhow!("create palace dir {}: {e}", dir.display()))?;
1135 let store = Arc::new(ChatSessionStore::open(&dir.join("chat_sessions.db"))?);
1136 self.session_stores
1137 .insert(palace_id.to_string(), store.clone());
1138 Ok(store)
1139 }
1140
1141 /// Builder-style setter for the default palace name.
1142 ///
1143 /// Why: `serve --palace <name>` wants to bind every tool call to a
1144 /// project-scoped namespace without forcing every MCP request to repeat
1145 /// the palace argument.
1146 /// What: Returns `self` with `default_palace = Some(name)`.
1147 /// Test: `default_palace_used_when_arg_omitted` covers the resolution
1148 /// path; this setter is exercised there.
1149 pub fn with_default_palace(mut self, name: Option<String>) -> Self {
1150 self.default_palace = name;
1151 self
1152 }
1153
1154 /// Resolve (or initialize) the shared embedder.
1155 ///
1156 /// Why: FastEmbedder load is expensive — we share one instance across all
1157 /// tool calls; the `OnceCell` ensures concurrent first-use races collapse
1158 /// to a single load.
1159 /// What: Returns `Arc<FastEmbedder>` on success. Errors propagate from the
1160 /// underlying ONNX load.
1161 /// Test: Indirectly via `dispatch_remember_then_recall`.
1162 /// Resolve the active chat provider, auto-detecting on first call.
1163 ///
1164 /// Why: Provider selection depends on filesystem-loaded config plus a
1165 /// network probe (Ollama liveness), so it must be lazily initialised at
1166 /// runtime. Caching the choice in a `OnceCell` keeps it stable across
1167 /// concurrent requests without re-probing on every chat call.
1168 /// What: On first use loads `~/.trusty-memory/config.toml`, prefers an
1169 /// auto-detected Ollama instance (when `local_model.enabled`), and falls
1170 /// back to OpenRouter when an API key is set. Returns `Ok(None)` when
1171 /// neither is available so the caller can emit a 412.
1172 /// Test: `web::tests::providers_endpoint_returns_payload` covers the
1173 /// detection path indirectly through `/api/v1/chat/providers`.
1174 pub async fn chat_provider(&self) -> Option<Arc<dyn ChatProvider>> {
1175 self.chat_provider
1176 .get_or_init(|| async {
1177 // Why (issue #226): `service::load_user_config` is the
1178 // axum-free home of the loader; the `web::load_user_config`
1179 // re-export only exists for the HTTP handlers. Going
1180 // direct to `service` keeps this method usable when
1181 // the `axum-server` feature is disabled.
1182 let cfg = crate::service::load_user_config().unwrap_or_default();
1183 if cfg.local_model.enabled {
1184 if let Some(mut p) =
1185 trusty_common::auto_detect_local_provider(&cfg.local_model.base_url).await
1186 {
1187 // auto_detect returns an empty model id; callers must
1188 // set the configured model name themselves.
1189 p.model = cfg.local_model.model.clone();
1190 return Some(Arc::new(p) as Arc<dyn ChatProvider>);
1191 }
1192 }
1193 if !cfg.openrouter_api_key.is_empty() {
1194 return Some(Arc::new(trusty_common::OpenRouterProvider::new(
1195 cfg.openrouter_api_key,
1196 cfg.openrouter_model,
1197 )) as Arc<dyn ChatProvider>);
1198 }
1199 None
1200 })
1201 .await
1202 .clone()
1203 }
1204
1205 /// Spawn a fire-and-forget background task that auto-discovers project
1206 /// aliases under `project_root` and asserts new ones into `palace`.
1207 ///
1208 /// Why (issue #42): Projects carry implicit shorthand — cargo package
1209 /// names that differ from their directory, binary names that differ
1210 /// from packages, first-letter abbreviations — that should be surfaced
1211 /// without a user ever calling `add_alias`. Running discovery as a
1212 /// detached task on palace-open keeps startup latency unchanged: the
1213 /// daemon binds and starts serving immediately while the discovery scan
1214 /// completes in the background, and any newly-asserted aliases land in
1215 /// the prompt cache before the model's next `get_prompt_context` call.
1216 /// What: clones `self` (cheap; `Arc`-backed), spawns a tokio task that
1217 /// invokes the `discover_aliases` tool handler directly so the
1218 /// dedup + cache-rebuild logic runs exactly the same path as the MCP
1219 /// tool call. Errors are logged at `warn!`; one failed discovery never
1220 /// destabilises the daemon.
1221 /// Test: not unit-tested (timing-dependent fire-and-forget); the
1222 /// underlying `discover_aliases` dispatch is covered by
1223 /// `dispatch_discover_aliases_inserts_new_and_dedupes` in `tools::tests`.
1224 pub fn spawn_alias_discovery(&self, palace: String, project_root: PathBuf) {
1225 let state = self.clone();
1226 tokio::spawn(async move {
1227 let args = serde_json::json!({
1228 "palace": palace,
1229 "project_root": project_root.to_string_lossy(),
1230 });
1231 match tools::dispatch_tool(&state, "discover_aliases", args).await {
1232 Ok(result) => tracing::info!(
1233 new = ?result.get("new"),
1234 already_known = ?result.get("already_known"),
1235 "alias discovery complete"
1236 ),
1237 Err(e) => tracing::warn!("alias discovery failed: {e:#}"),
1238 }
1239 });
1240 }
1241
1242 /// Return the current readiness state.
1243 ///
1244 /// Why: tool handlers and the `/health` endpoint need a cheap, lock-free
1245 /// way to check whether the embedder has been initialised yet.
1246 /// What: loads `daemon_readiness` with `Acquire` ordering so the caller
1247 /// sees all writes the startup task made before setting the state.
1248 /// Test: `daemon_readiness_transitions_warming_to_ready`.
1249 pub fn readiness(&self) -> DaemonReadiness {
1250 DaemonReadiness::from_u8(self.daemon_readiness.load(Ordering::Acquire))
1251 }
1252
1253 /// Flip the readiness state from `Warming` to `Ready`.
1254 ///
1255 /// Why: called by `spawn_startup_tasks` in `main.rs` once the embedder
1256 /// warm-up succeeds — this is the single state-transition site.
1257 /// What: `store(Ready, Release)` so subsequent `Acquire` loads in handlers
1258 /// observe a consistent state. Idempotent: calling it multiple times is
1259 /// harmless.
1260 /// Test: `daemon_readiness_transitions_warming_to_ready`.
1261 pub fn set_ready(&self) {
1262 self.daemon_readiness
1263 .store(DaemonReadiness::Ready as u8, Ordering::Release);
1264 }
1265
1266 /// Return `Ok(())` when `Ready`, or an explicit `Err` with the warming
1267 /// message when still `Warming`.
1268 ///
1269 /// Why: the preflight in every bounded handler calls this and returns the
1270 /// error immediately so no embedding / redb I/O is attempted while the
1271 /// daemon is still initialising (tracks #911 internally).
1272 /// What: cheaply reads `daemon_readiness`; returns the fast error string
1273 /// on `Warming`. Zero allocation on the happy path.
1274 /// Test: covered by `tools::tests::remember_returns_warming_error_while_state_is_warming`.
1275 pub fn readiness_check(&self) -> Result<()> {
1276 if self.readiness() == DaemonReadiness::Warming {
1277 return Err(anyhow::anyhow!(
1278 "trusty-memory is warming up (embedder initialising); \
1279 please retry in a few seconds"
1280 ));
1281 }
1282 Ok(())
1283 }
1284
1285 /// Obtain the shared `FastEmbedder` instance, initialising it on first call.
1286 ///
1287 /// Why: centralises lazy embedder access so every tool handler goes through
1288 /// one bounded init path (tracks #910 internally).
1289 /// What: wraps `OnceCell::get_or_try_init` with a timeout so a slow
1290 /// CoreML/CUDA first-compile cannot block a handler indefinitely. On
1291 /// timeout the `OnceCell` is left unresolved and the next caller retries.
1292 ///
1293 /// **Callers on the request path MUST call `readiness_check()` before
1294 /// this method.** The four guarded handlers (`memory_remember`,
1295 /// `memory_recall`, `memory_recall_deep`, `memory_note`) do so; any new
1296 /// handler that calls `embedder()` must follow the same pattern.
1297 /// Reaching this method while still `Warming` is not a bug — the warm-up
1298 /// task itself calls `embedder()` while in `Warming` state — but request
1299 /// handlers should have short-circuited before here via `readiness_check()`.
1300 ///
1301 /// The `readiness_check()` preflight is the PRIMARY guard (fast rejection
1302 /// with no I/O). This timeout is the last-resort backstop in case a
1303 /// handler bypasses the preflight or the warm-up task itself hits a
1304 /// pathological init delay. If this timeout fires the `OnceCell` is left
1305 /// in the unresolved state and the next call retries from scratch.
1306 pub async fn embedder(&self) -> Result<Arc<FastEmbedder>> {
1307 use trusty_common::memory_core::timeouts;
1308 let cell = self.embedder.clone();
1309 let timeout = timeouts::embedder_init_timeout();
1310 // `readiness_check()` is the PRIMARY guard — handlers return a fast
1311 // warming error before reaching here. This timeout is the last-resort
1312 // backstop: if the embedder init races past the preflight (e.g. in the
1313 // warm-up task itself, which calls embedder() while still Warming) or
1314 // the CoreML/CUDA compile stalls, we fail fast rather than blocking
1315 // indefinitely. On timeout the OnceCell stays unresolved; the next
1316 // caller will retry the init from scratch.
1317 let embedder = tokio::time::timeout(
1318 timeout,
1319 cell.get_or_try_init(|| async {
1320 let e = FastEmbedder::new().await?;
1321 Ok::<Arc<FastEmbedder>, anyhow::Error>(Arc::new(e))
1322 }),
1323 )
1324 .await
1325 .map_err(|_| {
1326 anyhow::anyhow!(
1327 "AppState::embedder() timed out after {:?}; \
1328 the CoreML/CUDA model is taking unusually long to compile — \
1329 increase TRUSTY_EMBEDDER_INIT_TIMEOUT_SECS if needed",
1330 timeout
1331 )
1332 })??
1333 .clone();
1334 Ok(embedder)
1335 }
1336}
1337
1338impl std::fmt::Debug for AppState {
1339 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1340 f.debug_struct("AppState")
1341 .field("version", &self.version)
1342 .field("data_root", &self.data_root)
1343 .field("registry_len", &self.registry.len())
1344 .finish()
1345 }
1346}
1347
1348/// Handle a single MCP JSON-RPC message and produce its response.
1349///
1350/// Why: Pulled out of the stdio loop so unit tests can drive every method
1351/// without touching real stdin/stdout.
1352/// What: Routes `initialize`, `tools/list`, `tools/call`, `ping`, and the
1353/// `notifications/initialized` notification (which returns `Value::Null`).
1354/// Test: See unit tests below — initialize/list/call all return expected
1355/// JSON-RPC envelopes; notifications return `Null` (no response written).
1356pub async fn handle_message(state: &AppState, msg: Value) -> Value {
1357 let id = msg.get("id").cloned().unwrap_or(Value::Null);
1358 let method = msg.get("method").and_then(|m| m.as_str()).unwrap_or("");
1359
1360 match method {
1361 "initialize" => {
1362 let extra = state
1363 .default_palace
1364 .as_ref()
1365 .map(|dp| json!({ "default_palace": dp }));
1366 let result = initialize_response("trusty-memory", &state.version, extra);
1367 // Why (issue #42): prompt-facts now flow through the
1368 // per-message `get_prompt_context` tool rather than MCP
1369 // prompts, so we no longer advertise the `prompts` capability.
1370 json!({
1371 "jsonrpc": "2.0",
1372 "id": id,
1373 "result": result,
1374 })
1375 }
1376 // Notifications must NOT receive a response.
1377 "notifications/initialized" | "notifications/cancelled" => Value::Null,
1378 "tools/list" => json!({
1379 "jsonrpc": "2.0",
1380 "id": id,
1381 "result": tools::tool_definitions_with(state.default_palace.is_some())
1382 }),
1383 // OpenRPC 1.3.2 discovery — see `openrpc.rs`. Returns the full
1384 // service description so orchestrators (open-mpm, etc.) can
1385 // introspect every tool and its required `memory.read`/`memory.write`
1386 // scope without bespoke per-server adapters.
1387 "rpc.discover" => json!({
1388 "jsonrpc": "2.0",
1389 "id": id,
1390 "result": openrpc::build_discover_response(
1391 &state.version,
1392 state.default_palace.is_some(),
1393 ),
1394 }),
1395 "tools/call" => {
1396 let params = msg.get("params").cloned().unwrap_or_default();
1397 let tool_name = params
1398 .get("name")
1399 .and_then(|n| n.as_str())
1400 .unwrap_or("")
1401 .to_string();
1402 let args = params.get("arguments").cloned().unwrap_or_default();
1403 match tools::dispatch_tool(state, &tool_name, args).await {
1404 Ok(content) => {
1405 // Why: tools that return a bare JSON string (e.g.
1406 // `get_prompt_context` returning the formatted
1407 // Markdown block) should surface as plain text in the
1408 // MCP `content[0].text` field — wrapping in
1409 // `Value::to_string()` would re-quote the payload and
1410 // force every caller to strip outer quotes.
1411 let text = match &content {
1412 Value::String(s) => s.clone(),
1413 other => other.to_string(),
1414 };
1415 json!({
1416 "jsonrpc": "2.0",
1417 "id": id,
1418 "result": {
1419 "content": [{"type": "text", "text": text}]
1420 }
1421 })
1422 }
1423 Err(e) => json!({
1424 "jsonrpc": "2.0",
1425 "id": id,
1426 // Why: anyhow's `{:#}` alternate format walks the full
1427 // `Caused by:` chain so MCP clients see actionable
1428 // detail (e.g. "PalaceHandle::remember_with_options:
1429 // filter rejected: too short") instead of just the
1430 // outermost context label.
1431 "error": {"code": -32603, "message": format!("{e:#}")}
1432 }),
1433 }
1434 }
1435 "ping" => json!({"jsonrpc": "2.0", "id": id, "result": {}}),
1436 _ => json!({
1437 "jsonrpc": "2.0",
1438 "id": id,
1439 "error": {
1440 "code": -32601,
1441 "message": format!("Method not found: {method}")
1442 }
1443 }),
1444 }
1445}
1446
/// Preferred starting port for the trusty-memory HTTP daemon.
///
/// Why: keeps the well-known default stable for clients that have hard-coded
/// `127.0.0.1:7070` in their configuration, while still allowing dynamic
/// walking when the port is in use (`DYNAMIC_PORT_RANGE` ports starting here).
/// What: `7070` — historic default, matches the launchd plist's prior value.
/// `bind_dynamic_port` tries this port first on `127.0.0.1`.
/// Test: covered indirectly by `bind_dynamic_port_returns_listener`.
pub const DEFAULT_HTTP_PORT: u16 = 7070;
1455
/// Number of consecutive ports `bind_dynamic_port` walks before falling back
/// to the OS-assigned port. Matches the trusty-search convention.
///
/// What: `10` — together with `DEFAULT_HTTP_PORT` this gives the preferred
/// range 7070..=7079. `bind_dynamic_port` passes `DYNAMIC_PORT_RANGE - 1` to
/// `trusty_common::bind_with_auto_port` (presumably the number of additional
/// ports to try after the first — confirm against that helper).
const DYNAMIC_PORT_RANGE: u16 = 10;
1459
1460/// Path to the canonical address-discovery file for the trusty-memory daemon.
1461///
1462/// Why: clients (CLI, MCP tools, dashboards) need to find the running daemon
1463/// without configuration when the port was selected dynamically. Using
1464/// `trusty_common::resolve_data_dir` aligns this path with the location
1465/// that `trusty_common::read_daemon_addr("trusty-memory")` reads from, so
1466/// `prompt-context`, `doctor`, and `start`'s probe all find the running daemon.
1467/// The old `~/.trusty-memory/http_addr` path and the new
1468/// `~/Library/Application Support/trusty-memory/http_addr` (macOS) path were
1469/// divergent — the daemon wrote one; readers expected the other.
1470/// What: returns `{resolve_data_dir("trusty-memory")}/http_addr`, or `None` if
1471/// the data dir cannot be resolved (locked-down container, no passwd entry).
1472/// Test: `http_addr_path_uses_resolve_data_dir`.
1473pub fn http_addr_path() -> Option<PathBuf> {
1474 trusty_common::resolve_data_dir("trusty-memory")
1475 .ok()
1476 .map(|d| d.join("http_addr"))
1477}
1478
1479/// Bind a `TcpListener` to `127.0.0.1`, dynamically selecting a port.
1480///
1481/// Why: the historic default `7070` is convenient for clients but a stale
1482/// process or a second daemon must not produce a noisy failure. Walking
1483/// `DEFAULT_HTTP_PORT..DEFAULT_HTTP_PORT+DYNAMIC_PORT_RANGE` first preserves
1484/// backwards compatibility for the common case; OS-assigned fallback (`:0`)
1485/// guarantees the daemon always comes up even when every preferred port is
1486/// busy.
1487/// What: returns the first successful `TcpListener` (7070..=7079, then
1488/// OS-assigned); caller inspects `local_addr()` to learn the chosen port.
1489/// Test: `bind_dynamic_port_returns_listener` confirms it always binds *some*
1490/// port even after another listener occupies the preferred one.
1491pub async fn bind_dynamic_port() -> Result<tokio::net::TcpListener> {
1492 let preferred: SocketAddr = SocketAddr::from(([127, 0, 0, 1], DEFAULT_HTTP_PORT));
1493 // First: walk the preferred range (7070..=7079).
1494 if let Ok(listener) =
1495 trusty_common::bind_with_auto_port(preferred, DYNAMIC_PORT_RANGE - 1).await
1496 {
1497 return Ok(listener);
1498 }
1499 // Last resort: ask the kernel for any free port. `bind_with_auto_port`
1500 // with `:0` resolves immediately to the OS-assigned port.
1501 tracing::warn!(
1502 "all ports {DEFAULT_HTTP_PORT}..{} in use; requesting OS-assigned port",
1503 DEFAULT_HTTP_PORT + DYNAMIC_PORT_RANGE - 1
1504 );
1505 let any: SocketAddr = SocketAddr::from(([127, 0, 0, 1], 0));
1506 trusty_common::bind_with_auto_port(any, 0).await
1507}
1508
/// Write the bound `host:port` to the discovery file at `path` atomically.
1510///
1511/// Why: clients must read the file mid-write without observing a partial
1512/// value. Writing to a `.tmp` sibling and renaming over the target gives
1513/// POSIX atomicity, matching the trusty-search implementation.
1514/// What: creates the parent directory if missing; writes `addr` followed by a
1515/// trailing newline (avoids the "no newline at end of file" warnings from
1516/// `cat`); renames `.tmp` → `http_addr`. Best-effort: I/O errors are
1517/// returned to the caller so `run_http_on` can log without panicking.
1518/// Test: `http_addr_file_round_trip_via_helpers`.
1519#[cfg(feature = "axum-server")]
fn write_http_addr_file(path: &Path, addr: &SocketAddr) -> std::io::Result<()> {
    use std::io::Write;

    // Make sure the directory that will hold the file exists.
    if let Some(parent) = path.parent() {
        std::fs::create_dir_all(parent)?;
    }

    // Stage the content in a sibling file and rename it over the target so
    // readers never observe a half-written address.
    let tmp = path.with_extension("addr.tmp");
    let publish = || -> std::io::Result<()> {
        {
            let mut staged = std::fs::File::create(&tmp)?;
            // Trailing newline keeps `cat` output tidy.
            writeln!(staged, "{addr}")?;
            staged.sync_all()?;
        }
        std::fs::rename(&tmp, path)
    };

    let outcome = publish();
    if outcome.is_err() {
        // Do not leave an orphaned staging file behind when the write, the
        // sync or the rename failed. Cleanup is best-effort; the original
        // error is what the caller gets.
        let _ = std::fs::remove_file(&tmp);
    }
    outcome
}
1534
1535/// Return `true` when a non-default data directory is in effect.
1536///
1537/// Why (issue #880): two startup side-effects must be suppressed when the
1538/// daemon runs with an isolated/overridden data root:
1539/// 1. The legacy `~/.trusty-memory/http_addr` dotfile write — it would
1540/// overwrite the real production daemon's discovery file with the isolated
1541/// instance's throwaway address.
1542/// 2. The startup pin-scan — it reads project pin files from the **real**
1543/// user environment (~/Projects, ~/Developer, …) and imports palaces from
1544/// the real environment into the isolated data root, defeating isolation.
1545///
1546/// A "non-default data dir" means `TRUSTY_DATA_DIR_OVERRIDE` is set to a
1547/// non-empty, non-whitespace value. Empty or whitespace-only values are
1548/// treated as unset (same rule as `resolve_data_dir`), so an accidental blank
1549/// env var does not suppress the dotfile write on real production instances.
1550/// What: reads `TRUSTY_DATA_DIR_OVERRIDE`; returns `true` when it contains a
1551/// non-empty, non-whitespace string. Returns `false` otherwise.
1552/// Test: `is_data_dir_override_active_when_set`,
1553/// `is_data_dir_override_inactive_when_unset`,
1554/// `is_data_dir_override_inactive_when_blank`.
1555#[inline]
1556pub fn is_data_dir_override_active() -> bool {
1557 matches!(
1558 std::env::var(trusty_common::DATA_DIR_OVERRIDE_ENV),
1559 Ok(v) if !v.trim().is_empty()
1560 )
1561}
1562
/// Compute the legacy dotfile discovery path `~/.trusty-memory/http_addr`.
///
/// Why (issue #498): external tooling such as claude-mpm's
/// `migrate_trusty_autodetect` reads `~/.trusty-memory/http_addr` to find
/// the running daemon's port. On macOS `resolve_data_dir("trusty-memory")`
/// resolves to `~/Library/Application Support/trusty-memory/`, not
/// `~/.trusty-memory/`, so the daemon wrote to the OS-standard location
/// while those readers looked at the dotfile. Publishing to both locations
/// serves readers of either convention.
///
/// Fix #880: yields `None` while `TRUSTY_DATA_DIR_OVERRIDE` is active so an
/// isolated instance (test rig, CI, parallel run) never overwrites the real
/// production daemon's discovery dotfile.
///
/// What: `$HOME/.trusty-memory/http_addr` in the default (production) case;
/// `None` when a data-dir override is active (see
/// `is_data_dir_override_active`) or when `dirs::home_dir()` has no answer.
/// Test: `dotfile_http_addr_path_uses_home_dir`,
/// `dotfile_suppressed_when_override_active`.
#[cfg(feature = "axum-server")]
fn dotfile_http_addr_path() -> Option<PathBuf> {
    // Fix #880: the override check comes first — the shared dotfile is off
    // limits for isolated instances regardless of whether $HOME resolves.
    if is_data_dir_override_active() {
        return None;
    }
    let home = dirs::home_dir()?;
    Some(home.join(".trusty-memory").join("http_addr"))
}
1590
/// Serve the HTTP/SSE + web admin surface on an already-bound listener.
///
/// Why: A long-running daemon mode lets non-stdio clients (browsers, curl,
/// future remote agents) reach `/health`, the `/api/v1/*` REST surface and
/// the embedded admin SPA. The Unix-domain-socket transport and the
/// `trusty-memory-mcp-bridge` binary were removed in PR3 of the #914
/// stdio-cutover epic; the canonical MCP integration is now
/// `trusty-memory serve --stdio` (PR1 #919).
/// What: starts the disk-size and status-event tickers, records the bound
/// address on `AppState.bound_addr`, publishes it to the discovery file from
/// `http_addr_path()` and to the dotfile from `dotfile_http_addr_path()`
/// (both best-effort), then serves `web::router()` plus `/sse` until the
/// shutdown signal fires. Afterwards it removes whichever discovery files it
/// wrote (a stale file with the wrong port is worse than a missing one),
/// shuts down the BM25 supervisor if one exists, and finally propagates any
/// error `axum::serve` returned. The caller supplies the listener so port
/// selection lives at the call site.
/// Test: `cargo test -p trusty-memory web::tests` exercises the router shape;
/// manual: `curl http://127.0.0.1:<port>/health` returns `ok` with `addr`.
#[cfg(feature = "axum-server")]
pub async fn run_http_on(state: AppState, listener: tokio::net::TcpListener) -> Result<()> {
    use axum::routing::get;

    // Issue #35: keep `disk_bytes` fresh from a background task so
    // `GET /health` never walks the data directory on the request path.
    spawn_disk_size_ticker(state.clone());

    // Issue #228: aggregate `StatusChanged` events are emitted on a fixed
    // cadence instead of on every drawer write, which keeps the O(N palaces)
    // recompute off the write hot path.
    spawn_status_event_ticker(state.clone());

    // Advertise the bound address BEFORE serving so the first request
    // handler and the discovery files both see the real port.
    let (primary_file, dotfile) = match listener.local_addr() {
        Err(_) => (None, None),
        Ok(bound) => {
            // Stash on state for handlers (e.g. /health) to surface.
            let _ = state.bound_addr.set(bound);
            info!("HTTP server listening on http://{bound}");
            eprintln!("HTTP server listening on http://{bound}");

            // Primary: the OS-standard data dir, which is what
            // `trusty_common::read_daemon_addr` reads. A missing $HOME or a
            // read-only filesystem is non-fatal.
            let primary = if let Some(target) = http_addr_path() {
                match write_http_addr_file(&target, &bound) {
                    Ok(()) => {
                        info!("wrote daemon address to {}", target.display());
                        Some(target)
                    }
                    Err(e) => {
                        tracing::warn!("could not write {}: {e}", target.display());
                        None
                    }
                }
            } else {
                tracing::warn!("no $HOME — skipping http_addr discovery file");
                None
            };

            // Issue #498: also publish to `~/.trusty-memory/http_addr` for
            // external tools that read the dotfile path. Failures are
            // logged but never block startup.
            let legacy = dotfile_http_addr_path().and_then(|target| {
                match write_http_addr_file(&target, &bound) {
                    Ok(()) => {
                        info!("wrote daemon address to dotfile {}", target.display());
                        Some(target)
                    }
                    Err(e) => {
                        tracing::warn!("could not write dotfile {}: {e}", target.display());
                        None
                    }
                }
            });

            (primary, legacy)
        }
    };

    // Grab the BM25 supervisor handle (if any) before `state` moves into the
    // router, so it can be shut down on the exit path.
    let bm25_supervisor = state.bm25_supervisor.clone();

    let app = web::router()
        .route("/sse", get(sse_handler))
        .with_state(state);

    // Why (issue #534): bare `axum::serve` only returns on an internal
    // error, so SIGTERM (launchctl bootout) would kill the process before
    // the cleanup below could run. With graceful shutdown, axum stops
    // accepting connections, drains active requests and then returns here.
    let serve_result = axum::serve(listener, app)
        .with_graceful_shutdown(trusty_common::shutdown_signal())
        .await;

    // Best-effort cleanup of both discovery files (#498) so stale clients
    // fail fast instead of timing out against a dead port.
    for stale in [primary_file, dotfile].into_iter().flatten() {
        let _ = std::fs::remove_file(&stale);
    }

    // Issue #193: reap spawned BM25 daemons gracefully so each can flush its
    // snapshot and unlink its socket, rather than being SIGKILLed on drop.
    if let Some(supervisor) = bm25_supervisor {
        supervisor.shutdown().await;
    }

    Ok(serve_result?)
}
1721
/// Convenience wrapper: bind exactly `addr`, then serve via [`run_http_on`].
///
/// What: a bind failure is returned to the caller before any serving starts;
/// otherwise the result of [`run_http_on`] is returned unchanged.
#[cfg(feature = "axum-server")]
pub async fn run_http(state: AppState, addr: std::net::SocketAddr) -> Result<()> {
    let bound = tokio::net::TcpListener::bind(addr).await?;
    run_http_on(state, bound).await
}
1728
/// Convenience wrapper: pick a port dynamically (7070..=7079, then an
/// OS-assigned one) and serve.
///
/// Why: `trusty-memory serve` without `--http` is the canonical
/// launchd-managed daemon entry point. Dynamic binding lets a stale daemon
/// or a hand-spawned `serve --http 127.0.0.1:7070` coexist with the
/// launchd-managed instance instead of breaking it.
/// What: obtains a listener from [`bind_dynamic_port`] and hands it to
/// [`run_http_on`].
/// Test: integration via `trusty-memory serve` + `cat ~/.trusty-memory/http_addr`.
#[cfg(feature = "axum-server")]
pub async fn run_http_dynamic(state: AppState) -> Result<()> {
    let bound = bind_dynamic_port().await?;
    run_http_on(state, bound).await
}
1742
/// Spawn a background ticker that recomputes the `data_root` disk footprint
/// every 10 seconds and stores it in `state.disk_bytes` (issue #35).
///
/// Why: `GET /health` reports `disk_bytes`. Walking the data directory on
/// every health request would turn a frequent health poll into unbounded
/// recursive I/O. Computing it off the request path on a fixed cadence keeps
/// `/health` cheap and bounds the staleness to ~10 s — fine for an
/// at-a-glance footprint figure.
/// What: spawns a detached tokio task holding the given `AppState`. Each
/// tick runs the directory walk on `spawn_blocking` so it never stalls the
/// async runtime, then stores the byte total atomically. If the blocking
/// task fails to join (it panicked or was cancelled) the previous figure is
/// kept and a warning is logged, instead of overwriting a good value with
/// `0`. Missed ticks are delayed rather than replayed in a burst, so a walk
/// that takes longer than the period is not immediately followed by
/// back-to-back catch-up walks.
/// Test: `health_endpoint_includes_resource_fields` asserts the field shape;
/// the ticker cadence is not unit-tested (timing-dependent).
#[cfg(feature = "axum-server")]
fn spawn_disk_size_ticker(state: AppState) {
    tokio::spawn(async move {
        let mut interval = tokio::time::interval(std::time::Duration::from_secs(10));
        // Why: the default behaviour (`Burst`) fires every missed tick as
        // fast as possible once a slow walk finishes; `Delay` restarts the
        // 10 s period from the moment the late tick is observed.
        interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay);
        loop {
            interval.tick().await;
            let dir = state.data_root.clone();
            // The directory walk is blocking filesystem I/O — run it on the
            // blocking pool so it never parks an async worker thread.
            let walk = tokio::task::spawn_blocking(move || {
                trusty_common::sys_metrics::dir_size_bytes(&dir)
            })
            .await;
            match walk {
                Ok(bytes) => state
                    .disk_bytes
                    .store(bytes, std::sync::atomic::Ordering::Relaxed),
                // Keep the last good figure; a sudden `0` would be misleading.
                Err(e) => tracing::warn!("disk size walk failed; keeping previous value: {e}"),
            }
        }
    });
}
1778
/// Interval, in seconds, between aggregate-status snapshot emits on the SSE
/// bus.
///
/// Why (issue #228): mutations used to fire `StatusChanged` synchronously on
/// the write path, which forced an O(N palaces) sum of drawer / vector / KG
/// counts on every `memory_remember`. Coalescing into a fixed-cadence ticker
/// lets dashboards stay current (a 30 s lag is invisible at human scale)
/// while keeping the write path free of aggregate work.
/// What: 30 seconds — short enough that the operator UI doesn't feel stale
/// between manual writes, long enough that the recompute cost (in-memory
/// registry walk plus the redb `count_active_triples` per palace) is a
/// rounding error on the daemon's CPU budget. Consumed by
/// `spawn_status_event_ticker`.
/// Test: covered indirectly — the math has not changed, only the cadence.
// NOTE(review): `dead_code` is presumably allowed because the only visible
// consumer chain ends in `run_http_on`, which is compiled out without the
// `axum-server` feature — confirm.
#[allow(dead_code)]
const STATUS_EVENT_TICK_SECS: u64 = 30;
1793
1794/// Spawn a background ticker that emits `DaemonEvent::StatusChanged` every
1795/// [`STATUS_EVENT_TICK_SECS`] seconds (issue #228).
1796///
1797/// Why: replaces the per-write `state.emit(self.aggregate_status_event())`
1798/// call sites that used to recompute the aggregate every time a drawer was
1799/// created or deleted. Walking N palaces on every write blocks the async
1800/// runtime; coalescing the emit onto a ticker keeps dashboards up-to-date
1801/// without that cost.
1802/// What: spawns a detached tokio task that holds a full `AppState` clone
1803/// (cheap — every field is `Arc`-backed) and ticks every
1804/// [`STATUS_EVENT_TICK_SECS`] seconds. Each tick computes
1805/// `MemoryService::aggregate_status_event` (which now iterates the
1806/// in-memory registry, not disk) and broadcasts it via `state.emit`. If
1807/// no SSE subscribers are connected the broadcast `send` is a cheap no-op,
1808/// so the ticker imposes no cost when nobody is listening.
1809/// Test: not unit-tested (timing-dependent fire-and-forget); the underlying
1810/// `aggregate_status_event` math is exercised by the existing
1811/// `status_endpoint_returns_payload` path.
1812#[allow(dead_code)]
1813fn spawn_status_event_ticker(state: AppState) {
1814 tokio::spawn(async move {
1815 let mut interval =
1816 tokio::time::interval(std::time::Duration::from_secs(STATUS_EVENT_TICK_SECS));
1817 // The first tick fires immediately, which is fine: it gives SSE
1818 // subscribers a baseline `StatusChanged` shortly after they connect.
1819 loop {
1820 interval.tick().await;
1821 let event = service::MemoryService::new(state.clone()).aggregate_status_event();
1822 state.emit(event);
1823 }
1824 });
1825}
1826
/// Live SSE event stream — pushes `DaemonEvent` frames to dashboard clients.
///
/// Why: The dashboard subscribes once and reacts to live pushes (palace
/// created, drawer added/deleted, dream completed, status changed) instead of
/// polling `/api/v1/*` endpoints.
/// What: Subscribes to `state.events`, emits an initial `connected` frame,
/// then forwards every `DaemonEvent` as `data: <json>\n\n`. Lagged
/// subscribers receive a `lag` frame indicating skipped events; channel
/// closure ends the stream. If an event cannot be serialised, an `error`
/// frame is sent whose payload is built with `serde_json`, so the message is
/// always correctly escaped (quotes or newlines in the error text can no
/// longer produce malformed JSON or break the SSE framing).
/// Test: `web::tests::sse_stream_emits_palace_created` (covers subscribe +
/// emit + receive); manual: `curl -N http://.../sse`.
#[cfg(feature = "axum-server")]
pub(crate) async fn sse_handler(
    axum::extract::State(state): axum::extract::State<AppState>,
) -> impl axum::response::IntoResponse {
    use futures::StreamExt;
    use tokio_stream::wrappers::errors::BroadcastStreamRecvError;
    use tokio_stream::wrappers::BroadcastStream;

    let rx = state.events.subscribe();
    let initial = futures::stream::once(async {
        Ok::<axum::body::Bytes, std::io::Error>(axum::body::Bytes::from(
            "data: {\"type\":\"connected\"}\n\n",
        ))
    });
    let events = BroadcastStream::new(rx).map(|res| {
        let frame = match res {
            Ok(event) => match serde_json::to_string(&event) {
                Ok(body) => format!("data: {body}\n\n"),
                Err(e) => {
                    // Why: splicing `{e}` into a hand-written JSON template
                    // is only valid while the message happens to contain no
                    // characters that need escaping. Let serde_json do the
                    // escaping; `Value`'s `Display` prints compact,
                    // single-line JSON. Key order in the object may differ
                    // from the old template, which JSON consumers ignore.
                    let payload = serde_json::json!({
                        "type": "error",
                        "message": e.to_string(),
                    });
                    format!("data: {payload}\n\n")
                }
            },
            Err(BroadcastStreamRecvError::Lagged(n)) => {
                format!("data: {{\"type\":\"lag\",\"skipped\":{n}}}\n\n")
            }
        };
        Ok::<axum::body::Bytes, std::io::Error>(axum::body::Bytes::from(frame))
    });
    let stream = initial.chain(events);

    axum::response::Response::builder()
        .header("Content-Type", "text/event-stream")
        .header("Cache-Control", "no-cache")
        .header("X-Accel-Buffering", "no")
        .body(axum::body::Body::from_stream(stream))
        .expect("valid SSE response") // Why: invariant — SSE headers are compile-time constants; builder cannot fail
}
1872
1873#[cfg(test)]
1874mod tests {
1875 use super::*;
1876
1877 /// Why: Issue #234 — previously we `mem::forget`ed the `TempDir` so tests
1878 /// could keep using `AppState` without juggling the directory handle, but
1879 /// that leaked one temp directory per test (262+ accumulated each run).
1880 /// What: Returns the `TempDir` alongside the `AppState` so the caller can
1881 /// bind it (`let (state, _tmp) = ...;`) and let drop semantics clean up
1882 /// when the test scope ends.
1883 /// Test: Every test in this module that constructs state.
1884 fn test_state() -> (AppState, tempfile::TempDir) {
1885 let tmp = tempfile::tempdir().expect("tempdir");
1886 let root = tmp.path().to_path_buf();
1887 // Issue #88: bypass palace-slug enforcement so lib tests that call
1888 // `palace_create` with arbitrary names keep passing.
1889 // SAFETY: constant idempotent write; safe across test threads.
1890 unsafe {
1891 std::env::set_var("TRUSTY_SKIP_PALACE_ENFORCEMENT", "1");
1892 }
1893 let state = AppState::new(root);
1894 // Pre-existing tests exercise functional paths — flip to Ready so the
1895 // issue #911 warming preflight does not reject them.
1896 state.set_ready();
1897 (state, tmp)
1898 }
1899
1900 /// Why: DaemonReadiness tests need a state that starts in Warming; this
1901 /// variant skips `set_ready()` so the transition can be tested explicitly.
1902 /// Test: `daemon_readiness_transitions_warming_to_ready`,
1903 /// `readiness_check_ok_when_ready_err_when_warming`.
1904 fn test_state_warming() -> (AppState, tempfile::TempDir) {
1905 // Use OnceLock so the env var is written exactly once across all
1906 // parallel test threads — avoids the unsynchronised set_var race while
1907 // remaining consistent with the idempotent-write approach used in
1908 // `test_state()`.
1909 static SKIP_ENFORCEMENT_SET: std::sync::OnceLock<()> = std::sync::OnceLock::new();
1910 SKIP_ENFORCEMENT_SET.get_or_init(|| unsafe {
1911 std::env::set_var("TRUSTY_SKIP_PALACE_ENFORCEMENT", "1");
1912 });
1913 let tmp = tempfile::tempdir().expect("tempdir");
1914 let root = tmp.path().to_path_buf();
1915 // Deliberately do NOT call set_ready() — stays in Warming state.
1916 (AppState::new(root), tmp)
1917 }
1918
1919 #[tokio::test]
1920 async fn initialize_returns_protocol_version_and_capabilities() {
1921 let (state, _tmp) = test_state();
1922 let req = json!({
1923 "jsonrpc": "2.0",
1924 "id": 1,
1925 "method": "initialize",
1926 "params": {
1927 "protocolVersion": "2024-11-05",
1928 "capabilities": {},
1929 "clientInfo": {"name": "test", "version": "0"}
1930 }
1931 });
1932 let resp = handle_message(&state, req).await;
1933 assert_eq!(resp["jsonrpc"], "2.0");
1934 assert_eq!(resp["id"], 1);
1935 assert_eq!(resp["result"]["protocolVersion"], "2024-11-05");
1936 assert!(resp["result"]["capabilities"]["tools"].is_object());
1937 assert_eq!(resp["result"]["serverInfo"]["name"], "trusty-memory");
1938 }
1939
1940 #[tokio::test]
1941 async fn initialized_notification_returns_null() {
1942 let (state, _tmp) = test_state();
1943 let req = json!({
1944 "jsonrpc": "2.0",
1945 "method": "notifications/initialized",
1946 "params": {}
1947 });
1948 let resp = handle_message(&state, req).await;
1949 assert!(resp.is_null());
1950 }
1951
1952 #[tokio::test]
1953 async fn tools_list_returns_all_tools() {
1954 let (state, _tmp) = test_state();
1955 let req = json!({"jsonrpc": "2.0", "id": 2, "method": "tools/list"});
1956 let resp = handle_message(&state, req).await;
1957 let tools = resp["result"]["tools"].as_array().expect("tools array");
1958 // Issue #99 added `memory_send_message`; issue #180 added
1959 // `palace_delete`; the #180 follow-up adds `palace_update` on top
1960 // of the 22-tool baseline; issue #537 adds `upgrade`.
1961 assert_eq!(tools.len(), 24);
1962 }
1963
1964 #[tokio::test]
1965 async fn unknown_method_returns_error() {
1966 let (state, _tmp) = test_state();
1967 let req = json!({"jsonrpc": "2.0", "id": 4, "method": "wat"});
1968 let resp = handle_message(&state, req).await;
1969 assert_eq!(resp["error"]["code"], -32601);
1970 }
1971
1972 #[tokio::test]
1973 async fn ping_returns_empty_result() {
1974 let (state, _tmp) = test_state();
1975 let req = json!({"jsonrpc": "2.0", "id": 5, "method": "ping"});
1976 let resp = handle_message(&state, req).await;
1977 assert!(resp["result"].is_object());
1978 }
1979
1980 #[tokio::test]
1981 async fn app_state_default_constructs() {
1982 let (s, _tmp) = test_state();
1983 assert!(!s.version.is_empty());
1984 assert!(s.registry.is_empty());
1985 assert!(s.default_palace.is_none());
1986 }
1987
/// Why (issue #225): the previous implementation called `.expect()` on the
/// tempdir fallback, which panicked the daemon at startup on hosts where
/// neither the data root nor `std::env::temp_dir()` is writable
/// (read-only Docker overlays, locked-down sandboxes). The activity log
/// is documented as best-effort, so the fix returns a no-op `Discard`
/// variant instead. This test forces both paths to fail and asserts the
/// helper returns the discard variant rather than panicking.
///
/// What: creates two `chmod 000` directories (the "data root" and a fake
/// `TMPDIR`), points `TMPDIR` at the latter, calls
/// `open_activity_log_with_fallback`, and checks the returned log is the
/// discard variant and that `append` / `count` / `list` still succeed on it.
/// `TMPDIR` and the directory permissions are restored by a drop guard, so
/// the cleanup also happens if the call under test panics — which is exactly
/// the regression this test exists to catch.
///
/// Skipped when running as root because `chmod 000` is a no-op for the
/// root user — the kernel grants root access regardless of mode bits.
/// CI typically runs as non-root, so coverage is preserved in the
/// common case; local root invocations simply skip with a warning.
#[test]
#[cfg(unix)]
fn open_activity_log_with_fallback_returns_discard_when_unwritable() {
    use std::os::unix::fs::PermissionsExt;

    /// Puts `TMPDIR` and the permissions of both scratch directories back
    /// when dropped (normal exit, early drop, or unwinding from a panic).
    struct RestoreEnvironment {
        prev_tmpdir: Option<std::ffi::OsString>,
        dirs: [std::path::PathBuf; 2],
    }

    impl Drop for RestoreEnvironment {
        fn drop(&mut self) {
            use std::os::unix::fs::PermissionsExt;

            // SAFETY: test-only env mutation that restores the value observed
            // before the override. NOTE(review): other tests in this binary
            // are not serialised against this write — confirm that is
            // acceptable.
            unsafe {
                match self.prev_tmpdir.take() {
                    Some(v) => std::env::set_var("TMPDIR", v),
                    None => std::env::remove_var("TMPDIR"),
                }
            }
            // Best-effort: make the directories removable again so the outer
            // tempdir can clean up.
            for dir in &self.dirs {
                let _ = std::fs::set_permissions(dir, std::fs::Permissions::from_mode(0o700));
            }
        }
    }

    // Skip when running as root — chmod is ignored.
    // SAFETY: libc::geteuid is a thread-safe syscall with no preconditions.
    if unsafe { libc::geteuid() } == 0 {
        eprintln!(
            "skipping open_activity_log_with_fallback_returns_discard_when_unwritable: running as root"
        );
        return;
    }

    // Build two unwritable directories: the primary "data root" and a
    // shadow "TMPDIR" so the tempdir fallback also fails.
    let outer = tempfile::tempdir().expect("outer tempdir");
    let primary = outer.path().join("primary");
    let tmpdir = outer.path().join("fake-tmp");
    std::fs::create_dir(&primary).expect("create primary");
    std::fs::create_dir(&tmpdir).expect("create tmpdir");

    // Declared after `outer` so it is dropped first: permissions are back to
    // 0o700 before the outer tempdir tries to delete its contents.
    let restore = RestoreEnvironment {
        prev_tmpdir: std::env::var_os("TMPDIR"),
        dirs: [primary.clone(), tmpdir.clone()],
    };

    // chmod 000 on both — neither can be opened for write.
    std::fs::set_permissions(&primary, std::fs::Permissions::from_mode(0o000))
        .expect("chmod primary");
    std::fs::set_permissions(&tmpdir, std::fs::Permissions::from_mode(0o000))
        .expect("chmod tmpdir");

    // Override the tempdir lookup so `open_activity_log_with_fallback`
    // hits our unwritable fake-tmp instead of the real system temp.
    // Note: env var mutation is process-global, and `std::env::temp_dir()`
    // reads `TMPDIR` on Unix, so the override is kept as short as possible
    // and is undone by `restore` even on panic.
    // SAFETY: test-only env mutation, reverted by the `restore` guard.
    unsafe {
        std::env::set_var("TMPDIR", &tmpdir);
    }

    let log = open_activity_log_with_fallback(&primary);

    // Restore TMPDIR and permissions ASAP, before any assertion can fail.
    drop(restore);

    assert!(
        log.is_discard(),
        "expected ActivityLog::Discard when both data root and tempdir are unwritable"
    );

    // The Discard variant must still satisfy the public contract: no
    // panic on append/count/list.
    let id = log
        .append(
            ActivitySource::Http,
            None,
            "drawer_added",
            json!({"smoke": true}),
        )
        .expect("discard append must succeed");
    assert_eq!(id, 0);
    assert_eq!(log.count().expect("discard count"), 0);
    assert!(log
        .list(&ActivityFilter::default(), 10, 0)
        .expect("discard list")
        .is_empty());
}
2070
2071 /// Why: Issue #26 — when `serve --palace <name>` is set, the MCP server
2072 /// must (a) report the default in the `initialize` `serverInfo`, (b)
2073 /// drop `palace` from the required schema in `tools/list`, and (c) let
2074 /// `tools/call` use the default when the caller omits `palace`.
2075 /// Test: Construct an AppState with a default palace, create that palace
2076 /// on disk via the registry, then call `memory_remember` without a
2077 /// `palace` argument and confirm it resolves to the default.
2078 #[tokio::test]
2079 async fn default_palace_used_when_arg_omitted() {
2080 let tmp = tempfile::tempdir().expect("tempdir");
2081 let root = tmp.path().to_path_buf();
2082
2083 // Pre-create the default palace so remember has somewhere to land.
2084 let registry = trusty_common::memory_core::PalaceRegistry::new();
2085 let palace = trusty_common::memory_core::Palace {
2086 id: trusty_common::memory_core::PalaceId::new("default-pal"),
2087 name: "default-pal".to_string(),
2088 description: None,
2089 created_at: chrono::Utc::now(),
2090 data_dir: root.join("default-pal"),
2091 };
2092 registry
2093 .create_palace(&root, palace)
2094 .expect("create_palace");
2095
2096 let state = AppState::new(root).with_default_palace(Some("default-pal".to_string()));
2097 // Flip to Ready so the readiness preflight (#911) does not reject the
2098 // `memory_remember` call below.
2099 state.set_ready();
2100
2101 // (a) initialize advertises the default.
2102 let init = handle_message(
2103 &state,
2104 json!({"jsonrpc": "2.0", "id": 1, "method": "initialize"}),
2105 )
2106 .await;
2107 assert_eq!(
2108 init["result"]["serverInfo"]["default_palace"], "default-pal",
2109 "initialize must echo default_palace in serverInfo"
2110 );
2111
2112 // (b) tools/list drops `palace` from required when default is set.
2113 let list = handle_message(
2114 &state,
2115 json!({"jsonrpc": "2.0", "id": 2, "method": "tools/list"}),
2116 )
2117 .await;
2118 let tools = list["result"]["tools"].as_array().expect("tools array");
2119 let remember = tools
2120 .iter()
2121 .find(|t| t["name"] == "memory_remember")
2122 .expect("memory_remember tool");
2123 let required: Vec<&str> = remember["inputSchema"]["required"]
2124 .as_array()
2125 .expect("required array")
2126 .iter()
2127 .filter_map(|v| v.as_str())
2128 .collect();
2129 assert!(
2130 !required.contains(&"palace"),
2131 "palace must not be required when default is configured; got {required:?}"
2132 );
2133 assert!(required.contains(&"text"));
2134
2135 // (c) tools/call resolves the default when arg is omitted.
2136 let call = handle_message(
2137 &state,
2138 json!({
2139 "jsonrpc": "2.0",
2140 "id": 3,
2141 "method": "tools/call",
2142 "params": {
2143 "name": "memory_remember",
2144 "arguments": {"text": "default palace test memory content with several tokens"},
2145 },
2146 }),
2147 )
2148 .await;
2149 // Successful dispatch returns `result.content[0].text` JSON.
2150 let text = call["result"]["content"][0]["text"]
2151 .as_str()
2152 .unwrap_or_else(|| panic!("expected success result, got {call}"));
2153 let parsed: Value = serde_json::from_str(text).expect("parse content json");
2154 assert_eq!(parsed["palace"], "default-pal");
2155 assert_eq!(parsed["status"], "stored");
2156 assert!(parsed["drawer_id"].as_str().is_some());
2157 }
2158
2159 /// Why: When no default is set, `tools/call` for a palace-bound tool
2160 /// without a `palace` argument should error helpfully rather than panic.
2161 #[tokio::test]
2162 async fn missing_palace_without_default_errors() {
2163 let (state, _tmp) = test_state();
2164 let resp = handle_message(
2165 &state,
2166 json!({
2167 "jsonrpc": "2.0",
2168 "id": 7,
2169 "method": "tools/call",
2170 "params": {
2171 "name": "memory_recall",
2172 "arguments": {"query": "anything"},
2173 },
2174 }),
2175 )
2176 .await;
2177 assert_eq!(resp["error"]["code"], -32603);
2178 let msg = resp["error"]["message"].as_str().unwrap_or("");
2179 assert!(
2180 msg.contains("missing 'palace'"),
2181 "expected helpful error, got: {msg}"
2182 );
2183 }
2184
2185 /// Why: regression for the "palaces lost on restart" bug — `AppState::new`
2186 /// builds an empty registry, so the daemon must call
2187 /// `load_palaces_from_disk` on startup to re-register palaces persisted by
2188 /// a previous run. Without that call the registry stays empty even though
2189 /// `palace.json` files exist on disk.
2190 /// What: persists two palaces under a tempdir (via the same
2191 /// `create_palace` path the `palace_create` tool uses), constructs a fresh
2192 /// `AppState` rooted there, calls `load_palaces_from_disk`, and asserts the
2193 /// returned count and registry contents.
2194 /// Test: this test itself.
2195 #[tokio::test]
2196 async fn load_palaces_from_disk_rehydrates_registry() {
2197 use trusty_common::memory_core::{Palace, PalaceId, PalaceRegistry};
2198
2199 let tmp = tempfile::tempdir().expect("tempdir");
2200 let root = tmp.path().to_path_buf();
2201
2202 // Phase 1: persist two palaces to disk, then drop the writer registry
2203 // so nothing is held in memory — simulating a prior daemon run.
2204 {
2205 let writer = PalaceRegistry::new();
2206 for id in ["alpha", "beta"] {
2207 let palace = Palace {
2208 id: PalaceId::new(id),
2209 name: id.to_string(),
2210 description: None,
2211 created_at: chrono::Utc::now(),
2212 data_dir: root.join(id),
2213 };
2214 writer
2215 .create_palace(&root, palace)
2216 .expect("persist palace to disk");
2217 }
2218 }
2219
2220 // Add a stray non-palace subdirectory; the walker must ignore it.
2221 std::fs::create_dir_all(root.join("not-a-palace")).expect("mkdir");
2222
2223 // Phase 2: fresh AppState starts with an empty registry (the bug).
2224 let state = AppState::new(root);
2225 assert!(
2226 state.registry.is_empty(),
2227 "AppState::new must start with an empty registry"
2228 );
2229
2230 // The fix: hydrate from disk.
2231 let count = state
2232 .load_palaces_from_disk()
2233 .await
2234 .expect("load_palaces_from_disk");
2235
2236 assert_eq!(count, 2, "both persisted palaces should be loaded");
2237 assert_eq!(state.registry.len(), 2, "registry should hold both palaces");
2238 let ids: Vec<String> = state.registry.list().into_iter().map(|p| p.0).collect();
2239 assert!(ids.contains(&"alpha".to_string()));
2240 assert!(ids.contains(&"beta".to_string()));
2241 }
2242
/// Why: existing installs (and the legacy standalone `trusty-memory` repo)
/// keep palaces one level down, in a `palaces/` subdirectory. When that
/// directory is present, `resolve_palace_registry_dir` has to point at it —
/// scanning the parent finds no `palace.json` files, which is the restart
/// bug.
/// What: creates `<dir>/palaces/` and asserts the resolver returns that
/// nested path.
/// Test: this test itself.
#[test]
fn resolve_palace_registry_dir_prefers_palaces_subdir() {
    let tmp = tempfile::tempdir().expect("tempdir");
    let data_dir = tmp.path().to_path_buf();
    let nested = data_dir.join("palaces");
    std::fs::create_dir_all(&nested).expect("mkdir palaces");

    let resolved = resolve_palace_registry_dir(data_dir);

    assert_eq!(resolved, nested);
}
2260
/// Why: on a fresh install there is no `palaces/` subdirectory, so the
/// resolver must hand back the data dir unchanged (the flat monorepo layout).
/// What: resolves an empty tempdir and asserts the same path comes back.
/// Test: this test itself.
#[test]
fn resolve_palace_registry_dir_falls_back_to_data_dir() {
    let tmp = tempfile::tempdir().expect("tempdir");
    let expected = tmp.path().to_path_buf();

    let resolved = resolve_palace_registry_dir(expected.clone());

    assert_eq!(resolved, expected);
}
2271
/// Why: defense-in-depth assertion (#503). A non-absolute data_dir has to be
/// rejected before it reaches `AppState::new`, because palace directories
/// would otherwise be created relative to the daemon's CWD (`/` under
/// launchd).
/// What: feeds a relative path to `resolve_palace_registry_dir` and confirms
/// the result is still relative — i.e. the resolver does not mask the problem
/// from the `is_absolute` guard in `main.rs`.
/// Test: this test itself.
#[test]
fn resolve_palace_registry_dir_relative_input_is_not_absolute() {
    // Relative input is invalid by contract; the point is only that, should
    // one slip through, the output stays relative so the `main.rs` guard can
    // still catch it.
    let resolved = resolve_palace_registry_dir(std::path::PathBuf::from("relative/path"));

    assert!(
        !resolved.is_absolute(),
        "a relative input should produce a relative registry dir (caught by main.rs guard)"
    );
}
2292
/// Why: defense-in-depth assertion (#503). A data_dir of `/` must be stopped
/// before `AppState::new`, otherwise palace directories would be created
/// directly under the filesystem root.
/// What: resolves `/` and asserts the result is still `/`, which the
/// `main.rs` startup guard rejects.
/// Test: this test itself.
#[test]
fn resolve_palace_registry_dir_root_input_stays_root() {
    let filesystem_root = std::path::PathBuf::from("/");

    let resolved = resolve_palace_registry_dir(filesystem_root);

    // `main.rs` checks data_dir before calling the resolver, so `/` should
    // never get this far. If it did, the resolver returns `/` unchanged (this
    // test relies on `/palaces` not existing on the host), and the
    // post-resolve guard in `main.rs` rejects it.
    assert_eq!(resolved, std::path::PathBuf::from("/"));
}
2308
2309 /// Why: end-to-end check that the nested-`palaces/` layout hydrates — the
2310 /// daemon resolves the registry dir via `resolve_palace_registry_dir`, so
2311 /// an `AppState` rooted there must load palaces persisted one level below
2312 /// the bare data dir.
2313 /// What: persists two palaces under `<root>/palaces/<id>/`, constructs an
2314 /// `AppState` rooted at the resolved registry dir, and asserts hydration
2315 /// finds both.
2316 /// Test: this test itself.
2317 #[tokio::test]
2318 async fn load_palaces_from_disk_handles_palaces_subdir() {
2319 use trusty_common::memory_core::{Palace, PalaceId, PalaceRegistry};
2320
2321 let tmp = tempfile::tempdir().expect("tempdir");
2322 let root = tmp.path().to_path_buf();
2323 let nested = root.join("palaces");
2324
2325 {
2326 let writer = PalaceRegistry::new();
2327 for id in ["cto", "engineering"] {
2328 let palace = Palace {
2329 id: PalaceId::new(id),
2330 name: id.to_string(),
2331 description: None,
2332 created_at: chrono::Utc::now(),
2333 data_dir: nested.join(id),
2334 };
2335 // create_palace anchors data_dir under the passed root, so
2336 // pass `nested` here to land palaces under `<root>/palaces/`.
2337 writer
2338 .create_palace(&nested, palace)
2339 .expect("persist palace under palaces/ subdir");
2340 }
2341 }
2342
2343 // Mirror main.rs: resolve the registry dir, then root AppState there.
2344 let registry_dir = resolve_palace_registry_dir(root);
2345 assert_eq!(registry_dir, nested, "must resolve into palaces/ subdir");
2346 let state = AppState::new(registry_dir);
2347 let count = state
2348 .load_palaces_from_disk()
2349 .await
2350 .expect("load_palaces_from_disk");
2351
2352 assert_eq!(count, 2, "both nested palaces should be loaded");
2353 assert_eq!(state.registry.len(), 2);
2354 let ids: Vec<String> = state.registry.list().into_iter().map(|p| p.0).collect();
2355 assert!(ids.contains(&"cto".to_string()));
2356 assert!(ids.contains(&"engineering".to_string()));
2357 }
2358
2359 /// Why: an empty (or missing) palace registry directory must not error — a
2360 /// brand-new install has nothing to hydrate and should report zero.
2361 #[tokio::test]
2362 async fn load_palaces_from_disk_empty_root_returns_zero() {
2363 let (state, _tmp) = test_state();
2364 let count = state
2365 .load_palaces_from_disk()
2366 .await
2367 .expect("load_palaces_from_disk on empty root");
2368 assert_eq!(count, 0);
2369 assert!(state.registry.is_empty());
2370 }
2371
2372 /// Why (issue #228): hydration must seed `state.palace_names` so the
2373 /// MCP write hot path (`memory_remember` / `memory_note`) can resolve a
2374 /// friendly palace name without re-walking the data root on every call.
2375 /// Regression risk: a future refactor that forgets to populate the cache
2376 /// would silently degrade write latency.
2377 /// What: persists two palaces with distinct `name` values, constructs a
2378 /// fresh `AppState`, hydrates from disk, and asserts the cache holds the
2379 /// expected mappings.
2380 /// Test: this test itself.
2381 #[tokio::test]
2382 async fn palace_name_cache_populated_after_hydration() {
2383 use trusty_common::memory_core::{Palace, PalaceId, PalaceRegistry};
2384
2385 let tmp = tempfile::tempdir().expect("tempdir");
2386 let root = tmp.path().to_path_buf();
2387 {
2388 let writer = PalaceRegistry::new();
2389 for (id, name) in [("alpha", "Alpha Project"), ("beta", "Beta Project")] {
2390 let palace = Palace {
2391 id: PalaceId::new(id),
2392 name: name.to_string(),
2393 description: None,
2394 created_at: chrono::Utc::now(),
2395 data_dir: root.join(id),
2396 };
2397 writer.create_palace(&root, palace).expect("persist palace");
2398 }
2399 }
2400
2401 let state = AppState::new(root);
2402 assert!(
2403 state.palace_names.is_empty(),
2404 "fresh AppState must start with an empty name cache"
2405 );
2406 state
2407 .load_palaces_from_disk()
2408 .await
2409 .expect("load_palaces_from_disk");
2410
2411 assert_eq!(state.palace_names.len(), 2, "cache must hold both palaces");
2412 assert_eq!(
2413 state.palace_names.get("alpha").map(|e| e.value().clone()),
2414 Some("Alpha Project".to_string()),
2415 );
2416 assert_eq!(
2417 state.palace_names.get("beta").map(|e| e.value().clone()),
2418 Some("Beta Project".to_string()),
2419 );
2420 }
2421
2422 /// Why (issue #228): `palace_create` (MCP tool) and `MemoryService::create_palace`
2423 /// (HTTP path) both insert into the name cache so a freshly-created palace
2424 /// is resolvable on the very next write — without waiting for the next
2425 /// hydration cycle.
2426 /// What: dispatches the `palace_create` MCP tool against a tempdir and
2427 /// asserts the cache row was written.
2428 /// Test: this test itself.
2429 #[tokio::test]
2430 async fn palace_name_cache_updates_on_create() {
2431 use serde_json::json;
2432
2433 let (state, _tmp) = test_state();
2434 let _ = tools::dispatch_tool(&state, "palace_create", json!({"name": "gamma"}))
2435 .await
2436 .expect("palace_create");
2437 assert_eq!(
2438 state.palace_names.get("gamma").map(|e| e.value().clone()),
2439 Some("gamma".to_string()),
2440 "palace_create must populate the in-memory name cache so writes \
2441 can resolve the friendly name without a disk walk"
2442 );
2443 }
2444
2445 /// Why: initialize without a default palace must omit `default_palace`
2446 /// from `serverInfo` so clients can detect the unbound mode.
2447 #[tokio::test]
2448 async fn initialize_without_default_palace_omits_field() {
2449 let (state, _tmp) = test_state();
2450 let init = handle_message(
2451 &state,
2452 json!({"jsonrpc": "2.0", "id": 1, "method": "initialize"}),
2453 )
2454 .await;
2455 assert!(init["result"]["serverInfo"]["default_palace"].is_null());
2456 }
2457
2458 /// Why: every `~/.trusty-memory/http_addr` consumer (CLI, dashboard,
2459 /// future trusty-mpm wiring) must agree on the path. A regression that
2460 /// moves this file breaks every client relying on `read_daemon_addr`.
2461 /// What: under a stubbed data dir, the path ends in
2462 /// `trusty-memory/http_addr` — matching `trusty_common::read_daemon_addr`'s
2463 /// expected location.
2464 #[tokio::test]
2465 async fn http_addr_path_uses_resolve_data_dir() {
2466 // Hold the env_test_lock so this test does not race with
2467 // `prompt_context::tests::*` which spin a real daemon under
2468 // the same env override and would otherwise observe a
2469 // half-mutated $TRUSTY_DATA_DIR_OVERRIDE.
2470 let _guard = crate::commands::env_test_lock().lock().await;
2471 let tmp = tempfile::tempdir().unwrap();
2472 // SAFETY: test-only env mutation serialised by env_test_lock.
2473 unsafe {
2474 std::env::set_var(trusty_common::DATA_DIR_OVERRIDE_ENV, tmp.path());
2475 }
2476 let result = http_addr_path();
2477 unsafe {
2478 std::env::remove_var(trusty_common::DATA_DIR_OVERRIDE_ENV);
2479 }
2480 let p = result.expect("http_addr_path must return Some when data dir is resolvable");
2481 assert!(
2482 p.ends_with("trusty-memory/http_addr"),
2483 "unexpected http_addr path: {}",
2484 p.display()
2485 );
2486 }
2487
/// Why: the write+read round trip pins the on-disk format to a single
/// `host:port\n` line. Readers such as `cat` or sh `$(cat ...)` trim
/// whitespace, so the trailing newline is harmless — extra whitespace or
/// additional lines would not be.
/// What: writes `127.0.0.1:7073` through `write_http_addr_file`, reads the
/// file back, and checks both the trimmed content and the trailing newline.
/// Note (issue #226): `write_http_addr_file` belongs to the HTTP-serving
/// surface gated behind `axum-server`, hence the same gate on this test.
#[cfg(feature = "axum-server")]
#[test]
fn http_addr_file_round_trip_via_helpers() {
    let dir = tempfile::tempdir().unwrap();
    let path = dir.path().join("http_addr");
    let addr = "127.0.0.1:7073".parse::<SocketAddr>().unwrap();

    write_http_addr_file(&path, &addr).unwrap();
    let contents = std::fs::read_to_string(&path).unwrap();

    assert_eq!(contents.trim(), "127.0.0.1:7073");
    // The trailing newline keeps `cat` and editors happy.
    assert!(contents.ends_with('\n'));
}
2506
2507 /// Why: dynamic binding must succeed even when the preferred port is
2508 /// already in use. Walking 7070..=7079 + OS fallback guarantees the
2509 /// daemon never fails to come up just because another process holds 7070.
2510 /// What: pre-bind 7070 (best-effort — skip the test if it's already
2511 /// busy on the host), then call `bind_dynamic_port` and assert we got
2512 /// *some* listener back.
2513 #[tokio::test]
2514 async fn bind_dynamic_port_returns_listener() {
2515 let listener = bind_dynamic_port().await.expect("bind_dynamic_port");
2516 let addr = listener.local_addr().expect("local_addr");
2517 assert_eq!(addr.ip().to_string(), "127.0.0.1");
2518 assert!(addr.port() > 0, "port must be non-zero after bind");
2519 }
2520
2521 /// Why: Issue #42 — prompt-facts are now served by the per-message
2522 /// `get_prompt_context` tool rather than the MCP prompts surface, so the
2523 /// `initialize` handshake must NOT advertise a `prompts` capability and
2524 /// `prompts/list` / `prompts/get` must fall through to the "method not
2525 /// found" path.
2526 #[tokio::test]
2527 async fn initialize_does_not_advertise_prompts_capability() {
2528 let (state, _tmp) = test_state();
2529 let init = handle_message(
2530 &state,
2531 json!({"jsonrpc": "2.0", "id": 1, "method": "initialize"}),
2532 )
2533 .await;
2534 assert!(
2535 init["result"]["capabilities"]["prompts"].is_null(),
2536 "initialize must NOT advertise the prompts capability; got {init}"
2537 );
2538
2539 // Both prompts/* dispatchers should now report method-not-found.
2540 for method in ["prompts/list", "prompts/get"] {
2541 let resp =
2542 handle_message(&state, json!({"jsonrpc": "2.0", "id": 2, "method": method})).await;
2543 assert_eq!(
2544 resp["error"]["code"], -32601,
2545 "{method} should return method-not-found; got {resp}"
2546 );
2547 }
2548 }
2549
2550 /// Why: `AppState::new` must initialise `bound_addr` to an empty
2551 /// `OnceLock` so `/health` reports `addr: None` on the stdio path. A
2552 /// regression that pre-populates this field would advertise a bogus
2553 /// address from a stale clone.
2554 ///
2555 /// Note (issue #231): now async so it runs inside a Tokio runtime —
2556 /// `AppState::new` spawns the bounded BM25 index worker via
2557 /// `tokio::spawn`, which requires an active runtime.
2558 #[tokio::test]
2559 async fn app_state_starts_with_empty_bound_addr() {
2560 let (state, _tmp) = test_state();
2561 assert!(state.bound_addr.get().is_none());
2562 }
2563
/// Why (issue #96): `DaemonEvent::type_str` feeds the `event_type` column of
/// the persisted activity log, so each variant must yield exactly the SSE
/// `type` tag the UI already understands. If the stored type drifted from the
/// wire format, historical rows would lose their icon / label in the feed.
/// What: builds a sample of each variant listed below, serialises it with
/// serde, and compares the JSON `type` field against `type_str()`.
/// Test: this test.
#[test]
fn daemon_event_type_str_matches_sse_tag() {
    let samples = [
        DaemonEvent::PalaceCreated {
            id: "p".into(),
            name: "p".into(),
            source: ActivitySource::Http,
        },
        DaemonEvent::DrawerAdded {
            palace_id: "p".into(),
            palace_name: "p".into(),
            drawer_count: 1,
            timestamp: chrono::Utc::now(),
            content_preview: String::new(),
            source: ActivitySource::Mcp,
        },
        DaemonEvent::DrawerDeleted {
            palace_id: "p".into(),
            drawer_count: 0,
            source: ActivitySource::Http,
        },
        DaemonEvent::DreamCompleted {
            palace_id: None,
            merged: 0,
            pruned: 0,
            compacted: 0,
            closets_updated: 0,
            duration_ms: 0,
            source: ActivitySource::Http,
        },
        DaemonEvent::StatusChanged {
            total_drawers: 0,
            total_vectors: 0,
            total_kg_triples: 0,
        },
        DaemonEvent::HookFired {
            palace_id: Some("p".into()),
            palace_name: Some("p".into()),
            hook_type: HookType::UserPromptSubmit,
            injection_kind: InjectionKind::PromptContext,
            injection_length: 12,
            trigger_prompt_excerpt: "hello".into(),
            timestamp: chrono::Utc::now(),
            duration_ms: 5,
            source: ActivitySource::Hook,
        },
    ];

    for event in samples.iter() {
        let wire = serde_json::to_value(event).unwrap();
        let tag = wire["type"].as_str();
        assert_eq!(tag, Some(event.type_str()));
    }
}
2624
/// Why: every `HookFired` activity row stores a serialised `HookType`; the
/// wire format has to round-trip so dashboard / TUI consumers can still parse
/// rows written by an older daemon build.
/// What: for each listed variant, checks the serde encoding equals the
/// expected PascalCase label, decodes it back to the same variant, and
/// confirms `as_str()` agrees with the unquoted label.
/// Test: itself.
#[test]
fn hook_type_serde_round_trips() {
    let table = [
        (HookType::UserPromptSubmit, "\"UserPromptSubmit\""),
        (HookType::SessionStart, "\"SessionStart\""),
    ];

    for (ht, expected) in table {
        let encoded = serde_json::to_string(&ht).unwrap();
        assert_eq!(encoded, expected, "{ht:?} should serialise to {expected}");

        let decoded = serde_json::from_str::<HookType>(&encoded).unwrap();
        assert_eq!(decoded, ht);

        let bare_label = expected.trim_matches('"');
        assert_eq!(ht.as_str(), bare_label);
    }
}
2645
/// Why: same motivation as `hook_type_serde_round_trips`, applied to
/// `InjectionKind`.
/// What: for each listed variant, checks the kebab-case serde encoding,
/// decodes it back to the same variant, and confirms `as_str()` agrees with
/// the unquoted label.
/// Test: itself.
#[test]
fn injection_kind_serde_round_trips() {
    let table = [
        (InjectionKind::PromptContext, "\"prompt-context\""),
        (InjectionKind::InboxCheck, "\"inbox-check\""),
    ];

    for (kind, wire) in table {
        let encoded = serde_json::to_string(&kind).unwrap();
        assert_eq!(encoded, wire);

        let decoded = serde_json::from_str::<InjectionKind>(&encoded).unwrap();
        assert_eq!(decoded, kind);

        let bare_label = wire.trim_matches('"');
        assert_eq!(kind.as_str(), bare_label);
    }
}
2663
/// Why: the trigger prompt excerpt is rendered as-is in the activity feed,
/// so oversized prompts must be cut down to at most
/// [`HOOK_PROMPT_EXCERPT_CHARS`] characters and end in a `…` marker to keep
/// the row readable.
/// What: passes a 200-character prompt and checks the bound and the marker;
/// also checks that an empty prompt yields an empty excerpt.
/// Test: itself.
#[test]
fn hook_excerpt_truncates_long_prompts() {
    let oversized: String = std::iter::repeat('x').take(200).collect();

    let excerpt = hook_prompt_excerpt(&oversized);
    let excerpt_len = excerpt.chars().count();

    assert!(excerpt_len <= HOOK_PROMPT_EXCERPT_CHARS);
    assert!(excerpt.ends_with('…'));
    assert_eq!(hook_prompt_excerpt(""), "");
}
2678
/// Why: a multi-line prompt must be flattened to one line, otherwise the
/// activity feed row grows vertically.
/// What: passes a prompt containing blank lines and tab runs and asserts the
/// excerpt is the same words separated by single spaces.
/// Test: itself.
#[test]
fn hook_excerpt_collapses_whitespace() {
    let messy_prompt = "hello\n\nworld\t\tfoo";

    let excerpt = hook_prompt_excerpt(messy_prompt);

    assert_eq!(excerpt, "hello world foo");
}
2690
/// Why (issue #96): the persisted activity log's columns are filled from
/// `palace_id()` and `source()`, so each must pull the right field out of
/// each variant. A careless refactor that swaps two fields would silently
/// attribute writes to the wrong palace or source.
/// What: builds `DrawerAdded`, `StatusChanged` and `DreamCompleted` events
/// with known values and asserts what both extractors return for each.
/// Test: this test.
#[test]
fn daemon_event_palace_id_and_source_extraction() {
    // Variant with a mandatory palace id and an MCP source.
    let drawer_added = DaemonEvent::DrawerAdded {
        palace_id: "alpha".into(),
        palace_name: "alpha".into(),
        drawer_count: 1,
        timestamp: chrono::Utc::now(),
        content_preview: String::new(),
        source: ActivitySource::Mcp,
    };
    assert_eq!(drawer_added.palace_id(), Some("alpha"));
    assert_eq!(drawer_added.source(), Some(ActivitySource::Mcp));

    // Variant that carries neither a palace id nor a source.
    let status_changed = DaemonEvent::StatusChanged {
        total_drawers: 1,
        total_vectors: 2,
        total_kg_triples: 3,
    };
    assert_eq!(status_changed.palace_id(), None);
    assert_eq!(status_changed.source(), None);

    // Variant with an optional palace id and an HTTP source.
    let dream_completed = DaemonEvent::DreamCompleted {
        palace_id: Some("p1".into()),
        merged: 0,
        pruned: 0,
        compacted: 0,
        closets_updated: 0,
        duration_ms: 10,
        source: ActivitySource::Http,
    };
    assert_eq!(dream_completed.palace_id(), Some("p1"));
    assert_eq!(dream_completed.source(), Some(ActivitySource::Http));
}
2731
2732 /// Why (issue #96): `AppState::emit` must persist mutation events to
2733 /// the activity log while keeping `StatusChanged` (a recomputed
2734 /// aggregate, not a mutation) out of the persisted history.
2735 /// What: emits one of each variant under a fresh state and asserts
2736 /// the persisted count matches the number of mutation events.
2737 /// Test: this test.
2738 #[tokio::test]
2739 async fn emit_persists_mutations_but_skips_status_changed() {
2740 let (state, _tmp) = test_state();
2741 state.emit(DaemonEvent::PalaceCreated {
2742 id: "p".into(),
2743 name: "p".into(),
2744 source: ActivitySource::Http,
2745 });
2746 state.emit(DaemonEvent::StatusChanged {
2747 total_drawers: 1,
2748 total_vectors: 0,
2749 total_kg_triples: 0,
2750 });
2751 state.emit(DaemonEvent::DrawerAdded {
2752 palace_id: "p".into(),
2753 palace_name: "p".into(),
2754 drawer_count: 1,
2755 timestamp: chrono::Utc::now(),
2756 content_preview: "x".into(),
2757 source: ActivitySource::Mcp,
2758 });
2759 // Issue #232: `emit` now offloads the redb write to `spawn_blocking`,
2760 // so the test must wait for the background pool to drain before
2761 // asserting on the persisted count.
2762 state.flush_activity_writes().await;
2763 let count = state.activity_log.count().unwrap();
2764 assert_eq!(count, 2, "only PalaceCreated + DrawerAdded must persist");
2765 }
2766
2767 /// Why (issue #156): the BM25 lane must be opt-in — existing deployments
2768 /// that don't set `TRUSTY_BM25_DAEMON=1` must see `bm25_client = None`
2769 /// and the recall hot path must continue to behave exactly as before.
2770 /// What: builds an `AppState` with `with_bm25_client_from_env()` while
2771 /// the env var is unset; asserts the field stays `None`.
2772 /// Test: this test.
2773 #[tokio::test]
2774 async fn bm25_client_disabled_by_default() {
2775 // Serialise with the sibling `bm25_client_enabled_when_env_set` test
2776 // so they don't race on the shared `TRUSTY_BM25_DAEMON` env var.
2777 let _guard = crate::commands::env_test_lock().lock().await;
2778 // SAFETY: this test exercises std::env::remove_var which is unsafe
2779 // in 2024 edition because the global env is shared. We restore the
2780 // pre-test value at the end so neighbours are unaffected.
2781 let prev = std::env::var("TRUSTY_BM25_DAEMON").ok();
2782 unsafe {
2783 std::env::remove_var("TRUSTY_BM25_DAEMON");
2784 }
2785 let (state, _tmp) = test_state();
2786 let state = state.with_bm25_client_from_env();
2787 assert!(
2788 state.bm25_client.is_none(),
2789 "bm25_client must be None when TRUSTY_BM25_DAEMON is unset"
2790 );
2791 // Issue #193: the spawn supervisor is bound to the same env gate as
2792 // the client — opt-out parity matters so we never accidentally
2793 // spawn daemons in deployments that explicitly didn't opt in.
2794 assert!(
2795 state.bm25_supervisor.is_none(),
2796 "bm25_supervisor must be None when TRUSTY_BM25_DAEMON is unset"
2797 );
2798 if let Some(v) = prev {
2799 unsafe {
2800 std::env::set_var("TRUSTY_BM25_DAEMON", v);
2801 }
2802 }
2803 }
2804
2805 /// Why (issue #156): when the operator opts in via `TRUSTY_BM25_DAEMON=1`,
2806 /// the builder must construct a real `Bm25Client` pointed at the canonical
2807 /// per-palace socket path. We don't connect — no daemon need be running —
2808 /// we only assert the client field is populated.
2809 /// What: sets the env var, runs the builder, asserts `Some(_)`.
2810 /// Test: this test.
2811 #[tokio::test]
2812 async fn bm25_client_enabled_when_env_set() {
2813 let _guard = crate::commands::env_test_lock().lock().await;
2814 let prev = std::env::var("TRUSTY_BM25_DAEMON").ok();
2815 unsafe {
2816 std::env::set_var("TRUSTY_BM25_DAEMON", "1");
2817 }
2818 let (state, _tmp) = test_state();
2819 let state = state.with_bm25_client_from_env();
2820 assert!(
2821 state.bm25_client.is_some(),
2822 "bm25_client must be Some when TRUSTY_BM25_DAEMON=1"
2823 );
2824 // Issue #193: opting in to the client must also install the spawn
2825 // supervisor so the daemon is auto-started on first use.
2826 assert!(
2827 state.bm25_supervisor.is_some(),
2828 "bm25_supervisor must be Some when TRUSTY_BM25_DAEMON=1"
2829 );
2830 match prev {
2831 Some(v) => unsafe { std::env::set_var("TRUSTY_BM25_DAEMON", v) },
2832 None => unsafe { std::env::remove_var("TRUSTY_BM25_DAEMON") },
2833 }
2834 }
2835
2836 // -------------------------------------------------------------------------
2837 // Issues #910 / #911 — DaemonReadiness
2838 // -------------------------------------------------------------------------
2839
2840 /// Why (issue #910/#911): `AppState` starts in `Warming` state; `set_ready`
2841 /// must flip it to `Ready` atomically; subsequent `readiness()` calls must
2842 /// observe `Ready`.
2843 /// What: construct a state, assert `Warming`, call `set_ready`, assert
2844 /// `Ready`.
2845 /// Test: this test.
2846 #[tokio::test]
2847 async fn daemon_readiness_transitions_warming_to_ready() {
2848 let (state, _tmp) = test_state_warming();
2849 assert_eq!(
2850 state.readiness(),
2851 DaemonReadiness::Warming,
2852 "daemon must start in Warming state"
2853 );
2854 state.set_ready();
2855 assert_eq!(
2856 state.readiness(),
2857 DaemonReadiness::Ready,
2858 "daemon must be Ready after set_ready()"
2859 );
2860 }
2861
2862 /// Why (issue #911): `readiness_check` must return `Ok(())` when Ready and
2863 /// an explicit `Err` when Warming, so tool handlers can use `?` to short-
2864 /// circuit without blocking.
2865 /// What: verify both states.
2866 /// Test: this test.
2867 #[tokio::test]
2868 async fn readiness_check_ok_when_ready_err_when_warming() {
2869 let (state, _tmp) = test_state_warming();
2870 // Warming → should error.
2871 let err = state
2872 .readiness_check()
2873 .expect_err("readiness_check must fail when Warming");
2874 let msg = err.to_string();
2875 assert!(
2876 msg.contains("warming up"),
2877 "error must mention 'warming up'; got: {msg}"
2878 );
2879 // Ready → should succeed.
2880 state.set_ready();
2881 state
2882 .readiness_check()
2883 .expect("readiness_check must succeed when Ready");
2884 }
2885
/// Why (issue #911): `DaemonReadiness::from_u8` has to decode 0 as Warming
/// and every non-zero value as Ready.
/// What: runs the raw values 0, 1 and 255 through `from_u8` and compares
/// each result with the expected variant.
/// Test: this test.
#[test]
fn daemon_readiness_from_u8() {
    // (raw value, variant it must decode to)
    let cases = [
        (0, DaemonReadiness::Warming),
        (1, DaemonReadiness::Ready),
        (255, DaemonReadiness::Ready),
    ];
    for (raw, expected) in cases {
        assert_eq!(DaemonReadiness::from_u8(raw), expected);
    }
}
2895
2896 // -------------------------------------------------------------------------
2897 // Issue #467 — palaces skipped at startup hydration are lazily re-opened
2898 // -------------------------------------------------------------------------
2899
2900 /// Why (issue #467): when `load_palaces_from_disk` fails to open a palace
2901 /// (e.g. EMFILE — "Too many open files"), it logs a warning and does NOT
2902 /// register the palace in the in-memory registry. A subsequent call to
2903 /// `open_palace` for that id must attempt a fresh open from disk, not
2904 /// permanently return "not found". This test verifies the lazy-reopen
2905 /// path: create a palace on disk, remove it from the registry (simulating a
2906 /// startup-hydration skip), then open it via `open_palace` and assert success.
2907 /// What: builds an `AppState` with an on-disk palace that is subsequently
2908 /// removed from the in-memory registry (simulating what happens when
2909 /// `PalaceHandle::open` fails during `load_palaces_from_disk`), calls
2910 /// `registry.open_palace`, and asserts the palace handle is returned.
2911 /// Test: this test.
2912 #[tokio::test]
2913 async fn open_palace_lazy_reopens_hydration_skipped_palace() {
2914 let (state, _tmp) = test_state();
2915 // Create a palace on disk.
2916 let pid = trusty_common::memory_core::palace::PalaceId::new("hydration-skip");
2917 let palace = trusty_common::memory_core::Palace {
2918 id: pid.clone(),
2919 name: "hydration-skip".to_string(),
2920 description: None,
2921 created_at: chrono::Utc::now(),
2922 data_dir: state.data_root.join("hydration-skip"),
2923 };
2924 state
2925 .registry
2926 .create_palace(&state.data_root, palace)
2927 .expect("create_palace");
2928
2929 // Simulate a startup-hydration skip by removing the just-registered handle.
2930 // In production, the palace is simply never registered because
2931 // PalaceHandle::open failed during load_palaces_from_disk (EMFILE etc.).
2932 state.registry.remove(&pid);
2933
2934 // The registry must now report no in-memory handle for this palace.
2935 assert!(
2936 state.registry.get(&pid).is_none(),
2937 "palace must appear absent (simulating hydration skip) before the lazy-reopen"
2938 );
2939
2940 // Calling open_palace should attempt a fresh open from disk and succeed.
2941 let handle = state
2942 .registry
2943 .open_palace(&state.data_root, &pid)
2944 .expect("open_palace must lazily reopen a hydration-skipped palace");
2945 assert_eq!(handle.id.as_str(), "hydration-skip");
2946 }
2947
2948 // -------------------------------------------------------------------------
2949 // Issue #498 — dotfile http_addr path uses $HOME/.trusty-memory/http_addr
2950 // -------------------------------------------------------------------------
2951
/// Why (issue #498): claude-mpm's `migrate_trusty_autodetect` discovers the
/// daemon's port by reading `~/.trusty-memory/http_addr`. On macOS the
/// OS-standard data dir is not the dotfile path, so the daemon used to
/// write somewhere claude-mpm never looked and claude-mpm always fell back
/// to the hardcoded port `7070`. This test pins the suffix of the path
/// that `dotfile_http_addr_path()` returns.
/// What: calls `dotfile_http_addr_path` and, when it yields a path, asserts
/// that the path ends in `.trusty-memory/http_addr`.
/// Test: this test.
#[cfg(feature = "axum-server")]
#[test]
fn dotfile_http_addr_path_uses_home_dir() {
    // `dirs::home_dir()` is not redirectable via env on macOS, so the most
    // this test can do is check the suffix whenever a path is produced.
    let Some(resolved) = dotfile_http_addr_path() else {
        // No path available (e.g. home_dir() returned None in a locked-down
        // env) — acceptable; there is nothing to assert on.
        return;
    };
    assert!(
        resolved.ends_with(".trusty-memory/http_addr"),
        "dotfile path must end in .trusty-memory/http_addr; got {}",
        resolved.display()
    );
}
2976
/// Why (issue #498): claude-mpm's `_resolve_base_url` can only find the
/// running port if the daemon writes it to the dotfile path. This
/// round-trip drives `write_http_addr_file` against a dotfile-shaped path
/// and checks the content can be read back.
/// What: uses a tempdir as a stand-in for $HOME, writes an address to
/// `.trusty-memory/http_addr` beneath it (the test does not create the
/// `.trusty-memory` directory itself), then reads the file and checks both
/// the address text and the trailing newline.
/// Test: this test.
#[cfg(feature = "axum-server")]
#[test]
fn dotfile_http_addr_write_read_round_trip() {
    let fake_home = tempfile::tempdir().unwrap();
    let target = fake_home.path().join(".trusty-memory").join("http_addr");
    let addr = SocketAddr::from(([127, 0, 0, 1], 7099));

    write_http_addr_file(&target, &addr).expect("write_http_addr_file to dotfile path");

    let contents = std::fs::read_to_string(&target).unwrap();
    assert_eq!(
        contents.trim(),
        "127.0.0.1:7099",
        "dotfile round-trip content mismatch"
    );
    assert!(contents.ends_with('\n'), "dotfile must end with a newline");
}
3000}