trusty_memory/lib.rs
1//! MCP server (HTTP/SSE + UDS) for trusty-memory.
2//!
3//! Why: Claude Code and other MCP-aware clients integrate with trusty-memory
4//! through the standardized Model Context Protocol; we expose memory + KG
5//! tools so they can be called by name. Claude Code itself speaks stdio,
6//! but the in-process `serve --stdio` path was removed in issue #150
7//! because it deadlocked on the redb exclusive write lock whenever a
8//! long-lived daemon was already running — the canonical stdio integration
9//! is now the `trusty-memory-mcp-bridge` binary (PR #149), which pipes
10//! Claude Code's stdio over a Unix domain socket to the daemon.
11//! What: Provides `run_http` / `run_http_dynamic` / `run_http_on` (axum
12//! HTTP/SSE + REST + UI) and the `transport::uds` module (Unix-domain
13//! socket transport for the MCP bridge), plus an `AppState` that carries
14//! the shared `PalaceRegistry`, on-disk data root, and a lazily-initialized
15//! embedder.
16//! Test: `cargo test -p trusty-memory` validates handshake + dispatch via
17//! the in-process `handle_message` unit tests and the `tests/uds_roundtrip.rs`
18//! end-to-end harness.
19
20use anyhow::Result;
21use serde_json::{json, Value};
22use std::net::SocketAddr;
23use std::path::{Path, PathBuf};
24use std::sync::atomic::{AtomicUsize, Ordering};
25use std::sync::{Arc, OnceLock};
26use tokio::sync::{broadcast, OnceCell, RwLock};
27use trusty_common::bm25_client::Bm25Client;
28use trusty_common::mcp::initialize_response;
29use trusty_common::memory_core::embed::FastEmbedder;
30use trusty_common::memory_core::store::ChatSessionStore;
31use trusty_common::memory_core::PalaceRegistry;
32use trusty_common::ChatProvider;
33
34// Why: `tracing::info` is only used by the axum HTTP-serving helpers
35// (`run_http_on`, `spawn_uds_listener`). Pulling it in unconditionally
36// would trigger `unused_imports` warnings when the `axum-server`
37// feature is disabled. `SocketAddr` is still used by `bound_addr` on
38// `AppState` so it stays unconditional.
39#[cfg(feature = "axum-server")]
40use tracing::info;
41
42pub mod activity;
43pub mod attribution;
44pub mod bm25_supervisor;
45pub mod bootstrap;
46/// File-descriptor usage and limit reporting for `/health`.
47///
48/// Why: expose `open_fds` / `fd_soft_limit` so operators can see the fd
49/// ceiling and current consumption without needing lsof or shell access.
50/// Test: `fd_metrics::tests::fd_metrics_returns_sane_values`.
51pub mod fd_metrics;
52// Why (issue #226): `chat` and `web` are pure axum HTTP/SSE handler
53// surfaces. Gating them behind the `axum-server` feature is what lets
54// library consumers (e.g. `open-mpm` linking only `MemoryMcpService`)
55// drop axum + tower-http entirely from their build graph.
56#[cfg(feature = "axum-server")]
57pub mod chat;
58pub mod commands;
59pub mod discovery;
60pub mod hook_emit;
61pub mod kg_extract;
62pub mod mcp_service;
63pub mod messaging;
64pub mod openrpc;
65/// Issue #88: project-root detection and palace-slug enforcement.
66///
67/// Why: prevents unbounded palace creation by anchoring palace names to the
68/// canonical slug of the project directory that contains the CWD, or to the
69/// `personal` sentinel for non-project contexts.
70/// What: exports `find_project_root`, `project_slug_at`, `project_slug`,
71/// `validate_palace_name`, `PERSONAL_PALACE`, and `PROJECT_MARKERS`.
72/// Test: see unit tests inside this module.
73pub mod project_root;
74pub mod prompt_facts;
75pub mod prompt_log;
76pub mod service;
77/// Single-pass startup pin-file scanner (issue #470).
78///
79/// Why: builds the `palace_id → project_path` map at daemon startup without
80/// eager palace opens. One readdir sweep over the standard search roots is
81/// cheaper than the O(#palaces) per-id loop used by the doctor path and runs
82/// before the first HTTP request arrives.
83/// What: exports `scan_pin_map` and `default_search_dirs`.
84/// Test: see `startup_scan::tests`.
85pub mod startup_scan;
86pub mod tools;
87pub mod transport;
88#[cfg(feature = "axum-server")]
89pub mod web;
90
91pub use activity::{ActivityEntry, ActivityFilter, ActivityLog, ActivitySource};
92pub use attribution::{CreatorInfo, CreatorSource};
93
/// Character budget for the trigger-prompt excerpt carried by a `HookFired`
/// event.
///
/// Why: the complete triggering prompt can be sensitive and is already
/// persisted in the JSONL prompt log, so the activity feed only needs a
/// one-line glance. Roughly 80 chars follows the existing
/// `drawer_content_preview` convention, which keeps dashboard rows uniform.
/// What: 80, counted in `char`s rather than bytes. [`hook_prompt_excerpt`]
/// returns at most this many chars, the last one being `…` when the prompt
/// had to be cut.
/// Test: `hook_excerpt_truncates_long_prompts`.
pub const HOOK_PROMPT_EXCERPT_CHARS: usize = 80;
104
105/// Reduce a triggering prompt to the short excerpt embedded on a
106/// `HookFired` activity event.
107///
108/// Why: see [`HOOK_PROMPT_EXCERPT_CHARS`]. Centralising the truncation rule
109/// keeps every emitter (HTTP, hook CLI handlers, future tests) producing
110/// the same preview shape so UI rendering is uniform.
111/// What: whitespace-collapses `prompt` and trims to
112/// [`HOOK_PROMPT_EXCERPT_CHARS`] chars with `…` when cut. Empty input
113/// returns an empty string.
114/// Test: `hook_excerpt_truncates_long_prompts`,
115/// `hook_excerpt_collapses_whitespace`.
116pub fn hook_prompt_excerpt(prompt: &str) -> String {
117 let normalised: String = prompt.split_whitespace().collect::<Vec<_>>().join(" ");
118 if normalised.chars().count() <= HOOK_PROMPT_EXCERPT_CHARS {
119 normalised
120 } else {
121 let kept: String = normalised
122 .chars()
123 .take(HOOK_PROMPT_EXCERPT_CHARS.saturating_sub(1))
124 .collect();
125 format!("{kept}…")
126 }
127}
128
129pub use mcp_service::MemoryMcpService;
130pub use tools::MemoryMcpServer;
131
/// Pick the directory that directly contains the per-palace subdirectories.
///
/// Why: two on-disk layouts exist. The current monorepo code treats the
/// registry directory itself as the parent of the per-palace dirs
/// (`<dir>/<id>/palace.json`), while the legacy standalone `trusty-memory`
/// repo nested them one level deeper (`<data_dir>/palaces/<id>/palace.json`)
/// — which is where existing installs keep their data. A daemon that used the
/// bare data dir as its registry root found zero palaces because every
/// `palace.json` sat one level below where it looked (the "palaces lost on
/// restart" bug).
/// What: returns `<data_dir>/palaces` when that path is an existing
/// directory, otherwise hands `data_dir` back unchanged. Resolving this once
/// in `main.rs` and using the result as `AppState::data_root` keeps every
/// call site (`status`, `palace_list`, `open_palace`, `palace_create`,
/// `load_palaces_from_disk`) consistent without forcing a data migration.
/// Test: `tests::resolve_palace_registry_dir_prefers_palaces_subdir` and
/// `resolve_palace_registry_dir_falls_back_to_data_dir`.
pub fn resolve_palace_registry_dir(data_dir: PathBuf) -> PathBuf {
    // Prefer the legacy nested layout only when it is really there on disk.
    Some(data_dir.join("palaces"))
        .filter(|legacy_nested| legacy_nested.is_dir())
        .unwrap_or(data_dir)
}
159
160/// Hook type — labels the Claude Code hook that triggered a submission.
161///
162/// Why: every hook firing produces an activity-feed entry tagged with the
163/// originating hook so operators can tell whether activity came from a user
164/// prompt (`UserPromptSubmit`), a new session (`SessionStart`), or a future
165/// hook variant. Threading this through `DaemonEvent::HookFired` lets the
166/// dashboard badge each row with the hook label.
167/// What: serde-serialised in PascalCase so the wire format matches Claude
168/// Code's own hook-name strings exactly (e.g. `"UserPromptSubmit"`).
169/// Test: `hook_type_serde_round_trips`.
170#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
171pub enum HookType {
172 /// Claude Code's `UserPromptSubmit` hook — fires on every user prompt.
173 UserPromptSubmit,
174 /// Claude Code's `SessionStart` hook — fires once at session open.
175 SessionStart,
176}
177
178impl HookType {
179 /// Stable string label used for the wire format.
180 pub fn as_str(&self) -> &'static str {
181 match self {
182 Self::UserPromptSubmit => "UserPromptSubmit",
183 Self::SessionStart => "SessionStart",
184 }
185 }
186}
187
188/// Injection kind — labels what the hook actually injected (or attempted).
189///
190/// Why: distinct from `HookType` because one hook could in principle render
191/// more than one kind of injection (e.g. SessionStart can deliver both an
192/// inbox check and bootstrap context). Tagging the rendered kind explicitly
193/// keeps the activity log searchable when that fan-out lands.
194/// What: serde-serialised as kebab-case so it matches the labels already
195/// used in the JSONL prompt log (`prompt-context-facts`,
196/// `inbox-check-messages`).
197/// Test: `injection_kind_serde_round_trips`.
198#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
199#[serde(rename_all = "kebab-case")]
200pub enum InjectionKind {
201 /// `prompt-context` hook rendered the prompt-facts block.
202 PromptContext,
203 /// `inbox-check` hook delivered unread messages.
204 InboxCheck,
205}
206
207impl InjectionKind {
208 /// Stable string label used for the wire format.
209 pub fn as_str(&self) -> &'static str {
210 match self {
211 Self::PromptContext => "prompt-context",
212 Self::InboxCheck => "inbox-check",
213 }
214 }
215}
216
217/// Live daemon events broadcast to connected SSE subscribers.
218///
219/// Why: The dashboard needs push-driven updates so palace creation, drawer
220/// add/delete, dream cycles, and aggregate status changes are visible without
221/// polling. A single broadcast channel fans out to every connected browser.
222/// What: Tagged enum serialized as `{"type": "...", ...fields}` over SSE.
223/// Test: `web::tests::sse_stream_emits_events` subscribes, triggers a
224/// mutation, and asserts the frame arrives.
225#[derive(Clone, Debug, serde::Serialize)]
226#[serde(tag = "type", rename_all = "snake_case")]
227pub enum DaemonEvent {
228 PalaceCreated {
229 id: String,
230 name: String,
231 /// Originating subsystem (HTTP, MCP, Hook). Why (issue #96): the
232 /// UI badges each row with its source so operators can tell at a
233 /// glance whether a write came from the dashboard form, an MCP
234 /// tool call, or a hook-driven path. The wire-format key is
235 /// `source` (lower-case strings via serde rename_all on
236 /// `ActivitySource`).
237 source: ActivitySource,
238 },
239 DrawerAdded {
240 palace_id: String,
241 /// Friendly palace name (Palace.name) at write time. Why: lets SSE
242 /// consumers (the dashboard activity feed) render the human-readable
243 /// label without a separate id→name lookup. Empty string if the
244 /// emitter could not resolve the name.
245 #[serde(default)]
246 palace_name: String,
247 drawer_count: usize,
248 /// Wall-clock timestamp when the drawer was added. Why: SSE
249 /// receivers want to render "just now / 2m ago" relative to the
250 /// daemon's clock, not the time the SSE frame happens to arrive.
251 timestamp: chrono::DateTime<chrono::Utc>,
252 /// Short preview of the drawer's content (whitespace-collapsed,
253 /// truncated to ~80 chars with an ellipsis when cut). Why: the TUI
254 /// activity feed and dashboard ticker want to show *what* was
255 /// stored, not just the running drawer count. Empty when the
256 /// emitter could not resolve the content (legacy clients tolerate
257 /// the missing field via `#[serde(default)]`).
258 #[serde(default)]
259 content_preview: String,
260 /// Originating subsystem (issue #96).
261 source: ActivitySource,
262 },
263 DrawerDeleted {
264 palace_id: String,
265 drawer_count: usize,
266 /// Originating subsystem (issue #96).
267 source: ActivitySource,
268 },
269 DreamCompleted {
270 palace_id: Option<String>,
271 merged: usize,
272 pruned: usize,
273 compacted: usize,
274 closets_updated: usize,
275 duration_ms: u64,
276 /// Originating subsystem (issue #96).
277 source: ActivitySource,
278 },
279 StatusChanged {
280 total_drawers: usize,
281 total_vectors: usize,
282 total_kg_triples: usize,
283 },
284 /// A Claude Code hook completed and rendered (or attempted to render) an
285 /// injection block.
286 ///
287 /// Why: pre-#XXX the activity feed only fired on drawer / palace / dream
288 /// writes, which meant a normal Claude Code session — whose only daemon
289 /// traffic is hook invocations — left the feed empty. Surfacing every
290 /// hook firing answers the user complaint "no activity in the TUI" and
291 /// gives operators a way to see how often each project palace is
292 /// actually picking up prompt-context / inbox-check work.
293 /// What: carries the resolved palace (or `None` if cwd resolution
294 /// failed), the [`HookType`] label, the [`InjectionKind`] label, the
295 /// rendered injection byte length, a short excerpt of the triggering
296 /// prompt (capped at ~80 chars; the full content stays in the JSONL
297 /// prompt log only), the timestamp, the hook's wall-clock duration,
298 /// and the [`ActivitySource`] tag (always `Hook` for this variant).
299 /// Backwards-compatible: SSE clients that do not recognise the
300 /// `hook_fired` `type` tag can safely ignore the frame.
301 HookFired {
302 /// Resolved palace id (slug) — `None` if cwd resolution failed.
303 #[serde(default)]
304 palace_id: Option<String>,
305 /// Friendly palace name at hook time — `None` if the registry
306 /// could not be consulted (HTTP path uses `palace_id` here when
307 /// no separate name is known).
308 #[serde(default)]
309 palace_name: Option<String>,
310 hook_type: HookType,
311 injection_kind: InjectionKind,
312 /// Rendered injection size in bytes (`0` when no injection was
313 /// emitted, e.g. SessionStart with an empty inbox).
314 injection_length: u64,
315 /// Short excerpt of the triggering prompt for the activity feed
316 /// display. Capped at ~80 chars with a trailing `…` when cut.
317 /// Why: the activity feed renders this directly; full prompt
318 /// content (which may be sensitive) stays in the JSONL log.
319 #[serde(default)]
320 trigger_prompt_excerpt: String,
321 timestamp: chrono::DateTime<chrono::Utc>,
322 /// Hook wall-clock duration in milliseconds.
323 duration_ms: u64,
324 /// Always `ActivitySource::Hook` for this variant; encoded explicitly
325 /// so the same dispatch path (`emit`) can persist + broadcast it.
326 source: ActivitySource,
327 },
328}
329
330/// Open the activity log under `data_root`, falling back to a per-process
331/// tempdir and finally to a no-op `Discard` variant when no writable
332/// directory is available.
333///
334/// Why (issues #96, #225): the activity log is a best-effort feature — if
335/// the data root is on a read-only mount, missing, or locked by another
336/// process, the daemon should still come up and serve every other endpoint.
337/// The first fallback is a `std::env::temp_dir()`-anchored subdirectory
338/// keyed by the daemon's process id. Issue #225: a previous version called
339/// `expect()` on the tempdir fallback, which crashed the daemon on hosts
340/// where neither `data_root` nor `std::env::temp_dir()` is writable
341/// (read-only containers, locked-down sandboxes). The contract is
342/// "best-effort", so the final fallback is now `ActivityLog::discard()` —
343/// a no-op variant that drops every append and returns empty reads. The
344/// dashboard's activity feed simply shows up empty in that degraded state.
345/// What: tries `ActivityLog::open(data_root)`; on error logs a warning and
346/// retries against `<temp>/trusty-memory-activity-<pid>/`. If both fail,
347/// emits a final warning and returns `ActivityLog::discard()`.
348/// Test: `open_activity_log_with_fallback_returns_discard_when_unwritable`
349/// covers the discard branch; existing `AppState` construction tests cover
350/// the happy and tempdir-fallback paths.
351fn open_activity_log_with_fallback(data_root: &Path) -> Arc<ActivityLog> {
352 match ActivityLog::open(data_root) {
353 Ok(log) => Arc::new(log),
354 Err(primary_err) => {
355 tracing::warn!(
356 "could not open activity log at {}: {primary_err:#}; falling back to per-process tempdir",
357 data_root.display()
358 );
359 let fallback =
360 std::env::temp_dir().join(format!("trusty-memory-activity-{}", std::process::id()));
361 match ActivityLog::open(&fallback) {
362 Ok(log) => Arc::new(log),
363 Err(fallback_err) => {
364 tracing::warn!(
365 "activity log tempdir fallback at {} also failed: {fallback_err:#}; \
366 activity feed disabled for this process (no-op log)",
367 fallback.display()
368 );
369 Arc::new(ActivityLog::discard())
370 }
371 }
372 }
373 }
374}
375
376impl DaemonEvent {
377 /// Short discriminant label matching the SSE `type` field.
378 ///
379 /// Why: the persisted activity log stores `event_type` as a string so
380 /// the UI can render the row without re-parsing the payload. Sharing
381 /// the same labels the SSE serializer uses keeps the wire and the
382 /// stored history consistent.
383 /// What: returns one of `palace_created`, `drawer_added`,
384 /// `drawer_deleted`, `dream_completed`, `status_changed`.
385 /// Test: `daemon_event_type_str_matches_sse_tag` in the lib tests.
386 pub fn type_str(&self) -> &'static str {
387 match self {
388 Self::PalaceCreated { .. } => "palace_created",
389 Self::DrawerAdded { .. } => "drawer_added",
390 Self::DrawerDeleted { .. } => "drawer_deleted",
391 Self::DreamCompleted { .. } => "dream_completed",
392 Self::StatusChanged { .. } => "status_changed",
393 Self::HookFired { .. } => "hook_fired",
394 }
395 }
396
397 /// `palace_id` if the event is scoped to a single palace.
398 ///
399 /// Why: the activity log indexes entries by palace id so the UI can
400 /// filter by palace; daemon-wide events (`status_changed`,
401 /// dream-across-all-palaces) return `None`.
402 /// What: returns a borrowed string when the variant carries a palace
403 /// id, otherwise `None`.
404 /// Test: `daemon_event_palace_id_extraction`.
405 pub fn palace_id(&self) -> Option<&str> {
406 match self {
407 Self::PalaceCreated { id, .. } => Some(id),
408 Self::DrawerAdded { palace_id, .. } | Self::DrawerDeleted { palace_id, .. } => {
409 Some(palace_id)
410 }
411 Self::DreamCompleted { palace_id, .. } => palace_id.as_deref(),
412 Self::HookFired { palace_id, .. } => palace_id.as_deref(),
413 Self::StatusChanged { .. } => None,
414 }
415 }
416
417 /// Originating subsystem if the event carries one.
418 ///
419 /// Why: only mutation events carry a `source`; the aggregate
420 /// `StatusChanged` is recomputed by the daemon and has no caller, so
421 /// it returns `None`.
422 /// What: returns the variant's `source` field where present.
423 /// Test: `daemon_event_source_extraction`.
424 pub fn source(&self) -> Option<ActivitySource> {
425 match self {
426 Self::PalaceCreated { source, .. }
427 | Self::DrawerAdded { source, .. }
428 | Self::DrawerDeleted { source, .. }
429 | Self::DreamCompleted { source, .. }
430 | Self::HookFired { source, .. } => Some(*source),
431 Self::StatusChanged { .. } => None,
432 }
433 }
434}
435
436/// Shared application state passed to every request handler.
437///
438/// Why: The stdio loop and HTTP server need the same handles to the registry,
439/// data root, and embedder so MCP tools can perform real reads/writes against
440/// the live trusty-memory core. The embedder is heavy (loads ONNX weights) so
441/// we hold it behind a `OnceCell` and initialize lazily on first use.
442/// What: `Clone`-able via `Arc` fields. The registry / data root are eager;
443/// `embedder` is `Arc<OnceCell<Arc<FastEmbedder>>>` so concurrent first-use
444/// races resolve to a single shared instance.
445/// Test: `app_state_default_constructs` confirms construction without panic.
446#[derive(Clone)]
447pub struct AppState {
448 pub version: String,
449 pub registry: Arc<PalaceRegistry>,
450 pub data_root: PathBuf,
451 pub embedder: Arc<OnceCell<Arc<FastEmbedder>>>,
452 /// Optional default palace applied to MCP tool calls when the caller
453 /// omits the `palace` argument. Set via `trusty-memory serve --palace`.
454 pub default_palace: Option<String>,
455 /// Active chat provider selected at startup. `None` means no upstream is
456 /// configured (no Ollama detected and no OpenRouter key) — callers must
457 /// degrade gracefully (chat endpoint returns 412).
458 pub chat_provider: Arc<OnceCell<Option<Arc<dyn ChatProvider>>>>,
459 /// Per-palace chat-session stores, opened lazily so cold-start cost is
460 /// paid only when chat-history endpoints are hit.
461 pub session_stores: Arc<dashmap::DashMap<String, Arc<ChatSessionStore>>>,
462 /// Broadcast sender for live `DaemonEvent` pushes to SSE subscribers.
463 ///
464 /// Why: Lets mutating handlers emit events that any connected dashboard
465 /// receives instantly. Cap of 128 buffers transient slow readers; if a
466 /// receiver lags it gets `RecvError::Lagged` and we emit a `lag` frame.
467 pub events: Arc<broadcast::Sender<DaemonEvent>>,
468 /// Instant the daemon started, used to compute `uptime_secs` on `/health`.
469 ///
470 /// Why (issue #35): `GET /health` reports how long the daemon has been
471 /// up. Capturing a monotonic `Instant` at `AppState` construction lets the
472 /// handler compute the elapsed seconds cheaply and without a clock-skew
473 /// hazard.
474 /// What: a wall-monotonic `Instant`; `AppState::new` stamps it at startup.
475 /// Test: `health_endpoint_includes_resource_fields`.
476 pub started_at: std::time::Instant,
477 /// In-memory ring buffer of recent tracing log lines (issue #35).
478 ///
479 /// Why: the `GET /api/v1/logs/tail` endpoint serves the last N log lines
480 /// so operators can inspect a running daemon without tailing a file. The
481 /// buffer is shared between the tracing `LogBufferLayer` (writer) and the
482 /// HTTP handler (reader).
483 /// What: a cheap `Arc`-backed clone of the buffer the subscriber writes
484 /// to. Defaults to an empty buffer for states that never install the
485 /// layer (tests, the stdio path).
486 /// Test: `logs_tail_returns_recent_lines`.
487 pub log_buffer: trusty_common::log_buffer::LogBuffer,
488 /// Bug-capture ERROR store (bug-reporting #478, Phase 1).
489 ///
490 /// Why: Phase 2 MCP / HTTP endpoints need to query captured errors; stashing
491 /// the `ErrorStore` handle here lets any handler reach it cheaply without
492 /// a second global or per-request construction.
493 /// What: populated by `run_serve` from the `init_tracing_with_buffer_and_capture`
494 /// result; the layer writes to this store automatically so every
495 /// `tracing::error!` call site contributes without any changes to call
496 /// sites. `None` in states that do not install the layer (tests, the
497 /// stdio path).
498 /// Test: compile-presence is verified by the `trusty-memory` build; Phase 2
499 /// will add query tests in `web.rs`.
500 pub error_store: Option<trusty_common::error_capture::ErrorStore>,
501 /// Most recent on-disk footprint of `data_root`, in bytes (issue #35).
502 ///
503 /// Why: `GET /health` reports `disk_bytes`. Walking the data directory on
504 /// every health request would make a frequent health poll do unbounded
505 /// I/O; a background task recomputes it every 10 s and stores it here so
506 /// the handler reads it lock-free.
507 /// What: an `AtomicU64` updated by the ticker spawned in `run_http_on`.
508 /// `0` until the first walk completes.
509 /// Test: `health_endpoint_includes_resource_fields`.
510 pub disk_bytes: Arc<std::sync::atomic::AtomicU64>,
511 /// Per-process RSS + CPU sampler, refreshed on each `/health` request
512 /// (issue #35).
513 ///
514 /// Why: CPU usage is a delta between two `sysinfo` refreshes, so the
515 /// sampler must persist between requests — hence the shared `Mutex`.
516 /// What: a `tokio::sync::Mutex<SysMetrics>` so the async health handler
517 /// can sample without blocking the runtime.
518 /// Test: `health_endpoint_includes_resource_fields`.
519 pub sys_metrics: Arc<tokio::sync::Mutex<trusty_common::sys_metrics::SysMetrics>>,
520 /// HTTP listener address the daemon bound to, once `run_http_on` is running.
521 ///
522 /// Why: clients (and `/health` responses) need to advertise the live
523 /// `host:port` even though port selection happens dynamically (7070–7079
524 /// walk + OS fallback). Stashing it on `AppState` lets request handlers
525 /// surface the discovery value without re-querying the listener.
526 /// What: a `OnceLock<SocketAddr>` so `run_http_on` writes it exactly once
527 /// at bind time and every handler reads it lock-free thereafter. Empty
528 /// (`None` from `get()`) on the stdio path where no listener exists.
529 /// Test: `health_endpoint_reports_bound_addr` (added below).
530 pub bound_addr: Arc<OnceLock<SocketAddr>>,
531 /// Cached prompt-facts surface served by the MCP `get_prompt_context`
532 /// tool (issue #42).
533 ///
534 /// Why: The original session-init `prompts/get` design loaded context
535 /// once per connection; switching to a per-message tool lets the model
536 /// pull fresh, query-filtered context on demand. The cache holds both
537 /// the raw triples (for filtered lookups) and a pre-formatted Markdown
538 /// block (for the unfiltered hot path) so neither code path re-walks
539 /// the KG. The cache is rebuilt by
540 /// `prompt_facts::rebuild_prompt_cache` after any write that touches a
541 /// hot predicate (`kg_assert`, `add_alias`, `remove_prompt_fact`).
542 /// What: An `Arc<tokio::sync::RwLock<PromptFactsCache>>` so the hot
543 /// read path takes a brief read lock and clones the cache; rebuilds
544 /// take a write lock for the assignment only. The async-aware lock
545 /// (issue #229) yields to the tokio runtime instead of blocking a
546 /// runtime thread for the rebuild duration. An empty `triples` vec ↔
547 /// "no context stored yet" (the tool handler renders a hint).
548 /// Test: `get_prompt_context_returns_cached_or_hint`,
549 /// `get_prompt_context_filters_by_query`.
550 pub prompt_context_cache: Arc<RwLock<prompt_facts::PromptFactsCache>>,
551 /// Persistent activity log (issue #96).
552 ///
553 /// Why: the dashboard activity feed used to be a pure live-stream over
554 /// `/sse` — opening the UI showed an empty feed and any mutation from
555 /// the MCP path was invisible. Holding an `ActivityLog` on `AppState`
556 /// lets `emit` record an entry on every push so the
557 /// `GET /api/v1/activity` handler can return historical rows on mount
558 /// and the live SSE stream can continue prepending events on top of
559 /// the loaded history. `None` on builds that opt out (tests that use
560 /// `AppState::new` get a real log under their tempdir so behaviour
561 /// matches production).
562 /// What: an `Arc<ActivityLog>` shared with every emitter.
563 /// Test: `web::tests::activity_endpoint_lists_recent_emits`.
564 pub activity_log: Arc<ActivityLog>,
565 /// Optional per-palace BM25 lexical search lane (issue #156).
566 ///
567 /// Why: in-process BM25 would serialise the recall hot path on disk
568 /// I/O during writes and contend with the redb/usearch locks. Delegating
569 /// to the `trusty-bm25-daemon` subprocess (one socket per palace) keeps
570 /// BM25 ingestion and search off the critical path while still feeding
571 /// hits into the recall RRF fusion.
572 /// What: `Some(client)` only when `TRUSTY_BM25_DAEMON=1` at startup —
573 /// every code path that uses this field is gated on `is_some()` and
574 /// falls back to vector-only behaviour otherwise so existing deployments
575 /// see zero behavioural change.
576 /// Test: `bm25_client_disabled_by_default`,
577 /// `bm25_client_enabled_when_env_set`.
578 pub bm25_client: Option<Arc<Bm25Client>>,
579 /// Optional per-palace BM25 daemon spawn supervisor (issue #193).
580 ///
581 /// Why: without an in-process supervisor the BM25 daemon must be
582 /// launched out-of-band (launchd, manual `trusty-bm25-daemon`), which
583 /// is the same UX trap PR #190 fixed for trusty-embedderd. Holding a
584 /// supervisor here lets us spawn the daemon on first BM25 use for a
585 /// palace, restart it if it dies, and reap it on clean shutdown.
586 /// `Some` only when `TRUSTY_BM25_DAEMON=1` at startup — the same gate
587 /// that enables `bm25_client`. When set but `TRUSTY_BM25_EXTERNAL=1`,
588 /// the supervisor's `ensure_running` becomes a no-op that just returns
589 /// the canonical socket path so operators can keep using their own
590 /// process manager.
591 /// Test: covered by `bm25_supervisor_present_when_env_set` and the
592 /// `bm25_supervisor::tests` unit tests.
593 pub bm25_supervisor: Option<Arc<bm25_supervisor::Bm25Supervisor>>,
594 /// Per-palace write serialisation locks (issue #230).
595 ///
596 /// Why: the dedup gate in `tools.rs` previously read a snapshot of
597 /// existing drawers, checked for near-duplicates via Jaro-Winkler, and
598 /// then issued the write — a classic time-of-check/time-of-use race.
599 /// Two concurrent `memory_remember` calls with the same content could
600 /// both see the pre-write snapshot, both pass the gate, and both land
601 /// duplicate drawers. Serialising the gate-then-write sequence per
602 /// palace closes the window: while one task holds the mutex, any
603 /// concurrent writer for the same palace blocks until the first write
604 /// finishes and is visible to `list_drawers`. The lock is **per
605 /// palace** (not global) so writes to different palaces continue to
606 /// run in parallel.
607 /// What: a `DashMap` keyed by palace id, where each entry is an
608 /// `Arc<tokio::sync::Mutex<()>>`. The mutex is constructed lazily by
609 /// `palace_write_lock` on first access. `Arc` lets callers hold a
610 /// clone of the lock past the lifetime of the `DashMap` entry so the
611 /// map never needs to be held across an `.await`.
612 /// Test: `tools::tests::dedup_gate_blocks_concurrent_duplicate_writes`.
613 pub palace_write_locks: Arc<dashmap::DashMap<String, Arc<tokio::sync::Mutex<()>>>>,
614 /// Counter of in-flight activity-log writes spawned by `emit`
615 /// (issue #232).
616 ///
617 /// Why: `emit` offloads the synchronous redb append to the tokio blocking
618 /// pool via `spawn_blocking` so the async runtime is never parked waiting
619 /// on fsync. The write is fire-and-forget — `emit` returns immediately
620 /// after spawning. Tests that observe the activity log right after a
621 /// burst of `emit` calls need a deterministic synchronization point;
622 /// holding an in-flight counter lets `flush_activity_writes` poll until
623 /// every spawned append has settled, which keeps the assertions
624 /// race-free without forcing every caller to `.await`.
625 /// What: an `Arc<AtomicUsize>` incremented before each `spawn_blocking`
626 /// and decremented inside the closure (after the append completes, even
627 /// if it errored). The counter is cheap (one atomic add per emit) and
628 /// stays at zero in steady-state production traffic.
629 /// Test: `web::tests::activity_endpoint_lists_recent_emits` and
630 /// `tests::emit_persists_mutations_but_skips_status_changed` call
631 /// `flush_activity_writes` to drain the counter before reading the log.
632 pub pending_activity_writes: Arc<AtomicUsize>,
633 /// In-memory cache mapping palace id → `Palace.name` (issue #228).
634 ///
635 /// Why: every `memory_remember` / `memory_note` write used to call
636 /// `PalaceRegistry::list_palaces` (a synchronous filesystem walk of the
637 /// data root) just to resolve a friendly palace name for the SSE
638 /// `DrawerAdded` event. With N palaces on disk the cost was O(N) opendirs
639 /// plus `palace.json` reads on every write, blocking the async runtime.
640 /// Caching the name in-memory turns the lookup into a `DashMap::get`.
641 /// What: `DashMap<String, String>` populated by `create_palace` and
642 /// `load_palaces_from_disk`, kept in sync by rename / delete paths.
643 /// Missing entries are treated as "name unknown" so callers fall back to
644 /// the palace id and the emit path never fails.
645 /// Test: `palace_name_cache_populated_after_hydration` and
646 /// `palace_name_cache_updates_on_create`.
647 pub palace_names: Arc<dashmap::DashMap<String, String>>,
648 /// Single-pass startup pin-file map: palace id → project root path (issue #470).
649 ///
650 /// Why: after daemon startup we have no record of which on-disk project
651 /// directories correspond to which palace ids — that information only
652 /// existed inside the pin files on disk. Eager-opening every palace on
653 /// startup is too expensive. This field captures the scan-only result of
654 /// `startup_scan::scan_pin_map` so handlers that want to locate a project
655 /// by its palace id (e.g. future cwd-inference, project-health checks)
656 /// can do a single `DashMap::get` instead of a filesystem walk.
657 /// Populated once, shortly after `load_palaces_from_disk` returns, by
658 /// `spawn_startup_tasks`. Never mutated after population — it is a
659 /// snapshot of what the filesystem looked like at startup.
660 /// What: `DashMap<String (palace_id), PathBuf (project root)>`.
661 /// The outer `Arc` lets `spawn_startup_tasks` (which holds only a clone
662 /// of `AppState`) write to the same backing map that request handlers
663 /// read. Population is asynchronous so callers must treat an absent entry
664 /// as "not yet scanned" (or "no pin found"), never as "palace unknown".
665 /// Test: `startup_scan::tests::scan_pin_map_*` validate the underlying
666 /// scanner function; the wiring in `spawn_startup_tasks` is covered by
667 /// the integration-test daemon start path.
668 pub pin_project_map: Arc<dashmap::DashMap<String, PathBuf>>,
669 /// Bounded sender for the BM25 index worker (issue #231).
670 ///
671 /// Why: the previous fire-and-forget design `tokio::spawn`ed one task per
672 /// `memory_remember` / `memory_note` call, so a write burst against a slow
673 /// or unreachable BM25 daemon grew an unbounded in-flight task queue. A
674 /// single long-lived worker draining a bounded mpsc channel caps that
675 /// back-pressure: writers `try_send` (never block), full-queue requests
676 /// are dropped with a `warn!`, and the worker exits cleanly when the last
677 /// sender is dropped on shutdown.
678 /// What: an `mpsc::Sender` cloned to every `AppState` clone (cheap). The
679 /// matching receiver is consumed by the worker spawned in
680 /// [`AppState::new`] via [`tools::spawn_bm25_index_worker`]. Capacity is
681 /// [`tools::BM25_INDEX_QUEUE_CAPACITY`] (256).
682 /// Test: `bm25_index_queue_drops_when_full` exercises the full-queue
683 /// branch via `bm25_index_enqueue`.
684 pub bm25_index_tx: tokio::sync::mpsc::Sender<tools::Bm25IndexRequest>,
685 /// Cached result of the startup update check (issue #537).
686 ///
687 /// Why: `/health` should report `update_available` without hitting crates.io
688 /// on every probe. A single background check at daemon startup stores the
689 /// result here; the health handler reads it lock-free (well, a brief mutex
690 /// lock) without a network call.
691 /// What: `None` = up-to-date or check not yet done; `Some("x.y.z")` = newer
692 /// version available. The field is populated by a `tokio::spawn` in
693 /// `spawn_startup_tasks` (main.rs) after the daemon binds.
694 /// Test: indirectly by the `/health` endpoint tests in `web.rs`.
695 pub update_available: Arc<std::sync::Mutex<Option<String>>>,
696}
697
698impl AppState {
699 /// Construct an `AppState` rooted at the given on-disk data directory.
700 ///
701 /// Why: The CLI (`serve`) and integration tests need to point the MCP
702 /// server at different roots — production at `dirs::data_dir`, tests at a
703 /// `tempfile::tempdir()`.
704 /// What: Builds an empty `PalaceRegistry`, captures the version, and
705 /// allocates an empty `OnceCell` for the embedder. `default_palace` is
706 /// `None`; use `with_default_palace` to set it.
707 /// Test: `tools::tests::dispatch_palace_create_persists` constructs an
708 /// AppState pointed at a tempdir and round-trips a palace through it.
709 pub fn new(data_root: PathBuf) -> Self {
710 let (events_tx, _) = broadcast::channel::<DaemonEvent>(128);
711 // Issue #96: open (or create) the persistent activity log under the
712 // daemon data root. Open failure is logged but never crashes the
713 // daemon — we fall back to a per-process tempdir so emits remain
714 // best-effort and the rest of the daemon keeps working.
715 let activity_log = open_activity_log_with_fallback(&data_root);
716 // Issue #231: bounded mpsc channel + single long-lived worker
717 // replaces the per-write `tokio::spawn` fire-and-forget pattern so
718 // BM25 indexing back-pressure is capped. The worker is spawned here
719 // unconditionally so the channel always has a drain — even when
720 // `bm25_client` is `None`, the worker just consumes and discards
721 // each request so senders never block on a full queue.
722 let (bm25_index_tx, bm25_index_rx) =
723 tokio::sync::mpsc::channel::<tools::Bm25IndexRequest>(tools::BM25_INDEX_QUEUE_CAPACITY);
724 // `bm25_client` / `bm25_supervisor` start as `None`; the builder
725 // `with_bm25_client_from_env` rebuilds the worker with the real
726 // client + supervisor once env-gated opt-in is resolved.
727 tools::spawn_bm25_index_worker(bm25_index_rx, None, None);
728 Self {
729 version: env!("CARGO_PKG_VERSION").to_string(),
730 registry: Arc::new(PalaceRegistry::new()),
731 data_root,
732 embedder: Arc::new(OnceCell::new()),
733 default_palace: None,
734 chat_provider: Arc::new(OnceCell::new()),
735 session_stores: Arc::new(dashmap::DashMap::new()),
736 events: Arc::new(events_tx),
737 started_at: std::time::Instant::now(),
738 // Default to an empty buffer — `with_log_buffer` overrides this
739 // when the daemon installs the `LogBufferLayer` (HTTP mode).
740 log_buffer: trusty_common::log_buffer::LogBuffer::new(
741 trusty_common::log_buffer::DEFAULT_LOG_CAPACITY,
742 ),
743 // Bug-reporting #478: `None` until `with_error_store` is called
744 // during daemon startup (HTTP mode). Tests keep `None` so no
745 // unexpected files are written to the OS data dir.
746 error_store: None,
747 disk_bytes: Arc::new(std::sync::atomic::AtomicU64::new(0)),
748 sys_metrics: Arc::new(tokio::sync::Mutex::new(
749 trusty_common::sys_metrics::SysMetrics::new(),
750 )),
751 bound_addr: Arc::new(OnceLock::new()),
752 prompt_context_cache: Arc::new(RwLock::new(prompt_facts::PromptFactsCache::default())),
753 activity_log,
754 bm25_client: None,
755 bm25_supervisor: None,
756 palace_write_locks: Arc::new(dashmap::DashMap::new()),
757 pending_activity_writes: Arc::new(AtomicUsize::new(0)),
758 palace_names: Arc::new(dashmap::DashMap::new()),
759 pin_project_map: Arc::new(dashmap::DashMap::new()),
760 bm25_index_tx,
761 update_available: Arc::new(std::sync::Mutex::new(None)),
762 }
763 }
764
765 /// Acquire (lazily, then clone) the per-palace write mutex.
766 ///
767 /// Why (issue #230): the dedup-check + `remember_with_options` write
768 /// sequence in `tools.rs` must be atomic per palace to prevent two
769 /// concurrent identical writes from both passing the dedup gate.
770 /// Callers hold the returned `Arc<Mutex<()>>`'s guard across the gate
771 /// check and the write so the second writer blocks until the first
772 /// write is visible to `list_drawers`. Returning a clone of the `Arc`
773 /// rather than a borrow into the `DashMap` lets the caller `.await`
774 /// while holding the lock without risking a deadlock against any
775 /// future map mutation (DashMap shards are sync mutexes).
776 /// What: looks up the palace id in `palace_write_locks` and returns
777 /// a clone of the existing mutex; on the first call for a palace,
778 /// inserts a freshly-constructed `tokio::sync::Mutex<()>` first. The
779 /// `DashMap::entry().or_insert_with` API guarantees the lazy
780 /// construction is racy-safe — only one mutex is ever inserted per
781 /// palace id.
782 /// Test: `tools::tests::dedup_gate_blocks_concurrent_duplicate_writes`.
783 pub fn palace_write_lock(&self, palace_id: &str) -> Arc<tokio::sync::Mutex<()>> {
784 if let Some(existing) = self.palace_write_locks.get(palace_id) {
785 return existing.clone();
786 }
787 self.palace_write_locks
788 .entry(palace_id.to_string())
789 .or_insert_with(|| Arc::new(tokio::sync::Mutex::new(())))
790 .clone()
791 }
792
793 /// Look up a project root path by palace id in the startup pin-scan map.
794 ///
795 /// Why: provides a stable, cheap accessor so handlers do not reach directly
796 /// into the `DashMap` field and so the accessor can be mocked in future
797 /// tests without touching `AppState` internals. The map is populated
798 /// asynchronously by `spawn_startup_tasks` — an absent entry means either
799 /// the scan has not completed yet or no pin file claimed that id.
800 /// What: returns `Some(project_path)` when the palace id was found during
801 /// startup scan; `None` otherwise.
802 /// Test: covered indirectly via the startup-scan integration path; the
803 /// underlying map data is validated by `startup_scan::tests`.
804 pub fn pinned_project_path(&self, palace_id: &str) -> Option<PathBuf> {
805 self.pin_project_map.get(palace_id).map(|e| e.clone())
806 }
807
808 /// Builder-style: opt-in to the BM25 lexical lane (issue #156).
809 ///
810 /// Why: the BM25 subprocess is gated behind `TRUSTY_BM25_DAEMON=1` so
811 /// the default `cargo install trusty-memory` / launchd plist deployment
812 /// stays vector-only and existing test fixtures keep passing without
813 /// having to provision a daemon. Reading the env var here keeps the
814 /// gating logic in one place (the helper in `main.rs` just plumbs the
815 /// result through).
816 /// What: when `TRUSTY_BM25_DAEMON=1`, constructs one `Bm25Client` per
817 /// palace by lazy-resolving the socket path the first time the palace
818 /// id is observed. Currently we install a shared `default` client up
819 /// front and re-key on the palace id at the call site — palaces with no
820 /// daemon socket simply see search/index errors which we log + ignore.
821 /// Returns `self` unchanged when the env var is unset or set to anything
822 /// other than `1`.
823 /// Test: `bm25_client_disabled_by_default`,
824 /// `bm25_client_enabled_when_env_set`.
825 #[must_use]
826 pub fn with_bm25_client_from_env(mut self) -> Self {
827 if std::env::var("TRUSTY_BM25_DAEMON").as_deref() == Ok("1") {
828 // Install the default-palace client; per-palace clients are
829 // constructed on demand via `Bm25Client::for_palace`.
830 let default_palace = self.default_palace.as_deref().unwrap_or("default");
831 self.bm25_client = Some(Arc::new(Bm25Client::for_palace(default_palace)));
832 // Issue #193: hand-in-hand with the client, attach a spawn
833 // supervisor so the BM25 daemon is auto-started on first use
834 // for any palace. Operators who want to manage daemons
835 // out-of-band (launchd, systemd, manual) set
836 // TRUSTY_BM25_EXTERNAL=1 which makes the supervisor a no-op.
837 self.bm25_supervisor = Some(Arc::new(bm25_supervisor::Bm25Supervisor::new()));
838 // Issue #231: rebuild the bounded indexer channel + worker so
839 // the worker holds the now-populated client + supervisor. The
840 // placeholder worker installed by `AppState::new` (with `None`
841 // / `None`) drained the channel into the void — replacing the
842 // sender here closes the placeholder receiver and the
843 // placeholder worker exits cleanly. The new worker takes over
844 // as the sole drain for the indexer queue.
845 let (tx, rx) = tokio::sync::mpsc::channel::<tools::Bm25IndexRequest>(
846 tools::BM25_INDEX_QUEUE_CAPACITY,
847 );
848 tools::spawn_bm25_index_worker(
849 rx,
850 self.bm25_client.clone(),
851 self.bm25_supervisor.clone(),
852 );
853 self.bm25_index_tx = tx;
854 tracing::info!(
855 palace = default_palace,
856 "BM25 daemon client + spawn supervisor enabled (TRUSTY_BM25_DAEMON=1)"
857 );
858 }
859 self
860 }
861
862 /// Scan the palace registry directory and re-register every persisted
863 /// palace into the in-memory [`PalaceRegistry`].
864 ///
865 /// Why: `AppState::new` builds an *empty* registry, so after a daemon
866 /// restart `palace_list` / the dashboard reported zero palaces even though
867 /// dozens existed on disk — palace metadata was persisted by
868 /// `palace_create` but never re-hydrated on startup. This method closes
869 /// that gap by walking the on-disk layout (each subdirectory holding a
870 /// `palace.json` is one palace) and rebuilding a live `PalaceHandle` for
871 /// each, so recall paths see the full set immediately after a restart.
872 /// What: runs the blocking filesystem walk + per-palace `PalaceHandle::open`
873 /// on a `spawn_blocking` thread (so it never stalls the async runtime),
874 /// registers each successfully opened palace via `register_arc`, logs every
875 /// load at `debug!`, and returns the count loaded. A palace that fails to
876 /// open (corrupt index, unreadable `kg.db`, etc.) is logged at `warn!` and
877 /// skipped — one bad palace must not abort startup or crash the daemon.
878 /// `data_root` is expected to already be the palace registry directory —
879 /// `main.rs` resolves it via [`resolve_palace_registry_dir`] before
880 /// constructing the `AppState`, so the flat / legacy-`palaces/` layout
881 /// difference is handled exactly once.
882 /// Test: `tests::load_palaces_from_disk_rehydrates_registry` writes two
883 /// palaces into a tempdir, constructs an `AppState`, calls this method, and
884 /// asserts the returned count and registry contents.
885 pub async fn load_palaces_from_disk(&self) -> Result<usize> {
886 let registry_dir = self.data_root.clone();
887 let registry = self.registry.clone();
888 let palace_names = self.palace_names.clone();
889 // The directory walk and each `PalaceHandle::open` perform blocking
890 // filesystem + redb/usearch I/O — run the whole hydration on the
891 // blocking pool so it never parks an async worker thread.
892 let count = tokio::task::spawn_blocking(move || -> Result<usize> {
893 let palaces = PalaceRegistry::list_palaces(®istry_dir)?;
894 let total = palaces.len();
895 let mut loaded = 0usize;
896 let mut skipped = 0usize;
897 for palace in palaces {
898 match trusty_common::memory_core::PalaceHandle::open(&palace) {
899 Ok(handle) => {
900 tracing::debug!(
901 palace = %palace.id,
902 data_dir = %palace.data_dir.display(),
903 "loaded palace from disk"
904 );
905 // Issue #228: seed the in-memory name cache so write
906 // hot paths (memory_remember / memory_note) can resolve
907 // the friendly palace name without re-walking the data
908 // root. Insert here (during hydration) is the single
909 // point of truth for restart-time population.
910 palace_names.insert(palace.id.0.clone(), palace.name.clone());
911 registry.register_arc(handle);
912 loaded += 1;
913 }
914 Err(e) => {
915 // Why (issue #467): a single bad palace (corrupt kg.db,
916 // stale WAL, EMFILE — "Too many open files", permissions)
917 // must never abort startup or block the HTTP server from
918 // binding. Log per-palace and keep going; the summary
919 // below tells operators how many were skipped without
920 // trawling the log.
921 // The palace is NOT registered in the in-memory registry,
922 // so the next `open_palace` call for this id will attempt
923 // a fresh open from disk — the lazy-reopen path. If the
924 // root cause was EMFILE and the fd-limit fix (#462) raised
925 // the soft limit to 8192, that first request will succeed.
926 tracing::warn!(
927 palace = %palace.id,
928 data_dir = %palace.data_dir.display(),
929 "skipping palace during startup hydration: {e:#}; \
930 will retry lazily on first access"
931 );
932 skipped += 1;
933 }
934 }
935 }
936 tracing::info!(
937 "palace hydration summary: loaded {loaded}/{total} ({skipped} skipped due to errors)"
938 );
939 Ok(loaded)
940 })
941 .await
942 .map_err(|e| anyhow::anyhow!("join load_palaces_from_disk: {e}"))??;
943 Ok(count)
944 }
945
946 /// Builder-style: attach the daemon's shared [`LogBuffer`] so the
947 /// `GET /api/v1/logs/tail` endpoint serves the same lines the tracing
948 /// subscriber captures (issue #35).
949 ///
950 /// Why: `main` builds the buffer (via `init_tracing_with_buffer`) before
951 /// constructing the `AppState`, then hands a clone here so the HTTP
952 /// handler and the tracing layer observe the same ring.
953 /// What: replaces the empty default buffer with the supplied one.
954 /// Test: `logs_tail_returns_recent_lines`.
955 #[must_use]
956 pub fn with_log_buffer(mut self, buffer: trusty_common::log_buffer::LogBuffer) -> Self {
957 self.log_buffer = buffer;
958 self
959 }
960
961 /// Builder-style: attach the bug-capture `ErrorStore` handle (bug-reporting #478).
962 ///
963 /// Why: Phase 2 MCP / HTTP endpoints need a handle to the in-memory error
964 /// ring so they can serve `recent_errors` / `errors_by_fingerprint`
965 /// without disk I/O on the hot path. Installing it here — rather than
966 /// adding it as a separate global — keeps the state graph explicit and
967 /// lets tests skip it by never calling this method.
968 /// What: stores `Some(store)` in `AppState::error_store`; the `BugCaptureLayer`
969 /// that writes to this store is already installed in the tracing
970 /// subscriber by `init_tracing_with_buffer_and_capture`. The store is
971 /// `Clone` (cheap `Arc` clone internally) so both the layer and this
972 /// field share the same underlying ring.
973 /// Test: Phase 2 will add `error_store_captures_and_queries` in `web.rs`.
974 #[must_use]
975 pub fn with_error_store(mut self, store: trusty_common::error_capture::ErrorStore) -> Self {
976 self.error_store = Some(store);
977 self
978 }
979
980 /// Send a `DaemonEvent` to all connected SSE subscribers and persist
981 /// it to the activity log when the variant carries a source.
982 ///
983 /// Why: Mutating handlers call this after a successful write so the
984 /// dashboard can update without polling. The send is best-effort —
985 /// `broadcast::Sender::send` returns `Err` only when there are no live
986 /// receivers, which is fine (no listeners == no work to do). Issue
987 /// #96 additionally writes the entry to the persistent activity log
988 /// so the feed can serve historical rows on page load and so MCP /
989 /// HTTP / Hook origins are visible to the operator. Persistence is
990 /// also best-effort — a write failure is logged but never blocks the
991 /// SSE broadcast.
992 ///
993 /// Issue #232: the activity-log append is a synchronous redb write +
994 /// fsync. Calling it directly on the async caller's task parked a tokio
995 /// worker thread on disk I/O for every SSE event. We now offload the
996 /// append to the blocking thread pool via `spawn_blocking` and return
997 /// immediately — `emit` stays synchronous so every existing caller
998 /// (including the sync `dispatch_hook_fired` JSON-RPC handler) keeps
999 /// compiling unchanged. The fire-and-forget pattern matches the
1000 /// pre-fix semantics (best-effort, never blocks the SSE broadcast)
1001 /// while freeing the async runtime to do real work during the write.
1002 /// What: serialises the event for the log (skipping `StatusChanged`
1003 /// which is a recomputed aggregate, not a mutation), spawns the redb
1004 /// append on `tokio::task::spawn_blocking` keyed by a clone of the
1005 /// `Arc<ActivityLog>` and the cloned event, then sends the event over
1006 /// the broadcast channel. A `pending_activity_writes` counter is bumped
1007 /// before the spawn and decremented inside the closure so
1008 /// [`Self::flush_activity_writes`] can drain in tests.
1009 /// Test: `web::tests::sse_stream_receives_palace_created` confirms a
1010 /// subscriber observes the emitted event;
1011 /// `activity_endpoint_lists_recent_emits` confirms persistence via
1012 /// `flush_activity_writes`.
1013 pub fn emit(&self, event: DaemonEvent) {
1014 if let Some(source) = event.source() {
1015 let event_type = event.type_str();
1016 let palace_id = event.palace_id().map(|s| s.to_string());
1017 let log = Arc::clone(&self.activity_log);
1018 let event_for_log = event.clone();
1019 let pending = Arc::clone(&self.pending_activity_writes);
1020 // Pre-allocate the sequence id in the emitting thread so the
1021 // persisted order matches the emission order even when blocking-pool
1022 // workers execute the writes concurrently (issue #247). Without
1023 // this, four rapid emits would assign IDs inside their respective
1024 // `spawn_blocking` closures in a non-deterministic order.
1025 let id = log.alloc_id();
1026 pending.fetch_add(1, Ordering::SeqCst);
1027 // Why: the synchronous redb append + fsync must not park an
1028 // async worker thread (issue #232). Spawn the write on the
1029 // blocking pool; the JoinHandle is intentionally dropped —
1030 // the write is best-effort and any failure is logged below.
1031 tokio::task::spawn_blocking(move || {
1032 let result = log.append_with_id(id, source, palace_id, event_type, &event_for_log);
1033 if let Err(e) = result {
1034 tracing::warn!("activity_log.append failed for {event_type}: {e:#}");
1035 }
1036 pending.fetch_sub(1, Ordering::SeqCst);
1037 });
1038 }
1039 let _ = self.events.send(event);
1040 }
1041
1042 /// Block (asynchronously) until every in-flight activity-log write
1043 /// spawned by [`Self::emit`] has settled.
1044 ///
1045 /// Why: `emit` offloads its redb append to `tokio::task::spawn_blocking`
1046 /// and returns immediately (issue #232). Tests that observe the
1047 /// activity log right after a burst of emits would otherwise race the
1048 /// blocking-pool worker; this helper gives them a deterministic
1049 /// synchronization point. Production code never needs to call this —
1050 /// the dashboard reads through `GET /api/v1/activity`, which already
1051 /// tolerates writes settling asynchronously.
1052 /// What: spins on `pending_activity_writes` with a 1 ms yield until the
1053 /// counter is zero. Cheap: tests typically emit a handful of events
1054 /// and the loop exits within a single scheduler tick.
1055 /// Test: covered indirectly by `emit_persists_mutations_but_skips_status_changed`
1056 /// and `web::tests::activity_endpoint_lists_recent_emits`.
1057 pub async fn flush_activity_writes(&self) {
1058 while self.pending_activity_writes.load(Ordering::SeqCst) > 0 {
1059 tokio::time::sleep(std::time::Duration::from_millis(1)).await;
1060 }
1061 }
1062
1063 /// Open (or return cached) the chat-session store for a palace.
1064 ///
1065 /// Why: Chat session persistence lives in a dedicated SQLite file under
1066 /// the palace's data dir (`chat_sessions.db`) so it doesn't intermingle
1067 /// with the KG's transactional load. The store is cheap to clone via
1068 /// `Arc` but the underlying r2d2 pool should be reused, so cache by id.
1069 /// What: Creates the palace data dir if missing, opens (or reuses) a
1070 /// `ChatSessionStore` and stashes an `Arc` in the DashMap.
1071 /// Test: Indirectly via the session HTTP handlers in `web::tests`.
1072 pub fn session_store(&self, palace_id: &str) -> Result<Arc<ChatSessionStore>> {
1073 if let Some(entry) = self.session_stores.get(palace_id) {
1074 return Ok(entry.clone());
1075 }
1076 let dir = self.data_root.join(palace_id);
1077 std::fs::create_dir_all(&dir)
1078 .map_err(|e| anyhow::anyhow!("create palace dir {}: {e}", dir.display()))?;
1079 let store = Arc::new(ChatSessionStore::open(&dir.join("chat_sessions.db"))?);
1080 self.session_stores
1081 .insert(palace_id.to_string(), store.clone());
1082 Ok(store)
1083 }
1084
1085 /// Builder-style setter for the default palace name.
1086 ///
1087 /// Why: `serve --palace <name>` wants to bind every tool call to a
1088 /// project-scoped namespace without forcing every MCP request to repeat
1089 /// the palace argument.
1090 /// What: Returns `self` with `default_palace = Some(name)`.
1091 /// Test: `default_palace_used_when_arg_omitted` covers the resolution
1092 /// path; this setter is exercised there.
1093 pub fn with_default_palace(mut self, name: Option<String>) -> Self {
1094 self.default_palace = name;
1095 self
1096 }
1097
1098 /// Resolve (or initialize) the shared embedder.
1099 ///
1100 /// Why: FastEmbedder load is expensive — we share one instance across all
1101 /// tool calls; the `OnceCell` ensures concurrent first-use races collapse
1102 /// to a single load.
1103 /// What: Returns `Arc<FastEmbedder>` on success. Errors propagate from the
1104 /// underlying ONNX load.
1105 /// Test: Indirectly via `dispatch_remember_then_recall`.
1106 /// Resolve the active chat provider, auto-detecting on first call.
1107 ///
1108 /// Why: Provider selection depends on filesystem-loaded config plus a
1109 /// network probe (Ollama liveness), so it must be lazily initialised at
1110 /// runtime. Caching the choice in a `OnceCell` keeps it stable across
1111 /// concurrent requests without re-probing on every chat call.
1112 /// What: On first use loads `~/.trusty-memory/config.toml`, prefers an
1113 /// auto-detected Ollama instance (when `local_model.enabled`), and falls
1114 /// back to OpenRouter when an API key is set. Returns `Ok(None)` when
1115 /// neither is available so the caller can emit a 412.
1116 /// Test: `web::tests::providers_endpoint_returns_payload` covers the
1117 /// detection path indirectly through `/api/v1/chat/providers`.
1118 pub async fn chat_provider(&self) -> Option<Arc<dyn ChatProvider>> {
1119 self.chat_provider
1120 .get_or_init(|| async {
1121 // Why (issue #226): `service::load_user_config` is the
1122 // axum-free home of the loader; the `web::load_user_config`
1123 // re-export only exists for the HTTP handlers. Going
1124 // direct to `service` keeps this method usable when
1125 // the `axum-server` feature is disabled.
1126 let cfg = crate::service::load_user_config().unwrap_or_default();
1127 if cfg.local_model.enabled {
1128 if let Some(mut p) =
1129 trusty_common::auto_detect_local_provider(&cfg.local_model.base_url).await
1130 {
1131 // auto_detect returns an empty model id; callers must
1132 // set the configured model name themselves.
1133 p.model = cfg.local_model.model.clone();
1134 return Some(Arc::new(p) as Arc<dyn ChatProvider>);
1135 }
1136 }
1137 if !cfg.openrouter_api_key.is_empty() {
1138 return Some(Arc::new(trusty_common::OpenRouterProvider::new(
1139 cfg.openrouter_api_key,
1140 cfg.openrouter_model,
1141 )) as Arc<dyn ChatProvider>);
1142 }
1143 None
1144 })
1145 .await
1146 .clone()
1147 }
1148
1149 /// Spawn a fire-and-forget background task that auto-discovers project
1150 /// aliases under `project_root` and asserts new ones into `palace`.
1151 ///
1152 /// Why (issue #42): Projects carry implicit shorthand — cargo package
1153 /// names that differ from their directory, binary names that differ
1154 /// from packages, first-letter abbreviations — that should be surfaced
1155 /// without a user ever calling `add_alias`. Running discovery as a
1156 /// detached task on palace-open keeps startup latency unchanged: the
1157 /// daemon binds and starts serving immediately while the discovery scan
1158 /// completes in the background, and any newly-asserted aliases land in
1159 /// the prompt cache before the model's next `get_prompt_context` call.
1160 /// What: clones `self` (cheap; `Arc`-backed), spawns a tokio task that
1161 /// invokes the `discover_aliases` tool handler directly so the
1162 /// dedup + cache-rebuild logic runs exactly the same path as the MCP
1163 /// tool call. Errors are logged at `warn!`; one failed discovery never
1164 /// destabilises the daemon.
1165 /// Test: not unit-tested (timing-dependent fire-and-forget); the
1166 /// underlying `discover_aliases` dispatch is covered by
1167 /// `dispatch_discover_aliases_inserts_new_and_dedupes` in `tools::tests`.
1168 pub fn spawn_alias_discovery(&self, palace: String, project_root: PathBuf) {
1169 let state = self.clone();
1170 tokio::spawn(async move {
1171 let args = serde_json::json!({
1172 "palace": palace,
1173 "project_root": project_root.to_string_lossy(),
1174 });
1175 match tools::dispatch_tool(&state, "discover_aliases", args).await {
1176 Ok(result) => tracing::info!(
1177 new = ?result.get("new"),
1178 already_known = ?result.get("already_known"),
1179 "alias discovery complete"
1180 ),
1181 Err(e) => tracing::warn!("alias discovery failed: {e:#}"),
1182 }
1183 });
1184 }
1185
1186 pub async fn embedder(&self) -> Result<Arc<FastEmbedder>> {
1187 let cell = self.embedder.clone();
1188 let embedder = cell
1189 .get_or_try_init(|| async {
1190 let e = FastEmbedder::new().await?;
1191 Ok::<Arc<FastEmbedder>, anyhow::Error>(Arc::new(e))
1192 })
1193 .await?
1194 .clone();
1195 Ok(embedder)
1196 }
1197}
1198
1199impl std::fmt::Debug for AppState {
1200 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1201 f.debug_struct("AppState")
1202 .field("version", &self.version)
1203 .field("data_root", &self.data_root)
1204 .field("registry_len", &self.registry.len())
1205 .finish()
1206 }
1207}
1208
1209/// Handle a single MCP JSON-RPC message and produce its response.
1210///
1211/// Why: Pulled out of the stdio loop so unit tests can drive every method
1212/// without touching real stdin/stdout.
1213/// What: Routes `initialize`, `tools/list`, `tools/call`, `ping`, and the
1214/// `notifications/initialized` notification (which returns `Value::Null`).
1215/// Test: See unit tests below — initialize/list/call all return expected
1216/// JSON-RPC envelopes; notifications return `Null` (no response written).
1217pub async fn handle_message(state: &AppState, msg: Value) -> Value {
1218 let id = msg.get("id").cloned().unwrap_or(Value::Null);
1219 let method = msg.get("method").and_then(|m| m.as_str()).unwrap_or("");
1220
1221 match method {
1222 "initialize" => {
1223 let extra = state
1224 .default_palace
1225 .as_ref()
1226 .map(|dp| json!({ "default_palace": dp }));
1227 let result = initialize_response("trusty-memory", &state.version, extra);
1228 // Why (issue #42): prompt-facts now flow through the
1229 // per-message `get_prompt_context` tool rather than MCP
1230 // prompts, so we no longer advertise the `prompts` capability.
1231 json!({
1232 "jsonrpc": "2.0",
1233 "id": id,
1234 "result": result,
1235 })
1236 }
1237 // Notifications must NOT receive a response.
1238 "notifications/initialized" | "notifications/cancelled" => Value::Null,
1239 "tools/list" => json!({
1240 "jsonrpc": "2.0",
1241 "id": id,
1242 "result": tools::tool_definitions_with(state.default_palace.is_some())
1243 }),
1244 // OpenRPC 1.3.2 discovery — see `openrpc.rs`. Returns the full
1245 // service description so orchestrators (open-mpm, etc.) can
1246 // introspect every tool and its required `memory.read`/`memory.write`
1247 // scope without bespoke per-server adapters.
1248 "rpc.discover" => json!({
1249 "jsonrpc": "2.0",
1250 "id": id,
1251 "result": openrpc::build_discover_response(
1252 &state.version,
1253 state.default_palace.is_some(),
1254 ),
1255 }),
1256 "tools/call" => {
1257 let params = msg.get("params").cloned().unwrap_or_default();
1258 let tool_name = params
1259 .get("name")
1260 .and_then(|n| n.as_str())
1261 .unwrap_or("")
1262 .to_string();
1263 let args = params.get("arguments").cloned().unwrap_or_default();
1264 match tools::dispatch_tool(state, &tool_name, args).await {
1265 Ok(content) => {
1266 // Why: tools that return a bare JSON string (e.g.
1267 // `get_prompt_context` returning the formatted
1268 // Markdown block) should surface as plain text in the
1269 // MCP `content[0].text` field — wrapping in
1270 // `Value::to_string()` would re-quote the payload and
1271 // force every caller to strip outer quotes.
1272 let text = match &content {
1273 Value::String(s) => s.clone(),
1274 other => other.to_string(),
1275 };
1276 json!({
1277 "jsonrpc": "2.0",
1278 "id": id,
1279 "result": {
1280 "content": [{"type": "text", "text": text}]
1281 }
1282 })
1283 }
1284 Err(e) => json!({
1285 "jsonrpc": "2.0",
1286 "id": id,
1287 // Why: anyhow's `{:#}` alternate format walks the full
1288 // `Caused by:` chain so MCP clients see actionable
1289 // detail (e.g. "PalaceHandle::remember_with_options:
1290 // filter rejected: too short") instead of just the
1291 // outermost context label.
1292 "error": {"code": -32603, "message": format!("{e:#}")}
1293 }),
1294 }
1295 }
1296 "ping" => json!({"jsonrpc": "2.0", "id": id, "result": {}}),
1297 _ => json!({
1298 "jsonrpc": "2.0",
1299 "id": id,
1300 "error": {
1301 "code": -32601,
1302 "message": format!("Method not found: {method}")
1303 }
1304 }),
1305 }
1306}
1307
/// Preferred starting port for the trusty-memory HTTP daemon.
///
/// Why: keeps the well-known default stable for clients that have hard-coded
/// `127.0.0.1:7070` in their configuration, while still allowing dynamic
/// walking when the port is in use (`DYNAMIC_PORT_RANGE` ports starting here).
/// What: `7070` — historic default, matches the launchd plist's prior value.
/// [`bind_dynamic_port`] uses it as the first loopback port it tries and as
/// the lower bound it reports when the whole preferred window is unavailable.
/// Test: covered indirectly by `bind_dynamic_port_returns_listener`.
pub const DEFAULT_HTTP_PORT: u16 = 7070;
1316
/// Number of consecutive ports `bind_dynamic_port` walks before falling back
/// to the OS-assigned port. Matches the trusty-search convention.
///
/// What: `10` — combined with [`DEFAULT_HTTP_PORT`] this yields the preferred
/// window 7070..=7079. `bind_dynamic_port` hands `DYNAMIC_PORT_RANGE - 1` to
/// `trusty_common::bind_with_auto_port`; presumably that argument counts the
/// extra ports tried after the first one (confirm against that helper).
const DYNAMIC_PORT_RANGE: u16 = 10;
1320
1321/// Path to the canonical address-discovery file for the trusty-memory daemon.
1322///
1323/// Why: clients (CLI, MCP tools, dashboards) need to find the running daemon
1324/// without configuration when the port was selected dynamically. Using
1325/// `trusty_common::resolve_data_dir` aligns this path with the location
1326/// that `trusty_common::read_daemon_addr("trusty-memory")` reads from, so
1327/// `prompt-context`, `doctor`, and `start`'s probe all find the running daemon.
1328/// The old `~/.trusty-memory/http_addr` path and the new
1329/// `~/Library/Application Support/trusty-memory/http_addr` (macOS) path were
1330/// divergent — the daemon wrote one; readers expected the other.
1331/// What: returns `{resolve_data_dir("trusty-memory")}/http_addr`, or `None` if
1332/// the data dir cannot be resolved (locked-down container, no passwd entry).
1333/// Test: `http_addr_path_uses_resolve_data_dir`.
1334pub fn http_addr_path() -> Option<PathBuf> {
1335 trusty_common::resolve_data_dir("trusty-memory")
1336 .ok()
1337 .map(|d| d.join("http_addr"))
1338}
1339
1340/// Bind a `TcpListener` to `127.0.0.1`, dynamically selecting a port.
1341///
1342/// Why: the historic default `7070` is convenient for clients but a stale
1343/// process or a second daemon must not produce a noisy failure. Walking
1344/// `DEFAULT_HTTP_PORT..DEFAULT_HTTP_PORT+DYNAMIC_PORT_RANGE` first preserves
1345/// backwards compatibility for the common case; OS-assigned fallback (`:0`)
1346/// guarantees the daemon always comes up even when every preferred port is
1347/// busy.
1348/// What: returns the first successful `TcpListener`. Tries 7070..=7079
1349/// in order, then falls back to OS-assigned. Caller inspects
1350/// `local_addr()` to learn the chosen port.
1351/// Test: `bind_dynamic_port_returns_listener` confirms it always binds *some*
1352/// port even after another listener occupies the preferred one.
1353pub async fn bind_dynamic_port() -> Result<tokio::net::TcpListener> {
1354 let preferred: SocketAddr = SocketAddr::from(([127, 0, 0, 1], DEFAULT_HTTP_PORT));
1355 // First: walk the preferred range (7070..=7079).
1356 if let Ok(listener) =
1357 trusty_common::bind_with_auto_port(preferred, DYNAMIC_PORT_RANGE - 1).await
1358 {
1359 return Ok(listener);
1360 }
1361 // Last resort: ask the kernel for any free port. `bind_with_auto_port`
1362 // with `:0` resolves immediately to the OS-assigned port.
1363 tracing::warn!(
1364 "all ports {DEFAULT_HTTP_PORT}..{} in use; requesting OS-assigned port",
1365 DEFAULT_HTTP_PORT + DYNAMIC_PORT_RANGE - 1
1366 );
1367 let any: SocketAddr = SocketAddr::from(([127, 0, 0, 1], 0));
1368 trusty_common::bind_with_auto_port(any, 0).await
1369}
1370
/// Write the bound `host:port` to the discovery file at `path` atomically.
1372///
1373/// Why: clients must read the file mid-write without observing a partial
1374/// value. Writing to a `.tmp` sibling and renaming over the target gives
1375/// POSIX atomicity, matching the trusty-search implementation.
1376/// What: creates the parent directory if missing; writes `addr` followed by a
1377/// trailing newline (avoids the "no newline at end of file" warnings from
1378/// `cat`); renames `.tmp` → `http_addr`. Best-effort: I/O errors are
1379/// returned to the caller so `run_http_on` can log without panicking.
1380/// Test: `http_addr_file_round_trip_via_helpers`.
1381#[cfg(feature = "axum-server")]
fn write_http_addr_file(path: &Path, addr: &SocketAddr) -> std::io::Result<()> {
    use std::io::Write;

    // Make sure the directory that will hold the file exists.
    if let Some(parent) = path.parent() {
        std::fs::create_dir_all(parent)?;
    }

    // Stage the content in a sibling temp file, then rename it over `path`
    // so readers never observe a partially written value.
    let tmp = path.with_extension("addr.tmp");
    let publish = || -> std::io::Result<()> {
        let mut file = std::fs::File::create(&tmp)?;
        writeln!(file, "{addr}")?;
        file.sync_all()?;
        // Close the handle before renaming.
        drop(file);
        std::fs::rename(&tmp, path)
    };

    let outcome = publish();
    if outcome.is_err() {
        // Why: a failed write/sync/rename used to leave the staging file
        // behind next to the target. Removal is best-effort; the original
        // error is what the caller sees.
        let _ = std::fs::remove_file(&tmp);
    }
    outcome
}
1396
/// Location of the legacy dotfile discovery file, `~/.trusty-memory/http_addr`.
///
/// Why (issue #498): external tooling such as claude-mpm's
/// `migrate_trusty_autodetect` looks for the daemon's port in
/// `~/.trusty-memory/http_addr`. On macOS `resolve_data_dir("trusty-memory")`
/// points at `~/Library/Application Support/trusty-memory/` instead, so a
/// daemon that only wrote the OS-standard file was invisible to those
/// readers. Publishing the address in both places serves either convention.
/// What: `$HOME/.trusty-memory/http_addr`, or `None` when `dirs::home_dir()`
/// cannot determine a home directory.
/// Test: `dotfile_http_addr_path_uses_home_dir`.
#[cfg(feature = "axum-server")]
fn dotfile_http_addr_path() -> Option<PathBuf> {
    let home = dirs::home_dir()?;
    Some(home.join(".trusty-memory").join("http_addr"))
}
1413
/// Run the optional HTTP/SSE + web admin server.
///
/// Why: A long-running daemon mode lets non-stdio clients (browsers, curl,
/// future remote agents) hit `/health`, the `/api/v1/*` REST surface, and the
/// embedded admin SPA.
/// What: axum router built from `web::router()` plus the `/sse` route.
/// Caller provides a pre-bound listener so port auto-detection lives at the
/// call site. Before accepting connections the daemon stamps the bound
/// `host:port` onto `AppState.bound_addr` and writes it to two discovery
/// files: the OS-standard [`http_addr_path`] and the `~/.trusty-memory/http_addr`
/// dotfile (#498). A Unix-domain-socket listener is started alongside on a
/// best-effort basis. On shutdown each discovery file this daemon wrote is
/// removed best-effort — unless it now advertises a *different* address,
/// which means another daemon has since taken the file over and still needs
/// it. The UDS socket file is removed and the BM25 supervisor (if any) is
/// shut down before the serve result is returned.
/// Errors: propagates the error returned by `axum::serve`, after cleanup.
/// Test: `cargo test -p trusty-memory web::tests` exercises the router shape;
/// manual: `curl http://127.0.0.1:<port>/health` returns `ok` with `addr`.
#[cfg(feature = "axum-server")]
pub async fn run_http_on(state: AppState, listener: tokio::net::TcpListener) -> Result<()> {
    use axum::routing::get;

    // Issue #35: recompute the `data_root` disk footprint every 10 s on a
    // background task so `GET /health` reports `disk_bytes` without doing a
    // recursive directory walk on the request path.
    spawn_disk_size_ticker(state.clone());

    // Issue #228: emit aggregate `StatusChanged` on a fixed cadence rather
    // than on every drawer write. The previous design called
    // `aggregate_status_event` from every `memory_remember` / `memory_note`
    // / `memory_forget` (and the matching HTTP handlers), each of which
    // walked the data root + opened every palace handle. Coalescing the
    // emit to a 30 s ticker keeps dashboards live without dragging an
    // O(N palaces) recompute onto the write hot path.
    spawn_status_event_ticker(state.clone());

    // Capture and advertise the bound address BEFORE serving so the first
    // request handler — and the http_addr discovery file — see the real port
    // even if `local_addr()` would otherwise be racy.
    let local = listener.local_addr().ok();
    let (written_path, written_dotfile_path) = if let Some(a) = local {
        // Stash on state for handlers (e.g. /health) to surface.
        let _ = state.bound_addr.set(a);
        info!("HTTP server listening on http://{a}");
        eprintln!("HTTP server listening on http://{a}");
        // Primary: write to the OS-standard data dir (`~/Library/Application
        // Support/trusty-memory/http_addr` on macOS, `~/.local/share/…` on
        // Linux). This is what `trusty_common::read_daemon_addr` reads.
        // Best-effort: a missing $HOME or read-only fs is non-fatal.
        let primary = match http_addr_path() {
            Some(p) => match write_http_addr_file(&p, &a) {
                Ok(()) => {
                    info!("wrote daemon address to {}", p.display());
                    Some(p)
                }
                Err(e) => {
                    tracing::warn!("could not write {}: {e}", p.display());
                    None
                }
            },
            None => {
                tracing::warn!("no $HOME — skipping http_addr discovery file");
                None
            }
        };
        // Issue #498: also write to `~/.trusty-memory/http_addr` so external
        // tools (e.g. claude-mpm's `migrate_trusty_autodetect`) that read the
        // dotfile path can discover the daemon's port. On macOS the OS-standard
        // path differs from the dotfile path; writing both ensures consumers
        // using either convention find the file. Best-effort: failures are
        // logged but do not block startup.
        let dotfile = match dotfile_http_addr_path() {
            Some(p) => match write_http_addr_file(&p, &a) {
                Ok(()) => {
                    info!("wrote daemon address to dotfile {}", p.display());
                    Some(p)
                }
                Err(e) => {
                    tracing::warn!("could not write dotfile {}: {e}", p.display());
                    None
                }
            },
            None => None,
        };
        (primary, dotfile)
    } else {
        (None, None)
    };

    // Multi-transport refactor: bind the Unix domain socket alongside
    // the HTTP listener. The UDS serves NDJSON JSON-RPC 2.0 for the
    // `trusty-memory-mcp-bridge` binary (and any local CLI that wants
    // to skip HTTP overhead). Failures are logged but never block the
    // HTTP server from coming up — UDS is best-effort on hosts where
    // it's unsupported (e.g. some Docker overlays).
    let uds_sock_path = spawn_uds_listener(state.clone()).await;

    // Keep a handle to the BM25 supervisor (if any) so we can call
    // `shutdown()` on the exit path. Cloning here is cheap (`Arc`) and
    // detaches the lifetime of the supervisor from the `state` move into
    // the router below.
    let bm25_supervisor = state.bm25_supervisor.clone();

    let app = web::router()
        .route("/sse", get(sse_handler))
        .with_state(state);

    // Why (issue #534): bare axum::serve exits only on an internal error; SIGTERM
    // (launchctl bootout) would kill the process before the cleanup below had a
    // chance to run, leaving stale addr/socket files behind and dropping any
    // in-flight request without draining. `with_graceful_shutdown` installs a
    // SIGTERM + SIGINT watcher; when either fires axum stops accepting new
    // connections, drains active requests, then returns here so cleanup runs.
    let serve_result = axum::serve(listener, app)
        .with_graceful_shutdown(trusty_common::shutdown_signal())
        .await;

    // Best-effort cleanup: remove `http_addr` files so stale clients fail fast
    // instead of timing out against a dead port. Covers both the OS-standard
    // path and the dotfile path (#498).
    // Why the ownership check: the discovery paths are shared by every daemon
    // on the host, and dynamic port selection lets several run side by side.
    // If a later daemon overwrote the file with its own address, deleting it
    // here would hide a daemon that is still alive.
    if let Some(addr) = local {
        for path in [&written_path, &written_dotfile_path].into_iter().flatten() {
            remove_http_addr_file_unless_reclaimed(path, &addr);
        }
    }
    if let Some(p) = uds_sock_path.as_ref() {
        let _ = std::fs::remove_file(p);
    }

    // Issue #193: gracefully reap every spawned BM25 daemon before the
    // process exits so each one gets a chance to flush its snapshot and
    // unlink its socket. `kill_on_drop=true` on the children would
    // SIGKILL them on Drop anyway, but that skips the daemon's own
    // shutdown sequence and leaves stale sockets behind.
    if let Some(supervisor) = bm25_supervisor {
        supervisor.shutdown().await;
    }

    serve_result?;
    Ok(())
}

/// Remove a discovery file on shutdown unless another daemon now owns it.
///
/// Why: see the cleanup note in [`run_http_on`] — the file is shared, so the
/// last writer is not necessarily the process that is exiting.
/// What: reads `path`; if its trimmed content parses as a socket address that
/// differs from `addr`, the file is left alone. In every other case (content
/// matches, file unreadable, content unparseable) removal is attempted and
/// any error is ignored, matching the previous best-effort behaviour.
#[cfg(feature = "axum-server")]
fn remove_http_addr_file_unless_reclaimed(path: &Path, addr: &SocketAddr) {
    let reclaimed = std::fs::read_to_string(path)
        .ok()
        .and_then(|contents| contents.trim().parse::<SocketAddr>().ok())
        .is_some_and(|advertised| advertised != *addr);
    if !reclaimed {
        let _ = std::fs::remove_file(path);
    }
}
1552
/// Start the Unix-domain-socket accept loop next to the HTTP server.
///
/// Why: UDS is an additive transport. A bind failure (unusual `$TMPDIR`
/// layout, permission error on macOS) must not keep the HTTP daemon from
/// starting, so the failure is logged and `None` is returned, which also
/// tells the caller there is nothing to clean up later.
/// What: derives the socket path from the daemon's data root via
/// `transport::uds::socket_path_for`, binds it with `transport::uds::bind_uds`,
/// writes the `uds_addr` discovery file (best-effort), and runs
/// `transport::uds::run_uds` on a detached tokio task. Returns the bound path
/// so the caller can remove the socket file on shutdown.
/// Test: covered by `uds_ndjson_roundtrip` in the integration tests
/// and the unit tests in [`transport::uds`].
#[cfg(feature = "axum-server")]
async fn spawn_uds_listener(state: AppState) -> Option<PathBuf> {
    // The socket path is scoped to the data root so that several daemons
    // (typical in tests) do not collide on one shared socket, while daemons
    // rooted at the canonical data dir keep the canonical socket path that
    // the bridge can find without the discovery file.
    let sock_path = transport::uds::socket_path_for(&state.data_root);

    let listener = transport::uds::bind_uds(&sock_path)
        .await
        .map_err(|e| {
            tracing::warn!(
                "UDS bind at {} failed: {e:#}; continuing without UDS transport",
                sock_path.display()
            );
        })
        .ok()?;

    info!("UDS listener bound at {}", sock_path.display());
    eprintln!("UDS listener bound at {}", sock_path.display());

    // Publish the socket location so the bridge can find it even when the
    // daemon runs with an unusual $TMPDIR. A failure here is only logged.
    let published = transport::uds::write_uds_addr_file(&state.data_root, &sock_path);
    if let Err(e) = published {
        tracing::warn!(
            "could not write {}/{}: {e:#}",
            state.data_root.display(),
            transport::uds::UDS_ADDR_FILE
        );
    }

    // The accept loop owns the state from here on; the task is detached.
    tokio::spawn(async move {
        let served = transport::uds::run_uds(state, listener).await;
        if let Err(e) = served {
            tracing::error!("UDS accept loop exited: {e:#}");
        }
    });

    Some(sock_path)
}
1603
/// Convenience: bind `addr` and serve via [`run_http_on`].
///
/// What: binds a `tokio::net::TcpListener` to exactly `addr` (no port
/// walking) and hands it to [`run_http_on`].
/// Errors: returns the bind error if `addr` cannot be bound, otherwise
/// whatever [`run_http_on`] returns.
#[cfg(feature = "axum-server")]
pub async fn run_http(state: AppState, addr: std::net::SocketAddr) -> Result<()> {
    let bound = tokio::net::TcpListener::bind(addr).await?;
    let served = run_http_on(state, bound).await;
    served
}
1610
/// Convenience: bind dynamically (7070..=7079, OS fallback) and serve.
///
/// Why: `trusty-memory serve` without `--http` is the canonical
/// launchd-managed daemon entry point. Picking the port dynamically lets a
/// stale daemon or a hand-spawned `serve --http 127.0.0.1:7070` coexist
/// without breaking the launchd-managed instance.
/// What: obtains a listener from [`bind_dynamic_port`] and passes it to
/// [`run_http_on`]; errors from either step are returned to the caller.
/// Test: integration via `trusty-memory serve` + `cat ~/.trusty-memory/http_addr`.
#[cfg(feature = "axum-server")]
pub async fn run_http_dynamic(state: AppState) -> Result<()> {
    match bind_dynamic_port().await {
        Ok(listener) => run_http_on(state, listener).await,
        Err(e) => Err(e),
    }
}
1624
/// Start the background task that refreshes `state.disk_bytes` with the
/// on-disk size of `data_root` every 10 seconds (issue #35).
///
/// Why: `GET /health` reports `disk_bytes`. Measuring the directory on every
/// health poll would turn a frequent request into unbounded recursive I/O;
/// refreshing on a fixed cadence keeps `/health` cheap at the price of a
/// figure that may be up to ~10 s old.
/// What: spawns a detached tokio task that owns the `AppState` it is given.
/// On each tick the directory measurement runs via `spawn_blocking` so the
/// filesystem work stays off the async workers, and the resulting byte count
/// is stored with relaxed ordering. If the blocking task cannot be joined,
/// `0` is stored.
/// Test: `health_endpoint_includes_resource_fields` asserts the field shape;
/// the ticker cadence is not unit-tested (timing-dependent).
#[cfg(feature = "axum-server")]
fn spawn_disk_size_ticker(state: AppState) {
    const REFRESH_EVERY: std::time::Duration = std::time::Duration::from_secs(10);

    tokio::spawn(async move {
        let mut ticker = tokio::time::interval(REFRESH_EVERY);
        loop {
            ticker.tick().await;

            // The closure needs an owned path because it runs on the
            // blocking pool, outside this task's borrow of `state`.
            let root = state.data_root.clone();
            let measurement = tokio::task::spawn_blocking(move || {
                trusty_common::sys_metrics::dir_size_bytes(&root)
            });
            let total = measurement.await.unwrap_or(0);

            state
                .disk_bytes
                .store(total, std::sync::atomic::Ordering::Relaxed);
        }
    });
}
1660
/// Interval between aggregate-status snapshot emits on the SSE bus.
///
/// Why (issue #228): mutations used to fire `StatusChanged` synchronously on
/// the write path, which forced an O(N palaces) sum of drawer / vector / KG
/// counts on every `memory_remember`. Coalescing into a fixed-cadence ticker
/// lets dashboards stay current (a 30 s lag is invisible at human scale)
/// while keeping the write path free of aggregate work.
/// What: 30 seconds — short enough that the operator UI doesn't feel stale
/// between manual writes, long enough that the recompute cost (in-memory
/// registry walk plus the redb `count_active_triples` per palace) is a
/// rounding error on the daemon's CPU budget. Consumed by
/// [`spawn_status_event_ticker`] as the period of its `tokio::time::interval`.
/// Test: covered indirectly — the math has not changed, only the cadence.
// NOTE(review): `dead_code` is allowed presumably because the only caller
// chain visible here ends in `run_http_on`, which is gated on the
// `axum-server` feature — confirm no other build uses this constant.
#[allow(dead_code)]
const STATUS_EVENT_TICK_SECS: u64 = 30;
1675
1676/// Spawn a background ticker that emits `DaemonEvent::StatusChanged` every
1677/// [`STATUS_EVENT_TICK_SECS`] seconds (issue #228).
1678///
1679/// Why: replaces the per-write `state.emit(self.aggregate_status_event())`
1680/// call sites that used to recompute the aggregate every time a drawer was
1681/// created or deleted. Walking N palaces on every write blocks the async
1682/// runtime; coalescing the emit onto a ticker keeps dashboards up-to-date
1683/// without that cost.
1684/// What: spawns a detached tokio task that holds a full `AppState` clone
1685/// (cheap — every field is `Arc`-backed) and ticks every
1686/// [`STATUS_EVENT_TICK_SECS`] seconds. Each tick computes
1687/// `MemoryService::aggregate_status_event` (which now iterates the
1688/// in-memory registry, not disk) and broadcasts it via `state.emit`. If
1689/// no SSE subscribers are connected the broadcast `send` is a cheap no-op,
1690/// so the ticker imposes no cost when nobody is listening.
1691/// Test: not unit-tested (timing-dependent fire-and-forget); the underlying
1692/// `aggregate_status_event` math is exercised by the existing
1693/// `status_endpoint_returns_payload` path.
1694#[allow(dead_code)]
1695fn spawn_status_event_ticker(state: AppState) {
1696 tokio::spawn(async move {
1697 let mut interval =
1698 tokio::time::interval(std::time::Duration::from_secs(STATUS_EVENT_TICK_SECS));
1699 // The first tick fires immediately, which is fine: it gives SSE
1700 // subscribers a baseline `StatusChanged` shortly after they connect.
1701 loop {
1702 interval.tick().await;
1703 let event = service::MemoryService::new(state.clone()).aggregate_status_event();
1704 state.emit(event);
1705 }
1706 });
1707}
1708
/// Live SSE event stream — pushes `DaemonEvent` frames to dashboard clients.
///
/// Why: The dashboard subscribes once and reacts to live pushes (palace
/// created, drawer added/deleted, dream completed, status changed) instead of
/// polling `/api/v1/*` endpoints.
/// What: Subscribes to `state.events`, emits an initial `connected` frame,
/// then forwards every `DaemonEvent` as `data: <json>\n\n`. Lagged
/// subscribers receive a `lag` frame indicating skipped events; an event that
/// fails to serialize is reported as an `error` frame whose `message` is a
/// properly escaped JSON string; channel closure ends the stream.
/// Test: `web::tests::sse_stream_emits_palace_created` (covers subscribe +
/// emit + receive); manual: `curl -N http://.../sse`.
#[cfg(feature = "axum-server")]
pub(crate) async fn sse_handler(
    axum::extract::State(state): axum::extract::State<AppState>,
) -> impl axum::response::IntoResponse {
    use futures::StreamExt;
    use tokio_stream::wrappers::errors::BroadcastStreamRecvError;
    use tokio_stream::wrappers::BroadcastStream;

    let rx = state.events.subscribe();

    // Greeting frame so clients can tell the stream is live before the first
    // real event arrives.
    let initial = futures::stream::once(async {
        Ok::<axum::body::Bytes, std::io::Error>(axum::body::Bytes::from(
            "data: {\"type\":\"connected\"}\n\n",
        ))
    });

    let events = BroadcastStream::new(rx).map(|res| {
        let frame = match res {
            Ok(event) => match serde_json::to_string(&event) {
                Ok(json) => format!("data: {json}\n\n"),
                Err(e) => {
                    // Why: the error text used to be spliced between literal
                    // quotes. A message containing `"`, `\` or a newline then
                    // produced invalid JSON (or a broken SSE frame). Encoding
                    // it as a JSON string keeps the frame well-formed and is
                    // byte-identical for messages without special characters.
                    let message = serde_json::to_string(&e.to_string())
                        .unwrap_or_else(|_| String::from("\"serialization failed\""));
                    format!("data: {{\"type\":\"error\",\"message\":{message}}}\n\n")
                }
            },
            Err(BroadcastStreamRecvError::Lagged(n)) => {
                format!("data: {{\"type\":\"lag\",\"skipped\":{n}}}\n\n")
            }
        };
        Ok::<axum::body::Bytes, std::io::Error>(axum::body::Bytes::from(frame))
    });
    let stream = initial.chain(events);

    axum::response::Response::builder()
        .header("Content-Type", "text/event-stream")
        .header("Cache-Control", "no-cache")
        // Asks buffering reverse proxies to pass frames through unbuffered.
        .header("X-Accel-Buffering", "no")
        .body(axum::body::Body::from_stream(stream))
        .expect("valid SSE response")
}
1754
1755#[cfg(test)]
1756mod tests {
1757 use super::*;
1758
1759 /// Why: Issue #234 — previously we `mem::forget`ed the `TempDir` so tests
1760 /// could keep using `AppState` without juggling the directory handle, but
1761 /// that leaked one temp directory per test (262+ accumulated each run).
1762 /// What: Returns the `TempDir` alongside the `AppState` so the caller can
1763 /// bind it (`let (state, _tmp) = ...;`) and let drop semantics clean up
1764 /// when the test scope ends.
1765 /// Test: Every test in this module that constructs state.
1766 fn test_state() -> (AppState, tempfile::TempDir) {
1767 let tmp = tempfile::tempdir().expect("tempdir");
1768 let root = tmp.path().to_path_buf();
1769 // Issue #88: bypass palace-slug enforcement so lib tests that call
1770 // `palace_create` with arbitrary names keep passing.
1771 // SAFETY: constant idempotent write; safe across test threads.
1772 unsafe {
1773 std::env::set_var("TRUSTY_SKIP_PALACE_ENFORCEMENT", "1");
1774 }
1775 (AppState::new(root), tmp)
1776 }
1777
1778 #[tokio::test]
1779 async fn initialize_returns_protocol_version_and_capabilities() {
1780 let (state, _tmp) = test_state();
1781 let req = json!({
1782 "jsonrpc": "2.0",
1783 "id": 1,
1784 "method": "initialize",
1785 "params": {
1786 "protocolVersion": "2024-11-05",
1787 "capabilities": {},
1788 "clientInfo": {"name": "test", "version": "0"}
1789 }
1790 });
1791 let resp = handle_message(&state, req).await;
1792 assert_eq!(resp["jsonrpc"], "2.0");
1793 assert_eq!(resp["id"], 1);
1794 assert_eq!(resp["result"]["protocolVersion"], "2024-11-05");
1795 assert!(resp["result"]["capabilities"]["tools"].is_object());
1796 assert_eq!(resp["result"]["serverInfo"]["name"], "trusty-memory");
1797 }
1798
1799 #[tokio::test]
1800 async fn initialized_notification_returns_null() {
1801 let (state, _tmp) = test_state();
1802 let req = json!({
1803 "jsonrpc": "2.0",
1804 "method": "notifications/initialized",
1805 "params": {}
1806 });
1807 let resp = handle_message(&state, req).await;
1808 assert!(resp.is_null());
1809 }
1810
1811 #[tokio::test]
1812 async fn tools_list_returns_all_tools() {
1813 let (state, _tmp) = test_state();
1814 let req = json!({"jsonrpc": "2.0", "id": 2, "method": "tools/list"});
1815 let resp = handle_message(&state, req).await;
1816 let tools = resp["result"]["tools"].as_array().expect("tools array");
1817 // Issue #99 added `memory_send_message`; issue #180 added
1818 // `palace_delete`; the #180 follow-up adds `palace_update` on top
1819 // of the 22-tool baseline; issue #537 adds `upgrade`.
1820 assert_eq!(tools.len(), 24);
1821 }
1822
1823 #[tokio::test]
1824 async fn unknown_method_returns_error() {
1825 let (state, _tmp) = test_state();
1826 let req = json!({"jsonrpc": "2.0", "id": 4, "method": "wat"});
1827 let resp = handle_message(&state, req).await;
1828 assert_eq!(resp["error"]["code"], -32601);
1829 }
1830
1831 #[tokio::test]
1832 async fn ping_returns_empty_result() {
1833 let (state, _tmp) = test_state();
1834 let req = json!({"jsonrpc": "2.0", "id": 5, "method": "ping"});
1835 let resp = handle_message(&state, req).await;
1836 assert!(resp["result"].is_object());
1837 }
1838
1839 #[tokio::test]
1840 async fn app_state_default_constructs() {
1841 let (s, _tmp) = test_state();
1842 assert!(!s.version.is_empty());
1843 assert!(s.registry.is_empty());
1844 assert!(s.default_palace.is_none());
1845 }
1846
1847 /// Why (issue #225): the previous implementation called `.expect()` on the
1848 /// tempdir fallback, which panicked the daemon at startup on hosts where
1849 /// neither the data root nor `std::env::temp_dir()` is writable
1850 /// (read-only Docker overlays, locked-down sandboxes). The activity log
1851 /// is documented as best-effort, so the fix returns a no-op `Discard`
1852 /// variant instead. This test forces both paths to fail and asserts the
1853 /// helper returns the discard variant rather than panicking.
1854 ///
1855 /// Skipped when running as root because `chmod 000` is a no-op for the
1856 /// root user — the kernel grants root access regardless of mode bits.
1857 /// CI typically runs as non-root, so coverage is preserved in the
1858 /// common case; local root invocations simply skip with a warning.
1859 #[test]
1860 #[cfg(unix)]
1861 fn open_activity_log_with_fallback_returns_discard_when_unwritable() {
1862 // Skip when running as root — chmod is ignored.
1863 // SAFETY: libc::geteuid is a thread-safe syscall with no preconditions.
1864 if unsafe { libc::geteuid() } == 0 {
1865 eprintln!(
1866 "skipping open_activity_log_with_fallback_returns_discard_when_unwritable: running as root"
1867 );
1868 return;
1869 }
1870
1871 use std::os::unix::fs::PermissionsExt;
1872
1873 // Build two unwritable directories: the primary "data root" and a
1874 // shadow "TMPDIR" so the tempdir fallback also fails.
1875 let outer = tempfile::tempdir().expect("outer tempdir");
1876 let primary = outer.path().join("primary");
1877 let tmpdir = outer.path().join("fake-tmp");
1878 std::fs::create_dir(&primary).expect("create primary");
1879 std::fs::create_dir(&tmpdir).expect("create tmpdir");
1880
1881 // chmod 000 on both — neither can be opened for write.
1882 std::fs::set_permissions(&primary, std::fs::Permissions::from_mode(0o000))
1883 .expect("chmod primary");
1884 std::fs::set_permissions(&tmpdir, std::fs::Permissions::from_mode(0o000))
1885 .expect("chmod tmpdir");
1886
1887 // Override the tempdir lookup so `open_activity_log_with_fallback`
1888 // hits our unwritable fake-tmp instead of the real system temp.
1889 // Note: env var mutation is process-global; this test is the only
1890 // accessor for `TMPDIR` in this test binary, and we restore the
1891 // previous value before returning.
1892 let prev_tmpdir = std::env::var_os("TMPDIR");
1893 std::env::set_var("TMPDIR", &tmpdir);
1894
1895 let log = open_activity_log_with_fallback(&primary);
1896
1897 // Restore TMPDIR ASAP so a panic later in the test doesn't leak it.
1898 match prev_tmpdir {
1899 Some(v) => std::env::set_var("TMPDIR", v),
1900 None => std::env::remove_var("TMPDIR"),
1901 }
1902
1903 // Restore permissions so the outer tempdir can clean up.
1904 let _ = std::fs::set_permissions(&primary, std::fs::Permissions::from_mode(0o700));
1905 let _ = std::fs::set_permissions(&tmpdir, std::fs::Permissions::from_mode(0o700));
1906
1907 assert!(
1908 log.is_discard(),
1909 "expected ActivityLog::Discard when both data root and tempdir are unwritable"
1910 );
1911
1912 // The Discard variant must still satisfy the public contract: no
1913 // panic on append/count/list.
1914 let id = log
1915 .append(
1916 ActivitySource::Http,
1917 None,
1918 "drawer_added",
1919 json!({"smoke": true}),
1920 )
1921 .expect("discard append must succeed");
1922 assert_eq!(id, 0);
1923 assert_eq!(log.count().expect("discard count"), 0);
1924 assert!(log
1925 .list(&ActivityFilter::default(), 10, 0)
1926 .expect("discard list")
1927 .is_empty());
1928 }
1929
1930 /// Why: Issue #26 — when `serve --palace <name>` is set, the MCP server
1931 /// must (a) report the default in the `initialize` `serverInfo`, (b)
1932 /// drop `palace` from the required schema in `tools/list`, and (c) let
1933 /// `tools/call` use the default when the caller omits `palace`.
1934 /// Test: Construct an AppState with a default palace, create that palace
1935 /// on disk via the registry, then call `memory_remember` without a
1936 /// `palace` argument and confirm it resolves to the default.
1937 #[tokio::test]
1938 async fn default_palace_used_when_arg_omitted() {
1939 let tmp = tempfile::tempdir().expect("tempdir");
1940 let root = tmp.path().to_path_buf();
1941
1942 // Pre-create the default palace so remember has somewhere to land.
1943 let registry = trusty_common::memory_core::PalaceRegistry::new();
1944 let palace = trusty_common::memory_core::Palace {
1945 id: trusty_common::memory_core::PalaceId::new("default-pal"),
1946 name: "default-pal".to_string(),
1947 description: None,
1948 created_at: chrono::Utc::now(),
1949 data_dir: root.join("default-pal"),
1950 };
1951 registry
1952 .create_palace(&root, palace)
1953 .expect("create_palace");
1954
1955 let state = AppState::new(root).with_default_palace(Some("default-pal".to_string()));
1956
1957 // (a) initialize advertises the default.
1958 let init = handle_message(
1959 &state,
1960 json!({"jsonrpc": "2.0", "id": 1, "method": "initialize"}),
1961 )
1962 .await;
1963 assert_eq!(
1964 init["result"]["serverInfo"]["default_palace"], "default-pal",
1965 "initialize must echo default_palace in serverInfo"
1966 );
1967
1968 // (b) tools/list drops `palace` from required when default is set.
1969 let list = handle_message(
1970 &state,
1971 json!({"jsonrpc": "2.0", "id": 2, "method": "tools/list"}),
1972 )
1973 .await;
1974 let tools = list["result"]["tools"].as_array().expect("tools array");
1975 let remember = tools
1976 .iter()
1977 .find(|t| t["name"] == "memory_remember")
1978 .expect("memory_remember tool");
1979 let required: Vec<&str> = remember["inputSchema"]["required"]
1980 .as_array()
1981 .expect("required array")
1982 .iter()
1983 .filter_map(|v| v.as_str())
1984 .collect();
1985 assert!(
1986 !required.contains(&"palace"),
1987 "palace must not be required when default is configured; got {required:?}"
1988 );
1989 assert!(required.contains(&"text"));
1990
1991 // (c) tools/call resolves the default when arg is omitted.
1992 let call = handle_message(
1993 &state,
1994 json!({
1995 "jsonrpc": "2.0",
1996 "id": 3,
1997 "method": "tools/call",
1998 "params": {
1999 "name": "memory_remember",
2000 "arguments": {"text": "default palace test memory content with several tokens"},
2001 },
2002 }),
2003 )
2004 .await;
2005 // Successful dispatch returns `result.content[0].text` JSON.
2006 let text = call["result"]["content"][0]["text"]
2007 .as_str()
2008 .unwrap_or_else(|| panic!("expected success result, got {call}"));
2009 let parsed: Value = serde_json::from_str(text).expect("parse content json");
2010 assert_eq!(parsed["palace"], "default-pal");
2011 assert_eq!(parsed["status"], "stored");
2012 assert!(parsed["drawer_id"].as_str().is_some());
2013 }
2014
2015 /// Why: When no default is set, `tools/call` for a palace-bound tool
2016 /// without a `palace` argument should error helpfully rather than panic.
2017 #[tokio::test]
2018 async fn missing_palace_without_default_errors() {
2019 let (state, _tmp) = test_state();
2020 let resp = handle_message(
2021 &state,
2022 json!({
2023 "jsonrpc": "2.0",
2024 "id": 7,
2025 "method": "tools/call",
2026 "params": {
2027 "name": "memory_recall",
2028 "arguments": {"query": "anything"},
2029 },
2030 }),
2031 )
2032 .await;
2033 assert_eq!(resp["error"]["code"], -32603);
2034 let msg = resp["error"]["message"].as_str().unwrap_or("");
2035 assert!(
2036 msg.contains("missing 'palace'"),
2037 "expected helpful error, got: {msg}"
2038 );
2039 }
2040
2041 /// Why: regression for the "palaces lost on restart" bug — `AppState::new`
2042 /// builds an empty registry, so the daemon must call
2043 /// `load_palaces_from_disk` on startup to re-register palaces persisted by
2044 /// a previous run. Without that call the registry stays empty even though
2045 /// `palace.json` files exist on disk.
2046 /// What: persists two palaces under a tempdir (via the same
2047 /// `create_palace` path the `palace_create` tool uses), constructs a fresh
2048 /// `AppState` rooted there, calls `load_palaces_from_disk`, and asserts the
2049 /// returned count and registry contents.
2050 /// Test: this test itself.
2051 #[tokio::test]
2052 async fn load_palaces_from_disk_rehydrates_registry() {
2053 use trusty_common::memory_core::{Palace, PalaceId, PalaceRegistry};
2054
2055 let tmp = tempfile::tempdir().expect("tempdir");
2056 let root = tmp.path().to_path_buf();
2057
2058 // Phase 1: persist two palaces to disk, then drop the writer registry
2059 // so nothing is held in memory — simulating a prior daemon run.
2060 {
2061 let writer = PalaceRegistry::new();
2062 for id in ["alpha", "beta"] {
2063 let palace = Palace {
2064 id: PalaceId::new(id),
2065 name: id.to_string(),
2066 description: None,
2067 created_at: chrono::Utc::now(),
2068 data_dir: root.join(id),
2069 };
2070 writer
2071 .create_palace(&root, palace)
2072 .expect("persist palace to disk");
2073 }
2074 }
2075
2076 // Add a stray non-palace subdirectory; the walker must ignore it.
2077 std::fs::create_dir_all(root.join("not-a-palace")).expect("mkdir");
2078
2079 // Phase 2: fresh AppState starts with an empty registry (the bug).
2080 let state = AppState::new(root);
2081 assert!(
2082 state.registry.is_empty(),
2083 "AppState::new must start with an empty registry"
2084 );
2085
2086 // The fix: hydrate from disk.
2087 let count = state
2088 .load_palaces_from_disk()
2089 .await
2090 .expect("load_palaces_from_disk");
2091
2092 assert_eq!(count, 2, "both persisted palaces should be loaded");
2093 assert_eq!(state.registry.len(), 2, "registry should hold both palaces");
2094 let ids: Vec<String> = state.registry.list().into_iter().map(|p| p.0).collect();
2095 assert!(ids.contains(&"alpha".to_string()));
2096 assert!(ids.contains(&"beta".to_string()));
2097 }
2098
/// Why: older installs (and the legacy standalone `trusty-memory` repo) keep
/// palaces one level down, inside a `palaces/` subdirectory. When that
/// subdirectory is present, `resolve_palace_registry_dir` has to return it so
/// the daemon scans the level that actually contains the `palace.json` files;
/// scanning the parent finds nothing, which is the restart bug.
/// What: creates `<dir>/palaces/`, resolves `<dir>`, and expects the nested
/// path back.
/// Test: this test itself.
#[test]
fn resolve_palace_registry_dir_prefers_palaces_subdir() {
    let scratch = tempfile::tempdir().expect("tempdir");
    let base = scratch.path().to_path_buf();
    let nested = base.join("palaces");
    std::fs::create_dir_all(&nested).expect("mkdir palaces");

    assert_eq!(resolve_palace_registry_dir(base), nested);
}
2116
/// Why: with no `palaces/` subdirectory (a fresh install on the current flat
/// monorepo layout) the resolver must hand back the data dir unchanged.
/// What: resolves an empty tempdir and expects the same path back.
/// Test: this test itself.
#[test]
fn resolve_palace_registry_dir_falls_back_to_data_dir() {
    let scratch = tempfile::tempdir().expect("tempdir");
    let base = scratch.path().to_path_buf();

    assert_eq!(resolve_palace_registry_dir(base.clone()), base);
}
2127
/// Why: defense-in-depth (#503). A non-absolute data_dir has to be rejected
/// before it reaches `AppState::new`, because palace dirs would otherwise be
/// created relative to the daemon's CWD (`/` under launchd).
/// What: feeds a relative path to `resolve_palace_registry_dir` and confirms
/// the result is still relative, i.e. the resolver does not mask the problem
/// from the `is_absolute` startup guard in `main.rs`.
/// Test: this test itself.
#[test]
fn resolve_palace_registry_dir_relative_input_is_not_absolute() {
    // A relative dir is never valid input. The point is only that, should
    // one slip through, the output stays relative so the `main.rs` guard
    // still catches it.
    let resolved = resolve_palace_registry_dir(std::path::PathBuf::from("relative/path"));
    assert!(
        !resolved.is_absolute(),
        "a relative input should produce a relative registry dir (caught by main.rs guard)"
    );
}
2148
/// Why: defense-in-depth (#503). A data_dir of "/" has to be stopped before
/// `AppState::new`, otherwise palace dirs would be created directly under `/`.
/// What: resolves "/" and expects "/" back, which is the value the `main.rs`
/// startup guard rejects.
/// Test: this test itself.
#[test]
fn resolve_palace_registry_dir_root_input_stays_root() {
    let fs_root = std::path::PathBuf::from("/");
    // `main.rs` checks data_dir before calling the resolver, so "/" should
    // never get this far. If it did, the resolver must leave it as "/"
    // (there is no `palaces/` subdir there), so the post-resolve guard in
    // `main.rs` still rejects it.
    assert_eq!(resolve_palace_registry_dir(fs_root.clone()), fs_root);
}
2164
2165 /// Why: end-to-end check that the nested-`palaces/` layout hydrates — the
2166 /// daemon resolves the registry dir via `resolve_palace_registry_dir`, so
2167 /// an `AppState` rooted there must load palaces persisted one level below
2168 /// the bare data dir.
2169 /// What: persists two palaces under `<root>/palaces/<id>/`, constructs an
2170 /// `AppState` rooted at the resolved registry dir, and asserts hydration
2171 /// finds both.
2172 /// Test: this test itself.
2173 #[tokio::test]
2174 async fn load_palaces_from_disk_handles_palaces_subdir() {
2175 use trusty_common::memory_core::{Palace, PalaceId, PalaceRegistry};
2176
2177 let tmp = tempfile::tempdir().expect("tempdir");
2178 let root = tmp.path().to_path_buf();
2179 let nested = root.join("palaces");
2180
2181 {
2182 let writer = PalaceRegistry::new();
2183 for id in ["cto", "engineering"] {
2184 let palace = Palace {
2185 id: PalaceId::new(id),
2186 name: id.to_string(),
2187 description: None,
2188 created_at: chrono::Utc::now(),
2189 data_dir: nested.join(id),
2190 };
2191 // create_palace anchors data_dir under the passed root, so
2192 // pass `nested` here to land palaces under `<root>/palaces/`.
2193 writer
2194 .create_palace(&nested, palace)
2195 .expect("persist palace under palaces/ subdir");
2196 }
2197 }
2198
2199 // Mirror main.rs: resolve the registry dir, then root AppState there.
2200 let registry_dir = resolve_palace_registry_dir(root);
2201 assert_eq!(registry_dir, nested, "must resolve into palaces/ subdir");
2202 let state = AppState::new(registry_dir);
2203 let count = state
2204 .load_palaces_from_disk()
2205 .await
2206 .expect("load_palaces_from_disk");
2207
2208 assert_eq!(count, 2, "both nested palaces should be loaded");
2209 assert_eq!(state.registry.len(), 2);
2210 let ids: Vec<String> = state.registry.list().into_iter().map(|p| p.0).collect();
2211 assert!(ids.contains(&"cto".to_string()));
2212 assert!(ids.contains(&"engineering".to_string()));
2213 }
2214
2215 /// Why: an empty (or missing) palace registry directory must not error — a
2216 /// brand-new install has nothing to hydrate and should report zero.
2217 #[tokio::test]
2218 async fn load_palaces_from_disk_empty_root_returns_zero() {
2219 let (state, _tmp) = test_state();
2220 let count = state
2221 .load_palaces_from_disk()
2222 .await
2223 .expect("load_palaces_from_disk on empty root");
2224 assert_eq!(count, 0);
2225 assert!(state.registry.is_empty());
2226 }
2227
2228 /// Why (issue #228): hydration must seed `state.palace_names` so the
2229 /// MCP write hot path (`memory_remember` / `memory_note`) can resolve a
2230 /// friendly palace name without re-walking the data root on every call.
2231 /// Regression risk: a future refactor that forgets to populate the cache
2232 /// would silently degrade write latency.
2233 /// What: persists two palaces with distinct `name` values, constructs a
2234 /// fresh `AppState`, hydrates from disk, and asserts the cache holds the
2235 /// expected mappings.
2236 /// Test: this test itself.
2237 #[tokio::test]
2238 async fn palace_name_cache_populated_after_hydration() {
2239 use trusty_common::memory_core::{Palace, PalaceId, PalaceRegistry};
2240
2241 let tmp = tempfile::tempdir().expect("tempdir");
2242 let root = tmp.path().to_path_buf();
2243 {
2244 let writer = PalaceRegistry::new();
2245 for (id, name) in [("alpha", "Alpha Project"), ("beta", "Beta Project")] {
2246 let palace = Palace {
2247 id: PalaceId::new(id),
2248 name: name.to_string(),
2249 description: None,
2250 created_at: chrono::Utc::now(),
2251 data_dir: root.join(id),
2252 };
2253 writer.create_palace(&root, palace).expect("persist palace");
2254 }
2255 }
2256
2257 let state = AppState::new(root);
2258 assert!(
2259 state.palace_names.is_empty(),
2260 "fresh AppState must start with an empty name cache"
2261 );
2262 state
2263 .load_palaces_from_disk()
2264 .await
2265 .expect("load_palaces_from_disk");
2266
2267 assert_eq!(state.palace_names.len(), 2, "cache must hold both palaces");
2268 assert_eq!(
2269 state.palace_names.get("alpha").map(|e| e.value().clone()),
2270 Some("Alpha Project".to_string()),
2271 );
2272 assert_eq!(
2273 state.palace_names.get("beta").map(|e| e.value().clone()),
2274 Some("Beta Project".to_string()),
2275 );
2276 }
2277
2278 /// Why (issue #228): `palace_create` (MCP tool) and `MemoryService::create_palace`
2279 /// (HTTP path) both insert into the name cache so a freshly-created palace
2280 /// is resolvable on the very next write — without waiting for the next
2281 /// hydration cycle.
2282 /// What: dispatches the `palace_create` MCP tool against a tempdir and
2283 /// asserts the cache row was written.
2284 /// Test: this test itself.
2285 #[tokio::test]
2286 async fn palace_name_cache_updates_on_create() {
2287 use serde_json::json;
2288
2289 let (state, _tmp) = test_state();
2290 let _ = tools::dispatch_tool(&state, "palace_create", json!({"name": "gamma"}))
2291 .await
2292 .expect("palace_create");
2293 assert_eq!(
2294 state.palace_names.get("gamma").map(|e| e.value().clone()),
2295 Some("gamma".to_string()),
2296 "palace_create must populate the in-memory name cache so writes \
2297 can resolve the friendly name without a disk walk"
2298 );
2299 }
2300
2301 /// Why: initialize without a default palace must omit `default_palace`
2302 /// from `serverInfo` so clients can detect the unbound mode.
2303 #[tokio::test]
2304 async fn initialize_without_default_palace_omits_field() {
2305 let (state, _tmp) = test_state();
2306 let init = handle_message(
2307 &state,
2308 json!({"jsonrpc": "2.0", "id": 1, "method": "initialize"}),
2309 )
2310 .await;
2311 assert!(init["result"]["serverInfo"]["default_palace"].is_null());
2312 }
2313
2314 /// Why: every `~/.trusty-memory/http_addr` consumer (CLI, dashboard,
2315 /// future trusty-mpm wiring) must agree on the path. A regression that
2316 /// moves this file breaks every client relying on `read_daemon_addr`.
2317 /// What: under a stubbed data dir, the path ends in
2318 /// `trusty-memory/http_addr` — matching `trusty_common::read_daemon_addr`'s
2319 /// expected location.
2320 #[tokio::test]
2321 async fn http_addr_path_uses_resolve_data_dir() {
2322 // Hold the env_test_lock so this test does not race with
2323 // `prompt_context::tests::*` which spin a real daemon under
2324 // the same env override and would otherwise observe a
2325 // half-mutated $TRUSTY_DATA_DIR_OVERRIDE.
2326 let _guard = crate::commands::env_test_lock().lock().await;
2327 let tmp = tempfile::tempdir().unwrap();
2328 // SAFETY: test-only env mutation serialised by env_test_lock.
2329 unsafe {
2330 std::env::set_var(trusty_common::DATA_DIR_OVERRIDE_ENV, tmp.path());
2331 }
2332 let result = http_addr_path();
2333 unsafe {
2334 std::env::remove_var(trusty_common::DATA_DIR_OVERRIDE_ENV);
2335 }
2336 let p = result.expect("http_addr_path must return Some when data dir is resolvable");
2337 assert!(
2338 p.ends_with("trusty-memory/http_addr"),
2339 "unexpected http_addr path: {}",
2340 p.display()
2341 );
2342 }
2343
/// Why: write+read round-trip pins the disk format: a single line of
/// `host:port\n`. Clients (cat, sh `$(cat ...)`) trim whitespace, so the
/// trailing newline is invisible, but anything else (extra whitespace,
/// multi-line) would break callers.
/// What: writes a known address through `write_http_addr_file`, reads the
/// file back, and compares the raw bytes against exactly `host:port\n`. A
/// `trim()`-based comparison would let stray whitespace or blank lines
/// through, so the comparison is exact.
/// Note (issue #226): `write_http_addr_file` is part of the HTTP-serving
/// surface gated behind `axum-server`; the test follows the same gate.
/// Test: this test itself.
#[cfg(feature = "axum-server")]
#[test]
fn http_addr_file_round_trip_via_helpers() {
    let dir = tempfile::tempdir().unwrap();
    let path = dir.path().join("http_addr");
    let addr: SocketAddr = "127.0.0.1:7073".parse().unwrap();
    write_http_addr_file(&path, &addr).unwrap();
    let raw = std::fs::read_to_string(&path).unwrap();
    assert_eq!(raw.trim(), "127.0.0.1:7073");
    // The trailing newline keeps `cat` and editors happy.
    assert!(raw.ends_with('\n'));
    // Pin the exact on-disk format: one line, one trailing newline, nothing else.
    assert_eq!(
        raw,
        format!("{addr}\n"),
        "http_addr file must contain exactly `host:port\\n`"
    );
}
2362
2363 /// Why: dynamic binding must succeed even when the preferred port is
2364 /// already in use. Walking 7070..=7079 + OS fallback guarantees the
2365 /// daemon never fails to come up just because another process holds 7070.
2366 /// What: pre-bind 7070 (best-effort — skip the test if it's already
2367 /// busy on the host), then call `bind_dynamic_port` and assert we got
2368 /// *some* listener back.
2369 #[tokio::test]
2370 async fn bind_dynamic_port_returns_listener() {
2371 let listener = bind_dynamic_port().await.expect("bind_dynamic_port");
2372 let addr = listener.local_addr().expect("local_addr");
2373 assert_eq!(addr.ip().to_string(), "127.0.0.1");
2374 assert!(addr.port() > 0, "port must be non-zero after bind");
2375 }
2376
2377 /// Why: Issue #42 — prompt-facts are now served by the per-message
2378 /// `get_prompt_context` tool rather than the MCP prompts surface, so the
2379 /// `initialize` handshake must NOT advertise a `prompts` capability and
2380 /// `prompts/list` / `prompts/get` must fall through to the "method not
2381 /// found" path.
2382 #[tokio::test]
2383 async fn initialize_does_not_advertise_prompts_capability() {
2384 let (state, _tmp) = test_state();
2385 let init = handle_message(
2386 &state,
2387 json!({"jsonrpc": "2.0", "id": 1, "method": "initialize"}),
2388 )
2389 .await;
2390 assert!(
2391 init["result"]["capabilities"]["prompts"].is_null(),
2392 "initialize must NOT advertise the prompts capability; got {init}"
2393 );
2394
2395 // Both prompts/* dispatchers should now report method-not-found.
2396 for method in ["prompts/list", "prompts/get"] {
2397 let resp =
2398 handle_message(&state, json!({"jsonrpc": "2.0", "id": 2, "method": method})).await;
2399 assert_eq!(
2400 resp["error"]["code"], -32601,
2401 "{method} should return method-not-found; got {resp}"
2402 );
2403 }
2404 }
2405
2406 /// Why: `AppState::new` must initialise `bound_addr` to an empty
2407 /// `OnceLock` so `/health` reports `addr: None` on the stdio path. A
2408 /// regression that pre-populates this field would advertise a bogus
2409 /// address from a stale clone.
2410 ///
2411 /// Note (issue #231): now async so it runs inside a Tokio runtime —
2412 /// `AppState::new` spawns the bounded BM25 index worker via
2413 /// `tokio::spawn`, which requires an active runtime.
2414 #[tokio::test]
2415 async fn app_state_starts_with_empty_bound_addr() {
2416 let (state, _tmp) = test_state();
2417 assert!(state.bound_addr.get().is_none());
2418 }
2419
/// Why (issue #96): the persisted activity log stores
/// `DaemonEvent::type_str` in its `event_type` column, and the UI renders
/// historical rows by the same `type` tag it receives over SSE. If the two
/// drift apart, icons and labels break for stored rows.
/// What: builds one sample of each variant listed below, serialises it with
/// serde, and checks that `type_str()` equals the JSON `type` field.
/// Test: this test.
#[test]
fn daemon_event_type_str_matches_sse_tag() {
    let now = chrono::Utc::now();
    let samples = vec![
        DaemonEvent::PalaceCreated {
            id: "p".into(),
            name: "p".into(),
            source: ActivitySource::Http,
        },
        DaemonEvent::DrawerAdded {
            palace_id: "p".into(),
            palace_name: "p".into(),
            drawer_count: 1,
            timestamp: now,
            content_preview: String::new(),
            source: ActivitySource::Mcp,
        },
        DaemonEvent::DrawerDeleted {
            palace_id: "p".into(),
            drawer_count: 0,
            source: ActivitySource::Http,
        },
        DaemonEvent::DreamCompleted {
            palace_id: None,
            merged: 0,
            pruned: 0,
            compacted: 0,
            closets_updated: 0,
            duration_ms: 0,
            source: ActivitySource::Http,
        },
        DaemonEvent::StatusChanged {
            total_drawers: 0,
            total_vectors: 0,
            total_kg_triples: 0,
        },
        DaemonEvent::HookFired {
            palace_id: Some("p".into()),
            palace_name: Some("p".into()),
            hook_type: HookType::UserPromptSubmit,
            injection_kind: InjectionKind::PromptContext,
            injection_length: 12,
            trigger_prompt_excerpt: "hello".into(),
            timestamp: chrono::Utc::now(),
            duration_ms: 5,
            source: ActivitySource::Hook,
        },
    ];

    samples.iter().for_each(|event| {
        let wire = serde_json::to_value(event).unwrap();
        let wire_tag = wire["type"].as_str();
        assert_eq!(wire_tag, Some(event.type_str()));
    });
}
2480
/// Why: every `HookFired` activity row carries a serialised `HookType`, so
/// its wire format has to round-trip cleanly; dashboard / TUI consumers must
/// be able to parse historic entries written by an older daemon build.
/// What: for each listed variant, serialises to JSON, compares against the
/// expected PascalCase label, deserialises back, and checks `as_str()`
/// against the unquoted label.
/// Test: itself.
#[test]
fn hook_type_serde_round_trips() {
    let expectations = [
        (HookType::UserPromptSubmit, "\"UserPromptSubmit\""),
        (HookType::SessionStart, "\"SessionStart\""),
    ];
    for (ht, expected) in expectations {
        let encoded = serde_json::to_string(&ht).unwrap();
        assert_eq!(encoded, expected, "{ht:?} should serialise to {expected}");

        let decoded: HookType = serde_json::from_str(&encoded).unwrap();
        assert_eq!(decoded, ht);

        // `as_str()` is the JSON label minus the surrounding quotes.
        let bare_label = expected.trim_matches('"');
        assert_eq!(ht.as_str(), bare_label);
    }
}
2501
/// Why: the `InjectionKind` counterpart of `hook_type_serde_round_trips`;
/// the serialised label must survive a round trip.
/// What: for each listed variant, serialises to JSON, compares against the
/// expected kebab-case label, deserialises back, and checks `as_str()`
/// against the unquoted label.
/// Test: itself.
#[test]
fn injection_kind_serde_round_trips() {
    let expectations = [
        (InjectionKind::PromptContext, "\"prompt-context\""),
        (InjectionKind::InboxCheck, "\"inbox-check\""),
    ];
    for (kind, expected) in expectations {
        let encoded = serde_json::to_string(&kind).unwrap();
        assert_eq!(encoded, expected);

        let decoded: InjectionKind = serde_json::from_str(&encoded).unwrap();
        assert_eq!(decoded, kind);

        // `as_str()` is the JSON label minus the surrounding quotes.
        let bare_label = expected.trim_matches('"');
        assert_eq!(kind.as_str(), bare_label);
    }
}
2519
/// Why: the activity feed shows the trigger prompt excerpt verbatim, so an
/// oversized prompt must be cut down to at most
/// [`HOOK_PROMPT_EXCERPT_CHARS`] characters and end in a `…` marker to keep
/// the row readable.
/// What: passes a 200-character prompt and checks the length bound and the
/// trailing marker; also checks that an empty prompt yields an empty excerpt.
/// Test: itself.
#[test]
fn hook_excerpt_truncates_long_prompts() {
    let oversized = "x".repeat(200);
    let shortened = hook_prompt_excerpt(&oversized);

    // Counted in chars, not bytes: the `…` marker is multi-byte.
    let shortened_len = shortened.chars().count();
    assert!(shortened_len <= HOOK_PROMPT_EXCERPT_CHARS);
    assert!(shortened.ends_with('…'));

    assert_eq!(hook_prompt_excerpt(""), "");
}
2534
/// Why: a multi-line prompt has to be flattened to one line, otherwise the
/// activity feed row grows vertically.
/// What: passes a prompt containing blank lines and tab runs and expects a
/// single line with single spaces between words.
/// Test: itself.
#[test]
fn hook_excerpt_collapses_whitespace() {
    let flattened = hook_prompt_excerpt("hello\n\nworld\t\tfoo");
    assert_eq!(flattened, "hello world foo");
}
2546
/// Why (issue #96): the persisted activity log fills its columns from
/// `palace_id()` and `source()`, so each must pull the right field out of
/// each variant. A careless refactor could swap two fields and the log
/// would silently attribute writes to the wrong palace or source.
/// What: builds a `DrawerAdded`, a `StatusChanged`, and a `DreamCompleted`
/// event with known values and checks what both extractors return.
/// Test: this test.
#[test]
fn daemon_event_palace_id_and_source_extraction() {
    // Variant with a mandatory palace id and a source.
    let drawer_added = DaemonEvent::DrawerAdded {
        palace_id: "alpha".into(),
        palace_name: "alpha".into(),
        drawer_count: 1,
        timestamp: chrono::Utc::now(),
        content_preview: String::new(),
        source: ActivitySource::Mcp,
    };
    assert_eq!(drawer_added.palace_id(), Some("alpha"));
    assert_eq!(drawer_added.source(), Some(ActivitySource::Mcp));

    // Variant that carries neither field.
    let status_changed = DaemonEvent::StatusChanged {
        total_drawers: 1,
        total_vectors: 2,
        total_kg_triples: 3,
    };
    assert_eq!(status_changed.palace_id(), None);
    assert_eq!(status_changed.source(), None);

    // Variant whose palace id is optional.
    let dream_completed = DaemonEvent::DreamCompleted {
        palace_id: Some("p1".into()),
        merged: 0,
        pruned: 0,
        compacted: 0,
        closets_updated: 0,
        duration_ms: 10,
        source: ActivitySource::Http,
    };
    assert_eq!(dream_completed.palace_id(), Some("p1"));
    assert_eq!(dream_completed.source(), Some(ActivitySource::Http));
}
2587
2588 /// Why (issue #96): `AppState::emit` must persist mutation events to
2589 /// the activity log while keeping `StatusChanged` (a recomputed
2590 /// aggregate, not a mutation) out of the persisted history.
2591 /// What: emits one of each variant under a fresh state and asserts
2592 /// the persisted count matches the number of mutation events.
2593 /// Test: this test.
2594 #[tokio::test]
2595 async fn emit_persists_mutations_but_skips_status_changed() {
2596 let (state, _tmp) = test_state();
2597 state.emit(DaemonEvent::PalaceCreated {
2598 id: "p".into(),
2599 name: "p".into(),
2600 source: ActivitySource::Http,
2601 });
2602 state.emit(DaemonEvent::StatusChanged {
2603 total_drawers: 1,
2604 total_vectors: 0,
2605 total_kg_triples: 0,
2606 });
2607 state.emit(DaemonEvent::DrawerAdded {
2608 palace_id: "p".into(),
2609 palace_name: "p".into(),
2610 drawer_count: 1,
2611 timestamp: chrono::Utc::now(),
2612 content_preview: "x".into(),
2613 source: ActivitySource::Mcp,
2614 });
2615 // Issue #232: `emit` now offloads the redb write to `spawn_blocking`,
2616 // so the test must wait for the background pool to drain before
2617 // asserting on the persisted count.
2618 state.flush_activity_writes().await;
2619 let count = state.activity_log.count().unwrap();
2620 assert_eq!(count, 2, "only PalaceCreated + DrawerAdded must persist");
2621 }
2622
2623 /// Why (issue #156): the BM25 lane must be opt-in — existing deployments
2624 /// that don't set `TRUSTY_BM25_DAEMON=1` must see `bm25_client = None`
2625 /// and the recall hot path must continue to behave exactly as before.
2626 /// What: builds an `AppState` with `with_bm25_client_from_env()` while
2627 /// the env var is unset; asserts the field stays `None`.
2628 /// Test: this test.
2629 #[tokio::test]
2630 async fn bm25_client_disabled_by_default() {
2631 // Serialise with the sibling `bm25_client_enabled_when_env_set` test
2632 // so they don't race on the shared `TRUSTY_BM25_DAEMON` env var.
2633 let _guard = crate::commands::env_test_lock().lock().await;
2634 // SAFETY: this test exercises std::env::remove_var which is unsafe
2635 // in 2024 edition because the global env is shared. We restore the
2636 // pre-test value at the end so neighbours are unaffected.
2637 let prev = std::env::var("TRUSTY_BM25_DAEMON").ok();
2638 unsafe {
2639 std::env::remove_var("TRUSTY_BM25_DAEMON");
2640 }
2641 let (state, _tmp) = test_state();
2642 let state = state.with_bm25_client_from_env();
2643 assert!(
2644 state.bm25_client.is_none(),
2645 "bm25_client must be None when TRUSTY_BM25_DAEMON is unset"
2646 );
2647 // Issue #193: the spawn supervisor is bound to the same env gate as
2648 // the client — opt-out parity matters so we never accidentally
2649 // spawn daemons in deployments that explicitly didn't opt in.
2650 assert!(
2651 state.bm25_supervisor.is_none(),
2652 "bm25_supervisor must be None when TRUSTY_BM25_DAEMON is unset"
2653 );
2654 if let Some(v) = prev {
2655 unsafe {
2656 std::env::set_var("TRUSTY_BM25_DAEMON", v);
2657 }
2658 }
2659 }
2660
2661 /// Why (issue #156): when the operator opts in via `TRUSTY_BM25_DAEMON=1`,
2662 /// the builder must construct a real `Bm25Client` pointed at the canonical
2663 /// per-palace socket path. We don't connect — no daemon need be running —
2664 /// we only assert the client field is populated.
2665 /// What: sets the env var, runs the builder, asserts `Some(_)`.
2666 /// Test: this test.
2667 #[tokio::test]
2668 async fn bm25_client_enabled_when_env_set() {
2669 let _guard = crate::commands::env_test_lock().lock().await;
2670 let prev = std::env::var("TRUSTY_BM25_DAEMON").ok();
2671 unsafe {
2672 std::env::set_var("TRUSTY_BM25_DAEMON", "1");
2673 }
2674 let (state, _tmp) = test_state();
2675 let state = state.with_bm25_client_from_env();
2676 assert!(
2677 state.bm25_client.is_some(),
2678 "bm25_client must be Some when TRUSTY_BM25_DAEMON=1"
2679 );
2680 // Issue #193: opting in to the client must also install the spawn
2681 // supervisor so the daemon is auto-started on first use.
2682 assert!(
2683 state.bm25_supervisor.is_some(),
2684 "bm25_supervisor must be Some when TRUSTY_BM25_DAEMON=1"
2685 );
2686 match prev {
2687 Some(v) => unsafe { std::env::set_var("TRUSTY_BM25_DAEMON", v) },
2688 None => unsafe { std::env::remove_var("TRUSTY_BM25_DAEMON") },
2689 }
2690 }
2691
2692 // -------------------------------------------------------------------------
2693 // Issue #467 — palaces skipped at startup hydration are lazily re-opened
2694 // -------------------------------------------------------------------------
2695
2696 /// Why (issue #467): when `load_palaces_from_disk` fails to open a palace
2697 /// (e.g. EMFILE — "Too many open files"), it logs a warning and does NOT
2698 /// register the palace in the in-memory registry. A subsequent call to
2699 /// `open_palace` for that id must attempt a fresh open from disk, not
2700 /// permanently return "not found". This test verifies the lazy-reopen
2701 /// path: create a palace on disk, remove it from the registry (simulating a
2702 /// startup-hydration skip), then open it via `open_palace` and assert success.
2703 /// What: builds an `AppState` with an on-disk palace that is subsequently
2704 /// removed from the in-memory registry (simulating what happens when
2705 /// `PalaceHandle::open` fails during `load_palaces_from_disk`), calls
2706 /// `registry.open_palace`, and asserts the palace handle is returned.
2707 /// Test: this test.
2708 #[tokio::test]
2709 async fn open_palace_lazy_reopens_hydration_skipped_palace() {
2710 let (state, _tmp) = test_state();
2711 // Create a palace on disk.
2712 let pid = trusty_common::memory_core::palace::PalaceId::new("hydration-skip");
2713 let palace = trusty_common::memory_core::Palace {
2714 id: pid.clone(),
2715 name: "hydration-skip".to_string(),
2716 description: None,
2717 created_at: chrono::Utc::now(),
2718 data_dir: state.data_root.join("hydration-skip"),
2719 };
2720 state
2721 .registry
2722 .create_palace(&state.data_root, palace)
2723 .expect("create_palace");
2724
2725 // Simulate a startup-hydration skip by removing the just-registered handle.
2726 // In production, the palace is simply never registered because
2727 // PalaceHandle::open failed during load_palaces_from_disk (EMFILE etc.).
2728 state.registry.remove(&pid);
2729
2730 // The registry must now report no in-memory handle for this palace.
2731 assert!(
2732 state.registry.get(&pid).is_none(),
2733 "palace must appear absent (simulating hydration skip) before the lazy-reopen"
2734 );
2735
2736 // Calling open_palace should attempt a fresh open from disk and succeed.
2737 let handle = state
2738 .registry
2739 .open_palace(&state.data_root, &pid)
2740 .expect("open_palace must lazily reopen a hydration-skipped palace");
2741 assert_eq!(handle.id.as_str(), "hydration-skip");
2742 }
2743
2744 // -------------------------------------------------------------------------
2745 // Issue #498 — dotfile http_addr path uses $HOME/.trusty-memory/http_addr
2746 // -------------------------------------------------------------------------
2747
/// Why (issue #498): claude-mpm's `migrate_trusty_autodetect` discovers the
/// daemon's port by reading `~/.trusty-memory/http_addr`. On macOS the
/// OS-standard data dir is a different location, so the daemon used to write
/// the file where claude-mpm never looked and claude-mpm always fell back to
/// the hardcoded port `7070`. `dotfile_http_addr_path()` must therefore
/// return a path rooted at `$HOME/.trusty-memory/http_addr`.
/// What: calls `dotfile_http_addr_path` and, when it yields a path, checks
/// that the path ends in `.trusty-memory/http_addr`.
/// Test: this test.
#[cfg(feature = "axum-server")]
#[test]
fn dotfile_http_addr_path_uses_home_dir() {
    // `dirs::home_dir()` cannot be redirected through env on macOS, so the
    // absolute location is not controllable here; only the suffix is checked.
    // A `None` result (locked-down environment with no home dir) is
    // acceptable and simply skips the assertion.
    let Some(dotfile) = dotfile_http_addr_path() else {
        return;
    };
    assert!(
        dotfile.ends_with(".trusty-memory/http_addr"),
        "dotfile path must end in .trusty-memory/http_addr; got {}",
        dotfile.display()
    );
}
2772
/// Why (issue #498): claude-mpm's `_resolve_base_url` finds the running port
/// through the dotfile, so the daemon has to be able to write there. This
/// round trip runs `write_http_addr_file` against a dotfile-shaped path and
/// checks that the content is readable after the atomic rename.
/// What: uses a tempdir in place of $HOME, writes an address to
/// `.trusty-memory/http_addr` beneath it, and reads the file back.
/// Test: this test.
#[cfg(feature = "axum-server")]
#[test]
fn dotfile_http_addr_write_read_round_trip() {
    let fake_home = tempfile::tempdir().unwrap();
    // The `.trusty-memory` directory is not created here, so the write
    // helper is expected to cope with a missing parent.
    let dotfile = fake_home.path().join(".trusty-memory").join("http_addr");
    let addr: SocketAddr = "127.0.0.1:7099".parse().unwrap();

    write_http_addr_file(&dotfile, &addr).expect("write_http_addr_file to dotfile path");

    let contents = std::fs::read_to_string(&dotfile).unwrap();
    assert_eq!(
        contents.trim(),
        "127.0.0.1:7099",
        "dotfile round-trip content mismatch"
    );
    assert!(contents.ends_with('\n'), "dotfile must end with a newline");
}
2796}