Skip to main content

zeph_core/agent/
experiment_cmd.rs

1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4//! `/experiment` slash-command handler for the agent loop.
5
6use std::fmt::Write as _;
7use std::sync::Arc;
8
9use super::{Agent, error::AgentError};
10use crate::channel::Channel;
11use crate::experiments::{
12    BenchmarkSet, Evaluator, ExperimentEngine, ExperimentSource, GridStep, SearchSpace,
13};
14
15impl<C: Channel> Agent<C> {
16    /// Dispatch `/experiment [subcommand]` slash command.
17    ///
18    /// # Errors
19    ///
20    /// Returns an error if the channel send fails or a `SQLite` query fails.
21    pub async fn handle_experiment_command(&mut self, input: &str) -> Result<(), AgentError> {
22        let args = input.strip_prefix("/experiment").unwrap_or("").trim();
23
24        match args {
25            "" | "status" => return self.handle_experiment_status().await,
26            "stop" => return self.handle_experiment_stop().await,
27            "report" => return self.handle_experiment_report().await,
28            "best" => return self.handle_experiment_best().await,
29            _ => {}
30        }
31
32        if args == "start" || args.starts_with("start") {
33            let max_override = args
34                .strip_prefix("start")
35                .and_then(|s| s.trim().parse::<u32>().ok());
36            return self.handle_experiment_start(max_override).await;
37        }
38
39        self.channel
40            .send(
41                "Unknown /experiment subcommand. Available: /experiment start [N], \
42                 /experiment stop, /experiment status, /experiment report, /experiment best",
43            )
44            .await?;
45        Ok(())
46    }
47
48    async fn handle_experiment_start(
49        &mut self,
50        max_override: Option<u32>,
51    ) -> Result<(), AgentError> {
52        // Guard: reject if an experiment is already running.
53        if self
54            .experiments
55            .cancel
56            .as_ref()
57            .is_some_and(|t| !t.is_cancelled())
58        {
59            self.channel
60                .send("Experiment already running. Use /experiment stop to cancel.")
61                .await?;
62            return Ok(());
63        }
64
65        let mut config = self.experiments.config.clone();
66
67        if !config.enabled {
68            self.channel
69                .send(
70                    "Experiments are disabled. Set `experiments.enabled = true` in config \
71                     and restart.",
72                )
73                .await?;
74            return Ok(());
75        }
76
77        if let Some(n) = max_override {
78            config.max_experiments = n;
79        }
80
81        if let Err(e) = config.validate() {
82            self.channel
83                .send(&format!("Experiment config is invalid: {e}"))
84                .await?;
85            return Ok(());
86        }
87
88        let max_n = config.max_experiments;
89        let engine = match self.build_experiment_engine(config) {
90            Ok(e) => e,
91            Err(msg) => {
92                self.channel.send(&msg).await?;
93                return Ok(());
94            }
95        };
96
97        self.run_experiment_engine(engine, max_n).await
98    }
99
100    /// Build an [`ExperimentEngine`] from validated config, returning `Err(message)` on failure.
101    fn build_experiment_engine(
102        &mut self,
103        config: crate::config::ExperimentConfig,
104    ) -> Result<ExperimentEngine, String> {
105        let benchmark_path = if let Some(p) = &config.benchmark_file {
106            p.clone()
107        } else {
108            return Err("experiments.benchmark_file is not set in config.".to_owned());
109        };
110
111        let benchmark = BenchmarkSet::from_file(&benchmark_path)
112            .map_err(|e| format!("Failed to load benchmark: {e}"))?;
113
114        let provider_arc = Arc::new(self.provider.clone());
115        // Use a dedicated eval provider when `eval_model` is configured, so the judge is
116        // independent from the agent under test. Fall back to the primary provider otherwise.
117        let judge_arc = self
118            .experiments
119            .eval_provider
120            .as_ref()
121            .map_or_else(|| Arc::clone(&provider_arc), |p| Arc::new(p.clone()));
122        let evaluator = Evaluator::new(judge_arc, benchmark, config.eval_budget_tokens)
123            .map_err(|e| format!("Failed to create evaluator: {e}"))?;
124
125        let generator = Box::new(GridStep::new(SearchSpace::default()));
126        // Use the pre-built baseline snapshot that reflects actual runtime config values.
127        // Set via Agent::with_experiment_baseline(); defaults to ConfigSnapshot::default()
128        // when not explicitly provided.
129        let baseline = self.experiments.baseline.clone();
130        let memory = self.memory_state.memory.clone();
131
132        Ok(
133            ExperimentEngine::new(evaluator, generator, provider_arc, baseline, config, memory)
134                .with_source(ExperimentSource::Manual),
135        )
136    }
137
138    async fn run_experiment_engine(
139        &mut self,
140        mut engine: ExperimentEngine,
141        max_n: u32,
142    ) -> Result<(), AgentError> {
143        let cancel = engine.cancel_token();
144        self.experiments.cancel = Some(cancel);
145
146        self.channel
147            .send(&format!(
148                "Experiment session starting (max {max_n} experiments). \
149                 Use /experiment stop to cancel. Results will be shown when complete.",
150            ))
151            .await?;
152
153        // Run the engine in a background task so the agent loop remains responsive.
154        // Completion (ok or error) is forwarded via experiment_notify_tx; the agent loop
155        // select! branch clears experiment_cancel and delivers the message to the channel.
156        let notify_tx = self.experiments.notify_tx.clone();
157        tokio::spawn(async move {
158            let msg = match engine.run().await {
159                Ok(report) => {
160                    let accepted = report.results.iter().filter(|r| r.accepted).count();
161                    let wall_secs =
162                        f64::from(u32::try_from(report.wall_time_ms).unwrap_or(u32::MAX)) / 1000.0;
163                    format!(
164                        "Experiment session `{}` complete: {}/{} accepted, \
165                         baseline {:.3} → {:.3} (improvement {:.3}), {wall_secs:.1}s{}",
166                        &report.session_id[..report.session_id.len().min(8)],
167                        accepted,
168                        report.results.len(),
169                        report.baseline_score,
170                        report.final_score,
171                        report.total_improvement,
172                        if report.cancelled { " [cancelled]" } else { "" },
173                    )
174                }
175                Err(e) => format!("Experiment session failed: {e}"),
176            };
177            // Ignore send errors: the agent may have shut down before the engine finished.
178            let _ = notify_tx.send(msg).await;
179        });
180
181        Ok(())
182    }
183
184    async fn handle_experiment_stop(&mut self) -> Result<(), AgentError> {
185        match &self.experiments.cancel {
186            Some(token) if !token.is_cancelled() => {
187                token.cancel();
188                self.channel
189                    .send("Experiment session cancelled. Results so far are saved.")
190                    .await?;
191            }
192            _ => {
193                self.channel
194                    .send("No experiment is currently running.")
195                    .await?;
196            }
197        }
198        Ok(())
199    }
200
201    async fn handle_experiment_status(&mut self) -> Result<(), AgentError> {
202        let running = self
203            .experiments
204            .cancel
205            .as_ref()
206            .is_some_and(|t| !t.is_cancelled());
207
208        let mut msg = if running {
209            String::from("Experiment: **running**. Use `/experiment stop` to cancel.")
210        } else {
211            String::from("Experiment: **idle**. Use `/experiment start [N]` to begin.")
212        };
213
214        if let Some(memory) = &self.memory_state.memory {
215            let rows = memory.sqlite().list_experiment_results(None, 1).await?;
216            if let Some(latest) = rows.first()
217                && let Some(summary) = memory
218                    .sqlite()
219                    .experiment_session_summary(&latest.session_id)
220                    .await?
221            {
222                let sid_len = summary.session_id.len().min(11);
223                let _ = write!(
224                    msg,
225                    "\nLast session: `{}` | {} experiments | {} accepted | \
226                     best delta: {:.3}",
227                    &summary.session_id[..sid_len],
228                    summary.total,
229                    summary.accepted_count,
230                    summary.best_delta,
231                );
232            }
233        }
234
235        self.channel.send(&msg).await?;
236        Ok(())
237    }
238
239    async fn handle_experiment_report(&mut self) -> Result<(), AgentError> {
240        let Some(memory) = &self.memory_state.memory else {
241            self.channel
242                .send("Memory is not enabled — cannot query experiment results.")
243                .await?;
244            return Ok(());
245        };
246
247        let rows = memory.sqlite().list_experiment_results(None, 50).await?;
248
249        if rows.is_empty() {
250            self.channel.send("No experiment results found.").await?;
251            return Ok(());
252        }
253
254        let mut out = String::from("**Experiment Results** (last 50, newest first)\n\n```\n");
255        let _ = writeln!(
256            out,
257            "{:<8} {:<12} {:<20} {:<8} {:<8} {:<8} {:<8}",
258            "ID", "Session", "Parameter", "Delta", "Baseline", "Candidate", "Accepted"
259        );
260        for r in &rows {
261            let sid_len = r.session_id.len().min(11);
262            let _ = writeln!(
263                out,
264                "{:<8} {:<12} {:<20} {:<8.3} {:<8.3} {:<8.3} {:<8}",
265                r.id,
266                &r.session_id[..sid_len],
267                &r.parameter,
268                r.delta,
269                r.baseline_score,
270                r.candidate_score,
271                if r.accepted { "yes" } else { "no" },
272            );
273        }
274        out.push_str("```");
275        self.channel.send(&out).await?;
276        Ok(())
277    }
278
279    async fn handle_experiment_best(&mut self) -> Result<(), AgentError> {
280        let Some(memory) = &self.memory_state.memory else {
281            self.channel
282                .send("Memory is not enabled — cannot query experiment results.")
283                .await?;
284            return Ok(());
285        };
286
287        let row = memory.sqlite().best_experiment_result(None).await?;
288
289        match row {
290            None => {
291                self.channel
292                    .send("No accepted experiment results found yet.")
293                    .await?;
294            }
295            Some(r) => {
296                let sid_len = r.session_id.len().min(11);
297                let msg = format!(
298                    "**Best experiment result**\n\
299                     - Session: `{}`\n\
300                     - Parameter: `{}`\n\
301                     - Delta: `{:.3}` ({:.3} → {:.3})\n\
302                     - Source: `{}`\n\
303                     - At: {}",
304                    &r.session_id[..sid_len],
305                    r.parameter,
306                    r.delta,
307                    r.baseline_score,
308                    r.candidate_score,
309                    r.source,
310                    r.created_at,
311                );
312                self.channel.send(&msg).await?;
313            }
314        }
315        Ok(())
316    }
317}
318
#[cfg(test)]
mod tests {
    use super::super::agent_tests::{
        MockChannel, MockToolExecutor, create_test_registry, mock_provider,
    };
    use super::*;

    /// Build an agent from the shared test mocks, with no scripted provider or channel
    /// input.
    fn make_agent() -> Agent<MockChannel> {
        let provider = mock_provider(vec![]);
        let channel = MockChannel::new(vec![]);
        let registry = create_test_registry();
        let executor = MockToolExecutor::no_tools();
        Agent::new(provider, channel, registry, None, 5, executor)
    }

    /// Run `command` on `agent` and assert that at least one message sent to the
    /// channel contains `needle`. `what` names the expectation in the failure message.
    async fn expect_reply(mut agent: Agent<MockChannel>, command: &str, needle: &str, what: &str) {
        agent.handle_experiment_command(command).await.unwrap();
        let msgs = agent.channel.sent_messages();
        assert!(
            msgs.iter().any(|s| s.contains(needle)),
            "expected {what}, got: {msgs:?}"
        );
    }

    #[tokio::test]
    async fn unknown_subcommand_returns_help() {
        expect_reply(
            make_agent(),
            "/experiment foobar",
            "Unknown /experiment",
            "help text",
        )
        .await;
    }

    #[tokio::test]
    async fn start_when_disabled_returns_error() {
        // Default config has enabled = false.
        expect_reply(
            make_agent(),
            "/experiment start",
            "disabled",
            "disabled message",
        )
        .await;
    }

    #[tokio::test]
    async fn stop_when_not_running_returns_not_running() {
        expect_reply(
            make_agent(),
            "/experiment stop",
            "No experiment is currently running",
            "not-running message",
        )
        .await;
    }

    #[tokio::test]
    async fn status_when_idle_returns_idle() {
        expect_reply(make_agent(), "/experiment status", "idle", "idle status").await;
    }

    #[tokio::test]
    async fn empty_subcommand_returns_status() {
        // Empty subcommand maps to status (idle when no experiment running).
        expect_reply(
            make_agent(),
            "/experiment",
            "idle",
            "idle status for empty subcommand",
        )
        .await;
    }

    #[tokio::test]
    async fn start_while_running_returns_already_running() {
        use tokio_util::sync::CancellationToken;

        let mut agent = make_agent();
        // Simulate a running experiment by inserting a live cancel token.
        agent.experiments.cancel = Some(CancellationToken::new());

        expect_reply(
            agent,
            "/experiment start",
            "already running",
            "already-running guard",
        )
        .await;
    }

    #[tokio::test]
    async fn report_without_memory_returns_error() {
        // No memory wired → should return the "not enabled" message.
        expect_reply(
            make_agent(),
            "/experiment report",
            "Memory is not enabled",
            "no-memory message",
        )
        .await;
    }

    #[tokio::test]
    async fn best_without_memory_returns_error() {
        expect_reply(
            make_agent(),
            "/experiment best",
            "Memory is not enabled",
            "no-memory message",
        )
        .await;
    }
}