Skip to main content

skilllite_evolution/
feedback.rs

1//! Evolution feedback collection and evaluation system (EVO-1).
2
3use anyhow::Result;
4use rusqlite::{params, Connection};
5use std::path::Path;
6
7// ─── Decision input (agent converts ExecutionFeedback to this) ─────────────────
8
/// Input for recording a decision. The agent converts its ExecutionFeedback to this.
///
/// `insert_decision` persists one `decisions` row from these fields, plus one
/// `decision_rules` row per entry in `rules_used`.
#[derive(Debug, Clone, Default)]
pub struct DecisionInput {
    /// Written to `decisions.total_tools` (presumably the number of tool calls made).
    pub total_tools: usize,
    /// Written to `decisions.failed_tools` (presumably how many of those calls failed).
    pub failed_tools: usize,
    /// Written to `decisions.replans`; averaged into the `avg_replans` metric.
    pub replans: usize,
    /// Written to `decisions.elapsed_ms` (name suggests milliseconds; confirm with the caller).
    pub elapsed_ms: u64,
    /// Written to `decisions.task_completed`. A completed task whose feedback is
    /// not `'neg'` counts as a success in the metrics queries.
    pub task_completed: bool,
    /// Optional free-text description, written to `decisions.task_description`.
    pub task_description: Option<String>,
    /// IDs of the rules linked to this decision through the `decision_rules` table.
    pub rules_used: Vec<String>,
    /// Per-tool outcomes; stored as JSON and used to derive the tool-sequence key.
    pub tools_detail: Vec<ToolExecDetail>,
}
21
/// Outcome of a single tool invocation.
///
/// A list of these is serialized to JSON into `decisions.tools_detail`.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct ToolExecDetail {
    /// Tool name; the names of the first three entries form the tool-sequence key.
    pub tool: String,
    /// Whether the invocation succeeded, as reported by the caller.
    pub success: bool,
}
27
/// User feedback signal for the last decision.
///
/// Persisted in `decisions.feedback` using the short codes returned by `as_str`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum FeedbackSignal {
    /// Stored as `"pos"`.
    ExplicitPositive,
    /// Stored as `"neg"`; the metrics queries exclude such decisions from the success count.
    ExplicitNegative,
    /// Stored as `"neutral"`; this is the default variant.
    #[default]
    Neutral,
}
36
/// The three headline metrics tracked per day.
///
/// Produced by `compute_core_metrics_for_date` and read back from the
/// `evolution_metrics` table by `fetch_latest_metrics`.
#[derive(Debug, Clone, Copy, Default, PartialEq)]
pub struct CoreMetrics {
    /// Fraction of decisions with `task_completed = 1` and feedback other than `'neg'`.
    pub first_success_rate: f64,
    /// Mean value of `replans` per decision.
    pub avg_replans: f64,
    /// `neg / (pos + neg)` over decisions with explicit feedback; 0.0 when there are none.
    pub user_correction_rate: f64,
}
43
/// Verdict reached when current metrics are compared against a baseline.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EvolutionJudgement {
    /// More metrics improved than degraded.
    Promote,
    /// Improvements and degradations balance out, or nothing changed.
    KeepObserving,
    /// More metrics degraded than improved.
    Rollback,
}

impl EvolutionJudgement {
    /// Returns the snake_case identifier of this verdict.
    pub fn as_str(&self) -> &'static str {
        match *self {
            EvolutionJudgement::Promote => "promote",
            EvolutionJudgement::KeepObserving => "keep_observing",
            EvolutionJudgement::Rollback => "rollback",
        }
    }

    /// Returns the Chinese display label of this verdict.
    pub fn label_zh(&self) -> &'static str {
        match *self {
            EvolutionJudgement::Promote => "保留",
            EvolutionJudgement::KeepObserving => "继续观察",
            EvolutionJudgement::Rollback => "回滚",
        }
    }
}
68
/// Result of `build_latest_judgement`: the verdict plus the numbers behind it.
#[derive(Debug, Clone, PartialEq)]
pub struct JudgementSummary {
    /// The verdict itself.
    pub judgement: EvolutionJudgement,
    /// Metrics computed from today's decisions.
    pub current: CoreMetrics,
    /// Most recent stored metrics row, or `None` when `evolution_metrics` is empty.
    pub baseline: Option<CoreMetrics>,
    /// Human-readable (Chinese) explanation listing the metrics that changed.
    pub reason: String,
}
76
/// One `evolution_log` row, as returned by `query_rule_history`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RuleHistoryEntry {
    /// Value of the `ts` column, returned as stored.
    pub ts: String,
    /// Value of the `type` column.
    pub event_type: String,
    /// Value of the `version` column (empty string when NULL).
    pub txn_id: String,
    /// Value of the `reason` column (empty string when NULL).
    pub reason: String,
}
84
85impl FeedbackSignal {
86    pub fn as_str(&self) -> &'static str {
87        match self {
88            Self::ExplicitPositive => "pos",
89            Self::ExplicitNegative => "neg",
90            Self::Neutral => "neutral",
91        }
92    }
93}
94
95pub fn open_evolution_db(chat_root: &Path) -> Result<Connection> {
96    let db_path = chat_root.join("feedback.sqlite");
97    let conn = Connection::open(&db_path)?;
98    conn.execute_batch("PRAGMA foreign_keys=ON;")?;
99    ensure_evolution_tables(&conn)?;
100    Ok(conn)
101}
102// ─── Schema ─────────────────────────────────────────────────────────────────
103
104pub fn ensure_evolution_tables(conn: &Connection) -> Result<()> {
105    conn.execute_batch(
106        r#"
107        CREATE TABLE IF NOT EXISTS decisions (
108            id INTEGER PRIMARY KEY AUTOINCREMENT,
109            ts TEXT NOT NULL DEFAULT (datetime('now')),
110            session_id TEXT,
111            total_tools INTEGER DEFAULT 0,
112            failed_tools INTEGER DEFAULT 0,
113            replans INTEGER DEFAULT 0,
114            elapsed_ms INTEGER DEFAULT 0,
115            task_completed BOOLEAN DEFAULT 0,
116            feedback TEXT DEFAULT 'neutral',
117            evolved BOOLEAN DEFAULT 0,
118            task_description TEXT,
119            tools_detail TEXT,
120            tool_sequence_key TEXT
121        );
122
123        CREATE TABLE IF NOT EXISTS decision_rules (
124            decision_id INTEGER REFERENCES decisions(id) ON DELETE CASCADE,
125            rule_id TEXT NOT NULL
126        );
127
128        CREATE TABLE IF NOT EXISTS evolution_log (
129            id INTEGER PRIMARY KEY AUTOINCREMENT,
130            ts TEXT NOT NULL DEFAULT (datetime('now')),
131            type TEXT NOT NULL,
132            target_id TEXT,
133            reason TEXT,
134            version TEXT
135        );
136
137        CREATE TABLE IF NOT EXISTS evolution_metrics (
138            date TEXT PRIMARY KEY,
139            first_success_rate REAL,
140            avg_replans REAL,
141            avg_tool_calls REAL,
142            user_correction_rate REAL,
143            evolved_rules INTEGER DEFAULT 0,
144            effective_rules INTEGER DEFAULT 0,
145            egl REAL DEFAULT 0.0
146        );
147
148        CREATE INDEX IF NOT EXISTS idx_decisions_evolved ON decisions(evolved);
149        CREATE INDEX IF NOT EXISTS idx_decisions_ts ON decisions(ts);
150        CREATE INDEX IF NOT EXISTS idx_dr_rule ON decision_rules(rule_id);
151        CREATE INDEX IF NOT EXISTS idx_dr_decision ON decision_rules(decision_id);
152        CREATE INDEX IF NOT EXISTS idx_evo_log_ts ON evolution_log(ts);
153        "#,
154    )?;
155    // Backward-compatible migration: add column for existing DBs (ignored if column exists).
156    let _ = conn.execute(
157        "ALTER TABLE decisions ADD COLUMN tool_sequence_key TEXT",
158        [],
159    );
160    // Index must be created after ALTER TABLE so existing DBs have the column first.
161    let _ = conn.execute(
162        "CREATE INDEX IF NOT EXISTS idx_decisions_seq ON decisions(tool_sequence_key)",
163        [],
164    );
165    Ok(())
166}
167
168/// Build a compact tool-sequence key from tools_detail (at most 3 tools joined by →).
169/// Used to group decisions by "what tool pattern was used" rather than raw task description.
170/// Example: [weather] → "weather"; [http-request, write_output] → "http-request→write_output".
171pub fn compute_tool_sequence_key(tools_detail: &[ToolExecDetail]) -> Option<String> {
172    if tools_detail.is_empty() {
173        return None;
174    }
175    let key = tools_detail
176        .iter()
177        .take(3)
178        .map(|t| t.tool.as_str())
179        .collect::<Vec<_>>()
180        .join("→");
181    Some(key)
182}
183
184// ─── Decision recording ─────────────────────────────────────────────────────
185
186pub fn insert_decision(
187    conn: &Connection,
188    session_id: Option<&str>,
189    feedback: &DecisionInput,
190    user_feedback: FeedbackSignal,
191) -> Result<i64> {
192    let tools_detail_json = serde_json::to_string(&feedback.tools_detail).unwrap_or_default();
193    let tool_sequence_key = compute_tool_sequence_key(&feedback.tools_detail);
194
195    conn.execute(
196        "INSERT INTO decisions (session_id, total_tools, failed_tools, replans,
197         elapsed_ms, task_completed, feedback, task_description, tools_detail, tool_sequence_key)
198         VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10)",
199        params![
200            session_id,
201            feedback.total_tools as i64,
202            feedback.failed_tools as i64,
203            feedback.replans as i64,
204            feedback.elapsed_ms as i64,
205            feedback.task_completed,
206            user_feedback.as_str(),
207            feedback.task_description,
208            tools_detail_json,
209            tool_sequence_key,
210        ],
211    )?;
212    let decision_id = conn.last_insert_rowid();
213
214    if !feedback.rules_used.is_empty() {
215        let mut stmt =
216            conn.prepare("INSERT INTO decision_rules (decision_id, rule_id) VALUES (?1, ?2)")?;
217        for rule_id in &feedback.rules_used {
218            stmt.execute(params![decision_id, rule_id])?;
219        }
220    }
221
222    Ok(decision_id)
223}
224
225pub fn count_unprocessed_decisions(conn: &Connection) -> Result<i64> {
226    conn.query_row(
227        "SELECT COUNT(*) FROM decisions WHERE evolved = 0",
228        [],
229        |r| r.get(0),
230    )
231    .map_err(Into::into)
232}
233
234/// Diagnostic: count unprocessed decisions with/without task_description.
235/// Evolution requires task_description to learn from decisions.
236pub fn count_decisions_with_task_desc(conn: &Connection) -> Result<(i64, i64)> {
237    let total: i64 = conn.query_row(
238        "SELECT COUNT(*) FROM decisions WHERE evolved = 0",
239        [],
240        |r| r.get(0),
241    )?;
242    let with_desc: i64 = conn.query_row(
243        "SELECT COUNT(*) FROM decisions WHERE evolved = 0 AND task_description IS NOT NULL",
244        [],
245        |r| r.get(0),
246    )?;
247    Ok((total, with_desc))
248}
249
250pub fn update_last_decision_feedback(
251    conn: &Connection,
252    session_id: &str,
253    feedback: FeedbackSignal,
254) -> Result<()> {
255    conn.execute(
256        "UPDATE decisions SET feedback = ?1
257         WHERE id = (SELECT id FROM decisions WHERE session_id = ?2 ORDER BY ts DESC LIMIT 1)",
258        params![feedback.as_str(), session_id],
259    )?;
260    Ok(())
261}
262
263// ─── Effectiveness aggregation ──────────────────────────────────────────────
264
265pub fn compute_effectiveness(conn: &Connection, rule_id: &str) -> Result<f32> {
266    let result: Result<(i64, i64), _> = conn.query_row(
267        "SELECT
268            COUNT(CASE WHEN d.task_completed = 1 AND d.feedback != 'neg' THEN 1 END),
269            COUNT(*)
270         FROM decisions d
271         JOIN decision_rules dr ON d.id = dr.decision_id
272         WHERE dr.rule_id = ?1 AND d.ts > datetime('now', '-30 days')",
273        params![rule_id],
274        |row| Ok((row.get(0)?, row.get(1)?)),
275    );
276    match result {
277        Ok((success, total)) => {
278            if total < 3 {
279                Ok(-1.0)
280            } else {
281                Ok(success as f32 / total as f32)
282            }
283        }
284        Err(_) => Ok(-1.0),
285    }
286}
287
288pub fn query_rule_history(conn: &Connection, rule_id: &str) -> Result<Vec<RuleHistoryEntry>> {
289    let mut stmt = conn.prepare(
290        "SELECT ts, type, COALESCE(version, ''), COALESCE(reason, '')
291         FROM evolution_log
292         WHERE target_id = ?1
293         ORDER BY ts DESC",
294    )?;
295
296    let rows = stmt.query_map(params![rule_id], |row| {
297        Ok(RuleHistoryEntry {
298            ts: row.get(0)?,
299            event_type: row.get(1)?,
300            txn_id: row.get(2)?,
301            reason: row.get(3)?,
302        })
303    })?;
304
305    let entries = rows.collect::<std::result::Result<Vec<_>, _>>()?;
306    Ok(entries)
307}
308
309// ─── System-level metrics ───────────────────────────────────────────────────
310
311pub fn update_daily_metrics(conn: &Connection) -> Result<()> {
312    let today = chrono::Utc::now().format("%Y-%m-%d").to_string();
313    let core = compute_core_metrics_for_date(conn, &today)?;
314
315    let avg_tool_calls: f64 = conn
316        .query_row(
317            "SELECT COALESCE(AVG(CAST(total_tools AS REAL)), 0.0)
318             FROM decisions
319             WHERE date(ts) = ?1 AND total_tools >= 1",
320            params![today],
321            |row| row.get(0),
322        )
323        .unwrap_or(0.0);
324    let egl = compute_egl(conn, &today).unwrap_or(0.0);
325
326    conn.execute(
327        "INSERT INTO evolution_metrics (date, first_success_rate, avg_replans,
328         avg_tool_calls, user_correction_rate, egl)
329         VALUES (?1, ?2, ?3, ?4, ?5, ?6)
330         ON CONFLICT(date) DO UPDATE SET
331            first_success_rate = ?2, avg_replans = ?3,
332            avg_tool_calls = ?4, user_correction_rate = ?5, egl = ?6",
333        params![
334            today,
335            core.first_success_rate,
336            core.avg_replans,
337            avg_tool_calls,
338            core.user_correction_rate,
339            egl
340        ],
341    )?;
342
343    Ok(())
344}
345
346pub fn compute_core_metrics_for_date(conn: &Connection, date: &str) -> Result<CoreMetrics> {
347    // first_success_rate: (total decisions where task_completed = 1 and feedback != 'neg') / total decisions
348    let (success_count, total_count): (i64, i64) = conn.query_row(
349        "SELECT
350            COUNT(CASE WHEN task_completed = 1 AND feedback != 'neg' THEN 1 END),
351            COUNT(*)
352         FROM decisions
353         WHERE date(ts) = ?1",
354        params![date],
355        |row| Ok((row.get(0)?, row.get(1)?)),
356    )?;
357    let first_success_rate = if total_count > 0 {
358        success_count as f64 / total_count as f64
359    } else {
360        0.0
361    };
362
363    // avg_replans: average of 'replans' for all decisions
364    let avg_replans: f64 = conn
365        .query_row(
366            "SELECT COALESCE(AVG(CAST(replans AS REAL)), 0.0) FROM decisions WHERE date(ts) = ?1",
367            params![date],
368            |row| row.get(0),
369        )
370        .unwrap_or(0.0);
371
372    // user_correction_rate: count of 'neg' feedbacks / (count of 'pos' feedbacks + count of 'neg' feedbacks)
373    let (pos_feedback_count, neg_feedback_count): (i64, i64) = conn.query_row(
374        "SELECT
375            COUNT(CASE WHEN feedback = 'pos' THEN 1 END),
376            COUNT(CASE WHEN feedback = 'neg' THEN 1 END)
377         FROM decisions
378         WHERE date(ts) = ?1",
379        params![date],
380        |row| Ok((row.get(0)?, row.get(1)?)),
381    )?;
382    let user_correction_rate = if (pos_feedback_count + neg_feedback_count) > 0 {
383        neg_feedback_count as f64 / (pos_feedback_count + neg_feedback_count) as f64
384    } else {
385        0.0
386    };
387
388    Ok(CoreMetrics {
389        first_success_rate,
390        avg_replans,
391        user_correction_rate,
392    })
393}
394
395// ─── Evolution decision making ────────────────────────────────────────────────
396
397/// EGL (Evolutionary Grade Level) captures a single, combined score of "goodness"
398/// of a given rule or agent behavior.
399/// EGL = (first_success_rate * A) - (avg_replans * B) - (user_correction_rate * C)
400/// where A, B, C are configurable weights.
401/// A higher EGL indicates better performance.
402///
403/// This is meant for comparing rule effectiveness, so it's only computed for specific rules.
404pub fn compute_egl_for_rule(conn: &Connection, rule_id: &str) -> Result<f64> {
405    let (success_count, total_count, total_replans, pos_feedback, neg_feedback): (
406        i64,
407        i64,
408        i64,
409        i64,
410        i64,
411    ) = conn.query_row(
412        "SELECT
413            COUNT(CASE WHEN d.task_completed = 1 AND d.feedback != 'neg' THEN 1 END),
414            COUNT(*),
415            SUM(d.replans),
416            COUNT(CASE WHEN d.feedback = 'pos' THEN 1 END),
417            COUNT(CASE WHEN d.feedback = 'neg' THEN 1 END)
418         FROM decisions d
419         JOIN decision_rules dr ON d.id = dr.decision_id
420         WHERE dr.rule_id = ?1 AND d.ts > datetime('now', '-30 days')", // Last 30 days
421        params![rule_id],
422        |row| {
423            Ok((
424                row.get(0)?,
425                row.get(1)?,
426                row.get(2)?,
427                row.get(3)?,
428                row.get(4)?,
429            ))
430        },
431    )?;
432
433    if total_count == 0 {
434        return Ok(0.0);
435    }
436
437    let success_rate = success_count as f64 / total_count as f64;
438    let avg_replans = total_replans as f64 / total_count as f64;
439    let user_correction_rate = if (pos_feedback + neg_feedback) > 0 {
440        neg_feedback as f64 / (pos_feedback + neg_feedback) as f64
441    } else {
442        0.0
443    };
444
445    // Weights (can be configured)
446    let w_success = 1.0;
447    let w_replans = 0.5;
448    let w_correction = 0.7;
449
450    let egl = (success_rate * w_success)
451        - (avg_replans * w_replans)
452        - (user_correction_rate * w_correction);
453    Ok(egl)
454}
455
456/// System-wide EGL based on global metrics for the current day.
457pub fn compute_egl(conn: &Connection, date: &str) -> Result<f64> {
458    let metrics = compute_core_metrics_for_date(conn, date)?;
459
460    // Weights (can be configured)
461    let w_success = 1.0;
462    let w_replans = 0.5;
463    let w_correction = 0.7;
464
465    let egl = (metrics.first_success_rate * w_success)
466        - (metrics.avg_replans * w_replans)
467        - (metrics.user_correction_rate * w_correction);
468    Ok(egl)
469}
470
471pub fn fetch_latest_metrics(conn: &Connection) -> Result<Option<CoreMetrics>> {
472    let mut stmt = conn.prepare(
473        "SELECT first_success_rate, avg_replans, user_correction_rate
474         FROM evolution_metrics ORDER BY date DESC LIMIT 1",
475    )?;
476    let metrics = stmt.query_row([], |row| {
477        Ok(CoreMetrics {
478            first_success_rate: row.get(0)?,
479            avg_replans: row.get(1)?,
480            user_correction_rate: row.get(2)?,
481        })
482    });
483    match metrics {
484        Ok(m) => Ok(Some(m)),
485        Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None),
486        Err(e) => Err(e.into()),
487    }
488}
489
490pub fn build_latest_judgement(conn: &Connection) -> Result<Option<JudgementSummary>> {
491    let today = chrono::Utc::now().format("%Y-%m-%d").to_string();
492    let current_metrics = compute_core_metrics_for_date(conn, &today)?;
493    let baseline_metrics = fetch_latest_metrics(conn)?;
494
495    if let Some(baseline) = baseline_metrics {
496        let mut reason_parts = vec![];
497        let mut promote_score = 0; // +1 for improvement, -1 for degradation
498        let mut degrade_score = 0;
499
500        if current_metrics.first_success_rate > baseline.first_success_rate {
501            reason_parts.push(format!(
502                "首次成功率从 {:.2}% 提升到 {:.2}%",
503                baseline.first_success_rate * 100.0,
504                current_metrics.first_success_rate * 100.0
505            ));
506            promote_score += 1;
507        } else if current_metrics.first_success_rate < baseline.first_success_rate {
508            reason_parts.push(format!(
509                "首次成功率从 {:.2}% 下降到 {:.2}%",
510                baseline.first_success_rate * 100.0,
511                current_metrics.first_success_rate * 100.0
512            ));
513            degrade_score += 1;
514        }
515
516        if current_metrics.avg_replans < baseline.avg_replans {
517            reason_parts.push(format!(
518                "平均重试次数从 {:.2} 减少到 {:.2}",
519                baseline.avg_replans, current_metrics.avg_replans
520            ));
521            promote_score += 1;
522        } else if current_metrics.avg_replans > baseline.avg_replans {
523            reason_parts.push(format!(
524                "平均重试次数从 {:.2} 增加到 {:.2}",
525                baseline.avg_replans, current_metrics.avg_replans
526            ));
527            degrade_score += 1;
528        }
529
530        if current_metrics.user_correction_rate < baseline.user_correction_rate {
531            reason_parts.push(format!(
532                "用户修正率从 {:.2}% 减少到 {:.2}%",
533                baseline.user_correction_rate * 100.0,
534                current_metrics.user_correction_rate * 100.0
535            ));
536            promote_score += 1;
537        } else if current_metrics.user_correction_rate > baseline.user_correction_rate {
538            reason_parts.push(format!(
539                "用户修正率从 {:.2}% 增加到 {:.2}%",
540                baseline.user_correction_rate * 100.0,
541                current_metrics.user_correction_rate * 100.0
542            ));
543            degrade_score += 1;
544        }
545
546        let judgement = if promote_score > degrade_score && promote_score > 0 {
547            EvolutionJudgement::Promote
548        } else if degrade_score > promote_score && degrade_score > 0 {
549            EvolutionJudgement::Rollback
550        } else {
551            EvolutionJudgement::KeepObserving
552        };
553
554        let reason = if reason_parts.is_empty() {
555            "指标无显著变化".to_string()
556        } else {
557            reason_parts.join(",")
558        };
559
560        Ok(Some(JudgementSummary {
561            judgement,
562            current: current_metrics,
563            baseline: Some(baseline),
564            reason,
565        }))
566    } else {
567        // No baseline data, keep observing
568        Ok(Some(JudgementSummary {
569            judgement: EvolutionJudgement::KeepObserving,
570            current: current_metrics,
571            baseline: None,
572            reason: "无基线数据,继续观察".to_string(),
573        }))
574    }
575}
576
577// ─── Logging and persistence ────────────────────────────────────────────────
578
579pub fn log_evolution_event(
580    conn: &Connection,
581    event_type: &str,
582    target_id: Option<&str>,
583    reason: Option<&str>,
584    version: Option<&str>,
585) -> Result<i64> {
586    conn.execute(
587        "INSERT INTO evolution_log (type, target_id, reason, version) VALUES (?1, ?2, ?3, ?4)",
588        params![event_type, target_id, reason, version],
589    )?;
590    Ok(conn.last_insert_rowid())
591}
592
593// ─── Test helpers ───────────────────────────────────────────────────────────
594
#[cfg(test)]
mod tests {
    use super::*;
    use rusqlite::Connection;

    /// Fresh in-memory database with the evolution schema applied.
    fn setup_conn() -> Connection {
        let conn = Connection::open_in_memory().unwrap();
        ensure_evolution_tables(&conn).unwrap();
        conn
    }

    /// Minimal completed decision: one tool, no rules, no tool details.
    fn sample_input() -> DecisionInput {
        DecisionInput {
            total_tools: 1,
            elapsed_ms: 100,
            task_completed: true,
            task_description: Some("test task".to_string()),
            ..Default::default()
        }
    }

    #[test]
    fn test_ensure_evolution_tables() {
        let conn = setup_conn();
        let tables = conn
            .query_row(
                "SELECT name FROM sqlite_master WHERE type='table' AND name='decisions'",
                [],
                |r| r.get::<_, String>(0),
            )
            .unwrap();
        assert_eq!(tables, "decisions");
    }

    #[test]
    fn test_insert_decision() {
        let conn = setup_conn();
        let input = sample_input();
        let decision_id =
            insert_decision(&conn, Some("session1"), &input, FeedbackSignal::Neutral).unwrap();
        assert!(decision_id > 0);

        let count: i64 = conn
            .query_row("SELECT COUNT(*) FROM decisions", [], |r| r.get(0))
            .unwrap();
        assert_eq!(count, 1);
    }

    #[test]
    fn test_update_last_decision_feedback() {
        let conn = setup_conn();
        let input = sample_input();
        insert_decision(&conn, Some("s1"), &input, FeedbackSignal::Neutral).unwrap();
        update_last_decision_feedback(&conn, "s1", FeedbackSignal::ExplicitPositive).unwrap();

        let feedback: String = conn
            .query_row(
                "SELECT feedback FROM decisions WHERE session_id = 's1'",
                [],
                |r| r.get(0),
            )
            .unwrap();
        assert_eq!(feedback, "pos");
    }

    #[test]
    fn test_compute_effectiveness() {
        let conn = setup_conn();
        // `ts` is left to its default (now): compute_effectiveness only looks at
        // the last 30 days, so fixed timestamps would make this test expire.
        conn.execute_batch(
            "INSERT INTO decisions (task_completed, feedback) VALUES
             (1, 'pos'),
             (0, 'neg'),
             (1, 'neutral'),
             (1, 'pos');
             INSERT INTO decision_rules (decision_id, rule_id) VALUES
             (1, 'test-rule'), (2, 'test-rule'), (3, 'test-rule'), (4, 'test-rule');",
        )
        .unwrap();

        let effectiveness = compute_effectiveness(&conn, "test-rule").unwrap();
        // 3 successful (pos, neutral, pos) out of 4 total = 0.75
        assert!((effectiveness - 0.75).abs() < 1e-6);
    }

    #[test]
    fn test_compute_effectiveness_less_than_three_decisions() {
        let conn = setup_conn();
        // Default `ts` keeps both rows inside the 30-day window, so the -1.0
        // result is due to the minimum-sample threshold and nothing else.
        conn.execute_batch(
            "INSERT INTO decisions (task_completed, feedback) VALUES
             (1, 'pos'),
             (1, 'neutral');
             INSERT INTO decision_rules (decision_id, rule_id) VALUES
             (1, 'test-rule-2'), (2, 'test-rule-2');",
        )
        .unwrap();

        let effectiveness = compute_effectiveness(&conn, "test-rule-2").unwrap();
        assert!((effectiveness - -1.0).abs() < 1e-6);
    }

    #[test]
    fn test_query_rule_history_returns_events_for_rule() {
        let conn = setup_conn();
        conn.execute_batch(
            "INSERT INTO evolution_log (ts, type, target_id, reason, version) VALUES
             ('2026-03-14T09:00:00Z', 'rule_added', 'rule-a', 'seeded', 'txn-1'),
             ('2026-03-14T10:00:00Z', 'rule_promoted', 'rule-a', 'effective', 'txn-2'),
             ('2026-03-14T11:00:00Z', 'rule_added', 'rule-b', 'other', 'txn-3')",
        )
        .unwrap();

        let history = query_rule_history(&conn, "rule-a").unwrap();
        assert_eq!(history.len(), 2);
        assert_eq!(history[0].event_type, "rule_promoted");
        assert_eq!(history[0].txn_id, "txn-2");
        assert_eq!(history[0].reason, "effective");
        assert_eq!(history[1].event_type, "rule_added");
        assert_eq!(history[1].txn_id, "txn-1");
    }

    #[test]
    fn test_compute_tool_sequence_key() {
        let tools_detail = vec![
            ToolExecDetail {
                tool: "read_file".to_string(),
                success: true,
            },
            ToolExecDetail {
                tool: "write_file".to_string(),
                success: true,
            },
            ToolExecDetail {
                tool: "run_command".to_string(),
                success: false,
            },
            ToolExecDetail {
                tool: "http_request".to_string(),
                success: true,
            }, // Should be ignored
        ];
        let key = compute_tool_sequence_key(&tools_detail);
        assert_eq!(key, Some("read_file→write_file→run_command".to_string()));

        let empty_detail: Vec<ToolExecDetail> = vec![];
        let key = compute_tool_sequence_key(&empty_detail);
        assert_eq!(key, None);

        let single_detail = vec![ToolExecDetail {
            tool: "list_directory".to_string(),
            success: true,
        }];
        let key = compute_tool_sequence_key(&single_detail);
        assert_eq!(key, Some("list_directory".to_string()));
    }

    #[test]
    fn test_insert_decision_with_rules() {
        let conn = setup_conn();
        let input = DecisionInput {
            total_tools: 2,
            task_description: Some("test".to_string()),
            rules_used: vec!["rule-a".to_string(), "rule-b".to_string()],
            tools_detail: vec![ToolExecDetail {
                tool: "read_file".to_string(),
                success: true,
            }],
            ..sample_input()
        };

        let id = insert_decision(&conn, Some("s1"), &input, FeedbackSignal::Neutral).unwrap();
        let mut stmt = conn
            .prepare("SELECT rule_id FROM decision_rules WHERE decision_id = ?1 ORDER BY rule_id")
            .unwrap();
        let rows: Vec<String> = stmt
            .query_map(params![id], |row| row.get(0))
            .unwrap()
            .collect::<std::result::Result<Vec<_>, _>>()
            .unwrap();

        assert_eq!(rows, vec!["rule-a".to_string(), "rule-b".to_string()]);
    }

    #[test]
    fn test_compute_core_metrics_for_date_uses_minimal_metrics() {
        let conn = setup_conn();
        conn.execute(
            "INSERT INTO decisions (ts, total_tools, replans, task_completed, feedback)
             VALUES
             ('2026-03-14 09:00:00', 1, 0, 1, 'neutral'),
             ('2026-03-14 10:00:00', 2, 1, 1, 'neg'),
             ('2026-03-14 11:00:00', 1, 2, 0, 'pos')",
            [],
        )
        .unwrap();

        let metrics = compute_core_metrics_for_date(&conn, "2026-03-14").unwrap();
        assert!((metrics.first_success_rate - (1.0 / 3.0)).abs() < 1e-6);
        assert!((metrics.avg_replans - 1.0).abs() < 1e-6);
        assert!((metrics.user_correction_rate - 0.5).abs() < 1e-6);
    }

    #[test]
    fn test_compute_core_metrics_for_date_avg_replans_and_user_correction_rate_extended() {
        let conn = setup_conn();
        conn.execute_batch(
            "INSERT INTO decisions (ts, total_tools, replans, task_completed, feedback)
             VALUES
             ('2026-03-15 09:00:00', 1, 0, 1, 'neutral'),
             ('2026-03-15 10:00:00', 2, 1, 1, 'neg'),
             ('2026-03-15 11:00:00', 1, 2, 0, 'pos'),
             ('2026-03-15 12:00:00', 3, 3, 1, 'neutral'),
             ('2026-03-15 13:00:00', 1, 0, 1, 'pos'),
             ('2026-03-15 14:00:00', 2, 1, 0, 'neg'),
             ('2026-03-15 15:00:00', 1, 0, 1, 'neutral')",
        )
        .unwrap();

        let metrics = compute_core_metrics_for_date(&conn, "2026-03-15").unwrap();
        // avg_replans: (0 + 1 + 2 + 3 + 0 + 1 + 0) / 7 = 7 / 7 = 1.0
        assert!((metrics.avg_replans - 1.0).abs() < 1e-6);
        // user_correction_rate: neg (2) / (pos (2) + neg (2)) = 2 / 4 = 0.5
        assert!((metrics.user_correction_rate - 0.5).abs() < 1e-6);
    }

    #[test]
    fn test_build_latest_judgement_promotes_improving_metrics() {
        let conn = setup_conn();
        conn.execute(
            "INSERT INTO evolution_metrics (date, first_success_rate, avg_replans, avg_tool_calls, user_correction_rate, egl)
             VALUES
             ('2026-03-10', 0.40, 1.5, 3.0, 0.30, 0.0),
             ('2026-03-11', 0.50, 1.4, 3.0, 0.20, 0.0),
             ('2026-03-12', 0.55, 1.2, 3.0, 0.15, 0.0),
             ('2026-03-14', 0.72, 0.8, 2.5, 0.10, 0.0)",
            [],
        )
        .unwrap();
        // Give "today" real data that beats the baseline on every metric
        // (success 1.0 > 0.72, replans 0 < 0.8, correction 0 < 0.10), so the
        // verdict does not depend on how an empty day is scored.
        conn.execute(
            "INSERT INTO decisions (total_tools, replans, task_completed, feedback)
             VALUES
             (1, 0, 1, 'neutral'),
             (1, 0, 1, 'pos'),
             (2, 0, 1, 'neutral')",
            [],
        )
        .unwrap();

        let summary = build_latest_judgement(&conn).unwrap().unwrap();
        assert_eq!(summary.judgement, EvolutionJudgement::Promote);
        assert!(summary.baseline.is_some());
    }
}