batty_cli/team/
quality_metrics.rs

1use std::collections::{BTreeMap, HashMap};
2
3use super::events::TeamEvent;
4
/// A set of quality measurements without any backend/role/task identity.
///
/// This is the value type consumed and produced by [`aggregate_by_backend`],
/// where each field of the result is the mean of that field over a backend's
/// samples. The field names mirror [`CompletionQualityMetrics`]; presumably
/// they carry the same meaning — confirm against the callers that build it.
#[derive(Clone, Debug, Default, PartialEq)]
pub struct QualityMetrics {
    /// Narration share of the output (see [`narration_ratio`]).
    pub narration_ratio: f64,
    /// Commit rate (see [`commit_frequency`], which yields commits per hour).
    pub commit_frequency: f64,
    /// First-pass test success rate.
    pub first_pass_test_rate: f64,
    /// Retry measure; averaged as a plain float by the aggregation helpers.
    pub retry_rate: f64,
    /// Time to completion, in seconds.
    pub time_to_completion_secs: u64,
}
13
/// Quality measurements for one completed task, tagged with who produced it.
///
/// Values of this type are assembled by [`build_completion_quality_metrics`]
/// and summarised per backend by [`aggregate_completion_metrics_by_backend`].
#[derive(Clone, Debug, PartialEq)]
pub struct CompletionQualityMetrics {
    /// Name of the backend that did the work; the grouping key for aggregation.
    pub backend: String,
    /// Role that completed the task.
    pub role: String,
    /// Task identifier as text (the builder stores the decimal form of a `u32`).
    pub task_id: String,
    /// Share of non-blank output lines classified as explanation, in `0.0..=1.0`.
    pub narration_ratio: f64,
    /// Commits per hour over the completion time; `0.0` when it cannot be computed.
    pub commit_frequency: f64,
    /// `1.0` when the builder saw zero retries, otherwise `0.0`.
    pub first_pass_test_rate: f64,
    /// Number of retries before success, stored as a float.
    pub retry_rate: f64,
    /// Seconds from assignment start to completion; `0` when the start is unknown.
    pub time_to_completion_secs: u64,
}
25
/// Per-backend averages produced by [`aggregate_completion_metrics_by_backend`].
///
/// Every metric field is the arithmetic mean of the same-named field over the
/// backend's [`CompletionQualityMetrics`] samples.
#[derive(Clone, Debug, PartialEq)]
pub struct BackendQualityStats {
    /// Backend these statistics describe.
    pub backend: String,
    /// Number of completions that were averaged.
    pub samples: u32,
    /// Mean narration ratio.
    pub narration_ratio: f64,
    /// Mean commit frequency.
    pub commit_frequency: f64,
    /// Mean first-pass test rate.
    pub first_pass_test_rate: f64,
    /// Mean retry rate.
    pub retry_rate: f64,
    /// Mean time to completion in seconds (kept fractional, unlike the inputs).
    pub time_to_completion_secs: f64,
}
36
/// SQL that averages the recorded quality metrics per backend.
///
/// Reads rows of the `events` table whose `event_type` is
/// `'quality_metrics_recorded'`, pulls each metric out of the JSON `payload`
/// column, and returns one row per backend with rounded averages plus a
/// `samples` count, ordered by backend.
///
/// NOTE(review): `json_extract` suggests a SQLite-style dialect — confirm
/// against the database this is run on.
pub const BACKEND_COMPARISON_SQL: &str = concat!(
    "SELECT ",
    "json_extract(payload, '$.backend') AS backend, ",
    "ROUND(AVG(json_extract(payload, '$.narration_ratio')), 4) AS narration_ratio, ",
    "ROUND(AVG(json_extract(payload, '$.commit_frequency')), 4) AS commit_frequency, ",
    "ROUND(AVG(json_extract(payload, '$.first_pass_test_rate')), 4) AS first_pass_test_rate, ",
    "ROUND(AVG(json_extract(payload, '$.retry_rate')), 4) AS retry_rate, ",
    "ROUND(AVG(json_extract(payload, '$.time_to_completion_secs')), 2) AS time_to_completion_secs, ",
    "COUNT(*) AS samples ",
    "FROM events ",
    "WHERE event_type='quality_metrics_recorded' ",
    "GROUP BY backend ",
    "ORDER BY backend;",
);
49
/// SQL that reports hourly first-pass and retry averages per backend.
///
/// A recursive CTE enumerates hour buckets (`timestamp / 3600`) from the
/// earliest to the latest `'quality_metrics_recorded'` event, falling back to
/// the current hour when there are none. The buckets are left-joined to the
/// events, so an hour without events still yields a row (with NULL backend
/// and metrics). Output is grouped and ordered by bucket start time, then
/// backend.
///
/// NOTE(review): dividing `timestamp` by 3600 assumes it holds epoch seconds,
/// and `strftime('%s', 'now')` suggests SQLite — confirm both.
pub const QUALITY_TRENDS_SQL: &str = concat!(
    "WITH RECURSIVE hours(h) AS ( ",
    "SELECT COALESCE(MIN(timestamp) / 3600, strftime('%s', 'now') / 3600) ",
    "FROM events WHERE event_type='quality_metrics_recorded' ",
    "UNION ALL ",
    "SELECT h + 1 FROM hours WHERE h < ",
    "(SELECT COALESCE(MAX(timestamp) / 3600, strftime('%s', 'now') / 3600) ",
    "FROM events WHERE event_type='quality_metrics_recorded') ",
    ") ",
    "SELECT ",
    "h * 3600 AS time, ",
    "json_extract(e.payload, '$.backend') AS backend, ",
    "ROUND(AVG(json_extract(e.payload, '$.first_pass_test_rate')), 4) AS first_pass_test_rate, ",
    "ROUND(AVG(json_extract(e.payload, '$.retry_rate')), 4) AS retry_rate ",
    "FROM hours ",
    "LEFT JOIN events e ",
    "ON e.event_type='quality_metrics_recorded' ",
    "AND e.timestamp / 3600 = hours.h ",
    "GROUP BY time, backend ",
    "ORDER BY time, backend;",
);
66
67pub fn calculate_narration_ratio(output: &str) -> f64 {
68    narration_ratio(output)
69}
70
71pub fn narration_ratio(output: &str) -> f64 {
72    let mut explanation_lines = 0_u32;
73    let mut code_or_tool_lines = 0_u32;
74
75    for line in output.lines() {
76        let trimmed = line.trim();
77        if trimmed.is_empty() {
78            continue;
79        }
80        if is_code_or_tool_line(trimmed) {
81            code_or_tool_lines += 1;
82        } else {
83            explanation_lines += 1;
84        }
85    }
86
87    ratio(explanation_lines, code_or_tool_lines)
88}
89
/// Returns the share of explanation lines among all counted lines.
///
/// The result is `explanation_lines / (explanation_lines + code_or_tool_lines)`
/// and therefore lies in `0.0..=1.0`. When both counts are zero there is
/// nothing to divide by and `0.0` is returned.
pub fn ratio(explanation_lines: u32, code_or_tool_lines: u32) -> f64 {
    // Widen before adding: the sum of two `u32` values can exceed `u32::MAX`,
    // which would panic in debug builds and silently wrap in release builds.
    let total = u64::from(explanation_lines) + u64::from(code_or_tool_lines);
    if total == 0 {
        return 0.0;
    }
    // `total` is at most 2^33, so the conversion to `f64` is exact.
    f64::from(explanation_lines) / total as f64
}
97
/// Converts a commit count and a duration into commits per hour.
///
/// Returns `0.0` when there were no commits or when the duration is zero
/// seconds, which also avoids dividing by zero.
pub fn commit_frequency(commits: u32, time_to_completion_secs: u64) -> f64 {
    match (commits, time_to_completion_secs) {
        (0, _) | (_, 0) => 0.0,
        (count, secs) => {
            let hours = secs as f64 / 3600.0;
            count as f64 / hours
        }
    }
}
104
105#[allow(clippy::too_many_arguments)]
106pub fn build_completion_quality_metrics(
107    backend: impl Into<String>,
108    role: impl Into<String>,
109    task_id: u32,
110    output: &str,
111    commits: u32,
112    retries_before_success: u32,
113    started_at: Option<u64>,
114    completed_at: u64,
115) -> CompletionQualityMetrics {
116    let time_to_completion_secs = started_at
117        .map(|started| completed_at.saturating_sub(started))
118        .unwrap_or(0);
119    CompletionQualityMetrics {
120        backend: backend.into(),
121        role: role.into(),
122        task_id: task_id.to_string(),
123        narration_ratio: narration_ratio(output),
124        commit_frequency: commit_frequency(commits, time_to_completion_secs),
125        first_pass_test_rate: if retries_before_success == 0 {
126            1.0
127        } else {
128            0.0
129        },
130        retry_rate: retries_before_success as f64,
131        time_to_completion_secs,
132    }
133}
134
135pub fn aggregate_by_backend(
136    metrics: &[(String, QualityMetrics)],
137) -> HashMap<String, QualityMetrics> {
138    let mut grouped: HashMap<String, Vec<&QualityMetrics>> = HashMap::new();
139    for (backend, quality) in metrics {
140        grouped.entry(backend.clone()).or_default().push(quality);
141    }
142
143    grouped
144        .into_iter()
145        .map(|(backend, samples)| {
146            let count = samples.len() as f64;
147            let avg = |f: fn(&QualityMetrics) -> f64| -> f64 {
148                if samples.is_empty() {
149                    return 0.0;
150                }
151                samples.iter().map(|sample| f(sample)).sum::<f64>() / count
152            };
153            let avg_u64 = |f: fn(&QualityMetrics) -> u64| -> u64 {
154                if samples.is_empty() {
155                    return 0;
156                }
157                (samples.iter().map(|sample| f(sample) as f64).sum::<f64>() / count).round() as u64
158            };
159
160            (
161                backend,
162                QualityMetrics {
163                    narration_ratio: avg(|sample| sample.narration_ratio),
164                    commit_frequency: avg(|sample| sample.commit_frequency),
165                    first_pass_test_rate: avg(|sample| sample.first_pass_test_rate),
166                    retry_rate: avg(|sample| sample.retry_rate),
167                    time_to_completion_secs: avg_u64(|sample| sample.time_to_completion_secs),
168                },
169            )
170        })
171        .collect()
172}
173
174pub fn aggregate_completion_metrics_by_backend(
175    metrics: &[CompletionQualityMetrics],
176) -> BTreeMap<String, BackendQualityStats> {
177    let mut grouped: BTreeMap<String, Vec<&CompletionQualityMetrics>> = BTreeMap::new();
178    for metric in metrics {
179        grouped
180            .entry(metric.backend.clone())
181            .or_default()
182            .push(metric);
183    }
184
185    grouped
186        .into_iter()
187        .map(|(backend, samples)| {
188            let count = samples.len() as u32;
189            let avg = |f: fn(&CompletionQualityMetrics) -> f64| -> f64 {
190                if samples.is_empty() {
191                    return 0.0;
192                }
193                samples.iter().map(|sample| f(sample)).sum::<f64>() / samples.len() as f64
194            };
195
196            (
197                backend.clone(),
198                BackendQualityStats {
199                    backend,
200                    samples: count,
201                    narration_ratio: avg(|sample| sample.narration_ratio),
202                    commit_frequency: avg(|sample| sample.commit_frequency),
203                    first_pass_test_rate: avg(|sample| sample.first_pass_test_rate),
204                    retry_rate: avg(|sample| sample.retry_rate),
205                    time_to_completion_secs: avg(|sample| sample.time_to_completion_secs as f64),
206                },
207            )
208        })
209        .collect()
210}
211
212pub fn assignment_started_at(events: &[TeamEvent], role: &str, task_id: u32) -> Option<u64> {
213    let task_id = task_id.to_string();
214    events
215        .iter()
216        .filter(|event| event.event == "task_assigned")
217        .filter(|event| event.role.as_deref() == Some(role))
218        .filter(|event| event.task.as_deref() == Some(task_id.as_str()))
219        .map(|event| event.ts)
220        .min()
221}
222
/// Heuristically decides whether a trimmed line is code or tool output.
///
/// A line counts as code/tool output when it starts with one of the known
/// markers (code fences, shell prompts, diff headers, Rust keywords, cargo and
/// test-runner output), contains a code-like fragment (`::`, a brace, `();`),
/// or looks like a call statement: it has both parentheses and ends in `;`.
/// Anything else is treated as explanation by the caller.
fn is_code_or_tool_line(line: &str) -> bool {
    const LEADING_MARKERS: &[&str] = &[
        "```",
        "$ ",
        "> ",
        "Command:",
        "Output:",
        "diff --git",
        "+++ ",
        "--- ",
        "@@",
        "{",
        "[",
        "fn ",
        "let ",
        "use ",
        "pub ",
        "test ",
        "running ",
        "error[",
        "warning:",
        "Compiling ",
        "Finished ",
    ];
    const CODE_FRAGMENTS: &[&str] = &["::", "{", "}", "();"];

    if LEADING_MARKERS.iter().any(|marker| line.starts_with(*marker)) {
        return true;
    }
    if CODE_FRAGMENTS.iter().any(|fragment| line.contains(*fragment)) {
        return true;
    }

    let has_parens = line.contains('(') && line.contains(')');
    has_parens && line.ends_with(';')
}
251
#[cfg(test)]
mod tests {
    use super::*;

    /// Absolute tolerance for comparing averaged floating-point metrics.
    const TOLERANCE: f64 = 0.0001;

    /// Asserts that `actual` is within `TOLERANCE` of `expected`.
    fn assert_close(actual: f64, expected: f64) {
        assert!(
            (actual - expected).abs() < TOLERANCE,
            "expected {}, got {}",
            expected,
            actual
        );
    }

    /// Builds a `(backend, sample)` pair that only sets the pass/retry fields.
    fn outcome(
        backend: &str,
        first_pass_test_rate: f64,
        retry_rate: f64,
    ) -> (String, QualityMetrics) {
        (
            backend.into(),
            QualityMetrics {
                first_pass_test_rate,
                retry_rate,
                ..QualityMetrics::default()
            },
        )
    }

    #[test]
    fn narration_ratio_is_zero_for_all_code_lines() {
        let output = [
            "```rust",
            "fn main() {",
            "    println!(\"hi\");",
            "}",
            "```",
            "$ cargo test",
        ]
        .join("\n");
        assert_eq!(calculate_narration_ratio(&output), 0.0);
    }

    #[test]
    fn narration_ratio_is_one_for_all_text_lines() {
        let output = [
            "I inspected the failure.",
            "Next I will patch the parser.",
            "Then I will rerun tests.",
        ]
        .join("\n");
        assert_eq!(calculate_narration_ratio(&output), 1.0);
    }

    #[test]
    fn narration_ratio_handles_mixed_text_and_code() {
        let output = [
            "I found the issue.",
            "fn main() {}",
            "I patched the bug.",
            "$ cargo test",
        ]
        .join("\n");
        assert_eq!(calculate_narration_ratio(&output), 0.5);
    }

    #[test]
    fn first_pass_rate_aggregation_handles_multiple_completions() {
        let samples = vec![
            outcome("codex", 1.0, 0.0),
            outcome("codex", 0.0, 2.0),
            outcome("codex", 1.0, 0.0),
        ];
        let stats = aggregate_by_backend(&samples);

        let codex = stats.get("codex").unwrap();
        assert_close(codex.first_pass_test_rate, 2.0 / 3.0);
        assert_close(codex.retry_rate, 2.0 / 3.0);
    }

    #[test]
    fn backend_grouping_produces_correct_per_backend_stats() {
        let samples = vec![
            (
                String::from("claude"),
                QualityMetrics {
                    narration_ratio: 0.5,
                    commit_frequency: 1.0,
                    first_pass_test_rate: 1.0,
                    retry_rate: 0.0,
                    time_to_completion_secs: 3600,
                },
            ),
            (
                String::from("codex"),
                QualityMetrics {
                    narration_ratio: 0.0,
                    commit_frequency: 4.0,
                    first_pass_test_rate: 0.0,
                    retry_rate: 1.0,
                    time_to_completion_secs: 1800,
                },
            ),
            (
                String::from("codex"),
                QualityMetrics {
                    narration_ratio: 0.5,
                    commit_frequency: 3.0,
                    first_pass_test_rate: 1.0,
                    retry_rate: 0.0,
                    time_to_completion_secs: 3600,
                },
            ),
        ];
        let stats = aggregate_by_backend(&samples);

        let claude = stats.get("claude").unwrap();
        let codex = stats.get("codex").unwrap();
        assert_close(claude.first_pass_test_rate, 1.0);
        assert_close(codex.first_pass_test_rate, 0.5);
        assert_eq!(codex.time_to_completion_secs, 2700);
    }

    #[test]
    fn empty_data_is_handled_gracefully() {
        assert!(aggregate_by_backend(&[]).is_empty());
        assert_eq!(assignment_started_at(&[], "eng-1", 42), None);
        assert_eq!(calculate_narration_ratio(""), 0.0);
        assert_eq!(commit_frequency(3, 0), 0.0);
    }
}
batty_cli/team/quality_metrics.rs

batty_cli/team/
quality_metrics.rs