1use std::collections::{BTreeMap, HashMap};
2
3use super::events::TeamEvent;
4
/// Quality measurements for one unit of work, or the per-backend average of several.
///
/// All fields are zero in the [`Default`] value.
#[derive(Clone, Debug, Default, PartialEq)]
pub struct QualityMetrics {
    /// Fraction of output lines counted as explanation rather than code or tool output.
    pub narration_ratio: f64,
    /// Commit rate; presumably commits per hour, matching `commit_frequency` in this module.
    pub commit_frequency: f64,
    /// Rate at which work passed tests on the first attempt.
    pub first_pass_test_rate: f64,
    /// Retry rate; presumably retries needed before success.
    pub retry_rate: f64,
    /// Time from start to completion, in seconds.
    pub time_to_completion_secs: u64,
}
13
/// Quality measurements for a single completed task, tagged with who produced it.
///
/// Instances are assembled by `build_completion_quality_metrics`.
#[derive(Clone, Debug, PartialEq)]
pub struct CompletionQualityMetrics {
    /// Name of the backend that did the work; used as the grouping key when aggregating.
    pub backend: String,
    /// Role the task was performed under.
    pub role: String,
    /// Task identifier, stored as text.
    pub task_id: String,
    /// Fraction of non-blank output lines counted as explanation.
    pub narration_ratio: f64,
    /// Commits per hour over the task's duration.
    pub commit_frequency: f64,
    /// `1.0` when the task needed no retries, `0.0` otherwise.
    pub first_pass_test_rate: f64,
    /// Number of retries before success, as a float.
    pub retry_rate: f64,
    /// Seconds between assignment and completion (zero when the start is unknown).
    pub time_to_completion_secs: u64,
}
25
/// Averaged quality metrics for one backend, together with the sample count.
///
/// Produced by `aggregate_completion_metrics_by_backend`; every metric field is the
/// arithmetic mean over that backend's completions.
#[derive(Clone, Debug, PartialEq)]
pub struct BackendQualityStats {
    /// Backend these statistics describe.
    pub backend: String,
    /// Number of completions that were averaged.
    pub samples: u32,
    /// Mean narration ratio.
    pub narration_ratio: f64,
    /// Mean commit frequency.
    pub commit_frequency: f64,
    /// Mean first-pass test rate.
    pub first_pass_test_rate: f64,
    /// Mean retry rate.
    pub retry_rate: f64,
    /// Mean time to completion in seconds (fractional, unlike the per-task value).
    pub time_to_completion_secs: f64,
}
36
/// SQL query that averages every recorded quality metric per backend.
///
/// Reads rows of the `events` table whose `event_type` is `quality_metrics_recorded`, extracts
/// each metric from the JSON `payload` column, and yields one row per backend (sorted by
/// backend) holding the rounded averages and the number of samples.
///
/// The query is a single line of text: fragments are joined with single spaces and contain no
/// newlines.
pub const BACKEND_COMPARISON_SQL: &str = concat!(
    "SELECT ",
    "json_extract(payload, '$.backend') AS backend, ",
    "ROUND(AVG(json_extract(payload, '$.narration_ratio')), 4) AS narration_ratio, ",
    "ROUND(AVG(json_extract(payload, '$.commit_frequency')), 4) AS commit_frequency, ",
    "ROUND(AVG(json_extract(payload, '$.first_pass_test_rate')), 4) AS first_pass_test_rate, ",
    "ROUND(AVG(json_extract(payload, '$.retry_rate')), 4) AS retry_rate, ",
    "ROUND(AVG(json_extract(payload, '$.time_to_completion_secs')), 2) AS time_to_completion_secs, ",
    "COUNT(*) AS samples ",
    "FROM events ",
    "WHERE event_type='quality_metrics_recorded' ",
    "GROUP BY backend ",
    "ORDER BY backend;",
);
49
/// SQL query that reports hourly first-pass and retry averages per backend.
///
/// A recursive CTE enumerates every hour bucket (`timestamp / 3600`) from the earliest to the
/// latest `quality_metrics_recorded` event, falling back to the current hour when no such
/// events exist. The buckets are LEFT JOINed to the events, so an hour without events still
/// produces a row (with NULL backend and averages). `time` is the bucket start, `h * 3600`.
///
/// NOTE(review): the bucketing assumes `timestamp` is in epoch seconds — confirm against the
/// `events` schema.
pub const QUALITY_TRENDS_SQL: &str = concat!(
    "WITH RECURSIVE hours(h) AS ( ",
    "SELECT COALESCE(MIN(timestamp) / 3600, strftime('%s', 'now') / 3600) FROM events WHERE event_type='quality_metrics_recorded' ",
    "UNION ALL ",
    "SELECT h + 1 FROM hours WHERE h < (SELECT COALESCE(MAX(timestamp) / 3600, strftime('%s', 'now') / 3600) FROM events WHERE event_type='quality_metrics_recorded') ",
    ") ",
    "SELECT ",
    "h * 3600 AS time, ",
    "json_extract(e.payload, '$.backend') AS backend, ",
    "ROUND(AVG(json_extract(e.payload, '$.first_pass_test_rate')), 4) AS first_pass_test_rate, ",
    "ROUND(AVG(json_extract(e.payload, '$.retry_rate')), 4) AS retry_rate ",
    "FROM hours ",
    "LEFT JOIN events e ",
    "ON e.event_type='quality_metrics_recorded' ",
    "AND e.timestamp / 3600 = hours.h ",
    "GROUP BY time, backend ",
    "ORDER BY time, backend;",
);
66
67pub fn calculate_narration_ratio(output: &str) -> f64 {
68 narration_ratio(output)
69}
70
71pub fn narration_ratio(output: &str) -> f64 {
72 let mut explanation_lines = 0_u32;
73 let mut code_or_tool_lines = 0_u32;
74
75 for line in output.lines() {
76 let trimmed = line.trim();
77 if trimmed.is_empty() {
78 continue;
79 }
80 if is_code_or_tool_line(trimmed) {
81 code_or_tool_lines += 1;
82 } else {
83 explanation_lines += 1;
84 }
85 }
86
87 ratio(explanation_lines, code_or_tool_lines)
88}
89
/// Returns `explanation_lines` as a fraction of all counted lines.
///
/// The result lies in `0.0..=1.0`. When both counts are zero there is nothing to divide by and
/// `0.0` is returned.
pub fn ratio(explanation_lines: u32, code_or_tool_lines: u32) -> f64 {
    // Widen before adding: a `u32` sum overflows once the two counts together exceed `u32::MAX`
    // (a panic in debug builds, a silently wrapped and therefore wrong total in release builds).
    let total = u64::from(explanation_lines) + u64::from(code_or_tool_lines);
    if total == 0 {
        return 0.0;
    }
    explanation_lines as f64 / total as f64
}
97
/// Returns the number of commits per hour over a task's duration.
///
/// Yields `0.0` when there were no commits or when the duration is zero (which also avoids a
/// division by zero).
pub fn commit_frequency(commits: u32, time_to_completion_secs: u64) -> f64 {
    const SECS_PER_HOUR: f64 = 3600.0;

    match (commits, time_to_completion_secs) {
        (0, _) | (_, 0) => 0.0,
        (count, secs) => {
            let hours = secs as f64 / SECS_PER_HOUR;
            count as f64 / hours
        }
    }
}
104
105#[allow(clippy::too_many_arguments)]
106pub fn build_completion_quality_metrics(
107 backend: impl Into<String>,
108 role: impl Into<String>,
109 task_id: u32,
110 output: &str,
111 commits: u32,
112 retries_before_success: u32,
113 started_at: Option<u64>,
114 completed_at: u64,
115) -> CompletionQualityMetrics {
116 let time_to_completion_secs = started_at
117 .map(|started| completed_at.saturating_sub(started))
118 .unwrap_or(0);
119 CompletionQualityMetrics {
120 backend: backend.into(),
121 role: role.into(),
122 task_id: task_id.to_string(),
123 narration_ratio: narration_ratio(output),
124 commit_frequency: commit_frequency(commits, time_to_completion_secs),
125 first_pass_test_rate: if retries_before_success == 0 {
126 1.0
127 } else {
128 0.0
129 },
130 retry_rate: retries_before_success as f64,
131 time_to_completion_secs,
132 }
133}
134
135pub fn aggregate_by_backend(
136 metrics: &[(String, QualityMetrics)],
137) -> HashMap<String, QualityMetrics> {
138 let mut grouped: HashMap<String, Vec<&QualityMetrics>> = HashMap::new();
139 for (backend, quality) in metrics {
140 grouped.entry(backend.clone()).or_default().push(quality);
141 }
142
143 grouped
144 .into_iter()
145 .map(|(backend, samples)| {
146 let count = samples.len() as f64;
147 let avg = |f: fn(&QualityMetrics) -> f64| -> f64 {
148 if samples.is_empty() {
149 return 0.0;
150 }
151 samples.iter().map(|sample| f(sample)).sum::<f64>() / count
152 };
153 let avg_u64 = |f: fn(&QualityMetrics) -> u64| -> u64 {
154 if samples.is_empty() {
155 return 0;
156 }
157 (samples.iter().map(|sample| f(sample) as f64).sum::<f64>() / count).round() as u64
158 };
159
160 (
161 backend,
162 QualityMetrics {
163 narration_ratio: avg(|sample| sample.narration_ratio),
164 commit_frequency: avg(|sample| sample.commit_frequency),
165 first_pass_test_rate: avg(|sample| sample.first_pass_test_rate),
166 retry_rate: avg(|sample| sample.retry_rate),
167 time_to_completion_secs: avg_u64(|sample| sample.time_to_completion_secs),
168 },
169 )
170 })
171 .collect()
172}
173
174pub fn aggregate_completion_metrics_by_backend(
175 metrics: &[CompletionQualityMetrics],
176) -> BTreeMap<String, BackendQualityStats> {
177 let mut grouped: BTreeMap<String, Vec<&CompletionQualityMetrics>> = BTreeMap::new();
178 for metric in metrics {
179 grouped
180 .entry(metric.backend.clone())
181 .or_default()
182 .push(metric);
183 }
184
185 grouped
186 .into_iter()
187 .map(|(backend, samples)| {
188 let count = samples.len() as u32;
189 let avg = |f: fn(&CompletionQualityMetrics) -> f64| -> f64 {
190 if samples.is_empty() {
191 return 0.0;
192 }
193 samples.iter().map(|sample| f(sample)).sum::<f64>() / samples.len() as f64
194 };
195
196 (
197 backend.clone(),
198 BackendQualityStats {
199 backend,
200 samples: count,
201 narration_ratio: avg(|sample| sample.narration_ratio),
202 commit_frequency: avg(|sample| sample.commit_frequency),
203 first_pass_test_rate: avg(|sample| sample.first_pass_test_rate),
204 retry_rate: avg(|sample| sample.retry_rate),
205 time_to_completion_secs: avg(|sample| sample.time_to_completion_secs as f64),
206 },
207 )
208 })
209 .collect()
210}
211
212pub fn assignment_started_at(events: &[TeamEvent], role: &str, task_id: u32) -> Option<u64> {
213 let task_id = task_id.to_string();
214 events
215 .iter()
216 .filter(|event| event.event == "task_assigned")
217 .filter(|event| event.role.as_deref() == Some(role))
218 .filter(|event| event.task.as_deref() == Some(task_id.as_str()))
219 .map(|event| event.ts)
220 .min()
221}
222
/// Heuristically decides whether a (trimmed) line is code or tool output rather than prose.
///
/// A line qualifies when it begins with one of a fixed set of markers (code fences, shell
/// prompts, diff headers, common Rust keywords, cargo/test-runner output), when it contains a
/// code-like fragment (`::`, a brace, or `();`), or when it has both parentheses and ends with a
/// semicolon.
fn is_code_or_tool_line(line: &str) -> bool {
    // Markers recognised at the very start of the line.
    const LEADING_MARKERS: &[&str] = &[
        "```",
        "$ ",
        "> ",
        "Command:",
        "Output:",
        "diff --git",
        "+++ ",
        "--- ",
        "@@",
        "{",
        "[",
        "fn ",
        "let ",
        "use ",
        "pub ",
        "test ",
        "running ",
        "error[",
        "warning:",
        "Compiling ",
        "Finished ",
    ];
    // Fragments recognised anywhere in the line.
    const CODE_FRAGMENTS: &[&str] = &["::", "{", "}", "();"];

    if LEADING_MARKERS.iter().any(|&marker| line.starts_with(marker)) {
        return true;
    }
    if CODE_FRAGMENTS.iter().any(|&fragment| line.contains(fragment)) {
        return true;
    }
    // Something shaped like a call statement: `name(args);`.
    line.contains('(') && line.contains(')') && line.ends_with(';')
}
251
#[cfg(test)]
mod tests {
    use super::*;

    /// Tolerance for comparing averaged floating-point metrics.
    const EPSILON: f64 = 0.0001;

    /// Asserts that `actual` is within [`EPSILON`] of `expected`.
    fn assert_close(actual: f64, expected: f64) {
        assert!((actual - expected).abs() < EPSILON);
    }

    /// Builds a `(backend, metrics)` pair in the shape `aggregate_by_backend` consumes.
    fn sample(backend: &str, metrics: QualityMetrics) -> (String, QualityMetrics) {
        (backend.into(), metrics)
    }

    /// Metrics carrying only a first-pass rate and a retry rate; everything else is default.
    fn outcome(first_pass_test_rate: f64, retry_rate: f64) -> QualityMetrics {
        QualityMetrics {
            first_pass_test_rate,
            retry_rate,
            ..QualityMetrics::default()
        }
    }

    #[test]
    fn narration_ratio_is_zero_for_all_code_lines() {
        let output = "```rust\nfn main() {\n println!(\"hi\");\n}\n```\n$ cargo test";
        assert_eq!(calculate_narration_ratio(output), 0.0);
    }

    #[test]
    fn narration_ratio_is_one_for_all_text_lines() {
        let output =
            "I inspected the failure.\nNext I will patch the parser.\nThen I will rerun tests.";
        assert_eq!(calculate_narration_ratio(output), 1.0);
    }

    #[test]
    fn narration_ratio_handles_mixed_text_and_code() {
        let output = "I found the issue.\nfn main() {}\nI patched the bug.\n$ cargo test";
        assert_eq!(calculate_narration_ratio(output), 0.5);
    }

    #[test]
    fn first_pass_rate_aggregation_handles_multiple_completions() {
        let samples = [
            sample("codex", outcome(1.0, 0.0)),
            sample("codex", outcome(0.0, 2.0)),
            sample("codex", outcome(1.0, 0.0)),
        ];
        let stats = aggregate_by_backend(&samples);

        let codex = stats.get("codex").unwrap();
        assert_close(codex.first_pass_test_rate, 2.0 / 3.0);
        assert_close(codex.retry_rate, 2.0 / 3.0);
    }

    #[test]
    fn backend_grouping_produces_correct_per_backend_stats() {
        let samples = [
            sample(
                "claude",
                QualityMetrics {
                    narration_ratio: 0.5,
                    commit_frequency: 1.0,
                    first_pass_test_rate: 1.0,
                    retry_rate: 0.0,
                    time_to_completion_secs: 3600,
                },
            ),
            sample(
                "codex",
                QualityMetrics {
                    narration_ratio: 0.0,
                    commit_frequency: 4.0,
                    first_pass_test_rate: 0.0,
                    retry_rate: 1.0,
                    time_to_completion_secs: 1800,
                },
            ),
            sample(
                "codex",
                QualityMetrics {
                    narration_ratio: 0.5,
                    commit_frequency: 3.0,
                    first_pass_test_rate: 1.0,
                    retry_rate: 0.0,
                    time_to_completion_secs: 3600,
                },
            ),
        ];
        let stats = aggregate_by_backend(&samples);

        let claude = stats.get("claude").unwrap();
        let codex = stats.get("codex").unwrap();
        assert_close(claude.first_pass_test_rate, 1.0);
        assert_close(codex.first_pass_test_rate, 0.5);
        assert_eq!(codex.time_to_completion_secs, 2700);
    }

    #[test]
    fn empty_data_is_handled_gracefully() {
        assert!(aggregate_by_backend(&[]).is_empty());
        assert_eq!(assignment_started_at(&[], "eng-1", 42), None);
        assert_eq!(calculate_narration_ratio(""), 0.0);
        assert_eq!(commit_frequency(3, 0), 0.0);
    }
}