1use chrono::{DateTime, Utc};
2use serde::{Deserialize, Serialize};
3use std::collections::BTreeMap;
4
5use crate::{models::CanonicalEvent, sanitize::contains_sensitive_patterns};
6
/// Optional, machine-readable signals that qualify an [`EpisodeOutcome`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EpisodeOutcomeSignals {
    /// Whether tests passed; `build_episode` currently always stores `None`.
    pub tests_passed: Option<bool>,
    /// Exit code of the last event in the window that reports one (see `extract_exit_code`).
    pub exit_code: Option<i32>,
    /// Whether a lint issue was fixed; `build_episode` currently always stores `None`.
    pub lint_fixed: Option<bool>,
}
13
/// Overall outcome of an episode plus the finer-grained signals behind it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EpisodeOutcome {
    /// `true` when `build_episode` saw no event of kind `"error"` in the window.
    pub success: bool,
    /// Supporting signals (exit code, test and lint status).
    pub signals: EpisodeOutcomeSignals,
}
19
/// Descriptive metadata attached to an episode and copied into every derived record.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EpisodeMeta {
    /// Programming language of the episode; `build_episode` currently always stores `None`.
    pub lang: Option<String>,
    /// Names of the tools attached to the events, in event order (duplicates are kept).
    pub tool_names: Vec<String>,
    /// One entry per event of kind `"error"`; `build_episode` stores the event kind itself.
    pub error_types: Vec<String>,
    /// Repository fingerprint; `build_episode` currently always stores `None`.
    pub repo_fingerprint: Option<String>,
    /// Operating system; `build_episode` fills it from `std::env::consts::OS`.
    pub os: Option<String>,
    /// Editor in use; `build_episode` currently always stores `None`.
    pub editor: Option<String>,
    /// Mirrors the `include_raw` flag the episode was built with.
    pub raw_content_included: bool,
}
30
/// Consent information recorded with an episode and with every record derived from it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EpisodeConsent {
    /// Caller-supplied acceptance time, stored as given (no parsing or validation here).
    pub accepted_at: String,
    /// Caller-supplied identifier of the consent text version.
    pub consent_version: String,
    /// Public-search flag; `build_episode` always sets it to `true`.
    pub public_searchable: bool,
    /// Training-use flag; `build_episode` always sets it to `true`.
    pub trainable: bool,
}
38
/// One entry of an episode trace.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EpisodeStep {
    /// Chat-style role; `"user"`, `"assistant"`, `"tool"` or `"system"` (see `role_from_kind`).
    pub role: String,
    /// Raw event text, or a `summary…` line when the episode was built in summary mode.
    pub content: String,
    /// Name of the tool attached to the originating event, if any.
    pub name: Option<String>,
    /// Event timestamp formatted with `to_rfc3339`.
    pub ts: String,
}
46
/// A complete, exportable episode built from one window of session events.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EpisodeRecord {
    /// Hex BLAKE3 hash of `"episode|{content_hash}"`.
    pub id: String,
    /// Caller-supplied name of the tool the session came from.
    pub source_tool: String,
    /// Caller-supplied session identifier.
    pub session_id: String,
    /// Timestamp of the first event in the window (`to_rfc3339` output).
    pub ts_start: String,
    /// Timestamp of the last event in the window (`to_rfc3339` output).
    pub ts_end: String,
    /// Raw or summarized user prompt.
    pub prompt: String,
    /// Raw or summarized context (errors, system messages, tool statistics).
    pub context: String,
    /// Raw or summarized step-by-step trace.
    pub trace: Vec<EpisodeStep>,
    /// Raw or summarized final assistant output.
    pub result: String,
    /// Success flag and supporting signals.
    pub outcome: EpisodeOutcome,
    /// Descriptive metadata.
    pub meta: EpisodeMeta,
    /// Consent under which the episode is shared.
    pub consent: EpisodeConsent,
    /// Caller-supplied license identifier.
    pub license: String,
    /// Caller-supplied policy version.
    pub policy_version: String,
    /// Caller-supplied sanitizer version.
    pub sanitizer_version: String,
    /// Hex BLAKE3 hash of a JSON document holding `source_tool`, `session_id`, `ts_start`,
    /// `ts_end`, `prompt`, `result` and `trace` (`context` is not part of the hash).
    pub content_hash: String,
}
66
/// Instruction/input/output view of an episode, produced by `derive_sft`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SftRecord {
    /// Copied from the episode `id`.
    pub id: String,
    /// Copied from the episode `prompt`.
    pub instruction: String,
    /// Copied from the episode `context`.
    pub input: String,
    /// Copied from the episode `result`.
    pub output: String,
    /// Copied from the episode `meta`.
    pub meta: EpisodeMeta,
    /// Copied from the episode `license`.
    pub license: String,
    /// Copied from the episode `consent`.
    pub consent: EpisodeConsent,
}
77
/// One message of a [`TooltraceRecord`]; an [`EpisodeStep`] without its timestamp.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TooltraceMessage {
    /// Copied from the step `role`.
    pub role: String,
    /// Copied from the step `content`.
    pub content: String,
    /// Copied from the step `name` (tool name, if any).
    pub name: Option<String>,
}
84
/// Message-list view of an episode trace, produced by `derive_tooltrace`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TooltraceRecord {
    /// Copied from the episode `id`.
    pub id: String,
    /// One message per trace step, in trace order.
    pub messages: Vec<TooltraceMessage>,
    /// Copied from the episode `meta`.
    pub meta: EpisodeMeta,
    /// Copied from the episode `license`.
    pub license: String,
    /// Copied from the episode `consent`.
    pub consent: EpisodeConsent,
}
93
94#[allow(clippy::too_many_arguments)]
95pub fn build_episode(
96 source_tool: &str,
97 session_id: &str,
98 events: &[CanonicalEvent],
99 include_raw: bool,
100 accepted_at: &str,
101 consent_version: &str,
102 license: &str,
103 policy_version: &str,
104 sanitizer_version: &str,
105) -> Option<EpisodeRecord> {
106 if events.is_empty() {
107 return None;
108 }
109
110 let raw_prompt = events
111 .iter()
112 .find(|e| e.kind == "user_msg")
113 .map(|e| e.text.clone())
114 .unwrap_or_default();
115
116 let raw_result = events
117 .iter()
118 .rev()
119 .find(|e| e.kind == "assistant_msg" || e.kind == "response_item")
120 .map(|e| e.text.clone())
121 .unwrap_or_default();
122
123 let ts_start = events
124 .first()
125 .map(|e| e.ts.to_rfc3339())
126 .unwrap_or_default();
127 let ts_end = events.last().map(|e| e.ts.to_rfc3339()).unwrap_or_default();
128
129 let tool_names = events
130 .iter()
131 .filter_map(|e| e.tool.as_ref().map(|t| t.name.clone()))
132 .collect::<Vec<_>>();
133
134 let error_types = events
135 .iter()
136 .filter(|e| e.kind == "error")
137 .map(|e| e.kind.clone())
138 .collect::<Vec<_>>();
139
140 let success = !events.iter().any(|e| e.kind == "error");
141
142 let prompt = if include_raw {
143 raw_prompt
144 } else {
145 summarize_prompt(events)
146 };
147
148 let result = if include_raw {
149 raw_result
150 } else {
151 summarize_result(events)
152 };
153
154 let trace = if include_raw {
155 events
156 .iter()
157 .map(|e| EpisodeStep {
158 role: role_from_kind(&e.kind).to_string(),
159 content: e.text.clone(),
160 name: e.tool.as_ref().map(|t| t.name.clone()),
161 ts: e.ts.to_rfc3339(),
162 })
163 .collect::<Vec<_>>()
164 } else {
165 summarize_trace(events)
166 };
167
168 let context = if include_raw {
169 build_context(events)
170 } else {
171 summarize_context(events)
172 };
173
174 let canonical = serde_json::json!({
175 "source_tool": source_tool,
176 "session_id": session_id,
177 "ts_start": ts_start,
178 "ts_end": ts_end,
179 "prompt": prompt,
180 "result": result,
181 "trace": trace,
182 });
183 let canon = serde_json::to_string(&canonical).unwrap_or_default();
184 let content_hash = blake3::hash(canon.as_bytes()).to_hex().to_string();
185 let id = blake3::hash(format!("episode|{content_hash}").as_bytes())
186 .to_hex()
187 .to_string();
188
189 if contains_sensitive_patterns(&prompt)
190 || contains_sensitive_patterns(&context)
191 || contains_sensitive_patterns(&result)
192 {
193 return None;
194 }
195
196 Some(EpisodeRecord {
197 id,
198 source_tool: source_tool.to_string(),
199 session_id: session_id.to_string(),
200 ts_start,
201 ts_end,
202 prompt,
203 context,
204 trace,
205 result,
206 outcome: EpisodeOutcome {
207 success,
208 signals: EpisodeOutcomeSignals {
209 tests_passed: None,
210 exit_code: extract_exit_code(events),
211 lint_fixed: None,
212 },
213 },
214 meta: EpisodeMeta {
215 lang: None,
216 tool_names,
217 error_types,
218 repo_fingerprint: None,
219 os: std::env::consts::OS.parse().ok(),
220 editor: None,
221 raw_content_included: include_raw,
222 },
223 consent: EpisodeConsent {
224 accepted_at: accepted_at.to_string(),
225 consent_version: consent_version.to_string(),
226 public_searchable: true,
227 trainable: true,
228 },
229 license: license.to_string(),
230 policy_version: policy_version.to_string(),
231 sanitizer_version: sanitizer_version.to_string(),
232 content_hash,
233 })
234}
235
236#[allow(clippy::too_many_arguments)]
237pub fn build_episodes(
238 source_tool: &str,
239 session_id: &str,
240 events: &[CanonicalEvent],
241 include_raw: bool,
242 accepted_at: &str,
243 consent_version: &str,
244 license: &str,
245 policy_version: &str,
246 sanitizer_version: &str,
247) -> Vec<EpisodeRecord> {
248 if events.is_empty() {
249 return Vec::new();
250 }
251
252 let windows = split_event_windows(events, 300);
253 windows
254 .into_iter()
255 .filter_map(|w| {
256 build_episode(
257 source_tool,
258 session_id,
259 &w,
260 include_raw,
261 accepted_at,
262 consent_version,
263 license,
264 policy_version,
265 sanitizer_version,
266 )
267 })
268 .collect()
269}
270
271fn split_event_windows(events: &[CanonicalEvent], max_events: usize) -> Vec<Vec<CanonicalEvent>> {
272 let mut out = Vec::new();
273 let mut current = Vec::new();
274 let max_events = max_events.max(50);
275
276 for event in events {
277 if !current.is_empty() && (is_turn_boundary(event) || current.len() >= max_events) {
278 out.push(std::mem::take(&mut current));
279 }
280 current.push(event.clone());
281 }
282
283 if !current.is_empty() {
284 out.push(current);
285 }
286 out
287}
288
289fn is_turn_boundary(event: &CanonicalEvent) -> bool {
290 if event.kind == "turn_context" {
291 return true;
292 }
293 event.kind == "user_msg" && !event.text.trim().is_empty()
294}
295
296fn summarize_prompt(events: &[CanonicalEvent]) -> String {
297 if let Some(msg) = events
298 .iter()
299 .find(|e| e.kind == "user_msg" && !e.text.trim().is_empty())
300 {
301 let candidate = format!("summary_user_prompt: {}", preview(&msg.text, 220));
302 if !contains_sensitive_patterns(&candidate) {
303 return candidate;
304 }
305 }
306 let user_messages = events.iter().filter(|e| e.kind == "user_msg").count();
307 let meaningful_events = events
308 .iter()
309 .filter(|e| !is_low_signal_event(e) || !e.text.trim().is_empty())
310 .count();
311 format!(
312 "summary: user_messages={user_messages} meaningful_events={meaningful_events} (raw prompt omitted)"
313 )
314}
315
316fn summarize_result(events: &[CanonicalEvent]) -> String {
317 if let Some(msg) = events.iter().rev().find(|e| {
318 (e.kind == "assistant_msg" || e.kind == "response_item") && !e.text.trim().is_empty()
319 }) {
320 let candidate = format!("summary_assistant_result: {}", preview(&msg.text, 260));
321 if !contains_sensitive_patterns(&candidate) {
322 return candidate;
323 }
324 }
325 let assistant_messages = events
326 .iter()
327 .filter(|e| e.kind == "assistant_msg" || e.kind == "response_item")
328 .count();
329 let error_events = events.iter().filter(|e| e.kind == "error").count();
330 format!(
331 "summary: assistant_messages={assistant_messages} error_events={error_events} (raw result omitted)"
332 )
333}
334
335fn summarize_context(events: &[CanonicalEvent]) -> String {
336 let mut tool_names = events
337 .iter()
338 .filter_map(|e| e.tool.as_ref().map(|t| t.name.clone()))
339 .collect::<Vec<_>>();
340 tool_names.sort();
341 tool_names.dedup();
342 let exit_codes = events
343 .iter()
344 .filter_map(|e| e.meta.as_ref().and_then(|m| m.exit_code))
345 .collect::<Vec<_>>();
346 let mut by_kind: BTreeMap<String, usize> = BTreeMap::new();
347 for e in events {
348 *by_kind.entry(e.kind.clone()).or_insert(0) += 1;
349 }
350 format!(
351 "summary: tools={:?} exit_codes={:?} events_by_kind={:?} error_events={}",
352 tool_names,
353 exit_codes,
354 by_kind,
355 events.iter().filter(|e| e.kind == "error").count(),
356 )
357}
358
359fn summarize_trace(events: &[CanonicalEvent]) -> Vec<EpisodeStep> {
360 let mut out = Vec::new();
361 if let (Some(first), Some(last)) = (events.first(), events.last()) {
362 let duration_secs = (last.ts - first.ts).num_seconds();
363 out.push(EpisodeStep {
364 role: "system".to_string(),
365 content: format!(
366 "summary: total_events={} duration_secs={} source_kinds_compacted=true",
367 events.len(),
368 duration_secs.max(0)
369 ),
370 name: None,
371 ts: first.ts.to_rfc3339(),
372 });
373 }
374
375 let mut by_kind: BTreeMap<String, usize> = BTreeMap::new();
376 for e in events {
377 *by_kind.entry(e.kind.clone()).or_insert(0) += 1;
378 }
379 out.push(EpisodeStep {
380 role: "system".to_string(),
381 content: format!("summary: by_kind={:?}", by_kind),
382 name: None,
383 ts: events
384 .first()
385 .map(|e| e.ts.to_rfc3339())
386 .unwrap_or_default(),
387 });
388
389 let meaningful = events
390 .iter()
391 .filter(|e| !is_low_signal_event(e) || !e.text.trim().is_empty())
392 .take(10);
393
394 for e in meaningful {
395 let args_len = e
396 .tool
397 .as_ref()
398 .and_then(|t| t.args_json.as_ref())
399 .map(|s| s.len())
400 .unwrap_or(0);
401 let result_len = e
402 .tool
403 .as_ref()
404 .and_then(|t| t.result_json.as_ref())
405 .map(|s| s.len())
406 .unwrap_or(0);
407 let preview_text = preview(&e.text, 140);
408 let content = format!(
409 "summary_event: kind={} chars={} tool_args_bytes={} tool_result_bytes={} preview=\"{}\"",
410 e.kind,
411 e.text.len(),
412 args_len,
413 result_len,
414 preview_text
415 );
416 out.push(EpisodeStep {
417 role: role_from_kind(&e.kind).to_string(),
418 content,
419 name: e.tool.as_ref().map(|t| t.name.clone()),
420 ts: e.ts.to_rfc3339(),
421 });
422 }
423
424 out
425}
426
427fn is_low_signal_event(event: &CanonicalEvent) -> bool {
428 matches!(
429 event.kind.as_str(),
430 "event_msg" | "turn_context" | "session_meta"
431 )
432}
433
/// Collapses every run of whitespace in `input` to a single space (trimming both ends) and
/// limits the result to `max_chars` characters.
///
/// When the collapsed text is longer than `max_chars` characters it is cut there and `"..."`
/// is appended, so the returned string can be up to three characters longer than the limit.
fn preview(input: &str, max_chars: usize) -> String {
    let mut compact = String::with_capacity(input.len());
    for word in input.split_whitespace() {
        if !compact.is_empty() {
            compact.push(' ');
        }
        compact.push_str(word);
    }

    // A character at index `max_chars` exists only if the text exceeds the limit.
    match compact.char_indices().nth(max_chars) {
        None => compact,
        Some((cut, _)) => {
            compact.truncate(cut);
            compact.push_str("...");
            compact
        }
    }
}
441
442pub fn derive_sft(episode: &EpisodeRecord) -> SftRecord {
443 SftRecord {
444 id: episode.id.clone(),
445 instruction: episode.prompt.clone(),
446 input: episode.context.clone(),
447 output: episode.result.clone(),
448 meta: episode.meta.clone(),
449 license: episode.license.clone(),
450 consent: episode.consent.clone(),
451 }
452}
453
454pub fn derive_tooltrace(episode: &EpisodeRecord) -> TooltraceRecord {
455 TooltraceRecord {
456 id: episode.id.clone(),
457 messages: episode
458 .trace
459 .iter()
460 .map(|s| TooltraceMessage {
461 role: s.role.clone(),
462 content: s.content.clone(),
463 name: s.name.clone(),
464 })
465 .collect(),
466 meta: episode.meta.clone(),
467 license: episode.license.clone(),
468 consent: episode.consent.clone(),
469 }
470}
471
/// Maps an event kind to a chat-style role.
///
/// `user_msg` is `user`; `assistant_msg`, `response_item` and `tool_call` are `assistant`;
/// `tool_result` is `tool`; every other kind is `system`.
fn role_from_kind(kind: &str) -> &str {
    if kind == "user_msg" {
        "user"
    } else if matches!(kind, "assistant_msg" | "response_item" | "tool_call") {
        "assistant"
    } else if kind == "tool_result" {
        "tool"
    } else {
        "system"
    }
}
481
482fn extract_exit_code(events: &[CanonicalEvent]) -> Option<i32> {
483 events
484 .iter()
485 .rev()
486 .find_map(|e| e.meta.as_ref().and_then(|m| m.exit_code))
487}
488
489fn build_context(events: &[CanonicalEvent]) -> String {
490 let mut parts = Vec::new();
491 let errors = events
492 .iter()
493 .filter(|e| e.kind == "error")
494 .map(|e| e.text.clone())
495 .collect::<Vec<_>>();
496 if !errors.is_empty() {
497 parts.push(format!("errors: {}", errors.join(" | ")));
498 }
499
500 let constraints = events
501 .iter()
502 .filter(|e| e.kind == "system")
503 .take(5)
504 .map(|e| e.text.clone())
505 .collect::<Vec<_>>();
506 if !constraints.is_empty() {
507 parts.push(format!("system: {}", constraints.join(" | ")));
508 }
509
510 parts.join("\n")
511}
512
513pub fn parse_ts(ts: &str) -> Option<DateTime<Utc>> {
514 DateTime::parse_from_rfc3339(ts)
515 .ok()
516 .map(|dt| dt.with_timezone(&Utc))
517}
518
#[cfg(test)]
mod tests {
    use chrono::Utc;

    use crate::models::CanonicalEvent;

    /// Builds a tool-less, meta-less event from source "x" / session "s", stamped with the
    /// current time.
    fn make_event(kind: &str, text: String) -> CanonicalEvent {
        CanonicalEvent {
            source: "x".to_string(),
            session_id: "s".to_string(),
            ts: Utc::now(),
            kind: kind.to_string(),
            text,
            tool: None,
            meta: None,
        }
    }

    /// A single user message containing a redaction marker must still yield an episode in
    /// summary mode.
    #[test]
    fn summary_mode_keeps_redacted_single_event() {
        let events = vec![make_event(
            "user_msg",
            "token=[REDACTED] hello".to_string(),
        )];
        let ep = super::build_episode(
            "x",
            "s",
            &events,
            false,
            "2026-01-01T00:00:00Z",
            "v1",
            "CC0-1.0",
            "p1",
            "s1",
        );
        assert!(ep.is_some());
    }

    /// 620 events with a user message every 120 events must be split into several episodes.
    #[test]
    fn splits_large_session_into_multiple_episodes() {
        let events: Vec<CanonicalEvent> = (0..620)
            .map(|i| {
                let kind = if i % 120 == 0 {
                    "user_msg"
                } else {
                    "response_item"
                };
                make_event(kind, format!("event-{i}"))
            })
            .collect();

        let eps = super::build_episodes("x", "s", &events, false, "a", "v1", "CC0-1.0", "p", "s");
        assert!(eps.len() >= 2);
    }
}