Skip to main content

trace_share_core/
episode.rs

1use chrono::{DateTime, Utc};
2use serde::{Deserialize, Serialize};
3use std::collections::BTreeMap;
4
5use crate::{models::CanonicalEvent, sanitize::contains_sensitive_patterns};
6
/// Optional, machine-detected signals describing how an episode ended.
///
/// As populated by `build_episode` in this file, only `exit_code` is ever
/// filled in; `tests_passed` and `lint_fixed` are always `None` there.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EpisodeOutcomeSignals {
    /// Test outcome; `None` when no signal is available.
    pub tests_passed: Option<bool>,
    /// Exit code taken from event metadata (the last event carrying one).
    pub exit_code: Option<i32>,
    /// Lint outcome; `None` when no signal is available.
    pub lint_fixed: Option<bool>,
}
13
/// Overall outcome of an episode.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EpisodeOutcome {
    /// `build_episode` sets this to `true` when no event has kind `"error"`.
    pub success: bool,
    /// Finer-grained signals backing the outcome.
    pub signals: EpisodeOutcomeSignals,
}
19
/// Descriptive metadata carried by an episode and by every record derived
/// from it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EpisodeMeta {
    /// Language tag; `build_episode` always leaves this `None`.
    pub lang: Option<String>,
    /// Tool names in event order, one entry per event that has a tool
    /// (duplicates are kept).
    pub tool_names: Vec<String>,
    /// One entry per event of kind `"error"`.
    pub error_types: Vec<String>,
    /// Repository fingerprint; `build_episode` always leaves this `None`.
    pub repo_fingerprint: Option<String>,
    /// `build_episode` fills this from `std::env::consts::OS`, i.e. the
    /// platform the building process was compiled for.
    pub os: Option<String>,
    /// Editor name; `build_episode` always leaves this `None`.
    pub editor: Option<String>,
    /// `true` when the episode carries raw event text rather than summaries.
    pub raw_content_included: bool,
}
30
/// Consent information attached to an episode.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EpisodeConsent {
    /// Caller-supplied acceptance timestamp, stored as given (not parsed or
    /// validated in this file).
    pub accepted_at: String,
    /// Caller-supplied consent version, stored as given.
    pub consent_version: String,
    /// `build_episode` always sets this to `true`.
    pub public_searchable: bool,
    /// `build_episode` always sets this to `true`.
    pub trainable: bool,
}
38
/// One step of an episode trace.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EpisodeStep {
    /// Chat-style role; in this file it is either `role_from_kind` applied to
    /// the event kind or the literal `"system"` for synthetic summary steps.
    pub role: String,
    /// Raw event text, or a `summary…` line in summary mode.
    pub content: String,
    /// Tool name when the originating event has a tool attached.
    pub name: Option<String>,
    /// RFC 3339 timestamp of the originating event.
    pub ts: String,
}
46
/// A complete episode assembled from a window of canonical events.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EpisodeRecord {
    /// Hex blake3 hash of `"episode|{content_hash}"`.
    pub id: String,
    /// Caller-supplied name of the tool the events came from.
    pub source_tool: String,
    /// Caller-supplied session identifier.
    pub session_id: String,
    /// RFC 3339 timestamp of the first event in the window.
    pub ts_start: String,
    /// RFC 3339 timestamp of the last event in the window.
    pub ts_end: String,
    /// First user message (raw mode) or its summary (summary mode).
    pub prompt: String,
    /// Error/system text (raw mode) or an aggregate summary (summary mode).
    pub context: String,
    /// Per-event steps (raw mode) or compacted summary steps (summary mode).
    pub trace: Vec<EpisodeStep>,
    /// Last assistant message (raw mode) or its summary (summary mode).
    pub result: String,
    /// Success flag and outcome signals.
    pub outcome: EpisodeOutcome,
    /// Descriptive metadata.
    pub meta: EpisodeMeta,
    /// Consent attached to the episode.
    pub consent: EpisodeConsent,
    /// Caller-supplied license identifier.
    pub license: String,
    /// Caller-supplied policy version.
    pub policy_version: String,
    /// Caller-supplied sanitizer version.
    pub sanitizer_version: String,
    /// Hex blake3 hash of the JSON built from `source_tool`, `session_id`,
    /// `ts_start`, `ts_end`, `prompt`, `result` and `trace`. `context` is
    /// not part of the hashed value.
    pub content_hash: String,
}
66
/// Instruction/input/output view of an episode, produced by `derive_sft`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SftRecord {
    /// Same id as the source episode.
    pub id: String,
    /// The episode's `prompt`.
    pub instruction: String,
    /// The episode's `context`.
    pub input: String,
    /// The episode's `result`.
    pub output: String,
    /// Copy of the episode's metadata.
    pub meta: EpisodeMeta,
    /// Copy of the episode's license.
    pub license: String,
    /// Copy of the episode's consent.
    pub consent: EpisodeConsent,
}
77
/// One message of a tool trace: an `EpisodeStep` without its timestamp.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TooltraceMessage {
    /// Role copied from the trace step.
    pub role: String,
    /// Content copied from the trace step.
    pub content: String,
    /// Tool name copied from the trace step, if any.
    pub name: Option<String>,
}
84
/// Message-list view of an episode, produced by `derive_tooltrace`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TooltraceRecord {
    /// Same id as the source episode.
    pub id: String,
    /// One message per trace step, in trace order.
    pub messages: Vec<TooltraceMessage>,
    /// Copy of the episode's metadata.
    pub meta: EpisodeMeta,
    /// Copy of the episode's license.
    pub license: String,
    /// Copy of the episode's consent.
    pub consent: EpisodeConsent,
}
93
94#[allow(clippy::too_many_arguments)]
95pub fn build_episode(
96    source_tool: &str,
97    session_id: &str,
98    events: &[CanonicalEvent],
99    include_raw: bool,
100    accepted_at: &str,
101    consent_version: &str,
102    license: &str,
103    policy_version: &str,
104    sanitizer_version: &str,
105) -> Option<EpisodeRecord> {
106    if events.is_empty() {
107        return None;
108    }
109
110    let raw_prompt = events
111        .iter()
112        .find(|e| e.kind == "user_msg")
113        .map(|e| e.text.clone())
114        .unwrap_or_default();
115
116    let raw_result = events
117        .iter()
118        .rev()
119        .find(|e| e.kind == "assistant_msg" || e.kind == "response_item")
120        .map(|e| e.text.clone())
121        .unwrap_or_default();
122
123    let ts_start = events
124        .first()
125        .map(|e| e.ts.to_rfc3339())
126        .unwrap_or_default();
127    let ts_end = events.last().map(|e| e.ts.to_rfc3339()).unwrap_or_default();
128
129    let tool_names = events
130        .iter()
131        .filter_map(|e| e.tool.as_ref().map(|t| t.name.clone()))
132        .collect::<Vec<_>>();
133
134    let error_types = events
135        .iter()
136        .filter(|e| e.kind == "error")
137        .map(|e| e.kind.clone())
138        .collect::<Vec<_>>();
139
140    let success = !events.iter().any(|e| e.kind == "error");
141
142    let prompt = if include_raw {
143        raw_prompt
144    } else {
145        summarize_prompt(events)
146    };
147
148    let result = if include_raw {
149        raw_result
150    } else {
151        summarize_result(events)
152    };
153
154    let trace = if include_raw {
155        events
156            .iter()
157            .map(|e| EpisodeStep {
158                role: role_from_kind(&e.kind).to_string(),
159                content: e.text.clone(),
160                name: e.tool.as_ref().map(|t| t.name.clone()),
161                ts: e.ts.to_rfc3339(),
162            })
163            .collect::<Vec<_>>()
164    } else {
165        summarize_trace(events)
166    };
167
168    let context = if include_raw {
169        build_context(events)
170    } else {
171        summarize_context(events)
172    };
173
174    let canonical = serde_json::json!({
175        "source_tool": source_tool,
176        "session_id": session_id,
177        "ts_start": ts_start,
178        "ts_end": ts_end,
179        "prompt": prompt,
180        "result": result,
181        "trace": trace,
182    });
183    let canon = serde_json::to_string(&canonical).unwrap_or_default();
184    let content_hash = blake3::hash(canon.as_bytes()).to_hex().to_string();
185    let id = blake3::hash(format!("episode|{content_hash}").as_bytes())
186        .to_hex()
187        .to_string();
188
189    if contains_sensitive_patterns(&prompt)
190        || contains_sensitive_patterns(&context)
191        || contains_sensitive_patterns(&result)
192    {
193        return None;
194    }
195
196    Some(EpisodeRecord {
197        id,
198        source_tool: source_tool.to_string(),
199        session_id: session_id.to_string(),
200        ts_start,
201        ts_end,
202        prompt,
203        context,
204        trace,
205        result,
206        outcome: EpisodeOutcome {
207            success,
208            signals: EpisodeOutcomeSignals {
209                tests_passed: None,
210                exit_code: extract_exit_code(events),
211                lint_fixed: None,
212            },
213        },
214        meta: EpisodeMeta {
215            lang: None,
216            tool_names,
217            error_types,
218            repo_fingerprint: None,
219            os: std::env::consts::OS.parse().ok(),
220            editor: None,
221            raw_content_included: include_raw,
222        },
223        consent: EpisodeConsent {
224            accepted_at: accepted_at.to_string(),
225            consent_version: consent_version.to_string(),
226            public_searchable: true,
227            trainable: true,
228        },
229        license: license.to_string(),
230        policy_version: policy_version.to_string(),
231        sanitizer_version: sanitizer_version.to_string(),
232        content_hash,
233    })
234}
235
236#[allow(clippy::too_many_arguments)]
237pub fn build_episodes(
238    source_tool: &str,
239    session_id: &str,
240    events: &[CanonicalEvent],
241    include_raw: bool,
242    accepted_at: &str,
243    consent_version: &str,
244    license: &str,
245    policy_version: &str,
246    sanitizer_version: &str,
247) -> Vec<EpisodeRecord> {
248    if events.is_empty() {
249        return Vec::new();
250    }
251
252    let windows = split_event_windows(events, 300);
253    windows
254        .into_iter()
255        .filter_map(|w| {
256            build_episode(
257                source_tool,
258                session_id,
259                &w,
260                include_raw,
261                accepted_at,
262                consent_version,
263                license,
264                policy_version,
265                sanitizer_version,
266            )
267        })
268        .collect()
269}
270
271fn split_event_windows(events: &[CanonicalEvent], max_events: usize) -> Vec<Vec<CanonicalEvent>> {
272    let mut out = Vec::new();
273    let mut current = Vec::new();
274    let max_events = max_events.max(50);
275
276    for event in events {
277        if !current.is_empty() && (is_turn_boundary(event) || current.len() >= max_events) {
278            out.push(std::mem::take(&mut current));
279        }
280        current.push(event.clone());
281    }
282
283    if !current.is_empty() {
284        out.push(current);
285    }
286    out
287}
288
289fn is_turn_boundary(event: &CanonicalEvent) -> bool {
290    if event.kind == "turn_context" {
291        return true;
292    }
293    event.kind == "user_msg" && !event.text.trim().is_empty()
294}
295
296fn summarize_prompt(events: &[CanonicalEvent]) -> String {
297    if let Some(msg) = events
298        .iter()
299        .find(|e| e.kind == "user_msg" && !e.text.trim().is_empty())
300    {
301        let candidate = format!("summary_user_prompt: {}", preview(&msg.text, 220));
302        if !contains_sensitive_patterns(&candidate) {
303            return candidate;
304        }
305    }
306    let user_messages = events.iter().filter(|e| e.kind == "user_msg").count();
307    let meaningful_events = events
308        .iter()
309        .filter(|e| !is_low_signal_event(e) || !e.text.trim().is_empty())
310        .count();
311    format!(
312        "summary: user_messages={user_messages} meaningful_events={meaningful_events} (raw prompt omitted)"
313    )
314}
315
316fn summarize_result(events: &[CanonicalEvent]) -> String {
317    if let Some(msg) = events.iter().rev().find(|e| {
318        (e.kind == "assistant_msg" || e.kind == "response_item") && !e.text.trim().is_empty()
319    }) {
320        let candidate = format!("summary_assistant_result: {}", preview(&msg.text, 260));
321        if !contains_sensitive_patterns(&candidate) {
322            return candidate;
323        }
324    }
325    let assistant_messages = events
326        .iter()
327        .filter(|e| e.kind == "assistant_msg" || e.kind == "response_item")
328        .count();
329    let error_events = events.iter().filter(|e| e.kind == "error").count();
330    format!(
331        "summary: assistant_messages={assistant_messages} error_events={error_events} (raw result omitted)"
332    )
333}
334
335fn summarize_context(events: &[CanonicalEvent]) -> String {
336    let mut tool_names = events
337        .iter()
338        .filter_map(|e| e.tool.as_ref().map(|t| t.name.clone()))
339        .collect::<Vec<_>>();
340    tool_names.sort();
341    tool_names.dedup();
342    let exit_codes = events
343        .iter()
344        .filter_map(|e| e.meta.as_ref().and_then(|m| m.exit_code))
345        .collect::<Vec<_>>();
346    let mut by_kind: BTreeMap<String, usize> = BTreeMap::new();
347    for e in events {
348        *by_kind.entry(e.kind.clone()).or_insert(0) += 1;
349    }
350    format!(
351        "summary: tools={:?} exit_codes={:?} events_by_kind={:?} error_events={}",
352        tool_names,
353        exit_codes,
354        by_kind,
355        events.iter().filter(|e| e.kind == "error").count(),
356    )
357}
358
359fn summarize_trace(events: &[CanonicalEvent]) -> Vec<EpisodeStep> {
360    let mut out = Vec::new();
361    if let (Some(first), Some(last)) = (events.first(), events.last()) {
362        let duration_secs = (last.ts - first.ts).num_seconds();
363        out.push(EpisodeStep {
364            role: "system".to_string(),
365            content: format!(
366                "summary: total_events={} duration_secs={} source_kinds_compacted=true",
367                events.len(),
368                duration_secs.max(0)
369            ),
370            name: None,
371            ts: first.ts.to_rfc3339(),
372        });
373    }
374
375    let mut by_kind: BTreeMap<String, usize> = BTreeMap::new();
376    for e in events {
377        *by_kind.entry(e.kind.clone()).or_insert(0) += 1;
378    }
379    out.push(EpisodeStep {
380        role: "system".to_string(),
381        content: format!("summary: by_kind={:?}", by_kind),
382        name: None,
383        ts: events
384            .first()
385            .map(|e| e.ts.to_rfc3339())
386            .unwrap_or_default(),
387    });
388
389    let meaningful = events
390        .iter()
391        .filter(|e| !is_low_signal_event(e) || !e.text.trim().is_empty())
392        .take(10);
393
394    for e in meaningful {
395        let args_len = e
396            .tool
397            .as_ref()
398            .and_then(|t| t.args_json.as_ref())
399            .map(|s| s.len())
400            .unwrap_or(0);
401        let result_len = e
402            .tool
403            .as_ref()
404            .and_then(|t| t.result_json.as_ref())
405            .map(|s| s.len())
406            .unwrap_or(0);
407        let preview_text = preview(&e.text, 140);
408        let content = format!(
409            "summary_event: kind={} chars={} tool_args_bytes={} tool_result_bytes={} preview=\"{}\"",
410            e.kind,
411            e.text.len(),
412            args_len,
413            result_len,
414            preview_text
415        );
416        out.push(EpisodeStep {
417            role: role_from_kind(&e.kind).to_string(),
418            content,
419            name: e.tool.as_ref().map(|t| t.name.clone()),
420            ts: e.ts.to_rfc3339(),
421        });
422    }
423
424    out
425}
426
427fn is_low_signal_event(event: &CanonicalEvent) -> bool {
428    matches!(
429        event.kind.as_str(),
430        "event_msg" | "turn_context" | "session_meta"
431    )
432}
433
/// Collapses every run of whitespace in `input` to a single space (dropping
/// leading and trailing whitespace) and limits the result to `max_chars`
/// characters, appending `...` when anything was cut off.
fn preview(input: &str, max_chars: usize) -> String {
    let mut compact = String::with_capacity(input.len());
    for word in input.split_whitespace() {
        if !compact.is_empty() {
            compact.push(' ');
        }
        compact.push_str(word);
    }

    // Byte offset of the first character past the limit, if there is one.
    let cut = compact
        .char_indices()
        .nth(max_chars)
        .map(|(offset, _)| offset);
    if let Some(offset) = cut {
        compact.truncate(offset);
        compact.push_str("...");
    }
    compact
}
441
442pub fn derive_sft(episode: &EpisodeRecord) -> SftRecord {
443    SftRecord {
444        id: episode.id.clone(),
445        instruction: episode.prompt.clone(),
446        input: episode.context.clone(),
447        output: episode.result.clone(),
448        meta: episode.meta.clone(),
449        license: episode.license.clone(),
450        consent: episode.consent.clone(),
451    }
452}
453
454pub fn derive_tooltrace(episode: &EpisodeRecord) -> TooltraceRecord {
455    TooltraceRecord {
456        id: episode.id.clone(),
457        messages: episode
458            .trace
459            .iter()
460            .map(|s| TooltraceMessage {
461                role: s.role.clone(),
462                content: s.content.clone(),
463                name: s.name.clone(),
464            })
465            .collect(),
466        meta: episode.meta.clone(),
467        license: episode.license.clone(),
468        consent: episode.consent.clone(),
469    }
470}
471
/// Maps an event kind to a chat role: `user_msg` is `user`, `tool_result`
/// is `tool`, assistant messages and tool calls are `assistant`, and every
/// other kind is `system`.
fn role_from_kind(kind: &str) -> &str {
    if kind == "user_msg" {
        return "user";
    }
    if kind == "tool_result" {
        return "tool";
    }
    if matches!(kind, "assistant_msg" | "response_item" | "tool_call") {
        return "assistant";
    }
    "system"
}
481
482fn extract_exit_code(events: &[CanonicalEvent]) -> Option<i32> {
483    events
484        .iter()
485        .rev()
486        .find_map(|e| e.meta.as_ref().and_then(|m| m.exit_code))
487}
488
489fn build_context(events: &[CanonicalEvent]) -> String {
490    let mut parts = Vec::new();
491    let errors = events
492        .iter()
493        .filter(|e| e.kind == "error")
494        .map(|e| e.text.clone())
495        .collect::<Vec<_>>();
496    if !errors.is_empty() {
497        parts.push(format!("errors: {}", errors.join(" | ")));
498    }
499
500    let constraints = events
501        .iter()
502        .filter(|e| e.kind == "system")
503        .take(5)
504        .map(|e| e.text.clone())
505        .collect::<Vec<_>>();
506    if !constraints.is_empty() {
507        parts.push(format!("system: {}", constraints.join(" | ")));
508    }
509
510    parts.join("\n")
511}
512
513pub fn parse_ts(ts: &str) -> Option<DateTime<Utc>> {
514    DateTime::parse_from_rfc3339(ts)
515        .ok()
516        .map(|dt| dt.with_timezone(&Utc))
517}
518
#[cfg(test)]
mod tests {
    use chrono::Utc;

    use crate::models::CanonicalEvent;

    /// Builds an event of the given kind and text, stamped with the current
    /// time and carrying neither tool nor metadata.
    fn event(kind: &str, text: impl Into<String>) -> CanonicalEvent {
        CanonicalEvent {
            source: "x".to_string(),
            session_id: "s".to_string(),
            ts: Utc::now(),
            kind: kind.to_string(),
            text: text.into(),
            tool: None,
            meta: None,
        }
    }

    /// A lone, already-redacted user message must still yield an episode in
    /// summary mode.
    #[test]
    fn summary_mode_keeps_redacted_single_event() {
        let events = vec![event("user_msg", "token=[REDACTED] hello")];
        let ep = super::build_episode(
            "x",
            "s",
            &events,
            false,
            "2026-01-01T00:00:00Z",
            "v1",
            "CC0-1.0",
            "p1",
            "s1",
        );
        assert!(ep.is_some());
    }

    /// 620 events with a user message every 120 events must be split into
    /// more than one episode.
    #[test]
    fn splits_large_session_into_multiple_episodes() {
        let events: Vec<CanonicalEvent> = (0..620)
            .map(|i| {
                let kind = if i % 120 == 0 {
                    "user_msg"
                } else {
                    "response_item"
                };
                event(kind, format!("event-{i}"))
            })
            .collect();

        let eps = super::build_episodes("x", "s", &events, false, "a", "v1", "CC0-1.0", "p", "s");
        assert!(eps.len() >= 2);
    }
}