Skip to main content

agx_core/
format.rs

1use anyhow::{Context, Result, anyhow};
2use std::fmt;
3use std::path::Path;
4
/// A recognized session format.
///
/// `#[non_exhaustive]` signals that new variants will land as new
/// parsers ship (e.g. LlamaIndex, Pydantic AI). External
/// pattern matches must include a wildcard arm; internal matches
/// stay exhaustive. See `docs/stability.md` for the full
/// versioning policy.
///
/// Variants serialize through serde under their `snake_case` names
/// (per the `rename_all` attribute below). The human-readable label for
/// each variant comes from the `Display` impl.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum Format {
    /// Claude Code JSONL session. Also what `detect` returns for JSONL
    /// whose first entry carries a `type` it does not recognize as Codex.
    ClaudeCode,
    /// Codex CLI JSONL session: the first entry's `type` is one of
    /// `session_meta`, `event_msg`, `response_item`, or `turn_context`.
    Codex,
    /// Gemini CLI session: a single JSON object with `sessionId` and
    /// `messages`.
    Gemini,
    /// Generic conversation: a single JSON object with a top-level
    /// `messages` key and none of the more specific markers.
    Generic,
    /// LangChain / LangSmith export: a single JSON object with `run_type`
    /// plus `inputs` or `outputs`.
    Langchain,
    /// OpenTelemetry GenAI trace as JSON: a single JSON object with a
    /// top-level `resourceSpans` key.
    OtelJson,
    /// OpenTelemetry GenAI trace as binary protobuf. `detect` returns this
    /// for any file whose bytes are not valid UTF-8.
    OtelProto,
    /// Vercel AI SDK saved trace, recognized by `finishReason`,
    /// `steps[].stepType`, or camelCase `toolCalls[]` fields.
    VercelAi,
}
25
26impl fmt::Display for Format {
27    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
28        let s = match self {
29            Format::ClaudeCode => "Claude Code",
30            Format::Codex => "Codex CLI",
31            Format::Gemini => "Gemini CLI",
32            Format::Generic => "Generic conversation",
33            Format::Langchain => "LangChain / LangSmith",
34            Format::OtelJson => "OpenTelemetry GenAI (JSON)",
35            Format::OtelProto => "OpenTelemetry GenAI (protobuf)",
36            Format::VercelAi => "Vercel AI SDK",
37        };
38        f.write_str(s)
39    }
40}
41
42/// Detect the format of a session file by inspecting its content shape.
43/// Content-based only — no file-extension sniffing, because agx tools in
44/// the wild all use vanilla `.json` / `.jsonl` extensions regardless of
45/// which agent CLI produced them.
46///
47/// Probe order (most specific first, so ambiguous shapes land on the
48/// right parser):
49///
50/// - Non-UTF-8 bytes → [`Format::OtelProto`] (binary OTLP)
51/// - Single JSON with `resourceSpans` → [`Format::OtelJson`]
52/// - Single JSON with `run_type` + `inputs`/`outputs` → [`Format::Langchain`]
53/// - Single JSON with `finishReason` / `steps[].stepType` / camelCase `toolCallId` → [`Format::VercelAi`]
54/// - Single JSON with `sessionId` + `messages` → [`Format::Gemini`]
55/// - Single JSON with bare `messages` → [`Format::Generic`]
56/// - JSONL first-line `type` in `session_meta` / `event_msg` / `response_item` / `turn_context` → [`Format::Codex`]
57/// - Anything else → [`Format::ClaudeCode`]
58///
59/// The ordering matters. For example, a Vercel AI SDK save has
60/// `messages` at the top level (which would otherwise match Generic),
61/// so the Vercel-specific markers (`finishReason` / `stepType` /
62/// camelCase `toolCallId`) are checked first. Same story for LangChain
63/// exports that happen to include a `messages` field under `inputs`.
64pub fn detect(path: &Path) -> Result<Format> {
65    // Read bytes first so we can distinguish text formats from binary OTLP.
66    // `read_to_string` used to be enough when all supported formats were
67    // UTF-8 JSON/JSONL, but Phase 2.2 (binary OTLP) requires us to
68    // gracefully route non-UTF-8 content to the protobuf parser.
69    let bytes =
70        std::fs::read(path).with_context(|| format!("reading session file: {}", path.display()))?;
71    if bytes.is_empty() {
72        return Err(anyhow!("session file is empty"));
73    }
74
75    let Ok(content) = std::str::from_utf8(&bytes) else {
76        // Not UTF-8 → must be binary. Only binary format agx handles today
77        // is OTLP protobuf. We route to OtelProto regardless of whether the
78        // `otel-proto` feature is enabled at build time; the load dispatch
79        // in main.rs produces a helpful rebuild-with-feature error when the
80        // feature is off, which is a better failure mode than silently
81        // mis-claiming "not json".
82        return Ok(Format::OtelProto);
83    };
84
85    // Single JSON object: OTel GenAI (resourceSpans), LangSmith/LangChain
86    // export (run_type at top level), Vercel AI SDK (finishReason or
87    // steps[].stepType), Gemini (sessionId + messages), or Generic
88    // (messages with role). Vercel is checked before Generic because its
89    // outer shape (`messages[]` with role=user) would otherwise match
90    // Generic — the Vercel-specific markers (`finishReason` / `stepType`)
91    // disambiguate.
92    if content.trim_start().starts_with('{')
93        && let Ok(v) = serde_json::from_str::<serde_json::Value>(content)
94    {
95        if v.get("resourceSpans").is_some() {
96            return Ok(Format::OtelJson);
97        }
98        if v.get("run_type").is_some() && (v.get("inputs").is_some() || v.get("outputs").is_some())
99        {
100            return Ok(Format::Langchain);
101        }
102        if is_vercel_ai(&v) {
103            return Ok(Format::VercelAi);
104        }
105        if v.get("sessionId").is_some() && v.get("messages").is_some() {
106            return Ok(Format::Gemini);
107        }
108        if v.get("messages").is_some() {
109            return Ok(Format::Generic);
110        }
111    }
112
113    // JSONL: inspect the first non-empty line's `type` field
114    let first = content
115        .lines()
116        .find(|l| !l.trim().is_empty())
117        .ok_or_else(|| anyhow!("session file is empty"))?;
118    let entry: serde_json::Value = serde_json::from_str(first)
119        .with_context(|| "could not parse first line of session file as JSON")?;
120    let ty = entry
121        .get("type")
122        .and_then(|t| t.as_str())
123        .ok_or_else(|| anyhow!("first entry has no `type` field"))?;
124    match ty {
125        "session_meta" | "event_msg" | "response_item" | "turn_context" => Ok(Format::Codex),
126        _ => Ok(Format::ClaudeCode),
127    }
128}
129
130/// Heuristics for Vercel AI SDK `generateText` / `streamText` saved
131/// traces. Any of these is sufficient — they're all specific enough to the
132/// SDK that false positives are rare.
133fn is_vercel_ai(v: &serde_json::Value) -> bool {
134    // `finishReason` at the top level is a definitive SDK marker.
135    if v.get("finishReason").is_some() {
136        return true;
137    }
138    // `steps: [{stepType: ...}]` is the multi-step result shape.
139    if let Some(steps) = v.get("steps").and_then(|s| s.as_array())
140        && steps.iter().any(|s| s.get("stepType").is_some())
141    {
142        return true;
143    }
144    // CamelCase toolCall fields — distinguishes from generic OpenAI
145    // (`tool_calls[0].id`, `.function.name`) which uses snake_case.
146    if let Some(calls) = v.get("toolCalls").and_then(|c| c.as_array())
147        && calls
148            .iter()
149            .any(|c| c.get("toolCallId").is_some() && c.get("toolName").is_some())
150    {
151        return true;
152    }
153    false
154}
155
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Write `content` to a fresh temp file. The file exists until the
    /// returned handle is dropped.
    fn write_file(content: &str) -> NamedTempFile {
        let mut f = NamedTempFile::new().unwrap();
        f.write_all(content.as_bytes()).unwrap();
        f
    }

    /// Assert that a file holding `content` is detected as `expected`.
    /// `#[track_caller]` keeps failure locations pointing at the test.
    #[track_caller]
    fn assert_detects(content: &str, expected: Format) {
        let f = write_file(content);
        assert_eq!(detect(f.path()).unwrap(), expected);
    }

    /// Assert that detection fails for a file holding `content`.
    #[track_caller]
    fn assert_rejected(content: &str) {
        let f = write_file(content);
        assert!(detect(f.path()).is_err());
    }

    #[test]
    fn detects_claude_code_by_first_line_type() {
        assert_detects(
            r#"{"type":"user","uuid":"u1","parentUuid":null,"timestamp":"2024-01-01T00:00:00Z","message":{"role":"user","content":"hi"}}"#,
            Format::ClaudeCode,
        );
    }

    #[test]
    fn detects_codex_by_session_meta_first_line() {
        assert_detects(
            r#"{"timestamp":"2024-01-01T00:00:00Z","type":"session_meta","payload":{"id":"s1","cwd":"/tmp","originator":"codex-tui"}}"#,
            Format::Codex,
        );
    }

    #[test]
    fn detects_codex_by_response_item_first_line() {
        assert_detects(
            r#"{"timestamp":"2024-01-01T00:00:00Z","type":"response_item","payload":{"type":"message","role":"user","content":[{"type":"input_text","text":"hi"}]}}"#,
            Format::Codex,
        );
    }

    #[test]
    fn detects_codex_by_event_msg_and_turn_context_first_line() {
        // The remaining two Codex entry types accepted by `detect`.
        assert_detects(
            r#"{"timestamp":"2024-01-01T00:00:00Z","type":"event_msg","payload":{}}"#,
            Format::Codex,
        );
        assert_detects(
            r#"{"timestamp":"2024-01-01T00:00:00Z","type":"turn_context","payload":{}}"#,
            Format::Codex,
        );
    }

    #[test]
    fn multi_line_jsonl_uses_first_non_blank_line() {
        // Real JSONL has many lines: the whole-file JSON parse must fail
        // and fall through, and leading blank lines must be skipped.
        let entries = [
            r#"{"timestamp":"2024-01-01T00:00:00Z","type":"session_meta","payload":{}}"#,
            r#"{"type":"user","message":{"role":"user","content":"hi"}}"#,
        ];
        assert_detects(&format!("\n\n{}\n", entries.join("\n")), Format::Codex);
    }

    #[test]
    fn detects_otel_json_by_resource_spans_key() {
        // Minimal OTLP-JSON: any top-level object with `resourceSpans` is
        // unambiguously OTel, independent of what's inside.
        assert_detects(r#"{"resourceSpans":[]}"#, Format::OtelJson);
    }

    #[test]
    fn detects_pretty_printed_single_json() {
        // Multi-line, indented documents take the whole-file JSON path.
        assert_detects("\n{\n  \"resourceSpans\": []\n}\n", Format::OtelJson);
    }

    #[test]
    fn detects_generic_by_bare_messages_only() {
        // Pure OpenAI-compatible conversation: `messages` but none of the
        // format-specific markers that Vercel / Gemini / LangChain need.
        assert_detects(
            r#"{"messages":[{"role":"user","content":"hi"},{"role":"assistant","content":"hello"}]}"#,
            Format::Generic,
        );
    }

    #[test]
    fn langchain_requires_inputs_or_outputs_alongside_run_type() {
        // A single `run_type` field without inputs/outputs is probably a
        // partial or unrelated object — fall through rather than misroute
        // to LangChain. Adding a bare `messages` so something catches.
        assert_detects(
            r#"{"run_type":"chain","messages":[{"role":"user","content":"hi"}]}"#,
            Format::Generic,
        );
    }

    #[test]
    fn detects_gemini_by_single_json_object_with_sessionid() {
        assert_detects(
            r#"{"sessionId":"s1","projectHash":"abc","startTime":"2024-01-01T00:00:00Z","lastUpdated":"2024-01-01T00:00:01Z","messages":[],"kind":"main"}"#,
            Format::Gemini,
        );
    }

    #[test]
    fn empty_file_errors() {
        assert_rejected("");
    }

    #[test]
    fn whitespace_only_file_errors() {
        // Non-zero length but no non-blank line to probe.
        assert_rejected(" \n\t\n");
    }

    #[test]
    fn invalid_first_line_errors() {
        assert_rejected("not json\n");
    }

    #[test]
    fn first_entry_without_string_type_errors() {
        // Valid JSON that matches no single-document marker and has no
        // usable `type` is rejected rather than guessed at.
        assert_rejected(r#"{"uuid":"u1"}"#);
        assert_rejected(r#"{"type":42}"#);
    }

    #[test]
    fn unreadable_path_error_names_the_file() {
        let path = {
            let f = NamedTempFile::new().unwrap();
            f.path().to_path_buf()
        }; // Handle dropped here, so the file no longer exists.
        let err = detect(&path).unwrap_err();
        assert!(err.to_string().starts_with("reading session file: "));
    }

    #[test]
    fn detects_langchain_by_run_type_top_level_key() {
        assert_detects(
            r#"{"id":"r1","name":"chain","run_type":"chain","start_time":"2024-01-01T00:00:00Z","inputs":{"input":"hi"},"outputs":{"output":"hello"},"child_runs":[]}"#,
            Format::Langchain,
        );
    }

    #[test]
    fn detects_vercel_ai_by_finish_reason_top_level() {
        assert_detects(
            r#"{"text":"ok","finishReason":"stop","usage":{"promptTokens":1,"completionTokens":1},"messages":[{"role":"user","content":"q"}]}"#,
            Format::VercelAi,
        );
    }

    #[test]
    fn detects_vercel_ai_by_step_type() {
        assert_detects(
            r#"{"steps":[{"stepType":"initial","text":"hi"}],"messages":[{"role":"user","content":"q"}]}"#,
            Format::VercelAi,
        );
    }

    #[test]
    fn detects_vercel_ai_by_camelcase_tool_call_fields() {
        assert_detects(
            r#"{"toolCalls":[{"toolCallId":"c1","toolName":"x","args":{}}],"messages":[{"role":"user","content":"q"}]}"#,
            Format::VercelAi,
        );
    }

    #[test]
    fn vercel_tool_call_marker_needs_both_id_and_name() {
        // `toolCallId` without `toolName` is not enough to claim Vercel.
        assert_detects(
            r#"{"toolCalls":[{"toolCallId":"c1"}],"messages":[{"role":"user","content":"q"}]}"#,
            Format::Generic,
        );
    }

    #[test]
    fn generic_messages_without_vercel_markers_still_detect_as_generic() {
        // OpenAI-style snake_case `tool_calls` must not trip the camelCase
        // Vercel heuristic.
        assert_detects(
            r#"{"tool_calls":[{"id":"c1","type":"function","function":{"name":"x","arguments":"{}"}}],"messages":[{"role":"user","content":"hi"},{"role":"assistant","content":"hello"}]}"#,
            Format::Generic,
        );
    }

    #[test]
    fn non_utf8_file_routes_to_otel_proto() {
        // 0x80 is a continuation byte and cannot start a UTF-8 sequence;
        // 0xff and 0xfe never occur in valid UTF-8 at all. Simulates a .pb
        // file from an OTLP exporter. Detection routes to OtelProto even
        // when the feature is off — main.rs's dispatch owns the "feature
        // disabled" error rather than detection.
        let mut f = NamedTempFile::new().unwrap();
        f.write_all(&[0x0a, 0x80, 0xff, 0xfe]).unwrap();
        assert_eq!(detect(f.path()).unwrap(), Format::OtelProto);
    }

    #[test]
    fn format_serializes_as_snake_case() {
        // Pins the serde names produced by `rename_all = "snake_case"`.
        assert_eq!(
            serde_json::to_string(&Format::ClaudeCode).unwrap(),
            r#""claude_code""#
        );
        assert_eq!(
            serde_json::to_string(&Format::VercelAi).unwrap(),
            r#""vercel_ai""#
        );
    }
}