Skip to main content

obol_core/transcript/
obol.rs

1//! The `obol` house dialect: a provider-tagged raw-usage sidecar (`usage.jsonl`)
2//! that in-house harnesses emit. One row per billable LLM call:
3//!
4//! ```json
5//! {"type":"obol.usage","v":"2026-06-08","provider":"anthropic","model":"…",
6//!  "service_tier":"standard","usage":{ …the SDK's usage object, verbatim… }}
7//! ```
8//!
9//! The producer tags (provider / model / tier) and copies the raw `usage`
10//! through — no arithmetic. obol dispatches on `provider` to a shared
11//! normalizer (`provider::{anthropic,openai}`) and derives the rest. The
12//! interpretation — the part naive summers get wrong — lives here, once.
13
14use super::provider::{self, ProviderTokens};
15use crate::error::ObolError;
16use crate::model::{MessageUsage, Provider};
17use serde_json::Value;
18
/// Schema versions this build understands. `v` is an ISO date, matched as an
/// opaque string (no date arithmetic). An unrecognized `v` is a loud error, not
/// a silent mis-parse: a newer schema may mean fields obol can't interpret.
///
/// `parse_row` consults this list with an exact string comparison, so a row
/// whose `v` is absent or not a JSON string is rejected as well. Supporting a
/// new schema revision means adding its date string here.
const SCHEMA_VERSIONS: &[&str] = &["2026-06-08"];
23
24pub fn parse(bytes: &[u8]) -> Result<Vec<MessageUsage>, ObolError> {
25    let text = std::str::from_utf8(bytes).map_err(|e| ObolError::MalformedTranscript {
26        line: 0,
27        msg: e.to_string(),
28    })?;
29
30    let mut out = Vec::new();
31    for (i, line) in text.lines().enumerate() {
32        let line = line.trim();
33        if line.is_empty() {
34            continue;
35        }
36        // Non-JSON lines (e.g. a truncated trailing write) are skipped, like the
37        // other dialects. Only well-formed JSON objects of the right type below.
38        let v: Value = match serde_json::from_str(line) {
39            Ok(v) => v,
40            Err(_) => continue,
41        };
42        if v.get("type").and_then(Value::as_str) != Some("obol.usage") {
43            continue;
44        }
45        out.push(parse_row(&v, i + 1)?);
46    }
47    Ok(out)
48}
49
50fn parse_row(v: &Value, line: usize) -> Result<MessageUsage, ObolError> {
51    let err = |msg: String| ObolError::MalformedTranscript { line, msg };
52
53    let ver = v.get("v").and_then(Value::as_str).unwrap_or("");
54    if !SCHEMA_VERSIONS.contains(&ver) {
55        return Err(err(format!("unknown obol.usage schema version {ver:?}")));
56    }
57
58    let provider_tag = v
59        .get("provider")
60        .and_then(Value::as_str)
61        .ok_or_else(|| err("obol.usage row missing `provider`".into()))?;
62    let usage = v
63        .get("usage")
64        .filter(|u| u.is_object())
65        .ok_or_else(|| err("obol.usage row missing `usage` object".into()))?;
66
67    let (provider, tokens): (Provider, ProviderTokens) = match provider_tag {
68        "anthropic" => (Provider::Anthropic, provider::anthropic::normalize(usage)),
69        "openai" => (Provider::OpenAI, provider::openai::normalize(usage)),
70        other => {
71            return Err(err(format!(
72                "no usage normalizer for provider {other:?} (supported: anthropic, openai)"
73            )))
74        }
75    };
76
77    let request_input_tokens =
78        tokens.input_uncached + tokens.cache_read + tokens.cache_write_5m + tokens.cache_write_1h;
79
80    Ok(MessageUsage {
81        model: v
82            .get("model")
83            .and_then(Value::as_str)
84            .unwrap_or("")
85            .to_string(),
86        provider,
87        namespace: "litellm".into(),
88        input_uncached: tokens.input_uncached,
89        cache_read: tokens.cache_read,
90        cache_write_5m: tokens.cache_write_5m,
91        cache_write_1h: tokens.cache_write_1h,
92        output: tokens.output,
93        request_input_tokens,
94        service_tier: v
95            .get("service_tier")
96            .and_then(Value::as_str)
97            .map(String::from),
98        native_cost_usd: None,
99    })
100}
101
#[cfg(test)]
mod tests {
    use super::*;

    /// A complete Anthropic row, including the 5m/1h cache-write breakdown.
    fn anthropic_line() -> &'static str {
        r#"{"type":"obol.usage","v":"2026-06-08","provider":"anthropic","model":"claude-opus-4-8","service_tier":"standard","usage":{"input_tokens":12,"cache_read_input_tokens":120,"cache_creation_input_tokens":60,"cache_creation":{"ephemeral_5m_input_tokens":50,"ephemeral_1h_input_tokens":10},"output_tokens":9}}"#
    }
    /// An OpenAI row with cached-input and reasoning-token details and no
    /// `service_tier`.
    fn openai_line() -> &'static str {
        r#"{"type":"obol.usage","v":"2026-06-08","provider":"openai","model":"gpt-5.5","usage":{"input_tokens":100,"input_tokens_details":{"cached_tokens":40},"output_tokens":20,"output_tokens_details":{"reasoning_tokens":5}}}"#
    }

    #[test]
    fn parses_anthropic_and_openai_rows() {
        let bytes = format!("{}\n{}\n", anthropic_line(), openai_line());
        let usages = parse(bytes.as_bytes()).unwrap();
        assert_eq!(
            usages,
            vec![
                MessageUsage {
                    model: "claude-opus-4-8".into(),
                    provider: Provider::Anthropic,
                    namespace: "litellm".into(),
                    input_uncached: 12,
                    cache_read: 120,
                    cache_write_5m: 50,
                    cache_write_1h: 10,
                    output: 9,
                    request_input_tokens: 192,
                    service_tier: Some("standard".into()),
                    native_cost_usd: None,
                },
                MessageUsage {
                    model: "gpt-5.5".into(),
                    provider: Provider::OpenAI,
                    namespace: "litellm".into(),
                    input_uncached: 60,
                    cache_read: 40,
                    cache_write_5m: 0,
                    cache_write_1h: 0,
                    // NOTE(review): 25 = output_tokens (20) + reasoning_tokens (5).
                    // OpenAI documents reasoning_tokens as a breakdown of
                    // output_tokens, so adding them may double-count — confirm
                    // against provider::openai before relying on this figure.
                    output: 25,
                    request_input_tokens: 100,
                    service_tier: None,
                    native_cost_usd: None,
                },
            ]
        );
    }

    #[test]
    fn unknown_schema_version_is_a_loud_error() {
        let line = r#"{"type":"obol.usage","v":"2099-12-31","provider":"anthropic","model":"x","usage":{"input_tokens":1,"output_tokens":1}}"#;
        let e = parse(line.as_bytes()).unwrap_err();
        assert!(
            matches!(e, ObolError::MalformedTranscript { line: 1, .. }),
            "got {e:?}"
        );
    }

    #[test]
    fn missing_schema_version_is_a_loud_error() {
        let line = r#"{"type":"obol.usage","provider":"anthropic","model":"x","usage":{"input_tokens":1,"output_tokens":1}}"#;
        let e = parse(line.as_bytes()).unwrap_err();
        assert!(
            matches!(e, ObolError::MalformedTranscript { line: 1, .. }),
            "got {e:?}"
        );
    }

    #[test]
    fn missing_usage_object_is_a_loud_error() {
        let line = r#"{"type":"obol.usage","v":"2026-06-08","provider":"anthropic","model":"x"}"#;
        assert!(parse(line.as_bytes()).is_err());
    }

    #[test]
    fn non_object_usage_is_a_loud_error() {
        let line = r#"{"type":"obol.usage","v":"2026-06-08","provider":"anthropic","model":"x","usage":7}"#;
        assert!(parse(line.as_bytes()).is_err());
    }

    #[test]
    fn missing_provider_is_a_loud_error() {
        let line = r#"{"type":"obol.usage","v":"2026-06-08","model":"x","usage":{"input_tokens":1}}"#;
        assert!(parse(line.as_bytes()).is_err());
    }

    #[test]
    fn unknown_provider_is_a_loud_error() {
        let line = r#"{"type":"obol.usage","v":"2026-06-08","provider":"mystery","model":"x","usage":{"input_tokens":1}}"#;
        assert!(parse(line.as_bytes()).is_err());
    }

    // Invalid UTF-8 cannot be attributed to a line, so it is reported at line 0.
    #[test]
    fn invalid_utf8_is_reported_at_line_zero() {
        let e = parse(&[0xff, 0xfe, b'\n']).unwrap_err();
        assert!(
            matches!(e, ObolError::MalformedTranscript { line: 0, .. }),
            "got {e:?}"
        );
    }

    // Line numbers are 1-based positions in the file: blank and non-JSON lines
    // that get skipped still advance the count.
    #[test]
    fn error_line_numbers_count_skipped_lines() {
        let bad = r#"{"type":"obol.usage","v":"2099-12-31","provider":"anthropic","model":"x","usage":{}}"#;
        let bytes = format!("\nnot json\n{bad}\n");
        let e = parse(bytes.as_bytes()).unwrap_err();
        assert!(
            matches!(e, ObolError::MalformedTranscript { line: 3, .. }),
            "got {e:?}"
        );
    }

    // One rejected row fails the whole parse, even after valid rows.
    #[test]
    fn bad_row_after_good_row_fails_the_parse() {
        let bad = r#"{"type":"obol.usage","v":"2026-06-08","provider":"mystery","model":"x","usage":{"input_tokens":1}}"#;
        let bytes = format!("{}\n{bad}\n", anthropic_line());
        let e = parse(bytes.as_bytes()).unwrap_err();
        assert!(
            matches!(e, ObolError::MalformedTranscript { line: 2, .. }),
            "got {e:?}"
        );
    }

    #[test]
    fn empty_input_yields_no_rows() {
        assert!(parse(b"").unwrap().is_empty());
        assert!(parse(b"\n\n  \n").unwrap().is_empty());
    }

    #[test]
    fn skips_blank_and_non_obol_lines_but_keeps_valid_rows() {
        let bytes = format!(
            "\nnot json\n{{\"type\":\"something_else\"}}\n{}\n",
            anthropic_line()
        );
        let usages = parse(bytes.as_bytes()).unwrap();
        assert_eq!(usages.len(), 1);
        assert_eq!(usages[0].model, "claude-opus-4-8");
    }

    #[test]
    fn missing_model_yields_empty_model_for_loud_unpriced() {
        let line = r#"{"type":"obol.usage","v":"2026-06-08","provider":"anthropic","usage":{"input_tokens":1,"output_tokens":1}}"#;
        let usages = parse(line.as_bytes()).unwrap();
        assert_eq!(usages[0].model, "");
    }

    // The integrity guarantee: the SAME Anthropic usage object, once embedded in
    // a Claude Code assistant line and once in an obol.usage row, produces the
    // SAME token buckets — because both route through provider::anthropic. The
    // math lives in one implementation.
    #[test]
    fn anthropic_buckets_match_the_claude_dialect() {
        let usage = r#"{"input_tokens":12,"cache_read_input_tokens":120,"cache_creation_input_tokens":60,"cache_creation":{"ephemeral_5m_input_tokens":50,"ephemeral_1h_input_tokens":10},"output_tokens":9}"#;
        let claude_line = format!(
            r#"{{"type":"assistant","message":{{"id":"m1","model":"m","usage":{usage}}}}}"#
        );
        let obol_line = format!(
            r#"{{"type":"obol.usage","v":"2026-06-08","provider":"anthropic","model":"m","usage":{usage}}}"#
        );
        let c = crate::transcript::claude::parse(claude_line.as_bytes())
            .unwrap()
            .usages;
        let o = parse(obol_line.as_bytes()).unwrap();
        let f = |u: &MessageUsage| {
            (
                u.input_uncached,
                u.cache_read,
                u.cache_write_5m,
                u.cache_write_1h,
                u.output,
                u.request_input_tokens,
            )
        };
        assert_eq!(f(&c[0]), f(&o[0]));
    }
}