car-inference 0.32.1

Local model inference for CAR — Candle backend with Qwen3 models
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
//! Provider-calibrated token estimation for multimodal content blocks.
//!
//! Text token estimation elsewhere in this crate uses the ~4 chars/token
//! heuristic ([`crate::remote::estimate_tokens`]). Media blocks used to
//! contribute **zero** to every estimate, so a request carrying a minute
//! of video under-counted its prompt by tens of thousands of tokens —
//! breaking context-window fitting (`truncate_prompt_to_fit`), the
//! adaptive router's headroom/`needs_compaction` signal, and the I4
//! mid-stream spend guard's armed prompt cost.
//!
//! # Calibration source
//!
//! Gemini is the only remote protocol CAR routes video/audio through
//! (`ProtocolHandler::supports_video()` / `supports_audio()`), so the
//! constants are calibrated to Google's documented multimodal token
//! accounting (<https://ai.google.dev/gemini-api/docs/tokens> and
//! <https://ai.google.dev/gemini-api/docs/video-understanding>):
//!
//! * **Video**: 263 tokens per second of video at the default 1 fps
//!   sampling / default media resolution (≈258 tokens per sampled frame
//!   plus per-second overhead).
//! * **Audio**: 32 tokens per second.
//! * **Images**: 258 tokens flat for images ≤384 px in both dimensions;
//!   larger images are tiled into 768×768 crops at 258 tokens each. We
//!   don't decode image bytes here, so the flat 258 is used as the
//!   documented floor.
//!
//! These are deliberately **estimates for budgeting**, not billing-grade
//! counts: the goal is that a multimodal request stops counting as
//! near-zero input. Anthropic/OpenAI image accounting differs (≈
//! `w*h/750` and tile-based respectively) but is the same order of
//! magnitude as the Gemini floor.
//!
//! # Duration heuristics
//!
//! The estimator is synchronous and does no network I/O and no media
//! decoding. Durations are derived from what is cheaply knowable:
//!
//! * `VideoPath` / `AudioPath` — file size via `fs::metadata`, divided
//!   by an assumed encoding rate (1.5 Mbps H.264 for video — typical
//!   720p; 128 kbps for audio). Stat failure falls back to the default
//!   duration.
//! * `VideoBase64` / `AudioBase64` — approximate decoded byte count
//!   (`len / 4 * 3`; `=` padding is not subtracted), same rate conversion.
//! * `VideoUrl` / `AudioUrl` — no probe (an estimator must not block on
//!   the network); assumes [`DEFAULT_VIDEO_SECONDS`] /
//!   [`DEFAULT_AUDIO_SECONDS`].
//!
//! `fps` / `max_frames` hints on the video variants refine the estimate:
//! with an explicit `fps` the frame count scales accordingly, and
//! `max_frames` caps it (the same cap backends apply when sampling).

use crate::tasks::generate::{ContentBlock, Message};

/// Tokens per second of video at Gemini's default 1 fps sampling and
/// default media resolution. [`video_tokens_for_duration`] applies this
/// rate only when the caller supplies neither an `fps` nor a
/// `max_frames` hint. Source:
/// <https://ai.google.dev/gemini-api/docs/video-understanding>.
pub const VIDEO_TOKENS_PER_SECOND: usize = 263;

/// Tokens per sampled video frame at default media resolution. Used when
/// the caller supplies an explicit `fps` or `max_frames` hint, in which
/// case the estimate is `frames × 258` rather than `seconds × 263`.
/// Source: <https://ai.google.dev/gemini-api/docs/video-understanding>.
pub const VIDEO_TOKENS_PER_FRAME: usize = 258;

/// Tokens per second of audio. Source:
/// <https://ai.google.dev/gemini-api/docs/tokens>.
pub const AUDIO_TOKENS_PER_SECOND: usize = 32;

/// Flat per-image token floor (Gemini: 258 for ≤384 px, 258 per 768×768
/// tile above that). We don't decode dimensions, so the floor is used
/// for every image block, base64 or URL.
pub const IMAGE_TOKENS_FLAT: usize = 258;

/// Assumed video byte rate for size→duration conversion: 1.5 Mbps
/// (typical 720p H.264) ≈ 187,500 bytes/second. Applied to both on-disk
/// file sizes and decoded base64 payload sizes.
pub const ASSUMED_VIDEO_BYTES_PER_SEC: u64 = 187_500;

/// Assumed audio byte rate for size→duration conversion: 128 kbps
/// (typical MP3/AAC) = 16,000 bytes/second. Applied to both on-disk
/// file sizes and decoded base64 payload sizes.
pub const ASSUMED_AUDIO_BYTES_PER_SEC: u64 = 16_000;

/// Assumed duration, in seconds, for video whose size is unknown:
/// `VideoUrl` sources (sizing them would need network I/O) and
/// `VideoPath` sources whose file can't be stat'd.
pub const DEFAULT_VIDEO_SECONDS: u64 = 60;

/// Assumed duration, in seconds, for audio whose size is unknown:
/// `AudioUrl` sources (sizing them would need network I/O) and
/// `AudioPath` sources whose file can't be stat'd.
pub const DEFAULT_AUDIO_SECONDS: u64 = 60;

/// Approximate number of raw bytes encoded by a base64 string.
///
/// Every complete group of four characters counts as three bytes. A
/// trailing partial group is ignored and `=` padding is not subtracted,
/// so the result can be off by a few bytes — irrelevant at the
/// granularity of a duration estimate.
fn base64_decoded_len(data: &str) -> u64 {
    let complete_quads = data.len() as u64 / 4;
    complete_quads * 3
}

/// Converts a media byte count into whole seconds at `bytes_per_sec`.
///
/// Empty media is zero seconds. Anything non-empty is the floored
/// quotient, raised to a minimum of one second so that a short clip
/// never vanishes from the estimate.
///
/// `bytes_per_sec` must be non-zero; a zero rate with non-empty media
/// panics on the integer division.
fn seconds_from_bytes(bytes: u64, bytes_per_sec: u64) -> u64 {
    match bytes {
        0 => 0,
        size => (size / bytes_per_sec).max(1),
    }
}

/// Size of the file at `path` in bytes.
///
/// Returns `None` whenever the metadata lookup fails for any reason;
/// the underlying I/O error is intentionally discarded because callers
/// fall back to a default duration instead of reporting it.
fn file_len(path: &str) -> Option<u64> {
    match std::fs::metadata(path) {
        Ok(meta) => Some(meta.len()),
        Err(_) => None,
    }
}

/// Estimated tokens for `seconds` of video, honoring the caller's
/// sampling hints.
///
/// * No hints: the documented `seconds × 263`.
/// * Explicit `fps`: `ceil(seconds × fps)` sampled frames at 258 tokens
///   each. Negative or NaN `fps` is treated as 0.
/// * `max_frames`: caps the frame count (with `fps` defaulting to 1
///   when absent) — the same cap backends apply when actually sampling
///   the clip.
///
/// A non-empty clip with hints always counts at least one frame, even
/// when `fps` is 0 or `max_frames` is `Some(0)`. A zero-second clip is
/// always 0 tokens.
///
/// The result saturates at `usize::MAX` instead of overflowing, so an
/// absurd caller-supplied `fps` (or duration) yields a "too big for any
/// context window" estimate rather than a debug-build panic or a
/// wrapped, misleadingly small number in release builds.
pub fn video_tokens_for_duration(
    seconds: u64,
    fps: Option<f32>,
    max_frames: Option<u32>,
) -> usize {
    if seconds == 0 {
        return 0;
    }
    // Clamp before narrowing so 32-bit targets saturate instead of
    // silently truncating the high bits.
    let clamp_to_usize = |n: u64| n.min(usize::MAX as u64) as usize;
    match (fps, max_frames) {
        (None, None) => clamp_to_usize(seconds).saturating_mul(VIDEO_TOKENS_PER_SECOND),
        _ => {
            // `f32::max` returns the non-NaN operand, so NaN becomes 0.0.
            let effective_fps = f64::from(fps.unwrap_or(1.0).max(0.0));
            // Float→int `as` casts saturate, so an infinite or huge
            // product becomes `u64::MAX` here rather than UB/garbage.
            let mut frames = (seconds as f64 * effective_fps).ceil() as u64;
            if let Some(cap) = max_frames {
                frames = frames.min(u64::from(cap));
            }
            // A non-empty clip is at least one sampled frame.
            frames = frames.max(1);
            clamp_to_usize(frames).saturating_mul(VIDEO_TOKENS_PER_FRAME)
        }
    }
}

/// Estimated tokens for `seconds` of audio: `seconds × 32`.
///
/// Saturates at `usize::MAX` rather than overflowing, so an extreme
/// duration never panics (debug) or wraps to a small number (release).
pub fn audio_tokens_for_duration(seconds: u64) -> usize {
    // Clamp before narrowing so 32-bit targets saturate instead of
    // truncating the high bits of the duration.
    let secs = seconds.min(usize::MAX as u64) as usize;
    secs.saturating_mul(AUDIO_TOKENS_PER_SECOND)
}

/// Estimated input tokens contributed by one content block's **media**
/// payload. `Text` blocks return 0 — text is counted by the caller's
/// existing text heuristic, never double-counted here.
pub fn content_block_media_tokens(block: &ContentBlock) -> usize {
    match block {
        ContentBlock::Text { .. } => 0,
        ContentBlock::ImageBase64 { .. } | ContentBlock::ImageUrl { .. } => IMAGE_TOKENS_FLAT,
        ContentBlock::VideoPath {
            path,
            fps,
            max_frames,
        } => {
            let seconds = file_len(path)
                .map(|len| seconds_from_bytes(len, ASSUMED_VIDEO_BYTES_PER_SEC))
                .unwrap_or(DEFAULT_VIDEO_SECONDS);
            video_tokens_for_duration(seconds, *fps, *max_frames)
        }
        ContentBlock::VideoUrl {
            fps, max_frames, ..
        } => video_tokens_for_duration(DEFAULT_VIDEO_SECONDS, *fps, *max_frames),
        ContentBlock::VideoBase64 {
            data,
            fps,
            max_frames,
            ..
        } => {
            let seconds =
                seconds_from_bytes(base64_decoded_len(data), ASSUMED_VIDEO_BYTES_PER_SEC);
            video_tokens_for_duration(seconds, *fps, *max_frames)
        }
        ContentBlock::AudioPath { path, .. } => {
            let seconds = file_len(path)
                .map(|len| seconds_from_bytes(len, ASSUMED_AUDIO_BYTES_PER_SEC))
                .unwrap_or(DEFAULT_AUDIO_SECONDS);
            audio_tokens_for_duration(seconds)
        }
        ContentBlock::AudioUrl { .. } => audio_tokens_for_duration(DEFAULT_AUDIO_SECONDS),
        ContentBlock::AudioBase64 { data, .. } => {
            let seconds =
                seconds_from_bytes(base64_decoded_len(data), ASSUMED_AUDIO_BYTES_PER_SEC);
            audio_tokens_for_duration(seconds)
        }
    }
}

/// Sum of [`content_block_media_tokens`] over a block slice.
///
/// The total saturates at `usize::MAX` rather than overflowing: a
/// budgeting estimate that is "too large" must stay too large, not
/// panic in debug builds or wrap to a small value in release builds.
/// An empty slice yields 0.
pub fn blocks_media_tokens(blocks: &[ContentBlock]) -> usize {
    blocks
        .iter()
        .map(content_block_media_tokens)
        .fold(0usize, usize::saturating_add)
}

/// Total tokens a request's multimodal inputs and multi-turn history
/// add on top of the caller's prompt/context/tools text estimate:
/// single-turn `images` blocks at calibrated media rates, plus the
/// full media-aware `messages` history ([`messages_history_tokens`] —
/// history *text* at ~4 chars/token, history media at calibrated
/// rates). This is the one number the window-fit, routing, and
/// spend-guard estimators all add, so a multi-turn video request is
/// never counted as near-zero input.
///
/// Either argument may be `None`, in which case it contributes 0. The
/// two parts are combined with a saturating add so that a pair of very
/// large estimates clamps to `usize::MAX` instead of overflowing.
pub fn request_media_and_history_tokens(
    images: Option<&[ContentBlock]>,
    messages: Option<&[Message]>,
) -> usize {
    let single_turn_media = images.map_or(0, blocks_media_tokens);
    let history = messages.map_or(0, messages_history_tokens);
    single_turn_media.saturating_add(history)
}

/// Estimated tokens for a multi-turn `messages` history that is
/// media-aware: text content is counted with the same ~4 chars/token
/// heuristic as the rest of the crate (`len().div_ceil(4)`, i.e. on the
/// UTF-8 byte length), while media blocks are counted via
/// [`content_block_media_tokens`] **instead of** their serialized JSON
/// envelope. This matters in both directions: a `VideoPath` block
/// serializes to a ~40-char envelope (near-zero tokens) while the
/// provider charges thousands; a `VideoBase64`/`ImageBase64` block
/// serializes to its full base64 payload, which at chars/4 wildly
/// overcounts vs. the provider's per-second/per-image accounting.
///
/// Every message other than `UserMultimodal` is estimated from the
/// length of its serialized JSON; a message that fails to serialize
/// contributes 0 (best-effort — an estimator must not fail the
/// request).
///
/// Both the per-message and the overall totals saturate at
/// `usize::MAX` rather than overflowing.
pub fn messages_history_tokens(messages: &[Message]) -> usize {
    messages
        .iter()
        .map(|msg| match msg {
            Message::UserMultimodal { content } => content
                .iter()
                .map(|block| match block {
                    ContentBlock::Text { text } => text.len().div_ceil(4),
                    media => content_block_media_tokens(media),
                })
                .fold(0usize, usize::saturating_add),
            other => serde_json::to_string(other)
                .map(|s| s.len().div_ceil(4))
                .unwrap_or(0),
        })
        .fold(0usize, usize::saturating_add)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Remote clip fixture; only the sampling hints vary between tests.
    fn video_url(fps: Option<f32>, max_frames: Option<u32>) -> ContentBlock {
        ContentBlock::VideoUrl {
            max_frames,
            fps,
            url: "https://example.com/clip.mp4".into(),
        }
    }

    /// Inline MP4 fixture whose payload is `b64_chars` base64 characters
    /// long, with no sampling hints.
    fn video_base64(b64_chars: usize) -> ContentBlock {
        ContentBlock::VideoBase64 {
            data: "A".repeat(b64_chars),
            media_type: "video/mp4".into(),
            fps: None,
            max_frames: None,
        }
    }

    /// Remote image fixture.
    fn image_url() -> ContentBlock {
        ContentBlock::ImageUrl {
            url: "https://example.com/i.png".into(),
            detail: "auto".into(),
        }
    }

    #[test]
    fn text_blocks_contribute_zero_media_tokens() {
        let text = ContentBlock::Text {
            text: "hello world".into(),
        };
        assert_eq!(content_block_media_tokens(&text), 0);
    }

    #[test]
    fn video_url_uses_default_duration() {
        // No hints: default 60 s at the documented per-second rate.
        let expected = DEFAULT_VIDEO_SECONDS as usize * VIDEO_TOKENS_PER_SECOND;
        let actual = content_block_media_tokens(&video_url(None, None));
        assert_eq!(actual, expected);
    }

    #[test]
    fn video_fps_hint_scales_frame_count() {
        // Default 60 s sampled at 2 fps is 120 frames.
        let actual = content_block_media_tokens(&video_url(Some(2.0), None));
        assert_eq!(actual, 120 * VIDEO_TOKENS_PER_FRAME);
    }

    #[test]
    fn video_max_frames_caps_the_estimate() {
        // 2 fps over 60 s would sample 120 frames; the cap wins at 16.
        let capped_with_fps = content_block_media_tokens(&video_url(Some(2.0), Some(16)));
        assert_eq!(capped_with_fps, 16 * VIDEO_TOKENS_PER_FRAME);

        // With only a cap the rate defaults to 1 fps: min(60, 8) frames.
        let capped_only = content_block_media_tokens(&video_url(None, Some(8)));
        assert_eq!(capped_only, 8 * VIDEO_TOKENS_PER_FRAME);
    }

    #[test]
    fn video_base64_derives_duration_from_payload_size() {
        // 2,500,000 base64 chars decode to 1,875,000 bytes, which is
        // 10 seconds at the assumed 187,500 B/s.
        let block = video_base64(2_500_000);
        assert_eq!(
            content_block_media_tokens(&block),
            10 * VIDEO_TOKENS_PER_SECOND
        );
    }

    #[test]
    fn video_path_stats_the_file() {
        // A private temp directory keeps concurrently running tests from
        // colliding on the fixture and removes it when `scratch` drops.
        let scratch = tempfile::tempdir().unwrap();
        let clip = scratch.path().join("clip.mp4");
        // Exactly five seconds' worth of bytes at the assumed video rate.
        let five_seconds = vec![0u8; (5 * ASSUMED_VIDEO_BYTES_PER_SEC) as usize];
        std::fs::write(&clip, five_seconds).unwrap();

        let block = ContentBlock::VideoPath {
            path: clip.to_string_lossy().into_owned(),
            fps: None,
            max_frames: None,
        };
        assert_eq!(
            content_block_media_tokens(&block),
            5 * VIDEO_TOKENS_PER_SECOND
        );
    }

    #[test]
    fn video_path_missing_file_falls_back_to_default() {
        let missing = ContentBlock::VideoPath {
            path: "/nonexistent/car-media-tokens/clip.mp4".into(),
            fps: None,
            max_frames: None,
        };
        let expected = DEFAULT_VIDEO_SECONDS as usize * VIDEO_TOKENS_PER_SECOND;
        assert_eq!(content_block_media_tokens(&missing), expected);
    }

    #[test]
    fn audio_and_image_rates() {
        // Remote audio: default duration at the per-second audio rate.
        let remote_audio = ContentBlock::AudioUrl {
            url: "https://example.com/a.mp3".into(),
            sample_rate: None,
        };
        assert_eq!(
            content_block_media_tokens(&remote_audio),
            DEFAULT_AUDIO_SECONDS as usize * AUDIO_TOKENS_PER_SECOND
        );

        // Any image: the flat floor.
        assert_eq!(content_block_media_tokens(&image_url()), IMAGE_TOKENS_FLAT);

        // 640,000 base64 chars decode to 480,000 bytes, which is 30 s
        // at the assumed 16,000 B/s.
        let inline_audio = ContentBlock::AudioBase64 {
            data: "A".repeat(640_000),
            media_type: "audio/mp3".into(),
            sample_rate: None,
        };
        assert_eq!(
            content_block_media_tokens(&inline_audio),
            30 * AUDIO_TOKENS_PER_SECOND
        );
    }

    #[test]
    fn request_tokens_sum_images_media_and_full_history() {
        let images = vec![video_url(None, Some(4))];
        let plain_turn = Message::User {
            content: "plain text turn".into(),
        };
        let multimodal_turn = Message::UserMultimodal {
            content: vec![
                ContentBlock::Text {
                    text: "describe this".into(),
                },
                image_url(),
            ],
        };
        let messages = vec![plain_turn, multimodal_turn];

        // History text is part of the total: the plain turn at the
        // chars/4 of its serialized JSON, the multimodal turn's text at
        // chars/4, and every media block at its calibrated rate.
        let plain_turn_tokens = serde_json::to_string(&messages[0])
            .unwrap()
            .len()
            .div_ceil(4);
        let multimodal_text_tokens = "describe this".len().div_ceil(4);
        let expected = 4 * VIDEO_TOKENS_PER_FRAME
            + plain_turn_tokens
            + multimodal_text_tokens
            + IMAGE_TOKENS_FLAT;

        let actual = request_media_and_history_tokens(Some(&images), Some(&messages));
        assert_eq!(actual, expected);

        // Nothing supplied, nothing counted.
        assert_eq!(request_media_and_history_tokens(None, None), 0);
    }

    #[test]
    fn history_tokens_count_text_and_calibrated_media_not_base64_payload() {
        // Counting the base64 payload at chars/4 would report ~625k
        // tokens for 10 s of video; the calibrated per-second rate must
        // be used instead.
        let turn = Message::UserMultimodal {
            content: vec![
                ContentBlock::Text {
                    text: "abcdefgh".into(), // 8 chars → 2 tokens
                },
                video_base64(2_500_000), // 10 seconds
            ],
        };
        let messages = vec![turn];
        let expected = 2 + 10 * VIDEO_TOKENS_PER_SECOND;
        assert_eq!(messages_history_tokens(&messages), expected);
    }
}