use crate::tasks::generate::{ContentBlock, Message};
/// Tokens charged per second of video when the request carries neither an
/// `fps` nor a `max_frames` hint.
// NOTE(review): the per-second/per-frame/per-image rates look like a specific
// provider's published multimodal token rates — confirm the source before changing.
pub const VIDEO_TOKENS_PER_SECOND: usize = 263;
/// Tokens charged per sampled video frame when an `fps` or `max_frames` hint is present.
pub const VIDEO_TOKENS_PER_FRAME: usize = 258;
/// Tokens charged per second of audio.
pub const AUDIO_TOKENS_PER_SECOND: usize = 32;
/// Flat token charge for any image block, regardless of its size.
pub const IMAGE_TOKENS_FLAT: usize = 258;
/// Assumed video size per second (187_500 bytes = 1.5 Mbit), used to turn a
/// byte count into an estimated duration.
pub const ASSUMED_VIDEO_BYTES_PER_SEC: u64 = 187_500;
/// Assumed audio size per second (16_000 bytes = 128 kbit), used to turn a
/// byte count into an estimated duration.
pub const ASSUMED_AUDIO_BYTES_PER_SEC: u64 = 16_000;
/// Duration assumed for a video whose size is unknown (URL input, or a path
/// whose metadata cannot be read).
pub const DEFAULT_VIDEO_SECONDS: u64 = 60;
/// Duration assumed for audio whose size is unknown (URL input, or a path
/// whose metadata cannot be read).
pub const DEFAULT_AUDIO_SECONDS: u64 = 60;
/// Estimates how many bytes a base64 string decodes to, without decoding it.
///
/// Trailing `=` padding is excluded so a padded payload is not over-counted,
/// and the trailing partial group of unpadded input (length not a multiple of
/// four) is still counted: two leftover characters carry one byte, three carry
/// two.
///
/// The input is not validated; whitespace or characters outside the base64
/// alphabet are counted as if they were payload.
fn base64_decoded_len(data: &str) -> u64 {
    let payload_chars = data.trim_end_matches('=').len() as u64;
    let full_groups = payload_chars / 4;
    let leftover_chars = payload_chars % 4;
    // Split into whole groups plus remainder so the arithmetic cannot overflow.
    full_groups * 3 + leftover_chars * 3 / 4
}
/// Converts a payload size into an estimated duration in whole seconds.
///
/// An empty payload lasts zero seconds. Anything else is `bytes / bytes_per_sec`
/// rounded down, but never less than one second so a tiny clip is still counted.
///
/// # Panics
///
/// Panics on division by zero if `bytes_per_sec` is zero while `bytes` is not.
fn seconds_from_bytes(bytes: u64, bytes_per_sec: u64) -> u64 {
    match bytes {
        0 => 0,
        size => (size / bytes_per_sec).max(1),
    }
}
/// Returns the size in bytes reported by the filesystem metadata for `path`,
/// or `None` when the metadata cannot be read (for example, the path does not
/// exist).
fn file_len(path: &str) -> Option<u64> {
    match std::fs::metadata(path) {
        Ok(meta) => Some(meta.len()),
        Err(_) => None,
    }
}
/// Estimates the token cost of a video of the given duration.
///
/// * `seconds` – estimated clip length; zero always yields zero tokens.
/// * `fps` – optional sampling-rate hint. When only `max_frames` is given, one
///   frame per second is assumed. Negative or NaN values are treated as zero.
/// * `max_frames` – optional upper bound on the number of sampled frames.
///
/// With no hints the estimate is `seconds * VIDEO_TOKENS_PER_SECOND`. With any
/// hint it is `frames * VIDEO_TOKENS_PER_FRAME`, where `frames` is
/// `ceil(seconds * fps)` capped by `max_frames` and never less than one.
///
/// The result saturates at `usize::MAX` instead of overflowing, so an extreme
/// `fps` or duration cannot panic (debug) or wrap to a small number (release).
pub fn video_tokens_for_duration(
    seconds: u64,
    fps: Option<f32>,
    max_frames: Option<u32>,
) -> usize {
    if seconds == 0 {
        return 0;
    }
    let (units, tokens_per_unit) = match (fps, max_frames) {
        (None, None) => (seconds, VIDEO_TOKENS_PER_SECOND),
        _ => {
            // `f32::max` returns the non-NaN operand, so NaN collapses to 0.0 here.
            let effective_fps = f64::from(fps.unwrap_or(1.0).max(0.0));
            // Float-to-int `as` saturates, so an infinite product becomes `u64::MAX`.
            let mut frames = (seconds as f64 * effective_fps).ceil() as u64;
            if let Some(cap) = max_frames {
                frames = frames.min(u64::from(cap));
            }
            // A non-empty clip always costs at least one frame.
            (frames.max(1), VIDEO_TOKENS_PER_FRAME)
        }
    };
    // Multiply in u64 with saturation, then clamp into `usize` so the final
    // cast is lossless on every pointer width.
    units
        .saturating_mul(tokens_per_unit as u64)
        .min(usize::MAX as u64) as usize
}
/// Estimates the token cost of `seconds` of audio at
/// `AUDIO_TOKENS_PER_SECOND` tokens per second.
///
/// The result saturates at `usize::MAX` rather than overflowing for extreme
/// durations.
pub fn audio_tokens_for_duration(seconds: u64) -> usize {
    // Multiply in u64 with saturation, then clamp into `usize` so the final
    // cast is lossless on every pointer width.
    seconds
        .saturating_mul(AUDIO_TOKENS_PER_SECOND as u64)
        .min(usize::MAX as u64) as usize
}
/// Estimates the media token cost of one content block.
///
/// * Text costs nothing here (text is counted elsewhere).
/// * Images cost a flat `IMAGE_TOKENS_FLAT`.
/// * Video and audio are priced by estimated duration:
///   * URL variants assume the default duration;
///   * path variants derive it from the file size on disk, falling back to the
///     default duration when the file metadata cannot be read;
///   * base64 variants derive it from the estimated decoded payload size.
///
/// Path variants read filesystem metadata.
pub fn content_block_media_tokens(block: &ContentBlock) -> usize {
    match block {
        ContentBlock::Text { .. } => 0,

        ContentBlock::ImageUrl { .. } | ContentBlock::ImageBase64 { .. } => IMAGE_TOKENS_FLAT,

        // --- video ---
        ContentBlock::VideoUrl {
            fps, max_frames, ..
        } => video_tokens_for_duration(DEFAULT_VIDEO_SECONDS, *fps, *max_frames),
        ContentBlock::VideoPath {
            path,
            fps,
            max_frames,
        } => {
            let duration = file_len(path).map_or(DEFAULT_VIDEO_SECONDS, |size| {
                seconds_from_bytes(size, ASSUMED_VIDEO_BYTES_PER_SEC)
            });
            video_tokens_for_duration(duration, *fps, *max_frames)
        }
        ContentBlock::VideoBase64 {
            data,
            fps,
            max_frames,
            ..
        } => {
            let payload_bytes = base64_decoded_len(data);
            let duration = seconds_from_bytes(payload_bytes, ASSUMED_VIDEO_BYTES_PER_SEC);
            video_tokens_for_duration(duration, *fps, *max_frames)
        }

        // --- audio ---
        ContentBlock::AudioUrl { .. } => audio_tokens_for_duration(DEFAULT_AUDIO_SECONDS),
        ContentBlock::AudioPath { path, .. } => {
            let duration = file_len(path).map_or(DEFAULT_AUDIO_SECONDS, |size| {
                seconds_from_bytes(size, ASSUMED_AUDIO_BYTES_PER_SEC)
            });
            audio_tokens_for_duration(duration)
        }
        ContentBlock::AudioBase64 { data, .. } => {
            let payload_bytes = base64_decoded_len(data);
            let duration = seconds_from_bytes(payload_bytes, ASSUMED_AUDIO_BYTES_PER_SEC);
            audio_tokens_for_duration(duration)
        }
    }
}
pub fn blocks_media_tokens(blocks: &[ContentBlock]) -> usize {
blocks.iter().map(content_block_media_tokens).sum()
}
pub fn request_media_and_history_tokens(
images: Option<&[ContentBlock]>,
messages: Option<&[Message]>,
) -> usize {
images.map(blocks_media_tokens).unwrap_or(0)
+ messages.map(messages_history_tokens).unwrap_or(0)
}
/// Estimates the token cost of a message history.
///
/// * `UserMultimodal` messages are priced block by block: text at its length
///   divided by four (rounded up), everything else via
///   `content_block_media_tokens`, so embedded base64 payloads are priced by
///   estimated duration rather than by their encoded length.
/// * Every other message is priced at the length of its JSON serialization
///   divided by four (rounded up); a message that fails to serialize counts
///   as zero.
pub fn messages_history_tokens(messages: &[Message]) -> usize {
    let mut total = 0usize;
    for message in messages {
        total += match message {
            Message::UserMultimodal { content } => {
                let mut turn_tokens = 0usize;
                for part in content.iter() {
                    turn_tokens += match part {
                        ContentBlock::Text { text } => text.len().div_ceil(4),
                        media_block => content_block_media_tokens(media_block),
                    };
                }
                turn_tokens
            }
            plain => match serde_json::to_string(plain) {
                Ok(json) => json.len().div_ceil(4),
                Err(_) => 0,
            },
        };
    }
    total
}
// Unit tests for the media/history token estimators. Expected values are
// spelled out in terms of the module constants so a rate change does not
// silently invalidate them.
#[cfg(test)]
mod tests {
use super::*;
// Builds a video-URL block with the given sampling hints; URL inputs always
// use DEFAULT_VIDEO_SECONDS as their duration.
fn video_url(fps: Option<f32>, max_frames: Option<u32>) -> ContentBlock {
ContentBlock::VideoUrl {
url: "https://example.com/clip.mp4".into(),
fps,
max_frames,
}
}
// Text carries no media cost.
#[test]
fn text_blocks_contribute_zero_media_tokens() {
assert_eq!(
content_block_media_tokens(&ContentBlock::Text {
text: "hello world".into()
}),
0
);
}
// No hints: priced per second over the default duration.
#[test]
fn video_url_uses_default_duration() {
assert_eq!(
content_block_media_tokens(&video_url(None, None)),
DEFAULT_VIDEO_SECONDS as usize * VIDEO_TOKENS_PER_SECOND
);
}
// 60 s default duration * 2 fps = 120 frames, priced per frame.
#[test]
fn video_fps_hint_scales_frame_count() {
assert_eq!(
content_block_media_tokens(&video_url(Some(2.0), None)),
120 * VIDEO_TOKENS_PER_FRAME
);
}
// max_frames caps the frame count, both with an explicit fps and with the
// implicit 1 fps used when only max_frames is given (60 frames capped to 8).
#[test]
fn video_max_frames_caps_the_estimate() {
assert_eq!(
content_block_media_tokens(&video_url(Some(2.0), Some(16))),
16 * VIDEO_TOKENS_PER_FRAME
);
assert_eq!(
content_block_media_tokens(&video_url(None, Some(8))),
8 * VIDEO_TOKENS_PER_FRAME
);
}
// 2_500_000 base64 chars -> 1_875_000 decoded bytes -> 10 s at
// ASSUMED_VIDEO_BYTES_PER_SEC (187_500).
#[test]
fn video_base64_derives_duration_from_payload_size() {
let data = "A".repeat(2_500_000);
let block = ContentBlock::VideoBase64 {
data,
media_type: "video/mp4".into(),
fps: None,
max_frames: None,
};
assert_eq!(
content_block_media_tokens(&block),
10 * VIDEO_TOKENS_PER_SECOND
);
}
// A real file of exactly five seconds' worth of assumed bytes is priced as 5 s.
#[test]
fn video_path_stats_the_file() {
let dir = tempfile::tempdir().unwrap();
let path = dir.path().join("clip.mp4");
std::fs::write(&path, vec![0u8; (5 * ASSUMED_VIDEO_BYTES_PER_SEC) as usize]).unwrap();
let block = ContentBlock::VideoPath {
path: path.to_string_lossy().into_owned(),
fps: None,
max_frames: None,
};
assert_eq!(
content_block_media_tokens(&block),
5 * VIDEO_TOKENS_PER_SECOND
);
}
// When the file cannot be stat'd the default duration is used instead.
#[test]
fn video_path_missing_file_falls_back_to_default() {
let block = ContentBlock::VideoPath {
path: "/nonexistent/car-media-tokens/clip.mp4".into(),
fps: None,
max_frames: None,
};
assert_eq!(
content_block_media_tokens(&block),
DEFAULT_VIDEO_SECONDS as usize * VIDEO_TOKENS_PER_SECOND
);
}
// Audio URL uses the default duration; images are flat-rate; 640_000 base64
// chars -> 480_000 decoded bytes -> 30 s at ASSUMED_AUDIO_BYTES_PER_SEC (16_000).
#[test]
fn audio_and_image_rates() {
assert_eq!(
content_block_media_tokens(&ContentBlock::AudioUrl {
url: "https://example.com/a.mp3".into(),
sample_rate: None,
}),
DEFAULT_AUDIO_SECONDS as usize * AUDIO_TOKENS_PER_SECOND
);
assert_eq!(
content_block_media_tokens(&ContentBlock::ImageUrl {
url: "https://example.com/i.png".into(),
detail: "auto".into(),
}),
IMAGE_TOKENS_FLAT
);
assert_eq!(
content_block_media_tokens(&ContentBlock::AudioBase64 {
data: "A".repeat(640_000),
media_type: "audio/mp3".into(),
sample_rate: None,
}),
30 * AUDIO_TOKENS_PER_SECOND
);
}
// The request total is top-level media plus the whole history: plain turns by
// serialized JSON length / 4, multimodal turns by text length / 4 plus media.
#[test]
fn request_tokens_sum_images_media_and_full_history() {
let images = vec![video_url(None, Some(4))];
let messages = vec![
Message::User {
content: "plain text turn".into(),
},
Message::UserMultimodal {
content: vec![
ContentBlock::Text {
text: "describe this".into(),
},
ContentBlock::ImageUrl {
url: "https://example.com/i.png".into(),
detail: "auto".into(),
},
],
},
];
let plain_turn_tokens = serde_json::to_string(&messages[0])
.unwrap()
.len()
.div_ceil(4);
let multimodal_text_tokens = "describe this".len().div_ceil(4);
assert_eq!(
request_media_and_history_tokens(Some(&images), Some(&messages)),
4 * VIDEO_TOKENS_PER_FRAME
+ plain_turn_tokens
+ multimodal_text_tokens
+ IMAGE_TOKENS_FLAT
);
assert_eq!(request_media_and_history_tokens(None, None), 0);
}
// A base64 video inside history is priced by estimated duration (10 s), not by
// the 2.5 MB of encoded text; the 8-byte text part contributes 8 / 4 = 2 tokens.
#[test]
fn history_tokens_count_text_and_calibrated_media_not_base64_payload() {
let messages = vec![Message::UserMultimodal {
content: vec![
ContentBlock::Text {
text: "abcdefgh".into(), },
ContentBlock::VideoBase64 {
data: "A".repeat(2_500_000), media_type: "video/mp4".into(),
fps: None,
max_frames: None,
},
],
}];
assert_eq!(
messages_history_tokens(&messages),
2 + 10 * VIDEO_TOKENS_PER_SECOND
);
}
}