// car-inference 0.32.1

//! Provider-calibrated token estimation for multimodal content blocks.
//!
//! Text token estimation elsewhere in this crate uses the ~4 chars/token
//! heuristic ([`crate::remote::estimate_tokens`]). Media blocks used to
//! contribute **zero** to every estimate, so a request carrying a minute
//! of video under-counted its prompt by tens of thousands of tokens —
//! breaking context-window fitting (`truncate_prompt_to_fit`), the
//! adaptive router's headroom/`needs_compaction` signal, and the I4
//! mid-stream spend guard's armed prompt cost.
//!
//! # Calibration source
//!
//! Gemini is the only remote protocol CAR routes video/audio through
//! (`ProtocolHandler::supports_video()` / `supports_audio()`), so the
//! constants are calibrated to Google's documented multimodal token
//! accounting (<https://ai.google.dev/gemini-api/docs/tokens> and
//! <https://ai.google.dev/gemini-api/docs/video-understanding>):
//!
//! * **Video**: 263 tokens per second of video at the default 1 fps
//!   sampling / default media resolution (≈258 tokens per sampled frame
//!   plus per-second overhead).
//! * **Audio**: 32 tokens per second.
//! * **Images**: 258 tokens flat for images ≤384 px in both dimensions;
//!   larger images are tiled into 768×768 crops at 258 tokens each. We
//!   don't decode image bytes here, so the flat 258 is used as the
//!   documented floor.
//!
//! These are deliberately **estimates for budgeting**, not billing-grade
//! counts: the goal is that a multimodal request stops counting as
//! near-zero input. Anthropic/OpenAI image accounting differs (≈
//! `w*h/750` and tile-based respectively) but is the same order of
//! magnitude as the Gemini floor.
//!
//! # Duration heuristics
//!
//! The estimator is synchronous and does no network I/O and no media
//! decoding. Durations are derived from what is cheaply knowable:
//!
//! * `VideoPath` / `AudioPath` — file size via `fs::metadata`, divided
//!   by an assumed encoding rate (1.5 Mbps H.264 for video — typical
//!   720p; 128 kbps for audio). Stat failure falls back to the default
//!   duration.
//! * `VideoBase64` / `AudioBase64` — decoded byte count (`len * 3 / 4`),
//!   same rate conversion.
//! * `VideoUrl` / `AudioUrl` — no probe (an estimator must not block on
//!   the network); assumes [`DEFAULT_VIDEO_SECONDS`] /
//!   [`DEFAULT_AUDIO_SECONDS`].
//!
//! `fps` / `max_frames` hints on the video variants refine the estimate:
//! with an explicit `fps` the frame count scales accordingly, and
//! `max_frames` caps it (the same cap backends apply when sampling).

use crate::tasks::generate::{ContentBlock, Message};

/// Token cost of one second of video when Gemini samples at its default
/// 1 fps and default media resolution. Source:
/// <https://ai.google.dev/gemini-api/docs/video-understanding>.
pub const VIDEO_TOKENS_PER_SECOND: usize = 263;

/// Token cost of a single sampled video frame at default media
/// resolution. Applied instead of the per-second rate whenever the caller
/// passes an explicit `fps` or `max_frames` hint. Source:
/// <https://ai.google.dev/gemini-api/docs/video-understanding>.
pub const VIDEO_TOKENS_PER_FRAME: usize = 258;

/// Token cost of one second of audio. Source:
/// <https://ai.google.dev/gemini-api/docs/tokens>.
pub const AUDIO_TOKENS_PER_SECOND: usize = 32;

/// Per-image token floor. Gemini charges 258 tokens for an image of at
/// most 384 px per side and 258 per 768×768 tile for anything larger;
/// image dimensions are never decoded here, so every image is counted at
/// the floor.
pub const IMAGE_TOKENS_FLAT: usize = 258;

/// Byte rate assumed when converting a video's size into a duration:
/// 1.5 Mbps (typical 720p H.264), i.e. 187,500 bytes per second.
pub const ASSUMED_VIDEO_BYTES_PER_SEC: u64 = 1_500_000 / 8;

/// Byte rate assumed when converting an audio clip's size into a
/// duration: 128 kbps (typical MP3/AAC), i.e. 16,000 bytes per second.
pub const ASSUMED_AUDIO_BYTES_PER_SEC: u64 = 128_000 / 8;

/// Duration, in seconds, assumed for a video whose size cannot be learned
/// without network I/O (`VideoUrl`).
pub const DEFAULT_VIDEO_SECONDS: u64 = 60;

/// Duration, in seconds, assumed for an audio clip whose size cannot be
/// learned without network I/O (`AudioUrl`).
pub const DEFAULT_AUDIO_SECONDS: u64 = 60;

/// Number of bytes a base64 string decodes to, computed from its length
/// alone (no decoding, no alphabet validation).
///
/// Every 4 base64 characters carry 3 bytes. Trailing `=` padding carries
/// no data, and an unpadded final group of 2 or 3 characters still
/// carries 1 or 2 bytes, so the result is `significant_chars * 3 / 4`
/// (rounded down) after the padding is stripped.
fn base64_decoded_len(data: &str) -> u64 {
    // Padding must not be counted as payload: "QQ==" is one byte, not three.
    let significant = data.trim_end_matches('=').len() as u64;
    // Whole 4-char groups plus whatever the trailing partial group holds
    // (remainder 2 → 1 byte, remainder 3 → 2 bytes). Split this way so the
    // multiplication can never overflow.
    significant / 4 * 3 + significant % 4 * 3 / 4
}

/// Converts a payload size into a whole-second duration at the given
/// byte rate.
///
/// An empty payload is 0 seconds. Any non-empty payload is the integer
/// quotient `bytes / bytes_per_sec`, raised to 1 so short clips never
/// round down to nothing.
///
/// # Panics
///
/// Panics on division by zero if `bytes_per_sec` is 0 while `bytes` is
/// non-zero.
fn seconds_from_bytes(bytes: u64, bytes_per_sec: u64) -> u64 {
    match bytes {
        0 => 0,
        size => (size / bytes_per_sec).max(1),
    }
}

/// Looks up the size in bytes of the file at `path`.
///
/// Returns `None` whenever `fs::metadata` fails for the path; the
/// underlying I/O error is discarded.
fn file_len(path: &str) -> Option<u64> {
    match std::fs::metadata(path) {
        Ok(meta) => Some(meta.len()),
        Err(_) => None,
    }
}

/// Estimated tokens for `seconds` of video, honoring the caller's
/// sampling hints.
///
/// * No hints: `seconds × VIDEO_TOKENS_PER_SECOND`.
/// * Any hint present: the sampled frame count is
///   `ceil(seconds × fps)` (with `fps` defaulting to 1.0 and negative or
///   NaN values treated as 0), capped at `max_frames` when given, and
///   never less than one frame; the result is
///   `frames × VIDEO_TOKENS_PER_FRAME`.
///
/// A zero-length clip is always 0 tokens. The result saturates at
/// `usize::MAX` instead of overflowing, so an absurd `fps` hint (for
/// example infinity) or duration cannot panic or wrap around.
pub fn video_tokens_for_duration(
    seconds: u64,
    fps: Option<f32>,
    max_frames: Option<u32>,
) -> usize {
    if seconds == 0 {
        return 0;
    }

    // `units × rate`, clamped to `usize::MAX`. The `min` makes the
    // narrowing cast lossless on targets where `usize` is under 64 bits.
    let scaled =
        |units: u64, rate: usize| (units.min(usize::MAX as u64) as usize).saturating_mul(rate);

    if fps.is_none() && max_frames.is_none() {
        return scaled(seconds, VIDEO_TOKENS_PER_SECOND);
    }

    // `f32::max` discards NaN, so a NaN hint behaves like 0 fps.
    let effective_fps = f64::from(fps.unwrap_or(1.0).max(0.0));
    // Float → integer `as` saturates, so an infinite product becomes
    // `u64::MAX` here and is then clamped by `scaled` below.
    let sampled = (seconds as f64 * effective_fps).ceil() as u64;
    let frames = max_frames
        .map_or(sampled, |cap| sampled.min(u64::from(cap)))
        // A non-empty clip is at least one sampled frame.
        .max(1);
    scaled(frames, VIDEO_TOKENS_PER_FRAME)
}

/// Estimated tokens for `seconds` of audio:
/// `seconds × AUDIO_TOKENS_PER_SECOND`.
///
/// The result saturates at `usize::MAX` rather than overflowing, so an
/// arbitrarily large duration cannot panic or wrap around.
pub fn audio_tokens_for_duration(seconds: u64) -> usize {
    // Clamp before narrowing so the cast is lossless on targets where
    // `usize` is narrower than 64 bits.
    (seconds.min(usize::MAX as u64) as usize).saturating_mul(AUDIO_TOKENS_PER_SECOND)
}

/// Estimated input tokens contributed by one content block's **media**
/// payload. `Text` blocks return 0 — text is counted by the caller's
/// existing text heuristic, never double-counted here.
pub fn content_block_media_tokens(block: &ContentBlock) -> usize {
    match block {
        ContentBlock::Text { .. } => 0,
        ContentBlock::ImageBase64 { .. } | ContentBlock::ImageUrl { .. } => IMAGE_TOKENS_FLAT,
        ContentBlock::VideoPath {
            path,
            fps,
            max_frames,
        } => {
            let seconds = file_len(path)
                .map(|len| seconds_from_bytes(len, ASSUMED_VIDEO_BYTES_PER_SEC))
                .unwrap_or(DEFAULT_VIDEO_SECONDS);
            video_tokens_for_duration(seconds, *fps, *max_frames)
        }
        ContentBlock::VideoUrl {
            fps, max_frames, ..
        } => video_tokens_for_duration(DEFAULT_VIDEO_SECONDS, *fps, *max_frames),
        ContentBlock::VideoBase64 {
            data,
            fps,
            max_frames,
            ..
        } => {
            let seconds =
                seconds_from_bytes(base64_decoded_len(data), ASSUMED_VIDEO_BYTES_PER_SEC);
            video_tokens_for_duration(seconds, *fps, *max_frames)
        }
        ContentBlock::AudioPath { path, .. } => {
            let seconds = file_len(path)
                .map(|len| seconds_from_bytes(len, ASSUMED_AUDIO_BYTES_PER_SEC))
                .unwrap_or(DEFAULT_AUDIO_SECONDS);
            audio_tokens_for_duration(seconds)
        }
        ContentBlock::AudioUrl { .. } => audio_tokens_for_duration(DEFAULT_AUDIO_SECONDS),
        ContentBlock::AudioBase64 { data, .. } => {
            let seconds =
                seconds_from_bytes(base64_decoded_len(data), ASSUMED_AUDIO_BYTES_PER_SEC);
            audio_tokens_for_duration(seconds)
        }
    }
}

/// Sum of [`content_block_media_tokens`] over a block slice.
pub fn blocks_media_tokens(blocks: &[ContentBlock]) -> usize {
    blocks.iter().map(content_block_media_tokens).sum()
}

/// Total tokens a request's multimodal inputs and multi-turn history
/// add on top of the caller's prompt/context/tools text estimate:
/// single-turn `images` blocks at calibrated media rates, plus the
/// full media-aware `messages` history ([`messages_history_tokens`] —
/// history *text* at ~4 chars/token, history media at calibrated
/// rates). This is the one number the window-fit, routing, and
/// spend-guard estimators all add, so a multi-turn video request is
/// never counted as near-zero input.
pub fn request_media_and_history_tokens(
    images: Option<&[ContentBlock]>,
    messages: Option<&[Message]>,
) -> usize {
    images.map(blocks_media_tokens).unwrap_or(0)
        + messages.map(messages_history_tokens).unwrap_or(0)
}

/// Media-aware token estimate for a multi-turn `messages` history.
///
/// * `UserMultimodal` turns are walked block by block: text is counted at
///   ~4 chars/token (rounded up), media via
///   [`content_block_media_tokens`] — **not** via the block's serialized
///   JSON.
/// * Every other message is serialized to JSON and counted at
///   ~4 chars/token (rounded up); a serialization failure counts as 0.
///
/// Counting media by its JSON envelope would be wrong in both directions:
/// a `VideoPath` block serializes to a ~40-char envelope (near-zero
/// tokens) while the provider charges thousands, whereas a
/// `VideoBase64`/`ImageBase64` block serializes to its entire base64
/// payload, which at chars/4 wildly overcounts against per-second /
/// per-image accounting.
pub fn messages_history_tokens(messages: &[Message]) -> usize {
    let mut total = 0usize;
    for message in messages {
        total += match message {
            Message::UserMultimodal { content } => {
                let mut turn_tokens = 0usize;
                for block in content.iter() {
                    turn_tokens += match block {
                        ContentBlock::Text { text } => text.len().div_ceil(4),
                        media => content_block_media_tokens(media),
                    };
                }
                turn_tokens
            }
            other => match serde_json::to_string(other) {
                Ok(json) => json.len().div_ceil(4),
                Err(_) => 0,
            },
        };
    }
    total
}

#[cfg(test)]
mod tests {
    // Unit tests pinning the calibrated rates, the size→duration
    // heuristics, and the media-aware history accounting.
    use super::*;

    /// Builds a `VideoUrl` block with a fixed example URL and the given
    /// sampling hints.
    fn video_url(fps: Option<f32>, max_frames: Option<u32>) -> ContentBlock {
        ContentBlock::VideoUrl {
            url: "https://example.com/clip.mp4".into(),
            fps,
            max_frames,
        }
    }

    // Text is counted by the caller's text heuristic, so the media
    // estimator must report nothing for it.
    #[test]
    fn text_blocks_contribute_zero_media_tokens() {
        assert_eq!(
            content_block_media_tokens(&ContentBlock::Text {
                text: "hello world".into()
            }),
            0
        );
    }

    #[test]
    fn video_url_uses_default_duration() {
        // 60s × 263 tokens/s — the documented Gemini rate.
        assert_eq!(
            content_block_media_tokens(&video_url(None, None)),
            DEFAULT_VIDEO_SECONDS as usize * VIDEO_TOKENS_PER_SECOND
        );
    }

    #[test]
    fn video_fps_hint_scales_frame_count() {
        // 60s at 2 fps = 120 frames × 258.
        assert_eq!(
            content_block_media_tokens(&video_url(Some(2.0), None)),
            120 * VIDEO_TOKENS_PER_FRAME
        );
    }

    #[test]
    fn video_max_frames_caps_the_estimate() {
        // 60s at 2 fps would be 120 frames, capped at 16.
        assert_eq!(
            content_block_media_tokens(&video_url(Some(2.0), Some(16))),
            16 * VIDEO_TOKENS_PER_FRAME
        );
        // max_frames alone (default 1 fps): min(60, 8) = 8 frames.
        assert_eq!(
            content_block_media_tokens(&video_url(None, Some(8))),
            8 * VIDEO_TOKENS_PER_FRAME
        );
    }

    #[test]
    fn video_base64_derives_duration_from_payload_size() {
        // 10 seconds at the assumed 187,500 B/s = 1,875,000 bytes
        // = 2,500,000 base64 chars.
        let data = "A".repeat(2_500_000);
        let block = ContentBlock::VideoBase64 {
            data,
            media_type: "video/mp4".into(),
            fps: None,
            max_frames: None,
        };
        assert_eq!(
            content_block_media_tokens(&block),
            10 * VIDEO_TOKENS_PER_SECOND
        );
    }

    #[test]
    fn video_path_stats_the_file() {
        // tempfile: unique per-test directory — no collision when the
        // suite runs concurrently, cleaned up on drop.
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("clip.mp4");
        // 5 seconds at the assumed rate.
        std::fs::write(&path, vec![0u8; (5 * ASSUMED_VIDEO_BYTES_PER_SEC) as usize]).unwrap();
        let block = ContentBlock::VideoPath {
            path: path.to_string_lossy().into_owned(),
            fps: None,
            max_frames: None,
        };
        assert_eq!(
            content_block_media_tokens(&block),
            5 * VIDEO_TOKENS_PER_SECOND
        );
    }

    // When the file can't be stat'd the estimate falls back to
    // DEFAULT_VIDEO_SECONDS instead of reporting zero.
    #[test]
    fn video_path_missing_file_falls_back_to_default() {
        let block = ContentBlock::VideoPath {
            path: "/nonexistent/car-media-tokens/clip.mp4".into(),
            fps: None,
            max_frames: None,
        };
        assert_eq!(
            content_block_media_tokens(&block),
            DEFAULT_VIDEO_SECONDS as usize * VIDEO_TOKENS_PER_SECOND
        );
    }

    #[test]
    fn audio_and_image_rates() {
        // Audio by URL: no size is available, so the default duration applies.
        assert_eq!(
            content_block_media_tokens(&ContentBlock::AudioUrl {
                url: "https://example.com/a.mp3".into(),
                sample_rate: None,
            }),
            DEFAULT_AUDIO_SECONDS as usize * AUDIO_TOKENS_PER_SECOND
        );
        // Images are a flat per-image cost.
        assert_eq!(
            content_block_media_tokens(&ContentBlock::ImageUrl {
                url: "https://example.com/i.png".into(),
                detail: "auto".into(),
            }),
            IMAGE_TOKENS_FLAT
        );
        // 30s of audio at 16,000 B/s = 480,000 bytes = 640,000 b64 chars.
        assert_eq!(
            content_block_media_tokens(&ContentBlock::AudioBase64 {
                data: "A".repeat(640_000),
                media_type: "audio/mp3".into(),
                sample_rate: None,
            }),
            30 * AUDIO_TOKENS_PER_SECOND
        );
    }

    #[test]
    fn request_tokens_sum_images_media_and_full_history() {
        // Single-turn input: a URL video capped at 4 frames.
        let images = vec![video_url(None, Some(4))];
        let messages = vec![
            Message::User {
                content: "plain text turn".into(),
            },
            Message::UserMultimodal {
                content: vec![
                    ContentBlock::Text {
                        text: "describe this".into(),
                    },
                    ContentBlock::ImageUrl {
                        url: "https://example.com/i.png".into(),
                        detail: "auto".into(),
                    },
                ],
            },
        ];
        // History TEXT counts too (Q4): the plain user turn at its
        // serialized-JSON chars/4, the multimodal turn's text at
        // chars/4, media at calibrated rates.
        let plain_turn_tokens = serde_json::to_string(&messages[0])
            .unwrap()
            .len()
            .div_ceil(4);
        let multimodal_text_tokens = "describe this".len().div_ceil(4);
        assert_eq!(
            request_media_and_history_tokens(Some(&images), Some(&messages)),
            4 * VIDEO_TOKENS_PER_FRAME
                + plain_turn_tokens
                + multimodal_text_tokens
                + IMAGE_TOKENS_FLAT
        );
        // Both inputs absent: nothing is added.
        assert_eq!(request_media_and_history_tokens(None, None), 0);
    }

    #[test]
    fn history_tokens_count_text_and_calibrated_media_not_base64_payload() {
        // A base64 video payload must NOT be counted at chars/4 (that
        // would be ~625k tokens for 10s of video); it must use the
        // calibrated per-second rate.
        let messages = vec![Message::UserMultimodal {
            content: vec![
                ContentBlock::Text {
                    text: "abcdefgh".into(), // 8 chars → 2 tokens
                },
                ContentBlock::VideoBase64 {
                    data: "A".repeat(2_500_000), // 10 seconds
                    media_type: "video/mp4".into(),
                    fps: None,
                    max_frames: None,
                },
            ],
        }];
        assert_eq!(
            messages_history_tokens(&messages),
            2 + 10 * VIDEO_TOKENS_PER_SECOND
        );
    }
}