shinkai-translator 0.1.3

CLI tool for translating video subtitles with LLMs through OpenAI-compatible APIs, with native PGS OCR
use crate::domain::{SubtitleCue, TranslationOptions};
use crate::providers::{ProviderCapabilities, TranslationBatchItem};

/// Splits the translatable cues in `cues` into ordered batches of
/// [`TranslationBatchItem`]s.
///
/// Cues for which `is_translatable()` returns `false` are skipped. The
/// remaining cues keep their input order and are packed greedily; a batch is
/// closed before the next cue is added when either:
///
/// * it already holds the item limit — the smaller of
///   `options.max_batch_items` and `capabilities.max_batch_items` (when the
///   provider declares one), never less than 1; or
/// * it is non-empty and adding the cue's `text_len()` would push the running
///   total above `options.max_batch_characters` (never less than 1).
///
/// A cue whose own length exceeds the character limit is still emitted; it
/// ends up in a batch by itself. The result is empty when no cue is
/// translatable.
pub fn chunk_cues(
    cues: &[SubtitleCue],
    options: &TranslationOptions,
    capabilities: &ProviderCapabilities,
) -> Vec<Vec<TranslationBatchItem>> {
    // A provider without a declared limit leaves the user's limit untouched.
    let requested_items = match capabilities.max_batch_items {
        Some(provider_limit) => options.max_batch_items.min(provider_limit),
        None => options.max_batch_items,
    };
    // Clamp both limits to 1 so a zero setting still makes progress.
    let item_cap = requested_items.max(1);
    let char_cap = options.max_batch_characters.max(1);

    let mut finished: Vec<Vec<TranslationBatchItem>> = Vec::new();
    let mut pending: Vec<TranslationBatchItem> = Vec::new();
    let mut pending_chars = 0usize;

    for cue in cues {
        if !cue.is_translatable() {
            continue;
        }

        let cue_chars = cue.text_len();
        let batch_is_full = pending.len() >= item_cap;
        // The emptiness check lets an oversize cue start a batch on its own.
        let would_overflow = !pending.is_empty() && pending_chars + cue_chars > char_cap;

        if batch_is_full || would_overflow {
            finished.push(std::mem::take(&mut pending));
            pending_chars = 0;
        }

        pending.push(TranslationBatchItem {
            id: cue.id().to_owned(),
            text: cue.text().to_owned(),
        });
        pending_chars += cue_chars;
    }

    if !pending.is_empty() {
        finished.push(pending);
    }

    finished
}

#[cfg(test)]
mod tests {
    use std::collections::BTreeMap;

    use crate::domain::{CueKind, SubtitleCue, TranslationOptions};
    use crate::providers::ProviderCapabilities;

    use super::chunk_cues;

    /// Builds a plain cue with the given id, the text "hello", and the
    /// timestamps `00:00:00,000` to `00:00:01,000`.
    fn hello_cue(id: String) -> SubtitleCue {
        SubtitleCue::new(
            id,
            None,
            "00:00:00,000",
            "00:00:01,000",
            None,
            "hello",
            BTreeMap::new(),
        )
    }

    /// Three cues with an item limit of two must split into batches of 2 and 1.
    #[test]
    fn respects_batch_item_limit() {
        let cues: Vec<SubtitleCue> = (0..3)
            .map(|index| hello_cue(format!("cue-{index}")))
            .collect();
        let options = TranslationOptions {
            max_batch_items: 2,
            ..TranslationOptions::default()
        };
        let capabilities = ProviderCapabilities::default();

        let batches = chunk_cues(&cues, &options, &capabilities);

        let batch_sizes: Vec<usize> = batches.iter().map(Vec::len).collect();
        assert_eq!(batch_sizes, [2, 1]);
    }

    /// A karaoke cue must be left out of the batches; only the plain cue
    /// is emitted.
    #[test]
    fn skips_preserved_cues() {
        let karaoke_cue = SubtitleCue::new_with_kind(
            "cue-2",
            None,
            "00:00:01,000",
            "00:00:02,000",
            None,
            "{\\k20}ka",
            BTreeMap::new(),
            CueKind::Karaoke,
        );
        let cues = vec![hello_cue("cue-1".to_owned()), karaoke_cue];
        let options = TranslationOptions::default();
        let capabilities = ProviderCapabilities::default();

        let batches = chunk_cues(&cues, &options, &capabilities);

        assert_eq!(batches.len(), 1);
        let only_batch = &batches[0];
        assert_eq!(only_batch.len(), 1);
        assert_eq!(only_batch[0].id, "cue-1");
    }
}