// oximedia_caption_gen/multilang.rs

//! Multi-language subtitle support with ISO 639-1 shaped language codes.
//!
//! This module provides:
//!
//! - [`LanguageCode`] — Two-letter language code newtype in ISO 639-1 form
//!   (shape-validated: exactly two lowercase ASCII letters).
//! - [`CaptionEntry`] — A single timed caption entry with text.
//! - [`MultiLangCaption`] — Container for caption tracks in multiple languages.
//! - [`MultiLangCaptionBuilder`] — Builder for constructing `MultiLangCaption`.
//!
//! ## SRT output
//!
//! [`MultiLangCaption::to_srt`] formats a language track as standard SubRip
//! (`.srt`) text, with 1-based sequence numbers and `HH:MM:SS,mmm` timestamps.
//!
//! ## Timing merge
//!
//! [`MultiLangCaption::merge_timing`] aligns a secondary-language track to a
//! primary track by matching overlapping timestamps, returning a merged
//! [`Vec<CaptionEntry>`] whose timing follows the primary track and whose text
//! is taken from the secondary track.

use std::collections::HashMap;

use crate::CaptionGenError;

// ─── Language code ────────────────────────────────────────────────────────────

/// Two-letter language code newtype in ISO 639-1 form (e.g. `"en"`).
///
/// The wrapped string is private, so code outside this module can only obtain
/// a value through the validating constructors [`LanguageCode::new`] and
/// [`LanguageCode::try_from`] (or by cloning an existing value).
///
/// Validation checks the *shape* of the code — exactly two lowercase ASCII
/// letters — and does not check that the code is actually assigned in the
/// ISO 639-1 registry.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct LanguageCode(String);

35impl LanguageCode {
36    /// Create a validated ISO 639-1 language code.
37    ///
38    /// The code must be exactly two ASCII lowercase letters (`a-z`).
39    ///
40    /// # Errors
41    ///
42    /// Returns [`CaptionGenError::InvalidParameter`] if the code is not exactly
43    /// two lowercase ASCII letters.
44    pub fn new(code: &str) -> Result<Self, CaptionGenError> {
45        let code = code.trim();
46        if code.len() != 2 || !code.chars().all(|c| c.is_ascii_lowercase()) {
47            return Err(CaptionGenError::InvalidParameter(format!(
48                "ISO 639-1 language code must be exactly two lowercase ASCII letters, got {:?}",
49                code
50            )));
51        }
52        Ok(Self(code.to_string()))
53    }
54
55    /// Return the inner code string slice.
56    pub fn as_str(&self) -> &str {
57        &self.0
58    }
59}
60
61impl std::fmt::Display for LanguageCode {
62    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
63        f.write_str(&self.0)
64    }
65}
66
67impl TryFrom<&str> for LanguageCode {
68    type Error = CaptionGenError;
69
70    fn try_from(value: &str) -> Result<Self, Self::Error> {
71        Self::new(value)
72    }
73}
74
75impl TryFrom<String> for LanguageCode {
76    type Error = CaptionGenError;
77
78    fn try_from(value: String) -> Result<Self, Self::Error> {
79        Self::new(&value)
80    }
81}
82
// ─── Caption entry ────────────────────────────────────────────────────────────

/// A single timed caption entry.
#[derive(Debug, Clone, PartialEq)]
pub struct CaptionEntry {
    /// 1-based sequence number.
    pub id: u32,
    /// Display start time in milliseconds.
    pub start_ms: u64,
    /// Display end time in milliseconds.
    pub end_ms: u64,
    /// Caption text (may contain newlines for multi-line captions).
    pub text: String,
}

impl CaptionEntry {
    /// Create a new caption entry.
    ///
    /// The values are stored as given; no check is made that `end_ms` is later
    /// than `start_ms` or that `id` is non-zero.
    pub fn new(id: u32, start_ms: u64, end_ms: u64, text: impl Into<String>) -> Self {
        Self {
            id,
            start_ms,
            end_ms,
            text: text.into(),
        }
    }

    /// Duration of this entry in milliseconds.
    ///
    /// Saturates to `0` when `end_ms` is not later than `start_ms`, so an
    /// inverted entry never underflows.
    pub fn duration_ms(&self) -> u64 {
        self.end_ms.saturating_sub(self.start_ms)
    }
}

115// ─── Multi-language caption container ────────────────────────────────────────
116
117/// Container for subtitle tracks in multiple languages.
118///
119/// Each language track is a `Vec<CaptionEntry>` keyed by its [`LanguageCode`].
120#[derive(Debug, Clone)]
121pub struct MultiLangCaption {
122    pub entries: HashMap<LanguageCode, Vec<CaptionEntry>>,
123}
124
125impl MultiLangCaption {
126    /// Returns the set of language codes present in this container.
127    pub fn languages(&self) -> impl Iterator<Item = &LanguageCode> {
128        self.entries.keys()
129    }
130
131    /// Returns the entries for a given language, or `None` if absent.
132    pub fn track(&self, lang: &LanguageCode) -> Option<&[CaptionEntry]> {
133        self.entries.get(lang).map(|v| v.as_slice())
134    }
135
136    /// Format a language track as SRT (SubRip) text.
137    ///
138    /// Returns an error if the language is not present in this container.
139    pub fn to_srt(&self, lang: &LanguageCode) -> Result<String, CaptionGenError> {
140        let track = self.entries.get(lang).ok_or_else(|| {
141            CaptionGenError::InvalidParameter(format!(
142                "language {:?} not found in MultiLangCaption",
143                lang.as_str()
144            ))
145        })?;
146
147        if track.is_empty() {
148            return Ok(String::new());
149        }
150
151        let mut out = String::with_capacity(track.len() * 80);
152        for (idx, entry) in track.iter().enumerate() {
153            let seq = idx as u32 + 1;
154            out.push_str(&format!(
155                "{}\n{} --> {}\n{}\n\n",
156                seq,
157                ms_to_srt_timestamp(entry.start_ms),
158                ms_to_srt_timestamp(entry.end_ms),
159                entry.text
160            ));
161        }
162        Ok(out)
163    }
164
165    /// Merge timing from a primary language track onto a secondary track.
166    ///
167    /// For each entry in the primary track, finds the best-overlapping entry
168    /// in the secondary track and adopts the primary's timestamps.  Entries
169    /// in the secondary track with no overlap are omitted.
170    ///
171    /// Returns an error if either language is not present.
172    pub fn merge_timing(
173        &self,
174        primary: &LanguageCode,
175        secondary: &LanguageCode,
176    ) -> Result<Vec<CaptionEntry>, CaptionGenError> {
177        let primary_track = self.entries.get(primary).ok_or_else(|| {
178            CaptionGenError::InvalidParameter(format!(
179                "primary language {:?} not found",
180                primary.as_str()
181            ))
182        })?;
183        let secondary_track = self.entries.get(secondary).ok_or_else(|| {
184            CaptionGenError::InvalidParameter(format!(
185                "secondary language {:?} not found",
186                secondary.as_str()
187            ))
188        })?;
189
190        let mut merged: Vec<CaptionEntry> = Vec::with_capacity(primary_track.len());
191
192        for (idx, pentry) in primary_track.iter().enumerate() {
193            // Find the secondary entry with the greatest temporal overlap.
194            let best = secondary_track
195                .iter()
196                .filter_map(|sentry| {
197                    let overlap_start = pentry.start_ms.max(sentry.start_ms);
198                    let overlap_end = pentry.end_ms.min(sentry.end_ms);
199                    if overlap_end > overlap_start {
200                        Some((sentry, overlap_end - overlap_start))
201                    } else {
202                        None
203                    }
204                })
205                .max_by_key(|(_, overlap)| *overlap)
206                .map(|(sentry, _)| sentry);
207
208            if let Some(sentry) = best {
209                merged.push(CaptionEntry {
210                    id: idx as u32 + 1,
211                    start_ms: pentry.start_ms,
212                    end_ms: pentry.end_ms,
213                    text: sentry.text.clone(),
214                });
215            }
216        }
217
218        Ok(merged)
219    }
220}
221
222// ─── Builder ──────────────────────────────────────────────────────────────────
223
224/// Builder for [`MultiLangCaption`].
225///
226/// ```rust,no_run
227/// # use oximedia_caption_gen::multilang::{MultiLangCaptionBuilder, CaptionEntry, LanguageCode};
228/// let lang_en = LanguageCode::new("en").unwrap();
229/// let entries = vec![CaptionEntry::new(1, 0, 2000, "Hello")];
230/// let caption = MultiLangCaptionBuilder::new()
231///     .add_track(lang_en, entries)
232///     .build();
233/// ```
234#[derive(Debug, Default)]
235pub struct MultiLangCaptionBuilder {
236    entries: HashMap<LanguageCode, Vec<CaptionEntry>>,
237}
238
239impl MultiLangCaptionBuilder {
240    /// Create a new empty builder.
241    pub fn new() -> Self {
242        Self::default()
243    }
244
245    /// Add a caption track for the given language.
246    ///
247    /// If a track already exists for this language, it is replaced.
248    /// Returns `self` by value for method chaining with `.build()`.
249    pub fn add_track(mut self, lang: LanguageCode, entries: Vec<CaptionEntry>) -> Self {
250        self.entries.insert(lang, entries);
251        self
252    }
253
254    /// Consume the builder and produce a [`MultiLangCaption`].
255    pub fn build(self) -> MultiLangCaption {
256        MultiLangCaption {
257            entries: self.entries,
258        }
259    }
260}
261
// ─── Helpers ──────────────────────────────────────────────────────────────────

/// Format milliseconds as SRT timestamp `HH:MM:SS,mmm`.
///
/// Hours are zero-padded to two digits and simply grow wider for inputs of
/// 100 hours or more; they are never wrapped or truncated.
fn ms_to_srt_timestamp(ms: u64) -> String {
    const MS_PER_SEC: u64 = 1_000;
    const MS_PER_MIN: u64 = 60 * MS_PER_SEC;
    const MS_PER_HOUR: u64 = 60 * MS_PER_MIN;

    let hours = ms / MS_PER_HOUR;
    let mins = (ms % MS_PER_HOUR) / MS_PER_MIN;
    let secs = (ms % MS_PER_MIN) / MS_PER_SEC;
    let millis = ms % MS_PER_SEC;
    format!("{:02}:{:02}:{:02},{:03}", hours, mins, secs, millis)
}

// ─── Tests ────────────────────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    use super::*;

    /// Build a language code that the test expects to be valid.
    fn lang(code: &str) -> LanguageCode {
        LanguageCode::new(code).expect("new should succeed")
    }

    // ── LanguageCode ──────────────────────────────────────────────────────────

    #[test]
    fn lang_code_valid_en() {
        let code = LanguageCode::new("en").expect("en should be valid");
        assert_eq!(code.as_str(), "en");
    }

    #[test]
    fn lang_code_valid_ja() {
        let code = LanguageCode::new("ja").expect("ja should be valid");
        assert_eq!(code.as_str(), "ja");
    }

    #[test]
    fn lang_code_valid_zh() {
        assert!(LanguageCode::new("zh").is_ok());
    }

    #[test]
    fn lang_code_trims_surrounding_whitespace() {
        let code = LanguageCode::new(" en\n").expect("padded en should be valid");
        assert_eq!(code.as_str(), "en");
    }

    #[test]
    fn lang_code_invalid_empty() {
        assert!(LanguageCode::new("").is_err());
    }

    #[test]
    fn lang_code_invalid_one_letter() {
        assert!(LanguageCode::new("e").is_err());
    }

    #[test]
    fn lang_code_invalid_three_letters() {
        assert!(LanguageCode::new("eng").is_err());
    }

    #[test]
    fn lang_code_invalid_uppercase() {
        assert!(LanguageCode::new("EN").is_err());
    }

    #[test]
    fn lang_code_invalid_digit() {
        assert!(LanguageCode::new("e1").is_err());
    }

    #[test]
    fn lang_code_invalid_non_ascii() {
        // U+00E9 is a single letter that occupies two bytes in UTF-8, so it
        // passes a byte-length check but must still be rejected.
        assert!(LanguageCode::new("\u{e9}").is_err());
    }

    #[test]
    fn lang_code_try_from_str() {
        let code: Result<LanguageCode, _> = "fr".try_into();
        assert!(code.is_ok());
    }

    #[test]
    fn lang_code_try_from_string() {
        let code = LanguageCode::try_from(String::from("pt")).expect("pt should be valid");
        assert_eq!(code.as_str(), "pt");
        assert!(LanguageCode::try_from(String::from("PT")).is_err());
    }

    #[test]
    fn lang_code_display() {
        let code = lang("de");
        assert_eq!(code.to_string(), "de");
    }

    // ── CaptionEntry ──────────────────────────────────────────────────────────

    #[test]
    fn caption_entry_duration() {
        let entry = CaptionEntry::new(1, 1000, 4000, "Hello");
        assert_eq!(entry.duration_ms(), 3000);
    }

    #[test]
    fn caption_entry_duration_zero_on_equal_timestamps() {
        let entry = CaptionEntry::new(1, 2000, 2000, "X");
        assert_eq!(entry.duration_ms(), 0);
    }

    #[test]
    fn caption_entry_duration_zero_on_inverted_timestamps() {
        let entry = CaptionEntry::new(1, 5000, 2000, "X");
        assert_eq!(entry.duration_ms(), 0);
    }

    // ── Builder ───────────────────────────────────────────────────────────────

    #[test]
    fn builder_creates_empty_multilang() {
        let caption = MultiLangCaptionBuilder::new().build();
        assert!(caption.entries.is_empty());
    }

    #[test]
    fn builder_add_track() {
        let en = lang("en");
        let entries = vec![CaptionEntry::new(1, 0, 2000, "Hello")];
        let caption = MultiLangCaptionBuilder::new()
            .add_track(en.clone(), entries)
            .build();
        assert!(caption.track(&en).is_some());
        assert_eq!(caption.track(&en).expect("track should succeed").len(), 1);
    }

    #[test]
    fn builder_add_two_tracks() {
        let en = lang("en");
        let es = lang("es");
        let en_entries = vec![CaptionEntry::new(1, 0, 2000, "Hello")];
        let es_entries = vec![CaptionEntry::new(1, 0, 2000, "Hola")];
        let caption = MultiLangCaptionBuilder::new()
            .add_track(en.clone(), en_entries)
            .add_track(es.clone(), es_entries)
            .build();
        assert!(caption.track(&en).is_some());
        assert!(caption.track(&es).is_some());
        assert_eq!(caption.languages().count(), 2);
    }

    #[test]
    fn builder_add_track_replaces_existing() {
        let en = lang("en");
        let first = vec![CaptionEntry::new(1, 0, 1000, "First")];
        let second = vec![CaptionEntry::new(1, 0, 1000, "Second")];
        let caption = MultiLangCaptionBuilder::new()
            .add_track(en.clone(), first)
            .add_track(en.clone(), second)
            .build();
        assert_eq!(
            caption.track(&en).expect("track should succeed")[0].text,
            "Second"
        );
    }

    // ── to_srt ────────────────────────────────────────────────────────────────

    #[test]
    fn to_srt_basic() {
        let en = lang("en");
        let entries = vec![
            CaptionEntry::new(1, 0, 2000, "Hello"),
            CaptionEntry::new(2, 3000, 5000, "World"),
        ];
        let caption = MultiLangCaptionBuilder::new()
            .add_track(en.clone(), entries)
            .build();
        let srt = caption.to_srt(&en).expect("to srt should succeed");
        // Compare the whole document: substring checks such as `contains("1\n")`
        // would also pass on badly ordered or malformed output.
        assert_eq!(
            srt,
            "1\n00:00:00,000 --> 00:00:02,000\nHello\n\n\
             2\n00:00:03,000 --> 00:00:05,000\nWorld\n\n"
        );
    }

    #[test]
    fn to_srt_numbers_by_position() {
        let en = lang("en");
        let entries = vec![
            CaptionEntry::new(10, 0, 1000, "A"),
            CaptionEntry::new(20, 1000, 2000, "B"),
        ];
        let caption = MultiLangCaptionBuilder::new()
            .add_track(en.clone(), entries)
            .build();
        let srt = caption.to_srt(&en).expect("to srt should succeed");
        assert!(srt.starts_with("1\n"));
        assert!(srt.contains("\n\n2\n"));
        assert!(!srt.contains("10\n"));
        assert!(!srt.contains("20\n"));
    }

    #[test]
    fn to_srt_empty_track_returns_empty_string() {
        let en = lang("en");
        let caption = MultiLangCaptionBuilder::new()
            .add_track(en.clone(), vec![])
            .build();
        let srt = caption.to_srt(&en).expect("to srt should succeed");
        assert!(srt.is_empty());
    }

    #[test]
    fn to_srt_missing_language_returns_error() {
        let en = lang("en");
        let fr = lang("fr");
        let caption = MultiLangCaptionBuilder::new().add_track(en, vec![]).build();
        assert!(caption.to_srt(&fr).is_err());
    }

    #[test]
    fn to_srt_timestamp_format() {
        // Test timestamp formatting: 1 hour, 2 min, 3 sec, 456 ms
        let ms = 3_600_000 + 2 * 60_000 + 3 * 1_000 + 456;
        let ts = ms_to_srt_timestamp(ms);
        assert_eq!(ts, "01:02:03,456");
    }

    // ── merge_timing ──────────────────────────────────────────────────────────

    #[test]
    fn merge_timing_basic_overlap() {
        let en = lang("en");
        let ja = lang("ja");
        let en_entries = vec![CaptionEntry::new(1, 0, 3000, "Hello")];
        let ja_entries = vec![CaptionEntry::new(1, 500, 3500, "こんにちは")];
        let caption = MultiLangCaptionBuilder::new()
            .add_track(en.clone(), en_entries)
            .add_track(ja.clone(), ja_entries)
            .build();
        let merged = caption
            .merge_timing(&en, &ja)
            .expect("merge timing should succeed");
        assert_eq!(merged.len(), 1);
        assert_eq!(merged[0].start_ms, 0); // primary timing
        assert_eq!(merged[0].end_ms, 3000); // primary timing
        assert_eq!(merged[0].text, "こんにちは"); // secondary text
    }

    #[test]
    fn merge_timing_no_overlap_excluded() {
        let en = lang("en");
        let ja = lang("ja");
        let en_entries = vec![CaptionEntry::new(1, 0, 1000, "Hello")];
        let ja_entries = vec![CaptionEntry::new(1, 5000, 7000, "こんにちは")]; // far away
        let caption = MultiLangCaptionBuilder::new()
            .add_track(en.clone(), en_entries)
            .add_track(ja.clone(), ja_entries)
            .build();
        let merged = caption
            .merge_timing(&en, &ja)
            .expect("merge timing should succeed");
        assert!(merged.is_empty());
    }

    #[test]
    fn merge_timing_picks_best_overlap() {
        let en = lang("en");
        let es = lang("es");
        let en_entries = vec![CaptionEntry::new(1, 0, 5000, "Long sentence")];
        let es_entries = vec![
            CaptionEntry::new(1, 0, 500, "Short"),   // 500ms overlap
            CaptionEntry::new(2, 0, 4000, "Better"), // 4000ms overlap — wins
        ];
        let caption = MultiLangCaptionBuilder::new()
            .add_track(en.clone(), en_entries)
            .add_track(es.clone(), es_entries)
            .build();
        let merged = caption
            .merge_timing(&en, &es)
            .expect("merge timing should succeed");
        assert_eq!(merged.len(), 1);
        assert_eq!(merged[0].text, "Better");
    }

    #[test]
    fn merge_timing_missing_primary_returns_error() {
        let en = lang("en");
        let fr = lang("fr");
        let es = lang("es");
        let caption = MultiLangCaptionBuilder::new().add_track(en, vec![]).build();
        assert!(caption.merge_timing(&fr, &es).is_err());
    }

    #[test]
    fn merge_timing_missing_secondary_returns_error() {
        let en = lang("en");
        let fr = lang("fr");
        let caption = MultiLangCaptionBuilder::new()
            .add_track(en.clone(), vec![CaptionEntry::new(1, 0, 1000, "X")])
            .build();
        assert!(caption.merge_timing(&en, &fr).is_err());
    }

    #[test]
    fn merge_timing_ids_renumbered() {
        let en = lang("en");
        let de = lang("de");
        let en_entries = vec![
            CaptionEntry::new(1, 0, 1000, "Hello"),
            CaptionEntry::new(2, 2000, 3000, "World"),
        ];
        let de_entries = vec![
            CaptionEntry::new(5, 200, 1200, "Hallo"),
            CaptionEntry::new(6, 2100, 3100, "Welt"),
        ];
        let caption = MultiLangCaptionBuilder::new()
            .add_track(en.clone(), en_entries)
            .add_track(de.clone(), de_entries)
            .build();
        let merged = caption
            .merge_timing(&en, &de)
            .expect("merge timing should succeed");
        assert_eq!(merged.len(), 2);
        assert_eq!(merged[0].id, 1);
        assert_eq!(merged[1].id, 2);
    }
}
545}