use derive_more::IsVariant;
use mediaframe::lang::Language;
use mediatime::TimeRange;
use smol_str::SmolStr;
use crate::domain::{
vo::{LocalizedText, VoiceFingerprint},
Uuid7,
};
/// Returns `true` when `score` is a finite value inside the closed interval `[0, 1]`.
///
/// NaN and both infinities are rejected; the endpoints `0.0` and `1.0` are accepted.
#[inline]
const fn is_valid_score(score: f32) -> bool {
    // Guard first: NaN and +/- infinity are never acceptable.
    if !score.is_finite() {
        return false;
    }
    0.0 <= score && score <= 1.0
}
/// A single word: its text, the time range it covers, a score, and an optional language.
///
/// All fields are private. The constructors and score setters on `Word` only accept a
/// `score` that is finite and lies within `[0, 1]`.
#[derive(Debug, Clone, PartialEq)]
pub struct Word {
/// Text of the word.
text: SmolStr,
/// Time range covered by the word.
span: TimeRange,
/// Score attached to the word; always finite and within `[0, 1]` once constructed.
score: f32,
/// Language of this individual word, when known.
language: Option<Language>,
}
impl Word {
    /// Builds a word without a per-word language.
    ///
    /// Shorthand for [`Word::try_from_parts`] with `language` set to `None`.
    ///
    /// # Errors
    ///
    /// Returns [`WordError::ScoreOutOfRange`] when `score` is not finite or lies outside
    /// `[0, 1]`.
    #[inline]
    pub fn try_new(text: impl Into<SmolStr>, span: TimeRange, score: f32) -> Result<Self, WordError> {
        let language = None;
        Self::try_from_parts(text, span, score, language)
    }

    /// Builds a word from all of its parts.
    ///
    /// Only `score` is validated here; `span` is stored as given.
    ///
    /// # Errors
    ///
    /// Returns [`WordError::ScoreOutOfRange`] when `score` is not finite or lies outside
    /// `[0, 1]`.
    #[inline]
    pub fn try_from_parts(
        text: impl Into<SmolStr>,
        span: TimeRange,
        score: f32,
        language: Option<Language>,
    ) -> Result<Self, WordError> {
        if is_valid_score(score) {
            // Convert the text only once the score has been accepted.
            let text = text.into();
            Ok(Self {
                text,
                span,
                score,
                language,
            })
        } else {
            Err(WordError::ScoreOutOfRange)
        }
    }

    /// Returns the word's text.
    #[inline(always)]
    pub fn text(&self) -> &str {
        let Self { text, .. } = self;
        text.as_str()
    }

    /// Borrows the time range covered by the word.
    #[inline(always)]
    pub const fn span_ref(&self) -> &TimeRange {
        &self.span
    }

    /// Returns the word's score (finite, within `[0, 1]`).
    #[inline(always)]
    pub const fn score(&self) -> f32 {
        self.score
    }

    /// Returns the word's language, if one was set.
    #[inline(always)]
    pub const fn language(&self) -> Option<Language> {
        self.language
    }

    /// Consumes the word and returns it with its text replaced by `v`.
    #[inline(always)]
    #[must_use]
    pub fn with_text(self, v: impl Into<SmolStr>) -> Self {
        Self {
            text: v.into(),
            ..self
        }
    }

    /// Consumes the word and returns it with its span replaced by `span`.
    ///
    /// The new span is stored without validation.
    #[inline(always)]
    #[must_use]
    pub fn with_span(self, span: TimeRange) -> Self {
        Self { span, ..self }
    }

    /// Consumes the word and returns it with a new, validated score.
    ///
    /// # Errors
    ///
    /// Returns [`WordError::ScoreOutOfRange`] when `score` is not finite or lies outside
    /// `[0, 1]`.
    #[inline]
    pub fn try_with_score(mut self, score: f32) -> Result<Self, WordError> {
        // Same validation and assignment as the in-place setter.
        self.try_set_score(score)?;
        Ok(self)
    }

    /// Replaces the score in place after validating it.
    ///
    /// On failure the stored score is left untouched.
    ///
    /// # Errors
    ///
    /// Returns [`WordError::ScoreOutOfRange`] when `score` is not finite or lies outside
    /// `[0, 1]`.
    #[inline]
    pub const fn try_set_score(&mut self, score: f32) -> Result<&mut Self, WordError> {
        if is_valid_score(score) {
            self.score = score;
            Ok(self)
        } else {
            Err(WordError::ScoreOutOfRange)
        }
    }

    /// Consumes the word and returns it with its language set to `v` (`None` clears it).
    #[inline(always)]
    #[must_use]
    pub const fn with_language(mut self, v: Option<Language>) -> Self {
        self.language = v;
        self
    }
}
/// Errors returned by the fallible `Word` constructors and score setters.
#[derive(Debug, Clone, Copy, PartialEq, Eq, IsVariant, thiserror::Error)]
#[non_exhaustive]
pub enum WordError {
/// The supplied score was NaN, infinite, or outside `[0, 1]`.
#[error("Word score must be finite and within [0, 1]")]
ScoreOutOfRange,
}
/// A segment of an audio track, with optional text, word list, speaker, and quality signals.
///
/// Generic over the identifier type `Id`, which defaults to `Uuid7`. All fields are private;
/// the `Uuid7` constructor validates the identifiers and the span, and the fallible
/// builders/setters validate `words` and `no_speech_prob`.
#[derive(Debug, Clone, PartialEq)]
pub struct AudioSegment<Id = Uuid7> {
/// Identifier of this segment.
id: Id,
/// Identifier of the owning audio track (foreign key to `AudioTrack`).
audio_track_id: Id,
/// Index of the segment. NOTE(review): presumably its ordinal within the track; confirm.
index: u32,
/// Time range covered by the segment; the constructor rejects `start > end`.
span: TimeRange,
/// Identifier of the speaker, when one has been assigned.
speaker_id: Option<Id>,
/// Text of the segment.
text: LocalizedText,
/// Language of the segment, when known.
language: Option<Language>,
/// Words of the segment; each accepted word's span lies within `span` and is not inverted.
words: std::vec::Vec<Word>,
/// Optional no-speech probability; when present it is finite and within `[0, 1]`.
no_speech_prob: Option<f32>,
/// Optional average log-probability; stored without validation.
avg_logprob: Option<f32>,
/// Optional temperature; stored without validation.
temperature: Option<f32>,
/// Optional voice fingerprint attached to the segment.
voice_fingerprint: Option<VoiceFingerprint<Id>>,
}
impl AudioSegment<Uuid7> {
    /// Creates a segment with the required fields; every optional field starts empty.
    ///
    /// A zero-length span (`start == end`) is accepted.
    ///
    /// # Errors
    ///
    /// Checked in this order, returning the first violation:
    /// - [`AudioSegmentError::NilId`] when `id` is the nil UUID;
    /// - [`AudioSegmentError::NilAudioTrackId`] when `audio_track_id` is the nil UUID;
    /// - [`AudioSegmentError::InvertedSpan`] when `span.start()` compares semantically
    ///   greater than `span.end()`.
    pub fn try_new(
        id: Uuid7,
        audio_track_id: Uuid7,
        index: u32,
        span: TimeRange,
    ) -> Result<Self, AudioSegmentError> {
        // One ordered chain so the first broken invariant is the one reported; the span
        // comparison only runs once both identifiers have been accepted.
        let violation = if id.is_nil() {
            Some(AudioSegmentError::NilId)
        } else if audio_track_id.is_nil() {
            Some(AudioSegmentError::NilAudioTrackId)
        } else if span.start().cmp_semantic(&span.end()) == core::cmp::Ordering::Greater {
            Some(AudioSegmentError::InvertedSpan)
        } else {
            None
        };
        if let Some(err) = violation {
            return Err(err);
        }

        Ok(Self {
            // Required, validated inputs.
            id,
            audio_track_id,
            index,
            span,
            // Everything optional starts out empty.
            speaker_id: None,
            text: LocalizedText::new(),
            language: None,
            words: std::vec::Vec::new(),
            no_speech_prob: None,
            avg_logprob: None,
            temperature: None,
            voice_fingerprint: None,
        })
    }

    /// Consumes the segment and returns it with its speaker set to `v` (`None` clears it).
    #[inline(always)]
    #[must_use]
    pub fn with_speaker_id(self, v: Option<Uuid7>) -> Self {
        Self {
            speaker_id: v,
            ..self
        }
    }

    /// Sets the speaker in place (`None` clears it) and returns `self` for chaining.
    #[inline(always)]
    pub fn set_speaker_id(&mut self, v: Option<Uuid7>) -> &mut Self {
        self.speaker_id = v;
        self
    }
}
impl<Id> AudioSegment<Id> {
    // ------------------------------------------------------------------
    // Read accessors
    // ------------------------------------------------------------------

    /// Borrows the segment's identifier.
    #[inline(always)]
    pub const fn id_ref(&self) -> &Id {
        &self.id
    }

    /// Borrows the identifier of the owning audio track.
    #[inline(always)]
    pub const fn audio_track_id_ref(&self) -> &Id {
        &self.audio_track_id
    }

    /// Returns the segment's index.
    #[inline(always)]
    pub const fn index(&self) -> u32 {
        self.index
    }

    /// Borrows the time range covered by the segment.
    #[inline(always)]
    pub const fn span_ref(&self) -> &TimeRange {
        &self.span
    }

    /// Borrows the speaker identifier, if one is set.
    #[inline(always)]
    pub const fn speaker_id_ref(&self) -> Option<&Id> {
        self.speaker_id.as_ref()
    }

    /// Borrows the segment's text.
    #[inline(always)]
    pub const fn text_ref(&self) -> &LocalizedText {
        &self.text
    }

    /// Returns the segment's language, if one is set.
    #[inline(always)]
    pub const fn language(&self) -> Option<Language> {
        self.language
    }

    /// Returns the segment's words as a slice.
    #[inline(always)]
    pub fn words_slice(&self) -> &[Word] {
        self.words.as_slice()
    }

    /// Returns the no-speech probability, if one is set.
    #[inline(always)]
    pub const fn no_speech_prob(&self) -> Option<f32> {
        self.no_speech_prob
    }

    /// Returns the average log-probability, if one is set.
    #[inline(always)]
    pub const fn avg_logprob(&self) -> Option<f32> {
        self.avg_logprob
    }

    /// Returns the temperature, if one is set.
    #[inline(always)]
    pub const fn temperature(&self) -> Option<f32> {
        self.temperature
    }

    /// Borrows the voice fingerprint, if one is attached.
    #[inline(always)]
    pub const fn voice_fingerprint_ref(&self) -> Option<&VoiceFingerprint<Id>> {
        self.voice_fingerprint.as_ref()
    }

    // ------------------------------------------------------------------
    // Consuming builders
    // ------------------------------------------------------------------

    /// Consumes the segment and returns it with its text replaced by `v`.
    #[inline(always)]
    #[must_use]
    pub fn with_text(self, v: LocalizedText) -> Self {
        Self { text: v, ..self }
    }

    /// Consumes the segment and returns it with its language set to `v` (`None` clears it).
    #[inline(always)]
    #[must_use]
    pub const fn with_language(mut self, v: Option<Language>) -> Self {
        self.language = v;
        self
    }

    /// Consumes the segment and returns it with its words replaced by `v`.
    ///
    /// # Errors
    ///
    /// Returns [`AudioSegmentError::InvertedWordSpan`] when a word's start compares greater
    /// than its end, or [`AudioSegmentError::WordSpanOutOfSegment`] when a word's span is
    /// not contained in the segment span. The first offending word decides the error.
    #[inline]
    pub fn try_with_words(
        mut self,
        v: impl Into<std::vec::Vec<Word>>,
    ) -> Result<Self, AudioSegmentError> {
        // Same validation and assignment as the in-place setter.
        self.try_set_words(v)?;
        Ok(self)
    }

    /// Consumes the segment and returns it with its no-speech probability set to `v`.
    ///
    /// `None` is always accepted and clears the value.
    ///
    /// # Errors
    ///
    /// Returns [`AudioSegmentError::NoSpeechProbOutOfRange`] when `v` holds a value that is
    /// not finite or lies outside `[0, 1]`.
    #[inline]
    pub fn try_with_no_speech_prob(mut self, v: Option<f32>) -> Result<Self, AudioSegmentError> {
        self.try_set_no_speech_prob(v)?;
        Ok(self)
    }

    /// Consumes the segment and returns it with its average log-probability set to `v`.
    ///
    /// The value is stored without validation.
    #[inline(always)]
    #[must_use]
    pub const fn with_avg_logprob(mut self, v: Option<f32>) -> Self {
        self.avg_logprob = v;
        self
    }

    /// Consumes the segment and returns it with its temperature set to `v`.
    ///
    /// The value is stored without validation.
    #[inline(always)]
    #[must_use]
    pub const fn with_temperature(mut self, v: Option<f32>) -> Self {
        self.temperature = v;
        self
    }

    /// Consumes the segment and returns it with its voice fingerprint set to `v`
    /// (`None` clears it).
    #[inline(always)]
    #[must_use]
    pub fn with_voice_fingerprint(self, v: Option<VoiceFingerprint<Id>>) -> Self {
        Self {
            voice_fingerprint: v,
            ..self
        }
    }

    // ------------------------------------------------------------------
    // In-place setters
    // ------------------------------------------------------------------

    /// Replaces the text in place and returns `self` for chaining.
    #[inline(always)]
    pub fn set_text(&mut self, v: LocalizedText) -> &mut Self {
        self.text = v;
        self
    }

    /// Sets the language in place (`None` clears it) and returns `self` for chaining.
    #[inline(always)]
    pub const fn set_language(&mut self, v: Option<Language>) -> &mut Self {
        self.language = v;
        self
    }

    /// Replaces the words in place after validating every word against the segment span.
    ///
    /// On failure the previously stored words are left untouched.
    ///
    /// # Errors
    ///
    /// Returns [`AudioSegmentError::InvertedWordSpan`] when a word's start compares greater
    /// than its end, or [`AudioSegmentError::WordSpanOutOfSegment`] when a word's span is
    /// not contained in the segment span. The first offending word decides the error.
    #[inline]
    pub fn try_set_words(
        &mut self,
        v: impl Into<std::vec::Vec<Word>>,
    ) -> Result<&mut Self, AudioSegmentError> {
        let candidate: std::vec::Vec<Word> = v.into();
        // Validate before assigning so a rejected list never replaces the current one.
        self.check_words(&candidate)?;
        self.words = candidate;
        Ok(self)
    }

    /// Sets the no-speech probability in place after validating it.
    ///
    /// `None` is always accepted and clears the value. On failure the stored value is left
    /// untouched.
    ///
    /// # Errors
    ///
    /// Returns [`AudioSegmentError::NoSpeechProbOutOfRange`] when `v` holds a value that is
    /// not finite or lies outside `[0, 1]`.
    #[inline]
    pub const fn try_set_no_speech_prob(
        &mut self,
        v: Option<f32>,
    ) -> Result<&mut Self, AudioSegmentError> {
        match v {
            Some(p) if !is_valid_score(p) => Err(AudioSegmentError::NoSpeechProbOutOfRange),
            _ => {
                self.no_speech_prob = v;
                Ok(self)
            }
        }
    }

    /// Sets the average log-probability in place (no validation) and returns `self`.
    #[inline(always)]
    pub const fn set_avg_logprob(&mut self, v: Option<f32>) -> &mut Self {
        self.avg_logprob = v;
        self
    }

    /// Sets the temperature in place (no validation) and returns `self`.
    #[inline(always)]
    pub const fn set_temperature(&mut self, v: Option<f32>) -> &mut Self {
        self.temperature = v;
        self
    }

    /// Sets the voice fingerprint in place (`None` clears it) and returns `self`.
    #[inline(always)]
    pub fn set_voice_fingerprint(&mut self, v: Option<VoiceFingerprint<Id>>) -> &mut Self {
        self.voice_fingerprint = v;
        self
    }

    // ------------------------------------------------------------------
    // Internal validation
    // ------------------------------------------------------------------

    /// Validates `words` against this segment's span, stopping at the first bad word.
    ///
    /// For each word, in order: an inverted span (start semantically greater than end) yields
    /// [`AudioSegmentError::InvertedWordSpan`]; otherwise a start before the segment start or
    /// an end after the segment end yields [`AudioSegmentError::WordSpanOutOfSegment`].
    /// All comparisons go through `cmp_semantic`.
    fn check_words(&self, words: &[Word]) -> Result<(), AudioSegmentError> {
        use core::cmp::Ordering::{Greater, Less};

        // Segment bounds are fetched once, before any word is inspected.
        let segment_start = self.span.start();
        let segment_end = self.span.end();

        words.iter().try_for_each(|word| {
            let word_span = word.span_ref();
            let (begin, finish) = (word_span.start(), word_span.end());

            // The inversion check comes first so it wins over the containment check.
            if begin.cmp_semantic(&finish) == Greater {
                return Err(AudioSegmentError::InvertedWordSpan);
            }
            if begin.cmp_semantic(&segment_start) == Less
                || finish.cmp_semantic(&segment_end) == Greater
            {
                return Err(AudioSegmentError::WordSpanOutOfSegment);
            }
            Ok(())
        })
    }
}
/// Errors returned by the fallible `AudioSegment` constructor, builders, and setters.
#[derive(Debug, Clone, Copy, PartialEq, Eq, IsVariant, thiserror::Error)]
#[non_exhaustive]
pub enum AudioSegmentError {
/// The segment `id` passed to the constructor was the nil UUID.
#[error("AudioSegment id must not be the nil UUID")]
NilId,
/// The `audio_track_id` passed to the constructor was the nil UUID.
#[error("AudioSegment `audio_track_id` (FK → AudioTrack) must not be the nil UUID")]
NilAudioTrackId,
/// The segment span's start compared semantically greater than its end.
#[error("AudioSegment span.start must be <= span.end")]
InvertedSpan,
/// A word's span started before the segment start or ended after the segment end.
#[error("AudioSegment word span must be contained in the segment span")]
WordSpanOutOfSegment,
/// A word's span start compared semantically greater than its end.
#[error("AudioSegment word span.start must be <= span.end")]
InvertedWordSpan,
/// The no-speech probability was NaN, infinite, or outside `[0, 1]`.
#[error("AudioSegment no_speech_prob must be finite and within [0, 1]")]
NoSpeechProbOutOfRange,
}
/// Unit tests for `Word`, `AudioSegment`, and their error enums.
#[cfg(all(test, feature = "std"))]
mod tests {
    use super::*;
    use core::num::NonZeroU32;
    use mediatime::Timebase;

    /// Timebase `1/1000` shared by most tests.
    fn tb() -> Timebase {
        Timebase::new(1, NonZeroU32::new(1000).expect("nonzero"))
    }

    /// Builds a `TimeRange` over `[start_ticks, end_ticks]` in the `tb()` timebase.
    fn span(start_ticks: i64, end_ticks: i64) -> TimeRange {
        TimeRange::new(start_ticks, end_ticks, tb())
    }

    #[test]
    fn try_new_happy_path() {
        let id = Uuid7::new();
        let audio_track_id = Uuid7::new();
        let s = AudioSegment::try_new(id, audio_track_id, 0, span(0, 1500))
            .expect("valid construction must succeed");
        // Required fields are stored as given.
        assert_eq!(s.id_ref(), &id);
        assert_eq!(s.audio_track_id_ref(), &audio_track_id);
        assert_eq!(s.index(), 0);
        assert_eq!(s.span_ref(), &span(0, 1500));
        // Optional fields start empty.
        assert!(s.speaker_id_ref().is_none());
        assert!(s.text_ref().is_empty());
        assert!(s.words_slice().is_empty());
        assert!(s.language().is_none());
        assert!(s.no_speech_prob().is_none());
    }

    #[test]
    fn try_new_rejects_nil_id() {
        let r = AudioSegment::try_new(Uuid7::nil(), Uuid7::new(), 0, span(0, 1500));
        assert_eq!(r.err(), Some(AudioSegmentError::NilId));
        assert!(AudioSegmentError::NilId.is_nil_id());
    }

    #[test]
    fn try_new_rejects_nil_audio_track_id() {
        let r = AudioSegment::try_new(Uuid7::new(), Uuid7::nil(), 0, span(0, 1500));
        assert_eq!(r.err(), Some(AudioSegmentError::NilAudioTrackId));
        assert!(AudioSegmentError::NilAudioTrackId.is_nil_audio_track_id());
    }

    #[test]
    fn inverted_span_variant_is_constructible_and_reports_predicate() {
        let e = AudioSegmentError::InvertedSpan;
        assert!(e.is_inverted_span());
        assert_eq!(
            format!("{e}"),
            "AudioSegment span.start must be <= span.end"
        );
    }

    #[test]
    fn try_new_rejects_inverted_span() {
        // Same construction as `check_words_rejects_inverted_word_span`: `with_end` moves the
        // end before the start, so the constructor's span check must fire.
        let inverted = TimeRange::new(100, 200, tb()).with_end(50);
        let r = AudioSegment::try_new(Uuid7::new(), Uuid7::new(), 0, inverted);
        assert_eq!(r.err(), Some(AudioSegmentError::InvertedSpan));
    }

    #[test]
    fn try_new_accepts_zero_length_span() {
        AudioSegment::try_new(Uuid7::new(), Uuid7::new(), 0, span(500, 500))
            .expect("zero-length span ok");
    }

    #[test]
    fn builders_attach_speaker_and_text() {
        let speaker = Uuid7::new();
        let es = Language::from_bcp47("es").unwrap();
        let s = AudioSegment::try_new(Uuid7::new(), Uuid7::new(), 2, span(1000, 2000))
            .unwrap()
            .with_speaker_id(Some(speaker))
            .with_text(LocalizedText::from_src_translated("hola", "hello"))
            .with_language(Some(es));
        assert_eq!(s.speaker_id_ref(), Some(&speaker));
        assert_eq!(s.text_ref().src(), "hola");
        assert_eq!(s.text_ref().translated(), "hello");
        assert_eq!(s.language(), Some(es));
        assert_eq!(s.language().unwrap().language(), "es");
    }

    #[test]
    fn words_attach_and_carry_per_word_language() {
        let fr = Language::from_bcp47("fr").unwrap();
        // Exercise both ways of attaching a language to a word.
        let w1 = Word::try_new("bon", span(0, 200), 0.95)
            .unwrap()
            .with_language(Some(fr));
        let w2 = Word::try_from_parts("jour", span(200, 400), 0.92, Some(fr)).unwrap();
        let s = AudioSegment::try_new(Uuid7::new(), Uuid7::new(), 0, span(0, 400))
            .unwrap()
            .try_with_words(std::vec![w1.clone(), w2.clone()])
            .unwrap();
        assert_eq!(s.words_slice().len(), 2);
        assert_eq!(s.words_slice()[0].text(), "bon");
        assert!((s.words_slice()[0].score() - 0.95).abs() < f32::EPSILON);
        assert_eq!(s.words_slice()[1].language(), Some(fr));
    }

    #[test]
    fn word_try_new_rejects_non_finite_or_out_of_range_score() {
        for bad in [f32::NAN, f32::INFINITY, f32::NEG_INFINITY, -0.1, 1.1] {
            let r = Word::try_new("x", span(0, 100), bad);
            assert_eq!(r.err(), Some(WordError::ScoreOutOfRange));
        }
        // Both endpoints of the closed interval are valid.
        assert!(Word::try_new("x", span(0, 100), 0.0).is_ok());
        assert!(Word::try_new("x", span(0, 100), 1.0).is_ok());
        assert!(WordError::ScoreOutOfRange.is_score_out_of_range());
    }

    #[test]
    fn word_try_with_score_and_try_set_score_validate() {
        let w = Word::try_new("x", span(0, 100), 0.5).unwrap();
        assert!(w.clone().try_with_score(f32::NAN).is_err());
        assert!(w.clone().try_with_score(2.0).is_err());
        assert!(w.clone().try_with_score(0.8).is_ok());
        let mut w = w;
        // A rejected in-place update must leave the previous score intact.
        assert!(w.try_set_score(f32::NEG_INFINITY).is_err());
        assert!((w.score() - 0.5).abs() < f32::EPSILON);
        w.try_set_score(0.25).unwrap();
        assert!((w.score() - 0.25).abs() < f32::EPSILON);
    }

    #[test]
    fn try_with_words_rejects_word_span_outside_segment() {
        let seg = AudioSegment::try_new(Uuid7::new(), Uuid7::new(), 0, span(100, 400)).unwrap();
        // Starts before the segment.
        let early = Word::try_new("a", span(0, 200), 0.9).unwrap();
        let r = seg.clone().try_with_words(std::vec![early]);
        assert_eq!(r.err(), Some(AudioSegmentError::WordSpanOutOfSegment));
        // Ends after the segment.
        let late = Word::try_new("b", span(300, 500), 0.9).unwrap();
        let r = seg.clone().try_with_words(std::vec![late]);
        assert_eq!(r.err(), Some(AudioSegmentError::WordSpanOutOfSegment));
        assert!(AudioSegmentError::WordSpanOutOfSegment.is_word_span_out_of_segment());
        // Fully contained.
        let ok = Word::try_new("c", span(150, 300), 0.9).unwrap();
        assert!(seg.try_with_words(std::vec![ok]).is_ok());
    }

    #[test]
    fn try_set_words_rejects_and_leaves_words_unchanged() {
        let mut seg = AudioSegment::try_new(Uuid7::new(), Uuid7::new(), 0, span(100, 400)).unwrap();
        let good = Word::try_new("c", span(150, 300), 0.9).unwrap();
        seg.try_set_words(std::vec![good.clone()]).unwrap();
        assert_eq!(seg.words_slice().len(), 1);
        let bad = Word::try_new("d", span(0, 50), 0.9).unwrap();
        let r = seg.try_set_words(std::vec![bad]);
        assert_eq!(r.err(), Some(AudioSegmentError::WordSpanOutOfSegment));
        // The rejected list must not have replaced the earlier one.
        assert_eq!(seg.words_slice(), &[good]);
    }

    #[test]
    fn whisper_quality_signals_attach() {
        let s = AudioSegment::try_new(Uuid7::new(), Uuid7::new(), 0, span(0, 500))
            .unwrap()
            .try_with_no_speech_prob(Some(0.05))
            .unwrap()
            .with_avg_logprob(Some(-0.4))
            .with_temperature(Some(0.0));
        assert!((s.no_speech_prob().unwrap() - 0.05).abs() < f32::EPSILON);
        assert!((s.avg_logprob().unwrap() - -0.4).abs() < f32::EPSILON);
        assert!((s.temperature().unwrap() - 0.0).abs() < f32::EPSILON);
    }

    #[test]
    fn setters_mutate_in_place() {
        let mut s = AudioSegment::try_new(Uuid7::new(), Uuid7::new(), 0, span(0, 500)).unwrap();
        let speaker = Uuid7::new();
        s.set_speaker_id(Some(speaker));
        s.set_text(LocalizedText::from_src("hello"));
        s.try_set_words(std::vec![Word::try_new("hi", span(0, 100), 0.9).unwrap()])
            .unwrap();
        s.try_set_no_speech_prob(Some(0.01)).unwrap();
        assert_eq!(s.speaker_id_ref(), Some(&speaker));
        assert_eq!(s.text_ref().src(), "hello");
        assert_eq!(s.words_slice().len(), 1);
        assert!((s.no_speech_prob().unwrap() - 0.01).abs() < f32::EPSILON);
    }

    /// Timebase `1/90_000`, used to mix timebases with `tb()`.
    fn mpeg_tb() -> Timebase {
        Timebase::new(1, NonZeroU32::new(90_000).expect("nonzero"))
    }

    #[test]
    fn check_words_uses_semantic_time_across_timebases() {
        // Segment is 0..1000 ticks at 1/1000; words below are expressed at 1/90_000.
        let seg = AudioSegment::try_new(Uuid7::new(), Uuid7::new(), 0, span(0, 1000)).unwrap();
        // 135_000 / 90_000 is past the segment end (1000 / 1000).
        let w_late = Word::try_new("late", TimeRange::new(0, 135_000, mpeg_tb()), 0.9).unwrap();
        let r = seg.clone().try_with_words(std::vec![w_late]);
        assert_eq!(r.err(), Some(AudioSegmentError::WordSpanOutOfSegment));
        // 22_500 / 90_000 .. 67_500 / 90_000 lies inside the segment.
        let w_ok = Word::try_new("ok", TimeRange::new(22_500, 67_500, mpeg_tb()), 0.9).unwrap();
        assert!(seg.try_with_words(std::vec![w_ok]).is_ok());
    }

    #[test]
    fn check_words_rejects_inverted_word_span() {
        let seg = AudioSegment::try_new(Uuid7::new(), Uuid7::new(), 0, span(0, 1000)).unwrap();
        // `with_end` moves the end before the start, producing an inverted range.
        let inverted = TimeRange::new(100, 200, tb()).with_end(50);
        let w = Word::try_new("bad", inverted, 0.9).unwrap();
        let r = seg.try_with_words(std::vec![w]);
        assert_eq!(r.err(), Some(AudioSegmentError::InvertedWordSpan));
        assert!(AudioSegmentError::InvertedWordSpan.is_inverted_word_span());
    }

    #[test]
    fn try_with_no_speech_prob_rejects_invalid_and_accepts_boundaries() {
        let seg = AudioSegment::try_new(Uuid7::new(), Uuid7::new(), 0, span(0, 500)).unwrap();
        for bad in [f32::NAN, f32::INFINITY, f32::NEG_INFINITY, -0.1, 1.1] {
            let r = seg.clone().try_with_no_speech_prob(Some(bad));
            assert_eq!(r.err(), Some(AudioSegmentError::NoSpeechProbOutOfRange));
        }
        assert!(seg.clone().try_with_no_speech_prob(None).is_ok());
        assert!(seg.clone().try_with_no_speech_prob(Some(0.0)).is_ok());
        assert!(seg.try_with_no_speech_prob(Some(1.0)).is_ok());
        assert!(AudioSegmentError::NoSpeechProbOutOfRange.is_no_speech_prob_out_of_range());
    }

    #[test]
    fn try_set_no_speech_prob_rejects_and_leaves_value_unchanged() {
        let mut seg = AudioSegment::try_new(Uuid7::new(), Uuid7::new(), 0, span(0, 500)).unwrap();
        seg.try_set_no_speech_prob(Some(0.3)).unwrap();
        // Each rejected update must leave the previously stored 0.3 in place.
        let r = seg.try_set_no_speech_prob(Some(f32::NAN));
        assert_eq!(r.err(), Some(AudioSegmentError::NoSpeechProbOutOfRange));
        assert!((seg.no_speech_prob().unwrap() - 0.3).abs() < f32::EPSILON);
        let r = seg.try_set_no_speech_prob(Some(2.0));
        assert_eq!(r.err(), Some(AudioSegmentError::NoSpeechProbOutOfRange));
        assert!((seg.no_speech_prob().unwrap() - 0.3).abs() < f32::EPSILON);
        seg.try_set_no_speech_prob(Some(0.9)).unwrap();
        assert!((seg.no_speech_prob().unwrap() - 0.9).abs() < f32::EPSILON);
    }

    /// Builds a valid `VoiceFingerprint` fixture for the attach/clear tests.
    fn vfp() -> VoiceFingerprint<Uuid7> {
        use crate::domain::vo::Provenance;
        VoiceFingerprint::try_new(
            Uuid7::new(),
            192,
            jiff::Timestamp::from_millisecond(1_700_000_000_000).expect("valid ts"),
            Some(0.9),
            Provenance::from_parts("ecapa-tdnn", "v1.0.0", "", "findit-indexer-0.1.0"),
        )
        .expect("valid voiceprint")
    }

    #[test]
    fn voice_fingerprint_defaults_to_none() {
        let s = AudioSegment::try_new(Uuid7::new(), Uuid7::new(), 0, span(0, 500)).unwrap();
        assert!(s.voice_fingerprint_ref().is_none());
    }

    #[test]
    fn with_voice_fingerprint_and_set_voice_fingerprint_attach_and_clear() {
        let v = vfp();
        // Builder form: attach, then clear.
        let seg = AudioSegment::try_new(Uuid7::new(), Uuid7::new(), 0, span(0, 500))
            .unwrap()
            .with_voice_fingerprint(Some(v.clone()));
        assert_eq!(seg.voice_fingerprint_ref(), Some(&v));
        let seg = seg.with_voice_fingerprint(None);
        assert!(seg.voice_fingerprint_ref().is_none());
        // Setter form: attach, then clear.
        let mut seg = seg;
        seg.set_voice_fingerprint(Some(v.clone()));
        assert_eq!(seg.voice_fingerprint_ref(), Some(&v));
        seg.set_voice_fingerprint(None);
        assert!(seg.voice_fingerprint_ref().is_none());
    }

    #[test]
    fn with_speaker_and_set_speaker_attach_and_clear() {
        let seg = AudioSegment::try_new(Uuid7::new(), Uuid7::new(), 0, span(0, 500)).unwrap();
        let n = seg.clone().with_speaker_id(None);
        assert!(n.speaker_id_ref().is_none());
        let speaker = Uuid7::new();
        let mut s = seg.with_speaker_id(Some(speaker));
        assert_eq!(s.speaker_id_ref(), Some(&speaker));
        s.set_speaker_id(None);
        assert!(s.speaker_id_ref().is_none());
    }
}