use std::fs;
use std::path::Path;
use audio_analysis_transcription::TranscriptionPipelineResponse;
use text_transcripts::TranscriptionContract;
use crate::config::{
ExpectedOutputComparison, ExpectedOutputFile, NativeWhisperxError, OutputComparisonMode,
OutputConfig, OutputFile, OutputFormat, ParityTolerance, SubtitleConfig,
};
/// Writes every configured output format for `response` to disk.
///
/// Thin wrapper around `write_outputs_with_options` that always disables
/// character-level alignments.
///
/// # Errors
///
/// Returns whatever error `write_outputs_with_options` reports.
pub fn write_outputs(
    response: &TranscriptionPipelineResponse,
    output: &OutputConfig,
) -> Result<Vec<OutputFile>, NativeWhisperxError> {
    let return_char_alignments = false;
    write_outputs_with_options(response, output, return_char_alignments)
}
pub(crate) fn write_outputs_with_options(
response: &TranscriptionPipelineResponse,
output: &OutputConfig,
return_char_alignments: bool,
) -> Result<Vec<OutputFile>, NativeWhisperxError> {
let Some(output_dir) = &output.output_dir else {
return Ok(Vec::new());
};
fs::create_dir_all(output_dir)?;
let basename = output
.basename
.clone()
.or_else(|| {
response
.transcript
.source
.as_ref()
.and_then(source_basename)
})
.unwrap_or_else(|| "transcript".to_string());
output
.formats
.iter()
.copied()
.flat_map(expand_output_format)
.map(|format| {
let path = output_dir.join(format!("{basename}.{}", format.extension()));
let contents = render_output(response, format, output, return_char_alignments)?;
fs::write(&path, contents)?;
Ok(OutputFile { format, path })
})
.collect()
}
/// Compares each expected output file with the produced file of the same format.
///
/// One `ExpectedOutputComparison` is returned per entry of `expected_outputs`,
/// in the same order. An entry with no matching actual output is reported as
/// failed with a "missing actual" difference rather than as an error.
///
/// # Errors
///
/// Returns the first error raised by the selected comparison routine.
pub(crate) fn compare_expected_outputs(
    actual_outputs: &[OutputFile],
    expected_outputs: &[ExpectedOutputFile],
) -> Result<Vec<ExpectedOutputComparison>, NativeWhisperxError> {
    let mut comparisons = Vec::with_capacity(expected_outputs.len());
    for expected in expected_outputs {
        let actual_path = actual_outputs
            .iter()
            .find(|actual| actual.format == expected.format)
            .map(|actual| actual.path.clone());

        let difference = match actual_path.as_ref() {
            None => Some(format!("missing actual {:?} output", expected.format)),
            Some(actual_path_ref) => match expected.comparison {
                OutputComparisonMode::Exact => {
                    compare_output_bytes(&expected.path, actual_path_ref)?
                }
                OutputComparisonMode::JsonSemantic => {
                    compare_output_json(&expected.path, actual_path_ref)?
                }
                OutputComparisonMode::SubtitleSemantic => {
                    compare_output_subtitles(&expected.path, actual_path_ref)?
                }
            },
        };

        comparisons.push(ExpectedOutputComparison {
            format: expected.format,
            comparison: expected.comparison,
            gating: expected.gating,
            expected_path: expected.path.clone(),
            actual_path,
            // A comparison passes exactly when no difference was found.
            passed: difference.is_none(),
            difference,
        });
    }
    Ok(comparisons)
}
fn compare_output_bytes(
expected_path: &Path,
actual_path: &Path,
) -> Result<Option<String>, NativeWhisperxError> {
let expected = match fs::read(expected_path) {
Ok(bytes) => bytes,
Err(error) if error.kind() == std::io::ErrorKind::NotFound => {
return Ok(Some(format!(
"missing expected output {}",
expected_path.display()
)));
}
Err(error) => return Err(NativeWhisperxError::Io(error)),
};
let actual = fs::read(actual_path)?;
if expected == actual {
return Ok(None);
}
Ok(Some(first_output_difference(
expected_path,
actual_path,
&expected,
&actual,
)))
}
pub(crate) fn compare_output_json(
expected_path: &Path,
actual_path: &Path,
) -> Result<Option<String>, NativeWhisperxError> {
let expected = match fs::read(expected_path) {
Ok(bytes) => bytes,
Err(error) if error.kind() == std::io::ErrorKind::NotFound => {
return Ok(Some(format!(
"missing expected output {}",
expected_path.display()
)));
}
Err(error) => return Err(NativeWhisperxError::Io(error)),
};
let actual = fs::read(actual_path)?;
let expected_json: serde_json::Value = serde_json::from_slice(&expected)?;
let actual_json: serde_json::Value = serde_json::from_slice(&actual)?;
if expected_json == actual_json {
return Ok(None);
}
if looks_like_whisperx_transcript_json(&expected_json)
&& looks_like_whisperx_transcript_json(&actual_json)
{
return Ok(compare_whisperx_transcript_json(
&expected_json,
&actual_json,
ParityTolerance::default(),
));
}
Ok(Some(format!(
"JSON output differs: expected={} actual={}",
expected_path.display(),
actual_path.display()
)))
}
/// A subtitle cue recovered from subtitle text by `parse_subtitle_block`.
#[derive(Debug, Clone, PartialEq)]
struct ParsedSubtitleCue {
    /// Cue start, as returned by `timestamp_to_seconds` for the left-hand timestamp.
    start: f64,
    /// Cue end, as returned by `timestamp_to_seconds` for the right-hand timestamp.
    end: f64,
    /// Cue text after the timing line, joined with spaces and passed through
    /// `normalize_subtitle_text`.
    text: String,
}
fn compare_output_subtitles(
expected_path: &Path,
actual_path: &Path,
) -> Result<Option<String>, NativeWhisperxError> {
let expected = match fs::read_to_string(expected_path) {
Ok(text) => text,
Err(error) if error.kind() == std::io::ErrorKind::NotFound => {
return Ok(Some(format!(
"missing expected output {}",
expected_path.display()
)));
}
Err(error) => return Err(NativeWhisperxError::Io(error)),
};
let actual = fs::read_to_string(actual_path)?;
let expected_cues = parse_subtitle_cues(&expected);
let actual_cues = parse_subtitle_cues(&actual);
if expected_cues.len() != actual_cues.len() {
return Ok(Some(format!(
"subtitle cue count differs: expected={} actual={}",
expected_cues.len(),
actual_cues.len()
)));
}
let tolerance = ParityTolerance::default().word_seconds;
for (index, (expected, actual)) in expected_cues.iter().zip(actual_cues.iter()).enumerate() {
if let Some(difference) =
compare_subtitle_seconds(index, "start", expected.start, actual.start, tolerance)
{
return Ok(Some(difference));
}
if let Some(difference) =
compare_subtitle_seconds(index, "end", expected.end, actual.end, tolerance)
{
return Ok(Some(difference));
}
if expected.text != actual.text {
return Ok(Some(format!(
"subtitle cue {index} text differs: expected {:?} actual {:?}",
expected.text, actual.text
)));
}
}
Ok(None)
}
/// Checks that one timing field of a subtitle cue is within `tolerance`.
///
/// Returns `None` when `|expected - actual| <= tolerance`, otherwise a message
/// naming the cue `index` and `field` together with both values, the delta and
/// the tolerance.
fn compare_subtitle_seconds(
    index: usize,
    field: &str,
    expected: f64,
    actual: f64,
    tolerance: f64,
) -> Option<String> {
    let delta = (expected - actual).abs();
    // Guard on `<=` so that a NaN delta is still reported as a difference.
    if delta <= tolerance {
        return None;
    }
    Some(format!(
        "subtitle cue {index} {field} differs: expected={expected:.3} actual={actual:.3} delta={delta:.3} tolerance={tolerance:.3}"
    ))
}
fn parse_subtitle_cues(text: &str) -> Vec<ParsedSubtitleCue> {
let normalized = text.replace("\r\n", "\n");
normalized
.split("\n\n")
.filter_map(parse_subtitle_block)
.collect()
}
/// Parses one subtitle block into a cue.
///
/// Lines are trimmed; empty lines and a literal `WEBVTT` line are ignored.
/// Everything before the first line containing `-->` is skipped, that line
/// supplies the timing, and the remaining lines are joined with spaces and
/// normalized to form the cue text. Returns `None` when the block has no timing
/// line or the timing line cannot be split.
fn parse_subtitle_block(block: &str) -> Option<ParsedSubtitleCue> {
    let mut meaningful_lines = block
        .lines()
        .map(str::trim)
        .filter(|line| !(line.is_empty() || *line == "WEBVTT"));

    // Advance to the timing line; the iterator is left positioned on the text.
    let timing_line = loop {
        let line = meaningful_lines.next()?;
        if line.contains("-->") {
            break line;
        }
    };
    let (start, end) = parse_subtitle_timing_line(timing_line)?;

    let text_lines: Vec<&str> = meaningful_lines.collect();
    let text = normalize_subtitle_text(&text_lines.join(" "));
    Some(ParsedSubtitleCue { start, end, text })
}
/// Extracts the start and end of a `start --> end [settings]` timing line.
///
/// Only the first whitespace-delimited token after the arrow is used as the
/// end timestamp, so trailing cue settings are ignored. Returns `None` when the
/// arrow is missing or nothing follows it.
fn parse_subtitle_timing_line(line: &str) -> Option<(f64, f64)> {
    let mut halves = line.splitn(2, "-->");
    let start = halves.next()?;
    let rest = halves.next()?;
    let end = rest.split_whitespace().next()?;

    let start_seconds = timestamp_to_seconds(start.trim());
    let end_seconds = timestamp_to_seconds(end.trim());
    Some((start_seconds, end_seconds))
}
/// Removes `<u>` / `</u>` tags from `text` and passes the result through
/// `normalize_space`.
fn normalize_subtitle_text(text: &str) -> String {
    let without_underline = text.replace("<u>", "").replace("</u>", "");
    normalize_space(&without_underline)
}
/// Heuristically decides whether `value` is a WhisperX-style transcript.
///
/// True for a JSON object that has a `segments` key, a `word_segments` key, or
/// both a `language` and a `text` key.
fn looks_like_whisperx_transcript_json(value: &serde_json::Value) -> bool {
    let Some(object) = value.as_object() else {
        return false;
    };
    if object.contains_key("segments") || object.contains_key("word_segments") {
        return true;
    }
    object.contains_key("language") && object.contains_key("text")
}
/// Compares two WhisperX-style transcript documents field by field.
///
/// Checks run in a fixed order: language, segments, words, and finally
/// characters, the last only when either side carries per-segment `chars`. The
/// first difference found is returned; `None` means the transcripts agree
/// within `tolerance`.
fn compare_whisperx_transcript_json(
    expected: &serde_json::Value,
    actual: &serde_json::Value,
    tolerance: ParityTolerance,
) -> Option<String> {
    let Some(expected_object) = expected.as_object() else {
        return Some("JSON transcript malformed: expected top-level object".to_string());
    };
    let Some(actual_object) = actual.as_object() else {
        return Some("JSON transcript malformed: actual top-level object".to_string());
    };

    compare_json_language(expected_object, actual_object)
        .or_else(|| compare_json_segments(expected_object, actual_object, tolerance))
        .or_else(|| compare_json_words(expected_object, actual_object, tolerance))
        .or_else(|| {
            let has_chars =
                json_contains_chars(expected_object) || json_contains_chars(actual_object);
            if has_chars {
                compare_json_chars(expected_object, actual_object, tolerance)
            } else {
                None
            }
        })
}
/// Compares the optional top-level `language` field of two transcripts.
///
/// A missing or `null` language on both sides counts as equal. A non-string
/// language is reported as a malformed field, the expected side first.
fn compare_json_language(
    expected: &serde_json::Map<String, serde_json::Value>,
    actual: &serde_json::Map<String, serde_json::Value>,
) -> Option<String> {
    let languages = optional_json_string(expected, "language", "expected language").and_then(
        |expected_language| {
            optional_json_string(actual, "language", "actual language")
                .map(|actual_language| (expected_language, actual_language))
        },
    );
    match languages {
        Err(error) => Some(error),
        Ok((expected_language, actual_language)) if expected_language != actual_language => {
            Some(format!(
                "JSON transcript language differs: expected={expected_language:?} actual={actual_language:?}"
            ))
        }
        Ok(_) => None,
    }
}
/// Compares the `segments` arrays of two transcripts.
///
/// Both sides must have a `segments` array of the same length. For each pair
/// of segments, in order: `start` and `end` must agree within
/// `tolerance.segment_seconds`, the whitespace-normalized `text` must be equal,
/// and, when either side has a `speaker` key, the speakers must be equal.
/// Returns the first difference or malformed-field message, or `None`.
fn compare_json_segments(
    expected: &serde_json::Map<String, serde_json::Value>,
    actual: &serde_json::Map<String, serde_json::Value>,
    tolerance: ParityTolerance,
) -> Option<String> {
    /// Turns a reported difference into an early-exit error.
    fn reject(difference: Option<String>) -> Result<(), String> {
        difference.map_or(Ok(()), Err)
    }

    // `Err` carries the first difference so that `?` can short-circuit.
    let check = || -> Result<(), String> {
        let expected_segments = json_array_field(expected, "segments", "expected segments")?;
        let actual_segments = json_array_field(actual, "segments", "actual segments")?;
        if expected_segments.len() != actual_segments.len() {
            return Err(format!(
                "JSON transcript segment count differs: expected={} actual={}",
                expected_segments.len(),
                actual_segments.len()
            ));
        }

        for (index, (expected_value, actual_value)) in expected_segments
            .iter()
            .zip(actual_segments.iter())
            .enumerate()
        {
            let expected_segment = expected_value.as_object().ok_or_else(|| {
                format!("JSON transcript segment {index} malformed: expected object")
            })?;
            let actual_segment = actual_value.as_object().ok_or_else(|| {
                format!("JSON transcript segment {index} malformed: actual object")
            })?;

            reject(compare_required_json_seconds(
                expected_segment,
                actual_segment,
                "start",
                &format!("segment {index} start"),
                tolerance.segment_seconds,
            ))?;
            reject(compare_required_json_seconds(
                expected_segment,
                actual_segment,
                "end",
                &format!("segment {index} end"),
                tolerance.segment_seconds,
            ))?;

            let expected_text = required_json_string(
                expected_segment,
                "text",
                &format!("segment {index} expected text"),
            )?;
            let actual_text = required_json_string(
                actual_segment,
                "text",
                &format!("segment {index} actual text"),
            )?;
            if normalize_space(expected_text) != normalize_space(actual_text) {
                return Err(format!(
                    "JSON transcript segment {index} text differs: expected={expected_text:?} actual={actual_text:?}"
                ));
            }

            // Speakers are only compared when at least one side mentions them.
            if expected_segment.contains_key("speaker") || actual_segment.contains_key("speaker") {
                let expected_speaker = optional_json_string(
                    expected_segment,
                    "speaker",
                    &format!("segment {index} expected speaker"),
                )?;
                let actual_speaker = optional_json_string(
                    actual_segment,
                    "speaker",
                    &format!("segment {index} actual speaker"),
                )?;
                if expected_speaker != actual_speaker {
                    return Err(format!(
                        "JSON transcript segment {index} speaker differs: expected={expected_speaker:?} actual={actual_speaker:?}"
                    ));
                }
            }
        }
        Ok(())
    };
    check().err()
}
/// Compares the flattened word lists of two transcripts.
///
/// Words are gathered with `flattened_json_words`. The counts must match, and
/// each pair is then checked with `compare_json_word`. Returns the first
/// difference or malformed-field message, or `None`.
fn compare_json_words(
    expected: &serde_json::Map<String, serde_json::Value>,
    actual: &serde_json::Map<String, serde_json::Value>,
    tolerance: ParityTolerance,
) -> Option<String> {
    // The expected side is flattened first so its errors take precedence.
    let flattened = flattened_json_words(expected, "expected").and_then(|expected_words| {
        flattened_json_words(actual, "actual").map(|actual_words| (expected_words, actual_words))
    });
    let (expected_words, actual_words) = match flattened {
        Ok(pair) => pair,
        Err(error) => return Some(error),
    };

    if expected_words.len() != actual_words.len() {
        return Some(format!(
            "JSON transcript word count differs: expected={} actual={}",
            expected_words.len(),
            actual_words.len()
        ));
    }

    expected_words
        .iter()
        .zip(&actual_words)
        .enumerate()
        .find_map(|(index, (expected_word, actual_word))| {
            compare_json_word(index, expected_word, actual_word, tolerance)
        })
}
/// Compares one word entry from each transcript.
///
/// In order: the whitespace-normalized `word` text must be equal, `start` and
/// `end` must agree within `tolerance.word_seconds`, and, only when both sides
/// have a `score` key, the scores must agree within `0.001`. A `null` score on
/// either side ends the score check without reporting a difference.
fn compare_json_word(
    index: usize,
    expected: &serde_json::Map<String, serde_json::Value>,
    actual: &serde_json::Map<String, serde_json::Value>,
    tolerance: ParityTolerance,
) -> Option<String> {
    /// Turns a reported difference into an early-exit error.
    fn reject(difference: Option<String>) -> Result<(), String> {
        difference.map_or(Ok(()), Err)
    }

    // `Err` carries the first difference so that `?` can short-circuit.
    let check = || -> Result<(), String> {
        let expected_text =
            required_json_string(expected, "word", &format!("word {index} expected word"))?;
        let actual_text =
            required_json_string(actual, "word", &format!("word {index} actual word"))?;
        if normalize_space(expected_text) != normalize_space(actual_text) {
            return Err(format!(
                "JSON transcript word {index} text differs: expected={expected_text:?} actual={actual_text:?}"
            ));
        }

        reject(compare_required_json_seconds(
            expected,
            actual,
            "start",
            &format!("word {index} start"),
            tolerance.word_seconds,
        ))?;
        reject(compare_required_json_seconds(
            expected,
            actual,
            "end",
            &format!("word {index} end"),
            tolerance.word_seconds,
        ))?;

        if !(expected.contains_key("score") && actual.contains_key("score")) {
            return Ok(());
        }
        // A null expected score stops here, before the actual score is read.
        let Some(expected_score) =
            optional_json_number(expected, "score", &format!("word {index} expected score"))?
        else {
            return Ok(());
        };
        let Some(actual_score) =
            optional_json_number(actual, "score", &format!("word {index} actual score"))?
        else {
            return Ok(());
        };
        if (expected_score - actual_score).abs() > 0.001 {
            return Err(format!(
                "JSON transcript word {index} score differs: expected={expected_score:.3} actual={actual_score:.3} tolerance=0.001"
            ));
        }
        Ok(())
    };
    check().err()
}
/// Compares the flattened per-character alignments of two transcripts.
///
/// Characters are gathered with `flattened_json_chars`. The counts must match;
/// then for each pair the `char` text must be exactly equal and the optional
/// `start` / `end` must agree in presence and within `tolerance.word_seconds`.
/// Returns the first difference or malformed-field message, or `None`.
fn compare_json_chars(
    expected: &serde_json::Map<String, serde_json::Value>,
    actual: &serde_json::Map<String, serde_json::Value>,
    tolerance: ParityTolerance,
) -> Option<String> {
    /// Turns a reported difference into an early-exit error.
    fn reject(difference: Option<String>) -> Result<(), String> {
        difference.map_or(Ok(()), Err)
    }

    // `Err` carries the first difference so that `?` can short-circuit.
    let check = || -> Result<(), String> {
        let expected_chars = flattened_json_chars(expected, "expected")?;
        let actual_chars = flattened_json_chars(actual, "actual")?;
        if expected_chars.len() != actual_chars.len() {
            return Err(format!(
                "JSON transcript char count differs: expected={} actual={}",
                expected_chars.len(),
                actual_chars.len()
            ));
        }

        for (index, (expected_char, actual_char)) in
            expected_chars.iter().zip(actual_chars.iter()).enumerate()
        {
            let expected_text = required_json_string(
                expected_char,
                "char",
                &format!("char {index} expected char"),
            )?;
            let actual_text =
                required_json_string(actual_char, "char", &format!("char {index} actual char"))?;
            if expected_text != actual_text {
                return Err(format!(
                    "JSON transcript char {index} text differs: expected={expected_text:?} actual={actual_text:?}"
                ));
            }

            reject(compare_optional_json_seconds(
                expected_char,
                actual_char,
                "start",
                &format!("char {index} start"),
                tolerance.word_seconds,
            ))?;
            reject(compare_optional_json_seconds(
                expected_char,
                actual_char,
                "end",
                &format!("char {index} end"),
                tolerance.word_seconds,
            ))?;
        }
        Ok(())
    };
    check().err()
}
/// Reports whether any object in the `segments` array has a `chars` key.
///
/// Returns `false` when `segments` is missing or is not an array; entries
/// that are not objects are ignored.
fn json_contains_chars(object: &serde_json::Map<String, serde_json::Value>) -> bool {
    let Some(segments) = object
        .get("segments")
        .and_then(serde_json::Value::as_array)
    else {
        return false;
    };
    segments
        .iter()
        .filter_map(serde_json::Value::as_object)
        .any(|segment| segment.contains_key("chars"))
}
/// Collects every word object of a transcript into one flat list.
///
/// When a top-level `word_segments` key exists, it is the sole source.
/// Otherwise the `words` arrays of all `segments` are concatenated in order;
/// segments without a `words` key contribute nothing. `side` is used only to
/// label error messages.
///
/// # Errors
///
/// Returns a message when a required array is missing, a field that must be an
/// array is not one, or an entry that must be an object is not one.
fn flattened_json_words<'a>(
    object: &'a serde_json::Map<String, serde_json::Value>,
    side: &str,
) -> Result<Vec<&'a serde_json::Map<String, serde_json::Value>>, String> {
    if let Some(words) = object.get("word_segments") {
        let entries = json_value_array(words, &format!("{side} word_segments"))?;
        let mut flattened = Vec::with_capacity(entries.len());
        for (index, word) in entries.iter().enumerate() {
            match word.as_object() {
                Some(fields) => flattened.push(fields),
                None => {
                    return Err(format!(
                        "JSON transcript {side} word_segments[{index}] malformed: object expected"
                    ));
                }
            }
        }
        return Ok(flattened);
    }

    let segments = json_array_field(object, "segments", &format!("{side} segments"))?;
    let mut words = Vec::new();
    for (segment_index, segment) in segments.iter().enumerate() {
        let segment = segment.as_object().ok_or_else(|| {
            format!("JSON transcript {side} segment {segment_index} malformed: object expected")
        })?;
        let Some(segment_words) = segment.get("words") else {
            continue;
        };
        let entries = json_value_array(
            segment_words,
            &format!("{side} segment {segment_index} words"),
        )?;
        for (word_index, word) in entries.iter().enumerate() {
            match word.as_object() {
                Some(fields) => words.push(fields),
                None => {
                    return Err(format!("JSON transcript {side} segment {segment_index} words[{word_index}] malformed: object expected"));
                }
            }
        }
    }
    Ok(words)
}
/// Collects every character object of a transcript into one flat list.
///
/// The `chars` arrays of all `segments` are concatenated in order; segments
/// without a `chars` key contribute nothing. `side` is used only to label
/// error messages.
///
/// # Errors
///
/// Returns a message when `segments` is missing or not an array, a segment is
/// not an object, a `chars` field is not an array, or a character entry is not
/// an object.
fn flattened_json_chars<'a>(
    object: &'a serde_json::Map<String, serde_json::Value>,
    side: &str,
) -> Result<Vec<&'a serde_json::Map<String, serde_json::Value>>, String> {
    let segments = json_array_field(object, "segments", &format!("{side} segments"))?;
    let mut chars = Vec::new();
    for (segment_index, segment) in segments.iter().enumerate() {
        let segment = segment.as_object().ok_or_else(|| {
            format!("JSON transcript {side} segment {segment_index} malformed: object expected")
        })?;
        let Some(segment_chars) = segment.get("chars") else {
            continue;
        };
        let entries = json_value_array(
            segment_chars,
            &format!("{side} segment {segment_index} chars"),
        )?;
        for (char_index, character) in entries.iter().enumerate() {
            match character.as_object() {
                Some(fields) => chars.push(fields),
                None => {
                    return Err(format!("JSON transcript {side} segment {segment_index} chars[{char_index}] malformed: object expected"));
                }
            }
        }
    }
    Ok(chars)
}
/// Looks up `key` in `object` and returns it as an array.
///
/// # Errors
///
/// Returns a "missing array" message when the key is absent and a "must be an
/// array" message when the value has another type; `label` names the field in
/// both.
fn json_array_field<'a>(
    object: &'a serde_json::Map<String, serde_json::Value>,
    key: &str,
    label: &str,
) -> Result<&'a Vec<serde_json::Value>, String> {
    match object.get(key) {
        Some(value) => json_value_array(value, label),
        None => Err(format!("JSON transcript missing array: {label}")),
    }
}
/// Borrows `value` as a JSON array.
///
/// # Errors
///
/// Returns a "must be an array" message, naming `label`, for any other JSON type.
fn json_value_array<'a>(
    value: &'a serde_json::Value,
    label: &str,
) -> Result<&'a Vec<serde_json::Value>, String> {
    match value {
        serde_json::Value::Array(items) => Ok(items),
        _ => Err(format!(
            "JSON transcript malformed field: {label} must be an array"
        )),
    }
}
/// Reads a mandatory string field from `object`.
///
/// # Errors
///
/// Returns a "missing" message when `key` is absent and a "must be a string"
/// message when the value has another type (including `null`); `label` names
/// the field in both.
fn required_json_string<'a>(
    object: &'a serde_json::Map<String, serde_json::Value>,
    key: &str,
    label: &str,
) -> Result<&'a str, String> {
    match object.get(key) {
        None => Err(format!("JSON transcript malformed field: {label} missing")),
        Some(serde_json::Value::String(text)) => Ok(text.as_str()),
        Some(_) => Err(format!(
            "JSON transcript malformed field: {label} must be a string"
        )),
    }
}
/// Reads an optional string field from `object`.
///
/// A missing key and an explicit `null` both yield `Ok(None)`.
///
/// # Errors
///
/// Returns a "must be a string" message, naming `label`, when the value is
/// present, non-null and not a string.
fn optional_json_string<'a>(
    object: &'a serde_json::Map<String, serde_json::Value>,
    key: &str,
    label: &str,
) -> Result<Option<&'a str>, String> {
    let Some(value) = object.get(key) else {
        return Ok(None);
    };
    if value.is_null() {
        return Ok(None);
    }
    match value.as_str() {
        Some(text) => Ok(Some(text)),
        None => Err(format!(
            "JSON transcript malformed field: {label} must be a string"
        )),
    }
}
/// Reads an optional numeric field from `object` as `f64`.
///
/// A missing key and an explicit `null` both yield `Ok(None)`.
///
/// # Errors
///
/// Returns a "must be a number" message, naming `label`, when the value is
/// present, non-null and cannot be read as `f64`.
fn optional_json_number(
    object: &serde_json::Map<String, serde_json::Value>,
    key: &str,
    label: &str,
) -> Result<Option<f64>, String> {
    let Some(value) = object.get(key) else {
        return Ok(None);
    };
    if value.is_null() {
        return Ok(None);
    }
    match value.as_f64() {
        Some(number) => Ok(Some(number)),
        None => Err(format!(
            "JSON transcript malformed field: {label} must be a number"
        )),
    }
}
/// Compares a mandatory timing field (`key`) between two JSON objects.
///
/// The field must be a number on both sides; a missing or `null` value is
/// reported as malformed, the expected side first. The values match when they
/// differ by at most `tolerance`. Returns the first problem found, or `None`.
fn compare_required_json_seconds(
    expected: &serde_json::Map<String, serde_json::Value>,
    actual: &serde_json::Map<String, serde_json::Value>,
    key: &str,
    label: &str,
    tolerance: f64,
) -> Option<String> {
    // `Err` carries the first difference so that `?` can short-circuit.
    let check = || -> Result<(), String> {
        let expected_seconds = optional_json_number(expected, key, &format!("{label} expected"))?
            .ok_or_else(|| format!("JSON transcript malformed field: {label} expected missing"))?;
        let actual_seconds = optional_json_number(actual, key, &format!("{label} actual"))?
            .ok_or_else(|| format!("JSON transcript malformed field: {label} actual missing"))?;

        if (expected_seconds - actual_seconds).abs() > tolerance {
            return Err(format!(
                "JSON transcript {label} timing differs: expected={expected_seconds:.3}s actual={actual_seconds:.3}s delta={:.3}s tolerance={tolerance:.3}s",
                (expected_seconds - actual_seconds).abs()
            ));
        }
        Ok(())
    };
    check().err()
}
/// Compares an optional timing field (`key`) between two JSON objects.
///
/// Absent (missing or `null`) on both sides counts as a match. Present on
/// exactly one side is reported as a shape difference. Present on both sides,
/// the values match when they differ by at most `tolerance`. Non-numeric
/// values are reported as malformed, the expected side first.
fn compare_optional_json_seconds(
    expected: &serde_json::Map<String, serde_json::Value>,
    actual: &serde_json::Map<String, serde_json::Value>,
    key: &str,
    label: &str,
    tolerance: f64,
) -> Option<String> {
    let expected_seconds = match optional_json_number(expected, key, &format!("{label} expected")) {
        Err(error) => return Some(error),
        Ok(seconds) => seconds,
    };
    let actual_seconds = match optional_json_number(actual, key, &format!("{label} actual")) {
        Err(error) => return Some(error),
        Ok(seconds) => seconds,
    };

    let (expected_seconds, actual_seconds) = match (expected_seconds, actual_seconds) {
        (None, None) => return None,
        (Some(expected_value), Some(actual_value)) => (expected_value, actual_value),
        _ => {
            return Some(format!(
                "JSON transcript {label} timing shape differs: expected={} actual={}",
                timing_shape(expected_seconds),
                timing_shape(actual_seconds)
            ));
        }
    };

    if (expected_seconds - actual_seconds).abs() > tolerance {
        return Some(format!(
            "JSON transcript {label} timing differs: expected={expected_seconds:.3}s actual={actual_seconds:.3}s delta={:.3}s tolerance={tolerance:.3}s",
            (expected_seconds - actual_seconds).abs()
        ));
    }
    None
}
/// Describes whether an optional timing value is set, for diagnostics:
/// `"present"` for `Some`, `"null"` for `None`.
fn timing_shape(value: Option<f64>) -> &'static str {
    match value {
        Some(_) => "present",
        None => "null",
    }
}
/// Describes the first difference between two outputs known to differ.
///
/// When both inputs are valid UTF-8 they are compared line by line and the
/// first mismatch is reported with its 1-based line number. If one side ends
/// while the other still has lines, the first line present on only one side is
/// reported, with `<end of output>` standing in for the exhausted side.
///
/// When no line-level difference exists (for example the inputs differ only in
/// line terminators) or either input is not valid UTF-8, a generic message with
/// both paths and byte lengths is returned.
fn first_output_difference(
    expected_path: &Path,
    actual_path: &Path,
    expected: &[u8],
    actual: &[u8],
) -> String {
    let expected_text = std::str::from_utf8(expected);
    let actual_text = std::str::from_utf8(actual);
    if let (Ok(expected_text), Ok(actual_text)) = (expected_text, actual_text) {
        let mut expected_lines = expected_text.lines();
        let mut actual_lines = actual_text.lines();
        let mut line_number = 1usize;
        loop {
            match (expected_lines.next(), actual_lines.next()) {
                (Some(expected_line), Some(actual_line)) => {
                    if expected_line != actual_line {
                        return format!(
                            "line {} differs: expected {:?}, actual {:?}",
                            line_number, expected_line, actual_line
                        );
                    }
                }
                // One side ran out of lines first: report the unmatched line
                // instead of falling through to the byte-count message.
                (Some(expected_line), None) => {
                    return format!(
                        "line {} differs: expected {:?}, actual <end of output>",
                        line_number, expected_line
                    );
                }
                (None, Some(actual_line)) => {
                    return format!(
                        "line {} differs: expected <end of output>, actual {:?}",
                        line_number, actual_line
                    );
                }
                (None, None) => break,
            }
            line_number += 1;
        }
    }
    format!(
        "output bytes differ: expected={} ({} bytes) actual={} ({} bytes)",
        expected_path.display(),
        expected.len(),
        actual_path.display(),
        actual.len()
    )
}
/// Renders the transcript of `response` in a single concrete `format`.
///
/// JSON formats honour `output.pretty_json`; subtitle formats receive
/// `output.subtitles`. `return_char_alignments` is forwarded to the WhisperX
/// JSON renderer only.
///
/// # Errors
///
/// Returns `InvalidConfig` for `OutputFormat::All`, which callers must expand
/// first, and propagates JSON serialization errors.
fn render_output(
    response: &TranscriptionPipelineResponse,
    format: OutputFormat,
    output: &OutputConfig,
    return_char_alignments: bool,
) -> Result<String, NativeWhisperxError> {
    let rendered = match format {
        OutputFormat::All => {
            return Err(NativeWhisperxError::InvalidConfig(
                "internal error: all output format must be expanded before rendering".to_string(),
            ));
        }
        OutputFormat::Json => {
            let value = whisperx_json_value(&response.transcript, return_char_alignments);
            if output.pretty_json {
                serde_json::to_string_pretty(&value)?
            } else {
                serde_json::to_string(&value)?
            }
        }
        OutputFormat::NativeJson => {
            if output.pretty_json {
                serde_json::to_string_pretty(&response.transcript)?
            } else {
                serde_json::to_string(&response.transcript)?
            }
        }
        OutputFormat::Srt => format_srt_with_options(&response.transcript, &output.subtitles),
        OutputFormat::Vtt => format_webvtt_with_options(&response.transcript, &output.subtitles),
        OutputFormat::Txt => format_txt(&response.transcript),
        OutputFormat::Tsv => format_tsv(&response.transcript),
        OutputFormat::Audacity => format_audacity_labels(&response.transcript),
    };
    Ok(rendered)
}
/// Builds the WhisperX-style JSON document for `transcript`.
///
/// Keys are inserted in this order: `text`, then `language` and `source` when
/// set, then `segments`, then `word_segments` (the words of all segments,
/// flattened in order). Per-segment `chars` are emitted only when
/// `return_char_alignments` is true.
pub(crate) fn whisperx_json_value(
    transcript: &TranscriptionContract,
    return_char_alignments: bool,
) -> serde_json::Value {
    let mut document = serde_json::Map::new();
    document.insert(
        "text".to_string(),
        serde_json::Value::String(transcript.text_or_joined()),
    );
    if let Some(language) = &transcript.language {
        document.insert(
            "language".to_string(),
            serde_json::Value::String(language.clone()),
        );
    }
    if let Some(source) = &transcript.source {
        document.insert(
            "source".to_string(),
            serde_json::Value::String(source.clone()),
        );
    }

    // One pass builds both the per-segment values and the flat word list.
    let mut segments = Vec::new();
    let mut words = Vec::new();
    for segment in transcript.segments.iter() {
        segments.push(whisperx_segment_value(segment, return_char_alignments));
        words.extend(segment.words.iter().map(whisperx_word_value));
    }

    document.insert("segments".to_string(), serde_json::Value::Array(segments));
    document.insert("word_segments".to_string(), serde_json::Value::Array(words));
    serde_json::Value::Object(document)
}
/// Builds the WhisperX-style JSON object for one segment.
///
/// Always emits `id` and `text`. `start`, `end`, `speaker` and `score` are
/// emitted only when set, `words` only when the segment has words, and `chars`
/// only when `return_char_alignments` is true and the segment has characters.
fn whisperx_segment_value(
    segment: &text_transcripts::TranscriptSegmentContract,
    return_char_alignments: bool,
) -> serde_json::Value {
    let mut fields = serde_json::Map::new();
    fields.insert("id".to_string(), serde_json::Value::from(segment.index));
    insert_seconds(&mut fields, "start", segment.start_seconds);
    insert_seconds(&mut fields, "end", segment.end_seconds);
    fields.insert(
        "text".to_string(),
        serde_json::Value::String(segment.text.clone()),
    );

    if let Some(speaker) = &segment.speaker {
        fields.insert(
            "speaker".to_string(),
            serde_json::Value::String(speaker.clone()),
        );
    }
    if let Some(confidence) = segment.confidence {
        fields.insert("score".to_string(), serde_json::Value::from(confidence));
    }

    if !segment.words.is_empty() {
        let words: Vec<serde_json::Value> =
            segment.words.iter().map(whisperx_word_value).collect();
        fields.insert("words".to_string(), serde_json::Value::Array(words));
    }
    if return_char_alignments && !segment.chars.is_empty() {
        let chars: Vec<serde_json::Value> =
            segment.chars.iter().map(whisperx_char_value).collect();
        fields.insert("chars".to_string(), serde_json::Value::Array(chars));
    }
    serde_json::Value::Object(fields)
}
/// Builds the WhisperX-style JSON object for one word.
///
/// Always emits `word`; `start`, `end`, `score` and `speaker` are emitted only
/// when set, in that order.
fn whisperx_word_value(word: &text_transcripts::TranscriptWordContract) -> serde_json::Value {
    let mut fields = serde_json::Map::new();
    fields.insert(
        "word".to_string(),
        serde_json::Value::String(word.text.clone()),
    );
    insert_seconds(&mut fields, "start", word.start_seconds);
    insert_seconds(&mut fields, "end", word.end_seconds);

    if let Some(confidence) = word.confidence {
        fields.insert("score".to_string(), serde_json::Value::from(confidence));
    }
    match &word.speaker {
        Some(speaker) => {
            fields.insert(
                "speaker".to_string(),
                serde_json::Value::String(speaker.clone()),
            );
        }
        None => {}
    }
    serde_json::Value::Object(fields)
}
/// Builds the WhisperX-style JSON object for one aligned character.
///
/// Always emits `char`; `start`, `end` and `score` are emitted only when set.
fn whisperx_char_value(character: &text_transcripts::TranscriptCharContract) -> serde_json::Value {
    let mut fields = serde_json::Map::new();
    fields.insert(
        "char".to_string(),
        serde_json::Value::String(character.character.clone()),
    );
    insert_seconds(&mut fields, "start", character.start_seconds);
    insert_seconds(&mut fields, "end", character.end_seconds);

    match character.confidence {
        Some(confidence) => {
            fields.insert("score".to_string(), serde_json::Value::from(confidence));
        }
        None => {}
    }
    serde_json::Value::Object(fields)
}
/// Inserts `value` under `key` when it is set; leaves `object` untouched for `None`.
fn insert_seconds(
    object: &mut serde_json::Map<String, serde_json::Value>,
    key: &str,
    value: Option<f64>,
) {
    let Some(seconds) = value else {
        return;
    };
    object.insert(key.to_string(), serde_json::Value::from(seconds));
}
/// Expands a requested format into the concrete formats to render.
///
/// `OutputFormat::All` becomes `Txt`, `Vtt`, `Srt`, `Tsv`, `Audacity` and
/// `Json`, in that order; any other format is returned as a one-element list.
pub(crate) fn expand_output_format(format: OutputFormat) -> Vec<OutputFormat> {
    if !matches!(format, OutputFormat::All) {
        return vec![format];
    }
    vec![
        OutputFormat::Txt,
        OutputFormat::Vtt,
        OutputFormat::Srt,
        OutputFormat::Tsv,
        OutputFormat::Audacity,
        OutputFormat::Json,
    ]
}
/// Renders the transcript as plain text, one trimmed segment per line.
///
/// Segments with a speaker are prefixed with `[speaker]: `. A trailing newline
/// is appended only when the joined text is non-empty.
fn format_txt(transcript: &TranscriptionContract) -> String {
    let mut lines = Vec::new();
    for segment in transcript.segments.iter() {
        let line = match &segment.speaker {
            Some(speaker) => format!("[{speaker}]: {}", segment.text.trim()),
            None => segment.text.trim().to_string(),
        };
        lines.push(line);
    }

    let mut text = lines.join("\n");
    // Emptiness is judged on the joined text, not on the segment count.
    if !text.is_empty() {
        text.push('\n');
    }
    text
}
/// Renders the transcript as tab-separated values.
///
/// Emits a `start\tend\ttext` header followed by one row per segment. Start
/// and end go through `seconds_to_millis`; the text is trimmed and its tabs
/// are replaced with spaces.
fn format_tsv(transcript: &TranscriptionContract) -> String {
    let rows = transcript.segments.iter().map(|segment| {
        let start = seconds_to_millis(segment.start_seconds);
        let end = seconds_to_millis(segment.end_seconds);
        format!(
            "{start}\t{end}\t{}\n",
            segment.text.trim().replace('\t', " ")
        )
    });

    let mut output = String::from("start\tend\ttext\n");
    output.extend(rows);
    output
}
/// Renders the transcript as tab-separated `start\tend\ttext` label lines.
///
/// A missing start defaults to `0.0`; a missing end defaults to the start, and
/// the end is never earlier than the start. The text is trimmed, its tabs are
/// replaced with spaces, and a speaker, when set, is prefixed as `[[speaker]]`.
fn format_audacity_labels(transcript: &TranscriptionContract) -> String {
    transcript
        .segments
        .iter()
        .map(|segment| {
            let start = segment.start_seconds.unwrap_or(0.0);
            let end = segment.end_seconds.unwrap_or(start).max(start);
            let text = segment.text.trim().replace('\t', " ");
            let text = match &segment.speaker {
                Some(speaker) => format!("[[{speaker}]]{}", text),
                None => text,
            };
            format!("{start}\t{end}\t{text}\n")
        })
        .collect::<String>()
}
/// Converts optional seconds to whole milliseconds.
///
/// `None` and negative values become `0`; other values are multiplied by 1000
/// and rounded to the nearest integer.
fn seconds_to_millis(seconds: Option<f64>) -> u64 {
    let non_negative = match seconds {
        Some(value) => value.max(0.0),
        None => 0.0,
    };
    // The float-to-integer `as` cast saturates, so huge values cannot wrap.
    (non_negative * 1000.0).round() as u64
}
/// Renders the transcript as SRT text.
///
/// Each cue from `subtitle_cues` becomes a block with a 1-based sequence
/// number, a `start --> end` line whose timestamps are formatted with a comma
/// separator, the cue text, and a blank line.
fn format_srt_with_options(
    transcript: &TranscriptionContract,
    subtitles: &SubtitleConfig,
) -> String {
    let cues = subtitle_cues(transcript, subtitles);
    let mut output = String::new();
    for (position, cue) in cues.iter().enumerate() {
        let sequence_number = (position + 1).to_string();
        let start = format_subtitle_timestamp(cue.start, true, ',');
        let end = format_subtitle_timestamp(cue.end, true, ',');

        output.push_str(&sequence_number);
        output.push('\n');
        output.push_str(&start);
        output.push_str(" --> ");
        output.push_str(&end);
        output.push('\n');
        output.push_str(&cue.text);
        output.push_str("\n\n");
    }
    output
}
/// Renders the transcript as WebVTT text.
///
/// Emits the `WEBVTT` header and a blank line, then for each cue from
/// `subtitle_cues` a `start --> end` line whose timestamps are formatted with a
/// dot separator, the cue text, and a blank line.
fn format_webvtt_with_options(
    transcript: &TranscriptionContract,
    subtitles: &SubtitleConfig,
) -> String {
    let cues = subtitle_cues(transcript, subtitles);
    let mut output = String::from("WEBVTT\n\n");
    for cue in cues.iter() {
        let start = format_subtitle_timestamp(cue.start, false, '.');
        let end = format_subtitle_timestamp(cue.end, false, '.');

        output.push_str(&start);
        output.push_str(" --> ");
        output.push_str(&end);
        output.push('\n');
        output.push_str(&cue.text);
        output.push_str("\n\n");
    }
    output
}
/// One subtitle cue ready to be rendered by the SRT / WebVTT formatters.
#[derive(Debug, Clone)]
struct SubtitleCue {
    /// Cue start; passed to `format_subtitle_timestamp` when rendering.
    start: f64,
    /// Cue end; passed to `format_subtitle_timestamp` when rendering.
    end: f64,
    /// Cue text, including any `[speaker]: ` prefix and embedded line breaks.
    text: String,
}
/// A word being laid out into a subtitle cue by `subtitle_word_cues`.
#[derive(Debug, Clone)]
struct SubtitleTiming {
    /// Word text; `subtitle_word_cues` may trim it and prepend `\n` to mark
    /// the start of a new line within the cue.
    word: String,
    /// Word start copied from the transcript word, if it has one.
    start: Option<f64>,
    /// Word end copied from the transcript word, if it has one.
    end: Option<f64>,
}
/// Produces the subtitle cues for `transcript`.
///
/// Returns no cues for a transcript without segments. When the first segment
/// has words, cue layout is delegated to `subtitle_word_cues`. Otherwise each
/// segment becomes one cue: a missing start defaults to `0.0`, a missing end
/// defaults to the start, the end is never earlier than the start, `-->` in the
/// text is replaced with `->`, and a speaker is prefixed as `[speaker]: `.
fn subtitle_cues(
    transcript: &TranscriptionContract,
    subtitles: &SubtitleConfig,
) -> Vec<SubtitleCue> {
    match transcript.segments.first() {
        None => return Vec::new(),
        Some(first_segment) if !first_segment.words.is_empty() => {
            return subtitle_word_cues(transcript, subtitles);
        }
        Some(_) => {}
    }

    let mut cues = Vec::new();
    for segment in transcript.segments.iter() {
        let start = segment.start_seconds.unwrap_or(0.0);
        let end = segment.end_seconds.unwrap_or(start).max(start);
        let text = segment.text.trim().replace("-->", "->");
        let text = match &segment.speaker {
            Some(speaker) => format!("[{speaker}]: {text}"),
            None => text,
        };
        cues.push(SubtitleCue { start, end, text });
    }
    cues
}
fn subtitle_word_cues(
transcript: &TranscriptionContract,
subtitles: &SubtitleConfig,
) -> Vec<SubtitleCue> {
let mut cues = Vec::new();
let raw_max_line_width = subtitles.max_line_width;
let max_line_count = subtitles.max_line_count;
let max_line_width = raw_max_line_width.unwrap_or(1000);
let preserve_segments = max_line_count.is_none() || raw_max_line_width.is_none();
let mut line_len = 0usize;
let mut line_count = 1usize;
let mut subtitle = Vec::<SubtitleTiming>::new();
let mut times = Vec::<(f64, f64, Option<String>)>::new();
let mut last = transcript
.segments
.first()
.and_then(|segment| segment.start_seconds)
.unwrap_or(0.0);
for segment in &transcript.segments {
for (word_index, original_timing) in segment.words.iter().enumerate() {
let mut timing = SubtitleTiming {
word: original_timing.text.clone(),
start: original_timing.start_seconds,
end: original_timing.end_seconds,
};
let long_pause = if preserve_segments {
false
} else {
timing.start.is_some_and(|start| start - last > 3.0)
};
let has_room = line_len + timing.word.chars().count() <= max_line_width;
let seg_break = word_index == 0 && !subtitle.is_empty() && preserve_segments;
if line_len > 0 && has_room && !long_pause && !seg_break {
line_len += timing.word.chars().count();
} else {
timing.word = timing.word.trim().to_string();
if (!subtitle.is_empty()
&& max_line_count.is_some()
&& (long_pause || line_count >= max_line_count.unwrap_or(0)))
|| seg_break
{
push_subtitle_cues(transcript, subtitles, &subtitle, ×, &mut cues);
subtitle.clear();
times.clear();
line_count = 1;
} else if line_len > 0 {
line_count += 1;
timing.word = format!("\n{}", timing.word);
}
line_len = timing.word.trim().chars().count();
}
subtitle.push(timing);
times.push((
segment.start_seconds.unwrap_or(0.0),
segment
.end_seconds
.unwrap_or_else(|| segment.start_seconds.unwrap_or(0.0)),
segment.speaker.clone(),
));
if let Some(start) = original_timing.start_seconds {
last = start;
}
}
}
if !subtitle.is_empty() {
push_subtitle_cues(transcript, subtitles, &subtitle, ×, &mut cues);
}
cues
}
/// Appends the cue or cues for one accumulated subtitle block to `cues`.
///
/// `subtitle` holds the words of the block and `times` the entries collected
/// alongside them. Only the first `times` entry is consulted: it supplies the
/// speaker label and the fallback start/end used when no word carries its own
/// timing. Nothing is emitted when `times` is empty.
///
/// Without word highlighting (or when no word has a start time) a single cue
/// spanning the earliest word start to the latest word end is pushed. With
/// highlighting, every word that has both a start and an end gets its own cue in
/// which that word is wrapped in `<u>…</u>`; a gap between the previous cue and
/// the next word is filled with a cue showing the plain, un-highlighted text.
fn push_subtitle_cues(
    transcript: &TranscriptionContract,
    subtitles: &SubtitleConfig,
    subtitle: &[SubtitleTiming],
    times: &[(f64, f64, Option<String>)],
    cues: &mut Vec<SubtitleCue>,
) {
    let Some(first_time) = times.first() else {
        return;
    };

    let speaker_prefix = match &first_time.2 {
        Some(name) => format!("[{name}]: "),
        None => String::new(),
    };

    // Block bounds come from the words themselves; the first `times` entry is
    // only a fallback for blocks without any timed word.
    let block_start = subtitle
        .iter()
        .filter_map(|entry| entry.start)
        .reduce(f64::min)
        .unwrap_or(first_time.0);
    let block_end = subtitle
        .iter()
        .filter_map(|entry| entry.end)
        .reduce(f64::max)
        .unwrap_or(first_time.1);

    let plain_text = format!(
        "{speaker_prefix}{}",
        subtitle_text_for_language(transcript, subtitle)
    );

    let any_word_timed = subtitle.iter().any(|entry| entry.start.is_some());
    if !(subtitles.highlight_words && any_word_timed) {
        cues.push(SubtitleCue {
            start: block_start,
            end: block_end,
            text: plain_text,
        });
        return;
    }

    // `cursor` is the formatted end of the previously emitted cue. Comparing
    // formatted strings means gaps are detected at millisecond resolution.
    let mut cursor = format_subtitle_timestamp(block_start, true, ',');
    for (active, timing) in subtitle.iter().enumerate() {
        let (word_start, word_end) = match (timing.start, timing.end) {
            (Some(begin), Some(finish)) => (begin, finish),
            // Words missing either bound cannot be highlighted on their own.
            _ => continue,
        };

        if cursor != format_subtitle_timestamp(word_start, true, ',') {
            cues.push(SubtitleCue {
                start: timestamp_to_seconds(&cursor),
                end: word_start,
                text: plain_text.clone(),
            });
        }

        // Highlighted text is always joined with a single space, independent of
        // the language-aware joining used for the plain text.
        let highlighted = subtitle
            .iter()
            .enumerate()
            .map(|(position, other)| {
                if position == active {
                    underline_word_preserving_leading_space(&other.word)
                } else {
                    other.word.clone()
                }
            })
            .collect::<Vec<_>>()
            .join(" ");
        cues.push(SubtitleCue {
            start: word_start,
            end: word_end,
            text: format!("{speaker_prefix}{highlighted}"),
        });

        cursor = format_subtitle_timestamp(word_end, true, ',');
    }
}
/// Joins the words of a subtitle block into its display text.
///
/// Words are concatenated without a separator when the transcript language is
/// `"ja"` or `"zh"`, and joined with a single space for every other language
/// (including an unset language).
fn subtitle_text_for_language(
    transcript: &TranscriptionContract,
    subtitle: &[SubtitleTiming],
) -> String {
    let separator = match transcript.language.as_deref() {
        Some("ja" | "zh") => "",
        _ => " ",
    };
    subtitle
        .iter()
        .map(|timing| timing.word.as_str())
        .collect::<Vec<_>>()
        .join(separator)
}
/// Wraps `word` in `<u>…</u>` while leaving any leading whitespace outside the
/// tag, so a line break or space prepended to the word stays in front of the
/// markup. Trailing whitespace remains inside the tag.
fn underline_word_preserving_leading_space(word: &str) -> String {
    let body = word.trim_start();
    // Whatever `trim_start` removed is exactly the leading-whitespace prefix.
    let padding = &word[..word.len() - body.len()];
    format!("{padding}<u>{body}</u>")
}
/// Formats a time offset in seconds as `[HH:]MM:SS<marker>mmm`.
///
/// Negative inputs are clamped to zero and the value is rounded to the nearest
/// millisecond. The hours field is written when `always_include_hours` is set or
/// when the value reaches at least one hour. `decimal_marker` separates the
/// seconds from the milliseconds.
fn format_subtitle_timestamp(
    seconds: f64,
    always_include_hours: bool,
    decimal_marker: char,
) -> String {
    let clamped_millis = (seconds.max(0.0) * 1_000.0).round() as u64;
    let (whole_seconds, millis) = (clamped_millis / 1_000, clamped_millis % 1_000);
    let (whole_minutes, secs) = (whole_seconds / 60, whole_seconds % 60);
    let (hours, minutes) = (whole_minutes / 60, whole_minutes % 60);

    let clock = format!("{minutes:02}:{secs:02}{decimal_marker}{millis:03}");
    if always_include_hours || hours > 0 {
        format!("{hours:02}:{clock}")
    } else {
        clock
    }
}
/// Parses an `HH:MM:SS` or `MM:SS` timestamp back into seconds.
///
/// A `,` decimal marker is accepted as well as `.`. Any field that does not
/// parse as a number counts as zero, and any input that does not have exactly
/// two or three colon-separated fields yields `0.0`.
fn timestamp_to_seconds(timestamp: &str) -> f64 {
    let normalized = timestamp.replace(',', ".");
    let field = |raw: &str| raw.parse::<f64>().unwrap_or(0.0);

    let mut fields = normalized.split(':');
    // Pull up to four fields so that inputs with too many parts are rejected.
    match (fields.next(), fields.next(), fields.next(), fields.next()) {
        (Some(hours), Some(minutes), Some(seconds), None) => {
            field(hours) * 3600.0 + field(minutes) * 60.0 + field(seconds)
        }
        (Some(minutes), Some(seconds), None, None) => field(minutes) * 60.0 + field(seconds),
        _ => 0.0,
    }
}
/// Derives an output basename from a transcript source path.
///
/// Returns the file stem of `source`, or `None` when the path has no stem, the
/// stem is not valid UTF-8, or the stem is blank after trimming.
///
/// The parameter stays `&String` because this function is passed by name to
/// `Option<&String>::and_then` when the output basename is resolved.
fn source_basename(source: &String) -> Option<String> {
    let stem = Path::new(source).file_stem()?.to_str()?;
    if stem.trim().is_empty() {
        None
    } else {
        Some(stem.to_owned())
    }
}
/// Collapses every run of whitespace in `text` into a single space and drops
/// leading and trailing whitespace.
pub(crate) fn normalize_space(text: &str) -> String {
    let mut collapsed = String::with_capacity(text.len());
    for token in text.split_whitespace() {
        // Tokens are never empty, so an empty buffer means this is the first one.
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(token);
    }
    collapsed
}
// Unit tests for output writing, subtitle rendering and expected-output
// comparison. Most tests write into a fresh temporary directory and read the
// produced files back.
#[cfg(test)]
mod tests {
use super::*;
use audio_analysis_transcription::TranscriptionPipelineResponse;
use crate::config::{
ExpectedOutputFile, OutputComparisonMode, OutputConfig, OutputFile, OutputFormat,
SegmentResolution, SubtitleConfig,
};
use crate::import_whisperx_json;
/// WhisperX JSON fixture embedded at compile time; the writer tests import it
/// to build their transcript.
const WHISPERX_SAMPLE: &[u8] =
include_bytes!("../../../tests/fixtures/whisperx-parity-sample.json");
/// Writes seven explicitly requested formats with char alignments enabled and
/// checks that each file exists and carries the expected content.
#[test]
fn writes_requested_outputs() {
let response = fixture_response_with_chars();
let temp = tempfile::tempdir().expect("tempdir");
let files = write_outputs_with_options(
&response,
&OutputConfig {
output_dir: Some(temp.path().to_path_buf()),
formats: vec![
OutputFormat::Json,
OutputFormat::NativeJson,
OutputFormat::Srt,
OutputFormat::Vtt,
OutputFormat::Txt,
OutputFormat::Tsv,
OutputFormat::Audacity,
],
basename: Some("sample".to_string()),
pretty_json: true,
subtitles: SubtitleConfig::default(),
},
true,
)
.expect("outputs should write");
assert_eq!(files.len(), 7);
let json_path = temp.path().join("sample.json");
let native_json_path = temp.path().join("sample.native.json");
let srt_path = temp.path().join("sample.srt");
let vtt_path = temp.path().join("sample.vtt");
let txt_path = temp.path().join("sample.txt");
let tsv_path = temp.path().join("sample.tsv");
let aud_path = temp.path().join("sample.aud");
assert!(json_path.is_file());
assert!(native_json_path.is_file());
assert!(srt_path.is_file());
assert!(vtt_path.is_file());
assert!(txt_path.is_file());
assert!(tsv_path.is_file());
assert!(aud_path.is_file());
// The WhisperX-style JSON is expected to use `start`/`end` keys and to omit
// the `startSeconds` key.
let whisperx_json: serde_json::Value =
serde_json::from_slice(&fs::read(json_path).expect("json"))
.expect("valid whisperx json");
assert!(whisperx_json.get("segments").is_some());
assert!(whisperx_json.get("word_segments").is_some());
assert!(whisperx_json["segments"][0].get("start").is_some());
assert!(whisperx_json["segments"][0].get("end").is_some());
assert!(whisperx_json["segments"][0].get("startSeconds").is_none());
assert_eq!(whisperx_json["segments"][0]["chars"][0]["char"], "h");
// The native JSON is expected to expose `startSeconds` on its segments.
let native_json: serde_json::Value =
serde_json::from_slice(&fs::read(native_json_path).expect("native json"))
.expect("valid native json");
assert!(native_json["segments"][0].get("startSeconds").is_some());
assert!(native_json["segments"][0].get("chars").is_some());
let txt = fs::read_to_string(txt_path).expect("txt");
assert_eq!(
txt,
"[SPEAKER_00]: hello world\n[SPEAKER_01]: second speaker\n"
);
let srt = fs::read_to_string(srt_path).expect("srt");
assert!(srt.contains("00:00:00,000 --> 00:00:01,100"));
assert!(srt.contains("[SPEAKER_00]: hello world"));
let vtt = fs::read_to_string(vtt_path).expect("vtt");
assert!(vtt.starts_with("WEBVTT\n\n"));
assert!(vtt.contains("00:01.350 --> 00:02.350"));
assert!(vtt.contains("[SPEAKER_01]: second speaker"));
let tsv = fs::read_to_string(tsv_path).expect("tsv");
assert!(tsv.starts_with("start\tend\ttext\n"));
assert!(tsv.contains("0\t1200\thello world"));
assert!(tsv.contains("1350\t2400\tsecond speaker"));
let aud = fs::read_to_string(aud_path).expect("aud");
assert!(aud.contains("0\t1.2\t[[SPEAKER_00]]hello world"));
assert!(aud.contains("1.35\t2.4\t[[SPEAKER_01]]second speaker"));
}
/// Requests `OutputFormat::All` and checks the exact set of file names written:
/// six files, with no `.native.json` among them.
#[test]
fn all_format_writes_whisperx_compatible_set_without_native_json() {
let response = fixture_response_with_chars();
let temp = tempfile::tempdir().expect("tempdir");
write_outputs_with_options(
&response,
&OutputConfig {
output_dir: Some(temp.path().to_path_buf()),
formats: vec![OutputFormat::All],
basename: Some("sample".to_string()),
pretty_json: true,
subtitles: SubtitleConfig::default(),
},
true,
)
.expect("outputs should write");
let mut names = fs::read_dir(temp.path())
.expect("read output dir")
.map(|entry| {
entry
.expect("dir entry")
.file_name()
.to_string_lossy()
.into_owned()
})
.collect::<Vec<_>>();
// Directory listing order is not relied upon; sort before comparing.
names.sort();
assert_eq!(
names,
vec![
"sample.aud",
"sample.json",
"sample.srt",
"sample.tsv",
"sample.txt",
"sample.vtt",
]
);
}
/// Writes two responses into the same directory without an explicit basename
/// and checks that each set of files is named after its own source file stem.
#[test]
fn output_stems_keep_multi_input_writes_collision_safe() {
let temp = tempfile::tempdir().expect("tempdir");
let mut first = fixture_response_with_chars();
first.transcript.source = Some("audio/first-input.wav".to_string());
let mut second = fixture_response_with_chars();
second.transcript.source = Some("audio/second-input.wav".to_string());
let output = OutputConfig {
output_dir: Some(temp.path().to_path_buf()),
formats: vec![OutputFormat::All],
basename: None,
pretty_json: true,
subtitles: SubtitleConfig::default(),
};
write_outputs_with_options(&first, &output, true).expect("first outputs should write");
write_outputs_with_options(&second, &output, true).expect("second outputs should write");
let mut names = fs::read_dir(temp.path())
.expect("read output dir")
.map(|entry| {
entry
.expect("dir entry")
.file_name()
.to_string_lossy()
.into_owned()
})
.collect::<Vec<_>>();
names.sort();
assert_eq!(
names,
vec![
"first-input.aud",
"first-input.json",
"first-input.srt",
"first-input.tsv",
"first-input.txt",
"first-input.vtt",
"second-input.aud",
"second-input.json",
"second-input.srt",
"second-input.tsv",
"second-input.txt",
"second-input.vtt",
]
);
}
/// Clears every segment's speaker and checks that the TXT output is one line
/// per segment with no speaker prefix.
#[test]
fn txt_writes_each_segment_without_speakers() {
let mut response = fixture_response_with_chars();
for segment in &mut response.transcript.segments {
segment.speaker = None;
}
let temp = tempfile::tempdir().expect("tempdir");
write_outputs(
&response,
&OutputConfig {
output_dir: Some(temp.path().to_path_buf()),
formats: vec![OutputFormat::Txt],
basename: Some("sample".to_string()),
..OutputConfig::default()
},
)
.expect("txt should write");
let txt = fs::read_to_string(temp.path().join("sample.txt")).expect("txt");
assert_eq!(txt, "hello world\nsecond speaker\n");
}
/// Puts a tab and surrounding spaces into the first segment's text and checks
/// that the TSV row contains `hello world` with a single space instead.
#[test]
fn tsv_includes_header_and_replaces_tabs() {
let mut response = fixture_response_with_chars();
response.transcript.segments[0].text = " hello\tworld ".to_string();
let temp = tempfile::tempdir().expect("tempdir");
write_outputs(
&response,
&OutputConfig {
output_dir: Some(temp.path().to_path_buf()),
formats: vec![OutputFormat::Tsv],
basename: Some("sample".to_string()),
..OutputConfig::default()
},
)
.expect("tsv should write");
let tsv = fs::read_to_string(temp.path().join("sample.tsv")).expect("tsv");
assert!(tsv.starts_with("start\tend\ttext\n"));
assert!(tsv.contains("0\t1200\thello world\n"));
}
/// Enables word highlighting with a line width of 8 and no line-count limit,
/// then checks that the SRT output contains one cue per highlighted word with
/// the second word wrapped onto a new line.
#[test]
fn subtitle_options_highlight_and_wrap_text() {
let response = fixture_response_with_chars();
let temp = tempfile::tempdir().expect("tempdir");
write_outputs(
&response,
&OutputConfig {
output_dir: Some(temp.path().to_path_buf()),
formats: vec![OutputFormat::Srt, OutputFormat::Vtt],
basename: Some("sample".to_string()),
pretty_json: true,
subtitles: SubtitleConfig {
max_line_width: Some(8),
max_line_count: None,
highlight_words: true,
segment_resolution: SegmentResolution::Sentence,
},
},
)
.expect("subtitles should write");
let srt = fs::read_to_string(temp.path().join("sample.srt")).expect("srt");
assert!(srt.contains("<u>hello</u>"));
assert!(srt.contains("[SPEAKER_00]: <u>hello</u> \nworld"));
assert!(srt.contains("[SPEAKER_00]: hello \n<u>world</u>"));
let vtt = fs::read_to_string(temp.path().join("sample.vtt")).expect("vtt");
assert!(vtt.contains("<u>hello</u>"));
}
/// Limits subtitles to one line of width 8 and checks that each of the four
/// fixture words ends up in its own numbered SRT cue.
#[test]
fn subtitle_max_line_count_merges_overflow() {
let response = fixture_response_with_chars();
let temp = tempfile::tempdir().expect("tempdir");
write_outputs(
&response,
&OutputConfig {
output_dir: Some(temp.path().to_path_buf()),
formats: vec![OutputFormat::Srt],
basename: Some("sample".to_string()),
pretty_json: true,
subtitles: SubtitleConfig {
max_line_width: Some(8),
max_line_count: Some(1),
highlight_words: false,
segment_resolution: SegmentResolution::Sentence,
},
},
)
.expect("subtitles should write");
let srt = fs::read_to_string(temp.path().join("sample.srt")).expect("srt");
assert!(srt.contains("[SPEAKER_00]: hello\n\n2"));
assert!(srt.contains("[SPEAKER_00]: world\n\n3"));
assert!(srt.contains("[SPEAKER_01]: second\n\n4"));
assert!(srt.contains("[SPEAKER_01]: speaker\n\n"));
}
/// Sets the transcript language to `ja` and checks that the words of the first
/// segment are joined without a space in the SRT output.
#[test]
fn subtitle_word_cues_join_languages_without_spaces() {
let mut response = fixture_response_with_chars();
response.transcript.language = Some("ja".to_string());
response.transcript.segments[0].speaker = None;
let temp = tempfile::tempdir().expect("tempdir");
write_outputs(
&response,
&OutputConfig {
output_dir: Some(temp.path().to_path_buf()),
formats: vec![OutputFormat::Srt],
basename: Some("sample".to_string()),
pretty_json: true,
subtitles: SubtitleConfig::default(),
},
)
.expect("subtitles should write");
let srt = fs::read_to_string(temp.path().join("sample.srt")).expect("srt");
assert!(srt.contains("helloworld"));
}
/// Checks that `whisperx_json_value` emits a segment's `chars` key only when
/// its boolean argument is `true`.
#[test]
fn whisperx_json_omits_chars_when_not_requested() {
let mut transcript = import_whisperx_json(WHISPERX_SAMPLE).expect("fixture should import");
transcript.segments[0]
.chars
.push(text_transcripts::TranscriptCharContract {
character: "h".to_string(),
start_seconds: Some(0.0),
end_seconds: Some(0.1),
confidence: Some(0.9),
attributes: Default::default(),
});
let without_chars = whisperx_json_value(&transcript, false);
let with_chars = whisperx_json_value(&transcript, true);
assert!(without_chars["segments"][0].get("chars").is_none());
assert!(with_chars["segments"][0].get("chars").is_some());
}
/// Runs `compare_expected_outputs` over four expectations and checks one
/// outcome each: an exact-mode text mismatch, a semantically equal JSON pair, a
/// format with no actual output, and an expected file that was never written.
#[test]
fn output_comparison_reports_exact_json_and_missing_outputs() {
let temp = tempfile::tempdir().expect("tempdir");
let expected_txt = temp.path().join("expected.txt");
let actual_txt = temp.path().join("actual.txt");
let expected_json = temp.path().join("expected.json");
let actual_json = temp.path().join("actual.json");
let missing_expected = temp.path().join("missing.srt");
let actual_srt = temp.path().join("actual.srt");
fs::write(&expected_txt, "hello\n").expect("expected txt");
fs::write(&actual_txt, "hello changed\n").expect("actual txt");
// Same JSON value, different formatting.
fs::write(&expected_json, "{\n \"a\": 1\n}\n").expect("expected json");
fs::write(&actual_json, "{\"a\":1}").expect("actual json");
// `missing_expected` is deliberately never written to disk.
fs::write(&actual_srt, "1\n").expect("actual srt");
let actual_outputs = vec![
OutputFile {
format: OutputFormat::Txt,
path: actual_txt,
},
OutputFile {
format: OutputFormat::Json,
path: actual_json,
},
OutputFile {
format: OutputFormat::Srt,
path: actual_srt,
},
];
let comparisons = compare_expected_outputs(
&actual_outputs,
&[
ExpectedOutputFile {
format: OutputFormat::Txt,
path: expected_txt,
comparison: OutputComparisonMode::Exact,
gating: true,
},
ExpectedOutputFile {
format: OutputFormat::Json,
path: expected_json,
comparison: OutputComparisonMode::JsonSemantic,
gating: true,
},
ExpectedOutputFile {
format: OutputFormat::Vtt,
path: temp.path().join("expected.vtt"),
comparison: OutputComparisonMode::Exact,
gating: true,
},
ExpectedOutputFile {
format: OutputFormat::Srt,
path: missing_expected,
comparison: OutputComparisonMode::Exact,
gating: true,
},
],
)
.expect("comparison should run");
assert!(!comparisons[0].passed);
assert!(comparisons[0]
.difference
.as_deref()
.is_some_and(|difference| difference.contains("line 1 differs")));
assert!(comparisons[1].passed);
// No VTT file is listed among the actual outputs.
assert!(!comparisons[2].passed);
assert!(comparisons[2]
.difference
.as_deref()
.is_some_and(|difference| difference.contains("missing actual")));
assert!(!comparisons[3].passed);
assert!(comparisons[3]
.difference
.as_deref()
.is_some_and(|difference| difference.contains("missing expected")));
}
/// Checks that the two semantic fixtures compare as equal even though they
/// differ in word spacing, small timing offsets and extra fields.
#[test]
fn output_json_semantic_compares_whisperx_transcript_contract() {
let difference =
compare_json_output_values(semantic_expected_whisperx_json(), semantic_actual_json());
assert_eq!(difference, None);
}
/// Changes one word's text in the actual JSON and expects a word-text
/// difference to be reported for index 1.
#[test]
fn output_json_semantic_fails_changed_word_text() {
let expected = semantic_expected_whisperx_json();
let mut actual = semantic_actual_json();
actual["word_segments"][1]["word"] = serde_json::json!("planet");
let difference = compare_json_output_values(expected, actual).expect("should differ");
assert!(difference.contains("JSON transcript word 1 text differs"));
}
/// Moves a word start by 0.2s and expects a word-timing difference whose
/// message reports a tolerance of 0.050s.
#[test]
fn output_json_semantic_fails_word_timing_beyond_tolerance() {
let expected = semantic_expected_whisperx_json();
let mut actual = semantic_actual_json();
actual["word_segments"][0]["start"] = serde_json::json!(0.200);
let difference = compare_json_output_values(expected, actual).expect("should differ");
assert!(difference.contains("JSON transcript word 0 start timing differs"));
assert!(difference.contains("tolerance=0.050s"));
}
/// Moves a segment end to 1.5s and expects a segment-timing difference whose
/// message reports a tolerance of 0.100s.
#[test]
fn output_json_semantic_fails_segment_timing_beyond_tolerance() {
let expected = semantic_expected_whisperx_json();
let mut actual = semantic_actual_json();
actual["segments"][0]["end"] = serde_json::json!(1.500);
let difference = compare_json_output_values(expected, actual).expect("should differ");
assert!(difference.contains("JSON transcript segment 0 end timing differs"));
assert!(difference.contains("tolerance=0.100s"));
}
/// Replaces the actual segment's two chars with a single one and expects a
/// char-count difference to be reported.
#[test]
fn output_json_semantic_fails_char_count_mismatch_when_chars_present() {
let expected = semantic_expected_whisperx_json();
let mut actual = semantic_actual_json();
actual["segments"][0]["chars"] = serde_json::json!([
{
"char": "h",
"start": 0.002,
"end": 0.098
}
]);
let difference = compare_json_output_values(expected, actual).expect("should differ");
assert!(difference.contains("JSON transcript char count differs"));
}
/// Test helper: writes both JSON values into a temporary directory (the
/// expected one compact, the actual one pretty-printed) and returns the result
/// of `compare_output_json` on the two files.
fn compare_json_output_values(
expected: serde_json::Value,
actual: serde_json::Value,
) -> Option<String> {
let temp = tempfile::tempdir().expect("tempdir");
let expected_path = temp.path().join("expected.json");
let actual_path = temp.path().join("actual.json");
fs::write(
&expected_path,
serde_json::to_string(&expected).expect("expected json"),
)
.expect("write expected json");
fs::write(
&actual_path,
serde_json::to_string_pretty(&actual).expect("actual json"),
)
.expect("write actual json");
compare_output_json(&expected_path, &actual_path).expect("json comparison")
}
/// Test fixture: the expected side of the semantic JSON comparison, with one
/// segment, two words and two chars (the second char has null timings).
fn semantic_expected_whisperx_json() -> serde_json::Value {
serde_json::json!({
"language": "en",
"segments": [
{
"start": 0.0,
"end": 1.2,
"text": " hello world",
"avg_logprob": -0.1,
"no_speech_prob": 0.01,
"words": [
{
"word": " hello",
"start": 0.0,
"end": 0.5,
"score": 0.9876
},
{
"word": "world",
"start": 0.55,
"end": 1.2,
"score": 0.902
}
],
"chars": [
{
"char": "h",
"start": 0.0,
"end": 0.1
},
{
"char": "i",
"start": null,
"end": null
}
]
}
],
"word_segments": [
{
"word": " hello",
"start": 0.0,
"end": 0.5,
"score": 0.9876
},
{
"word": "world",
"start": 0.55,
"end": 1.2,
"score": 0.902
}
]
})
}
/// Test fixture: the actual side of the semantic JSON comparison. It carries
/// the same words as the expected fixture but with different leading spaces,
/// timings shifted by a few milliseconds, extra top-level and segment fields,
/// and a second char that omits its timing keys entirely.
fn semantic_actual_json() -> serde_json::Value {
serde_json::json!({
"text": "hello world",
"source": "sample.wav",
"language": "en",
"segments": [
{
"id": 0,
"start": 0.004,
"end": 1.196,
"text": "hello world",
"score": 0.95,
"words": [
{
"word": "hello",
"start": 0.002,
"end": 0.501,
"score": 0.987
},
{
"word": " world",
"start": 0.552,
"end": 1.198,
"score": 0.9025
}
],
"chars": [
{
"char": "h",
"start": 0.002,
"end": 0.098
},
{
"char": "i"
}
]
}
],
"word_segments": [
{
"word": "hello",
"start": 0.002,
"end": 0.501,
"score": 0.987
},
{
"word": " world",
"start": 0.552,
"end": 1.198,
"score": 0.9025
}
]
})
}
/// Test fixture: imports the embedded WhisperX sample, appends one char
/// alignment (`h`, 0.0–0.1s) to the first segment and wraps the transcript in
/// an otherwise empty pipeline response.
fn fixture_response_with_chars() -> TranscriptionPipelineResponse {
let mut transcript = import_whisperx_json(WHISPERX_SAMPLE).expect("fixture should import");
transcript.segments[0]
.chars
.push(text_transcripts::TranscriptCharContract {
character: "h".to_string(),
start_seconds: Some(0.0),
end_seconds: Some(0.1),
confidence: Some(0.9),
attributes: Default::default(),
});
TranscriptionPipelineResponse {
accepted: true,
operation: "audio.transcription.transcribe".to_string(),
provider: "fixture".to_string(),
model_id: "fixture".to_string(),
transcript,
vad_segments: Vec::new(),
alignment: None,
diarization: None,
artifacts: Vec::new(),
diagnostics: Vec::new(),
}
}
}