use std::fs;
use std::path::Path;
use audio_analysis_transcription::TranscriptionPipelineResponse;
use text_transcripts::TranscriptionContract;
use crate::config::{
ExpectedOutputComparison, ExpectedOutputFile, NativeWhisperxError, OutputComparisonMode,
OutputConfig, OutputFile, OutputFormat, ParityTolerance, SubtitleConfig,
};
/// Writes every configured output format for `response`, without character alignments.
///
/// This is the public entry point; it forwards to `write_outputs_with_options` with
/// `return_char_alignments` switched off.
///
/// # Errors
///
/// Propagates whatever error `write_outputs_with_options` reports.
pub fn write_outputs(
    response: &TranscriptionPipelineResponse,
    output: &OutputConfig,
) -> Result<Vec<OutputFile>, NativeWhisperxError> {
    let return_char_alignments = false;
    write_outputs_with_options(response, output, return_char_alignments)
}
/// Renders `response` in each requested format and writes the files into `output.output_dir`.
///
/// Returns an empty list when no output directory is configured. The file stem is
/// `output.basename` when set, otherwise whatever `source_basename` derives from the
/// transcript source, otherwise the literal `"transcript"`. Each requested format is first
/// expanded through `expand_output_format`.
///
/// # Errors
///
/// Fails on the first directory-creation, rendering, or file-write error; formats after
/// the failing one are not written.
pub(crate) fn write_outputs_with_options(
    response: &TranscriptionPipelineResponse,
    output: &OutputConfig,
    return_char_alignments: bool,
) -> Result<Vec<OutputFile>, NativeWhisperxError> {
    let output_dir = match &output.output_dir {
        Some(dir) => dir,
        None => return Ok(Vec::new()),
    };
    fs::create_dir_all(output_dir)?;

    let basename = match output.basename.clone() {
        Some(name) => name,
        None => response
            .transcript
            .source
            .as_ref()
            .and_then(source_basename)
            .unwrap_or_else(|| "transcript".to_string()),
    };

    let mut written = Vec::new();
    for format in output.formats.iter().copied().flat_map(expand_output_format) {
        let path = output_dir.join(format!("{basename}.{}", format.extension()));
        let contents = render_output(response, format, output, return_char_alignments)?;
        fs::write(&path, contents)?;
        written.push(OutputFile { format, path });
    }
    Ok(written)
}
/// Compares each expected output file against the actual output of the same format.
///
/// Produces one `ExpectedOutputComparison` per entry of `expected_outputs`, in order.
/// When no actual output has the expected format, the entry is marked as failed with a
/// "missing actual" difference instead of raising an error.
///
/// # Errors
///
/// Propagates errors from the per-mode comparison helpers.
pub(crate) fn compare_expected_outputs(
    actual_outputs: &[OutputFile],
    expected_outputs: &[ExpectedOutputFile],
) -> Result<Vec<ExpectedOutputComparison>, NativeWhisperxError> {
    let mut results = Vec::with_capacity(expected_outputs.len());
    for expected in expected_outputs {
        let matching = actual_outputs
            .iter()
            .find(|candidate| candidate.format == expected.format);

        let (passed, difference, actual_path) = match matching {
            None => (
                false,
                Some(format!("missing actual {:?} output", expected.format)),
                None,
            ),
            Some(found) => {
                let difference = match expected.comparison {
                    OutputComparisonMode::Exact => {
                        compare_output_bytes(&expected.path, &found.path)?
                    }
                    OutputComparisonMode::JsonSemantic => {
                        compare_output_json(&expected.path, &found.path)?
                    }
                    OutputComparisonMode::SubtitleSemantic => {
                        compare_output_subtitles(&expected.path, &found.path)?
                    }
                };
                (difference.is_none(), difference, Some(found.path.clone()))
            }
        };

        results.push(ExpectedOutputComparison {
            format: expected.format,
            comparison: expected.comparison,
            gating: expected.gating,
            expected_path: expected.path.clone(),
            actual_path,
            passed,
            difference,
        });
    }
    Ok(results)
}
fn compare_output_bytes(
expected_path: &Path,
actual_path: &Path,
) -> Result<Option<String>, NativeWhisperxError> {
let expected = match fs::read(expected_path) {
Ok(bytes) => bytes,
Err(error) if error.kind() == std::io::ErrorKind::NotFound => {
return Ok(Some(format!(
"missing expected output {}",
expected_path.display()
)));
}
Err(error) => return Err(NativeWhisperxError::Io(error)),
};
let actual = fs::read(actual_path)?;
if expected == actual {
return Ok(None);
}
Ok(Some(first_output_difference(
expected_path,
actual_path,
&expected,
&actual,
)))
}
pub(crate) fn compare_output_json(
expected_path: &Path,
actual_path: &Path,
) -> Result<Option<String>, NativeWhisperxError> {
let expected = match fs::read(expected_path) {
Ok(bytes) => bytes,
Err(error) if error.kind() == std::io::ErrorKind::NotFound => {
return Ok(Some(format!(
"missing expected output {}",
expected_path.display()
)));
}
Err(error) => return Err(NativeWhisperxError::Io(error)),
};
let actual = fs::read(actual_path)?;
let expected_json: serde_json::Value = serde_json::from_slice(&expected)?;
let actual_json: serde_json::Value = serde_json::from_slice(&actual)?;
if expected_json == actual_json {
return Ok(None);
}
if looks_like_whisperx_transcript_json(&expected_json)
&& looks_like_whisperx_transcript_json(&actual_json)
{
return Ok(compare_whisperx_transcript_json(
&expected_json,
&actual_json,
ParityTolerance::default(),
));
}
Ok(Some(format!(
"JSON output differs: expected={} actual={}",
expected_path.display(),
actual_path.display()
)))
}
/// One cue recovered from a subtitle file by `parse_subtitle_block`, used for the
/// tolerance-based subtitle comparison.
#[derive(Debug, Clone, PartialEq)]
struct ParsedSubtitleCue {
/// Cue start, as produced by `timestamp_to_seconds` from the left side of the `-->` line.
start: f64,
/// Cue end, as produced by `timestamp_to_seconds` from the right side of the `-->` line.
end: f64,
/// Cue text after the timing line, joined with spaces and passed through
/// `normalize_subtitle_text`.
text: String,
}
fn compare_output_subtitles(
expected_path: &Path,
actual_path: &Path,
) -> Result<Option<String>, NativeWhisperxError> {
let expected = match fs::read_to_string(expected_path) {
Ok(text) => text,
Err(error) if error.kind() == std::io::ErrorKind::NotFound => {
return Ok(Some(format!(
"missing expected output {}",
expected_path.display()
)));
}
Err(error) => return Err(NativeWhisperxError::Io(error)),
};
let actual = fs::read_to_string(actual_path)?;
let expected_cues = parse_subtitle_cues(&expected);
let actual_cues = parse_subtitle_cues(&actual);
if expected_cues.len() != actual_cues.len() {
return Ok(Some(format!(
"subtitle cue count differs: expected={} actual={}",
expected_cues.len(),
actual_cues.len()
)));
}
let tolerance = ParityTolerance::default().word_seconds;
for (index, (expected, actual)) in expected_cues.iter().zip(actual_cues.iter()).enumerate() {
if let Some(difference) =
compare_subtitle_seconds(index, "start", expected.start, actual.start, tolerance)
{
return Ok(Some(difference));
}
if let Some(difference) =
compare_subtitle_seconds(index, "end", expected.end, actual.end, tolerance)
{
return Ok(Some(difference));
}
if expected.text != actual.text {
return Ok(Some(format!(
"subtitle cue {index} text differs: expected {:?} actual {:?}",
expected.text, actual.text
)));
}
}
Ok(None)
}
/// Checks one subtitle timestamp against its expected value.
///
/// Returns `None` when the absolute difference is at most `tolerance`, otherwise a message
/// naming the cue `index` and the `field` ("start" or "end" at the call sites).
fn compare_subtitle_seconds(
    index: usize,
    field: &str,
    expected: f64,
    actual: f64,
    tolerance: f64,
) -> Option<String> {
    let delta = (expected - actual).abs();
    if delta <= tolerance {
        return None;
    }
    let message = format!(
        "subtitle cue {index} {field} differs: expected={expected:.3} actual={actual:.3} delta={delta:.3} tolerance={tolerance:.3}"
    );
    Some(message)
}
fn parse_subtitle_cues(text: &str) -> Vec<ParsedSubtitleCue> {
let normalized = text.replace("\r\n", "\n");
normalized
.split("\n\n")
.filter_map(parse_subtitle_block)
.collect()
}
/// Parses one subtitle block into a cue.
///
/// Lines are trimmed; empty lines and a bare `WEBVTT` line are ignored. The first line
/// containing `-->` is the timing line; everything before it is discarded, and everything
/// after it is joined with single spaces to form the cue text. Returns `None` when there
/// is no timing line or it cannot be parsed.
fn parse_subtitle_block(block: &str) -> Option<ParsedSubtitleCue> {
    let meaningful: Vec<&str> = block
        .lines()
        .map(str::trim)
        .filter(|line| !(line.is_empty() || *line == "WEBVTT"))
        .collect();
    let timing_index = meaningful.iter().position(|line| line.contains("-->"))?;
    let (start, end) = parse_subtitle_timing_line(meaningful[timing_index])?;
    let body = meaningful[timing_index + 1..].join(" ");
    let text = normalize_subtitle_text(&body);
    Some(ParsedSubtitleCue { start, end, text })
}
/// Extracts the start and end timestamps from a `start --> end [settings]` line.
///
/// The end timestamp is the first whitespace-delimited token after the arrow, so anything
/// trailing it is ignored. Returns `None` when the arrow or the end token is missing.
fn parse_subtitle_timing_line(line: &str) -> Option<(f64, f64)> {
    const ARROW: &str = "-->";
    let arrow_at = line.find(ARROW)?;
    let start_text = line[..arrow_at].trim();
    let end_text = line[arrow_at + ARROW.len()..].split_whitespace().next()?;
    let start = timestamp_to_seconds(start_text);
    let end = timestamp_to_seconds(end_text);
    Some((start, end))
}
/// Strips `<u>`/`</u>` highlight tags from cue text and hands the result to
/// `normalize_space`.
fn normalize_subtitle_text(text: &str) -> String {
    let without_opening = text.replace("<u>", "");
    let without_tags = without_opening.replace("</u>", "");
    normalize_space(&without_tags)
}
/// Heuristic: is `value` a JSON object shaped like a WhisperX transcript?
///
/// True when the object has a `segments` key, a `word_segments` key, or both `language`
/// and `text`. Non-objects never match.
fn looks_like_whisperx_transcript_json(value: &serde_json::Value) -> bool {
    let Some(object) = value.as_object() else {
        return false;
    };
    if object.contains_key("segments") || object.contains_key("word_segments") {
        return true;
    }
    object.contains_key("language") && object.contains_key("text")
}
/// Field-by-field comparison of two WhisperX transcript documents.
///
/// Checks run in a fixed order (language, segments, words, then characters when either
/// side carries `chars`) and the first difference found is returned. `None` means the
/// transcripts agree within `tolerance`.
fn compare_whisperx_transcript_json(
    expected: &serde_json::Value,
    actual: &serde_json::Value,
    tolerance: ParityTolerance,
) -> Option<String> {
    let Some(expected_object) = expected.as_object() else {
        return Some("JSON transcript malformed: expected top-level object".to_string());
    };
    let Some(actual_object) = actual.as_object() else {
        return Some("JSON transcript malformed: actual top-level object".to_string());
    };

    compare_json_language(expected_object, actual_object)
        .or_else(|| compare_json_segments(expected_object, actual_object, tolerance))
        .or_else(|| compare_json_words(expected_object, actual_object, tolerance))
        .or_else(|| {
            let has_chars =
                json_contains_chars(expected_object) || json_contains_chars(actual_object);
            if has_chars {
                compare_json_chars(expected_object, actual_object, tolerance)
            } else {
                None
            }
        })
}
/// Compares the optional top-level `language` strings of two transcripts.
///
/// A missing or `null` language on both sides counts as equal. A non-string language is
/// reported as a malformed-field difference.
fn compare_json_language(
    expected: &serde_json::Map<String, serde_json::Value>,
    actual: &serde_json::Map<String, serde_json::Value>,
) -> Option<String> {
    // Run the checks as a fallible sequence; the first `Err` is the reported difference.
    let check = || -> Result<(), String> {
        let expected_language = optional_json_string(expected, "language", "expected language")?;
        let actual_language = optional_json_string(actual, "language", "actual language")?;
        if expected_language == actual_language {
            Ok(())
        } else {
            Err(format!(
                "JSON transcript language differs: expected={expected_language:?} actual={actual_language:?}"
            ))
        }
    };
    check().err()
}
/// Compares the `segments` arrays of two transcripts.
///
/// Segment counts must match. For each pair, `start` and `end` must agree within
/// `tolerance.segment_seconds`, `text` must be equal after `normalize_space`, and, when
/// either side has a `speaker` key, the speakers must be equal. The first difference (or
/// malformed field) is returned.
fn compare_json_segments(
    expected: &serde_json::Map<String, serde_json::Value>,
    actual: &serde_json::Map<String, serde_json::Value>,
    tolerance: ParityTolerance,
) -> Option<String> {
    // Run the checks as a fallible sequence; the first `Err` is the reported difference.
    let check = || -> Result<(), String> {
        let expected_segments = json_array_field(expected, "segments", "expected segments")?;
        let actual_segments = json_array_field(actual, "segments", "actual segments")?;
        if expected_segments.len() != actual_segments.len() {
            return Err(format!(
                "JSON transcript segment count differs: expected={} actual={}",
                expected_segments.len(),
                actual_segments.len()
            ));
        }

        for (index, (expected_value, actual_value)) in
            expected_segments.iter().zip(actual_segments).enumerate()
        {
            let expected_segment = expected_value.as_object().ok_or_else(|| {
                format!("JSON transcript segment {index} malformed: expected object")
            })?;
            let actual_segment = actual_value.as_object().ok_or_else(|| {
                format!("JSON transcript segment {index} malformed: actual object")
            })?;

            // Timing: start first, then end, both required.
            for key in ["start", "end"] {
                if let Some(difference) = compare_required_json_seconds(
                    expected_segment,
                    actual_segment,
                    key,
                    &format!("segment {index} {key}"),
                    tolerance.segment_seconds,
                ) {
                    return Err(difference);
                }
            }

            let expected_text = required_json_string(
                expected_segment,
                "text",
                &format!("segment {index} expected text"),
            )?;
            let actual_text = required_json_string(
                actual_segment,
                "text",
                &format!("segment {index} actual text"),
            )?;
            if normalize_space(expected_text) != normalize_space(actual_text) {
                return Err(format!(
                    "JSON transcript segment {index} text differs: expected={expected_text:?} actual={actual_text:?}"
                ));
            }

            // Speakers are only compared when at least one side mentions them.
            let mentions_speaker =
                expected_segment.contains_key("speaker") || actual_segment.contains_key("speaker");
            if !mentions_speaker {
                continue;
            }
            let expected_speaker = optional_json_string(
                expected_segment,
                "speaker",
                &format!("segment {index} expected speaker"),
            )?;
            let actual_speaker = optional_json_string(
                actual_segment,
                "speaker",
                &format!("segment {index} actual speaker"),
            )?;
            if expected_speaker != actual_speaker {
                return Err(format!(
                    "JSON transcript segment {index} speaker differs: expected={expected_speaker:?} actual={actual_speaker:?}"
                ));
            }
        }
        Ok(())
    };
    check().err()
}
/// Compares the flattened word lists of two transcripts.
///
/// Words come from `flattened_json_words`. Counts must match, then each pair is checked
/// with `compare_json_word`; the first difference is returned.
fn compare_json_words(
    expected: &serde_json::Map<String, serde_json::Value>,
    actual: &serde_json::Map<String, serde_json::Value>,
    tolerance: ParityTolerance,
) -> Option<String> {
    // Run the checks as a fallible sequence; the first `Err` is the reported difference.
    let check = || -> Result<(), String> {
        let expected_words = flattened_json_words(expected, "expected")?;
        let actual_words = flattened_json_words(actual, "actual")?;
        if expected_words.len() != actual_words.len() {
            return Err(format!(
                "JSON transcript word count differs: expected={} actual={}",
                expected_words.len(),
                actual_words.len()
            ));
        }

        let first_difference = expected_words
            .iter()
            .zip(actual_words.iter())
            .enumerate()
            .find_map(|(index, (expected_word, actual_word))| {
                compare_json_word(index, expected_word, actual_word, tolerance)
            });
        first_difference.map_or(Ok(()), Err)
    };
    check().err()
}
/// Compares one word entry from each transcript.
///
/// The `word` text must be equal after `normalize_space`, and `start`/`end` must agree
/// within `tolerance.word_seconds`. Scores are compared (fixed tolerance 0.001) only when
/// both sides have a `score` key and neither value is `null`.
fn compare_json_word(
    index: usize,
    expected: &serde_json::Map<String, serde_json::Value>,
    actual: &serde_json::Map<String, serde_json::Value>,
    tolerance: ParityTolerance,
) -> Option<String> {
    // Run the checks as a fallible sequence; the first `Err` is the reported difference.
    let check = || -> Result<(), String> {
        let expected_text =
            required_json_string(expected, "word", &format!("word {index} expected word"))?;
        let actual_text =
            required_json_string(actual, "word", &format!("word {index} actual word"))?;
        if normalize_space(expected_text) != normalize_space(actual_text) {
            return Err(format!(
                "JSON transcript word {index} text differs: expected={expected_text:?} actual={actual_text:?}"
            ));
        }

        for key in ["start", "end"] {
            if let Some(difference) = compare_required_json_seconds(
                expected,
                actual,
                key,
                &format!("word {index} {key}"),
                tolerance.word_seconds,
            ) {
                return Err(difference);
            }
        }

        let both_scored = expected.contains_key("score") && actual.contains_key("score");
        if !both_scored {
            return Ok(());
        }
        // A null score on either side ends the comparison without a difference.
        let Some(expected_score) =
            optional_json_number(expected, "score", &format!("word {index} expected score"))?
        else {
            return Ok(());
        };
        let Some(actual_score) =
            optional_json_number(actual, "score", &format!("word {index} actual score"))?
        else {
            return Ok(());
        };
        if (expected_score - actual_score).abs() > 0.001 {
            return Err(format!(
                "JSON transcript word {index} score differs: expected={expected_score:.3} actual={actual_score:.3} tolerance=0.001"
            ));
        }
        Ok(())
    };
    check().err()
}
/// Compares the flattened character alignments of two transcripts.
///
/// Characters come from `flattened_json_chars`. Counts must match; each pair must have an
/// identical `char` string, and its optional `start`/`end` timings must agree within
/// `tolerance.word_seconds`. The first difference is returned.
fn compare_json_chars(
    expected: &serde_json::Map<String, serde_json::Value>,
    actual: &serde_json::Map<String, serde_json::Value>,
    tolerance: ParityTolerance,
) -> Option<String> {
    // Run the checks as a fallible sequence; the first `Err` is the reported difference.
    let check = || -> Result<(), String> {
        let expected_chars = flattened_json_chars(expected, "expected")?;
        let actual_chars = flattened_json_chars(actual, "actual")?;
        if expected_chars.len() != actual_chars.len() {
            return Err(format!(
                "JSON transcript char count differs: expected={} actual={}",
                expected_chars.len(),
                actual_chars.len()
            ));
        }

        for (index, (expected_char, actual_char)) in
            expected_chars.iter().zip(actual_chars.iter()).enumerate()
        {
            let expected_text = required_json_string(
                expected_char,
                "char",
                &format!("char {index} expected char"),
            )?;
            let actual_text =
                required_json_string(actual_char, "char", &format!("char {index} actual char"))?;
            if expected_text != actual_text {
                return Err(format!(
                    "JSON transcript char {index} text differs: expected={expected_text:?} actual={actual_text:?}"
                ));
            }

            for key in ["start", "end"] {
                if let Some(difference) = compare_optional_json_seconds(
                    expected_char,
                    actual_char,
                    key,
                    &format!("char {index} {key}"),
                    tolerance.word_seconds,
                ) {
                    return Err(difference);
                }
            }
        }
        Ok(())
    };
    check().err()
}
/// Reports whether any object in the transcript's `segments` array has a `chars` key.
///
/// Returns `false` when `segments` is missing or is not an array.
fn json_contains_chars(object: &serde_json::Map<String, serde_json::Value>) -> bool {
    let Some(segments) = object
        .get("segments")
        .and_then(serde_json::Value::as_array)
    else {
        return false;
    };
    segments
        .iter()
        .filter_map(serde_json::Value::as_object)
        .any(|segment| segment.contains_key("chars"))
}
/// Collects every word object of a transcript into one flat list.
///
/// When the transcript has a top-level `word_segments` key, that array is used as is.
/// Otherwise the `words` arrays of all `segments` are concatenated in order; segments
/// without a `words` key contribute nothing. `side` ("expected"/"actual" at the call
/// sites) only labels error messages.
///
/// # Errors
///
/// Returns a description when an array or object has the wrong JSON type, or when
/// `segments` is needed but missing.
fn flattened_json_words<'a>(
    object: &'a serde_json::Map<String, serde_json::Value>,
    side: &str,
) -> Result<Vec<&'a serde_json::Map<String, serde_json::Value>>, String> {
    if let Some(word_segments) = object.get("word_segments") {
        let entries = json_value_array(word_segments, &format!("{side} word_segments"))?;
        let mut words = Vec::with_capacity(entries.len());
        for (index, entry) in entries.iter().enumerate() {
            match entry.as_object() {
                Some(word) => words.push(word),
                None => {
                    return Err(format!(
                        "JSON transcript {side} word_segments[{index}] malformed: object expected"
                    ));
                }
            }
        }
        return Ok(words);
    }

    let segments = json_array_field(object, "segments", &format!("{side} segments"))?;
    let mut words = Vec::new();
    for (segment_index, segment_value) in segments.iter().enumerate() {
        let segment = segment_value.as_object().ok_or_else(|| {
            format!("JSON transcript {side} segment {segment_index} malformed: object expected")
        })?;
        let Some(segment_words) = segment.get("words") else {
            continue;
        };
        let entries = json_value_array(
            segment_words,
            &format!("{side} segment {segment_index} words"),
        )?;
        for (word_index, entry) in entries.iter().enumerate() {
            let word = entry.as_object().ok_or_else(|| {
                format!("JSON transcript {side} segment {segment_index} words[{word_index}] malformed: object expected")
            })?;
            words.push(word);
        }
    }
    Ok(words)
}
/// Concatenates the `chars` arrays of all `segments` into one flat list, in order.
///
/// Segments without a `chars` key contribute nothing. `side` only labels error messages.
///
/// # Errors
///
/// Returns a description when `segments` is missing, or when an array or object has the
/// wrong JSON type.
fn flattened_json_chars<'a>(
    object: &'a serde_json::Map<String, serde_json::Value>,
    side: &str,
) -> Result<Vec<&'a serde_json::Map<String, serde_json::Value>>, String> {
    let segments = json_array_field(object, "segments", &format!("{side} segments"))?;
    let mut chars = Vec::new();
    for (segment_index, segment_value) in segments.iter().enumerate() {
        let segment = segment_value.as_object().ok_or_else(|| {
            format!("JSON transcript {side} segment {segment_index} malformed: object expected")
        })?;
        let Some(segment_chars) = segment.get("chars") else {
            continue;
        };
        let entries = json_value_array(
            segment_chars,
            &format!("{side} segment {segment_index} chars"),
        )?;
        for (char_index, entry) in entries.iter().enumerate() {
            let character = entry.as_object().ok_or_else(|| {
                format!("JSON transcript {side} segment {segment_index} chars[{char_index}] malformed: object expected")
            })?;
            chars.push(character);
        }
    }
    Ok(chars)
}
/// Looks up `key` in `object` and requires it to be a JSON array.
///
/// `label` names the field in error messages.
///
/// # Errors
///
/// Returns a "missing array" message when the key is absent and a "must be an array"
/// message when the value has another type.
fn json_array_field<'a>(
    object: &'a serde_json::Map<String, serde_json::Value>,
    key: &str,
    label: &str,
) -> Result<&'a Vec<serde_json::Value>, String> {
    match object.get(key) {
        Some(value) => json_value_array(value, label),
        None => Err(format!("JSON transcript missing array: {label}")),
    }
}
/// Borrows `value` as a JSON array, or describes the type mismatch using `label`.
fn json_value_array<'a>(
    value: &'a serde_json::Value,
    label: &str,
) -> Result<&'a Vec<serde_json::Value>, String> {
    match value {
        serde_json::Value::Array(items) => Ok(items),
        _ => Err(format!(
            "JSON transcript malformed field: {label} must be an array"
        )),
    }
}
/// Reads a mandatory string field from a JSON object.
///
/// # Errors
///
/// Returns a "missing" message when `key` is absent and a "must be a string" message when
/// the value is anything other than a JSON string (including `null`).
fn required_json_string<'a>(
    object: &'a serde_json::Map<String, serde_json::Value>,
    key: &str,
    label: &str,
) -> Result<&'a str, String> {
    match object.get(key) {
        None => Err(format!("JSON transcript malformed field: {label} missing")),
        Some(serde_json::Value::String(text)) => Ok(text.as_str()),
        Some(_) => Err(format!(
            "JSON transcript malformed field: {label} must be a string"
        )),
    }
}
/// Reads an optional string field from a JSON object.
///
/// An absent key and an explicit `null` both yield `Ok(None)`.
///
/// # Errors
///
/// Returns a "must be a string" message for any other non-string value.
fn optional_json_string<'a>(
    object: &'a serde_json::Map<String, serde_json::Value>,
    key: &str,
    label: &str,
) -> Result<Option<&'a str>, String> {
    match object.get(key) {
        None | Some(serde_json::Value::Null) => Ok(None),
        Some(serde_json::Value::String(text)) => Ok(Some(text.as_str())),
        Some(_) => Err(format!(
            "JSON transcript malformed field: {label} must be a string"
        )),
    }
}
/// Reads an optional numeric field from a JSON object as `f64`.
///
/// An absent key and an explicit `null` both yield `Ok(None)`.
///
/// # Errors
///
/// Returns a "must be a number" message when the value cannot be read via `as_f64`.
fn optional_json_number(
    object: &serde_json::Map<String, serde_json::Value>,
    key: &str,
    label: &str,
) -> Result<Option<f64>, String> {
    let Some(value) = object.get(key) else {
        return Ok(None);
    };
    if value.is_null() {
        return Ok(None);
    }
    match value.as_f64() {
        Some(number) => Ok(Some(number)),
        None => Err(format!(
            "JSON transcript malformed field: {label} must be a number"
        )),
    }
}
/// Compares a mandatory timing field (`key`) between two JSON objects.
///
/// Both sides must carry a non-null number; the values must agree within `tolerance`.
/// Returns `None` on agreement, otherwise a message describing the malformed field or the
/// timing difference. `label` names the field in messages.
fn compare_required_json_seconds(
    expected: &serde_json::Map<String, serde_json::Value>,
    actual: &serde_json::Map<String, serde_json::Value>,
    key: &str,
    label: &str,
    tolerance: f64,
) -> Option<String> {
    // Run the checks as a fallible sequence; the first `Err` is the reported difference.
    let check = || -> Result<(), String> {
        let expected_seconds = optional_json_number(expected, key, &format!("{label} expected"))?
            .ok_or_else(|| format!("JSON transcript malformed field: {label} expected missing"))?;
        let actual_seconds = optional_json_number(actual, key, &format!("{label} actual"))?
            .ok_or_else(|| format!("JSON transcript malformed field: {label} actual missing"))?;

        let delta = (expected_seconds - actual_seconds).abs();
        if delta > tolerance {
            return Err(format!(
                "JSON transcript {label} timing differs: expected={expected_seconds:.3}s actual={actual_seconds:.3}s delta={delta:.3}s tolerance={tolerance:.3}s"
            ));
        }
        Ok(())
    };
    check().err()
}
/// Compares an optional timing field (`key`) between two JSON objects.
///
/// Absent/null on both sides is a match. Present on both sides must agree within
/// `tolerance`. Present on only one side is reported as a "timing shape" difference.
fn compare_optional_json_seconds(
    expected: &serde_json::Map<String, serde_json::Value>,
    actual: &serde_json::Map<String, serde_json::Value>,
    key: &str,
    label: &str,
    tolerance: f64,
) -> Option<String> {
    let expected_seconds = match optional_json_number(expected, key, &format!("{label} expected")) {
        Err(error) => return Some(error),
        Ok(seconds) => seconds,
    };
    let actual_seconds = match optional_json_number(actual, key, &format!("{label} actual")) {
        Err(error) => return Some(error),
        Ok(seconds) => seconds,
    };

    match (expected_seconds, actual_seconds) {
        (None, None) => None,
        (Some(want), Some(got)) => {
            let delta = (want - got).abs();
            (delta > tolerance).then(|| {
                format!(
                    "JSON transcript {label} timing differs: expected={want:.3}s actual={got:.3}s delta={delta:.3}s tolerance={tolerance:.3}s"
                )
            })
        }
        _ => Some(format!(
            "JSON transcript {label} timing shape differs: expected={} actual={}",
            timing_shape(expected_seconds),
            timing_shape(actual_seconds)
        )),
    }
}
/// Describes whether an optional timing is `"present"` or `"null"`, for difference
/// messages.
fn timing_shape(value: Option<f64>) -> &'static str {
    match value {
        Some(_) => "present",
        None => "null",
    }
}
/// Describes how two differing outputs differ.
///
/// When both buffers are valid UTF-8, the first pair of differing lines is reported with
/// its 1-based line number. Otherwise, or when every paired line matches (for example, one
/// file only has extra trailing lines), a byte-size summary naming both paths is returned.
fn first_output_difference(
    expected_path: &Path,
    actual_path: &Path,
    expected: &[u8],
    actual: &[u8],
) -> String {
    if let (Ok(expected_text), Ok(actual_text)) =
        (std::str::from_utf8(expected), std::str::from_utf8(actual))
    {
        let mismatch = expected_text
            .lines()
            .zip(actual_text.lines())
            .enumerate()
            .find(|(_, (expected_line, actual_line))| expected_line != actual_line);
        if let Some((index, (expected_line, actual_line))) = mismatch {
            return format!(
                "line {} differs: expected {:?}, actual {:?}",
                index + 1,
                expected_line,
                actual_line
            );
        }
    }

    format!(
        "output bytes differ: expected={} ({} bytes) actual={} ({} bytes)",
        expected_path.display(),
        expected.len(),
        actual_path.display(),
        actual.len()
    )
}
/// Renders the transcript of `response` as text in the given `format`.
///
/// JSON formats honour `output.pretty_json`; subtitle formats use `output.subtitles`.
///
/// # Errors
///
/// Returns `InvalidConfig` for `OutputFormat::All`, which callers must expand beforehand,
/// and propagates JSON serialization errors.
fn render_output(
    response: &TranscriptionPipelineResponse,
    format: OutputFormat,
    output: &OutputConfig,
    return_char_alignments: bool,
) -> Result<String, NativeWhisperxError> {
    let transcript = &response.transcript;
    let rendered = match format {
        OutputFormat::All => {
            return Err(NativeWhisperxError::InvalidConfig(
                "internal error: all output format must be expanded before rendering".to_string(),
            ));
        }
        OutputFormat::Json => {
            let value = whisperx_json_value(transcript, return_char_alignments);
            if output.pretty_json {
                serde_json::to_string_pretty(&value)?
            } else {
                serde_json::to_string(&value)?
            }
        }
        OutputFormat::NativeJson => {
            if output.pretty_json {
                serde_json::to_string_pretty(transcript)?
            } else {
                serde_json::to_string(transcript)?
            }
        }
        OutputFormat::Srt => format_srt_with_options(transcript, &output.subtitles),
        OutputFormat::Vtt => format_webvtt_with_options(transcript, &output.subtitles),
        OutputFormat::Txt => format_txt(transcript),
        OutputFormat::Tsv => format_tsv(transcript),
        OutputFormat::Audacity => format_audacity_labels(transcript),
    };
    Ok(rendered)
}
/// Builds the WhisperX-style JSON document for a transcript.
///
/// Keys are inserted in this order: `text`, then `language` and `source` when present,
/// then `segments`, then `word_segments` (the words of all segments, flattened).
/// Character alignments are emitted inside segments only when `return_char_alignments`
/// is set.
pub(crate) fn whisperx_json_value(
    transcript: &TranscriptionContract,
    return_char_alignments: bool,
) -> serde_json::Value {
    let mut root = serde_json::Map::new();
    root.insert(
        String::from("text"),
        serde_json::Value::String(transcript.text_or_joined()),
    );
    if let Some(language) = transcript.language.as_ref() {
        root.insert(
            String::from("language"),
            serde_json::Value::String(language.clone()),
        );
    }
    if let Some(source) = transcript.source.as_ref() {
        root.insert(
            String::from("source"),
            serde_json::Value::String(source.clone()),
        );
    }

    let mut segments = Vec::new();
    let mut word_segments = Vec::new();
    for segment in transcript.segments.iter() {
        segments.push(whisperx_segment_value(segment, return_char_alignments));
        word_segments.extend(segment.words.iter().map(whisperx_word_value));
    }
    root.insert(String::from("segments"), serde_json::Value::Array(segments));
    root.insert(
        String::from("word_segments"),
        serde_json::Value::Array(word_segments),
    );
    serde_json::Value::Object(root)
}
/// Converts one transcript segment into its WhisperX JSON object.
///
/// Always emits `id` and `text`; emits `start`, `end`, `speaker` and `score` only when the
/// segment has them; emits `words` only when non-empty, and `chars` only when non-empty
/// and `return_char_alignments` is set.
fn whisperx_segment_value(
    segment: &text_transcripts::TranscriptSegmentContract,
    return_char_alignments: bool,
) -> serde_json::Value {
    let mut fields = serde_json::Map::new();
    fields.insert(String::from("id"), serde_json::Value::from(segment.index));
    insert_seconds(&mut fields, "start", segment.start_seconds);
    insert_seconds(&mut fields, "end", segment.end_seconds);
    fields.insert(
        String::from("text"),
        serde_json::Value::String(segment.text.clone()),
    );
    if let Some(speaker) = segment.speaker.as_ref() {
        fields.insert(
            String::from("speaker"),
            serde_json::Value::String(speaker.clone()),
        );
    }
    if let Some(confidence) = segment.confidence {
        fields.insert(String::from("score"), serde_json::Value::from(confidence));
    }
    if !segment.words.is_empty() {
        let words = segment.words.iter().map(whisperx_word_value).collect();
        fields.insert(String::from("words"), serde_json::Value::Array(words));
    }
    if return_char_alignments && !segment.chars.is_empty() {
        let chars = segment.chars.iter().map(whisperx_char_value).collect();
        fields.insert(String::from("chars"), serde_json::Value::Array(chars));
    }
    serde_json::Value::Object(fields)
}
/// Converts one transcript word into its WhisperX JSON object.
///
/// Always emits `word`; emits `start`, `end`, `score` and `speaker` only when present.
fn whisperx_word_value(word: &text_transcripts::TranscriptWordContract) -> serde_json::Value {
    let mut fields = serde_json::Map::new();
    fields.insert(
        String::from("word"),
        serde_json::Value::String(word.text.clone()),
    );
    insert_seconds(&mut fields, "start", word.start_seconds);
    insert_seconds(&mut fields, "end", word.end_seconds);
    if let Some(confidence) = word.confidence {
        fields.insert(String::from("score"), serde_json::Value::from(confidence));
    }
    if let Some(speaker) = word.speaker.as_ref() {
        fields.insert(
            String::from("speaker"),
            serde_json::Value::String(speaker.clone()),
        );
    }
    serde_json::Value::Object(fields)
}
/// Converts one character alignment into its WhisperX JSON object.
///
/// Always emits `char`; emits `start`, `end` and `score` only when present.
fn whisperx_char_value(character: &text_transcripts::TranscriptCharContract) -> serde_json::Value {
    let mut fields = serde_json::Map::new();
    fields.insert(
        String::from("char"),
        serde_json::Value::String(character.character.clone()),
    );
    insert_seconds(&mut fields, "start", character.start_seconds);
    insert_seconds(&mut fields, "end", character.end_seconds);
    if let Some(confidence) = character.confidence {
        fields.insert(String::from("score"), serde_json::Value::from(confidence));
    }
    serde_json::Value::Object(fields)
}
/// Inserts `value` under `key` when it is present; leaves `object` untouched otherwise.
fn insert_seconds(
    object: &mut serde_json::Map<String, serde_json::Value>,
    key: &str,
    value: Option<f64>,
) {
    let Some(seconds) = value else {
        return;
    };
    object.insert(key.to_owned(), serde_json::Value::from(seconds));
}
/// Expands `OutputFormat::All` into the concrete formats it stands for.
///
/// `All` becomes Txt, Vtt, Srt, Tsv, Audacity and Json, in that order; every other format
/// is returned as a single-element list.
pub(crate) fn expand_output_format(format: OutputFormat) -> Vec<OutputFormat> {
    if matches!(format, OutputFormat::All) {
        return vec![
            OutputFormat::Txt,
            OutputFormat::Vtt,
            OutputFormat::Srt,
            OutputFormat::Tsv,
            OutputFormat::Audacity,
            OutputFormat::Json,
        ];
    }
    vec![format]
}
/// Renders the transcript as plain text, one trimmed segment per line.
///
/// Segments with a speaker are prefixed with `[speaker]: `. A trailing newline is added
/// unless the rendered text is empty.
fn format_txt(transcript: &TranscriptionContract) -> String {
    let mut text = String::new();
    for (position, segment) in transcript.segments.iter().enumerate() {
        if position > 0 {
            text.push('\n');
        }
        if let Some(speaker) = &segment.speaker {
            text.push_str(&format!("[{speaker}]: "));
        }
        text.push_str(segment.text.trim());
    }
    if !text.is_empty() {
        text.push('\n');
    }
    text
}
/// Renders the transcript as tab-separated `start`, `end`, `text` rows with a header.
///
/// Times are whole milliseconds from `seconds_to_millis`; tabs inside the trimmed segment
/// text are replaced with spaces.
fn format_tsv(transcript: &TranscriptionContract) -> String {
    let mut rows = vec!["start\tend\ttext\n".to_string()];
    rows.extend(transcript.segments.iter().map(|segment| {
        let start = seconds_to_millis(segment.start_seconds);
        let end = seconds_to_millis(segment.end_seconds);
        let text = segment.text.trim().replace('\t', " ");
        format!("{start}\t{end}\t{text}\n")
    }));
    rows.concat()
}
/// Renders the transcript as tab-separated `start`, `end`, `label` lines.
///
/// A missing start becomes 0.0, and the end is never earlier than the start. Labels are
/// the trimmed segment text with tabs replaced by spaces, prefixed with `[[speaker]]`
/// when the segment has a speaker.
fn format_audacity_labels(transcript: &TranscriptionContract) -> String {
    transcript
        .segments
        .iter()
        .map(|segment| {
            let start = segment.start_seconds.unwrap_or(0.0);
            let end = segment.end_seconds.unwrap_or(start).max(start);
            let label = segment.text.trim().replace('\t', " ");
            match &segment.speaker {
                Some(speaker) => format!("{start}\t{end}\t[[{speaker}]]{label}\n"),
                None => format!("{start}\t{end}\t{label}\n"),
            }
        })
        .collect()
}
/// Converts optional seconds to whole milliseconds, rounding to the nearest integer.
///
/// Missing and negative inputs (and NaN, via `f64::max`) become 0.
fn seconds_to_millis(seconds: Option<f64>) -> u64 {
    let clamped = match seconds {
        Some(value) => value.max(0.0),
        None => 0.0,
    };
    (clamped * 1000.0).round() as u64
}
/// Renders the transcript as SRT text.
///
/// Each cue from `subtitle_cues` becomes a 1-based counter line, a `start --> end` timing
/// line (timestamps formatted with a `,` separator), the cue text, and a blank line.
fn format_srt_with_options(
    transcript: &TranscriptionContract,
    subtitles: &SubtitleConfig,
) -> String {
    let mut output = String::new();
    let mut number = 0usize;
    for cue in subtitle_cues(transcript, subtitles) {
        number += 1;
        let start = format_subtitle_timestamp(cue.start, true, ',');
        let end = format_subtitle_timestamp(cue.end, true, ',');
        output.push_str(&format!("{number}\n{start} --> {end}\n{}\n\n", cue.text));
    }
    output
}
/// Renders the transcript as WebVTT text.
///
/// Output starts with the `WEBVTT` header and a blank line; each cue from `subtitle_cues`
/// then becomes a `start --> end` timing line (timestamps formatted with a `.` separator),
/// the cue text, and a blank line.
fn format_webvtt_with_options(
    transcript: &TranscriptionContract,
    subtitles: &SubtitleConfig,
) -> String {
    let mut output = String::from("WEBVTT\n\n");
    for cue in subtitle_cues(transcript, subtitles) {
        let start = format_subtitle_timestamp(cue.start, false, '.');
        let end = format_subtitle_timestamp(cue.end, false, '.');
        output.push_str(&format!("{start} --> {end}\n{}\n\n", cue.text));
    }
    output
}
/// One subtitle cue ready to be rendered by the SRT/WebVTT formatters.
#[derive(Debug, Clone)]
struct SubtitleCue {
/// Cue start; passed to `format_subtitle_timestamp` when rendering.
start: f64,
/// Cue end; passed to `format_subtitle_timestamp` when rendering.
end: f64,
/// Cue text written verbatim after the timing line (may include a `[speaker]: ` prefix).
text: String,
}
/// A word queued for the subtitle cue currently being assembled.
#[derive(Debug, Clone)]
struct SubtitleTiming {
/// Word text; the cue builder may trim it and prepend a `\n` to mark a line break.
word: String,
/// Word start copied from the transcript word's `start_seconds`, when it has one.
start: Option<f64>,
/// Word end copied from the transcript word's `end_seconds`, when it has one.
end: Option<f64>,
}
/// Builds the subtitle cues for a transcript.
///
/// An empty transcript yields no cues. When the first segment has word timings, cue
/// building is delegated to `subtitle_word_cues`. Otherwise each segment becomes one cue:
/// a missing start is 0.0, the end is never earlier than the start, `-->` in the trimmed
/// text is rewritten to `->`, and a `[speaker]: ` prefix is added when a speaker is set.
fn subtitle_cues(
    transcript: &TranscriptionContract,
    subtitles: &SubtitleConfig,
) -> Vec<SubtitleCue> {
    let has_word_timings = match transcript.segments.first() {
        None => return Vec::new(),
        Some(first_segment) => !first_segment.words.is_empty(),
    };
    if has_word_timings {
        return subtitle_word_cues(transcript, subtitles);
    }

    let mut cues = Vec::new();
    for segment in transcript.segments.iter() {
        let start = segment.start_seconds.unwrap_or(0.0);
        let end = segment.end_seconds.unwrap_or(start).max(start);
        let body = segment.text.trim().replace("-->", "->");
        let text = match &segment.speaker {
            Some(speaker) => format!("[{speaker}]: {body}"),
            None => body,
        };
        cues.push(SubtitleCue { start, end, text });
    }
    cues
}
fn subtitle_word_cues(
transcript: &TranscriptionContract,
subtitles: &SubtitleConfig,
) -> Vec<SubtitleCue> {
let mut cues = Vec::new();
let raw_max_line_width = subtitles.max_line_width;
let max_line_count = subtitles.max_line_count;
let max_line_width = raw_max_line_width.unwrap_or(1000);
let preserve_segments = max_line_count.is_none() || raw_max_line_width.is_none();
let mut line_len = 0usize;
let mut line_count = 1usize;
let mut subtitle = Vec::<SubtitleTiming>::new();
let mut times = Vec::<(f64, f64, Option<String>)>::new();
let mut last = transcript
.segments
.first()
.and_then(|segment| segment.start_seconds)
.unwrap_or(0.0);
for segment in &transcript.segments {
for (word_index, original_timing) in segment.words.iter().enumerate() {
let mut timing = SubtitleTiming {
word: original_timing.text.clone(),
start: original_timing.start_seconds,
end: original_timing.end_seconds,
};
let long_pause = if preserve_segments {
false
} else {
timing.start.is_some_and(|start| start - last > 3.0)
};
let has_room = line_len + timing.word.chars().count() <= max_line_width;
let seg_break = word_index == 0 && !subtitle.is_empty() && preserve_segments;
if line_len > 0 && has_room && !long_pause && !seg_break {
line_len += timing.word.chars().count();
} else {
timing.word = timing.word.trim().to_string();
if (!subtitle.is_empty()
&& max_line_count.is_some()
&& (long_pause || line_count >= max_line_count.unwrap_or(0)))
|| seg_break
{
push_subtitle_cues(transcript, subtitles, &subtitle, ×, &mut cues);
subtitle.clear();
times.clear();
line_count = 1;
} else if line_len > 0 {
line_count += 1;
timing.word = format!("\n{}", timing.word);
}
line_len = timing.word.trim().chars().count();
}
subtitle.push(timing);
times.push((
segment.start_seconds.unwrap_or(0.0),
segment
.end_seconds
.unwrap_or_else(|| segment.start_seconds.unwrap_or(0.0)),
segment.speaker.clone(),
));
if let Some(start) = original_timing.start_seconds {
last = start;
}
}
}
if !subtitle.is_empty() {
push_subtitle_cues(transcript, subtitles, &subtitle, ×, &mut cues);
}
cues
}
/// Appends the cue(s) for one accumulated group of subtitle words to `cues`.
///
/// * `subtitle` - the words of the group, each with optional start/end times.
/// * `times` - per-word segment metadata `(segment start, segment end, speaker)`;
///   only the first entry is consulted, as a fallback for missing word timings
///   and as the source of the speaker label.
///
/// Nothing is emitted when `times` is empty. Otherwise the group spans from the
/// smallest word start to the largest word end (falling back to the first
/// `times` entry), and the text is prefixed with `[speaker]: ` when a speaker
/// is present.
///
/// When `subtitles.highlight_words` is set and at least one word has a start
/// time, one cue is emitted per word that has both a start and an end, with
/// that word wrapped in `<u>…</u>`; a plain cue fills any gap between the
/// previous timestamp and the word's start. Words lacking either time are
/// skipped. In every other case a single plain cue is emitted.
fn push_subtitle_cues(
    transcript: &TranscriptionContract,
    subtitles: &SubtitleConfig,
    subtitle: &[SubtitleTiming],
    times: &[(f64, f64, Option<String>)],
    cues: &mut Vec<SubtitleCue>,
) {
    let Some((fallback_start, fallback_end, speaker)) = times.first() else {
        return;
    };

    let start = subtitle
        .iter()
        .filter_map(|word| word.start)
        .reduce(f64::min)
        .unwrap_or(*fallback_start);
    let end = subtitle
        .iter()
        .filter_map(|word| word.end)
        .reduce(f64::max)
        .unwrap_or(*fallback_end);

    let prefix = match speaker {
        Some(name) => format!("[{name}]: "),
        None => String::new(),
    };
    let plain_text = format!(
        "{prefix}{}",
        subtitle_text_for_language(transcript, subtitle)
    );
    let any_word_timed = subtitle.iter().any(|word| word.start.is_some());

    // Simple case: one cue covering the whole group.
    if !(subtitles.highlight_words && any_word_timed) {
        cues.push(SubtitleCue {
            start,
            end,
            text: plain_text,
        });
        return;
    }

    // Timestamps are compared in their formatted (millisecond) form, so gaps
    // that vanish after rounding do not produce a filler cue.
    let mut previous_stamp = format_subtitle_timestamp(start, true, ',');
    for (highlighted, timing) in subtitle.iter().enumerate() {
        if let (Some(word_start), Some(word_end)) = (timing.start, timing.end) {
            if previous_stamp != format_subtitle_timestamp(word_start, true, ',') {
                cues.push(SubtitleCue {
                    start: timestamp_to_seconds(&previous_stamp),
                    end: word_start,
                    text: plain_text.clone(),
                });
            }

            // NOTE(review): the highlighted variant always joins with a space,
            // even for languages where the plain text is joined without one.
            // Presumably intentional for output parity; confirm before changing.
            let marked_words = subtitle
                .iter()
                .enumerate()
                .map(|(position, other)| {
                    if position == highlighted {
                        underline_word_preserving_leading_space(&other.word)
                    } else {
                        other.word.clone()
                    }
                })
                .collect::<Vec<_>>()
                .join(" ");
            cues.push(SubtitleCue {
                start: word_start,
                end: word_end,
                text: format!("{prefix}{marked_words}"),
            });

            previous_stamp = format_subtitle_timestamp(word_end, true, ',');
        }
    }
}
/// Joins the words of a subtitle group into display text.
///
/// Words are concatenated without a separator when the transcript language is
/// `"ja"` or `"zh"`; for any other language (or none) they are joined with a
/// single space.
fn subtitle_text_for_language(
    transcript: &TranscriptionContract,
    subtitle: &[SubtitleTiming],
) -> String {
    let separator = match transcript.language.as_deref() {
        Some("ja" | "zh") => "",
        _ => " ",
    };
    subtitle
        .iter()
        .map(|timing| timing.word.as_str())
        .collect::<Vec<_>>()
        .join(separator)
}
/// Wraps `word` in `<u>…</u>` while keeping any leading whitespace outside the tag.
///
/// A word that is empty or consists only of whitespace yields the whitespace
/// followed by an empty `<u></u>` pair.
fn underline_word_preserving_leading_space(word: &str) -> String {
    let body = word.trim_start();
    // `trim_start` returns a suffix of `word`, so the length difference is the
    // byte length of the leading whitespace and always a char boundary.
    let indent = &word[..word.len() - body.len()];
    format!("{indent}<u>{body}</u>")
}
/// Formats a time in seconds as `[HH:]MM:SS<marker>mmm`.
///
/// Negative (and NaN) inputs are clamped to zero and the value is rounded to
/// the nearest millisecond. The hours field is written when
/// `always_include_hours` is set or when the value is at least one hour;
/// `decimal_marker` separates seconds from milliseconds.
fn format_subtitle_timestamp(
    seconds: f64,
    always_include_hours: bool,
    decimal_marker: char,
) -> String {
    const MILLIS_PER_SECOND: u64 = 1_000;
    const MILLIS_PER_MINUTE: u64 = 60 * MILLIS_PER_SECOND;
    const MILLIS_PER_HOUR: u64 = 60 * MILLIS_PER_MINUTE;

    // Peel off each unit from a running remainder of whole milliseconds.
    let mut remaining = (seconds.max(0.0) * 1_000.0).round() as u64;
    let hours = remaining / MILLIS_PER_HOUR;
    remaining %= MILLIS_PER_HOUR;
    let minutes = remaining / MILLIS_PER_MINUTE;
    remaining %= MILLIS_PER_MINUTE;
    let secs = remaining / MILLIS_PER_SECOND;
    let millis = remaining % MILLIS_PER_SECOND;

    let clock = format!("{minutes:02}:{secs:02}{decimal_marker}{millis:03}");
    if always_include_hours || hours > 0 {
        format!("{hours:02}:{clock}")
    } else {
        clock
    }
}
/// Converts a `HH:MM:SS` or `MM:SS` timestamp back into seconds.
///
/// A comma is accepted as the decimal marker. Any field that fails to parse as
/// a number counts as `0.0`, and a string with fewer than two or more than
/// three colon-separated fields yields `0.0`.
fn timestamp_to_seconds(timestamp: &str) -> f64 {
    let normalized = timestamp.replace(',', ".");
    let number = |field: &str| field.parse::<f64>().unwrap_or(0.0);

    // Pull at most four fields; the fourth only serves to reject over-long input.
    let mut fields = normalized.split(':');
    match (fields.next(), fields.next(), fields.next(), fields.next()) {
        (Some(hours), Some(minutes), Some(seconds), None) => {
            number(hours) * 3600.0 + number(minutes) * 60.0 + number(seconds)
        }
        (Some(minutes), Some(seconds), None, None) => number(minutes) * 60.0 + number(seconds),
        _ => 0.0,
    }
}
/// Derives an output basename from a source path: its file stem, if that stem
/// is valid UTF-8 and not blank.
///
/// The parameter stays `&String` (rather than `&str`) so the function can be
/// handed directly to `Option<&String>::and_then` at its call site.
fn source_basename(source: &String) -> Option<String> {
    let stem = Path::new(source).file_stem()?.to_str()?;
    if stem.trim().is_empty() {
        None
    } else {
        Some(stem.to_owned())
    }
}
/// Collapses every run of whitespace in `text` into a single space and strips
/// leading and trailing whitespace.
pub(crate) fn normalize_space(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());
    for token in text.split_whitespace() {
        // Tokens are never empty, so an empty buffer means this is the first one.
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(token);
    }
    normalized
}