// native-whisperx-cli 0.1.0
//
// Cargo-installable CLI for native-whisperx WhisperX parity workflows.
//! Shared CLI argument enums and transcribe option parsing.

use std::path::PathBuf;

use clap::{ArgAction, Args, ValueEnum};
use native_whisperx::{AssignmentPolicy, SegmentResolution, VadMethod};

use crate::SpeakerDirectoryArgs;

// Argument set for transcription, parsed through clap's `Args` derive.
//
// Where an attribute only says `long`, clap derives the flag name from the field name
// (kebab-case by default). Most options also declare a snake_case `visible_alias`, so
// both spellings (e.g. `--batch-size` and `--batch_size`) are accepted.
//
// NOTE(review): the comments in this struct are deliberately plain `//` comments, not
// `///` doc comments. clap's derive turns doc comments into user-visible help text, so
// switching to `///` would change the `--help` output.
#[derive(Debug, Args)]
pub(crate) struct TranscribeArgs {
    // Positional input paths (no `long`/`short`); at least one must be supplied.
    #[arg(required = true)]
    pub(crate) input: Vec<PathBuf>,

    // --- Provider, model and device selection ---
    // Defaults to `CliProvider::Native` when the flag is absent.
    #[arg(long, value_enum, default_value_t = CliProvider::Native)]
    pub(crate) provider: CliProvider,
    #[arg(long, visible_alias = "whisper_bundle")]
    pub(crate) whisper_bundle: Option<PathBuf>,
    // Model identifier; falls back to "small".
    #[arg(long, default_value = "small")]
    pub(crate) model: String,
    // Defaults to `CliTask::Transcribe`.
    #[arg(long, value_enum, default_value_t = CliTask::Transcribe)]
    pub(crate) task: CliTask,
    #[arg(long)]
    pub(crate) language: Option<String>,
    // Defaults to `CliDevicePreference::Auto`.
    #[arg(long, value_enum, default_value_t = CliDevicePreference::Auto)]
    pub(crate) device: CliDevicePreference,
    // Kept as a raw string here; any parsing happens outside this struct.
    #[arg(long, visible_alias = "device_index")]
    pub(crate) device_index: Option<String>,
    #[arg(long, visible_alias = "batch_size")]
    pub(crate) batch_size: Option<usize>,
    #[arg(long, visible_alias = "compute_type")]
    pub(crate) compute_type: Option<String>,

    // --- Logging and progress ---
    // `--verbose` may be passed bare (recorded as the string "true" through
    // `default_missing_value`) or with a single explicit value.
    #[arg(long, num_args = 0..=1, default_missing_value = "true")]
    pub(crate) verbose: Option<String>,
    #[arg(long = "log-level", visible_alias = "log_level")]
    pub(crate) log_level: Option<String>,
    #[arg(long = "print-progress", visible_alias = "print_progress", action = ArgAction::SetTrue)]
    pub(crate) print_progress: bool,

    // --- Alignment ---
    // Boolean switch; no explicit `action` is given, so clap's default handling for a
    // `bool` field applies.
    #[arg(long = "no-align", visible_alias = "no_align")]
    pub(crate) no_align: bool,
    #[arg(long, visible_alias = "alignment_bundle")]
    pub(crate) alignment_bundle: Option<PathBuf>,
    // Note the flag is `--align-model`, which differs from the field name.
    #[arg(
        long = "align-model",
        visible_alias = "align_model",
        default_value = "facebook/wav2vec2-base-960h"
    )]
    pub(crate) alignment_model: String,
    #[arg(long = "model-dir", visible_alias = "model_dir")]
    pub(crate) model_dir: Option<PathBuf>,
    #[arg(long = "model-cache-only", visible_alias = "model_cache_only")]
    pub(crate) model_cache_only: bool,

    // --- Translation ---
    #[arg(long = "translation-model", visible_alias = "translation_model")]
    pub(crate) translation_model: Option<String>,
    #[arg(long = "translation-bundle", visible_alias = "translation_bundle")]
    pub(crate) translation_bundle: Option<PathBuf>,
    #[arg(
        long = "translation-source-language",
        visible_alias = "translation_source_language"
    )]
    pub(crate) translation_source_language: Option<String>,
    #[arg(
        long = "translation-target-language",
        visible_alias = "translation_target_language"
    )]
    pub(crate) translation_target_language: Option<String>,
    // Defaults to 256.
    #[arg(
        long = "translation-max-new-tokens",
        visible_alias = "translation_max_new_tokens",
        default_value_t = 256
    )]
    pub(crate) translation_max_new_tokens: usize,

    // --- Alignment output details ---
    // Defaults to `CliAlignmentInterpolationMethod::Nearest`.
    #[arg(long = "interpolate-method", visible_alias = "interpolate_method", value_enum, default_value_t = CliAlignmentInterpolationMethod::Nearest)]
    pub(crate) interpolate_method: CliAlignmentInterpolationMethod,
    #[arg(
        long = "return-char-alignments",
        visible_alias = "return_char_alignments"
    )]
    pub(crate) return_char_alignments: bool,

    // --- Speaker embedding and diarization ---
    #[arg(long, visible_alias = "speaker_embedding_bundle")]
    pub(crate) speaker_embedding_bundle: Option<PathBuf>,
    #[arg(long, visible_alias = "speaker_embedding_model_file")]
    pub(crate) speaker_embedding_model_file: Option<String>,
    #[arg(long, visible_alias = "speaker_embedding_dim")]
    pub(crate) speaker_embedding_dim: Option<usize>,
    #[arg(long, visible_alias = "speaker_embedding_sample_rate")]
    pub(crate) speaker_embedding_sample_rate: Option<u32>,
    #[arg(long, action = ArgAction::SetTrue)]
    pub(crate) diarize: bool,
    #[arg(long, visible_alias = "diarize_model")]
    pub(crate) diarize_model: Option<String>,
    #[arg(
        long = "diarization-model-bundle",
        visible_alias = "diarization_model_bundle"
    )]
    pub(crate) diarization_model_bundle: Option<PathBuf>,
    // The `*_file` options below are plain strings rather than `PathBuf`s.
    // NOTE(review): presumably names resolved relative to the bundle — confirm at the use site.
    #[arg(
        long = "diarization-manifest-file",
        visible_alias = "diarization_manifest_file"
    )]
    pub(crate) diarization_manifest_file: Option<String>,
    #[arg(
        long = "diarization-segmentation-model-file",
        visible_alias = "diarization_segmentation_model_file"
    )]
    pub(crate) diarization_segmentation_model_file: Option<String>,
    #[arg(
        long = "diarization-embedding-model-file",
        visible_alias = "diarization_embedding_model_file"
    )]
    pub(crate) diarization_embedding_model_file: Option<String>,
    #[arg(
        long = "diarization-plda-transform-file",
        visible_alias = "diarization_plda_transform_file"
    )]
    pub(crate) diarization_plda_transform_file: Option<String>,
    #[arg(
        long = "diarization-plda-model-file",
        visible_alias = "diarization_plda_model_file"
    )]
    pub(crate) diarization_plda_model_file: Option<String>,
    #[arg(
        long = "diarization-clustering-config-file",
        visible_alias = "diarization_clustering_config_file"
    )]
    pub(crate) diarization_clustering_config_file: Option<String>,
    #[arg(long, visible_alias = "speaker_embeddings", action = ArgAction::SetTrue)]
    pub(crate) speaker_embeddings: bool,
    #[arg(long, visible_alias = "hf_token")]
    pub(crate) hf_token: Option<String>,
    #[arg(long, visible_alias = "min_speakers")]
    pub(crate) min_speakers: Option<usize>,
    #[arg(long, visible_alias = "max_speakers")]
    pub(crate) max_speakers: Option<usize>,
    // Defaults to `CliAssignmentPolicy::Majority`.
    #[arg(
        long = "speaker-assignment-policy",
        visible_alias = "speaker_assignment_policy",
        value_enum,
        default_value_t = CliAssignmentPolicy::Majority
    )]
    pub(crate) speaker_assignment_policy: CliAssignmentPolicy,
    // Flattened: the options declared by `SpeakerDirectoryArgs` are merged into this
    // argument set instead of forming a nested command.
    #[command(flatten)]
    pub(crate) speaker_directory: SpeakerDirectoryArgs,
    // Opt-out switches; each is `false` unless its flag is passed.
    #[arg(long = "no-speaker-library", visible_alias = "no_speaker_library", action = ArgAction::SetTrue)]
    pub(crate) no_speaker_library: bool,
    #[arg(long = "no-speaker-store", visible_alias = "no_speaker_store", action = ArgAction::SetTrue)]
    pub(crate) no_speaker_store: bool,
    #[arg(long = "no-save-draft-speakers", visible_alias = "no_save_draft_speakers", action = ArgAction::SetTrue)]
    pub(crate) no_save_draft_speakers: bool,
    #[arg(long = "no-use-draft-speakers", visible_alias = "no_use_draft_speakers", action = ArgAction::SetTrue)]
    pub(crate) no_use_draft_speakers: bool,

    // --- Output ---
    // Also reachable as `-o`.
    #[arg(long, short = 'o', visible_alias = "output_dir")]
    pub(crate) output_dir: Option<PathBuf>,
    #[arg(long)]
    pub(crate) basename: Option<String>,
    // `--format` / `-f`; `--output-format` is a hidden alias and `--output_format` a
    // visible one. Collects into a list and defaults to `[Json]`.
    #[arg(
        long = "format",
        short = 'f',
        alias = "output-format",
        visible_alias = "output_format",
        value_enum,
        default_values_t = [CliOutputFormat::Json]
    )]
    pub(crate) formats: Vec<CliOutputFormat>,

    // --- Voice activity detection ---
    // Defaults to `CliVadMethod::Energy`.
    #[arg(long, visible_alias = "vad_method", value_enum, default_value_t = CliVadMethod::Energy)]
    pub(crate) vad_method: CliVadMethod,
    #[arg(long, visible_alias = "vad_onset")]
    pub(crate) vad_onset: Option<f32>,
    #[arg(long, visible_alias = "vad_offset")]
    pub(crate) vad_offset: Option<f32>,
    // NOTE(review): unit is not stated here (presumably seconds) — confirm at the use site.
    #[arg(long, visible_alias = "chunk_size")]
    pub(crate) chunk_size: Option<f64>,
    #[arg(long = "vad-model-bundle", visible_alias = "vad_model_bundle")]
    pub(crate) vad_model_bundle: Option<PathBuf>,
    #[arg(long = "vad-model-file", visible_alias = "vad_model_file")]
    pub(crate) vad_model_file: Option<String>,
    #[arg(long = "vad-input-name", visible_alias = "vad_input_name")]
    pub(crate) vad_input_name: Option<String>,
    #[arg(long = "vad-output-name", visible_alias = "vad_output_name")]
    pub(crate) vad_output_name: Option<String>,

    // --- Decoding options ---
    // Accepts a comma-separated list in a single occurrence (`value_delimiter = ','`);
    // no default is declared, so the vector is empty when the flag is absent.
    #[arg(long, value_delimiter = ',')]
    pub(crate) temperature: Vec<f32>,
    #[arg(long, visible_alias = "best_of")]
    pub(crate) best_of: Option<usize>,
    #[arg(long, visible_alias = "beam_size")]
    pub(crate) beam_size: Option<usize>,
    #[arg(long)]
    pub(crate) patience: Option<f32>,
    #[arg(long, visible_alias = "length_penalty")]
    pub(crate) length_penalty: Option<f32>,
    // Raw string; not parsed into token ids at this layer.
    #[arg(long, visible_alias = "suppress_tokens")]
    pub(crate) suppress_tokens: Option<String>,
    #[arg(long, visible_alias = "suppress_numerals", action = ArgAction::SetTrue)]
    pub(crate) suppress_numerals: bool,
    #[arg(long, visible_alias = "initial_prompt")]
    pub(crate) initial_prompt: Option<String>,
    #[arg(long)]
    pub(crate) hotwords: Option<String>,
    // `Option<bool>`: the flag takes an explicit boolean value instead of acting as a
    // switch, and `None` records that it was not supplied at all.
    #[arg(long, visible_alias = "condition_on_previous_text")]
    pub(crate) condition_on_previous_text: Option<bool>,
    // Same `Option<bool>` convention as `condition_on_previous_text`.
    #[arg(long)]
    pub(crate) fp16: Option<bool>,
    #[arg(long, visible_alias = "compression_ratio_threshold")]
    pub(crate) compression_ratio_threshold: Option<f32>,
    #[arg(long, visible_alias = "logprob_threshold")]
    pub(crate) logprob_threshold: Option<f32>,
    #[arg(long, visible_alias = "no_speech_threshold")]
    pub(crate) no_speech_threshold: Option<f32>,
    #[arg(long)]
    pub(crate) threads: Option<usize>,

    // --- Line layout and segmentation ---
    #[arg(long, visible_alias = "max_line_width")]
    pub(crate) max_line_width: Option<usize>,
    #[arg(long, visible_alias = "max_line_count")]
    pub(crate) max_line_count: Option<usize>,
    #[arg(long, visible_alias = "highlight_words", action = ArgAction::SetTrue)]
    pub(crate) highlight_words: bool,
    // Defaults to `CliSegmentResolution::Sentence`.
    #[arg(long, visible_alias = "segment_resolution", value_enum, default_value_t = CliSegmentResolution::Sentence)]
    pub(crate) segment_resolution: CliSegmentResolution,
}

/// Output formats that can be requested on the command line.
///
/// Parsed by clap through the [`ValueEnum`] derive; clap exposes each variant under
/// its kebab-case name by default (so `NativeJson` is spelled `native-json`).
///
/// Derives `PartialEq`/`Eq` like the sibling CLI enums so that selections can be
/// compared directly, e.g. `formats.contains(&CliOutputFormat::All)`.
// Variants are left without `///` docs on purpose: clap would surface those as
// possible-value help text and change the `--help` output.
#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)]
pub(crate) enum CliOutputFormat {
    All,
    Json,
    NativeJson,
    Srt,
    Vtt,
    Txt,
    Tsv,
    Aud,
}

/// Selects which transcription provider handles the request.
///
/// Parsed by clap through the [`ValueEnum`] derive; with clap's default kebab-case
/// renaming the variants are spelled `native` and `external-whisperx`.
#[derive(Clone, Copy, Debug, Eq, PartialEq, ValueEnum)]
pub(crate) enum CliProvider {
    Native,
    ExternalWhisperx,
}

/// Task requested on the command line: `transcribe` or `translate`.
///
/// Parsed by clap through the [`ValueEnum`] derive.
#[derive(Clone, Copy, Debug, Eq, PartialEq, ValueEnum)]
pub(crate) enum CliTask {
    Transcribe,
    Translate,
}

/// Device preference accepted on the command line: `auto`, `cpu` or `cuda`.
///
/// Parsed by clap through the [`ValueEnum`] derive. Derives `PartialEq`/`Eq` for
/// consistency with the sibling CLI enums, so values can be compared with `==`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)]
pub(crate) enum CliDevicePreference {
    Auto,
    Cpu,
    Cuda,
}

/// Interpolation method choices accepted on the command line: `nearest`, `linear`
/// or `ignore`.
///
/// Parsed by clap through the [`ValueEnum`] derive. Derives `PartialEq`/`Eq` for
/// consistency with the sibling CLI enums, so values can be compared with `==`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)]
pub(crate) enum CliAlignmentInterpolationMethod {
    Nearest,
    Linear,
    Ignore,
}

/// Voice-activity-detection method accepted on the command line.
///
/// Parsed by clap through the [`ValueEnum`] derive and converted into the library's
/// [`VadMethod`] by the `From` impl in this module.
#[derive(Clone, Copy, Debug, Eq, PartialEq, ValueEnum)]
pub(crate) enum CliVadMethod {
    Energy,
    Pyannote,
    Silero,
}

/// Speaker-assignment policy accepted on the command line.
///
/// Parsed by clap through the [`ValueEnum`] derive (kebab-case by default, so the
/// values are `majority`, `nearest-start` and `strict-contained`) and converted into
/// the library's [`AssignmentPolicy`] by the `From` impl in this module.
///
/// Derives `PartialEq`/`Eq` for consistency with the sibling CLI enums, so values can
/// be compared with `==`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)]
pub(crate) enum CliAssignmentPolicy {
    Majority,
    NearestStart,
    StrictContained,
}

/// Segment resolution accepted on the command line.
///
/// Parsed by clap through the [`ValueEnum`] derive and converted into the library's
/// [`SegmentResolution`] by the `From` impl in this module.
///
/// Derives `PartialEq`/`Eq` for consistency with the sibling CLI enums, so values can
/// be compared with `==`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)]
pub(crate) enum CliSegmentResolution {
    // Also accepted as `segment` on the command line (hidden alias). Kept as a plain
    // comment because clap would turn a `///` variant doc into help text.
    #[value(alias = "segment")]
    Sentence,
    Chunk,
}

/// Speaker-directory scope accepted on the command line: `auto`, `local` or `global`.
///
/// Parsed by clap through the [`ValueEnum`] derive. Derives `PartialEq`/`Eq` for
/// consistency with the sibling CLI enums, so values can be compared with `==`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)]
pub(crate) enum CliSpeakerDirectoryScope {
    Auto,
    Local,
    Global,
}

impl From<CliVadMethod> for VadMethod {
    fn from(value: CliVadMethod) -> Self {
        match value {
            CliVadMethod::Energy => Self::Energy,
            CliVadMethod::Pyannote => Self::Pyannote,
            CliVadMethod::Silero => Self::Silero,
        }
    }
}

impl From<CliAssignmentPolicy> for AssignmentPolicy {
    fn from(value: CliAssignmentPolicy) -> Self {
        match value {
            CliAssignmentPolicy::Majority => Self::Majority,
            CliAssignmentPolicy::NearestStart => Self::NearestStart,
            CliAssignmentPolicy::StrictContained => Self::StrictContained,
        }
    }
}

impl From<CliSegmentResolution> for SegmentResolution {
    fn from(value: CliSegmentResolution) -> Self {
        match value {
            CliSegmentResolution::Sentence => Self::Sentence,
            CliSegmentResolution::Chunk => Self::Chunk,
        }
    }
}