Skip to main content

whisper_macos_cli/
cli.rs

1use std::path::PathBuf;
2
3use clap::{Args, Parser, Subcommand, ValueEnum, ValueHint};
4
5pub fn long_version() -> &'static str {
6    concat!(
7        env!("CARGO_PKG_VERSION"),
8        " (",
9        env!("GIT_SHA"),
10        " ",
11        env!("BUILD_DATE"),
12        " ",
13        env!("TARGET"),
14        ")"
15    )
16}
17
18/// Transcribe audio files locally on Apple Silicon via whisper.cpp with Metal GPU.
19///
20/// Emits structured JSON to stdout for AI agent integration and Unix pipelines.
21/// Stdin/stdout contract — stderr reserved for logs.
22#[derive(Debug, Parser)]
23#[command(
24    name = "whisper-macos-cli",
25    version,
26    long_version = long_version(),
27    propagate_version = true,
28    arg_required_else_help = true,
29    max_term_width = 100,
30    after_help = "\
31EXAMPLES:
32  whisper-macos-cli transcribe voice.ogg
33  whisper-macos-cli transcribe --model base --language pt audio.mp3
34  whisper-macos-cli transcribe --timestamps --ndjson *.ogg
35  cat audio.wav | whisper-macos-cli transcribe
36  whisper-macos-cli models download base
37  whisper-macos-cli doctor
38  whisper-macos-cli commands --format json
39
40ENVIRONMENT:
41  WHISPER_MODEL       Override default model (e.g. base, small, medium)
42  WHISPER_LANGUAGE    Override default language (e.g. pt, en, es, auto)
43  NO_COLOR            Disable colored output (see https://no-color.org)
44  CI                  Disable all interactive prompts when set to true
45  RUST_LOG            Override tracing log level filter
46  SOURCE_DATE_EPOCH   Unix timestamp for reproducible builds
47
48EXIT STATUS:
49  0     Success
50  2     Usage error (invalid arguments)
51  64    No input provided
52  65    Invalid input data (corrupt audio, unsupported format)
53  66    Input file not found
54  69    Service unavailable (download failed, unsupported platform)
55  70    Internal error (whisper inference failed)
56  74    I/O error
57  78    Configuration error (model not found)
58  130   Interrupted (SIGINT / Ctrl+C)
59  141   Broken pipe (SIGPIPE)
60  143   Terminated (SIGTERM)
61
62FILES:
63  ~/Library/Application Support/whisper-macos-cli/models/
64      Downloaded Whisper model files (ggml-*.bin)
65
66SEE ALSO:
67  Project:  https://github.com/daniloaguiarbr/whisper-macos-cli
68  whisper.cpp: https://github.com/ggml-org/whisper.cpp
69
70BUGS:
71  Report bugs at https://github.com/daniloaguiarbr/whisper-macos-cli/issues"
72)]
73pub struct Cli {
74    #[command(subcommand)]
75    pub command: Option<Commands>,
76
77    /// Suppress stderr output
78    #[arg(long, global = true, env = "QUIET")]
79    pub quiet: bool,
80
81    /// Increase verbosity (-v info, -vv debug, -vvv trace)
82    #[arg(short, long, global = true, action = clap::ArgAction::Count)]
83    pub verbose: u8,
84
85    /// Print JSON schema of the output envelope and exit
86    #[arg(long, global = true)]
87    pub print_schema: bool,
88
89    /// Print the current effective configuration as JSON and exit
90    #[arg(long, global = true)]
91    pub print_config: bool,
92
93    /// Disable interactive fallbacks (for agent/script use; honored when CI=true)
94    #[arg(long, global = true, env = "NO_INPUT")]
95    pub no_input: bool,
96
97    /// Control colored output
98    #[arg(long, global = true, value_name = "WHEN", default_value = "auto")]
99    pub color: ColorChoice,
100}
101
102#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)]
103pub enum ColorChoice {
104    Auto,
105    Always,
106    Never,
107}
108
109#[derive(Debug, Subcommand)]
110pub enum Commands {
111    /// Transcribe audio files to text
112    Transcribe(TranscribeArgs),
113    /// Manage Whisper models
114    Models {
115        #[command(subcommand)]
116        action: ModelsAction,
117    },
118    /// Check system environment
119    Doctor,
120    /// Print JSON schema of output envelope
121    Schema,
122    /// Print current effective configuration as JSON
123    Config,
124    /// Generate shell completions
125    Completions {
126        /// Shell to generate completions for
127        #[arg(value_name = "SHELL")]
128        shell: clap_complete::Shell,
129    },
130    /// Emit the full command tree as JSON for agent discovery
131    Commands {
132        /// Output format
133        #[arg(long, value_name = "FMT", default_value = "json")]
134        format: CommandsFormat,
135    },
136    /// Generate a starter SKILL.md and AGENTS.md scaffolding for downstream agents
137    Init {
138        /// Target directory (defaults to current)
139        #[arg(long, value_name = "DIR", default_value = ".")]
140        target: PathBuf,
141    },
142    /// Print third-party license attribution
143    Licenses,
144    /// Resume a previously interrupted batch by workflow_id (no-op for v0.1)
145    Resume {
146        /// Workflow id from a prior interrupted run
147        #[arg(value_name = "WORKFLOW_ID")]
148        workflow_id: String,
149    },
150}
151
152#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)]
153pub enum CommandsFormat {
154    Json,
155    Yaml,
156}
157
158#[derive(Debug, Args)]
159pub struct TranscribeArgs {
160    /// Audio files to transcribe (reads stdin if omitted and not a TTY)
161    #[arg(value_hint = ValueHint::FilePath)]
162    pub files: Vec<PathBuf>,
163
164    /// Language for transcription (e.g. pt, en, es, auto)
165    #[arg(
166        short,
167        long,
168        value_name = "LANG",
169        env = "WHISPER_LANGUAGE",
170        help_heading = "Transcription"
171    )]
172    pub language: Option<String>,
173
174    /// Whisper model to use
175    #[arg(
176        short,
177        long,
178        value_name = "MODEL",
179        env = "WHISPER_MODEL",
180        default_value = "large-v3",
181        help_heading = "Transcription"
182    )]
183    pub model: WhisperModel,
184
185    /// Beam size for BeamSearch decoding [1-16]
186    #[arg(long, value_name = "N", default_value_t = 8, value_parser = parse_beam_size, help_heading = "Transcription")]
187    pub beam_size: i32,
188
189    /// Include timestamped segments in output
190    #[arg(long, help_heading = "Output")]
191    pub timestamps: bool,
192
193    /// Emit NDJSON (one JSON object per line per file)
194    #[arg(long, help_heading = "Output", conflicts_with = "output_format")]
195    pub ndjson: bool,
196
197    /// Output format
198    #[arg(long, value_name = "FMT", help_heading = "Output")]
199    pub output_format: Option<OutputFormat>,
200
201    /// VAD threshold [0.0-1.0]
202    #[arg(long, value_name = "FLOAT", default_value_t = 0.5, value_parser = parse_vad_threshold, help_heading = "Transcription")]
203    pub vad_threshold: f32,
204
205    /// Maximum parallel transcriptions [1-32]
206    #[arg(long, value_name = "N", default_value_t = 2, value_parser = parse_concurrency, help_heading = "Transcription")]
207    pub concurrency: usize,
208
209    /// Force input audio format (ogg, mp3, wav, flac)
210    #[arg(long, value_name = "FMT", help_heading = "Input")]
211    pub input_format: Option<String>,
212
213    /// Resolve inputs and exit without transcribing
214    #[arg(long, help_heading = "Execution")]
215    pub dry_run: bool,
216
217    /// Per-attempt request timeout in seconds [1-3600]
218    #[arg(long, value_name = "SECS", value_parser = parse_timeout_secs, help_heading = "Execution")]
219    pub timeout: Option<u64>,
220
221    /// Total retry attempts for transient errors [0-10]
222    #[arg(long, value_name = "N", value_parser = parse_retry_count, help_heading = "Execution")]
223    pub retry_count: Option<u32>,
224
225    /// Total elapsed time budget for retries in seconds [1-3600]
226    #[arg(long, value_name = "SECS", value_parser = parse_retry_elapsed, help_heading = "Execution")]
227    pub retry_max_elapsed: Option<u64>,
228
229    /// Fail fast in air-gapped environments without network connectivity
230    #[arg(long, help_heading = "Execution")]
231    pub offline: bool,
232
233    /// Resume a previously interrupted batch (no-op for v0.1; reserved)
234    #[arg(long, value_name = "WORKFLOW_ID", help_heading = "Execution")]
235    pub resume: Option<String>,
236}
237
238impl TranscribeArgs {
239    pub fn is_ndjson(&self) -> bool {
240        self.ndjson || matches!(self.output_format, Some(OutputFormat::Ndjson))
241    }
242}
243
244#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)]
245pub enum OutputFormat {
246    Json,
247    Ndjson,
248}
249
250#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)]
251pub enum WhisperModel {
252    Tiny,
253    Base,
254    Small,
255    Medium,
256    #[value(name = "large-v3")]
257    LargeV3,
258}
259
260impl WhisperModel {
261    pub fn as_str(&self) -> &'static str {
262        match self {
263            Self::Tiny => "tiny",
264            Self::Base => "base",
265            Self::Small => "small",
266            Self::Medium => "medium",
267            Self::LargeV3 => "large-v3",
268        }
269    }
270}
271
272impl std::fmt::Display for WhisperModel {
273    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
274        f.write_str(self.as_str())
275    }
276}
277
278#[derive(Debug, Subcommand)]
279pub enum ModelsAction {
280    /// Download a model
281    Download {
282        /// Model name (default: large-v3)
283        #[arg(value_name = "MODEL")]
284        model: Option<WhisperModel>,
285    },
286    /// List available and downloaded models
287    List,
288    /// Show model file path
289    Path {
290        /// Model name (default: large-v3)
291        #[arg(value_name = "MODEL")]
292        model: Option<WhisperModel>,
293    },
294    /// Remove a downloaded model
295    Remove {
296        /// Model name to remove
297        #[arg(value_name = "MODEL")]
298        model: WhisperModel,
299        /// Show what would be removed without deleting
300        #[arg(long)]
301        dry_run: bool,
302    },
303}
304
/// Parses a beam-size argument.
///
/// Returns the value when `s` is an `i32` in `1..=16`; otherwise returns an
/// error message describing either the parse failure or the range violation.
fn parse_beam_size(s: &str) -> Result<i32, String> {
    match s.parse::<i32>() {
        Ok(beam) if (1..=16).contains(&beam) => Ok(beam),
        Ok(beam) => Err(format!("beam size must be between 1 and 16, got {beam}")),
        Err(err) => Err(format!("invalid integer: {err}")),
    }
}
312
/// Parses a VAD-threshold argument.
///
/// Returns the value when `s` is an `f32` inside `0.0..=1.0`; otherwise
/// returns an error message. NaN is outside the range and is rejected.
fn parse_vad_threshold(s: &str) -> Result<f32, String> {
    s.parse::<f32>()
        .map_err(|err| format!("invalid float: {err}"))
        .and_then(|threshold| {
            if (0.0..=1.0).contains(&threshold) {
                Ok(threshold)
            } else {
                Err(format!(
                    "VAD threshold must be between 0.0 and 1.0, got {threshold}"
                ))
            }
        })
}
322
/// Parses a concurrency argument.
///
/// Returns the value when `s` is a `usize` in `1..=32`; otherwise returns an
/// error message describing either the parse failure or the range violation.
fn parse_concurrency(s: &str) -> Result<usize, String> {
    let workers = match s.parse::<usize>() {
        Ok(parsed) => parsed,
        Err(err) => return Err(format!("invalid integer: {err}")),
    };
    if (1..=32).contains(&workers) {
        Ok(workers)
    } else {
        Err(format!("concurrency must be between 1 and 32, got {workers}"))
    }
}
330
/// Parses a timeout argument given in seconds.
///
/// Returns the value when `s` is a `u64` in `1..=3600`; otherwise returns an
/// error message describing either the parse failure or the range violation.
fn parse_timeout_secs(s: &str) -> Result<u64, String> {
    let secs = s
        .parse::<u64>()
        .map_err(|err| format!("invalid integer: {err}"))?;
    match secs {
        1..=3600 => Ok(secs),
        _ => Err(format!(
            "timeout must be between 1 and 3600 seconds, got {secs}"
        )),
    }
}
340
/// Parses a retry-count argument.
///
/// Returns the value when `s` is a `u32` no greater than 10; otherwise returns
/// an error message describing either the parse failure or the range violation.
fn parse_retry_count(s: &str) -> Result<u32, String> {
    let attempts = s
        .parse::<u32>()
        .map_err(|err| format!("invalid integer: {err}"))?;
    match attempts {
        0..=10 => Ok(attempts),
        _ => Err(format!(
            "retry count must be between 0 and 10, got {attempts}"
        )),
    }
}
348
/// Parses a retry time-budget argument given in seconds.
///
/// Returns the value when `s` is a `u64` in `1..=3600`; otherwise returns an
/// error message describing either the parse failure or the range violation.
fn parse_retry_elapsed(s: &str) -> Result<u64, String> {
    match s.parse::<u64>() {
        Err(err) => Err(format!("invalid integer: {err}")),
        Ok(budget) if budget < 1 || budget > 3600 => Err(format!(
            "retry max elapsed must be between 1 and 3600 seconds, got {budget}"
        )),
        Ok(budget) => Ok(budget),
    }
}
358
#[cfg(test)]
mod tests {
    use super::*;

    // Builds the clap command for `Cli` and runs clap's own `debug_assert`
    // consistency checks on it.
    #[test]
    fn cli_debug_assert() {
        let command = <Cli as clap::CommandFactory>::command();
        command.debug_assert();
    }
}
368}