Skip to main content

whisper_macos_cli/
cli.rs

1use std::path::PathBuf;
2
3use clap::{Args, Parser, Subcommand, ValueEnum, ValueHint};
4
5pub fn long_version() -> &'static str {
6    concat!(
7        env!("CARGO_PKG_VERSION"),
8        " (",
9        env!("GIT_SHA"),
10        " ",
11        env!("BUILD_DATE"),
12        " ",
13        env!("TARGET"),
14        ")"
15    )
16}
17
18/// Transcribe audio files locally on Apple Silicon via whisper.cpp with Metal GPU.
19///
20/// Emits structured JSON to stdout for AI agent integration and Unix pipelines.
21/// Stdin/stdout contract — stderr reserved for logs.
22#[derive(Debug, Parser)]
23#[command(
24    name = "whisper-macos-cli",
25    version,
26    long_version = long_version(),
27    propagate_version = true,
28    arg_required_else_help = true,
29    max_term_width = 100,
30    after_help = "\
31EXAMPLES:
32  whisper-macos-cli transcribe voice.ogg
33  whisper-macos-cli transcribe --model base --language pt audio.mp3
34  whisper-macos-cli transcribe --timestamps --ndjson *.ogg
35  cat audio.wav | whisper-macos-cli transcribe
36  whisper-macos-cli models download base
37  whisper-macos-cli doctor
38  whisper-macos-cli commands --format json
39
40ENVIRONMENT:
41  WHISPER_MODEL       Override default model (e.g. base, small, medium)
42  WHISPER_LANGUAGE    Override default language (e.g. pt, en, es, auto)
43  NO_COLOR            Disable colored output (see https://no-color.org)
44  CI                  Disable all interactive prompts when set to true
45  RUST_LOG            Override tracing log level filter
46  SOURCE_DATE_EPOCH   Unix timestamp for reproducible builds
47
48EXIT STATUS:
49  0     Success
50  2     Usage error (invalid arguments)
51  64    No input provided
52  65    Invalid input data (corrupt audio, unsupported format)
53  66    Input file not found
54  69    Service unavailable (download failed, unsupported platform)
55  70    Internal error (whisper inference failed)
56  74    I/O error
57  78    Configuration error (model not found)
58  130   Interrupted (SIGINT / Ctrl+C)
59  141   Broken pipe (SIGPIPE)
60  143   Terminated (SIGTERM)
61
62FILES:
63  ~/Library/Application Support/whisper-macos-cli/models/
64      Downloaded Whisper model files (ggml-*.bin)
65
66SEE ALSO:
67  Project:  https://github.com/daniloaguiarbr/whisper-macos-cli
68  whisper.cpp: https://github.com/ggml-org/whisper.cpp
69
70BUGS:
71  Report bugs at https://github.com/daniloaguiarbr/whisper-macos-cli/issues"
72)]
73pub struct Cli {
74    #[command(subcommand)]
75    pub command: Option<Commands>,
76
77    /// Suppress stderr output
78    #[arg(long, global = true, env = "QUIET")]
79    pub quiet: bool,
80
81    /// Increase verbosity (-v info, -vv debug, -vvv trace)
82    #[arg(short, long, global = true, action = clap::ArgAction::Count)]
83    pub verbose: u8,
84
85    /// Print JSON schema of the output envelope and exit
86    #[arg(long, global = true)]
87    pub print_schema: bool,
88
89    /// Print the current effective configuration as JSON and exit
90    #[arg(long, global = true)]
91    pub print_config: bool,
92
93    /// Disable interactive fallbacks (for agent/script use; honored when CI=true)
94    #[arg(long, global = true, env = "NO_INPUT")]
95    pub no_input: bool,
96
97    /// Control colored output
98    #[arg(long, global = true, value_name = "WHEN", default_value = "auto")]
99    pub color: ColorChoice,
100}
101
102#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)]
103pub enum ColorChoice {
104    Auto,
105    Always,
106    Never,
107}
108
109#[derive(Debug, Subcommand)]
110pub enum Commands {
111    /// Transcribe audio files to text
112    Transcribe(TranscribeArgs),
113    /// Manage Whisper models
114    Models {
115        #[command(subcommand)]
116        action: ModelsAction,
117    },
118    /// Check system environment
119    Doctor,
120    /// Print JSON schema of output envelope
121    Schema,
122    /// Print current effective configuration as JSON
123    Config,
124    /// Generate shell completions
125    Completions {
126        /// Shell to generate completions for
127        #[arg(value_name = "SHELL")]
128        shell: clap_complete::Shell,
129    },
130    /// Emit the full command tree as JSON for agent discovery
131    Commands {
132        /// Output format
133        #[arg(long, value_name = "FMT", default_value = "json")]
134        format: CommandsFormat,
135    },
136    /// Generate a starter SKILL.md and AGENTS.md scaffolding for downstream agents
137    Init {
138        /// Target directory (defaults to current)
139        #[arg(long, value_name = "DIR", default_value = ".")]
140        target: PathBuf,
141    },
142    /// Print third-party license attribution
143    Licenses,
144    /// Resume a previously interrupted batch by workflow_id (no-op for v0.1)
145    Resume {
146        /// Workflow id from a prior interrupted run
147        #[arg(value_name = "WORKFLOW_ID")]
148        workflow_id: String,
149    },
150}
151
152#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)]
153pub enum CommandsFormat {
154    Json,
155    Yaml,
156}
157
158#[derive(Debug, Args)]
159pub struct TranscribeArgs {
160    /// Audio files to transcribe (reads stdin if omitted and not a TTY)
161    #[arg(value_hint = ValueHint::FilePath)]
162    pub files: Vec<PathBuf>,
163
164    /// Language for transcription (e.g. pt, en, es, auto)
165    #[arg(
166        short,
167        long,
168        value_name = "LANG",
169        env = "WHISPER_LANGUAGE",
170        help_heading = "Transcription"
171    )]
172    pub language: Option<String>,
173
174    /// Whisper model to use
175    #[arg(
176        short,
177        long,
178        value_name = "MODEL",
179        env = "WHISPER_MODEL",
180        default_value = "large-v3",
181        help_heading = "Transcription"
182    )]
183    pub model: WhisperModel,
184
185    /// Beam size for BeamSearch decoding [1-16]
186    #[arg(long, value_name = "N", default_value_t = 8, value_parser = parse_beam_size, help_heading = "Transcription")]
187    pub beam_size: i32,
188
189    /// Include timestamped segments in output
190    #[arg(long, help_heading = "Output")]
191    pub timestamps: bool,
192
193    /// Emit NDJSON (one JSON object per line per file)
194    #[arg(long, help_heading = "Output", conflicts_with = "output_format")]
195    pub ndjson: bool,
196
197    /// Output format
198    #[arg(long, value_name = "FMT", help_heading = "Output")]
199    pub output_format: Option<OutputFormat>,
200
201    /// VAD threshold [0.0-1.0]
202    #[arg(long, value_name = "FLOAT", default_value_t = 0.5, value_parser = parse_vad_threshold, help_heading = "Transcription")]
203    pub vad_threshold: f32,
204
205    /// Maximum parallel transcriptions [1-32]
206    #[arg(long, value_name = "N", default_value_t = 2, value_parser = parse_concurrency, help_heading = "Transcription")]
207    pub concurrency: usize,
208
209    /// Force input audio format (ogg, mp3, wav, flac)
210    #[arg(long, value_name = "FMT", help_heading = "Input")]
211    pub input_format: Option<String>,
212
213    /// Path to ffmpeg binary (auto-detected from PATH by default)
214    #[arg(
215        long,
216        value_name = "PATH",
217        env = "WHISPER_FFMPEG_BINARY",
218        default_value = "ffmpeg",
219        help_heading = "Input"
220    )]
221    pub ffmpeg_binary: String,
222
223    /// Disable automatic ffmpeg fallback (e.g. for reproducing bugs)
224    #[arg(long, env = "WHISPER_NO_FFMPEG_FALLBACK", help_heading = "Input")]
225    pub no_ffmpeg_fallback: bool,
226
227    /// Resolve inputs and exit without transcribing
228    #[arg(long, help_heading = "Execution")]
229    pub dry_run: bool,
230
231    /// Per-attempt request timeout in seconds [1-3600]
232    #[arg(long, value_name = "SECS", value_parser = parse_timeout_secs, help_heading = "Execution")]
233    pub timeout: Option<u64>,
234
235    /// Total retry attempts for transient errors [0-10]
236    #[arg(long, value_name = "N", value_parser = parse_retry_count, help_heading = "Execution")]
237    pub retry_count: Option<u32>,
238
239    /// Total elapsed time budget for retries in seconds [1-3600]
240    #[arg(long, value_name = "SECS", value_parser = parse_retry_elapsed, help_heading = "Execution")]
241    pub retry_max_elapsed: Option<u64>,
242
243    /// Fail fast in air-gapped environments without network connectivity
244    #[arg(long, help_heading = "Execution")]
245    pub offline: bool,
246
247    /// Resume a previously interrupted batch (no-op for v0.1; reserved)
248    #[arg(long, value_name = "WORKFLOW_ID", help_heading = "Execution")]
249    pub resume: Option<String>,
250}
251
252impl TranscribeArgs {
253    pub fn is_ndjson(&self) -> bool {
254        self.ndjson || matches!(self.output_format, Some(OutputFormat::Ndjson))
255    }
256}
257
258#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)]
259pub enum OutputFormat {
260    Json,
261    Ndjson,
262}
263
264#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)]
265pub enum WhisperModel {
266    Tiny,
267    Base,
268    Small,
269    Medium,
270    #[value(name = "large-v3")]
271    LargeV3,
272}
273
274impl WhisperModel {
275    pub fn as_str(&self) -> &'static str {
276        match self {
277            Self::Tiny => "tiny",
278            Self::Base => "base",
279            Self::Small => "small",
280            Self::Medium => "medium",
281            Self::LargeV3 => "large-v3",
282        }
283    }
284}
285
286impl std::fmt::Display for WhisperModel {
287    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
288        f.write_str(self.as_str())
289    }
290}
291
292#[derive(Debug, Subcommand)]
293pub enum ModelsAction {
294    /// Download a model
295    Download {
296        /// Model name (default: large-v3)
297        #[arg(value_name = "MODEL")]
298        model: Option<WhisperModel>,
299    },
300    /// List available and downloaded models
301    List,
302    /// Show model file path
303    Path {
304        /// Model name (default: large-v3)
305        #[arg(value_name = "MODEL")]
306        model: Option<WhisperModel>,
307    },
308    /// Remove a downloaded model
309    Remove {
310        /// Model name to remove
311        #[arg(value_name = "MODEL")]
312        model: WhisperModel,
313        /// Show what would be removed without deleting
314        #[arg(long)]
315        dry_run: bool,
316    },
317}
318
/// Parses a beam size and checks that it lies in `1..=16`.
///
/// # Errors
///
/// Returns a message starting with `invalid integer:` when `s` is not an
/// `i32`, or a range message quoting the parsed value when it is outside
/// `1..=16`.
fn parse_beam_size(s: &str) -> Result<i32, String> {
    match s.parse::<i32>() {
        Ok(beam) if (1..=16).contains(&beam) => Ok(beam),
        Ok(beam) => Err(format!("beam size must be between 1 and 16, got {beam}")),
        Err(e) => Err(format!("invalid integer: {e}")),
    }
}
326
/// Parses a VAD threshold and checks that it lies in `0.0..=1.0`.
///
/// # Errors
///
/// Returns a message starting with `invalid float:` when `s` is not an `f32`,
/// or a range message quoting the parsed value when it is outside
/// `0.0..=1.0`. NaN is never contained in the range, so it is reported as out
/// of range.
fn parse_vad_threshold(s: &str) -> Result<f32, String> {
    match s.parse::<f32>() {
        Ok(threshold) if (0.0..=1.0).contains(&threshold) => Ok(threshold),
        Ok(threshold) => Err(format!(
            "VAD threshold must be between 0.0 and 1.0, got {threshold}"
        )),
        Err(e) => Err(format!("invalid float: {e}")),
    }
}
336
/// Parses a concurrency level and checks that it lies in `1..=32`.
///
/// # Errors
///
/// Returns a message starting with `invalid integer:` when `s` is not a
/// `usize`, or a range message quoting the parsed value when it is outside
/// `1..=32`.
fn parse_concurrency(s: &str) -> Result<usize, String> {
    let workers = s
        .parse::<usize>()
        .map_err(|e| format!("invalid integer: {e}"))?;
    if (1..=32).contains(&workers) {
        Ok(workers)
    } else {
        Err(format!("concurrency must be between 1 and 32, got {workers}"))
    }
}
344
/// Parses a timeout in seconds and checks that it lies in `1..=3600`.
///
/// # Errors
///
/// Returns a message starting with `invalid integer:` when `s` is not a
/// `u64`, or a range message quoting the parsed value when it is outside
/// `1..=3600`.
fn parse_timeout_secs(s: &str) -> Result<u64, String> {
    let secs = match s.parse::<u64>() {
        Ok(n) => n,
        Err(e) => return Err(format!("invalid integer: {e}")),
    };
    match secs {
        1..=3600 => Ok(secs),
        _ => Err(format!(
            "timeout must be between 1 and 3600 seconds, got {secs}"
        )),
    }
}
354
/// Parses a retry count and checks that it does not exceed 10.
///
/// # Errors
///
/// Returns a message starting with `invalid integer:` when `s` is not a
/// `u32`, or a range message quoting the parsed value when it is above 10.
fn parse_retry_count(s: &str) -> Result<u32, String> {
    const MAX_RETRIES: u32 = 10;
    s.parse::<u32>()
        .map_err(|e| format!("invalid integer: {e}"))
        .and_then(|retries| {
            if retries <= MAX_RETRIES {
                Ok(retries)
            } else {
                Err(format!(
                    "retry count must be between 0 and 10, got {retries}"
                ))
            }
        })
}
362
/// Parses a retry time budget in seconds and checks that it lies in `1..=3600`.
///
/// # Errors
///
/// Returns a message starting with `invalid integer:` when `s` is not a
/// `u64`, or a range message quoting the parsed value when it is outside
/// `1..=3600`.
fn parse_retry_elapsed(s: &str) -> Result<u64, String> {
    let budget: u64 = s.parse().map_err(|e| format!("invalid integer: {e}"))?;
    // Zero and anything above one hour are both outside the accepted range.
    if budget == 0 || budget > 3600 {
        return Err(format!(
            "retry max elapsed must be between 1 and 3600 seconds, got {budget}"
        ));
    }
    Ok(budget)
}
372
// Unit tests for the clap definition and the range-checked value parsers.
#[cfg(test)]
mod tests {
    use super::*;
    use clap::CommandFactory;

    // Runs clap's own consistency checks over the whole command tree.
    #[test]
    fn cli_debug_assert() {
        Cli::command().debug_assert();
    }

    #[test]
    fn parse_beam_size_accepts_boundaries() {
        assert_eq!(parse_beam_size("1").unwrap(), 1);
        assert_eq!(parse_beam_size("8").unwrap(), 8);
        assert_eq!(parse_beam_size("16").unwrap(), 16);
    }

    #[test]
    fn parse_beam_size_rejects_below_range() {
        assert!(parse_beam_size("0").is_err());
        assert!(parse_beam_size("-1").is_err());
    }

    #[test]
    fn parse_beam_size_rejects_above_range() {
        assert!(parse_beam_size("17").is_err());
        assert!(parse_beam_size("100").is_err());
    }

    #[test]
    fn parse_beam_size_rejects_non_integer() {
        assert!(parse_beam_size("abc").is_err());
        assert!(parse_beam_size("1.5").is_err());
        assert!(parse_beam_size("").is_err());
    }

    #[test]
    fn parse_vad_threshold_accepts_boundaries() {
        assert!((parse_vad_threshold("0.0").unwrap() - 0.0).abs() < f32::EPSILON);
        assert!((parse_vad_threshold("0.5").unwrap() - 0.5).abs() < f32::EPSILON);
        assert!((parse_vad_threshold("1.0").unwrap() - 1.0).abs() < f32::EPSILON);
    }

    #[test]
    fn parse_vad_threshold_rejects_out_of_range() {
        assert!(parse_vad_threshold("-0.1").is_err());
        assert!(parse_vad_threshold("1.5").is_err());
        assert!(parse_vad_threshold("2.0").is_err());
    }

    #[test]
    fn parse_vad_threshold_rejects_non_float() {
        assert!(parse_vad_threshold("abc").is_err());
        assert!(parse_vad_threshold("").is_err());
    }

    // These strings parse successfully as `f32`, so they must be caught by the
    // range check rather than by the parse step.
    #[test]
    fn parse_vad_threshold_rejects_non_finite() {
        assert!(parse_vad_threshold("NaN").is_err());
        assert!(parse_vad_threshold("inf").is_err());
        assert!(parse_vad_threshold("-inf").is_err());
    }

    #[test]
    fn parse_concurrency_accepts_boundaries() {
        assert_eq!(parse_concurrency("1").unwrap(), 1);
        assert_eq!(parse_concurrency("16").unwrap(), 16);
        assert_eq!(parse_concurrency("32").unwrap(), 32);
    }

    #[test]
    fn parse_concurrency_rejects_below_range() {
        assert!(parse_concurrency("0").is_err());
    }

    #[test]
    fn parse_concurrency_rejects_above_range() {
        assert!(parse_concurrency("33").is_err());
        assert!(parse_concurrency("1000").is_err());
    }

    #[test]
    fn parse_concurrency_rejects_non_integer() {
        assert!(parse_concurrency("abc").is_err());
        assert!(parse_concurrency("-1").is_err());
        assert!(parse_concurrency("").is_err());
    }

    #[test]
    fn parse_timeout_secs_accepts_valid_range() {
        assert_eq!(parse_timeout_secs("1").unwrap(), 1);
        assert_eq!(parse_timeout_secs("3600").unwrap(), 3600);
    }

    #[test]
    fn parse_timeout_secs_rejects_out_of_range() {
        assert!(parse_timeout_secs("0").is_err());
        assert!(parse_timeout_secs("3601").is_err());
    }

    #[test]
    fn parse_timeout_secs_rejects_non_integer() {
        assert!(parse_timeout_secs("abc").is_err());
        assert!(parse_timeout_secs("-1").is_err());
        assert!(parse_timeout_secs("").is_err());
    }

    #[test]
    fn parse_retry_count_accepts_max() {
        assert_eq!(parse_retry_count("0").unwrap(), 0);
        assert_eq!(parse_retry_count("10").unwrap(), 10);
    }

    #[test]
    fn parse_retry_count_rejects_above_max() {
        assert!(parse_retry_count("11").is_err());
        assert!(parse_retry_count("100").is_err());
    }

    #[test]
    fn parse_retry_count_rejects_non_integer() {
        assert!(parse_retry_count("abc").is_err());
        assert!(parse_retry_count("-1").is_err());
        assert!(parse_retry_count("").is_err());
    }

    #[test]
    fn parse_retry_elapsed_accepts_valid_range() {
        assert_eq!(parse_retry_elapsed("1").unwrap(), 1);
        assert_eq!(parse_retry_elapsed("3600").unwrap(), 3600);
    }

    #[test]
    fn parse_retry_elapsed_rejects_out_of_range() {
        assert!(parse_retry_elapsed("0").is_err());
        assert!(parse_retry_elapsed("3601").is_err());
    }

    #[test]
    fn parse_retry_elapsed_rejects_non_integer() {
        assert!(parse_retry_elapsed("abc").is_err());
        assert!(parse_retry_elapsed("-1").is_err());
        assert!(parse_retry_elapsed("").is_err());
    }
}