use std::path::PathBuf;
use clap::{Args, Parser, Subcommand, ValueEnum};
/// Extra text appended to the long help output (it is wired in through
/// `after_long_help` on `Cli`): a list of usage tips followed by example
/// invocations. This is user-facing runtime text, so edit it with care.
const HELP_FOOTER: &str = r#"Tips:
* Fast path: gemini-tts-cli speak "Say warmly: hello" -o hello.wav
* Tags pass through: gemini-tts-cli speak "[whispers] this part is quiet" --voice Achernar
* Add direction only when useful: --style, --pace, --accent, --scene, --profile
* Use script first when an agent is authoring performance text, then pipe that prompt into speak
* Run voices recommend "excited podcast host" or tags list to choose better defaults
* MP3, M4A, and FLAC require ffmpeg; WAV and PCM are written directly
* Run doctor --live before important jobs to verify the API key and model actually produce audio
Examples:
gemini-tts-cli speak "Say cheerfully: Have a wonderful day!" --voice Kore -o day.wav
gemini-tts-cli script "Welcome back." --style "calm expert narrator" --accent "London English" --tag "[warmly]"
gemini-tts-cli speak script.txt --text-file --speaker Joe=Kore --speaker Jane=Puck -o dialogue.mp3
gemini-tts-cli voices recommend "sleepy intimate audiobook"
gemini-tts-cli auth import-env
"#;
// Top-level command-line interface, parsed with clap's derive API.
// The short/long descriptions are given explicitly below and `HELP_FOOTER`
// is appended after the long help.
// NOTE: plain `//` comments are used on purpose. clap's derive turns `///`
// doc comments into help text, which would change the `--help` output.
#[derive(Parser)]
#[command(
version,
about = "Generate expressive Gemini TTS audio for humans and AI agents",
long_about = "Agent-friendly Gemini text-to-speech CLI. It generates WAV/PCM natively, compressed audio through ffmpeg, and exposes voices, tags, languages, prompt templates, diagnostics, and JSON envelopes for reliable agent use.",
after_long_help = HELP_FOOTER
)]
pub struct Cli {
// `--json`: boolean flag, accepted at any subcommand level (`global = true`).
// Presumably selects the JSON envelope output mentioned in `long_about` —
// TODO confirm against the command handlers.
#[arg(long, global = true)]
pub json: bool,
// `--quiet`: boolean flag, also global. Presumably suppresses non-essential
// output — TODO confirm against the command handlers.
#[arg(long, global = true)]
pub quiet: bool,
// The subcommand selected by the user.
#[command(subcommand)]
pub command: Commands,
}
// Every top-level subcommand of the CLI. clap derives the command names from
// the variant names in kebab-case (e.g. `AgentInfo` -> `agent-info`).
// NOTE: plain `//` comments are used on purpose. clap's derive turns `///`
// doc comments into help text, which would change the `--help` output.
#[derive(Subcommand)]
pub enum Commands {
// `speak`: arguments are defined in `SpeakArgs`. Per the help footer
// examples, this turns text into an audio file (`-o hello.wav`).
Speak(SpeakArgs),
// `script`: arguments are defined in `ScriptArgs`. Per the help footer, used
// to author a performance prompt that can then be piped into `speak`.
Script(ScriptArgs),
// `lint`: arguments are defined in `LintArgs`. What is checked is not
// visible in this file.
Lint(LintArgs),
// `voices <action>`: nested subcommands, see `VoicesAction`.
Voices {
#[command(subcommand)]
action: VoicesAction,
},
// `tags <action>`: nested subcommands, see `TagsAction`.
Tags {
#[command(subcommand)]
action: TagsAction,
},
// `languages <action>`: nested subcommands, see `LanguagesAction`.
Languages {
#[command(subcommand)]
action: LanguagesAction,
},
// `doctor`: diagnostics; flags are defined in `DoctorArgs`.
Doctor(DoctorArgs),
// `auth <action>`: nested subcommands, see `AuthAction`.
Auth {
#[command(subcommand)]
action: AuthAction,
},
// `agent-info`, also reachable through the visible alias `info`. Takes no
// arguments.
#[command(visible_alias = "info")]
AgentInfo,
// `skill <action>`: nested subcommands, see `SkillAction`.
Skill {
#[command(subcommand)]
action: SkillAction,
},
// `config <action>`: nested subcommands, see `ConfigAction`.
Config {
#[command(subcommand)]
action: ConfigAction,
},
// `update`, with an optional `--check` boolean flag. Presumably `--check`
// only reports whether an update exists — TODO confirm against the handler.
Update {
#[arg(long)]
check: bool,
},
// `contract <CODE>`: hidden from help output (`hide = true`); takes one
// required positional `i32`. Its purpose is not visible in this file.
#[command(hide = true)]
Contract {
code: i32,
},
}
// Arguments of the `speak` subcommand.
// NOTE: plain `//` comments are used on purpose. clap's derive turns `///`
// doc comments into help text, which would change the `--help` output.
#[derive(Args, Clone)]
pub struct SpeakArgs {
// Required positional argument. The help footer example
// `speak script.txt --text-file` suggests it is treated as a file path when
// `--text-file` is set — TODO confirm against the handler.
pub text: String,
// `--text-file`: boolean flag (see the note on `text`).
#[arg(long)]
pub text_file: bool,
// `-o` / `--out`: output path, defaults to `speech.wav`.
#[arg(short, long, default_value = "speech.wav")]
pub out: PathBuf,
// `--format`: one of the `AudioFormat` values, defaults to `auto`.
#[arg(long, value_enum, default_value = "auto")]
pub format: AudioFormat,
// `--model`: optional model name; the fallback when omitted is decided
// outside this file.
#[arg(long)]
pub model: Option<String>,
// `--voice`: optional voice name (the help footer uses e.g. `Kore`).
#[arg(long)]
pub voice: Option<String>,
// `--speaker NAME=VOICE`: may be repeated; values are collected in order.
// NOTE(review): `num_args = 1..=2` lets a single `--speaker` take two values
// (`--speaker Joe=Kore Jane=Puck`), so a positional TEXT written directly
// after one `--speaker` value would be consumed as a second speaker value.
// It also does not cap the total number of speakers, because the flag can be
// repeated. Confirm that both effects are intended.
#[arg(long, value_name = "NAME=VOICE", num_args = 1..=2)]
pub speaker: Vec<String>,
// `--profile`, `--scene`, `--style`, `--pace`, `--accent`: optional
// free-text options that the help footer groups together as "direction".
#[arg(long)]
pub profile: Option<String>,
#[arg(long)]
pub scene: Option<String>,
#[arg(long)]
pub style: Option<String>,
#[arg(long)]
pub pace: Option<String>,
#[arg(long)]
pub accent: Option<String>,
// `--language`: optional; the accepted values are not defined in this file.
#[arg(long)]
pub language: Option<String>,
// `--tag TAG`: may be repeated (the help footer uses e.g. `[warmly]`).
#[arg(long, value_name = "TAG")]
pub tag: Vec<String>,
// `--raw`, `--play`, `--force`: boolean flags. Their effect is implemented
// elsewhere and is not visible in this file.
#[arg(long)]
pub raw: bool,
#[arg(long)]
pub play: bool,
#[arg(long)]
pub force: bool,
}
// Arguments of the `script` subcommand.
// NOTE: plain `//` comments are used on purpose. clap's derive turns `///`
// doc comments into help text, which would change the `--help` output.
#[derive(Args, Clone)]
pub struct ScriptArgs {
// Required positional argument. Presumably treated as a file path when
// `--text-file` is set — TODO confirm against the handler.
pub text: String,
// `--text-file`: boolean flag (see the note on `text`).
#[arg(long)]
pub text_file: bool,
// `-o` / `--out`: optional output path; there is no default here, so what
// happens when it is omitted is decided by the handler.
#[arg(short, long)]
pub out: Option<PathBuf>,
// `--profile`, `--scene`, `--style`, `--pace`, `--accent`: optional
// free-text options that the help footer groups together as "direction".
#[arg(long)]
pub profile: Option<String>,
#[arg(long)]
pub scene: Option<String>,
#[arg(long)]
pub style: Option<String>,
#[arg(long)]
pub pace: Option<String>,
#[arg(long)]
pub accent: Option<String>,
// `--language`: optional; the accepted values are not defined in this file.
#[arg(long)]
pub language: Option<String>,
// `--tag TAG`: may be repeated (the help footer uses e.g. `[warmly]`).
#[arg(long, value_name = "TAG")]
pub tag: Vec<String>,
// `--speaker NAME=VOICE`: may be repeated; values are collected in order.
// NOTE(review): `num_args = 1..=2` lets a single `--speaker` take two
// values, so a positional TEXT written directly after one `--speaker` value
// would be consumed as a second speaker value. Confirm this is intended.
#[arg(long, value_name = "NAME=VOICE", num_args = 1..=2)]
pub speaker: Vec<String>,
}
// Arguments of the `lint` subcommand.
// NOTE: plain `//` comments are used on purpose. clap's derive turns `///`
// doc comments into help text, which would change the `--help` output.
#[derive(Args, Clone)]
pub struct LintArgs {
// Required positional argument. Presumably treated as a file path when
// `--text-file` is set — TODO confirm against the handler.
pub text: String,
// `--text-file`: boolean flag (see the note on `text`).
#[arg(long)]
pub text_file: bool,
// `--speaker NAME`: may be repeated. Unlike `speak`/`script`, the value
// placeholder is a bare NAME and each occurrence takes exactly one value.
#[arg(long, value_name = "NAME")]
pub speaker: Vec<String>,
}
// Audio output formats accepted by `--format`. The serde representation is
// the lowercase variant name (`rename_all = "lowercase"`).
// Per the help footer, MP3, M4A and FLAC require ffmpeg while WAV and PCM are
// written directly.
// NOTE: plain `//` comments are used on purpose. clap's `ValueEnum` derive
// turns `///` doc comments on variants into help text for the possible values.
#[derive(Clone, Copy, Debug, ValueEnum, serde::Serialize, serde::Deserialize, PartialEq, Eq)]
#[serde(rename_all = "lowercase")]
pub enum AudioFormat {
// Default for `speak --format`. Presumably resolved from the output path's
// extension — TODO confirm against the handler.
Auto,
Wav,
Pcm,
Mp3,
M4a,
Flac,
}
/// Renders the format as its lowercase name: `auto`, `wav`, `pcm`, `mp3`,
/// `m4a` or `flac`.
impl std::fmt::Display for AudioFormat {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Select the static label first, then emit it with a single write.
        let label = match self {
            Self::Auto => "auto",
            Self::Wav => "wav",
            Self::Pcm => "pcm",
            Self::Mp3 => "mp3",
            Self::M4a => "m4a",
            Self::Flac => "flac",
        };
        f.write_str(label)
    }
}
// Actions of the `voices` subcommand.
// NOTE: plain `//` comments are used on purpose. clap's derive turns `///`
// doc comments into help text, which would change the `--help` output.
#[derive(Subcommand)]
pub enum VoicesAction {
// `voices list [--query <QUERY>]`. Presumably the query filters the listing —
// TODO confirm against the handler.
List {
#[arg(long)]
query: Option<String>,
},
// `voices recommend <BRIEF> [-c|--count <COUNT>]`. `brief` is a required
// free-text description (the help footer uses "excited podcast host");
// `count` defaults to 5.
Recommend {
brief: String,
#[arg(short, long, default_value = "5")]
count: usize,
},
}
// Actions of the `tags` subcommand.
// NOTE: plain `//` comments are used on purpose. clap's derive turns `///`
// doc comments into help text, which would change the `--help` output.
#[derive(Subcommand)]
pub enum TagsAction {
// `tags list [--category <CATEGORY>]`, where the category is one of the
// `TagCategory` values.
List {
#[arg(long, value_enum)]
category: Option<TagCategory>,
},
// `tags search <QUERY>`: `query` is a required positional argument.
Search {
query: String,
},
// `tags recipes`: takes no arguments.
Recipes,
}
// Tag categories accepted by `tags list --category`. Serialized by serde in
// kebab-case; only `Serialize` is derived, so values are written, not read.
// NOTE: plain `//` comments are used on purpose. clap's `ValueEnum` derive
// turns `///` doc comments on variants into help text for the possible values.
#[derive(Clone, Copy, Debug, ValueEnum, serde::Serialize, PartialEq, Eq)]
#[serde(rename_all = "kebab-case")]
pub enum TagCategory {
Emotion,
Pace,
Volume,
Pause,
Nonverbal,
Character,
Accent,
}
// Actions of the `languages` subcommand.
// NOTE: plain `//` comments are used on purpose. clap's derive turns `///`
// doc comments into help text, which would change the `--help` output.
#[derive(Subcommand)]
pub enum LanguagesAction {
// `languages list [--query <QUERY>]`. Presumably the query filters the
// listing — TODO confirm against the handler.
List {
#[arg(long)]
query: Option<String>,
},
}
// Flags of the `doctor` subcommand.
// NOTE: plain `//` comments are used on purpose. clap's derive turns `///`
// doc comments into help text, which would change the `--help` output.
#[derive(Args)]
pub struct DoctorArgs {
// `--live`: per the help footer, verifies that the API key and model
// actually produce audio.
#[arg(long)]
pub live: bool,
// `--require-ffmpeg`: boolean flag. Presumably makes a missing ffmpeg a
// failure rather than a warning — TODO confirm against the handler.
#[arg(long)]
pub require_ffmpeg: bool,
}
// Actions of the `auth` subcommand.
// NOTE: plain `//` comments are used on purpose. clap's derive turns `///`
// doc comments into help text, which would change the `--help` output.
#[derive(Subcommand)]
pub enum AuthAction {
// `auth set --api-key <API_KEY>`: the key is a required option.
// NOTE(review): a secret passed on the command line ends up in shell history
// and is visible in process listings; consider also accepting it from stdin
// or an environment variable.
Set {
#[arg(long)]
api_key: String,
},
// `auth import-env`: takes no arguments. Presumably reads the key from the
// environment — TODO confirm which variable the handler uses.
ImportEnv,
// `auth status`: takes no arguments.
Status,
}
// Actions of the `skill` subcommand: `skill install` and `skill status`.
// Neither takes arguments; what a "skill" is and where it is installed is
// decided by the handlers, not by this file.
// NOTE: plain `//` comments are used on purpose. clap's derive turns `///`
// doc comments into help text, which would change the `--help` output.
#[derive(Subcommand)]
pub enum SkillAction {
Install,
Status,
}
// Actions of the `config` subcommand.
// NOTE: plain `//` comments are used on purpose. clap's derive turns `///`
// doc comments into help text, which would change the `--help` output.
#[derive(Subcommand)]
pub enum ConfigAction {
// `config show`, `config path`, `config init`: take no arguments.
Show,
Path,
Init,
// `config set <KEY> <VALUE>`: two required positionals, in that order. The
// set of valid keys is not defined in this file.
Set {
key: String,
value: String,
},
// `config get <KEY>`: one required positional.
Get {
key: String,
},
}