1use clap::{Arg, Command};
2use std::io::{self, IsTerminal, Read};
3use std::path::PathBuf;
4
5use crate::audio::{create_temp_audio_file, play_audio_and_cleanup};
6use crate::audio_merge::{check_ffmpeg_available, merge_audio_files};
7use crate::config::{get_presets_map, list_presets, load_config, Config};
8use crate::text_splitter::{check_text_length, split_text, MAX_CHARS};
9use crate::voicepeak::{list_emotion, list_narrator, VoicepeakCommand};
10
11pub fn build_cli() -> Command {
12 Command::new("voicepeak-cli")
13 .version("0.4.2")
14 .about("VOICEPEAK CLI wrapper with presets and auto-play")
15 .arg(
16 Arg::new("text")
17 .value_name("TEXT")
18 .help("Text to say (or pipe from stdin)")
19 .index(1),
20 )
21 .arg(
22 Arg::new("file")
23 .short('t')
24 .long("text")
25 .value_name("FILE")
26 .help("Text file to say")
27 .conflicts_with("text"),
28 )
29 .arg(
30 Arg::new("out")
31 .short('o')
32 .long("out")
33 .value_name("FILE")
34 .help("Path of output file (optional - will play with mpv if not specified)"),
35 )
36 .arg(
37 Arg::new("narrator")
38 .short('n')
39 .long("narrator")
40 .value_name("NAME")
41 .help("Name of voice"),
42 )
43 .arg(
44 Arg::new("emotion")
45 .short('e')
46 .long("emotion")
47 .value_name("EXPR")
48 .help("Emotion expression (e.g., happy=50,sad=50)"),
49 )
50 .arg(
51 Arg::new("preset")
52 .short('p')
53 .long("preset")
54 .value_name("NAME")
55 .help("Use voice preset (karin-normal, karin-happy, karin-angry, karin-sad, karin-whisper)")
56 .conflicts_with_all(["narrator", "emotion"]),
57 )
58 .arg(
59 Arg::new("list-narrator")
60 .long("list-narrator")
61 .help("Print voice list")
62 .action(clap::ArgAction::SetTrue),
63 )
64 .arg(
65 Arg::new("list-emotion")
66 .long("list-emotion")
67 .value_name("NARRATOR")
68 .help("Print emotion list for given voice"),
69 )
70 .arg(
71 Arg::new("list-presets")
72 .long("list-presets")
73 .help("Print available presets")
74 .action(clap::ArgAction::SetTrue),
75 )
76 .arg(
77 Arg::new("speed")
78 .long("speed")
79 .value_name("VALUE")
80 .help("Speed (50 - 200)"),
81 )
82 .arg(
83 Arg::new("pitch")
84 .long("pitch")
85 .value_name("VALUE")
86 .help("Pitch (-300 - 300)"),
87 )
88 .arg(
89 Arg::new("strict-length")
90 .long("strict-length")
91 .help("Reject input longer than 140 characters (default: false, allows splitting)")
92 .action(clap::ArgAction::SetTrue),
93 )
94 .arg(
95 Arg::new("playback-mode")
96 .long("playback-mode")
97 .value_name("MODE")
98 .help("Playback mode: sequential or batch (default: batch)")
99 .value_parser(["sequential", "batch"])
100 .default_value("batch"),
101 )
102 .arg(
103 Arg::new("verbose")
104 .long("verbose")
105 .short('v')
106 .help("Enable verbose output (show VOICEPEAK debug messages)")
107 .action(clap::ArgAction::SetTrue),
108 )
109}
110
111pub fn handle_matches(matches: clap::ArgMatches) -> Result<(), Box<dyn std::error::Error>> {
112 let config = load_config()?;
113
114 if matches.get_flag("list-narrator") {
115 list_narrator();
116 return Ok(());
117 }
118
119 if let Some(narrator) = matches.get_one::<String>("list-emotion") {
120 list_emotion(narrator);
121 return Ok(());
122 }
123
124 if matches.get_flag("list-presets") {
125 list_presets(&config);
126 return Ok(());
127 }
128
129 run_voicepeak(&matches, &config)
130}
131
132fn run_voicepeak(
133 matches: &clap::ArgMatches,
134 config: &Config,
135) -> Result<(), Box<dyn std::error::Error>> {
136 let input_text = if let Some(text) = matches.get_one::<String>("text") {
137 text.clone()
138 } else if let Some(file_path) = matches.get_one::<String>("file") {
139 std::fs::read_to_string(file_path)?
140 } else if !io::stdin().is_terminal() {
141 let mut buffer = String::new();
143 io::stdin().read_to_string(&mut buffer)?;
144 buffer.trim().to_string()
145 } else {
146 return Err("Either text argument, --text file, or pipe input must be specified".into());
147 };
148
149 let presets_map = get_presets_map(config);
150
151 let (narrator, emotion, preset_pitch) =
152 if let Some(preset_name) = matches.get_one::<String>("preset") {
153 let preset = presets_map
155 .get(preset_name)
156 .ok_or_else(|| format!("Unknown preset: {}", preset_name))?;
157 (
158 preset.narrator.clone(),
159 preset.get_emotion_string(),
160 preset.pitch,
161 )
162 } else if let Some(default_preset_name) = &config.default_preset {
163 if let Some(default_preset) = presets_map.get(default_preset_name) {
165 let narrator = matches
167 .get_one::<String>("narrator")
168 .cloned()
169 .unwrap_or_else(|| default_preset.narrator.clone());
170 let emotion = matches
171 .get_one::<String>("emotion")
172 .cloned()
173 .unwrap_or_else(|| default_preset.get_emotion_string());
174 let preset_pitch = if matches.get_one::<String>("emotion").is_some() {
175 None } else {
177 default_preset.pitch
178 };
179 (narrator, emotion, preset_pitch)
180 } else {
181 let narrator = matches
183 .get_one::<String>("narrator")
184 .cloned()
185 .ok_or("No narrator specified. Use --narrator option or configure a preset.")?;
186 let emotion = matches
187 .get_one::<String>("emotion")
188 .cloned()
189 .unwrap_or_default();
190 (narrator, emotion, None)
191 }
192 } else {
193 let narrator = matches
195 .get_one::<String>("narrator")
196 .cloned()
197 .ok_or("No narrator specified. Use --narrator option or configure a preset.")?;
198 let emotion = matches
199 .get_one::<String>("emotion")
200 .cloned()
201 .unwrap_or_default();
202 (narrator, emotion, None)
203 };
204
205 let speed = matches.get_one::<String>("speed");
206 let pitch = matches
207 .get_one::<String>("pitch")
208 .cloned()
209 .or_else(|| preset_pitch.map(|p| p.to_string()));
210 let should_play = matches.get_one::<String>("out").is_none();
211 let output_path = matches.get_one::<String>("out").map(PathBuf::from);
212 let strict_length = matches.get_flag("strict-length");
213 let playback_mode = matches.get_one::<String>("playback-mode").unwrap();
214 let verbose = matches.get_flag("verbose");
215
216 if strict_length && !check_text_length(&input_text) {
217 return Err(format!(
218 "Input text is too long ({} characters). Maximum allowed is {} characters.\nUse without --strict-length to enable automatic splitting.",
219 input_text.chars().count(),
220 MAX_CHARS
221 ).into());
222 }
223
224 let text_chunks = split_text(&input_text);
225
226 if text_chunks.len() > 1 {
227 println!(
228 "Text is too long, splitting into {} parts...",
229 text_chunks.len()
230 );
231 }
232
233 if (playback_mode == "batch" || (!should_play && text_chunks.len() > 1))
235 && !check_ffmpeg_available()
236 {
237 return Err(
238 "ffmpeg is required for batch mode and multi-chunk file output.\n\
239 Please install ffmpeg or use --playback-mode sequential for auto-play mode.\n\
240 Install ffmpeg: https://ffmpeg.org/download.html"
241 .into(),
242 );
243 }
244
245 if should_play {
246 if playback_mode == "sequential" {
248 for (i, chunk) in text_chunks.iter().enumerate() {
250 if text_chunks.len() > 1 {
251 println!("Playing part {}/{}", i + 1, text_chunks.len());
252 }
253
254 let temp_path = create_temp_audio_file()?;
255
256 let mut cmd = VoicepeakCommand::new()
257 .text(chunk)
258 .narrator(&narrator)
259 .emotion(&emotion)
260 .output(&temp_path);
261
262 if let Some(speed) = speed {
263 cmd = cmd.speed(speed);
264 }
265 if let Some(pitch) = &pitch {
266 cmd = cmd.pitch(pitch);
267 }
268
269 cmd.execute_with_verbose(verbose)?;
270 play_audio_and_cleanup(&temp_path)?;
271 }
272 } else {
273 let mut temp_files = Vec::new();
275
276 for (i, chunk) in text_chunks.iter().enumerate() {
277 if text_chunks.len() > 1 {
278 println!("Generating part {}/{}", i + 1, text_chunks.len());
279 }
280
281 let temp_path = create_temp_audio_file()?;
282
283 let mut cmd = VoicepeakCommand::new()
284 .text(chunk)
285 .narrator(&narrator)
286 .emotion(&emotion)
287 .output(&temp_path);
288
289 if let Some(speed) = speed {
290 cmd = cmd.speed(speed);
291 }
292 if let Some(pitch) = &pitch {
293 cmd = cmd.pitch(pitch);
294 }
295
296 cmd.execute_with_verbose(verbose)?;
297 temp_files.push(temp_path);
298 }
299
300 let final_temp = create_temp_audio_file()?;
302 let temp_paths: Vec<&std::path::Path> =
303 temp_files.iter().map(|p| p.as_path()).collect();
304
305 if text_chunks.len() > 1 {
306 println!("Merging audio files...");
307 merge_audio_files(&temp_paths, &final_temp)?;
308 println!("Merge complete. Playing audio...");
309 } else {
310 merge_audio_files(&temp_paths, &final_temp)?;
311 }
312
313 for temp_file in temp_files {
315 let _ = std::fs::remove_file(temp_file);
316 }
317
318 play_audio_and_cleanup(&final_temp)?;
319 }
320 } else if let Some(output_path) = output_path {
321 let mut temp_files = Vec::new();
323
324 for (i, chunk) in text_chunks.iter().enumerate() {
325 if text_chunks.len() > 1 {
326 println!("Generating part {}/{}", i + 1, text_chunks.len());
327 }
328
329 let temp_path = create_temp_audio_file()?;
330
331 let mut cmd = VoicepeakCommand::new()
332 .text(chunk)
333 .narrator(&narrator)
334 .emotion(&emotion)
335 .output(&temp_path);
336
337 if let Some(speed) = speed {
338 cmd = cmd.speed(speed);
339 }
340 if let Some(pitch) = &pitch {
341 cmd = cmd.pitch(pitch);
342 }
343
344 cmd.execute_with_verbose(verbose)?;
345 temp_files.push(temp_path);
346 }
347
348 let temp_paths: Vec<&std::path::Path> = temp_files.iter().map(|p| p.as_path()).collect();
350
351 if text_chunks.len() > 1 {
352 println!("Merging audio files...");
353 merge_audio_files(&temp_paths, &output_path)?;
354 println!("Merge complete.");
355 } else {
356 merge_audio_files(&temp_paths, &output_path)?;
357 }
358
359 for temp_file in temp_files {
361 let _ = std::fs::remove_file(temp_file);
362 }
363
364 println!("Audio saved to: {}", output_path.display());
365 }
366
367 Ok(())
368}