use std::sync::Arc;
use std::time::Duration;
use anyhow::{Context, Result};
use tokio::io::AsyncWriteExt;
use tokio::net::UnixListener;
use tokio::sync::Mutex;
use tracing::{debug, error, info, warn};
use whisrs::audio::capture::AudioCaptureHandle;
use whisrs::audio::feedback;
use whisrs::audio::silence::AutoStopDetector;
use whisrs::history::{self, HistoryEntry};
use whisrs::input::clipboard::ClipboardOps;
use whisrs::input::ClipboardHandler;
use whisrs::llm;
use whisrs::post_processing::filler::remove_filler_words;
use whisrs::state::{Action, StateMachine};
use whisrs::transcription::groq::GroqBackend;
use whisrs::transcription::local_parakeet::ParakeetBackend;
use whisrs::transcription::local_vosk::VoskBackend;
use whisrs::transcription::local_whisper::LocalWhisperBackend;
use whisrs::transcription::openai_realtime::OpenAIRealtimeBackend;
use whisrs::transcription::openai_rest::OpenAIRestBackend;
use whisrs::transcription::{TranscriptionBackend, TranscriptionConfig};
use whisrs::window::{self, WindowTracker};
use whisrs::{encode_message, read_message, socket_path, Command, Config, Response, State};
struct CommandModeContext {
selected_text: String,
saved_clipboard: String,
llm_config: llm::LlmConfig,
is_terminal: bool,
}
struct DaemonState {
state_machine: StateMachine,
audio_capture: Option<AudioCaptureHandle>,
recording_window_id: Option<String>,
streaming_task: Option<tokio::task::JoinHandle<Result<String>>>,
recording_started_at: Option<std::time::Instant>,
command_mode: Option<CommandModeContext>,
}
impl DaemonState {
fn new() -> Self {
Self {
state_machine: StateMachine::new(),
audio_capture: None,
recording_window_id: None,
streaming_task: None,
recording_started_at: None,
command_mode: None,
}
}
}
struct DaemonContext {
config: Config,
window_tracker: Arc<dyn WindowTracker>,
transcription_backend: Arc<dyn TranscriptionBackend>,
notify: bool,
state_tx: tokio::sync::watch::Sender<State>,
}
async fn socket_is_alive(path: &std::path::Path) -> bool {
tokio::net::UnixStream::connect(path).await.is_ok()
}
async fn cleanup_stale_socket(path: &std::path::Path) -> Result<()> {
if path.exists() {
if socket_is_alive(path).await {
anyhow::bail!("another whisrsd instance is already running");
}
warn!("removing stale socket at {}", path.display());
std::fs::remove_file(path).context("failed to remove stale socket")?;
}
Ok(())
}
fn load_config() -> (Config, Option<String>) {
let config_path = whisrs::config_path();
if config_path.exists() {
match std::fs::read_to_string(&config_path) {
Ok(contents) => match toml::from_str::<Config>(&contents) {
Ok(config) => {
info!("loaded config from {}", config_path.display());
return (config, None);
}
Err(e) => {
let msg = format!(
"Failed to parse config at {}: {e} — using defaults",
config_path.display()
);
error!("{msg}");
return (
Config {
general: Default::default(),
audio: Default::default(),
groq: None,
openai: None,
local_whisper: None,
local_vosk: None,
local_parakeet: None,
llm: None,
hotkeys: None,
},
Some(msg),
);
}
},
Err(e) => {
let msg = format!(
"Failed to read config at {}: {e} — using defaults",
config_path.display()
);
error!("{msg}");
return (
Config {
general: Default::default(),
audio: Default::default(),
groq: None,
openai: None,
local_whisper: None,
local_vosk: None,
local_parakeet: None,
llm: None,
hotkeys: None,
},
Some(msg),
);
}
}
} else {
info!(
"no config file found at {}; using defaults",
config_path.display()
);
}
(
Config {
general: Default::default(),
audio: Default::default(),
groq: None,
openai: None,
local_whisper: None,
local_vosk: None,
local_parakeet: None,
llm: None,
hotkeys: None,
},
None,
)
}
const COMPOSITOR_ENV_MAX_RETRIES: u32 = 10;
const COMPOSITOR_ENV_INITIAL_DELAY: Duration = Duration::from_secs(1);
const COMPOSITOR_ENV_VARS: &[&str] = &[
"WAYLAND_DISPLAY",
"DISPLAY",
"HYPRLAND_INSTANCE_SIGNATURE",
"SWAYSOCK",
"XDG_CURRENT_DESKTOP",
];
async fn import_compositor_env() {
if std::env::var("WAYLAND_DISPLAY").is_ok() || std::env::var("DISPLAY").is_ok() {
debug!("compositor environment already available");
return;
}
info!("compositor env vars not set — polling systemd user environment");
let mut delay = COMPOSITOR_ENV_INITIAL_DELAY;
for attempt in 1..=COMPOSITOR_ENV_MAX_RETRIES {
if let Some(imported) = try_import_from_systemd() {
info!("imported compositor environment from systemd (attempt {attempt}): {imported}");
return;
}
if attempt == COMPOSITOR_ENV_MAX_RETRIES {
warn!(
"compositor environment not available after {COMPOSITOR_ENV_MAX_RETRIES} attempts \
— clipboard and window tracking may not work"
);
return;
}
info!(
"compositor env not available (attempt {attempt}/{COMPOSITOR_ENV_MAX_RETRIES}) \
— retrying in {delay:?}"
);
tokio::time::sleep(delay).await;
delay = (delay * 2).min(Duration::from_secs(10));
}
}
fn try_import_from_systemd() -> Option<String> {
let output = std::process::Command::new("systemctl")
.args(["--user", "show-environment"])
.output()
.ok()?;
if !output.status.success() {
return None;
}
let stdout = String::from_utf8_lossy(&output.stdout);
let mut imported = Vec::new();
for line in stdout.lines() {
if let Some((key, value)) = line.split_once('=') {
if COMPOSITOR_ENV_VARS.contains(&key) && std::env::var(key).is_err() {
std::env::set_var(key, value);
imported.push(key.to_string());
}
}
}
if std::env::var("WAYLAND_DISPLAY").is_ok() || std::env::var("DISPLAY").is_ok() {
Some(imported.join(", "))
} else {
None
}
}
fn check_uinput_access() {
use std::fs::OpenOptions;
match OpenOptions::new().write(true).open("/dev/uinput") {
Ok(_) => info!("uinput access: ok"),
Err(e) => {
if e.kind() == std::io::ErrorKind::PermissionDenied {
warn!(
"Cannot open /dev/uinput — permission denied.\n\
Fix: sudo usermod -aG input $USER\n\
# Then log out and log back in\n\
Or install the udev rule:\n\
sudo cp contrib/99-whisrs.rules /etc/udev/rules.d/\n\
sudo udevadm control --reload-rules\n\
sudo udevadm trigger"
);
} else {
warn!("Cannot open /dev/uinput: {e}");
}
}
}
}
fn check_audio_devices() {
use cpal::traits::{DeviceTrait, HostTrait};
let host = cpal::default_host();
match host.default_input_device() {
Some(device) => {
let name = device.name().unwrap_or_else(|_| "unknown".into());
info!("default audio input device: {name}");
}
None => {
warn!("no default audio input device found");
if let Ok(devices) = host.input_devices() {
let names: Vec<String> = devices.filter_map(|d| d.name().ok()).collect();
if names.is_empty() {
warn!("no audio input devices available at all");
} else {
warn!("available audio input devices: {}", names.join(", "));
}
}
}
}
}
fn validate_config(config: &Config) {
match config.validate() {
Ok(warnings) => {
for w in &warnings {
warn!("config: {}", w);
}
}
Err(e) => error!("config: {e}"),
}
if !config.has_any_backend_configured() {
warn!("No transcription backend configured. Run 'whisrs setup' to get started.");
}
}
fn resolve_groq_api_key(config: &Config) -> Option<String> {
if let Ok(key) = std::env::var("WHISRS_GROQ_API_KEY") {
if !key.is_empty() {
return Some(key);
}
}
config.groq.as_ref().map(|g| g.api_key.clone())
}
fn resolve_openai_api_key(config: &Config) -> Option<String> {
if let Ok(key) = std::env::var("WHISRS_OPENAI_API_KEY") {
if !key.is_empty() {
return Some(key);
}
}
config.openai.as_ref().map(|o| o.api_key.clone())
}
fn create_backend(config: &Config) -> Arc<dyn TranscriptionBackend> {
match config.general.backend.as_str() {
"groq" => {
let api_key = resolve_groq_api_key(config).unwrap_or_default();
if api_key.is_empty() {
warn!("no Groq API key configured");
}
info!("using Groq transcription backend");
Arc::new(GroqBackend::new(api_key))
}
"openai-realtime" => {
let api_key = resolve_openai_api_key(config).unwrap_or_default();
if api_key.is_empty() {
warn!("no OpenAI API key configured");
}
info!("using OpenAI Realtime transcription backend");
Arc::new(OpenAIRealtimeBackend::new(api_key))
}
"openai" => {
let api_key = resolve_openai_api_key(config).unwrap_or_default();
if api_key.is_empty() {
warn!("no OpenAI API key configured");
}
info!("using OpenAI REST transcription backend");
Arc::new(OpenAIRestBackend::new(api_key))
}
"local-whisper" | "local" => {
let model_path = config
.local_whisper
.as_ref()
.map(|l| l.model_path.clone())
.unwrap_or_else(|| {
dirs::data_dir()
.unwrap_or_else(|| std::path::PathBuf::from("~/.local/share"))
.join("whisrs/models/ggml-base.en.bin")
.to_string_lossy()
.to_string()
});
info!("using local whisper transcription backend (model: {model_path})");
Arc::new(LocalWhisperBackend::new(model_path))
}
"local-vosk" => {
let model_path = config
.local_vosk
.as_ref()
.map(|l| l.model_path.clone())
.unwrap_or_else(|| {
dirs::data_dir()
.unwrap_or_else(|| std::path::PathBuf::from("~/.local/share"))
.join("whisrs/models/vosk-model-small-en-us-0.15")
.to_string_lossy()
.to_string()
});
info!("using Vosk transcription backend (model: {model_path})");
Arc::new(VoskBackend::new(model_path))
}
"local-parakeet" => {
let model_path = config
.local_parakeet
.as_ref()
.map(|l| l.model_path.clone())
.unwrap_or_else(|| {
dirs::data_dir()
.unwrap_or_else(|| std::path::PathBuf::from("~/.local/share"))
.join("whisrs/models/parakeet-eou-120m")
.to_string_lossy()
.to_string()
});
info!("using Parakeet transcription backend (model: {model_path})");
Arc::new(ParakeetBackend::new(model_path))
}
other => {
warn!("unknown backend '{other}', falling back to groq");
let api_key = resolve_groq_api_key(config).unwrap_or_default();
Arc::new(GroqBackend::new(api_key))
}
}
}
fn get_model_for_backend(config: &Config) -> String {
match config.general.backend.as_str() {
"groq" => config
.groq
.as_ref()
.map(|g| g.model.clone())
.unwrap_or_else(|| "whisper-large-v3-turbo".to_string()),
"openai-realtime" | "openai" => config
.openai
.as_ref()
.map(|o| o.model.clone())
.unwrap_or_else(|| "gpt-4o-mini-transcribe".to_string()),
"local-whisper" | "local" => "base.en".to_string(),
"local-vosk" => "small-en-us".to_string(),
"local-parakeet" => "eou-120m".to_string(),
_ => "whisper-large-v3-turbo".to_string(),
}
}
fn truncate_preview(text: &str, max_bytes: usize) -> String {
if text.len() <= max_bytes {
return text.to_string();
}
let end = text
.char_indices()
.map(|(i, _)| i)
.take_while(|&i| i <= max_bytes)
.last()
.unwrap_or(0);
format!("{}...", &text[..end])
}
fn send_notification(summary: &str, body: &str) {
let summary = summary.to_string();
let body = body.to_string();
std::thread::spawn(move || {
if let Err(e) = notify_rust::Notification::new()
.summary(&summary)
.body(&body)
.appname("whisrs")
.timeout(notify_rust::Timeout::Milliseconds(2000))
.show()
{
warn!("failed to send notification: {e}");
}
});
}
#[tokio::main]
async fn main() -> Result<()> {
tracing_subscriber::fmt()
.with_env_filter(
tracing_subscriber::EnvFilter::try_from_default_env()
.unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info")),
)
.init();
info!("whisrsd v{} starting", env!("CARGO_PKG_VERSION"));
check_uinput_access();
check_audio_devices();
let (config, config_warning) = load_config();
validate_config(&config);
let notify = config.general.notify;
if let Some(msg) = config_warning {
if notify {
send_notification("whisrs", &msg);
}
}
let backend = create_backend(&config);
import_compositor_env().await;
let window_tracker: Arc<dyn WindowTracker> = Arc::from(window::detect_tracker());
info!(
"window tracker: {}",
std::any::type_name_of_val(&*window_tracker)
);
let (state_tx, state_rx) = tokio::sync::watch::channel(State::Idle);
let tray_enabled = config.general.tray;
let context = Arc::new(DaemonContext {
config,
window_tracker,
transcription_backend: backend,
notify,
state_tx,
});
if tray_enabled {
tokio::spawn(whisrs::tray::spawn_tray(state_rx));
}
let sock_path = socket_path();
info!("socket path: {}", sock_path.display());
if let Some(parent) = sock_path.parent() {
if !parent.exists() {
std::fs::create_dir_all(parent)
.with_context(|| format!("failed to create directory {}", parent.display()))?;
}
}
cleanup_stale_socket(&sock_path).await?;
let listener = UnixListener::bind(&sock_path).context("failed to bind Unix socket")?;
info!("listening on {}", sock_path.display());
let daemon_state = Arc::new(Mutex::new(DaemonState::new()));
let sock_path_clone = sock_path.clone();
tokio::spawn(async move {
tokio::signal::ctrl_c().await.ok();
info!("received SIGINT, shutting down");
let _ = std::fs::remove_file(&sock_path_clone);
std::process::exit(0);
});
if let Some(ref hk_config) = context.config.hotkeys {
let hk_config = hk_config.clone();
let hk_state = Arc::clone(&daemon_state);
let hk_ctx = Arc::clone(&context);
tokio::spawn(async move {
let (hotkey_tx, mut hotkey_rx) = tokio::sync::mpsc::channel::<Command>(16);
whisrs::hotkey::start_hotkey_listener(&hk_config, hotkey_tx).await;
while let Some(cmd) = hotkey_rx.recv().await {
info!("hotkey command: {cmd:?}");
let _response =
handle_command(cmd, Arc::clone(&hk_state), Arc::clone(&hk_ctx)).await;
let current = hk_state.lock().await.state_machine.state();
let _ = hk_ctx.state_tx.send(current);
}
});
}
loop {
let (stream, _addr) = listener.accept().await?;
let state = Arc::clone(&daemon_state);
let ctx = Arc::clone(&context);
tokio::spawn(async move {
if let Err(e) = handle_connection(stream, state, ctx).await {
error!("connection error: {e:#}");
}
});
}
}
async fn handle_connection(
stream: tokio::net::UnixStream,
daemon_state: Arc<Mutex<DaemonState>>,
context: Arc<DaemonContext>,
) -> Result<()> {
let (mut reader, mut writer) = stream.into_split();
let cmd: Command = read_message(&mut reader).await?;
info!("received command: {cmd:?}");
let response = handle_command(cmd, Arc::clone(&daemon_state), Arc::clone(&context)).await;
let current = daemon_state.lock().await.state_machine.state();
let _ = context.state_tx.send(current);
let encoded = encode_message(&response)?;
writer.write_all(&encoded).await?;
writer.shutdown().await?;
Ok(())
}
async fn handle_command(
cmd: Command,
daemon_state: Arc<Mutex<DaemonState>>,
context: Arc<DaemonContext>,
) -> Response {
match cmd {
Command::Toggle => handle_toggle(daemon_state, context).await,
Command::Cancel => handle_cancel(daemon_state, context).await,
Command::Status => {
let ds = daemon_state.lock().await;
Response::Ok {
state: ds.state_machine.state(),
}
}
Command::Log { limit } => match history::read_entries(limit) {
Ok(entries) => Response::History { entries },
Err(e) => Response::Error {
message: format!("failed to read history: {e}"),
},
},
Command::ClearHistory => match history::clear_history() {
Ok(()) => {
info!("transcription history cleared");
Response::Ok {
state: daemon_state.lock().await.state_machine.state(),
}
}
Err(e) => Response::Error {
message: format!("failed to clear history: {e}"),
},
},
Command::CommandMode => handle_command_mode(daemon_state, context).await,
}
}
async fn handle_toggle(
daemon_state: Arc<Mutex<DaemonState>>,
context: Arc<DaemonContext>,
) -> Response {
let mut ds = daemon_state.lock().await;
let current_state = ds.state_machine.state();
match current_state {
State::Idle => {
let window_id = match context.window_tracker.get_focused_window() {
Ok(id) => {
info!("captured source window: {id}");
Some(id)
}
Err(e) => {
warn!("failed to capture focused window: {e}");
None
}
};
let mut capture = match AudioCaptureHandle::start() {
Ok(c) => c,
Err(e) => {
let msg = format!("{e}");
let friendly = if msg.contains("no default audio input device") {
format_no_microphone_error()
} else {
format!("Failed to start audio capture: {e}")
};
error!("{friendly}");
return Response::Error { message: friendly };
}
};
if context.transcription_backend.supports_streaming() {
let audio_rx = match capture.take_receiver() {
Some(rx) => rx,
None => {
return Response::Error {
message: "failed to get audio receiver".to_string(),
}
}
};
let config = TranscriptionConfig {
language: context.config.general.language.clone(),
model: get_model_for_backend(&context.config),
prompt: vocabulary_prompt(&context.config.general.vocabulary),
};
let backend = Arc::clone(&context.transcription_backend);
let wid = window_id.clone();
let ctx_notify = context.notify;
let window_tracker_for_pipeline = Arc::clone(&context.window_tracker);
let wid_for_focus = wid.clone();
let silence_timeout = context.config.general.silence_timeout_ms;
let ds_ref = Arc::clone(&daemon_state);
let filler_enabled = context.config.general.remove_filler_words;
let filler_words = context.config.general.filler_words.clone();
let pipeline_audio_feedback = context.config.general.audio_feedback;
let pipeline_audio_volume = context.config.general.audio_feedback_volume;
let pipeline_backend_name = context.config.general.backend.clone();
let pipeline_language = context.config.general.language.clone();
let pipeline_state_tx = context.state_tx.clone();
let task = tokio::spawn(async move {
run_streaming_pipeline(
audio_rx,
backend,
config,
wid,
ctx_notify,
silence_timeout,
ds_ref,
window_tracker_for_pipeline,
filler_enabled,
filler_words,
pipeline_audio_feedback,
pipeline_audio_volume,
pipeline_backend_name,
pipeline_language,
pipeline_state_tx,
)
.await
});
ds.streaming_task = Some(task);
if let Some(wid) = &wid_for_focus {
if let Err(e) = context.window_tracker.focus_window(wid) {
warn!("failed to pre-focus window: {e}");
}
}
}
ds.audio_capture = Some(capture);
ds.recording_window_id = window_id;
ds.recording_started_at = Some(std::time::Instant::now());
match ds.state_machine.transition(Action::Toggle) {
Ok(new_state) => {
info!("started recording");
let _ = context.state_tx.send(new_state);
if context.config.general.audio_feedback {
feedback::play_start(context.config.general.audio_feedback_volume);
}
if context.notify {
send_notification("whisrs", "Recording...");
}
Response::Ok { state: new_state }
}
Err(e) => {
ds.audio_capture = None;
ds.recording_window_id = None;
ds.streaming_task = None;
Response::Error {
message: e.to_string(),
}
}
}
}
State::Recording => {
match ds.state_machine.transition(Action::Toggle) {
Ok(_) => {
info!("stopped recording, transitioning to transcribing");
let _ = context.state_tx.send(State::Transcribing);
if context.config.general.audio_feedback {
feedback::play_stop(context.config.general.audio_feedback_volume);
}
if context.notify {
send_notification("whisrs", "Transcribing...");
}
let capture = ds.audio_capture.take();
let window_id = ds.recording_window_id.take();
let streaming_task = ds.streaming_task.take();
let recording_started_at = ds.recording_started_at.take();
drop(ds);
let result = if let Some(task) = streaming_task {
if let Some(mut cap) = capture {
cap.stop();
tokio::task::spawn_blocking(move || drop(cap));
}
match task.await {
Ok(Ok(text)) => Ok(text),
Ok(Err(e)) => Err(e),
Err(e) => Err(anyhow::anyhow!("streaming task panicked: {e}")),
}
} else {
process_recording_batch(capture, window_id.as_deref(), &context).await
};
let duration_secs = recording_started_at
.map(|t| t.elapsed().as_secs_f64())
.unwrap_or(0.0);
let mut ds = daemon_state.lock().await;
match ds.state_machine.transition(Action::TranscriptionDone) {
Ok(new_state) => {
let _ = context.state_tx.send(new_state);
match result {
Ok(text) => {
info!("transcription complete: {} chars", text.len());
if !text.is_empty() {
save_history_entry(
&text,
&context.config.general.backend,
&context.config.general.language,
duration_secs,
);
}
if context.config.general.audio_feedback {
feedback::play_done(
context.config.general.audio_feedback_volume,
);
}
if context.notify {
let preview = truncate_preview(&text, 77);
send_notification("whisrs", &format!("Done: {preview}"));
}
Response::Ok { state: new_state }
}
Err(e) => {
error!("transcription failed: {e:#}");
if context.notify {
send_notification(
"whisrs",
&format!("Transcription failed: {e}"),
);
}
Response::Ok { state: new_state }
}
}
}
Err(e) => Response::Error {
message: e.to_string(),
},
}
}
Err(e) => Response::Error {
message: e.to_string(),
},
}
}
State::Transcribing => Response::Error {
message: "cannot toggle while transcribing".to_string(),
},
}
}
#[allow(clippy::too_many_arguments)]
async fn run_streaming_pipeline(
mut audio_rx: tokio::sync::mpsc::UnboundedReceiver<Vec<i16>>,
backend: Arc<dyn TranscriptionBackend>,
config: TranscriptionConfig,
window_id: Option<String>,
notify: bool,
silence_timeout_ms: u64,
daemon_state: Arc<Mutex<DaemonState>>,
window_tracker: Arc<dyn WindowTracker>,
filler_enabled: bool,
filler_words: Vec<String>,
audio_feedback: bool,
audio_feedback_volume: f32,
backend_name: String,
language: String,
state_tx: tokio::sync::watch::Sender<State>,
) -> Result<String> {
let pipeline_start = std::time::Instant::now();
let (audio_tx, backend_rx) = tokio::sync::mpsc::channel::<Vec<i16>>(256);
let (text_tx, mut text_rx) = tokio::sync::mpsc::channel::<String>(64);
let config_clone = config.clone();
let backend_task = tokio::spawn(async move {
backend
.transcribe_stream(backend_rx, text_tx, &config_clone)
.await
});
let wid = window_id.clone();
let typing_task = tokio::spawn(async move {
let mut full_text = String::new();
let mut focused = false;
let batch_delay = std::time::Duration::from_millis(150);
loop {
let first = text_rx.recv().await;
let Some(first) = first else { break };
let mut batch = first;
while let Ok(Some(more)) = tokio::time::timeout(batch_delay, text_rx.recv()).await {
batch.push_str(&more);
}
if batch.is_empty() {
continue;
}
if filler_enabled {
batch = remove_filler_words(&batch, &filler_words);
if batch.is_empty() {
continue;
}
}
if !focused {
if let Some(wid) = &wid {
let wid_clone = wid.clone();
let tracker = Arc::clone(&window_tracker);
if let Err(e) = tracker.focus_window(&wid_clone) {
warn!("failed to refocus window {wid_clone} before typing: {e}");
} else {
tokio::time::sleep(std::time::Duration::from_millis(50)).await;
}
}
focused = true;
}
let text_to_type = if full_text.is_empty() {
batch.clone()
} else if !batch.starts_with(' ') && !full_text.ends_with(' ') {
format!(" {batch}")
} else {
batch.clone()
};
full_text.push_str(&text_to_type);
info!("typing: {:?}", text_to_type);
if let Err(e) =
tokio::task::spawn_blocking(move || type_text_at_cursor(&text_to_type)).await
{
warn!("failed to type text: {e}");
}
}
full_text
});
let mut auto_stop = AutoStopDetector::new(0.003, silence_timeout_ms, 16_000);
while let Some(chunk) = audio_rx.recv().await {
if auto_stop.feed(&chunk) {
info!("silence auto-stop triggered after {silence_timeout_ms}ms");
if notify {
send_notification("whisrs", "Auto-stopped (silence detected)");
}
let mut ds = daemon_state.lock().await;
if ds.state_machine.state() == State::Recording {
if let Some(mut capture) = ds.audio_capture.take() {
capture.stop();
tokio::task::spawn_blocking(move || drop(capture));
}
if let Err(e) = ds.state_machine.transition(Action::Toggle) {
warn!("auto-stop state transition failed: {e}");
}
}
break;
}
if audio_tx.send(chunk).await.is_err() {
break;
}
}
while let Some(chunk) = audio_rx.recv().await {
if audio_tx.send(chunk).await.is_err() {
break;
}
}
drop(audio_tx);
let mut stream_error: Option<String> = None;
match backend_task.await {
Ok(Ok(())) => {}
Ok(Err(e)) => {
let friendly = format_api_error(&e);
error!("streaming transcription error: {friendly}");
stream_error = Some(friendly);
}
Err(e) => {
error!("streaming backend task panicked: {e}");
stream_error = Some(format!("transcription task panicked: {e}"));
}
}
let full_text = typing_task.await.unwrap_or_default();
if let Some(err_msg) = &stream_error {
if notify {
if full_text.is_empty() {
send_notification("whisrs", &format!("Transcription error: {err_msg}"));
} else {
send_notification(
"whisrs",
&format!("Transcription failed — partial text may have been typed.\n{err_msg}"),
);
}
}
}
if !full_text.is_empty() {
let duration_secs = pipeline_start.elapsed().as_secs_f64();
save_history_entry(&full_text, &backend_name, &language, duration_secs);
}
let mut ds = daemon_state.lock().await;
if ds.state_machine.state() == State::Transcribing {
ds.state_machine.transition(Action::TranscriptionDone).ok();
let _ = state_tx.send(ds.state_machine.state());
if audio_feedback {
feedback::play_done(audio_feedback_volume);
}
if notify {
let preview = truncate_preview(&full_text, 77);
if !preview.is_empty() {
send_notification("whisrs", &format!("Done: {preview}"));
}
}
}
Ok(full_text)
}
async fn process_recording_batch(
capture: Option<AudioCaptureHandle>,
window_id: Option<&str>,
context: &DaemonContext,
) -> Result<String> {
use whisrs::audio::capture::encode_wav;
let samples = match capture {
Some(cap) => cap.stop_and_collect().await?,
None => anyhow::bail!("no audio capture to collect"),
};
if samples.is_empty() {
anyhow::bail!("no audio samples captured");
}
info!("collected {} audio samples", samples.len());
let wav_data = encode_wav(&samples)?;
info!("encoded WAV: {} bytes", wav_data.len());
let config = TranscriptionConfig {
language: context.config.general.language.clone(),
model: get_model_for_backend(&context.config),
prompt: vocabulary_prompt(&context.config.general.vocabulary),
};
let text = match context
.transcription_backend
.transcribe(&wav_data, &config)
.await
{
Ok(t) => t,
Err(e) => {
let friendly = format_api_error(&e);
error!("transcription failed: {friendly}");
use whisrs::audio::recovery;
match recovery::save_recovery_audio(&samples) {
Ok(path) => {
info!(
"audio saved for recovery: {} — retry with: whisrs transcribe-recovery",
path.display()
);
if context.notify {
send_notification(
"whisrs",
&format!(
"Transcription failed: {friendly}\nAudio saved to {}\nRetry with: whisrs transcribe-recovery",
path.display()
),
);
}
}
Err(re) => {
warn!("failed to save recovery audio: {re}");
}
}
return Err(e);
}
};
let text = if context.config.general.remove_filler_words {
let cleaned = remove_filler_words(&text, &context.config.general.filler_words);
if cleaned != text {
info!(
"filler removal: {} chars -> {} chars",
text.len(),
cleaned.len()
);
}
cleaned
} else {
text
};
if text.is_empty() {
info!("transcription returned empty text — nothing to type");
return Ok(text);
}
if let Some(wid) = window_id {
if let Err(e) = context.window_tracker.focus_window(wid) {
warn!("failed to restore window focus: {e}");
} else {
tokio::time::sleep(std::time::Duration::from_millis(100)).await;
}
}
let text_clone = text.clone();
if let Err(e) = tokio::task::spawn_blocking(move || type_text_at_cursor(&text_clone)).await? {
warn!("failed to type text: {e}");
}
Ok(text)
}
fn format_no_microphone_error() -> String {
use cpal::traits::{DeviceTrait, HostTrait};
let host = cpal::default_host();
let mut msg = "No microphone found — no default audio input device available.".to_string();
if let Ok(devices) = host.input_devices() {
let names: Vec<String> = devices.filter_map(|d| d.name().ok()).collect();
if names.is_empty() {
msg.push_str("\nNo audio input devices detected. Check that your microphone is connected and PipeWire/PulseAudio is running.");
} else {
msg.push_str("\nAvailable input devices:");
for name in &names {
msg.push_str(&format!("\n - {name}"));
}
msg.push_str(
"\nSet the device in ~/.config/whisrs/config.toml under [audio] device = \"...\"",
);
}
}
msg
}
fn format_api_error(err: &anyhow::Error) -> String {
let msg = format!("{err}");
if msg.contains("invalid API key") || msg.contains("401") {
return "Invalid API key — check your config at ~/.config/whisrs/config.toml".to_string();
}
if msg.contains("rate limit") || msg.contains("429") {
return "Rate limited — wait a moment and try again".to_string();
}
if msg.contains("error sending request")
|| msg.contains("dns error")
|| msg.contains("connection refused")
|| msg.contains("timed out")
|| msg.contains("ConnectError")
{
return "Cannot reach API — check your internet connection".to_string();
}
msg
}
fn type_text_at_cursor(text: &str) -> Result<()> {
use whisrs::input::clipboard::ClipboardOps;
use whisrs::input::keymap::XkbKeymap;
use whisrs::input::uinput::UinputKeyboard;
use whisrs::input::KeyInjector;
let keymap = XkbKeymap::from_default_layout().context("failed to build XKB keymap")?;
let clipboard = ClipboardOps::detect();
let mut keyboard = match UinputKeyboard::new(keymap, clipboard) {
Ok(kb) => kb,
Err(e) => {
let msg = format!("{e:#}");
if msg.contains("Permission denied") || msg.contains("permission") {
anyhow::bail!(
"Cannot open /dev/uinput — permission denied.\n\
Fix: sudo usermod -aG input $USER"
);
}
return Err(e.context("failed to create virtual keyboard"));
}
};
keyboard.type_text(text).context("failed to type text")?;
Ok(())
}
fn vocabulary_prompt(vocabulary: &[String]) -> Option<String> {
if vocabulary.is_empty() {
None
} else {
Some(vocabulary.join(", "))
}
}
fn save_history_entry(text: &str, backend: &str, language: &str, duration_secs: f64) {
let entry = HistoryEntry {
timestamp: chrono::Local::now(),
text: text.to_string(),
backend: backend.to_string(),
language: language.to_string(),
duration_secs,
};
if let Err(e) = history::append_entry(&entry) {
warn!("failed to save history entry: {e}");
}
}
async fn handle_command_mode(
daemon_state: Arc<Mutex<DaemonState>>,
context: Arc<DaemonContext>,
) -> Response {
let current_state = {
let ds = daemon_state.lock().await;
ds.state_machine.state()
};
match current_state {
State::Recording => {
let is_command_mode = {
let ds = daemon_state.lock().await;
ds.command_mode.is_some()
};
if !is_command_mode {
return Response::Error {
message: "recording is active but not in command mode — use toggle or cancel"
.to_string(),
};
}
let mut ds = daemon_state.lock().await;
if let Some(mut capture) = ds.audio_capture.take() {
capture.stop();
tokio::task::spawn_blocking(move || drop(capture));
}
info!("command mode: manual stop");
Response::Ok {
state: State::Recording,
}
}
State::Idle => {
command_mode_start(daemon_state, context).await
}
State::Transcribing => Response::Error {
message: "cannot start command mode while transcribing".to_string(),
},
}
}
async fn command_mode_start(
daemon_state: Arc<Mutex<DaemonState>>,
context: Arc<DaemonContext>,
) -> Response {
let llm_config = context.config.llm.clone().unwrap_or_default();
info!("command mode: getting selected text");
let clipboard = ClipboardOps::detect();
let saved_clipboard = clipboard.get_text().unwrap_or_default();
let is_terminal = context
.window_tracker
.get_focused_window_class()
.map(|c| is_terminal_class(&c))
.unwrap_or(false);
let selected_text = clipboard.get_primary_selection().unwrap_or_default();
let copy_fn = if is_terminal {
simulate_terminal_copy
} else {
simulate_copy
};
match tokio::task::spawn_blocking(copy_fn).await {
Ok(Ok(())) => {}
Ok(Err(e)) => {
if selected_text.is_empty() {
return Response::Error {
message: format!("failed to copy selection: {e}"),
};
}
warn!("copy simulation failed ({e}), using primary selection text");
}
Err(e) => {
if selected_text.is_empty() {
return Response::Error {
message: format!("copy task panicked: {e}"),
};
}
warn!("copy task panicked ({e}), using primary selection text");
}
}
let selected_text = if selected_text.is_empty() {
tokio::time::sleep(std::time::Duration::from_millis(100)).await;
match clipboard.get_text() {
Ok(text) => text,
Err(e) => {
return Response::Error {
message: format!("failed to read clipboard: {e}"),
};
}
}
} else {
selected_text
};
if selected_text.is_empty() || selected_text == saved_clipboard {
return Response::Error {
message: "no text selected — select some text before using command mode".to_string(),
};
}
info!(
"command mode: got {} chars of selected text",
selected_text.len()
);
if context.config.general.audio_feedback {
feedback::play_start(context.config.general.audio_feedback_volume);
}
let mut capture = match AudioCaptureHandle::start() {
Ok(c) => c,
Err(e) => {
return Response::Error {
message: format!("failed to start audio capture: {e}"),
};
}
};
let audio_rx = capture.take_receiver();
{
let mut ds = daemon_state.lock().await;
if let Err(e) = ds.state_machine.transition(Action::Toggle) {
return Response::Error {
message: format!("state transition failed: {e}"),
};
}
ds.audio_capture = Some(capture);
ds.recording_started_at = Some(std::time::Instant::now());
ds.command_mode = Some(CommandModeContext {
selected_text,
saved_clipboard,
llm_config,
is_terminal,
});
}
if context.notify {
send_notification(
"whisrs",
"Command mode: speak your instruction... (press again to stop)",
);
}
let ds_ref = Arc::clone(&daemon_state);
let ctx = Arc::clone(&context);
tokio::spawn(async move {
command_mode_background(audio_rx, ds_ref, ctx).await;
});
Response::Ok {
state: State::Recording,
}
}
async fn command_mode_background(
audio_rx: Option<tokio::sync::mpsc::UnboundedReceiver<Vec<i16>>>,
daemon_state: Arc<Mutex<DaemonState>>,
context: Arc<DaemonContext>,
) {
let silence_timeout = context.config.general.silence_timeout_ms;
let mut auto_stop = AutoStopDetector::new(0.003, silence_timeout, 16_000);
let mut all_samples: Vec<i16> = Vec::new();
if let Some(mut rx) = audio_rx {
while let Some(chunk) = rx.recv().await {
all_samples.extend_from_slice(&chunk);
if auto_stop.feed(&chunk) {
info!("command mode: silence auto-stop");
let mut ds = daemon_state.lock().await;
if let Some(mut capture) = ds.audio_capture.take() {
capture.stop();
tokio::task::spawn_blocking(move || drop(capture));
}
break;
}
}
}
let cmd_ctx = {
let mut ds = daemon_state.lock().await;
ds.audio_capture.take(); ds.command_mode.take()
};
let Some(cmd_ctx) = cmd_ctx else {
warn!("command mode: context missing, aborting");
let mut ds = daemon_state.lock().await;
let _ = ds.state_machine.transition(Action::Toggle);
let _ = ds.state_machine.transition(Action::TranscriptionDone);
return;
};
{
let mut ds = daemon_state.lock().await;
let _ = ds.state_machine.transition(Action::Toggle);
}
if context.config.general.audio_feedback {
feedback::play_stop(context.config.general.audio_feedback_volume);
}
if all_samples.is_empty() {
if context.notify {
send_notification("whisrs", "Command mode: no audio captured");
}
let mut ds = daemon_state.lock().await;
let _ = ds.state_machine.transition(Action::TranscriptionDone);
return;
}
if context.notify {
send_notification("whisrs", "Processing command...");
}
let wav_data = match whisrs::audio::capture::encode_wav(&all_samples) {
Ok(d) => d,
Err(e) => {
error!("command mode: failed to encode audio: {e}");
let mut ds = daemon_state.lock().await;
let _ = ds.state_machine.transition(Action::TranscriptionDone);
return;
}
};
let config = TranscriptionConfig {
language: context.config.general.language.clone(),
model: get_model_for_backend(&context.config),
prompt: None,
};
let instruction = match context
.transcription_backend
.transcribe(&wav_data, &config)
.await
{
Ok(text) => text,
Err(e) => {
error!("command mode: transcription failed: {e}");
if context.notify {
send_notification("whisrs", &format!("Command failed: {e}"));
}
let mut ds = daemon_state.lock().await;
let _ = ds.state_machine.transition(Action::TranscriptionDone);
return;
}
};
if instruction.is_empty() {
if context.notify {
send_notification("whisrs", "Could not understand instruction — try again");
}
let mut ds = daemon_state.lock().await;
let _ = ds.state_machine.transition(Action::TranscriptionDone);
return;
}
info!("command mode: instruction = {:?}", instruction);
let result =
match llm::rewrite_text(&cmd_ctx.llm_config, &cmd_ctx.selected_text, &instruction).await {
Ok(text) => text,
Err(e) => {
error!("command mode: LLM failed: {e}");
if context.notify {
send_notification("whisrs", &format!("Command failed: {e}"));
}
let mut ds = daemon_state.lock().await;
let _ = ds.state_machine.transition(Action::TranscriptionDone);
return;
}
};
info!("command mode: pasting {} chars", result.len());
let clipboard = ClipboardOps::detect();
if let Err(e) = clipboard.set_text(&result) {
error!("command mode: failed to set clipboard: {e}");
let mut ds = daemon_state.lock().await;
let _ = ds.state_machine.transition(Action::TranscriptionDone);
return;
}
tokio::time::sleep(std::time::Duration::from_millis(50)).await;
if cmd_ctx.is_terminal {
match tokio::task::spawn_blocking(simulate_terminal_clear_and_paste).await {
Ok(Ok(())) => {}
Ok(Err(e)) => warn!("failed to clear+paste in terminal: {e}"),
Err(e) => warn!("terminal clear+paste task panicked: {e}"),
}
} else {
match tokio::task::spawn_blocking(simulate_paste).await {
Ok(Ok(())) => {}
Ok(Err(e)) => warn!("failed to paste: {e}"),
Err(e) => warn!("paste task panicked: {e}"),
}
}
let saved = cmd_ctx.saved_clipboard.clone();
let clipboard_restore = ClipboardOps::detect();
tokio::spawn(async move {
tokio::time::sleep(std::time::Duration::from_millis(500)).await;
if let Err(e) = clipboard_restore.set_text(&saved) {
warn!("failed to restore clipboard: {e}");
}
});
if context.config.general.audio_feedback {
feedback::play_done(context.config.general.audio_feedback_volume);
}
if context.notify {
send_notification("whisrs", &format!("Command applied: {instruction}"));
}
let mut ds = daemon_state.lock().await;
let _ = ds.state_machine.transition(Action::TranscriptionDone);
let _ = context.state_tx.send(ds.state_machine.state());
}
fn simulate_key_combo(modifier: evdev::Key, key: evdev::Key) -> anyhow::Result<()> {
use evdev::{AttributeSet, EventType, InputEvent, Key};
use std::thread;
use std::time::Duration;
let mut keys = AttributeSet::<Key>::new();
keys.insert(modifier);
keys.insert(key);
let mut device = evdev::uinput::VirtualDeviceBuilder::new()
.context("failed to create VirtualDeviceBuilder")?
.name("whisrs command")
.with_keys(&keys)
.context("failed to register key events")?
.build()
.context("failed to build uinput device")?;
thread::sleep(Duration::from_millis(200));
device.emit(&[InputEvent::new(EventType::KEY, modifier.code(), 1)])?;
thread::sleep(Duration::from_millis(2));
device.emit(&[InputEvent::new(EventType::KEY, key.code(), 1)])?;
thread::sleep(Duration::from_millis(2));
device.emit(&[InputEvent::new(EventType::KEY, key.code(), 0)])?;
thread::sleep(Duration::from_millis(2));
device.emit(&[InputEvent::new(EventType::KEY, modifier.code(), 0)])?;
thread::sleep(Duration::from_millis(2));
Ok(())
}
fn simulate_key_combo_2mod(
mod1: evdev::Key,
mod2: evdev::Key,
key: evdev::Key,
) -> anyhow::Result<()> {
use evdev::{AttributeSet, EventType, InputEvent, Key};
use std::thread;
use std::time::Duration;
let mut keys = AttributeSet::<Key>::new();
keys.insert(mod1);
keys.insert(mod2);
keys.insert(key);
let mut device = evdev::uinput::VirtualDeviceBuilder::new()
.context("failed to create VirtualDeviceBuilder")?
.name("whisrs command")
.with_keys(&keys)
.context("failed to register key events")?
.build()
.context("failed to build uinput device")?;
thread::sleep(Duration::from_millis(200));
device.emit(&[InputEvent::new(EventType::KEY, mod1.code(), 1)])?;
thread::sleep(Duration::from_millis(2));
device.emit(&[InputEvent::new(EventType::KEY, mod2.code(), 1)])?;
thread::sleep(Duration::from_millis(2));
device.emit(&[InputEvent::new(EventType::KEY, key.code(), 1)])?;
thread::sleep(Duration::from_millis(2));
device.emit(&[InputEvent::new(EventType::KEY, key.code(), 0)])?;
thread::sleep(Duration::from_millis(2));
device.emit(&[InputEvent::new(EventType::KEY, mod2.code(), 0)])?;
thread::sleep(Duration::from_millis(2));
device.emit(&[InputEvent::new(EventType::KEY, mod1.code(), 0)])?;
thread::sleep(Duration::from_millis(2));
Ok(())
}
fn simulate_copy() -> anyhow::Result<()> {
simulate_key_combo(evdev::Key::KEY_LEFTCTRL, evdev::Key::KEY_C)
}
fn simulate_terminal_copy() -> anyhow::Result<()> {
simulate_key_combo_2mod(
evdev::Key::KEY_LEFTCTRL,
evdev::Key::KEY_LEFTSHIFT,
evdev::Key::KEY_C,
)
}
fn simulate_paste() -> anyhow::Result<()> {
simulate_key_combo(evdev::Key::KEY_LEFTCTRL, evdev::Key::KEY_V)
}
fn simulate_terminal_clear_and_paste() -> anyhow::Result<()> {
use evdev::{AttributeSet, EventType, InputEvent, Key};
use std::thread;
use std::time::Duration;
let mut keys = AttributeSet::<Key>::new();
keys.insert(Key::KEY_LEFTCTRL);
keys.insert(Key::KEY_LEFTSHIFT);
keys.insert(Key::KEY_A);
keys.insert(Key::KEY_K);
keys.insert(Key::KEY_V);
let mut device = evdev::uinput::VirtualDeviceBuilder::new()
.context("failed to create VirtualDeviceBuilder")?
.name("whisrs command")
.with_keys(&keys)
.context("failed to register key events")?
.build()
.context("failed to build uinput device")?;
thread::sleep(Duration::from_millis(200));
device.emit(&[InputEvent::new(EventType::KEY, Key::KEY_LEFTCTRL.code(), 1)])?;
thread::sleep(Duration::from_millis(2));
device.emit(&[InputEvent::new(EventType::KEY, Key::KEY_A.code(), 1)])?;
thread::sleep(Duration::from_millis(2));
device.emit(&[InputEvent::new(EventType::KEY, Key::KEY_A.code(), 0)])?;
thread::sleep(Duration::from_millis(2));
device.emit(&[InputEvent::new(EventType::KEY, Key::KEY_K.code(), 1)])?;
thread::sleep(Duration::from_millis(2));
device.emit(&[InputEvent::new(EventType::KEY, Key::KEY_K.code(), 0)])?;
thread::sleep(Duration::from_millis(2));
device.emit(&[InputEvent::new(EventType::KEY, Key::KEY_LEFTCTRL.code(), 0)])?;
thread::sleep(Duration::from_millis(10));
device.emit(&[InputEvent::new(EventType::KEY, Key::KEY_LEFTCTRL.code(), 1)])?;
thread::sleep(Duration::from_millis(2));
device.emit(&[InputEvent::new(
EventType::KEY,
Key::KEY_LEFTSHIFT.code(),
1,
)])?;
thread::sleep(Duration::from_millis(2));
device.emit(&[InputEvent::new(EventType::KEY, Key::KEY_V.code(), 1)])?;
thread::sleep(Duration::from_millis(2));
device.emit(&[InputEvent::new(EventType::KEY, Key::KEY_V.code(), 0)])?;
thread::sleep(Duration::from_millis(2));
device.emit(&[InputEvent::new(
EventType::KEY,
Key::KEY_LEFTSHIFT.code(),
0,
)])?;
thread::sleep(Duration::from_millis(2));
device.emit(&[InputEvent::new(EventType::KEY, Key::KEY_LEFTCTRL.code(), 0)])?;
thread::sleep(Duration::from_millis(2));
Ok(())
}
const TERMINAL_CLASSES: &[&str] = &[
"alacritty",
"kitty",
"foot",
"wezterm",
"gnome-terminal",
"konsole",
"xterm",
"urxvt",
"terminator",
"tilix",
"st",
"xfce4-terminal",
"sakura",
"guake",
"yakuake",
"termite",
"cool-retro-term",
"ghostty",
];
fn is_terminal_class(class: &str) -> bool {
let lower = class.to_lowercase();
TERMINAL_CLASSES.iter().any(|t| lower.contains(t))
}
async fn handle_cancel(
daemon_state: Arc<Mutex<DaemonState>>,
context: Arc<DaemonContext>,
) -> Response {
let mut ds = daemon_state.lock().await;
match ds.state_machine.transition(Action::Cancel) {
Ok(new_state) => {
if let Some(mut capture) = ds.audio_capture.take() {
capture.stop();
tokio::task::spawn_blocking(move || drop(capture));
}
if let Some(task) = ds.streaming_task.take() {
task.abort();
}
ds.recording_window_id = None;
info!("cancelled recording");
if context.notify {
send_notification("whisrs", "Recording cancelled");
}
Response::Ok { state: new_state }
}
Err(e) => Response::Error {
message: e.to_string(),
},
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn truncate_ascii_short() {
assert_eq!(truncate_preview("hello", 77), "hello");
}
#[test]
fn truncate_ascii_exact() {
let text = "a".repeat(77);
assert_eq!(truncate_preview(&text, 77), text);
}
#[test]
fn truncate_ascii_long() {
let text = "a".repeat(100);
let expected = format!("{}...", "a".repeat(77));
assert_eq!(truncate_preview(&text, 77), expected);
}
#[test]
fn truncate_cyrillic() {
let text = "а".repeat(40); let preview = truncate_preview(&text, 77);
assert!(preview.ends_with("..."));
assert_eq!(preview, format!("{}...", "а".repeat(38)));
}
#[test]
fn truncate_cyrillic_issue_repro() {
let text = "Вот это рутина, блять, вы дешевые щели, блять.";
let preview = truncate_preview(text, 77);
assert!(preview.is_char_boundary(0));
assert!(!preview.is_empty());
}
#[test]
fn truncate_cjk() {
let text = "字".repeat(27);
let preview = truncate_preview(&text, 77);
assert!(preview.ends_with("..."));
assert_eq!(preview, format!("{}...", "字".repeat(25)));
}
#[test]
fn truncate_arabic() {
let text = "ش".repeat(40);
let preview = truncate_preview(&text, 77);
assert!(preview.ends_with("..."));
assert_eq!(preview, format!("{}...", "ش".repeat(38)));
}
#[test]
fn truncate_emoji() {
let text = "😀".repeat(20);
let preview = truncate_preview(&text, 77);
assert!(preview.ends_with("..."));
assert_eq!(preview, format!("{}...", "😀".repeat(19)));
}
#[test]
fn truncate_mixed_scripts() {
let text = "Hello Мир 世界! This is a test of mixed script truncation that is long enough";
let preview = truncate_preview(text, 77);
assert!(preview.ends_with("..."));
assert!(preview.len() <= 80); }
#[test]
fn truncate_empty() {
assert_eq!(truncate_preview("", 77), "");
}
}