transcribe-cli 0.0.5

Whisper CLI transcription pipeline on CTranslate2 with CPU and optional CUDA support
use std::ffi::{CStr, CString};
use std::os::raw::{c_char, c_int, c_long, c_uint, c_ulong, c_void};
use std::ptr;

use anyhow::{Context, Result, anyhow, bail};

// Hand-written copies of ALSA enum values that cross the FFI boundary as plain
// `c_int`s. They are not generated from headers, so they must stay in sync with
// alsa-lib's `pcm.h`.
/// `stream` argument for `snd_pcm_open`: open the device for capture (recording).
const SND_PCM_STREAM_CAPTURE: c_int = 1;
/// `access` argument for `snd_pcm_set_params`: interleaved read/write access.
const SND_PCM_ACCESS_RW_INTERLEAVED: c_int = 3;
/// `format` argument for `snd_pcm_set_params`: signed 16-bit little-endian samples.
const SND_PCM_FORMAT_S16_LE: c_int = 2;
/// `latency` argument for `snd_pcm_set_params`, in microseconds (80 ms).
const DEFAULT_PCM_LATENCY_US: c_uint = 80_000;
/// Number of frames requested from ALSA by each `snd_pcm_readi` call.
const DEFAULT_FRAMES_PER_READ: usize = 1024;

/// Opaque stand-in for ALSA's `snd_pcm_t`.
///
/// It is never constructed or inspected on the Rust side; it exists only so that
/// the FFI declarations can use a distinct `*mut SndPcm` pointer type.
#[repr(C)]
struct SndPcm(c_void);

// Minimal bindings to the subset of `libasound` (ALSA) used for blocking capture.
// Every function that returns an integer reports failure as a negative error code.
#[link(name = "asound")]
unsafe extern "C" {
    /// Opens the PCM device called `name` and stores the new handle in `*pcm`.
    fn snd_pcm_open(
        pcm: *mut *mut SndPcm,
        name: *const c_char,
        stream: c_int,
        mode: c_int,
    ) -> c_int;
    /// Closes a handle obtained from `snd_pcm_open`; the pointer must not be used afterwards.
    fn snd_pcm_close(pcm: *mut SndPcm) -> c_int;
    /// Configures sample format, access type, channel count, rate and latency in one call.
    fn snd_pcm_set_params(
        pcm: *mut SndPcm,
        format: c_int,
        access: c_int,
        channels: c_uint,
        rate: c_uint,
        soft_resample: c_int,
        latency: c_uint,
    ) -> c_int;
    /// Reads up to `size` interleaved *frames* (one sample per channel each) into `buffer`.
    /// Returns the number of frames read, or a negative error code.
    fn snd_pcm_readi(pcm: *mut SndPcm, buffer: *mut c_void, size: c_ulong) -> c_long;
    /// Tries to bring the stream back to a usable state after the error `err`.
    /// A non-zero `silent` suppresses ALSA's own error message.
    fn snd_pcm_recover(pcm: *mut SndPcm, err: c_int, silent: c_int) -> c_int;
    /// Returns a NUL-terminated, human-readable message for an ALSA error code.
    fn snd_strerror(errnum: c_int) -> *const c_char;
}

/// Blocking microphone capture backed by an ALSA PCM handle.
///
/// The device is configured for interleaved signed 16-bit little-endian capture.
/// The underlying handle is released by this type's `Drop` implementation.
pub struct MicrophoneCapture {
    /// ALSA handle owned by this value.
    pcm: *mut SndPcm,
    /// Number of frames requested from ALSA per read call.
    frames_per_read: usize,
    /// Samples per frame, i.e. the channel count the device was configured with.
    channels: usize,
}

impl MicrophoneCapture {
    /// Opens the ALSA `"default"` capture device and configures it.
    ///
    /// # Errors
    ///
    /// Fails when `sample_rate` or `channels` is zero, when the device cannot be
    /// opened, or when ALSA rejects the requested configuration. The handle is
    /// closed again if configuration fails.
    pub fn open_default(sample_rate: u32, channels: u16) -> Result<Self> {
        if sample_rate == 0 {
            bail!("microphone sample rate must be greater than zero");
        }
        if channels == 0 {
            bail!("microphone channel count must be greater than zero");
        }

        let device_name =
            CString::new("default").context("failed to build default ALSA device name")?;
        let mut pcm = ptr::null_mut();

        // SAFETY: `pcm` is a valid out-pointer and `device_name` is a live,
        // NUL-terminated string for the duration of the call.
        alsa_result(unsafe {
            snd_pcm_open(&mut pcm, device_name.as_ptr(), SND_PCM_STREAM_CAPTURE, 0)
        })
        .context("failed to open default microphone device")?;

        // Owning wrapper: closes the handle if configuration below bails out.
        let pcm = PcmHandle::new(pcm);
        // SAFETY: `pcm` wraps the handle that `snd_pcm_open` just returned successfully.
        alsa_result(unsafe {
            snd_pcm_set_params(
                pcm.as_ptr(),
                SND_PCM_FORMAT_S16_LE,
                SND_PCM_ACCESS_RW_INTERLEAVED,
                c_uint::from(channels),
                sample_rate,
                1,
                DEFAULT_PCM_LATENCY_US,
            )
        })
        .with_context(|| {
            format!(
                "failed to configure microphone capture for {} Hz / {} channel(s)",
                sample_rate, channels
            )
        })?;

        Ok(Self {
            pcm: pcm.into_raw(),
            frames_per_read: DEFAULT_FRAMES_PER_READ,
            channels: usize::from(channels),
        })
    }

    /// Reads the next chunk of audio and returns it as interleaved `f32` samples
    /// clamped to `[-1.0, 1.0]`.
    ///
    /// The result holds `frames * channels` samples, where `frames` is whatever
    /// ALSA delivered (at most `frames_per_read`). Zero-length reads are retried.
    ///
    /// # Errors
    ///
    /// Fails when ALSA reports a read error that `snd_pcm_recover` cannot handle.
    pub fn read_samples(&mut self) -> Result<Vec<f32>> {
        let pcm = PcmHandle::borrowed(self.pcm);
        // `snd_pcm_readi` counts in frames and every frame carries one sample per
        // channel, so the buffer must be sized in samples. Sizing it in frames lets
        // ALSA write past the end of the allocation for multi-channel capture.
        let mut buffer = vec![0i16; self.frames_per_read * self.channels];

        loop {
            // SAFETY: `buffer` has room for `frames_per_read` frames of `channels`
            // i16 samples each, and `pcm` is the handle owned by `self`.
            let frames_read = unsafe {
                snd_pcm_readi(
                    pcm.as_ptr(),
                    buffer.as_mut_ptr().cast::<c_void>(),
                    self.frames_per_read as c_ulong,
                )
            };

            if frames_read > 0 {
                // Positive, so the cast cannot wrap; the `min` keeps the slice in
                // bounds even if the library ever reported more than was requested.
                let samples_read = (frames_read as usize)
                    .saturating_mul(self.channels)
                    .min(buffer.len());
                return Ok(buffer[..samples_read]
                    .iter()
                    .map(|sample| (*sample as f32 / i16::MAX as f32).clamp(-1.0, 1.0))
                    .collect());
            }

            if frames_read == 0 {
                continue;
            }

            // Negative return: an ALSA error code. Try to recover and read again.
            // SAFETY: `pcm` is the handle owned by `self`.
            let recovered = unsafe { snd_pcm_recover(pcm.as_ptr(), frames_read as c_int, 1) };
            if recovered < 0 {
                return Err(alsa_error(recovered))
                    .context("microphone capture failed and ALSA could not recover");
            }
        }
    }
}

/// Releases the ALSA handle when the capture object goes away.
impl Drop for MicrophoneCapture {
    fn drop(&mut self) {
        // Nothing to release if no handle was ever stored.
        if self.pcm.is_null() {
            return;
        }

        // SAFETY: `self.pcm` is non-null and is not used again after this call.
        // The return code is ignored because a destructor cannot report failure.
        let _ = unsafe { snd_pcm_close(self.pcm) };
    }
}

/// Converts an ALSA return code into a `Result`.
///
/// Negative codes become an error built by `alsa_error`; zero and positive
/// codes count as success.
fn alsa_result(code: c_int) -> Result<()> {
    match code {
        failure if failure < 0 => Err(alsa_error(failure)),
        _ => Ok(()),
    }
}

/// Builds an `anyhow` error that carries the numeric ALSA code together with
/// the message `snd_strerror` reports for it.
fn alsa_error(code: c_int) -> anyhow::Error {
    // SAFETY: relies on `snd_strerror` returning a valid, NUL-terminated string
    // that stays alive while it is copied below.
    // NOTE(review): the pointer is not checked for null before use.
    let description = unsafe { CStr::from_ptr(snd_strerror(code)) };
    // Copy into an owned String; invalid UTF-8 is replaced rather than rejected.
    let message = description.to_string_lossy().into_owned();
    anyhow!("ALSA error {code}: {message}")
}

/// Thin wrapper around a raw ALSA handle that optionally closes it on drop.
struct PcmHandle {
    /// The wrapped raw handle; may be null.
    ptr: *mut SndPcm,
    /// Whether dropping this wrapper should close `ptr`.
    owned: bool,
}

impl PcmHandle {
    /// Wraps `ptr` and takes responsibility for closing it.
    fn new(ptr: *mut SndPcm) -> Self {
        PcmHandle { owned: true, ptr }
    }

    /// Wraps `ptr` without taking responsibility for closing it.
    fn borrowed(ptr: *mut SndPcm) -> Self {
        PcmHandle { owned: false, ptr }
    }

    /// Returns the wrapped raw pointer without affecting ownership.
    fn as_ptr(&self) -> *mut SndPcm {
        self.ptr
    }

    /// Gives up ownership and hands the raw pointer back to the caller.
    fn into_raw(mut self) -> *mut SndPcm {
        let raw = self.ptr;
        // Disarm the destructor, which still runs when `self` goes out of scope.
        self.owned = false;
        raw
    }
}

impl Drop for PcmHandle {
    fn drop(&mut self) {
        // Borrowed or empty wrappers have nothing to release.
        if !self.owned || self.ptr.is_null() {
            return;
        }

        // SAFETY: the wrapper owns this non-null handle and never uses it again.
        // The return code is ignored because a destructor cannot report failure.
        let _ = unsafe { snd_pcm_close(self.ptr) };
    }
}

#[cfg(test)]
mod tests {
    /// Checks the i16 -> f32 scaling formula at its extremes: `i16::MIN` must
    /// clamp to exactly -1.0, zero must stay zero, and `i16::MAX` must map to 1.0.
    #[test]
    fn converts_i16_audio_to_unit_f32_range() {
        let full_scale = i16::MAX as f32;
        let to_unit = |sample: i16| (sample as f32 / full_scale).clamp(-1.0, 1.0);

        let converted: Vec<f32> = [i16::MIN, 0, i16::MAX]
            .iter()
            .copied()
            .map(to_unit)
            .collect();

        assert_eq!(converted[0], -1.0);
        assert_eq!(converted[1], 0.0);
        assert_eq!(converted[2], 1.0);
    }
}