pub mod screen_diff;
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
use std::sync::{Arc, Mutex};
use std::thread::{self, JoinHandle};
use std::time::{Duration, Instant};
use tracing::{debug, warn};
use crate::audio::{capture_system_audio, transcribe, AudioData};
/// Fixed-capacity ring buffer of `f32` audio samples.
///
/// Once full, newly pushed samples overwrite the oldest ones, so the
/// buffer always holds the most recent `capacity` samples.
pub struct AudioRingBuffer {
    // Backing storage; always exactly `capacity` elements long.
    data: Vec<f32>,
    // Maximum number of samples retained. Invariant: > 0.
    capacity: usize,
    // Index at which the next sample will be written.
    write_pos: usize,
    // Number of valid samples currently stored (<= capacity).
    len: usize,
}

impl AudioRingBuffer {
    /// Creates a buffer that retains at most `capacity` samples.
    ///
    /// # Panics
    /// Panics if `capacity` is zero.
    #[must_use]
    pub fn new(capacity: usize) -> Self {
        assert!(capacity > 0, "AudioRingBuffer capacity must be > 0");
        Self {
            data: vec![0.0f32; capacity],
            capacity,
            write_pos: 0,
            len: 0,
        }
    }

    /// Appends `samples`, overwriting the oldest data once the buffer
    /// is full.
    pub fn push_slice(&mut self, samples: &[f32]) {
        for &sample in samples {
            self.data[self.write_pos] = sample;
            self.write_pos = (self.write_pos + 1) % self.capacity;
            self.len = (self.len + 1).min(self.capacity);
        }
    }

    /// Returns up to `count` of the most recently pushed samples, oldest
    /// first. The request is clamped to however many samples are stored.
    #[must_use]
    pub fn read_last(&self, count: usize) -> Vec<f32> {
        let take = count.min(self.len);
        if take == 0 {
            return Vec::new();
        }
        // First index of the window: `take` samples behind the write head.
        // `write_pos + capacity >= take` always holds, so no underflow.
        let first = (self.write_pos + self.capacity - take) % self.capacity;
        (0..take)
            .map(|offset| self.data[(first + offset) % self.capacity])
            .collect()
    }

    /// Number of valid samples currently stored.
    #[must_use]
    pub fn len(&self) -> usize {
        self.len
    }

    /// `true` when no samples are stored.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.len == 0
    }

    /// Discards all stored samples. Capacity is unchanged; the backing
    /// storage is not zeroed because `len` gates every read.
    pub fn clear(&mut self) {
        self.write_pos = 0;
        self.len = 0;
    }
}
/// One transcribed audio window, positioned relative to session start.
#[derive(Debug, Clone)]
pub struct TranscriptSegment {
    /// Transcribed text for this window.
    pub text: String,
    /// Window start, in milliseconds since the session started.
    pub start_ms: u64,
    /// Window end, in milliseconds since the session started.
    pub end_ms: u64,
    /// Speaker label when available; `None` when no diarization was done.
    pub speaker: Option<String>,
}
/// A stored screen snapshot.
#[derive(Debug, Clone)]
pub struct ScreenFrame {
    /// PNG image data, base64-encoded.
    pub png_base64: String,
    /// Human-readable capture time relative to session start
    /// (formatted as `T+<secs>s` by the capture loop).
    pub timestamp: String,
    /// Display width as reported by the capture backend.
    pub width: u32,
    /// Display height as reported by the capture backend.
    pub height: u32,
}
/// Options controlling what a capture session records.
#[derive(Debug, Clone)]
pub struct CaptureConfig {
    /// Capture system audio into the ring buffer.
    pub audio: bool,
    /// Transcribe each captured audio window (only consulted when
    /// `audio` is enabled).
    pub transcribe: bool,
    /// Periodically snapshot the primary display.
    pub screen: bool,
    /// How many seconds of audio the ring buffer retains.
    pub buffer_seconds: u32,
    /// Minimum screen-diff score for a new frame to be stored.
    pub screen_diff_threshold: f32,
}

impl Default for CaptureConfig {
    /// Audio capture and transcription on, screen capture off, a
    /// 60-second audio buffer, and a 5% screen-diff threshold.
    fn default() -> Self {
        CaptureConfig {
            audio: true,
            transcribe: true,
            screen: false,
            buffer_seconds: 60,
            screen_diff_threshold: 0.05,
        }
    }
}
/// Mutable state shared between a session handle and its worker thread.
#[derive(Default)]
pub(crate) struct CaptureState {
    /// Transcript segments, appended as audio windows are transcribed.
    pub(crate) transcript_segments: Vec<TranscriptSegment>,
    /// The most recently stored screen frame, if any.
    pub(crate) latest_frame: Option<ScreenFrame>,
}
/// Handle to a running capture session.
///
/// Owns the background worker thread; dropping the session stops and
/// joins it (see the `Drop` impl).
pub struct CaptureSession {
    /// Unique (per process) session identifier, 16 hex digits.
    pub session_id: String,
    // Ring buffer the worker thread appends captured audio into.
    audio_buffer: Arc<Mutex<AudioRingBuffer>>,
    // Transcript segments and latest frame, shared with the worker.
    state: Arc<Mutex<CaptureState>>,
    // Cleared by `stop()`; polled by the worker loop.
    running: Arc<AtomicBool>,
    // Sample rate used to convert between sample counts and seconds.
    sample_rate: u32,
    // Worker thread handle; taken (set to None) when joined by `stop()`.
    handle: Option<JoinHandle<()>>,
    // Configuration the session was started with.
    config: CaptureConfig,
    // Basis for all session-relative timestamps.
    started_at: Instant,
    // Count of screen frames stored (significant diffs).
    frames_captured: Arc<AtomicU64>,
    // Count of screen frames skipped as near-duplicates.
    frames_skipped: Arc<AtomicU64>,
}
/// Monotonic counter used to derive process-unique session ids.
static SESSION_COUNTER: AtomicU64 = AtomicU64::new(1);
/// Length of each blocking audio capture window, in seconds.
const AUDIO_WINDOW_SECS: f32 = 5.0;
/// Minimum interval between screen snapshots, in seconds.
const SCREEN_POLL_SECS: u64 = 3;
impl CaptureSession {
    /// Starts a new session and spawns the background capture thread.
    ///
    /// The audio ring buffer is sized to hold `config.buffer_seconds`
    /// of samples at the crate's fixed sample rate (minimum 1 sample).
    #[must_use]
    pub fn start(config: CaptureConfig) -> Self {
        let sample_rate = crate::audio::SAMPLE_RATE;
        let capacity = (config.buffer_seconds as usize)
            .saturating_mul(sample_rate as usize)
            .max(1);
        let audio_buffer = Arc::new(Mutex::new(AudioRingBuffer::new(capacity)));
        let state = Arc::new(Mutex::new(CaptureState::default()));
        let running = Arc::new(AtomicBool::new(true));
        let frames_captured = Arc::new(AtomicU64::new(0));
        let frames_skipped = Arc::new(AtomicU64::new(0));
        // Process-unique id formatted as 16 hex digits.
        let id = SESSION_COUNTER.fetch_add(1, Ordering::Relaxed);
        let session_id = format!("{id:016x}");
        let started_at = Instant::now();
        let handle = spawn_capture_thread(
            config.clone(),
            Arc::clone(&audio_buffer),
            Arc::clone(&state),
            Arc::clone(&running),
            Arc::clone(&frames_captured),
            Arc::clone(&frames_skipped),
        );
        Self {
            session_id,
            audio_buffer,
            state,
            running,
            sample_rate,
            handle: Some(handle),
            config,
            started_at,
            frames_captured,
            frames_skipped,
        }
    }

    /// Signals the worker thread to stop and joins it.
    ///
    /// Idempotent: subsequent calls find the handle already taken and
    /// do nothing. May block until the current capture window finishes.
    pub fn stop(&mut self) {
        self.running.store(false, Ordering::Release);
        if let Some(h) = self.handle.take() {
            let _ = h.join();
        }
    }

    /// `true` until `stop()` has been called (flag cleared and join
    /// handle consumed).
    #[must_use]
    pub fn is_running(&self) -> bool {
        self.running.load(Ordering::Acquire) && self.handle.is_some()
    }

    /// Milliseconds elapsed since the session was started.
    #[must_use]
    #[allow(clippy::cast_possible_truncation)]
    pub fn duration_ms(&self) -> u64 {
        self.started_at.elapsed().as_millis() as u64
    }

    /// Seconds of audio currently buffered. Returns 0.0 when the buffer
    /// lock is poisoned.
    #[must_use]
    #[allow(clippy::cast_precision_loss)]
    pub fn audio_buffer_seconds(&self) -> f64 {
        let len = self
            .audio_buffer
            .lock()
            .map(|g| g.len())
            .unwrap_or_default();
        len as f64 / f64::from(self.sample_rate)
    }

    /// Returns the transcript segments whose end falls within the last
    /// `since_seconds` of the session.
    ///
    /// Returns an empty vec when the state lock is poisoned.
    #[must_use]
    pub fn read_transcription(&self, since_seconds: u64) -> Vec<TranscriptSegment> {
        let Ok(guard) = self.state.lock() else {
            return Vec::new();
        };
        let duration_ms = self.duration_ms();
        let since_ms = since_seconds.saturating_mul(1_000);
        // Saturates to 0 early in the session, so every segment passes.
        let cutoff_ms = duration_ms.saturating_sub(since_ms);
        guard
            .transcript_segments
            .iter()
            .filter(|s| s.end_ms >= cutoff_ms)
            .cloned()
            .collect()
    }

    /// Total number of transcript segments stored so far (0 when the
    /// state lock is poisoned).
    #[must_use]
    pub fn transcript_segment_count(&self) -> usize {
        self.state
            .lock()
            .map(|g| g.transcript_segments.len())
            .unwrap_or_default()
    }

    /// The most recently stored screen frame, if any.
    #[must_use]
    pub fn latest_frame(&self) -> Option<ScreenFrame> {
        self.state.lock().ok()?.latest_frame.clone()
    }

    /// Number of screen frames stored (significant diffs).
    #[must_use]
    pub fn frames_captured(&self) -> u64 {
        self.frames_captured.load(Ordering::Relaxed)
    }

    /// Number of screen frames skipped as near-duplicates.
    #[must_use]
    pub fn frames_skipped(&self) -> u64 {
        self.frames_skipped.load(Ordering::Relaxed)
    }

    /// The configuration this session was started with.
    #[must_use]
    pub fn config(&self) -> &CaptureConfig {
        &self.config
    }

    /// Returns the most recent `seconds` of buffered audio samples,
    /// clamped to what is available. Empty when the lock is poisoned.
    #[must_use]
    #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
    pub fn read_audio_samples(&self, seconds: f32) -> Vec<f32> {
        let count = (f64::from(seconds) * f64::from(self.sample_rate)).round() as usize;
        self.audio_buffer
            .lock()
            .map(|g| g.read_last(count))
            .unwrap_or_default()
    }
}
impl Drop for CaptureSession {
    // Ensure the worker thread is signalled and joined even when the
    // session is dropped without an explicit `stop()` call.
    fn drop(&mut self) {
        self.stop();
    }
}
/// Spawns the named background thread that runs the capture loop.
///
/// # Panics
/// Panics if the OS refuses to spawn a thread.
#[allow(clippy::needless_pass_by_value)]
fn spawn_capture_thread(
    config: CaptureConfig,
    audio_buffer: Arc<Mutex<AudioRingBuffer>>,
    state: Arc<Mutex<CaptureState>>,
    running: Arc<AtomicBool>,
    frames_captured: Arc<AtomicU64>,
    frames_skipped: Arc<AtomicU64>,
) -> JoinHandle<()> {
    // The worker owns clones of all shared handles for its lifetime.
    let worker = move || {
        run_capture_loop(
            config,
            audio_buffer,
            state,
            running,
            frames_captured,
            frames_skipped,
        );
    };
    thread::Builder::new()
        .name("ax-capture".to_string())
        .spawn(worker)
        .expect("failed to spawn capture thread")
}
/// Body of the capture worker thread.
///
/// Repeatedly captures audio windows and/or screen snapshots, as enabled
/// by `config`, until the `running` flag is cleared by `stop()`.
#[allow(clippy::needless_pass_by_value)]
fn run_capture_loop(
    config: CaptureConfig,
    audio_buffer: Arc<Mutex<AudioRingBuffer>>,
    state: Arc<Mutex<CaptureState>>,
    running: Arc<AtomicBool>,
    frames_captured: Arc<AtomicU64>,
    frames_skipped: Arc<AtomicU64>,
) {
    let started = Instant::now();
    // Backdate the last screen capture so the first snapshot happens on
    // the first iteration instead of after SCREEN_POLL_SECS. `checked_sub`
    // can fail very early after boot; the fallback merely delays the first
    // snapshot by one poll interval. `unwrap_or_else` avoids evaluating
    // `Instant::now()` eagerly on the common (Some) path.
    let mut last_screen_capture = Instant::now()
        .checked_sub(Duration::from_secs(SCREEN_POLL_SECS))
        .unwrap_or_else(Instant::now);
    let mut segment_offset_ms: u64 = 0;
    let mut prev_fingerprint: Option<screen_diff::ScreenFingerprint> = None;
    while running.load(Ordering::Acquire) {
        if config.audio {
            // Blocks for one AUDIO_WINDOW_SECS capture window.
            capture_audio_window(
                &audio_buffer,
                &state,
                &running,
                config.transcribe,
                started,
                &mut segment_offset_ms,
            );
        }
        if config.screen
            && last_screen_capture.elapsed() >= Duration::from_secs(SCREEN_POLL_SECS)
        {
            capture_screen_snapshot(
                &state,
                started,
                config.screen_diff_threshold,
                &mut prev_fingerprint,
                &frames_captured,
                &frames_skipped,
            );
            last_screen_capture = Instant::now();
        }
        // Nothing enabled: sleep briefly so the stop flag is still polled
        // without spinning a core.
        if !config.audio && !config.screen {
            thread::sleep(Duration::from_millis(100));
        }
    }
    debug!("capture loop exited cleanly");
}
/// Captures one fixed-length window of system audio, appends it to the
/// ring buffer, and optionally transcribes it into the shared state.
///
/// On capture failure the error is logged and the thread backs off for a
/// second so a persistent failure does not busy-loop.
fn capture_audio_window(
    audio_buffer: &Arc<Mutex<AudioRingBuffer>>,
    state: &Arc<Mutex<CaptureState>>,
    running: &Arc<AtomicBool>,
    do_transcribe: bool,
    started: Instant,
    segment_offset_ms: &mut u64,
) {
    // Don't start a blocking capture window if a stop was requested.
    if !running.load(Ordering::Acquire) {
        return;
    }
    match capture_system_audio(AUDIO_WINDOW_SECS) {
        Err(e) => {
            warn!(error = %e, "audio capture window failed");
            thread::sleep(Duration::from_secs(1));
        }
        Ok(audio_data) => {
            // Timestamps are relative to session start; the window ends now.
            #[allow(clippy::cast_possible_truncation)]
            let window_end_ms = started.elapsed().as_millis() as u64;
            let window_start_ms = window_end_ms.saturating_sub(audio_data.duration_ms());
            append_audio_samples(audio_buffer, &audio_data);
            if do_transcribe {
                transcribe_and_store(state, &audio_data, window_start_ms, window_end_ms);
            }
            *segment_offset_ms = window_end_ms;
        }
    }
}
/// Pushes one window's samples into the shared ring buffer.
///
/// A poisoned lock is logged and the samples are dropped rather than
/// letting the panic propagate into the capture thread.
fn append_audio_samples(audio_buffer: &Arc<Mutex<AudioRingBuffer>>, data: &AudioData) {
    if let Err(e) = audio_buffer
        .lock()
        .map(|mut guard| guard.push_slice(&data.samples))
    {
        warn!("audio_buffer lock poisoned: {e}");
    }
}
fn transcribe_and_store(
state: &Arc<Mutex<CaptureState>>,
audio_data: &AudioData,
start_ms: u64,
end_ms: u64,
) {
match transcribe(audio_data, None) {
Ok(text) if !text.trim().is_empty() => {
let segment = TranscriptSegment {
text,
start_ms,
end_ms,
speaker: None,
};
match state.lock() {
Ok(mut guard) => guard.transcript_segments.push(segment),
Err(e) => warn!("state lock poisoned during transcription: {e}"),
}
}
Ok(_) => {}
Err(e) => warn!(error = %e, "transcription failed for audio window"),
}
}
/// Takes one snapshot of the primary display and stores it when it
/// differs enough from the previously *stored* frame.
///
/// The fingerprint is only advanced when a frame is stored, so gradual
/// drift accumulates until it crosses the threshold.
fn capture_screen_snapshot(
    state: &Arc<Mutex<CaptureState>>,
    started: Instant,
    threshold: f32,
    prev_fingerprint: &mut Option<screen_diff::ScreenFingerprint>,
    frames_captured: &Arc<AtomicU64>,
    frames_skipped: &Arc<AtomicU64>,
) {
    let elapsed_secs = started.elapsed().as_secs();
    let timestamp = format!("T+{elapsed_secs}s");
    let (png_base64, width, height, png_bytes) = match capture_primary_display_png() {
        Ok(result) => result,
        Err(e) => {
            warn!(error = %e, "screen snapshot failed");
            return;
        }
    };
    let fingerprint = screen_diff::ScreenFingerprint::from_png_bytes(&png_bytes);
    // The very first frame is always considered significant.
    let should_store = prev_fingerprint.as_ref().map_or(true, |prev| {
        let diff = screen_diff::ScreenDiff::compare(prev, &fingerprint);
        let significant = diff.is_significant(threshold);
        debug!(score = diff.score, threshold, significant, "screen diff");
        significant
    });
    if should_store {
        match state.lock() {
            Ok(mut guard) => {
                guard.latest_frame = Some(ScreenFrame {
                    png_base64,
                    timestamp,
                    width,
                    height,
                });
            }
            Err(e) => warn!("state lock poisoned during screen capture: {e}"),
        }
        *prev_fingerprint = Some(fingerprint);
        frames_captured.fetch_add(1, Ordering::Relaxed);
    } else {
        frames_skipped.fetch_add(1, Ordering::Relaxed);
    }
}
/// Captures the primary display to a PNG via the macOS `screencapture`
/// CLI and returns `(base64, width, height, raw_bytes)`.
///
/// NOTE(review): `width`/`height` come from the CGDisplay bounds, which
/// are in points — on Retina displays the PNG's pixel dimensions may be
/// larger. Confirm whether callers need pixel-exact dimensions.
///
/// # Errors
/// Returns a human-readable message when the capture tool cannot be
/// launched, exits non-zero, or its output file cannot be read.
#[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
fn capture_primary_display_png() -> Result<(String, u32, u32, Vec<u8>), String> {
    use base64::Engine as _;
    use core_graphics::display::CGDisplay;
    use std::process::Command;
    let main = CGDisplay::main();
    let bounds = main.bounds();
    let width = bounds.size.width as u32;
    let height = bounds.size.height as u32;
    // Per-process path avoids collisions between concurrent processes.
    let tmp = format!("/tmp/axterminator_capture_{}.png", std::process::id());
    // -x: no capture sound, -D 1: primary display.
    let output = Command::new("screencapture")
        .args(["-x", "-D", "1", &tmp])
        .output()
        .map_err(|e| format!("screencapture failed to launch: {e}"))?;
    if !output.status.success() {
        // Best-effort cleanup in case a partial file was written.
        let _ = std::fs::remove_file(&tmp);
        return Err(format!(
            "screencapture exited with status {}",
            output.status
        ));
    }
    // Read first, then always remove the temp file — the previous code
    // leaked it when the read failed.
    let read_result = std::fs::read(&tmp);
    let _ = std::fs::remove_file(&tmp);
    let png_bytes = read_result.map_err(|e| format!("failed to read capture file: {e}"))?;
    let png_base64 = base64::engine::general_purpose::STANDARD.encode(&png_bytes);
    Ok((png_base64, width, height, png_bytes))
}
#[cfg(test)]
mod tests {
    use super::*;

    // ---- AudioRingBuffer ----

    #[test]
    fn audio_ring_buffer_starts_empty() {
        let ring = AudioRingBuffer::new(100);
        assert!(ring.is_empty());
        assert_eq!(ring.len(), 0);
    }

    #[test]
    fn audio_ring_buffer_push_slice_below_capacity_stores_all() {
        let mut ring = AudioRingBuffer::new(16);
        ring.push_slice(&[1.0, 2.0, 3.0, 4.0]);
        assert_eq!(ring.len(), 4);
    }

    #[test]
    fn audio_ring_buffer_read_last_returns_n_most_recent() {
        let mut ring = AudioRingBuffer::new(8);
        ring.push_slice(&[1.0, 2.0, 3.0, 4.0, 5.0]);
        assert_eq!(ring.read_last(3), vec![3.0, 4.0, 5.0]);
    }

    #[test]
    fn audio_ring_buffer_overflow_evicts_oldest() {
        let mut ring = AudioRingBuffer::new(4);
        ring.push_slice(&[1.0, 2.0, 3.0, 4.0, 5.0, 6.0]);
        assert_eq!(ring.len(), 4);
        assert_eq!(ring.read_last(4), vec![3.0, 4.0, 5.0, 6.0]);
    }

    #[test]
    fn audio_ring_buffer_read_last_clamps_to_available() {
        let mut ring = AudioRingBuffer::new(10);
        ring.push_slice(&[7.0, 8.0, 9.0]);
        let tail = ring.read_last(100);
        assert_eq!(tail.len(), 3);
        assert_eq!(tail, vec![7.0, 8.0, 9.0]);
    }

    #[test]
    fn audio_ring_buffer_read_last_empty_returns_empty_vec() {
        let ring = AudioRingBuffer::new(8);
        assert_eq!(ring.read_last(4), Vec::<f32>::new());
    }

    #[test]
    fn audio_ring_buffer_clear_resets_state() {
        let mut ring = AudioRingBuffer::new(8);
        ring.push_slice(&[1.0, 2.0, 3.0]);
        ring.clear();
        assert!(ring.is_empty());
        assert_eq!(ring.len(), 0);
        assert_eq!(ring.read_last(4), Vec::<f32>::new());
    }

    #[test]
    fn audio_ring_buffer_multiple_wraps_preserves_latest() {
        let mut ring = AudioRingBuffer::new(3);
        ring.push_slice(&[1.0, 2.0, 3.0]);
        ring.push_slice(&[4.0, 5.0, 6.0]);
        ring.push_slice(&[7.0, 8.0, 9.0]);
        assert_eq!(ring.len(), 3);
        assert_eq!(ring.read_last(3), vec![7.0, 8.0, 9.0]);
    }

    #[test]
    #[should_panic(expected = "capacity must be > 0")]
    fn audio_ring_buffer_zero_capacity_panics() {
        let _ = AudioRingBuffer::new(0);
    }

    #[test]
    fn audio_ring_buffer_read_last_zero_count_returns_empty() {
        let mut ring = AudioRingBuffer::new(8);
        ring.push_slice(&[1.0, 2.0]);
        assert_eq!(ring.read_last(0), Vec::<f32>::new());
    }

    #[test]
    fn audio_ring_buffer_push_single_sample_at_a_time() {
        let mut ring = AudioRingBuffer::new(3);
        for i in 1..=5u8 {
            ring.push_slice(&[f32::from(i)]);
        }
        assert_eq!(ring.len(), 3);
        assert_eq!(ring.read_last(3), vec![3.0, 4.0, 5.0]);
    }

    // ---- TranscriptSegment / CaptureConfig ----

    #[test]
    fn transcript_segment_fields_are_accessible() {
        let seg = TranscriptSegment {
            text: "hello".to_string(),
            start_ms: 0,
            end_ms: 5000,
            speaker: Some("A".to_string()),
        };
        assert_eq!(seg.text, "hello");
        assert_eq!(seg.start_ms, 0);
        assert_eq!(seg.end_ms, 5000);
        assert_eq!(seg.speaker.as_deref(), Some("A"));
    }

    #[test]
    fn transcript_segment_clone_is_independent() {
        let original = TranscriptSegment {
            text: "world".to_string(),
            start_ms: 1000,
            end_ms: 6000,
            speaker: None,
        };
        let copy = original.clone();
        assert_eq!(copy.text, original.text);
        assert_eq!(copy.start_ms, original.start_ms);
    }

    #[test]
    fn capture_config_default_has_sensible_values() {
        let cfg = CaptureConfig::default();
        assert!(cfg.audio);
        assert!(cfg.transcribe);
        assert!(!cfg.screen);
        assert_eq!(cfg.buffer_seconds, 60);
    }

    #[test]
    fn capture_config_clone_is_independent() {
        let mut cfg = CaptureConfig::default();
        let snapshot = cfg.clone();
        cfg.audio = false;
        assert!(!cfg.audio, "mutation must affect original");
        assert!(snapshot.audio, "clone must remain unaffected");
    }

    /// A config that records nothing, so test sessions idle without
    /// touching audio or screen devices.
    fn idle_config(buffer_seconds: u32) -> CaptureConfig {
        CaptureConfig {
            audio: false,
            transcribe: false,
            screen: false,
            buffer_seconds,
            ..CaptureConfig::default()
        }
    }

    // ---- CaptureSession lifecycle ----

    #[test]
    fn capture_session_starts_with_unique_session_ids() {
        let first = CaptureSession::start(idle_config(1));
        let second = CaptureSession::start(idle_config(1));
        assert_ne!(first.session_id, second.session_id);
    }

    #[test]
    fn capture_session_is_running_after_start() {
        let session = CaptureSession::start(idle_config(1));
        assert!(session.is_running());
    }

    #[test]
    fn capture_session_stop_halts_background_thread() {
        let mut session = CaptureSession::start(idle_config(1));
        session.stop();
        assert!(!session.is_running());
    }

    #[test]
    fn capture_session_drop_stops_thread_without_panic() {
        // Dropping without an explicit stop() must join cleanly.
        {
            let _session = CaptureSession::start(idle_config(1));
        }
    }

    #[test]
    fn capture_session_stop_is_idempotent() {
        let mut session = CaptureSession::start(idle_config(1));
        session.stop();
        session.stop();
    }

    #[test]
    fn capture_session_audio_buffer_seconds_initially_zero() {
        let session = CaptureSession::start(idle_config(60));
        assert!(session.audio_buffer_seconds() < 1.0);
    }

    #[test]
    fn capture_session_transcript_segment_count_initially_zero() {
        let session = CaptureSession::start(idle_config(1));
        assert_eq!(session.transcript_segment_count(), 0);
    }

    #[test]
    fn capture_session_read_transcription_empty_returns_empty_vec() {
        let session = CaptureSession::start(idle_config(1));
        assert!(session.read_transcription(30).is_empty());
    }

    #[test]
    fn capture_session_read_audio_samples_empty_returns_empty() {
        let session = CaptureSession::start(idle_config(1));
        assert!(session.read_audio_samples(1.0).is_empty());
    }

    #[test]
    fn capture_session_latest_frame_initially_none() {
        let session = CaptureSession::start(idle_config(1));
        assert!(session.latest_frame().is_none());
    }

    #[test]
    fn capture_session_duration_ms_advances() {
        let session = CaptureSession::start(idle_config(1));
        let before = session.duration_ms();
        std::thread::sleep(Duration::from_millis(5));
        let after = session.duration_ms();
        assert!(after >= before);
    }

    #[test]
    fn capture_session_config_is_accessible() {
        let session = CaptureSession::start(CaptureConfig {
            audio: false,
            buffer_seconds: 30,
            ..CaptureConfig::default()
        });
        assert_eq!(session.config().buffer_seconds, 30);
    }

    #[test]
    fn read_transcription_filters_by_time_window() {
        let session = CaptureSession::start(idle_config(1));
        {
            let mut guard = session.state.lock().unwrap();
            guard.transcript_segments.push(TranscriptSegment {
                text: "old".to_string(),
                start_ms: 0,
                end_ms: 1_000,
                speaker: None,
            });
            guard.transcript_segments.push(TranscriptSegment {
                text: "recent".to_string(),
                start_ms: 90_000,
                end_ms: 95_000,
                speaker: None,
            });
        }
        // A generous window keeps both segments.
        let all = session.read_transcription(200);
        assert_eq!(all.len(), 2);
        // A tight window must not panic; the cutoff saturates to zero for
        // very young sessions, so its result is not asserted here.
        let _ = session.read_transcription(1);
    }

    // ---- Screen diff ----

    #[test]
    fn capture_config_default_screen_diff_threshold_is_five_percent() {
        let cfg = CaptureConfig::default();
        assert!((cfg.screen_diff_threshold - 0.05).abs() < f32::EPSILON);
    }

    #[test]
    fn capture_config_screen_diff_threshold_zero_stored() {
        let cfg = CaptureConfig {
            screen_diff_threshold: 0.0,
            ..CaptureConfig::default()
        };
        assert_eq!(cfg.screen_diff_threshold, 0.0);
    }

    #[test]
    fn capture_config_screen_diff_threshold_one_stored() {
        let cfg = CaptureConfig {
            screen_diff_threshold: 1.0,
            ..CaptureConfig::default()
        };
        assert_eq!(cfg.screen_diff_threshold, 1.0);
    }

    #[test]
    fn capture_config_screen_diff_threshold_propagates_to_session() {
        let session = CaptureSession::start(CaptureConfig {
            screen_diff_threshold: 0.10,
            ..idle_config(1)
        });
        assert!((session.config().screen_diff_threshold - 0.10).abs() < f32::EPSILON);
    }

    #[test]
    fn capture_session_frames_captured_initially_zero() {
        let session = CaptureSession::start(idle_config(1));
        assert_eq!(session.frames_captured(), 0);
    }

    #[test]
    fn capture_session_frames_skipped_initially_zero() {
        let session = CaptureSession::start(idle_config(1));
        assert_eq!(session.frames_skipped(), 0);
    }

    #[test]
    fn diff_first_frame_always_significant() {
        let threshold = 0.05_f32;
        let pixels = vec![100u8; 16 * 16 * 4];
        let fp = screen_diff::ScreenFingerprint::from_raw_pixels(&pixels, 16, 16, 4);
        let prev: Option<screen_diff::ScreenFingerprint> = None;
        // Mirrors the decision made in capture_screen_snapshot.
        let should_store = match prev.as_ref() {
            None => true,
            Some(p) => screen_diff::ScreenDiff::compare(p, &fp).is_significant(threshold),
        };
        assert!(should_store, "first frame must always be stored");
    }

    #[test]
    fn diff_identical_consecutive_frame_is_not_significant_at_default_threshold() {
        let pixels = vec![128u8; 32 * 32 * 4];
        let fp = screen_diff::ScreenFingerprint::from_raw_pixels(&pixels, 32, 32, 4);
        let diff = screen_diff::ScreenDiff::compare(&fp, &fp);
        assert_eq!(diff.score, 0.0);
        assert!(!diff.is_significant(0.05));
    }

    #[test]
    fn diff_fully_changed_frame_is_significant_at_default_threshold() {
        let black = vec![0u8; 32 * 32 * 4];
        let white = vec![255u8; 32 * 32 * 4];
        let fp_prev = screen_diff::ScreenFingerprint::from_raw_pixels(&black, 32, 32, 4);
        let fp_next = screen_diff::ScreenFingerprint::from_raw_pixels(&white, 32, 32, 4);
        let diff = screen_diff::ScreenDiff::compare(&fp_prev, &fp_next);
        assert_eq!(diff.score, 1.0);
        assert!(diff.is_significant(0.05));
    }

    #[test]
    fn diff_minor_change_below_5pct_threshold_is_not_significant() {
        let mut pixels_a = vec![100u8; 16 * 16 * 4];
        let mut pixels_b = vec![100u8; 16 * 16 * 4];
        // Blacken a single pixel in b and nudge one channel in a.
        pixels_b[0] = 0;
        pixels_b[1] = 0;
        pixels_b[2] = 0;
        pixels_a[63 * 4] = 99;
        let fp_a = screen_diff::ScreenFingerprint::from_raw_pixels(&pixels_a, 16, 16, 4);
        let fp_b = screen_diff::ScreenFingerprint::from_raw_pixels(&pixels_b, 16, 16, 4);
        let diff = screen_diff::ScreenDiff::compare(&fp_a, &fp_b);
        assert!(diff.score < 0.05, "score was {}", diff.score);
        assert!(!diff.is_significant(0.05));
    }

    #[test]
    fn diff_zero_threshold_always_stores_any_nonzero_score() {
        assert!(screen_diff::ScreenDiff { score: 0.0 }.is_significant(0.0));
        assert!(screen_diff::ScreenDiff { score: 0.001 }.is_significant(0.0));
    }

    #[test]
    fn diff_hash_only_fallback_returns_one_when_bytes_differ() {
        let fp1 = screen_diff::ScreenFingerprint::from_png_bytes(b"bytes_a");
        let fp2 = screen_diff::ScreenFingerprint::from_png_bytes(b"bytes_b");
        assert_eq!(screen_diff::ScreenDiff::compare(&fp1, &fp2).score, 1.0);
    }
}