use image::{ImageBuffer, Rgb};
use ndarray::{concatenate, s, Array, Array2, Axis};
use std::collections::HashSet;
/// Thresholds used when classifying spectrogram columns.
///
/// The fields are consumed by `vad_boundaries` and `VoiceActivityDetector`:
///
/// * `min_energy` - gradient magnitudes are compared against this value with `>=`.
/// * `min_y` - threshold on the number of qualifying rows found in a single column.
/// * `min_x` - number of buffered frames the streaming detector waits for, and the
///   length of the window it analyses.
/// * `min_mel` - rows of the gradient map with an index below this value are ignored.
///
/// `Debug` and `PartialEq` are derived so the settings can be logged with
/// `dbg!`/`{:?}` and compared in assertions.
#[derive(Copy, Clone, Debug, Default, PartialEq)]
pub struct DetectionSettings {
    pub min_energy: f64,
    pub min_y: usize,
    pub min_x: usize,
    pub min_mel: usize,
}

impl DetectionSettings {
    /// Builds a settings value from its four thresholds.
    pub fn new(min_energy: f64, min_y: usize, min_x: usize, min_mel: usize) -> Self {
        Self {
            min_energy,
            min_y,
            min_x,
            min_mel,
        }
    }

    /// Returns the gradient-magnitude threshold.
    pub fn min_energy(&self) -> f64 {
        self.min_energy
    }

    /// Returns the per-column row-count threshold.
    pub fn min_y(&self) -> usize {
        self.min_y
    }

    /// Returns the window length, in frames.
    pub fn min_x(&self) -> usize {
        self.min_x
    }

    /// Returns the lowest gradient-map row index that is taken into account.
    pub fn min_mel(&self) -> usize {
        self.min_mel
    }
}
/// Streaming detector that buffers incoming frames and runs `vad_boundaries`
/// over the most recent `settings.min_x` of them.
pub struct VoiceActivityDetector {
    /// Buffered frames, oldest first.
    mel_buffer: Vec<Array2<f64>>,
    /// Thresholds copied from the caller at construction time.
    settings: DetectionSettings,
    /// Number of frames currently held in `mel_buffer`.
    idx: usize,
}

impl VoiceActivityDetector {
    /// Buffer length at which old frames are discarded.
    const MAX_BUFFERED_FRAMES: usize = 128;

    /// Creates a detector with an empty buffer and a copy of `settings`.
    pub fn new(settings: &DetectionSettings) -> Self {
        Self {
            mel_buffer: Vec::new(),
            // `DetectionSettings` is `Copy`; a plain dereference is enough.
            settings: *settings,
            idx: 0,
        }
    }

    /// Appends `frame` to the buffer and analyses the latest window.
    ///
    /// Returns `None` while fewer than `min_x` frames are buffered. Otherwise
    /// returns `Some(true)` when column 0 of the window's gradient map is an
    /// intersected column, and `Some(false)` in every other case, including
    /// when no column is intersected at all.
    pub fn add(&mut self, frame: &Array2<f64>) -> Option<bool> {
        let min_x = self.settings.min_x;

        if self.idx == Self::MAX_BUFFERED_FRAMES {
            // Keep only the newest `min_x` frames. `saturating_sub` avoids an
            // underflow when `min_x` is larger than the buffer, and `drain`
            // drops the stale frames without cloning the ones that are kept.
            let stale = self.mel_buffer.len().saturating_sub(min_x);
            self.mel_buffer.drain(..stale);
            self.idx = self.mel_buffer.len();
        }

        self.mel_buffer.push(frame.to_owned());
        self.idx += 1;
        if self.idx < min_x {
            return None;
        }

        let window = &self.mel_buffer[self.idx - min_x..];
        let edge_info = vad_boundaries(window, &self.settings);

        // `first()` is `None` for an empty list, so this never indexes out of
        // bounds (the previous `is_empty()` guard was inverted and panicked).
        Some(edge_info.intersected().first() == Some(&0))
    }
}
/// Reports whether `edge_info` contains a run of at least `n` consecutive
/// intersected column indices.
///
/// Returns `false` when there are no intersected columns. A single column
/// counts as a run of length one, so any non-empty list satisfies `n <= 1`.
fn vad_on(edge_info: &EdgeInfo, n: usize) -> bool {
    let (first, rest) = match edge_info.intersected_columns.split_first() {
        Some((&first, rest)) => (first, rest),
        None => return false,
    };

    // The first column already forms a run of one; check it before looking at
    // the remaining columns so single-element input is handled consistently.
    let mut run_length: usize = 1;
    if run_length >= n {
        return true;
    }

    let mut previous = first;
    for &column in rest {
        run_length = if previous.checked_add(1) == Some(column) {
            run_length + 1
        } else {
            1
        };
        if run_length >= n {
            return true;
        }
        previous = column;
    }
    false
}
/// Classifies the columns of a Sobel gradient map computed over `frames`.
///
/// The frames are concatenated along axis 1 and a 3x3 Sobel operator is
/// applied at every position where the kernel fits, giving a gradient map of
/// size `(height - 2, width - 2)`. For each column of that map, the rows with
/// index `>= settings.min_mel` whose gradient magnitude is
/// `>= settings.min_energy` are counted. A column with at least
/// `settings.min_y` such rows is reported as intersected; every other column
/// is reported as non-intersected.
///
/// Empty input yields an empty `EdgeInfo`, and input narrower or shorter than
/// the kernel yields a gradient map with zero columns or rows instead of an
/// arithmetic underflow.
///
/// The `gradient_positions` set of the result is always empty; this function
/// does not record individual positions.
///
/// # Panics
///
/// Panics if the frames cannot be concatenated along axis 1.
fn vad_boundaries(frames: &[Array2<f64>], settings: &DetectionSettings) -> EdgeInfo {
    const SOBEL_X: [[f64; 3]; 3] = [[-1., 0., 1.], [-2., 0., 2.], [-1., 0., 1.]];
    const SOBEL_Y: [[f64; 3]; 3] = [[-1., -2., -1.], [0., 0., 0.], [1., 2., 1.]];

    let min_energy = settings.min_energy;
    let min_y = settings.min_y;
    let min_mel = settings.min_mel;

    if frames.is_empty() {
        return EdgeInfo::new(Vec::new(), Vec::new(), HashSet::new());
    }

    let array_views: Vec<_> = frames.iter().map(|a| a.view()).collect();
    let merged_frames = concatenate(Axis(1), &array_views).unwrap();
    let shape = merged_frames.raw_dim();
    let width = shape[1];
    let height = shape[0];

    // The kernel needs a 3x3 neighbourhood; saturate so tiny inputs produce an
    // empty gradient map rather than wrapping around.
    let grad_height = height.saturating_sub(2);
    let grad_width = width.saturating_sub(2);

    let gradient_mag = Array::from_shape_fn((grad_height, grad_width), |(y, x)| {
        let view = merged_frames.slice(s![y..y + 3, x..x + 3]);
        let mut gradient_x = 0.0;
        let mut gradient_y = 0.0;
        for j in 0..3 {
            for i in 0..3 {
                gradient_x += view[[j, i]] * SOBEL_X[j][i];
                gradient_y += view[[j, i]] * SOBEL_Y[j][i];
            }
        }
        (gradient_x * gradient_x + gradient_y * gradient_y).sqrt()
    });

    let mut intersected_columns: Vec<usize> = Vec::new();
    let mut non_intersected_columns: Vec<usize> = Vec::new();
    let gradient_positions = HashSet::new();

    for x in 0..grad_width {
        // Starting the range at `min_mel` is the same as filtering `y >= min_mel`.
        let num_intersections = (min_mel..grad_height)
            .filter(|&y| gradient_mag[(y, x)] >= min_energy)
            .count();

        // `min_y` is a minimum: a column with exactly `min_y` hits qualifies.
        if num_intersections >= min_y {
            intersected_columns.push(x);
        } else {
            non_intersected_columns.push(x);
        }
    }

    EdgeInfo::new(
        non_intersected_columns,
        intersected_columns,
        gradient_positions,
    )
}
/// Result of a column classification pass: two lists of column indices and a
/// set of `(usize, usize)` positions.
#[derive(Debug)]
pub struct EdgeInfo {
    non_intersected_columns: Vec<usize>,
    intersected_columns: Vec<usize>,
    gradient_positions: HashSet<(usize, usize)>,
}

impl EdgeInfo {
    /// Bundles the three collections into an `EdgeInfo`, taking ownership of each.
    pub fn new(
        non_intersected_columns: Vec<usize>,
        intersected_columns: Vec<usize>,
        gradient_positions: HashSet<(usize, usize)>,
    ) -> Self {
        Self {
            non_intersected_columns,
            intersected_columns,
            gradient_positions,
        }
    }

    /// Returns an owned copy of the non-intersected column indices.
    pub fn non_intersected(&self) -> Vec<usize> {
        self.non_intersected_columns.to_vec()
    }

    /// Returns an owned copy of the intersected column indices.
    pub fn intersected(&self) -> Vec<usize> {
        self.intersected_columns.to_vec()
    }

    /// Returns an owned copy of the stored positions.
    pub fn gradient_positions(&self) -> HashSet<(usize, usize)> {
        self.gradient_positions.iter().copied().collect()
    }
}
/// Renders `frames` as an RGB image with column and position overlays.
///
/// The frames are concatenated along axis 1 and scaled so the largest value
/// maps to 255. Array rows are written in reverse order, so the last array row
/// becomes image row 0.
///
/// * Columns listed in `non_intersected_columns` are tinted green: the first
///   ten image rows are painted pure green and the remaining rows get 60 added
///   to the green channel (saturating). Indices outside the image are ignored.
/// * Pixels whose `(x, height - (y + 3))` coordinate (saturating at 0) is in
///   `gradient_positions` get 200 added to the red channel (saturating).
///
/// # Panics
///
/// Panics if the frames cannot be concatenated along axis 1.
pub fn as_image(
    frames: &[Array2<f64>],
    non_intersected_columns: &Vec<usize>,
    gradient_positions: &HashSet<(usize, usize)>,
) -> ImageBuffer<Rgb<u8>, Vec<u8>> {
    let array_views: Vec<_> = frames.iter().map(|a| a.view()).collect();
    let array_view = concatenate(Axis(1), &array_views).unwrap();
    let shape = array_view.raw_dim();
    let width = shape[1];
    let height = shape[0];
    let mut img_buffer = ImageBuffer::new(width as u32, height as u32);

    let max_val = array_view.fold(0.0, |acc: f64, &val| acc.max(val));
    let scaled_image: Array2<u8> = array_view.mapv(|val| (val * (255.0 / max_val)) as u8);

    // Build the column lookup once; scanning the list with `contains` for
    // every pixel made the render cost grow with width * height * list length.
    let mut is_non_intersected = vec![false; width];
    for &column in non_intersected_columns {
        if let Some(flag) = is_non_intersected.get_mut(column) {
            *flag = true;
        }
    }

    let tint_value: u8 = 200;
    let green_tint_value: u8 = 60;

    for (y, row) in scaled_image.outer_iter().rev().enumerate() {
        // Depends only on the row, so compute it outside the pixel loop.
        let inverted_y = height.saturating_sub(y + 3);

        for (x, &val) in row.into_iter().enumerate() {
            let mut rgb_pixel = if is_non_intersected[x] {
                if y < 10 {
                    Rgb([0, 255, 0])
                } else {
                    Rgb([val, val.saturating_add(green_tint_value), val])
                }
            } else {
                Rgb([val, val, val])
            };

            if gradient_positions.contains(&(x, inverted_y)) {
                rgb_pixel = Rgb([
                    rgb_pixel[0].saturating_add(tint_value),
                    rgb_pixel[1],
                    rgb_pixel[2],
                ]);
            }

            img_buffer.put_pixel(x as u32, y as u32, rgb_pixel);
        }
    }
    img_buffer
}
/// Returns the number of frames needed to cover `duration_ms` milliseconds,
/// rounded up, where one frame lasts `hop_size / sampling_rate` seconds.
///
/// The arithmetic is done in `f64`, matching `duration_ms_for_n_frames`, so
/// durations above 2^24 ms keep their precision and the result is not capped
/// at `u32::MAX`.
///
/// A `hop_size` of 0 makes the frame duration zero; the division then yields
/// infinity or NaN, which the final cast maps to `usize::MAX` or 0.
pub fn n_frames_for_duration(hop_size: usize, sampling_rate: f64, duration_ms: usize) -> usize {
    let frame_duration_ms = hop_size as f64 / sampling_rate * 1000.0;
    (duration_ms as f64 / frame_duration_ms).ceil() as usize
}
/// Returns the duration in milliseconds covered by `total_frames` frames,
/// where one frame lasts `hop_size / sampling_rate` seconds.
///
/// The fractional part of the result is discarded by the final cast.
pub fn duration_ms_for_n_frames(hop_size: usize, sampling_rate: f64, total_frames: usize) -> usize {
    let seconds_per_frame = hop_size as f64 / sampling_rate;
    let ms_per_frame = seconds_per_frame * 1000.0;
    let elapsed_ms = total_frames as f64 * ms_per_frame;
    elapsed_ms as usize
}
/// Formats a millisecond count as `HH:MM:SS.mmm`.
///
/// Hours are zero-padded to at least two digits and are not wrapped, so values
/// of 100 hours or more simply use more digits.
pub fn format_milliseconds(milliseconds: u64) -> String {
    const MS_PER_SECOND: u64 = 1_000;
    const MS_PER_MINUTE: u64 = 60 * MS_PER_SECOND;
    const MS_PER_HOUR: u64 = 60 * MS_PER_MINUTE;

    let hours = milliseconds / MS_PER_HOUR;
    let minutes = (milliseconds % MS_PER_HOUR) / MS_PER_MINUTE;
    let seconds = (milliseconds % MS_PER_MINUTE) / MS_PER_SECOND;
    let ms = milliseconds % MS_PER_SECOND;

    format!("{:02}:{:02}:{:02}.{:03}", hours, minutes, seconds, ms)
}
// Fixture-driven tests. They read `.tga` files through `crate::quant` using
// paths relative to the working directory (`../testdata/...`) and write PNG
// renderings under `../testdata` and `../doc`.
#[cfg(test)]
mod tests {
use super::*;
use crate::quant::{load_tga_8bit, to_array2};
// Checks that `vad_on` is false for every fixture under `blank/` and true for
// every fixture under `speech/`, and saves a rendering of each.
#[test]
fn test_speech_detection() {
let n_mels = 80;
let min_x = 5;
let settings = DetectionSettings {
min_energy: 1.0,
min_y: 10,
min_x,
min_mel: 0,
};
// Fixtures expected to produce no detection.
let ids = vec![21168, 23760, 41492, 41902, 63655, 7497, 39744];
for id in ids {
let file_path = format!("../testdata/blank/frame_{}.tga", id);
let dequantized_mel = load_tga_8bit(&file_path).unwrap();
let frames = to_array2(&dequantized_mel, n_mels);
let edge_info = vad_boundaries(&[frames.clone()], &settings);
let img = as_image(
&[frames.clone()],
&edge_info.non_intersected(),
&edge_info.gradient_positions(),
);
assert!(vad_on(&edge_info, min_x) == false);
let path = format!("../testdata/vad_off_{}.png", id);
img.save(path).unwrap();
}
// Fixtures expected to produce a detection.
let ids = vec![11648, 2889, 4694, 4901, 27125];
for id in ids {
let file_path = format!("../testdata/speech/frame_{}.tga", id);
let dequantized_mel = load_tga_8bit(&file_path).unwrap();
let frames = to_array2(&dequantized_mel, n_mels);
let edge_info = vad_boundaries(&[frames.clone()], &settings);
let img = as_image(
&[frames.clone()],
&edge_info.non_intersected(),
&edge_info.gradient_positions(),
);
assert!(vad_on(&edge_info, min_x) == true);
let path = format!("../testdata/vad_on_{}.png", id);
img.save(path).unwrap();
}
}
// Times `vad_boundaries` on one fixture and writes a rendering to
// `../doc/debug.png`. Contains no assertions.
// NOTE(review): this function has no `#[test]` attribute, so the test harness
// never runs it - confirm whether that is intentional.
fn test_vad_debug() {
let n_mels = 80;
let settings = DetectionSettings {
min_energy: 1.0,
min_y: 6,
min_x: 1,
min_mel: 0,
};
let start = std::time::Instant::now();
let file_path = "../testdata/jfk_full_speech_chunk0_golden.tga";
let dequantized_mel = load_tga_8bit(file_path).unwrap();
let frames = to_array2(&dequantized_mel, n_mels);
let edge_info = vad_boundaries(&[frames.clone()], &settings);
let elapsed = start.elapsed().as_millis();
dbg!(elapsed);
let img = as_image(
&[frames.clone()],
&edge_info.non_intersected(),
&edge_info.gradient_positions(),
);
img.save("../doc/debug.png").unwrap();
}
// Runs `vad_boundaries` on the golden fixture, prints the elapsed time and
// writes a rendering to `../doc/vad.png`. Contains no assertions; it only
// fails if loading, analysis or saving panics.
#[test]
fn test_vad_boundaries() {
let n_mels = 80;
let settings = DetectionSettings {
min_energy: 1.0,
min_y: 3,
min_x: 6,
min_mel: 0,
};
let start = std::time::Instant::now();
let file_path = "../testdata/quantized_mel_golden.tga";
let dequantized_mel = load_tga_8bit(file_path).unwrap();
dbg!(&dequantized_mel);
let frames = to_array2(&dequantized_mel, n_mels);
let edge_info = vad_boundaries(&[frames.clone()], &settings);
let elapsed = start.elapsed().as_millis();
dbg!(elapsed);
let img = as_image(
&[frames.clone()],
&edge_info.non_intersected(),
&edge_info.gradient_positions(),
);
img.save("../doc/vad.png").unwrap();
}
// Feeds the golden fixture to `VoiceActivityDetector::add` one column at a
// time and prints the elapsed time. The results of `add` are discarded.
// NOTE(review): this function has no `#[test]` attribute, so the test harness
// never runs it - confirm whether that is intentional.
fn test_stage() {
let n_mels = 80;
let settings = DetectionSettings {
min_energy: 1.0,
min_y: 3,
min_x: 3,
min_mel: 0,
};
let mut stage = VoiceActivityDetector::new(&settings);
let file_path = "../testdata/quantized_mel_golden.tga";
let dequantized_mel = load_tga_8bit(file_path).unwrap();
let frames = to_array2(&dequantized_mel, n_mels);
// Split along axis 1 into chunks of `chunk_size` columns.
let chunk_size = 1;
let chunks: Vec<Array2<f64>> = frames
.axis_chunks_iter(Axis(1), chunk_size)
.map(|chunk| chunk.to_owned())
.collect();
let start = std::time::Instant::now();
for mel in &chunks {
if let Some(_) = stage.add(&mel) {}
}
let elapsed = start.elapsed().as_millis();
dbg!(elapsed);
}
}