Skip to main content

dominant_speaker/
lib.rs

//! Pure-Rust dominant speaker identification for WebRTC applications.
//!
//! This crate implements the three-time-scale subband comparison algorithm
//! described in Volfin & Cohen, "Dominant Speaker Identification for
//! Multipoint Videoconferencing", IEEE 2012. The implementation follows
//! mediasoup's C++ `ActiveSpeakerObserver` for constants and Jitsi's Java
//! `DominantSpeakerIdentification` for the overall structure.
//!
//! Feed it RFC 6464 audio-level observations and it tells you who is talking.
//! No FFI, no WebRTC stack dependency, no unsafe code.
//!
//! # Quick start
//!
//! Timestamps are caller-supplied `u64` milliseconds — use any epoch you like
//! (e.g. capture `let start = Instant::now();` once and pass
//! `start.elapsed().as_millis() as u64` in std environments,
//! or `performance.now() as u64` in a WASM AudioWorklet).
//!
//! ```rust
//! use dominant_speaker::ActiveSpeakerDetector;
//!
//! let mut detector = ActiveSpeakerDetector::new();
//!
//! // Register two participants (timestamp = 0 ms).
//! detector.add_peer(1u64, 0);
//! detector.add_peer(2u64, 0);
//!
//! // Feed audio levels (0 = loud, 127 = silent, per RFC 6464).
//! // Simulate peer 1 speaking for 2 seconds at 20 ms cadence.
//! let mut t_ms: u64 = 0;
//! while t_ms < 2000 {
//!     detector.record_level(1, 5, t_ms);   // peer 1: active
//!     detector.record_level(2, 127, t_ms); // peer 2: silent
//!     t_ms += 20;
//! }
//!
//! // Call tick() on a 300 ms timer — returns Some(SpeakerChange) only on change.
//! if let Some(change) = detector.tick(300) {
//!     println!("Dominant speaker: peer {}", change.peer_id);
//! }
//! ```
//!
//! See the [README](https://github.com/anatolykoptev/rust-dominant-speaker)
//! for algorithm details, constants reference, and prior art.
44
45#![no_std]
46#![forbid(unsafe_code)]
47
48extern crate alloc;
49
50#[cfg(test)]
51extern crate std;
52
53mod detector;
54mod numerics;
55mod speaker;
56
57pub use detector::ActiveSpeakerDetector;
58
/// Emitted by [`ActiveSpeakerDetector::tick`] when the dominant speaker changes.
///
/// `PeerId` is the caller's participant identifier type; it defaults to `u64`.
#[derive(Debug, Clone, PartialEq)]
pub struct SpeakerChange<PeerId = u64> {
    /// The new dominant speaker's identifier.
    pub peer_id: PeerId,
    /// Medium-window log-ratio margin above the C2 threshold at election time.
    ///
    /// `0.0` for single-peer rooms and bootstrap elections (no challenger).
    /// Positive values indicate how confidently the challenger beat the incumbent —
    /// higher means more confident. Useful for UI animations or debouncing.
    pub c2_margin: f64,
}
71
72/// Convenience alias using `u64` peer IDs — backward-compatible with v0.1.x.
73pub type DefaultDetector = ActiveSpeakerDetector<u64>;
74
/// Tunable constants for the dominant-speaker election.
///
/// Defaults match mediasoup's production constants exactly.
///
/// Implements `PartialEq`, so two configurations can be compared directly.
///
/// # Example
///
/// ```rust
/// use dominant_speaker::{ActiveSpeakerDetector, DetectorConfig};
/// use core::time::Duration;
///
/// // Use defaults (mediasoup-identical behaviour).
/// let default_detector: ActiveSpeakerDetector<u64> = ActiveSpeakerDetector::new();
///
/// // Raise C1/C2 for a low-bitrate / mobile deployment: fewer speaker switches.
/// let config = DetectorConfig {
///     c1: 5.0,
///     c2: 4.0,
///     tick_interval: Duration::from_millis(500),
///     ..DetectorConfig::default()
/// };
/// let tuned_detector: ActiveSpeakerDetector<u64> = ActiveSpeakerDetector::with_config(config);
/// ```
///
/// # Serde
///
/// Enable the `serde` feature to serialize/deserialize this struct.
/// `tick_interval` is serialized as milliseconds (`u64`).
///
/// ```rust,ignore
/// // Requires `dominant_speaker` with `serde` feature and `serde_json` dev-dep.
/// use dominant_speaker::DetectorConfig;
/// let config = DetectorConfig::default();
/// let json = serde_json::to_string(&config).unwrap();
/// let back: DetectorConfig = serde_json::from_str(&json).unwrap();
/// assert!((back.c1 - config.c1).abs() < f64::EPSILON);
/// ```
#[derive(Debug, Clone, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct DetectorConfig {
    /// Immediate-window log-ratio threshold (mediasoup: C1).
    pub c1: f64,
    /// Medium-window log-ratio threshold (mediasoup: C2).
    pub c2: f64,
    /// Long-window log-ratio threshold; zero = long window disabled (mediasoup: C3).
    pub c3: f64,
    /// Evaluation cadence. Recommend 300 ms.
    #[cfg_attr(feature = "serde", serde(with = "duration_ms"))]
    pub tick_interval: core::time::Duration,
    /// Immediate-window subband count (mediasoup: N1).
    ///
    /// The subband width is derived automatically via `ceil(128 / n1)`.
    /// The default of 13 gives a subband width of 10, matching mediasoup.
    pub n1: u8,
    /// Medium-window subband count (mediasoup: N2).
    pub n2: u8,
    /// Long-window subband count (mediasoup: N3).
    pub n3: u8,
}
133
/// Serde adapter that stores a [`core::time::Duration`] as whole milliseconds (`u64`).
///
/// Used through `#[serde(with = "duration_ms")]` on `DetectorConfig::tick_interval`.
/// Both directions use `u64`, so the wire type is the same when writing and reading.
#[cfg(feature = "serde")]
mod duration_ms {
    use core::time::Duration;
    use serde::{ser::Error as _, Deserialize, Deserializer, Serializer};

    /// Serialize `d` as its millisecond count.
    ///
    /// # Errors
    ///
    /// Returns a serializer error if the millisecond count does not fit in `u64`,
    /// or if the underlying serializer fails.
    pub fn serialize<S: Serializer>(d: &Duration, s: S) -> Result<S::Ok, S::Error> {
        // `as_millis` yields `u128`; narrow it explicitly so the emitted type
        // matches what `deserialize` reads back.
        let ms = u64::try_from(d.as_millis())
            .map_err(|_| S::Error::custom("tick_interval in milliseconds does not fit in u64"))?;
        s.serialize_u64(ms)
    }

    /// Deserialize a `u64` millisecond count into a `Duration`.
    ///
    /// # Errors
    ///
    /// Propagates any error from reading the `u64`.
    pub fn deserialize<'de, D: Deserializer<'de>>(d: D) -> Result<Duration, D::Error> {
        let ms = u64::deserialize(d)?;
        Ok(Duration::from_millis(ms))
    }
}
148
149impl Default for DetectorConfig {
150    fn default() -> Self {
151        Self {
152            c1: C1,
153            c2: C2,
154            c3: C3,
155            tick_interval: TICK_INTERVAL,
156            n1: N1 as u8,
157            n2: N2 as u8,
158            n3: N3 as u8,
159        }
160    }
161}
162
// Algorithm constants — ported verbatim from mediasoup's ActiveSpeakerObserver.
// `pub(crate)` so sibling modules can share them without exposing to users.

/// Immediate time-scale log-ratio threshold (mediasoup: C1).
pub(crate) const C1: f64 = 3.0;
/// Medium time-scale log-ratio threshold (mediasoup: C2).
pub(crate) const C2: f64 = 2.0;
/// Long time-scale log-ratio threshold; zero = long window disabled (mediasoup: C3).
pub(crate) const C3: f64 = 0.0;
/// Immediate subband count (mediasoup: N1).
pub(crate) const N1: u32 = 13;
/// Medium subband count (mediasoup: N2).
pub(crate) const N2: u32 = 5;
/// Long subband count (mediasoup: N3).
pub(crate) const N3: u32 = 10;
/// Milliseconds before a stale level entry is replaced with silence (mediasoup: LevelIdleTimeout).
pub(crate) const LEVEL_IDLE_TIMEOUT_MS: u64 = 40;
/// Milliseconds before an idle non-dominant speaker is paused (mediasoup: SpeakerIdleTimeout).
pub(crate) const SPEAKER_IDLE_TIMEOUT_MS: u64 = 60 * 60 * 1000;
/// Long-window threshold used when computing `longs` from `mediums` (mediasoup: LongThreashold).
pub(crate) const LONG_THRESHOLD: u8 = 4;
/// Maximum RFC 6464 audio-level value (mediasoup: MaxLevel).
pub(crate) const MAX_LEVEL: u8 = 127;
/// Minimum RFC 6464 audio-level value (mediasoup: MinLevel).
pub(crate) const MIN_LEVEL: u8 = 0;
/// Window length for adaptive minimum-level estimation (mediasoup: MinLevelWindowLen = 15*1000/20).
pub(crate) const MIN_LEVEL_WINDOW_LEN: u32 = 750;
/// Threshold for medium-window immediate-to-medium downsampling (mediasoup: MediumThreshold).
pub(crate) const MEDIUM_THRESHOLD: u8 = 7;
/// Immediate-buffer length: covers 1 second at 20ms cadence × 5 subbands (mediasoup: ImmediateBuffLen).
pub(crate) const IMMEDIATE_BUFF_LEN: usize = 50;
/// Medium-buffer length (mediasoup: MediumsBuffLen).
pub(crate) const MEDIUMS_BUFF_LEN: usize = 10;
/// Long-buffer length (mediasoup: LongsBuffLen).
pub(crate) const LONGS_BUFF_LEN: usize = 1;
/// Levels ring-buffer length (mediasoup: LevelsBuffLen).
pub(crate) const LEVELS_BUFF_LEN: usize = 50;
/// Floor score; prevents log(0) in ratio computation (mediasoup: MinActivityScore).
pub(crate) const MIN_ACTIVITY_SCORE: f64 = 1.0e-10;

/// Recommended tick interval matching mediasoup's production tuning.
///
/// Call [`ActiveSpeakerDetector::tick`] at this cadence for best results.
pub const TICK_INTERVAL: core::time::Duration = core::time::Duration::from_millis(300);
207
/// Compute the subband width for a given N1 value.
///
/// Formula: `ceil(128 / n1)`. Mediasoup hard-codes 10 for N1=13 —
/// `ceil(128/13) = 10`. This function generalises it for custom configs.
///
/// An `n1` of zero is treated as one, so the function never divides by zero.
/// The result is always in `1..=128`.
pub(crate) fn subunit_len_for(n1: u8) -> u8 {
    let n1 = n1.max(1) as u16; // guard against zero
    // The quotient is at most 128 (when n1 == 1), so narrowing to `u8` is lossless.
    128u16.div_ceil(n1) as u8
}
216
/// Unit tests for `subunit_len_for`, including the zero guard and range extremes.
#[cfg(test)]
mod subunit_tests {
    use super::subunit_len_for;

    #[test]
    fn default_n1_gives_10() {
        assert_eq!(subunit_len_for(13), 10);
    }

    #[test]
    fn n1_10_gives_13() {
        // ceil(128/10) = 13
        assert_eq!(subunit_len_for(10), 13);
    }

    #[test]
    fn n1_8_gives_16() {
        // ceil(128/8) = 16
        assert_eq!(subunit_len_for(8), 16);
    }

    #[test]
    fn n1_zero_is_treated_as_one() {
        // The zero guard clamps n1 to 1, so the width is ceil(128/1) = 128.
        assert_eq!(subunit_len_for(0), 128);
        assert_eq!(subunit_len_for(0), subunit_len_for(1));
    }

    #[test]
    fn large_n1_gives_width_1() {
        // ceil(128/128) = 1 and ceil(128/255) = 1
        assert_eq!(subunit_len_for(128), 1);
        assert_eq!(subunit_len_for(u8::MAX), 1);
    }
}