polyvoice 0.7.0

Speaker diarization for Rust — who spoke when. ONNX-powered: Silero VAD, WeSpeaker embeddings, Pyannote segmentation, K-means/AHC clustering, overlap detection.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
//! HTTP download with streamed SHA-256 and optional Minisign verification.

use crate::models::verify::{SignatureError, verify_minisign};
use sha2::{Digest, Sha256};
use std::fs;
use std::io::{self, BufReader, Read, Write};
use std::path::{Path, PathBuf};

/// Errors from `download_with_checksum` and `verify_sha256`.
///
/// The signature-aware and size-capped download functions in this module
/// return the same type.
#[derive(Debug, thiserror::Error)]
pub enum DownloadError {
    /// An I/O operation failed. `path` is the file or directory that was being
    /// created, opened, read, written, or renamed. Read errors on the download
    /// stream are also reported here, against the `.partial` temp path.
    #[error("io error on {path}: {source}")]
    Io {
        path: PathBuf,
        #[source]
        source: io::Error,
    },
    /// The HTTP request for `url` failed.
    // NOTE(review): the `ureq::Error` is boxed, presumably to keep this enum
    // (and every `Result` carrying it) small — confirm.
    #[error("network error fetching {url}: {source}")]
    Network {
        url: String,
        #[source]
        source: Box<ureq::Error>,
    },
    /// The computed SHA-256 differs from the expected one. The fields hold the
    /// full digests; the rendered message shows only the first 16 characters
    /// of each.
    #[error("checksum mismatch for {path}: expected {expected:.16}…, computed {actual:.16}…")]
    ChecksumMismatch {
        path: PathBuf,
        expected: String,
        actual: String,
    },
    /// The Minisign public key or signature could not be parsed, or the
    /// signature did not verify for the file at `path`.
    #[error("signature invalid for {path}: {source}")]
    SignatureInvalid {
        path: PathBuf,
        #[source]
        source: SignatureError,
    },
    /// A download was required but `url` does not start with `https://`.
    #[error("refusing to fetch model over a non-https URL: {url}")]
    InsecureScheme { url: String },
    /// The response body was larger than `max_bytes`; the download was aborted.
    /// `path` is the `.partial` temp file that was being written.
    #[error("download for {path} exceeded the {max_bytes}-byte cap")]
    TooLarge { path: PathBuf, max_bytes: u64 },
}

impl From<SignatureError> for DownloadError {
    fn from(source: SignatureError) -> Self {
        DownloadError::SignatureInvalid {
            path: PathBuf::from("(unknown)"),
            source,
        }
    }
}

/// { !url.is_empty() && expected_sha256.len() == 64 }
/// `pub fn download_with_checksum( url: &str, expected_sha256: &str, dest: &Path, ) -> Result<bool, DownloadError>`
/// { ret.as_ref().map_or(true, |&downloaded| if downloaded { dest.exists() } else { true }) }
/// Fetch `url` into `dest`, accepting the file only if its SHA-256 equals
/// `expected_sha256`.
///
/// This is the signature-less convenience form kept for backwards
/// compatibility: it forwards to [`download_with_checksum_and_signature`] with
/// `signature: None`, so the same rules apply:
///
/// * If `dest` already exists with the right hash, nothing is fetched and
///   `Ok(false)` is returned.
/// * Otherwise the body is streamed to disk and hashed chunk by chunk, so
///   large (200+ MB) models are never held in RAM.
/// * On a hash mismatch the partial file is deleted and an error is returned.
///
/// # Returns
/// `Ok(true)` when a download happened, `Ok(false)` when the cached file was
/// reused.
///
/// # Errors
/// Any [`DownloadError`] produced by the underlying download.
pub fn download_with_checksum(
    url: &str,
    expected_sha256: &str,
    dest: &Path,
) -> Result<bool, DownloadError> {
    let no_signature: Option<&str> = None;
    download_with_checksum_and_signature(url, expected_sha256, no_signature, dest)
}

/// { !url.is_empty() && expected_sha256.len() == 64 }
/// `pub fn download_with_checksum_and_signature( url: &str, expected_sha256: &str, signature: Option<&str>, dest: &Path, ) -> Result<bool, DownloadError>`
/// { ret.as_ref().map_or(true, |&downloaded| if downloaded { dest.exists() } else { true }) }
/// Fetch `url` into `dest`, check its SHA-256, and — when a signature is
/// supplied — check its Minisign signature as well.
///
/// With `signature: Some(sig_text)` the signature is checked on cache hits as
/// well as after a fresh download. A fresh download that fails verification
/// has its temp file deleted and yields `DownloadError::SignatureInvalid`.
///
/// The body is processed in 64 KiB chunks, so the whole model is never loaded
/// into memory. The download is bounded by [`DEFAULT_MAX_MODEL_BYTES`].
///
/// # Returns
/// `Ok(true)` when a download happened, `Ok(false)` when the cached file was
/// reused.
pub fn download_with_checksum_and_signature(
    url: &str,
    expected_sha256: &str,
    signature: Option<&str>,
    dest: &Path,
) -> Result<bool, DownloadError> {
    // Public entry points always use the crate-wide default ceiling.
    let cap = DEFAULT_MAX_MODEL_BYTES;
    download_with_checksum_signature_and_cap(url, expected_sha256, signature, dest, cap)
}

/// Upper bound, in bytes, on any single streamed model download: 1 GiB.
///
/// Manifest entries that do not declare a `size` would otherwise allow a
/// disk-exhaustion DoS. The limit is far above any real polyvoice model (the
/// largest shipped weights are ~250 MiB), so legitimate downloads are
/// unaffected.
pub(crate) const DEFAULT_MAX_MODEL_BYTES: u64 = 1 << 30;

/// Like [`download_with_checksum_and_signature`] but with an explicit streaming
/// size cap and an enforced `https://` scheme.
///
/// * Rejects any non-`https://` URL with [`DownloadError::InsecureScheme`]
///   before opening the network. Cache hits transmit nothing and are still
///   served — the scheme is only required when bytes are actually fetched.
/// * Aborts and deletes the `.partial` file if the stream exceeds `max_bytes`,
///   returning [`DownloadError::TooLarge`], so a hostile or buggy endpoint
///   cannot fill the disk before the SHA-256 check runs.
/// * Compares the digest without regard to hex letter case, so a manifest
///   that stores an upper-case SHA-256 still matches.
/// * Once the `.partial` file has been created, every failure path (stream
///   error, flush error, cap overflow, checksum or signature mismatch, failed
///   rename) removes it before returning.
///
/// # Parameters
/// * `url` — source URL; must start with `https://` unless the cache is hit.
/// * `expected_sha256` — hex SHA-256 of the expected file contents.
/// * `signature` — optional Minisign signature text to verify the file against.
/// * `dest` — final location of the file.
/// * `max_bytes` — largest response body that will be written to disk.
///
/// # Returns
/// `Ok(false)` when `dest` was already present and valid, `Ok(true)` when the
/// file was downloaded and moved into place.
///
/// # Errors
/// [`DownloadError::InsecureScheme`], [`DownloadError::Network`],
/// [`DownloadError::Io`], [`DownloadError::TooLarge`],
/// [`DownloadError::ChecksumMismatch`] or [`DownloadError::SignatureInvalid`].
pub(crate) fn download_with_checksum_signature_and_cap(
    url: &str,
    expected_sha256: &str,
    signature: Option<&str>,
    dest: &Path,
    max_bytes: u64,
) -> Result<bool, DownloadError> {
    // Digests are rendered with `{:x}` (lower-case hex) everywhere in this
    // module; normalise the caller's value once so an upper-case manifest
    // entry compares equal both on the cache path and after a download.
    let expected_hex = expected_sha256.to_ascii_lowercase();

    // Cache hit: verify SHA-256, then signature if present. No network here, so
    // the URL scheme is irrelevant.
    if dest.exists() && verify_sha256(dest, &expected_hex).is_ok() {
        if let Some(sig) = signature {
            verify_minisign(dest, sig).map_err(|e| DownloadError::SignatureInvalid {
                path: dest.to_path_buf(),
                source: e,
            })?;
        }
        return Ok(false);
    }

    // A real fetch will happen: require https:// so weights are never pulled in
    // cleartext (integrity must not rest on the same-manifest hash alone).
    // NOTE(review): this only inspects the initial URL. If the HTTP client
    // follows redirects, a redirect to http:// would not be caught here —
    // confirm the client configuration forbids scheme downgrades.
    if !url
        .get(..8)
        .is_some_and(|s| s.eq_ignore_ascii_case("https://"))
    {
        return Err(DownloadError::InsecureScheme {
            url: url.to_owned(),
        });
    }

    if let Some(parent) = dest.parent() {
        fs::create_dir_all(parent).map_err(|e| DownloadError::Io {
            path: parent.to_path_buf(),
            source: e,
        })?;
    }

    // Download to a sibling .partial file, then rename — gives atomic on-success
    // semantics so a partial file is never seen as cached.
    let mut tmp = dest.to_path_buf();
    let original_name = dest.file_name().and_then(|s| s.to_str()).unwrap_or("model");
    tmp.set_file_name(format!(".{original_name}.partial"));

    // Pre-parse Minisign public key and signature so we fail fast before the network.
    let public_key = if signature.is_some() {
        Some(
            minisign_verify::PublicKey::from_base64(crate::models::verify::SIGNING_PUBKEY_BASE64)
                .map_err(|e| DownloadError::SignatureInvalid {
                path: dest.to_path_buf(),
                source: SignatureError::BadPublicKey(format!("{e:?}")),
            })?,
        )
    } else {
        None
    };
    let sig = if let Some(sig_text) = signature {
        Some(minisign_verify::Signature::decode(sig_text).map_err(|e| {
            DownloadError::SignatureInvalid {
                path: dest.to_path_buf(),
                source: SignatureError::BadSignature(format!("{e:?}")),
            }
        })?)
    } else {
        None
    };
    let mut verifier = if let (Some(pk), Some(s)) = (&public_key, &sig) {
        Some(
            pk.verify_stream(s)
                .map_err(|e| DownloadError::SignatureInvalid {
                    path: dest.to_path_buf(),
                    source: SignatureError::VerificationFailed(format!("{e:?}")),
                })?,
        )
    } else {
        None
    };

    let resp = ureq::get(url).call().map_err(|e| DownloadError::Network {
        url: url.to_owned(),
        source: Box::new(e),
    })?;
    let reader = BufReader::new(resp.into_body().into_reader());
    let mut file = fs::File::create(&tmp).map_err(|e| DownloadError::Io {
        path: tmp.clone(),
        source: e,
    })?;
    let mut hasher = Sha256::new();

    let streamed = {
        // Hash + (optionally) signature-verify each chunk as it streams; the
        // helper enforces the byte cap and deletes the .partial on overflow.
        let mut on_chunk = |chunk: &[u8]| {
            hasher.update(chunk);
            if let Some(v) = verifier.as_mut() {
                v.update(chunk);
            }
        };
        write_capped(reader, &mut file, &tmp, max_bytes, &mut on_chunk)
    };
    let flushed = streamed.and_then(|()| {
        file.flush().map_err(|e| DownloadError::Io {
            path: tmp.clone(),
            source: e,
        })
    });
    // Close the handle before any cleanup so the removal below is not racing
    // an open writer.
    drop(file);
    if let Err(e) = flushed {
        // A dropped connection or a full disk must not leave a stray .partial
        // behind. Removal is best-effort: the original error is what matters.
        let _ = fs::remove_file(&tmp);
        return Err(e);
    }

    let actual = format!("{:x}", hasher.finalize());
    if actual != expected_hex {
        let _ = fs::remove_file(&tmp);
        return Err(DownloadError::ChecksumMismatch {
            path: dest.to_path_buf(),
            // Report the digest exactly as the caller supplied it.
            expected: expected_sha256.to_owned(),
            actual,
        });
    }

    if let Some(mut v) = verifier {
        v.finalize().map_err(|e| {
            let _ = fs::remove_file(&tmp);
            DownloadError::SignatureInvalid {
                path: dest.to_path_buf(),
                source: SignatureError::VerificationFailed(format!("{e:?}")),
            }
        })?;
    }

    fs::rename(&tmp, dest).map_err(|e| {
        // The file could not be moved into place; do not leak the temp copy.
        let _ = fs::remove_file(&tmp);
        DownloadError::Io {
            path: tmp.clone(),
            source: e,
        }
    })?;
    Ok(true)
}

/// Stream `reader` into `file` in 64 KiB chunks, calling `on_chunk` for each
/// chunk (used for SHA-256 and signature updates), and aborting with
/// [`DownloadError::TooLarge`] — after deleting `tmp` — if more than `max_bytes`
/// are read. The cap is checked before each write, so the on-disk `.partial`
/// never exceeds the limit.
fn write_capped<R: Read>(
    mut reader: R,
    file: &mut fs::File,
    tmp: &Path,
    max_bytes: u64,
    on_chunk: &mut dyn FnMut(&[u8]),
) -> Result<(), DownloadError> {
    let mut buf = [0u8; 64 * 1024];
    let mut written: u64 = 0;
    loop {
        let n = reader.read(&mut buf).map_err(|e| DownloadError::Io {
            path: tmp.to_path_buf(),
            source: e,
        })?;
        if n == 0 {
            break;
        }
        written += n as u64;
        if written > max_bytes {
            let _ = fs::remove_file(tmp);
            return Err(DownloadError::TooLarge {
                path: tmp.to_path_buf(),
                max_bytes,
            });
        }
        file.write_all(&buf[..n]).map_err(|e| DownloadError::Io {
            path: tmp.to_path_buf(),
            source: e,
        })?;
        on_chunk(&buf[..n]);
    }
    Ok(())
}

/// { expected.len() == 64 }
/// `pub fn verify_sha256(path: &Path, expected: &str) -> Result<(), DownloadError>`
/// { true }
/// Compute the SHA-256 of `path` and compare against `expected`. Streams the file
/// (does not load it into RAM).
///
/// `expected` is a hex digest; the comparison ignores ASCII letter case, so
/// both `"ab12…"` and `"AB12…"` match the same file.
///
/// # Errors
/// * [`DownloadError::Io`] — `path` could not be opened or read.
/// * [`DownloadError::ChecksumMismatch`] — the computed digest differs from
///   `expected`; `actual` carries the computed lower-case hex digest.
pub fn verify_sha256(path: &Path, expected: &str) -> Result<(), DownloadError> {
    let f = fs::File::open(path).map_err(|e| DownloadError::Io {
        path: path.to_path_buf(),
        source: e,
    })?;
    let mut reader = BufReader::new(f);
    let mut hasher = Sha256::new();
    // Fixed 64 KiB window: memory use is independent of the file size.
    let mut buf = [0u8; 64 * 1024];
    loop {
        let n = reader.read(&mut buf).map_err(|e| DownloadError::Io {
            path: path.to_path_buf(),
            source: e,
        })?;
        if n == 0 {
            break;
        }
        hasher.update(&buf[..n]);
    }
    // `{:x}` renders lower-case hex, so compare case-insensitively: an
    // upper-case `expected` names the same digest and must not be rejected.
    let actual = format!("{:x}", hasher.finalize());
    if actual.eq_ignore_ascii_case(expected) {
        Ok(())
    } else {
        Err(DownloadError::ChecksumMismatch {
            path: path.to_path_buf(),
            expected: expected.to_owned(),
            actual,
        })
    }
}

// Unit tests for the hashing and download helpers. None of them performs a
// real network request: they exercise the cache-hit path, the scheme check,
// and `write_capped` against an in-memory reader.
//
// `unwrap` is allowed here because a panic is exactly how a test reports a
// broken fixture.
#[allow(clippy::unwrap_used)]
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use std::io::Write;
    use tempfile::TempDir;

    // Fixture payload shared by the small-file tests.
    const TEST_BYTES: &[u8] = b"polyvoice";

    /// Compute the expected SHA-256 of `TEST_BYTES` at test time, so the test is
    /// robust against typos in a hardcoded constant.
    fn test_bytes_sha256() -> String {
        use sha2::{Digest, Sha256};
        let mut h = Sha256::new();
        h.update(TEST_BYTES);
        format!("{:x}", h.finalize())
    }

    // A file whose contents hash to the expected digest verifies cleanly.
    #[test]
    fn verify_existing_file_passes_when_hash_matches() {
        let dir = TempDir::new().unwrap();
        let path = dir.path().join("data.bin");
        fs::write(&path, TEST_BYTES).unwrap();
        verify_sha256(&path, &test_bytes_sha256()).expect("hash must match");
    }

    // Different contents must surface as `ChecksumMismatch`, not as an I/O error.
    #[test]
    fn verify_existing_file_fails_when_hash_differs() {
        let dir = TempDir::new().unwrap();
        let path = dir.path().join("data.bin");
        fs::write(&path, b"different content").unwrap();
        let err = verify_sha256(&path, &test_bytes_sha256()).expect_err("must mismatch");
        assert!(matches!(err, DownloadError::ChecksumMismatch { .. }));
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn verify_streams_large_file_without_loading_into_ram() {
        // Write a 5 MB file; verify_sha256 must use streaming reader, not Vec::read_to_end.
        // The test passes purely if it doesn't OOM and computes a deterministic hash.
        let dir = TempDir::new().unwrap();
        let path = dir.path().join("big.bin");
        let mut f = fs::File::create(&path).unwrap();
        for _ in 0..5 * 1024 {
            // 5 MB of '\0'
            f.write_all(&[0u8; 1024]).unwrap();
        }
        // SHA-256 of 5 MB of zero bytes:
        let expected = sha256_of_zeros_5mb();
        verify_sha256(&path, &expected).expect("streaming hash should match");
    }

    // Reference digest for the large-file test, computed in memory with the
    // same 1 KiB x 5120 layout that the test writes to disk.
    fn sha256_of_zeros_5mb() -> String {
        use sha2::{Digest, Sha256};
        let mut h = Sha256::new();
        for _ in 0..5 * 1024 {
            h.update([0u8; 1024]);
        }
        format!("{:x}", h.finalize())
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn download_with_checksum_no_signature_fallback() {
        // When signature is None and the file is already cached with a matching
        // hash, download_with_checksum_and_signature must take the cache-hit
        // path and return Ok(false) without touching the network.
        let dir = TempDir::new().unwrap();
        let dest = dir.path().join("cached.bin");
        fs::write(&dest, TEST_BYTES).unwrap();
        let sha = test_bytes_sha256();

        // A completely invalid URL proves we never reach the download path.
        let result = download_with_checksum_and_signature(
            "http://[invalid:definitely:not:a:real:url]",
            &sha,
            None,
            &dest,
        );
        assert!(
            result.is_ok(),
            "fallback should succeed: {:?}",
            result.err()
        );
        assert!(!result.unwrap(), "should be cached (no download)");

        // Calling the old wrapper should behave identically.
        let result2 =
            download_with_checksum("http://[invalid:definitely:not:a:real:url]", &sha, &dest);
        assert!(
            result2.is_ok(),
            "wrapper should succeed: {:?}",
            result2.err()
        );
        assert!(!result2.unwrap(), "wrapper should also be cached");
    }

    #[test]
    fn rejects_non_https_url() {
        // dest does not exist, so the call goes past the cache-hit branch into
        // the scheme check. An http:// URL must be rejected before any network
        // or filesystem side effect.
        let dir = TempDir::new().unwrap();
        let dest = dir.path().join("model.bin");
        let err = download_with_checksum_and_signature(
            "http://unreachable.invalid/model.bin",
            &test_bytes_sha256(),
            None,
            &dest,
        )
        .expect_err("non-https URL must be rejected");
        assert!(matches!(err, DownloadError::InsecureScheme { .. }));
        assert!(!dest.exists(), "no file should be created");
        assert!(
            !dir.path().join(".model.bin.partial").exists(),
            "no .partial should be created"
        );
    }

    #[test]
    fn aborts_when_stream_exceeds_cap() {
        // 100 bytes through a 10-byte cap: write_capped must abort with TooLarge
        // and delete the .partial.
        let dir = TempDir::new().unwrap();
        let tmp = dir.path().join(".big.partial");
        let mut file = fs::File::create(&tmp).unwrap();
        // The per-chunk callback is irrelevant to this test.
        let mut noop = |_: &[u8]| {};
        let err = write_capped(
            std::io::Cursor::new(vec![0u8; 100]),
            &mut file,
            &tmp,
            10,
            &mut noop,
        )
        .expect_err("stream over the cap must abort");
        assert!(matches!(err, DownloadError::TooLarge { max_bytes: 10, .. }));
        drop(file);
        assert!(!tmp.exists(), ".partial must be deleted on cap overflow");
    }
}