wav2vec2_rs/
types.rs

/// Everything the pipeline receives for one alignment request: the audio
/// buffer, its sampling rate, and the transcript to align against it.
#[derive(Clone, Debug)]
pub struct AlignmentInput {
    /// Sampling rate of `samples`, in hertz.
    pub sample_rate_hz: u32,
    /// Audio samples as 32-bit floats.
    pub samples: Vec<f32>,
    /// Transcript text accompanying the audio.
    pub transcript: String,
    /// Optional audio that has already been normalized (mean 0, unit
    /// variance). If present, the pipeline skips its own normalization and
    /// reuses this buffer for all iterations of the same input.
    pub normalized: Option<Vec<f32>>,
}
9
10#[derive(Debug, Clone, PartialEq)]
11pub struct WordTiming {
12    pub word: String,
13    /// Millisecond interval is [start_ms, end_ms), i.e. start inclusive/end exclusive.
14    pub start_ms: u64,
15    /// Millisecond interval is [start_ms, end_ms), i.e. start inclusive/end exclusive.
16    pub end_ms: u64,
17    /// Deterministic word-level quality confidence score in [0, 1].
18    /// This blends acoustic support (`geo_mean_prob`) with separability and
19    /// boundary evidence. `None` means confidence could not be computed.
20    pub confidence: Option<f32>,
21    pub confidence_stats: WordConfidenceStats,
22}
23
/// Per-word confidence statistics.
///
/// The derived `Default` yields `None` for every optional statistic and `0`
/// for `coverage_frame_count`.
#[derive(Clone, Debug, Default, PartialEq)]
pub struct WordConfidenceStats {
    /// Presumably the mean log-probability for the word — TODO confirm
    /// against the code that fills this in.
    pub mean_logp: Option<f32>,
    /// Presumably the geometric-mean probability for the word — TODO confirm.
    pub geo_mean_prob: Option<f32>,
    /// Deterministic composite quality score, prior to calibration.
    pub quality_confidence: Option<f32>,
    /// Calibrated confidence score in `[0, 1]`; the calibration is monotonic.
    pub calibrated_confidence: Option<f32>,
    /// Presumably the minimum log-probability for the word — TODO confirm.
    pub min_logp: Option<f32>,
    /// Presumably the 10th-percentile log-probability — TODO confirm.
    pub p10_logp: Option<f32>,
    /// NOTE(review): looks like a mean separability margin; the exact
    /// definition is not visible here — confirm.
    pub mean_margin: Option<f32>,
    /// Frame count; presumably the number of frames the word covers — TODO
    /// confirm.
    pub coverage_frame_count: u32,
    /// Mean blank probability over the frames absorbed by boundary expansion.
    pub boundary_confidence: Option<f32>,
}
39
40#[derive(Debug, Clone, PartialEq)]
41pub struct AlignmentOutput {
42    pub words: Vec<WordTiming>,
43}
44
/// Token-level representation of a transcript.
#[derive(Clone, Debug)]
pub struct TokenSequence {
    /// Token ids; presumably indices into the model vocabulary — TODO confirm.
    pub tokens: Vec<usize>,
    /// Optional character per entry; presumably parallel to `tokens`, with
    /// `None` where a token has no associated character — TODO confirm.
    pub chars: Vec<Option<char>>,
    /// Words of the transcript, normalized by the same logic used for the
    /// emitted token chars.
    pub normalized_words: Vec<String>,
}
wav2vec2_rs/types.rs

wav2vec2_rs/
types.rs