codec2/lib.rs
//! Codec 2 is an open source (LGPL 2.1) low bit rate speech codec.
//!
//! This is a zero-dependency, pure Rust port of the original [http://rowetel.com/codec2.html](http://rowetel.com/codec2.html).
//! Currently the 3200, 2400, 1600, 1400, 1300 and 1200 bit/s modes are implemented, for both
//! encoding and decoding.
//!
//! # Basic Usage
//!
//! Create a Codec2 object, e.g. `Codec2::new(Codec2Mode::MODE_3200)`, then repeatedly obtain raw
//! 8 kHz 16-bit audio samples and call `encode` to encode blocks of `samples_per_frame()` samples
//! into `bits_per_frame()` compressed bits, in order. On the receiving end, create a Codec2
//! object and repeatedly call `decode` to decompress each chunk of received bytes into the next
//! samples.
//!
//! Complete example below. This example uses zerocopy to interpret the `&[i16]` slices as `&[u8]`
//! for I/O.
//!
//! Cargo.toml:
//!
//! ```toml
//! [dependencies]
//! zerocopy = "0.3.0"
//! codec2 = "0"
//! ```
//!
//! main.rs:
//!
//! ```rust
//! use codec2::*;
//! use std::env::args;
//! use std::io::prelude::*;
//! use zerocopy::AsBytes;
//!
//! fn main() -> std::io::Result<()> {
//!     if args().len() != 4 || (args().nth(1).unwrap() != "enc" && args().nth(1).unwrap() != "dec") {
//!         eprintln!("Usage: {} (enc|dec) inputfile outputfilename", args().nth(0).unwrap());
//!         eprintln!("Files should be raw 16-bit signed 8 kHz audio");
//!         return Ok(());
//!     }
//!     let mut fin = std::fs::File::open(args().nth(2).unwrap())?;
//!     let mut fout = std::fs::File::create(args().nth(3).unwrap())?;
//!     let mut c2 = Codec2::new(Codec2Mode::MODE_3200);
//!     let mut samps = vec![0; c2.samples_per_frame()]; // i16 I/O buffer
//!     let mut packed = vec![0; (c2.bits_per_frame() + 7) / 8]; // u8 I/O buffer for encoded bits
//!     if args().nth(1).unwrap() == "enc" {
//!         while fin.read_exact(samps.as_bytes_mut()).is_ok() {
//!             c2.encode(&mut packed, &samps[..]);
//!             fout.write_all(&packed)?;
//!         }
//!     } else {
//!         while fin.read_exact(&mut packed).is_ok() {
//!             c2.decode(&mut samps[..], &packed);
//!             fout.write_all(samps.as_bytes())?;
//!         }
//!     }
//!     Ok(())
//! }
//! ```
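//!
//! Frame sizes per mode, as returned by `samples_per_frame()` and `bits_per_frame()`:
//!
//! | Mode      | Samples/frame | Bits/frame |
//! |-----------|---------------|------------|
//! | MODE_3200 | 160           | 64         |
//! | MODE_2400 | 160           | 48         |
//! | MODE_1600 | 320           | 64         |
//! | MODE_1400 | 320           | 56         |
//! | MODE_1300 | 320           | 52         |
//! | MODE_1200 | 320           | 48         |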

#![allow(non_snake_case)]
#![allow(non_camel_case_types)]
#![allow(non_upper_case_globals)]
#![warn(trivial_numeric_casts)]

#[cfg(test)]
mod tests {
    #[test]
    fn encode_test() {
        let mut c2 = crate::Codec2::new(crate::Codec2Mode::MODE_3200);
        //160 samples
        let spf = c2.samples_per_frame();
        let mut samps = vec![0; spf * 2];
        let bpf = c2.bits_per_frame();
        println!(
            "c2 is n_samp {:?} m_pitch {:?} Sn.len {:?} bpf {} spf {}",
            c2.internal.n_samp,
            c2.internal.m_pitch,
            c2.internal.Sn.len(),
            bpf,
            spf
        );
        let mut outbuf = vec![0; (bpf + 7) / 8];
        c2.encode(&mut outbuf, &samps);
        assert_eq!(&outbuf[..], &[0xC0, 0, 0x6A, 0x43, 0x9C, 0xE4, 0x21, 8][..]);
        let outbuforig = outbuf.clone();
        println!("encoded {:X?}", outbuf);
        c2.encode(&mut outbuf, &samps);
        assert_eq!(&outbuf[..], &[0x81, 0, 9, 0x43, 0x9C, 0xE4, 0x21, 8][..]);
        println!("encoded {:X?}", outbuf);
        c2.encode(&mut outbuf, &samps);
        assert_eq!(&outbuf[..], &[1, 0, 9, 0x43, 0x9C, 0xE4, 0x21, 8][..]);
        println!("encoded {:X?}", outbuf);
        c2.encode(&mut outbuf, &samps);
        assert_eq!(&outbuf[..], &[1, 0, 9, 0x43, 0x9C, 0xE4, 0x21, 8][..]);
        println!("encoded {:X?}", outbuf);

        c2.decode(&mut samps, &outbuforig);
        println!("decoded {:X?}", samps);
        for samp in &samps {
            assert!((*samp).abs() < 0x100);
        }
    }
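
    // A minimal round-trip sketch of the crate-private Gray-coded bit packing
    // helpers defined below (pack()/unpack()); the field layout used here is
    // arbitrary and purely illustrative.
    #[test]
    fn pack_unpack_roundtrip_test() {
        let mut bits = [0u8; 2];
        let mut wr = 0; // write cursor, in bits
        crate::pack(&mut bits, &mut wr, 0b101, 3);
        crate::pack(&mut bits, &mut wr, 0x5A, 8); // crosses a byte boundary
        crate::pack(&mut bits, &mut wr, 1, 1);
        assert_eq!(wr, 12);
        let mut rd = 0; // read cursor, in bits
        assert_eq!(crate::unpack(&bits, &mut rd, 3), 0b101);
        assert_eq!(crate::unpack(&bits, &mut rd, 8), 0x5A);
        assert_eq!(crate::unpack(&bits, &mut rd, 1), 1);
    }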
}

mod kiss_fft;
mod nlp;
use nlp::FDMDV_OS_TAPS_16K;
mod quantise;
use crate::quantise::*;
mod codebook;
use crate::codebook::*;
mod codebookd;
use crate::codebookd::*;
mod codebookge;
use crate::codebookge::*;
mod codebookjvm;
use crate::codebookjvm::*;

const WO_BITS: i32 = 7;
const WO_E_BITS: u32 = 8;
const LSPD_SCALAR_INDEXES: usize = 10;
const LSP_SCALAR_INDEXES: usize = 10;
use std::f64::consts::PI;

const N_S: f32 = 0.01; // internal proc frame length in secs
const TW_S: f32 = 0.005; // trapezoidal synth window overlap
const MAX_AMP: usize = 160; // maximum number of harmonics
const TWO_PI: f32 = 6.283185307; // mathematical constant
const FFT_ENC: usize = 512; // size of FFT used for encoder
const FFT_DEC: usize = 512; // size of FFT used in decoder
const V_THRESH: f32 = 6.0; // voicing threshold in dB
const LPC_ORD: usize = 10; // LPC order

// Pitch estimation constants
const M_PITCH_S: f32 = 0.0400; // pitch analysis window in s
const P_MIN_S: f32 = 0.0025; // minimum pitch period in s
const P_MAX_S: f32 = 0.0200; // maximum pitch period in s
mod inner {
    use crate::*;
    #[derive(Clone, Debug)]
    pub struct C2const {
        pub Fs: i32,       // sample rate of this instance
        pub n_samp: usize, // number of samples per 10ms frame at Fs
        // pub max_amp: i32, // maximum number of harmonics
        pub m_pitch: usize, // pitch estimation window size in samples
        pub p_min: i32,     // minimum pitch period in samples
        pub p_max: i32,     // maximum pitch period in samples
        pub Wo_min: f32,
        pub Wo_max: f32,
        pub nw: usize, // analysis window size in samples
        pub tw: usize, // trapezoidal synthesis window overlap
    }
    impl C2const {
        pub fn new(Fs: i32, framelength_s: f32) -> Self {
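            // Illustrative values for the 8 kHz, 10 ms configuration used by
            // Codec2::new(): n_samp = 80, m_pitch = 320, p_min = 20,
            // p_max = 160, tw = 40 (all in samples).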
            Self {
                Fs: Fs,
                n_samp: ((Fs as f32) * framelength_s).round() as usize,
                // max_amp: ((Fs as f32) * P_MAX_S / 2.0).floor() as i32,
                p_min: ((Fs as f32) * P_MIN_S).floor() as i32,
                p_max: ((Fs as f32) * P_MAX_S).floor() as i32,
                m_pitch: ((Fs as f32) * M_PITCH_S).floor() as usize,
                Wo_min: TWO_PI / ((Fs as f32) * P_MAX_S).floor(),
                Wo_max: TWO_PI / ((Fs as f32) * P_MIN_S).floor(),
                nw: 279,
                tw: ((Fs as f32) * TW_S) as usize,
            }
        }

        /*---------------------------------------------------------------------------*\

          FUNCTION....: dft_speech
          AUTHOR......: David Rowe, conversion by Matt Weeks
          DATE CREATED: 27/5/94

          Finds the DFT of the current input speech frame.

        \*---------------------------------------------------------------------------*/
        // TODO: we can either go for a faster FFT using fftr and some stack usage
        // or we can reduce stack usage to almost zero on STM32 by switching to fft_inplace
        pub fn dft_speech(
            &self,
            fft_fwd_cfg: &codec2_fft_cfg,
            Sw: &mut [COMP],
            Sn: &[f32],
            w: &[f32],
        ) {
            let m_pitch = self.m_pitch;
            let nw = self.nw;

            for i in 0..FFT_ENC {
                Sw[i].r = 0.0;
                Sw[i].i = 0.0;
            }

            /* Centre analysis window on time axis, we need to arrange input
            to FFT this way to make FFT phases correct */

            // move 2nd half to start of FFT input vector

            for i in 0..nw / 2 {
                Sw[i].r = Sn[i + m_pitch / 2] * w[i + m_pitch / 2];
            }

            // move 1st half to end of FFT input vector

            for i in 0..nw / 2 {
                Sw[FFT_ENC - nw / 2 + i].r =
                    Sn[i + m_pitch / 2 - nw / 2] * w[i + m_pitch / 2 - nw / 2];
            }

            codec2_fft_inplace(fft_fwd_cfg, Sw);
        }
    }
    // describes each codebook
    #[derive(Clone, Debug)]
    pub struct lsp_codebook {
        pub k: usize,           // dimension of vector
        pub log2m: i32,         // number of bits in m
        pub m: usize,           // elements in codebook
        pub cb: &'static [f32], // The elements
    }
    #[derive(Copy, Clone)]
    pub struct kiss_fft_cpx {
        pub r: kiss_fft_scalar,
        pub i: kiss_fft_scalar,
    }
    impl kiss_fft_cpx {
        pub fn new() -> Self {
            Self { r: 0.0, i: 0.0 }
        }
        pub fn kf_cexp(phase: f64) -> Self {
            Self {
                r: phase.cos() as f32,
                i: phase.sin() as f32,
            }
        }
    }
    impl std::fmt::Debug for kiss_fft_cpx {
        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
            f.write_fmt(format_args!("{},{}", self.r, self.i))
        }
    }
    pub type COMP = kiss_fft_cpx;
    #[derive(Clone, Debug)]
    pub struct kiss_fft_state {
        pub nfft: usize,
        pub inverse: i32,
        pub factors: [usize; 2 * MAXFACTORS],
        pub twiddles: Vec<kiss_fft_cpx>,
    }
    impl kiss_fft_state {
        pub fn new(nfft: usize, inverse_fft: i32) -> Self {
            let mut twiddles = Vec::with_capacity(nfft);
            let mut factors = [0; 2 * MAXFACTORS];
            for i in 0..nfft {
                let mut phase = -2.0 * PI * (i as f64) / (nfft as f64);
                if inverse_fft != 0 {
                    phase *= -1.0;
                }
                twiddles.push(kiss_fft_cpx::kf_cexp(phase));
            }
            let mut n = nfft;
            let mut p = 4;
            let floor_sqrt = (n as f64).sqrt().floor() as usize;
            let mut idx = 0;
            // factor out powers of 4, powers of 2, then any remaining primes
            loop {
                while (n % p) != 0 {
                    match p {
                        4 => p = 2,
                        2 => p = 3,
                        _ => p += 2,
                    }
                    if p > floor_sqrt {
                        p = n; // no more factors, skip to end
                    }
                }
                n /= p;
                factors[idx] = p;
                idx += 1;
                factors[idx] = n;
                idx += 1;
                if n <= 1 {
                    break;
                } //do..while (n > 1)
            }
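            // Illustrative: nfft = 512 factors as 4*4*4*4*2, so `factors` holds
            // the (radix, remaining length) pairs [4,128, 4,32, 4,8, 4,2, 2,1].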
            Self {
                nfft: nfft,
                inverse: inverse_fft,
                factors: factors,
                twiddles: twiddles,
            }
        }
    }
    #[derive(Clone, Debug)]
    pub struct kiss_fftr_state {
        pub substate: kiss_fft_cfg,
        pub tmpbuf: Vec<kiss_fft_cpx>,
        pub super_twiddles: Vec<kiss_fft_cpx>,
    }
    impl kiss_fftr_state {
        pub fn new(mut nfft: usize, inverse_fft: i32) -> Self {
            nfft = nfft / 2;
            let mut res = Self {
                substate: kiss_fft_state::new(nfft, inverse_fft),
                tmpbuf: vec![kiss_fft_cpx::new(); nfft],
                super_twiddles: vec![kiss_fft_cpx::new(); nfft / 2],
            };
            for i in 0..nfft / 2 {
                let mut phase =
                    -3.14159265358979323846264338327 * ((i as f64 + 1.0) / nfft as f64 + 0.5);
                if inverse_fft != 0 {
                    phase *= -1.0;
                }
                res.super_twiddles[i] = kiss_fft_cpx::kf_cexp(phase);
            }
            res
        }
    }
    #[derive(Clone, Debug)]
    pub struct NLP {
        pub Fs: i32, // sample rate in Hz
        pub m: usize,
        pub w: [f32; PMAX_M / DEC],   // DFT window
        pub sq: [f32; PMAX_M],        // squared speech samples
        pub mem_x: f32,
        pub mem_y: f32,               // memory for notch filter
        pub mem_fir: [f32; NLP_NTAP], // decimation FIR filter memory
        pub fft_cfg: codec2_fft_cfg,  // kiss FFT config
        pub Sn16k: Vec<f32>,          // Fs=16kHz input speech vector
        // FILE *f,
    }
    impl NLP {
        pub fn new(c2const: &C2const) -> Self {
            let (m, vsize) = if c2const.Fs == 16000 {
                (
                    c2const.m_pitch / 2,
                    FDMDV_OS_TAPS_16K as usize + c2const.n_samp,
                )
            } else {
                (c2const.m_pitch, 0)
            };
            let mut w = [0.0; PMAX_M / DEC];
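            // Hann (raised cosine) window over the decimated analysis length m/DEC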
            for i in 0..m / DEC {
                w[i] =
                    0.5 - 0.5 * (2.0 * PI as f32 * i as f32 / (m as f32 / DEC as f32 - 1.0)).cos();
            }
            Self {
                Fs: c2const.Fs, // sample rate in Hz
                m: m,
                w: w,                     // DFT window
                sq: [0.0; PMAX_M],        // squared speech samples
                mem_x: 0.0,
                mem_y: 0.0,               // memory for notch filter
                mem_fir: [0.0; NLP_NTAP], // decimation FIR filter memory
                fft_cfg: kiss_fft_state::new(PE_FFT_SIZE, 0), // kiss FFT config
                Sn16k: vec![0.0; vsize],  // Fs=16kHz input speech vector
            }
        }
    }
    #[derive(Clone, Debug, Copy)]
    pub struct MODEL {
        pub Wo: f32,                 // fundamental frequency estimate in radians
        pub L: usize,                // number of harmonics
        pub A: [f32; MAX_AMP + 1],   // amplitude of each harmonic
        pub phi: [f32; MAX_AMP + 1], // phase of each harmonic
        pub voiced: i32,             // non-zero if this frame is voiced
    }
    impl MODEL {
        pub fn new(p_max: f32) -> Self {
            let wo = TWO_PI / p_max;
            Self {
                Wo: wo,                               // fundamental frequency estimate in radians
                L: (PI / wo as f64).floor() as usize, // number of harmonics
                A: [0.0; MAX_AMP + 1],                // amplitude of each harmonic
                phi: [0.0; MAX_AMP + 1],              // phase of each harmonic
                voiced: 0,                            // non-zero if this frame is voiced
            }
        }
    }
}
use inner::*;
type kiss_fft_scalar = f32;
type kiss_fft_cfg = kiss_fft_state;

/* e.g. an fft of length 128 has 4 factors
as far as kissfft is concerned:
4*4*4*2
*/
const PE_FFT_SIZE: usize = 512;
const MAXFACTORS: usize = 32;

type codec2_fft_cfg = kiss_fft_state;
type codec2_fftr_cfg = kiss_fftr_state;
type kiss_fftr_cfg = kiss_fftr_state;
fn codec2_fft(cfg: &kiss_fft_cfg, fin: &[kiss_fft_cpx], fout: &mut [kiss_fft_cpx]) {
    kiss_fft::kiss_fft(cfg, fin, fout);
}
fn codec2_fftr(cfg: &mut codec2_fftr_cfg, inp: &[f32], out: &mut [kiss_fft_cpx]) {
    kiss_fft::kiss_fftr(cfg, inp, out);
}
fn codec2_fftri(cfg: &mut codec2_fftr_cfg, inp: &[kiss_fft_cpx], out: &mut [f32]) {
    kiss_fft::kiss_fftri(cfg, inp, out);
}

fn codec2_fft_inplace(cfg: &codec2_fft_cfg, inout: &mut [kiss_fft_cpx]) {
    let mut in_ = [kiss_fft_cpx::new(); 512];
    // decide whether to use the local stack based buffer for in
    // or to allow kiss_fft to allocate RAM
    // second part is just to play safe since first method
    // is much faster and uses less RAM
    if cfg.nfft <= 512 {
        in_[..cfg.nfft].copy_from_slice(&inout[..cfg.nfft]);
        kiss_fft::kiss_fft(cfg, &in_, inout);
    } else {
        kiss_fft::kiss_fft(cfg, &inout.to_vec(), inout);
    }
}

const PMAX_M: usize = 320;
const DEC: usize = 5;
const NLP_NTAP: usize = 48;
const BPF_N: usize = 101;
const LPCPF_BETA: f32 = 0.2;
const LPCPF_GAMMA: f32 = 0.5;

#[derive(Clone, Debug)]
struct Codec2Internal {
    mode: Codec2Mode,
    c2const: C2const,
    Fs: i32,
    n_samp: usize,
    m_pitch: usize,
    fft_fwd_cfg: codec2_fft_cfg,   // forward FFT config
    fftr_fwd_cfg: codec2_fftr_cfg, // forward real FFT config
    w: Vec<f32>,                   // [m_pitch] time domain hamming window
    W: [f32; FFT_ENC],             // DFT of w[]
    Pn: Vec<f32>,                  // [2*n_samp] trapezoidal synthesis window
    bpf_buf: Vec<f32>,             // buffer for band pass filter
    Sn: Vec<f32>,                  // [m_pitch] input speech
    hpf_states: [f32; 2],          // high pass filter states
    nlp: NLP,                      // pitch predictor states
    gray: i32,                     // non-zero for gray encoding

    fftr_inv_cfg: codec2_fftr_cfg, // inverse FFT config
    Sn_: Vec<f32>,                 // [2*n_samp] synthesised output speech
    ex_phase: f32,                 // excitation model phase track
    bg_est: f32,                   // background noise estimate for post filter
    prev_f0_enc: f32,              // previous frame's f0 estimate
    prev_model_dec: MODEL,         // previous frame's model parameters
    prev_lsps_dec: [f32; LPC_ORD], // previous frame's LSPs
    prev_e_dec: f32,               // previous frame's LPC energy

    lpc_pf: i32,     // LPC post filter on
    bass_boost: i32, // LPC post filter bass boost
    beta: f32,       // LPC post filter parameters
    gamma: f32,

    xq_enc: [f32; 2], // joint pitch and energy VQ states
    xq_dec: [f32; 2],

    smoothing: i32, // enable smoothing for channels with errors
    se: f32,        // running sum of squared error
    nse: u32,       // number of terms in sum
    post_filter_en: i32,
}

/*---------------------------------------------------------------------------*\

  FUNCTION....: make_synthesis_window
  AUTHOR......: David Rowe, conversion by Matt Weeks
  DATE CREATED: 11/5/94

  Init function that generates the trapezoidal (Parzen) synthesis window.

\*---------------------------------------------------------------------------*/
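// An illustrative summary of the loops below: the window is zero, ramps
// linearly up to 1.0 over [n_samp/2 - tw, n_samp/2 + tw], stays at 1.0, then
// ramps back down to zero over [3*n_samp/2 - tw, 3*n_samp/2 + tw].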
fn make_synthesis_window(c2const: &C2const, Pn: &mut [f32]) {
    let n_samp = c2const.n_samp;
    let tw = c2const.tw;

    // Generate Parzen window in time domain
    for i in 0..n_samp / 2 - tw {
        Pn[i] = 0.0;
    }
    let mut win = 0.0;
    for i in n_samp / 2 - tw..n_samp / 2 + tw {
        Pn[i] = win;
        win += 1.0 / (2.0 * tw as f32);
    }
    for i in n_samp / 2 + tw..3 * n_samp / 2 - tw {
        Pn[i] = 1.0;
    }
    win = 1.0;
    for i in 3 * n_samp / 2 - tw..3 * n_samp / 2 + tw {
        Pn[i] = win;
        win -= 1.0 / (2.0 * tw as f32);
    }
    for i in 3 * n_samp / 2 + tw..2 * n_samp {
        Pn[i] = 0.0;
    }
}

fn make_analysis_window(
    c2const: &C2const,
    fft_fwd_cfg: &codec2_fft_cfg,
    w: &mut [f32],
    W: &mut [f32],
) {
    let mut wshift = [kiss_fft_cpx::new(); FFT_ENC];
    let m_pitch = c2const.m_pitch;
    let nw = c2const.nw;

    /*
       Generate Hamming window centered on M-sample pitch analysis window

       0            M/2           M-1
       |-------------|-------------|
             |-------|-------|
                 nw samples

       All our analysis/synthesis is centred on the M/2 sample.
    */

    let mut m = 0.0;
    for i in 0..m_pitch / 2 - nw / 2 {
        w[i] = 0.0;
    }
    let mut j = 0;
    for i in m_pitch / 2 - nw / 2..m_pitch / 2 + nw / 2 {
        w[i] = 0.5 - 0.5 * (TWO_PI * (j as f32) / ((nw as f32) - 1.0)).cos();
        m += w[i] * w[i];
        j += 1;
    }
    for i in m_pitch / 2 + nw / 2..m_pitch {
        w[i] = 0.0;
    }

    /* Normalise - makes freq domain amplitude estimation straightforward */

    m = 1.0 / (m * FFT_ENC as f32).sqrt();
    for i in 0..m_pitch {
        w[i] *= m;
    }

    /*
       Generate DFT of analysis window, used for later processing. Note
       we modulo FFT_ENC shift the time domain window w[], this makes the
       imaginary part of the DFT W[] equal to zero as the shifted w[] is
       even about the n=0 time axis if nw is odd. Having the imag part
       of the DFT W[] equal to zero makes computation easier.

       0                      FFT_ENC-1
       |-------------------------|

        ----\               /----
             \             /
              \           /          <- shifted version of window w[n]
               \         /
                \       /
                 -------

       |---------|       |---------|
         nw/2              nw/2
    */

    let mut temp = [kiss_fft_cpx::new(); FFT_ENC];

    for i in 0..FFT_ENC {
        wshift[i].r = 0.0;
        wshift[i].i = 0.0;
    }
    for i in 0..(nw / 2) {
        wshift[i].r = w[i + (m_pitch) / 2];
    }
    let mut j = m_pitch / 2 - nw / 2;
    for i in FFT_ENC - nw / 2..FFT_ENC {
        wshift[i].r = w[j];
        j += 1;
    }

    codec2_fft(fft_fwd_cfg, &wshift[..], &mut temp);

    /*
       Re-arrange W[] to be symmetrical about FFT_ENC/2. Makes later
       analysis convenient.

       Before:

       0                 FFT_ENC-1
       |----------|---------|
       __                   _
         \                 /
          \_______________/

       After:

       0                 FFT_ENC-1
       |----------|---------|
                 ___
                /   \
       ________/     \_______

    */

    for i in 0..FFT_ENC / 2 {
        W[i] = temp[i + FFT_ENC / 2].r;
        W[i + FFT_ENC / 2] = temp[i].r;
    }
}

const WordSize: usize = 8;
const IndexMask: usize = 0x7;
const ShiftRight: usize = 3;
fn pack(
    bitArray: &mut [u8],  // The output bit string.
    bitIndex: &mut usize, // Index into the string in BITS, not bytes.
    field: i32,           // The bit field to be packed.
    fieldWidth: u32,      // Width of the field in BITS, not bytes.
) {
    pack_natural_or_gray(bitArray, bitIndex, field, fieldWidth, 1);
}

/** Pack a bit field into a bit string, encoding the field in Gray code.
 *
 * The output is an array of unsigned char data. The fields are efficiently
 * packed into the bit string. The Gray coding is a naive attempt to reduce
 * the effect of single-bit errors, we expect to do a better job as the
 * codec develops.
 *
 * This code would be simpler if it just set one bit at a time in the string,
 * but would hit the same cache line more often. I'm not sure the complexity
 * gains us anything here.
 *
 * Although field is currently of int type rather than unsigned for
 * compatibility with the rest of the code, indices are always expected to
 * be >= 0.
 */
fn pack_natural_or_gray(
    bitArray: &mut [u8],  // The output bit string.
    bitIndex: &mut usize, // Index into the string in BITS, not bytes.
    mut field: i32,       // The bit field to be packed.
    mut fieldWidth: u32,  // Width of the field in BITS, not bytes.
    gray: u32,            // non-zero for gray coding
) {
    if gray > 0 {
        // Convert the field to Gray code
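        // (illustrative: binary 0b101 maps to Gray 0b111)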
        field = (field >> 1) ^ field;
    }

    while fieldWidth != 0 {
        let bI = *bitIndex;
        let bitsLeft = (WordSize - (bI & IndexMask)) as u32;
        let sliceWidth = if bitsLeft < fieldWidth {
            bitsLeft
        } else {
            fieldWidth
        };
        let wordIndex = bI >> ShiftRight;

        bitArray[wordIndex] |=
            (((field >> (fieldWidth - sliceWidth)) << (bitsLeft - sliceWidth)) & 0xFF) as u8;

        *bitIndex = bI + sliceWidth as usize;
        fieldWidth -= sliceWidth;
    }
}

/** Unpack a field from a bit string, converting from Gray code to binary.
 *
 */
fn unpack(
    bitArray: &[u8],    // The input bit string.
    bitIndex: &mut u32, // Index into the string in BITS, not bytes.
    fieldWidth: u32,    // Width of the field in BITS, not bytes.
) -> i32 {
    unpack_natural_or_gray(bitArray, bitIndex, fieldWidth, 1)
}

/** Unpack a field from a bit string, to binary, optionally using
 * natural or Gray code.
 *
 */
fn unpack_natural_or_gray(
    bitArray: &[u8],     // The input bit string.
    bitIndex: &mut u32,  // Index into the string in BITS, not bytes.
    mut fieldWidth: u32, // Width of the field in BITS, not bytes.
    gray: u32,           // non-zero for Gray coding
) -> i32 {
    let mut field = 0;
    while fieldWidth != 0 {
        let bI = *bitIndex;
        let bitsLeft = WordSize as u32 - (bI & IndexMask as u32);
        let sliceWidth = if bitsLeft < fieldWidth {
            bitsLeft
        } else {
            fieldWidth
        };

        field |= ((bitArray[(bI >> ShiftRight) as usize] as usize >> (bitsLeft - sliceWidth))
            & ((1 << sliceWidth) - 1))
            << (fieldWidth - sliceWidth);

        *bitIndex = bI + sliceWidth;
        fieldWidth -= sliceWidth;
    }

    if gray != 0 {
        // Convert from Gray code to binary. Works for maximum 8-bit fields.
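        // (illustrative: Gray 0b111 decodes back to binary 0b101, inverting
        // the (field >> 1) ^ field encoding in pack_natural_or_gray())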
        let mut t = field ^ (field >> 8);
        t ^= t >> 4;
        t ^= t >> 2;
        t ^= t >> 1;
        t as i32
    } else {
        field as i32
    }
}

/// Codec mode (bitrate). Currently not all bitrates are implemented
#[derive(Clone, Debug, Copy)]
#[non_exhaustive]
pub enum Codec2Mode {
    MODE_3200,
    MODE_2400,
    MODE_1600,
    MODE_1400,
    MODE_1300,
    MODE_1200,
    //MODE_700C,
    //MODE_450,
    //MODE_450PWB,
}
use Codec2Mode::*;

/// A Codec2 object for encoding or decoding audio
#[derive(Clone, Debug)]
pub struct Codec2 {
    internal: Codec2Internal,
}

impl Codec2 {
    /// Creates a new Codec2 object suitable for encoding or decoding audio
    pub fn new(mode: Codec2Mode) -> Self {
        let c2const = C2const::new(8000, N_S);
        let n_samp = c2const.n_samp;
        let m_pitch = c2const.m_pitch;
        let mut c2 = Self {
            internal: Codec2Internal {
                mode: mode,
                Fs: c2const.Fs,
                n_samp: n_samp,
                m_pitch: m_pitch,
                fft_fwd_cfg: kiss_fft_state::new(FFT_ENC, 0), // forward FFT config
                fftr_fwd_cfg: kiss_fftr_state::new(FFT_ENC, 0), // forward real FFT config
                w: vec![0.0; m_pitch],     // [m_pitch] time domain hamming window
                W: [0.0; FFT_ENC],         // DFT of w[]
                Pn: vec![0.0; 2 * n_samp], // [2*n_samp] trapezoidal synthesis window
                bpf_buf: vec![0.0; BPF_N + 4 * n_samp], // buffer for band pass filter
                Sn: vec![1.0; m_pitch],    // [m_pitch] input speech
                hpf_states: [0.0; 2],      // high pass filter states

                nlp: NLP::new(&c2const), // pitch predictor states
                gray: 1,                 // non-zero for gray encoding

                fftr_inv_cfg: kiss_fftr_state::new(FFT_DEC, 1), // inverse FFT config
                Sn_: vec![0.0; m_pitch], // [2*n_samp] synthesised output speech
                prev_f0_enc: 1.0 / P_MAX_S,
                bg_est: 0.0,
                ex_phase: 0.0,
                prev_model_dec: MODEL::new(c2const.p_max as f32),
                c2const: c2const,
                prev_lsps_dec: [0.0; LPC_ORD],
                prev_e_dec: 1.0,
                lpc_pf: 1,          // LPC post filter on
                bass_boost: 1,      // LPC post filter bass boost
                beta: LPCPF_BETA,   // LPC post filter parameters
                gamma: LPCPF_GAMMA,
                xq_enc: [0.0; 2], // joint pitch and energy VQ states
                xq_dec: [0.0; 2],

                smoothing: 0, // enable smoothing for channels with errors
                se: 0.0,
                nse: 0,
                post_filter_en: 1,
            },
        };
        for i in 0..LPC_ORD {
            c2.internal.prev_lsps_dec[i] = (i as f64 * PI / (LPC_ORD as f64 + 1.0)) as f32;
        }
        make_analysis_window(
            &c2.internal.c2const,
            &c2.internal.fft_fwd_cfg,
            &mut c2.internal.w,
            &mut c2.internal.W,
        );
        make_synthesis_window(&c2.internal.c2const, &mut c2.internal.Pn);
        c2
    }

    /// The number of bits in an encoded (compressed) frame; 64 for the 3200 bit/s mode, 48 for
    /// 2400, and so on per the table in the crate docs.
    pub fn bits_per_frame(&self) -> usize {
        match self.internal.mode {
            MODE_3200 => 64,
            MODE_2400 => 48,
            MODE_1600 => 64,
            MODE_1400 => 56,
            MODE_1300 => 52,
            MODE_1200 => 48,
            // MODE_700C => 28,
            // MODE_450 => 18,
            // MODE_450PWB => 18,
        }
    }

    /// How many samples an encoded (compressed) frame represents; 160 (20ms of speech) for the
    /// 3200 and 2400 bit/s modes, 320 (40ms) for the lower bitrates.
    pub fn samples_per_frame(&self) -> usize {
        match self.internal.mode {
            MODE_3200 | MODE_2400 => 160,
            MODE_1600 | MODE_1400 | MODE_1300 | MODE_1200 /* | MODE_700C | MODE_450*/ => 320,
            //MODE_450PWB => 640,
        }
    }

    /// Encodes speech samples at the current bitrate into `bits_per_frame()`/8 rounded-up output
    /// bytes. For MODE_3200, this is 64 bits or 8 bytes; for MODE_2400, it's 48 bits (6 bytes).
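    ///
    /// A minimal sketch, encoding one frame of silence:
    ///
    /// ```
    /// use codec2::{Codec2, Codec2Mode};
    /// let mut c2 = Codec2::new(Codec2Mode::MODE_3200);
    /// let samps = vec![0i16; c2.samples_per_frame()];
    /// let mut packed = vec![0u8; (c2.bits_per_frame() + 7) / 8];
    /// c2.encode(&mut packed, &samps);
    /// assert_eq!(packed.len(), 8); // 64 bits
    /// ```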
    pub fn encode(&mut self, bits: &mut [u8], speech: &[i16]) {
        match self.internal.mode {
            MODE_3200 => self.codec2_encode_3200(bits, speech),
            MODE_2400 => self.codec2_encode_2400(bits, speech),
            MODE_1600 => self.codec2_encode_1600(bits, speech),
            MODE_1400 => self.codec2_encode_1400(bits, speech),
            MODE_1300 => self.codec2_encode_1300(bits, speech),
            MODE_1200 => self.codec2_encode_1200(bits, speech),
        }
    }

    /// Decodes `bits_per_frame()` compressed bits into `samples_per_frame()` speech samples.
    /// For MODE_3200, the input is 64 bits or 8 bytes; for MODE_2400, it's 48 bits (6 bytes).
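    ///
    /// A minimal sketch, decoding one all-zero frame (real input would come off the channel):
    ///
    /// ```
    /// use codec2::{Codec2, Codec2Mode};
    /// let mut c2 = Codec2::new(Codec2Mode::MODE_2400);
    /// let packed = vec![0u8; (c2.bits_per_frame() + 7) / 8]; // 48 bits = 6 bytes
    /// let mut samps = vec![0i16; c2.samples_per_frame()];
    /// c2.decode(&mut samps, &packed);
    /// ```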
    pub fn decode(&mut self, speech: &mut [i16], bits: &[u8]) {
        match self.internal.mode {
            MODE_3200 => self.codec2_decode_3200(speech, bits),
            MODE_2400 => self.codec2_decode_2400(speech, bits),
            MODE_1600 => self.codec2_decode_1600(speech, bits),
            MODE_1400 => self.codec2_decode_1400(speech, bits),
            MODE_1300 => self.codec2_decode_1300(speech, bits, 0.0),
            MODE_1200 => self.codec2_decode_1200(speech, bits),
        }
    }

    /*---------------------------------------------------------------------------*\

      FUNCTION....: codec2_encode_3200
      AUTHOR......: David Rowe, conversion by Matt Weeks
      DATE CREATED: 13 Sep 2012

      Encodes 160 speech samples (20ms of speech) into 64 bits.

      The codec2 algorithm actually operates internally on 10ms (80
      sample) frames, so we run the encoding algorithm twice. On the
      first frame we just send the voicing bit. On the second frame we
      send all model parameters. Compared to 2400 we use a larger number
      of bits for the LSPs and non-VQ pitch and energy.

      The bit allocation is:

        Parameter                   bits/frame
        --------------------------------------
        Harmonic magnitudes (LSPs)      50
        Pitch (Wo)                       7
        Energy                           5
        Voicing (10ms update)            2
        TOTAL                           64

    \*---------------------------------------------------------------------------*/
    /// Encodes 160 speech samples (20ms of speech) into 64 bits.
    fn codec2_encode_3200(&mut self, bits: &mut [u8], speech: &[i16]) {
        let mut model = MODEL::new(self.internal.c2const.p_max as f32);
        let mut ak = [0.0; LPC_ORD + 1]; //f32
        let mut lsps = [0.0; LPC_ORD]; //f32
        let mut lspd_indexes = [0; LPC_ORD];
        let mut nbit = 0;

        let nbyte = (self.bits_per_frame() + 7) / 8;
        for i in 0..nbyte {
            bits[i] = 0;
        }

        // first 10ms analysis frame - we just want voicing
        self.analyse_one_frame(&mut model, speech);
        pack(bits, &mut nbit, model.voiced, 1);

        // second 10ms analysis frame
        self.analyse_one_frame(&mut model, &speech[self.internal.n_samp..]);
        pack(bits, &mut nbit, model.voiced, 1);
        let Wo_index = encode_Wo(&self.internal.c2const, model.Wo, WO_BITS);
        pack(bits, &mut nbit, Wo_index, WO_BITS as u32); // 1+1+7 = 9 bits

        let e = speech_to_uq_lsps(
            &mut lsps,
            &mut ak,
            &self.internal.Sn,
            &self.internal.w,
            self.internal.m_pitch,
            LPC_ORD,
        );
        let e_index = encode_energy(e, E_BITS);
        pack(bits, &mut nbit, e_index, E_BITS as u32); // 9+5 = 14 bits

        encode_lspds_scalar(&mut lspd_indexes, &lsps, LPC_ORD);
        for i in 0..LSPD_SCALAR_INDEXES {
            pack(bits, &mut nbit, lspd_indexes[i], lspd_bits(i) as u32);
        }
    }

    /*---------------------------------------------------------------------------*\

      FUNCTION....: codec2_decode_3200
      AUTHOR......: David Rowe, conversion by Matt Weeks
      DATE CREATED: 13 Sep 2012

      Decodes a frame of 64 bits into 160 samples (20ms) of speech.

    \*---------------------------------------------------------------------------*/
    /// Decodes a frame of 64 bits into 160 samples (20ms) of speech.
    fn codec2_decode_3200(&mut self, speech: &mut [i16], bits: &[u8]) {
        let mut ak = [[0.0; LPC_ORD + 1]; 2];
        let mut nbit = 0;
        let mut Aw = [COMP::new(); FFT_ENC];

        let mut model = [MODEL::new(self.internal.c2const.p_max as f32); 2];

        // unpack bits from channel ------------------------------------

        /* this will partially fill the model params for the 2 x 10ms
        frames */

        model[0].voiced = unpack(bits, &mut nbit, 1);
        model[1].voiced = unpack(bits, &mut nbit, 1);

        let Wo_index = unpack(bits, &mut nbit, WO_BITS as u32);
        model[1].Wo = decode_Wo(&self.internal.c2const, Wo_index, WO_BITS);
        model[1].L = (PI / model[1].Wo as f64) as usize;

        let mut e = [0.0; 2];
        let e_index = unpack(bits, &mut nbit, E_BITS as u32);
        e[1] = decode_energy(e_index, E_BITS);

        let mut lspd_indexes = [0; LPC_ORD];
        for i in 0..LSPD_SCALAR_INDEXES {
            lspd_indexes[i] = unpack(bits, &mut nbit, lspd_bits(i) as u32) as usize;
        }
        let mut lsps = [[0.0; LPC_ORD]; 2];
        decode_lspds_scalar(&mut lsps[1][0..], &lspd_indexes, LPC_ORD);

        // interpolate ------------------------------------------------

        /* Wo and energy are sampled every 20ms, so we interpolate just 1
        10ms frame between 20ms samples */

        model[0] = interp_Wo(
            &model[0],
            &self.internal.prev_model_dec,
            &model[1],
            self.internal.c2const.Wo_min,
        );
        e[0] = interp_energy(self.internal.prev_e_dec, e[1]);

        /* LSPs are sampled every 20ms so we interpolate the frame in
        between, then recover spectral amplitudes */

        let (lsps0, lsps1) = lsps.split_at_mut(1);
        interpolate_lsp_ver2(
            &mut lsps0[0][0..],
            &self.internal.prev_lsps_dec,
            &mut lsps1[0][0..],
            0.5,
            LPC_ORD,
        );

        for i in 0..2 {
            lsp_to_lpc(&lsps[i][0..], &mut ak[i][0..], LPC_ORD);
            let mut snr = 0.0;
            aks_to_M2(
                &mut self.internal.fftr_fwd_cfg,
                &ak[i][..],
                LPC_ORD,
                &mut model[i],
                e[i],
                &mut snr,
                0,
                0,
                self.internal.lpc_pf,
                self.internal.bass_boost,
                self.internal.beta,
                self.internal.gamma,
                &mut Aw,
            );
            apply_lpc_correction(&mut model[i]);
            self.synthesise_one_frame(
                &mut speech[self.internal.n_samp * i..],
                &mut model[i],
                &mut Aw,
                1.0,
            );
        }

        // update memories for next frame ----------------------------

        self.internal.prev_model_dec = model[1];
        self.internal.prev_e_dec = e[1];
        for i in 0..LPC_ORD {
            self.internal.prev_lsps_dec[i] = lsps[1][i];
        }
    }

    /*---------------------------------------------------------------------------*\

      FUNCTION....: codec2_encode_2400
      AUTHOR......: David Rowe
      DATE CREATED: 21/8/2010

      Encodes 160 speech samples (20ms of speech) into 48 bits.

      The codec2 algorithm actually operates internally on 10ms (80
      sample) frames, so we run the encoding algorithm twice. On the
      first frame we just send the voicing bit. On the second frame we
      send all model parameters.

      The bit allocation is:

        Parameter                   bits/frame
        --------------------------------------
        Harmonic magnitudes (LSPs)      36
        Joint VQ of Energy and Wo        8
        Voicing (10ms update)            2
        Spare                            2
        TOTAL                           48

    \*---------------------------------------------------------------------------*/
    /// Encodes 160 speech samples (20ms of speech) into 48 bits.
    fn codec2_encode_2400(&mut self, bits: &mut [u8], speech: &[i16]) {
        let mut model = MODEL::new(self.internal.c2const.p_max as f32);
        let mut ak = [0.0; LPC_ORD + 1]; //f32
        let mut lsps = [0.0; LPC_ORD]; //f32
        let mut lsp_indexes = [0; LPC_ORD];
        let mut nbit = 0;

        for i in 0..(self.bits_per_frame() + 7) / 8 {
            bits[i] = 0;
        }

        // first 10ms analysis frame - we just want voicing
        self.analyse_one_frame(&mut model, speech);
        pack(bits, &mut nbit, model.voiced, 1);

        // second 10ms analysis frame
        self.analyse_one_frame(&mut model, &speech[self.internal.n_samp..]);
        pack(bits, &mut nbit, model.voiced, 1);
        let e = speech_to_uq_lsps(
            &mut lsps,
            &mut ak,
            &self.internal.Sn,
            &self.internal.w,
            self.internal.m_pitch,
            LPC_ORD,
        );

        let WoE_index = encode_WoE(&model, e, &mut self.internal.xq_enc);
        pack(bits, &mut nbit, WoE_index, WO_E_BITS);

        encode_lsps_scalar(&mut lsp_indexes, &lsps, LPC_ORD);
        for i in 0..LSP_SCALAR_INDEXES {
            pack(bits, &mut nbit, lsp_indexes[i], lsp_bits(i));
        }
        let spare = 0;
        pack(bits, &mut nbit, spare, 2);

        //assert(nbit == (unsigned)codec2_bits_per_frame(c2));
    }

    /*---------------------------------------------------------------------------*\

      FUNCTION....: codec2_decode_2400
      AUTHOR......: David Rowe
      DATE CREATED: 21/8/2010

      Decodes frames of 48 bits into 160 samples (20ms) of speech.

    \*---------------------------------------------------------------------------*/
    fn codec2_decode_2400(&mut self, speech: &mut [i16], bits: &[u8]) {
        let mut model = [MODEL::new(self.internal.c2const.p_max as f32); 2];
        let mut lsp_indexes = [0; LPC_ORD];
        let mut lsps = [[0.0; LPC_ORD]; 2];
        let mut e = [0.0; 2];
        let mut snr = 0.0;
        let mut ak = [[0.0; LPC_ORD + 1]; 2];
        let mut nbit = 0;
        let mut Aw = [COMP::new(); FFT_ENC];

        /* unpack bits from channel ------------------------------------*/

        /* this will partially fill the model params for the 2 x 10ms
        frames */
        model[0].voiced = unpack(bits, &mut nbit, 1);
        model[1].voiced = unpack(bits, &mut nbit, 1);

        let WoE_index = unpack(bits, &mut nbit, WO_E_BITS) as usize;
        decode_WoE(
            &self.internal.c2const,
            &mut model[1],
            &mut e[1],
            &mut self.internal.xq_dec,
            WoE_index,
        );

        for i in 0..LSP_SCALAR_INDEXES {
            lsp_indexes[i] = unpack(bits, &mut nbit, lsp_bits(i)) as usize;
        }
        decode_lsps_scalar(&mut lsps[1][0..], &lsp_indexes, LPC_ORD);
        check_lsp_order(&mut lsps[1][0..], LPC_ORD);
        bw_expand_lsps(&mut lsps[1][0..], LPC_ORD, 50.0, 100.0);

        /* interpolate ------------------------------------------------*/

        /* Wo and energy are sampled every 20ms, so we interpolate just 1
        10ms frame between 20ms samples */

        model[0] = interp_Wo(
            &model[0],
            &self.internal.prev_model_dec,
            &model[1],
            self.internal.c2const.Wo_min,
        );
        e[0] = interp_energy(self.internal.prev_e_dec, e[1]);

        /* LSPs are sampled every 20ms so we interpolate the frame in
        between, then recover spectral amplitudes */

        let (lsps0, lsps1) = lsps.split_at_mut(1);
        interpolate_lsp_ver2(
            &mut lsps0[0][0..],
            &self.internal.prev_lsps_dec,
            &mut lsps1[0][0..],
            0.5,
            LPC_ORD,
        );

        for i in 0..2 {
            lsp_to_lpc(&lsps[i][0..], &mut ak[i][0..], LPC_ORD);
            aks_to_M2(
                &mut self.internal.fftr_fwd_cfg,
                &ak[i][..],
                LPC_ORD,
                &mut model[i],
                e[i],
                &mut snr,
                0,
                0,
                self.internal.lpc_pf,
                self.internal.bass_boost,
                self.internal.beta,
                self.internal.gamma,
                &mut Aw,
            );
            apply_lpc_correction(&mut model[i]);
            self.synthesise_one_frame(
                &mut speech[self.internal.n_samp * i..],
                &mut model[i],
                &mut Aw,
                1.0,
            );
        }

        /* update memories for next frame ----------------------------*/
        self.internal.prev_model_dec = model[1];
        self.internal.prev_e_dec = e[1];
        for i in 0..LPC_ORD {
            self.internal.prev_lsps_dec[i] = lsps[1][i];
        }
    }

    /*---------------------------------------------------------------------------*\

      FUNCTION....: codec2_encode_1600
      AUTHOR......: David Rowe, conversion by Raphael Peters
      DATE CREATED: Feb 28 2013

      Encodes 320 speech samples (40ms of speech) into 64 bits.

      The codec2 algorithm actually operates internally on 10ms (80
      sample) frames, so we run the encoding algorithm 4 times:

        frame 1: voicing bit
        frame 2: voicing bit, Wo and E
        frame 3: voicing bit
        frame 4: voicing bit, Wo and E, scalar LSPs

      The bit allocation is:

        Parameter                   frame 2  frame 4  Total
        -------------------------------------------------------
        Harmonic magnitudes (LSPs)      0       36      36
        Pitch (Wo)                      7        7      14
        Energy                          5        5      10
        Voicing (10ms update)           2        2       4
        TOTAL                          14       50      64

    \*---------------------------------------------------------------------------*/
    /// Encodes 320 speech samples (40ms of speech) into 64 bits.
    fn codec2_encode_1600(&mut self, bits: &mut [u8], speech: &[i16]) {
        let mut model = MODEL::new(self.internal.c2const.p_max as f32);
        let mut ak = [0.0; LPC_ORD + 1]; //f32
        let mut lsps = [0.0; LPC_ORD]; //f32
        let mut lsp_indexes = [0; LPC_ORD];
        let mut nbit = 0;

        for i in 0..(self.bits_per_frame() + 7) / 8 {
            bits[i] = 0;
        }

        // frame 1: - voicing

        self.analyse_one_frame(&mut model, speech);
        pack(bits, &mut nbit, model.voiced, 1);

        // frame 2: - voicing, scalar Wo & E

        self.analyse_one_frame(&mut model, &speech[self.internal.n_samp..]);
        pack(bits, &mut nbit, model.voiced, 1);

        let mut Wo_index = encode_Wo(&self.internal.c2const, model.Wo, WO_BITS);
        pack(bits, &mut nbit, Wo_index, WO_BITS as u32);

        // need to run this just to get LPC energy
        let mut e = speech_to_uq_lsps(
            &mut lsps,
            &mut ak,
            &self.internal.Sn,
            &self.internal.w,
            self.internal.m_pitch,
            LPC_ORD,
        );
        let mut e_index = encode_energy(e, E_BITS);
        pack(bits, &mut nbit, e_index, E_BITS as u32);

        // frame 3: - voicing

        self.analyse_one_frame(&mut model, &speech[2 * self.internal.n_samp..]);
        pack(bits, &mut nbit, model.voiced, 1);

        // frame 4: - voicing, scalar Wo & E, scalar LSPs

        self.analyse_one_frame(&mut model, &speech[3 * self.internal.n_samp..]);
        pack(bits, &mut nbit, model.voiced, 1);

        Wo_index = encode_Wo(&self.internal.c2const, model.Wo, WO_BITS);
        pack(bits, &mut nbit, Wo_index, WO_BITS as u32);

        e = speech_to_uq_lsps(
            &mut lsps,
            &mut ak,
            &self.internal.Sn,
            &self.internal.w,
            self.internal.m_pitch,
            LPC_ORD,
        );
        e_index = encode_energy(e, E_BITS);
        pack(bits, &mut nbit, e_index, E_BITS as u32);

        encode_lsps_scalar(&mut lsp_indexes, &lsps, LPC_ORD);
        for i in 0..LSP_SCALAR_INDEXES {
            pack(bits, &mut nbit, lsp_indexes[i], lsp_bits(i));
        }

        assert_eq!(nbit, self.bits_per_frame());
    }

    /*---------------------------------------------------------------------------*\

      FUNCTION....: codec2_decode_1600
      AUTHOR......: David Rowe, conversion by Raphael Peters
      DATE CREATED: 11 May 2012

      Decodes frames of 64 bits into 320 samples (40ms) of speech.

    \*---------------------------------------------------------------------------*/
    fn codec2_decode_1600(&mut self, speech: &mut [i16], bits: &[u8]) {
        let mut model = [MODEL::new(self.internal.c2const.p_max as f32); 4];
        let mut lsp_indexes = [0; LPC_ORD];
        let mut lsps = [[0.0; LPC_ORD]; 4];
        let mut e = [0.0; 4];
        let mut snr = 0.0;
        let mut ak = [[0.0; LPC_ORD + 1]; 4];
        let mut nbit = 0;
        let mut Aw = [COMP::new(); FFT_ENC];

        /* unpack bits from channel ------------------------------------*/

        /* this will partially fill the model params for the 4 x 10ms
        frames */

        model[0].voiced = unpack(bits, &mut nbit, 1);

        model[1].voiced = unpack(bits, &mut nbit, 1);
        let Wo_index = unpack(bits, &mut nbit, WO_BITS as u32);
        model[1].Wo = decode_Wo(&self.internal.c2const, Wo_index, WO_BITS);
        model[1].L = (PI as f32 / model[1].Wo) as usize;

        let mut e_index = unpack(bits, &mut nbit, E_BITS as u32);
        e[1] = decode_energy(e_index, E_BITS);

        model[2].voiced = unpack(bits, &mut nbit, 1);

        model[3].voiced = unpack(bits, &mut nbit, 1);
        let Wo_index = unpack(bits, &mut nbit, WO_BITS as u32);
        model[3].Wo = decode_Wo(&self.internal.c2const, Wo_index, WO_BITS);
        model[3].L = (PI as f32 / model[3].Wo) as usize;

        e_index = unpack(bits, &mut nbit, E_BITS as u32);
        e[3] = decode_energy(e_index, E_BITS);

        for i in 0..LSP_SCALAR_INDEXES {
            lsp_indexes[i] = unpack(bits, &mut nbit, lsp_bits(i)) as usize;
        }
        decode_lsps_scalar(&mut lsps[3][0..], &lsp_indexes, LPC_ORD);
        check_lsp_order(&mut lsps[3][0..], LPC_ORD);
        bw_expand_lsps(&mut lsps[3][0..], LPC_ORD, 50.0, 100.0);

        /* interpolate ------------------------------------------------*/

        /* Wo and energy are sampled every 20ms, so we interpolate just 1
        10ms frame between 20ms samples */

        model[0] = interp_Wo(
            &model[0],
            &self.internal.prev_model_dec,
            &model[1],
            self.internal.c2const.Wo_min,
        );
        e[0] = interp_energy(self.internal.prev_e_dec, e[1]);
        model[2] = interp_Wo(
            &model[2],
            &model[1],
            &model[3],
            self.internal.c2const.Wo_min,
        );
        e[2] = interp_energy(e[1], e[3]);

        /* LSPs are sampled every 40ms so we interpolate the 3 frames in
        between, then recover spectral amplitudes */

        for i in 0..3 {
            let weight = 0.25 + i as f32 * 0.25;
            let lsps3 = lsps[3];
            interpolate_lsp_ver2(
                &mut lsps[i][0..],
                &self.internal.prev_lsps_dec,
                &lsps3[0..],
                weight,
                LPC_ORD,
            );
        }
        for i in 0..4 {
            lsp_to_lpc(&lsps[i][0..], &mut ak[i][0..], LPC_ORD);
            aks_to_M2(
                &mut self.internal.fftr_fwd_cfg,
                &ak[i][0..],
                LPC_ORD,
                &mut model[i],
                e[i],
                &mut snr,
                0,
                0,
                self.internal.lpc_pf,
                self.internal.bass_boost,
                self.internal.beta,
                self.internal.gamma,
                &mut Aw,
            );
            apply_lpc_correction(&mut model[i]);
            self.synthesise_one_frame(
                &mut speech[self.internal.n_samp * i..],
                &mut model[i],
                &Aw,
                1.0,
            );
        }

        /* update memories for next frame ----------------------------*/

        self.internal.prev_model_dec = model[3];
        self.internal.prev_e_dec = e[3];
        for i in 0..LPC_ORD {
            self.internal.prev_lsps_dec[i] = lsps[3][i];
        }
    }

    /*---------------------------------------------------------------------------*\

      FUNCTION....: codec2_encode_1400
      AUTHOR......: David Rowe, conversion by Raphael Peters
      DATE CREATED: May 11 2012

      Encodes 320 speech samples (40ms of speech) into 56 bits.

      The codec2 algorithm actually operates internally on 10ms (80
      sample) frames, so we run the encoding algorithm 4 times:

        frame 1: voicing bit
        frame 2: voicing bit, joint VQ of Wo and E
        frame 3: voicing bit
        frame 4: voicing bit, joint VQ of Wo and E, scalar LSPs

      The bit allocation is:

        Parameter                   frame 2  frame 4  Total
        -------------------------------------------------------
        Harmonic magnitudes (LSPs)      0       36      36
        Energy+Wo                       8        8      16
        Voicing (10ms update)           2        2       4
        TOTAL                          10       46      56

    \*---------------------------------------------------------------------------*/
    fn codec2_encode_1400(&mut self, bits: &mut [u8], speech: &[i16]) {
        let mut model = MODEL::new(self.internal.c2const.p_max as f32);
        let mut lsps = [0.0; LPC_ORD]; //f32
        let mut ak = [0.0; LPC_ORD + 1]; //f32
        let mut lsp_indexes = [0; LPC_ORD];
        let mut nbit = 0;

        let nbyte = (self.bits_per_frame() + 7) / 8;
        for i in 0..nbyte {
            bits[i] = 0;
        }

        // frame 1: - voicing

        self.analyse_one_frame(&mut model, speech);
        pack(bits, &mut nbit, model.voiced, 1);

        // frame 2: - voicing, joint Wo & E

        self.analyse_one_frame(&mut model, &speech[self.internal.n_samp..]);
        pack(bits, &mut nbit, model.voiced, 1);

        // need to run this just to get LPC energy
        let e = speech_to_uq_lsps(
            &mut lsps,
            &mut ak,
            &self.internal.Sn,
            &self.internal.w,
            self.internal.m_pitch,
            LPC_ORD,
        );

        let WoE_index = encode_WoE(&model, e, &mut self.internal.xq_enc);
        pack(bits, &mut nbit, WoE_index, WO_E_BITS);

        // frame 3: - voicing

        self.analyse_one_frame(&mut model, &speech[2 * self.internal.n_samp..]);
        pack(bits, &mut nbit, model.voiced, 1);

        // frame 4: - voicing, joint Wo & E, scalar LSPs

        self.analyse_one_frame(&mut model, &speech[3 * self.internal.n_samp..]);
        pack(bits, &mut nbit, model.voiced, 1);

        let e = speech_to_uq_lsps(
            &mut lsps,
            &mut ak,
            &self.internal.Sn,
            &self.internal.w,
            self.internal.m_pitch,
            LPC_ORD,
        );
        let WoE_index = encode_WoE(&model, e, &mut self.internal.xq_enc);
        pack(bits, &mut nbit, WoE_index, WO_E_BITS);

        encode_lsps_scalar(&mut lsp_indexes, &lsps, LPC_ORD);
        for i in 0..LSP_SCALAR_INDEXES {
            pack(bits, &mut nbit, lsp_indexes[i], lsp_bits(i));
        }

        assert_eq!(nbit, self.bits_per_frame());
    }

    /*---------------------------------------------------------------------------*\

      FUNCTION....: codec2_decode_1400
      AUTHOR......: David Rowe, conversion by Raphael Peters
      DATE CREATED: 11 May 2012

      Decodes frames of 56 bits into 320 samples (40ms) of speech.

    \*---------------------------------------------------------------------------*/
    fn codec2_decode_1400(&mut self, speech: &mut [i16], bits: &[u8]) {
        let mut model = [MODEL::new(self.internal.c2const.p_max as f32); 4];
        let mut lsp_indexes = [0; LPC_ORD];
        let mut lsps = [[0.0; LPC_ORD]; 4];
        let mut e = [0.0; 4];
        let mut snr = 0.0;
        let mut ak = [[0.0; LPC_ORD + 1]; 4];
        let mut nbit = 0;
        let mut Aw = [COMP::new(); FFT_ENC];

        /* unpack bits from channel ------------------------------------*/

        /* this will partially fill the model params for the 4 x 10ms
        frames */

        model[0].voiced = unpack(bits, &mut nbit, 1);

        model[1].voiced = unpack(bits, &mut nbit, 1);
        let WoE_index = unpack(bits, &mut nbit, WO_E_BITS);
        decode_WoE(
            &self.internal.c2const,
            &mut model[1],
            &mut e[1],
            &mut self.internal.xq_dec,
            WoE_index as usize,
        );

        model[2].voiced = unpack(bits, &mut nbit, 1);

        model[3].voiced = unpack(bits, &mut nbit, 1);
        let WoE_index = unpack(bits, &mut nbit, WO_E_BITS);
        decode_WoE(
            &self.internal.c2const,
            &mut model[3],
            &mut e[3],
            &mut self.internal.xq_dec,
            WoE_index as usize,
        );

        for i in 0..LSP_SCALAR_INDEXES {
            lsp_indexes[i] = unpack(bits, &mut nbit, lsp_bits(i)) as usize;
        }
        decode_lsps_scalar(&mut lsps[3][0..], &lsp_indexes, LPC_ORD);
        check_lsp_order(&mut lsps[3][0..], LPC_ORD);
        bw_expand_lsps(&mut lsps[3][0..], LPC_ORD, 50.0, 100.0);

        // interpolate

        /* Wo and energy are sampled every 20ms, so we interpolate just 1
        10ms frame between 20ms samples */

        model[0] = interp_Wo(
            &model[0],
            &self.internal.prev_model_dec,
            &model[1],
            self.internal.c2const.Wo_min,
        );
        e[0] = interp_energy(self.internal.prev_e_dec, e[1]);
        model[2] = interp_Wo(
            &model[2],
            &model[1],
            &model[3],
            self.internal.c2const.Wo_min,
        );
        e[2] = interp_energy(e[1], e[3]);

        /* LSPs are sampled every 40ms so we interpolate the 3 frames in
        between, then recover spectral amplitudes */

        for i in 0..3 {
            let weight = 0.25 + i as f32 * 0.25;
            let lsps3 = lsps[3];
            interpolate_lsp_ver2(
                &mut lsps[i][0..],
                &self.internal.prev_lsps_dec,
                &lsps3[0..],
                weight,
                LPC_ORD,
            );
        }
        for i in 0..4 {
            lsp_to_lpc(&lsps[i][0..], &mut ak[i][0..], LPC_ORD);
            aks_to_M2(
                &mut self.internal.fftr_fwd_cfg,
                &ak[i][0..],
                LPC_ORD,
                &mut model[i],
                e[i],
                &mut snr,
                0,
                0,
                self.internal.lpc_pf,
                self.internal.bass_boost,
                self.internal.beta,
                self.internal.gamma,
                &mut Aw,
            );
            apply_lpc_correction(&mut model[i]);
            self.synthesise_one_frame(
                &mut speech[self.internal.n_samp * i..],
                &mut model[i],
                &Aw,
                1.0,
            );
        }

        /* update memories for next frame ----------------------------*/

        self.internal.prev_model_dec = model[3];
        self.internal.prev_e_dec = e[3];
        for i in 0..LPC_ORD {
            self.internal.prev_lsps_dec[i] = lsps[3][i];
        }
    }

    /*---------------------------------------------------------------------------*\

      FUNCTION....: codec2_encode_1300
      AUTHOR......: David Rowe, conversion by Raphael Peters
      DATE CREATED: March 14 2013

      Encodes 320 speech samples (40ms of speech) into 52 bits.

      The codec2 algorithm actually operates internally on 10ms (80
      sample) frames, so we run the encoding algorithm 4 times:

        frame 1: voicing bit
        frame 2: voicing bit
        frame 3: voicing bit
        frame 4: voicing bit, Wo and E, scalar LSPs

      The bit allocation is:

        Parameter                   frame 2  frame 4  Total
        -------------------------------------------------------
        Harmonic magnitudes (LSPs)      0       36      36
        Pitch (Wo)                      0        7       7
        Energy                          0        5       5
        Voicing (10ms update)           2        2       4
        TOTAL                           2       50      52

    \*---------------------------------------------------------------------------*/
    fn codec2_encode_1300(&mut self, bits: &mut [u8], speech: &[i16]) {
        let mut model = MODEL::new(self.internal.c2const.p_max as f32);
        let mut lsps = [0.0; LPC_ORD]; //f32
        let mut ak = [0.0; LPC_ORD + 1]; //f32
        let mut lsp_indexes = [0; LPC_ORD];
        let mut nbit = 0;

        let nbyte = (self.bits_per_frame() + 7) / 8;
        for i in 0..nbyte {
            bits[i] = 0;
        }

        // frame 1: - voicing

        self.analyse_one_frame(&mut model, speech);
        pack_natural_or_gray(bits, &mut nbit, model.voiced, 1, self.internal.gray as u32);

        /* frame 2: - voicing ---------------------------------------------*/

        self.analyse_one_frame(&mut model, &speech[self.internal.n_samp..]);
        pack_natural_or_gray(bits, &mut nbit, model.voiced, 1, self.internal.gray as u32);

        // frame 3: - voicing

        self.analyse_one_frame(&mut model, &speech[2 * self.internal.n_samp..]);
        pack_natural_or_gray(bits, &mut nbit, model.voiced, 1, self.internal.gray as u32);

        // frame 4: - voicing, scalar Wo & E, scalar LSPs

        self.analyse_one_frame(&mut model, &speech[3 * self.internal.n_samp..]);
        pack_natural_or_gray(bits, &mut nbit, model.voiced, 1, self.internal.gray as u32);

        let Wo_index = encode_Wo(&self.internal.c2const, model.Wo, WO_BITS);
        pack_natural_or_gray(
            bits,
            &mut nbit,
            Wo_index,
            WO_BITS as u32,
            self.internal.gray as u32,
        );

        let e = speech_to_uq_lsps(
            &mut lsps,
            &mut ak,
            &self.internal.Sn,
            &self.internal.w,
            self.internal.m_pitch,
            LPC_ORD,
        );
        let e_index = encode_energy(e, E_BITS);
        pack_natural_or_gray(
            bits,
            &mut nbit,
            e_index,
            E_BITS as u32,
            self.internal.gray as u32,
        );

        encode_lsps_scalar(&mut lsp_indexes, &lsps, LPC_ORD);
        for i in 0..LSP_SCALAR_INDEXES {
            pack_natural_or_gray(
                bits,
                &mut nbit,
                lsp_indexes[i],
                lsp_bits(i),
                self.internal.gray as u32,
            );
        }

        assert_eq!(nbit, self.bits_per_frame());
    }

    /*---------------------------------------------------------------------------*\

      FUNCTION....: codec2_decode_1300
      AUTHOR......: David Rowe, conversion by Raphael Peters
      DATE CREATED: 11 May 2012

      Decodes frames of 52 bits into 320 samples (40ms) of speech.

    \*---------------------------------------------------------------------------*/
    fn codec2_decode_1300(&mut self, speech: &mut [i16], bits: &[u8], ber_est: f32) {
        let mut model = [MODEL::new(self.internal.c2const.p_max as f32); 4];
        let mut lsp_indexes = [0; LPC_ORD];
        let mut lsps = [[0.0; LPC_ORD]; 4];
        let mut e = [0.0; 4];
        let mut snr = 0.0;
        let mut ak = [[0.0; LPC_ORD + 1]; 4];
        let mut nbit = 0;
        let mut Aw = [COMP::new(); FFT_ENC];

        /* unpack bits from channel ------------------------------------*/

        /* this will partially fill the model params for the 4 x 10ms
        frames */

        model[0].voiced = unpack_natural_or_gray(bits, &mut nbit, 1, self.internal.gray as u32);
        model[1].voiced = unpack_natural_or_gray(bits, &mut nbit, 1, self.internal.gray as u32);
        model[2].voiced = unpack_natural_or_gray(bits, &mut nbit, 1, self.internal.gray as u32);
        model[3].voiced = unpack_natural_or_gray(bits, &mut nbit, 1, self.internal.gray as u32);

        let Wo_index =
            unpack_natural_or_gray(bits, &mut nbit, WO_BITS as u32, self.internal.gray as u32);
        model[3].Wo = decode_Wo(&self.internal.c2const, Wo_index, WO_BITS);
        model[3].L = (PI as f32 / model[3].Wo) as usize;

        let e_index =
            unpack_natural_or_gray(bits, &mut nbit, E_BITS as u32, self.internal.gray as u32);
        e[3] = decode_energy(e_index, E_BITS);

        for i in 0..LSP_SCALAR_INDEXES {
            lsp_indexes[i] =
                unpack_natural_or_gray(bits, &mut nbit, lsp_bits(i), self.internal.gray as u32)
                    as usize;
        }
        decode_lsps_scalar(&mut lsps[3][0..], &lsp_indexes, LPC_ORD);
        check_lsp_order(&mut lsps[3][0..], LPC_ORD);
        bw_expand_lsps(&mut lsps[3][0..], LPC_ORD, 50.0, 100.0);

        if ber_est > 0.15 {
            model[0].voiced = 0;
            model[1].voiced = 0;
            model[2].voiced = 0;
            model[3].voiced = 0;
            e[3] = decode_energy(10, E_BITS);
            bw_expand_lsps(&mut lsps[3][0..], LPC_ORD, 200.0, 200.0);
            //fprintf(stderr, "soft mute\n");
        }

        // interpolate

        /* Wo, energy, and LSPs are sampled every 40ms so we interpolate
        the 3 frames in between */

        for i in 0..3 {
            let weight = 0.25 + i as f32 * 0.25;
            let lsps3 = lsps[3];
            interpolate_lsp_ver2(
                &mut lsps[i][0..],
                &self.internal.prev_lsps_dec,
                &lsps3[0..],
                weight,
                LPC_ORD,
            );
            model[i] = interp_Wo2(
                &model[i],
                &self.internal.prev_model_dec,
                &model[3],
                weight,
                self.internal.c2const.Wo_min,
            );
            e[i] = interp_energy2(self.internal.prev_e_dec, e[3], weight);
        }

        /* then recover spectral amplitudes */

        for i in 0..4 {
            lsp_to_lpc(&lsps[i][0..], &mut ak[i][0..], LPC_ORD);
            aks_to_M2(
                &mut self.internal.fftr_fwd_cfg,
                &ak[i][0..],
                LPC_ORD,
                &mut model[i],
                e[i],
                &mut snr,
                0,
                0,
                self.internal.lpc_pf,
                self.internal.bass_boost,
                self.internal.beta,
                self.internal.gamma,
                &mut Aw,
            );
            apply_lpc_correction(&mut model[i]);
            self.synthesise_one_frame(
                &mut speech[self.internal.n_samp * i..],
                &mut model[i],
                &Aw,
                1.0,
            );

            /* dump parameters for deep learning experiments */
            /*
            if (c2 -> fmlfeat != NULL) {
                /* 10 LSPs - energy - Wo - voicing flag - 10 LPCs */
                fwrite(&lsps[i][0], LPC_ORD, sizeof(float), self.internal.fmlfeat);
                fwrite(&e[i], 1, sizeof(float), self.internal.fmlfeat);
                fwrite(&model[i].Wo, 1, sizeof(float), self.internal.fmlfeat);
                float voiced_float = model[i].voiced;
                fwrite(&voiced_float, 1, sizeof(float), self.internal.fmlfeat);
                fwrite(&ak[i][1], LPC_ORD, sizeof(float), self.internal.fmlfeat);
            }
            */
        }

        /*
        #ifdef DUMP
            dump_lsp_(&lsps[3][0]);
            dump_ak_(&ak[3][0], LPC_ORD);
        #endif
        */

        /* update memories for next frame ----------------------------*/

        self.internal.prev_model_dec = model[3];
        self.internal.prev_e_dec = e[3];
        for i in 0..LPC_ORD {
            self.internal.prev_lsps_dec[i] = lsps[3][i];
        }
    }

    /*---------------------------------------------------------------------------*\

      FUNCTION....: codec2_encode_1200
      AUTHOR......: David Rowe, conversion by Raphael Peters
      DATE CREATED: Nov 14 2011

      Encodes 320 speech samples (40ms of speech) into 48 bits.

      The codec2 algorithm actually operates internally on 10ms (80
      sample) frames, so we run the encoding algorithm four times:

      frame 1: voicing bit
      frame 2: voicing bit, joint VQ of Wo and E
      frame 3: voicing bit
      frame 4: voicing bit, joint VQ of Wo and E, VQ LSPs

      The bit allocation is:

        Parameter                    frames 1-2  frames 3-4  Total
        -----------------------------------------------------------
        Harmonic magnitudes (LSPs)        0          27        27
        Energy+Wo                         8           8        16
        Voicing (10ms update)             2           2         4
        Spare                             0           1         1
        TOTAL                            10          38        48

    \*---------------------------------------------------------------------------*/
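    // Sanity check on the bit budget: 27 + 16 + 4 + 1 = 48 bits per
    // 320-sample (40ms) frame, i.e. 48 / 0.040s = 1200 bit/s, hence the
    // mode name.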
    fn codec2_encode_1200(&mut self, bits: &mut [u8], speech: &[i16]) {
        let mut model = MODEL::new(self.internal.c2const.p_max as f32);
        let mut lsps = [0.0; LPC_ORD]; //f32
        let mut lsps_ = [0.0; LPC_ORD]; //f32
        let mut ak = [0.0; LPC_ORD + 1]; //f32
        let mut lsp_indexes = [0; LPC_ORD];
        let spare = 0;
        let mut nbit = 0;

        let nbyte = (self.bits_per_frame() + 7) / 8;
        for i in 0..nbyte {
            bits[i] = 0;
        }

        // frame 1: - voicing

        self.analyse_one_frame(&mut model, speech);
        pack(bits, &mut nbit, model.voiced, 1);

        // frame 2: - voicing, joint Wo & E

        self.analyse_one_frame(&mut model, &speech[self.internal.n_samp..]);
        pack(bits, &mut nbit, model.voiced, 1);

        // need to run this just to get LPC energy
        let e = speech_to_uq_lsps(
            &mut lsps,
            &mut ak,
            &self.internal.Sn,
            &self.internal.w,
            self.internal.m_pitch,
            LPC_ORD,
        );

        let WoE_index = encode_WoE(&model, e, &mut self.internal.xq_enc);
        pack(bits, &mut nbit, WoE_index, WO_E_BITS);

        // frame 3: - voicing

        self.analyse_one_frame(&mut model, &speech[2 * self.internal.n_samp..]);
        pack(bits, &mut nbit, model.voiced, 1);

        // frame 4: - voicing, joint Wo & E, VQ LSPs

        self.analyse_one_frame(&mut model, &speech[3 * self.internal.n_samp..]);
        pack(bits, &mut nbit, model.voiced, 1);

        let e = speech_to_uq_lsps(
            &mut lsps,
            &mut ak,
            &self.internal.Sn,
            &self.internal.w,
            self.internal.m_pitch,
            LPC_ORD,
        );
        let WoE_index = encode_WoE(&model, e, &mut self.internal.xq_enc);
        pack(bits, &mut nbit, WoE_index, WO_E_BITS);

        encode_lsps_vq(&mut lsp_indexes, &mut lsps, &mut lsps_, LPC_ORD);
        for i in 0..LSP_PRED_VQ_INDEXES {
            pack(
                bits,
                &mut nbit,
                lsp_indexes[i] as i32,
                lsp_pred_vq_bits(i) as u32,
            );
        }
        pack(bits, &mut nbit, spare, 1);

        assert_eq!(nbit, self.bits_per_frame());
    }

    /*---------------------------------------------------------------------------*\

      FUNCTION....: codec2_decode_1200
      AUTHOR......: David Rowe, conversion by Raphael Peters
      DATE CREATED: 14 Feb 2012

      Decodes frames of 48 bits into 320 samples (40ms) of speech.

    \*---------------------------------------------------------------------------*/

    fn codec2_decode_1200(&mut self, speech: &mut [i16], bits: &[u8]) {
        let mut model = [MODEL::new(self.internal.c2const.p_max as f32); 4];
        let mut lsp_indexes = [0; LPC_ORD];
        let mut lsps = [[0.0; LPC_ORD]; 4];
        let mut e = [0.0; 4];
        let mut snr = 0.0;
        let mut ak = [[0.0; LPC_ORD + 1]; 4];
        let mut nbit = 0;
        let mut Aw = [COMP::new(); FFT_ENC];

        // only need to zero these out due to (unused) snr calculation
        for i in 0..4 {
            for j in 1..MAX_AMP {
                model[i].A[j] = 0.0;
            }
        }

        /* unpack bits from channel ------------------------------------*/

        /* this will partially fill the model params for the 4 x 10ms
        frames */

        model[0].voiced = unpack(bits, &mut nbit, 1);

        model[1].voiced = unpack(bits, &mut nbit, 1);
        let WoE_index = unpack(bits, &mut nbit, WO_E_BITS);
        decode_WoE(
            &self.internal.c2const,
            &mut model[1],
            &mut e[1],
            &mut self.internal.xq_dec,
            WoE_index as usize,
        );

        model[2].voiced = unpack(bits, &mut nbit, 1);

        model[3].voiced = unpack(bits, &mut nbit, 1);
        let WoE_index = unpack(bits, &mut nbit, WO_E_BITS);
        decode_WoE(
            &self.internal.c2const,
            &mut model[3],
            &mut e[3],
            &mut self.internal.xq_dec,
            WoE_index as usize,
        );

        for i in 0..LSP_PRED_VQ_INDEXES {
            lsp_indexes[i] = unpack(bits, &mut nbit, lsp_pred_vq_bits(i) as u32) as usize;
        }
        decode_lsps_vq(&lsp_indexes, &mut lsps[3][0..], LPC_ORD, 0);
        check_lsp_order(&mut lsps[3][0..], LPC_ORD);
        bw_expand_lsps(&mut lsps[3][0..], LPC_ORD, 50.0, 100.0);

        // interpolate

        /* Wo and energy are sampled every 20ms, so we interpolate just 1
        10ms frame between 20ms samples */
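        /* Concretely: frame 0 is interpolated midway between the previous
        frame 3 and the newly decoded frame 1, and frame 2 midway between
        frames 1 and 3. */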
        model[0] = interp_Wo(
            &model[0],
            &self.internal.prev_model_dec,
            &model[1],
            self.internal.c2const.Wo_min,
        );
        e[0] = interp_energy(self.internal.prev_e_dec, e[1]);
        model[2] = interp_Wo(
            &model[2],
            &model[1],
            &model[3],
            self.internal.c2const.Wo_min,
        );
        e[2] = interp_energy(e[1], e[3]);

        /* LSPs are sampled every 40ms so we interpolate the 3 frames in
        between, then recover spectral amplitudes */

        for i in 0..3 {
            let weight = 0.25 + i as f32 * 0.25;
            let lsps3 = lsps[3];
            interpolate_lsp_ver2(
                &mut lsps[i][0..],
                &self.internal.prev_lsps_dec,
                &lsps3[0..],
                weight,
                LPC_ORD,
            );
        }
        for i in 0..4 {
            lsp_to_lpc(&lsps[i][0..], &mut ak[i][0..], LPC_ORD);
            aks_to_M2(
                &mut self.internal.fftr_fwd_cfg,
                &ak[i][0..],
                LPC_ORD,
                &mut model[i],
                e[i],
                &mut snr,
                0,
                0,
                self.internal.lpc_pf,
                self.internal.bass_boost,
                self.internal.beta,
                self.internal.gamma,
                &mut Aw,
            );
            apply_lpc_correction(&mut model[i]);
            self.synthesise_one_frame(
                &mut speech[self.internal.n_samp * i..],
                &mut model[i],
                &Aw,
                1.0,
            );
        }

        /* update memories for next frame ----------------------------*/

        self.internal.prev_model_dec = model[3];
        self.internal.prev_e_dec = e[3];
        for i in 0..LPC_ORD {
            self.internal.prev_lsps_dec[i] = lsps[3][i];
        }
    }

    /*---------------------------------------------------------------------------*\

      FUNCTION....: analyse_one_frame()
      AUTHOR......: David Rowe, conversion by Matt Weeks
      DATE CREATED: 23/8/2010

      Extract sinusoidal model parameters from 80 speech samples (10ms of
      speech).

    \*---------------------------------------------------------------------------*/
    fn analyse_one_frame(&mut self, model: &mut MODEL, speech: &[i16]) {
        let mut Sw = [COMP::new(); FFT_ENC];
        let n_samp = self.internal.n_samp;
        let m_pitch = self.internal.m_pitch;

        // Read input speech
        for i in 0..m_pitch - n_samp {
            self.internal.Sn[i] = self.internal.Sn[i + n_samp];
        }
        for i in 0..n_samp {
            self.internal.Sn[i + m_pitch - n_samp] = speech[i].into();
        }
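        /* Sn[] is a sliding analysis buffer of m_pitch samples: each call
        shifts the history left by n_samp and appends the n_samp new input
        samples at the end, so pitch analysis sees more than one frame of
        context. */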
        self.internal.c2const.dft_speech(
            &self.internal.fft_fwd_cfg,
            &mut Sw,
            &self.internal.Sn,
            &self.internal.w,
        );

        // Estimate pitch
        let mut pitch = 0.0;
        nlp::nlp(
            &mut self.internal.nlp,
            &self.internal.Sn,
            n_samp,
            &mut pitch,
            &Sw,
            &self.internal.W,
            &mut self.internal.prev_f0_enc,
        );
        model.Wo = TWO_PI / pitch;
        model.L = (PI / model.Wo as f64) as usize;

        // refine the coarse NLP pitch estimate
        two_stage_pitch_refinement(&self.internal.c2const, model, &Sw);

        // estimate amplitudes and voicing; phase estimation (the last
        // argument to estimate_amplitudes) is only used for ML experiments
        // and is disabled here
        estimate_amplitudes(model, &Sw, &self.internal.W, 0);
        est_voicing_mbe(&self.internal.c2const, model, &Sw, &self.internal.W);
    }

    /*---------------------------------------------------------------------------*\

      FUNCTION....: synthesise_one_frame()
      AUTHOR......: David Rowe, conversion by Matt Weeks
      DATE CREATED: 23/8/2010

      Synthesise 80 speech samples (10ms) from model parameters.

    \*---------------------------------------------------------------------------*/
    fn synthesise_one_frame(
        &mut self,
        speech: &mut [i16],
        model: &mut MODEL,
        Aw: &[COMP],
        gain: f32,
    ) {
        // LPC based phase synthesis
        let mut H = [COMP::new(); MAX_AMP + 1];
        sample_phase(model, &mut H, Aw);
        phase_synth_zero_order(
            self.internal.n_samp,
            model,
            &mut self.internal.ex_phase,
            &mut H,
        );

        postfilter(model, &mut self.internal.bg_est);
        synthesise(
            self.internal.n_samp,
            &mut self.internal.fftr_inv_cfg,
            &mut self.internal.Sn_,
            model,
            &self.internal.Pn,
            true,
        );

        for i in 0..self.internal.n_samp {
            self.internal.Sn_[i] *= gain;
        }

        ear_protection(&mut self.internal.Sn_, self.internal.n_samp);

        // clamp the float samples to the i16 range before output
        for i in 0..self.internal.n_samp {
            if self.internal.Sn_[i] > 32767.0 {
                speech[i] = 32767;
            } else if self.internal.Sn_[i] < -32767.0 {
                speech[i] = -32767;
            } else {
                speech[i] = self.internal.Sn_[i] as i16;
            }
        }
    }
}

/*---------------------------------------------------------------------------*\

  FUNCTION....: ear_protection()
  AUTHOR......: David Rowe, conversion by Matt Weeks
  DATE CREATED: Nov 7 2012

  Limits output level to protect ears when there are bit errors or the input
  is overdriven. This doesn't correct or mask bit errors, just reduces the
  worst of their damage.

\*---------------------------------------------------------------------------*/
fn ear_protection(in_out: &mut [f32], n: usize) {
    // find maximum sample in frame

    let mut max_sample = 0.0;
    for i in 0..n {
        if in_out[i] > max_sample {
            max_sample = in_out[i];
        }
    }

    // determine how far above set point

    let over = max_sample / 30000.0;

    /* If we are x dB over set point we reduce level by 2x dB, this
    attenuates major excursions in amplitude (likely to be caused
    by bit errors) more than smaller ones */
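    /* The factor of two falls out of the squaring below: with the level
    x dB over the set point, over = 10^(x/20), so
    gain = 1/over^2 = 10^(-2x/20), i.e. an attenuation of 2x dB. */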

    if over > 1.0 {
        let gain = 1.0 / (over * over);
        for i in 0..n {
            in_out[i] *= gain;
        }
    }
}

/*---------------------------------------------------------------------------*\

  est_voicing_mbe()

  Returns the error of the MBE cost function for a given F0.

  Note: a lot of the operations below could be simplified, since W[] is
  real and has been normalised such that den always equals 1.

\*---------------------------------------------------------------------------*/
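/* In symbols: for each harmonic band l the code below fits a complex
   amplitude Am to the spectrum under the window W, accumulates

       error = sum_l sum_{m in band l} |Sw[m] - Am * W[offset + m]|^2

   and declares the frame voiced when snr = 10*log10(sig/error) exceeds
   V_THRESH. */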
fn est_voicing_mbe(c2const: &C2const, model: &mut MODEL, Sw: &[COMP], W: &[f32]) -> f32 {
    let mut Am = COMP::new(); // amplitude sample for this band
    let mut Ew = COMP::new();

    let l_1000hz = (model.L as f32 * 1000.0 / ((c2const.Fs / 2) as f32)) as usize;
    let mut sig = 1E-4;
    for l in 1..l_1000hz + 1 {
        sig += model.A[l] * model.A[l];
    }

    let Wo = model.Wo;
    let mut error = 1E-4; // accumulated error between original and synthesised

    // Just test across the harmonics in the first 1000 Hz

    for l in 1..l_1000hz + 1 {
        Am.r = 0.0;
        Am.i = 0.0;
        let mut den = 0.0; // denominator of Am expression
        let al = ((l as f32 - 0.5) * Wo * (FFT_ENC as f32) / TWO_PI).ceil() as usize;
        let bl = ((l as f32 + 0.5) * Wo * (FFT_ENC as f32) / TWO_PI).ceil() as usize;

        // Estimate amplitude of harmonic assuming harmonic is totally voiced

        // centers Hw[] about current harmonic
        let offset =
            (FFT_ENC as f32 / 2.0 - (l as f32) * Wo * (FFT_ENC as f32) / TWO_PI + 0.5) as usize;
        for m in al..bl {
            Am.r += Sw[m].r * W[offset + m];
            Am.i += Sw[m].i * W[offset + m];
            den += W[offset + m] * W[offset + m];
        }

        Am.r /= den;
        Am.i /= den;

        // Determine error between estimated harmonic and original

        for m in al..bl {
            Ew.r = Sw[m].r - Am.r * W[offset + m];
            Ew.i = Sw[m].i - Am.i * W[offset + m];
            error += Ew.r * Ew.r;
            error += Ew.i * Ew.i;
        }
    }

    let snr = 10.0 * (sig / error).log10();
    if snr > V_THRESH {
        model.voiced = 1;
    } else {
        model.voiced = 0;
    }

    // post processing, helps clean up some voicing errors ------------------

    /*
    Determine the ratio of low frequency to high frequency energy,
    voiced speech tends to be dominated by low frequency energy,
    unvoiced by high frequency. This measure can be used to
    determine if we have made any gross errors.
    */

    let l_2000hz = (model.L as f32 * 2000.0 / (c2const.Fs as f32 / 2.0)) as usize;
    let l_4000hz = (model.L as f32 * 4000.0 / (c2const.Fs as f32 / 2.0)) as usize;
    let mut ehigh = 1E-4;
    let mut elow = ehigh;
    for l in 1..l_2000hz + 1 {
        elow += model.A[l] * model.A[l];
    }
    for l in l_2000hz..l_4000hz + 1 {
        ehigh += model.A[l] * model.A[l];
    }
    let eratio = 10.0 * (elow / ehigh).log10();

    /* Look for Type 1 errors, strongly V speech that has been
    accidentally declared UV */

    if model.voiced == 0 && eratio > 10.0 {
        model.voiced = 1;
    }

    /* Look for Type 2 errors, strongly UV speech that has been
    accidentally declared V */

    if model.voiced == 1 {
        if eratio < -10.0 {
            model.voiced = 0;
        }

        /* A common source of Type 2 errors is the pitch estimator
        giving a low (50Hz) estimate for UV speech, which gives a
        good match with noise due to the close harmonic spacing.
        These errors are much more common than people with 50Hz
        pitch, so we use just a small eratio threshold. */

        let sixty = 60.0 * TWO_PI / (c2const.Fs as f32);
        if (eratio < -4.0) && (model.Wo <= sixty) {
            model.voiced = 0;
        }
    }

    snr
}

/*---------------------------------------------------------------------------*\

  FUNCTION....: estimate_amplitudes
  AUTHOR......: David Rowe, conversion by Matt Weeks
  DATE CREATED: 27/5/94

  Estimates the complex amplitudes of the harmonics.

\*---------------------------------------------------------------------------*/
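/* For each harmonic m, the DFT bins [am, bm) bracket the harmonic and the
   magnitude estimate is the square root of the energy in those bins:

       A_m = sqrt( sum_{i=am}^{bm-1} (Sw[i].r^2 + Sw[i].i^2) )
*/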
fn estimate_amplitudes(model: &mut MODEL, Sw: &[COMP], _W: &[f32], est_phase: i32) {
    let r = TWO_PI / (FFT_ENC as f32);
    let one_on_r = 1.0 / r;

    for m in 1..model.L + 1 {
        // Estimate amplitude of harmonic

        let mut den = 0.0; // denominator of amplitude expression
        // bounds of current harmonic
        let am = ((m as f32 - 0.5) * model.Wo * one_on_r + 0.5) as usize;
        let bm = ((m as f32 + 0.5) * model.Wo * one_on_r + 0.5) as usize;

        for i in am..bm {
            den += Sw[i].r * Sw[i].r + Sw[i].i * Sw[i].i;
        }

        model.A[m] = den.sqrt();

        if est_phase != 0 {
            // DFT bin at the centre of the current harmonic
            let b = (m as f32 * model.Wo / r + 0.5) as usize;

            /* Estimate phase of harmonic, this is expensive in CPU for
            embedded devices so we make it an option */

            model.phi[m] = Sw[b].i.atan2(Sw[b].r);
        }
    }
}

/*---------------------------------------------------------------------------*\

  FUNCTION....: two_stage_pitch_refinement
  AUTHOR......: David Rowe, conversion by Matt Weeks
  DATE CREATED: 27/5/94

  Refines the current pitch estimate using the harmonic sum pitch
  estimation technique.

\*---------------------------------------------------------------------------*/
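/* The search runs over the pitch period P = TWO_PI / Wo: a coarse pass over
   P +/- 5 samples in steps of 1 sample, then a fine pass over the coarse
   winner +/- 1 sample in steps of 0.25. */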
fn two_stage_pitch_refinement(c2const: &C2const, model: &mut MODEL, Sw: &[COMP]) {
    // Coarse refinement
    // pitch refinement minimum, maximum and step
    let mut pmax = TWO_PI / model.Wo + 5.0;
    let mut pmin = TWO_PI / model.Wo - 5.0;
    let mut pstep = 1.0;
    hs_pitch_refinement(model, Sw, pmin, pmax, pstep);

    // Fine refinement

    pmax = TWO_PI / model.Wo + 1.0;
    pmin = TWO_PI / model.Wo - 1.0;
    pstep = 0.25;
    hs_pitch_refinement(model, Sw, pmin, pmax, pstep);

    // Limit range

    if model.Wo < TWO_PI / (c2const.p_max as f32) {
        model.Wo = TWO_PI / (c2const.p_max as f32);
    }
    if model.Wo > TWO_PI / (c2const.p_min as f32) {
        model.Wo = TWO_PI / (c2const.p_min as f32);
    }

    model.L = (PI / model.Wo as f64).floor() as usize;

    // trap occasional round off issues with floor()
    if model.Wo * model.L as f32 >= 0.95 * PI as f32 {
        model.L -= 1;
    }
    // assert(model.Wo*model.L < PI);
}

/*---------------------------------------------------------------------------*\

  FUNCTION....: hs_pitch_refinement
  AUTHOR......: David Rowe, conversion by Matt Weeks
  DATE CREATED: 27/5/94

  Harmonic sum pitch refinement function.

  pmin   pitch search range minimum
  pmax   pitch search range maximum
  pstep  pitch search step size
  model  current pitch estimate in model.Wo

  model  refined pitch estimate in model.Wo

\*---------------------------------------------------------------------------*/
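/* The quantity being maximised is the harmonic sum

       E(Wo) = sum_{m=1}^{L} |Sw[round(m * Wo / r)]|^2,   Wo = TWO_PI / p

   evaluated at each candidate pitch p in [pmin, pmax] in steps of pstep. */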
fn hs_pitch_refinement(model: &mut MODEL, Sw: &[COMP], pmin: f32, pmax: f32, pstep: f32) {
    // Initialisation

    model.L = (PI / model.Wo as f64) as usize; // use initial pitch est. for L
    let mut Wom = model.Wo; // Wo that maximises E
    let mut Em = 0.0; // maximum energy
    let r = TWO_PI / FFT_ENC as f32; // number of rads/bin
    let one_on_r = 1.0 / r;

    // Determine harmonic sum for a range of Wo values
    let mut p = pmin; // current pitch
    while p <= pmax {
        let mut E = 0.0; // energy for current pitch
        let Wo = TWO_PI / p; // current "test" fundamental freq.

        // Sum harmonic magnitudes
        for m in 1..model.L + 1 {
            // bin for current harmonic centre
            let b = (m as f32 * Wo * one_on_r + 0.5) as usize;
            E += Sw[b].r * Sw[b].r + Sw[b].i * Sw[b].i;
        }

        // Compare to see if this is a maximum
        if E > Em {
            Em = E;
            Wom = Wo;
        }
        p += pstep;
    }

    model.Wo = Wom;
}