1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
//! Phoneme timing extraction from ONNX model duration output.
//!
//! VITS models optionally output a `durations` tensor [1, phoneme_length]
//! containing the number of frames (hop_length-sized) each phoneme occupies.
//! This module converts frame counts to millisecond timestamps.
use serde::Serialize;
use crate::error::PiperError;
/// Default hop length for VITS models
pub const DEFAULT_HOP_LENGTH: usize = 256;
/// Timing information for a single phoneme
#[derive(Debug, Clone, Serialize)]
pub struct PhonemeTimingInfo {
pub phoneme: String,
pub start_ms: f64,
pub end_ms: f64,
pub duration_ms: f64,
}
/// Complete timing result for a synthesized utterance
#[derive(Debug, Clone, Serialize)]
pub struct TimingResult {
pub phonemes: Vec<PhonemeTimingInfo>,
pub total_duration_ms: f64,
pub sample_rate: u32,
}
impl TimingResult {
/// Serialize to JSON string (pretty-printed)
pub fn to_json(&self) -> Result<String, PiperError> {
serde_json::to_string_pretty(self).map_err(PiperError::from)
}
/// Serialize to JSON string (compact, one line per phoneme)
pub fn to_json_compact(&self) -> Result<String, PiperError> {
serde_json::to_string(self).map_err(PiperError::from)
}
/// Serialize to TSV string (tab-separated: start_ms, end_ms, duration_ms, phoneme)
pub fn to_tsv(&self) -> String {
let mut buf = String::from("start_ms\tend_ms\tduration_ms\tphoneme\n");
for p in &self.phonemes {
buf.push_str(&format!(
"{:.3}\t{:.3}\t{:.3}\t{}\n",
p.start_ms, p.end_ms, p.duration_ms, p.phoneme
));
}
buf
}
/// Serialize to SRT-like subtitle format
pub fn to_srt(&self) -> String {
let mut buf = String::new();
for (i, p) in self.phonemes.iter().enumerate() {
let idx = i + 1;
let start = format_srt_timestamp(p.start_ms);
let end = format_srt_timestamp(p.end_ms);
buf.push_str(&format!("{idx}\n{start} --> {end}\n{}\n\n", p.phoneme));
}
buf
}
}
/// Format milliseconds as SRT timestamp: HH:MM:SS,mmm
fn format_srt_timestamp(ms: f64) -> String {
let total_ms = ms.round() as u64;
let millis = total_ms % 1000;
let total_secs = total_ms / 1000;
let secs = total_secs % 60;
let total_mins = total_secs / 60;
let mins = total_mins % 60;
let hours = total_mins / 60;
format!("{hours:02}:{mins:02}:{secs:02},{millis:03}")
}
/// Convert duration tensor output to timing information.
///
/// # Arguments
/// * `durations` - Duration values from ONNX output tensor [phoneme_length]
/// * `phoneme_tokens` - Corresponding phoneme token strings
/// * `sample_rate` - Audio sample rate (e.g., 22050)
/// * `hop_length` - STFT hop length (typically 256 for VITS)
///
/// # Returns
/// TimingResult with start/end timestamps for each phoneme
pub fn durations_to_timing(
durations: &[f32],
phoneme_tokens: &[String],
sample_rate: u32,
hop_length: usize,
) -> Result<TimingResult, PiperError> {
if durations.len() != phoneme_tokens.len() {
return Err(PiperError::Inference(format!(
"durations length ({}) != phoneme_tokens length ({})",
durations.len(),
phoneme_tokens.len()
)));
}
if sample_rate == 0 {
return Err(PiperError::Inference("sample_rate must be > 0".to_string()));
}
if hop_length == 0 {
return Err(PiperError::Inference("hop_length must be > 0".to_string()));
}
// Time in seconds for one frame
let frame_time_s = hop_length as f64 / sample_rate as f64;
let frame_time_ms = frame_time_s * 1000.0;
let mut phonemes = Vec::with_capacity(durations.len());
let mut cursor_ms: f64 = 0.0;
for (dur, token) in durations.iter().zip(phoneme_tokens.iter()) {
let dur_frames = (*dur).max(0.0) as f64;
let duration_ms = dur_frames * frame_time_ms;
let start_ms = cursor_ms;
let end_ms = cursor_ms + duration_ms;
phonemes.push(PhonemeTimingInfo {
phoneme: token.clone(),
start_ms,
end_ms,
duration_ms,
});
cursor_ms = end_ms;
}
Ok(TimingResult {
total_duration_ms: cursor_ms,
phonemes,
sample_rate,
})
}
#[cfg(test)]
mod tests {
use super::*;
// ---------------------------------------------------------------
// Helper
// ---------------------------------------------------------------
fn tokens(names: &[&str]) -> Vec<String> {
names.iter().map(|s| s.to_string()).collect()
}
// ---------------------------------------------------------------
// 1. Basic duration conversion (known values)
// ---------------------------------------------------------------
#[test]
fn test_basic_conversion_22050() {
// sample_rate=22050, hop=256 => frame_time = 256/22050 s ~ 11.6099 ms
let durations = vec![10.0, 20.0, 5.0];
let toks = tokens(&["a", "b", "c"]);
let result = durations_to_timing(&durations, &toks, 22050, 256).unwrap();
let frame_ms = 256.0 / 22050.0 * 1000.0;
assert_eq!(result.phonemes.len(), 3);
assert_eq!(result.sample_rate, 22050);
// phoneme "a": start=0, dur=10*frame_ms
assert!((result.phonemes[0].start_ms - 0.0).abs() < 1e-6);
assert!((result.phonemes[0].duration_ms - 10.0 * frame_ms).abs() < 1e-6);
assert!((result.phonemes[0].end_ms - 10.0 * frame_ms).abs() < 1e-6);
// phoneme "b": start=10*frame_ms, dur=20*frame_ms
assert!((result.phonemes[1].start_ms - 10.0 * frame_ms).abs() < 1e-6);
assert!((result.phonemes[1].duration_ms - 20.0 * frame_ms).abs() < 1e-6);
// phoneme "c": start=30*frame_ms, dur=5*frame_ms
assert!((result.phonemes[2].start_ms - 30.0 * frame_ms).abs() < 1e-6);
// total = 35 * frame_ms
assert!((result.total_duration_ms - 35.0 * frame_ms).abs() < 1e-6);
}
// ---------------------------------------------------------------
// 2. Empty durations
// ---------------------------------------------------------------
#[test]
fn test_empty_durations() {
let result = durations_to_timing(&[], &[], 22050, 256).unwrap();
assert!(result.phonemes.is_empty());
assert!((result.total_duration_ms - 0.0).abs() < 1e-6);
}
// ---------------------------------------------------------------
// 3. Single phoneme
// ---------------------------------------------------------------
#[test]
fn test_single_phoneme() {
let durations = vec![8.0];
let toks = tokens(&["k"]);
let result = durations_to_timing(&durations, &toks, 22050, 256).unwrap();
assert_eq!(result.phonemes.len(), 1);
assert!((result.phonemes[0].start_ms - 0.0).abs() < 1e-6);
let frame_ms = 256.0 / 22050.0 * 1000.0;
assert!((result.phonemes[0].duration_ms - 8.0 * frame_ms).abs() < 1e-6);
assert!((result.total_duration_ms - 8.0 * frame_ms).abs() < 1e-6);
}
// ---------------------------------------------------------------
// 4. Zero durations
// ---------------------------------------------------------------
#[test]
fn test_zero_durations() {
let durations = vec![0.0, 10.0, 0.0];
let toks = tokens(&["^", "a", "_"]);
let result = durations_to_timing(&durations, &toks, 22050, 256).unwrap();
assert_eq!(result.phonemes.len(), 3);
// First phoneme has zero duration
assert!((result.phonemes[0].duration_ms - 0.0).abs() < 1e-6);
assert!((result.phonemes[0].start_ms - result.phonemes[0].end_ms).abs() < 1e-6);
// Second phoneme starts at 0 too
assert!((result.phonemes[1].start_ms - 0.0).abs() < 1e-6);
// Third phoneme starts at 10*frame_ms, zero duration
let frame_ms = 256.0 / 22050.0 * 1000.0;
assert!((result.phonemes[2].start_ms - 10.0 * frame_ms).abs() < 1e-6);
assert!((result.phonemes[2].duration_ms - 0.0).abs() < 1e-6);
}
// ---------------------------------------------------------------
// 5. Mismatched lengths error
// ---------------------------------------------------------------
#[test]
fn test_mismatched_lengths() {
let durations = vec![1.0, 2.0, 3.0];
let toks = tokens(&["a", "b"]);
let err = durations_to_timing(&durations, &toks, 22050, 256).unwrap_err();
let msg = err.to_string();
assert!(msg.contains("3"));
assert!(msg.contains("2"));
}
// ---------------------------------------------------------------
// 6. JSON pretty-print serialization roundtrip
// ---------------------------------------------------------------
#[test]
fn test_json_roundtrip() {
let durations = vec![5.0, 15.0];
let toks = tokens(&["h", "i"]);
let result = durations_to_timing(&durations, &toks, 22050, 256).unwrap();
let json = result.to_json().unwrap();
// Deserialize back to verify structure
let parsed: serde_json::Value = serde_json::from_str(&json).unwrap();
assert!(parsed.is_object());
assert!(parsed["phonemes"].is_array());
assert_eq!(parsed["phonemes"].as_array().unwrap().len(), 2);
assert_eq!(parsed["sample_rate"].as_u64().unwrap(), 22050);
let first = &parsed["phonemes"][0];
assert_eq!(first["phoneme"].as_str().unwrap(), "h");
assert!((first["start_ms"].as_f64().unwrap() - 0.0).abs() < 1e-6);
}
// ---------------------------------------------------------------
// 7. JSON compact serialization
// ---------------------------------------------------------------
#[test]
fn test_json_compact() {
let durations = vec![3.0];
let toks = tokens(&["x"]);
let result = durations_to_timing(&durations, &toks, 16000, 256).unwrap();
let json_compact = result.to_json_compact().unwrap();
// Compact should not contain newlines (single-line JSON)
assert!(!json_compact.contains('\n'));
assert!(json_compact.contains("\"phoneme\":\"x\""));
}
// ---------------------------------------------------------------
// 8. TSV format correctness
// ---------------------------------------------------------------
#[test]
fn test_tsv_format() {
let durations = vec![10.0, 20.0];
let toks = tokens(&["p", "q"]);
let result = durations_to_timing(&durations, &toks, 22050, 256).unwrap();
let tsv = result.to_tsv();
let lines: Vec<&str> = tsv.lines().collect();
// Header line
assert_eq!(lines[0], "start_ms\tend_ms\tduration_ms\tphoneme");
// Data rows
assert_eq!(lines.len(), 3); // header + 2 phonemes
// First row should start at 0.000
assert!(lines[1].starts_with("0.000\t"));
assert!(lines[1].ends_with("\tp"));
// Second row phoneme should be "q"
assert!(lines[2].ends_with("\tq"));
}
// ---------------------------------------------------------------
// 9. TSV empty input
// ---------------------------------------------------------------
#[test]
fn test_tsv_empty() {
let result = durations_to_timing(&[], &[], 22050, 256).unwrap();
let tsv = result.to_tsv();
let lines: Vec<&str> = tsv.lines().collect();
assert_eq!(lines.len(), 1); // header only
}
// ---------------------------------------------------------------
// 10. SRT format correctness
// ---------------------------------------------------------------
#[test]
fn test_srt_format() {
// Use easy numbers: sample_rate=1000, hop_length=1 => 1 frame = 1 ms
let durations = vec![500.0, 1500.0, 3000.0];
let toks = tokens(&["a", "bb", "c"]);
let result = durations_to_timing(&durations, &toks, 1000, 1).unwrap();
let srt = result.to_srt();
let blocks: Vec<&str> = srt.split("\n\n").filter(|b| !b.is_empty()).collect();
assert_eq!(blocks.len(), 3);
// First block: index 1, 00:00:00,000 --> 00:00:00,500, phoneme "a"
let lines0: Vec<&str> = blocks[0].lines().collect();
assert_eq!(lines0[0], "1");
assert_eq!(lines0[1], "00:00:00,000 --> 00:00:00,500");
assert_eq!(lines0[2], "a");
// Second block: index 2, 00:00:00,500 --> 00:00:02,000, phoneme "bb"
let lines1: Vec<&str> = blocks[1].lines().collect();
assert_eq!(lines1[0], "2");
assert_eq!(lines1[1], "00:00:00,500 --> 00:00:02,000");
assert_eq!(lines1[2], "bb");
// Third block: 00:00:02,000 --> 00:00:05,000
let lines2: Vec<&str> = blocks[2].lines().collect();
assert_eq!(lines2[0], "3");
assert_eq!(lines2[1], "00:00:02,000 --> 00:00:05,000");
assert_eq!(lines2[2], "c");
}
// ---------------------------------------------------------------
// 11. SRT timestamp with hours/minutes
// ---------------------------------------------------------------
#[test]
fn test_srt_large_timestamps() {
// 90 minutes + 5 seconds + 123 ms = 5,405,123 ms
// Use sample_rate=1000, hop=1 so frames = ms directly
let dur_ms = 5_405_123.0_f32;
let durations = vec![dur_ms];
let toks = tokens(&["long"]);
let result = durations_to_timing(&durations, &toks, 1000, 1).unwrap();
let srt = result.to_srt();
assert!(srt.contains("00:00:00,000 --> 01:30:05,123"));
}
// ---------------------------------------------------------------
// 12. Sample rate 16000
// ---------------------------------------------------------------
#[test]
fn test_sample_rate_16000() {
let durations = vec![16.0];
let toks = tokens(&["z"]);
let result = durations_to_timing(&durations, &toks, 16000, 256).unwrap();
// frame_ms = 256/16000*1000 = 16.0 ms
// duration_ms = 16 frames * 16.0 ms = 256.0 ms
let expected_ms = 16.0 * (256.0 / 16000.0 * 1000.0);
assert!((result.phonemes[0].duration_ms - expected_ms).abs() < 1e-6);
assert!((result.total_duration_ms - expected_ms).abs() < 1e-6);
}
// ---------------------------------------------------------------
// 13. Sample rate 44100
// ---------------------------------------------------------------
#[test]
fn test_sample_rate_44100() {
let durations = vec![100.0];
let toks = tokens(&["w"]);
let result = durations_to_timing(&durations, &toks, 44100, 256).unwrap();
let frame_ms = 256.0 / 44100.0 * 1000.0;
let expected_ms = 100.0 * frame_ms;
assert!((result.phonemes[0].duration_ms - expected_ms).abs() < 1e-6);
assert_eq!(result.sample_rate, 44100);
}
// ---------------------------------------------------------------
// 14. Large duration values
// ---------------------------------------------------------------
#[test]
fn test_large_duration_values() {
let durations = vec![100_000.0, 200_000.0];
let toks = tokens(&["aa", "bb"]);
let result = durations_to_timing(&durations, &toks, 22050, 256).unwrap();
let frame_ms = 256.0 / 22050.0 * 1000.0;
let expected_total = 300_000.0 * frame_ms;
assert!((result.total_duration_ms - expected_total).abs() < 1e-3);
// Second phoneme starts after the first
assert!((result.phonemes[1].start_ms - 100_000.0 * frame_ms).abs() < 1e-3);
}
// ---------------------------------------------------------------
// 15. Floating point precision -- cumulative sum stays accurate
// ---------------------------------------------------------------
#[test]
fn test_floating_point_precision() {
// Many small durations to test accumulation
let n = 1000;
let durations: Vec<f32> = vec![1.0; n];
let toks: Vec<String> = (0..n).map(|i| format!("p{i}")).collect();
let result = durations_to_timing(&durations, &toks, 22050, 256).unwrap();
let frame_ms = 256.0 / 22050.0 * 1000.0;
let expected_total = n as f64 * frame_ms;
// Total should be very close despite 1000 additions
assert!(
(result.total_duration_ms - expected_total).abs() < 0.01,
"total={} expected={}",
result.total_duration_ms,
expected_total
);
// Last phoneme end should equal total
let last = result.phonemes.last().unwrap();
assert!((last.end_ms - result.total_duration_ms).abs() < 1e-9);
}
// ---------------------------------------------------------------
// 16. Negative duration values are clamped to zero
// ---------------------------------------------------------------
#[test]
fn test_negative_durations_clamped() {
let durations = vec![-5.0, 10.0, -1.0];
let toks = tokens(&["a", "b", "c"]);
let result = durations_to_timing(&durations, &toks, 22050, 256).unwrap();
// Negative durations should be treated as 0
assert!((result.phonemes[0].duration_ms - 0.0).abs() < 1e-6);
assert!((result.phonemes[2].duration_ms - 0.0).abs() < 1e-6);
// Only phoneme "b" contributes to total
let frame_ms = 256.0 / 22050.0 * 1000.0;
assert!((result.total_duration_ms - 10.0 * frame_ms).abs() < 1e-6);
}
// ---------------------------------------------------------------
// 17. Zero sample_rate error
// ---------------------------------------------------------------
#[test]
fn test_zero_sample_rate_error() {
let durations = vec![1.0];
let toks = tokens(&["a"]);
let err = durations_to_timing(&durations, &toks, 0, 256).unwrap_err();
assert!(err.to_string().contains("sample_rate"));
}
// ---------------------------------------------------------------
// 18. Zero hop_length error
// ---------------------------------------------------------------
#[test]
fn test_zero_hop_length_error() {
let durations = vec![1.0];
let toks = tokens(&["a"]);
let err = durations_to_timing(&durations, &toks, 22050, 0).unwrap_err();
assert!(err.to_string().contains("hop_length"));
}
// ---------------------------------------------------------------
// 19. DEFAULT_HOP_LENGTH constant value
// ---------------------------------------------------------------
#[test]
fn test_default_hop_length() {
assert_eq!(DEFAULT_HOP_LENGTH, 256);
}
// ---------------------------------------------------------------
// 20. Phoneme ordering preserved
// ---------------------------------------------------------------
#[test]
fn test_phoneme_ordering_preserved() {
let durations = vec![1.0, 2.0, 3.0, 4.0, 5.0];
let toks = tokens(&["^", "k", "o", "N", "_"]);
let result = durations_to_timing(&durations, &toks, 22050, 256).unwrap();
let names: Vec<&str> = result.phonemes.iter().map(|p| p.phoneme.as_str()).collect();
assert_eq!(names, vec!["^", "k", "o", "N", "_"]);
// Each start equals previous end
for i in 1..result.phonemes.len() {
assert!(
(result.phonemes[i].start_ms - result.phonemes[i - 1].end_ms).abs() < 1e-9,
"gap between phoneme {} and {}",
i - 1,
i
);
}
}
// ---------------------------------------------------------------
// 21. TSV field values match JSON values
// ---------------------------------------------------------------
#[test]
fn test_tsv_and_json_consistency() {
let durations = vec![7.0, 13.0];
let toks = tokens(&["s", "t"]);
let result = durations_to_timing(&durations, &toks, 22050, 256).unwrap();
let json_str = result.to_json().unwrap();
let parsed: serde_json::Value = serde_json::from_str(&json_str).unwrap();
let tsv = result.to_tsv();
let data_lines: Vec<&str> = tsv.lines().skip(1).collect();
for (i, line) in data_lines.iter().enumerate() {
let fields: Vec<&str> = line.split('\t').collect();
assert_eq!(fields.len(), 4);
let tsv_start: f64 = fields[0].parse().unwrap();
let tsv_end: f64 = fields[1].parse().unwrap();
let tsv_dur: f64 = fields[2].parse().unwrap();
let tsv_phoneme = fields[3];
let json_ph = &parsed["phonemes"][i];
let json_start = json_ph["start_ms"].as_f64().unwrap();
let json_end = json_ph["end_ms"].as_f64().unwrap();
let json_phoneme = json_ph["phoneme"].as_str().unwrap();
assert!((tsv_start - json_start).abs() < 0.01);
assert!((tsv_end - json_end).abs() < 0.01);
assert!(tsv_dur > 0.0 || (tsv_dur - 0.0).abs() < 1e-6);
assert_eq!(tsv_phoneme, json_phoneme);
}
}
// ---------------------------------------------------------------
// 22. Phoneme name containing tab in TSV output
// ---------------------------------------------------------------
#[test]
fn test_tsv_phoneme_with_tab() {
// A phoneme token that contains a literal tab character.
// The current TSV writer does not escape it, so the tab will
// appear as an extra column, producing 5 fields instead of 4.
let durations = vec![5.0];
let toks = vec!["a\tb".to_string()];
let result = durations_to_timing(&durations, &toks, 1000, 1).unwrap();
let tsv = result.to_tsv();
let data_line = tsv.lines().nth(1).expect("expected a data line");
let fields: Vec<&str> = data_line.split('\t').collect();
// Tab inside the phoneme name splits the field, yielding 5 columns.
assert_eq!(
fields.len(),
5,
"tab inside phoneme name produces an extra TSV column"
);
}
// ---------------------------------------------------------------
// 23. Phoneme name containing newline in SRT output
// ---------------------------------------------------------------
#[test]
fn test_srt_phoneme_with_newline() {
// A phoneme token with an embedded newline will split the
// subtitle text across two visual lines inside the SRT entry.
// The entry count should still be 1 since entries are delimited
// by blank lines ("\n\n").
let durations = vec![10.0];
let toks = vec!["line1\nline2".to_string()];
let result = durations_to_timing(&durations, &toks, 1000, 1).unwrap();
let srt = result.to_srt();
// The block delimiter is "\n\n". Because the phoneme itself
// contains "\n", we verify that the index "1" and the arrow
// marker are still present and structurally correct.
assert!(srt.contains("1\n"));
assert!(srt.contains(" --> "));
assert!(srt.contains("line1\nline2"));
}
// ---------------------------------------------------------------
// 24. Duration with NaN — clamped to 0 by f32::max(0.0)
// ---------------------------------------------------------------
#[test]
fn test_nan_duration() {
let durations = vec![f32::NAN, 10.0];
let toks = tokens(&["nan_ph", "ok"]);
let result = durations_to_timing(&durations, &toks, 1000, 1).unwrap();
// Rust's f32::max returns the non-NaN argument when one operand
// is NaN, so NAN.max(0.0) == 0.0. The NaN is effectively clamped.
assert!(
(result.phonemes[0].duration_ms - 0.0).abs() < 1e-9,
"NaN duration is clamped to 0 by f32::max"
);
assert!(
(result.phonemes[0].start_ms - result.phonemes[0].end_ms).abs() < 1e-9,
"start == end for zero-duration phoneme"
);
// The second phoneme should still have a valid duration.
assert!(
(result.phonemes[1].duration_ms - 10.0).abs() < 1e-6,
"non-NaN phoneme keeps its value"
);
// Total should only reflect the valid phoneme
assert!(
(result.total_duration_ms - 10.0).abs() < 1e-6,
"total reflects only the non-NaN phoneme"
);
}
// ---------------------------------------------------------------
// 25. Duration with Infinity — propagates as infinite ms
// ---------------------------------------------------------------
#[test]
fn test_infinity_duration() {
let durations = vec![f32::INFINITY];
let toks = tokens(&["inf_ph"]);
let result = durations_to_timing(&durations, &toks, 1000, 1).unwrap();
assert!(
result.phonemes[0].duration_ms.is_infinite(),
"Infinity duration propagates"
);
assert!(
result.total_duration_ms.is_infinite(),
"total also becomes infinite"
);
}
// ---------------------------------------------------------------
// 26. Unicode / IPA phoneme names in all formats
// ---------------------------------------------------------------
#[test]
fn test_unicode_phoneme_names() {
let ipa_tokens = vec![
"\u{0251}\u{02D0}".to_string(), // ɑː
"\u{0283}".to_string(), // ʃ
"\u{014B}".to_string(), // ŋ
];
let durations = vec![5.0, 3.0, 7.0];
let result = durations_to_timing(&durations, &ipa_tokens, 1000, 1).unwrap();
// Verify phoneme names are preserved
assert_eq!(result.phonemes[0].phoneme, "\u{0251}\u{02D0}");
assert_eq!(result.phonemes[1].phoneme, "\u{0283}");
assert_eq!(result.phonemes[2].phoneme, "\u{014B}");
// JSON roundtrip preserves Unicode
let json = result.to_json().unwrap();
let parsed: serde_json::Value = serde_json::from_str(&json).unwrap();
assert_eq!(
parsed["phonemes"][0]["phoneme"].as_str().unwrap(),
"\u{0251}\u{02D0}"
);
// TSV contains the Unicode characters
let tsv = result.to_tsv();
assert!(tsv.contains("\u{0251}\u{02D0}"));
assert!(tsv.contains("\u{0283}"));
assert!(tsv.contains("\u{014B}"));
// SRT contains the Unicode characters
let srt = result.to_srt();
assert!(srt.contains("\u{0251}\u{02D0}"));
assert!(srt.contains("\u{0283}"));
assert!(srt.contains("\u{014B}"));
}
// ---------------------------------------------------------------
// 27. Very small durations preserve precision
// ---------------------------------------------------------------
#[test]
fn test_very_small_durations_precision() {
// 0.001 frames at sample_rate=1000, hop=1 => 0.001 ms per frame
let durations = vec![0.001_f32];
let toks = tokens(&["tiny"]);
let result = durations_to_timing(&durations, &toks, 1000, 1).unwrap();
// frame_time_ms = 1.0 ms; duration = 0.001 * 1.0 = 0.001 ms
let expected = 0.001_f64;
assert!(
(result.phonemes[0].duration_ms - expected).abs() < 1e-9,
"very small duration: got {} expected {}",
result.phonemes[0].duration_ms,
expected
);
// TSV should render with sub-millisecond precision via {:.3}
let tsv = result.to_tsv();
let data_line = tsv.lines().nth(1).unwrap();
// The duration field (3rd column) should be "0.001"
let fields: Vec<&str> = data_line.split('\t').collect();
assert_eq!(fields[2], "0.001");
}
// ---------------------------------------------------------------
// 28. TimingResult direct construction and field access
// ---------------------------------------------------------------
#[test]
fn test_timing_result_direct_construction() {
let timing = TimingResult {
phonemes: vec![
PhonemeTimingInfo {
phoneme: "hello".to_string(),
start_ms: 0.0,
end_ms: 100.5,
duration_ms: 100.5,
},
PhonemeTimingInfo {
phoneme: "world".to_string(),
start_ms: 100.5,
end_ms: 250.0,
duration_ms: 149.5,
},
],
total_duration_ms: 250.0,
sample_rate: 48000,
};
// Field access
assert_eq!(timing.phonemes.len(), 2);
assert_eq!(timing.phonemes[0].phoneme, "hello");
assert_eq!(timing.phonemes[1].phoneme, "world");
assert!((timing.phonemes[0].start_ms - 0.0).abs() < 1e-9);
assert!((timing.phonemes[0].end_ms - 100.5).abs() < 1e-9);
assert!((timing.phonemes[0].duration_ms - 100.5).abs() < 1e-9);
assert!((timing.phonemes[1].start_ms - 100.5).abs() < 1e-9);
assert!((timing.phonemes[1].end_ms - 250.0).abs() < 1e-9);
assert!((timing.phonemes[1].duration_ms - 149.5).abs() < 1e-9);
assert!((timing.total_duration_ms - 250.0).abs() < 1e-9);
assert_eq!(timing.sample_rate, 48000);
// Clone trait works
let cloned = timing.clone();
assert_eq!(cloned.phonemes.len(), timing.phonemes.len());
assert_eq!(cloned.sample_rate, timing.sample_rate);
// Serialization works on directly constructed structs
let json = timing.to_json().unwrap();
assert!(json.contains("\"hello\""));
assert!(json.contains("\"world\""));
assert!(json.contains("48000"));
}
// ---------------------------------------------------------------
// 29. JSON serializes non-finite f64 as null (serde_json >=1.0.128)
// ---------------------------------------------------------------
#[test]
fn test_json_nonfinite_serialized_as_null() {
// serde_json >= 1.0.128 serializes NaN / Infinity as JSON null
// rather than returning an error. Verify this behaviour so that
// callers know what to expect when a TimingResult contains
// non-finite values (e.g. from an Infinity input duration).
let timing = TimingResult {
phonemes: vec![PhonemeTimingInfo {
phoneme: "inf".to_string(),
start_ms: 0.0,
end_ms: f64::INFINITY,
duration_ms: f64::INFINITY,
}],
total_duration_ms: f64::INFINITY,
sample_rate: 22050,
};
// to_json should succeed (not error)
let json = timing.to_json().expect("to_json should succeed");
let parsed: serde_json::Value = serde_json::from_str(&json).unwrap();
// Non-finite values become null in JSON
assert!(
parsed["total_duration_ms"].is_null(),
"Infinity total_duration_ms serialized as null"
);
assert!(
parsed["phonemes"][0]["end_ms"].is_null(),
"Infinity end_ms serialized as null"
);
assert!(
parsed["phonemes"][0]["duration_ms"].is_null(),
"Infinity duration_ms serialized as null"
);
// Finite values remain as numbers
assert!(
parsed["phonemes"][0]["start_ms"].is_number(),
"finite start_ms remains a number"
);
// Compact format also succeeds
let compact = timing.to_json_compact().expect("compact should succeed");
assert!(
compact.contains("null"),
"compact JSON contains null for Infinity"
);
// PiperError::from(serde_json::Error) conversion is exercised
// by to_json / to_json_compact internally via map_err.
// Verify the error path is reachable with truly invalid JSON input.
let bad_json = "{ not valid json }";
let serde_err: Result<serde_json::Value, _> = serde_json::from_str(bad_json);
let piper_err: PiperError = serde_err.unwrap_err().into();
let msg = piper_err.to_string();
assert!(
msg.contains("JSON"),
"PiperError from serde_json mentions JSON: {}",
msg
);
}
}