Skip to main content

suno_core/
lyrics.rs

1//! Word- and line-level timed (synced) lyrics from Suno's aligned-lyrics API.
2//!
3//! [`AlignedLyrics`] is the parsed shape of `GET /api/gen/{id}/aligned_lyrics/v2/`
4//! ([`SunoClient::aligned_lyrics`](crate::SunoClient::aligned_lyrics)): a flat
5//! word-level list plus a line-level list carrying section labels and nested
6//! per-word timing. Everything here is pure and free of direct IO: the client
7//! fetches the body; this module maps it and renders the synced artefacts (the
8//! line-synced `.lrc` body, a word-level ID3 `SYLT` table, and a plain-text
9//! fallback), so the mapping and formatting are unit tested without a network.
10//!
11//! Instrumentals (and any clip Suno could not force-align) return `200` with
12//! empty arrays, so [`AlignedLyrics::is_empty`] is the signal to write no synced
13//! artefact for that clip, exactly as an empty cover URL writes no cover.
14
15use std::fmt::Write as _;
16
17use serde_json::Value;
18
/// A single entry of the flat `aligned_words` list: one force-aligned word.
///
/// Suno reports a per-word `success` flag (it can be `false` where forced
/// alignment failed) together with a `p_align` confidence. Both are kept so
/// callers can gate on them; for rendering, the line-level
/// [`AlignedLine::words`] is preferred because it already reflects Suno's own
/// line grouping.
#[derive(Clone, Debug, PartialEq)]
pub struct AlignedWord {
    /// The word text (the API's `word` key).
    pub word: String,
    /// Suno's per-word alignment flag.
    pub success: bool,
    /// Start of the word, in seconds.
    pub start_s: f64,
    /// End of the word, in seconds.
    pub end_s: f64,
    /// Suno's alignment confidence for this word.
    pub p_align: f64,
}
33
/// A word nested inside a line (an entry of `aligned_lyrics[].words`).
///
/// Here the API keys the word text as `text`, whereas the flat list keys it as
/// `word`. These entries carry no `success`/`p_align`; they are Suno's
/// authoritative grouping of words into lines and are what the `.lrc` and
/// `SYLT` renderers use.
#[derive(Clone, Debug, PartialEq)]
pub struct AlignedLineWord {
    /// The word text (the API's `text` key).
    pub text: String,
    /// Start of the word, in seconds.
    pub start_s: f64,
    /// End of the word, in seconds.
    pub end_s: f64,
}
45
46/// One aligned line: its text, span, section label, and nested words.
47#[derive(Debug, Clone, PartialEq)]
48pub struct AlignedLine {
49    pub text: String,
50    pub start_s: f64,
51    pub end_s: f64,
52    /// Structural section label (e.g. `Verse 1`, `Chorus`), empty when absent.
53    pub section: String,
54    pub words: Vec<AlignedLineWord>,
55}
56
57/// A clip's aligned lyrics: the flat word list and the line list.
58///
59/// Both are empty for an instrumental or an un-alignable clip; see
60/// [`is_empty`](Self::is_empty).
61#[derive(Debug, Clone, Default, PartialEq)]
62pub struct AlignedLyrics {
63    pub words: Vec<AlignedWord>,
64    pub lines: Vec<AlignedLine>,
65    /// `waveform_data`: the amplitude/peak envelope Suno returns for waveform
66    /// display, empty when absent. Additive metadata, not lyric content, so it
67    /// does not affect [`is_empty`](Self::is_empty).
68    pub waveform_data: Vec<f64>,
69    /// `hoot_cer`: Suno's alignment/transcription error metric (higher is
70    /// worse), `None` when absent.
71    pub hoot_cer: Option<f64>,
72    /// `is_streamed`: Suno's streaming flag, `None` when absent.
73    pub is_streamed: Option<bool>,
74}
75
76impl AlignedLyrics {
77    /// Map the `aligned_lyrics/v2` response body, tolerating missing keys.
78    ///
79    /// A non-object body, or one whose arrays are missing, maps to the empty
80    /// value, so a malformed or instrumental response is simply "no synced
81    /// lyrics" rather than an error.
82    pub fn from_json(raw: &Value) -> AlignedLyrics {
83        let words = raw
84            .get("aligned_words")
85            .and_then(Value::as_array)
86            .map(|items| items.iter().map(parse_word).collect())
87            .unwrap_or_default();
88        let lines = raw
89            .get("aligned_lyrics")
90            .and_then(Value::as_array)
91            .map(|items| items.iter().map(parse_line).collect())
92            .unwrap_or_default();
93        let waveform_data = raw
94            .get("waveform_data")
95            .and_then(Value::as_array)
96            .map(|items| items.iter().filter_map(Value::as_f64).collect())
97            .unwrap_or_default();
98        let hoot_cer = raw.get("hoot_cer").and_then(Value::as_f64);
99        let is_streamed = raw.get("is_streamed").and_then(Value::as_bool);
100        AlignedLyrics {
101            words,
102            lines,
103            waveform_data,
104            hoot_cer,
105            is_streamed,
106        }
107    }
108
109    /// Parse the `aligned_lyrics/v2` response bytes, or the empty value when the
110    /// body is not valid JSON (defensive: an odd body means "no synced lyrics").
111    pub fn from_bytes(body: &[u8]) -> AlignedLyrics {
112        serde_json::from_slice::<Value>(body)
113            .map(|value| Self::from_json(&value))
114            .unwrap_or_default()
115    }
116
117    /// True when the clip carries no aligned lyrics (an instrumental, or a clip
118    /// Suno could not align). No synced artefact is written for such a clip.
119    pub fn is_empty(&self) -> bool {
120        self.lines.is_empty() && self.words.is_empty()
121    }
122
123    /// The plain lyric text, one line per aligned line (falling back to the flat
124    /// word list when there are no lines), for the unsynced `LYRICS`/`USLT` tag.
125    ///
126    /// Returns an empty string when there is nothing to embed.
127    pub fn plain_text(&self) -> String {
128        if !self.lines.is_empty() {
129            return self
130                .lines
131                .iter()
132                .map(|line| line.text.trim_end())
133                .collect::<Vec<_>>()
134                .join("\n");
135        }
136        self.words
137            .iter()
138            .map(|word| word.word.as_str())
139            .collect::<Vec<_>>()
140            .join(" ")
141    }
142
143    /// The body of a standard (line-level) `.lrc`: one `[mm:ss.xx]` stamp per
144    /// aligned line, followed by the line text.
145    ///
146    /// Line-level is the universally supported LRC form, so every player syncs
147    /// and displays it cleanly; the enhanced "A2" per-word `<mm:ss.xx>` tags are
148    /// parsed by only a few karaoke players and are shown as literal text by the
149    /// rest, so they are not emitted here. Word-level timing is carried instead
150    /// in the MP3 `SYLT` frame (see [`sylt_entries`](Self::sylt_entries)). A line
151    /// with empty text falls back to its nested words joined by spaces. The body
152    /// is empty when there are no lines; callers treat that as "no `.lrc`".
153    pub fn lrc_body(&self) -> String {
154        let mut out = String::new();
155        for line in &self.lines {
156            let text = if line.text.trim().is_empty() {
157                line.words
158                    .iter()
159                    .map(|w| w.text.trim())
160                    .filter(|t| !t.is_empty())
161                    .collect::<Vec<_>>()
162                    .join(" ")
163            } else {
164                line.text.trim().to_owned()
165            };
166            let _ = writeln!(out, "[{}]{text}", lrc_stamp(line.start_s));
167        }
168        out
169    }
170
171    /// Word-level `SYLT` content: `(offset_ms, text)` pairs in time order.
172    ///
173    /// Each new line's first word carries a leading newline so a player renders
174    /// line breaks (the ID3v2 `SYLT` convention). Uses Suno's own line grouping;
175    /// a line with no nested words contributes its whole text as one segment.
176    pub fn sylt_entries(&self) -> Vec<(u32, String)> {
177        let mut entries = Vec::new();
178        for (line_index, line) in self.lines.iter().enumerate() {
179            let words: Vec<&AlignedLineWord> = line
180                .words
181                .iter()
182                .filter(|w| !w.text.trim().is_empty())
183                .collect();
184            let prefix = if line_index == 0 { "" } else { "\n" };
185            if words.is_empty() {
186                let text = line.text.trim();
187                if !text.is_empty() {
188                    entries.push((to_ms(line.start_s), format!("{prefix}{text}")));
189                }
190                continue;
191            }
192            for (word_index, word) in words.iter().enumerate() {
193                let text = word.text.trim();
194                let segment = if word_index == 0 {
195                    format!("{prefix}{text}")
196                } else {
197                    format!(" {text}")
198                };
199                entries.push((to_ms(word.start_s), segment));
200            }
201        }
202        entries
203    }
204}
205
206fn parse_word(raw: &Value) -> AlignedWord {
207    AlignedWord {
208        word: string(raw, "word"),
209        success: raw.get("success").and_then(Value::as_bool).unwrap_or(false),
210        start_s: f64_field(raw, "start_s"),
211        end_s: f64_field(raw, "end_s"),
212        p_align: f64_field(raw, "p_align"),
213    }
214}
215
216fn parse_line(raw: &Value) -> AlignedLine {
217    let words = raw
218        .get("words")
219        .and_then(Value::as_array)
220        .map(|items| {
221            items
222                .iter()
223                .map(|word| AlignedLineWord {
224                    text: string(word, "text"),
225                    start_s: f64_field(word, "start_s"),
226                    end_s: f64_field(word, "end_s"),
227                })
228                .collect()
229        })
230        .unwrap_or_default();
231    AlignedLine {
232        text: string(raw, "text"),
233        start_s: f64_field(raw, "start_s"),
234        end_s: f64_field(raw, "end_s"),
235        section: string(raw, "section"),
236        words,
237    }
238}
239
240fn string(value: &Value, key: &str) -> String {
241    value
242        .get(key)
243        .and_then(Value::as_str)
244        .unwrap_or("")
245        .to_string()
246}
247
248fn f64_field(value: &Value, key: &str) -> f64 {
249    value.get(key).and_then(Value::as_f64).unwrap_or(0.0)
250}
251
/// Convert `secs` to whole milliseconds, rounded to nearest.
///
/// Non-finite and non-positive inputs give `0`, so the result is never
/// negative; the float-to-integer `as` cast saturates at `u32::MAX` for values
/// too large to represent.
fn to_ms(secs: f64) -> u32 {
    if secs.is_finite() && secs > 0.0 {
        (secs * 1000.0).round() as u32
    } else {
        0
    }
}
259
260/// Format `secs` as an LRC line stamp `mm:ss.xx` (centiseconds), with minutes
261/// allowed to exceed 59 so a long track is not wrapped.
262fn lrc_stamp(secs: f64) -> String {
263    let cs = centiseconds(secs);
264    format!("{:02}:{:02}.{:02}", cs / 6000, (cs / 100) % 60, cs % 100)
265}
266
/// Convert `secs` to whole centiseconds, rounded to nearest; non-finite and
/// non-positive inputs give `0`.
fn centiseconds(secs: f64) -> u64 {
    if secs.is_finite() && secs > 0.0 {
        (secs * 100.0).round() as u64
    } else {
        0
    }
}
273
#[cfg(test)]
mod tests {
    use super::*;

    /// Fixture: two sung lines around a bare section marker, with per-word
    /// timing, mirroring the live response shape.
    fn sample_json() -> Value {
        serde_json::json!({
            "aligned_words": [
                {"word": "Hello", "success": true, "start_s": 0.5, "end_s": 0.9, "p_align": 0.99},
                {"word": "world", "success": true, "start_s": 1.0, "end_s": 1.4, "p_align": 0.98},
                {"word": "again", "success": true, "start_s": 61.2, "end_s": 61.8, "p_align": 0.97}
            ],
            "aligned_lyrics": [
                {"text": "Hello world", "start_s": 0.5, "end_s": 1.4, "section": "Verse 1",
                 "words": [
                     {"text": "Hello", "start_s": 0.5, "end_s": 0.9},
                     {"text": "world", "start_s": 1.0, "end_s": 1.4}
                 ]},
                {"text": "[Chorus]", "start_s": 60.0, "end_s": 60.0, "section": "Chorus", "words": []},
                {"text": "again", "start_s": 61.2, "end_s": 61.8, "section": "Chorus",
                 "words": [{"text": "again", "start_s": 61.2, "end_s": 61.8}]}
            ],
            "hoot_cer": 0.22,
            "is_streamed": false
        })
    }

    /// The fixture, already mapped.
    fn sample() -> AlignedLyrics {
        AlignedLyrics::from_json(&sample_json())
    }

    #[test]
    fn parses_words_and_lines() {
        let parsed = sample();
        assert_eq!(parsed.words.len(), 3);
        assert_eq!(parsed.lines.len(), 3);

        let first_word = &parsed.words[0];
        assert_eq!(first_word.word, "Hello");
        assert!(first_word.success);
        assert!((first_word.p_align - 0.99).abs() < 1e-9);

        let first_line = &parsed.lines[0];
        assert_eq!(first_line.section, "Verse 1");
        assert_eq!(first_line.words.len(), 2);
        assert_eq!(first_line.words[1].text, "world");

        assert!(!parsed.is_empty());
    }

    #[test]
    fn empty_arrays_are_empty() {
        let body = serde_json::json!({
            "aligned_words": [], "aligned_lyrics": [], "hoot_cer": 1.0, "is_streamed": false
        });
        let parsed = AlignedLyrics::from_json(&body);
        assert!(parsed.is_empty());
        assert_eq!(parsed.plain_text(), "");
        assert_eq!(parsed.lrc_body(), "");
        assert!(parsed.sylt_entries().is_empty());
    }

    #[test]
    fn missing_keys_map_to_empty() {
        let no_keys = AlignedLyrics::from_json(&serde_json::json!({}));
        assert!(no_keys.is_empty());
        let not_an_object = AlignedLyrics::from_json(&Value::Null);
        assert!(not_an_object.is_empty());
        let not_json = AlignedLyrics::from_bytes(b"not json");
        assert!(not_json.is_empty());
    }

    #[test]
    fn captures_waveform_hoot_cer_and_is_streamed_absent_safe() {
        // Besides words and lines, the v2 body has a waveform envelope, an
        // alignment-error metric, and a streaming flag; each is additive
        // metadata captured verbatim.
        let body = serde_json::json!({
            "aligned_words": [],
            "aligned_lyrics": [],
            "waveform_data": [0.00044, 0.0, 0.00014, 0.0008, 0.00146],
            "hoot_cer": 0.22907083716651333_f64,
            "is_streamed": false
        });
        let parsed = AlignedLyrics::from_json(&body);
        assert_eq!(parsed.waveform_data.len(), 5);
        assert!((parsed.waveform_data[3] - 0.0008).abs() < 1e-9);
        assert!(
            parsed
                .hoot_cer
                .is_some_and(|cer| (cer - 0.229_070_837).abs() < 1e-6)
        );
        assert_eq!(parsed.is_streamed, Some(false));
        // Metadata is not lyric content: with no words or lines the value is
        // still "no synced lyrics", so no synced artefact is written.
        assert!(parsed.is_empty());

        // Absent extras degrade to empty/None instead of panicking.
        let bare = AlignedLyrics::from_json(&serde_json::json!({}));
        assert!(bare.waveform_data.is_empty());
        assert_eq!(bare.hoot_cer, None);
        assert_eq!(bare.is_streamed, None);

        // Wrong-typed extras are ignored in the same way, not reported.
        let odd = AlignedLyrics::from_json(&serde_json::json!({
            "waveform_data": "nope", "hoot_cer": "high", "is_streamed": 1
        }));
        assert!(odd.waveform_data.is_empty());
        assert_eq!(odd.hoot_cer, None);
        assert_eq!(odd.is_streamed, None);
    }

    #[test]
    fn lrc_body_has_line_level_stamps() {
        let expected = concat!(
            "[00:00.50]Hello world\n",
            "[01:00.00][Chorus]\n",
            "[01:01.20]again\n",
        );
        assert_eq!(sample().lrc_body(), expected);
    }

    #[test]
    fn plain_text_joins_line_text() {
        assert_eq!(sample().plain_text(), "Hello world\n[Chorus]\nagain");
    }

    #[test]
    fn sylt_entries_are_word_level_with_line_breaks() {
        let expected: Vec<(u32, String)> = [
            (500, "Hello"),
            (1000, " world"),
            (60000, "\n[Chorus]"),
            (61200, "\nagain"),
        ]
        .into_iter()
        .map(|(offset_ms, text)| (offset_ms, text.to_owned()))
        .collect();
        assert_eq!(sample().sylt_entries(), expected);
    }

    #[test]
    fn stamps_round_and_do_not_wrap_minutes() {
        // 61.2s renders as 01:01.20; more than an hour stays in minutes
        // rather than rolling over into hours.
        assert_eq!(lrc_stamp(61.2), "01:01.20");
        assert_eq!(lrc_stamp(3661.0), "61:01.00");
        assert_eq!(to_ms(1.2346), 1235);
        assert_eq!(to_ms(-1.0), 0);
    }
}