Skip to main content

suno_core/
lyrics.rs

//! Word- and line-level timed (synced) lyrics from Suno's aligned-lyrics API.
//!
//! [`AlignedLyrics`] is the parsed shape of `GET /api/gen/{id}/aligned_lyrics/v2/`
//! ([`SunoClient::aligned_lyrics`](crate::SunoClient::aligned_lyrics)): a flat
//! word-level list plus a line-level list carrying section labels and nested
//! per-word timing. Everything here is pure and free of direct IO: the client
//! fetches the response, and this module maps it into [`AlignedLyrics`]
//! and renders the synced artefacts (the line-synced `.lrc` body, a word-level
//! ID3 `SYLT` table, and a plain-text fallback), so the mapping and formatting
//! are unit tested without a network.
//!
//! Instrumentals (and any clip Suno could not force-align) return `200` with
//! empty arrays, so [`AlignedLyrics::is_empty`] is the signal to write no synced
//! artefact for that clip, exactly as an empty cover URL writes no cover.
14
15use std::fmt::Write as _;
16
17use serde_json::Value;
18
/// A single entry of the flat `aligned_words` list: one force-aligned word.
///
/// Besides the text and its time span, each entry carries Suno's per-word
/// alignment flag (`success`, which can be `false` where forced alignment
/// failed) and its confidence (`p_align`), so callers can gate on either. For
/// rendering, the line-level [`AlignedLine::words`] is preferred because it
/// already reflects Suno's own grouping of words into lines.
#[derive(Debug, Clone, PartialEq)]
pub struct AlignedWord {
    /// The word text (the `word` key); empty when the key is missing.
    pub word: String,
    /// Suno's alignment flag for this word; `false` when the key is missing.
    pub success: bool,
    /// Start of the word (the `start_s` key); `0.0` when missing.
    pub start_s: f64,
    /// End of the word (the `end_s` key); `0.0` when missing.
    pub end_s: f64,
    /// Suno's alignment confidence (the `p_align` key); `0.0` when missing.
    pub p_align: f64,
}
33
/// A word nested inside a line, taken from `aligned_lyrics[].words`.
///
/// In this nested list the API keys the word text as `text`, whereas the flat
/// list keys it as `word`. Nested words have no `success`/`p_align`; they are
/// Suno's authoritative grouping of words into lines and are what the `.lrc`
/// and `SYLT` renderers use.
#[derive(Debug, Clone, PartialEq)]
pub struct AlignedLineWord {
    /// The word text (the `text` key); empty when the key is missing.
    pub text: String,
    /// Start of the word in seconds (the `start_s` key); `0.0` when missing.
    pub start_s: f64,
    /// End of the word in seconds (the `end_s` key); `0.0` when missing.
    pub end_s: f64,
}
45
46/// One aligned line: its text, span, section label, and nested words.
47#[derive(Debug, Clone, PartialEq)]
48pub struct AlignedLine {
49    pub text: String,
50    pub start_s: f64,
51    pub end_s: f64,
52    /// Structural section label (e.g. `Verse 1`, `Chorus`), empty when absent.
53    pub section: String,
54    pub words: Vec<AlignedLineWord>,
55}
56
57/// A clip's aligned lyrics: the flat word list and the line list.
58///
59/// Both are empty for an instrumental or an un-alignable clip; see
60/// [`is_empty`](Self::is_empty).
61#[derive(Debug, Clone, Default, PartialEq)]
62pub struct AlignedLyrics {
63    pub words: Vec<AlignedWord>,
64    pub lines: Vec<AlignedLine>,
65}
66
67impl AlignedLyrics {
68    /// Map the `aligned_lyrics/v2` response body, tolerating missing keys.
69    ///
70    /// A non-object body, or one whose arrays are missing, maps to the empty
71    /// value, so a malformed or instrumental response is simply "no synced
72    /// lyrics" rather than an error.
73    pub fn from_json(raw: &Value) -> AlignedLyrics {
74        let words = raw
75            .get("aligned_words")
76            .and_then(Value::as_array)
77            .map(|items| items.iter().map(parse_word).collect())
78            .unwrap_or_default();
79        let lines = raw
80            .get("aligned_lyrics")
81            .and_then(Value::as_array)
82            .map(|items| items.iter().map(parse_line).collect())
83            .unwrap_or_default();
84        AlignedLyrics { words, lines }
85    }
86
87    /// Parse the `aligned_lyrics/v2` response bytes, or the empty value when the
88    /// body is not valid JSON (defensive: an odd body means "no synced lyrics").
89    pub fn from_bytes(body: &[u8]) -> AlignedLyrics {
90        serde_json::from_slice::<Value>(body)
91            .map(|value| Self::from_json(&value))
92            .unwrap_or_default()
93    }
94
95    /// True when the clip carries no aligned lyrics (an instrumental, or a clip
96    /// Suno could not align). No synced artefact is written for such a clip.
97    pub fn is_empty(&self) -> bool {
98        self.lines.is_empty() && self.words.is_empty()
99    }
100
101    /// The plain lyric text, one line per aligned line (falling back to the flat
102    /// word list when there are no lines), for the unsynced `LYRICS`/`USLT` tag.
103    ///
104    /// Returns an empty string when there is nothing to embed.
105    pub fn plain_text(&self) -> String {
106        if !self.lines.is_empty() {
107            return self
108                .lines
109                .iter()
110                .map(|line| line.text.trim_end())
111                .collect::<Vec<_>>()
112                .join("\n");
113        }
114        self.words
115            .iter()
116            .map(|word| word.word.as_str())
117            .collect::<Vec<_>>()
118            .join(" ")
119    }
120
121    /// The body of a standard (line-level) `.lrc`: one `[mm:ss.xx]` stamp per
122    /// aligned line, followed by the line text.
123    ///
124    /// Line-level is the universally supported LRC form, so every player syncs
125    /// and displays it cleanly; the enhanced "A2" per-word `<mm:ss.xx>` tags are
126    /// parsed by only a few karaoke players and are shown as literal text by the
127    /// rest, so they are not emitted here. Word-level timing is carried instead
128    /// in the MP3 `SYLT` frame (see [`sylt_entries`](Self::sylt_entries)). A line
129    /// with empty text falls back to its nested words joined by spaces. The body
130    /// is empty when there are no lines; callers treat that as "no `.lrc`".
131    pub fn lrc_body(&self) -> String {
132        let mut out = String::new();
133        for line in &self.lines {
134            let text = if line.text.trim().is_empty() {
135                line.words
136                    .iter()
137                    .map(|w| w.text.trim())
138                    .filter(|t| !t.is_empty())
139                    .collect::<Vec<_>>()
140                    .join(" ")
141            } else {
142                line.text.trim().to_owned()
143            };
144            let _ = writeln!(out, "[{}]{text}", lrc_stamp(line.start_s));
145        }
146        out
147    }
148
149    /// Word-level `SYLT` content: `(offset_ms, text)` pairs in time order.
150    ///
151    /// Each new line's first word carries a leading newline so a player renders
152    /// line breaks (the ID3v2 `SYLT` convention). Uses Suno's own line grouping;
153    /// a line with no nested words contributes its whole text as one segment.
154    pub fn sylt_entries(&self) -> Vec<(u32, String)> {
155        let mut entries = Vec::new();
156        for (line_index, line) in self.lines.iter().enumerate() {
157            let words: Vec<&AlignedLineWord> = line
158                .words
159                .iter()
160                .filter(|w| !w.text.trim().is_empty())
161                .collect();
162            let prefix = if line_index == 0 { "" } else { "\n" };
163            if words.is_empty() {
164                let text = line.text.trim();
165                if !text.is_empty() {
166                    entries.push((to_ms(line.start_s), format!("{prefix}{text}")));
167                }
168                continue;
169            }
170            for (word_index, word) in words.iter().enumerate() {
171                let text = word.text.trim();
172                let segment = if word_index == 0 {
173                    format!("{prefix}{text}")
174                } else {
175                    format!(" {text}")
176                };
177                entries.push((to_ms(word.start_s), segment));
178            }
179        }
180        entries
181    }
182}
183
184fn parse_word(raw: &Value) -> AlignedWord {
185    AlignedWord {
186        word: string(raw, "word"),
187        success: raw.get("success").and_then(Value::as_bool).unwrap_or(false),
188        start_s: f64_field(raw, "start_s"),
189        end_s: f64_field(raw, "end_s"),
190        p_align: f64_field(raw, "p_align"),
191    }
192}
193
194fn parse_line(raw: &Value) -> AlignedLine {
195    let words = raw
196        .get("words")
197        .and_then(Value::as_array)
198        .map(|items| {
199            items
200                .iter()
201                .map(|word| AlignedLineWord {
202                    text: string(word, "text"),
203                    start_s: f64_field(word, "start_s"),
204                    end_s: f64_field(word, "end_s"),
205                })
206                .collect()
207        })
208        .unwrap_or_default();
209    AlignedLine {
210        text: string(raw, "text"),
211        start_s: f64_field(raw, "start_s"),
212        end_s: f64_field(raw, "end_s"),
213        section: string(raw, "section"),
214        words,
215    }
216}
217
218fn string(value: &Value, key: &str) -> String {
219    value
220        .get(key)
221        .and_then(Value::as_str)
222        .unwrap_or("")
223        .to_string()
224}
225
226fn f64_field(value: &Value, key: &str) -> f64 {
227    value.get(key).and_then(Value::as_f64).unwrap_or(0.0)
228}
229
/// Convert `secs` to whole milliseconds, rounded to nearest.
///
/// Anything that is not a positive finite number (zero, negatives, NaN, either
/// infinity) yields `0`, so the result is never negative.
fn to_ms(secs: f64) -> u32 {
    if secs.is_finite() && secs > 0.0 {
        // Float-to-int `as` saturates, so an oversized value becomes `u32::MAX`.
        (secs * 1000.0).round() as u32
    } else {
        0
    }
}
237
238/// Format `secs` as an LRC line stamp `mm:ss.xx` (centiseconds), with minutes
239/// allowed to exceed 59 so a long track is not wrapped.
240fn lrc_stamp(secs: f64) -> String {
241    let cs = centiseconds(secs);
242    format!("{:02}:{:02}.{:02}", cs / 6000, (cs / 100) % 60, cs % 100)
243}
244
/// Convert `secs` to whole centiseconds, rounded to nearest.
///
/// Anything that is not a positive finite number (zero, negatives, NaN, either
/// infinity) yields `0`.
fn centiseconds(secs: f64) -> u64 {
    if secs.is_finite() && secs > 0.0 {
        // Float-to-int `as` saturates rather than wrapping on oversized values.
        (secs * 100.0).round() as u64
    } else {
        0
    }
}
251
#[cfg(test)]
mod tests {
    use super::*;

    /// A small two-line sample with per-word timing, mirroring the live shape.
    fn sample_json() -> Value {
        serde_json::json!({
            "aligned_words": [
                {"word": "Hello", "success": true, "start_s": 0.5, "end_s": 0.9, "p_align": 0.99},
                {"word": "world", "success": true, "start_s": 1.0, "end_s": 1.4, "p_align": 0.98},
                {"word": "again", "success": true, "start_s": 61.2, "end_s": 61.8, "p_align": 0.97}
            ],
            "aligned_lyrics": [
                {"text": "Hello world", "start_s": 0.5, "end_s": 1.4, "section": "Verse 1",
                 "words": [
                     {"text": "Hello", "start_s": 0.5, "end_s": 0.9},
                     {"text": "world", "start_s": 1.0, "end_s": 1.4}
                 ]},
                {"text": "[Chorus]", "start_s": 60.0, "end_s": 60.0, "section": "Chorus", "words": []},
                {"text": "again", "start_s": 61.2, "end_s": 61.8, "section": "Chorus",
                 "words": [{"text": "again", "start_s": 61.2, "end_s": 61.8}]}
            ],
            "hoot_cer": 0.22,
            "is_streamed": false
        })
    }

    #[test]
    fn parses_words_and_lines() {
        let aligned = AlignedLyrics::from_json(&sample_json());
        assert_eq!(aligned.words.len(), 3);
        assert_eq!(aligned.lines.len(), 3);
        assert_eq!(aligned.words[0].word, "Hello");
        assert!(aligned.words[0].success);
        assert!((aligned.words[0].p_align - 0.99).abs() < 1e-9);
        assert_eq!(aligned.lines[0].section, "Verse 1");
        assert_eq!(aligned.lines[0].words.len(), 2);
        assert_eq!(aligned.lines[0].words[1].text, "world");
        assert!(!aligned.is_empty());
    }

    #[test]
    fn empty_arrays_are_empty() {
        let json = serde_json::json!({
            "aligned_words": [], "aligned_lyrics": [], "hoot_cer": 1.0, "is_streamed": false
        });
        let aligned = AlignedLyrics::from_json(&json);
        assert!(aligned.is_empty());
        assert_eq!(aligned.plain_text(), "");
        assert_eq!(aligned.lrc_body(), "");
        assert!(aligned.sylt_entries().is_empty());
    }

    #[test]
    fn missing_keys_map_to_empty() {
        assert!(AlignedLyrics::from_json(&serde_json::json!({})).is_empty());
        assert!(AlignedLyrics::from_json(&Value::Null).is_empty());
        assert!(AlignedLyrics::from_bytes(b"not json").is_empty());
    }

    /// Mistyped or absent fields inside an element fall back to empty/zero
    /// values instead of dropping the element.
    #[test]
    fn mistyped_fields_take_defaults() {
        let json = serde_json::json!({
            "aligned_words": [{"word": 7, "start_s": "soon"}],
            "aligned_lyrics": [{}]
        });
        let aligned = AlignedLyrics::from_json(&json);
        assert_eq!(
            aligned.words,
            vec![AlignedWord {
                word: String::new(),
                success: false,
                start_s: 0.0,
                end_s: 0.0,
                p_align: 0.0,
            }]
        );
        assert_eq!(
            aligned.lines,
            vec![AlignedLine {
                text: String::new(),
                start_s: 0.0,
                end_s: 0.0,
                section: String::new(),
                words: Vec::new(),
            }]
        );
    }

    /// Valid JSON bytes go through the same mapping as `from_json`.
    #[test]
    fn from_bytes_parses_valid_json() {
        let body = br#"{"aligned_words": [{"word": "Hi", "success": true, "start_s": 1, "end_s": 2, "p_align": 1}]}"#;
        let aligned = AlignedLyrics::from_bytes(body);
        assert_eq!(aligned.words.len(), 1);
        assert_eq!(aligned.words[0].word, "Hi");
        assert!(aligned.words[0].success);
        assert!((aligned.words[0].start_s - 1.0).abs() < 1e-9);
        assert!(aligned.lines.is_empty());
        assert!(!aligned.is_empty());
    }

    #[test]
    fn lrc_body_has_line_level_stamps() {
        let aligned = AlignedLyrics::from_json(&sample_json());
        let body = aligned.lrc_body();
        let expected = "[00:00.50]Hello world\n\
             [01:00.00][Chorus]\n\
             [01:01.20]again\n";
        assert_eq!(body, expected);
    }

    /// A line whose own text is blank is rendered from its nested words, with
    /// whitespace-only words dropped.
    #[test]
    fn blank_line_text_falls_back_to_nested_words() {
        let json = serde_json::json!({
            "aligned_lyrics": [
                {"text": "   ", "start_s": 2.0, "end_s": 3.0, "section": "",
                 "words": [
                     {"text": " la ", "start_s": 2.0, "end_s": 2.4},
                     {"text": "  ", "start_s": 2.4, "end_s": 2.5},
                     {"text": "di", "start_s": 2.5, "end_s": 3.0}
                 ]}
            ]
        });
        let aligned = AlignedLyrics::from_json(&json);
        assert_eq!(aligned.lrc_body(), "[00:02.00]la di\n");
        assert_eq!(
            aligned.sylt_entries(),
            vec![(2000, "la".to_owned()), (2500, " di".to_owned())]
        );
    }

    #[test]
    fn plain_text_joins_line_text() {
        let aligned = AlignedLyrics::from_json(&sample_json());
        assert_eq!(aligned.plain_text(), "Hello world\n[Chorus]\nagain");
    }

    /// With no line list, the plain text comes from the flat word list and no
    /// synced artefact is produced.
    #[test]
    fn plain_text_falls_back_to_flat_words() {
        let json = serde_json::json!({
            "aligned_words": [
                {"word": "Hello", "success": true, "start_s": 0.5, "end_s": 0.9, "p_align": 0.99},
                {"word": "world", "success": false, "start_s": 1.0, "end_s": 1.4, "p_align": 0.10}
            ]
        });
        let aligned = AlignedLyrics::from_json(&json);
        assert!(aligned.lines.is_empty());
        assert!(!aligned.is_empty());
        assert!(!aligned.words[1].success);
        assert_eq!(aligned.plain_text(), "Hello world");
        assert_eq!(aligned.lrc_body(), "");
        assert!(aligned.sylt_entries().is_empty());
    }

    #[test]
    fn sylt_entries_are_word_level_with_line_breaks() {
        let aligned = AlignedLyrics::from_json(&sample_json());
        let entries = aligned.sylt_entries();
        assert_eq!(
            entries,
            vec![
                (500, "Hello".to_owned()),
                (1000, " world".to_owned()),
                (60000, "\n[Chorus]".to_owned()),
                (61200, "\nagain".to_owned()),
            ]
        );
    }

    #[test]
    fn stamps_round_and_do_not_wrap_minutes() {
        // 61.2s -> 01:01.20; a value over an hour stays in minutes (not hours).
        assert_eq!(lrc_stamp(61.2), "01:01.20");
        assert_eq!(lrc_stamp(3661.0), "61:01.00");
        assert_eq!(to_ms(1.2346), 1235);
        assert_eq!(to_ms(-1.0), 0);
    }

    /// Non-finite and negative times clamp to zero, and rounding up to a full
    /// minute carries into the minutes field instead of printing `60` seconds.
    #[test]
    fn stamps_clamp_and_carry() {
        assert_eq!(lrc_stamp(f64::NAN), "00:00.00");
        assert_eq!(lrc_stamp(-5.0), "00:00.00");
        assert_eq!(lrc_stamp(f64::INFINITY), "00:00.00");
        assert_eq!(lrc_stamp(59.999), "01:00.00");
        assert_eq!(to_ms(f64::NAN), 0);
        assert_eq!(to_ms(f64::INFINITY), 0);
    }
}