Skip to main content

wax/
soundcloud.rs

1use regex::Regex;
2use scraper::{Html, Selector};
3use serde::Deserialize;
4use url::Url;
5
6use crate::error::{AppError, Result};
7use crate::model::{ItemKind, Platform, SeedAlbum};
8
/// Client id that `extract_client_id` returns when the page HTML contains no
/// `apiClient` hydration entry.
const FALLBACK_CLIENT_ID: &str = "WU4bVxk5Df0g5JC8ULzW77Ry7OM10Lyj";
10
11pub fn normalize_url(url: &str) -> Result<String> {
12    let mut parsed = Url::parse(url)?;
13    parsed.set_fragment(None);
14    parsed.set_query(None);
15
16    let Some(host) = parsed.host_str() else {
17        return Err(AppError::InvalidInput(format!(
18            "unsupported SoundCloud URL: {url}"
19        )));
20    };
21
22    let host = host.to_ascii_lowercase();
23    if host != "soundcloud.com" && host != "www.soundcloud.com" && host != "m.soundcloud.com" {
24        if host == "on.soundcloud.com" {
25            return Ok(parsed.to_string().trim_end_matches('/').to_string());
26        }
27
28        return Err(AppError::InvalidInput(format!(
29            "unsupported SoundCloud URL: {url}"
30        )));
31    }
32
33    parsed
34        .set_host(Some("soundcloud.com"))
35        .map_err(|_| AppError::InvalidInput(format!("unsupported SoundCloud URL: {url}")))?;
36
37    let trimmed = parsed.path().trim_end_matches('/');
38    let segments: Vec<_> = trimmed
39        .split('/')
40        .filter(|segment| !segment.is_empty())
41        .collect();
42    if segments.len() < 2 {
43        return Err(AppError::InvalidInput(format!(
44            "expected a SoundCloud track or playlist URL: {url}"
45        )));
46    }
47
48    parsed.set_path(&format!("/{}", segments.join("/")));
49    Ok(parsed.to_string())
50}
51
52pub fn resolve_seed(url: &str, html: &str) -> Result<SeedAlbum> {
53    let document = Html::parse_document(html);
54    let canonical_url = meta_content(&document, r#"meta[property="og:url"]"#)
55        .or_else(|| Some(url.to_string()))
56        .map(|value| normalize_url(&value))
57        .transpose()?
58        .ok_or_else(|| AppError::Parse("unable to determine canonical URL".to_string()))?;
59
60    let kind = infer_kind(&canonical_url);
61    let title = meta_content(&document, r#"meta[property="og:title"]"#)
62        .or_else(|| json_field(html, "title"))
63        .or_else(|| title_text(&document))
64        .unwrap_or_else(|| "Unknown SoundCloud Item".to_string());
65    let artist = json_field(html, "username")
66        .or_else(|| meta_content(&document, r#"meta[name="twitter:audio:artist_name"]"#))
67        .or_else(|| meta_content(&document, r#"meta[property="soundcloud:creator"]"#))
68        .or_else(|| extract_artist_from_title(&title))
69        .unwrap_or_else(|| "Unknown Artist".to_string());
70    let genre = meta_content(&document, r#"meta[property="music:genre"]"#)
71        .or_else(|| json_field(html, "genre"));
72    let mut tags = Vec::new();
73    if let Some(genre) = genre.filter(|value| !value.trim().is_empty()) {
74        tags.push(genre);
75    }
76
77    Ok(SeedAlbum {
78        platform: Platform::Soundcloud,
79        kind,
80        title: clean_title(&title),
81        artist,
82        url: canonical_url.clone(),
83        artist_url: infer_artist_url(&canonical_url),
84        tags,
85        label: None,
86        release_id: infer_track_id(&document, html).or_else(|| json_numeric_field(html, "id")),
87    })
88}
89
/// A SoundCloud user together with tracks collected from their likes.
///
/// `parse_likers` creates values with an empty `tracks` list;
/// `parse_user_likes_page` returns a copy with `tracks` filled in.
#[derive(Debug, Clone)]
pub struct LikeSource {
    /// Numeric user id rendered as a string.
    pub id: String,
    /// The user's username.
    pub title: String,
    /// The user's profile URL; empty when the API response had none.
    pub url: String,
    /// Tracks gathered from this user's likes.
    pub tracks: Vec<crate::model::OwnedAlbum>,
}
97
98pub struct UserLikesPage {
99    pub source: Option<LikeSource>,
100    pub next_href: Option<String>,
101}
102
/// JSON body deserialized by `parse_likers`.
#[derive(Debug, Deserialize)]
struct LikersResponse {
    /// Users in the response; an absent field deserializes to an empty list.
    #[serde(default)]
    collection: Vec<ApiLiker>,
}
108
/// One user entry inside [`LikersResponse`].
#[derive(Debug, Deserialize)]
struct ApiLiker {
    /// Numeric user id.
    id: u64,
    /// Profile URL, when the response provides one.
    permalink_url: Option<String>,
    /// Username; required, so an entry without it fails deserialization.
    username: String,
}
115
/// A track-like object as returned by the API; deserialized directly by
/// `resolve_api_seed` and nested inside [`ApiLike`].
#[derive(Debug, Deserialize)]
struct ApiTrack {
    /// Numeric id of the item.
    id: u64,
    /// Item title.
    title: String,
    /// Public URL of the item.
    permalink_url: String,
    /// Object kind; the code in this file compares it against `"track"` and
    /// `"playlist"`. Empty when the field is absent.
    #[serde(default)]
    kind: String,
    /// Genre, if present.
    genre: Option<String>,
    /// Label name, if present.
    label_name: Option<String>,
    /// The owning user, if present.
    user: Option<ApiUser>,
}
127
/// The user object nested inside [`ApiTrack`].
#[derive(Debug, Deserialize)]
struct ApiUser {
    /// Username, used as the artist name.
    username: String,
    /// Profile URL, when the response provides one.
    permalink_url: Option<String>,
}
133
/// JSON body deserialized by `parse_user_likes_page`.
#[derive(Debug, Deserialize)]
struct UserLikesResponse {
    /// Like entries on this page; an absent field deserializes to an empty list.
    #[serde(default)]
    collection: Vec<ApiLike>,
    /// Passed through unchanged to `UserLikesPage::next_href`.
    next_href: Option<String>,
}
140
141#[derive(Debug, Deserialize)]
142struct ApiLike {
143    created_at: String,
144    track: Option<ApiTrack>,
145}
146
147pub fn extract_client_id(html: &str) -> Result<String> {
148    let pattern =
149        Regex::new(r#""hydratable":"apiClient","data":\{"id":"([^"]+)""#).expect("valid regex");
150    let client_id = pattern
151        .captures(html)
152        .and_then(|captures| captures.get(1).map(|value| value.as_str().to_string()))
153        .unwrap_or_else(|| FALLBACK_CLIENT_ID.to_string());
154
155    Ok(client_id)
156}
157
158pub fn likers_url(client_id: &str, track_id: &str, limit: usize) -> Result<String> {
159    let mut url = Url::parse(&format!(
160        "https://api-v2.soundcloud.com/tracks/{track_id}/likers"
161    ))?;
162    url.query_pairs_mut()
163        .append_pair("client_id", client_id)
164        .append_pair("limit", &limit.to_string());
165    Ok(url.to_string())
166}
167
168pub fn user_likes_url(client_id: &str, user_id: &str, limit: usize) -> Result<String> {
169    let mut url = Url::parse(&format!(
170        "https://api-v2.soundcloud.com/users/{user_id}/likes"
171    ))?;
172    url.query_pairs_mut()
173        .append_pair("client_id", client_id)
174        .append_pair("limit", &limit.to_string());
175    Ok(url.to_string())
176}
177
178pub fn resolve_api_url(client_id: &str, soundcloud_url: &str) -> Result<String> {
179    let mut url = Url::parse("https://api-v2.soundcloud.com/resolve")?;
180    url.query_pairs_mut()
181        .append_pair("url", soundcloud_url)
182        .append_pair("client_id", client_id);
183    Ok(url.to_string())
184}
185
186pub fn with_client_id(url: &str, client_id: &str) -> Result<String> {
187    let mut parsed = Url::parse(url)?;
188    let has_client_id = parsed.query_pairs().any(|(key, _)| key == "client_id");
189    if !has_client_id {
190        parsed.query_pairs_mut().append_pair("client_id", client_id);
191    }
192    Ok(parsed.to_string())
193}
194
195pub fn resolve_api_seed(json: &str) -> Result<SeedAlbum> {
196    let track: ApiTrack = serde_json::from_str(json)?;
197    let kind = match track.kind.as_str() {
198        "playlist" => ItemKind::Playlist,
199        _ => ItemKind::Track,
200    };
201    let artist = track
202        .user
203        .as_ref()
204        .map(|user| user.username.clone())
205        .unwrap_or_else(|| "Unknown Artist".to_string());
206    let artist_url = track.user.and_then(|user| user.permalink_url);
207
208    Ok(SeedAlbum {
209        platform: Platform::Soundcloud,
210        kind,
211        title: track.title,
212        artist,
213        url: track.permalink_url,
214        artist_url,
215        tags: track.genre.into_iter().collect(),
216        label: track.label_name,
217        release_id: Some(track.id.to_string()),
218    })
219}
220
221pub fn parse_likers(json: &str) -> Result<Vec<LikeSource>> {
222    let response: LikersResponse = serde_json::from_str(json)?;
223    Ok(response
224        .collection
225        .into_iter()
226        .map(|user| LikeSource {
227            id: user.id.to_string(),
228            title: user.username,
229            url: user.permalink_url.unwrap_or_default(),
230            tracks: Vec::new(),
231        })
232        .collect())
233}
234
235pub fn parse_user_likes_page(
236    json: &str,
237    user: &LikeSource,
238    seed_track_id: &str,
239    max_neighbors: usize,
240) -> Result<UserLikesPage> {
241    let response: UserLikesResponse = serde_json::from_str(json)?;
242    let seed_track_id = seed_track_id
243        .parse::<u64>()
244        .map_err(|_| AppError::Parse("invalid SoundCloud track id".to_string()))?;
245
246    let seed_index = response.collection.iter().position(|entry| {
247        entry
248            .track
249            .as_ref()
250            .map(|track| track.id == seed_track_id)
251            .unwrap_or(false)
252    });
253
254    let Some(seed_index) = seed_index else {
255        return Ok(UserLikesPage {
256            source: None,
257            next_href: response.next_href,
258        });
259    };
260
261    let seed_timestamp = response.collection[seed_index].created_at.clone();
262    let mut deduped = std::collections::HashMap::new();
263    for (index, entry) in response.collection.into_iter().enumerate() {
264        let distance = index.abs_diff(seed_index);
265        if distance == 0 || distance > max_neighbors {
266            continue;
267        }
268        let Some(track) = entry.track else {
269            continue;
270        };
271        if track.id == seed_track_id || track.kind != "track" {
272            continue;
273        }
274
275        deduped
276            .entry(track.permalink_url.clone())
277            .or_insert_with(|| {
278                let mut tags: Vec<String> = track.genre.into_iter().collect();
279                if !entry.created_at.is_empty() {
280                    tags.push(format!("liked_at:{}", entry.created_at));
281                }
282                tags.push(format!("seed_liked_at:{seed_timestamp}"));
283
284                crate::model::OwnedAlbum {
285                    platform: Platform::Soundcloud,
286                    kind: ItemKind::Track,
287                    title: track.title,
288                    artist: track
289                        .user
290                        .map(|user| user.username)
291                        .unwrap_or_else(|| "Unknown Artist".to_string()),
292                    url: track.permalink_url,
293                    tags,
294                    label: track.label_name,
295                }
296            });
297    }
298
299    if deduped.is_empty() {
300        return Ok(UserLikesPage {
301            source: None,
302            next_href: response.next_href,
303        });
304    }
305
306    Ok(UserLikesPage {
307        source: Some(LikeSource {
308            id: user.id.clone(),
309            title: user.title.clone(),
310            url: user.url.clone(),
311            tracks: deduped.into_values().collect(),
312        }),
313        next_href: response.next_href,
314    })
315}
316
317fn infer_kind(url: &str) -> ItemKind {
318    if url.contains("/sets/") {
319        ItemKind::Playlist
320    } else {
321        ItemKind::Track
322    }
323}
324
325fn infer_artist_url(canonical_url: &str) -> Option<String> {
326    let parsed = Url::parse(canonical_url).ok()?;
327    let segments: Vec<_> = parsed
328        .path_segments()?
329        .filter(|segment| !segment.is_empty())
330        .collect();
331    let first = segments.first()?;
332    Some(format!("https://soundcloud.com/{first}"))
333}
334
335fn meta_content(document: &Html, selector: &str) -> Option<String> {
336    let selector = Selector::parse(selector).ok()?;
337    document
338        .select(&selector)
339        .next()
340        .and_then(|node| node.value().attr("content"))
341        .map(|value| value.trim().to_string())
342}
343
344fn title_text(document: &Html) -> Option<String> {
345    let selector = Selector::parse("title").ok()?;
346    document
347        .select(&selector)
348        .next()
349        .map(|node| collapse_ws(&node.text().collect::<Vec<_>>().join(" ")))
350}
351
352fn json_field(html: &str, field: &str) -> Option<String> {
353    let pattern = Regex::new(&format!(r#""{}"\s*:\s*"([^"]+)""#, regex::escape(field))).ok()?;
354    let captures = pattern.captures(html)?;
355    let value = captures.get(1)?.as_str();
356    Some(html_escape(value))
357}
358
359fn json_numeric_field(html: &str, field: &str) -> Option<String> {
360    let pattern = Regex::new(&format!(r#""{}"\s*:\s*([0-9]+)"#, regex::escape(field))).ok()?;
361    let captures = pattern.captures(html)?;
362    Some(captures.get(1)?.as_str().to_string())
363}
364
365fn infer_track_id(document: &Html, html: &str) -> Option<String> {
366    let meta_keys = [
367        r#"meta[property="twitter:app:url:iphone"]"#,
368        r#"meta[property="twitter:app:url:ipad"]"#,
369        r#"meta[property="twitter:app:url:googleplay"]"#,
370        r#"meta[property="al:ios:url"]"#,
371        r#"meta[property="al:android:url"]"#,
372    ];
373
374    for key in meta_keys {
375        if let Some(value) = meta_content(document, key) {
376            if let Some(id) = extract_sound_id(&value) {
377                return Some(id);
378            }
379        }
380    }
381
382    let patterns = [
383        Regex::new(r#"soundcloud://sounds:([0-9]+)"#).ok()?,
384        Regex::new(r#""urn"\s*:\s*"soundcloud:tracks:([0-9]+)""#).ok()?,
385        Regex::new(r#""station_urn"\s*:\s*"soundcloud:system-playlists:track-stations:([0-9]+)""#)
386            .ok()?,
387    ];
388
389    for pattern in patterns {
390        if let Some(captures) = pattern.captures(html) {
391            return captures.get(1).map(|value| value.as_str().to_string());
392        }
393    }
394
395    None
396}
397
398fn extract_sound_id(value: &str) -> Option<String> {
399    let pattern = Regex::new(r#"sounds:([0-9]+)"#).ok()?;
400    let captures = pattern.captures(value)?;
401    captures.get(1).map(|value| value.as_str().to_string())
402}
403
404fn extract_artist_from_title(title: &str) -> Option<String> {
405    let collapsed = collapse_ws(title);
406    if let Some((artist, _)) = collapsed.split_once(" - ") {
407        return Some(artist.trim().to_string());
408    }
409    None
410}
411
412fn clean_title(title: &str) -> String {
413    let collapsed = collapse_ws(title);
414    if let Some((artist, track)) = collapsed.split_once(" - ") {
415        if !artist.trim().is_empty() && !track.trim().is_empty() {
416            return track.trim().to_string();
417        }
418    }
419    collapsed
420}
421
/// Replaces the HTML entities `&amp;`, `&#39;` and `&quot;` with the
/// characters they stand for. Despite its name, this function decodes
/// entities rather than producing them.
///
/// The input is decoded in a single left-to-right pass, so the output of one
/// replacement is never decoded again: `&amp;quot;` becomes `&quot;`, not `"`.
/// Any other `&` is copied through unchanged.
fn html_escape(value: &str) -> String {
    const ENTITIES: [(&str, char); 3] = [("&amp;", '&'), ("&#39;", '\''), ("&quot;", '"')];

    let mut decoded = String::with_capacity(value.len());
    let mut rest = value;
    while let Some(position) = rest.find('&') {
        decoded.push_str(&rest[..position]);
        rest = &rest[position..];
        match ENTITIES
            .iter()
            .find(|(entity, _)| rest.starts_with(*entity))
        {
            Some((entity, replacement)) => {
                decoded.push(*replacement);
                rest = &rest[entity.len()..];
            }
            None => {
                // Not an entity handled here: keep the ampersand as-is.
                decoded.push('&');
                rest = &rest[1..];
            }
        }
    }
    decoded.push_str(rest);
    decoded
}
428
/// Trims `value` and replaces every run of whitespace with a single space.
fn collapse_ws(value: &str) -> String {
    let mut collapsed = String::with_capacity(value.len());
    for word in value.split_whitespace() {
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
432
// Unit tests for URL normalization, HTML seed resolution, client-id
// extraction and the JSON parsers. The two parser tests read fixture files
// from `../tests/fixtures/`.
#[cfg(test)]
mod tests {
    use super::*;

    // Mobile host, trailing slash, query and fragment are all normalized away.
    #[test]
    fn normalizes_soundcloud_track_url() {
        let actual =
            normalize_url("https://m.soundcloud.com/test-user/test-track/?si=abc#frag").unwrap();
        assert_eq!(actual, "https://soundcloud.com/test-user/test-track");
    }

    // A profile URL has a single path segment and must be rejected.
    #[test]
    fn rejects_soundcloud_profile_url_for_seed_resolution() {
        let err = normalize_url("https://soundcloud.com/test-user").unwrap_err();
        assert!(err
            .to_string()
            .contains("expected a SoundCloud track or playlist URL"));
    }

    // Exercises the meta-tag sources together with the embedded-JSON fallbacks
    // (`username` for the artist, numeric `id` for the release id).
    #[test]
    fn resolves_soundcloud_track_seed_from_meta_and_json() {
        let html = r#"
            <html>
                <head>
                    <meta property="og:url" content="https://soundcloud.com/test-user/test-track?si=123">
                    <meta property="og:title" content="Test User - Test Track">
                    <meta property="music:genre" content="ambient">
                    <script type="application/ld+json">
                        {"username":"Test User","title":"Test Track","id":12345}
                    </script>
                </head>
            </html>
        "#;

        let seed = resolve_seed("https://soundcloud.com/test-user/test-track", html).unwrap();
        assert_eq!(seed.platform, Platform::Soundcloud);
        assert_eq!(seed.kind, ItemKind::Track);
        assert_eq!(seed.title, "Test Track");
        assert_eq!(seed.artist, "Test User");
        assert_eq!(seed.url, "https://soundcloud.com/test-user/test-track");
        assert_eq!(
            seed.artist_url.as_deref(),
            Some("https://soundcloud.com/test-user")
        );
        assert_eq!(seed.tags, vec!["ambient"]);
        assert_eq!(seed.release_id.as_deref(), Some("12345"));
    }

    // A `/sets/` URL is classified as a playlist.
    #[test]
    fn resolves_soundcloud_playlist_kind() {
        let html = r#"
            <html>
                <head>
                    <meta property="og:url" content="https://soundcloud.com/test-user/sets/test-set">
                    <meta property="og:title" content="Test User - Test Set">
                </head>
            </html>
        "#;

        let seed = resolve_seed("https://soundcloud.com/test-user/sets/test-set", html).unwrap();
        assert_eq!(seed.kind, ItemKind::Playlist);
    }

    // The client id is read from the `apiClient` hydration entry.
    #[test]
    fn extracts_client_id_from_hydration_blob() {
        let html = r#"
            <script>
                window.__sc_hydration = [{"hydratable":"apiClient","data":{"id":"abc123","isExpiring":false}}];
            </script>
        "#;

        assert_eq!(extract_client_id(html).unwrap(), "abc123");
    }

    // Expected counts and values depend on the fixture file's contents.
    #[test]
    fn parses_public_likers() {
        let json = include_str!("../tests/fixtures/soundcloud_likers.json");
        let likers = parse_likers(json).unwrap();

        assert_eq!(likers.len(), 2);
        assert_eq!(likers[0].id, "501");
        assert_eq!(likers[0].title, "listener-a");
    }

    // Seed track id "100" with a window of two entries on either side; the
    // expected track count depends on the fixture file's contents.
    #[test]
    fn parses_user_likes_near_seed_event() {
        let user = LikeSource {
            id: "501".to_string(),
            title: "listener-a".to_string(),
            url: "https://soundcloud.com/listener-a".to_string(),
            tracks: Vec::new(),
        };
        let json = include_str!("../tests/fixtures/soundcloud_user_likes_a.json");
        let source = parse_user_likes_page(json, &user, "100", 2)
            .unwrap()
            .source
            .unwrap();

        assert_eq!(source.tracks.len(), 3);
        assert_eq!(source.tracks[0].platform, Platform::Soundcloud);
    }
}