1use regex::Regex;
2use scraper::{Html, Selector};
3use serde::Deserialize;
4use url::Url;
5
6use crate::error::{AppError, Result};
7use crate::model::{ItemKind, Platform, SeedAlbum};
8
/// Client id returned by `extract_client_id` when the page HTML does not
/// contain an `apiClient` hydration entry to read one from.
// NOTE(review): a hard-coded id can presumably go stale; confirm how it is refreshed.
const FALLBACK_CLIENT_ID: &str = "WU4bVxk5Df0g5JC8ULzW77Ry7OM10Lyj";
10
11pub fn normalize_url(url: &str) -> Result<String> {
12 let mut parsed = Url::parse(url)?;
13 parsed.set_fragment(None);
14 parsed.set_query(None);
15
16 let Some(host) = parsed.host_str() else {
17 return Err(AppError::InvalidInput(format!(
18 "unsupported SoundCloud URL: {url}"
19 )));
20 };
21
22 let host = host.to_ascii_lowercase();
23 if host != "soundcloud.com" && host != "www.soundcloud.com" && host != "m.soundcloud.com" {
24 if host == "on.soundcloud.com" {
25 return Ok(parsed.to_string().trim_end_matches('/').to_string());
26 }
27
28 return Err(AppError::InvalidInput(format!(
29 "unsupported SoundCloud URL: {url}"
30 )));
31 }
32
33 parsed
34 .set_host(Some("soundcloud.com"))
35 .map_err(|_| AppError::InvalidInput(format!("unsupported SoundCloud URL: {url}")))?;
36
37 let trimmed = parsed.path().trim_end_matches('/');
38 let segments: Vec<_> = trimmed
39 .split('/')
40 .filter(|segment| !segment.is_empty())
41 .collect();
42 if segments.len() < 2 {
43 return Err(AppError::InvalidInput(format!(
44 "expected a SoundCloud track or playlist URL: {url}"
45 )));
46 }
47
48 parsed.set_path(&format!("/{}", segments.join("/")));
49 Ok(parsed.to_string())
50}
51
52pub fn resolve_seed(url: &str, html: &str) -> Result<SeedAlbum> {
53 let document = Html::parse_document(html);
54 let canonical_url = meta_content(&document, r#"meta[property="og:url"]"#)
55 .or_else(|| Some(url.to_string()))
56 .map(|value| normalize_url(&value))
57 .transpose()?
58 .ok_or_else(|| AppError::Parse("unable to determine canonical URL".to_string()))?;
59
60 let kind = infer_kind(&canonical_url);
61 let title = meta_content(&document, r#"meta[property="og:title"]"#)
62 .or_else(|| json_field(html, "title"))
63 .or_else(|| title_text(&document))
64 .unwrap_or_else(|| "Unknown SoundCloud Item".to_string());
65 let artist = json_field(html, "username")
66 .or_else(|| meta_content(&document, r#"meta[name="twitter:audio:artist_name"]"#))
67 .or_else(|| meta_content(&document, r#"meta[property="soundcloud:creator"]"#))
68 .or_else(|| extract_artist_from_title(&title))
69 .unwrap_or_else(|| "Unknown Artist".to_string());
70 let genre = meta_content(&document, r#"meta[property="music:genre"]"#)
71 .or_else(|| json_field(html, "genre"));
72 let mut tags = Vec::new();
73 if let Some(genre) = genre.filter(|value| !value.trim().is_empty()) {
74 tags.push(genre);
75 }
76
77 Ok(SeedAlbum {
78 platform: Platform::Soundcloud,
79 kind,
80 title: clean_title(&title),
81 artist,
82 url: canonical_url.clone(),
83 artist_url: infer_artist_url(&canonical_url),
84 tags,
85 label: None,
86 release_id: infer_track_id(&document, html).or_else(|| json_numeric_field(html, "id")),
87 })
88}
89
/// A SoundCloud user who liked the seed track, together with the neighbouring
/// tracks collected from that user's likes.
#[derive(Debug, Clone)]
pub struct LikeSource {
    /// User id rendered as a decimal string.
    pub id: String,
    /// Display title; `parse_likers` fills it with the user's username.
    pub title: String,
    /// Profile URL; `parse_likers` leaves it empty when the API omits `permalink_url`.
    pub url: String,
    /// Tracks attributed to this user; empty until `parse_user_likes_page` fills it.
    pub tracks: Vec<crate::model::OwnedAlbum>,
}
97
/// Result of parsing one page of a user's likes.
pub struct UserLikesPage {
    /// The user with their collected neighbour tracks, or `None` when the page
    /// does not contain the seed track or yields no usable neighbours.
    pub source: Option<LikeSource>,
    /// The `next_href` value copied from the API response, if any.
    pub next_href: Option<String>,
}
102
/// Deserialized body of the track "likers" endpoint.
#[derive(Debug, Deserialize)]
struct LikersResponse {
    /// Users who liked the track; a missing field deserializes to an empty list.
    #[serde(default)]
    collection: Vec<ApiLiker>,
}
108
/// One user entry in a [`LikersResponse`].
#[derive(Debug, Deserialize)]
struct ApiLiker {
    /// Numeric user id.
    id: u64,
    /// Profile URL, when the API provides one.
    permalink_url: Option<String>,
    /// Username; required, so an entry without it fails deserialization.
    username: String,
}
115
/// A track (or playlist) object as returned by the API.
#[derive(Debug, Deserialize)]
struct ApiTrack {
    /// Numeric item id.
    id: u64,
    /// Item title.
    title: String,
    /// Public URL of the item.
    permalink_url: String,
    /// Item kind, compared against `"playlist"` and `"track"` by the parsers in
    /// this file; a missing field deserializes to the empty string.
    #[serde(default)]
    kind: String,
    /// Genre, when present.
    genre: Option<String>,
    /// Label name, when present.
    label_name: Option<String>,
    /// Uploading user, when present.
    user: Option<ApiUser>,
}
127
/// The user object nested inside an [`ApiTrack`].
#[derive(Debug, Deserialize)]
struct ApiUser {
    /// Username, used as the artist name.
    username: String,
    /// Profile URL, when the API provides one.
    permalink_url: Option<String>,
}
133
/// Deserialized body of one page of the user "likes" endpoint.
#[derive(Debug, Deserialize)]
struct UserLikesResponse {
    /// Like events on this page; a missing field deserializes to an empty list.
    #[serde(default)]
    collection: Vec<ApiLike>,
    /// Value of the response's `next_href` field, if any.
    // NOTE(review): presumably the URL of the next page — confirm against the API.
    next_href: Option<String>,
}
140
/// One like event in a [`UserLikesResponse`].
#[derive(Debug, Deserialize)]
struct ApiLike {
    /// Timestamp string of the like, kept verbatim and copied into tags.
    created_at: String,
    /// The liked track; `None` when the entry carries no `track` object.
    track: Option<ApiTrack>,
}
146
147pub fn extract_client_id(html: &str) -> Result<String> {
148 let pattern =
149 Regex::new(r#""hydratable":"apiClient","data":\{"id":"([^"]+)""#).expect("valid regex");
150 let client_id = pattern
151 .captures(html)
152 .and_then(|captures| captures.get(1).map(|value| value.as_str().to_string()))
153 .unwrap_or_else(|| FALLBACK_CLIENT_ID.to_string());
154
155 Ok(client_id)
156}
157
158pub fn likers_url(client_id: &str, track_id: &str, limit: usize) -> Result<String> {
159 let mut url = Url::parse(&format!(
160 "https://api-v2.soundcloud.com/tracks/{track_id}/likers"
161 ))?;
162 url.query_pairs_mut()
163 .append_pair("client_id", client_id)
164 .append_pair("limit", &limit.to_string());
165 Ok(url.to_string())
166}
167
168pub fn user_likes_url(client_id: &str, user_id: &str, limit: usize) -> Result<String> {
169 let mut url = Url::parse(&format!(
170 "https://api-v2.soundcloud.com/users/{user_id}/likes"
171 ))?;
172 url.query_pairs_mut()
173 .append_pair("client_id", client_id)
174 .append_pair("limit", &limit.to_string());
175 Ok(url.to_string())
176}
177
178pub fn resolve_api_url(client_id: &str, soundcloud_url: &str) -> Result<String> {
179 let mut url = Url::parse("https://api-v2.soundcloud.com/resolve")?;
180 url.query_pairs_mut()
181 .append_pair("url", soundcloud_url)
182 .append_pair("client_id", client_id);
183 Ok(url.to_string())
184}
185
186pub fn with_client_id(url: &str, client_id: &str) -> Result<String> {
187 let mut parsed = Url::parse(url)?;
188 let has_client_id = parsed.query_pairs().any(|(key, _)| key == "client_id");
189 if !has_client_id {
190 parsed.query_pairs_mut().append_pair("client_id", client_id);
191 }
192 Ok(parsed.to_string())
193}
194
195pub fn resolve_api_seed(json: &str) -> Result<SeedAlbum> {
196 let track: ApiTrack = serde_json::from_str(json)?;
197 let kind = match track.kind.as_str() {
198 "playlist" => ItemKind::Playlist,
199 _ => ItemKind::Track,
200 };
201 let artist = track
202 .user
203 .as_ref()
204 .map(|user| user.username.clone())
205 .unwrap_or_else(|| "Unknown Artist".to_string());
206 let artist_url = track.user.and_then(|user| user.permalink_url);
207
208 Ok(SeedAlbum {
209 platform: Platform::Soundcloud,
210 kind,
211 title: track.title,
212 artist,
213 url: track.permalink_url,
214 artist_url,
215 tags: track.genre.into_iter().collect(),
216 label: track.label_name,
217 release_id: Some(track.id.to_string()),
218 })
219}
220
221pub fn parse_likers(json: &str) -> Result<Vec<LikeSource>> {
222 let response: LikersResponse = serde_json::from_str(json)?;
223 Ok(response
224 .collection
225 .into_iter()
226 .map(|user| LikeSource {
227 id: user.id.to_string(),
228 title: user.username,
229 url: user.permalink_url.unwrap_or_default(),
230 tracks: Vec::new(),
231 })
232 .collect())
233}
234
235pub fn parse_user_likes_page(
236 json: &str,
237 user: &LikeSource,
238 seed_track_id: &str,
239 max_neighbors: usize,
240) -> Result<UserLikesPage> {
241 let response: UserLikesResponse = serde_json::from_str(json)?;
242 let seed_track_id = seed_track_id
243 .parse::<u64>()
244 .map_err(|_| AppError::Parse("invalid SoundCloud track id".to_string()))?;
245
246 let seed_index = response.collection.iter().position(|entry| {
247 entry
248 .track
249 .as_ref()
250 .map(|track| track.id == seed_track_id)
251 .unwrap_or(false)
252 });
253
254 let Some(seed_index) = seed_index else {
255 return Ok(UserLikesPage {
256 source: None,
257 next_href: response.next_href,
258 });
259 };
260
261 let seed_timestamp = response.collection[seed_index].created_at.clone();
262 let mut deduped = std::collections::HashMap::new();
263 for (index, entry) in response.collection.into_iter().enumerate() {
264 let distance = index.abs_diff(seed_index);
265 if distance == 0 || distance > max_neighbors {
266 continue;
267 }
268 let Some(track) = entry.track else {
269 continue;
270 };
271 if track.id == seed_track_id || track.kind != "track" {
272 continue;
273 }
274
275 deduped
276 .entry(track.permalink_url.clone())
277 .or_insert_with(|| {
278 let mut tags: Vec<String> = track.genre.into_iter().collect();
279 if !entry.created_at.is_empty() {
280 tags.push(format!("liked_at:{}", entry.created_at));
281 }
282 tags.push(format!("seed_liked_at:{seed_timestamp}"));
283
284 crate::model::OwnedAlbum {
285 platform: Platform::Soundcloud,
286 kind: ItemKind::Track,
287 title: track.title,
288 artist: track
289 .user
290 .map(|user| user.username)
291 .unwrap_or_else(|| "Unknown Artist".to_string()),
292 url: track.permalink_url,
293 tags,
294 label: track.label_name,
295 }
296 });
297 }
298
299 if deduped.is_empty() {
300 return Ok(UserLikesPage {
301 source: None,
302 next_href: response.next_href,
303 });
304 }
305
306 Ok(UserLikesPage {
307 source: Some(LikeSource {
308 id: user.id.clone(),
309 title: user.title.clone(),
310 url: user.url.clone(),
311 tracks: deduped.into_values().collect(),
312 }),
313 next_href: response.next_href,
314 })
315}
316
317fn infer_kind(url: &str) -> ItemKind {
318 if url.contains("/sets/") {
319 ItemKind::Playlist
320 } else {
321 ItemKind::Track
322 }
323}
324
325fn infer_artist_url(canonical_url: &str) -> Option<String> {
326 let parsed = Url::parse(canonical_url).ok()?;
327 let segments: Vec<_> = parsed
328 .path_segments()?
329 .filter(|segment| !segment.is_empty())
330 .collect();
331 let first = segments.first()?;
332 Some(format!("https://soundcloud.com/{first}"))
333}
334
335fn meta_content(document: &Html, selector: &str) -> Option<String> {
336 let selector = Selector::parse(selector).ok()?;
337 document
338 .select(&selector)
339 .next()
340 .and_then(|node| node.value().attr("content"))
341 .map(|value| value.trim().to_string())
342}
343
344fn title_text(document: &Html) -> Option<String> {
345 let selector = Selector::parse("title").ok()?;
346 document
347 .select(&selector)
348 .next()
349 .map(|node| collapse_ws(&node.text().collect::<Vec<_>>().join(" ")))
350}
351
352fn json_field(html: &str, field: &str) -> Option<String> {
353 let pattern = Regex::new(&format!(r#""{}"\s*:\s*"([^"]+)""#, regex::escape(field))).ok()?;
354 let captures = pattern.captures(html)?;
355 let value = captures.get(1)?.as_str();
356 Some(html_escape(value))
357}
358
359fn json_numeric_field(html: &str, field: &str) -> Option<String> {
360 let pattern = Regex::new(&format!(r#""{}"\s*:\s*([0-9]+)"#, regex::escape(field))).ok()?;
361 let captures = pattern.captures(html)?;
362 Some(captures.get(1)?.as_str().to_string())
363}
364
365fn infer_track_id(document: &Html, html: &str) -> Option<String> {
366 let meta_keys = [
367 r#"meta[property="twitter:app:url:iphone"]"#,
368 r#"meta[property="twitter:app:url:ipad"]"#,
369 r#"meta[property="twitter:app:url:googleplay"]"#,
370 r#"meta[property="al:ios:url"]"#,
371 r#"meta[property="al:android:url"]"#,
372 ];
373
374 for key in meta_keys {
375 if let Some(value) = meta_content(document, key) {
376 if let Some(id) = extract_sound_id(&value) {
377 return Some(id);
378 }
379 }
380 }
381
382 let patterns = [
383 Regex::new(r#"soundcloud://sounds:([0-9]+)"#).ok()?,
384 Regex::new(r#""urn"\s*:\s*"soundcloud:tracks:([0-9]+)""#).ok()?,
385 Regex::new(r#""station_urn"\s*:\s*"soundcloud:system-playlists:track-stations:([0-9]+)""#)
386 .ok()?,
387 ];
388
389 for pattern in patterns {
390 if let Some(captures) = pattern.captures(html) {
391 return captures.get(1).map(|value| value.as_str().to_string());
392 }
393 }
394
395 None
396}
397
398fn extract_sound_id(value: &str) -> Option<String> {
399 let pattern = Regex::new(r#"sounds:([0-9]+)"#).ok()?;
400 let captures = pattern.captures(value)?;
401 captures.get(1).map(|value| value.as_str().to_string())
402}
403
404fn extract_artist_from_title(title: &str) -> Option<String> {
405 let collapsed = collapse_ws(title);
406 if let Some((artist, _)) = collapsed.split_once(" - ") {
407 return Some(artist.trim().to_string());
408 }
409 None
410}
411
412fn clean_title(title: &str) -> String {
413 let collapsed = collapse_ws(title);
414 if let Some((artist, track)) = collapsed.split_once(" - ") {
415 if !artist.trim().is_empty() && !track.trim().is_empty() {
416 return track.trim().to_string();
417 }
418 }
419 collapsed
420}
421
/// Decodes the few HTML entities expected inside values lifted from page
/// markup: `&quot;`, `&#39;`, `&apos;` and `&amp;`.
///
/// Despite its name this function *un*-escapes. The name is kept because other
/// code in this file calls it.
///
/// `&amp;` is decoded last so that an already-escaped entity such as
/// `&amp;quot;` is decoded exactly once, to the literal text `&quot;`.
// NOTE(review): the entity spellings were reconstructed from the replacement
// targets (`"`, `'`, `&`); confirm them against the upstream history.
fn html_escape(value: &str) -> String {
    value
        .replace("&quot;", "\"")
        .replace("&#39;", "'")
        .replace("&apos;", "'")
        .replace("&amp;", "&")
}
428
/// Trims `value` and replaces every run of whitespace with a single space.
fn collapse_ws(value: &str) -> String {
    let mut collapsed = String::with_capacity(value.len());
    for word in value.split_whitespace() {
        // Words are never empty, so an empty buffer means this is the first one.
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
432
// Unit tests for the URL, HTML and API-response parsers in this file.
#[cfg(test)]
mod tests {
    use super::*;

    // Mobile host, trailing slash, query string and fragment are all normalised away.
    #[test]
    fn normalizes_soundcloud_track_url() {
        let actual =
            normalize_url("https://m.soundcloud.com/test-user/test-track/?si=abc#frag").unwrap();
        assert_eq!(actual, "https://soundcloud.com/test-user/test-track");
    }

    // A profile URL has a single path segment and is rejected.
    #[test]
    fn rejects_soundcloud_profile_url_for_seed_resolution() {
        let err = normalize_url("https://soundcloud.com/test-user").unwrap_err();
        assert!(err
            .to_string()
            .contains("expected a SoundCloud track or playlist URL"));
    }

    // Meta tags and the embedded JSON blob together supply every seed field.
    #[test]
    fn resolves_soundcloud_track_seed_from_meta_and_json() {
        let html = r#"
            <html>
            <head>
            <meta property="og:url" content="https://soundcloud.com/test-user/test-track?si=123">
            <meta property="og:title" content="Test User - Test Track">
            <meta property="music:genre" content="ambient">
            <script type="application/ld+json">
            {"username":"Test User","title":"Test Track","id":12345}
            </script>
            </head>
            </html>
            "#;

        let seed = resolve_seed("https://soundcloud.com/test-user/test-track", html).unwrap();
        assert_eq!(seed.platform, Platform::Soundcloud);
        assert_eq!(seed.kind, ItemKind::Track);
        assert_eq!(seed.title, "Test Track");
        assert_eq!(seed.artist, "Test User");
        assert_eq!(seed.url, "https://soundcloud.com/test-user/test-track");
        assert_eq!(
            seed.artist_url.as_deref(),
            Some("https://soundcloud.com/test-user")
        );
        assert_eq!(seed.tags, vec!["ambient"]);
        assert_eq!(seed.release_id.as_deref(), Some("12345"));
    }

    // A `/sets/` path marks the item as a playlist.
    #[test]
    fn resolves_soundcloud_playlist_kind() {
        let html = r#"
            <html>
            <head>
            <meta property="og:url" content="https://soundcloud.com/test-user/sets/test-set">
            <meta property="og:title" content="Test User - Test Set">
            </head>
            </html>
            "#;

        let seed = resolve_seed("https://soundcloud.com/test-user/sets/test-set", html).unwrap();
        assert_eq!(seed.kind, ItemKind::Playlist);
    }

    // The client id is read from the `apiClient` hydration entry.
    #[test]
    fn extracts_client_id_from_hydration_blob() {
        let html = r#"
            <script>
            window.__sc_hydration = [{"hydratable":"apiClient","data":{"id":"abc123","isExpiring":false}}];
            </script>
            "#;

        assert_eq!(extract_client_id(html).unwrap(), "abc123");
    }

    // The likers fixture is expected to hold two users, the first being `listener-a`.
    #[test]
    fn parses_public_likers() {
        let json = include_str!("../tests/fixtures/soundcloud_likers.json");
        let likers = parse_likers(json).unwrap();

        assert_eq!(likers.len(), 2);
        assert_eq!(likers[0].id, "501");
        assert_eq!(likers[0].title, "listener-a");
    }

    // With a window of two entries either side of seed track 100, the fixture
    // is expected to yield three neighbouring tracks.
    #[test]
    fn parses_user_likes_near_seed_event() {
        let user = LikeSource {
            id: "501".to_string(),
            title: "listener-a".to_string(),
            url: "https://soundcloud.com/listener-a".to_string(),
            tracks: Vec::new(),
        };
        let json = include_str!("../tests/fixtures/soundcloud_user_likes_a.json");
        let source = parse_user_likes_page(json, &user, "100", 2)
            .unwrap()
            .source
            .unwrap();

        assert_eq!(source.tracks.len(), 3);
        assert_eq!(source.tracks[0].platform, Platform::Soundcloud);
    }
}