riva 0.1.0

Provider-agnostic Rust library for extracting normalized media stream metadata from SoundCloud and YouTube via async helpers.
Documentation
use once_cell::sync::Lazy;
use regex::Regex;
use reqwest::{Client, Response, StatusCode};
use serde::Deserialize;
use serde::de::DeserializeOwned;
use serde_json::from_slice;
use thiserror::Error;
use tokio::sync::OnceCell;

use super::models::{StreamInfo, Track, Transcoding, TranscodingLocation};
use super::normalize::normalize_track_url;

// API endpoint queried by `resolve_track` to turn a public page URL into a JSON resource.
const RESOLVE_ENDPOINT: &str = "https://api-v2.soundcloud.com/resolve";
// First source tried by `fetch_client_id_endpoint` when discovering a client id.
const CLIENT_ID_ENDPOINT: &str = "https://api-v2.soundcloud.com/client_id";
// Desktop Chrome user-agent string installed on the HTTP client built in `extract_streams`.
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36";
// Page downloaded by `fetch_client_id_from_html` when the dedicated endpoint fails.
const HOMEPAGE_URL: &str = "https://soundcloud.com";
// Upper bound on how many discovered asset scripts are downloaded while searching for a client id.
const MAX_ASSET_PROBES: usize = 16;

// Module-local shorthand: every fallible function in this file reports a `RivaError`.
type Result<T> = std::result::Result<T, RivaError>;

// Process-wide cache for the discovered client id; populated through `get_client_id`.
// NOTE(review): nothing in this file ever clears or refreshes the cached value.
static CLIENT_ID_CACHE: OnceCell<String> = OnceCell::const_new();

/// Errors reported by the SoundCloud extraction helpers in this module.
#[derive(Debug, Error)]
pub enum RivaError {
    /// The input URL was rejected as invalid or unsupported.
    ///
    /// Not constructed in this file; presumably produced by
    /// `normalize_track_url` — confirm against that module.
    #[error("invalid or unsupported SoundCloud URL")]
    InvalidUrl,
    /// A `reqwest` operation failed (client construction, sending a request,
    /// a non-success status passed through `error_for_status`, or reading a body).
    #[error("network request failed: {0}")]
    Network(#[from] reqwest::Error),
    /// A response body could not be deserialized from JSON.
    #[error("failed to parse SoundCloud response: {0}")]
    Json(#[from] serde_json::Error),
    /// No client id could be obtained: the client-id endpoint returned an empty
    /// id, or neither the homepage nor the probed asset scripts contained one.
    #[error("SoundCloud client id discovery failed")]
    ClientId,
    /// The resolved resource's `kind` was not `"track"`.
    #[error("the provided URL did not resolve to a track")]
    UnsupportedResource,
    /// The track carried no media section, or no transcoding produced a stream.
    #[error("track does not expose streaming formats")]
    NoStreams,
}

/// Resolves a SoundCloud track URL into the list of streams it exposes.
///
/// The URL is first passed through `normalize_track_url`, then resolved to a
/// track using a freshly built HTTP client and the (cached) client id. Every
/// transcoding that `should_skip_transcoding` accepts is fetched in order via
/// `fetch_transcoding`, and the streams it yields are collected.
///
/// # Errors
///
/// * [`RivaError::NoStreams`] when the track has no media section or when no
///   transcoding produced a stream.
/// * Any error raised by URL normalization, client construction, client id
///   discovery, track resolution, or an individual transcoding fetch is
///   propagated unchanged; the first such failure aborts the extraction.
pub async fn extract_streams(track_url: &str) -> Result<Vec<StreamInfo>> {
    let canonical_url = normalize_track_url(track_url)?;
    let http = Client::builder().user_agent(USER_AGENT).build()?;
    let client_id = get_client_id(&http).await?;

    let track = resolve_track(&http, &canonical_url, client_id).await?;
    let Some(media) = track.media else {
        return Err(RivaError::NoStreams);
    };

    // Borrow the per-track extras once; they are the same for every transcoding.
    let authorization = track.track_authorization.as_deref();
    let artwork = track.artwork_url.as_deref();

    let mut collected = Vec::new();
    for candidate in media.transcodings {
        if !should_skip_transcoding(&candidate) {
            let fetched =
                fetch_transcoding(&http, candidate, client_id, authorization, artwork).await?;
            // `fetched` is an `Option`; extending pushes the stream only when present.
            collected.extend(fetched);
        }
    }

    if collected.is_empty() {
        Err(RivaError::NoStreams)
    } else {
        Ok(collected)
    }
}

/// Asks the resolve endpoint to turn `url` into a [`Track`].
///
/// Sends `url` and `client_id` as query parameters to `RESOLVE_ENDPOINT` and
/// decodes the JSON body.
///
/// # Errors
///
/// * [`RivaError::Network`] on transport failure or a non-success status.
/// * [`RivaError::Json`] when the body cannot be decoded as a `Track`.
/// * [`RivaError::UnsupportedResource`] when the decoded resource's `kind`
///   is anything other than `"track"` (including absent).
async fn resolve_track(client: &Client, url: &str, client_id: &str) -> Result<Track> {
    let params = [("url", url), ("client_id", client_id)];
    let request = client.get(RESOLVE_ENDPOINT).query(&params);
    let response = request.send().await?.error_for_status()?;

    let resolved = parse_json::<Track>(response).await?;
    match resolved.kind.as_deref() {
        Some("track") => Ok(resolved),
        _ => Err(RivaError::UnsupportedResource),
    }
}

/// Exchanges one transcoding descriptor for stream information.
///
/// Requests `transcoding.url` with `client_id` (and `track_authorization`
/// when supplied) as query parameters, decodes the JSON reply as a
/// `TranscodingLocation`, and hands its `url` together with `artwork_url` to
/// `Transcoding::into_stream_info`.
///
/// Returns `Ok(None)` without failing when the transcoding has an empty URL or
/// when the server answers 404, 401, or 403; whatever `into_stream_info`
/// returns is passed through otherwise.
///
/// # Errors
///
/// * [`RivaError::Network`] on transport failure or any other non-success status.
/// * [`RivaError::Json`] when the reply cannot be decoded.
async fn fetch_transcoding(
    client: &Client,
    transcoding: Transcoding,
    client_id: &str,
    track_authorization: Option<&str>,
    artwork_url: Option<&str>,
) -> Result<Option<StreamInfo>> {
    if transcoding.url.is_empty() {
        return Ok(None);
    }

    let base = client
        .get(&transcoding.url)
        .query(&[("client_id", client_id)]);
    let request = match track_authorization {
        Some(token) => base.query(&[("track_authorization", token)]),
        None => base,
    };

    let response = request.send().await?;

    // These statuses mean "this variant is not available", not "the request broke".
    let status = response.status();
    let unavailable = status == StatusCode::NOT_FOUND
        || status == StatusCode::UNAUTHORIZED
        || status == StatusCode::FORBIDDEN;
    if unavailable {
        return Ok(None);
    }

    let body = response.error_for_status()?;
    let location = parse_json::<TranscodingLocation>(body).await?;
    let stream = transcoding.into_stream_info(location.url, artwork_url);
    Ok(stream)
}

/// Decides whether a transcoding should be ignored before any request is made.
///
/// A transcoding is skipped when any of the following holds:
/// * it has no format descriptor or its URL is empty;
/// * its preset starts with `"abr"`;
/// * its format protocol starts with `"ctr-"` or `"cbc-"`.
///
/// NOTE(review): the `abr` / `ctr-` / `cbc-` prefixes presumably denote
/// adaptive-bitrate and encrypted variants — confirm against the API.
fn should_skip_transcoding(transcoding: &Transcoding) -> bool {
    let Some(format) = transcoding.format.as_ref() else {
        return true;
    };
    if transcoding.url.is_empty() {
        return true;
    }

    let preset_excluded = matches!(
        transcoding.preset.as_deref(),
        Some(preset) if preset.starts_with("abr")
    );

    let protocol = format.protocol.as_str();
    let protocol_excluded = protocol.starts_with("ctr-") || protocol.starts_with("cbc-");

    preset_excluded || protocol_excluded
}

/// Returns the client id, discovering it on first use.
///
/// The value lives in the process-wide `CLIENT_ID_CACHE`; when the cell is
/// empty, `fetch_client_id` is run with a clone of `client` to fill it. Per
/// tokio's `OnceCell::get_or_try_init`, a failed discovery leaves the cell
/// empty, so a later call tries again.
///
/// # Errors
///
/// Propagates whatever `fetch_client_id` reports.
async fn get_client_id(client: &Client) -> Result<&'static str> {
    let cached = CLIENT_ID_CACHE
        .get_or_try_init(|| fetch_client_id(client.clone()))
        .await?;
    Ok(cached.as_str())
}

/// Discovers a client id, preferring the dedicated endpoint.
///
/// Any failure of `fetch_client_id_endpoint` is discarded and the homepage
/// scraping fallback is used instead; only the fallback's error can surface.
async fn fetch_client_id(client: Client) -> Result<String> {
    match fetch_client_id_endpoint(&client).await {
        Ok(id) => Ok(id),
        // The endpoint's error is intentionally dropped in favour of the fallback.
        Err(_) => fetch_client_id_from_html(client).await,
    }
}

/// Reads the client id from `CLIENT_ID_ENDPOINT`.
///
/// The response is expected to be a JSON object with a string `client_id`
/// field.
///
/// # Errors
///
/// * [`RivaError::Network`] on transport failure or a non-success status.
/// * [`RivaError::Json`] when the body does not match the expected shape.
/// * [`RivaError::ClientId`] when the returned id is an empty string.
async fn fetch_client_id_endpoint(client: &Client) -> Result<String> {
    #[derive(Deserialize)]
    struct ClientIdPayload {
        client_id: String,
    }

    let request = client.get(CLIENT_ID_ENDPOINT);
    let response = request.send().await?.error_for_status()?;

    let ClientIdPayload { client_id } = parse_json::<ClientIdPayload>(response).await?;
    if client_id.is_empty() {
        Err(RivaError::ClientId)
    } else {
        Ok(client_id)
    }
}

async fn fetch_client_id_from_html(client: Client) -> Result<String> {
    let homepage = client.get(HOMEPAGE_URL).send().await?.error_for_status()?;
    let html = homepage.text().await?;

    if let Some(id) = extract_client_id(&html) {
        return Ok(id);
    }

    for script_url in discover_asset_scripts(&html)
        .into_iter()
        .take(MAX_ASSET_PROBES)
    {
        let script = client.get(&script_url).send().await?.error_for_status()?;
        let body = script.text().await?;
        if let Some(id) = extract_client_id(&body) {
            return Ok(id);
        }
    }

    Err(RivaError::ClientId)
}

async fn parse_json<T>(response: Response) -> Result<T>
where
    T: DeserializeOwned,
{
    let bytes = response.bytes().await?;
    Ok(from_slice(&bytes)?)
}

/// Finds the first `client_id` assignment in `source` and returns its value.
///
/// The pattern accepts `client_id` followed by an optional quote, `:` or `=`,
/// an optional opening quote, and then captures a run of at least 16 ASCII
/// alphanumerics. Returns `None` when nothing matches.
fn extract_client_id(source: &str) -> Option<String> {
    // Compiled once on first use and reused for every call.
    static CLIENT_ID_REGEX: Lazy<Regex> = Lazy::new(|| {
        Regex::new(r#"client_id[\"']?\s*[:=]\s*[\"']?([a-zA-Z0-9]{16,})"#)
            .expect("valid client_id regex")
    });

    let captures = CLIENT_ID_REGEX.captures(source)?;
    let id = captures.get(1)?;
    Some(id.as_str().to_string())
}

/// Collects every asset script URL mentioned in `source`, in order of appearance.
///
/// A URL is recognized when it starts with `https://a-v2.sndcdn.com/assets/`,
/// continues with characters other than `"`, and ends in `.js`. Duplicates are
/// kept; an empty vector is returned when nothing matches.
fn discover_asset_scripts(source: &str) -> Vec<String> {
    // Compiled once on first use and reused for every call.
    static SCRIPT_REGEX: Lazy<Regex> = Lazy::new(|| {
        Regex::new(r#"https://a-v2\.sndcdn\.com/assets/[^"]+\.js"#).expect("valid asset regex")
    });

    let mut urls = Vec::new();
    for found in SCRIPT_REGEX.find_iter(source) {
        urls.push(found.as_str().to_string());
    }
    urls
}

#[cfg(test)]
mod tests {
    use super::super::models::{Transcoding, TranscodingFormat};
    use super::*;

    /// Builds an `audio/mp4` format descriptor using the given protocol.
    fn mp4_format(protocol: &str) -> TranscodingFormat {
        TranscodingFormat {
            protocol: protocol.into(),
            mime_type: "audio/mp4".into(),
        }
    }

    /// A transcoding that `should_skip_transcoding` accepts; tests derive
    /// rejected variants from it by changing one field at a time.
    fn base_transcoding() -> Transcoding {
        Transcoding {
            url: "https://api-v2.soundcloud.com/transcoding".into(),
            preset: Some("progressive".into()),
            quality: Some("hq".into()),
            snipped: Some(false),
            format: Some(mp4_format("https")),
            duration: Some(1000),
        }
    }

    #[test]
    fn skip_transcoding_detects_invalid_inputs() {
        // The baseline is accepted.
        let mut without_url = base_transcoding();
        assert!(!should_skip_transcoding(&without_url));

        // An empty URL is rejected.
        without_url.url.clear();
        assert!(should_skip_transcoding(&without_url));

        // A preset starting with "abr" is rejected.
        let abr = Transcoding {
            preset: Some("abr100".into()),
            ..base_transcoding()
        };
        assert!(should_skip_transcoding(&abr));

        // A protocol starting with "ctr-" is rejected.
        let ctr = Transcoding {
            format: Some(mp4_format("ctr-hls")),
            ..base_transcoding()
        };
        assert!(should_skip_transcoding(&ctr));
    }

    #[test]
    fn extracts_client_id_from_html() {
        let html = r#"
            <script>const client_id="1234567890abcdef";</script>
        "#;
        assert_eq!(
            extract_client_id(html).as_deref(),
            Some("1234567890abcdef")
        );
    }

    #[test]
    fn discovers_asset_urls() {
        let html = r#"
            <script src="https://a-v2.sndcdn.com/assets/app-cb123.js"></script>
            <script src="https://a-v2.sndcdn.com/assets/app-cb124.js"></script>
        "#;
        let found = discover_asset_scripts(html);
        assert_eq!(found.len(), 2);
        assert!(found[0].starts_with("https://a-v2.sndcdn.com/assets/"));
    }
}